[Pkg-xen-changes] [xen] 02/30: Import xen_4.6.0.orig.tar.xz
Bastian Blank
waldi at moszumanska.debian.org
Sun Nov 1 20:19:13 UTC 2015
This is an automated email from the git hooks/post-receive script.
waldi pushed a commit to branch develop
in repository xen.
commit 9fafe903bcadf774d3eb5fbef4666166aa876d2d
Author: Bastian Blank <waldi at debian.org>
Date: Sun Nov 1 18:19:37 2015 +0100
Import xen_4.6.0.orig.tar.xz
---
.gitignore | 24 +-
.hgignore | 5 -
COPYING | 3 +-
Config.mk | 42 +-
INSTALL | 22 +-
MAINTAINERS | 62 +-
Makefile | 92 +-
README | 50 +-
config/Paths.mk.in | 15 +-
config/StdGNU.mk | 3 +
config/SunOS.mk | 2 +
config/Tools.mk.in | 5 +-
config/arm32.mk | 2 +
config/arm64.mk | 3 +
configure | 38 +-
docs/INDEX | 6 +-
docs/Makefile | 197 +-
docs/configure | 18 +-
docs/features/migration.pandoc | 123 +
docs/features/template.pandoc | 75 +
docs/man/xentop.pod.1 | 111 +
.../xentrace/xentrace.8 => docs/man/xentrace.pod.8 | 138 +-
.../man/xentrace_format.pod.1 | 46 +-
docs/man/xl.cfg.pod.5 | 318 +-
docs/man/xl.pod.1 | 185 +-
docs/man/xlcpupool.cfg.pod.5 | 26 +-
docs/misc/arm/device-tree/passthrough.txt | 9 +
docs/misc/arm/early-printk.txt | 51 +-
docs/misc/arm/passthrough.txt | 62 +
docs/misc/efi.markdown | 8 +-
docs/misc/grant-tables.txt | 55 +-
docs/misc/qemu-upstream_howto_use_it.markdown | 12 -
docs/misc/sedf_scheduler_mini-HOWTO.txt | 44 -
stubdom/README => docs/misc/stubdom.txt | 0
docs/misc/vbd-interface.txt | 6 +-
docs/misc/vtd.txt | 24 +
docs/misc/vtpmmgr.txt | 155 +-
docs/misc/xen-command-line.markdown | 144 +-
tools/xenmon/README => docs/misc/xenmon.txt | 0
docs/misc/xl-psr.markdown | 133 +
docs/misc/xsm-flask.txt | 70 +-
docs/specs/libxc-migration-stream.pandoc | 696 ++
docs/specs/libxl-migration-stream.pandoc | 264 +
extras/mini-os/COPYING | 36 -
extras/mini-os/Config.mk | 57 -
extras/mini-os/Makefile | 226 -
extras/mini-os/README | 46 -
extras/mini-os/app.lds | 11 -
extras/mini-os/arch/arm/arm32.S | 233 -
extras/mini-os/arch/arm/events.c | 31 -
extras/mini-os/arch/arm/hypercalls32.S | 64 -
extras/mini-os/arch/arm/minios-arm32.lds | 83 -
extras/mini-os/arch/arm/mm.c | 139 -
extras/mini-os/arch/arm/sched.c | 47 -
extras/mini-os/arch/arm/setup.c | 119 -
extras/mini-os/arch/x86/Makefile | 31 -
extras/mini-os/arch/x86/arch.mk | 22 -
extras/mini-os/arch/x86/events.c | 35 -
extras/mini-os/arch/x86/ioremap.c | 75 -
extras/mini-os/arch/x86/iorw.c | 35 -
extras/mini-os/arch/x86/minios-x86_32.lds | 74 -
extras/mini-os/arch/x86/minios-x86_64.lds | 74 -
extras/mini-os/arch/x86/mm.c | 957 --
extras/mini-os/arch/x86/sched.c | 139 -
extras/mini-os/arch/x86/setup.c | 168 -
extras/mini-os/arch/x86/time.c | 238 -
extras/mini-os/arch/x86/traps.c | 333 -
extras/mini-os/arch/x86/x86_32.S | 305 -
extras/mini-os/arch/x86/x86_64.S | 386 -
extras/mini-os/blkfront.c | 736 --
extras/mini-os/console/console.c | 164 -
extras/mini-os/console/console.h | 2 -
extras/mini-os/console/xenbus.c | 195 -
extras/mini-os/console/xencons_ring.c | 195 -
extras/mini-os/daytime.c | 67 -
extras/mini-os/domain_config | 19 -
extras/mini-os/events.c | 269 -
extras/mini-os/fbfront.c | 710 --
extras/mini-os/gntmap.c | 250 -
extras/mini-os/gnttab.c | 196 -
extras/mini-os/hypervisor.c | 132 -
extras/mini-os/include/arch/cc.h | 87 -
extras/mini-os/include/arch/perf.h | 15 -
extras/mini-os/include/arch/sys_arch.h | 35 -
extras/mini-os/include/arm/arch_endian.h | 7 -
extras/mini-os/include/arm/arch_limits.h | 9 -
extras/mini-os/include/arm/arch_mm.h | 38 -
extras/mini-os/include/arm/arch_sched.h | 19 -
extras/mini-os/include/arm/arch_spinlock.h | 36 -
extras/mini-os/include/arm/arm32/arch_wordsize.h | 1 -
extras/mini-os/include/arm/gic.h | 1 -
extras/mini-os/include/arm/hypercall-arm.h | 98 -
extras/mini-os/include/arm/os.h | 216 -
extras/mini-os/include/arm/traps.h | 20 -
extras/mini-os/include/blkfront.h | 54 -
extras/mini-os/include/byteorder.h | 36 -
extras/mini-os/include/byteswap.h | 39 -
extras/mini-os/include/compiler.h | 10 -
extras/mini-os/include/console.h | 89 -
extras/mini-os/include/ctype.h | 60 -
extras/mini-os/include/endian.h | 19 -
extras/mini-os/include/err.h | 31 -
extras/mini-os/include/errno-base.h | 39 -
extras/mini-os/include/errno.h | 122 -
extras/mini-os/include/events.h | 59 -
extras/mini-os/include/fbfront.h | 46 -
extras/mini-os/include/fcntl.h | 99 -
extras/mini-os/include/gntmap.h | 35 -
extras/mini-os/include/gnttab.h | 17 -
extras/mini-os/include/hypervisor.h | 48 -
extras/mini-os/include/ioremap.h | 33 -
extras/mini-os/include/iorw.h | 16 -
extras/mini-os/include/kernel.h | 9 -
extras/mini-os/include/lib-gpl.h | 59 -
extras/mini-os/include/lib.h | 230 -
extras/mini-os/include/linux/types.h | 5 -
extras/mini-os/include/lwipopts.h | 23 -
extras/mini-os/include/mm.h | 82 -
extras/mini-os/include/netfront.h | 24 -
extras/mini-os/include/pcifront.h | 29 -
extras/mini-os/include/posix/arpa/inet.h | 7 -
extras/mini-os/include/posix/dirent.h | 24 -
extras/mini-os/include/posix/err.h | 15 -
extras/mini-os/include/posix/fcntl.h | 11 -
extras/mini-os/include/posix/limits.h | 48 -
extras/mini-os/include/posix/net/if.h | 85 -
extras/mini-os/include/posix/netdb.h | 9 -
extras/mini-os/include/posix/netinet/in.h | 7 -
extras/mini-os/include/posix/netinet/tcp.h | 6 -
extras/mini-os/include/posix/poll.h | 1 -
extras/mini-os/include/posix/pthread.h | 64 -
extras/mini-os/include/posix/signal.h | 10 -
extras/mini-os/include/posix/stdlib.h | 8 -
extras/mini-os/include/posix/strings.h | 12 -
extras/mini-os/include/posix/sys/ioctl.h | 16 -
extras/mini-os/include/posix/sys/mman.h | 22 -
extras/mini-os/include/posix/sys/poll.h | 79 -
extras/mini-os/include/posix/sys/select.h | 7 -
extras/mini-os/include/posix/sys/socket.h | 31 -
extras/mini-os/include/posix/sys/stat.h | 7 -
extras/mini-os/include/posix/syslog.h | 37 -
extras/mini-os/include/posix/termios.h | 87 -
extras/mini-os/include/posix/time.h | 11 -
extras/mini-os/include/posix/unistd.h | 16 -
extras/mini-os/include/sched.h | 55 -
extras/mini-os/include/semaphore.h | 110 -
extras/mini-os/include/spinlock.h | 55 -
extras/mini-os/include/sys/lock.h | 52 -
extras/mini-os/include/sys/time.h | 47 -
extras/mini-os/include/time.h | 63 -
extras/mini-os/include/tpm_tis.h | 60 -
extras/mini-os/include/tpmback.h | 104 -
extras/mini-os/include/tpmfront.h | 97 -
extras/mini-os/include/types.h | 74 -
extras/mini-os/include/wait.h | 105 -
extras/mini-os/include/waittypes.h | 32 -
extras/mini-os/include/x86/arch_endian.h | 7 -
extras/mini-os/include/x86/arch_limits.h | 20 -
extras/mini-os/include/x86/arch_mm.h | 233 -
extras/mini-os/include/x86/arch_sched.h | 25 -
extras/mini-os/include/x86/arch_spinlock.h | 94 -
extras/mini-os/include/x86/os.h | 572 -
extras/mini-os/include/x86/traps.h | 78 -
extras/mini-os/include/x86/x86_32/arch_wordsize.h | 1 -
.../mini-os/include/x86/x86_32/hypercall-x86_32.h | 337 -
extras/mini-os/include/x86/x86_64/arch_wordsize.h | 2 -
.../mini-os/include/x86/x86_64/hypercall-x86_64.h | 344 -
extras/mini-os/include/xenbus.h | 120 -
extras/mini-os/include/xmalloc.h | 44 -
extras/mini-os/kernel.c | 198 -
extras/mini-os/lib/ctype.c | 29 -
extras/mini-os/lib/math.c | 426 -
extras/mini-os/lib/printf.c | 786 --
extras/mini-os/lib/stack_chk_fail.c | 8 -
extras/mini-os/lib/string.c | 228 -
extras/mini-os/lib/sys.c | 1550 ---
extras/mini-os/lib/xmalloc.c | 319 -
extras/mini-os/lib/xs.c | 194 -
extras/mini-os/lock.c | 112 -
extras/mini-os/lwip-arch.c | 293 -
extras/mini-os/lwip-net.c | 386 -
extras/mini-os/main.c | 193 -
extras/mini-os/minios.mk | 76 -
extras/mini-os/mm.c | 441 -
extras/mini-os/netfront.c | 677 --
extras/mini-os/pcifront.c | 616 --
extras/mini-os/sched.c | 304 -
extras/mini-os/test.c | 577 -
extras/mini-os/tpm_tis.c | 1367 ---
extras/mini-os/tpmback.c | 1136 --
extras/mini-os/tpmfront.c | 631 --
extras/mini-os/xenbus/xenbus.c | 870 --
m4/paths.m4 | 12 +
m4/pkg.m4 | 3 +-
m4/python_fortify_noopt.m4 | 31 +
m4/systemd.m4 | 9 +-
scripts/git-checkout.sh | 2 +-
stubdom/Makefile | 20 +-
stubdom/c/Makefile | 2 +
stubdom/caml/Makefile | 2 +
stubdom/configure | 20 +-
stubdom/configure.ac | 2 +-
stubdom/grub.patches/10graphics.diff | 10 +-
stubdom/grub.patches/61btrfs.diff | 6 +-
stubdom/grub/kexec.c | 5 +-
stubdom/vtpm-deepquote-anyloc.patch | 127 +
stubdom/vtpm/vtpm_cmd.c | 13 +-
stubdom/vtpmmgr/Makefile | 2 +-
stubdom/vtpmmgr/common_types.h | 9 +
stubdom/vtpmmgr/disk_read.c | 29 +-
stubdom/vtpmmgr/disk_tpm.c | 42 +-
stubdom/vtpmmgr/disk_tpm.h | 4 +
stubdom/vtpmmgr/disk_write.c | 13 +-
stubdom/vtpmmgr/init.c | 279 +
stubdom/vtpmmgr/marshal.h | 1 +
stubdom/vtpmmgr/mgmt_authority.c | 91 +-
stubdom/vtpmmgr/mgmt_authority.h | 2 +-
stubdom/vtpmmgr/tcg.h | 7 +-
stubdom/vtpmmgr/tpm2.c | 455 +
stubdom/vtpmmgr/tpm2.h | 104 +
stubdom/vtpmmgr/tpm2_marshal.h | 673 ++
stubdom/vtpmmgr/tpm2_types.h | 969 ++
stubdom/vtpmmgr/tpmrsa.c | 3 +-
stubdom/vtpmmgr/tpmrsa.h | 3 +-
stubdom/vtpmmgr/vtpm_cmd_handler.c | 7 +-
stubdom/vtpmmgr/vtpm_manager.h | 27 +-
stubdom/vtpmmgr/vtpmmgr.c | 46 +-
stubdom/vtpmmgr/vtpmmgr.h | 29 +
tools/Makefile | 105 +-
tools/Rules.mk | 13 +-
tools/blktap/Makefile | 13 -
tools/blktap/README | 122 -
tools/blktap/drivers/Makefile | 73 -
tools/blktap/drivers/aes.c | 1319 ---
tools/blktap/drivers/aes.h | 28 -
tools/blktap/drivers/blk.h | 3 -
tools/blktap/drivers/blk_linux.c | 42 -
tools/blktap/drivers/blktapctrl.c | 937 --
tools/blktap/drivers/blktapctrl_linux.c | 89 -
tools/blktap/drivers/block-aio.c | 259 -
tools/blktap/drivers/block-qcow.c | 1434 ---
tools/blktap/drivers/block-qcow2.c | 2098 ----
tools/blktap/drivers/block-ram.c | 295 -
tools/blktap/drivers/block-sync.c | 242 -
tools/blktap/drivers/block-vmdk.c | 428 -
tools/blktap/drivers/bswap.h | 178 -
tools/blktap/drivers/img2qcow.c | 282 -
tools/blktap/drivers/qcow-create.c | 130 -
tools/blktap/drivers/qcow2raw.c | 348 -
tools/blktap/drivers/tapaio.c | 357 -
tools/blktap/drivers/tapaio.h | 108 -
tools/blktap/drivers/tapdisk.c | 872 --
tools/blktap/drivers/tapdisk.h | 259 -
tools/blktap/lib/Makefile | 60 -
tools/blktap/lib/blkif.c | 185 -
tools/blktap/lib/blktaplib.h | 240 -
tools/blktap/lib/list.h | 59 -
tools/blktap/lib/xenbus.c | 617 --
tools/blktap/lib/xs_api.c | 360 -
tools/blktap/lib/xs_api.h | 50 -
tools/blktap2/Makefile | 6 +-
tools/blktap2/control/Makefile | 16 +-
tools/blktap2/drivers/Makefile | 6 +-
tools/blktap2/drivers/libaio-compat.h | 4 +-
tools/blktap2/drivers/tapdisk-vbd.c | 2 +-
tools/blktap2/include/Makefile | 5 +-
tools/blktap2/lvm/Makefile | 4 +-
tools/blktap2/vhd/Makefile | 6 +-
tools/blktap2/vhd/lib/Makefile | 6 +-
tools/blktap2/vhd/lib/libvhd.c | 27 +-
tools/blktap2/vhd/vhd-util.c | 3 +
tools/config.h.in | 21 +
tools/configure | 641 +-
tools/configure.ac | 95 +-
tools/console/Makefile | 13 +-
tools/console/client/main.c | 81 +-
tools/console/daemon/io.c | 7 +-
tools/console/daemon/io.h | 3 +-
tools/console/daemon/main.c | 41 +-
tools/console/daemon/utils.c | 3 +-
tools/console/daemon/utils.h | 3 +-
tools/console/testsuite/Makefile | 5 +-
tools/debugger/gdbsx/Makefile | 8 +-
tools/debugger/gdbsx/gx/Makefile | 2 +
tools/debugger/gdbsx/gx/gx.h | 4 +-
tools/debugger/gdbsx/gx/gx_comm.c | 8 +-
tools/debugger/gdbsx/gx/gx_local.c | 4 +-
tools/debugger/gdbsx/gx/gx_main.c | 4 +-
tools/debugger/gdbsx/gx/gx_utils.c | 4 +-
tools/debugger/gdbsx/gx/xg_dummy.c | 4 +-
tools/debugger/gdbsx/xg/Makefile | 2 +
tools/debugger/gdbsx/xg/xg_main.c | 4 +-
tools/debugger/gdbsx/xg/xg_public.h | 4 +-
tools/debugger/kdd/Makefile | 7 +-
tools/examples/Makefile | 3 +
tools/examples/README | 1 -
tools/examples/cpupool | 2 +-
tools/firmware/Makefile | 4 +-
.../firmware/etherboot/patches/build-compare.patch | 19 +
tools/firmware/etherboot/patches/build_fix_4.patch | 225 +
tools/firmware/etherboot/patches/series | 2 +
tools/firmware/hvmloader/32bitbios_support.c | 3 +-
tools/firmware/hvmloader/Makefile | 16 +-
tools/firmware/hvmloader/acpi/Makefile | 9 +-
tools/firmware/hvmloader/acpi/acpi2_0.h | 64 +-
tools/firmware/hvmloader/acpi/build.c | 109 +-
tools/firmware/hvmloader/acpi/dsdt.asl | 27 +-
tools/firmware/hvmloader/acpi/mk_dsdt.c | 41 +-
tools/firmware/hvmloader/acpi/ssdt_pm.asl | 32 +-
tools/firmware/hvmloader/acpi/ssdt_s3.asl | 3 +-
tools/firmware/hvmloader/acpi/ssdt_s4.asl | 3 +-
tools/firmware/hvmloader/acpi/ssdt_tpm.asl | 3 +-
tools/firmware/hvmloader/acpi/static_tables.c | 3 +-
tools/firmware/hvmloader/cacheattr.c | 3 +-
tools/firmware/hvmloader/e820.c | 135 +-
tools/firmware/hvmloader/e820.h | 7 +
tools/firmware/hvmloader/hvmloader.c | 8 +-
tools/firmware/hvmloader/mkhex | 3 +-
tools/firmware/hvmloader/mp_tables.c | 3 +-
tools/firmware/hvmloader/optionroms.c | 3 +-
tools/firmware/hvmloader/ovmf.c | 3 +-
tools/firmware/hvmloader/pci.c | 70 +-
tools/firmware/hvmloader/pir_types.h | 3 +-
tools/firmware/hvmloader/rombios.c | 3 +-
tools/firmware/hvmloader/seabios.c | 3 +-
tools/firmware/hvmloader/smbios.c | 3 +-
tools/firmware/hvmloader/smbios_types.h | 3 +-
tools/firmware/hvmloader/smp.c | 3 +-
tools/firmware/hvmloader/tests.c | 3 +-
tools/firmware/hvmloader/util.c | 32 +-
tools/firmware/hvmloader/util.h | 21 +-
tools/firmware/hvmloader/vnuma.c | 80 +
.../blktapctrl.h => firmware/hvmloader/vnuma.h} | 28 +-
tools/firmware/hvmloader/xenbus.c | 48 +-
tools/firmware/ovmf-makefile | 6 +-
tools/firmware/rombios/32bit/32bitbios.c | 3 +-
tools/firmware/rombios/32bit/Makefile | 3 +
tools/firmware/rombios/32bit/mkhex | 3 +-
tools/firmware/rombios/32bit/pmm.c | 3 +-
tools/firmware/rombios/32bit/tcgbios/Makefile | 3 +
tools/firmware/rombios/32bit/tcgbios/tcgbios.c | 3 +-
tools/firmware/rombios/32bit/tcgbios/tpm_drivers.c | 3 +-
tools/firmware/rombios/32bit/util.c | 3 +-
tools/firmware/rombios/32bitgateway.c | 3 +-
tools/firmware/rombios/Makefile | 3 +
tools/firmware/rombios/apmbios.S | 3 +-
tools/firmware/rombios/rombios.c | 3 +-
tools/firmware/rombios/rombios.h | 3 +-
tools/firmware/rombios/tcgbios.c | 3 +-
tools/firmware/vgabios/COPYING | 3 +-
tools/firmware/vgabios/Makefile | 7 +-
tools/firmware/vgabios/biossums.c | 3 +-
tools/firmware/vgabios/clext.c | 3 +-
tools/firmware/vgabios/vbe.c | 3 +-
tools/firmware/vgabios/vgabios.c | 3 +-
tools/flask/Makefile | 4 +-
tools/flask/policy/Makefile | 29 +-
tools/flask/policy/policy/device_contexts | 32 +
tools/flask/policy/policy/initial_sids | 4 +
tools/flask/policy/policy/modules/xen/xen.if | 8 +-
tools/flask/policy/policy/modules/xen/xen.te | 70 +-
tools/flask/utils/Makefile | 7 +-
tools/hotplug/FreeBSD/Makefile | 5 +-
tools/hotplug/FreeBSD/rc.d/xencommons.in | 16 +-
tools/hotplug/FreeBSD/rc.d/xendriverdomain.in | 48 +
tools/hotplug/FreeBSD/vif-bridge | 4 +-
tools/hotplug/Linux/Makefile | 22 +-
tools/hotplug/Linux/blktap | 94 -
tools/hotplug/Linux/block | 16 +-
tools/hotplug/Linux/block-common.sh | 3 +-
tools/hotplug/Linux/block-drbd-probe | 3 +-
tools/hotplug/Linux/block-tap | 123 +
tools/hotplug/Linux/external-device-migrate | 3 +-
tools/hotplug/Linux/init.d/sysconfig.xencommons.in | 10 +-
tools/hotplug/Linux/init.d/xen-watchdog.in | 2 +-
tools/hotplug/Linux/init.d/xencommons.in | 10 +-
tools/hotplug/Linux/init.d/xendriverdomain.in | 85 +
tools/hotplug/Linux/locking.sh | 3 +-
tools/hotplug/Linux/logging.sh | 3 +-
tools/hotplug/Linux/systemd/Makefile | 3 +
.../systemd/xen-qemu-dom0-disk-backend.service.in | 2 +-
tools/hotplug/Linux/systemd/xenstored.service.in | 1 +
tools/hotplug/Linux/vif-common.sh | 5 +-
tools/hotplug/Linux/xen-backend.rules.in | 15 -
tools/hotplug/Linux/xen-hotplug-common.sh.in | 14 +-
tools/hotplug/Linux/xen-network-common.sh | 3 +-
tools/hotplug/Linux/xen-script-common.sh | 3 +-
tools/hotplug/Linux/xendomains.in | 2 +-
tools/hotplug/Makefile | 4 +-
tools/hotplug/NetBSD/Makefile | 5 +-
tools/hotplug/NetBSD/block | 2 +-
tools/hotplug/NetBSD/rc.d/xen-watchdog | 6 +-
tools/hotplug/NetBSD/rc.d/xencommons.in | 16 +-
tools/hotplug/NetBSD/rc.d/xendomains | 4 +-
tools/hotplug/NetBSD/rc.d/xendriverdomain.in | 49 +
tools/hotplug/NetBSD/vif-bridge | 2 +-
tools/hotplug/NetBSD/vif-ip | 2 +-
tools/hotplug/common/Makefile | 3 +
tools/include/Makefile | 59 +-
tools/include/xen-external/bsd-sys-queue-h-seddery | 2 +
tools/include/xen-foreign/Makefile | 6 +-
tools/include/xen-foreign/reference.size | 4 +-
tools/include/xen-sys/NetBSDRump/evtchn.h | 86 +
tools/include/xen-sys/NetBSDRump/privcmd.h | 81 +-
tools/libfsimage/Rules.mk | 2 +-
tools/libfsimage/common/Makefile | 16 +-
tools/libfsimage/ext2fs-lib/Makefile | 4 +-
tools/libfsimage/ext2fs-lib/ext2fs-lib.c | 3 -
tools/libfsimage/ext2fs/fsys_ext2fs.c | 3 +-
tools/libfsimage/fat/fat.h | 3 +-
tools/libfsimage/fat/fsys_fat.c | 3 +-
tools/libfsimage/iso9660/fsys_iso9660.c | 3 +-
tools/libfsimage/iso9660/iso9660.h | 3 +-
tools/libfsimage/reiserfs/fsys_reiserfs.c | 3 +-
tools/libfsimage/ufs/fsys_ufs.c | 3 +-
tools/libfsimage/xfs/fsys_xfs.c | 3 +-
tools/libfsimage/xfs/xfs.h | 3 +-
tools/libfsimage/zfs/Makefile | 3 +-
tools/libfsimage/zfs/filesys.h | 3 +-
tools/libfsimage/zfs/fsi_zfs.c | 3 +-
tools/libfsimage/zfs/fsi_zfs.h | 3 +-
tools/libfsimage/zfs/fsys_zfs.c | 3 +-
tools/libfsimage/zfs/fsys_zfs.h | 3 +-
tools/libfsimage/zfs/mb_info.h | 3 +-
tools/libfsimage/zfs/shared.h | 3 +-
tools/libfsimage/zfs/zfs-include/dmu.h | 3 +-
tools/libfsimage/zfs/zfs-include/dmu_objset.h | 3 +-
tools/libfsimage/zfs/zfs-include/dnode.h | 3 +-
tools/libfsimage/zfs/zfs-include/dsl_dataset.h | 3 +-
tools/libfsimage/zfs/zfs-include/dsl_dir.h | 3 +-
tools/libfsimage/zfs/zfs-include/sa_impl.h | 3 +-
tools/libfsimage/zfs/zfs-include/spa.h | 3 +-
tools/libfsimage/zfs/zfs-include/uberblock_impl.h | 3 +-
tools/libfsimage/zfs/zfs-include/vdev_impl.h | 3 +-
tools/libfsimage/zfs/zfs-include/zap_impl.h | 3 +-
tools/libfsimage/zfs/zfs-include/zap_leaf.h | 3 +-
tools/libfsimage/zfs/zfs-include/zfs.h | 3 +-
tools/libfsimage/zfs/zfs-include/zfs_acl.h | 3 +-
tools/libfsimage/zfs/zfs-include/zfs_znode.h | 3 +-
tools/libfsimage/zfs/zfs-include/zil.h | 3 +-
tools/libfsimage/zfs/zfs-include/zio.h | 3 +-
tools/libfsimage/zfs/zfs-include/zio_checksum.h | 3 +-
tools/libfsimage/zfs/zfs_fletcher.c | 3 +-
tools/libfsimage/zfs/zfs_lzjb.c | 3 +-
tools/libfsimage/zfs/zfs_sha256.c | 3 +-
tools/libvchan/Makefile | 14 +-
tools/libvchan/init.c | 3 +-
tools/libvchan/io.c | 3 +-
tools/libvchan/libxenvchan.h | 3 +-
tools/libvchan/node-select.c | 17 +-
tools/libvchan/node.c | 3 +-
tools/libxc/Makefile | 50 +-
tools/libxc/include/xc_dom.h | 16 +-
tools/libxc/include/xenctrl.h | 273 +-
tools/libxc/include/xenctrlosdep.h | 3 +-
tools/libxc/include/xenguest.h | 43 +-
tools/libxc/include/xentoollog.h | 3 +-
tools/libxc/xc_altp2m.c | 247 +
tools/libxc/xc_bitops.h | 5 +
tools/libxc/xc_compression.c | 3 +-
tools/libxc/xc_core.c | 3 +-
tools/libxc/xc_core.h | 3 +-
tools/libxc/xc_core_arm.c | 14 +-
tools/libxc/xc_core_arm.h | 3 +-
tools/libxc/xc_core_x86.c | 32 +-
tools/libxc/xc_core_x86.h | 3 +-
tools/libxc/xc_cpu_hotplug.c | 3 +-
tools/libxc/xc_cpufeature.h | 3 +-
tools/libxc/xc_cpuid_x86.c | 6 +-
tools/libxc/xc_cpupool.c | 7 +-
tools/libxc/xc_csched.c | 3 +-
tools/libxc/xc_csched2.c | 3 +-
tools/libxc/xc_dom_arm.c | 14 +-
tools/libxc/xc_dom_armzimageloader.c | 3 +-
tools/libxc/xc_dom_binloader.c | 3 +-
tools/libxc/xc_dom_boot.c | 9 +-
tools/libxc/xc_dom_bzimageloader.c | 23 +-
tools/libxc/xc_dom_compat_linux.c | 9 +-
tools/libxc/xc_dom_core.c | 11 +-
tools/libxc/xc_dom_elfloader.c | 36 +-
tools/libxc/xc_dom_x86.c | 180 +-
tools/libxc/xc_domain.c | 394 +-
tools/libxc/xc_domain_restore.c | 2399 -----
tools/libxc/xc_domain_save.c | 2192 ----
tools/libxc/xc_efi.h | 3 +-
tools/libxc/xc_elf.h | 3 +-
tools/libxc/xc_evtchn.c | 3 +-
tools/libxc/xc_flask.c | 15 +-
tools/libxc/xc_foreign_memory.c | 3 +-
tools/libxc/xc_freebsd.c | 3 +-
tools/libxc/xc_freebsd_osdep.c | 10 +-
tools/libxc/xc_gnttab.c | 3 +-
tools/libxc/xc_hcall_buf.c | 9 +-
tools/libxc/xc_hvm_build_arm.c | 3 +-
tools/libxc/xc_hvm_build_x86.c | 341 +-
tools/libxc/xc_linux.c | 3 +-
tools/libxc/xc_linux_osdep.c | 9 +-
tools/libxc/xc_mem_access.c | 59 +-
tools/libxc/xc_mem_paging.c | 83 +-
tools/libxc/xc_memshr.c | 32 +-
tools/libxc/xc_minios.c | 3 +-
tools/libxc/xc_misc.c | 121 +-
tools/libxc/xc_monitor.c | 145 +
tools/libxc/xc_msr_x86.h | 1 +
tools/libxc/xc_netbsd.c | 3 +-
tools/libxc/xc_nomigrate.c | 3 +-
tools/libxc/xc_offline_page.c | 104 +-
tools/libxc/xc_pagetab.c | 3 +-
tools/libxc/xc_physdev.c | 15 +-
tools/libxc/xc_pm.c | 57 +-
tools/libxc/xc_private.c | 125 +-
tools/libxc/xc_private.h | 36 +-
tools/libxc/xc_psr.c | 155 +-
tools/libxc/xc_resume.c | 3 +-
tools/libxc/xc_rt.c | 3 +-
tools/libxc/xc_sedf.c | 78 -
tools/libxc/xc_solaris.c | 3 +-
tools/libxc/xc_sr_common.c | 114 +
tools/libxc/xc_sr_common.h | 375 +
tools/libxc/xc_sr_common_x86.c | 54 +
tools/libxc/xc_sr_common_x86.h | 26 +
tools/libxc/xc_sr_common_x86_pv.c | 210 +
tools/libxc/xc_sr_common_x86_pv.h | 102 +
tools/libxc/xc_sr_restore.c | 802 ++
tools/libxc/xc_sr_restore_x86_hvm.c | 233 +
tools/libxc/xc_sr_restore_x86_pv.c | 1165 +++
tools/libxc/xc_sr_save.c | 906 ++
tools/libxc/xc_sr_save_x86_hvm.c | 220 +
tools/libxc/xc_sr_save_x86_pv.c | 894 ++
tools/libxc/xc_sr_stream_format.h | 149 +
tools/libxc/xc_suspend.c | 3 +-
tools/libxc/xc_tbuf.c | 36 +-
tools/libxc/xc_tmem.c | 140 +-
tools/libxc/{xc_mem_event.c => xc_vm_event.c} | 62 +-
tools/libxc/xg_private.c | 5 +-
tools/libxc/xg_private.h | 3 +-
tools/libxc/xg_save_restore.h | 253 +-
tools/libxc/xtl_core.c | 12 +-
tools/libxc/xtl_logger_stdio.c | 14 +-
tools/libxl/CODING_STYLE | 1 +
tools/libxl/Makefile | 75 +-
tools/libxl/gentest.py | 64 +-
tools/libxl/gentypes.py | 7 +-
tools/libxl/libxl.c | 1084 +-
tools/libxl/libxl.h | 286 +-
tools/libxl/libxl_aoutils.c | 221 +-
tools/libxl/libxl_arch.h | 35 +
tools/libxl/libxl_arm.c | 328 +-
tools/libxl/libxl_bootloader.c | 33 +-
tools/libxl/libxl_convert_callout.c | 173 +
tools/libxl/libxl_cpuid.c | 13 +-
tools/libxl/libxl_create.c | 358 +-
tools/libxl/libxl_device.c | 137 +-
tools/libxl/libxl_dm.c | 530 +-
tools/libxl/libxl_dom.c | 1167 +--
tools/libxl/libxl_dom_suspend.c | 443 +
tools/libxl/libxl_event.c | 513 +-
tools/libxl/libxl_event.h | 2 +-
tools/libxl/libxl_exec.c | 38 +-
tools/libxl/libxl_flask.c | 13 +-
tools/libxl/libxl_fork.c | 8 +-
tools/libxl/libxl_freebsd.c | 12 +
tools/libxl/libxl_internal.c | 105 +-
tools/libxl/libxl_internal.h | 650 +-
tools/libxl/libxl_json.c | 10 +-
tools/libxl/libxl_libfdt_compat.c | 94 +
tools/libxl/libxl_libfdt_compat.h | 90 +
tools/libxl/libxl_linux.c | 88 +-
tools/libxl/libxl_netbsd.c | 19 +-
tools/libxl/libxl_netbuffer.c | 23 +-
tools/libxl/libxl_no_convert_callout.c | 35 +
tools/libxl/libxl_osdeps.h | 37 +
tools/libxl/libxl_pci.c | 54 +-
tools/libxl/libxl_psr.c | 217 +-
tools/libxl/libxl_qmp.c | 48 +-
tools/libxl/libxl_remus_disk_drbd.c | 14 +-
tools/libxl/libxl_save_callout.c | 145 +-
tools/libxl/libxl_save_helper.c | 93 +-
tools/libxl/libxl_save_msgs_gen.pl | 9 +-
tools/libxl/libxl_sr_stream_format.h | 58 +
tools/libxl/libxl_stream_read.c | 829 ++
tools/libxl/libxl_stream_write.c | 625 ++
tools/libxl/libxl_test_fdevent.c | 79 +
tools/libxl/libxl_test_fdevent.h | 12 +
tools/libxl/libxl_test_timedereg.c | 17 +-
tools/libxl/libxl_types.idl | 102 +-
tools/libxl/libxl_utils.c | 151 +
tools/libxl/libxl_utils.h | 19 +-
tools/libxl/libxl_vnuma.c | 325 +
tools/libxl/libxl_x86.c | 245 +-
tools/libxl/libxlu_cfg.c | 210 +-
tools/libxl/libxlu_cfg_i.h | 14 +-
tools/libxl/libxlu_cfg_y.c | 46 +-
tools/libxl/libxlu_cfg_y.h | 2 +-
tools/libxl/libxlu_cfg_y.y | 14 +-
tools/libxl/libxlu_internal.h | 33 +-
tools/libxl/libxlu_pci.c | 98 +-
tools/libxl/libxlutil.h | 17 +
tools/libxl/test_common.c | 44 +-
tools/libxl/test_common.h | 15 +
tools/libxl/test_fdderegrace.c | 56 +
tools/libxl/xenlight.pc.in.in | 11 +
tools/libxl/xl.c | 2 +-
tools/libxl/xl.h | 6 +-
tools/libxl/xl_cmdimpl.c | 1863 ++--
tools/libxl/xl_cmdtable.c | 46 +-
tools/libxl/xlutil.pc.in.in | 9 +
tools/memshr/Makefile | 5 +-
tools/memshr/bidir-daemon.c | 3 +-
tools/memshr/bidir-daemon.h | 3 +-
tools/memshr/bidir-hash.c | 3 +-
tools/memshr/bidir-hash.h | 3 +-
tools/memshr/bidir-namedefs.h | 3 +-
tools/memshr/interface.c | 3 +-
tools/memshr/memshr-priv.h | 3 +-
tools/memshr/memshr.h | 3 +-
tools/memshr/shm.c | 3 +-
tools/memshr/shm.h | 3 +-
tools/misc/Makefile | 90 +-
tools/misc/gtracestat.c | 4 +-
tools/misc/gtraceview.c | 7 +-
tools/misc/mkrpm | 4 +-
tools/misc/mktarball | 4 +-
tools/misc/sbdf2devicepath | 82 -
tools/misc/xen-hptool.c | 6 +-
tools/misc/xen-mfndump.c | 9 +-
tools/misc/xen-ringwatch | 4 +-
tools/misc/xencov.c | 3 +-
tools/misc/xencov_split | 3 +-
tools/misc/xenpm.c | 106 +-
tools/misc/xenpvnetboot | 4 +-
tools/ocaml/LICENSE | 3 +-
tools/ocaml/Makefile | 3 +
tools/ocaml/Makefile.rules | 2 +
tools/ocaml/libs/Makefile | 3 +
tools/ocaml/libs/xb/op.ml | 6 +-
tools/ocaml/libs/xb/xb.mli | 1 +
tools/ocaml/libs/xb/xs_ring_stubs.c | 4 +-
tools/ocaml/libs/xc/xenctrl_stubs.c | 99 +-
tools/ocaml/libs/xl/genwrap.py | 39 +-
tools/ocaml/libs/xs/xs.ml | 8 +-
tools/ocaml/xenstored/Makefile | 9 +-
tools/ocaml/xenstored/connection.ml | 7 +
tools/ocaml/xenstored/logging.ml | 1 +
tools/ocaml/xenstored/process.ml | 6 +
tools/ocaml/xenstored/systemd.ml | 2 +-
tools/ocaml/xenstored/systemd.mli | 4 +-
tools/ocaml/xenstored/systemd_stubs.c | 7 +-
tools/ocaml/xenstored/utils.ml | 2 +-
tools/ocaml/xenstored/xenstored.ml | 4 +-
tools/pygrub/Makefile | 15 +-
tools/pygrub/examples/ubuntu-14.04-lts.grub2 | 234 +
tools/pygrub/src/ExtLinuxConf.py | 3 +-
tools/pygrub/src/GrubConf.py | 3 +-
tools/pygrub/src/pygrub | 3 +-
tools/python/Makefile | 14 +-
tools/python/scripts/convert-legacy-stream | 730 ++
tools/python/scripts/verify-stream-v2 | 174 +
tools/python/setup.py | 1 +
tools/python/xen/lowlevel/xc/xc.c | 205 +-
tools/python/xen/lowlevel/xl/xl.c | 3 +-
tools/python/xen/lowlevel/xs/xs.c | 3 +-
tools/python/xen/migration/__init__.py | 0
tools/python/xen/migration/legacy.py | 315 +
tools/python/xen/migration/libxc.py | 446 +
tools/python/xen/migration/libxl.py | 227 +
tools/python/xen/migration/public.py | 21 +
tools/python/xen/migration/tests.py | 54 +
tools/python/xen/migration/verify.py | 37 +
tools/python/xen/migration/xl.py | 12 +
tools/tests/mce-test/Makefile | 5 +-
tools/tests/mce-test/cases/srao_llc/dom0/cases.sh | 3 +-
tools/tests/mce-test/cases/srao_llc/guest/cases.sh | 3 +-
tools/tests/mce-test/cases/srao_llc/xen/cases.sh | 3 +-
tools/tests/mce-test/cases/srao_mem/dom0/cases.sh | 3 +-
tools/tests/mce-test/cases/srao_mem/guest/cases.sh | 3 +-
tools/tests/mce-test/cases/srao_mem/xen/cases.sh | 3 +-
tools/tests/mce-test/cases/ucna_llc/dom0/cases.sh | 3 +-
tools/tests/mce-test/cases/ucna_llc/guest/cases.sh | 3 +-
tools/tests/mce-test/cases/ucna_llc/xen/cases.sh | 3 +-
tools/tests/mce-test/config/setup.conf | 3 +-
tools/tests/mce-test/lib/xen-mceinj-tool.sh | 3 +-
tools/tests/mce-test/tools/Makefile | 5 +-
tools/tests/mce-test/tools/xen-mceinj.c | 3 +-
tools/tests/mem-sharing/Makefile | 3 +
tools/tests/mem-sharing/memshrtool.c | 12 +-
tools/tests/regression/Makefile | 4 +-
tools/tests/utests/run_all_tests.py | 3 +-
tools/tests/vhpet/Makefile | 3 +
tools/tests/vhpet/emul.h | 8 +-
tools/tests/vhpet/main.c | 6 +-
tools/tests/x86_emulator/Makefile | 3 +
tools/tests/x86_emulator/blowfish.c | 6 +-
tools/tests/x86_emulator/test_x86_emulator.c | 11 +-
tools/tests/x86_emulator/x86_emulate.c | 4 +
tools/tests/xen-access/Makefile | 7 +-
tools/tests/xen-access/xen-access.c | 461 +-
tools/xcutils/Makefile | 3 +
tools/xenbackendd/Makefile | 7 +-
tools/xenbackendd/xenbackendd.c | 3 +-
tools/xenmon/COPYING | 3 +-
tools/xenmon/Makefile | 13 +-
tools/xenmon/setmask.c | 3 +-
tools/xenmon/xenbaked.c | 3 +-
tools/xenmon/xenbaked.h | 3 +-
tools/xenmon/xenmon.py | 3 +-
tools/xenpaging/Makefile | 4 +-
tools/xenpaging/file_ops.c | 3 +-
tools/xenpaging/file_ops.h | 3 +-
tools/xenpaging/pagein.c | 2 +-
tools/xenpaging/policy.h | 3 +-
tools/xenpaging/policy_default.c | 3 +-
tools/xenpaging/xenpaging.c | 158 +-
tools/xenpaging/xenpaging.h | 11 +-
tools/xenpmd/Makefile | 7 +-
tools/xenpmd/xenpmd.c | 3 +-
tools/xenstat/Makefile | 4 +-
tools/xenstat/libxenstat/COPYING | 3 +-
tools/xenstat/libxenstat/Makefile | 17 +-
tools/xenstat/libxenstat/src/xenstat.c | 32 +-
tools/xenstat/libxenstat/src/xenstat_linux.c | 17 +-
tools/xenstat/libxenstat/src/xenstat_priv.h | 2 +
tools/xenstat/libxenstat/src/xenstat_qmp.c | 448 +
tools/xenstat/xentop/Makefile | 17 +-
tools/xenstat/xentop/xentop.1 | 104 -
tools/xenstat/xentop/xentop.c | 7 +-
tools/xenstore/COPYING | 3 +-
tools/xenstore/Makefile | 50 +-
tools/xenstore/include/xenstore.h | 30 +-
tools/xenstore/include/xenstore_lib.h | 3 +-
tools/xenstore/talloc.c | 9 +-
tools/xenstore/talloc.h | 3 +-
tools/xenstore/tdb.c | 3 +-
tools/xenstore/tdb.h | 3 +-
tools/xenstore/xenstore_client.c | 1 +
tools/xenstore/xenstored_core.c | 102 +-
tools/xenstore/xenstored_core.h | 3 +-
tools/xenstore/xenstored_domain.c | 7 +-
tools/xenstore/xenstored_domain.h | 3 +-
tools/xenstore/xenstored_minios.c | 3 +-
tools/xenstore/xenstored_posix.c | 3 +-
tools/xenstore/xenstored_transaction.c | 3 +-
tools/xenstore/xenstored_transaction.h | 3 +-
tools/xenstore/xenstored_watch.c | 3 +-
tools/xenstore/xenstored_watch.h | 3 +-
tools/xenstore/xs.c | 3 +-
tools/xenstore/xs_lib.c | 5 +-
tools/xenstore/xs_tdb_dump.c | 12 +-
tools/xentrace/Makefile | 32 +-
tools/xentrace/analyze.h | 107 +
tools/xentrace/formats | 8 +-
tools/xentrace/mread.c | 160 +
tools/xentrace/mread.h | 18 +
tools/xentrace/pv.h | 41 +
tools/xentrace/xenalyze.c | 10407 +++++++++++++++++++
tools/xentrace/xentrace.c | 197 +-
unmodified_drivers/linux-2.6/platform-pci/evtchn.c | 6 +-
.../linux-2.6/platform-pci/platform-pci.c | 3 +-
.../linux-2.6/platform-pci/platform-pci.h | 3 +-
.../linux-2.6/platform-pci/xen_support.c | 3 +-
xen/COPYING | 3 +-
xen/Makefile | 18 +-
xen/Rules.mk | 5 +-
xen/arch/arm/Makefile | 3 +-
xen/arch/arm/README.LinuxPrimitives | 28 -
xen/arch/arm/Rules.mk | 103 +-
xen/arch/arm/arm32/debug-8250.inc | 6 +-
xen/arch/arm/arm32/debug-scif.inc | 49 +
xen/arch/arm/arm32/lib/lib1funcs.S | 4 +-
xen/arch/arm/arm32/lib/lshrdi3.S | 4 +-
xen/arch/arm/arm64/debug-cadence.inc | 45 +
xen/arch/arm/arm64/head.S | 3 +-
xen/arch/arm/arm64/smpboot.c | 2 +-
xen/arch/arm/arm64/traps.c | 14 +-
xen/arch/arm/bootfdt.c | 14 +-
xen/arch/arm/decode.c | 6 +-
xen/arch/arm/device.c | 31 +-
xen/arch/arm/domain.c | 82 +-
xen/arch/arm/domain_build.c | 393 +-
xen/arch/arm/domctl.c | 103 +-
xen/arch/arm/efi/efi-boot.h | 44 +-
xen/arch/arm/{gic-v2.c => gic-hip04.c} | 473 +-
xen/arch/arm/gic-v2.c | 248 +-
xen/arch/arm/gic-v3.c | 345 +-
xen/arch/arm/gic.c | 118 +-
xen/arch/arm/guestcopy.c | 4 +-
xen/arch/arm/irq.c | 179 +-
xen/arch/arm/kernel.c | 2 +-
xen/arch/arm/kernel.h | 4 +
xen/arch/arm/mm.c | 50 +-
xen/arch/arm/p2m.c | 618 +-
xen/arch/arm/platform.c | 38 +-
xen/arch/arm/platforms/Makefile | 1 +
xen/arch/arm/platforms/midway.c | 3 -
xen/arch/arm/platforms/omap5.c | 18 -
xen/arch/arm/platforms/rcar2.c | 68 +
xen/arch/arm/platforms/seattle.c | 3 -
xen/arch/arm/platforms/sunxi.c | 3 -
xen/arch/arm/platforms/vexpress.c | 2 -
xen/arch/arm/platforms/xgene-storm.c | 172 +-
xen/arch/arm/psci.c | 2 +-
xen/arch/arm/setup.c | 26 +-
xen/arch/arm/shutdown.c | 4 +
xen/arch/arm/smpboot.c | 27 +-
xen/arch/arm/time.c | 100 +-
xen/arch/arm/traps.c | 839 +-
xen/arch/arm/vgic-v2.c | 155 +-
xen/arch/arm/vgic-v3.c | 624 +-
xen/arch/arm/vgic.c | 199 +-
xen/arch/arm/vpsci.c | 8 +-
xen/arch/arm/vtimer.c | 140 +-
xen/arch/arm/vtimer.h | 3 +-
xen/arch/arm/vuart.c | 5 +
xen/arch/x86/Makefile | 2 +
xen/arch/x86/Rules.mk | 16 +-
xen/arch/x86/acpi/boot.c | 4 +-
xen/arch/x86/acpi/cpu_idle.c | 168 +-
xen/arch/x86/acpi/cpufreq/cpufreq.c | 3 +-
xen/arch/x86/acpi/cpufreq/powernow.c | 3 +-
xen/arch/x86/acpi/cpuidle_menu.c | 3 +-
xen/arch/x86/acpi/lib.c | 3 +-
xen/arch/x86/alternative.c | 3 +-
xen/arch/x86/apic.c | 40 +-
xen/arch/x86/bitops.c | 2 +-
xen/arch/x86/boot/head.S | 28 +-
xen/arch/x86/boot/reloc.c | 1 -
xen/arch/x86/boot/x86_64.S | 20 +
xen/arch/x86/compat.c | 25 +-
xen/arch/x86/cpu/Makefile | 1 +
xen/arch/x86/cpu/amd.c | 2 +-
xen/arch/x86/cpu/centaur.c | 2 +-
xen/arch/x86/cpu/common.c | 68 +-
xen/arch/x86/cpu/cpu.h | 2 +-
xen/arch/x86/cpu/intel.c | 2 +-
xen/arch/x86/cpu/mcheck/amd_nonfatal.c | 3 +-
xen/arch/x86/cpu/mcheck/mce-apei.c | 3 +-
xen/arch/x86/cpu/mcheck/mce.c | 10 +-
xen/arch/x86/cpu/mcheck/mce_amd.c | 3 +-
xen/arch/x86/cpu/mcheck/mce_intel.c | 22 +-
xen/arch/x86/cpu/mcheck/mce_quirks.h | 3 +-
xen/arch/x86/cpu/mcheck/mctelem.c | 3 +-
xen/arch/x86/cpu/mcheck/vmce.c | 3 +-
xen/arch/x86/cpu/mcheck/x86_mca.h | 5 +-
xen/arch/x86/cpu/mtrr/generic.c | 12 +
xen/arch/x86/cpu/mtrr/main.c | 4 +-
xen/arch/x86/cpu/mwait-idle.c | 91 +-
xen/arch/x86/cpu/vpmu.c | 817 ++
xen/arch/x86/{hvm/svm/vpmu.c => cpu/vpmu_amd.c} | 347 +-
.../x86/{hvm/vmx/vpmu_core2.c => cpu/vpmu_intel.c} | 888 +-
xen/arch/x86/crash.c | 9 +-
xen/arch/x86/debug.c | 59 +-
xen/arch/x86/delay.c | 4 +-
xen/arch/x86/dmi_scan.c | 344 +-
xen/arch/x86/domain.c | 355 +-
xen/arch/x86/domain_build.c | 282 +-
xen/arch/x86/domain_page.c | 59 +-
xen/arch/x86/domctl.c | 679 +-
xen/arch/x86/e820.c | 47 +-
xen/arch/x86/efi/efi-boot.h | 25 +-
xen/arch/x86/efi/runtime.h | 7 +
xen/arch/x86/efi/stub.c | 5 +-
xen/arch/x86/gdbstub.c | 3 +-
xen/arch/x86/genapic/x2apic.c | 3 +-
xen/arch/x86/hpet.c | 9 +-
xen/arch/x86/hvm/Makefile | 2 +-
xen/arch/x86/hvm/asid.c | 3 +-
xen/arch/x86/hvm/emulate.c | 1011 +-
xen/arch/x86/hvm/event.c | 189 +
xen/arch/x86/hvm/hpet.c | 35 +-
xen/arch/x86/hvm/hvm.c | 1992 ++--
xen/arch/x86/hvm/i8254.c | 8 +-
xen/arch/x86/hvm/intercept.c | 533 +-
xen/arch/x86/hvm/io.c | 286 +-
xen/arch/x86/hvm/irq.c | 11 +-
xen/arch/x86/hvm/mtrr.c | 9 +-
xen/arch/x86/hvm/nestedhvm.c | 3 +-
xen/arch/x86/hvm/pmtimer.c | 17 +-
xen/arch/x86/hvm/quirks.c | 3 +-
xen/arch/x86/hvm/rtc.c | 2 +-
xen/arch/x86/hvm/save.c | 7 +-
xen/arch/x86/hvm/stdvga.c | 217 +-
xen/arch/x86/hvm/svm/Makefile | 1 -
xen/arch/x86/hvm/svm/asid.c | 3 +-
xen/arch/x86/hvm/svm/emulate.c | 5 +-
xen/arch/x86/hvm/svm/entry.S | 3 +-
xen/arch/x86/hvm/svm/intr.c | 3 +-
xen/arch/x86/hvm/svm/nestedsvm.c | 57 +-
xen/arch/x86/hvm/svm/svm.c | 31 +-
xen/arch/x86/hvm/svm/svmdebug.c | 3 +-
xen/arch/x86/hvm/svm/vmcb.c | 5 +-
xen/arch/x86/hvm/vioapic.c | 75 +-
xen/arch/x86/hvm/viridian.c | 115 +-
xen/arch/x86/hvm/vlapic.c | 175 +-
xen/arch/x86/hvm/vmsi.c | 179 +-
xen/arch/x86/hvm/vmx/Makefile | 1 -
xen/arch/x86/hvm/vmx/entry.S | 3 +-
xen/arch/x86/hvm/vmx/intr.c | 3 +-
xen/arch/x86/hvm/vmx/realmode.c | 29 +-
xen/arch/x86/hvm/vmx/vmcs.c | 620 +-
xen/arch/x86/hvm/vmx/vmx.c | 409 +-
xen/arch/x86/hvm/vmx/vvmx.c | 131 +-
xen/arch/x86/hvm/vpic.c | 6 +-
xen/arch/x86/hvm/vpmu.c | 299 -
xen/arch/x86/hvm/vpt.c | 3 +-
xen/arch/x86/i387.c | 10 +-
xen/arch/x86/io_apic.c | 34 +-
xen/arch/x86/irq.c | 49 +-
xen/arch/x86/microcode.c | 6 +-
xen/arch/x86/microcode_amd.c | 57 +-
xen/arch/x86/microcode_intel.c | 15 +-
xen/arch/x86/mm.c | 355 +-
xen/arch/x86/mm/Makefile | 1 +
xen/arch/x86/mm/altp2m.c | 76 +
xen/arch/x86/mm/guest_walk.c | 9 +-
xen/arch/x86/mm/hap/guest_walk.c | 9 +-
xen/arch/x86/mm/hap/hap.c | 112 +-
xen/arch/x86/mm/hap/nested_ept.c | 7 +-
xen/arch/x86/mm/hap/nested_hap.c | 7 +-
xen/arch/x86/mm/hap/private.h | 3 +-
xen/arch/x86/mm/mem_paging.c | 64 +-
xen/arch/x86/mm/mem_sharing.c | 187 +-
xen/arch/x86/mm/mm-locks.h | 65 +-
xen/arch/x86/mm/p2m-ept.c | 173 +-
xen/arch/x86/mm/p2m-pod.c | 27 +-
xen/arch/x86/mm/p2m-pt.c | 158 +-
xen/arch/x86/mm/p2m.c | 1079 +-
xen/arch/x86/mm/paging.c | 105 +-
xen/arch/x86/mm/shadow/Makefile | 6 +-
xen/arch/x86/mm/shadow/common.c | 996 +-
xen/arch/x86/mm/shadow/multi.c | 1431 +--
xen/arch/x86/mm/shadow/multi.h | 67 +-
xen/arch/x86/mm/shadow/none.c | 78 +
xen/arch/x86/mm/shadow/private.h | 316 +-
xen/arch/x86/mm/shadow/types.h | 45 +-
xen/arch/x86/monitor.c | 217 +
xen/arch/x86/mpparse.c | 19 +-
xen/arch/x86/msi.c | 474 +-
xen/arch/x86/nmi.c | 4 +-
xen/arch/x86/numa.c | 86 +-
xen/arch/x86/oprofile/op_model_ppro.c | 8 +-
xen/arch/x86/pci.c | 25 +
xen/arch/x86/physdev.c | 112 +-
xen/arch/x86/platform_hypercall.c | 92 +-
xen/arch/x86/psr.c | 494 +-
xen/arch/x86/setup.c | 150 +-
xen/arch/x86/shutdown.c | 18 +-
xen/arch/x86/smp.c | 2 +-
xen/arch/x86/smpboot.c | 147 +-
xen/arch/x86/srat.c | 194 +-
xen/arch/x86/string.c | 2 +-
xen/arch/x86/sysctl.c | 24 +-
xen/arch/x86/tboot.c | 23 +-
xen/arch/x86/time.c | 71 +-
xen/arch/x86/trace.c | 12 +-
xen/arch/x86/traps.c | 413 +-
xen/arch/x86/vm_event.c | 117 +
xen/arch/x86/x86_64/acpi_mmcfg.c | 3 +-
xen/arch/x86/x86_64/compat/entry.S | 19 +-
xen/arch/x86/x86_64/compat/mm.c | 25 +-
xen/arch/x86/x86_64/compat/traps.c | 4 +-
xen/arch/x86/x86_64/cpu_idle.c | 3 +-
xen/arch/x86/x86_64/cpufreq.c | 3 +-
xen/arch/x86/x86_64/entry.S | 29 +-
xen/arch/x86/x86_64/gdbstub.c | 3 +-
xen/arch/x86/x86_64/mm.c | 107 +-
xen/arch/x86/x86_64/mmconfig.h | 3 +-
xen/arch/x86/x86_64/mmconfig_64.c | 63 +-
xen/arch/x86/x86_64/traps.c | 123 +-
xen/arch/x86/x86_emulate.c | 16 +
xen/arch/x86/x86_emulate/x86_emulate.c | 187 +-
xen/arch/x86/x86_emulate/x86_emulate.h | 42 +-
xen/arch/x86/xen.lds.S | 11 +-
xen/common/Makefile | 22 +-
xen/common/compat/domain.c | 7 +-
xen/common/compat/kernel.c | 5 +
xen/common/compat/memory.c | 65 +
xen/common/compat/tmem_xen.c | 4 +-
xen/common/core_parking.c | 20 +-
xen/common/cpu.c | 6 +-
xen/common/cpupool.c | 131 +-
xen/common/device_tree.c | 405 +-
xen/common/domain.c | 78 +-
xen/common/domctl.c | 218 +-
xen/common/earlycpio.c | 39 +-
xen/common/efi/boot.c | 101 +-
xen/common/efi/runtime.c | 11 +-
xen/common/event_channel.c | 247 +-
xen/common/event_fifo.c | 6 +-
xen/common/gdbstub.c | 3 +-
xen/common/grant_table.c | 1181 ++-
xen/common/guestcopy.c | 31 +
xen/common/hvm/save.c | 5 +-
xen/common/kernel.c | 28 +-
xen/common/kexec.c | 42 +-
xen/common/keyhandler.c | 6 +-
xen/common/kimage.c | 27 +-
xen/common/lib.c | 4 +
xen/common/libelf/libelf-dominfo.c | 9 +-
xen/common/libelf/libelf-loader.c | 3 +-
xen/common/libelf/libelf-private.h | 7 +-
xen/common/libelf/libelf-tools.c | 3 +-
xen/common/libfdt/fdt.c | 4 +-
xen/common/libfdt/fdt_empty_tree.c | 4 +-
xen/common/libfdt/fdt_ro.c | 4 +-
xen/common/libfdt/fdt_rw.c | 4 +-
xen/common/libfdt/fdt_strerror.c | 4 +-
xen/common/libfdt/fdt_sw.c | 4 +-
xen/common/libfdt/fdt_wip.c | 4 +-
xen/common/libfdt/libfdt_internal.h | 4 +-
xen/common/mem_access.c | 62 +-
xen/common/mem_event.c | 742 --
xen/common/memory.c | 200 +-
xen/common/page_alloc.c | 71 +-
xen/common/pdx.c | 3 +-
xen/common/perfc.c | 4 +-
xen/common/preempt.c | 3 +-
xen/common/radix-tree.c | 3 +-
xen/common/random.c | 10 +
xen/common/rangeset.c | 16 +-
xen/common/rbtree.c | 3 +-
xen/common/rcupdate.c | 3 +-
xen/common/sched_arinc653.c | 6 +-
xen/common/sched_credit.c | 116 +-
xen/common/sched_credit2.c | 112 +-
xen/common/sched_rt.c | 130 +-
xen/common/sched_sedf.c | 1541 ---
xen/common/schedule.c | 301 +-
xen/common/shutdown.c | 24 +-
xen/common/softirq.c | 6 +-
xen/common/spinlock.c | 144 +-
xen/common/stop_machine.c | 3 +-
xen/common/symbols.c | 56 +-
xen/common/sysctl.c | 212 +-
xen/common/time.c | 3 +-
xen/common/tmem.c | 477 +-
xen/common/tmem_xen.c | 6 +-
xen/common/unlzma.c | 3 +-
xen/common/unlzo.c | 3 +-
xen/common/vm_event.c | 772 ++
xen/common/vmap.c | 79 +-
xen/common/vsprintf.c | 11 +-
xen/common/wait.c | 3 +-
xen/common/xencomm.c | 621 --
xen/common/xenoprof.c | 2 +-
xen/common/xmalloc_tlsf.c | 6 +-
xen/common/xz/dec_lzma2.c | 4 +
xen/drivers/acpi/apei/apei-base.c | 3 +-
xen/drivers/acpi/apei/apei-io.c | 3 +-
xen/drivers/acpi/apei/erst.c | 3 +-
xen/drivers/acpi/apei/hest.c | 3 +-
xen/drivers/acpi/numa.c | 3 +-
xen/drivers/acpi/osl.c | 8 +-
xen/drivers/acpi/pmstat.c | 3 +-
xen/drivers/acpi/tables.c | 3 +-
xen/drivers/char/Makefile | 2 +
xen/drivers/char/cadence-uart.c | 224 +
xen/drivers/char/console.c | 15 -
xen/drivers/char/dt-uart.c | 43 +-
xen/drivers/char/exynos4210-uart.c | 8 +-
xen/drivers/char/ns16550.c | 12 +-
xen/drivers/char/omap-uart.c | 8 +-
xen/drivers/char/pl011.c | 8 +-
xen/drivers/char/scif-uart.c | 367 +
xen/drivers/cpufreq/cpufreq.c | 3 +-
xen/drivers/passthrough/amd/iommu_acpi.c | 3 +-
xen/drivers/passthrough/amd/iommu_cmd.c | 3 +-
xen/drivers/passthrough/amd/iommu_detect.c | 3 +-
xen/drivers/passthrough/amd/iommu_guest.c | 51 +-
xen/drivers/passthrough/amd/iommu_init.c | 7 +-
xen/drivers/passthrough/amd/iommu_intr.c | 13 +-
xen/drivers/passthrough/amd/iommu_map.c | 35 +-
xen/drivers/passthrough/amd/pci_amd_iommu.c | 6 +-
xen/drivers/passthrough/arm/iommu.c | 7 +-
xen/drivers/passthrough/arm/smmu.c | 4096 +++++---
xen/drivers/passthrough/ats.h | 3 +-
xen/drivers/passthrough/device_tree.c | 138 +-
xen/drivers/passthrough/io.c | 297 +-
xen/drivers/passthrough/iommu.c | 53 +-
xen/drivers/passthrough/pci.c | 148 +-
xen/drivers/passthrough/vtd/dmar.c | 79 +-
xen/drivers/passthrough/vtd/dmar.h | 4 +-
xen/drivers/passthrough/vtd/extern.h | 4 +-
xen/drivers/passthrough/vtd/intremap.c | 3 +-
xen/drivers/passthrough/vtd/iommu.c | 124 +-
xen/drivers/passthrough/vtd/iommu.h | 19 +-
xen/drivers/passthrough/vtd/qinval.c | 3 +-
xen/drivers/passthrough/vtd/quirks.c | 16 +-
xen/drivers/passthrough/vtd/utils.c | 10 +-
xen/drivers/passthrough/vtd/vtd.h | 3 +-
xen/drivers/passthrough/vtd/x86/ats.c | 3 +-
xen/drivers/passthrough/vtd/x86/vtd.c | 5 +-
xen/drivers/passthrough/x86/ats.c | 3 +-
xen/drivers/passthrough/x86/iommu.c | 25 +-
xen/include/Makefile | 32 +-
xen/include/asm-arm/arm32/bitops.h | 2 +
xen/include/asm-arm/arm32/page.h | 7 +-
xen/include/asm-arm/arm32/spinlock.h | 66 -
xen/include/asm-arm/arm64/bitops.h | 11 +
xen/include/asm-arm/arm64/page.h | 7 +-
xen/include/asm-arm/arm64/spinlock.h | 63 -
xen/include/asm-arm/atomic.h | 26 +
xen/include/asm-arm/bitops.h | 38 +-
xen/include/asm-arm/cadence-uart.h | 55 +
xen/include/asm-arm/config.h | 9 +-
xen/include/asm-arm/cpregs.h | 10 +-
xen/include/asm-arm/device.h | 48 +-
xen/include/asm-arm/domain.h | 64 +-
xen/include/asm-arm/gic.h | 48 +-
xen/include/asm-arm/gic_v3_defs.h | 11 +-
xen/include/asm-arm/grant_table.h | 3 +-
xen/include/asm-arm/hypercall.h | 2 +-
xen/include/asm-arm/iommu.h | 3 +-
xen/include/asm-arm/irq.h | 11 +-
xen/include/asm-arm/mm.h | 9 +-
xen/include/asm-arm/monitor.h | 33 +
xen/include/asm-arm/numa.h | 4 +-
xen/include/asm-arm/p2m.h | 61 +-
xen/include/asm-arm/page.h | 9 +-
xen/include/asm-arm/perfc.h | 21 +
xen/include/asm-arm/perfc_defn.h | 83 +
xen/include/asm-arm/platform.h | 18 +-
xen/include/asm-arm/processor.h | 23 +-
xen/include/asm-arm/scif-uart.h | 107 +
xen/include/asm-arm/setup.h | 2 -
xen/include/asm-arm/spinlock.h | 19 +-
xen/include/asm-arm/sysregs.h | 12 +-
xen/include/asm-arm/system.h | 5 +
xen/include/asm-arm/time.h | 8 +
xen/include/asm-arm/vgic.h | 63 +-
xen/include/asm-arm/vm_event.h | 50 +
xen/include/asm-x86/acpi.h | 3 +-
xen/include/asm-x86/alternative.h | 21 +
xen/include/asm-x86/altp2m.h | 37 +
xen/include/asm-x86/amd-iommu.h | 3 +-
xen/include/asm-x86/apic.h | 1 -
xen/include/asm-x86/asm_defns.h | 13 +-
xen/include/asm-x86/atomic.h | 69 +-
xen/include/asm-x86/bitops.h | 126 +-
xen/include/asm-x86/bug.h | 50 +-
xen/include/asm-x86/config.h | 51 +-
xen/include/asm-x86/cpufeature.h | 13 +-
xen/include/asm-x86/cpuidle.h | 2 +
xen/include/asm-x86/current.h | 33 +-
xen/include/asm-x86/debugger.h | 7 +-
xen/include/asm-x86/debugreg.h | 2 +
xen/include/asm-x86/desc.h | 12 +-
xen/include/asm-x86/device.h | 25 +
xen/include/asm-x86/domain.h | 121 +-
xen/include/asm-x86/fixmap.h | 2 +
xen/include/asm-x86/guest_pt.h | 12 +-
xen/include/asm-x86/hap.h | 20 +-
xen/include/asm-x86/hpet.h | 1 +
xen/include/asm-x86/hvm/asid.h | 3 +-
xen/include/asm-x86/hvm/domain.h | 10 +-
xen/include/asm-x86/hvm/emulate.h | 27 +-
xen/include/asm-x86/hvm/event.h | 45 +
xen/include/asm-x86/hvm/hvm.h | 121 +-
xen/include/asm-x86/hvm/io.h | 151 +-
xen/include/asm-x86/hvm/iommu.h | 2 +
xen/include/asm-x86/hvm/irq.h | 3 +-
xen/include/asm-x86/hvm/nestedhvm.h | 3 +-
xen/include/asm-x86/hvm/support.h | 12 +-
xen/include/asm-x86/hvm/svm/amd-iommu-defs.h | 5 +-
xen/include/asm-x86/hvm/svm/amd-iommu-proto.h | 3 +-
xen/include/asm-x86/hvm/svm/asid.h | 3 +-
xen/include/asm-x86/hvm/svm/emulate.h | 3 +-
xen/include/asm-x86/hvm/svm/intr.h | 3 +-
xen/include/asm-x86/hvm/svm/nestedsvm.h | 14 +-
xen/include/asm-x86/hvm/svm/svm.h | 3 +-
xen/include/asm-x86/hvm/svm/svmdebug.h | 3 +-
xen/include/asm-x86/hvm/svm/vmcb.h | 3 +-
xen/include/asm-x86/hvm/vcpu.h | 70 +-
xen/include/asm-x86/hvm/vioapic.h | 6 +-
xen/include/asm-x86/hvm/viridian.h | 25 +
xen/include/asm-x86/hvm/vlapic.h | 5 +-
xen/include/asm-x86/hvm/vmx/vmcs.h | 101 +-
xen/include/asm-x86/hvm/vmx/vmx.h | 26 +-
xen/include/asm-x86/hvm/vmx/vpmu_core2.h | 51 -
xen/include/asm-x86/hvm/vmx/vvmx.h | 9 +-
xen/include/asm-x86/hvm/vpt.h | 3 +-
xen/include/asm-x86/iommu.h | 3 +-
xen/include/asm-x86/irq.h | 3 +-
xen/include/asm-x86/ldt.h | 2 +-
xen/include/asm-x86/mem_paging.h | 8 +-
xen/include/asm-x86/mem_sharing.h | 7 +-
xen/include/asm-x86/microcode.h | 9 +-
xen/include/asm-x86/mm.h | 49 +-
xen/include/asm-x86/monitor.h | 31 +
xen/include/asm-x86/msi.h | 22 +-
xen/include/asm-x86/msr-index.h | 2 +
xen/include/asm-x86/msr.h | 15 +-
xen/include/asm-x86/mtrr.h | 3 +-
xen/include/asm-x86/multicall.h | 10 +-
xen/include/asm-x86/numa.h | 26 +-
xen/include/asm-x86/p2m.h | 203 +-
xen/include/asm-x86/page.h | 24 +-
xen/include/asm-x86/paging.h | 14 +-
xen/include/asm-x86/pci.h | 13 +
xen/include/asm-x86/perfc_defn.h | 2 +
xen/include/asm-x86/processor.h | 87 +-
xen/include/asm-x86/psr.h | 13 +-
xen/include/asm-x86/setup.h | 4 +-
xen/include/asm-x86/shadow.h | 54 +-
xen/include/asm-x86/smp.h | 14 +-
xen/include/asm-x86/softirq.h | 3 +-
xen/include/asm-x86/spinlock.h | 31 +-
xen/include/asm-x86/system.h | 73 +-
xen/include/asm-x86/time.h | 4 +-
xen/include/asm-x86/traps.h | 7 +-
xen/include/asm-x86/vm_event.h | 33 +
xen/include/asm-x86/{hvm => }/vpmu.h | 105 +-
xen/include/asm-x86/x86_64/page.h | 16 +-
xen/include/asm-x86/xenoprof.h | 3 +-
xen/include/efi/efidef.h | 6 +-
xen/include/public/arch-arm.h | 60 +-
xen/include/public/arch-x86/cpuid.h | 5 +-
xen/include/public/arch-x86/hvm/save.h | 11 +
xen/include/public/arch-x86/pmu.h | 167 +
xen/include/public/arch-x86/xen-x86_32.h | 1 +
xen/include/public/arch-x86/xen.h | 50 +-
xen/include/public/domctl.h | 273 +-
xen/include/public/errno.h | 95 +
xen/include/public/features.h | 3 +
xen/include/public/grant_table.h | 10 +-
xen/include/public/hvm/e820.h | 3 +-
xen/include/public/hvm/hvm_info_table.h | 2 +
xen/include/public/hvm/hvm_op.h | 120 +
xen/include/public/hvm/hvm_xs_strings.h | 2 +
xen/include/public/hvm/ioreq.h | 13 +-
xen/include/public/hvm/params.h | 25 +-
xen/include/public/io/blkif.h | 6 +
xen/include/public/io/libxenvchan.h | 3 +-
xen/include/public/io/netif.h | 160 +-
xen/include/public/io/protocols.h | 2 +
xen/include/public/io/ring.h | 4 +-
xen/include/public/io/usbif.h | 140 +-
xen/include/public/mem_event.h | 134 -
xen/include/public/memory.h | 66 +-
xen/include/public/physdev.h | 12 +-
xen/include/public/platform.h | 89 +-
xen/include/public/pmu.h | 133 +
xen/include/public/sysctl.h | 213 +-
xen/include/public/tmem.h | 58 +-
xen/include/public/trace.h | 2 +-
xen/include/public/vcpu.h | 2 +-
xen/include/public/vm_event.h | 269 +
xen/include/public/xen-compat.h | 2 +-
xen/include/public/xen.h | 44 +-
xen/include/public/xsm/flask_op.h | 11 +
xen/include/xen/acpi.h | 3 +-
xen/include/xen/bitops.h | 40 +-
xen/include/xen/config.h | 18 +-
xen/include/xen/cper.h | 3 +-
xen/include/xen/cpuidle.h | 4 +-
xen/include/xen/cpumask.h | 24 +-
xen/include/xen/device_tree.h | 75 +-
xen/include/xen/dmi.h | 4 +-
xen/include/xen/domain.h | 6 +-
xen/include/xen/domain_page.h | 45 +-
xen/include/xen/earlycpio.h | 1 +
xen/include/xen/efi.h | 3 +-
xen/include/xen/errno.h | 143 +-
xen/include/xen/event.h | 16 +-
xen/include/xen/gdbstub.h | 3 +-
xen/include/xen/grant_table.h | 28 +-
xen/include/xen/guest_access.h | 5 +
xen/include/xen/hvm/iommu.h | 3 +-
xen/include/xen/hvm/irq.h | 7 +-
xen/include/xen/hvm/save.h | 3 +-
xen/include/xen/hypercall.h | 13 +-
xen/include/xen/inttypes.h | 4 +-
xen/include/xen/iommu.h | 41 +-
xen/include/xen/irq.h | 4 +
xen/include/xen/kexec.h | 4 +-
xen/include/xen/lib.h | 38 +-
xen/include/xen/libfdt/fdt.h | 4 +-
xen/include/xen/libfdt/libfdt.h | 4 +-
xen/include/xen/list.h | 60 +
xen/include/xen/mem_access.h | 21 +-
xen/include/xen/mem_event.h | 143 -
xen/include/xen/mm.h | 121 +-
xen/include/xen/multiboot.h | 3 +-
xen/include/xen/numa.h | 3 +-
xen/include/xen/p2m-common.h | 17 +-
xen/include/xen/paging.h | 2 +-
xen/include/xen/pci.h | 16 +-
xen/include/xen/perfc_defn.h | 29 +-
xen/include/xen/radix-tree.h | 3 +-
xen/include/xen/random.h | 3 +
xen/include/xen/rangeset.h | 12 +-
xen/include/xen/rbtree.h | 3 +-
xen/include/xen/rcupdate.h | 3 +-
xen/include/xen/sched-if.h | 1 -
xen/include/xen/sched.h | 106 +-
xen/include/xen/shared.h | 6 +-
xen/include/xen/spinlock.h | 18 +-
xen/include/xen/symbols.h | 3 +
xen/include/xen/time.h | 4 +-
xen/include/xen/tmem.h | 3 +
xen/include/xen/tmem_xen.h | 6 +-
xen/include/xen/typesafe.h | 46 +
xen/include/xen/vm_event.h | 87 +
xen/include/xen/vmap.h | 9 +-
xen/include/xen/xencomm.h | 170 -
xen/include/xlat.lst | 13 +-
xen/include/xsm/dummy.h | 105 +-
xen/include/xsm/xsm.h | 116 +-
xen/xsm/dummy.c | 27 +-
xen/xsm/flask/Makefile | 2 +-
xen/xsm/flask/avc.c | 5 +-
xen/xsm/flask/flask_op.c | 120 +-
xen/xsm/flask/hooks.c | 283 +-
xen/xsm/flask/include/avc.h | 8 +-
xen/xsm/flask/include/security.h | 29 +-
xen/xsm/flask/policy/access_vectors | 43 +-
xen/xsm/flask/policy/initial_sids | 2 +
xen/xsm/flask/policy/mkaccess_vector.sh | 6 +-
xen/xsm/flask/policy/mkflask.sh | 11 +-
xen/xsm/flask/ss/policydb.c | 141 +-
xen/xsm/flask/ss/policydb.h | 7 +-
xen/xsm/flask/ss/services.c | 118 +-
1321 files changed, 69832 insertions(+), 63935 deletions(-)
diff --git a/.gitignore b/.gitignore
index 8c8c06f..9ead7c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
.hg
*.orig
-*.rej
*~
*.swp
*.o
@@ -44,13 +43,11 @@ dist/*
docs/html/
docs/man1/
docs/man5/
+docs/man8/
docs/pdf/
docs/txt/
-extras/mini-os/include/mini-os
-extras/mini-os/include/x86/mini-os
-extras/mini-os/include/xen
-extras/mini-os/include/list.h
-extras/mini-os/mini-os*
+extras/mini-os
+extras/mini-os-remote
install/*
stubdom/autom4te.cache/
stubdom/binutils-*
@@ -101,13 +98,9 @@ tools/blktap2/drivers/tapdisk2
tools/blktap2/drivers/td-util
tools/blktap2/vhd/vhd-update
tools/blktap2/vhd/vhd-util
-tools/blktap/drivers/blktapctrl
-tools/blktap/drivers/img2qcow
-tools/blktap/drivers/qcow-create
-tools/blktap/drivers/qcow2raw
-tools/blktap/drivers/tapdisk
tools/console/xenconsole
tools/console/xenconsoled
+tools/console/client/_paths.h
tools/debugger/gdb/gdb-6.2.1-linux-i386-xen/*
tools/debugger/gdb/gdb-6.2.1/*
tools/debugger/gdb/gdb-6.2.1.tar.bz2
@@ -141,10 +134,12 @@ tools/flask/utils/flask-set-bool
tools/flask/utils/flask-label-pci
tools/hotplug/common/hotplugpath.sh
tools/hotplug/FreeBSD/rc.d/xencommons
+tools/hotplug/FreeBSD/rc.d/xendriverdomain
tools/hotplug/Linux/init.d/sysconfig.xencommons
tools/hotplug/Linux/init.d/xen-watchdog
tools/hotplug/Linux/init.d/xencommons
tools/hotplug/Linux/init.d/xendomains
+tools/hotplug/Linux/init.d/xendriverdomain
tools/hotplug/Linux/systemd/*.conf
tools/hotplug/Linux/systemd/*.mount
tools/hotplug/Linux/systemd/*.socket
@@ -154,10 +149,14 @@ tools/hotplug/Linux/xen-backend.rules
tools/hotplug/Linux/xen-hotplug-common.sh
tools/hotplug/Linux/xendomains
tools/hotplug/NetBSD/rc.d/xencommons
+tools/hotplug/NetBSD/rc.d/xendriverdomain
tools/include/xen/*
+tools/include/xen-xsm/*
tools/include/xen-foreign/*.(c|h|size)
tools/include/xen-foreign/checker
tools/libxl/libxlu_cfg_y.output
+tools/libxl/*.pc
+tools/libxl/*.pc.in
tools/libxl/xl
tools/libxl/testenum
tools/libxl/testenum.c
@@ -178,6 +177,7 @@ tools/misc/gtracestat
tools/misc/xenlockprof
tools/misc/lowmemd
tools/misc/xencov
+tools/xentrace/xenalyze
tools/pygrub/build/*
tools/python/build/*
tools/security/secpol_tool
@@ -236,6 +236,7 @@ xen/arch/*/efi/compat.c
xen/arch/*/efi/efi.h
xen/arch/*/efi/runtime.c
xen/include/headers.chk
+xen/include/headers++.chk
xen/include/asm
xen/include/asm-*/asm-offsets.h
xen/include/compat/*
@@ -292,6 +293,7 @@ tools/libxl/testidl.c
tools/libxl/*.pyc
tools/libxl/libxl-save-helper
tools/libxl/test_timedereg
+tools/libxl/test_fdderegrace
tools/libxl/xen-init-dom0
tools/blktap2/control/tap-ctl
tools/firmware/etherboot/eb-roms.h
diff --git a/.hgignore b/.hgignore
index da27f80..0bd29a1 100644
--- a/.hgignore
+++ b/.hgignore
@@ -140,11 +140,6 @@
^tools/blktap2/drivers/td-util$
^tools/blktap2/vhd/vhd-update$
^tools/blktap2/vhd/vhd-util$
-^tools/blktap/drivers/blktapctrl$
-^tools/blktap/drivers/img2qcow$
-^tools/blktap/drivers/qcow-create$
-^tools/blktap/drivers/qcow2raw$
-^tools/blktap/drivers/tapdisk$
^tools/check/\..*$
^tools/console/xenconsole$
^tools/console/xenconsoled$
diff --git a/COPYING b/COPYING
index 07535ad..acd3016 100644
--- a/COPYING
+++ b/COPYING
@@ -343,8 +343,7 @@ the "copyright" line and a pointer to where the full notice is found.
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
diff --git a/Config.mk b/Config.mk
index b9a89a1..54fbb9d 100644
--- a/Config.mk
+++ b/Config.mk
@@ -75,8 +75,8 @@ EXTRA_LIB += $(EXTRA_PREFIX)/lib
endif
PYTHON ?= python
-PYTHON_PREFIX_ARG ?= --prefix="$(PREFIX)"
-# The above requires that PREFIX contains *no spaces*. This variable is here
+PYTHON_PREFIX_ARG ?= --prefix="$(prefix)"
+# The above requires that prefix contains *no spaces*. This variable is here
# to permit the user to set PYTHON_PREFIX_ARG to '' to workaround this bug:
# https://bugs.launchpad.net/ubuntu/+bug/362570
@@ -142,7 +142,7 @@ define as-insn-check-closure
endef
define buildmakevars2shellvars
- export PREFIX="$(PREFIX)"; \
+ export PREFIX="$(prefix)"; \
export XEN_SCRIPT_DIR="$(XEN_SCRIPT_DIR)"; \
export XEN_ROOT="$(XEN_ROOT)"
endef
@@ -157,9 +157,9 @@ define move-if-changed
if ! cmp -s $(1) $(2); then mv -f $(1) $(2); else rm -f $(1); fi
endef
-BUILD_MAKE_VARS := SBINDIR BINDIR LIBEXEC LIBEXEC_BIN LIBDIR SHAREDIR \
+BUILD_MAKE_VARS := sbindir bindir LIBEXEC LIBEXEC_BIN libdir SHAREDIR \
XENFIRMWAREDIR XEN_CONFIG_DIR XEN_SCRIPT_DIR XEN_LOCK_DIR \
- XEN_RUN_DIR XEN_PAGING_DIR
+ XEN_RUN_DIR XEN_PAGING_DIR XEN_DUMP_DIR
buildmakevars2file = $(eval $(call buildmakevars2file-closure,$(1)))
define buildmakevars2file-closure
@@ -204,7 +204,7 @@ CFLAGS += $(foreach i, $(EXTRA_INCLUDES), -I$(i))
LDFLAGS += $(foreach i, $(PREPEND_LIB), -L$(i))
CFLAGS += $(foreach i, $(PREPEND_INCLUDES), -I$(i))
ifeq ($(XEN_TOOLS_RPATH),y)
-LDFLAGS += -Wl,-rpath,$(LIBDIR)
+LDFLAGS += -Wl,-rpath,$(libdir)
endif
APPEND_LDFLAGS += $(foreach i, $(APPEND_LIB), -L$(i))
APPEND_CFLAGS += $(foreach i, $(APPEND_INCLUDES), -I$(i))
@@ -242,27 +242,33 @@ endif
ifeq ($(GIT_HTTP),y)
OVMF_UPSTREAM_URL ?= http://xenbits.xen.org/git-http/ovmf.git
-QEMU_UPSTREAM_URL ?= http://xenbits.xen.org/git-http/qemu-upstream-4.5-testing.git
-QEMU_TRADITIONAL_URL ?= http://xenbits.xen.org/git-http/qemu-xen-4.5-testing.git
+QEMU_UPSTREAM_URL ?= http://xenbits.xen.org/git-http/qemu-upstream-4.6-testing.git
+QEMU_TRADITIONAL_URL ?= http://xenbits.xen.org/git-http/qemu-xen-4.6-testing.git
SEABIOS_UPSTREAM_URL ?= http://xenbits.xen.org/git-http/seabios.git
+MINIOS_UPSTREAM_URL ?= http://xenbits.xen.org/git-http/mini-os.git
else
OVMF_UPSTREAM_URL ?= git://xenbits.xen.org/ovmf.git
-QEMU_UPSTREAM_URL ?= git://xenbits.xen.org/qemu-upstream-4.5-testing.git
-QEMU_TRADITIONAL_URL ?= git://xenbits.xen.org/qemu-xen-4.5-testing.git
+QEMU_UPSTREAM_URL ?= git://xenbits.xen.org/qemu-upstream-4.6-testing.git
+QEMU_TRADITIONAL_URL ?= git://xenbits.xen.org/qemu-xen-4.6-testing.git
SEABIOS_UPSTREAM_URL ?= git://xenbits.xen.org/seabios.git
+MINIOS_UPSTREAM_URL ?= git://xenbits.xen.org/mini-os.git
endif
-OVMF_UPSTREAM_REVISION ?= 447d264115c476142f884af0be287622cd244423
-QEMU_UPSTREAM_REVISION ?= qemu-xen-4.5.1-rc1
-SEABIOS_UPSTREAM_REVISION ?= rel-1.7.5
-# Thu May 22 16:59:16 2014 -0400
-# python3 fixes for vgabios and csm builds.
+OVMF_UPSTREAM_REVISION ?= cb9a7ebabcd6b8a49dc0854b2f9592d732b5afbd
+QEMU_UPSTREAM_REVISION ?= qemu-xen-4.6.0
+MINIOS_UPSTREAM_REVISION ?= xen-RELEASE-4.6.0
+# Fri Jun 26 11:58:40 2015 +0100
+# Correct printf formatting for tpm_tis message.
+
+SEABIOS_UPSTREAM_REVISION ?= rel-1.8.2
+# Tue Mar 17 10:52:16 2015 -0400
+# vgabios: On bda_save_restore() the saved vbe_mode also has flags in it
ETHERBOOT_NICS ?= rtl8139 8086100e
-QEMU_TRADITIONAL_REVISION ?= xen-4.5.1-rc1
-# Tue Mar 31 16:27:45 2015 +0100
-# xen: limit guest control of PCI command register
+QEMU_TRADITIONAL_REVISION ?= xen-4.6.0
+# Tue Sep 8 15:41:20 2015 +0100
+# Fix build after "ui/vnc: limit client_cut_text msg payload size"
# Specify which qemu-dm to use. This may be `ioemu' to use the old
# Mercurial in-tree version, or a local directory, or a git URL.
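Since all of these are assigned with ?=, they can be overridden from the
environment or the make command line, e.g. to build against a local mirror
(illustrative path):

    make QEMU_UPSTREAM_URL=/path/to/local/qemu-xen.git \
         QEMU_UPSTREAM_REVISION=qemu-xen-4.6.0 dist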
diff --git a/INSTALL b/INSTALL
index 71dd0eb..56e2950 100644
--- a/INSTALL
+++ b/INSTALL
@@ -128,6 +128,10 @@ original xenstored will be used. Valid names are xenstored and
oxenstored.
--with-xenstored=name
+This option specifies the path where core dumps are stored for domUs
+which are configured with coredump-destroy or coredump-restart.
+ --with-xen-dumpdir=DIR
+
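For example (hypothetical directory):

    ./configure --with-xen-dumpdir=/var/crash/xen
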
Instead of starting the tools in dom0 with sysv runlevel scripts, they
can also be started by systemd. If this option is enabled xenstored will
receive the communication socket directly from systemd. So starting it
@@ -142,7 +146,6 @@ this detection and the sysv runlevel scripts have to be used.
The old backend drivers are disabled because qdisk is now the default.
This option can be used to build them anyway.
- --enable-blktap1
--enable-blktap2
Build various stubdom components, some of which are only example code. It's usually
@@ -193,13 +196,17 @@ OCAMLFIND_DESTDIR= and OCAMLFIND_METADIR= will have the same effect.
OCAMLDESTDIR=
The xen subsystem will install the hypervisor into fixed locations.
-BOOT_DIR defaults to /boot, EFI_DIR to /usr/lib64/efi.
+BOOT_DIR defaults to /boot, DEBUG_DIR defaults to /usr/lib/debug and
+EFI_DIR to /usr/lib64/efi.
BOOT_DIR=
+DEBUG_DIR=
EFI_DIR=
The make target 'rpmball' will build a xen.rpm. This variable can be
-used to append a custom string to the name.
+used to append a custom string to the name. In addition, a string can
+be appended to the rpm Release: tag.
PKG_SUFFIX=
+PKG_RELEASE=
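For instance (values are illustrative):

    make rpmball PKG_SUFFIX=-mybuild PKG_RELEASE=2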
The hypervisor will report a certain version string. This variable can
be used to append a custom string to the version.
@@ -210,6 +217,14 @@ changed with these variables.
XEN_WHOAMI=
XEN_DOMAIN=
+Some components of xen and tools will embed an unpredictable timestamp
+in the binaries. To allow reproducible builds, the following variables
+can be used to provide fixed timestamps in the expected format.
+XEN_BUILD_DATE=<output of date(1)>
+XEN_BUILD_TIME=hh:mm:ss
+SMBIOS_REL_DATE=mm/dd/yyyy
+VGABIOS_REL_DATE="dd Mon yyyy"
+
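As a sketch, a reproducible build could pin all four values on the make
command line (the dates below are arbitrary examples):

    make XEN_BUILD_DATE="Tue Sep  8 00:00:00 UTC 2015" \
         XEN_BUILD_TIME=00:00:00 \
         SMBIOS_REL_DATE=09/08/2015 \
         VGABIOS_REL_DATE="08 Sep 2015" dist
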
The following variables can be used to tweak some aspects of the
hypervisor build.
verbose=y
@@ -233,6 +248,7 @@ OVMF_UPSTREAM_URL=
QEMU_UPSTREAM_URL=
QEMU_TRADITIONAL_URL=
SEABIOS_UPSTREAM_URL=
+MINIOS_UPSTREAM_URL=
Using additional CFLAGS to build tools which will run in dom0 is
required when building distro packages. These variables can be used to
diff --git a/MAINTAINERS b/MAINTAINERS
index a205136..af77e30 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -49,14 +49,17 @@ for inclusion in xen-unstable.
Please see http://wiki.xen.org/wiki/Xen_Maintenance_Releases for more
information.
-Remember to copy the stable branch maintainer. The maintainer for this
-branch is:
+Backport requests should be made on the xen-devel at lists.xenproject.org
+list. Remember to copy the appropriate stable branch maintainer.
- Jan Beulich <jbeulich at suse.com>
+The maintainer for this branch is:
+
+ Jan Beulich <jbeulich at suse.com>
Tools backport requests should also be copied to:
- Ian Jackson <Ian.Jackson at eu.citrix.com>
+ Ian Jackson <Ian.Jackson at eu.citrix.com>
+
Unstable Subsystem Maintainers
==============================
@@ -123,7 +126,7 @@ S: Supported
F: xen/arch/x86/hvm/svm/
ARINC653 SCHEDULER
-M: Nathan Studer <nate.studer at dornerworks.com>
+M: Josh Whitehead <josh.whitehead at dornerworks.com>
M: Robert VanVossen <robert.vanvossen at dornerworks.com>
S: Supported
F: xen/common/sched_arinc653.c
@@ -132,7 +135,6 @@ F: tools/libxc/xc_arinc653.c
ARM (W/ VIRTUALISATION EXTENSIONS) ARCHITECTURE
M: Ian Campbell <ian.campbell at citrix.com>
M: Stefano Stabellini <stefano.stabellini at citrix.com>
-M: Tim Deegan <tim at xen.org>
S: Supported
L: xen-devel at lists.xen.org
F: xen/arch/arm/
@@ -143,6 +145,11 @@ F: xen/drivers/char/omap-uart.c
F: xen/drivers/char/pl011.c
F: xen/drivers/passthrough/arm/
+HISILICON HIP04 SUPPORT
+M: Zoltan Kiss <zoltan.kiss at huawei.com>
+S: Supported
+F: xen/arch/arm/gic-hip04.c
+
CPU POOLS
M: Juergen Gross <jgross at suse.com>
S: Supported
@@ -151,12 +158,12 @@ F: xen/common/cpupool.c
DEVICE TREE
M: Ian Campbell <ian.campbell at citrix.com>
M: Stefano Stabellini <stefano.stabellini at citrix.com>
-M: Tim Deegan <tim at xen.org>
S: Supported
F: xen/common/libfdt/
F: xen/common/device_tree.c
F: xen/include/xen/libfdt/
F: xen/include/xen/device_tree.h
+F: xen/drivers/passthrough/device_tree.c
EFI
M: Jan Beulich <jbeulich at suse.com>
@@ -181,11 +188,10 @@ F: tools/debugger/gdbsx/
KDD DEBUGGER
M: Tim Deegan <tim at xen.org>
-S: Supported
+S: Odd Fixes
F: tools/debugger/kdd/
INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT)
-M: Joseph Cihula <joseph.cihula at intel.com>
M: Gang Wei <gang.wei at intel.com>
M: Shane Wang <shane.wang at intel.com>
S: Supported
@@ -214,6 +220,7 @@ F: xen/drivers/passthrough/
X: xen/drivers/passthrough/amd/
X: xen/drivers/passthrough/arm/
X: xen/drivers/passthrough/vtd/
+X: xen/drivers/passthrough/device_tree.c
F: xen/include/xen/iommu.h
KEXEC
@@ -238,10 +245,16 @@ F: config/MiniOS.mk
F: extras/mini-os/
OCAML TOOLS
-M: David Scott <dave.scott at eu.citrix.com>
+M: David Scott <dave at recoil.org>
S: Supported
F: tools/ocaml/
+OVMF UPSTREAM
+M: Anthony PERARD <anthony.perard at citrix.com>
+M: Wei Liu <wei.liu2 at citrix.com>
+S: Supported
+T: git git://xenbits.xen.org/ovmf.git
+
POWER MANAGEMENT
M: Jan Beulich <jbeulich at suse.com>
M: Liu Jinsong <jinsong.liu at alibaba-inc.com>
@@ -267,8 +280,6 @@ M: Shriram Rajagopalan <rshriram at cs.ubc.ca>
M: Yang Hongyang <yanghy at cn.fujitsu.com>
S: Maintained
F: docs/README.remus
-F: tools/libxc/xc_domain_save.c
-F: tools/libxc/xc_domain_restore.c
F: tools/blktap2/drivers/block-remus.c
F: tools/blktap2/drivers/hashtable*
F: tools/libxl/libxl_remus_*
@@ -277,6 +288,11 @@ F: tools/libxl/libxl_nonetbuffer.c
F: tools/hotplug/Linux/remus-netbuf-setup
F: tools/hotplug/Linux/block-drbd-probe
+RTDS SCHEDULER
+M: Dario Faggioli <dario.faggioli at citrix.com>
+S: Supported
+F: xen/common/sched_rt.c
+
SCHEDULING
M: George Dunlap <george.dunlap at eu.citrix.com>
S: Supported
@@ -346,30 +362,40 @@ F: docs/misc/vtpm.txt
X86 ARCHITECTURE
M: Keir Fraser <keir at xen.org>
M: Jan Beulich <jbeulich at suse.com>
+M: Andrew Cooper <andrew.cooper3 at citrix.com>
S: Supported
L: xen-devel at lists.xen.org
F: xen/arch/x86/
F: xen/include/asm-x86/
F: tools/firmware/hvmloader/
+F: tools/tests/x86_emulator/
X86 MEMORY MANAGEMENT
-M: Tim Deegan <tim at xen.org>
+M: George Dunlap <george.dunlap at eu.citrix.com>
S: Supported
F: xen/arch/x86/mm/
X86 MEMORY SHARING AND PAGING
-M: Andres Lagar-Cavilla <andres at lagarcavilla.org>
-M: Tim Deegan <tim at xen.org>
-S: Supported
+S: Orphaned
F: xen/arch/x86/mm/mem_sharing.c
F: xen/arch/x86/mm/mem_paging.c
F: tools/memshr
-MEMORY EVENT AND ACCESS
+X86 SHADOW PAGETABLES
M: Tim Deegan <tim at xen.org>
+S: Maintained
+F: xen/arch/x86/mm/shadow/
+
+VM EVENT AND MEM ACCESS
+M: Razvan Cojocaru <rcojocaru at bitdefender.com>
+M: Tamas K Lengyel <tamas at tklengyel.com>
S: Supported
-F: xen/common/mem_event.c
+F: xen/common/vm_event.c
F: xen/common/mem_access.c
+F: xen/arch/x86/hvm/event.c
+F: xen/arch/x86/monitor.c
+F: xen/arch/x86/vm_event.c
+F: tools/tests/xen-access
XENTRACE
M: George Dunlap <george.dunlap at eu.citrix.com>
diff --git a/Makefile b/Makefile
index 6e9a4c7..75177f0 100644
--- a/Makefile
+++ b/Makefile
@@ -10,10 +10,31 @@ all: dist
SUBSYSTEMS?=xen tools stubdom docs
TARGS_DIST=$(patsubst %, dist-%, $(SUBSYSTEMS))
TARGS_INSTALL=$(patsubst %, install-%, $(SUBSYSTEMS))
+TARGS_BUILD=$(patsubst %, build-%, $(SUBSYSTEMS))
+TARGS_CLEAN=$(patsubst %, clean-%, $(SUBSYSTEMS))
+TARGS_DISTCLEAN=$(patsubst %, distclean-%, $(SUBSYSTEMS))
export XEN_ROOT=$(CURDIR)
include Config.mk
+.PHONY: mini-os-dir
+mini-os-dir:
+ if [ ! -d $(XEN_ROOT)/extras/mini-os ]; then \
+ GIT=$(GIT) $(XEN_ROOT)/scripts/git-checkout.sh \
+ $(MINIOS_UPSTREAM_URL) \
+ $(MINIOS_UPSTREAM_REVISION) \
+ $(XEN_ROOT)/extras/mini-os ; \
+ fi
+
+.PHONY: mini-os-dir-force-update
+mini-os-dir-force-update: mini-os-dir
+ set -ex; \
+ if [ "$(MINIOS_UPSTREAM_REVISION)" ]; then \
+ cd extras/mini-os-remote; \
+ $(GIT) fetch origin; \
+ $(GIT) reset --hard $(MINIOS_UPSTREAM_REVISION); \
+ fi
+
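The checkout can also be primed by hand with the same helper the recipe
uses (sketch, using the default URL and revision from Config.mk):

    ./scripts/git-checkout.sh git://xenbits.xen.org/mini-os.git \
        xen-RELEASE-4.6.0 extras/mini-os
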
SUBARCH := $(subst x86_32,i386,$(XEN_TARGET_ARCH))
export XEN_TARGET_ARCH SUBARCH
export DESTDIR
@@ -23,13 +44,25 @@ export DESTDIR
install: $(TARGS_INSTALL)
.PHONY: build
-build:
+build: $(TARGS_BUILD)
+
+.PHONY: build-xen
+build-xen:
$(MAKE) -C xen build
+
+.PHONY: build-tools
+build-tools:
$(MAKE) -C tools build
+
+.PHONY: build-stubdom
+build-stubdom: mini-os-dir
$(MAKE) -C stubdom build
ifeq (x86_64,$(XEN_TARGET_ARCH))
XEN_TARGET_ARCH=x86_32 $(MAKE) -C stubdom pv-grub
endif
+
+.PHONY: build-docs
+build-docs:
$(MAKE) -C docs build
# The test target is for unit tests that can run without an installation. Of
@@ -69,7 +102,7 @@ install-tools:
$(MAKE) -C tools install
.PHONY: install-stubdom
-install-stubdom: install-tools
+install-stubdom: install-tools mini-os-dir
$(MAKE) -C stubdom install
ifeq (x86_64,$(XEN_TARGET_ARCH))
XEN_TARGET_ARCH=x86_32 $(MAKE) -C stubdom install-grub
@@ -110,11 +143,11 @@ rpmball: dist
bash ./tools/misc/mkrpm $(XEN_ROOT) $$($(MAKE) -C xen xenversion --no-print-directory)
.PHONY: subtree-force-update
-subtree-force-update:
+subtree-force-update: mini-os-dir-force-update
$(MAKE) -C tools subtree-force-update
.PHONY: subtree-force-update-all
-subtree-force-update-all:
+subtree-force-update-all: mini-os-dir-force-update
$(MAKE) -C tools subtree-force-update-all
# Make a source tarball, including qemu sub-trees.
@@ -135,28 +168,52 @@ src-tarball: subtree-force-update-all
bash ./tools/misc/mktarball $(XEN_ROOT) $$(git describe)
.PHONY: clean
-clean::
+clean: $(TARGS_CLEAN)
+
+.PHONY: clean-xen
+clean-xen:
$(MAKE) -C xen clean
+
+.PHONY: clean-tools
+clean-tools:
$(MAKE) -C tools clean
+
+.PHONY: clean-stubdom
+clean-stubdom:
$(MAKE) -C stubdom crossclean
ifeq (x86_64,$(XEN_TARGET_ARCH))
XEN_TARGET_ARCH=x86_32 $(MAKE) -C stubdom crossclean
endif
+
+.PHONY: clean-docs
+clean-docs:
$(MAKE) -C docs clean
# clean, but blow away tarballs
.PHONY: distclean
-distclean:
+distclean: $(TARGS_DISTCLEAN)
rm -f config/Toplevel.mk
+ rm -rf dist
+ rm -rf config.log config.status config.cache autom4te.cache
+
+.PHONY: distclean-xen
+distclean-xen:
$(MAKE) -C xen distclean
+
+.PHONY: distclean-tools
+distclean-tools:
$(MAKE) -C tools distclean
+
+.PHONY: distclean-stubdom
+distclean-stubdom:
$(MAKE) -C stubdom distclean
ifeq (x86_64,$(XEN_TARGET_ARCH))
XEN_TARGET_ARCH=x86_32 $(MAKE) -C stubdom distclean
endif
+
+.PHONY: distclean-docs
+distclean-docs:
$(MAKE) -C docs distclean
- rm -rf dist
- rm -rf config.log config.status config.cache autom4te.cache
# Linux name for GNU distclean
.PHONY: mrproper
@@ -171,16 +228,23 @@ help:
@echo ' install-stubdom - build and install the stubdomain images'
@echo ' install-docs - build and install user documentation'
@echo ''
- @echo 'Building targets:'
+ @echo 'Local dist targets:'
@echo ' dist - build and install everything into local dist directory'
@echo ' world - clean everything then make dist'
- @echo ' xen - build and install Xen hypervisor'
- @echo ' tools - build and install tools'
- @echo ' stubdom - build and install the stubdomain images'
- @echo ' docs - build and install user documentation'
+ @echo ' dist-xen - build Xen hypervisor and install into local dist'
+ @echo ' dist-tools - build the tools and install into local dist'
+ @echo ' dist-stubdom - build the stubdomain images and install into local dist'
+ @echo ' dist-docs - build user documentation and install into local dist'
+ @echo ''
+ @echo 'Building targets:'
+ @echo ' build - build everything'
+ @echo ' build-xen - build Xen hypervisor'
+ @echo ' build-tools - build the tools'
+ @echo ' build-stubdom - build the stubdomain images'
+ @echo ' build-docs - build user documentation'
@echo ''
@echo 'Cleaning targets:'
- @echo ' clean - clean the Xen, tools and docs (but not guest kernel trees)'
+ @echo ' clean - clean the Xen, tools and docs'
@echo ' distclean - clean plus delete kernel build trees and'
@echo ' local downloaded files'
@echo ' subtree-force-update - Call *-force-update on all git subtrees (qemu, seabios, ovmf)'
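With the new per-subsystem targets, individual components can be built or
cleaned in isolation, for example:

    make build-xen
    make clean-tools
    make dist-docs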
diff --git a/README b/README
index 3bdc74e..5664a93 100644
--- a/README
+++ b/README
@@ -1,9 +1,9 @@
#################################
-__ __ _ _ ____ ___
-\ \/ /___ _ __ | || | | ___| / _ \
- \ // _ \ '_ \ | || |_ |___ \| | | |
- / \ __/ | | | |__ _| ___) | |_| |
-/_/\_\___|_| |_| |_|(_)____(_)___/
+__ __ _ _ __ ___
+\ \/ /___ _ __ | || | / /_ / _ \
+ \ // _ \ '_ \ | || |_| '_ \| | | |
+ / \ __/ | | | |__ _| (_) | |_| |
+/_/\_\___|_| |_| |_|(_)___(_)___/
#################################
@@ -19,39 +19,6 @@ is freely-distributable Open Source software, released under the GNU
GPL. Since its initial public release, Xen has grown a large
development community, spearheaded by xen.org (http://www.xen.org).
-The 4.5 release offers a number of improvements, including:
-improvements for large scale machines during bootup and for PCI
-passthrough; multiple IO-REQ servers (many QEMUs for a guest); soft
-affinity for vCPUs (aka NUMA affinity); and API expansion for guest
-introspection. We also have number of updates for CPU specific
-changes, such as: Broadwell Supervisor Mode Access Prevention; Haswell
-Server Cache QoS Monitoring aka Intel Resource Director Technology;
-further extensions to vAPIC (SandyBridge feature); fixes in AMD
-microcode loading; Data Breaking Extensions; and further MSR masking
-support on AMD.
-
-On the experimental side we have added a new Real-Time Deferrable
-Server Based CPU Scheduler (rtds), and PVH initial domain (dom0)
-support for Intel CPUs.
-
-Additionally, 4.5 has a huge update to the ARM code, including support
-for: up to 1TB in guests; up to 8 CPUs; Power State Coordination
-Interface (0.2) to power up and down CPUs; UEFI booting; IOMMU support
-(SMMUv1); Super Page (2MB) support; passthrough of MMIO regions to
-guests; and lower interrupt latency.
-
-The toolstack has expanded to include support for: VM Generation ID (a
-Windows 2012 Server requirement); Remus initial support (for high
-availability) in libxl (since xend has been removed); libxenlight JSON
-support, HVM guest direct kernel boot, and persistent configuration
-support; systemd support; performance optimizations in oxenstored;
-and support in QEMU for expanding the PCI hole.
-
-Lastly, we have removed the Python toolstack (xend).
-
-And as always, there are a number of performance, stability, and security
-improvements under-the hood.
-
This file contains some quick-start instructions to install Xen on
your system. For more information see http://www.xen.org/ and
http://wiki.xen.org/
@@ -59,7 +26,12 @@ http://wiki.xen.org/
Quick-Start Guide
=================
-First, there are a number of prerequisites for building a Xen source
+First, this is just a quick-start guide. For more comprehensive
+information see the INSTALL file and the Xen wiki at
+http://wiki.xenproject.org and in particular
+http://wiki.xenproject.org/wiki/Getting_Started.
+
+Second, there are a number of prerequisites for building a Xen source
release. Make sure you have all the following installed, either by
visiting the project webpage or installing a pre-built package
provided by your OS distributor:
diff --git a/config/Paths.mk.in b/config/Paths.mk.in
index fe10f76..d36504f 100644
--- a/config/Paths.mk.in
+++ b/config/Paths.mk.in
@@ -29,22 +29,14 @@ includedir := @includedir@
localstatedir := @localstatedir@
sysconfdir := @sysconfdir@
-PREFIX := $(prefix)
-
-SBINDIR := $(sbindir)
-BINDIR := $(bindir)
LIBEXEC := $(libexecdir)/$(PACKAGE_TARNAME)
LIBEXEC_BIN := @LIBEXEC_BIN@
LIBEXEC_LIB := $(LIBEXEC)/lib
LIBEXEC_INC := $(LIBEXEC)/include
-INCLUDEDIR := $(includedir)
SHAREDIR := @SHAREDIR@
-MANDIR := $(mandir)
-MAN1DIR := $(MANDIR)/man1
-MAN8DIR := $(MANDIR)/man8
-LIBDIR := $(libdir)
-DOCDIR := $(docdir)
+MAN1DIR := $(mandir)/man1
+MAN8DIR := $(mandir)/man8
XEN_RUN_DIR := @XEN_RUN_DIR@
XEN_LOG_DIR := @XEN_LOG_DIR@
@@ -56,8 +48,9 @@ CONFIG_LEAF_DIR := @CONFIG_LEAF_DIR@
BASH_COMPLETION_DIR := $(CONFIG_DIR)/bash_completion.d
XEN_LOCK_DIR := @XEN_LOCK_DIR@
XEN_PAGING_DIR := @XEN_PAGING_DIR@
+XEN_DUMP_DIR := @XEN_DUMP_DIR@
-XENFIRMWAREDIR := $(LIBEXEC)/boot
+XENFIRMWAREDIR := @XENFIRMWAREDIR@
XEN_CONFIG_DIR := @XEN_CONFIG_DIR@
XEN_SCRIPT_DIR := @XEN_SCRIPT_DIR@
diff --git a/config/StdGNU.mk b/config/StdGNU.mk
index 4efebe3..129d5c8 100644
--- a/config/StdGNU.mk
+++ b/config/StdGNU.mk
@@ -2,9 +2,11 @@ AS = $(CROSS_COMPILE)as
LD = $(CROSS_COMPILE)ld
ifeq ($(clang),y)
CC = $(CROSS_COMPILE)clang
+CXX = $(CROSS_COMPILE)clang++
LD_LTO = $(CROSS_COMPILE)llvm-ld
else
CC = $(CROSS_COMPILE)gcc
+CXX = $(CROSS_COMPILE)g++
LD_LTO = $(CROSS_COMPILE)ld
endif
CPP = $(CC) -E
@@ -25,6 +27,7 @@ INSTALL_DATA = $(INSTALL) -m0644 -p
INSTALL_PROG = $(INSTALL) -m0755 -p
BOOT_DIR ?= /boot
+DEBUG_DIR ?= /usr/lib/debug
SOCKET_LIBS =
UTIL_LIBS = -lutil
diff --git a/config/SunOS.mk b/config/SunOS.mk
index 3316280..db5e898 100644
--- a/config/SunOS.mk
+++ b/config/SunOS.mk
@@ -2,6 +2,7 @@ AS = $(CROSS_COMPILE)gas
LD = $(CROSS_COMPILE)gld
CC = $(CROSS_COMPILE)gcc
CPP = $(CROSS_COMPILE)gcc -E
+CXX = $(CROSS_COMPILE)g++
AR = $(CROSS_COMPILE)gar
RANLIB = $(CROSS_COMPILE)granlib
NM = $(CROSS_COMPILE)gnm
@@ -18,6 +19,7 @@ INSTALL_DATA = $(INSTALL) -m0644 -p
INSTALL_PROG = $(INSTALL) -m0755 -p
BOOT_DIR ?= /boot
+DEBUG_DIR ?= /usr/lib/debug
SunOS_LIBDIR = /usr/sfw/lib
SunOS_LIBDIR_x86_64 = /usr/sfw/lib/amd64
diff --git a/config/Tools.mk.in b/config/Tools.mk.in
index 89de5bd..9bd5f6c 100644
--- a/config/Tools.mk.in
+++ b/config/Tools.mk.in
@@ -13,6 +13,7 @@ BISON := @BISON@
FLEX := @FLEX@
PYTHON := @PYTHON@
PYTHON_PATH := @PYTHONPATH@
+PY_NOOPT_CFLAGS := @PY_NOOPT_CFLAGS@
PERL := @PERL@
CURL_CONFIG := @CURL@
XML2_CONFIG := @XML@
@@ -22,6 +23,7 @@ AS86 := @AS86@
LD86 := @LD86@
BCC := @BCC@
IASL := @IASL@
+AWK := @AWK@
FETCHER := @FETCHER@
SEABIOS_PATH := @seabios_path@
OVMF_PATH := @ovmf_path@
@@ -57,7 +59,6 @@ CONFIG_ROMBIOS := @rombios@
CONFIG_SEABIOS := @seabios@
CONFIG_QEMU_TRAD := @qemu_traditional@
CONFIG_QEMU_XEN := @qemu_xen@
-CONFIG_BLKTAP1 := @blktap1@
CONFIG_BLKTAP2 := @blktap2@
CONFIG_QEMUU_EXTRA_ARGS:= @EXTRA_QEMUU_CONFIGURE_ARGS@
CONFIG_REMUS_NETBUF := @remus_netbuf@
@@ -76,5 +77,7 @@ CONFIG_LIBICONV := @libiconv@
CONFIG_GCRYPT := @libgcrypt@
EXTFS_LIBS := @EXTFS_LIBS@
CURSES_LIBS := @CURSES_LIBS@
+TINFO_LIBS := @TINFO_LIBS@
+ARGP_LDFLAGS := @argp_ldflags@
FILE_OFFSET_BITS := @FILE_OFFSET_BITS@
diff --git a/config/arm32.mk b/config/arm32.mk
index 4f83a63..cd97e42 100644
--- a/config/arm32.mk
+++ b/config/arm32.mk
@@ -12,7 +12,9 @@ CFLAGS += -marm
HAS_PL011 := y
HAS_EXYNOS4210 := y
HAS_OMAP := y
+HAS_SCIF := y
HAS_NS16550 := y
+HAS_MEM_ACCESS := y
# Use only if calling $(LD) directly.
LDFLAGS_DIRECT += -EL
diff --git a/config/arm64.mk b/config/arm64.mk
index 6eafda2..c5deb4e 100644
--- a/config/arm64.mk
+++ b/config/arm64.mk
@@ -7,7 +7,10 @@ CONFIG_XEN_INSTALL_SUFFIX :=
CFLAGS += #-marm -march= -mcpu= etc
HAS_PL011 := y
+HAS_CADENCE_UART := y
HAS_NS16550 := y
+HAS_MEM_ACCESS := y
+HAS_GICV3 := y
# Use only if calling $(LD) directly.
LDFLAGS_DIRECT += -EL
diff --git a/configure b/configure
index 98a73d4..80b27d6 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for Xen Hypervisor 4.5.
+# Generated by GNU Autoconf 2.69 for Xen Hypervisor 4.6.
#
# Report bugs to <xen-devel at lists.xen.org>.
#
@@ -579,8 +579,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='Xen Hypervisor'
PACKAGE_TARNAME='xen'
-PACKAGE_VERSION='4.5'
-PACKAGE_STRING='Xen Hypervisor 4.5'
+PACKAGE_VERSION='4.6'
+PACKAGE_STRING='Xen Hypervisor 4.6'
PACKAGE_BUGREPORT='xen-devel at lists.xen.org'
PACKAGE_URL='http://www.xen.org/'
@@ -594,6 +594,7 @@ stubdom
tools
xen
subdirs
+XEN_DUMP_DIR
XEN_PAGING_DIR
XEN_LOCK_DIR
XEN_SCRIPT_DIR
@@ -604,6 +605,7 @@ SHAREDIR
XEN_LIB_STORED
XEN_LOG_DIR
XEN_RUN_DIR
+XENFIRMWAREDIR
LIBEXEC_BIN
CONFIG_LEAF_DIR
host_os
@@ -657,6 +659,7 @@ ac_user_opts='
enable_option_checking
with_initddir
with_sysconfig_leaf_dir
+with_xen_dumpdir
enable_xen
enable_tools
enable_stubdom
@@ -1208,7 +1211,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures Xen Hypervisor 4.5 to adapt to many kinds of systems.
+\`configure' configures Xen Hypervisor 4.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1273,7 +1276,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of Xen Hypervisor 4.5:";;
+ short | recursive ) echo "Configuration of Xen Hypervisor 4.6:";;
esac
cat <<\_ACEOF
@@ -1296,6 +1299,8 @@ Optional Packages:
options for runlevel scripts and daemons such as
xenstored. This should be either "sysconfig" or
"default". [sysconfig]
+ --with-xen-dumpdir=DIR Path to directory for domU crash dumps.
+ [LOCALSTATEDIR/lib/xen/dump]
Report bugs to <xen-devel at lists.xen.org>.
Xen Hypervisor home page: <http://www.xen.org/>.
@@ -1361,7 +1366,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-Xen Hypervisor configure 4.5
+Xen Hypervisor configure 4.6
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1378,7 +1383,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by Xen Hypervisor $as_me 4.5, which was
+It was created by Xen Hypervisor $as_me 4.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -1918,6 +1923,15 @@ fi
CONFIG_LEAF_DIR=$config_leaf_dir
+
+# Check whether --with-xen-dumpdir was given.
+if test "${with_xen_dumpdir+set}" = set; then :
+ withval=$with_xen_dumpdir; xen_dumpdir_path=$withval
+else
+ xen_dumpdir_path=$localstatedir/lib/xen/dump
+fi
+
+
if test "$libexecdir" = '${exec_prefix}/libexec' ; then
case "$host_os" in
*netbsd*) ;;
@@ -1930,6 +1944,9 @@ libexecdir=`eval echo $libexecdir`
LIBEXEC_BIN=`eval echo $libexecdir/$PACKAGE_TARNAME/bin`
+XENFIRMWAREDIR=`eval echo $libexecdir/$PACKAGE_TARNAME/boot`
+
+
XEN_RUN_DIR=$localstatedir/run/xen
@@ -1964,6 +1981,9 @@ esac
XEN_PAGING_DIR=$localstatedir/lib/xen/xenpaging
+XEN_DUMP_DIR=$xen_dumpdir_path
+
+
case "$host_cpu" in
i[3456]86|x86_64)
@@ -2727,7 +2747,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by Xen Hypervisor $as_me 4.5, which was
+This file was extended by Xen Hypervisor $as_me 4.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -2781,7 +2801,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-Xen Hypervisor config.status 4.5
+Xen Hypervisor config.status 4.6
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/docs/INDEX b/docs/INDEX
index fdf0ad8..7d26cf8 100644
--- a/docs/INDEX
+++ b/docs/INDEX
@@ -20,9 +20,13 @@ misc/xl-disk-configuration XL Disk Configuration
misc/xl-network-configuration XL Network Configuration
misc/distro_mapping Distro Directory Layouts
misc/dump-core-format Xen Core Dump Format
-misc/sedf_scheduler_mini-HOWTO sEDF Mini HOWTO
misc/vtd VT-d HOWTO
misc/vtpm Virtual TPM
misc/xen-error-handling Xen Error Handling
misc/xenpaging Xen Paging
misc/xsm-flask XSM/FLASK Configuration
+misc/arm/booting How to boot Xen on ARM
+misc/arm/early-printk Enabling early printk on ARM
+misc/arm/passthrough Passthrough a device described in the Device Tree to a guest
+misc/arm/device-tree/booting Device tree bindings to boot Xen
+misc/arm/device-tree/passthrough Device tree binding to passthrough a device
diff --git a/docs/Makefile b/docs/Makefile
index 2c0903b..b9da605 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -7,29 +7,37 @@ VERSION := $(shell $(MAKE) -C $(XEN_ROOT)/xen --no-print-directory xenversion)
DOC_ARCHES := arm x86_32 x86_64
# Documentation sources to build
-MAN1SRC-y := $(wildcard man/xl*.pod.1)
-MAN1SRC-y += $(wildcard man/xenstore*.pod.1)
+MAN1SRC-y := $(sort $(shell find man/ -name '*.pod.1' -print))
+MAN5SRC-y := $(sort $(shell find man/ -name '*.pod.5' -print))
+MAN8SRC-y := $(sort $(shell find man/ -name '*.pod.8' -print))
-MAN5SRC-y := $(wildcard man/xl*.pod.5)
+MARKDOWNSRC-y := $(sort $(shell find misc -name '*.markdown' -print))
-MARKDOWNSRC-y := $(wildcard misc/*.markdown)
-
-TXTSRC-y := $(wildcard misc/*.txt)
+TXTSRC-y := $(sort $(shell find misc -name '*.txt' -print))
+PANDOCSRC-y := $(sort $(shell find features/ misc/ specs/ -name '*.pandoc' -print))
+# Documentation targets
DOC_MAN1 := $(patsubst man/%.pod.1,man1/%.1,$(MAN1SRC-y))
DOC_MAN5 := $(patsubst man/%.pod.5,man5/%.5,$(MAN5SRC-y))
+DOC_MAN8 := $(patsubst man/%.pod.8,man8/%.8,$(MAN8SRC-y))
DOC_HTML := $(patsubst %.markdown,html/%.html,$(MARKDOWNSRC-y)) \
+ $(patsubst %.pandoc,html/%.html,$(PANDOCSRC-y)) \
$(patsubst man/%.pod.1,html/man/%.1.html,$(MAN1SRC-y)) \
$(patsubst man/%.pod.5,html/man/%.5.html,$(MAN5SRC-y)) \
+ $(patsubst man/%.pod.8,html/man/%.8.html,$(MAN8SRC-y)) \
$(patsubst %.txt,html/%.txt,$(TXTSRC-y)) \
$(patsubst %,html/hypercall/%/index.html,$(DOC_ARCHES))
DOC_TXT := $(patsubst %.txt,txt/%.txt,$(TXTSRC-y)) \
$(patsubst %.markdown,txt/%.txt,$(MARKDOWNSRC-y)) \
+ $(patsubst %.pandoc,txt/%.txt,$(PANDOCSRC-y)) \
$(patsubst man/%.pod.1,txt/man/%.1.txt,$(MAN1SRC-y)) \
- $(patsubst man/%.pod.5,txt/man/%.5.txt,$(MAN5SRC-y))
-DOC_PDF := $(patsubst %.markdown,pdf/%.pdf,$(MARKDOWNSRC-y))
+ $(patsubst man/%.pod.5,txt/man/%.5.txt,$(MAN5SRC-y)) \
+ $(patsubst man/%.pod.8,txt/man/%.8.txt,$(MAN8SRC-y))
+DOC_PDF := $(patsubst %.markdown,pdf/%.pdf,$(MARKDOWNSRC-y)) \
+ $(patsubst %.pandoc,pdf/%.pdf,$(PANDOCSRC-y))
+# Top level build targets
.PHONY: all
all: build
@@ -40,109 +48,114 @@ build: html txt pdf man-pages figs
html: $(DOC_HTML) html/index.html
.PHONY: txt
-txt:
-ifdef POD2TEXT
- $(MAKE) $(DOC_TXT)
-else
- @echo "pod2text not installed; skipping text outputs."
-endif
+txt: $(DOC_TXT)
.PHONY: figs
figs:
-ifdef FIG2DEV
+ifneq ($(FIG2DEV),)
set -x; $(MAKE) -C figs
else
@echo "fig2dev (transfig) not installed; skipping figs."
endif
-.PHONY: man-pages
-man-pages:
-ifdef POD2MAN
- $(MAKE) $(DOC_MAN1) $(DOC_MAN5)
-else
- @echo "pod2man not installed; skipping man-pages."
-endif
-
.PHONY: pdf
-pdf:
-ifdef PANDOC
- $(MAKE) $(DOC_PDF)
-else
- @echo "pandoc not installed; skipping pdfs."
-endif
-
-man1/%.1: man/%.pod.1 Makefile
- $(INSTALL_DIR) $(@D)
- $(POD2MAN) --release=$(VERSION) --name=`echo $@ | sed 's/^man1.//'| \
- sed 's/.1//'` -s 1 -c "Xen" $< $@
-
-man5/%.5: man/%.pod.5 Makefile
- $(INSTALL_DIR) $(@D)
- $(POD2MAN) --release=$(VERSION) --name=`echo $@ | sed 's/^man5.//'| \
- sed 's/.5//'` -s 5 -c "Xen" $< $@
+pdf: $(DOC_PDF)
.PHONY: clean
-clean:
+clean: clean-man-pages
$(MAKE) -C figs clean
- rm -rf .word_count *.aux *.dvi *.bbl *.blg *.glo *.idx *~
+ rm -rf .word_count *.aux *.dvi *.bbl *.blg *.glo *.idx *~
rm -rf *.ilg *.log *.ind *.toc *.bak *.tmp core
rm -rf html txt pdf
- rm -rf man5
- rm -rf man1
.PHONY: distclean
distclean: clean
rm -rf $(XEN_ROOT)/config/Docs.mk config.log config.status config.cache \
autom4te.cache
-.PHONY: install-man-pages
-install-man-pages: man-pages
- $(INSTALL_DIR) $(DESTDIR)$(MANDIR)
- cp -R man1 $(DESTDIR)$(MANDIR)
- cp -R man5 $(DESTDIR)$(MANDIR)
+# Top level install targets
+
+.PHONY: man-pages install-man-pages clean-man-pages
+
+# Metarules for generating manpages. Run with $(1) substituted for the section number.
+define GENERATE_MANPAGE_RULES
+
+# Real manpages
+man$(1)/%.$(1): man/%.pod.$(1) Makefile
+ifneq ($(POD2MAN),)
+ @$(INSTALL_DIR) $$(@D)
+ $(POD2MAN) --release=$(VERSION) --name=$$* -s $(1) -c "Xen" $$< $$@
+else
+ @echo "pod2man not installed; skipping $$@"
+endif
+
+# HTML manpages
+html/man/%.$(1).html: man/%.pod.$(1) Makefile
+ifneq ($(POD2HTML),)
+ @$(INSTALL_DIR) $$(@D)
+ $(POD2HTML) --infile=$$< --outfile=$$@
+else
+ @echo "pod2html not installed; skipping $$@"
+endif
+
+# Text manpages
+txt/man/%.$(1).txt: man/%.pod.$(1) Makefile
+ifneq ($(POD2TEXT),)
+ @$(INSTALL_DIR) $$(@D)
+ $(POD2TEXT) $$< $$@
+else
+ @echo "pod2text not installed; skipping $$@"
+endif
+
+# Build
+.PHONY: man$(1)-pages
+man$(1)-pages: $$(DOC_MAN$(1))
+
+# Install
+.PHONY: install-man$(1)-pages
+install-man$(1)-pages: man$(1)-pages
+ $(INSTALL_DIR) $(DESTDIR)$(mandir)
+ cp -r man$(1) $(DESTDIR)$(mandir)
+
+# Clean
+.PHONY: clean-man$(1)-pages
+clean-man$(1)-pages:
+ rm -rf man$(1)
+
+# Link build/install/clean to toplevel rules
+man-pages: man$(1)-pages
+install-man-pages: install-man$(1)-pages
+clean-man-pages: clean-man$(1)-pages
+
+endef
+
+# Generate manpage rules for each section
+$(foreach i,1 5 8,$(eval $(call GENERATE_MANPAGE_RULES,$(i))))
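The generated per-section targets can then be driven individually, e.g.
(assuming ./configure has been run; the DESTDIR is illustrative):

    make -C docs man1-pages
    make -C docs install-man8-pages DESTDIR=/tmp/stage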
.PHONY: install-html
install-html: html txt figs
- $(INSTALL_DIR) $(DESTDIR)$(DOCDIR)
- [ ! -d html ] || cp -R html $(DESTDIR)$(DOCDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(docdir)
+ [ ! -d html ] || cp -R html $(DESTDIR)$(docdir)
.PHONY: install
install: install-man-pages install-html
+# Individual file build targets
html/index.html: $(DOC_HTML) $(CURDIR)/gen-html-index INDEX
$(PERL) -w -- $(CURDIR)/gen-html-index -i INDEX html $(DOC_HTML)
html/%.html: %.markdown
- $(INSTALL_DIR) $(@D)
-ifdef MARKDOWN
- @echo "Running markdown to generate $*.html ... "
- $(MARKDOWN) $< > $@.tmp ; \
- $(call move-if-changed,$@.tmp,$@)
+ifneq ($(MARKDOWN),)
+ @$(INSTALL_DIR) $(@D)
+ $(MARKDOWN) $< > $@
else
- @echo "markdown not installed; skipping $*.html."
+ @echo "markdown not installed; skipping $@"
endif
html/%.txt: %.txt
- $(INSTALL_DIR) $(@D)
+ @$(INSTALL_DIR) $(@D)
$(INSTALL_DATA) $< $@
-html/man/%.1.html: man/%.pod.1 Makefile
- $(INSTALL_DIR) $(@D)
-ifdef POD2HTML
- $(POD2HTML) --infile=$< --outfile=$@.tmp
- $(call move-if-changed,$@.tmp,$@)
-else
- @echo "pod2html not installed; skipping $<."
-endif
-
-html/man/%.5.html: man/%.pod.5 Makefile
- $(INSTALL_DIR) $(@D)
-ifdef POD2HTML
- $(POD2HTML) --infile=$< --outfile=$@.tmp
- $(call move-if-changed,$@.tmp,$@)
-else
- @echo "pod2html not installed; skipping $<."
-endif
# For non-x86 arches exclude the subarch whole x86 arch.
$(foreach i,$(filter-out x86_32 x86_64,$(DOC_ARCHES)),html/hypercall/$(i)/index.html): EXTRA_EXCLUDE := -X arch-x86
@@ -160,37 +173,29 @@ html/hypercall/%/index.html: $(CURDIR)/xen-headers Makefile
-include $(wildcard html/hypercall/*/.deps)
txt/%.txt: %.txt
- $(INSTALL_DIR) $(@D)
- cp $< $@.tmp
- $(call move-if-changed,$@.tmp,$@)
+ @$(INSTALL_DIR) $(@D)
+ $(INSTALL_DATA) $< $@
txt/%.txt: %.markdown
- $(INSTALL_DIR) $(@D)
- cp $< $@.tmp
- $(call move-if-changed,$@.tmp,$@)
+ @$(INSTALL_DIR) $(@D)
+ $(INSTALL_DATA) $< $@
-txt/man/%.1.txt: man/%.pod.1 Makefile
- $(INSTALL_DIR) $(@D)
-ifdef POD2TEXT
- $(POD2TEXT) $< $@.tmp
- $(call move-if-changed,$@.tmp,$@)
+pdf/%.pdf: %.markdown
+ifneq ($(PANDOC),)
+ @$(INSTALL_DIR) $(@D)
+ $(PANDOC) --number-sections --toc --standalone $< --output $@
else
- @echo "pod2text not installed; skipping $<."
+ @echo "pandoc not installed; skipping $@"
endif
-txt/man/%.5.txt: man/%.pod.5 Makefile
- $(INSTALL_DIR) $(@D)
-ifdef POD2TEXT
- $(POD2TEXT) $< $@.tmp
- $(call move-if-changed,$@.tmp,$@)
+pdf/%.pdf txt/%.txt html/%.html: %.pandoc
+ifneq ($(PANDOC),)
+ @$(INSTALL_DIR) $(@D)
+ $(PANDOC) --number-sections --toc --standalone $< --output $@
else
- @echo "pod2text not installed; skipping $<."
+ @echo "pandoc not installed; skipping $@"
endif
-pdf/%.pdf: %.markdown
- $(INSTALL_DIR) $(@D)
- pandoc -N --toc --standalone $< --output $@
-
ifeq (,$(findstring clean,$(MAKECMDGOALS)))
$(XEN_ROOT)/config/Docs.mk:
$(error You have to run ./configure before building docs)
diff --git a/docs/configure b/docs/configure
index c36e249..fb1d96c 100755
--- a/docs/configure
+++ b/docs/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for Xen Hypervisor Documentation 4.5.
+# Generated by GNU Autoconf 2.69 for Xen Hypervisor Documentation 4.6.
#
# Report bugs to <xen-devel at lists.xen.org>.
#
@@ -579,8 +579,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='Xen Hypervisor Documentation'
PACKAGE_TARNAME='xen'
-PACKAGE_VERSION='4.5'
-PACKAGE_STRING='Xen Hypervisor Documentation 4.5'
+PACKAGE_VERSION='4.6'
+PACKAGE_STRING='Xen Hypervisor Documentation 4.6'
PACKAGE_BUGREPORT='xen-devel at lists.xen.org'
PACKAGE_URL='http://www.xen.org/'
@@ -1186,7 +1186,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures Xen Hypervisor Documentation 4.5 to adapt to many kinds of systems.
+\`configure' configures Xen Hypervisor Documentation 4.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1247,7 +1247,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of Xen Hypervisor Documentation 4.5:";;
+ short | recursive ) echo "Configuration of Xen Hypervisor Documentation 4.6:";;
esac
cat <<\_ACEOF
@@ -1327,7 +1327,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-Xen Hypervisor Documentation configure 4.5
+Xen Hypervisor Documentation configure 4.6
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1344,7 +1344,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by Xen Hypervisor Documentation $as_me 4.5, which was
+It was created by Xen Hypervisor Documentation $as_me 4.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2628,7 +2628,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by Xen Hypervisor Documentation $as_me 4.5, which was
+This file was extended by Xen Hypervisor Documentation $as_me 4.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -2682,7 +2682,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-Xen Hypervisor Documentation config.status 4.5
+Xen Hypervisor Documentation config.status 4.6
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/docs/features/migration.pandoc b/docs/features/migration.pandoc
new file mode 100644
index 0000000..9852a19
--- /dev/null
+++ b/docs/features/migration.pandoc
@@ -0,0 +1,123 @@
+% Migration
+% Revision 1
+
+\clearpage
+
+# Basics
+--------------- -------------
+ Status: **Supported**
+
+ Architecture: x86
+
+ Component: Toolstack
+--------------- -------------
+
+# Overview
+
+Migration is a mechanism to move a virtual machine while the VM is
+running. Live migration moves a running virtual machine between two
+physical servers, but the same mechanism can be used for non-live
+migration (pause and copy) and suspend/resume from disk.
+
+# User details
+
+No hardware requirements, although hypervisor logdirty support is
+required for live migration.
+
+From the command line, `xl migrate/save/restore` are the top level
+interactions. e.g.
+
+ xl create my-vm.cfg
+ xl migrate my-vm localhost
+
+or
+
+ xl create my-vm.cfg
+ xl save my-vm /path/to/save/file
+ xl restore /path/to/save/file
+
+Xen 4.6 sees the introduction of Migration v2. There is no change for
+people using `xl`, although the `libxl` API has had an extension.
+
+# Technical details
+
+Migration is formed of several layers. `libxc` is responsible for the
+contents of the VM (ram, vcpus, etc) and the live migration loop, while
+`libxl` is responsible for items such as emulator state.
+
+The format of the migration v2 stream is specified in two documents, and
+is architecture neutral. Compatibility with legacy streams is
+maintained via the `convert-legacy-stream` script which transforms a
+legacy stream into a migration v2 stream.
+
+* Documents
+ * `docs/specs/libxc-migration-stream.pandoc`
+ * `docs/specs/libxl-migration-stream.pandoc`
+* `libxc`
+ * `tools/libxc/xc_sr_*.[hc]`
+* `libxl`
+ * `tools/libxl/libxl_stream_{read,write}.c`
+ * `tools/libxl/libxl_convert_callout.c`
+* Scripts
+ * `tools/python/xen/migration/*.py`
+ * `tools/python/scripts/convert-legacy-stream`
+ * `tools/python/scripts/verify-stream-v2`
+
+## libxl
+
+The LIBXL\_HAVE\_SRM\_V2 and LIBXL\_HAVE\_SRM\_V1 macros are introduced
+to indicate migration v2 support. `domain_restore_params` gains a new
+parameter, `stream_version`, which is used to distinguish between legacy and
+v2 migration streams, and hence whether legacy conversion is required.
+
+# Limitations
+
+Hypervisor logdirty support is incompatible with hardware passthrough,
+as IOMMU faults cannot be used to track writes.
+
+While not a bug in migration specifically, VMs are very sensitive to
+changes in cpuid information, and cpuid levelling support currently has
+its issues. Extreme care should be taken when migrating VMs between
+non-identical CPUs until the cpuid levelling improvements are complete.
+
+# Testing
+
+Changes in libxc should be tested with every guest type (32bit PV, 64bit
+PV, HVM), while changes in libxl should test HVM guests with both
+qemu-traditional and qemu-upstream.
+
+In general, testing can be done on a single host using `xl
+save/restore` or `xl migrate $VM localhost`.
+
+Any changes to the conversion script should be tested in all upgrade
+scenarios, which will involve starting with VMs from Xen 4.5.
+
+# Areas for improvement
+
+* Arm support
+* Linear P2M support for x86 PV
+* Live looping parameters
+
+# Known issues
+
+* x86 HVM guest physmap operations (not reflected in logdirty bitmap)
+* x86 HVM with PoD pages (attempts to map cause PoD allocations)
+* x86 HVM with nested-virt (no relevant information included in the
+ stream)
+* x86 PV ballooning (P2M marked dirty, target frame not marked)
+* x86 PV P2M structure changes (not noticed, stale mappings used)
+
+# References
+
+Xen Developer Summit 2015 Presentation
+[video](https://www.youtube.com/watch?v=RwiDeG21lrc) and
+[slides](http://events.linuxfoundation.org/sites/events/files/slides/migv2.pdf)
+for Migration v2
+
+# History
+
+------------------------------------------------------------------------
+Date Revision Version Notes
+---------- -------- -------- -------------------------------------------
+2015-10-24 1 Xen 4.6 Document written
+---------- -------- -------- -------------------------------------------
diff --git a/docs/features/template.pandoc b/docs/features/template.pandoc
new file mode 100644
index 0000000..7698291
--- /dev/null
+++ b/docs/features/template.pandoc
@@ -0,0 +1,75 @@
+% Template for feature documents
+% Revision $N
+
+\clearpage
+
+This is a suggested template for the formatting of an in-tree Xen
+feature document.
+
+The purpose of this document is to provide a concrete support statement
+for the feature (indicating its security status), as well as brief user
+and technical documentation.
+
+# Basics
+
+A table with an overview of the support status and applicability.
+
+---------------- ----------------------------------------------------
+ Status: e.g. **Supported**/**Tech Preview**/**Experimental**
+
+Architecture(s): e.g. x86, arm
+
+ Component(s): e.g. Hypervisor, toolstack, guest
+
+ Hardware: _where applicable_
+---------------- ----------------------------------------------------
+
+# Overview
+
+A short description of the feature, similar to an abstract for a
+paper/presentation.
+
+# User information
+
+Information for a user attempting to use the feature. Should include
+how to enable the feature (is it enabled by default? If not, how to turn
+it on?), and how to interact with the feature (typically via `xl`).
+
+# Limitations
+
+Information concerning incompatibilities with other features or hardware
+combinations.
+
+# Technical information
+
+Information for a developer or power user. Should include where to look
+in-tree for detailed documents and code.
+
+# Testing
+
+Information concerning how to properly test changes affecting this feature.
+
+# Areas for improvement
+
+List of enhancements which could be undertaken, e.g. to improve the
+feature itself, or improve interaction with other features.
+
+# Known issues
+
+List of known issues or bugs. For tech preview or experimental
+features, this section must contain the list of items needing fixing for
+its status to be upgraded.
+
+# References
+
+Relevant external references for this feature.
+
+# History
+
+A table of changes to the document, in chronological order.
+
+------------------------------------------------------------------------
+Date Revision Version Notes
+---------- -------- -------- -------------------------------------------
+YYYY-MM-DD N Xen X.Y ...
+---------- -------- -------- -------------------------------------------
diff --git a/docs/man/xentop.pod.1 b/docs/man/xentop.pod.1
new file mode 100644
index 0000000..1d0eb50
--- /dev/null
+++ b/docs/man/xentop.pod.1
@@ -0,0 +1,111 @@
+=head1 NAME
+
+xentop - displays real-time information about a Xen system and domains
+
+=head1 SYNOPSIS
+
+B<xentop> [B<-h>] [B<-V>] [B<-d>SECONDS] [B<-n>] [B<-r>] [B<-v>] [B<-x>]
+[B<-f>] [B<-b>] [B<-i>ITERATIONS]
+
+=head1 DESCRIPTION
+
+B<xentop> displays information about the Xen system and domains, in a
+continually-updating manner. Command-line options and interactive commands
+can change the detail and format of the information displayed by B<xentop>.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-h>, B<--help>
+
+display help and exit
+
+=item B<-V>, B<--version>
+
+output version information and exit
+
+=item B<-d>, B<--delay>=I<SECONDS>
+
+seconds between updates (default 3)
+
+=item B<-n>, B<--networks>
+
+output network information
+
+=item B<-x>, B<--vbds>
+
+output vbd block device data
+
+=item B<-r>, B<--repeat-header>
+
+repeat table header before each domain
+
+=item B<-v>, B<--vcpus>
+
+output VCPU data
+
+=item B<-f>, B<--full-name>
+
+output the full domain name (not truncated)
+
+=item B<-b>, B<--batch>
+
+output data in batch mode (to stdout)
+
+=item B<-i>, B<--iterations>=I<ITERATIONS>
+
+maximum number of iterations xentop should produce before ending
+
+=back
+
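For instance, the batch options above combine into a simple logging loop
(illustrative):

    xentop -b -d 1 -i 10 > xentop.log
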
+=head1 INTERACTIVE COMMANDS
+
+All interactive commands are case-insensitive.
+
+=over 4
+
+=item B<D>
+
+set delay between updates
+
+=item B<N>
+
+toggle display of network information
+
+=item B<Q>, B<Esc>
+
+quit
+
+=item B<R>
+
+toggle table header before each domain
+
+=item B<S>
+
+cycle sort order
+
+=item B<V>
+
+toggle display of VCPU information
+
+=item B<Arrows>
+
+scroll domain display
+
+=back
+
+=head1 AUTHORS
+
+Written by Judy Fischbach, David Hendricks, and Josh Triplett
+
+=head1 REPORTING BUGS
+
+Report bugs to <xen-devel at lists.xen.org>.
+
+=head1 COPYRIGHT
+
+Copyright 2005 International Business Machines Corp
+
+This is free software; see the source for copying conditions. There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
diff --git a/tools/xentrace/xentrace.8 b/docs/man/xentrace.pod.8
similarity index 63%
rename from tools/xentrace/xentrace.8
rename to docs/man/xentrace.pod.8
index ac18e9f..69aef05 100644
--- a/tools/xentrace/xentrace.8
+++ b/docs/man/xentrace.pod.8
@@ -1,62 +1,93 @@
-.TH XENTRACE 8 "22 February 2007" "Xen domain 0 utils"
-.SH NAME
-xentrace \- capture Xen trace buffer data
-.SH SYNOPSIS
-.B xentrace
-[
-.I OPTIONS
-] [
-.I FILE
-]
-.SH DESCRIPTION
-.B xentrace
-is used to capture trace buffer data from Xen. The data is
+=head1 NAME
+
+xentrace - capture Xen trace buffer data
+
+=head1 SYNOPSIS
+
+B<xentrace> [ I<OPTIONS> ] [ I<FILE> ]
+
+=head1 DESCRIPTION
+
+B<xentrace> is used to capture trace buffer data from Xen. The data is
output in the following binary format (host endian):
-.PP
+
CPU(uint) TSC(u64) EVENT(u32) D1 D2 D3 D4 D5 (all u32)
-.PP
+
Where CPU is the processor number, TSC is the record's timestamp
(the value of the CPU cycle counter), EVENT is the event ID and
D1...D5 are the trace data.
Data is dumped onto the standard output (which must not be a TTY) or a
-\fIFILE\fP specified on the command line.
+I<FILE> specified on the command line.
The output should be parsed using the tool xentrace_format, which can
produce human-readable output in ASCII format.
-.SS Options
-.TP
-.B -t, --log-thresh=l
-set the threshold number, l, of new records required to trigger a write of
+=head1 OPTIONS
+
+=over 4
+
+=item B<-t> I<l>, B<--log-thresh>=I<l>
+
+set the threshold number, I<l>, of new records required to trigger a write of
all new records to the output
-.TP
-.B -s, --poll-sleep=p
-set the time, p, (in milliseconds) to sleep between polling the buffers
+
+=item B<-s> I<p>, B<--poll-sleep>=I<p>
+
+set the time, I<p>, (in milliseconds) to sleep between polling the buffers
for new data.
-.TP
-.B -c, --cpu-mask=c
-set bitmask of CPUs to trace. It is limited to 32-bits.
-.TP
-.B -e, --evt-mask=e
+
+=item B<-c> [I<c>|I<CPU-LIST>|I<all>], B<--cpu-mask>=[I<c>|I<CPU-LIST>|I<all>]
+
+This can be: a hex value (of the form 0xNNNN...), a set of cpu ranges
+as described below, or the string I<all>. Hex values are limited to
+32 bits. If not specified, the cpu-mask as set during boot will be
+used. A I<CPU-LIST> expects decimal numbers, which may be specified
+as follows:
+
+ "0-3"
+ Trace only on CPUs 0 through 3
+
+ "0,2,5-7"
+ Trace only on CPUs 0, 2, and 5 through 7
+
+ "-3"
+ Trace only on CPUs 0 through 3
+
+ "-3,7"
+ Trace only on CPUs 0 through 3 and 7
+
+ "3-"
+     Trace only on CPUs from 3 up to the maximum number of CPUs the host has
+
+If using I<all> it will use all of the CPUs the host has.
+
+=item B<-e> I<mask>, B<--evt-mask>=I<mask>
+
set event capture mask. If not specified, TRC_ALL will be used.
-.TP
-.B -?, --help
+
+=item B<-?>, B<--help>
+
Give this help list
-.TP
-.B --usage
+
+=item B<--usage>
+
Give a short usage message
-.TP
-.B -V, --version
+
+=item B<-V>, B<--version>
+
Print program version
-.SS Event Classes (Masks)
+=back
+
+=head2 Event Classes (Masks)
+
The following event classes (masks) can be used to filter the events being
gathered by xentrace:
-.PP
- \fIID\fP \fIDescription\fP
-.PP
+
+ ID Description
+
0x0001f000 TRC_GEN
0x0002f000 TRC_SCHED
0x0004f000 TRC_DOM0OP
@@ -65,22 +96,23 @@ gathered by xentrace:
0xfffff000 TRC_ALL
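As an illustration, capturing only scheduler events (TRC_SCHED) and
formatting them off-line might look like this (file names are examples):

    xentrace -e 0x0002f000 sched.trace
    xentrace_format formats < sched.trace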
-.SS Event Subclasses (More Masks)
+=head2 Event Subclasses (More Masks)
+
The following event subclasses (masks) can also be used to filter the events being
gathered by xentrace:
-.PP
- \fIID\fP \fIDescription\fP
-.PP
+
+ ID Description
+
0x00081000 TRC_HVM_ENTRYEXIT
0x00082000 TRC_HVM_HANDLER
-.SS Events
-.B xentrace
-collects the following events from the trace buffer:
-.PP
- \fIID\fP \fIDescription\fP
-.PP
+=head2 Events
+
+B<xentrace> collects the following events from the trace buffer:
+
+ ID Description
+
0x0001f001 TRC_LOST_RECORDS
0x0002f001 TRC_SCHED_DOM_ADD
0x0002f002 TRC_SCHED_DOM_REM
@@ -115,8 +147,8 @@ collects the following events from the trace buffer:
0x0008200D TRC_HVM_MSR_WRITE
0x0008200E TRC_HVM_CPUID
0x0008200F TRC_HVM_INTR
- 0x00082010 TRC_HVM_NMI
- 0x00082011 TRC_HVM_SMI
+ 0x00082010 TRC_HVM_NMI
+ 0x00082011 TRC_HVM_SMI
0x00082012 TRC_HVM_VMMCALL
0x00082013 TRC_HVM_HLT
0x00082014 TRC_HVM_INVLPG
@@ -125,10 +157,10 @@ collects the following events from the trace buffer:
0x0010f002 TRC_MEM_PAGE_GRANT_UNMAP
0x0010f003 TRC_MEM_PAGE_GRANT_TRANSFER
-.PP
+=head1 AUTHOR
-.SH AUTHOR
Mark A. Williamson <mark.a.williamson at intel.com>
-.SH "SEE ALSO"
+=head1 SEE ALSO
+
xentrace_format(1)
diff --git a/tools/xentrace/xentrace_format.1 b/docs/man/xentrace_format.pod.1
similarity index 55%
rename from tools/xentrace/xentrace_format.1
rename to docs/man/xentrace_format.pod.1
index 374ec6d..e05479a 100644
--- a/tools/xentrace/xentrace_format.1
+++ b/docs/man/xentrace_format.pod.1
@@ -1,20 +1,20 @@
-.TH XENTRACE_FORMAT 1 "11 May 2004" "Xen domain 0 utils"
-.SH NAME
-xentrace_format \- pretty-print Xen trace data
-.SH SYNOPSIS
-.B xentrace_format
-[
-.I DEFS-FILE
-]
-.SH DESCRIPTION
-.B xentrace_format
-parses trace data in \fBxentrace\fP binary format from standard input
-and reformats it according to the rules in a file of definitions
-(\fIDEFS-FILE\fP), printing to standard output.
-
-The rules in \fIDEFS-FILE\fP should have the format shown below:
-
-\fIevent_id\fP \fIwhitespace\fP \fIformat\fP
+=head1 NAME
+
+xentrace_format - pretty-print Xen trace data
+
+=head1 SYNOPSIS
+
+B<xentrace_format> [ I<DEFS-FILE> ]
+
+=head1 DESCRIPTION
+
+B<xentrace_format> parses trace data in B<xentrace> binary format from
+standard input and reformats it according to the rules in a file of
+definitions (I<DEFS-FILE>), printing to standard output.
+
+The rules in I<DEFS-FILE> should have the format shown below:
+
+I<event_id> I<whitespace> I<format>
Each rule should start on a new line.
@@ -34,11 +34,13 @@ in the file tools/xentrace/formats in the Xen source tree.
Depending on your system and the rate at which trace data is produced,
this script may not be able to keep up with the output of
-\fBxentrace\fP if it is piped directly. In these circumstances you
-should have \fBxentrace\fP output to a file for processing off-line.
+B<xentrace> if it is piped directly. In these circumstances you
+should have B<xentrace> output to a file for processing off-line.
+
+=head1 AUTHOR
-.SH AUTHOR
Mark A. Williamson <mark.a.williamson at intel.com>
-.SH "SEE ALSO"
-xentrace(8), xentrace_cpusplit(1)
+=head1 SEE ALSO
+
+xentrace(8)
diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
index 622ea53..d422924 100644
--- a/docs/man/xl.cfg.pod.5
+++ b/docs/man/xl.cfg.pod.5
@@ -41,8 +41,8 @@ value).
=item B<[ VALUE, VALUE, ... ]>
-A list of C<VALUES> of the above types. Lists are homogeneous and are
-not nested.
+A list of C<VALUES> of the above types. Lists can be heterogeneous and
+nested.
=back
@@ -198,7 +198,7 @@ For more details, see F<docs/misc/xl-numa-placement.markdown>.
A domain with a weight of 512 will get twice as much CPU as a domain
with a weight of 256 on a contended host.
Legal weights range from 1 to 65535 and the default is 256.
-Honoured by the credit, credit2 and sedf schedulers.
+Honoured by the credit and credit2 schedulers.
=item B<cap=N>
@@ -222,28 +222,6 @@ that your VM gets 25% of the available power (50% of 1GHz) rather than
look at performance and cpufreq options in your operating system and
your BIOS.
-=item B<period=NANOSECONDS>
-
-The normal EDF scheduling usage in nanoseconds. This means every period
-the domain gets cpu time defined in slice.
-Honoured by the sedf scheduler.
-
-=item B<slice=NANOSECONDS>
-
-The normal EDF scheduling usage in nanoseconds. it defines the time
-a domain get every period time.
-Honoured by the sedf scheduler.
-
-=item B<latency=N>
-
-Scaled period if domain is doing heavy I/O.
-Honoured by the sedf scheduler.
-
-=item B<extratime=BOOLEAN>
-
-Flag for allowing domain to run in extra time.
-Honoured by the sedf scheduler.
-
=back
=head3 Memory Allocation
@@ -264,6 +242,73 @@ if the values of B<memory=> and B<maxmem=> differ.
A "pre-ballooned" HVM guest needs a balloon driver, without a balloon driver
it will crash.
+NOTE: Because of the way ballooning works, the guest has to allocate
+memory to keep track of maxmem pages, regardless of how much memory it
+actually has available to it. A guest with maxmem=262144 and
+memory=8096 will report significantly less memory available for use
+than a system with maxmem=8096 memory=8096 due to the memory overhead
+of having to track the unused pages.
+
+=back
+
+=head3 Guest Virtual NUMA Configuration
+
+=over 4
+
+=item B<vnuma=[ VNODE_SPEC, VNODE_SPEC, ... ]>
+
+Specify virtual NUMA configuration with positional arguments. The
+nth B<VNODE_SPEC> in the list specifies the configuration of the nth
+virtual node.
+
+Note that virtual NUMA for PV guests is not yet supported, because
+there is an issue with cpuid handling that affects PV virtual NUMA.
+Furthermore, guests with virtual NUMA cannot be saved or migrated
+because the migration stream does not preserve node information.
+
+Each B<VNODE_SPEC> is a list, which has the form
+"[VNODE_CONFIG_OPTION,VNODE_CONFIG_OPTION, ... ]" (without quotes).
+
+For example, vnuma = [ ["pnode=0","size=512","vcpus=0-4","vdistances=10,20"] ]
+means vnode 0 is mapped to pnode 0, has 512MB ram, has vcpus 0 to 4, the
+distance to itself is 10 and the distance to vnode 1 is 20.
+
+Each B<VNODE_CONFIG_OPTION> is a quoted key=value pair. Supported
+B<VNODE_CONFIG_OPTION>s are (they are all mandatory at the moment):
+
+=over 4
+
+=item B<pnode=NUMBER>
+
+Specify which physical node this virtual node maps to.
+
+=item B<size=MBYTES>
+
+Specify the size of this virtual node. The sum of the memory sizes of
+all vnodes will become B<maxmem=>. If B<maxmem=> is specified separately,
+a check is performed to make sure the sum of all vnode memory matches
+B<maxmem=>.
+
+=item B<vcpus=CPU-STRING>
+
+Specify which vcpus belong to this node. B<CPU-STRING> is a
+comma-separated string of ranges and single cpus. An example
+is "vcpus=0-5,8", which assigns vcpu 0 to vcpu 5, and vcpu 8,
+to this node.
+
+=item B<vdistances=NUMBER, NUMBER, ... >
+
+Specify virtual distance from this node to all nodes (including
+itself) with positional arguments. For example, "vdistances=10,20"
+for vnode 0 means the distance from vnode 0 to vnode 0 is 10, from
+vnode 0 to vnode 1 is 20. The number of arguments supplied must match
+the total number of vnodes.
+
+Normally you can use the values from "xl info -n" or "numactl
+--hardware" to fill in the vdistances list; a fuller example follows
+after this section.
+
+=back
+
=back
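A fuller two-node sketch of the vnuma option above (all values are
examples only) might look like:

    vnuma = [ ["pnode=0","size=512","vcpus=0-1","vdistances=10,20"],
              ["pnode=1","size=512","vcpus=2-3","vdistances=20,10"] ]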
=head3 Event Actions
@@ -298,12 +343,12 @@ destroy`.
=item B<coredump-destroy>
-write a "coredump" of the domain to F</var/xen/dump/NAME> and then
+write a "coredump" of the domain to F</var/lib/xen/dump/NAME> and then
destroy the domain.
=item B<coredump-restart>
-write a "coredump" of the domain to F</var/xen/dump/NAME> and then
+write a "coredump" of the domain to F</var/lib/xen/dump/NAME> and then
restart the domain.
=back
@@ -398,6 +443,19 @@ not emulated.
Specify that this domain is a driver domain. This enables certain
features needed in order to run a driver domain.
+=item B<device_tree=PATH>
+
+Specify a partial device tree (compiled via the Device Tree Compiler).
+Everything under the node "/passthrough" will be copied into the guest
+device tree. For convenience, the node "/aliases" is also copied to allow
+the user to define aliases which can be used by the guest kernel. A
+compilation example is given after this option list.
+
+Given the complexity of verifying the validity of a device tree, this
+option should only be used with a trusted device tree.
+
+Note that the partial device tree should avoid using phandle 65000,
+which is reserved by the toolstack.
+
=back
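As a sketch of the device_tree=PATH option above, a partial tree can be
compiled with the Device Tree Compiler and referenced from the guest
configuration (hypothetical paths):

    dtc -I dts -O dtb -o passthrough.dtb passthrough.dts

and then in the guest config file:

    device_tree = "/etc/xen/passthrough.dtb"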
=head2 Devices
@@ -453,7 +511,7 @@ value is optional if this is a guest domain.
Specifies the paravirtual framebuffer devices which should be supplied
to the domain.
-This options does not control the emulated graphics card presented to
+This option does not control the emulated graphics card presented to
an HVM guest. See L<Emulated VGA Graphics Device> below for how to
configure the emulated device. If L<Emulated VGA Graphics Device> options
are used in a PV guest configuration, xl will pick up B<vnc>, B<vnclisten>,
@@ -502,15 +560,12 @@ Simple DirectMedia Layer). The default is to not enable this mode.
=item C<display=DISPLAY>
Specifies the X Window display that should be used when the sdl option
-is used. Note: passing this value to the device-model is not currently
-implemented, so providing this option will have no effect.
+is used.
=item C<xauthority=XAUTHORITY>
Specifies the path to the X authority file that should be used to
-connect to the X server when the sdl option is used. Note: passing
-this value to the device-model is not currently implemented, so
-providing this option will have no effect.
+connect to the X server when the sdl option is used.
=item C<opengl=BOOLEAN>
@@ -583,6 +638,79 @@ assigned slave device.
=back
+=item B<rdm="RDM_RESERVATION_STRING">
+
+(HVM/x86 only) Specifies information about Reserved Device Memory (RDM),
+which is necessary to enable robust device passthrough. One example of RDM
+is reported through ACPI Reserved Memory Region Reporting (RMRR) structure
+on x86 platform.
+
+B<RDM_RESERVATION_STRING> has the form C<KEY=VALUE,KEY=VALUE,...> where:
+
+=over 4
+
+=item B<KEY=VALUE>
+
+Possible B<KEY>s are:
+
+=over 4
+
+=item B<strategy="STRING">
+
+Currently there is only one valid type:
+
+"host" means all reserved device memory on this platform should be checked to
+reserve regions in this VM's guest address space. This global rdm parameter
+allows user to specify reserved regions explicitly, and using "host" includes
+all reserved regions reported on this platform, which is useful when doing
+hotplug.
+
+By default this isn't set so we don't check all rdms. Instead, we just check
+rdm specific to a given device if you're assigning this kind of device. Note
+this option is not recommended unless you can make sure any conflict does exist.
+
+For example, suppose you set "memory = 2800" to allocate memory to one
+given VM, but the platform owns two RDM regions:
+
+Device A [sbdf_A]: RMRR region_A: base_addr ac6d3000 end_address ac6e6fff
+Device B [sbdf_B]: RMRR region_B: base_addr ad800000 end_address afffffff
+
+In this conflict case,
+
+#1. If B<strategy> is set to "host", for example,
+
+rdm = "strategy=host,policy=strict" or rdm = "strategy=host,policy=relaxed"
+
+It means all conflicts will be handled according to the policy
+specified by B<policy> as described below.
+
+#2. If B<strategy> is not set at all, but
+
+pci = [ 'sbdf_A, rdm_policy=xxxxx' ]
+
+It means only the conflict with region_A will be handled according to the
+policy specified by B<rdm_policy="STRING"> as described in the pci options.
+
+=item B<policy="STRING">
+
+Specifies how to deal with conflicts when reserving reserved device
+memory in guest address space.
+
+When such a conflict cannot be resolved,
+
+"strict" means the VM can't be created, or the associated device can't be
+attached in the case of hotplug.
+
+"relaxed" allows the VM to be created but it may crash if the
+pass-through device accesses RDM. For example, the Windows IGD GFX driver
+always accesses RDM regions, so it leads to a VM crash.
+
+Note this may be overridden by the rdm_policy option in the PCI device
+configuration.
+
+=back
+
+=back
+
=item B<pci=[ "PCI_SPEC_STRING", "PCI_SPEC_STRING", ... ]>
Specifies the host PCI devices to passthrough to this guest. Each B<PCI_SPEC_STRING>
@@ -645,6 +773,14 @@ dom0 without confirmation. Please use with care.
D0-D3hot power management states for the PCI device. False (0) by
default.
+=item B<rdm_policy="STRING">
+
+(HVM/x86 only) This is the same as the policy option inside the rdm
+option, but specific to a given device. The default is therefore
+"relaxed", the same as the policy option.
+
+Note this overrides the global B<rdm> option.
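+
+For example (the BDF is illustrative):
+
+    pci = [ '01:00.0,rdm_policy=strict' ]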
+
=back
=back
@@ -714,6 +850,33 @@ More information about Xen gfx_passthru feature is available
on the XenVGAPassthrough L<http://wiki.xen.org/wiki/XenVGAPassthrough>
wiki page.
+=item B<rdm_mem_boundary=MBYTES>
+
+Number of megabytes to set a boundary for checking rdm conflicts.
+
+When RDM conflicts with RAM, the RDM regions may be scattered over the
+whole RAM space. Multiple RDM entries would worsen this and lead to a
+complicated memory layout. We therefore use a simple rule, based on a
+predefined boundary, to avoid breaking the existing layout. When a
+conflict occurs,
+
+ #1. Above the predefined boundary
+ - move lowmem_end below the reserved region to solve the conflict;
+
+ #2. Below the predefined boundary
+ - check the strict/relaxed policy.
+ The "strict" policy causes libxl to fail. Note when both policies
+ are specified on a given region, 'strict' is always preferred.
+ The "relaxed" policy issues a warning message and also marks this
+ entry INVALID to indicate we shouldn't expose this entry to
+ hvmloader.
+
+The default is 2048 (i.e. 2GB).
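+
+For example, to raise the boundary to 4GB (an illustrative value):
+
+    rdm_mem_boundary = 4096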
+
+=item B<dtdev=[ "DTDEV_PATH", "DTDEV_PATH", ... ]>
+
+Specifies the host device tree nodes to passthrough to this guest. Each
+DTDEV_PATH is the absolute path in the device tree.
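+
+For example (the node path is illustrative):
+
+    dtdev = [ "/soc/ethernet@fff51000" ]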
+
=item B<ioports=[ "IOPORT_RANGE", "IOPORT_RANGE", ... ]>
Allow guest to access specific legacy I/O ports. Each B<IOPORT_RANGE>
@@ -827,6 +990,20 @@ default is B<cd>.
=back
+=head3 Emulated disk controller type
+
+=over 4
+
+=item B<hdtype="STRING">
+
+Select the hd disk type (ide|ahci).
+If hdtype=ahci, an ich9 disk controller in AHCI mode is added and used
+with upstream qemu to emulate disks instead of IDE. This decreases boot
+time but may not be supported by default in Windows XP and older
+Windows versions. The default is ide.
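+
+For example:
+
+    hdtype = "ahci"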
+
+=back
+
=head3 Paging
The following options control the mechanisms used to virtualise guest
@@ -958,6 +1135,18 @@ enabled by default and you should usually omit it. It may be necessary
to disable the HPET in order to improve compatibility with guest
Operating Systems (X86 only)
+=item B<altp2mhvm=BOOLEAN>
+
+Enables or disables hvm guest access to alternate-p2m capability.
+Alternate-p2m allows a guest to manage multiple p2m guest physical
+"memory views" (as opposed to a single p2m). This option is
+disabled by default and is available only to hvm domains.
+You may want this option if you wish to access-control or isolate
+access to specific guest physical memory pages accessed by
+the guest, e.g. for HVM domain memory introspection or
+for isolation/access-control of memory between components within
+a single guest hvm domain.
+
=item B<nestedhvm=BOOLEAN>
Enable or disables guest access to hardware virtualisation features,
@@ -1231,6 +1420,12 @@ This group incorporates Partition Time Reference Counter MSR. This
enlightenment can improve performance of Windows 8 and Windows
Server 2012 onwards.
+=item B<reference_tsc>
+
+This set incorporates the Partition Reference TSC MSR. This
+enlightenment can improve performance of Windows 7 and Windows
+Server 2008 R2 onwards.
+
=item B<defaults>
This is a special value that enables the default set of groups, which
@@ -1292,6 +1487,9 @@ qemu-xen-traditional device-model, the amount of video RAM is fixed at 4 MB,
which is sufficient for 1024x768 at 32 bpp. For the upstream qemu-xen
device-model, the default and minimum is 8 MB.
+For B<qxl> vga, both the default and minimum are 128MB.
+If B<videoram> is set to less than 128MB, an error will be triggered.
+
=item B<stdvga=BOOLEAN>
Select a standard VGA card with VBE (VESA BIOS Extensions) as the
@@ -1303,9 +1501,14 @@ This option is deprecated, use vga="stdvga" instead.
=item B<vga="STRING">
-Selects the emulated video card (none|stdvga|cirrus).
+Selects the emulated video card (none|stdvga|cirrus|qxl).
The default is cirrus.
+In general, QXL should work with the Spice remote display protocol
+for acceleration, and a QXL driver is necessary in the guest in this case.
+QXL can also work with the VNC protocol, but it will behave like a
+standard VGA card without acceleration.
+
=item B<vnc=BOOLEAN>
Allow access to the display via the VNC protocol. This enables the
@@ -1421,6 +1624,17 @@ for redirection of up to 4 usb devices from spice client to domU's qemu.
It requires an usb controller and if not defined it will automatically adds
an usb2 controller. The default is disabled (0).
+=item B<spice_image_compression=[auto_glz|auto_lz|quic|glz|lz|off]>
+
+Specifies what image compression is to be used by spice (if given), otherwise
+the qemu default will be used. Please see the documentation of your current
+qemu version for details.
+
+=item B<spice_streaming_video=[filter|all|off]>
+
+Specifies what streaming video setting is to be used by spice (if given),
+otherwise the qemu default will be used.
+
=back
=head3 Miscellaneous Emulated Hardware
@@ -1586,6 +1800,40 @@ The default is B<en-us>.
See L<qemu(1)> for more information.
+=head2 Architecture Specific options
+
+=head3 ARM
+
+=over 4
+
+=item B<gic_version="vN">
+
+Version of the GIC emulated for the guest. Currently, the following
+versions are supported:
+
+=over 4
+
+=item B<v2>
+
+Emulate a GICv2
+
+=item B<v3>
+
+Emulate a GICv3. Note that the emulated GIC does not support the
+GICv2 compatibility mode.
+
+=item B<default>
+
+Emulate the same version as the native GIC hardware used by the host
+where the domain was created.
+
+=back
+
+This requires hardware compatibility with the requested version, either
+natively or via hardware backwards-compatibility support.
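+
+For example:
+
+    gic_version = "v3"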
+
+=back
+
=head1 SEE ALSO
=over 4
@@ -1605,7 +1853,7 @@ See L<qemu(1)> for more information.
=head1 FILES
F</etc/xen/NAME.cfg>
-F</var/xen/dump/NAME>
+F</var/lib/xen/dump/NAME>
=head1 BUGS
diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index 6b89ba8..d0cd612 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -115,6 +115,9 @@ Create will return B<as soon> as the domain is started. This B<does
not> mean the guest OS in the domain has actually booted, or is
available for input.
+If the I<-F> option is specified, create will start the domain and not
+return until its death.
+
B<OPTIONS>
=over 4
@@ -131,6 +134,10 @@ Use the given configuration file.
Leave the domain paused after it is created.
+=item B<-F>
+
+Run in foreground until death of the domain.
+
=item B<-V>, B<--vncviewer>
Attach to domain's VNC server, forking a vncviewer process.
@@ -273,7 +280,7 @@ Change the domain name of I<domain-id> to I<new-name>.
Dumps the virtual machine's memory for the specified domain to the
I<filename> specified, without pausing the domain. The dump file will
be written to a distribution specific directory for dump files. Such
-as: /var/lib/xen/dump or /var/xen/dump.
+as: /var/lib/xen/dump.
=item B<help> [I<--long>]
@@ -304,6 +311,14 @@ Also prints the security labels.
Also prints the domain UUIDs, the shutdown reason and security labels.
+=item B<-c>, B<--cpupool>
+
+Also prints the cpupool the domain belongs to.
+
+=item B<-n>, B<--numa>
+
+Also prints the domain NUMA node affinity.
+
=back
B<EXAMPLE>
@@ -1016,48 +1031,6 @@ Restrict output to domains in the specified cpupool.
=back
-=item B<sched-sedf> [I<OPTIONS>]
-
-Set or get Simple EDF (Earliest Deadline First) scheduler parameters. This
-scheduler provides weighted CPU sharing in an intuitive way and uses
-realtime-algorithms to ensure time guarantees. For more information see
-docs/misc/sedf_scheduler_mini-HOWTO.txt in the Xen distribution.
-
-B<OPTIONS>
-
-=over 4
-
-=item B<-d DOMAIN>, B<--domain=DOMAIN>
-
-Specify domain for which scheduler parameters are to be modified or retrieved.
-Mandatory for modifying scheduler parameters.
-
-=item B<-p PERIOD>, B<--period=PERIOD>
-
-The normal EDF scheduling usage in milliseconds.
-
-=item B<-s SLICE>, B<--slice=SLICE>
-
-The normal EDF scheduling usage in milliseconds.
-
-=item B<-l LATENCY>, B<--latency=LATENCY>
-
-Scaled period if domain is doing heavy I/O.
-
-=item B<-e EXTRA>, B<--extra=EXTRA>
-
-Flag for allowing domain to run in extra time (0 or 1).
-
-=item B<-w WEIGHT>, B<--weight=WEIGHT>
-
-Another way of setting CPU slice.
-
-=item B<-c CPUPOOL>, B<--cpupool=CPUPOOL>
-
-Restrict output to domains in the specified cpupool.
-
-=back
-
=item B<sched-rtds> [I<OPTIONS>]
Set or get rtds (Real Time Deferrable Server) scheduler parameters.
@@ -1139,13 +1112,30 @@ This is possible only if no domain is active in the cpu-pool.
Renames a cpu-pool to I<newname>.
-=item B<cpupool-cpu-add> I<cpu-pool> I<cpu-nr|node:node-nr>
+=item B<cpupool-cpu-add> I<cpu-pool> I<cpus|node:nodes>
+
+Adds one or more CPUs or NUMA nodes to I<cpu-pool>. CPUs and NUMA
+nodes can be specified as single CPU/node IDs or as ranges.
-Adds a cpu or all cpus of a numa node to a cpu-pool.
+For example:
+
+ (a) xl cpupool-cpu-add mypool 4
+ (b) xl cpupool-cpu-add mypool 1,5,10-16,^13
+ (c) xl cpupool-cpu-add mypool node:0,nodes:2-3,^10-12,8
+
+means adding CPU 4 to mypool, in (a); adding CPUs 1,5,10,11,12,14,15
+and 16, in (b); and adding all the CPUs of NUMA nodes 0, 2 and 3,
+plus CPU 8, but keeping out CPUs 10,11,12, in (c).
-=item B<cpupool-cpu-remove> I<cpu-nr|node:node-nr>
+All the specified CPUs that can be added to the cpupool will be added
+to it. If some CPUs can't be (e.g., because they're already part of
+another cpupool), an error is reported for each of them.
-Removes a cpu or all cpus of a numa node from a cpu-pool.
+=item B<cpupool-cpu-remove> I<cpus|node:nodes>
+
+Removes one or more CPUs or NUMA nodes from I<cpu-pool>. CPUs and NUMA
+nodes can be specified as single CPU/node IDs or as ranges, using the
+exact same syntax as in B<cpupool-cpu-add> above.
=item B<cpupool-migrate> I<domain> I<cpu-pool>
@@ -1441,8 +1431,8 @@ Determine if the FLASK security module is loaded and enforcing its policy.
=item B<setenforce> I<1|0|Enforcing|Permissive>
Enable or disable enforcing of the FLASK access controls. The default is
-permissive and can be changed using the flask_enforcing option on the
-hypervisor's command line.
+permissive, but this can be changed to enforcing by specifying "flask=enforcing"
+or "flask=late" on the hypervisor's command line.
=item B<loadpolicy> I<policy-file>
@@ -1452,14 +1442,52 @@ policy. Loading new security policy will reset runtime changes to device labels.
=back
-=head1 CACHE MONITORING TECHNOLOGY
+=head1 PLATFORM SHARED RESOURCE MONITORING/CONTROL
+
+Intel Haswell and later server platforms offer shared resource monitoring
+and control technologies. The availability of these technologies and the
+hardware capabilities can be shown with B<psr-hwinfo>.
+
+See L<http://xenbits.xen.org/docs/unstable/misc/xl-psr.html> for more
+information.
+
+=over 4
+
+=item B<psr-hwinfo> [I<OPTIONS>]
+
+Show Platform Shared Resource (PSR) hardware information.
+
+B<OPTIONS>
+
+=over 4
+
+=item B<-m>, B<--cmt>
+
+Show Cache Monitoring Technology (CMT) hardware information.
+
+=item B<-a>, B<--cat>
+
+Show Cache Allocation Technology (CAT) hardware information.
+
+=back
+
+=back
+
+=head2 CACHE MONITORING TECHNOLOGY
Intel Haswell and later server platforms offer monitoring capability in each
logical processor to measure specific platform shared resource metric, for
-example, L3 cache occupancy. In Xen implementation, the monitoring granularity
-is domain level. To monitor a specific domain, just attach the domain id with
-the monitoring service. When the domain doesn't need to be monitored any more,
-detach the domain id from the monitoring service.
+example, L3 cache occupancy. In the Xen implementation, the monitoring
+granularity is domain level. To monitor a specific domain, just attach the
+domain id to the monitoring service. When the domain no longer needs to be
+monitored, detach the domain id from the monitoring service.
+
+Intel Broadwell and later server platforms also offer total/local memory
+bandwidth monitoring. Xen supports per-domain monitoring for these two
+additional monitoring types. Both memory bandwidth monitoring and L3 cache
+occupancy monitoring share the same underlying monitoring service. Once
+a domain is attached to the monitoring service, monitoring data can be shown
+for any of these monitoring types.
=over 4
@@ -1475,7 +1503,53 @@ detach: Detach the platform shared resource monitoring service from a domain.
Show monitoring data for a certain domain or all domains. Current supported
monitor types are:
- - "cache-occupancy": showing the L3 cache occupancy.
+ - "cache-occupancy": showing the L3 cache occupancy(KB).
+ - "total-mem-bandwidth": showing the total memory bandwidth(KB/s).
+ - "local-mem-bandwidth": showing the local memory bandwidth(KB/s).
+
+=back
+
+=head2 CACHE ALLOCATION TECHNOLOGY
+
+Intel Broadwell and later server platforms offer capabilities to configure and
+make use of the Cache Allocation Technology (CAT) mechanisms, which enable more
+cache resources (i.e. L3 cache) to be made available for high priority
+applications. In the Xen implementation, CAT is used to control cache
+allocation on a per-VM basis. To enforce cache on a specific domain, just
+set capacity bitmasks (CBM) for the domain.
+
+=over 4
+
+=item B<psr-cat-cbm-set> [I<OPTIONS>] I<domain-id> I<cbm>
+
+Set cache capacity bitmasks (CBM) for a domain. For how to specify I<cbm>
+please refer to L<http://xenbits.xen.org/docs/unstable/misc/xl-psr.html>.
+
+B<OPTIONS>
+
+=over 4
+
+=item B<-s SOCKET>, B<--socket=SOCKET>
+
+Specify the socket to process, otherwise all sockets are processed.
+
+=back
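+
+For example, to give domain 1 the top half of an 8-bit CBM on socket 0
+(the values are illustrative):
+
+    xl psr-cat-cbm-set -s 0 1 0xf0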
+
+=item B<psr-cat-show> [I<domain-id>]
+
+Show CAT settings for a certain domain or all domains.
+
+=back
+
+=head1 IGNORED FOR COMPATIBILITY WITH XM
+
+xl is mostly command-line compatible with the old xm utility used with
+the old Python xend. For compatibility, the following options are
+ignored:
+
+=over 4
+
+=item B<xl migrate --live>
=back
@@ -1502,6 +1576,7 @@ And the following documents on the xen.org website:
L<http://xenbits.xen.org/docs/unstable/misc/xl-network-configuration.html>
L<http://xenbits.xen.org/docs/unstable/misc/xl-disk-configuration.txt>
L<http://xenbits.xen.org/docs/unstable/misc/xsm-flask.txt>
+L<http://xenbits.xen.org/docs/unstable/misc/xl-psr.html>
For systems that don't automatically bring CPU online:
diff --git a/docs/man/xlcpupool.cfg.pod.5 b/docs/man/xlcpupool.cfg.pod.5
index e32ce17..792cf4f 100644
--- a/docs/man/xlcpupool.cfg.pod.5
+++ b/docs/man/xlcpupool.cfg.pod.5
@@ -74,9 +74,9 @@ the credit scheduler
the credit2 scheduler
-=item B<sedf>
+=item B<rtds>
-the SEDF scheduler
+the RTDS scheduler
=back
@@ -89,10 +89,26 @@ Specifies the cpus of the NUMA-nodes given in C<NODES> (an integer or
a list of integers) to be member of the cpupool. The free cpus in the
specified nodes are allocated in the new cpupool.
-=item B<cpus="CPUS">
+=item B<cpus="CPU-LIST">
-The specified C<CPUS> are allocated in the new cpupool. All cpus must
-be free. Must not be specified together with B<nodes>.
+Specifies the cpus that will be member of the cpupool. All the specified
+cpus must be free, or creation will fail. C<CPU-LIST> may be specified
+as follows:
+
+=over 4
+
+=item ["2", "3", "5"]
+
+means that cpus 2,3,5 will be members of the cpupool.
+
+=item "0-3,5,^1"
+
+means that cpus 0,2,3 and 5 will be members of the cpupool. A "node:" or
+"nodes:" modifier can be used. E.g., "0,node:1,nodes:2-3,^10-13" means
+that pcpu 0, plus all the cpus of NUMA nodes 1,2,3 with the exception of
+cpus 10,11,12,13, will be members of the cpupool.
+
+=back
If neither B<nodes> nor B<cpus> are specified only the first free cpu
found will be allocated in the new cpupool.
diff --git a/docs/misc/arm/device-tree/passthrough.txt b/docs/misc/arm/device-tree/passthrough.txt
new file mode 100644
index 0000000..6715646
--- /dev/null
+++ b/docs/misc/arm/device-tree/passthrough.txt
@@ -0,0 +1,9 @@
+Device passthrough
+===================
+
+Any device with the property "xen,passthrough" set will not be exposed to
+DOM0 and therefore no driver will be loaded.
+
+It is highly recommended to set this property on devices which are passed
+through since many devices will not cope with being accessed by dom0 and
+then handed over to another domain.
diff --git a/docs/misc/arm/early-printk.txt b/docs/misc/arm/early-printk.txt
index 71a0247..7e03955 100644
--- a/docs/misc/arm/early-printk.txt
+++ b/docs/misc/arm/early-printk.txt
@@ -7,24 +7,51 @@ Note that selecting this option will limit Xen to a single UART definition.
Attempting to boot Xen image on a different platform *will not work*, so this
option should not be enable for Xens that are intended to be portable.
-CONFIG_EARLY_PRINTK=mach
-where mach is the name of the machine:
- - vexpress: printk with pl011 for versatile express
+CONFIG_EARLY_PRINTK=<INC>,<BASE_ADDRESS>,<OTHER_OPTIONS>
+
+<INC> and <BASE_ADDRESS> are mandatory arguments:
+
+ - <INC> is the name of the driver, see xen/arch/arm/arm{32,64}/debug-*.inc
+ (where <INC> corresponds to the wildcarded *).
+ - <BASE_ADDRESS> is the base physical address of the UART to use
+
+<OTHER_OPTIONS> varies depending on <INC>:
+
+ - 8250,<BASE_ADDRESS>,<REG_SHIFT>
+ - <REG_SHIFT> is, optionally, the left-shift to apply to the
+ register offsets within the uart.
+ - pl011,<BASE_ADDRESS>,<BAUD_RATE>
+ - <BAUD_RATE> is, optionally a baud rate which should be used to
+ configure the UART at start of day.
+
+ If <BAUD_RATE> is not given then the code will not try to
+ initialize the UART, so that bootloader or firmware settings can
+ be used for maximum compatibility.
+ - For all other uarts there are no additional options.
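+
+For example, a Xen built for a pl011 UART at physical address 0x1c090000
+(the address and baud rate here are illustrative) would use:
+
+ CONFIG_EARLY_PRINTK=pl011,0x1c090000,115200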
+
+As a convenience it is also possible to select from a list of
+predefined configurations using CONFIG_EARLY_PRINTK=mach where mach is
+the name of the machine:
+
+ - brcm: printk with 8250 on Broadcom 7445D0 boards with A15 processors.
+ - dra7: printk with 8250 on DRA7 platform
- exynos5250: printk with the second UART
- - midway: printk with the pl011 on Calxeda Midway processors
- fastmodel: printk on ARM Fastmodel software emulators
+ - hip04-d01: printk with 8250 on HiSilicon Hip-04 D01
+ - juno: printk with pl011 on Juno platform
+ - lager: printk with SCIF0 on Renesas R-Car H2 processors
+ - midway: printk with the pl011 on Calxeda Midway processors
- omap5432: printk with UART3 on TI OMAP5432 processors
+ - seattle: printk with pl011 for AMD Seattle processor
- sun6i: printk with 8250 on Allwinner A31 processors
- sun7i: printk with 8250 on Allwinner A20 processors
- - brcm: printk with 8250 on Broadcom 7445D0 boards with A15 processors.
- - hip04-d01: printk with 8250 on HiSilicon Hip-04 D01
- - seattle: printk with pl011 for AMD Seattle processor
+ - thunderx: printk with pl011 for Cavium ThunderX processor
+ - vexpress: printk with pl011 for versatile express
+ - xgene-mcdivitt: printk with 8250 on Xgene mcdivitt platform
+ - xgene-storm: printk with 8250 on Xgene storm platform
+ - zynqmp: printk with Cadence UART for Xilinx ZynqMP SoCs
-The base address and baud rate is hardcoded in xen/arch/arm/Rules.mk,
+These settings are hardcoded in xen/arch/arm/Rules.mk,
see there when adding support for new machines.
-If not explicitly requested with "EARLY_PRINTK_INIT_UART := y" in Rules.mk,
-the code will not try to initialize the UART, so that bootloader or
-firmware settings can be used for maximum compatibility. The baud rate
-parameter is ignored in this case.
By default early printk is disabled.
diff --git a/docs/misc/arm/passthrough.txt b/docs/misc/arm/passthrough.txt
new file mode 100644
index 0000000..082e9ab
--- /dev/null
+++ b/docs/misc/arm/passthrough.txt
@@ -0,0 +1,62 @@
+Passthrough a device described in the Device Tree to a guest
+============================================================
+
+The example will use the secondary network card for the midway server.
+
+1) Mark the device to let Xen know the device will be used for passthrough.
+This is done in the device tree node describing the device by adding the
+property "xen,passthrough". The command to do it in U-Boot is:
+
+ fdt set /soc/ethernet@fff51000 xen,passthrough
+
+2) Create a partial device tree describing the device. The IRQs are mapped
+1:1 to the guest (i.e. VIRQ == IRQ). For MMIO, you will have to find a hole
+in the guest memory layout (see xen/include/public/arch-arm.h; note that
+the layout is not stable and can change between versions of Xen).
+
+/dts-v1/;
+
+/ {
+ /* #*cells are here to keep DTC happy */
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ aliases {
+ net = &mac0;
+ };
+
+ passthrough {
+ compatible = "simple-bus";
+ ranges;
+ #address-cells = <2>;
+ #size-cells = <2>;
+ mac0: ethernet@10000000 {
+ compatible = "calxeda,hb-xgmac";
+ reg = <0 0x10000000 0 0x1000>;
+ interrupts = <0 80 4 0 81 4 0 82 4>;
+ };
+ };
+};
+
+Note:
+ * The interrupt-parent property will be added by the toolstack in the
+ root node;
+ * The following properties are mandatory in the /passthrough node:
+ - compatible: It should always contain "simple-bus"
+ - ranges
+ - #address-cells
+ - #size-cells
+ * See http://www.devicetree.org/Device_Tree_Usage for more
+ information about device tree.
+
+3) Compile the partial guest device tree with dtc (Device Tree Compiler).
+For our purposes, the compiled file will be called guest-midway.dtb and
+placed in /root in DOM0.
+
+4) Add the following options in the guest configuration file:
+
+device_tree = "/root/guest-midway.dtb"
+dtdev = [ "/soc/ethernet@fff51000" ]
+irqs = [ 112, 113, 114 ]
+iomem = [ "0xfff51,1 at 0x10000" ]
+
diff --git a/docs/misc/efi.markdown b/docs/misc/efi.markdown
index f435ec7..5b54314 100644
--- a/docs/misc/efi.markdown
+++ b/docs/misc/efi.markdown
@@ -50,7 +50,7 @@ thus look like this (`#` serving as comment character):
[sle11sp2]
options=console=vga,com1 com1=57600 loglvl=all noreboot
- kernel=vmlinuz-3.0.31-0.4-xen ignore_loglevel #earlyprintk=xen
+ kernel=vmlinuz-3.0.31-0.4-xen [domain 0 command line options]
ramdisk=initrd-3.0.31-0.4-xen
**************************example end********************************
@@ -73,6 +73,12 @@ Line Options](xen-command-line.html).
Specifies the Dom0 kernel binary and the options to pass to it.
+The options should in general be the same as are used when booting
+natively, e.g. including `root=...` etc.
+
+Check your bootloader (e.g. grub) configuration or `/proc/cmdline` for
+the native configuration.
+
###`ramdisk=<filename>`
Specifies a Linux-style initial RAM disk image to load.
diff --git a/docs/misc/grant-tables.txt b/docs/misc/grant-tables.txt
index 19db4ec..417ce2d 100644
--- a/docs/misc/grant-tables.txt
+++ b/docs/misc/grant-tables.txt
@@ -63,6 +63,7 @@ is complete.
act->domid : remote domain being granted rights
act->frame : machine frame being granted
act->pin : used to hold reference counts
+ act->lock : spinlock used to serialize access to active entry state
Map tracking
~~~~~~~~~~~~
@@ -74,7 +75,59 @@ is complete.
matching map track entry is then removed, as if unmap had been invoked.
These are not used by the transfer mechanism.
map->domid : owner of the mapped frame
- map->ref_and_flags : grant reference, ro/rw, mapped for host or device access
+ map->ref : grant reference
+ map->flags : ro/rw, mapped for host or device access
+
+********************************************************************************
+ Locking
+ ~~~~~~~
+ Xen uses several locks to serialize access to the internal grant table state.
+
+ grant_table->lock : rwlock used to prevent readers from accessing
+ inconsistent grant table state such as current
+ version, partially initialized active table pages,
+ etc.
+ grant_table->maptrack_lock : spinlock used to protect the maptrack free list
+ active_grant_entry->lock : spinlock used to serialize modifications to
+ active entries
+
+ The primary lock for the grant table is a read/write spinlock. All
+ functions that access members of struct grant_table must acquire a
+ read lock around critical sections. Any modification to the members
+ of struct grant_table (e.g., nr_status_frames, nr_grant_frames,
+ active frames, etc.) must only be made if the write lock is
+ held. These elements are read-mostly, and read critical sections can
+ be large, which makes a rwlock a good choice.
+
+ The maptrack free list is protected by its own spinlock. The maptrack
+ lock may be locked while holding the grant table lock.
+
+ Active entries are obtained by calling active_entry_acquire(gt, ref).
+ This function returns a pointer to the active entry after locking its
+ spinlock. The caller must hold the grant table read lock before
+ calling active_entry_acquire(). This is because the grant table can
+ be dynamically extended via gnttab_grow_table() while a domain is
+ running and must be fully initialized. Once all access to the active
+ entry is complete, release the lock by calling active_entry_release(act).
+
+ Summary of rules for locking:
+ active_entry_acquire() and active_entry_release() can only be
+ called when holding the relevant grant table's read lock. I.e.:
+ read_lock(&gt->lock);
+ act = active_entry_acquire(gt, ref);
+ ...
+ active_entry_release(act);
+ read_unlock(&gt->lock);
+
+ Active entries cannot be acquired while holding the maptrack lock.
+ Multiple active entries can be acquired while holding the grant table
+ _write_ lock.
+
+ Maptrack entries are protected by the corresponding active entry
+ lock. As an exception, new maptrack entries may be populated without
+ holding the lock, provided the flags field is written last. This
+ requires that any maptrack entry user validate the flags field as
+ non-zero first.
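+
+ An illustrative sketch of this publication pattern (the names here
+ are assumptions for illustration, not the literal Xen code):
+
+ /* writer: populate a fresh maptrack entry */
+ map->domid = domid;
+ map->ref = ref;
+ smp_wmb(); /* order the fields before flags */
+ map->flags = flags; /* non-zero; written last */
+
+ /* reader: treat the entry as valid only once flags is non-zero */
+ if ( map->flags != 0 )
+ use_entry(map); /* safe to read map->domid and map->ref */
+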
********************************************************************************
diff --git a/docs/misc/qemu-upstream_howto_use_it.markdown b/docs/misc/qemu-upstream_howto_use_it.markdown
deleted file mode 100644
index 8370fac..0000000
--- a/docs/misc/qemu-upstream_howto_use_it.markdown
+++ /dev/null
@@ -1,12 +0,0 @@
-Using Upstream QEMU with Xen
-============================
-
-If you want to build with the QEMU unstable tree, follow the [QEMU
-Upstream](http://wiki.xen.org/wiki/QEMU_Upstream) wiki page.
-
-Otherwise, QEMU/SeaBIOS is now integrated into the build system, so you just
-have to specify the device model version in an `xl` config file:
-
- device_model_version = 'qemu-xen'
-
-The version of QEMU used in the build system is the last release of QEMU.
diff --git a/docs/misc/sedf_scheduler_mini-HOWTO.txt b/docs/misc/sedf_scheduler_mini-HOWTO.txt
deleted file mode 100644
index 6742867..0000000
--- a/docs/misc/sedf_scheduler_mini-HOWTO.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-sEDF scheduler
---------------
-Author:
- Stephan.Diestelhorst@{cl.cam.ac.uk, inf.tu-dresden.de}
-
-Overview:
- This scheduler provides weighted CPU sharing in an intuitive way and
- uses realtime-algorithms to ensure time guarantees.
-
-Usage:
- -add "sched=sedf" on Xen's boot command-line
- -create domains as usual
- -use "xm sched-sedf <dom-id> <period> <slice> <latency-hint> <extra> <weight>"
- Where:
- -period/slice are the normal EDF scheduling parameters in nanosecs
- -latency-hint is the scaled period in case the domain is doing heavy I/O
- (unused by the currently compiled version)
- -extra is a flag (0/1), which controls whether the domain can run in
- extra-time
- -weight is mutually exclusive with period/slice and specifies another
- way of setting a domains cpu slice
-
-Examples:
- normal EDF (20ms/5ms):
- xm sched-sedf <dom-id> 20000000 5000000 0 0 0
-
- best-effort domains (i.e. non-realtime):
- xm sched-sedf <dom-id> 20000000 0 0 1 0
-
- normal EDF (20ms/5ms) + share of extra-time:
- xm sched-sedf <dom-id> 20000000 5000000 0 1 0
-
- 4 domains with weights 2:3:4:2
- xm sched-sedf <d1> 0 0 0 0 2
- xm sched-sedf <d2> 0 0 0 0 3
- xm sched-sedf <d3> 0 0 0 0 4
- xm sched-sedf <d4> 0 0 0 0 2
-
- 1 fully-specified (10ms/3ms) domain, 3 other domains share
- available rest in 2:7:3 ratio:
- xm sched-sedf <d1> 10000000 3000000 0 0 0
- xm sched-sedf <d2> 0 0 0 0 2
- xm sched-sedf <d3> 0 0 0 0 7
- xm sched-sedf <d4> 0 0 0 0 3
diff --git a/stubdom/README b/docs/misc/stubdom.txt
similarity index 100%
rename from stubdom/README
rename to docs/misc/stubdom.txt
diff --git a/docs/misc/vbd-interface.txt b/docs/misc/vbd-interface.txt
index f873db0..1c996bf 100644
--- a/docs/misc/vbd-interface.txt
+++ b/docs/misc/vbd-interface.txt
@@ -3,18 +3,20 @@ Xen guest interface
A Xen guest can be provided with block devices. These are always
provided as Xen VBDs; for HVM guests they may also be provided as
-emulated IDE or SCSI disks.
+emulated IDE, AHCI or SCSI disks.
The abstract interface involves specifying, for each block device:
* Nominal disk type: Xen virtual disk (aka xvd*, the default); SCSI
- (sd*); IDE (hd*).
+ (sd*); IDE or AHCI (hd*).
For HVM guests, each whole-disk hd* and and sd* device is made
available _both_ via emulated IDE resp. SCSI controller, _and_ as a
Xen VBD. The HVM guest is entitled to assume that the IDE or SCSI
disks available via the emulated IDE controller target the same
underlying devices as the corresponding Xen VBD (ie, multipath).
+ In the hd* case with hdtype=ahci, the disk will be AHCI via an emulated
+ ich9 disk controller.
For PV guests every device is made available to the guest only as a
Xen VBD. For these domains the type is advisory, for use by the
diff --git a/docs/misc/vtd.txt b/docs/misc/vtd.txt
index 9af0e99..88b2102 100644
--- a/docs/misc/vtd.txt
+++ b/docs/misc/vtd.txt
@@ -111,6 +111,30 @@ in the config file:
To override for a specific device:
pci = [ '01:00.0,msitranslate=0', '03:00.0' ]
+RDM, 'reserved device memory', for PCI Device Passthrough
+---------------------------------------------------------
+
+There are some devices the BIOS controls, e.g. USB devices used to perform
+PS2 emulation. The regions of memory used by these devices are marked
+reserved in the e820 map. When we turn on DMA translation, DMA to those
+regions will fail. Hence the BIOS uses RMRRs to specify these regions along
+with the devices that need to access them. The OS is expected to set up
+identity mappings for these regions so these devices can access them.
+
+While creating a VM we should reserve these regions in advance to avoid any
+conflicts. So we introduce user-configurable parameters to specify RDM
+resources and the corresponding policies.
+
+To enable this globally, add "rdm" in the config file:
+
+ rdm = "strategy=host, policy=relaxed" (default policy is "relaxed")
+
+Or just for a specific device:
+
+ pci = [ '01:00.0,rdm_policy=relaxed', '03:00.0,rdm_policy=strict' ]
+
+For all the options available to RDM, see xl.cfg(5).
+
Caveat on Conventional PCI Device Passthrough
---------------------------------------------
diff --git a/docs/misc/vtpmmgr.txt b/docs/misc/vtpmmgr.txt
index 026c52b..d4f756c 100644
--- a/docs/misc/vtpmmgr.txt
+++ b/docs/misc/vtpmmgr.txt
@@ -1,4 +1,8 @@
-Author: Daniel De Graaf <dgdegra at tycho.nsa.gov>
+================================================================================
+Authors:
+ Daniel De Graaf <dgdegra@tycho.nsa.gov>
+ Quan Xu <quan.xu@intel.com>
+================================================================================
This document describes the operation and command line interface of
vtpmmgr-stubdom. See docs/misc/vtpm.txt for details on the vTPM subsystem as a
@@ -163,3 +167,152 @@ would look like the following:
This requires the migration domain to be added to the list of valid vTPM kernel
hashes. In the current version of the vtpmmgr domain, this is the hash of the
XSM label, not the kernel.
+
+================================================================================
+Appendix B: vtpmmgr on TPM 2.0
+================================================================================
+
+Manager disk image setup:
+-------------------------
+
+The vTPM Manager requires a disk image to store its encrypted data. The image
+does not require a filesystem and can live anywhere on the host disk. The image
+is not large; the Xen 4.5 vtpmmgr is limited to using the first 2MB of the image
+but can support more than 20,000 vTPMs.
+
+ dd if=/dev/zero of=/home/vtpm2/vmgr bs=16M count=1
+
+Manager config file:
+--------------------
+
+The vTPM Manager domain (vtpmmgr-stubdom) must be started like any other Xen
+virtual machine and requires a config file. The manager requires a disk image
+for storage and permission to access the hardware memory pages for the TPM. The
+disk must be presented as "hda", and the TPM memory pages are passed using the
+iomem configuration parameter. The TPM TIS uses 5 pages of IO memory (one per
+locality) that start at physical address 0xfed40000. By default, the TPM manager
+uses locality 0 (so only the page at 0xfed40 is needed).
+
+Add the
+
+ extra="tpm2=1"
+
+option to launch the vtpmmgr-stubdom domain on TPM 2.0; it is ignored on
+TPM 1.x. For example:
+
+ kernel="/usr/lib/xen/boot/vtpmmgr-stubdom.gz"
+ memory=128
+ disk=["file:/home/vtpm2/vmgr,hda,w"]
+ name="vtpmmgr"
+ iomem=["fed40,5"]
+ extra="tpm2=1"
+
+
+Key Hierarchy
+------------------------------
+
+ +------------------+
+ | vTPM's secrets | ...
+ +------------------+
+ | ^
+ | |(Bind / Unbind)
+- - - - - -v |- - - - - - - - TPM 2.0
+ +------------------+
+ | SK +
+ +------------------+
+ | ^
+ v |
+ +------------------+
+ | SRK |
+ +------------------+
+ | ^
+ v |
+ +------------------+
+ | TPM 2.0 Storage |
+ | Primary Seed |
+ +------------------+
+
+Currently, the secrets for the vTPMs are only bound to the presence of the
+physical TPM 2.0. Since using PCRs to seal the data can be an important
+security feature that users of the vtpmmgr rely on, a later patch series
+will replace TPM2_Bind/TPM2_Unbind with TPM2_Seal/TPM2_Unseal to provide
+as much security as it did for TPM 1.2.
+
+DESIGN OVERVIEW
+------------------------------
+
+The architecture of the vTPM subsystem on TPM 2.0 is described below:
+
++------------------+
+| Linux DomU | ...
+| | ^ |
+| v | |
+| xen-tpmfront |
++------------------+
+ | ^
+ v |
++------------------+
+| mini-os/tpmback |
+| | ^ |
+| v | |
+| vtpm-stubdom | ...
+| | ^ |
+| v | |
+| mini-os/tpmfront |
++------------------+
+ | ^
+ v |
++------------------+
+| mini-os/tpmback |
+| | ^ |
+| v | |
+| vtpmmgr-stubdom |
+| | ^ |
+| v | |
+| mini-os/tpm2_tis |
++------------------+
+ | ^
+ v |
++------------------+
+| Hardware TPM 2.0 |
++------------------+
+
+ * Linux DomU: The Linux based guest that wants to use a vTPM. There may be
+ more than one of these.
+
+ * xen-tpmfront.ko: Linux kernel virtual TPM frontend driver. This driver
+ provides vTPM access to a para-virtualized Linux based DomU.
+
+ * mini-os/tpmback: Mini-os TPM backend driver. The Linux frontend driver
+ connects to this backend driver to facilitate
+ communications between the Linux DomU and its vTPM. This
+ driver is also used by vtpmmgr-stubdom to communicate with
+ vtpm-stubdom.
+
+ * vtpm-stubdom: A mini-os stub domain that implements a vTPM. There is a
+ one to one mapping between running vtpm-stubdom instances and
+ logical vtpms on the system. The vTPM Platform Configuration
+ Registers (PCRs) are all initialized to zero.
+
+ * mini-os/tpmfront: Mini-os TPM frontend driver. The vTPM mini-os domain
+ vtpm-stubdom uses this driver to communicate with
+ vtpmmgr-stubdom. This driver could also be used separately to
+ implement a mini-os domain that wishes to use a vTPM of
+ its own.
+
+ * vtpmmgr-stubdom: A mini-os domain that implements the vTPM manager.
+ There is only one vTPM manager and it should be running during
+ the entire lifetime of the machine. This domain regulates
+ access to the physical TPM on the system and secures the
+ persistent state of each vTPM.
+
+ * mini-os/tpm2_tis: Mini-os TPM version 2.0 TPM Interface Specification (TIS)
+ driver. This driver is used by vtpmmgr-stubdom to talk directly
+ to the hardware TPM 2.0. Communication is facilitated by mapping
+ hardware memory pages into vtpmmgr-stubdom.
+
+ * Hardware TPM 2.0: The physical TPM 2.0 that is soldered onto the motherboard.
+
+---------------------
+Note:
+ the functionality exposed to a virtual guest operating system (a DomU)
+ is still TPM 1.2.
diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 1d877f9..a565c1b 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -140,6 +140,13 @@ mode during S3 resume.
Permit Xen to use superpages when performing memory management.
+### altp2m (Intel)
+> `= <boolean>`
+
+> Default: `false`
+
+Permit multiple copies of host p2m.
+
### apic
> `= bigsmp | default`
@@ -242,7 +249,7 @@ the NMI watchdog is also enabled.
> Default: `0` (1/32 of RAM)
-Amount of RAM to set aside for the Xenheap.
+Amount of RAM to set aside for the Xenheap. Must be an integer multiple of 32.
By default will use 1/32 of the RAM up to a maximum of 1GB and with a
minimum of 32M, subject to a suitably aligned and sized contiguous
@@ -253,6 +260,14 @@ region of memory being available.
If set, override Xen's default choice for the platform timer.
+### cmci-threshold
+> `= <integer>`
+
+> Default: `2`
+
+Specify the event count threshold for raising Corrected Machine Check
+Interrupts. Specifying zero disables CMCI handling.
+
### cmos-rtc-probe
> `= <boolean>`
@@ -543,6 +558,18 @@ any dom0 autoballooning feature present in your toolstack. See the
_xl.conf(5)_ man page or [Xen Best
Practices](http://wiki.xen.org/wiki/Xen_Best_Practices#Xen_dom0_dedicated_memory_and_preventing_dom0_memory_ballooning).
+### dom0\_nodes
+
+> `= List of [ <integer> | relaxed | strict ]`
+
+> Default: `strict`
+
+Specify the NUMA nodes to place Dom0 on. Defaults for vCPU-s created
+and memory assigned to Dom0 will be adjusted to match the node
+restrictions set up here. Note that the values to be specified here are
+ACPI PXM ones, not Xen internal node numbers. `relaxed` sets up vCPU
+affinities to prefer, but not be limited to, the specified node(s).
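+
+For example, `dom0_nodes=0,2` confines Dom0's vCPUs and memory to ACPI
+PXM nodes 0 and 2, while `dom0_nodes=0,relaxed` only expresses a
+preference for node 0 (the node numbers here are illustrative).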
+
### dom0\_shadow
> `= <boolean>`
@@ -597,12 +624,24 @@ Either force retrieval of monitor EDID information via VESA DDC, or
disable it (edid=no). This option should not normally be required
except for debugging purposes.
-### efi-rs
-> `= <boolean>`
+### efi
+> `= List of [ rs | attr ]`
+
+All options are of boolean kind and can be prefixed with `no-` to
+effect the inverse meaning.
+
+> `rs`
> Default: `true`
-Force or disable use of EFI runtime services.
+>> Force or disable use of EFI runtime services.
+
+> `attr=uc`
+
+> Default: `off`
+
+>> Allows RuntimeServices regions which have no cachability attribute
+>> set to be mapped as UC.
### extra\_guest\_irqs
> `= [<domU number>][,<dom0 number>]`
@@ -618,11 +657,31 @@ hardware domain is architecture dependent.
Note that specifying zero as domU value means zero, while for dom0 it means
to use the default.
-### flask\_enabled
-> `= <integer>`
-
-### flask\_enforcing
-> `= <integer>`
+### flask
+> `= permissive | enforcing | late | disabled`
+
+> Default: `permissive`
+
+Specify how the FLASK security server should be configured. This option is only
+available if the hypervisor was compiled with XSM support (which can be enabled
+by setting XSM\_ENABLE = y in .config).
+
+* `permissive`: This is intended for development and is not suitable for use
+ with untrusted guests. If a policy is provided by the bootloader, it will be
+ loaded; errors will be reported to the ring buffer but will not prevent
+ booting. The policy can be changed to enforcing mode using "xl setenforce".
+* `enforcing`: This requires a security policy to be provided by the bootloader
+ and will enter enforcing mode prior to the creation of domain 0. If a valid
+ policy is not provided, the hypervisor will not continue booting.
+* `late`: This disables loading of the security policy from the bootloader.
+ FLASK will be enabled but will not enforce access controls until a policy is
+ loaded by a domain using "xl loadpolicy". Once a policy is loaded, FLASK will
+ run in enforcing mode unless "xl setenforce" has changed that setting.
+* `disabled`: This causes the XSM framework to revert to the dummy module. The
+ dummy module provides the same security policy as is used when compiling the
+ hypervisor without support for XSM. The xsm\_op hypercall can also be used to
+ switch to this mode after boot, but there is no way to re-enable FLASK once
+ the dummy module is loaded.
### font
> `= <height>` where height is `8x8 | 8x14 | 8x16`
@@ -645,6 +704,30 @@ requirement can be relaxed. This option is particularly useful for nested
virtualization, to allow the L1 hypervisor to use EPT even if the L0 hypervisor
does not provide VM\_ENTRY\_LOAD\_GUEST\_PAT.
+### ept (Intel)
+> `= List of ( {no-}pml | {no-}ad )`
+
+Controls EPT related features.
+
+> Sub-options:
+
+> `pml`
+
+> Default: `false`
+
+>> PML is a new hardware feature on Intel's Broadwell Server and later
+>> platforms which reduces the hypervisor overhead of the log-dirty mechanism
+>> by automatically recording GPAs (guest physical addresses) when guest
+>> memory gets dirty. This significantly reduces the number of EPT violations
+>> caused by the write protection of guest memory, which was previously
+>> necessary to implement the log-dirty mechanism.
+
+> `ad`
+
+> Default: Hardware dependent
+
+>> Have hardware keep accessed/dirty (A/D) bits updated.
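+
+For example, `ept=pml,no-ad` would enable PML while keeping A/D bits
+disabled (an illustrative combination).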
+
### gdb
> `= <baud>[/<clock_hz>][,DPS[,<io-base>[,<irq>[,<port-bdf>[,<bridge-bdf>]]]] | pci | amt ] `
@@ -781,7 +864,7 @@ debug hypervisor only).
> Default: `new` unless directed-EOI is supported
### iommu
-> `= List of [ <boolean> | force | required | intremap | qinval | snoop | sharept | dom0-passthrough | dom0-strict | amd-iommu-perdev-intremap | workaround_bios_bug | verbose | debug ]`
+> `= List of [ <boolean> | force | required | intremap | qinval | snoop | sharept | dom0-passthrough | dom0-strict | amd-iommu-perdev-intremap | workaround_bios_bug | igfx | verbose | debug ]`
> Sub-options:
@@ -855,6 +938,15 @@ debug hypervisor only).
>> ignored (normally IOMMU setup fails if any of the devices listed by a DRHD
>> entry aren't PCI discoverable).
+> `igfx` (VT-d)
+
+> Default: `true`
+
+>> Enable IOMMU for Intel graphics devices. The intended usage of this option
+>> is `no-igfx`, which is similar to the Linux `intel_iommu=igfx_off` option
+>> used to work around graphics issues. If adding `no-igfx` fixes anything,
+>> you should file a bug reporting the problem.
+
> `verbose`
> Default: `false`
@@ -1082,9 +1174,9 @@ This option can be specified more than once (up to 8 times at present).
> `= <integer>`
### psr (Intel)
-> `= List of ( cmt:<boolean> | rmid_max:<integer> )`
+> `= List of ( cmt:<boolean> | rmid_max:<integer> | cat:<boolean> | cos_max:<integer> )`
-> Default: `psr=cmt:0,rmid_max:255`
+> Default: `psr=cmt:0,rmid_max:255,cat:0,cos_max:255`
Platform Shared Resource(PSR) Services. Intel Haswell and later server
platforms offer information about the sharing of resources.
@@ -1094,15 +1186,29 @@ Monitoring ID(RMID) is used to bind the domain to corresponding shared
resource. RMID is a hardware-provided layer of abstraction between software
and logical processors.
+To use the PSR cache allocation service for a certain domain, a capacity
+bitmask (CBM) is used to bind the domain to the corresponding shared resource.
+CBM represents cache capacity and indicates the degree of overlap and isolation
+between domains. In the hypervisor a Class of Service (COS) ID is allocated
+for each unique CBM.
+
The following resources are available:
* Cache Monitoring Technology (Haswell and later). Information regarding the
L3 cache occupancy.
* `cmt` instructs Xen to enable/disable Cache Monitoring Technology.
* `rmid_max` indicates the max value for rmid.
+* Memory Bandwidth Monitoring (Broadwell and later). Information regarding the
+ total/local memory bandwidth. It uses the same options as Cache Monitoring
+ Technology.
+
+* Cache Allocation Technology (Broadwell and later). Information regarding
+ the cache allocation.
+ * `cat` instructs Xen to enable/disable Cache Allocation Technology.
+ * `cos_max` indicates the max value for COS ID.
### reboot
-> `= t[riple] | k[bd] | a[cpi] | p[ci] | e[fi] | n[o] [, [w]arm | [c]old]`
+> `= t[riple] | k[bd] | a[cpi] | p[ci] | P[ower] | e[fi] | n[o] [, [w]arm | [c]old]`
> Default: `0`
@@ -1122,11 +1228,21 @@ Specify the host reboot method.
`pci` instructs Xen to reboot the host using PCI reset register (port CF9).
+`Power` instructs Xen to power-cycle the host using PCI reset register (port CF9).
+
'efi' instructs Xen to reboot using the EFI reboot call (in EFI mode by
default it will use that method first).
+### ro-hpet
+> `= <boolean>`
+
+> Default: `true`
+
+Map the HPET page as read only in Dom0. If disabled the page will be mapped
+with read and write permissions.
+
### sched
-> `= credit | credit2 | sedf | arinc653`
+> `= credit | credit2 | arinc653`
> Default: `sched=credit`
diff --git a/tools/xenmon/README b/docs/misc/xenmon.txt
similarity index 100%
rename from tools/xenmon/README
rename to docs/misc/xenmon.txt
diff --git a/docs/misc/xl-psr.markdown b/docs/misc/xl-psr.markdown
new file mode 100644
index 0000000..c32e25c
--- /dev/null
+++ b/docs/misc/xl-psr.markdown
@@ -0,0 +1,133 @@
+# Intel Platform Shared Resource Monitoring/Control in xl
+
+This document introduces Intel Platform Shared Resource Monitoring/Control
+technologies, their basic concepts and the xl interfaces.
+
+## Cache Monitoring Technology (CMT)
+
+Cache Monitoring Technology (CMT) is a new feature available on Intel Haswell
+and later server platforms that allows an OS or Hypervisor/VMM to determine
+the usage of cache (currently only the L3 cache is supported) by applications
+running on the platform. A Resource Monitoring ID (RMID) is the abstraction of
+the application(s) that will be monitored for their cache usage. The CMT hardware
+tracks cache utilization of memory accesses according to the RMID and reports
+monitored data via a counter register.
+
+For more detailed information please refer to Intel SDM chapter
+"Platform Shared Resource Monitoring: Cache Monitoring Technology".
+
+In Xen's implementation, each domain in the system can be assigned a RMID
+independently, while RMID=0 is reserved for monitoring domains that don't
+have CMT service attached. The RMID is opaque to xl/libxl and is only used
+inside the hypervisor.
+
+### xl interfaces
+
+A domain is assigned a RMID implicitly by attaching it to CMT service:
+
+`xl psr-cmt-attach <domid>`
+
+After that, cache usage for the domain can be shown by:
+
+`xl psr-cmt-show cache-occupancy <domid>`
+
+Once monitoring is not needed any more, the domain can be detached from the
+CMT service by:
+
+`xl psr-cmt-detach <domid>`
+
+An attach may fail because no free RMID is available. In such a case, unused
+RMID(s) can be freed by detaching the corresponding domains from the CMT
+service.
+
+Maximum RMID and supported monitor types in the system can be obtained by:
+
+`xl psr-hwinfo --cmt`
+
+## Memory Bandwidth Monitoring (MBM)
+
+Memory Bandwidth Monitoring (MBM) is a new hardware feature available on Intel
+Broadwell and later server platforms which builds on the CMT infrastructure to
+allow monitoring of system memory bandwidth. It introduces two new monitoring
+event types to monitor system total/local memory bandwidth. The same RMID can
+be used to monitor both cache usage and memory bandwidth at the same time.
+
+For more detailed information please refer to Intel SDM chapter
+"Overview of Cache Monitoring Technology and Memory Bandwidth Monitoring".
+
+In Xen's implementation, MBM shares the same underlying monitoring service
+with CMT and can be used to monitor memory bandwidth on a per-domain
+basis.
+
+The xl interfaces are the same as those of CMT. The difference is that the
+monitor type is the corresponding memory monitoring type (local-mem-bandwidth/
+total-mem-bandwidth instead of cache-occupancy). E.g. after a `xl psr-cmt-attach`:
+
+`xl psr-cmt-show local-mem-bandwidth <domid>`
+
+`xl psr-cmt-show total-mem-bandwidth <domid>`
+
+## Cache Allocation Technology (CAT)
+
+Cache Allocation Technology (CAT) is a new feature available on Intel
+Broadwell and later server platforms that allows an OS or Hypervisor/VMM to
+partition cache allocation (i.e. L3 cache) based on application priority or
+Class of Service (COS). Each COS is configured using capacity bitmasks (CBM)
+which represent cache capacity and indicate the degree of overlap and
+isolation between classes. The system cache resource is divided into a
+number of minimum portions, which are then grouped into subsets for cache
+partitioning. Each portion corresponds to a bit in the CBM, and a set bit
+indicates that the corresponding cache portion is available.
+
+For example, assuming a system with 8 portions and 3 domains:
+
+ * A CBM of 0xff for every domain means each domain can access the whole cache.
+ This is the default.
+
+ * Giving one domain a CBM of 0x0f and the other two domains 0xf0 means that
+ the first domain gets exclusive access to half of the cache (half of the
+ portions) and the other two will share the other half.
+
+ * Giving one domain a CBM of 0x0f, one 0x30 and the last 0xc0 would give the
+ first domain exclusive access to half the cache, and the other two exclusive
+ access to one quarter each.
+
+For more detailed information please refer to Intel SDM chapter
+"Platform Shared Resource Control: Cache Allocation Technology".
+
+In Xen's implementation, the CBM can be configured with the libxl/xl
+interfaces, but COS is maintained in the hypervisor only. The cache partition
+granularity is per domain; each domain has COS=0 assigned by default, whose
+corresponding CBM is all-ones, meaning all the cache resource can be used
+by default.
+
+### xl interfaces
+
+System CAT information such as maximum COS and CBM length can be obtained by:
+
+`xl psr-hwinfo --cat`
+
+The simplest way to change a domain's CBM from its default is running:
+
+`xl psr-cat-cbm-set [OPTIONS] <domid> <cbm>`
+
+where cbm is a number representing the cache subset that can be used.
+A cbm is valid only when:
+
+ * Set bits only exist in the range of [0, cbm_len), where cbm_len can be
+ obtained with `xl psr-hwinfo --cat`.
+ * All the set bits are contiguous.
+
+In a multi-socket system, the same cbm will be set on each socket by default.
+Per-socket cbm can be specified with the `--socket SOCKET` option.
+
+Setting the CBM may not be successful if insufficient COS are available. In
+such a case, unused COS(es) may be freed by setting the CBM of all related
+domains to its default value (all-ones).
+
+Per domain CBM settings can be shown by:
+
+`xl psr-cat-show`
+
+## Reference
+
+[1] Intel SDM
+(http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html).
diff --git a/docs/misc/xsm-flask.txt b/docs/misc/xsm-flask.txt
index 9559028..7249f40 100644
--- a/docs/misc/xsm-flask.txt
+++ b/docs/misc/xsm-flask.txt
@@ -87,7 +87,7 @@ __HYPERVISOR_domctl (xen/include/public/domctl.h)
* XEN_DOMCTL_set_machine_address_size
* XEN_DOMCTL_debug_op
* XEN_DOMCTL_gethvmcontext_partial
- * XEN_DOMCTL_mem_event_op
+ * XEN_DOMCTL_vm_event_op
* XEN_DOMCTL_mem_sharing_op
* XEN_DOMCTL_setvcpuextstate
* XEN_DOMCTL_getvcpuextstate
@@ -116,7 +116,7 @@ __HYPERVISOR_sysctl (xen/include/public/sysctl.h)
* XEN_SYSCTL_pm_op
* XEN_SYSCTL_page_offline_op
* XEN_SYSCTL_lockprof_op
- * XEN_SYSCTL_topologyinfo
+ * XEN_SYSCTL_cputopoinfo
* XEN_SYSCTL_numainfo
* XEN_SYSCTL_cpupool_op
* XEN_SYSCTL_scheduler_op
@@ -213,9 +213,9 @@ that can be used without dom0 disaggregation. The main types for domUs are:
- nomigrate_t is a domain that must be created via the nomigrate_t_building
type, and whose memory cannot be read by dom0 once created
-HVM domains with stubdomain device models use two types (one per domain):
- - domHVM_t is an HVM domain that uses a stubdomain device model
- - dm_dom_t is the device model for a domain with type domHVM_t
+HVM domains with stubdomain device models also need a type for the stub domain.
+The example policy defines dm_dom_t for the device model of a domU_t domain;
+there are no device model types defined for the other domU types.
One disadvantage of using type enforcement to enforce isolation is that a new
type is needed for each group of domains. The user field can be used to address
@@ -335,33 +335,8 @@ memory, or even changing certain BIOS settings). Dynamic labeling requires that
the domain performing the labeling be trusted to label all the devices in the
system properly.
-To enable static device labeling, a checkpolicy >= 2.0.20 and libsepol >=2.0.39
-are required. The policy Makefile (tools/flask/policy/Makefile) must also be
-changed as follows:
-
-########################################
-#
-# Build a binary policy locally
-#
-$(POLVER): policy.conf
- @echo "Compiling $(NAME) $(POLVER)"
- $(QUIET) $(CHECKPOLICY) $^ -o $@ (Comment out this line)
-# Uncomment line below to enable policies for devices
-# $(QUIET) $(CHECKPOLICY) -t Xen $^ -o $@ (Uncomment this line)
-
-########################################
-#
-# Install a binary policy
-#
-$(LOADPATH): policy.conf
- @echo "Compiling and installing $(NAME) $(LOADPATH)"
- $(QUIET) $(CHECKPOLICY) $^ -o $@ (Comment out this line)
-# Uncomment line below to enable policies for devices
-# $(QUIET) $(CHECKPOLICY) -t Xen $^ -o $@ (Uncomment this line)
-
-
-IRQs, PCI devices, I/O memory and ports can all be labeled. There are
-commented out lines in xen.te policy for examples on how to label devices.
+IRQs, PCI devices, I/O memory and x86 IO ports can all have labels defined.
+There are examples commented out in tools/flask/policy/policy/device_contexts.
Device Labeling
---------------
@@ -378,7 +353,7 @@ lspci output is..
Region 2: I/O ports at ecc0 [size=32]
Kernel modules: e1000e
-The labeling can be done with these commands
+The labeling can be done with these lines in device_contexts:
pirqcon 33 system_u:object_r:nicP_t
iomemcon 0xfebe0-0xfebff system_u:object_r:nicP_t
@@ -396,32 +371,3 @@ the ranges being denied to more easily determine what resources are required.
When running in permissive mode, only the first denial of a given
source/destination is printed to the log, so labeling devices using this method
may require multiple passes to find all required ranges.
-
-Additional notes on XSM:FLASK
------------------------------
-
-1) xen command line parameters
-
- a) flask_enforcing
-
- The default value for flask_enforcing is '0'. This parameter causes the
- platform to boot in permissive mode which means that the policy is loaded
- but not enforced. This mode is often helpful for developing new systems
- and policies as the policy violations are reported on the xen console and
- may be viewed in dom0 through 'xl dmesg'.
-
- To boot the platform into enforcing mode, which means that the policy is
- loaded and enforced, append 'flask_enforcing=1' on the grub line.
-
- This parameter may also be changed through the flask hypercall.
-
- b) flask_enabled
-
- The default value for flask_enabled is '1'. This parameter causes the
- platform to enable the FLASK security module under the XSM framework.
- The parameter may be enabled/disabled only once per boot. If the parameter
- is set to '0', only a reboot can re-enable flask. When flask_enabled is '0'
- the DUMMY module is enforced.
-
- This parameter may also be changed through the flask hypercall. But may
- only be performed once per boot.
diff --git a/docs/specs/libxc-migration-stream.pandoc b/docs/specs/libxc-migration-stream.pandoc
new file mode 100644
index 0000000..8cd678f
--- /dev/null
+++ b/docs/specs/libxc-migration-stream.pandoc
@@ -0,0 +1,696 @@
+% LibXenCtrl Domain Image Format
+% David Vrabel <<david.vrabel at citrix.com>>
+ Andrew Cooper <<andrew.cooper3 at citrix.com>>
+% Revision 1
+
+Introduction
+============
+
+Purpose
+-------
+
+The _domain save image_ is the context of a running domain used for
+snapshots of a domain or for transferring domains between hosts during
+migration.
+
+There are a number of problems with the format of the domain save
+image used in Xen 4.4 and earlier (the _legacy format_).
+
+* Dependent on toolstack word size. A number of fields within the
+ image are native types such as `unsigned long` which have different
+ sizes between 32-bit and 64-bit toolstacks. This prevents domains
+ from being migrated between hosts running 32-bit and 64-bit
+ toolstacks.
+
+* There is no header identifying the image.
+
+* The image has no version information.
+
+A new format that addresses the above is required.
+
+ARM does not yet have a domain save image format specified, and
+the format described in this specification should be suitable.
+
+Not Yet Included
+----------------
+
+The following features are not yet fully specified and will be
+included in a future draft.
+
+* Page data compression.
+
+* ARM
+
+
+Overview
+========
+
+The image format consists of two main sections:
+
+* _Headers_
+* _Records_
+
+Headers
+-------
+
+There are two headers: the _image header_, and the _domain header_.
+The image header describes the format of the image (version etc.).
+The _domain header_ contains general information about the domain
+(architecture, type etc.).
+
+Records
+-------
+
+The main part of the format is a sequence of different _records_.
+Each record type contains information about the domain context. At a
+minimum there is an END record marking the end of the records section.
+
+
+Fields
+------
+
+All the fields within the headers and records have a fixed width.
+
+Fields are always aligned to their size.
+
+Padding and reserved fields are set to zero on save and must be
+ignored during restore.
+
+Integer (numeric) fields in the image header are always in big-endian
+byte order.
+
+Integer fields in the domain header and in the records are in the
+endianness described in the image header (which will typically be the
+native ordering).
+
+\clearpage
+
+Headers
+=======
+
+Image Header
+------------
+
+The image header identifies an image as a Xen domain save image. It
+includes the version of this specification that the image complies
+with.
+
+Tools supporting version _V_ of the specification shall always save
+images using version _V_. Tools shall support restoring from version
+_V_. If the previous Xen release produced version _V_ - 1 images,
+tools shall supported restoring from these. Tools may additionally
+support restoring from earlier versions.
+
+The marker field can be used to distinguish between legacy images and
+those corresponding to this specification. Legacy images will have
+one or more zero bits within the first 8 octets of the image.
+
+Fields within the image header are always in _big-endian_ byte order,
+regardless of the setting of the endianness bit.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-------------------------------------------------+
+ | marker |
+ +-----------------------+-------------------------+
+ | id | version |
+ +-----------+-----------+-------------------------+
+ | options | (reserved) |
+ +-----------+-------------------------------------+
+
+
+--------------------------------------------------------------------
+Field Description
+----------- --------------------------------------------------------
+marker 0xFFFFFFFFFFFFFFFF.
+
+id 0x58454E46 ("XENF" in ASCII).
+
+version 0x00000002. The version of this specification.
+
+options bit 0: Endianness. 0 = little-endian, 1 = big-endian.
+
+ bits 1-15: Reserved.
+--------------------------------------------------------------------
+
+The endianness shall be 0 (little-endian) for images generated on an
+i386, x86_64, or arm host.
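+
+As an informal sketch (the struct and field names here are illustrative,
+not taken from the Xen sources), the image header maps onto the following
+naturally aligned C layout; every multi-octet field must be converted
+from big-endian byte order when read on a little-endian host:
+
+    #include <stdint.h>
+
+    /* 24 octets on the wire; all fields big-endian. */
+    struct image_header {
+        uint64_t marker;     /* 0xFFFFFFFFFFFFFFFF */
+        uint32_t id;         /* 0x58454E46 ("XENF") */
+        uint32_t version;    /* 0x00000002 */
+        uint16_t options;    /* bit 0: endianness of later integers */
+        uint16_t reserved0;  /* must be zero on save */
+        uint32_t reserved1;  /* must be zero on save */
+    };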
+
+\clearpage
+
+Domain Header
+-------------
+
+The domain header includes general properties of the domain.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-----------------------+-----------+-------------+
+ | type | page_shift| (reserved) |
+ +-----------------------+-----------+-------------+
+ | xen_major | xen_minor |
+ +-----------------------+-------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- --------------------------------------------------------
+type 0x0000: Reserved.
+
+ 0x0001: x86 PV.
+
+ 0x0002: x86 HVM.
+
+ 0x0003: x86 PVH.
+
+ 0x0004: ARM.
+
+ 0x0005 - 0xFFFFFFFF: Reserved.
+
+page_shift Size of a guest page as a power of two.
+
+ i.e., page size = 2^page_shift^.
+
+xen_major The Xen major version when this image was saved.
+
+xen_minor The Xen minor version when this image was saved.
+--------------------------------------------------------------------
+
+The legacy stream conversion tool writes a `xen_major` version of 0, and sets
+`xen_minor` to its own version.
+
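+As a hedged sketch (the names are ours, not the Xen sources'), the domain
+header corresponds to this 16-octet C layout, with integer fields in the
+endianness declared by the image header:
+
+    #include <stdint.h>
+
+    struct domain_header {
+        uint32_t type;        /* 0x00000001 = x86 PV, 0x00000002 = x86 HVM, ... */
+        uint16_t page_shift;  /* guest page size = 1 << page_shift */
+        uint16_t reserved;    /* must be zero on save */
+        uint32_t xen_major;   /* 0 if written by the legacy conversion tool */
+        uint32_t xen_minor;
+    };
+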
+\clearpage
+
+Records
+=======
+
+A record has a record header, type-specific data and a trailing
+footer. If `body_length` is not a multiple of 8, the body is padded
+with zeroes to align the end of the record on an 8 octet boundary.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-----------------------+-------------------------+
+ | type | body_length |
+ +-----------+-----------+-------------------------+
+ | body... |
+ ...
+ | | padding (0 to 7 octets) |
+ +-----------+-------------------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- -------------------------------------------------------
+type 0x00000000: END
+
+ 0x00000001: PAGE_DATA
+
+ 0x00000002: X86_PV_INFO
+
+ 0x00000003: X86_PV_P2M_FRAMES
+
+ 0x00000004: X86_PV_VCPU_BASIC
+
+ 0x00000005: X86_PV_VCPU_EXTENDED
+
+ 0x00000006: X86_PV_VCPU_XSAVE
+
+ 0x00000007: SHARED_INFO
+
+ 0x00000008: TSC_INFO
+
+ 0x00000009: HVM_CONTEXT
+
+ 0x0000000A: HVM_PARAMS
+
+ 0x0000000B: TOOLSTACK (deprecated)
+
+ 0x0000000C: X86_PV_VCPU_MSRS
+
+ 0x0000000D: VERIFY
+
+ 0x0000000E: CHECKPOINT
+
+ 0x0000000F - 0x7FFFFFFF: Reserved for future _mandatory_
+ records.
+
+ 0x80000000 - 0xFFFFFFFF: Reserved for future _optional_
+ records.
+
+body_length Length in octets of the record body.
+
+body Content of the record.
+
+padding 0 to 7 octets of zeros to pad the whole record to a multiple
+ of 8 octets.
+--------------------------------------------------------------------
+
+Records may be _mandatory_ or _optional_. Optional records have bit
+31 set in their type. Restoring an image that has an unrecognised or
+unsupported mandatory record must fail. The contents of optional
+records may be ignored during a restore.
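+
+A minimal sketch of the record framing (the names and helpers below are
+illustrative, not from the Xen sources):
+
+    #include <stdint.h>
+
+    struct record_header {
+        uint32_t type;         /* bit 31 set => optional record */
+        uint32_t body_length;  /* octets of body, excluding padding */
+    };
+
+    /* Octets of zero padding after the body (0 to 7), so that the
+     * whole record is a multiple of 8 octets long. */
+    static inline uint32_t record_padding(uint32_t body_length)
+    {
+        return (8 - (body_length & 7)) & 7;
+    }
+
+    static inline int record_is_optional(uint32_t type)
+    {
+        return (type & UINT32_C(0x80000000)) != 0;
+    }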
+
+The following sub-sections specify the record body format for each of
+the record types.
+
+\clearpage
+
+END
+----
+
+An end record marks the end of the image, and shall be the final record
+in the stream.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-------------------------------------------------+
+
+The end record contains no fields; its body_length is 0.
+
+\clearpage
+
+PAGE_DATA
+---------
+
+The bulk of an image consists of many PAGE_DATA records containing the
+memory contents.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-----------------------+-------------------------+
+ | count (C) | (reserved) |
+ +-----------------------+-------------------------+
+ | pfn[0] |
+ +-------------------------------------------------+
+ ...
+ +-------------------------------------------------+
+ | pfn[C-1] |
+ +-------------------------------------------------+
+ | page_data[0]... |
+ ...
+ +-------------------------------------------------+
+ | page_data[N-1]... |
+ ...
+ +-------------------------------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- --------------------------------------------------------
+count Number of pages described in this record.
+
+pfn An array of count PFNs and their types.
+
+ Bit 63-60: XEN\_DOMCTL\_PFINFO\_* type (from
+ `public/domctl.h` but shifted by 32 bits)
+
+ Bit 59-52: Reserved.
+
+ Bit 51-0: PFN.
+
+page\_data page\_size octets of uncompressed page contents for each
+ page set as present in the pfn array.
+--------------------------------------------------------------------
+
+Note: count (C) is strictly > 0. N, the number of page_data entries, is
+strictly <= C, and it is possible for there to be no page_data in the
+record at all if every pfn is of a type that carries no page contents.
+
+--------------------------------------------------------------------
+PFINFO type Value Description
+------------- --------- ------------------------------------------
+NOTAB 0x0 Normal page.
+
+L1TAB 0x1 L1 page table page.
+
+L2TAB 0x2 L2 page table page.
+
+L3TAB 0x3 L3 page table page.
+
+L4TAB 0x4 L4 page table page.
+
+ 0x5-0x8 Reserved.
+
+L1TAB_PIN 0x9 L1 page table page (pinned).
+
+L2TAB_PIN 0xA L2 page table page (pinned).
+
+L3TAB_PIN 0xB L3 page table page (pinned).
+
+L4TAB_PIN 0xC L4 page table page (pinned).
+
+BROKEN 0xD Broken page.
+
+XALLOC 0xE Allocate only.
+
+XTAB 0xF Invalid page.
+--------------------------------------------------------------------
+
+Table: XEN\_DOMCTL\_PFINFO\_* Page Types.
+
+PFNs with type `BROKEN`, `XALLOC`, or `XTAB` do not have any
+corresponding `page_data`.
+
+The saver uses the `XTAB` type for PFNs that become invalid in the
+guest's P2M table during a live migration[^2].
+
+Restoring an image with unrecognised page types shall fail.
+
+[^2]: In the legacy format, this is the list of unmapped PFNs in the
+tail.
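+
+A short sketch of decoding one 64-bit entry of the pfn array (the macro
+and function names are ours, not the Xen sources'):
+
+    #include <stdint.h>
+
+    #define PFN_TYPE_SHIFT 60
+    #define PFN_MASK       ((UINT64_C(1) << 52) - 1)
+
+    /* XEN_DOMCTL_PFINFO_* value from bits 63-60. */
+    static inline unsigned int pfn_entry_type(uint64_t entry)
+    {
+        return entry >> PFN_TYPE_SHIFT;
+    }
+
+    /* PFN from bits 51-0; bits 59-52 are reserved and ignored. */
+    static inline uint64_t pfn_entry_pfn(uint64_t entry)
+    {
+        return entry & PFN_MASK;
+    }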
+
+\clearpage
+
+X86_PV_INFO
+-----------
+
+ 0 1 2 3 4 5 6 7 octet
+ +-----+-----+-----------+-------------------------+
+ | w | ptl | (reserved) |
+ +-----+-----+-----------+-------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- ---------------------------------------------------
+guest_width (w) Guest width in octets (either 4 or 8).
+
+pt_levels (ptl) Number of page table levels (either 3 or 4).
+--------------------------------------------------------------------
+
+\clearpage
+
+X86_PV_P2M_FRAMES
+-----------------
+
+ 0 1 2 3 4 5 6 7 octet
+ +-----+-----+-----+-----+-------------------------+
+ | p2m_start_pfn (S) | p2m_end_pfn (E) |
+ +-----+-----+-----+-----+-------------------------+
+ | p2m_pfn[p2m frame containing pfn S] |
+ +-------------------------------------------------+
+ ...
+ +-------------------------------------------------+
+ | p2m_pfn[p2m frame containing pfn E] |
+ +-------------------------------------------------+
+
+--------------------------------------------------------------------
+Field Description
+------------- ---------------------------------------------------
+p2m_start_pfn First pfn index in the p2m_pfn array.
+
+p2m_end_pfn Last pfn index in the p2m_pfn array.
+
+p2m_pfn Array of PFNs containing the guest's P2M table, for
+ the PFN frames containing the PFN range S to E
+ (inclusive).
+
+--------------------------------------------------------------------
+
+\clearpage
+
+X86_PV_VCPU_BASIC, EXTENDED, XSAVE, MSRS
+----------------------------------------
+
+The format of these records is identical. They are all binary blobs
+of data which are accessed using specific pairs of domctl hypercalls.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-----------------------+-------------------------+
+ | vcpu_id | (reserved) |
+ +-----------------------+-------------------------+
+ | context... |
+ ...
+ +-------------------------------------------------+
+
+---------------------------------------------------------------------
+Field Description
+----------- ----------------------------------------------------
+vcpu_id The VCPU ID.
+
+context Binary data for this VCPU.
+---------------------------------------------------------------------
+
+---------------------------------------------------------------------
+Record type Accessor hypercalls
+----------------------- ----------------------------------------
+X86\_PV\_VCPU\_BASIC XEN\_DOMCTL\_{get,set}vcpucontext
+
+X86\_PV\_VCPU\_EXTENDED XEN\_DOMCTL\_{get,set}\_ext\_vcpucontext
+
+X86\_PV\_VCPU\_XSAVE XEN\_DOMCTL\_{get,set}vcpuextstate
+
+X86\_PV\_VCPU\_MSRS XEN\_DOMCTL\_{get,set}\_vcpu\_msrs
+---------------------------------------------------------------------
+
+\clearpage
+
+SHARED_INFO
+-----------
+
+The content of the Shared Info page.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-------------------------------------------------+
+ | shared_info |
+ ...
+ +-------------------------------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- ---------------------------------------------------
+shared_info Contents of the shared info page. This record
+ should be exactly 1 page long.
+--------------------------------------------------------------------
+
+\clearpage
+
+TSC_INFO
+--------
+
+Domain TSC information, as accessed by the
+XEN\_DOMCTL\_{get,set}tscinfo hypercall sub-ops.
+
+ 0 1 2 3 4 5 6 7 octet
+ +------------------------+------------------------+
+ | mode | khz |
+ +------------------------+------------------------+
+ | nsec |
+ +------------------------+------------------------+
+ | incarnation | (reserved) |
+ +------------------------+------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- ---------------------------------------------------
+mode TSC mode, TSC\_MODE\_* constant.
+
+khz TSC frequency, in kHz.
+
+nsec Elapsed time, in nanoseconds.
+
+incarnation Incarnation.
+--------------------------------------------------------------------
+
+\clearpage
+
+HVM_CONTEXT
+-----------
+
+HVM Domain context, as accessed by the
+XEN\_DOMCTL\_{get,set}hvmcontext hypercall sub-ops.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-------------------------------------------------+
+ | hvm_ctx |
+ ...
+ +-------------------------------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- ---------------------------------------------------
+hvm_ctx The HVM Context blob from Xen.
+--------------------------------------------------------------------
+
+\clearpage
+
+HVM_PARAMS
+----------
+
+HVM Domain parameters, as accessed by the
+HVMOP\_{get,set}\_param hypercall sub-ops.
+
+ 0 1 2 3 4 5 6 7 octet
+ +------------------------+------------------------+
+ | count (C) | (reserved) |
+ +------------------------+------------------------+
+ | param[0].index |
+ +-------------------------------------------------+
+ | param[0].value |
+ +-------------------------------------------------+
+ ...
+ +-------------------------------------------------+
+ | param[C-1].index |
+ +-------------------------------------------------+
+ | param[C-1].value |
+ +-------------------------------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- ---------------------------------------------------
+count The number of parameters contained in this record.
+ Each parameter in the record contains an index and
+ value.
+
+param index Parameter index.
+
+param value Parameter value.
+--------------------------------------------------------------------
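+
+A sketch of one parameter entry as it appears in the record body after
+the count/reserved header (the struct name is ours):
+
+    #include <stdint.h>
+
+    /* The body holds `count` of these, packed back to back. */
+    struct hvm_param_entry {
+        uint64_t index;   /* HVM_PARAM_* parameter index */
+        uint64_t value;   /* parameter value */
+    };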
+
+\clearpage
+
+TOOLSTACK (deprecated)
+----------------------
+
+> *This record was only present for transitional purposes during
+> development. It should not be used.*
+
+An opaque blob provided by and supplied to the higher layers of the
+toolstack (e.g., libxl) during save and restore.
+
+ 0 1 2 3 4 5 6 7 octet
+ +------------------------+------------------------+
+ | data |
+ ...
+ +-------------------------------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- ---------------------------------------------------
+data Blob of toolstack-specific data.
+--------------------------------------------------------------------
+
+\clearpage
+
+VERIFY
+------
+
+A verify record indicates that, while all memory has now been sent, the sender
+shall send further memory records for debugging purposes.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-------------------------------------------------+
+
+The verify record contains no fields; its body_length is 0.
+
+\clearpage
+
+CHECKPOINT
+----------
+
+A checkpoint record indicates that all the preceding records in the stream
+represent a consistent view of VM state.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-------------------------------------------------+
+
+The checkpoint record contains no fields; its body_length is 0.
+
+If the stream is embedded in a higher level toolstack stream, the
+CHECKPOINT record marks the end of the libxc portion of the stream
+and the stream is handed back to the higher level for further
+processing.
+
+The higher level stream may then hand the stream back to libxc to
+process another set of records for the next consistent VM state
+snapshot. This next set of records may be terminated by another
+CHECKPOINT record or an END record.
+
+\clearpage
+
+Layout
+======
+
+The set of valid records depends on the guest architecture and type. No
+assumptions should be made about the ordering or interleaving of
+independent records. Record dependencies are noted below.
+
+x86 PV Guest
+------------
+
+A typical save image for an x86 PV guest would look like:
+
+1. Image header
+2. Domain header
+3. X86\_PV\_INFO record
+4. X86\_PV\_P2M\_FRAMES record
+5. Many PAGE\_DATA records
+6. TSC\_INFO
+7. SHARED\_INFO record
+8. VCPU context records for each online VCPU
+ a. X86\_PV\_VCPU\_BASIC record
+ b. X86\_PV\_VCPU\_EXTENDED record
+ c. X86\_PV\_VCPU\_XSAVE record
+ d. X86\_PV\_VCPU\_MSRS record
+9. END record
+
+There are some strict ordering requirements. The following records must
+be present in the following order as each of them depends on information
+present in the preceding ones.
+
+1. X86\_PV\_INFO record
+2. X86\_PV\_P2M\_FRAMES record
+3. PAGE\_DATA records
+4. VCPU records
+
+x86 HVM Guest
+-------------
+
+A typical save image for an x86 HVM guest would look like:
+
+1. Image header
+2. Domain header
+3. Many PAGE\_DATA records
+4. TSC\_INFO
+5. HVM\_PARAMS
+6. HVM\_CONTEXT
+7. END record
+
+HVM\_PARAMS must precede HVM\_CONTEXT, as certain parameters can affect
+the validity of architectural state in the context.
+
+
+Legacy Images (x86 only)
+========================
+
+Restoring legacy images from older tools shall be handled by
+translating the legacy format image into this new format.
+
+It shall not be possible to save in the legacy format.
+
+There are two different legacy images depending on whether they were
+generated by a 32-bit or a 64-bit toolstack. These shall be
+distinguished by inspecting octets 4-7 in the image. If these are
+zero then it is a 64-bit image.
+
+Toolstack Field Value
+--------- ----- -----
+64-bit Bits 32-63 of the p2m_size field 0 (since p2m_size < 2^32^)
+32-bit extended-info chunk ID (PV) 0xFFFFFFFF
+32-bit Chunk type (HVM) < 0
+32-bit Page count (HVM) > 0
+
+Table: Possible values for octet 4-7 in legacy images
+
+This assumes the presence of the extended-info chunk which was
+introduced in Xen 3.0.
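+
+A hedged sketch of this classification (the names are ours, not the Xen
+sources'): read the first 8 octets, compare against the new-format
+marker, and fall back to the octet 4-7 test for legacy images:
+
+    #include <stdint.h>
+    #include <string.h>
+
+    enum stream_kind { STREAM_V2, STREAM_LEGACY_64BIT, STREAM_LEGACY_32BIT };
+
+    static enum stream_kind classify(const uint8_t first8[8])
+    {
+        static const uint8_t marker[8] =
+            { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+        if (memcmp(first8, marker, sizeof(marker)) == 0)
+            return STREAM_V2;             /* image header, this spec */
+        if ((first8[4] | first8[5] | first8[6] | first8[7]) == 0)
+            return STREAM_LEGACY_64BIT;   /* zero octets 4-7 */
+        return STREAM_LEGACY_32BIT;
+    }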
+
+
+Future Extensions
+=================
+
+All changes to this specification should bump the revision number in
+the title block.
+
+All changes to the image or domain headers require the image version
+to be increased.
+
+The format may be extended by adding additional record types.
+
+Extending an existing record type must be done by adding a new record
+type. This allows old images with the old record to still be
+restored.
+
+The image header may only be extended by _appending_ additional
+fields. In particular, the `marker`, `id` and `version` fields must
+never change size or location.
diff --git a/docs/specs/libxl-migration-stream.pandoc b/docs/specs/libxl-migration-stream.pandoc
new file mode 100644
index 0000000..2c97d86
--- /dev/null
+++ b/docs/specs/libxl-migration-stream.pandoc
@@ -0,0 +1,264 @@
+% LibXenLight Domain Image Format
+% Andrew Cooper <<andrew.cooper3 at citrix.com>>
+% Revision 1
+
+Introduction
+============
+
+For the purposes of this document, `xl` is used as a representation of any
+implementer of the `libxl` API. `xl` should be considered completely
+interchangeable with alternatives, such as `libvirt` or `xenopsd-xl`.
+
+Purpose
+-------
+
+The _domain image format_ is the context of a running domain used for
+snapshots of a domain or for transferring domains between hosts during
+migration.
+
+There are a number of problems with the domain image format used in Xen 4.5
+and earlier (the _legacy format_).
+
+* There is no `libxl` context information. `xl` is required to send certain
+ pieces of `libxl` context itself.
+
+* The contents of the stream are passed directly through `libxl` to `libxc`.
+ The legacy `libxc` format contained some information which belonged at the
+ `libxl` level, resulting in an awkward layering violation to return the
+ information back to `libxl`.
+
+* The legacy `libxc` format was inextensible, causing inextensibility in the
+ legacy `libxl` handling.
+
+This design addresses the above points, allowing for a completely
+self-contained, extensible stream with each layer responsible for its own
+appropriate information.
+
+
+Not Yet Included
+----------------
+
+The following features are not yet fully specified and will be
+included in a future draft.
+
+* ARM
+
+
+Overview
+========
+
+The image format consists of a _Header_, followed by 1 or more _Records_.
+Each record consists of a type and length field, followed by any type-specific
+data.
+
+\clearpage
+
+Header
+======
+
+The header identifies the stream as a `libxl` stream, including the version of
+this specification that it complies with.
+
+All fields in this header shall be in _big-endian_ byte order, regardless of
+the setting of the endianness bit.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-------------------------------------------------+
+ | ident |
+ +-----------------------+-------------------------+
+ | version | options |
+ +-----------------------+-------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- --------------------------------------------------------
+ident 0x4c6962786c466d74 ("LibxlFmt" in ASCII).
+
+version 0x00000002. The version of this specification.
+
+options bit 0: Endianness. 0 = little-endian, 1 = big-endian.
+
+ bit 1: Legacy Format. If set, this stream was created by
+ the legacy conversion tool.
+
+ bits 2-31: Reserved.
+--------------------------------------------------------------------
+
+The endianness shall be 0 (little-endian) for images generated on an
+i386, x86_64, or arm host.
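+
+As an illustrative sketch (the names are ours, not from the Xen
+sources), the header corresponds to this 16-octet C layout, with all
+fields big-endian on the wire:
+
+    #include <stdint.h>
+
+    struct libxl_stream_header {
+        uint64_t ident;    /* 0x4c6962786c466d74 ("LibxlFmt") */
+        uint32_t version;  /* 0x00000002 */
+        uint32_t options;  /* bit 0: endianness, bit 1: legacy stream */
+    };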
+
+\clearpage
+
+
+Record Overview
+===============
+
+A record has a record header, type-specific data and a trailing footer.
+If `body_length` is not a multiple of 8, the body is padded with zeroes
+to align the end of the record on an 8 octet boundary.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-----------------------+-------------------------+
+ | type | body_length |
+ +-----------+-----------+-------------------------+
+ | body... |
+ ...
+ | | padding (0 to 7 octets) |
+ +-----------+-------------------------------------+
+
+--------------------------------------------------------------------
+Field Description
+----------- -------------------------------------------------------
+type 0x00000000: END
+
+ 0x00000001: LIBXC_CONTEXT
+
+ 0x00000002: EMULATOR_XENSTORE_DATA
+
+ 0x00000003: EMULATOR_CONTEXT
+
+ 0x00000004: CHECKPOINT_END
+
+ 0x00000005 - 0x7FFFFFFF: Reserved for future _mandatory_
+ records.
+
+ 0x80000000 - 0xFFFFFFFF: Reserved for future _optional_
+ records.
+
+body_length Length in octets of the record body.
+
+body Content of the record.
+
+padding 0 to 7 octets of zeros to pad the whole record to a multiple
+ of 8 octets.
+--------------------------------------------------------------------
+
+\clearpage
+
+Emulator Records
+----------------
+
+Several records are specifically for emulators, and have a common sub-header.
+
+ 0 1 2 3 4 5 6 7 octet
+ +------------------------+------------------------+
+ | emulator_id | index |
+ +------------------------+------------------------+
+ | record specific data |
+ ...
+ +-------------------------------------------------+
+
+--------------------------------------------------------------------
+Field Description
+------------ ---------------------------------------------------
+emulator_id 0x00000000: Unknown (In the case of a legacy stream)
+
+ 0x00000001: Qemu Traditional
+
+ 0x00000002: Qemu Upstream
+
+ 0x00000003 - 0xFFFFFFFF: Reserved for future emulators.
+
+index Index of this emulator for the domain.
+--------------------------------------------------------------------
+
+\clearpage
+
+Records
+=======
+
+END
+----
+
+An end record marks the end of the image, and shall be the final record
+in the stream.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-------------------------------------------------+
+
+The end record contains no fields; its body_length is 0.
+
+LIBXC\_CONTEXT
+--------------
+
+A libxc context record is a marker, indicating that the stream should be
+handed to `xc_domain_restore()`. `libxc` shall be responsible for reading its
+own image format from the stream.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-------------------------------------------------+
+
+The libxc context record contains no fields; its body_length is 0[^1].
+
+
+[^1]: The sending side cannot calculate ahead of time how much data `libxc`
+might write into the stream, especially for live migration where the quantity
+of data is partially proportional to the elapsed time.
+
+EMULATOR\_XENSTORE\_DATA
+------------------------
+
+A set of xenstore key/value pairs for a specific emulator associated with the
+domain.
+
+ 0 1 2 3 4 5 6 7 octet
+ +------------------------+------------------------+
+ | emulator_id | index |
+ +------------------------+------------------------+
+ | xenstore key/value data |
+ ...
+ +-------------------------------------------------+
+
+Xenstore key/value data are encoded as a packed sequence of (key, value)
+tuples. Each (key, value) tuple is a packed pair of NUL-terminated strings,
+conforming to xenstore protocol character encoding (keys strictly as
+alphanumeric ASCII and `-/_@`, values expected to be human-readable ASCII).
+
+Keys shall be relative to the device model's xenstore tree for the new
+domain. At the time of writing, keys are relative to the path
+
+> `/local/domain/$dm_domid/device-model/$domid/`
+
+although this path is free to change moving forward, and thus should not be
+assumed.
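+
+A sketch of walking the packed key/value region that follows the
+emulator sub-header (function and parameter names are ours; the code
+assumes well-formed, NUL-terminated input as described above):
+
+    #include <stddef.h>
+    #include <string.h>
+
+    static int for_each_xenstore_pair(const char *buf, size_t len,
+                                      void (*fn)(const char *key,
+                                                 const char *value))
+    {
+        const char *end = buf + len;
+
+        while (buf < end) {
+            const char *key = buf;
+            const char *value = key + strlen(key) + 1;  /* skip key NUL */
+
+            if (value >= end)
+                return -1;                    /* truncated pair */
+            fn(key, value);
+            buf = value + strlen(value) + 1;  /* skip value NUL */
+        }
+        return 0;
+    }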
+
+EMULATOR\_CONTEXT
+-----------------
+
+A context blob for a specific emulator associated with the domain.
+
+ 0 1 2 3 4 5 6 7 octet
+ +------------------------+------------------------+
+ | emulator_id | index |
+ +------------------------+------------------------+
+ | emulator_ctx |
+ ...
+ +-------------------------------------------------+
+
+The *emulator_ctx* is a binary blob interpreted by the emulator identified by
+*emulator_id*. Its format is unspecified.
+
+CHECKPOINT\_END
+---------------
+
+A checkpoint end record marks the end of a checkpoint in the image.
+
+ 0 1 2 3 4 5 6 7 octet
+ +-------------------------------------------------+
+
+The checkpoint end record contains no fields; its body_length is 0.
+
+
+Future Extensions
+=================
+
+All changes to this specification should bump the revision number in
+the title block.
+
+All changes to the header require the header version to be increased.
+
+The format may be extended by adding additional record types.
+
+Extending an existing record type must be done by adding a new record
+type. This allows old images with the old record to still be
+restored.
diff --git a/extras/mini-os/COPYING b/extras/mini-os/COPYING
deleted file mode 100644
index 1d9df6c..0000000
--- a/extras/mini-os/COPYING
+++ /dev/null
@@ -1,36 +0,0 @@
-Certain files in this directory are licensed by the GNU
-General Public License version 2 (GPLv2). By default these
-files are not built and linked into MiniOs. Enabling them
-will cause the whole work to become covered by the GPLv2.
-
-The current set of GPLv2 features are:
-CONFIG_TPMFRONT
-CONFIG_TPMBACK
-CONFIG_TPM_TIS
-
-Do not use these if you do not want your MiniOS build to become
-GPL licensed!
-
-Copyright (c) 2009 Citrix Systems, Inc. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGE.
-
diff --git a/extras/mini-os/Config.mk b/extras/mini-os/Config.mk
deleted file mode 100644
index 4852443..0000000
--- a/extras/mini-os/Config.mk
+++ /dev/null
@@ -1,57 +0,0 @@
-# Set mini-os root path, used in mini-os.mk.
-MINI-OS_ROOT=$(XEN_ROOT)/extras/mini-os
-export MINI-OS_ROOT
-
-libc = $(stubdom)
-
-XEN_INTERFACE_VERSION := 0x00030205
-export XEN_INTERFACE_VERSION
-
-# Try to find out the architecture family TARGET_ARCH_FAM.
-# First check whether x86_... is contained (for x86_32, x86_32y, x86_64).
-# If not x86 then use $(XEN_TARGET_ARCH)
-ifeq ($(findstring x86_,$(XEN_TARGET_ARCH)),x86_)
-TARGET_ARCH_FAM = x86
-else
-TARGET_ARCH_FAM = $(XEN_TARGET_ARCH)
-endif
-
-# The architecture family directory below mini-os.
-TARGET_ARCH_DIR := arch/$(TARGET_ARCH_FAM)
-
-# Export these variables for possible use in architecture dependent makefiles.
-export TARGET_ARCH_DIR
-export TARGET_ARCH_FAM
-
-# This is used for architecture specific links.
-# This can be overwritten from arch specific rules.
-ARCH_LINKS =
-
-# The path pointing to the architecture specific header files.
-ARCH_INC := $(TARGET_ARCH_FAM)
-
-# For possible special header directories.
-# This can be overwritten from arch specific rules.
-EXTRA_INC = $(ARCH_INC)
-
-# Include the architecture family's special makerules.
-# This must be before include minios.mk!
-include $(MINI-OS_ROOT)/$(TARGET_ARCH_DIR)/arch.mk
-
-extra_incl := $(foreach dir,$(EXTRA_INC),-isystem $(MINI-OS_ROOT)/include/$(dir))
-
-DEF_CPPFLAGS += -isystem $(MINI-OS_ROOT)/include
-DEF_CPPFLAGS += -D__MINIOS__
-
-ifeq ($(libc),y)
-DEF_CPPFLAGS += -DHAVE_LIBC
-DEF_CPPFLAGS += -isystem $(MINI-OS_ROOT)/include/posix
-DEF_CPPFLAGS += -isystem $(XEN_ROOT)/tools/xenstore/include
-endif
-
-ifneq ($(LWIPDIR),)
-lwip=y
-DEF_CPPFLAGS += -DHAVE_LWIP
-DEF_CPPFLAGS += -isystem $(LWIPDIR)/src/include
-DEF_CPPFLAGS += -isystem $(LWIPDIR)/src/include/ipv4
-endif
diff --git a/extras/mini-os/Makefile b/extras/mini-os/Makefile
deleted file mode 100644
index 6d6537e..0000000
--- a/extras/mini-os/Makefile
+++ /dev/null
@@ -1,226 +0,0 @@
-# Common Makefile for mini-os.
-#
-# Every architecture directory below mini-os/arch has to have a
-# Makefile and a arch.mk.
-#
-
-export XEN_ROOT = $(CURDIR)/../..
-include $(XEN_ROOT)/Config.mk
-OBJ_DIR ?= $(CURDIR)
-
-ifeq ($(MINIOS_CONFIG),)
-include Config.mk
-else
-EXTRA_DEPS += $(MINIOS_CONFIG)
-include $(MINIOS_CONFIG)
-endif
-
-# Configuration defaults
-CONFIG_START_NETWORK ?= y
-CONFIG_SPARSE_BSS ?= y
-CONFIG_QEMU_XS_ARGS ?= n
-CONFIG_TEST ?= n
-CONFIG_PCIFRONT ?= n
-CONFIG_BLKFRONT ?= y
-CONFIG_TPMFRONT ?= n
-CONFIG_TPM_TIS ?= n
-CONFIG_TPMBACK ?= n
-CONFIG_NETFRONT ?= y
-CONFIG_FBFRONT ?= y
-CONFIG_KBDFRONT ?= y
-CONFIG_CONSFRONT ?= y
-CONFIG_XENBUS ?= y
-CONFIG_XC ?=y
-CONFIG_LWIP ?= $(lwip)
-
-# Export config items as compiler directives
-flags-$(CONFIG_START_NETWORK) += -DCONFIG_START_NETWORK
-flags-$(CONFIG_SPARSE_BSS) += -DCONFIG_SPARSE_BSS
-flags-$(CONFIG_QEMU_XS_ARGS) += -DCONFIG_QEMU_XS_ARGS
-flags-$(CONFIG_PCIFRONT) += -DCONFIG_PCIFRONT
-flags-$(CONFIG_BLKFRONT) += -DCONFIG_BLKFRONT
-flags-$(CONFIG_TPMFRONT) += -DCONFIG_TPMFRONT
-flags-$(CONFIG_TPM_TIS) += -DCONFIG_TPM_TIS
-flags-$(CONFIG_TPMBACK) += -DCONFIG_TPMBACK
-flags-$(CONFIG_NETFRONT) += -DCONFIG_NETFRONT
-flags-$(CONFIG_KBDFRONT) += -DCONFIG_KBDFRONT
-flags-$(CONFIG_FBFRONT) += -DCONFIG_FBFRONT
-flags-$(CONFIG_CONSFRONT) += -DCONFIG_CONSFRONT
-flags-$(CONFIG_XENBUS) += -DCONFIG_XENBUS
-
-DEF_CFLAGS += $(flags-y)
-
-# Symlinks and headers that must be created before building the C files
-GENERATED_HEADERS := include/list.h $(ARCH_LINKS) include/mini-os include/xen include/$(TARGET_ARCH_FAM)/mini-os
-
-EXTRA_DEPS += $(GENERATED_HEADERS)
-
-# Include common mini-os makerules.
-include minios.mk
-
-# Set tester flags
-# CFLAGS += -DBLKTEST_WRITE
-
-# Define some default flags for linking.
-LDLIBS :=
-APP_LDLIBS :=
-LDARCHLIB := -L$(OBJ_DIR)/$(TARGET_ARCH_DIR) -l$(ARCH_LIB_NAME)
-LDFLAGS_FINAL := -T $(TARGET_ARCH_DIR)/minios-$(XEN_TARGET_ARCH).lds
-
-# Prefix for global API names. All other symbols are localised before
-# linking with EXTRA_OBJS.
-GLOBAL_PREFIX := xenos_
-EXTRA_OBJS =
-
-TARGET := mini-os
-
-# Subdirectories common to mini-os
-SUBDIRS := lib xenbus console
-
-src-$(CONFIG_BLKFRONT) += blkfront.c
-src-$(CONFIG_TPMFRONT) += tpmfront.c
-src-$(CONFIG_TPM_TIS) += tpm_tis.c
-src-$(CONFIG_TPMBACK) += tpmback.c
-src-y += daytime.c
-src-y += events.c
-src-$(CONFIG_FBFRONT) += fbfront.c
-src-y += gntmap.c
-src-y += gnttab.c
-src-y += hypervisor.c
-src-y += kernel.c
-src-y += lock.c
-src-y += main.c
-src-y += mm.c
-src-$(CONFIG_NETFRONT) += netfront.c
-src-$(CONFIG_PCIFRONT) += pcifront.c
-src-y += sched.c
-src-$(CONFIG_TEST) += test.c
-
-src-y += lib/ctype.c
-src-y += lib/math.c
-src-y += lib/printf.c
-src-y += lib/stack_chk_fail.c
-src-y += lib/string.c
-src-y += lib/sys.c
-src-y += lib/xmalloc.c
-src-$(CONFIG_XENBUS) += lib/xs.c
-
-src-$(CONFIG_XENBUS) += xenbus/xenbus.c
-
-src-y += console/console.c
-src-y += console/xencons_ring.c
-src-$(CONFIG_CONSFRONT) += console/xenbus.c
-
-# The common mini-os objects to build.
-APP_OBJS :=
-OBJS := $(patsubst %.c,$(OBJ_DIR)/%.o,$(src-y))
-
-.PHONY: default
-default: $(OBJ_DIR)/$(TARGET)
-
-# Create special architecture specific links. The function arch_links
-# has to be defined in arch.mk (see include above).
-ifneq ($(ARCH_LINKS),)
-$(ARCH_LINKS):
- $(arch_links)
-endif
-
-include/list.h: $(XEN_ROOT)/tools/include/xen-external/bsd-sys-queue-h-seddery $(XEN_ROOT)/tools/include/xen-external/bsd-sys-queue.h
- perl $^ --prefix=minios >$@.new
- $(call move-if-changed,$@.new,$@)
-
-# Used by stubdom's Makefile
-.PHONY: links
-links: $(GENERATED_HEADERS)
-
-include/xen:
- ln -sf ../../../xen/include/public $@
-
-include/mini-os:
- ln -sf . $@
-
-include/$(TARGET_ARCH_FAM)/mini-os:
- ln -sf . $@
-
-.PHONY: arch_lib
-arch_lib:
- $(MAKE) --directory=$(TARGET_ARCH_DIR) OBJ_DIR=$(OBJ_DIR)/$(TARGET_ARCH_DIR) || exit 1;
-
-ifeq ($(CONFIG_LWIP),y)
-# lwIP library
-LWC := $(shell find $(LWIPDIR)/src -type f -name '*.c')
-LWC := $(filter-out %6.c %ip6_addr.c %ethernetif.c, $(LWC))
-LWO := $(patsubst %.c,%.o,$(LWC))
-LWO += $(OBJ_DIR)/lwip-arch.o
-ifeq ($(CONFIG_NETFRONT),y)
-LWO += $(OBJ_DIR)/lwip-net.o
-endif
-
-$(OBJ_DIR)/lwip.a: $(LWO)
- $(RM) $@
- $(AR) cqs $@ $^
-
-OBJS += $(OBJ_DIR)/lwip.a
-endif
-
-OBJS := $(filter-out $(OBJ_DIR)/lwip%.o $(LWO), $(OBJS))
-
-ifeq ($(libc),y)
-ifeq ($(CONFIG_XC),y)
-APP_LDLIBS += -L$(XEN_ROOT)/stubdom/libxc-$(XEN_TARGET_ARCH) -whole-archive -lxenguest -lxenctrl -no-whole-archive
-endif
-APP_LDLIBS += -lpci
-APP_LDLIBS += -lz
-APP_LDLIBS += -lm
-LDLIBS += -lc
-endif
-
-ifneq ($(APP_OBJS)-$(lwip),-y)
-OBJS := $(filter-out $(OBJ_DIR)/daytime.o, $(OBJS))
-endif
-
-$(OBJ_DIR)/$(TARGET)_app.o: $(APP_OBJS) app.lds
- $(LD) -r -d $(LDFLAGS) -\( $^ -\) $(APP_LDLIBS) --undefined main -o $@
-
-ifneq ($(APP_OBJS),)
-APP_O=$(OBJ_DIR)/$(TARGET)_app.o
-endif
-
-$(OBJ_DIR)/$(TARGET): $(OBJS) $(APP_O) arch_lib
- $(LD) -r $(LDFLAGS) $(HEAD_OBJ) $(APP_O) $(OBJS) $(LDARCHLIB) $(LDLIBS) -o $@.o
- $(OBJCOPY) -w -G $(GLOBAL_PREFIX)* -G _start $@.o $@.o
- $(LD) $(LDFLAGS) $(LDFLAGS_FINAL) $@.o $(EXTRA_OBJS) -o $@
- gzip -f -9 -c $@ >$@.gz
-
-.PHONY: clean arch_clean
-
-arch_clean:
- $(MAKE) --directory=$(TARGET_ARCH_DIR) OBJ_DIR=$(OBJ_DIR)/$(TARGET_ARCH_DIR) clean || exit 1;
-
-clean: arch_clean
- for dir in $(addprefix $(OBJ_DIR)/,$(SUBDIRS)); do \
- rm -f $$dir/*.o; \
- done
- rm -f include/list.h
- rm -f $(OBJ_DIR)/*.o *~ $(OBJ_DIR)/core $(OBJ_DIR)/$(TARGET).elf $(OBJ_DIR)/$(TARGET).raw $(OBJ_DIR)/$(TARGET) $(OBJ_DIR)/$(TARGET).gz
- find . $(OBJ_DIR) -type l | xargs rm -f
- $(RM) $(OBJ_DIR)/lwip.a $(LWO)
- rm -f tags TAGS
-
-
-define all_sources
- ( find . -follow -name SCCS -prune -o -name '*.[chS]' -print )
-endef
-
-.PHONY: cscope
-cscope:
- $(all_sources) > cscope.files
- cscope -k -b -q
-
-.PHONY: tags
-tags:
- $(all_sources) | xargs ctags
-
-.PHONY: TAGS
-TAGS:
- $(all_sources) | xargs etags
diff --git a/extras/mini-os/README b/extras/mini-os/README
deleted file mode 100644
index 7960314..0000000
--- a/extras/mini-os/README
+++ /dev/null
@@ -1,46 +0,0 @@
- Minimal OS
- ----------
-
-This shows some of the stuff that any guest OS will have to set up.
-
-This includes:
-
- * installing a virtual exception table
- * handling virtual exceptions
- * handling asynchronous events
- * enabling/disabling async events
- * parsing start_info struct at start-of-day
- * registering virtual interrupt handlers (for timer interrupts)
- * a simple page and memory allocator
- * minimal libc support
- * minimal Copy-on-Write support
- * network, block, framebuffer support
- * transparent access to FileSystem exports (see tools/fs-back)
-
-- to build it just type make.
-
-- to build it with TCP/IP support, download LWIP 1.3.2 source code and type
-
- make LWIPDIR=/path/to/lwip/source
-
-- to build it with much better libc support, see the stubdom/ directory
-
-- to start it do the following in domain0
- # xl create -c domain_config
-
-This starts the kernel and prints out a bunch of stuff and then once every
-second the system time.
-
-If you have setup a disk in the config file (e.g.
-disk = [ 'file:/tmp/foo,hda,r' ] ), it will loop reading it. If that disk is
-writable (e.g. disk = [ 'file:/tmp/foo,hda,w' ] ), it will write data patterns
-and re-read them.
-
-If you have setup a network in the config file (e.g. vif = [''] ), it will
-print incoming packets.
-
-If you have setup a VFB in the config file (e.g. vfb = ['type=sdl'] ), it will
-show a mouse with which you can draw color squares.
-
-If you have compiled it with TCP/IP support, it will run a daytime server on
-TCP port 13.
diff --git a/extras/mini-os/app.lds b/extras/mini-os/app.lds
deleted file mode 100644
index 4a48cc8..0000000
--- a/extras/mini-os/app.lds
+++ /dev/null
@@ -1,11 +0,0 @@
-SECTIONS
-{
- .app.bss : {
- __app_bss_start = . ;
- *(.bss .bss.*)
- *(COMMON)
- *(.lbss .lbss.*)
- *(LARGE_COMMON)
- __app_bss_end = . ;
- }
-}
diff --git a/extras/mini-os/arch/arm/arm32.S b/extras/mini-os/arch/arm/arm32.S
deleted file mode 100644
index 73223c8..0000000
--- a/extras/mini-os/arch/arm/arm32.S
+++ /dev/null
@@ -1,233 +0,0 @@
-@ Offset of the kernel within the RAM. This is a Linux/zImage convention which we
-@ rely on for now.
-#define ZIMAGE_KERNEL_OFFSET 0x8000
-
-.section .text
-
-.globl _start
-_start:
- @ zImage header
-.rept 8
- mov r0, r0
-.endr
- b reset
- .word 0x016f2818 @ Magic numbers to help the loader
- .word 0 @ zImage start address (0 = relocatable)
- .word _edata - _start @ zImage end address (excludes bss section)
- @ end of zImage header
-
-@ Called at boot time. Sets up MMU, exception vectors and stack, and then calls C arch_init() function.
-@ => r2 -> DTB
-@ <= never returns
-@ Note: this boot code needs to be within the first (1MB - ZIMAGE_KERNEL_OFFSET) of _start.
-reset:
- @ Problem: the C code wants to be at a known address (_start), but Xen might
- @ load us anywhere. We initialise the MMU (mapping virtual to physical @ addresses)
- @ so everything ends up where the code expects it to be.
- @
- @ We calculate the offet between where the linker thought _start would be and where
- @ it actually is and initialise the page tables to have that offset for every page.
- @
- @ When we turn on the MMU, we're still executing at the old address. We don't want
- @ the code to disappear from under us. So we have to do the mapping in stages:
- @
- @ 1. set up a mapping to our current page from both its current and desired addresses
- @ 2. enable the MMU
- @ 3. jump to the new address
- @ 4. remap all the other pages with the calculated offset
-
- adr r1, _start @ r1 = physical address of _start
- ldr r3, =_start @ r3 = (desired) virtual address of _start
- sub r9, r1, r3 @ r9 = (physical - virtual) offset
-
- ldr r7, =_page_dir @ r7 = (desired) virtual addr of translation table
- add r1, r7, r9 @ r1 = physical addr of translation table
-
- @ Tell the system where our page table is located.
- @ This is the 16 KB top-level translation table, in which
- @ each word maps one 1MB virtual section to a physical section.
- @ Note: We leave TTBCR as 0, meaning that only TTBR0 is used and
- @ we use the short-descriptor format (32-bit physical addresses).
- orr r0, r1, #0b0001011 @ Sharable, Inner/Outer Write-Back Write-Allocate Cacheable
- mcr p15, 0, r0, c2, c0, 0 @ set TTBR0
-
- @ Set access permission for domains.
- @ Domains are deprecated, but we have to configure them anyway.
- @ We mark every page as being domain 0 and set domain 0 to "client mode"
- @ (client mode = use access flags in page table).
- mov r0, #1 @ 1 = client
- mcr p15, 0, r0, c3, c0, 0 @ DACR
-
- @ Template (flags) for a 1 MB page-table entry.
- @ TEX[2:0] C B = 001 1 1 (outer and inner write-back, write-allocate)
- ldr r8, =(0x2 + /* Section entry */ \
- 0xc + /* C B */ \
- (3 << 10) + /* Read/write */ \
- (1 << 12) + /* TEX */ \
- (1 << 16) + /* Sharable */ \
- (1<<19)) /* Non-secure */
- @ r8 = template page table entry
-
- @ Add an entry for the current physical section, at the old and new
- @ addresses. It's OK if they're the same.
- mov r0, pc, lsr#20
- mov r0, r0, lsl#20 @ r0 = physical address of this code's section start
- orr r3, r0, r8 @ r3 = table entry for this section
- ldr r4, =_start @ r4 = desired virtual address of this section
- str r3, [r1, r4, lsr#18] @ map desired virtual section to this code
- str r3, [r1, r0, lsr#18] @ map current section to this code too
-
- @ Invalidate TLB
- dsb @ Caching is off, but must still prevent reordering
- mcr p15, 0, r1, c8, c7, 0 @ TLBIALL
-
- @ Enable MMU / SCTLR
- mrc p15, 0, r1, c1, c0, 0 @ SCTLR
- orr r1, r1, #3 << 11 @ enable icache, branch prediction
- orr r1, r1, #4 + 1 @ enable dcache, MMU
- mcr p15, 0, r1, c1, c0, 0 @ SCTLR
- isb
-
- ldr r1, =stage2 @ Virtual address of stage2
- bx r1
-
-@ Called once the MMU is enabled. The boot code and the page table are mapped,
-@ but nothing else is yet.
-@
-@ => r2 -> dtb (physical)
-@ r7 = virtual address of page table
-@ r8 = section entry template (flags)
-@ r9 = desired physical - virtual offset
-@ pc -> somewhere in newly-mapped virtual code section
-stage2:
- @ Invalidate TLB
- mcr p15, 0, r1, c8, c7, 0 @ TLBIALL
- isb
-
- @ The new mapping has now taken effect:
- @ r7 -> page_dir
-
- @ Fill in the whole top-level translation table (at page_dir).
- @ Populate the whole pagedir with 1MB section descriptors.
-
- mov r1, r7 @ r1 -> first section entry
- add r3, r1, #4*4*1024 @ limit (4 GB address space, 4 byte entries)
- orr r0, r8, r9 @ r0 = entry mapping section zero to start of physical RAM
-1:
- str r0, [r1],#4 @ write the section entry
- add r0, r0, #1 << 20 @ next physical page (wraps)
- cmp r1, r3
- bne 1b
-
- @ Invalidate TLB
- dsb
- mcr p15, 0, r1, c8, c7, 0 @ TLBIALL
- isb
-
- @ Set VBAR -> exception_vector_table
- @ SCTLR.V = 0
- adr r0, exception_vector_table
- mcr p15, 0, r0, c12, c0, 0
-
- @ Enable hardware floating point:
- @ 1. Access to CP10 and CP11 must be enabled in the Coprocessor Access
- @ Control Register (CP15.CACR):
- mrc p15, 0, r1, c1, c0, 2 @ CACR
- orr r1, r1, #(3 << 20) + (3 << 22) @ full access for CP10 & CP11
- mcr p15, 0, r1, c1, c0, 2
- @ 2. The EN bit in the FPEXC register must be set:
- vmrs r0, FPEXC
- orr r0, r0, #1<<30 @ EN (enable)
- vmsr FPEXC, r0
-
- @ Initialise 16 KB stack
- ldr sp, =_boot_stack_end
-
- sub r0, r2, r9 @ r0 -> device tree (virtual address)
- mov r1, r9 @ r1 = physical_address_offset
-
- b arch_init
-
-.pushsection .bss
-@ Note: calling arch_init zeroes out this region.
-.align 12
-.globl shared_info_page
-shared_info_page:
- .fill (1024), 4, 0x0
-
-.align 3
-.globl irqstack
-.globl irqstack_end
-irqstack:
- .fill (1024), 4, 0x0
-irqstack_end:
-
-.popsection
-
-@ exception base address
-.align 5
-.globl exception_vector_table
-@ Note: remember to call CLREX if returning from an exception:
-@ "The architecture enables the local monitor to treat any exclusive store as
-@ matching a previous LDREX address. For this reason, use of the CLREX
-@ instruction to clear an existing tag is required on context switches."
-@ -- ARM Cortex-A Series Programmer’s Guide (Version: 4.0)
-exception_vector_table:
- b . @ reset
- b . @ undefined instruction
- b . @ supervisor call
- b . @ prefetch call
- b . @ prefetch abort
- b . @ data abort
- b irq_handler @ irq
- .word 0xe7f000f0 @ abort on FIQ
-
-@ Call fault_undefined_instruction in "Undefined mode"
-bug:
- .word 0xe7f000f0 @ und/udf - a "Permanently Undefined" instruction
-
-irq_handler:
- ldr sp, =irqstack_end
- push {r0 - r12, r14}
-
- ldr r0, IRQ_handler
- cmp r0, #0
- beq bug
- blx r0 @ call handler
-
- @ Return from IRQ
- pop {r0 - r12, r14}
- clrex
- subs pc, lr, #4
-
-.globl IRQ_handler
-IRQ_handler:
- .long 0x0
-
-
-.globl __arch_switch_threads
-@ => r0 = &prev->sp
-@ r1 = &next->sp
-@ <= returns to next thread's saved return address
-__arch_switch_threads:
- push {r4-r11} @ Store callee-saved registers to old thread's stack
- stmia r0, {sp, lr} @ Store current sp and ip to prev's struct thread
-
- ldmia r1, {sp, lr} @ Load new sp, ip from next's struct thread
- pop {r4-r11} @ Load callee-saved registers from new thread's stack
-
- bx lr
-
-@ This is called if you try to divide by zero. For now, we make a supervisor call,
-@ which will make us halt.
-.globl raise
-raise:
- svc 0
-
-.globl arm_start_thread
-arm_start_thread:
- pop {r0, r1}
- @ r0 = user data
- @ r1 -> thread's main function
- ldr lr, =exit_thread
- bx r1
diff --git a/extras/mini-os/arch/arm/events.c b/extras/mini-os/arch/arm/events.c
deleted file mode 100644
index 441010d..0000000
--- a/extras/mini-os/arch/arm/events.c
+++ /dev/null
@@ -1,31 +0,0 @@
-#include <mini-os/os.h>
-#include <mini-os/events.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/console.h>
-
-static void virq_debug(evtchn_port_t port, struct pt_regs *regs, void *params)
-{
- printk("Received a virq_debug event\n");
-}
-
-evtchn_port_t debug_port = -1;
-void arch_init_events(void)
-{
- debug_port = bind_virq(VIRQ_DEBUG, (evtchn_handler_t)virq_debug, 0);
- if(debug_port == -1)
- BUG();
- unmask_evtchn(debug_port);
-}
-
-void arch_unbind_ports(void)
-{
- if(debug_port != -1)
- {
- mask_evtchn(debug_port);
- unbind_evtchn(debug_port);
- }
-}
-
-void arch_fini_events(void)
-{
-}
diff --git a/extras/mini-os/arch/arm/hypercalls32.S b/extras/mini-os/arch/arm/hypercalls32.S
deleted file mode 100644
index af8e175..0000000
--- a/extras/mini-os/arch/arm/hypercalls32.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/******************************************************************************
- * hypercall.S
- *
- * Xen hypercall wrappers
- *
- * Stefano Stabellini <stefano.stabellini at eu.citrix.com>, Citrix, 2012
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <xen/xen.h>
-
-#define __HVC(imm16) .long ((0xE1400070 | (((imm16) & 0xFFF0) << 4) | ((imm16) & 0x000F)) & 0xFFFFFFFF)
-
-#define XEN_IMM 0xEA1
-
-#define HYPERCALL_SIMPLE(hypercall) \
-.globl HYPERVISOR_##hypercall; \
-.align 4,0x90; \
-HYPERVISOR_##hypercall: \
- mov r12, #__HYPERVISOR_##hypercall; \
- __HVC(XEN_IMM); \
- mov pc, lr;
-
-#define _hypercall0 HYPERCALL_SIMPLE
-#define _hypercall1 HYPERCALL_SIMPLE
-#define _hypercall2 HYPERCALL_SIMPLE
-#define _hypercall3 HYPERCALL_SIMPLE
-#define _hypercall4 HYPERCALL_SIMPLE
-
-_hypercall2(sched_op);
-_hypercall2(memory_op);
-_hypercall2(event_channel_op);
-_hypercall2(xen_version);
-_hypercall3(console_io);
-_hypercall1(physdev_op);
-_hypercall3(grant_table_op);
-_hypercall3(vcpu_op);
-_hypercall1(sysctl);
-_hypercall1(domctl);
-_hypercall2(hvm_op);
-_hypercall1(xsm_op);
diff --git a/extras/mini-os/arch/arm/minios-arm32.lds b/extras/mini-os/arch/arm/minios-arm32.lds
deleted file mode 100755
index 9627162..0000000
--- a/extras/mini-os/arch/arm/minios-arm32.lds
+++ /dev/null
@@ -1,83 +0,0 @@
-OUTPUT_ARCH(arm)
-ENTRY(_start)
-SECTIONS
-{
- /* Note: we currently assume that Xen will load the kernel image
- * at start-of-RAM + 0x8000. We use this initial 32 KB for the stack
- * and translation tables.
- */
- _boot_stack = 0x400000; /* 16 KB boot stack */
- _boot_stack_end = 0x404000;
- _page_dir = 0x404000; /* 16 KB translation table */
- . = 0x408000;
- _text = .; /* Text and read-only data */
- .text : {
- *(.text)
- *(.gnu.warning)
- } = 0x9090
-
- _etext = .; /* End of text section */
-
- .rodata : { *(.rodata) *(.rodata.*) }
- . = ALIGN(4096);
- _erodata = .;
-
- /* newlib initialization functions */
- . = ALIGN(32 / 8);
- PROVIDE (__preinit_array_start = .);
- .preinit_array : { *(.preinit_array) }
- PROVIDE (__preinit_array_end = .);
- PROVIDE (__init_array_start = .);
- .init_array : { *(.init_array) }
- PROVIDE (__init_array_end = .);
- PROVIDE (__fini_array_start = .);
- .fini_array : { *(.fini_array) }
- PROVIDE (__fini_array_end = .);
-
- .ctors : {
- __CTOR_LIST__ = .;
- *(.ctors)
- CONSTRUCTORS
- LONG(0)
- __CTOR_END__ = .;
- }
-
- .dtors : {
- __DTOR_LIST__ = .;
- *(.dtors)
- LONG(0)
- __DTOR_END__ = .;
- }
-
- .data : { /* Data */
- *(.data)
- }
-
- /* Note: linker will insert any extra sections here, just before .bss */
-
- .bss : {
- _edata = .; /* End of data included in image */
- /* Nothing after here is included in the zImage's size */
-
- __bss_start = .;
- *(.bss)
- *(.app.bss)
- }
- _end = . ;
-
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.text.exit)
- *(.data.exit)
- *(.exitcall.exit)
- }
-
- /* Stabs debugging sections. */
- .stab 0 : { *(.stab) }
- .stabstr 0 : { *(.stabstr) }
- .stab.excl 0 : { *(.stab.excl) }
- .stab.exclstr 0 : { *(.stab.exclstr) }
- .stab.index 0 : { *(.stab.index) }
- .stab.indexstr 0 : { *(.stab.indexstr) }
- .comment 0 : { *(.comment) }
-}
diff --git a/extras/mini-os/arch/arm/mm.c b/extras/mini-os/arch/arm/mm.c
deleted file mode 100644
index efecc51..0000000
--- a/extras/mini-os/arch/arm/mm.c
+++ /dev/null
@@ -1,139 +0,0 @@
-#include <mini-os/console.h>
-#include <xen/memory.h>
-#include <arch_mm.h>
-#include <mini-os/hypervisor.h>
-#include <libfdt.h>
-#include <lib.h>
-
-uint32_t physical_address_offset;
-
-unsigned long allocate_ondemand(unsigned long n, unsigned long alignment)
-{
- // FIXME
- BUG();
-}
-
-void arch_init_mm(unsigned long *start_pfn_p, unsigned long *max_pfn_p)
-{
- int memory;
- int prop_len = 0;
- const uint64_t *regs;
-
- printk(" _text: %p(VA)\n", &_text);
- printk(" _etext: %p(VA)\n", &_etext);
- printk(" _erodata: %p(VA)\n", &_erodata);
- printk(" _edata: %p(VA)\n", &_edata);
- printk(" stack start: %p(VA)\n", _boot_stack);
- printk(" _end: %p(VA)\n", &_end);
-
- if (fdt_num_mem_rsv(device_tree) != 0)
- printk("WARNING: reserved memory not supported!\n");
-
- memory = fdt_node_offset_by_prop_value(device_tree, -1, "device_type", "memory", sizeof("memory"));
- if (memory < 0) {
- printk("No memory found in FDT!\n");
- BUG();
- }
-
- /* Xen will always provide us at least one bank of memory.
- * Mini-OS will use the first bank for the time-being. */
- regs = fdt_getprop(device_tree, memory, "reg", &prop_len);
-
- /* The property must contain at least the start address
- * and size, each of which is 8-bytes. */
- if (regs == NULL || prop_len < 16) {
- printk("Bad 'reg' property: %p %d\n", regs, prop_len);
- BUG();
- }
-
- unsigned int end = (unsigned int) &_end;
- paddr_t mem_base = fdt64_to_cpu(regs[0]);
- uint64_t mem_size = fdt64_to_cpu(regs[1]);
- printk("Found memory at 0x%llx (len 0x%llx)\n",
- (unsigned long long) mem_base, (unsigned long long) mem_size);
-
- BUG_ON(to_virt(mem_base) > (void *) &_text); /* Our image isn't in our RAM! */
- *start_pfn_p = PFN_UP(to_phys(end));
- uint64_t heap_len = mem_size - (PFN_PHYS(*start_pfn_p) - mem_base);
- *max_pfn_p = *start_pfn_p + PFN_DOWN(heap_len);
-
- printk("Using pages %lu to %lu as free space for heap.\n", *start_pfn_p, *max_pfn_p);
-
- /* The device tree is probably in memory that we're about to hand over to the page
- * allocator, so move it to the end and reserve that space.
- */
- uint32_t fdt_size = fdt_totalsize(device_tree);
- void *new_device_tree = to_virt(((*max_pfn_p << PAGE_SHIFT) - fdt_size) & PAGE_MASK);
- if (new_device_tree != device_tree) {
- memmove(new_device_tree, device_tree, fdt_size);
- }
- device_tree = new_device_tree;
- *max_pfn_p = to_phys(new_device_tree) >> PAGE_SHIFT;
-}
-
-void arch_init_p2m(unsigned long max_pfn)
-{
-}
-
-void arch_init_demand_mapping_area(unsigned long cur_pfn)
-{
-}
-
-/* Get Xen's suggested physical page assignments for the grant table. */
-static paddr_t get_gnttab_base(void)
-{
- int hypervisor;
- int len = 0;
- const uint64_t *regs;
- paddr_t gnttab_base;
-
- hypervisor = fdt_node_offset_by_compatible(device_tree, -1, "xen,xen");
- BUG_ON(hypervisor < 0);
-
- regs = fdt_getprop(device_tree, hypervisor, "reg", &len);
- /* The property contains the address and size, 8 bytes each. */
- if (regs == NULL || len < 16) {
- printk("Bad 'reg' property: %p %d\n", regs, len);
- BUG();
- }
-
- gnttab_base = fdt64_to_cpu(regs[0]);
-
- printk("FDT suggests grant table base %llx\n", (unsigned long long) gnttab_base);
-
- return gnttab_base;
-}
-
-grant_entry_t *arch_init_gnttab(int nr_grant_frames)
-{
- struct xen_add_to_physmap xatp;
- struct gnttab_setup_table setup;
- xen_pfn_t frames[nr_grant_frames];
- paddr_t gnttab_table;
- int i, rc;
-
- gnttab_table = get_gnttab_base();
-
- for (i = 0; i < nr_grant_frames; i++)
- {
- xatp.domid = DOMID_SELF;
- xatp.size = 0; /* Seems to be unused */
- xatp.space = XENMAPSPACE_grant_table;
- xatp.idx = i;
- xatp.gpfn = (gnttab_table >> PAGE_SHIFT) + i;
- rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
- BUG_ON(rc != 0);
- }
-
- setup.dom = DOMID_SELF;
- setup.nr_frames = nr_grant_frames;
- set_xen_guest_handle(setup.frame_list, frames);
- HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
- if (setup.status != 0)
- {
- printk("GNTTABOP_setup_table failed; status = %d\n", setup.status);
- BUG();
- }
-
- return to_virt(gnttab_table);
-}
diff --git a/extras/mini-os/arch/arm/sched.c b/extras/mini-os/arch/arm/sched.c
deleted file mode 100644
index 8091566..0000000
--- a/extras/mini-os/arch/arm/sched.c
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <mini-os/sched.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/console.h>
-
-void arm_start_thread(void);
-
-/* The AAPCS requires the callee (e.g. __arch_switch_threads) to preserve r4-r11. */
-#define CALLEE_SAVED_REGISTERS 8
-
-/* Architecture specific setup of thread creation */
-struct thread* arch_create_thread(char *name, void (*function)(void *),
- void *data)
-{
- struct thread *thread;
-
- thread = xmalloc(struct thread);
- /* We can't use lazy allocation here since the trap handler runs on the stack */
- thread->stack = (char *)alloc_pages(STACK_SIZE_PAGE_ORDER);
- thread->name = name;
- printk("Thread \"%s\": pointer: 0x%p, stack: 0x%p\n", name, thread,
- thread->stack);
-
- /* Save pointer to the thread on the stack, used by current macro */
- *((unsigned long *)thread->stack) = (unsigned long)thread;
-
- /* Push the details to pass to arm_start_thread onto the stack. */
- int *sp = (int *) (thread->stack + STACK_SIZE);
- *(--sp) = (int) function;
- *(--sp) = (int) data;
-
- /* We leave room for the 8 callee-saved registers which we will
- * try to restore on thread switch, even though they're not needed
- * for the initial switch. */
- thread->sp = (unsigned long) sp - 4 * CALLEE_SAVED_REGISTERS;
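- /* The initial stack now holds, from the top down: function, data, then
- * eight empty slots standing in for r4-r11; thread->sp points at the
- * lowest slot and thread->ip at arm_start_thread, which picks up data
- * and function on the first switch. */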
-
- thread->ip = (unsigned long) arm_start_thread;
-
- return thread;
-}
-
-void run_idle_thread(void)
-{
- __asm__ __volatile__ ("mov sp, %0; bx %1"::
- "r"(idle_thread->sp + 4 * CALLEE_SAVED_REGISTERS),
- "r"(idle_thread->ip));
- /* Never arrive here! */
-}
diff --git a/extras/mini-os/arch/arm/setup.c b/extras/mini-os/arch/arm/setup.c
deleted file mode 100644
index 06afe46..0000000
--- a/extras/mini-os/arch/arm/setup.c
+++ /dev/null
@@ -1,119 +0,0 @@
-#include <mini-os/os.h>
-#include <mini-os/kernel.h>
-#include <mini-os/gic.h>
-#include <mini-os/console.h>
-#include <xen/xen.h>
-#include <xen/memory.h>
-#include <xen/hvm/params.h>
-#include <arch_mm.h>
-#include <libfdt.h>
-
-/*
- * This structure contains start-of-day info, such as pagetable base pointer,
- * address of the shared_info structure, and things like that.
- * On x86, the hypervisor passes it to us. On ARM, we fill it in ourselves.
- */
-union start_info_union start_info_union;
-
-/*
- * Shared page for communicating with the hypervisor.
- * Events flags go here, for example.
- */
-shared_info_t *HYPERVISOR_shared_info;
-
-extern char shared_info_page[PAGE_SIZE];
-
-void *device_tree;
-
-static int hvm_get_parameter(int idx, uint64_t *value)
-{
- struct xen_hvm_param xhv;
- int ret;
-
- xhv.domid = DOMID_SELF;
- xhv.index = idx;
- ret = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
- if (ret < 0) {
- BUG();
- }
- *value = xhv.value;
- return ret;
-}
-
-static void get_console(void)
-{
- uint64_t v = -1;
-
- hvm_get_parameter(HVM_PARAM_CONSOLE_EVTCHN, &v);
- start_info.console.domU.evtchn = v;
-
- hvm_get_parameter(HVM_PARAM_CONSOLE_PFN, &v);
- start_info.console.domU.mfn = v;
-
- printk("Console is on port %d\n", start_info.console.domU.evtchn);
- printk("Console ring is at mfn %lx\n", (unsigned long) start_info.console.domU.mfn);
-}
-
-void get_xenbus(void)
-{
- uint64_t value;
-
- if (hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &value))
- BUG();
-
- start_info.store_evtchn = (int)value;
-
- if(hvm_get_parameter(HVM_PARAM_STORE_PFN, &value))
- BUG();
- start_info.store_mfn = (unsigned long)value;
-}
-
-/*
- * INITIAL C ENTRY POINT.
- */
-void arch_init(void *dtb_pointer, uint32_t physical_offset)
-{
- struct xen_add_to_physmap xatp;
- int r;
-
- memset(&__bss_start, 0, &_end - &__bss_start);
-
- physical_address_offset = physical_offset;
-
- xprintk("Virtual -> physical offset = %x\n", physical_address_offset);
-
- xprintk("Checking DTB at %p...\n", dtb_pointer);
-
- if ((r = fdt_check_header(dtb_pointer))) {
- xprintk("Invalid DTB from Xen: %s\n", fdt_strerror(r));
- BUG();
- }
- device_tree = dtb_pointer;
-
- /* Map shared_info page */
- xatp.domid = DOMID_SELF;
- xatp.idx = 0;
- xatp.space = XENMAPSPACE_shared_info;
- xatp.gpfn = virt_to_pfn(shared_info_page);
- if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp) != 0)
- BUG();
- HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
-
- /* Fill in start_info */
- get_console();
- get_xenbus();
-
- gic_init();
-
- start_kernel();
-}
-
-void
-arch_fini(void)
-{
-}
-
-void
-arch_do_exit(void)
-{
-}
diff --git a/extras/mini-os/arch/x86/Makefile b/extras/mini-os/arch/x86/Makefile
deleted file mode 100644
index 1073e36..0000000
--- a/extras/mini-os/arch/x86/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-#
-# x86 architecture specific makefile.
-# It is used for x86_32, x86_32y and x86_64.
-#
-
-XEN_ROOT = $(CURDIR)/../../../..
-include $(XEN_ROOT)/Config.mk
-include ../../Config.mk
-
-# arch.mk has to be included before minios.mk!
-
-include arch.mk
-include ../../minios.mk
-
-# Sources here are all *.c and *.S files except $(XEN_TARGET_ARCH).S,
-# which is handled via $(HEAD_ARCH_OBJ).
-ARCH_SRCS := $(wildcard *.c)
-
-# The objects built from the sources.
-ARCH_OBJS := $(patsubst %.c,$(OBJ_DIR)/%.o,$(ARCH_SRCS))
-
-all: $(OBJ_DIR)/$(ARCH_LIB)
-
-# $(HEAD_ARCH_OBJ) is only built here; it is needed when linking
-# in ../../Makefile.
-$(OBJ_DIR)/$(ARCH_LIB): $(ARCH_OBJS) $(OBJ_DIR)/$(HEAD_ARCH_OBJ)
- $(AR) rv $(OBJ_DIR)/$(ARCH_LIB) $(ARCH_OBJS)
-
-clean:
- rm -f $(OBJ_DIR)/$(ARCH_LIB) $(ARCH_OBJS) $(OBJ_DIR)/$(HEAD_ARCH_OBJ)
-
diff --git a/extras/mini-os/arch/x86/arch.mk b/extras/mini-os/arch/x86/arch.mk
deleted file mode 100644
index b27f322..0000000
--- a/extras/mini-os/arch/x86/arch.mk
+++ /dev/null
@@ -1,22 +0,0 @@
-#
-# Architecture-specific make rules for the x86 family
-# (including x86_32, x86_32y and x86_64).
-#
-
-ifeq ($(XEN_TARGET_ARCH),x86_32)
-ARCH_CFLAGS := -m32 -march=i686
-ARCH_LDFLAGS := -m elf_i386
-ARCH_ASFLAGS := -m32
-EXTRA_INC += $(TARGET_ARCH_FAM)/$(XEN_TARGET_ARCH)
-EXTRA_SRC += arch/$(EXTRA_INC)
-endif
-
-ifeq ($(XEN_TARGET_ARCH),x86_64)
-ARCH_CFLAGS := -m64 -mno-red-zone -fno-reorder-blocks
-ARCH_CFLAGS += -fno-asynchronous-unwind-tables
-ARCH_ASFLAGS := -m64
-ARCH_LDFLAGS := -m elf_x86_64
-EXTRA_INC += $(TARGET_ARCH_FAM)/$(XEN_TARGET_ARCH)
-EXTRA_SRC += arch/$(EXTRA_INC)
-endif
-
diff --git a/extras/mini-os/arch/x86/events.c b/extras/mini-os/arch/x86/events.c
deleted file mode 100644
index 5198cf3..0000000
--- a/extras/mini-os/arch/x86/events.c
+++ /dev/null
@@ -1,35 +0,0 @@
-#include <mini-os/os.h>
-#include <mini-os/mm.h>
-#include <mini-os/events.h>
-
-#if defined(__x86_64__)
-char irqstack[2 * STACK_SIZE];
-
-static struct pda
-{
- int irqcount; /* offset 0 (used in x86_64.S) */
- char *irqstackptr; /* 8 */
-} cpu0_pda;
-#endif
-
-void arch_init_events(void)
-{
-#if defined(__x86_64__)
- asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
- wrmsrl(0xc0000101, &cpu0_pda); /* 0xc0000101 is MSR_GS_BASE */
- cpu0_pda.irqcount = -1;
- cpu0_pda.irqstackptr = (void*) (((unsigned long)irqstack + 2 * STACK_SIZE)
- & ~(STACK_SIZE - 1));
-#endif
-}
-
-void arch_unbind_ports(void)
-{
-}
-
-void arch_fini_events(void)
-{
-#if defined(__x86_64__)
- wrmsrl(0xc0000101, NULL); /* 0xc0000101 is MSR_GS_BASE */
-#endif
-}
diff --git a/extras/mini-os/arch/x86/ioremap.c b/extras/mini-os/arch/x86/ioremap.c
deleted file mode 100644
index 4384b1c..0000000
--- a/extras/mini-os/arch/x86/ioremap.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (C) 2009, Netronome Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-
-#include <mini-os/types.h>
-#include <mini-os/lib.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/mm.h>
-#include <mini-os/ioremap.h>
-
-/* Map a physical address range into virtual address space with the
- * provided flags. Returns the virtual address it is mapped to. */
-static void *__do_ioremap(unsigned long phys_addr, unsigned long size,
- unsigned long prot)
-{
- unsigned long va;
- unsigned long mfns, mfn;
- unsigned long num_pages, offset;
-
- /* Allow non-page-aligned addresses, but align them for the mapping. */
- offset = (phys_addr & ~PAGE_MASK);
- num_pages = (offset + size + PAGE_SIZE - 1) / PAGE_SIZE;
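- /* E.g. phys_addr=0x20003, size=0x2000: offset=3 and num_pages=3, so
- * frames 0x20-0x22 get mapped and the returned pointer is va+3. */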
- phys_addr &= PAGE_MASK;
- mfns = mfn = phys_addr >> PAGE_SHIFT;
-
- va = (unsigned long)map_frames_ex(&mfns, num_pages, 0, 1, 1,
- DOMID_IO, NULL, prot);
- return (void *)(va + offset);
-}
-
-void *ioremap(unsigned long phys_addr, unsigned long size)
-{
- return __do_ioremap(phys_addr, size, IO_PROT);
-}
-
-void *ioremap_nocache(unsigned long phys_addr, unsigned long size)
-{
- return __do_ioremap(phys_addr, size, IO_PROT_NOCACHE);
-}
-
-/* Un-map the io-remapped region. Currently no list of existing mappings is
- * maintained, so the caller has to supply the size */
-void iounmap(void *virt_addr, unsigned long size)
-{
- unsigned long num_pages;
- unsigned long va = (unsigned long)virt_addr;
-
- /* work out number of frames to unmap */
- num_pages = ((va & ~PAGE_MASK) + size + PAGE_SIZE - 1) / PAGE_SIZE;
-
- unmap_frames(va & PAGE_MASK, num_pages);
-}
-
-
-
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 indent-tabs-mode:nil -*- */
diff --git a/extras/mini-os/arch/x86/iorw.c b/extras/mini-os/arch/x86/iorw.c
deleted file mode 100644
index 3080769..0000000
--- a/extras/mini-os/arch/x86/iorw.c
+++ /dev/null
@@ -1,35 +0,0 @@
-#include <mini-os/iorw.h>
-
-void iowrite8(volatile void* addr, uint8_t val)
-{
- *((volatile uint8_t*)addr) = val;
-}
-void iowrite16(volatile void* addr, uint16_t val)
-{
- *((volatile uint16_t*)addr) = val;
-}
-void iowrite32(volatile void* addr, uint32_t val)
-{
- *((volatile uint32_t*)addr) = val;
-}
-void iowrite64(volatile void* addr, uint64_t val)
-{
- *((volatile uint64_t*)addr) = val;
-}
-
-uint8_t ioread8(volatile void* addr)
-{
- return *((volatile uint8_t*) addr);
-}
-uint16_t ioread16(volatile void* addr)
-{
- return *((volatile uint16_t*) addr);
-}
-uint32_t ioread32(volatile void* addr)
-{
- return *((volatile uint32_t*) addr);
-}
-uint64_t ioread64(volatile void* addr)
-{
- return *((volatile uint64_t*) addr);
-}
diff --git a/extras/mini-os/arch/x86/minios-x86_32.lds b/extras/mini-os/arch/x86/minios-x86_32.lds
deleted file mode 100644
index f5cabb6..0000000
--- a/extras/mini-os/arch/x86/minios-x86_32.lds
+++ /dev/null
@@ -1,74 +0,0 @@
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(_start)
-SECTIONS
-{
- . = 0x0;
- _text = .; /* Text and read-only data */
- .text : {
- *(.text)
- *(.gnu.warning)
- } = 0x9090
-
- _etext = .; /* End of text section */
-
- .rodata : { *(.rodata) *(.rodata.*) }
- . = ALIGN(4096);
- _erodata = .;
-
- /* newlib initialization functions */
- . = ALIGN(32 / 8);
- PROVIDE (__preinit_array_start = .);
- .preinit_array : { *(.preinit_array) }
- PROVIDE (__preinit_array_end = .);
- PROVIDE (__init_array_start = .);
- .init_array : { *(.init_array) }
- PROVIDE (__init_array_end = .);
- PROVIDE (__fini_array_start = .);
- .fini_array : { *(.fini_array) }
- PROVIDE (__fini_array_end = .);
-
- .ctors : {
- __CTOR_LIST__ = .;
- *(.ctors)
- CONSTRUCTORS
- LONG(0)
- __CTOR_END__ = .;
- }
-
- .dtors : {
- __DTOR_LIST__ = .;
- *(.dtors)
- LONG(0)
- __DTOR_END__ = .;
- }
-
- .data : { /* Data */
- *(.data)
- }
-
- _edata = .; /* End of data section */
-
- __bss_start = .; /* BSS */
- .bss : {
- *(.bss)
- *(.app.bss)
- }
- _end = . ;
-
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.text.exit)
- *(.data.exit)
- *(.exitcall.exit)
- }
-
- /* Stabs debugging sections. */
- .stab 0 : { *(.stab) }
- .stabstr 0 : { *(.stabstr) }
- .stab.excl 0 : { *(.stab.excl) }
- .stab.exclstr 0 : { *(.stab.exclstr) }
- .stab.index 0 : { *(.stab.index) }
- .stab.indexstr 0 : { *(.stab.indexstr) }
- .comment 0 : { *(.comment) }
-}
diff --git a/extras/mini-os/arch/x86/minios-x86_64.lds b/extras/mini-os/arch/x86/minios-x86_64.lds
deleted file mode 100644
index 3da0a9f..0000000
--- a/extras/mini-os/arch/x86/minios-x86_64.lds
+++ /dev/null
@@ -1,74 +0,0 @@
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(_start)
-SECTIONS
-{
- . = 0x0;
- _text = .; /* Text and read-only data */
- .text : {
- *(.text)
- *(.gnu.warning)
- } = 0x9090
-
- _etext = .; /* End of text section */
-
- .rodata : { *(.rodata) *(.rodata.*) }
- . = ALIGN(4096);
- _erodata = .;
-
- /* newlib initialization functions */
- . = ALIGN(64 / 8);
- PROVIDE (__preinit_array_start = .);
- .preinit_array : { *(.preinit_array) }
- PROVIDE (__preinit_array_end = .);
- PROVIDE (__init_array_start = .);
- .init_array : { *(.init_array) }
- PROVIDE (__init_array_end = .);
- PROVIDE (__fini_array_start = .);
- .fini_array : { *(.fini_array) }
- PROVIDE (__fini_array_end = .);
-
- .ctors : {
- __CTOR_LIST__ = .;
- *(.ctors)
- CONSTRUCTORS
- QUAD(0)
- __CTOR_END__ = .;
- }
-
- .dtors : {
- __DTOR_LIST__ = .;
- *(.dtors)
- QUAD(0)
- __DTOR_END__ = .;
- }
-
- .data : { /* Data */
- *(.data)
- }
-
- _edata = .; /* End of data section */
-
- __bss_start = .; /* BSS */
- .bss : {
- *(.bss)
- *(.app.bss)
- }
- _end = . ;
-
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.text.exit)
- *(.data.exit)
- *(.exitcall.exit)
- }
-
- /* Stabs debugging sections. */
- .stab 0 : { *(.stab) }
- .stabstr 0 : { *(.stabstr) }
- .stab.excl 0 : { *(.stab.excl) }
- .stab.exclstr 0 : { *(.stab.exclstr) }
- .stab.index 0 : { *(.stab.index) }
- .stab.indexstr 0 : { *(.stab.indexstr) }
- .comment 0 : { *(.comment) }
-}
diff --git a/extras/mini-os/arch/x86/mm.c b/extras/mini-os/arch/x86/mm.c
deleted file mode 100644
index 9c6d1b8..0000000
--- a/extras/mini-os/arch/x86/mm.c
+++ /dev/null
@@ -1,957 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- ****************************************************************************
- *
- * File: mm.c
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes: Grzegorz Milos
- *
- * Date: Aug 2003, changes Aug 2005
- *
- * Environment: Xen Minimal OS
- * Description: memory management related functions
- * contains buddy page allocator from Xen.
- *
- ****************************************************************************
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <mini-os/os.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/mm.h>
-#include <mini-os/types.h>
-#include <mini-os/lib.h>
-#include <mini-os/xmalloc.h>
-#include <xen/memory.h>
-
-#ifdef MM_DEBUG
-#define DEBUG(_f, _a...) \
- printk("MINI_OS(file=mm.c, line=%d) " _f "\n", __LINE__, ## _a)
-#else
-#define DEBUG(_f, _a...) ((void)0)
-#endif
-
-unsigned long *phys_to_machine_mapping;
-unsigned long mfn_zero;
-extern char stack[];
-extern void page_walk(unsigned long va);
-
-/*
- * Make pt_pfn a new 'level' page table frame and hook it into the page
- * table at offset in previous level MFN (pref_l_mfn). pt_pfn is a guest
- * PFN.
- */
-static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn,
- unsigned long offset, unsigned long level)
-{
- pgentry_t *tab = (pgentry_t *)start_info.pt_base;
- unsigned long pt_page = (unsigned long)pfn_to_virt(*pt_pfn);
- pgentry_t prot_e, prot_t;
- mmu_update_t mmu_updates[1];
- int rc;
-
- prot_e = prot_t = 0;
- DEBUG("Allocating new L%d pt frame for pfn=%lx, "
- "prev_l_mfn=%lx, offset=%lx",
- level, *pt_pfn, prev_l_mfn, offset);
-
- /* We need to clear the page, otherwise we might fail to map it
- as a page table page */
- memset((void*) pt_page, 0, PAGE_SIZE);
-
- switch ( level )
- {
- case L1_FRAME:
- prot_e = L1_PROT;
- prot_t = L2_PROT;
- break;
- case L2_FRAME:
- prot_e = L2_PROT;
- prot_t = L3_PROT;
- break;
-#if defined(__x86_64__)
- case L3_FRAME:
- prot_e = L3_PROT;
- prot_t = L4_PROT;
- break;
-#endif
- default:
- printk("new_pt_frame() called with invalid level number %d\n", level);
- do_exit();
- break;
- }
-
- /* Make PFN a page table page */
-#if defined(__x86_64__)
- tab = pte_to_virt(tab[l4_table_offset(pt_page)]);
-#endif
- tab = pte_to_virt(tab[l3_table_offset(pt_page)]);
-
- mmu_updates[0].ptr = (tab[l2_table_offset(pt_page)] & PAGE_MASK) +
- sizeof(pgentry_t) * l1_table_offset(pt_page);
- mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT |
- (prot_e & ~_PAGE_RW);
-
- if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 )
- {
- printk("ERROR: PTE for new page table page could not be updated\n");
- printk(" mmu_update failed with rc=%d\n", rc);
- do_exit();
- }
-
- /* Hook the new page table page into the hierarchy */
- mmu_updates[0].ptr =
- ((pgentry_t)prev_l_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
- mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | prot_t;
-
- if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 )
- {
- printk("ERROR: mmu_update failed with rc=%d\n", rc);
- do_exit();
- }
-
- *pt_pfn += 1;
-}
-
-/*
- * Checks if a pagetable frame is needed at 'level' to map a given
- * address. Note, this function is specific to the initial page table
- * building.
- */
-static int need_pt_frame(unsigned long va, int level)
-{
- unsigned long hyp_virt_start = HYPERVISOR_VIRT_START;
-#if defined(__x86_64__)
- unsigned long hyp_virt_end = HYPERVISOR_VIRT_END;
-#else
- unsigned long hyp_virt_end = 0xffffffff;
-#endif
-
- /* In general frames will _not_ be needed if they were already
- allocated to map the hypervisor into our VA space */
-#if defined(__x86_64__)
- if ( level == L3_FRAME )
- {
- if ( l4_table_offset(va) >=
- l4_table_offset(hyp_virt_start) &&
- l4_table_offset(va) <=
- l4_table_offset(hyp_virt_end))
- return 0;
- return 1;
- }
- else
-#endif
-
- if ( level == L2_FRAME )
- {
-#if defined(__x86_64__)
- if ( l4_table_offset(va) >=
- l4_table_offset(hyp_virt_start) &&
- l4_table_offset(va) <=
- l4_table_offset(hyp_virt_end))
-#endif
- if ( l3_table_offset(va) >=
- l3_table_offset(hyp_virt_start) &&
- l3_table_offset(va) <=
- l3_table_offset(hyp_virt_end))
- return 0;
-
- return 1;
- }
- else
- /* Always need l1 frames */
- if ( level == L1_FRAME )
- return 1;
-
- printk("ERROR: Unknown frame level %d, hypervisor %llx,%llx\n",
- level, hyp_virt_start, hyp_virt_end);
- return -1;
-}
-
-/*
- * Build the initial pagetable.
- */
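-/* Sketch of the loop below: for every page in [start_address, end_address)
- * it walks down from the top-level table (L4 on x86_64), calling
- * new_pt_frame() whenever the address crosses into an L3/L2/L1 region that
- * still needs a page-table frame, and batches up to L1_PAGETABLE_ENTRIES
- * PTE writes into a single mmu_update hypercall. */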
-static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
-{
- unsigned long start_address, end_address;
- unsigned long pfn_to_map, pt_pfn = *start_pfn;
- static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
- pgentry_t *tab = (pgentry_t *)start_info.pt_base, page;
- unsigned long pt_mfn = pfn_to_mfn(virt_to_pfn(start_info.pt_base));
- unsigned long offset;
- int count = 0;
- int rc;
-
- pfn_to_map =
- (start_info.nr_pt_frames - NOT_L1_FRAMES) * L1_PAGETABLE_ENTRIES;
-
- if ( *max_pfn >= virt_to_pfn(HYPERVISOR_VIRT_START) )
- {
- printk("WARNING: Mini-OS trying to use Xen virtual space. "
- "Truncating memory from %dMB to ",
- ((unsigned long)pfn_to_virt(*max_pfn) -
- (unsigned long)&_text)>>20);
- *max_pfn = virt_to_pfn(HYPERVISOR_VIRT_START - PAGE_SIZE);
- printk("%dMB\n",
- ((unsigned long)pfn_to_virt(*max_pfn) -
- (unsigned long)&_text)>>20);
- }
-
- start_address = (unsigned long)pfn_to_virt(pfn_to_map);
- end_address = (unsigned long)pfn_to_virt(*max_pfn);
-
- /* We worked out the virtual memory range to map, now mapping loop */
- printk("Mapping memory range 0x%lx - 0x%lx\n", start_address, end_address);
-
- while ( start_address < end_address )
- {
- tab = (pgentry_t *)start_info.pt_base;
- pt_mfn = pfn_to_mfn(virt_to_pfn(start_info.pt_base));
-
-#if defined(__x86_64__)
- offset = l4_table_offset(start_address);
- /* Need new L3 pt frame */
- if ( !(start_address & L3_MASK) )
- if ( need_pt_frame(start_address, L3_FRAME) )
- new_pt_frame(&pt_pfn, pt_mfn, offset, L3_FRAME);
-
- page = tab[offset];
- pt_mfn = pte_to_mfn(page);
- tab = to_virt(mfn_to_pfn(pt_mfn) << PAGE_SHIFT);
-#endif
- offset = l3_table_offset(start_address);
- /* Need new L2 pt frame */
- if ( !(start_address & L2_MASK) )
- if ( need_pt_frame(start_address, L2_FRAME) )
- new_pt_frame(&pt_pfn, pt_mfn, offset, L2_FRAME);
-
- page = tab[offset];
- pt_mfn = pte_to_mfn(page);
- tab = to_virt(mfn_to_pfn(pt_mfn) << PAGE_SHIFT);
- offset = l2_table_offset(start_address);
- /* Need new L1 pt frame */
- if ( !(start_address & L1_MASK) )
- if ( need_pt_frame(start_address, L1_FRAME) )
- new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME);
-
- page = tab[offset];
- pt_mfn = pte_to_mfn(page);
- offset = l1_table_offset(start_address);
-
- mmu_updates[count].ptr =
- ((pgentry_t)pt_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
- mmu_updates[count].val =
- (pgentry_t)pfn_to_mfn(pfn_to_map++) << PAGE_SHIFT | L1_PROT;
- count++;
- if ( count == L1_PAGETABLE_ENTRIES || pfn_to_map == *max_pfn )
- {
- rc = HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF);
- if ( rc < 0 )
- {
- printk("ERROR: build_pagetable(): PTE could not be updated\n");
- printk(" mmu_update failed with rc=%d\n", rc);
- do_exit();
- }
- count = 0;
- }
- start_address += PAGE_SIZE;
- }
-
- *start_pfn = pt_pfn;
-}
-
-/*
- * Mark portion of the address space read only.
- */
-extern struct shared_info shared_info;
-static void set_readonly(void *text, void *etext)
-{
- unsigned long start_address =
- ((unsigned long) text + PAGE_SIZE - 1) & PAGE_MASK;
- unsigned long end_address = (unsigned long) etext;
- static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
- pgentry_t *tab = (pgentry_t *)start_info.pt_base, page;
- unsigned long mfn = pfn_to_mfn(virt_to_pfn(start_info.pt_base));
- unsigned long offset;
- int count = 0;
- int rc;
-
- printk("setting %p-%p readonly\n", text, etext);
-
- while ( start_address + PAGE_SIZE <= end_address )
- {
- tab = (pgentry_t *)start_info.pt_base;
- mfn = pfn_to_mfn(virt_to_pfn(start_info.pt_base));
-
-#if defined(__x86_64__)
- offset = l4_table_offset(start_address);
- page = tab[offset];
- mfn = pte_to_mfn(page);
- tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
-#endif
- offset = l3_table_offset(start_address);
- page = tab[offset];
- mfn = pte_to_mfn(page);
- tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
- offset = l2_table_offset(start_address);
- page = tab[offset];
- mfn = pte_to_mfn(page);
- tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
-
- offset = l1_table_offset(start_address);
-
- if ( start_address != (unsigned long)&shared_info )
- {
- mmu_updates[count].ptr =
- ((pgentry_t)mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
- mmu_updates[count].val = tab[offset] & ~_PAGE_RW;
- count++;
- }
- else
- printk("skipped %p\n", start_address);
-
- start_address += PAGE_SIZE;
-
- if ( count == L1_PAGETABLE_ENTRIES ||
- start_address + PAGE_SIZE > end_address )
- {
- rc = HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF);
- if ( rc < 0 )
- {
- printk("ERROR: set_readonly(): PTE could not be updated\n");
- do_exit();
- }
- count = 0;
- }
- }
-
- {
- mmuext_op_t op = {
- .cmd = MMUEXT_TLB_FLUSH_ALL,
- };
- int count;
- HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF);
- }
-}
-
-/*
- * A simple memory-testing function. Writes a value derived from each
- * address to every word in the given range and reads it back. If
- * verbose, prints page walks for some VAs.
- *
- * Gives up once MEM_TEST_MAX_ERRORS errors have been seen.
- */
-#define MEM_TEST_MAX_ERRORS 10
-int mem_test(unsigned long *start_va, unsigned long *end_va, int verbose)
-{
- unsigned long mask = 0x10000;
- unsigned long *pointer;
- int error_count = 0;
-
- /* write values and print page walks */
- if ( verbose && (((unsigned long)start_va) & 0xfffff) )
- {
- printk("MemTest Start: 0x%lx\n", start_va);
- page_walk((unsigned long)start_va);
- }
- for ( pointer = start_va; pointer < end_va; pointer++ )
- {
- if ( verbose && !(((unsigned long)pointer) & 0xfffff) )
- {
- printk("Writing to %lx\n", pointer);
- page_walk((unsigned long)pointer);
- }
- *pointer = (unsigned long)pointer & ~mask;
- }
- if ( verbose && (((unsigned long)end_va) & 0xfffff) )
- {
- printk("MemTest End: %lx\n", end_va-1);
- page_walk((unsigned long)end_va-1);
- }
-
- /* verify values */
- for ( pointer = start_va; pointer < end_va; pointer++ )
- {
- if ( ((unsigned long)pointer & ~mask) != *pointer )
- {
- printk("Read error at 0x%lx. Read: 0x%lx, should read 0x%lx\n",
- (unsigned long)pointer, *pointer,
- ((unsigned long)pointer & ~mask));
- error_count++;
- if ( error_count >= MEM_TEST_MAX_ERRORS )
- {
- printk("mem_test: too many errors\n");
- return -1;
- }
- }
- }
- return 0;
-}
-
-
-/*
- * get the PTE for virtual address va if it exists. Otherwise NULL.
- */
-static pgentry_t *get_pgt(unsigned long va)
-{
- unsigned long mfn;
- pgentry_t *tab;
- unsigned offset;
-
- tab = (pgentry_t *)start_info.pt_base;
- mfn = virt_to_mfn(start_info.pt_base);
-
-#if defined(__x86_64__)
- offset = l4_table_offset(va);
- if ( !(tab[offset] & _PAGE_PRESENT) )
- return NULL;
- mfn = pte_to_mfn(tab[offset]);
- tab = mfn_to_virt(mfn);
-#endif
- offset = l3_table_offset(va);
- if ( !(tab[offset] & _PAGE_PRESENT) )
- return NULL;
- mfn = pte_to_mfn(tab[offset]);
- tab = mfn_to_virt(mfn);
- offset = l2_table_offset(va);
- if ( !(tab[offset] & _PAGE_PRESENT) )
- return NULL;
- mfn = pte_to_mfn(tab[offset]);
- tab = mfn_to_virt(mfn);
- offset = l1_table_offset(va);
- return &tab[offset];
-}
-
-
-/*
- * return a valid PTE for a given virtual address. If PTE does not exist,
- * allocate page-table pages.
- */
-pgentry_t *need_pgt(unsigned long va)
-{
- unsigned long pt_mfn;
- pgentry_t *tab;
- unsigned long pt_pfn;
- unsigned offset;
-
- tab = (pgentry_t *)start_info.pt_base;
- pt_mfn = virt_to_mfn(start_info.pt_base);
-
-#if defined(__x86_64__)
- offset = l4_table_offset(va);
- if ( !(tab[offset] & _PAGE_PRESENT) )
- {
- pt_pfn = virt_to_pfn(alloc_page());
- new_pt_frame(&pt_pfn, pt_mfn, offset, L3_FRAME);
- }
- ASSERT(tab[offset] & _PAGE_PRESENT);
- pt_mfn = pte_to_mfn(tab[offset]);
- tab = mfn_to_virt(pt_mfn);
-#endif
- offset = l3_table_offset(va);
- if ( !(tab[offset] & _PAGE_PRESENT) )
- {
- pt_pfn = virt_to_pfn(alloc_page());
- new_pt_frame(&pt_pfn, pt_mfn, offset, L2_FRAME);
- }
- ASSERT(tab[offset] & _PAGE_PRESENT);
- pt_mfn = pte_to_mfn(tab[offset]);
- tab = mfn_to_virt(pt_mfn);
- offset = l2_table_offset(va);
- if ( !(tab[offset] & _PAGE_PRESENT) )
- {
- pt_pfn = virt_to_pfn(alloc_page());
- new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME);
- }
- ASSERT(tab[offset] & _PAGE_PRESENT);
- pt_mfn = pte_to_mfn(tab[offset]);
- tab = mfn_to_virt(pt_mfn);
-
- offset = l1_table_offset(va);
- return &tab[offset];
-}
-
-/*
- * Reserve an area of virtual address space for on-demand mappings and the heap.
- */
-static unsigned long demand_map_area_start;
-#ifdef __x86_64__
-#define DEMAND_MAP_PAGES ((128ULL << 30) / PAGE_SIZE)
-#else
-#define DEMAND_MAP_PAGES ((2ULL << 30) / PAGE_SIZE)
-#endif
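-/* With 4 KiB pages this reserves 32M page slots (128 GiB of virtual address
- * space) on x86_64 and 512K slots (2 GiB) on x86_32. */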
-
-#ifndef HAVE_LIBC
-#define HEAP_PAGES 0
-#else
-unsigned long heap, brk, heap_mapped, heap_end;
-#ifdef __x86_64__
-#define HEAP_PAGES ((128ULL << 30) / PAGE_SIZE)
-#else
-#define HEAP_PAGES ((1ULL << 30) / PAGE_SIZE)
-#endif
-#endif
-
-void arch_init_demand_mapping_area(unsigned long cur_pfn)
-{
- cur_pfn++;
-
- demand_map_area_start = (unsigned long) pfn_to_virt(cur_pfn);
- cur_pfn += DEMAND_MAP_PAGES;
- printk("Demand map pfns at %lx-%lx.\n",
- demand_map_area_start, pfn_to_virt(cur_pfn));
-
-#ifdef HAVE_LIBC
- cur_pfn++;
- heap_mapped = brk = heap = (unsigned long) pfn_to_virt(cur_pfn);
- cur_pfn += HEAP_PAGES;
- heap_end = (unsigned long) pfn_to_virt(cur_pfn);
- printk("Heap resides at %lx-%lx.\n", brk, heap_end);
-#endif
-}
-
-unsigned long allocate_ondemand(unsigned long n, unsigned long alignment)
-{
- unsigned long x;
- unsigned long y = 0;
-
- /* Find a properly aligned run of n contiguous frames */
- for ( x = 0;
- x <= DEMAND_MAP_PAGES - n;
- x = (x + y + 1 + alignment - 1) & ~(alignment - 1) )
- {
- unsigned long addr = demand_map_area_start + x * PAGE_SIZE;
- pgentry_t *pgt = get_pgt(addr);
- for ( y = 0; y < n; y++, addr += PAGE_SIZE )
- {
- if ( !(addr & L1_MASK) )
- pgt = get_pgt(addr);
- if ( pgt )
- {
- if ( *pgt & _PAGE_PRESENT )
- break;
- pgt++;
- }
- }
- if ( y == n )
- break;
- }
- if ( y != n )
- {
- printk("Failed to find %ld frames!\n", n);
- return 0;
- }
- return demand_map_area_start + x * PAGE_SIZE;
-}
-
-/*
- * Map an array of MFNs into virtual address space starting at va:
- * maps mfns[i*stride] + i*incr for i in 0..n-1.
- */
-#define MAP_BATCH ((STACK_SIZE / 2) / sizeof(mmu_update_t))
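-/* For instance, map_frames_ex(&mfn, n, 0, 1, ...) maps the n consecutive
- * machine frames mfn, mfn+1, ..., mfn+n-1 (stride 0, incr 1), which is how
- * __do_ioremap() above uses it. */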
-void do_map_frames(unsigned long va,
- const unsigned long *mfns, unsigned long n,
- unsigned long stride, unsigned long incr,
- domid_t id, int *err, unsigned long prot)
-{
- pgentry_t *pgt = NULL;
- unsigned long done = 0;
- unsigned long i;
- int rc;
-
- if ( !mfns )
- {
- printk("do_map_frames: no mfns supplied\n");
- return;
- }
- DEBUG("va=%p n=0x%lx, mfns[0]=0x%lx stride=0x%lx incr=0x%lx prot=0x%lx\n",
- va, n, mfns[0], stride, incr, prot);
-
- if ( err )
- memset(err, 0x00, n * sizeof(int));
- while ( done < n )
- {
- unsigned long todo;
-
- if ( err )
- todo = 1;
- else
- todo = n - done;
-
- if ( todo > MAP_BATCH )
- todo = MAP_BATCH;
-
- {
- mmu_update_t mmu_updates[todo];
-
- for ( i = 0; i < todo; i++, va += PAGE_SIZE, pgt++)
- {
- if ( !pgt || !(va & L1_MASK) )
- pgt = need_pgt(va);
-
- mmu_updates[i].ptr = virt_to_mach(pgt) | MMU_NORMAL_PT_UPDATE;
- mmu_updates[i].val = ((pgentry_t)(mfns[(done + i) * stride] +
- (done + i) * incr)
- << PAGE_SHIFT) | prot;
- }
-
- rc = HYPERVISOR_mmu_update(mmu_updates, todo, NULL, id);
- if ( rc < 0 )
- {
- if (err)
- err[done * stride] = rc;
- else {
- printk("Map %ld (%lx, ...) at %p failed: %d.\n",
- todo, mfns[done * stride] + done * incr, va, rc);
- do_exit();
- }
- }
- }
- done += todo;
- }
-}
-
-/*
- * Map an array of MFNs contiguously into virtual address space. Virtual
- * addresses are allocated from the on demand area.
- */
-void *map_frames_ex(const unsigned long *mfns, unsigned long n,
- unsigned long stride, unsigned long incr,
- unsigned long alignment,
- domid_t id, int *err, unsigned long prot)
-{
- unsigned long va = allocate_ondemand(n, alignment);
-
- if ( !va )
- return NULL;
-
- do_map_frames(va, mfns, n, stride, incr, id, err, prot);
-
- return (void *)va;
-}
-
-/*
- * Unmap num_frames frames mapped at virtual address va.
- */
-#define UNMAP_BATCH ((STACK_SIZE / 2) / sizeof(multicall_entry_t))
-int unmap_frames(unsigned long va, unsigned long num_frames)
-{
- int n = UNMAP_BATCH;
- multicall_entry_t call[n];
- int ret;
- int i;
-
- ASSERT(!((unsigned long)va & ~PAGE_MASK));
-
- DEBUG("va=%p, num=0x%lx\n", va, num_frames);
-
- while ( num_frames ) {
- if ( n > num_frames )
- n = num_frames;
-
- for ( i = 0; i < n; i++ )
- {
- int arg = 0;
- /* simply update the PTE for the VA and invalidate TLB */
- call[i].op = __HYPERVISOR_update_va_mapping;
- call[i].args[arg++] = va;
- call[i].args[arg++] = 0;
-#ifdef __i386__
- call[i].args[arg++] = 0;
-#endif
- call[i].args[arg++] = UVMF_INVLPG;
-
- va += PAGE_SIZE;
- }
-
- ret = HYPERVISOR_multicall(call, n);
- if ( ret )
- {
- printk("update_va_mapping hypercall failed with rc=%d.\n", ret);
- return -ret;
- }
-
- for ( i = 0; i < n; i++ )
- {
- if ( call[i].result )
- {
- printk("update_va_mapping failed for with rc=%d.\n", ret);
- return -(call[i].result);
- }
- }
- num_frames -= n;
- }
- return 0;
-}
-
-/*
- * Allocate pages which are contiguous in machine memory.
- * Returns a VA to where they are mapped or 0 on failure.
- *
- * addr_bits indicates if the region has restrictions on where it is
- * located. Typical values are 32 (if for example PCI devices can't access
- * 64bit memory) or 0 for no restrictions.
- *
- * Allocated pages can be freed using the page allocators free_pages()
- * function.
- *
- * based on Linux function xen_create_contiguous_region()
- */
-#define MAX_CONTIG_ORDER 9 /* 2MB */
-unsigned long alloc_contig_pages(int order, unsigned int addr_bits)
-{
- unsigned long in_va, va;
- unsigned long in_frames[1UL << order], out_frames, mfn;
- multicall_entry_t call[1UL << order];
- unsigned int i, num_pages = 1UL << order;
- int ret, exch_success;
-
- /* pass in num_pages 'extents' of size 1 and
- * request 1 extent of size 'order' */
- struct xen_memory_exchange exchange = {
- .in = {
- .nr_extents = num_pages,
- .extent_order = 0,
- .domid = DOMID_SELF
- },
- .out = {
- .nr_extents = 1,
- .extent_order = order,
- .address_bits = addr_bits,
- .domid = DOMID_SELF
- },
- .nr_exchanged = 0
- };
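- /* E.g. for order 2 this offers Xen 4 single-page extents and asks for
- * one machine-contiguous 4-page extent back; exchange.nr_exchanged
- * reports how many input extents were actually consumed. */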
-
- if ( order > MAX_CONTIG_ORDER )
- {
- printk("alloc_contig_pages: order too large 0x%x > 0x%x\n",
- order, MAX_CONTIG_ORDER);
- return 0;
- }
-
- /* Allocate some potentially discontiguous pages */
- in_va = alloc_pages(order);
- if ( !in_va )
- {
- printk("alloc_contig_pages: could not get enough pages (order=0x%x\n",
- order);
- return 0;
- }
-
- /* set up arguments for the exchange hypercall */
- set_xen_guest_handle(exchange.in.extent_start, in_frames);
- set_xen_guest_handle(exchange.out.extent_start, &out_frames);
-
- /* unmap current frames, keep a list of MFNs */
- for ( i = 0; i < num_pages; i++ )
- {
- int arg = 0;
-
- va = in_va + (PAGE_SIZE * i);
- in_frames[i] = virt_to_mfn(va);
-
- /* update P2M mapping */
- phys_to_machine_mapping[virt_to_pfn(va)] = INVALID_P2M_ENTRY;
-
- /* build multi call */
- call[i].op = __HYPERVISOR_update_va_mapping;
- call[i].args[arg++] = va;
- call[i].args[arg++] = 0;
-#ifdef __i386__
- call[i].args[arg++] = 0;
-#endif
- call[i].args[arg++] = UVMF_INVLPG;
- }
-
- ret = HYPERVISOR_multicall(call, i);
- if ( ret )
- {
- printk("Odd, update_va_mapping hypercall failed with rc=%d.\n", ret);
- return 0;
- }
-
- /* try getting a contig range of MFNs */
- out_frames = virt_to_pfn(in_va); /* PFNs to populate */
- ret = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
- if ( ret ) {
- printk("mem exchanged order=0x%x failed with rc=%d, nr_exchanged=%d\n",
- order, ret, exchange.nr_exchanged);
- /* we still need to return the allocated pages above to the pool
- * ie. map them back into the 1:1 mapping etc. so we continue but
- * in the end return the pages to the page allocator and return 0. */
- exch_success = 0;
- }
- else
- exch_success = 1;
-
- /* map frames into 1:1 and update p2m */
- for ( i = 0; i < num_pages; i++ )
- {
- int arg = 0;
- pte_t pte;
-
- va = in_va + (PAGE_SIZE * i);
- mfn = i < exchange.nr_exchanged ? (out_frames + i) : in_frames[i];
- pte = __pte(mfn << PAGE_SHIFT | L1_PROT);
-
- /* update P2M mapping */
- phys_to_machine_mapping[virt_to_pfn(va)] = mfn;
-
- /* build multi call */
- call[i].op = __HYPERVISOR_update_va_mapping;
- call[i].args[arg++] = va;
-#ifdef __x86_64__
- call[i].args[arg++] = (pgentry_t)pte.pte;
-#else
- call[i].args[arg++] = pte.pte_low;
- call[i].args[arg++] = pte.pte_high;
-#endif
- call[i].args[arg++] = UVMF_INVLPG;
- }
- ret = HYPERVISOR_multicall(call, i);
- if ( ret )
- {
- printk("update_va_mapping hypercall no. 2 failed with rc=%d.\n", ret);
- return 0;
- }
-
- if ( !exch_success )
- {
- /* since the exchange failed we just free the pages as well */
- free_pages((void *) in_va, order);
- return 0;
- }
-
- return in_va;
-}
-
-/*
- * Clear some of the bootstrap memory
- */
-static void clear_bootstrap(void)
-{
- pte_t nullpte = { };
- int rc;
-
- /* Use first page as the CoW zero page */
- memset(&_text, 0, PAGE_SIZE);
- mfn_zero = virt_to_mfn((unsigned long) &_text);
- if ( (rc = HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG)) )
- printk("Unable to unmap NULL page. rc=%d\n", rc);
-}
-
-void arch_init_p2m(unsigned long max_pfn)
-{
-#ifdef __x86_64__
-#define L1_P2M_SHIFT 9
-#define L2_P2M_SHIFT 18
-#define L3_P2M_SHIFT 27
-#else
-#define L1_P2M_SHIFT 10
-#define L2_P2M_SHIFT 20
-#define L3_P2M_SHIFT 30
-#endif
-#define L1_P2M_ENTRIES (1 << L1_P2M_SHIFT)
-#define L2_P2M_ENTRIES (1 << (L2_P2M_SHIFT - L1_P2M_SHIFT))
-#define L3_P2M_ENTRIES (1 << (L3_P2M_SHIFT - L2_P2M_SHIFT))
-#define L1_P2M_MASK (L1_P2M_ENTRIES - 1)
-#define L2_P2M_MASK (L2_P2M_ENTRIES - 1)
-#define L3_P2M_MASK (L3_P2M_ENTRIES - 1)
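-/* With 4 KiB pages each list level fits exactly one page: on x86_64, 512
- * 8-byte entries per level (9 bits each, covering 2^27 PFNs); on x86_32,
- * 1024 4-byte entries per level (10 bits each). */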
-
- unsigned long *l1_list = NULL, *l2_list = NULL, *l3_list;
- unsigned long pfn;
-
- l3_list = (unsigned long *)alloc_page();
- for ( pfn=0; pfn<max_pfn; pfn++ )
- {
- if ( !(pfn % (L1_P2M_ENTRIES * L2_P2M_ENTRIES)) )
- {
- l2_list = (unsigned long*)alloc_page();
- if ( (pfn >> L3_P2M_SHIFT) > 0 )
- {
- printk("Error: Too many pfns.\n");
- do_exit();
- }
- l3_list[(pfn >> L2_P2M_SHIFT)] = virt_to_mfn(l2_list);
- }
- if ( !(pfn % (L1_P2M_ENTRIES)) )
- {
- l1_list = (unsigned long*)alloc_page();
- l2_list[(pfn >> L1_P2M_SHIFT) & L2_P2M_MASK] =
- virt_to_mfn(l1_list);
- }
-
- l1_list[pfn & L1_P2M_MASK] = pfn_to_mfn(pfn);
- }
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(l3_list);
- HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
-}
-
-void arch_init_mm(unsigned long* start_pfn_p, unsigned long* max_pfn_p)
-{
- unsigned long start_pfn, max_pfn;
-
- printk(" _text: %p(VA)\n", &_text);
- printk(" _etext: %p(VA)\n", &_etext);
- printk(" _erodata: %p(VA)\n", &_erodata);
- printk(" _edata: %p(VA)\n", &_edata);
- printk("stack start: %p(VA)\n", stack);
- printk(" _end: %p(VA)\n", &_end);
-
- /* First page follows page table pages and 3 more pages (store page etc) */
- start_pfn = PFN_UP(to_phys(start_info.pt_base)) +
- start_info.nr_pt_frames + 3;
- max_pfn = start_info.nr_pages;
-
- /* We need room for demand mapping and heap, clip available memory */
-#if defined(__i386__)
- {
- unsigned long virt_pfns = 1 + DEMAND_MAP_PAGES + 1 + HEAP_PAGES;
- if (max_pfn + virt_pfns >= 0x100000)
- max_pfn = 0x100000 - virt_pfns - 1;
- }
-#endif
-
- printk(" start_pfn: %lx\n", start_pfn);
- printk(" max_pfn: %lx\n", max_pfn);
-
- build_pagetable(&start_pfn, &max_pfn);
- clear_bootstrap();
- set_readonly(&_text, &_erodata);
-
- *start_pfn_p = start_pfn;
- *max_pfn_p = max_pfn;
-}
-
-grant_entry_t *arch_init_gnttab(int nr_grant_frames)
-{
- struct gnttab_setup_table setup;
- unsigned long frames[nr_grant_frames];
-
- setup.dom = DOMID_SELF;
- setup.nr_frames = nr_grant_frames;
- set_xen_guest_handle(setup.frame_list, frames);
-
- HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
- return map_frames(frames, nr_grant_frames);
-}
diff --git a/extras/mini-os/arch/x86/sched.c b/extras/mini-os/arch/x86/sched.c
deleted file mode 100644
index e4a3dc2..0000000
--- a/extras/mini-os/arch/x86/sched.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- ****************************************************************************
- *
- * File: sched.c
- * Author: Grzegorz Milos
- * Changes: Robert Kaiser
- *
- * Date: Aug 2005
- *
- * Environment: Xen Minimal OS
- * Description: simple scheduler for Mini-OS
- *
- * The scheduler is non-preemptive (cooperative), and schedules
- * according to a round-robin algorithm.
- *
- ****************************************************************************
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <mini-os/os.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/time.h>
-#include <mini-os/mm.h>
-#include <mini-os/types.h>
-#include <mini-os/lib.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/list.h>
-#include <mini-os/sched.h>
-#include <mini-os/semaphore.h>
-
-
-#ifdef SCHED_DEBUG
-#define DEBUG(_f, _a...) \
- printk("MINI_OS(file=sched.c, line=%d) " _f "\n", __LINE__, ## _a)
-#else
-#define DEBUG(_f, _a...) ((void)0)
-#endif
-
-
-void dump_stack(struct thread *thread)
-{
- unsigned long *bottom = (unsigned long *)(thread->stack + STACK_SIZE);
- unsigned long *pointer = (unsigned long *)thread->sp;
- int count;
- if(thread == current)
- {
-#ifdef __i386__
- asm("movl %%esp,%0"
- : "=r"(pointer));
-#else
- asm("movq %%rsp,%0"
- : "=r"(pointer));
-#endif
- }
- printk("The stack for \"%s\"\n", thread->name);
- for(count = 0; count < 25 && pointer < bottom; count ++)
- {
- printk("[0x%lx] 0x%lx\n", pointer, *pointer);
- pointer++;
- }
-
- if(pointer < bottom) printk(" ... continues.\n");
-}
-
-/* Gets run when a new thread is scheduled the first time ever,
- defined in x86_[32/64].S */
-extern void thread_starter(void);
-
-/* Pushes the specified value onto the stack of the specified thread */
-static void stack_push(struct thread *thread, unsigned long value)
-{
- thread->sp -= sizeof(unsigned long);
- *((unsigned long *)thread->sp) = value;
-}
-
-/* Architecture specific setup of thread creation */
-struct thread* arch_create_thread(char *name, void (*function)(void *),
- void *data)
-{
- struct thread *thread;
-
- thread = xmalloc(struct thread);
- /* We can't use lazy allocation here since the trap handler runs on the stack */
- thread->stack = (char *)alloc_pages(STACK_SIZE_PAGE_ORDER);
- thread->name = name;
- printk("Thread \"%s\": pointer: 0x%lx, stack: 0x%lx\n", name, thread,
- thread->stack);
-
- thread->sp = (unsigned long)thread->stack + STACK_SIZE;
- /* Save pointer to the thread on the stack, used by current macro */
- *((unsigned long *)thread->stack) = (unsigned long)thread;
-
- /* Must ensure that (%rsp + 8) is 16-byte aligned at the start of thread_starter. */
- thread->sp -= sizeof(unsigned long);
-
- stack_push(thread, (unsigned long) function);
- stack_push(thread, (unsigned long) data);
- thread->ip = (unsigned long) thread_starter;
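- /* The initial stack now holds, from the top down: one word of padding
- * (for the 16-byte alignment noted above), function, then data;
- * thread->sp points at data and thread->ip at thread_starter, which
- * picks both up on the first switch. */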
- return thread;
-}
-
-void run_idle_thread(void)
-{
- /* Switch stacks and run the thread */
-#if defined(__i386__)
- __asm__ __volatile__("mov %0,%%esp\n\t"
- "push %1\n\t"
- "ret"
- :"=m" (idle_thread->sp)
- :"m" (idle_thread->ip));
-#elif defined(__x86_64__)
- __asm__ __volatile__("mov %0,%%rsp\n\t"
- "push %1\n\t"
- "ret"
- :"=m" (idle_thread->sp)
- :"m" (idle_thread->ip));
-#endif
-}
-
-
-
diff --git a/extras/mini-os/arch/x86/setup.c b/extras/mini-os/arch/x86/setup.c
deleted file mode 100644
index 5e87dd1..0000000
--- a/extras/mini-os/arch/x86/setup.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/******************************************************************************
- * common.c
- *
- * Common stuff special to x86 goes here.
- *
- * Copyright (c) 2002-2003, K A Fraser & R Neugebauer
- * Copyright (c) 2005, Grzegorz Milos, Intel Research Cambridge
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include <mini-os/os.h>
-#include <mini-os/lib.h> /* for printk, memcpy */
-#include <mini-os/kernel.h>
-#include <xen/xen.h>
-
-/*
- * Shared page for communicating with the hypervisor.
- * Events flags go here, for example.
- */
-shared_info_t *HYPERVISOR_shared_info;
-
-/*
- * This structure contains start-of-day info, such as pagetable base pointer,
- * address of the shared_info structure, and things like that.
- */
-union start_info_union start_info_union;
-
-/*
- * Just allocate the kernel stack here. SS:ESP is set up to point here
- * in head.S.
- */
-char stack[2*STACK_SIZE];
-
-extern char shared_info[PAGE_SIZE];
-
-/* Assembler interface fns in entry.S. */
-void hypervisor_callback(void);
-void failsafe_callback(void);
-
-#if defined(__x86_64__)
-#define __pte(x) ((pte_t) { (x) } )
-#else
-#define __pte(x) ({ unsigned long long _x = (x); \
- ((pte_t) {(unsigned long)(_x), (unsigned long)(_x>>32)}); })
-#endif
-
-static
-shared_info_t *map_shared_info(unsigned long pa)
-{
- int rc;
-
- if ( (rc = HYPERVISOR_update_va_mapping(
- (unsigned long)shared_info, __pte(pa | 7), UVMF_INVLPG)) )
- {
- printk("Failed to map shared_info!! rc=%d\n", rc);
- do_exit();
- }
- return (shared_info_t *)shared_info;
-}
-
-static inline void fpu_init(void) {
- asm volatile("fninit");
-}
-
-#ifdef __SSE__
-static inline void sse_init(void) {
- unsigned long status = 0x1f80;
- asm volatile("ldmxcsr %0" : : "m" (status));
-}
-#else
-#define sse_init()
-#endif
-
-
-/*
- * INITIAL C ENTRY POINT.
- */
-void
-arch_init(start_info_t *si)
-{
- static char hello[] = "Bootstrapping...\n";
-
- (void)HYPERVISOR_console_io(CONSOLEIO_write, strlen(hello), hello);
-
- trap_init();
-
- /* Initialize floating point unit */
- fpu_init();
-
- /* Initialize SSE */
- sse_init();
-
- /* Copy the start_info struct to a globally-accessible area. */
- /* WARN: don't do printk before here, it uses information from
- shared_info. Use xprintk instead. */
- memcpy(&start_info, si, sizeof(*si));
-
- /* print out some useful information */
- printk("Xen Minimal OS!\n");
- printk(" start_info: %p(VA)\n", si);
- printk(" nr_pages: 0x%lx\n", si->nr_pages);
- printk(" shared_inf: 0x%08lx(MA)\n", si->shared_info);
- printk(" pt_base: %p(VA)\n", (void *)si->pt_base);
- printk("nr_pt_frames: 0x%lx\n", si->nr_pt_frames);
- printk(" mfn_list: %p(VA)\n", (void *)si->mfn_list);
- printk(" mod_start: 0x%lx(VA)\n", si->mod_start);
- printk(" mod_len: %lu\n", si->mod_len);
- printk(" flags: 0x%x\n", (unsigned int)si->flags);
- printk(" cmd_line: %s\n",
- si->cmd_line ? (const char *)si->cmd_line : "NULL");
- printk(" stack: %p-%p\n", stack, stack + sizeof(stack));
-
- /* set up minimal memory infos */
- phys_to_machine_mapping = (unsigned long *)start_info.mfn_list;
-
- /* Grab the shared_info pointer and put it in a safe place. */
- HYPERVISOR_shared_info = map_shared_info(start_info.shared_info);
-
- /* Set up event and failsafe callback addresses. */
-#ifdef __i386__
- HYPERVISOR_set_callbacks(
- __KERNEL_CS, (unsigned long)hypervisor_callback,
- __KERNEL_CS, (unsigned long)failsafe_callback);
-#else
- HYPERVISOR_set_callbacks(
- (unsigned long)hypervisor_callback,
- (unsigned long)failsafe_callback, 0);
-#endif
-
- start_kernel();
-}
-
-void
-arch_fini(void)
-{
- /* Reset traps */
- trap_fini();
-
-#ifdef __i386__
- HYPERVISOR_set_callbacks(0, 0, 0, 0);
-#else
- HYPERVISOR_set_callbacks(0, 0, 0);
-#endif
-}
-
-void
-arch_do_exit(void)
-{
- stack_walk();
-}
diff --git a/extras/mini-os/arch/x86/time.c b/extras/mini-os/arch/x86/time.c
deleted file mode 100644
index 2c8d033..0000000
--- a/extras/mini-os/arch/x86/time.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- * (C) 2002-2003 - Keir Fraser - University of Cambridge
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- * (C) 2006 - Robert Kaiser - FH Wiesbaden
- ****************************************************************************
- *
- * File: time.c
- * Author: Rolf Neugebauer and Keir Fraser
- * Changes: Grzegorz Milos
- *
- * Description: Simple time and timer functions
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-
-#include <mini-os/os.h>
-#include <mini-os/traps.h>
-#include <mini-os/types.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/events.h>
-#include <mini-os/time.h>
-#include <mini-os/lib.h>
-
-/************************************************************************
- * Time functions
- *************************************************************************/
-
-/* These are periodically updated in shared_info, and then copied here. */
-struct shadow_time_info {
- uint64_t tsc_timestamp; /* TSC at last update of time vals. */
- uint64_t system_timestamp; /* Time, in nanosecs, since boot. */
- uint32_t tsc_to_nsec_mul;
- uint32_t tsc_to_usec_mul;
- int tsc_shift;
- uint32_t version;
-};
-static struct timespec shadow_ts;
-static uint32_t shadow_ts_version;
-
-static struct shadow_time_info shadow;
-
-
-#ifndef rmb
-#define rmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory")
-#endif
-
-#define HANDLE_USEC_OVERFLOW(_tv) \
- do { \
- while ( (_tv)->tv_usec >= 1000000 ) \
- { \
- (_tv)->tv_usec -= 1000000; \
- (_tv)->tv_sec++; \
- } \
- } while ( 0 )
-
-static inline int time_values_up_to_date(void)
-{
- struct vcpu_time_info *src = &HYPERVISOR_shared_info->vcpu_info[0].time;
-
- return (shadow.version == src->version);
-}
-
-
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
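-/* In other words: product = ((delta << shift) * mul_frac) >> 32, treating
- * mul_frac as a 0.32 fixed-point fraction. This is how the
- * tsc_to_system_mul/tsc_shift pair copied from Xen converts TSC ticks
- * to nanoseconds. */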
-static inline uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
-{
- uint64_t product;
-#ifdef __i386__
- uint32_t tmp1, tmp2;
-#endif
-
- if ( shift < 0 )
- delta >>= -shift;
- else
- delta <<= shift;
-
-#ifdef __i386__
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "add %4,%%eax ; "
- "xor %5,%5 ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)), "2" (mul_frac) );
-#else
- __asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((uint64_t)mul_frac) );
-#endif
-
- return product;
-}
-
-
-static unsigned long get_nsec_offset(void)
-{
- uint64_t now, delta;
- rdtscll(now);
- delta = now - shadow.tsc_timestamp;
- return scale_delta(delta, shadow.tsc_to_nsec_mul, shadow.tsc_shift);
-}
-
-
-static void get_time_values_from_xen(void)
-{
- struct vcpu_time_info *src = &HYPERVISOR_shared_info->vcpu_info[0].time;
-
- do {
- shadow.version = src->version;
- rmb();
- shadow.tsc_timestamp = src->tsc_timestamp;
- shadow.system_timestamp = src->system_time;
- shadow.tsc_to_nsec_mul = src->tsc_to_system_mul;
- shadow.tsc_shift = src->tsc_shift;
- rmb();
- }
- while ((src->version & 1) | (shadow.version ^ src->version));
-
- shadow.tsc_to_usec_mul = shadow.tsc_to_nsec_mul / 1000;
-}
-
-
-
-
-/* monotonic_clock(): returns # of nanoseconds passed since time_init()
- * Note: This function is required to return accurate
- * time even in the absence of multiple timer ticks.
- */
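-/* The loop below snapshots shadow.version first and retries whenever the
- * shadow copy is refreshed underneath it, so system_timestamp and the TSC
- * offset always come from one consistent snapshot. */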
-uint64_t monotonic_clock(void)
-{
- uint64_t time;
- uint32_t local_time_version;
-
- do {
- local_time_version = shadow.version;
- rmb();
- time = shadow.system_timestamp + get_nsec_offset();
- if (!time_values_up_to_date())
- get_time_values_from_xen();
- rmb();
- } while (local_time_version != shadow.version);
-
- return time;
-}
-
-static void update_wallclock(void)
-{
- shared_info_t *s = HYPERVISOR_shared_info;
-
- do {
- shadow_ts_version = s->wc_version;
- rmb();
- shadow_ts.tv_sec = s->wc_sec;
- shadow_ts.tv_nsec = s->wc_nsec;
- rmb();
- }
- while ((s->wc_version & 1) | (shadow_ts_version ^ s->wc_version));
-}
-
-
-int gettimeofday(struct timeval *tv, void *tz)
-{
- uint64_t nsec = monotonic_clock();
- nsec += shadow_ts.tv_nsec;
-
-
- tv->tv_sec = shadow_ts.tv_sec;
- tv->tv_sec += NSEC_TO_SEC(nsec);
- tv->tv_usec = NSEC_TO_USEC(nsec % 1000000000UL);
-
- return 0;
-}
-
-
-void block_domain(s_time_t until)
-{
- struct timeval tv;
- gettimeofday(&tv, NULL);
- ASSERT(irqs_disabled());
- if(monotonic_clock() < until)
- {
- HYPERVISOR_set_timer_op(until);
- HYPERVISOR_sched_op(SCHEDOP_block, 0);
- local_irq_disable();
- }
-}
-
-
-/*
- * Just a dummy
- */
-static void timer_handler(evtchn_port_t ev, struct pt_regs *regs, void *ign)
-{
- get_time_values_from_xen();
- update_wallclock();
-}
-
-
-
-static evtchn_port_t port;
-void init_time(void)
-{
- printk("Initialising timer interface\n");
- port = bind_virq(VIRQ_TIMER, &timer_handler, NULL);
- unmask_evtchn(port);
-}
-
-void fini_time(void)
-{
- /* Clear any pending timer */
- HYPERVISOR_set_timer_op(0);
- unbind_evtchn(port);
-}
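
[Editor's note] The time handling above boils down to: system time equals
shadow.system_timestamp plus the TSC delta scaled by a 32-bit fixed-point
fraction. A minimal C sketch of scale_delta()'s arithmetic, assuming a
compiler that provides unsigned __int128 (the inline assembly above computes
the same 64x32 -> 96-bit product); sketch only, not part of the original file:

    #include <stdint.h>

    /* Same math as scale_delta() above, without inline asm. */
    static uint64_t scale_delta_sketch(uint64_t delta, uint32_t mul_frac,
                                       int shift)
    {
        if (shift < 0)
            delta >>= -shift;
        else
            delta <<= shift;
        /* Multiply by the 0.32 fixed-point fraction and keep the high
         * 64 bits of the 96-bit product. */
        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
    }
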
diff --git a/extras/mini-os/arch/x86/traps.c b/extras/mini-os/arch/x86/traps.c
deleted file mode 100644
index 516d133..0000000
--- a/extras/mini-os/arch/x86/traps.c
+++ /dev/null
@@ -1,333 +0,0 @@
-
-#include <mini-os/os.h>
-#include <mini-os/traps.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/mm.h>
-#include <mini-os/lib.h>
-#include <mini-os/sched.h>
-
-/*
- * These are assembler stubs in entry.S.
- * They are the actual entry points for virtual exceptions.
- */
-void divide_error(void);
-void debug(void);
-void int3(void);
-void overflow(void);
-void bounds(void);
-void invalid_op(void);
-void device_not_available(void);
-void coprocessor_segment_overrun(void);
-void invalid_TSS(void);
-void segment_not_present(void);
-void stack_segment(void);
-void general_protection(void);
-void page_fault(void);
-void coprocessor_error(void);
-void simd_coprocessor_error(void);
-void alignment_check(void);
-void spurious_interrupt_bug(void);
-void machine_check(void);
-
-
-void dump_regs(struct pt_regs *regs)
-{
- printk("Thread: %s\n", current->name);
-#ifdef __i386__
- printk("EIP: %x, EFLAGS %x.\n", regs->eip, regs->eflags);
- printk("EBX: %08x ECX: %08x EDX: %08x\n",
- regs->ebx, regs->ecx, regs->edx);
- printk("ESI: %08x EDI: %08x EBP: %08x EAX: %08x\n",
- regs->esi, regs->edi, regs->ebp, regs->eax);
- printk("DS: %04x ES: %04x orig_eax: %08x, eip: %08x\n",
- regs->xds, regs->xes, regs->orig_eax, regs->eip);
- printk("CS: %04x EFLAGS: %08x esp: %08x ss: %04x\n",
- regs->xcs, regs->eflags, regs->esp, regs->xss);
-#else
- printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
- printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n",
- regs->ss, regs->rsp, regs->eflags);
- printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
- regs->rax, regs->rbx, regs->rcx);
- printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
- regs->rdx, regs->rsi, regs->rdi);
- printk("RBP: %016lx R08: %016lx R09: %016lx\n",
- regs->rbp, regs->r8, regs->r9);
- printk("R10: %016lx R11: %016lx R12: %016lx\n",
- regs->r10, regs->r11, regs->r12);
- printk("R13: %016lx R14: %016lx R15: %016lx\n",
- regs->r13, regs->r14, regs->r15);
-#endif
-}
-
-static void do_trap(int trapnr, char *str, struct pt_regs * regs, unsigned long error_code)
-{
- printk("FATAL: Unhandled Trap %d (%s), error code=0x%lx\n", trapnr, str, error_code);
- printk("Regs address %p\n", regs);
- dump_regs(regs);
- do_exit();
-}
-
-#define DO_ERROR(trapnr, str, name) \
-void do_##name(struct pt_regs * regs, unsigned long error_code) \
-{ \
- do_trap(trapnr, str, regs, error_code); \
-}
-
-#define DO_ERROR_INFO(trapnr, str, name, sicode, siaddr) \
-void do_##name(struct pt_regs * regs, unsigned long error_code) \
-{ \
- do_trap(trapnr, str, regs, error_code); \
-}
-
-DO_ERROR_INFO( 0, "divide error", divide_error, FPE_INTDIV, regs->eip)
-DO_ERROR( 3, "int3", int3)
-DO_ERROR( 4, "overflow", overflow)
-DO_ERROR( 5, "bounds", bounds)
-DO_ERROR_INFO( 6, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
-DO_ERROR( 7, "device not available", device_not_available)
-DO_ERROR( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, "invalid TSS", invalid_TSS)
-DO_ERROR(11, "segment not present", segment_not_present)
-DO_ERROR(12, "stack segment", stack_segment)
-DO_ERROR_INFO(17, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR(18, "machine check", machine_check)
-
-void page_walk(unsigned long virt_address)
-{
- pgentry_t *tab = (pgentry_t *)start_info.pt_base, page;
- unsigned long addr = virt_address;
- printk("Pagetable walk from virt %lx, base %lx:\n", virt_address, start_info.pt_base);
-
-#if defined(__x86_64__)
- page = tab[l4_table_offset(addr)];
- tab = pte_to_virt(page);
- printk(" L4 = %"PRIpte" (%p) [offset = %lx]\n", page, tab, l4_table_offset(addr));
-#endif
- page = tab[l3_table_offset(addr)];
- tab = pte_to_virt(page);
- printk(" L3 = %"PRIpte" (%p) [offset = %lx]\n", page, tab, l3_table_offset(addr));
- page = tab[l2_table_offset(addr)];
- tab = pte_to_virt(page);
- printk(" L2 = %"PRIpte" (%p) [offset = %lx]\n", page, tab, l2_table_offset(addr));
-
- page = tab[l1_table_offset(addr)];
- printk(" L1 = %"PRIpte" [offset = %lx]\n", page, l1_table_offset(addr));
-
-}
-
-static int handle_cow(unsigned long addr) {
- pgentry_t *tab = (pgentry_t *)start_info.pt_base, page;
- unsigned long new_page;
- int rc;
-
-#if defined(__x86_64__)
- page = tab[l4_table_offset(addr)];
- if (!(page & _PAGE_PRESENT))
- return 0;
- tab = pte_to_virt(page);
-#endif
- page = tab[l3_table_offset(addr)];
- if (!(page & _PAGE_PRESENT))
- return 0;
- tab = pte_to_virt(page);
-
- page = tab[l2_table_offset(addr)];
- if (!(page & _PAGE_PRESENT))
- return 0;
- tab = pte_to_virt(page);
-
- page = tab[l1_table_offset(addr)];
- if (!(page & _PAGE_PRESENT))
- return 0;
- /* Only support CoW for the zero page. */
- if (PHYS_PFN(page) != mfn_zero)
- return 0;
-
- new_page = alloc_pages(0);
- memset((void*) new_page, 0, PAGE_SIZE);
-
- rc = HYPERVISOR_update_va_mapping(addr & PAGE_MASK, __pte(virt_to_mach(new_page) | L1_PROT), UVMF_INVLPG);
- if (!rc)
- return 1;
-
- printk("Map zero page to %lx failed: %d.\n", addr, rc);
- return 0;
-}
-
-static void do_stack_walk(unsigned long frame_base)
-{
- unsigned long *frame = (void*) frame_base;
- printk("base is %#lx ", frame_base);
- printk("caller is %#lx\n", frame[1]);
- if (frame[0])
- do_stack_walk(frame[0]);
-}
-
-void stack_walk(void)
-{
- unsigned long bp;
-#ifdef __x86_64__
- asm("movq %%rbp, %0":"=r"(bp));
-#else
- asm("movl %%ebp, %0":"=r"(bp));
-#endif
- do_stack_walk(bp);
-}
-
-static void dump_mem(unsigned long addr)
-{
- unsigned long i;
- if (addr < PAGE_SIZE)
- return;
-
- for (i = ((addr)-16 ) & ~15; i < (((addr)+48 ) & ~15); i++)
- {
- if (!(i%16))
- printk("\n%lx:", i);
- printk(" %02x", *(unsigned char *)i);
- }
- printk("\n");
-}
-#define read_cr2() \
- (HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].arch.cr2)
-
-static int handling_pg_fault = 0;
-
-void do_page_fault(struct pt_regs *regs, unsigned long error_code)
-{
- unsigned long addr = read_cr2();
- struct sched_shutdown sched_shutdown = { .reason = SHUTDOWN_crash };
-
- if ((error_code & TRAP_PF_WRITE) && handle_cow(addr))
- return;
-
- /* If we are already handling a page fault and get another one, that
- means we faulted during the pagetable walk. Continuing here would
- cause a recursive fault. */
- if(handling_pg_fault == 1)
- {
- printk("Page fault in pagetable walk (access to invalid memory?).\n");
- HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
- }
- handling_pg_fault++;
- barrier();
-
-#if defined(__x86_64__)
- printk("Page fault at linear address %p, rip %p, regs %p, sp %p, our_sp %p, code %lx\n",
- addr, regs->rip, regs, regs->rsp, &addr, error_code);
-#else
- printk("Page fault at linear address %p, eip %p, regs %p, sp %p, our_sp %p, code %lx\n",
- addr, regs->eip, regs, regs->esp, &addr, error_code);
-#endif
-
- dump_regs(regs);
-#if defined(__x86_64__)
- do_stack_walk(regs->rbp);
- dump_mem(regs->rsp);
- dump_mem(regs->rbp);
- dump_mem(regs->rip);
-#else
- do_stack_walk(regs->ebp);
- dump_mem(regs->esp);
- dump_mem(regs->ebp);
- dump_mem(regs->eip);
-#endif
- page_walk(addr);
- HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
- /* We should never get here ... but still */
- handling_pg_fault--;
-}
-
-void do_general_protection(struct pt_regs *regs, long error_code)
-{
- struct sched_shutdown sched_shutdown = { .reason = SHUTDOWN_crash };
-#ifdef __i386__
- printk("GPF eip: %p, error_code=%lx\n", regs->eip, error_code);
-#else
- printk("GPF rip: %p, error_code=%lx\n", regs->rip, error_code);
-#endif
- dump_regs(regs);
-#if defined(__x86_64__)
- do_stack_walk(regs->rbp);
- dump_mem(regs->rsp);
- dump_mem(regs->rbp);
- dump_mem(regs->rip);
-#else
- do_stack_walk(regs->ebp);
- dump_mem(regs->esp);
- dump_mem(regs->ebp);
- dump_mem(regs->eip);
-#endif
- HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
-}
-
-
-void do_debug(struct pt_regs * regs)
-{
- printk("Debug exception\n");
-#define TF_MASK 0x100
- regs->eflags &= ~TF_MASK;
- dump_regs(regs);
- do_exit();
-}
-
-void do_coprocessor_error(struct pt_regs * regs)
-{
- printk("Copro error\n");
- dump_regs(regs);
- do_exit();
-}
-
-void simd_math_error(void *eip)
-{
- printk("SIMD error\n");
-}
-
-void do_simd_coprocessor_error(struct pt_regs * regs)
-{
- printk("SIMD copro error\n");
-}
-
-void do_spurious_interrupt_bug(struct pt_regs * regs)
-{
-}
-
-/*
- * Submit a virtual IDT to the hypervisor. This consists of tuples
- * (interrupt vector, privilege ring, CS:EIP of handler).
- * The 'privilege ring' field specifies the least-privileged ring that
- * can trap to that vector using a software-interrupt instruction (INT).
- */
-static trap_info_t trap_table[] = {
- { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
- { 1, 0, __KERNEL_CS, (unsigned long)debug },
- { 3, 3, __KERNEL_CS, (unsigned long)int3 },
- { 4, 3, __KERNEL_CS, (unsigned long)overflow },
- { 5, 3, __KERNEL_CS, (unsigned long)bounds },
- { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
- { 7, 0, __KERNEL_CS, (unsigned long)device_not_available },
- { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
- { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
- { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
- { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
- { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
- { 14, 0, __KERNEL_CS, (unsigned long)page_fault },
- { 15, 0, __KERNEL_CS, (unsigned long)spurious_interrupt_bug },
- { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
- { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
- { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
- { 0, 0, 0, 0 }
-};
-
-
-
-void trap_init(void)
-{
- HYPERVISOR_set_trap_table(trap_table);
-}
-
-void trap_fini(void)
-{
- HYPERVISOR_set_trap_table(NULL);
-}
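
[Editor's note] The DO_ERROR/DO_ERROR_INFO macros above stamp out identical
handlers that all funnel into do_trap(). Expanded by hand (illustration
only), one instance reads:

    /* What DO_ERROR( 3, "int3", int3) expands to, modulo whitespace: */
    void do_int3(struct pt_regs * regs, unsigned long error_code)
    {
        do_trap(3, "int3", regs, error_code);
    }
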
diff --git a/extras/mini-os/arch/x86/x86_32.S b/extras/mini-os/arch/x86/x86_32.S
deleted file mode 100644
index b9aa392..0000000
--- a/extras/mini-os/arch/x86/x86_32.S
+++ /dev/null
@@ -1,305 +0,0 @@
-#include <mini-os/os.h>
-#include <mini-os/x86/arch_limits.h>
-#include <xen/arch-x86_32.h>
-
-.section __xen_guest
- .ascii "GUEST_OS=Mini-OS"
- .ascii ",XEN_VER=xen-3.0"
- .ascii ",VIRT_BASE=0x0" /* &_text from minios_x86_32.lds */
- .ascii ",ELF_PADDR_OFFSET=0x0"
- .ascii ",HYPERCALL_PAGE=0x2"
- .ascii ",PAE=yes[extended-cr3]"
- .ascii ",LOADER=generic"
- .byte 0
-.text
-
-.globl _start, shared_info, hypercall_page
-
-_start:
- cld
- lss stack_start,%esp
- andl $(~(__STACK_SIZE-1)), %esp
- push %esi
- call arch_init
-
-stack_start:
- .long stack+(2*__STACK_SIZE), __KERNEL_SS
-
- /* Unpleasant -- the PTE that maps this page is actually overwritten */
- /* to map the real shared-info page! :-) */
- .org 0x1000
-shared_info:
- .org 0x2000
-
-hypercall_page:
- .org 0x3000
-
-ES = 0x20
-ORIG_EAX = 0x24
-EIP = 0x28
-CS = 0x2C
-
-#define ENTRY(X) .globl X ; X :
-
-#define SAVE_ALL \
- cld; \
- pushl %es; \
- pushl %ds; \
- pushl %eax; \
- pushl %ebp; \
- pushl %edi; \
- pushl %esi; \
- pushl %edx; \
- pushl %ecx; \
- pushl %ebx; \
- movl $(__KERNEL_DS),%edx; \
- movl %edx,%ds; \
- movl %edx,%es;
-
-#define RESTORE_ALL \
- popl %ebx; \
- popl %ecx; \
- popl %edx; \
- popl %esi; \
- popl %edi; \
- popl %ebp; \
- popl %eax; \
- popl %ds; \
- popl %es; \
- addl $4,%esp; \
- iret;
-
-ENTRY(divide_error)
- pushl $0 # no error code
- pushl $do_divide_error
-do_exception:
- pushl %ds
- pushl %eax
- xorl %eax, %eax
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %edx
- decl %eax # eax = -1
- pushl %ecx
- pushl %ebx
- cld
- movl %es, %ecx
- movl ES(%esp), %edi # get the function address
- movl ORIG_EAX(%esp), %edx # get the error code
- movl %eax, ORIG_EAX(%esp)
- movl %ecx, ES(%esp)
- movl $(__KERNEL_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- movl %esp,%eax # pt_regs pointer
- pushl %edx
- pushl %eax
- call *%edi
- jmp ret_from_exception
-
-ret_from_exception:
- movb CS(%esp),%cl
- addl $8,%esp
- RESTORE_ALL
-
-# A note on the "critical region" in our callback handler.
-# We want to avoid stacking callback handlers due to events occurring
-# during handling of the last event. To do this, we keep events disabled
-# until we've done all processing. HOWEVER, we must enable events before
-# popping the stack frame (can't be done atomically) and so it would still
-# be possible to get enough handler activations to overflow the stack.
-# Although unlikely, bugs of that kind are hard to track down, so we'd
-# like to avoid the possibility.
-# So, on entry to the handler we detect whether we interrupted an
-# existing activation in its critical region -- if so, we pop the current
-# activation and restart the handler using the previous one.
-ENTRY(hypervisor_callback)
- pushl %eax
- SAVE_ALL
- movl EIP(%esp),%eax
- cmpl $scrit,%eax
- jb 11f
- cmpl $ecrit,%eax
- jb critical_region_fixup
-11: push %esp
- xorl %ebp,%ebp
- call do_hypervisor_callback
- add $4,%esp
- movl HYPERVISOR_shared_info,%esi
- xorl %eax,%eax
- movb CS(%esp),%cl
- test $2,%cl # slow return to ring 2 or 3
- jne safesti
-safesti:movb $0,1(%esi) # reenable event callbacks
-scrit: /**** START OF CRITICAL REGION ****/
- testb $0xFF,(%esi)
- jnz 14f # process more events if necessary...
- RESTORE_ALL
-14: movb $1,1(%esi)
- jmp 11b
-ecrit: /**** END OF CRITICAL REGION ****/
-# [How we do the fixup]. We want to merge the current stack frame with the
-# just-interrupted frame. How we do this depends on where in the critical
-# region the interrupted handler was executing, and so how many saved
-# registers are in each frame. We do this quickly using the lookup table
-# 'critical_fixup_table'. For each byte offset in the critical region, it
-# provides the number of bytes which have already been popped from the
-# interrupted stack frame.
-critical_region_fixup:
- addl $critical_fixup_table-scrit,%eax
- movzbl (%eax),%eax # %eax contains num bytes popped
- mov %esp,%esi
- add %eax,%esi # %esi points at end of src region
- mov %esp,%edi
- add $0x34,%edi # %edi points at end of dst region
- mov %eax,%ecx
- shr $2,%ecx # convert byte count to dword count
- je 16f # skip loop if nothing to copy
-15: subl $4,%esi # pre-decrementing copy loop
- subl $4,%edi
- movl (%esi),%eax
- movl %eax,(%edi)
- loop 15b
-16: movl %edi,%esp # final %edi is top of merged stack
- jmp 11b
-
-critical_fixup_table:
- .byte 0x00,0x00,0x00 # testb $0xff,(%esi)
- .byte 0x00,0x00 # jne 14f
- .byte 0x00 # pop %ebx
- .byte 0x04 # pop %ecx
- .byte 0x08 # pop %edx
- .byte 0x0c # pop %esi
- .byte 0x10 # pop %edi
- .byte 0x14 # pop %ebp
- .byte 0x18 # pop %eax
- .byte 0x1c # pop %ds
- .byte 0x20 # pop %es
- .byte 0x24,0x24,0x24 # add $4,%esp
- .byte 0x28 # iret
- .byte 0x00,0x00,0x00,0x00 # movb $1,1(%esi)
- .byte 0x00,0x00 # jmp 11b
-
-# Hypervisor uses this for application faults while it executes.
-ENTRY(failsafe_callback)
- pop %ds
- pop %es
- pop %fs
- pop %gs
- iret
-
-ENTRY(coprocessor_error)
- pushl $0
- pushl $do_coprocessor_error
- jmp do_exception
-
-ENTRY(simd_coprocessor_error)
- pushl $0
- pushl $do_simd_coprocessor_error
- jmp do_exception
-
-ENTRY(device_not_available)
- iret
-
-ENTRY(debug)
- pushl $0
- pushl $do_debug
- jmp do_exception
-
-ENTRY(int3)
- pushl $0
- pushl $do_int3
- jmp do_exception
-
-ENTRY(overflow)
- pushl $0
- pushl $do_overflow
- jmp do_exception
-
-ENTRY(bounds)
- pushl $0
- pushl $do_bounds
- jmp do_exception
-
-ENTRY(invalid_op)
- pushl $0
- pushl $do_invalid_op
- jmp do_exception
-
-
-ENTRY(coprocessor_segment_overrun)
- pushl $0
- pushl $do_coprocessor_segment_overrun
- jmp do_exception
-
-
-ENTRY(invalid_TSS)
- pushl $do_invalid_TSS
- jmp do_exception
-
-
-ENTRY(segment_not_present)
- pushl $do_segment_not_present
- jmp do_exception
-
-
-ENTRY(stack_segment)
- pushl $do_stack_segment
- jmp do_exception
-
-
-ENTRY(general_protection)
- pushl $do_general_protection
- jmp do_exception
-
-
-ENTRY(alignment_check)
- pushl $do_alignment_check
- jmp do_exception
-
-
-ENTRY(page_fault)
- pushl $do_page_fault
- jmp do_exception
-
-ENTRY(machine_check)
- pushl $0
- pushl $do_machine_check
- jmp do_exception
-
-
-ENTRY(spurious_interrupt_bug)
- pushl $0
- pushl $do_spurious_interrupt_bug
- jmp do_exception
-
-
-
-ENTRY(thread_starter)
- popl %eax
- popl %ebx
- pushl $0
- xorl %ebp,%ebp
- pushl %eax
- call *%ebx
- call exit_thread
-
-ENTRY(__arch_switch_threads)
- movl 4(%esp), %ecx /* prev */
- movl 8(%esp), %edx /* next */
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl %esp, (%ecx) /* save ESP */
- movl (%edx), %esp /* restore ESP */
- movl $1f, 4(%ecx) /* save EIP */
- pushl 4(%edx) /* restore EIP */
- ret
-1:
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
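
[Editor's note] The critical-region fixup above is easiest to follow in C.
The byte looked up in critical_fixup_table is how much of the interrupted
frame has already been popped (e.g. 0x0c when interrupted just after
"pop %edx"); the loop slides that much of the current frame down onto the old
one. A sketch only, with names invented here:

    /* Mirror of the pre-decrementing copy loop in critical_region_fixup. */
    static uint32_t *merge_frames(uint32_t *esp, unsigned popped_bytes)
    {
        uint32_t *src = esp + popped_bytes / 4;  /* end of src region */
        uint32_t *dst = esp + 0x34 / 4;          /* end of dst region */
        unsigned n;

        for (n = popped_bytes / 4; n > 0; n--)
            *--dst = *--src;
        return dst;            /* becomes the new %esp, then "jmp 11b" */
    }
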
diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
deleted file mode 100644
index df3469e..0000000
--- a/extras/mini-os/arch/x86/x86_64.S
+++ /dev/null
@@ -1,386 +0,0 @@
-#include <mini-os/os.h>
-#include <mini-os/x86/arch_limits.h>
-#include <xen/features.h>
-
-.section __xen_guest
- .ascii "GUEST_OS=Mini-OS"
- .ascii ",XEN_VER=xen-3.0"
- .ascii ",VIRT_BASE=0x0" /* &_text from minios_x86_64.lds */
- .ascii ",ELF_PADDR_OFFSET=0x0"
- .ascii ",HYPERCALL_PAGE=0x2"
- .ascii ",LOADER=generic"
- .byte 0
-.text
-
-#define ENTRY(X) .globl X ; X :
-.globl _start, shared_info, hypercall_page
-
-
-_start:
- cld
- movq stack_start(%rip),%rsp
- andq $(~(__STACK_SIZE-1)), %rsp
- movq %rsi,%rdi
- call arch_init
-
-stack_start:
- .quad stack+(2*__STACK_SIZE)
-
- /* Unpleasant -- the PTE that maps this page is actually overwritten */
- /* to map the real shared-info page! :-) */
- .org 0x1000
-shared_info:
- .org 0x2000
-
-hypercall_page:
- .org 0x3000
-
-
-#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
-#define XEN_PUT_VCPU_INFO(reg)
-#define XEN_PUT_VCPU_INFO_fixup
-#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
-#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
-#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
-
-#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
- XEN_LOCKED_BLOCK_EVENTS(reg) ; \
- XEN_PUT_VCPU_INFO(reg)
-
-#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
- XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
- XEN_PUT_VCPU_INFO(reg)
-
-
-/* Offsets into shared_info_t. */
-#define evtchn_upcall_pending /* 0 */
-#define evtchn_upcall_mask 1
-
-NMI_MASK = 0x80000000
-KERNEL_CS_MASK = 0xfc
-
-#define RAX 80
-#define RDI 112
-#define ORIG_RAX 120 /* + error_code */
-#define RIP 128
-#define CS 136
-#define RFLAGS 144
-#define RSP 152
-
-
-/* Macros */
-.macro zeroentry sym
- movq (%rsp),%rcx
- movq 8(%rsp),%r11
- addq $0x10,%rsp /* skip rcx and r11 */
- pushq $0 /* push error code/oldrax */
- pushq %rax /* push real oldrax to the rdi slot */
- leaq \sym(%rip),%rax
- jmp error_entry
-.endm
-
-.macro errorentry sym
- movq (%rsp),%rcx
- movq 8(%rsp),%r11
- addq $0x10,%rsp /* rsp points to the error code */
- pushq %rax
- leaq \sym(%rip),%rax
- jmp error_entry
-.endm
-
-.macro RESTORE_ALL
- movq (%rsp),%r11
- movq 1*8(%rsp),%r10
- movq 2*8(%rsp),%r9
- movq 3*8(%rsp),%r8
- movq 4*8(%rsp),%rax
- movq 5*8(%rsp),%rcx
- movq 6*8(%rsp),%rdx
- movq 7*8(%rsp),%rsi
- movq 8*8(%rsp),%rdi
- addq $9*8+8,%rsp
-.endm
-
-.macro RESTORE_REST
- movq (%rsp),%r15
- movq 1*8(%rsp),%r14
- movq 2*8(%rsp),%r13
- movq 3*8(%rsp),%r12
- movq 4*8(%rsp),%rbp
- movq 5*8(%rsp),%rbx
- addq $6*8,%rsp
-.endm
-
-.macro SAVE_REST
- subq $6*8,%rsp
- movq %rbx,5*8(%rsp)
- movq %rbp,4*8(%rsp)
- movq %r12,3*8(%rsp)
- movq %r13,2*8(%rsp)
- movq %r14,1*8(%rsp)
- movq %r15,(%rsp)
-.endm
-
-.macro HYPERVISOR_IRET flag
- testl $NMI_MASK,2*8(%rsp)
- jnz 2f
-
- testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
- jnz 1f
-
- /* Direct iret to kernel space. Correct CS and SS. */
- orb $3,1*8(%rsp)
- orb $3,4*8(%rsp)
-1: iretq
-
-2: /* Slow iret via hypervisor. */
- andl $~NMI_MASK, 16(%rsp)
- pushq $\flag
- jmp hypercall_page + (__HYPERVISOR_iret * 32)
-.endm
-
-
-/*
- * Exception entry point. This expects an error code/orig_rax on the stack
- * and the exception handler in %rax.
- */
-ENTRY(error_entry)
- /* rdi slot contains rax, oldrax contains error code */
- cld
- subq $14*8,%rsp
- movq %rsi,13*8(%rsp)
- movq 14*8(%rsp),%rsi /* load rax from rdi slot */
- movq %rdx,12*8(%rsp)
- movq %rcx,11*8(%rsp)
- movq %rsi,10*8(%rsp) /* store rax */
- movq %r8, 9*8(%rsp)
- movq %r9, 8*8(%rsp)
- movq %r10,7*8(%rsp)
- movq %r11,6*8(%rsp)
- movq %rbx,5*8(%rsp)
- movq %rbp,4*8(%rsp)
- movq %r12,3*8(%rsp)
- movq %r13,2*8(%rsp)
- movq %r14,1*8(%rsp)
- movq %r15,(%rsp)
-
-error_call_handler:
- movq %rdi, RDI(%rsp)
- movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi # get error code
- movq $-1,ORIG_RAX(%rsp)
- call *%rax
- jmp error_exit
-
-
-/*
- * Xen event (virtual interrupt) entry point.
- */
-ENTRY(hypervisor_callback)
- zeroentry hypervisor_callback2
-
-ENTRY(hypervisor_callback2)
- movq %rdi, %rsp
-
- /* check against event re-entrant */
- movq RIP(%rsp),%rax
- cmpq $scrit,%rax
- jb 11f
- cmpq $ecrit,%rax
- jb critical_region_fixup
-
-11: movq %gs:8,%rax
- incl %gs:0
- cmovzq %rax,%rsp
- pushq %rdi
- call do_hypervisor_callback
- popq %rsp
- decl %gs:0
-
-error_exit:
-retint_kernel:
- movl RFLAGS(%rsp), %eax
- shr $9, %eax # EAX[0] == IRET_RFLAGS.IF
- XEN_GET_VCPU_INFO(%rsi)
- andb evtchn_upcall_mask(%rsi),%al
- andb $1,%al # EAX[0] == IRET_RFLAGS.IF & event_mask
- jnz restore_all_enable_events # != 0 => enable event delivery
- XEN_PUT_VCPU_INFO(%rsi)
-
-retint_restore_args:
- RESTORE_REST
- RESTORE_ALL
- HYPERVISOR_IRET 0
-
-restore_all_enable_events:
- RESTORE_REST
- RESTORE_ALL
- pushq %rax # save rax for it will be clobbered later
- RSP_OFFSET=8 # record the stack frame layout changes
- XEN_GET_VCPU_INFO(%rax) # safe to use rax since it is saved
- XEN_UNBLOCK_EVENTS(%rax)
-
-scrit: /**** START OF CRITICAL REGION ****/
- XEN_TEST_PENDING(%rax)
- jz 12f
- XEN_LOCKED_BLOCK_EVENTS(%rax) # if pending, mask events and handle
- # by jumping to hypervisor_prologue
-12: popq %rax # all registers restored from this point
-
-restore_end:
- jnz hypervisor_prologue # safe to jump out of critical region
- # because events are masked if ZF = 0
- HYPERVISOR_IRET 0
-ecrit: /**** END OF CRITICAL REGION ****/
-
-# Set up the stack as Xen does before calling event callback
-hypervisor_prologue:
- pushq %r11
- pushq %rcx
- jmp hypervisor_callback
-
-# [How we do the fixup]. We want to merge the current stack frame with the
-# just-interrupted frame. How we do this depends on where in the critical
-# region the interrupted handler was executing, and so whether rax has been
-# restored. We determine this by comparing the interrupted rip with "restore_end".
-# We always copy all registers below RIP from the current stack frame
-# to the end of the previous activation frame so that we can continue
-# as if we had never even reached label 11, running in the old activation frame.
-
-critical_region_fixup:
- # Set up source and destination region pointers
- leaq RIP(%rsp),%rsi # esi points at end of src region
- # Acquire interrupted rsp which was saved-on-stack. This points to
- # the end of dst region. Note that it is not necessarily current rsp
- # plus 0xb0, because the second interrupt might align the stack frame.
- movq RSP(%rsp),%rdi # edi points at end of dst region
-
- cmpq $restore_end,%rax
- jae 13f
-
- # If interrupted rip is before restore_end
- # then rax hasn't been restored yet
- movq (%rdi),%rax
- movq %rax, RAX(%rsp) # save rax
- addq $RSP_OFFSET,%rdi
-
- # Set up the copy
-13: movq $RIP,%rcx
- shr $3,%rcx # convert bytes into count of 64-bit entities
-15: subq $8,%rsi # pre-decrementing copy loop
- subq $8,%rdi
- movq (%rsi),%rax
- movq %rax,(%rdi)
- loop 15b
-16: movq %rdi,%rsp # final rdi is top of merged stack
- andb $KERNEL_CS_MASK,CS(%rsp) # CS might have changed
- jmp 11b
-
-
-
-ENTRY(failsafe_callback)
- popq %rcx
- popq %r11
- iretq
-
-
-ENTRY(coprocessor_error)
- zeroentry do_coprocessor_error
-
-
-ENTRY(simd_coprocessor_error)
- zeroentry do_simd_coprocessor_error
-
-
-ENTRY(device_not_available)
- zeroentry do_device_not_available
-
-
-ENTRY(debug)
- zeroentry do_debug
-
-
-ENTRY(int3)
- zeroentry do_int3
-
-ENTRY(overflow)
- zeroentry do_overflow
-
-
-ENTRY(bounds)
- zeroentry do_bounds
-
-
-ENTRY(invalid_op)
- zeroentry do_invalid_op
-
-
-ENTRY(coprocessor_segment_overrun)
- zeroentry do_coprocessor_segment_overrun
-
-
-ENTRY(invalid_TSS)
- errorentry do_invalid_TSS
-
-
-ENTRY(segment_not_present)
- errorentry do_segment_not_present
-
-
-/* runs on exception stack */
-ENTRY(stack_segment)
- errorentry do_stack_segment
-
-
-ENTRY(general_protection)
- errorentry do_general_protection
-
-
-ENTRY(alignment_check)
- errorentry do_alignment_check
-
-
-ENTRY(divide_error)
- zeroentry do_divide_error
-
-
-ENTRY(spurious_interrupt_bug)
- zeroentry do_spurious_interrupt_bug
-
-
-ENTRY(page_fault)
- errorentry do_page_fault
-
-
-
-
-
-ENTRY(thread_starter)
- popq %rdi
- popq %rbx
- pushq $0
- xorq %rbp,%rbp
- call *%rbx
- call exit_thread
-
-
-ENTRY(__arch_switch_threads)
- pushq %rbp
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- movq %rsp, (%rdi) /* save ESP */
- movq (%rsi), %rsp /* restore ESP */
- movq $1f, 8(%rdi) /* save EIP */
- pushq 8(%rsi) /* restore EIP */
- ret
-1:
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- popq %rbp
- ret
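
[Editor's note] In C terms, the XEN_*_EVENTS macros above just poke two bytes
at the start of the per-vcpu info structure; a sketch of their effect, using
the offsets defined above:

    struct vcpu_info_head {
        unsigned char evtchn_upcall_pending;  /* offset 0 */
        unsigned char evtchn_upcall_mask;     /* offset 1 */
    };

    /* Sketch of XEN_LOCKED_BLOCK_EVENTS, XEN_LOCKED_UNBLOCK_EVENTS and
     * XEN_TEST_PENDING, respectively. */
    static void block_events(struct vcpu_info_head *v)   { v->evtchn_upcall_mask = 1; }
    static void unblock_events(struct vcpu_info_head *v) { v->evtchn_upcall_mask = 0; }
    static int  events_pending(struct vcpu_info_head *v) { return v->evtchn_upcall_pending; }
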
diff --git a/extras/mini-os/blkfront.c b/extras/mini-os/blkfront.c
deleted file mode 100644
index 59e576f..0000000
--- a/extras/mini-os/blkfront.c
+++ /dev/null
@@ -1,736 +0,0 @@
-/* Minimal block driver for Mini-OS.
- * Copyright (c) 2007-2008 Samuel Thibault.
- * Based on netfront.c.
- */
-
-#include <stdint.h>
-#include <mini-os/os.h>
-#include <mini-os/xenbus.h>
-#include <mini-os/events.h>
-#include <errno.h>
-#include <xen/io/blkif.h>
-#include <xen/io/protocols.h>
-#include <mini-os/gnttab.h>
-#include <mini-os/xmalloc.h>
-#include <time.h>
-#include <mini-os/blkfront.h>
-#include <mini-os/lib.h>
-#include <fcntl.h>
-
-#ifndef HAVE_LIBC
-#define strtoul simple_strtoul
-#endif
-
-/* Note: we generally don't need to disable IRQs since we hardly do anything in
- * the interrupt handler. */
-
-/* Note: we really assume non-preemptive threads. */
-
-DECLARE_WAIT_QUEUE_HEAD(blkfront_queue);
-
-
-
-
-#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
-#define GRANT_INVALID_REF 0
-
-
-struct blk_buffer {
- void* page;
- grant_ref_t gref;
-};
-
-struct blkfront_dev {
- domid_t dom;
-
- struct blkif_front_ring ring;
- grant_ref_t ring_ref;
- evtchn_port_t evtchn;
- blkif_vdev_t handle;
-
- char *nodename;
- char *backend;
- struct blkfront_info info;
-
- xenbus_event_queue events;
-
-#ifdef HAVE_LIBC
- int fd;
-#endif
-};
-
-void blkfront_handler(evtchn_port_t port, struct pt_regs *regs, void *data)
-{
-#ifdef HAVE_LIBC
- struct blkfront_dev *dev = data;
- int fd = dev->fd;
-
- if (fd != -1)
- files[fd].read = 1;
-#endif
- wake_up(&blkfront_queue);
-}
-
-static void free_blkfront(struct blkfront_dev *dev)
-{
- mask_evtchn(dev->evtchn);
-
- free(dev->backend);
-
- gnttab_end_access(dev->ring_ref);
- free_page(dev->ring.sring);
-
- unbind_evtchn(dev->evtchn);
-
- free(dev->nodename);
- free(dev);
-}
-
-struct blkfront_dev *init_blkfront(char *_nodename, struct blkfront_info *info)
-{
- xenbus_transaction_t xbt;
- char* err;
- char* message=NULL;
- struct blkif_sring *s;
- int retry=0;
- char* msg = NULL;
- char* c;
- char* nodename = _nodename ? _nodename : "device/vbd/768";
-
- struct blkfront_dev *dev;
-
- char path[strlen(nodename) + strlen("/backend-id") + 1];
-
- printk("******************* BLKFRONT for %s **********\n\n\n", nodename);
-
- dev = malloc(sizeof(*dev));
- memset(dev, 0, sizeof(*dev));
- dev->nodename = strdup(nodename);
-#ifdef HAVE_LIBC
- dev->fd = -1;
-#endif
-
- snprintf(path, sizeof(path), "%s/backend-id", nodename);
- dev->dom = xenbus_read_integer(path);
- evtchn_alloc_unbound(dev->dom, blkfront_handler, dev, &dev->evtchn);
-
- s = (struct blkif_sring*) alloc_page();
- memset(s,0,PAGE_SIZE);
-
-
- SHARED_RING_INIT(s);
- FRONT_RING_INIT(&dev->ring, s, PAGE_SIZE);
-
- dev->ring_ref = gnttab_grant_access(dev->dom,virt_to_mfn(s),0);
-
- dev->events = NULL;
-
-again:
- err = xenbus_transaction_start(&xbt);
- if (err) {
- printk("starting transaction\n");
- free(err);
- }
-
- err = xenbus_printf(xbt, nodename, "ring-ref","%u",
- dev->ring_ref);
- if (err) {
- message = "writing ring-ref";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename,
- "event-channel", "%u", dev->evtchn);
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename,
- "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
- if (err) {
- message = "writing protocol";
- goto abort_transaction;
- }
-
- snprintf(path, sizeof(path), "%s/state", nodename);
- err = xenbus_switch_state(xbt, path, XenbusStateConnected);
- if (err) {
- message = "switching state";
- goto abort_transaction;
- }
-
-
- err = xenbus_transaction_end(xbt, 0, &retry);
- free(err);
- if (retry) {
- printk("completing transaction\n");
- goto again;
- }
-
- goto done;
-
-abort_transaction:
- free(err);
- err = xenbus_transaction_end(xbt, 1, &retry);
- printk("Abort transaction %s\n", message);
- goto error;
-
-done:
-
- snprintf(path, sizeof(path), "%s/backend", nodename);
- msg = xenbus_read(XBT_NIL, path, &dev->backend);
- if (msg) {
- printk("Error %s when reading the backend path %s\n", msg, path);
- goto error;
- }
-
- printk("backend at %s\n", dev->backend);
-
- dev->handle = strtoul(strrchr(nodename, '/')+1, NULL, 0);
-
- {
- XenbusState state;
- char path[strlen(dev->backend) + strlen("/feature-flush-cache") + 1];
- snprintf(path, sizeof(path), "%s/mode", dev->backend);
- msg = xenbus_read(XBT_NIL, path, &c);
- if (msg) {
- printk("Error %s when reading the mode\n", msg);
- goto error;
- }
- if (*c == 'w')
- dev->info.mode = O_RDWR;
- else
- dev->info.mode = O_RDONLY;
- free(c);
-
- snprintf(path, sizeof(path), "%s/state", dev->backend);
-
- xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);
-
- msg = NULL;
- state = xenbus_read_integer(path);
- while (msg == NULL && state < XenbusStateConnected)
- msg = xenbus_wait_for_state_change(path, &state, &dev->events);
- if (msg != NULL || state != XenbusStateConnected) {
- printk("backend not available, state=%d\n", state);
- xenbus_unwatch_path_token(XBT_NIL, path, path);
- goto error;
- }
-
- snprintf(path, sizeof(path), "%s/info", dev->backend);
- dev->info.info = xenbus_read_integer(path);
-
- snprintf(path, sizeof(path), "%s/sectors", dev->backend);
- // FIXME: read_integer returns an int, so disk size limited to 1TB for now
- dev->info.sectors = xenbus_read_integer(path);
-
- snprintf(path, sizeof(path), "%s/sector-size", dev->backend);
- dev->info.sector_size = xenbus_read_integer(path);
-
- snprintf(path, sizeof(path), "%s/feature-barrier", dev->backend);
- dev->info.barrier = xenbus_read_integer(path);
-
- snprintf(path, sizeof(path), "%s/feature-flush-cache", dev->backend);
- dev->info.flush = xenbus_read_integer(path);
-
- *info = dev->info;
- }
- unmask_evtchn(dev->evtchn);
-
- printk("%u sectors of %u bytes\n", dev->info.sectors, dev->info.sector_size);
- printk("**************************\n");
-
- return dev;
-
-error:
- free(msg);
- free(err);
- free_blkfront(dev);
- return NULL;
-}
-
-void shutdown_blkfront(struct blkfront_dev *dev)
-{
- char* err = NULL, *err2;
- XenbusState state;
-
- char path[strlen(dev->backend) + strlen("/state") + 1];
- char nodename[strlen(dev->nodename) + strlen("/event-channel") + 1];
-
- blkfront_sync(dev);
-
- printk("close blk: backend=%s node=%s\n", dev->backend, dev->nodename);
-
- snprintf(path, sizeof(path), "%s/state", dev->backend);
- snprintf(nodename, sizeof(nodename), "%s/state", dev->nodename);
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosing)) != NULL) {
- printk("shutdown_blkfront: error changing state to %d: %s\n",
- XenbusStateClosing, err);
- goto close;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && state < XenbusStateClosing)
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosed)) != NULL) {
- printk("shutdown_blkfront: error changing state to %d: %s\n",
- XenbusStateClosed, err);
- goto close;
- }
- state = xenbus_read_integer(path);
- while (state < XenbusStateClosed) {
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
- }
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateInitialising)) != NULL) {
- printk("shutdown_blkfront: error changing state to %d: %s\n",
- XenbusStateInitialising, err);
- goto close;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && (state < XenbusStateInitWait || state >= XenbusStateClosed))
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
-
-close:
- free(err);
- err2 = xenbus_unwatch_path_token(XBT_NIL, path, path);
- free(err2);
-
- snprintf(nodename, sizeof(nodename), "%s/ring-ref", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
- snprintf(nodename, sizeof(nodename), "%s/event-channel", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
-
- if (!err)
- free_blkfront(dev);
-}
-
-static void blkfront_wait_slot(struct blkfront_dev *dev)
-{
- /* Wait for a slot */
- if (RING_FULL(&dev->ring)) {
- unsigned long flags;
- DEFINE_WAIT(w);
- local_irq_save(flags);
- while (1) {
- blkfront_aio_poll(dev);
- if (!RING_FULL(&dev->ring))
- break;
- /* Really no slot, go to sleep. */
- add_waiter(w, blkfront_queue);
- local_irq_restore(flags);
- schedule();
- local_irq_save(flags);
- }
- remove_waiter(w, blkfront_queue);
- local_irq_restore(flags);
- }
-}
-
-/* Issue an aio */
-void blkfront_aio(struct blkfront_aiocb *aiocbp, int write)
-{
- struct blkfront_dev *dev = aiocbp->aio_dev;
- struct blkif_request *req;
- RING_IDX i;
- int notify;
- int n, j;
- uintptr_t start, end;
-
- // Can't do I/O at a non-sector-aligned offset
- ASSERT(!(aiocbp->aio_offset & (dev->info.sector_size-1)));
- // Can't do I/O in non-sector-sized amounts
- ASSERT(!(aiocbp->aio_nbytes & (dev->info.sector_size-1)));
- // Can't do I/O to or from a non-sector-aligned buffer
- ASSERT(!((uintptr_t) aiocbp->aio_buf & (dev->info.sector_size-1)));
-
- start = (uintptr_t)aiocbp->aio_buf & PAGE_MASK;
- end = ((uintptr_t)aiocbp->aio_buf + aiocbp->aio_nbytes + PAGE_SIZE - 1) & PAGE_MASK;
- aiocbp->n = n = (end - start) / PAGE_SIZE;
-
- /* qemu's IDE max multsect is 16 (8KB) and SCSI max DMA was set to 32KB,
- * so max 44KB can't happen */
- ASSERT(n <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
- blkfront_wait_slot(dev);
- i = dev->ring.req_prod_pvt;
- req = RING_GET_REQUEST(&dev->ring, i);
-
- req->operation = write ? BLKIF_OP_WRITE : BLKIF_OP_READ;
- req->nr_segments = n;
- req->handle = dev->handle;
- req->id = (uintptr_t) aiocbp;
- req->sector_number = aiocbp->aio_offset / 512;
-
- for (j = 0; j < n; j++) {
- req->seg[j].first_sect = 0;
- req->seg[j].last_sect = PAGE_SIZE / 512 - 1;
- }
- req->seg[0].first_sect = ((uintptr_t)aiocbp->aio_buf & ~PAGE_MASK) / 512;
- req->seg[n-1].last_sect = (((uintptr_t)aiocbp->aio_buf + aiocbp->aio_nbytes - 1) & ~PAGE_MASK) / 512;
- for (j = 0; j < n; j++) {
- uintptr_t data = start + j * PAGE_SIZE;
- if (!write) {
- /* Trigger CoW if needed */
- *(char*)(data + (req->seg[j].first_sect << 9)) = 0;
- barrier();
- }
- aiocbp->gref[j] = req->seg[j].gref =
- gnttab_grant_access(dev->dom, virtual_to_mfn(data), write);
- }
-
- dev->ring.req_prod_pvt = i + 1;
-
- wmb();
- RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->ring, notify);
-
- if(notify) notify_remote_via_evtchn(dev->evtchn);
-}
-
-static void blkfront_aio_cb(struct blkfront_aiocb *aiocbp, int ret)
-{
- aiocbp->data = (void*) 1;
- aiocbp->aio_cb = NULL;
-}
-
-void blkfront_io(struct blkfront_aiocb *aiocbp, int write)
-{
- unsigned long flags;
- DEFINE_WAIT(w);
-
- ASSERT(!aiocbp->aio_cb);
- aiocbp->aio_cb = blkfront_aio_cb;
- blkfront_aio(aiocbp, write);
- aiocbp->data = NULL;
-
- local_irq_save(flags);
- while (1) {
- blkfront_aio_poll(aiocbp->aio_dev);
- if (aiocbp->data)
- break;
-
- add_waiter(w, blkfront_queue);
- local_irq_restore(flags);
- schedule();
- local_irq_save(flags);
- }
- remove_waiter(w, blkfront_queue);
- local_irq_restore(flags);
-}
-
-static void blkfront_push_operation(struct blkfront_dev *dev, uint8_t op, uint64_t id)
-{
- int i;
- struct blkif_request *req;
- int notify;
-
- blkfront_wait_slot(dev);
- i = dev->ring.req_prod_pvt;
- req = RING_GET_REQUEST(&dev->ring, i);
- req->operation = op;
- req->nr_segments = 0;
- req->handle = dev->handle;
- req->id = id;
- /* Not needed anyway, but the backend will check it */
- req->sector_number = 0;
- dev->ring.req_prod_pvt = i + 1;
- wmb();
- RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->ring, notify);
- if (notify) notify_remote_via_evtchn(dev->evtchn);
-}
-
-void blkfront_aio_push_operation(struct blkfront_aiocb *aiocbp, uint8_t op)
-{
- struct blkfront_dev *dev = aiocbp->aio_dev;
- blkfront_push_operation(dev, op, (uintptr_t) aiocbp);
-}
-
-void blkfront_sync(struct blkfront_dev *dev)
-{
- unsigned long flags;
- DEFINE_WAIT(w);
-
- if (dev->info.mode == O_RDWR) {
- if (dev->info.barrier == 1)
- blkfront_push_operation(dev, BLKIF_OP_WRITE_BARRIER, 0);
-
- if (dev->info.flush == 1)
- blkfront_push_operation(dev, BLKIF_OP_FLUSH_DISKCACHE, 0);
- }
-
- /* Note: This won't finish if another thread enqueues requests. */
- local_irq_save(flags);
- while (1) {
- blkfront_aio_poll(dev);
- if (RING_FREE_REQUESTS(&dev->ring) == RING_SIZE(&dev->ring))
- break;
-
- add_waiter(w, blkfront_queue);
- local_irq_restore(flags);
- schedule();
- local_irq_save(flags);
- }
- remove_waiter(w, blkfront_queue);
- local_irq_restore(flags);
-}
-
-int blkfront_aio_poll(struct blkfront_dev *dev)
-{
- RING_IDX rp, cons;
- struct blkif_response *rsp;
- int more;
- int nr_consumed;
-
-moretodo:
-#ifdef HAVE_LIBC
- if (dev->fd != -1) {
- files[dev->fd].read = 0;
- mb(); /* Make sure to let the handler set read to 1 before we start looking at the ring */
- }
-#endif
-
- rp = dev->ring.sring->rsp_prod;
- rmb(); /* Ensure we see queued responses up to 'rp'. */
- cons = dev->ring.rsp_cons;
-
- nr_consumed = 0;
- while ((cons != rp))
- {
- struct blkfront_aiocb *aiocbp;
- int status;
-
- rsp = RING_GET_RESPONSE(&dev->ring, cons);
- nr_consumed++;
-
- aiocbp = (void*) (uintptr_t) rsp->id;
- status = rsp->status;
-
- switch (rsp->operation) {
- case BLKIF_OP_READ:
- case BLKIF_OP_WRITE:
- {
- int j;
-
- if (status != BLKIF_RSP_OKAY)
- printk("%s error %d on %s at offset %llu, num bytes %llu\n",
- rsp->operation == BLKIF_OP_READ?"read":"write",
- status, aiocbp->aio_dev->nodename,
- (unsigned long long) aiocbp->aio_offset,
- (unsigned long long) aiocbp->aio_nbytes);
-
- for (j = 0; j < aiocbp->n; j++)
- gnttab_end_access(aiocbp->gref[j]);
-
- break;
- }
-
- case BLKIF_OP_WRITE_BARRIER:
- if (status != BLKIF_RSP_OKAY)
- printk("write barrier error %d\n", status);
- break;
- case BLKIF_OP_FLUSH_DISKCACHE:
- if (status != BLKIF_RSP_OKAY)
- printk("flush error %d\n", status);
- break;
-
- default:
- printk("unrecognized block operation %d response (status %d)\n", rsp->operation, status);
- break;
- }
-
- dev->ring.rsp_cons = ++cons;
- /* Note: the callback frees aiocbp itself */
- if (aiocbp && aiocbp->aio_cb)
- aiocbp->aio_cb(aiocbp, status ? -EIO : 0);
- if (dev->ring.rsp_cons != cons)
- /* We reentered, we must not continue here */
- break;
- }
-
- RING_FINAL_CHECK_FOR_RESPONSES(&dev->ring, more);
- if (more) goto moretodo;
-
- return nr_consumed;
-}
-
-#ifdef HAVE_LIBC
-int blkfront_open(struct blkfront_dev *dev)
-{
- /* Silently prevent multiple opens */
- if(dev->fd != -1) {
- return dev->fd;
- }
- dev->fd = alloc_fd(FTYPE_BLK);
- printk("blk_open(%s) -> %d\n", dev->nodename, dev->fd);
- files[dev->fd].blk.dev = dev;
- files[dev->fd].blk.offset = 0;
- return dev->fd;
-}
-
-int blkfront_posix_rwop(int fd, uint8_t* buf, size_t count, int write)
-{
- struct blkfront_dev* dev = files[fd].blk.dev;
- off_t offset = files[fd].blk.offset;
- struct blkfront_aiocb aiocb;
- unsigned long long disksize = dev->info.sectors * dev->info.sector_size;
- unsigned int blocksize = dev->info.sector_size;
-
- int blknum;
- int blkoff;
- size_t bytes;
- int rc = 0;
- int alignedbuf = 0;
- uint8_t* copybuf = NULL;
-
- /* RW 0 bytes is just a NOP */
- if(count == 0) {
- return 0;
- }
- /* Check for NULL buffer */
- if( buf == NULL ) {
- errno = EFAULT;
- return -1;
- }
-
- /* Write mode checks */
- if(write) {
- /* Make sure we have write permission */
- if(dev->info.info & VDISK_READONLY
- || (dev->info.mode != O_RDWR && dev->info.mode != O_WRONLY)) {
- errno = EACCES;
- return -1;
- }
- /* Make sure disk is big enough for this write */
- if(offset + count > disksize) {
- errno = ENOSPC;
- return -1;
- }
- }
- /* Read mode checks */
- else
- {
- /* Reading past the disk? Just return 0 */
- if(offset >= disksize) {
- return 0;
- }
-
- /* If the requested read is bigger than the disk, just
- * read as much as we can until the end */
- if(offset + count > disksize) {
- count = disksize - offset;
- }
- }
- /* Determine which block to start at and at which offset inside of it */
- blknum = offset / blocksize;
- blkoff = offset % blocksize;
-
- /* Optimization: We need to check if buf is aligned to the sector size.
- * This is somewhat tricky: we have to add (blocksize - blkoff) because the
- * first block may be partial, so for every subsequent block read/write the
- * buffer will be offset by that amount. */
- if(!((uintptr_t) (buf +(blocksize - blkoff)) & (dev->info.sector_size-1))) {
- alignedbuf = 1;
- }
-
- /* Setup aiocb block object */
- aiocb.aio_dev = dev;
- aiocb.aio_offset = blknum * blocksize;
- aiocb.aio_cb = NULL;
- aiocb.data = NULL;
-
- /* If our buffer is unaligned, or it is aligned but we will need to
- * read/write a partial block, then a copy will have to be done */
- if(!alignedbuf || blkoff != 0 || count % blocksize != 0) {
- copybuf = _xmalloc(blocksize, dev->info.sector_size);
- }
-
- rc = count;
- while(count > 0) {
- /* determine how many bytes to read/write from/to the current block buffer */
- if(!alignedbuf || blkoff != 0 || count < blocksize) {
- /* This is the case for unaligned R/W or partial block */
- bytes = count < blocksize - blkoff ? count : blocksize - blkoff;
- aiocb.aio_nbytes = blocksize;
- } else {
- /* We can optimize further if buffer is page aligned */
- int not_page_aligned = 0;
- if(((uintptr_t)buf) & (PAGE_SIZE -1)) {
- not_page_aligned = 1;
- }
-
- /* For an aligned R/W we can read up to the maximum transfer size */
- bytes = count > (BLKIF_MAX_SEGMENTS_PER_REQUEST-not_page_aligned)*PAGE_SIZE
- ? (BLKIF_MAX_SEGMENTS_PER_REQUEST-not_page_aligned)*PAGE_SIZE
- : count & ~(blocksize -1);
- aiocb.aio_nbytes = bytes;
- }
-
- /* read operation */
- if(!write) {
- if (alignedbuf && bytes >= blocksize) {
- /* If aligned and we're reading a whole block, just read right into buf */
- aiocb.aio_buf = buf;
- blkfront_read(&aiocb);
- } else {
- /* If not then we have to do a copy */
- aiocb.aio_buf = copybuf;
- blkfront_read(&aiocb);
- memcpy(buf, &copybuf[blkoff], bytes);
- }
- }
- /* Write operation */
- else {
- if(alignedbuf && bytes >= blocksize) {
- /* If aligned and we're writing a whole block, just write directly from buf */
- aiocb.aio_buf = buf;
- blkfront_write(&aiocb);
- } else {
- /* If not then we have to do a copy. */
- aiocb.aio_buf = copybuf;
- /* If we're writing a partial block, we need to read the current contents first
- * so we don't overwrite the extra bits with garbage */
- if(blkoff != 0 || bytes < blocksize) {
- blkfront_read(&aiocb);
- }
- memcpy(&copybuf[blkoff], buf, bytes);
- blkfront_write(&aiocb);
- }
- }
- /* Will start at beginning of all remaining blocks */
- blkoff = 0;
-
- /* Increment counters and continue */
- count -= bytes;
- buf += bytes;
- if(bytes < blocksize) {
- // At minimum we read or write one whole block
- aiocb.aio_offset += blocksize;
- } else {
- // If we read more than a block, it was a multiple of blocksize
- aiocb.aio_offset += bytes;
- }
- }
-
- free(copybuf);
- files[fd].blk.offset += rc;
- return rc;
-
-}
-
-int blkfront_posix_fstat(int fd, struct stat* buf)
-{
- struct blkfront_dev* dev = files[fd].blk.dev;
-
- buf->st_mode = dev->info.mode;
- buf->st_uid = 0;
- buf->st_gid = 0;
- buf->st_size = dev->info.sectors * dev->info.sector_size;
- buf->st_atime = buf->st_mtime = buf->st_ctime = time(NULL);
-
- return 0;
-}
-#endif
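
[Editor's note] The functions above make up the whole blkfront API:
init_blkfront() connects, blkfront_io() does a synchronous transfer,
shutdown_blkfront() disconnects. A minimal caller might look like this
(sketch only; error handling omitted, and the buffer must satisfy the
alignment ASSERTs in blkfront_aio()):

    /* Sketch: synchronously read the first sector of the default vbd. */
    static void read_first_sector(void)
    {
        struct blkfront_info info;
        struct blkfront_dev *dev = init_blkfront(NULL, &info);
        struct blkfront_aiocb aiocb;

        memset(&aiocb, 0, sizeof(aiocb));
        aiocb.aio_dev    = dev;
        aiocb.aio_buf    = (void *)alloc_page(); /* page- (hence sector-) aligned */
        aiocb.aio_nbytes = info.sector_size;
        aiocb.aio_offset = 0;
        blkfront_io(&aiocb, 0);                  /* 0 = read; blocks until done */

        shutdown_blkfront(dev);
    }
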
diff --git a/extras/mini-os/console/console.c b/extras/mini-os/console/console.c
deleted file mode 100644
index 5538bd4..0000000
--- a/extras/mini-os/console/console.c
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2006 - Grzegorz Milos - Cambridge University
- ****************************************************************************
- *
- * File: console.c
- * Author: Grzegorz Milos
- * Changes:
- *
- * Date: Mar 2006
- *
- * Environment: Xen Minimal OS
- * Description: Console interface.
- *
- * Handles console I/O. Defines printk.
- *
- ****************************************************************************
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <mini-os/types.h>
-#include <mini-os/wait.h>
-#include <mini-os/mm.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/events.h>
-#include <mini-os/os.h>
-#include <mini-os/lib.h>
-#include <mini-os/xenbus.h>
-#include <xen/io/console.h>
-
-
-/* Copies all print output to the Xen emergency console in addition
- to the standard dom0-handled console */
-#define USE_XEN_CONSOLE
-
-
-/* If the console is not initialised, printk output is sent to the Xen serial
- line. NOTE: you need to enable verbose in xen/Rules.mk for it to work. */
-static int console_initialised = 0;
-
-__attribute__((weak)) void console_input(char * buf, unsigned len)
-{
- if(len > 0)
- {
- /* Just repeat what's written */
- buf[len] = '\0';
- printk("%s", buf);
-
- if(buf[len-1] == '\r')
- printk("\nNo console input handler.\n");
- }
-}
-
-#ifndef HAVE_LIBC
-void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
-{
- console_input(buf, len);
-}
-
-void xencons_tx(void)
-{
- /* Do nothing, handled by _rx */
-}
-#endif
-
-
-void console_print(struct consfront_dev *dev, char *data, int length)
-{
- char *curr_char, saved_char;
- char copied_str[length+1];
- char *copied_ptr;
- int part_len;
- int (*ring_send_fn)(struct consfront_dev *dev, const char *data, unsigned length);
-
- if(!console_initialised)
- ring_send_fn = xencons_ring_send_no_notify;
- else
- ring_send_fn = xencons_ring_send;
-
- copied_ptr = copied_str;
- memcpy(copied_ptr, data, length);
- for(curr_char = copied_ptr; curr_char < copied_ptr+length-1; curr_char++)
- {
- if(*curr_char == '\n')
- {
- *curr_char = '\r';
- saved_char = *(curr_char+1);
- *(curr_char+1) = '\n';
- part_len = curr_char - copied_ptr + 2;
- ring_send_fn(dev, copied_ptr, part_len);
- *(curr_char+1) = saved_char;
- copied_ptr = curr_char+1;
- length -= part_len - 1;
- }
- }
-
- if (copied_ptr[length-1] == '\n') {
- copied_ptr[length-1] = '\r';
- copied_ptr[length] = '\n';
- length++;
- }
-
- ring_send_fn(dev, copied_ptr, length);
-}
-
-void print(int direct, const char *fmt, va_list args)
-{
- static char buf[1024];
-
- (void)vsnprintf(buf, sizeof(buf), fmt, args);
-
- if(direct)
- {
- (void)HYPERVISOR_console_io(CONSOLEIO_write, strlen(buf), buf);
- return;
- } else {
-#ifndef USE_XEN_CONSOLE
- if(!console_initialised)
-#endif
- (void)HYPERVISOR_console_io(CONSOLEIO_write, strlen(buf), buf);
-
- console_print(NULL, buf, strlen(buf));
- }
-}
-
-void printk(const char *fmt, ...)
-{
- va_list args;
- va_start(args, fmt);
- print(0, fmt, args);
- va_end(args);
-}
-
-void xprintk(const char *fmt, ...)
-{
- va_list args;
- va_start(args, fmt);
- print(1, fmt, args);
- va_end(args);
-}
-void init_console(void)
-{
- printk("Initialising console ... ");
- xencons_ring_init();
- console_initialised = 1;
- /* This is also required to notify the daemon */
- printk("done.\n");
-}
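
[Editor's note] The copy-and-rewrite loop in console_print() exists only to
turn LF into CRLF before the bytes reach the console ring; for example
(illustration only):

    printk("line one\nline two\n");
    /* reaches the ring as: "line one\r\nline two\r\n" */
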
diff --git a/extras/mini-os/console/console.h b/extras/mini-os/console/console.h
deleted file mode 100644
index e85147a..0000000
--- a/extras/mini-os/console/console.h
+++ /dev/null
@@ -1,2 +0,0 @@
-
-void console_handle_input(evtchn_port_t port, struct pt_regs *regs, void *data);
diff --git a/extras/mini-os/console/xenbus.c b/extras/mini-os/console/xenbus.c
deleted file mode 100644
index 1c9a590..0000000
--- a/extras/mini-os/console/xenbus.c
+++ /dev/null
@@ -1,195 +0,0 @@
-#include <mini-os/types.h>
-#include <mini-os/wait.h>
-#include <mini-os/mm.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/events.h>
-#include <mini-os/os.h>
-#include <mini-os/lib.h>
-#include <mini-os/xenbus.h>
-#include <xen/io/console.h>
-#include <xen/io/protocols.h>
-#include <xen/io/ring.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/gnttab.h>
-#include "console.h"
-
-void free_consfront(struct consfront_dev *dev)
-{
- char* err = NULL;
- XenbusState state;
-
- char path[strlen(dev->backend) + strlen("/state") + 1];
- char nodename[strlen(dev->nodename) + strlen("/state") + 1];
-
- snprintf(path, sizeof(path), "%s/state", dev->backend);
- snprintf(nodename, sizeof(nodename), "%s/state", dev->nodename);
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosing)) != NULL) {
- printk("free_consfront: error changing state to %d: %s\n",
- XenbusStateClosing, err);
- goto close;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && state < XenbusStateClosing)
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosed)) != NULL) {
- printk("free_consfront: error changing state to %d: %s\n",
- XenbusStateClosed, err);
- goto close;
- }
-
-close:
- free(err);
- err = xenbus_unwatch_path_token(XBT_NIL, path, path);
- free(err);
-
- mask_evtchn(dev->evtchn);
- unbind_evtchn(dev->evtchn);
- free(dev->backend);
- free(dev->nodename);
-
- gnttab_end_access(dev->ring_ref);
-
- free_page(dev->ring);
- free(dev);
-}
-
-struct consfront_dev *init_consfront(char *_nodename)
-{
- xenbus_transaction_t xbt;
- char* err = NULL;
- char* message=NULL;
- int retry=0;
- char* msg = NULL;
- char nodename[256];
- char path[256];
- static int consfrontends = 3;
- struct consfront_dev *dev;
- int res;
-
- if (!_nodename)
- snprintf(nodename, sizeof(nodename), "device/console/%d", consfrontends);
- else {
- strncpy(nodename, _nodename, sizeof(nodename) - 1);
- nodename[sizeof(nodename) - 1] = 0;
- }
-
- printk("******************* CONSFRONT for %s **********\n\n\n", nodename);
-
- consfrontends++;
- dev = malloc(sizeof(*dev));
- memset(dev, 0, sizeof(*dev));
- dev->nodename = strdup(nodename);
-#ifdef HAVE_LIBC
- dev->fd = -1;
-#endif
-
- snprintf(path, sizeof(path), "%s/backend-id", nodename);
- if ((res = xenbus_read_integer(path)) < 0)
- goto error;
- else
- dev->dom = res;
- evtchn_alloc_unbound(dev->dom, console_handle_input, dev, &dev->evtchn);
-
- dev->ring = (struct xencons_interface *) alloc_page();
- memset(dev->ring, 0, PAGE_SIZE);
- dev->ring_ref = gnttab_grant_access(dev->dom, virt_to_mfn(dev->ring), 0);
-
- dev->events = NULL;
-
-again:
- err = xenbus_transaction_start(&xbt);
- if (err) {
- printk("starting transaction\n");
- free(err);
- }
-
- err = xenbus_printf(xbt, nodename, "ring-ref","%u",
- dev->ring_ref);
- if (err) {
- message = "writing ring-ref";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename,
- "port", "%u", dev->evtchn);
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename,
- "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
- if (err) {
- message = "writing protocol";
- goto abort_transaction;
- }
-
- snprintf(path, sizeof(path), "%s/state", nodename);
- err = xenbus_switch_state(xbt, path, XenbusStateConnected);
- if (err) {
- message = "switching state";
- goto abort_transaction;
- }
-
-
- err = xenbus_transaction_end(xbt, 0, &retry);
- free(err);
- if (retry) {
- printk("completing transaction\n");
- goto again;
- }
-
- goto done;
-
-abort_transaction:
- free(err);
- err = xenbus_transaction_end(xbt, 1, &retry);
- printk("Abort transaction %s\n", message);
- goto error;
-
-done:
-
- snprintf(path, sizeof(path), "%s/backend", nodename);
- msg = xenbus_read(XBT_NIL, path, &dev->backend);
- if (msg) {
- printk("Error %s when reading the backend path %s\n", msg, path);
- goto error;
- }
-
- printk("backend at %s\n", dev->backend);
-
- {
- XenbusState state;
- char path[strlen(dev->backend) + strlen("/state") + 1];
- snprintf(path, sizeof(path), "%s/state", dev->backend);
-
- xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);
- msg = NULL;
- state = xenbus_read_integer(path);
- while (msg == NULL && state < XenbusStateConnected)
- msg = xenbus_wait_for_state_change(path, &state, &dev->events);
- if (msg != NULL || state != XenbusStateConnected) {
- printk("backend not available, state=%d\n", state);
- err = xenbus_unwatch_path_token(XBT_NIL, path, path);
- goto error;
- }
- }
- unmask_evtchn(dev->evtchn);
-
- printk("**************************\n");
-
- return dev;
-
-error:
- free(msg);
- free(err);
- free_consfront(dev);
- return NULL;
-}
-
-void fini_console(struct consfront_dev *dev)
-{
- if (dev) free_consfront(dev);
-}
-
diff --git a/extras/mini-os/console/xencons_ring.c b/extras/mini-os/console/xencons_ring.c
deleted file mode 100644
index 81c8e99..0000000
--- a/extras/mini-os/console/xencons_ring.c
+++ /dev/null
@@ -1,195 +0,0 @@
-#include <mini-os/types.h>
-#include <mini-os/wait.h>
-#include <mini-os/mm.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/events.h>
-#include <mini-os/os.h>
-#include <mini-os/lib.h>
-#include <mini-os/xenbus.h>
-#include <xen/io/console.h>
-#include <xen/io/protocols.h>
-#include <xen/io/ring.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/gnttab.h>
-#include "console.h"
-
-DECLARE_WAIT_QUEUE_HEAD(console_queue);
-
-static inline void notify_daemon(struct consfront_dev *dev)
-{
- /* Use evtchn: this is called early, before irq is set up. */
- if (!dev)
- notify_remote_via_evtchn(start_info.console.domU.evtchn);
- else
- notify_remote_via_evtchn(dev->evtchn);
-}
-
-static inline struct xencons_interface *xencons_interface(void)
-{
- if (start_info.console.domU.evtchn)
- return mfn_to_virt(start_info.console.domU.mfn);
- else
- return NULL;
-}
-
-int xencons_ring_send_no_notify(struct consfront_dev *dev, const char *data, unsigned len)
-{
- int sent = 0;
- struct xencons_interface *intf;
- XENCONS_RING_IDX cons, prod;
-
- if (!dev)
- intf = xencons_interface();
- else
- intf = dev->ring;
- if (!intf)
- return sent;
-
- cons = intf->out_cons;
- prod = intf->out_prod;
- mb();
- BUG_ON((prod - cons) > sizeof(intf->out));
-
- while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
- intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
-
- wmb();
- intf->out_prod = prod;
-
- return sent;
-}
-
-int xencons_ring_send(struct consfront_dev *dev, const char *data, unsigned len)
-{
- int sent;
-
- sent = xencons_ring_send_no_notify(dev, data, len);
- notify_daemon(dev);
-
- return sent;
-}
-
-
-
-void console_handle_input(evtchn_port_t port, struct pt_regs *regs, void *data)
-{
- struct consfront_dev *dev = (struct consfront_dev *) data;
-#ifdef HAVE_LIBC
- int fd = dev ? dev->fd : -1;
-
- if (fd != -1)
- files[fd].read = 1;
-
- wake_up(&console_queue);
-#else
- struct xencons_interface *intf = xencons_interface();
- XENCONS_RING_IDX cons, prod;
-
- cons = intf->in_cons;
- prod = intf->in_prod;
- mb();
- BUG_ON((prod - cons) > sizeof(intf->in));
-
- while (cons != prod) {
- xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs);
- cons++;
- }
-
- mb();
- intf->in_cons = cons;
-
- notify_daemon(dev);
-
- xencons_tx();
-#endif
-}
-
-#ifdef HAVE_LIBC
-int xencons_ring_avail(struct consfront_dev *dev)
-{
- struct xencons_interface *intf;
- XENCONS_RING_IDX cons, prod;
-
- if (!dev)
- intf = xencons_interface();
- else
- intf = dev->ring;
-
- cons = intf->in_cons;
- prod = intf->in_prod;
- mb();
- BUG_ON((prod - cons) > sizeof(intf->in));
-
- return prod - cons;
-}
-
-int xencons_ring_recv(struct consfront_dev *dev, char *data, unsigned len)
-{
- struct xencons_interface *intf;
- XENCONS_RING_IDX cons, prod;
- unsigned filled = 0;
-
- if (!dev)
- intf = xencons_interface();
- else
- intf = dev->ring;
-
- cons = intf->in_cons;
- prod = intf->in_prod;
- mb();
- BUG_ON((prod - cons) > sizeof(intf->in));
-
- while (filled < len && cons + filled != prod) {
- data[filled] = *(intf->in + MASK_XENCONS_IDX(cons + filled, intf->in));
- filled++;
- }
-
- mb();
- intf->in_cons = cons + filled;
-
- notify_daemon(dev);
-
- return filled;
-}
-#endif
-
-struct consfront_dev *xencons_ring_init(void)
-{
- int err;
- struct consfront_dev *dev;
-
- if (!start_info.console.domU.evtchn)
- return NULL;
-
- dev = malloc(sizeof(struct consfront_dev));
- memset(dev, 0, sizeof(struct consfront_dev));
- dev->nodename = "device/console";
- dev->dom = 0;
- dev->backend = 0;
- dev->ring_ref = 0;
-
-#ifdef HAVE_LIBC
- dev->fd = -1;
-#endif
- dev->evtchn = start_info.console.domU.evtchn;
- dev->ring = (struct xencons_interface *) mfn_to_virt(start_info.console.domU.mfn);
-
- err = bind_evtchn(dev->evtchn, console_handle_input, dev);
- if (err <= 0) {
- printk("XEN console request chn bind failed %i\n", err);
- free(dev);
- return NULL;
- }
- unmask_evtchn(dev->evtchn);
-
- /* In case we have in-flight data after save/restore... */
- notify_daemon(dev);
-
- return dev;
-}
-
-void xencons_resume(void)
-{
- (void)xencons_ring_init();
-}
-
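The send and receive paths above use free-running ring indices: out_prod and out_cons only ever increase, so their unsigned difference is the ring occupancy even after the 32-bit counters wrap, and MASK_XENCONS_IDX reduces an index modulo the power-of-two buffer size only when the buffer itself is touched. A small illustration of the arithmetic, with a hypothetical RING_SIZE standing in for sizeof(intf->out):

    #define RING_SIZE 1024u  /* stand-in for sizeof(intf->out); power of two */
    #define RING_MASK(i) ((i) & (RING_SIZE - 1u))   /* cf. MASK_XENCONS_IDX */

    /* Both results stay correct across counter wraparound because the
     * subtraction is done on unsigned 32-bit values. */
    static unsigned ring_used(uint32_t prod, uint32_t cons)
    {
        return prod - cons;                 /* bytes queued */
    }

    static unsigned ring_free(uint32_t prod, uint32_t cons)
    {
        return RING_SIZE - (prod - cons);   /* bytes still free */
    }
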
diff --git a/extras/mini-os/daytime.c b/extras/mini-os/daytime.c
deleted file mode 100644
index 7dc0de0..0000000
--- a/extras/mini-os/daytime.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * daytime.c: a simple network service based on lwIP and mini-os
- *
- * Tim Deegan <Tim.Deegan at eu.citrix.net>, July 2007
- */
-
-#include <os.h>
-#include <xmalloc.h>
-#include <console.h>
-#include <netfront.h>
-#include <lwip/api.h>
-
-static char message[29];
-
-void run_server(void *p)
-{
- struct ip_addr listenaddr = { 0 };
- struct netconn *listener;
- struct netconn *session;
- struct timeval tv;
- err_t rc;
-
- start_networking();
-
- if (0) {
- struct ip_addr ipaddr = { htonl(0x0a000001) };
- struct ip_addr netmask = { htonl(0xff000000) };
- struct ip_addr gw = { 0 };
- networking_set_addr(&ipaddr, &netmask, &gw);
- }
-
- tprintk("Opening connection\n");
-
- listener = netconn_new(NETCONN_TCP);
- tprintk("Connection at %p\n", listener);
-
- rc = netconn_bind(listener, &listenaddr, 13);
- if (rc != ERR_OK) {
- tprintk("Failed to bind connection: %i\n", rc);
- return;
- }
-
- rc = netconn_listen(listener);
- if (rc != ERR_OK) {
- tprintk("Failed to listen on connection: %i\n", rc);
- return;
- }
-
- while (1) {
- session = netconn_accept(listener);
- if (session == NULL)
- continue;
-
- gettimeofday(&tv, NULL);
- sprintf(message, "%20lu.%6.6lu\n", tv.tv_sec, tv.tv_usec);
- (void) netconn_write(session, message, strlen(message), NETCONN_COPY);
- (void) netconn_disconnect(session);
- (void) netconn_delete(session);
- }
-}
-
-
-int app_main(start_info_t *si)
-{
- create_thread("server", run_server, NULL);
- return 0;
-}
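daytime.c implements the classic daytime service (RFC 867): accept a TCP connection on port 13, write one timestamp line, and hang up. Once the guest is running and reachable, it can be exercised from any host with a plain TCP client, for example "nc <guest-ip> 13".
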
diff --git a/extras/mini-os/domain_config b/extras/mini-os/domain_config
deleted file mode 100644
index f3ec1d1..0000000
--- a/extras/mini-os/domain_config
+++ /dev/null
@@ -1,19 +0,0 @@
-# -*- mode: python; -*-
-#============================================================================
-# Python configuration setup for 'xm create'.
-# This script sets the parameters used when a domain is created using 'xm create'.
-# You can use a separate script for each domain you want to create, or
-# you can set the parameters for the domain on the xm command line.
-#============================================================================
-
-#----------------------------------------------------------------------------
-# Kernel image file.
-kernel = "mini-os.gz"
-
-# Initial memory allocation (in megabytes) for the new domain.
-memory = 32
-
-# A name for your domain. All domains must have different names.
-name = "Mini-OS"
-
-on_crash = 'destroy'
diff --git a/extras/mini-os/events.c b/extras/mini-os/events.c
deleted file mode 100644
index 2a23042..0000000
--- a/extras/mini-os/events.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- ****************************************************************************
- *
- * File: events.c
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes: Grzegorz Milos (gm281 at cam.ac.uk)
- *
- * Date: Jul 2003, changes Jun 2005
- *
- * Environment: Xen Minimal OS
- * Description: Deals with events received on event channels
- *
- ****************************************************************************
- */
-
-#include <mini-os/os.h>
-#include <mini-os/mm.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/events.h>
-#include <mini-os/lib.h>
-#include <xen/xsm/flask_op.h>
-
-#define NR_EVS 1024
-
-/* This represents an event handler. Chaining or sharing is not allowed. */
-typedef struct _ev_action_t {
- evtchn_handler_t handler;
- void *data;
- uint32_t count;
-} ev_action_t;
-
-static ev_action_t ev_actions[NR_EVS];
-void default_handler(evtchn_port_t port, struct pt_regs *regs, void *data);
-
-static unsigned long bound_ports[NR_EVS/(8*sizeof(unsigned long))];
-
-void unbind_all_ports(void)
-{
- int i;
- int cpu = 0;
- shared_info_t *s = HYPERVISOR_shared_info;
- vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
-
- for ( i = 0; i < NR_EVS; i++ )
- {
- if ( i == start_info.console.domU.evtchn ||
- i == start_info.store_evtchn)
- continue;
-
- if ( test_and_clear_bit(i, bound_ports) )
- {
- printk("port %d still bound!\n", i);
- unbind_evtchn(i);
- }
- }
- vcpu_info->evtchn_upcall_pending = 0;
- vcpu_info->evtchn_pending_sel = 0;
-}
-
-/*
- * Demux events to different handlers.
- */
-int do_event(evtchn_port_t port, struct pt_regs *regs)
-{
- ev_action_t *action;
-
- clear_evtchn(port);
-
- if ( port >= NR_EVS )
- {
- printk("WARN: do_event(): Port number too large: %d\n", port);
- return 1;
- }
-
- action = &ev_actions[port];
- action->count++;
-
- /* call the handler */
- action->handler(port, regs, action->data);
-
- return 1;
-
-}
-
-evtchn_port_t bind_evtchn(evtchn_port_t port, evtchn_handler_t handler,
- void *data)
-{
- if ( ev_actions[port].handler != default_handler )
- printk("WARN: Handler for port %d already registered, replacing\n",
- port);
-
- ev_actions[port].data = data;
- wmb();
- ev_actions[port].handler = handler;
- set_bit(port, bound_ports);
-
- return port;
-}
-
-void unbind_evtchn(evtchn_port_t port )
-{
- struct evtchn_close close;
- int rc;
-
- if ( ev_actions[port].handler == default_handler )
- printk("WARN: No handler for port %d when unbinding\n", port);
- mask_evtchn(port);
- clear_evtchn(port);
-
- ev_actions[port].handler = default_handler;
- wmb();
- ev_actions[port].data = NULL;
- clear_bit(port, bound_ports);
-
- close.port = port;
- rc = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
- if ( rc )
- printk("WARN: close_port %d failed rc=%d. ignored\n", port, rc);
-}
-
-evtchn_port_t bind_virq(uint32_t virq, evtchn_handler_t handler, void *data)
-{
- evtchn_bind_virq_t op;
- int rc;
-
- /* Try to bind the virq to a port */
- op.virq = virq;
- op.vcpu = smp_processor_id();
-
- rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &op);
- if (rc != 0)
- {
- printk("Failed to bind virtual IRQ %d with rc=%d\n", virq, rc);
- return -1;
- }
- bind_evtchn(op.port, handler, data);
- return op.port;
-}
-
-evtchn_port_t bind_pirq(uint32_t pirq, int will_share,
- evtchn_handler_t handler, void *data)
-{
- evtchn_bind_pirq_t op;
- int rc;
-
- /* Try to bind the pirq to a port */
- op.pirq = pirq;
- op.flags = will_share ? BIND_PIRQ__WILL_SHARE : 0;
-
- if ( (rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &op)) != 0 )
- {
- printk("Failed to bind physical IRQ %d with rc=%d\n", pirq, rc);
- return -1;
- }
- bind_evtchn(op.port, handler, data);
- return op.port;
-}
-
-/*
- * Initially all events are without a handler and disabled
- */
-void init_events(void)
-{
- int i;
-
- /* initialize event handler */
- for ( i = 0; i < NR_EVS; i++ )
- {
- ev_actions[i].handler = default_handler;
- mask_evtchn(i);
- }
-
- arch_init_events();
-}
-
-void fini_events(void)
-{
- /* Dealloc all events */
- arch_unbind_ports();
- unbind_all_ports();
- arch_fini_events();
-}
-
-void default_handler(evtchn_port_t port, struct pt_regs *regs, void *ignore)
-{
- printk("[Port %d] - event received\n", port);
-}
-
-/* Create a port available to the pal for exchanging notifications.
- Returns the result of the hypervisor call. */
-
-/* Unfortunate confusion of terminology: the port is unbound as far
- as Xen is concerned, but we automatically bind a handler to it
- from inside mini-os. */
-
-int evtchn_alloc_unbound(domid_t pal, evtchn_handler_t handler,
- void *data, evtchn_port_t *port)
-{
- int rc;
-
- evtchn_alloc_unbound_t op;
- op.dom = DOMID_SELF;
- op.remote_dom = pal;
- rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op);
- if ( rc )
- {
- printk("ERROR: alloc_unbound failed with rc=%d", rc);
- return rc;
- }
- *port = bind_evtchn(op.port, handler, data);
- return rc;
-}
-
-/* Connect to a port so as to allow the exchange of notifications with
- the pal. Returns the result of the hypervisor call. */
-
-int evtchn_bind_interdomain(domid_t pal, evtchn_port_t remote_port,
- evtchn_handler_t handler, void *data,
- evtchn_port_t *local_port)
-{
- int rc;
- evtchn_port_t port;
- evtchn_bind_interdomain_t op;
- op.remote_dom = pal;
- op.remote_port = remote_port;
- rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &op);
- if ( rc )
- {
- printk("ERROR: bind_interdomain failed with rc=%d", rc);
- return rc;
- }
- port = op.local_port;
- *local_port = bind_evtchn(port, handler, data);
- return rc;
-}
-
-int evtchn_get_peercontext(evtchn_port_t local_port, char *ctx, int size)
-{
- int rc;
- uint32_t sid;
- struct xen_flask_op op;
- op.cmd = FLASK_GET_PEER_SID;
- op.interface_version = XEN_FLASK_INTERFACE_VERSION;
- op.u.peersid.evtchn = local_port;
- rc = HYPERVISOR_xsm_op(&op);
- if (rc)
- return rc;
- sid = op.u.peersid.sid;
- op.cmd = FLASK_SID_TO_CONTEXT;
- op.u.sid_context.sid = sid;
- op.u.sid_context.size = size;
- set_xen_guest_handle(op.u.sid_context.context, ctx);
- rc = HYPERVISOR_xsm_op(&op);
- return rc;
-}
-
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
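A hedged usage sketch for the event API above: bind a handler to the timer VIRQ, unmask the returned port (init_events() leaves every port masked), and unbind it again. VIRQ_TIMER comes from the Xen public headers; my_timer_handler is a hypothetical callback:

    static void my_timer_handler(evtchn_port_t port, struct pt_regs *regs,
                                 void *data)
    {
        printk("[Port %d] timer tick\n", port);
    }

    static void virq_demo(void)
    {
        evtchn_port_t port = bind_virq(VIRQ_TIMER, my_timer_handler, NULL);

        if (port == (evtchn_port_t) -1)
            return;                 /* bind_virq() failed */
        unmask_evtchn(port);        /* ports start masked, see init_events() */
        /* ... run for a while ... */
        unbind_evtchn(port);
    }
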
diff --git a/extras/mini-os/fbfront.c b/extras/mini-os/fbfront.c
deleted file mode 100644
index 9cc07b4..0000000
--- a/extras/mini-os/fbfront.c
+++ /dev/null
@@ -1,710 +0,0 @@
-/*
- * Frame Buffer + Keyboard driver for Mini-OS.
- * Samuel Thibault <samuel.thibault at eu.citrix.com>, 2008
- * Based on blkfront.c.
- */
-
-#include <mini-os/os.h>
-#include <mini-os/xenbus.h>
-#include <mini-os/events.h>
-#include <xen/io/kbdif.h>
-#include <xen/io/fbif.h>
-#include <xen/io/protocols.h>
-#include <mini-os/gnttab.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/fbfront.h>
-#include <mini-os/lib.h>
-
-DECLARE_WAIT_QUEUE_HEAD(kbdfront_queue);
-
-
-
-
-
-
-struct kbdfront_dev {
- domid_t dom;
-
- struct xenkbd_page *page;
- evtchn_port_t evtchn;
-
- char *nodename;
- char *backend;
-
- xenbus_event_queue events;
-
-#ifdef HAVE_LIBC
- int fd;
-#endif
-};
-
-void kbdfront_handler(evtchn_port_t port, struct pt_regs *regs, void *data)
-{
-#ifdef HAVE_LIBC
- struct kbdfront_dev *dev = data;
- int fd = dev->fd;
-
- if (fd != -1)
- files[fd].read = 1;
-#endif
- wake_up(&kbdfront_queue);
-}
-
-static void free_kbdfront(struct kbdfront_dev *dev)
-{
- mask_evtchn(dev->evtchn);
-
- free(dev->backend);
-
- free_page(dev->page);
-
- unbind_evtchn(dev->evtchn);
-
- free(dev->nodename);
- free(dev);
-}
-
-struct kbdfront_dev *init_kbdfront(char *_nodename, int abs_pointer)
-{
- xenbus_transaction_t xbt;
- char* err;
- char* message=NULL;
- struct xenkbd_page *s;
- int retry=0;
- char* msg = NULL;
- char* nodename = _nodename ? _nodename : "device/vkbd/0";
- struct kbdfront_dev *dev;
-
- char path[strlen(nodename) + strlen("/backend-id") + 1];
-
- printk("******************* KBDFRONT for %s **********\n\n\n", nodename);
-
- dev = malloc(sizeof(*dev));
- memset(dev, 0, sizeof(*dev));
- dev->nodename = strdup(nodename);
-#ifdef HAVE_LIBC
- dev->fd = -1;
-#endif
-
- snprintf(path, sizeof(path), "%s/backend-id", nodename);
- dev->dom = xenbus_read_integer(path);
- evtchn_alloc_unbound(dev->dom, kbdfront_handler, dev, &dev->evtchn);
-
- dev->page = s = (struct xenkbd_page*) alloc_page();
- memset(s,0,PAGE_SIZE);
-
- dev->events = NULL;
-
- s->in_cons = s->in_prod = 0;
- s->out_cons = s->out_prod = 0;
-
-again:
- err = xenbus_transaction_start(&xbt);
- if (err) {
- printk("starting transaction\n");
- free(err);
- }
-
- err = xenbus_printf(xbt, nodename, "page-ref","%lu", virt_to_mfn(s));
- if (err) {
- message = "writing page-ref";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename, "event-channel", "%u", dev->evtchn);
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
- }
- if (abs_pointer) {
- err = xenbus_printf(xbt, nodename, "request-abs-pointer", "1");
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
- }
- }
-
- snprintf(path, sizeof(path), "%s/state", nodename);
- err = xenbus_switch_state(xbt, path, XenbusStateInitialised);
- if (err) {
- printk("error writing initialized: %s\n", err);
- free(err);
- }
-
- err = xenbus_transaction_end(xbt, 0, &retry);
- free(err);
- if (retry) {
- printk("retrying transaction\n");
- goto again;
- }
-
- goto done;
-
-abort_transaction:
- free(err);
- err = xenbus_transaction_end(xbt, 1, &retry);
- printk("Abort transaction %s\n", message);
- goto error;
-
-done:
-
- snprintf(path, sizeof(path), "%s/backend", nodename);
- msg = xenbus_read(XBT_NIL, path, &dev->backend);
- if (msg) {
- printk("Error %s when reading the backend path %s\n", msg, path);
- goto error;
- }
-
- printk("backend at %s\n", dev->backend);
-
- {
- XenbusState state;
- char path[strlen(dev->backend) + strlen("/state") + 1];
- char frontpath[strlen(nodename) + strlen("/state") + 1];
-
- snprintf(path, sizeof(path), "%s/state", dev->backend);
-
- xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);
-
- err = NULL;
- state = xenbus_read_integer(path);
- while (err == NULL && state < XenbusStateConnected)
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- if (state != XenbusStateConnected) {
- printk("backend not available, state=%d\n", state);
- free(err);
- err = xenbus_unwatch_path_token(XBT_NIL, path, path);
- goto error;
- }
-
- printk("%s connected\n", dev->backend);
-
- snprintf(frontpath, sizeof(frontpath), "%s/state", nodename);
- if((err = xenbus_switch_state(XBT_NIL, frontpath, XenbusStateConnected))
- != NULL) {
- printk("error switching state: %s\n", err);
- free(err);
- err = xenbus_unwatch_path_token(XBT_NIL, path, path);
- goto error;
- }
- }
- unmask_evtchn(dev->evtchn);
-
- printk("************************** KBDFRONT\n");
-
- return dev;
-error:
- free(msg);
- free(err);
- free_kbdfront(dev);
- return NULL;
-}
-
-int kbdfront_receive(struct kbdfront_dev *dev, union xenkbd_in_event *buf, int n)
-{
- struct xenkbd_page *page = dev->page;
- uint32_t prod, cons;
- int i;
-
-#ifdef HAVE_LIBC
- if (dev->fd != -1) {
- files[dev->fd].read = 0;
- mb(); /* Make sure to let the handler set read to 1 before we start looking at the ring */
- }
-#endif
-
- prod = page->in_prod;
-
- if (prod == page->in_cons)
- return 0;
-
- rmb(); /* ensure we see ring contents up to prod */
-
- for (i = 0, cons = page->in_cons; i < n && cons != prod; i++, cons++)
- memcpy(buf + i, &XENKBD_IN_RING_REF(page, cons), sizeof(*buf));
-
- mb(); /* ensure we got ring contents */
- page->in_cons = cons;
- notify_remote_via_evtchn(dev->evtchn);
-
-#ifdef HAVE_LIBC
- if (cons != prod && dev->fd != -1)
- /* still some events to read */
- files[dev->fd].read = 1;
-#endif
-
- return i;
-}
-
-
-void shutdown_kbdfront(struct kbdfront_dev *dev)
-{
- char* err = NULL, *err2;
- XenbusState state;
-
- char path[strlen(dev->backend) + strlen("/state") + 1];
- char nodename[strlen(dev->nodename) + strlen("/request-abs-pointer") + 1];
-
- printk("close kbd: backend at %s\n",dev->backend);
-
- snprintf(path, sizeof(path), "%s/state", dev->backend);
- snprintf(nodename, sizeof(nodename), "%s/state", dev->nodename);
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosing)) != NULL) {
- printk("shutdown_kbdfront: error changing state to %d: %s\n",
- XenbusStateClosing, err);
- goto close_kbdfront;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && state < XenbusStateClosing)
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosed)) != NULL) {
- printk("shutdown_kbdfront: error changing state to %d: %s\n",
- XenbusStateClosed, err);
- goto close_kbdfront;
- }
- state = xenbus_read_integer(path);
- while (state < XenbusStateClosed) {
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
- }
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateInitialising)) != NULL) {
- printk("shutdown_kbdfront: error changing state to %d: %s\n",
- XenbusStateInitialising, err);
- goto close_kbdfront;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && (state < XenbusStateInitWait || state >= XenbusStateClosed))
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
-
-close_kbdfront:
- free(err);
- err2 = xenbus_unwatch_path_token(XBT_NIL, path, path);
- free(err2);
-
- snprintf(nodename, sizeof(nodename), "%s/page-ref", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
- snprintf(nodename, sizeof(nodename), "%s/event-channel", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
- snprintf(nodename, sizeof(nodename), "%s/request-abs-pointer", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
-
- if (!err)
- free_kbdfront(dev);
-}
-
-#ifdef HAVE_LIBC
-int kbdfront_open(struct kbdfront_dev *dev)
-{
- dev->fd = alloc_fd(FTYPE_KBD);
- printk("kbd_open(%s) -> %d\n", dev->nodename, dev->fd);
- files[dev->fd].kbd.dev = dev;
- return dev->fd;
-}
-#endif
-
-
-
-
-
-DECLARE_WAIT_QUEUE_HEAD(fbfront_queue);
-
-
-
-
-
-
-struct fbfront_dev {
- domid_t dom;
-
- struct xenfb_page *page;
- evtchn_port_t evtchn;
-
- char *nodename;
- char *backend;
- int request_update;
-
- int width;
- int height;
- int depth;
- int stride;
- int mem_length;
- int offset;
-
- xenbus_event_queue events;
-
-#ifdef HAVE_LIBC
- int fd;
-#endif
-};
-
-void fbfront_handler(evtchn_port_t port, struct pt_regs *regs, void *data)
-{
-#ifdef HAVE_LIBC
- struct fbfront_dev *dev = data;
- int fd = dev->fd;
-
- if (fd != -1)
- files[fd].read = 1;
-#endif
- wake_up(&fbfront_queue);
-}
-
-static void free_fbfront(struct fbfront_dev *dev)
-{
- mask_evtchn(dev->evtchn);
-
- free(dev->backend);
-
- free_page(dev->page);
-
- unbind_evtchn(dev->evtchn);
-
- free(dev->nodename);
- free(dev);
-}
-
-int fbfront_receive(struct fbfront_dev *dev, union xenfb_in_event *buf, int n)
-{
- struct xenfb_page *page = dev->page;
- uint32_t prod, cons;
- int i;
-
-#ifdef HAVE_LIBC
- if (dev->fd != -1) {
- files[dev->fd].read = 0;
- mb(); /* Make sure to let the handler set read to 1 before we start looking at the ring */
- }
-#endif
-
- prod = page->in_prod;
-
- if (prod == page->in_cons)
- return 0;
-
- rmb(); /* ensure we see ring contents up to prod */
-
- for (i = 0, cons = page->in_cons; i < n && cons != prod; i++, cons++)
- memcpy(buf + i, &XENFB_IN_RING_REF(page, cons), sizeof(*buf));
-
- mb(); /* ensure we got ring contents */
- page->in_cons = cons;
- notify_remote_via_evtchn(dev->evtchn);
-
-#ifdef HAVE_LIBC
- if (cons != prod && dev->fd != -1)
- /* still some events to read */
- files[dev->fd].read = 1;
-#endif
-
- return i;
-}
-
-struct fbfront_dev *init_fbfront(char *_nodename, unsigned long *mfns, int width, int height, int depth, int stride, int n)
-{
- xenbus_transaction_t xbt;
- char* err;
- char* message=NULL;
- struct xenfb_page *s;
- int retry=0;
- char* msg=NULL;
- int i, j;
- struct fbfront_dev *dev;
- int max_pd;
- unsigned long mapped;
- char* nodename = _nodename ? _nodename : "device/vfb/0";
-
- char path[strlen(nodename) + strlen("/backend-id") + 1];
-
- printk("******************* FBFRONT for %s **********\n\n\n", nodename);
-
- dev = malloc(sizeof(*dev));
- memset(dev, 0, sizeof(*dev));
- dev->nodename = strdup(nodename);
-#ifdef HAVE_LIBC
- dev->fd = -1;
-#endif
-
- snprintf(path, sizeof(path), "%s/backend-id", nodename);
- dev->dom = xenbus_read_integer(path);
- evtchn_alloc_unbound(dev->dom, fbfront_handler, dev, &dev->evtchn);
-
- dev->page = s = (struct xenfb_page*) alloc_page();
- memset(s,0,PAGE_SIZE);
-
- s->in_cons = s->in_prod = 0;
- s->out_cons = s->out_prod = 0;
- dev->width = s->width = width;
- dev->height = s->height = height;
- dev->depth = s->depth = depth;
- dev->stride = s->line_length = stride;
- dev->mem_length = s->mem_length = n * PAGE_SIZE;
- dev->offset = 0;
- dev->events = NULL;
-
- max_pd = sizeof(s->pd) / sizeof(s->pd[0]);
- mapped = 0;
-
- for (i = 0; mapped < n && i < max_pd; i++) {
- unsigned long *pd = (unsigned long *) alloc_page();
- for (j = 0; mapped < n && j < PAGE_SIZE / sizeof(unsigned long); j++)
- pd[j] = mfns[mapped++];
- for ( ; j < PAGE_SIZE / sizeof(unsigned long); j++)
- pd[j] = 0;
- s->pd[i] = virt_to_mfn(pd);
- }
- for ( ; i < max_pd; i++)
- s->pd[i] = 0;
-
-
-again:
- err = xenbus_transaction_start(&xbt);
- if (err) {
- printk("starting transaction\n");
- free(err);
- }
-
- err = xenbus_printf(xbt, nodename, "page-ref","%lu", virt_to_mfn(s));
- if (err) {
- message = "writing page-ref";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename, "event-channel", "%u", dev->evtchn);
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename, "protocol", "%s",
- XEN_IO_PROTO_ABI_NATIVE);
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename, "feature-update", "1");
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
- }
-
- snprintf(path, sizeof(path), "%s/state", nodename);
- err = xenbus_switch_state(xbt, path, XenbusStateInitialised);
- if (err) {
- message = "switching state";
- goto abort_transaction;
- }
-
- err = xenbus_transaction_end(xbt, 0, &retry);
- free(err);
- if (retry) {
- printk("retrying transaction\n");
- goto again;
- }
-
- goto done;
-
-abort_transaction:
- free(err);
- err = xenbus_transaction_end(xbt, 1, &retry);
- printk("Abort transaction %s\n", message);
- goto error;
-
-done:
-
- snprintf(path, sizeof(path), "%s/backend", nodename);
- msg = xenbus_read(XBT_NIL, path, &dev->backend);
- if (msg) {
- printk("Error %s when reading the backend path %s\n", msg, path);
- goto error;
- }
-
- printk("backend at %s\n", dev->backend);
-
- {
- XenbusState state;
- char path[strlen(dev->backend) + strlen("/request-update") + 1];
- char frontpath[strlen(nodename) + strlen("/state") + 1];
-
- snprintf(path, sizeof(path), "%s/state", dev->backend);
-
- xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);
-
- err = NULL;
- state = xenbus_read_integer(path);
- while (err == NULL && state < XenbusStateConnected)
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- if (state != XenbusStateConnected) {
- printk("backend not available, state=%d\n", state);
- free(err);
- err = xenbus_unwatch_path_token(XBT_NIL, path, path);
- goto error;
- }
-
- printk("%s connected\n", dev->backend);
-
- snprintf(path, sizeof(path), "%s/request-update", dev->backend);
- dev->request_update = xenbus_read_integer(path);
-
- snprintf(frontpath, sizeof(frontpath), "%s/state", nodename);
- if ((err = xenbus_switch_state(XBT_NIL, frontpath, XenbusStateConnected))
- != NULL) {
- printk("error switching state: %s\n", err);
- free(err);
- err = xenbus_unwatch_path_token(XBT_NIL, path, path);
- goto error;
- }
- }
- unmask_evtchn(dev->evtchn);
-
- printk("************************** FBFRONT\n");
-
- return dev;
-
-error:
- free(msg);
- free(err);
- free_fbfront(dev);
- return NULL;
-}
-
-static void fbfront_out_event(struct fbfront_dev *dev, union xenfb_out_event *event)
-{
- struct xenfb_page *page = dev->page;
- uint32_t prod;
- DEFINE_WAIT(w);
-
- add_waiter(w, fbfront_queue);
- while (page->out_prod - page->out_cons == XENFB_OUT_RING_LEN)
- schedule();
- remove_waiter(w, fbfront_queue);
-
- prod = page->out_prod;
- mb(); /* ensure ring space available */
- XENFB_OUT_RING_REF(page, prod) = *event;
- wmb(); /* ensure ring contents visible */
- page->out_prod = prod + 1;
- notify_remote_via_evtchn(dev->evtchn);
-}
-
-void fbfront_update(struct fbfront_dev *dev, int x, int y, int width, int height)
-{
- struct xenfb_update update;
-
- if (dev->request_update <= 0)
- return;
-
- if (x < 0) {
- width += x;
- x = 0;
- }
- if (x + width > dev->width)
- width = dev->width - x;
-
- if (y < 0) {
- height += y;
- y = 0;
- }
- if (y + height > dev->height)
- height = dev->height - y;
-
- if (width <= 0 || height <= 0)
- return;
-
- update.type = XENFB_TYPE_UPDATE;
- update.x = x;
- update.y = y;
- update.width = width;
- update.height = height;
- fbfront_out_event(dev, (union xenfb_out_event *) &update);
-}
-
-void fbfront_resize(struct fbfront_dev *dev, int width, int height, int stride, int depth, int offset)
-{
- struct xenfb_resize resize;
-
- resize.type = XENFB_TYPE_RESIZE;
- dev->width = resize.width = width;
- dev->height = resize.height = height;
- dev->stride = resize.stride = stride;
- dev->depth = resize.depth = depth;
- dev->offset = resize.offset = offset;
- fbfront_out_event(dev, (union xenfb_out_event *) &resize);
-}
-
-void shutdown_fbfront(struct fbfront_dev *dev)
-{
- char* err = NULL, *err2;
- XenbusState state;
-
- char path[strlen(dev->backend) + strlen("/state") + 1];
- char nodename[strlen(dev->nodename) + strlen("/feature-update") + 1];
-
- printk("close fb: backend at %s\n",dev->backend);
-
- snprintf(path, sizeof(path), "%s/state", dev->backend);
- snprintf(nodename, sizeof(nodename), "%s/state", dev->nodename);
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosing)) != NULL) {
- printk("shutdown_fbfront: error changing state to %d: %s\n",
- XenbusStateClosing, err);
- goto close_fbfront;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && state < XenbusStateClosing)
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosed)) != NULL) {
- printk("shutdown_fbfront: error changing state to %d: %s\n",
- XenbusStateClosed, err);
- goto close_fbfront;
- }
- state = xenbus_read_integer(path);
- if (state < XenbusStateClosed) {
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
- }
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateInitialising)) != NULL) {
- printk("shutdown_fbfront: error changing state to %d: %s\n",
- XenbusStateInitialising, err);
- goto close_fbfront;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && (state < XenbusStateInitWait || state >= XenbusStateClosed))
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
-
-close_fbfront:
- free(err);
- err2 = xenbus_unwatch_path_token(XBT_NIL, path, path);
- free(err2);
-
- snprintf(nodename, sizeof(nodename), "%s/page-ref", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
- snprintf(nodename, sizeof(nodename), "%s/event-channel", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
- snprintf(nodename, sizeof(nodename), "%s/protocol", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
- snprintf(nodename, sizeof(nodename), "%s/feature-update", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
-
- if (!err)
- free_fbfront(dev);
-}
-
-#ifdef HAVE_LIBC
-int fbfront_open(struct fbfront_dev *dev)
-{
- dev->fd = alloc_fd(FTYPE_FB);
- printk("fb_open(%s) -> %d\n", dev->nodename, dev->fd);
- files[dev->fd].fb.dev = dev;
- return dev->fd;
-}
-#endif
-
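A hypothetical caller sketch for the framebuffer frontend above: allocate backing pages, hand their MFNs to init_fbfront(), draw, and report the dirty rectangle with fbfront_update(). The alloc_pages()/get_order() calls are assumptions about the mini-os mm layer, and the geometry is arbitrary:

    #define FB_W   800
    #define FB_H   600
    #define FB_BPP 32

    static void fb_demo(void)
    {
        int i, n = (FB_W * FB_H * (FB_BPP / 8) + PAGE_SIZE - 1) / PAGE_SIZE;
        unsigned long mfns[n];
        uint8_t *fb = (uint8_t *) alloc_pages(get_order(n * PAGE_SIZE));
        struct fbfront_dev *dev;

        for (i = 0; i < n; i++)
            mfns[i] = virt_to_mfn(fb + i * PAGE_SIZE);

        dev = init_fbfront(NULL, mfns, FB_W, FB_H, FB_BPP,
                           FB_W * (FB_BPP / 8), n);
        if (!dev)
            return;

        memset(fb, 0xff, n * PAGE_SIZE);        /* paint the whole buffer */
        fbfront_update(dev, 0, 0, FB_W, FB_H);
    }
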
diff --git a/extras/mini-os/gntmap.c b/extras/mini-os/gntmap.c
deleted file mode 100644
index f6ab3ad..0000000
--- a/extras/mini-os/gntmap.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Manages grant mappings from other domains.
- *
- * Diego Ongaro <diego.ongaro at citrix.com>, July 2008
- *
- * Files of type FTYPE_GNTMAP contain a gntmap, which is an array of
- * (host address, grant handle) pairs. Grant handles come from a hypervisor map
- * operation and are needed for the corresponding unmap.
- *
- * This is a rather naive implementation in terms of performance. If we start
- * using it frequently, there's definitely some low-hanging fruit here.
- *
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <mini-os/os.h>
-#include <mini-os/lib.h>
-#include <mini-os/xmalloc.h>
-#include <errno.h>
-#include <xen/grant_table.h>
-#include <inttypes.h>
-#include <mini-os/gntmap.h>
-
-//#define GNTMAP_DEBUG
-#ifdef GNTMAP_DEBUG
-#define DEBUG(_f, _a...) \
- printk("MINI_OS(gntmap.c:%d): %s" _f "\n", __LINE__, __func__, ## _a)
-#else
-#define DEBUG(_f, _a...) ((void)0)
-#endif
-
-
-#define DEFAULT_MAX_GRANTS 128
-
-struct gntmap_entry {
- unsigned long host_addr;
- grant_handle_t handle;
-};
-
-static inline int
-gntmap_entry_used(struct gntmap_entry *entry)
-{
- return entry->host_addr != 0;
-}
-
-static struct gntmap_entry*
-gntmap_find_free_entry(struct gntmap *map)
-{
- int i;
-
- for (i = 0; i < map->nentries; i++) {
- if (!gntmap_entry_used(&map->entries[i]))
- return &map->entries[i];
- }
-
- DEBUG("(map=%p): all %d entries full",
- map, map->nentries);
- return NULL;
-}
-
-static struct gntmap_entry*
-gntmap_find_entry(struct gntmap *map, unsigned long addr)
-{
- int i;
-
- for (i = 0; i < map->nentries; i++) {
- if (map->entries[i].host_addr == addr)
- return &map->entries[i];
- }
- return NULL;
-}
-
-int
-gntmap_set_max_grants(struct gntmap *map, int count)
-{
- DEBUG("(map=%p, count=%d)", map, count);
-
- if (map->nentries != 0)
- return -EBUSY;
-
- map->entries = xmalloc_array(struct gntmap_entry, count);
- if (map->entries == NULL)
- return -ENOMEM;
-
- memset(map->entries, 0, sizeof(struct gntmap_entry) * count);
- map->nentries = count;
- return 0;
-}
-
-static int
-_gntmap_map_grant_ref(struct gntmap_entry *entry,
- unsigned long host_addr,
- uint32_t domid,
- uint32_t ref,
- int writable)
-{
- struct gnttab_map_grant_ref op;
- int rc;
-
- op.ref = (grant_ref_t) ref;
- op.dom = (domid_t) domid;
- op.host_addr = (uint64_t) host_addr;
- op.flags = GNTMAP_host_map;
- if (!writable)
- op.flags |= GNTMAP_readonly;
-
- rc = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
- if (rc != 0 || op.status != GNTST_okay) {
- printk("GNTTABOP_map_grant_ref failed: "
- "returned %d, status %" PRId16 "\n",
- rc, op.status);
- return rc != 0 ? rc : op.status;
- }
-
- entry->host_addr = host_addr;
- entry->handle = op.handle;
- return 0;
-}
-
-static int
-_gntmap_unmap_grant_ref(struct gntmap_entry *entry)
-{
- struct gnttab_unmap_grant_ref op;
- int rc;
-
- op.host_addr = (uint64_t) entry->host_addr;
- op.dev_bus_addr = 0;
- op.handle = entry->handle;
-
- rc = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
- if (rc != 0 || op.status != GNTST_okay) {
- printk("GNTTABOP_unmap_grant_ref failed: "
- "returned %d, status %" PRId16 "\n",
- rc, op.status);
- return rc != 0 ? rc : op.status;
- }
-
- entry->host_addr = 0;
- return 0;
-}
-
-int
-gntmap_munmap(struct gntmap *map, unsigned long start_address, int count)
-{
- int i, rc;
- struct gntmap_entry *ent;
-
- DEBUG("(map=%p, start_address=%lx, count=%d)",
- map, start_address, count);
-
- for (i = 0; i < count; i++) {
- ent = gntmap_find_entry(map, start_address + PAGE_SIZE * i);
- if (ent == NULL) {
- printk("gntmap: tried to munmap unknown page\n");
- return -EINVAL;
- }
-
- rc = _gntmap_unmap_grant_ref(ent);
- if (rc != 0)
- return rc;
- }
-
- return 0;
-}
-
-void*
-gntmap_map_grant_refs(struct gntmap *map,
- uint32_t count,
- uint32_t *domids,
- int domids_stride,
- uint32_t *refs,
- int writable)
-{
- unsigned long addr;
- struct gntmap_entry *ent;
- int i;
-
- DEBUG("(map=%p, count=%" PRIu32 ", "
- "domids=%p [%" PRIu32 "...], domids_stride=%d, "
- "refs=%p [%" PRIu32 "...], writable=%d)",
- map, count,
- domids, domids == NULL ? 0 : domids[0], domids_stride,
- refs, refs == NULL ? 0 : refs[0], writable);
-
- (void) gntmap_set_max_grants(map, DEFAULT_MAX_GRANTS);
-
- addr = allocate_ondemand((unsigned long) count, 1);
- if (addr == 0)
- return NULL;
-
- for (i = 0; i < count; i++) {
- ent = gntmap_find_free_entry(map);
- if (ent == NULL ||
- _gntmap_map_grant_ref(ent,
- addr + PAGE_SIZE * i,
- domids[i * domids_stride],
- refs[i],
- writable) != 0) {
-
- (void) gntmap_munmap(map, addr, i);
- return NULL;
- }
- }
-
- return (void*) addr;
-}
-
-void
-gntmap_init(struct gntmap *map)
-{
- DEBUG("(map=%p)", map);
- map->nentries = 0;
- map->entries = NULL;
-}
-
-void
-gntmap_fini(struct gntmap *map)
-{
- struct gntmap_entry *ent;
- int i;
-
- DEBUG("(map=%p)", map);
-
- for (i = 0; i < map->nentries; i++) {
- ent = &map->entries[i];
- if (gntmap_entry_used(ent))
- (void) _gntmap_unmap_grant_ref(ent);
- }
-
- xfree(map->entries);
- map->entries = NULL;
- map->nentries = 0;
-}
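A hedged usage sketch for the gntmap API above: map a single writable grant 'ref' offered by domain 'domid', touch the page, and tear everything down. With count == 1 the domids_stride value is irrelevant, so 0 is passed:

    static int gntmap_demo(uint32_t domid, uint32_t ref)
    {
        struct gntmap map;
        void *page;

        gntmap_init(&map);
        page = gntmap_map_grant_refs(&map, 1, &domid, 0, &ref,
                                     1 /* writable */);
        if (page == NULL) {
            gntmap_fini(&map);
            return -EINVAL;
        }
        memset(page, 0, PAGE_SIZE);   /* mapped writable, so this is safe */
        gntmap_munmap(&map, (unsigned long) page, 1);
        gntmap_fini(&map);
        return 0;
    }
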
diff --git a/extras/mini-os/gnttab.c b/extras/mini-os/gnttab.c
deleted file mode 100644
index f395d12..0000000
--- a/extras/mini-os/gnttab.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2006 - Cambridge University
- ****************************************************************************
- *
- * File: gnttab.c
- * Author: Steven Smith (sos22 at cam.ac.uk)
- * Changes: Grzegorz Milos (gm281 at cam.ac.uk)
- *
- * Date: July 2006
- *
- * Environment: Xen Minimal OS
- * Description: Simple grant tables implementation. About as stupid as it's
- * possible to be and still work.
- *
- ****************************************************************************
- */
-#include <mini-os/os.h>
-#include <mini-os/mm.h>
-#include <mini-os/gnttab.h>
-#include <mini-os/semaphore.h>
-
-#define NR_RESERVED_ENTRIES 8
-
-/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
-#define NR_GRANT_FRAMES 4
-#define NR_GRANT_ENTRIES (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(grant_entry_t))
-
-static grant_entry_t *gnttab_table;
-static grant_ref_t gnttab_list[NR_GRANT_ENTRIES];
-#ifdef GNT_DEBUG
-static char inuse[NR_GRANT_ENTRIES];
-#endif
-static __DECLARE_SEMAPHORE_GENERIC(gnttab_sem, 0);
-
-static void
-put_free_entry(grant_ref_t ref)
-{
- unsigned long flags;
- local_irq_save(flags);
-#ifdef GNT_DEBUG
- BUG_ON(!inuse[ref]);
- inuse[ref] = 0;
-#endif
- gnttab_list[ref] = gnttab_list[0];
- gnttab_list[0] = ref;
- local_irq_restore(flags);
- up(&gnttab_sem);
-}
-
-static grant_ref_t
-get_free_entry(void)
-{
- unsigned int ref;
- unsigned long flags;
- down(&gnttab_sem);
- local_irq_save(flags);
- ref = gnttab_list[0];
- BUG_ON(ref < NR_RESERVED_ENTRIES || ref >= NR_GRANT_ENTRIES);
- gnttab_list[0] = gnttab_list[ref];
-#ifdef GNT_DEBUG
- BUG_ON(inuse[ref]);
- inuse[ref] = 1;
-#endif
- local_irq_restore(flags);
- return ref;
-}
-
-grant_ref_t
-gnttab_grant_access(domid_t domid, unsigned long frame, int readonly)
-{
- grant_ref_t ref;
-
- ref = get_free_entry();
- gnttab_table[ref].frame = frame;
- gnttab_table[ref].domid = domid;
- wmb();
- readonly *= GTF_readonly;
- gnttab_table[ref].flags = GTF_permit_access | readonly;
-
- return ref;
-}
-
-grant_ref_t
-gnttab_grant_transfer(domid_t domid, unsigned long pfn)
-{
- grant_ref_t ref;
-
- ref = get_free_entry();
- gnttab_table[ref].frame = pfn;
- gnttab_table[ref].domid = domid;
- wmb();
- gnttab_table[ref].flags = GTF_accept_transfer;
-
- return ref;
-}
-
-int
-gnttab_end_access(grant_ref_t ref)
-{
- uint16_t flags, nflags;
-
- BUG_ON(ref >= NR_GRANT_ENTRIES || ref < NR_RESERVED_ENTRIES);
-
- nflags = gnttab_table[ref].flags;
- do {
- if ((flags = nflags) & (GTF_reading|GTF_writing)) {
- printk("WARNING: g.e. still in use! (%x)\n", flags);
- return 0;
- }
- } while ((nflags = synch_cmpxchg(&gnttab_table[ref].flags, flags, 0)) !=
- flags);
-
- put_free_entry(ref);
- return 1;
-}
-
-unsigned long
-gnttab_end_transfer(grant_ref_t ref)
-{
- unsigned long frame;
- uint16_t flags;
-
- BUG_ON(ref >= NR_GRANT_ENTRIES || ref < NR_RESERVED_ENTRIES);
-
- while (!((flags = gnttab_table[ref].flags) & GTF_transfer_committed)) {
- if (synch_cmpxchg(&gnttab_table[ref].flags, flags, 0) == flags) {
- printk("Release unused transfer grant.\n");
- put_free_entry(ref);
- return 0;
- }
- }
-
- /* If a transfer is in progress then wait until it is completed. */
- while (!(flags & GTF_transfer_completed)) {
- flags = gnttab_table[ref].flags;
- }
-
- /* Read the frame number /after/ reading completion status. */
- rmb();
- frame = gnttab_table[ref].frame;
-
- put_free_entry(ref);
-
- return frame;
-}
-
-grant_ref_t
-gnttab_alloc_and_grant(void **map)
-{
- unsigned long mfn;
- grant_ref_t gref;
-
- *map = (void *)alloc_page();
- mfn = virt_to_mfn(*map);
- gref = gnttab_grant_access(0, mfn, 0);
- return gref;
-}
-
-static const char * const gnttabop_error_msgs[] = GNTTABOP_error_msgs;
-
-const char *
-gnttabop_error(int16_t status)
-{
- status = -status;
- if (status < 0 || status >= ARRAY_SIZE(gnttabop_error_msgs))
- return "bad status";
- else
- return gnttabop_error_msgs[status];
-}
-
-void
-init_gnttab(void)
-{
- int i;
-
-#ifdef GNT_DEBUG
- memset(inuse, 1, sizeof(inuse));
-#endif
- for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++)
- put_free_entry(i);
-
- gnttab_table = arch_init_gnttab(NR_GRANT_FRAMES);
- printk("gnttab_table mapped at %p.\n", gnttab_table);
-}
-
-void
-fini_gnttab(void)
-{
- struct gnttab_setup_table setup;
-
- setup.dom = DOMID_SELF;
- setup.nr_frames = 0;
-
- HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
-}
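A hedged sketch of the granting side using the API above: share one freshly allocated page read-only with a peer domain, then revoke the grant. How 'ref' reaches the peer (typically via xenbus) is outside the scope of this file:

    static void grant_demo(domid_t peer)
    {
        void *page = (void *) alloc_page();
        grant_ref_t ref = gnttab_grant_access(peer, virt_to_mfn(page), 1);

        /* ... advertise 'ref' to the peer and let it map the page ... */

        if (!gnttab_end_access(ref))
            printk("peer still has the grant mapped\n");
    }
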
diff --git a/extras/mini-os/hypervisor.c b/extras/mini-os/hypervisor.c
deleted file mode 100644
index 1b61d9b..0000000
--- a/extras/mini-os/hypervisor.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/******************************************************************************
- * hypervisor.c
- *
- * Communication to/from hypervisor.
- *
- * Copyright (c) 2002-2003, K A Fraser
- * Copyright (c) 2005, Grzegorz Milos, gm281 at cam.ac.uk,Intel Research Cambridge
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <mini-os/os.h>
-#include <mini-os/lib.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/events.h>
-
-#define active_evtchns(cpu,sh,idx) \
- ((sh)->evtchn_pending[idx] & \
- ~(sh)->evtchn_mask[idx])
-
-int in_callback;
-
-void do_hypervisor_callback(struct pt_regs *regs)
-{
- unsigned long l1, l2, l1i, l2i;
- unsigned int port;
- int cpu = 0;
- shared_info_t *s = HYPERVISOR_shared_info;
- vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
-
- in_callback = 1;
-
- vcpu_info->evtchn_upcall_pending = 0;
- /* NB x86. No need for a barrier here -- XCHG is a barrier on x86. */
-#if !defined(__i386__) && !defined(__x86_64__)
- /* Clear master flag /before/ clearing selector flag. */
- wmb();
-#endif
- l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
- while ( l1 != 0 )
- {
- l1i = __ffs(l1);
- l1 &= ~(1UL << l1i);
-
- while ( (l2 = active_evtchns(cpu, s, l1i)) != 0 )
- {
- l2i = __ffs(l2);
- l2 &= ~(1UL << l2i);
-
- port = (l1i * (sizeof(unsigned long) * 8)) + l2i;
- do_event(port, regs);
- }
- }
-
- in_callback = 0;
-}
-
-void force_evtchn_callback(void)
-{
-#ifdef XEN_HAVE_PV_UPCALL_MASK
- int save;
-#endif
- vcpu_info_t *vcpu;
- vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()];
-#ifdef XEN_HAVE_PV_UPCALL_MASK
- save = vcpu->evtchn_upcall_mask;
-#endif
-
- while (vcpu->evtchn_upcall_pending) {
-#ifdef XEN_HAVE_PV_UPCALL_MASK
- vcpu->evtchn_upcall_mask = 1;
-#endif
- barrier();
- do_hypervisor_callback(NULL);
- barrier();
-#ifdef XEN_HAVE_PV_UPCALL_MASK
- vcpu->evtchn_upcall_mask = save;
- barrier();
-#endif
- };
-}
-
-inline void mask_evtchn(uint32_t port)
-{
- shared_info_t *s = HYPERVISOR_shared_info;
- synch_set_bit(port, &s->evtchn_mask[0]);
-}
-
-inline void unmask_evtchn(uint32_t port)
-{
- shared_info_t *s = HYPERVISOR_shared_info;
- vcpu_info_t *vcpu_info = &s->vcpu_info[smp_processor_id()];
-
- synch_clear_bit(port, &s->evtchn_mask[0]);
-
- /*
- * The following is basically the equivalent of 'hw_resend_irq'. Just like
- * a real IO-APIC we 'lose the interrupt edge' if the channel is masked.
- */
- if ( synch_test_bit (port, &s->evtchn_pending[0]) &&
- !synch_test_and_set_bit(port / (sizeof(unsigned long) * 8),
- &vcpu_info->evtchn_pending_sel) )
- {
- vcpu_info->evtchn_upcall_pending = 1;
-#ifdef XEN_HAVE_PV_UPCALL_MASK
- if ( !vcpu_info->evtchn_upcall_mask )
-#endif
- force_evtchn_callback();
- }
-}
-
-inline void clear_evtchn(uint32_t port)
-{
- shared_info_t *s = HYPERVISOR_shared_info;
- synch_clear_bit(port, &s->evtchn_pending[0]);
-}
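As a worked example of the two-level demux in do_hypervisor_callback() above, assuming the 32-bit unsigned long of this ARM tree: a set selector bit l1i = 1 names pending word 1, a set bit l2i = 3 within that word names the event, and the resulting global port is 1 * 32 + 3 = 35.
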
diff --git a/extras/mini-os/include/arch/cc.h b/extras/mini-os/include/arch/cc.h
deleted file mode 100644
index 85cfbdb..0000000
--- a/extras/mini-os/include/arch/cc.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * lwip/arch/cc.h
- *
- * Compiler-specific types and macros for lwIP running on mini-os
- *
- * Tim Deegan <Tim.Deegan at eu.citrix.net>, July 2007
- */
-
-#ifndef __LWIP_ARCH_CC_H__
-#define __LWIP_ARCH_CC_H__
-
-/* Typedefs for the types used by lwip - */
-#include <mini-os/os.h>
-#include <mini-os/types.h>
-#include <time.h>
-typedef uint8_t u8_t;
-typedef int8_t s8_t;
-typedef uint16_t u16_t;
-typedef int16_t s16_t;
-typedef uint32_t u32_t;
-typedef int32_t s32_t;
-typedef uint64_t u64_t;
-typedef int64_t s64_t;
-typedef uintptr_t mem_ptr_t;
-
-typedef uint16_t u_short;
-
-/* Compiler hints for packing lwip's structures - */
-#define PACK_STRUCT_FIELD(_x) _x
-#define PACK_STRUCT_STRUCT __attribute__ ((packed))
-#define PACK_STRUCT_BEGIN
-#define PACK_STRUCT_END
-
-/* Platform specific diagnostic output - */
-
-extern void lwip_printk(char *fmt, ...);
-#define LWIP_PLATFORM_DIAG(_x) do { lwip_printk _x ; } while (0)
-
-extern void lwip_die(char *fmt, ...);
-#define LWIP_PLATFORM_ASSERT(_x) do { lwip_die(_x); } while(0)
-
-/* "lightweight" synchronization mechanisms - */
-/* SYS_ARCH_DECL_PROTECT(x) - declare a protection state variable. */
-/* SYS_ARCH_PROTECT(x) - enter protection mode. */
-/* SYS_ARCH_UNPROTECT(x) - leave protection mode. */
-
-/* If the compiler does not provide memset() this file must include a */
-/* definition of it, or include a file which defines it. */
-#include <mini-os/lib.h>
-
-/* This file must either include a system-local <errno.h> which defines */
-/* the standard *nix error codes, or it should #define LWIP_PROVIDE_ERRNO */
-/* to make lwip/arch.h define the codes which are used throughout. */
-#include <errno.h>
-
-/* Not required by the docs, but needed for network-order calculations */
-#ifdef HAVE_LIBC
-#include <machine/endian.h>
-#ifndef BIG_ENDIAN
-#error endian.h does not define byte order
-#endif
-#else
-#include <endian.h>
-#endif
-
-#include <inttypes.h>
-#define S16_F PRIi16
-#define U16_F PRIu16
-#define X16_F PRIx16
-#define S32_F PRIi32
-#define U32_F PRIu32
-#define X32_F PRIx32
-
-#if 0
-#ifndef DBG_ON
-#define DBG_ON LWIP_DBG_ON
-#endif
-#define LWIP_DEBUG DBG_ON
-//#define IP_DEBUG DBG_ON
-#define TCP_DEBUG DBG_ON
-#define TCP_INPUT_DEBUG DBG_ON
-#define TCP_QLEN_DEBUG DBG_ON
-#define TCPIP_DEBUG DBG_ON
-#define DBG_TYPES_ON DBG_ON
-#endif
-
-#endif /* __LWIP_ARCH_CC_H__ */
diff --git a/extras/mini-os/include/arch/perf.h b/extras/mini-os/include/arch/perf.h
deleted file mode 100644
index dda87f2..0000000
--- a/extras/mini-os/include/arch/perf.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * lwip/arch/perf.h
- *
- * Arch-specific performance measurement for lwIP running on mini-os
- *
- * Tim Deegan <Tim.Deegan at eu.citrix.net>, July 2007
- */
-
-#ifndef __LWIP_ARCH_PERF_H__
-#define __LWIP_ARCH_PERF_H__
-
-#define PERF_START do { } while(0)
-#define PERF_STOP(_x) do { (void)(_x); } while (0)
-
-#endif /* __LWIP_ARCH_PERF_H__ */
diff --git a/extras/mini-os/include/arch/sys_arch.h b/extras/mini-os/include/arch/sys_arch.h
deleted file mode 100644
index 11d5328..0000000
--- a/extras/mini-os/include/arch/sys_arch.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * lwip/arch/sys_arch.h
- *
- * Arch-specific semaphores and mailboxes for lwIP running on mini-os
- *
- * Tim Deegan <Tim.Deegan at eu.citrix.net>, July 2007
- */
-
-#ifndef __LWIP_ARCH_SYS_ARCH_H__
-#define __LWIP_ARCH_SYS_ARCH_H__
-
-#include <mini-os/os.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/semaphore.h>
-
-typedef struct semaphore *sys_sem_t;
-#define SYS_SEM_NULL ((sys_sem_t) NULL)
-
-struct mbox {
- int count;
- void **messages;
- struct semaphore read_sem;
- struct semaphore write_sem;
- int writer;
- int reader;
-};
-
-typedef struct mbox *sys_mbox_t;
-#define SYS_MBOX_NULL ((sys_mbox_t) 0)
-
-typedef struct thread *sys_thread_t;
-
-typedef unsigned long sys_prot_t;
-
-#endif /*__LWIP_ARCH_SYS_ARCH_H__ */
diff --git a/extras/mini-os/include/arm/arch_endian.h b/extras/mini-os/include/arm/arch_endian.h
deleted file mode 100644
index 0771683..0000000
--- a/extras/mini-os/include/arm/arch_endian.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef ARCH_ENDIAN_H
-#error "Do not include arch_endian by itself, include endian.h"
-#else
-
-#define __BYTE_ORDER __LITTLE_ENDIAN
-
-#endif
diff --git a/extras/mini-os/include/arm/arch_limits.h b/extras/mini-os/include/arm/arch_limits.h
deleted file mode 100644
index bae99e1..0000000
--- a/extras/mini-os/include/arm/arch_limits.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __ARCH_LIMITS_H__
-#define __ARCH_LIMITS_H__
-
-#include <mm.h>
-
-#define __STACK_SIZE_PAGE_ORDER 2
-#define __STACK_SIZE (4 * PAGE_SIZE)
-
-#endif
diff --git a/extras/mini-os/include/arm/arch_mm.h b/extras/mini-os/include/arm/arch_mm.h
deleted file mode 100644
index 085d4e5..0000000
--- a/extras/mini-os/include/arm/arch_mm.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef _ARCH_MM_H_
-#define _ARCH_MM_H_
-
-typedef uint64_t paddr_t;
-
-extern char _text, _etext, _erodata, _edata, _end, __bss_start;
-extern int _boot_stack[];
-extern int _boot_stack_end[];
-extern uint32_t physical_address_offset; /* Add this to a virtual address to get the physical address (wraps at 4GB) */
-
-#define PAGE_SHIFT 12
-#define PAGE_SIZE (1 << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
-
-#define L1_PAGETABLE_SHIFT 12
-
-#define to_phys(x) (((paddr_t)(x)+physical_address_offset) & 0xffffffff)
-#define to_virt(x) ((void *)(((x)-physical_address_offset) & 0xffffffff))
-
-#define PFN_UP(x) (unsigned long)(((x) + PAGE_SIZE-1) >> L1_PAGETABLE_SHIFT)
-#define PFN_DOWN(x) (unsigned long)((x) >> L1_PAGETABLE_SHIFT)
-#define PFN_PHYS(x) ((uint64_t)(x) << L1_PAGETABLE_SHIFT)
-#define PHYS_PFN(x) (unsigned long)((x) >> L1_PAGETABLE_SHIFT)
-
-#define virt_to_pfn(_virt) (PFN_DOWN(to_phys(_virt)))
-#define virt_to_mfn(_virt) (PFN_DOWN(to_phys(_virt)))
-#define mfn_to_virt(_mfn) (to_virt(PFN_PHYS(_mfn)))
-#define pfn_to_virt(_pfn) (to_virt(PFN_PHYS(_pfn)))
-
-#define mfn_to_pfn(x) (x)
-#define pfn_to_mfn(x) (x)
-
-#define virtual_to_mfn(_virt) virt_to_mfn(_virt)
-
-// FIXME
-#define map_frames(f, n) (NULL)
-
-#endif
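As a worked example of the macros above: with physical_address_offset = 0x80000000, virtual address 0x00400000 maps to physical address 0x80400000 under to_phys(), and virt_to_mfn() shifts that right by L1_PAGETABLE_SHIFT (12) to yield frame number 0x80400.
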
diff --git a/extras/mini-os/include/arm/arch_sched.h b/extras/mini-os/include/arm/arch_sched.h
deleted file mode 100644
index de3ac02..0000000
--- a/extras/mini-os/include/arm/arch_sched.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef __ARCH_SCHED_H__
-#define __ARCH_SCHED_H__
-
-#include "arch_limits.h"
-
-static inline struct thread* get_current(void)
-{
- struct thread **current;
- unsigned long sp;
- __asm__ __volatile__ ("mov %0, sp":"=r"(sp));
- current = (void *)(unsigned long)(sp & ~(__STACK_SIZE-1));
- return *current;
-}
-
-void __arch_switch_threads(unsigned long *prevctx, unsigned long *nextctx);
-
-#define arch_switch_threads(prev,next) __arch_switch_threads(&(prev)->sp, &(next)->sp)
-
-#endif /* __ARCH_SCHED_H__ */
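get_current() above works by masking the stack pointer down to the __STACK_SIZE-aligned base of the current stack and loading a thread pointer stored there. A sketch of the setup side this implies (an assumption inferred from the masking, not code from the tree):

    /* Invariant required by get_current(): each thread stack is
     * __STACK_SIZE bytes, aligned to __STACK_SIZE, with the owning
     * thread pointer stored in the first word of the stack area. */
    static void arch_prepare_stack(struct thread *thread, void *stack_base)
    {
        *(struct thread **) stack_base = thread;
    }
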
diff --git a/extras/mini-os/include/arm/arch_spinlock.h b/extras/mini-os/include/arm/arch_spinlock.h
deleted file mode 100755
index dccb9fc..0000000
--- a/extras/mini-os/include/arm/arch_spinlock.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef __ARCH_ASM_SPINLOCK_H
-#define __ARCH_ASM_SPINLOCK_H
-
-#include "os.h"
-
-#define ARCH_SPIN_LOCK_UNLOCKED { 1 }
-
-/*
- * Simple spin lock operations. There are two variants, one clears IRQ's
- * on the local processor, one does not.
- *
- * We make no fairness assumptions. They have a cost.
- */
-
-#define arch_spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) <= 0)
-#define arch_spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x))
-
-static inline void _raw_spin_unlock(spinlock_t *lock)
-{
- xchg(&lock->slock, 1);
-}
-
-static inline int _raw_spin_trylock(spinlock_t *lock)
-{
- return xchg(&lock->slock, 0) != 0 ? 1 : 0;
-}
-
-static inline void _raw_spin_lock(spinlock_t *lock)
-{
- volatile int was_locked;
- do {
- was_locked = xchg(&lock->slock, 0) == 0 ? 1 : 0;
- } while(was_locked);
-}
-
-#endif
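A hedged usage sketch for the xchg-based lock above, assuming the spinlock_t layout implied by ARCH_SPIN_LOCK_UNLOCKED: slock holds 1 when free; swapping in 0 takes the lock, and writing 1 back releases it:

    static spinlock_t demo_lock = ARCH_SPIN_LOCK_UNLOCKED;

    static void demo(void)
    {
        _raw_spin_lock(&demo_lock);
        /* critical section */
        _raw_spin_unlock(&demo_lock);
    }
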
diff --git a/extras/mini-os/include/arm/arm32/arch_wordsize.h b/extras/mini-os/include/arm/arm32/arch_wordsize.h
deleted file mode 100644
index b47eee9..0000000
--- a/extras/mini-os/include/arm/arm32/arch_wordsize.h
+++ /dev/null
@@ -1 +0,0 @@
-#define __WORDSIZE 32
diff --git a/extras/mini-os/include/arm/gic.h b/extras/mini-os/include/arm/gic.h
deleted file mode 100644
index cead2e5..0000000
--- a/extras/mini-os/include/arm/gic.h
+++ /dev/null
@@ -1 +0,0 @@
-void gic_init(void);
diff --git a/extras/mini-os/include/arm/hypercall-arm.h b/extras/mini-os/include/arm/hypercall-arm.h
deleted file mode 100644
index 26ac9f8..0000000
--- a/extras/mini-os/include/arm/hypercall-arm.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/******************************************************************************
- * hypercall-arm.h
- *
- * Copied from XenLinux.
- *
- * Copyright (c) 2002-2004, K A Fraser
- *
- * 64-bit updates:
- * Benjamin Liu <benjamin.liu at intel.com>
- * Jun Nakajima <jun.nakajima at intel.com>
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef __HYPERCALL_ARM_H__
-#define __HYPERCALL_ARM_H__
-
-#include <xen/xen.h>
-#include <xen/sched.h>
-#include <xen/xsm/flask_op.h>
-#include <mini-os/mm.h>
-
-int
-HYPERVISOR_sched_op(
- int cmd, void *arg);
-
-static inline int
-HYPERVISOR_shutdown(
- unsigned int reason)
-{
- struct sched_shutdown shutdown = { .reason = reason };
- return HYPERVISOR_sched_op(SCHEDOP_shutdown, &shutdown);
-}
-
-int
-HYPERVISOR_memory_op(
- unsigned int cmd, void *arg);
-
-int
-HYPERVISOR_event_channel_op(
- int cmd, void *op);
-
-int
-HYPERVISOR_xen_version(
- int cmd, void *arg);
-
-int
-HYPERVISOR_console_io(
- int cmd, int count, char *str);
-
-int
-HYPERVISOR_physdev_op(
- void *physdev_op);
-
-int
-HYPERVISOR_grant_table_op(
- unsigned int cmd, void *uop, unsigned int count);
-
-int
-HYPERVISOR_vcpu_op(
- int cmd, int vcpuid, void *extra_args);
-
-int
-HYPERVISOR_sysctl(
- unsigned long op);
-
-int
-HYPERVISOR_domctl(
- unsigned long op);
-
-int
-HYPERVISOR_hvm_op(
- unsigned long op, void *arg);
-
-int
-HYPERVISOR_xsm_op(
- struct xen_flask_op *);
-
-#endif /* __HYPERCALL_ARM_H__ */
diff --git a/extras/mini-os/include/arm/os.h b/extras/mini-os/include/arm/os.h
deleted file mode 100644
index 6a1cc37..0000000
--- a/extras/mini-os/include/arm/os.h
+++ /dev/null
@@ -1,216 +0,0 @@
-#ifndef _OS_H_
-#define _OS_H_
-
-#ifndef __ASSEMBLY__
-
-#include <mini-os/hypervisor.h>
-#include <mini-os/types.h>
-#include <mini-os/compiler.h>
-#include <mini-os/kernel.h>
-#include <xen/xen.h>
-
-void arch_fini(void);
-void timer_handler(evtchn_port_t port, struct pt_regs *regs, void *ign);
-
-extern void *device_tree;
-
-#define BUG() while(1){asm volatile (".word 0xe7f000f0\n");} /* Undefined instruction; will call our fault handler. */
-
-#define smp_processor_id() 0
-
-#define barrier() __asm__ __volatile__("": : :"memory")
-
-extern shared_info_t *HYPERVISOR_shared_info;
-
-// disable interrupts
-static inline void local_irq_disable(void) {
- __asm__ __volatile__("cpsid i":::"memory");
-}
-
-// enable interrupts
-static inline void local_irq_enable(void) {
- __asm__ __volatile__("cpsie i":::"memory");
-}
-
-#define local_irq_save(x) { \
- __asm__ __volatile__("mrs %0, cpsr;cpsid i":"=r"(x)::"memory"); \
-}
-
-#define local_irq_restore(x) { \
- __asm__ __volatile__("msr cpsr_c, %0"::"r"(x):"memory"); \
-}
-
-#define local_save_flags(x) { \
- __asm__ __volatile__("mrs %0, cpsr":"=r"(x)::"memory"); \
-}
-
-static inline int irqs_disabled(void) {
- int x;
- local_save_flags(x);
- return x & 0x80;
-}
-
-/* We probably only need "dmb" here, but we'll start by being paranoid. */
-#define mb() __asm__("dsb":::"memory");
-#define rmb() __asm__("dsb":::"memory");
-#define wmb() __asm__("dsb":::"memory");
-
-/************************** arm *******************************/
-#ifdef __INSIDE_MINIOS__
-#if defined (__arm__)
-#define xchg(ptr,v) __atomic_exchange_n(ptr, v, __ATOMIC_SEQ_CST)
-
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- *
- * This operation is atomic.
- * If you need a memory barrier, use synch_test_and_clear_bit instead.
- */
-static __inline__ int test_and_clear_bit(int nr, volatile void * addr)
-{
- uint8_t *byte = ((uint8_t *)addr) + (nr >> 3);
- uint8_t bit = 1 << (nr & 7);
- uint8_t orig;
-
- orig = __atomic_fetch_and(byte, ~bit, __ATOMIC_RELAXED);
-
- return (orig & bit) != 0;
-}
-
-/**
- * Atomically set a bit and return the old value.
- * Similar to test_and_clear_bit.
- */
-static __inline__ int test_and_set_bit(int nr, volatile void *base)
-{
- uint8_t *byte = ((uint8_t *)base) + (nr >> 3);
- uint8_t bit = 1 << (nr & 7);
- uint8_t orig;
-
- orig = __atomic_fetch_or(byte, bit, __ATOMIC_RELAXED);
-
- return (orig & bit) != 0;
-}
-
-/**
- * Test whether a bit is set. */
-static __inline__ int test_bit(int nr, const volatile unsigned long *addr)
-{
- const uint8_t *ptr = (const uint8_t *) addr;
- return ((1 << (nr & 7)) & (ptr[nr >> 3])) != 0;
-}
-
-/**
- * Atomically set a bit in memory (like test_and_set_bit but discards result).
- */
-static __inline__ void set_bit(int nr, volatile unsigned long *addr)
-{
- test_and_set_bit(nr, addr);
-}
-
-/**
- * Atomically clear a bit in memory (like test_and_clear_bit but discards result).
- */
-static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
-{
- test_and_clear_bit(nr, addr);
-}
-
-/**
- * __ffs - find first (lowest) set bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static __inline__ unsigned long __ffs(unsigned long word)
-{
- int clz;
-
- /* xxxxx10000 = word
- * xxxxx01111 = word - 1
- * 0000011111 = word ^ (word - 1)
- * 4 = 31 - clz(word ^ (word - 1))
- */
-
- __asm__ (
- "sub r0, %[word], #1\n"
- "eor r0, r0, %[word]\n"
- "clz %[clz], r0\n":
- /* Outputs: */
- [clz] "=r"(clz):
- /* Inputs: */
- [word] "r"(word):
- /* Clobbers: */
- "r0");
-
- return 31 - clz;
-}
-
-#else /* ifdef __arm__ */
-#error "Unsupported architecture"
-#endif
-#endif /* ifdef __INSIDE_MINIOS__ */
-
-/********************* common arm32 and arm64 ****************************/
-
-/* If *ptr == old, then store new there (and return new).
- * Otherwise, return the old value.
- * Atomic. */
-#define synch_cmpxchg(ptr, old, new) \
-({ __typeof__(*ptr) stored = old; \
- __atomic_compare_exchange_n(ptr, &stored, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ? new : old; \
-})
-
-/* As test_and_clear_bit, but using __ATOMIC_SEQ_CST */
-static __inline__ int synch_test_and_clear_bit(int nr, volatile void *addr)
-{
- uint8_t *byte = ((uint8_t *)addr) + (nr >> 3);
- uint8_t bit = 1 << (nr & 7);
- uint8_t orig;
-
- orig = __atomic_fetch_and(byte, ~bit, __ATOMIC_SEQ_CST);
-
- return (orig & bit) != 0;
-}
-
-/* As test_and_set_bit, but using __ATOMIC_SEQ_CST */
-static __inline__ int synch_test_and_set_bit(int nr, volatile void *base)
-{
- uint8_t *byte = ((uint8_t *)base) + (nr >> 3);
- uint8_t bit = 1 << (nr & 7);
- uint8_t orig;
-
- orig = __atomic_fetch_or(byte, bit, __ATOMIC_SEQ_CST);
-
- return (orig & bit) != 0;
-}
-
-/* As set_bit, but using __ATOMIC_SEQ_CST */
-static __inline__ void synch_set_bit(int nr, volatile void *addr)
-{
- synch_test_and_set_bit(nr, addr);
-}
-
-/* As clear_bit, but using __ATOMIC_SEQ_CST */
-static __inline__ void synch_clear_bit(int nr, volatile void *addr)
-{
- synch_test_and_clear_bit(nr, addr);
-}
-
-/* As test_bit, but with a following memory barrier. */
-static __inline__ int synch_test_bit(int nr, volatile void *addr)
-{
- int result;
- result = test_bit(nr, addr);
- barrier();
- return result;
-}
-
-#endif /* not assembly */
-
-#endif
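A worked example of the __ffs() trick above: for word = 0x90, word ^ (word - 1) = 0x1f, clz(0x1f) = 27 in a 32-bit register, and 31 - 27 = 4, the index of the lowest set bit. A hypothetical scan loop combining it with the IRQ macros:

    static void drain_pending(unsigned long pending)
    {
        unsigned long flags;

        local_irq_save(flags);               /* mask IRQs, remember CPSR */
        while (pending) {
            unsigned long bit = __ffs(pending); /* 0x90 yields 4, then 7 */
            pending &= ~(1UL << bit);
            /* ... handle event 'bit' ... */
        }
        local_irq_restore(flags);            /* restore saved IRQ state */
    }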
diff --git a/extras/mini-os/include/arm/traps.h b/extras/mini-os/include/arm/traps.h
deleted file mode 100644
index 704df22..0000000
--- a/extras/mini-os/include/arm/traps.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _TRAPS_H_
-#define _TRAPS_H_
-
-struct pt_regs {
- unsigned long r0;
- unsigned long r1;
- unsigned long r2;
- unsigned long r3;
- unsigned long r4;
- unsigned long r5;
- unsigned long r6;
- unsigned long r7;
- unsigned long r8;
- unsigned long r9;
- unsigned long r10;
- unsigned long r11;
- unsigned long r12;
-};
-
-#endif
diff --git a/extras/mini-os/include/blkfront.h b/extras/mini-os/include/blkfront.h
deleted file mode 100644
index 3528af9..0000000
--- a/extras/mini-os/include/blkfront.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#include <mini-os/wait.h>
-#include <xen/io/blkif.h>
-#include <mini-os/types.h>
-struct blkfront_dev;
-struct blkfront_aiocb
-{
- struct blkfront_dev *aio_dev;
- uint8_t *aio_buf;
- size_t aio_nbytes;
- off_t aio_offset;
- size_t total_bytes;
- uint8_t is_write;
- void *data;
-
- grant_ref_t gref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- int n;
-
- void (*aio_cb)(struct blkfront_aiocb *aiocb, int ret);
-};
-struct blkfront_info
-{
- uint64_t sectors;
- unsigned sector_size;
- int mode;
- int info;
- int barrier;
- int flush;
-};
-struct blkfront_dev *init_blkfront(char *nodename, struct blkfront_info *info);
-#ifdef HAVE_LIBC
-#include <sys/stat.h>
-/* POSIX IO functions:
- * use blkfront_open() to get a file descriptor to the block device
- * Don't use the other blkfront posix functions here directly, instead use
- * read(), write(), lseek() and fstat() on the file descriptor
- */
-int blkfront_open(struct blkfront_dev *dev);
-int blkfront_posix_rwop(int fd, uint8_t* buf, size_t count, int write);
-#define blkfront_posix_write(fd, buf, count) blkfront_posix_rwop(fd, (uint8_t*)buf, count, 1)
-#define blkfront_posix_read(fd, buf, count) blkfront_posix_rwop(fd, (uint8_t*)buf, count, 0)
-int blkfront_posix_fstat(int fd, struct stat* buf);
-#endif
-void blkfront_aio(struct blkfront_aiocb *aiocbp, int write);
-#define blkfront_aio_read(aiocbp) blkfront_aio(aiocbp, 0)
-#define blkfront_aio_write(aiocbp) blkfront_aio(aiocbp, 1)
-void blkfront_io(struct blkfront_aiocb *aiocbp, int write);
-#define blkfront_read(aiocbp) blkfront_io(aiocbp, 0)
-#define blkfront_write(aiocbp) blkfront_io(aiocbp, 1)
-void blkfront_aio_push_operation(struct blkfront_aiocb *aiocbp, uint8_t op);
-int blkfront_aio_poll(struct blkfront_dev *dev);
-void blkfront_sync(struct blkfront_dev *dev);
-void shutdown_blkfront(struct blkfront_dev *dev);
-
-extern struct wait_queue_head blkfront_queue;
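A hypothetical sketch of one asynchronous read through this interface: 'dev' comes from init_blkfront(), the aiocb must stay live until aio_cb fires, and blkfront_aio_poll() is assumed here to return the number of completions it processed:

    static void read_done(struct blkfront_aiocb *aiocb, int ret)
    {
        /* ret == 0 on success; aiocb->aio_buf now holds the data */
    }

    static void read_first_page(struct blkfront_dev *dev, uint8_t *buf)
    {
        struct blkfront_aiocb req = {
            .aio_dev    = dev,
            .aio_buf    = buf,        /* page-aligned buffer */
            .aio_nbytes = 4096,
            .aio_offset = 0,
            .aio_cb     = read_done,
        };

        blkfront_aio_read(&req);
        while (blkfront_aio_poll(dev) == 0)
            ;                         /* poll until our completion runs */
    }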
diff --git a/extras/mini-os/include/byteorder.h b/extras/mini-os/include/byteorder.h
deleted file mode 100644
index c0e29df..0000000
--- a/extras/mini-os/include/byteorder.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef MINIOS_BYTEORDER_H
-#define MINIOS_BYTEORDER_H
-
-#include <mini-os/byteswap.h>
-#include <mini-os/endian.h>
-
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define be16_to_cpu(v) bswap_16(v)
-#define be32_to_cpu(v) bswap_32(v)
-#define be64_to_cpu(v) bswap_64(v)
-
-#define le16_to_cpu(v) (v)
-#define le32_to_cpu(v) (v)
-#define le64_to_cpu(v) (v)
-
-#else /*__BIG_ENDIAN*/
-#define be16_to_cpu(v) (v)
-#define be32_to_cpu(v) (v)
-#define be64_to_cpu(v) (v)
-
-#define le16_to_cpu(v) bswap_16(v)
-#define le32_to_cpu(v) bswap_32(v)
-#define le64_to_cpu(v) bswap_64(v)
-
-#endif
-
-#define cpu_to_be16(v) be16_to_cpu(v)
-#define cpu_to_be32(v) be32_to_cpu(v)
-#define cpu_to_be64(v) be64_to_cpu(v)
-
-#define cpu_to_le16(v) le16_to_cpu(v)
-#define cpu_to_le32(v) le32_to_cpu(v)
-#define cpu_to_le64(v) le64_to_cpu(v)
-
-
-#endif
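The conversions above are involutions (swapping twice is the identity), which is why cpu_to_be16 can simply alias be16_to_cpu. A hypothetical round-trip:

    static void byteorder_demo(void)
    {
        uint16_t wire = cpu_to_be16(0x1234); /* 0x3412 on a little-endian CPU */
        uint16_t host = be16_to_cpu(wire);   /* back to 0x1234 either way */
        (void)host;
    }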
diff --git a/extras/mini-os/include/byteswap.h b/extras/mini-os/include/byteswap.h
deleted file mode 100644
index 992c8bd..0000000
--- a/extras/mini-os/include/byteswap.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef _BYTESWAP_H_
-#define _BYTESWAP_H_
-
-/* Unfortunately not provided by newlib. */
-
-#include <mini-os/types.h>
-
-#define bswap_16(x) ((uint16_t)( \
- (((uint16_t)(x) & (uint16_t)0x00ffU) << 8) | \
- (((uint16_t)(x) & (uint16_t)0xff00U) >> 8)))
-
-/* Use gcc optimized versions if they exist */
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
-#define bswap_32(v) __builtin_bswap32(v)
-#define bswap_64(v) __builtin_bswap64(v)
-#else
-
-#define bswap_32(x) ((uint32_t)( \
- (((uint32_t)(x) & (uint32_t)0x000000ffUL) << 24) | \
- (((uint32_t)(x) & (uint32_t)0x0000ff00UL) << 8) | \
- (((uint32_t)(x) & (uint32_t)0x00ff0000UL) >> 8) | \
- (((uint32_t)(x) & (uint32_t)0xff000000UL) >> 24)))
-
-#define bswap_64(x) ((uint64_t)( \
- (((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 56) | \
- (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 40) | \
- (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 24) | \
- (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 8) | \
- (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 8) | \
- (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 24) | \
- (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 40) | \
- (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 56)))
-
-#endif
-
-
-
-
-#endif /* _BYTESWAP_H_ */
diff --git a/extras/mini-os/include/compiler.h b/extras/mini-os/include/compiler.h
deleted file mode 100644
index 4188277..0000000
--- a/extras/mini-os/include/compiler.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __MINIOS_COMPILER_H_
-#define __MINIOS_COMPILER_H_
-
-#if __GNUC__ == 2 && __GNUC_MINOR__ < 96
-#define __builtin_expect(x, expected_value) (x)
-#endif
-#define unlikely(x) __builtin_expect(!!(x),0)
-#define likely(x) __builtin_expect(!!(x),1)
-
-#endif /* __MINIOS_COMPILER_H_ */
diff --git a/extras/mini-os/include/console.h b/extras/mini-os/include/console.h
deleted file mode 100644
index 3755b66..0000000
--- a/extras/mini-os/include/console.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2006 - Grzegorz Milos - Cambridge University
- ****************************************************************************
- *
- * File: console.h
- * Author: Grzegorz Milos
- * Changes:
- *
- * Date: Mar 2006
- *
- * Environment: Xen Minimal OS
- * Description: Console interface.
- *
- * Handles console I/O. Defines printk.
- *
- ****************************************************************************
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-#ifndef _LIB_CONSOLE_H_
-#define _LIB_CONSOLE_H_
-
-#include <mini-os/os.h>
-#include <mini-os/traps.h>
-#include <mini-os/types.h>
-#include <xen/grant_table.h>
-#include <xenbus.h>
-#include <xen/io/console.h>
-#include <stdarg.h>
-
-struct consfront_dev {
- domid_t dom;
-
- struct xencons_interface *ring;
- grant_ref_t ring_ref;
- evtchn_port_t evtchn;
-
- char *nodename;
- char *backend;
-
- xenbus_event_queue events;
-
-#ifdef HAVE_LIBC
- int fd;
-#endif
-};
-
-
-
-void print(int direct, const char *fmt, va_list args);
-void printk(const char *fmt, ...);
-void xprintk(const char *fmt, ...);
-
-#define tprintk(_fmt, _args...) printk("[%s] " _fmt, current->name, ##_args)
-
-void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
-void xencons_tx(void);
-
-void init_console(void);
-void console_print(struct consfront_dev *dev, char *data, int length);
-void fini_console(struct consfront_dev *dev);
-
-/* Low level functions defined in xencons_ring.c */
-extern struct wait_queue_head console_queue;
-struct consfront_dev *xencons_ring_init(void);
-struct consfront_dev *init_consfront(char *_nodename);
-int xencons_ring_send(struct consfront_dev *dev, const char *data, unsigned len);
-int xencons_ring_send_no_notify(struct consfront_dev *dev, const char *data, unsigned len);
-int xencons_ring_avail(struct consfront_dev *dev);
-int xencons_ring_recv(struct consfront_dev *dev, char *data, unsigned len);
-void free_consfront(struct consfront_dev *dev);
-
-#endif /* _LIB_CONSOLE_H_ */
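A hypothetical sketch of a secondary console opened through this interface (the xenstore node name is illustrative):

    static void say_hello(void)
    {
        struct consfront_dev *dev = init_consfront("device/console/1");

        if (dev) {
            console_print(dev, "hello\n", 6);
            fini_console(dev);
        }
    }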
diff --git a/extras/mini-os/include/ctype.h b/extras/mini-os/include/ctype.h
deleted file mode 100644
index ac0dd67..0000000
--- a/extras/mini-os/include/ctype.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef _CTYPE_H
-#define _CTYPE_H
-
-#ifdef HAVE_LIBC
-#include_next <ctype.h>
-#else
-/*
- * NOTE! This ctype does not handle EOF like the standard C
- * library is required to.
- */
-
-#define _U 0x01 /* upper */
-#define _L 0x02 /* lower */
-#define _D 0x04 /* digit */
-#define _C 0x08 /* cntrl */
-#define _P 0x10 /* punct */
-#define _S 0x20 /* white space (space/lf/tab) */
-#define _X 0x40 /* hex digit */
-#define _SP 0x80 /* hard space (0x20) */
-
-
-extern unsigned char _ctype[];
-
-#define __ismask(x) (_ctype[(int)(unsigned char)(x)])
-
-#define isalnum(c) ((__ismask(c)&(_U|_L|_D)) != 0)
-#define isalpha(c) ((__ismask(c)&(_U|_L)) != 0)
-#define iscntrl(c) ((__ismask(c)&(_C)) != 0)
-#define isdigit(c) ((__ismask(c)&(_D)) != 0)
-#define isgraph(c) ((__ismask(c)&(_P|_U|_L|_D)) != 0)
-#define islower(c) ((__ismask(c)&(_L)) != 0)
-#define isprint(c) ((__ismask(c)&(_P|_U|_L|_D|_SP)) != 0)
-#define ispunct(c) ((__ismask(c)&(_P)) != 0)
-#define isspace(c) ((__ismask(c)&(_S)) != 0)
-#define isupper(c) ((__ismask(c)&(_U)) != 0)
-#define isxdigit(c) ((__ismask(c)&(_D|_X)) != 0)
-
-#define isascii(c) (((unsigned char)(c))<=0x7f)
-#define toascii(c) (((unsigned char)(c))&0x7f)
-
-static inline unsigned char __tolower(unsigned char c)
-{
- if (isupper(c))
- c -= 'A'-'a';
- return c;
-}
-
-static inline unsigned char __toupper(unsigned char c)
-{
- if (islower(c))
- c -= 'a'-'A';
- return c;
-}
-
-#define tolower(c) __tolower(c)
-#define toupper(c) __toupper(c)
-
-#endif
-
-#endif
diff --git a/extras/mini-os/include/endian.h b/extras/mini-os/include/endian.h
deleted file mode 100644
index 5345517..0000000
--- a/extras/mini-os/include/endian.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef _ENDIAN_H_
-#define _ENDIAN_H_
-
-#define __LITTLE_ENDIAN 1234
-#define __BIG_ENDIAN 4321
-#define __PDP_ENDIAN 3412
-
-#define ARCH_ENDIAN_H
-/* This will define __BYTE_ORDER for the current arch */
-#include <arch_endian.h>
-#undef ARCH_ENDIAN_H
-
-#include <arch_wordsize.h>
-
-#define BYTE_ORDER __BYTE_ORDER
-#define BIG_ENDIAN __BIG_ENDIAN
-#define LITTLE_ENDIAN __LITTLE_ENDIAN
-
-#endif /* endian.h */
diff --git a/extras/mini-os/include/err.h b/extras/mini-os/include/err.h
deleted file mode 100644
index 4e19619..0000000
--- a/extras/mini-os/include/err.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef _ERR_H
-#define _ERR_H
-
-#include <mini-os/errno.h>
-
-/*
- * Kernel pointers have redundant information, so we can use a
- * scheme where we can return either an error code or a dentry
- * pointer with the same return value.
- *
- * This should be a per-architecture thing, to allow different
- * error and pointer decisions.
- */
-#define IS_ERR_VALUE(x) ((x) > (unsigned long)-1000L)
-
-static inline void *ERR_PTR(long error)
-{
- return (void *) error;
-}
-
-static inline long PTR_ERR(const void *ptr)
-{
- return (long) ptr;
-}
-
-static inline long IS_ERR(const void *ptr)
-{
- return IS_ERR_VALUE((unsigned long)ptr);
-}
-
-#endif /* _ERR_H */
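A hypothetical sketch of the pointer-or-errno scheme this header describes: a single pointer-sized return carries either a valid pointer or a small negative error code (printk() is from mini-os/console.h, the table is invented for illustration):

    static int some_table[16];               /* hypothetical data */

    static void *lookup(int key)
    {
        if (key < 0 || key >= 16)
            return ERR_PTR(-EINVAL);         /* error folded into the pointer */
        return &some_table[key];
    }

    static void lookup_demo(int k)
    {
        void *p = lookup(k);
        if (IS_ERR(p))
            printk("lookup failed: %ld\n", PTR_ERR(p));
    }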
diff --git a/extras/mini-os/include/errno-base.h b/extras/mini-os/include/errno-base.h
deleted file mode 100644
index 036a080..0000000
--- a/extras/mini-os/include/errno-base.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef _ERRNO_BASE_H
-#define _ERRNO_BASE_H
-
-#define EPERM 1 /* Operation not permitted */
-#define ENOENT 2 /* No such file or directory */
-#define ESRCH 3 /* No such process */
-#define EINTR 4 /* Interrupted system call */
-#define EIO 5 /* I/O error */
-#define ENXIO 6 /* No such device or address */
-#define E2BIG 7 /* Argument list too long */
-#define ENOEXEC 8 /* Exec format error */
-#define EBADF 9 /* Bad file number */
-#define ECHILD 10 /* No child processes */
-#define EAGAIN 11 /* Try again */
-#define ENOMEM 12 /* Out of memory */
-#define EACCES 13 /* Permission denied */
-#define EFAULT 14 /* Bad address */
-#define ENOTBLK 15 /* Block device required */
-#define EBUSY 16 /* Device or resource busy */
-#define EEXIST 17 /* File exists */
-#define EXDEV 18 /* Cross-device link */
-#define ENODEV 19 /* No such device */
-#define ENOTDIR 20 /* Not a directory */
-#define EISDIR 21 /* Is a directory */
-#define EINVAL 22 /* Invalid argument */
-#define ENFILE 23 /* File table overflow */
-#define EMFILE 24 /* Too many open files */
-#define ENOTTY 25 /* Not a typewriter */
-#define ETXTBSY 26 /* Text file busy */
-#define EFBIG 27 /* File too large */
-#define ENOSPC 28 /* No space left on device */
-#define ESPIPE 29 /* Illegal seek */
-#define EROFS 30 /* Read-only file system */
-#define EMLINK 31 /* Too many links */
-#define EPIPE 32 /* Broken pipe */
-#define EDOM 33 /* Math argument out of domain of func */
-#define ERANGE 34 /* Math result not representable */
-
-#endif
diff --git a/extras/mini-os/include/errno.h b/extras/mini-os/include/errno.h
deleted file mode 100644
index f71b131..0000000
--- a/extras/mini-os/include/errno.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef _ERRNO_H
-#define _ERRNO_H
-
-#include <mini-os/errno-base.h>
-
-typedef int error_t;
-
-#define EDEADLK 35 /* Resource deadlock would occur */
-#define ENAMETOOLONG 36 /* File name too long */
-#define ENOLCK 37 /* No record locks available */
-#define ENOSYS 38 /* Function not implemented */
-#define ENOTEMPTY 39 /* Directory not empty */
-#define ELOOP 40 /* Too many symbolic links encountered */
-#define EWOULDBLOCK EAGAIN /* Operation would block */
-#define ENOMSG 42 /* No message of desired type */
-#define EIDRM 43 /* Identifier removed */
-#define ECHRNG 44 /* Channel number out of range */
-#define EL2NSYNC 45 /* Level 2 not synchronized */
-#define EL3HLT 46 /* Level 3 halted */
-#define EL3RST 47 /* Level 3 reset */
-#define ELNRNG 48 /* Link number out of range */
-#define EUNATCH 49 /* Protocol driver not attached */
-#define ENOCSI 50 /* No CSI structure available */
-#define EL2HLT 51 /* Level 2 halted */
-#define EBADE 52 /* Invalid exchange */
-#define EBADR 53 /* Invalid request descriptor */
-#define EXFULL 54 /* Exchange full */
-#define ENOANO 55 /* No anode */
-#define EBADRQC 56 /* Invalid request code */
-#define EBADSLT 57 /* Invalid slot */
-
-#define EDEADLOCK EDEADLK
-
-#define EBFONT 59 /* Bad font file format */
-#define ENOSTR 60 /* Device not a stream */
-#define ENODATA 61 /* No data available */
-#define ETIME 62 /* Timer expired */
-#define ENOSR 63 /* Out of streams resources */
-#define ENONET 64 /* Machine is not on the network */
-#define ENOPKG 65 /* Package not installed */
-#define EREMOTE 66 /* Object is remote */
-#define ENOLINK 67 /* Link has been severed */
-#define EADV 68 /* Advertise error */
-#define ESRMNT 69 /* Srmount error */
-#define ECOMM 70 /* Communication error on send */
-#define EPROTO 71 /* Protocol error */
-#define EMULTIHOP 72 /* Multihop attempted */
-#define EDOTDOT 73 /* RFS specific error */
-#define EBADMSG 74 /* Not a data message */
-#define EOVERFLOW 75 /* Value too large for defined data type */
-#define ENOTUNIQ 76 /* Name not unique on network */
-#define EBADFD 77 /* File descriptor in bad state */
-#define EREMCHG 78 /* Remote address changed */
-#define ELIBACC 79 /* Can not access a needed shared library */
-#define ELIBBAD 80 /* Accessing a corrupted shared library */
-#define ELIBSCN 81 /* .lib section in a.out corrupted */
-#define ELIBMAX 82 /* Attempting to link in too many shared libraries */
-#define ELIBEXEC 83 /* Cannot exec a shared library directly */
-#define EILSEQ 84 /* Illegal byte sequence */
-#define ERESTART 85 /* Interrupted system call should be restarted */
-#define ESTRPIPE 86 /* Streams pipe error */
-#define EUSERS 87 /* Too many users */
-#define ENOTSOCK 88 /* Socket operation on non-socket */
-#define EDESTADDRREQ 89 /* Destination address required */
-#define EMSGSIZE 90 /* Message too long */
-#define EPROTOTYPE 91 /* Protocol wrong type for socket */
-#define ENOPROTOOPT 92 /* Protocol not available */
-#define EPROTONOSUPPORT 93 /* Protocol not supported */
-#define ESOCKTNOSUPPORT 94 /* Socket type not supported */
-#define EOPNOTSUPP 95 /* Operation not supported on transport endpoint */
-#define ENOTSUP EOPNOTSUPP
-#define EPFNOSUPPORT 96 /* Protocol family not supported */
-#define EAFNOSUPPORT 97 /* Address family not supported by protocol */
-#define EADDRINUSE 98 /* Address already in use */
-#define EADDRNOTAVAIL 99 /* Cannot assign requested address */
-#define ENETDOWN 100 /* Network is down */
-#define ENETUNREACH 101 /* Network is unreachable */
-#define ENETRESET 102 /* Network dropped connection because of reset */
-#define ECONNABORTED 103 /* Software caused connection abort */
-#define ECONNRESET 104 /* Connection reset by peer */
-#define ENOBUFS 105 /* No buffer space available */
-#define EISCONN 106 /* Transport endpoint is already connected */
-#define ENOTCONN 107 /* Transport endpoint is not connected */
-#define ESHUTDOWN 108 /* Cannot send after transport endpoint shutdown */
-#define ETOOMANYREFS 109 /* Too many references: cannot splice */
-#define ETIMEDOUT 110 /* Connection timed out */
-#define ECONNREFUSED 111 /* Connection refused */
-#define EHOSTDOWN 112 /* Host is down */
-#define EHOSTUNREACH 113 /* No route to host */
-#define EALREADY 114 /* Operation already in progress */
-#define EINPROGRESS 115 /* Operation now in progress */
-#define ESTALE 116 /* Stale NFS file handle */
-#define EUCLEAN 117 /* Structure needs cleaning */
-#define ENOTNAM 118 /* Not a XENIX named type file */
-#define ENAVAIL 119 /* No XENIX semaphores available */
-#define EISNAM 120 /* Is a named type file */
-#define EREMOTEIO 121 /* Remote I/O error */
-#define EDQUOT 122 /* Quota exceeded */
-
-#define ENOMEDIUM 123 /* No medium found */
-#define EMEDIUMTYPE 124 /* Wrong medium type */
-#define ECANCELED 125 /* Operation Canceled */
-#define ENOKEY 126 /* Required key not available */
-#define EKEYEXPIRED 127 /* Key has expired */
-#define EKEYREVOKED 128 /* Key has been revoked */
-#define EKEYREJECTED 129 /* Key was rejected by service */
-
-/* for robust mutexes */
-#define EOWNERDEAD 130 /* Owner died */
-#define ENOTRECOVERABLE 131 /* State not recoverable */
-
-
-#define EFTYPE 132 /* Inappropriate file type or format */
-
-#ifdef HAVE_LIBC
-#include <mini-os/sched.h>
-extern int errno;
-#define ERRNO
-#define errno (get_current()->reent._errno)
-#endif
-
-#endif
diff --git a/extras/mini-os/include/events.h b/extras/mini-os/include/events.h
deleted file mode 100644
index 89b5997..0000000
--- a/extras/mini-os/include/events.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- ****************************************************************************
- *
- * File: events.h
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes: Grzegorz Milos (gm281 at cam.ac.uk)
- *
- * Date: Jul 2003, changes Jun 2005
- *
- * Environment: Xen Minimal OS
- * Description: Deals with events on the event channels
- *
- ****************************************************************************
- */
-
-#ifndef _EVENTS_H_
-#define _EVENTS_H_
-
-#include <mini-os/traps.h>
-#include <xen/event_channel.h>
-
-typedef void (*evtchn_handler_t)(evtchn_port_t, struct pt_regs *, void *);
-
-/* prototypes */
-void arch_init_events(void);
-
-/* Called by fini_events to close any ports opened by arch-specific code. */
-void arch_unbind_ports(void);
-
-void arch_fini_events(void);
-
-int do_event(evtchn_port_t port, struct pt_regs *regs);
-evtchn_port_t bind_virq(uint32_t virq, evtchn_handler_t handler, void *data);
-evtchn_port_t bind_pirq(uint32_t pirq, int will_share, evtchn_handler_t handler, void *data);
-evtchn_port_t bind_evtchn(evtchn_port_t port, evtchn_handler_t handler,
- void *data);
-void unbind_evtchn(evtchn_port_t port);
-void init_events(void);
-int evtchn_alloc_unbound(domid_t pal, evtchn_handler_t handler,
- void *data, evtchn_port_t *port);
-int evtchn_bind_interdomain(domid_t pal, evtchn_port_t remote_port,
- evtchn_handler_t handler, void *data,
- evtchn_port_t *local_port);
-int evtchn_get_peercontext(evtchn_port_t local_port, char *ctx, int size);
-void unbind_all_ports(void);
-
-static inline int notify_remote_via_evtchn(evtchn_port_t port)
-{
- evtchn_send_t op;
- op.port = port;
- return HYPERVISOR_event_channel_op(EVTCHNOP_send, &op);
-}
-
-void fini_events(void);
-
-#endif /* _EVENTS_H_ */
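A hypothetical sketch: bind the timer VIRQ to a handler and unmask its port (unmask_evtchn() is declared in mini-os/hypervisor.h, VIRQ_TIMER in xen/xen.h):

    static void timer_tick(evtchn_port_t port, struct pt_regs *regs, void *data)
    {
        /* runs in event-callback context; keep it short */
    }

    static void timer_setup(void)
    {
        evtchn_port_t port = bind_virq(VIRQ_TIMER, timer_tick, NULL);
        unmask_evtchn(port);              /* events start arriving now */
    }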
diff --git a/extras/mini-os/include/fbfront.h b/extras/mini-os/include/fbfront.h
deleted file mode 100644
index d4851a4..0000000
--- a/extras/mini-os/include/fbfront.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#include <xen/io/kbdif.h>
-#include <xen/io/fbif.h>
-#include <mini-os/semaphore.h>
-#include <mini-os/wait.h>
-
-/* from <linux/input.h> */
-#ifndef BTN_LEFT
-#define BTN_LEFT 0x110
-#endif
-#ifndef BTN_RIGHT
-#define BTN_RIGHT 0x111
-#endif
-#ifndef BTN_MIDDLE
-#define BTN_MIDDLE 0x112
-#endif
-#ifndef KEY_Q
-#define KEY_Q 16
-#endif
-#ifndef KEY_MAX
-#define KEY_MAX 0x1ff
-#endif
-
-
-struct kbdfront_dev;
-struct kbdfront_dev *init_kbdfront(char *nodename, int abs_pointer);
-#ifdef HAVE_LIBC
-int kbdfront_open(struct kbdfront_dev *dev);
-#endif
-
-int kbdfront_receive(struct kbdfront_dev *dev, union xenkbd_in_event *buf, int n);
-extern struct wait_queue_head kbdfront_queue;
-
-void shutdown_kbdfront(struct kbdfront_dev *dev);
-
-
-struct fbfront_dev *init_fbfront(char *nodename, unsigned long *mfns, int width, int height, int depth, int stride, int n);
-#ifdef HAVE_LIBC
-int fbfront_open(struct fbfront_dev *dev);
-#endif
-
-int fbfront_receive(struct fbfront_dev *dev, union xenfb_in_event *buf, int n);
-extern struct wait_queue_head fbfront_queue;
-void fbfront_update(struct fbfront_dev *dev, int x, int y, int width, int height);
-void fbfront_resize(struct fbfront_dev *dev, int width, int height, int stride, int depth, int offset);
-
-void shutdown_fbfront(struct fbfront_dev *dev);
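A hypothetical sketch of draining keyboard events with this interface (XENKBD_TYPE_KEY and the event layout come from xen/io/kbdif.h; KEY_Q is the fallback definition above):

    static int saw_quit(struct kbdfront_dev *kbd)
    {
        union xenkbd_in_event ev[8];
        int i, n = kbdfront_receive(kbd, ev, 8);

        for (i = 0; i < n; i++)
            if (ev[i].type == XENKBD_TYPE_KEY &&
                ev[i].key.pressed && ev[i].key.keycode == KEY_Q)
                return 1;                 /* 'q' pressed: quit */
        return 0;
    }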
diff --git a/extras/mini-os/include/fcntl.h b/extras/mini-os/include/fcntl.h
deleted file mode 100644
index cc59b3c..0000000
--- a/extras/mini-os/include/fcntl.h
+++ /dev/null
@@ -1,99 +0,0 @@
-#ifndef _I386_FCNTL_H
-#define _I386_FCNTL_H
-
-#ifdef HAVE_LIBC
-#include_next <fcntl.h>
-#else
-
-/* open/fcntl - O_SYNC is only implemented on blocks devices and on files
- located on an ext2 file system */
-#define O_ACCMODE 0003
-#define O_RDONLY 00
-#define O_WRONLY 01
-#define O_RDWR 02
-#define O_CREAT 0100 /* not fcntl */
-#define O_EXCL 0200 /* not fcntl */
-#define O_NOCTTY 0400 /* not fcntl */
-#define O_TRUNC 01000 /* not fcntl */
-#define O_APPEND 02000
-#define O_NONBLOCK 04000
-#define O_NDELAY O_NONBLOCK
-#define O_SYNC 010000
-#define FASYNC 020000 /* fcntl, for BSD compatibility */
-#define O_DIRECT 040000 /* direct disk access hint */
-#define O_LARGEFILE 0100000
-#define O_DIRECTORY 0200000 /* must be a directory */
-#define O_NOFOLLOW 0400000 /* don't follow links */
-#define O_NOATIME 01000000
-
-#define F_DUPFD 0 /* dup */
-#define F_GETFD 1 /* get close_on_exec */
-#define F_SETFD 2 /* set/clear close_on_exec */
-#define F_GETFL 3 /* get file->f_flags */
-#define F_SETFL 4 /* set file->f_flags */
-#define F_GETLK 5
-#define F_SETLK 6
-#define F_SETLKW 7
-
-#define F_SETOWN 8 /* for sockets. */
-#define F_GETOWN 9 /* for sockets. */
-#define F_SETSIG 10 /* for sockets. */
-#define F_GETSIG 11 /* for sockets. */
-
-#define F_GETLK64 12 /* using 'struct flock64' */
-#define F_SETLK64 13
-#define F_SETLKW64 14
-
-/* for F_[GET|SET]FL */
-#define FD_CLOEXEC 1 /* actually anything with low bit set goes */
-
-/* for posix fcntl() and lockf() */
-#define F_RDLCK 0
-#define F_WRLCK 1
-#define F_UNLCK 2
-
-/* for old implementation of bsd flock () */
-#define F_EXLCK 4 /* or 3 */
-#define F_SHLCK 8 /* or 4 */
-
-/* for leases */
-#define F_INPROGRESS 16
-
-/* operations for bsd flock(), also used by the kernel implementation */
-#define LOCK_SH 1 /* shared lock */
-#define LOCK_EX 2 /* exclusive lock */
-#define LOCK_NB 4 /* or'd with one of the above to prevent
- blocking */
-#define LOCK_UN 8 /* remove lock */
-
-#define LOCK_MAND 32 /* This is a mandatory flock */
-#define LOCK_READ 64 /* ... Which allows concurrent read operations */
-#define LOCK_WRITE 128 /* ... Which allows concurrent write operations */
-#define LOCK_RW 192 /* ... Which allows concurrent read & write ops */
-
-/*
-struct flock {
- short l_type;
- short l_whence;
- off_t l_start;
- off_t l_len;
- pid_t l_pid;
-};
-
-struct flock64 {
- short l_type;
- short l_whence;
- loff_t l_start;
- loff_t l_len;
- pid_t l_pid;
-};
-
-#define F_LINUX_SPECIFIC_BASE 1024
-*/
-
-#endif
-
-int open(const char *path, int flags, ...) asm("open64");
-int fcntl(int fd, int cmd, ...);
-
-#endif
diff --git a/extras/mini-os/include/gntmap.h b/extras/mini-os/include/gntmap.h
deleted file mode 100644
index fde53f3..0000000
--- a/extras/mini-os/include/gntmap.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef __GNTMAP_H__
-#define __GNTMAP_H__
-
-#include <os.h>
-
-/*
- * Please consider struct gntmap opaque. If instead you choose to disregard
- * this message, I insist that you keep an eye out for raptors.
- */
-struct gntmap {
- int nentries;
- struct gntmap_entry *entries;
-};
-
-int
-gntmap_set_max_grants(struct gntmap *map, int count);
-
-int
-gntmap_munmap(struct gntmap *map, unsigned long start_address, int count);
-
-void*
-gntmap_map_grant_refs(struct gntmap *map,
- uint32_t count,
- uint32_t *domids,
- int domids_stride,
- uint32_t *refs,
- int writable);
-
-void
-gntmap_init(struct gntmap *map);
-
-void
-gntmap_fini(struct gntmap *map);
-
-#endif /* !__GNTMAP_H__ */
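A hypothetical sketch: map two grant references from one peer into our address space, then tear the mapping down (a stride of 0 reuses the same domid for every ref, per the gntmap_map_grant_refs() signature):

    static void map_two(domid_t peer, uint32_t ref0, uint32_t ref1)
    {
        struct gntmap map;
        uint32_t refs[2] = { ref0, ref1 };
        uint32_t domid = peer;
        void *addr;

        gntmap_init(&map);
        addr = gntmap_map_grant_refs(&map, 2, &domid, 0 /* one domid for all */,
                                     refs, 1 /* writable */);
        if (addr) {
            /* ... use the two mapped pages at addr ... */
            gntmap_munmap(&map, (unsigned long)addr, 2);
        }
        gntmap_fini(&map);
    }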
diff --git a/extras/mini-os/include/gnttab.h b/extras/mini-os/include/gnttab.h
deleted file mode 100644
index c43ad42..0000000
--- a/extras/mini-os/include/gnttab.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef __GNTTAB_H__
-#define __GNTTAB_H__
-
-#include <xen/grant_table.h>
-
-void init_gnttab(void);
-grant_ref_t gnttab_alloc_and_grant(void **map);
-grant_ref_t gnttab_grant_access(domid_t domid, unsigned long frame,
- int readonly);
-grant_ref_t gnttab_grant_transfer(domid_t domid, unsigned long pfn);
-unsigned long gnttab_end_transfer(grant_ref_t gref);
-int gnttab_end_access(grant_ref_t ref);
-const char *gnttabop_error(int16_t status);
-void fini_gnttab(void);
-grant_entry_t *arch_init_gnttab(int nr_grant_frames);
-
-#endif /* !__GNTTAB_H__ */
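A hypothetical sketch of granting a page to a peer and revoking it later; virt_to_mfn() is the arch-mm helper, and the assumption here is that gnttab_end_access() returns nonzero once the grant is safely ended:

    static grant_ref_t share_page(domid_t peer, void *page)
    {
        return gnttab_grant_access(peer, virt_to_mfn(page), 0 /* read-write */);
    }

    static void unshare_page(grant_ref_t ref)
    {
        if (!gnttab_end_access(ref))
            printk("grant %u still in use by the peer\n", ref);
    }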
diff --git a/extras/mini-os/include/hypervisor.h b/extras/mini-os/include/hypervisor.h
deleted file mode 100644
index 21b3566..0000000
--- a/extras/mini-os/include/hypervisor.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/******************************************************************************
- * hypervisor.h
- *
- * Hypervisor handling.
- *
- *
- * Copyright (c) 2002, K A Fraser
- * Copyright (c) 2005, Grzegorz Milos
- * Updates: Aravindh Puthiyaparambil <aravindh.puthiyaparambil at unisys.com>
- */
-
-#ifndef _HYPERVISOR_H_
-#define _HYPERVISOR_H_
-
-#include <mini-os/types.h>
-#include <xen/xen.h>
-#if defined(__i386__)
-#include <hypercall-x86_32.h>
-#elif defined(__x86_64__)
-#include <hypercall-x86_64.h>
-#elif defined(__arm__) || defined(__aarch64__)
-#include <hypercall-arm.h>
-#else
-#error "Unsupported architecture"
-#endif
-#include <mini-os/traps.h>
-
-/*
- * a placeholder for the start of day information passed up from the hypervisor
- */
-union start_info_union
-{
- start_info_t start_info;
- char padding[512];
-};
-extern union start_info_union start_info_union;
-#define start_info (start_info_union.start_info)
-
-/* hypervisor.c */
-void force_evtchn_callback(void);
-void do_hypervisor_callback(struct pt_regs *regs);
-void mask_evtchn(uint32_t port);
-void unmask_evtchn(uint32_t port);
-void clear_evtchn(uint32_t port);
-
-extern int in_callback;
-
-#endif /* _HYPERVISOR_H_ */
diff --git a/extras/mini-os/include/ioremap.h b/extras/mini-os/include/ioremap.h
deleted file mode 100644
index 7f246e3..0000000
--- a/extras/mini-os/include/ioremap.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * Copyright (C) 2009 Netronome Systems, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-
-#ifndef _IOREMAP_H_
-#define _IOREMAP_H_
-
-void *ioremap(unsigned long phys_addr, unsigned long size);
-void *ioremap_nocache(unsigned long phys_addr, unsigned long size);
-void iounmap(void *virt_addr, unsigned long size);
-
-#endif /* _IOREMAP_H_ */
-
-/* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
diff --git a/extras/mini-os/include/iorw.h b/extras/mini-os/include/iorw.h
deleted file mode 100644
index d5ec065..0000000
--- a/extras/mini-os/include/iorw.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef MINIOS_IORW_H
-#define MINIOS_IORW_H
-
-#include <mini-os/types.h>
-
-void iowrite8(volatile void* addr, uint8_t val);
-void iowrite16(volatile void* addr, uint16_t val);
-void iowrite32(volatile void* addr, uint32_t val);
-void iowrite64(volatile void* addr, uint64_t val);
-
-uint8_t ioread8(volatile void* addr);
-uint16_t ioread16(volatile void* addr);
-uint32_t ioread32(volatile void* addr);
-uint64_t ioread64(volatile void* addr);
-
-#endif
diff --git a/extras/mini-os/include/kernel.h b/extras/mini-os/include/kernel.h
deleted file mode 100644
index 13e3274..0000000
--- a/extras/mini-os/include/kernel.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _KERNEL_H_
-#define _KERNEL_H_
-
-void start_kernel(void);
-void do_exit(void) __attribute__((noreturn));
-void arch_do_exit(void);
-void stop_kernel(void);
-
-#endif /* _KERNEL_H_ */
diff --git a/extras/mini-os/include/lib-gpl.h b/extras/mini-os/include/lib-gpl.h
deleted file mode 100644
index d5602b2..0000000
--- a/extras/mini-os/include/lib-gpl.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- ****************************************************************************
- *
- * File: lib.h
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes:
- *
- * Date: Aug 2003
- *
- * Environment: Xen Minimal OS
- * Description: Random useful library functions, from Linux'
- * include/linux/kernel.h
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef _LIB_GPL_H_
-#define _LIB_GPL_H_
-
-#ifndef HAVE_LIBC
-/* printing */
-extern unsigned long simple_strtoul(const char *,char **,unsigned int);
-extern long simple_strtol(const char *,char **,unsigned int);
-extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
-extern long long simple_strtoll(const char *,char **,unsigned int);
-
-extern int sprintf(char * buf, const char * fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
-extern int vsprintf(char *buf, const char *, va_list)
- __attribute__ ((format (printf, 2, 0)));
-extern int snprintf(char * buf, size_t size, const char * fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
-extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
- __attribute__ ((format (printf, 3, 0)));
-extern int scnprintf(char * buf, size_t size, const char * fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
-extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
- __attribute__ ((format (printf, 3, 0)));
-extern int sscanf(const char *, const char *, ...)
- __attribute__ ((format (scanf, 2, 3)));
-extern int vsscanf(const char *, const char *, va_list)
- __attribute__ ((format (scanf, 2, 0)));
-#endif
-
-#endif /* _LIB_GPL_H_ */
diff --git a/extras/mini-os/include/lib.h b/extras/mini-os/include/lib.h
deleted file mode 100644
index 62836c7..0000000
--- a/extras/mini-os/include/lib.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- ****************************************************************************
- *
- * File: lib.h
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes:
- *
- * Date: Aug 2003
- *
- * Environment: Xen Minimal OS
- * Description: Random useful library functions, contains some freebsd stuff
- *
- ****************************************************************************
- * $Id: h-insert.h,v 1.4 2002/11/08 16:03:55 rn Exp $
- ****************************************************************************
- *
- * Copyright (c) 1992, 1993
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- */
-
-#ifndef _LIB_H_
-#define _LIB_H_
-
-#include <stdarg.h>
-#include <stddef.h>
-#include <xen/xen.h>
-#include <xen/event_channel.h>
-#include "gntmap.h"
-
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
-#define BUILD_BUG_ON(cond) ({ _Static_assert(!(cond), "!(" #cond ")"); })
-#define BUILD_BUG_ON_ZERO(cond) \
- sizeof(struct { _Static_assert(!(cond), "!(" #cond ")"); })
-#else
-#define BUILD_BUG_ON_ZERO(cond) sizeof(struct { int:-!!(cond); })
-#define BUILD_BUG_ON(cond) ((void)BUILD_BUG_ON_ZERO(cond))
-#endif
-
-#ifdef HAVE_LIBC
-#include <sys/queue.h>
-#include <stdio.h>
-#else
-#include <lib-gpl.h>
-#endif
-
-#ifdef HAVE_LIBC
-#include <string.h>
-#else
-/* string and memory manipulation */
-
-/*
- * From:
- * @(#)libkern.h 8.1 (Berkeley) 6/10/93
- * $FreeBSD$
- */
-int memcmp(const void *b1, const void *b2, size_t len);
-
-char *strcat(char * __restrict, const char * __restrict);
-int strcmp(const char *, const char *);
-char *strcpy(char * __restrict, const char * __restrict);
-
-char *strdup(const char *__restrict);
-
-size_t strlen(const char *);
-
-int strncmp(const char *, const char *, size_t);
-char *strncpy(char * __restrict, const char * __restrict, size_t);
-
-char *strstr(const char *, const char *);
-
-void *memset(void *, int, size_t);
-
-char *strchr(const char *p, int ch);
-char *strrchr(const char *p, int ch);
-
-/* From:
- * @(#)systm.h 8.7 (Berkeley) 3/29/95
- * $FreeBSD$
- */
-void *memcpy(void *to, const void *from, size_t len);
-
-size_t strnlen(const char *, size_t);
-#endif
-
-#include <mini-os/console.h>
-
-#define RAND_MIX 2654435769U
-
-int rand(void);
-
-#include <mini-os/xenbus.h>
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define ASSERT(x) \
-do { \
- if (!(x)) { \
- printk("ASSERTION FAILED: %s at %s:%d.\n", \
- # x , \
- __FILE__, \
- __LINE__); \
- BUG(); \
- } \
-} while(0)
-
-#define BUG_ON(x) ASSERT(!(x))
-
-/* Consistency check as much as possible. */
-void sanity_check(void);
-
-#ifdef HAVE_LIBC
-enum fd_type {
- FTYPE_NONE = 0,
- FTYPE_CONSOLE,
- FTYPE_FILE,
- FTYPE_XENBUS,
- FTYPE_XC,
- FTYPE_EVTCHN,
- FTYPE_GNTMAP,
- FTYPE_SOCKET,
- FTYPE_TAP,
- FTYPE_BLK,
- FTYPE_KBD,
- FTYPE_FB,
- FTYPE_MEM,
- FTYPE_SAVEFILE,
- FTYPE_TPMFRONT,
- FTYPE_TPM_TIS,
-};
-
-LIST_HEAD(evtchn_port_list, evtchn_port_info);
-
-struct evtchn_port_info {
- LIST_ENTRY(evtchn_port_info) list;
- evtchn_port_t port;
- unsigned long pending;
- int bound;
-};
-
-extern struct file {
- enum fd_type type;
- union {
- struct {
- /* lwIP fd */
- int fd;
- } socket;
- struct {
- /* FS import fd */
- int fd;
- off_t offset;
- } file;
- struct {
- struct evtchn_port_list ports;
- } evtchn;
- struct gntmap gntmap;
- struct {
- struct netfront_dev *dev;
- } tap;
- struct {
- struct blkfront_dev *dev;
- off_t offset;
- } blk;
- struct {
- struct kbdfront_dev *dev;
- } kbd;
- struct {
- struct fbfront_dev *dev;
- } fb;
- struct {
- struct consfront_dev *dev;
- } cons;
-#ifdef CONFIG_TPMFRONT
- struct {
- struct tpmfront_dev *dev;
- int respgot;
- off_t offset;
- } tpmfront;
-#endif
-#ifdef CONFIG_TPM_TIS
- struct {
- struct tpm_chip *dev;
- int respgot;
- off_t offset;
- } tpm_tis;
-#endif
-#ifdef CONFIG_XENBUS
- struct {
- /* To each xenbus FD is associated a queue of watch events for this
- * FD. */
- xenbus_event_queue events;
- } xenbus;
-#endif
- };
- int read; /* maybe available for read */
-} files[];
-
-int alloc_fd(enum fd_type type);
-void close_all_files(void);
-extern struct thread *main_thread;
-void sparse(unsigned long data, size_t size);
-#endif
-
-#endif /* _LIB_H_ */
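A hypothetical sketch contrasting the two checks defined above: BUILD_BUG_ON() fails the build, ASSERT() printk()s and BUG()s at run time:

    struct wire_hdr { uint32_t len; uint32_t type; };   /* hypothetical */

    static void check_hdr(struct wire_hdr *h)
    {
        BUILD_BUG_ON(sizeof(struct wire_hdr) != 8);  /* rejected at compile time */
        ASSERT(h != NULL);                           /* BUG()s if h is NULL */
    }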
diff --git a/extras/mini-os/include/linux/types.h b/extras/mini-os/include/linux/types.h
deleted file mode 100644
index ac596a7..0000000
--- a/extras/mini-os/include/linux/types.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifndef _LINUX_TYPES_H_
-#define _LINUX_TYPES_H_
-#include <mini-os/types.h>
-typedef uint64_t __u64;
-#endif /* _LINUX_TYPES_H_ */
diff --git a/extras/mini-os/include/lwipopts.h b/extras/mini-os/include/lwipopts.h
deleted file mode 100644
index bc5555e..0000000
--- a/extras/mini-os/include/lwipopts.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * lwipopts.h
- *
- * Configuration for lwIP running on mini-os
- *
- * Tim Deegan <Tim.Deegan at eu.citrix.net>, July 2007
- */
-
-#ifndef __LWIP_LWIPOPTS_H__
-#define __LWIP_LWIPOPTS_H__
-
-#define SYS_LIGHTWEIGHT_PROT 1
-#define MEM_LIBC_MALLOC 1
-#define LWIP_TIMEVAL_PRIVATE 0
-#define LWIP_DHCP 1
-#define LWIP_COMPAT_SOCKETS 0
-#define LWIP_IGMP 1
-#define LWIP_USE_HEAP_FROM_INTERRUPT 1
-#define MEMP_NUM_SYS_TIMEOUT 10
-#define TCP_SND_BUF 3000
-#define TCP_MSS 1500
-
-#endif /* __LWIP_LWIPOPTS_H__ */
diff --git a/extras/mini-os/include/mm.h b/extras/mini-os/include/mm.h
deleted file mode 100644
index f57d8ab..0000000
--- a/extras/mini-os/include/mm.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- *
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- * Copyright (c) 2005, Keir A Fraser
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef _MM_H_
-#define _MM_H_
-
-#if defined(__i386__)
-#include <xen/arch-x86_32.h>
-#elif defined(__x86_64__)
-#include <xen/arch-x86_64.h>
-#elif defined(__arm__) || defined(__aarch64__)
-#include <xen/arch-arm.h>
-#else
-#error "Unsupported architecture"
-#endif
-#include <xen/xen.h>
-
-#include <mini-os/arch_limits.h>
-#include <mini-os/arch_mm.h>
-
-#define STACK_SIZE_PAGE_ORDER __STACK_SIZE_PAGE_ORDER
-#define STACK_SIZE __STACK_SIZE
-
-
-void init_mm(void);
-unsigned long alloc_pages(int order);
-#define alloc_page() alloc_pages(0)
-void free_pages(void *pointer, int order);
-#define free_page(p) free_pages(p, 0)
-
-static __inline__ int get_order(unsigned long size)
-{
- int order;
- size = (size-1) >> PAGE_SHIFT;
- for ( order = 0; size; order++ )
- size >>= 1;
- return order;
-}
-
-void arch_init_demand_mapping_area(unsigned long max_pfn);
-void arch_init_mm(unsigned long* start_pfn_p, unsigned long* max_pfn_p);
-void arch_init_p2m(unsigned long max_pfn_p);
-
-unsigned long allocate_ondemand(unsigned long n, unsigned long alignment);
-/* map f[i*stride]+i*increment for i in 0..n-1, aligned on alignment pages */
-void *map_frames_ex(const unsigned long *f, unsigned long n, unsigned long stride,
- unsigned long increment, unsigned long alignment, domid_t id,
- int *err, unsigned long prot);
-void do_map_frames(unsigned long addr,
- const unsigned long *f, unsigned long n, unsigned long stride,
- unsigned long increment, domid_t id, int *err, unsigned long prot);
-int unmap_frames(unsigned long va, unsigned long num_frames);
-unsigned long alloc_contig_pages(int order, unsigned int addr_bits);
-#ifdef HAVE_LIBC
-extern unsigned long heap, brk, heap_mapped, heap_end;
-#endif
-
-int free_physical_pages(xen_pfn_t *mfns, int n);
-void fini_mm(void);
-
-#endif /* _MM_H_ */
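A worked example for get_order() above: a three-page request gives size - 1 equal to 2 pages after the shift, so the loop runs twice and returns order 2, i.e. four pages, the next power of two:

    static unsigned long grab_three_pages(void)
    {
        int order = get_order(3 * PAGE_SIZE);   /* == 2 */
        unsigned long va = alloc_pages(order);  /* 4 contiguous pages */
        /* ... later: free_pages((void *)va, order); ... */
        return va;
    }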
diff --git a/extras/mini-os/include/netfront.h b/extras/mini-os/include/netfront.h
deleted file mode 100644
index 2b95da9..0000000
--- a/extras/mini-os/include/netfront.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#include <mini-os/wait.h>
-#ifdef HAVE_LWIP
-#include <lwip/netif.h>
-#endif
-struct netfront_dev;
-struct netfront_dev *init_netfront(char *nodename, void (*netif_rx)(unsigned char *data, int len), unsigned char rawmac[6], char **ip);
-void netfront_xmit(struct netfront_dev *dev, unsigned char *data, int len);
-void shutdown_netfront(struct netfront_dev *dev);
-#ifdef HAVE_LIBC
-int netfront_tap_open(char *nodename);
-ssize_t netfront_receive(struct netfront_dev *dev, unsigned char *data, size_t len);
-#endif
-
-extern struct wait_queue_head netfront_queue;
-
-#ifdef HAVE_LWIP
-/* Call this to bring up the netfront interface and the lwIP stack.
- * N.B. _must_ be called from a thread; it's not safe to call this from
- * app_main(). */
-void start_networking(void);
-void stop_networking(void);
-
-void networking_set_addr(struct ip_addr *ipaddr, struct ip_addr *netmask, struct ip_addr *gw);
-#endif
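A hypothetical sketch: bring up a netfront with a raw-packet callback and transmit one minimum-size frame (a NULL nodename is assumed to select the default vif):

    static void rx_cb(unsigned char *data, int len)
    {
        /* inspect the received frame */
    }

    static void net_demo(void)
    {
        unsigned char mac[6];
        char *ip = NULL;
        struct netfront_dev *dev = init_netfront(NULL, rx_cb, mac, &ip);
        unsigned char frame[60] = { 0 };   /* fill in a real Ethernet frame */

        if (dev)
            netfront_xmit(dev, frame, sizeof(frame));
    }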
diff --git a/extras/mini-os/include/pcifront.h b/extras/mini-os/include/pcifront.h
deleted file mode 100644
index 1b05963..0000000
--- a/extras/mini-os/include/pcifront.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <mini-os/types.h>
-#include <xen/io/pciif.h>
-struct pcifront_dev;
-void pcifront_watches(void *opaque);
-struct pcifront_dev *init_pcifront(char *nodename);
-void pcifront_op(struct pcifront_dev *dev, struct xen_pci_op *op);
-void pcifront_scan(struct pcifront_dev *dev, void (*fun)(unsigned int domain, unsigned int bus, unsigned int slot, unsigned int fun));
-int pcifront_conf_read(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun,
- unsigned int off, unsigned int size, unsigned int *val);
-int pcifront_conf_write(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun,
- unsigned int off, unsigned int size, unsigned int val);
-int pcifront_enable_msi(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun);
-int pcifront_disable_msi(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun);
-int pcifront_enable_msix(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun,
- struct xen_msix_entry *entries, int n);
-int pcifront_disable_msix(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun);
-void shutdown_pcifront(struct pcifront_dev *dev);
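A hypothetical sketch: read the 32-bit vendor/device word of device 00:00.0 through the frontend (offset 0 of config space; a 0 return is assumed to mean success):

    static void probe_root(struct pcifront_dev *dev)
    {
        unsigned int val;

        if (pcifront_conf_read(dev, 0 /* dom */, 0 /* bus */, 0 /* slot */,
                               0 /* fun */, 0 /* off */, 4, &val) == 0)
            printk("00:00.0 vendor/device: %08x\n", val);
    }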
diff --git a/extras/mini-os/include/posix/arpa/inet.h b/extras/mini-os/include/posix/arpa/inet.h
deleted file mode 100644
index 012f3a4..0000000
--- a/extras/mini-os/include/posix/arpa/inet.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _POSIX_ARPA_INET_H_
-#define _POSIX_ARPA_INET_H_
-
-#include <lwip/inet.h>
-
-#endif /* _POSIX_ARPA_INET_H_ */
-
diff --git a/extras/mini-os/include/posix/dirent.h b/extras/mini-os/include/posix/dirent.h
deleted file mode 100644
index 884b69e..0000000
--- a/extras/mini-os/include/posix/dirent.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _POSIX_DIRENT_H
-#define _POSIX_DIRENT_H
-
-#include <stdint.h>
-
-struct dirent {
- char *d_name;
-};
-
-typedef struct {
- struct dirent dirent;
- char *name;
- int32_t offset;
- char **entries;
- int32_t curentry;
- int32_t nbentries;
- int has_more;
-} DIR;
-
-DIR *opendir(const char *name);
-struct dirent *readdir(DIR *dir);
-int closedir(DIR *dir);
-
-#endif /* _POSIX_DIRENT_H */
diff --git a/extras/mini-os/include/posix/err.h b/extras/mini-os/include/posix/err.h
deleted file mode 100644
index 1079f58..0000000
--- a/extras/mini-os/include/posix/err.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _POSIX_ERR_H
-#define _POSIX_ERR_H
-
-#include <stdarg.h>
-
-void err(int eval, const char *fmt, ...);
-void errx(int eval, const char *fmt, ...);
-void warn(const char *fmt, ...);
-void warnx(const char *fmt, ...);
-void verr(int eval, const char *fmt, va_list args);
-void verrx(int eval, const char *fmt, va_list args);
-void vwarn(const char *fmt, va_list args);
-void vwarnx(const char *fmt, va_list args);
-
-#endif /* _POSIX_ERR_H */
diff --git a/extras/mini-os/include/posix/fcntl.h b/extras/mini-os/include/posix/fcntl.h
deleted file mode 100644
index ecfd8c8..0000000
--- a/extras/mini-os/include/posix/fcntl.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _POSIX_FCNTL_H
-#define _POSIX_FCNTL_H
-
-#include_next <fcntl.h>
-
-#define F_ULOCK 0
-#define F_LOCK 1
-#define F_TLOCK 2
-#define F_TEST 3
-
-#endif /* _POSIX_FCNTL_H */
diff --git a/extras/mini-os/include/posix/limits.h b/extras/mini-os/include/posix/limits.h
deleted file mode 100644
index 5d2b864..0000000
--- a/extras/mini-os/include/posix/limits.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _POSIX_LIMITS_H
-#define _POSIX_LIMITS_H
-
-#include <mini-os/arch_limits.h>
-
-#define CHAR_BIT 8
-
-#define SCHAR_MAX 0x7f
-#define SCHAR_MIN (-SCHAR_MAX-1)
-#define UCHAR_MAX 0xff
-
-#ifdef __CHAR_UNSIGNED__
-# define CHAR_MIN 0
-# define CHAR_MAX UCHAR_MAX
-#else
-# define CHAR_MIN SCHAR_MIN
-# define CHAR_MAX SCHAR_MAX
-#endif
-
-#define INT_MAX 0x7fffffff
-#define INT_MIN (-INT_MAX-1)
-#define UINT_MAX 0xffffffff
-
-#define SHRT_MIN (-0x8000)
-#define SHRT_MAX 0x7fff
-#define USHRT_MAX 0xffff
-
-#if defined(__x86_64__)
-# define LONG_MAX 0x7fffffffffffffffL
-# define ULONG_MAX 0xffffffffffffffffUL
-#else
-# define LONG_MAX 0x7fffffffL
-# define ULONG_MAX 0xffffffffUL
-#endif
-#define LONG_MIN (-LONG_MAX-1L)
-
-#define LLONG_MAX 0x7fffffffffffffffLL
-#define LLONG_MIN (-LLONG_MAX-1LL)
-#define ULLONG_MAX 0xffffffffffffffffULL
-
-#define LONG_LONG_MIN LLONG_MIN
-#define LONG_LONG_MAX LLONG_MAX
-#define ULONG_LONG_MAX ULLONG_MAX
-
-#define PATH_MAX __PAGE_SIZE
-#define PAGE_SIZE __PAGE_SIZE
-
-#endif /* _POSIX_LIMITS_H */
diff --git a/extras/mini-os/include/posix/net/if.h b/extras/mini-os/include/posix/net/if.h
deleted file mode 100644
index 5be77d4..0000000
--- a/extras/mini-os/include/posix/net/if.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * This code is mostly taken from NetBSD net/if.h
- * Changes: Stefano Stabellini <stefano.stabellini at eu.citrix.com>
- *
- ******************************************************************************
- *
- * Copyright (c) 1999, 2000, 2001 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by William Studenmund and Jason R. Thorpe.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Copyright (c) 1982, 1986, 1989, 1993
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- */
-
-#ifndef _NET_IF_H_
-#define _NET_IF_H_
-
-/*
- * Length of interface external name, including terminating '\0'.
- * Note: this is the same size as a generic device's external name.
- */
-#define IF_NAMESIZE 16
-
-struct if_nameindex {
- unsigned int if_index; /* 1, 2, ... */
- char *if_name; /* null terminated name: "le0", ... */
-};
-
-unsigned int if_nametoindex(const char *);
-char * if_indextoname(unsigned int, char *);
-struct if_nameindex * if_nameindex(void);
-void if_freenameindex(struct if_nameindex *);
-
-#endif /* !_NET_IF_H_ */
-
diff --git a/extras/mini-os/include/posix/netdb.h b/extras/mini-os/include/posix/netdb.h
deleted file mode 100644
index 8f76a95..0000000
--- a/extras/mini-os/include/posix/netdb.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _POSIX_NETDB_H_
-#define _POSIX_NETDB_H_
-
-struct hostent {
- char *h_addr;
-};
-#define gethostbyname(buf) NULL
-
-#endif /* _POSIX_NETDB_H_ */
diff --git a/extras/mini-os/include/posix/netinet/in.h b/extras/mini-os/include/posix/netinet/in.h
deleted file mode 100644
index cc1a910..0000000
--- a/extras/mini-os/include/posix/netinet/in.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _POSIX_SYS_IN_H_
-#define _POSIX_SYS_IN_H_
-
-#include <fcntl.h>
-#include <lwip/sockets.h>
-
-#endif /* _POSIX_SYS_IN_H_ */
diff --git a/extras/mini-os/include/posix/netinet/tcp.h b/extras/mini-os/include/posix/netinet/tcp.h
deleted file mode 100644
index 3e3b060..0000000
--- a/extras/mini-os/include/posix/netinet/tcp.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _POSIX_SYS_TCP_H_
-#define _POSIX_SYS_TCP_H_
-
-#include <lwip/tcp.h>
-
-#endif /* _POSIX_SYS_TCP_H_ */
diff --git a/extras/mini-os/include/posix/poll.h b/extras/mini-os/include/posix/poll.h
deleted file mode 100644
index 06fb41a..0000000
--- a/extras/mini-os/include/posix/poll.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <sys/poll.h>
diff --git a/extras/mini-os/include/posix/pthread.h b/extras/mini-os/include/posix/pthread.h
deleted file mode 100644
index f74d924..0000000
--- a/extras/mini-os/include/posix/pthread.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef _POSIX_PTHREAD_H
-#define _POSIX_PTHREAD_H
-
-#include <stdlib.h>
-
-/* Let's be single-threaded for now. */
-
-typedef struct {
- void *ptr;
-} *pthread_key_t;
-static inline int pthread_key_create(pthread_key_t *key, void (*destr_function)(void*))
-{
- *key = malloc(sizeof(**key));
- (*key)->ptr = NULL;
- return 0;
-}
-static inline int pthread_setspecific(pthread_key_t key, const void *pointer)
-{
- key->ptr = (void*) pointer;
- return 0;
-}
-static inline void *pthread_getspecific(pthread_key_t key)
-{
- return key->ptr;
-}
-static inline int pthread_key_delete(pthread_key_t key)
-{
- free(key);
- return 0;
-}
-
-
-
-typedef struct {} pthread_mutexattr_t;
-static inline int pthread_mutexattr_init(pthread_mutexattr_t *mattr) { return 0; }
-#define PTHREAD_MUTEX_NORMAL 0
-#define PTHREAD_MUTEX_RECURSIVE 1
-static inline int pthread_mutexattr_settype(pthread_mutexattr_t *mattr, int kind) { return 0; }
-static inline int pthread_mutexattr_destroy(pthread_mutexattr_t *mattr) { return 0; }
-typedef struct {} pthread_mutex_t;
-#define PTHREAD_MUTEX_INITIALIZER {}
-static inline int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *mattr) { return 0; }
-static inline int pthread_mutex_lock(pthread_mutex_t *mutex) { return 0; }
-static inline int pthread_mutex_unlock(pthread_mutex_t *mutex) { return 0; }
-
-
-
-typedef struct {
- int done;
-} pthread_once_t;
-#define PTHREAD_ONCE_INIT { 0 }
-
-static inline int pthread_once(pthread_once_t *once_control, void (*init_routine)(void))
-{
- if (!once_control->done) {
- once_control->done = 1;
- init_routine();
- }
- return 0;
-}
-
-#define __thread
-
-#endif /* _POSIX_PTHREAD_H */
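
The stubs above collapse the pthread API to the single-threaded case: keys
become a heap cell, mutexes are empty structs, and pthread_once() runs its
routine on first use. A minimal sketch of client code exercising them,
assuming only the declarations above (init_key and the strings are
illustrative):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_key_t key;

    static void init_key(void)
    {
        /* With these stubs, runs on the first pthread_once() call. */
        pthread_key_create(&key, NULL);
    }

    int main(void)
    {
        static pthread_once_t once = PTHREAD_ONCE_INIT;
        pthread_once(&once, init_key);
        pthread_setspecific(key, "only thread");
        printf("%s\n", (const char *)pthread_getspecific(key));
        pthread_key_delete(key);   /* frees the key's heap cell */
        return 0;
    }
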
diff --git a/extras/mini-os/include/posix/signal.h b/extras/mini-os/include/posix/signal.h
deleted file mode 100644
index be9e9f3..0000000
--- a/extras/mini-os/include/posix/signal.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _POSIX_SIGNAL_H
-#define _POSIX_SIGNAL_H
-
-#include_next <signal.h>
-
-int sigaction(int signum, const struct sigaction * __restrict,
- struct sigaction * __restrict);
-
-#endif
-
diff --git a/extras/mini-os/include/posix/stdlib.h b/extras/mini-os/include/posix/stdlib.h
deleted file mode 100644
index 53e6289..0000000
--- a/extras/mini-os/include/posix/stdlib.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _POSIX_STDLIB_H
-#define _POSIX_STDLIB_H
-
-#include_next <stdlib.h>
-
-#define realpath(p,r) strcpy(r,p)
-
-#endif /* _POSIX_STDLIB_H */
diff --git a/extras/mini-os/include/posix/strings.h b/extras/mini-os/include/posix/strings.h
deleted file mode 100644
index 4957c41..0000000
--- a/extras/mini-os/include/posix/strings.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _POSIX_STRINGS_H
-#define _POSIX_STRINGS_H
-
-#include <string.h>
-
-#define bzero(ptr, size) (memset((ptr), '\0', (size)), (void) 0)
-
-int ffs (int i);
-int ffsl (long int li);
-int ffsll (long long int lli);
-
-#endif /* _POSIX_STRINGS_H */
diff --git a/extras/mini-os/include/posix/sys/ioctl.h b/extras/mini-os/include/posix/sys/ioctl.h
deleted file mode 100644
index ecf3080..0000000
--- a/extras/mini-os/include/posix/sys/ioctl.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _POSIX_SYS_IOCTL_H
-#define _POSIX_SYS_IOCTL_H
-
-int ioctl(int fd, int request, ...);
-
-#define _IOC_NONE 0
-#define _IOC_WRITE 1
-#define _IOC_READ 2
-
-#define _IOC(rw, class, n, size) \
- (((rw ) << 30) | \
- ((class) << 22) | \
- ((n ) << 14) | \
- ((size ) << 0))
-
-#endif /* _POSIX_SYS_IOCTL_H */
diff --git a/extras/mini-os/include/posix/sys/mman.h b/extras/mini-os/include/posix/sys/mman.h
deleted file mode 100644
index 4d34979..0000000
--- a/extras/mini-os/include/posix/sys/mman.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _POSIX_SYS_MMAN_H
-#define _POSIX_SYS_MMAN_H
-
-#define PROT_READ 0x1
-#define PROT_WRITE 0x2
-#define PROT_EXEC 0x4
-
-#define MAP_SHARED 0x01
-#define MAP_PRIVATE 0x02
-#define MAP_ANON 0x20
-
-/* Pages are always resident anyway */
-#define MAP_LOCKED 0x0
-
-#define MAP_FAILED ((void*)0)
-
-void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset) asm("mmap64");
-int munmap(void *start, size_t length);
-static inline int mlock(const void *addr, size_t len) { return 0; }
-static inline int munlock(const void *addr, size_t len) { return 0; }
-
-#endif /* _POSIX_SYS_MMAN_H */
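
Note that MAP_FAILED is (void *)0 here, unlike POSIX's (void *)-1, so error
checks written against this header differ from portable code. A sketch using
only the declarations above (grab_page is an illustrative name):

    #include <sys/mman.h>
    #include <stddef.h>

    /* Map one anonymous, writable region; returns NULL on failure,
     * following this header's MAP_FAILED convention. */
    static void *grab_page(size_t len)
    {
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANON, -1, 0);
        return (p == MAP_FAILED) ? NULL : p;
    }
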
diff --git a/extras/mini-os/include/posix/sys/poll.h b/extras/mini-os/include/posix/sys/poll.h
deleted file mode 100644
index f9d7f5c..0000000
--- a/extras/mini-os/include/posix/sys/poll.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * This code is mostly taken from FreeBSD sys/sys/poll.h
- * Changes: Stefano Stabellini <stefano.stabellini at eu.citrix.com>
- *
- ****************************************************************************
- * Copyright (c) 1997 Peter Wemm <peter at freebsd.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _POSIX_SYS_POLL_H_
-#define _POSIX_SYS_POLL_H_
-
-/*
- * This file is intended to be compatible with the traditional poll.h.
- */
-
-typedef unsigned int nfds_t;
-
-/*
- * This structure is passed as an array to poll(2).
- */
-struct pollfd {
- int fd; /* which file descriptor to poll */
- short events; /* events we are interested in */
- short revents; /* events found on return */
-};
-
-/*
- * Requestable events. If poll(2) finds any of these set, they are
- * copied to revents on return.
- * XXX Note that FreeBSD doesn't make much distinction between POLLPRI
- * and POLLRDBAND since none of the file types have distinct priority
- * bands - and only some have an urgent "mode".
- * XXX Note POLLIN isn't really supported in true SysV terms. Under SysV
- * POLLIN includes all of normal, band and urgent data. Most poll handlers
- * on FreeBSD only treat it as "normal" data.
- */
-#define POLLIN 0x0001 /* any readable data available */
-#define POLLPRI 0x0002 /* OOB/Urgent readable data */
-#define POLLOUT 0x0004 /* file descriptor is writeable */
-#define POLLRDNORM 0x0040 /* non-OOB/URG data available */
-#define POLLWRNORM POLLOUT /* no write type differentiation */
-#define POLLRDBAND 0x0080 /* OOB/Urgent readable data */
-#define POLLWRBAND 0x0100 /* OOB/Urgent data can be written */
-
-/*
- * These events are set if they occur regardless of whether they were
- * requested.
- */
-#define POLLERR 0x0008 /* some poll error occurred */
-#define POLLHUP 0x0010 /* file descriptor was "hung up" */
-#define POLLNVAL 0x0020 /* requested events "invalid" */
-
-int poll(struct pollfd _pfd[], nfds_t _nfds, int _timeout);
-
-#endif /* _POSIX_SYS_POLL_H_ */
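
The struct and flags above are the traditional poll(2) interface; the usual
readiness check looks like the following sketch (wait_readable is an
illustrative helper, not part of the header):

    #include <sys/poll.h>

    /* Wait up to timeout_ms for fd to become readable.
     * Returns 1 if readable, 0 on timeout, -1 on error. */
    static int wait_readable(int fd, int timeout_ms)
    {
        struct pollfd pfd = { .fd = fd, .events = POLLIN, .revents = 0 };
        int n = poll(&pfd, 1, timeout_ms);
        if (n < 0)
            return -1;
        return (pfd.revents & POLLIN) ? 1 : 0;
    }
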
diff --git a/extras/mini-os/include/posix/sys/select.h b/extras/mini-os/include/posix/sys/select.h
deleted file mode 100644
index 5132c51..0000000
--- a/extras/mini-os/include/posix/sys/select.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _POSIX_SELECT_H
-#define _POSIX_SELECT_H
-
-#include <sys/time.h>
-int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
-
-#endif /* _POSIX_SELECT_H */
diff --git a/extras/mini-os/include/posix/sys/socket.h b/extras/mini-os/include/posix/sys/socket.h
deleted file mode 100644
index 7c039a2..0000000
--- a/extras/mini-os/include/posix/sys/socket.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef _POSIX_SYS_SOCKET_H_
-#define _POSIX_SYS_SOCKET_H_
-
-#include <fcntl.h>
-#include <lwip/sockets.h>
-
-int accept(int s, struct sockaddr *addr, socklen_t *addrlen);
-int bind(int s, struct sockaddr *name, socklen_t namelen);
-int shutdown(int s, int how);
-int getpeername (int s, struct sockaddr *name, socklen_t *namelen);
-int getsockname (int s, struct sockaddr *name, socklen_t *namelen);
-int getsockopt (int s, int level, int optname, void *optval, socklen_t *optlen);
-int setsockopt (int s, int level, int optname, const void *optval, socklen_t optlen);
-int close(int s);
-int connect(int s, struct sockaddr *name, socklen_t namelen);
-int listen(int s, int backlog);
-int recv(int s, void *mem, int len, unsigned int flags);
-//int read(int s, void *mem, int len);
-int recvfrom(int s, void *mem, int len, unsigned int flags,
- struct sockaddr *from, socklen_t *fromlen);
-int send(int s, void *dataptr, int size, unsigned int flags);
-int sendto(int s, void *dataptr, int size, unsigned int flags,
- struct sockaddr *to, socklen_t tolen);
-int socket(int domain, int type, int protocol);
-//int write(int s, void *dataptr, int size);
-int select(int maxfdp1, fd_set *readset, fd_set *writeset, fd_set *exceptset,
- struct timeval *timeout);
-//int ioctl(int s, long cmd, void *argp);
-int getsockname(int s, struct sockaddr *name, socklen_t *namelen);
-
-#endif /* _POSIX_SYS_SOCKET_H_ */
diff --git a/extras/mini-os/include/posix/sys/stat.h b/extras/mini-os/include/posix/sys/stat.h
deleted file mode 100644
index 0c13bea..0000000
--- a/extras/mini-os/include/posix/sys/stat.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _POSIX_SYS_STAT_H
-#define _POSIX_SYS_STAT_H
-
-#include_next <sys/stat.h>
-int fstat(int fd, struct stat *buf) asm("fstat64");
-
-#endif /* _POSIX_SYS_STAT_H */
diff --git a/extras/mini-os/include/posix/syslog.h b/extras/mini-os/include/posix/syslog.h
deleted file mode 100644
index aabd0e4..0000000
--- a/extras/mini-os/include/posix/syslog.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef _POSIX_SYSLOG_H
-#define _POSIX_SYSLOG_H
-
-#include <stdarg.h>
-
-#define LOG_PID 0
-#define LOG_CONS 0
-#define LOG_NDELAY 0
-#define LOG_ODELAY 0
-#define LOG_NOWAIT 0
-
-#define LOG_KERN 0
-#define LOG_USER 0
-#define LOG_MAIL 0
-#define LOG_NEWS 0
-#define LOG_UUCP 0
-#define LOG_DAEMON 0
-#define LOG_AUTH 0
-#define LOG_CRON 0
-#define LOG_LPR 0
-
-/* TODO: support */
-#define LOG_EMERG 0
-#define LOG_ALERT 1
-#define LOG_CRIT 2
-#define LOG_ERR 3
-#define LOG_WARNING 4
-#define LOG_NOTICE 5
-#define LOG_INFO 6
-#define LOG_DEBUG 7
-
-void openlog(const char *ident, int option, int facility);
-void syslog(int priority, const char *format, ...);
-void closelog(void);
-void vsyslog(int priority, const char *format, va_list ap);
-
-#endif /* _POSIX_SYSLOG_H */
diff --git a/extras/mini-os/include/posix/termios.h b/extras/mini-os/include/posix/termios.h
deleted file mode 100644
index a57aee4..0000000
--- a/extras/mini-os/include/posix/termios.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef _POSIX_TERMIOS_H
-#define _POSIX_TERMIOS_H
-
-#define NCC 32
-
-struct termios {
- unsigned long c_iflag;
- unsigned long c_oflag;
- unsigned long c_lflag;
- unsigned long c_cflag;
- unsigned char c_cc[NCC];
-};
-
-/* modem lines */
-#define TIOCM_DTR 0x002
-#define TIOCM_RTS 0x004
-#define TIOCM_CTS 0x020
-#define TIOCM_CAR 0x040
-#define TIOCM_RI 0x080
-#define TIOCM_DSR 0x100
-
-/* c_iflag */
-#define IGNBRK 0x00000001
-#define BRKINT 0x00000002
-#define IGNPAR 0x00000004
-#define PARMRK 0x00000008
-#define INPCK 0x00000010
-#define ISTRIP 0x00000020
-#define INLCR 0x00000040
-#define IGNCR 0x00000080
-#define ICRNL 0x00000100
-#define IUCLC 0x00000200
-#define IXON 0x00000400
-#define IXANY 0x00000800
-#define IXOFF 0x00001000
-#define IMAXBEL 0x00002000
-#define IUTF8 0x00004000
-
-/* c_oflag */
-#define OPOST 0x00000001
-#define OLCUC 0x00000002
-#define ONLCR 0x00000004
-#define OCRNL 0x00000008
-#define ONOCR 0x00000010
-#define ONLRET 0x00000020
-#define OFILL 0x00000040
-#define OFDEL 0x00000080
-
-/* c_lflag */
-#define ISIG 0x00000001
-#define ICANON 0x00000002
-#define XCASE 0x00000004
-#define ECHO 0x00000008
-#define ECHOE 0x00000010
-#define ECHOK 0x00000020
-#define ECHONL 0x00000040
-#define NOFLSH 0x00000080
-#define TOSTOP 0x00000100
-#define ECHOCTL 0x00000200
-#define ECHOPRT 0x00000400
-#define ECHOKE 0x00000800
-#define FLUSHO 0x00002000
-#define PENDIN 0x00004000
-#define IEXTEN 0x00008000
-
-/* c_cflag */
-#define CSIZE 0x00000030
-#define CS8 0x00000030
-#define CSTOPB 0x00000040
-#define CREAD 0x00000080
-#define PARENB 0x00000100
-#define PARODD 0x00000200
-#define HUPCL 0x00000400
-#define CLOCAL 0x00000800
-
-/* c_cc */
-#define VTIME 5
-#define VMIN 6
-
-#define TCSANOW 0
-#define TCSADRAIN 1
-#define TCSAFLUSH 2
-
-int tcsetattr(int fildes, int action, const struct termios *tios);
-int tcgetattr(int fildes, struct termios *tios);
-
-#endif /* _POSIX_TERMIOS_H */
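
The subset above is enough for the usual raw-mode switch; a sketch using only
what this header declares (make_raw is an illustrative name):

    #include <termios.h>

    /* Non-canonical input: no line buffering, no echo, return as
     * soon as one byte is available. */
    static int make_raw(int fd)
    {
        struct termios t;
        if (tcgetattr(fd, &t) < 0)
            return -1;
        t.c_lflag &= ~(ICANON | ECHO);
        t.c_cc[VMIN] = 1;    /* wake after one byte */
        t.c_cc[VTIME] = 0;   /* no inter-byte timeout */
        return tcsetattr(fd, TCSANOW, &t);
    }
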
diff --git a/extras/mini-os/include/posix/time.h b/extras/mini-os/include/posix/time.h
deleted file mode 100644
index ce75f32..0000000
--- a/extras/mini-os/include/posix/time.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _POSIX_TIME_H
-#define _POSIX_TIME_H
-
-#include <sys/time.h>
-#define CLOCK_MONOTONIC 2
-#include_next <time.h>
-
-int nanosleep(const struct timespec *req, struct timespec *rem);
-int clock_gettime(clockid_t clock_id, struct timespec *tp);
-
-#endif /* _POSIX_TIME_H */
diff --git a/extras/mini-os/include/posix/unistd.h b/extras/mini-os/include/posix/unistd.h
deleted file mode 100644
index e85592f..0000000
--- a/extras/mini-os/include/posix/unistd.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _POSIX_UNISTD_H
-#define _POSIX_UNISTD_H
-
-#include_next <unistd.h>
-
-uid_t getuid(void);
-uid_t geteuid(void);
-gid_t getgid(void);
-gid_t getegid(void);
-int gethostname(char *name, size_t namelen);
-size_t getpagesize(void);
-int ftruncate(int fd, off_t length);
-int lockf(int fd, int cmd, off_t len);
-int nice(int inc);
-
-#endif /* _POSIX_UNISTD_H */
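
The lockf() prototype above pairs with the F_LOCK/F_ULOCK/F_TLOCK/F_TEST
constants from this tree's fcntl.h; a one-line sketch (lock_whole_file is
illustrative):

    #include <unistd.h>
    #include <fcntl.h>

    /* Exclusive lock covering the whole file (len == 0 means "to EOF"),
     * blocking until granted. */
    static int lock_whole_file(int fd)
    {
        return lockf(fd, F_LOCK, 0);
    }
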
diff --git a/extras/mini-os/include/sched.h b/extras/mini-os/include/sched.h
deleted file mode 100644
index 3d99d7d..0000000
--- a/extras/mini-os/include/sched.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef __SCHED_H__
-#define __SCHED_H__
-
-#include <mini-os/list.h>
-#include <mini-os/time.h>
-#include <mini-os/arch_sched.h>
-#ifdef HAVE_LIBC
-#include <sys/reent.h>
-#endif
-
-struct thread
-{
- char *name;
- char *stack;
-    /* keep sp and ip in this order */
- unsigned long sp; /* Stack pointer */
- unsigned long ip; /* Instruction pointer */
- MINIOS_TAILQ_ENTRY(struct thread) thread_list;
- uint32_t flags;
- s_time_t wakeup_time;
-#ifdef HAVE_LIBC
- struct _reent reent;
-#endif
-};
-
-extern struct thread *idle_thread;
-void idle_thread_fn(void *unused);
-
-#define RUNNABLE_FLAG 0x00000001
-
-#define is_runnable(_thread) (_thread->flags & RUNNABLE_FLAG)
-#define set_runnable(_thread) (_thread->flags |= RUNNABLE_FLAG)
-#define clear_runnable(_thread) (_thread->flags &= ~RUNNABLE_FLAG)
-
-#define switch_threads(prev, next) arch_switch_threads(prev, next)
-
- /* Architecture specific setup of thread creation. */
-struct thread* arch_create_thread(char *name, void (*function)(void *),
- void *data);
-
-void init_sched(void);
-void run_idle_thread(void);
-struct thread* create_thread(char *name, void (*function)(void *), void *data);
-void exit_thread(void) __attribute__((noreturn));
-void schedule(void);
-
-#ifdef __INSIDE_MINIOS__
-#define current get_current()
-#endif
-
-void wake(struct thread *thread);
-void block(struct thread *thread);
-void msleep(uint32_t millisecs);
-
-#endif /* __SCHED_H__ */
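
Threads here are cooperative: a thread runs until it blocks, sleeps, or calls
schedule(). A minimal sketch of spawning one; printk() is assumed to come
from the Mini-OS console headers, since it is not declared here:

    #include <mini-os/sched.h>
    #include <mini-os/console.h>   /* assumed home of printk() */

    static void worker(void *data)
    {
        printk("hello from %s\n", (char *)data);
        exit_thread();             /* threads must exit explicitly */
    }

    static void spawn_worker(void)
    {
        create_thread("worker", worker, "worker-arg");
        schedule();                /* cooperative: yield so it can run */
    }
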
diff --git a/extras/mini-os/include/semaphore.h b/extras/mini-os/include/semaphore.h
deleted file mode 100644
index 47470c5..0000000
--- a/extras/mini-os/include/semaphore.h
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef _SEMAPHORE_H_
-#define _SEMAPHORE_H_
-
-#include <mini-os/wait.h>
-#include <mini-os/spinlock.h>
-
-/*
- * The semaphore implementation in Mini-OS is simple: because
- * there are no preemptive threads, atomicity is guaranteed.
- */
-
-struct semaphore
-{
- int count;
- struct wait_queue_head wait;
-};
-
-/*
- * the read/write semaphore definition
- */
-struct rw_semaphore {
- signed long count;
- spinlock_t wait_lock;
- int debug;
-};
-
-#define __SEMAPHORE_INITIALIZER(name, n) \
-{ \
- .count = n, \
- .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \
-}
-
-#define __MUTEX_INITIALIZER(name) \
- __SEMAPHORE_INITIALIZER(name,1)
-
-#define __DECLARE_SEMAPHORE_GENERIC(name,count) \
- struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
-
-#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
-
-#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0)
-
-static inline void init_SEMAPHORE(struct semaphore *sem, int count)
-{
- sem->count = count;
- init_waitqueue_head(&sem->wait);
-}
-
-#define init_MUTEX(sem) init_SEMAPHORE(sem, 1)
-
-static inline int trydown(struct semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
- local_irq_save(flags);
- if (sem->count > 0) {
- ret = 1;
- sem->count--;
- }
- local_irq_restore(flags);
- return ret;
-}
-
-static void inline down(struct semaphore *sem)
-{
- unsigned long flags;
- while (1) {
- wait_event(sem->wait, sem->count > 0);
- local_irq_save(flags);
- if (sem->count > 0)
- break;
- local_irq_restore(flags);
- }
- sem->count--;
- local_irq_restore(flags);
-}
-
-static void inline up(struct semaphore *sem)
-{
- unsigned long flags;
- local_irq_save(flags);
- sem->count++;
- wake_up(&sem->wait);
- local_irq_restore(flags);
-}
-
-/* FIXME! The read/write semaphores are unimplemented! */
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
- sem->count = 1;
-}
-
-static inline void down_read(struct rw_semaphore *sem)
-{
-}
-
-
-static inline void up_read(struct rw_semaphore *sem)
-{
-}
-
-static inline void up_write(struct rw_semaphore *sem)
-{
-}
-
-static inline void down_write(struct rw_semaphore *sem)
-{
-}
-
-#endif /* _SEMAPHORE_H */
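
With no preemption, these semaphores reduce to a counter plus a wait queue; a
producer/consumer sketch using only the API above (the function names are
illustrative):

    #include <mini-os/semaphore.h>

    static DECLARE_MUTEX_LOCKED(ready);   /* semaphore with count 0 */

    static void producer(void)
    {
        /* ... publish work ... */
        up(&ready);                /* wake a waiting consumer */
    }

    static void consumer(void)
    {
        down(&ready);              /* blocks until producer calls up() */
        /* ... consume ... */
    }
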
diff --git a/extras/mini-os/include/spinlock.h b/extras/mini-os/include/spinlock.h
deleted file mode 100644
index 6604e3c..0000000
--- a/extras/mini-os/include/spinlock.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef __ASM_SPINLOCK_H
-#define __ASM_SPINLOCK_H
-
-#include <mini-os/lib.h>
-
-/*
- * Your basic SMP spinlocks, allowing only a single CPU anywhere
- */
-
-typedef struct {
- volatile unsigned int slock;
-} spinlock_t;
-
-
-#include <mini-os/arch_spinlock.h>
-
-
-#define SPINLOCK_MAGIC 0xdead4ead
-
-#define SPIN_LOCK_UNLOCKED ARCH_SPIN_LOCK_UNLOCKED
-
-#define spin_lock_init(x) do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-
-/*
- * Simple spin lock operations. There are two variants: one clears IRQs
- * on the local processor, one does not.
- *
- * We make no fairness assumptions. They have a cost.
- */
-
-#define spin_is_locked(x) arch_spin_is_locked(x)
-
-#define spin_unlock_wait(x) arch_spin_unlock_wait(x)
-
-
-#define _spin_trylock(lock) ({ _raw_spin_trylock(lock) ? 1 : 0; })
-
-#define _spin_lock(lock) \
-do { \
- _raw_spin_lock(lock); \
-} while(0)
-
-#define _spin_unlock(lock) \
-do { \
- _raw_spin_unlock(lock); \
-} while (0)
-
-
-#define spin_lock(lock) _spin_lock(lock)
-#define spin_unlock(lock) _spin_unlock(lock)
-
-#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED
-
-#endif
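
On Mini-OS these locks mostly document intent, since only a single CPU runs
anywhere; usage follows the familiar Linux pattern (bump/counter are
illustrative):

    #include <mini-os/spinlock.h>

    static DEFINE_SPINLOCK(counter_lock);
    static unsigned long counter;

    static void bump(void)
    {
        spin_lock(&counter_lock);
        counter++;
        spin_unlock(&counter_lock);
    }
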
diff --git a/extras/mini-os/include/sys/lock.h b/extras/mini-os/include/sys/lock.h
deleted file mode 100644
index 8004536..0000000
--- a/extras/mini-os/include/sys/lock.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#ifndef _MINIOS_SYS_LOCK_H_
-#define _MINIOS_SYS_LOCK_H_
-
-#ifdef HAVE_LIBC
-
-/* Due to an inclusion loop, we cannot include sched.h, so we have to hide things */
-
-#include <mini-os/waittypes.h>
-
-
-typedef struct {
- int busy;
- struct wait_queue_head wait;
-} _LOCK_T;
-
-#define __LOCK_INIT(class,lock) \
- class _LOCK_T lock = { .wait = __WAIT_QUEUE_HEAD_INITIALIZER(lock.wait) }
-int ___lock_init(_LOCK_T *lock);
-int ___lock_acquire(_LOCK_T *lock);
-int ___lock_try_acquire(_LOCK_T *lock);
-int ___lock_release(_LOCK_T *lock);
-int ___lock_close(_LOCK_T *lock);
-#define __lock_init(__lock) ___lock_init(&__lock)
-#define __lock_acquire(__lock) ___lock_acquire(&__lock)
-#define __lock_release(__lock) ___lock_release(&__lock)
-#define __lock_try_acquire(__lock) ___lock_try_acquire(&__lock)
-#define __lock_close(__lock) 0
-
-
-typedef struct {
- struct thread *owner;
- int count;
- struct wait_queue_head wait;
-} _LOCK_RECURSIVE_T;
-
-#define __LOCK_INIT_RECURSIVE(class, lock) \
- class _LOCK_RECURSIVE_T lock = { .wait = __WAIT_QUEUE_HEAD_INITIALIZER((lock).wait) }
-
-int ___lock_init_recursive(_LOCK_RECURSIVE_T *lock);
-int ___lock_acquire_recursive(_LOCK_RECURSIVE_T *lock);
-int ___lock_try_acquire_recursive(_LOCK_RECURSIVE_T *lock);
-int ___lock_release_recursive(_LOCK_RECURSIVE_T *lock);
-int ___lock_close_recursive(_LOCK_RECURSIVE_T *lock);
-#define __lock_init_recursive(__lock) ___lock_init_recursive(&__lock)
-#define __lock_acquire_recursive(__lock) ___lock_acquire_recursive(&__lock)
-#define __lock_release_recursive(__lock) ___lock_release_recursive(&__lock)
-#define __lock_try_acquire_recursive(__lock) ___lock_try_acquire_recursive(&__lock)
-#define __lock_close_recursive(__lock) 0
-
-#endif
-
-#endif /* _MINIOS_SYS_LOCK_H_ */
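
This is the glue newlib expects for its internal locking: the
double-underscore macros take the lock object by value and pass its address
to the ___lock_* functions. A sketch, assuming HAVE_LIBC (stdio_lock and
guarded are illustrative):

    #include <sys/lock.h>

    __LOCK_INIT(static, stdio_lock);

    static void guarded(void)
    {
        __lock_acquire(stdio_lock);
        /* ... critical section ... */
        __lock_release(stdio_lock);
    }
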
diff --git a/extras/mini-os/include/sys/time.h b/extras/mini-os/include/sys/time.h
deleted file mode 100644
index 3be3653..0000000
--- a/extras/mini-os/include/sys/time.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- ****************************************************************************
- *
- * File: time.h
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes: Grzegorz Milos (gm281 at cam.ac.uk)
- * Robert Kaiser (kaiser at informatik.fh-wiesbaden.de)
- *
- * Date: Jul 2003, changes: Jun 2005, Sep 2006
- *
- * Environment: Xen Minimal OS
- * Description: Time and timer functions
- *
- ****************************************************************************
- */
-
-#ifndef _MINIOS_SYS_TIME_H_
-#define _MINIOS_SYS_TIME_H_
-
-#ifdef HAVE_LIBC
-#include_next <sys/time.h>
-
-#else
-struct timespec {
- time_t tv_sec;
- long tv_nsec;
-};
-
-struct timezone {
-};
-
-struct timeval {
- time_t tv_sec; /* seconds */
- suseconds_t tv_usec; /* microseconds */
-};
-
-int gettimeofday(struct timeval *tv, void *tz);
-
-#endif
-#ifdef HAVE_LIBC
-#include <sys/select.h>
-#endif
-
-#endif /* _MINIOS_SYS_TIME_H_ */
diff --git a/extras/mini-os/include/time.h b/extras/mini-os/include/time.h
deleted file mode 100644
index 5d6ed67..0000000
--- a/extras/mini-os/include/time.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- ****************************************************************************
- *
- * File: time.h
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes: Grzegorz Milos (gm281 at cam.ac.uk)
- * Robert Kaiser (kaiser at informatik.fh-wiesbaden.de)
- *
- * Date: Jul 2003, changes: Jun 2005, Sep 2006
- *
- * Environment: Xen Minimal OS
- * Description: Time and timer functions
- *
- ****************************************************************************
- */
-
-#ifndef _MINIOS_TIME_H_
-#define _MINIOS_TIME_H_
-#include <mini-os/types.h>
-
-/*
- * System Time
- * A 64-bit value containing the nanoseconds elapsed since boot,
- * adjusted for frequency drift.
- * NOW() returns the current time.
- * The other macros are convenience helpers that convert short
- * real-time intervals into system time.
- */
-typedef int64_t s_time_t;
-#define NOW() ((s_time_t)monotonic_clock())
-#define SECONDS(_s) (((s_time_t)(_s)) * 1000000000UL )
-#define TENTHS(_ts) (((s_time_t)(_ts)) * 100000000UL )
-#define HUNDREDTHS(_hs) (((s_time_t)(_hs)) * 10000000UL )
-#define MILLISECS(_ms) (((s_time_t)(_ms)) * 1000000UL )
-#define MICROSECS(_us) (((s_time_t)(_us)) * 1000UL )
-#define Time_Max ((s_time_t) 0x7fffffffffffffffLL)
-#define FOREVER Time_Max
-#define NSEC_TO_USEC(_nsec) ((_nsec) / 1000UL)
-#define NSEC_TO_MSEC(_nsec) ((_nsec) / 1000000ULL)
-#define NSEC_TO_SEC(_nsec) ((_nsec) / 1000000000ULL)
-
-/* wall clock time */
-typedef long time_t;
-typedef long suseconds_t;
-
-#include <sys/time.h>
-
-#ifdef HAVE_LIBC
-#include_next <time.h>
-#endif
-
-/* prototypes */
-void init_time(void);
-void fini_time(void);
-s_time_t get_s_time(void);
-s_time_t get_v_time(void);
-uint64_t monotonic_clock(void);
-void block_domain(s_time_t until);
-
-#endif /* _MINIOS_TIME_H_ */
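
NOW() and the interval macros are the building blocks for timed waits;
roughly what msleep() in sched.h does can be sketched with this header's API
alone (sleep_ms_sketch is illustrative):

    #include <mini-os/time.h>

    /* Block the domain until ms milliseconds from now have elapsed. */
    static void sleep_ms_sketch(uint32_t ms)
    {
        s_time_t deadline = NOW() + MILLISECS(ms);
        while (NOW() < deadline)
            block_domain(deadline);   /* hypervisor may wake us early */
    }
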
diff --git a/extras/mini-os/include/tpm_tis.h b/extras/mini-os/include/tpm_tis.h
deleted file mode 100644
index 1faca0d..0000000
--- a/extras/mini-os/include/tpm_tis.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2010-2012 United States Government, as represented by
- * the Secretary of Defense. All rights reserved.
- *
- * This code has been derived from drivers/char/tpm.c
- * from the linux kernel
- *
- * Copyright (C) 2004 IBM Corporation
- *
- * This code has also been derived from drivers/char/tpm/tpm_tis.c
- * from the linux kernel
- *
- * Copyright (C) 2005, 2006 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2
- * of the License
- */
-#ifndef TPM_TIS_H
-#define TPM_TIS_H
-
-#include <mini-os/types.h>
-#include <mini-os/byteorder.h>
-
-#define TPM_TIS_EN_LOCL0 1
-#define TPM_TIS_EN_LOCL1 (1 << 1)
-#define TPM_TIS_EN_LOCL2 (1 << 2)
-#define TPM_TIS_EN_LOCL3 (1 << 3)
-#define TPM_TIS_EN_LOCL4 (1 << 4)
-#define TPM_TIS_EN_LOCLALL (TPM_TIS_EN_LOCL0 | TPM_TIS_EN_LOCL1 | TPM_TIS_EN_LOCL2 | TPM_TIS_EN_LOCL3 | TPM_TIS_EN_LOCL4)
-#define TPM_TIS_LOCL_INT_TO_FLAG(x) (1 << x)
-#define TPM_BASEADDR 0xFED40000
-#define TPM_PROBE_IRQ 0xFFFF
-
-struct tpm_chip;
-
-struct tpm_chip* init_tpm_tis(unsigned long baseaddr, int localities, unsigned int irq);
-void shutdown_tpm_tis(struct tpm_chip* tpm);
-
-int tpm_tis_request_locality(struct tpm_chip* tpm, int locality);
-int tpm_tis_cmd(struct tpm_chip* tpm, uint8_t* req, size_t reqlen, uint8_t** resp, size_t* resplen);
-
-#ifdef HAVE_LIBC
-#include <sys/stat.h>
-#include <fcntl.h>
-/* POSIX IO functions:
- * - Use tpm_tis_open() to get a file descriptor to the tpm device.
- * - Use write() on the fd to send a command to the backend; you must
- *   include the entire command in a single call to write().
- * - Use read() on the fd to read the response. You can use fstat()
- *   to get the size of the response and lseek() to seek on it.
- */
-int tpm_tis_open(struct tpm_chip* tpm);
-int tpm_tis_posix_read(int fd, uint8_t* buf, size_t count);
-int tpm_tis_posix_write(int fd, const uint8_t* buf, size_t count);
-int tpm_tis_posix_fstat(int fd, struct stat* buf);
-#endif
-
-#endif
diff --git a/extras/mini-os/include/tpmback.h b/extras/mini-os/include/tpmback.h
deleted file mode 100644
index 4408986..0000000
--- a/extras/mini-os/include/tpmback.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2010-2012 United States Government, as represented by
- * the Secretary of Defense. All rights reserved.
- *
- * This code has been derived from drivers/xen/tpmback/tpmback.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (c) 2005, IBM Corporation
- *
- * which was itself derived from drivers/xen/netback/netback.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (c) 2002-2004, K A Fraser
- *
- * This code has also been derived from drivers/xen/tpmback/xenbus.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (C) 2005 IBM Corporation
- * Copyright (C) 2005 Rusty Russell <rusty at rustcorp.com.au>
- *
- * This code has also been derived from drivers/xen/tpmback/interface.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (c) 2005, IBM Corporation
- *
- * which was itself also derived from drivers/xen/netback/interface.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (c) 2004, Keir Fraser
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2
- * of the License
- */
-
-#include <xen/io/tpmif.h>
-#include <xen/io/xenbus.h>
-#include <mini-os/types.h>
-#include <xen/xen.h>
-#ifndef TPMBACK_H
-#define TPMBACK_H
-
-struct tpmcmd {
- domid_t domid; /* Domid of the frontend */
- uint8_t locality; /* Locality requested by the frontend */
- unsigned int handle; /* Handle of the frontend */
- void *opaque; /* Opaque pointer taken from the tpmback instance */
-
- uint8_t* req; /* tpm command bits, allocated by driver, DON'T FREE IT */
- unsigned int req_len; /* Size of the command in buf - set by tpmback driver */
-   unsigned int resp_len;             /* Size of the outgoing response;
-                                set this before passing the cmd object to tpmback_resp */
- uint8_t* resp; /* Buffer for response - YOU MUST ALLOCATE IT, YOU MUST ALSO FREE IT */
-};
-typedef struct tpmcmd tpmcmd_t;
-
-/* Initialize the tpm backend driver */
-void init_tpmback(void (*open_cb)(domid_t, unsigned int), void (*close_cb)(domid_t, unsigned int));
-
-/* Shutdown tpm backend driver */
-void shutdown_tpmback(void);
-
-/* Blocks until a tpm command is sent from any front end.
- * Returns a pointer to the tpm command to handle.
- * Do not try to free this pointer or the req buffer.
- * This function will return NULL if the tpm backend driver
- * is shut down or any other error occurs */
-tpmcmd_t* tpmback_req_any(void);
-
-/* Blocks until a tpm command from the frontend at domid/handle
- * is sent.
- * Returns NULL if domid/handle is not connected, tpmback is
- * shut down or shutting down, or if there is an error
- */
-tpmcmd_t* tpmback_req(domid_t domid, unsigned int handle);
-
-/* Send the response to the tpm command back to the frontend.
- * This function will free the tpmcmd object, but you must free the resp
- * buffer yourself */
-void tpmback_resp(tpmcmd_t* tpmcmd);
-
-/* Waits for the first frontend to connect and then sets domid and handle appropriately.
- * If one or more frontends are already connected, this will set domid and handle to one
- * of them arbitrarily. The main use for this function is to wait until a single
- * frontend connection has occurred.
- * returns 0 on success, non-zero on failure */
-int tpmback_wait_for_frontend_connect(domid_t *domid, unsigned int *handle);
-
-/* returns the number of frontends connected */
-int tpmback_num_frontends(void);
-
-/* Returns the uuid of the specified frontend, NULL on error.
- * The return value is internally allocated, so don't free it */
-unsigned char* tpmback_get_uuid(domid_t domid, unsigned int handle);
-
-/* Get and set the opaque pointer for a tpmback instance */
-void* tpmback_get_opaque(domid_t domid, unsigned int handle);
-/* Returns zero if successful, nonzero on failure (no such frontend) */
-int tpmback_set_opaque(domid_t domid, unsigned int handle, void* opaque);
-
-/* Get the XSM context of the given domain (using the tpmback event channel) */
-int tpmback_get_peercontext(domid_t domid, unsigned int handle, void* buffer, int buflen);
-#endif
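
A minimal backend loop matching the ownership rules spelled out above (the
driver owns req; the caller allocates and frees resp). Illustrative only; the
two zero bytes stand in for a real TPM response:

    #include <mini-os/tpmback.h>
    #include <stdlib.h>

    static void serve_one(void)
    {
        tpmcmd_t *cmd = tpmback_req_any();
        uint8_t *resp;

        if (cmd == NULL)
            return;                /* backend shut down, or error */
        resp = malloc(2);
        if (resp == NULL)
            return;                /* a real loop would still answer */
        resp[0] = resp[1] = 0;     /* placeholder payload */
        cmd->resp = resp;
        cmd->resp_len = 2;
        tpmback_resp(cmd);         /* frees cmd ... */
        free(resp);                /* ... but resp is ours to free */
    }
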
diff --git a/extras/mini-os/include/tpmfront.h b/extras/mini-os/include/tpmfront.h
deleted file mode 100644
index c489fae..0000000
--- a/extras/mini-os/include/tpmfront.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2010-2012 United States Government, as represented by
- * the Secretary of Defense. All rights reserved.
- *
- * This code has been derived from drivers/char/tpm_vtpm.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (C) 2006 IBM Corporation
- *
- * This code has also been derived from drivers/char/tpm_xen.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (c) 2005, IBM Corporation
- *
- * which was itself derived from drivers/xen/netfront/netfront.c
- * from the linux kernel
- *
- * Copyright (c) 2002-2004, K A Fraser
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
- */
-#ifndef TPMFRONT_H
-#define TPMFRONT_H
-
-#include <mini-os/types.h>
-#include <mini-os/os.h>
-#include <mini-os/events.h>
-#include <mini-os/wait.h>
-#include <xen/xen.h>
-#include <xen/io/xenbus.h>
-#include <xen/io/tpmif.h>
-
-struct tpmfront_dev {
- grant_ref_t ring_ref;
- evtchn_port_t evtchn;
-
- tpmif_shared_page_t *page;
-
- domid_t bedomid;
- char* nodename;
- char* bepath;
-
- XenbusState state;
-
- uint8_t waiting;
- struct wait_queue_head waitq;
-
- uint8_t* respbuf;
- size_t resplen;
-
-#ifdef HAVE_LIBC
- int fd;
-#endif
-
-};
-
-
-/* Initialize frontend */
-struct tpmfront_dev* init_tpmfront(const char* nodename);
-/* Shutdown frontend */
-void shutdown_tpmfront(struct tpmfront_dev* dev);
-
-/* Send a tpm command to the backend and wait for the response
- *
- * @dev - frontend device
- * @req - request buffer
- * @reqlen - length of request buffer
- * @resp - *resp will be set to internal response buffer, don't free it! Value is undefined on error
- * @resplen - *resplen will be set to the length of the response. Value is undefined on error
- *
- * Returns 0 on success, non-zero on failure.
- * */
-int tpmfront_cmd(struct tpmfront_dev* dev, uint8_t* req, size_t reqlen, uint8_t** resp, size_t* resplen);
-
-/* Set the locality used for communicating with a vTPM */
-int tpmfront_set_locality(struct tpmfront_dev* dev, int locality);
-
-#ifdef HAVE_LIBC
-#include <sys/stat.h>
-/* POSIX IO functions:
- * - Use tpmfront_open() to get a file descriptor to the tpm device.
- * - Use write() on the fd to send a command to the backend; you must
- *   include the entire command in a single call to write().
- * - Use read() on the fd to read the response. You can use fstat()
- *   to get the size of the response and lseek() to seek on it.
- */
-int tpmfront_open(struct tpmfront_dev* dev);
-int tpmfront_posix_read(int fd, uint8_t* buf, size_t count);
-int tpmfront_posix_write(int fd, const uint8_t* buf, size_t count);
-int tpmfront_posix_fstat(int fd, struct stat* buf);
-#endif
-
-
-#endif
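
A round-trip sketch against the API above. Passing NULL to init_tpmfront()
is assumed to select the default xenstore node, as with other Mini-OS
frontends, and the request bytes are placeholders rather than a valid TPM
command:

    #include <mini-os/tpmfront.h>

    static int tpm_roundtrip(void)
    {
        struct tpmfront_dev *dev = init_tpmfront(NULL);
        uint8_t req[10] = { 0 };   /* placeholder command bytes */
        uint8_t *resp;
        size_t resplen;
        int rc;

        if (dev == NULL)
            return -1;
        rc = tpmfront_cmd(dev, req, sizeof(req), &resp, &resplen);
        /* resp/resplen describe the driver's internal buffer: don't
         * free resp, and don't use it after shutdown. */
        shutdown_tpmfront(dev);
        return rc;
    }
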
diff --git a/extras/mini-os/include/types.h b/extras/mini-os/include/types.h
deleted file mode 100644
index be9f1d3..0000000
--- a/extras/mini-os/include/types.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- ****************************************************************************
- *
- * File: types.h
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes:
- *
- * Date: May 2003
- *
- * Environment: Xen Minimal OS
- * Description: a random collection of type definitions
- *
- ****************************************************************************
- * $Id: h-insert.h,v 1.4 2002/11/08 16:03:55 rn Exp $
- ****************************************************************************
- */
-
-#ifndef _TYPES_H_
-#define _TYPES_H_
-#include <stddef.h>
-
-/* FreeBSD compat types */
-#ifndef HAVE_LIBC
-typedef unsigned char u_char;
-typedef unsigned int u_int;
-typedef unsigned long u_long;
-#endif
-#if defined(__i386__) || defined(__arm__)
-typedef long long quad_t;
-typedef unsigned long long u_quad_t;
-#elif defined(__x86_64__)
-typedef long quad_t;
-typedef unsigned long u_quad_t;
-#endif /* __i386__ || __x86_64__ */
-
-#ifdef HAVE_LIBC
-#include <limits.h>
-#include <stdint.h>
-#else
-#if defined(__i386__) || defined(__arm__)
-typedef unsigned int uintptr_t;
-typedef int intptr_t;
-#elif defined(__x86_64__) || defined(__aarch64__)
-typedef unsigned long uintptr_t;
-typedef long intptr_t;
-#endif /* __i386__ || __x86_64__ */
-typedef unsigned char uint8_t;
-typedef signed char int8_t;
-typedef unsigned short uint16_t;
-typedef signed short int16_t;
-typedef unsigned int uint32_t;
-typedef signed int int32_t;
-#if defined(__i386__) || defined(__arm__)
-typedef signed long long int64_t;
-typedef unsigned long long uint64_t;
-#elif defined(__x86_64__) || defined(__aarch64__)
-typedef signed long int64_t;
-typedef unsigned long uint64_t;
-#endif
-typedef uint64_t uintmax_t;
-typedef int64_t intmax_t;
-typedef int64_t off_t;
-#endif
-
-typedef intptr_t ptrdiff_t;
-
-
-#ifndef HAVE_LIBC
-typedef long ssize_t;
-#endif
-
-#endif /* _TYPES_H_ */
diff --git a/extras/mini-os/include/wait.h b/extras/mini-os/include/wait.h
deleted file mode 100644
index ecbe396..0000000
--- a/extras/mini-os/include/wait.h
+++ /dev/null
@@ -1,105 +0,0 @@
-#ifndef __WAIT_H__
-#define __WAIT_H__
-
-#include <mini-os/sched.h>
-#include <mini-os/os.h>
-#include <mini-os/waittypes.h>
-
-#define DEFINE_WAIT(name) \
-struct wait_queue name = { \
- .thread = get_current(), \
- .waiting = 0, \
-}
-
-
-static inline void init_waitqueue_head(struct wait_queue_head *h)
-{
- MINIOS_STAILQ_INIT(h);
-}
-
-static inline void init_waitqueue_entry(struct wait_queue *q, struct thread *thread)
-{
- q->thread = thread;
- q->waiting = 0;
-}
-
-static inline void add_wait_queue(struct wait_queue_head *h, struct wait_queue *q)
-{
- if (!q->waiting) {
- MINIOS_STAILQ_INSERT_HEAD(h, q, thread_list);
- q->waiting = 1;
- }
-}
-
-static inline void remove_wait_queue(struct wait_queue_head *h, struct wait_queue *q)
-{
- if (q->waiting) {
- MINIOS_STAILQ_REMOVE(h, q, struct wait_queue, thread_list);
- q->waiting = 0;
- }
-}
-
-static inline void wake_up(struct wait_queue_head *head)
-{
- unsigned long flags;
- struct wait_queue *curr, *tmp;
- local_irq_save(flags);
- MINIOS_STAILQ_FOREACH_SAFE(curr, head, thread_list, tmp)
- wake(curr->thread);
- local_irq_restore(flags);
-}
-
-#define add_waiter(w, wq) do { \
- unsigned long flags; \
- local_irq_save(flags); \
- add_wait_queue(&wq, &w); \
- block(get_current()); \
- local_irq_restore(flags); \
-} while (0)
-
-#define remove_waiter(w, wq) do { \
- unsigned long flags; \
- local_irq_save(flags); \
- remove_wait_queue(&wq, &w); \
- local_irq_restore(flags); \
-} while (0)
-
-#define wait_event_deadline(wq, condition, deadline) do { \
- unsigned long flags; \
- DEFINE_WAIT(__wait); \
- if(condition) \
- break; \
- for(;;) \
- { \
- /* protect the list */ \
- local_irq_save(flags); \
- add_wait_queue(&wq, &__wait); \
- get_current()->wakeup_time = deadline; \
- clear_runnable(get_current()); \
- local_irq_restore(flags); \
- if((condition) || (deadline && NOW() >= deadline)) \
- break; \
- schedule(); \
- } \
- local_irq_save(flags); \
- /* need to wake up */ \
- wake(get_current()); \
- remove_wait_queue(&wq, &__wait); \
- local_irq_restore(flags); \
-} while(0)
-
-#define wait_event(wq, condition) wait_event_deadline(wq, condition, 0)
-
-
-
-#endif /* __WAIT_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
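
The canonical shape for these queues: the waiter sleeps inside wait_event()
until the condition holds, and the producer sets the condition before calling
wake_up(). A sketch using only the macros above (data_wq/data_ready are
illustrative):

    #include <mini-os/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(data_wq);
    static volatile int data_ready;

    static void consumer(void)
    {
        wait_event(data_wq, data_ready);   /* sleeps until the flag is set */
        data_ready = 0;
    }

    /* Called e.g. from an event-channel handler. */
    static void producer(void)
    {
        data_ready = 1;      /* set the condition first ... */
        wake_up(&data_wq);   /* ... then wake any waiters */
    }
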
diff --git a/extras/mini-os/include/waittypes.h b/extras/mini-os/include/waittypes.h
deleted file mode 100644
index 42a4786..0000000
--- a/extras/mini-os/include/waittypes.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef __WAITTYPE_H__
-#define __WAITTYPE_H__
-
-#include <mini-os/list.h>
-
-struct thread;
-struct wait_queue
-{
- int waiting;
- struct thread *thread;
- MINIOS_STAILQ_ENTRY(struct wait_queue) thread_list;
-};
-
-/* TODO - lock required? */
-MINIOS_STAILQ_HEAD(wait_queue_head, struct wait_queue);
-
-#define DECLARE_WAIT_QUEUE_HEAD(name) \
- struct wait_queue_head name = MINIOS_STAILQ_HEAD_INITIALIZER(name)
-
-#define __WAIT_QUEUE_HEAD_INITIALIZER(name) MINIOS_STAILQ_HEAD_INITIALIZER(name)
-
-#endif
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/extras/mini-os/include/x86/arch_endian.h b/extras/mini-os/include/x86/arch_endian.h
deleted file mode 100644
index 0771683..0000000
--- a/extras/mini-os/include/x86/arch_endian.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef ARCH_ENDIAN_H
-#error "Do not include arch_endian by itself, include endian.h"
-#else
-
-#define __BYTE_ORDER __LITTLE_ENDIAN
-
-#endif
diff --git a/extras/mini-os/include/x86/arch_limits.h b/extras/mini-os/include/x86/arch_limits.h
deleted file mode 100644
index 41f8620..0000000
--- a/extras/mini-os/include/x86/arch_limits.h
+++ /dev/null
@@ -1,20 +0,0 @@
-
-#ifndef __ARCH_LIMITS_H__
-#define __ARCH_LIMITS_H__
-
-#define __PAGE_SHIFT 12
-
-#ifdef __ASSEMBLY__
-#define __PAGE_SIZE (1 << __PAGE_SHIFT)
-#else
-#ifdef __x86_64__
-#define __PAGE_SIZE (1UL << __PAGE_SHIFT)
-#else
-#define __PAGE_SIZE (1ULL << __PAGE_SHIFT)
-#endif
-#endif
-
-#define __STACK_SIZE_PAGE_ORDER 4
-#define __STACK_SIZE (__PAGE_SIZE * (1 << __STACK_SIZE_PAGE_ORDER))
-
-#endif /* __ARCH_LIMITS_H__ */
diff --git a/extras/mini-os/include/x86/arch_mm.h b/extras/mini-os/include/x86/arch_mm.h
deleted file mode 100644
index 23cfca7..0000000
--- a/extras/mini-os/include/x86/arch_mm.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- *
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- * Copyright (c) 2005, Keir A Fraser
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef _ARCH_MM_H_
-#define _ARCH_MM_H_
-
-#ifndef __ASSEMBLY__
-#include <xen/xen.h>
-#if defined(__i386__)
-#include <xen/arch-x86_32.h>
-#elif defined(__x86_64__)
-#include <xen/arch-x86_64.h>
-#else
-#error "Unsupported architecture"
-#endif
-#endif
-
-#define L1_FRAME 1
-#define L2_FRAME 2
-#define L3_FRAME 3
-
-#define L1_PAGETABLE_SHIFT 12
-
-#if defined(__i386__)
-
-#define L2_PAGETABLE_SHIFT 21
-#define L3_PAGETABLE_SHIFT 30
-
-#define L1_PAGETABLE_ENTRIES 512
-#define L2_PAGETABLE_ENTRIES 512
-#define L3_PAGETABLE_ENTRIES 4
-
-#define PADDR_BITS 44
-#define PADDR_MASK ((1ULL << PADDR_BITS)-1)
-
-#define L2_MASK ((1UL << L3_PAGETABLE_SHIFT) - 1)
-
-/*
- * If starting from a virtual address greater than 0xc0000000,
- * this value will be 2, to account for the final mid-level page
- * directory, which is always mapped in at this location.
- */
-#define NOT_L1_FRAMES 3
-#define PRIpte "016llx"
-#ifndef __ASSEMBLY__
-typedef uint64_t pgentry_t;
-#endif
-
-#elif defined(__x86_64__)
-
-#define L2_PAGETABLE_SHIFT 21
-#define L3_PAGETABLE_SHIFT 30
-#define L4_PAGETABLE_SHIFT 39
-
-#define L1_PAGETABLE_ENTRIES 512
-#define L2_PAGETABLE_ENTRIES 512
-#define L3_PAGETABLE_ENTRIES 512
-#define L4_PAGETABLE_ENTRIES 512
-
-/* These are page-table limitations. Current CPUs support only 40-bit phys. */
-#define PADDR_BITS 52
-#define VADDR_BITS 48
-#define PADDR_MASK ((1UL << PADDR_BITS)-1)
-#define VADDR_MASK ((1UL << VADDR_BITS)-1)
-
-#define L2_MASK ((1UL << L3_PAGETABLE_SHIFT) - 1)
-#define L3_MASK ((1UL << L4_PAGETABLE_SHIFT) - 1)
-
-#define NOT_L1_FRAMES 3
-#define PRIpte "016lx"
-#ifndef __ASSEMBLY__
-typedef unsigned long pgentry_t;
-#endif
-
-#endif
-
-#define L1_MASK ((1UL << L2_PAGETABLE_SHIFT) - 1)
-
-/* Given a virtual address, get an entry offset into a page table. */
-#define l1_table_offset(_a) \
- (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1))
-#define l2_table_offset(_a) \
- (((_a) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1))
-#define l3_table_offset(_a) \
- (((_a) >> L3_PAGETABLE_SHIFT) & (L3_PAGETABLE_ENTRIES - 1))
-#if defined(__x86_64__)
-#define l4_table_offset(_a) \
- (((_a) >> L4_PAGETABLE_SHIFT) & (L4_PAGETABLE_ENTRIES - 1))
-#endif
-
-#define _PAGE_PRESENT 0x001ULL
-#define _PAGE_RW 0x002ULL
-#define _PAGE_USER 0x004ULL
-#define _PAGE_PWT 0x008ULL
-#define _PAGE_PCD 0x010ULL
-#define _PAGE_ACCESSED 0x020ULL
-#define _PAGE_DIRTY 0x040ULL
-#define _PAGE_PAT 0x080ULL
-#define _PAGE_PSE 0x080ULL
-#define _PAGE_GLOBAL 0x100ULL
-
-#if defined(__i386__)
-#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
-#define L1_PROT_RO (_PAGE_PRESENT|_PAGE_ACCESSED)
-#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY |_PAGE_USER)
-#define L3_PROT (_PAGE_PRESENT)
-#elif defined(__x86_64__)
-#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
-#define L1_PROT_RO (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_USER)
-#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
-#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
-#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
-#endif /* __i386__ || __x86_64__ */
-
-/* flags for ioremap */
-#define IO_PROT (L1_PROT)
-#define IO_PROT_NOCACHE (L1_PROT | _PAGE_PCD)
-
-/* for P2M */
-#define INVALID_P2M_ENTRY (~0UL)
-
-#include "arch_limits.h"
-#define PAGE_SIZE __PAGE_SIZE
-#define PAGE_SHIFT __PAGE_SHIFT
-#define PAGE_MASK (~(PAGE_SIZE-1))
-
-#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> L1_PAGETABLE_SHIFT)
-#define PFN_DOWN(x) ((x) >> L1_PAGETABLE_SHIFT)
-#define PFN_PHYS(x) ((uint64_t)(x) << L1_PAGETABLE_SHIFT)
-#define PHYS_PFN(x) ((x) >> L1_PAGETABLE_SHIFT)
-
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
-#ifndef __ASSEMBLY__
-/* Definitions for machine and pseudophysical addresses. */
-#ifdef __i386__
-typedef unsigned long long paddr_t;
-typedef unsigned long long maddr_t;
-#else
-typedef unsigned long paddr_t;
-typedef unsigned long maddr_t;
-#endif
-
-extern unsigned long *phys_to_machine_mapping;
-extern char _text, _etext, _erodata, _edata, _end;
-extern unsigned long mfn_zero;
-#define pfn_to_mfn(_pfn) (phys_to_machine_mapping[(_pfn)])
-static __inline__ maddr_t phys_to_machine(paddr_t phys)
-{
- maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
- machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
- return machine;
-}
-
-#define mfn_to_pfn(_mfn) (machine_to_phys_mapping[(_mfn)])
-static __inline__ paddr_t machine_to_phys(maddr_t machine)
-{
- paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
- phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
- return phys;
-}
-#endif
-
-#define VIRT_START ((unsigned long)&_text)
-
-#define to_phys(x) ((unsigned long)(x)-VIRT_START)
-#define to_virt(x) ((void *)((unsigned long)(x)+VIRT_START))
-
-#define virt_to_pfn(_virt) (PFN_DOWN(to_phys(_virt)))
-#define virt_to_mfn(_virt) (pfn_to_mfn(virt_to_pfn(_virt)))
-#define mach_to_virt(_mach) (to_virt(machine_to_phys(_mach)))
-#define virt_to_mach(_virt) (phys_to_machine(to_phys(_virt)))
-#define mfn_to_virt(_mfn) (to_virt(mfn_to_pfn(_mfn) << PAGE_SHIFT))
-#define pfn_to_virt(_pfn) (to_virt((_pfn) << PAGE_SHIFT))
-
-/* Pagetable walking. */
-#define pte_to_mfn(_pte) (((_pte) & (PADDR_MASK&PAGE_MASK)) >> L1_PAGETABLE_SHIFT)
-#define pte_to_virt(_pte) to_virt(mfn_to_pfn(pte_to_mfn(_pte)) << PAGE_SHIFT)
-
-
-#define PT_BASE ((pgentry_t *)start_info.pt_base)
-
-#ifdef __x86_64__
-#define virtual_to_l3(_virt) ((pgentry_t *)pte_to_virt(PT_BASE[l4_table_offset(_virt)]))
-#else
-#define virtual_to_l3(_virt) PT_BASE
-#endif
-
-#define virtual_to_l2(_virt) ({ \
- unsigned long __virt2 = (_virt); \
- (pgentry_t *) pte_to_virt(virtual_to_l3(__virt2)[l3_table_offset(__virt2)]); \
-})
-
-#define virtual_to_l1(_virt) ({ \
- unsigned long __virt1 = (_virt); \
- (pgentry_t *) pte_to_virt(virtual_to_l2(__virt1)[l2_table_offset(__virt1)]); \
-})
-
-#define virtual_to_pte(_virt) ({ \
- unsigned long __virt0 = (unsigned long) (_virt); \
- virtual_to_l1(__virt0)[l1_table_offset(__virt0)]; \
-})
-#define virtual_to_mfn(_virt) pte_to_mfn(virtual_to_pte(_virt))
-
-#define map_frames(f, n) map_frames_ex(f, n, 1, 0, 1, DOMID_SELF, NULL, L1_PROT)
-#define map_zero(n, a) map_frames_ex(&mfn_zero, n, 0, 0, a, DOMID_SELF, NULL, L1_PROT_RO)
-#define do_map_zero(start, n) do_map_frames(start, &mfn_zero, n, 0, 0, DOMID_SELF, NULL, L1_PROT_RO)
-
-pgentry_t *need_pgt(unsigned long addr);
-
-#endif /* _ARCH_MM_H_ */
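
The virtual_to_* macros walk the live page tables top-down. A small sketch
that uses them to test a mapping's flags; is_mapped_writable is illustrative,
<mini-os/mm.h> is assumed to be the umbrella header that pulls this file in,
and the walk assumes all intermediate tables exist (a robust version would
check each level):

    #include <mini-os/mm.h>

    static int is_mapped_writable(unsigned long va)
    {
        pgentry_t pte = virtual_to_pte(va);
        return (pte & _PAGE_PRESENT) && (pte & _PAGE_RW);
    }
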
diff --git a/extras/mini-os/include/x86/arch_sched.h b/extras/mini-os/include/x86/arch_sched.h
deleted file mode 100644
index b494eca..0000000
--- a/extras/mini-os/include/x86/arch_sched.h
+++ /dev/null
@@ -1,25 +0,0 @@
-
-#ifndef __ARCH_SCHED_H__
-#define __ARCH_SCHED_H__
-
-#include "arch_limits.h"
-
-static inline struct thread* get_current(void)
-{
- struct thread **current;
-#ifdef __i386__
- register unsigned long sp asm("esp");
-#else
- register unsigned long sp asm("rsp");
-#endif
- current = (void *)(unsigned long)(sp & ~(__STACK_SIZE-1));
- return *current;
-}
-
-extern void __arch_switch_threads(unsigned long *prevctx, unsigned long *nextctx);
-
-#define arch_switch_threads(prev,next) __arch_switch_threads(&(prev)->sp, &(next)->sp)
-
-
-
-#endif /* __ARCH_SCHED_H__ */
diff --git a/extras/mini-os/include/x86/arch_spinlock.h b/extras/mini-os/include/x86/arch_spinlock.h
deleted file mode 100644
index 59f7b63..0000000
--- a/extras/mini-os/include/x86/arch_spinlock.h
+++ /dev/null
@@ -1,94 +0,0 @@
-
-
-#ifndef __ARCH_ASM_SPINLOCK_H
-#define __ARCH_ASM_SPINLOCK_H
-
-#include <mini-os/lib.h>
-#include "os.h"
-
-
-#define ARCH_SPIN_LOCK_UNLOCKED { 1 }
-
-/*
- * Simple spin lock operations. There are two variants: one clears IRQs
- * on the local processor, one does not.
- *
- * We make no fairness assumptions. They have a cost.
- */
-
-#define arch_spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) <= 0)
-#define arch_spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x))
-
-#define spin_lock_string \
- "1:\n" \
- LOCK \
- "decb %0\n\t" \
- "jns 3f\n" \
- "2:\t" \
- "rep;nop\n\t" \
- "cmpb $0,%0\n\t" \
- "jle 2b\n\t" \
- "jmp 1b\n" \
- "3:\n\t"
-
-#define spin_lock_string_flags \
- "1:\n" \
- LOCK \
- "decb %0\n\t" \
- "jns 4f\n\t" \
- "2:\t" \
- "testl $0x200, %1\n\t" \
- "jz 3f\n\t" \
- "#sti\n\t" \
- "3:\t" \
- "rep;nop\n\t" \
- "cmpb $0, %0\n\t" \
- "jle 3b\n\t" \
- "#cli\n\t" \
- "jmp 1b\n" \
- "4:\n\t"
-
-/*
- * This works, despite all the confusion (except on PPro SMP,
- * or if we are using OOSTORE; see PPro errata 66 and 92).
- */
-
-#define spin_unlock_string \
- "xchgb %b0, %1" \
- :"=q" (oldval), "=m" (lock->slock) \
- :"0" (oldval) : "memory"
-
-static inline void _raw_spin_unlock(spinlock_t *lock)
-{
- char oldval = ARCH_SPIN_LOCK_UNLOCKED;
- __asm__ __volatile__(
- spin_unlock_string
- );
-}
-
-static inline int _raw_spin_trylock(spinlock_t *lock)
-{
- char oldval;
- __asm__ __volatile__(
- "xchgb %b0,%1\n"
- :"=q" (oldval), "=m" (lock->slock)
- :"0" (0) : "memory");
- return oldval > 0;
-}
-
-static inline void _raw_spin_lock(spinlock_t *lock)
-{
- __asm__ __volatile__(
- spin_lock_string
- :"=m" (lock->slock) : : "memory");
-}
-
-static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
-{
- __asm__ __volatile__(
- spin_lock_string_flags
- :"=m" (lock->slock) : "r" (flags) : "memory");
-}
-
-#endif
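
The lock byte above holds 1 when free: acquisition is a locked decrement that succeeds when the result stays non-negative, waiters spin on rep;nop until the byte goes positive before retrying, and release simply stores 1 back. A sketch of the same protocol using GCC atomic builtins in place of the inline assembly (the type name is an assumption, and the pause builtin keeps the sketch x86-only):

    typedef struct { volatile signed char slock; } byte_lock_t;  /* 1 == free */

    static inline void byte_lock(byte_lock_t *l)
    {
        for (;;) {
            /* "decb; jns": acquired iff the decremented value is >= 0 */
            if (__atomic_sub_fetch(&l->slock, 1, __ATOMIC_ACQUIRE) >= 0)
                return;
            /* "rep;nop" polling loop: wait for the byte to go positive */
            while (l->slock <= 0)
                __builtin_ia32_pause();
        }
    }

    static inline void byte_unlock(byte_lock_t *l)
    {
        __atomic_store_n(&l->slock, 1, __ATOMIC_RELEASE);
    }
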
diff --git a/extras/mini-os/include/x86/os.h b/extras/mini-os/include/x86/os.h
deleted file mode 100644
index ee9050b..0000000
--- a/extras/mini-os/include/x86/os.h
+++ /dev/null
@@ -1,572 +0,0 @@
-/******************************************************************************
- * os.h
- *
- * random collection of macros and definitions
- */
-
-#ifndef _OS_H_
-#define _OS_H_
-
-#define smp_processor_id() 0
-
-
-#ifndef __ASSEMBLY__
-#include <mini-os/compiler.h>
-#include <mini-os/types.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/kernel.h>
-#include <xen/xsm/flask_op.h>
-
-#define USED __attribute__ ((used))
-
-#define BUG do_exit
-
-#endif
-#include <xen/xen.h>
-
-
-
-#define __KERNEL_CS FLAT_KERNEL_CS
-#define __KERNEL_DS FLAT_KERNEL_DS
-#define __KERNEL_SS FLAT_KERNEL_SS
-
-#define TRAP_divide_error 0
-#define TRAP_debug 1
-#define TRAP_nmi 2
-#define TRAP_int3 3
-#define TRAP_overflow 4
-#define TRAP_bounds 5
-#define TRAP_invalid_op 6
-#define TRAP_no_device 7
-#define TRAP_double_fault 8
-#define TRAP_copro_seg 9
-#define TRAP_invalid_tss 10
-#define TRAP_no_segment 11
-#define TRAP_stack_error 12
-#define TRAP_gp_fault 13
-#define TRAP_page_fault 14
-#define TRAP_spurious_int 15
-#define TRAP_copro_error 16
-#define TRAP_alignment_check 17
-#define TRAP_machine_check 18
-#define TRAP_simd_error 19
-#define TRAP_deferred_nmi 31
-
-/* Everything below this point is not included by assembler (.S) files. */
-#ifndef __ASSEMBLY__
-
-extern shared_info_t *HYPERVISOR_shared_info;
-
-void trap_init(void);
-void trap_fini(void);
-
-void arch_fini(void);
-
-
-
-
-
-/*
- * The use of 'barrier' in the following macros reflects their role as
- * local-lock operations. Reentrancy must be prevented (e.g., __cli())
- * /before/ the following critical operations are executed. All critical
- * operations must complete /before/ reentrancy is permitted (e.g.,
- * __sti()). The Alpha port of Linux, for example, includes the same
- * barriers.
- */
-
-#define __cli() \
-do { \
- vcpu_info_t *_vcpu; \
- _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \
- _vcpu->evtchn_upcall_mask = 1; \
- barrier(); \
-} while (0)
-
-#define __sti() \
-do { \
- vcpu_info_t *_vcpu; \
- barrier(); \
- _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \
- _vcpu->evtchn_upcall_mask = 0; \
- barrier(); /* unmask then check (avoid races) */ \
- if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
- force_evtchn_callback(); \
-} while (0)
-
-#define __save_flags(x) \
-do { \
- vcpu_info_t *_vcpu; \
- _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \
- (x) = _vcpu->evtchn_upcall_mask; \
-} while (0)
-
-#define __restore_flags(x) \
-do { \
- vcpu_info_t *_vcpu; \
- barrier(); \
- _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \
- if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
- barrier(); /* unmask then check (avoid races) */ \
- if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
- force_evtchn_callback(); \
- }\
-} while (0)
-
-#define safe_halt() ((void)0)
-
-#define __save_and_cli(x) \
-do { \
- vcpu_info_t *_vcpu; \
- _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \
- (x) = _vcpu->evtchn_upcall_mask; \
- _vcpu->evtchn_upcall_mask = 1; \
- barrier(); \
-} while (0)
-
-#define local_irq_save(x) __save_and_cli(x)
-#define local_irq_restore(x) __restore_flags(x)
-#define local_save_flags(x) __save_flags(x)
-#define local_irq_disable() __cli()
-#define local_irq_enable() __sti()
-
-#define irqs_disabled() \
- HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].evtchn_upcall_mask
-
-/* This is a barrier for the compiler only, NOT the processor! */
-#define barrier() __asm__ __volatile__("": : :"memory")
-
-#if defined(__i386__)
-#define mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory")
-#define rmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory")
-#define wmb() __asm__ __volatile__ ("": : :"memory")
-#elif defined(__x86_64__)
-#define mb() __asm__ __volatile__ ("mfence":::"memory")
-#define rmb() __asm__ __volatile__ ("lfence":::"memory")
-#define wmb() __asm__ __volatile__ ("sfence" ::: "memory") /* From CONFIG_UNORDERED_IO (linux) */
-#endif
-
-
-#define LOCK_PREFIX ""
-#define LOCK ""
-#define ADDR (*(volatile long *) addr)
-/*
- * Make sure gcc doesn't try to be clever and move things around
- * on us. We need to use _exactly_ the address the user gave us,
- * not some alias that contains the same information.
- */
-typedef struct { volatile int counter; } atomic_t;
-
-
-/************************** i386 *******************************/
-#ifdef __INSIDE_MINIOS__
-#if defined (__i386__)
-
-#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
-struct __xchg_dummy { unsigned long a[100]; };
-#define __xg(x) ((struct __xchg_dummy *)(x))
-static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
-{
- switch (size) {
- case 1:
- __asm__ __volatile__("xchgb %b0,%1"
- :"=q" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- case 2:
- __asm__ __volatile__("xchgw %w0,%1"
- :"=r" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- case 4:
- __asm__ __volatile__("xchgl %0,%1"
- :"=r" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- }
- return x;
-}
-
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered on x86.
- * It may be reordered on other architectures.
- * It also implies a memory barrier.
- */
-static inline int test_and_clear_bit(int nr, volatile unsigned long * addr)
-{
- int oldbit;
-
- __asm__ __volatile__( LOCK
- "btrl %2,%1\n\tsbbl %0,%0"
- :"=r" (oldbit),"=m" (ADDR)
- :"Ir" (nr) : "memory");
- return oldbit;
-}
-
-static inline int constant_test_bit(int nr, const volatile unsigned long *addr)
-{
- return ((1UL << (nr & 31)) & (addr[nr >> 5])) != 0;
-}
-
-static inline int variable_test_bit(int nr, const volatile unsigned long * addr)
-{
- int oldbit;
-
- __asm__ __volatile__(
- "btl %2,%1\n\tsbbl %0,%0"
- :"=r" (oldbit)
- :"m" (ADDR),"Ir" (nr));
- return oldbit;
-}
-
-#define test_bit(nr,addr) \
-(__builtin_constant_p(nr) ? \
- constant_test_bit((nr),(addr)) : \
- variable_test_bit((nr),(addr)))
-
-/**
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered. See __set_bit()
- * if you do not require the atomic guarantees.
- *
- * Note: there are no guarantees that this function will not be reordered
- * on non-x86 architectures, so if you are writing portable code,
- * make sure not to rely on its reordering guarantees.
- *
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void set_bit(int nr, volatile unsigned long * addr)
-{
- __asm__ __volatile__( LOCK
- "btsl %1,%0"
- :"=m" (ADDR)
- :"Ir" (nr));
-}
-
-/**
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered. However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
- * in order to ensure changes are visible on other processors.
- */
-static inline void clear_bit(int nr, volatile unsigned long * addr)
-{
- __asm__ __volatile__( LOCK
- "btrl %1,%0"
- :"=m" (ADDR)
- :"Ir" (nr));
-}
-
-/**
- * __ffs - find first set bit in word.
- * @word: The word to search
- *
- * Undefined if no bit is set, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- __asm__("bsfl %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
-
-/*
- * These have to be done with inline assembly: that way the bit-setting
- * is guaranteed to be atomic. All bit operations return 0 if the bit
- * was cleared before the operation and != 0 if it was not.
- *
- * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
- */
-#define ADDR (*(volatile long *) addr)
-
-#define rdtscll(val) \
- __asm__ __volatile__("rdtsc" : "=A" (val))
-
-
-
-#elif defined(__x86_64__)/* ifdef __i386__ */
-/************************** x86_64 *******************************/
-
-#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
-#define __xg(x) ((volatile long *)(x))
-static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
-{
- switch (size) {
- case 1:
- __asm__ __volatile__("xchgb %b0,%1"
- :"=q" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- case 2:
- __asm__ __volatile__("xchgw %w0,%1"
- :"=r" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- case 4:
- __asm__ __volatile__("xchgl %k0,%1"
- :"=r" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- case 8:
- __asm__ __volatile__("xchgq %0,%1"
- :"=r" (x)
- :"m" (*__xg(ptr)), "0" (x)
- :"memory");
- break;
- }
- return x;
-}
-
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
- */
-static __inline__ int test_and_clear_bit(int nr, volatile void * addr)
-{
- int oldbit;
-
- __asm__ __volatile__( LOCK_PREFIX
- "btrl %2,%1\n\tsbbl %0,%0"
- :"=r" (oldbit),"=m" (ADDR)
- :"dIr" (nr) : "memory");
- return oldbit;
-}
-
-static __inline__ int constant_test_bit(int nr, const volatile void * addr)
-{
- return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
-}
-
-static __inline__ int variable_test_bit(int nr, volatile const void * addr)
-{
- int oldbit;
-
- __asm__ __volatile__(
- "btl %2,%1\n\tsbbl %0,%0"
- :"=r" (oldbit)
- :"m" (ADDR),"dIr" (nr));
- return oldbit;
-}
-
-#define test_bit(nr,addr) \
-(__builtin_constant_p(nr) ? \
- constant_test_bit((nr),(addr)) : \
- variable_test_bit((nr),(addr)))
-
-
-/**
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered. See __set_bit()
- * if you do not require the atomic guarantees.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static __inline__ void set_bit(int nr, volatile void * addr)
-{
- __asm__ __volatile__( LOCK_PREFIX
- "btsl %1,%0"
- :"=m" (ADDR)
- :"dIr" (nr) : "memory");
-}
-
-/**
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered. However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
- * in order to ensure changes are visible on other processors.
- */
-static __inline__ void clear_bit(int nr, volatile void * addr)
-{
- __asm__ __volatile__( LOCK_PREFIX
- "btrl %1,%0"
- :"=m" (ADDR)
- :"dIr" (nr));
-}
-
-/**
- * __ffs - find first set bit in word.
- * @word: The word to search
- *
- * Undefined if no bit is set, so code should check against 0 first.
- */
-static __inline__ unsigned long __ffs(unsigned long word)
-{
- __asm__("bsfq %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
-#define ADDR (*(volatile long *) addr)
-
-#define rdtscll(val) do { \
- unsigned int __a,__d; \
- asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
- (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
-} while(0)
-
-#define wrmsr(msr,val1,val2) \
- __asm__ __volatile__("wrmsr" \
- : /* no outputs */ \
- : "c" (msr), "a" (val1), "d" (val2))
-
-#define wrmsrl(msr,val) wrmsr(msr,(uint32_t)((uint64_t)(val)),((uint64_t)(val))>>32)
-
-
-#else /* ifdef __x86_64__ */
-#error "Unsupported architecture"
-#endif
-#endif /* ifdef __INSIDE_MINIOS__ */
-
-/********************* common i386 and x86_64 ****************************/
-struct __synch_xchg_dummy { unsigned long a[100]; };
-#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
-
-#define synch_cmpxchg(ptr, old, new) \
-((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
- (unsigned long)(old), \
- (unsigned long)(new), \
- sizeof(*(ptr))))
-
-static inline unsigned long __synch_cmpxchg(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- __asm__ __volatile__("lock; cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__synch_xg(ptr)),
- "0"(old)
- : "memory");
- return prev;
- case 2:
- __asm__ __volatile__("lock; cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__synch_xg(ptr)),
- "0"(old)
- : "memory");
- return prev;
-#ifdef __x86_64__
- case 4:
- __asm__ __volatile__("lock; cmpxchgl %k1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__synch_xg(ptr)),
- "0"(old)
- : "memory");
- return prev;
- case 8:
- __asm__ __volatile__("lock; cmpxchgq %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__synch_xg(ptr)),
- "0"(old)
- : "memory");
- return prev;
-#else
- case 4:
- __asm__ __volatile__("lock; cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__synch_xg(ptr)),
- "0"(old)
- : "memory");
- return prev;
-#endif
- }
- return old;
-}
-
-
-static __inline__ void synch_set_bit(int nr, volatile void * addr)
-{
- __asm__ __volatile__ (
- "lock btsl %1,%0"
- : "=m" (ADDR) : "Ir" (nr) : "memory" );
-}
-
-static __inline__ void synch_clear_bit(int nr, volatile void * addr)
-{
- __asm__ __volatile__ (
- "lock btrl %1,%0"
- : "=m" (ADDR) : "Ir" (nr) : "memory" );
-}
-
-static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
-{
- int oldbit;
- __asm__ __volatile__ (
- "lock btsl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
- return oldbit;
-}
-
-static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
-{
- int oldbit;
- __asm__ __volatile__ (
- "lock btrl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
- return oldbit;
-}
-
-static __inline__ int synch_const_test_bit(int nr, const volatile void * addr)
-{
- return ((1UL << (nr & 31)) &
- (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
-}
-
-static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
-{
- int oldbit;
- __asm__ __volatile__ (
- "btl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) );
- return oldbit;
-}
-
-#define synch_test_bit(nr,addr) \
-(__builtin_constant_p(nr) ? \
- synch_const_test_bit((nr),(addr)) : \
- synch_var_test_bit((nr),(addr)))
-
-static inline int
-HYPERVISOR_xsm_op(
- struct xen_flask_op *op)
-{
- return _hypercall1(int, xsm_op, op);
-}
-
-#undef ADDR
-
-#endif /* not assembly */
-#endif /* _OS_H_ */
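
The subtlest part of the header above is the unmask-then-check ordering in __sti() and __restore_flags(): the upcall mask must be cleared before evtchn_upcall_pending is tested, with a compiler barrier in between, otherwise an event arriving in that window would sit undelivered until the next upcall. The pattern in isolation, as a standalone sketch (struct and helper names are stand-ins, not Mini-OS symbols):

    #include <stdatomic.h>
    #include <stdint.h>

    struct vcpu_sketch {
        volatile uint8_t evtchn_upcall_pending;
        volatile uint8_t evtchn_upcall_mask;
    };

    static void deliver_pending(struct vcpu_sketch *v)
    {
        (void)v;        /* stands in for force_evtchn_callback() */
    }

    static void sti_sketch(struct vcpu_sketch *v)
    {
        v->evtchn_upcall_mask = 0;                  /* unmask first ...         */
        atomic_signal_fence(memory_order_seq_cst);  /* barrier(): compiler-only */
        if (v->evtchn_upcall_pending)               /* ... then check           */
            deliver_pending(v);
    }
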
diff --git a/extras/mini-os/include/x86/traps.h b/extras/mini-os/include/x86/traps.h
deleted file mode 100644
index bfb6781..0000000
--- a/extras/mini-os/include/x86/traps.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- ****************************************************************************
- *
- * File: traps.h
- * Author: Grzegorz Milos (gm281 at cam.ac.uk)
- *
- * Date: Jun 2005
- *
- * Environment: Xen Minimal OS
- * Description: Deals with traps
- *
- ****************************************************************************
- */
-
-#ifndef _TRAPS_H_
-#define _TRAPS_H_
-
-#ifdef __i386__
-struct pt_regs {
- long ebx;
- long ecx;
- long edx;
- long esi;
- long edi;
- long ebp;
- long eax;
- int xds;
- int xes;
- long orig_eax;
- long eip;
- int xcs;
- long eflags;
- long esp;
- int xss;
-};
-#elif __x86_64__
-
-struct pt_regs {
- unsigned long r15;
- unsigned long r14;
- unsigned long r13;
- unsigned long r12;
- unsigned long rbp;
- unsigned long rbx;
-/* arguments: non-interrupt/non-tracing syscalls only save up to here */
- unsigned long r11;
- unsigned long r10;
- unsigned long r9;
- unsigned long r8;
- unsigned long rax;
- unsigned long rcx;
- unsigned long rdx;
- unsigned long rsi;
- unsigned long rdi;
- unsigned long orig_rax;
-/* end of arguments */
-/* cpu exception frame or undefined */
- unsigned long rip;
- unsigned long cs;
- unsigned long eflags;
- unsigned long rsp;
- unsigned long ss;
-/* top of stack page */
-};
-
-
-#endif
-
-void dump_regs(struct pt_regs *regs);
-void stack_walk(void);
-
-#define TRAP_PF_PROT 0x1
-#define TRAP_PF_WRITE 0x2
-#define TRAP_PF_USER 0x4
-
-#endif /* _TRAPS_H_ */
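
The TRAP_PF_* masks above decode the error code the CPU pushes on a page fault. A small sketch of how a fault handler can use them:

    #include <stdio.h>

    #define TRAP_PF_PROT  0x1   /* fault on a present mapping (protection) */
    #define TRAP_PF_WRITE 0x2   /* faulting access was a write */
    #define TRAP_PF_USER  0x4   /* fault taken in user mode */

    static void describe_page_fault(unsigned long error_code)
    {
        printf("%s access, %s, in %s mode\n",
               (error_code & TRAP_PF_WRITE) ? "write" : "read",
               (error_code & TRAP_PF_PROT)  ? "protection violation"
                                            : "not-present page",
               (error_code & TRAP_PF_USER)  ? "user" : "kernel");
    }
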
diff --git a/extras/mini-os/include/x86/x86_32/arch_wordsize.h b/extras/mini-os/include/x86/x86_32/arch_wordsize.h
deleted file mode 100644
index b47eee9..0000000
--- a/extras/mini-os/include/x86/x86_32/arch_wordsize.h
+++ /dev/null
@@ -1 +0,0 @@
-#define __WORDSIZE 32
diff --git a/extras/mini-os/include/x86/x86_32/hypercall-x86_32.h b/extras/mini-os/include/x86/x86_32/hypercall-x86_32.h
deleted file mode 100644
index 99a4ee3..0000000
--- a/extras/mini-os/include/x86/x86_32/hypercall-x86_32.h
+++ /dev/null
@@ -1,337 +0,0 @@
-/******************************************************************************
- * hypercall-x86_32.h
- *
- * Copied from XenLinux.
- *
- * Copyright (c) 2002-2004, K A Fraser
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef __HYPERCALL_X86_32_H__
-#define __HYPERCALL_X86_32_H__
-
-#include <xen/xen.h>
-#include <xen/sched.h>
-#include <xen/nmi.h>
-#include <mini-os/mm.h>
-
-typedef struct { unsigned long pte_low, pte_high; } pte_t;
-
-#define __pte(x) ({ unsigned long long _x = (x); \
- ((pte_t) {(unsigned long)(_x), (unsigned long)(_x>>32)}); })
-
-#define __STR(x) #x
-#define STR(x) __STR(x)
-
-extern char hypercall_page[PAGE_SIZE];
-
-#define _hypercall0(type, name) \
-({ \
- long __res; \
- asm volatile ( \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res) \
- : \
- : "memory" ); \
- (type)__res; \
-})
-
-#define _hypercall1(type, name, a1) \
-({ \
- long __res, __ign1; \
- asm volatile ( \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res), "=b" (__ign1) \
- : "1" ((long)(a1)) \
- : "memory" ); \
- (type)__res; \
-})
-
-#define _hypercall2(type, name, a1, a2) \
-({ \
- long __res, __ign1, __ign2; \
- asm volatile ( \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
- : "1" ((long)(a1)), "2" ((long)(a2)) \
- : "memory" ); \
- (type)__res; \
-})
-
-#define _hypercall3(type, name, a1, a2, a3) \
-({ \
- long __res, __ign1, __ign2, __ign3; \
- asm volatile ( \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
- "=d" (__ign3) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)) \
- : "memory" ); \
- (type)__res; \
-})
-
-#define _hypercall4(type, name, a1, a2, a3, a4) \
-({ \
- long __res, __ign1, __ign2, __ign3, __ign4; \
- asm volatile ( \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
- "=d" (__ign3), "=S" (__ign4) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)), "4" ((long)(a4)) \
- : "memory" ); \
- (type)__res; \
-})
-
-#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
-({ \
- long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \
- asm volatile ( \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
- "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)), "4" ((long)(a4)), \
- "5" ((long)(a5)) \
- : "memory" ); \
- (type)__res; \
-})
-
-static inline int
-HYPERVISOR_set_trap_table(
- trap_info_t *table)
-{
- return _hypercall1(int, set_trap_table, table);
-}
-
-static inline int
-HYPERVISOR_mmu_update(
- mmu_update_t *req, int count, int *success_count, domid_t domid)
-{
- return _hypercall4(int, mmu_update, req, count, success_count, domid);
-}
-
-static inline int
-HYPERVISOR_mmuext_op(
- struct mmuext_op *op, int count, int *success_count, domid_t domid)
-{
- return _hypercall4(int, mmuext_op, op, count, success_count, domid);
-}
-
-static inline int
-HYPERVISOR_set_gdt(
- unsigned long *frame_list, int entries)
-{
- return _hypercall2(int, set_gdt, frame_list, entries);
-}
-
-static inline int
-HYPERVISOR_stack_switch(
- unsigned long ss, unsigned long esp)
-{
- return _hypercall2(int, stack_switch, ss, esp);
-}
-
-static inline int
-HYPERVISOR_set_callbacks(
- unsigned long event_selector, unsigned long event_address,
- unsigned long failsafe_selector, unsigned long failsafe_address)
-{
- return _hypercall4(int, set_callbacks,
- event_selector, event_address,
- failsafe_selector, failsafe_address);
-}
-
-static inline int
-HYPERVISOR_fpu_taskswitch(
- int set)
-{
- return _hypercall1(int, fpu_taskswitch, set);
-}
-
-static inline int
-HYPERVISOR_sched_op(
- int cmd, void *arg)
-{
- return _hypercall2(int, sched_op, cmd, arg);
-}
-
-static inline int
-HYPERVISOR_shutdown(
- unsigned int reason)
-{
- struct sched_shutdown shutdown = { .reason = reason };
- return _hypercall2(int, sched_op, SCHEDOP_shutdown, &shutdown);
-}
-
-static inline long
-HYPERVISOR_set_timer_op(
- uint64_t timeout)
-{
- unsigned long timeout_hi = (unsigned long)(timeout>>32);
- unsigned long timeout_lo = (unsigned long)timeout;
- return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
-}
-
-static inline int
-HYPERVISOR_set_debugreg(
- int reg, unsigned long value)
-{
- return _hypercall2(int, set_debugreg, reg, value);
-}
-
-static inline unsigned long
-HYPERVISOR_get_debugreg(
- int reg)
-{
- return _hypercall1(unsigned long, get_debugreg, reg);
-}
-
-static inline int
-HYPERVISOR_update_descriptor(
- uint64_t ma, uint64_t desc)
-{
- return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
-}
-
-static inline int
-HYPERVISOR_memory_op(
- unsigned int cmd, void *arg)
-{
- return _hypercall2(int, memory_op, cmd, arg);
-}
-
-static inline int
-HYPERVISOR_multicall(
- void *call_list, int nr_calls)
-{
- return _hypercall2(int, multicall, call_list, nr_calls);
-}
-
-static inline int
-HYPERVISOR_update_va_mapping(
- unsigned long va, pte_t new_val, unsigned long flags)
-{
- return _hypercall4(int, update_va_mapping, va,
- new_val.pte_low, new_val.pte_high, flags);
-}
-
-static inline int
-HYPERVISOR_event_channel_op(
- int cmd, void *op)
-{
- return _hypercall2(int, event_channel_op, cmd, op);
-}
-
-static inline int
-HYPERVISOR_xen_version(
- int cmd, void *arg)
-{
- return _hypercall2(int, xen_version, cmd, arg);
-}
-
-static inline int
-HYPERVISOR_console_io(
- int cmd, int count, char *str)
-{
- return _hypercall3(int, console_io, cmd, count, str);
-}
-
-static inline int
-HYPERVISOR_physdev_op(
- int cmd, void *physdev_op)
-{
- return _hypercall2(int, physdev_op, cmd, physdev_op);
-}
-
-static inline int
-HYPERVISOR_grant_table_op(
- unsigned int cmd, void *uop, unsigned int count)
-{
- return _hypercall3(int, grant_table_op, cmd, uop, count);
-}
-
-static inline int
-HYPERVISOR_update_va_mapping_otherdomain(
- unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
-{
- return _hypercall5(int, update_va_mapping_otherdomain, va,
- new_val.pte_low, new_val.pte_high, flags, domid);
-}
-
-static inline int
-HYPERVISOR_vm_assist(
- unsigned int cmd, unsigned int type)
-{
- return _hypercall2(int, vm_assist, cmd, type);
-}
-
-static inline int
-HYPERVISOR_vcpu_op(
- int cmd, int vcpuid, void *extra_args)
-{
- return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
-}
-
-static inline int
-HYPERVISOR_suspend(
- unsigned long srec)
-{
- return _hypercall3(int, sched_op, SCHEDOP_shutdown,
- SHUTDOWN_suspend, srec);
-}
-
-static inline int
-HYPERVISOR_nmi_op(
- unsigned long op,
- unsigned long arg)
-{
- return _hypercall2(int, nmi_op, op, arg);
-}
-
-static inline int
-HYPERVISOR_sysctl(
- unsigned long op)
-{
- return _hypercall1(int, sysctl, op);
-}
-
-static inline int
-HYPERVISOR_domctl(
- unsigned long op)
-{
- return _hypercall1(int, domctl, op);
-}
-
-#endif /* __HYPERCALL_X86_32_H__ */
-
-/*
- * Local variables:
- * c-file-style: "linux"
- * indent-tabs-mode: t
- * c-indent-level: 8
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
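
Every wrapper above funnels into a _hypercallN macro whose call target is computed from the hypercall number: each hypercall owns a 32-byte stub in the shared hypercall page, arguments travel in ebx, ecx, edx, esi and edi, the result comes back in eax, and 64-bit values are split low word first (as set_timer_op and update_descriptor show). The address arithmetic as a sketch (names assumed):

    typedef long (*hypercall_stub_t)(void);

    /* Hypercall nr lives at a fixed 32-byte slot in the hypercall page. */
    static hypercall_stub_t stub_for(char *hypercall_page, unsigned int nr)
    {
        return (hypercall_stub_t)(void *)(hypercall_page + nr * 32);
    }
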
diff --git a/extras/mini-os/include/x86/x86_64/arch_wordsize.h b/extras/mini-os/include/x86/x86_64/arch_wordsize.h
deleted file mode 100644
index 3048136..0000000
--- a/extras/mini-os/include/x86/x86_64/arch_wordsize.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define __WORDSIZE 64
-#define __WORDSIZE_COMPAT32 1
diff --git a/extras/mini-os/include/x86/x86_64/hypercall-x86_64.h b/extras/mini-os/include/x86/x86_64/hypercall-x86_64.h
deleted file mode 100644
index e00b3bd..0000000
--- a/extras/mini-os/include/x86/x86_64/hypercall-x86_64.h
+++ /dev/null
@@ -1,344 +0,0 @@
-/******************************************************************************
- * hypercall-x86_64.h
- *
- * Copied from XenLinux.
- *
- * Copyright (c) 2002-2004, K A Fraser
- *
- * 64-bit updates:
- * Benjamin Liu <benjamin.liu at intel.com>
- * Jun Nakajima <jun.nakajima at intel.com>
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef __HYPERCALL_X86_64_H__
-#define __HYPERCALL_X86_64_H__
-
-#include <xen/xen.h>
-#include <xen/sched.h>
-#include <mini-os/mm.h>
-
-typedef struct { unsigned long pte; } pte_t;
-
-#define __pte(x) ((pte_t) { (x) } )
-
-#define __STR(x) #x
-#define STR(x) __STR(x)
-
-extern char hypercall_page[PAGE_SIZE];
-
-#define _hypercall0(type, name) \
-({ \
- long __res; \
- asm volatile ( \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res) \
- : \
- : "memory" ); \
- (type)__res; \
-})
-
-#define _hypercall1(type, name, a1) \
-({ \
- long __res, __ign1; \
- asm volatile ( \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res), "=D" (__ign1) \
- : "1" ((long)(a1)) \
- : "memory" ); \
- (type)__res; \
-})
-
-#define _hypercall2(type, name, a1, a2) \
-({ \
- long __res, __ign1, __ign2; \
- asm volatile ( \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \
- : "1" ((long)(a1)), "2" ((long)(a2)) \
- : "memory" ); \
- (type)__res; \
-})
-
-#define _hypercall3(type, name, a1, a2, a3) \
-({ \
- long __res, __ign1, __ign2, __ign3; \
- asm volatile ( \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
- "=d" (__ign3) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)) \
- : "memory" ); \
- (type)__res; \
-})
-
-#define _hypercall4(type, name, a1, a2, a3, a4) \
-({ \
- long __res, __ign1, __ign2, __ign3; \
- asm volatile ( \
- "movq %7,%%r10; " \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
- "=d" (__ign3) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)), "g" ((long)(a4)) \
- : "memory", "r10" ); \
- (type)__res; \
-})
-
-#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
-({ \
- long __res, __ign1, __ign2, __ign3; \
- asm volatile ( \
- "movq %7,%%r10; movq %8,%%r8; " \
- "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
- : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
- "=d" (__ign3) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)), "g" ((long)(a4)), \
- "g" ((long)(a5)) \
- : "memory", "r10", "r8" ); \
- (type)__res; \
-})
-
-static inline int
-HYPERVISOR_set_trap_table(
- trap_info_t *table)
-{
- return _hypercall1(int, set_trap_table, table);
-}
-
-static inline int
-HYPERVISOR_mmu_update(
- mmu_update_t *req, int count, int *success_count, domid_t domid)
-{
- return _hypercall4(int, mmu_update, req, count, success_count, domid);
-}
-
-static inline int
-HYPERVISOR_mmuext_op(
- struct mmuext_op *op, int count, int *success_count, domid_t domid)
-{
- return _hypercall4(int, mmuext_op, op, count, success_count, domid);
-}
-
-static inline int
-HYPERVISOR_set_gdt(
- unsigned long *frame_list, int entries)
-{
- return _hypercall2(int, set_gdt, frame_list, entries);
-}
-
-static inline int
-HYPERVISOR_stack_switch(
- unsigned long ss, unsigned long esp)
-{
- return _hypercall2(int, stack_switch, ss, esp);
-}
-
-static inline int
-HYPERVISOR_set_callbacks(
- unsigned long event_address, unsigned long failsafe_address,
- unsigned long syscall_address)
-{
- return _hypercall3(int, set_callbacks,
- event_address, failsafe_address, syscall_address);
-}
-
-static inline int
-HYPERVISOR_fpu_taskswitch(
- int set)
-{
- return _hypercall1(int, fpu_taskswitch, set);
-}
-
-static inline int
-HYPERVISOR_sched_op(
- int cmd, void *arg)
-{
- return _hypercall2(int, sched_op, cmd, arg);
-}
-
-static inline int
-HYPERVISOR_shutdown(
- unsigned int reason)
-{
- struct sched_shutdown shutdown = { .reason = reason };
- return _hypercall2(int, sched_op, SCHEDOP_shutdown, &shutdown);
-}
-
-static inline long
-HYPERVISOR_set_timer_op(
- uint64_t timeout)
-{
- return _hypercall1(long, set_timer_op, timeout);
-}
-
-static inline int
-HYPERVISOR_set_debugreg(
- int reg, unsigned long value)
-{
- return _hypercall2(int, set_debugreg, reg, value);
-}
-
-static inline unsigned long
-HYPERVISOR_get_debugreg(
- int reg)
-{
- return _hypercall1(unsigned long, get_debugreg, reg);
-}
-
-static inline int
-HYPERVISOR_update_descriptor(
- unsigned long ma, unsigned long word)
-{
- return _hypercall2(int, update_descriptor, ma, word);
-}
-
-static inline int
-HYPERVISOR_memory_op(
- unsigned int cmd, void *arg)
-{
- return _hypercall2(int, memory_op, cmd, arg);
-}
-
-static inline int
-HYPERVISOR_multicall(
- void *call_list, int nr_calls)
-{
- return _hypercall2(int, multicall, call_list, nr_calls);
-}
-
-static inline int
-HYPERVISOR_update_va_mapping(
- unsigned long va, pte_t new_val, unsigned long flags)
-{
- return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
-}
-
-static inline int
-HYPERVISOR_event_channel_op(
- int cmd, void *op)
-{
- return _hypercall2(int, event_channel_op, cmd, op);
-}
-
-static inline int
-HYPERVISOR_xen_version(
- int cmd, void *arg)
-{
- return _hypercall2(int, xen_version, cmd, arg);
-}
-
-static inline int
-HYPERVISOR_console_io(
- int cmd, int count, char *str)
-{
- return _hypercall3(int, console_io, cmd, count, str);
-}
-
-static inline int
-HYPERVISOR_physdev_op(
- int cmd, void *physdev_op)
-{
- return _hypercall2(int, physdev_op, cmd, physdev_op);
-}
-
-static inline int
-HYPERVISOR_grant_table_op(
- unsigned int cmd, void *uop, unsigned int count)
-{
- return _hypercall3(int, grant_table_op, cmd, uop, count);
-}
-
-static inline int
-HYPERVISOR_update_va_mapping_otherdomain(
- unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
-{
- return _hypercall4(int, update_va_mapping_otherdomain, va,
- new_val.pte, flags, domid);
-}
-
-static inline int
-HYPERVISOR_vm_assist(
- unsigned int cmd, unsigned int type)
-{
- return _hypercall2(int, vm_assist, cmd, type);
-}
-
-static inline int
-HYPERVISOR_vcpu_op(
- int cmd, int vcpuid, void *extra_args)
-{
- return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
-}
-
-static inline int
-HYPERVISOR_set_segment_base(
- int reg, unsigned long value)
-{
- return _hypercall2(int, set_segment_base, reg, value);
-}
-
-static inline int
-HYPERVISOR_suspend(
- unsigned long srec)
-{
- return _hypercall3(int, sched_op, SCHEDOP_shutdown,
- SHUTDOWN_suspend, srec);
-}
-
-static inline int
-HYPERVISOR_nmi_op(
- unsigned long op,
- unsigned long arg)
-{
- return _hypercall2(int, nmi_op, op, arg);
-}
-
-static inline int
-HYPERVISOR_sysctl(
- unsigned long op)
-{
- return _hypercall1(int, sysctl, op);
-}
-
-static inline int
-HYPERVISOR_domctl(
- unsigned long op)
-{
- return _hypercall1(int, domctl, op);
-}
-
-#endif /* __HYPERCALL_X86_64_H__ */
-
-/*
- * Local variables:
- * c-file-style: "linux"
- * indent-tabs-mode: t
- * c-indent-level: 8
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
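
The 64-bit variants differ mainly in the argument registers: rdi, rsi, rdx, then r10 and r8 in place of the rcx and r9 of the regular function ABI, which is why _hypercall4 and _hypercall5 need explicit movq instructions. A GCC inline-assembly sketch of a two-argument call under that convention (the stub address is assumed; real stubs may clobber more registers):

    static inline long hypercall2_sketch(void *stub, long a1, long a2)
    {
        long res;
        register long _a1 asm("rdi") = a1;
        register long _a2 asm("rsi") = a2;

        asm volatile("call *%[stub]"
                     : "=a" (res), "+r" (_a1), "+r" (_a2)
                     : [stub] "rm" (stub)
                     : "memory");
        return res;
    }
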
diff --git a/extras/mini-os/include/xenbus.h b/extras/mini-os/include/xenbus.h
deleted file mode 100644
index d3bb7af..0000000
--- a/extras/mini-os/include/xenbus.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef XENBUS_H__
-#define XENBUS_H__
-
-#include <xen/io/xenbus.h>
-
-typedef unsigned long xenbus_transaction_t;
-#define XBT_NIL ((xenbus_transaction_t)0)
-
-#ifdef CONFIG_XENBUS
-/* Initialize the XenBus system. */
-void init_xenbus(void);
-#else
-static inline void init_xenbus(void)
-{
-}
-#endif
-
-/* Read the value associated with a path. Returns a malloc'd error
- string on failure and sets *value to NULL. On success, *value is
- set to a malloc'd copy of the value. */
-char *xenbus_read(xenbus_transaction_t xbt, const char *path, char **value);
-
-/* Watch event queue */
-struct xenbus_event {
- /* Keep these two as this for xs.c */
- char *path;
- char *token;
- struct xenbus_event *next;
-};
-typedef struct xenbus_event *xenbus_event_queue;
-
-char *xenbus_watch_path_token(xenbus_transaction_t xbt, const char *path, const char *token, xenbus_event_queue *events);
-char *xenbus_unwatch_path_token(xenbus_transaction_t xbt, const char *path, const char *token);
-extern struct wait_queue_head xenbus_watch_queue;
-void xenbus_wait_for_watch(xenbus_event_queue *queue);
-char **xenbus_wait_for_watch_return(xenbus_event_queue *queue);
-char* xenbus_wait_for_value(const char *path, const char *value, xenbus_event_queue *queue);
-char *xenbus_wait_for_state_change(const char* path, XenbusState *state, xenbus_event_queue *queue);
-char *xenbus_switch_state(xenbus_transaction_t xbt, const char* path, XenbusState state);
-
-/* When no token is provided, use a global queue. */
-#define XENBUS_WATCH_PATH_TOKEN "xenbus_watch_path"
-extern xenbus_event_queue xenbus_events;
-#define xenbus_watch_path(xbt, path) xenbus_watch_path_token(xbt, path, XENBUS_WATCH_PATH_TOKEN, NULL)
-#define xenbus_unwatch_path(xbt, path) xenbus_unwatch_path_token(xbt, path, XENBUS_WATCH_PATH_TOKEN)
-
-
-/* Associates a value with a path. Returns a malloc'd error string on
- failure. */
-char *xenbus_write(xenbus_transaction_t xbt, const char *path, const char *value);
-
-struct write_req {
- const void *data;
- unsigned len;
-};
-
-/* Send a message to xenbus, in the same fashion as xb_write, and
-   block waiting for a reply. The reply is malloc'd and should be
-   freed by the caller. */
-struct xsd_sockmsg *
-xenbus_msg_reply(int type,
- xenbus_transaction_t trans,
- struct write_req *io,
- int nr_reqs);
-
-/* Removes the value associated with a path. Returns a malloc'd error
- string on failure. */
-char *xenbus_rm(xenbus_transaction_t xbt, const char *path);
-
-/* List the contents of a directory. Returns a malloc'd error string
- on failure and sets *contents to NULL. On success, *contents is
- set to a malloc'd array of pointers to malloc'd strings. The array
- is NULL terminated. May block. */
-char *xenbus_ls(xenbus_transaction_t xbt, const char *prefix, char ***contents);
-
-/* Reads permissions associated with a path. Returns a malloc'd error
- string on failure and sets *value to NULL. On success, *value is
- set to a malloc'd copy of the value. */
-char *xenbus_get_perms(xenbus_transaction_t xbt, const char *path, char **value);
-
-/* Sets the permissions associated with a path. Returns a malloc'd
- error string on failure. */
-char *xenbus_set_perms(xenbus_transaction_t xbt, const char *path, domid_t dom, char perm);
-
-/* Start a xenbus transaction. Returns the transaction in xbt on
- success or a malloc'd error string otherwise. */
-char *xenbus_transaction_start(xenbus_transaction_t *xbt);
-
-/* End a xenbus transaction. Returns a malloc'd error string if it
- fails. abort says whether the transaction should be aborted.
- Returns 1 in *retry iff the transaction should be retried. */
-char *xenbus_transaction_end(xenbus_transaction_t, int abort,
- int *retry);
-
-/* Read path and parse it as an integer. Returns -1 on error. */
-int xenbus_read_integer(const char *path);
-
-/* Read path and parse it as a 16-byte UUID. Returns 1 if
- * reading and parsing were successful, 0 if not. */
-int xenbus_read_uuid(const char* path, unsigned char uuid[16]);
-
-/* Combination of snprintf and xenbus_write(path/node). */
-char* xenbus_printf(xenbus_transaction_t xbt,
- const char* node, const char* path,
- const char* fmt, ...)
- __attribute__((__format__(printf, 4, 5)));
-
-/* Utility function to figure out our domain id */
-domid_t xenbus_get_self_id(void);
-
-#ifdef CONFIG_XENBUS
-/* Reset the XenBus system. */
-void fini_xenbus(void);
-#else
-static inline void fini_xenbus(void)
-{
-}
-#endif
-
-#endif /* XENBUS_H__ */
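
The error convention above (a malloc'd string, or NULL on success) combines with xenbus_transaction_start()/xenbus_transaction_end() into a standard retry loop: if the transaction raced with another writer, *retry comes back as 1 and the whole body must be replayed. A usage sketch against the API declared above (the store paths are made up):

    #include <stdlib.h>
    #include <mini-os/xenbus.h>

    static char *write_two_keys(const char *a, const char *b)
    {
        xenbus_transaction_t xbt;
        char *err;
        int retry;

        do {
            err = xenbus_transaction_start(&xbt);
            if (err)
                return err;

            err = xenbus_write(xbt, "device/demo/a", a);
            if (!err)
                err = xenbus_write(xbt, "device/demo/b", b);

            if (err) {                                    /* abort on failure */
                free(xenbus_transaction_end(xbt, 1, &retry));
                return err;
            }
            err = xenbus_transaction_end(xbt, 0, &retry);
            if (err)
                return err;
        } while (retry);

        return NULL;    /* both writes committed atomically */
    }
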
diff --git a/extras/mini-os/include/xmalloc.h b/extras/mini-os/include/xmalloc.h
deleted file mode 100644
index 11fb027..0000000
--- a/extras/mini-os/include/xmalloc.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef __XMALLOC_H__
-#define __XMALLOC_H__
-
-#ifdef HAVE_LIBC
-
-#include <stdlib.h>
-#include <malloc.h>
-/* Allocate space for typed object. */
-#define _xmalloc(size, align) memalign(align, size)
-#define xfree(ptr) free(ptr)
-
-#else
-
-#include <limits.h>
-
-#define DEFAULT_ALIGN (sizeof(unsigned long))
-
-extern void *malloc(size_t size);
-extern void *realloc(void *ptr, size_t size);
-extern void free(void *ptr);
-
-/* Free memory from any xmalloc*() call. */
-extern void xfree(const void *);
-
-/* Underlying functions */
-extern void *_xmalloc(size_t size, size_t align);
-
-#endif
-
-static inline void *_xmalloc_array(size_t size, size_t align, size_t num)
-{
- /* Check for overflow. */
- if (size && num > UINT_MAX / size)
- return NULL;
- return _xmalloc(size * num, align);
-}
-
-/* Allocate space for typed object. */
-#define xmalloc(_type) ((_type *)_xmalloc(sizeof(_type), __alignof__(_type)))
-
-/* Allocate space for array of typed objects. */
-#define xmalloc_array(_type, _num) ((_type *)_xmalloc_array(sizeof(_type), __alignof__(_type), _num))
-
-#endif /* __XMALLOC_H__ */
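
The point of _xmalloc_array() above is the overflow check: the size * num multiplication is only performed once it provably cannot wrap, so an oversized element count yields NULL rather than a too-small allocation. A usage sketch (the struct is illustrative):

    #include <stddef.h>

    struct ring_slot {
        unsigned int id;
        void        *data;
    };

    static struct ring_slot *alloc_ring(size_t n)
    {
        /* Returns NULL instead of wrapping if n * sizeof(struct ring_slot)
         * would overflow. */
        return xmalloc_array(struct ring_slot, n);
    }
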
diff --git a/extras/mini-os/kernel.c b/extras/mini-os/kernel.c
deleted file mode 100644
index 437e5b4..0000000
--- a/extras/mini-os/kernel.c
+++ /dev/null
@@ -1,198 +0,0 @@
-/******************************************************************************
- * kernel.c
- *
- * Assorted crap goes here, including the initial C entry point, jumped to
- * from head.S.
- *
- * Copyright (c) 2002-2003, K A Fraser & R Neugebauer
- * Copyright (c) 2005, Grzegorz Milos, Intel Research Cambridge
- * Copyright (c) 2006, Robert Kaiser, FH Wiesbaden
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <mini-os/os.h>
-#include <mini-os/kernel.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/mm.h>
-#include <mini-os/events.h>
-#include <mini-os/time.h>
-#include <mini-os/types.h>
-#include <mini-os/lib.h>
-#include <mini-os/sched.h>
-#include <mini-os/xenbus.h>
-#include <mini-os/gnttab.h>
-#include <mini-os/netfront.h>
-#include <mini-os/blkfront.h>
-#include <mini-os/fbfront.h>
-#include <mini-os/pcifront.h>
-#include <mini-os/xmalloc.h>
-#include <fcntl.h>
-#include <xen/features.h>
-#include <xen/version.h>
-
-uint8_t xen_features[XENFEAT_NR_SUBMAPS * 32];
-
-void setup_xen_features(void)
-{
- xen_feature_info_t fi;
- int i, j;
-
- for (i = 0; i < XENFEAT_NR_SUBMAPS; i++)
- {
- fi.submap_idx = i;
- if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
- break;
-
- for (j=0; j<32; j++)
- xen_features[i*32+j] = !!(fi.submap & 1<<j);
- }
-}
-
-#ifdef CONFIG_XENBUS
-/* This should be overridden by the application we are linked against. */
-__attribute__((weak)) void app_shutdown(unsigned reason)
-{
- struct sched_shutdown sched_shutdown = { .reason = reason };
- printk("Shutdown requested: %d\n", reason);
- HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
-}
-
-static void shutdown_thread(void *p)
-{
- const char *path = "control/shutdown";
- const char *token = path;
- xenbus_event_queue events = NULL;
- char *shutdown = NULL, *err;
- unsigned int shutdown_reason;
- xenbus_watch_path_token(XBT_NIL, path, token, &events);
- while ((err = xenbus_read(XBT_NIL, path, &shutdown)) != NULL || !strcmp(shutdown, ""))
- {
- free(err);
- free(shutdown);
- shutdown = NULL;
- xenbus_wait_for_watch(&events);
- }
- err = xenbus_unwatch_path_token(XBT_NIL, path, token);
- free(err);
- err = xenbus_write(XBT_NIL, path, "");
- free(err);
- printk("Shutting down (%s)\n", shutdown);
-
- if (!strcmp(shutdown, "poweroff"))
- shutdown_reason = SHUTDOWN_poweroff;
- else if (!strcmp(shutdown, "reboot"))
- shutdown_reason = SHUTDOWN_reboot;
- else
- /* Unknown */
- shutdown_reason = SHUTDOWN_crash;
- app_shutdown(shutdown_reason);
- free(shutdown);
-}
-#endif
-
-
-/* This should be overridden by the application we are linked against. */
-__attribute__((weak)) int app_main(start_info_t *si)
-{
- printk("kernel.c: dummy main: start_info=%p\n", si);
- return 0;
-}
-
-void start_kernel(void)
-{
- /* Set up events. */
- init_events();
-
- /* ENABLE EVENT DELIVERY. This is disabled at start of day. */
- local_irq_enable();
-
- setup_xen_features();
-
- /* Init memory management. */
- init_mm();
-
- /* Init time and timers. */
- init_time();
-
- /* Init the console driver. */
- init_console();
-
- /* Init grant tables */
- init_gnttab();
-
- /* Init scheduler. */
- init_sched();
-
- /* Init XenBus */
- init_xenbus();
-
-#ifdef CONFIG_XENBUS
- create_thread("shutdown", shutdown_thread, NULL);
-#endif
-
- /* Call (possibly overridden) app_main() */
- app_main(&start_info);
-
- /* Everything initialised, start idle thread */
- run_idle_thread();
-}
-
-void stop_kernel(void)
-{
- /* TODO: fs import */
-
- local_irq_disable();
-
- /* Reset grant tables */
- fini_gnttab();
-
- /* Reset XenBus */
- fini_xenbus();
-
- /* Reset timers */
- fini_time();
-
- /* Reset memory management. */
- fini_mm();
-
- /* Reset events. */
- fini_events();
-
- /* Reset arch details */
- arch_fini();
-}
-
-/*
- * do_exit: This is called whenever an IRET fails in entry.S.
- * This will generally be because an application has got itself into
- * a really bad state (probably a bad CS or SS). It must be killed.
- * Of course, minimal OS doesn't have applications :-)
- */
-
-void do_exit(void)
-{
- printk("Do_exit called!\n");
- arch_do_exit();
- for( ;; )
- {
- struct sched_shutdown sched_shutdown = { .reason = SHUTDOWN_crash };
- HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
- }
-}
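
setup_xen_features() above trades space for lookup speed: each 32-bit submap returned by XENVER_get_features is exploded into one byte per flag, so a later feature test is a single array index. The inner loop in isolation:

    #include <stdint.h>

    static void unpack_submap(uint32_t submap, uint8_t out[32])
    {
        for (int j = 0; j < 32; j++)
            out[j] = (submap >> j) & 1;    /* !!(fi.submap & 1<<j) above */
    }
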
diff --git a/extras/mini-os/lib/ctype.c b/extras/mini-os/lib/ctype.c
deleted file mode 100644
index 3f3bdb0..0000000
--- a/extras/mini-os/lib/ctype.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef HAVE_LIBC
-#include <ctype.h>
-
-unsigned char _ctype[] = {
-_C,_C,_C,_C,_C,_C,_C,_C, /* 0-7 */
-_C,_C|_S,_C|_S,_C|_S,_C|_S,_C|_S,_C,_C, /* 8-15 */
-_C,_C,_C,_C,_C,_C,_C,_C, /* 16-23 */
-_C,_C,_C,_C,_C,_C,_C,_C, /* 24-31 */
-_S|_SP,_P,_P,_P,_P,_P,_P,_P, /* 32-39 */
-_P,_P,_P,_P,_P,_P,_P,_P, /* 40-47 */
-_D,_D,_D,_D,_D,_D,_D,_D, /* 48-55 */
-_D,_D,_P,_P,_P,_P,_P,_P, /* 56-63 */
-_P,_U|_X,_U|_X,_U|_X,_U|_X,_U|_X,_U|_X,_U, /* 64-71 */
-_U,_U,_U,_U,_U,_U,_U,_U, /* 72-79 */
-_U,_U,_U,_U,_U,_U,_U,_U, /* 80-87 */
-_U,_U,_U,_P,_P,_P,_P,_P, /* 88-95 */
-_P,_L|_X,_L|_X,_L|_X,_L|_X,_L|_X,_L|_X,_L, /* 96-103 */
-_L,_L,_L,_L,_L,_L,_L,_L, /* 104-111 */
-_L,_L,_L,_L,_L,_L,_L,_L, /* 112-119 */
-_L,_L,_L,_P,_P,_P,_P,_C, /* 120-127 */
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 128-143 */
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 144-159 */
-_S|_SP,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P, /* 160-175 */
-_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P, /* 176-191 */
-_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U, /* 192-207 */
-_U,_U,_U,_U,_U,_U,_U,_P,_U,_U,_U,_U,_U,_U,_U,_L, /* 208-223 */
-_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L, /* 224-239 */
-_L,_L,_L,_L,_L,_L,_L,_P,_L,_L,_L,_L,_L,_L,_L,_L}; /* 240-255 */
-#endif
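
The table above backs the usual ctype macros: classification is one lookup plus a mask test. A consumer sketch, with the _D mask value assumed to match the Linux-derived ctype.h this table pairs with:

    #define _D 0x04    /* assumed digit mask, as in the Linux-derived ctype.h */

    extern unsigned char _ctype[];

    static int isdigit_sketch(unsigned char c)
    {
        return (_ctype[c] & _D) != 0;
    }
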
diff --git a/extras/mini-os/lib/math.c b/extras/mini-os/lib/math.c
deleted file mode 100644
index 561393e..0000000
--- a/extras/mini-os/lib/math.c
+++ /dev/null
@@ -1,426 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- ****************************************************************************
- *
- * File: math.c
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes:
- *
- * Date: Aug 2003
- *
- * Environment: Xen Minimal OS
- * Description: Library functions for 64-bit arithmetic and other
- * helpers from FreeBSD, files in sys/libkern/ (qdivrem.c, etc.)
- *
- * Copyright (c) 1992, 1993
- * The Regents of the University of California. All rights reserved.
- *
- * This software was developed by the Computer Systems Engineering group
- * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
- * contributed to Berkeley.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
-*/
-
-#include <mini-os/types.h>
-#include <mini-os/lib.h>
-#include <mini-os/time.h>
-
-/* XXX RN: Yuck hardcoded endianness :) */
-#define _QUAD_HIGHWORD 1
-#define _QUAD_LOWWORD 0
-
-/*
- * From
- * @(#)quad.h 8.1 (Berkeley) 6/4/93
- */
-
-/*
- * Depending on the desired operation, we view a `long long' (aka quad_t) in
- * one or more of the following formats.
- */
-union uu {
- quad_t q; /* as a (signed) quad */
- u_quad_t uq; /* as an unsigned quad */
- int32_t sl[2]; /* as two signed longs */
- uint32_t ul[2]; /* as two unsigned longs */
-};
-
-/*
- * Define high and low longwords.
- */
-#define H _QUAD_HIGHWORD
-#define L _QUAD_LOWWORD
-
-/*
- * Total number of bits in a quad_t and in the pieces that make it up.
- * These are used for shifting, and also below for halfword extraction
- * and assembly.
- */
-#ifndef HAVE_LIBC
-#define CHAR_BIT 8 /* number of bits in a char */
-#endif
-#define QUAD_BITS (sizeof(quad_t) * CHAR_BIT)
-#define LONG_BITS (sizeof(int32_t) * CHAR_BIT)
-#define HALF_BITS (sizeof(int32_t) * CHAR_BIT / 2)
-
-/*
- * Extract high and low shortwords from longword, and move low shortword of
- * longword to upper half of int32_t, i.e., produce the upper longword of
- * ((quad_t)(x) << (number_of_bits_in_long/2)). (`x' must actually be uint32_t.)
- *
- * These are used in the multiply code, to split a longword into upper
- * and lower halves, and to reassemble a product as a quad_t, shifted left
- * (sizeof(int32_t)*CHAR_BIT/2).
- */
-#define HHALF(x) ((x) >> HALF_BITS)
-#define LHALF(x) ((x) & ((1UL << HALF_BITS) - 1))
-#define LHUP(x) ((x) << HALF_BITS)
-
-
-/*
- * From
- * qdivrem.c
- */
-
-/*
- * Multiprecision divide. This algorithm is from Knuth vol. 2 (2nd ed),
- * section 4.3.1, pp. 257--259.
- */
-#define B (1UL << HALF_BITS) /* digit base */
-
-/* Combine two `digits' to make a single two-digit number. */
-#define COMBINE(a, b) (((uint32_t)(a) << HALF_BITS) | (b))
-
-/* select a type for digits in base B: */
-typedef uint16_t digit;
-
-/*
- * Shift p[0]..p[len] left `sh' bits, ignoring any bits that
- * `fall out' the left (there never will be any such anyway).
- * We may assume len >= 0. NOTE THAT THIS WRITES len+1 DIGITS.
- */
-static void
-shl(register digit *p, register int len, register int sh)
-{
- register int i;
-
- for (i = 0; i < len; i++)
- p[i] = LHALF(p[i] << sh) | (p[i + 1] >> (HALF_BITS - sh));
- p[i] = LHALF(p[i] << sh);
-}
-
-/*
- * __qdivrem(u, v, rem) returns u/v and, optionally, sets *rem to u%v.
- *
- * We do this in base 2-sup-HALF_BITS, so that all intermediate products
- * fit within uint32_t. As a consequence, the maximum length dividend and
- * divisor are 4 `digits' in this base (they are shorter if they have
- * leading zeros).
- */
-u_quad_t
-__qdivrem(u_quad_t uq, u_quad_t vq, u_quad_t *arq)
-{
- union uu tmp;
- digit *u, *v, *q;
- register digit v1, v2;
- uint32_t qhat, rhat, t;
- int m, n, d, j, i;
- digit uspace[5], vspace[5], qspace[5];
-
- /*
- * Take care of special cases: divide by zero, and u < v.
- */
- if (vq == 0) {
- /* divide by zero. */
- static volatile const unsigned int zero = 0;
-
- tmp.ul[H] = tmp.ul[L] = 1 / zero;
- if (arq)
- *arq = uq;
- return (tmp.q);
- }
- if (uq < vq) {
- if (arq)
- *arq = uq;
- return (0);
- }
- u = &uspace[0];
- v = &vspace[0];
- q = &qspace[0];
-
- /*
- * Break dividend and divisor into digits in base B, then
- * count leading zeros to determine m and n. When done, we
- * will have:
- * u = (u[1]u[2]...u[m+n]) sub B
- * v = (v[1]v[2]...v[n]) sub B
- * v[1] != 0
- * 1 < n <= 4 (if n = 1, we use a different division algorithm)
- * m >= 0 (otherwise u < v, which we already checked)
- * m + n = 4
- * and thus
- * m = 4 - n <= 2
- */
- tmp.uq = uq;
- u[0] = 0;
- u[1] = HHALF(tmp.ul[H]);
- u[2] = LHALF(tmp.ul[H]);
- u[3] = HHALF(tmp.ul[L]);
- u[4] = LHALF(tmp.ul[L]);
- tmp.uq = vq;
- v[1] = HHALF(tmp.ul[H]);
- v[2] = LHALF(tmp.ul[H]);
- v[3] = HHALF(tmp.ul[L]);
- v[4] = LHALF(tmp.ul[L]);
- for (n = 4; v[1] == 0; v++) {
- if (--n == 1) {
- uint32_t rbj; /* r*B+u[j] (not root boy jim) */
- digit q1, q2, q3, q4;
-
- /*
- * Change of plan, per exercise 16.
- * r = 0;
- * for j = 1..4:
- * q[j] = floor((r*B + u[j]) / v),
- * r = (r*B + u[j]) % v;
- * We unroll this completely here.
- */
- t = v[2]; /* nonzero, by definition */
- q1 = u[1] / t;
- rbj = COMBINE(u[1] % t, u[2]);
- q2 = rbj / t;
- rbj = COMBINE(rbj % t, u[3]);
- q3 = rbj / t;
- rbj = COMBINE(rbj % t, u[4]);
- q4 = rbj / t;
- if (arq)
- *arq = rbj % t;
- tmp.ul[H] = COMBINE(q1, q2);
- tmp.ul[L] = COMBINE(q3, q4);
- return (tmp.q);
- }
- }
-
- /*
- * By adjusting q once we determine m, we can guarantee that
- * there is a complete four-digit quotient at &qspace[1] when
- * we finally stop.
- */
- for (m = 4 - n; u[1] == 0; u++)
- m--;
- for (i = 4 - m; --i >= 0;)
- q[i] = 0;
- q += 4 - m;
-
- /*
- * Here we run Program D, translated from MIX to C with a few
- * minor changes.
- *
- * D1: choose multiplier 1 << d to ensure v[1] >= B/2.
- */
- d = 0;
- for (t = v[1]; t < B / 2; t <<= 1)
- d++;
- if (d > 0) {
- shl(&u[0], m + n, d); /* u <<= d */
- shl(&v[1], n - 1, d); /* v <<= d */
- }
- /*
- * D2: j = 0.
- */
- j = 0;
- v1 = v[1]; /* for D3 -- note that v[1..n] are constant */
- v2 = v[2]; /* for D3 */
- do {
- register digit uj0, uj1, uj2;
-
- /*
- * D3: Calculate qhat (\^q, in TeX notation).
- * Let qhat = min((u[j]*B + u[j+1])/v[1], B-1), and
- * let rhat = (u[j]*B + u[j+1]) mod v[1].
- * While rhat < B and v[2]*qhat > rhat*B+u[j+2],
- * decrement qhat and increase rhat correspondingly.
- * Note that if rhat >= B, v[2]*qhat < rhat*B.
- */
- uj0 = u[j + 0]; /* for D3 only -- note that u[j+...] change */
- uj1 = u[j + 1]; /* for D3 only */
- uj2 = u[j + 2]; /* for D3 only */
- if (uj0 == v1) {
- qhat = B;
- rhat = uj1;
- goto qhat_too_big;
- } else {
- uint32_t nn = COMBINE(uj0, uj1);
- qhat = nn / v1;
- rhat = nn % v1;
- }
- while (v2 * qhat > COMBINE(rhat, uj2)) {
- qhat_too_big:
- qhat--;
- if ((rhat += v1) >= B)
- break;
- }
- /*
- * D4: Multiply and subtract.
- * The variable `t' holds any borrows across the loop.
- * We split this up so that we do not require v[0] = 0,
- * and to eliminate a final special case.
- */
- for (t = 0, i = n; i > 0; i--) {
- t = u[i + j] - v[i] * qhat - t;
- u[i + j] = LHALF(t);
- t = (B - HHALF(t)) & (B - 1);
- }
- t = u[j] - t;
- u[j] = LHALF(t);
- /*
- * D5: test remainder.
- * There is a borrow if and only if HHALF(t) is nonzero;
- * in that (rare) case, qhat was too large (by exactly 1).
- * Fix it by adding v[1..n] to u[j..j+n].
- */
- if (HHALF(t)) {
- qhat--;
- for (t = 0, i = n; i > 0; i--) { /* D6: add back. */
- t += u[i + j] + v[i];
- u[i + j] = LHALF(t);
- t = HHALF(t);
- }
- u[j] = LHALF(u[j] + t);
- }
- q[j] = qhat;
- } while (++j <= m); /* D7: loop on j. */
-
- /*
- * If caller wants the remainder, we have to calculate it as
- * u[m..m+n] >> d (this is at most n digits and thus fits in
- * u[m+1..m+n], but we may need more source digits).
- */
- if (arq) {
- if (d) {
- for (i = m + n; i > m; --i)
- u[i] = (u[i] >> d) |
- LHALF(u[i - 1] << (HALF_BITS - d));
- u[i] = 0;
- }
- tmp.ul[H] = COMBINE(uspace[1], uspace[2]);
- tmp.ul[L] = COMBINE(uspace[3], uspace[4]);
- *arq = tmp.q;
- }
-
- tmp.ul[H] = COMBINE(qspace[1], qspace[2]);
- tmp.ul[L] = COMBINE(qspace[3], qspace[4]);
- return (tmp.q);
-}
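
For reference, a minimal sketch of the base-2^16 digit decomposition that __qdivrem performs above, assuming the usual quad.h conventions (HALF_BITS == 16, HHALF/LHALF extract the high/low half-digit, COMBINE joins two half-digits); illustrative only, not part of the deleted file:

    #include <stdint.h>

    /* Split a 64-bit value into the four base-2^16 digits u[1..4]
     * that Algorithm D operates on (u[1] is most significant). */
    static void split_digits(uint64_t x, uint16_t d[4])
    {
        d[0] = (uint16_t)(x >> 48);  /* u[1] */
        d[1] = (uint16_t)(x >> 32);  /* u[2] */
        d[2] = (uint16_t)(x >> 16);  /* u[3] */
        d[3] = (uint16_t)x;          /* u[4] */
    }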
-
-/*
- * From
- * divdi3.c
- */
-
-/*
- * Divide two signed quads.
- * XXX: If -1/2 should produce -1 on this machine, this code is wrong.
- */
-quad_t
-__divdi3(quad_t a, quad_t b)
-{
- u_quad_t ua, ub, uq;
- int neg;
-
- if (a < 0)
- ua = -(u_quad_t)a, neg = 1;
- else
- ua = a, neg = 0;
- if (b < 0)
- ub = -(u_quad_t)b, neg ^= 1;
- else
- ub = b;
- uq = __qdivrem(ua, ub, (u_quad_t *)0);
- return (neg ? -uq : uq);
-}
-
-/*
- * From
- * udivdi3.c
- */
-
-/*
- * Divide two unsigned quads.
- */
-u_quad_t
-__udivdi3(u_quad_t a, u_quad_t b)
-{
- return (__qdivrem(a, b, (u_quad_t *)0));
-}
-
-/*
- * From
- * umoddi3.c
- */
-
-/*
- * Return remainder after dividing two unsigned quads.
- */
-u_quad_t
-__umoddi3(u_quad_t a, u_quad_t b)
-{
- u_quad_t r;
-
- (void)__qdivrem(a, b, &r);
- return (r);
-}
-
-/*
- * From
- * moddi3.c
- */
-
-/*
- * Return remainder after dividing two signed quads.
- *
- * XXX
- * If -1/2 should produce -1 on this machine, this code is wrong.
- */
-quad_t
-__moddi3(quad_t a, quad_t b)
-{
- u_quad_t ua, ub, ur;
- int neg;
-
- if (a < 0)
- ua = -(u_quad_t)a, neg = 1;
- else
- ua = a, neg = 0;
- if (b < 0)
- ub = -(u_quad_t)b;
- else
- ub = b;
- (void)__qdivrem(ua, ub, &ur);
- return (neg ? -ur : ur);
-}
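
Together these helpers implement C's truncating division for 64-bit types: the quotient truncates toward zero and the remainder takes the dividend's sign, so a == q*b + r always holds. A minimal illustrative check, assuming the functions above are in scope:

    static void check_sign_conventions(void)
    {
        quad_t q = __divdi3(-7, 2);  /* -3: truncation toward zero */
        quad_t r = __moddi3(-7, 2);  /* -1: sign of the dividend   */
        /* Invariant: -7 == (-3) * 2 + (-1). */
        (void)q; (void)r;
    }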
diff --git a/extras/mini-os/lib/printf.c b/extras/mini-os/lib/printf.c
deleted file mode 100644
index 3d02e95..0000000
--- a/extras/mini-os/lib/printf.c
+++ /dev/null
@@ -1,786 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- ****************************************************************************
- *
- * File: printf.c
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes: Grzegorz Milos (gm281 at cam.ac.uk)
- *
- * Date: Aug 2003, Aug 2005
- *
- * Environment: Xen Minimal OS
- * Description: Library functions for printing
- * (Linux port, mainly lib/vsprintf.c)
- *
- ****************************************************************************
- */
-
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-/* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */
-/*
- * Wirzenius wrote this portably, Torvalds fucked it up :-)
- */
-
-/*
- * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel at datastacks.com>
- * - changed to provide snprintf and vsnprintf functions
- * Sun Feb 1 16:51:32 CET 2004 Juergen Quade <quade at hsnr.de>
- * - scnprintf and vscnprintf
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#if !defined HAVE_LIBC
-
-#include <mini-os/os.h>
-#include <mini-os/types.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/lib.h>
-#include <mini-os/mm.h>
-#include <mini-os/ctype.h>
-#include <mini-os/posix/limits.h>
-
-/**
- * simple_strtoul - convert a string to an unsigned long
- * @cp: The start of the string
- * @endp: A pointer to the end of the parsed string will be placed here
- * @base: The number base to use
- */
-unsigned long simple_strtoul(const char *cp,char **endp,unsigned int base)
-{
- unsigned long result = 0,value;
-
- if (!base) {
- base = 10;
- if (*cp == '0') {
- base = 8;
- cp++;
- if ((*cp == 'x') && isxdigit(cp[1])) {
- cp++;
- base = 16;
- }
- }
- }
- while (isxdigit(*cp) &&
- (value = isdigit(*cp) ? *cp-'0' : toupper(*cp)-'A'+10) < base) {
- result = result*base + value;
- cp++;
- }
- if (endp)
- *endp = (char *)cp;
- return result;
-}
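
A minimal usage sketch of the base auto-detection above (illustrative only; base == 0 picks octal, decimal, or hex from the prefix):

    static void strtoul_demo(void)
    {
        char *end;
        unsigned long a = simple_strtoul("0x1f", &end, 0); /* 31, hex     */
        unsigned long b = simple_strtoul("017",  &end, 0); /* 15, octal   */
        unsigned long c = simple_strtoul("42",   &end, 0); /* 42, decimal */
        (void)a; (void)b; (void)c;
    }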
-
-/**
- * simple_strtol - convert a string to a signed long
- * @cp: The start of the string
- * @endp: A pointer to the end of the parsed string will be placed here
- * @base: The number base to use
- */
-long simple_strtol(const char *cp,char **endp,unsigned int base)
-{
- if(*cp=='-')
- return -simple_strtoul(cp+1,endp,base);
- return simple_strtoul(cp,endp,base);
-}
-
-/**
- * simple_strtoull - convert a string to an unsigned long long
- * @cp: The start of the string
- * @endp: A pointer to the end of the parsed string will be placed here
- * @base: The number base to use
- */
-unsigned long long simple_strtoull(const char *cp,char **endp,unsigned int base)
-{
- unsigned long long result = 0,value;
-
- if (!base) {
- base = 10;
- if (*cp == '0') {
- base = 8;
- cp++;
- if ((*cp == 'x') && isxdigit(cp[1])) {
- cp++;
- base = 16;
- }
- }
- }
- while (isxdigit(*cp) && (value = isdigit(*cp) ? *cp-'0' : (islower(*cp)
- ? toupper(*cp) : *cp)-'A'+10) < base) {
- result = result*base + value;
- cp++;
- }
- if (endp)
- *endp = (char *)cp;
- return result;
-}
-
-/**
- * simple_strtoll - convert a string to a signed long long
- * @cp: The start of the string
- * @endp: A pointer to the end of the parsed string will be placed here
- * @base: The number base to use
- */
-long long simple_strtoll(const char *cp,char **endp,unsigned int base)
-{
- if(*cp=='-')
- return -simple_strtoull(cp+1,endp,base);
- return simple_strtoull(cp,endp,base);
-}
-
-static int skip_atoi(const char **s)
-{
- int i=0;
-
- while (isdigit(**s))
- i = i*10 + *((*s)++) - '0';
- return i;
-}
-
-#define ZEROPAD 1 /* pad with zero */
-#define SIGN 2 /* unsigned/signed long */
-#define PLUS 4 /* show plus */
-#define SPACE 8 /* space if plus */
-#define LEFT 16 /* left justified */
-#define SPECIAL 32 /* 0x */
-#define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */
-
-static char * number(char * buf, char * end, long long num, int base, int size, int precision, int type)
-{
- char c,sign,tmp[66];
- const char *digits;
- const char small_digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
- const char large_digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
- int i;
-
- digits = (type & LARGE) ? large_digits : small_digits;
- if (type & LEFT)
- type &= ~ZEROPAD;
- if (base < 2 || base > 36)
- return buf;
- c = (type & ZEROPAD) ? '0' : ' ';
- sign = 0;
- if (type & SIGN) {
- if (num < 0) {
- sign = '-';
- num = -num;
- size--;
- } else if (type & PLUS) {
- sign = '+';
- size--;
- } else if (type & SPACE) {
- sign = ' ';
- size--;
- }
- }
- if (type & SPECIAL) {
- if (base == 16)
- size -= 2;
- else if (base == 8)
- size--;
- }
- i = 0;
- if (num == 0)
- tmp[i++]='0';
- else
- {
- /* XXX KAF: force unsigned mod and div. */
- unsigned long long num2=(unsigned long long)num;
- unsigned int base2=(unsigned int)base;
- while (num2 != 0) { tmp[i++] = digits[num2%base2]; num2 /= base2; }
- }
- if (i > precision)
- precision = i;
- size -= precision;
- if (!(type&(ZEROPAD+LEFT))) {
- while(size-->0) {
- if (buf <= end)
- *buf = ' ';
- ++buf;
- }
- }
- if (sign) {
- if (buf <= end)
- *buf = sign;
- ++buf;
- }
- if (type & SPECIAL) {
- if (base==8) {
- if (buf <= end)
- *buf = '0';
- ++buf;
- } else if (base==16) {
- if (buf <= end)
- *buf = '0';
- ++buf;
- if (buf <= end)
- *buf = digits[33];
- ++buf;
- }
- }
- if (!(type & LEFT)) {
- while (size-- > 0) {
- if (buf <= end)
- *buf = c;
- ++buf;
- }
- }
- while (i < precision--) {
- if (buf <= end)
- *buf = '0';
- ++buf;
- }
- while (i-- > 0) {
- if (buf <= end)
- *buf = tmp[i];
- ++buf;
- }
- while (size-- > 0) {
- if (buf <= end)
- *buf = ' ';
- ++buf;
- }
- return buf;
-}
-
-/**
-* vsnprintf - Format a string and place it in a buffer
-* @buf: The buffer to place the result into
-* @size: The size of the buffer, including the trailing null space
-* @fmt: The format string to use
-* @args: Arguments for the format string
-*
-* Call this function if you are already dealing with a va_list.
-* You probably want snprintf instead.
- */
-int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
-{
- int len;
- unsigned long long num;
- int i, base;
- char *str, *end, c;
- const char *s;
-
- int flags; /* flags to number() */
-
- int field_width; /* width of output field */
- int precision; /* min. # of digits for integers; max
- number of chars from string */
- int qualifier; /* 'h', 'l', or 'L' for integer fields */
- /* 'z' support added 23/7/1999 S.H. */
- /* 'z' changed to 'Z' --davidm 1/25/99 */
-
- str = buf;
- end = buf + size - 1;
-
- if (end < buf - 1) {
- end = ((void *) -1);
- size = end - buf + 1;
- }
-
- for (; *fmt ; ++fmt) {
- if (*fmt != '%') {
- if (str <= end)
- *str = *fmt;
- ++str;
- continue;
- }
-
- /* process flags */
- flags = 0;
- repeat:
- ++fmt; /* this also skips first '%' */
- switch (*fmt) {
- case '-': flags |= LEFT; goto repeat;
- case '+': flags |= PLUS; goto repeat;
- case ' ': flags |= SPACE; goto repeat;
- case '#': flags |= SPECIAL; goto repeat;
- case '0': flags |= ZEROPAD; goto repeat;
- }
-
- /* get field width */
- field_width = -1;
- if (isdigit(*fmt))
- field_width = skip_atoi(&fmt);
- else if (*fmt == '*') {
- ++fmt;
- /* it's the next argument */
- field_width = va_arg(args, int);
- if (field_width < 0) {
- field_width = -field_width;
- flags |= LEFT;
- }
- }
-
- /* get the precision */
- precision = -1;
- if (*fmt == '.') {
- ++fmt;
- if (isdigit(*fmt))
- precision = skip_atoi(&fmt);
- else if (*fmt == '*') {
- ++fmt;
- /* it's the next argument */
- precision = va_arg(args, int);
- }
- if (precision < 0)
- precision = 0;
- }
-
- /* get the conversion qualifier */
- qualifier = -1;
- if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || *fmt =='Z') {
- qualifier = *fmt;
- ++fmt;
- if (qualifier == 'l' && *fmt == 'l') {
- qualifier = 'L';
- ++fmt;
- }
- }
- if (*fmt == 'q') {
- qualifier = 'L';
- ++fmt;
- }
-
- /* default base */
- base = 10;
-
- switch (*fmt) {
- case 'c':
- if (!(flags & LEFT)) {
- while (--field_width > 0) {
- if (str <= end)
- *str = ' ';
- ++str;
- }
- }
- c = (unsigned char) va_arg(args, int);
- if (str <= end)
- *str = c;
- ++str;
- while (--field_width > 0) {
- if (str <= end)
- *str = ' ';
- ++str;
- }
- continue;
-
- case 's':
- s = va_arg(args, char *);
- if (!s)
- s = "<NULL>";
-
- len = strnlen(s, precision);
-
- if (!(flags & LEFT)) {
- while (len < field_width--) {
- if (str <= end)
- *str = ' ';
- ++str;
- }
- }
- for (i = 0; i < len; ++i) {
- if (str <= end)
- *str = *s;
- ++str; ++s;
- }
- while (len < field_width--) {
- if (str <= end)
- *str = ' ';
- ++str;
- }
- continue;
-
- case 'p':
- if (field_width == -1) {
- field_width = 2*sizeof(void *);
- flags |= ZEROPAD;
- }
- str = number(str, end,
- (unsigned long) va_arg(args, void *),
- 16, field_width, precision, flags);
- continue;
-
-
- case 'n':
- if (qualifier == 'l') {
- long * ip = va_arg(args, long *);
- *ip = (str - buf);
- } else if (qualifier == 'Z') {
- size_t * ip = va_arg(args, size_t *);
- *ip = (str - buf);
- } else {
- int * ip = va_arg(args, int *);
- *ip = (str - buf);
- }
- continue;
-
- case '%':
- if (str <= end)
- *str = '%';
- ++str;
- continue;
-
- /* integer number formats - set up the flags and "break" */
- case 'o':
- base = 8;
- break;
-
- case 'X':
- flags |= LARGE;
- case 'x':
- base = 16;
- break;
-
- case 'd':
- case 'i':
- flags |= SIGN;
- case 'u':
- break;
-
- default:
- if (str <= end)
- *str = '%';
- ++str;
- if (*fmt) {
- if (str <= end)
- *str = *fmt;
- ++str;
- } else {
- --fmt;
- }
- continue;
- }
- if (qualifier == 'L')
- num = va_arg(args, long long);
- else if (qualifier == 'l') {
- num = va_arg(args, unsigned long);
- if (flags & SIGN)
- num = (signed long) num;
- } else if (qualifier == 'Z') {
- num = va_arg(args, size_t);
- } else if (qualifier == 'h') {
- num = (unsigned short) va_arg(args, int);
- if (flags & SIGN)
- num = (signed short) num;
- } else {
- num = va_arg(args, unsigned int);
- if (flags & SIGN)
- num = (signed int) num;
- }
-
- str = number(str, end, num, base,
- field_width, precision, flags);
- }
- if (str <= end)
- *str = '\0';
- else if (size > 0)
- /* don't write out a null byte if the buf size is zero */
- *end = '\0';
- /* the trailing null byte doesn't count towards the total
- * ++str;
- */
- return str-buf;
-}
-
-/**
- * snprintf - Format a string and place it in a buffer
- * @buf: The buffer to place the result into
- * @size: The size of the buffer, including the trailing null space
- * @fmt: The format string to use
- * @...: Arguments for the format string
- */
-int snprintf(char * buf, size_t size, const char *fmt, ...)
-{
- va_list args;
- int i;
-
- va_start(args, fmt);
- i=vsnprintf(buf,size,fmt,args);
- va_end(args);
- return i;
-}
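
Note the C99-style return value above: the length the output would have had without truncation, not the number of bytes actually stored. A minimal illustrative sketch:

    static void snprintf_demo(void)
    {
        char buf[8];
        int n = snprintf(buf, sizeof(buf), "value=%d", 12345);
        /* buf holds "value=1" (7 chars plus NUL); n == 11, the full
         * untruncated length, so n >= sizeof(buf) signals truncation. */
        (void)n;
    }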
-
-/**
- * vsprintf - Format a string and place it in a buffer
- * @buf: The buffer to place the result into
- * @fmt: The format string to use
- * @args: Arguments for the format string
- *
- * Call this function if you are already dealing with a va_list.
- * You probably want sprintf instead.
- */
-int vsprintf(char *buf, const char *fmt, va_list args)
-{
- return vsnprintf(buf, 0xFFFFFFFFUL, fmt, args);
-}
-
-
-/**
- * sprintf - Format a string and place it in a buffer
- * @buf: The buffer to place the result into
- * @fmt: The format string to use
- * @...: Arguments for the format string
- */
-int sprintf(char * buf, const char *fmt, ...)
-{
- va_list args;
- int i;
-
- va_start(args, fmt);
- i=vsprintf(buf,fmt,args);
- va_end(args);
- return i;
-}
-
-/**
- * vsscanf - Unformat a buffer into a list of arguments
- * @buf: input buffer
- * @fmt: format of buffer
- * @args: arguments
- */
-int vsscanf(const char * buf, const char * fmt, va_list args)
-{
- const char *str = buf;
- char *next;
- char digit;
- int num = 0;
- int qualifier;
- int base;
- int field_width;
- int is_sign = 0;
-
- while(*fmt && *str) {
- /* skip any white space in format */
- /* white space in format matches any amount of
- * white space, including none, in the input.
- */
- if (isspace(*fmt)) {
- while (isspace(*fmt))
- ++fmt;
- while (isspace(*str))
- ++str;
- }
-
- /* anything that is not a conversion must match exactly */
- if (*fmt != '%' && *fmt) {
- if (*fmt++ != *str++)
- break;
- continue;
- }
-
- if (!*fmt)
- break;
- ++fmt;
-
- /* skip this conversion.
- * advance both strings to next white space
- */
- if (*fmt == '*') {
- while (!isspace(*fmt) && *fmt)
- fmt++;
- while (!isspace(*str) && *str)
- str++;
- continue;
- }
-
- /* get field width */
- field_width = -1;
- if (isdigit(*fmt))
- field_width = skip_atoi(&fmt);
-
- /* get conversion qualifier */
- qualifier = -1;
- if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' ||
- *fmt == 'Z' || *fmt == 'z') {
- qualifier = *fmt++;
- if (unlikely(qualifier == *fmt)) {
- if (qualifier == 'h') {
- qualifier = 'H';
- fmt++;
- } else if (qualifier == 'l') {
- qualifier = 'L';
- fmt++;
- }
- }
- }
- base = 10;
- is_sign = 0;
-
- if (!*fmt || !*str)
- break;
-
- switch(*fmt++) {
- case 'c':
- {
- char *s = (char *) va_arg(args,char*);
- if (field_width == -1)
- field_width = 1;
- do {
- *s++ = *str++;
- } while (--field_width > 0 && *str);
- num++;
- }
- continue;
- case 's':
- {
- char *s = (char *) va_arg(args, char *);
- if(field_width == -1)
- field_width = INT_MAX;
- /* first, skip leading white space in buffer */
- while (isspace(*str))
- str++;
-
- /* now copy until next white space */
- while (*str && !isspace(*str) && field_width--) {
- *s++ = *str++;
- }
- *s = '\0';
- num++;
- }
- continue;
- case 'n':
- /* return number of characters read so far */
- {
- int *i = (int *)va_arg(args,int*);
- *i = str - buf;
- }
- continue;
- case 'o':
- base = 8;
- break;
- case 'x':
- case 'X':
- base = 16;
- break;
- case 'i':
- base = 0;
- case 'd':
- is_sign = 1;
- case 'u':
- break;
- case '%':
- /* looking for '%' in str */
- if (*str++ != '%')
- return num;
- continue;
- default:
- /* invalid format; stop here */
- return num;
- }
-
- /* have some sort of integer conversion.
- * first, skip white space in buffer.
- */
- while (isspace(*str))
- str++;
-
- digit = *str;
- if (is_sign && digit == '-')
- digit = *(str + 1);
-
- if (!digit
- || (base == 16 && !isxdigit(digit))
- || (base == 10 && !isdigit(digit))
- || (base == 8 && (!isdigit(digit) || digit > '7'))
- || (base == 0 && !isdigit(digit)))
- break;
-
- switch(qualifier) {
- case 'H': /* that's 'hh' in format */
- if (is_sign) {
- signed char *s = (signed char *) va_arg(args,signed char *);
- *s = (signed char) simple_strtol(str,&next,base);
- } else {
- unsigned char *s = (unsigned char *) va_arg(args, unsigned char *);
- *s = (unsigned char) simple_strtoul(str, &next, base);
- }
- break;
- case 'h':
- if (is_sign) {
- short *s = (short *) va_arg(args,short *);
- *s = (short) simple_strtol(str,&next,base);
- } else {
- unsigned short *s = (unsigned short *) va_arg(args, unsigned short *);
- *s = (unsigned short) simple_strtoul(str, &next, base);
- }
- break;
- case 'l':
- if (is_sign) {
- long *l = (long *) va_arg(args,long *);
- *l = simple_strtol(str,&next,base);
- } else {
- unsigned long *l = (unsigned long*) va_arg(args,unsigned long*);
- *l = simple_strtoul(str,&next,base);
- }
- break;
- case 'L':
- if (is_sign) {
- long long *l = (long long*) va_arg(args,long long *);
- *l = simple_strtoll(str,&next,base);
- } else {
- unsigned long long *l = (unsigned long long*) va_arg(args,unsigned long long*);
- *l = simple_strtoull(str,&next,base);
- }
- break;
- case 'Z':
- case 'z':
- {
- size_t *s = (size_t*) va_arg(args,size_t*);
- *s = (size_t) simple_strtoul(str,&next,base);
- }
- break;
- default:
- if (is_sign) {
- int *i = (int *) va_arg(args, int*);
- *i = (int) simple_strtol(str,&next,base);
- } else {
- unsigned int *i = (unsigned int*) va_arg(args, unsigned int*);
- *i = (unsigned int) simple_strtoul(str,&next,base);
- }
- break;
- }
- num++;
-
- if (!next)
- break;
- str = next;
- }
- return num;
-}
-
-/**
- * sscanf - Unformat a buffer into a list of arguments
- * @buf: input buffer
- * @fmt: formatting of buffer
- * @...: resulting arguments
- */
-int sscanf(const char * buf, const char * fmt, ...)
-{
- va_list args;
- int i;
-
- va_start(args,fmt);
- i = vsscanf(buf,fmt,args);
- va_end(args);
- return i;
-}
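
A minimal usage sketch of the scanner above (illustrative only; note that, with an explicit base, the simple_strtoul above does not strip a "0x" prefix, so %x expects bare hex digits):

    static void sscanf_demo(void)
    {
        int x, y;
        int n = sscanf("10 1f", "%d %x", &x, &y);
        /* n == 2, x == 10, y == 31 */
        (void)n;
    }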
-
-#endif
diff --git a/extras/mini-os/lib/stack_chk_fail.c b/extras/mini-os/lib/stack_chk_fail.c
deleted file mode 100644
index 05aea20..0000000
--- a/extras/mini-os/lib/stack_chk_fail.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <mini-os/kernel.h>
-#include <mini-os/console.h>
-
-void __stack_chk_fail(void)
-{
- printk("stack smashing detected\n");
- do_exit();
-}
diff --git a/extras/mini-os/lib/string.c b/extras/mini-os/lib/string.c
deleted file mode 100644
index 8b24146..0000000
--- a/extras/mini-os/lib/string.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- ****************************************************************************
- *
- * File: string.c
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes:
- *
- * Date: Aug 2003
- *
- * Environment: Xen Minimal OS
- * Description: Library functions for string and memory manipulation
- * Origin unknown
- *
- ****************************************************************************
- * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
- ****************************************************************************
- */
-
-#include <strings.h>
-
-/* newlib defines ffs but not ffsll or ffsl */
-int __ffsti2 (long long int lli)
-{
- int i, num, t, tmpint, len;
-
- num = sizeof(long long int) / sizeof(int);
- if (num == 1) return (ffs((int) lli));
- len = sizeof(int) * 8;
-
- for (i = 0; i < num; i++) {
- tmpint = (int) (((lli >> len) << len) ^ lli);
-
- t = ffs(tmpint);
- if (t)
- return (t + i * len);
- lli = lli >> len;
- }
- return 0;
-}
-
-int __ffsdi2 (long int li)
-{
- return __ffsti2 ((long long int) li);
-}
-
-int ffsl (long int li)
-{
- return __ffsti2 ((long long int) li);
-}
-
-int ffsll (long long int lli)
-{
- return __ffsti2 (lli);
-}
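
The ffs family uses 1-based bit positions, with 0 meaning no bit set. A minimal illustrative sketch:

    static void ffs_demo(void)
    {
        int a = ffs(0);             /* 0: no bit set              */
        int b = ffs(8);             /* 4: lowest set bit is bit 3 */
        int c = ffsll(1ULL << 40);  /* 41                         */
        (void)a; (void)b; (void)c;
    }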
-
-#if !defined HAVE_LIBC
-
-#include <mini-os/os.h>
-#include <mini-os/types.h>
-#include <mini-os/lib.h>
-#include <mini-os/xmalloc.h>
-
-int memcmp(const void * cs,const void * ct,size_t count)
-{
- const unsigned char *su1, *su2;
- signed char res = 0;
-
- for( su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--)
- if ((res = *su1 - *su2) != 0)
- break;
- return res;
-}
-
-void * memcpy(void * dest,const void *src,size_t count)
-{
- char *tmp = (char *) dest;
- const char *s = src;
-
- while (count--)
- *tmp++ = *s++;
-
- return dest;
-}
-
-int strncmp(const char * cs,const char * ct,size_t count)
-{
- register signed char __res = 0;
-
- while (count) {
- if ((__res = *cs - *ct++) != 0 || !*cs++)
- break;
- count--;
- }
-
- return __res;
-}
-
-int strcmp(const char * cs,const char * ct)
-{
- register signed char __res;
-
- while (1) {
- if ((__res = *cs - *ct++) != 0 || !*cs++)
- break;
- }
-
- return __res;
-}
-
-char * strcpy(char * dest,const char *src)
-{
- char *tmp = dest;
-
- while ((*dest++ = *src++) != '\0')
- /* nothing */;
- return tmp;
-}
-
-char * strncpy(char * dest,const char *src,size_t count)
-{
- char *tmp = dest;
-
- while (count-- && (*dest++ = *src++) != '\0')
- /* nothing */;
-
- return tmp;
-}
-
-void * memset(void * s,int c,size_t count)
-{
- char *xs = (char *) s;
-
- while (count--)
- *xs++ = c;
-
- return s;
-}
-
-size_t strnlen(const char * s, size_t count)
-{
- const char *sc;
-
- for (sc = s; count-- && *sc != '\0'; ++sc)
- /* nothing */;
- return sc - s;
-}
-
-
-char * strcat(char * dest, const char * src)
-{
- char *tmp = dest;
-
- while (*dest)
- dest++;
-
- while ((*dest++ = *src++) != '\0');
-
- return tmp;
-}
-
-size_t strlen(const char * s)
-{
- const char *sc;
-
- for (sc = s; *sc != '\0'; ++sc)
- /* nothing */;
- return sc - s;
-}
-
-char * strchr(const char * s, int c)
-{
- for(; *s != (char) c; ++s)
- if (*s == '\0')
- return NULL;
- return (char *)s;
-}
-
-char * strrchr(const char * s, int c)
-{
- const char *res = NULL;
- for(; *s != '\0'; ++s)
- if (*s == (char) c)
- res = s;
- return (char *)res;
-}
-
-char * strstr(const char * s1,const char * s2)
-{
- int l1, l2;
-
- l2 = strlen(s2);
- if (!l2)
- return (char *) s1;
- l1 = strlen(s1);
- while (l1 >= l2) {
- l1--;
- if (!memcmp(s1,s2,l2))
- return (char *) s1;
- s1++;
- }
- return NULL;
-}
-
-char *strdup(const char *x)
-{
- int l = strlen(x);
- char *res = malloc(l + 1);
- if (!res) return NULL;
- memcpy(res, x, l + 1);
- return res;
-}
-
-int ffs(int i)
-{
- int c = 1;
-
- do {
- if (i & 1)
- return (c);
- i = i >> 1;
- c++;
- } while (i);
- return 0;
-}
-
-#endif
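
A minimal sketch of two of the helpers above (illustrative only):

    static void string_demo(void)
    {
        size_t n = strnlen("abcdef", 4);        /* 4: scan stops at the bound */
        char *p = strstr("xenstore", "store");  /* points at "store"          */
        (void)n; (void)p;
    }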
diff --git a/extras/mini-os/lib/sys.c b/extras/mini-os/lib/sys.c
deleted file mode 100644
index 13e7e59..0000000
--- a/extras/mini-os/lib/sys.c
+++ /dev/null
@@ -1,1550 +0,0 @@
-/*
- * POSIX-compatible libc layer
- *
- * Samuel Thibault <Samuel.Thibault at eu.citrix.net>, October 2007
- *
- * Provides the UNIXish part of the standard libc functions.
- *
- * Relatively straightforward: just multiplex the file descriptor operations
- * among the various file types (console, FS, network, ...)
- */
-
-//#define LIBC_VERBOSE
-//#define LIBC_DEBUG
-
-#ifdef LIBC_DEBUG
-#define DEBUG(fmt,...) printk(fmt, ##__VA_ARGS__)
-#else
-#define DEBUG(fmt,...)
-#endif
-
-#ifdef HAVE_LIBC
-#include <os.h>
-#include <console.h>
-#include <sched.h>
-#include <events.h>
-#include <wait.h>
-#include <netfront.h>
-#include <blkfront.h>
-#include <fbfront.h>
-#include <tpmfront.h>
-#include <tpm_tis.h>
-#include <xenbus.h>
-#include <xenstore.h>
-#include <poll.h>
-
-#include <sys/types.h>
-#include <sys/unistd.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <net/if.h>
-#include <time.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <pthread.h>
-#include <assert.h>
-#include <dirent.h>
-#include <stdlib.h>
-#include <math.h>
-
-#ifdef HAVE_LWIP
-#include <lwip/sockets.h>
-#endif
-
-#define debug(fmt, ...) \
-
-#define print_unsupported(fmt, ...) \
- printk("Unsupported function "fmt" called in Mini-OS kernel\n", ## __VA_ARGS__);
-
-/* Crash on function call */
-#define unsupported_function_crash(function) \
- int __unsup_##function(void) asm(#function); \
- int __unsup_##function(void) \
- { \
- print_unsupported(#function); \
- do_exit(); \
- }
-
-/* Log and err out on function call */
-#define unsupported_function_log(type, function, ret) \
- type __unsup_##function(void) asm(#function); \
- type __unsup_##function(void) \
- { \
- print_unsupported(#function); \
- errno = ENOSYS; \
- return ret; \
- }
-
-/* Err out on function call */
-#define unsupported_function(type, function, ret) \
- type __unsup_##function(void) asm(#function); \
- type __unsup_##function(void) \
- { \
- errno = ENOSYS; \
- return ret; \
- }
-
-#define NOFILE 32
-extern void minios_interface_close_fd(int fd);
-extern void minios_evtchn_close_fd(int fd);
-extern void minios_gnttab_close_fd(int fd);
-
-pthread_mutex_t fd_lock = PTHREAD_MUTEX_INITIALIZER;
-struct file files[NOFILE] = {
- { .type = FTYPE_CONSOLE }, /* stdin */
- { .type = FTYPE_CONSOLE }, /* stdout */
- { .type = FTYPE_CONSOLE }, /* stderr */
-};
-
-DECLARE_WAIT_QUEUE_HEAD(event_queue);
-
-int alloc_fd(enum fd_type type)
-{
- int i;
- pthread_mutex_lock(&fd_lock);
- for (i=0; i<NOFILE; i++) {
- if (files[i].type == FTYPE_NONE) {
- files[i].type = type;
- pthread_mutex_unlock(&fd_lock);
- return i;
- }
- }
- pthread_mutex_unlock(&fd_lock);
- printk("Too many opened files\n");
- do_exit();
-}
-
-void close_all_files(void)
-{
- int i;
- pthread_mutex_lock(&fd_lock);
- for (i=NOFILE - 1; i > 0; i--)
- if (files[i].type != FTYPE_NONE)
- close(i);
- pthread_mutex_unlock(&fd_lock);
-}
-
-int dup2(int oldfd, int newfd)
-{
- pthread_mutex_lock(&fd_lock);
- if (files[newfd].type != FTYPE_NONE)
- close(newfd);
- // XXX: this is a bit bogus, as we are supposed to share the offset etc
- files[newfd] = files[oldfd];
- pthread_mutex_unlock(&fd_lock);
- return 0;
-}
-
-pid_t getpid(void)
-{
- return 1;
-}
-
-pid_t getppid(void)
-{
- return 1;
-}
-
-pid_t setsid(void)
-{
- return 1;
-}
-
-char *getcwd(char *buf, size_t size)
-{
- snprintf(buf, size, "/");
- return buf;
-}
-
-#define LOG_PATH "/var/log/"
-#define SAVE_PATH "/var/lib/xen"
-#define SAVE_CONSOLE 1
-#define RESTORE_CONSOLE 2
-
-int mkdir(const char *pathname, mode_t mode)
-{
- errno = EIO;
- return -1;
-}
-
-#ifdef CONFIG_CONSFRONT
-int posix_openpt(int flags)
-{
- struct consfront_dev *dev;
-
- /* Ignore flags */
-
- dev = init_consfront(NULL);
- dev->fd = alloc_fd(FTYPE_CONSOLE);
- files[dev->fd].cons.dev = dev;
-
- printk("fd(%d) = posix_openpt\n", dev->fd);
- return(dev->fd);
-}
-
-int open_savefile(const char *path, int save)
-{
- struct consfront_dev *dev;
- char nodename[64];
-
- snprintf(nodename, sizeof(nodename), "device/console/%d", save ? SAVE_CONSOLE : RESTORE_CONSOLE);
-
- dev = init_consfront(nodename);
- dev->fd = alloc_fd(FTYPE_SAVEFILE);
- files[dev->fd].cons.dev = dev;
-
- printk("fd(%d) = open_savefile\n", dev->fd);
- return(dev->fd);
-}
-#else
-int posix_openpt(int flags)
-{
- errno = EIO;
- return -1;
-}
-int open_savefile(const char *path, int save)
-{
- errno = EIO;
- return -1;
-}
-#endif
-
-int open(const char *pathname, int flags, ...)
-{
- int fd;
- /* Ugly, but fine. */
- if (!strncmp(pathname,LOG_PATH,strlen(LOG_PATH))) {
- fd = alloc_fd(FTYPE_CONSOLE);
- printk("open(%s) -> %d\n", pathname, fd);
- return fd;
- }
- if (!strncmp(pathname, "/dev/mem", strlen("/dev/mem"))) {
- fd = alloc_fd(FTYPE_MEM);
- printk("open(/dev/mem) -> %d\n", fd);
- return fd;
- }
- if (!strncmp(pathname, "/dev/ptmx", strlen("/dev/ptmx")))
- return posix_openpt(flags);
- if (!strncmp(pathname,SAVE_PATH,strlen(SAVE_PATH)))
- return open_savefile(pathname, flags & O_WRONLY);
- errno = EIO;
- return -1;
-}
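
open() dispatches purely on the path prefix; a minimal sketch of the resulting behaviour (illustrative only, the paths are made up):

    static void open_demo(void)
    {
        /* Matches LOG_PATH, so a console-backed fd is returned. */
        int fd = open("/var/log/guest.log", O_WRONLY);
        /* Any unrecognised path fails with EIO. */
        int bad = open("/etc/motd", O_RDONLY);  /* -1, errno == EIO */
        (void)fd; (void)bad;
    }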
-
-int isatty(int fd)
-{
- return files[fd].type == FTYPE_CONSOLE;
-}
-
-int read(int fd, void *buf, size_t nbytes)
-{
- switch (files[fd].type) {
- case FTYPE_SAVEFILE:
- case FTYPE_CONSOLE: {
- int ret;
- DEFINE_WAIT(w);
- while(1) {
- add_waiter(w, console_queue);
- ret = xencons_ring_recv(files[fd].cons.dev, buf, nbytes);
- if (ret)
- break;
- schedule();
- }
- remove_waiter(w, console_queue);
- return ret;
- }
-#ifdef HAVE_LWIP
- case FTYPE_SOCKET:
- return lwip_read(files[fd].socket.fd, buf, nbytes);
-#endif
-#ifdef CONFIG_NETFRONT
- case FTYPE_TAP: {
- ssize_t ret;
- ret = netfront_receive(files[fd].tap.dev, buf, nbytes);
- if (ret <= 0) {
- errno = EAGAIN;
- return -1;
- }
- return ret;
- }
-#endif
-#ifdef CONFIG_KBDFRONT
- case FTYPE_KBD: {
- int ret, n;
- n = nbytes / sizeof(union xenkbd_in_event);
- ret = kbdfront_receive(files[fd].kbd.dev, buf, n);
- if (ret <= 0) {
- errno = EAGAIN;
- return -1;
- }
- return ret * sizeof(union xenkbd_in_event);
- }
-#endif
-#ifdef CONFIG_FBFRONT
- case FTYPE_FB: {
- int ret, n;
- n = nbytes / sizeof(union xenfb_in_event);
- ret = fbfront_receive(files[fd].fb.dev, buf, n);
- if (ret <= 0) {
- errno = EAGAIN;
- return -1;
- }
- return ret * sizeof(union xenfb_in_event);
- }
-#endif
-#ifdef CONFIG_BLKFRONT
- case FTYPE_BLK: {
- return blkfront_posix_read(fd, buf, nbytes);
- }
-#endif
-#ifdef CONFIG_TPMFRONT
- case FTYPE_TPMFRONT: {
- return tpmfront_posix_read(fd, buf, nbytes);
- }
-#endif
-#ifdef CONFIG_TPM_TIS
- case FTYPE_TPM_TIS: {
- return tpm_tis_posix_read(fd, buf, nbytes);
- }
-#endif
- default:
- break;
- }
- printk("read(%d): Bad descriptor\n", fd);
- errno = EBADF;
- return -1;
-}
-
-int write(int fd, const void *buf, size_t nbytes)
-{
- switch (files[fd].type) {
- case FTYPE_SAVEFILE: {
- int ret = 0, tot = nbytes;
- while (nbytes > 0) {
- ret = xencons_ring_send(files[fd].cons.dev, (char *)buf, nbytes);
- nbytes -= ret;
- buf = (char *)buf + ret;
- }
- return tot - nbytes;
- }
- case FTYPE_CONSOLE:
- console_print(files[fd].cons.dev, (char *)buf, nbytes);
- return nbytes;
-#ifdef HAVE_LWIP
- case FTYPE_SOCKET:
- return lwip_write(files[fd].socket.fd, (void*) buf, nbytes);
-#endif
-#ifdef CONFIG_NETFRONT
- case FTYPE_TAP:
- netfront_xmit(files[fd].tap.dev, (void*) buf, nbytes);
- return nbytes;
-#endif
-#ifdef CONFIG_BLKFRONT
- case FTYPE_BLK:
- return blkfront_posix_write(fd, buf, nbytes);
-#endif
-#ifdef CONFIG_TPMFRONT
- case FTYPE_TPMFRONT:
- return tpmfront_posix_write(fd, buf, nbytes);
-#endif
-#ifdef CONFIG_TPM_TIS
- case FTYPE_TPM_TIS:
- return tpm_tis_posix_write(fd, buf, nbytes);
-#endif
- default:
- break;
- }
- printk("write(%d): Bad descriptor\n", fd);
- errno = EBADF;
- return -1;
-}
-
-off_t lseek(int fd, off_t offset, int whence)
-{
- off_t* target = NULL;
- switch(files[fd].type) {
-#ifdef CONFIG_BLKFRONT
- case FTYPE_BLK:
- target = &files[fd].blk.offset;
- break;
-#endif
-#ifdef CONFIG_TPMFRONT
- case FTYPE_TPMFRONT:
- target = &files[fd].tpmfront.offset;
- break;
-#endif
-#ifdef CONFIG_TPM_TIS
- case FTYPE_TPM_TIS:
- target = &files[fd].tpm_tis.offset;
- break;
-#endif
- case FTYPE_FILE:
- target = &files[fd].file.offset;
- break;
- default:
- /* Not implemented for this filetype */
- errno = ESPIPE;
- return (off_t) -1;
- }
-
- switch (whence) {
- case SEEK_SET:
- *target = offset;
- break;
- case SEEK_CUR:
- *target += offset;
- break;
- case SEEK_END:
- {
- struct stat st;
- int ret;
- ret = fstat(fd, &st);
- if (ret)
- return -1;
- *target = st.st_size + offset;
- break;
- }
- default:
- errno = EINVAL;
- return -1;
- }
- return *target;
-}
-
-int fsync(int fd) {
- errno = EBADF;
- return -1;
-}
-
-int close(int fd)
-{
- printk("close(%d)\n", fd);
- switch (files[fd].type) {
- default:
- files[fd].type = FTYPE_NONE;
- return 0;
-#ifdef CONFIG_XENBUS
- case FTYPE_XENBUS:
- xs_daemon_close((void*)(intptr_t) fd);
- return 0;
-#endif
-#ifdef HAVE_LWIP
- case FTYPE_SOCKET: {
- int res = lwip_close(files[fd].socket.fd);
- files[fd].type = FTYPE_NONE;
- return res;
- }
-#endif
-#ifdef CONFIG_XC
- case FTYPE_XC:
- minios_interface_close_fd(fd);
- return 0;
- case FTYPE_EVTCHN:
- minios_evtchn_close_fd(fd);
- return 0;
- case FTYPE_GNTMAP:
- minios_gnttab_close_fd(fd);
- return 0;
-#endif
-#ifdef CONFIG_NETFRONT
- case FTYPE_TAP:
- shutdown_netfront(files[fd].tap.dev);
- files[fd].type = FTYPE_NONE;
- return 0;
-#endif
-#ifdef CONFIG_BLKFRONT
- case FTYPE_BLK:
- shutdown_blkfront(files[fd].blk.dev);
- files[fd].type = FTYPE_NONE;
- return 0;
-#endif
-#ifdef CONFIG_TPMFRONT
- case FTYPE_TPMFRONT:
- shutdown_tpmfront(files[fd].tpmfront.dev);
- files[fd].type = FTYPE_NONE;
- return 0;
-#endif
-#ifdef CONFIG_TPM_TIS
- case FTYPE_TPM_TIS:
- shutdown_tpm_tis(files[fd].tpm_tis.dev);
- files[fd].type = FTYPE_NONE;
- return 0;
-#endif
-#ifdef CONFIG_KBDFRONT
- case FTYPE_KBD:
- shutdown_kbdfront(files[fd].kbd.dev);
- files[fd].type = FTYPE_NONE;
- return 0;
-#endif
-#ifdef CONFIG_FBFRONT
- case FTYPE_FB:
- shutdown_fbfront(files[fd].fb.dev);
- files[fd].type = FTYPE_NONE;
- return 0;
-#endif
-#ifdef CONFIG_CONSFRONT
- case FTYPE_SAVEFILE:
- case FTYPE_CONSOLE:
- fini_console(files[fd].cons.dev);
- files[fd].type = FTYPE_NONE;
- return 0;
-#endif
- case FTYPE_NONE:
- break;
- }
- printk("close(%d): Bad descriptor\n", fd);
- errno = EBADF;
- return -1;
-}
-
-static void init_stat(struct stat *buf)
-{
- memset(buf, 0, sizeof(*buf));
- buf->st_dev = 0;
- buf->st_ino = 0;
- buf->st_nlink = 1;
- buf->st_rdev = 0;
- buf->st_blksize = 4096;
- buf->st_blocks = 0;
-}
-
-int stat(const char *path, struct stat *buf)
-{
- errno = EIO;
- return -1;
-}
-
-int fstat(int fd, struct stat *buf)
-{
- init_stat(buf);
- switch (files[fd].type) {
- case FTYPE_SAVEFILE:
- case FTYPE_CONSOLE:
- case FTYPE_SOCKET: {
- if (files[fd].type == FTYPE_CONSOLE)
- buf->st_mode = S_IFCHR|S_IRUSR|S_IWUSR;
- else if (files[fd].type == FTYPE_SOCKET)
- buf->st_mode = S_IFSOCK|S_IRUSR|S_IWUSR;
- else if (files[fd].type == FTYPE_SAVEFILE)
- buf->st_mode = S_IFREG|S_IRUSR|S_IWUSR;
- buf->st_uid = 0;
- buf->st_gid = 0;
- buf->st_size = 0;
- buf->st_atime =
- buf->st_mtime =
- buf->st_ctime = time(NULL);
- return 0;
- }
-#ifdef CONFIG_BLKFRONT
- case FTYPE_BLK:
- return blkfront_posix_fstat(fd, buf);
-#endif
-#ifdef CONFIG_TPMFRONT
- case FTYPE_TPMFRONT:
- return tpmfront_posix_fstat(fd, buf);
-#endif
-#ifdef CONFIG_TPM_TIS
- case FTYPE_TPM_TIS:
- return tpm_tis_posix_fstat(fd, buf);
-#endif
- default:
- break;
- }
-
- printk("statf(%d): Bad descriptor\n", fd);
- printk("fstat(%d): Bad descriptor\n", fd);
- return -1;
-}
-
-int ftruncate(int fd, off_t length)
-{
- errno = EBADF;
- return -1;
-}
-
-int remove(const char *pathname)
-{
- errno = EIO;
- return -1;
-}
-
-int unlink(const char *pathname)
-{
- return remove(pathname);
-}
-
-int rmdir(const char *pathname)
-{
- return remove(pathname);
-}
-
-int fcntl(int fd, int cmd, ...)
-{
- long arg;
- va_list ap;
- va_start(ap, cmd);
- arg = va_arg(ap, long);
- va_end(ap);
-
- switch (cmd) {
-#ifdef HAVE_LWIP
- case F_SETFL:
- if (files[fd].type == FTYPE_SOCKET && !(arg & ~O_NONBLOCK)) {
- /* Only flag supported: non-blocking mode */
- uint32_t nblock = !!(arg & O_NONBLOCK);
- return lwip_ioctl(files[fd].socket.fd, FIONBIO, &nblock);
- }
- /* Fallthrough */
-#endif
- default:
- printk("fcntl(%d, %d, %lx/%lo)\n", fd, cmd, arg, arg);
- errno = ENOSYS;
- return -1;
- }
-}
-
-DIR *opendir(const char *name)
-{
- DIR *ret;
- ret = malloc(sizeof(*ret));
- ret->name = strdup(name);
- ret->offset = 0;
- ret->entries = NULL;
- ret->curentry = -1;
- ret->nbentries = 0;
- ret->has_more = 1;
- return ret;
-}
-
-struct dirent *readdir(DIR *dir)
-{
- return NULL;
-}
-
-int closedir(DIR *dir)
-{
- int i;
- for (i=0; i<dir->nbentries; i++)
- free(dir->entries[i]);
- free(dir->entries);
- free(dir->name);
- free(dir);
- return 0;
-}
-
-/* We assume that only the main thread calls select(). */
-
-static const char file_types[] = {
- [FTYPE_NONE] = 'N',
- [FTYPE_CONSOLE] = 'C',
- [FTYPE_XENBUS] = 'S',
- [FTYPE_XC] = 'X',
- [FTYPE_EVTCHN] = 'E',
- [FTYPE_SOCKET] = 's',
- [FTYPE_TAP] = 'T',
- [FTYPE_BLK] = 'B',
- [FTYPE_KBD] = 'K',
- [FTYPE_FB] = 'G',
-};
-#ifdef LIBC_DEBUG
-static void dump_set(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout)
-{
- int i, comma;
-#define printfds(set) do {\
- comma = 0; \
- for (i = 0; i < nfds; i++) { \
- if (FD_ISSET(i, set)) { \
- if (comma) \
- printk(", "); \
- printk("%d(%c)", i, file_types[files[i].type]); \
- comma = 1; \
- } \
- } \
-} while (0)
-
- printk("[");
- if (readfds)
- printfds(readfds);
- printk("], [");
- if (writefds)
- printfds(writefds);
- printk("], [");
- if (exceptfds)
- printfds(exceptfds);
- printk("], ");
- if (timeout)
- printk("{ %ld, %ld }", timeout->tv_sec, timeout->tv_usec);
-}
-#else
-#define dump_set(nfds, readfds, writefds, exceptfds, timeout)
-#endif
-
-#ifdef LIBC_DEBUG
-static void dump_pollfds(struct pollfd *pfd, int nfds, int timeout)
-{
- int i, comma, fd;
-
- printk("[");
- comma = 0;
- for (i = 0; i < nfds; i++) {
- fd = pfd[i].fd;
- if (comma)
- printk(", ");
- printk("%d(%c)/%02x", fd, file_types[files[fd].type],
- pfd[i].events);
- comma = 1;
- }
- printk("]");
-
- printk(", %d, %d", nfds, timeout);
-}
-#else
-#define dump_pollfds(pfds, nfds, timeout)
-#endif
-
-/* Just poll without blocking */
-static int select_poll(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds)
-{
- int i, n = 0;
-#ifdef HAVE_LWIP
- int sock_n = 0, sock_nfds = 0;
- fd_set sock_readfds, sock_writefds, sock_exceptfds;
- struct timeval timeout = { .tv_sec = 0, .tv_usec = 0};
-#endif
-
-#ifdef LIBC_VERBOSE
- static int nb;
- static int nbread[NOFILE], nbwrite[NOFILE], nbexcept[NOFILE];
- static s_time_t lastshown;
-
- nb++;
-#endif
-
-#ifdef HAVE_LWIP
- /* first poll network */
- FD_ZERO(&sock_readfds);
- FD_ZERO(&sock_writefds);
- FD_ZERO(&sock_exceptfds);
- for (i = 0; i < nfds; i++) {
- if (files[i].type == FTYPE_SOCKET) {
- if (FD_ISSET(i, readfds)) {
- FD_SET(files[i].socket.fd, &sock_readfds);
- sock_nfds = i+1;
- }
- if (FD_ISSET(i, writefds)) {
- FD_SET(files[i].socket.fd, &sock_writefds);
- sock_nfds = i+1;
- }
- if (FD_ISSET(i, exceptfds)) {
- FD_SET(files[i].socket.fd, &sock_exceptfds);
- sock_nfds = i+1;
- }
- }
- }
- if (sock_nfds > 0) {
- DEBUG("lwip_select(");
- dump_set(nfds, &sock_readfds, &sock_writefds, &sock_exceptfds, &timeout);
- DEBUG("); -> ");
- sock_n = lwip_select(sock_nfds, &sock_readfds, &sock_writefds, &sock_exceptfds, &timeout);
- dump_set(nfds, &sock_readfds, &sock_writefds, &sock_exceptfds, &timeout);
- DEBUG("\n");
- }
-#endif
-
- /* Then see others as well. */
- for (i = 0; i < nfds; i++) {
- switch(files[i].type) {
- default:
- if (FD_ISSET(i, readfds) || FD_ISSET(i, writefds) || FD_ISSET(i, exceptfds))
- printk("bogus fd %d in select\n", i);
- /* Fallthrough. */
- case FTYPE_CONSOLE:
- if (FD_ISSET(i, readfds)) {
- if (xencons_ring_avail(files[i].cons.dev))
- n++;
- else
- FD_CLR(i, readfds);
- }
- if (FD_ISSET(i, writefds))
- n++;
- FD_CLR(i, exceptfds);
- break;
-#ifdef CONFIG_XENBUS
- case FTYPE_XENBUS:
- if (FD_ISSET(i, readfds)) {
- if (files[i].xenbus.events)
- n++;
- else
- FD_CLR(i, readfds);
- }
- FD_CLR(i, writefds);
- FD_CLR(i, exceptfds);
- break;
-#endif
- case FTYPE_EVTCHN:
- case FTYPE_TAP:
- case FTYPE_BLK:
- case FTYPE_KBD:
- case FTYPE_FB:
- if (FD_ISSET(i, readfds)) {
- if (files[i].read)
- n++;
- else
- FD_CLR(i, readfds);
- }
- FD_CLR(i, writefds);
- FD_CLR(i, exceptfds);
- break;
-#ifdef HAVE_LWIP
- case FTYPE_SOCKET:
- if (FD_ISSET(i, readfds)) {
- /* Optimize no-network-packet case. */
- if (sock_n && FD_ISSET(files[i].socket.fd, &sock_readfds))
- n++;
- else
- FD_CLR(i, readfds);
- }
- if (FD_ISSET(i, writefds)) {
- if (sock_n && FD_ISSET(files[i].socket.fd, &sock_writefds))
- n++;
- else
- FD_CLR(i, writefds);
- }
- if (FD_ISSET(i, exceptfds)) {
- if (sock_n && FD_ISSET(files[i].socket.fd, &sock_exceptfds))
- n++;
- else
- FD_CLR(i, exceptfds);
- }
- break;
-#endif
- }
-#ifdef LIBC_VERBOSE
- if (FD_ISSET(i, readfds))
- nbread[i]++;
- if (FD_ISSET(i, writefds))
- nbwrite[i]++;
- if (FD_ISSET(i, exceptfds))
- nbexcept[i]++;
-#endif
- }
-#ifdef LIBC_VERBOSE
- if (NOW() > lastshown + 1000000000ull) {
- lastshown = NOW();
- printk("%lu MB free, ", num_free_pages() / ((1 << 20) / PAGE_SIZE));
- printk("%d(%d): ", nb, sock_n);
- for (i = 0; i < nfds; i++) {
- if (nbread[i] || nbwrite[i] || nbexcept[i])
- printk(" %d(%c):", i, file_types[files[i].type]);
- if (nbread[i])
- printk(" %dR", nbread[i]);
- if (nbwrite[i])
- printk(" %dW", nbwrite[i]);
- if (nbexcept[i])
- printk(" %dE", nbexcept[i]);
- }
- printk("\n");
- memset(nbread, 0, sizeof(nbread));
- memset(nbwrite, 0, sizeof(nbwrite));
- memset(nbexcept, 0, sizeof(nbexcept));
- nb = 0;
- }
-#endif
- return n;
-}
-
-/* The strategy is to
- * - announce that we will maybe sleep,
- * - poll a bit; if successful, return,
- * - if the timeout has expired, return,
- * - really sleep (unless somebody woke us in the meantime). */
-int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
- struct timeval *timeout)
-{
- int n, ret;
- fd_set myread, mywrite, myexcept;
- struct thread *thread = get_current();
- s_time_t start = NOW(), stop;
-#ifdef CONFIG_NETFRONT
- DEFINE_WAIT(netfront_w);
-#endif
- DEFINE_WAIT(event_w);
-#ifdef CONFIG_BLKFRONT
- DEFINE_WAIT(blkfront_w);
-#endif
-#ifdef CONFIG_XENBUS
- DEFINE_WAIT(xenbus_watch_w);
-#endif
-#ifdef CONFIG_KBDFRONT
- DEFINE_WAIT(kbdfront_w);
-#endif
- DEFINE_WAIT(console_w);
-
- assert(thread == main_thread);
-
- DEBUG("select(%d, ", nfds);
- dump_set(nfds, readfds, writefds, exceptfds, timeout);
- DEBUG(");\n");
-
- if (timeout)
- stop = start + SECONDS(timeout->tv_sec) + timeout->tv_usec * 1000;
- else
- /* just make gcc happy */
- stop = start;
-
- /* Tell people we're going to sleep before looking at what they are
- * saying, hence letting them wake us if events happen between here and
- * schedule() */
-#ifdef CONFIG_NETFRONT
- add_waiter(netfront_w, netfront_queue);
-#endif
- add_waiter(event_w, event_queue);
-#ifdef CONFIG_BLKFRONT
- add_waiter(blkfront_w, blkfront_queue);
-#endif
-#ifdef CONFIG_XENBUS
- add_waiter(xenbus_watch_w, xenbus_watch_queue);
-#endif
-#ifdef CONFIG_KBDFRONT
- add_waiter(kbdfront_w, kbdfront_queue);
-#endif
- add_waiter(console_w, console_queue);
-
- if (readfds)
- myread = *readfds;
- else
- FD_ZERO(&myread);
- if (writefds)
- mywrite = *writefds;
- else
- FD_ZERO(&mywrite);
- if (exceptfds)
- myexcept = *exceptfds;
- else
- FD_ZERO(&myexcept);
-
- DEBUG("polling ");
- dump_set(nfds, &myread, &mywrite, &myexcept, timeout);
- DEBUG("\n");
- n = select_poll(nfds, &myread, &mywrite, &myexcept);
-
- if (n) {
- dump_set(nfds, readfds, writefds, exceptfds, timeout);
- if (readfds)
- *readfds = myread;
- if (writefds)
- *writefds = mywrite;
- if (exceptfds)
- *exceptfds = myexcept;
- DEBUG(" -> ");
- dump_set(nfds, readfds, writefds, exceptfds, timeout);
- DEBUG("\n");
- wake(thread);
- ret = n;
- goto out;
- }
- if (timeout && NOW() >= stop) {
- if (readfds)
- FD_ZERO(readfds);
- if (writefds)
- FD_ZERO(writefds);
- if (exceptfds)
- FD_ZERO(exceptfds);
- timeout->tv_sec = 0;
- timeout->tv_usec = 0;
- wake(thread);
- ret = 0;
- goto out;
- }
-
- if (timeout)
- thread->wakeup_time = stop;
- schedule();
-
- if (readfds)
- myread = *readfds;
- else
- FD_ZERO(&myread);
- if (writefds)
- mywrite = *writefds;
- else
- FD_ZERO(&mywrite);
- if (exceptfds)
- myexcept = *exceptfds;
- else
- FD_ZERO(&myexcept);
-
- n = select_poll(nfds, &myread, &mywrite, &myexcept);
-
- if (n) {
- if (readfds)
- *readfds = myread;
- if (writefds)
- *writefds = mywrite;
- if (exceptfds)
- *exceptfds = myexcept;
- ret = n;
- goto out;
- }
- errno = EINTR;
- ret = -1;
-
-out:
-#ifdef CONFIG_NETFRONT
- remove_waiter(netfront_w, netfront_queue);
-#endif
- remove_waiter(event_w, event_queue);
-#ifdef CONFIG_BLKFRONT
- remove_waiter(blkfront_w, blkfront_queue);
-#endif
-#ifdef CONFIG_XENBUS
- remove_waiter(xenbus_watch_w, xenbus_watch_queue);
-#endif
-#ifdef CONFIG_KBDFRONT
- remove_waiter(kbdfront_w, kbdfront_queue);
-#endif
- remove_waiter(console_w, console_queue);
- return ret;
-}
-
-/* Wrap around select */
-int poll(struct pollfd _pfd[], nfds_t _nfds, int _timeout)
-{
- int n, ret;
- int i, fd;
- struct timeval _timeo, *timeo = NULL;
- fd_set rfds, wfds, efds;
- int max_fd = -1;
-
- DEBUG("poll(");
- dump_pollfds(_pfd, _nfds, _timeout);
- DEBUG(")\n");
-
- FD_ZERO(&rfds);
- FD_ZERO(&wfds);
- FD_ZERO(&efds);
-
- n = 0;
-
- for (i = 0; i < _nfds; i++) {
- fd = _pfd[i].fd;
- _pfd[i].revents = 0;
-
- /* fd < 0, revents = 0, which is already set */
- if (fd < 0) continue;
-
- /* fd is invalid, revents = POLLNVAL, increment counter */
- if (fd >= NOFILE || files[fd].type == FTYPE_NONE) {
- n++;
- _pfd[i].revents |= POLLNVAL;
- continue;
- }
-
- /* normal case, map POLL* into readfds and writefds:
- * POLLIN -> readfds
- * POLLOUT -> writefds
- * other POLL* -> none
- */
- if (_pfd[i].events & POLLIN)
- FD_SET(fd, &rfds);
- if (_pfd[i].events & POLLOUT)
- FD_SET(fd, &wfds);
- /* always set exceptfds */
- FD_SET(fd, &efds);
- if (fd > max_fd)
- max_fd = fd;
- }
-
- /* should never sleep when we already have events */
- if (n) {
- _timeo.tv_sec = 0;
- _timeo.tv_usec = 0;
- timeo = &_timeo;
- } else if (_timeout >= 0) {
- /* normal case, construct _timeout, might sleep */
- _timeo.tv_sec = _timeout / 1000;
- _timeo.tv_usec = (_timeout % 1000) * 1000;
- timeo = &_timeo;
- } else {
- /* _timeout < 0, block forever */
- timeo = NULL;
- }
-
-
- ret = select(max_fd+1, &rfds, &wfds, &efds, timeo);
- /* error in select, just return, errno is set by select() */
- if (ret < 0)
- return ret;
-
- for (i = 0; i < _nfds; i++) {
- fd = _pfd[i].fd;
-
- /* revents has already been set for all error cases */
- if (fd < 0 || fd >= NOFILE || files[fd].type == FTYPE_NONE)
- continue;
-
- if (FD_ISSET(fd, &rfds) || FD_ISSET(fd, &wfds) || FD_ISSET(fd, &efds))
- n++;
- if (FD_ISSET(fd, &efds)) {
- /* if anything bad happens we set POLLERR */
- _pfd[i].revents |= POLLERR;
- continue;
- }
- if (FD_ISSET(fd, &rfds))
- _pfd[i].revents |= POLLIN;
- if (FD_ISSET(fd, &wfds))
- _pfd[i].revents |= POLLOUT;
- }
-
- return n;
-}
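
A minimal usage sketch of the wrapper above (illustrative only; any fd flagged in the exception set comes back as POLLERR):

    static void poll_demo(void)
    {
        struct pollfd pfd = { .fd = 0, .events = POLLIN };
        int n = poll(&pfd, 1, 1000);  /* wait up to 1s for console input */
        if (n > 0 && (pfd.revents & POLLIN)) {
            /* fd 0 (stdin, a console) is readable */
        }
    }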
-
-#ifdef HAVE_LWIP
-int socket(int domain, int type, int protocol)
-{
- int fd, res;
- fd = lwip_socket(domain, type, protocol);
- if (fd < 0)
- return -1;
- res = alloc_fd(FTYPE_SOCKET);
- printk("socket -> %d\n", res);
- files[res].socket.fd = fd;
- return res;
-}
-
-int accept(int s, struct sockaddr *addr, socklen_t *addrlen)
-{
- int fd, res;
- if (files[s].type != FTYPE_SOCKET) {
- printk("accept(%d): Bad descriptor\n", s);
- errno = EBADF;
- return -1;
- }
- fd = lwip_accept(files[s].socket.fd, addr, addrlen);
- if (fd < 0)
- return -1;
- res = alloc_fd(FTYPE_SOCKET);
- files[res].socket.fd = fd;
- printk("accepted on %d -> %d\n", s, res);
- return res;
-}
-
-#define LWIP_STUB(ret, name, proto, args) \
-ret name proto \
-{ \
- if (files[s].type != FTYPE_SOCKET) { \
- printk(#name "(%d): Bad descriptor\n", s); \
- errno = EBADF; \
- return -1; \
- } \
- s = files[s].socket.fd; \
- return lwip_##name args; \
-}
-
-LWIP_STUB(int, bind, (int s, struct sockaddr *my_addr, socklen_t addrlen), (s, my_addr, addrlen))
-LWIP_STUB(int, getsockopt, (int s, int level, int optname, void *optval, socklen_t *optlen), (s, level, optname, optval, optlen))
-LWIP_STUB(int, setsockopt, (int s, int level, int optname, void *optval, socklen_t optlen), (s, level, optname, optval, optlen))
-LWIP_STUB(int, connect, (int s, struct sockaddr *serv_addr, socklen_t addrlen), (s, serv_addr, addrlen))
-LWIP_STUB(int, listen, (int s, int backlog), (s, backlog));
-LWIP_STUB(ssize_t, recv, (int s, void *buf, size_t len, int flags), (s, buf, len, flags))
-LWIP_STUB(ssize_t, recvfrom, (int s, void *buf, size_t len, int flags, struct sockaddr *from, socklen_t *fromlen), (s, buf, len, flags, from, fromlen))
-LWIP_STUB(ssize_t, send, (int s, void *buf, size_t len, int flags), (s, buf, len, flags))
-LWIP_STUB(ssize_t, sendto, (int s, void *buf, size_t len, int flags, struct sockaddr *to, socklen_t tolen), (s, buf, len, flags, to, tolen))
-LWIP_STUB(int, getsockname, (int s, struct sockaddr *name, socklen_t *namelen), (s, name, namelen))
-#endif
-
-static char *syslog_ident;
-void openlog(const char *ident, int option, int facility)
-{
- free(syslog_ident);
- syslog_ident = strdup(ident);
-}
-
-void vsyslog(int priority, const char *format, va_list ap)
-{
- printk("%s: ", syslog_ident);
- print(0, format, ap);
-}
-
-void syslog(int priority, const char *format, ...)
-{
- va_list ap;
- va_start(ap, format);
- vsyslog(priority, format, ap);
- va_end(ap);
-}
-
-void closelog(void)
-{
- free(syslog_ident);
- syslog_ident = NULL;
-}
-
-void vwarn(const char *format, va_list ap)
-{
- int the_errno = errno;
- printk("stubdom: ");
- if (format) {
- print(0, format, ap);
- printk(", ");
- }
- printk("%s", strerror(the_errno));
-}
-
-void warn(const char *format, ...)
-{
- va_list ap;
- va_start(ap, format);
- vwarn(format, ap);
- va_end(ap);
-}
-
-void verr(int eval, const char *format, va_list ap)
-{
- vwarn(format, ap);
- exit(eval);
-}
-
-void err(int eval, const char *format, ...)
-{
- va_list ap;
- va_start(ap, format);
- verr(eval, format, ap);
- va_end(ap);
-}
-
-void vwarnx(const char *format, va_list ap)
-{
- printk("stubdom: ");
- if (format)
- print(0, format, ap);
-}
-
-void warnx(const char *format, ...)
-{
- va_list ap;
- va_start(ap, format);
- vwarnx(format, ap);
- va_end(ap);
-}
-
-void verrx(int eval, const char *format, va_list ap)
-{
- vwarnx(format, ap);
- exit(eval);
-}
-
-void errx(int eval, const char *format, ...)
-{
- va_list ap;
- va_start(ap, format);
- verrx(eval, format, ap);
- va_end(ap);
-}
-
-int nanosleep(const struct timespec *req, struct timespec *rem)
-{
- s_time_t start = NOW();
- s_time_t stop = start + SECONDS(req->tv_sec) + req->tv_nsec;
- s_time_t stopped;
- struct thread *thread = get_current();
-
- thread->wakeup_time = stop;
- clear_runnable(thread);
- schedule();
- stopped = NOW();
-
- if (rem)
- {
- s_time_t remaining = stop - stopped;
- if (remaining > 0)
- {
- rem->tv_nsec = remaining % 1000000000ULL;
- rem->tv_sec = remaining / 1000000000ULL;
- } else memset(rem, 0, sizeof(*rem));
- }
-
- return 0;
-}
-
-int usleep(useconds_t usec)
-{
- /* "usec shall be less than one million." */
- struct timespec req;
- req.tv_nsec = usec * 1000;
- req.tv_sec = 0;
-
- if (nanosleep(&req, NULL))
- return -1;
-
- return 0;
-}
-
-unsigned int sleep(unsigned int seconds)
-{
- struct timespec req, rem;
- req.tv_sec = seconds;
- req.tv_nsec = 0;
-
- if (nanosleep(&req, &rem))
- return -1;
-
- if (rem.tv_nsec > 0)
- rem.tv_sec++;
-
- return rem.tv_sec;
-}
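
A minimal illustrative sketch; as implemented above, these block only the calling Mini-OS thread:

    static void sleep_demo(void)
    {
        struct timespec req = { .tv_sec = 0, .tv_nsec = 250 * 1000 * 1000 };
        nanosleep(&req, NULL);  /* sleep 250 ms */
        usleep(1000);           /* sleep 1 ms; usec must stay below 1000000 */
    }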
-
-int clock_gettime(clockid_t clk_id, struct timespec *tp)
-{
- switch (clk_id) {
- case CLOCK_MONOTONIC:
- {
- struct timeval tv;
-
- gettimeofday(&tv, NULL);
-
- tp->tv_sec = tv.tv_sec;
- tp->tv_nsec = tv.tv_usec * 1000;
-
- break;
- }
- case CLOCK_REALTIME:
- {
- uint64_t nsec = monotonic_clock();
-
- tp->tv_sec = nsec / 1000000000ULL;
- tp->tv_nsec = nsec % 1000000000ULL;
-
- break;
- }
- default:
- print_unsupported("clock_gettime(%d)", clk_id);
- errno = EINVAL;
- return -1;
- }
-
- return 0;
-}
-
-uid_t getuid(void)
-{
- return 0;
-}
-
-uid_t geteuid(void)
-{
- return 0;
-}
-
-gid_t getgid(void)
-{
- return 0;
-}
-
-gid_t getegid(void)
-{
- return 0;
-}
-
-int gethostname(char *name, size_t namelen)
-{
- strncpy(name, "mini-os", namelen);
- return 0;
-}
-
-size_t getpagesize(void)
-{
- return PAGE_SIZE;
-}
-
-void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset)
-{
- unsigned long n = (length + PAGE_SIZE - 1) / PAGE_SIZE;
-
- ASSERT(!start);
- ASSERT(prot == (PROT_READ|PROT_WRITE));
- ASSERT((fd == -1 && (flags == (MAP_SHARED|MAP_ANON) || flags == (MAP_PRIVATE|MAP_ANON)))
- || (fd != -1 && flags == MAP_SHARED));
-
- if (fd == -1)
- return map_zero(n, 1);
-#ifdef CONFIG_XC
- else if (files[fd].type == FTYPE_XC) {
- unsigned long zero = 0;
- return map_frames_ex(&zero, n, 0, 0, 1, DOMID_SELF, NULL, 0);
- }
-#endif
- else if (files[fd].type == FTYPE_MEM) {
- unsigned long first_mfn = offset >> PAGE_SHIFT;
- return map_frames_ex(&first_mfn, n, 0, 1, 1, DOMID_IO, NULL, _PAGE_PRESENT|_PAGE_RW);
- } else ASSERT(0);
-}
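
The ASSERTs restrict mmap() to a few fixed modes; a minimal sketch of the anonymous case (illustrative only):

    static void mmap_demo(void)
    {
        /* One anonymous read/write page; fd == -1 requires exactly
         * MAP_PRIVATE|MAP_ANON or MAP_SHARED|MAP_ANON. */
        void *p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANON, -1, 0);
        munmap(p, PAGE_SIZE);
    }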
-
-int munmap(void *start, size_t length)
-{
- int total = length / PAGE_SIZE;
- int ret;
-
- ret = unmap_frames((unsigned long)start, (unsigned long)total);
- if (ret) {
- errno = ret;
- return -1;
- }
- return 0;
-}
-
-void sparse(unsigned long data, size_t size)
-{
- unsigned long newdata;
- xen_pfn_t *mfns;
- int i, n;
-
- newdata = (data + PAGE_SIZE - 1) & PAGE_MASK;
- if (newdata - data > size)
- return;
- size -= newdata - data;
- data = newdata;
- n = size / PAGE_SIZE;
- size = n * PAGE_SIZE;
-
- mfns = malloc(n * sizeof(*mfns));
- for (i = 0; i < n; i++) {
-#ifdef LIBC_DEBUG
- int j;
- for (j=0; j<PAGE_SIZE; j++)
- if (((char*)data + i * PAGE_SIZE)[j]) {
- printk("%lx is not zero!\n", data + i * PAGE_SIZE + j);
- exit(1);
- }
-#endif
- mfns[i] = virtual_to_mfn(data + i * PAGE_SIZE);
- }
-
- printk("sparsing %ldMB at %lx\n", size >> 20, data);
-
- munmap((void *) data, size);
- free_physical_pages(mfns, n);
- do_map_zero(data, n);
-}
-
-int nice(int inc)
-{
- printk("nice() stub called with inc=%d\n", inc);
- return 0;
-}
-
-
-/* Not supported by FS yet. */
-unsupported_function_crash(link);
-unsupported_function(int, readlink, -1);
-unsupported_function_crash(umask);
-
-/* We could support that. */
-unsupported_function_log(int, chdir, -1);
-
-/* No dynamic library support. */
-unsupported_function_log(void *, dlopen, NULL);
-unsupported_function_log(void *, dlsym, NULL);
-unsupported_function_log(char *, dlerror, NULL);
-unsupported_function_log(int, dlclose, -1);
-
-/* We don't raise signals anyway. */
-unsupported_function(int, sigemptyset, -1);
-unsupported_function(int, sigfillset, -1);
-unsupported_function(int, sigaddset, -1);
-unsupported_function(int, sigdelset, -1);
-unsupported_function(int, sigismember, -1);
-unsupported_function(int, sigprocmask, -1);
-unsupported_function(int, sigaction, -1);
-unsupported_function(int, __sigsetjmp, 0);
-unsupported_function(int, sigaltstack, -1);
-unsupported_function_crash(kill);
-
-/* Unsupported */
-unsupported_function_crash(pipe);
-unsupported_function_crash(fork);
-unsupported_function_crash(execv);
-unsupported_function_crash(execve);
-unsupported_function_crash(waitpid);
-unsupported_function_crash(wait);
-unsupported_function_crash(lockf);
-unsupported_function_crash(sysconf);
-unsupported_function(int, tcsetattr, -1);
-unsupported_function(int, tcgetattr, 0);
-unsupported_function(int, grantpt, -1);
-unsupported_function(int, unlockpt, -1);
-unsupported_function(char *, ptsname, NULL);
-
-/* net/if.h */
-unsupported_function_log(unsigned int, if_nametoindex, -1);
-unsupported_function_log(char *, if_indextoname, (char *) NULL);
-unsupported_function_log(struct if_nameindex *, if_nameindex, (struct if_nameindex *) NULL);
-unsupported_function_crash(if_freenameindex);
-
-/* Linuxish ABI for the Caml runtime; not supported.
- * Log and return an error code if possible. If it is not possible
- * to inform the application of an error, then crash instead!
- */
-unsupported_function_log(struct dirent *, readdir64, NULL);
-unsupported_function_log(int, getrusage, -1);
-unsupported_function_log(int, getrlimit, -1);
-unsupported_function_log(int, getrlimit64, -1);
-unsupported_function_log(int, __xstat64, -1);
-unsupported_function_log(long, __strtol_internal, LONG_MIN);
-unsupported_function_log(double, __strtod_internal, HUGE_VAL);
-unsupported_function_log(int, utime, -1);
-unsupported_function_log(int, truncate64, -1);
-unsupported_function_log(int, tcflow, -1);
-unsupported_function_log(int, tcflush, -1);
-unsupported_function_log(int, tcdrain, -1);
-unsupported_function_log(int, tcsendbreak, -1);
-unsupported_function_log(int, cfsetospeed, -1);
-unsupported_function_log(int, cfsetispeed, -1);
-unsupported_function_crash(cfgetospeed);
-unsupported_function_crash(cfgetispeed);
-unsupported_function_log(int, symlink, -1);
-unsupported_function_log(const char*, inet_ntop, NULL);
-unsupported_function_crash(__fxstat64);
-unsupported_function_crash(__lxstat64);
-unsupported_function_log(int, socketpair, -1);
-unsupported_function_crash(sigsuspend);
-unsupported_function_log(int, sigpending, -1);
-unsupported_function_log(int, shutdown, -1);
-unsupported_function_log(int, setuid, -1);
-unsupported_function_log(int, setgid, -1);
-unsupported_function_crash(rewinddir);
-unsupported_function_log(int, getpriority, -1);
-unsupported_function_log(int, setpriority, -1);
-unsupported_function_log(int, mkfifo, -1);
-unsupported_function_log(int, getitimer, -1);
-unsupported_function_log(int, setitimer, -1);
-unsupported_function_log(void *, getservbyport, NULL);
-unsupported_function_log(void *, getservbyname, NULL);
-unsupported_function_log(void *, getpwuid, NULL);
-unsupported_function_log(void *, getpwnam, NULL);
-unsupported_function_log(void *, getprotobynumber, NULL);
-unsupported_function_log(void *, getprotobyname, NULL);
-unsupported_function_log(int, getpeername, -1);
-unsupported_function_log(int, getnameinfo, -1);
-unsupported_function_log(char *, getlogin, NULL);
-unsupported_function_crash(__h_errno_location);
-unsupported_function_log(int, gethostbyname_r, -1);
-unsupported_function_log(int, gethostbyaddr_r, -1);
-unsupported_function_log(int, getgroups, -1);
-unsupported_function_log(void *, getgrgid, NULL);
-unsupported_function_log(void *, getgrnam, NULL);
-unsupported_function_log(int, getaddrinfo, -1);
-unsupported_function_log(int, freeaddrinfo, -1);
-unsupported_function_log(int, ftruncate64, -1);
-unsupported_function_log(int, fchown, -1);
-unsupported_function_log(int, fchmod, -1);
-unsupported_function_crash(execvp);
-unsupported_function_log(int, dup, -1);
-unsupported_function_log(int, chroot, -1);
-unsupported_function_log(int, chown, -1);
-unsupported_function_log(int, chmod, -1);
-unsupported_function_crash(alarm);
-unsupported_function_log(int, inet_pton, -1);
-unsupported_function_log(int, access, -1);
-#endif
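
The unsupported_function, unsupported_function_log and unsupported_function_crash
macros used throughout this file are defined earlier in sys.c, outside the hunk
shown here. As a rough, hypothetical sketch of their shape (a reconstruction,
not the actual mini-os definitions), each one expands to an unprototyped stub
that fails quietly, logs and fails, or crashes:

    /* Hypothetical reconstruction; the real macros in sys.c differ in detail. */
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define unsupported_function(type, name, ret) \
        type name() /* unprototyped, so the stub matches any call */ \
        { \
            errno = ENOSYS; \
            return ret; \
        }

    #define unsupported_function_log(type, name, ret) \
        type name() \
        { \
            fprintf(stderr, #name " is not implemented\n"); \
            errno = ENOSYS; \
            return ret; \
        }

    #define unsupported_function_crash(name) \
        int name() \
        { \
            fprintf(stderr, #name " is not implemented, aborting\n"); \
            abort(); \
        }

    unsupported_function_log(int, chdir_demo, -1) /* example expansion */

    int main(void)
    {
        printf("chdir_demo() -> %d\n", chdir_demo());
        return 0;
    }
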
diff --git a/extras/mini-os/lib/xmalloc.c b/extras/mini-os/lib/xmalloc.c
deleted file mode 100644
index e16f161..0000000
--- a/extras/mini-os/lib/xmalloc.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- ****************************************************************************
- *
- * File: xmalloc.c
- * Author: Grzegorz Milos (gm281 at cam.ac.uk)
- * Samuel Thibault (samuel.thibault at eu.citrix.com)
- * Changes:
- *
- * Date: Aug 2005
- * Jan 2008
- *
- * Environment: Xen Minimal OS
- * Description: simple memory allocator
- *
- ****************************************************************************
- * Simple allocator for Mini-os. If larger than a page, simply use the
- * page-order allocator.
- *
- * Copy of the allocator for Xen by Rusty Russell:
- * Copyright (C) 2005 Rusty Russell IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <mini-os/os.h>
-#include <mini-os/mm.h>
-#include <mini-os/types.h>
-#include <mini-os/lib.h>
-#include <mini-os/list.h>
-#include <mini-os/xmalloc.h>
-
-#ifndef HAVE_LIBC
-/* static spinlock_t freelist_lock = SPIN_LOCK_UNLOCKED; */
-
-struct xmalloc_hdr
-{
- /* Total including this hdr, unused padding and second hdr. */
- size_t size;
- MINIOS_TAILQ_ENTRY(struct xmalloc_hdr) freelist;
-} __cacheline_aligned;
-
-static MINIOS_TAILQ_HEAD(,struct xmalloc_hdr) freelist =
- MINIOS_TAILQ_HEAD_INITIALIZER(freelist);
-
-/* Unused padding data between the two hdrs. */
-
-struct xmalloc_pad
-{
- /* Size including both hdrs. */
- size_t hdr_size;
-};
-
-/* Return size, rounded up to a multiple of align (align must be a power of two). */
-static inline size_t align_up(size_t size, size_t align)
-{
- return (size + align - 1) & ~(align - 1);
-}
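
align_up() is the standard power-of-two rounding trick: adding align - 1 pushes
any unaligned value past the next boundary, and masking with ~(align - 1) then
clears the low bits. A standalone demonstration with toy values, not mini-os
code:

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    static size_t align_up(size_t size, size_t align)
    {
        return (size + align - 1) & ~(align - 1);
    }

    int main(void)
    {
        assert(align_up(13, 8) == 16); /* rounded up to the next multiple */
        assert(align_up(16, 8) == 16); /* already aligned: unchanged */
        printf("align_up(13, 8) = %zu\n", align_up(13, 8));
        return 0;
    }
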
-
-static void maybe_split(struct xmalloc_hdr *hdr, size_t size, size_t block)
-{
- struct xmalloc_hdr *extra;
- size_t leftover;
- size = align_up(size, __alignof__(struct xmalloc_hdr));
- size = align_up(size, __alignof__(struct xmalloc_pad));
- leftover = block - size;
-
- /* If enough is left to make a block, put it on free list. */
- if ( leftover >= (2 * (sizeof(struct xmalloc_hdr) + sizeof(struct xmalloc_pad))) )
- {
- extra = (struct xmalloc_hdr *)((unsigned long)hdr + size);
- extra->size = leftover;
- /* spin_lock_irqsave(&freelist_lock, flags); */
- MINIOS_TAILQ_INSERT_HEAD(&freelist, extra, freelist);
- /* spin_unlock_irqrestore(&freelist_lock, flags); */
- }
- else
- {
- size = block;
- }
-
- hdr->size = size;
-}
-
-static struct xmalloc_hdr *xmalloc_new_page(size_t size)
-{
- struct xmalloc_hdr *hdr;
- /* unsigned long flags; */
-
- hdr = (struct xmalloc_hdr *)alloc_page();
- if ( hdr == NULL )
- return NULL;
-
- maybe_split(hdr, size, PAGE_SIZE);
-
- return hdr;
-}
-
-/* Big object? Just use the page allocator. */
-static void *xmalloc_whole_pages(size_t size, size_t align)
-{
- struct xmalloc_hdr *hdr;
- struct xmalloc_pad *pad;
- unsigned int pageorder;
- void *ret;
- /* Room for headers */
- size_t hdr_size = sizeof(struct xmalloc_hdr) + sizeof(struct xmalloc_pad);
- /* Align for actual beginning of data */
- hdr_size = align_up(hdr_size, align);
-
- pageorder = get_order(hdr_size + size);
-
- hdr = (struct xmalloc_hdr *)alloc_pages(pageorder);
- if ( hdr == NULL )
- return NULL;
-
- hdr->size = (1UL << (pageorder + PAGE_SHIFT));
-
- ret = (char*)hdr + hdr_size;
- pad = (struct xmalloc_pad *) ret - 1;
- pad->hdr_size = hdr_size;
- return ret;
-}
-
-void *_xmalloc(size_t size, size_t align)
-{
- struct xmalloc_hdr *i, *tmp, *hdr = NULL;
- uintptr_t data_begin;
- size_t hdr_size;
- /* unsigned long flags; */
-
- hdr_size = sizeof(struct xmalloc_hdr) + sizeof(struct xmalloc_pad);
- /* Align on headers requirements. */
- align = align_up(align, __alignof__(struct xmalloc_hdr));
- align = align_up(align, __alignof__(struct xmalloc_pad));
-
- /* For big allocs, give them whole pages. */
- if ( size + align_up(hdr_size, align) >= PAGE_SIZE )
- return xmalloc_whole_pages(size, align);
-
- /* Search free list. */
- /* spin_lock_irqsave(&freelist_lock, flags); */
- MINIOS_TAILQ_FOREACH_SAFE(i, &freelist, freelist, tmp)
- {
- data_begin = align_up((uintptr_t)i + hdr_size, align);
-
- if ( data_begin + size > (uintptr_t)i + i->size )
- continue;
-
- MINIOS_TAILQ_REMOVE(&freelist, i, freelist);
- /* spin_unlock_irqrestore(&freelist_lock, flags); */
-
- uintptr_t size_before = (data_begin - hdr_size) - (uintptr_t)i;
-
- if (size_before >= 2 * hdr_size) {
- /* Worth splitting the beginning */
- struct xmalloc_hdr *new_i = (void*)(data_begin - hdr_size);
- new_i->size = i->size - size_before;
- i->size = size_before;
- /* spin_lock_irqsave(&freelist_lock, flags); */
- MINIOS_TAILQ_INSERT_HEAD(&freelist, i, freelist);
- /* spin_unlock_irqrestore(&freelist_lock, flags); */
- i = new_i;
- }
- maybe_split(i, (data_begin + size) - (uintptr_t)i, i->size);
- hdr = i;
- break;
- }
-
- if (!hdr) {
- /* spin_unlock_irqrestore(&freelist_lock, flags); */
-
- /* Alloc a new page and return from that. */
- hdr = xmalloc_new_page(align_up(hdr_size, align) + size);
- if ( hdr == NULL )
- return NULL;
- data_begin = (uintptr_t)hdr + align_up(hdr_size, align);
- }
-
- struct xmalloc_pad *pad = (struct xmalloc_pad *) data_begin - 1;
- pad->hdr_size = data_begin - (uintptr_t)hdr;
- BUG_ON(data_begin % align);
- return (void*)data_begin;
-}
-
-void xfree(const void *p)
-{
- /* unsigned long flags; */
- struct xmalloc_hdr *i, *tmp, *hdr;
- struct xmalloc_pad *pad;
-
- if ( p == NULL )
- return;
-
- pad = (struct xmalloc_pad *)p - 1;
- hdr = (struct xmalloc_hdr *)((char *)p - pad->hdr_size);
-
- /* Big allocs free directly. */
- if ( hdr->size >= PAGE_SIZE )
- {
- free_pages(hdr, get_order(hdr->size));
- return;
- }
-
- /* We know hdr will be on the same page. */
- if(((long)p & PAGE_MASK) != ((long)hdr & PAGE_MASK))
- {
- printk("Header should be on the same page\n");
- *(int*)0=0;
- }
-
- /* Merge with other free block, or put in list. */
- /* spin_lock_irqsave(&freelist_lock, flags); */
- MINIOS_TAILQ_FOREACH_SAFE(i, &freelist, freelist, tmp)
- {
- unsigned long _i = (unsigned long)i;
- unsigned long _hdr = (unsigned long)hdr;
-
- /* Do not merge across page boundaries. */
- if ( ((_i ^ _hdr) & PAGE_MASK) != 0 )
- continue;
-
- /* We follow this block? Swallow it. */
- if ( (_i + i->size) == _hdr )
- {
- MINIOS_TAILQ_REMOVE(&freelist, i, freelist);
- i->size += hdr->size;
- hdr = i;
- }
-
- /* We precede this block? Swallow it. */
- if ( (_hdr + hdr->size) == _i )
- {
- MINIOS_TAILQ_REMOVE(&freelist, i, freelist);
- hdr->size += i->size;
- }
- }
-
- /* Did we merge an entire page? */
- if ( hdr->size == PAGE_SIZE )
- {
- if((((unsigned long)hdr) & (PAGE_SIZE-1)) != 0)
- {
- printk("Bug\n");
- *(int*)0=0;
- }
- free_page(hdr);
- }
- else
- {
- MINIOS_TAILQ_INSERT_HEAD(&freelist, hdr, freelist);
- }
-
- /* spin_unlock_irqrestore(&freelist_lock, flags); */
-}
-
-void *malloc(size_t size)
-{
- return _xmalloc(size, DEFAULT_ALIGN);
-}
-
-void *realloc(void *ptr, size_t size)
-{
- void *new;
- struct xmalloc_hdr *hdr;
- struct xmalloc_pad *pad;
- size_t old_data_size;
-
- if (ptr == NULL)
- return _xmalloc(size, DEFAULT_ALIGN);
-
- pad = (struct xmalloc_pad *)ptr - 1;
- hdr = (struct xmalloc_hdr *)((char*)ptr - pad->hdr_size);
-
- old_data_size = hdr->size - pad->hdr_size;
- if ( old_data_size >= size )
- {
- maybe_split(hdr, pad->hdr_size + size, hdr->size);
- return ptr;
- }
-
- new = _xmalloc(size, DEFAULT_ALIGN);
- if (new == NULL)
- return NULL;
-
- memcpy(new, ptr, old_data_size);
- xfree(ptr);
-
- return new;
-}
-
-void free(void *ptr)
-{
- xfree(ptr);
-}
-#endif
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
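
The essential bookkeeping in the allocator above is the xmalloc_pad word that
sits immediately below every pointer handed to the caller: it records how far
back the real xmalloc_hdr lives, so xfree() and realloc() can recover the
header without any global lookup. A self-contained toy reconstruction of that
scheme, backed by malloc rather than the page allocator (toy_alloc/toy_free
are hypothetical names, not mini-os functions):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct hdr { size_t size; };
    struct pad { size_t hdr_size; };

    static void *toy_alloc(size_t size)
    {
        size_t hdr_size = sizeof(struct hdr) + sizeof(struct pad);
        struct hdr *h = malloc(hdr_size + size);
        if (h == NULL)
            return NULL;
        h->size = hdr_size + size;
        char *data = (char *)h + hdr_size;
        ((struct pad *)data - 1)->hdr_size = hdr_size; /* just below data */
        return data;
    }

    static void toy_free(void *p)
    {
        struct pad *pad = (struct pad *)p - 1;
        free((char *)p - pad->hdr_size); /* walk back to the real header */
    }

    int main(void)
    {
        char *p = toy_alloc(32);
        strcpy(p, "hello");
        printf("%s\n", p);
        toy_free(p);
        return 0;
    }
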
diff --git a/extras/mini-os/lib/xs.c b/extras/mini-os/lib/xs.c
deleted file mode 100644
index 324bd05..0000000
--- a/extras/mini-os/lib/xs.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * libxs-compatible layer
- *
- * Samuel Thibault <Samuel.Thibault at eu.citrix.net>, 2007-2008
- *
- * Mere wrapper around xenbus_*
- */
-
-#ifdef HAVE_LIBC
-#include <os.h>
-#include <lib.h>
-#include <xenstore.h>
-#include <xenbus.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-static inline int _xs_fileno(struct xs_handle *h) {
- return (intptr_t) h;
-}
-
-struct xs_handle *xs_daemon_open()
-{
- int fd = alloc_fd(FTYPE_XENBUS);
- files[fd].xenbus.events = NULL;
- printk("xs_daemon_open -> %d, %p\n", fd, &files[fd].xenbus.events);
- return (void*)(intptr_t) fd;
-}
-
-void xs_daemon_close(struct xs_handle *h)
-{
- int fd = _xs_fileno(h);
- struct xenbus_event *event, *next;
- for (event = files[fd].xenbus.events; event; event = next)
- {
- next = event->next;
- free(event);
- }
- files[fd].type = FTYPE_NONE;
-}
-
-int xs_fileno(struct xs_handle *h)
-{
- return _xs_fileno(h);
-}
-
-void *xs_read(struct xs_handle *h, xs_transaction_t t,
- const char *path, unsigned int *len)
-{
- char *value;
- char *msg;
-
- msg = xenbus_read(t, path, &value);
- if (msg) {
- printk("xs_read(%s): %s\n", path, msg);
- free(msg);
- return NULL;
- }
-
- if (len)
- *len = strlen(value);
- return value;
-}
-
-bool xs_write(struct xs_handle *h, xs_transaction_t t,
- const char *path, const void *data, unsigned int len)
-{
- char value[len + 1];
- char *msg;
-
- memcpy(value, data, len);
- value[len] = 0;
-
- msg = xenbus_write(t, path, value);
- if (msg) {
- printk("xs_write(%s): %s\n", path, msg);
- free(msg);
- return false;
- }
- return true;
-}
-
-static bool xs_bool(char *reply)
-{
- if (!reply)
- return true;
- free(reply);
- return false;
-}
-
-bool xs_rm(struct xs_handle *h, xs_transaction_t t, const char *path)
-{
- return xs_bool(xenbus_rm(t, path));
-}
-
-static void *xs_talkv(struct xs_handle *h, xs_transaction_t t,
- enum xsd_sockmsg_type type,
- struct write_req *iovec,
- unsigned int num_vecs,
- unsigned int *len)
-{
- struct xsd_sockmsg *msg;
- void *ret;
-
- msg = xenbus_msg_reply(type, t, iovec, num_vecs);
- ret = malloc(msg->len);
- memcpy(ret, (char*) msg + sizeof(*msg), msg->len);
- if (len)
- *len = msg->len - 1;
- free(msg);
- return ret;
-}
-
-static void *xs_single(struct xs_handle *h, xs_transaction_t t,
- enum xsd_sockmsg_type type,
- const char *string,
- unsigned int *len)
-{
- struct write_req iovec;
-
- iovec.data = (void *)string;
- iovec.len = strlen(string) + 1;
-
- return xs_talkv(h, t, type, &iovec, 1, len);
-}
-
-char *xs_get_domain_path(struct xs_handle *h, unsigned int domid)
-{
- char domid_str[MAX_STRLEN(domid)];
-
- sprintf(domid_str, "%u", domid);
-
- return xs_single(h, XBT_NULL, XS_GET_DOMAIN_PATH, domid_str, NULL);
-}
-
-char **xs_directory(struct xs_handle *h, xs_transaction_t t,
- const char *path, unsigned int *num)
-{
- char *msg;
- char **entries, **res;
- char *entry;
- int i, n;
- int size;
-
- msg = xenbus_ls(t, path, &res);
- if (msg) {
- printk("xs_directory(%s): %s\n", path, msg);
- free(msg);
- return NULL;
- }
-
- size = 0;
- for (n = 0; res[n]; n++)
- size += strlen(res[n]) + 1;
-
- entries = malloc(n * sizeof(char *) + size);
- entry = (char *) (&entries[n]);
-
- for (i = 0; i < n; i++) {
- int l = strlen(res[i]) + 1;
- memcpy(entry, res[i], l);
- free(res[i]);
- entries[i] = entry;
- entry += l;
- }
-
- *num = n;
- free(res);
- return entries;
-}
-
-bool xs_watch(struct xs_handle *h, const char *path, const char *token)
-{
- int fd = _xs_fileno(h);
- printk("xs_watch(%s, %s)\n", path, token);
- return xs_bool(xenbus_watch_path_token(XBT_NULL, path, token, &files[fd].xenbus.events));
-}
-
-char **xs_read_watch(struct xs_handle *h, unsigned int *num)
-{
- int fd = _xs_fileno(h);
- struct xenbus_event *event;
- event = files[fd].xenbus.events;
- files[fd].xenbus.events = event->next;
- printk("xs_read_watch() -> %s %s\n", event->path, event->token);
- *num = 2;
- return (char **) &event->path;
-}
-
-bool xs_unwatch(struct xs_handle *h, const char *path, const char *token)
-{
- printk("xs_unwatch(%s, %s)\n", path, token);
- return xs_bool(xenbus_unwatch_path_token(XBT_NULL, path, token));
-}
-#endif
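
Note the handle encoding this compatibility layer relies on: the struct
xs_handle pointer returned by xs_daemon_open() is never dereferenced, it just
smuggles a small integer file descriptor through an opaque pointer type. A
minimal standalone illustration of the round trip (fd_to_handle/handle_to_fd
are hypothetical helper names):

    #include <stdint.h>
    #include <stdio.h>

    struct xs_handle; /* opaque: deliberately never defined */

    static struct xs_handle *fd_to_handle(int fd)
    {
        return (struct xs_handle *)(intptr_t)fd;
    }

    static int handle_to_fd(struct xs_handle *h)
    {
        return (int)(intptr_t)h;
    }

    int main(void)
    {
        struct xs_handle *h = fd_to_handle(42);
        printf("fd round-trips: %d\n", handle_to_fd(h)); /* prints 42 */
        return 0;
    }
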
diff --git a/extras/mini-os/lock.c b/extras/mini-os/lock.c
deleted file mode 100644
index 61194e5..0000000
--- a/extras/mini-os/lock.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * locks for newlib
- *
- * Samuel Thibault <Samuel.Thibault at eu.citrix.net>, July 2008
- */
-
-#ifdef HAVE_LIBC
-
-#include <sys/lock.h>
-#include <sched.h>
-#include <wait.h>
-#include <mini-os/lib.h>
-
-int ___lock_init(_LOCK_T *lock)
-{
- lock->busy = 0;
- init_waitqueue_head(&lock->wait);
- return 0;
-}
-
-int ___lock_acquire(_LOCK_T *lock)
-{
- unsigned long flags;
- while(1) {
- wait_event(lock->wait, !lock->busy);
- local_irq_save(flags);
- if (!lock->busy)
- break;
- local_irq_restore(flags);
- }
- lock->busy = 1;
- local_irq_restore(flags);
- return 0;
-}
-
-int ___lock_try_acquire(_LOCK_T *lock)
-{
- unsigned long flags;
- int ret = -1;
- local_irq_save(flags);
- if (!lock->busy) {
- lock->busy = 1;
- ret = 0;
- }
- local_irq_restore(flags);
- return ret;
-}
-
-int ___lock_release(_LOCK_T *lock)
-{
- unsigned long flags;
- local_irq_save(flags);
- lock->busy = 0;
- wake_up(&lock->wait);
- local_irq_restore(flags);
- return 0;
-}
-
-
-int ___lock_init_recursive(_LOCK_RECURSIVE_T *lock)
-{
- lock->owner = NULL;
- init_waitqueue_head(&lock->wait);
- return 0;
-}
-
-int ___lock_acquire_recursive(_LOCK_RECURSIVE_T *lock)
-{
- unsigned long flags;
- if (lock->owner != get_current()) {
- while (1) {
- wait_event(lock->wait, lock->owner == NULL);
- local_irq_save(flags);
- if (lock->owner == NULL)
- break;
- local_irq_restore(flags);
- }
- lock->owner = get_current();
- local_irq_restore(flags);
- }
- lock->count++;
- return 0;
-}
-
-int ___lock_try_acquire_recursive(_LOCK_RECURSIVE_T *lock)
-{
- unsigned long flags;
- int ret = -1;
- local_irq_save(flags);
- if (!lock->owner) {
- ret = 0;
- lock->owner = get_current();
- lock->count++;
- }
- local_irq_restore(flags);
- return ret;
-}
-
-int ___lock_release_recursive(_LOCK_RECURSIVE_T *lock)
-{
- unsigned long flags;
- BUG_ON(lock->owner != get_current());
- if (--lock->count)
- return 0;
- local_irq_save(flags);
- lock->owner = NULL;
- wake_up(&lock->wait);
- local_irq_restore(flags);
- return 0;
-}
-
-#endif
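
The acquire paths above all share one shape: wait until the lock looks free,
then re-check and take it with interrupts disabled, and go back to waiting if
another thread got there first (on mini-os's uniprocessor model, disabling
interrupts is sufficient mutual exclusion). The same check-then-recheck shape
can be sketched on a hosted system with C11 atomics; this is an analogue for
illustration only, not mini-os code:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool busy;

    static void lock_acquire(void)
    {
        for (;;) {
            while (atomic_load(&busy))
                ; /* corresponds to wait_event(): wait until it looks free */
            bool expected = false;
            if (atomic_compare_exchange_strong(&busy, &expected, true))
                break; /* we won the race; otherwise someone else did, retry */
        }
    }

    static void lock_release(void)
    {
        atomic_store(&busy, false);
    }

    int main(void)
    {
        lock_acquire();
        puts("locked");
        lock_release();
        puts("unlocked");
        return 0;
    }
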
diff --git a/extras/mini-os/lwip-arch.c b/extras/mini-os/lwip-arch.c
deleted file mode 100644
index e634ef4..0000000
--- a/extras/mini-os/lwip-arch.c
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
- * lwip-arch.c
- *
- * Arch-specific semaphores and mailboxes for lwIP running on mini-os
- *
- * Tim Deegan <Tim.Deegan at eu.citrix.net>, July 2007
- */
-
-#include <os.h>
-#include <time.h>
-#include <console.h>
-#include <xmalloc.h>
-#include <lwip/sys.h>
-#include <stdarg.h>
-
-/* Is called to initialize the sys_arch layer */
-void sys_init(void)
-{
-}
-
-/* Creates and returns a new semaphore. The "count" argument specifies
- * the initial state of the semaphore. */
-sys_sem_t sys_sem_new(uint8_t count)
-{
- struct semaphore *sem = xmalloc(struct semaphore);
- sem->count = count;
- init_waitqueue_head(&sem->wait);
- return sem;
-}
-
-/* Deallocates a semaphore. */
-void sys_sem_free(sys_sem_t sem)
-{
- xfree(sem);
-}
-
-/* Signals a semaphore. */
-void sys_sem_signal(sys_sem_t sem)
-{
- up(sem);
-}
-
-/* Blocks the thread while waiting for the semaphore to be
- * signaled. If the "timeout" argument is non-zero, the thread should
- * only be blocked for the specified time (measured in
- * milliseconds).
- *
- * If the timeout argument is non-zero, the return value is the number of
- * milliseconds spent waiting for the semaphore to be signaled. If the
- * semaphore wasn't signaled within the specified time, the return value is
- * SYS_ARCH_TIMEOUT. If the thread didn't have to wait for the semaphore
- * (i.e., it was already signaled), the function may return zero. */
-uint32_t sys_arch_sem_wait(sys_sem_t sem, uint32_t timeout)
-{
- /* Slightly more complicated than the normal minios semaphore:
- * need to wake on timeout *or* signal */
- sys_prot_t prot;
- int64_t then = NOW();
- int64_t deadline;
-
- if (timeout == 0)
- deadline = 0;
- else
- deadline = then + MILLISECS(timeout);
-
- while(1) {
- wait_event_deadline(sem->wait, (sem->count > 0), deadline);
-
- prot = sys_arch_protect();
- /* Atomically check that we can proceed */
- if (sem->count > 0 || (deadline && NOW() >= deadline))
- break;
- sys_arch_unprotect(prot);
- }
-
- if (sem->count > 0) {
- sem->count--;
- sys_arch_unprotect(prot);
- return NSEC_TO_MSEC(NOW() - then);
- }
-
- sys_arch_unprotect(prot);
- return SYS_ARCH_TIMEOUT;
-}
-
-/* Creates an empty mailbox. */
-sys_mbox_t sys_mbox_new(int size)
-{
- struct mbox *mbox = xmalloc(struct mbox);
- if (!size)
- size = 32;
- else if (size == 1)
- size = 2;
- mbox->count = size;
- mbox->messages = xmalloc_array(void*, size);
- init_SEMAPHORE(&mbox->read_sem, 0);
- mbox->reader = 0;
- init_SEMAPHORE(&mbox->write_sem, size);
- mbox->writer = 0;
- return mbox;
-}
-
-/* Deallocates a mailbox. If there are messages still present in the
- * mailbox when the mailbox is deallocated, it is an indication of a
- * programming error in lwIP and the developer should be notified. */
-void sys_mbox_free(sys_mbox_t mbox)
-{
- ASSERT(mbox->reader == mbox->writer);
- xfree(mbox->messages);
- xfree(mbox);
-}
-
-/* Posts the "msg" to the mailbox, internal version that actually does the
- * post. */
-static void do_mbox_post(sys_mbox_t mbox, void *msg)
-{
- /* The caller got a semaphore token, so we are now allowed to increment
- * writer, but we still need to prevent concurrency between writers
- * (interrupt handler vs main) */
- sys_prot_t prot = sys_arch_protect();
- mbox->messages[mbox->writer] = msg;
- mbox->writer = (mbox->writer + 1) % mbox->count;
- ASSERT(mbox->reader != mbox->writer);
- sys_arch_unprotect(prot);
- up(&mbox->read_sem);
-}
-
-/* Posts the "msg" to the mailbox. */
-void sys_mbox_post(sys_mbox_t mbox, void *msg)
-{
- if (mbox == SYS_MBOX_NULL)
- return;
- down(&mbox->write_sem);
- do_mbox_post(mbox, msg);
-}
-
-/* Try to post the "msg" to the mailbox. */
-err_t sys_mbox_trypost(sys_mbox_t mbox, void *msg)
-{
- if (mbox == SYS_MBOX_NULL)
- return ERR_BUF;
- if (!trydown(&mbox->write_sem))
- return ERR_MEM;
- do_mbox_post(mbox, msg);
- return ERR_OK;
-}
-
-/*
- * Fetch a message from a mailbox. Internal version that actually does the
- * fetch.
- */
-static void do_mbox_fetch(sys_mbox_t mbox, void **msg)
-{
- sys_prot_t prot;
- /* The caller got a semaphore token, so we are now allowed to increment
- * reader, but we may still need to prevent concurrency between readers.
- * FIXME: can there be concurrent readers? */
- prot = sys_arch_protect();
- ASSERT(mbox->reader != mbox->writer);
- if (msg != NULL)
- *msg = mbox->messages[mbox->reader];
- mbox->reader = (mbox->reader + 1) % mbox->count;
- sys_arch_unprotect(prot);
- up(&mbox->write_sem);
-}
-
-/* Blocks the thread until a message arrives in the mailbox, but does
- * not block the thread longer than "timeout" milliseconds (similar to
- * the sys_arch_sem_wait() function). The "msg" argument is a result
- * parameter that is set by the function (i.e., by doing "*msg =
- * ptr"). The "msg" parameter maybe NULL to indicate that the message
- * should be dropped.
- *
- * The return values are the same as for the sys_arch_sem_wait() function:
- * Number of milliseconds spent waiting or SYS_ARCH_TIMEOUT if there was a
- * timeout. */
-uint32_t sys_arch_mbox_fetch(sys_mbox_t mbox, void **msg, uint32_t timeout)
-{
- uint32_t rv;
- if (mbox == SYS_MBOX_NULL)
- return SYS_ARCH_TIMEOUT;
-
- rv = sys_arch_sem_wait(&mbox->read_sem, timeout);
- if ( rv == SYS_ARCH_TIMEOUT )
- return rv;
-
- do_mbox_fetch(mbox, msg);
- return 0;
-}
-
-/* This is similar to sys_arch_mbox_fetch, however if a message is not
- * present in the mailbox, it immediately returns with the code
- * SYS_MBOX_EMPTY. On success 0 is returned.
- *
- * To allow for efficient implementations, this can be defined as a
- * function-like macro in sys_arch.h instead of a normal function. For
- * example, a naive implementation could be:
- * #define sys_arch_mbox_tryfetch(mbox,msg) \
- * sys_arch_mbox_fetch(mbox,msg,1)
- * although this would introduce unnecessary delays. */
-
-uint32_t sys_arch_mbox_tryfetch(sys_mbox_t mbox, void **msg) {
- if (mbox == SYS_MBOX_NULL)
- return SYS_ARCH_TIMEOUT;
-
- if (!trydown(&mbox->read_sem))
- return SYS_MBOX_EMPTY;
-
- do_mbox_fetch(mbox, msg);
- return 0;
-}
-
-
-/* Returns a pointer to the per-thread sys_timeouts structure. In lwIP,
- * each thread has a list of timeouts which is represented as a linked
- * list of sys_timeout structures. The sys_timeouts structure holds a
- * pointer to a linked list of timeouts. This function is called by
- * the lwIP timeout scheduler and must not return a NULL value.
- *
- * In a single threaded sys_arch implementation, this function will
- * simply return a pointer to a global sys_timeouts variable stored in
- * the sys_arch module. */
-struct sys_timeouts *sys_arch_timeouts(void)
-{
- static struct sys_timeouts timeout;
- return &timeout;
-}
-
-
-/* Starts a new thread with priority "prio" that will begin its execution in the
- * function "thread()". The "arg" argument will be passed as an argument to the
- * thread() function. The id of the new thread is returned. Both the id and
- * the priority are system dependent. */
-static struct thread *lwip_thread;
-sys_thread_t sys_thread_new(char *name, void (* thread)(void *arg), void *arg, int stacksize, int prio)
-{
- struct thread *t;
- if (stacksize > STACK_SIZE) {
- printk("Can't start lwIP thread: stack size %d is too large for our %d\n", stacksize, STACK_SIZE);
- do_exit();
- }
- lwip_thread = t = create_thread(name, thread, arg);
- return t;
-}
-
-/* This optional function does a "fast" critical region protection and returns
- * the previous protection level. This function is only called during very short
- * critical regions. An embedded system which supports ISR-based drivers might
- * want to implement this function by disabling interrupts. Task-based systems
- * might want to implement this by using a mutex or disabling tasking. This
- * function should support recursive calls from the same task or interrupt. In
- * other words, sys_arch_protect() could be called while already protected. In
- * that case the return value indicates that it is already protected.
- *
- * sys_arch_protect() is only required if your port is supporting an operating
- * system. */
-sys_prot_t sys_arch_protect(void)
-{
- unsigned long flags;
- local_irq_save(flags);
- return flags;
-}
-
-/* This optional function does a "fast" set of critical region protection to the
- * value specified by pval. See the documentation for sys_arch_protect() for
- * more information. This function is only required if your port is supporting
- * an operating system. */
-void sys_arch_unprotect(sys_prot_t pval)
-{
- local_irq_restore(pval);
-}
-
-/* non-fatal, print a message. */
-void lwip_printk(char *fmt, ...)
-{
- va_list args;
- va_start(args, fmt);
- printk("lwIP: ");
- print(0, fmt, args);
- va_end(args);
-}
-
-/* fatal, print message and abandon execution. */
-void lwip_die(char *fmt, ...)
-{
- va_list args;
- va_start(args, fmt);
- printk("lwIP assertion failed: ");
- print(0, fmt, args);
- va_end(args);
- printk("\n");
- BUG();
-}
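
The mailbox above is a fixed-size ring: reader and writer advance modulo the
ring size, the read semaphore counts filled slots and the write semaphore
counts free ones, and reader == writer is treated as the invalid/empty state
(which is why sys_mbox_new() bumps a requested size of 1 up to 2). The bare
index arithmetic, stripped of the semaphores and sys_arch_protect() guards,
looks like this (single-threaded toy demo only):

    #include <stdio.h>

    #define RING_SIZE 4

    static void *messages[RING_SIZE];
    static int reader, writer;

    static void post(void *msg)
    {
        messages[writer] = msg;
        writer = (writer + 1) % RING_SIZE;
    }

    static void *fetch(void)
    {
        void *msg = messages[reader];
        reader = (reader + 1) % RING_SIZE;
        return msg;
    }

    int main(void)
    {
        int a = 1, b = 2;
        post(&a);
        post(&b);
        printf("%d %d\n", *(int *)fetch(), *(int *)fetch()); /* 1 2 */
        return 0;
    }
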
diff --git a/extras/mini-os/lwip-net.c b/extras/mini-os/lwip-net.c
deleted file mode 100644
index 449b70f..0000000
--- a/extras/mini-os/lwip-net.c
+++ /dev/null
@@ -1,386 +0,0 @@
-/*
- * lwip-net.c
- *
- * interface between lwIP's ethernet and Mini-os's netfront.
- * For now, support only one network interface, as mini-os does.
- *
- * Tim Deegan <Tim.Deegan at eu.citrix.net>, July 2007
- * based on lwIP's ethernetif.c skeleton file, copyrights as below.
- */
-
-
-/*
- * Copyright (c) 2001-2004 Swedish Institute of Computer Science.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification,
- * are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
- * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
- * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
- * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
- * OF SUCH DAMAGE.
- *
- * This file is part of the lwIP TCP/IP stack.
- *
- * Author: Adam Dunkels <adam at sics.se>
- *
- */
-
-#include <os.h>
-
-#include "lwip/opt.h"
-#include "lwip/def.h"
-#include "lwip/mem.h"
-#include "lwip/pbuf.h"
-#include "lwip/sys.h"
-
-#include <lwip/stats.h>
-#include <lwip/sys.h>
-#include <lwip/mem.h>
-#include <lwip/memp.h>
-#include <lwip/pbuf.h>
-#include <netif/etharp.h>
-#include <lwip/tcpip.h>
-#include <lwip/tcp.h>
-#include <lwip/netif.h>
-#include <lwip/dhcp.h>
-
-#include "netif/etharp.h"
-
-#include <netfront.h>
-
-/* Define those to better describe your network interface. */
-#define IFNAME0 'e'
-#define IFNAME1 'n'
-
-#define IF_IPADDR 0x00000000
-#define IF_NETMASK 0x00000000
-
-/* Only have one network interface at a time. */
-static struct netif *the_interface = NULL;
-
-static unsigned char rawmac[6];
-static struct netfront_dev *dev;
-
-/* Forward declarations. */
-static err_t netfront_output(struct netif *netif, struct pbuf *p,
- struct ip_addr *ipaddr);
-
-/*
- * low_level_output():
- *
- * Should do the actual transmission of the packet. The packet is
- * contained in the pbuf that is passed to the function. This pbuf
- * might be chained.
- *
- */
-
-static err_t
-low_level_output(struct netif *netif, struct pbuf *p)
-{
- if (!dev)
- return ERR_OK;
-
-#ifdef ETH_PAD_SIZE
- pbuf_header(p, -ETH_PAD_SIZE); /* drop the padding word */
-#endif
-
- /* Send the data from the pbuf to the interface, one pbuf at a
- time. The size of the data in each pbuf is kept in the ->len
- variable. */
- if (!p->next) {
- /* Only one fragment, can send it directly */
- netfront_xmit(dev, p->payload, p->len);
- } else {
- unsigned char data[p->tot_len], *cur;
- struct pbuf *q;
-
- for(q = p, cur = data; q != NULL; cur += q->len, q = q->next)
- memcpy(cur, q->payload, q->len);
- netfront_xmit(dev, data, p->tot_len);
- }
-
-#if ETH_PAD_SIZE
- pbuf_header(p, ETH_PAD_SIZE); /* reclaim the padding word */
-#endif
-
- LINK_STATS_INC(link.xmit);
-
- return ERR_OK;
-}
-
-
-
-/*
- * netfront_output():
- *
- * This function is called by the TCP/IP stack when an IP packet
- * should be sent. It calls the function called low_level_output() to
- * do the actual transmission of the packet.
- *
- */
-
-static err_t
-netfront_output(struct netif *netif, struct pbuf *p,
- struct ip_addr *ipaddr)
-{
-
- /* resolve hardware address, then send (or queue) packet */
- return etharp_output(netif, p, ipaddr);
-
-}
-
-/*
- * netfront_input():
- *
- * This function should be called when a packet is ready to be read
- * from the interface.
- *
- */
-
-static void
-netfront_input(struct netif *netif, unsigned char* data, int len)
-{
- struct eth_hdr *ethhdr;
- struct pbuf *p, *q;
-
-#if ETH_PAD_SIZE
- len += ETH_PAD_SIZE; /* allow room for Ethernet padding */
-#endif
-
- /* move received packet into a new pbuf */
- p = pbuf_alloc(PBUF_RAW, len, PBUF_POOL);
- if (p == NULL) {
- LINK_STATS_INC(link.memerr);
- LINK_STATS_INC(link.drop);
- return;
- }
-
-#if ETH_PAD_SIZE
- pbuf_header(p, -ETH_PAD_SIZE); /* drop the padding word */
-#endif
-
- /* We iterate over the pbuf chain until we have read the entire
- * packet into the pbuf. */
- for(q = p; q != NULL && len > 0; q = q->next) {
- /* Read enough bytes to fill this pbuf in the chain. The
- * available data in the pbuf is given by the q->len
- * variable. */
- memcpy(q->payload, data, len < q->len ? len : q->len);
- data += q->len;
- len -= q->len;
- }
-
-#if ETH_PAD_SIZE
- pbuf_header(p, ETH_PAD_SIZE); /* reclaim the padding word */
-#endif
-
- LINK_STATS_INC(link.recv);
-
- /* points to packet payload, which starts with an Ethernet header */
- ethhdr = p->payload;
-
- switch (htons(ethhdr->type)) {
- /* IP packet? */
- case ETHTYPE_IP:
-#if 0
-/* CSi disabled ARP table update on ingress IP packets.
- This seems to work but needs thorough testing. */
- /* update ARP table */
- etharp_ip_input(netif, p);
-#endif
- /* skip Ethernet header */
- pbuf_header(p, -(int16_t)sizeof(struct eth_hdr));
- /* pass to network layer */
- if (tcpip_input(p, netif) == ERR_MEM)
- /* Could not store it, drop */
- pbuf_free(p);
- break;
-
- case ETHTYPE_ARP:
- /* pass p to ARP module */
- etharp_arp_input(netif, (struct eth_addr *) netif->hwaddr, p);
- break;
-
- default:
- pbuf_free(p);
- p = NULL;
- break;
- }
-}
-
-
-/*
- * netif_rx(): overrides the default netif_rx behaviour in the netfront driver.
- *
- * Pull received packets into a pbuf queue for the low_level_input()
- * function to pass up to lwIP.
- */
-
-void netif_rx(unsigned char* data, int len)
-{
- if (the_interface != NULL) {
- netfront_input(the_interface, data, len);
- wake_up(&netfront_queue);
- }
- /* By returning, we ack the packet and relinquish the RX ring slot */
-}
-
-/*
- * Set the IP, mask and gateway of the IF
- */
-void networking_set_addr(struct ip_addr *ipaddr, struct ip_addr *netmask, struct ip_addr *gw)
-{
- netif_set_ipaddr(the_interface, ipaddr);
- netif_set_netmask(the_interface, netmask);
- netif_set_gw(the_interface, gw);
-}
-
-
-static void
-arp_timer(void *arg)
-{
- etharp_tmr();
- sys_timeout(ARP_TMR_INTERVAL, arp_timer, NULL);
-}
-
-/*
- * netif_netfront_init():
- *
- * Should be called at the beginning of the program to set up the
- * network interface. It calls the function low_level_init() to do the
- * actual setup of the hardware.
- *
- */
-
-err_t
-netif_netfront_init(struct netif *netif)
-{
- unsigned char *mac = netif->state;
-
-#if LWIP_SNMP
- /* ifType ethernetCsmacd(6) @see RFC1213 */
- netif->link_type = 6;
- /* your link speed here */
- netif->link_speed = ;
- netif->ts = 0;
- netif->ifinoctets = 0;
- netif->ifinucastpkts = 0;
- netif->ifinnucastpkts = 0;
- netif->ifindiscards = 0;
- netif->ifoutoctets = 0;
- netif->ifoutucastpkts = 0;
- netif->ifoutnucastpkts = 0;
- netif->ifoutdiscards = 0;
-#endif
-
- netif->name[0] = IFNAME0;
- netif->name[1] = IFNAME1;
- netif->output = netfront_output;
- netif->linkoutput = low_level_output;
-
- the_interface = netif;
-
- /* set MAC hardware address */
- netif->hwaddr_len = 6;
- netif->hwaddr[0] = mac[0];
- netif->hwaddr[1] = mac[1];
- netif->hwaddr[2] = mac[2];
- netif->hwaddr[3] = mac[3];
- netif->hwaddr[4] = mac[4];
- netif->hwaddr[5] = mac[5];
-
- /* No interesting per-interface state */
- netif->state = NULL;
-
- /* maximum transfer unit */
- netif->mtu = 1500;
-
- /* broadcast capability */
- netif->flags = NETIF_FLAG_BROADCAST;
-
- etharp_init();
-
- sys_timeout(ARP_TMR_INTERVAL, arp_timer, NULL);
-
- return ERR_OK;
-}
-
-/*
- * Thread run by netfront: bring up the IP address and fire lwIP timers.
- */
-static __DECLARE_SEMAPHORE_GENERIC(tcpip_is_up, 0);
-static void tcpip_bringup_finished(void *p)
-{
- tprintk("TCP/IP bringup ends.\n");
- up(&tcpip_is_up);
-}
-
-/*
- * Utility function to bring the whole lot up. Call this from app_main()
- * or similar -- it starts netfront and has lwIP start its thread,
- * which calls back to tcpip_bringup_finished(), which
- * lets us know it's OK to continue.
- */
-void start_networking(void)
-{
- struct netif *netif;
- struct ip_addr ipaddr = { htonl(IF_IPADDR) };
- struct ip_addr netmask = { htonl(IF_NETMASK) };
- struct ip_addr gw = { 0 };
- char *ip = NULL;
-
- tprintk("Waiting for network.\n");
-
- dev = init_netfront(NULL, NULL, rawmac, &ip);
-
- if (ip) {
- ipaddr.addr = inet_addr(ip);
- if (IN_CLASSA(ntohl(ipaddr.addr)))
- netmask.addr = htonl(IN_CLASSA_NET);
- else if (IN_CLASSB(ntohl(ipaddr.addr)))
- netmask.addr = htonl(IN_CLASSB_NET);
- else if (IN_CLASSC(ntohl(ipaddr.addr)))
- netmask.addr = htonl(IN_CLASSC_NET);
- else
- tprintk("Strange IP %s, leaving netmask to 0.\n", ip);
- }
- tprintk("IP %x netmask %x gateway %x.\n",
- ntohl(ipaddr.addr), ntohl(netmask.addr), ntohl(gw.addr));
-
- tprintk("TCP/IP bringup begins.\n");
-
- netif = xmalloc(struct netif);
- tcpip_init(tcpip_bringup_finished, netif);
-
- netif_add(netif, &ipaddr, &netmask, &gw, rawmac,
- netif_netfront_init, ip_input);
- netif_set_default(netif);
- netif_set_up(netif);
-
- down(&tcpip_is_up);
-
- tprintk("Network is ready.\n");
-}
-
-/* Shut down the network */
-void stop_networking(void)
-{
- if (dev)
- shutdown_netfront(dev);
-}
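
When no netmask is configured, start_networking() falls back on the historical
classful rules: the high bits of the address select a class A, B or C netmask.
A standalone demonstration using the same <netinet/in.h> macros (hosted build,
arbitrary example address):

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>

    int main(void)
    {
        struct in_addr ip;
        inet_pton(AF_INET, "192.168.1.5", &ip);
        unsigned long host = ntohl(ip.s_addr);

        unsigned long mask;
        if (IN_CLASSA(host))
            mask = IN_CLASSA_NET;  /* 255.0.0.0 */
        else if (IN_CLASSB(host))
            mask = IN_CLASSB_NET;  /* 255.255.0.0 */
        else
            mask = IN_CLASSC_NET;  /* 255.255.255.0 */

        printf("netmask 0x%08lx\n", mask); /* 0xffffff00 for this address */
        return 0;
    }
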
diff --git a/extras/mini-os/main.c b/extras/mini-os/main.c
deleted file mode 100644
index 4ec40b5..0000000
--- a/extras/mini-os/main.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * POSIX-compatible main layer
- *
- * Samuel Thibault <Samuel.Thibault at eu.citrix.net>, October 2007
- */
-
-#ifdef HAVE_LIBC
-#include <os.h>
-#include <sched.h>
-#include <console.h>
-#include <netfront.h>
-#include <pcifront.h>
-#include <time.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <xenbus.h>
-#include <events.h>
-#include <mini-os/lib.h>
-
-extern int main(int argc, char *argv[], char *envp[]);
-extern void __libc_init_array(void);
-extern void __libc_fini_array(void);
-extern unsigned long __CTOR_LIST__[];
-extern unsigned long __DTOR_LIST__[];
-
-#if 0
-#include <stdio.h>
-int main(int argc, char *argv[], char *envp[])
-{
- printf("Hello, World!\n");
- return 1;
-}
-#endif
-
-void _init(void)
-{
-}
-
-void _fini(void)
-{
-}
-
-extern char __app_bss_start, __app_bss_end;
-static void call_main(void *p)
-{
- char *c, quote;
-#ifdef CONFIG_QEMU_XS_ARGS
- char *domargs, *msg;
-#endif
- int argc;
- char **argv;
- char *envp[] = { NULL };
-#ifdef CONFIG_QEMU_XS_ARGS
- char *vm;
- char path[128];
- int domid;
-#endif
- int i;
-
- /* Let other parts initialize (including console output) before maybe
- * crashing. */
- //sleep(1);
-
-#ifdef CONFIG_SPARSE_BSS
- sparse((unsigned long) &__app_bss_start, &__app_bss_end - &__app_bss_start);
-#endif
-#if defined(HAVE_LWIP) && defined(CONFIG_START_NETWORK) && defined(CONFIG_NETFRONT)
- start_networking();
-#endif
-#ifdef CONFIG_PCIFRONT
- create_thread("pcifront", pcifront_watches, NULL);
-#endif
-
-#ifdef CONFIG_QEMU_XS_ARGS
- /* Fetch argc, argv from XenStore */
- domid = xenbus_read_integer("target");
- if (domid == -1) {
- printk("Couldn't read target\n");
- do_exit();
- }
-
- snprintf(path, sizeof(path), "/local/domain/%d/vm", domid);
- msg = xenbus_read(XBT_NIL, path, &vm);
- if (msg) {
- printk("Couldn't read vm path\n");
- do_exit();
- }
- printk("dom vm is at %s\n", vm);
-
- snprintf(path, sizeof(path), "%s/image/dmargs", vm);
- free(vm);
- msg = xenbus_read(XBT_NIL, path, &domargs);
-
- if (msg) {
- printk("Couldn't get stubdom args: %s\n", msg);
- domargs = strdup("");
- }
-#endif
-
- argc = 1;
-
-#define PARSE_ARGS(ARGS,START,QUOTE,END) \
- c = ARGS; \
- quote = 0; \
- while (*c) { \
- if (*c != ' ') { \
- START; \
- while (*c) { \
- if (quote) { \
- if (*c == quote) { \
- quote = 0; \
- QUOTE; \
- continue; \
- } \
- } else if (*c == ' ') \
- break; \
- if (*c == '"' || *c == '\'') { \
- quote = *c; \
- QUOTE; \
- continue; \
- } \
- c++; \
- } \
- } else { \
- END; \
- while (*c == ' ') \
- c++; \
- } \
- } \
- if (quote) {\
- printk("Warning: unterminated quotation %c\n", quote); \
- quote = 0; \
- }
-#define PARSE_ARGS_COUNT(ARGS) PARSE_ARGS(ARGS, argc++, c++, )
-#define PARSE_ARGS_STORE(ARGS) PARSE_ARGS(ARGS, argv[argc++] = c, memmove(c, c + 1, strlen(c + 1) + 1), *c++ = 0)
-
- PARSE_ARGS_COUNT((char*)start_info.cmd_line);
-#ifdef CONFIG_QEMU_XS_ARGS
- PARSE_ARGS_COUNT(domargs);
-#endif
-
- argv = alloca((argc + 1) * sizeof(char *));
- argv[0] = "main";
- argc = 1;
-
- PARSE_ARGS_STORE((char*)start_info.cmd_line)
-#ifdef CONFIG_QEMU_XS_ARGS
- PARSE_ARGS_STORE(domargs)
-#endif
-
- argv[argc] = NULL;
-
- for (i = 0; i < argc; i++)
- printf("\"%s\" ", argv[i]);
- printf("\n");
-
- __libc_init_array();
- environ = envp;
- for (i = 0; __CTOR_LIST__[i] != 0; i++)
- ((void((*)(void)))__CTOR_LIST__[i]) ();
- tzset();
-
- exit(main(argc, argv, envp));
-}
-
-void _exit(int ret)
-{
- int i;
-
- for (i = 0; __DTOR_LIST__[i] != 0; i++)
- ((void((*)(void)))__DTOR_LIST__[i]) ();
- close_all_files();
- __libc_fini_array();
- printk("main returned %d\n", ret);
-#if defined(HAVE_LWIP) && defined(CONFIG_NETFRONT)
- stop_networking();
-#endif
- stop_kernel();
- if (!ret) {
- /* No problem, just shutdown. */
- struct sched_shutdown sched_shutdown = { .reason = SHUTDOWN_poweroff };
- HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
- }
- do_exit();
-}
-
-int app_main(start_info_t *si)
-{
- printk("main.c: dummy main: start_info=%p\n", si);
- main_thread = create_thread("main", call_main, si);
- return 0;
-}
-#endif
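
The PARSE_ARGS machinery above makes two passes over the command line: one
instantiation counts the arguments so argv can be sized, the second splits the
string in place and stores the pointers. A simplified standalone version of
that two-pass scheme, with the quote handling omitted for brevity (toy code,
not the mini-os macros):

    #include <stdio.h>

    int main(void)
    {
        char cmdline[] = "  console=tty0  root=/dev/xvda1 ";
        int argc = 0, i = 0;

        /* Pass 1: count tokens so argv can be sized. */
        for (char *c = cmdline; *c; ) {
            while (*c == ' ') c++;
            if (*c) argc++;
            while (*c && *c != ' ') c++;
        }

        char *argv[argc + 1];

        /* Pass 2: split in place and record the token pointers. */
        for (char *c = cmdline; *c; ) {
            while (*c == ' ') *c++ = '\0';
            if (*c) argv[i++] = c;
            while (*c && *c != ' ') c++;
        }
        argv[argc] = NULL;

        for (i = 0; i < argc; i++)
            printf("argv[%d] = \"%s\"\n", i, argv[i]);
        return 0;
    }
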
diff --git a/extras/mini-os/minios.mk b/extras/mini-os/minios.mk
deleted file mode 100644
index f42f48b..0000000
--- a/extras/mini-os/minios.mk
+++ /dev/null
@@ -1,76 +0,0 @@
-#
-# This file contains the common make rules for building mini-os.
-#
-
-debug = y
-
-# Define some default flags.
-# NB. '-Wcast-qual' is nasty, so I omitted it.
-DEF_CFLAGS += -fno-builtin -Wall -Werror -Wredundant-decls -Wno-format -Wno-redundant-decls
-DEF_CFLAGS += $(call cc-option,$(CC),-fno-stack-protector,)
-DEF_CFLAGS += $(call cc-option,$(CC),-fgnu89-inline)
-DEF_CFLAGS += -Wstrict-prototypes -Wnested-externs -Wpointer-arith -Winline
-DEF_CPPFLAGS += -D__XEN_INTERFACE_VERSION__=$(XEN_INTERFACE_VERSION)
-
-DEF_ASFLAGS += -D__ASSEMBLY__
-DEF_LDFLAGS +=
-
-ifeq ($(debug),y)
-DEF_CFLAGS += -g
-#DEF_CFLAGS += -DMM_DEBUG
-#DEF_CFLAGS += -DFS_DEBUG
-#DEF_CFLAGS += -DLIBC_DEBUG
-#DEF_CFLAGS += -DGNT_DEBUG
-#DEF_CFLAGS += -DGNTMAP_DEBUG
-else
-DEF_CFLAGS += -O3
-endif
-
-# Make the headers define our internal stuff
-DEF_CFLAGS += -D__INSIDE_MINIOS__
-
-# Build the CFLAGS and ASFLAGS for compiling and assembling.
-# DEF_... flags are the common mini-os flags,
-# ARCH_... flags may be defined in arch/$(TARGET_ARCH_FAM)/rules.mk
-CFLAGS := $(DEF_CFLAGS) $(ARCH_CFLAGS)
-CPPFLAGS := $(DEF_CPPFLAGS) $(ARCH_CPPFLAGS)
-ASFLAGS := $(DEF_ASFLAGS) $(ARCH_ASFLAGS)
-LDFLAGS := $(DEF_LDFLAGS) $(ARCH_LDFLAGS)
-
-# Special build dependencies.
-# Rebuild all after touching this/these file(s)
-EXTRA_DEPS += $(MINI-OS_ROOT)/minios.mk
-EXTRA_DEPS += $(MINI-OS_ROOT)/$(TARGET_ARCH_DIR)/arch.mk
-
-# Find all header files for checking dependencies.
-HDRS := $(wildcard $(MINI-OS_ROOT)/include/*.h)
-HDRS += $(wildcard $(MINI-OS_ROOT)/include/xen/*.h)
-HDRS += $(wildcard $(ARCH_INC)/*.h)
-# For special wanted header directories.
-extra_heads := $(foreach dir,$(EXTRA_INC),$(wildcard $(dir)/*.h))
-HDRS += $(extra_heads)
-
-# Add the special header directories to the include paths.
-override CPPFLAGS := $(CPPFLAGS) $(extra_incl)
-
-# The name of the architecture specific library.
-# This is on x86_32: libx86_32.a
-# $(ARCH_LIB) has to be built in the architecture specific directory.
-ARCH_LIB_NAME = $(XEN_TARGET_ARCH)
-ARCH_LIB := lib$(ARCH_LIB_NAME).a
-
-# This object contains the entrypoint for startup from Xen.
-# $(HEAD_ARCH_OBJ) has to be built in the architecture specific directory.
-HEAD_ARCH_OBJ := $(XEN_TARGET_ARCH).o
-HEAD_OBJ := $(OBJ_DIR)/$(TARGET_ARCH_DIR)/$(HEAD_ARCH_OBJ)
-
-
-$(OBJ_DIR)/%.o: %.c $(HDRS) Makefile $(EXTRA_DEPS)
- $(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@
-
-$(OBJ_DIR)/%.o: %.S $(HDRS) Makefile $(EXTRA_DEPS)
- $(CC) $(ASFLAGS) $(CPPFLAGS) -c $< -o $@
-
-
-
-
diff --git a/extras/mini-os/mm.c b/extras/mini-os/mm.c
deleted file mode 100644
index 64b3292..0000000
--- a/extras/mini-os/mm.c
+++ /dev/null
@@ -1,441 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- ****************************************************************************
- *
- * File: mm.c
- * Author: Rolf Neugebauer (neugebar at dcs.gla.ac.uk)
- * Changes: Grzegorz Milos
- *
- * Date: Aug 2003, changes Aug 2005
- *
- * Environment: Xen Minimal OS
- * Description: memory management related functions
- * contains buddy page allocator from Xen.
- *
- ****************************************************************************
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <mini-os/os.h>
-#include <mini-os/hypervisor.h>
-#include <xen/memory.h>
-#include <mini-os/mm.h>
-#include <mini-os/types.h>
-#include <mini-os/lib.h>
-#include <mini-os/xmalloc.h>
-
-#ifdef MM_DEBUG
-#define DEBUG(_f, _a...) \
- printk("MINI_OS(file=mm.c, line=%d) " _f "\n", __LINE__, ## _a)
-#else
-#define DEBUG(_f, _a...) ((void)0)
-#endif
-
-/*********************
- * ALLOCATION BITMAP
- * One bit per page of memory. Bit set => page is allocated.
- */
-
-static unsigned long *alloc_bitmap;
-#define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8)
-
-#define allocated_in_map(_pn) \
-(alloc_bitmap[(_pn)/PAGES_PER_MAPWORD] & (1UL<<((_pn)&(PAGES_PER_MAPWORD-1))))
-
-/*
- * Hint regarding bitwise arithmetic in map_{alloc,free}:
- * -(1<<n) sets all bits >= n.
- * (1<<n)-1 sets all bits < n.
- * Variable names in map_{alloc,free}:
- * *_idx == Index into `alloc_bitmap' array.
- * *_off == Bit offset within an element of the `alloc_bitmap' array.
- */
-
-static void map_alloc(unsigned long first_page, unsigned long nr_pages)
-{
- unsigned long start_off, end_off, curr_idx, end_idx;
-
- curr_idx = first_page / PAGES_PER_MAPWORD;
- start_off = first_page & (PAGES_PER_MAPWORD-1);
- end_idx = (first_page + nr_pages) / PAGES_PER_MAPWORD;
- end_off = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);
-
- if ( curr_idx == end_idx )
- {
- alloc_bitmap[curr_idx] |= ((1UL<<end_off)-1) & -(1UL<<start_off);
- }
- else
- {
- alloc_bitmap[curr_idx] |= -(1UL<<start_off);
- while ( ++curr_idx < end_idx ) alloc_bitmap[curr_idx] = ~0UL;
- alloc_bitmap[curr_idx] |= (1UL<<end_off)-1;
- }
-}
-
-
-static void map_free(unsigned long first_page, unsigned long nr_pages)
-{
- unsigned long start_off, end_off, curr_idx, end_idx;
-
- curr_idx = first_page / PAGES_PER_MAPWORD;
- start_off = first_page & (PAGES_PER_MAPWORD-1);
- end_idx = (first_page + nr_pages) / PAGES_PER_MAPWORD;
- end_off = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);
-
- if ( curr_idx == end_idx )
- {
- alloc_bitmap[curr_idx] &= -(1UL<<end_off) | ((1UL<<start_off)-1);
- }
- else
- {
- alloc_bitmap[curr_idx] &= (1UL<<start_off)-1;
- while ( ++curr_idx != end_idx ) alloc_bitmap[curr_idx] = 0;
- alloc_bitmap[curr_idx] &= -(1UL<<end_off);
- }
-}
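
The hint above is worth seeing with concrete numbers: -(1UL << n) has every
bit at position n and above set, (1UL << n) - 1 has every bit below n set,
and ANDing the two forms isolates an arbitrary bit range, which is exactly
how map_alloc() and map_free() build their word masks. A toy demonstration:

    #include <stdio.h>

    int main(void)
    {
        unsigned long start = 3, end = 9;
        unsigned long mask = ((1UL << end) - 1) & -(1UL << start);
        printf("mask = 0x%lx\n", mask); /* 0x1f8: bits 3..8 inclusive */
        return 0;
    }
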
-
-
-
-/*************************
- * BINARY BUDDY ALLOCATOR
- */
-
-typedef struct chunk_head_st chunk_head_t;
-typedef struct chunk_tail_st chunk_tail_t;
-
-struct chunk_head_st {
- chunk_head_t *next;
- chunk_head_t **pprev;
- int level;
-};
-
-struct chunk_tail_st {
- int level;
-};
-
-/* Linked lists of free chunks of different powers-of-two in size. */
-#define FREELIST_SIZE ((sizeof(void*)<<3)-PAGE_SHIFT)
-static chunk_head_t *free_head[FREELIST_SIZE];
-static chunk_head_t free_tail[FREELIST_SIZE];
-#define FREELIST_EMPTY(_l) ((_l)->next == NULL)
-
-#define round_pgdown(_p) ((_p)&PAGE_MASK)
-#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
-
-#ifdef MM_DEBUG
-/*
- * Prints allocation[0/1] for @nr_pages, starting at @start
- * address (virtual).
- */
-USED static void print_allocation(void *start, int nr_pages)
-{
- unsigned long pfn_start = virt_to_pfn(start);
- int count;
- for(count = 0; count < nr_pages; count++)
- if(allocated_in_map(pfn_start + count)) printk("1");
- else printk("0");
-
- printk("\n");
-}
-
-/*
- * Prints chunks (making them with letters) for @nr_pages starting
- * at @start (virtual).
- */
-USED static void print_chunks(void *start, int nr_pages)
-{
- char chunks[1001], current='A';
- int order, count;
- chunk_head_t *head;
- unsigned long pfn_start = virt_to_pfn(start);
-
- memset(chunks, (int)'_', 1000);
- if(nr_pages > 1000)
- {
- DEBUG("Can only pring 1000 pages. Increase buffer size.");
- }
-
- for(order=0; order < FREELIST_SIZE; order++)
- {
- head = free_head[order];
- while(!FREELIST_EMPTY(head))
- {
- for(count = 0; count < 1UL<< head->level; count++)
- {
- if(count + virt_to_pfn(head) - pfn_start < 1000)
- chunks[count + virt_to_pfn(head) - pfn_start] = current;
- }
- head = head->next;
- current++;
- }
- }
- chunks[nr_pages] = '\0';
- printk("%s\n", chunks);
-}
-#endif
-
-
-/*
- * Initialise allocator, placing addresses [@min, @max] in free pool.
- * @min and @max are PHYSICAL addresses.
- */
-static void init_page_allocator(unsigned long min, unsigned long max)
-{
- int i;
- unsigned long range, bitmap_size;
- chunk_head_t *ch;
- chunk_tail_t *ct;
- for ( i = 0; i < FREELIST_SIZE; i++ )
- {
- free_head[i] = &free_tail[i];
- free_tail[i].pprev = &free_head[i];
- free_tail[i].next = NULL;
- }
-
- min = round_pgup (min);
- max = round_pgdown(max);
-
- /* Allocate space for the allocation bitmap. */
- bitmap_size = (max+1) >> (PAGE_SHIFT+3);
- bitmap_size = round_pgup(bitmap_size);
- alloc_bitmap = (unsigned long *)to_virt(min);
- min += bitmap_size;
- range = max - min;
-
- /* All allocated by default. */
- memset(alloc_bitmap, ~0, bitmap_size);
- /* Free up the memory we've been given to play with. */
- map_free(PHYS_PFN(min), range>>PAGE_SHIFT);
-
- /* The buddy lists are addressed in high memory. */
- min = (unsigned long) to_virt(min);
- max = (unsigned long) to_virt(max);
-
- while ( range != 0 )
- {
- /*
- * Next chunk is limited by alignment of min, but also
- * must not be bigger than remaining range.
- */
- for ( i = PAGE_SHIFT; (1UL<<(i+1)) <= range; i++ )
- if ( min & (1UL<<i) ) break;
-
-
- ch = (chunk_head_t *)min;
- min += (1UL<<i);
- range -= (1UL<<i);
- ct = (chunk_tail_t *)min-1;
- i -= PAGE_SHIFT;
- ch->level = i;
- ch->next = free_head[i];
- ch->pprev = &free_head[i];
- ch->next->pprev = &ch->next;
- free_head[i] = ch;
- ct->level = i;
- }
-}
-
-
-/* Allocate 2^@order contiguous pages. Returns a VIRTUAL address. */
-unsigned long alloc_pages(int order)
-{
- int i;
- chunk_head_t *alloc_ch, *spare_ch;
- chunk_tail_t *spare_ct;
-
-
- /* Find smallest order which can satisfy the request. */
- for ( i = order; i < FREELIST_SIZE; i++ ) {
- if ( !FREELIST_EMPTY(free_head[i]) )
- break;
- }
-
- if ( i == FREELIST_SIZE ) goto no_memory;
-
- /* Unlink a chunk. */
- alloc_ch = free_head[i];
- free_head[i] = alloc_ch->next;
- alloc_ch->next->pprev = alloc_ch->pprev;
-
- /* We may have to break the chunk a number of times. */
- while ( i != order )
- {
- /* Split into two equal parts. */
- i--;
- spare_ch = (chunk_head_t *)((char *)alloc_ch + (1UL<<(i+PAGE_SHIFT)));
- spare_ct = (chunk_tail_t *)((char *)spare_ch + (1UL<<(i+PAGE_SHIFT)))-1;
-
- /* Create new header for spare chunk. */
- spare_ch->level = i;
- spare_ch->next = free_head[i];
- spare_ch->pprev = &free_head[i];
- spare_ct->level = i;
-
- /* Link in the spare chunk. */
- spare_ch->next->pprev = &spare_ch->next;
- free_head[i] = spare_ch;
- }
-
- map_alloc(PHYS_PFN(to_phys(alloc_ch)), 1UL<<order);
-
- return((unsigned long)alloc_ch);
-
- no_memory:
-
- printk("Cannot handle page request order %d!\n", order);
-
- return 0;
-}
-
-void free_pages(void *pointer, int order)
-{
- chunk_head_t *freed_ch, *to_merge_ch;
- chunk_tail_t *freed_ct;
- unsigned long mask;
-
- /* First free the chunk */
- map_free(virt_to_pfn(pointer), 1UL << order);
-
- /* Create free chunk */
- freed_ch = (chunk_head_t *)pointer;
- freed_ct = (chunk_tail_t *)((char *)pointer + (1UL<<(order + PAGE_SHIFT)))-1;
-
- /* Now, possibly we can coalesce chunks together */
- while(order < FREELIST_SIZE)
- {
- mask = 1UL << (order + PAGE_SHIFT);
- if((unsigned long)freed_ch & mask)
- {
- to_merge_ch = (chunk_head_t *)((char *)freed_ch - mask);
- if(allocated_in_map(virt_to_pfn(to_merge_ch)) ||
- to_merge_ch->level != order)
- break;
-
- /* Merge with predecessor */
- freed_ch = to_merge_ch;
- }
- else
- {
- to_merge_ch = (chunk_head_t *)((char *)freed_ch + mask);
- if(allocated_in_map(virt_to_pfn(to_merge_ch)) ||
- to_merge_ch->level != order)
- break;
-
- /* Merge with successor */
- freed_ct = (chunk_tail_t *)((char *)to_merge_ch + mask) - 1;
- }
-
- /* We are committed to merging, unlink the chunk */
- *(to_merge_ch->pprev) = to_merge_ch->next;
- to_merge_ch->next->pprev = to_merge_ch->pprev;
-
- order++;
- }
-
- /* Link the new chunk */
- freed_ch->level = order;
- freed_ch->next = free_head[order];
- freed_ch->pprev = &free_head[order];
- freed_ct->level = order;
-
- freed_ch->next->pprev = &freed_ch->next;
- free_head[order] = freed_ch;
-
-}
-
-int free_physical_pages(xen_pfn_t *mfns, int n)
-{
- struct xen_memory_reservation reservation;
-
- set_xen_guest_handle(reservation.extent_start, mfns);
- reservation.nr_extents = n;
- reservation.extent_order = 0;
- reservation.domid = DOMID_SELF;
- return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
-}
-
-#ifdef HAVE_LIBC
-void *sbrk(ptrdiff_t increment)
-{
- unsigned long old_brk = brk;
- unsigned long new_brk = old_brk + increment;
-
- if (new_brk > heap_end) {
- printk("Heap exhausted: %p + %lx = %p > %p\n", old_brk, increment, new_brk, heap_end);
- return NULL;
- }
-
- if (new_brk > heap_mapped) {
- unsigned long n = (new_brk - heap_mapped + PAGE_SIZE - 1) / PAGE_SIZE;
- do_map_zero(heap_mapped, n);
- heap_mapped += n * PAGE_SIZE;
- }
-
- brk = new_brk;
-
- return (void *) old_brk;
-}
-#endif
-
-
-
-void init_mm(void)
-{
-
- unsigned long start_pfn, max_pfn;
-
- printk("MM: Init\n");
-
- arch_init_mm(&start_pfn, &max_pfn);
- /*
- * now we can initialise the page allocator
- */
- printk("MM: Initialise page allocator for %lx(%lx)-%lx(%lx)\n",
- (u_long)to_virt(PFN_PHYS(start_pfn)), (u_long)PFN_PHYS(start_pfn),
- (u_long)to_virt(PFN_PHYS(max_pfn)), (u_long)PFN_PHYS(max_pfn));
- init_page_allocator(PFN_PHYS(start_pfn), PFN_PHYS(max_pfn));
- printk("MM: done\n");
-
- arch_init_p2m(max_pfn);
-
- arch_init_demand_mapping_area(max_pfn);
-}
-
-void fini_mm(void)
-{
-}
-
-void sanity_check(void)
-{
- int x;
- chunk_head_t *head;
-
- for (x = 0; x < FREELIST_SIZE; x++) {
- for (head = free_head[x]; !FREELIST_EMPTY(head); head = head->next) {
- ASSERT(!allocated_in_map(virt_to_pfn(head)));
- if (head->next)
- ASSERT(head->next->pprev == &head->next);
- }
- if (free_head[x]) {
- ASSERT(free_head[x]->pprev == &free_head[x]);
- }
- }
-}
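
The merge loop in free_pages() depends on the defining property of a buddy
allocator: at order n, a chunk's buddy lies exactly 2^(n + PAGE_SHIFT) bytes
before or after it, and which of the two is selected by the chunk's own
address bit at that order (equivalently, buddy = addr ^ mask). A toy
illustration assuming 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        unsigned long addr = 0x5000; /* page 5 */
        int order = 0;
        unsigned long mask = 1UL << (order + PAGE_SHIFT);

        unsigned long buddy = (addr & mask) ? addr - mask : addr + mask;
        printf("buddy of 0x%lx at order %d is 0x%lx\n", addr, order, buddy);
        printf("same via XOR: 0x%lx\n", addr ^ mask); /* 0x4000 both ways */
        return 0;
    }
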
diff --git a/extras/mini-os/netfront.c b/extras/mini-os/netfront.c
deleted file mode 100644
index 44c3995..0000000
--- a/extras/mini-os/netfront.c
+++ /dev/null
@@ -1,677 +0,0 @@
-/* Minimal network driver for Mini-OS.
- * Copyright (c) 2006-2007 Jacob Gorm Hansen, University of Copenhagen.
- * Based on netfront.c from Xen Linux.
- *
- * Does not handle fragments or extras.
- */
-
-#include <mini-os/os.h>
-#include <mini-os/xenbus.h>
-#include <mini-os/events.h>
-#include <errno.h>
-#include <xen/io/netif.h>
-#include <mini-os/gnttab.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/time.h>
-#include <mini-os/netfront.h>
-#include <mini-os/lib.h>
-#include <mini-os/semaphore.h>
-
-DECLARE_WAIT_QUEUE_HEAD(netfront_queue);
-
-#ifdef HAVE_LIBC
-#define NETIF_SELECT_RX ((void*)-1)
-#endif
-
-
-
-#define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGE_SIZE)
-#define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGE_SIZE)
-#define GRANT_INVALID_REF 0
-
-
-struct net_buffer {
- void* page;
- grant_ref_t gref;
-};
-
-struct netfront_dev {
- domid_t dom;
-
- unsigned short tx_freelist[NET_TX_RING_SIZE + 1];
- struct semaphore tx_sem;
-
- struct net_buffer rx_buffers[NET_RX_RING_SIZE];
- struct net_buffer tx_buffers[NET_TX_RING_SIZE];
-
- struct netif_tx_front_ring tx;
- struct netif_rx_front_ring rx;
- grant_ref_t tx_ring_ref;
- grant_ref_t rx_ring_ref;
- evtchn_port_t evtchn;
-
- char *nodename;
- char *backend;
- char *mac;
-
- xenbus_event_queue events;
-
-#ifdef HAVE_LIBC
- int fd;
- unsigned char *data;
- size_t len;
- size_t rlen;
-#endif
-
- void (*netif_rx)(unsigned char* data, int len);
-};
-
-void init_rx_buffers(struct netfront_dev *dev);
-
-static inline void add_id_to_freelist(unsigned int id,unsigned short* freelist)
-{
- freelist[id + 1] = freelist[0];
- freelist[0] = id;
-}
-
-static inline unsigned short get_id_from_freelist(unsigned short* freelist)
-{
- unsigned int id = freelist[0];
- freelist[0] = freelist[id + 1];
- return id;
-}
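
The two helpers above implement a free-id stack packed into a single array:
freelist[0] is the stack head and entry id + 1 holds the id that follows id,
so push and pop are O(1) with no separate node allocations. A standalone demo
of the same code (RING is an arbitrary toy size):

    #include <stdio.h>

    #define RING 4

    static unsigned short freelist[RING + 1];

    static void add_id(unsigned int id)
    {
        freelist[id + 1] = freelist[0];
        freelist[0] = id;
    }

    static unsigned short get_id(void)
    {
        unsigned short id = freelist[0];
        freelist[0] = freelist[id + 1];
        return id;
    }

    int main(void)
    {
        unsigned int id;
        for (id = 0; id < RING; id++)
            add_id(id);
        printf("%u %u\n", get_id(), get_id()); /* 3 2: LIFO order */
        return 0;
    }
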
-
-__attribute__((weak)) void netif_rx(unsigned char* data,int len)
-{
- printk("%d bytes incoming at %p\n",len,data);
-}
-
-__attribute__((weak)) void net_app_main(void*si,unsigned char*mac) {}
-
-static inline int xennet_rxidx(RING_IDX idx)
-{
- return idx & (NET_RX_RING_SIZE - 1);
-}
-
-void network_rx(struct netfront_dev *dev)
-{
- RING_IDX rp,cons,req_prod;
- struct netif_rx_response *rx;
- int nr_consumed, some, more, i, notify;
-
-
-moretodo:
- rp = dev->rx.sring->rsp_prod;
- rmb(); /* Ensure we see queued responses up to 'rp'. */
- cons = dev->rx.rsp_cons;
-
- for (nr_consumed = 0, some = 0;
- (cons != rp) && !some;
- nr_consumed++, cons++)
- {
- struct net_buffer* buf;
- unsigned char* page;
- int id;
-
- rx = RING_GET_RESPONSE(&dev->rx, cons);
-
- if (rx->flags & NETRXF_extra_info)
- {
- printk("+++++++++++++++++++++ we have extras!\n");
- continue;
- }
-
-
- if (rx->status == NETIF_RSP_NULL) continue;
-
- id = rx->id;
- BUG_ON(id >= NET_RX_RING_SIZE);
-
- buf = &dev->rx_buffers[id];
- page = (unsigned char*)buf->page;
- gnttab_end_access(buf->gref);
-
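- /* A positive status is the received length; negative statuses
- * (e.g. NETIF_RSP_ERROR) are simply dropped here. */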
- if(rx->status>0)
- {
-#ifdef HAVE_LIBC
- if (dev->netif_rx == NETIF_SELECT_RX) {
- int len = rx->status;
- ASSERT(current == main_thread);
- if (len > dev->len)
- len = dev->len;
- memcpy(dev->data, page+rx->offset, len);
- dev->rlen = len;
- some = 1;
- } else
-#endif
- dev->netif_rx(page+rx->offset,rx->status);
- }
- }
- dev->rx.rsp_cons=cons;
-
- RING_FINAL_CHECK_FOR_RESPONSES(&dev->rx,more);
- if(more && !some) goto moretodo;
-
- req_prod = dev->rx.req_prod_pvt;
-
- for(i=0; i<nr_consumed; i++)
- {
- int id = xennet_rxidx(req_prod + i);
- netif_rx_request_t *req = RING_GET_REQUEST(&dev->rx, req_prod + i);
- struct net_buffer* buf = &dev->rx_buffers[id];
- void* page = buf->page;
-
- /* We are sure to have free gnttab entries since they got released above */
- buf->gref = req->gref =
- gnttab_grant_access(dev->dom,virt_to_mfn(page),0);
-
- req->id = id;
- }
-
- wmb();
-
- dev->rx.req_prod_pvt = req_prod + i;
-
- RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->rx, notify);
- if (notify)
- notify_remote_via_evtchn(dev->evtchn);
-
-}
-
-void network_tx_buf_gc(struct netfront_dev *dev)
-{
-
-
- RING_IDX cons, prod;
- unsigned short id;
-
- do {
- prod = dev->tx.sring->rsp_prod;
- rmb(); /* Ensure we see responses up to 'prod'. */
-
- for (cons = dev->tx.rsp_cons; cons != prod; cons++)
- {
- struct netif_tx_response *txrsp;
- struct net_buffer *buf;
-
- txrsp = RING_GET_RESPONSE(&dev->tx, cons);
- if (txrsp->status == NETIF_RSP_NULL)
- continue;
-
- if (txrsp->status == NETIF_RSP_ERROR)
- printk("packet error\n");
-
- id = txrsp->id;
- BUG_ON(id >= NET_TX_RING_SIZE);
- buf = &dev->tx_buffers[id];
- gnttab_end_access(buf->gref);
- buf->gref=GRANT_INVALID_REF;
-
- add_id_to_freelist(id,dev->tx_freelist);
- up(&dev->tx_sem);
- }
-
- dev->tx.rsp_cons = prod;
-
- /*
- * Set a new event, then check for race with update of tx_cons.
- * Note that it is essential to schedule a callback, no matter
- * how few tx_buffers are pending. Even if there is space in the
- * transmit ring, higher layers may be blocked because too much
- * data is outstanding: in such cases notification from Xen is
- * likely to be the only kick that we'll get.
- */
- dev->tx.sring->rsp_event =
- prod + ((dev->tx.sring->req_prod - prod) >> 1) + 1;
- mb();
- } while ((cons == prod) && (prod != dev->tx.sring->rsp_prod));
-
-
-}
-
-void netfront_handler(evtchn_port_t port, struct pt_regs *regs, void *data)
-{
- int flags;
- struct netfront_dev *dev = data;
-
- local_irq_save(flags);
-
- network_tx_buf_gc(dev);
- network_rx(dev);
-
- local_irq_restore(flags);
-}
-
-#ifdef HAVE_LIBC
-void netfront_select_handler(evtchn_port_t port, struct pt_regs *regs, void *data)
-{
- int flags;
- struct netfront_dev *dev = data;
- int fd = dev->fd;
-
- local_irq_save(flags);
- network_tx_buf_gc(dev);
- local_irq_restore(flags);
-
- if (fd != -1)
- files[fd].read = 1;
- wake_up(&netfront_queue);
-}
-#endif
-
-static void free_netfront(struct netfront_dev *dev)
-{
- int i;
-
- for(i=0;i<NET_TX_RING_SIZE;i++)
- down(&dev->tx_sem);
-
- mask_evtchn(dev->evtchn);
-
- free(dev->mac);
- free(dev->backend);
-
- gnttab_end_access(dev->rx_ring_ref);
- gnttab_end_access(dev->tx_ring_ref);
-
- free_page(dev->rx.sring);
- free_page(dev->tx.sring);
-
- unbind_evtchn(dev->evtchn);
-
- for(i=0;i<NET_RX_RING_SIZE;i++) {
- gnttab_end_access(dev->rx_buffers[i].gref);
- free_page(dev->rx_buffers[i].page);
- }
-
- for(i=0;i<NET_TX_RING_SIZE;i++)
- if (dev->tx_buffers[i].page)
- free_page(dev->tx_buffers[i].page);
-
- free(dev->nodename);
- free(dev);
-}
-
-struct netfront_dev *init_netfront(char *_nodename, void (*thenetif_rx)(unsigned char* data, int len), unsigned char rawmac[6], char **ip)
-{
- xenbus_transaction_t xbt;
- char* err;
- char* message=NULL;
- struct netif_tx_sring *txs;
- struct netif_rx_sring *rxs;
- int retry=0;
- int i;
- char* msg = NULL;
- char nodename[256];
- char path[256];
- struct netfront_dev *dev;
- static int netfrontends = 0;
-
- if (!_nodename)
- snprintf(nodename, sizeof(nodename), "device/vif/%d", netfrontends);
- else {
- strncpy(nodename, _nodename, sizeof(nodename) - 1);
- nodename[sizeof(nodename) - 1] = 0;
- }
- netfrontends++;
-
- if (!thenetif_rx)
- thenetif_rx = netif_rx;
-
- printk("************************ NETFRONT for %s **********\n\n\n", nodename);
-
- dev = malloc(sizeof(*dev));
- memset(dev, 0, sizeof(*dev));
- dev->nodename = strdup(nodename);
-#ifdef HAVE_LIBC
- dev->fd = -1;
-#endif
-
- printk("net TX ring size %d\n", NET_TX_RING_SIZE);
- printk("net RX ring size %d\n", NET_RX_RING_SIZE);
- init_SEMAPHORE(&dev->tx_sem, NET_TX_RING_SIZE);
- for(i=0;i<NET_TX_RING_SIZE;i++)
- {
- add_id_to_freelist(i,dev->tx_freelist);
- dev->tx_buffers[i].page = NULL;
- }
-
- for(i=0;i<NET_RX_RING_SIZE;i++)
- {
- /* TODO: that's a lot of memory */
- dev->rx_buffers[i].page = (char*)alloc_page();
- }
-
- snprintf(path, sizeof(path), "%s/backend-id", nodename);
- dev->dom = xenbus_read_integer(path);
-#ifdef HAVE_LIBC
- if (thenetif_rx == NETIF_SELECT_RX)
- evtchn_alloc_unbound(dev->dom, netfront_select_handler, dev, &dev->evtchn);
- else
-#endif
- evtchn_alloc_unbound(dev->dom, netfront_handler, dev, &dev->evtchn);
-
- txs = (struct netif_tx_sring *) alloc_page();
- rxs = (struct netif_rx_sring *) alloc_page();
- memset(txs,0,PAGE_SIZE);
- memset(rxs,0,PAGE_SIZE);
-
-
- SHARED_RING_INIT(txs);
- SHARED_RING_INIT(rxs);
- FRONT_RING_INIT(&dev->tx, txs, PAGE_SIZE);
- FRONT_RING_INIT(&dev->rx, rxs, PAGE_SIZE);
-
- dev->tx_ring_ref = gnttab_grant_access(dev->dom,virt_to_mfn(txs),0);
- dev->rx_ring_ref = gnttab_grant_access(dev->dom,virt_to_mfn(rxs),0);
-
- init_rx_buffers(dev);
-
- dev->netif_rx = thenetif_rx;
-
- dev->events = NULL;
-
-again:
- err = xenbus_transaction_start(&xbt);
- if (err) {
- printk("starting transaction\n");
- free(err);
- }
-
- err = xenbus_printf(xbt, nodename, "tx-ring-ref","%u",
- dev->tx_ring_ref);
- if (err) {
- message = "writing tx ring-ref";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename, "rx-ring-ref","%u",
- dev->rx_ring_ref);
- if (err) {
- message = "writing rx ring-ref";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename,
- "event-channel", "%u", dev->evtchn);
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
- }
-
- err = xenbus_printf(xbt, nodename, "request-rx-copy", "%u", 1);
-
- if (err) {
- message = "writing request-rx-copy";
- goto abort_transaction;
- }
-
- snprintf(path, sizeof(path), "%s/state", nodename);
- err = xenbus_switch_state(xbt, path, XenbusStateConnected);
- if (err) {
- message = "switching state";
- goto abort_transaction;
- }
-
- err = xenbus_transaction_end(xbt, 0, &retry);
- free(err);
- if (retry) {
- printk("completing transaction\n");
- goto again;
- }
-
- goto done;
-
-abort_transaction:
- free(err);
- err = xenbus_transaction_end(xbt, 1, &retry);
- printk("Abort transaction %s\n", message);
- goto error;
-
-done:
-
- snprintf(path, sizeof(path), "%s/backend", nodename);
- msg = xenbus_read(XBT_NIL, path, &dev->backend);
- snprintf(path, sizeof(path), "%s/mac", nodename);
- msg = xenbus_read(XBT_NIL, path, &dev->mac);
-
- if ((dev->backend == NULL) || (dev->mac == NULL)) {
- printk("%s: backend/mac failed\n", __func__);
- goto error;
- }
-
- printk("backend at %s\n",dev->backend);
- printk("mac is %s\n",dev->mac);
-
- {
- XenbusState state;
- char path[strlen(dev->backend) + strlen("/state") + 1];
- snprintf(path, sizeof(path), "%s/state", dev->backend);
-
- xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);
-
- err = NULL;
- state = xenbus_read_integer(path);
- while (err == NULL && state < XenbusStateConnected)
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- if (state != XenbusStateConnected) {
- printk("backend not avalable, state=%d\n", state);
- xenbus_unwatch_path_token(XBT_NIL, path, path);
- goto error;
- }
-
- if (ip) {
- snprintf(path, sizeof(path), "%s/ip", dev->backend);
- xenbus_read(XBT_NIL, path, ip);
- }
- }
-
- printk("**************************\n");
-
- unmask_evtchn(dev->evtchn);
-
- /* The 'hh' length modifier is needed for __ia64__; without it,
- Mini-OS panics with 'Unaligned reference'. */
- if (rawmac)
- sscanf(dev->mac,"%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
- &rawmac[0],
- &rawmac[1],
- &rawmac[2],
- &rawmac[3],
- &rawmac[4],
- &rawmac[5]);
-
- return dev;
-error:
- free(msg);
- free(err);
- free_netfront(dev);
- return NULL;
-}
-
-#ifdef HAVE_LIBC
-int netfront_tap_open(char *nodename) {
- struct netfront_dev *dev;
-
- dev = init_netfront(nodename, NETIF_SELECT_RX, NULL, NULL);
- if (!dev) {
- printk("TAP open failed\n");
- errno = EIO;
- return -1;
- }
- dev->fd = alloc_fd(FTYPE_TAP);
- printk("tap_open(%s) -> %d\n", nodename, dev->fd);
- files[dev->fd].tap.dev = dev;
- return dev->fd;
-}
-#endif
-
-void shutdown_netfront(struct netfront_dev *dev)
-{
- char* err = NULL, *err2;
- XenbusState state;
-
- char path[strlen(dev->backend) + strlen("/state") + 1];
- char nodename[strlen(dev->nodename) + strlen("/request-rx-copy") + 1];
-
- printk("close network: backend at %s\n",dev->backend);
-
- snprintf(path, sizeof(path), "%s/state", dev->backend);
- snprintf(nodename, sizeof(nodename), "%s/state", dev->nodename);
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosing)) != NULL) {
- printk("shutdown_netfront: error changing state to %d: %s\n",
- XenbusStateClosing, err);
- goto close;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && state < XenbusStateClosing)
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosed)) != NULL) {
- printk("shutdown_netfront: error changing state to %d: %s\n",
- XenbusStateClosed, err);
- goto close;
- }
- state = xenbus_read_integer(path);
- while (state < XenbusStateClosed) {
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
- }
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateInitialising)) != NULL) {
- printk("shutdown_netfront: error changing state to %d: %s\n",
- XenbusStateInitialising, err);
- goto close;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && (state < XenbusStateInitWait || state >= XenbusStateClosed))
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
-
-close:
- free(err);
- err2 = xenbus_unwatch_path_token(XBT_NIL, path, path);
- free(err2);
-
- snprintf(nodename, sizeof(nodename), "%s/tx-ring-ref", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
- snprintf(nodename, sizeof(nodename), "%s/rx-ring-ref", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
- snprintf(nodename, sizeof(nodename), "%s/event-channel", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
- snprintf(nodename, sizeof(nodename), "%s/request-rx-copy", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
-
- if (!err)
- free_netfront(dev);
-}
-
-
-void init_rx_buffers(struct netfront_dev *dev)
-{
- int i, requeue_idx;
- netif_rx_request_t *req;
- int notify;
-
- /* Rebuild the RX buffer freelist and the RX ring itself. */
- for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++)
- {
- struct net_buffer* buf = &dev->rx_buffers[requeue_idx];
- req = RING_GET_REQUEST(&dev->rx, requeue_idx);
-
- buf->gref = req->gref =
- gnttab_grant_access(dev->dom,virt_to_mfn(buf->page),0);
-
- req->id = requeue_idx;
-
- requeue_idx++;
- }
-
- dev->rx.req_prod_pvt = requeue_idx;
-
- RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->rx, notify);
-
- if (notify)
- notify_remote_via_evtchn(dev->evtchn);
-
- dev->rx.sring->rsp_event = dev->rx.rsp_cons + 1;
-}
-
-
-void netfront_xmit(struct netfront_dev *dev, unsigned char* data,int len)
-{
- int flags;
- struct netif_tx_request *tx;
- RING_IDX i;
- int notify;
- unsigned short id;
- struct net_buffer* buf;
- void* page;
-
- BUG_ON(len > PAGE_SIZE);
-
- down(&dev->tx_sem);
-
- local_irq_save(flags);
- id = get_id_from_freelist(dev->tx_freelist);
- local_irq_restore(flags);
-
- buf = &dev->tx_buffers[id];
- page = buf->page;
- if (!page)
- page = buf->page = (char*) alloc_page();
-
- i = dev->tx.req_prod_pvt;
- tx = RING_GET_REQUEST(&dev->tx, i);
-
- memcpy(page,data,len);
-
- buf->gref =
- tx->gref = gnttab_grant_access(dev->dom,virt_to_mfn(page),1);
-
- tx->offset=0;
- tx->size = len;
- tx->flags=0;
- tx->id = id;
- dev->tx.req_prod_pvt = i + 1;
-
- wmb();
-
- RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->tx, notify);
-
- if(notify) notify_remote_via_evtchn(dev->evtchn);
-
- local_irq_save(flags);
- network_tx_buf_gc(dev);
- local_irq_restore(flags);
-}
-
-#ifdef HAVE_LIBC
-ssize_t netfront_receive(struct netfront_dev *dev, unsigned char *data, size_t len)
-{
- unsigned long flags;
- int fd = dev->fd;
- ASSERT(current == main_thread);
-
- dev->rlen = 0;
- dev->data = data;
- dev->len = len;
-
- local_irq_save(flags);
- network_rx(dev);
- if (!dev->rlen && fd != -1)
- /* No data for us, make select stop returning */
- files[fd].read = 0;
- /* Before re-enabling the interrupts, in case a packet just arrived in the
- * meanwhile. */
- local_irq_restore(flags);
-
- dev->data = NULL;
- dev->len = 0;
-
- return dev->rlen;
-}
-#endif
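
For orientation, here is a minimal consumer of the netfront API deleted above. This is a hedged sketch, not code from the tree: it uses only init_netfront(), netfront_xmit() and shutdown_netfront() as defined in the file, and the helper name and frame contents are illustrative.

/* Sketch: bring up the default vif, send one broadcast frame, shut down.
 * Assumes a configured backend vif. */
static void netfront_smoke_test(void)
{
    unsigned char mac[6];
    unsigned char frame[60];
    struct netfront_dev *dev = init_netfront(NULL, NULL, mac, NULL);

    if (!dev)
        return;
    memset(frame, 0, sizeof(frame));
    memset(frame, 0xff, 6);        /* broadcast destination MAC */
    memcpy(frame + 6, mac, 6);     /* our MAC, filled in by init_netfront */
    netfront_xmit(dev, frame, sizeof(frame));
    shutdown_netfront(dev);
}
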
diff --git a/extras/mini-os/pcifront.c b/extras/mini-os/pcifront.c
deleted file mode 100644
index 0fc5b30..0000000
--- a/extras/mini-os/pcifront.c
+++ /dev/null
@@ -1,616 +0,0 @@
-/* Minimal PCI driver for Mini-OS.
- * Copyright (c) 2007-2008 Samuel Thibault.
- * Based on blkfront.c.
- */
-
-#include <string.h>
-#include <mini-os/os.h>
-#include <mini-os/lib.h>
-#include <mini-os/xenbus.h>
-#include <mini-os/events.h>
-#include <errno.h>
-#include <mini-os/gnttab.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/wait.h>
-#include <mini-os/pcifront.h>
-#include <mini-os/sched.h>
-
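-/* Encode (slot, function) into the standard 8-bit PCI devfn: bits 7:3
- * are the slot, bits 2:0 the function. */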
-#define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07))
-
-DECLARE_WAIT_QUEUE_HEAD(pcifront_queue);
-static struct pcifront_dev *pcidev;
-
-struct pcifront_dev {
- domid_t dom;
-
- struct xen_pci_sharedinfo *info;
- grant_ref_t info_ref;
- evtchn_port_t evtchn;
-
- char *nodename;
- char *backend;
-
- xenbus_event_queue events;
-};
-
-void pcifront_handler(evtchn_port_t port, struct pt_regs *regs, void *data)
-{
- wake_up(&pcifront_queue);
-}
-
-static void free_pcifront(struct pcifront_dev *dev)
-{
- if (!dev)
- dev = pcidev;
-
- mask_evtchn(dev->evtchn);
-
- gnttab_end_access(dev->info_ref);
- free_page(dev->info);
-
- unbind_evtchn(dev->evtchn);
-
- free(dev->backend);
- free(dev->nodename);
- free(dev);
-}
-
-void pcifront_watches(void *opaque)
-{
- XenbusState state;
- char *err = NULL, *msg = NULL;
- char *be_path, *be_state;
- char* nodename = opaque ? opaque : "device/pci/0";
- char path[strlen(nodename) + 9];
- char fe_state[strlen(nodename) + 7];
- xenbus_event_queue events = NULL;
-
- snprintf(path, sizeof(path), "%s/backend", nodename);
- snprintf(fe_state, sizeof(fe_state), "%s/state", nodename);
-
- while (1) {
- printk("pcifront_watches: waiting for backend path to appear %s\n", path);
- xenbus_watch_path_token(XBT_NIL, path, path, &events);
- while ((err = xenbus_read(XBT_NIL, path, &be_path)) != NULL) {
- free(err);
- xenbus_wait_for_watch(&events);
- }
- xenbus_unwatch_path_token(XBT_NIL, path, path);
- printk("pcifront_watches: waiting for backend to get into the right state %s\n", be_path);
- be_state = (char *) malloc(strlen(be_path) + 7);
- snprintf(be_state, strlen(be_path) + 7, "%s/state", be_path);
- xenbus_watch_path_token(XBT_NIL, be_state, be_state, &events);
- while ((err = xenbus_read(XBT_NIL, be_state, &msg)) != NULL || msg[0] > '4') {
- free(msg);
- msg = NULL; /* don't double-free if the next read fails */
- free(err);
- xenbus_wait_for_watch(&events);
- }
- xenbus_unwatch_path_token(XBT_NIL, be_state, be_state);
- if (init_pcifront(NULL) == NULL) {
- free(be_state);
- free(be_path);
- continue;
- }
- xenbus_watch_path_token(XBT_NIL, be_state, be_state, &events);
- state = XenbusStateConnected;
- printk("pcifront_watches: waiting for backend events %s\n", be_state);
- while ((err = xenbus_wait_for_state_change(be_state, &state, &events)) == NULL &&
- (err = xenbus_read(XBT_NIL, pcidev->backend, &msg)) == NULL) {
- free(msg);
- printk("pcifront_watches: backend state changed: %s %d\n", be_state, state);
- if (state == XenbusStateReconfiguring) {
- printk("pcifront_watches: writing %s %d\n", fe_state, XenbusStateReconfiguring);
- if ((err = xenbus_switch_state(XBT_NIL, fe_state, XenbusStateReconfiguring)) != NULL) {
- printk("pcifront_watches: error changing state to %d: %s\n",
- XenbusStateReconfiguring, err);
- if (!strcmp(err, "ENOENT")) {
- xenbus_write(XBT_NIL, fe_state, "7");
- free(err);
- }
- }
- } else if (state == XenbusStateReconfigured) {
- printk("pcifront_watches: writing %s %d\n", fe_state, XenbusStateConnected);
- printk("pcifront_watches: changing state to %d\n", XenbusStateConnected);
- if ((err = xenbus_switch_state(XBT_NIL, fe_state, XenbusStateConnected)) != NULL) {
- printk("pcifront_watches: error changing state to %d: %s\n",
- XenbusStateConnected, err);
- if (!strcmp(err, "ENOENT")) {
- xenbus_write(XBT_NIL, fe_state, "4");
- free(err);
- }
- }
- } else if (state == XenbusStateClosing)
- break;
- }
- if (err) {
- printk("pcifront_watches: done waiting err=%s\n", err);
- free(err);
- } else
- printk("pcifront_watches: done waiting\n");
- err = xenbus_unwatch_path_token(XBT_NIL, be_state, be_state);
- shutdown_pcifront(pcidev);
- free(be_state);
- free(be_path);
- free(err);
- pcidev = NULL;
- }
-
- xenbus_unwatch_path_token(XBT_NIL, path, path);
-}
-
-struct pcifront_dev *init_pcifront(char *_nodename)
-{
- xenbus_transaction_t xbt;
- char* err;
- char* message=NULL;
- int retry=0;
- char* msg = NULL;
- char* nodename = _nodename ? _nodename : "device/pci/0";
- int dom;
-
- struct pcifront_dev *dev;
-
- char path[strlen(nodename) + strlen("/backend-id") + 1];
-
- if (!_nodename && pcidev)
- return pcidev;
-
- printk("******************* PCIFRONT for %s **********\n\n\n", nodename);
-
- snprintf(path, sizeof(path), "%s/backend-id", nodename);
- dom = xenbus_read_integer(path);
- if (dom == -1) {
- printk("no backend\n");
- return NULL;
- }
-
- dev = malloc(sizeof(*dev));
- memset(dev, 0, sizeof(*dev));
- dev->nodename = strdup(nodename);
- dev->dom = dom;
-
- evtchn_alloc_unbound(dev->dom, pcifront_handler, dev, &dev->evtchn);
-
- dev->info = (struct xen_pci_sharedinfo*) alloc_page();
- memset(dev->info,0,PAGE_SIZE);
-
- dev->info_ref = gnttab_grant_access(dev->dom,virt_to_mfn(dev->info),0);
-
- dev->events = NULL;
-
-again:
- err = xenbus_transaction_start(&xbt);
- if (err) {
- printk("starting transaction\n");
- free(err);
- }
-
- err = xenbus_printf(xbt, nodename, "pci-op-ref","%u",
- dev->info_ref);
- if (err) {
- message = "writing pci-op-ref";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename,
- "event-channel", "%u", dev->evtchn);
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
- }
- err = xenbus_printf(xbt, nodename,
- "magic", XEN_PCI_MAGIC);
- if (err) {
- message = "writing magic";
- goto abort_transaction;
- }
-
- snprintf(path, sizeof(path), "%s/state", nodename);
- err = xenbus_switch_state(xbt, path, XenbusStateInitialised);
- if (err) {
- message = "switching state";
- goto abort_transaction;
- }
-
- err = xenbus_transaction_end(xbt, 0, &retry);
- free(err);
- if (retry) {
- printk("completing transaction\n");
- goto again;
- }
-
- goto done;
-
-abort_transaction:
- free(err);
- err = xenbus_transaction_end(xbt, 1, &retry);
- printk("Abort transaction %s\n", message);
- goto error;
-
-done:
-
- snprintf(path, sizeof(path), "%s/backend", nodename);
- msg = xenbus_read(XBT_NIL, path, &dev->backend);
- if (msg) {
- printk("Error %s when reading the backend path %s\n", msg, path);
- goto error;
- }
-
- printk("backend at %s\n", dev->backend);
-
- {
- char path[strlen(dev->backend) + strlen("/state") + 1];
- char frontpath[strlen(nodename) + strlen("/state") + 1];
- XenbusState state;
- snprintf(path, sizeof(path), "%s/state", dev->backend);
-
- xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);
-
- err = NULL;
- state = xenbus_read_integer(path);
- while (err == NULL && state < XenbusStateConnected)
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- if (state != XenbusStateConnected) {
- printk("backend not avalable, state=%d\n", state);
- free(err);
- err = xenbus_unwatch_path_token(XBT_NIL, path, path);
- goto error;
- }
-
- snprintf(frontpath, sizeof(frontpath), "%s/state", nodename);
- if ((err = xenbus_switch_state(XBT_NIL, frontpath, XenbusStateConnected))
- != NULL) {
- printk("error switching state %s\n", err);
- free(err);
- err = xenbus_unwatch_path_token(XBT_NIL, path, path);
- goto error;
- }
- }
- unmask_evtchn(dev->evtchn);
-
- printk("**************************\n");
-
- if (!_nodename)
- pcidev = dev;
-
- return dev;
-
-error:
- free(msg);
- free(err);
- free_pcifront(dev);
- return NULL;
-}
-
-void pcifront_scan(struct pcifront_dev *dev, void (*func)(unsigned int domain, unsigned int bus, unsigned slot, unsigned int fun))
-{
- char *path;
- int i, n, len;
- char *s, *msg = NULL;
- unsigned int domain, bus, slot, fun;
-
- if (!dev)
- dev = pcidev;
- if (!dev) {
- printk("pcifront_scan: device or bus\n");
- return;
- }
-
- len = strlen(dev->backend) + 1 + 5 + 10 + 1;
- path = (char *) malloc(len);
- snprintf(path, len, "%s/num_devs", dev->backend);
- n = xenbus_read_integer(path);
-
- for (i = 0; i < n; i++) {
- snprintf(path, len, "%s/dev-%d", dev->backend, i);
- msg = xenbus_read(XBT_NIL, path, &s);
- if (msg) {
- printk("Error %s when reading the PCI root name at %s\n", msg, path);
- free(msg);
- continue;
- }
-
- if (sscanf(s, "%x:%x:%x.%x", &domain, &bus, &slot, &fun) != 4) {
- printk("\"%s\" does not look like a PCI device address\n", s);
- free(s);
- continue;
- }
- free(s);
-
- if (func)
- func(domain, bus, slot, fun);
- }
- free(path);
-}
-
-void shutdown_pcifront(struct pcifront_dev *dev)
-{
- char* err = NULL, *err2;
- XenbusState state;
-
- char path[strlen(dev->backend) + strlen("/state") + 1];
- char nodename[strlen(dev->nodename) + strlen("/event-channel") + 1];
-
- printk("close pci: backend at %s\n",dev->backend);
-
- snprintf(path, sizeof(path), "%s/state", dev->backend);
- snprintf(nodename, sizeof(nodename), "%s/state", dev->nodename);
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosing)) != NULL) {
- printk("shutdown_pcifront: error changing state to %d: %s\n",
- XenbusStateClosing, err);
- goto close_pcifront;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && state < XenbusStateClosing)
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosed)) != NULL) {
- printk("shutdown_pcifront: error changing state to %d: %s\n",
- XenbusStateClosed, err);
- goto close_pcifront;
- }
- state = xenbus_read_integer(path);
- while (state < XenbusStateClosed) {
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
- free(err);
- }
-
- if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateInitialising)) != NULL) {
- printk("shutdown_pcifront: error changing state to %d: %s\n",
- XenbusStateInitialising, err);
- goto close_pcifront;
- }
- state = xenbus_read_integer(path);
- while (err == NULL && (state < XenbusStateInitWait || state >= XenbusStateClosed))
- err = xenbus_wait_for_state_change(path, &state, &dev->events);
-
-close_pcifront:
- free(err);
- err2 = xenbus_unwatch_path_token(XBT_NIL, path, path);
- free(err2);
-
- snprintf(nodename, sizeof(nodename), "%s/info-ref", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
- snprintf(nodename, sizeof(nodename), "%s/event-channel", dev->nodename);
- err2 = xenbus_rm(XBT_NIL, nodename);
- free(err2);
-
- if (!err)
- free_pcifront(dev);
-}
-
-int pcifront_physical_to_virtual (struct pcifront_dev *dev,
- unsigned int *dom,
- unsigned int *bus,
- unsigned int *slot,
- unsigned int *fun)
-{
- /* FIXME: the buffer sizing is a little lazy here. 10 extra bytes
- should be enough to hold the paths we need to construct, even
- if the number of devices is large */
- char path[strlen(dev->backend) + strlen("/num_devs") + 10 + 1];
- int i, n;
- char *s, *msg = NULL;
- unsigned int dom1, bus1, slot1, fun1;
-
- if (!dev)
- dev = pcidev;
-
- snprintf(path, sizeof(path), "%s/num_devs", dev->backend);
- n = xenbus_read_integer(path);
-
- for (i = 0; i < n; i++) {
- snprintf(path, sizeof(path), "%s/dev-%d", dev->backend, i);
- msg = xenbus_read(XBT_NIL, path, &s);
- if (msg) {
- printk("Error %s when reading the PCI root name at %s\n", msg, path);
- free(msg);
- continue;
- }
-
- if (sscanf(s, "%x:%x:%x.%x", &dom1, &bus1, &slot1, &fun1) != 4) {
- printk("\"%s\" does not look like a PCI device address\n", s);
- free(s);
- continue;
- }
- free(s);
-
- if (dom1 == *dom && bus1 == *bus && slot1 == *slot && fun1 == *fun) {
- snprintf(path, sizeof(path), "%s/vdev-%d", dev->backend, i);
- msg = xenbus_read(XBT_NIL, path, &s);
- if (msg) {
- printk("Error %s when reading the PCI root name at %s\n", msg, path);
- continue;
- }
-
- if (sscanf(s, "%x:%x:%x.%x", dom, bus, slot, fun) != 4) {
- printk("\"%s\" does not look like a PCI device address\n", s);
- free(s);
- continue;
- }
- free(s);
-
- return 0;
- }
- }
- return -1;
-}
-
-void pcifront_op(struct pcifront_dev *dev, struct xen_pci_op *op)
-{
- if (!dev)
- dev = pcidev;
- dev->info->op = *op;
- /* Make sure info is written before the flag */
- wmb();
- set_bit(_XEN_PCIF_active, (void*) &dev->info->flags);
- notify_remote_via_evtchn(dev->evtchn);
-
- wait_event(pcifront_queue, !test_bit(_XEN_PCIF_active, (void*) &dev->info->flags));
-
- /* Make sure flag is read before info */
- rmb();
- *op = dev->info->op;
-}
-
-int pcifront_conf_read(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun,
- unsigned int off, unsigned int size, unsigned int *val)
-{
- struct xen_pci_op op;
-
- if (!dev)
- dev = pcidev;
- if (pcifront_physical_to_virtual(dev, &dom, &bus, &slot, &fun) < 0)
- return XEN_PCI_ERR_dev_not_found;
- memset(&op, 0, sizeof(op));
-
- op.cmd = XEN_PCI_OP_conf_read;
- op.domain = dom;
- op.bus = bus;
- op.devfn = PCI_DEVFN(slot, fun);
- op.offset = off;
- op.size = size;
-
- pcifront_op(dev, &op);
-
- if (op.err)
- return op.err;
-
- *val = op.value;
-
- return 0;
-}
-
-int pcifront_conf_write(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun,
- unsigned int off, unsigned int size, unsigned int val)
-{
- struct xen_pci_op op;
-
- if (!dev)
- dev = pcidev;
- if (pcifront_physical_to_virtual(dev, &dom, &bus, &slot, &fun) < 0)
- return XEN_PCI_ERR_dev_not_found;
- memset(&op, 0, sizeof(op));
-
- op.cmd = XEN_PCI_OP_conf_write;
- op.domain = dom;
- op.bus = bus;
- op.devfn = PCI_DEVFN(slot, fun);
- op.offset = off;
- op.size = size;
-
- op.value = val;
-
- pcifront_op(dev, &op);
-
- return op.err;
-}
-
-int pcifront_enable_msi(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun)
-{
- struct xen_pci_op op;
-
- if (!dev)
- dev = pcidev;
- if (pcifront_physical_to_virtual(dev, &dom, &bus, &slot, &fun) < 0)
- return XEN_PCI_ERR_dev_not_found;
- memset(&op, 0, sizeof(op));
-
- op.cmd = XEN_PCI_OP_enable_msi;
- op.domain = dom;
- op.bus = bus;
- op.devfn = PCI_DEVFN(slot, fun);
-
- pcifront_op(dev, &op);
-
- if (op.err)
- return op.err;
- else
- return op.value;
-}
-
-int pcifront_disable_msi(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun)
-{
- struct xen_pci_op op;
-
- if (!dev)
- dev = pcidev;
- if (pcifront_physical_to_virtual(dev, &dom, &bus, &slot, &fun) < 0)
- return XEN_PCI_ERR_dev_not_found;
- memset(&op, 0, sizeof(op));
-
- op.cmd = XEN_PCI_OP_disable_msi;
- op.domain = dom;
- op.bus = bus;
- op.devfn = PCI_DEVFN(slot, fun);
-
- pcifront_op(dev, &op);
-
- return op.err;
-}
-
-int pcifront_enable_msix(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun,
- struct xen_msix_entry *entries, int n)
-{
- struct xen_pci_op op;
-
- if (!dev)
- dev = pcidev;
- if (pcifront_physical_to_virtual(dev, &dom, &bus, &slot, &fun) < 0)
- return XEN_PCI_ERR_dev_not_found;
- if (n > SH_INFO_MAX_VEC)
- return XEN_PCI_ERR_op_failed;
-
- memset(&op, 0, sizeof(op));
-
- op.cmd = XEN_PCI_OP_enable_msix;
- op.domain = dom;
- op.bus = bus;
- op.devfn = PCI_DEVFN(slot, fun);
- op.value = n;
-
- memcpy(op.msix_entries, entries, n * sizeof(*entries));
-
- pcifront_op(dev, &op);
-
- if (op.err)
- return op.err;
-
- memcpy(entries, op.msix_entries, n * sizeof(*entries));
-
- return 0;
-}
-
-
-int pcifront_disable_msix(struct pcifront_dev *dev,
- unsigned int dom,
- unsigned int bus, unsigned int slot, unsigned int fun)
-{
- struct xen_pci_op op;
-
- if (!dev)
- dev = pcidev;
- if (pcifront_physical_to_virtual(dev, &dom, &bus, &slot, &fun) < 0)
- return XEN_PCI_ERR_dev_not_found;
- memset(&op, 0, sizeof(op));
-
- op.cmd = XEN_PCI_OP_disable_msix;
- op.domain = dom;
- op.bus = bus;
- op.devfn = PCI_DEVFN(slot, fun);
-
- pcifront_op(dev, &op);
-
- return op.err;
-}
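
A hedged usage sketch for the config-space accessors deleted above; the helper is illustrative and not from the tree. The offset and bit follow the standard PCI configuration-space layout, and passing NULL selects the default pcifront device, as in the code above.

/* Sketch: set the bus-master bit in a device's PCI command register
 * (offset 0x04, 16 bits wide) through the frontend. */
static int enable_bus_master(unsigned int dom, unsigned int bus,
                             unsigned int slot, unsigned int fun)
{
    unsigned int cmd;
    int err = pcifront_conf_read(NULL, dom, bus, slot, fun, 0x04, 2, &cmd);

    if (err)
        return err;
    return pcifront_conf_write(NULL, dom, bus, slot, fun, 0x04, 2, cmd | 0x4);
}
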
diff --git a/extras/mini-os/sched.c b/extras/mini-os/sched.c
deleted file mode 100644
index d0c607e..0000000
--- a/extras/mini-os/sched.c
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2005 - Grzegorz Milos - Intel Research Cambridge
- ****************************************************************************
- *
- * File: sched.c
- * Author: Grzegorz Milos
- * Changes: Robert Kaiser
- *
- * Date: Aug 2005
- *
- * Environment: Xen Minimal OS
- * Description: simple scheduler for Mini-OS
- *
- * The scheduler is non-preemptive (cooperative), and schedules according
- * to a round-robin algorithm.
- *
- ****************************************************************************
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <mini-os/os.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/time.h>
-#include <mini-os/mm.h>
-#include <mini-os/types.h>
-#include <mini-os/lib.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/list.h>
-#include <mini-os/sched.h>
-#include <mini-os/semaphore.h>
-
-
-#ifdef SCHED_DEBUG
-#define DEBUG(_f, _a...) \
- printk("MINI_OS(file=sched.c, line=%d) " _f "\n", __LINE__, ## _a)
-#else
-#define DEBUG(_f, _a...) ((void)0)
-#endif
-
-MINIOS_TAILQ_HEAD(thread_list, struct thread);
-
-struct thread *idle_thread = NULL;
-static struct thread_list exited_threads = MINIOS_TAILQ_HEAD_INITIALIZER(exited_threads);
-static struct thread_list thread_list = MINIOS_TAILQ_HEAD_INITIALIZER(thread_list);
-static int threads_started;
-
-struct thread *main_thread;
-
-inline void print_runqueue(void)
-{
- struct thread *th;
- MINIOS_TAILQ_FOREACH(th, &thread_list, thread_list)
- {
- printk(" Thread \"%s\", runnable=%d\n", th->name, is_runnable(th));
- }
- printk("\n");
-}
-
-void schedule(void)
-{
- struct thread *prev, *next, *thread, *tmp;
- unsigned long flags;
-
- if (irqs_disabled()) {
- printk("Must not call schedule() with IRQs disabled\n");
- BUG();
- }
-
- prev = current;
- local_irq_save(flags);
-
- if (in_callback) {
- printk("Must not call schedule() from a callback\n");
- BUG();
- }
-
- do {
- /* Examine all threads.
- Find a runnable thread, wake up any whose timeout has expired, and
- note when the next timeout expires; default to 10 seconds. */
- s_time_t now = NOW();
- s_time_t min_wakeup_time = now + SECONDS(10);
- next = NULL;
- MINIOS_TAILQ_FOREACH_SAFE(thread, &thread_list, thread_list, tmp)
- {
- if (!is_runnable(thread) && thread->wakeup_time != 0LL)
- {
- if (thread->wakeup_time <= now)
- wake(thread);
- else if (thread->wakeup_time < min_wakeup_time)
- min_wakeup_time = thread->wakeup_time;
- }
- if(is_runnable(thread))
- {
- next = thread;
- /* Put this thread on the end of the list */
- MINIOS_TAILQ_REMOVE(&thread_list, thread, thread_list);
- MINIOS_TAILQ_INSERT_TAIL(&thread_list, thread, thread_list);
- break;
- }
- }
- if (next)
- break;
- /* block until the next timeout expires, or for 10 secs, whichever comes first */
- block_domain(min_wakeup_time);
- /* handle pending events if any */
- force_evtchn_callback();
- } while(1);
- local_irq_restore(flags);
- /* Interrupting the switch is equivalent to having the next thread
- interrupted at the return instruction, and is therefore at a safe point. */
- if(prev != next) switch_threads(prev, next);
-
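- /* Reap exited threads. A stack can only be freed once we are sure we
- * are no longer running on it, hence the 'thread != prev' test below. */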
- MINIOS_TAILQ_FOREACH_SAFE(thread, &exited_threads, thread_list, tmp)
- {
- if(thread != prev)
- {
- MINIOS_TAILQ_REMOVE(&exited_threads, thread, thread_list);
- free_pages(thread->stack, STACK_SIZE_PAGE_ORDER);
- xfree(thread);
- }
- }
-}
-
-struct thread* create_thread(char *name, void (*function)(void *), void *data)
-{
- struct thread *thread;
- unsigned long flags;
- /* Call architecture specific setup. */
- thread = arch_create_thread(name, function, data);
- /* Not runnable, not exited, not sleeping */
- thread->flags = 0;
- thread->wakeup_time = 0LL;
-#ifdef HAVE_LIBC
- _REENT_INIT_PTR((&thread->reent))
-#endif
- set_runnable(thread);
- local_irq_save(flags);
- MINIOS_TAILQ_INSERT_TAIL(&thread_list, thread, thread_list);
- local_irq_restore(flags);
- return thread;
-}
-
-#ifdef HAVE_LIBC
-static struct _reent callback_reent;
-struct _reent *__getreent(void)
-{
- struct _reent *_reent;
-
- if (!threads_started)
- _reent = _impure_ptr;
- else if (in_callback)
- _reent = &callback_reent;
- else
- _reent = &get_current()->reent;
-
-#ifndef NDEBUG
-#if defined(__x86_64__) || defined(__x86__)
- {
-#ifdef __x86_64__
- register unsigned long sp asm ("rsp");
-#else
- register unsigned long sp asm ("esp");
-#endif
- if ((sp & (STACK_SIZE-1)) < STACK_SIZE / 16) {
- static int overflowing;
- if (!overflowing) {
- overflowing = 1;
- printk("stack overflow\n");
- BUG();
- }
- }
- }
-#endif
-#else
-#error Not implemented yet
-#endif
- return _reent;
-}
-#endif
-
-void exit_thread(void)
-{
- unsigned long flags;
- struct thread *thread = current;
- printk("Thread \"%s\" exited.\n", thread->name);
- local_irq_save(flags);
- /* Remove from the thread list */
- MINIOS_TAILQ_REMOVE(&thread_list, thread, thread_list);
- clear_runnable(thread);
- /* Put onto exited list */
- MINIOS_TAILQ_INSERT_HEAD(&exited_threads, thread, thread_list);
- local_irq_restore(flags);
- /* Schedule will free the resources */
- while(1)
- {
- schedule();
- printk("schedule() returned! Trying again\n");
- }
-}
-
-void block(struct thread *thread)
-{
- thread->wakeup_time = 0LL;
- clear_runnable(thread);
-}
-
-void msleep(uint32_t millisecs)
-{
- struct thread *thread = get_current();
- thread->wakeup_time = NOW() + MILLISECS(millisecs);
- clear_runnable(thread);
- schedule();
-}
-
-void wake(struct thread *thread)
-{
- thread->wakeup_time = 0LL;
- set_runnable(thread);
-}
-
-void idle_thread_fn(void *unused)
-{
- threads_started = 1;
- while (1) {
- block(current);
- schedule();
- }
-}
-
-DECLARE_MUTEX(mutex);
-
-void th_f1(void *data)
-{
- struct timeval tv1, tv2;
-
- for(;;)
- {
- down(&mutex);
- printk("Thread \"%s\" got semaphore, runnable %d\n", current->name, is_runnable(current));
- schedule();
- printk("Thread \"%s\" releases the semaphore\n", current->name);
- up(&mutex);
-
-
- gettimeofday(&tv1, NULL);
- for(;;)
- {
- gettimeofday(&tv2, NULL);
- if(tv2.tv_sec - tv1.tv_sec > 2) break;
- }
-
-
- schedule();
- }
-}
-
-void th_f2(void *data)
-{
- for(;;)
- {
- printk("Thread OTHER executing, data 0x%lx\n", data);
- schedule();
- }
-}
-
-
-
-void init_sched(void)
-{
- printk("Initialising scheduler\n");
-
-#ifdef HAVE_LIBC
- _REENT_INIT_PTR((&callback_reent))
-#endif
- idle_thread = create_thread("Idle", idle_thread_fn, NULL);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
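
A hedged sketch of the thread API deleted above, using create_thread(), msleep() and exit_thread(); the thread body and names are illustrative, not from the tree.

/* Sketch: spawn a short-lived cooperative thread. */
static void one_shot(void *data)
{
    printk("one_shot: got \"%s\"\n", (char *)data);
    msleep(1000);       /* yields to other threads while sleeping */
    exit_thread();      /* never returns; schedule() reaps the thread */
}

/* ... create_thread("one-shot", one_shot, "hello"); */
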
diff --git a/extras/mini-os/test.c b/extras/mini-os/test.c
deleted file mode 100644
index 0d7aba4..0000000
--- a/extras/mini-os/test.c
+++ /dev/null
@@ -1,577 +0,0 @@
-/******************************************************************************
- * test.c
- *
- * Test code for all the various frontends; split from kernel.c
- *
- * Copyright (c) 2002-2003, K A Fraser & R Neugebauer
- * Copyright (c) 2005, Grzegorz Milos, Intel Research Cambridge
- * Copyright (c) 2006, Robert Kaiser, FH Wiesbaden
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <mini-os/os.h>
-#include <mini-os/hypervisor.h>
-#include <mini-os/mm.h>
-#include <mini-os/events.h>
-#include <mini-os/time.h>
-#include <mini-os/types.h>
-#include <mini-os/lib.h>
-#include <mini-os/sched.h>
-#include <mini-os/xenbus.h>
-#include <mini-os/gnttab.h>
-#include <mini-os/netfront.h>
-#include <mini-os/blkfront.h>
-#include <mini-os/fbfront.h>
-#include <mini-os/pcifront.h>
-#include <mini-os/xmalloc.h>
-#include <fcntl.h>
-#include <xen/features.h>
-#include <xen/version.h>
-
-#ifdef CONFIG_XENBUS
-static unsigned int do_shutdown = 0;
-static unsigned int shutdown_reason;
-static DECLARE_WAIT_QUEUE_HEAD(shutdown_queue);
-#endif
-
-#ifdef CONFIG_XENBUS
-void test_xenbus(void);
-
-static void xenbus_tester(void *p)
-{
- test_xenbus();
-}
-#endif
-
-#ifndef HAVE_LIBC
-/* Should be random enough for our uses */
-int rand(void)
-{
- static unsigned int previous;
- struct timeval tv;
- gettimeofday(&tv, NULL);
- previous += tv.tv_sec + tv.tv_usec;
- previous *= RAND_MIX;
- return previous;
-}
-#endif
-
-static void periodic_thread(void *p)
-{
- struct timeval tv;
- printk("Periodic thread started.\n");
- for(;;)
- {
- gettimeofday(&tv, NULL);
- printk("T(s=%ld us=%ld)\n", tv.tv_sec, tv.tv_usec);
- msleep(1000);
- }
-}
-
-#ifdef CONFIG_NETFRONT
-static struct netfront_dev *net_dev;
-static struct semaphore net_sem = __SEMAPHORE_INITIALIZER(net_sem, 0);
-
-static void netfront_thread(void *p)
-{
- net_dev = init_netfront(NULL, NULL, NULL, NULL);
- up(&net_sem);
-}
-#endif
-
-#ifdef CONFIG_BLKFRONT
-static struct blkfront_dev *blk_dev;
-static struct blkfront_info blk_info;
-static uint64_t blk_size_read;
-static uint64_t blk_size_write;
-static struct semaphore blk_sem = __SEMAPHORE_INITIALIZER(blk_sem, 0);
-
-struct blk_req {
- struct blkfront_aiocb aiocb;
- int rand_value;
- struct blk_req *next;
-};
-
-#ifdef BLKTEST_WRITE
-static struct blk_req *blk_to_read;
-#endif
-
-static struct blk_req *blk_alloc_req(uint64_t sector)
-{
- struct blk_req *req = xmalloc(struct blk_req);
- req->aiocb.aio_dev = blk_dev;
- req->aiocb.aio_buf = _xmalloc(blk_info.sector_size, blk_info.sector_size);
- req->aiocb.aio_nbytes = blk_info.sector_size;
- req->aiocb.aio_offset = sector * blk_info.sector_size;
- req->aiocb.data = req;
- req->next = NULL;
- return req;
-}
-
-static void blk_read_completed(struct blkfront_aiocb *aiocb, int ret)
-{
- struct blk_req *req = aiocb->data;
- if (ret)
- printk("got error code %d when reading at offset %ld\n", ret, aiocb->aio_offset);
- else
- blk_size_read += blk_info.sector_size;
- free(aiocb->aio_buf);
- free(req);
-}
-
-static void blk_read_sector(uint64_t sector)
-{
- struct blk_req *req;
-
- req = blk_alloc_req(sector);
- req->aiocb.aio_cb = blk_read_completed;
-
- blkfront_aio_read(&req->aiocb);
-}
-
-#ifdef BLKTEST_WRITE
-static void blk_write_read_completed(struct blkfront_aiocb *aiocb, int ret)
-{
- struct blk_req *req = aiocb->data;
- int rand_value;
- int i;
- int *buf;
-
- if (ret) {
- printk("got error code %d when reading back at offset %ld\n", ret, aiocb->aio_offset);
- free(aiocb->aio_buf);
- free(req);
- return;
- }
- blk_size_read += blk_info.sector_size;
- buf = (int*) aiocb->aio_buf;
- rand_value = req->rand_value;
- for (i = 0; i < blk_info.sector_size / sizeof(int); i++) {
- if (buf[i] != rand_value) {
- printk("bogus data at offset %ld\n", aiocb->aio_offset + i);
- break;
- }
- rand_value *= RAND_MIX;
- }
- free(aiocb->aio_buf);
- free(req);
-}
-
-static void blk_write_completed(struct blkfront_aiocb *aiocb, int ret)
-{
- struct blk_req *req = aiocb->data;
- if (ret) {
- printk("got error code %d when writing at offset %ld\n", ret, aiocb->aio_offset);
- free(aiocb->aio_buf);
- free(req);
- return;
- }
- blk_size_write += blk_info.sector_size;
- /* Push write check */
- req->next = blk_to_read;
- blk_to_read = req;
-}
-
-static void blk_write_sector(uint64_t sector)
-{
- struct blk_req *req;
- int rand_value;
- int i;
- int *buf;
-
- req = blk_alloc_req(sector);
- req->aiocb.aio_cb = blk_write_completed;
- req->rand_value = rand_value = rand();
-
- buf = (int*) req->aiocb.aio_buf;
- for (i = 0; i < blk_info.sector_size / sizeof(int); i++) {
- buf[i] = rand_value;
- rand_value *= RAND_MIX;
- }
-
- blkfront_aio_write(&req->aiocb);
-}
-#endif
-
-static void blkfront_thread(void *p)
-{
- time_t lasttime = 0;
-
- blk_dev = init_blkfront(NULL, &blk_info);
- if (!blk_dev) {
- up(&blk_sem);
- return;
- }
-
- if (blk_info.info & VDISK_CDROM)
- printk("Block device is a CDROM\n");
- if (blk_info.info & VDISK_REMOVABLE)
- printk("Block device is removable\n");
- if (blk_info.info & VDISK_READONLY)
- printk("Block device is read-only\n");
-
-#ifdef BLKTEST_WRITE
- if (blk_info.mode == O_RDWR) {
- blk_write_sector(0);
- blk_write_sector(blk_info.sectors-1);
- } else
-#endif
- {
- blk_read_sector(0);
- blk_read_sector(blk_info.sectors-1);
- }
-
- while (!do_shutdown) {
- uint64_t sector = rand() % blk_info.sectors;
- struct timeval tv;
-#ifdef BLKTEST_WRITE
- if (blk_info.mode == O_RDWR)
- blk_write_sector(sector);
- else
-#endif
- blk_read_sector(sector);
- blkfront_aio_poll(blk_dev);
- gettimeofday(&tv, NULL);
- if (tv.tv_sec > lasttime + 10) {
- printk("%llu read, %llu write\n", blk_size_read, blk_size_write);
- lasttime = tv.tv_sec;
- }
-
-#ifdef BLKTEST_WRITE
- while (blk_to_read) {
- struct blk_req *req = blk_to_read;
- blk_to_read = blk_to_read->next;
- req->aiocb.aio_cb = blk_write_read_completed;
- blkfront_aio_read(&req->aiocb);
- }
-#endif
- }
- up(&blk_sem);
-}
-#endif
-
-#if defined(CONFIG_FBFRONT) && defined(CONFIG_KBDFRONT)
-#define WIDTH 800
-#define HEIGHT 600
-#define DEPTH 32
-
-static uint32_t *fb;
-static int refresh_period = 50;
-static struct fbfront_dev *fb_dev;
-static struct semaphore fbfront_sem = __SEMAPHORE_INITIALIZER(fbfront_sem, 0);
-
-static void fbfront_drawvert(int x, int y1, int y2, uint32_t color)
-{
- int y;
- if (x < 0)
- return;
- if (x >= WIDTH)
- return;
- if (y1 < 0)
- y1 = 0;
- if (y2 >= HEIGHT)
- y2 = HEIGHT-1;
- for (y = y1; y <= y2; y++)
- fb[x + y*WIDTH] ^= color;
-}
-
-static void fbfront_drawhoriz(int x1, int x2, int y, uint32_t color)
-{
- int x;
- if (y < 0)
- return;
- if (y >= HEIGHT)
- return;
- if (x1 < 0)
- x1 = 0;
- if (x2 >= WIDTH)
- x2 = WIDTH-1;
- for (x = x1; x <= x2; x++)
- fb[x + y*WIDTH] ^= color;
-}
-
-static void fbfront_thread(void *p)
-{
- size_t line_length = WIDTH * (DEPTH / 8);
- size_t memsize = HEIGHT * line_length;
- unsigned long *mfns;
- int i, n = (memsize + PAGE_SIZE-1) / PAGE_SIZE;
-
- memsize = n * PAGE_SIZE;
- fb = _xmalloc(memsize, PAGE_SIZE);
- memset(fb, 0, memsize);
- mfns = xmalloc_array(unsigned long, n);
- for (i = 0; i < n; i++)
- mfns[i] = virtual_to_mfn((char *) fb + i * PAGE_SIZE);
- fb_dev = init_fbfront(NULL, mfns, WIDTH, HEIGHT, DEPTH, line_length, n);
- xfree(mfns);
- if (!fb_dev) {
- xfree(fb);
- }
- up(&fbfront_sem);
-}
-
-static void clip_cursor(int *x, int *y)
-{
- if (*x < 0)
- *x = 0;
- if (*x >= WIDTH)
- *x = WIDTH - 1;
- if (*y < 0)
- *y = 0;
- if (*y >= HEIGHT)
- *y = HEIGHT - 1;
-}
-
-static void refresh_cursor(int new_x, int new_y)
-{
- static int old_x = -1, old_y = -1;
-
- if (!refresh_period)
- return;
-
- if (old_x != -1 && old_y != -1) {
- fbfront_drawvert(old_x, old_y + 1, old_y + 8, 0xffffffff);
- fbfront_drawhoriz(old_x + 1, old_x + 8, old_y, 0xffffffff);
- fbfront_update(fb_dev, old_x, old_y, 9, 9);
- }
- old_x = new_x;
- old_y = new_y;
- fbfront_drawvert(new_x, new_y + 1, new_y + 8, 0xffffffff);
- fbfront_drawhoriz(new_x + 1, new_x + 8, new_y, 0xffffffff);
- fbfront_update(fb_dev, new_x, new_y, 9, 9);
-}
-
-static struct kbdfront_dev *kbd_dev;
-static struct semaphore kbd_sem = __SEMAPHORE_INITIALIZER(kbd_sem, 0);
-static void kbdfront_thread(void *p)
-{
- DEFINE_WAIT(w);
- DEFINE_WAIT(w2);
- DEFINE_WAIT(w3);
- int x = WIDTH / 2, y = HEIGHT / 2, z = 0;
-
- kbd_dev = init_kbdfront(NULL, 1);
- down(&fbfront_sem);
- if (!kbd_dev) {
- up(&kbd_sem);
- return;
- }
-
- refresh_cursor(x, y);
- while (1) {
- union xenkbd_in_event kbdevent;
- union xenfb_in_event fbevent;
- int sleep = 1;
-
- add_waiter(w, kbdfront_queue);
- add_waiter(w2, fbfront_queue);
- add_waiter(w3, shutdown_queue);
-
- rmb();
- if (do_shutdown)
- break;
-
- while (kbdfront_receive(kbd_dev, &kbdevent, 1) != 0) {
- sleep = 0;
- switch(kbdevent.type) {
- case XENKBD_TYPE_MOTION:
- printk("motion x:%d y:%d z:%d\n",
- kbdevent.motion.rel_x,
- kbdevent.motion.rel_y,
- kbdevent.motion.rel_z);
- x += kbdevent.motion.rel_x;
- y += kbdevent.motion.rel_y;
- z += kbdevent.motion.rel_z;
- clip_cursor(&x, &y);
- refresh_cursor(x, y);
- break;
- case XENKBD_TYPE_POS:
- printk("pos x:%d y:%d dz:%d\n",
- kbdevent.pos.abs_x,
- kbdevent.pos.abs_y,
- kbdevent.pos.rel_z);
- x = kbdevent.pos.abs_x;
- y = kbdevent.pos.abs_y;
- z = kbdevent.pos.rel_z;
- clip_cursor(&x, &y);
- refresh_cursor(x, y);
- break;
- case XENKBD_TYPE_KEY:
- printk("key %d %s\n",
- kbdevent.key.keycode,
- kbdevent.key.pressed ? "pressed" : "released");
- if (kbdevent.key.keycode == BTN_LEFT) {
- printk("mouse %s at (%d,%d,%d)\n",
- kbdevent.key.pressed ? "clic" : "release", x, y, z);
- if (kbdevent.key.pressed) {
- uint32_t color = rand();
- fbfront_drawvert(x - 16, y - 16, y + 15, color);
- fbfront_drawhoriz(x - 16, x + 15, y + 16, color);
- fbfront_drawvert(x + 16, y - 15, y + 16, color);
- fbfront_drawhoriz(x - 15, x + 16, y - 16, color);
- fbfront_update(fb_dev, x - 16, y - 16, 33, 33);
- }
- } else if (kbdevent.key.keycode == KEY_Q) {
- shutdown_reason = SHUTDOWN_poweroff;
- wmb();
- do_shutdown = 1;
- wmb();
- wake_up(&shutdown_queue);
- }
- break;
- }
- }
- while (fbfront_receive(fb_dev, &fbevent, 1) != 0) {
- sleep = 0;
- switch(fbevent.type) {
- case XENFB_TYPE_REFRESH_PERIOD:
- refresh_period = fbevent.refresh_period.period;
- printk("refresh period %d\n", refresh_period);
- refresh_cursor(x, y);
- break;
- }
- }
- if (sleep)
- schedule();
- remove_waiter(w3, shutdown_queue);
- remove_waiter(w2, fbfront_queue);
- remove_waiter(w, kbdfront_queue);
- }
- up(&kbd_sem);
-}
-#endif
-
-#ifdef CONFIG_PCIFRONT
-static struct pcifront_dev *pci_dev;
-static struct semaphore pci_sem = __SEMAPHORE_INITIALIZER(pci_sem, 0);
-
-static void print_pcidev(unsigned int domain, unsigned int bus, unsigned int slot, unsigned int fun)
-{
- unsigned int vendor, device, rev, class;
-
- pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x00, 2, &vendor);
- pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x02, 2, &device);
- pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x08, 1, &rev);
- pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x0a, 2, &class);
-
- printk("%04x:%02x:%02x.%02x %04x: %04x:%04x (rev %02x)\n", domain, bus, slot, fun, class, vendor, device, rev);
-}
-
-static void pcifront_thread(void *p)
-{
- pcifront_watches(NULL);
- pci_dev = init_pcifront(NULL);
- if (!pci_dev) {
- up(&pci_sem);
- return;
- }
- printk("PCI devices:\n");
- pcifront_scan(pci_dev, print_pcidev);
- up(&pci_sem);
-}
-#endif
-
-void shutdown_frontends(void)
-{
-#ifdef CONFIG_NETFRONT
- down(&net_sem);
- if (net_dev)
- shutdown_netfront(net_dev);
-#endif
-
-#ifdef CONFIG_BLKFRONT
- down(&blk_sem);
- if (blk_dev)
- shutdown_blkfront(blk_dev);
-#endif
-
-#if defined(CONFIG_FBFRONT) && defined(CONFIG_KBDFRONT)
- if (fb_dev)
- shutdown_fbfront(fb_dev);
-
- down(&kbd_sem);
- if (kbd_dev)
- shutdown_kbdfront(kbd_dev);
-#endif
-
-#ifdef CONFIG_PCIFRONT
- down(&pci_sem);
- if (pci_dev)
- shutdown_pcifront(pci_dev);
-#endif
-}
-
-#ifdef CONFIG_XENBUS
-void app_shutdown(unsigned reason)
-{
- shutdown_reason = reason;
- wmb();
- do_shutdown = 1;
- wmb();
- wake_up(&shutdown_queue);
-}
-
-static void shutdown_thread(void *p)
-{
- DEFINE_WAIT(w);
-
- while (1) {
- add_waiter(w, shutdown_queue);
- rmb();
- if (do_shutdown) {
- rmb();
- break;
- }
- schedule();
- remove_waiter(w, shutdown_queue);
- }
-
- shutdown_frontends();
-
- HYPERVISOR_shutdown(shutdown_reason);
-}
-#endif
-
-int app_main(start_info_t *si)
-{
- printk("Test main: start_info=%p\n", si);
-#ifdef CONFIG_XENBUS
- create_thread("xenbus_tester", xenbus_tester, si);
-#endif
- create_thread("periodic_thread", periodic_thread, si);
-#ifdef CONFIG_NETFRONT
- create_thread("netfront", netfront_thread, si);
-#endif
-#ifdef CONFIG_BLKFRONT
- create_thread("blkfront", blkfront_thread, si);
-#endif
-#if defined(CONFIG_FBFRONT) && defined(CONFIG_KBDFRONT)
- create_thread("fbfront", fbfront_thread, si);
- create_thread("kbdfront", kbdfront_thread, si);
-#endif
-#ifdef CONFIG_PCIFRONT
- create_thread("pcifront", pcifront_thread, si);
-#endif
-#ifdef CONFIG_XENBUS
- create_thread("shutdown", shutdown_thread, si);
-#endif
- return 0;
-}
diff --git a/extras/mini-os/tpm_tis.c b/extras/mini-os/tpm_tis.c
deleted file mode 100644
index d78c465..0000000
--- a/extras/mini-os/tpm_tis.c
+++ /dev/null
@@ -1,1367 +0,0 @@
-/*
- * Copyright (c) 2010-2012 United States Government, as represented by
- * the Secretary of Defense. All rights reserved.
- *
- * This code has been derived from drivers/char/tpm.c
- * from the linux kernel
- *
- * Copyright (C) 2004 IBM Corporation
- *
- * This code has also been derived from drivers/char/tpm/tpm_tis.c
- * from the linux kernel
- *
- * Copyright (C) 2005, 2006 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2
- * of the License
- */
-#include <mini-os/ioremap.h>
-#include <mini-os/iorw.h>
-#include <mini-os/tpm_tis.h>
-#include <mini-os/os.h>
-#include <mini-os/sched.h>
-#include <mini-os/byteorder.h>
-#include <mini-os/events.h>
-#include <mini-os/wait.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/lib.h>
-#include <errno.h>
-#include <stdbool.h>
-
-#ifndef min
- #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
-#endif
-#define ADJUST_TIMEOUTS_TO_STANDARD(initial,standard,timeout_no) \
- if((initial) < (standard)){ \
- (initial) = (standard); \
- printk("Timeout %c was adjusted to standard value.\n",timeout_no); \
- }
-
-#define TPM_HEADER_SIZE 10
-
-#define TPM_BUFSIZE 2048
-
-struct tpm_input_header {
- uint16_t tag;
- uint32_t length;
- uint32_t ordinal;
-}__attribute__((packed));
-
-struct tpm_output_header {
- uint16_t tag;
- uint32_t length;
- uint32_t return_code;
-}__attribute__((packed));
-
-struct stclear_flags_t {
- uint16_t tag;
- uint8_t deactivated;
- uint8_t disableForceClear;
- uint8_t physicalPresence;
- uint8_t physicalPresenceLock;
- uint8_t bGlobalLock;
-}__attribute__((packed));
-
-struct tpm_version_t {
- uint8_t Major;
- uint8_t Minor;
- uint8_t revMajor;
- uint8_t revMinor;
-}__attribute__((packed));
-
-struct tpm_version_1_2_t {
- uint16_t tag;
- uint8_t Major;
- uint8_t Minor;
- uint8_t revMajor;
- uint8_t revMinor;
-}__attribute__((packed));
-
-struct timeout_t {
- uint32_t a;
- uint32_t b;
- uint32_t c;
- uint32_t d;
-}__attribute__((packed));
-
-struct duration_t {
- uint32_t tpm_short;
- uint32_t tpm_medium;
- uint32_t tpm_long;
-}__attribute__((packed));
-
-struct permanent_flags_t {
- uint16_t tag;
- uint8_t disable;
- uint8_t ownership;
- uint8_t deactivated;
- uint8_t readPubek;
- uint8_t disableOwnerClear;
- uint8_t allowMaintenance;
- uint8_t physicalPresenceLifetimeLock;
- uint8_t physicalPresenceHWEnable;
- uint8_t physicalPresenceCMDEnable;
- uint8_t CEKPUsed;
- uint8_t TPMpost;
- uint8_t TPMpostLock;
- uint8_t FIPS;
- uint8_t operator;
- uint8_t enableRevokeEK;
- uint8_t nvLocked;
- uint8_t readSRKPub;
- uint8_t tpmEstablished;
- uint8_t maintenanceDone;
- uint8_t disableFullDALogicInfo;
-}__attribute__((packed));
-
-typedef union {
- struct permanent_flags_t perm_flags;
- struct stclear_flags_t stclear_flags;
- bool owned;
- uint32_t num_pcrs;
- struct tpm_version_t tpm_version;
- struct tpm_version_1_2_t tpm_version_1_2;
- uint32_t manufacturer_id;
- struct timeout_t timeout;
- struct duration_t duration;
-} cap_t;
-
-struct tpm_getcap_params_in {
- uint32_t cap;
- uint32_t subcap_size;
- uint32_t subcap;
-}__attribute__((packed));
-
-struct tpm_getcap_params_out {
- uint32_t cap_size;
- cap_t cap;
-}__attribute__((packed));
-
-struct tpm_readpubek_params_out {
- uint8_t algorithm[4];
- uint8_t encscheme[2];
- uint8_t sigscheme[2];
- uint32_t paramsize;
- uint8_t parameters[12]; /*assuming RSA*/
- uint32_t keysize;
- uint8_t modulus[256];
- uint8_t checksum[20];
-}__attribute__((packed));
-
-typedef union {
- struct tpm_input_header in;
- struct tpm_output_header out;
-} tpm_cmd_header;
-
-#define TPM_DIGEST_SIZE 20
-struct tpm_pcrread_out {
- uint8_t pcr_result[TPM_DIGEST_SIZE];
-}__attribute__((packed));
-
-struct tpm_pcrread_in {
- uint32_t pcr_idx;
-}__attribute__((packed));
-
-struct tpm_pcrextend_in {
- uint32_t pcr_idx;
- uint8_t hash[TPM_DIGEST_SIZE];
-}__attribute__((packed));
-
-typedef union {
- struct tpm_getcap_params_out getcap_out;
- struct tpm_readpubek_params_out readpubek_out;
- uint8_t readpubek_out_buffer[sizeof(struct tpm_readpubek_params_out)];
- struct tpm_getcap_params_in getcap_in;
- struct tpm_pcrread_in pcrread_in;
- struct tpm_pcrread_out pcrread_out;
- struct tpm_pcrextend_in pcrextend_in;
-} tpm_cmd_params;
-
-struct tpm_cmd_t {
- tpm_cmd_header header;
- tpm_cmd_params params;
-}__attribute__((packed));
-
-
-enum tpm_duration {
- TPM_SHORT = 0,
- TPM_MEDIUM = 1,
- TPM_LONG = 2,
- TPM_UNDEFINED,
-};
-
-#define TPM_MAX_ORDINAL 243
-#define TPM_MAX_PROTECTED_ORDINAL 12
-#define TPM_PROTECTED_ORDINAL_MASK 0xFF
-
-extern const uint8_t tpm_protected_ordinal_duration[TPM_MAX_PROTECTED_ORDINAL];
-extern const uint8_t tpm_ordinal_duration[TPM_MAX_ORDINAL];
-
-#define TPM_DIGEST_SIZE 20
-#define TPM_ERROR_SIZE 10
-#define TPM_RET_CODE_IDX 6
-
-/* tpm_capabilities */
-#define TPM_CAP_FLAG cpu_to_be32(4)
-#define TPM_CAP_PROP cpu_to_be32(5)
-#define CAP_VERSION_1_1 cpu_to_be32(0x06)
-#define CAP_VERSION_1_2 cpu_to_be32(0x1A)
-
-/* tpm_sub_capabilities */
-#define TPM_CAP_PROP_PCR cpu_to_be32(0x101)
-#define TPM_CAP_PROP_MANUFACTURER cpu_to_be32(0x103)
-#define TPM_CAP_FLAG_PERM cpu_to_be32(0x108)
-#define TPM_CAP_FLAG_VOL cpu_to_be32(0x109)
-#define TPM_CAP_PROP_OWNER cpu_to_be32(0x111)
-#define TPM_CAP_PROP_TIS_TIMEOUT cpu_to_be32(0x115)
-#define TPM_CAP_PROP_TIS_DURATION cpu_to_be32(0x120)
-
-
-#define TPM_INTERNAL_RESULT_SIZE 200
-#define TPM_TAG_RQU_COMMAND cpu_to_be16(193)
-#define TPM_ORD_GET_CAP cpu_to_be32(101)
-
-extern const struct tpm_input_header tpm_getcap_header;
-
-
-
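-/* Duration class for each protected ordinal, indexed by (ordinal & TPM_PROTECTED_ORDINAL_MASK); see tpm_calc_ordinal_duration(). */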
-const uint8_t tpm_protected_ordinal_duration[TPM_MAX_PROTECTED_ORDINAL] = {
- TPM_UNDEFINED, /* 0 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 5 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 10 */
- TPM_SHORT,
-};
-
-const uint8_t tpm_ordinal_duration[TPM_MAX_ORDINAL] = {
- TPM_UNDEFINED, /* 0 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 5 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 10 */
- TPM_SHORT,
- TPM_MEDIUM,
- TPM_LONG,
- TPM_LONG,
- TPM_MEDIUM, /* 15 */
- TPM_SHORT,
- TPM_SHORT,
- TPM_MEDIUM,
- TPM_LONG,
- TPM_SHORT, /* 20 */
- TPM_SHORT,
- TPM_MEDIUM,
- TPM_MEDIUM,
- TPM_MEDIUM,
- TPM_SHORT, /* 25 */
- TPM_SHORT,
- TPM_MEDIUM,
- TPM_SHORT,
- TPM_SHORT,
- TPM_MEDIUM, /* 30 */
- TPM_LONG,
- TPM_MEDIUM,
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT, /* 35 */
- TPM_MEDIUM,
- TPM_MEDIUM,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_MEDIUM, /* 40 */
- TPM_LONG,
- TPM_MEDIUM,
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT, /* 45 */
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT,
- TPM_LONG,
- TPM_MEDIUM, /* 50 */
- TPM_MEDIUM,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 55 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_MEDIUM, /* 60 */
- TPM_MEDIUM,
- TPM_MEDIUM,
- TPM_SHORT,
- TPM_SHORT,
- TPM_MEDIUM, /* 65 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 70 */
- TPM_SHORT,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 75 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_LONG, /* 80 */
- TPM_UNDEFINED,
- TPM_MEDIUM,
- TPM_LONG,
- TPM_SHORT,
- TPM_UNDEFINED, /* 85 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 90 */
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT,
- TPM_UNDEFINED, /* 95 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_MEDIUM, /* 100 */
- TPM_SHORT,
- TPM_SHORT,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 105 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 110 */
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT, /* 115 */
- TPM_SHORT,
- TPM_SHORT,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_LONG, /* 120 */
- TPM_LONG,
- TPM_MEDIUM,
- TPM_UNDEFINED,
- TPM_SHORT,
- TPM_SHORT, /* 125 */
- TPM_SHORT,
- TPM_LONG,
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT, /* 130 */
- TPM_MEDIUM,
- TPM_UNDEFINED,
- TPM_SHORT,
- TPM_MEDIUM,
- TPM_UNDEFINED, /* 135 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 140 */
- TPM_SHORT,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 145 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 150 */
- TPM_MEDIUM,
- TPM_MEDIUM,
- TPM_SHORT,
- TPM_SHORT,
- TPM_UNDEFINED, /* 155 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 160 */
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 165 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_LONG, /* 170 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 175 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_MEDIUM, /* 180 */
- TPM_SHORT,
- TPM_MEDIUM,
- TPM_MEDIUM,
- TPM_MEDIUM,
- TPM_MEDIUM, /* 185 */
- TPM_SHORT,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 190 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 195 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 200 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT,
- TPM_SHORT, /* 205 */
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT,
- TPM_MEDIUM, /* 210 */
- TPM_UNDEFINED,
- TPM_MEDIUM,
- TPM_MEDIUM,
- TPM_MEDIUM,
- TPM_UNDEFINED, /* 215 */
- TPM_MEDIUM,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT,
- TPM_SHORT, /* 220 */
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT,
- TPM_SHORT,
- TPM_UNDEFINED, /* 225 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 230 */
- TPM_LONG,
- TPM_MEDIUM,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED, /* 235 */
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_UNDEFINED,
- TPM_SHORT, /* 240 */
- TPM_UNDEFINED,
- TPM_MEDIUM,
-};
-
-const struct tpm_input_header tpm_getcap_header = {
- .tag = TPM_TAG_RQU_COMMAND,
- .length = cpu_to_be32(22),
- .ordinal = TPM_ORD_GET_CAP
-};
-
-
-enum tis_access {
- TPM_ACCESS_VALID = 0x80,
- TPM_ACCESS_ACTIVE_LOCALITY = 0x20, /* (R) */
- TPM_ACCESS_RELINQUISH_LOCALITY = 0x20,/* (W) */
- TPM_ACCESS_REQUEST_PENDING = 0x04, /* (W) */
- TPM_ACCESS_REQUEST_USE = 0x02, /* (W) */
-};
-
-enum tis_status {
- TPM_STS_VALID = 0x80, /* (R) */
- TPM_STS_COMMAND_READY = 0x40, /* (R) */
- TPM_STS_DATA_AVAIL = 0x10, /* (R) */
- TPM_STS_DATA_EXPECT = 0x08, /* (R) */
- TPM_STS_GO = 0x20, /* (W) */
-};
-
-enum tis_int_flags {
- TPM_GLOBAL_INT_ENABLE = 0x80000000,
- TPM_INTF_BURST_COUNT_STATIC = 0x100,
- TPM_INTF_CMD_READY_INT = 0x080,
- TPM_INTF_INT_EDGE_FALLING = 0x040,
- TPM_INTF_INT_EDGE_RISING = 0x020,
- TPM_INTF_INT_LEVEL_LOW = 0x010,
- TPM_INTF_INT_LEVEL_HIGH = 0x008,
- TPM_INTF_LOCALITY_CHANGE_INT = 0x004,
- TPM_INTF_STS_VALID_INT = 0x002,
- TPM_INTF_DATA_AVAIL_INT = 0x001,
-};
-
-enum tis_defaults {
- TIS_MEM_BASE = 0xFED40000,
- TIS_MEM_LEN = 0x5000,
- TIS_SHORT_TIMEOUT = 750, /*ms*/
- TIS_LONG_TIMEOUT = 2000, /*2 sec */
-};
-
-#define TPM_TIMEOUT 5
-
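-/* TIS register accessors: each enabled locality is mapped as its own page in tpm->pages[l], and the offsets below follow the TCG TIS 1.2 MMIO layout. */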
-#define TPM_ACCESS(t, l) (((uint8_t*)t->pages[l]) + 0x0000)
-#define TPM_INT_ENABLE(t, l) ((uint32_t*)(((uint8_t*)t->pages[l]) + 0x0008))
-#define TPM_INT_VECTOR(t, l) (((uint8_t*)t->pages[l]) + 0x000C)
-#define TPM_INT_STATUS(t, l) (((uint8_t*)t->pages[l]) + 0x0010)
-#define TPM_INTF_CAPS(t, l) ((uint32_t*)(((uint8_t*)t->pages[l]) + 0x0014))
-#define TPM_STS(t, l) ((uint8_t*)(((uint8_t*)t->pages[l]) + 0x0018))
-#define TPM_DATA_FIFO(t, l) (((uint8_t*)t->pages[l]) + 0x0024)
-
-#define TPM_DID_VID(t, l) ((uint32_t*)(((uint8_t*)t->pages[l]) + 0x0F00))
-#define TPM_RID(t, l) (((uint8_t*)t->pages[l]) + 0x0F04)
-
-struct tpm_chip {
- int enabled_localities;
- int locality;
- unsigned long baseaddr;
- uint8_t* pages[5];
- int did, vid, rid;
-
- uint8_t data_buffer[TPM_BUFSIZE];
- int data_len;
-
- s_time_t timeout_a, timeout_b, timeout_c, timeout_d;
- s_time_t duration[3];
-
-#ifdef HAVE_LIBC
- int fd;
-#endif
-
- unsigned int irq;
- struct wait_queue_head read_queue;
- struct wait_queue_head int_queue;
-};
-
-
-static void __init_tpm_chip(struct tpm_chip* tpm) {
- tpm->enabled_localities = TPM_TIS_EN_LOCLALL;
- tpm->locality = -1;
- tpm->baseaddr = 0;
- tpm->pages[0] = tpm->pages[1] = tpm->pages[2] = tpm->pages[3] = tpm->pages[4] = NULL;
- tpm->vid = 0;
- tpm->did = 0;
- tpm->irq = 0;
- init_waitqueue_head(&tpm->read_queue);
- init_waitqueue_head(&tpm->int_queue);
-
- tpm->data_len = -1;
-
-#ifdef HAVE_LIBC
- tpm->fd = -1;
-#endif
-}
-
-/*
- * Returns max number of nsecs to wait
- */
-s_time_t tpm_calc_ordinal_duration(struct tpm_chip *chip,
- uint32_t ordinal)
-{
- int duration_idx = TPM_UNDEFINED;
- s_time_t duration = 0;
-
- if (ordinal < TPM_MAX_ORDINAL)
- duration_idx = tpm_ordinal_duration[ordinal];
- else if ((ordinal & TPM_PROTECTED_ORDINAL_MASK) <
- TPM_MAX_PROTECTED_ORDINAL)
- duration_idx =
- tpm_protected_ordinal_duration[ordinal &
- TPM_PROTECTED_ORDINAL_MASK];
-
- if (duration_idx != TPM_UNDEFINED) {
- duration = chip->duration[duration_idx];
- }
-
- if (duration <= 0) {
- return SECONDS(120);
- } else {
- return duration;
- }
-}
-
-
-static int locality_enabled(struct tpm_chip* tpm, int l) {
- return l >= 0 && tpm->enabled_localities & (1 << l);
-}
-
-static int check_locality(struct tpm_chip* tpm, int l) {
- if(locality_enabled(tpm, l) && (ioread8(TPM_ACCESS(tpm, l)) &
- (TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID)) ==
- (TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID)) {
- return l;
- }
- return -1;
-}
-
-void release_locality(struct tpm_chip* tpm, int l, int force)
-{
- if (locality_enabled(tpm, l) && (force || (ioread8(TPM_ACCESS(tpm, l)) &
- (TPM_ACCESS_REQUEST_PENDING | TPM_ACCESS_VALID)) ==
- (TPM_ACCESS_REQUEST_PENDING | TPM_ACCESS_VALID))) {
- iowrite8(TPM_ACCESS(tpm, l), TPM_ACCESS_RELINQUISH_LOCALITY);
- }
-}
-
-int tpm_tis_request_locality(struct tpm_chip* tpm, int l) {
-
- s_time_t stop;
- /*Make sure locality is valid */
- if(!locality_enabled(tpm, l)) {
- printk("tpm_tis_change_locality() Tried to change to locality %d, but it is disabled or invalid!\n", l);
- return -1;
- }
- /* Check if we already have the current locality */
- if(check_locality(tpm, l) >= 0) {
- return tpm->locality = l;
- }
- /* Set the new locality*/
- iowrite8(TPM_ACCESS(tpm, l), TPM_ACCESS_REQUEST_USE);
-
- if(tpm->irq) {
- /* Wait for interrupt */
- wait_event_deadline(tpm->int_queue, (check_locality(tpm, l) >= 0), NOW() + tpm->timeout_a);
-
- /* FIXME: Handle timeout event, should return error in that case */
- return l;
- } else {
- /* Wait for burstcount */
- stop = NOW() + tpm->timeout_a;
- do {
- if(check_locality(tpm, l) >= 0) {
- return tpm->locality = l;
- }
- msleep(TPM_TIMEOUT);
- } while(NOW() < stop);
- }
-
- printk("REQ LOCALITY FAILURE\n");
- return -1;
-}
-
-static uint8_t tpm_tis_status(struct tpm_chip* tpm) {
- return ioread8(TPM_STS(tpm, tpm->locality));
-}
-
-/* This causes the current command to be aborted */
-static void tpm_tis_ready(struct tpm_chip* tpm) {
- iowrite8(TPM_STS(tpm, tpm->locality), TPM_STS_COMMAND_READY);
-}
-#define tpm_tis_cancel_cmd(v) tpm_tis_ready(v)
-
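-/* The burst count (STS register bytes 1-2) reports how many bytes the TPM can accept or supply through the FIFO without extra wait states. */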
-static int get_burstcount(struct tpm_chip* tpm) {
- s_time_t stop;
- int burstcnt;
-
- stop = NOW() + tpm->timeout_d;
- do {
- burstcnt = ioread8((TPM_STS(tpm, tpm->locality) + 1));
- burstcnt += ioread8(TPM_STS(tpm, tpm->locality) + 2) << 8;
-
- if (burstcnt) {
- return burstcnt;
- }
- msleep(TPM_TIMEOUT);
- } while(NOW() < stop);
- return -EBUSY;
-}
-
-static int wait_for_stat(struct tpm_chip* tpm, uint8_t mask,
- unsigned long timeout, struct wait_queue_head* queue) {
- s_time_t stop;
- uint8_t status;
-
- status = tpm_tis_status(tpm);
- if((status & mask) == mask) {
- return 0;
- }
-
- if(tpm->irq) {
- wait_event_deadline(*queue, ((tpm_tis_status(tpm) & mask) == mask), timeout);
- /* FIXME: Check for timeout and return -ETIME */
- return 0;
- } else {
- stop = NOW() + timeout;
- do {
- msleep(TPM_TIMEOUT);
- status = tpm_tis_status(tpm);
- if((status & mask) == mask)
- return 0;
- } while( NOW() < stop);
- }
- return -ETIME;
-}
-
-static int recv_data(struct tpm_chip* tpm, uint8_t* buf, size_t count) {
- int size = 0;
- int burstcnt;
- while( size < count &&
- wait_for_stat(tpm,
- TPM_STS_DATA_AVAIL | TPM_STS_VALID,
- tpm->timeout_c,
- &tpm->read_queue)
- == 0) {
- burstcnt = get_burstcount(tpm);
- for(; burstcnt > 0 && size < count; --burstcnt)
- {
- buf[size++] = ioread8(TPM_DATA_FIFO(tpm, tpm->locality));
- }
- }
- return size;
-}
-
-int tpm_tis_recv(struct tpm_chip* tpm, uint8_t* buf, size_t count) {
- int size = 0;
- int expected, status;
-
- if (count < TPM_HEADER_SIZE) {
- size = -EIO;
- goto out;
- }
-
- /* read first 10 bytes, including tag, paramsize, and result */
- if((size =
- recv_data(tpm, buf, TPM_HEADER_SIZE)) < TPM_HEADER_SIZE) {
- printk("Error reading tpm cmd header\n");
- goto out;
- }
-
- expected = be32_to_cpu(*((uint32_t*)(buf + 2)));
- if(expected > count) {
- size = -EIO;
- goto out;
- }
-
- if((size += recv_data(tpm, &buf[TPM_HEADER_SIZE],
- expected - TPM_HEADER_SIZE)) < expected) {
- printk("Unable to read rest of tpm command size=%d expected=%d\n", size, expected);
- size = -ETIME;
- goto out;
- }
-
- wait_for_stat(tpm, TPM_STS_VALID, tpm->timeout_c, &tpm->int_queue);
- status = tpm_tis_status(tpm);
- if(status & TPM_STS_DATA_AVAIL) {
- printk("Error: left over data\n");
- size = -EIO;
- goto out;
- }
-
-out:
- tpm_tis_ready(tpm);
- release_locality(tpm, tpm->locality, 0);
- return size;
-}
-
-int tpm_tis_send(struct tpm_chip* tpm, uint8_t* buf, size_t len) {
- int rc;
- int status, burstcnt = 0;
- int count = 0;
- uint32_t ordinal;
-
- if(tpm_tis_request_locality(tpm, tpm->locality) < 0) {
- return -EBUSY;
- }
-
- status = tpm_tis_status(tpm);
- if((status & TPM_STS_COMMAND_READY) == 0) {
- tpm_tis_ready(tpm);
- if(wait_for_stat(tpm, TPM_STS_COMMAND_READY, tpm->timeout_b, &tpm->int_queue) < 0) {
- rc = -ETIME;
- goto out_err;
- }
- }
-
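- /* Write all but the last byte in burst-sized chunks; the TPM must keep asserting DATA_EXPECT until the final byte is written. */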
- while(count < len - 1) {
- burstcnt = get_burstcount(tpm);
- for(; burstcnt > 0 && count < len - 1; --burstcnt) {
- iowrite8(TPM_DATA_FIFO(tpm, tpm->locality), buf[count++]);
- }
-
- wait_for_stat(tpm, TPM_STS_VALID, tpm->timeout_c, &tpm->int_queue);
- status = tpm_tis_status(tpm);
- if((status & TPM_STS_DATA_EXPECT) == 0) {
- rc = -EIO;
- goto out_err;
- }
- }
-
- /* Write the last byte */
- iowrite8(TPM_DATA_FIFO(tpm, tpm->locality), buf[count]);
- wait_for_stat(tpm, TPM_STS_VALID, tpm->timeout_c, &tpm->read_queue);
- status = tpm_tis_status(tpm);
- if((status & TPM_STS_DATA_EXPECT) != 0) {
- rc = -EIO;
- goto out_err;
- }
-
- /* Tell the TPM to start executing the command */
- iowrite8(TPM_STS(tpm, tpm->locality), TPM_STS_GO);
-
- if(tpm->irq) {
- /*Wait for interrupt */
- ordinal = be32_to_cpu(*((uint32_t*)(buf + 6)));
- if(wait_for_stat(tpm,
- TPM_STS_DATA_AVAIL | TPM_STS_VALID,
- tpm_calc_ordinal_duration(tpm, ordinal),
- &tpm->read_queue) < 0) {
- rc = -ETIME;
- goto out_err;
- }
- }
-#ifdef HAVE_LIBC
- if(tpm->fd >= 0) {
- files[tpm->fd].read = 0;
- files[tpm->fd].tpm_tis.respgot = 0;
- files[tpm->fd].tpm_tis.offset = 0;
- }
-#endif
- return len;
-
-out_err:
- tpm_tis_ready(tpm);
- release_locality(tpm, tpm->locality, 0);
- return rc;
-}
-
-static void tpm_tis_irq_handler(evtchn_port_t port, struct pt_regs *regs, void* data)
-{
- struct tpm_chip* tpm = data;
- uint32_t interrupt;
- int i;
-
- interrupt = ioread32(TPM_INT_STATUS(tpm, tpm->locality));
- if(interrupt == 0) {
- return;
- }
-
- if(interrupt & TPM_INTF_DATA_AVAIL_INT) {
- wake_up(&tpm->read_queue);
- }
- if(interrupt & TPM_INTF_LOCALITY_CHANGE_INT) {
- for(i = 0; i < 5; ++i) {
- if(check_locality(tpm, i) >= 0) {
- break;
- }
- }
- }
- if(interrupt & (TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_STS_VALID_INT |
- TPM_INTF_CMD_READY_INT)) {
- wake_up(&tpm->int_queue);
- }
-
- /* Clear interrupts handled with TPM_EOI */
- iowrite32(TPM_INT_STATUS(tpm, tpm->locality), interrupt);
- ioread32(TPM_INT_STATUS(tpm, tpm->locality));
- return;
-}
-
-/*
- * Internal kernel interface to transmit TPM commands
- */
-static ssize_t tpm_transmit(struct tpm_chip *chip, const uint8_t *buf,
- size_t bufsiz)
-{
- ssize_t rc;
- uint32_t count, ordinal;
- s_time_t stop;
-
- count = be32_to_cpu(*((uint32_t *) (buf + 2)));
- ordinal = be32_to_cpu(*((uint32_t *) (buf + 6)));
- if (count == 0)
- return -ENODATA;
- if (count > bufsiz) {
- printk("Error: invalid count value %x %zx \n", count, bufsiz);
- return -E2BIG;
- }
-
- //down(&chip->tpm_mutex);
-
- if ((rc = tpm_tis_send(chip, (uint8_t *) buf, count)) < 0) {
- printk("tpm_transmit: tpm_send: error %ld\n", rc);
- goto out;
- }
-
- if (chip->irq)
- goto out_recv;
-
- stop = NOW() + tpm_calc_ordinal_duration(chip, ordinal);
- do {
- uint8_t status = tpm_tis_status(chip);
- if ((status & (TPM_STS_DATA_AVAIL | TPM_STS_VALID)) ==
- (TPM_STS_DATA_AVAIL | TPM_STS_VALID))
- goto out_recv;
-
- if ((status == TPM_STS_COMMAND_READY)) {
- printk("TPM Error: Operation Canceled\n");
- rc = -ECANCELED;
- goto out;
- }
-
- msleep(TPM_TIMEOUT); /* CHECK */
- rmb();
- } while (NOW() < stop);
-
- /* Cancel the command */
- tpm_tis_cancel_cmd(chip);
- printk("TPM Operation Timed out\n");
- rc = -ETIME;
- goto out;
-
-out_recv:
- if((rc = tpm_tis_recv(chip, (uint8_t *) buf, bufsiz)) < 0) {
- printk("tpm_transmit: tpm_recv: error %d\n", rc);
- }
-out:
- //up(&chip->tpm_mutex);
- return rc;
-}
-
-static ssize_t transmit_cmd(struct tpm_chip *chip, struct tpm_cmd_t *cmd,
- int len, const char *desc)
-{
- int err;
-
- len = tpm_transmit(chip,(uint8_t *) cmd, len);
- if (len < 0)
- return len;
- if (len == TPM_ERROR_SIZE) {
- err = be32_to_cpu(cmd->header.out.return_code);
- printk("A TPM error (%d) occurred %s\n", err, desc);
- return err;
- }
- return 0;
-}
-
-int tpm_get_timeouts(struct tpm_chip *chip)
-{
- struct tpm_cmd_t tpm_cmd;
- struct timeout_t *timeout_cap;
- struct duration_t *duration_cap;
- ssize_t rc;
- uint32_t timeout;
- unsigned int scale = 1;
-
- tpm_cmd.header.in = tpm_getcap_header;
- tpm_cmd.params.getcap_in.cap = TPM_CAP_PROP;
- tpm_cmd.params.getcap_in.subcap_size = cpu_to_be32(4);
- tpm_cmd.params.getcap_in.subcap = TPM_CAP_PROP_TIS_TIMEOUT;
-
- if((rc = transmit_cmd(chip, &tpm_cmd, TPM_INTERNAL_RESULT_SIZE,
- "attempting to determine the timeouts")) != 0) {
- printk("transmit failed %d\n", rc);
- goto duration;
- }
-
- if(be32_to_cpu(tpm_cmd.header.out.return_code) != 0 ||
- be32_to_cpu(tpm_cmd.header.out.length) !=
- sizeof(tpm_cmd.header.out) + sizeof(uint32_t) + 4 * sizeof(uint32_t)) {
- return -EINVAL;
- }
-
- timeout_cap = &tpm_cmd.params.getcap_out.cap.timeout;
- /* Don't overwrite default if value is 0 */
- timeout = be32_to_cpu(timeout_cap->a);
- if(timeout && timeout < 1000) {
- /* timeouts were reported in msecs rather than usecs */
- scale = 1000;
- }
- if (timeout)
- chip->timeout_a = MICROSECS(timeout * scale); /* value is now in usecs */
- ADJUST_TIMEOUTS_TO_STANDARD(chip->timeout_a, MILLISECS(TIS_SHORT_TIMEOUT), 'a');
-
- timeout = be32_to_cpu(timeout_cap->b);
- if (timeout)
- chip->timeout_b = MICROSECS(timeout * scale); /* value is now in usecs */
- ADJUST_TIMEOUTS_TO_STANDARD(chip->timeout_b, MILLISECS(TIS_LONG_TIMEOUT), 'b');
-
- timeout = be32_to_cpu(timeout_cap->c);
- if (timeout)
- chip->timeout_c = MICROSECS(timeout * scale); /* value is now in usecs */
- ADJUST_TIMEOUTS_TO_STANDARD(chip->timeout_c, MILLISECS(TIS_SHORT_TIMEOUT), 'c');
-
- timeout = be32_to_cpu(timeout_cap->d);
- if (timeout)
- chip->timeout_d = MICROSECS(timeout * scale); /* value is now in usecs */
- ADJUST_TIMEOUTS_TO_STANDARD(chip->timeout_d, MILLISECS(TIS_SHORT_TIMEOUT), 'd');
-
-duration:
- tpm_cmd.header.in = tpm_getcap_header;
- tpm_cmd.params.getcap_in.cap = TPM_CAP_PROP;
- tpm_cmd.params.getcap_in.subcap_size = cpu_to_be32(4);
- tpm_cmd.params.getcap_in.subcap = TPM_CAP_PROP_TIS_DURATION;
-
- if((rc = transmit_cmd(chip, &tpm_cmd, TPM_INTERNAL_RESULT_SIZE,
- "attempting to determine the durations")) < 0) {
- return rc;
- }
-
- if(be32_to_cpu(tpm_cmd.header.out.return_code) != 0 ||
- be32_to_cpu(tpm_cmd.header.out.length) !=
- sizeof(tpm_cmd.header.out) + sizeof(uint32_t) + 3 * sizeof(uint32_t)) {
- return -EINVAL;
- }
-
- duration_cap = &tpm_cmd.params.getcap_out.cap.duration;
- chip->duration[TPM_SHORT] = MICROSECS(be32_to_cpu(duration_cap->tpm_short));
- chip->duration[TPM_MEDIUM] = MICROSECS(be32_to_cpu(duration_cap->tpm_medium));
- chip->duration[TPM_LONG] = MICROSECS(be32_to_cpu(duration_cap->tpm_long));
-
- /* The Broadcom BCM0102 chipset in a Dell Latitude D820 gets the above
- * value wrong and apparently reports msecs rather than usecs. So we
- * fix up the resulting too-small TPM_SHORT value to make things work.
- */
- if (chip->duration[TPM_SHORT] < MILLISECS(10)) {
- chip->duration[TPM_SHORT] = SECONDS(1);
- chip->duration[TPM_MEDIUM] *= 1000;
- chip->duration[TPM_LONG] *= 1000;
- printk("Adjusting TPM timeout parameters\n");
- }
-
- return 0;
-}
-
-
-
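-/* TPM_ContinueSelfTest (ordinal 83) asks the TPM to complete its power-on self-test; the reply is deliberately ignored. */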
-void tpm_continue_selftest(struct tpm_chip* chip) {
- uint8_t data[] = {
- 0, 193, /* TPM_TAG_RQU_COMMAND */
- 0, 0, 0, 10, /* length */
- 0, 0, 0, 83, /* TPM_ORD_ContinueSelfTest */
- };
-
- tpm_transmit(chip, data, sizeof(data));
-}
-
-ssize_t tpm_getcap(struct tpm_chip *chip, uint32_t subcap_id, cap_t *cap,
- const char *desc)
-{
- struct tpm_cmd_t tpm_cmd;
- int rc;
-
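- /* Version queries carry no subcap, so the four subcap bytes are dropped from the request length; all other capabilities send a 4-byte subcap id. */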
- tpm_cmd.header.in = tpm_getcap_header;
- if (subcap_id == CAP_VERSION_1_1 || subcap_id == CAP_VERSION_1_2) {
- tpm_cmd.params.getcap_in.cap = subcap_id;
- /*subcap field not necessary */
- tpm_cmd.params.getcap_in.subcap_size = cpu_to_be32(0);
- tpm_cmd.header.in.length -= cpu_to_be32(sizeof(uint32_t));
- } else {
- if (subcap_id == TPM_CAP_FLAG_PERM ||
- subcap_id == TPM_CAP_FLAG_VOL)
- tpm_cmd.params.getcap_in.cap = TPM_CAP_FLAG;
- else
- tpm_cmd.params.getcap_in.cap = TPM_CAP_PROP;
- tpm_cmd.params.getcap_in.subcap_size = cpu_to_be32(4);
- tpm_cmd.params.getcap_in.subcap = subcap_id;
- }
- rc = transmit_cmd(chip, &tpm_cmd, TPM_INTERNAL_RESULT_SIZE, desc);
- if (!rc)
- *cap = tpm_cmd.params.getcap_out.cap;
- return rc;
-}
-
-
-struct tpm_chip* init_tpm_tis(unsigned long baseaddr, int localities, unsigned int irq)
-{
- int i;
- unsigned long addr;
- struct tpm_chip* tpm = NULL;
- uint32_t didvid;
- uint32_t intfcaps;
- uint32_t intmask;
-
- printk("============= Init TPM TIS Driver ==============\n");
-
- /*Sanity check the localities input */
- if(localities & ~TPM_TIS_EN_LOCLALL) {
- printk("init_tpm_tis() Invalid locality specification! %X\n", localities);
- goto abort_egress;
- }
-
- printk("IOMEM Machine Base Address: %lX\n", baseaddr);
-
- /* Create the tpm data structure */
- tpm = malloc(sizeof(struct tpm_chip));
- if(tpm == NULL) {
- goto abort_egress;
- }
- __init_tpm_chip(tpm);
-
- /* Set the enabled localities - if 0 we leave default as all enabled */
- if(localities != 0) {
- tpm->enabled_localities = localities;
- }
- printk("Enabled Localities: ");
- for(i = 0; i < 5; ++i) {
- if(locality_enabled(tpm, i)) {
- printk("%d ", i);
- }
- }
- printk("\n");
-
- /* Set the base machine address */
- tpm->baseaddr = baseaddr;
-
- /* Set default timeouts */
- tpm->timeout_a = MILLISECS(TIS_SHORT_TIMEOUT);
- tpm->timeout_b = MILLISECS(TIS_LONG_TIMEOUT);
- tpm->timeout_c = MILLISECS(TIS_SHORT_TIMEOUT);
- tpm->timeout_d = MILLISECS(TIS_SHORT_TIMEOUT);
-
- /*Map the mmio pages */
- addr = tpm->baseaddr;
- for(i = 0; i < 5; ++i) {
- if(locality_enabled(tpm, i)) {
- /* Map the page in now */
- if((tpm->pages[i] = ioremap_nocache(addr, PAGE_SIZE)) == NULL) {
- printk("Unable to map iomem page a address %p\n", addr);
- goto abort_egress;
- }
-
- /* Set default locality to the first enabled one */
- if (tpm->locality < 0) {
- if(tpm_tis_request_locality(tpm, i) < 0) {
- printk("Unable to request locality %d??\n", i);
- goto abort_egress;
- }
- }
- }
- addr += PAGE_SIZE;
- }
-
-
- /* Get the vendor and device ids */
- didvid = ioread32(TPM_DID_VID(tpm, tpm->locality));
- tpm->did = didvid >> 16;
- tpm->vid = didvid & 0xFFFF;
-
-
- /* Get the revision id */
- tpm->rid = ioread8(TPM_RID(tpm, tpm->locality));
-
- printk("1.2 TPM (device-id=0x%X vendor-id = %X rev-id = %X)\n", tpm->did, tpm->vid, tpm->rid);
-
- intfcaps = ioread32(TPM_INTF_CAPS(tpm, tpm->locality));
- printk("TPM interface capabilities (0x%x):\n", intfcaps);
- if (intfcaps & TPM_INTF_BURST_COUNT_STATIC)
- printk("\tBurst Count Static\n");
- if (intfcaps & TPM_INTF_CMD_READY_INT)
- printk("\tCommand Ready Int Support\n");
- if (intfcaps & TPM_INTF_INT_EDGE_FALLING)
- printk("\tInterrupt Edge Falling\n");
- if (intfcaps & TPM_INTF_INT_EDGE_RISING)
- printk("\tInterrupt Edge Rising\n");
- if (intfcaps & TPM_INTF_INT_LEVEL_LOW)
- printk("\tInterrupt Level Low\n");
- if (intfcaps & TPM_INTF_INT_LEVEL_HIGH)
- printk("\tInterrupt Level High\n");
- if (intfcaps & TPM_INTF_LOCALITY_CHANGE_INT)
- printk("\tLocality Change Int Support\n");
- if (intfcaps & TPM_INTF_STS_VALID_INT)
- printk("\tSts Valid Int Support\n");
- if (intfcaps & TPM_INTF_DATA_AVAIL_INT)
- printk("\tData Avail Int Support\n");
-
- /* Interrupt setup */
- intmask = ioread32(TPM_INT_ENABLE(tpm, tpm->locality));
-
- intmask |= TPM_INTF_CMD_READY_INT
- | TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_DATA_AVAIL_INT
- | TPM_INTF_STS_VALID_INT;
-
- iowrite32(TPM_INT_ENABLE(tpm, tpm->locality), intmask);
-
- /* If an interrupt was requested, set it up */
- if(irq) {
- if(irq != TPM_PROBE_IRQ) {
- tpm->irq = irq;
- } else {
- /*FIXME add irq probing feature later */
- printk("IRQ probing not implemented\n");
- }
- }
-
- if(tpm->irq) {
- iowrite8(TPM_INT_VECTOR(tpm, tpm->locality), tpm->irq);
-
- if(bind_pirq(tpm->irq, 1, tpm_tis_irq_handler, tpm) != 0) {
- printk("Unabled to request irq: %u for use\n", tpm->irq);
- printk("Will use polling mode\n");
- tpm->irq = 0;
- } else {
- /* Clear all existing */
- iowrite32(TPM_INT_STATUS(tpm, tpm->locality), ioread32(TPM_INT_STATUS(tpm, tpm->locality)));
-
- /* Turn on interrupts */
- iowrite32(TPM_INT_ENABLE(tpm, tpm->locality), intmask | TPM_GLOBAL_INT_ENABLE);
- }
- }
-
- if(tpm_get_timeouts(tpm)) {
- printk("Could not get TPM timeouts and durations\n");
- goto abort_egress;
- }
- tpm_continue_selftest(tpm);
-
-
- return tpm;
-abort_egress:
- if(tpm != NULL) {
- shutdown_tpm_tis(tpm);
- }
- return NULL;
-}
-
-void shutdown_tpm_tis(struct tpm_chip* tpm){
- int i;
-
- printk("Shutting down tpm_tis device\n");
-
- iowrite32(TPM_INT_ENABLE(tpm, tpm->locality), ~TPM_GLOBAL_INT_ENABLE);
-
- /*Unmap all of the mmio pages */
- for(i = 0; i < 5; ++i) {
- if(tpm->pages[i] != NULL) {
- iounmap(tpm->pages[i], PAGE_SIZE);
- tpm->pages[i] = NULL;
- }
- }
- free(tpm);
- return;
-}
-
-
-int tpm_tis_cmd(struct tpm_chip* tpm, uint8_t* req, size_t reqlen, uint8_t** resp, size_t* resplen)
-{
- ssize_t rc;
-
- if(tpm->locality < 0) {
- printk("tpm_tis_cmd() failed! locality not set!\n");
- return -1;
- }
- if(reqlen > TPM_BUFSIZE) {
- reqlen = TPM_BUFSIZE;
- }
- memcpy(tpm->data_buffer, req, reqlen);
- /* tpm_transmit() returns a negative errno on failure; don't pass that to malloc() */
- if((rc = tpm_transmit(tpm, tpm->data_buffer, TPM_BUFSIZE)) < 0) {
- return -1;
- }
- *resplen = rc;
-
- *resp = malloc(*resplen);
- memcpy(*resp, tpm->data_buffer, *resplen);
- return 0;
-}
-
-#ifdef HAVE_LIBC
-int tpm_tis_open(struct tpm_chip* tpm)
-{
- /* Silently prevent multiple opens */
- if(tpm->fd != -1) {
- return tpm->fd;
- }
-
- tpm->fd = alloc_fd(FTYPE_TPM_TIS);
- printk("tpm_tis_open() -> %d\n", tpm->fd);
- files[tpm->fd].tpm_tis.dev = tpm;
- files[tpm->fd].tpm_tis.offset = 0;
- files[tpm->fd].tpm_tis.respgot = 0;
- return tpm->fd;
-}
-
-int tpm_tis_posix_write(int fd, const uint8_t* buf, size_t count)
-{
- struct tpm_chip* tpm;
- tpm = files[fd].tpm_tis.dev;
-
- if(tpm->locality < 0) {
- printk("tpm_tis_posix_write() failed! locality not set!\n");
- errno = EINPROGRESS;
- return -1;
- }
- if(count == 0) {
- return 0;
- }
-
- /* Clamp the command to the size of our buffer */
- if(count > TPM_BUFSIZE) {
- count = TPM_BUFSIZE;
- }
- /* Send the command now */
- memcpy(tpm->data_buffer, buf, count);
- if((tpm->data_len = tpm_transmit(tpm, tpm->data_buffer, TPM_BUFSIZE)) < 0) {
- errno = EIO;
- return -1;
- }
- return count;
-}
-
-int tpm_tis_posix_read(int fd, uint8_t* buf, size_t count)
-{
- int rc;
- struct tpm_chip* tpm;
- tpm = files[fd].tpm_tis.dev;
-
- if(count == 0) {
- return 0;
- }
-
- /* If there is no tpm resp to read, return EIO */
- if(tpm->data_len < 0) {
- errno = EIO;
- return -1;
- }
-
-
- /* Handle EOF case */
- if(files[fd].tpm_tis.offset >= tpm->data_len) {
- rc = 0;
- } else {
- rc = min(tpm->data_len - files[fd].tpm_tis.offset, count);
- memcpy(buf, tpm->data_buffer + files[fd].tpm_tis.offset, rc);
- }
- files[fd].tpm_tis.offset += rc;
- return rc;
-}
-
-int tpm_tis_posix_fstat(int fd, struct stat* buf)
-{
- struct tpm_chip* tpm;
- tpm = files[fd].tpm_tis.dev;
-
- buf->st_mode = O_RDWR;
- buf->st_uid = 0;
- buf->st_gid = 0;
- buf->st_size = be32_to_cpu(*((uint32_t*)(tpm->data_buffer + 2)));
- buf->st_atime = buf->st_mtime = buf->st_ctime = time(NULL);
- return 0;
-}
-
-
-#endif
diff --git a/extras/mini-os/tpmback.c b/extras/mini-os/tpmback.c
deleted file mode 100644
index 00b66e8..0000000
--- a/extras/mini-os/tpmback.c
+++ /dev/null
@@ -1,1136 +0,0 @@
-/*
- * Copyright (c) 2010-2012 United States Government, as represented by
- * the Secretary of Defense. All rights reserved.
- *
- * This code has been derived from drivers/xen/tpmback/tpmback.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (c) 2005, IBM Corporation
- *
- * which was itself derived from drivers/xen/netback/netback.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (c) 2002-2004, K A Fraser
- *
- * This code has also been derived from drivers/xen/tpmback/xenbus.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (C) 2005 IBM Corporation
- * Copyright (C) 2005 Rusty Russell <rusty at rustcorp.com.au>
- *
- * This code has also been derived from drivers/xen/tpmback/interface.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (c) 2005, IBM Corporation
- *
- * which was itself also derived from drivers/xen/netback/interface.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (c) 2004, Keir Fraser
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2
- * of the License
- */
-#include <mini-os/os.h>
-#include <mini-os/xenbus.h>
-#include <mini-os/events.h>
-#include <errno.h>
-#include <mini-os/gnttab.h>
-#include <xen/io/xenbus.h>
-#include <xen/io/tpmif.h>
-#include <xen/io/protocols.h>
-#include <mini-os/xmalloc.h>
-#include <time.h>
-#include <mini-os/tpmback.h>
-#include <mini-os/lib.h>
-#include <fcntl.h>
-#include <mini-os/mm.h>
-#include <mini-os/posix/sys/mman.h>
-#include <mini-os/semaphore.h>
-#include <mini-os/wait.h>
-
-
-#ifndef HAVE_LIBC
-#define strtoul simple_strtoul
-#endif
-
-//#define TPMBACK_PRINT_DEBUG
-#ifdef TPMBACK_PRINT_DEBUG
-#define TPMBACK_DEBUG(fmt,...) printk("Tpmback:Debug("__FILE__":%d) " fmt, __LINE__, ##__VA_ARGS__)
-#define TPMBACK_DEBUG_MORE(fmt,...) printk(fmt, ##__VA_ARGS__)
-#else
-#define TPMBACK_DEBUG(fmt,...)
-#endif
-#define TPMBACK_ERR(fmt,...) printk("Tpmback:Error " fmt, ##__VA_ARGS__)
-#define TPMBACK_LOG(fmt,...) printk("Tpmback:Info " fmt, ##__VA_ARGS__)
-
-#define min(a,b) (((a) < (b)) ? (a) : (b))
-
-/* Default size of the tpmif array at initialization */
-#define DEF_ARRAY_SIZE 1
-
-/* tpmif and tpmdev flags */
-#define TPMIF_CLOSED 1
-#define TPMIF_REQ_READY 2
-
-struct tpmif {
- domid_t domid;
- unsigned int handle;
-
- char* fe_path;
- char* fe_state_path;
-
- /* Locally bound event channel*/
- evtchn_port_t evtchn;
-
- /* Shared page */
- tpmif_shared_page_t *page;
-
- enum xenbus_state state;
- enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
-
- unsigned char uuid[16];
- void* opaque;
-
- /* state flags */
- int flags;
-};
-typedef struct tpmif tpmif_t;
-
-struct tpmback_dev {
-
- tpmif_t** tpmlist;
- unsigned long num_tpms;
- unsigned long num_alloc;
-
- struct gntmap map;
-
- /* True if at least one tpmif has a request to be handled */
- int flags;
-
- xenbus_event_queue events;
-
- /* Callbacks */
- void (*open_callback)(domid_t, unsigned int);
- void (*close_callback)(domid_t, unsigned int);
-};
-typedef struct tpmback_dev tpmback_dev_t;
-
-enum tpm_ev_enum { EV_NONE, EV_NEWFE, EV_STCHNG };
-
-/* Global objects */
-static struct thread* eventthread = NULL;
-static tpmback_dev_t gtpmdev = {
- .tpmlist = NULL,
- .num_tpms = 0,
- .num_alloc = 0,
- .flags = TPMIF_CLOSED,
- .events = NULL,
- .open_callback = NULL,
- .close_callback = NULL,
-};
-struct wait_queue_head waitq;
-int globalinit = 0;
-
-/************************************
- * TPMIF SORTED ARRAY FUNCTIONS
- * tpmback_dev_t.tpmlist is a sorted array, sorted by domid and then handle number
- * Duplicates are not allowed
- * **********************************/
-
-static void tpmif_req_ready(tpmif_t* tpmif) {
- tpmif->flags |= TPMIF_REQ_READY;
- gtpmdev.flags |= TPMIF_REQ_READY;
-}
-
-static void tpmdev_check_req(void) {
- int i;
- int flags;
- local_irq_save(flags);
- for(i = 0; i < gtpmdev.num_tpms; ++i) {
- if(gtpmdev.tpmlist[i]->flags & TPMIF_REQ_READY) {
- gtpmdev.flags |= TPMIF_REQ_READY;
- local_irq_restore(flags);
- return;
- }
- }
- gtpmdev.flags &= ~TPMIF_REQ_READY;
- local_irq_restore(flags);
-}
-
-static void tpmif_req_finished(tpmif_t* tpmif) {
- tpmif->flags &= ~TPMIF_REQ_READY;
- tpmdev_check_req();
-}
-
-int __get_tpmif_index(int st, int n, domid_t domid, unsigned int handle)
-{
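- /* Recursive binary search over the sorted tpmlist: st is the start of the current window and n its length. */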
- int i = st + n / 2;
- tpmif_t* tmp;
-
- if( n <= 0 )
- return -1;
-
- tmp = gtpmdev.tpmlist[i];
- if(domid == tmp->domid && tmp->handle == handle) {
- return i;
- } else if ( (domid < tmp->domid) ||
- (domid == tmp->domid && handle < tmp->handle)) {
- return __get_tpmif_index(st, n/2, domid, handle);
- } else {
- return __get_tpmif_index(i + 1, n/2 - ((n + 1) % 2), domid, handle);
- }
-}
-
-/* Returns the array index of the tpmif domid/handle. Returns -1 if no such tpmif exists */
-int get_tpmif_index(domid_t domid, unsigned int handle)
-{
- int flags;
- int index;
- local_irq_save(flags);
- index = __get_tpmif_index(0, gtpmdev.num_tpms, domid, handle);
- local_irq_restore(flags);
- return index;
-}
-
-/* Returns the tpmif domid/handle or NULL if none exists */
-tpmif_t* get_tpmif(domid_t domid, unsigned int handle)
-{
- int flags;
- int i;
- tpmif_t* ret;
- local_irq_save(flags);
- i = get_tpmif_index(domid, handle);
- if (i < 0) {
- ret = NULL;
- } else {
- ret = gtpmdev.tpmlist[i];
- }
- local_irq_restore(flags);
- return ret;
-}
-
-/* Remove the given tpmif. Returns 0 if it was removed, -1 if it was not removed */
-int remove_tpmif(tpmif_t* tpmif)
-{
- int i, j;
- char* err;
- int flags;
- local_irq_save(flags);
-
- /* Find the index in the array if it exists */
- i = get_tpmif_index(tpmif->domid, tpmif->handle);
- if (i < 0) {
- goto error;
- }
-
- /* Remove the interface from the list */
- for(j = i; j < gtpmdev.num_tpms - 1; ++j) {
- gtpmdev.tpmlist[j] = gtpmdev.tpmlist[j+1];
- }
- gtpmdev.tpmlist[j] = NULL;
- --gtpmdev.num_tpms;
-
- /* If removed tpm was the only ready tpm, then we need to check and turn off the ready flag */
- tpmdev_check_req();
-
- local_irq_restore(flags);
-
- /* Stop listening for events on this tpm interface */
- if((err = xenbus_unwatch_path_token(XBT_NIL, tpmif->fe_state_path, tpmif->fe_state_path))) {
- TPMBACK_ERR("Unable to unwatch path token `%s' Error was %s Ignoring..\n", tpmif->fe_state_path, err);
- free(err);
- }
-
- return 0;
-error:
- local_irq_restore(flags);
- return -1;
-}
-
-/* Insert tpmif into dev->tpmlist. Returns 0 on success and non-zero on error.
- * It is an error to insert a tpmif with the same domid and handle number
- * as something already in the list */
-int insert_tpmif(tpmif_t* tpmif)
-{
- int flags;
- unsigned int i, j;
- tpmif_t* tmp;
- char* err;
- char path[512];
-
- local_irq_save(flags);
-
- /*Check if we need to allocate more space */
- if (gtpmdev.num_tpms == gtpmdev.num_alloc) {
- gtpmdev.num_alloc *= 2;
- gtpmdev.tpmlist = realloc(gtpmdev.tpmlist, gtpmdev.num_alloc * sizeof(tpmif_t*));
- }
-
- /*Find where to put the new interface */
- for(i = 0; i < gtpmdev.num_tpms; ++i)
- {
- tmp = gtpmdev.tpmlist[i];
- if(tpmif->domid == tmp->domid && tpmif->handle == tmp->handle) {
- TPMBACK_ERR("Tried to insert duplicate tpm interface %u/%u\n", (unsigned int) tpmif->domid, tpmif->handle);
- goto error;
- }
- if((tpmif->domid < tmp->domid) ||
- (tpmif->domid == tmp->domid && tpmif->handle < tmp->handle)) {
- break;
- }
- }
-
- /*Shift all the tpm pointers past i down one */
- for(j = gtpmdev.num_tpms; j > i; --j) {
- gtpmdev.tpmlist[j] = gtpmdev.tpmlist[j-1];
- }
-
- /*Add the new interface */
- gtpmdev.tpmlist[i] = tpmif;
- ++gtpmdev.num_tpms;
-
- /*Should not be needed, anything inserted with ready flag is probably an error */
- tpmdev_check_req();
-
- local_irq_restore(flags);
-
- snprintf(path, 512, "backend/vtpm/%u/%u/feature-protocol-v2", (unsigned int) tpmif->domid, tpmif->handle);
- if ((err = xenbus_write(XBT_NIL, path, "1")))
- {
- /* if we got an error here we should carefully remove the interface and then return */
- TPMBACK_ERR("Unable to write feature-protocol-v2 node: %s\n", err);
- free(err);
- remove_tpmif(tpmif);
- goto error_post_irq;
- }
-
- /*Listen for state changes on the new interface */
- if((err = xenbus_watch_path_token(XBT_NIL, tpmif->fe_state_path, tpmif->fe_state_path, &gtpmdev.events)))
- {
- /* if we got an error here we should carefully remove the interface and then return */
- TPMBACK_ERR("Unable to watch path token `%s' Error was %s\n", tpmif->fe_state_path, err);
- free(err);
- remove_tpmif(tpmif);
- goto error_post_irq;
- }
- return 0;
-error:
- local_irq_restore(flags);
-error_post_irq:
- return -1;
-}
-
-
-/*****************
- * CHANGE BACKEND STATE
- * *****************/
-/*Attempts to change the backend state in xenstore
- * returns 0 on success and non-zero on error */
-int tpmif_change_state(tpmif_t* tpmif, enum xenbus_state state)
-{
- int tempst;
- char path[512];
- char *value;
- char *err;
- enum xenbus_state readst;
- TPMBACK_DEBUG("Backend state change %u/%u from=%d to=%d\n", (unsigned int) tpmif->domid, tpmif->handle, tpmif->state, state);
- if (tpmif->state == state)
- return 0;
-
- snprintf(path, 512, "backend/vtpm/%u/%u/state", (unsigned int) tpmif->domid, tpmif->handle);
-
- if((err = xenbus_read(XBT_NIL, path, &value))) {
- TPMBACK_ERR("Unable to read backend state %s, error was %s\n", path, err);
- free(err);
- return -1;
- }
- if(sscanf(value, "%d", &tempst) != 1) {
- TPMBACK_ERR("Non integer value (%s) in %s ??\n", value, path);
- free(value);
- return -1;
- }
- readst = (enum xenbus_state) tempst;
- free(value);
-
- /* It's possible that the backend state got updated by hotplug or something else behind our back */
- if(readst != tpmif->state) {
- TPMBACK_DEBUG("tpm interface state was %d but xenstore state was %d!\n", tpmif->state, readst);
- tpmif->state = readst;
- }
-
- /* If the state isn't changing, don't update xenstore, because we don't want to fire extraneous events */
- if(tpmif->state == state) {
- return 0;
- }
-
- /*update xenstore*/
- snprintf(path, 512, "backend/vtpm/%u/%u", (unsigned int) tpmif->domid, tpmif->handle);
- if((err = xenbus_printf(XBT_NIL, path, "state", "%u", state))) {
- TPMBACK_ERR("Error writing to xenstore %s, error was %s new state=%d\n", path, err, state);
- free(err);
- return -1;
- }
-
- tpmif->state = state;
-
- return 0;
-}
-/**********************************
- * TPMIF CREATION AND DELETION
- * *******************************/
-static tpmif_t* __init_tpmif(domid_t domid, unsigned int handle)
-{
- tpmif_t* tpmif;
- tpmif = malloc(sizeof(*tpmif));
- tpmif->domid = domid;
- tpmif->handle = handle;
- tpmif->fe_path = NULL;
- tpmif->fe_state_path = NULL;
- tpmif->state = XenbusStateInitialising;
- tpmif->status = DISCONNECTED;
- tpmif->page = NULL;
- tpmif->flags = 0;
- tpmif->opaque = NULL;
- memset(tpmif->uuid, 0, sizeof(tpmif->uuid));
- return tpmif;
-}
-
-void __free_tpmif(tpmif_t* tpmif)
-{
- if(tpmif->fe_path) {
- free(tpmif->fe_path);
- }
- if(tpmif->fe_state_path) {
- free(tpmif->fe_state_path);
- }
- free(tpmif);
-}
-/* Creates a new tpm interface, adds it to the sorted array and returns it.
- * returns NULL on error
- * If the tpm interface already exists, it is returned*/
-tpmif_t* new_tpmif(domid_t domid, unsigned int handle)
-{
- tpmif_t* tpmif;
- char* err;
- char path[512];
-
- /* Make sure we haven't already created this tpm
- * Double events can occur */
- if((tpmif = get_tpmif(domid, handle)) != NULL) {
- return tpmif;
- }
-
- tpmif = __init_tpmif(domid, handle);
-
- /* Get the uuid from xenstore */
- snprintf(path, 512, "backend/vtpm/%u/%u/uuid", (unsigned int) domid, handle);
- if((!xenbus_read_uuid(path, tpmif->uuid))) {
- TPMBACK_ERR("Error reading %s\n", path);
- goto error;
- }
-
- if(tpmif_change_state(tpmif, XenbusStateInitWait)) {
- goto error;
- }
-
- snprintf(path, 512, "backend/vtpm/%u/%u/frontend", (unsigned int) domid, handle);
- if((err = xenbus_read(XBT_NIL, path, &tpmif->fe_path))) {
- TPMBACK_ERR("Error creating new tpm instance xenbus_read(%s), Error = %s", path, err);
- free(err);
- goto error;
- }
-
- /*Set the state path */
- tpmif->fe_state_path = malloc(strlen(tpmif->fe_path) + 7);
- strcpy(tpmif->fe_state_path, tpmif->fe_path);
- strcat(tpmif->fe_state_path, "/state");
-
- if(insert_tpmif(tpmif)) {
- goto error;
- }
- TPMBACK_DEBUG("New tpmif %u/%u\n", (unsigned int) tpmif->domid, tpmif->handle);
- /* Do the callback now */
- if(gtpmdev.open_callback) {
- gtpmdev.open_callback(tpmif->domid, tpmif->handle);
- }
- return tpmif;
-error:
- __free_tpmif(tpmif);
- return NULL;
-
-}
-
-/* Removes tpmif from dev->tpmlist and frees its memory */
-void free_tpmif(tpmif_t* tpmif)
-{
- char* err;
- char path[512];
- TPMBACK_DEBUG("Free tpmif %u/%u\n", (unsigned int) tpmif->domid, tpmif->handle);
- if(tpmif->flags & TPMIF_CLOSED) {
- TPMBACK_ERR("Tried to free an instance twice! Theres a bug somewhere!\n");
- BUG();
- }
- tpmif->flags = TPMIF_CLOSED;
-
- tpmif_change_state(tpmif, XenbusStateClosing);
-
- /* Unmap shared page and unbind event channel */
- if(tpmif->status == CONNECTED) {
- tpmif->status = DISCONNECTING;
- mask_evtchn(tpmif->evtchn);
-
- if(gntmap_munmap(&gtpmdev.map, (unsigned long)tpmif->page, 1)) {
- TPMBACK_ERR("%u/%u Error occurred while trying to unmap shared page\n", (unsigned int) tpmif->domid, tpmif->handle);
- }
-
- unbind_evtchn(tpmif->evtchn);
- }
- tpmif->status = DISCONNECTED;
- tpmif_change_state(tpmif, XenbusStateClosed);
-
- /* Do the callback now */
- if(gtpmdev.close_callback) {
- gtpmdev.close_callback(tpmif->domid, tpmif->handle);
- }
-
- /* remove from array */
- remove_tpmif(tpmif);
-
- /* Wake up anyone possibly waiting on this interface and let them exit */
- wake_up(&waitq);
- schedule();
-
- /* Remove the old xenbus entries */
- snprintf(path, 512, "backend/vtpm/%u/%u", (unsigned int) tpmif->domid, tpmif->handle);
- if((err = xenbus_rm(XBT_NIL, path))) {
- TPMBACK_ERR("Error cleaning up xenbus entries path=%s error=%s\n", path, err);
- free(err);
- }
-
- TPMBACK_LOG("Frontend %u/%u disconnected\n", (unsigned int) tpmif->domid, tpmif->handle);
-
- /* free memory */
- __free_tpmif(tpmif);
-
-}
-
-/**********************
- * REMAINING TPMBACK FUNCTIONS
- * ********************/
-
-/*Event channel handler */
-void tpmback_handler(evtchn_port_t port, struct pt_regs *regs, void *data)
-{
- tpmif_t* tpmif = (tpmif_t*) data;
- tpmif_shared_page_t *pg = tpmif->page;
-
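- /* vTPM v2 shared-page state machine: the frontend sets SUBMIT to post a request or CANCEL to abort; the backend answers with FINISH (see send_response()) or IDLE. */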
- switch (pg->state)
- {
- case TPMIF_STATE_SUBMIT:
- TPMBACK_DEBUG("EVENT CHANNEL FIRE %u/%u\n", (unsigned int) tpmif->domid, tpmif->handle);
- tpmif_req_ready(tpmif);
- wake_up(&waitq);
- break;
- case TPMIF_STATE_CANCEL:
- /* If we are busy with a request, do nothing */
- if (tpmif->flags & TPMIF_REQ_READY)
- return;
- /* Acknowledge the cancellation if we are idle */
- pg->state = TPMIF_STATE_IDLE;
- wmb();
- notify_remote_via_evtchn(tpmif->evtchn);
- return;
- default:
- /* Spurious wakeup; do nothing */
- return;
- }
-}
-
-/* Connect to frontend */
-int connect_fe(tpmif_t* tpmif)
-{
- char path[512];
- char* err, *value;
- uint32_t domid;
- grant_ref_t ringref;
- evtchn_port_t evtchn;
-
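- /* Handshake: read ring-ref, event-channel and feature-protocol-v2 from the frontend, map the shared page, bind the event channel, then advertise ready. */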
- /* If already connected then quit */
- if (tpmif->status == CONNECTED) {
- TPMBACK_DEBUG("%u/%u tried to connect while it was already connected?\n", (unsigned int) tpmif->domid, tpmif->handle);
- return 0;
- }
-
- /* Fetch the grant reference */
- snprintf(path, 512, "%s/ring-ref", tpmif->fe_path);
- if((err = xenbus_read(XBT_NIL, path, &value))) {
- TPMBACK_ERR("Error creating new tpm instance xenbus_read(%s) Error = %s", path, err);
- free(err);
- return -1;
- }
- if(sscanf(value, "%d", &ringref) != 1) {
- TPMBACK_ERR("Non integer value (%s) in %s ??\n", value, path);
- free(value);
- return -1;
- }
- free(value);
-
-
- /* Fetch the event channel*/
- snprintf(path, 512, "%s/event-channel", tpmif->fe_path);
- if((err = xenbus_read(XBT_NIL, path, &value))) {
- TPMBACK_ERR("Error creating new tpm instance xenbus_read(%s) Error = %s", path, err);
- free(err);
- return -1;
- }
- if(sscanf(value, "%d", &evtchn) != 1) {
- TPMBACK_ERR("Non integer value (%s) in %s ??\n", value, path);
- free(value);
- return -1;
- }
- free(value);
-
- /* Check that protocol v2 is being used */
- snprintf(path, 512, "%s/feature-protocol-v2", tpmif->fe_path);
- if((err = xenbus_read(XBT_NIL, path, &value))) {
- TPMBACK_ERR("Unable to read %s during tpmback initialization! error = %s\n", path, err);
- free(err);
- return -1;
- }
- if(strcmp(value, "1")) {
- TPMBACK_ERR("%s has an invalid value (%s)\n", path, value);
- free(value);
- return -1;
- }
- free(value);
-
- domid = tpmif->domid;
- if((tpmif->page = gntmap_map_grant_refs(&gtpmdev.map, 1, &domid, 0, &ringref, PROT_READ | PROT_WRITE)) == NULL) {
- TPMBACK_ERR("Failed to map grant reference %u/%u\n", (unsigned int) tpmif->domid, tpmif->handle);
- return -1;
- }
-
- /*Bind the event channel */
- if((evtchn_bind_interdomain(tpmif->domid, evtchn, tpmback_handler, tpmif, &tpmif->evtchn)))
- {
- TPMBACK_ERR("%u/%u Unable to bind to interdomain event channel!\n", (unsigned int) tpmif->domid, tpmif->handle);
- goto error_post_map;
- }
- unmask_evtchn(tpmif->evtchn);
-
- /* Write the ready flag and change status to connected */
- snprintf(path, 512, "backend/vtpm/%u/%u", (unsigned int) tpmif->domid, tpmif->handle);
- if((err = xenbus_printf(XBT_NIL, path, "ready", "%u", 1))) {
- TPMBACK_ERR("%u/%u Unable to write ready flag on connect_fe()\n", (unsigned int) tpmif->domid, tpmif->handle);
- free(err);
- goto error_post_evtchn;
- }
- tpmif->status = CONNECTED;
- if((tpmif_change_state(tpmif, XenbusStateConnected))){
- goto error_post_evtchn;
- }
-
- TPMBACK_LOG("Frontend %u/%u connected\n", (unsigned int) tpmif->domid, tpmif->handle);
-
- return 0;
-error_post_evtchn:
- mask_evtchn(tpmif->evtchn);
- unbind_evtchn(tpmif->evtchn);
-error_post_map:
- gntmap_munmap(&gtpmdev.map, (unsigned long)tpmif->page, 1);
- return -1;
-}
-
-static void disconnect_fe(tpmif_t* tpmif)
-{
- if (tpmif->status == CONNECTED) {
- tpmif->status = DISCONNECTING;
- mask_evtchn(tpmif->evtchn);
-
- if(gntmap_munmap(&gtpmdev.map, (unsigned long)tpmif->page, 1)) {
- TPMBACK_ERR("%u/%u Error occurred while trying to unmap shared page\n", (unsigned int) tpmif->domid, tpmif->handle);
- }
-
- unbind_evtchn(tpmif->evtchn);
- }
- tpmif->status = DISCONNECTED;
- tpmif_change_state(tpmif, XenbusStateInitWait);
-
- TPMBACK_LOG("Frontend %u/%u disconnected\n", (unsigned int) tpmif->domid, tpmif->handle);
-}
-
-static int frontend_changed(tpmif_t* tpmif)
-{
- int state = xenbus_read_integer(tpmif->fe_state_path);
- if(state < 0) {
- state = XenbusStateUnknown;
- }
-
- TPMBACK_DEBUG("Frontend %u/%u state changed to %d\n", (unsigned int) tpmif->domid, tpmif->handle, state);
-
- switch (state) {
- case XenbusStateInitialising:
- break;
-
- case XenbusStateInitialised:
- case XenbusStateConnected:
- if(connect_fe(tpmif)) {
- TPMBACK_ERR("Failed to connect to front end %u/%u\n", (unsigned int) tpmif->domid, tpmif->handle);
- tpmif_change_state(tpmif, XenbusStateClosed);
- return -1;
- }
- break;
-
- case XenbusStateClosing:
- tpmif_change_state(tpmif, XenbusStateClosing);
- break;
-
- case XenbusStateClosed:
- disconnect_fe(tpmif);
- break;
-
- case XenbusStateUnknown: /* keep it here */
- free_tpmif(tpmif);
- break;
-
- default:
- TPMBACK_DEBUG("BAD STATE CHANGE %u/%u state = %d for tpmif\n", (unsigned int) tpmif->domid, tpmif->handle, state);
- return -1;
- }
- return 0;
-}
-
-
-/* parses the string that comes out of xenbus_watch_wait_return. */
-static int parse_eventstr(const char* evstr, domid_t* domid, unsigned int* handle)
-{
- int ret;
- char cmd[41]; /* room for a 40-char %40s match plus the terminating NUL */
- char* err;
- char* value;
- unsigned int udomid = 0;
- tpmif_t* tpmif;
- /* First check for new frontends; this occurs when /backend/vtpm/<domid>/<handle> gets created. Note we want the sscanf to fail on the last %s */
- if (sscanf(evstr, "backend/vtpm/%u/%u/%40s", &udomid, handle, cmd) == 2) {
- *domid = udomid;
- /* Make sure the entry exists; if this event triggered because the entry disappeared, ignore it */
- if((err = xenbus_read(XBT_NIL, evstr, &value))) {
- free(err);
- return EV_NONE;
- }
- free(value);
- /* Make sure the tpmif entry does not already exist, this should not happen */
- if((tpmif = get_tpmif(*domid, *handle)) != NULL) {
- TPMBACK_DEBUG("Duplicate tpm entries! %u %u\n", tpmif->domid, tpmif->handle);
- return EV_NONE;
- }
- return EV_NEWFE;
- } else if((ret = sscanf(evstr, "/local/domain/%u/device/vtpm/%u/%40s", &udomid, handle, cmd)) == 3) {
- *domid = udomid;
- if (!strcmp(cmd, "state"))
- return EV_STCHNG;
- }
- return EV_NONE;
-}
-
-void handle_backend_event(char* evstr) {
- tpmif_t* tpmif;
- domid_t domid;
- unsigned int handle;
- int event;
-
- TPMBACK_DEBUG("Xenbus Event: %s\n", evstr);
-
- event = parse_eventstr(evstr, &domid, &handle);
-
- switch(event) {
- case EV_NEWFE:
- if(new_tpmif(domid, handle) == NULL) {
- TPMBACK_ERR("Failed to create new tpm instance %u/%u\n", (unsigned int) domid, handle);
- }
- wake_up(&waitq);
- break;
- case EV_STCHNG:
- if((tpmif = get_tpmif(domid, handle))) {
- frontend_changed(tpmif);
- } else {
- TPMBACK_DEBUG("Event Received for non-existant tpm! instance=%u/%u xenbus_event=%s\n", (unsigned int) domid, handle, evstr);
- }
- break;
- }
-}
-
-/* Runs through the given path and creates events recursively
- * for all of its children.
- * @path - xenstore path to scan */
-static void generate_backend_events(const char* path)
-{
- char* err;
- int i, len;
- char **dirs;
- char *entry;
-
- if((err = xenbus_ls(XBT_NIL, path, &dirs)) != NULL) {
- free(err);
- return;
- }
-
- for(i = 0; dirs[i] != NULL; ++i) {
- len = strlen(path) + strlen(dirs[i]) + 2;
- entry = malloc(len);
- snprintf(entry, len, "%s/%s", path, dirs[i]);
-
- /* Generate and handle event for the entry itself */
- handle_backend_event(entry);
-
- /* Do children */
- generate_backend_events(entry);
-
- /* Cleanup */
- free(entry);
- free(dirs[i]);
- }
- free(dirs);
- return;
-}
-
-void* tpmback_get_opaque(domid_t domid, unsigned int handle)
-{
- tpmif_t* tpmif;
- if((tpmif = get_tpmif(domid, handle)) == NULL) {
- TPMBACK_DEBUG("get_opaque() failed, %u/%u is an invalid frontend\n", (unsigned int) domid, handle);
- return NULL;
- }
-
- return tpmif->opaque;
-}
-
-int tpmback_set_opaque(domid_t domid, unsigned int handle, void *opaque)
-{
- tpmif_t* tpmif;
- if((tpmif = get_tpmif(domid, handle)) == NULL) {
- TPMBACK_DEBUG("set_opaque() failed, %u/%u is an invalid frontend\n", (unsigned int) domid, handle);
- return -1;
- }
-
- tpmif->opaque = opaque;
- return 0;
-}
-
-unsigned char* tpmback_get_uuid(domid_t domid, unsigned int handle)
-{
- tpmif_t* tpmif;
- if((tpmif = get_tpmif(domid, handle)) == NULL) {
- TPMBACK_DEBUG("get_uuid() failed, %u/%u is an invalid frontend\n", (unsigned int) domid, handle);
- return NULL;
- }
-
- return tpmif->uuid;
-}
-
-int tpmback_get_peercontext(domid_t domid, unsigned int handle, void* buffer, int buflen)
-{
- tpmif_t* tpmif;
- if((tpmif = get_tpmif(domid, handle)) == NULL) {
- TPMBACK_DEBUG("get_uuid() failed, %u/%u is an invalid frontend\n", (unsigned int) domid, handle);
- return -1;
- }
-
- return evtchn_get_peercontext(tpmif->evtchn, buffer, buflen);
-}
-
-static void event_listener(void)
-{
- const char* bepath = "backend/vtpm";
- char **path;
- char* err;
-
- /* Setup the backend device watch */
- if((err = xenbus_watch_path_token(XBT_NIL, bepath, bepath, &gtpmdev.events)) != NULL) {
- TPMBACK_ERR("xenbus_watch_path_token(%s) failed with error %s!\n", bepath, err);
- free(err);
- goto egress;
- }
-
- /* Check for any frontends that connected before we set the watch.
- * This is almost guaranteed to happen if both domains are started
- * immediately one after the other.
- * We do this by manually generating events on everything in the backend
- * path */
- generate_backend_events(bepath);
-
- /* Wait and listen for changes in frontend connections */
- while(1) {
- path = xenbus_wait_for_watch_return(&gtpmdev.events);
-
- /*If quit flag was set then exit */
- if(gtpmdev.flags & TPMIF_CLOSED) {
- TPMBACK_DEBUG("listener thread got quit event. Exiting..\n");
- free(path);
- break;
- }
- handle_backend_event(*path);
- free(path);
-
- }
-
- if((err = xenbus_unwatch_path_token(XBT_NIL, bepath, bepath)) != NULL) {
- free(err);
- }
-egress:
- return;
-}
-
-void event_thread(void* p) {
- event_listener();
-}
-
-void init_tpmback(void (*open_cb)(domid_t, unsigned int), void (*close_cb)(domid_t, unsigned int))
-{
- if(!globalinit) {
- init_waitqueue_head(&waitq);
- globalinit = 1;
- }
- printk("============= Init TPM BACK ================\n");
- gtpmdev.tpmlist = malloc(sizeof(tpmif_t*) * DEF_ARRAY_SIZE);
- gtpmdev.num_alloc = DEF_ARRAY_SIZE;
- gtpmdev.num_tpms = 0;
- gtpmdev.flags = 0;
-
- gtpmdev.open_callback = open_cb;
- gtpmdev.close_callback = close_cb;
-
- eventthread = create_thread("tpmback-listener", event_thread, NULL);
-
-}
-
-void shutdown_tpmback(void)
-{
- TPMBACK_LOG("Shutting down tpm backend\n");
- /* Set the quit flag */
- gtpmdev.flags = TPMIF_CLOSED;
-
- //printk("num tpms is %d\n", gtpmdev.num_tpms);
- /*Free all backend instances */
- while(gtpmdev.num_tpms) {
- free_tpmif(gtpmdev.tpmlist[0]);
- }
- free(gtpmdev.tpmlist);
- gtpmdev.tpmlist = NULL;
- gtpmdev.num_alloc = 0;
-
- /* Wake up anyone possibly waiting on the device and let them exit */
- wake_up(&waitq);
- schedule();
-}
-
-static void init_tpmcmd(tpmcmd_t* tpmcmd, domid_t domid, unsigned int handle, void *opaque)
-{
- tpmcmd->domid = domid;
- tpmcmd->locality = -1;
- tpmcmd->handle = handle;
- tpmcmd->opaque = opaque;
- tpmcmd->req = NULL;
- tpmcmd->req_len = 0;
- tpmcmd->resp = NULL;
- tpmcmd->resp_len = 0;
-}
-
-tpmcmd_t* get_request(tpmif_t* tpmif) {
- tpmcmd_t* cmd;
- tpmif_shared_page_t *shr;
- unsigned int offset;
- int flags;
-#ifdef TPMBACK_PRINT_DEBUG
- int i;
-#endif
-
- local_irq_save(flags);
-
- /* Allocate the cmd object to hold the data */
- if((cmd = malloc(sizeof(*cmd))) == NULL) {
- goto error;
- }
- init_tpmcmd(cmd, tpmif->domid, tpmif->handle, tpmif->opaque);
-
- shr = tpmif->page;
- cmd->req_len = shr->length;
- cmd->locality = shr->locality;
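- /* The request payload sits after the shared-page header and the array of extra-page grant references (4 bytes per reference). */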
- offset = sizeof(*shr) + 4*shr->nr_extra_pages;
- if (offset > PAGE_SIZE || offset + cmd->req_len > PAGE_SIZE) {
- TPMBACK_ERR("%u/%u Command size too long for shared page!\n", (unsigned int) tpmif->domid, tpmif->handle);
- goto error;
- }
- /* Allocate the buffer */
- if(cmd->req_len) {
- if((cmd->req = malloc(cmd->req_len)) == NULL) {
- goto error;
- }
- }
- /* Copy the bits from the shared page(s) */
- memcpy(cmd->req, offset + (uint8_t*)shr, cmd->req_len);
-
-#ifdef TPMBACK_PRINT_DEBUG
- TPMBACK_DEBUG("Received Tpm Command from %u/%u of size %u", (unsigned int) tpmif->domid, tpmif->handle, cmd->req_len);
- for(i = 0; i < cmd->req_len; ++i) {
- if (!(i % 30)) {
- TPMBACK_DEBUG_MORE("\n");
- }
- TPMBACK_DEBUG_MORE("%02hhX ", cmd->req[i]);
- }
- TPMBACK_DEBUG_MORE("\n\n");
-#endif
-
- local_irq_restore(flags);
- return cmd;
-error:
- if(cmd != NULL) {
- if (cmd->req != NULL) {
- free(cmd->req);
- cmd->req = NULL;
- }
- free(cmd);
- cmd = NULL;
- }
- local_irq_restore(flags);
- return NULL;
-
-}
-
-void send_response(tpmcmd_t* cmd, tpmif_t* tpmif)
-{
- tpmif_shared_page_t *shr;
- unsigned int offset;
- int flags;
-#ifdef TPMBACK_PRINT_DEBUG
- int i;
-#endif
-
- local_irq_save(flags);
-
- shr = tpmif->page;
- shr->length = cmd->resp_len;
-
- offset = sizeof(*shr) + 4*shr->nr_extra_pages;
- if (offset > PAGE_SIZE || offset + cmd->resp_len > PAGE_SIZE) {
- TPMBACK_ERR("%u/%u Command size too long for shared page!\n", (unsigned int) tpmif->domid, tpmif->handle);
- goto error;
- }
- memcpy(offset + (uint8_t*)shr, cmd->resp, cmd->resp_len);
-
-#ifdef TPMBACK_PRINT_DEBUG
- TPMBACK_DEBUG("Sent response to %u/%u of size %u", (unsigned int) tpmif->domid, tpmif->handle, cmd->resp_len);
- for(i = 0; i < cmd->resp_len; ++i) {
- if (!(i % 30)) {
- TPMBACK_DEBUG_MORE("\n");
- }
- TPMBACK_DEBUG_MORE("%02hhX ", cmd->resp[i]);
- }
- TPMBACK_DEBUG_MORE("\n\n");
-#endif
- /* clear the ready flag and send the event channel notice to the frontend */
- tpmif_req_finished(tpmif);
- barrier();
- shr->state = TPMIF_STATE_FINISH;
- wmb();
- notify_remote_via_evtchn(tpmif->evtchn);
-error:
- local_irq_restore(flags);
- return;
-}
-
-tpmcmd_t* tpmback_req_any(void)
-{
- int i;
- /* Block until something has a request */
- wait_event(waitq, (gtpmdev.flags & (TPMIF_REQ_READY | TPMIF_CLOSED)));
-
-	/* Check if we're shutting down */
-	if(gtpmdev.flags & TPMIF_CLOSED) {
-		/* If something was waiting for us to give up the queue so it can shut down, let it finish */
- schedule();
- return NULL;
- }
-
- for(i = 0; i < gtpmdev.num_tpms; ++i) {
- if(gtpmdev.tpmlist[i]->flags & TPMIF_REQ_READY) {
- return get_request(gtpmdev.tpmlist[i]);
- }
- }
-
- TPMBACK_ERR("backend request ready flag was set but no interfaces were actually ready\n");
- return NULL;
-}
-
-tpmcmd_t* tpmback_req(domid_t domid, unsigned int handle)
-{
- tpmif_t* tpmif;
- tpmif = get_tpmif(domid, handle);
- if(tpmif == NULL) {
- return NULL;
- }
-
- wait_event(waitq, (tpmif->flags & (TPMIF_REQ_READY | TPMIF_CLOSED) || gtpmdev.flags & TPMIF_CLOSED));
-
-	/* Check if we're shutting down */
-	if(tpmif->flags & TPMIF_CLOSED || gtpmdev.flags & TPMIF_CLOSED) {
-		/* If something was waiting for us to give up the queue so it can free this instance, let it finish */
- schedule();
- return NULL;
- }
-
- return get_request(tpmif);
-}
-
-void tpmback_resp(tpmcmd_t* tpmcmd)
-{
- tpmif_t* tpmif;
-
-	/* Get the associated interface; if it doesn't exist then just quit */
- tpmif = get_tpmif(tpmcmd->domid, tpmcmd->handle);
- if(tpmif == NULL) {
- TPMBACK_ERR("Tried to send a reponse to non existant frontend %u/%u\n", (unsigned int) tpmcmd->domid, tpmcmd->handle);
- goto end;
- }
-
- if(!(tpmif->flags & TPMIF_REQ_READY)) {
- TPMBACK_ERR("Tried to send response to a frontend that was not waiting for one %u/%u\n", (unsigned int) tpmcmd->domid, tpmcmd->handle);
- goto end;
- }
-
- /* Send response to frontend */
- send_response(tpmcmd, tpmif);
-
-end:
- if(tpmcmd->req != NULL) {
- free(tpmcmd->req);
- }
- free(tpmcmd);
- return;
-}
-
-int tpmback_wait_for_frontend_connect(domid_t *domid, unsigned int *handle)
-{
- tpmif_t* tpmif;
- int flags;
- wait_event(waitq, ((gtpmdev.num_tpms > 0) || gtpmdev.flags & TPMIF_CLOSED));
- if(gtpmdev.flags & TPMIF_CLOSED) {
- return -1;
- }
- local_irq_save(flags);
- tpmif = gtpmdev.tpmlist[0];
- *domid = tpmif->domid;
- *handle = tpmif->handle;
- local_irq_restore(flags);
-
- return 0;
-}
-
-int tpmback_num_frontends(void)
-{
- return gtpmdev.num_tpms;
-}
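
For reference, the tpmback API removed above was designed to be driven
by a simple service loop. A minimal sketch against the deleted
functions -- process_tpm_command() is a hypothetical placeholder for
the TPM emulator call that fills in cmd->resp and cmd->resp_len:

    #include <mini-os/tpmback.h>

    static void vtpm_service_loop(void)
    {
        tpmcmd_t *cmd;

        init_tpmback(NULL, NULL);          /* no open/close callbacks */
        while ((cmd = tpmback_req_any()) != NULL) {
            process_tpm_command(cmd);      /* hypothetical: fills cmd->resp */
            tpmback_resp(cmd);             /* sends the response, frees cmd */
        }
        shutdown_tpmback();
    }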
diff --git a/extras/mini-os/tpmfront.c b/extras/mini-os/tpmfront.c
deleted file mode 100644
index 6049244..0000000
--- a/extras/mini-os/tpmfront.c
+++ /dev/null
@@ -1,631 +0,0 @@
-/*
- * Copyright (c) 2010-2012 United States Government, as represented by
- * the Secretary of Defense. All rights reserved.
- *
- * This code has been derived from drivers/char/tpm_vtpm.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (C) 2006 IBM Corporation
- *
- * This code has also been derived from drivers/char/tpm_xen.c
- * from the xen 2.6.18 linux kernel
- *
- * Copyright (c) 2005, IBM Corporation
- *
- * which was itself derived from drivers/xen/netfront/netfront.c
- * from the linux kernel
- *
- * Copyright (c) 2002-2004, K A Fraser
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
- */
-#include <mini-os/os.h>
-#include <mini-os/xenbus.h>
-#include <mini-os/xmalloc.h>
-#include <mini-os/events.h>
-#include <mini-os/wait.h>
-#include <mini-os/gnttab.h>
-#include <xen/io/xenbus.h>
-#include <xen/io/tpmif.h>
-#include <mini-os/tpmfront.h>
-#include <mini-os/lib.h>
-#include <fcntl.h>
-
-//#define TPMFRONT_PRINT_DEBUG
-#ifdef TPMFRONT_PRINT_DEBUG
-#define TPMFRONT_DEBUG(fmt,...) printk("Tpmfront:Debug("__FILE__":%d) " fmt, __LINE__, ##__VA_ARGS__)
-#define TPMFRONT_DEBUG_MORE(fmt,...) printk(fmt, ##__VA_ARGS__)
-#else
-#define TPMFRONT_DEBUG(fmt,...)
-#endif
-#define TPMFRONT_ERR(fmt,...) printk("Tpmfront:Error " fmt, ##__VA_ARGS__)
-#define TPMFRONT_LOG(fmt,...) printk("Tpmfront:Info " fmt, ##__VA_ARGS__)
-
-#define min(a,b) (((a) < (b)) ? (a) : (b))
-
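-/* Event channel handler: wakes any thread waiting in tpmfront_recv()
- * once the backend marks the request finished (or cancelled). */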
-void tpmfront_handler(evtchn_port_t port, struct pt_regs *regs, void *data) {
- struct tpmfront_dev* dev = (struct tpmfront_dev*) data;
- tpmif_shared_page_t *shr = dev->page;
-	/* If we get a response when we didn't make a request, just ignore it */
- if(!dev->waiting) {
- return;
- }
-
- switch (shr->state) {
- case TPMIF_STATE_FINISH: /* request was completed */
- case TPMIF_STATE_IDLE: /* request was cancelled */
- break;
- default:
- /* Spurious wakeup; do nothing, request is still pending */
- return;
- }
-
- dev->waiting = 0;
-#ifdef HAVE_LIBC
- if(dev->fd >= 0) {
- files[dev->fd].read = 1;
- }
-#endif
- wake_up(&dev->waitq);
-}
-
-static int publish_xenbus(struct tpmfront_dev* dev) {
- xenbus_transaction_t xbt;
- int retry;
- char* err;
- /* Write the grant reference and event channel to xenstore */
-again:
- if((err = xenbus_transaction_start(&xbt))) {
- TPMFRONT_ERR("Unable to start xenbus transaction, error was %s\n", err);
- free(err);
- return -1;
- }
-
- if((err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", (unsigned int) dev->ring_ref))) {
- TPMFRONT_ERR("Unable to write %s/ring-ref, error was %s\n", dev->nodename, err);
- free(err);
- goto abort_transaction;
- }
-
- if((err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", (unsigned int) dev->evtchn))) {
- TPMFRONT_ERR("Unable to write %s/event-channel, error was %s\n", dev->nodename, err);
- free(err);
- goto abort_transaction;
- }
-
- if((err = xenbus_transaction_end(xbt, 0, &retry))) {
- TPMFRONT_ERR("Unable to complete xenbus transaction, error was %s\n", err);
- free(err);
- return -1;
- }
- if(retry) {
- goto again;
- }
-
- return 0;
-abort_transaction:
- if((err = xenbus_transaction_end(xbt, 1, &retry))) {
- free(err);
- }
- return -1;
-}
-
-static int wait_for_backend_connect(xenbus_event_queue* events, char* path)
-{
- int state;
-
- TPMFRONT_LOG("Waiting for backend connection..\n");
- /* Wait for the backend to connect */
- while(1) {
- state = xenbus_read_integer(path);
-      if (state < 0)
- state = XenbusStateUnknown;
- switch(state) {
- /* Bad states, we quit with error */
- case XenbusStateUnknown:
- case XenbusStateClosing:
- case XenbusStateClosed:
- TPMFRONT_ERR("Unable to connect to backend\n");
- return -1;
- /* If backend is connected then break out of loop */
- case XenbusStateConnected:
- TPMFRONT_LOG("Backend Connected\n");
- return 0;
- default:
- xenbus_wait_for_watch(events);
- }
- }
-
-}
-
-static int wait_for_backend_closed(xenbus_event_queue* events, char* path)
-{
- int state;
-
- TPMFRONT_LOG("Waiting for backend to close..\n");
- while(1) {
- state = xenbus_read_integer(path);
-      if (state < 0)
- state = XenbusStateUnknown;
- switch(state) {
- case XenbusStateUnknown:
- TPMFRONT_ERR("Backend Unknown state, forcing shutdown\n");
- return -1;
- case XenbusStateClosed:
- TPMFRONT_LOG("Backend Closed\n");
- return 0;
- case XenbusStateInitWait:
- TPMFRONT_LOG("Backend Closed (waiting for reconnect)\n");
- return 0;
- default:
- xenbus_wait_for_watch(events);
- }
- }
-
-}
-
-static int wait_for_backend_state_changed(struct tpmfront_dev* dev, XenbusState state) {
- char* err;
- int ret = 0;
- xenbus_event_queue events = NULL;
- char path[512];
-
- snprintf(path, 512, "%s/state", dev->bepath);
-   /* Set up the watch to wait for the backend */
- if((err = xenbus_watch_path_token(XBT_NIL, path, path, &events))) {
- TPMFRONT_ERR("Could not set a watch on %s, error was %s\n", path, err);
- free(err);
- return -1;
- }
-
- /* Do the actual wait loop now */
- switch(state) {
- case XenbusStateConnected:
- ret = wait_for_backend_connect(&events, path);
- break;
- case XenbusStateClosed:
- ret = wait_for_backend_closed(&events, path);
- break;
- default:
- TPMFRONT_ERR("Bad wait state %d, ignoring\n", state);
- }
-
- if((err = xenbus_unwatch_path_token(XBT_NIL, path, path))) {
- TPMFRONT_ERR("Unable to unwatch %s, error was %s, ignoring..\n", path, err);
- free(err);
- }
- return ret;
-}
-
-static int tpmfront_connect(struct tpmfront_dev* dev)
-{
- char* err;
- /* Create shared page */
- dev->page = (tpmif_shared_page_t *)alloc_page();
- if(dev->page == NULL) {
- TPMFRONT_ERR("Unable to allocate page for shared memory\n");
- goto error;
- }
- memset(dev->page, 0, PAGE_SIZE);
- dev->ring_ref = gnttab_grant_access(dev->bedomid, virt_to_mfn(dev->page), 0);
- TPMFRONT_DEBUG("grant ref is %lu\n", (unsigned long) dev->ring_ref);
-
-   /* Create event channel */
- if(evtchn_alloc_unbound(dev->bedomid, tpmfront_handler, dev, &dev->evtchn)) {
- TPMFRONT_ERR("Unable to allocate event channel\n");
- goto error_postmap;
- }
- unmask_evtchn(dev->evtchn);
- TPMFRONT_DEBUG("event channel is %lu\n", (unsigned long) dev->evtchn);
-
- /* Write the entries to xenstore */
- if(publish_xenbus(dev)) {
- goto error_postevtchn;
- }
-
- /* Change state to connected */
- dev->state = XenbusStateConnected;
-
- /* Tell the backend that we are ready */
- if((err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%u", dev->state))) {
- TPMFRONT_ERR("Unable to write to xenstore %s/state, value=%u", dev->nodename, XenbusStateConnected);
- free(err);
- goto error;
- }
-
- return 0;
-error_postevtchn:
- mask_evtchn(dev->evtchn);
- unbind_evtchn(dev->evtchn);
-error_postmap:
- gnttab_end_access(dev->ring_ref);
- free_page(dev->page);
-error:
- return -1;
-}
-
-struct tpmfront_dev* init_tpmfront(const char* _nodename)
-{
- struct tpmfront_dev* dev;
- const char* nodename;
- char path[512];
- char* value, *err;
- unsigned long long ival;
-
- printk("============= Init TPM Front ================\n");
-
- dev = malloc(sizeof(struct tpmfront_dev));
- memset(dev, 0, sizeof(struct tpmfront_dev));
-
-#ifdef HAVE_LIBC
- dev->fd = -1;
-#endif
-
- nodename = _nodename ? _nodename : "device/vtpm/0";
- dev->nodename = strdup(nodename);
-
- init_waitqueue_head(&dev->waitq);
-
- /* Get backend domid */
- snprintf(path, 512, "%s/backend-id", dev->nodename);
- if((err = xenbus_read(XBT_NIL, path, &value))) {
- TPMFRONT_ERR("Unable to read %s during tpmfront initialization! error = %s\n", path, err);
- free(err);
- goto error;
- }
- if(sscanf(value, "%llu", &ival) != 1) {
- TPMFRONT_ERR("%s has non-integer value (%s)\n", path, value);
- free(value);
- goto error;
- }
- free(value);
- dev->bedomid = ival;
-
- /* Get backend xenstore path */
- snprintf(path, 512, "%s/backend", dev->nodename);
- if((err = xenbus_read(XBT_NIL, path, &dev->bepath))) {
- TPMFRONT_ERR("Unable to read %s during tpmfront initialization! error = %s\n", path, err);
- free(err);
- goto error;
- }
-
- /* Publish protocol v2 feature */
- snprintf(path, 512, "%s/feature-protocol-v2", dev->nodename);
- if ((err = xenbus_write(XBT_NIL, path, "1")))
- {
- TPMFRONT_ERR("Unable to write feature-protocol-v2 node: %s\n", err);
- free(err);
- goto error;
- }
-
- /* Create and publish grant reference and event channel */
- if (tpmfront_connect(dev)) {
- goto error;
- }
-
- /* Wait for backend to connect */
- if( wait_for_backend_state_changed(dev, XenbusStateConnected)) {
- goto error;
- }
-
- /* Ensure backend is also using protocol v2 */
- snprintf(path, 512, "%s/feature-protocol-v2", dev->bepath);
- if((err = xenbus_read(XBT_NIL, path, &value))) {
- TPMFRONT_ERR("Unable to read %s during tpmfront initialization! error = %s\n", path, err);
- free(err);
- goto error;
- }
- if(strcmp(value, "1")) {
- TPMFRONT_ERR("%s has an invalid value (%s)\n", path, value);
- free(value);
- goto error;
- }
- free(value);
-
- TPMFRONT_LOG("Initialization Completed successfully\n");
-
- return dev;
-
-error:
- shutdown_tpmfront(dev);
- return NULL;
-}
-void shutdown_tpmfront(struct tpmfront_dev* dev)
-{
- char* err;
- char path[512];
- if(dev == NULL) {
- return;
- }
- TPMFRONT_LOG("Shutting down tpmfront\n");
- /* disconnect */
- if(dev->state == XenbusStateConnected) {
- /* Tell backend we are closing */
- dev->state = XenbusStateClosing;
- if((err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%u", (unsigned int) dev->state))) {
- TPMFRONT_ERR("Unable to write to %s, error was %s", dev->nodename, err);
- free(err);
- }
-
- /* Clean up xenstore entries */
- snprintf(path, 512, "%s/event-channel", dev->nodename);
- if((err = xenbus_rm(XBT_NIL, path))) {
- free(err);
- }
- snprintf(path, 512, "%s/ring-ref", dev->nodename);
- if((err = xenbus_rm(XBT_NIL, path))) {
- free(err);
- }
-
- /* Tell backend we are closed */
- dev->state = XenbusStateClosed;
- if((err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%u", (unsigned int) dev->state))) {
- TPMFRONT_ERR("Unable to write to %s, error was %s", dev->nodename, err);
- free(err);
- }
-
- /* Wait for the backend to close and unmap shared pages, ignore any errors */
- wait_for_backend_state_changed(dev, XenbusStateClosed);
-
- /* Prepare for a later reopen (possibly by a kexec'd kernel) */
- dev->state = XenbusStateInitialising;
- if((err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%u", (unsigned int) dev->state))) {
- TPMFRONT_ERR("Unable to write to %s, error was %s", dev->nodename, err);
- free(err);
- }
-
- /* Close event channel and unmap shared page */
- mask_evtchn(dev->evtchn);
- unbind_evtchn(dev->evtchn);
- gnttab_end_access(dev->ring_ref);
-
- free_page(dev->page);
- }
-
- /* Cleanup memory usage */
- if(dev->respbuf) {
- free(dev->respbuf);
- }
- if(dev->bepath) {
- free(dev->bepath);
- }
- if(dev->nodename) {
- free(dev->nodename);
- }
- free(dev);
-}
-
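-/* Marshal a command into the shared page and notify the backend;
- * returns 0 on success, -1 if disconnected or the message is too big. */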
-int tpmfront_send(struct tpmfront_dev* dev, const uint8_t* msg, size_t length)
-{
- unsigned int offset;
- tpmif_shared_page_t *shr = NULL;
-#ifdef TPMFRONT_PRINT_DEBUG
- int i;
-#endif
- /* Error Checking */
- if(dev == NULL || dev->state != XenbusStateConnected) {
- TPMFRONT_ERR("Tried to send message through disconnected frontend\n");
- return -1;
- }
- shr = dev->page;
-
-#ifdef TPMFRONT_PRINT_DEBUG
- TPMFRONT_DEBUG("Sending Msg to backend size=%u", (unsigned int) length);
- for(i = 0; i < length; ++i) {
- if(!(i % 30)) {
- TPMFRONT_DEBUG_MORE("\n");
- }
- TPMFRONT_DEBUG_MORE("%02X ", msg[i]);
- }
- TPMFRONT_DEBUG_MORE("\n");
-#endif
-
- /* Copy to shared pages now */
- offset = sizeof(*shr);
- if (length + offset > PAGE_SIZE) {
- TPMFRONT_ERR("Message too long for shared page\n");
- return -1;
- }
- memcpy(offset + (uint8_t*)shr, msg, length);
- shr->length = length;
- barrier();
- shr->state = TPMIF_STATE_SUBMIT;
-
- dev->waiting = 1;
- dev->resplen = 0;
-#ifdef HAVE_LIBC
- if(dev->fd >= 0) {
- files[dev->fd].read = 0;
- files[dev->fd].tpmfront.respgot = 0;
- files[dev->fd].tpmfront.offset = 0;
- }
-#endif
- wmb();
- notify_remote_via_evtchn(dev->evtchn);
- return 0;
-}
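-
-/* Block until the backend completes the pending command, then copy the
- * response into dev->respbuf and hand it back via *msg / *length. */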
-int tpmfront_recv(struct tpmfront_dev* dev, uint8_t** msg, size_t *length)
-{
- unsigned int offset;
- tpmif_shared_page_t *shr = NULL;
-#ifdef TPMFRONT_PRINT_DEBUG
-	int i;
-#endif
- if(dev == NULL || dev->state != XenbusStateConnected) {
- TPMFRONT_ERR("Tried to receive message from disconnected frontend\n");
- return -1;
- }
-	/* Wait for the response */
- wait_event(dev->waitq, (!dev->waiting));
- shr = dev->page;
-
- /* Initialize */
- *msg = NULL;
- *length = 0;
- offset = sizeof(*shr);
-
- if (shr->state != TPMIF_STATE_FINISH)
- goto quit;
-
- *length = shr->length;
-
- if (*length + offset > PAGE_SIZE) {
- TPMFRONT_ERR("Reply too long for shared page\n");
- return -1;
- }
-
- /* Alloc the buffer */
- if(dev->respbuf) {
- free(dev->respbuf);
- }
- *msg = dev->respbuf = malloc(*length);
- dev->resplen = *length;
-
- /* Copy the bits */
- memcpy(*msg, offset + (uint8_t*)shr, *length);
-
-#ifdef TPMFRONT_PRINT_DEBUG
- TPMFRONT_DEBUG("Received response from backend size=%u", (unsigned int) *length);
- for(i = 0; i < *length; ++i) {
- if(!(i % 30)) {
- TPMFRONT_DEBUG_MORE("\n");
- }
- TPMFRONT_DEBUG_MORE("%02X ", (*msg)[i]);
- }
- TPMFRONT_DEBUG_MORE("\n");
-#endif
-#ifdef HAVE_LIBC
- if(dev->fd >= 0) {
- files[dev->fd].tpmfront.respgot = 1;
- }
-#endif
-quit:
- return 0;
-}
-
-int tpmfront_cmd(struct tpmfront_dev* dev, uint8_t* req, size_t reqlen, uint8_t** resp, size_t* resplen)
-{
- int rc;
- if((rc = tpmfront_send(dev, req, reqlen))) {
- return rc;
- }
- if((rc = tpmfront_recv(dev, resp, resplen))) {
- return rc;
- }
-
- return 0;
-}
-
-int tpmfront_set_locality(struct tpmfront_dev* dev, int locality)
-{
- if (!dev || !dev->page)
- return -1;
- dev->page->locality = locality;
- return 0;
-}
-
-#ifdef HAVE_LIBC
-#include <errno.h>
-int tpmfront_open(struct tpmfront_dev* dev)
-{
- /* Silently prevent multiple opens */
- if(dev->fd != -1) {
- return dev->fd;
- }
-
- dev->fd = alloc_fd(FTYPE_TPMFRONT);
- printk("tpmfront_open(%s) -> %d\n", dev->nodename, dev->fd);
- files[dev->fd].tpmfront.dev = dev;
- files[dev->fd].tpmfront.offset = 0;
- files[dev->fd].tpmfront.respgot = 0;
- return dev->fd;
-}
-
-int tpmfront_posix_write(int fd, const uint8_t* buf, size_t count)
-{
- int rc;
- struct tpmfront_dev* dev;
- dev = files[fd].tpmfront.dev;
-
- if(count == 0) {
- return 0;
- }
-
- /* Return an error if we are already processing a command */
- if(dev->waiting) {
- errno = EINPROGRESS;
- return -1;
- }
- /* Send the command now */
- if((rc = tpmfront_send(dev, buf, count)) != 0) {
- errno = EIO;
- return -1;
- }
- return count;
-}
-
-int tpmfront_posix_read(int fd, uint8_t* buf, size_t count)
-{
- int rc;
- uint8_t* dummybuf;
- size_t dummysz;
- struct tpmfront_dev* dev;
-
- dev = files[fd].tpmfront.dev;
-
- if(count == 0) {
- return 0;
- }
-
- /* get the response if we haven't already */
- if(files[dev->fd].tpmfront.respgot == 0) {
- if ((rc = tpmfront_recv(dev, &dummybuf, &dummysz)) != 0) {
- errno = EIO;
- return -1;
- }
- }
-
- /* handle EOF case */
- if(files[dev->fd].tpmfront.offset >= dev->resplen) {
- return 0;
- }
-
- /* Compute the number of bytes and do the copy operation */
- if((rc = min(count, dev->resplen - files[dev->fd].tpmfront.offset)) != 0) {
- memcpy(buf, dev->respbuf + files[dev->fd].tpmfront.offset, rc);
- files[dev->fd].tpmfront.offset += rc;
- }
-
- return rc;
-}
-
-int tpmfront_posix_fstat(int fd, struct stat* buf)
-{
- uint8_t* dummybuf;
- size_t dummysz;
- int rc;
- struct tpmfront_dev* dev = files[fd].tpmfront.dev;
-
- /* If we have a response waiting, then read it now from the backend
-    * so we can get its length */
- if(dev->waiting || (files[dev->fd].read == 1 && !files[dev->fd].tpmfront.respgot)) {
- if ((rc = tpmfront_recv(dev, &dummybuf, &dummysz)) != 0) {
- errno = EIO;
- return -1;
- }
- }
-
- buf->st_mode = O_RDWR;
- buf->st_uid = 0;
- buf->st_gid = 0;
- buf->st_size = dev->resplen;
- buf->st_atime = buf->st_mtime = buf->st_ctime = time(NULL);
-
- return 0;
-}
-
-
-#endif
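
For reference, a minimal consumer of the deleted tpmfront API would
look roughly like this (a sketch only; req/reqlen stand for an
already-marshalled TPM command):

    #include <mini-os/tpmfront.h>

    static int vtpm_roundtrip(uint8_t *req, size_t reqlen)
    {
        struct tpmfront_dev *dev;
        uint8_t *resp;
        size_t resplen;

        dev = init_tpmfront(NULL);         /* NULL selects device/vtpm/0 */
        if (dev == NULL)
            return -1;
        if (tpmfront_cmd(dev, req, reqlen, &resp, &resplen) == 0)
            printk("vTPM replied with %u bytes\n", (unsigned int) resplen);
        shutdown_tpmfront(dev);            /* also frees the response buffer */
        return 0;
    }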
diff --git a/extras/mini-os/xenbus/xenbus.c b/extras/mini-os/xenbus/xenbus.c
deleted file mode 100644
index 934f23b..0000000
--- a/extras/mini-os/xenbus/xenbus.c
+++ /dev/null
@@ -1,870 +0,0 @@
-/*
- ****************************************************************************
- * (C) 2006 - Cambridge University
- ****************************************************************************
- *
- * File: xenbus.c
- * Author: Steven Smith (sos22 at cam.ac.uk)
- * Changes: Grzegorz Milos (gm281 at cam.ac.uk)
- * Changes: John D. Ramsdell
- *
- * Date: Jun 2006, changes Aug 2005
- *
- * Environment: Xen Minimal OS
- * Description: Minimal implementation of xenbus
- *
- ****************************************************************************
- **/
-#include <inttypes.h>
-#include <mini-os/os.h>
-#include <mini-os/mm.h>
-#include <mini-os/traps.h>
-#include <mini-os/lib.h>
-#include <mini-os/xenbus.h>
-#include <mini-os/events.h>
-#include <mini-os/errno.h>
-#include <mini-os/sched.h>
-#include <mini-os/wait.h>
-#include <xen/io/xs_wire.h>
-#include <mini-os/spinlock.h>
-#include <mini-os/xmalloc.h>
-
-#define min(x,y) ({ \
- typeof(x) tmpx = (x); \
- typeof(y) tmpy = (y); \
- tmpx < tmpy ? tmpx : tmpy; \
- })
-
-#ifdef XENBUS_DEBUG
-#define DEBUG(_f, _a...) \
- printk("MINI_OS(file=xenbus.c, line=%d) " _f , __LINE__, ## _a)
-#else
-#define DEBUG(_f, _a...) ((void)0)
-#endif
-
-static struct xenstore_domain_interface *xenstore_buf;
-static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
-DECLARE_WAIT_QUEUE_HEAD(xenbus_watch_queue);
-
-xenbus_event_queue xenbus_events;
-static struct watch {
- char *token;
- xenbus_event_queue *events;
- struct watch *next;
-} *watches;
-struct xenbus_req_info
-{
- int in_use:1;
- struct wait_queue_head waitq;
- void *reply;
-};
-
-#define NR_REQS 32
-static struct xenbus_req_info req_info[NR_REQS];
-
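-/* Copy len bytes from a xenstore ring at offset off, handling the
- * wrap-around at XENSTORE_RING_SIZE. */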
-static void memcpy_from_ring(const void *Ring,
- void *Dest,
- int off,
- int len)
-{
- int c1, c2;
- const char *ring = Ring;
- char *dest = Dest;
- c1 = min(len, XENSTORE_RING_SIZE - off);
- c2 = len - c1;
- memcpy(dest, ring + off, c1);
- memcpy(dest + c1, ring, c2);
-}
-
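-/* Block until a watch event is queued on *queue, unlink it, and return
- * its path/token pair; the caller frees the result. */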
-char **xenbus_wait_for_watch_return(xenbus_event_queue *queue)
-{
- struct xenbus_event *event;
- DEFINE_WAIT(w);
- if (!queue)
- queue = &xenbus_events;
- while (!(event = *queue)) {
- add_waiter(w, xenbus_watch_queue);
- schedule();
- }
- remove_waiter(w, xenbus_watch_queue);
- *queue = event->next;
- return &event->path;
-}
-
-void xenbus_wait_for_watch(xenbus_event_queue *queue)
-{
- char **ret;
- if (!queue)
- queue = &xenbus_events;
- ret = xenbus_wait_for_watch_return(queue);
- if (ret)
- free(ret);
- else
- printk("unexpected path returned by watch\n");
-}
-
-char* xenbus_wait_for_value(const char* path, const char* value, xenbus_event_queue *queue)
-{
- if (!queue)
- queue = &xenbus_events;
- for(;;)
- {
- char *res, *msg;
- int r;
-
- msg = xenbus_read(XBT_NIL, path, &res);
- if(msg) return msg;
-
- r = strcmp(value,res);
- free(res);
-
- if(r==0) break;
- else xenbus_wait_for_watch(queue);
- }
- return NULL;
-}
-
-char *xenbus_switch_state(xenbus_transaction_t xbt, const char* path, XenbusState state)
-{
- char *current_state;
- char *msg = NULL;
- char *msg2 = NULL;
- char value[2];
- XenbusState rs;
- int xbt_flag = 0;
- int retry = 0;
-
- do {
- if (xbt == XBT_NIL) {
- msg = xenbus_transaction_start(&xbt);
- if (msg) goto exit;
- xbt_flag = 1;
- }
-
- msg = xenbus_read(xbt, path, ¤t_state);
- if (msg) goto exit;
-
- rs = (XenbusState) (current_state[0] - '0');
- free(current_state);
- if (rs == state) {
- msg = NULL;
- goto exit;
- }
-
- snprintf(value, 2, "%d", state);
- msg = xenbus_write(xbt, path, value);
-
-exit:
- if (xbt_flag) {
- msg2 = xenbus_transaction_end(xbt, 0, &retry);
- xbt = XBT_NIL;
- }
- if (msg == NULL && msg2 != NULL)
- msg = msg2;
- } while (retry);
-
- return msg;
-}
-
-char *xenbus_wait_for_state_change(const char* path, XenbusState *state, xenbus_event_queue *queue)
-{
- if (!queue)
- queue = &xenbus_events;
- for(;;)
- {
- char *res, *msg;
- XenbusState rs;
-
- msg = xenbus_read(XBT_NIL, path, &res);
- if(msg) return msg;
-
-        rs = (XenbusState) (res[0] - '0');
- free(res);
-
- if (rs == *state)
- xenbus_wait_for_watch(queue);
- else {
- *state = rs;
- break;
- }
- }
- return NULL;
-}
-
-
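-/* Response thread: demultiplexes incoming xenstore messages into watch
- * event queues and per-request reply slots. */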
-static void xenbus_thread_func(void *ign)
-{
- struct xsd_sockmsg msg;
- unsigned prod = xenstore_buf->rsp_prod;
-
- for (;;)
- {
- wait_event(xb_waitq, prod != xenstore_buf->rsp_prod);
- while (1)
- {
- prod = xenstore_buf->rsp_prod;
- DEBUG("Rsp_cons %d, rsp_prod %d.\n", xenstore_buf->rsp_cons,
- xenstore_buf->rsp_prod);
- if (xenstore_buf->rsp_prod - xenstore_buf->rsp_cons < sizeof(msg))
- break;
- rmb();
- memcpy_from_ring(xenstore_buf->rsp,
- &msg,
- MASK_XENSTORE_IDX(xenstore_buf->rsp_cons),
- sizeof(msg));
- DEBUG("Msg len %d, %d avail, id %d.\n",
- msg.len + sizeof(msg),
- xenstore_buf->rsp_prod - xenstore_buf->rsp_cons,
- msg.req_id);
- if (xenstore_buf->rsp_prod - xenstore_buf->rsp_cons <
- sizeof(msg) + msg.len)
- break;
-
- DEBUG("Message is good.\n");
-
- if(msg.type == XS_WATCH_EVENT)
- {
- struct xenbus_event *event = malloc(sizeof(*event) + msg.len);
- xenbus_event_queue *events = NULL;
- char *data = (char*)event + sizeof(*event);
- struct watch *watch;
-
- memcpy_from_ring(xenstore_buf->rsp,
- data,
- MASK_XENSTORE_IDX(xenstore_buf->rsp_cons + sizeof(msg)),
- msg.len);
-
- event->path = data;
- event->token = event->path + strlen(event->path) + 1;
-
- xenstore_buf->rsp_cons += msg.len + sizeof(msg);
-
- for (watch = watches; watch; watch = watch->next)
- if (!strcmp(watch->token, event->token)) {
- events = watch->events;
- break;
- }
-
- if (events) {
- event->next = *events;
- *events = event;
- wake_up(&xenbus_watch_queue);
- } else {
- printk("unexpected watch token %s\n", event->token);
- free(event);
- }
- }
-
- else
- {
- req_info[msg.req_id].reply = malloc(sizeof(msg) + msg.len);
- memcpy_from_ring(xenstore_buf->rsp,
- req_info[msg.req_id].reply,
- MASK_XENSTORE_IDX(xenstore_buf->rsp_cons),
- msg.len + sizeof(msg));
- xenstore_buf->rsp_cons += msg.len + sizeof(msg);
- wake_up(&req_info[msg.req_id].waitq);
- }
- }
- }
-}
-
-static void xenbus_evtchn_handler(evtchn_port_t port, struct pt_regs *regs,
- void *ign)
-{
- wake_up(&xb_waitq);
-}
-
-static int nr_live_reqs;
-static DEFINE_SPINLOCK(req_lock);
-static DECLARE_WAIT_QUEUE_HEAD(req_wq);
-
-/* Release a xenbus identifier */
-static void release_xenbus_id(int id)
-{
- BUG_ON(!req_info[id].in_use);
- spin_lock(&req_lock);
- req_info[id].in_use = 0;
- nr_live_reqs--;
- if (nr_live_reqs == NR_REQS - 1)
- wake_up(&req_wq);
- spin_unlock(&req_lock);
-}
-
-/* Allocate an identifier for a xenbus request. Blocks if none are
- available. */
-static int allocate_xenbus_id(void)
-{
- static int probe;
- int o_probe;
-
- while (1)
- {
- spin_lock(&req_lock);
- if (nr_live_reqs < NR_REQS)
- break;
- spin_unlock(&req_lock);
- wait_event(req_wq, (nr_live_reqs < NR_REQS));
- }
-
- o_probe = probe;
- for (;;)
- {
- if (!req_info[o_probe].in_use)
- break;
- o_probe = (o_probe + 1) % NR_REQS;
- BUG_ON(o_probe == probe);
- }
- nr_live_reqs++;
- req_info[o_probe].in_use = 1;
- probe = (o_probe + 1) % NR_REQS;
- spin_unlock(&req_lock);
- init_waitqueue_head(&req_info[o_probe].waitq);
-
- return o_probe;
-}
-
-/* Initialise xenbus. */
-void init_xenbus(void)
-{
- int err;
- DEBUG("init_xenbus called.\n");
- xenstore_buf = mfn_to_virt(start_info.store_mfn);
- create_thread("xenstore", xenbus_thread_func, NULL);
- DEBUG("buf at %p.\n", xenstore_buf);
- err = bind_evtchn(start_info.store_evtchn,
- xenbus_evtchn_handler,
- NULL);
- unmask_evtchn(start_info.store_evtchn);
- printk("xenbus initialised on irq %d mfn %#lx\n",
- err, start_info.store_mfn);
-}
-
-void fini_xenbus(void)
-{
-}
-
-/* Send data to xenbus. This can block. All of the requests are seen
- by xenbus as if sent atomically. The header is added
- automatically, using type %type, req_id %req_id, and trans_id
- %trans_id. */
-static void xb_write(int type, int req_id, xenbus_transaction_t trans_id,
- const struct write_req *req, int nr_reqs)
-{
- XENSTORE_RING_IDX prod;
- int r;
- int len = 0;
- const struct write_req *cur_req;
- int req_off;
- int total_off;
- int this_chunk;
- struct xsd_sockmsg m = {.type = type, .req_id = req_id,
- .tx_id = trans_id };
- struct write_req header_req = { &m, sizeof(m) };
-
- for (r = 0; r < nr_reqs; r++)
- len += req[r].len;
- m.len = len;
- len += sizeof(m);
-
- cur_req = &header_req;
-
- BUG_ON(len > XENSTORE_RING_SIZE);
- /* Wait for the ring to drain to the point where we can send the
- message. */
- prod = xenstore_buf->req_prod;
- if (prod + len - xenstore_buf->req_cons > XENSTORE_RING_SIZE)
- {
- /* Wait for there to be space on the ring */
- DEBUG("prod %d, len %d, cons %d, size %d; waiting.\n",
- prod, len, xenstore_buf->req_cons, XENSTORE_RING_SIZE);
- wait_event(xb_waitq,
- xenstore_buf->req_prod + len - xenstore_buf->req_cons <=
- XENSTORE_RING_SIZE);
- DEBUG("Back from wait.\n");
- prod = xenstore_buf->req_prod;
- }
-
- /* We're now guaranteed to be able to send the message without
- overflowing the ring. Do so. */
- total_off = 0;
- req_off = 0;
- while (total_off < len)
- {
- this_chunk = min(cur_req->len - req_off,
- XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod));
- memcpy((char *)xenstore_buf->req + MASK_XENSTORE_IDX(prod),
- (char *)cur_req->data + req_off, this_chunk);
- prod += this_chunk;
- req_off += this_chunk;
- total_off += this_chunk;
- if (req_off == cur_req->len)
- {
- req_off = 0;
- if (cur_req == &header_req)
- cur_req = req;
- else
- cur_req++;
- }
- }
-
- DEBUG("Complete main loop of xb_write.\n");
- BUG_ON(req_off != 0);
- BUG_ON(total_off != len);
- BUG_ON(prod > xenstore_buf->req_cons + XENSTORE_RING_SIZE);
-
- /* Remote must see entire message before updating indexes */
- wmb();
-
- xenstore_buf->req_prod += len;
-
- /* Send evtchn to notify remote */
- notify_remote_via_evtchn(start_info.store_evtchn);
-}
-
-/* Send a message to xenbus, in the same fashion as xb_write, and
- block waiting for a reply. The reply is malloced and should be
- freed by the caller. */
-struct xsd_sockmsg *
-xenbus_msg_reply(int type,
- xenbus_transaction_t trans,
- struct write_req *io,
- int nr_reqs)
-{
- int id;
- DEFINE_WAIT(w);
- struct xsd_sockmsg *rep;
-
- id = allocate_xenbus_id();
- add_waiter(w, req_info[id].waitq);
-
- xb_write(type, id, trans, io, nr_reqs);
-
- schedule();
- remove_waiter(w, req_info[id].waitq);
- wake(current);
-
- rep = req_info[id].reply;
- BUG_ON(rep->req_id != id);
- release_xenbus_id(id);
- return rep;
-}
-
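-/* Return a malloc'd copy of the error string from a reply, or NULL if
- * the reply is not an error; consumes the reply when it is one. */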
-static char *errmsg(struct xsd_sockmsg *rep)
-{
- char *res;
- if (!rep) {
- char msg[] = "No reply";
- size_t len = strlen(msg) + 1;
- return memcpy(malloc(len), msg, len);
- }
- if (rep->type != XS_ERROR)
- return NULL;
- res = malloc(rep->len + 1);
- memcpy(res, rep + 1, rep->len);
- res[rep->len] = 0;
- free(rep);
- return res;
-}
-
-/* Send a debug message to xenbus. Can block. */
-static void xenbus_debug_msg(const char *msg)
-{
- int len = strlen(msg);
- struct write_req req[] = {
- { "print", sizeof("print") },
- { msg, len },
- { "", 1 }};
- struct xsd_sockmsg *reply;
-
- reply = xenbus_msg_reply(XS_DEBUG, 0, req, ARRAY_SIZE(req));
- printk("Got a reply, type %d, id %d, len %d.\n",
- reply->type, reply->req_id, reply->len);
-}
-
-/* List the contents of a directory. Returns a malloc()ed array of
- pointers to malloc()ed strings. The array is NULL terminated. May
- block. */
-char *xenbus_ls(xenbus_transaction_t xbt, const char *pre, char ***contents)
-{
- struct xsd_sockmsg *reply, *repmsg;
- struct write_req req[] = { { pre, strlen(pre)+1 } };
- int nr_elems, x, i;
- char **res, *msg;
-
- repmsg = xenbus_msg_reply(XS_DIRECTORY, xbt, req, ARRAY_SIZE(req));
- msg = errmsg(repmsg);
- if (msg) {
- *contents = NULL;
- return msg;
- }
- reply = repmsg + 1;
- for (x = nr_elems = 0; x < repmsg->len; x++)
- nr_elems += (((char *)reply)[x] == 0);
- res = malloc(sizeof(res[0]) * (nr_elems + 1));
- for (x = i = 0; i < nr_elems; i++) {
- int l = strlen((char *)reply + x);
- res[i] = malloc(l + 1);
- memcpy(res[i], (char *)reply + x, l + 1);
- x += l + 1;
- }
- res[i] = NULL;
- free(repmsg);
- *contents = res;
- return NULL;
-}
-
-char *xenbus_read(xenbus_transaction_t xbt, const char *path, char **value)
-{
- struct write_req req[] = { {path, strlen(path) + 1} };
- struct xsd_sockmsg *rep;
- char *res, *msg;
- rep = xenbus_msg_reply(XS_READ, xbt, req, ARRAY_SIZE(req));
- msg = errmsg(rep);
- if (msg) {
- *value = NULL;
- return msg;
- }
- res = malloc(rep->len + 1);
- memcpy(res, rep + 1, rep->len);
- res[rep->len] = 0;
- free(rep);
- *value = res;
- return NULL;
-}
-
-char *xenbus_write(xenbus_transaction_t xbt, const char *path, const char *value)
-{
- struct write_req req[] = {
- {path, strlen(path) + 1},
- {value, strlen(value)},
- };
- struct xsd_sockmsg *rep;
- char *msg;
- rep = xenbus_msg_reply(XS_WRITE, xbt, req, ARRAY_SIZE(req));
- msg = errmsg(rep);
- if (msg) return msg;
- free(rep);
- return NULL;
-}
-
-char* xenbus_watch_path_token( xenbus_transaction_t xbt, const char *path, const char *token, xenbus_event_queue *events)
-{
- struct xsd_sockmsg *rep;
-
- struct write_req req[] = {
- {path, strlen(path) + 1},
- {token, strlen(token) + 1},
- };
-
- struct watch *watch = malloc(sizeof(*watch));
-
- char *msg;
-
- if (!events)
- events = &xenbus_events;
-
- watch->token = strdup(token);
- watch->events = events;
- watch->next = watches;
- watches = watch;
-
- rep = xenbus_msg_reply(XS_WATCH, xbt, req, ARRAY_SIZE(req));
-
- msg = errmsg(rep);
- if (msg) return msg;
- free(rep);
-
- return NULL;
-}
-
-char* xenbus_unwatch_path_token( xenbus_transaction_t xbt, const char *path, const char *token)
-{
- struct xsd_sockmsg *rep;
-
- struct write_req req[] = {
- {path, strlen(path) + 1},
- {token, strlen(token) + 1},
- };
-
- struct watch *watch, **prev;
-
- char *msg;
-
- rep = xenbus_msg_reply(XS_UNWATCH, xbt, req, ARRAY_SIZE(req));
-
- msg = errmsg(rep);
- if (msg) return msg;
- free(rep);
-
- for (prev = &watches, watch = *prev; watch; prev = &watch->next, watch = *prev)
- if (!strcmp(watch->token, token)) {
- free(watch->token);
- *prev = watch->next;
- free(watch);
- break;
- }
-
- return NULL;
-}
-
-char *xenbus_rm(xenbus_transaction_t xbt, const char *path)
-{
- struct write_req req[] = { {path, strlen(path) + 1} };
- struct xsd_sockmsg *rep;
- char *msg;
- rep = xenbus_msg_reply(XS_RM, xbt, req, ARRAY_SIZE(req));
- msg = errmsg(rep);
- if (msg)
- return msg;
- free(rep);
- return NULL;
-}
-
-char *xenbus_get_perms(xenbus_transaction_t xbt, const char *path, char **value)
-{
- struct write_req req[] = { {path, strlen(path) + 1} };
- struct xsd_sockmsg *rep;
- char *res, *msg;
- rep = xenbus_msg_reply(XS_GET_PERMS, xbt, req, ARRAY_SIZE(req));
- msg = errmsg(rep);
- if (msg) {
- *value = NULL;
- return msg;
- }
- res = malloc(rep->len + 1);
- memcpy(res, rep + 1, rep->len);
- res[rep->len] = 0;
- free(rep);
- *value = res;
- return NULL;
-}
-
-#define PERM_MAX_SIZE 32
-char *xenbus_set_perms(xenbus_transaction_t xbt, const char *path, domid_t dom, char perm)
-{
- char value[PERM_MAX_SIZE];
- struct write_req req[] = {
- {path, strlen(path) + 1},
- {value, 0},
- };
- struct xsd_sockmsg *rep;
- char *msg;
- snprintf(value, PERM_MAX_SIZE, "%c%hu", perm, dom);
- req[1].len = strlen(value) + 1;
- rep = xenbus_msg_reply(XS_SET_PERMS, xbt, req, ARRAY_SIZE(req));
- msg = errmsg(rep);
- if (msg)
- return msg;
- free(rep);
- return NULL;
-}
-
-char *xenbus_transaction_start(xenbus_transaction_t *xbt)
-{
- /* xenstored becomes angry if you send a length 0 message, so just
- shove a nul terminator on the end */
- struct write_req req = { "", 1};
- struct xsd_sockmsg *rep;
- char *err;
-
- rep = xenbus_msg_reply(XS_TRANSACTION_START, 0, &req, 1);
- err = errmsg(rep);
- if (err)
- return err;
- sscanf((char *)(rep + 1), "%lu", xbt);
- free(rep);
- return NULL;
-}
-
-char *
-xenbus_transaction_end(xenbus_transaction_t t, int abort, int *retry)
-{
- struct xsd_sockmsg *rep;
- struct write_req req;
- char *err;
-
- *retry = 0;
-
- req.data = abort ? "F" : "T";
- req.len = 2;
- rep = xenbus_msg_reply(XS_TRANSACTION_END, t, &req, 1);
- err = errmsg(rep);
- if (err) {
- if (!strcmp(err, "EAGAIN")) {
- *retry = 1;
- free(err);
- return NULL;
- } else {
- return err;
- }
- }
- free(rep);
- return NULL;
-}
-
-int xenbus_read_integer(const char *path)
-{
- char *res, *buf;
- int t;
-
- res = xenbus_read(XBT_NIL, path, &buf);
- if (res) {
- printk("Failed to read %s.\n", path);
- free(res);
- return -1;
- }
- sscanf(buf, "%d", &t);
- free(buf);
- return t;
-}
-
-int xenbus_read_uuid(const char* path, unsigned char uuid[16]) {
- char * res, *buf;
- res = xenbus_read(XBT_NIL, path, &buf);
- if(res) {
- printk("Failed to read %s.\n", path);
- free(res);
- return 0;
- }
- if(strlen(buf) != ((2*16)+4) /* 16 hex bytes and 4 hyphens */
- || sscanf(buf,
- "%2hhx%2hhx%2hhx%2hhx-"
- "%2hhx%2hhx-"
- "%2hhx%2hhx-"
- "%2hhx%2hhx-"
- "%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx",
- uuid, uuid + 1, uuid + 2, uuid + 3,
- uuid + 4, uuid + 5, uuid + 6, uuid + 7,
- uuid + 8, uuid + 9, uuid + 10, uuid + 11,
- uuid + 12, uuid + 13, uuid + 14, uuid + 15) != 16) {
- printk("Xenbus path %s value %s is not a uuid!\n", path, buf);
- free(buf);
- return 0;
- }
- free(buf);
- return 1;
-}
-
-char* xenbus_printf(xenbus_transaction_t xbt,
- const char* node, const char* path,
- const char* fmt, ...)
-{
-#define BUFFER_SIZE 256
- char fullpath[BUFFER_SIZE];
- char val[BUFFER_SIZE];
- va_list args;
-
- BUG_ON(strlen(node) + strlen(path) + 1 >= BUFFER_SIZE);
-    snprintf(fullpath, BUFFER_SIZE, "%s/%s", node, path);
-    va_start(args, fmt);
-    vsnprintf(val, BUFFER_SIZE, fmt, args);
- va_end(args);
- return xenbus_write(xbt,fullpath,val);
-}
-
-domid_t xenbus_get_self_id(void)
-{
- char *dom_id;
- domid_t ret;
-
- BUG_ON(xenbus_read(XBT_NIL, "domid", &dom_id));
- sscanf(dom_id, "%"SCNd16, &ret);
-
- return ret;
-}
-
-static void do_ls_test(const char *pre)
-{
- char **dirs, *msg;
- int x;
-
- printk("ls %s...\n", pre);
- msg = xenbus_ls(XBT_NIL, pre, &dirs);
- if (msg) {
- printk("Error in xenbus ls: %s\n", msg);
- free(msg);
- return;
- }
- for (x = 0; dirs[x]; x++)
- {
- printk("ls %s[%d] -> %s\n", pre, x, dirs[x]);
- free(dirs[x]);
- }
- free(dirs);
-}
-
-static void do_read_test(const char *path)
-{
- char *res, *msg;
- printk("Read %s...\n", path);
- msg = xenbus_read(XBT_NIL, path, &res);
- if (msg) {
- printk("Error in xenbus read: %s\n", msg);
- free(msg);
- return;
- }
- printk("Read %s -> %s.\n", path, res);
- free(res);
-}
-
-static void do_write_test(const char *path, const char *val)
-{
- char *msg;
- printk("Write %s to %s...\n", val, path);
- msg = xenbus_write(XBT_NIL, path, val);
- if (msg) {
- printk("Result %s\n", msg);
- free(msg);
- } else {
- printk("Success.\n");
- }
-}
-
-static void do_rm_test(const char *path)
-{
- char *msg;
- printk("rm %s...\n", path);
- msg = xenbus_rm(XBT_NIL, path);
- if (msg) {
- printk("Result %s\n", msg);
- free(msg);
- } else {
- printk("Success.\n");
- }
-}
-
-/* Simple testing thing */
-void test_xenbus(void)
-{
- printk("Doing xenbus test.\n");
- xenbus_debug_msg("Testing xenbus...\n");
-
- printk("Doing ls test.\n");
- do_ls_test("device");
- do_ls_test("device/vif");
- do_ls_test("device/vif/0");
-
- printk("Doing read test.\n");
- do_read_test("device/vif/0/mac");
- do_read_test("device/vif/0/backend");
-
- printk("Doing write test.\n");
- do_write_test("device/vif/0/flibble", "flobble");
- do_read_test("device/vif/0/flibble");
- do_write_test("device/vif/0/flibble", "widget");
- do_read_test("device/vif/0/flibble");
-
- printk("Doing rm test.\n");
- do_rm_test("device/vif/0/flibble");
- do_read_test("device/vif/0/flibble");
- printk("(Should have said ENOENT)\n");
-}
-
-/*
- * Local variables:
- * mode: C
- * c-basic-offset: 4
- * End:
- */
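
Callers of the deleted xenbus API (for example publish_xenbus() in the
tpmfront.c removal above) all follow the same transaction-retry shape;
a minimal sketch, assuming only the xenbus_* functions removed here
("key"/"value" are illustrative):

    #include <mini-os/xenbus.h>

    static char *write_with_retry(const char *key, const char *value)
    {
        xenbus_transaction_t xbt;
        char *err, *err2;
        int retry;

    again:
        if ((err = xenbus_transaction_start(&xbt)))
            return err;
        if ((err = xenbus_write(xbt, key, value))) {
            /* Abort, but report the original write error */
            if ((err2 = xenbus_transaction_end(xbt, 1, &retry)))
                free(err2);
            return err;
        }
        if ((err = xenbus_transaction_end(xbt, 0, &retry)))
            return err;
        if (retry)
            goto again;
        return NULL;                       /* NULL means success */
    }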
diff --git a/m4/paths.m4 b/m4/paths.m4
index 7ede5bd..63e0f6b 100644
--- a/m4/paths.m4
+++ b/m4/paths.m4
@@ -62,6 +62,12 @@ AC_ARG_WITH([sysconfig-leaf-dir],
CONFIG_LEAF_DIR=$config_leaf_dir
AC_SUBST(CONFIG_LEAF_DIR)
+AC_ARG_WITH([xen-dumpdir],
+ AS_HELP_STRING([--with-xen-dumpdir=DIR],
+ [Path to directory for domU crash dumps. [LOCALSTATEDIR/lib/xen/dump]]),
+ [xen_dumpdir_path=$withval],
+ [xen_dumpdir_path=$localstatedir/lib/xen/dump])
+
if test "$libexecdir" = '${exec_prefix}/libexec' ; then
case "$host_os" in
*netbsd*) ;;
@@ -77,6 +83,9 @@ dnl This variable will be substituted in various .in files
LIBEXEC_BIN=`eval echo $libexecdir/$PACKAGE_TARNAME/bin`
AC_SUBST(LIBEXEC_BIN)
+XENFIRMWAREDIR=`eval echo $libexecdir/$PACKAGE_TARNAME/boot`
+AC_SUBST(XENFIRMWAREDIR)
+
XEN_RUN_DIR=$localstatedir/run/xen
AC_SUBST(XEN_RUN_DIR)
@@ -110,4 +119,7 @@ AC_SUBST(XEN_LOCK_DIR)
XEN_PAGING_DIR=$localstatedir/lib/xen/xenpaging
AC_SUBST(XEN_PAGING_DIR)
+
+XEN_DUMP_DIR=$xen_dumpdir_path
+AC_SUBST(XEN_DUMP_DIR)
])
diff --git a/m4/pkg.m4 b/m4/pkg.m4
index 62995f0..ed7182d 100644
--- a/m4/pkg.m4
+++ b/m4/pkg.m4
@@ -14,8 +14,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
diff --git a/m4/python_fortify_noopt.m4 b/m4/python_fortify_noopt.m4
new file mode 100644
index 0000000..f9cb52b
--- /dev/null
+++ b/m4/python_fortify_noopt.m4
@@ -0,0 +1,31 @@
+dnl Defines PY_NOOPT_CFLAGS to either '' or -O1
+dnl
+
+dnl This is necessary because on some systems setup.py includes
+dnl -D_FORTIFY_SOURCE, which does not work with -O0. On those
+dnl systems we arrange to use -O1 for debug
+dnl builds instead.
+
+AC_DEFUN([AX_CHECK_PYTHON_FORTIFY_NOOPT], [
+ AC_CACHE_CHECK([whether Python setup.py brokenly enables -D_FORTIFY_SOURCE],
+ [ax_cv_python_fortify],[
+ ax_cv_python_fortify=no
+ for arg in $($PYTHON-config --cflags); do
+ case "$arg" in
+ -D_FORTIFY_SOURCE=0) ax_cv_python_fortify=no ;;
+ -D_FORTIFY_SOURCE=*) ax_cv_python_fortify=yes ;;
+ -Wp,-D_FORTIFY_SOURCE=0) ax_cv_python_fortify=no ;;
+ -Wp,-D_FORTIFY_SOURCE=*) ax_cv_python_fortify=yes ;;
+ *) ;;
+ esac
+ done
+ ])
+
+ AS_IF([test x$ax_cv_python_fortify = xyes],[
+ PY_NOOPT_CFLAGS=-O1
+ ], [
+ PY_NOOPT_CFLAGS=''
+ ])
+
+ AC_SUBST(PY_NOOPT_CFLAGS)
+])
diff --git a/m4/systemd.m4 b/m4/systemd.m4
index b04964b..e4b1aa5 100644
--- a/m4/systemd.m4
+++ b/m4/systemd.m4
@@ -13,8 +13,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
dnl Some optional path options
AC_DEFUN([AX_SYSTEMD_OPTIONS], [
@@ -86,7 +85,11 @@ AC_DEFUN([AX_CHECK_SYSTEMD], [
AC_DEFINE([HAVE_SYSTEMD], [1], [Systemd available and enabled])
systemd=y
AX_CHECK_SYSTEMD_LIBS()
- ],[systemd=n])
+ ],[
+ AS_IF([test "x$enable_systemd" = "xyes"],
+ [AC_MSG_ERROR([Unable to find systemd development library])],
+ [systemd=n])
+ ])
],[systemd=n])
])
diff --git a/scripts/git-checkout.sh b/scripts/git-checkout.sh
index 15b3ce9..20ae31f 100755
--- a/scripts/git-checkout.sh
+++ b/scripts/git-checkout.sh
@@ -13,7 +13,7 @@ set -e
if test \! -d $DIR-remote; then
rm -rf $DIR-remote $DIR-remote.tmp
- mkdir $DIR-remote.tmp; rmdir $DIR-remote.tmp
+ mkdir -p $DIR-remote.tmp; rmdir $DIR-remote.tmp
$GIT clone $TREE $DIR-remote.tmp
if test "$TAG" ; then
cd $DIR-remote.tmp
diff --git a/stubdom/Makefile b/stubdom/Makefile
index 8fb885a..e1359cf 100644
--- a/stubdom/Makefile
+++ b/stubdom/Makefile
@@ -1,11 +1,19 @@
XEN_ROOT = $(CURDIR)/..
MINI_OS = $(XEN_ROOT)/extras/mini-os
+export XEN_ROOT
export XEN_OS=MiniOS
export stubdom=y
export debug=y
-include $(XEN_ROOT)/Config.mk
+
+ifeq (,$(findstring clean,$(MAKECMDGOALS)))
+ ifeq ($(wildcard $(MINI_OS)/Config.mk),)
+ $(error Please run `make mini-os-dir' in top-level directory)
+ endif
+ include $(XEN_ROOT)/Config.mk
+endif
+
-include $(XEN_ROOT)/config/Stubdom.mk
GNU_TARGET_ARCH:=$(XEN_TARGET_ARCH)
@@ -211,6 +219,7 @@ tpm_emulator-$(XEN_TARGET_ARCH): tpm_emulator-$(TPMEMU_VERSION).tar.gz
patch -d $@ -p1 < vtpm-locality.patch
patch -d $@ -p1 < vtpm-parent-sign-ek.patch
patch -d $@ -p1 < vtpm-deepquote.patch
+ patch -d $@ -p1 < vtpm-deepquote-anyloc.patch
patch -d $@ -p1 < vtpm-cmake-Wextra.patch
mkdir $@/build
cd $@/build; CC=${CC} $(CMAKE) .. -DCMAKE_C_FLAGS:STRING="-std=c99 -DTPM_NO_EXTERN $(TARGET_CPPFLAGS) $(TARGET_CFLAGS) -Wno-declaration-after-statement"
@@ -333,6 +342,8 @@ $(TARGETS_MINIOS): mini-os-%:
.PHONY: libxc
libxc: libxc-$(XEN_TARGET_ARCH)/libxenctrl.a libxc-$(XEN_TARGET_ARCH)/libxenguest.a
libxc-$(XEN_TARGET_ARCH)/libxenctrl.a: cross-zlib
+ $(MAKE) -C $(XEN_ROOT)/tools/include
+ $(MAKE) DESTDIR= -C $(MINI_OS) links
CPPFLAGS="$(TARGET_CPPFLAGS)" CFLAGS="$(TARGET_CFLAGS)" $(MAKE) DESTDIR= CONFIG_LIBXC_MINIOS=y -C libxc-$(XEN_TARGET_ARCH)
libxc-$(XEN_TARGET_ARCH)/libxenguest.a: libxc-$(XEN_TARGET_ARCH)/libxenctrl.a
@@ -452,15 +463,11 @@ xenstore-stubdom: mini-os-$(XEN_TARGET_ARCH)-xenstore libxc xenstore
#########
ifeq ($(STUBDOM_SUPPORTED),1)
-install: $(STUBDOMPATH) install-readme $(STUBDOM_INSTALL)
+install: $(STUBDOMPATH) $(STUBDOM_INSTALL)
else
install: $(STUBDOMPATH)
endif
-install-readme:
- $(INSTALL_DIR) $(DESTDIR)$(DOCDIR)
- $(INSTALL_DATA) README $(DESTDIR)$(DOCDIR)/README.stubdom
-
install-ioemu: ioemu-stubdom
$(INSTALL_DIR) "$(DESTDIR)$(LIBEXEC_BIN)"
$(INSTALL_PROG) stubdom-dm "$(DESTDIR)$(LIBEXEC_BIN)"
@@ -502,7 +509,6 @@ clean:
rm -fr mini-os-$(XEN_TARGET_ARCH)-xenstore
rm -fr mini-os-$(XEN_TARGET_ARCH)-vtpm
rm -fr mini-os-$(XEN_TARGET_ARCH)-vtpmmgr
- $(MAKE) DESTDIR= -C $(MINI_OS) clean
$(MAKE) DESTDIR= -C caml clean
$(MAKE) DESTDIR= -C c clean
$(MAKE) -C vtpm clean
diff --git a/stubdom/c/Makefile b/stubdom/c/Makefile
index c646c26..b252dca 100644
--- a/stubdom/c/Makefile
+++ b/stubdom/c/Makefile
@@ -1,6 +1,8 @@
XEN_ROOT = $(CURDIR)/../..
+ifeq (,$(findstring clean,$(MAKECMDGOALS)))
include $(XEN_ROOT)/Config.mk
+endif
all: main.a
diff --git a/stubdom/caml/Makefile b/stubdom/caml/Makefile
index e79c98d..f550de1 100644
--- a/stubdom/caml/Makefile
+++ b/stubdom/caml/Makefile
@@ -1,6 +1,8 @@
XEN_ROOT = $(CURDIR)/../..
+ifeq (,$(findstring clean,$(MAKECMDGOALS)))
include $(XEN_ROOT)/Config.mk
+endif
CAMLLIB = $(shell $(OCAMLC_CROSS_PREFIX)ocamlc -where)
DEF_CPPFLAGS += -I$(CAMLLIB)
diff --git a/stubdom/configure b/stubdom/configure
index 9981f5a..f4b1cd3 100755
--- a/stubdom/configure
+++ b/stubdom/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for Xen Hypervisor Stub Domains 4.5.
+# Generated by GNU Autoconf 2.69 for Xen Hypervisor Stub Domains 4.6.
#
# Report bugs to <xen-devel at lists.xen.org>.
#
@@ -579,12 +579,12 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='Xen Hypervisor Stub Domains'
PACKAGE_TARNAME='xen'
-PACKAGE_VERSION='4.5'
-PACKAGE_STRING='Xen Hypervisor Stub Domains 4.5'
+PACKAGE_VERSION='4.6'
+PACKAGE_STRING='Xen Hypervisor Stub Domains 4.6'
PACKAGE_BUGREPORT='xen-devel at lists.xen.org'
PACKAGE_URL='http://www.xen.org/'
-ac_unique_file="../extras/mini-os/kernel.c"
+ac_unique_file="xenstore-minios.cfg"
ac_subst_vars='LTLIBOBJS
LIBOBJS
STUBDOM_INSTALL
@@ -1250,7 +1250,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures Xen Hypervisor Stub Domains 4.5 to adapt to many kinds of systems.
+\`configure' configures Xen Hypervisor Stub Domains 4.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1315,7 +1315,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of Xen Hypervisor Stub Domains 4.5:";;
+ short | recursive ) echo "Configuration of Xen Hypervisor Stub Domains 4.6:";;
esac
cat <<\_ACEOF
@@ -1425,7 +1425,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-Xen Hypervisor Stub Domains configure 4.5
+Xen Hypervisor Stub Domains configure 4.6
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1480,7 +1480,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by Xen Hypervisor Stub Domains $as_me 4.5, which was
+It was created by Xen Hypervisor Stub Domains $as_me 4.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -4178,7 +4178,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by Xen Hypervisor Stub Domains $as_me 4.5, which was
+This file was extended by Xen Hypervisor Stub Domains $as_me 4.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -4232,7 +4232,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-Xen Hypervisor Stub Domains config.status 4.5
+Xen Hypervisor Stub Domains config.status 4.6
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/stubdom/configure.ac b/stubdom/configure.ac
index 6468203..9fec853 100644
--- a/stubdom/configure.ac
+++ b/stubdom/configure.ac
@@ -4,7 +4,7 @@
AC_PREREQ([2.67])
AC_INIT([Xen Hypervisor Stub Domains], m4_esyscmd([../version.sh ../xen/Makefile]),
[xen-devel at lists.xen.org], [xen], [http://www.xen.org/])
-AC_CONFIG_SRCDIR([../extras/mini-os/kernel.c])
+AC_CONFIG_SRCDIR([xenstore-minios.cfg])
AC_CONFIG_FILES([../config/Stubdom.mk])
AC_CONFIG_AUX_DIR([../])
diff --git a/stubdom/grub.patches/10graphics.diff b/stubdom/grub.patches/10graphics.diff
index d891c51..5ee2852 100644
--- a/stubdom/grub.patches/10graphics.diff
+++ b/stubdom/grub.patches/10graphics.diff
@@ -1164,7 +1164,7 @@ diff -Naur grub-0.97.orig/stage2/cmdline.c grub-0.97/stage2/cmdline.c
diff -Naur grub-0.97.orig/stage2/graphics.c grub-0.97/stage2/graphics.c
--- grub-0.97.orig/stage2/graphics.c 1969-12-31 21:00:00.000000000 -0300
+++ grub-0.97/stage2/graphics.c 2005-06-13 19:13:31.000000000 -0300
-@@ -0,0 +1,585 @@
+@@ -0,0 +1,584 @@
+/*
+ * graphics.c - graphics mode support for GRUB
+ * Implemented as a terminal type by Jeremy Katz <katzj at redhat.com> based
@@ -1187,8 +1187,7 @@ diff -Naur grub-0.97.orig/stage2/graphics.c grub-0.97/stage2/graphics.c
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++ * along with this program; If not, see <http://www.gnu.org/licenses/>
+ */
+
+#ifdef SUPPORT_GRAPHICS
@@ -1753,7 +1752,7 @@ diff -Naur grub-0.97.orig/stage2/graphics.c grub-0.97/stage2/graphics.c
diff -Naur grub-0.97.orig/stage2/graphics.h grub-0.97/stage2/graphics.h
--- grub-0.97.orig/stage2/graphics.h 1969-12-31 21:00:00.000000000 -0300
+++ grub-0.97/stage2/graphics.h 2005-06-12 20:56:49.000000000 -0300
-@@ -0,0 +1,44 @@
+@@ -0,0 +1,43 @@
+/* graphics.h - graphics console interface */
+/*
+ * GRUB -- GRand Unified Bootloader
@@ -1770,8 +1769,7 @@ diff -Naur grub-0.97.orig/stage2/graphics.h grub-0.97/stage2/graphics.h
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++ * along with this program; If not, see <http://www.gnu.org/licenses/>
+ */
+
+#ifndef GRAPHICS_H
diff --git a/stubdom/grub.patches/61btrfs.diff b/stubdom/grub.patches/61btrfs.diff
index fc72771..c831a10 100644
--- a/stubdom/grub.patches/61btrfs.diff
+++ b/stubdom/grub.patches/61btrfs.diff
@@ -66,7 +66,7 @@ diff -up grub-upstream.wip/INSTALL.btrfs grub-upstream.wip/INSTALL
diff -up /dev/null grub-upstream.wip/stage2/btrfs.h
--- /dev/null 2009-06-03 06:46:26.160951000 +0000
+++ grub-upstream.wip/stage2/btrfs.h 2012-03-20 05:07:09.000000000 +0000
-@@ -0,0 +1,1415 @@
+@@ -0,0 +1,1413 @@
+/* btrfs.h - an extraction from btrfs-progs-0.18/ctree.h into one file
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
@@ -81,9 +81,7 @@ diff -up /dev/null grub-upstream.wip/stage2/btrfs.h
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
-+ * License along with this program; if not, write to the
-+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-+ * Boston, MA 021110-1307, USA.
++ * License along with this program; If not, see <http://www.gnu.org/licenses/>
+ */
+
+/* include/asm-i386/types.h */
diff --git a/stubdom/grub/kexec.c b/stubdom/grub/kexec.c
index dc8db81..4c33b25 100644
--- a/stubdom/grub/kexec.c
+++ b/stubdom/grub/kexec.c
@@ -276,12 +276,13 @@ void kexec(void *kernel, long kernel_size, void *module, long module_size, char
dom->total_pages = start_info.nr_pages;
/* equivalent of arch_setup_meminit */
+ dom->p2m_size = dom->total_pages;
/* setup initial p2m */
- dom->p2m_host = malloc(sizeof(*dom->p2m_host) * dom->total_pages);
+ dom->p2m_host = malloc(sizeof(*dom->p2m_host) * dom->p2m_size);
/* Start with our current P2M */
- for (i = 0; i < dom->total_pages; i++)
+ for (i = 0; i < dom->p2m_size; i++)
dom->p2m_host[i] = pfn_to_mfn(i);
if ( (rc = xc_dom_build_image(dom)) != 0 ) {
diff --git a/stubdom/vtpm-deepquote-anyloc.patch b/stubdom/vtpm-deepquote-anyloc.patch
new file mode 100644
index 0000000..7b37d51
--- /dev/null
+++ b/stubdom/vtpm-deepquote-anyloc.patch
@@ -0,0 +1,127 @@
+diff --git a/tpm/tpm_cmd_handler.c b/tpm/tpm_cmd_handler.c
+index 69511d1..7545d51 100644
+--- a/tpm/tpm_cmd_handler.c
++++ b/tpm/tpm_cmd_handler.c
+@@ -3347,12 +3347,13 @@ static TPM_RESULT execute_TPM_DeepQuote(TPM_REQUEST *req, TPM_RESPONSE *rsp)
+ {
+ TPM_NONCE nonce;
+ TPM_RESULT res;
+- UINT32 sigSize;
+- BYTE *sig;
++ UINT32 quote_blob_size;
++ BYTE *quote_blob;
+ BYTE *ptr;
+ UINT32 len;
+ TPM_PCR_SELECTION myPCR;
+ TPM_PCR_SELECTION ptPCR;
++ UINT32 extraInfoFlags = 0;
+
+ tpm_compute_in_param_digest(req);
+
+@@ -3361,17 +3362,19 @@ static TPM_RESULT execute_TPM_DeepQuote(TPM_REQUEST *req, TPM_RESPONSE *rsp)
+ if (tpm_unmarshal_TPM_NONCE(&ptr, &len, &nonce)
+ || tpm_unmarshal_TPM_PCR_SELECTION(&ptr, &len, &myPCR)
+ || tpm_unmarshal_TPM_PCR_SELECTION(&ptr, &len, &ptPCR)
++ || tpm_unmarshal_TPM_DEEP_QUOTE_INFO(&ptr, &len, &extraInfoFlags)
+ || len != 0) return TPM_BAD_PARAMETER;
+
+- res = TPM_DeepQuote(&nonce, &myPCR, &ptPCR, &req->auth1, &sigSize, &sig);
++ res = TPM_DeepQuote(&nonce, &myPCR, &ptPCR, &req->auth1, extraInfoFlags,
++ "e_blob_size, "e_blob);
+ if (res != TPM_SUCCESS) return res;
+- rsp->paramSize = len = sigSize;
++ rsp->paramSize = len = quote_blob_size;
+ rsp->param = ptr = tpm_malloc(len);
+- if (ptr == NULL || tpm_marshal_BLOB(&ptr, &len, sig, sigSize)) {
++ if (ptr == NULL || tpm_marshal_BLOB(&ptr, &len, quote_blob, quote_blob_size)) {
+ tpm_free(rsp->param);
+ res = TPM_FAIL;
+ }
+- tpm_free(sig);
++ tpm_free(quote_blob);
+
+ return res;
+ }
+diff --git a/tpm/tpm_commands.h b/tpm/tpm_commands.h
+index 328d1be..a56dd5f 100644
+--- a/tpm/tpm_commands.h
++++ b/tpm/tpm_commands.h
+@@ -3077,6 +3077,7 @@ TPM_RESULT TPM_ParentSignEK(
+ * @myPCR: [in] PCR selection for the virtual TPM
+ * @ptPCR: [in] PCR selection for the hardware TPM
+ * @auth1: [in, out] Authorization protocol parameters
++ * @extraInfoFlags [in] Flags for including, kernel hash, group info, etc
+ * @sigSize: [out] The length of the returned digital signature
+ * @sig: [out] The resulting digital signature and PCR values
+ * Returns: TPM_SUCCESS on success, a TPM error code otherwise.
+@@ -3086,6 +3087,7 @@ TPM_RESULT TPM_DeepQuote(
+ TPM_PCR_SELECTION *myPCR,
+ TPM_PCR_SELECTION *ptPCR,
+ TPM_AUTH *auth1,
++ UINT32 extraInfoFlags,
+ UINT32 *sigSize,
+ BYTE **sig
+ );
+diff --git a/tpm/tpm_credentials.c b/tpm/tpm_credentials.c
+index c0d62e7..6586c22 100644
+--- a/tpm/tpm_credentials.c
++++ b/tpm/tpm_credentials.c
+@@ -183,7 +183,8 @@ TPM_RESULT TPM_OwnerReadInternalPub(TPM_KEY_HANDLE keyHandle, TPM_AUTH *auth1,
+
+ int endorsementKeyFresh = 0;
+
+-TPM_RESULT VTPM_GetParentQuote(TPM_DIGEST* data, TPM_PCR_SELECTION *sel, UINT32 *sigSize, BYTE **sig);
++TPM_RESULT VTPM_GetParentQuote(TPM_NONCE *data, TPM_PCR_SELECTION *sel,
++ UINT32 extraInfoFlags, UINT32 *sigSize, BYTE **sig);
+
+ TPM_RESULT TPM_ParentSignEK(TPM_NONCE *externalData, TPM_PCR_SELECTION *sel,
+ TPM_AUTH *auth1, UINT32 *sigSize, BYTE **sig)
+@@ -191,7 +192,7 @@ TPM_RESULT TPM_ParentSignEK(TPM_NONCE *externalData, TPM_PCR_SELECTION *sel,
+ TPM_PUBKEY pubKey;
+ TPM_RESULT res;
+ TPM_DIGEST hres;
+-
++ UINT32 extraInfoFlags = 0;
+ info("TPM_ParentSignEK()");
+
+ res = tpm_verify_auth(auth1, tpmData.permanent.data.ownerAuth, TPM_KH_OWNER);
+@@ -206,7 +207,7 @@ TPM_RESULT TPM_ParentSignEK(TPM_NONCE *externalData, TPM_PCR_SELECTION *sel,
+ res = TPM_FAIL;
+
+ if (res == TPM_SUCCESS)
+- res = VTPM_GetParentQuote(&hres, sel, sigSize, sig);
++ res = VTPM_GetParentQuote((TPM_NONCE*)&hres, sel, extraInfoFlags, sigSize, sig);
+
+ free_TPM_PUBKEY(pubKey);
+ return res;
+@@ -218,7 +219,7 @@ static const BYTE dquot_hdr[] = {
+
+ TPM_RESULT TPM_DeepQuote(TPM_NONCE *externalData, TPM_PCR_SELECTION *myPCR,
+ TPM_PCR_SELECTION *ptPCR, TPM_AUTH *auth1,
+- UINT32 *sigSize, BYTE **sig)
++ UINT32 extraInfoFlags, UINT32 *quote_blob_size, BYTE **quote_blob)
+ {
+ TPM_RESULT res;
+ TPM_DIGEST hres;
+@@ -253,7 +254,7 @@ TPM_RESULT TPM_DeepQuote(TPM_NONCE *externalData, TPM_PCR_SELECTION *myPCR,
+
+ tpm_free(buf);
+
+- res = VTPM_GetParentQuote(&hres, ptPCR, sigSize, sig);
++ res = VTPM_GetParentQuote((TPM_NONCE*)&hres, ptPCR, extraInfoFlags, quote_blob_size, quote_blob);
+
+ return res;
+ }
+diff --git a/tpm/tpm_marshalling.h b/tpm/tpm_marshalling.h
+index d510ebe..2e0c008 100644
+--- a/tpm/tpm_marshalling.h
++++ b/tpm/tpm_marshalling.h
+@@ -268,6 +268,8 @@ static inline int tpm_unmarshal_BOOL(BYTE **ptr, UINT32 *length, BOOL *v)
+ #define tpm_unmarshal_TPM_REDIR_COMMAND tpm_unmarshal_UINT32
+ #define tpm_marshal_DAAHANDLE tpm_marshal_UINT32
+ #define tpm_unmarshal_DAAHANDLE tpm_unmarshal_UINT32
++#define tpm_marshal_TPM_DEEP_QUOTE_INFO tpm_marshal_UINT32
++#define tpm_unmarshal_TPM_DEEP_QUOTE_INFO tpm_unmarshal_UINT32
+
+ int tpm_marshal_UINT32_ARRAY(BYTE **ptr, UINT32 *length, UINT32 *v, UINT32 n);
+ int tpm_unmarshal_UINT32_ARRAY(BYTE **ptr, UINT32 *length, UINT32 *v, UINT32 n);
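
An aside on the new wire field: TPM_DEEP_QUOTE_INFO is a plain UINT32 bit mask that reuses the existing UINT32 (un)marshalling helpers, so no new encoding logic is needed. Below is a minimal sketch of validating such a mask, mirroring the check vtpm_do_quote() performs later in this patch; the flag values and the helper name are illustrative, not taken from this patch.

    #include <stdint.h>

    /* Illustrative values only; the real constants live in the vTPM headers. */
    #define VTPM_QUOTE_FLAGS_HASH_UUID          0x00000001
    #define VTPM_QUOTE_FLAGS_VTPM_MEASUREMENTS  0x00000002
    #define VTPM_QUOTE_FLAGS_GROUP_INFO         0x00000004
    #define VTPM_QUOTE_FLAGS_GROUP_PUBKEY       0x00000008

    /* Reject a request whose mask sets any bit we do not understand. */
    static int deep_quote_flags_valid(uint32_t flags)
    {
        const uint32_t known = VTPM_QUOTE_FLAGS_HASH_UUID
                             | VTPM_QUOTE_FLAGS_VTPM_MEASUREMENTS
                             | VTPM_QUOTE_FLAGS_GROUP_INFO
                             | VTPM_QUOTE_FLAGS_GROUP_PUBKEY;
        return (flags & ~known) == 0;
    }
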
diff --git a/stubdom/vtpm/vtpm_cmd.c b/stubdom/vtpm/vtpm_cmd.c
index 6fda456..eec37df 100644
--- a/stubdom/vtpm/vtpm_cmd.c
+++ b/stubdom/vtpm/vtpm_cmd.c
@@ -218,7 +218,8 @@ egress:
}
extern struct tpmfront_dev* tpmfront_dev;
-TPM_RESULT VTPM_GetParentQuote(TPM_NONCE *data, TPM_PCR_SELECTION *sel, UINT32 *sigSize, BYTE **sig)
+TPM_RESULT VTPM_GetParentQuote(TPM_NONCE *data, TPM_PCR_SELECTION *sel,
+ UINT32 extraInfoFlags, UINT32 *quote_blob_size, BYTE **quote_blob)
{
TPM_RESULT status = TPM_SUCCESS;
uint8_t* bptr, *resp;
@@ -231,11 +232,12 @@ TPM_RESULT VTPM_GetParentQuote(TPM_NONCE *data, TPM_PCR_SELECTION *sel, UINT32 *
TPM_COMMAND_CODE ord = VTPM_ORD_GET_QUOTE;
/*Create the command*/
- len = size = VTPM_COMMAND_HEADER_SIZE + 25;
+ len = size = VTPM_COMMAND_HEADER_SIZE + 20 + sizeof_TPM_PCR_SELECTION((*sel)) + 4;
bptr = cmdbuf = malloc(size);
TRYFAILGOTO(pack_header(&bptr, &len, tag, size, ord));
TRYFAILGOTO(tpm_marshal_TPM_NONCE(&bptr, &len, data));
TRYFAILGOTO(tpm_marshal_TPM_PCR_SELECTION(&bptr, &len, sel));
+ TRYFAILGOTO(tpm_marshal_TPM_DEEP_QUOTE_INFO(&bptr, &len, extraInfoFlags));
/* Send the command to vtpm_manager */
info("Requesting Quote from backend");
@@ -248,11 +250,10 @@ TPM_RESULT VTPM_GetParentQuote(TPM_NONCE *data, TPM_PCR_SELECTION *sel, UINT32 *
/* Check return code */
CHECKSTATUSGOTO(ord, "VTPM_GetParentQuote()");
-
/* Copy out the value */
- *sigSize = len;
- *sig = tpm_malloc(*sigSize);
- TRYFAILGOTOMSG(tpm_unmarshal_BYTE_ARRAY(&bptr, &len, *sig, *sigSize), ERR_MALFORMED);
+ *quote_blob_size = len;
+ *quote_blob = tpm_malloc(*quote_blob_size);
+ TRYFAILGOTOMSG(tpm_unmarshal_BYTE_ARRAY(&bptr, &len, *quote_blob, *quote_blob_size), ERR_MALFORMED);
goto egress;
abort_egress:
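
For reference, the sizing change above replaces a fixed 25-byte body with an explicit breakdown; the sketch below restates it, under the assumption that the old constant covered a 20-byte nonce plus a 5-byte (2-byte length + 3-byte bitmap) PCR selection.

    /* VTPM_ORD_GET_QUOTE request body after this change:
     *   TPM_NONCE            20 bytes           (externalData)
     *   TPM_PCR_SELECTION    2 + bitmap bytes   (variable)
     *   TPM_DEEP_QUOTE_INFO   4 bytes           (extraInfoFlags, new)
     */
    static inline unsigned int get_quote_body_size(unsigned int select_bytes)
    {
        return 20 + (2 + select_bytes) + 4;
    }
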
diff --git a/stubdom/vtpmmgr/Makefile b/stubdom/vtpmmgr/Makefile
index c5e17c5..6dae034 100644
--- a/stubdom/vtpmmgr/Makefile
+++ b/stubdom/vtpmmgr/Makefile
@@ -12,7 +12,7 @@
XEN_ROOT=../..
TARGET=vtpmmgr.a
-OBJS=vtpmmgr.o vtpm_cmd_handler.o init.o tpmrsa.o tpm.o log.o
+OBJS=vtpmmgr.o vtpm_cmd_handler.o init.o tpmrsa.o tpm.o tpm2.o log.o
OBJS += vtpm_disk.o disk_tpm.o disk_io.o disk_crypto.o disk_read.o disk_write.o
OBJS += mgmt_authority.o
diff --git a/stubdom/vtpmmgr/common_types.h b/stubdom/vtpmmgr/common_types.h
new file mode 100644
index 0000000..7321bb6
--- /dev/null
+++ b/stubdom/vtpmmgr/common_types.h
@@ -0,0 +1,9 @@
+#ifndef VTPM_COMMON_TYPES
+#define VTPM_COMMON_TYPES 1
+typedef unsigned char BYTE;
+typedef unsigned char BOOL;
+typedef unsigned char UINT8;
+typedef uint16_t UINT16;
+typedef uint32_t UINT32;
+typedef uint64_t UINT64;
+#endif
diff --git a/stubdom/vtpmmgr/disk_read.c b/stubdom/vtpmmgr/disk_read.c
index 33aacdd..944d3ff 100644
--- a/stubdom/vtpmmgr/disk_read.c
+++ b/stubdom/vtpmmgr/disk_read.c
@@ -67,6 +67,7 @@ static int find_group_key(struct mem_group *dst,
const struct mem_tpm_mgr *parent)
{
int i, rc, rv = 1;
+ unsigned int olen;
struct hash160 buf;
struct disk_group_sealed_data sealed;
@@ -88,7 +89,13 @@ static int find_group_key(struct mem_group *dst,
TPM_pcr_digest(&buf, cfg->pcr_selection);
if (memcmp(&buf, &cfg->digest_release, 20))
continue;
- rc = TPM_disk_unseal(&sealed, sizeof(sealed), cfg);
+
+ /*TPM 2.0 unbind | TPM 1.x unseal*/
+ if (hw_is_tpm2())
+ rc = TPM2_disk_unbind(&sealed, &olen, cfg);
+ else
+ rc = TPM_disk_unseal(&sealed, sizeof(sealed), cfg);
+
if (rc)
continue;
if (memcmp(&sealed.magic, DISK_GROUP_BOUND_MAGIC, 4))
@@ -112,9 +119,15 @@ static int find_group_key(struct mem_group *dst,
static int parse_root_key(struct mem_tpm_mgr *dst, struct disk_seal_entry *src)
{
int rc;
+ unsigned int olen;
struct disk_root_sealed_data sealed;
- rc = TPM_disk_unseal(&sealed, sizeof(sealed), src);
+ /*TPM 2.0 unbind | TPM 1.x unseal*/
+ if (hw_is_tpm2())
+ rc = TPM2_disk_unbind(&sealed, &olen, src);
+ else
+ rc = TPM_disk_unseal(&sealed, sizeof(sealed), src);
+
if (rc)
return rc;
@@ -535,18 +548,18 @@ int vtpm_load_disk(void)
TPM_read_pcrs();
printk("TPM Manager - disk format %d\n", TPM_MGR_VERSION);
- printk(" root seal: %lu; sector of %d: %lu\n",
+ printk(" root seal: %zu; sector of %d: %zu\n",
sizeof(struct disk_root_sealed_data), SEALS_PER_ROOT_SEAL_LIST, sizeof(struct disk_seal_list));
- printk(" root: %lu v=%lu\n", sizeof(root1), sizeof(root1.v));
- printk(" itree: %lu; sector of %d: %lu\n",
+ printk(" root: %zu v=%zu\n", sizeof(root1), sizeof(root1.v));
+ printk(" itree: %u; sector of %d: %zu\n",
4 + 32, NR_ENTRIES_PER_ITREE, sizeof(struct disk_itree_sector));
- printk(" group: %lu v=%lu id=%lu md=%lu\n",
+ printk(" group: %zu v=%zu id=%zu md=%zu\n",
sizeof(struct disk_group_sector), sizeof(struct disk_group_sector_mac3_area),
sizeof(struct group_id_data), sizeof(struct group_details));
- printk(" group seal: %lu; %d in parent: %lu; sector of %d: %lu\n",
+ printk(" group seal: %zu; %d in parent: %zu; sector of %d: %zu\n",
sizeof(struct disk_group_sealed_data), NR_SEALS_PER_GROUP, sizeof(struct disk_group_boot_config_list),
SEALS_PER_GROUP_SEAL_LIST, sizeof(struct disk_group_seal_list));
- printk(" vtpm: %lu+%lu; sector of %d: %lu\n",
+ printk(" vtpm: %zu+%zu; sector of %d: %zu\n",
sizeof(struct disk_vtpm_plain), sizeof(struct disk_vtpm_secret),
VTPMS_PER_SECTOR, sizeof(struct disk_vtpm_sector));
diff --git a/stubdom/vtpmmgr/disk_tpm.c b/stubdom/vtpmmgr/disk_tpm.c
index d650fbc..45a326a 100644
--- a/stubdom/vtpmmgr/disk_tpm.c
+++ b/stubdom/vtpmmgr/disk_tpm.c
@@ -12,17 +12,20 @@
#include <polarssl/sha1.h>
#include "tpm.h"
+#include "tpm2.h"
#include "tcg.h"
#include "vtpmmgr.h"
#include "vtpm_disk.h"
#include "disk_tpm.h"
+#include "log.h"
// Print out input/output of seal/unseal operations (includes keys)
#undef DEBUG_SEAL_OPS
#ifdef DEBUG_SEAL_OPS
#include "marshal.h"
+#include "tpm2_marshal.h"
#endif
struct pcr_list {
@@ -31,11 +34,16 @@ struct pcr_list {
static struct pcr_list hwtpm;
+/* Read PCR values: TPM 1.x needs them for seal/unseal; TPM 2.0 bind/unbind ignores them */
void TPM_read_pcrs(void)
{
int i;
- for(i=0; i < 24; i++)
- TPM_PCR_Read(i, &hwtpm.pcrs[i]);
+ for (i=0; i < 24; i++) {
+ if (hw_is_tpm2())
+ tpm2_pcr_read(i, (uint8_t *)&hwtpm.pcrs[i]);
+ else
+ TPM_PCR_Read(i, &hwtpm.pcrs[i]);
+ }
}
struct pcr_composite_3 {
@@ -138,6 +146,36 @@ int TPM_disk_seal(struct disk_seal_entry *dst, const void* src, size_t size)
return rc;
}
+TPM_RC TPM2_disk_bind(struct disk_seal_entry *dst, void* src, unsigned int size)
+{
+ TPM_RESULT status = TPM_SUCCESS;
+
+ TPMTRYRETURN(TPM2_Bind(vtpm_globals.sk_handle,
+ src,
+ size,
+ dst->sealed_data));
+
+abort_egress:
+egress:
+ return status;
+}
+
+TPM_RC TPM2_disk_unbind(void *dst, unsigned int *size, const struct disk_seal_entry *src)
+{
+ TPM_RESULT status = TPM_SUCCESS;
+ unsigned char buf[RSA_CIPHER_SIZE];
+
+ memcpy(buf, src->sealed_data, RSA_CIPHER_SIZE);
+ TPMTRYRETURN(TPM2_UnBind(vtpm_globals.sk_handle,
+ RSA_CIPHER_SIZE,
+ buf,
+ size,
+ dst));
+abort_egress:
+egress:
+ return status;
+}
+
int TPM_disk_unseal(void *dst, size_t size, const struct disk_seal_entry *src)
{
uint32_t rc;
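
The new TPM2_disk_bind()/TPM2_disk_unbind() pair replaces TPM 1.x seal/unseal with RSA encryption under the TPM 2.0 storage key (vtpm_globals.sk_handle). A minimal round-trip sketch, assuming the plaintext fits in a single RSA block (RSA_CIPHER_SIZE) as the unbind path requires; the helper name is hypothetical:

    static int bind_unbind_roundtrip(void)
    {
        struct disk_seal_entry entry;
        struct disk_root_sealed_data in = { 0 }, out;
        unsigned int olen = 0;

        /* RSA-encrypt the blob into entry.sealed_data under the storage key. */
        if (TPM2_disk_bind(&entry, &in, sizeof(in)) != TPM_SUCCESS)
            return 1;

        /* Decrypt it back; olen reports the recovered plaintext length. */
        if (TPM2_disk_unbind(&out, &olen, &entry) != TPM_SUCCESS)
            return 1;

        return 0;
    }
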
diff --git a/stubdom/vtpmmgr/disk_tpm.h b/stubdom/vtpmmgr/disk_tpm.h
index b235895..57ae2a6 100644
--- a/stubdom/vtpmmgr/disk_tpm.h
+++ b/stubdom/vtpmmgr/disk_tpm.h
@@ -10,6 +10,10 @@ void TPM_pcr_digest(struct hash160 *buf, le32_t selection);
int TPM_disk_seal(struct disk_seal_entry *dst, const void* src, size_t size);
int TPM_disk_unseal(void *dst, size_t size, const struct disk_seal_entry *src);
+/*TPM 2.0 Bind and Unbind */
+TPM_RC TPM2_disk_bind(struct disk_seal_entry *dst, void* src, unsigned int size);
+TPM_RC TPM2_disk_unbind(void *dst, unsigned int *size, const struct disk_seal_entry *src);
+
/* NVRAM to allow revocation of TM-KEY */
int TPM_disk_nvalloc(be32_t *nvram_slot, struct tpm_authdata auth);
int TPM_disk_nvread(void *buf, size_t bufsiz, be32_t nvram_slot, struct tpm_authdata auth);
diff --git a/stubdom/vtpmmgr/disk_write.c b/stubdom/vtpmmgr/disk_write.c
index 4c825c5..ab15a9a 100644
--- a/stubdom/vtpmmgr/disk_write.c
+++ b/stubdom/vtpmmgr/disk_write.c
@@ -88,7 +88,12 @@ static void generate_group_seals(struct mem_group *src, const struct mem_tpm_mgr
dst->pcr_selection = src->seals[i].pcr_selection;
memcpy(&dst->digest_release, &src->seals[i].digest_release, 20);
TPM_pcr_digest(&dst->digest_at_seal, dst->pcr_selection);
- TPM_disk_seal(dst, &sblob, sizeof(sblob));
+
+ /*TPM 2.0 bind | TPM 1.x seal*/
+ if (hw_is_tpm2())
+ TPM2_disk_bind(dst, &sblob, sizeof(sblob));
+ else
+ TPM_disk_seal(dst, &sblob, sizeof(sblob));
}
src->seal_bits.nr_cfgs = native_be32(src->nr_seals);
@@ -250,7 +255,11 @@ static void disk_write_seal_list(struct mem_tpm_mgr *mgr, struct mem_group *grou
memcpy(&dst->digest_release, &src->digest_release, 20);
TPM_pcr_digest(&dst->digest_at_seal, dst->pcr_selection);
- TPM_disk_seal(dst, &sblob, sizeof(sblob));
+ /*TPM 2.0 bind / TPM 1.x seal*/
+ if (hw_is_tpm2())
+ TPM2_disk_bind(dst, &sblob, sizeof(sblob));
+ else
+ TPM_disk_seal(dst, &sblob, sizeof(sblob));
}
memcpy(seal->hdr.magic, TPM_MGR_MAGIC, 12);
diff --git a/stubdom/vtpmmgr/init.c b/stubdom/vtpmmgr/init.c
index f3aa02f..1506735 100644
--- a/stubdom/vtpmmgr/init.c
+++ b/stubdom/vtpmmgr/init.c
@@ -51,6 +51,8 @@
#include "vtpm_disk.h"
#include "tpm.h"
#include "marshal.h"
+#include "tpm2_marshal.h"
+#include "tpm2.h"
struct Opts {
enum {
@@ -509,3 +511,280 @@ void vtpmmgr_shutdown(void)
vtpmloginfo(VTPM_LOG_VTPM, "VTPM Manager stopped.\n");
}
+
+/* TPM 2.0 */
+
+static void tpm2_AuthArea_ctor(const char *authValue, UINT32 authLen,
+ TPM_AuthArea *auth)
+{
+ auth->sessionHandle = TPM_RS_PW;
+ auth->nonce.size = 0;
+ auth->sessionAttributes = 1;
+ auth->auth.size = authLen;
+ memcpy(auth->auth.buffer, authValue, authLen);
+ auth->size = 9 + authLen;
+}
+
+TPM_RC tpm2_take_ownership(void)
+{
+ TPM_RC status = TPM_SUCCESS;
+
+ tpm2_AuthArea_ctor(NULL, 0, &vtpm_globals.pw_auth);
+
+ /* create SRK */
+ TPM2_CreatePrimary_Params_in in = {
+ .inSensitive = {
+ .size = 4,
+ .sensitive = {
+ .userAuth.size = 0,
+ .userAuth.buffer = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ .data.size = 0,
+ },
+ },
+ .inPublic = {
+ .size = 60,
+ .publicArea = {
+ .type = TPM2_ALG_RSA,
+ .nameAlg = TPM2_ALG_SHA256,
+#define SRK_OBJ_ATTR (fixedTPM | fixedParent | userWithAuth | \
+ sensitiveDataOrigin | restricted | decrypt)
+ .objectAttributes = SRK_OBJ_ATTR,
+ .authPolicy.size = 0,
+ .parameters.rsaDetail = {
+ .symmetric = {
+ .algorithm = TPM2_ALG_AES,
+ .keyBits.aes = AES_KEY_SIZES_BITS,
+ .mode.aes = TPM2_ALG_CFB,
+ },
+ .scheme = { TPM2_ALG_NULL },
+ .keyBits = RSA_KEY_SIZES_BITS,
+ .exponent = 0,
+ },
+ .unique.rsa.size = 0,
+ },
+ },
+ .outsideInfo.size = 0,
+ .creationPCR.count = 0,
+ };
+
+ TPMTRYRETURN(TPM2_CreatePrimary(TPM_RH_OWNER,&in,
+ &vtpm_globals.srk_handle, NULL));
+ vtpmloginfo(VTPM_LOG_VTPM, "SRK handle: 0x%X\n", vtpm_globals.srk_handle);
+ {
+ const char data[20] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ tpm2_AuthArea_ctor(data, 20, &vtpm_globals.srk_auth_area);
+ }
+ /*end create SRK*/
+
+abort_egress:
+ return status;
+}
+
+TPM_RESULT vtpmmgr2_create(void)
+{
+ TPM_RESULT status = TPM_SUCCESS;
+
+ TPMTRYRETURN(tpm2_take_ownership());
+
+ /* create SK */
+ TPM2_Create_Params_out out;
+ TPM2_Create_Params_in in = {
+ .inSensitive = {
+ .size = 4 + 20,
+ .sensitive = {
+ .userAuth.size = 20,
+ .userAuth.buffer = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ .data.size = 0,
+ },
+ },
+ .inPublic = {
+ .size = (60),
+ .publicArea = {
+ .type = TPM2_ALG_RSA,
+ .nameAlg = TPM2_ALG_SHA256,
+#define SK_OBJ_ATTR (fixedTPM | fixedParent | userWithAuth |\
+ sensitiveDataOrigin |decrypt)
+ .objectAttributes = SK_OBJ_ATTR,
+ .authPolicy.size = 0,
+ .parameters.rsaDetail = {
+ .symmetric = {
+ .algorithm = TPM2_ALG_NULL,
+ },
+ .scheme = {
+ TPM2_ALG_OAEP,
+ .details.oaep.hashAlg = TPM2_ALG_SHA256,
+ },
+ .keyBits = RSA_KEY_SIZES_BITS,
+ .exponent = 0,
+ },
+ .unique.rsa.size = 0,
+ },
+ },
+ .outsideInfo.size = 0,
+ .creationPCR.count = 0,
+ };/*end in */
+
+ TPMTRYRETURN(TPM2_Create(vtpm_globals.srk_handle, &in, &out));
+ TPMTRYRETURN(TPM2_Load(vtpm_globals.srk_handle,
+ &vtpm_globals.tpm2_storage_key.Private,
+ &vtpm_globals.tpm2_storage_key.Public,
+ &vtpm_globals.sk_handle,
+ &vtpm_globals.sk_name));
+
+ vtpmloginfo(VTPM_LOG_VTPM, "SK HANDLE: 0x%X\n", vtpm_globals.sk_handle);
+
+ /*Create new disk image*/
+ TPMTRYRETURN(vtpm_new_disk());
+
+ goto egress;
+
+abort_egress:
+egress:
+ vtpmloginfo(VTPM_LOG_VTPM, "Finished initialized new VTPM manager\n");
+ return status;
+}
+
+static int tpm2_entropy_source(void* dummy, unsigned char* data,
+ size_t len, size_t* olen)
+{
+ UINT32 sz = len;
+ TPM_RESULT rc = TPM2_GetRandom(&sz, data);
+ *olen = sz;
+ return rc == TPM_SUCCESS ? 0 : POLARSSL_ERR_ENTROPY_SOURCE_FAILED;
+}
+
+/*TPM 2.0 Objects flush */
+static TPM_RC flush_tpm2(void)
+{
+ int i;
+
+ for (i = TRANSIENT_FIRST; i < TRANSIENT_LAST; i++)
+ TPM2_FlushContext(i);
+
+ return TPM_SUCCESS;
+}
+
+TPM_RESULT vtpmmgr2_init(int argc, char** argv)
+{
+ TPM_RESULT status = TPM_SUCCESS;
+
+ /* Default commandline options */
+ struct Opts opts = {
+ .tpmdriver = TPMDRV_TPM_TIS,
+ .tpmiomem = TPM_BASEADDR,
+ .tpmirq = 0,
+ .tpmlocality = 0,
+ .gen_owner_auth = 0,
+ };
+
+ if (parse_cmdline_opts(argc, argv, &opts) != 0) {
+ vtpmlogerror(VTPM_LOG_VTPM, "Command line parsing failed! exiting..\n");
+ status = TPM_BAD_PARAMETER;
+ goto abort_egress;
+ }
+
+ /*Setup storage system*/
+ if (vtpm_storage_init() != 0) {
+ vtpmlogerror(VTPM_LOG_VTPM, "Unable to initialize storage subsystem!\n");
+ status = TPM_IOERROR;
+ goto abort_egress;
+ }
+
+ /*Setup tpmback device*/
+ init_tpmback(set_opaque, free_opaque);
+
+ /*Setup tpm access*/
+ switch(opts.tpmdriver) {
+ case TPMDRV_TPM_TIS:
+ {
+ struct tpm_chip* tpm;
+ if ((tpm = init_tpm2_tis(opts.tpmiomem, TPM_TIS_LOCL_INT_TO_FLAG(opts.tpmlocality),
+ opts.tpmirq)) == NULL) {
+ vtpmlogerror(VTPM_LOG_VTPM, "Unable to initialize tpmfront device\n");
+ status = TPM_IOERROR;
+ goto abort_egress;
+ }
+ printk("init_tpm2_tis() ...ok\n");
+ vtpm_globals.tpm_fd = tpm_tis_open(tpm);
+ tpm_tis_request_locality(tpm, opts.tpmlocality);
+ }
+ break;
+ case TPMDRV_TPMFRONT:
+ {
+ struct tpmfront_dev* tpmfront_dev;
+ if ((tpmfront_dev = init_tpmfront(NULL)) == NULL) {
+ vtpmlogerror(VTPM_LOG_VTPM, "Unable to initialize tpmfront device\n");
+ status = TPM_IOERROR;
+ goto abort_egress;
+ }
+ vtpm_globals.tpm_fd = tpmfront_open(tpmfront_dev);
+ }
+ break;
+ }
+ printk("TPM 2.0 access ...ok\n");
+ /* Blow away all stale handles left in the tpm*/
+ if (flush_tpm2() != TPM_SUCCESS) {
+ vtpmlogerror(VTPM_LOG_VTPM, "VTPM_FlushResources failed, continuing anyway..\n");
+ }
+
+ /* Initialize the rng */
+ entropy_init(&vtpm_globals.entropy);
+ entropy_add_source(&vtpm_globals.entropy, tpm2_entropy_source, NULL, 0);
+ entropy_gather(&vtpm_globals.entropy);
+ ctr_drbg_init(&vtpm_globals.ctr_drbg, entropy_func, &vtpm_globals.entropy, NULL, 0);
+ ctr_drbg_set_prediction_resistance( &vtpm_globals.ctr_drbg, CTR_DRBG_PR_OFF );
+
+ /* Generate Auth for Owner*/
+ if (opts.gen_owner_auth) {
+ vtpmmgr_rand(vtpm_globals.owner_auth, sizeof(TPM_AUTHDATA));
+ }
+
+ /* Load the Manager data, if it fails create a new manager */
+ if (vtpm_load_disk()) {
+ vtpmloginfo(VTPM_LOG_VTPM, "Assuming first time initialization.\n");
+ TPMTRYRETURN(vtpmmgr2_create());
+ }
+
+ goto egress;
+
+abort_egress:
+ vtpmmgr_shutdown();
+egress:
+ return status;
+}
+
+TPM_RC tpm2_pcr_read(int index, uint8_t *buf)
+{
+ TPM_RESULT status = TPM_SUCCESS;
+ TPML_PCR_SELECTION pcrSelectionIn = {
+ .count = 1,};
+
+ TPMS_PCR_SELECTION tpms_pcr_selection = {
+ .hash = TPM2_ALG_SHA1,
+ .sizeofSelect = PCR_SELECT_MAX,};
+
+ UINT32 pcrUpdateCounter;
+ TPML_PCR_SELECTION pcrSelectionOut;
+ TPML_DIGEST pcrValues;
+ TPM2B_DIGEST tpm2b_digest;
+
+ tpms_pcr_selection.pcrSelect[PCR_SELECT_NUM(index)] = PCR_SELECT_VALUE(index);
+ memcpy(&pcrSelectionIn.pcrSelections[0], &tpms_pcr_selection,
+ sizeof(TPMS_PCR_SELECTION));
+
+ TPMTRYRETURN(TPM2_PCR_Read(pcrSelectionIn, &pcrUpdateCounter,
+ &pcrSelectionOut, &pcrValues));
+
+ if (pcrValues.count < 1)
+ goto egress;
+
+ unpack_TPM2B_DIGEST((uint8_t *) &pcrValues, &tpm2b_digest);
+ memcpy(buf, tpm2b_digest.buffer, SHA1_DIGEST_SIZE);
+
+abort_egress:
+egress:
+ return status;
+}
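
A note on tpm2_AuthArea_ctor() above: the 9 + authLen stored in auth->size is the serialized size of the password authorization area minus its own 4-byte size prefix, which is what pack_TPM_AuthArea() (added later in this patch) backpatches. A sketch of the breakdown:

    /* Wire layout following the leading 4-byte size field:
     *   sessionHandle (TPM_RS_PW)  4 bytes
     *   nonce.size (zero here)     2 bytes
     *   sessionAttributes          1 byte
     *   auth.size                  2 bytes
     *   auth.buffer                authLen bytes
     */
    static inline unsigned int auth_area_wire_size(unsigned int authLen)
    {
        return 4 + 2 + 1 + 2 + authLen;   /* = 9 + authLen */
    }
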
diff --git a/stubdom/vtpmmgr/marshal.h b/stubdom/vtpmmgr/marshal.h
index bcc7c46..d826f19 100644
--- a/stubdom/vtpmmgr/marshal.h
+++ b/stubdom/vtpmmgr/marshal.h
@@ -195,6 +195,7 @@ inline int unpack3_UINT32(BYTE* ptr, UINT32* pos, UINT32 max, UINT32 *t)
#define unpack3_TPM_PHYSICAL_PRESENCE(p, l, m, t) unpack3_UINT16(p, l, m, t)
#define unpack3_TPM_KEY_FLAGS(p, l, m, t) unpack3_UINT32(p, l, m, t)
#define unpack3_TPM_LOCALITY_SELECTION(p, l, m, t) unpack3_BYTE(p, l, m, t)
+#define unpack3_TPM_DEEP_QUOTE_INFO(p, l, m, t) unpack3_UINT32(p, l, m, t)
#define sizeof_TPM_RESULT(t) sizeof_UINT32(t)
#define sizeof_TPM_PCRINDEX(t) sizeof_UINT32(t)
diff --git a/stubdom/vtpmmgr/mgmt_authority.c b/stubdom/vtpmmgr/mgmt_authority.c
index 0526a12..b839a20 100644
--- a/stubdom/vtpmmgr/mgmt_authority.c
+++ b/stubdom/vtpmmgr/mgmt_authority.c
@@ -128,6 +128,55 @@ static int do_load_aik(struct mem_group *group, TPM_HANDLE *handle)
return TPM_LoadKey(TPM_SRK_KEYHANDLE, &key, handle, (void*)&vtpm_globals.srk_auth, &vtpm_globals.oiap);
}
+static void do_vtpminfo_hash(uint32_t extra_info_flags,struct mem_group *group,
+ const void* uuid, const uint8_t* kern_hash,unsigned char** calc_hashes)
+{
+ int i;
+ sha1_context ctx;
+ if(extra_info_flags & VTPM_QUOTE_FLAGS_HASH_UUID){
+ printk("hashing for FLAGS_HASH_UUID: ");
+ sha1_starts(&ctx);
+ if(uuid){
+ printk("true");
+ sha1_update(&ctx, (void*)uuid, 16);
+ }
+ sha1_finish(&ctx, *calc_hashes);
+ *calc_hashes = *calc_hashes + 20;
+ printk("\n");
+ }
+ if(extra_info_flags & VTPM_QUOTE_FLAGS_VTPM_MEASUREMENTS){
+ printk("hashing for VTPM_QUOTE_FLAGS_VTPM_MEASUREMENTS: ");
+ sha1_starts(&ctx);
+ if(kern_hash){
+ printk("true");
+ sha1_update(&ctx, (void*)kern_hash, 20);
+ }
+ sha1_finish(&ctx, *calc_hashes);
+ *calc_hashes = *calc_hashes + 20;
+ printk("\n");
+ }
+ if(extra_info_flags & VTPM_QUOTE_FLAGS_GROUP_INFO){
+ printk("hashing for VTPM_QUOTE_FLAGS_GROUP_INFO: true\n");
+ sha1_starts(&ctx);
+ sha1_update(&ctx, (void*)&group->id_data.saa_pubkey, sizeof(group->id_data.saa_pubkey));
+ sha1_update(&ctx, (void*)&group->details.cfg_seq, 8);
+ sha1_update(&ctx, (void*)&group->seal_bits.nr_cfgs, 4);
+ for(i=0; i < group->nr_seals; i++)
+ sha1_update(&ctx, (void*)&group->seals[i].digest_release, 20);
+ sha1_update(&ctx, (void*)&group->seal_bits.nr_kerns, 4);
+ sha1_update(&ctx, (void*)&group->seal_bits.kernels, 20 * be32_native(group->seal_bits.nr_kerns));
+ sha1_finish(&ctx, *calc_hashes);
+ *calc_hashes = *calc_hashes + 20;
+ }
+ if(extra_info_flags & VTPM_QUOTE_FLAGS_GROUP_PUBKEY){
+ printk("hashing for VTPM_QUOTE_FLAGS_GROUP_PUBKEY: true\n");
+ sha1_starts(&ctx);
+ sha1_update(&ctx, (void*)&group->id_data.saa_pubkey, sizeof(group->id_data.saa_pubkey));
+ sha1_finish(&ctx, *calc_hashes);
+ *calc_hashes = *calc_hashes + 20;
+ }
+}
+
/*
* Sets up resettable PCRs for a vTPM deep quote request
*/
@@ -273,18 +322,40 @@ int group_do_activate(struct mem_group *group, void* blob, int blobSize,
int vtpm_do_quote(struct mem_group *group, const uuid_t uuid,
const uint8_t* kern_hash, const struct tpm_authdata *data, TPM_PCR_SELECTION *sel,
- void* pcr_out, uint32_t *pcr_size, void* sig_out)
+ uint32_t extra_info_flags, void* pcr_out, uint32_t *pcr_size, void* sig_out)
{
TPM_HANDLE handle;
TPM_AUTH_SESSION oiap = TPM_AUTH_SESSION_INIT;
TPM_PCR_COMPOSITE pcrs;
BYTE* sig;
UINT32 size;
+ sha1_context ctx;
+ TPM_DIGEST externData;
+ const void* data_to_quote = data;
+ unsigned char* ppcr_out = (unsigned char*)pcr_out;
+ unsigned char** pcr_outv = (unsigned char**)&ppcr_out;
+
int rc;
+ printk("Extra Info Flags =0x%x\n",extra_info_flags);
+ if((extra_info_flags & ~VTPM_QUOTE_FLAGS_HASH_UUID
+ & ~VTPM_QUOTE_FLAGS_VTPM_MEASUREMENTS
+ & ~VTPM_QUOTE_FLAGS_GROUP_INFO
+ & ~VTPM_QUOTE_FLAGS_GROUP_PUBKEY) != 0)
+ return VTPM_INVALID_REQUEST;
- rc = do_pcr_setup(group, uuid, kern_hash);
- if (rc)
- return rc;
+ sha1_starts(&ctx);
+ sha1_update(&ctx, (void*)&extra_info_flags, 4);
+ sha1_update(&ctx, (void*)data, 20);
+ if(pcr_out!=NULL && extra_info_flags!=0)
+ {
+ /*creates hashes and sets them to pcr_out*/
+ do_vtpminfo_hash(extra_info_flags,group, uuid, kern_hash, pcr_outv);
+ *pcr_size = *pcr_outv - (unsigned char*)pcr_out;
+ if(*pcr_size > 0)
+ sha1_update(&ctx, pcr_out, *pcr_size);
+ }
+ sha1_finish(&ctx, externData.digest);
+ data_to_quote = (void*)externData.digest;
rc = do_load_aik(group, &handle);
if (rc)
@@ -296,8 +367,7 @@ int vtpm_do_quote(struct mem_group *group, const uuid_t uuid,
return rc;
}
- rc = TPM_Quote(handle, (void*)data, sel, (void*)&group->aik_authdata, &oiap, &pcrs, &sig, &size);
- printk("TPM_Quote: %d\n", rc);
+ rc = TPM_Quote(handle, data_to_quote, sel, (void*)&group->aik_authdata, &oiap, &pcrs, &sig, &size);
TPM_TerminateHandle(oiap.AuthHandle);
TPM_FlushSpecific(handle, TPM_RT_KEY);
@@ -306,16 +376,19 @@ int vtpm_do_quote(struct mem_group *group, const uuid_t uuid,
return rc;
if (size != 256) {
printk("Bad size\n");
- return TPM_FAIL;
+ rc = TPM_FAIL;
+ goto end;
}
if (pcr_out) {
- *pcr_size = pcrs.valueSize;
- memcpy(pcr_out, pcrs.pcrValue, *pcr_size);
+ /*append TPM_PCRVALUEs after externData hashes*/
+ memcpy(pcr_out+*pcr_size, pcrs.pcrValue, pcrs.valueSize);
+ *pcr_size = *pcr_size + pcrs.valueSize;
}
memcpy(sig_out, sig, size);
+end:
free_TPM_PCR_COMPOSITE(&pcrs);
free(sig);
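
With this change the externalData actually quoted is no longer the caller's nonce itself: it is a SHA-1 digest binding the flags, the nonce, and the extra-info hashes that now lead pcr_out. A verifier-side sketch of recomputing that digest, using the same PolarSSL SHA-1 calls as the code above (the helper name is hypothetical; note that the flags are hashed in host byte order, exactly as vtpm_do_quote() does):

    #include <stddef.h>
    #include <stdint.h>
    #include <polarssl/sha1.h>

    static void recompute_extern_data(uint32_t flags, const unsigned char nonce[20],
                                      const unsigned char *extra_hashes,
                                      size_t extra_len, unsigned char out[20])
    {
        sha1_context ctx;

        sha1_starts(&ctx);
        sha1_update(&ctx, (const unsigned char *)&flags, 4);
        sha1_update(&ctx, nonce, 20);
        if (extra_len > 0)                /* the leading bytes of pcr_out */
            sha1_update(&ctx, extra_hashes, extra_len);
        sha1_finish(&ctx, out);
    }
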
diff --git a/stubdom/vtpmmgr/mgmt_authority.h b/stubdom/vtpmmgr/mgmt_authority.h
index 1e96c8a..cdd06aa 100644
--- a/stubdom/vtpmmgr/mgmt_authority.h
+++ b/stubdom/vtpmmgr/mgmt_authority.h
@@ -5,7 +5,7 @@ struct mem_group *vtpm_new_group(const struct tpm_authdata *privCADigest);
int group_do_activate(struct mem_group *group, void* blob, int blobSize,
void* resp, unsigned int *rlen);
int vtpm_do_quote(struct mem_group *group, const uuid_t uuid,
- const uint8_t* kern_hash, const struct tpm_authdata *data, TPM_PCR_SELECTION *sel,
+ const uint8_t* kern_hash, const struct tpm_authdata *data, TPM_PCR_SELECTION *sel, uint32_t extraInfoFlags,
void* pcr_out, uint32_t *pcr_size, void* sig_out);
#endif
diff --git a/stubdom/vtpmmgr/tcg.h b/stubdom/vtpmmgr/tcg.h
index 7321ec6..813ce57 100644
--- a/stubdom/vtpmmgr/tcg.h
+++ b/stubdom/vtpmmgr/tcg.h
@@ -39,6 +39,7 @@
#include <stdlib.h>
#include <stdint.h>
+#include "common_types.h"
// **************************** CONSTANTS *********************************
@@ -401,12 +402,6 @@
// *************************** TYPEDEFS *********************************
-typedef unsigned char BYTE;
-typedef unsigned char BOOL;
-typedef uint16_t UINT16;
-typedef uint32_t UINT32;
-typedef uint64_t UINT64;
-
typedef UINT32 TPM_RESULT;
typedef UINT32 TPM_PCRINDEX;
typedef UINT32 TPM_DIRINDEX;
diff --git a/stubdom/vtpmmgr/tpm2.c b/stubdom/vtpmmgr/tpm2.c
new file mode 100644
index 0000000..c9f1016
--- /dev/null
+++ b/stubdom/vtpmmgr/tpm2.c
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2014 Intel Corporation.
+ *
+ * Authors:
+ * Quan Xu <quan.xu at intel.com>
+ *
+ * Copyright (c) 2010-2012 United States Government, as represented by
+ * the Secretary of Defense. All rights reserved.
+ *
+ * based off of the original tools/vtpm_manager code base which is:
+ * Copyright (c) 2005/2006, Intel Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <malloc.h>
+#include <unistd.h>
+#include <errno.h>
+#include <polarssl/sha1.h>
+
+#include "tcg.h"
+#include "tpm.h"
+#include "tpm2.h"
+#include "log.h"
+#include "marshal.h"
+#include "tpm2_marshal.h"
+#include "tpmrsa.h"
+#include "vtpmmgr.h"
+
+#define TCPA_MAX_BUFFER_LENGTH 0x2000
+#define TPM_BEGIN(TAG, ORD) \
+ const TPM_TAG intag = TAG;\
+ TPM_TAG tag = intag;\
+ UINT32 paramSize;\
+ const TPM_COMMAND_CODE ordinal = ORD;\
+ TPM_RESULT status = TPM_SUCCESS;\
+ BYTE in_buf[TCPA_MAX_BUFFER_LENGTH];\
+ BYTE out_buf[TCPA_MAX_BUFFER_LENGTH];\
+ UINT32 out_len = sizeof(out_buf);\
+ BYTE* ptr = in_buf;\
+ /*Print a log message */\
+ vtpmloginfo(VTPM_LOG_TPM, "%s\n", __func__);\
+ /* Pack the header*/\
+ ptr = pack_TPM_TAG(ptr, tag);\
+ ptr += sizeof(UINT32);\
+ ptr = pack_TPM_COMMAND_CODE(ptr, ordinal)\
+
+#define TPM_AUTH_BEGIN() \
+ sha1_context sha1_ctx;\
+ BYTE* authbase = ptr - sizeof(TPM_COMMAND_CODE);\
+ TPM_DIGEST paramDigest;\
+ sha1_starts(&sha1_ctx)
+
+#define TPM_AUTH1_GEN(HMACkey, auth) do {\
+ sha1_finish(&sha1_ctx, paramDigest.digest);\
+ generateAuth(&paramDigest, HMACkey, auth);\
+ ptr = pack_TPM_AUTH_SESSION(ptr, auth);\
+} while(0)
+
+#define TPM_AUTH2_GEN(HMACkey, auth) do {\
+ generateAuth(&paramDigest, HMACkey, auth);\
+ ptr = pack_TPM_AUTH_SESSION(ptr, auth);\
+} while(0)
+
+#define TPM_TRANSMIT() do {\
+ /* Pack the command size */\
+ paramSize = ptr - in_buf;\
+ pack_UINT32(in_buf + sizeof(TPM_TAG), paramSize);\
+ if ((status = TPM_TransmitData(in_buf, paramSize, out_buf, &out_len)) != TPM_SUCCESS) {\
+ goto abort_egress;\
+ }\
+} while(0)
+
+#define TPM_AUTH_VERIFY_BEGIN() do {\
+ UINT32 buf[2] = { cpu_to_be32(status), cpu_to_be32(ordinal) };\
+ sha1_starts(&sha1_ctx);\
+ sha1_update(&sha1_ctx, (unsigned char*)buf, sizeof(buf));\
+ authbase = ptr;\
+} while(0)
+
+#define TPM_AUTH1_VERIFY(HMACkey, auth) do {\
+ sha1_finish(&sha1_ctx, paramDigest.digest);\
+ ptr = unpack_TPM_AUTH_SESSION(ptr, auth);\
+ if ((status = verifyAuth(&paramDigest, HMACkey, auth)) != TPM_SUCCESS) {\
+ goto abort_egress;\
+ }\
+} while(0)
+
+#define TPM_AUTH2_VERIFY(HMACkey, auth) do {\
+ ptr = unpack_TPM_AUTH_SESSION(ptr, auth);\
+ if ((status = verifyAuth(&paramDigest, HMACkey, auth)) != TPM_SUCCESS) {\
+ goto abort_egress;\
+ }\
+} while(0)
+
+#define TPM_UNPACK_VERIFY() do { \
+ ptr = out_buf;\
+ ptr = unpack_TPM_RSP_HEADER(ptr, \
+ &(tag), &(paramSize), &(status));\
+ if ((status) != TPM_SUCCESS){ \
+ vtpmlogerror(VTPM_LOG_TPM, "Failed with return code %s\n", tpm_get_error_name(status));\
+ goto abort_egress;\
+ }\
+} while(0)
+
+#define TPM_AUTH_HASH() do {\
+ sha1_update(&sha1_ctx, authbase, ptr - authbase);\
+ authbase = ptr;\
+} while(0)
+
+#define TPM_AUTH_SKIP() do {\
+ authbase = ptr;\
+} while(0)
+
+TPM_RC TPM2_PCR_Read(TPML_PCR_SELECTION pcrSelectionIn,
+ UINT32 *pcrUpdateCounter,
+ TPML_PCR_SELECTION *pcrSelectionOut,
+ TPML_DIGEST *pcrValues)
+{
+ TPM_BEGIN(TPM_ST_NO_SESSIONS,TPM_CC_PCR_Read);
+
+ /*pack in*/
+ ptr = pack_TPML_PCR_SELECTION(ptr, &pcrSelectionIn);
+
+ TPM_TRANSMIT();
+ TPM_UNPACK_VERIFY();
+
+ /*unpack out*/
+ ptr = unpack_UINT32(ptr, pcrUpdateCounter);
+ ptr = unpack_TPML_PCR_SELECTION(ptr, pcrSelectionOut);
+ ptr = unpack_TPML_DIGEST(ptr, pcrValues);
+
+ goto egress;
+abort_egress:
+egress:
+ return status;
+}
+
+TPM_RC TPM2_Load(TPMI_DH_OBJECT parentHandle,
+ TPM2B_PRIVATE *inPrivate, /* in */
+ TPM2B_PUBLIC *inPublic, /* in */
+ TPM2_HANDLE *objectHandle, /* out */
+ TPM2B_NAME *name /* out */)
+{
+ TPM_BEGIN(TPM_ST_SESSIONS, TPM_CC_Load);
+
+ /* pack handle of parent for new object */
+ ptr = pack_UINT32(ptr, parentHandle);
+
+ ptr = pack_TPM_AuthArea(ptr, &vtpm_globals.srk_auth_area);
+ ptr = pack_TPM2B_PRIVATE(ptr, inPrivate);
+ ptr = pack_TPM2B_PUBLIC(ptr, inPublic);
+
+ TPM_TRANSMIT();
+ TPM_UNPACK_VERIFY();
+
+ if (objectHandle != NULL) {
+ ptr = unpack_TPM_HANDLE(ptr, objectHandle);
+ } else {
+ TPM2_HANDLE tmp;
+ ptr = unpack_TPM_HANDLE(ptr, &tmp);
+ }
+
+ if (name != NULL)
+ ptr = unpack_TPM2B_NAME(ptr, name);
+ goto egress;
+
+abort_egress:
+egress:
+ return status;
+}
+
+TPM_RC TPM2_Create(TPMI_DH_OBJECT parentHandle,
+ TPM2_Create_Params_in *in,
+ TPM2_Create_Params_out *out)
+{
+ UINT32 param_size;
+ TPM_BEGIN(TPM_ST_SESSIONS, TPM_CC_Create);
+
+ /* pack handle of parent for new object */
+ ptr = pack_UINT32(ptr, parentHandle);
+
+ /* pack Auth Area */
+ ptr = pack_TPM_AuthArea(ptr, &vtpm_globals.srk_auth_area);
+
+ /* pack inSensitive */
+ ptr = pack_TPM2B_SENSITIVE_CREATE(ptr, &in->inSensitive);
+
+ /* pack inPublic */
+ ptr = pack_TPM2B_PUBLIC(ptr, &in->inPublic);
+
+ /* pack outside Info */
+ ptr = pack_TPM2B_DATA(ptr, &in->outsideInfo);
+
+ /* pack createPCR */
+ ptr = pack_TPML_PCR_SELECTION(ptr, &in->creationPCR);
+
+ /* Send the command to the tpm */
+ TPM_TRANSMIT();
+
+ /* Unpack and validate the header */
+ TPM_UNPACK_VERIFY();
+
+ ptr = unpack_UINT32(ptr, &param_size);
+ if (out != NULL) {
+ ptr = unpack_TPM2B_PRIVATE(ptr, &vtpm_globals.tpm2_storage_key.Private);
+ ptr = unpack_TPM2B_PUBLIC(ptr, &vtpm_globals.tpm2_storage_key.Public);
+ ptr = unpack_TPM2B_CREATION_DATA(ptr, &out->creationData);
+ ptr = unpack_TPM2B_DIGEST(ptr, &out->creationHash);
+ ptr = unpack_TPMT_TK_CREATION(ptr, &out->creationTicket);
+ } else {
+ ptr += param_size;
+ }
+ goto egress;
+
+abort_egress:
+egress:
+ return status;
+}
+
+TPM_RC TPM2_CreatePrimary(TPMI_RH_HIERARCHY primaryHandle,
+ TPM2_Create_Params_in *in,
+ TPM2_HANDLE *objHandle,
+ TPM2_Create_Params_out *out)
+{
+ UINT32 param_size;
+ TPM_BEGIN(TPM_ST_SESSIONS, TPM_CC_CreatePrimary);
+
+ /* pack primary handle */
+ ptr = pack_UINT32(ptr, primaryHandle);
+
+ /* pack Auth Area */
+ ptr = pack_TPM_AuthArea(ptr, &vtpm_globals.pw_auth);
+
+ /* pack inSensitive */
+ ptr = pack_TPM2B_SENSITIVE_CREATE(ptr, &in->inSensitive);
+
+ /* pack inPublic */
+ ptr = pack_TPM2B_PUBLIC(ptr, &in->inPublic);
+
+ /* pack outsideInfo */
+ ptr = pack_TPM2B_DATA(ptr, &in->outsideInfo);
+
+ /* pack creationPCR */
+ ptr = pack_TPML_PCR_SELECTION(ptr, &in->creationPCR);
+
+ /* Send the command to the tpm */
+ TPM_TRANSMIT();
+
+ /* Unpack and validate the header */
+ TPM_UNPACK_VERIFY();
+
+ if (objHandle != NULL)
+ ptr = unpack_TPM_HANDLE(ptr, objHandle);
+ else {
+ TPM2_HANDLE handle;
+ ptr = unpack_TPM_HANDLE(ptr, &handle);
+ }
+ ptr = unpack_UINT32(ptr, &param_size);
+
+ if (out != NULL) {
+ ptr = unpack_TPM2B_PUBLIC(ptr, &out->outPublic);
+ ptr = unpack_TPM2B_CREATION_DATA(ptr, &out->creationData);
+ ptr = unpack_TPM2B_DIGEST(ptr, &out->creationHash);
+ ptr = unpack_TPMT_TK_CREATION(ptr, &out->creationTicket);
+ } else {
+ ptr += param_size;
+ }
+
+ goto egress;
+
+abort_egress:
+egress:
+ return status;
+}
+
+TPM_RC TPM2_HierachyChangeAuth(TPM2I_RH_HIERARCHY_AUTH authHandle, TPM2B_AUTH *newAuth)
+{
+ TPM_BEGIN(TPM_ST_SESSIONS, TPM_CC_HierarchyChangeAuth);
+ ptr = pack_UINT32(ptr, authHandle);
+ ptr = pack_TPM_AuthArea(ptr, &vtpm_globals.pw_auth);
+ ptr = pack_TPM2B_AUTH(ptr, newAuth);
+ TPM_TRANSMIT();
+ TPM_UNPACK_VERIFY();
+
+abort_egress:
+ return status;
+}
+
+TPM_RC TPM2_RSA_ENCRYPT(TPMI_DH_OBJECT keyHandle,
+ TPM2B_PUBLIC_KEY_RSA *message,
+ TPMT_RSA_DECRYPT *inScheme,
+ TPM2B_DATA *label,
+ TPM2B_PUBLIC_KEY_RSA *outData)
+{
+ TPM_BEGIN(TPM_ST_NO_SESSIONS, TPM_CC_RSA_Encrypt);
+
+ ptr = pack_UINT32(ptr, keyHandle);
+ ptr = pack_TPM2B_PUBLIC_KEY_RSA(ptr, message);
+ ptr = pack_TPMT_RSA_DECRYPT(ptr, inScheme);
+ ptr = pack_TPM2B_DATA(ptr, label);
+
+ TPM_TRANSMIT();
+ TPM_UNPACK_VERIFY();
+
+ if (outData != NULL)
+ unpack_TPM2B_PUBLIC_KEY_RSA(ptr, outData);
+abort_egress:
+ return status;
+}
+
+TPM_RC TPM2_Bind(TPMI_DH_OBJECT keyHandle,
+ void *buf,
+ UINT32 len,
+ void *out)
+{
+ TPM_RC status = TPM_SUCCESS;
+ TPM2B_PUBLIC_KEY_RSA message;
+ TPMT_RSA_DECRYPT inScheme;
+ TPM2B_DATA label;
+ TPM2B_PUBLIC_KEY_RSA outData;
+
+ message.size = len;
+ memcpy(message.buffer, buf, len);
+ inScheme.scheme = TPM2_ALG_NULL;
+ label.size = 0;
+ TPMTRYRETURN(TPM2_RSA_ENCRYPT(keyHandle, &message, &inScheme, &label, &outData));
+ memcpy(out, outData.buffer, outData.size);
+
+abort_egress:
+ return status;
+}
+
+TPM_RC TPM2_RSA_Decrypt(TPMI_DH_OBJECT keyHandle,
+ TPM2B_PUBLIC_KEY_RSA *cipherText,
+ TPMT_RSA_DECRYPT *inScheme,
+ TPM2B_DATA *label,
+ TPM2B_PUBLIC_KEY_RSA *message)
+{
+ UINT32 param_size;
+
+ TPM_BEGIN(TPM_ST_SESSIONS, TPM_CC_RSA_Decrypt);
+ ptr = pack_UINT32(ptr, keyHandle);
+ ptr = pack_TPM_AuthArea(ptr, &vtpm_globals.srk_auth_area);
+ ptr = pack_TPM2B_PUBLIC_KEY_RSA(ptr, cipherText);
+ ptr = pack_TPMT_RSA_DECRYPT(ptr, inScheme);
+ ptr = pack_TPM2B_DATA(ptr, label);
+
+ TPM_TRANSMIT();
+ TPM_UNPACK_VERIFY();
+
+ ptr = unpack_UINT32(ptr, &param_size);
+
+ if (message)
+ ptr = unpack_TPM2B_PUBLIC_KEY_RSA(ptr, message);
+
+abort_egress:
+ return status;
+}
+
+TPM_RC TPM2_UnBind(TPMI_DH_OBJECT keyHandle,
+ UINT32 ilen,
+ void *in,
+ UINT32 *olen,
+ void *out)
+{
+ UINT32 status;
+ TPM2B_PUBLIC_KEY_RSA cipher, message;
+ TPMT_RSA_DECRYPT inScheme;
+ TPM2B_DATA label;
+
+ cipher.size = ilen;
+ memcpy(cipher.buffer, in, ilen);
+ inScheme.scheme = TPM2_ALG_NULL;
+ label.size = 0;
+
+ TPMTRYRETURN(TPM2_RSA_Decrypt(keyHandle, &cipher, &inScheme, &label, &message));
+
+ *olen = message.size;
+ memcpy(out, message.buffer, *olen);
+
+abort_egress:
+ return status;
+}
+
+TPM_RC TPM2_CLEAR(void)
+{
+ TPM_BEGIN(TPM_ST_SESSIONS, TPM_CC_Clear);
+
+ ptr = pack_UINT32(ptr, TPM_RH_PLATFORM);
+ ptr = pack_TPM_AuthArea(ptr, &vtpm_globals.pw_auth);
+
+ TPM_TRANSMIT();
+ TPM_UNPACK_VERIFY();
+
+abort_egress:
+ return status;
+}
+
+TPM_RC TPM2_GetRandom(UINT32 * bytesRequested, BYTE * randomBytes)
+{
+ TPM_BEGIN(TPM_ST_NO_SESSIONS, TPM_CC_GetRandom);
+
+ ptr = pack_UINT16(ptr, (UINT16)*bytesRequested);
+
+ TPM_TRANSMIT();
+ TPM_UNPACK_VERIFY();
+
+ ptr = unpack_UINT16(ptr, (UINT16 *)bytesRequested);
+ ptr = unpack_TPM_BUFFER(ptr, randomBytes, *bytesRequested);
+
+abort_egress:
+ return status;
+}
+
+TPM_RC TPM2_FlushContext(TPMI_DH_CONTEXT flushHandle)
+{
+ TPM_BEGIN(TPM_ST_NO_SESSIONS, TPM_CC_FlushContext);
+
+ ptr = pack_UINT32(ptr, flushHandle);
+
+ TPM_TRANSMIT();
+ TPM_UNPACK_VERIFY();
+
+abort_egress:
+ return status;
+}
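
All of the wrappers above share one skeleton: TPM_BEGIN packs the request header, the parameters are packed into in_buf, TPM_TRANSMIT backpatches paramSize and performs the exchange, and TPM_UNPACK_VERIFY validates the response code. A sketch of a hypothetical extra wrapper, only to make the pattern explicit; TPM2_Shutdown_sketch and the TPM_CC_Shutdown ordinal are not part of this patch:

    TPM_RC TPM2_Shutdown_sketch(UINT16 shutdownType)
    {
        TPM_BEGIN(TPM_ST_NO_SESSIONS, TPM_CC_Shutdown);  /* hypothetical ordinal */

        /* Command parameters go here. */
        ptr = pack_UINT16(ptr, shutdownType);

        TPM_TRANSMIT();        /* fill in paramSize, send, receive */
        TPM_UNPACK_VERIFY();   /* unpack the response header, check rc */

        /* Response parameters, if any, would be unpacked from ptr here. */

    abort_egress:
        return status;
    }
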
diff --git a/stubdom/vtpmmgr/tpm2.h b/stubdom/vtpmmgr/tpm2.h
new file mode 100644
index 0000000..9e01286
--- /dev/null
+++ b/stubdom/vtpmmgr/tpm2.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2014 Intel Corporation.
+ *
+ * Authors:
+ * Quan Xu <quan.xu at intel.com>
+ *
+ * Copyright (c) 2010-2012 United States Government, as represented by
+ * the Secretary of Defense. All rights reserved.
+ *
+ * based off of the original tools/vtpm_manager code base which is:
+ * Copyright (c) 2005/2006, Intel Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __TPM2_H__
+#define __TPM2_H__
+
+#include "tcg.h"
+#include "tpm2_types.h"
+
+// ------------------------------------------------------------------
+// TPM 2.0 Exposed API
+// ------------------------------------------------------------------
+
+TPM_RC TPM2_PCR_Read(TPML_PCR_SELECTION pcrSelectionIn,
+ UINT32 *pcrUpdateCounter,
+ TPML_PCR_SELECTION *pcrSelectionOut,
+ TPML_DIGEST *pcrValues);
+
+TPM_RC TPM2_Load(TPMI_DH_OBJECT parentHandle,
+ TPM2B_PRIVATE *inPrivate,
+ TPM2B_PUBLIC *inPublic,
+ TPM2_HANDLE *objectHandle,
+ TPM2B_NAME *name);
+
+TPM_RC TPM2_Create(TPMI_DH_OBJECT parentHandle,
+ TPM2_Create_Params_in *in,
+ TPM2_Create_Params_out *out);
+
+TPM_RC TPM2_CreatePrimary(TPMI_RH_HIERARCHY primaryHandle,
+ TPM2_Create_Params_in *in,
+ TPM2_HANDLE *objHandle,
+ TPM2_Create_Params_out *out);
+
+TPM_RC TPM2_HierachyChangeAuth(TPM2I_RH_HIERARCHY_AUTH authHandle,
+ TPM2B_AUTH *newAuth);
+
+TPM_RC TPM2_RSA_ENCRYPT(TPMI_DH_OBJECT keyHandle,
+ TPM2B_PUBLIC_KEY_RSA *message,
+ TPMT_RSA_DECRYPT *inScheme,
+ TPM2B_DATA *label,
+ TPM2B_PUBLIC_KEY_RSA *outData);
+
+TPM_RC TPM2_Bind(TPMI_DH_OBJECT keyHandle,
+ void *buf,
+ UINT32 len,
+ void *out);
+
+TPM_RC TPM2_RSA_Decrypt(TPMI_DH_OBJECT keyHandle,
+ TPM2B_PUBLIC_KEY_RSA *cipherText,
+ TPMT_RSA_DECRYPT *inScheme,
+ TPM2B_DATA *label,
+ TPM2B_PUBLIC_KEY_RSA *message);
+
+TPM_RC TPM2_UnBind(TPMI_DH_OBJECT keyHandle,
+ UINT32 ilen,
+ void *in,
+ UINT32 *olen,
+ void *out);
+
+TPM_RESULT TPM2_GetRandom(UINT32* bytesRequested,
+ BYTE* randomBytes);
+
+TPM_RC TPM2_CLEAR(void);
+
+TPM_RC TPM2_FlushContext(TPMI_DH_CONTEXT);
+#endif // __TPM2_H__
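
A short usage sketch for the exposed API, mirroring how init.c feeds TPM2_GetRandom() into the PolarSSL entropy pool (the buffer size here is arbitrary):

    BYTE rnd[32];
    UINT32 n = sizeof(rnd);

    if (TPM2_GetRandom(&n, rnd) == TPM_SUCCESS) {
        /* n now holds how many bytes the TPM actually returned (<= 32). */
    }
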
diff --git a/stubdom/vtpmmgr/tpm2_marshal.h b/stubdom/vtpmmgr/tpm2_marshal.h
new file mode 100644
index 0000000..aaa4464
--- /dev/null
+++ b/stubdom/vtpmmgr/tpm2_marshal.h
@@ -0,0 +1,673 @@
+/*
+ * Copyright (c) 2014 Intel Corporation.
+ *
+ * Authors:
+ * Quan Xu <quan.xu at intel.com>
+ *
+ * Copyright (c) 2010-2012 United States Government, as represented by
+ * the Secretary of Defense. All rights reserved.
+ *
+ * based off of the original tools/vtpm_manager code base which is:
+ * Copyright (c) 2005, Intel Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef TPM2_MARSHAL_H
+#define TPM2_MARSHAL_H
+
+#include <stdlib.h>
+#include <mini-os/byteorder.h>
+#include <mini-os/endian.h>
+#include "tcg.h"
+#include "tpm2_types.h"
+#include <assert.h>
+
+#define pack_TPM_BUFFER(ptr, buf, size) pack_BUFFER(ptr, buf, size)
+#define unpack_TPM_BUFFER(ptr, buf, size) unpack_BUFFER(ptr, buf, size)
+
+inline BYTE* pack_BYTE_ARRAY(BYTE* ptr, const BYTE* array, UINT32 size)
+{
+ int i;
+ for (i = 0; i < size; i++)
+ ptr = pack_BYTE(ptr, array[i]);
+ return ptr;
+}
+
+inline BYTE* pack_TPMA_SESSION(BYTE* ptr, const TPMA_SESSION *attr)
+{
+ return pack_BYTE(ptr, (BYTE)(*attr));
+}
+
+inline BYTE* unpack_TPMA_SESSION(BYTE* ptr, TPMA_SESSION *attr)
+{
+ return unpack_BYTE(ptr, (BYTE *)attr);
+}
+
+inline BYTE* pack_TPMI_ALG_HASH(BYTE* ptr, const TPMI_ALG_HASH *hash)
+{
+ return pack_UINT16(ptr, *hash);
+}
+
+inline BYTE* unpack_TPMI_ALG_HASH(BYTE *ptr, TPMI_ALG_HASH *hash)
+{
+ return unpack_UINT16(ptr, hash);
+}
+
+#define pack_TPMA_OBJECT(ptr, t) pack_UINT32(ptr, (UINT32)(*t))
+#define unpack_TPMA_OBJECT(ptr, t) unpack_UINT32(ptr, (UINT32 *)(t))
+#define pack_TPM_RH(ptr, t) pack_UINT32(ptr, (UINT32)(*t))
+#define unpack_TPM_RH(ptr, t) unpack_UINT32(ptr, (UINT32 *)(t))
+#define pack_TPMA_LOCALITY(ptr, locality) pack_BYTE(ptr, (BYTE)*locality)
+#define unpack_TPMA_LOCALITY(ptr, locality) unpack_BYTE(ptr, (BYTE *)locality)
+#define pack_TPM_ST(ptr, tag) pack_UINT16(ptr, *tag)
+#define unpack_TPM_ST(ptr, tag) unpack_UINT16(ptr, tag)
+#define pack_TPM_KEY_BITS(ptr, t) pack_UINT16(ptr, *t)
+#define unpack_TPM_KEY_BITS(ptr, t) unpack_UINT16(ptr, t)
+#define pack_TPMI_AES_KEY_BITS(ptr, t) pack_TPM_KEY_BITS(ptr, t)
+#define unpack_TPMI_AES_KEY_BITS(ptr, t) unpack_TPM_KEY_BITS(ptr, t)
+#define pack_TPMI_RSA_KEY_BITS(ptr, t) pack_TPM_KEY_BITS(ptr, t)
+#define unpack_TPMI_RSA_KEY_BITS(ptr, t) unpack_TPM_KEY_BITS(ptr, t)
+#define pack_TPM_ALG_ID(ptr, id) pack_UINT16(ptr, *id)
+#define unpack_TPM_ALG_ID(ptr, id) unpack_UINT16(ptr, id)
+#define pack_TPM_ALG_SYM(ptr, t) pack_TPM_ALG_ID(ptr, t)
+#define unpack_TPM_ALG_SYM(ptr, t) unpack_TPM_ALG_ID(ptr, t)
+#define pack_TPMI_ALG_ASYM(ptr, asym) pack_TPM_ALG_ID(ptr, asym)
+#define unpack_TPMI_ALG_ASYM(ptr, asym) unpack_TPM_ALG_ID(ptr, asym)
+#define pack_TPMI_ALG_SYM_OBJECT(ptr, t) pack_TPM_ALG_ID(ptr, t)
+#define unpack_TPMI_ALG_SYM_OBJECT(ptr, t) unpack_TPM_ALG_ID(ptr, t)
+#define pack_TPMI_ALG_SYM_MODE(ptr, t) pack_TPM_ALG_ID(ptr, t)
+#define unpack_TPMI_ALG_SYM_MODE(ptr, t) unpack_TPM_ALG_ID(ptr, t)
+#define pack_TPMI_ALG_KDF(ptr, t) pack_TPM_ALG_ID(ptr, t)
+#define unpack_TPMI_ALG_KDF(ptr, t) unpack_TPM_ALG_ID(ptr, t)
+#define pack_TPMI_ALG_PUBLIC(ptr, t) pack_TPM_ALG_ID(ptr, t)
+#define unpack_TPMI_ALG_PUBLIC(ptr, t) unpack_TPM_ALG_ID(ptr, t)
+#define pack_TPM2_HANDLE(ptr, h) pack_UINT32(ptr, *h)
+#define unpack_TPM2_HANDLE(ptr, h) unpack_UINT32(ptr, h)
+#define pack_TPMI_ALG_RSA_SCHEME(ptr, t) pack_TPM_ALG_ID(ptr, t)
+#define unpack_TPMI_ALG_RSA_SCHEME(ptr, t) unpack_TPM_ALG_ID(ptr, t)
+#define pack_TPMI_DH_OBJECT(ptr, o) pack_TPM2_HANDLE(ptr, o)
+#define unpack_TPMI_DH_OBJECT(ptr, o) unpack_TPM2_HANDLE(ptr, o)
+#define pack_TPMI_RH_HIERACHY(ptr, h) pack_TPM2_HANDLE(ptr, h)
+#define unpack_TPMI_RH_HIERACHY(ptr, h) unpack_TPM2_HANDLE(ptr, h)
+#define pack_TPMI_RH_PLATFORM(ptr, p) pack_TPM2_HANDLE(ptr, p)
+#define unpack_TPMI_RH_PLATFORM(ptr, p) unpack_TPM2_HANDLE(ptr, p)
+#define pack_TPMI_RH_OWNER(ptr, o) pack_TPM2_HANDLE(ptr, o)
+#define unpack_TPMI_RH_OWNER(ptr, o) unpack_TPM2_HANDLE(ptr, o)
+#define pack_TPMI_RH_ENDORSEMENT(ptr, e) pack_TPM2_HANDLE(ptr, e)
+#define unpack_TPMI_RH_ENDORSEMENT(ptr, e) unpack_TPM2_HANDLE(ptr, e)
+#define pack_TPMI_RH_LOCKOUT(ptr, l) pack_TPM2_HANDLE(ptr, l)
+#define unpack_TPMI_RH_LOCKOUT(ptr, l) unpack_TPM2_HANDLE(ptr, l)
+
+inline BYTE* pack_TPM2B_DIGEST(BYTE* ptr, const TPM2B_DIGEST *digest)
+{
+ ptr = pack_UINT16(ptr, digest->size);
+ ptr = pack_BUFFER(ptr, digest->buffer, digest->size);
+ return ptr;
+}
+
+inline BYTE* unpack_TPM2B_DIGEST(BYTE* ptr, TPM2B_DIGEST *digest)
+{
+ ptr = unpack_UINT16(ptr, &digest->size);
+ ptr = unpack_BUFFER(ptr, digest->buffer, digest->size);
+ return ptr;
+}
+
+inline BYTE* pack_TPMT_TK_CREATION(BYTE* ptr,const TPMT_TK_CREATION *ticket )
+{
+ ptr = pack_TPM_ST(ptr , &ticket->tag);
+ ptr = pack_TPMI_RH_HIERACHY(ptr , &ticket->hierarchy);
+ ptr = pack_TPM2B_DIGEST(ptr, &ticket->digest);
+ return ptr;
+}
+
+inline BYTE* unpack_TPMT_TK_CREATION(BYTE* ptr, TPMT_TK_CREATION *ticket )
+{
+ ptr = unpack_TPM_ST(ptr, &ticket->tag);
+ ptr = unpack_TPMI_RH_HIERACHY(ptr, &ticket->hierarchy);
+ ptr = unpack_TPM2B_DIGEST(ptr, &ticket->digest);
+ return ptr;
+}
+
+inline BYTE* pack_TPM2B_NAME(BYTE* ptr,const TPM2B_NAME *name )
+{
+ ptr = pack_UINT16(ptr, name->size);
+ ptr = pack_TPM_BUFFER(ptr, name->name, name->size);
+ return ptr;
+}
+
+inline BYTE* unpack_TPM2B_NAME(BYTE* ptr, TPM2B_NAME *name)
+{
+ ptr = unpack_UINT16(ptr, &name->size);
+ ptr = unpack_TPM_BUFFER(ptr, name->name, name->size);
+ return ptr;
+}
+
+inline BYTE* pack_TPM2B_NONCE(BYTE* ptr, const TPM2B_NONCE *nonce)
+{
+ return pack_TPM2B_DIGEST(ptr, (const TPM2B_DIGEST*)nonce);
+}
+
+#define unpack_TPM2B_NONCE(ptr, nonce) unpack_TPM2B_DIGEST(ptr, (TPM2B_DIGEST*)nonce)
+
+inline BYTE* pack_TPM2B_AUTH(BYTE* ptr, const TPM2B_AUTH *auth)
+{
+ return pack_TPM2B_DIGEST(ptr, (const TPM2B_DIGEST*)auth);
+}
+
+#define unpack_TPM2B_AUTH(ptr, auth) unpack_TPM2B_DIGEST(ptr, (TPM2B_DIGEST*)auth)
+
+inline BYTE* pack_TPM2B_DATA(BYTE* ptr, const TPM2B_DATA *data)
+{
+ return pack_TPM2B_DIGEST(ptr, (const TPM2B_DIGEST*)data);
+}
+
+#define unpack_TPM2B_DATA(ptr, data) unpack_TPM2B_DIGEST(ptr, (TPM2B_DIGEST*)data)
+
+inline BYTE* pack_TPM2B_SENSITIVE_DATA(BYTE* ptr, const TPM2B_SENSITIVE_DATA *data)
+{
+ return pack_TPM2B_DIGEST(ptr, (const TPM2B_DIGEST*)data);
+}
+
+#define unpack_TPM2B_SENSITIVE_DATA(ptr, data) unpack_TPM2B_DIGEST(ptr, (TPM2B_DIGEST*)data)
+
+inline BYTE* pack_TPM2B_PUBLIC_KEY_RSA(BYTE* ptr, const TPM2B_PUBLIC_KEY_RSA *rsa)
+{
+ return pack_TPM2B_DIGEST(ptr, (const TPM2B_DIGEST*)rsa);
+}
+
+#define unpack_TPM2B_PUBLIC_KEY_RSA(ptr, rsa) unpack_TPM2B_DIGEST(ptr, (TPM2B_DIGEST*)rsa)
+
+inline BYTE* pack_TPM2B_PRIVATE(BYTE* ptr, const TPM2B_PRIVATE *Private)
+{
+ ptr = pack_UINT16(ptr, Private->size);
+ ptr = pack_TPM_BUFFER(ptr, Private->buffer, Private->size);
+ return ptr;
+}
+
+inline BYTE* unpack_TPM2B_PRIVATE(BYTE* ptr, TPM2B_PRIVATE *Private)
+{
+ ptr = unpack_UINT16(ptr, &Private->size);
+ ptr = unpack_BUFFER(ptr, Private->buffer, Private->size);
+ return ptr;
+}
+
+inline BYTE* pack_TPMS_PCR_SELECTION_ARRAY(BYTE* ptr, const TPMS_PCR_SELECTION *sel, UINT32 count)
+{
+ int i;
+ for (i = 0; i < count; i++) {
+ ptr = pack_TPMI_ALG_HASH(ptr, &sel[i].hash);
+ ptr = pack_BYTE(ptr, sel[i].sizeofSelect);
+ ptr = pack_BUFFER(ptr, sel[i].pcrSelect, sel[i].sizeofSelect);
+ }
+ return ptr;
+}
+
+inline BYTE* unpack_TPMS_PCR_SELECTION_ARRAY(BYTE* ptr, TPMS_PCR_SELECTION *sel, UINT32 count)
+{
+ int i;
+ for (i = 0; i < count; i++) {
+ ptr = unpack_TPMI_ALG_HASH(ptr, &sel[i].hash);
+ ptr = unpack_BYTE(ptr, &sel[i].sizeofSelect);
+ ptr = unpack_BUFFER(ptr, sel[i].pcrSelect, sel[i].sizeofSelect);
+ }
+ return ptr;
+}
+
+inline BYTE* pack_TPML_PCR_SELECTION(BYTE* ptr, const TPML_PCR_SELECTION *sel)
+{
+ ptr = pack_UINT32(ptr, sel->count);
+ ptr = pack_TPMS_PCR_SELECTION_ARRAY(ptr, sel->pcrSelections, sel->count);
+ return ptr;
+}
+
+inline BYTE* unpack_TPML_PCR_SELECTION(BYTE* ptr, TPML_PCR_SELECTION *sel)
+{
+ ptr = unpack_UINT32(ptr, &sel->count);
+ ptr = unpack_TPMS_PCR_SELECTION_ARRAY(ptr, sel->pcrSelections, sel->count);
+ return ptr;
+}
+
+inline BYTE* unpack_TPML_DIGEST(BYTE* ptr,TPML_DIGEST *digest)
+{
+ int i;
+ ptr = unpack_UINT32(ptr, &digest->count);
+ for (i=0;i<digest->count;i++)
+ {
+ ptr = unpack_TPM2B_DIGEST(ptr, &digest->digests[i]);
+ }
+ return ptr;
+}
+
+inline BYTE* pack_TPMS_CREATION_DATA(BYTE* ptr,const TPMS_CREATION_DATA *data)
+{
+ ptr = pack_TPML_PCR_SELECTION(ptr, &data->pcrSelect);
+ ptr = pack_TPM2B_DIGEST(ptr, &data->pcrDigest);
+ ptr = pack_TPMA_LOCALITY(ptr, &data->locality);
+ ptr = pack_TPM_ALG_ID(ptr, &data->parentNameAlg);
+ ptr = pack_TPM2B_NAME(ptr, &data->parentQualifiedName);
+ ptr = pack_TPM2B_DATA(ptr, &data->outsideInfo);
+ return ptr;
+}
+
+inline BYTE* unpack_TPMS_CREATION_DATA(BYTE* ptr, TPMS_CREATION_DATA *data)
+{
+ ptr = unpack_TPML_PCR_SELECTION(ptr, &data->pcrSelect);
+ ptr = unpack_TPM2B_DIGEST(ptr, &data->pcrDigest);
+ ptr = unpack_TPMA_LOCALITY(ptr, &data->locality);
+ ptr = unpack_TPM_ALG_ID(ptr, &data->parentNameAlg);
+ ptr = unpack_TPM2B_NAME(ptr, &data->parentName);
+ ptr = unpack_TPM2B_NAME(ptr, &data->parentQualifiedName);
+ ptr = unpack_TPM2B_DATA(ptr, &data->outsideInfo);
+ return ptr;
+}
+
+inline BYTE* pack_TPM2B_CREATION_DATA(BYTE* ptr, const TPM2B_CREATION_DATA *data )
+{
+ ptr = pack_UINT16(ptr, data->size);
+ ptr = pack_TPMS_CREATION_DATA(ptr, &data->creationData);
+ return ptr;
+}
+
+inline BYTE* unpack_TPM2B_CREATION_DATA(BYTE* ptr, TPM2B_CREATION_DATA * data)
+{
+ ptr = unpack_UINT16(ptr, &data->size);
+ ptr = unpack_TPMS_CREATION_DATA(ptr, &data->creationData);
+ return ptr;
+}
+
+inline BYTE* pack_TPMS_SENSITIVE_CREATE(BYTE* ptr, const TPMS_SENSITIVE_CREATE *create)
+{
+ ptr = pack_TPM2B_AUTH(ptr, &create->userAuth);
+ ptr = pack_TPM2B_SENSITIVE_DATA(ptr, &create->data);
+ return ptr;
+}
+
+inline BYTE* pack_TPM2B_SENSITIVE_CREATE(BYTE* ptr, const TPM2B_SENSITIVE_CREATE *create)
+{
+ BYTE* sizePtr = ptr;
+ ptr += 2;
+ ptr = pack_TPMS_SENSITIVE_CREATE(ptr, &create->sensitive);
+ pack_UINT16(sizePtr, (UINT16)(ptr - sizePtr - 2));
+ return ptr;
+}
+
+inline BYTE* pack_TPMU_SYM_MODE(BYTE* ptr, const TPMU_SYM_MODE *p,
+ const TPMI_ALG_SYM_OBJECT *sel)
+{
+ switch(*sel) {
+ case TPM2_ALG_AES:
+ ptr = pack_TPMI_ALG_SYM_MODE(ptr, &p->aes);
+ break;
+ case TPM2_ALG_SM4:
+ assert(false);
+ break;
+ case TPM2_ALG_NULL:
+ case TPM2_ALG_XOR:
+ break;
+ default:
+ ptr = pack_TPMI_ALG_SYM_MODE(ptr, &p->sym);
+ }
+ return ptr;
+}
+inline BYTE* unpack_TPMU_SYM_MODE(BYTE* ptr, TPMU_SYM_MODE *p,
+ const TPMI_ALG_SYM_OBJECT *sel)
+{
+ switch(*sel) {
+ case TPM2_ALG_AES:
+ ptr = unpack_TPMI_ALG_SYM_MODE(ptr, &p->aes);
+ break;
+ case TPM2_ALG_SM4:
+ assert(false);
+ break;
+ case TPM2_ALG_NULL:
+ case TPM2_ALG_XOR:
+ break;
+ default:
+ ptr = unpack_TPMI_ALG_SYM_MODE(ptr, &p->sym);
+ }
+ return ptr;
+}
+
+inline BYTE* pack_TPMU_SYM_KEY_BITS(BYTE* ptr, const TPMU_SYM_KEY_BITS *p,
+ const TPMI_ALG_SYM_OBJECT *sel)
+{
+ switch(*sel) {
+ case TPM2_ALG_AES:
+ ptr = pack_TPMI_AES_KEY_BITS(ptr, &p->aes);
+ break;
+ case TPM2_ALG_SM4:
+ assert(false);
+ break;
+ case TPM2_ALG_XOR:
+ assert(false);
+ break;
+ case TPM2_ALG_NULL:
+ break;
+ default:
+ ptr = pack_TPM_KEY_BITS(ptr, &p->sym);
+ }
+ return ptr;
+}
+
+inline BYTE* unpack_TPMU_SYM_KEY_BITS(BYTE* ptr, TPMU_SYM_KEY_BITS *p,
+ const TPMI_ALG_SYM_OBJECT *sel)
+{
+ switch(*sel) {
+ case TPM2_ALG_AES:
+ ptr = unpack_TPMI_AES_KEY_BITS(ptr, &p->aes);
+ break;
+ case TPM2_ALG_SM4:
+ assert(false);
+ break;
+ case TPM2_ALG_XOR:
+ assert(false);
+ break;
+ case TPM2_ALG_NULL:
+ break;
+ default:
+ ptr = unpack_TPM_KEY_BITS(ptr, &p->sym);
+ }
+ return ptr;
+}
+
+inline BYTE* pack_TPMT_SYM_DEF_OBJECT(BYTE* ptr, const TPMT_SYM_DEF_OBJECT *p)
+{
+ ptr = pack_TPMI_ALG_SYM_OBJECT(ptr, &p->algorithm);
+ ptr = pack_TPMU_SYM_KEY_BITS(ptr, &p->keyBits, &p->algorithm);
+ ptr = pack_TPMU_SYM_MODE(ptr, &p->mode, &p->algorithm);
+ return ptr;
+}
+
+inline BYTE* unpack_TPMT_SYM_DEF_OBJECT(BYTE *ptr, TPMT_SYM_DEF_OBJECT *p)
+{
+ ptr = unpack_TPMI_ALG_SYM_OBJECT(ptr, &p->algorithm);
+ ptr = unpack_TPMU_SYM_KEY_BITS(ptr, &p->keyBits, &p->algorithm);
+ ptr = unpack_TPMU_SYM_MODE(ptr, &p->mode, &p->algorithm);
+ return ptr;
+}
+
+#define pack_TPMS_SCHEME_OAEP(p, t) pack_TPMI_ALG_HASH(p, &((t)->hashAlg))
+#define unpack_TPMS_SCHEME_OAEP(p, t) unpack_TPMI_ALG_HASH(p, &((t)->hashAlg))
+
+inline BYTE* pack_TPMU_ASYM_SCHEME(BYTE *ptr, const TPMU_ASYM_SCHEME *p,
+ const TPMI_ALG_RSA_SCHEME *s)
+{
+ switch(*s) {
+#ifdef TPM2_ALG_RSASSA
+ case TPM2_ALG_RSASSA:
+ assert(false || "TPM2_ALG_RSASSA");
+ break;
+#endif
+#ifdef TPM2_ALG_OAEP
+ case TPM2_ALG_OAEP:
+ ptr = pack_TPMS_SCHEME_OAEP(ptr, &p->oaep);
+ break;
+#endif
+ case TPM2_ALG_NULL:
+ break;
+ default:
+ assert(false || "DEFAULT");
+ }
+ return ptr;
+}
+
+inline BYTE* unpack_TPMU_ASYM_SCHEME(BYTE *ptr, TPMU_ASYM_SCHEME *p,
+ const TPMI_ALG_RSA_SCHEME *s)
+{
+ switch(*s) {
+ #ifdef TPM2_ALG_RSASSA
+ case TPM2_ALG_RSASSA:
+ printf("not support TPM_ALG_RSASSA\n");
+ assert(false);
+ break;
+ #endif
+ #ifdef TPM2_ALG_OAEP
+ case TPM2_ALG_OAEP:
+ ptr = unpack_TPMS_SCHEME_OAEP(ptr, &p->oaep);
+ break;
+ #endif
+ case TPM2_ALG_NULL:
+ break;
+ default:
+ printf("default TPMI_ALG_RSA_SCHEME 0x%X\n", (UINT32)*s);
+ ptr = unpack_TPMI_ALG_HASH(ptr, &p->anySig.hashAlg);
+ }
+ return ptr;
+}
+
+inline BYTE* pack_TPMT_RSA_SCHEME(BYTE* ptr, const TPMT_RSA_SCHEME *p)
+{
+ ptr = pack_TPMI_ALG_RSA_SCHEME(ptr, &p->scheme);
+ ptr = pack_TPMU_ASYM_SCHEME(ptr, &p->details, &p->scheme);
+ return ptr;
+}
+
+inline BYTE* unpack_TPMT_RSA_SCHEME(BYTE* ptr, TPMT_RSA_SCHEME *p)
+{
+ ptr = unpack_TPMI_ALG_RSA_SCHEME(ptr, &p->scheme);
+ ptr = unpack_TPMU_ASYM_SCHEME(ptr, &p->details, &p->scheme);
+ return ptr;
+}
+
+inline BYTE* pack_TPMT_RSA_DECRYPT(BYTE* ptr, const TPMT_RSA_DECRYPT *p)
+{
+ ptr = pack_TPMI_ALG_RSA_SCHEME(ptr, &p->scheme);
+ ptr = pack_TPMU_ASYM_SCHEME(ptr, &p->details, &p->scheme);
+ return ptr;
+}
+
+inline BYTE* pack_TPMS_RSA_PARMS(BYTE* ptr, const TPMS_RSA_PARMS *p)
+{
+ ptr = pack_TPMT_SYM_DEF_OBJECT(ptr, &p->symmetric);
+ ptr = pack_TPMT_RSA_SCHEME(ptr, &p->scheme);
+ ptr = pack_TPMI_RSA_KEY_BITS(ptr, &p->keyBits);
+ ptr = pack_UINT32(ptr, p->exponent);
+ return ptr;
+}
+
+inline BYTE* unpack_TPMS_RSA_PARMS(BYTE *ptr, TPMS_RSA_PARMS *p)
+{
+ ptr = unpack_TPMT_SYM_DEF_OBJECT(ptr, &p->symmetric);
+ ptr = unpack_TPMT_RSA_SCHEME(ptr, &p->scheme);
+ ptr = unpack_TPMI_RSA_KEY_BITS(ptr, &p->keyBits);
+ ptr = unpack_UINT32(ptr, &p->exponent);
+ return ptr;
+}
+
+inline BYTE* pack_TPMU_PUBLIC_PARMS(BYTE* ptr, const TPMU_PUBLIC_PARMS *param,
+ const TPMI_ALG_PUBLIC *selector)
+{
+ switch(*selector) {
+ case TPM2_ALG_KEYEDHASH:
+ assert(false);
+ case TPM2_ALG_SYMCIPHER:
+ assert(false);
+ case TPM2_ALG_RSA:
+ return pack_TPMS_RSA_PARMS(ptr, &param->rsaDetail);
+ case TPM2_ALG_ECC:
+ assert(false);
+ }
+ assert(false);
+ return NULL;
+}
+
+inline BYTE* unpack_TPMU_PUBLIC_PARMS(BYTE* ptr, TPMU_PUBLIC_PARMS *param,
+ const TPMI_ALG_PUBLIC *selector)
+{
+ switch(*selector) {
+ case TPM2_ALG_KEYEDHASH:
+ assert(false);
+ case TPM2_ALG_SYMCIPHER:
+ assert(false);
+ case TPM2_ALG_RSA:
+ return unpack_TPMS_RSA_PARMS(ptr, &param->rsaDetail);
+ case TPM2_ALG_ECC:
+ assert(false);
+ }
+ assert(false);
+ return NULL;
+}
+
+inline BYTE* pack_TPMS_ECC_POINT(BYTE* ptr, const TPMS_ECC_POINT *point)
+{
+ assert(false);
+ return ptr;
+}
+
+inline BYTE* unpack_TPMS_ECC_POINT(BYTE* ptr, TPMS_ECC_POINT *point)
+{
+ assert(false);
+ return ptr;
+}
+
+inline BYTE* pack_TPMU_PUBLIC_ID(BYTE* ptr, const TPMU_PUBLIC_ID *id,
+ const TPMI_ALG_PUBLIC *selector)
+{
+ switch (*selector) {
+ case TPM2_ALG_KEYEDHASH:
+ return pack_TPM2B_DIGEST(ptr, &id->keyedHash);
+ case TPM2_ALG_SYMCIPHER:
+ return pack_TPM2B_DIGEST(ptr, &id->sym);
+ case TPM2_ALG_RSA:
+ return pack_TPM2B_PUBLIC_KEY_RSA(ptr, &id->rsa);
+ case TPM2_ALG_ECC:
+ return pack_TPMS_ECC_POINT(ptr, &id->ecc);
+ }
+ assert(false);
+ return NULL;
+}
+
+inline BYTE* unpack_TPMU_PUBLIC_ID(BYTE* ptr, TPMU_PUBLIC_ID *id, const TPMI_ALG_PUBLIC *selector)
+{
+ switch (*selector) {
+ case TPM2_ALG_KEYEDHASH:
+ return unpack_TPM2B_DIGEST(ptr, &id->keyedHash);
+ case TPM2_ALG_SYMCIPHER:
+ return unpack_TPM2B_DIGEST(ptr, &id->sym);
+ case TPM2_ALG_RSA:
+ return unpack_TPM2B_PUBLIC_KEY_RSA(ptr, &id->rsa);
+ case TPM2_ALG_ECC:
+ return unpack_TPMS_ECC_POINT(ptr, &id->ecc);
+ }
+ assert(false);
+ return NULL;
+}
+
+inline BYTE* pack_TPMT_PUBLIC(BYTE* ptr, const TPMT_PUBLIC *public)
+{
+ ptr = pack_TPMI_ALG_PUBLIC(ptr, &public->type);
+ ptr = pack_TPMI_ALG_HASH(ptr, &public->nameAlg);
+ ptr = pack_TPMA_OBJECT(ptr, &public->objectAttributes);
+ ptr = pack_TPM2B_DIGEST(ptr, &public->authPolicy);
+ ptr = pack_TPMU_PUBLIC_PARMS(ptr, &public->parameters, &public->type);
+ ptr = pack_TPMU_PUBLIC_ID(ptr, &public->unique, &public->type);
+ return ptr;
+}
+
+inline BYTE* unpack_TPMT_PUBLIC(BYTE* ptr, TPMT_PUBLIC *public)
+{
+ ptr = unpack_TPMI_ALG_PUBLIC(ptr, &public->type);
+ ptr = unpack_TPMI_ALG_HASH(ptr, &public->nameAlg);
+ ptr = unpack_TPMA_OBJECT(ptr, &public->objectAttributes);
+ ptr = unpack_TPM2B_DIGEST(ptr, &public->authPolicy);
+ ptr = unpack_TPMU_PUBLIC_PARMS(ptr, &public->parameters, &public->type);
+ ptr = unpack_TPMU_PUBLIC_ID(ptr, &public->unique, &public->type);
+ return ptr;
+}
+
+inline BYTE* pack_TPM2B_PUBLIC(BYTE* ptr, const TPM2B_PUBLIC *public)
+{
+ BYTE *sizePtr = ptr;
+ ptr += 2;
+ ptr = pack_TPMT_PUBLIC(ptr, &public->publicArea);
+ pack_UINT16(sizePtr, (UINT16)(ptr - sizePtr - 2));
+ return ptr;
+}
+
+inline BYTE* unpack_TPM2B_PUBLIC(BYTE* ptr, TPM2B_PUBLIC *public)
+{
+ ptr = unpack_UINT16(ptr, &public->size);
+ ptr = unpack_TPMT_PUBLIC(ptr, &public->publicArea);
+ return ptr;
+}
+
+inline BYTE* pack_TPMS_PCR_SELECTION(BYTE* ptr, const TPMS_PCR_SELECTION *selection)
+{
+ ptr = pack_TPMI_ALG_HASH(ptr, &selection->hash);
+ ptr = pack_BYTE(ptr, selection->sizeofSelect);
+ ptr = pack_BYTE_ARRAY(ptr, selection->pcrSelect, selection->sizeofSelect);
+ return ptr;
+}
+
+inline BYTE* pack_TPMS_PCR_SELECTION_Array(BYTE* ptr, const TPMS_PCR_SELECTION *selections,
+ const UINT32 cnt)
+{
+ int i;
+ for (i = 0; i < cnt; i++)
+ ptr = pack_TPMS_PCR_SELECTION(ptr, selections + i);
+ return ptr;
+}
+
+inline BYTE* pack_TPM_AuthArea(BYTE* ptr, const TPM_AuthArea *auth)
+{
+ BYTE* sizePtr = ptr;
+ ptr += sizeof(UINT32);
+ ptr = pack_TPM_RH(ptr, &auth->sessionHandle);
+ ptr = pack_TPM2B_NONCE(ptr, &auth->nonce);
+ ptr = pack_TPMA_SESSION(ptr, &auth->sessionAttributes);
+ ptr = pack_TPM2B_AUTH(ptr, &auth->auth);
+ pack_UINT32(sizePtr, ptr - sizePtr - sizeof(UINT32));
+ return ptr;
+}
+
+inline BYTE* unpack_TPM_AuthArea(BYTE* ptr, TPM_AuthArea *auth)
+{
+ ptr = unpack_UINT32(ptr, &auth->size);
+ ptr = unpack_TPM_RH(ptr, &auth->sessionHandle);
+ ptr = unpack_TPM2B_NONCE(ptr, &auth->nonce);
+ ptr = unpack_TPMA_SESSION(ptr, &auth->sessionAttributes);
+ ptr = unpack_TPM2B_AUTH(ptr, &auth->auth);
+ return ptr;
+}
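+
+/*
+ * Note on the size-prefixed packers above: pack_TPM2B_PUBLIC and
+ * pack_TPM_AuthArea cannot know the payload length up front, so both
+ * reserve room for the size field, marshal the body, and then back-patch
+ * the size as (end - start - sizeof(size field)). New TPM2B-style packers
+ * added to this file should follow the same pattern.
+ */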
+
+inline BYTE* pack_TPM2_RSA_KEY(BYTE* ptr, const TPM2_RSA_KEY *key)
+{
+ ptr = pack_TPM2B_PRIVATE(ptr, &key->Private);
+ ptr = pack_TPM2B_PUBLIC(ptr, &key->Public);
+ return ptr;
+}
+
+inline BYTE* unpack_TPM2_RSA_KEY(BYTE* ptr, TPM2_RSA_KEY *key)
+{
+ ptr = unpack_TPM2B_PRIVATE(ptr, &key->Private);
+ ptr = unpack_TPM2B_PUBLIC(ptr, &key->Public);
+ return ptr;
+}
+#endif
diff --git a/stubdom/vtpmmgr/tpm2_types.h b/stubdom/vtpmmgr/tpm2_types.h
new file mode 100644
index 0000000..a07d8f3
--- /dev/null
+++ b/stubdom/vtpmmgr/tpm2_types.h
@@ -0,0 +1,969 @@
+#ifndef __TPM2_TYPES_H__
+#define __TPM2_TYPES_H__
+
+#include <stdlib.h>
+#include <stdint.h>
+#include "common_types.h"
+
+// "implementation.h"
+// Table 212 -- Logic Values
+#define YES 1
+#define NO 0
+#ifndef TRUE
+#define TRUE 1
+#endif
+#ifndef FALSE
+#define FALSE 0
+#endif
+#ifndef true
+#define true 1
+#endif
+#ifndef false
+#define false 0
+#endif
+#define SET 1
+#define CLEAR 0
+
+
+// Table 214 -- Implemented Algorithms
+#define ALG_RSA YES // 1
+#define ALG_DES NO // 0
+#define ALG__3DES NO // 0
+#define ALG_SHA1 YES // 1
+#define ALG_HMAC YES // 1
+#define ALG_AES YES // 1
+#define ALG_MGF1 YES // 1
+#define ALG_XOR YES // 1
+#define ALG_KEYEDHASH YES // 1
+#define ALG_SHA256 YES // 1
+#define ALG_SHA384 YES // 1
+#define ALG_SHA512 YES // 1
+#define ALG_WHIRLPOOL512 YES // 1
+#define ALG_SM3_256 YES // 1
+#define ALG_SM4 YES // 1
+#define ALG_RSASSA YES // 1
+#define ALG_RSAES YES // 1
+#define ALG_RSAPSS YES // 1
+#define ALG_OAEP YES // 1
+#define ALG_ECC YES // 1
+#define ALG_CFB YES // 1
+#define ALG_ECDH YES // 1
+#define ALG_ECDSA YES // 1
+#define ALG_ECDAA YES // 1
+#define ALG_SM2 YES // 1
+#define ALG_ECSCHNORR YES // 1
+#define ALG_SYMCIPHER YES // 1
+#define ALG_KDF1_SP800_56a YES // 1
+#define ALG_KDF2 NO // 0
+#define ALG_KDF1_SP800_108 YES // 1
+#define ALG_CTR YES // 1
+#define ALG_OFB YES // 1
+#define ALG_CBC YES // 1
+
+#define HASH_COUNT (ALG_SHA1+ALG_SHA256+ALG_SHA384+ALG_SHA512+ALG_WHIRLPOOL512+ALG_SM3_256)
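+// With the settings above all six hash flags are YES, so HASH_COUNT == 6.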
+
+// Table 216 -- RSA Algorithm Constants
+#define RSA_KEY_SIZES_BITS 2048 // {1024,2048}
+#define MAX_RSA_KEY_BITS 2048
+#define MAX_RSA_KEY_BYTES ((MAX_RSA_KEY_BITS + 7) / 8) // 256
+
+// Table 218 -- AES Algorithm Constants
+#define AES_KEY_SIZES_BITS 128
+#define MAX_AES_KEY_BITS 128
+#define MAX_AES_BLOCK_SIZE_BYTES 16
+#define MAX_AES_KEY_BYTES ((MAX_AES_KEY_BITS + 7) / 8) // 16
+
+
+// Table 220 -- Symmetric Algorithm Constants
+#define MAX_SYM_KEY_BITS MAX_AES_KEY_BITS // 128
+#define MAX_SYM_KEY_BYTES MAX_AES_KEY_BYTES // 16
+#define MAX_SYM_BLOCK_SIZE MAX_AES_BLOCK_SIZE_BYTES // 16
+
+#define MAX_SYM_DATA 128
+#define MAX_ECC_KEY_BITS 256
+#define MAX_ECC_KEY_BYTES ((MAX_ECC_KEY_BITS + 7) / 8)
+
+// TPM2 command code
+
+typedef UINT32 TPM_CC;
+#define TPM_CC_FIRST (TPM_CC)(0x0000011F)
+#define TPM_CC_PP_FIRST (TPM_CC)(0x0000011F)
+#define TPM_CC_NV_UndefineSpaceSpecial (TPM_CC)(0x0000011F)
+#define TPM_CC_EvictControl (TPM_CC)(0x00000120)
+#define TPM_CC_HierarchyControl (TPM_CC)(0x00000121)
+#define TPM_CC_NV_UndefineSpace (TPM_CC)(0x00000122)
+#define TPM_CC_ChangeEPS (TPM_CC)(0x00000124)
+#define TPM_CC_ChangePPS (TPM_CC)(0x00000125)
+#define TPM_CC_Clear (TPM_CC)(0x00000126)
+#define TPM_CC_ClearControl (TPM_CC)(0x00000127)
+#define TPM_CC_ClockSet (TPM_CC)(0x00000128)
+#define TPM_CC_HierarchyChangeAuth (TPM_CC)(0x00000129)
+#define TPM_CC_NV_DefineSpace (TPM_CC)(0x0000012A)
+#define TPM_CC_PCR_Allocate (TPM_CC)(0x0000012B)
+#define TPM_CC_PCR_SetAuthPolicy (TPM_CC)(0x0000012C)
+#define TPM_CC_PP_Commands (TPM_CC)(0x0000012D)
+#define TPM_CC_SetPrimaryPolicy (TPM_CC)(0x0000012E)
+#define TPM_CC_FieldUpgradeStart (TPM_CC)(0x0000012F)
+#define TPM_CC_ClockRateAdjust (TPM_CC)(0x00000130)
+#define TPM_CC_CreatePrimary (TPM_CC)(0x00000131)
+#define TPM_CC_NV_GlobalWriteLock (TPM_CC)(0x00000132)
+#define TPM_CC_PP_LAST (TPM_CC)(0x00000132)
+#define TPM_CC_GetCommandAuditDigest (TPM_CC)(0x00000133)
+#define TPM_CC_NV_Increment (TPM_CC)(0x00000134)
+#define TPM_CC_NV_SetBits (TPM_CC)(0x00000135)
+#define TPM_CC_NV_Extend (TPM_CC)(0x00000136)
+#define TPM_CC_NV_Write (TPM_CC)(0x00000137)
+#define TPM_CC_NV_WriteLock (TPM_CC)(0x00000138)
+#define TPM_CC_DictionaryAttackLockReset (TPM_CC)(0x00000139)
+#define TPM_CC_DictionaryAttackParameters (TPM_CC)(0x0000013A)
+#define TPM_CC_NV_ChangeAuth (TPM_CC)(0x0000013B)
+#define TPM_CC_PCR_Event (TPM_CC)(0x0000013C)
+#define TPM_CC_PCR_Reset (TPM_CC)(0x0000013D)
+#define TPM_CC_SequenceComplete (TPM_CC)(0x0000013E)
+#define TPM_CC_SetAlgorithmSet (TPM_CC)(0x0000013F)
+#define TPM_CC_SetCommandCodeAuditStatus (TPM_CC)(0x00000140)
+#define TPM_CC_FieldUpgradeData (TPM_CC)(0x00000141)
+#define TPM_CC_IncrementalSelfTest (TPM_CC)(0x00000142)
+#define TPM_CC_SelfTest (TPM_CC)(0x00000143)
+#define TPM_CC_Startup (TPM_CC)(0x00000144)
+#define TPM_CC_Shutdown (TPM_CC)(0x00000145)
+#define TPM_CC_StirRandom (TPM_CC)(0x00000146)
+#define TPM_CC_ActivateCredential (TPM_CC)(0x00000147)
+#define TPM_CC_Certify (TPM_CC)(0x00000148)
+#define TPM_CC_PolicyNV (TPM_CC)(0x00000149)
+#define TPM_CC_CertifyCreation (TPM_CC)(0x0000014A)
+#define TPM_CC_Duplicate (TPM_CC)(0x0000014B)
+#define TPM_CC_GetTime (TPM_CC)(0x0000014C)
+#define TPM_CC_GetSessionAuditDigest (TPM_CC)(0x0000014D)
+#define TPM_CC_NV_Read (TPM_CC)(0x0000014E)
+#define TPM_CC_NV_ReadLock (TPM_CC)(0x0000014F)
+#define TPM_CC_ObjectChangeAuth (TPM_CC)(0x00000150)
+#define TPM_CC_PolicySecret (TPM_CC)(0x00000151)
+#define TPM_CC_Rewrap (TPM_CC)(0x00000152)
+#define TPM_CC_Create (TPM_CC)(0x00000153)
+#define TPM_CC_ECDH_ZGen (TPM_CC)(0x00000154)
+#define TPM_CC_HMAC (TPM_CC)(0x00000155)
+#define TPM_CC_Import (TPM_CC)(0x00000156)
+#define TPM_CC_Load (TPM_CC)(0x00000157)
+#define TPM_CC_Quote (TPM_CC)(0x00000158)
+#define TPM_CC_RSA_Decrypt (TPM_CC)(0x00000159)
+#define TPM_CC_HMAC_Start (TPM_CC)(0x0000015B)
+#define TPM_CC_SequenceUpdate (TPM_CC)(0x0000015C)
+#define TPM_CC_Sign (TPM_CC)(0x0000015D)
+#define TPM_CC_Unseal (TPM_CC)(0x0000015E)
+#define TPM_CC_PolicySigned (TPM_CC)(0x00000160)
+#define TPM_CC_ContextLoad (TPM_CC)(0x00000161)
+#define TPM_CC_ContextSave (TPM_CC)(0x00000162)
+#define TPM_CC_ECDH_KeyGen (TPM_CC)(0x00000163)
+#define TPM_CC_EncryptDecrypt (TPM_CC)(0x00000164)
+#define TPM_CC_FlushContext (TPM_CC)(0x00000165)
+#define TPM_CC_LoadExternal (TPM_CC)(0x00000167)
+#define TPM_CC_MakeCredential (TPM_CC)(0x00000168)
+#define TPM_CC_NV_ReadPublic (TPM_CC)(0x00000169)
+#define TPM_CC_PolicyAuthorize (TPM_CC)(0x0000016A)
+#define TPM_CC_PolicyAuthValue (TPM_CC)(0x0000016B)
+#define TPM_CC_PolicyCommandCode (TPM_CC)(0x0000016C)
+#define TPM_CC_PolicyCounterTimer (TPM_CC)(0x0000016D)
+#define TPM_CC_PolicyCpHash (TPM_CC)(0x0000016E)
+#define TPM_CC_PolicyLocality (TPM_CC)(0x0000016F)
+#define TPM_CC_PolicyNameHash (TPM_CC)(0x00000170)
+#define TPM_CC_PolicyOR (TPM_CC)(0x00000171)
+#define TPM_CC_PolicyTicket (TPM_CC)(0x00000172)
+#define TPM_CC_ReadPublic (TPM_CC)(0x00000173)
+#define TPM_CC_RSA_Encrypt (TPM_CC)(0x00000174)
+#define TPM_CC_StartAuthSession (TPM_CC)(0x00000176)
+#define TPM_CC_VerifySignature (TPM_CC)(0x00000177)
+#define TPM_CC_ECC_Parameters (TPM_CC)(0x00000178)
+#define TPM_CC_FirmwareRead (TPM_CC)(0x00000179)
+#define TPM_CC_GetCapability (TPM_CC)(0x0000017A)
+#define TPM_CC_GetRandom (TPM_CC)(0x0000017B)
+#define TPM_CC_GetTestResult (TPM_CC)(0x0000017C)
+#define TPM_CC_Hash (TPM_CC)(0x0000017D)
+#define TPM_CC_PCR_Read (TPM_CC)(0x0000017E)
+#define TPM_CC_PolicyPCR (TPM_CC)(0x0000017F)
+#define TPM_CC_PolicyRestart (TPM_CC)(0x00000180)
+#define TPM_CC_ReadClock (TPM_CC)(0x00000181)
+#define TPM_CC_PCR_Extend (TPM_CC)(0x00000182)
+#define TPM_CC_PCR_SetAuthValue (TPM_CC)(0x00000183)
+#define TPM_CC_NV_Certify (TPM_CC)(0x00000184)
+#define TPM_CC_EventSequenceComplete (TPM_CC)(0x00000185)
+#define TPM_CC_HashSequenceStart (TPM_CC)(0x00000186)
+#define TPM_CC_PolicyPhysicalPresence (TPM_CC)(0x00000187)
+#define TPM_CC_PolicyDuplicationSelect (TPM_CC)(0x00000188)
+#define TPM_CC_PolicyGetDigest (TPM_CC)(0x00000189)
+#define TPM_CC_TestParms (TPM_CC)(0x0000018A)
+#define TPM_CC_Commit (TPM_CC)(0x0000018B)
+#define TPM_CC_PolicyPassword (TPM_CC)(0x0000018C)
+#define TPM_CC_SM2_ZGen (TPM_CC)(0x0000018D)
+#define TPM_CC_LAST (TPM_CC)(0x0000018D)
+
+
+//TPM_RC
+typedef UINT32 TPM_RC;
+
+// TPM_ST Constants
+typedef UINT16 TPM_ST;
+#define TPM_ST_NULL (TPM_ST)(0x8000)
+#define TPM_ST_NO_SESSIONS (TPM_ST)(0x8001)
+#define TPM_ST_SESSIONS (TPM_ST)(0x8002)
+
+
+// TPM Handle types
+typedef UINT32 TPM2_HANDLE;
+typedef UINT8 TPM_HT;
+
+
+// TPM_RH Constants
+typedef UINT32 TPM_RH;
+
+#define TPM_RH_FIRST (TPM_RH)(0x40000000)
+#define TPM_RH_SRK (TPM_RH)(0x40000000)
+#define TPM_RH_OWNER (TPM_RH)(0x40000001)
+#define TPM_RS_PW (TPM_RH)(0x40000009)
+#define TPM_RH_LOCKOUT (TPM_RH)(0x4000000A)
+#define TPM_RH_ENDORSEMENT (TPM_RH)(0x4000000B)
+#define TPM_RH_PLATFORM (TPM_RH)(0x4000000C)
+#define TPM_RH_LAST (TPM_RH)(0x4000000C)
+
+// Table 4 -- DocumentationClarity Types <I/O>
+typedef UINT32 TPM_MODIFIER_INDICATOR;
+typedef UINT32 TPM_SESSION_OFFSET;
+typedef UINT16 TPM_KEY_SIZE;
+typedef UINT16 TPM_KEY_BITS;
+typedef UINT64 TPM_SYSTEM_ADDRESS;
+typedef UINT32 TPM_SPEC;
+
+// Table 29 -- TPMA_ALGORITHM Bits <I/O>
+typedef struct {
+ unsigned int asymmetric:1;
+ unsigned int symmetric:1;
+ unsigned int hash:1;
+ unsigned int object:1;
+ unsigned int reserved5:4;
+ unsigned int signing:1;
+ unsigned int encrypting:1;
+ unsigned int method:1;
+ unsigned int reserved9:21;
+} TPMA_ALGORITHM;
+
+typedef UINT32 TPMA_OBJECT;
+typedef BYTE TPMA_SESSION;
+typedef BYTE TPMA_LOCALITY;
+
+// Table 37 -- TPMI_YES_NO Type <I/O>
+typedef BYTE TPMI_YES_NO;
+
+// Table 38 -- TPMI_DH_OBJECT Type <I/O>
+typedef TPM2_HANDLE TPMI_DH_OBJECT;
+
+// Table 39 -- TPMI_DH_PERSISTENT Type <I/O>
+typedef TPM2_HANDLE TPMI_DH_PERSISTENT;
+
+// Table 42 -- TPMI_SH_AUTH_SESSION Type <I/O>
+typedef TPM2_HANDLE TPMI_SH_AUTH_SESSION;
+
+// Table 40 -- TPMI_DH_ENTITY Type <I>
+typedef TPM2_HANDLE TPMI_DH_ENTITY;
+
+// Table 45 -- TPMI_DH_CONTEXT Type <I/O>
+typedef TPM2_HANDLE TPMI_DH_CONTEXT;
+
+// Table 46 -- TPMI_RH_HIERARCHY Type <I/O>
+typedef TPM2_HANDLE TPMI_RH_HIERARCHY;
+
+// Table 47 -- TPM2I_RH_HIERARCHY_AUTH Type <I>
+typedef TPM2_HANDLE TPM2I_RH_HIERARCHY_AUTH;
+
+// Table 48 -- TPMI_RH_PLATFORM Type <I>
+typedef TPM2_HANDLE TPMI_RH_PLATFORM;
+
+// Table 49 -- TPMI_RH_OWNER Type <I>
+typedef TPM2_HANDLE TPMI_RH_OWNER;
+
+// Table 50 -- TPMI_RH_ENDORSEMENT Type <I>
+typedef TPM2_HANDLE TPMI_RH_ENDORSEMENT;
+
+// Table 51 -- TPMI_RH_PROVISION Type <I>
+typedef TPM2_HANDLE TPMI_RH_PROVISION;
+
+// Table 52 -- TPMI_RH_CLEAR Type <I>
+typedef TPM2_HANDLE TPMI_RH_CLEAR;
+
+// Table 54 -- TPMI_RH_LOCKOUT Type <I>
+typedef TPM2_HANDLE TPMI_RH_LOCKOUT;
+
+// Table 7 -- TPM_ALG_ID
+typedef UINT16 TPM_ALG_ID;
+
+#define TPM2_ALG_ERROR (TPM_ALG_ID)(0x0000) // a: ; D:
+#define TPM2_ALG_FIRST (TPM_ALG_ID)(0x0001) // a: ; D:
+#if ALG_RSA == YES || ALG_ALL == YES
+#define TPM2_ALG_RSA (TPM_ALG_ID)(0x0001) // a: A O; D:
+#endif
+#if ALG_DES == YES || ALG_ALL == YES
+#define TPM2_ALG_DES (TPM_ALG_ID)(0x0002) // a: S; D:
+#endif
+#define TPM2_ALG_SHA1 (TPM_ALG_ID)(0x0004) // a: H; D:
+#if ALG_HMAC == YES || ALG_ALL == YES
+#define TPM2_ALG_HMAC (TPM_ALG_ID)(0x0005) // a: H X; D:
+#endif
+#if ALG_AES == YES || ALG_ALL == YES
+#define TPM2_ALG_AES (TPM_ALG_ID)(0x0006) // a: S; D:
+#endif
+#if ALG_XOR == YES || ALG_ALL == YES
+#define TPM2_ALG_XOR (TPM_ALG_ID)(0x000A) // a: H S; D:
+#endif
+#if ALG_MGF1 == YES || ALG_ALL == YES
+#define TPM2_ALG_MGF1 (TPM_ALG_ID)(0x0007) // a: H M; D:
+#endif
+#if ALG_KEYEDHASH == YES || ALG_ALL == YES
+#define TPM2_ALG_KEYEDHASH (TPM_ALG_ID)(0x0008) // a: H E X O; D:
+#endif
+#if ALG_SHA256 == YES || ALG_ALL == YES
+#define TPM2_ALG_SHA256 (TPM_ALG_ID)(0x000B) // a: H; D:
+#endif
+#define TPM2_ALG_NULL (TPM_ALG_ID)(0x0010) // a: ; D:
+#if ALG_OAEP == YES || ALG_ALL == YES
+#define TPM2_ALG_OAEP (TPM_ALG_ID)(0x0017) // a: A E; D: RSA
+#endif
+#if ALG_ECC == YES || ALG_ALL == YES
+#define TPM2_ALG_ECC (TPM_ALG_ID)(0x0023) // a: A O; D:
+#endif
+#if ALG_SM4 == YES || ALG_ALL == YES
+#define TPM2_ALG_SM4 (TPM_ALG_ID)(0x0013) // a: S; D:
+#endif
+#if ALG_SYMCIPHER == YES || ALG_ALL == YES
+#define TPM2_ALG_SYMCIPHER (TPM_ALG_ID)(0x0025) // a: O; D:
+#endif
+#if ALG_CFB == YES || ALG_ALL == YES
+#define TPM2_ALG_CFB (TPM_ALG_ID)(0x0043) // a: S E; D:
+#endif
+#define TPM2_ALG_LAST (TPM_ALG_ID)(0x0044)
+
+#define SHA1_DIGEST_SIZE 20
+#define SHA1_BLOCK_SIZE 64
+#define SHA256_DIGEST_SIZE 32
+#define SHA256_BLOCK_SIZE 64
+
+// Table 57 -- TPMI_ALG_ASYM Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_ASYM;
+
+// Table 56 -- TPMI_ALG_HASH Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_HASH;
+
+// Table 58 -- TPMI_ALG_SYM Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_SYM;
+
+// Table 59 -- TPMI_ALG_SYM_OBJECT Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_SYM_OBJECT;
+
+// Table 60 -- TPMI_ALG_SYM_MODE Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_SYM_MODE;
+
+// Table 61 -- TPMI_ALG_KDF Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_KDF;
+
+// Table 62 -- TPMI_ALG_SIG_SCHEME Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_SIG_SCHEME;
+
+// Table 65 -- TPMU_HA Union <I/O,S>
+typedef union {
+#ifdef TPM2_ALG_SHA1
+ BYTE sha1[SHA1_DIGEST_SIZE];
+#endif
+#ifdef TPM2_ALG_SHA256
+ BYTE sha256[SHA256_DIGEST_SIZE];
+#endif
+#ifdef TPM2_ALG_SM3_256
+ BYTE sm3_256[SM3_256_DIGEST_SIZE];
+#endif
+#ifdef TPM2_ALG_SHA384
+ BYTE sha384[SHA384_DIGEST_SIZE];
+#endif
+#ifdef TPM2_ALG_SHA512
+ BYTE sha512[SHA512_DIGEST_SIZE];
+#endif
+#ifdef TPM2_ALG_WHIRLPOOL512
+ BYTE whirlpool[WHIRLPOOL512_DIGEST_SIZE];
+#endif
+
+} TPMU_HA;
+
+// Table 67 -- TPM2B_DIGEST Structure <I/O>
+typedef struct {
+ UINT16 size;
+ BYTE buffer[sizeof(TPMU_HA)];
+} TPM2B_DIGEST;
+
+// Table 69 -- TPM2B_NONCE Types <I/O>
+typedef TPM2B_DIGEST TPM2B_NONCE;
+
+typedef TPM2B_DIGEST TPM2B_DATA;
+
+// Table 70 -- TPM2B_AUTH Types <I/O>
+typedef TPM2B_DIGEST TPM2B_AUTH;
+
+// Table 71 -- TPM2B_OPERAND Types <I/O>
+typedef TPM2B_DIGEST TPM2B_OPERAND;
+
+// Table 66 -- TPMT_HA Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+ TPMU_HA digest;
+} TPMT_HA;
+
+//Table 80 -- TPM2B_NAME Structure
+typedef struct {
+ UINT16 size;
+ BYTE name[sizeof(TPMT_HA)];
+} TPM2B_NAME;
+
+#define IMPLEMENTATION_PCR 24
+#define PLATFORM_PCR 24
+#define PCR_SELECT_MAX ((IMPLEMENTATION_PCR+7)/8)
+#define PCR_SELECT_NUM(x) ((uint8_t)((x) / 8))
+#define PCR_SELECT_VALUE(x) ((uint8_t)(0x1 << ((x) % 8)))
+
+//Table 79 -- TPMS_PCR_SELECT Structure <I/O>
+typedef struct {
+ UINT8 sizeofSelect;
+ BYTE pcrSelect[PCR_SELECT_MAX];
+} TPMS_PCR_SELECT;
+
+// Table 80 -- TPMS_PCR_SELECTION Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hash;
+ UINT8 sizeofSelect;
+ BYTE pcrSelect[PCR_SELECT_MAX];
+} TPMS_PCR_SELECTION;
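+
+/*
+ * Example (sketch): selecting a single PCR with the helper macros above,
+ * e.g. PCR 17 in a SHA1 bank:
+ *
+ *   TPMS_PCR_SELECTION sel = { .hash = TPM2_ALG_SHA1,
+ *                              .sizeofSelect = PCR_SELECT_MAX };
+ *   sel.pcrSelect[PCR_SELECT_NUM(17)] |= PCR_SELECT_VALUE(17);
+ */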
+
+// Table 83 -- TPMT_TK_CREATION Structure <I/O>
+typedef struct {
+ TPM_ST tag;
+ TPMI_RH_HIERARCHY hierarchy;
+ TPM2B_DIGEST digest;
+} TPMT_TK_CREATION;
+
+// Table 96 -- Definition of TPML_DIGEST Structure <I/O>
+typedef struct {
+ UINT32 count;
+ TPM2B_DIGEST digests[8];
+} TPML_DIGEST;
+
+// Table 97 -- TPML_PCR_SELECTION Structure <I/O>
+typedef struct {
+ UINT32 count;
+ TPMS_PCR_SELECTION pcrSelections[HASH_COUNT];
+} TPML_PCR_SELECTION;
+
+// Table 119 -- TPMI_AES_KEY_BITS Type <I/O>
+typedef TPM_KEY_BITS TPMI_AES_KEY_BITS;
+
+// Table 120 -- TPMI_SM4_KEY_BITS Type <I/O>
+typedef TPM_KEY_BITS TPMI_SM4_KEY_BITS;
+
+// Table 121 -- TPMU_SYM_KEY_BITS Union <I/O>
+typedef union {
+#ifdef TPM2_ALG_AES
+ TPMI_AES_KEY_BITS aes;
+#endif
+#ifdef TPM2_ALG_SM4
+ TPMI_SM4_KEY_BITS SM4;
+#endif
+ TPM_KEY_BITS sym;
+#ifdef TPM2_ALG_XOR
+ TPMI_ALG_HASH xor;
+#endif
+
+} TPMU_SYM_KEY_BITS;
+
+// Table 122 -- TPMU_SYM_MODE Union <I/O>
+typedef union {
+#ifdef TPM2_ALG_AES
+ TPMI_ALG_SYM_MODE aes;
+#endif
+#ifdef TPM2_ALG_SM4
+ TPMI_ALG_SYM_MODE SM4;
+#endif
+ TPMI_ALG_SYM_MODE sym;
+} TPMU_SYM_MODE;
+
+// Table 124 -- TPMT_SYM_DEF Structure <I/O>
+typedef struct {
+ TPMI_ALG_SYM algorithm;
+ TPMU_SYM_KEY_BITS keyBits;
+ TPMU_SYM_MODE mode;
+} TPMT_SYM_DEF;
+
+// Table 125 -- TPMT_SYM_DEF_OBJECT Structure <I/O>
+typedef struct {
+ TPMI_ALG_SYM_OBJECT algorithm;
+ TPMU_SYM_KEY_BITS keyBits;
+ TPMU_SYM_MODE mode;
+} TPMT_SYM_DEF_OBJECT;
+
+// Table 126 -- TPM2B_SYM_KEY Structure <I/O>
+typedef struct {
+ UINT16 size;
+ BYTE buffer[MAX_SYM_KEY_BYTES];
+} TPM2B_SYM_KEY;
+
+// Table 127 -- TPMS_SYMCIPHER_PARMS Structure <I/O>
+typedef struct {
+ TPMT_SYM_DEF_OBJECT sym;
+} TPMS_SYMCIPHER_PARMS;
+
+// Table 128 -- TPM2B_SENSITIVE_DATA Structure <I/O>
+typedef struct {
+ UINT16 size;
+ BYTE buffer[MAX_SYM_DATA];
+} TPM2B_SENSITIVE_DATA;
+
+// Table 129 -- TPMS_SENSITIVE_CREATE Structure <I>
+typedef struct {
+ TPM2B_AUTH userAuth;
+ TPM2B_SENSITIVE_DATA data;
+} TPMS_SENSITIVE_CREATE;
+
+// Table 130 -- TPM2B_SENSITIVE_CREATE Structure <I,S>
+typedef struct {
+ UINT16 size;
+ TPMS_SENSITIVE_CREATE sensitive;
+} TPM2B_SENSITIVE_CREATE;
+
+// Table 131 -- TPMS_SCHEME_SIGHASH Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+} TPMS_SCHEME_SIGHASH;
+
+// Table 132 -- TPMI_ALG_KEYEDHASH_SCHEME Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_KEYEDHASH_SCHEME;
+
+// Table 133 -- HMAC_SIG_SCHEME Types <I/O>
+typedef TPMS_SCHEME_SIGHASH TPMS_SCHEME_HMAC;
+
+// Table 134 -- TPMS_SCHEME_XOR Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+ TPMI_ALG_KDF kdf;
+} TPMS_SCHEME_XOR;
+
+// Table 135 -- TPMU_SCHEME_KEYEDHASH Union <I/O,S>
+typedef union {
+#ifdef TPM2_ALG_HMAC
+ TPMS_SCHEME_HMAC hmac;
+#endif
+#ifdef TPM2_ALG_XOR
+ TPMS_SCHEME_XOR xor;
+#endif
+
+} TPMU_SCHEME_KEYEDHASH;
+
+// Table 136 -- TPMT_KEYEDHASH_SCHEME Structure <I/O>
+typedef struct {
+ TPMI_ALG_KEYEDHASH_SCHEME scheme;
+ TPMU_SCHEME_KEYEDHASH details;
+} TPMT_KEYEDHASH_SCHEME;
+
+// Table 137 -- RSA_SIG_SCHEMES Types <I/O>
+typedef TPMS_SCHEME_SIGHASH TPMS_SCHEME_RSASSA;
+typedef TPMS_SCHEME_SIGHASH TPMS_SCHEME_RSAPSS;
+
+// Table 138 -- ECC_SIG_SCHEMES Types <I/O>
+typedef TPMS_SCHEME_SIGHASH TPMS_SCHEME_ECDSA;
+typedef TPMS_SCHEME_SIGHASH TPMS_SCHEME_SM2;
+
+// Table 139 -- TPMS_SCHEME_ECDAA Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+ UINT16 count;
+} TPMS_SCHEME_ECDAA;
+
+// Table 140 -- TPMS_SCHEME_ECSCHNORR Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+ UINT16 count;
+} TPMS_SCHEME_ECSCHNORR;
+
+// Table 141 -- TPMU_SIG_SCHEME Union <I/O,S>
+typedef union {
+#ifdef TPM2_ALG_RSASSA
+ TPMS_SCHEME_RSASSA rsassa;
+#endif
+#ifdef TPM2_ALG_RSAPSS
+ TPMS_SCHEME_RSAPSS rsapss;
+#endif
+#ifdef TPM2_ALG_ECDSA
+ TPMS_SCHEME_ECDSA ecdsa;
+#endif
+#ifdef TPM2_ALG_SM2
+ TPMS_SCHEME_SM2 sm2;
+#endif
+#ifdef TPM2_ALG_ECDAA
+ TPMS_SCHEME_ECDAA ecdaa;
+#endif
+#ifdef TPM2_ALG_ECSCHNORR
+ TPMS_SCHEME_ECSCHNORR ecSchnorr;
+#endif
+#ifdef TPM2_ALG_HMAC
+ TPMS_SCHEME_HMAC hmac;
+#endif
+ TPMS_SCHEME_SIGHASH any;
+} TPMU_SIG_SCHEME;
+
+// Table 142 -- TPMT_SIG_SCHEME Structure <I/O>
+typedef struct {
+ TPMI_ALG_SIG_SCHEME scheme;
+ TPMU_SIG_SCHEME details;
+} TPMT_SIG_SCHEME;
+
+// Table 143 -- TPMS_SCHEME_OAEP Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+} TPMS_SCHEME_OAEP;
+
+// Table 144 -- TPMS_SCHEME_ECDH Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+} TPMS_SCHEME_ECDH;
+
+// Table 145 -- TPMS_SCHEME_MGF1 Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+} TPMS_SCHEME_MGF1;
+
+// Table 146 -- TPMS_SCHEME_KDF1_SP800_56a Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+} TPMS_SCHEME_KDF1_SP800_56a;
+
+// Table 147 -- TPMS_SCHEME_KDF2 Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+} TPMS_SCHEME_KDF2;
+
+// Table 148 -- TPMS_SCHEME_KDF1_SP800_108 Structure <I/O>
+typedef struct {
+ TPMI_ALG_HASH hashAlg;
+} TPMS_SCHEME_KDF1_SP800_108;
+
+// Table 149 -- TPMU_KDF_SCHEME Union <I/O,S>
+typedef union {
+#ifdef TPM2_ALG_MGF1
+ TPMS_SCHEME_MGF1 mgf1;
+#endif
+#ifdef TPM2_ALG_KDF1_SP800_56a
+ TPMS_SCHEME_KDF1_SP800_56a kdf1_SP800_56a;
+#endif
+#ifdef TPM2_ALG_KDF2
+ TPMS_SCHEME_KDF2 kdf2;
+#endif
+#ifdef TPM2_ALG_KDF1_SP800_108
+ TPMS_SCHEME_KDF1_SP800_108 kdf1_sp800_108;
+#endif
+
+} TPMU_KDF_SCHEME;
+
+// Table 150 -- TPMT_KDF_SCHEME Structure <I/O>
+typedef struct {
+ TPMI_ALG_KDF scheme;
+ TPMU_KDF_SCHEME details;
+} TPMT_KDF_SCHEME;
+
+typedef TPM_ALG_ID TPMI_ALG_ASYM_SCHEME;
+
+// Table 152 -- TPMU_ASYM_SCHEME Union <I/O>
+typedef union {
+#ifdef TPM2_ALG_RSASSA
+ TPMS_SCHEME_RSASSA rsassa;
+#endif
+#ifdef TPM2_ALG_RSAPSS
+ TPMS_SCHEME_RSAPSS rsapss;
+#endif
+#ifdef TPM2_ALG_OAEP
+ TPMS_SCHEME_OAEP oaep;
+#endif
+#ifdef TPM2_ALG_ECDSA
+ TPMS_SCHEME_ECDSA ecdsa;
+#endif
+#ifdef TPM2_ALG_SM2
+ TPMS_SCHEME_SM2 sm2;
+#endif
+#ifdef TPM2_ALG_ECDAA
+ TPMS_SCHEME_ECDAA ecdaa;
+#endif
+#ifdef TPM2_ALG_ECSCHNORR
+ TPMS_SCHEME_ECSCHNORR ecSchnorr;
+#endif
+ TPMS_SCHEME_SIGHASH anySig;
+} TPMU_ASYM_SCHEME;
+
+typedef struct {
+ TPMI_ALG_ASYM_SCHEME scheme;
+ TPMU_ASYM_SCHEME details;
+} TPMT_ASYM_SCHEME;
+
+// Table 154 -- TPMI_ALG_RSA_SCHEME Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_RSA_SCHEME;
+
+// Table 155 -- TPMT_RSA_SCHEME Structure <I/O>
+typedef struct {
+ TPMI_ALG_RSA_SCHEME scheme;
+ TPMU_ASYM_SCHEME details;
+} TPMT_RSA_SCHEME;
+
+// Table 156 -- TPMI_ALG_RSA_DECRYPT Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_RSA_DECRYPT;
+
+// Table 157 -- TPMT_RSA_DECRYPT Structure <I/O>
+typedef struct {
+ TPMI_ALG_RSA_DECRYPT scheme;
+ TPMU_ASYM_SCHEME details;
+} TPMT_RSA_DECRYPT;
+
+// Table 158 -- TPM2B_PUBLIC_KEY_RSA Structure <I/O>
+typedef struct {
+ UINT16 size;
+ BYTE buffer[MAX_RSA_KEY_BYTES];
+} TPM2B_PUBLIC_KEY_RSA;
+
+// Table 159 -- TPMI_RSA_KEY_BITS Type <I/O>
+typedef TPM_KEY_BITS TPMI_RSA_KEY_BITS;
+
+// Table 160 -- TPM2B_PRIVATE_KEY_RSA Structure <I/O>
+typedef struct {
+ UINT16 size;
+ BYTE buffer[MAX_RSA_KEY_BYTES/2];
+} TPM2B_PRIVATE_KEY_RSA;
+
+// Table 162 -- TPM2B_ECC_PARAMETER
+typedef struct {
+ UINT16 size;
+ BYTE buffer[MAX_ECC_KEY_BYTES];
+} TPM2B_ECC_PARAMETER;
+
+// Table 163 -- TPMS_ECC_POINT Structure <I/O>
+typedef struct {
+ TPM2B_ECC_PARAMETER x;
+ TPM2B_ECC_PARAMETER y;
+} TPMS_ECC_POINT;
+
+// Table 164 -- TPMI_ALG_ECC_SCHEME Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_ECC_SCHEME;
+
+typedef UINT16 TPM_ECC_CURVE;
+
+// Table 165 -- TPMI_ECC_CURVE Type <I/O>
+typedef TPM_ECC_CURVE TPMI_ECC_CURVE;
+
+// Table 166 -- TPMT_ECC_SCHEME Structure <I/O>
+typedef struct {
+ TPMI_ALG_ECC_SCHEME scheme;
+ TPMU_SIG_SCHEME details;
+} TPMT_ECC_SCHEME;
+
+// Table 175 -- TPMI_ALG_PUBLIC Type <I/O>
+typedef TPM_ALG_ID TPMI_ALG_PUBLIC;
+
+// Table 176 -- TPMU_PUBLIC_ID Union <I/O,S>
+typedef union {
+#ifdef TPM2_ALG_KEYEDHASH
+ TPM2B_DIGEST keyedHash;
+#endif
+#ifdef TPM2_ALG_SYMCIPHER
+ TPM2B_DIGEST sym;
+#endif
+#ifdef TPM2_ALG_RSA
+ TPM2B_PUBLIC_KEY_RSA rsa;
+#endif
+#ifdef TPM2_ALG_ECC
+ TPMS_ECC_POINT ecc;
+#endif
+} TPMU_PUBLIC_ID;
+
+// Table 177 -- TPMS_KEYEDHASH_PARMS Structure <I/O>
+typedef struct {
+ TPMT_KEYEDHASH_SCHEME scheme;
+} TPMS_KEYEDHASH_PARMS;
+
+typedef struct {
+ TPMT_SYM_DEF_OBJECT symmetric;
+ TPMT_ASYM_SCHEME scheme;
+} TPMS_ASYM_PARMS;
+
+// Table 179 -- TPMS_RSA_PARMS Structure <I/O>
+typedef struct {
+ TPMT_SYM_DEF_OBJECT symmetric;
+ TPMT_RSA_SCHEME scheme;
+ TPMI_RSA_KEY_BITS keyBits;
+ UINT32 exponent;
+} TPMS_RSA_PARMS;
+
+// Table 180 -- TPMS_ECC_PARMS Structure <I/O>
+typedef struct {
+ TPMT_SYM_DEF_OBJECT symmetric;
+ TPMT_ECC_SCHEME scheme;
+ TPMI_ECC_CURVE curveID;
+ TPMT_KDF_SCHEME kdf;
+} TPMS_ECC_PARMS;
+
+// Table 181 -- TPMU_PUBLIC_PARMS Union <I/O,S>
+typedef union {
+#ifdef TPM2_ALG_KEYEDHASH
+ TPMS_KEYEDHASH_PARMS keyedHashDetail;
+#endif
+#ifdef TPM2_ALG_SYMCIPHER
+ TPMT_SYM_DEF_OBJECT symDetail;
+#endif
+#ifdef TPM2_ALG_RSA
+ TPMS_RSA_PARMS rsaDetail;
+#endif
+#ifdef TPM2_ALG_ECC
+ TPMS_ECC_PARMS eccDetail;
+#endif
+ TPMS_ASYM_PARMS asymDetail;
+} TPMU_PUBLIC_PARMS;
+
+// Table 182 -- TPMT_PUBLIC_PARMS Structure <I/O>
+typedef struct {
+ TPMI_ALG_PUBLIC type;
+ TPMU_PUBLIC_PARMS parameters;
+} TPMT_PUBLIC_PARMS;
+
+// Table 183 -- TPMT_PUBLIC Structure <I/O>
+typedef struct {
+ TPMI_ALG_PUBLIC type;
+ TPMI_ALG_HASH nameAlg;
+ TPMA_OBJECT objectAttributes;
+ TPM2B_DIGEST authPolicy;
+ TPMU_PUBLIC_PARMS parameters;
+ TPMU_PUBLIC_ID unique;
+} TPMT_PUBLIC;
+
+// Table 184 -- TPM2B_PUBLIC
+typedef struct {
+ UINT16 size;
+ TPMT_PUBLIC publicArea;
+} TPM2B_PUBLIC;
+
+// Table 185 -- TPMU_SENSITIVE_COMPOSITE Union <I/O,S>
+typedef union {
+#ifdef TPM2_ALG_RSA
+ TPM2B_PRIVATE_KEY_RSA rsa;
+#endif
+#ifdef TPM2_ALG_ECC
+ TPM2B_ECC_PARAMETER ecc;
+#endif
+#ifdef TPM2_ALG_KEYEDHASH
+ TPM2B_SENSITIVE_DATA bits;
+#endif
+#ifdef TPM2_ALG_SYMCIPHER
+ TPM2B_SYM_KEY sym;
+#endif
+ TPM2B_SENSITIVE_DATA any;
+} TPMU_SENSITIVE_COMPOSITE;
+
+// Table 186 -- TPMT_SENSITIVE Structure <I/O>
+typedef struct {
+ TPMI_ALG_PUBLIC sensitiveType;
+ TPM2B_AUTH authValue;
+ TPM2B_DIGEST seedValue;
+ TPMU_SENSITIVE_COMPOSITE sensitive;
+} TPMT_SENSITIVE;
+
+// Table 187 -- TPM2B_SENSITIVE Structure <I/O>
+typedef struct {
+ UINT16 size;
+ TPMT_SENSITIVE sensitiveArea;
+} TPM2B_SENSITIVE;
+
+typedef struct {
+ TPM2B_DIGEST integrityOuter;
+ TPM2B_DIGEST integrityInner;
+ TPMT_SENSITIVE sensitive;
+} _PRIVATE;
+
+// Table 189 -- TPM2B_PRIVATE Structure <I/O,S>
+typedef struct {
+ UINT16 size;
+ BYTE buffer[sizeof(_PRIVATE)];
+} TPM2B_PRIVATE;
+
+// Table 204 -- TPMS_CREATION_DATA <OUT>
+typedef struct {
+ TPML_PCR_SELECTION pcrSelect;
+ TPM2B_DIGEST pcrDigest;
+ TPMA_LOCALITY locality;
+ TPM_ALG_ID parentNameAlg;
+ TPM2B_NAME parentName;
+ TPM2B_NAME parentQualifiedName;
+ TPM2B_DATA outsideInfo;
+} TPMS_CREATION_DATA;
+
+// Table 205 -- TPM2B_CREATION_DATA <OUT>
+typedef struct {
+ UINT16 size;
+ TPMS_CREATION_DATA creationData;
+} TPM2B_CREATION_DATA;
+
+/* The following structs are not part of the standard structures defined in the TPM2 spec. */
+typedef struct {
+ UINT32 size;
+ TPM_RH sessionHandle;
+ TPM2B_NONCE nonce;
+ TPMA_SESSION sessionAttributes;
+ TPM2B_AUTH auth;
+} TPM_AuthArea;
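+
+/*
+ * Example (sketch): the simplest auth area is a plaintext password
+ * session -- TPM_RS_PW with an empty nonce and empty auth value:
+ *
+ *   TPM_AuthArea pw = {
+ *       .sessionHandle = TPM_RS_PW,
+ *       .nonce = { .size = 0 },
+ *       .sessionAttributes = 0,
+ *       .auth = { .size = 0 },
+ *   };
+ *
+ * The size field is filled in by pack_TPM_AuthArea() during marshaling.
+ */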
+
+typedef struct {
+ TPM2B_SENSITIVE_CREATE inSensitive;
+ TPM2B_PUBLIC inPublic;
+ TPM2B_DATA outsideInfo;
+ TPML_PCR_SELECTION creationPCR;
+} TPM2_Create_Params_in;
+
+typedef TPM2_Create_Params_in TPM2_CreatePrimary_Params_in;
+
+typedef struct {
+ TPM2B_PUBLIC outPublic;
+ TPM2B_CREATION_DATA creationData;
+ TPM2B_DIGEST creationHash;
+ TPMT_TK_CREATION creationTicket;
+ TPM2B_NAME name;
+} TPM2_CreatePrimary_Params_out;
+
+typedef struct {
+ TPM2B_PRIVATE outPrivate;
+ TPM2B_PUBLIC outPublic;
+ TPM2B_CREATION_DATA creationData;
+ TPM2B_DIGEST creationHash;
+ TPMT_TK_CREATION creationTicket;
+} TPM2_Create_Params_out;
+
+typedef struct {
+ TPM2B_PRIVATE Private;
+ TPM2B_PUBLIC Public;
+} TPM2_RSA_KEY;
+
+/*
+ * TPM 2.0 Objects
+ */
+
+#define TPM_HT_TRANSIENT 0x80
+#define HR_SHIFT 24
+#define HR_PERMANENT (TPM_HT_TRANSIENT << HR_SHIFT)
+#define TRANSIENT_FIRST (HR_PERMANENT)
+#define MAX_LOADED_OBJECTS 3
+#define TRANSIENT_LAST (TRANSIENT_FIRST+MAX_LOADED_OBJECTS-1)
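+
+/*
+ * With TPM_HT_TRANSIENT == 0x80 and HR_SHIFT == 24, transient handles run
+ * from TRANSIENT_FIRST == 0x80000000 up to TRANSIENT_LAST == 0x80000002
+ * (MAX_LOADED_OBJECTS == 3).
+ */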
+/*
+ * TPMA_OBJECT Bits
+ */
+#define fixedTPM ((1 << 1))
+#define stClear ((1 << 2))
+#define fixedParent ((1 << 4))
+#define sensitiveDataOrigin ((1 << 5))
+#define userWithAuth ((1 << 6))
+#define adminWithPolicy ((1 << 7))
+#define noDA ((1 << 10))
+#define encryptedDuplication ((1 << 11))
+#define restricted ((1 << 16))
+#define decrypt ((1 << 17))
+#define sign ((1 << 18))
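+
+/*
+ * Example (sketch): a typical storage-key attribute mask built from the
+ * bits above -- bound to this TPM and to its parent, TPM-generated secret,
+ * password-authorized, restricted decryption key:
+ *
+ *   TPMA_OBJECT attrs = fixedTPM | fixedParent | sensitiveDataOrigin
+ *                     | userWithAuth | restricted | decrypt;
+ */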
+#endif
diff --git a/stubdom/vtpmmgr/tpmrsa.c b/stubdom/vtpmmgr/tpmrsa.c
index 2a2fa36..b18a5a3 100644
--- a/stubdom/vtpmmgr/tpmrsa.c
+++ b/stubdom/vtpmmgr/tpmrsa.c
@@ -19,8 +19,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/*
* RSA was designed by Ron Rivest, Adi Shamir and Len Adleman.
diff --git a/stubdom/vtpmmgr/tpmrsa.h b/stubdom/vtpmmgr/tpmrsa.h
index 31e5a8b..08213bb 100644
--- a/stubdom/vtpmmgr/tpmrsa.h
+++ b/stubdom/vtpmmgr/tpmrsa.h
@@ -21,8 +21,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TPMRSA_H
#define TPMRSA_H
diff --git a/stubdom/vtpmmgr/vtpm_cmd_handler.c b/stubdom/vtpmmgr/vtpm_cmd_handler.c
index 13ead93..2ac14fa 100644
--- a/stubdom/vtpmmgr/vtpm_cmd_handler.c
+++ b/stubdom/vtpmmgr/vtpm_cmd_handler.c
@@ -282,9 +282,11 @@ static TPM_RESULT vtpmmgr_GetQuote(struct tpm_opaque *opq, tpmcmd_t* tpmcmd)
void *ibuf;
uint32_t pcr_size;
TPM_PCR_SELECTION sel;
+ uint32_t extra_info_flags;
UNPACK_IN(VPTR, &ibuf, 20, UNPACK_ALIAS);
UNPACK_IN(TPM_PCR_SELECTION, &sel, UNPACK_ALIAS);
+ UNPACK_IN(TPM_DEEP_QUOTE_INFO, &extra_info_flags);
UNPACK_DONE();
if (!opq->vtpm) {
@@ -297,7 +299,7 @@ static TPM_RESULT vtpmmgr_GetQuote(struct tpm_opaque *opq, tpmcmd_t* tpmcmd)
printk("%02x", ((uint8_t*)ibuf)[i]);
printk("\n");
- status = vtpm_do_quote(opq->group, *opq->uuid, opq->kern_hash, ibuf, &sel, PACK_BUF + 256, &pcr_size, PACK_BUF);
+ status = vtpm_do_quote(opq->group, *opq->uuid, opq->kern_hash, ibuf, &sel, extra_info_flags, PACK_BUF + 256, &pcr_size, PACK_BUF);
if (status)
goto abort_egress;
tpmcmd->resp_len += 256 + pcr_size;
@@ -529,6 +531,7 @@ static TPM_RESULT vtpmmgr_GroupRegister(tpmcmd_t* tpmcmd)
sha1_context ctx;
TPM_PCR_SELECTION sel;
void *dhkx1, *dhkx2, *gk, *sig;
+ uint32_t extra_info_flags = 0;
UNPACK_GROUP(group);
UNPACK_IN(VPTR, &dhkx1, 256, UNPACK_ALIAS);
@@ -567,7 +570,7 @@ static TPM_RESULT vtpmmgr_GroupRegister(tpmcmd_t* tpmcmd)
sha1_update(&ctx, dhkx2, 256 + 32);
sha1_finish(&ctx, digest.bits);
- status = vtpm_do_quote(group, NULL, NULL, &digest, &sel, NULL, NULL, PACK_BUF);
+ status = vtpm_do_quote(group, NULL, NULL, &digest, &sel, extra_info_flags, NULL, NULL, PACK_BUF);
tpmcmd->resp_len += 256;
CMD_END;
diff --git a/stubdom/vtpmmgr/vtpm_manager.h b/stubdom/vtpmmgr/vtpm_manager.h
index 156c2ce..2d2109d 100644
--- a/stubdom/vtpmmgr/vtpm_manager.h
+++ b/stubdom/vtpmmgr/vtpm_manager.h
@@ -46,6 +46,12 @@
// Header size
#define VTPM_COMMAND_HEADER_SIZE ( 2 + 4 + 4)
+//************************ Command Params ***************************
+#define VTPM_QUOTE_FLAGS_HASH_UUID 0x00000001
+#define VTPM_QUOTE_FLAGS_VTPM_MEASUREMENTS 0x00000002
+#define VTPM_QUOTE_FLAGS_GROUP_INFO 0x00000004
+#define VTPM_QUOTE_FLAGS_GROUP_PUBKEY 0x00000008
+
//************************ Command Codes ****************************
#define VTPM_ORD_BASE 0x0000
#define TPM_VENDOR_COMMAND 0x02000000 // TPM Main, part 2, section 17.
@@ -110,6 +116,23 @@
* Get a hardware TPM quote for this vTPM. The quote will use the AIK
* associated with the group this vTPM was created in. Values specific to the
* vTPM will be extended to certain resettable PCRs.
+ * Additional information can be included in the signature by using
+ * quoteSelect as the PCR selection and by setting the flags parameter.
+ * The externData parameter for TPM_Quote is calculated as:
+ * externData = SHA1 (
+ * extraInfoFlags
+ * requestData
+ * [SHA1 (
+ * [SHA1 (UUIDs if requested)]
+ * [SHA1 (vTPM measurements if requested)]
+ * [SHA1 (vTPM group update policy if requested)]
+ * [SHA1 (vTPM group public key if requested)]
+ * ) if flags !=0 ]
+ * )
+ * The pcrValues response parameter is an array containing the requested
+ * hashes used in the externData calculation (UUIDs, vTPM measurements,
+ * vTPM group update policy, group public key), with the selected PCR
+ * values appended after them.
*
* Input:
* TPM_TAG tag VTPM_TAG_REQ
@@ -117,12 +140,14 @@
* UINT32 ordinal VTPM_ORD_GET_QUOTE
* TPM_NONCE externData Data to be quoted
* PCR_SELECTION quoteSelect PCR selection for quote.
+ * UINT32 flags Bit mask of VTPM_QUOTE_FLAGS_*
* Output:
* TPM_TAG tag VTPM_TAG_RSP
* UINT32 paramSize total size
* UINT32 status return code
* BYTE[] signature 256 bytes of signature data
- * TPM_PCRVALUE[] pcrValues Values of PCRs selected by the request
+ * TPM_PCRVALUE[] pcrValues Values of additional SHA1 hashes requested,
+ * concatenated with PCRs selected by the request
*/
#define VTPM_ORD_GET_QUOTE (VTPM_ORD_BASE + 4)
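+
+/*
+ * Example (sketch): a caller that wants the vTPM UUID and the group public
+ * key folded into externData would pass
+ *
+ *   flags = VTPM_QUOTE_FLAGS_HASH_UUID | VTPM_QUOTE_FLAGS_GROUP_PUBKEY;
+ *
+ * and would then find SHA1(UUIDs) and SHA1(group public key) at the start
+ * of pcrValues, ahead of the selected PCR values.
+ */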
diff --git a/stubdom/vtpmmgr/vtpmmgr.c b/stubdom/vtpmmgr/vtpmmgr.c
index 270ca8a..9fddaa2 100644
--- a/stubdom/vtpmmgr/vtpmmgr.c
+++ b/stubdom/vtpmmgr/vtpmmgr.c
@@ -45,6 +45,27 @@
#include "vtpmmgr.h"
#include "tcg.h"
+struct tpm_hardware_version hardware_version = {
+ .hw_version = TPM1_HARDWARE,
+};
+
+int parse_cmdline_hw(int argc, char** argv)
+{
+ int i;
+
+ for (i = 1; i < argc; ++i) {
+ if (!strcmp(argv[i], TPM2_EXTRA_OPT)) {
+ hardware_version.hw_version = TPM2_HARDWARE;
+ break;
+ }
+ }
+ return 0;
+}
+
+int hw_is_tpm2(void)
+{
+ return (hardware_version.hw_version == TPM2_HARDWARE) ? 1 : 0;
+}
void main_loop(void) {
tpmcmd_t* tpmcmd;
@@ -74,12 +95,25 @@ int main(int argc, char** argv)
sleep(2);
vtpmloginfo(VTPM_LOG_VTPM, "Starting vTPM manager domain\n");
- /* Initialize the vtpm manager */
- if(vtpmmgr_init(argc, argv) != TPM_SUCCESS) {
- vtpmlogerror(VTPM_LOG_VTPM, "Unable to initialize vtpmmgr domain!\n");
- rc = -1;
- goto exit;
- }
+ /* Parse the TPM hardware version from the extra command line */
+ parse_cmdline_hw(argc, argv);
+
+ /* Initialize the vtpm manager */
+ if (hw_is_tpm2()) {
+ vtpmloginfo(VTPM_LOG_VTPM, "Hardware : --- TPM 2.0 ---\n");
+ if (vtpmmgr2_init(argc, argv) != TPM_SUCCESS) {
+ vtpmlogerror(VTPM_LOG_VTPM, "Unable to initialize vtpmmgr domain!\n");
+ rc = -1;
+ goto exit;
+ }
+ }else{
+ vtpmloginfo(VTPM_LOG_VTPM, "Hardware : --- TPM 1.x ---\n");
+ if (vtpmmgr_init(argc, argv) != TPM_SUCCESS) {
+ vtpmlogerror(VTPM_LOG_VTPM, "Unable to initialize vtpmmgr domain!\n");
+ rc = -1;
+ goto exit;
+ }
+ }
main_loop();
diff --git a/stubdom/vtpmmgr/vtpmmgr.h b/stubdom/vtpmmgr/vtpmmgr.h
index 2d9d153..2e6f8de 100644
--- a/stubdom/vtpmmgr/vtpmmgr.h
+++ b/stubdom/vtpmmgr/vtpmmgr.h
@@ -44,10 +44,23 @@
#include "uuid.h"
#include "tcg.h"
#include "vtpm_manager.h"
+#include "tpm2_types.h"
+#define TPM2_EXTRA_OPT "tpm2=1"
#define RSA_KEY_SIZE 0x0800
#define RSA_CIPHER_SIZE (RSA_KEY_SIZE / 8)
+enum tpm_version {
+ TPM1_HARDWARE = 1,
+ TPM2_HARDWARE,
+};
+
+struct tpm_hardware_version {
+ int hw_version;
+};
+
+extern struct tpm_hardware_version hardware_version;
+
struct vtpm_globals {
int tpm_fd;
TPM_AUTH_SESSION oiap; // OIAP session for storageKey
@@ -59,6 +72,14 @@ struct vtpm_globals {
ctr_drbg_context ctr_drbg;
int hw_locality;
+
+ /* TPM 2.0 */
+ TPM_AuthArea pw_auth;
+ TPM_AuthArea srk_auth_area;
+ TPM2_HANDLE srk_handle;
+ TPM2_HANDLE sk_handle;
+ TPM2B_NAME sk_name;
+ TPM2_RSA_KEY tpm2_storage_key;
};
struct tpm_opaque {
@@ -84,4 +105,12 @@ inline TPM_RESULT vtpmmgr_rand(unsigned char* bytes, size_t num_bytes) {
return ctr_drbg_random(&vtpm_globals.ctr_drbg, bytes, num_bytes) == 0 ? 0 : TPM_FAIL;
}
+/* TPM 2.0 */
+TPM_RC tpm2_take_ownership(void);
+TPM_RC tpm2_pcr_read(int index, uint8_t *buf);
+TPM_RESULT vtpmmgr2_create(void);
+TPM_RESULT vtpmmgr2_init(int argc, char** argv);
+int parse_cmdline_hw(int argc, char** argv);
+int hw_is_tpm2(void);
+
#endif
diff --git a/tools/Makefile b/tools/Makefile
index af9798a..2618559 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -16,7 +16,6 @@ SUBDIRS-y += console
SUBDIRS-y += xenmon
SUBDIRS-y += xenstat
SUBDIRS-$(CONFIG_Linux) += memshr
-SUBDIRS-$(CONFIG_BLKTAP1) += blktap
SUBDIRS-$(CONFIG_BLKTAP2) += blktap2
SUBDIRS-$(CONFIG_NetBSD) += xenbackendd
SUBDIRS-y += libfsimage
@@ -59,7 +58,7 @@ build all: subdirs-all
.PHONY: install
install: subdirs-install
- $(INSTALL_DIR) $(DESTDIR)/var/xen/dump
+ $(INSTALL_DIR) $(DESTDIR)$(XEN_DUMP_DIR)
$(INSTALL_DIR) $(DESTDIR)/var/log/xen
$(INSTALL_DIR) $(DESTDIR)/var/lib/xen
@@ -72,30 +71,30 @@ uninstall:
rm -f $(D)$(CONFIG_DIR)/udev/rules.d/xen-backend.rules
rm -f $(D)$(CONFIG_DIR)/udev/rules.d/xend.rules
rm -f $(D)$(SYSCONFIG_DIR)/xendomains
- rm -f $(D)$(SBINDIR)/xendomains
+ rm -f $(D)$(sbindir)/xendomains
rm -f $(D)$(SYSCONFIG_DIR)/xencommons
rm -rf $(D)/var/lib/xen*
- rm -rf $(D)$(BINDIR)/cpuperf-perfcntr $(D)$(BINDIR)/cpuperf-xen
- rm -rf $(D)$(BINDIR)/xc_shadow
- rm -rf $(D)$(BINDIR)/pygrub
- rm -rf $(D)$(BINDIR)/setsize $(D)$(BINDIR)/tbctl
- rm -rf $(D)$(BINDIR)/xsls
- rm -rf $(D)$(BINDIR)/xenstore* $(D)$(BINDIR)/xentrace*
- rm -rf $(D)$(BINDIR)/xen-detect $(D)$(BINDIR)/xencons
- rm -rf $(D)$(BINDIR)/xenpvnetboot $(D)$(BINDIR)/qemu-*-xen
- rm -rf $(D)$(INCLUDEDIR)/xenctrl* $(D)$(INCLUDEDIR)/xenguest.h
- rm -rf $(D)$(INCLUDEDIR)/xs_lib.h $(D)$(INCLUDEDIR)/xs.h
- rm -rf $(D)$(INCLUDEDIR)/xenstore-compat/xs_lib.h $(D)$(INCLUDEDIR)/xenstore-compat/xs.h
- rm -rf $(D)$(INCLUDEDIR)/xenstore_lib.h $(D)$(INCLUDEDIR)/xenstore.h
- rm -rf $(D)$(INCLUDEDIR)/xen
- rm -rf $(D)$(INCLUDEDIR)/_libxl* $(D)$(INCLUDEDIR)/libxl*
- rm -rf $(D)$(INCLUDEDIR)/xenstat.h $(D)$(INCLUDEDIR)/xentoollog.h
- rm -rf $(D)$(LIBDIR)/libxenctrl* $(D)$(LIBDIR)/libxenguest*
- rm -rf $(D)$(LIBDIR)/libxenstore* $(D)$(LIBDIR)/libxlutil*
- rm -rf $(D)$(LIBDIR)/python/xen $(D)$(LIBDIR)/python/grub
+ rm -rf $(D)$(bindir)/cpuperf-perfcntr $(D)$(bindir)/cpuperf-xen
+ rm -rf $(D)$(bindir)/xc_shadow
+ rm -rf $(D)$(bindir)/pygrub
+ rm -rf $(D)$(bindir)/setsize $(D)$(bindir)/tbctl
+ rm -rf $(D)$(bindir)/xsls
+ rm -rf $(D)$(bindir)/xenstore* $(D)$(bindir)/xentrace*
+ rm -rf $(D)$(bindir)/xen-detect $(D)$(bindir)/xencons
+ rm -rf $(D)$(bindir)/xenpvnetboot $(D)$(bindir)/qemu-*-xen
+ rm -rf $(D)$(includedir)/xenctrl* $(D)$(includedir)/xenguest.h
+ rm -rf $(D)$(includedir)/xs_lib.h $(D)$(includedir)/xs.h
+ rm -rf $(D)$(includedir)/xenstore-compat/xs_lib.h $(D)$(includedir)/xenstore-compat/xs.h
+ rm -rf $(D)$(includedir)/xenstore_lib.h $(D)$(includedir)/xenstore.h
+ rm -rf $(D)$(includedir)/xen
+ rm -rf $(D)$(includedir)/_libxl* $(D)$(includedir)/libxl*
+ rm -rf $(D)$(includedir)/xenstat.h $(D)$(includedir)/xentoollog.h
+ rm -rf $(D)$(libdir)/libxenctrl* $(D)$(libdir)/libxenguest*
+ rm -rf $(D)$(libdir)/libxenstore* $(D)$(libdir)/libxlutil*
+ rm -rf $(D)$(libdir)/python/xen $(D)$(libdir)/python/grub
rm -rf $(D)$(LIBEXEC)
- rm -rf $(D)$(SBINDIR)/setmask
- rm -rf $(D)$(SBINDIR)/xen* $(D)$(SBINDIR)/netfix $(D)$(SBINDIR)/xm
+ rm -rf $(D)$(sbindir)/setmask
+ rm -rf $(D)$(sbindir)/xen* $(D)$(sbindir)/netfix $(D)$(sbindir)/xm
rm -rf $(D)$(SHAREDIR)/doc/xen
rm -rf $(D)$(SHAREDIR)/xen
rm -rf $(D)$(SHAREDIR)/qemu-xen
@@ -118,8 +117,8 @@ IOEMU_CONFIGURE_CROSS ?= --cross-prefix=$(CROSS_COMPILE) \
endif
ifeq ($(XEN_TOOLS_RPATH),y)
-QEMU_UPSTREAM_RPATH := -Wl,-rpath,$(LIBEXEC_LIB):$(LIBDIR)
-IOEMU_EXTRA_LDFLAGS := --extra-ldflags="-Wl,-rpath,$(LIBDIR)"
+QEMU_UPSTREAM_RPATH := -Wl,-rpath,$(LIBEXEC_LIB):$(libdir)
+IOEMU_EXTRA_LDFLAGS := --extra-ldflags="-Wl,-rpath,$(libdir)"
else
QEMU_UPSTREAM_RPATH := -Wl,-rpath,$(LIBEXEC_LIB)
IOEMU_EXTRA_LDFLAGS :=
@@ -130,6 +129,41 @@ ifneq ($(QEMU_ROOT),.)
export QEMU_ROOT
endif
+# Targets for external trees:
+# ${target}-dir-find
+# See if the directory exists and check it out if not.
+# ${target}-dir-force-update
+# Pull to the most recent update (as if you had checked it out for the
+# first time)
+# subdir-all-${target}-dir
+# Do "make all" for ${target}, including all prerequisites (such as
+# configure)
+# subdir-install-${target}-dir
+# Do "make install" for $TARGET
+# subdir-clean-${target}-dir
+# Do "make clean" for $TARGET
+#
+# Directories for external trees:
+# ${target}-dir
+# Used for local builds. Usually a link to ${target}-dir-remote
+# ${target}-dir-remote
+# Where remote repositories are cloned
+# ${target}
+# Where a copy of the source files are put when building a source
+# tarball for release
+#
+# Expected variables:
+# ${TARGET}_URL
+# A url from which to clone a git repo
+# ${TARGET}_REVISION
+# The target revision to check out when doing "find" or "force-update"
+# ${TARGET}_INTREE
+# The directory where the subtree can be found (usually used when building
+# a source tarball)
+# ${TARGET}_LOC
+# The ultimate location of the source (either a local dir or remote URL)
+
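+# For example, "make qemu-xen-dir-force-update" re-checks-out qemu-xen-dir
+# at the revision named by $(QEMU_UPSTREAM_REVISION) (see the targets below).
+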
+# External target: qemu-xen-traditional
qemu-xen-traditional-dir-find:
set -ex; \
if test -d $(QEMU_TRADITIONAL_LOC); then \
@@ -139,14 +173,6 @@ qemu-xen-traditional-dir-find:
$(XEN_ROOT)/scripts/git-checkout.sh $(QEMU_TRADITIONAL_LOC) $(QEMU_TRADITIONAL_REVISION) qemu-xen-traditional-dir; \
fi
-qemu-xen-dir-find:
- if test -d $(QEMU_UPSTREAM_LOC) ; then \
- mkdir -p qemu-xen-dir; \
- else \
- export GIT=$(GIT); \
- $(XEN_ROOT)/scripts/git-checkout.sh $(QEMU_UPSTREAM_LOC) $(QEMU_UPSTREAM_REVISION) qemu-xen-dir ; \
- fi
-
.PHONY: qemu-xen-traditional-dir-force-update
qemu-xen-traditional-dir-force-update: qemu-xen-traditional-dir-find
set -ex; \
@@ -159,6 +185,7 @@ qemu-xen-traditional-dir-force-update: qemu-xen-traditional-dir-find
subdir-all-qemu-xen-traditional-dir: qemu-xen-traditional-dir-find
set -e; \
$(buildmakevars2shellvars); \
+ export CONFIG_BLKTAP1=n; \
cd qemu-xen-traditional-dir; \
$(QEMU_ROOT)/xen-setup \
$(IOEMU_EXTRA_LDFLAGS) \
@@ -169,6 +196,7 @@ subdir-all-qemu-xen-traditional-dir: qemu-xen-traditional-dir-find
subdir-install-qemu-xen-traditional-dir: qemu-xen-traditional-dir-find
set -e; \
$(buildmakevars2shellvars); \
+ export CONFIG_BLKTAP1=n; \
cd qemu-xen-traditional-dir; \
$(QEMU_ROOT)/xen-setup \
--extra-cflags="$(EXTRA_CFLAGS_QEMU_TRADITIONAL)" \
@@ -182,6 +210,15 @@ subdir-clean-qemu-xen-traditional-dir:
$(MAKE) -C qemu-xen-traditional-dir clean; \
fi
+# External target: qemu-xen
+qemu-xen-dir-find:
+ if test -d $(QEMU_UPSTREAM_LOC) ; then \
+ mkdir -p qemu-xen-dir; \
+ else \
+ export GIT=$(GIT); \
+ $(XEN_ROOT)/scripts/git-checkout.sh $(QEMU_UPSTREAM_LOC) $(QEMU_UPSTREAM_REVISION) qemu-xen-dir ; \
+ fi
+
.PHONY: qemu-xen-dir-force-update
qemu-xen-dir-force-update: qemu-xen-dir-find
set -ex; \
@@ -268,7 +305,9 @@ endif
ifeq ($(CONFIG_QEMU_TRAD),y)
$(MAKE) qemu-xen-traditional-dir-force-update
endif
+ifeq ($(CONFIG_X86),y)
$(MAKE) -C firmware subtree-force-update
+endif
subtree-force-update-all:
$(MAKE) qemu-xen-dir-force-update
diff --git a/tools/Rules.mk b/tools/Rules.mk
index 87a56dc..2c422bd 100644
--- a/tools/Rules.mk
+++ b/tools/Rules.mk
@@ -51,14 +51,21 @@ LDLIBS_libxenstat = $(SHLIB_libxenctrl) $(SHLIB_libxenstore) $(XEN_LIBXENSTAT)/
SHLIB_libxenstat = -Wl,-rpath-link=$(XEN_LIBXENSTAT)
CFLAGS_libxenvchan = -I$(XEN_LIBVCHAN)
-LDLIBS_libxenvchan = $(SHLIB_libxenctrl) $(SHLIB_libxenstore) -L$(XEN_LIBVCHAN) -lxenvchan
+LDLIBS_libxenvchan = $(SHLIB_libxenctrl) $(SHLIB_libxenstore) $(XEN_LIBVCHAN)/libxenvchan$(libextension)
SHLIB_libxenvchan = -Wl,-rpath-link=$(XEN_LIBVCHAN)
+ifeq ($(debug),y)
+# Disable optimizations and enable debugging information for macros
+CFLAGS += -O0 -g3
+# But allow an override to -O0 in case Python enforces -D_FORTIFY_SOURCE=<n>.
+PY_CFLAGS += $(PY_NOOPT_CFLAGS)
+endif
+
LIBXL_BLKTAP ?= $(CONFIG_BLKTAP2)
ifeq ($(LIBXL_BLKTAP),y)
CFLAGS_libblktapctl = -I$(XEN_BLKTAP2)/control -I$(XEN_BLKTAP2)/include $(CFLAGS_xeninclude)
-LDLIBS_libblktapctl = -L$(XEN_BLKTAP2)/control -lblktapctl
+LDLIBS_libblktapctl = $(XEN_BLKTAP2)/control/libblktapctl$(libextension)
SHLIB_libblktapctl = -Wl,-rpath-link=$(XEN_BLKTAP2)/control
else
CFLAGS_libblktapctl =
@@ -116,7 +123,7 @@ subdir-all-% subdir-clean-% subdir-install-%: .phony
$(MAKE) -C $* $(patsubst subdir-%-$*,%,$@)
subdir-distclean-%: .phony
- $(MAKE) -C $* clean
+ $(MAKE) -C $* distclean
ifeq (,$(findstring clean,$(MAKECMDGOALS)))
$(XEN_ROOT)/config/Tools.mk:
diff --git a/tools/blktap/Makefile b/tools/blktap/Makefile
deleted file mode 100644
index 4020566..0000000
--- a/tools/blktap/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-XEN_ROOT = $(CURDIR)/../..
-include $(XEN_ROOT)/tools/Rules.mk
-
-SUBDIRS-y :=
-SUBDIRS-y += lib
-SUBDIRS-y += drivers
-
-.PHONY: all clean install
-all clean install: %: subdirs-%
-
-install:
- $(INSTALL_DIR) $(DESTDIR)$(DOCDIR)
- $(INSTALL_DATA) README $(DESTDIR)$(DOCDIR)/README.blktap
diff --git a/tools/blktap/README b/tools/blktap/README
deleted file mode 100644
index 5e41080..0000000
--- a/tools/blktap/README
+++ /dev/null
@@ -1,122 +0,0 @@
-Blktap Userspace Tools + Library
-================================
-
-Andrew Warfield and Julian Chesterfield
-16th June 2006
-
-{firstname.lastname}@cl.cam.ac.uk
-
-The blktap userspace toolkit provides a user-level disk I/O
-interface. The blktap mechanism involves a kernel driver that acts
-similarly to the existing Xen/Linux blkback driver, and a set of
-associated user-level libraries. Using these tools, blktap allows
-virtual block devices presented to VMs to be implemented in userspace
-and to be backed by raw partitions, files, network, etc.
-
-The key benefit of blktap is that it makes it easy and fast to write
-arbitrary block backends, and that these user-level backends actually
-perform very well. Specifically:
-
-- Metadata disk formats such as Copy-on-Write, encrypted disks, sparse
- formats and other compression features can be easily implemented.
-
-- Accessing file-based images from userspace avoids problems related
- to flushing dirty pages which are present in the Linux loopback
- driver. (Specifically, doing a large number of writes to an
- NFS-backed image don't result in the OOM killer going berserk.)
-
-- Per-disk handler processes enable easier userspace policing of block
- resources, and process-granularity QoS techniques (disk scheduling
- and related tools) may be trivially applied to block devices.
-
-- It's very easy to take advantage of userspace facilities such as
- networking libraries, compression utilities, peer-to-peer
- file-sharing systems and so on to build more complex block backends.
-
-- Crashes are contained -- incremental development/debugging is very
- fast.
-
-How it works (in one paragraph):
-
-Working in conjunction with the kernel blktap driver, all disk I/O
-requests from VMs are passed to the userspace deamon (using a shared
-memory interface) through a character device. Each active disk is
-mapped to an individual device node, allowing per-disk processes to
-implement individual block devices where desired. The userspace
-drivers are implemented using asynchronous (Linux libaio),
-O_DIRECT-based calls to preserve the unbuffered, batched and
-asynchronous request dispatch achieved with the existing blkback
-code. We provide a simple, asynchronous virtual disk interface that
-makes it quite easy to add new disk implementations.
-
-As of June 2006 the current supported disk formats are:
-
- - Raw Images (both on partitions and in image files)
- - File-backed Qcow disks
- - Standalone sparse Qcow disks
- - Fast shareable RAM disk between VMs (requires some form of cluster-based
- filesystem support e.g. OCFS2 in the guest kernel)
- - Some VMDK images - your mileage may vary
-
-Raw and QCow images have asynchronous backends and so should perform
-fairly well. VMDK is based directly on the qemu vmdk driver, which is
-synchronous (a.k.a. slow).
-
-Build and Installation Instructions
-===================================
-
-Make to configure the blktap backend driver in your dom0 kernel. It
-will cooperate fine with the existing backend driver, so you can
-experiment with tap disks without breaking existing VM configs.
-
-To build the tools separately, "make && make install" in
-tools/blktap.
-
-
-Using the Tools
-===============
-
-Prepare the image for booting. For qcow files use the qcow utilities
-installed earlier. e.g. qcow-create generates a blank standalone image
-or a file-backed CoW image. img2qcow takes an existing image or
-partition and creates a sparse, standalone qcow-based file.
-
-The userspace disk agent is configured to start automatically via xend
-(alternatively you can start it manually => 'blktapctrl')
-
-Customise the VM config file to use the 'tap' handler, followed by the
-driver type. e.g. for a raw image such as a file or partition:
-
-disk = ['tap:aio:<FILENAME>,sda1,w']
-
-e.g. for a qcow image:
-
-disk = ['tap:qcow:<FILENAME>,sda1,w']
-
-
-Mounting images in Dom0 using the blktap driver
-===============================================
-Tap (and blkback) disks are also mountable in Dom0 without requiring an
-active VM to attach. You will need to build a xenlinux Dom0 kernel that
-includes the blkfront driver (e.g. the default 'make world' or
-'make kernels' build. Simply use the xm command-line tool to activate
-the backend disks, and blkfront will generate a virtual block device that
-can be accessed in the same way as a loop device or partition:
-
-e.g. for a raw image file <FILENAME> that would normally be mounted using
-the loopback driver (such as 'mount -o loop <FILENAME> /mnt/disk'), do the
-following:
-
-xm block-attach 0 tap:aio:<FILENAME> /dev/xvda1 w 0
-mount /dev/xvda1 /mnt/disk <--- don't use loop driver
-
-In this way, you can use any of the userspace device-type drivers built
-with the blktap userspace toolkit to open and mount disks such as qcow
-or vmdk images:
-
-xm block-attach 0 tap:qcow:<FILENAME> /dev/xvda1 w 0
-mount /dev/xvda1 /mnt/disk
-
-
-
-
diff --git a/tools/blktap/drivers/Makefile b/tools/blktap/drivers/Makefile
deleted file mode 100644
index cea8b3b..0000000
--- a/tools/blktap/drivers/Makefile
+++ /dev/null
@@ -1,73 +0,0 @@
-XEN_ROOT = $(CURDIR)/../../..
-include $(XEN_ROOT)/tools/Rules.mk
-
-IBIN = blktapctrl tapdisk
-QCOW_UTIL = img2qcow qcow2raw qcow-create
-
-CFLAGS += -Werror
-CFLAGS += -Wno-unused
-CFLAGS += -I../lib
-CFLAGS += $(CFLAGS_libxenctrl)
-CFLAGS += $(CFLAGS_libxenstore)
-CFLAGS += -D_GNU_SOURCE
-
-ifeq ($(CONFIG_GCRYPT),y)
-CFLAGS += -DUSE_GCRYPT
-CRYPT_LIB := -lgcrypt
-else
-CRYPT_LIB := -lcrypto
-$(warning === libgcrypt not installed: falling back to libcrypto ===)
-endif
-
-MEMSHRLIBS :=
-ifeq ($(CONFIG_Linux), y)
-MEMSHR_DIR = ../../memshr
-CFLAGS += -DMEMSHR
-CFLAGS += -I $(MEMSHR_DIR)
-MEMSHRLIBS += $(MEMSHR_DIR)/libmemshr.a
-endif
-
-AIOLIBS := -laio
-
-CFLAGS += $(PTHREAD_CFLAGS)
-LDFLAGS += $(PTHREAD_LDFLAGS)
-
-LDLIBS_blktapctrl := $(MEMSHRLIBS) $(LDLIBS_libxenctrl) $(LDLIBS_libxenstore) -L../lib -lblktap -lrt -lm $(PTHREAD_LIBS)
-LDLIBS_img := $(AIOLIBS) $(CRYPT_LIB) $(PTHREAD_LIBS) -lz
-
-BLK-OBJS-y := block-aio.o
-BLK-OBJS-y += block-sync.o
-BLK-OBJS-y += block-vmdk.o
-BLK-OBJS-y += block-ram.o
-BLK-OBJS-y += block-qcow.o
-BLK-OBJS-y += block-qcow2.o
-BLK-OBJS-y += aes.o
-BLK-OBJS-y += tapaio.o
-BLK-OBJS-$(CONFIG_Linux) += blk_linux.o
-
-BLKTAB-OBJS-y := blktapctrl.o
-BLKTAB-OBJS-$(CONFIG_Linux) += blktapctrl_linux.o
-
-all: $(IBIN) qcow-util
-
-blktapctrl: $(BLKTAB-OBJS-y)
- $(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS_blktapctrl)
-
-tapdisk: tapdisk.o $(BLK-OBJS-y)
- $(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS_img)
-
-.PHONY: qcow-util
-qcow-util: img2qcow qcow2raw qcow-create
-
-img2qcow qcow2raw qcow-create: %: %.o $(BLK-OBJS-y)
- $(CC) $(LDFLAGS) -o $* $^ $(LDLIBS_img)
-
-install: all
- $(INSTALL_PROG) $(IBIN) $(QCOW_UTIL) $(VHD_UTIL) $(DESTDIR)$(SBINDIR)
-
-clean:
- rm -rf *.o *~ $(DEPS) xen TAGS $(IBIN) $(LIB) $(QCOW_UTIL) $(VHD_UTIL)
-
-.PHONY: clean install
-
--include $(DEPS)
diff --git a/tools/blktap/drivers/aes.c b/tools/blktap/drivers/aes.c
deleted file mode 100644
index 4d83fac..0000000
--- a/tools/blktap/drivers/aes.c
+++ /dev/null
@@ -1,1319 +0,0 @@
-/**
- *
- * aes.c - integrated in QEMU by Fabrice Bellard from the OpenSSL project.
- */
-/*
- * rijndael-alg-fst.c
- *
- * @version 3.0 (December 2000)
- *
- * Optimised ANSI C code for the Rijndael cipher (now AES)
- *
- * @author Vincent Rijmen <vincent.rijmen at esat.kuleuven.ac.be>
- * @author Antoon Bosselaers <antoon.bosselaers at esat.kuleuven.ac.be>
- * @author Paulo Barreto <paulo.barreto at terra.com.br>
- *
- * This code is hereby placed in the public domain.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
- * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
- * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-//#include "vl.h"
-#include <inttypes.h>
-#include <string.h>
-#include "aes.h"
-
-//#define NDEBUG
-#include <assert.h>
-
-typedef uint32_t u32;
-typedef uint16_t u16;
-typedef uint8_t u8;
-
-#define MAXKC (256/32)
-#define MAXKB (256/8)
-#define MAXNR 14
-
-/* This controls loop-unrolling in aes_core.c */
-#undef FULL_UNROLL
-# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))
-# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }
-
-/*
-Te0[x] = S [x].[02, 01, 01, 03];
-Te1[x] = S [x].[03, 02, 01, 01];
-Te2[x] = S [x].[01, 03, 02, 01];
-Te3[x] = S [x].[01, 01, 03, 02];
-Te4[x] = S [x].[01, 01, 01, 01];
-
-Td0[x] = Si[x].[0e, 09, 0d, 0b];
-Td1[x] = Si[x].[0b, 0e, 09, 0d];
-Td2[x] = Si[x].[0d, 0b, 0e, 09];
-Td3[x] = Si[x].[09, 0d, 0b, 0e];
-Td4[x] = Si[x].[01, 01, 01, 01];
-*/
-
-static const u32 Te0[256] = {
- 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
- 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
- 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
- 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
- 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
- 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
- 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
- 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
- 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
- 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
- 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
- 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
- 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
- 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
- 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
- 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
- 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
- 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
- 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
- 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
- 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
- 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
- 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
- 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
- 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
- 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
- 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
- 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
- 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
- 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
- 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
- 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
- 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
- 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
- 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
- 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
- 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
- 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
- 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
- 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
- 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
- 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
- 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
- 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
- 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
- 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
- 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
- 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
- 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
- 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
- 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
- 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
- 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
- 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
- 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
- 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
- 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
- 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
- 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
- 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
- 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
- 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
- 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
- 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
-};
-static const u32 Te1[256] = {
- 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
- 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
- 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
- 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
- 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
- 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
- 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
- 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
- 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
- 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
- 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
- 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
- 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
- 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
- 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
- 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
- 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
- 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
- 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
- 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
- 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
- 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
- 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
- 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
- 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
- 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
- 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
- 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
- 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
- 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
- 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
- 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
- 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
- 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
- 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
- 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
- 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
- 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
- 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
- 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
- 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
- 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
- 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
- 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
- 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
- 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
- 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
- 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
- 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
- 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
- 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
- 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
- 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
- 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
- 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
- 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
- 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
- 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
- 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
- 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
- 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
- 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
- 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
- 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
-};
-static const u32 Te2[256] = {
- 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
- 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
- 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
- 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
- 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
- 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
- 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
- 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
- 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
- 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
- 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
- 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
- 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
- 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
- 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
- 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
- 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
- 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
- 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
- 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
- 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
- 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
- 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
- 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
- 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
- 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
- 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
- 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
- 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
- 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
- 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
- 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
- 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
- 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
- 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
- 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
- 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
- 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
- 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
- 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
- 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
- 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
- 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
- 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
- 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
- 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
- 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
- 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
- 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
- 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
- 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
- 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
- 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
- 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
- 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
- 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
- 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
- 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
- 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
- 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
- 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
- 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
- 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
- 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
-};
-static const u32 Te3[256] = {
- 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
- 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
- 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
- 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
- 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
- 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
- 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
- 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
- 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
- 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
- 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
- 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
- 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
- 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
- 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
- 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
- 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
- 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
- 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
- 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
- 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
- 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
- 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
- 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
- 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
- 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
- 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
- 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
- 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
- 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
- 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
- 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
- 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
- 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
- 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
- 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
- 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
- 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
- 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
- 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
- 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
- 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
- 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
- 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
- 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
- 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
- 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
- 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
- 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
- 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
- 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
- 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
- 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
- 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
- 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
- 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
- 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
- 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
- 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
- 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
- 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
- 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
- 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
- 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
-};
-static const u32 Te4[256] = {
- 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
- 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
- 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
- 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
- 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
- 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
- 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
- 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
- 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
- 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
- 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
- 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
- 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
- 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
- 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
- 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
- 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
- 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
- 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
- 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
- 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
- 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
- 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
- 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
- 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
- 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
- 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
- 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
- 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
- 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
- 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
- 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
- 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
- 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
- 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
- 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
- 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
- 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
- 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
- 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
- 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
- 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
- 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
- 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
- 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
- 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
- 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
- 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
- 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
- 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
- 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
- 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
- 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
- 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
- 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
- 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
- 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
- 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
- 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
- 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
- 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
- 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
- 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
- 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
-};
-static const u32 Td0[256] = {
- 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
- 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
- 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
- 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
- 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
- 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
- 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
- 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
- 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
- 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
- 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
- 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
- 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
- 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
- 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
- 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
- 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
- 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
- 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
- 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
- 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
- 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
- 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
- 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
- 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
- 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
- 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
- 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
- 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
- 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
- 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
- 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
- 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
- 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
- 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
- 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
- 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
- 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
- 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
- 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
- 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
- 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
- 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
- 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
- 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
- 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
- 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
- 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
- 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
- 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
- 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
- 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
- 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
- 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
- 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
- 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
- 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
- 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
- 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
- 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
- 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
- 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
- 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
- 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
-};
-static const u32 Td1[256] = {
- 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
- 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
- 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
- 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
- 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
- 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
- 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
- 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
- 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
- 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
- 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
- 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
- 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
- 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
- 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
- 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
- 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
- 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
- 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
- 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
- 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
- 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
- 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
- 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
- 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
- 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
- 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
- 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
- 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
- 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
- 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
- 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
- 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
- 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
- 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
- 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
- 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
- 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
- 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
- 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
- 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
- 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
- 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
- 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
- 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
- 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
- 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
- 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
- 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
- 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
- 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
- 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
- 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
- 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
- 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
- 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
- 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
- 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
- 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
- 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
- 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
- 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
- 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
- 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
-};
-static const u32 Td2[256] = {
- 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
- 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
- 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
- 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
- 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
- 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
- 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
- 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
- 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
- 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
- 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
- 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
- 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
- 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
- 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
- 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
- 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
- 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
- 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
- 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
- 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
- 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
- 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
- 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
- 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
- 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
- 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
- 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
- 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
- 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
- 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
- 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
- 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
- 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
- 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
- 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
- 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
- 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
- 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
- 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
- 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
- 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
- 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
- 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
- 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
- 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
- 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
- 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
- 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
- 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
- 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
- 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
- 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
- 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
- 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
- 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
- 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
- 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
- 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
- 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
- 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
- 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
- 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
- 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
-};
-static const u32 Td3[256] = {
- 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
- 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
- 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
- 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
- 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
- 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
- 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
- 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
- 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
- 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
- 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
- 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
- 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
- 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
- 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
- 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
- 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
- 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
- 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
- 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
- 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
- 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
- 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
- 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
- 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
- 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
- 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
- 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
- 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
- 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
- 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
- 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
- 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
- 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
- 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
- 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
- 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
- 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
- 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
- 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
- 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
- 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
- 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
- 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
- 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
- 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
- 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
- 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
- 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
- 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
- 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
- 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
- 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
- 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
- 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
- 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
- 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
- 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
- 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
- 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
- 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
- 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
- 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
- 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
-};
-static const u32 Td4[256] = {
- 0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,
- 0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,
- 0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,
- 0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,
- 0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,
- 0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,
- 0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,
- 0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,
- 0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,
- 0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,
- 0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,
- 0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,
- 0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,
- 0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,
- 0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,
- 0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,
- 0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,
- 0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,
- 0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,
- 0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,
- 0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,
- 0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,
- 0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,
- 0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,
- 0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,
- 0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,
- 0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,
- 0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,
- 0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,
- 0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,
- 0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,
- 0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,
- 0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,
- 0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,
- 0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,
- 0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,
- 0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,
- 0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,
- 0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,
- 0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,
- 0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,
- 0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,
- 0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,
- 0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,
- 0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,
- 0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,
- 0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,
- 0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,
- 0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,
- 0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,
- 0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,
- 0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,
- 0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,
- 0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,
- 0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,
- 0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,
- 0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,
- 0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,
- 0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,
- 0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,
- 0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,
- 0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,
- 0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,
- 0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU,
-};
-static const u32 rcon[] = {
- 0x01000000, 0x02000000, 0x04000000, 0x08000000,
- 0x10000000, 0x20000000, 0x40000000, 0x80000000,
- 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
-};
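The rcon[] table above holds the round constants of the key schedule: successive powers of x in GF(2^8), of which AES-128 consumes at most ten, as the comment notes. A minimal standalone sketch (not part of the file being removed) that regenerates the table by doubling modulo the AES polynomial 0x11b:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t r = 1;
        for (int i = 0; i < 10; i++) {
            /* Prints 0x01000000 .. 0x36000000, matching rcon[] above. */
            printf("0x%02X000000,\n", r);
            r = (uint8_t)((r << 1) ^ ((r & 0x80) ? 0x1b : 0));
        }
        return 0;
    }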
-
-/**
- * Expand the cipher key into the encryption key schedule.
- */
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
- AES_KEY *key) {
-
- u32 *rk;
- int i = 0;
- u32 temp;
-
- if (!userKey || !key)
- return -1;
- if (bits != 128 && bits != 192 && bits != 256)
- return -2;
-
- rk = key->rd_key;
-
- if (bits==128)
- key->rounds = 10;
- else if (bits==192)
- key->rounds = 12;
- else
- key->rounds = 14;
-
- rk[0] = GETU32(userKey );
- rk[1] = GETU32(userKey + 4);
- rk[2] = GETU32(userKey + 8);
- rk[3] = GETU32(userKey + 12);
- if (bits == 128) {
- while (1) {
- temp = rk[3];
- rk[4] = rk[0] ^
- (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
- (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
- (Te4[(temp ) & 0xff] & 0x0000ff00) ^
- (Te4[(temp >> 24) ] & 0x000000ff) ^
- rcon[i];
- rk[5] = rk[1] ^ rk[4];
- rk[6] = rk[2] ^ rk[5];
- rk[7] = rk[3] ^ rk[6];
- if (++i == 10) {
- return 0;
- }
- rk += 4;
- }
- }
- rk[4] = GETU32(userKey + 16);
- rk[5] = GETU32(userKey + 20);
- if (bits == 192) {
- while (1) {
- temp = rk[ 5];
- rk[ 6] = rk[ 0] ^
- (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
- (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
- (Te4[(temp ) & 0xff] & 0x0000ff00) ^
- (Te4[(temp >> 24) ] & 0x000000ff) ^
- rcon[i];
- rk[ 7] = rk[ 1] ^ rk[ 6];
- rk[ 8] = rk[ 2] ^ rk[ 7];
- rk[ 9] = rk[ 3] ^ rk[ 8];
- if (++i == 8) {
- return 0;
- }
- rk[10] = rk[ 4] ^ rk[ 9];
- rk[11] = rk[ 5] ^ rk[10];
- rk += 6;
- }
- }
- rk[6] = GETU32(userKey + 24);
- rk[7] = GETU32(userKey + 28);
- if (bits == 256) {
- while (1) {
- temp = rk[ 7];
- rk[ 8] = rk[ 0] ^
- (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
- (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
- (Te4[(temp ) & 0xff] & 0x0000ff00) ^
- (Te4[(temp >> 24) ] & 0x000000ff) ^
- rcon[i];
- rk[ 9] = rk[ 1] ^ rk[ 8];
- rk[10] = rk[ 2] ^ rk[ 9];
- rk[11] = rk[ 3] ^ rk[10];
- if (++i == 7) {
- return 0;
- }
- temp = rk[11];
- rk[12] = rk[ 4] ^
- (Te4[(temp >> 24) ] & 0xff000000) ^
- (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
- (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
- (Te4[(temp ) & 0xff] & 0x000000ff);
- rk[13] = rk[ 5] ^ rk[12];
- rk[14] = rk[ 6] ^ rk[13];
- rk[15] = rk[ 7] ^ rk[14];
-
- rk += 8;
- }
- }
- return 0;
-}
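A minimal usage sketch for the routine above, assuming only the aes.h interface removed later in this patch; a 128-bit key expands to a 10-round schedule of 4*(10+1) = 44 words:

    #include "aes.h"

    void example_set_key(void)
    {
        static const unsigned char k[16] = "0123456789abcdef";
        AES_KEY key;
        int rc = AES_set_encrypt_key(k, 128, &key);
        /* rc == 0 on success; key.rounds is 10 and rd_key[0..43] is filled. */
        (void)rc;
    }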
-
-/**
- * Expand the cipher key into the decryption key schedule.
- */
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
- AES_KEY *key) {
-
- u32 *rk;
- int i, j, status;
- u32 temp;
-
- /* first, start with an encryption schedule */
- status = AES_set_encrypt_key(userKey, bits, key);
- if (status < 0)
- return status;
-
- rk = key->rd_key;
-
- /* invert the order of the round keys: */
- for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
- temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
- temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
- temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
- temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
- }
- /* apply the inverse MixColumn transform to all round keys but the first and the last: */
- for (i = 1; i < (key->rounds); i++) {
- rk += 4;
- rk[0] =
- Td0[Te4[(rk[0] >> 24) ] & 0xff] ^
- Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^
- Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^
- Td3[Te4[(rk[0] ) & 0xff] & 0xff];
- rk[1] =
- Td0[Te4[(rk[1] >> 24) ] & 0xff] ^
- Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^
- Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^
- Td3[Te4[(rk[1] ) & 0xff] & 0xff];
- rk[2] =
- Td0[Te4[(rk[2] >> 24) ] & 0xff] ^
- Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^
- Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^
- Td3[Te4[(rk[2] ) & 0xff] & 0xff];
- rk[3] =
- Td0[Te4[(rk[3] >> 24) ] & 0xff] ^
- Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^
- Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^
- Td3[Te4[(rk[3] ) & 0xff] & 0xff];
- }
- return 0;
-}
-
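The Td*[Te4[...] & 0xff] expressions above implement InvMixColumns on each round-key word: Te4[b] & 0xff is S[b], Td0[x] is Si[x].[0e,09,0d,0b], and Si[S[b]] = b, so the composition multiplies b by the inverse column. An equivalent per-byte computation, sketched with a hypothetical GF(2^8) helper:

    #include <stdint.h>

    /* Multiply in GF(2^8) modulo the AES polynomial 0x11b. */
    static uint8_t gmul(uint8_t a, uint8_t b)
    {
        uint8_t p = 0;
        while (b) {
            if (b & 1)
                p ^= a;
            a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
            b >>= 1;
        }
        return p;
    }

    /* InvMixColumns of one byte b, packed like Td0[Te4[b] & 0xff]. */
    static uint32_t inv_mix_byte(uint8_t b)
    {
        return ((uint32_t)gmul(0x0e, b) << 24) | ((uint32_t)gmul(0x09, b) << 16) |
               ((uint32_t)gmul(0x0d, b) << 8)  |  (uint32_t)gmul(0x0b, b);
    }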
-#ifndef AES_ASM
-/*
- * Encrypt a single block
- * in and out can overlap
- */
-void AES_encrypt(const unsigned char *in, unsigned char *out,
- const AES_KEY *key) {
-
- const u32 *rk;
- u32 s0, s1, s2, s3, t0, t1, t2, t3;
-#ifndef FULL_UNROLL
- int r;
-#endif /* ?FULL_UNROLL */
-
- assert(in && out && key);
- rk = key->rd_key;
-
- /*
- * map byte array block to cipher state
- * and add initial round key:
- */
- s0 = GETU32(in ) ^ rk[0];
- s1 = GETU32(in + 4) ^ rk[1];
- s2 = GETU32(in + 8) ^ rk[2];
- s3 = GETU32(in + 12) ^ rk[3];
-#ifdef FULL_UNROLL
- /* round 1: */
- t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
- t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
- t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
- t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
- /* round 2: */
- s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
- s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
- s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
- s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
- /* round 3: */
- t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
- t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
- t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
- t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
- /* round 4: */
- s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
- s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
- s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
- s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
- /* round 5: */
- t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
- t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
- t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
- t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
- /* round 6: */
- s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
- s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
- s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
- s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
- /* round 7: */
- t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
- t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
- t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
- t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
- /* round 8: */
- s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
- s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
- s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
- s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
- /* round 9: */
- t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
- t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
- t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
- t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
- if (key->rounds > 10) {
- /* round 10: */
- s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
- s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
- s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
- s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
- /* round 11: */
- t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
- t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
- t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
- t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
- if (key->rounds > 12) {
- /* round 12: */
- s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
- s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
- s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
- s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
- /* round 13: */
- t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
- t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
- t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
- t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
- }
- }
- rk += key->rounds << 2;
-#else /* !FULL_UNROLL */
- /*
- * Nr - 1 full rounds:
- */
- r = key->rounds >> 1;
- for (;;) {
- t0 =
- Te0[(s0 >> 24) ] ^
- Te1[(s1 >> 16) & 0xff] ^
- Te2[(s2 >> 8) & 0xff] ^
- Te3[(s3 ) & 0xff] ^
- rk[4];
- t1 =
- Te0[(s1 >> 24) ] ^
- Te1[(s2 >> 16) & 0xff] ^
- Te2[(s3 >> 8) & 0xff] ^
- Te3[(s0 ) & 0xff] ^
- rk[5];
- t2 =
- Te0[(s2 >> 24) ] ^
- Te1[(s3 >> 16) & 0xff] ^
- Te2[(s0 >> 8) & 0xff] ^
- Te3[(s1 ) & 0xff] ^
- rk[6];
- t3 =
- Te0[(s3 >> 24) ] ^
- Te1[(s0 >> 16) & 0xff] ^
- Te2[(s1 >> 8) & 0xff] ^
- Te3[(s2 ) & 0xff] ^
- rk[7];
-
- rk += 8;
- if (--r == 0) {
- break;
- }
-
- s0 =
- Te0[(t0 >> 24) ] ^
- Te1[(t1 >> 16) & 0xff] ^
- Te2[(t2 >> 8) & 0xff] ^
- Te3[(t3 ) & 0xff] ^
- rk[0];
- s1 =
- Te0[(t1 >> 24) ] ^
- Te1[(t2 >> 16) & 0xff] ^
- Te2[(t3 >> 8) & 0xff] ^
- Te3[(t0 ) & 0xff] ^
- rk[1];
- s2 =
- Te0[(t2 >> 24) ] ^
- Te1[(t3 >> 16) & 0xff] ^
- Te2[(t0 >> 8) & 0xff] ^
- Te3[(t1 ) & 0xff] ^
- rk[2];
- s3 =
- Te0[(t3 >> 24) ] ^
- Te1[(t0 >> 16) & 0xff] ^
- Te2[(t1 >> 8) & 0xff] ^
- Te3[(t2 ) & 0xff] ^
- rk[3];
- }
-#endif /* ?FULL_UNROLL */
- /*
- * apply last round and
- * map cipher state to byte array block:
- */
- s0 =
- (Te4[(t0 >> 24) ] & 0xff000000) ^
- (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
- (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
- (Te4[(t3 ) & 0xff] & 0x000000ff) ^
- rk[0];
- PUTU32(out , s0);
- s1 =
- (Te4[(t1 >> 24) ] & 0xff000000) ^
- (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
- (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
- (Te4[(t0 ) & 0xff] & 0x000000ff) ^
- rk[1];
- PUTU32(out + 4, s1);
- s2 =
- (Te4[(t2 >> 24) ] & 0xff000000) ^
- (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
- (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
- (Te4[(t1 ) & 0xff] & 0x000000ff) ^
- rk[2];
- PUTU32(out + 8, s2);
- s3 =
- (Te4[(t3 >> 24) ] & 0xff000000) ^
- (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
- (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
- (Te4[(t2 ) & 0xff] & 0x000000ff) ^
- rk[3];
- PUTU32(out + 12, s3);
-}
-
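The last round above indexes Te4 with per-byte masks; per the table comment near the top of the file, every byte lane of Te4[x] equals S[x], so the plain S-box can be recovered from it. An illustrative helper (the name is hypothetical):

    /* Each byte of Te4[x] is S[x], so masking any lane yields the S-box. */
    static inline unsigned char aes_sbox(unsigned char x)
    {
        return (unsigned char)(Te4[x] & 0xff);
    }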
-/*
- * Decrypt a single block
- * in and out can overlap
- */
-void AES_decrypt(const unsigned char *in, unsigned char *out,
- const AES_KEY *key) {
-
- const u32 *rk;
- u32 s0, s1, s2, s3, t0, t1, t2, t3;
-#ifndef FULL_UNROLL
- int r;
-#endif /* ?FULL_UNROLL */
-
- assert(in && out && key);
- rk = key->rd_key;
-
- /*
- * map byte array block to cipher state
- * and add initial round key:
- */
- s0 = GETU32(in ) ^ rk[0];
- s1 = GETU32(in + 4) ^ rk[1];
- s2 = GETU32(in + 8) ^ rk[2];
- s3 = GETU32(in + 12) ^ rk[3];
-#ifdef FULL_UNROLL
- /* round 1: */
- t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
- t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
- t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
- t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
- /* round 2: */
- s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
- s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
- s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
- s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
- /* round 3: */
- t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
- t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
- t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
- t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
- /* round 4: */
- s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
- s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
- s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
- s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
- /* round 5: */
- t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
- t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
- t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
- t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
- /* round 6: */
- s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
- s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
- s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
- s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
- /* round 7: */
- t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
- t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
- t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
- t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
- /* round 8: */
- s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
- s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
- s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
- s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
- /* round 9: */
- t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
- t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
- t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
- t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
- if (key->rounds > 10) {
- /* round 10: */
- s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
- s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
- s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
- s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
- /* round 11: */
- t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
- t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
- t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
- t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
- if (key->rounds > 12) {
- /* round 12: */
- s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
- s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
- s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
- s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
- /* round 13: */
- t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
- t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
- t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
- t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
- }
- }
- rk += key->rounds << 2;
-#else /* !FULL_UNROLL */
- /*
- * Nr - 1 full rounds:
- */
- r = key->rounds >> 1;
- for (;;) {
- t0 =
- Td0[(s0 >> 24) ] ^
- Td1[(s3 >> 16) & 0xff] ^
- Td2[(s2 >> 8) & 0xff] ^
- Td3[(s1 ) & 0xff] ^
- rk[4];
- t1 =
- Td0[(s1 >> 24) ] ^
- Td1[(s0 >> 16) & 0xff] ^
- Td2[(s3 >> 8) & 0xff] ^
- Td3[(s2 ) & 0xff] ^
- rk[5];
- t2 =
- Td0[(s2 >> 24) ] ^
- Td1[(s1 >> 16) & 0xff] ^
- Td2[(s0 >> 8) & 0xff] ^
- Td3[(s3 ) & 0xff] ^
- rk[6];
- t3 =
- Td0[(s3 >> 24) ] ^
- Td1[(s2 >> 16) & 0xff] ^
- Td2[(s1 >> 8) & 0xff] ^
- Td3[(s0 ) & 0xff] ^
- rk[7];
-
- rk += 8;
- if (--r == 0) {
- break;
- }
-
- s0 =
- Td0[(t0 >> 24) ] ^
- Td1[(t3 >> 16) & 0xff] ^
- Td2[(t2 >> 8) & 0xff] ^
- Td3[(t1 ) & 0xff] ^
- rk[0];
- s1 =
- Td0[(t1 >> 24) ] ^
- Td1[(t0 >> 16) & 0xff] ^
- Td2[(t3 >> 8) & 0xff] ^
- Td3[(t2 ) & 0xff] ^
- rk[1];
- s2 =
- Td0[(t2 >> 24) ] ^
- Td1[(t1 >> 16) & 0xff] ^
- Td2[(t0 >> 8) & 0xff] ^
- Td3[(t3 ) & 0xff] ^
- rk[2];
- s3 =
- Td0[(t3 >> 24) ] ^
- Td1[(t2 >> 16) & 0xff] ^
- Td2[(t1 >> 8) & 0xff] ^
- Td3[(t0 ) & 0xff] ^
- rk[3];
- }
-#endif /* ?FULL_UNROLL */
- /*
- * apply last round and
- * map cipher state to byte array block:
- */
- s0 =
- (Td4[(t0 >> 24) ] & 0xff000000) ^
- (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
- (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
- (Td4[(t1 ) & 0xff] & 0x000000ff) ^
- rk[0];
- PUTU32(out , s0);
- s1 =
- (Td4[(t1 >> 24) ] & 0xff000000) ^
- (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
- (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
- (Td4[(t2 ) & 0xff] & 0x000000ff) ^
- rk[1];
- PUTU32(out + 4, s1);
- s2 =
- (Td4[(t2 >> 24) ] & 0xff000000) ^
- (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
- (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
- (Td4[(t3 ) & 0xff] & 0x000000ff) ^
- rk[2];
- PUTU32(out + 8, s2);
- s3 =
- (Td4[(t3 >> 24) ] & 0xff000000) ^
- (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
- (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
- (Td4[(t0 ) & 0xff] & 0x000000ff) ^
- rk[3];
- PUTU32(out + 12, s3);
-}
-
-#endif /* AES_ASM */
-
-void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
- const unsigned long length, const AES_KEY *key,
- unsigned char *ivec, const int enc)
-{
-
- unsigned long n;
- unsigned long len = length;
- unsigned char tmp[AES_BLOCK_SIZE];
-
- assert(in && out && key && ivec);
-
- if (enc) {
- while (len >= AES_BLOCK_SIZE) {
- for(n=0; n < AES_BLOCK_SIZE; ++n)
- tmp[n] = in[n] ^ ivec[n];
- AES_encrypt(tmp, out, key);
- memcpy(ivec, out, AES_BLOCK_SIZE);
- len -= AES_BLOCK_SIZE;
- in += AES_BLOCK_SIZE;
- out += AES_BLOCK_SIZE;
- }
- if (len) {
- for(n=0; n < len; ++n)
- tmp[n] = in[n] ^ ivec[n];
- for(n=len; n < AES_BLOCK_SIZE; ++n)
- tmp[n] = ivec[n];
- AES_encrypt(tmp, tmp, key);
- memcpy(out, tmp, AES_BLOCK_SIZE);
- memcpy(ivec, tmp, AES_BLOCK_SIZE);
- }
- } else {
- while (len >= AES_BLOCK_SIZE) {
- memcpy(tmp, in, AES_BLOCK_SIZE);
- AES_decrypt(in, out, key);
- for(n=0; n < AES_BLOCK_SIZE; ++n)
- out[n] ^= ivec[n];
- memcpy(ivec, tmp, AES_BLOCK_SIZE);
- len -= AES_BLOCK_SIZE;
- in += AES_BLOCK_SIZE;
- out += AES_BLOCK_SIZE;
- }
- if (len) {
- memcpy(tmp, in, AES_BLOCK_SIZE);
- AES_decrypt(tmp, tmp, key);
- for(n=0; n < len; ++n)
- out[n] = tmp[n] ^ ivec[n];
- memcpy(ivec, tmp, AES_BLOCK_SIZE);
- }
- }
-}
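A round-trip sketch for the CBC entry point above, using only names from aes.h; the input is exactly one block, so the partial-tail path is not exercised. Note that AES_cbc_encrypt advances ivec as it goes, so the IV must be restored before decrypting:

    #include <string.h>
    #include "aes.h"

    void cbc_roundtrip(void)
    {
        static const unsigned char k[16] = { 0 };
        unsigned char iv[AES_BLOCK_SIZE] = { 0 };
        unsigned char buf[AES_BLOCK_SIZE] = "15 bytes + NUL.";
        AES_KEY enc, dec;

        AES_set_encrypt_key(k, 128, &enc);
        AES_set_decrypt_key(k, 128, &dec);

        AES_cbc_encrypt(buf, buf, sizeof buf, &enc, iv, 1); /* enc != 0: encrypt */
        memset(iv, 0, sizeof iv);                           /* restore original IV */
        AES_cbc_encrypt(buf, buf, sizeof buf, &dec, iv, 0); /* enc == 0: decrypt */
    }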
diff --git a/tools/blktap/drivers/aes.h b/tools/blktap/drivers/aes.h
deleted file mode 100644
index 9fb54a9..0000000
--- a/tools/blktap/drivers/aes.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef QEMU_AES_H
-#define QEMU_AES_H
-
-#include <stdint.h>
-
-#define AES_MAXNR 14
-#define AES_BLOCK_SIZE 16
-
-struct aes_key_st {
- uint32_t rd_key[4 *(AES_MAXNR + 1)];
- int rounds;
-};
-typedef struct aes_key_st AES_KEY;
-
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
- AES_KEY *key);
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
- AES_KEY *key);
-
-void AES_encrypt(const unsigned char *in, unsigned char *out,
- const AES_KEY *key);
-void AES_decrypt(const unsigned char *in, unsigned char *out,
- const AES_KEY *key);
-void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
- const unsigned long length, const AES_KEY *key,
- unsigned char *ivec, const int enc);
-
-#endif
diff --git a/tools/blktap/drivers/blk.h b/tools/blktap/drivers/blk.h
deleted file mode 100644
index 1cdc980..0000000
--- a/tools/blktap/drivers/blk.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-int blk_getimagesize(int fd, uint64_t *size);
-int blk_getsectorsize(int fd, uint64_t *sector_size);
diff --git a/tools/blktap/drivers/blk_linux.c b/tools/blktap/drivers/blk_linux.c
deleted file mode 100644
index bb52717..0000000
--- a/tools/blktap/drivers/blk_linux.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <inttypes.h>
-#include <sys/ioctl.h>
-#include <sys/mount.h>
-#include "tapdisk.h"
-#include "blk.h"
-
-int blk_getimagesize(int fd, uint64_t *size)
-{
- int rc;
-
- *size = 0;
- rc = ioctl(fd, BLKGETSIZE, size);
- if (rc) {
-		DPRINTF("ERR: BLKGETSIZE failed, couldn't read image size");
- return -EINVAL;
- }
-
- return 0;
-}
-
-int blk_getsectorsize(int fd, uint64_t *sector_size)
-{
-#if defined(BLKSSZGET)
- int rc;
-
- *sector_size = DEFAULT_SECTOR_SIZE;
- rc = ioctl(fd, BLKSSZGET, sector_size);
- if (rc) {
-		DPRINTF("ERR: BLKSSZGET failed. Falling back to the default sector size");
- *sector_size = DEFAULT_SECTOR_SIZE;
- }
-
- if (*sector_size != DEFAULT_SECTOR_SIZE)
- DPRINTF("Note: sector size is %"PRIu64" (not %u)\n",
- *sector_size, DEFAULT_SECTOR_SIZE);
-#else
- *sector_size = DEFAULT_SECTOR_SIZE;
-#endif
-
- return 0;
-}
-
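/*
 * Usage sketch for the helpers above (illustrative, not original code):
 * BLKGETSIZE reports the device size in 512-byte sectors, so the byte
 * size is sectors << 9. Assumes linkage against blk_linux.c and blk.h.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include "blk.h"

int main(int argc, char **argv)
{
	uint64_t sectors, ssz;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	if (blk_getimagesize(fd, &sectors) == 0 &&
	    blk_getsectorsize(fd, &ssz) == 0)
		printf("%llu bytes in %llu-byte sectors\n",
		       (unsigned long long)(sectors << 9),
		       (unsigned long long)ssz);
	close(fd);
	return 0;
}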
diff --git a/tools/blktap/drivers/blktapctrl.c b/tools/blktap/drivers/blktapctrl.c
deleted file mode 100644
index 0a8b880..0000000
--- a/tools/blktap/drivers/blktapctrl.c
+++ /dev/null
@@ -1,937 +0,0 @@
-/*
- * blktapctrl.c
- *
- * userspace controller for the blktap disks.
- * As requests for new block devices arrive,
- * the controller spawns off a separate process
- * per-disk.
- *
- *
- * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <err.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <signal.h>
-#include <fcntl.h>
-#include <sys/poll.h>
-#include <sys/ioctl.h>
-#include <string.h>
-#include <unistd.h>
-#include <xenstore.h>
-#include <sys/time.h>
-#include <syslog.h>
-#ifdef MEMSHR
-#include <memshr.h>
-#endif
-#include <sys/stat.h>
-
-#include "blktaplib.h"
-#include "blktapctrl.h"
-#include "tapdisk.h"
-#include "list.h"
-#include "xs_api.h" /* for xs_fire_next_watch() */
-
-#define PIDFILE "/var/run/blktapctrl.pid"
-
-#define NUM_POLL_FDS 2
-#define MSG_SIZE 4096
-#define MAX_TIMEOUT 10
-#define MAX_RAND_VAL 0xFFFF
-#define MAX_ATTEMPTS 10
-
-int run = 1;
-int max_timeout = MAX_TIMEOUT;
-int ctlfd = 0;
-
-int blktap_major;
-
-static int open_ctrl_socket(char *devname);
-static int write_msg(int fd, int msgtype, void *ptr, void *ptr2);
-static int read_msg(int fd, int msgtype, void *ptr);
-static driver_list_entry_t *active_disks[MAX_DISK_TYPES];
-
-
-static unsigned long long tapdisk_get_size(blkif_t *blkif)
-{
- image_t *img = (image_t *)blkif->prv;
- return img->size;
-}
-
-static unsigned long tapdisk_get_secsize(blkif_t *blkif)
-{
- image_t *img = (image_t *)blkif->prv;
- return img->secsize;
-}
-
-static unsigned int tapdisk_get_info(blkif_t *blkif)
-{
- image_t *img = (image_t *)blkif->prv;
- return img->info;
-}
-
-struct blkif_ops tapdisk_ops = {
- .get_size = tapdisk_get_size,
- .get_secsize = tapdisk_get_secsize,
- .get_info = tapdisk_get_info,
-};
-
-
-static void init_driver_list(void)
-{
- int i;
-
- for (i = 0; i < MAX_DISK_TYPES; i++)
- active_disks[i] = NULL;
- return;
-}
-
-static void init_rng(void)
-{
- static uint32_t seed;
- struct timeval tv;
-
- gettimeofday(&tv, NULL);
- seed = tv.tv_usec;
- srand48(seed);
- return;
-}
-
-static int get_tapdisk_pid(blkif_t *blkif)
-{
- int ret;
-
- if ((ret = write_msg(blkif->fds[WRITE], CTLMSG_PID, blkif, NULL))
- <= 0) {
- DPRINTF("Write_msg failed - CTLMSG_PID(%d)\n", ret);
- return -EINVAL;
- }
-
- if ((ret = read_msg(blkif->fds[READ], CTLMSG_PID_RSP, blkif))
- <= 0) {
- DPRINTF("Read_msg failure - CTLMSG_PID(%d)\n", ret);
- return -EINVAL;
- }
- return 1;
-}
-
-/* Look up the disk specified by path:
- * if found, dev points to the device string in the path
- * type is the tapdisk driver type id
- * blkif is the existing interface if this is a shared driver
- * and NULL otherwise.
- * return 0 on success, -1 on error.
- */
-
-static int test_path(char *path, char **dev, int *type, blkif_t **blkif,
- int* use_ioemu)
-{
-	char *ptr, handle[10] = "";	/* handle stays empty if no ':' found */
- int i, size, found = 0;
- size_t handle_len;
-
- size = sizeof(dtypes)/sizeof(disk_info_t *);
- *type = MAX_DISK_TYPES + 1;
- *blkif = NULL;
-
- if (!strncmp(path, "tapdisk:", strlen("tapdisk:"))) {
- *use_ioemu = 0;
- path += strlen("tapdisk:");
- } else if (!strncmp(path, "ioemu:", strlen("ioemu:"))) {
- *use_ioemu = 1;
- path += strlen("ioemu:");
- } else {
- // Use the default for the image type
- *use_ioemu = -1;
- }
-
- if ( (ptr = strstr(path, ":"))!=NULL) {
-		handle_len = (ptr - path);
-		if (handle_len >= sizeof(handle))	/* don't overflow handle[] */
-			handle_len = sizeof(handle) - 1;
-		memcpy(handle, path, handle_len);
- *dev = ptr + 1;
- ptr = handle + handle_len;
- *ptr = '\0';
- DPRINTF("Detected handle: [%s]\n",handle);
-
- for (i = 0; i < size; i++) {
- if ((strlen(dtypes[i]->handle) == handle_len) &&
- strncmp(handle, dtypes[i]->handle,
- handle_len) == 0) {
- found = 1;
- }
-
- if (found) {
- if (*use_ioemu == -1)
- *use_ioemu = dtypes[i]->use_ioemu;
- *type = dtypes[i]->idnum;
-
- if (dtypes[i]->single_handler == 1) {
- /* Check whether tapdisk process
- already exists */
- if (active_disks[dtypes[i]->idnum] == NULL)
- *blkif = NULL;
- else
- *blkif = active_disks[dtypes[i]
- ->idnum]->blkif;
- }
-
- return 0;
- }
- }
- }
-
- /* Fall-through case, we didn't find a disk driver. */
- DPRINTF("Unknown blktap disk type [%s]!\n",handle);
- *dev = NULL;
- return -1;
-}
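/*
 * Standalone sketch of the device-string convention that test_path()
 * parses (illustrative, not original code): an optional "tapdisk:" or
 * "ioemu:" provider prefix, then "<handle>:<device>". Handles such as
 * "aio" or "qcow" stand in for the real dtypes[] table entries.
 */
#include <stdio.h>
#include <string.h>

static void parse_example(const char *p)
{
	int use_ioemu = -1;	/* -1: leave the choice to the driver default */
	const char *colon;

	if (!strncmp(p, "tapdisk:", 8)) {
		use_ioemu = 0;
		p += 8;
	} else if (!strncmp(p, "ioemu:", 6)) {
		use_ioemu = 1;
		p += 6;
	}

	colon = strchr(p, ':');
	if (colon)
		printf("handle=%.*s dev=%s use_ioemu=%d\n",
		       (int)(colon - p), p, colon + 1, use_ioemu);
}

/* e.g. parse_example("tapdisk:aio:/dev/xvdb");
 *      parse_example("ioemu:qcow:/images/disk.qcow");
 *      parse_example("aio:/dev/xvdb"); */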
-
-
-static void add_disktype(blkif_t *blkif, int type)
-{
- driver_list_entry_t *entry, **pprev;
-
- if (type > MAX_DISK_TYPES)
- return;
-
- entry = malloc(sizeof(driver_list_entry_t));
- entry->blkif = blkif;
- entry->next = NULL;
-
- pprev = &active_disks[type];
- while (*pprev != NULL)
- pprev = &(*pprev)->next;
-
- *pprev = entry;
- entry->pprev = pprev;
-}
-
-static int qemu_instance_has_disks(pid_t pid)
-{
- int i;
- int count = 0;
- driver_list_entry_t *entry;
-
- for (i = 0; i < MAX_DISK_TYPES; i++) {
- entry = active_disks[i];
- while (entry) {
- if ((entry->blkif->tappid == pid) && dtypes[i]->use_ioemu)
- count++;
- entry = entry->next;
- }
- }
-
- return (count != 0);
-}
-
-static int del_disktype(blkif_t *blkif)
-{
- driver_list_entry_t *entry, **pprev;
- int type = blkif->drivertype, count = 0, close = 0;
-
- if (type > MAX_DISK_TYPES)
- return 1;
-
- pprev = &active_disks[type];
- while ((*pprev != NULL) && ((*pprev)->blkif != blkif))
- pprev = &(*pprev)->next;
-
- if ((entry = *pprev) == NULL) {
- DPRINTF("DEL_DISKTYPE: No match\n");
- return 1;
- }
-
- *pprev = entry->next;
- if (entry->next)
- entry->next->pprev = pprev;
-
- DPRINTF("DEL_DISKTYPE: Freeing entry\n");
- free(entry);
-
- /*
- * When using ioemu, all disks of one VM are connected to the same
- * qemu-dm instance. We may close the file handle only if there is
- * no other disk left for this domain.
- */
- if (dtypes[type]->use_ioemu)
- return !qemu_instance_has_disks(blkif->tappid);
-
- /* Caller should close() if no single controller, or list is empty. */
- return (!dtypes[type]->single_handler || (active_disks[type] == NULL));
-}
-
-static int write_msg(int fd, int msgtype, void *ptr, void *ptr2)
-{
- blkif_t *blkif;
- blkif_info_t *blk;
- msg_hdr_t *msg;
- msg_newdev_t *msg_dev;
- char *p, *buf, *path;
- int msglen, len, ret;
- fd_set writefds;
- struct timeval timeout;
- image_t *image, *img;
- uint32_t seed;
-
- blkif = (blkif_t *)ptr;
- blk = blkif->info;
- image = blkif->prv;
- len = 0;
-
- switch (msgtype)
- {
- case CTLMSG_PARAMS:
- path = (char *)ptr2;
- DPRINTF("Write_msg called: CTLMSG_PARAMS, sending [%s, %s]\n",
- blk->params, path);
-
- msglen = sizeof(msg_hdr_t) + strlen(path) + 1;
- buf = malloc(msglen);
-
- /*Assign header fields*/
- msg = (msg_hdr_t *)buf;
- msg->type = CTLMSG_PARAMS;
- msg->len = msglen;
- msg->drivertype = blkif->drivertype;
- msg->readonly = blkif->readonly;
-
- gettimeofday(&timeout, NULL);
- msg->cookie = blkif->cookie;
- DPRINTF("Generated cookie, %d\n",blkif->cookie);
-
- /*Copy blk->params to msg*/
- p = buf + sizeof(msg_hdr_t);
- memcpy(p, path, strlen(path) + 1);
-
- break;
-
- case CTLMSG_NEWDEV:
- DPRINTF("Write_msg called: CTLMSG_NEWDEV\n");
-
- msglen = sizeof(msg_hdr_t) + sizeof(msg_newdev_t);
- buf = malloc(msglen);
-
- /*Assign header fields*/
- msg = (msg_hdr_t *)buf;
- msg->type = CTLMSG_NEWDEV;
- msg->len = msglen;
- msg->drivertype = blkif->drivertype;
- msg->cookie = blkif->cookie;
-
- msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t));
- msg_dev->devnum = blkif->minor;
- msg_dev->domid = blkif->domid;
-
- break;
-
- case CTLMSG_CLOSE:
- DPRINTF("Write_msg called: CTLMSG_CLOSE\n");
-
- msglen = sizeof(msg_hdr_t);
- buf = malloc(msglen);
-
- /*Assign header fields*/
- msg = (msg_hdr_t *)buf;
- msg->type = CTLMSG_CLOSE;
- msg->len = msglen;
- msg->drivertype = blkif->drivertype;
- msg->cookie = blkif->cookie;
-
- break;
-
- case CTLMSG_PID:
- DPRINTF("Write_msg called: CTLMSG_PID\n");
-
- msglen = sizeof(msg_hdr_t);
- buf = malloc(msglen);
-
- /*Assign header fields*/
- msg = (msg_hdr_t *)buf;
- msg->type = CTLMSG_PID;
- msg->len = msglen;
- msg->drivertype = blkif->drivertype;
- msg->cookie = blkif->cookie;
-
- break;
-
- default:
- return -1;
- }
-
- /*Now send the message*/
- ret = 0;
- FD_ZERO(&writefds);
- FD_SET(fd,&writefds);
- timeout.tv_sec = max_timeout; /*Wait for up to max_timeout seconds*/
- timeout.tv_usec = 0;
- if (select(fd+1, (fd_set *) 0, &writefds,
- (fd_set *) 0, &timeout) > 0) {
- len = write(fd, buf, msglen);
- if (len == -1) DPRINTF("Write failed: (%d)\n",errno);
- }
- free(buf);
-
- return len;
-}
-
-static int read_msg(int fd, int msgtype, void *ptr)
-{
- blkif_t *blkif;
- blkif_info_t *blk;
- msg_hdr_t *msg;
- msg_pid_t *msg_pid;
- char *p, *buf;
- int msglen = MSG_SIZE, len, ret;
- fd_set readfds;
- struct timeval timeout;
- image_t *image, *img;
-
-
- blkif = (blkif_t *)ptr;
- blk = blkif->info;
- image = blkif->prv;
-
- buf = malloc(MSG_SIZE);
-
- ret = 0;
- FD_ZERO(&readfds);
- FD_SET(fd,&readfds);
- timeout.tv_sec = max_timeout; /*Wait for up to max_timeout seconds*/
- timeout.tv_usec = 0;
- if (select(fd+1, &readfds, (fd_set *) 0,
- (fd_set *) 0, &timeout) > 0) {
- ret = read(fd, buf, msglen);
- }
- if (ret > 0) {
- msg = (msg_hdr_t *)buf;
- switch (msg->type)
- {
- case CTLMSG_IMG:
- img = (image_t *)(buf + sizeof(msg_hdr_t));
- image->size = img->size;
- image->secsize = img->secsize;
- image->info = img->info;
-
- DPRINTF("Received CTLMSG_IMG: %llu, %lu, %u\n",
- image->size, image->secsize, image->info);
- if(msgtype != CTLMSG_IMG) ret = 0;
- break;
-
- case CTLMSG_IMG_FAIL:
- DPRINTF("Received CTLMSG_IMG_FAIL, "
- "unable to open image\n");
- ret = 0;
- break;
-
- case CTLMSG_NEWDEV_RSP:
- DPRINTF("Received CTLMSG_NEWDEV_RSP\n");
- if(msgtype != CTLMSG_NEWDEV_RSP) ret = 0;
- break;
-
- case CTLMSG_NEWDEV_FAIL:
- DPRINTF("Received CTLMSG_NEWDEV_FAIL\n");
- ret = 0;
- break;
-
- case CTLMSG_CLOSE_RSP:
- DPRINTF("Received CTLMSG_CLOSE_RSP\n");
- if (msgtype != CTLMSG_CLOSE_RSP) ret = 0;
- break;
-
- case CTLMSG_PID_RSP:
- DPRINTF("Received CTLMSG_PID_RSP\n");
- if (msgtype != CTLMSG_PID_RSP) ret = 0;
- else {
- msg_pid = (msg_pid_t *)
- (buf + sizeof(msg_hdr_t));
- blkif->tappid = msg_pid->pid;
- DPRINTF("\tPID: [%d]\n",blkif->tappid);
- }
- break;
- default:
- DPRINTF("UNKNOWN MESSAGE TYPE RECEIVED\n");
- ret = 0;
- break;
- }
- }
-
- free(buf);
-
- return ret;
-
-}
-
-static int launch_tapdisk_provider(char **argv)
-{
- pid_t child;
-
- if ((child = fork()) < 0)
- return -1;
-
- if (!child) {
- int i;
- for (i = 0 ; i < sysconf(_SC_OPEN_MAX) ; i++)
- if (i != STDIN_FILENO &&
- i != STDOUT_FILENO &&
- i != STDERR_FILENO)
- close(i);
-
- execvp(argv[0], argv);
- DPRINTF("execvp failed: %d (%s)\n", errno, strerror(errno));
- DPRINTF("PATH = %s\n", getenv("PATH"));
- _exit(1);
- } else {
- pid_t got;
- do {
- got = waitpid(child, NULL, 0);
- } while (got != child);
- }
- return child;
-}
-
-static int launch_tapdisk(char *wrctldev, char *rdctldev)
-{
- char *argv[] = { "tapdisk", wrctldev, rdctldev, NULL };
-
- if (launch_tapdisk_provider(argv) < 0)
- return -1;
-
- return 0;
-}
-
-static int launch_tapdisk_ioemu(void)
-{
- char *argv[] = { "tapdisk-ioemu", NULL };
- return launch_tapdisk_provider(argv);
-}
-
-/*
- * Connect to an ioemu-based disk provider (qemu-dm or tapdisk-ioemu)
- *
- * If the domain has a device model, connect to qemu-dm through the
- * domain specific pipe. Otherwise use a single tapdisk-ioemu instance
- * which is represented by domid 0 and provides access for Dom0 and
- * all DomUs without device model.
- */
-static int connect_qemu(blkif_t *blkif, int domid)
-{
- char *rdctldev, *wrctldev;
-
- static int tapdisk_ioemu_pid = 0;
- static int dom0_readfd = 0;
- static int dom0_writefd = 0;
- int refresh_pid = 0;
-
- if (asprintf(&rdctldev, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) < 0)
- return -1;
-
- if (asprintf(&wrctldev, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) < 0) {
- free(rdctldev);
- return -1;
- }
-
- DPRINTF("Using qemu blktap pipe: %s\n", rdctldev);
-
- if (domid == 0) {
- /*
- * tapdisk-ioemu exits as soon as the last image is
- * disconnected. Check if it is still running.
- */
- if (tapdisk_ioemu_pid == 0 || kill(tapdisk_ioemu_pid, 0)) {
- /* No device model and tapdisk-ioemu doesn't run yet */
- DPRINTF("Launching tapdisk-ioemu\n");
- launch_tapdisk_ioemu();
-
- dom0_readfd = open_ctrl_socket(wrctldev);
- dom0_writefd = open_ctrl_socket(rdctldev);
-
- refresh_pid = 1;
- }
-
- DPRINTF("Using tapdisk-ioemu connection\n");
- blkif->fds[READ] = dom0_readfd;
- blkif->fds[WRITE] = dom0_writefd;
-
- if (refresh_pid) {
- get_tapdisk_pid(blkif);
- tapdisk_ioemu_pid = blkif->tappid;
- }
-
- } else if (access(rdctldev, R_OK | W_OK) == 0) {
- /* Use existing pipe to the device model */
- DPRINTF("Using qemu-dm connection\n");
- blkif->fds[READ] = open_ctrl_socket(wrctldev);
- blkif->fds[WRITE] = open_ctrl_socket(rdctldev);
- } else {
- /* No device model => try with tapdisk-ioemu */
- DPRINTF("No device model\n");
- connect_qemu(blkif, 0);
- }
-
- free(rdctldev);
- free(wrctldev);
-
- if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1)
- return -1;
-
- DPRINTF("Attached to qemu blktap pipes\n");
- return 0;
-}
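/*
 * The tapdisk-ioemu liveness test above uses the standard kill(pid, 0)
 * idiom: signal 0 performs the existence and permission checks without
 * delivering anything. A minimal sketch (illustrative, not original):
 */
#include <sys/types.h>
#include <signal.h>
#include <errno.h>

static int process_is_alive(pid_t pid)
{
	if (pid <= 0)
		return 0;
	if (kill(pid, 0) == 0)
		return 1;		/* exists and is signalable */
	return errno == EPERM;		/* exists, owned by another user */
}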
-
-/* Launch tapdisk instance */
-static int connect_tapdisk(blkif_t *blkif, int minor)
-{
- char *rdctldev = NULL, *wrctldev = NULL;
- int ret = -1;
-
- DPRINTF("tapdisk process does not exist:\n");
-
- if (asprintf(&rdctldev,
- "%s/tapctrlread%d", BLKTAP_CTRL_DIR, minor) == -1)
- goto fail;
-
- if (asprintf(&wrctldev,
- "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, minor) == -1)
- goto fail;
-
- blkif->fds[READ] = open_ctrl_socket(rdctldev);
- blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
-
- if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1)
- goto fail;
-
- /*launch the new process*/
- DPRINTF("Launching process, CMDLINE [tapdisk %s %s]\n",
- wrctldev, rdctldev);
-
- if (launch_tapdisk(wrctldev, rdctldev) == -1) {
- DPRINTF("Unable to fork, cmdline: [tapdisk %s %s]\n",
- wrctldev, rdctldev);
- goto fail;
- }
-
- ret = 0;
-
-fail:
- if (rdctldev)
- free(rdctldev);
-
- if (wrctldev)
- free(wrctldev);
-
- return ret;
-}
-
-static int blktapctrl_new_blkif(blkif_t *blkif)
-{
- blkif_info_t *blk;
- int major, minor, fd_read, fd_write, type, new;
- char *rdctldev, *wrctldev, *ptr;
- image_t *image;
- blkif_t *exist = NULL;
- static uint16_t next_cookie = 0;
- int use_ioemu;
-
- DPRINTF("Received a poll for a new vbd\n");
- if ( ((blk=blkif->info) != NULL) && (blk->params != NULL) ) {
- if (blktap_interface_create(ctlfd, &major, &minor, blkif) < 0)
- return -1;
-
- if (test_path(blk->params, &ptr, &type, &exist, &use_ioemu) != 0) {
- DPRINTF("Error in blktap device string(%s).\n",
- blk->params);
- goto fail;
- }
- blkif->drivertype = type;
- blkif->cookie = next_cookie++;
-
- if (!exist) {
- if (use_ioemu) {
- if (connect_qemu(blkif, blkif->domid))
- goto fail;
- } else {
- if (connect_tapdisk(blkif, minor))
- goto fail;
- }
-
- } else {
- DPRINTF("Process exists!\n");
- blkif->fds[READ] = exist->fds[READ];
- blkif->fds[WRITE] = exist->fds[WRITE];
- }
-
- add_disktype(blkif, type);
- blkif->major = major;
- blkif->minor = minor;
-
- image = (image_t *)malloc(sizeof(image_t));
- blkif->prv = (void *)image;
- blkif->ops = &tapdisk_ops;
-
- /*Retrieve the PID of the new process*/
- if (get_tapdisk_pid(blkif) <= 0) {
- DPRINTF("Unable to contact disk process\n");
- goto fail;
- }
-
- /* Both of the following read and write calls will block up to
- * max_timeout val*/
- if (write_msg(blkif->fds[WRITE], CTLMSG_PARAMS, blkif, ptr)
- <= 0) {
- DPRINTF("Write_msg failed - CTLMSG_PARAMS\n");
- goto fail;
- }
-
- if (read_msg(blkif->fds[READ], CTLMSG_IMG, blkif) <= 0) {
- DPRINTF("Read_msg failure - CTLMSG_IMG\n");
- goto fail;
- }
-
- } else return -1;
-
- return 0;
-fail:
- ioctl(ctlfd, BLKTAP_IOCTL_FREEINTF, minor);
- return -EINVAL;
-}
-
-static int map_new_blktapctrl(blkif_t *blkif)
-{
- DPRINTF("Received a poll for a new devmap\n");
- if (write_msg(blkif->fds[WRITE], CTLMSG_NEWDEV, blkif, NULL) <= 0) {
- DPRINTF("Write_msg failed - CTLMSG_NEWDEV\n");
- return -EINVAL;
- }
-
- if (read_msg(blkif->fds[READ], CTLMSG_NEWDEV_RSP, blkif) <= 0) {
- DPRINTF("Read_msg failed - CTLMSG_NEWDEV_RSP\n");
- return -EINVAL;
- }
- DPRINTF("Exiting map_new_blktapctrl\n");
-
- return blkif->minor - 1;
-}
-
-static int unmap_blktapctrl(blkif_t *blkif)
-{
- DPRINTF("Unmapping vbd\n");
-
- if (write_msg(blkif->fds[WRITE], CTLMSG_CLOSE, blkif, NULL) <= 0) {
- DPRINTF("Write_msg failed - CTLMSG_CLOSE\n");
- return -EINVAL;
- }
-
- if (del_disktype(blkif)) {
- DPRINTF("Closing communication pipe to pid %d\n", blkif->tappid);
- close(blkif->fds[WRITE]);
- close(blkif->fds[READ]);
- }
-
- return 0;
-}
-
-int open_ctrl_socket(char *devname)
-{
- int ret;
- int ipc_fd;
- fd_set socks;
- struct timeval timeout;
-
- if (mkdir(BLKTAP_CTRL_DIR, 0755) == 0)
- DPRINTF("Created %s directory\n", BLKTAP_CTRL_DIR);
- ret = mkfifo(devname,S_IRWXU|S_IRWXG|S_IRWXO);
- if ( (ret != 0) && (errno != EEXIST) ) {
- DPRINTF("ERROR: pipe failed (%d)\n", errno);
- exit(0);
- }
-
- ipc_fd = open(devname,O_RDWR|O_NONBLOCK);
-
- if (ipc_fd < 0) {
- DPRINTF("FD open failed\n");
- return -1;
- }
-
- return ipc_fd;
-}
-
-static void print_drivers(void)
-{
- int i, size;
-
- size = sizeof(dtypes)/sizeof(disk_info_t *);
- DPRINTF("blktapctrl: v1.0.0\n");
- for (i = 0; i < size; i++)
- DPRINTF("Found driver: [%s]\n",dtypes[i]->name);
-}
-
-static void write_pidfile(long pid)
-{
- char buf[100];
- int len;
- int fd;
- int flags;
-
- fd = open(PIDFILE, O_RDWR | O_CREAT, 0600);
- if (fd == -1) {
- DPRINTF("Opening pid file failed (%d)\n", errno);
- exit(1);
- }
-
- /* We exit silently if daemon already running. */
- if (lockf(fd, F_TLOCK, 0) == -1)
- exit(0);
-
- /* Set FD_CLOEXEC, so that tapdisk doesn't get this file
- descriptor. */
- if ((flags = fcntl(fd, F_GETFD)) == -1) {
- DPRINTF("F_GETFD failed (%d)\n", errno);
- exit(1);
- }
- flags |= FD_CLOEXEC;
- if (fcntl(fd, F_SETFD, flags) == -1) {
- DPRINTF("F_SETFD failed (%d)\n", errno);
- exit(1);
- }
-
- len = snprintf(buf, sizeof(buf), "%ld\n", pid);
- if (write(fd, buf, len) != len) {
- DPRINTF("Writing pid file failed (%d)\n", errno);
- exit(1);
- }
-}
-
-int main(int argc, char *argv[])
-{
- char *devname;
- tapdev_info_t *ctlinfo;
- int tap_pfd, store_pfd, xs_fd, ret, timeout, pfd_count, count=0;
- struct xs_handle *h;
- struct pollfd pfd[NUM_POLL_FDS];
- pid_t process;
- char buf[128];
-
- __init_blkif();
- snprintf(buf, sizeof(buf), "BLKTAPCTRL[%d]", getpid());
- openlog(buf, LOG_CONS|LOG_ODELAY, LOG_DAEMON);
- if (daemon(0,0)) {
- DPRINTF("daemon failed (%d)\n", errno);
- goto open_failed;
- }
-
- print_drivers();
- init_driver_list();
- init_rng();
-
- register_new_blkif_hook(blktapctrl_new_blkif);
- register_new_devmap_hook(map_new_blktapctrl);
- register_new_unmap_hook(unmap_blktapctrl);
-
- ctlfd = blktap_interface_open();
- if (ctlfd < 0) {
- DPRINTF("couldn't open blktap interface\n");
- goto open_failed;
- }
-
-#ifdef MEMSHR
- memshr_daemon_initialize();
-#endif
-
- retry:
- /* Set up store connection and watch. */
- h = xs_daemon_open();
- if (h == NULL) {
- DPRINTF("xs_daemon_open failed -- "
- "is xenstore running?\n");
- if (count < MAX_ATTEMPTS) {
- count++;
- sleep(2);
- goto retry;
- } else goto open_failed;
- }
-
- ret = setup_probe_watch(h);
- if (ret != 0) {
- DPRINTF("Failed adding device probewatch\n");
- xs_daemon_close(h);
- goto open_failed;
- }
-
- ioctl(ctlfd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
-
- process = getpid();
- write_pidfile(process);
- ret = ioctl(ctlfd, BLKTAP_IOCTL_SENDPID, process );
-
- /*Static pollhooks*/
- pfd_count = 0;
- tap_pfd = pfd_count++;
- pfd[tap_pfd].fd = ctlfd;
- pfd[tap_pfd].events = POLLIN;
-
- store_pfd = pfd_count++;
- pfd[store_pfd].fd = xs_fileno(h);
- pfd[store_pfd].events = POLLIN;
-
- while (run) {
- timeout = 1000; /*Milliseconds*/
- ret = poll(pfd, pfd_count, timeout);
-
- if (ret > 0) {
- if (pfd[store_pfd].revents) {
- ret = xs_fire_next_watch(h);
- }
- }
- }
-
- xs_daemon_close(h);
- ioctl(ctlfd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH );
- close(ctlfd);
- closelog();
-
- return 0;
-
- open_failed:
- DPRINTF("Unable to start blktapctrl\n");
- closelog();
- return -1;
-}
-
-/*
- * Local variables:
- * c-file-style: "linux"
- * indent-tabs-mode: t
- * c-indent-level: 8
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
diff --git a/tools/blktap/drivers/blktapctrl_linux.c b/tools/blktap/drivers/blktapctrl_linux.c
deleted file mode 100644
index 6282fa6..0000000
--- a/tools/blktap/drivers/blktapctrl_linux.c
+++ /dev/null
@@ -1,89 +0,0 @@
-
-#include <stdio.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-
-#include "tapdisk.h"
-#include "blktaplib.h"
-#include "blktapctrl.h"
-
-static void make_blktap_dev(char *devname, int major, int minor)
-{
- struct stat st;
-
- if (lstat(devname, &st) != 0) {
- /*Need to create device*/
- if (mkdir(BLKTAP_DEV_DIR, 0755) == 0)
- DPRINTF("Created %s directory\n",BLKTAP_DEV_DIR);
- if (mknod(devname, S_IFCHR|0600,
- makedev(major, minor)) == 0)
- DPRINTF("Created %s device\n",devname);
- } else {
- DPRINTF("%s device already exists\n",devname);
- /* it already exists, but is it the same major number */
- if (((st.st_rdev>>8) & 0xff) != major) {
- DPRINTF("%s has old major %d\n",
- devname,
- (unsigned int)((st.st_rdev >> 8) & 0xff));
-			/* only try again if we succeed in deleting it */
- if (!unlink(devname))
- make_blktap_dev(devname, major, minor);
- }
- }
-}
-
-int blktap_interface_create(int ctlfd, int *major, int *minor, blkif_t *blkif)
-{
- domid_translate_t tr;
- domid_translate_ext_t tr_ext;
- int ret;
- char *devname;
-
- if (blkif->be_id >= (1<<28)) {
- /* new-style backend-id, so use the extended structure */
- tr_ext.domid = blkif->domid;
- tr_ext.busid = blkif->be_id;
- ret = ioctl(ctlfd, BLKTAP_IOCTL_NEWINTF_EXT, &tr_ext);
- DPRINTF("Sent domid %d and be_id %d\n", tr_ext.domid,
- tr_ext.busid);
- }
- else {
- /* old-style backend-id; use the old structure */
- tr.domid = blkif->domid;
- tr.busid = (unsigned short)blkif->be_id;
- ret = ioctl(ctlfd, BLKTAP_IOCTL_NEWINTF, tr);
- DPRINTF("Sent domid %d and be_id %d\n", tr.domid, tr.busid);
- }
-
- if ( (ret <= 0)||(ret > MAX_TAP_DEV) ) {
- DPRINTF("Incorrect Dev ID [%d]\n",ret);
- return -1;
- }
-
- *minor = ret;
- *major = ioctl(ctlfd, BLKTAP_IOCTL_MAJOR, ret );
- if (*major < 0) {
- DPRINTF("Incorrect Major ID [%d]\n",*major);
- return -1;
- }
-
- if (asprintf(&devname,"%s/%s%d",BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, *minor) == -1)
- return -1;
- make_blktap_dev(devname,*major,*minor);
- DPRINTF("Received device id %d and major %d\n",
- *minor, *major);
- return 0;
-}
-
-
-int blktap_interface_open(void)
-{
- int ctlfd;
-
- ctlfd = open(BLKTAP_DEV_DIR "/" BLKTAP_DEV_NAME "0", O_RDWR);
- if (ctlfd == -1)
- DPRINTF("blktap0 open failed\n");
-
- return ctlfd;
-}
diff --git a/tools/blktap/drivers/block-aio.c b/tools/blktap/drivers/block-aio.c
deleted file mode 100644
index 98727f4..0000000
--- a/tools/blktap/drivers/block-aio.c
+++ /dev/null
@@ -1,259 +0,0 @@
-/* block-aio.c
- *
- * libaio-based raw disk implementation.
- *
- * (c) 2006 Andrew Warfield and Julian Chesterfield
- *
- * NB: This code is not thread-safe.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-
-#include <errno.h>
-#include <libaio.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/statvfs.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include "tapdisk.h"
-#include "tapaio.h"
-#include "blk.h"
-
-#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
-
-/* *BSD has no O_LARGEFILE */
-#ifndef O_LARGEFILE
-#define O_LARGEFILE 0
-#endif
-
-struct tdaio_state {
- int fd;
- tap_aio_context_t aio;
-};
-
-
-/*Get Image size, secsize*/
-static int get_image_info(struct td_state *s, int fd)
-{
- int ret;
- long size;
- unsigned long total_size;
- struct statvfs statBuf;
- struct stat stat;
-
- ret = fstat(fd, &stat);
- if (ret != 0) {
-		DPRINTF("ERROR: fstat failed, couldn't stat image");
- return -EINVAL;
- }
-
- if (S_ISBLK(stat.st_mode)) {
- /*Accessing block device directly*/
- if (blk_getimagesize(fd, &s->size) != 0)
- return -EINVAL;
-
- DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
- "sector_shift [%llu]\n",
- (long long unsigned)(s->size << SECTOR_SHIFT),
- (long long unsigned)s->size);
-
- /*Get the sector size*/
- if (blk_getsectorsize(fd, &s->sector_size) != 0)
- s->sector_size = DEFAULT_SECTOR_SIZE;
-
- } else {
- /*Local file? try fstat instead*/
- s->size = (stat.st_size >> SECTOR_SHIFT);
- s->sector_size = DEFAULT_SECTOR_SIZE;
- DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
- "sector_shift [%llu]\n",
- (long long unsigned)(s->size << SECTOR_SHIFT),
- (long long unsigned)s->size);
- }
-
- if (s->size == 0) {
- s->size =((uint64_t) 16836057);
- s->sector_size = DEFAULT_SECTOR_SIZE;
- }
- s->info = 0;
-
- return 0;
-}
-
-static inline void init_fds(struct disk_driver *dd)
-{
- int i;
- struct tdaio_state *prv = (struct tdaio_state *)dd->private;
-
- for(i = 0; i < MAX_IOFD; i++)
- dd->io_fd[i] = 0;
-
- dd->io_fd[0] = prv->aio.aio_ctx.pollfd;
-}
-
-/* Open the disk file and initialize aio state. */
-static int tdaio_open (struct disk_driver *dd, const char *name, td_flag_t flags)
-{
- int i, fd, ret = 0, o_flags;
- struct td_state *s = dd->td_state;
- struct tdaio_state *prv = (struct tdaio_state *)dd->private;
-
- DPRINTF("block-aio open('%s')", name);
-
- /* Initialize AIO */
- ret = tap_aio_init(&prv->aio, 0, MAX_AIO_REQS);
- if (ret != 0)
- return ret;
-
- /* Open the file */
- o_flags = O_DIRECT | O_LARGEFILE |
- ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
- fd = open(name, o_flags);
-
- if ( (fd == -1) && (errno == EINVAL) ) {
-
- /* Maybe O_DIRECT isn't supported. */
- o_flags &= ~O_DIRECT;
- fd = open(name, o_flags);
-		if (fd != -1) DPRINTF("WARNING: Accessing image without "
-				      "O_DIRECT! (%s)\n", name);
-
- } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
-
- if (fd == -1) {
- DPRINTF("Unable to open [%s] (%d)!\n", name, 0 - errno);
- ret = 0 - errno;
- goto done;
- }
-
- prv->fd = fd;
-
- init_fds(dd);
- ret = get_image_info(s, fd);
-
-done:
- return ret;
-}
-
-static int tdaio_queue_read(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- struct td_state *s = dd->td_state;
- struct tdaio_state *prv = (struct tdaio_state *)dd->private;
- int size = nb_sectors * s->sector_size;
- uint64_t offset = sector * (uint64_t)s->sector_size;
-
- return tap_aio_read(&prv->aio, prv->fd, size, offset, buf,
- cb, id, sector, private);
-}
-
-static int tdaio_queue_write(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- struct td_state *s = dd->td_state;
- struct tdaio_state *prv = (struct tdaio_state *)dd->private;
- int size = nb_sectors * s->sector_size;
- uint64_t offset = sector * (uint64_t)s->sector_size;
-
- return tap_aio_write(&prv->aio, prv->fd, size, offset, buf,
- cb, id, sector, private);
-}
-
-static int tdaio_submit(struct disk_driver *dd)
-{
- struct tdaio_state *prv = (struct tdaio_state *)dd->private;
-
- return tap_aio_submit(&prv->aio);
-}
-
-static int tdaio_close(struct disk_driver *dd)
-{
- struct tdaio_state *prv = (struct tdaio_state *)dd->private;
-
- io_destroy(prv->aio.aio_ctx.aio_ctx);
- close(prv->fd);
-
- return 0;
-}
-
-static int tdaio_do_callbacks(struct disk_driver *dd, int sid)
-{
- int i, nr_events, rsp = 0;
- struct io_event *ep;
- struct tdaio_state *prv = (struct tdaio_state *)dd->private;
-
- nr_events = tap_aio_get_events(&prv->aio.aio_ctx);
-repeat:
- for (ep = prv->aio.aio_events, i = nr_events; i-- > 0; ep++) {
- struct iocb *io = ep->obj;
- struct pending_aio *pio;
-
- pio = &prv->aio.pending_aio[(long)io->data];
- rsp += pio->cb(dd, ep->res == io->u.c.nbytes ? 0 : 1,
- pio->sector, io->u.c.nbytes >> 9,
- pio->id, pio->private);
-
- prv->aio.iocb_free[prv->aio.iocb_free_count++] = io;
- }
-
- if (nr_events) {
- nr_events = tap_aio_more_events(&prv->aio.aio_ctx);
- goto repeat;
- }
-
- tap_aio_continue(&prv->aio.aio_ctx);
-
- return rsp;
-}
-
-static int tdaio_get_parent_id(struct disk_driver *dd, struct disk_id *id)
-{
- return TD_NO_PARENT;
-}
-
-static int tdaio_validate_parent(struct disk_driver *dd,
- struct disk_driver *parent, td_flag_t flags)
-{
- return -EINVAL;
-}
-
-struct tap_disk tapdisk_aio = {
- .disk_type = "tapdisk_aio",
- .private_data_size = sizeof(struct tdaio_state),
- .td_open = tdaio_open,
- .td_queue_read = tdaio_queue_read,
- .td_queue_write = tdaio_queue_write,
- .td_submit = tdaio_submit,
- .td_close = tdaio_close,
- .td_do_callbacks = tdaio_do_callbacks,
- .td_get_parent_id = tdaio_get_parent_id,
- .td_validate_parent = tdaio_validate_parent
-};
diff --git a/tools/blktap/drivers/block-qcow.c b/tools/blktap/drivers/block-qcow.c
deleted file mode 100644
index 0e4e9cf..0000000
--- a/tools/blktap/drivers/block-qcow.c
+++ /dev/null
@@ -1,1434 +0,0 @@
-/* block-qcow.c
- *
- * Asynchronous Qemu copy-on-write disk implementation.
- * Code based on the Qemu implementation
- * (see copyright notice below)
- *
- * (c) 2006 Andrew Warfield and Julian Chesterfield
- *
- */
-
-/*
- * Block driver for the QCOW format
- *
- * Copyright (c) 2004 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files(the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/statvfs.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include <string.h>
-#include <zlib.h>
-#include <inttypes.h>
-#include <libaio.h>
-#include "bswap.h"
-#include "aes.h"
-#include "tapdisk.h"
-#include "tapaio.h"
-#include "blk.h"
-
-/* *BSD has no O_LARGEFILE */
-#ifndef O_LARGEFILE
-#define O_LARGEFILE 0
-#endif
-
-#if 1
-#define ASSERT(_p) \
- if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
- __LINE__, __FILE__); *(int*)0=0; }
-#else
-#define ASSERT(_p) ((void)0)
-#endif
-
-#define ROUNDUP(l, s) \
-({ \
- (uint64_t)( \
- ((l) + ((s) - 1)) - (((l) + ((s) - 1)) % (s))); \
-})
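/*
 * Worked example (not original): ROUNDUP(l, s) rounds l up to the next
 * multiple of s, so ROUNDUP(5000, 4096) computes
 * (5000 + 4095) - ((5000 + 4095) % 4096) = 9095 - 903 = 8192.
 * It is used below to size the L1 table buffers.
 */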
-
-#undef IOCB_IDX
-#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
-
-#define ZERO_TEST(_b) (_b | 0x00)
-
-/**************************************************************/
-/* QEMU COW block driver with compression and encryption support */
-
-#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
-#define XEN_MAGIC (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
-#define QCOW_VERSION 1
-
-#define QCOW_CRYPT_NONE 0x00
-#define QCOW_CRYPT_AES 0x01
-
-#define QCOW_OFLAG_COMPRESSED (1LL << 63)
-#define SPARSE_FILE 0x01
-#define EXTHDR_L1_BIG_ENDIAN 0x02
-
-#ifndef O_BINARY
-#define O_BINARY 0
-#endif
-
-typedef struct QCowHeader {
- uint32_t magic;
- uint32_t version;
- uint64_t backing_file_offset;
- uint32_t backing_file_size;
- uint32_t mtime;
- uint64_t size; /* in bytes */
- uint8_t cluster_bits;
- uint8_t l2_bits;
- uint32_t crypt_method;
- uint64_t l1_table_offset;
-} QCowHeader;
-
-/*Extended header for Xen enhancements*/
-typedef struct QCowHeader_ext {
- uint32_t xmagic;
- uint32_t cksum;
- uint32_t min_cluster_alloc;
- uint32_t flags;
-} QCowHeader_ext;
-
-#define L2_CACHE_SIZE 16 /*Fixed allocation in Qemu*/
-
-struct tdqcow_state {
- int fd; /*Main Qcow file descriptor */
- uint64_t fd_end; /*Store a local record of file length */
- char *name; /*Record of the filename*/
- uint32_t backing_file_size;
- uint64_t backing_file_offset;
- int encrypted; /*File contents are encrypted or plain*/
- int cluster_bits; /*Determines length of cluster as
- *indicated by file hdr*/
- int cluster_size; /*Length of cluster*/
- int cluster_sectors; /*Number of sectors per cluster*/
- int cluster_alloc; /*Blktap fix for allocating full
- *extents*/
- int min_cluster_alloc; /*Blktap historical extent alloc*/
- int sparse; /*Indicates whether to preserve sparseness*/
- int l2_bits; /*Size of L2 table entry*/
- int l2_size; /*Full table size*/
- int l1_size; /*L1 table size*/
- uint64_t cluster_offset_mask;
- uint64_t l1_table_offset; /*L1 table offset from beginning of
- *file*/
- uint64_t *l1_table; /*L1 table entries*/
- uint64_t *l2_cache; /*We maintain a cache of size
- *L2_CACHE_SIZE of most read entries*/
- uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /*L2 cache entries*/
- uint32_t l2_cache_counts[L2_CACHE_SIZE]; /*Cache access record*/
- uint8_t *cluster_cache;
- uint8_t *cluster_data;
- uint64_t cluster_cache_offset; /**/
- uint32_t crypt_method; /*current crypt method, 0 if no
- *key yet */
- uint32_t crypt_method_header; /**/
- AES_KEY aes_encrypt_key; /*AES key*/
- AES_KEY aes_decrypt_key; /*AES key*/
-
- /* libaio state */
- tap_aio_context_t aio;
-};
-
-static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
-
-#ifdef USE_GCRYPT
-
-#include <gcrypt.h>
-
-static uint32_t gen_cksum(char *ptr, int len)
-{
- int i;
- uint32_t md[4];
-
- /* Convert L1 table to big endian */
- for(i = 0; i < len / sizeof(uint64_t); i++) {
- cpu_to_be64s(&((uint64_t*) ptr)[i]);
- }
-
- /* Generate checksum */
- gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len);
-
-	/* Convert L1 table back to native endianness */
- for(i = 0; i < len / sizeof(uint64_t); i++) {
- be64_to_cpus(&((uint64_t*) ptr)[i]);
- }
-
- return md[0];
-}
-
-#else /* use libcrypto */
-
-#include <openssl/md5.h>
-
-static uint32_t gen_cksum(char *ptr, int len)
-{
- int i;
- unsigned char *md;
- uint32_t ret;
-
- md = malloc(MD5_DIGEST_LENGTH);
- if(!md) return 0;
-
- /* Convert L1 table to big endian */
- for(i = 0; i < len / sizeof(uint64_t); i++) {
- cpu_to_be64s(&((uint64_t*) ptr)[i]);
- }
-
- /* Generate checksum */
- if (MD5((unsigned char *)ptr, len, md) != md)
- ret = 0;
- else
- memcpy(&ret, md, sizeof(uint32_t));
-
-	/* Convert L1 table back to native endianness */
- for(i = 0; i < len / sizeof(uint64_t); i++) {
- be64_to_cpus(&((uint64_t*) ptr)[i]);
- }
-
- free(md);
- return ret;
-}
-
-#endif
-
-static int get_filesize(char *filename, uint64_t *size, struct stat *st)
-{
- int fd;
- QCowHeader header;
-
- /*Set to the backing file size*/
- fd = open(filename, O_RDONLY);
- if (fd < 0)
- return -1;
-	if (read(fd, &header, sizeof(header)) != sizeof(header)) {
- close(fd);
- return -1;
- }
- close(fd);
-
- be32_to_cpus(&header.magic);
- be64_to_cpus(&header.size);
- if (header.magic == QCOW_MAGIC) {
- *size = header.size >> SECTOR_SHIFT;
- return 0;
- }
-
- if(S_ISBLK(st->st_mode)) {
- fd = open(filename, O_RDONLY);
- if (fd < 0)
- return -1;
- if (blk_getimagesize(fd, size) != 0) {
- close(fd);
- return -1;
- }
- close(fd);
- } else *size = (st->st_size >> SECTOR_SHIFT);
- return 0;
-}
-
-static int qcow_set_key(struct tdqcow_state *s, const char *key)
-{
- uint8_t keybuf[16];
- int len, i;
-
- memset(keybuf, 0, 16);
- len = strlen(key);
- if (len > 16)
- len = 16;
- /* XXX: we could compress the chars to 7 bits to increase
- entropy */
- for (i = 0; i < len; i++) {
- keybuf[i] = key[i];
- }
- s->crypt_method = s->crypt_method_header;
-
- if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
- return -1;
- if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
- return -1;
-#if 0
- /* test */
- {
- uint8_t in[16];
- uint8_t out[16];
- uint8_t tmp[16];
- for (i=0; i<16; i++)
- in[i] = i;
- AES_encrypt(in, tmp, &s->aes_encrypt_key);
- AES_decrypt(tmp, out, &s->aes_decrypt_key);
- for (i = 0; i < 16; i++)
- DPRINTF(" %02x", tmp[i]);
- DPRINTF("\n");
- for (i = 0; i < 16; i++)
- DPRINTF(" %02x", out[i]);
- DPRINTF("\n");
- }
-#endif
- return 0;
-}
-
-/*
- * The crypt function is compatible with the Linux cryptoloop
- * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
- * supported.
- */
-static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
- uint8_t *out_buf, const uint8_t *in_buf,
- int nb_sectors, int enc,
- const AES_KEY *key)
-{
- union {
- uint64_t ll[2];
- uint8_t b[16];
- } ivec;
- int i;
-
- for (i = 0; i < nb_sectors; i++) {
- ivec.ll[0] = cpu_to_le64(sector_num);
- ivec.ll[1] = 0;
- AES_cbc_encrypt(in_buf, out_buf, 512, key,
- ivec.b, enc);
- sector_num++;
- in_buf += 512;
- out_buf += 512;
- }
-}
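/*
 * The IV scheme above matches cryptoloop: the 512-byte sector number,
 * little-endian, fills the low 8 bytes of the 16-byte CBC IV and the
 * rest is zero. A portable sketch of just the IV construction
 * (illustrative; the code above uses cpu_to_le64() instead):
 */
#include <stdint.h>
#include <string.h>

static void make_sector_iv(uint64_t sector_num, uint8_t iv[16])
{
	int i;

	memset(iv, 0, 16);
	for (i = 0; i < 8; i++)		/* little-endian encoding */
		iv[i] = (uint8_t)(sector_num >> (8 * i));
}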
-
-static int qtruncate(int fd, off_t length, int sparse)
-{
- int ret, i;
- int current = 0, rem = 0;
- uint64_t sectors;
- struct stat st;
- char *buf;
-
- /* If length is greater than the current file len
- * we synchronously write zeroes to the end of the
- * file, otherwise we truncate the length down
- */
- ret = fstat(fd, &st);
- if (ret == -1)
- return -1;
- if (S_ISBLK(st.st_mode))
- return 0;
-
- sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
- current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
- rem = st.st_size % DEFAULT_SECTOR_SIZE;
-
- /* If we are extending this file, we write zeros to the end --
- * this tries to ensure that the extents allocated wind up being
- * contiguous on disk.
- */
- if(st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
- /*We are extending the file*/
- if ((ret = posix_memalign((void **)&buf,
- 512, DEFAULT_SECTOR_SIZE))) {
- DPRINTF("posix_memalign failed: %d\n", ret);
- return -1;
- }
- memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
- if (lseek(fd, 0, SEEK_END)==-1) {
- DPRINTF("Lseek EOF failed (%d), internal error\n",
- errno);
- free(buf);
- return -1;
- }
- if (rem) {
- ret = write(fd, buf, rem);
- if (ret != rem) {
- DPRINTF("write failed: ret = %d, err = %s\n",
- ret, strerror(errno));
- free(buf);
- return -1;
- }
- }
- for (i = current; i < sectors; i++ ) {
- ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
- if (ret != DEFAULT_SECTOR_SIZE) {
- DPRINTF("write failed: ret = %d, err = %s\n",
- ret, strerror(errno));
- free(buf);
- return -1;
- }
- }
- free(buf);
- } else if(sparse && (st.st_size > sectors * DEFAULT_SECTOR_SIZE))
- if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE)==-1) {
- DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
- return -1;
- }
- return 0;
-}
-
-
-/* 'allocate' is:
- *
- * 0 to not allocate.
- *
- * 1 to allocate a normal cluster (for sector indexes 'n_start' to
- * 'n_end')
- *
- * 2 to allocate a compressed cluster of size
- * 'compressed_size'. 'compressed_size' must be > 0 and <
- * cluster_size
- *
- * return 0 if not allocated.
- */
-static uint64_t get_cluster_offset(struct tdqcow_state *s,
- uint64_t offset, int allocate,
- int compressed_size,
- int n_start, int n_end)
-{
- int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
- char *tmp_ptr2, *l2_ptr, *l1_ptr;
- uint64_t *tmp_ptr;
- uint64_t l2_offset, *l2_table, cluster_offset, tmp;
- uint32_t min_count;
- int new_l2_table;
-
- /*Check L1 table for the extent offset*/
- l1_index = offset >> (s->l2_bits + s->cluster_bits);
- l2_offset = s->l1_table[l1_index];
- new_l2_table = 0;
- if (!l2_offset) {
- if (!allocate)
- return 0;
- /*
- * allocating a new l2 entry + extent
- * at the end of the file, we must also
- * update the L1 entry safely.
- */
- l2_offset = s->fd_end;
-
- /* round to cluster size */
- l2_offset = (l2_offset + s->cluster_size - 1)
- & ~(s->cluster_size - 1);
-
- /* update the L1 entry */
- s->l1_table[l1_index] = l2_offset;
- tmp = cpu_to_be64(l2_offset);
-
- /*Truncate file for L2 table
- *(initialised to zero in case we crash)*/
- if (qtruncate(s->fd,
- l2_offset + (s->l2_size * sizeof(uint64_t)),
- s->sparse) != 0) {
- DPRINTF("ERROR truncating file\n");
- return 0;
- }
- s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
-
- /*Update the L1 table entry on disk
- * (for O_DIRECT we write 4KByte blocks)*/
- l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
- l1_ptr = (char *)s->l1_table + (l1_sector << 12);
-
-		if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
-			DPRINTF("ERROR allocating memory for L1 table\n");
-			return 0;
-		}
- memcpy(tmp_ptr, l1_ptr, 4096);
-
- /* Convert block to write to big endian */
- for(i = 0; i < 4096 / sizeof(uint64_t); i++) {
- cpu_to_be64s(&tmp_ptr[i]);
- }
-
- /*
- * Issue non-asynchronous L1 write.
- * For safety, we must ensure that
- * entry is written before blocks.
- */
- lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
- if (write(s->fd, tmp_ptr, 4096) != 4096) {
- free(tmp_ptr);
- return 0;
- }
- free(tmp_ptr);
-
- new_l2_table = 1;
- goto cache_miss;
- } else if (s->min_cluster_alloc == s->l2_size) {
- /*Fast-track the request*/
- cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
- l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
- return cluster_offset + (l2_index * s->cluster_size);
- }
-
- /*Check to see if L2 entry is already cached*/
- for (i = 0; i < L2_CACHE_SIZE; i++) {
- if (l2_offset == s->l2_cache_offsets[i]) {
- /* increment the hit count */
- if (++s->l2_cache_counts[i] == 0xffffffff) {
- for (j = 0; j < L2_CACHE_SIZE; j++) {
- s->l2_cache_counts[j] >>= 1;
- }
- }
- l2_table = s->l2_cache + (i << s->l2_bits);
- goto found;
- }
- }
-
-cache_miss:
- /* not found: load a new entry in the least used one */
- min_index = 0;
- min_count = 0xffffffff;
- for (i = 0; i < L2_CACHE_SIZE; i++) {
- if (s->l2_cache_counts[i] < min_count) {
- min_count = s->l2_cache_counts[i];
- min_index = i;
- }
- }
- l2_table = s->l2_cache + (min_index << s->l2_bits);
-
- /*If extent pre-allocated, read table from disk,
- *otherwise write new table to disk*/
- if (new_l2_table) {
- /*Should we allocate the whole extent? Adjustable parameter.*/
- if (s->cluster_alloc == s->l2_size) {
- cluster_offset = l2_offset +
- (s->l2_size * sizeof(uint64_t));
- cluster_offset = (cluster_offset + s->cluster_size - 1)
- & ~(s->cluster_size - 1);
- if (qtruncate(s->fd, cluster_offset +
- (s->cluster_size * s->l2_size),
- s->sparse) != 0) {
- DPRINTF("ERROR truncating file\n");
- return 0;
- }
- s->fd_end = cluster_offset +
- (s->cluster_size * s->l2_size);
- for (i = 0; i < s->l2_size; i++) {
- l2_table[i] = cpu_to_be64(cluster_offset +
- (i*s->cluster_size));
- }
- } else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
-
- lseek(s->fd, l2_offset, SEEK_SET);
- if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
- s->l2_size * sizeof(uint64_t))
- return 0;
- } else {
- lseek(s->fd, l2_offset, SEEK_SET);
- if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
- s->l2_size * sizeof(uint64_t))
- return 0;
- }
-
- /*Update the cache entries*/
- s->l2_cache_offsets[min_index] = l2_offset;
- s->l2_cache_counts[min_index] = 1;
-
-found:
- /*The extent is split into 's->l2_size' blocks of
- *size 's->cluster_size'*/
- l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
- cluster_offset = be64_to_cpu(l2_table[l2_index]);
-
- if (!cluster_offset ||
- ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
- if (!allocate)
- return 0;
-
- if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
- (n_end - n_start) < s->cluster_sectors) {
- /* cluster is already allocated but compressed, we must
- decompress it in the case it is not completely
- overwritten */
- if (decompress_cluster(s, cluster_offset) < 0)
- return 0;
- cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
- cluster_offset = (cluster_offset + s->cluster_size - 1)
- & ~(s->cluster_size - 1);
- /* write the cluster content - not asynchronous */
- lseek(s->fd, cluster_offset, SEEK_SET);
- if (write(s->fd, s->cluster_cache, s->cluster_size) !=
- s->cluster_size)
- return -1;
- } else {
- /* allocate a new cluster */
- cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
- if (allocate == 1) {
- /* round to cluster size */
- cluster_offset =
- (cluster_offset + s->cluster_size - 1)
- & ~(s->cluster_size - 1);
- if (qtruncate(s->fd, cluster_offset +
- s->cluster_size, s->sparse)!=0) {
- DPRINTF("ERROR truncating file\n");
- return 0;
- }
- s->fd_end = (cluster_offset + s->cluster_size);
- /* if encrypted, we must initialize the cluster
- content which won't be written */
- if (s->crypt_method &&
- (n_end - n_start) < s->cluster_sectors) {
- uint64_t start_sect;
- start_sect = (offset &
- ~(s->cluster_size - 1))
- >> 9;
- memset(s->cluster_data + 512,
- 0xaa, 512);
- for (i = 0; i < s->cluster_sectors;i++)
- {
- if (i < n_start || i >= n_end)
- {
- encrypt_sectors(s, start_sect + i,
- s->cluster_data,
- s->cluster_data + 512, 1, 1,
- &s->aes_encrypt_key);
- lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
- if (write(s->fd, s->cluster_data, 512) != 512)
- return -1;
- }
- }
- }
- } else {
- cluster_offset |= QCOW_OFLAG_COMPRESSED |
- (uint64_t)compressed_size
- << (63 - s->cluster_bits);
- }
- }
- /* update L2 table */
- tmp = cpu_to_be64(cluster_offset);
- l2_table[l2_index] = tmp;
-
- /*For IO_DIRECT we write 4KByte blocks*/
- l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
- l2_ptr = (char *)l2_table + (l2_sector << 12);
-
-		if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
-			DPRINTF("ERROR allocating memory for L2 table\n");
-			return 0;
-		}
- memcpy(tmp_ptr2, l2_ptr, 4096);
- lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
- if (write(s->fd, tmp_ptr2, 4096) != 4096) {
- free(tmp_ptr2);
- return -1;
- }
- free(tmp_ptr2);
- }
- return cluster_offset;
-}
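/*
 * Illustrative (non-original) helper showing the three 'allocate'
 * modes documented above; assumes the static declarations from this
 * file are in scope and that 's' is an opened tdqcow_state.
 */
static uint64_t demo_cluster_calls(struct tdqcow_state *s, uint64_t sector,
				   int n_start, int n_end, int csize)
{
	/* allocate == 0: probe; returns 0 if the cluster is unallocated */
	uint64_t off = get_cluster_offset(s, sector << 9, 0, 0, 0, 0);

	/* allocate == 1: allocate a normal cluster covering sector
	 * indexes n_start..n_end within the cluster */
	if (!off)
		off = get_cluster_offset(s, sector << 9, 1, 0, n_start, n_end);

	/* allocate == 2: record a compressed cluster of csize bytes
	 * (0 < csize < cluster_size) */
	(void)get_cluster_offset(s, sector << 9, 2, csize, 0, 0);

	return off;
}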
-
-static void init_cluster_cache(struct disk_driver *dd)
-{
- struct td_state *bs = dd->td_state;
- struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
- uint32_t count = 0;
- int i, cluster_entries;
-
- cluster_entries = s->cluster_size / 512;
- DPRINTF("Initialising Cluster cache, %d sectors per cluster (%d cluster size)\n",
- cluster_entries, s->cluster_size);
-
- for (i = 0; i < bs->size; i += cluster_entries) {
- if (get_cluster_offset(s, i << 9, 0, 0, 0, 1)) count++;
- if (count >= L2_CACHE_SIZE) return;
- }
- DPRINTF("Finished cluster initialisation, added %d entries\n", count);
- return;
-}
-
-static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
- int nb_sectors, int *pnum)
-{
- int index_in_cluster, n;
- uint64_t cluster_offset;
-
- cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
- index_in_cluster = sector_num & (s->cluster_sectors - 1);
- n = s->cluster_sectors - index_in_cluster;
- if (n > nb_sectors)
- n = nb_sectors;
- *pnum = n;
- return (cluster_offset != 0);
-}
-
-static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
- const uint8_t *buf, int buf_size)
-{
- z_stream strm1, *strm = &strm1;
- int ret, out_len;
-
- memset(strm, 0, sizeof(*strm));
-
- strm->next_in = (uint8_t *)buf;
- strm->avail_in = buf_size;
- strm->next_out = out_buf;
- strm->avail_out = out_buf_size;
-
- ret = inflateInit2(strm, -12);
- if (ret != Z_OK)
- return -1;
- ret = inflate(strm, Z_FINISH);
- out_len = strm->next_out - out_buf;
- if ( (ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
- (out_len != out_buf_size) ) {
- inflateEnd(strm);
- return -1;
- }
- inflateEnd(strm);
- return 0;
-}
-
-static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
-{
- int ret, csize;
- uint64_t coffset;
-
- coffset = cluster_offset & s->cluster_offset_mask;
- if (s->cluster_cache_offset != coffset) {
- csize = cluster_offset >> (63 - s->cluster_bits);
- csize &= (s->cluster_size - 1);
- lseek(s->fd, coffset, SEEK_SET);
- ret = read(s->fd, s->cluster_data, csize);
- if (ret != csize)
- return -1;
- if (decompress_buffer(s->cluster_cache, s->cluster_size,
- s->cluster_data, csize) < 0) {
- return -1;
- }
- s->cluster_cache_offset = coffset;
- }
- return 0;
-}
-
-static inline void init_fds(struct disk_driver *dd)
-{
- int i;
- struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
-
- for(i = 0; i < MAX_IOFD; i++)
- dd->io_fd[i] = 0;
-
- dd->io_fd[0] = s->aio.aio_ctx.pollfd;
-}
-
-/* Open the disk file and initialize qcow state. */
-static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags)
-{
- int fd, len, i, shift, ret, size, l1_table_size, o_flags, l1_table_block;
- int max_aio_reqs;
- struct td_state *bs = dd->td_state;
- struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
- char *buf, *buf2;
- QCowHeader *header;
- QCowHeader_ext *exthdr;
- uint32_t cksum;
- uint64_t final_cluster = 0;
-
- DPRINTF("QCOW: Opening %s\n",name);
-
- o_flags = O_DIRECT | O_LARGEFILE |
- ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
- fd = open(name, o_flags);
- if (fd < 0) {
- DPRINTF("Unable to open %s (%d)\n",name,0 - errno);
- return -1;
- }
-
- s->fd = fd;
- if (asprintf(&s->name,"%s", name) == -1) {
- close(fd);
- return -1;
- }
-
- ASSERT(sizeof(QCowHeader) + sizeof(QCowHeader_ext) < 512);
-
- ret = posix_memalign((void **)&buf, 512, 512);
- if (ret != 0) goto fail;
-
- if (read(fd, buf, 512) != 512)
- goto fail;
-
- header = (QCowHeader *)buf;
- be32_to_cpus(&header->magic);
- be32_to_cpus(&header->version);
- be64_to_cpus(&header->backing_file_offset);
- be32_to_cpus(&header->backing_file_size);
- be32_to_cpus(&header->mtime);
- be64_to_cpus(&header->size);
- be32_to_cpus(&header->crypt_method);
- be64_to_cpus(&header->l1_table_offset);
-
- if (header->magic != QCOW_MAGIC)
- goto fail;
-
- switch (header->version) {
- case QCOW_VERSION:
- break;
- case 2:
- close(fd);
- dd->drv = &tapdisk_qcow2;
- return dd->drv->td_open(dd, name, flags);
- default:
- goto fail;
- }
-
- if (header->size <= 1 || header->cluster_bits < 9)
- goto fail;
- if (header->crypt_method > QCOW_CRYPT_AES)
- goto fail;
- s->crypt_method_header = header->crypt_method;
- if (s->crypt_method_header)
- s->encrypted = 1;
- s->cluster_bits = header->cluster_bits;
- s->cluster_size = 1 << s->cluster_bits;
- s->cluster_sectors = 1 << (s->cluster_bits - 9);
- s->l2_bits = header->l2_bits;
- s->l2_size = 1 << s->l2_bits;
- s->cluster_alloc = s->l2_size;
- bs->size = header->size / 512;
- s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
- s->backing_file_offset = header->backing_file_offset;
- s->backing_file_size = header->backing_file_size;
-
- /* read the level 1 table */
- shift = s->cluster_bits + s->l2_bits;
- s->l1_size = ROUNDUP(header->size, 1LL << shift);
-
- s->l1_table_offset = header->l1_table_offset;
-
- /*allocate a 4Kbyte multiple of memory*/
- l1_table_size = s->l1_size * sizeof(uint64_t);
- if (l1_table_size % 4096 > 0) {
- l1_table_size = ROUNDUP(l1_table_size, 4096);
- }
- ret = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
- if (ret != 0) goto fail;
-
- memset(s->l1_table, 0x00, l1_table_size);
-
- DPRINTF("L1 Table offset detected: %llu, size %d (%d)\n",
- (long long)s->l1_table_offset,
- (int) (s->l1_size * sizeof(uint64_t)),
- l1_table_size);
-
- lseek(fd, 0, SEEK_SET);
- l1_table_block = l1_table_size + s->l1_table_offset;
- l1_table_block = ROUNDUP(l1_table_block, 512);
- ret = posix_memalign((void **)&buf2, 4096, l1_table_block);
- if (ret != 0) goto fail;
- if (read(fd, buf2, l1_table_block) < l1_table_size + s->l1_table_offset)
- goto fail;
- memcpy(s->l1_table, buf2 + s->l1_table_offset, l1_table_size);
-
- for(i = 0; i < s->l1_size; i++) {
- be64_to_cpus(&s->l1_table[i]);
- //DPRINTF("L1[%d] => %llu\n", i, s->l1_table[i]);
- if (s->l1_table[i] > final_cluster)
- final_cluster = s->l1_table[i];
- }
-
- /* alloc L2 cache */
- size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
- ret = posix_memalign((void **)&s->l2_cache, 4096, size);
- if(ret != 0) goto fail;
-
- size = s->cluster_size;
- ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
- if(ret != 0) goto fail;
-
- ret = posix_memalign((void **)&s->cluster_data, 4096, size);
- if(ret != 0) goto fail;
- s->cluster_cache_offset = -1;
-
- if (s->backing_file_offset != 0)
- s->cluster_alloc = 1; /*Cannot use pre-alloc*/
-
- bs->sector_size = 512;
- bs->info = 0;
-
- /*Detect min_cluster_alloc*/
- s->min_cluster_alloc = 1; /*Default*/
- if (s->backing_file_offset == 0 && s->l1_table_offset % 4096 == 0) {
- /*We test to see if the xen magic # exists*/
- exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
- be32_to_cpus(&exthdr->xmagic);
- if(exthdr->xmagic != XEN_MAGIC)
- goto end_xenhdr;
-
- be32_to_cpus(&exthdr->flags);
-		/* Try to detect old tapdisk images. They have to be fixed
-		 * because they store the L1 table in native rather than
-		 * big-endian byte order. */
- if ((exthdr->flags & EXTHDR_L1_BIG_ENDIAN) == 0) {
- QCowHeader_ext *tmphdr = (QCowHeader_ext *)(buf2 + sizeof(QCowHeader));
-			/*
-			 * The image is broken. Fix it. The L1 table has
-			 * already been byte-swapped, so we can write it to
-			 * the image file as it is currently in memory. Then
-			 * swap it back to native endianness for operation.
-			 */
-
- /* Change ENDIAN flag and copy it to store buffer */
- exthdr->flags |= EXTHDR_L1_BIG_ENDIAN;
- tmphdr->flags = cpu_to_be32(exthdr->flags);
-
-
- DPRINTF("qcow: Converting image to big endian L1 table\n");
-
- memcpy(buf2 + s->l1_table_offset, s->l1_table, l1_table_size);
- lseek(fd, 0, SEEK_SET);
- if (write(fd, buf2, l1_table_block) <
- l1_table_size + s->l1_table_offset) {
- DPRINTF("qcow: Failed to write new L1 table\n");
- goto fail;
- }
-
- for(i = 0;i < s->l1_size; i++) {
- cpu_to_be64s(&s->l1_table[i]);
- }
-
- }
-
- /*Finally check the L1 table cksum*/
- be32_to_cpus(&exthdr->cksum);
- cksum = gen_cksum((char *)s->l1_table,
- s->l1_size * sizeof(uint64_t));
- if(exthdr->cksum != cksum)
- goto end_xenhdr;
-
- be32_to_cpus(&exthdr->min_cluster_alloc);
- s->sparse = (exthdr->flags & SPARSE_FILE);
- s->min_cluster_alloc = exthdr->min_cluster_alloc;
- }
-
- end_xenhdr:
-
- /* A segment (i.e. a page) can span multiple clusters */
- max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
- MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;
-
- if (tap_aio_init(&s->aio, bs->size, max_aio_reqs)!=0) {
- DPRINTF("Unable to initialise AIO state\n");
- tap_aio_free(&s->aio);
- goto fail;
- }
- init_fds(dd);
-
- if (!final_cluster)
- s->fd_end = l1_table_block;
- else {
- s->fd_end = lseek(fd, 0, SEEK_END);
- if (s->fd_end == (off_t)-1)
- goto fail;
- }
-
- return 0;
-
-fail:
- DPRINTF("QCOW Open failed\n");
- tap_aio_free(&s->aio);
- free(s->l1_table);
- free(s->l2_cache);
- free(s->cluster_cache);
- free(s->cluster_data);
- close(fd);
- return -1;
-}
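
For reference, the max_aio_reqs sizing in tdqcow_open() above is plain arithmetic: a page larger than a cluster can straddle getpagesize()/cluster_size + 1 clusters, and each of the MAX_REQUESTS ring requests may carry MAX_SEGMENTS_PER_REQ page-sized segments. A standalone sketch of the same computation (the two MAX_ constants live in the blktap headers; the values below are illustrative stand-ins):

#include <stdio.h>
#include <unistd.h>

/* Illustrative stand-ins for the blktap ring constants. */
#define MAX_SEGMENTS_PER_REQ 11
#define MAX_REQUESTS         64

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);  /* typically 4096 */
    int cluster_size = 512;             /* qcow default with a backing file */

    /* A page that is not cluster-aligned touches one extra cluster. */
    int max_aio_reqs = ((page / cluster_size) + 1) *
        MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;

    printf("worst case: %d in-flight AIO requests\n", max_aio_reqs);
    return 0;
}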
-
-static int tdqcow_queue_read(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
- int ret = 0, index_in_cluster, n, i, rsp = 0;
- uint64_t cluster_offset, sec, nr_secs;
-
- sec = sector;
- nr_secs = nb_sectors;
-
- /*Check we can get a lock*/
- for (i = 0; i < nb_sectors; i++)
- if (!tap_aio_can_lock(&s->aio, sector + i))
- return cb(dd, -EBUSY, sector, nb_sectors, id, private);
-
- /*We store a local record of the request*/
- while (nb_sectors > 0) {
- cluster_offset =
- get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
- index_in_cluster = sector & (s->cluster_sectors - 1);
- n = s->cluster_sectors - index_in_cluster;
- if (n > nb_sectors)
- n = nb_sectors;
-
- if (s->aio.iocb_free_count == 0 || !tap_aio_lock(&s->aio, sector))
- return cb(dd, -EBUSY, sector, nb_sectors, id, private);
-
- if(!cluster_offset) {
- tap_aio_unlock(&s->aio, sector);
- ret = cb(dd, BLK_NOT_ALLOCATED,
- sector, n, id, private);
- if (ret == -EBUSY) {
- /* mark remainder of request
- * as busy and try again later */
- return cb(dd, -EBUSY, sector + n,
- nb_sectors - n, id, private);
- } else
- rsp += ret;
- } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
- tap_aio_unlock(&s->aio, sector);
- if (decompress_cluster(s, cluster_offset) < 0) {
- rsp += cb(dd, -EIO, sector,
- nb_sectors, id, private);
- goto done;
- }
- memcpy(buf, s->cluster_cache + index_in_cluster * 512,
- 512 * n);
- rsp += cb(dd, 0, sector, n, id, private);
- } else {
- tap_aio_read(&s->aio, s->fd, n * 512,
- (cluster_offset + index_in_cluster * 512),
- buf, cb, id, sector, private);
- }
- nb_sectors -= n;
- sector += n;
- buf += n * 512;
- }
-done:
- return rsp;
-}
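
The request loop above relies on two power-of-two identities: sector & (cluster_sectors - 1) is the offset inside a cluster, and cluster_sectors - index_in_cluster is the room left before the next boundary. A minimal sketch of the same decomposition, assuming 4 KB clusters (8 sectors of 512 bytes):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    int cluster_sectors = 8;    /* 4 KB clusters, 512-byte sectors */
    uint64_t sector = 13;       /* request start */
    int nb_sectors = 20;        /* request length */

    while (nb_sectors > 0) {
        int index_in_cluster = sector & (cluster_sectors - 1);
        int n = cluster_sectors - index_in_cluster;
        if (n > nb_sectors)
            n = nb_sectors;
        printf("cluster %llu: sectors %llu..%llu\n",
               (unsigned long long)(sector / cluster_sectors),
               (unsigned long long)sector,
               (unsigned long long)(sector + n - 1));
        nb_sectors -= n;
        sector += n;
    }
    return 0;
}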
-
-static int tdqcow_queue_write(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
- int ret = 0, index_in_cluster, n, i;
- uint64_t cluster_offset, sec, nr_secs;
-
- sec = sector;
- nr_secs = nb_sectors;
-
- /*Check we can get a lock*/
- for (i = 0; i < nb_sectors; i++)
- if (!tap_aio_can_lock(&s->aio, sector + i))
- return cb(dd, -EBUSY, sector, nb_sectors, id, private);
-
- /*We store a local record of the request*/
- while (nb_sectors > 0) {
- index_in_cluster = sector & (s->cluster_sectors - 1);
- n = s->cluster_sectors - index_in_cluster;
- if (n > nb_sectors)
- n = nb_sectors;
-
- if (s->aio.iocb_free_count == 0 || !tap_aio_lock(&s->aio, sector))
- return cb(dd, -EBUSY, sector, nb_sectors, id, private);
-
- cluster_offset = get_cluster_offset(s, sector << 9, 1, 0,
- index_in_cluster,
- index_in_cluster+n);
- if (!cluster_offset) {
-			DPRINTF("Oops, no write cluster offset!\n");
- tap_aio_unlock(&s->aio, sector);
- return cb(dd, -EIO, sector, nb_sectors, id, private);
- }
-
- if (s->crypt_method) {
- encrypt_sectors(s, sector, s->cluster_data,
- (unsigned char *)buf, n, 1,
- &s->aes_encrypt_key);
- tap_aio_write(&s->aio, s->fd, n * 512,
- (cluster_offset + index_in_cluster*512),
- (char *)s->cluster_data, cb, id, sector,
- private);
- } else {
- tap_aio_write(&s->aio, s->fd, n * 512,
- (cluster_offset + index_in_cluster*512),
- buf, cb, id, sector, private);
- }
-
- nb_sectors -= n;
- sector += n;
- buf += n * 512;
- }
- s->cluster_cache_offset = -1; /* disable compressed cache */
-
- return 0;
-}
-
-static int tdqcow_submit(struct disk_driver *dd)
-{
- struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;
-
- return tap_aio_submit(&prv->aio);
-}
-
-static int tdqcow_close(struct disk_driver *dd)
-{
- struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
- uint32_t cksum, out;
- int fd, offset;
-
- /*Update the hdr cksum*/
- if(s->min_cluster_alloc == s->l2_size) {
- cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
-		printf("Writing cksum: %d\n", cksum);
- fd = open(s->name, O_WRONLY | O_LARGEFILE); /*Open without O_DIRECT*/
- offset = sizeof(QCowHeader) + sizeof(uint32_t);
- lseek(fd, offset, SEEK_SET);
- out = cpu_to_be32(cksum);
-		if (write(fd, &out, sizeof(uint32_t))) ; /* best effort; result deliberately ignored */
- close(fd);
- }
-
- io_destroy(s->aio.aio_ctx.aio_ctx);
- free(s->name);
- free(s->l1_table);
- free(s->l2_cache);
- free(s->cluster_cache);
- free(s->cluster_data);
- close(s->fd);
- return 0;
-}
-
-static int tdqcow_do_callbacks(struct disk_driver *dd, int sid)
-{
- int ret, i, nr_events, rsp = 0,*ptr;
- struct io_event *ep;
- struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;
-
- if (sid > MAX_IOFD) return 1;
-
- nr_events = tap_aio_get_events(&prv->aio.aio_ctx);
-repeat:
- for (ep = prv->aio.aio_events, i = nr_events; i-- > 0; ep++) {
- struct iocb *io = ep->obj;
- struct pending_aio *pio;
-
- pio = &prv->aio.pending_aio[(long)io->data];
-
- tap_aio_unlock(&prv->aio, pio->sector);
-
- if (prv->crypt_method)
- encrypt_sectors(prv, pio->sector,
- (unsigned char *)pio->buf,
- (unsigned char *)pio->buf,
- pio->nb_sectors, 0,
- &prv->aes_decrypt_key);
-
- rsp += pio->cb(dd, ep->res == io->u.c.nbytes ? 0 : 1,
- pio->sector, pio->nb_sectors,
- pio->id, pio->private);
-
- prv->aio.iocb_free[prv->aio.iocb_free_count++] = io;
- }
-
- if (nr_events) {
- nr_events = tap_aio_more_events(&prv->aio.aio_ctx);
- goto repeat;
- }
-
- tap_aio_continue(&prv->aio.aio_ctx);
-
- return rsp;
-}
-
-int qcow_create(const char *filename, uint64_t total_size,
- const char *backing_file, int sparse)
-{
- int fd, header_size, backing_filename_len, l1_size, i;
- int shift, length, adjust, flags = 0, ret = 0;
- QCowHeader header;
- QCowHeader_ext exthdr;
- char backing_filename[PATH_MAX], *ptr;
- uint64_t tmp, size, total_length;
- struct stat st;
-
- DPRINTF("Qcow_create: size %llu\n",(long long unsigned)total_size);
-
- fd = open(filename,
- O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
- 0644);
- if (fd < 0)
- return -1;
-
- memset(&header, 0, sizeof(header));
- header.magic = cpu_to_be32(QCOW_MAGIC);
- header.version = cpu_to_be32(QCOW_VERSION);
-
- /*Create extended header fields*/
- exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
-
- header_size = sizeof(header) + sizeof(QCowHeader_ext);
- backing_filename_len = 0;
- size = (total_size >> SECTOR_SHIFT);
- if (backing_file) {
- if (strcmp(backing_file, "fat:")) {
- const char *p;
-			/* XXX: this is a hack: we do not attempt to
-			 * check for URL-like syntax */
- p = strchr(backing_file, ':');
- if (p && (p - backing_file) >= 2) {
- /* URL like but exclude "c:" like filenames */
- strncpy(backing_filename, backing_file,
- sizeof(backing_filename));
- } else {
- if (realpath(backing_file, backing_filename) == NULL ||
- stat(backing_filename, &st) != 0) {
- return -1;
- }
- }
- header.backing_file_offset = cpu_to_be64(header_size);
- backing_filename_len = strlen(backing_filename);
- header.backing_file_size = cpu_to_be32(
- backing_filename_len);
- header_size += backing_filename_len;
-
- /*Set to the backing file size*/
- if(get_filesize(backing_filename, &size, &st)) {
- return -1;
- }
-			DPRINTF("Backing file size detected: %lld sectors "
-				"(total %lld [%lld MB])\n",
- (long long)size,
- (long long)(size << SECTOR_SHIFT),
- (long long)(size >> 11));
- } else {
- backing_file = NULL;
- DPRINTF("Setting file size: %lld (total %lld)\n",
- (long long) total_size,
- (long long) (total_size << SECTOR_SHIFT));
- }
- header.mtime = cpu_to_be32(st.st_mtime);
-		header.cluster_bits = 9; /* 512-byte clusters, to avoid copying
-					    unmodified sectors */
- header.l2_bits = 12; /* 32 KB L2 tables */
- exthdr.min_cluster_alloc = cpu_to_be32(1);
- } else {
-		DPRINTF("Setting file size: %lld sectors "
-			"(total %lld [%lld MB])\n",
- (long long) size,
- (long long) (size << SECTOR_SHIFT),
- (long long) (size >> 11));
- header.cluster_bits = 12; /* 4 KB clusters */
- header.l2_bits = 9; /* 4 KB L2 tables */
- exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
- }
- /*Set the header size value*/
- header.size = cpu_to_be64(size * 512);
-
- header_size = (header_size + 7) & ~7;
- if (header_size % 4096 > 0) {
- header_size = ROUNDUP(header_size, 4096);
- }
-
- shift = header.cluster_bits + header.l2_bits;
- l1_size = ROUNDUP(size * 512, 1LL << shift);
-
- header.l1_table_offset = cpu_to_be64(header_size);
- DPRINTF("L1 Table offset: %d, size %d\n",
- header_size,
- (int)(l1_size * sizeof(uint64_t)));
- header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
-
- ptr = calloc(1, l1_size * sizeof(uint64_t));
- exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
- printf("Created cksum: %d\n",exthdr.cksum);
- free(ptr);
-
- /*adjust file length to system page size boundary*/
- length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
- getpagesize());
- if (qtruncate(fd, length, 0)!=0) {
- DPRINTF("ERROR truncating file\n");
- return -1;
- }
-
- if (sparse == 0) {
- /*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
- total_length = length + (l1_size * (1 << 9)) + (size * 512);
- if (qtruncate(fd, total_length, 0)!=0) {
- DPRINTF("ERROR truncating file\n");
- return -1;
- }
- printf("File truncated to length %"PRIu64"\n",total_length);
- } else
- flags = SPARSE_FILE;
-
- flags |= EXTHDR_L1_BIG_ENDIAN;
- exthdr.flags = cpu_to_be32(flags);
-
- /* write all the data */
- lseek(fd, 0, SEEK_SET);
- ret += write(fd, &header, sizeof(header));
- ret += write(fd, &exthdr, sizeof(exthdr));
- if (backing_file)
- ret += write(fd, backing_filename, backing_filename_len);
-
- lseek(fd, header_size, SEEK_SET);
- tmp = 0;
- for (i = 0;i < l1_size; i++) {
- ret += write(fd, &tmp, sizeof(tmp));
- }
-
- close(fd);
-
- return 0;
-}
-
-static int qcow_make_empty(struct tdqcow_state *s)
-{
- uint32_t l1_length = s->l1_size * sizeof(uint64_t);
-
- memset(s->l1_table, 0, l1_length);
- lseek(s->fd, s->l1_table_offset, SEEK_SET);
- if (write(s->fd, s->l1_table, l1_length) < 0)
- return -1;
- if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse)!=0) {
- DPRINTF("ERROR truncating file\n");
- return -1;
- }
-
- memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
- memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
- memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
-
- return 0;
-}
-
-static int qcow_get_cluster_size(struct tdqcow_state *s)
-{
- return s->cluster_size;
-}
-
-/* XXX: put compressed sectors first, then all the cluster aligned
- tables to avoid losing bytes in alignment */
-static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
- const uint8_t *buf)
-{
- z_stream strm;
- int ret, out_len;
- uint8_t *out_buf;
- uint64_t cluster_offset;
-
- out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
- if (!out_buf)
- return -1;
-
- /* best compression, small window, no zlib header */
- memset(&strm, 0, sizeof(strm));
- ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
- Z_DEFLATED, -12,
- 9, Z_DEFAULT_STRATEGY);
- if (ret != 0) {
- free(out_buf);
- return -1;
- }
-
- strm.avail_in = s->cluster_size;
- strm.next_in = (uint8_t *)buf;
- strm.avail_out = s->cluster_size;
- strm.next_out = out_buf;
-
- ret = deflate(&strm, Z_FINISH);
- if (ret != Z_STREAM_END && ret != Z_OK) {
- free(out_buf);
- deflateEnd(&strm);
- return -1;
- }
- out_len = strm.next_out - out_buf;
-
- deflateEnd(&strm);
-
- if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
- /* could not compress: write normal cluster */
- //tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors);
- } else {
- cluster_offset = get_cluster_offset(s, sector_num << 9, 2,
- out_len, 0, 0);
- cluster_offset &= s->cluster_offset_mask;
- lseek(s->fd, cluster_offset, SEEK_SET);
- if (write(s->fd, out_buf, out_len) != out_len) {
- free(out_buf);
- return -1;
- }
- }
-
- free(out_buf);
- return 0;
-}
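
The negative windowBits argument (-12) passed to deflateInit2() above selects zlib's raw deflate stream, with no zlib header or trailing checksum; the read side has to pass the same -12 to inflateInit2(), as decompress_buffer() in the qcow2 driver below does. A self-contained round trip under those settings:

#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
    unsigned char in[4096], packed[4096 + 128], out[4096];
    z_stream c = {0}, d = {0};
    int packed_len;

    memset(in, 'x', sizeof(in));        /* highly compressible input */

    /* Raw deflate: negative windowBits, no zlib header. */
    if (deflateInit2(&c, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -12, 9,
                     Z_DEFAULT_STRATEGY) != Z_OK)
        return 1;
    c.next_in = in;       c.avail_in = sizeof(in);
    c.next_out = packed;  c.avail_out = sizeof(packed);
    if (deflate(&c, Z_FINISH) != Z_STREAM_END)
        return 1;
    packed_len = sizeof(packed) - c.avail_out;
    deflateEnd(&c);

    /* The decompressor must use the same raw format. */
    if (inflateInit2(&d, -12) != Z_OK)
        return 1;
    d.next_in = packed;   d.avail_in = packed_len;
    d.next_out = out;     d.avail_out = sizeof(out);
    if (inflate(&d, Z_FINISH) != Z_STREAM_END)
        return 1;
    inflateEnd(&d);

    printf("%d -> %d bytes, round trip %s\n", (int)sizeof(in), packed_len,
           memcmp(in, out, sizeof(in)) ? "FAILED" : "ok");
    return 0;
}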
-
-static int tdqcow_get_parent_id(struct disk_driver *dd, struct disk_id *id)
-{
- off_t off;
- char *buf, *filename;
- int len, secs, err = -EINVAL;
- struct tdqcow_state *child = (struct tdqcow_state *)dd->private;
-
- if (!child->backing_file_offset)
- return TD_NO_PARENT;
-
- /* read the backing file name */
- len = child->backing_file_size;
- off = child->backing_file_offset - (child->backing_file_offset % 512);
- secs = (len + (child->backing_file_offset - off) + 511) >> 9;
-
- if (posix_memalign((void **)&buf, 512, secs << 9))
- return -1;
-
- if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
- goto out;
-
- if (read(child->fd, buf, secs << 9) != secs << 9)
- goto out;
- filename = buf + (child->backing_file_offset - off);
- filename[len] = '\0';
-
- id->name = strdup(filename);
- id->drivertype = DISK_TYPE_AIO;
- err = 0;
- out:
- free(buf);
- return err;
-}
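
Because the image may be opened O_DIRECT, the backing file name above is read at a sector-aligned offset and as a whole number of sectors, then located at its byte offset inside the aligned buffer. The alignment arithmetic, worked through with illustrative header values:

#include <stdio.h>

int main(void)
{
    long long backing_file_offset = 0x250;  /* illustrative header value */
    int len = 40;                           /* backing file name length */

    long long off = backing_file_offset - (backing_file_offset % 512);
    int secs = (int)((len + (backing_file_offset - off) + 511) >> 9);

    /* off = 0x200, name at buffer offset 0x50, one 512-byte sector */
    printf("read %d sector(s) at %#llx, name at +%#llx\n", secs,
           (unsigned long long)off,
           (unsigned long long)(backing_file_offset - off));
    return 0;
}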
-
-static int tdqcow_validate_parent(struct disk_driver *child,
- struct disk_driver *parent, td_flag_t flags)
-{
- struct stat stats;
- uint64_t psize, csize;
-
- if (stat(parent->name, &stats))
- return -EINVAL;
- if (get_filesize(parent->name, &psize, &stats))
- return -EINVAL;
-
- if (stat(child->name, &stats))
- return -EINVAL;
- if (get_filesize(child->name, &csize, &stats))
- return -EINVAL;
-
- if (csize != psize)
- return -EINVAL;
-
- return 0;
-}
-
-struct tap_disk tapdisk_qcow = {
- .disk_type = "tapdisk_qcow",
- .private_data_size = sizeof(struct tdqcow_state),
- .td_open = tdqcow_open,
- .td_queue_read = tdqcow_queue_read,
- .td_queue_write = tdqcow_queue_write,
- .td_submit = tdqcow_submit,
- .td_close = tdqcow_close,
- .td_do_callbacks = tdqcow_do_callbacks,
- .td_get_parent_id = tdqcow_get_parent_id,
- .td_validate_parent = tdqcow_validate_parent
-};
diff --git a/tools/blktap/drivers/block-qcow2.c b/tools/blktap/drivers/block-qcow2.c
deleted file mode 100644
index ceda4f0..0000000
--- a/tools/blktap/drivers/block-qcow2.c
+++ /dev/null
@@ -1,2098 +0,0 @@
-/*
- * Block driver for the QCOW version 2 format
- *
- * Copyright (c) 2004-2006 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <zlib.h>
-#include "aes.h"
-#include <assert.h>
-#include <stdint.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/stat.h>
-
-#include "tapdisk.h"
-#include "tapaio.h"
-#include "bswap.h"
-#include "blk.h"
-
-#define USE_AIO
-
-#define qemu_malloc malloc
-#define qemu_mallocz(size) calloc(1, size)
-#define qemu_free free
-
-#ifndef O_BINARY
-#define O_BINARY 0
-#endif
-
-/* *BSD has no O_LARGEFILE */
-#ifndef O_LARGEFILE
-#define O_LARGEFILE 0
-#endif
-
-#define BLOCK_FLAG_ENCRYPT 1
-
-/*
- Differences with QCOW:
-
- - Support for multiple incremental snapshots.
- - Memory management by reference counts.
-	- Clusters which have a reference count of one have the
-	  QCOW_OFLAG_COPIED bit set, to optimize write performance.
- - Size of compressed clusters is stored in sectors to reduce bit usage
- in the cluster offsets.
- - Support for storing additional data (such as the VM state) in the
- snapshots.
- - If a backing store is used, the cluster size is not constrained
- (could be backported to QCOW).
-	- L2 tables are always exactly one cluster in size.
-*/
-
-//#define DEBUG_ALLOC
-//#define DEBUG_ALLOC2
-
-#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
-#define QCOW_VERSION 2
-
-#define QCOW_CRYPT_NONE 0
-#define QCOW_CRYPT_AES 1
-
-/* indicate that the refcount of the referenced cluster is exactly one. */
-#define QCOW_OFLAG_COPIED (1LL << 63)
-/* indicate that the cluster is compressed (they never have the copied flag) */
-#define QCOW_OFLAG_COMPRESSED (1LL << 62)
-
-#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
-
-#ifndef offsetof
-#define offsetof(type, field) ((size_t) &((type *)0)->field)
-#endif
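
An L2 entry is a single 64-bit cluster descriptor carrying the two flag bits above; for compressed clusters the remaining bits split into a host offset and a sector count at csize_shift = 62 - (cluster_bits - 8), the same arithmetic qcow_open() computes below. A small decoding sketch under those assumptions:

#include <stdio.h>
#include <stdint.h>

#define QCOW_OFLAG_COPIED     (1ULL << 63)
#define QCOW_OFLAG_COMPRESSED (1ULL << 62)

/* Decode one 64-bit L2 entry; cluster_bits = 12 means 4 KB clusters. */
static void decode_l2_entry(uint64_t e, int cluster_bits)
{
    int csize_shift = 62 - (cluster_bits - 8);
    uint64_t csize_mask = (1ULL << (cluster_bits - 8)) - 1;

    if (e == 0)
        printf("unallocated\n");
    else if (e & QCOW_OFLAG_COMPRESSED)
        printf("compressed: host offset %#llx, %d sectors on disk\n",
               (unsigned long long)(e & ((1ULL << csize_shift) - 1)),
               (int)((e >> csize_shift) & csize_mask) + 1);
    else
        printf("plain: host offset %#llx%s\n",
               (unsigned long long)(e & ~QCOW_OFLAG_COPIED),
               (e & QCOW_OFLAG_COPIED) ? " (refcount == 1)" : "");
}

int main(void)
{
    decode_l2_entry(0, 12);
    decode_l2_entry(0x10000ULL | QCOW_OFLAG_COPIED, 12);
    decode_l2_entry(0x20000ULL | QCOW_OFLAG_COMPRESSED |
                    (3ULL << (62 - 4)), 12);    /* 4 sectors on disk */
    return 0;
}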
-
-typedef struct QCowHeader {
- uint32_t magic;
- uint32_t version;
- uint64_t backing_file_offset;
- uint32_t backing_file_size;
- uint32_t cluster_bits;
- uint64_t size; /* in bytes */
-
- uint32_t crypt_method;
- uint32_t l1_size; /* XXX: save number of clusters instead ? */
- uint64_t l1_table_offset;
- uint64_t refcount_table_offset;
- uint32_t refcount_table_clusters;
- uint32_t nb_snapshots;
- uint64_t snapshots_offset;
-} QCowHeader;
-
-typedef struct __attribute__((packed)) QCowSnapshotHeader {
- /* header is 8 byte aligned */
- uint64_t l1_table_offset;
-
- uint32_t l1_size;
- uint16_t id_str_size;
- uint16_t name_size;
-
- uint32_t date_sec;
- uint32_t date_nsec;
-
- uint64_t vm_clock_nsec;
-
- uint32_t vm_state_size;
- uint32_t extra_data_size; /* for extension */
- /* extra data follows */
- /* id_str follows */
- /* name follows */
-} QCowSnapshotHeader;
-
-#define L2_CACHE_SIZE 16
-
-typedef struct QCowSnapshot {
- uint64_t l1_table_offset;
- uint32_t l1_size;
- char *id_str;
- char *name;
- uint32_t vm_state_size;
- uint32_t date_sec;
- uint32_t date_nsec;
- uint64_t vm_clock_nsec;
-} QCowSnapshot;
-
-typedef struct BDRVQcowState {
-
- /* blktap additions */
- int fd;
- int poll_pipe[2]; /* dummy fd for polling on */
- char* name;
- int encrypted;
- char backing_file[1024];
- struct disk_driver* backing_hd;
-
- int64_t total_sectors;
-
- tap_aio_context_t async;
-
- /* Original qemu variables */
- int cluster_bits;
- int cluster_size;
- int cluster_sectors;
- int l2_bits;
- int l2_size;
- int l1_size;
- int l1_vm_state_index;
- int csize_shift;
- int csize_mask;
- uint64_t cluster_offset_mask;
- uint64_t l1_table_offset;
- uint64_t *l1_table;
- uint64_t *l2_cache;
- uint64_t l2_cache_offsets[L2_CACHE_SIZE];
- uint32_t l2_cache_counts[L2_CACHE_SIZE];
- uint8_t *cluster_cache;
- uint8_t *cluster_data;
- uint64_t cluster_cache_offset;
-
- uint64_t *refcount_table;
- uint64_t refcount_table_offset;
- uint32_t refcount_table_size;
- uint64_t refcount_block_cache_offset;
- uint16_t *refcount_block_cache;
- int64_t free_cluster_index;
- int64_t free_byte_offset;
-
- uint32_t crypt_method; /* current crypt method, 0 if no key yet */
- uint32_t crypt_method_header;
- AES_KEY aes_encrypt_key;
- AES_KEY aes_decrypt_key;
- uint64_t snapshots_offset;
- int snapshots_size;
- int nb_snapshots;
- QCowSnapshot *snapshots;
-} BDRVQcowState;
-
-static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
-static int qcow_read(struct disk_driver *bs, uint64_t sector_num,
- uint8_t *buf, int nb_sectors);
-
-static int qcow_read_snapshots(struct disk_driver *bs);
-static void qcow_free_snapshots(struct disk_driver *bs);
-
-static int refcount_init(struct disk_driver *bs);
-static void refcount_close(struct disk_driver *bs);
-static int get_refcount(struct disk_driver *bs, int64_t cluster_index);
-static int update_cluster_refcount(struct disk_driver *bs,
- int64_t cluster_index,
- int addend);
-static void update_refcount(struct disk_driver *bs,
- int64_t offset, int64_t length,
- int addend);
-static int64_t alloc_clusters(struct disk_driver *bs, int64_t size);
-static int64_t alloc_bytes(struct disk_driver *bs, int size);
-static void free_clusters(struct disk_driver *bs,
- int64_t offset, int64_t size);
-#ifdef DEBUG_ALLOC
-static void check_refcounts(struct disk_driver *bs);
-#endif
-
-static int qcow_sync_read(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *prv);
-
-/**
- * Read with byte offsets
- */
-static int bdrv_pread(int fd, int64_t offset, void *buf, int count)
-{
- int ret;
-
- if (lseek(fd, offset, SEEK_SET) == -1) {
- DPRINTF("bdrv_pread failed seek (%#"PRIx64").\n", offset);
- return -1;
- }
-
- ret = read(fd, buf, count);
- if (ret < 0) {
- if (lseek(fd, 0, SEEK_END) >= offset) {
- DPRINTF("bdrv_pread read failed (%#"PRIx64", END = %#"PRIx64").\n",
- offset, lseek(fd, 0, SEEK_END));
- return -1;
- }
-
-		/* Read beyond end of file: return zeros. */
- memset(buf, 0, count);
- ret = count;
- } else if (ret < count) {
- /* Read beyond end of file. Filling up with zeros. */
- memset(buf + ret, 0, count - ret);
- ret = count;
- }
- return ret;
-}
-
-/**
- * Write with byte offsets
- */
-static int bdrv_pwrite(int fd, int64_t offset, const void *buf, int count)
-{
- if (lseek(fd, offset, SEEK_SET) == -1) {
- DPRINTF("bdrv_pwrite failed seek (%#"PRIx64").\n", offset);
- return -1;
- }
-
- return write(fd, buf, count);
-}
-
-
-/**
- * Read with sector offsets
- */
-static int bdrv_read(int fd, int64_t offset, void *buf, int count)
-{
- return bdrv_pread(fd, 512 * offset, buf, 512 * count);
-}
-
-/**
- * Write with sector offsets
- */
-static int bdrv_write(int fd, int64_t offset, const void *buf, int count)
-{
-	/* count is in sectors, mirroring bdrv_read() above */
-	return bdrv_pwrite(fd, 512 * offset, buf, 512 * count);
-}
-
-
-static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
-{
- const QCowHeader *cow_header = (const void *)buf;
-
- if (buf_size >= sizeof(QCowHeader) &&
- be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
- be32_to_cpu(cow_header->version) == QCOW_VERSION)
- return 100;
- else
- return 0;
-}
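
qcow_probe() needs only the first eight bytes of the file: the magic and version words, stored big endian. A hypothetical standalone checker applying the same test:

#include <stdio.h>
#include <stdint.h>

#define QCOW_MAGIC   (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define QCOW_VERSION 2

int main(int argc, char **argv)
{
    unsigned char h[8];
    uint32_t magic, version;
    FILE *f;

    if (argc != 2 || !(f = fopen(argv[1], "rb")))
        return 1;
    if (fread(h, 1, sizeof(h), f) != sizeof(h)) {
        fclose(f);
        return 1;
    }
    fclose(f);

    /* On-disk header fields are big endian. */
    magic   = ((uint32_t)h[0] << 24) | (h[1] << 16) | (h[2] << 8) | h[3];
    version = ((uint32_t)h[4] << 24) | (h[5] << 16) | (h[6] << 8) | h[7];

    printf("%s\n", magic == QCOW_MAGIC && version == QCOW_VERSION ?
           "qcow2 image" : "not a qcow2 image");
    return 0;
}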
-
-static int qcow_open(struct disk_driver *bs, const char *filename, td_flag_t flags)
-{
- BDRVQcowState *s = bs->private;
- int len, i, shift, ret, max_aio_reqs;
- QCowHeader header;
-
- int fd, o_flags;
-
- o_flags = O_LARGEFILE | ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
-
- DPRINTF("Opening %s\n", filename);
- fd = open(filename, o_flags);
- if (fd < 0) {
- DPRINTF("Unable to open %s (%d)\n", filename, 0 - errno);
- return -1;
- }
-
- s->fd = fd;
- if (asprintf(&s->name,"%s", filename) == -1) {
- close(fd);
- return -1;
- }
-
- ret = read(fd, &header, sizeof(header));
- if (ret != sizeof(header)) {
- DPRINTF(" ret = %d, errno = %d\n", ret, errno);
- goto fail;
- }
-
- be32_to_cpus(&header.magic);
- be32_to_cpus(&header.version);
- be64_to_cpus(&header.backing_file_offset);
- be32_to_cpus(&header.backing_file_size);
- be64_to_cpus(&header.size);
- be32_to_cpus(&header.cluster_bits);
- be32_to_cpus(&header.crypt_method);
- be64_to_cpus(&header.l1_table_offset);
- be32_to_cpus(&header.l1_size);
- be64_to_cpus(&header.refcount_table_offset);
- be32_to_cpus(&header.refcount_table_clusters);
- be64_to_cpus(&header.snapshots_offset);
- be32_to_cpus(&header.nb_snapshots);
-
- if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
- goto fail;
-
- if (header.size <= 1 ||
- header.cluster_bits < 9 ||
- header.cluster_bits > 16)
- goto fail;
-
- s->crypt_method = 0;
- if (header.crypt_method > QCOW_CRYPT_AES)
- goto fail;
- s->crypt_method_header = header.crypt_method;
- if (s->crypt_method_header)
- s->encrypted = 1;
- s->cluster_bits = header.cluster_bits;
- s->cluster_size = 1 << s->cluster_bits;
- s->cluster_sectors = 1 << (s->cluster_bits - 9);
- s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
- s->l2_size = 1 << s->l2_bits;
- s->total_sectors = header.size / 512;
- s->csize_shift = (62 - (s->cluster_bits - 8));
- s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
- s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
- s->refcount_table_offset = header.refcount_table_offset;
- s->refcount_table_size =
- header.refcount_table_clusters << (s->cluster_bits - 3);
-
- s->snapshots_offset = header.snapshots_offset;
- s->nb_snapshots = header.nb_snapshots;
-
-// DPRINTF("-- cluster_bits/size/sectors = %d/%d/%d\n",
-// s->cluster_bits, s->cluster_size, s->cluster_sectors);
-// DPRINTF("-- l2_bits/sizes = %d/%d\n",
-// s->l2_bits, s->l2_size);
-
- /* Set sector size and number */
- bs->td_state->sector_size = 512;
- bs->td_state->size = header.size / 512;
- bs->td_state->info = 0;
-
- /* read the level 1 table */
- s->l1_size = header.l1_size;
- shift = s->cluster_bits + s->l2_bits;
- s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
- /* the L1 table must contain at least enough entries to put
- header.size bytes */
- if (s->l1_size < s->l1_vm_state_index) {
-		DPRINTF("L1 table too small\n");
- goto fail;
- }
- s->l1_table_offset = header.l1_table_offset;
-
- s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
- if (!s->l1_table)
- goto fail;
-
-
- if (lseek(fd, s->l1_table_offset, SEEK_SET) == -1)
- goto fail;
-
- if (read(fd, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
- s->l1_size * sizeof(uint64_t)) {
-
- DPRINTF("Could not read L1 table\n");
- goto fail;
- }
-
- for(i = 0;i < s->l1_size; i++) {
- be64_to_cpus(&s->l1_table[i]);
- }
- /* alloc L2 cache */
- s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
- if (!s->l2_cache)
- goto fail;
- s->cluster_cache = qemu_malloc(s->cluster_size);
- if (!s->cluster_cache)
- goto fail;
- /* one more sector for decompressed data alignment */
- s->cluster_data = qemu_malloc(s->cluster_size + 512);
- if (!s->cluster_data)
- goto fail;
- s->cluster_cache_offset = -1;
-
- if (refcount_init(bs) < 0)
- goto fail;
-
- /* read the backing file name */
- s->backing_file[0] = '\0';
- if (header.backing_file_offset != 0) {
- len = header.backing_file_size;
- if (len > 1023)
- len = 1023;
-
- if (lseek(fd, header.backing_file_offset, SEEK_SET) == -1) {
- DPRINTF("Could not lseek to %#"PRIx64"\n", header.backing_file_offset);
- goto fail;
- }
-
- if (read(fd, s->backing_file, len) != len) {
- DPRINTF("Could not read %#x bytes from %#"PRIx64": %s\n",
- len, header.backing_file_offset,
- strerror(errno));
- goto fail;
- }
-
- s->backing_file[len] = '\0';
- }
-
-#if 0
- s->backing_hd = NULL;
- if (qcow_read_snapshots(bs) < 0) {
- DPRINTF("Could not read backing files\n");
- goto fail;
- }
-#endif
-
-#ifdef DEBUG_ALLOC
- check_refcounts(bs);
-#endif
-
- /* Initialize fds */
- for(i = 0; i < MAX_IOFD; i++)
- bs->io_fd[i] = 0;
-
-#ifdef USE_AIO
- /* Initialize AIO */
-
- /* A segment (i.e. a page) can span multiple clusters */
- max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
- MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;
-
- if (tap_aio_init(&s->async, bs->td_state->size, max_aio_reqs)) {
- DPRINTF("Unable to initialise AIO state\n");
- tap_aio_free(&s->async);
- goto fail;
- }
-
- bs->io_fd[0] = s->async.aio_ctx.pollfd;
-#else
- /* Synchronous IO */
- if (pipe(s->poll_pipe))
- goto fail;
-
- bs->io_fd[0] = s->poll_pipe[0];
-#endif
-
- return 0;
-
- fail:
- DPRINTF("qcow_open failed\n");
-
-#ifdef USE_AIO
- tap_aio_free(&s->async);
-#endif
-
- qcow_free_snapshots(bs);
- refcount_close(bs);
- qemu_free(s->l1_table);
- qemu_free(s->l2_cache);
- qemu_free(s->cluster_cache);
- qemu_free(s->cluster_data);
- close(fd);
- return -1;
-}
-
-static int qcow_set_key(struct disk_driver *bs, const char *key)
-{
- BDRVQcowState *s = bs->private;
- uint8_t keybuf[16];
- int len, i;
-
- memset(keybuf, 0, 16);
- len = strlen(key);
- if (len > 16)
- len = 16;
- /* XXX: we could compress the chars to 7 bits to increase
- entropy */
- for(i = 0;i < len;i++) {
- keybuf[i] = key[i];
- }
- s->crypt_method = s->crypt_method_header;
-
- if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
- return -1;
- if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
- return -1;
-#if 0
- /* test */
- {
- uint8_t in[16];
- uint8_t out[16];
- uint8_t tmp[16];
- for(i=0;i<16;i++)
- in[i] = i;
- AES_encrypt(in, tmp, &s->aes_encrypt_key);
- AES_decrypt(tmp, out, &s->aes_decrypt_key);
- for(i = 0; i < 16; i++)
- printf(" %02x", tmp[i]);
- printf("\n");
- for(i = 0; i < 16; i++)
- printf(" %02x", out[i]);
- printf("\n");
- }
-#endif
- return 0;
-}
-
-/* The crypt function is compatible with the linux cryptoloop
- algorithm for < 4 GB images. NOTE: out_buf == in_buf is
- supported */
-static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
- uint8_t *out_buf, const uint8_t *in_buf,
- int nb_sectors, int enc,
- const AES_KEY *key)
-{
- union {
- uint64_t ll[2];
- uint8_t b[16];
- } ivec;
- int i;
-
- for(i = 0; i < nb_sectors; i++) {
- ivec.ll[0] = cpu_to_le64(sector_num);
- ivec.ll[1] = 0;
- AES_cbc_encrypt(in_buf, out_buf, 512, key,
- ivec.b, enc);
- sector_num++;
- in_buf += 512;
- out_buf += 512;
- }
-}
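
Because the IV is just the little-endian sector number, the transform is deterministic per sector, and any buffer must survive an encrypt/decrypt round trip through the two keys set up by qcow_set_key(). A hypothetical self-check built on the function above (crypt_roundtrip is not part of the driver):

/* Hypothetical self-check; relies on encrypt_sectors() and the AES keys
 * prepared by qcow_set_key() above. Returns 1 on success. */
static int crypt_roundtrip(BDRVQcowState *s, int64_t sector_num)
{
    uint8_t plain[512], cipher[512], check[512];

    memset(plain, 0xa5, sizeof(plain));
    encrypt_sectors(s, sector_num, cipher, plain, 1, 1,
                    &s->aes_encrypt_key);
    encrypt_sectors(s, sector_num, check, cipher, 1, 0,
                    &s->aes_decrypt_key);
    return memcmp(plain, check, sizeof(plain)) == 0;
}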
-
-static int copy_sectors(struct disk_driver *bs, uint64_t start_sect,
- uint64_t cluster_offset, int n_start, int n_end)
-{
- BDRVQcowState *s = bs->private;
- int n, ret;
-
- n = n_end - n_start;
- if (n <= 0)
- return 0;
-
- ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
-
- if (ret < 0)
- return ret;
- if (s->crypt_method) {
- encrypt_sectors(s, start_sect + n_start,
- s->cluster_data,
- s->cluster_data, n, 1,
- &s->aes_encrypt_key);
- }
-
-
- ret = bdrv_pwrite(s->fd, cluster_offset + 512*n_start, s->cluster_data, n*512);
-
- if (ret < 0)
- return ret;
- return 0;
-}
-
-static void l2_cache_reset(struct disk_driver *bs)
-{
- BDRVQcowState *s = bs->private;
-
- memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
- memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
- memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
-}
-
-static inline int l2_cache_new_entry(struct disk_driver *bs)
-{
- BDRVQcowState *s = bs->private;
- uint32_t min_count;
- int min_index, i;
-
- /* find a new entry in the least used one */
- min_index = 0;
- min_count = 0xffffffff;
- for(i = 0; i < L2_CACHE_SIZE; i++) {
- if (s->l2_cache_counts[i] < min_count) {
- min_count = s->l2_cache_counts[i];
- min_index = i;
- }
- }
- return min_index;
-}
-
-static int64_t align_offset(int64_t offset, int n)
-{
- offset = (offset + n - 1) & ~(n - 1);
- return offset;
-}
-
-static int grow_l1_table(struct disk_driver *bs, int min_size)
-{
- BDRVQcowState *s = bs->private;
- int new_l1_size, new_l1_size2, ret, i;
- uint64_t *new_l1_table;
- uint64_t new_l1_table_offset;
- uint64_t data64;
- uint32_t data32;
-
- new_l1_size = s->l1_size;
- if (min_size <= new_l1_size)
- return 0;
- while (min_size > new_l1_size) {
- new_l1_size = (new_l1_size * 3 + 1) / 2;
- }
-
-#ifdef DEBUG_ALLOC2
- DPRINTF("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
-#endif
-
- new_l1_size2 = sizeof(uint64_t) * new_l1_size;
- new_l1_table = qemu_mallocz(new_l1_size2);
- if (!new_l1_table)
- return -ENOMEM;
- memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
-
- /* write new table (align to cluster) */
- new_l1_table_offset = alloc_clusters(bs, new_l1_size2);
-
- for(i = 0; i < s->l1_size; i++)
- new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
-
-
- if (lseek(s->fd, new_l1_table_offset, SEEK_SET) == -1)
- goto fail;
-
- ret = write(s->fd, new_l1_table, new_l1_size2);
- if (ret != new_l1_size2)
- goto fail;
-
-
- for(i = 0; i < s->l1_size; i++)
- new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
-
- /* set new table */
- data64 = cpu_to_be64(new_l1_table_offset);
-
- if (lseek(s->fd, offsetof(QCowHeader, l1_table_offset), SEEK_SET) == -1)
- goto fail;
-
- if (write(s->fd, &data64, sizeof(data64)) != sizeof(data64))
- goto fail;
-
- data32 = cpu_to_be32(new_l1_size);
-
- if (bdrv_pwrite(s->fd, offsetof(QCowHeader, l1_size),
- &data32, sizeof(data32)) != sizeof(data32))
- goto fail;
- qemu_free(s->l1_table);
- free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
- s->l1_table_offset = new_l1_table_offset;
- s->l1_table = new_l1_table;
- s->l1_size = new_l1_size;
- return 0;
- fail:
- qemu_free(s->l1_table);
- return -EIO;
-}
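
The growth rule new_l1_size = (new_l1_size * 3 + 1) / 2 enlarges the table by roughly 1.5x per step, so any target size is reached in O(log n) grow operations. A sketch printing the sequence it produces:

#include <stdio.h>

int main(void)
{
    int size = 1, min_size = 1000;

    /* Same rule as grow_l1_table(): about 1.5x per step. */
    while (size < min_size) {
        size = (size * 3 + 1) / 2;
        printf("%d ", size);
    }
    printf("\n");  /* 2 3 5 8 12 18 27 41 62 93 140 210 315 473 710 1065 */
    return 0;
}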
-
-/* 'allocate' is:
- *
- * 0 not to allocate.
- *
- * 1 to allocate a normal cluster (for sector indexes 'n_start' to
- * 'n_end')
- *
- * 2 to allocate a compressed cluster of size
- * 'compressed_size'. 'compressed_size' must be > 0 and <
- * cluster_size
- *
- * return 0 if not allocated.
- */
-static uint64_t get_cluster_offset(struct disk_driver *bs,
- uint64_t offset, int allocate,
- int compressed_size,
- int n_start, int n_end)
-{
- BDRVQcowState *s = bs->private;
- int min_index, i, j, l1_index, l2_index, ret;
- uint64_t l2_offset, *l2_table, cluster_offset, tmp, old_l2_offset;
-
- l1_index = offset >> (s->l2_bits + s->cluster_bits);
- if (l1_index >= s->l1_size) {
- /* outside l1 table is allowed: we grow the table if needed */
- if (!allocate)
- return 0;
-
- if (grow_l1_table(bs, l1_index + 1) < 0) {
-			DPRINTF("Could not grow L1 table\n");
- return 0;
- }
- }
-
- l2_offset = s->l1_table[l1_index];
- if (!l2_offset) {
- if (!allocate)
- return 0;
-
- l2_allocate:
- old_l2_offset = l2_offset;
- /* allocate a new l2 entry */
- l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
-
- /* update the L1 entry */
- s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
- tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
- if (bdrv_pwrite(s->fd, s->l1_table_offset + l1_index * sizeof(tmp),
- &tmp, sizeof(tmp)) != sizeof(tmp))
- return 0;
- min_index = l2_cache_new_entry(bs);
- l2_table = s->l2_cache + (min_index << s->l2_bits);
-
- if (old_l2_offset == 0) {
- memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
- } else {
- if (bdrv_pread(s->fd, old_l2_offset,
- l2_table, s->l2_size * sizeof(uint64_t)) !=
- s->l2_size * sizeof(uint64_t))
- return 0;
- }
- if (bdrv_pwrite(s->fd, l2_offset,
- l2_table, s->l2_size * sizeof(uint64_t)) !=
- s->l2_size * sizeof(uint64_t))
- return 0;
- } else {
- if (!(l2_offset & QCOW_OFLAG_COPIED)) {
- if (allocate) {
- free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
- goto l2_allocate;
- }
- } else {
- l2_offset &= ~QCOW_OFLAG_COPIED;
- }
- for(i = 0; i < L2_CACHE_SIZE; i++) {
- if (l2_offset == s->l2_cache_offsets[i]) {
- /* increment the hit count */
- if (++s->l2_cache_counts[i] == 0xffffffff) {
- for(j = 0; j < L2_CACHE_SIZE; j++) {
- s->l2_cache_counts[j] >>= 1;
- }
- }
- l2_table = s->l2_cache + (i << s->l2_bits);
- goto found;
- }
- }
- /* not found: load a new entry in the least used one */
- min_index = l2_cache_new_entry(bs);
- l2_table = s->l2_cache + (min_index << s->l2_bits);
-
- if (bdrv_pread(s->fd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
- s->l2_size * sizeof(uint64_t))
- {
-			DPRINTF("Could not read L2 table\n");
- return 0;
- }
- }
- s->l2_cache_offsets[min_index] = l2_offset;
- s->l2_cache_counts[min_index] = 1;
-found:
- l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
-
- cluster_offset = be64_to_cpu(l2_table[l2_index]);
- if (!cluster_offset) {
- if (!allocate) {
- return cluster_offset;
- }
- } else if (!(cluster_offset & QCOW_OFLAG_COPIED)) {
- if (!allocate)
- return cluster_offset;
- /* free the cluster */
- if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
- int nb_csectors;
- nb_csectors = ((cluster_offset >> s->csize_shift) &
- s->csize_mask) + 1;
- free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
- nb_csectors * 512);
- } else {
- free_clusters(bs, cluster_offset, s->cluster_size);
- }
- } else {
- cluster_offset &= ~QCOW_OFLAG_COPIED;
- return cluster_offset;
- }
- if (allocate == 1) {
- /* allocate a new cluster */
- cluster_offset = alloc_clusters(bs, s->cluster_size);
-
- /* we must initialize the cluster content which won't be
- written */
- if ((n_end - n_start) < s->cluster_sectors) {
- uint64_t start_sect;
-
- start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
- ret = copy_sectors(bs, start_sect,
- cluster_offset, 0, n_start);
- if (ret < 0)
- return 0;
- ret = copy_sectors(bs, start_sect,
- cluster_offset, n_end, s->cluster_sectors);
- if (ret < 0)
- return 0;
- }
- tmp = cpu_to_be64(cluster_offset | QCOW_OFLAG_COPIED);
- } else {
- int nb_csectors;
- cluster_offset = alloc_bytes(bs, compressed_size);
- nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
- (cluster_offset >> 9);
- cluster_offset |= QCOW_OFLAG_COMPRESSED |
- ((uint64_t)nb_csectors << s->csize_shift);
- /* compressed clusters never have the copied flag */
- tmp = cpu_to_be64(cluster_offset);
- }
- /* update L2 table */
- l2_table[l2_index] = tmp;
-
- if (bdrv_pwrite(s->fd, l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) != sizeof(tmp))
- return 0;
- return cluster_offset;
-}
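
The lookup above is a two-level page-table walk: the guest byte offset splits into an L1 index, an L2 index and an intra-cluster offset. A worked sketch, assuming cluster_bits = 16 and l2_bits = 13 (one cluster of 8-byte L2 entries):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    int cluster_bits = 16, l2_bits = 13;  /* 64 KB clusters */
    uint64_t offset = 0x12345678ULL;      /* guest byte offset */

    int l1_index = offset >> (l2_bits + cluster_bits);
    int l2_index = (offset >> cluster_bits) & ((1 << l2_bits) - 1);
    uint64_t in_cluster = offset & ((1ULL << cluster_bits) - 1);

    /* 0x12345678 -> L1[0], L2[0x1234], byte 0x5678 in the cluster */
    printf("L1[%d] -> L2[%#x] -> +%#llx\n", l1_index, l2_index,
           (unsigned long long)in_cluster);
    return 0;
}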
-
-static int qcow_is_allocated(struct disk_driver *bs, int64_t sector_num,
- int nb_sectors, int *pnum)
-{
- BDRVQcowState *s = bs->private;
- int index_in_cluster, n;
- uint64_t cluster_offset;
-
- cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
- index_in_cluster = sector_num & (s->cluster_sectors - 1);
- n = s->cluster_sectors - index_in_cluster;
- if (n > nb_sectors)
- n = nb_sectors;
- *pnum = n;
- return (cluster_offset != 0);
-}
-
-static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
- const uint8_t *buf, int buf_size)
-{
- z_stream strm1, *strm = &strm1;
- int ret, out_len;
-
- memset(strm, 0, sizeof(*strm));
-
- strm->next_in = (uint8_t *)buf;
- strm->avail_in = buf_size;
- strm->next_out = out_buf;
- strm->avail_out = out_buf_size;
-
- ret = inflateInit2(strm, -12);
- if (ret != Z_OK)
- return -1;
- ret = inflate(strm, Z_FINISH);
- out_len = strm->next_out - out_buf;
- if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
- out_len != out_buf_size) {
- inflateEnd(strm);
- return -1;
- }
- inflateEnd(strm);
- return 0;
-}
-
-static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
-{
- int ret, csize, nb_csectors, sector_offset;
- uint64_t coffset;
-
- coffset = cluster_offset & s->cluster_offset_mask;
- if (s->cluster_cache_offset != coffset) {
- nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
- sector_offset = coffset & 511;
- csize = nb_csectors * 512 - sector_offset;
- ret = bdrv_read(s->fd, coffset >> 9, s->cluster_data, nb_csectors);
- if (ret < 0) {
- return -1;
- }
- if (decompress_buffer(s->cluster_cache, s->cluster_size,
- s->cluster_data + sector_offset, csize) < 0) {
- return -1;
- }
- s->cluster_cache_offset = coffset;
- }
- return 0;
-}
-
-/* handle reading after the end of the backing file */
-static int backing_read1(struct disk_driver *bs,
- int64_t sector_num, uint8_t *buf, int nb_sectors)
-{
- int n1;
- BDRVQcowState* s = bs->private;
-
- if ((sector_num + nb_sectors) <= s->total_sectors)
- return nb_sectors;
- if (sector_num >= s->total_sectors)
- n1 = 0;
- else
- n1 = s->total_sectors - sector_num;
- memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1));
- return n1;
-}
-
-/**
- * Reads a number of sectors from the image (synchronous)
- */
-static int qcow_read(struct disk_driver *bs, uint64_t sector_num,
- uint8_t *buf, int nb_sectors)
-{
- BDRVQcowState *s = bs->private;
- int ret, index_in_cluster, n, n1;
- uint64_t cluster_offset;
-
- while (nb_sectors > 0) {
- cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
- index_in_cluster = sector_num & (s->cluster_sectors - 1);
- n = s->cluster_sectors - index_in_cluster;
- if (n > nb_sectors)
- n = nb_sectors;
- if (!cluster_offset) {
-
- if (bs->next) {
-
- /* Read from backing file */
- struct disk_driver *parent = bs->next;
-
- ret = qcow_sync_read(parent, sector_num,
- nb_sectors, (char*) buf, NULL, 0, NULL);
-
-#if 0
- /* read from the base image */
- n1 = backing_read1(s->backing_hd, sector_num, buf, n);
- if (n1 > 0) {
- ret = bdrv_read(((BDRVQcowState*) s->backing_hd)->fd, sector_num, buf, n1);
- if (ret < 0) {
- DPRINTF("read from backing file failed: ret = %d; errno = %d\n", ret, errno);
- return -1;
- }
- }
-#endif
- } else {
- memset(buf, 0, 512 * n);
- }
- } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
- if (decompress_cluster(s, cluster_offset) < 0) {
- DPRINTF("read/decompression failed: errno = %d\n", errno);
- return -1;
- }
- memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
- } else {
- ret = bdrv_pread(s->fd, cluster_offset + index_in_cluster * 512, buf, n * 512);
- if (ret != n * 512) {
- DPRINTF("read failed: ret = %d != n * 512 = %d; errno = %d\n", ret, n * 512, errno);
-				DPRINTF("  cluster_offset = %"PRIx64", index = %d; sector_num = %"PRId64"\n", cluster_offset, index_in_cluster, sector_num);
- return -1;
- }
-
- if (s->crypt_method) {
- encrypt_sectors(s, sector_num, buf, buf, n, 0,
- &s->aes_decrypt_key);
- }
- }
- nb_sectors -= n;
- sector_num += n;
- buf += n * 512;
- }
- return 0;
-}
-
-/**
- * Writes a number of sectors to the image (synchronous)
- */
-static int qcow_write(struct disk_driver *bs, uint64_t sector_num,
- const uint8_t *buf, int nb_sectors)
-{
- BDRVQcowState *s = bs->private;
- int ret, index_in_cluster, n;
- uint64_t cluster_offset;
-
- while (nb_sectors > 0) {
- index_in_cluster = sector_num & (s->cluster_sectors - 1);
- n = s->cluster_sectors - index_in_cluster;
- if (n > nb_sectors)
- n = nb_sectors;
- cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
- index_in_cluster,
- index_in_cluster + n);
- if (!cluster_offset) {
- DPRINTF("qcow_write: cluster_offset == 0\n");
- DPRINTF(" index = %d; sector_num = %"PRId64"\n",
- index_in_cluster, sector_num);
- return -1;
- }
-
- if (s->crypt_method) {
- encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
- &s->aes_encrypt_key);
- ret = bdrv_pwrite(s->fd, cluster_offset + index_in_cluster * 512,
- s->cluster_data, n * 512);
- } else {
- ret = bdrv_pwrite(s->fd, cluster_offset + index_in_cluster * 512, buf, n * 512);
- }
- if (ret != n * 512) {
- DPRINTF("write failed: ret = %d != n * 512 = %d; errno = %d\n", ret, n * 512, errno);
- DPRINTF(" cluster_offset = %"PRIx64", index = %d; sector_num = %"PRId64"\n", cluster_offset, index_in_cluster, sector_num);
- return -1;
- }
-
- nb_sectors -= n;
- sector_num += n;
- buf += n * 512;
- }
- s->cluster_cache_offset = -1; /* disable compressed cache */
- return 0;
-}
-
-
-
-#ifdef USE_AIO
-
-/*
- * QCOW2 specific AIO functions
- */
-
-static int qcow_queue_read(struct disk_driver *bs, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- BDRVQcowState *s = bs->private;
- int i, index_in_cluster, n, ret;
- int rsp = 0;
- uint64_t cluster_offset;
-
- /*Check we can get a lock*/
- for (i = 0; i < nb_sectors; i++)
- if (!tap_aio_can_lock(&s->async, sector + i))
- return cb(bs, -EBUSY, sector, nb_sectors, id, private);
-
- while (nb_sectors > 0) {
-
- cluster_offset = get_cluster_offset(bs, sector << 9, 0, 0, 0, 0);
-
- index_in_cluster = sector & (s->cluster_sectors - 1);
- n = s->cluster_sectors - index_in_cluster;
- if (n > nb_sectors)
- n = nb_sectors;
-
- if (s->async.iocb_free_count == 0 || !tap_aio_lock(&s->async, sector))
- return cb(bs, -EBUSY, sector, nb_sectors, id, private);
-
- if (!cluster_offset) {
-
- /* The requested sector is not allocated */
- tap_aio_unlock(&s->async, sector);
- ret = cb(bs, BLK_NOT_ALLOCATED,
- sector, n, id, private);
- if (ret == -EBUSY) {
- /* mark remainder of request
- * as busy and try again later */
- return cb(bs, -EBUSY, sector + n,
- nb_sectors - n, id, private);
- } else {
- rsp += ret;
- }
-
- } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
-
- /* sync read for compressed clusters */
- tap_aio_unlock(&s->async, sector);
- if (decompress_cluster(s, cluster_offset) < 0) {
- rsp += cb(bs, -EIO, sector, nb_sectors, id, private);
- goto done;
- }
- memcpy(buf, s->cluster_cache + index_in_cluster * 512,
- 512 * n);
- rsp += cb(bs, 0, sector, n, id, private);
-
- } else {
-
- /* async read */
- tap_aio_read(&s->async, s->fd, n * 512,
- (cluster_offset + index_in_cluster * 512),
- buf, cb, id, sector, private);
- }
-
- /* Prepare for next sector to read */
- nb_sectors -= n;
- sector += n;
- buf += n * 512;
- }
-
-done:
- return rsp;
-
-}
-
-static int qcow_queue_write(struct disk_driver *bs, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- BDRVQcowState *s = bs->private;
- int i, n, index_in_cluster;
- uint64_t cluster_offset;
- const uint8_t *src_buf;
-
-
- /*Check we can get a lock*/
- for (i = 0; i < nb_sectors; i++)
- if (!tap_aio_can_lock(&s->async, sector + i))
- return cb(bs, -EBUSY, sector, nb_sectors, id, private);
-
-
- while (nb_sectors > 0) {
-
- index_in_cluster = sector & (s->cluster_sectors - 1);
- n = s->cluster_sectors - index_in_cluster;
- if (n > nb_sectors)
- n = nb_sectors;
-
- if (s->async.iocb_free_count == 0 || !tap_aio_lock(&s->async, sector))
- return cb(bs, -EBUSY, sector, nb_sectors, id, private);
-
-
- cluster_offset = get_cluster_offset(bs, sector << 9, 1, 0,
- index_in_cluster,
- index_in_cluster+n);
-
- if (!cluster_offset) {
-			DPRINTF("Oops, no write cluster offset!\n");
- tap_aio_unlock(&s->async, sector);
- return cb(bs, -EIO, sector, nb_sectors, id, private);
- }
-
-
- // TODO Encryption
-
- tap_aio_write(&s->async, s->fd, n * 512,
- (cluster_offset + index_in_cluster*512),
- buf, cb, id, sector, private);
-
- /* Prepare for next sector to write */
- nb_sectors -= n;
- sector += n;
- buf += n * 512;
- }
-
-
- s->cluster_cache_offset = -1; /* disable compressed cache */
-
- return 0;
-}
-
-
-#endif /* USE_AIO */
-
-
-static int qcow_close(struct disk_driver *bs)
-{
- BDRVQcowState *s = bs->private;
-
-#ifdef USE_AIO
- io_destroy(s->async.aio_ctx.aio_ctx);
- tap_aio_free(&s->async);
-#else
- close(s->poll_pipe[0]);
- close(s->poll_pipe[1]);
-#endif
-
- qemu_free(s->l1_table);
- qemu_free(s->l2_cache);
- qemu_free(s->cluster_cache);
- qemu_free(s->cluster_data);
- refcount_close(bs);
- return close(s->fd);
-}
-
-/* XXX: use std qcow open function ? */
-typedef struct QCowCreateState {
- int cluster_size;
- int cluster_bits;
- uint16_t *refcount_block;
- uint64_t *refcount_table;
- int64_t l1_table_offset;
- int64_t refcount_table_offset;
- int64_t refcount_block_offset;
-} QCowCreateState;
-
-static void create_refcount_update(QCowCreateState *s,
- int64_t offset, int64_t size)
-{
- int refcount;
- int64_t start, last, cluster_offset;
- uint16_t *p;
-
- start = offset & ~(s->cluster_size - 1);
- last = (offset + size - 1) & ~(s->cluster_size - 1);
- for(cluster_offset = start; cluster_offset <= last;
- cluster_offset += s->cluster_size) {
- p = &s->refcount_block[cluster_offset >> s->cluster_bits];
- refcount = be16_to_cpu(*p);
- refcount++;
- *p = cpu_to_be16(refcount);
- }
-}
-
-static int qcow_submit(struct disk_driver *bs)
-{
- struct BDRVQcowState *s = (struct BDRVQcowState*) bs->private;
-
- fsync(s->fd);
- return tap_aio_submit(&s->async);
-}
-
-
-/*********************************************************/
-/* snapshot support */
-
-
-static void qcow_free_snapshots(struct disk_driver *bs)
-{
- BDRVQcowState *s = bs->private;
- int i;
-
- for(i = 0; i < s->nb_snapshots; i++) {
- qemu_free(s->snapshots[i].name);
- qemu_free(s->snapshots[i].id_str);
- }
- qemu_free(s->snapshots);
- s->snapshots = NULL;
- s->nb_snapshots = 0;
-}
-
-static int qcow_read_snapshots(struct disk_driver *bs)
-{
- BDRVQcowState *s = bs->private;
- QCowSnapshotHeader h;
- QCowSnapshot *sn;
- int i, id_str_size, name_size;
- int64_t offset;
- uint32_t extra_data_size;
-
- offset = s->snapshots_offset;
- s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
- if (!s->snapshots)
- goto fail;
- for(i = 0; i < s->nb_snapshots; i++) {
- offset = align_offset(offset, 8);
- if (bdrv_pread(s->fd, offset, &h, sizeof(h)) != sizeof(h))
- goto fail;
- offset += sizeof(h);
- sn = s->snapshots + i;
- sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
- sn->l1_size = be32_to_cpu(h.l1_size);
- sn->vm_state_size = be32_to_cpu(h.vm_state_size);
- sn->date_sec = be32_to_cpu(h.date_sec);
- sn->date_nsec = be32_to_cpu(h.date_nsec);
- sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
- extra_data_size = be32_to_cpu(h.extra_data_size);
-
- id_str_size = be16_to_cpu(h.id_str_size);
- name_size = be16_to_cpu(h.name_size);
-
- offset += extra_data_size;
-
- sn->id_str = qemu_malloc(id_str_size + 1);
- if (!sn->id_str)
- goto fail;
- if (bdrv_pread(s->fd, offset, sn->id_str, id_str_size) != id_str_size)
- goto fail;
- offset += id_str_size;
- sn->id_str[id_str_size] = '\0';
-
- sn->name = qemu_malloc(name_size + 1);
- if (!sn->name)
- goto fail;
- if (bdrv_pread(s->fd, offset, sn->name, name_size) != name_size)
- goto fail;
- offset += name_size;
- sn->name[name_size] = '\0';
- }
- s->snapshots_size = offset - s->snapshots_offset;
- return 0;
-fail:
- qcow_free_snapshots(bs);
- return -1;
-}
-
-
-/*********************************************************/
-/* refcount handling */
-
-static int refcount_init(struct disk_driver *bs)
-{
- BDRVQcowState *s = bs->private;
- int ret, refcount_table_size2, i;
-
- s->refcount_block_cache = qemu_malloc(s->cluster_size);
- if (!s->refcount_block_cache)
- goto fail;
- refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
- s->refcount_table = qemu_malloc(refcount_table_size2);
- if (!s->refcount_table)
- goto fail;
- if (s->refcount_table_size > 0) {
- ret = bdrv_pread(s->fd, s->refcount_table_offset,
- s->refcount_table, refcount_table_size2);
- if (ret != refcount_table_size2)
- goto fail;
- for(i = 0; i < s->refcount_table_size; i++)
- be64_to_cpus(&s->refcount_table[i]);
- }
- return 0;
- fail:
- return -ENOMEM;
-}
-
-static void refcount_close(struct disk_driver *bs)
-{
- BDRVQcowState *s = bs->private;
- qemu_free(s->refcount_block_cache);
- qemu_free(s->refcount_table);
-}
-
-
-static int load_refcount_block(struct disk_driver *bs,
- int64_t refcount_block_offset)
-{
- BDRVQcowState *s = bs->private;
- int ret;
- ret = bdrv_pread(s->fd, refcount_block_offset, s->refcount_block_cache,
- s->cluster_size);
- if (ret != s->cluster_size)
- return -EIO;
- s->refcount_block_cache_offset = refcount_block_offset;
- return 0;
-}
-
-static int get_refcount(struct disk_driver *bs, int64_t cluster_index)
-{
- BDRVQcowState *s = bs->private;
- int refcount_table_index, block_index;
- int64_t refcount_block_offset;
-
- refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
- if (refcount_table_index >= s->refcount_table_size)
- return 0;
- refcount_block_offset = s->refcount_table[refcount_table_index];
- if (!refcount_block_offset)
- return 0;
- if (refcount_block_offset != s->refcount_block_cache_offset) {
- /* better than nothing: return allocated if read error */
- if (load_refcount_block(bs, refcount_block_offset) < 0)
- return 1;
- }
- block_index = cluster_index &
- ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
- return be16_to_cpu(s->refcount_block_cache[block_index]);
-}
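
With REFCOUNT_SHIFT = 1 a refcount is two bytes, so one cluster-sized refcount block holds cluster_size / 2 entries; the shift above selects the block and the mask selects the slot within it. A worked sketch assuming 64 KB clusters:

#include <stdio.h>
#include <stdint.h>

#define REFCOUNT_SHIFT 1  /* refcounts are 2 bytes each */

int main(void)
{
    int cluster_bits = 16;  /* 64 KB clusters, 32768 refcounts per block */
    int64_t cluster_index = 100000;

    int table_index = (int)(cluster_index >>
                            (cluster_bits - REFCOUNT_SHIFT));
    int block_index = (int)(cluster_index &
                            ((1 << (cluster_bits - REFCOUNT_SHIFT)) - 1));

    /* 100000 = 3 * 32768 + 1696 */
    printf("refcount_table[%d], entry %d\n", table_index, block_index);
    return 0;
}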
-
-/* return < 0 if error */
-static int64_t alloc_clusters_noref(struct disk_driver *bs, int64_t size)
-{
- BDRVQcowState *s = bs->private;
- int i, nb_clusters;
-
- nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
- for(;;) {
- if (get_refcount(bs, s->free_cluster_index) == 0) {
- s->free_cluster_index++;
- for(i = 1; i < nb_clusters; i++) {
- if (get_refcount(bs, s->free_cluster_index) != 0)
- goto not_found;
- s->free_cluster_index++;
- }
-
-#ifdef DEBUG_ALLOC2
- DPRINTF("alloc_clusters: size=%ld -> %ld\n",
- size,
- (s->free_cluster_index - nb_clusters) << s->cluster_bits);
-#endif
-
- return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
- } else {
- not_found:
- s->free_cluster_index++;
- }
- }
-}
-
-static int64_t alloc_clusters(struct disk_driver *bs, int64_t size)
-{
- int64_t offset;
-
- offset = alloc_clusters_noref(bs, size);
- update_refcount(bs, offset, size, 1);
- return offset;
-}
-
-/* only used to allocate compressed sectors. We try to allocate
- contiguous sectors. size must be <= cluster_size */
-static int64_t alloc_bytes(struct disk_driver *bs, int size)
-{
- BDRVQcowState *s = bs->private;
- int64_t offset, cluster_offset;
- int free_in_cluster;
-
- assert(size > 0 && size <= s->cluster_size);
- if (s->free_byte_offset == 0) {
- s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
- }
-redo:
- free_in_cluster = s->cluster_size -
- (s->free_byte_offset & (s->cluster_size - 1));
- if (size <= free_in_cluster) {
- /* enough space in current cluster */
- offset = s->free_byte_offset;
- s->free_byte_offset += size;
- free_in_cluster -= size;
- if (free_in_cluster == 0)
- s->free_byte_offset = 0;
- if ((offset & (s->cluster_size - 1)) != 0)
- update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
- } else {
- offset = alloc_clusters(bs, s->cluster_size);
- cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
- if ((cluster_offset + s->cluster_size) == offset) {
- /* we are lucky: contiguous data */
- offset = s->free_byte_offset;
- update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
- s->free_byte_offset += size;
- } else {
- s->free_byte_offset = offset;
- goto redo;
- }
- }
- return offset;
-}
-
-static void free_clusters(struct disk_driver *bs,
- int64_t offset, int64_t size)
-{
- update_refcount(bs, offset, size, -1);
-}
-
-static int grow_refcount_table(struct disk_driver *bs, int min_size)
-{
- BDRVQcowState *s = bs->private;
- int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
- uint64_t *new_table;
- int64_t table_offset;
- uint64_t data64;
- uint32_t data32;
- int old_table_size;
- int64_t old_table_offset;
-
- if (min_size <= s->refcount_table_size)
- return 0;
-
- /* compute new table size */
- refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
- for(;;) {
- if (refcount_table_clusters == 0) {
- refcount_table_clusters = 1;
- } else {
- refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
- }
- new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
- if (min_size <= new_table_size)
- break;
- }
-
-#ifdef DEBUG_ALLOC2
- printf("grow_refcount_table from %d to %d\n",
- s->refcount_table_size,
- new_table_size);
-#endif
- new_table_size2 = new_table_size * sizeof(uint64_t);
- new_table = qemu_mallocz(new_table_size2);
- if (!new_table)
- return -ENOMEM;
- memcpy(new_table, s->refcount_table,
- s->refcount_table_size * sizeof(uint64_t));
- for(i = 0; i < s->refcount_table_size; i++)
- cpu_to_be64s(&new_table[i]);
- /* Note: we cannot update the refcount now to avoid recursion */
- table_offset = alloc_clusters_noref(bs, new_table_size2);
- ret = bdrv_pwrite(s->fd, table_offset, new_table, new_table_size2);
- if (ret != new_table_size2)
- goto fail;
- for(i = 0; i < s->refcount_table_size; i++)
- be64_to_cpus(&new_table[i]);
-
- data64 = cpu_to_be64(table_offset);
- if (bdrv_pwrite(s->fd, offsetof(QCowHeader, refcount_table_offset),
- &data64, sizeof(data64)) != sizeof(data64))
- goto fail;
- data32 = cpu_to_be32(refcount_table_clusters);
- if (bdrv_pwrite(s->fd, offsetof(QCowHeader, refcount_table_clusters),
- &data32, sizeof(data32)) != sizeof(data32))
- goto fail;
- qemu_free(s->refcount_table);
- old_table_offset = s->refcount_table_offset;
- old_table_size = s->refcount_table_size;
- s->refcount_table = new_table;
- s->refcount_table_size = new_table_size;
- s->refcount_table_offset = table_offset;
-
- update_refcount(bs, table_offset, new_table_size2, 1);
- free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
- return 0;
- fail:
- free_clusters(bs, table_offset, new_table_size2);
- qemu_free(new_table);
- return -EIO;
-}
-
-/* addend must be 1 or -1 */
-/* XXX: cache several refcount block clusters ? */
-static int update_cluster_refcount(struct disk_driver *bs,
- int64_t cluster_index,
- int addend)
-{
- BDRVQcowState *s = bs->private;
- int64_t offset, refcount_block_offset;
- int ret, refcount_table_index, block_index, refcount;
- uint64_t data64;
-
- refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
- if (refcount_table_index >= s->refcount_table_size) {
- if (addend < 0)
- return -EINVAL;
- ret = grow_refcount_table(bs, refcount_table_index + 1);
- if (ret < 0)
- return ret;
- }
- refcount_block_offset = s->refcount_table[refcount_table_index];
- if (!refcount_block_offset) {
- if (addend < 0)
- return -EINVAL;
- /* create a new refcount block */
- /* Note: we cannot update the refcount now to avoid recursion */
- offset = alloc_clusters_noref(bs, s->cluster_size);
- memset(s->refcount_block_cache, 0, s->cluster_size);
- ret = bdrv_pwrite(s->fd, offset, s->refcount_block_cache, s->cluster_size);
- if (ret != s->cluster_size)
- return -EINVAL;
- s->refcount_table[refcount_table_index] = offset;
- data64 = cpu_to_be64(offset);
- ret = bdrv_pwrite(s->fd, s->refcount_table_offset +
- refcount_table_index * sizeof(uint64_t),
- &data64, sizeof(data64));
- if (ret != sizeof(data64))
- return -EINVAL;
-
- refcount_block_offset = offset;
- s->refcount_block_cache_offset = offset;
- update_refcount(bs, offset, s->cluster_size, 1);
- } else {
- if (refcount_block_offset != s->refcount_block_cache_offset) {
- if (load_refcount_block(bs, refcount_block_offset) < 0)
- return -EIO;
- }
- }
- /* we can update the count and save it */
- block_index = cluster_index &
- ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
- refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
- refcount += addend;
- if (refcount < 0 || refcount > 0xffff)
- return -EINVAL;
- if (refcount == 0 && cluster_index < s->free_cluster_index) {
- s->free_cluster_index = cluster_index;
- }
- s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
- if (bdrv_pwrite(s->fd,
- refcount_block_offset + (block_index << REFCOUNT_SHIFT),
- &s->refcount_block_cache[block_index], 2) != 2)
- return -EIO;
- return refcount;
-}
-
-static void update_refcount(struct disk_driver *bs,
- int64_t offset, int64_t length,
- int addend)
-{
- BDRVQcowState *s = bs->private;
- int64_t start, last, cluster_offset;
-
-#ifdef DEBUG_ALLOC2
- printf("update_refcount: offset=%lld size=%lld addend=%d\n",
- offset, length, addend);
-#endif
- if (length <= 0)
- return;
- start = offset & ~(s->cluster_size - 1);
- last = (offset + length - 1) & ~(s->cluster_size - 1);
- for(cluster_offset = start; cluster_offset <= last;
- cluster_offset += s->cluster_size) {
- update_cluster_refcount(bs, cluster_offset >> s->cluster_bits, addend);
- }
-}
-
-#ifdef DEBUG_ALLOC
-static void inc_refcounts(struct disk_driver *bs,
- uint16_t *refcount_table,
- int refcount_table_size,
- int64_t offset, int64_t size)
-{
- BDRVQcowState *s = bs->private;
- int64_t start, last, cluster_offset;
- int k;
-
- if (size <= 0)
- return;
-
- start = offset & ~(s->cluster_size - 1);
- last = (offset + size - 1) & ~(s->cluster_size - 1);
- for(cluster_offset = start; cluster_offset <= last;
- cluster_offset += s->cluster_size) {
- k = cluster_offset >> s->cluster_bits;
- if (k < 0 || k >= refcount_table_size) {
- printf("ERROR: invalid cluster offset=0x%llx\n", cluster_offset);
- } else {
- if (++refcount_table[k] == 0) {
- printf("ERROR: overflow cluster offset=0x%llx\n", cluster_offset);
- }
- }
- }
-}
-
-static int check_refcounts_l1(struct disk_driver *bs,
- uint16_t *refcount_table,
- int refcount_table_size,
- int64_t l1_table_offset, int l1_size,
- int check_copied)
-{
- BDRVQcowState *s = bs->private;
- uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
- int l2_size, i, j, nb_csectors, refcount;
-
- l2_table = NULL;
- l1_size2 = l1_size * sizeof(uint64_t);
-
- inc_refcounts(bs, refcount_table, refcount_table_size,
- l1_table_offset, l1_size2);
-
- l1_table = qemu_malloc(l1_size2);
- if (!l1_table)
- goto fail;
- if (bdrv_pread(s->fd, l1_table_offset,
- l1_table, l1_size2) != l1_size2)
- goto fail;
- for(i = 0;i < l1_size; i++)
- be64_to_cpus(&l1_table[i]);
-
- l2_size = s->l2_size * sizeof(uint64_t);
- l2_table = qemu_malloc(l2_size);
- if (!l2_table)
- goto fail;
- for(i = 0; i < l1_size; i++) {
- l2_offset = l1_table[i];
- if (l2_offset) {
- if (check_copied) {
- refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
- if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
- printf("ERROR OFLAG_COPIED: l2_offset=%llx refcount=%d\n",
- l2_offset, refcount);
- }
- }
- l2_offset &= ~QCOW_OFLAG_COPIED;
- if (bdrv_pread(s->fd, l2_offset, l2_table, l2_size) != l2_size)
- goto fail;
- for(j = 0; j < s->l2_size; j++) {
- offset = be64_to_cpu(l2_table[j]);
- if (offset != 0) {
- if (offset & QCOW_OFLAG_COMPRESSED) {
- if (offset & QCOW_OFLAG_COPIED) {
- printf("ERROR: cluster %lld: copied flag must never be set for compressed clusters\n",
- offset >> s->cluster_bits);
- offset &= ~QCOW_OFLAG_COPIED;
- }
- nb_csectors = ((offset >> s->csize_shift) &
- s->csize_mask) + 1;
- offset &= s->cluster_offset_mask;
- inc_refcounts(bs, refcount_table,
- refcount_table_size,
- offset & ~511, nb_csectors * 512);
- } else {
- if (check_copied) {
- refcount = get_refcount(bs, (offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits);
- if ((refcount == 1) != ((offset & QCOW_OFLAG_COPIED) != 0)) {
- printf("ERROR OFLAG_COPIED: offset=%llx refcount=%d\n",
- offset, refcount);
- }
- }
- offset &= ~QCOW_OFLAG_COPIED;
- inc_refcounts(bs, refcount_table,
- refcount_table_size,
- offset, s->cluster_size);
- }
- }
- }
- inc_refcounts(bs, refcount_table,
- refcount_table_size,
- l2_offset,
- s->cluster_size);
- }
- }
- qemu_free(l1_table);
- qemu_free(l2_table);
- return 0;
- fail:
- printf("ERROR: I/O error in check_refcounts_l1\n");
- qemu_free(l1_table);
- qemu_free(l2_table);
- return -EIO;
-}
-
-static void check_refcounts(struct disk_driver *bs)
-{
- BDRVQcowState *s = bs->private;
- int64_t size;
- int nb_clusters, refcount1, refcount2, i;
- QCowSnapshot *sn;
- uint16_t *refcount_table;
-
- size = bdrv_getlength(s->fd);
- nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
- refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
-
- /* header */
- inc_refcounts(bs, refcount_table, nb_clusters,
- 0, s->cluster_size);
-
- check_refcounts_l1(bs, refcount_table, nb_clusters,
- s->l1_table_offset, s->l1_size, 1);
-
- /* snapshots */
- for(i = 0; i < s->nb_snapshots; i++) {
- sn = s->snapshots + i;
- check_refcounts_l1(bs, refcount_table, nb_clusters,
- sn->l1_table_offset, sn->l1_size, 0);
- }
- inc_refcounts(bs, refcount_table, nb_clusters,
- s->snapshots_offset, s->snapshots_size);
-
- /* refcount data */
- inc_refcounts(bs, refcount_table, nb_clusters,
- s->refcount_table_offset,
- s->refcount_table_size * sizeof(uint64_t));
-
- for(i = 0; i < s->refcount_table_size; i++) {
- int64_t offset;
- offset = s->refcount_table[i];
- if (offset != 0) {
- inc_refcounts(bs, refcount_table, nb_clusters,
- offset, s->cluster_size);
- }
- }
-
- /* compare ref counts */
- for(i = 0; i < nb_clusters; i++) {
- refcount1 = get_refcount(bs, i);
- refcount2 = refcount_table[i];
- if (refcount1 != refcount2)
- printf("ERROR cluster %d refcount=%d reference=%d\n",
- i, refcount1, refcount2);
- }
-
- qemu_free(refcount_table);
-}
-#endif
-
-
-/**
- * Wrapper for synchronous read.
- * This function is called when not using AIO at all (#undef USE_AIO) or
- * for accessing the backing file.
- */
-static int qcow_sync_read(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *prv)
-{
- int ret = qcow_read(dd, sector, (uint8_t*) buf, nb_sectors);
-
- if (cb != NULL) {
- return cb(dd, (ret < 0) ? ret : 0, sector, nb_sectors, id, prv);
- } else {
- return ret;
- }
-}
-
-#ifndef USE_AIO
-/**
- * Wrapper for synchronous write
- */
-static int qcow_sync_write(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *prv)
-{
- int ret = qcow_write(dd, sector, (uint8_t*) buf, nb_sectors);
-
- return cb(dd, (ret < 0) ? ret : 0, sector, nb_sectors, id, prv);
-}
-#endif
-
-
-
-#ifndef USE_AIO
-
-static int qcow_do_callbacks(struct disk_driver *dd, int sid)
-{
- return 1;
-}
-
-#else
-
-static int qcow_do_callbacks(struct disk_driver *dd, int sid)
-{
- int i, nr_events, rsp = 0;
- struct io_event *ep;
- struct BDRVQcowState *prv = (struct BDRVQcowState*)dd->private;
-
- if (sid > MAX_IOFD) return 1;
-
- nr_events = tap_aio_get_events(&prv->async.aio_ctx);
-
-repeat:
- for (ep = prv->async.aio_events, i = nr_events; i-- > 0; ep++) {
- struct iocb *io = ep->obj;
- struct pending_aio *pio;
-
- pio = &prv->async.pending_aio[(long)io->data];
-
- tap_aio_unlock(&prv->async, pio->sector);
-
- if (prv->crypt_method)
- encrypt_sectors(prv, pio->sector,
- (unsigned char *)pio->buf,
- (unsigned char *)pio->buf,
- pio->nb_sectors, 0,
- &prv->aes_decrypt_key);
-
- rsp += pio->cb(dd, ep->res == io->u.c.nbytes ? 0 : 1,
- pio->sector, pio->nb_sectors,
- pio->id, pio->private);
-
- prv->async.iocb_free[prv->async.iocb_free_count++] = io;
- }
-
- if (nr_events) {
- nr_events = tap_aio_more_events(&prv->async.aio_ctx);
- goto repeat;
- }
-
- tap_aio_continue(&prv->async.aio_ctx);
-
- return rsp;
-}
-
-#endif
-
-static int get_filesize(char *filename, uint64_t *size, struct stat *st)
-{
- int fd;
- QCowHeader header;
-
- /*Set to the backing file size*/
- fd = open(filename, O_RDONLY);
- if (fd < 0)
- return -1;
- if (read(fd, &header, sizeof(header)) != sizeof(header)) {
- close(fd);
- return -1;
- }
- close(fd);
-
- be32_to_cpus(&header.magic);
- be32_to_cpus(&header.version);
- be64_to_cpus(&header.size);
- if (header.magic == QCOW_MAGIC && header.version == QCOW_VERSION) {
- *size = header.size >> SECTOR_SHIFT;
- return 0;
- }
-
- if(S_ISBLK(st->st_mode)) {
- fd = open(filename, O_RDONLY);
- if (fd < 0)
- return -1;
- if (blk_getimagesize(fd, size) != 0) {
- close(fd);
- return -1;
- }
- close(fd);
- } else *size = (st->st_size >> SECTOR_SHIFT);
- return 0;
-}
-
-/**
- * @return
- * 0 if parent id successfully retrieved;
- * TD_NO_PARENT if no parent exists;
- * -errno on error
- */
-static int qcow_get_parent_id(struct disk_driver *dd, struct disk_id *id)
-{
- struct BDRVQcowState* s = (struct BDRVQcowState*) dd->private;
-
- if (s->backing_file[0] == '\0')
- return TD_NO_PARENT;
-
- id->name = strdup(s->backing_file);
- id->drivertype = DISK_TYPE_AIO;
-
- return 0;
-}
-
-static int qcow_validate_parent(struct disk_driver *child,
- struct disk_driver *parent, td_flag_t flags)
-{
- struct stat stats;
- uint64_t psize, csize;
-
- if (stat(parent->name, &stats))
- return -EINVAL;
- if (get_filesize(parent->name, &psize, &stats))
- return -EINVAL;
-
- if (stat(child->name, &stats))
- return -EINVAL;
- if (get_filesize(child->name, &csize, &stats))
- return -EINVAL;
-
- if (csize != psize)
- return -EINVAL;
-
- return 0;
-}
-
-int qcow2_create(const char *filename, uint64_t total_size,
- const char *backing_file, int flags)
-{
- int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
- int ret = 0;
- QCowHeader header;
- uint64_t tmp, offset;
- QCowCreateState s1, *s = &s1;
-
- memset(s, 0, sizeof(*s));
-
- fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
- if (fd < 0)
- return -1;
- memset(&header, 0, sizeof(header));
- header.magic = cpu_to_be32(QCOW_MAGIC);
- header.version = cpu_to_be32(QCOW_VERSION);
- header.size = cpu_to_be64(total_size * 512);
- header_size = sizeof(header);
- backing_filename_len = 0;
- if (backing_file) {
- header.backing_file_offset = cpu_to_be64(header_size);
- backing_filename_len = strlen(backing_file);
- header.backing_file_size = cpu_to_be32(backing_filename_len);
- header_size += backing_filename_len;
- }
- s->cluster_bits = 12; /* 4 KB clusters */
- s->cluster_size = 1 << s->cluster_bits;
- header.cluster_bits = cpu_to_be32(s->cluster_bits);
- header_size = (header_size + 7) & ~7;
- if (flags & BLOCK_FLAG_ENCRYPT) {
- header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
- } else {
- header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
- }
- l2_bits = s->cluster_bits - 3;
- shift = s->cluster_bits + l2_bits;
- l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
- offset = align_offset(header_size, s->cluster_size);
- s->l1_table_offset = offset;
- header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
- header.l1_size = cpu_to_be32(l1_size);
- offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
-
- s->refcount_table = qemu_mallocz(s->cluster_size);
- s->refcount_block = qemu_mallocz(s->cluster_size);
-
- s->refcount_table_offset = offset;
- header.refcount_table_offset = cpu_to_be64(offset);
- header.refcount_table_clusters = cpu_to_be32(1);
- offset += s->cluster_size;
-
- s->refcount_table[0] = cpu_to_be64(offset);
- s->refcount_block_offset = offset;
- offset += s->cluster_size;
-
- /* update refcounts */
- create_refcount_update(s, 0, header_size);
- create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
- create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
- create_refcount_update(s, s->refcount_block_offset, s->cluster_size);
-
- /* write all the data */
- ret = write(fd, &header, sizeof(header));
- if (ret < 0)
- goto out;
- if (backing_file) {
- ret = write(fd, backing_file, backing_filename_len);
- if (ret < 0)
- goto out;
- }
- lseek(fd, s->l1_table_offset, SEEK_SET);
- tmp = 0;
- for(i = 0;i < l1_size; i++) {
- ret = write(fd, &tmp, sizeof(tmp));
- if (ret < 0)
- goto out;
- }
- lseek(fd, s->refcount_table_offset, SEEK_SET);
- ret = write(fd, s->refcount_table, s->cluster_size);
- if (ret < 0)
- goto out;
-
- lseek(fd, s->refcount_block_offset, SEEK_SET);
- ret = write(fd, s->refcount_block, s->cluster_size);
- if (ret < 0)
- goto out;
- ret = 0;
-
- out:
- qemu_free(s->refcount_table);
- qemu_free(s->refcount_block);
- close(fd);
- return ret;
-}
-
-
-
-struct tap_disk tapdisk_qcow2 = {
- "qcow2",
- sizeof(BDRVQcowState),
- qcow_open,
-#ifdef USE_AIO
- qcow_queue_read,
- qcow_queue_write,
-#else
- qcow_sync_read,
- qcow_sync_write,
-#endif
- qcow_submit,
- qcow_close,
- qcow_do_callbacks,
- qcow_get_parent_id,
- qcow_validate_parent
-};
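The table above is the driver's whole contract with tapdisk: one callback per slot, filled positionally. For comparison, here is the same table as a sketch in the designated-initializer style used by the tapdisk_ram, tapdisk_sync and tapdisk_vmdk drivers later in this patch; the field names come from those drivers, and the assumption is that struct tap_disk declares them in this order:

    struct tap_disk tapdisk_qcow2 = {
        .disk_type          = "qcow2",
        .private_data_size  = sizeof(BDRVQcowState),
        .td_open            = qcow_open,
    #ifdef USE_AIO
        .td_queue_read      = qcow_queue_read,
        .td_queue_write     = qcow_queue_write,
    #else
        .td_queue_read      = qcow_sync_read,
        .td_queue_write     = qcow_sync_write,
    #endif
        .td_submit          = qcow_submit,
        .td_close           = qcow_close,
        .td_do_callbacks    = qcow_do_callbacks,
        .td_get_parent_id   = qcow_get_parent_id,
        .td_validate_parent = qcow_validate_parent
    };
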
diff --git a/tools/blktap/drivers/block-ram.c b/tools/blktap/drivers/block-ram.c
deleted file mode 100644
index 836a68e..0000000
--- a/tools/blktap/drivers/block-ram.c
+++ /dev/null
@@ -1,295 +0,0 @@
-/* block-ram.c
- *
- * Fast Ramdisk implementation.
- *
- * (c) 2006 Andrew Warfield and Julian Chesterfield
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <unistd.h>
-#include <sys/statvfs.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include <string.h>
-#include "tapdisk.h"
-#include "blk.h"
-
-#define MAX_DISK_SIZE 1024000 /*500MB disk limit*/
-
-/* *BSD has no O_LARGEFILE */
-#ifndef O_LARGEFILE
-#define O_LARGEFILE 0
-#endif
-
-char *img;
-long int disksector_size;
-long int disksize;
-long int diskinfo;
-static int connections = 0;
-
-struct tdram_state {
- int fd;
- int poll_pipe[2]; /* dummy fd for polling on */
-};
-
-/*Get Image size, secsize*/
-static int get_image_info(struct td_state *s, int fd)
-{
- int ret;
- long size;
- unsigned long total_size;
- struct statvfs statBuf;
- struct stat stat;
-
- ret = fstat(fd, &stat);
- if (ret != 0) {
- DPRINTF("ERROR: fstat failed, Couldn't stat image");
- return -EINVAL;
- }
-
- if (S_ISBLK(stat.st_mode)) {
- /*Accessing block device directly*/
- if (blk_getimagesize(fd, &s->size) != 0)
- return -EINVAL;
-
- DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
- "sector_shift [%llu]\n",
- (long long unsigned)(s->size << SECTOR_SHIFT),
- (long long unsigned)s->size);
-
- /*Get the sector size*/
- if (blk_getsectorsize(fd, &s->sector_size) != 0)
- s->sector_size = DEFAULT_SECTOR_SIZE;
-
- } else {
- /*Local file? try fstat instead*/
- s->size = (stat.st_size >> SECTOR_SHIFT);
- s->sector_size = DEFAULT_SECTOR_SIZE;
- DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
- "sector_shift [%llu]\n",
- (long long unsigned)(s->size << SECTOR_SHIFT),
- (long long unsigned)s->size);
- }
-
- if (s->size == 0) {
- s->size = ((uint64_t) MAX_DISK_SIZE);
- s->sector_size = DEFAULT_SECTOR_SIZE;
- }
- s->info = 0;
-
- /*Store variables locally*/
- disksector_size = s->sector_size;
- disksize = s->size;
- diskinfo = s->info;
- DPRINTF("Image sector_size: \n\t[%"PRIu64"]\n",
- s->sector_size);
-
- return 0;
-}
-
-static inline void init_fds(struct disk_driver *dd)
-{
- int i;
- struct tdram_state *prv = (struct tdram_state *)dd->private;
-
- for (i = 0; i < MAX_IOFD; i++)
- dd->io_fd[i] = 0;
-
- dd->io_fd[0] = prv->poll_pipe[0];
-}
-
-/* Open the disk file and initialize ram state. */
-static int tdram_open (struct disk_driver *dd, const char *name, td_flag_t flags)
-{
- char *p;
- uint64_t size;
- int i, fd, ret = 0, count = 0, o_flags;
- struct td_state *s = dd->td_state;
- struct tdram_state *prv = (struct tdram_state *)dd->private;
-
- connections++;
-
- /* set up a pipe so that we can hand back a poll fd that won't fire.*/
- ret = pipe(prv->poll_pipe);
- if (ret != 0)
- return (0 - errno);
-
- if (connections > 1) {
- s->sector_size = disksector_size;
- s->size = disksize;
- s->info = diskinfo;
- DPRINTF("Image already open, returning parameters:\n");
- DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
- "sector_shift [%llu]\n",
- (long long unsigned)(s->size << SECTOR_SHIFT),
- (long long unsigned)s->size);
- DPRINTF("Image sector_size: \n\t[%"PRIu64"]\n",
- s->sector_size);
-
- prv->fd = -1;
- goto done;
- }
-
- /* Open the file */
- o_flags = O_DIRECT | O_LARGEFILE |
- ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
- fd = open(name, o_flags);
-
- if ((fd == -1) && (errno == EINVAL)) {
-
- /* Maybe O_DIRECT isn't supported. */
- o_flags &= ~O_DIRECT;
- fd = open(name, o_flags);
- if (fd != -1) DPRINTF("WARNING: Accessing image without "
- "O_DIRECT! (%s)\n", name);
-
- } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
-
- if (fd == -1) {
- DPRINTF("Unable to open [%s]!\n",name);
- ret = 0 - errno;
- goto done;
- }
-
- prv->fd = fd;
-
- ret = get_image_info(s, fd);
- size = MAX_DISK_SIZE;
-
- if (s->size > size) {
- DPRINTF("Disk exceeds limit, must be less than [%d]MB",
- (MAX_DISK_SIZE<<SECTOR_SHIFT)>>20);
- return -ENOMEM;
- }
-
- /*Read the image into memory*/
- p = img = malloc(s->size << SECTOR_SHIFT);
- if (img == NULL) {
- DPRINTF("Mem malloc failed\n");
- return -1;
- }
- DPRINTF("Reading %llu bytes.......",(long long unsigned)s->size << SECTOR_SHIFT);
-
- for (i = 0; i < s->size; i++) {
- ret = read(prv->fd, p, s->sector_size);
- if (ret != s->sector_size) {
- ret = 0 - errno;
- break;
- } else {
- count += ret;
- p = img + count;
- }
- }
- DPRINTF("[%d]\n",count);
- if (count != s->size << SECTOR_SHIFT) {
- ret = -1;
- } else {
- ret = 0;
- }
-
- init_fds(dd);
-done:
- return ret;
-}
-
-static int tdram_queue_read(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- struct td_state *s = dd->td_state;
- struct tdram_state *prv = (struct tdram_state *)dd->private;
- int size = nb_sectors * s->sector_size;
- uint64_t offset = sector * (uint64_t)s->sector_size;
-
- memcpy(buf, img + offset, size);
-
- return cb(dd, 0, sector, nb_sectors, id, private);
-}
-
-static int tdram_queue_write(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- struct td_state *s = dd->td_state;
- struct tdram_state *prv = (struct tdram_state *)dd->private;
- int size = nb_sectors * s->sector_size;
- uint64_t offset = sector * (uint64_t)s->sector_size;
-
- /* We assume that write access is controlled
- * at a higher level for multiple disks */
- memcpy(img + offset, buf, size);
-
- return cb(dd, 0, sector, nb_sectors, id, private);
-}
-
-static int tdram_submit(struct disk_driver *dd)
-{
- return 0;
-}
-
-static int tdram_close(struct disk_driver *dd)
-{
- struct tdram_state *prv = (struct tdram_state *)dd->private;
-
- connections--;
-
- return 0;
-}
-
-static int tdram_do_callbacks(struct disk_driver *dd, int sid)
-{
- /* always ask for a kick */
- return 1;
-}
-
-static int tdram_get_parent_id(struct disk_driver *dd, struct disk_id *id)
-{
- return TD_NO_PARENT;
-}
-
-static int tdram_validate_parent(struct disk_driver *dd,
- struct disk_driver *parent, td_flag_t flags)
-{
- return -EINVAL;
-}
-
-struct tap_disk tapdisk_ram = {
- .disk_type = "tapdisk_ram",
- .private_data_size = sizeof(struct tdram_state),
- .td_open = tdram_open,
- .td_queue_read = tdram_queue_read,
- .td_queue_write = tdram_queue_write,
- .td_submit = tdram_submit,
- .td_close = tdram_close,
- .td_do_callbacks = tdram_do_callbacks,
- .td_get_parent_id = tdram_get_parent_id,
- .td_validate_parent = tdram_validate_parent
-};
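With the whole image slurped into the global img buffer at open time, the queue_read/queue_write paths above reduce to offset arithmetic plus memcpy(). A stand-alone model of that data path; the names (ramdisk, ram_rw) are illustrative, not from the driver:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define SECTOR_SIZE 512u

    /* toy in-core "disk": 8 sectors */
    static char ramdisk[8 * SECTOR_SIZE];

    static void ram_rw(int write, uint64_t sector, int nb_sectors, char *buf)
    {
        uint64_t off = sector * (uint64_t)SECTOR_SIZE;
        size_t len = (size_t)nb_sectors * SECTOR_SIZE;

        if (write)
            memcpy(ramdisk + off, buf, len);   /* "disk" write */
        else
            memcpy(buf, ramdisk + off, len);   /* "disk" read */
    }

    int main(void)
    {
        char in[SECTOR_SIZE] = "hello", out[SECTOR_SIZE];

        ram_rw(1, 3, 1, in);    /* write sector 3 */
        ram_rw(0, 3, 1, out);   /* read it back */
        printf("%s\n", out);    /* prints "hello" */
        return 0;
    }
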
diff --git a/tools/blktap/drivers/block-sync.c b/tools/blktap/drivers/block-sync.c
deleted file mode 100644
index dde4538..0000000
--- a/tools/blktap/drivers/block-sync.c
+++ /dev/null
@@ -1,242 +0,0 @@
-/* block-sync.c
- *
- * simple slow synchronous raw disk implementation.
- *
- * (c) 2006 Andrew Warfield and Julian Chesterfield
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/statvfs.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include "tapdisk.h"
-#include "blk.h"
-
-/* *BSD has no O_LARGEFILE */
-#ifndef O_LARGEFILE
-#define O_LARGEFILE 0
-#endif
-
-struct tdsync_state {
- int fd;
- int poll_pipe[2]; /* dummy fd for polling on */
-};
-
-/*Get Image size, secsize*/
-static int get_image_info(struct td_state *s, int fd)
-{
- int ret;
- struct stat stat;
-
- ret = fstat(fd, &stat);
- if (ret != 0) {
- DPRINTF("ERROR: fstat failed, Couldn't stat image");
- return -EINVAL;
- }
-
- if (S_ISBLK(stat.st_mode)) {
- /*Accessing block device directly*/
- if (blk_getimagesize(fd, &s->size) != 0)
- return -EINVAL;
-
- DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
- "sector_shift [%llu]\n",
- (long long unsigned)(s->size << SECTOR_SHIFT),
- (long long unsigned)s->size);
-
- /*Get the sector size*/
- if (blk_getsectorsize(fd, &s->sector_size) != 0)
- s->sector_size = DEFAULT_SECTOR_SIZE;
-
- } else {
- /*Local file? try fstat instead*/
- s->size = (stat.st_size >> SECTOR_SHIFT);
- s->sector_size = DEFAULT_SECTOR_SIZE;
- DPRINTF("Image size: \n\tpre sector_shift [%lluu]\n\tpost "
- "sector_shift [%lluu]\n",
- (long long unsigned)(s->size << SECTOR_SHIFT),
- (long long unsigned)s->size);
- }
-
- if (s->size == 0)
- return -EINVAL;
-
- s->info = 0;
-
- return 0;
-}
-
-static inline void init_fds(struct disk_driver *dd)
-{
- int i;
- struct tdsync_state *prv = (struct tdsync_state *)dd->private;
-
- for(i = 0; i < MAX_IOFD; i++)
- dd->io_fd[i] = 0;
-
- dd->io_fd[0] = prv->poll_pipe[0];
-}
-
-/* Open the disk file and initialize aio state. */
-static int tdsync_open (struct disk_driver *dd, const char *name, td_flag_t flags)
-{
- int fd, ret = 0, o_flags;
- struct td_state *s = dd->td_state;
- struct tdsync_state *prv = (struct tdsync_state *)dd->private;
-
- /* set up a pipe so that we can hand back a poll fd that won't fire.*/
- ret = pipe(prv->poll_pipe);
- if (ret != 0)
- return (0 - errno);
-
- /* Open the file */
- o_flags = O_DIRECT | O_LARGEFILE |
- ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
- fd = open(name, o_flags);
-
- if ( (fd == -1) && (errno == EINVAL) ) {
-
- /* Maybe O_DIRECT isn't supported. */
- o_flags &= ~O_DIRECT;
- fd = open(name, o_flags);
- if (fd != -1) DPRINTF("WARNING: Accessing image without "
- "O_DIRECT! (%s)\n", name);
-
- } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
-
- if (fd == -1) {
- DPRINTF("Unable to open [%s]!\n",name);
- ret = 0 - errno;
- goto done;
- }
-
- prv->fd = fd;
-
- init_fds(dd);
- ret = get_image_info(s, fd);
-done:
- return ret;
-}
-
-static int tdsync_queue_read(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- struct td_state *s = dd->td_state;
- struct tdsync_state *prv = (struct tdsync_state *)dd->private;
- int size = nb_sectors * s->sector_size;
- uint64_t offset = sector * (uint64_t)s->sector_size;
- int ret;
-
- ret = lseek(prv->fd, offset, SEEK_SET);
- if (ret != (off_t)-1) {
- ret = read(prv->fd, buf, size);
- if (ret != size) {
- ret = 0 - errno;
- } else {
- ret = 1;
- }
- } else ret = 0 - errno;
-
- return cb(dd, (ret < 0) ? ret: 0, sector, nb_sectors, id, private);
-}
-
-static int tdsync_queue_write(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- struct td_state *s = dd->td_state;
- struct tdsync_state *prv = (struct tdsync_state *)dd->private;
- int size = nb_sectors * s->sector_size;
- uint64_t offset = sector * (uint64_t)s->sector_size;
- int ret = 0;
-
- ret = lseek(prv->fd, offset, SEEK_SET);
- if (ret != (off_t)-1) {
- ret = write(prv->fd, buf, size);
- if (ret != size) {
- ret = 0 - errno;
- } else {
- ret = 1;
- }
- } else ret = 0 - errno;
-
- return cb(dd, (ret < 0) ? ret : 0, sector, nb_sectors, id, private);
-}
-
-static int tdsync_submit(struct disk_driver *dd)
-{
- return 0;
-}
-
-static int tdsync_close(struct disk_driver *dd)
-{
- struct tdsync_state *prv = (struct tdsync_state *)dd->private;
-
- close(prv->fd);
- close(prv->poll_pipe[0]);
- close(prv->poll_pipe[1]);
-
- return 0;
-}
-
-static int tdsync_do_callbacks(struct disk_driver *dd, int sid)
-{
- /* always ask for a kick */
- return 1;
-}
-
-static int tdsync_get_parent_id(struct disk_driver *dd, struct disk_id *id)
-{
- return TD_NO_PARENT;
-}
-
-static int tdsync_validate_parent(struct disk_driver *dd,
- struct disk_driver *parent, td_flag_t flags)
-{
- return -EINVAL;
-}
-
-struct tap_disk tapdisk_sync = {
- .disk_type = "tapdisk_sync",
- .private_data_size = sizeof(struct tdsync_state),
- .td_open = tdsync_open,
- .td_queue_read = tdsync_queue_read,
- .td_queue_write = tdsync_queue_write,
- .td_submit = tdsync_submit,
- .td_close = tdsync_close,
- .td_do_callbacks = tdsync_do_callbacks,
- .td_get_parent_id = tdsync_get_parent_id,
- .td_validate_parent = tdsync_validate_parent
-};
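Each request above costs an lseek() followed by a read() or write(). A sketch of the equivalent read step using pread(), which takes the offset explicitly and so leaves the shared file position alone; the helper name is illustrative, and it keeps the driver's convention of returning 1 on success and a negative errno on failure:

    #include <errno.h>
    #include <unistd.h>

    static int sync_pread(int fd, void *buf, size_t size, off_t offset)
    {
        ssize_t done = pread(fd, buf, size, offset);

        if (done < 0)
            return -errno;      /* I/O error */
        if ((size_t)done != size)
            return -EIO;        /* short read, e.g. past end of file */
        return 1;               /* success, as the driver reports it */
    }
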
diff --git a/tools/blktap/drivers/block-vmdk.c b/tools/blktap/drivers/block-vmdk.c
deleted file mode 100644
index 4d16965..0000000
--- a/tools/blktap/drivers/block-vmdk.c
+++ /dev/null
@@ -1,428 +0,0 @@
-/* block-vmdk.c
- *
- * VMware Disk format implementation.
- *
- * (c) 2006 Andrew Warfield and Julian Chesterfield
- *
- * This is largely the same as the vmdk driver in Qemu, I've just twisted it
- * to match our interfaces. The original (BSDish) Copyright message appears
- * below:
- */
-
-/*
- * Block driver for the VMDK format
- *
- * Copyright (c) 2004 Fabrice Bellard
- * Copyright (c) 2005 Filip Navara
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/statvfs.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include <string.h>
-#include "tapdisk.h"
-#include "bswap.h"
-
-/* *BSD has no O_LARGEFILE */
-#ifndef O_LARGEFILE
-#define O_LARGEFILE 0
-#endif
-
-#define safer_free(_x) \
- do { \
- if (NULL != _x) { \
- free(_x); \
- (_x) = NULL; \
- } \
- } while (0)
-
-#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
-#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
-
-typedef struct {
- uint32_t version;
- uint32_t flags;
- uint32_t disk_sectors;
- uint32_t granularity;
- uint32_t l1dir_offset;
- uint32_t l1dir_size;
- uint32_t file_sectors;
- uint32_t cylinders;
- uint32_t heads;
- uint32_t sectors_per_track;
-} VMDK3Header;
-
-typedef struct {
- uint32_t version;
- uint32_t flags;
- int64_t capacity;
- int64_t granularity;
- int64_t desc_offset;
- int64_t desc_size;
- int32_t num_gtes_per_gte;
- int64_t rgd_offset;
- int64_t gd_offset;
- int64_t grain_offset;
- char filler[1];
- char check_bytes[4];
-} __attribute__((packed)) VMDK4Header;
-
-#define L2_CACHE_SIZE 16
-
-struct tdvmdk_state {
- int fd;
- int poll_pipe[2]; /* dummy fd for polling on */
-
- unsigned int l1_size;
- int64_t l1_table_offset;
- int64_t l1_backup_table_offset;
- uint32_t l1_entry_sectors;
- unsigned int l2_size;
-
- uint32_t *l1_table;
- uint32_t *l1_backup_table;
- uint32_t *l2_cache;
- uint32_t l2_cache_offsets[L2_CACHE_SIZE];
- uint32_t l2_cache_counts[L2_CACHE_SIZE];
-
- unsigned int cluster_sectors;
-};
-
-static inline void init_fds(struct disk_driver *dd)
-{
- int i;
- struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
-
- for (i = 0; i < MAX_IOFD; i++)
- dd->io_fd[i] = 0;
-
- dd->io_fd[0] = prv->poll_pipe[0];
-}
-
-/* Open the disk file and initialize aio state. */
-static int tdvmdk_open (struct disk_driver *dd,
- const char *name, td_flag_t flags)
-{
- int ret, fd;
- int l1_size, i, o_flags;
- uint32_t magic;
- struct td_state *s = dd->td_state;
- struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
-
- /* set up a pipe so that we can hand back a poll fd that won't fire.*/
- ret = pipe(prv->poll_pipe);
- if (ret != 0)
- return -1;
-
- /* Open the file */
- o_flags = O_DIRECT | O_LARGEFILE |
- ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
- fd = open(name, o_flags);
-
- if ( (fd == -1) && (errno == EINVAL) ) {
-
- /* Maybe O_DIRECT isn't supported. */
- o_flags &= ~O_DIRECT;
- fd = open(name, o_flags);
- if (fd != -1) DPRINTF("WARNING: Accessing image without "
- "O_DIRECT! (%s)\n", name);
-
- } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
-
- if (fd == -1) {
- DPRINTF("Unable to open [%s]!\n",name);
- ret = 0 - errno;
- return -1;
- }
-
- prv->fd = fd;
-
- /* Grok the vmdk header. */
- if ((ret = read(fd, &magic, sizeof(magic))) != sizeof(magic))
- goto fail;
- magic = be32_to_cpu(magic);
- if (magic == VMDK3_MAGIC) {
- VMDK3Header header;
- if (read(fd, &header, sizeof(header)) !=
- sizeof(header))
- goto fail;
- prv->cluster_sectors = le32_to_cpu(header.granularity);
- prv->l2_size = 1 << 9;
- prv->l1_size = 1 << 6;
- s->size = le32_to_cpu(header.disk_sectors);
- prv->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
- prv->l1_backup_table_offset = 0;
- prv->l1_entry_sectors = prv->l2_size * prv->cluster_sectors;
- } else if (magic == VMDK4_MAGIC) {
- VMDK4Header header;
-
- if (read(fd, &header, sizeof(header)) != sizeof(header))
- goto fail;
- s->size = le32_to_cpu(header.capacity);
- prv->cluster_sectors = le32_to_cpu(header.granularity);
- prv->l2_size = le32_to_cpu(header.num_gtes_per_gte);
- prv->l1_entry_sectors = prv->l2_size * prv->cluster_sectors;
- if (prv->l1_entry_sectors <= 0)
- goto fail;
- prv->l1_size = (s->size + prv->l1_entry_sectors - 1)
- / prv->l1_entry_sectors;
- prv->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
- prv->l1_backup_table_offset =
- le64_to_cpu(header.gd_offset) << 9;
- } else {
- goto fail;
- }
- /* read the L1 table */
- l1_size = prv->l1_size * sizeof(uint32_t);
- prv->l1_table = malloc(l1_size);
- if (!prv->l1_table)
- goto fail;
- if (lseek(fd, prv->l1_table_offset, SEEK_SET) == -1)
- goto fail;
- if (read(fd, prv->l1_table, l1_size) != l1_size)
- goto fail;
- for (i = 0; i < prv->l1_size; i++) {
- le32_to_cpus(&prv->l1_table[i]);
- }
-
- if (prv->l1_backup_table_offset) {
- prv->l1_backup_table = malloc(l1_size);
- if (!prv->l1_backup_table)
- goto fail;
- if (lseek(fd, prv->l1_backup_table_offset, SEEK_SET) == -1)
- goto fail;
- if (read(fd, prv->l1_backup_table, l1_size) != l1_size)
- goto fail;
- for(i = 0; i < prv->l1_size; i++) {
- le32_to_cpus(&prv->l1_backup_table[i]);
- }
- }
-
- prv->l2_cache = malloc(prv->l2_size * L2_CACHE_SIZE *sizeof(uint32_t));
- if (!prv->l2_cache)
- goto fail;
- prv->fd = fd;
- init_fds(dd);
- DPRINTF("VMDK File opened successfully\n");
- return 0;
-
-fail:
- DPRINTF("VMDK File open failed.\n");
- safer_free(prv->l1_backup_table);
- free(prv->l1_table);
- free(prv->l2_cache);
- close(fd);
- return -1;
-}
-
-static uint64_t get_cluster_offset(struct tdvmdk_state *prv,
- uint64_t offset, int allocate)
-{
- unsigned int l1_index, l2_offset, l2_index;
- int min_index, i, j;
- uint32_t min_count, *l2_table, tmp;
- uint64_t cluster_offset;
-
- l1_index = (offset >> 9) / prv->l1_entry_sectors;
- if (l1_index >= prv->l1_size)
- return 0;
- l2_offset = prv->l1_table[l1_index];
- if (!l2_offset)
- return 0;
- for (i = 0; i < L2_CACHE_SIZE; i++) {
- if (l2_offset == prv->l2_cache_offsets[i]) {
- /* increment the hit count */
- if (++prv->l2_cache_counts[i] == 0xffffffff) {
- for(j = 0; j < L2_CACHE_SIZE; j++) {
- prv->l2_cache_counts[j] >>= 1;
- }
- }
- l2_table = prv->l2_cache + (i * prv->l2_size);
- goto found;
- }
- }
- /* not found: load a new entry in the least used one */
- min_index = 0;
- min_count = 0xffffffff;
- for (i = 0; i < L2_CACHE_SIZE; i++) {
- if (prv->l2_cache_counts[i] < min_count) {
- min_count = prv->l2_cache_counts[i];
- min_index = i;
- }
- }
- l2_table = prv->l2_cache + (min_index * prv->l2_size);
- lseek(prv->fd, (int64_t)l2_offset * 512, SEEK_SET);
- if (read(prv->fd, l2_table, prv->l2_size * sizeof(uint32_t)) !=
- prv->l2_size * sizeof(uint32_t))
- return 0;
- prv->l2_cache_offsets[min_index] = l2_offset;
- prv->l2_cache_counts[min_index] = 1;
- found:
- l2_index = ((offset >> 9) / prv->cluster_sectors) % prv->l2_size;
- cluster_offset = le32_to_cpu(l2_table[l2_index]);
- if (!cluster_offset) {
- if (!allocate)
- return 0;
- cluster_offset = lseek(prv->fd, 0, SEEK_END);
- if (ftruncate(prv->fd, cluster_offset +
- (prv->cluster_sectors << 9)))
- return 0;
- cluster_offset >>= 9;
- /* update L2 table */
- tmp = cpu_to_le32(cluster_offset);
- l2_table[l2_index] = tmp;
- lseek(prv->fd, ((int64_t)l2_offset * 512) +
- (l2_index * sizeof(tmp)), SEEK_SET);
- if (write(prv->fd, &tmp, sizeof(tmp)) != sizeof(tmp))
- return 0;
- /* update backup L2 table */
- if (prv->l1_backup_table_offset != 0) {
- l2_offset = prv->l1_backup_table[l1_index];
- lseek(prv->fd, ((int64_t)l2_offset * 512) +
- (l2_index * sizeof(tmp)), SEEK_SET);
- if (write(prv->fd, &tmp, sizeof(tmp)) != sizeof(tmp))
- return 0;
- }
- }
- cluster_offset <<= 9;
- return cluster_offset;
-}
-
-static int tdvmdk_queue_read(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
- int index_in_cluster, n;
- uint64_t cluster_offset;
- int ret = 0;
-
- while (nb_sectors > 0) {
- cluster_offset = get_cluster_offset(prv, sector << 9, 0);
- index_in_cluster = sector % prv->cluster_sectors;
- n = prv->cluster_sectors - index_in_cluster;
- if (n > nb_sectors)
- n = nb_sectors;
- if (!cluster_offset) {
- memset(buf, 0, 512 * n);
- } else {
- lseek(prv->fd, cluster_offset + index_in_cluster * 512,
- SEEK_SET);
- ret = read(prv->fd, buf, n * 512);
- if (ret != n * 512) {
- ret = -1;
- goto done;
- }
- }
- nb_sectors -= n;
- sector += n;
- buf += n * 512;
- }
-done:
- return cb(dd, ret == -1 ? -1 : 0, sector, nb_sectors, id, private);
-}
-
-static int tdvmdk_queue_write(struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *private)
-{
- struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
- int index_in_cluster, n;
- uint64_t cluster_offset;
- int ret = 0;
-
- while (nb_sectors > 0) {
- index_in_cluster = sector & (prv->cluster_sectors - 1);
- n = prv->cluster_sectors - index_in_cluster;
- if (n > nb_sectors)
- n = nb_sectors;
- cluster_offset = get_cluster_offset(prv, sector << 9, 1);
- if (!cluster_offset) {
- ret = -1;
- goto done;
- }
- lseek(prv->fd, cluster_offset + index_in_cluster * 512,
- SEEK_SET);
- ret = write(prv->fd, buf, n * 512);
- if (ret != n * 512) {
- ret = -1;
- goto done;
- }
- nb_sectors -= n;
- sector += n;
- buf += n * 512;
- }
-done:
- return cb(dd, ret == -1 ? -1 : 0, sector, nb_sectors, id, private);
-}
-
-static int tdvmdk_submit(struct disk_driver *dd)
-{
- return 0;
-}
-
-static int tdvmdk_close(struct disk_driver *dd)
-{
- struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
-
- safer_free(prv->l1_table);
- safer_free(prv->l1_backup_table);
- safer_free(prv->l2_cache);
- close(prv->fd);
- close(prv->poll_pipe[0]);
- close(prv->poll_pipe[1]);
- return 0;
-}
-
-static int tdvmdk_do_callbacks(struct disk_driver *dd, int sid)
-{
- /* always ask for a kick */
- return 1;
-}
-
-static int tdvmdk_get_parent_id(struct disk_driver *dd, struct disk_id *id)
-{
- return TD_NO_PARENT;
-}
-
-static int tdvmdk_validate_parent(struct disk_driver *dd,
- struct disk_driver *parent, td_flag_t flags)
-{
- return -EINVAL;
-}
-
-struct tap_disk tapdisk_vmdk = {
- .disk_type = "tapdisk_vmdk",
- .private_data_size = sizeof(struct tdvmdk_state),
- .td_open = tdvmdk_open,
- .td_queue_read = tdvmdk_queue_read,
- .td_queue_write = tdvmdk_queue_write,
- .td_submit = tdvmdk_submit,
- .td_close = tdvmdk_close,
- .td_do_callbacks = tdvmdk_do_callbacks,
- .td_get_parent_id = tdvmdk_get_parent_id,
- .td_validate_parent = tdvmdk_validate_parent
-};
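get_cluster_offset() above keeps a 16-slot cache of L2 tables: a hit increments a per-slot counter, all counters are halved when one saturates so old hits decay, and a miss evicts the slot with the lowest count. A stand-alone model of just that replacement policy (slot and tag names are illustrative; tag 0 is treated as "empty", just as a zero l2_offset is never cached by the driver):

    #include <stdint.h>

    #define NSLOTS 16

    static uint32_t tags[NSLOTS];
    static uint32_t counts[NSLOTS];

    /* Returns the slot holding `tag`, loading it into the least-used
     * slot on a miss (the caller would read the L2 table there). */
    static int cache_slot(uint32_t tag)
    {
        uint32_t min = UINT32_MAX;
        int i, victim = 0;

        for (i = 0; i < NSLOTS; i++) {
            if (tags[i] == tag) {
                if (++counts[i] == UINT32_MAX) {
                    int j;
                    for (j = 0; j < NSLOTS; j++)
                        counts[j] >>= 1;   /* decay all hit counts */
                }
                return i;                  /* hit */
            }
        }
        for (i = 0; i < NSLOTS; i++) {     /* miss: find least-used slot */
            if (counts[i] < min) {
                min = counts[i];
                victim = i;
            }
        }
        tags[victim] = tag;
        counts[victim] = 1;
        return victim;
    }
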
diff --git a/tools/blktap/drivers/bswap.h b/tools/blktap/drivers/bswap.h
deleted file mode 100644
index 5578913..0000000
--- a/tools/blktap/drivers/bswap.h
+++ /dev/null
@@ -1,178 +0,0 @@
-#ifndef BSWAP_H
-#define BSWAP_H
-
-//#include "config-host.h"
-
-#include <inttypes.h>
-
-#if defined(__NetBSD__)
-#include <sys/endian.h>
-#include <sys/types.h>
-#elif defined(__OpenBSD__)
-#include <machine/endian.h>
-#define bswap_16(x) swap16(x)
-#define bswap_32(x) swap32(x)
-#define bswap_64(x) swap64(x)
-#elif defined(__linux__)
-
-#include <byteswap.h>
-
-static inline uint16_t bswap16(uint16_t x)
-{
- return bswap_16(x);
-}
-
-static inline uint32_t bswap32(uint32_t x)
-{
- return bswap_32(x);
-}
-
-static inline uint64_t bswap64(uint64_t x)
-{
- return bswap_64(x);
-}
-
-static inline void bswap16s(uint16_t *s)
-{
- *s = bswap16(*s);
-}
-
-static inline void bswap32s(uint32_t *s)
-{
- *s = bswap32(*s);
-}
-
-static inline void bswap64s(uint64_t *s)
-{
- *s = bswap64(*s);
-}
-
-#endif
-
-#if defined(WORDS_BIGENDIAN)
-#define be_bswap(v, size) (v)
-#define le_bswap(v, size) bswap ## size(v)
-#define be_bswaps(v, size)
-#define le_bswaps(p, size) *p = bswap ## size(*p);
-#else
-#define le_bswap(v, size) (v)
-#define be_bswap(v, size) bswap ## size(v)
-#define le_bswaps(v, size)
-#define be_bswaps(p, size) *p = bswap ## size(*p);
-#endif
-
-#define CPU_CONVERT(endian, size, type)\
-static inline type endian ## size ## _to_cpu(type v)\
-{\
- return endian ## _bswap(v, size);\
-}\
-\
-static inline type cpu_to_ ## endian ## size(type v)\
-{\
- return endian ## _bswap(v, size);\
-}\
-\
-static inline void endian ## size ## _to_cpus(type *p)\
-{\
- endian ## _bswaps(p, size)\
-}\
-\
-static inline void cpu_to_ ## endian ## size ## s(type *p)\
-{\
- endian ## _bswaps(p, size)\
-}\
-\
-static inline type endian ## size ## _to_cpup(const type *p)\
-{\
- return endian ## size ## _to_cpu(*p);\
-}\
-\
-static inline void cpu_to_ ## endian ## size ## w(type *p, type v)\
-{\
- *p = cpu_to_ ## endian ## size(v);\
-}
-
-CPU_CONVERT(be, 16, uint16_t)
-CPU_CONVERT(be, 32, uint32_t)
-CPU_CONVERT(be, 64, uint64_t)
-
-CPU_CONVERT(le, 16, uint16_t)
-CPU_CONVERT(le, 32, uint32_t)
-CPU_CONVERT(le, 64, uint64_t)
-
-/* unaligned versions (optimized for frequent unaligned accesses)*/
-
-#if defined(__i386__) || defined(__powerpc__)
-
-#define cpu_to_le16wu(p, v) cpu_to_le16w(p, v)
-#define cpu_to_le32wu(p, v) cpu_to_le32w(p, v)
-#define le16_to_cpupu(p) le16_to_cpup(p)
-#define le32_to_cpupu(p) le32_to_cpup(p)
-
-#define cpu_to_be16wu(p, v) cpu_to_be16w(p, v)
-#define cpu_to_be32wu(p, v) cpu_to_be32w(p, v)
-
-#else
-
-static inline void cpu_to_le16wu(uint16_t *p, uint16_t v)
-{
- uint8_t *p1 = (uint8_t *)p;
-
- p1[0] = v;
- p1[1] = v >> 8;
-}
-
-static inline void cpu_to_le32wu(uint32_t *p, uint32_t v)
-{
- uint8_t *p1 = (uint8_t *)p;
-
- p1[0] = v;
- p1[1] = v >> 8;
- p1[2] = v >> 16;
- p1[3] = v >> 24;
-}
-
-static inline uint16_t le16_to_cpupu(const uint16_t *p)
-{
- const uint8_t *p1 = (const uint8_t *)p;
- return p1[0] | (p1[1] << 8);
-}
-
-static inline uint32_t le32_to_cpupu(const uint32_t *p)
-{
- const uint8_t *p1 = (const uint8_t *)p;
- return p1[0] | (p1[1] << 8) | (p1[2] << 16) | (p1[3] << 24);
-}
-
-static inline void cpu_to_be16wu(uint16_t *p, uint16_t v)
-{
- uint8_t *p1 = (uint8_t *)p;
-
- p1[0] = v >> 8;
- p1[1] = v;
-}
-
-static inline void cpu_to_be32wu(uint32_t *p, uint32_t v)
-{
- uint8_t *p1 = (uint8_t *)p;
-
- p1[0] = v >> 24;
- p1[1] = v >> 16;
- p1[2] = v >> 8;
- p1[3] = v;
-}
-
-#endif
-
-#ifdef WORDS_BIGENDIAN
-#define cpu_to_32wu cpu_to_be32wu
-#else
-#define cpu_to_32wu cpu_to_le32wu
-#endif
-
-#undef le_bswap
-#undef be_bswap
-#undef le_bswaps
-#undef be_bswaps
-
-#endif /* BSWAP_H */
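Each CPU_CONVERT() expansion generates the whole helper family for one endianness and width: value form (be32_to_cpu), in-place form (be32_to_cpus), pointer form, and store form. A short usage sketch, assuming this header is available on the include path; the VMDK4 magic from block-vmdk.c serves as the big-endian example:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include "bswap.h"

    int main(void)
    {
        /* the VMDK4 magic as it sits on disk: big-endian "KDMV" */
        const unsigned char on_disk[4] = { 'K', 'D', 'M', 'V' };
        uint32_t raw, host;

        memcpy(&raw, on_disk, sizeof(raw));
        host = be32_to_cpu(raw);            /* value form */
        printf("magic = 0x%08x\n", host);   /* 0x4b444d56 on any host */

        be32_to_cpus(&raw);                 /* in-place form, same result */
        return 0;
    }
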
diff --git a/tools/blktap/drivers/img2qcow.c b/tools/blktap/drivers/img2qcow.c
deleted file mode 100644
index 6b4fa70..0000000
--- a/tools/blktap/drivers/img2qcow.c
+++ /dev/null
@@ -1,282 +0,0 @@
-/* img2qcow.c
- *
- * Generates a qcow format disk and fills it from an existing image.
- *
- * (c) 2006 Julian Chesterfield and Andrew Warfield
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/statvfs.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include <string.h>
-#include "tapdisk.h"
-#include "blk.h"
-
-#if 1
-#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
-#else
-#define DFPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* *BSD has no O_LARGEFILE */
-#ifndef O_LARGEFILE
-#define O_LARGEFILE 0
-#endif
-
-
-#define TAPDISK 1
-#define BLOCK_PROCESSSZ 4096
-
-static int maxfds, *io_fd, running = 1, complete = 0;
-static int returned_events = 0, submit_events = 0;
-static uint64_t prev = 0;
-static char output[25];
-
-static void print_bytes(void *ptr, int length)
-{
- int k;
- unsigned char *p = ptr;
-
- DFPRINTF("Buf dump, length %d:\n",length);
- for (k = 0; k < length; k++) {
- DFPRINTF("%x",*p);
- p++;
- if(k % 16 == 0) DFPRINTF("\n");
- else if(k % 2 == 0) DFPRINTF(" ");
- }
- DFPRINTF("\n");
- return;
-}
-
-static void debug_output(uint64_t progress, uint64_t size)
-{
- uint64_t blocks = size/20;
-
- /*Output progress every 5% */
- if (progress/blocks > prev) {
- memcpy(output+prev+1,"=>",2);
- prev++;
- DFPRINTF("\r%s %llu%%", output,
- (unsigned long long)(prev - 1) * 5);
- }
- return;
-}
-
-static inline void LOCAL_FD_SET(fd_set *readfds)
-{
- FD_SET(io_fd[0], readfds);
- maxfds = io_fd[0] + 1;
-
- return;
-}
-
-static int get_image_info(struct td_state *s, int fd)
-{
- int ret;
- long size;
- unsigned long total_size;
- struct statvfs statBuf;
- struct stat stat;
-
- ret = fstat(fd, &stat);
- if (ret != 0) {
- DFPRINTF("ERROR: fstat failed, Couldn't stat image");
- return -EINVAL;
- }
-
- if (S_ISBLK(stat.st_mode)) {
- /*Accessing block device directly*/
- if (blk_getimagesize(fd, &s->size) != 0)
- return -EINVAL;
-
- DFPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
- "sector_shift [%llu]\n",
- (long long unsigned)(s->size << SECTOR_SHIFT),
- (long long unsigned)s->size);
-
- /*Get the sector size*/
- if (blk_getsectorsize(fd, &s->sector_size) != 0)
- s->sector_size = DEFAULT_SECTOR_SIZE;
-
- } else {
- /*Local file? try fstat instead*/
- s->size = (stat.st_size >> SECTOR_SHIFT);
- s->sector_size = DEFAULT_SECTOR_SIZE;
- DFPRINTF("Image size: [%llu]\n",
- (long long unsigned)s->size);
- }
-
- return 0;
-}
-
-static int send_responses(struct disk_driver *dd, int res, uint64_t sec,
- int nr_secs, int idx, void *private)
-{
- if (res < 0) DFPRINTF("AIO FAILURE: res [%d]!\n",res);
-
- returned_events++;
-
- free(private);
- return 0;
-}
-
-int main(int argc, char *argv[])
-{
- struct disk_driver dd;
- struct td_state *s;
- int ret = -1, fd, len;
- fd_set readfds;
- struct timeval timeout;
- uint64_t i;
- char *buf;
-
- if (argc != 3) {
- fprintf(stderr, "Qcow-utils: v1.0.0\n");
- fprintf(stderr, "usage: %s <QCOW FILENAME> <SRC IMAGE>\n",
- argv[0]);
- exit(-1);
- }
-
- s = malloc(sizeof(struct td_state));
-
- /*Open image*/
- fd = open(argv[2], O_RDONLY | O_LARGEFILE);
-
- if (fd == -1) {
- DFPRINTF("Unable to open [%s], (err %d)!\n",argv[2],0 - errno);
- exit(-1);
- }
-
- get_image_info(s, fd);
-
- /*Create qcow file*/
- ret = qcow_create(argv[1],s->size<<SECTOR_SHIFT,NULL,0);
-
- if (ret < 0) {
- DFPRINTF("Unable to create QCOW file\n");
- exit(-1);
- } else DFPRINTF("Qcow file created: size %llu sectors\n",
- (long long unsigned)s->size);
-
- dd.td_state = s;
- dd.drv = &tapdisk_qcow;
- dd.private = malloc(dd.drv->private_data_size);
-
- /*Open qcow file*/
- if (dd.drv->td_open(&dd, argv[1], 0)!=0) {
- DFPRINTF("Unable to open Qcow file [%s]\n",argv[1]);
- exit(-1);
- }
-
- io_fd = dd.io_fd;
-
- /*Initialise the output string*/
- memset(output,0x20,25);
- output[0] = '[';
- output[22] = ']';
- output[23] = '\0';
- DFPRINTF("%s",output);
-
- i = 0;
- while (running) {
- timeout.tv_sec = 0;
-
- if (!complete) {
- /*Read sector from image*/
- if (lseek(fd, i, SEEK_SET) == (off_t)-1) {
- DFPRINTF("Unable to access file offset %llu\n",
- (long long)i);
- exit(-1);
- }
-
- if ((ret = posix_memalign((void **)&buf,
- BLOCK_PROCESSSZ,
- BLOCK_PROCESSSZ)) != 0) {
- DFPRINTF("Unable to read memalign buf (%d)\n",ret);
- exit(-1);
- }
-
- /*We attempt to read 4k sized blocks*/
- len = read(fd, buf, BLOCK_PROCESSSZ);
- if (len < 512) {
- DFPRINTF("Unable to read sector %llu\n",
- (long long unsigned) (i >> 9));
- complete = 1;
- continue;
- }
-
- if (len % 512) {
- len = (len >> 9) << 9;
- }
-
- ret = dd.drv->td_queue_write(&dd, i >> 9,
- len >> 9, buf,
- send_responses, 0, buf);
-
- if (!ret) submit_events++;
-
- if (ret < 0) {
- DFPRINTF("UNABLE TO WRITE block [%llu]\n",
- (long long unsigned) (i >> 9));
- } else i += len;
-
- if (i >> 9 == s->size) complete = 1;
-
- debug_output(i,s->size << 9);
-
- if ((submit_events % 10 == 0) || complete)
- dd.drv->td_submit(&dd);
- timeout.tv_usec = 0;
-
- } else {
- timeout.tv_usec = 1000;
- if (!submit_events) running = 0;
- }
-
-
- /*Check AIO FD*/
- LOCAL_FD_SET(&readfds);
- ret = select(maxfds + 1, &readfds, (fd_set *) 0,
- (fd_set *) 0, &timeout);
-
- if (ret > 0) dd.drv->td_do_callbacks(&dd, 0);
- if (complete && (returned_events == submit_events))
- running = 0;
- }
- memcpy(output+prev+1,"=",1);
- DFPRINTF("\r%s 100%%\nTRANSFER COMPLETE\n\n", output);
- dd.drv->td_close(&dd);
- free(dd.private);
- free(s);
-
- return 0;
-}
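The copy loop above allocates one 4 KB-aligned buffer per request (the alignment matters for O_DIRECT-style AIO) and rounds short reads down to whole sectors. The same step in isolation, as a sketch: the helper name is illustrative, and pread() stands in for the lseek()+read() pair:

    #include <stdlib.h>
    #include <sys/types.h>
    #include <unistd.h>

    #define BLOCK_PROCESSSZ 4096

    /* Read one block at `offset`, 4 KB-aligned for O_DIRECT-style I/O.
     * Returns the buffer (freed by the caller once the queued write
     * completes, as the driver's callback does) or NULL at end of
     * input or on error. */
    static char *read_block(int fd, off_t offset, ssize_t *len)
    {
        char *buf;

        if (posix_memalign((void **)&buf, BLOCK_PROCESSSZ, BLOCK_PROCESSSZ))
            return NULL;

        *len = pread(fd, buf, BLOCK_PROCESSSZ, offset);
        if (*len < 512) {            /* EOF, error, or under one sector */
            free(buf);
            return NULL;
        }
        *len -= *len % 512;          /* only submit whole sectors */
        return buf;
    }
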
diff --git a/tools/blktap/drivers/qcow-create.c b/tools/blktap/drivers/qcow-create.c
deleted file mode 100644
index 25abfcd..0000000
--- a/tools/blktap/drivers/qcow-create.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/* qcow-create.c
- *
- * Generates a qcow format disk.
- *
- * (c) 2006 Andrew Warfield and Julian Chesterfield
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/statvfs.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include <string.h>
-#include "tapdisk.h"
-
-#if 1
-#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
-#else
-#define DFPRINTF(_f, _a...) ((void)0)
-#endif
-
-#define MAX_NAME_LEN 1000
-
-static void help(void)
-{
- fprintf(stderr, "Qcow-utils: v1.0.0\n");
- fprintf(stderr,
- "usage: qcow-create [-h help] [-r reserve] [-f format] <SIZE(MB)> <FILENAME> "
- "[<BACKING_FILENAME>]\n");
- exit(-1);
-}
-
-int main(int argc, char *argv[])
-{
- int ret = -1, c, backed = 0;
- int sparse = 1;
- char *fmt = "qcow";
- uint64_t size;
- char filename[MAX_NAME_LEN], bfilename[MAX_NAME_LEN];
- char *tmpfile;
-
- for(;;) {
- c = getopt(argc, argv, "hrf");
- if (c == -1)
- break;
- switch(c) {
- case 'h':
- help();
- exit(0);
- break;
- case 'f':
- fmt = argv[optind++];
- break;
- case 'r':
- sparse = 0;
- break;
- default:
- fprintf(stderr, "Unknown option\n");
- help();
- }
- }
-
- printf("Optind %d, argc %d\n", optind, argc);
- if ( !(optind == (argc - 2) || optind == (argc - 3)) )
- help();
-
- size = atoi(argv[optind++]);
- size = size << 20;
-
- if (snprintf(filename, MAX_NAME_LEN, "%s",argv[optind++]) >=
- MAX_NAME_LEN) {
- fprintf(stderr,"Device name too long\n");
- exit(-1);
- }
-
- if (optind != argc) {
- /*Backing file argument*/
- backed = 1;
- if (snprintf(bfilename, MAX_NAME_LEN, "%s",argv[optind++]) >=
- MAX_NAME_LEN) {
- fprintf(stderr,"Device name too long\n");
- exit(-1);
- }
- }
-
- tmpfile = backed ? bfilename: NULL;
- if (!strcmp(fmt, "qcow")) {
- ret = qcow_create(filename, size, tmpfile, sparse);
- } else if(!strcmp(fmt, "qcow2")) {
- ret = qcow2_create(filename, size, tmpfile, sparse);
- } else {
- fprintf(stderr,"Unsupport format:%s\n", fmt);
- exit(-1);
- }
- DFPRINTF("Creating file size %llu, name %s\n",(long long unsigned)size, filename);
-
- if (ret < 0)
- DPRINTF("Unable to create QCOW file\n");
- else
- DPRINTF("QCOW file successfully created\n");
-
- return 0;
-}
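getopt() is invoked with the option string "hrf", so -f cannot receive its argument through optarg and the format string is pulled out of argv by hand instead. The conventional pattern declares the option as "f:"; a minimal sketch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(int argc, char *argv[])
    {
        int c, sparse = 1;
        const char *fmt = "qcow";

        while ((c = getopt(argc, argv, "hrf:")) != -1) {
            switch (c) {
            case 'f':
                fmt = optarg;        /* getopt delivers the argument */
                break;
            case 'r':
                sparse = 0;
                break;
            case 'h':
            default:
                fprintf(stderr, "usage: %s [-h] [-r] [-f format] ...\n",
                        argv[0]);
                exit(1);
            }
        }
        printf("format=%s sparse=%d\n", fmt, sparse);
        return 0;
    }
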
diff --git a/tools/blktap/drivers/qcow2raw.c b/tools/blktap/drivers/qcow2raw.c
deleted file mode 100644
index 0fa88c1..0000000
--- a/tools/blktap/drivers/qcow2raw.c
+++ /dev/null
@@ -1,348 +0,0 @@
-/* qcow2raw.c
- *
- * Generates raw image data from an existing qcow image
- *
- * (c) 2006 Julian Chesterfield and Andrew Warfield
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <unistd.h>
-#include <sys/statvfs.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include <string.h>
-#include "tapdisk.h"
-#include "blk.h"
-
-#if 1
-#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
-#else
-#define DFPRINTF(_f, _a...) ((void)0)
-#endif
-
-
-/* *BSD has no O_LARGEFILE */
-#ifndef O_LARGEFILE
-#define O_LARGEFILE 0
-#endif
-
-#define TAPDISK 1
-#define BLOCK_PROCESSSZ 4096
-
-static int maxfds, *qcowio_fd, *aio_fd, running = 1, complete = 0;
-static int returned_read_events = 0, returned_write_events = 0;
-static int submit_events = 0;
-static uint32_t read_idx = 0, write_idx = 0;
-struct disk_driver ddqcow, ddaio;
-static uint64_t prev = 0, written = 0;
-static char output[25];
-
-static void print_bytes(void *ptr, int length)
-{
- int i,k;
- unsigned char *p = ptr;
-
- DFPRINTF("Buf dump, length %d:\n",length);
- for (k = 0; k < length; k++) {
-		DFPRINTF("%02x", *p);
-		p++;
- if (k % 16 == 0) DFPRINTF("\n");
- else if (k % 2 == 0) DFPRINTF(" ");
- }
- DFPRINTF("\n");
- return;
-}
-
-static void debug_output(uint64_t progress, uint64_t size)
-{
- /*Output progress every 5% */
- uint64_t blocks = size/20;
-
- if (progress/blocks > prev) {
- memcpy(output+prev+1,"=>",2);
- prev++;
- DFPRINTF("\r%s %llu%%",
- output, (long long)((prev-1)*5));
- }
- return;
-}
-
-static inline void LOCAL_FD_SET(fd_set *readfds)
-{
- FD_SET(qcowio_fd[0], readfds);
- FD_SET(aio_fd[0], readfds);
-
- maxfds = (qcowio_fd[0] > aio_fd[0] ? qcowio_fd[0] : aio_fd[0]) + 1;
-
- return;
-}
-
-static int send_write_responses(struct disk_driver *dd, int res, uint64_t sec,
- int nr_secs, int idx, void *private)
-{
- if (res < 0) {
- DFPRINTF("AIO FAILURE: res [%d]!\n",res);
- return 0;
- }
- written += BLOCK_PROCESSSZ;
- returned_write_events++;
- write_idx = idx;
-
- debug_output(written, dd->td_state->size << 9);
- free(private);
- return 0;
-}
-
-static int send_read_responses(struct disk_driver *dd, int res, uint64_t sec,
- int nr_secs, int idx, void *private)
-{
- int ret;
-
- if (res < 0) DFPRINTF("AIO FAILURE: res [%d]!\n",res);
-
- returned_read_events++;
- read_idx = idx;
-
- ret = ddaio.drv->td_queue_write(&ddaio, idx, BLOCK_PROCESSSZ>>9, private,
- send_write_responses, idx, private);
- if (ret != 0) {
- DFPRINTF("ERROR in submitting queue write!\n");
- return 0;
- }
-
- if ( (returned_read_events == submit_events) ||
- (returned_read_events % 10 == 0) ) {
- ddaio.drv->td_submit(&ddaio);
- }
-
- return 0;
-}
-
-int main(int argc, char *argv[])
-{
-	int ret = -1, fd;
- uint64_t size;
- fd_set readfds;
- struct timeval timeout;
- uint64_t i;
- char *buf;
- struct stat finfo;
-
- if (argc != 3) {
- fprintf(stderr, "Qcow-utils: v1.0.0\n");
- fprintf(stderr, "usage: %s <Dest File descriptor> "
- "<Qcow SRC IMAGE>\n",
- argv[0]);
- exit(-1);
- }
-
- ddqcow.td_state = malloc(sizeof(struct td_state));
- ddaio.td_state = malloc(sizeof(struct td_state));
-
- /*Open qcow source file*/
- ddqcow.drv = &tapdisk_qcow;
- ddqcow.private = malloc(ddqcow.drv->private_data_size);
-
- if (ddqcow.drv->td_open(&ddqcow, argv[2], TD_RDONLY)!=0) {
- DFPRINTF("Unable to open Qcow file [%s]\n",argv[2]);
- exit(-1);
- } else DFPRINTF("QCOW file opened, size %llu\n",
- (long long unsigned)ddqcow.td_state->size);
-
- qcowio_fd = ddqcow.io_fd;
-
- /*Setup aio destination file*/
- ret = stat(argv[1],&finfo);
- if (ret == -1) {
- /*Check errno*/
- switch(errno) {
- case ENOENT:
- /*File doesn't exist, create*/
- fd = open(argv[1],
- O_RDWR | O_LARGEFILE | O_CREAT, 0644);
- if (fd < 0) {
- DFPRINTF("ERROR creating file [%s] "
- "(errno %d)\n",
- argv[1], 0 - errno);
- exit(-1);
- }
- if (ftruncate(fd, (off_t)ddqcow.td_state->size<<9) < 0) {
- DFPRINTF("Unable to create file "
- "[%s] of size %llu (errno %d). "
- "Exiting...\n",
- argv[1],
- (long long unsigned)ddqcow.td_state->size<<9,
- 0 - errno);
- close(fd);
- exit(-1);
- }
- close(fd);
- break;
- case ENXIO:
- DFPRINTF("ERROR Device [%s] does not exist\n",argv[1]);
- exit(-1);
- default:
- DFPRINTF("An error occurred opening Device [%s] "
- "(errno %d)\n",
- argv[1], 0 - errno);
- exit(-1);
- }
- } else {
- fprintf(stderr, "WARNING: All existing data in "
- "%s will be overwritten.\nDo you wish to continue? "
- "(y or n) ",
- argv[1]);
- if (getchar() != 'y') {
- DFPRINTF("Exiting...\n");
- exit(-1);
- }
-
- /*TODO - Test the existing file or device for adequate space*/
- fd = open(argv[1], O_RDWR | O_LARGEFILE);
- if (fd < 0) {
- DFPRINTF("ERROR: opening file [%s] (errno %d)\n",
- argv[1], 0 - errno);
- exit(-1);
- }
-
- if (S_ISBLK(finfo.st_mode)) {
- if (blk_getimagesize(fd, &size) != 0) {
- close(fd);
- return -1;
- }
-
- if (size < ddqcow.td_state->size<<9) {
- DFPRINTF("ERROR: Not enough space on device "
- "%s (%"PRIu64" bytes available, "
-					 "%llu bytes required)\n",
- argv[1], size,
- (long long unsigned)ddqcow.td_state->size<<9);
- close(fd);
- exit(-1);
- }
- } else {
- if (ftruncate(fd, (off_t)ddqcow.td_state->size<<9) < 0) {
- DFPRINTF("Unable to create file "
- "[%s] of size %llu (errno %d). "
- "Exiting...\n",
- argv[1],
- (long long unsigned)ddqcow.td_state->size<<9,
- 0 - errno);
- close(fd);
- exit(-1);
- } else DFPRINTF("File [%s] truncated to length %llu "
- "(%llu)\n",
- argv[1],
- (long long unsigned)ddqcow.td_state->size<<9,
- (long long unsigned)ddqcow.td_state->size);
- }
- close(fd);
- }
-
- /*Open aio destination file*/
- ddaio.drv = &tapdisk_aio;
- ddaio.private = malloc(ddaio.drv->private_data_size);
-
- if (ddaio.drv->td_open(&ddaio, argv[1], 0)!=0) {
-		DFPRINTF("Unable to open destination file [%s]\n", argv[1]);
- exit(-1);
- }
-
- aio_fd = ddaio.io_fd;
-
- /*Initialise the output string*/
- memset(output,0x20,25);
- output[0] = '[';
- output[22] = ']';
- output[23] = '\0';
- DFPRINTF("%s",output);
-
- i = 0;
- while (running) {
- timeout.tv_sec = 0;
-
- if (!complete) {
- /*Read Pages from qcow image*/
- if ( (ret = posix_memalign((void **)&buf,
- BLOCK_PROCESSSZ,
- BLOCK_PROCESSSZ))
- != 0) {
- DFPRINTF("Unable to alloc memory (%d)\n",ret);
- exit(-1);
- }
-
- /*Attempt to read 4k sized blocks*/
- submit_events++;
- ret = ddqcow.drv->td_queue_read(&ddqcow, i>>9,
- BLOCK_PROCESSSZ>>9, buf,
- send_read_responses, i>>9, buf);
-
- if (ret < 0) {
- DFPRINTF("UNABLE TO READ block [%llu]\n",
- (long long unsigned)i);
- exit(-1);
- } else {
- i += BLOCK_PROCESSSZ;
- }
-
- if (i >= ddqcow.td_state->size<<9) {
- complete = 1;
- }
-
- if ((submit_events % 10 == 0) || complete)
- ddqcow.drv->td_submit(&ddqcow);
- timeout.tv_usec = 0;
-
- } else {
- timeout.tv_usec = 1000;
- if (!submit_events) running = 0;
- }
-
-
- /*Check AIO FD*/
- LOCAL_FD_SET(&readfds);
- ret = select(maxfds + 1, &readfds, (fd_set *) 0,
- (fd_set *) 0, &timeout);
-
- if (ret > 0) {
- if (FD_ISSET(qcowio_fd[0], &readfds))
- ddqcow.drv->td_do_callbacks(&ddqcow, 0);
- if (FD_ISSET(aio_fd[0], &readfds))
- ddaio.drv->td_do_callbacks(&ddaio, 0);
- }
- if (complete && (returned_write_events == submit_events))
- running = 0;
- }
- memcpy(output+prev+1,"=",1);
- DFPRINTF("\r%s 100%%\nTRANSFER COMPLETE\n\n", output);
-
- return 0;
-}
diff --git a/tools/blktap/drivers/tapaio.c b/tools/blktap/drivers/tapaio.c
deleted file mode 100644
index 140c44a..0000000
--- a/tools/blktap/drivers/tapaio.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * Copyright (c) 2006 Andrew Warfield and Julian Chesterfield
- * Copyright (c) 2007 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "tapaio.h"
-#include "tapdisk.h"
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <stdlib.h>
-
-/**
- * We used a kernel patch to return an fd associated with the AIO context
- * so that we can concurrently poll on synchronous and async descriptors.
- * This is signalled by passing 1 as the io context to io_setup.
- */
-#define REQUEST_ASYNC_FD 1
-
-/*
- * If we don't have any way to do epoll on aio events in a normal kernel,
- * wait for aio events in a separate thread and return the completion
- * status via a pipe that can be waited on normally.
- *
- * To keep locking problems between the completion thread and the submit
- * thread to a minimum, there's a handshake which allows only one thread
- * to be doing work on the completion queue at a time:
- *
- * 1) main thread sends completion thread a command via the command pipe;
- * 2) completion thread waits for aio events and returns the number
- * received on the completion pipe
- * 3) main thread processes the received ctx->aio_events events
- * 4) loop back to 1) to let the completion thread refill the aio_events
- * buffer.
- *
- * This workaround needs to disappear once the kernel provides a single
- * mechanism for waiting on both aio and normal fd wakeups.
- */
-static void *
-tap_aio_completion_thread(void *arg)
-{
- tap_aio_internal_context_t *ctx = (tap_aio_internal_context_t *) arg;
- int command;
- int nr_events;
- int rc;
-
- while (1) {
- rc = read(ctx->command_fd[0], &command, sizeof(command));
-
- do {
- rc = io_getevents(ctx->aio_ctx, 1,
- ctx->max_aio_events, ctx->aio_events,
- NULL);
- if (rc) {
- nr_events = rc;
- rc = write(ctx->completion_fd[1], &nr_events,
- sizeof(nr_events));
- }
- } while (!rc);
- }
- return NULL;
-}
-
-void
-tap_aio_continue(tap_aio_internal_context_t *ctx)
-{
- int cmd = 0;
-
- if (!ctx->poll_in_thread)
- return;
-
- if (write(ctx->command_fd[1], &cmd, sizeof(cmd)) < 0)
- DPRINTF("Cannot write to command pipe\n");
-}
-
-static int
-tap_aio_setup(tap_aio_internal_context_t *ctx,
- struct io_event *aio_events,
- int max_aio_events)
-{
- int ret;
-
- ctx->aio_events = aio_events;
- ctx->max_aio_events = max_aio_events;
- ctx->poll_in_thread = 0;
-
- ctx->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
- ret = io_setup(ctx->max_aio_events, &ctx->aio_ctx);
- if (ret < 0 && ret != -EINVAL)
- return ret;
- else if (ret > 0) {
- ctx->pollfd = ret;
- return ctx->pollfd;
- }
-
- ctx->aio_ctx = (io_context_t) 0;
- ret = io_setup(ctx->max_aio_events, &ctx->aio_ctx);
- if (ret < 0)
- return ret;
-
- if ((ret = pipe(ctx->command_fd)) < 0) {
- DPRINTF("Unable to create command pipe\n");
- return -1;
- }
- if ((ret = pipe(ctx->completion_fd)) < 0) {
- DPRINTF("Unable to create completion pipe\n");
- return -1;
- }
-
- if ((ret = pthread_create(&ctx->aio_thread, NULL,
- tap_aio_completion_thread, ctx)) != 0) {
- DPRINTF("Unable to create completion thread\n");
- return -1;
- }
-
- ctx->pollfd = ctx->completion_fd[0];
- ctx->poll_in_thread = 1;
-
- tap_aio_continue(ctx);
-
- return 0;
-}
-
-int
-tap_aio_get_events(tap_aio_internal_context_t *ctx)
-{
- int nr_events = 0;
-
- if (!ctx->poll_in_thread)
- nr_events = io_getevents(ctx->aio_ctx, 1,
- ctx->max_aio_events, ctx->aio_events, NULL);
- else {
- int r;
- r = read(ctx->completion_fd[0], &nr_events, sizeof(nr_events));
- if (r < 0) {
- if (errno == EAGAIN || errno == EINTR)
- return 0;
- /* This is pretty bad, we'll probably spin */
- DPRINTF("Aargh, read completion_fd failed: %s",
- strerror(errno));
- } else if (r != sizeof(nr_events)) {
- /* Should never happen because sizeof(nr_events)
- * fits in the guaranteed atomic pipe write size.
- * Blundering on is slightly nicer than asserting */
- DPRINTF("Aargh, read completion_fd short read %d", r);
- }
- }
-
- return nr_events;
-}
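-
-/*
- * Editor's note: a hypothetical sketch of how a caller is expected to drive
- * the handshake documented above: kick the completion thread with
- * tap_aio_continue(), wait on ctx->pollfd, then drain the event count with
- * tap_aio_get_events().  process_event() is a placeholder, and
- * <sys/select.h> would be needed for select().
- */
-#if 0
-static void example_event_loop(tap_aio_internal_context_t *ctx)
-{
-	fd_set fds;
-	int i, nr;
-
-	tap_aio_continue(ctx);			/* 1) send command */
-	for (;;) {
-		FD_ZERO(&fds);
-		FD_SET(ctx->pollfd, &fds);
-		if (select(ctx->pollfd + 1, &fds, NULL, NULL, NULL) <= 0)
-			continue;
-		nr = tap_aio_get_events(ctx);	/* 2) receive event count */
-		for (i = 0; i < nr; i++)	/* 3) process aio_events */
-			process_event(&ctx->aio_events[i]);
-		tap_aio_continue(ctx);		/* 4) refill the buffer */
-	}
-}
-#endif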
-
-int tap_aio_more_events(tap_aio_internal_context_t *ctx)
-{
- return io_getevents(ctx->aio_ctx, 0,
- ctx->max_aio_events, ctx->aio_events, NULL);
-}
-
-int tap_aio_init(tap_aio_context_t *ctx, uint64_t sectors,
- int max_aio_reqs)
-{
-	int i, ret;
-
- ctx->iocb_list = NULL;
- ctx->pending_aio = NULL;
- ctx->aio_events = NULL;
- ctx->iocb_free = NULL;
- ctx->iocb_queue = NULL;
-
- /*Initialize Locking bitmap*/
- ctx->sector_lock = calloc(1, sectors);
-
- if (!ctx->sector_lock) {
- DPRINTF("Failed to allocate sector lock\n");
- goto fail;
- }
-
-
- /* Initialize AIO */
- ctx->max_aio_reqs = max_aio_reqs;
- ctx->iocb_free_count = ctx->max_aio_reqs;
- ctx->iocb_queued = 0;
-
- if (!(ctx->iocb_list = malloc(sizeof(struct iocb) * ctx->max_aio_reqs)) ||
- !(ctx->pending_aio = malloc(sizeof(struct pending_aio) * ctx->max_aio_reqs)) ||
- !(ctx->aio_events = malloc(sizeof(struct io_event) * ctx->max_aio_reqs)) ||
- !(ctx->iocb_free = malloc(sizeof(struct iocb *) * ctx->max_aio_reqs)) ||
- !(ctx->iocb_queue = malloc(sizeof(struct iocb *) * ctx->max_aio_reqs)))
- {
- DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n",
- ctx->max_aio_reqs);
- goto fail;
- }
-
- ret = tap_aio_setup(&ctx->aio_ctx, ctx->aio_events, ctx->max_aio_reqs);
- if (ret < 0) {
- if (ret == -EAGAIN) {
- DPRINTF("Couldn't setup AIO context. If you are "
- "trying to concurrently use a large number "
- "of blktap-based disks, you may need to "
- "increase the system-wide aio request limit. "
-				"(e.g. 'echo 1048576 > /proc/sys/fs/"
- "aio-max-nr')\n");
- } else {
- DPRINTF("Couldn't setup AIO context.\n");
- }
- goto fail;
- }
-
- for (i=0;i<ctx->max_aio_reqs;i++)
- ctx->iocb_free[i] = &ctx->iocb_list[i];
-
- DPRINTF("AIO state initialised\n");
-
- return 0;
-
-fail:
- return -1;
-}
-
-void tap_aio_free(tap_aio_context_t *ctx)
-{
- if (ctx->sector_lock)
- free(ctx->sector_lock);
- if (ctx->iocb_list)
- free(ctx->iocb_list);
- if (ctx->pending_aio)
- free(ctx->pending_aio);
- if (ctx->aio_events)
- free(ctx->aio_events);
- if (ctx->iocb_free)
- free(ctx->iocb_free);
- if (ctx->iocb_queue)
- free(ctx->iocb_queue);
-}
-
-/*TODO: Fix sector span!*/
-int tap_aio_can_lock(tap_aio_context_t *ctx, uint64_t sector)
-{
- return (ctx->sector_lock[sector] ? 0 : 1);
-}
-
-int tap_aio_lock(tap_aio_context_t *ctx, uint64_t sector)
-{
- return ++ctx->sector_lock[sector];
-}
-
-void tap_aio_unlock(tap_aio_context_t *ctx, uint64_t sector)
-{
- if (!ctx->sector_lock[sector]) return;
-
- --ctx->sector_lock[sector];
- return;
-}
-
-
-int tap_aio_read(tap_aio_context_t *ctx, int fd, int size,
- uint64_t offset, char *buf, td_callback_t cb,
- int id, uint64_t sector, void *private)
-{
- struct iocb *io;
- struct pending_aio *pio;
- long ioidx;
-
- if (ctx->iocb_free_count == 0)
- return -ENOMEM;
-
- io = ctx->iocb_free[--ctx->iocb_free_count];
-
- ioidx = IOCB_IDX(ctx, io);
- pio = &ctx->pending_aio[ioidx];
- pio->cb = cb;
- pio->id = id;
- pio->private = private;
- pio->nb_sectors = size/512;
- pio->buf = buf;
- pio->sector = sector;
-
- io_prep_pread(io, fd, buf, size, offset);
- io->data = (void *)ioidx;
-
- ctx->iocb_queue[ctx->iocb_queued++] = io;
-
- return 0;
-}
-
-int tap_aio_write(tap_aio_context_t *ctx, int fd, int size,
- uint64_t offset, char *buf, td_callback_t cb,
- int id, uint64_t sector, void *private)
-{
- struct iocb *io;
- struct pending_aio *pio;
- long ioidx;
-
- if (ctx->iocb_free_count == 0)
- return -ENOMEM;
-
- io = ctx->iocb_free[--ctx->iocb_free_count];
-
- ioidx = IOCB_IDX(ctx, io);
- pio = &ctx->pending_aio[ioidx];
- pio->cb = cb;
- pio->id = id;
- pio->private = private;
- pio->nb_sectors = size/512;
- pio->buf = buf;
- pio->sector = sector;
-
- io_prep_pwrite(io, fd, buf, size, offset);
- io->data = (void *)ioidx;
-
- ctx->iocb_queue[ctx->iocb_queued++] = io;
-
- return 0;
-}
-
-int tap_aio_submit(tap_aio_context_t *ctx)
-{
- int ret;
-
- if (!ctx->iocb_queued)
- return 0;
-
- ret = io_submit(ctx->aio_ctx.aio_ctx, ctx->iocb_queued, ctx->iocb_queue);
-
- /* XXX: TODO: Handle error conditions here. */
-
- /* Success case: */
- ctx->iocb_queued = 0;
-
- return 0;
-}
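-
-/*
- * Editor's note: a hypothetical sketch of the error handling the XXX above
- * asks for.  io_submit() returns the number of iocbs accepted, or -errno;
- * requeueing any unsubmitted iocbs at the front of the queue is one
- * plausible policy, not necessarily what the original authors intended.
- */
-#if 0
-int tap_aio_submit_checked(tap_aio_context_t *ctx)
-{
-	int ret;
-
-	if (!ctx->iocb_queued)
-		return 0;
-
-	ret = io_submit(ctx->aio_ctx.aio_ctx, ctx->iocb_queued,
-			ctx->iocb_queue);
-	if (ret < 0)
-		return ret;			/* nothing was submitted */
-	if (ret < ctx->iocb_queued)		/* partial submission */
-		memmove(ctx->iocb_queue, ctx->iocb_queue + ret,
-			(ctx->iocb_queued - ret) * sizeof(struct iocb *));
-	ctx->iocb_queued -= ret;
-	return 0;
-}
-#endif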
-
diff --git a/tools/blktap/drivers/tapaio.h b/tools/blktap/drivers/tapaio.h
deleted file mode 100644
index 27d3881..0000000
--- a/tools/blktap/drivers/tapaio.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2006 Andrew Warfield and Julian Chesterfield
- * Copyright (c) 2007 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef __TAPAIO_H__
-#define __TAPAIO_H__
-
-#include <pthread.h>
-#include <libaio.h>
-#include <stdint.h>
-
-#include "tapdisk.h"
-
-#define IOCB_IDX(_ctx, _io) ((_io) - (_ctx)->iocb_list)
-
-struct tap_aio_internal_context {
- io_context_t aio_ctx;
-
- struct io_event *aio_events;
- int max_aio_events;
-
- pthread_t aio_thread;
- int command_fd[2];
- int completion_fd[2];
- int pollfd;
- unsigned int poll_in_thread : 1;
-};
-
-
-typedef struct tap_aio_internal_context tap_aio_internal_context_t;
-
-
-struct pending_aio {
- td_callback_t cb;
- int id;
- void *private;
- int nb_sectors;
- char *buf;
- uint64_t sector;
-};
-
-
-struct tap_aio_context {
- tap_aio_internal_context_t aio_ctx;
-
- int max_aio_reqs;
- struct iocb *iocb_list;
- struct iocb **iocb_free;
- struct pending_aio *pending_aio;
- int iocb_free_count;
- struct iocb **iocb_queue;
- int iocb_queued;
- struct io_event *aio_events;
-
- /* Locking bitmap for AIO reads/writes */
- uint8_t *sector_lock;
-};
-
-typedef struct tap_aio_context tap_aio_context_t;
-
-void tap_aio_continue (tap_aio_internal_context_t *ctx);
-int tap_aio_get_events (tap_aio_internal_context_t *ctx);
-int tap_aio_more_events(tap_aio_internal_context_t *ctx);
-
-
-int tap_aio_init(tap_aio_context_t *ctx, uint64_t sectors,
- int max_aio_reqs);
-void tap_aio_free(tap_aio_context_t *ctx);
-
-int tap_aio_can_lock(tap_aio_context_t *ctx, uint64_t sector);
-int tap_aio_lock(tap_aio_context_t *ctx, uint64_t sector);
-void tap_aio_unlock(tap_aio_context_t *ctx, uint64_t sector);
-
-
-int tap_aio_read(tap_aio_context_t *ctx, int fd, int size,
- uint64_t offset, char *buf, td_callback_t cb,
- int id, uint64_t sector, void *private);
-int tap_aio_write(tap_aio_context_t *ctx, int fd, int size,
- uint64_t offset, char *buf, td_callback_t cb,
- int id, uint64_t sector, void *private);
-int tap_aio_submit(tap_aio_context_t *ctx);
-
-#endif /* __TAPAIO_H__ */
diff --git a/tools/blktap/drivers/tapdisk.c b/tools/blktap/drivers/tapdisk.c
deleted file mode 100644
index 19cd777..0000000
--- a/tools/blktap/drivers/tapdisk.c
+++ /dev/null
@@ -1,872 +0,0 @@
-/* tapdisk.c
- *
- * separate disk process, spawned by blktapctrl. Inherits code from driver
- * plugins
- *
- * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield.
- *
- */
-
-#define MSG_SIZE 4096
-#define TAPDISK
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <string.h>
-#include <signal.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/poll.h>
-#include <unistd.h>
-#include <errno.h>
-#include <pthread.h>
-#include <time.h>
-#include <err.h>
-#include <poll.h>
-#include <sys/statvfs.h>
-#include <sys/ioctl.h>
-#include "blktaplib.h"
-#include "tapdisk.h"
-
-#if 1
-#define ASSERT(_p) \
- if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
- __LINE__, __FILE__); *(int*)0=0; }
-#else
-#define ASSERT(_p) ((void)0)
-#endif
-
-#define INPUT 0
-#define OUTPUT 1
-
-static int maxfds, fds[2], run = 1;
-
-static pid_t process;
-int connected_disks = 0;
-fd_list_entry_t *fd_start = NULL;
-
-int do_cow_read(struct disk_driver *dd, blkif_request_t *req,
- int sidx, uint64_t sector, int nr_secs);
-
-#define td_for_each_disk(tds, drv) \
- for (drv = tds->disks; drv != NULL; drv = drv->next)
-
-static void usage(void)
-{
- fprintf(stderr, "blktap-utils: v1.0.0\n");
- fprintf(stderr, "usage: tapdisk <READ fifo> <WRITE fifo>\n");
- exit(-1);
-}
-
-static void daemonize(void)
-{
- int i;
-
- if (getppid()==1) return; /* already a daemon */
- if (fork() != 0) exit(0);
-
-#if 0
- /*Set new program session ID and close all descriptors*/
- setsid();
- for (i = getdtablesize(); i >= 0; --i) close(i);
-
- /*Send all I/O to /dev/null */
- i = open("/dev/null",O_RDWR);
- dup(i);
- dup(i);
-#endif
- return;
-}
-
-static void free_driver(struct disk_driver *d)
-{
- if (d->name)
- free(d->name);
- if (d->private)
- free(d->private);
- free(d);
-}
-
-static void unmap_disk(struct td_state *s)
-{
- tapdev_info_t *info = s->ring_info;
- struct disk_driver *dd, *tmp;
- fd_list_entry_t *entry;
-
- dd = s->disks;
- while (dd) {
- tmp = dd->next;
- dd->drv->td_close(dd);
- free_driver(dd);
- dd = tmp;
- }
-
- if (info != NULL && info->mem > 0)
- munmap(info->mem, getpagesize() * BLKTAP_MMAP_REGION_SIZE);
-
- entry = s->fd_entry;
- *entry->pprev = entry->next;
- if (entry->next)
- entry->next->pprev = entry->pprev;
-
- close(info->fd);
-
- free(s->fd_entry);
- free(s->blkif);
- free(s->ring_info);
- free(s);
-
- return;
-}
-
-static void sig_handler(int sig)
-{
- /*Received signal to close. If no disks are active, we close app.*/
-
- if (connected_disks < 1) run = 0;
-}
-
-static inline int LOCAL_FD_SET(fd_set *readfds)
-{
- fd_list_entry_t *ptr;
- struct disk_driver *dd;
-
- ptr = fd_start;
- while (ptr != NULL) {
- if (ptr->tap_fd) {
- FD_SET(ptr->tap_fd, readfds);
- td_for_each_disk(ptr->s, dd) {
- if (dd->io_fd[READ])
- FD_SET(dd->io_fd[READ], readfds);
- maxfds = (dd->io_fd[READ] > maxfds ?
- dd->io_fd[READ] : maxfds);
- }
- maxfds = (ptr->tap_fd > maxfds ? ptr->tap_fd : maxfds);
- }
- ptr = ptr->next;
- }
-
- return 0;
-}
-
-static inline fd_list_entry_t *add_fd_entry(int tap_fd, struct td_state *s)
-{
- fd_list_entry_t **pprev, *entry;
-
- DPRINTF("Adding fd_list_entry\n");
-
- /*Add to linked list*/
- s->fd_entry = entry = malloc(sizeof(fd_list_entry_t));
- entry->tap_fd = tap_fd;
- entry->s = s;
- entry->next = NULL;
-
- pprev = &fd_start;
- while (*pprev != NULL)
- pprev = &(*pprev)->next;
-
- *pprev = entry;
- entry->pprev = pprev;
-
- return entry;
-}
-
-static inline struct td_state *get_state(int cookie)
-{
- fd_list_entry_t *ptr;
-
- ptr = fd_start;
- while (ptr != NULL) {
- if (ptr->cookie == cookie) return ptr->s;
- ptr = ptr->next;
- }
- return NULL;
-}
-
-static struct tap_disk *get_driver(int drivertype)
-{
- /* blktapctrl has passed us the driver type */
-
- return dtypes[drivertype]->drv;
-}
-
-static struct td_state *state_init(void)
-{
- int i;
- struct td_state *s;
- blkif_t *blkif;
-
- s = malloc(sizeof(struct td_state));
- blkif = s->blkif = malloc(sizeof(blkif_t));
- s->ring_info = calloc(1, sizeof(tapdev_info_t));
-
- for (i = 0; i < MAX_REQUESTS; i++) {
- blkif->pending_list[i].secs_pending = 0;
- blkif->pending_list[i].submitting = 0;
- }
-
- return s;
-}
-
-static int map_new_dev(struct td_state *s, int minor)
-{
- int tap_fd;
- tapdev_info_t *info = s->ring_info;
- char *devname;
- fd_list_entry_t *ptr;
- int page_size;
-
- if (asprintf(&devname,"%s/%s%d", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor) == -1)
- return -1;
- tap_fd = open(devname, O_RDWR);
- if (tap_fd == -1)
- {
- DPRINTF("open failed on dev %s!",devname);
- goto fail;
- }
- info->fd = tap_fd;
-
- /*Map the shared memory*/
- page_size = getpagesize();
- info->mem = mmap(0, page_size * BLKTAP_MMAP_REGION_SIZE,
- PROT_READ | PROT_WRITE, MAP_SHARED, info->fd, 0);
-	if (info->mem == MAP_FAILED)
- {
- DPRINTF("mmap failed on dev %s!\n",devname);
- goto fail;
- }
-
- /* assign the rings to the mapped memory */
- info->sring = (blkif_sring_t *)((unsigned long)info->mem);
- BACK_RING_INIT(&info->fe_ring, info->sring, page_size);
-
- info->vstart =
- (unsigned long)info->mem + (BLKTAP_RING_PAGES * page_size);
-
- ioctl(info->fd, BLKTAP_IOCTL_SENDPID, process );
- ioctl(info->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
- free(devname);
-
- /*Update the fd entry*/
- ptr = fd_start;
- while (ptr != NULL) {
- if (s == ptr->s) {
- ptr->tap_fd = tap_fd;
- break;
- }
- ptr = ptr->next;
- }
-
- return minor;
-
- fail:
- free(devname);
- return -1;
-}
-
-static struct disk_driver *disk_init(struct td_state *s,
- struct tap_disk *drv,
- char *name, td_flag_t flags)
-{
- struct disk_driver *dd;
-
- dd = calloc(1, sizeof(struct disk_driver));
- if (!dd)
- return NULL;
-
- dd->private = malloc(drv->private_data_size);
- if (!dd->private) {
- free(dd);
- return NULL;
- }
-
- dd->drv = drv;
- dd->td_state = s;
- dd->name = name;
- dd->flags = flags;
-
- return dd;
-}
-
-static int open_disk(struct td_state *s,
- struct tap_disk *drv, char *path, td_flag_t flags)
-{
- int err;
- char *dup;
- td_flag_t pflags;
- struct disk_id id;
- struct disk_driver *d;
-
- dup = strdup(path);
- if (!dup)
- return -ENOMEM;
-
- memset(&id, 0, sizeof(struct disk_id));
- s->disks = d = disk_init(s, drv, dup, flags);
- if (!d)
- return -ENOMEM;
-
- err = drv->td_open(d, path, flags);
- if (err) {
- free_driver(d);
- s->disks = NULL;
- return -ENOMEM;
- }
- pflags = flags | TD_RDONLY;
-
- /* load backing files as necessary */
- while ((err = d->drv->td_get_parent_id(d, &id)) == 0) {
- struct disk_driver *new;
-
-		if (id.drivertype >= MAX_DISK_TYPES ||
- !get_driver(id.drivertype) || !id.name)
- goto fail;
-
- dup = strdup(id.name);
- if (!dup)
- goto fail;
-
- new = disk_init(s, get_driver(id.drivertype), dup, pflags);
- if (!new)
- goto fail;
-
- err = new->drv->td_open(new, new->name, pflags);
- if (err)
- goto fail;
-
- err = d->drv->td_validate_parent(d, new, 0);
- if (err) {
- d->next = new;
- goto fail;
- }
-
- d = d->next = new;
- free(id.name);
- }
-
- s->info |= ((flags & TD_RDONLY) ? VDISK_READONLY : 0);
-
- if (err >= 0)
- return 0;
-
- fail:
- DPRINTF("failed opening disk\n");
- if (id.name)
- free(id.name);
- d = s->disks;
- while (d) {
- struct disk_driver *tmp = d->next;
- d->drv->td_close(d);
- free_driver(d);
- d = tmp;
- }
- s->disks = NULL;
- return -1;
-}
-
-static int read_msg(char *buf)
-{
-	int length, len, msglen;
- char *ptr, *path;
- image_t *img;
- msg_hdr_t *msg;
- msg_newdev_t *msg_dev;
- msg_pid_t *msg_pid;
- struct tap_disk *drv;
- int ret = -1;
- struct td_state *s = NULL;
- fd_list_entry_t *entry;
-
- length = read(fds[READ], buf, MSG_SIZE);
-
- if (length > 0 && length >= sizeof(msg_hdr_t))
- {
- msg = (msg_hdr_t *)buf;
- DPRINTF("Tapdisk: Received msg, len %d, type %d, UID %d\n",
- length,msg->type,msg->cookie);
-
- switch (msg->type) {
- case CTLMSG_PARAMS:
- ptr = buf + sizeof(msg_hdr_t);
- len = (length - sizeof(msg_hdr_t));
- path = calloc(1, len);
-
- memcpy(path, ptr, len);
- DPRINTF("Received CTLMSG_PARAMS: [%s]\n", path);
-
- /*Assign driver*/
- drv = get_driver(msg->drivertype);
- if (drv == NULL)
- goto params_done;
-
- DPRINTF("Loaded driver: name [%s], type [%d]\n",
- drv->disk_type, msg->drivertype);
-
- /* Allocate the disk structs */
- s = state_init();
- if (s == NULL)
- goto params_done;
-
- /*Open file*/
- ret = open_disk(s, drv, path,
- ((msg->readonly) ? TD_RDONLY : 0));
- if (ret)
- goto params_done;
-
- entry = add_fd_entry(0, s);
- entry->cookie = msg->cookie;
- DPRINTF("Entered cookie %d\n", entry->cookie);
-
- memset(buf, 0x00, MSG_SIZE);
-
- params_done:
- if (ret == 0) {
- msglen = sizeof(msg_hdr_t) + sizeof(image_t);
- msg->type = CTLMSG_IMG;
- img = (image_t *)(buf + sizeof(msg_hdr_t));
- img->size = s->size;
- img->secsize = s->sector_size;
- img->info = s->info;
- } else {
- msglen = sizeof(msg_hdr_t);
- msg->type = CTLMSG_IMG_FAIL;
- msg->len = msglen;
- }
- len = write(fds[WRITE], buf, msglen);
- free(path);
- return 1;
-
- case CTLMSG_NEWDEV:
- msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t));
-
- s = get_state(msg->cookie);
- DPRINTF("Retrieving state, cookie %d.....[%s]\n",
- msg->cookie, (s == NULL ? "FAIL":"OK"));
- if (s != NULL) {
- ret = ((map_new_dev(s, msg_dev->devnum)
- == msg_dev->devnum ? 0: -1));
- connected_disks++;
- }
-
- memset(buf, 0x00, MSG_SIZE);
- msglen = sizeof(msg_hdr_t);
- msg->type = (ret == 0 ? CTLMSG_NEWDEV_RSP
- : CTLMSG_NEWDEV_FAIL);
- msg->len = msglen;
-
- len = write(fds[WRITE], buf, msglen);
- return 1;
-
- case CTLMSG_CLOSE:
- s = get_state(msg->cookie);
- if (s) unmap_disk(s);
-
- connected_disks--;
- sig_handler(SIGINT);
-
- return 1;
-
- case CTLMSG_PID:
- memset(buf, 0x00, MSG_SIZE);
- msglen = sizeof(msg_hdr_t) + sizeof(msg_pid_t);
- msg->type = CTLMSG_PID_RSP;
- msg->len = msglen;
-
- msg_pid = (msg_pid_t *)(buf + sizeof(msg_hdr_t));
- process = getpid();
- msg_pid->pid = process;
-
- len = write(fds[WRITE], buf, msglen);
- return 1;
-
- default:
- return 0;
- }
- }
- return 0;
-}
-
-static inline int write_rsp_to_ring(struct td_state *s, blkif_response_t *rsp)
-{
- tapdev_info_t *info = s->ring_info;
- blkif_response_t *rsp_d;
-
- rsp_d = RING_GET_RESPONSE(&info->fe_ring, info->fe_ring.rsp_prod_pvt);
- memcpy(rsp_d, rsp, sizeof(blkif_response_t));
- info->fe_ring.rsp_prod_pvt++;
-
- return 0;
-}
-
-static inline void kick_responses(struct td_state *s)
-{
- tapdev_info_t *info = s->ring_info;
-
- if (info->fe_ring.rsp_prod_pvt != info->fe_ring.sring->rsp_prod)
- {
- RING_PUSH_RESPONSES(&info->fe_ring);
- ioctl(info->fd, BLKTAP_IOCTL_KICK_FE);
- }
-}
-
-static void io_done(struct disk_driver *dd, int sid)
-{
- struct tap_disk *drv = dd->drv;
-
- if (!run) return; /*We have received signal to close*/
-
- if (sid > MAX_IOFD || drv->td_do_callbacks(dd, sid) > 0)
- kick_responses(dd->td_state);
-
- return;
-}
-
-static inline uint64_t
-segment_start(blkif_request_t *req, int sidx)
-{
- int i;
- uint64_t start = req->sector_number;
-
- for (i = 0; i < sidx; i++)
- start += (req->seg[i].last_sect - req->seg[i].first_sect + 1);
-
- return start;
-}
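-
-/*
- * Editor's note: worked example of the helper above.  For a request with
- * sector_number 100 whose first two segments cover 8 and 4 sectors
- * respectively, segment_start(req, 2) = 100 + 8 + 4 = 112.
- */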
-
-uint64_t sends, responds;
-static int send_responses(struct disk_driver *dd, int res,
- uint64_t sector, int nr_secs, int idx, void *private)
-{
- pending_req_t *preq;
- blkif_request_t *req;
- int responses_queued = 0;
- struct td_state *s = dd->td_state;
- blkif_t *blkif = s->blkif;
- int sidx = (int)(long)private, secs_done = nr_secs;
-
- if ( (idx > MAX_REQUESTS-1) )
- {
- DPRINTF("invalid index returned(%u)!\n", idx);
- return 0;
- }
- preq = &blkif->pending_list[idx];
- req = &preq->req;
-
- if (res == BLK_NOT_ALLOCATED) {
- res = do_cow_read(dd, req, sidx, sector, nr_secs);
- if (res >= 0) {
- secs_done = res;
- res = 0;
- } else
- secs_done = 0;
- }
-
- preq->secs_pending -= secs_done;
-
- if (res == -EBUSY && preq->submitting)
- return -EBUSY; /* propagate -EBUSY back to higher layers */
- if (res)
- preq->status = BLKIF_RSP_ERROR;
-
- if (!preq->submitting && preq->secs_pending == 0)
- {
- blkif_request_t tmp;
- blkif_response_t *rsp;
-
- tmp = preq->req;
- rsp = (blkif_response_t *)req;
-
- rsp->id = tmp.id;
- rsp->operation = tmp.operation;
- rsp->status = preq->status;
-
- write_rsp_to_ring(s, rsp);
- responses_queued++;
- }
- return responses_queued;
-}
-
-int do_cow_read(struct disk_driver *dd, blkif_request_t *req,
- int sidx, uint64_t sector, int nr_secs)
-{
- char *page;
- int ret, early;
- uint64_t seg_start, seg_end;
- struct td_state *s = dd->td_state;
- tapdev_info_t *info = s->ring_info;
- struct disk_driver *parent = dd->next;
-
- seg_start = segment_start(req, sidx);
- seg_end = seg_start + req->seg[sidx].last_sect + 1;
-
- ASSERT(sector >= seg_start && sector + nr_secs <= seg_end);
-
- page = (char *)MMAP_VADDR(info->vstart,
- (unsigned long)req->id, sidx);
- page += (req->seg[sidx].first_sect << SECTOR_SHIFT);
- page += ((sector - seg_start) << SECTOR_SHIFT);
-
- if (!parent) {
- memset(page, 0, nr_secs << SECTOR_SHIFT);
- return nr_secs;
- }
-
- /* reissue request to backing file */
- ret = parent->drv->td_queue_read(parent, sector, nr_secs,
- page, send_responses,
- req->id, (void *)(long)sidx);
- if (ret > 0)
- parent->early += ret;
-
- return ((ret >= 0) ? 0 : ret);
-}
-
-static void get_io_request(struct td_state *s)
-{
- RING_IDX rp, rc, j, i;
- blkif_request_t *req;
- int idx, nsects, ret;
- uint64_t sector_nr;
- char *page;
- int early = 0; /* count early completions */
- struct disk_driver *dd = s->disks;
- struct tap_disk *drv = dd->drv;
- blkif_t *blkif = s->blkif;
- tapdev_info_t *info = s->ring_info;
- int page_size = getpagesize();
-
- if (!run) return; /*We have received signal to close*/
-
- rp = info->fe_ring.sring->req_prod;
- xen_rmb();
- for (j = info->fe_ring.req_cons; j != rp; j++)
- {
- int done = 0, start_seg = 0;
-
- req = NULL;
- req = RING_GET_REQUEST(&info->fe_ring, j);
- ++info->fe_ring.req_cons;
-
- if (req == NULL) continue;
-
- idx = req->id;
-
- if (info->busy.req) {
- /* continue where we left off last time */
- ASSERT(info->busy.req == req);
- start_seg = info->busy.seg_idx;
- sector_nr = segment_start(req, start_seg);
- info->busy.seg_idx = 0;
- info->busy.req = NULL;
- } else {
- ASSERT(blkif->pending_list[idx].secs_pending == 0);
- memcpy(&blkif->pending_list[idx].req,
- req, sizeof(*req));
- blkif->pending_list[idx].status = BLKIF_RSP_OKAY;
- blkif->pending_list[idx].submitting = 1;
- sector_nr = req->sector_number;
- }
-
- if ((dd->flags & TD_RDONLY) &&
- (req->operation == BLKIF_OP_WRITE)) {
- blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
- goto send_response;
- }
-
- for (i = start_seg; i < req->nr_segments; i++) {
- nsects = req->seg[i].last_sect -
- req->seg[i].first_sect + 1;
-
- if ((req->seg[i].last_sect >= page_size >> 9) ||
- (nsects <= 0))
- continue;
-
- page = (char *)MMAP_VADDR(info->vstart,
- (unsigned long)req->id, i);
- page += (req->seg[i].first_sect << SECTOR_SHIFT);
-
- if (sector_nr >= s->size) {
- DPRINTF("Sector request failed:\n");
- DPRINTF("%s request, idx [%d,%d] size [%llu], "
- "sector [%llu,%llu]\n",
- (req->operation == BLKIF_OP_WRITE ?
- "WRITE" : "READ"),
- idx,i,
- (long long unsigned)
- nsects<<SECTOR_SHIFT,
- (long long unsigned)
- sector_nr<<SECTOR_SHIFT,
- (long long unsigned) sector_nr);
- continue;
- }
-
- blkif->pending_list[idx].secs_pending += nsects;
-
- switch (req->operation)
- {
- case BLKIF_OP_WRITE:
- ret = drv->td_queue_write(dd, sector_nr,
- nsects, page,
- send_responses,
- idx, (void *)(long)i);
- if (ret > 0) dd->early += ret;
- else if (ret == -EBUSY) {
- /* put req back on queue */
- --info->fe_ring.req_cons;
- info->busy.req = req;
- info->busy.seg_idx = i;
- goto out;
- }
- break;
- case BLKIF_OP_READ:
- ret = drv->td_queue_read(dd, sector_nr,
- nsects, page,
- send_responses,
- idx, (void *)(long)i);
- if (ret > 0) dd->early += ret;
- else if (ret == -EBUSY) {
- /* put req back on queue */
- --info->fe_ring.req_cons;
- info->busy.req = req;
- info->busy.seg_idx = i;
- goto out;
- }
- break;
- default:
- DPRINTF("Unknown block operation\n");
- break;
- }
- sector_nr += nsects;
- }
- send_response:
- blkif->pending_list[idx].submitting = 0;
- /* force write_rsp_to_ring for synchronous case */
- if (blkif->pending_list[idx].secs_pending == 0)
- dd->early += send_responses(dd, 0, 0, 0, idx,
- (void *)(long)0);
- }
-
- out:
- /*Batch done*/
- td_for_each_disk(s, dd) {
- dd->early += dd->drv->td_submit(dd);
- if (dd->early > 0) {
- io_done(dd, MAX_IOFD + 1);
- dd->early = 0;
- }
- }
-
- return;
-}
-
-int main(int argc, char *argv[])
-{
-	int ret;
-	char *buf;
-	fd_set readfds;
- fd_list_entry_t *ptr;
- struct td_state *s;
- char openlogbuf[128];
-
- if (argc != 3) usage();
-
- daemonize();
-
- snprintf(openlogbuf, sizeof(openlogbuf), "TAPDISK[%d]", getpid());
- openlog(openlogbuf, LOG_CONS|LOG_ODELAY, LOG_DAEMON);
- /*Setup signal handlers*/
- signal (SIGBUS, sig_handler);
- signal (SIGINT, sig_handler);
-
- /*Open the control channel*/
- fds[READ] = open(argv[1],O_RDWR|O_NONBLOCK);
- fds[WRITE] = open(argv[2],O_RDWR|O_NONBLOCK);
-
- if ( (fds[READ] < 0) || (fds[WRITE] < 0) )
- {
- DPRINTF("FD open failed [%d,%d]\n", fds[READ], fds[WRITE]);
- exit(-1);
- }
-
- buf = calloc(MSG_SIZE, 1);
-
- if (buf == NULL)
- {
- DPRINTF("ERROR: allocating memory.\n");
- exit(-1);
- }
-
- while (run)
- {
- ret = 0;
- FD_ZERO(&readfds);
- FD_SET(fds[READ], &readfds);
- maxfds = fds[READ];
-
- /*Set all tap fds*/
- LOCAL_FD_SET(&readfds);
-
- /*Wait for incoming messages*/
- ret = select(maxfds + 1, &readfds, (fd_set *) 0,
- (fd_set *) 0, NULL);
-
- if (ret > 0)
- {
- ptr = fd_start;
- while (ptr != NULL) {
- int progress_made = 0;
- struct disk_driver *dd;
- tapdev_info_t *info = ptr->s->ring_info;
-
- td_for_each_disk(ptr->s, dd) {
- if (dd->io_fd[READ] &&
- FD_ISSET(dd->io_fd[READ],
- &readfds)) {
- io_done(dd, READ);
- progress_made = 1;
- }
- }
-
- /* completed io from above may have
- * queued new requests on chained disks */
- if (progress_made) {
- td_for_each_disk(ptr->s, dd) {
- dd->early +=
- dd->drv->td_submit(dd);
- if (dd->early > 0) {
- io_done(dd,
- MAX_IOFD + 1);
- dd->early = 0;
- }
- }
- }
-
- if (FD_ISSET(ptr->tap_fd, &readfds) ||
- (info->busy.req && progress_made))
- get_io_request(ptr->s);
-
- ptr = ptr->next;
- }
-
- if (FD_ISSET(fds[READ], &readfds))
- read_msg(buf);
- }
- }
- free(buf);
- close(fds[READ]);
- close(fds[WRITE]);
-
- ptr = fd_start;
- while (ptr != NULL) {
- s = ptr->s;
- unmap_disk(s);
- close(ptr->tap_fd);
- ptr = ptr->next;
- }
- closelog();
-
- return 0;
-}
diff --git a/tools/blktap/drivers/tapdisk.h b/tools/blktap/drivers/tapdisk.h
deleted file mode 100644
index f3e165a..0000000
--- a/tools/blktap/drivers/tapdisk.h
+++ /dev/null
@@ -1,259 +0,0 @@
-/* tapdisk.h
- *
- * Generic disk interface for blktap-based image adapters.
- *
- * (c) 2006 Andrew Warfield and Julian Chesterfield
- *
- * Some notes on the tap_disk interface:
- *
- * tap_disk aims to provide a generic interface to easily implement new
- * types of image accessors. The structure-of-function-calls is similar
- * to disk interfaces used in qemu/denali/etc, with the significant
- * difference being the expectation of asynchronous rather than synchronous
- * I/O. The asynchronous interface is intended to allow lots of requests to
- * be pipelined through a disk, without the disk requiring any of its own
- * threads of control. As such, a batch of requests is delivered to the disk
- * using:
- *
- * td_queue_[read,write]()
- *
- * and passing in a completion callback, which the disk is responsible for
- * tracking. The end of a batch is marked with a call to:
- *
- * td_submit()
- *
- * The disk implementation must provide a file handle, which is used to
- * indicate that it needs to do work. tapdisk will add this file handle
- * (returned from td_get_fd()) to its poll set, and will call into the disk
- * using td_do_callbacks() whenever there is data pending.
- *
- * Two disk implementations demonstrate how this interface may be used to
- * implement disks with both asynchronous and synchronous calls. block-aio.c
- * maps this interface down onto the linux libaio calls, while block-sync uses
- * normal posix read/write.
- *
- * A few things to realize about the sync case, which doesn't need to defer
- * io completions:
- *
- * - td_queue_[read,write]() call read/write directly, and then call the
- *   callback immediately. They MUST then return a value greater than 0
- * in order to tell tapdisk that requests have finished early, and to
- *   force responses to be kicked to the clients.
- *
- * - The fd used for poll is an otherwise unused pipe, which allows poll to
- *   be called safely without the fd ever becoming readable.
- *
- * NOTE: tapdisk uses the number of sectors submitted per request as a
- * ref count. Plugins must use the callback function to communicate the
- * completion--or error--of every sector submitted to them.
- *
- * td_get_parent_id returns:
- * 0 if parent id successfully retrieved
- * TD_NO_PARENT if no parent exists
- * -errno on error
- */
-
-#ifndef TAPDISK_H_
-#define TAPDISK_H_
-
-#include <stdint.h>
-#include <syslog.h>
-#include <stdio.h>
-#include "blktaplib.h"
-
-/*If enabled, log all debug messages to syslog*/
-#if 1
-#define DPRINTF(_f, _a...) syslog( LOG_DEBUG, __FILE__ ":%d: " _f , __LINE__, ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* Things disks need to know about, these should probably be in a higher-level
- * header. */
-#define MAX_SEGMENTS_PER_REQ 11
-#define SECTOR_SHIFT 9
-#define DEFAULT_SECTOR_SIZE 512
-
-#define MAX_IOFD 2
-
-#define BLK_NOT_ALLOCATED 99
-#define TD_NO_PARENT 1
-
-typedef uint32_t td_flag_t;
-
-#define TD_RDONLY 1
-
-struct td_state;
-struct tap_disk;
-
-struct disk_id {
- char *name;
- int drivertype;
-};
-
-struct disk_driver {
- int early;
- char *name;
- void *private;
- td_flag_t flags;
- int io_fd[MAX_IOFD];
- struct tap_disk *drv;
- struct td_state *td_state;
- struct disk_driver *next;
-};
-
-/* This structure represents the state of an active virtual disk. */
-struct td_state {
- struct disk_driver *disks;
- void *blkif;
- void *image;
- void *ring_info;
- void *fd_entry;
- uint64_t sector_size;
- uint64_t size;
- unsigned int info;
-};
-
-/* Prototype of the callback to activate as requests complete. */
-typedef int (*td_callback_t)(struct disk_driver *dd, int res, uint64_t sector,
- int nb_sectors, int id, void *private);
-
-/* Structure describing the interface to a virtual disk implementation. */
-/* See note at the top of this file describing this interface. */
-struct tap_disk {
- const char *disk_type;
- int private_data_size;
- int (*td_open) (struct disk_driver *dd,
- const char *name, td_flag_t flags);
- int (*td_queue_read) (struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *prv);
- int (*td_queue_write) (struct disk_driver *dd, uint64_t sector,
- int nb_sectors, char *buf, td_callback_t cb,
- int id, void *prv);
- int (*td_submit) (struct disk_driver *dd);
- int (*td_close) (struct disk_driver *dd);
- int (*td_do_callbacks) (struct disk_driver *dd, int sid);
- int (*td_get_parent_id) (struct disk_driver *dd, struct disk_id *id);
- int (*td_validate_parent)(struct disk_driver *dd,
- struct disk_driver *p, td_flag_t flags);
-};
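-
-/*
- * Editor's note: a hypothetical skeleton of a synchronous plugin, following
- * the contract described at the top of this file: the queue function does
- * the I/O, invokes the callback immediately, and propagates the callback's
- * positive return value so tapdisk kicks responses to the clients early.
- * It assumes a driver whose private state is simply a file descriptor.
- */
-#if 0
-static int sync_queue_read(struct disk_driver *dd, uint64_t sector,
-			   int nb_sectors, char *buf, td_callback_t cb,
-			   int id, void *prv)
-{
-	int fd = *(int *)dd->private;
-	int res = pread(fd, buf, nb_sectors << SECTOR_SHIFT,
-			(off_t)(sector << SECTOR_SHIFT)) < 0 ? -errno : 0;
-
-	/* Every sector submitted is accounted for in one callback (see the
-	 * NOTE above on sector counts serving as a ref count). */
-	return cb(dd, res, sector, nb_sectors, id, prv);
-}
-#endif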
-
-typedef struct disk_info {
- int idnum;
- char name[50]; /* e.g. "RAMDISK" */
- char handle[10]; /* xend handle, e.g. 'ram' */
- int single_handler; /* is there a single controller for all */
- /* instances of disk type? */
- int use_ioemu; /* backend provider: 0 = tapdisk; 1 = ioemu */
-
-#ifdef TAPDISK
- struct tap_disk *drv;
-#endif
-} disk_info_t;
-
-void debug_fe_ring(struct td_state *s);
-
-extern struct tap_disk tapdisk_aio;
-extern struct tap_disk tapdisk_sync;
-extern struct tap_disk tapdisk_vmdk;
-extern struct tap_disk tapdisk_ram;
-extern struct tap_disk tapdisk_qcow;
-extern struct tap_disk tapdisk_qcow2;
-
-
-/*Define Individual Disk Parameters here */
-static disk_info_t aio_disk = {
- DISK_TYPE_AIO,
- "raw image (aio)",
- "aio",
- 0,
- 0,
-#ifdef TAPDISK
- &tapdisk_aio,
-#endif
-};
-
-static disk_info_t sync_disk = {
- DISK_TYPE_SYNC,
- "raw image (sync)",
- "sync",
- 0,
- 0,
-#ifdef TAPDISK
- &tapdisk_sync,
-#endif
-};
-
-static disk_info_t vmdk_disk = {
- DISK_TYPE_VMDK,
- "vmware image (vmdk)",
- "vmdk",
- 1,
- 0,
-#ifdef TAPDISK
- &tapdisk_vmdk,
-#endif
-};
-
-static disk_info_t ram_disk = {
- DISK_TYPE_RAM,
- "ramdisk image (ram)",
- "ram",
- 1,
- 0,
-#ifdef TAPDISK
- &tapdisk_ram,
-#endif
-};
-
-static disk_info_t qcow_disk = {
- DISK_TYPE_QCOW,
- "qcow disk (qcow)",
- "qcow",
- 0,
- 0,
-#ifdef TAPDISK
- &tapdisk_qcow,
-#endif
-};
-
-static disk_info_t qcow2_disk = {
- DISK_TYPE_QCOW2,
- "qcow2 disk (qcow2)",
- "qcow2",
- 0,
- 0,
-#ifdef TAPDISK
- &tapdisk_qcow2,
-#endif
-};
-
-/*Main disk info array */
-static disk_info_t *dtypes[] = {
- &aio_disk,
- &sync_disk,
- &vmdk_disk,
- &ram_disk,
- &qcow_disk,
- &qcow2_disk,
-};
-
-typedef struct driver_list_entry {
- struct blkif *blkif;
- struct driver_list_entry **pprev, *next;
-} driver_list_entry_t;
-
-typedef struct fd_list_entry {
- int cookie;
- int tap_fd;
- struct td_state *s;
- struct fd_list_entry **pprev, *next;
-} fd_list_entry_t;
-
-int qcow_create(const char *filename, uint64_t total_size,
- const char *backing_file, int flags);
-
-int qcow2_create(const char *filename, uint64_t total_size,
- const char *backing_file, int flags);
-#endif /*TAPDISK_H_*/
diff --git a/tools/blktap/lib/Makefile b/tools/blktap/lib/Makefile
deleted file mode 100644
index 8852c46..0000000
--- a/tools/blktap/lib/Makefile
+++ /dev/null
@@ -1,60 +0,0 @@
-XEN_ROOT = $(CURDIR)/../../..
-include $(XEN_ROOT)/tools/Rules.mk
-
-MAJOR = 3.0
-MINOR = 0
-SONAME = libblktap.so.$(MAJOR)
-
-CFLAGS += -I.
-CFLAGS += $(CFLAGS_libxenctrl)
-CFLAGS += $(CFLAGS_libxenstore)
-LDLIBS += $(LDLIBS_libxenstore)
-
-SRCS :=
-SRCS += xenbus.c blkif.c xs_api.c
-
-CFLAGS += -Werror
-CFLAGS += -Wno-unused
-CFLAGS += -fPIC
-# get asprintf():
-CFLAGS += -D _GNU_SOURCE
-
-OBJS = $(SRCS:.c=.o)
-OBJS_PIC = $(SRCS:.c=.opic)
-IBINS :=
-
-LIB = libblktap.a
-LIB_SO = libblktap.so.$(MAJOR).$(MINOR)
-
-.PHONY: all
-all: $(LIB) $(LIB_SO)
-
-.PHONY: install
-install: all
- $(INSTALL_DIR) $(DESTDIR)$(LIBDIR)
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_PROG) $(LIB_SO) $(DESTDIR)$(LIBDIR)
- $(INSTALL_DATA) $(LIB) $(DESTDIR)$(LIBDIR)
- ln -sf libblktap.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libblktap.so.$(MAJOR)
- ln -sf libblktap.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libblktap.so
- $(INSTALL_DATA) blktaplib.h $(DESTDIR)$(INCLUDEDIR)
-
-.PHONY: clean
-clean:
- rm -rf *.a *.so* *.o *.opic *.rpm $(LIB) $(LIB_SO) *~ $(DEPS) xen TAGS
-
-libblktap.so.$(MAJOR).$(MINOR): $(OBJS_PIC)
- $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,$(SONAME) $(SHLIB_LDFLAGS) \
- -o $@ $^ $(LDLIBS)
- ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
- ln -sf libblktap.so.$(MAJOR) libblktap.so
-
-libblktap.a: $(OBJS)
- $(AR) rc $@ $^
-
-.PHONY: TAGS
-TAGS:
- etags -t $(SRCS) *.h
-
--include $(DEPS)
-
diff --git a/tools/blktap/lib/blkif.c b/tools/blktap/lib/blkif.c
deleted file mode 100644
index 9a19596..0000000
--- a/tools/blktap/lib/blkif.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * tools/blktap_user/blkif.c
- *
- * The blkif interface for blktap. A blkif describes an in-use virtual disk.
- * (c) 2005 Andrew Warfield and Julian Chesterfield
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <string.h>
-#include <err.h>
-#include <unistd.h>
-
-#include "blktaplib.h"
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-#define BLKIF_HASHSZ 1024
-#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
-
-static blkif_t *blkif_hash[BLKIF_HASHSZ];
-
-blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
-{
- blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
- while ( (blkif != NULL) &&
- ((blkif->domid != domid) || (blkif->handle != handle)) )
- blkif = blkif->hash_next;
- return blkif;
-}
-
-blkif_t *alloc_blkif(domid_t domid)
-{
- blkif_t *blkif;
- DPRINTF("Alloc_blkif called [%d]\n",domid);
- blkif = (blkif_t *)malloc(sizeof(blkif_t));
- if (!blkif)
- return NULL;
- memset(blkif, 0, sizeof(*blkif));
- blkif->domid = domid;
- blkif->devnum = -1;
- return blkif;
-}
-
-/*Controller callbacks*/
-static int (*new_devmap_hook)(blkif_t *blkif) = NULL;
-void register_new_devmap_hook(int (*fn)(blkif_t *blkif))
-{
- new_devmap_hook = fn;
-}
-
-static int (*new_unmap_hook)(blkif_t *blkif) = NULL;
-void register_new_unmap_hook(int (*fn)(blkif_t *blkif))
-{
- new_unmap_hook = fn;
-}
-
-static int (*new_blkif_hook)(blkif_t *blkif) = NULL;
-void register_new_blkif_hook(int (*fn)(blkif_t *blkif))
-{
- new_blkif_hook = fn;
-}
-
-int blkif_init(blkif_t *blkif, long int handle, long int pdev,
- long int readonly)
-{
- domid_t domid;
- blkif_t **pblkif;
- int devnum;
-
- if (blkif == NULL)
- return -EINVAL;
-
- domid = blkif->domid;
- blkif->handle = handle;
- blkif->pdev = pdev;
- blkif->readonly = readonly;
-
- /*
- * Call out to the new_blkif_hook.
- * The tap application should define this,
- * and it should return having set blkif->ops
- *
- */
- if (new_blkif_hook == NULL)
- {
-		DPRINTF("Probe detected a new blkif, but no new_blkif_hook!\n");
- return -1;
- }
- if (new_blkif_hook(blkif)!=0) {
- DPRINTF("BLKIF: Image open failed\n");
- return -1;
- }
-
- /* Now wire it in. */
- pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
- DPRINTF("Created hash entry: %d [%d,%ld]\n",
- BLKIF_HASH(domid, handle), domid, handle);
-
- while ( *pblkif != NULL )
- {
- if ( ((*pblkif)->domid == domid) &&
- ((*pblkif)->handle == handle) )
- {
- DPRINTF("Could not create blkif: already exists\n");
- return -1;
- }
- pblkif = &(*pblkif)->hash_next;
- }
- blkif->hash_next = NULL;
- *pblkif = blkif;
-
- if (new_devmap_hook == NULL)
- {
-		DPRINTF("Probe setting up new blkif but no devmap hook!\n");
- return -1;
- }
-
- devnum = new_devmap_hook(blkif);
- if (devnum == -1)
- return -1;
- blkif->devnum = devnum;
-
- return 0;
-}
-
-void free_blkif(blkif_t *blkif)
-{
- blkif_t **pblkif, *curs;
- image_t *image;
-
- pblkif = &blkif_hash[BLKIF_HASH(blkif->domid, blkif->handle)];
- while ( (curs = *pblkif) != NULL )
- {
- if ( blkif == curs )
- {
- *pblkif = curs->hash_next;
- }
- pblkif = &curs->hash_next;
- }
- if (blkif != NULL) {
- if ((image=(image_t *)blkif->prv)!=NULL) {
- free(blkif->prv);
- }
- if (blkif->info!=NULL) {
- free(blkif->info);
- }
- if (new_unmap_hook != NULL) new_unmap_hook(blkif);
- free(blkif);
- }
-}
-
-void __init_blkif(void)
-{
- memset(blkif_hash, 0, sizeof(blkif_hash));
-}
diff --git a/tools/blktap/lib/blktaplib.h b/tools/blktap/lib/blktaplib.h
deleted file mode 100644
index a80e518..0000000
--- a/tools/blktap/lib/blktaplib.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/* blktaplib.h
- *
- * Blktap library userspace code.
- *
- * (c) 2005 Andrew Warfield and Julian Chesterfield
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef __BLKTAPLIB_H__
-#define __BLKTAPLIB_H__
-
-#include <xenctrl.h>
-#include <sys/param.h>
-#include <sys/user.h>
-#include <xen/xen.h>
-#include <xen/io/blkif.h>
-#include <xen/io/ring.h>
-#include <xenstore.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, XC_PAGE_SIZE)
-
-/* size of the extra VMA area to map in attached pages. */
-#define BLKTAP_VMA_PAGES BLK_RING_SIZE
-
-/* blktap IOCTLs: These must correspond with the blktap driver ioctls*/
-#define BLKTAP_IOCTL_KICK_FE 1
-#define BLKTAP_IOCTL_KICK_BE 2
-#define BLKTAP_IOCTL_SETMODE 3
-#define BLKTAP_IOCTL_SENDPID 4
-#define BLKTAP_IOCTL_NEWINTF 5
-#define BLKTAP_IOCTL_MINOR 6
-#define BLKTAP_IOCTL_MAJOR 7
-#define BLKTAP_QUERY_ALLOC_REQS 8
-#define BLKTAP_IOCTL_FREEINTF 9
-#define BLKTAP_IOCTL_NEWINTF_EXT 50
-#define BLKTAP_IOCTL_PRINT_IDXS 100
-
-/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
-#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
-#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
-#define BLKTAP_MODE_INTERCEPT_BE 0x00000002
-
-#define BLKTAP_MODE_INTERPOSE \
- (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
-
-static inline int BLKTAP_MODE_VALID(unsigned long arg)
-{
- return (
- ( arg == BLKTAP_MODE_PASSTHROUGH ) ||
- ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
- ( arg == BLKTAP_MODE_INTERPOSE ) );
-}
-
-#define MAX_REQUESTS BLK_RING_SIZE
-
-#define BLKTAP_IOCTL_KICK 1
-#define MAX_PENDING_REQS BLK_RING_SIZE
-#define BLKTAP_DEV_DIR "/dev/xen"
-#define BLKTAP_DEV_NAME "blktap"
-#define BLKTAP_DEV_MINOR 0
-#define BLKTAP_CTRL_DIR "/var/run/tap"
-
-extern int blktap_major;
-
-#define BLKTAP_RING_PAGES 1 /* Front */
-#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES)
-
-struct blkif;
-
-typedef struct {
- blkif_request_t req;
- struct blkif *blkif;
- int submitting;
- int secs_pending;
- int16_t status;
-} pending_req_t;
-
-struct blkif_ops {
- unsigned long long (*get_size)(struct blkif *blkif);
- unsigned long (*get_secsize)(struct blkif *blkif);
- unsigned int (*get_info)(struct blkif *blkif);
-};
-
-typedef struct blkif {
- domid_t domid;
- long int handle;
-
- long int pdev;
- long int readonly;
-
- enum { DISCONNECTED, DISCONNECTING, CONNECTED } state;
-
- struct blkif_ops *ops;
- struct blkif *hash_next;
-
- void *prv; /* device-specific data */
- void *info; /* Image parameter passing */
- pending_req_t pending_list[MAX_REQUESTS];
- int devnum;
- int fds[2];
- int be_id;
- int major;
- int minor;
- pid_t tappid;
- int drivertype;
- uint16_t cookie;
-} blkif_t;
-
-typedef struct blkif_info {
- char *params;
-} blkif_info_t;
-
-void register_new_devmap_hook(int (*fn)(blkif_t *blkif));
-void register_new_unmap_hook(int (*fn)(blkif_t *blkif));
-void register_new_blkif_hook(int (*fn)(blkif_t *blkif));
-blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
-blkif_t *alloc_blkif(domid_t domid);
-int blkif_init(blkif_t *blkif, long int handle, long int pdev,
- long int readonly);
-void free_blkif(blkif_t *blkif);
-void __init_blkif(void);
-
-typedef struct busy_state {
- int seg_idx;
- blkif_request_t *req;
-} busy_state_t;
-
-typedef struct tapdev_info {
- int fd;
- char *mem;
- blkif_sring_t *sring;
- blkif_back_ring_t fe_ring;
- unsigned long vstart;
- blkif_t *blkif;
- busy_state_t busy;
-} tapdev_info_t;
-
-typedef struct domid_translate {
- unsigned short domid;
- unsigned short busid;
-} domid_translate_t;
-
-typedef struct domid_translate_ext {
- unsigned short domid;
- uint32_t busid;
-} domid_translate_ext_t;
-
-typedef struct image {
- unsigned long long size;
- unsigned long secsize;
- unsigned int info;
-} image_t;
-
-/* 16-byte message header, immediately followed by message payload. */
-typedef struct msg_hdr {
- uint16_t type;
- uint16_t len;
- uint16_t drivertype;
- uint16_t cookie;
- uint8_t readonly;
- uint8_t pad[7];
-} msg_hdr_t;
-
-typedef struct msg_newdev {
- uint8_t devnum;
- uint16_t domid;
-} msg_newdev_t;
-
-typedef struct msg_pid {
- pid_t pid;
-} msg_pid_t;
-
-#define READ 0
-#define WRITE 1
-
-/*Control Messages between manager and tapdev*/
-#define CTLMSG_PARAMS 1
-#define CTLMSG_IMG 2
-#define CTLMSG_IMG_FAIL 3
-#define CTLMSG_NEWDEV 4
-#define CTLMSG_NEWDEV_RSP 5
-#define CTLMSG_NEWDEV_FAIL 6
-#define CTLMSG_CLOSE 7
-#define CTLMSG_CLOSE_RSP 8
-#define CTLMSG_PID 9
-#define CTLMSG_PID_RSP 10
-
-/* disk driver types */
-#define MAX_DISK_TYPES 20
-
-#define DISK_TYPE_AIO 0
-#define DISK_TYPE_SYNC 1
-#define DISK_TYPE_VMDK 2
-#define DISK_TYPE_RAM 3
-#define DISK_TYPE_QCOW 4
-#define DISK_TYPE_QCOW2 5
-
-/* xenstore/xenbus: */
-#define DOMNAME "Domain-0"
-int setup_probe_watch(struct xs_handle *h);
-
-
-/* Arbitrary values, must match the underlying driver... */
-#define MAX_TAP_DEV 100
-
-/* Accessing attached data page mappings */
-#define MMAP_PAGES \
- (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-#define MMAP_VADDR(_vstart,_req,_seg) \
- ((_vstart) + \
- ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) + \
- ((_seg) * getpagesize()))
-
-
-#endif /* __BLKTAPLIB_H__ */
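For reference, the address arithmetic behind the MMAP_VADDR macro above can be seen in a minimal stand-alone sketch; the vstart value is hypothetical, and BLKIF_MAX_SEGMENTS_PER_REQUEST is assumed to be 11 as defined in xen/io/blkif.h:

    /* Sketch only: each request owns a contiguous run of
     * BLKIF_MAX_SEGMENTS_PER_REQUEST pages starting at vstart. */
    #include <stdio.h>
    #include <unistd.h>

    #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11   /* assumed, per xen/io/blkif.h */

    #define MMAP_VADDR(_vstart, _req, _seg)                              \
        ((_vstart) +                                                     \
         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) +     \
         ((_seg) * getpagesize()))

    int main(void)
    {
        unsigned long vstart = 0x100000UL;   /* hypothetical mapping base */

        /* Request 2, segment 3 sits 2*11 + 3 = 25 pages past vstart. */
        printf("vaddr = %#lx\n", MMAP_VADDR(vstart, 2UL, 3UL));
        return 0;
    }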
diff --git a/tools/blktap/lib/list.h b/tools/blktap/lib/list.h
deleted file mode 100644
index c82242f..0000000
--- a/tools/blktap/lib/list.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * list.h
- *
- * This is a subset of linux's list.h intended to be used in user-space.
- *
- */
-
-#ifndef __LIST_H__
-#define __LIST_H__
-
-#ifdef LIST_HEAD
-#undef LIST_HEAD
-#endif
-
-#define LIST_POISON1 ((void *) 0x00100100)
-#define LIST_POISON2 ((void *) 0x00200200)
-
-struct list_head {
- struct list_head *next, *prev;
-};
-
-#define LIST_HEAD_INIT(name) { &(name), &(name) }
-
-#define LIST_HEAD(name) \
- struct list_head name = LIST_HEAD_INIT(name)
-
-static inline void __list_add(struct list_head *new,
- struct list_head *prev,
- struct list_head *next)
-{
- next->prev = new;
- new->next = next;
- new->prev = prev;
- prev->next = new;
-}
-
-static inline void list_add(struct list_head *new, struct list_head *head)
-{
- __list_add(new, head, head->next);
-}
-static inline void __list_del(struct list_head * prev, struct list_head * next)
-{
- next->prev = prev;
- prev->next = next;
-}
-static inline void list_del(struct list_head *entry)
-{
- __list_del(entry->prev, entry->next);
- entry->next = LIST_POISON1;
- entry->prev = LIST_POISON2;
-}
-#define list_entry(ptr, type, member) \
- ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
-#define list_for_each_entry(pos, head, member) \
- for (pos = list_entry((head)->next, typeof(*pos), member); \
- &pos->member != (head); \
- pos = list_entry(pos->member.next, typeof(*pos), member))
-
-#endif /* __LIST_H__ */
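A minimal usage sketch of this intrusive list (GNU C, since list_for_each_entry relies on typeof; the item type is hypothetical, standing in for structures like backend_info that embed a list_head):

    #include <stdio.h>
    #include "list.h"   /* the header above */

    struct item {                    /* hypothetical element type */
        int value;
        struct list_head list;       /* embedded linkage */
    };

    int main(void)
    {
        LIST_HEAD(items);
        struct item a = { .value = 1 }, b = { .value = 2 };
        struct item *pos;

        list_add(&a.list, &items);
        list_add(&b.list, &items);   /* head insertion: b now precedes a */

        list_for_each_entry(pos, &items, list)
            printf("%d\n", pos->value);   /* prints 2, then 1 */
        return 0;
    }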
diff --git a/tools/blktap/lib/xenbus.c b/tools/blktap/lib/xenbus.c
deleted file mode 100644
index 948eb02..0000000
--- a/tools/blktap/lib/xenbus.c
+++ /dev/null
@@ -1,617 +0,0 @@
-/*
- * xenbus.c
- *
- * xenbus interface to the blocktap.
- *
- * this handles the top-half of integration with block devices through the
- * store -- the tap driver negotiates the device channel etc, while the
- * userland tap client needs to sort out the disk parameters etc.
- *
- * (c) 2005 Andrew Warfield and Julian Chesterfield
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <err.h>
-#include <stdarg.h>
-#include <errno.h>
-#include <xenstore.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <poll.h>
-#include <time.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include "blktaplib.h"
-#include "list.h"
-#include "xs_api.h"
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-struct backend_info
-{
- /* our communications channel */
- blkif_t *blkif;
-
- long int frontend_id;
- long int pdev;
- long int readonly;
-
- char *backpath;
- char *frontpath;
-
- struct list_head list;
-};
-
-static LIST_HEAD(belist);
-
-static int strsep_len(const char *str, char c, unsigned int len)
-{
- unsigned int i;
-
- for (i = 0; str[i]; i++)
- if (str[i] == c) {
- if (len == 0)
- return i;
- len--;
- }
- return (len == 0) ? i : -ERANGE;
-}
-
-static int get_be_id(const char *str)
-{
- int len,end;
- const char *ptr;
- char *tptr, num[10];
-
- len = strsep_len(str, '/', 6);
- end = strlen(str);
- if( (len < 0) || (end < 0) ) return -1;
-
- ptr = str + len + 1;
- strncpy(num, ptr, end - len);
- tptr = num + (end - (len + 1));
- *tptr = '\0';
-
- return atoi(num);
-}
-
-static int get_be_domid(const char *str)
-{
- int len1, len2;
- const char *ptr;
- char *tptr, num[10];
-
- len2 = strsep_len(str, '/', 3);
- if ( len2 < 0 ) return -1;
- len1 = strsep_len(str, '/', 2);
-
- ptr = str + len1 + 1;
- strncpy(num, ptr, len2 - len1 - 1);
- tptr = num + (len2 - len1 - 1);
- *tptr = '\0';
-
- return atoi(num);
-}
-
-static struct backend_info *be_lookup_be(const char *bepath)
-{
- struct backend_info *be;
-
- list_for_each_entry(be, &belist, list)
- if (strcmp(bepath, be->backpath) == 0)
- return be;
- return (struct backend_info *)NULL;
-}
-
-static int be_exists_be(const char *bepath)
-{
- return (be_lookup_be(bepath) != NULL);
-}
-
-static struct backend_info *be_lookup_fe(const char *fepath)
-{
- struct backend_info *be;
-
- list_for_each_entry(be, &belist, list)
- if (strcmp(fepath, be->frontpath) == 0)
- return be;
- return (struct backend_info *)NULL;
-}
-
-static int backend_remove(struct xs_handle *h, struct backend_info *be)
-{
- /* Unhook from be list. */
- list_del(&be->list);
-
- /* Free everything else. */
- if (be->blkif) {
- DPRINTF("Freeing blkif dev [%d]\n",be->blkif->devnum);
- free_blkif(be->blkif);
- }
- if (be->frontpath)
- free(be->frontpath);
- if (be->backpath)
- free(be->backpath);
- free(be);
- return 0;
-}
-
-static const char *get_image_path(const char *path)
-{
- const char *tmp;
-
- /* Strip off the image type */
- if (!strncmp(path, "tapdisk:", strlen("tapdisk:"))) {
- path += strlen("tapdisk:");
- } else if (!strncmp(path, "ioemu:", strlen("ioemu:"))) {
- path += strlen("ioemu:");
- }
-
- tmp = strchr(path, ':');
- if (tmp != NULL)
- path = tmp + 1;
-
- return path;
-}
-
-static int check_sharing(struct xs_handle *h, struct backend_info *be)
-{
- char *dom_uuid;
- char *cur_dom_uuid;
- char *path;
- char *mode;
- char *params;
- char **domains;
- char **devices;
- int i, j;
- unsigned int num_dom, num_dev;
- blkif_info_t *info = be->blkif->info;
- int ret = 0;
- const char *image_path[2];
- int be_domid = get_be_domid(be->backpath);
-
- image_path[0] = get_image_path(info->params);
-
- /* If the mode contains '!' or doesn't contain 'w' don't check anything */
- xs_gather(h, be->backpath, "mode", NULL, &mode, NULL);
- if (strchr(mode, '!'))
- goto out;
- if (strchr(mode, 'w') == NULL)
- goto out;
-
- /* Get the UUID of the domain we want to attach to */
- if (asprintf(&path, "/local/domain/%ld", be->frontend_id) == -1)
- goto fail;
- xs_gather(h, path, "vm", NULL, &dom_uuid, NULL);
- free(path);
-
- /* Iterate through the devices of all VMs */
- if (asprintf(&path, "/local/domain/%d/backend/tap", be_domid) == -1)
- goto fail;
- domains = xs_directory(h, XBT_NULL, path, &num_dom);
- free(path);
- if (domains == NULL)
- num_dom = 0;
-
- for (i = 0; !ret && (i < num_dom); i++) {
-
- /* If it's the same VM, no action needed */
- if (asprintf(&path, "/local/domain/%s", domains[i]) == -1) {
- ret = -1;
- break;
- }
- cur_dom_uuid = NULL;
- xs_gather(h, path, "vm", NULL, &cur_dom_uuid, NULL);
- free(path);
- if (!cur_dom_uuid)
- continue;
-
- if (!strcmp(cur_dom_uuid, dom_uuid)) {
- free(cur_dom_uuid);
- continue;
- }
-
- /* Check the devices */
- if (asprintf(&path, "/local/domain/%d/backend/tap/%s", be_domid, domains[i]) == -1) {
- ret = -1;
- free(cur_dom_uuid);
- break;
- }
- devices = xs_directory(h, XBT_NULL, path, &num_dev);
- if (devices == NULL)
- num_dev = 0;
- free(path);
-
- for (j = 0; !ret && (j < num_dev); j++) {
- if (asprintf(&path, "/local/domain/%d/backend/tap/%s/%s", be_domid, domains[i], devices[j]) == -1) {
- ret = -1;
- break;
- }
- params = NULL;
- xs_gather(h, path, "params", NULL, ¶ms, NULL);
- free(path);
- if (!params)
- continue;
-
- image_path[1] = get_image_path(params);
- if (!strcmp(image_path[0], image_path[1])) {
- ret = -1;
- }
-
- free(params);
- }
-
- free(cur_dom_uuid);
- free(devices);
- }
- free(domains);
- free(dom_uuid);
- goto out;
-
-fail:
- ret = -1;
-out:
- free(mode);
- return ret;
-}
-
-static int check_image(struct xs_handle *h, struct backend_info *be,
- const char** errmsg)
-{
- const char *path;
- int mode;
- blkif_t *blkif = be->blkif;
- blkif_info_t *info = blkif->info;
-
- path = get_image_path(info->params);
-
- /* Check if the image exists and access is permitted */
- mode = R_OK;
- if (!be->readonly)
- mode |= W_OK;
- if (access(path, mode)) {
- if (errno == ENOENT)
- *errmsg = "File not found.";
- else
- *errmsg = "Insufficient file permissions.";
- return -1;
- }
-
- /* Check that the image is not attached to a different VM */
- if (check_sharing(h, be)) {
- *errmsg = "File already in use by other domain";
- return -1;
- }
-
- return 0;
-}
-
-static void ueblktap_setup(struct xs_handle *h, char *bepath)
-{
- struct backend_info *be;
- char *path = NULL, *p,*dev;
- int len, er, deverr;
- long int pdev = 0, handle;
- blkif_info_t *blk;
- const char* errmsg = NULL;
-
- be = be_lookup_be(bepath);
- if (be == NULL)
- {
- DPRINTF("ERROR: backend changed called for nonexistent "
- "backend! (%s)\n", bepath);
- goto fail;
- }
-
- deverr = xs_gather(h, bepath, "physical-device", "%li", &pdev, NULL);
- if (!deverr) {
- DPRINTF("pdev set to %ld\n",pdev);
- if (be->pdev && be->pdev != pdev) {
- DPRINTF("changing physical-device not supported");
- goto fail;
- }
- be->pdev = pdev;
- }
-
- /* Check to see if device is to be opened read-only. */
- deverr = xs_gather(h, bepath, "mode", NULL, &path, NULL);
- if (deverr) {
- DPRINTF("ERROR: could not find read/write mode\n");
- goto fail;
- } else if (path[0] == 'r')
- be->readonly = 1;
-
- if (be->blkif == NULL) {
- /* Front end dir is a number, which is used as the handle. */
- p = strrchr(be->frontpath, '/') + 1;
- handle = strtoul(p, NULL, 0);
-
- be->blkif = alloc_blkif(be->frontend_id);
- if (be->blkif == NULL)
- goto fail;
-
- be->blkif->be_id = get_be_id(bepath);
-
- /* Insert device-specific info. */
- blk = malloc(sizeof(blkif_info_t));
- if (!blk) {
- DPRINTF("Out of memory - blkif_info_t\n");
- goto fail;
- }
- er = xs_gather(h, bepath, "params", NULL, &blk->params, NULL);
- if (er)
- goto fail;
- be->blkif->info = blk;
-
- if (deverr) {
- /* Dev number was not available, try to set manually */
- pdev = convert_dev_name_to_num(blk->params);
- be->pdev = pdev;
- }
-
- if (check_image(h, be, &errmsg))
- goto fail;
-
- er = blkif_init(be->blkif, handle, be->pdev, be->readonly);
- if (er != 0) {
- DPRINTF("Unable to open device %s\n",blk->params);
- goto fail;
- }
-
- DPRINTF("[BECHG]: ADDED A NEW BLKIF (%s)\n", bepath);
- }
-
- /* Supply the information about the device to xenstore */
- er = xs_printf(h, be->backpath, "sectors", "%llu",
- be->blkif->ops->get_size(be->blkif));
-
- if (er == 0) {
- DPRINTF("ERROR: Failed writing sectors");
- goto fail;
- }
-
- er = xs_printf(h, be->backpath, "sector-size", "%lu",
- be->blkif->ops->get_secsize(be->blkif));
-
- if (er == 0) {
- DPRINTF("ERROR: Failed writing sector-size");
- goto fail;
- }
-
- er = xs_printf(h, be->backpath, "info", "%u",
- be->blkif->ops->get_info(be->blkif));
-
- if (er == 0) {
- DPRINTF("ERROR: Failed writing info");
- goto fail;
- }
-
- be->blkif->state = CONNECTED;
- xs_printf(h, be->backpath, "hotplug-status", "connected");
-
- DPRINTF("[SETUP] Complete\n\n");
- goto close;
-
-fail:
- if (be) {
- if (errmsg == NULL)
- errmsg = "Setting up the backend failed. See the log "
- "files in /var/log/xen/ for details.";
- xs_printf(h, be->backpath, "hotplug-error", errmsg);
- xs_printf(h, be->backpath, "hotplug-status", "error");
-
- backend_remove(h, be);
- }
-close:
- if (path)
- free(path);
- return;
-}
-
-/**
- * Xenstore watch callback entry point. This code replaces the hotplug
- * scripts: as soon as the xenstore backend driver entries are created,
- * this callback gets invoked.
- */
-static void ueblktap_probe(struct xs_handle *h, struct xenbus_watch *w,
- const char *bepath_im)
-{
- struct backend_info *be = NULL;
- char *frontend = NULL, *bepath = NULL, *p;
- int er, len;
- blkif_t *blkif;
-
-
- bepath = strdup(bepath_im);
-
- if (!bepath) {
- DPRINTF("No path\n");
- return;
- }
-
- /*
- * Asserts that the xenstore structure is always 7 levels deep,
- * e.g. /local/domain/0/backend/vbd/1/2049
- */
- len = strsep_len(bepath, '/', 7);
- if (len < 0)
- goto free_be;
- if (bepath[len] != '\0')
- goto free_be;
-
- be = malloc(sizeof(*be));
- if (!be) {
- DPRINTF("ERROR: allocating backend structure\n");
- goto free_be;
- }
- memset(be, 0, sizeof(*be));
- frontend = NULL;
-
- er = xs_gather(h, bepath,
- "frontend-id", "%li", &be->frontend_id,
- "frontend", NULL, &frontend,
- NULL);
-
- if (er) {
- /*
- * Unable to find frontend entries;
- * the bus-id is no longer valid.
- */
- DPRINTF("ERROR: Frontend-id check failed, removing backend: "
- "[%s]\n",bepath);
-
- /**
- * BE info should already exist,
- * free new mem and find old entry
- */
- free(be);
- be = be_lookup_be(bepath);
- if ( (be != NULL) && (be->blkif != NULL) )
- backend_remove(h, be);
- else goto free_be;
- if (bepath)
- free(bepath);
- return;
- }
-
- /* Are we already tracking this device? */
- if (be_exists_be(bepath))
- goto free_be;
-
- be->backpath = bepath;
- be->frontpath = frontend;
-
- list_add(&be->list, &belist);
-
- DPRINTF("[PROBE]\tADDED NEW DEVICE (%s)\n", bepath);
- DPRINTF("\tFRONTEND (%s),(%ld)\n", frontend,be->frontend_id);
-
- ueblktap_setup(h, bepath);
- return;
-
- free_be:
- if (frontend)
- free(frontend);
- if (bepath)
- free(bepath);
- if (be)
- free(be);
-}
-
-/**
- * We set a general watch on the backend vbd directory;
- * ueblktap_probe is called for every update.
- * Our job is to monitor for new entries: as they
- * are created, we initialise the state and attach a disk.
- */
-
-static int add_blockdevice_probe_watch(struct xs_handle *h, const char *domid)
-{
- char *path;
- struct xenbus_watch *vbd_watch;
-
- if (asprintf(&path, "/local/domain/%s/backend/tap", domid) == -1)
- return -ENOMEM;
-
- vbd_watch = (struct xenbus_watch *)malloc(sizeof(struct xenbus_watch));
- if (!vbd_watch) {
- DPRINTF("ERROR: unable to malloc vbd_watch [%s]\n", path);
- return -EINVAL;
- }
- vbd_watch->node = path;
- vbd_watch->callback = ueblktap_probe;
- if (register_xenbus_watch(h, vbd_watch) != 0) {
- DPRINTF("ERROR: adding vbd probe watch %s\n", path);
- return -EINVAL;
- }
- return 0;
-}
-
-/* Async callback to check for /local/domain/<DOMID>/name */
-static void check_dom(struct xs_handle *h, struct xenbus_watch *w,
- const char *bepath_im)
-{
- char *domid;
-
- domid = get_dom_domid(h);
- if (domid == NULL)
- return;
-
- add_blockdevice_probe_watch(h, domid);
- free(domid);
- unregister_xenbus_watch(h, w);
-}
-
-/* We must wait for xend to register /local/domain/<DOMID> */
-static int watch_for_domid(struct xs_handle *h)
-{
- struct xenbus_watch *domid_watch;
- char *path = NULL;
-
- if (asprintf(&path, "/local/domain") == -1)
- return -ENOMEM;
-
- domid_watch = malloc(sizeof(struct xenbus_watch));
- if (domid_watch == NULL) {
- DPRINTF("ERROR: unable to malloc domid_watch [%s]\n", path);
- return -EINVAL;
- }
-
- domid_watch->node = path;
- domid_watch->callback = check_dom;
-
- if (register_xenbus_watch(h, domid_watch) != 0) {
- DPRINTF("ERROR: adding vbd probe watch %s\n", path);
- return -EINVAL;
- }
-
- DPRINTF("Set async watch for /local/domain\n");
-
- return 0;
-}
-
-int setup_probe_watch(struct xs_handle *h)
-{
- char *domid;
- int ret;
-
- domid = get_dom_domid(h);
- if (domid == NULL)
- return watch_for_domid(h);
-
- ret = add_blockdevice_probe_watch(h, domid);
- free(domid);
- return ret;
-}
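The path handling in get_be_id()/get_be_domid() above hinges on strsep_len() returning the offset of the (len+1)-th separator, so components of a backend path can be sliced out by separator count. A stand-alone sketch (function body copied from the file; the example path is hypothetical):

    #include <errno.h>
    #include <stdio.h>

    /* Returns the offset of the (len+1)-th occurrence of c, the string
     * length if exactly len occurrences exist, or -ERANGE otherwise. */
    static int strsep_len(const char *str, char c, unsigned int len)
    {
        unsigned int i;

        for (i = 0; str[i]; i++)
            if (str[i] == c) {
                if (len == 0)
                    return i;
                len--;
            }
        return (len == 0) ? i : -ERANGE;
    }

    int main(void)
    {
        const char *bepath = "/local/domain/0/backend/tap/1/2049"; /* hypothetical */
        int off = strsep_len(bepath, '/', 6);

        /* get_be_id() slices here: everything after the 7th '/'. */
        printf("backend id = %s\n", off < 0 ? "?" : bepath + off + 1);
        return 0;
    }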
diff --git a/tools/blktap/lib/xs_api.c b/tools/blktap/lib/xs_api.c
deleted file mode 100644
index 4648432..0000000
--- a/tools/blktap/lib/xs_api.c
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * xs_api.c
- *
- * blocktap interface functions to xenstore
- *
- * (c) 2005 Andrew Warfield and Julian Chesterfield
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <err.h>
-#include <stdarg.h>
-#include <errno.h>
-#include <xenstore.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <poll.h>
-#include "blktaplib.h"
-#include "list.h"
-#include "xs_api.h"
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-static LIST_HEAD(watches);
-#define BASE_DEV_VAL 2048
-
-int xs_gather(struct xs_handle *xs, const char *dir, ...)
-{
- va_list ap;
- const char *name;
- char *path, **e;
- int ret = 0, num,i;
- unsigned int len;
- xs_transaction_t xth;
-
-again:
- if ( (xth = xs_transaction_start(xs)) == XBT_NULL) {
- DPRINTF("unable to start xs trasanction\n");
- ret = ENOMEM;
- return ret;
- }
-
- va_start(ap, dir);
- while ( (ret == 0) && (name = va_arg(ap, char *)) != NULL) {
- const char *fmt = va_arg(ap, char *);
- void *result = va_arg(ap, void *);
- char *p;
-
- if (asprintf(&path, "%s/%s", dir, name) == -1)
- {
- printf("allocation error in xs_gather!\n");
- ret = ENOMEM;
- break;
- }
-
- p = xs_read(xs, xth, path, &len);
-
-
- free(path);
- if (p == NULL) {
- ret = ENOENT;
- break;
- }
- if (fmt) {
- if (sscanf(p, fmt, result) == 0)
- ret = EINVAL;
- free(p);
- } else
- *(char **)result = p;
- }
- va_end(ap);
-
- if (!xs_transaction_end(xs, xth, ret)) {
- if (ret == 0 && errno == EAGAIN)
- goto again;
- else
- ret = errno;
- }
-
- return ret;
-}
-
-
-/* Single printf and write: returns ENOMEM on allocation failure, else the xs_write() result (0 on failure). */
-int xs_printf(struct xs_handle *h, const char *dir, const char *node,
- const char *fmt, ...)
-{
- char *buf, *path;
- va_list ap;
- int ret;
-
- va_start(ap, fmt);
- ret = vasprintf(&buf, fmt, ap);
- va_end(ap);
-
- if (ret == -1)
- return ENOMEM;
- if (asprintf(&path, "%s/%s", dir, node) == -1) {
- free(buf);
- return ENOMEM;
- }
-
- ret = xs_write(h, XBT_NULL, path, buf, strlen(buf));
-
- free(buf);
- free(path);
-
- return ret;
-}
-
-
-int xs_exists(struct xs_handle *h, const char *path)
-{
- char **d;
- unsigned int num;
- xs_transaction_t xth;
-
- if ( (xth = xs_transaction_start(h)) == XBT_NULL) {
- printf("unable to start xs trasanction\n");
- return 0;
- }
-
- d = xs_directory(h, xth, path, &num);
- xs_transaction_end(h, xth, 0);
- if (d == NULL)
- return 0;
- free(d);
- return 1;
-}
-
-
-
-/**
- * This assumes that the domain name we are looking for (DOMNAME,
- * i.e. "Domain-0") is unique.
- */
-char *get_dom_domid(struct xs_handle *h)
-{
- char **e, *val, *domid = NULL;
- unsigned int num, len;
- int i;
- char *path;
- xs_transaction_t xth;
-
- if ( (xth = xs_transaction_start(h)) == XBT_NULL) {
- warn("unable to start xs trasanction\n");
- return NULL;
- }
-
- e = xs_directory(h, xth, "/local/domain", &num);
- if (e == NULL)
- goto done;
-
- for (i = 0; (i < num) && (domid == NULL); i++) {
- if (asprintf(&path, "/local/domain/%s/name", e[i]) == -1)
- break;
- val = xs_read(h, xth, path, &len);
- free(path);
- if (val == NULL)
- continue;
-
- if (strcmp(val, DOMNAME) == 0) {
- /* match! */
- if (asprintf(&path, "/local/domain/%s/domid", e[i]) == -1) {
- free(val);
- break;
- }
- domid = xs_read(h, xth, path, &len);
- free(path);
- }
- free(val);
- }
-done:
- xs_transaction_end(h, xth, 0);
- if (e)
- free(e);
- return domid;
-}
-
-int convert_dev_name_to_num(char *name) {
- char *p, *ptr;
- int majors[10] = {3,22,33,34,56,57,88,89,90,91};
- int maj,i,ret = 0;
- char *p_sd = "/dev/sd";
- char *p_hd = "/dev/hd";
- char *p_xvd = "/dev/xvd";
- char *p_plx = "plx";
- char *alpha = "abcdefghijklmnop";
-
- if (strstr(name, p_sd) != NULL) {
- p = name + strlen(p_sd);
- for(i = 0, ptr = alpha; i < strlen(alpha); i++) {
- if(*ptr == *p)
- break;
- ptr++;
- }
- p++;
- ret = BASE_DEV_VAL + (16*i) + atoi(p);
- } else if (strstr(name, p_hd) != NULL) {
- p = name + strlen(p_hd);
- for (i = 0, ptr = alpha; i < strlen(alpha); i++) {
- if(*ptr == *p) break;
- ptr++;
- }
- p++;
- ret = (majors[i/2]*256) + atoi(p);
-
- } else if (strstr(name, p_xvd) != NULL) {
- p = name + strlen(p_xvd);
- for(i = 0, ptr = alpha; i < strlen(alpha); i++) {
- if(*ptr == *p) break;
- ptr++;
- }
- p++;
- ret = (202*256) + (16*i) + atoi(p);
-
- } else if (strstr(name, p_plx) != NULL) {
- p = name + strlen(p_plx);
- ret = atoi(p);
-
- } else {
- DPRINTF("Unknown device type, setting to default.\n");
- ret = BASE_DEV_VAL;
- }
-
- return ret;
-}
-
-/**
- * A little paranoia: we don't just trust token.
- */
-static struct xenbus_watch *find_watch(const char *token)
-{
- struct xenbus_watch *i, *cmp;
-
- cmp = (void *)strtoul(token, NULL, 16);
-
- list_for_each_entry(i, &watches, list)
- if (i == cmp)
- return i;
- return NULL;
-}
-
-/**
- * Register callback to watch this node.
- * Unlike xs_watch, this returns 0 on success and -EINVAL on failure.
- */
-int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
-{
- /* Pointer in ascii is the token. */
- char token[sizeof(watch) * 2 + 1];
-
- snprintf(token, sizeof(token), "%lX", (long)watch);
- if (find_watch(token)) {
- DPRINTF("watch collision!\n");
- return -EINVAL;
- }
-
- if (!xs_watch(h, watch->node, token)) {
- DPRINTF("unable to set watch!\n");
- return -EINVAL;
- }
-
- list_add(&watch->list, &watches);
-
- return 0;
-}
-
-int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
-{
- char token[sizeof(watch) * 2 + 1];
-
- snprintf(token, sizeof(token), "%lX", (long)watch);
- if (!find_watch(token)) {
- DPRINTF("no such watch!\n");
- return -EINVAL;
- }
-
- if (!xs_unwatch(h, watch->node, token))
- DPRINTF("XENBUS Failed to release watch %s\n",
- watch->node);
-
- list_del(&watch->list);
-
- return 0;
-}
-
-/**
- * Re-register callbacks to all watches.
- */
-void reregister_xenbus_watches(struct xs_handle *h)
-{
- struct xenbus_watch *watch;
- char token[sizeof(watch) * 2 + 1];
-
- list_for_each_entry(watch, &watches, list) {
- snprintf(token, sizeof(token), "%lX", (long)watch);
- xs_watch(h, watch->node, token);
- }
-}
-
-/**
- * based on watch_thread()
- */
-int xs_fire_next_watch(struct xs_handle *h)
-{
- char **res;
- char *token;
- char *node = NULL;
- struct xenbus_watch *w;
- int er;
- unsigned int num;
-
- res = xs_read_watch(h, &num);
- if (res == NULL)
- return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0... */
-
- node = res[XS_WATCH_PATH];
- token = res[XS_WATCH_TOKEN];
-
- w = find_watch(token);
- if (w)
- w->callback(h, w, node);
-
- free(res);
-
- return 1;
-}
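Typical use of xs_gather() above, mirroring its callers: name/format/result triples, a NULL format meaning "hand back the raw malloc'd string", terminated by a NULL name. A hedged sketch (the path is hypothetical; assumes the removed headers are still on the include path):

    #include <stdio.h>
    #include <stdlib.h>
    #include <xenstore.h>
    #include "list.h"
    #include "xs_api.h"

    int main(void)
    {
        struct xs_handle *h = xs_open(0);
        long frontend_id;
        char *frontend = NULL;

        if (!h)
            return 1;
        if (xs_gather(h, "/local/domain/0/backend/tap/1/2049", /* hypothetical */
                      "frontend-id", "%li", &frontend_id,
                      "frontend",    NULL,  &frontend,
                      NULL) == 0) {
            printf("frontend %ld at %s\n", frontend_id, frontend);
            free(frontend);    /* NULL-format results are caller-freed */
        }
        xs_close(h);
        return 0;
    }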
diff --git a/tools/blktap/lib/xs_api.h b/tools/blktap/lib/xs_api.h
deleted file mode 100644
index 34430dc..0000000
--- a/tools/blktap/lib/xs_api.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * xs_api.h
- *
- * (c) 2005 Andrew Warfield and Julian Chesterfield
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-struct xenbus_watch
-{
- struct list_head list;
- char *node;
- void (*callback)(struct xs_handle *h,
- struct xenbus_watch *,
- const char *node);
-};
-
-int xs_gather(struct xs_handle *xs, const char *dir, ...);
-int xs_printf(struct xs_handle *h, const char *dir, const char *node,
- const char *fmt, ...);
-int xs_exists(struct xs_handle *h, const char *path);
-char *get_dom_domid(struct xs_handle *h);
-int convert_dev_name_to_num(char *name);
-int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch);
-int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch);
-void reregister_xenbus_watches(struct xs_handle *h);
-int xs_fire_next_watch(struct xs_handle *h);
diff --git a/tools/blktap2/Makefile b/tools/blktap2/Makefile
index d41758f..94200dc 100644
--- a/tools/blktap2/Makefile
+++ b/tools/blktap2/Makefile
@@ -14,5 +14,7 @@ SUBDIRS-$(CONFIG_Linux) += control
clean:
rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) TAGS
-.PHONY: all clean install
-all clean install: %: subdirs-%
+distclean: clean
+
+.PHONY: all clean install distclean
+all clean install distclean: %: subdirs-%
diff --git a/tools/blktap2/control/Makefile b/tools/blktap2/control/Makefile
index a88ff4c..767f52a 100644
--- a/tools/blktap2/control/Makefile
+++ b/tools/blktap2/control/Makefile
@@ -61,18 +61,20 @@ $(LIB_SHARED): $(CTL_PICS)
$(CC) $(LDFLAGS) -fPIC -Wl,$(SONAME_LDFLAG) -Wl,$(LIBSONAME) $(SHLIB_LDFLAGS) -rdynamic $^ -o $@ $(APPEND_LDFLAGS)
install: $(IBIN) $(LIB_STATIC) $(LIB_SHARED)
- $(INSTALL_DIR) -p $(DESTDIR)$(SBINDIR)
- $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(SBINDIR)
- $(INSTALL_DATA) $(LIB_STATIC) $(DESTDIR)$(LIBDIR)
- $(INSTALL_PROG) $(LIB_SHARED) $(DESTDIR)$(LIBDIR)
- ln -sf $(LIBSONAME) $(DESTDIR)$(LIBDIR)/$(LIBNAME).so
- ln -sf $(LIB_SHARED) $(DESTDIR)$(LIBDIR)/$(LIBSONAME)
+ $(INSTALL_DIR) -p $(DESTDIR)$(sbindir)
+ $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(sbindir)
+ $(INSTALL_DATA) $(LIB_STATIC) $(DESTDIR)$(libdir)
+ $(INSTALL_PROG) $(LIB_SHARED) $(DESTDIR)$(libdir)
+ ln -sf $(LIBSONAME) $(DESTDIR)$(libdir)/$(LIBNAME).so
+ ln -sf $(LIB_SHARED) $(DESTDIR)$(libdir)/$(LIBSONAME)
clean:
rm -f $(OBJS) $(PICS) $(DEPS) $(IBIN) $(LIB_STATIC) $(LIB_SHARED)
rm -f $(LIBNAME).so $(LIBSONAME)
rm -f *~
-.PHONY: all build clean install
+distclean: clean
+
+.PHONY: all build clean distclean install
-include $(DEPS)
diff --git a/tools/blktap2/drivers/Makefile b/tools/blktap2/drivers/Makefile
index 3476fc1..5328c40 100644
--- a/tools/blktap2/drivers/Makefile
+++ b/tools/blktap2/drivers/Makefile
@@ -7,7 +7,7 @@ LIBVHDDIR = $(BLKTAP_ROOT)/vhd/lib
IBIN = tapdisk2 td-util tapdisk-client tapdisk-stream tapdisk-diff
QCOW_UTIL = img2qcow qcow-create qcow2raw
LOCK_UTIL = lock-util
-INST_DIR = $(SBINDIR)
+INST_DIR = $(sbindir)
CFLAGS += -Werror
CFLAGS += -Wno-unused
@@ -108,4 +108,6 @@ install: all
clean:
rm -rf .*.d *.o *~ xen TAGS $(IBIN) $(LIB) $(LOCK_UTIL) $(QCOW_UTIL)
-.PHONY: clean install
+distclean: clean
+
+.PHONY: clean install distclean
diff --git a/tools/blktap2/drivers/libaio-compat.h b/tools/blktap2/drivers/libaio-compat.h
index 47cd96d..ca9ff45 100644
--- a/tools/blktap2/drivers/libaio-compat.h
+++ b/tools/blktap2/drivers/libaio-compat.h
@@ -13,9 +13,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
/*
diff --git a/tools/blktap2/drivers/tapdisk-vbd.c b/tools/blktap2/drivers/tapdisk-vbd.c
index c665f27..6d1d94a 100644
--- a/tools/blktap2/drivers/tapdisk-vbd.c
+++ b/tools/blktap2/drivers/tapdisk-vbd.c
@@ -1684,7 +1684,7 @@ tapdisk_vbd_check_ring_message(td_vbd_t *vbd)
if (!vbd->ring.sring)
return -EINVAL;
- switch (vbd->ring.sring->private.tapif_user.msg) {
+ switch (vbd->ring.sring->pvt.tapif_user.msg) {
case 0:
return 0;
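The private -> pvt rename above matters because "private" is a keyword in C++, so a C++ consumer of the shared ring header cannot even compile against it; presumably that is what motivated the upstream field rename. A small illustration (the demo struct is hypothetical):

    struct sring_demo {               /* hypothetical stand-in for a sring */
        union {
            struct { int msg; } tapif_user;
            char padding[32];
        } pvt;   /* previously spelled "private", a C++ keyword */
    };

    int main(void)
    {
        struct sring_demo s = { .pvt.tapif_user.msg = 0 };
        return s.pvt.tapif_user.msg;
    }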
diff --git a/tools/blktap2/include/Makefile b/tools/blktap2/include/Makefile
index f85351e..66e8a1e 100644
--- a/tools/blktap2/include/Makefile
+++ b/tools/blktap2/include/Makefile
@@ -6,9 +6,12 @@ all:
.PHONY: install
install:
- $(INSTALL_DIR) -p $(DESTDIR)$(INCLUDEDIR)
+ $(INSTALL_DIR) -p $(DESTDIR)$(includedir)
.PHONY: clean
clean:
@:
+
+.PHONY: distclean
+distclean: clean
diff --git a/tools/blktap2/lvm/Makefile b/tools/blktap2/lvm/Makefile
index cf8a53b..7d5f8ea 100644
--- a/tools/blktap2/lvm/Makefile
+++ b/tools/blktap2/lvm/Makefile
@@ -29,6 +29,8 @@ lvm-util: lvm-util.o
clean:
rm -rf *.o *.opic *~ $(DEPS) $(IBIN)
-.PHONY: all build clean install lvm-util
+distclean: clean
+
+.PHONY: all build clean distclean install lvm-util
-include $(DEPS)
diff --git a/tools/blktap2/vhd/Makefile b/tools/blktap2/vhd/Makefile
index fef0d36..fabd665 100644
--- a/tools/blktap2/vhd/Makefile
+++ b/tools/blktap2/vhd/Makefile
@@ -6,7 +6,7 @@ SUBDIRS-y :=
SUBDIRS-y += lib
IBIN = vhd-util vhd-update
-INST_DIR = $(SBINDIR)
+INST_DIR = $(sbindir)
CFLAGS += -Werror
CFLAGS += -Wno-unused
@@ -44,6 +44,8 @@ install: all
clean: subdirs-clean
rm -rf *.o *~ $(DEPS) $(IBIN)
-.PHONY: all build clean install vhd-util vhd-update
+distclean: clean
+
+.PHONY: all build clean distclean install vhd-util vhd-update
-include $(DEPS)
diff --git a/tools/blktap2/vhd/lib/Makefile b/tools/blktap2/vhd/lib/Makefile
index 99e3cdb..ab2d648 100644
--- a/tools/blktap2/vhd/lib/Makefile
+++ b/tools/blktap2/vhd/lib/Makefile
@@ -10,7 +10,7 @@ LVM-UTIL-OBJ := $(BLKTAP_ROOT)/lvm/lvm-util.o
LIBVHD-BUILD := libvhd.a
-INST-DIR = $(LIBDIR)
+INST-DIR = $(libdir)
CFLAGS += -Werror
CFLAGS += -Wno-unused
@@ -75,6 +75,8 @@ install: all
clean:
rm -rf *.a *.so* *.o *.opic *~ $(DEPS) $(LIBVHD)
-.PHONY: all build clean install libvhd
+distclean: clean
+
+.PHONY: all build clean distclean install libvhd
-include $(DEPS)
diff --git a/tools/blktap2/vhd/lib/libvhd.c b/tools/blktap2/vhd/lib/libvhd.c
index 95eb5d6..1fd5b4e 100644
--- a/tools/blktap2/vhd/lib/libvhd.c
+++ b/tools/blktap2/vhd/lib/libvhd.c
@@ -37,6 +37,7 @@
#include <iconv.h>
#include <sys/mman.h>
#include <sys/stat.h>
+#include <langinfo.h>
#include "libvhd.h"
#include "relative-path.h"
@@ -1296,6 +1297,7 @@ vhd_macx_encode_location(char *name, char **out, int *outlen)
size_t ibl, obl;
char *uri, *uri_utf8, *uri_utf8p, *ret;
const char *urip;
+ char *codeset;
err = 0;
ret = NULL;
@@ -1304,7 +1306,7 @@ vhd_macx_encode_location(char *name, char **out, int *outlen)
len = strlen(name) + strlen("file://");
ibl = len;
- obl = len;
+ obl = len * 2;
urip = uri = malloc(ibl + 1);
uri_utf8 = uri_utf8p = malloc(obl);
@@ -1312,7 +1314,8 @@ vhd_macx_encode_location(char *name, char **out, int *outlen)
if (!uri || !uri_utf8)
return -ENOMEM;
- cd = iconv_open("UTF-8", "ASCII");
+ codeset = nl_langinfo(CODESET);
+ cd = iconv_open("UTF-8", codeset);
if (cd == (iconv_t)-1) {
err = -errno;
goto out;
@@ -1325,7 +1328,7 @@ vhd_macx_encode_location(char *name, char **out, int *outlen)
(char **)
#endif
&urip, &ibl, &uri_utf8p, &obl) == (size_t)-1 ||
- ibl || obl) {
+ ibl) {
err = (errno ? -errno : -EIO);
goto out;
}
@@ -1357,6 +1360,7 @@ vhd_w2u_encode_location(char *name, char **out, int *outlen)
size_t ibl, obl;
char *uri, *uri_utf16, *uri_utf16p, *tmp, *ret;
const char *urip;
+ char *codeset;
err = 0;
ret = NULL;
@@ -1404,7 +1408,8 @@ vhd_w2u_encode_location(char *name, char **out, int *outlen)
* MICROSOFT_COMPAT
* little endian unicode here
*/
- cd = iconv_open("UTF-16LE", "ASCII");
+ codeset = nl_langinfo(CODESET);
+ cd = iconv_open("UTF-16LE", codeset);
if (cd == (iconv_t)-1) {
err = -errno;
goto out;
@@ -1415,7 +1420,7 @@ vhd_w2u_encode_location(char *name, char **out, int *outlen)
(char **)
#endif
&urip, &ibl, &uri_utf16p, &obl) == (size_t)-1 ||
- ibl || obl) {
+ ibl) {
err = (errno ? -errno : -EIO);
goto out;
}
@@ -1447,11 +1452,13 @@ vhd_macx_decode_location(const char *in, char *out, int len)
iconv_t cd;
char *name;
size_t ibl, obl;
+ char *codeset;
name = out;
ibl = obl = len;
- cd = iconv_open("ASCII", "UTF-8");
+ codeset = nl_langinfo(CODESET);
+ cd = iconv_open(codeset, "UTF-8");
if (cd == (iconv_t)-1)
return NULL;
@@ -1479,11 +1486,13 @@ vhd_w2u_decode_location(const char *in, char *out, int len, char *utf_type)
iconv_t cd;
char *name, *tmp;
size_t ibl, obl;
+ char *codeset;
tmp = name = out;
ibl = obl = len;
- cd = iconv_open("ASCII", utf_type);
+ codeset = nl_langinfo(CODESET);
+ cd = iconv_open(codeset, utf_type);
if (cd == (iconv_t)-1)
return NULL;
@@ -2450,6 +2459,7 @@ vhd_initialize_header_parent_name(vhd_context_t *ctx, const char *parent_path)
size_t ibl, obl;
char *ppath, *dst;
const char *pname;
+ char *codeset;
err = 0;
pname = NULL;
@@ -2459,7 +2469,8 @@ vhd_initialize_header_parent_name(vhd_context_t *ctx, const char *parent_path)
* MICROSOFT_COMPAT
* big endian unicode here
*/
- cd = iconv_open(UTF_16BE, "ASCII");
+ codeset = nl_langinfo(CODESET);
+ cd = iconv_open(UTF_16BE, codeset);
if (cd == (iconv_t)-1) {
err = -errno;
goto out;
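These libvhd.c hunks all follow one pattern: derive the local side of the conversion from the current locale via nl_langinfo(CODESET) instead of hardcoding "ASCII", size the output buffer generously (UTF-8 can expand the input, hence obl = len * 2 and the dropped "|| obl" check, since leftover output space is now expected), and rely on setlocale() having run first (added to vhd-util.c below). A stand-alone sketch of the pattern, with a hypothetical input string:

    #include <iconv.h>
    #include <langinfo.h>
    #include <locale.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        setlocale(LC_CTYPE, "");   /* else CODESET reports the "C" locale */

        char in[] = "file:///tmp/parent.vhd";  /* hypothetical location */
        char out[2 * sizeof(in)];  /* UTF-8 output may be larger than input */
        char *ip = in, *op = out;
        size_t ibl = strlen(in), obl = sizeof(out);

        iconv_t cd = iconv_open("UTF-8", nl_langinfo(CODESET));
        if (cd == (iconv_t)-1)
            return 1;
        /* some platforms declare inbuf as const char **, cf. the casts above */
        if (iconv(cd, &ip, &ibl, &op, &obl) == (size_t)-1 || ibl)
            return 1;              /* unconverted input bytes remain */
        iconv_close(cd);

        printf("%.*s\n", (int)(sizeof(out) - obl), out);
        return 0;
    }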
diff --git a/tools/blktap2/vhd/vhd-util.c b/tools/blktap2/vhd/vhd-util.c
index 944a59e..13f1835 100644
--- a/tools/blktap2/vhd/vhd-util.c
+++ b/tools/blktap2/vhd/vhd-util.c
@@ -28,6 +28,8 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <langinfo.h>
+#include <locale.h>
#include "libvhd.h"
#include "vhd-util.h"
@@ -114,6 +116,7 @@ main(int argc, char *argv[])
if (setrlimit(RLIMIT_CORE, &rlim) < 0)
fprintf(stderr, "setrlimit failed: %d\n", errno);
#endif
+ setlocale(LC_CTYPE, "");
ret = 0;
diff --git a/tools/config.h.in b/tools/config.h.in
index 2a0ae48..478a2cc 100644
--- a/tools/config.h.in
+++ b/tools/config.h.in
@@ -1,8 +1,29 @@
/* config.h.in. Generated from configure.ac by autoheader. */
+/* Enabling support partial device tree in libxl */
+#undef ENABLE_PARTIAL_DEVICE_TREE
+
/* Blktap2 enabled */
#undef HAVE_BLKTAP2
+/* Define to 1 if you have the declaration of `fdt_first_subnode', and to 0 if
+ you don't. */
+#undef HAVE_DECL_FDT_FIRST_SUBNODE
+
+/* Define to 1 if you have the declaration of `fdt_next_subnode', and to 0 if
+ you don't. */
+#undef HAVE_DECL_FDT_NEXT_SUBNODE
+
+/* Define to 1 if you have the declaration of `fdt_property_u32', and to 0 if
+ you don't. */
+#undef HAVE_DECL_FDT_PROPERTY_U32
+
+/* Define to 1 if you have the `fdt_first_subnode' function. */
+#undef HAVE_FDT_FIRST_SUBNODE
+
+/* Define to 1 if you have the `fdt_next_subnode' function. */
+#undef HAVE_FDT_NEXT_SUBNODE
+
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
diff --git a/tools/configure b/tools/configure
index 2fa7426..aa66876 100755
--- a/tools/configure
+++ b/tools/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for Xen Hypervisor Tools 4.5.
+# Generated by GNU Autoconf 2.69 for Xen Hypervisor Tools 4.6.
#
# Report bugs to <xen-devel at lists.xen.org>.
#
@@ -580,8 +580,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='Xen Hypervisor Tools'
PACKAGE_TARNAME='xen'
-PACKAGE_VERSION='4.5'
-PACKAGE_STRING='Xen Hypervisor Tools 4.5'
+PACKAGE_VERSION='4.6'
+PACKAGE_STRING='Xen Hypervisor Tools 4.6'
PACKAGE_BUGREPORT='xen-devel at lists.xen.org'
PACKAGE_URL='http://www.xen.org/'
@@ -632,6 +632,7 @@ systemd
remus_netbuf
LIBNL3_LIBS
LIBNL3_CFLAGS
+argp_ldflags
libiconv
PTYFUNCS_LIBS
PTHREAD_LIBS
@@ -644,12 +645,16 @@ zlib
FETCHER
FTP
WGET
+pixman_LIBS
+pixman_CFLAGS
glib_LIBS
glib_CFLAGS
PKG_CONFIG_LIBDIR
PKG_CONFIG_PATH
PKG_CONFIG
+TINFO_LIBS
CURSES_LIBS
+PY_NOOPT_CFLAGS
EGREP
GREP
CPP
@@ -657,7 +662,6 @@ pyconfig
PYTHONPATH
CHECKPOLICY
XENSTORED
-AWK
OCAMLFIND
OCAMLBUILD
OCAMLDOC
@@ -676,6 +680,7 @@ INSTALL_DATA
INSTALL_SCRIPT
INSTALL_PROGRAM
SET_MAKE
+AWK
IASL
BCC
LD86
@@ -695,13 +700,13 @@ PREPEND_INCLUDES
EXTRA_QEMUU_CONFIGURE_ARGS
ovmf_path
seabios_path
+qemu_xen_systemd
+qemu_xen_path
qemu_xen
rombios
qemu_traditional
blktap2
LINUX_BACKEND_MODULES
-blktap1
-debug
seabios
ovmf
xsmpolicy
@@ -709,6 +714,7 @@ ocamltools
monitors
githttp
rpath
+XEN_DUMP_DIR
XEN_PAGING_DIR
XEN_LOCK_DIR
XEN_SCRIPT_DIR
@@ -719,6 +725,7 @@ SHAREDIR
XEN_LIB_STORED
XEN_LOG_DIR
XEN_RUN_DIR
+XENFIRMWAREDIR
LIBEXEC_BIN
CONFIG_LEAF_DIR
FILE_OFFSET_BITS
@@ -782,6 +789,7 @@ enable_option_checking
enable_largefile
with_initddir
with_sysconfig_leaf_dir
+with_xen_dumpdir
enable_rpath
enable_githttp
enable_monitors
@@ -789,8 +797,6 @@ enable_ocamltools
enable_xsmpolicy
enable_ovmf
enable_seabios
-enable_debug
-enable_blktap1
with_linux_backend_modules
enable_blktap2
enable_qemu_traditional
@@ -828,12 +834,15 @@ AS86
LD86
BCC
IASL
+AWK
CPP
PKG_CONFIG
PKG_CONFIG_PATH
PKG_CONFIG_LIBDIR
glib_CFLAGS
glib_LIBS
+pixman_CFLAGS
+pixman_LIBS
LIBNL3_CFLAGS
LIBNL3_LIBS
SYSTEMD_CFLAGS
@@ -1378,7 +1387,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures Xen Hypervisor Tools 4.5 to adapt to many kinds of systems.
+\`configure' configures Xen Hypervisor Tools 4.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1443,7 +1452,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of Xen Hypervisor Tools 4.5:";;
+ short | recursive ) echo "Configuration of Xen Hypervisor Tools 4.6:";;
esac
cat <<\_ACEOF
@@ -1462,8 +1471,6 @@ Optional Features:
--disable-xsmpolicy Disable XSM policy compilation (default is ENABLED)
--enable-ovmf Enable OVMF (default is DISABLED)
--disable-seabios Disable SeaBIOS (default is ENABLED)
- --disable-debug Disable debug build of tools (default is ENABLED)
- --enable-blktap1 Enable blktap1 tools (default is DISABLED)
--enable-blktap2 Enable blktap2, (DEFAULT is on for Linux, otherwise
off)
--enable-qemu-traditional
@@ -1483,6 +1490,8 @@ Optional Packages:
options for runlevel scripts and daemons such as
xenstored. This should be either "sysconfig" or
"default". [sysconfig]
+ --with-xen-dumpdir=DIR Path to directory for domU crash dumps.
+ [LOCALSTATEDIR/lib/xen/dump]
--with-linux-backend-modules="mod1 mod2"
List of Linux backend module or modalias names to be
autoloaded on startup.
@@ -1542,6 +1551,7 @@ Some influential environment variables:
LD86 Path to ld86 tool
BCC Path to bcc tool
IASL Path to iasl tool
+ AWK Path to awk tool
CPP C preprocessor
PKG_CONFIG path to pkg-config utility
PKG_CONFIG_PATH
@@ -1550,6 +1560,9 @@ Some influential environment variables:
path overriding pkg-config's built-in search path
glib_CFLAGS C compiler flags for glib, overriding pkg-config
glib_LIBS linker flags for glib, overriding pkg-config
+ pixman_CFLAGS
+ C compiler flags for pixman, overriding pkg-config
+ pixman_LIBS linker flags for pixman, overriding pkg-config
LIBNL3_CFLAGS
C compiler flags for LIBNL3, overriding pkg-config
LIBNL3_LIBS linker flags for LIBNL3, overriding pkg-config
@@ -1625,7 +1638,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-Xen Hypervisor Tools configure 4.5
+Xen Hypervisor Tools configure 4.6
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1923,11 +1936,124 @@ fi
as_fn_set_status $ac_retval
} # ac_fn_c_try_link
+
+# ac_fn_c_check_func LINENO FUNC VAR
+# ----------------------------------
+# Tests whether FUNC exists, setting the cache variable VAR accordingly
+ac_fn_c_check_func ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+/* Define $2 to an innocuous variant, in case <limits.h> declares $2.
+ For example, HP-UX 11i <limits.h> declares gettimeofday. */
+#define $2 innocuous_$2
+
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char $2 (); below.
+ Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+ <limits.h> exists even on freestanding compilers. */
+
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+
+#undef $2
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char $2 ();
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined __stub_$2 || defined __stub___$2
+choke me
+#endif
+
+int
+main ()
+{
+return $2 ();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ eval "$3=yes"
+else
+ eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_func
+
+# ac_fn_c_check_decl LINENO SYMBOL VAR INCLUDES
+# ---------------------------------------------
+# Tests whether SYMBOL is declared in INCLUDES, setting cache variable VAR
+# accordingly.
+ac_fn_c_check_decl ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ as_decl_name=`echo $2|sed 's/ *(.*//'`
+ as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'`
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5
+$as_echo_n "checking whether $as_decl_name is declared... " >&6; }
+if eval \${$3+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+int
+main ()
+{
+#ifndef $as_decl_name
+#ifdef __cplusplus
+ (void) $as_decl_use;
+#else
+ (void) $as_decl_name;
+#endif
+#endif
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ eval "$3=yes"
+else
+ eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_decl
cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by Xen Hypervisor Tools $as_me 4.5, which was
+It was created by Xen Hypervisor Tools $as_me 4.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
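The ac_fn_c_check_func helper added above probes for a function by link-testing a deliberately wrong but harmless declaration. Roughly, the conftest.c it emits when checking for fdt_first_subnode looks like the following (an illustrative reconstruction, not configure's literal output; it links only when -lfdt supplies the symbol, and that link success or failure is the entire test):

    /* Define the name away while including a system header, in case the
     * header itself declares it, then probe the bare symbol. */
    #define fdt_first_subnode innocuous_fdt_first_subnode
    #include <limits.h>
    #undef fdt_first_subnode

    char fdt_first_subnode ();   /* only symbol resolution is tested,
                                    never the real prototype */

    int main ()
    {
        return fdt_first_subnode ();
    }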
@@ -2276,7 +2402,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
-ac_config_files="$ac_config_files ../config/Tools.mk hotplug/FreeBSD/rc.d/xencommons hotplug/Linux/init.d/sysconfig.xencommons hotplug/Linux/init.d/xen-watchdog hotplug/Linux/init.d/xencommons hotplug/Linux/init.d/xendomains hotplug/Linux/systemd/proc-xen.mount hotplug/Linux/systemd/var-lib-xenstored.mount hotplug/Linux/systemd/xen-init-dom0.service hotplug/Linux/systemd/xen-qemu-dom0-disk-backend.service hotplug/Linux/systemd/xen-watchdog.service hotplug/Linux/systemd/xenconsoled.servic [...]
+ac_config_files="$ac_config_files ../config/Tools.mk hotplug/FreeBSD/rc.d/xencommons hotplug/FreeBSD/rc.d/xendriverdomain hotplug/Linux/init.d/sysconfig.xencommons hotplug/Linux/init.d/xen-watchdog hotplug/Linux/init.d/xencommons hotplug/Linux/init.d/xendomains hotplug/Linux/init.d/xendriverdomain hotplug/Linux/vif-setup hotplug/Linux/xen-hotplug-common.sh hotplug/Linux/xendomains hotplug/NetBSD/rc.d/xencommons hotplug/NetBSD/rc.d/xendriverdomain libxl/xenlight.pc.in libxl/xlutil.pc.in"
ac_config_headers="$ac_config_headers config.h"
@@ -3216,7 +3342,7 @@ else
We can't simply define LARGE_OFF_T to be 9223372036854775807,
since some C++ compilers masquerading as C compilers
incorrectly reject 9223372036854775807. */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
&& LARGE_OFF_T % 2147483647 == 1)
? 1 : -1];
@@ -3262,7 +3388,7 @@ else
We can't simply define LARGE_OFF_T to be 9223372036854775807,
since some C++ compilers masquerading as C compilers
incorrectly reject 9223372036854775807. */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
&& LARGE_OFF_T % 2147483647 == 1)
? 1 : -1];
@@ -3286,7 +3412,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
We can't simply define LARGE_OFF_T to be 9223372036854775807,
since some C++ compilers masquerading as C compilers
incorrectly reject 9223372036854775807. */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
&& LARGE_OFF_T % 2147483647 == 1)
? 1 : -1];
@@ -3331,7 +3457,7 @@ else
We can't simply define LARGE_OFF_T to be 9223372036854775807,
since some C++ compilers masquerading as C compilers
incorrectly reject 9223372036854775807. */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
&& LARGE_OFF_T % 2147483647 == 1)
? 1 : -1];
@@ -3355,7 +3481,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
We can't simply define LARGE_OFF_T to be 9223372036854775807,
since some C++ compilers masquerading as C compilers
incorrectly reject 9223372036854775807. */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
&& LARGE_OFF_T % 2147483647 == 1)
? 1 : -1];
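The repeated LARGE_OFF_T hunks above are newer Autoconf boilerplate: the constant is still 2^63 - 1, but it is now built from two 31-bit shifts so that no single shift exceeds the width of a 32-bit off_t while the probe compiles. The check itself is a compile-time assertion via array size; a sketch:

    /* Compiles only if off_t can hold 9223372036854775807 (the array size
     * becomes -1 otherwise).  _FILE_OFFSET_BITS is forced here purely for
     * illustration; configure defines it for real builds. */
    #define _FILE_OFFSET_BITS 64
    #include <sys/types.h>

    #define LARGE_OFF_T \
        ((((off_t)1 << 31) << 31) - 1 + (((off_t)1 << 31) << 31))

    int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 &&
                        LARGE_OFF_T % 2147483647 == 1) ? 1 : -1];

    int main(void) { return 0; }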
@@ -3448,6 +3574,10 @@ esac
+
+
+
+
# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
# serial 1 (pkg-config-0.24)
#
@@ -3464,8 +3594,7 @@ esac
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
@@ -3674,8 +3803,7 @@ esac
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
@@ -3758,6 +3886,15 @@ fi
CONFIG_LEAF_DIR=$config_leaf_dir
+
+# Check whether --with-xen-dumpdir was given.
+if test "${with_xen_dumpdir+set}" = set; then :
+ withval=$with_xen_dumpdir; xen_dumpdir_path=$withval
+else
+ xen_dumpdir_path=$localstatedir/lib/xen/dump
+fi
+
+
if test "$libexecdir" = '${exec_prefix}/libexec' ; then
case "$host_os" in
*netbsd*) ;;
@@ -3770,6 +3907,9 @@ libexecdir=`eval echo $libexecdir`
LIBEXEC_BIN=`eval echo $libexecdir/$PACKAGE_TARNAME/bin`
+XENFIRMWAREDIR=`eval echo $libexecdir/$PACKAGE_TARNAME/boot`
+
+
XEN_RUN_DIR=$localstatedir/run/xen
@@ -3804,6 +3944,9 @@ esac
XEN_PAGING_DIR=$localstatedir/lib/xen/xenpaging
+XEN_DUMP_DIR=$xen_dumpdir_path
+
+
# Enable/disable options
@@ -3968,52 +4111,6 @@ seabios=$ax_cv_seabios
-# Check whether --enable-debug was given.
-if test "${enable_debug+set}" = set; then :
- enableval=$enable_debug;
-fi
-
-
-if test "x$enable_debug" = "xno"; then :
-
- ax_cv_debug="n"
-
-elif test "x$enable_debug" = "xyes"; then :
-
- ax_cv_debug="y"
-
-elif test -z $ax_cv_debug; then :
-
- ax_cv_debug="y"
-
-fi
-debug=$ax_cv_debug
-
-
-
-# Check whether --enable-blktap1 was given.
-if test "${enable_blktap1+set}" = set; then :
- enableval=$enable_blktap1;
-fi
-
-
-if test "x$enable_blktap1" = "xno"; then :
-
- ax_cv_blktap1="n"
-
-elif test "x$enable_blktap1" = "xyes"; then :
-
- ax_cv_blktap1="y"
-
-elif test -z $ax_cv_blktap1; then :
-
- ax_cv_blktap1="n"
-
-fi
-blktap1=$ax_cv_blktap1
-
-
-
# Check whether --with-linux-backend-modules was given.
if test "${with_linux_backend_modules+set}" = set; then :
@@ -4037,7 +4134,6 @@ usbbk
pciback
xen-acpi-processor
blktap2
-blktap
"
;;
*)
@@ -4142,9 +4238,14 @@ fi
if test "${with_system_qemu+set}" = set; then :
withval=$with_system_qemu;
case $withval in
- yes) qemu_xen=n ; qemu_xen_path=qemu ;;
- no) qemu_xen=y ; qemu_xen_path= ;;
- *) qemu_xen=n ; qemu_xen_path=$withval ;;
+ yes)
+ qemu_xen=n ; qemu_xen_path="qemu-system-i386"
+ qemu_xen_systemd="/usr/bin/env $qemu_xen_path" ;;
+ no)
+ qemu_xen=y ;;
+ *)
+ qemu_xen=n ; qemu_xen_path="$withval" ;
+ qemu_xen_systemd="$qemu_xen_path" ;;
esac
else
@@ -4159,15 +4260,19 @@ else
fi
-if test "x$qemu_xen" = "xn"; then :
+if test "x$qemu_xen" = "xy"; then :
+ qemu_xen_path="$LIBEXEC_BIN/qemu-system-i386"
+ qemu_xen_systemd="$qemu_xen_path"
+
+fi
cat >>confdefs.h <<_ACEOF
#define QEMU_XEN_PATH "$qemu_xen_path"
_ACEOF
-fi
+
@@ -4246,6 +4351,7 @@ LDFLAGS="$PREPEND_LDFLAGS $LDFLAGS $APPEND_LDFLAGS"
+
# Checks for programs.
ac_ext=c
ac_cpp='$CPP $CPPFLAGS'
@@ -5035,6 +5141,51 @@ if test x"${PERL}" = x"no"
then
as_fn_error $? "Unable to find perl, please install perl" "$LINENO" 5
fi
+# Extract the first word of "awk", so it can be a program name with args.
+set dummy awk; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_path_AWK+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ case $AWK in
+ [\\/]* | ?:[\\/]*)
+ ac_cv_path_AWK="$AWK" # Let the user override the test with a path.
+ ;;
+ *)
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+ ac_cv_path_AWK="$as_dir/$ac_word$ac_exec_ext"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+ test -z "$ac_cv_path_AWK" && ac_cv_path_AWK="no"
+ ;;
+esac
+fi
+AWK=$ac_cv_path_AWK
+if test -n "$AWK"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AWK" >&5
+$as_echo "$AWK" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+if test x"${AWK}" = x"no"
+then
+ as_fn_error $? "Unable to find awk, please install awk" "$LINENO" 5
+fi
# checking for ocamlc
if test -n "$ac_tool_prefix"; then
@@ -7059,6 +7210,40 @@ CPPFLAGS=$ac_previous_cppflags
LDLFAGS=$ac_previous_ldflags
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether Python setup.py brokenly enables -D_FORTIFY_SOURCE" >&5
+$as_echo_n "checking whether Python setup.py brokenly enables -D_FORTIFY_SOURCE... " >&6; }
+if ${ax_cv_python_fortify+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+
+ ax_cv_python_fortify=no
+ for arg in $($PYTHON-config --cflags); do
+ case "$arg" in
+ -D_FORTIFY_SOURCE=0) ax_cv_python_fortify=no ;;
+ -D_FORTIFY_SOURCE=*) ax_cv_python_fortify=yes ;;
+ -Wp,-D_FORTIFY_SOURCE=0) ax_cv_python_fortify=no ;;
+ -Wp,-D_FORTIFY_SOURCE=*) ax_cv_python_fortify=yes ;;
+ *) ;;
+ esac
+ done
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_python_fortify" >&5
+$as_echo "$ax_cv_python_fortify" >&6; }
+
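+# _FORTIFY_SOURCE has no effect without optimisation (and can warn or
+# error with some toolchains), so substitute -O1 for the usual -O0
+# when Python forces it on.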
+ if test x$ax_cv_python_fortify = xyes; then :
+
+ PY_NOOPT_CFLAGS=-O1
+
+else
+
+ PY_NOOPT_CFLAGS=''
+
+fi
+
+
+
+
fi
if ! $rump; then
@@ -7478,6 +7663,52 @@ $as_echo "#define INCLUDE_CURSES_H <curses.h>" >>confdefs.h
fi
+if test "$ncurses" = "y"; then :
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for define_key in -ltinfo" >&5
+$as_echo_n "checking for define_key in -ltinfo... " >&6; }
+if ${ac_cv_lib_tinfo_define_key+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-ltinfo $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char define_key ();
+int
+main ()
+{
+return define_key ();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ ac_cv_lib_tinfo_define_key=yes
+else
+ ac_cv_lib_tinfo_define_key=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_tinfo_define_key" >&5
+$as_echo "$ac_cv_lib_tinfo_define_key" >&6; }
+if test "x$ac_cv_lib_tinfo_define_key" = xyes; then :
+ TINFO_LIBS=-ltinfo
+fi
+
+
+fi
+
+
@@ -7598,6 +7829,8 @@ $as_echo "no" >&6; }
PKG_CONFIG=""
fi
fi
+if test "x$qemu_xen" = "xy"; then :
+
pkg_failed=no
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for glib" >&5
@@ -7690,6 +7923,99 @@ $as_echo "yes" >&6; }
fi
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for pixman" >&5
+$as_echo_n "checking for pixman... " >&6; }
+
+if test -n "$pixman_CFLAGS"; then
+ pkg_cv_pixman_CFLAGS="$pixman_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+ if test -n "$PKG_CONFIG" && \
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"pixman-1 >= 0.21.8\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "pixman-1 >= 0.21.8") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then
+ pkg_cv_pixman_CFLAGS=`$PKG_CONFIG --cflags "pixman-1 >= 0.21.8" 2>/dev/null`
+ test "x$?" != "x0" && pkg_failed=yes
+else
+ pkg_failed=yes
+fi
+ else
+ pkg_failed=untried
+fi
+if test -n "$pixman_LIBS"; then
+ pkg_cv_pixman_LIBS="$pixman_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+ if test -n "$PKG_CONFIG" && \
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"pixman-1 >= 0.21.8\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "pixman-1 >= 0.21.8") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then
+ pkg_cv_pixman_LIBS=`$PKG_CONFIG --libs "pixman-1 >= 0.21.8" 2>/dev/null`
+ test "x$?" != "x0" && pkg_failed=yes
+else
+ pkg_failed=yes
+fi
+ else
+ pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+ _pkg_short_errors_supported=yes
+else
+ _pkg_short_errors_supported=no
+fi
+ if test $_pkg_short_errors_supported = yes; then
+ pixman_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "pixman-1 >= 0.21.8" 2>&1`
+ else
+ pixman_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "pixman-1 >= 0.21.8" 2>&1`
+ fi
+ # Put the nasty error message in config.log where it belongs
+ echo "$pixman_PKG_ERRORS" >&5
+
+ as_fn_error $? "Package requirements (pixman-1 >= 0.21.8) were not met:
+
+$pixman_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+Alternatively, you may set the environment variables pixman_CFLAGS
+and pixman_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details." "$LINENO" 5
+elif test $pkg_failed = untried; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "The pkg-config script could not be found or is too old. Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+Alternatively, you may set the environment variables pixman_CFLAGS
+and pixman_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.
+See \`config.log' for more details" "$LINENO" 5; }
+else
+ pixman_CFLAGS=$pkg_cv_pixman_CFLAGS
+ pixman_LIBS=$pkg_cv_pixman_LIBS
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+fi
+
+fi
+
# Extract the first word of "wget", so it can be a program name with args.
set dummy wget; ac_word=$2
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
@@ -7935,7 +8261,7 @@ fi
-if test "x$enable_blktap1" = "xyes" || test "x$enable_blktap2" = "xyes"; then :
+if test "x$enable_blktap2" = "xyes"; then :
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for io_setup in -laio" >&5
$as_echo_n "checking for io_setup in -laio... " >&6; }
@@ -8451,6 +8777,56 @@ else
fi
+ac_fn_c_check_header_mongrel "$LINENO" "argp.h" "ac_cv_header_argp_h" "$ac_includes_default"
+if test "x$ac_cv_header_argp_h" = xyes; then :
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for argp_usage in -largp" >&5
+$as_echo_n "checking for argp_usage in -largp... " >&6; }
+if ${ac_cv_lib_argp_argp_usage+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-largp $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char argp_usage ();
+int
+main ()
+{
+return argp_usage ();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ ac_cv_lib_argp_argp_usage=yes
+else
+ ac_cv_lib_argp_argp_usage=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_argp_argp_usage" >&5
+$as_echo "$ac_cv_lib_argp_argp_usage" >&6; }
+if test "x$ac_cv_lib_argp_argp_usage" = xyes; then :
+ argp_ldflags="-largp"
+fi
+
+
+else
+ as_fn_error $? "Could not find argp" "$LINENO" 5
+fi
+
+
+
# FDT is needed only on ARM
case "$host_cpu" in
@@ -8502,6 +8878,83 @@ else
as_fn_error $? "Could not find libfdt" "$LINENO" 5
fi
+
+# Check for libfdt >= 1.4.0. If present, enable passthrough.
+# Note that libfdt doesn't provide versioning, so we need to rely on
+# a function present only in newer versions.
+# Use fdt_first_property_offset, which has been correctly exported since v1.4.0
+ac_fn_c_check_func "$LINENO" "fdt_first_property_offset" "ac_cv_func_fdt_first_property_offset"
+if test "x$ac_cv_func_fdt_first_property_offset" = xyes; then :
+ partial_dt="y"
+else
+ partial_dt="n"
+fi
+
+
+if test "x$partial_dt" = "xy" ; then :
+
+$as_echo "#define ENABLE_PARTIAL_DEVICE_TREE 1" >>confdefs.h
+
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Disabling support for partial device tree in libxl.
+ Please install libfdt library - version 1.4.0 or higher" >&5
+$as_echo "$as_me: WARNING: Disabling support for partial device tree in libxl.
+ Please install libfdt library - version 1.4.0 or higher" >&2;}
+fi
+
+# The functions fdt_{first,next}_subnode may not be available because:
+# * They were introduced in 2013 => not available on Wheezy
+# * The prototypes exist but the functions are not exposed. Don't ask why...
+for ac_func in fdt_first_subnode fdt_next_subnode
+do :
+ as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
+ cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+done
+
+ac_fn_c_check_decl "$LINENO" "fdt_first_subnode" "ac_cv_have_decl_fdt_first_subnode" "#include <libfdt.h>
+"
+if test "x$ac_cv_have_decl_fdt_first_subnode" = xyes; then :
+ ac_have_decl=1
+else
+ ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_FDT_FIRST_SUBNODE $ac_have_decl
+_ACEOF
+ac_fn_c_check_decl "$LINENO" "fdt_next_subnode" "ac_cv_have_decl_fdt_next_subnode" "#include <libfdt.h>
+"
+if test "x$ac_cv_have_decl_fdt_next_subnode" = xyes; then :
+ ac_have_decl=1
+else
+ ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_FDT_NEXT_SUBNODE $ac_have_decl
+_ACEOF
+
+
+# The helper fdt_property_u32 is only present in libfdt >= 1.4.0
+# It's an inline function, so only check if the declaration is present
+ac_fn_c_check_decl "$LINENO" "fdt_property_u32" "ac_cv_have_decl_fdt_property_u32" "#include <libfdt.h>
+"
+if test "x$ac_cv_have_decl_fdt_property_u32" = xyes; then :
+ ac_have_decl=1
+else
+ ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_FDT_PROPERTY_U32 $ac_have_decl
+_ACEOF
+
esac
# Checks for header files.
@@ -8865,14 +9318,28 @@ fi
else
+
+ if test "x$enable_systemd" = "xyes"; then :
+ as_fn_error $? "Unable to find systemd development library" "$LINENO" 5
+else
systemd=n
fi
+fi
+
else
systemd=n
fi
+
+if test "x$systemd" = "xy"; then :
+
+ ac_config_files="$ac_config_files hotplug/Linux/systemd/proc-xen.mount hotplug/Linux/systemd/var-lib-xenstored.mount hotplug/Linux/systemd/xen-init-dom0.service hotplug/Linux/systemd/xen-qemu-dom0-disk-backend.service hotplug/Linux/systemd/xen-watchdog.service hotplug/Linux/systemd/xenconsoled.service hotplug/Linux/systemd/xendomains.service hotplug/Linux/systemd/xenstored.service hotplug/Linux/systemd/xenstored.socket hotplug/Linux/systemd/xenstored_ro.socket"
+
+
+fi
+
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
# tests run on this system so they can be shared between configure
@@ -9379,7 +9846,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by Xen Hypervisor Tools $as_me 4.5, which was
+This file was extended by Xen Hypervisor Tools $as_me 4.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -9442,7 +9909,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-Xen Hypervisor Tools config.status 4.5
+Xen Hypervisor Tools config.status 4.6
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
@@ -9567,10 +10034,20 @@ do
case $ac_config_target in
"../config/Tools.mk") CONFIG_FILES="$CONFIG_FILES ../config/Tools.mk" ;;
"hotplug/FreeBSD/rc.d/xencommons") CONFIG_FILES="$CONFIG_FILES hotplug/FreeBSD/rc.d/xencommons" ;;
+ "hotplug/FreeBSD/rc.d/xendriverdomain") CONFIG_FILES="$CONFIG_FILES hotplug/FreeBSD/rc.d/xendriverdomain" ;;
"hotplug/Linux/init.d/sysconfig.xencommons") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/init.d/sysconfig.xencommons" ;;
"hotplug/Linux/init.d/xen-watchdog") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/init.d/xen-watchdog" ;;
"hotplug/Linux/init.d/xencommons") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/init.d/xencommons" ;;
"hotplug/Linux/init.d/xendomains") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/init.d/xendomains" ;;
+ "hotplug/Linux/init.d/xendriverdomain") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/init.d/xendriverdomain" ;;
+ "hotplug/Linux/vif-setup") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/vif-setup" ;;
+ "hotplug/Linux/xen-hotplug-common.sh") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/xen-hotplug-common.sh" ;;
+ "hotplug/Linux/xendomains") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/xendomains" ;;
+ "hotplug/NetBSD/rc.d/xencommons") CONFIG_FILES="$CONFIG_FILES hotplug/NetBSD/rc.d/xencommons" ;;
+ "hotplug/NetBSD/rc.d/xendriverdomain") CONFIG_FILES="$CONFIG_FILES hotplug/NetBSD/rc.d/xendriverdomain" ;;
+ "libxl/xenlight.pc.in") CONFIG_FILES="$CONFIG_FILES libxl/xenlight.pc.in" ;;
+ "libxl/xlutil.pc.in") CONFIG_FILES="$CONFIG_FILES libxl/xlutil.pc.in" ;;
+ "config.h") CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;;
"hotplug/Linux/systemd/proc-xen.mount") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/systemd/proc-xen.mount" ;;
"hotplug/Linux/systemd/var-lib-xenstored.mount") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/systemd/var-lib-xenstored.mount" ;;
"hotplug/Linux/systemd/xen-init-dom0.service") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/systemd/xen-init-dom0.service" ;;
@@ -9581,12 +10058,6 @@ do
"hotplug/Linux/systemd/xenstored.service") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/systemd/xenstored.service" ;;
"hotplug/Linux/systemd/xenstored.socket") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/systemd/xenstored.socket" ;;
"hotplug/Linux/systemd/xenstored_ro.socket") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/systemd/xenstored_ro.socket" ;;
- "hotplug/Linux/vif-setup") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/vif-setup" ;;
- "hotplug/Linux/xen-backend.rules") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/xen-backend.rules" ;;
- "hotplug/Linux/xen-hotplug-common.sh") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/xen-hotplug-common.sh" ;;
- "hotplug/Linux/xendomains") CONFIG_FILES="$CONFIG_FILES hotplug/Linux/xendomains" ;;
- "hotplug/NetBSD/rc.d/xencommons") CONFIG_FILES="$CONFIG_FILES hotplug/NetBSD/rc.d/xencommons" ;;
- "config.h") CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;;
*) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
esac
diff --git a/tools/configure.ac b/tools/configure.ac
index b7f1513..6c70040 100644
--- a/tools/configure.ac
+++ b/tools/configure.ac
@@ -8,25 +8,19 @@ AC_CONFIG_SRCDIR([libxl/libxl.c])
AC_CONFIG_FILES([
../config/Tools.mk
hotplug/FreeBSD/rc.d/xencommons
+hotplug/FreeBSD/rc.d/xendriverdomain
hotplug/Linux/init.d/sysconfig.xencommons
hotplug/Linux/init.d/xen-watchdog
hotplug/Linux/init.d/xencommons
hotplug/Linux/init.d/xendomains
-hotplug/Linux/systemd/proc-xen.mount
-hotplug/Linux/systemd/var-lib-xenstored.mount
-hotplug/Linux/systemd/xen-init-dom0.service
-hotplug/Linux/systemd/xen-qemu-dom0-disk-backend.service
-hotplug/Linux/systemd/xen-watchdog.service
-hotplug/Linux/systemd/xenconsoled.service
-hotplug/Linux/systemd/xendomains.service
-hotplug/Linux/systemd/xenstored.service
-hotplug/Linux/systemd/xenstored.socket
-hotplug/Linux/systemd/xenstored_ro.socket
+hotplug/Linux/init.d/xendriverdomain
hotplug/Linux/vif-setup
-hotplug/Linux/xen-backend.rules
hotplug/Linux/xen-hotplug-common.sh
hotplug/Linux/xendomains
hotplug/NetBSD/rc.d/xencommons
+hotplug/NetBSD/rc.d/xendriverdomain
+libxl/xenlight.pc.in
+libxl/xlutil.pc.in
])
AC_CONFIG_HEADERS([config.h])
AC_CONFIG_AUX_DIR([../])
@@ -66,6 +60,7 @@ m4_include([../m4/checkpolicy.m4])
m4_include([../m4/set_cflags_ldflags.m4])
m4_include([../m4/python_version.m4])
m4_include([../m4/python_devel.m4])
+m4_include([../m4/python_fortify_noopt.m4])
m4_include([../m4/ocaml.m4])
m4_include([../m4/uuid.m4])
m4_include([../m4/pkg.m4])
@@ -88,8 +83,6 @@ AX_ARG_DEFAULT_ENABLE([ocamltools], [Disable Ocaml tools])
AX_ARG_DEFAULT_ENABLE([xsmpolicy], [Disable XSM policy compilation])
AX_ARG_DEFAULT_DISABLE([ovmf], [Enable OVMF])
AX_ARG_DEFAULT_ENABLE([seabios], [Disable SeaBIOS])
-AX_ARG_DEFAULT_ENABLE([debug], [Disable debug build of tools])
-AX_ARG_DEFAULT_DISABLE([blktap1], [Enable blktap1 tools])
AC_ARG_WITH([linux-backend-modules],
AS_HELP_STRING([--with-linux-backend-modules="mod1 mod2"],
@@ -113,7 +106,6 @@ usbbk
pciback
xen-acpi-processor
blktap2
-blktap
"
;;
*)
@@ -183,9 +175,14 @@ AC_ARG_WITH([system-qemu],
[Use system supplied qemu PATH or qemu (taken from $PATH) as qemu-xen
device model instead of building and installing our own version]),[
case $withval in
- yes) qemu_xen=n ; qemu_xen_path=qemu ;;
- no) qemu_xen=y ; qemu_xen_path= ;;
- *) qemu_xen=n ; qemu_xen_path=$withval ;;
+ yes)
+ qemu_xen=n ; qemu_xen_path="qemu-system-i386"
+ qemu_xen_systemd="/usr/bin/env $qemu_xen_path" ;;
+ no)
+ qemu_xen=y ;;
+ *)
+ qemu_xen=n ; qemu_xen_path="$withval" ;
+ qemu_xen_systemd="$qemu_xen_path" ;;
esac
],[
case "$host_cpu" in
@@ -196,10 +193,14 @@ AC_ARG_WITH([system-qemu],
*) qemu_xen=n;;
esac
])
-AS_IF([test "x$qemu_xen" = "xn"], [
- AC_DEFINE_UNQUOTED([QEMU_XEN_PATH], ["$qemu_xen_path"], [Qemu Xen path])
+AS_IF([test "x$qemu_xen" = "xy"], [
+ qemu_xen_path="$LIBEXEC_BIN/qemu-system-i386"
+ qemu_xen_systemd="$qemu_xen_path"
])
+AC_DEFINE_UNQUOTED([QEMU_XEN_PATH], ["$qemu_xen_path"], [Qemu Xen path])
AC_SUBST(qemu_xen)
+AC_SUBST(qemu_xen_path)
+AC_SUBST(qemu_xen_systemd)
AC_ARG_WITH([system-seabios],
AS_HELP_STRING([--with-system-seabios@<:@=PATH@:>@],
@@ -256,6 +257,7 @@ AC_ARG_VAR([AS86], [Path to as86 tool])
AC_ARG_VAR([LD86], [Path to ld86 tool])
AC_ARG_VAR([BCC], [Path to bcc tool])
AC_ARG_VAR([IASL], [Path to iasl tool])
+AC_ARG_VAR([AWK], [Path to awk tool])
# Checks for programs.
AC_PROG_CC
@@ -264,6 +266,7 @@ AC_PROG_INSTALL
AC_PATH_PROG([BISON], [bison])
AC_PATH_PROG([FLEX], [flex])
AX_PATH_PROG_OR_FAIL([PERL], [perl])
+AX_PATH_PROG_OR_FAIL([AWK], [awk])
AC_PROG_OCAML
AC_PROG_FINDLIB
@@ -305,6 +308,7 @@ AX_CHECK_PYTHON_VERSION([2], [3])
AS_IF([test "$cross_compiling" != yes], [
AX_CHECK_PYTHON_DEVEL()
+ AX_CHECK_PYTHON_FORTIFY_NOOPT()
])
if ! $rump; then
@@ -324,7 +328,16 @@ i[[3456]]86|x86_64)
esac
AX_CHECK_UUID
AX_CHECK_CURSES
+AS_IF([test "$ncurses" = "y"], [
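+dnl Some distributions split the low-level terminfo routines out of
+dnl libncurses into a separate libtinfo, so probe for it explicitly.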
+AC_CHECK_LIB([tinfo], [define_key], [TINFO_LIBS=-ltinfo])
+])
+AC_SUBST(TINFO_LIBS)
+
+dnl The following are only required when upstream QEMU is built
+AS_IF([test "x$qemu_xen" = "xy"], [
PKG_CHECK_MODULES(glib, [glib-2.0 >= 2.12])
+PKG_CHECK_MODULES(pixman, [pixman-1 >= 0.21.8])
+])
AX_CHECK_FETCHER
# Checks for libraries.
@@ -338,7 +351,7 @@ AC_CHECK_HEADER([lzo/lzo1x.h], [
AC_CHECK_LIB([lzo2], [lzo1x_decompress], [zlib="$zlib -DHAVE_LZO1X -llzo2"])
])
AC_SUBST(zlib)
-AS_IF([test "x$enable_blktap1" = "xyes" || test "x$enable_blktap2" = "xyes"], [
+AS_IF([test "x$enable_blktap2" = "xyes"], [
AC_CHECK_LIB([aio], [io_setup], [], [AC_MSG_ERROR([Could not find libaio])])
])
AC_SUBST(system_aio)
@@ -353,11 +366,37 @@ AC_CHECK_LIB([yajl], [yajl_alloc], [],
AC_CHECK_LIB([z], [deflateCopy], [], [AC_MSG_ERROR([Could not find zlib])])
AC_CHECK_LIB([iconv], [libiconv_open], [libiconv="y"], [libiconv="n"])
AC_SUBST(libiconv)
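+dnl argp is built into glibc; other C libraries ship it as a separate
+dnl libargp, in which case -largp is needed at link time.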
+AC_CHECK_HEADER([argp.h], [
+AC_CHECK_LIB([argp], [argp_usage], [argp_ldflags="-largp"])
+], [AC_MSG_ERROR([Could not find argp])])
+AC_SUBST(argp_ldflags)
# FDT is needed only on ARM
case "$host_cpu" in
arm*|aarch64)
AC_CHECK_LIB([fdt], [fdt_create], [], [AC_MSG_ERROR([Could not find libfdt])])
+
+# Check for libfdt >= 1.4.0. If present, enable passthrough.
+# Note that libfdt doesn't provide versioning, so we need to rely on
+# a function present only in newer versions.
+# Use fdt_first_property_offset, which has been correctly exported since v1.4.0
+AC_CHECK_FUNC(fdt_first_property_offset, [partial_dt="y"], [partial_dt="n"])
+
+AS_IF([test "x$partial_dt" = "xy" ],
+ [AC_DEFINE([ENABLE_PARTIAL_DEVICE_TREE], [1],
+  [Enabling support for partial device tree in libxl])],
+ [AC_MSG_WARN([Disabling support for partial device tree in libxl.
+ Please install libfdt library - version 1.4.0 or higher])])
+
+# The functions fdt_{first,next}_subnode may not be available because:
+# * They were introduced in 2013 => not available on Wheezy
+# * The prototypes exist but the functions are not exposed. Don't ask why...
+AC_CHECK_FUNCS([fdt_first_subnode fdt_next_subnode])
+AC_CHECK_DECLS([fdt_first_subnode, fdt_next_subnode],,,[#include <libfdt.h>])
+
+# The helper fdt_property_u32 is only present in libfdt >= 1.4.0
+# It's an inline function, so only check if the declaration is present
+AC_CHECK_DECLS([fdt_property_u32],,,[#include <libfdt.h>])
esac
# Checks for header files.
@@ -382,5 +421,21 @@ AC_SUBST(LIBNL3_CFLAGS)
fi # ! $rump
AX_AVAILABLE_SYSTEMD()
+
+AS_IF([test "x$systemd" = "xy"], [
+ AC_CONFIG_FILES([
+ hotplug/Linux/systemd/proc-xen.mount
+ hotplug/Linux/systemd/var-lib-xenstored.mount
+ hotplug/Linux/systemd/xen-init-dom0.service
+ hotplug/Linux/systemd/xen-qemu-dom0-disk-backend.service
+ hotplug/Linux/systemd/xen-watchdog.service
+ hotplug/Linux/systemd/xenconsoled.service
+ hotplug/Linux/systemd/xendomains.service
+ hotplug/Linux/systemd/xenstored.service
+ hotplug/Linux/systemd/xenstored.socket
+ hotplug/Linux/systemd/xenstored_ro.socket
+ ])
+])
+
AC_OUTPUT()
diff --git a/tools/console/Makefile b/tools/console/Makefile
index 6e55618..77e8f29 100644
--- a/tools/console/Makefile
+++ b/tools/console/Makefile
@@ -21,17 +21,24 @@ all: $(BIN)
clean:
$(RM) *.a *.so *.o *.rpm $(BIN) $(DEPS)
$(RM) client/*.o daemon/*.o
+ $(RM) client/_paths.h
+
+.PHONY: distclean
+distclean: clean
xenconsoled: $(patsubst %.c,%.o,$(wildcard daemon/*.c))
$(CC) $(LDFLAGS) $^ -o $@ $(LDLIBS) $(LDLIBS_xenconsoled) $(APPEND_LDFLAGS)
-xenconsole: $(patsubst %.c,%.o,$(wildcard client/*.c))
+xenconsole: client/_paths.h $(patsubst %.c,%.o,$(wildcard client/*.c))
$(CC) $(LDFLAGS) $^ -o $@ $(LDLIBS) $(LDLIBS_xenconsole) $(APPEND_LDFLAGS)
+genpath-target = $(call buildmakevars2header,client/_paths.h)
+$(eval $(genpath-target))
+
.PHONY: install
install: $(BIN)
- $(INSTALL_DIR) $(DESTDIR)/$(SBINDIR)
- $(INSTALL_PROG) xenconsoled $(DESTDIR)/$(SBINDIR)
+ $(INSTALL_DIR) $(DESTDIR)/$(sbindir)
+ $(INSTALL_PROG) xenconsoled $(DESTDIR)/$(sbindir)
$(INSTALL_DIR) $(DESTDIR)$(LIBEXEC_BIN)
$(INSTALL_PROG) xenconsole $(DESTDIR)$(LIBEXEC_BIN)
diff --git a/tools/console/client/main.c b/tools/console/client/main.c
index f4c783b..f130a60 100644
--- a/tools/console/client/main.c
+++ b/tools/console/client/main.c
@@ -14,11 +14,12 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
\*/
+#include <sys/file.h>
#include <sys/types.h>
+#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>
@@ -40,10 +41,13 @@
#include <xenstore.h>
#include "xenctrl.h"
+#include "_paths.h"
#define ESCAPE_CHARACTER 0x1d
static volatile sig_atomic_t received_signal = 0;
+static char lockfile[sizeof (XEN_LOCK_DIR "/xenconsole.") + 8] = { 0 };
+static int lockfd = -1;
static void sighandler(int signum)
{
@@ -168,16 +172,19 @@ static void restore_term(int fd, struct termios *old)
tcsetattr(fd, TCSANOW, old);
}
-static int console_loop(int fd, struct xs_handle *xs, char *pty_path)
+static int console_loop(int fd, struct xs_handle *xs, char *pty_path,
+ bool interactive)
{
- int ret, xs_fd = xs_fileno(xs), max_fd;
+ int ret, xs_fd = xs_fileno(xs), max_fd = -1;
do {
fd_set fds;
FD_ZERO(&fds);
- FD_SET(STDIN_FILENO, &fds);
- max_fd = STDIN_FILENO;
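+		/* Only watch stdin when it is a terminal; in
+		 * non-interactive mode the console output is still
+		 * relayed but nothing is read from stdin. */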
+ if (interactive) {
+ FD_SET(STDIN_FILENO, &fds);
+ max_fd = STDIN_FILENO;
+ }
FD_SET(xs_fd, &fds);
if (xs_fd > max_fd) max_fd = xs_fd;
if (fd != -1) FD_SET(fd, &fds);
@@ -264,6 +271,53 @@ static void restore_term_stdin(void)
restore_term(STDIN_FILENO, &stdin_old_attr);
}
+/* The following locking strategy is based on that from
+ * libxl__domain_userdata_lock(), with the difference that we want to fail if we
+ * cannot acquire the lock rather than wait indefinitely.
+ */
+static void console_lock(int domid)
+{
+ struct stat stab, fstab;
+ int fd;
+
+ snprintf(lockfile, sizeof lockfile, "%s%d", XEN_LOCK_DIR "/xenconsole.", domid);
+
+ while (true) {
+ fd = open(lockfile, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+ if (fd < 0)
+ err(errno, "Could not open %s", lockfile);
+
+ while (flock(fd, LOCK_EX | LOCK_NB)) {
+ if (errno == EINTR)
+ continue;
+ else
+ err(errno, "Could not lock %s", lockfile);
+ }
+ if (fstat(fd, &fstab))
+ err(errno, "Could not fstat %s", lockfile);
+ if (stat(lockfile, &stab)) {
+ if (errno != ENOENT)
+ err(errno, "Could not stat %s", lockfile);
+ } else {
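+			/* The lock is only valid if the file we locked is
+			 * still the one at lockfile's path (same device and
+			 * inode); otherwise another holder unlinked it after
+			 * we opened it, so retry with a fresh open. */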
+ if (stab.st_dev == fstab.st_dev && stab.st_ino == fstab.st_ino)
+ break;
+ }
+
+ close(fd);
+ }
+
+ lockfd = fd;
+ return;
+}
+
+static void console_unlock(void)
+{
+ if (lockfile[0] && lockfd != -1) {
+ unlink(lockfile);
+ close(lockfd);
+ }
+}
+
int main(int argc, char **argv)
{
struct termios attr;
@@ -284,6 +338,10 @@ int main(int argc, char **argv)
struct xs_handle *xs;
char *end;
console_type type = CONSOLE_INVAL;
+ bool interactive = 0;
+
+ if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
+ interactive = 1;
while((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
switch(ch) {
@@ -375,6 +433,9 @@ int main(int argc, char **argv)
exit(EINVAL);
}
+ console_lock(domid);
+ atexit(console_unlock);
+
/* Set a watch on this domain's console pty */
if (!xs_watch(xs, path, ""))
err(errno, "Can't set watch for console pty");
@@ -390,9 +451,11 @@ int main(int argc, char **argv)
}
init_term(spty, &attr);
- init_term(STDIN_FILENO, &stdin_old_attr);
- atexit(restore_term_stdin); /* if this fails, oh dear */
- console_loop(spty, xs, path);
+ if (interactive) {
+ init_term(STDIN_FILENO, &stdin_old_attr);
+ atexit(restore_term_stdin); /* if this fails, oh dear */
+ }
+ console_loop(spty, xs, path, interactive);
free(path);
free(dom_path);
diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
index ac08b5b..cafc7b7 100644
--- a/tools/console/daemon/io.c
+++ b/tools/console/daemon/io.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#define _GNU_SOURCE
@@ -523,7 +522,7 @@ static void domain_unmap_interface(struct domain *dom)
if (xcg_handle && dom->ring_ref == -1)
xc_gnttab_munmap(xcg_handle, dom->interface, 1);
else
- munmap(dom->interface, getpagesize());
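+		/* The console ring is one Xen page (XC_PAGE_SIZE), which
+		 * need not equal the host page size. */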
+ munmap(dom->interface, XC_PAGE_SIZE);
dom->interface = NULL;
dom->ring_ref = -1;
}
@@ -562,7 +561,7 @@ static int domain_create_ring(struct domain *dom)
if (!dom->interface) {
/* Fall back to xc_map_foreign_range */
dom->interface = xc_map_foreign_range(
- xc, dom->domid, getpagesize(),
+ xc, dom->domid, XC_PAGE_SIZE,
PROT_READ|PROT_WRITE,
(unsigned long)ring_ref);
if (dom->interface == NULL) {
diff --git a/tools/console/daemon/io.h b/tools/console/daemon/io.h
index f658bfc..d016add 100644
--- a/tools/console/daemon/io.h
+++ b/tools/console/daemon/io.h
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
\*/
#ifndef CONSOLED_IO_H
diff --git a/tools/console/daemon/main.c b/tools/console/daemon/main.c
index 92d2fc4..23860d3 100644
--- a/tools/console/daemon/main.c
+++ b/tools/console/daemon/main.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
\*/
#include <getopt.h>
@@ -26,6 +25,7 @@
#include <string.h>
#include <signal.h>
#include <sys/types.h>
+#include <sys/resource.h>
#include "xenctrl.h"
@@ -55,6 +55,41 @@ static void version(char *name)
printf("Xen Console Daemon 3.0\n");
}
+static void increase_fd_limit(void)
+{
+ /*
+ * We require many file descriptors:
+ * - per domain: pty master, pty slave, logfile and evtchn
+ * - misc extra: hypervisor log, privcmd, gntdev, std...
+ *
+ * Allow a generous 1000 for misc, and calculate the maximum possible
+ * number of fds which could be used.
+ */
+ unsigned min_fds = (DOMID_FIRST_RESERVED * 4) + 1000;
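+	/* DOMID_FIRST_RESERVED is 0x7ff0, so this asks for roughly
+	 * 132000 descriptors. */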
+ struct rlimit lim, new = { min_fds, min_fds };
+
+ if (getrlimit(RLIMIT_NOFILE, &lim) < 0) {
+ fprintf(stderr, "Failed to obtain fd limit: %s\n",
+ strerror(errno));
+ exit(1);
+ }
+
+ /* Do we already have sufficient? Great! */
+ if (lim.rlim_cur >= min_fds)
+ return;
+
+ /* Try to increase our limit. */
+ if (setrlimit(RLIMIT_NOFILE, &new) < 0)
+ syslog(LOG_WARNING,
+ "Unable to increase fd limit from {%llu, %llu} to "
+ "{%llu, %llu}: (%s) - May run out with lots of domains",
+ (unsigned long long)lim.rlim_cur,
+ (unsigned long long)lim.rlim_max,
+ (unsigned long long)new.rlim_cur,
+ (unsigned long long)new.rlim_max,
+ strerror(errno));
+}
+
int main(int argc, char **argv)
{
const char *sopts = "hVvit:o:";
@@ -154,6 +189,8 @@ int main(int argc, char **argv)
openlog("xenconsoled", syslog_option, LOG_DAEMON);
setlogmask(syslog_mask);
+ increase_fd_limit();
+
if (!is_interactive) {
daemonize(pidfile ? pidfile : "/var/run/xenconsoled.pid");
}
diff --git a/tools/console/daemon/utils.c b/tools/console/daemon/utils.c
index 71dd185..dbb3b12 100644
--- a/tools/console/daemon/utils.c
+++ b/tools/console/daemon/utils.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
\*/
#include <sys/types.h>
diff --git a/tools/console/daemon/utils.h b/tools/console/daemon/utils.h
index 8725dcd..1295822 100644
--- a/tools/console/daemon/utils.h
+++ b/tools/console/daemon/utils.h
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
\*/
#ifndef CONSOLED_UTILS_H
diff --git a/tools/console/testsuite/Makefile b/tools/console/testsuite/Makefile
index d9e4380..85501fd 100644
--- a/tools/console/testsuite/Makefile
+++ b/tools/console/testsuite/Makefile
@@ -11,4 +11,7 @@ console-domU: console-domU.o
procpipe: procpipe.o
.PHONY: clean
-clean:; $(RM) *.o console-domU console-dom0 procpipe
+clean:
+	$(RM) *.o console-domU console-dom0 procpipe
+
+.PHONY: distclean
+distclean: clean
diff --git a/tools/debugger/gdbsx/Makefile b/tools/debugger/gdbsx/Makefile
index 7938dd8..4ed6d76 100644
--- a/tools/debugger/gdbsx/Makefile
+++ b/tools/debugger/gdbsx/Makefile
@@ -12,10 +12,14 @@ clean:
rm -f xg_all.a gx_all.a gdbsx
set -e; for d in xg gx; do $(MAKE) -C $$d clean; done
+.PHONY: distclean
+distclean: clean
+	set -e; for d in xg gx; do $(MAKE) -C $$d distclean; done
+
.PHONY: install
install: all
- [ -d $(DESTDIR)$(SBINDIR) ] || $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
- $(INSTALL_PROG) gdbsx $(DESTDIR)$(SBINDIR)/gdbsx
+ [ -d $(DESTDIR)$(sbindir) ] || $(INSTALL_DIR) $(DESTDIR)$(sbindir)
+ $(INSTALL_PROG) gdbsx $(DESTDIR)$(sbindir)/gdbsx
gdbsx: gx/gx_all.a xg/xg_all.a
$(CC) -o $@ $^
diff --git a/tools/debugger/gdbsx/gx/Makefile b/tools/debugger/gdbsx/gx/Makefile
index 9a0ff07..3b8467f 100644
--- a/tools/debugger/gdbsx/gx/Makefile
+++ b/tools/debugger/gdbsx/gx/Makefile
@@ -11,6 +11,8 @@ all: gx_all.a
clean:
rm -rf gx_all.a *.o .*.d
+.PHONY: distclean
+distclean: clean
#%.o: %.c $(GX_HDRS) Makefile
# $(CC) -c $(CFLAGS) -o $@ $<
diff --git a/tools/debugger/gdbsx/gx/gx.h b/tools/debugger/gdbsx/gx/gx.h
index 47594c3..af39575 100644
--- a/tools/debugger/gdbsx/gx/gx.h
+++ b/tools/debugger/gdbsx/gx/gx.h
@@ -11,9 +11,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
typedef uint16_t domid_t;
diff --git a/tools/debugger/gdbsx/gx/gx_comm.c b/tools/debugger/gdbsx/gx/gx_comm.c
index 7680dbd..5a0c61d 100644
--- a/tools/debugger/gdbsx/gx/gx_comm.c
+++ b/tools/debugger/gdbsx/gx/gx_comm.c
@@ -15,9 +15,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor,
- Boston, MA 02110-1301, USA. */
+ along with this program; If not, see <http://www.gnu.org/licenses/>. */
/*
* Copyright (C) 2009, Mukesh Rathor, Oracle Corp. All rights reserved.
*
@@ -31,9 +29,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/* This module handles communication with remote gdb. courtesy
diff --git a/tools/debugger/gdbsx/gx/gx_local.c b/tools/debugger/gdbsx/gx/gx_local.c
index c8f0e72..1bec03d 100644
--- a/tools/debugger/gdbsx/gx/gx_local.c
+++ b/tools/debugger/gdbsx/gx/gx_local.c
@@ -11,9 +11,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
diff --git a/tools/debugger/gdbsx/gx/gx_main.c b/tools/debugger/gdbsx/gx/gx_main.c
index e3feee1..a908c45 100644
--- a/tools/debugger/gdbsx/gx/gx_main.c
+++ b/tools/debugger/gdbsx/gx/gx_main.c
@@ -11,9 +11,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/* This module is the main module for gdbsx implementation. gdbsx is a remote
diff --git a/tools/debugger/gdbsx/gx/gx_utils.c b/tools/debugger/gdbsx/gx/gx_utils.c
index e87ffcb..f3c0039 100644
--- a/tools/debugger/gdbsx/gx/gx_utils.c
+++ b/tools/debugger/gdbsx/gx/gx_utils.c
@@ -11,9 +11,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
diff --git a/tools/debugger/gdbsx/gx/xg_dummy.c b/tools/debugger/gdbsx/gx/xg_dummy.c
index b82899f..e995fad 100644
--- a/tools/debugger/gdbsx/gx/xg_dummy.c
+++ b/tools/debugger/gdbsx/gx/xg_dummy.c
@@ -11,9 +11,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
#include <stddef.h>
diff --git a/tools/debugger/gdbsx/xg/Makefile b/tools/debugger/gdbsx/xg/Makefile
index 7638633..78b4021 100644
--- a/tools/debugger/gdbsx/xg/Makefile
+++ b/tools/debugger/gdbsx/xg/Makefile
@@ -33,3 +33,5 @@ xen-headers:
clean:
rm -rf xen xg_all.a $(XG_OBJS) .*.d
+.PHONY: distclean
+distclean: clean
diff --git a/tools/debugger/gdbsx/xg/xg_main.c b/tools/debugger/gdbsx/xg/xg_main.c
index c95e4ed..8c8a402 100644
--- a/tools/debugger/gdbsx/xg/xg_main.c
+++ b/tools/debugger/gdbsx/xg/xg_main.c
@@ -11,9 +11,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/* This is the main module to interface with xen. This module exports APIs that
diff --git a/tools/debugger/gdbsx/xg/xg_public.h b/tools/debugger/gdbsx/xg/xg_public.h
index 6236d08..3f905a2 100644
--- a/tools/debugger/gdbsx/xg/xg_public.h
+++ b/tools/debugger/gdbsx/xg/xg_public.h
@@ -11,9 +11,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#define XGERR(...) \
diff --git a/tools/debugger/kdd/Makefile b/tools/debugger/kdd/Makefile
index 34b781f..a79d7cf 100644
--- a/tools/debugger/kdd/Makefile
+++ b/tools/debugger/kdd/Makefile
@@ -16,7 +16,10 @@ kdd: $(OBJS)
clean:
rm -f $(OBJS) $(DEPS) kdd
+.PHONY: distclean
+distclean: clean
+
.PHONY: install
install: all
- [ -d $(DESTDIR)$(SBINDIR) ] || $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
- $(INSTALL_PROG) kdd $(DESTDIR)$(SBINDIR)/kdd
+ [ -d $(DESTDIR)$(sbindir) ] || $(INSTALL_DIR) $(DESTDIR)$(sbindir)
+ $(INSTALL_PROG) kdd $(DESTDIR)$(sbindir)/kdd
diff --git a/tools/examples/Makefile b/tools/examples/Makefile
index 473580e..87dd760 100644
--- a/tools/examples/Makefile
+++ b/tools/examples/Makefile
@@ -43,3 +43,6 @@ install-configs: $(XEN_CONFIGS)
.PHONY: clean
clean:
+
+.PHONY: distclean
+distclean: clean
diff --git a/tools/examples/README b/tools/examples/README
index 115ca02..13380a4 100644
--- a/tools/examples/README
+++ b/tools/examples/README
@@ -24,7 +24,6 @@ vif-nat - xen virtual network start/stop script in NAT mode
vif-route - xen virtual network start/stop script in routed mode
xen-backend.agent - calls block, vif-* scripts to add, remove, hotplug
devices
-xen-backend.rules - hotplug script rules
xen-hotplug-common.sh - sourced by vif-common.sh
xen-network-common.sh - sourced by vif-common.sh
xen-script-common.sh - sourced by xen-hotplug-common.sh
diff --git a/tools/examples/cpupool b/tools/examples/cpupool
index 01e62c8..35e229e 100644
--- a/tools/examples/cpupool
+++ b/tools/examples/cpupool
@@ -9,7 +9,7 @@
# the name of the new cpupool
name = "Example-Cpupool"
-# the scheduler to use: valid are e.g. credit, sedf, credit2
+# the scheduler to use: valid values are e.g. credit, credit2 and rtds
sched = "credit"
# list of cpus to use
diff --git a/tools/firmware/Makefile b/tools/firmware/Makefile
index ca5df42..6cc86ce 100644
--- a/tools/firmware/Makefile
+++ b/tools/firmware/Makefile
@@ -55,7 +55,7 @@ distclean: subdirs-distclean
subdir-distclean-etherboot: .phony
$(MAKE) -C etherboot distclean
-subdir-distclean-ovmf: .phony
+subdir-distclean-ovmf-dir: .phony
rm -rf ovmf-dir ovmf-dir-remote
subdir-distclean-seabios-dir: .phony
@@ -70,7 +70,7 @@ ovmf-dir-force-update: ovmf-dir
$(GIT) reset --hard $(OVMF_UPSTREAM_REVISION); \
fi
-subdir-clean-ovmf:
+subdir-clean-ovmf-dir:
set -e; if test -d ovmf-dir/.; then \
$(MAKE) -C ovmf-dir clean; \
fi
diff --git a/tools/firmware/etherboot/patches/build-compare.patch b/tools/firmware/etherboot/patches/build-compare.patch
new file mode 100644
index 0000000..d41f68b
--- /dev/null
+++ b/tools/firmware/etherboot/patches/build-compare.patch
@@ -0,0 +1,19 @@
+The order in which $(wildcard *) returns files is not deterministic
+across filesystems. Sort the input files to reduce build-compare noise.
+---
+ ipxe/src/Makefile.housekeeping | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+Index: ipxe/src/Makefile.housekeeping
+===================================================================
+--- ipxe/src/Makefile.housekeeping
++++ ipxe/src/Makefile.housekeeping
+@@ -773,7 +773,7 @@ BLIB = $(BIN)/blib.a
+ $(BLIB) : $(BLIB_OBJS) $(BLIB_LIST) $(MAKEDEPS)
+ $(Q)$(RM) $(BLIB)
+ $(QM)$(ECHO) " [AR] $@"
+- $(Q)$(AR) r $@ $(BLIB_OBJS)
++ $(Q)$(AR) r $@ $(sort $(BLIB_OBJS))
+ $(Q)$(RANLIB) $@
+ blib : $(BLIB)
+
diff --git a/tools/firmware/etherboot/patches/build_fix_4.patch b/tools/firmware/etherboot/patches/build_fix_4.patch
new file mode 100644
index 0000000..9271c8c
--- /dev/null
+++ b/tools/firmware/etherboot/patches/build_fix_4.patch
@@ -0,0 +1,225 @@
+From 1b56452121672e6408c38ac8926bdd6998a39004 Mon Sep 17 00:00:00 2001
+From: Christian Hesse <mail at eworm.de>
+Date: Thu, 23 Apr 2015 13:33:26 +0200
+Subject: [PATCH] [ath9k] Remove confusing logic inversion in an ANI variable
+
+This changed in Linux kernel the same way in commit 7067e701
+("ath9k_hw: remove confusing logic inversion in an ANI variable") by
+Felix Fietkau.
+
+Additionally this fixes "error: logical not is only applied to the
+left hand side of comparison" with GCC 5.1.0.
+
+Signed-off-by: Christian Hesse <mail at eworm.de>
+Signed-off-by: Michael Brown <mcb30 at ipxe.org>
+---
+ src/drivers/net/ath/ath9k/ani.h | 2 +-
+ src/drivers/net/ath/ath9k/ath9k_ani.c | 16 ++++++++--------
+ src/drivers/net/ath/ath9k/ath9k_ar5008_phy.c | 18 +++++++++---------
+ src/drivers/net/ath/ath9k/ath9k_ar9003_phy.c | 12 ++++++------
+ 4 files changed, 24 insertions(+), 24 deletions(-)
+
+diff --git a/src/drivers/net/ath/ath9k/ani.h b/src/drivers/net/ath/ath9k/ani.h
+index dbd4d4d..ba87ba0 100644
+--- a/src/drivers/net/ath/ath9k/ani.h
++++ b/src/drivers/net/ath/ath9k/ani.h
+@@ -125,7 +125,7 @@ struct ar5416AniState {
+ u8 mrcCCKOff;
+ u8 spurImmunityLevel;
+ u8 firstepLevel;
+- u8 ofdmWeakSigDetectOff;
++ u8 ofdmWeakSigDetect;
+ u8 cckWeakSigThreshold;
+ u32 listenTime;
+ int32_t rssiThrLow;
+diff --git a/src/drivers/net/ath/ath9k/ath9k_ani.c b/src/drivers/net/ath/ath9k/ath9k_ani.c
+index ff7df49..76ca79c 100644
+--- a/src/drivers/net/ath/ath9k/ath9k_ani.c
++++ b/src/drivers/net/ath/ath9k/ath9k_ani.c
+@@ -177,7 +177,7 @@ static void ath9k_hw_ani_ofdm_err_trigger_old(struct ath_hw *ah)
+
+ rssi = BEACON_RSSI(ah);
+ if (rssi > aniState->rssiThrHigh) {
+- if (!aniState->ofdmWeakSigDetectOff) {
++ if (aniState->ofdmWeakSigDetect) {
+ if (ath9k_hw_ani_control(ah,
+ ATH9K_ANI_OFDM_WEAK_SIGNAL_DETECTION,
+ 0)) {
+@@ -192,7 +192,7 @@ static void ath9k_hw_ani_ofdm_err_trigger_old(struct ath_hw *ah)
+ return;
+ }
+ } else if (rssi > aniState->rssiThrLow) {
+- if (aniState->ofdmWeakSigDetectOff)
++ if (!aniState->ofdmWeakSigDetect)
+ ath9k_hw_ani_control(ah,
+ ATH9K_ANI_OFDM_WEAK_SIGNAL_DETECTION,
+ 1);
+@@ -202,7 +202,7 @@ static void ath9k_hw_ani_ofdm_err_trigger_old(struct ath_hw *ah)
+ return;
+ } else {
+ if ((ah->dev->channels + ah->dev->channel)->band == NET80211_BAND_2GHZ) {
+- if (!aniState->ofdmWeakSigDetectOff)
++ if (aniState->ofdmWeakSigDetect)
+ ath9k_hw_ani_control(ah,
+ ATH9K_ANI_OFDM_WEAK_SIGNAL_DETECTION,
+ 0);
+@@ -360,7 +360,7 @@ static void ath9k_hw_ani_lower_immunity_old(struct ath_hw *ah)
+ if (rssi > aniState->rssiThrHigh) {
+ /* XXX: Handle me */
+ } else if (rssi > aniState->rssiThrLow) {
+- if (aniState->ofdmWeakSigDetectOff) {
++ if (!aniState->ofdmWeakSigDetect) {
+ if (ath9k_hw_ani_control(ah,
+ ATH9K_ANI_OFDM_WEAK_SIGNAL_DETECTION,
+ 1) == 1)
+@@ -436,9 +436,9 @@ static void ath9k_ani_reset_old(struct ath_hw *ah)
+ if (aniState->spurImmunityLevel != 0)
+ ath9k_hw_ani_control(ah, ATH9K_ANI_SPUR_IMMUNITY_LEVEL,
+ aniState->spurImmunityLevel);
+- if (aniState->ofdmWeakSigDetectOff)
++ if (!aniState->ofdmWeakSigDetect)
+ ath9k_hw_ani_control(ah, ATH9K_ANI_OFDM_WEAK_SIGNAL_DETECTION,
+- !aniState->ofdmWeakSigDetectOff);
++ aniState->ofdmWeakSigDetect);
+ if (aniState->cckWeakSigThreshold)
+ ath9k_hw_ani_control(ah, ATH9K_ANI_CCK_WEAK_SIGNAL_THR,
+ aniState->cckWeakSigThreshold);
+@@ -709,8 +709,8 @@ void ath9k_hw_ani_init(struct ath_hw *ah)
+
+ ani->rssiThrHigh = ATH9K_ANI_RSSI_THR_HIGH;
+ ani->rssiThrLow = ATH9K_ANI_RSSI_THR_LOW;
+- ani->ofdmWeakSigDetectOff =
+- !ATH9K_ANI_USE_OFDM_WEAK_SIG;
++ ani->ofdmWeakSigDetect =
++ ATH9K_ANI_USE_OFDM_WEAK_SIG;
+ ani->cckNoiseImmunityLevel = ATH9K_ANI_CCK_DEF_LEVEL;
+ }
+
+diff --git a/src/drivers/net/ath/ath9k/ath9k_ar5008_phy.c b/src/drivers/net/ath/ath9k/ath9k_ar5008_phy.c
+index 60e87e9..2b6c133 100644
+--- a/src/drivers/net/ath/ath9k/ath9k_ar5008_phy.c
++++ b/src/drivers/net/ath/ath9k/ath9k_ar5008_phy.c
+@@ -1141,12 +1141,12 @@ static int ar5008_hw_ani_control_old(struct ath_hw *ah,
+ REG_CLR_BIT(ah, AR_PHY_SFCORR_LOW,
+ AR_PHY_SFCORR_LOW_USE_SELF_CORR_LOW);
+
+- if (!on != aniState->ofdmWeakSigDetectOff) {
++ if (on != aniState->ofdmWeakSigDetect) {
+ if (on)
+ ah->stats.ast_ani_ofdmon++;
+ else
+ ah->stats.ast_ani_ofdmoff++;
+- aniState->ofdmWeakSigDetectOff = !on;
++ aniState->ofdmWeakSigDetect = on;
+ }
+ break;
+ }
+@@ -1215,10 +1215,10 @@ static int ar5008_hw_ani_control_old(struct ath_hw *ah,
+
+ DBG2("ath9k: ANI parameters:\n");
+ DBG2(
+- "noiseImmunityLevel=%d, spurImmunityLevel=%d, ofdmWeakSigDetectOff=%d\n",
++ "noiseImmunityLevel=%d, spurImmunityLevel=%d, ofdmWeakSigDetect=%d\n",
+ aniState->noiseImmunityLevel,
+ aniState->spurImmunityLevel,
+- !aniState->ofdmWeakSigDetectOff);
++ aniState->ofdmWeakSigDetect);
+ DBG2(
+ "cckWeakSigThreshold=%d, firstepLevel=%d, listenTime=%d\n",
+ aniState->cckWeakSigThreshold,
+@@ -1307,18 +1307,18 @@ static int ar5008_hw_ani_control_new(struct ath_hw *ah,
+ REG_CLR_BIT(ah, AR_PHY_SFCORR_LOW,
+ AR_PHY_SFCORR_LOW_USE_SELF_CORR_LOW);
+
+- if (!on != aniState->ofdmWeakSigDetectOff) {
++ if (on != aniState->ofdmWeakSigDetect) {
+ DBG2("ath9k: "
+ "** ch %d: ofdm weak signal: %s=>%s\n",
+ chan->channel,
+- !aniState->ofdmWeakSigDetectOff ?
++ aniState->ofdmWeakSigDetect ?
+ "on" : "off",
+ on ? "on" : "off");
+ if (on)
+ ah->stats.ast_ani_ofdmon++;
+ else
+ ah->stats.ast_ani_ofdmoff++;
+- aniState->ofdmWeakSigDetectOff = !on;
++ aniState->ofdmWeakSigDetect = on;
+ }
+ break;
+ }
+@@ -1467,7 +1467,7 @@ static int ar5008_hw_ani_control_new(struct ath_hw *ah,
+ DBG2("ath9k: "
+ "ANI parameters: SI=%d, ofdmWS=%s FS=%d MRCcck=%s listenTime=%d ofdmErrs=%d cckErrs=%d\n",
+ aniState->spurImmunityLevel,
+- !aniState->ofdmWeakSigDetectOff ? "on" : "off",
++ aniState->ofdmWeakSigDetect ? "on" : "off",
+ aniState->firstepLevel,
+ !aniState->mrcCCKOff ? "on" : "off",
+ aniState->listenTime,
+@@ -1554,7 +1554,7 @@ static void ar5008_hw_ani_cache_ini_regs(struct ath_hw *ah)
+ /* these levels just got reset to defaults by the INI */
+ aniState->spurImmunityLevel = ATH9K_ANI_SPUR_IMMUNE_LVL_NEW;
+ aniState->firstepLevel = ATH9K_ANI_FIRSTEP_LVL_NEW;
+- aniState->ofdmWeakSigDetectOff = !ATH9K_ANI_USE_OFDM_WEAK_SIG;
++ aniState->ofdmWeakSigDetect = ATH9K_ANI_USE_OFDM_WEAK_SIG;
+ aniState->mrcCCKOff = 1; /* not available on pre AR9003 */
+ }
+
+diff --git a/src/drivers/net/ath/ath9k/ath9k_ar9003_phy.c b/src/drivers/net/ath/ath9k/ath9k_ar9003_phy.c
+index 6103040..2244b77 100644
+--- a/src/drivers/net/ath/ath9k/ath9k_ar9003_phy.c
++++ b/src/drivers/net/ath/ath9k/ath9k_ar9003_phy.c
+@@ -859,18 +859,18 @@ static int ar9003_hw_ani_control(struct ath_hw *ah,
+ REG_CLR_BIT(ah, AR_PHY_SFCORR_LOW,
+ AR_PHY_SFCORR_LOW_USE_SELF_CORR_LOW);
+
+- if (!on != aniState->ofdmWeakSigDetectOff) {
++ if (on != aniState->ofdmWeakSigDetect) {
+ DBG2("ath9k: "
+ "** ch %d: ofdm weak signal: %s=>%s\n",
+ chan->channel,
+- !aniState->ofdmWeakSigDetectOff ?
++ aniState->ofdmWeakSigDetect ?
+ "on" : "off",
+ on ? "on" : "off");
+ if (on)
+ ah->stats.ast_ani_ofdmon++;
+ else
+ ah->stats.ast_ani_ofdmoff++;
+- aniState->ofdmWeakSigDetectOff = !on;
++ aniState->ofdmWeakSigDetect = on;
+ }
+ break;
+ }
+@@ -1013,7 +1013,7 @@ static int ar9003_hw_ani_control(struct ath_hw *ah,
+ AR_PHY_MRC_CCK_ENABLE, is_on);
+ REG_RMW_FIELD(ah, AR_PHY_MRC_CCK_CTRL,
+ AR_PHY_MRC_CCK_MUX_REG, is_on);
+- if (!is_on != aniState->mrcCCKOff) {
++ if (!(is_on != aniState->mrcCCKOff)) {
+ DBG2("ath9k: "
+ "** ch %d: MRC CCK: %s=>%s\n",
+ chan->channel,
+@@ -1037,7 +1037,7 @@ static int ar9003_hw_ani_control(struct ath_hw *ah,
+ DBG2("ath9k: "
+ "ANI parameters: SI=%d, ofdmWS=%s FS=%d MRCcck=%s listenTime=%d ofdmErrs=%d cckErrs=%d\n",
+ aniState->spurImmunityLevel,
+- !aniState->ofdmWeakSigDetectOff ? "on" : "off",
++ aniState->ofdmWeakSigDetect ? "on" : "off",
+ aniState->firstepLevel,
+ !aniState->mrcCCKOff ? "on" : "off",
+ aniState->listenTime,
+@@ -1137,7 +1137,7 @@ static void ar9003_hw_ani_cache_ini_regs(struct ath_hw *ah)
+ /* these levels just got reset to defaults by the INI */
+ aniState->spurImmunityLevel = ATH9K_ANI_SPUR_IMMUNE_LVL_NEW;
+ aniState->firstepLevel = ATH9K_ANI_FIRSTEP_LVL_NEW;
+- aniState->ofdmWeakSigDetectOff = !ATH9K_ANI_USE_OFDM_WEAK_SIG;
++ aniState->ofdmWeakSigDetect = ATH9K_ANI_USE_OFDM_WEAK_SIG;
+ aniState->mrcCCKOff = !ATH9K_ANI_ENABLE_MRC_CCK;
+ }
+
+--
+2.4.3
+
diff --git a/tools/firmware/etherboot/patches/series b/tools/firmware/etherboot/patches/series
index 5bd7df8..2c39853 100644
--- a/tools/firmware/etherboot/patches/series
+++ b/tools/firmware/etherboot/patches/series
@@ -2,3 +2,5 @@ boot_prompt_option.patch
build_fix_1.patch
build_fix_2.patch
build_fix_3.patch
+build-compare.patch
+build_fix_4.patch
diff --git a/tools/firmware/hvmloader/32bitbios_support.c b/tools/firmware/hvmloader/32bitbios_support.c
index fe770a3..1141350 100644
--- a/tools/firmware/hvmloader/32bitbios_support.c
+++ b/tools/firmware/hvmloader/32bitbios_support.c
@@ -17,8 +17,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <inttypes.h>
diff --git a/tools/firmware/hvmloader/Makefile b/tools/firmware/hvmloader/Makefile
index 46a79c5..0560a7b 100644
--- a/tools/firmware/hvmloader/Makefile
+++ b/tools/firmware/hvmloader/Makefile
@@ -14,8 +14,7 @@
# more details.
#
# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-# Place - Suite 330, Boston, MA 02111-1307 USA.
+# this program; If not, see <http://www.gnu.org/licenses/>.
#
XEN_ROOT = $(CURDIR)/../../..
@@ -26,10 +25,16 @@ SUBDIRS := acpi
# The HVM loader is started in 32-bit mode at the address below:
LOADADDR = 0x100000
+# SMBIOS spec requires format mm/dd/yyyy
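+# ?= lets the build environment pin the date (for reproducible
+# builds); otherwise it defaults to today.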
+SMBIOS_REL_DATE ?= $(shell date +%m/%d/%Y)
+
CFLAGS += $(CFLAGS_xeninclude)
+# We mustn't use tools-only public interfaces.
+CFLAGS += -U__XEN_TOOLS__ -D__XEN_INTERFACE_VERSION__=__XEN_LATEST_INTERFACE_VERSION__
+
OBJS = hvmloader.o mp_tables.o util.o smbios.o
-OBJS += smp.o cacheattr.o xenbus.o
+OBJS += smp.o cacheattr.o xenbus.o vnuma.o
OBJS += e820.o pci.o pir.o ctype.o
OBJS += hvm_param.o
ifeq ($(debug),y)
@@ -88,7 +93,7 @@ all: subdirs-all
$(MAKE) hvmloader
ovmf.o rombios.o seabios.o hvmloader.o: roms.inc
-smbios.o: CFLAGS += -D__SMBIOS_DATE__="\"$(shell date +%m/%d/%Y)\""
+smbios.o: CFLAGS += -D__SMBIOS_DATE__="\"$(SMBIOS_REL_DATE)\""
hvmloader: $(OBJS) acpi/acpi.a
$(LD) $(LDFLAGS_DIRECT) -N -Ttext $(LOADADDR) -o hvmloader.tmp $^
@@ -139,4 +144,7 @@ clean: subdirs-clean
rm -f roms.inc roms.inc.new acpi.h
rm -f hvmloader hvmloader.tmp *.o $(DEPS)
+.PHONY: distclean
+distclean: clean
+
-include $(DEPS)
diff --git a/tools/firmware/hvmloader/acpi/Makefile b/tools/firmware/hvmloader/acpi/Makefile
index 2c50851..d3e882a 100644
--- a/tools/firmware/hvmloader/acpi/Makefile
+++ b/tools/firmware/hvmloader/acpi/Makefile
@@ -11,8 +11,7 @@
# more details.
#
# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-# Place - Suite 330, Boston, MA 02111-1307 USA.
+# this program; If not, see <http://www.gnu.org/licenses/>.
#
XEN_ROOT = $(CURDIR)/../../../..
@@ -36,12 +35,12 @@ mk_dsdt: mk_dsdt.c
dsdt_anycpu_qemu_xen.asl: dsdt.asl mk_dsdt
awk 'NR > 1 {print s} {s=$$0}' $< > $@
- ./mk_dsdt --dm-version qemu-xen >> $@
+ ./mk_dsdt --debug=$(debug) --dm-version qemu-xen >> $@
# NB. awk invocation is a portable alternative to 'head -n -1'
dsdt_%cpu.asl: dsdt.asl mk_dsdt
awk 'NR > 1 {print s} {s=$$0}' $< > $@
- ./mk_dsdt --maxcpu $* >> $@
+ ./mk_dsdt --debug=$(debug) --maxcpu $* >> $@
$(filter dsdt_%.c,$(C_SRC)): %.c: iasl %.asl
iasl -vs -p $* -tc $*.asl
@@ -66,6 +65,8 @@ clean:
rm -rf *.a *.o $(IASL_VER) $(IASL_VER).tar.gz $(DEPS)
rm -rf ssdt_*.h dsdt*.c *~ *.aml *.hex mk_dsdt dsdt_*.asl
+distclean: clean
+
install: all
-include $(DEPS)
diff --git a/tools/firmware/hvmloader/acpi/acpi2_0.h b/tools/firmware/hvmloader/acpi/acpi2_0.h
index 7b22d80..78eb43d 100644
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
#ifndef _ACPI_2_0_H_
@@ -364,6 +363,63 @@ struct acpi_20_madt_intsrcovr {
};
/*
+ * System Resource Affinity Table header definition (SRAT)
+ */
+struct acpi_20_srat {
+ struct acpi_header header;
+ uint32_t table_revision;
+ uint32_t reserved2[2];
+};
+
+#define ACPI_SRAT_TABLE_REVISION 1
+
+/*
+ * System Resource Affinity Table structure types.
+ */
+#define ACPI_PROCESSOR_AFFINITY 0x0
+#define ACPI_MEMORY_AFFINITY 0x1
+struct acpi_20_srat_processor {
+ uint8_t type;
+ uint8_t length;
+ uint8_t domain;
+ uint8_t apic_id;
+ uint32_t flags;
+ uint8_t sapic_id;
+ uint8_t domain_hi[3];
+ uint32_t reserved;
+};
+
+/*
+ * Local APIC Affinity Flags. All other bits are reserved and must be 0.
+ */
+#define ACPI_LOCAL_APIC_AFFIN_ENABLED (1 << 0)
+
+struct acpi_20_srat_memory {
+ uint8_t type;
+ uint8_t length;
+ uint32_t domain;
+ uint16_t reserved;
+ uint64_t base_address;
+ uint64_t mem_length;
+ uint32_t reserved2;
+ uint32_t flags;
+ uint64_t reserved3;
+};
+
+/*
+ * Memory Affinity Flags. All other bits are reserved and must be 0.
+ */
+#define ACPI_MEM_AFFIN_ENABLED (1 << 0)
+#define ACPI_MEM_AFFIN_HOTPLUGGABLE (1 << 1)
+#define ACPI_MEM_AFFIN_NONVOLATILE (1 << 2)
+
+struct acpi_20_slit {
+ struct acpi_header header;
+ uint64_t localities;
+ uint8_t entry[0];
+};
+
+/*
* Table Signatures.
*/
#define ACPI_2_0_RSDP_SIGNATURE ASCII64('R','S','D',' ','P','T','R',' ')
@@ -375,6 +431,8 @@ struct acpi_20_madt_intsrcovr {
#define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
#define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
#define ACPI_2_0_WAET_SIGNATURE ASCII32('W','A','E','T')
+#define ACPI_2_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
+#define ACPI_2_0_SLIT_SIGNATURE ASCII32('S','L','I','T')
/*
* Table revision numbers.
@@ -388,6 +446,8 @@ struct acpi_20_madt_intsrcovr {
#define ACPI_2_0_HPET_REVISION 0x01
#define ACPI_2_0_WAET_REVISION 0x01
#define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_2_0_SRAT_REVISION 0x01
+#define ACPI_2_0_SLIT_REVISION 0x01
#pragma pack ()
diff --git a/tools/firmware/hvmloader/acpi/build.c b/tools/firmware/hvmloader/acpi/build.c
index 1431296..503648c 100644
--- a/tools/firmware/hvmloader/acpi/build.c
+++ b/tools/firmware/hvmloader/acpi/build.c
@@ -12,8 +12,7 @@
* details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "acpi2_0.h"
@@ -23,6 +22,7 @@
#include "ssdt_pm.h"
#include "../config.h"
#include "../util.h"
+#include "../vnuma.h"
#include <xen/hvm/hvm_xs_strings.h>
#include <xen/hvm/params.h>
@@ -203,6 +203,95 @@ static struct acpi_20_waet *construct_waet(void)
return waet;
}
+static struct acpi_20_srat *construct_srat(void)
+{
+ struct acpi_20_srat *srat;
+ struct acpi_20_srat_processor *processor;
+ struct acpi_20_srat_memory *memory;
+ unsigned int size;
+ void *p;
+ unsigned int i;
+
+ size = sizeof(*srat) + sizeof(*processor) * hvm_info->nr_vcpus +
+ sizeof(*memory) * nr_vmemranges;
+
+ p = mem_alloc(size, 16);
+ if ( !p )
+ return NULL;
+
+ srat = memset(p, 0, size);
+ srat->header.signature = ACPI_2_0_SRAT_SIGNATURE;
+ srat->header.revision = ACPI_2_0_SRAT_REVISION;
+ fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+ fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+ srat->header.oem_revision = ACPI_OEM_REVISION;
+ srat->header.creator_id = ACPI_CREATOR_ID;
+ srat->header.creator_revision = ACPI_CREATOR_REVISION;
+ srat->table_revision = ACPI_SRAT_TABLE_REVISION;
+
+ processor = (struct acpi_20_srat_processor *)(srat + 1);
+ for ( i = 0; i < hvm_info->nr_vcpus; i++ )
+ {
+ processor->type = ACPI_PROCESSOR_AFFINITY;
+ processor->length = sizeof(*processor);
+ processor->domain = vcpu_to_vnode[i];
+ processor->apic_id = LAPIC_ID(i);
+ processor->flags = ACPI_LOCAL_APIC_AFFIN_ENABLED;
+ processor++;
+ }
+
+ memory = (struct acpi_20_srat_memory *)processor;
+ for ( i = 0; i < nr_vmemranges; i++ )
+ {
+ memory->type = ACPI_MEMORY_AFFINITY;
+ memory->length = sizeof(*memory);
+ memory->domain = vmemrange[i].nid;
+ memory->flags = ACPI_MEM_AFFIN_ENABLED;
+ memory->base_address = vmemrange[i].start;
+ memory->mem_length = vmemrange[i].end - vmemrange[i].start;
+ memory++;
+ }
+
+ ASSERT(((unsigned long)memory) - ((unsigned long)p) == size);
+
+ srat->header.length = size;
+ set_checksum(srat, offsetof(struct acpi_header, checksum), size);
+
+ return srat;
+}
+
+static struct acpi_20_slit *construct_slit(void)
+{
+ struct acpi_20_slit *slit;
+ unsigned int i, num, size;
+
+ num = nr_vnodes * nr_vnodes;
+ size = sizeof(*slit) + num * sizeof(uint8_t);
+
+ slit = mem_alloc(size, 16);
+ if ( !slit )
+ return NULL;
+
+ memset(slit, 0, size);
+ slit->header.signature = ACPI_2_0_SLIT_SIGNATURE;
+ slit->header.revision = ACPI_2_0_SLIT_REVISION;
+ fixed_strcpy(slit->header.oem_id, ACPI_OEM_ID);
+ fixed_strcpy(slit->header.oem_table_id, ACPI_OEM_TABLE_ID);
+ slit->header.oem_revision = ACPI_OEM_REVISION;
+ slit->header.creator_id = ACPI_CREATOR_ID;
+ slit->header.creator_revision = ACPI_CREATOR_REVISION;
+
+ for ( i = 0; i < num; i++ )
+ slit->entry[i] = vdistance[i];
+
+ slit->localities = nr_vnodes;
+
+ slit->header.length = size;
+ set_checksum(slit, offsetof(struct acpi_header, checksum), size);
+
+ return slit;
+}
+
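The set_checksum() calls above fill in header.checksum so that the table's
bytes sum to zero modulo 256, which is the ACPI checksum invariant. A minimal
standalone verifier of that invariant (a sketch for illustration, not part of
the patch):

    #include <stdint.h>
    #include <stddef.h>

    /* Returns 1 iff the table's byte sum, including the checksum field
     * itself, is zero modulo 256. */
    static int acpi_checksum_ok(const void *table, size_t len)
    {
        const uint8_t *p = table;
        uint8_t sum = 0;

        while ( len-- )
            sum += *p++;

        return sum == 0;
    }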
static int construct_passthrough_tables(unsigned long *table_ptrs,
int nr_tables)
{
@@ -346,6 +435,22 @@ static int construct_secondary_tables(unsigned long *table_ptrs,
}
}
+ /* SRAT and SLIT */
+ if ( nr_vnodes > 0 )
+ {
+ struct acpi_20_srat *srat = construct_srat();
+ struct acpi_20_slit *slit = construct_slit();
+
+ if ( srat )
+ table_ptrs[nr_tables++] = (unsigned long)srat;
+ else
+ printf("Failed to build SRAT, skipping...\n");
+ if ( slit )
+ table_ptrs[nr_tables++] = (unsigned long)slit;
+ else
+ printf("Failed to build SLIT, skipping...\n");
+ }
+
/* Load any additional tables passed through. */
nr_tables += construct_passthrough_tables(table_ptrs, nr_tables);
diff --git a/tools/firmware/hvmloader/acpi/dsdt.asl b/tools/firmware/hvmloader/acpi/dsdt.asl
index 5610385..e266dc2 100644
--- a/tools/firmware/hvmloader/acpi/dsdt.asl
+++ b/tools/firmware/hvmloader/acpi/dsdt.asl
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
@@ -109,7 +108,7 @@ DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
Method (_CRS, 0, NotSerialized)
{
- Name (PRT0, ResourceTemplate ()
+ Store (ResourceTemplate ()
{
/* bus number is from 0 - 255*/
WordBusNumber(
@@ -167,11 +166,11 @@ DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
0x0000000000000010,
,, _Y02)
- })
+ }, Local1)
- CreateDWordField(PRT0, \_SB.PCI0._CRS._Y01._MIN, MMIN)
- CreateDWordField(PRT0, \_SB.PCI0._CRS._Y01._MAX, MMAX)
- CreateDWordField(PRT0, \_SB.PCI0._CRS._Y01._LEN, MLEN)
+ CreateDWordField(Local1, \_SB.PCI0._CRS._Y01._MIN, MMIN)
+ CreateDWordField(Local1, \_SB.PCI0._CRS._Y01._MAX, MMAX)
+ CreateDWordField(Local1, \_SB.PCI0._CRS._Y01._LEN, MLEN)
Store(\_SB.PMIN, MMIN)
Store(\_SB.PLEN, MLEN)
@@ -192,12 +191,12 @@ DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
} Else {
Store(\_SB.PCI0._CRS._Y02, Local0)
}
- CreateDWordField(PRT0, Add(Local0, 14), MINL)
- CreateDWordField(PRT0, Add(Local0, 18), MINH)
- CreateDWordField(PRT0, Add(Local0, 22), MAXL)
- CreateDWordField(PRT0, Add(Local0, 26), MAXH)
- CreateDWordField(PRT0, Add(Local0, 38), LENL)
- CreateDWordField(PRT0, Add(Local0, 42), LENH)
+ CreateDWordField(Local1, Add(Local0, 14), MINL)
+ CreateDWordField(Local1, Add(Local0, 18), MINH)
+ CreateDWordField(Local1, Add(Local0, 22), MAXL)
+ CreateDWordField(Local1, Add(Local0, 26), MAXH)
+ CreateDWordField(Local1, Add(Local0, 38), LENL)
+ CreateDWordField(Local1, Add(Local0, 42), LENH)
Store(\_SB.LMIN, MINL)
Store(\_SB.HMIN, MINH)
@@ -215,7 +214,7 @@ DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
Subtract(MAXL, One, MAXL)
}
- Return (PRT0)
+ Return (Local1)
}
Device(HPET) {
diff --git a/tools/firmware/hvmloader/acpi/mk_dsdt.c b/tools/firmware/hvmloader/acpi/mk_dsdt.c
index a4b693b..b567b38 100644
--- a/tools/firmware/hvmloader/acpi/mk_dsdt.c
+++ b/tools/firmware/hvmloader/acpi/mk_dsdt.c
@@ -4,9 +4,11 @@
#include <string.h>
#include <getopt.h>
#include <stdlib.h>
+#include <stdbool.h>
#include <xen/hvm/hvm_info_table.h>
static unsigned int indent_level;
+static bool debug = false;
typedef enum dm_version {
QEMU_XEN_TRADITIONAL,
@@ -83,6 +85,7 @@ static void decision_tree(
static struct option options[] = {
{ "maxcpu", 1, 0, 'c' },
{ "dm-version", 1, 0, 'q' },
+ { "debug", 1, 0, 'd' },
{ 0, 0, 0, 0 }
};
@@ -125,6 +128,10 @@ int main(int argc, char **argv)
return -1;
}
break;
+ case 'd':
+ if (*optarg == 'y')
+ debug = true;
+ break;
default:
return -1;
}
@@ -222,12 +229,9 @@ int main(int argc, char **argv)
/* Define GPE control method. */
push_block("Scope", "\\_GPE");
- if (dm_version == QEMU_XEN_TRADITIONAL) {
- push_block("Method", "_L02");
- } else {
- push_block("Method", "_E02");
- }
- stmt("Return", "\\_SB.PRSC()");
+ push_block("Method",
+ dm_version == QEMU_XEN_TRADITIONAL ? "_L02" : "_E02");
+ stmt("\\_SB.PRSC ()", NULL);
pop_block();
pop_block();
/**** Processor end ****/
@@ -347,14 +351,20 @@ int main(int argc, char **argv)
/* _SUN == dev */
stmt("Name", "_SUN, 0x%08x", slot >> 3);
push_block("Method", "_EJ0, 1");
- stmt("Store", "0x%02x, \\_GPE.DPT1", slot);
- stmt("Store", "0x88, \\_GPE.DPT2");
+ if (debug)
+ {
+ stmt("Store", "0x%02x, \\_GPE.DPT1", slot);
+ stmt("Store", "0x88, \\_GPE.DPT2");
+ }
stmt("Store", "0x%02x, \\_GPE.PH%02X", /* eject */
(slot & 1) ? 0x10 : 0x01, slot & ~1);
pop_block();
push_block("Method", "_STA, 0");
- stmt("Store", "0x%02x, \\_GPE.DPT1", slot);
- stmt("Store", "0x89, \\_GPE.DPT2");
+ if (debug)
+ {
+ stmt("Store", "0x%02x, \\_GPE.DPT1", slot);
+ stmt("Store", "0x89, \\_GPE.DPT2");
+ }
if ( slot & 1 )
stmt("ShiftRight", "0x4, \\_GPE.PH%02X, Local1", slot & ~1);
else
@@ -374,8 +384,7 @@ int main(int argc, char **argv)
push_block("Device", "S%i", slot); {
stmt("Name", "_ADR, %#06x0000", slot);
push_block("Method", "_EJ0,1"); {
- stmt("Store", "ShiftLeft(1, %#06x), B0EJ", slot);
- stmt("Return", "0x0");
+ stmt("Store", "%#010x, B0EJ", 1 << slot);
} pop_block();
stmt("Name", "_SUN, %i", slot);
} pop_block();
@@ -425,9 +434,11 @@ int main(int argc, char **argv)
stmt("And", "Local1, 0xf, EVT");
stmt("Store", "PSTB, Local1"); /* XXX: Store (PSTB, SLT) ? */
stmt("And", "Local1, 0xff, SLT");
- /* Debug */
- stmt("Store", "SLT, DPT1");
- stmt("Store", "EVT, DPT2");
+ if (debug)
+ {
+ stmt("Store", "SLT, DPT1");
+ stmt("Store", "EVT, DPT2");
+ }
/* Decision tree */
decision_tree(0x00, 0x100, "SLT", pci_hotplug_notify);
pop_block();
diff --git a/tools/firmware/hvmloader/acpi/ssdt_pm.asl b/tools/firmware/hvmloader/acpi/ssdt_pm.asl
index afb78b6..1a7d752 100644
--- a/tools/firmware/hvmloader/acpi/ssdt_pm.asl
+++ b/tools/firmware/hvmloader/acpi/ssdt_pm.asl
@@ -15,8 +15,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
@@ -276,14 +275,13 @@ DefinitionBlock ("SSDT_PM.aml", "SSDT", 2, "Xen", "HVM", 0)
HLP8 (Arg0, Local0)
Increment (Local0)
}
+ Return (Arg0)
}
Method (HLPA, 0, NotSerialized)
{
Store (HLP6 (), Local0)
- Name (TMP, Buffer (Local0) {})
- HLP9 (TMP, Local0)
- Return (TMP)
+ Return (HLP9 (Buffer (Local0) {}, Local0))
}
Method (REL, 0, NotSerialized)
@@ -372,14 +370,14 @@ DefinitionBlock ("SSDT_PM.aml", "SSDT", 2, "Xen", "HVM", 0)
INIT (0x02)
INIT (0x01)
HLP5 ()
- Name (BST0, Package (0x04) {})
- Store (HLP7 (), Index (BST0, 0x00))
- Store (HLP7 (), Index (BST0, 0x01))
- Store (HLP7 (), Index (BST0, 0x02))
- Store (HLP7 (), Index (BST0, 0x03))
+ Store (Package (0x04) {}, Local0)
+ Store (HLP7 (), Index (Local0, 0x00))
+ Store (HLP7 (), Index (Local0, 0x01))
+ Store (HLP7 (), Index (Local0, 0x02))
+ Store (HLP7 (), Index (Local0, 0x03))
REL ()
Store (2, \_SB.DBG1)
- Return (BST0)
+ Return (Local0)
}
}
@@ -409,13 +407,13 @@ DefinitionBlock ("SSDT_PM.aml", "SSDT", 2, "Xen", "HVM", 0)
INIT (0x02)
INIT (0x02)
HLP5 ()
- Name (BST1, Package (0x04) {})
- Store (HLP7 (), Index (BST1, 0x00))
- Store (HLP7 (), Index (BST1, 0x01))
- Store (HLP7 (), Index (BST1, 0x02))
- Store (HLP7 (), Index (BST1, 0x03))
+ Store (Package (0x04) {}, Local0)
+ Store (HLP7 (), Index (Local0, 0x00))
+ Store (HLP7 (), Index (Local0, 0x01))
+ Store (HLP7 (), Index (Local0, 0x02))
+ Store (HLP7 (), Index (Local0, 0x03))
REL ()
- Return (BST1)
+ Return (Local0)
}
}
}
diff --git a/tools/firmware/hvmloader/acpi/ssdt_s3.asl b/tools/firmware/hvmloader/acpi/ssdt_s3.asl
index dad1db5..f89ac02 100644
--- a/tools/firmware/hvmloader/acpi/ssdt_s3.asl
+++ b/tools/firmware/hvmloader/acpi/ssdt_s3.asl
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
DefinitionBlock ("SSDT_S3.aml", "SSDT", 2, "Xen", "HVM", 0)
diff --git a/tools/firmware/hvmloader/acpi/ssdt_s4.asl b/tools/firmware/hvmloader/acpi/ssdt_s4.asl
index 0a84381..d589e4b 100644
--- a/tools/firmware/hvmloader/acpi/ssdt_s4.asl
+++ b/tools/firmware/hvmloader/acpi/ssdt_s4.asl
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
DefinitionBlock ("SSDT_S4.aml", "SSDT", 2, "Xen", "HVM", 0)
diff --git a/tools/firmware/hvmloader/acpi/ssdt_tpm.asl b/tools/firmware/hvmloader/acpi/ssdt_tpm.asl
index 1157eb4..2ae8ad4 100644
--- a/tools/firmware/hvmloader/acpi/ssdt_tpm.asl
+++ b/tools/firmware/hvmloader/acpi/ssdt_tpm.asl
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
/* SSDT for TPM TIS Interface for Xen with Qemu device model. */
diff --git a/tools/firmware/hvmloader/acpi/static_tables.c b/tools/firmware/hvmloader/acpi/static_tables.c
index 323ae31..f4d627b 100644
--- a/tools/firmware/hvmloader/acpi/static_tables.c
+++ b/tools/firmware/hvmloader/acpi/static_tables.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "acpi2_0.h"
diff --git a/tools/firmware/hvmloader/cacheattr.c b/tools/firmware/hvmloader/cacheattr.c
index de8d39c..1ac6656 100644
--- a/tools/firmware/hvmloader/cacheattr.c
+++ b/tools/firmware/hvmloader/cacheattr.c
@@ -16,8 +16,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "util.h"
diff --git a/tools/firmware/hvmloader/e820.c b/tools/firmware/hvmloader/e820.c
index 2e05e93..bbde2be 100644
--- a/tools/firmware/hvmloader/e820.c
+++ b/tools/firmware/hvmloader/e820.c
@@ -16,13 +16,92 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "config.h"
#include "util.h"
+struct e820map memory_map;
+
+void memory_map_setup(void)
+{
+ unsigned int nr_entries = E820MAX, i;
+ int rc;
+ uint64_t alloc_addr = RESERVED_MEMORY_DYNAMIC_START;
+ uint64_t alloc_size = RESERVED_MEMORY_DYNAMIC_END - alloc_addr;
+
+ rc = get_mem_mapping_layout(memory_map.map, &nr_entries);
+
+ if ( rc || !nr_entries )
+ {
+ printf("Get guest memory maps[%d] failed. (%d)\n", nr_entries, rc);
+ BUG();
+ }
+
+ memory_map.nr_map = nr_entries;
+
+ for ( i = 0; i < nr_entries; i++ )
+ {
+ if ( memory_map.map[i].type == E820_RESERVED &&
+ check_overlap(alloc_addr, alloc_size,
+ memory_map.map[i].addr, memory_map.map[i].size) )
+ {
+ printf("Fail to setup memory map due to conflict");
+ printf(" on dynamic reserved memory range.\n");
+ BUG();
+ }
+ }
+}
+
+/*
+ * hvmloader may relocate RAM, changing low_mem_pgend/high_mem_pgend, but
+ * memory_map[] records only the original low/high memory layout. These
+ * entries must therefore be re-synced whenever hvmloader modifies low or
+ * high memory.
+ */
+void adjust_memory_map(void)
+{
+ uint32_t low_mem_end = hvm_info->low_mem_pgend << PAGE_SHIFT;
+ uint64_t high_mem_end = (uint64_t)hvm_info->high_mem_pgend << PAGE_SHIFT;
+ unsigned int i;
+
+ for ( i = 0; i < memory_map.nr_map; i++ )
+ {
+ uint64_t map_start = memory_map.map[i].addr;
+ uint64_t map_size = memory_map.map[i].size;
+ uint64_t map_end = map_start + map_size;
+
+ /* If we need to adjust lowmem. */
+ if ( memory_map.map[i].type == E820_RAM &&
+ low_mem_end > map_start && low_mem_end < map_end )
+ {
+ memory_map.map[i].size = low_mem_end - map_start;
+ continue;
+ }
+
+ /* Modify the existing highmem region if it exists. */
+ if ( memory_map.map[i].type == E820_RAM &&
+ high_mem_end && map_start == ((uint64_t)1 << 32) )
+ {
+ if ( high_mem_end != map_end )
+ memory_map.map[i].size = high_mem_end - map_start;
+ high_mem_end = 0;
+ continue;
+ }
+ }
+
+ /* If there was no highmem region, just create one. */
+ if ( high_mem_end )
+ {
+ memory_map.map[i].addr = ((uint64_t)1 << 32);
+ memory_map.map[i].size =
+ ((uint64_t)hvm_info->high_mem_pgend << PAGE_SHIFT) -
+ memory_map.map[i].addr;
+ memory_map.map[i].type = E820_RAM;
+ memory_map.nr_map++;
+ }
+}
+
void dump_e820_table(struct e820entry *e820, unsigned int nr)
{
uint64_t last_end = 0, start, end;
@@ -73,7 +152,8 @@ int build_e820_table(struct e820entry *e820,
unsigned int lowmem_reserved_base,
unsigned int bios_image_base)
{
- unsigned int nr = 0;
+ unsigned int nr = 0, i, j;
+ uint32_t low_mem_end = hvm_info->low_mem_pgend << PAGE_SHIFT;
if ( !lowmem_reserved_base )
lowmem_reserved_base = 0xA0000;
@@ -117,13 +197,6 @@ int build_e820_table(struct e820entry *e820,
e820[nr].type = E820_RESERVED;
nr++;
- /* Low RAM goes here. Reserve space for special pages. */
- BUG_ON((hvm_info->low_mem_pgend << PAGE_SHIFT) < (2u << 20));
- e820[nr].addr = 0x100000;
- e820[nr].size = (hvm_info->low_mem_pgend << PAGE_SHIFT) - e820[nr].addr;
- e820[nr].type = E820_RAM;
- nr++;
-
/*
* Explicitly reserve space for special pages.
* This space starts at RESERVED_MEMBASE and extends to cover various
@@ -159,16 +232,48 @@ int build_e820_table(struct e820entry *e820,
nr++;
}
+ /* Low RAM goes here. Reserve space for special pages. */
+ BUG_ON(low_mem_end < (2u << 20));
- if ( hvm_info->high_mem_pgend )
+ /*
+ * Construct the E820 table from the recorded memory map.
+ *
+ * The memory map created by the toolstack may include:
+ *
+ * #1. The low memory region
+ *
+ * Low RAM starts no lower than 1MB so that the standard regions
+ * of the PC memory map (BIOS, VGA memory-mapped I/O, vgabios)
+ * have enough space.
+ *
+ * #2. Reserved regions, if any exist
+ *
+ * #3. The high memory region, if it exists
+ *
+ * Note that there is at most one low memory entry and one high
+ * memory entry.
+ */
+ for ( i = 0; i < memory_map.nr_map; i++ )
{
- e820[nr].addr = ((uint64_t)1 << 32);
- e820[nr].size =
- ((uint64_t)hvm_info->high_mem_pgend << PAGE_SHIFT) - e820[nr].addr;
- e820[nr].type = E820_RAM;
+ e820[nr] = memory_map.map[i];
nr++;
}
+ /* Finally we need to sort all e820 entries. */
+ for ( j = 0; j < nr - 1; j++ )
+ {
+ for ( i = j + 1; i < nr; i++ )
+ {
+ if ( e820[j].addr > e820[i].addr )
+ {
+ struct e820entry tmp = e820[j];
+
+ e820[j] = e820[i];
+ e820[i] = tmp;
+ }
+ }
+ }
+
return nr;
}
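The sort above is a simple O(n^2) swap sort, which is adequate for at most
E820MAX (128) entries and keeps hvmloader free of libc dependencies. For
reference, the same ascending-by-address ordering expressed as a qsort
comparator (illustrative only; hvmloader itself links no libc):

    #include <stdlib.h>

    /* Ascending order by start address; struct e820entry as declared in
     * hvmloader's e820.h. */
    static int e820_cmp(const void *a, const void *b)
    {
        const struct e820entry *ea = a, *eb = b;

        return (ea->addr > eb->addr) - (ea->addr < eb->addr);
    }

    /* usage sketch: qsort(e820, nr, sizeof(*e820), e820_cmp); */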
diff --git a/tools/firmware/hvmloader/e820.h b/tools/firmware/hvmloader/e820.h
index b2ead7f..8b5a9e0 100644
--- a/tools/firmware/hvmloader/e820.h
+++ b/tools/firmware/hvmloader/e820.h
@@ -15,6 +15,13 @@ struct e820entry {
uint32_t type;
} __attribute__((packed));
+#define E820MAX 128
+
+struct e820map {
+ unsigned int nr_map;
+ struct e820entry map[E820MAX];
+};
+
#endif /* __HVMLOADER_E820_H__ */
/*
diff --git a/tools/firmware/hvmloader/hvmloader.c b/tools/firmware/hvmloader/hvmloader.c
index 7b0da38..716d03c 100644
--- a/tools/firmware/hvmloader/hvmloader.c
+++ b/tools/firmware/hvmloader/hvmloader.c
@@ -16,8 +16,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "util.h"
@@ -26,6 +25,7 @@
#include "pci_regs.h"
#include "apic_regs.h"
#include "acpi/acpi2_0.h"
+#include "vnuma.h"
#include <xen/version.h>
#include <xen/hvm/params.h>
@@ -261,6 +261,8 @@ int main(void)
init_hypercalls();
+ memory_map_setup();
+
xenbus_setup();
bios = detect_bios();
@@ -310,6 +312,8 @@ int main(void)
if ( acpi_enabled )
{
+ init_vnuma_info();
+
if ( bios->acpi_build_tables )
{
printf("Loading ACPI ...\n");
diff --git a/tools/firmware/hvmloader/mkhex b/tools/firmware/hvmloader/mkhex
index cb21257..d0982d5 100755
--- a/tools/firmware/hvmloader/mkhex
+++ b/tools/firmware/hvmloader/mkhex
@@ -16,8 +16,7 @@
# more details.
#
# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-# Place - Suite 330, Boston, MA 02111-1307 USA.
+# this program; If not, see <http://www.gnu.org/licenses/>.
#
echo "unsigned $1[] = {"
diff --git a/tools/firmware/hvmloader/mp_tables.c b/tools/firmware/hvmloader/mp_tables.c
index fd636a0..69c2885 100644
--- a/tools/firmware/hvmloader/mp_tables.c
+++ b/tools/firmware/hvmloader/mp_tables.c
@@ -24,8 +24,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdint.h>
diff --git a/tools/firmware/hvmloader/optionroms.c b/tools/firmware/hvmloader/optionroms.c
index e35aebc..9708058 100644
--- a/tools/firmware/hvmloader/optionroms.c
+++ b/tools/firmware/hvmloader/optionroms.c
@@ -16,8 +16,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "config.h"
diff --git a/tools/firmware/hvmloader/ovmf.c b/tools/firmware/hvmloader/ovmf.c
index 28dd7bc..bb3da93 100644
--- a/tools/firmware/hvmloader/ovmf.c
+++ b/tools/firmware/hvmloader/ovmf.c
@@ -18,8 +18,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "config.h"
diff --git a/tools/firmware/hvmloader/pci.c b/tools/firmware/hvmloader/pci.c
index 5ff87a7..4eb1a31 100644
--- a/tools/firmware/hvmloader/pci.c
+++ b/tools/firmware/hvmloader/pci.c
@@ -16,8 +16,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "util.h"
@@ -38,6 +37,45 @@ uint64_t pci_hi_mem_start = 0, pci_hi_mem_end = 0;
enum virtual_vga virtual_vga = VGA_none;
unsigned long igd_opregion_pgbase = 0;
+/* Check if the specified range conflicts with any reserved device memory. */
+static bool check_overlap_all(uint64_t start, uint64_t size)
+{
+ unsigned int i;
+
+ for ( i = 0; i < memory_map.nr_map; i++ )
+ {
+ if ( memory_map.map[i].type == E820_RESERVED &&
+ check_overlap(start, size,
+ memory_map.map[i].addr,
+ memory_map.map[i].size) )
+ return true;
+ }
+
+ return false;
+}
+
+/* Find the lowest RMRR ending above base but below 4G. */
+static int find_next_rmrr(uint32_t base)
+{
+ unsigned int i;
+ int next_rmrr = -1;
+ uint64_t end, min_end = 1ULL << 32;
+
+ for ( i = 0; i < memory_map.nr_map ; i++ )
+ {
+ end = memory_map.map[i].addr + memory_map.map[i].size;
+
+ if ( memory_map.map[i].type == E820_RESERVED &&
+ end > base && end <= min_end )
+ {
+ next_rmrr = i;
+ min_end = end;
+ }
+ }
+
+ return next_rmrr;
+}
+
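find_next_rmrr() pairs with the BAR-placement loop added later in this file:
when a candidate BAR range collides with a reserved region, the base is
bumped past the region's end and re-aligned to the BAR size. The
re-alignment step, shown with hypothetical example values:

    /* A 1 MiB BAR colliding with a reserved region ending at 0xfeb80000
     * moves to the next 1 MiB boundary (hypothetical values). */
    uint64_t bar_sz   = 1ull << 20;
    uint64_t rmrr_end = 0xfeb80000ull;
    uint64_t base     = (rmrr_end + bar_sz - 1) & ~(bar_sz - 1);
    /* base == 0xfec00000 */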
void pci_setup(void)
{
uint8_t is_64bar, using_64bar, bar64_relocate = 0;
@@ -46,6 +84,7 @@ void pci_setup(void)
uint32_t vga_devfn = 256;
uint16_t class, vendor_id, device_id;
unsigned int bar, pin, link, isa_irq;
+ int next_rmrr;
/* Resources assignable to PCI devices via BARs. */
struct resource {
@@ -299,6 +338,15 @@ void pci_setup(void)
|| (((pci_mem_start << 1) >> PAGE_SHIFT)
>= hvm_info->low_mem_pgend)) )
pci_mem_start <<= 1;
+
+ /*
+ * Try to accommodate RMRRs in the MMIO region on a best-effort basis:
+ * if any RMRR overlaps the candidate range, lower pci_mem_start to the
+ * end of low memory (hvm_info->low_mem_pgend) to enlarge the hole.
+ */
+ if ( pci_mem_start > (hvm_info->low_mem_pgend << PAGE_SHIFT) &&
+ check_overlap_all(pci_mem_start, pci_mem_end-pci_mem_start) )
+ pci_mem_start = hvm_info->low_mem_pgend << PAGE_SHIFT;
}
if ( mmio_total > (pci_mem_end - pci_mem_start) )
@@ -334,6 +382,9 @@ void pci_setup(void)
hvm_info->high_mem_pgend += nr_pages;
}
+ /* Sync memory map[] if necessary. */
+ adjust_memory_map();
+
high_mem_resource.base = ((uint64_t)hvm_info->high_mem_pgend) << PAGE_SHIFT;
if ( high_mem_resource.base < 1ull << 32 )
{
@@ -352,6 +403,8 @@ void pci_setup(void)
io_resource.base = 0xc000;
io_resource.max = 0x10000;
+ next_rmrr = find_next_rmrr(pci_mem_start);
+
/* Assign iomem and ioport resources in descending order of size. */
for ( i = 0; i < nr_bars; i++ )
{
@@ -407,6 +460,19 @@ void pci_setup(void)
}
base = (resource->base + bar_sz - 1) & ~(uint64_t)(bar_sz - 1);
+
+ /* If we're using mem_resource, check for RMRR conflicts. */
+ while ( resource == &mem_resource &&
+ next_rmrr >= 0 &&
+ check_overlap(base, bar_sz,
+ memory_map.map[next_rmrr].addr,
+ memory_map.map[next_rmrr].size) )
+ {
+ base = memory_map.map[next_rmrr].addr + memory_map.map[next_rmrr].size;
+ base = (base + bar_sz - 1) & ~(bar_sz - 1);
+ next_rmrr = find_next_rmrr(base);
+ }
+
bar_data |= (uint32_t)base;
bar_data_upper = (uint32_t)(base >> 32);
base += bar_sz;
diff --git a/tools/firmware/hvmloader/pir_types.h b/tools/firmware/hvmloader/pir_types.h
index 6e50822..9f9259c 100644
--- a/tools/firmware/hvmloader/pir_types.h
+++ b/tools/firmware/hvmloader/pir_types.h
@@ -12,8 +12,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Citrix Systems, 2011
*
diff --git a/tools/firmware/hvmloader/rombios.c b/tools/firmware/hvmloader/rombios.c
index 810bd24..1f15b94 100644
--- a/tools/firmware/hvmloader/rombios.c
+++ b/tools/firmware/hvmloader/rombios.c
@@ -16,8 +16,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "config.h"
diff --git a/tools/firmware/hvmloader/seabios.c b/tools/firmware/hvmloader/seabios.c
index dd7dfbe..c6b3d9f 100644
--- a/tools/firmware/hvmloader/seabios.c
+++ b/tools/firmware/hvmloader/seabios.c
@@ -16,8 +16,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "config.h"
diff --git a/tools/firmware/hvmloader/smbios.c b/tools/firmware/hvmloader/smbios.c
index 4d3d692..210c7b0 100644
--- a/tools/firmware/hvmloader/smbios.c
+++ b/tools/firmware/hvmloader/smbios.c
@@ -12,8 +12,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) IBM Corporation, 2006
*
diff --git a/tools/firmware/hvmloader/smbios_types.h b/tools/firmware/hvmloader/smbios_types.h
index ff36564..e924f81 100644
--- a/tools/firmware/hvmloader/smbios_types.h
+++ b/tools/firmware/hvmloader/smbios_types.h
@@ -12,8 +12,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) IBM Corporation, 2006
*
diff --git a/tools/firmware/hvmloader/smp.c b/tools/firmware/hvmloader/smp.c
index fa96878..082b17f 100644
--- a/tools/firmware/hvmloader/smp.c
+++ b/tools/firmware/hvmloader/smp.c
@@ -16,8 +16,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "util.h"
diff --git a/tools/firmware/hvmloader/tests.c b/tools/firmware/hvmloader/tests.c
index 52772aa..fea3ad3 100644
--- a/tools/firmware/hvmloader/tests.c
+++ b/tools/firmware/hvmloader/tests.c
@@ -16,8 +16,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "util.h"
diff --git a/tools/firmware/hvmloader/util.c b/tools/firmware/hvmloader/util.c
index 80d822f..d779fd7 100644
--- a/tools/firmware/hvmloader/util.c
+++ b/tools/firmware/hvmloader/util.c
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "util.h"
@@ -27,6 +26,17 @@
#include <xen/memory.h>
#include <xen/sched.h>
+/*
+ * Check whether the range [start, start + size) overlaps the reserved
+ * range [reserved_start, reserved_start + reserved_size).
+ * Returns true on overlap, false otherwise.
+ */
+bool check_overlap(uint64_t start, uint64_t size,
+ uint64_t reserved_start, uint64_t reserved_size)
+{
+ return (start + size > reserved_start) &&
+ (start < reserved_start + reserved_size);
+}
+
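check_overlap() treats both ranges as half-open intervals [start, start +
size). A quick illustration with hypothetical values, using the standard
assert() outside the firmware:

    #include <assert.h>

    static void test_check_overlap(void)
    {
        /* [0x1000, 0x2000) vs [0x1800, 0x2800): overlap. */
        assert( check_overlap(0x1000, 0x1000, 0x1800, 0x1000));
        /* [0x1000, 0x2000) vs [0x2000, 0x3000): adjacent, no overlap. */
        assert(!check_overlap(0x1000, 0x1000, 0x2000, 0x1000));
    }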
void wrmsr(uint32_t idx, uint64_t v)
{
asm volatile (
@@ -368,6 +378,21 @@ uuid_to_string(char *dest, uint8_t *uuid)
*p = '\0';
}
+int get_mem_mapping_layout(struct e820entry entries[], uint32_t *max_entries)
+{
+ int rc;
+ struct xen_memory_map memmap = {
+ .nr_entries = *max_entries
+ };
+
+ set_xen_guest_handle(memmap.buffer, entries);
+
+ rc = hypercall_memory_op(XENMEM_memory_map, &memmap);
+ *max_entries = memmap.nr_entries;
+
+ return rc;
+}
+
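A typical call sequence for get_mem_mapping_layout(), mirroring
memory_map_setup() in e820.c (a sketch): the buffer capacity goes in via
*max_entries and the filled-in entry count comes back out.

    struct e820entry entries[E820MAX];
    uint32_t nr = E820MAX;

    if ( get_mem_mapping_layout(entries, &nr) || nr == 0 )
        BUG();  /* no usable memory map from the hypervisor */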
void mem_hole_populate_ram(xen_pfn_t mfn, uint32_t nr_mfns)
{
static int over_allocated;
@@ -406,6 +431,9 @@ void mem_hole_populate_ram(xen_pfn_t mfn, uint32_t nr_mfns)
if ( hypercall_memory_op(XENMEM_add_to_physmap, &xatp) != 0 )
BUG();
}
+
+ /* Sync memory map[]. */
+ adjust_memory_map();
}
static uint32_t alloc_up = RESERVED_MEMORY_DYNAMIC_START - 1;
diff --git a/tools/firmware/hvmloader/util.h b/tools/firmware/hvmloader/util.h
index a70e4aa..132d915 100644
--- a/tools/firmware/hvmloader/util.h
+++ b/tools/firmware/hvmloader/util.h
@@ -4,8 +4,10 @@
#include <stdarg.h>
#include <stdint.h>
#include <stddef.h>
+#include <stdbool.h>
#include <xen/xen.h>
#include <xen/hvm/hvm_info_table.h>
+#include "e820.h"
#define __STR(...) #__VA_ARGS__
#define STR(...) __STR(__VA_ARGS__)
@@ -82,9 +84,9 @@ uint32_t pci_read(uint32_t devfn, uint32_t reg, uint32_t len);
#define pci_readw(devfn, reg) ((uint16_t)pci_read(devfn, reg, 2))
#define pci_readl(devfn, reg) ((uint32_t)pci_read(devfn, reg, 4))
void pci_write(uint32_t devfn, uint32_t reg, uint32_t len, uint32_t val);
-#define pci_writeb(devfn, reg, val) (pci_write(devfn, reg, 1, (uint8_t) val))
-#define pci_writew(devfn, reg, val) (pci_write(devfn, reg, 2, (uint16_t)val))
-#define pci_writel(devfn, reg, val) (pci_write(devfn, reg, 4, (uint32_t)val))
+#define pci_writeb(devfn, reg, val) pci_write(devfn, reg, 1, (uint8_t) (val))
+#define pci_writew(devfn, reg, val) pci_write(devfn, reg, 2, (uint16_t)(val))
+#define pci_writel(devfn, reg, val) pci_write(devfn, reg, 4, (uint32_t)(val))
/* Get a pointer to the shared-info page */
struct shared_info *get_shared_info(void) __attribute__ ((const));
@@ -222,6 +224,12 @@ int hvm_param_set(uint32_t index, uint64_t value);
/* Setup PCI bus */
void pci_setup(void);
+/* Setup memory map */
+void memory_map_setup(void);
+
+/* Sync memory map */
+void adjust_memory_map(void);
+
/* Prepare the 32bit BIOS */
uint32_t rombios_highbios_setup(void);
@@ -249,6 +257,13 @@ void perform_tests(void);
extern char _start[], _end[];
+int get_mem_mapping_layout(struct e820entry entries[],
+ unsigned int *max_entries);
+
+extern struct e820map memory_map;
+bool check_overlap(uint64_t start, uint64_t size,
+ uint64_t reserved_start, uint64_t reserved_size);
+
#endif /* __HVMLOADER_UTIL_H__ */
/*
diff --git a/tools/firmware/hvmloader/vnuma.c b/tools/firmware/hvmloader/vnuma.c
new file mode 100644
index 0000000..4121cc6
--- /dev/null
+++ b/tools/firmware/hvmloader/vnuma.c
@@ -0,0 +1,80 @@
+/*
+ * vnuma.c: obtain vNUMA information from hypervisor
+ *
+ * Copyright (c) 2014 Wei Liu, Citrix Systems (R&D) Ltd.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "util.h"
+#include "hypercall.h"
+#include "vnuma.h"
+#include <xen/errno.h>
+
+unsigned int nr_vnodes, nr_vmemranges;
+unsigned int *vcpu_to_vnode, *vdistance;
+xen_vmemrange_t *vmemrange;
+
+void init_vnuma_info(void)
+{
+ int rc;
+ struct xen_vnuma_topology_info vnuma_topo = { .domid = DOMID_SELF };
+
+ rc = hypercall_memory_op(XENMEM_get_vnumainfo, &vnuma_topo);
+ if ( rc != -XEN_ENOBUFS )
+ return;
+
+ ASSERT(vnuma_topo.nr_vcpus == hvm_info->nr_vcpus);
+
+ vcpu_to_vnode =
+ scratch_alloc(sizeof(*vcpu_to_vnode) * hvm_info->nr_vcpus, 0);
+ vdistance = scratch_alloc(sizeof(uint32_t) * vnuma_topo.nr_vnodes *
+ vnuma_topo.nr_vnodes, 0);
+ vmemrange = scratch_alloc(sizeof(xen_vmemrange_t) *
+ vnuma_topo.nr_vmemranges, 0);
+
+ set_xen_guest_handle(vnuma_topo.vdistance.h, vdistance);
+ set_xen_guest_handle(vnuma_topo.vcpu_to_vnode.h, vcpu_to_vnode);
+ set_xen_guest_handle(vnuma_topo.vmemrange.h, vmemrange);
+
+ rc = hypercall_memory_op(XENMEM_get_vnumainfo, &vnuma_topo);
+
+ if ( rc < 0 )
+ {
+ printf("Failed to retrieve vNUMA information, rc = %d\n", rc);
+ return;
+ }
+
+ nr_vnodes = vnuma_topo.nr_vnodes;
+ nr_vmemranges = vnuma_topo.nr_vmemranges;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
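Once init_vnuma_info() has populated the globals, consumers can derive
per-node totals from the vmemrange array. An illustrative helper
(hypothetical, not part of the patch):

    /* Total bytes of guest memory assigned to a given virtual node,
     * assuming init_vnuma_info() has run successfully. */
    static uint64_t vnode_mem_size(unsigned int node)
    {
        unsigned int i;
        uint64_t total = 0;

        for ( i = 0; i < nr_vmemranges; i++ )
            if ( vmemrange[i].nid == node )
                total += vmemrange[i].end - vmemrange[i].start;

        return total;
    }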
diff --git a/tools/blktap/drivers/blktapctrl.h b/tools/firmware/hvmloader/vnuma.h
similarity index 72%
rename from tools/blktap/drivers/blktapctrl.h
rename to tools/firmware/hvmloader/vnuma.h
index 4512807..63b648a 100644
--- a/tools/blktap/drivers/blktapctrl.h
+++ b/tools/firmware/hvmloader/vnuma.h
@@ -1,8 +1,7 @@
-/* blktapctrl.h
+/******************************************************************************
+ * vnuma.h
*
- * controller image utils.
- *
- * (c) 2004-6 Andrew Warfield and Julian Chesterfield
+ * Copyright (c) 2014, Wei Liu
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
@@ -29,8 +28,25 @@
* IN THE SOFTWARE.
*/
+#ifndef __HVMLOADER_VNUMA_H__
+#define __HVMLOADER_VNUMA_H__
-int blktap_interface_open(void);
+#include <xen/memory.h>
-int blktap_interface_create(int ctlfd, int *major, int *minor, blkif_t *blkif);
+extern unsigned int nr_vnodes, nr_vmemranges;
+extern unsigned int *vcpu_to_vnode, *vdistance;
+extern xen_vmemrange_t *vmemrange;
+void init_vnuma_info(void);
+
+#endif /* __HVMLOADER_VNUMA_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/firmware/hvmloader/xenbus.c b/tools/firmware/hvmloader/xenbus.c
index f900a1e..d0ed993 100644
--- a/tools/firmware/hvmloader/xenbus.c
+++ b/tools/firmware/hvmloader/xenbus.c
@@ -105,7 +105,7 @@ void xenbus_shutdown(void)
/* Helper functions: copy data in and out of the ring */
static void ring_write(const char *data, uint32_t len)
{
- uint32_t part;
+ uint32_t part, done = 0;
ASSERT(len <= XENSTORE_PAYLOAD_MAX);
@@ -122,16 +122,18 @@ static void ring_write(const char *data, uint32_t len)
if ( part > len )
part = len;
- memcpy(rings->req + MASK_XENSTORE_IDX(rings->req_prod), data, part);
+ memcpy(rings->req + MASK_XENSTORE_IDX(rings->req_prod),
+ data + done, part);
barrier(); /* = wmb before prod write, rmb before next cons read */
rings->req_prod += part;
len -= part;
+ done += part;
}
}
static void ring_read(char *data, uint32_t len)
{
- uint32_t part;
+ uint32_t part, done = 0;
ASSERT(len <= XENSTORE_PAYLOAD_MAX);
@@ -148,10 +150,12 @@ static void ring_read(char *data, uint32_t len)
if ( part > len )
part = len;
- memcpy(data, rings->rsp + MASK_XENSTORE_IDX(rings->rsp_cons), part);
+ memcpy(data + done,
+ rings->rsp + MASK_XENSTORE_IDX(rings->rsp_cons), part);
barrier(); /* = wmb before cons write, rmb before next prod read */
rings->rsp_cons += part;
len -= part;
+ done += part;
}
}
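The bug fixed in these two hunks: each memcpy() chunk previously read from
(or wrote to) the start of the caller's buffer, so any message that wrapped
around the ring was corrupted. The corrected idiom advances through the
buffer with 'done' while the ring index wraps via MASK_XENSTORE_IDX();
schematically (ring_chunk() and ring_ptr are hypothetical stand-ins for the
space computation and the masked ring address):

    uint32_t done = 0;

    while ( len )
    {
        uint32_t part = ring_chunk();       /* hypothetical: bytes before wrap */

        if ( part > len )
            part = len;
        memcpy(ring_ptr, buf + done, part); /* advance through buf... */
        len  -= part;
        done += part;                       /* ...never restart at buf[0] */
    }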
@@ -204,15 +208,23 @@ static void xenbus_send(uint32_t type, ...)
* Returns 0 for success, or an errno for error.
* The answer is returned in a static buffer which is only
* valid until the next call of xenbus_send(). */
-static int xenbus_recv(uint32_t *reply_len, const char **reply_data)
+static int xenbus_recv(uint32_t *reply_len, const char **reply_data,
+ uint32_t *reply_type)
{
struct xsd_sockmsg hdr;
- /* Pull the reply off the ring */
- ring_read((char *) &hdr, sizeof(hdr));
- ring_read(payload, hdr.len);
- /* For sanity's sake, nul-terminate the answer */
- payload[hdr.len] = '\0';
+ do
+ {
+ /* Pull the reply off the ring */
+ ring_read((char *) &hdr, sizeof(hdr));
+ ring_read(payload, hdr.len);
+ /* For sanity's sake, nul-terminate the answer */
+ payload[hdr.len] = '\0';
+
+ } while ( hdr.type == XS_DEBUG );
+
+ if ( reply_type )
+ *reply_type = hdr.type;
/* Handle errors */
if ( hdr.type == XS_ERROR )
@@ -243,7 +255,7 @@ static int xenbus_recv(uint32_t *reply_len, const char **reply_data)
*/
const char *xenstore_read(const char *path, const char *default_resp)
{
- uint32_t len = 0;
+ uint32_t len = 0, type = 0;
const char *answer = NULL;
xenbus_send(XS_READ,
@@ -251,7 +263,7 @@ const char *xenstore_read(const char *path, const char *default_resp)
"", 1, /* nul separator */
NULL, 0);
- if ( xenbus_recv(&len, &answer) )
+ if ( xenbus_recv(&len, &answer, &type) || (type != XS_READ) )
answer = NULL;
if ( (default_resp != NULL) && ((answer == NULL) || (*answer == '\0')) )
@@ -266,13 +278,23 @@ const char *xenstore_read(const char *path, const char *default_resp)
*/
int xenstore_write(const char *path, const char *value)
{
+ uint32_t len = 0, type = 0;
+ const char *answer = NULL;
+ int ret;
+
xenbus_send(XS_WRITE,
path, strlen(path),
"", 1, /* nul separator */
value, strlen(value),
NULL, 0);
- return ( xenbus_recv(NULL, NULL) );
+ ret = xenbus_recv(&len, &answer, &type);
+
+ if ( ret == 0 && ((type != XS_WRITE) || (len != 3) ||
+ !answer || strcmp(answer, "OK")) )
+ ret = EIO;
+
+ return ret;
}
/*
diff --git a/tools/firmware/ovmf-makefile b/tools/firmware/ovmf-makefile
index 1ad041f..2838744 100644
--- a/tools/firmware/ovmf-makefile
+++ b/tools/firmware/ovmf-makefile
@@ -1,6 +1,3 @@
-# OVMF building system is not ready yet to run in parallel.
-# Force it to be serial in order to exploit parallelism for neighbors.
-
XEN_ROOT=$(CURDIR)/../../..
include $(XEN_ROOT)/tools/Rules.mk
@@ -10,6 +7,7 @@ else
TARGET=RELEASE
endif
+# OVMF build system has its own parallel building support.
.NOTPARALLEL:
MAKEFLAGS += -j1
@@ -18,7 +16,7 @@ all: build
.PHONY: build
build:
- OvmfPkg/build.sh -a X64 -b $(TARGET)
+ OvmfPkg/build.sh -a X64 -b $(TARGET) -n 4
cp Build/OvmfX64/$(TARGET)_GCC*/FV/OVMF.fd ovmf.bin
.PHONY: clean
diff --git a/tools/firmware/rombios/32bit/32bitbios.c b/tools/firmware/rombios/32bit/32bitbios.c
index 22f83f4..87acf20 100644
--- a/tools/firmware/rombios/32bit/32bitbios.c
+++ b/tools/firmware/rombios/32bit/32bitbios.c
@@ -12,8 +12,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) IBM Corporation, 2006
*
diff --git a/tools/firmware/rombios/32bit/Makefile b/tools/firmware/rombios/32bit/Makefile
index e5d1cbc..396906c 100644
--- a/tools/firmware/rombios/32bit/Makefile
+++ b/tools/firmware/rombios/32bit/Makefile
@@ -15,6 +15,9 @@ all: subdirs-all
clean: subdirs-clean
rm -rf *.o $(TARGET) $(DEPS)
+.PHONY: distclean
+distclean: subdirs-distclean
+
$(TARGET): 32bitbios_all.o
sh mkhex highbios_array 32bitbios_all.o > $@
diff --git a/tools/firmware/rombios/32bit/mkhex b/tools/firmware/rombios/32bit/mkhex
index 4517e36..7200d00 100644
--- a/tools/firmware/rombios/32bit/mkhex
+++ b/tools/firmware/rombios/32bit/mkhex
@@ -16,8 +16,7 @@
# more details.
#
# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-# Place - Suite 330, Boston, MA 02111-1307 USA.
+# this program; If not, see <http://www.gnu.org/licenses/>.
#
echo "unsigned $1[] = {"
diff --git a/tools/firmware/rombios/32bit/pmm.c b/tools/firmware/rombios/32bit/pmm.c
index 4a279ca..09fec42 100644
--- a/tools/firmware/rombios/32bit/pmm.c
+++ b/tools/firmware/rombios/32bit/pmm.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) 2009 FUJITSU LIMITED
*
diff --git a/tools/firmware/rombios/32bit/tcgbios/Makefile b/tools/firmware/rombios/32bit/tcgbios/Makefile
index ddb0471..f6f2649 100644
--- a/tools/firmware/rombios/32bit/tcgbios/Makefile
+++ b/tools/firmware/rombios/32bit/tcgbios/Makefile
@@ -12,6 +12,9 @@ all: $(TARGET)
clean:
rm -rf *.o $(TARGET) $(DEPS)
+.PHONY: distclean
+distclean: clean
+
$(TARGET): tcgbios.o tpm_drivers.o
$(LD) $(LDFLAGS_DIRECT) -r $^ -o $@
diff --git a/tools/firmware/rombios/32bit/tcgbios/tcgbios.c b/tools/firmware/rombios/32bit/tcgbios/tcgbios.c
index 01d4f2f..beef5a4 100644
--- a/tools/firmware/rombios/32bit/tcgbios/tcgbios.c
+++ b/tools/firmware/rombios/32bit/tcgbios/tcgbios.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) IBM Corporation, 2006
*
diff --git a/tools/firmware/rombios/32bit/tcgbios/tpm_drivers.c b/tools/firmware/rombios/32bit/tcgbios/tpm_drivers.c
index d45f9b0..59d16bb 100644
--- a/tools/firmware/rombios/32bit/tcgbios/tpm_drivers.c
+++ b/tools/firmware/rombios/32bit/tcgbios/tpm_drivers.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) IBM Corporation, 2006
*
diff --git a/tools/firmware/rombios/32bit/util.c b/tools/firmware/rombios/32bit/util.c
index a47bb71..e473e89 100644
--- a/tools/firmware/rombios/32bit/util.c
+++ b/tools/firmware/rombios/32bit/util.c
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdarg.h>
#include <stdint.h>
diff --git a/tools/firmware/rombios/32bitgateway.c b/tools/firmware/rombios/32bitgateway.c
index fc82e07..3d58a6b 100644
--- a/tools/firmware/rombios/32bitgateway.c
+++ b/tools/firmware/rombios/32bitgateway.c
@@ -15,8 +15,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) IBM Corporation, 2006
* Copyright (c) 2008, Citrix Systems, Inc.
diff --git a/tools/firmware/rombios/Makefile b/tools/firmware/rombios/Makefile
index 2f2e409..94e65db 100644
--- a/tools/firmware/rombios/Makefile
+++ b/tools/firmware/rombios/Makefile
@@ -15,6 +15,9 @@ clean: subdirs-clean
rm -f BIOS-bochs-*
rm -f $(DEPS)
+.PHONY: distclean
+distclean: clean
+
BIOS-bochs-latest: rombios.c biossums 32bitgateway.c tcgbios.c
gcc -DBX_SMP_PROCESSORS=1 -E -P $< > _rombios_.c
bcc -o rombios.s -C-c -D__i86__ -0 -S _rombios_.c
diff --git a/tools/firmware/rombios/apmbios.S b/tools/firmware/rombios/apmbios.S
index a010949..814be8f 100644
--- a/tools/firmware/rombios/apmbios.S
+++ b/tools/firmware/rombios/apmbios.S
@@ -15,8 +15,7 @@
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
-// License along with this library; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// License along with this library; If not, see <http://www.gnu.org/licenses/>.
#if defined(APM_REAL)
#define APMSYM(s) apmreal_ ## s
diff --git a/tools/firmware/rombios/rombios.c b/tools/firmware/rombios/rombios.c
index 057aced..58ace9b 100644
--- a/tools/firmware/rombios/rombios.c
+++ b/tools/firmware/rombios/rombios.c
@@ -21,8 +21,7 @@
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
-// License along with this library; if not, write to the Free Software
-// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+// License along with this library; If not, see <http://www.gnu.org/licenses/>.
// ROM BIOS for use with Bochs/Plex86/QEMU emulation environment
diff --git a/tools/firmware/rombios/rombios.h b/tools/firmware/rombios/rombios.h
index 0308a18..8726d37 100644
--- a/tools/firmware/rombios/rombios.h
+++ b/tools/firmware/rombios/rombios.h
@@ -15,8 +15,7 @@
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
-// License along with this library; if not, write to the Free Software
-// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+// License along with this library; If not, see <http://www.gnu.org/licenses/>.
/* define it to include QEMU specific code */
//#define BX_QEMU
diff --git a/tools/firmware/rombios/tcgbios.c b/tools/firmware/rombios/tcgbios.c
index c7ec261..e725ef9 100644
--- a/tools/firmware/rombios/tcgbios.c
+++ b/tools/firmware/rombios/tcgbios.c
@@ -13,8 +13,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) IBM Corporation, 2006
*
diff --git a/tools/firmware/vgabios/COPYING b/tools/firmware/vgabios/COPYING
index 223ede7..0595626 100644
--- a/tools/firmware/vgabios/COPYING
+++ b/tools/firmware/vgabios/COPYING
@@ -484,8 +484,7 @@ convey the exclusion of warranty; and each file should have at least the
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
diff --git a/tools/firmware/vgabios/Makefile b/tools/firmware/vgabios/Makefile
index 26bb871..3284812 100644
--- a/tools/firmware/vgabios/Makefile
+++ b/tools/firmware/vgabios/Makefile
@@ -5,10 +5,10 @@ BCC = bcc
AS86 = as86
RELEASE = `pwd | sed "s-.*/--"`
-RELDATE = `date '+%d %b %Y'`
+VGABIOS_REL_DATE ?= `date '+%d %b %Y'`
RELVERS = `pwd | sed "s-.*/--" | sed "s/vgabios//" | sed "s/-//"`
-VGABIOS_DATE = "-DVGABIOS_DATE=\"$(RELDATE)\""
+VGABIOS_DATE = "-DVGABIOS_DATE=\"$(VGABIOS_REL_DATE)\""
.PHONY: all
all: bios cirrus-bios
@@ -25,6 +25,9 @@ clean:
temp.awk.* vgabios*.orig _vgabios_* _vgabios-debug_* core vgabios*.bin vgabios*.txt $(RELEASE).bin *.bak
rm -f VGABIOS-lgpl-latest*.bin
+.PHONY: distclean
+distclean: clean
+
.PHONY: release
release:
VGABIOS_VERS=\"-DVGABIOS_VERS=\\\"$(RELVERS)\\\"\" make bios cirrus-bios
diff --git a/tools/firmware/vgabios/biossums.c b/tools/firmware/vgabios/biossums.c
index 6288498..85ecc04 100644
--- a/tools/firmware/vgabios/biossums.c
+++ b/tools/firmware/vgabios/biossums.c
@@ -12,8 +12,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdlib.h>
#include <stdio.h>
diff --git a/tools/firmware/vgabios/clext.c b/tools/firmware/vgabios/clext.c
index c501227..d727cd5 100644
--- a/tools/firmware/vgabios/clext.c
+++ b/tools/firmware/vgabios/clext.c
@@ -14,8 +14,7 @@
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
-// License along with this library; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// License along with this library; If not, see <http://www.gnu.org/licenses/>.
//
//#define CIRRUS_VESA3_PMINFO
diff --git a/tools/firmware/vgabios/vbe.c b/tools/firmware/vgabios/vbe.c
index a13e223..c506690 100644
--- a/tools/firmware/vgabios/vbe.c
+++ b/tools/firmware/vgabios/vbe.c
@@ -13,8 +13,7 @@
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
-// License along with this library; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// License along with this library; If not, see <http://www.gnu.org/licenses/>.
//
// ============================================================================================
//
diff --git a/tools/firmware/vgabios/vgabios.c b/tools/firmware/vgabios/vgabios.c
index a9dbe00..1c75b7d 100644
--- a/tools/firmware/vgabios/vgabios.c
+++ b/tools/firmware/vgabios/vgabios.c
@@ -17,8 +17,7 @@
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
-// License along with this library; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// License along with this library; If not, see <http://www.gnu.org/licenses/>.
//
// ============================================================================================
//
diff --git a/tools/flask/Makefile b/tools/flask/Makefile
index bc77a06..a31cc16 100644
--- a/tools/flask/Makefile
+++ b/tools/flask/Makefile
@@ -4,6 +4,6 @@ include $(XEN_ROOT)/tools/Rules.mk
SUBDIRS-y := utils
SUBDIRS-$(FLASK_POLICY) += policy
-.PHONY: all clean install
-all clean install: %: subdirs-%
+.PHONY: all clean install distclean
+all clean install distclean: %: subdirs-%
diff --git a/tools/flask/policy/Makefile b/tools/flask/policy/Makefile
index 5d8cfbf..4be921c 100644
--- a/tools/flask/policy/Makefile
+++ b/tools/flask/policy/Makefile
@@ -20,21 +20,21 @@ MLS_CATS ?= 256
CHECKPOLICY ?= checkpolicy
M4 ?= m4
+# Output security policy version. Leave unset to autodetect.
+OUTPUT_POLICY ?= $(BEST_POLICY_VER)
+
########################################
#
# End of configuration options
#
########################################
-# Policy version
-# By default, checkpolicy creates the highest version policy it supports. Force
-# the use of version 24 which is the highest that Xen supports, and the first to
-# include the Xen policy type (needed for static device policy).
-OUTPUT_POLICY = 24
-
POLICY_FILENAME = xenpolicy-$(shell $(MAKE) -C $(XEN_ROOT)/xen xenversion --no-print-directory)
POLICY_LOADPATH = /boot
+# List of policy versions supported by the hypervisor
+POLICY_VER_LIST_HV = 24 30
+
# policy source layout
POLDIR := policy
MODDIR := $(POLDIR)/modules
@@ -56,6 +56,7 @@ MLSSUPPORT := $(POLDIR)/mls
USERS := $(POLDIR)/users
CONSTRAINTS := $(POLDIR)/constraints
ISID_DEFS := $(POLDIR)/initial_sids
+DEV_OCONS := $(POLDIR)/device_contexts
# config file paths
GLOBALTUN := $(POLDIR)/global_tunables
@@ -63,6 +64,14 @@ MOD_CONF := $(POLDIR)/modules.conf
# checkpolicy can use the #line directives provided by -s for error reporting:
M4PARAM := -D self_contained_policy -s
+
+# The output of checkpolicy -V is "30 (compatibility range 30-15)", and the
+# first word of the output is the maximum policy version supported.
+CHECKPOLICY_VER_MAX := $(firstword $(shell $(CHECKPOLICY) -V))
+
+# Find the highest version supported by both the hypervisor and checkpolicy
+BEST_POLICY_VER := $(shell best=24; for ver in $(POLICY_VER_LIST_HV); do if test $$ver -le $(CHECKPOLICY_VER_MAX); then best=$$ver; fi; done; echo $$best)
+
CHECKPOLICY_PARAM := -t Xen -c $(OUTPUT_POLICY)
# enable MLS if requested.
@@ -86,7 +95,7 @@ DETECTED_MODS := $(sort $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.te)))
MODENABLED := on
# extract settings from modules.conf
-ENABLED_MODS := $(foreach mod,$(shell awk '/^[[:blank:]]*[[:alpha:]]/{ if ($$3 == "$(MODENABLED)") print $$1 }' $(MOD_CONF) 2> /dev/null),$(subst ./,,$(shell find -iname $(mod).te)))
+ENABLED_MODS := $(foreach mod,$(shell awk '/^[ \t]*[a-z]/{ if ($$3 == "$(MODENABLED)") print $$1 }' $(MOD_CONF) 2> /dev/null),$(subst ./,,$(shell find -iname $(mod).te)))
ALL_MODULES := $(filter $(ENABLED_MODS),$(DETECTED_MODS))
@@ -98,7 +107,7 @@ POLICY_SECTIONS += $(M4SUPPORT) $(MLSSUPPORT)
POLICY_SECTIONS += $(ALL_INTERFACES)
POLICY_SECTIONS += $(GLOBALTUN)
POLICY_SECTIONS += $(ALL_MODULES)
-POLICY_SECTIONS += $(USERS) $(CONSTRAINTS) $(ISID_DEFS)
+POLICY_SECTIONS += $(USERS) $(CONSTRAINTS) $(ISID_DEFS) $(DEV_OCONS)
all: $(POLICY_FILENAME)
@@ -115,4 +124,6 @@ policy.conf: $(POLICY_SECTIONS)
clean:
$(RM) tmp policy.conf $(POLICY_FILENAME)
-.PHONY: all install clean
+distclean: clean
+
+.PHONY: all install clean distclean
diff --git a/tools/flask/policy/policy/device_contexts b/tools/flask/policy/policy/device_contexts
new file mode 100644
index 0000000..c2de7e7
--- /dev/null
+++ b/tools/flask/policy/policy/device_contexts
@@ -0,0 +1,32 @@
+###############################################################################
+#
+# Label devices for delegation
+#
+# The PCI, IRQ, memory, and I/O port ranges are hardware-specific.
+#
+###############################################################################
+
+# label e1000e nic
+#pirqcon 33 system_u:object_r:nic_dev_t
+#pirqcon 55 system_u:object_r:nic_dev_t
+#iomemcon 0xfebe0-0xfebff system_u:object_r:nic_dev_t
+#iomemcon 0xfebd9 system_u:object_r:nic_dev_t
+#ioportcon 0xecc0-0xecdf system_u:object_r:nic_dev_t
+#pcidevicecon 0xc800 system_u:object_r:nic_dev_t
+
+# label e100 nic
+#pirqcon 16 system_u:object_r:nic_dev_t
+#iomemcon 0xfe5df system_u:object_r:nic_dev_t
+#iomemcon 0xfe5e0-0xfe5ff system_u:object_r:nic_dev_t
+#iomemcon 0xc2000-0xc200f system_u:object_r:nic_dev_t
+#ioportcon 0xccc0-0xcd00 system_u:object_r:nic_dev_t
+
+# label usb 1d.0-2 1d.7
+#pirqcon 23 system_u:object_r:nic_dev_t
+#pirqcon 17 system_u:object_r:nic_dev_t
+#pirqcon 18 system_u:object_r:nic_dev_t
+#ioportcon 0xff80-0xFF9F system_u:object_r:nic_dev_t
+#ioportcon 0xff60-0xff7f system_u:object_r:nic_dev_t
+#ioportcon 0xff40-0xff5f system_u:object_r:nic_dev_t
+#iomemcon 0xff980 system_u:object_r:nic_dev_t
+#ioportcon 0xff00-0xff1f system_u:object_r:nic_dev_t
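The commented entries above are templates: the IRQ, I/O port, and memory
ranges must be read from the target machine before uncommenting them. A
sketch of gathering the values for a PCI NIC, assuming a Linux dom0 with
lspci available (the device address 00:19.0 is hypothetical):

    $ lspci -v -s 00:19.0
    ...
        IRQ 33
        Memory at febe0000 (32-bit, non-prefetchable) [size=128K]
        I/O ports at ecc0 [size=32]

The iomemcon ranges are page frame numbers, so a 128K MMIO region at
physical address 0xfebe0000 becomes the range 0xfebe0-0xfebff seen above.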
diff --git a/tools/flask/policy/policy/initial_sids b/tools/flask/policy/policy/initial_sids
index 5de0bbf..6b7b7ef 100644
--- a/tools/flask/policy/policy/initial_sids
+++ b/tools/flask/policy/policy/initial_sids
@@ -12,3 +12,7 @@ sid irq gen_context(system_u:object_r:irq_t,s0)
sid iomem gen_context(system_u:object_r:iomem_t,s0)
sid ioport gen_context(system_u:object_r:ioport_t,s0)
sid device gen_context(system_u:object_r:device_t,s0)
+
+# Initial SIDs used by the toolstack for domains without defined labels
+sid domU gen_context(system_u:system_r:domU_t,s0)
+sid domDM gen_context(system_u:system_r:dm_dom_t,s0)
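These initial SIDs give the toolstack a fallback label for guests whose
configuration does not set one explicitly. For comparison, a hypothetical
xl.cfg fragment pinning the label via the seclabel option documented in
xl.cfg(5):

    seclabel='system_u:system_r:domU_t'

When the option is absent, the domain is created with the domU initial SID
above; device-model stub domains without a label fall back to domDM.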
diff --git a/tools/flask/policy/policy/modules/xen/xen.if b/tools/flask/policy/policy/modules/xen/xen.if
index 2d32e1c..a2f25e1 100644
--- a/tools/flask/policy/policy/modules/xen/xen.if
+++ b/tools/flask/policy/policy/modules/xen/xen.if
@@ -8,7 +8,8 @@
define(`declare_domain_common', `
allow $1 $2:grant { query setup };
allow $1 $2:mmu { adjust physmap map_read map_write stat pinpage updatemp mmuext_op };
- allow $1 $2:hvm { getparam setparam };
+ allow $1 $2:hvm { getparam setparam altp2mhvm_op };
+ allow $1 $2:domain2 get_vnumainfo;
')
# declare_domain(type, attrs...)
@@ -51,13 +52,13 @@ define(`create_domain_common', `
getaffinity setaffinity setvcpuextstate };
allow $1 $2:domain2 { set_cpuid settsc setscheduler setclaim
set_max_evtchn set_vnumainfo get_vnumainfo cacheflush
- psr_cmt_op configure_domain };
+ psr_cmt_op psr_cat_op };
allow $1 $2:security check_context;
allow $1 $2:shadow enable;
allow $1 $2:mmu { map_read map_write adjust memorymap physmap pinpage mmuext_op updatemp };
allow $1 $2:grant setup;
allow $1 $2:hvm { cacheattr getparam hvmctl irqlevel pciroute sethvmc
- setparam pcilevel trackdirtyvram nested };
+ setparam pcilevel trackdirtyvram nested altp2mhvm altp2mhvm_op };
')
# create_domain(priv, target)
@@ -95,6 +96,7 @@ define(`migrate_domain_out', `
allow $1 $2:mmu { stat pageinfo map_read };
allow $1 $2:domain { getaddrsize getvcpucontext getextvcpucontext getvcpuextstate pause destroy };
allow $1 $2:domain2 gettsc;
+ allow $1 $2:shadow { enable disable logdirty };
')
################################################################################
diff --git a/tools/flask/policy/policy/modules/xen/xen.te b/tools/flask/policy/policy/modules/xen/xen.te
index c0128aa..5e94ee3 100644
--- a/tools/flask/policy/policy/modules/xen/xen.te
+++ b/tools/flask/policy/policy/modules/xen/xen.te
@@ -67,6 +67,10 @@ allow dom0_t xen_t:xen {
allow dom0_t xen_t:xen2 {
resource_op
psr_cmt_op
+ psr_cat_op
+};
+allow dom0_t xen_t:xen2 {
+ pmu_ctrl
};
allow dom0_t xen_t:mmu memorymap;
@@ -80,7 +84,8 @@ allow dom0_t dom0_t:domain {
getpodtarget setpodtarget set_misc_info set_virq_handler
};
allow dom0_t dom0_t:domain2 {
- set_cpuid gettsc settsc setscheduler set_max_evtchn set_vnumainfo get_vnumainfo psr_cmt_op
+ set_cpuid gettsc settsc setscheduler set_max_evtchn set_vnumainfo
+ get_vnumainfo psr_cmt_op psr_cat_op
};
allow dom0_t dom0_t:resource { add remove };
@@ -117,6 +122,20 @@ domain_comms(dom0_t, dom0_t)
# Allow all domains to use (unprivileged parts of) the tmem hypercall
allow domain_type xen_t:xen tmem_op;
+# Allow guest console output to the serial console. This is used by PV Linux
+# and stub domains for early boot output, so don't audit even when we deny it.
+# Without XSM, this is enabled only if Xen was compiled in debug mode.
+gen_bool(guest_writeconsole, true)
+if (guest_writeconsole) {
+ allow domain_type xen_t : xen writeconsole;
+} else {
+ dontaudit domain_type xen_t : xen writeconsole;
+}
+
+# Allow all domains to use PMU (but not to change its settings --- that's what
+# pmu_ctrl is for)
+allow domain_type xen_t:xen2 pmu_use;
+
###############################################################################
#
# Domain creation
@@ -151,18 +170,13 @@ domain_comms(domU_t, prot_domU_t)
domain_comms(prot_domU_t, prot_domU_t)
domain_self_comms(prot_domU_t)
-# domHVM_t is meant to be paired with a qemu-dm stub domain of type dm_dom_t
-declare_domain(domHVM_t)
-create_domain(dom0_t, domHVM_t)
-manage_domain(dom0_t, domHVM_t)
-domain_comms(dom0_t, domHVM_t)
-domain_self_comms(domHVM_t)
-
+# Device model for domU_t. You can define distinct types for device models for
+# domains of other types, or add more make_device_model lines for this type.
declare_domain(dm_dom_t)
create_domain(dom0_t, dm_dom_t)
manage_domain(dom0_t, dm_dom_t)
domain_comms(dom0_t, dm_dom_t)
-make_device_model(dom0_t, dm_dom_t, domHVM_t)
+make_device_model(dom0_t, dm_dom_t, domU_t)
# nomigrate_t must be built via the nomigrate_t_building label; once built,
# dom0 cannot read its memory.
@@ -177,6 +191,10 @@ domain_self_comms(nomigrate_t)
#
# Device delegation
#
+# This requires that the device be labeled with a type defined here. You can
+# use flask-label-pci to dynamically label devices on each boot or define the
+# labels statically in tools/flask/policy/policy/device_contexts
+#
###############################################################################
type nic_dev_t, resource_type;
@@ -186,40 +204,6 @@ use_device(domU_t, nic_dev_t)
delegate_devices(dom0_t, domU_t)
-###############################################################################
-#
-# Label devices for delegation
-#
-# The PCI, IRQ, memory, and I/O port ranges are hardware-specific.
-# You may also use flask-label-pci to dynamically label devices on each boot.
-#
-###############################################################################
-
-# label e1000e nic
-#pirqcon 33 system_u:object_r:nic_dev_t
-#pirqcon 55 system_u:object_r:nic_dev_t
-#iomemcon 0xfebe0-0xfebff system_u:object_r:nic_dev_t
-#iomemcon 0xfebd9 system_u:object_r:nic_dev_t
-#ioportcon 0xecc0-0xecdf system_u:object_r:nic_dev_t
-#pcidevicecon 0xc800 system_u:object_r:nic_dev_t
-
-# label e100 nic
-#pirqcon 16 system_u:object_r:nic_dev_t
-#iomemcon 0xfe5df system_u:object_r:nic_dev_t
-#iomemcon 0xfe5e0-0xfe5ff system_u:object_r:nic_dev_t
-#iomemcon 0xc2000-0xc200f system_u:object_r:nic_dev_t
-#ioportcon 0xccc0-0xcd00 system_u:object_r:nic_dev_t
-
-# label usb 1d.0-2 1d.7
-#pirqcon 23 system_u:object_r:nic_dev_t
-#pirqcon 17 system_u:object_r:nic_dev_t
-#pirqcon 18 system_u:object_r:nic_dev_t
-#ioportcon 0xff80-0xFF9F system_u:object_r:nic_dev_t
-#ioportcon 0xff60-0xff7f system_u:object_r:nic_dev_t
-#ioportcon 0xff40-0xff5f system_u:object_r:nic_dev_t
-#iomemcon 0xff980 system_u:object_r:nic_dev_t
-#ioportcon 0xff00-0xff1f system_u:object_r:nic_dev_t
-
################################################################################
#
# Policy constraints
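The guest_writeconsole boolean above can be flipped at runtime without
rebuilding the policy. A hypothetical invocation, assuming the boolean
utilities shipped in tools/flask/utils (utility name and argument order
are illustrative, not confirmed):

    $ flask-set-bool guest_writeconsole 0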
diff --git a/tools/flask/utils/Makefile b/tools/flask/utils/Makefile
index c87f15c..91a53b4 100644
--- a/tools/flask/utils/Makefile
+++ b/tools/flask/utils/Makefile
@@ -39,6 +39,9 @@ clean:
rm -f $(CLIENTS)
$(RM) $(DEPS)
+.PHONY: distclean
+distclean: clean
+
.PHONY: print-dir
print-dir:
@echo -n tools/flask/utils:
@@ -49,7 +52,7 @@ print-end:
.PHONY: install
install: all
- $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
- $(INSTALL_PROG) $(CLIENTS) $(DESTDIR)$(SBINDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(sbindir)
+ $(INSTALL_PROG) $(CLIENTS) $(DESTDIR)$(sbindir)
-include $(DEPS)
diff --git a/tools/hotplug/FreeBSD/Makefile b/tools/hotplug/FreeBSD/Makefile
index 6aba64a..10fce4f 100644
--- a/tools/hotplug/FreeBSD/Makefile
+++ b/tools/hotplug/FreeBSD/Makefile
@@ -6,7 +6,7 @@ XEN_SCRIPTS = vif-bridge
XEN_SCRIPT_DATA =
-XEN_RCD_PROG = rc.d/xencommons
+XEN_RCD_PROG = rc.d/xencommons rc.d/xendriverdomain
.PHONY: all
all:
@@ -40,3 +40,6 @@ install-rcd:
.PHONY: clean
clean:
+
+.PHONY: distclean
+distclean: clean
diff --git a/tools/hotplug/FreeBSD/rc.d/xencommons.in b/tools/hotplug/FreeBSD/rc.d/xencommons.in
index 900990b..d453c6b 100644
--- a/tools/hotplug/FreeBSD/rc.d/xencommons.in
+++ b/tools/hotplug/FreeBSD/rc.d/xencommons.in
@@ -7,7 +7,7 @@
. @XEN_SCRIPT_DIR@/hotplugpath.sh
-LD_LIBRARY_PATH="${LIBDIR}"
+LD_LIBRARY_PATH="${libdir}"
export LD_LIBRARY_PATH
name="xencommons"
@@ -33,7 +33,7 @@ xen_startcmd()
local time=0
local timeout=30
- xenstored_pid=$(check_pidfile ${XENSTORED_PIDFILE} ${SBINDIR}/xenstored)
+ xenstored_pid=$(check_pidfile ${XENSTORED_PIDFILE} ${sbindir}/xenstored)
if test -z "$xenstored_pid"; then
printf "Cleaning xenstore database.\n"
if [ -z "${XENSTORED_ROOTDIR}" ]; then
@@ -45,8 +45,8 @@ xen_startcmd()
if [ -n "${XENSTORED_TRACE}" ]; then
XENSTORED_ARGS="${XENSTORED_ARGS} -T /var/log/xen/xenstored-trace.log"
fi
- ${SBINDIR}/xenstored ${XENSTORED_ARGS}
- while [ $time -lt $timeout ] && ! `${BINDIR}/xenstore-read -s / >/dev/null 2>&1` ; do
+ ${sbindir}/xenstored ${XENSTORED_ARGS}
+ while [ $time -lt $timeout ] && ! `${bindir}/xenstore-read -s / >/dev/null 2>&1` ; do
printf "."
time=$(($time+1))
sleep 1
@@ -60,7 +60,7 @@ xen_startcmd()
XENCONSOLED_ARGS="${XENCONSOLED_ARGS} --log=${XENCONSOLED_TRACE}"
fi
- ${SBINDIR}/xenconsoled ${XENCONSOLED_ARGS}
+ ${sbindir}/xenconsoled ${XENCONSOLED_ARGS}
printf "\n"
@@ -74,7 +74,7 @@ xen_stop()
printf "Stopping xencommons.\n"
printf "WARNING: Not stopping xenstored, as it cannot be restarted.\n"
- rc_pid=$(check_pidfile ${XENCONSOLED_PIDFILE} ${SBINDIR}/xenconsoled)
+ rc_pid=$(check_pidfile ${XENCONSOLED_PIDFILE} ${sbindir}/xenconsoled)
pids="$pids $rc_pid"
kill -${sig_stop:-TERM} $pids
@@ -83,12 +83,12 @@ xen_stop()
xen_status()
{
- xenstored_pid=$(check_pidfile ${XENSTORED_PIDFILE} ${SBINDIR}/xenstored)
+ xenstored_pid=$(check_pidfile ${XENSTORED_PIDFILE} ${sbindir}/xenstored)
if test -n ${xenstored_pid}; then
pids="$pids $xenstored_pid"
fi
- xenconsoled_pid=$(check_pidfile ${XENCONSOLED_PIDFILE} ${SBINDIR}/xenconsoled)
+ xenconsoled_pid=$(check_pidfile ${XENCONSOLED_PIDFILE} ${sbindir}/xenconsoled)
if test -n ${xenconsoled_pid}; then
pids="$pids $xenconsoled_pid"
fi
diff --git a/tools/hotplug/FreeBSD/rc.d/xendriverdomain.in b/tools/hotplug/FreeBSD/rc.d/xendriverdomain.in
new file mode 100644
index 0000000..4063c06
--- /dev/null
+++ b/tools/hotplug/FreeBSD/rc.d/xendriverdomain.in
@@ -0,0 +1,48 @@
+#!/bin/sh
+#
+# PROVIDE: xendriverdomain
+# REQUIRE: DAEMON
+#
+# Should be run in a driver domain, but not in domain 0.
+
+. /etc/rc.subr
+
+. @XEN_SCRIPT_DIR@/hotplugpath.sh
+
+LD_LIBRARY_PATH="${libdir}"
+export LD_LIBRARY_PATH
+
+name="xendriverdomain"
+start_precmd="xendriverdomain_precmd"
+start_cmd="xendriverdomain_startcmd"
+stop_cmd="xendriverdomain_stop"
+extra_commands=""
+
+XLDEVD_PIDFILE="/var/run/xldevd.pid"
+
+xendriverdomain_precmd()
+{
+ :
+}
+
+xendriverdomain_startcmd()
+{
+ printf "Starting xenservices: xl devd."
+
+ ${sbindir}/xl devd --pidfile=$XLDEVD_PIDFILE ${XLDEVD_ARGS}
+
+ printf "\n"
+}
+
+xendriverdomain_stop()
+{
+ printf "Stopping xl devd.\n"
+
+ rc_pid=$(check_pidfile ${XLDEVD_PIDFILE} ${sbindir}/xl)
+
+ kill -${sig_stop:-TERM} $rc_pid
+ wait_for_pids $rc_pid
+}
+
+load_rc_config $name
+run_rc_command "$1"
diff --git a/tools/hotplug/FreeBSD/vif-bridge b/tools/hotplug/FreeBSD/vif-bridge
index d350df4..428c653 100644
--- a/tools/hotplug/FreeBSD/vif-bridge
+++ b/tools/hotplug/FreeBSD/vif-bridge
@@ -13,7 +13,7 @@
DIR=$(dirname "$0")
. "${DIR}/hotplugpath.sh"
-PATH=${BINDIR}:${SBINDIR}:${LIBEXEC_BIN}:/bin:/usr/bin:/sbin:/usr/sbin
+PATH=${bindir}:${sbindir}:${LIBEXEC_BIN}:/bin:/usr/bin:/sbin:/usr/sbin
export PATH
path=$1
@@ -22,6 +22,8 @@ action=$2
case $action in
add)
bridge=$(xenstore-read "$path/bridge")
+ mtu=$(ifconfig $bridge | sed -n 's/.*mtu \([0-9]*\)$/\1/p')
+ ifconfig $iface_dev mtu $mtu
ifconfig $bridge addm $iface_dev
ifconfig $iface_dev up
exit 0
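The MTU lines added above copy the bridge's MTU onto the vif before adding
it as a member, since FreeBSD's if_bridge(4) requires all members to share
the bridge's MTU. The sed expression isolates the trailing mtu field of
ifconfig's first output line, e.g.:

    $ ifconfig bridge0 | head -1
    bridge0: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 1500
    $ ifconfig bridge0 | sed -n 's/.*mtu \([0-9]*\)$/\1/p'
    1500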
diff --git a/tools/hotplug/Linux/Makefile b/tools/hotplug/Linux/Makefile
index 1706c05..6e10118 100644
--- a/tools/hotplug/Linux/Makefile
+++ b/tools/hotplug/Linux/Makefile
@@ -9,6 +9,8 @@ XENDOMAINS_SYSCONFIG = init.d/sysconfig.xendomains
XENCOMMONS_INITD = init.d/xencommons
XENCOMMONS_SYSCONFIG = init.d/sysconfig.xencommons
+XENDRIVERDOMAIN_INITD = init.d/xendriverdomain
+
# Xen script dir and scripts to go there.
XEN_SCRIPTS = vif-bridge
XEN_SCRIPTS += vif-route
@@ -19,11 +21,11 @@ XEN_SCRIPTS += vif-setup
XEN_SCRIPTS-$(CONFIG_REMUS_NETBUF) += remus-netbuf-setup
XEN_SCRIPTS += block
XEN_SCRIPTS += block-enbd block-nbd
-XEN_SCRIPTS-$(CONFIG_BLKTAP1) += blktap
XEN_SCRIPTS += xen-hotplug-cleanup
XEN_SCRIPTS += external-device-migrate
XEN_SCRIPTS += vscsi
XEN_SCRIPTS += block-iscsi
+XEN_SCRIPTS += block-tap
XEN_SCRIPTS += block-drbd-probe
XEN_SCRIPTS += $(XEN_SCRIPTS-y)
@@ -33,9 +35,6 @@ XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
XEN_SCRIPT_DATA += block-common.sh
-UDEV_RULES_DIR = $(CONFIG_DIR)/udev
-UDEV_RULES = xen-backend.rules $(UDEV_RULES-y)
-
.PHONY: all
all: subdirs-all
@@ -43,7 +42,7 @@ all: subdirs-all
build:
.PHONY: install
-install: install-initd install-scripts install-udev subdirs-install
+install: install-initd install-scripts subdirs-install
# See docs/misc/distro_mapping.txt for INITD_DIR location
.PHONY: install-initd
@@ -56,6 +55,7 @@ install-initd:
$(INSTALL_DATA) $(XENDOMAINS_SYSCONFIG) $(DESTDIR)$(SYSCONFIG_DIR)/xendomains
$(INSTALL_PROG) $(XENCOMMONS_INITD) $(DESTDIR)$(INITD_DIR)
$(INSTALL_DATA) $(XENCOMMONS_SYSCONFIG) $(DESTDIR)$(SYSCONFIG_DIR)/xencommons
+ $(INSTALL_PROG) $(XENDRIVERDOMAIN_INITD) $(DESTDIR)$(INITD_DIR)
$(INSTALL_PROG) init.d/xen-watchdog $(DESTDIR)$(INITD_DIR)
.PHONY: install-scripts
@@ -71,14 +71,8 @@ install-scripts:
$(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
done
-.PHONY: install-udev
-install-udev:
- [ -d $(DESTDIR)$(UDEV_RULES_DIR) ] || \
- $(INSTALL_DIR) $(DESTDIR)$(UDEV_RULES_DIR)/rules.d
- set -e; for i in $(UDEV_RULES); \
- do \
- $(INSTALL_DATA) $$i $(DESTDIR)$(UDEV_RULES_DIR)/rules.d; \
- done
-
.PHONY: clean
clean: subdirs-clean
+
+.PHONY: distclean
+distclean: clean
diff --git a/tools/hotplug/Linux/blktap b/tools/hotplug/Linux/blktap
deleted file mode 100644
index cd30a38..0000000
--- a/tools/hotplug/Linux/blktap
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2005, XenSource Ltd.
-
-dir=$(dirname "$0")
-. "$dir/xen-hotplug-common.sh"
-. "$dir/block-common.sh"
-
-findCommand "$@"
-
-##
-# check_blktap_sharing file mode
-#
-# Perform the sharing check for the given blktap and mode.
-#
-check_blktap_sharing()
-{
- local file="$1"
- local mode="$2"
-
- local base_path="$XENBUS_BASE_PATH/$XENBUS_TYPE"
- for dom in $(xenstore-list "$base_path")
- do
- for dev in $(xenstore-list "$base_path/$dom")
- do
- params=$(xenstore_read_default "$base_path/$dom/$dev/params" "" | cut -d: -f2)
- if [ "$file" = "$params" ]
- then
-
- if [ "$mode" = 'w' ]
- then
- if ! same_vm "$dom"
- then
- echo 'guest'
- return
- fi
- else
- local m=$(xenstore_read_default "$base_path/$dom/$dev/mode" "")
- m=$(canonicalise_mode "$m")
-
- if [ "$m" = 'w' ]
- then
- if ! same_vm "$dom"
- then
- echo 'guest'
- return
- fi
- fi
- fi
- fi
- done
- done
-
- echo 'ok'
-}
-
-
-t=$(xenstore_read_default "$XENBUS_PATH/type" 'MISSING')
-if [ -n "$t" ]
-then
- p=$(xenstore_read "$XENBUS_PATH/params")
- p=${p#tapdisk:}
- # if we have a ':', chew from head including :
- if echo $p | grep -q \:
- then
- p=${p#*:}
- fi
-fi
-# some versions of readlink cannot be passed a regular file
-if [ -L "$p" ]; then
- file=$(readlink -f "$p") || fatal "$p link does not exist."
-else
- file="$p"
-fi
-
-if [ "$command" = 'add' ]
-then
- [ -e "$file" ] || { fatal $file does not exist; }
-
- FRONTEND_ID=$(xenstore_read "$XENBUS_PATH/frontend-id")
- FRONTEND_UUID=$(xenstore_read "/local/domain/$FRONTEND_ID/vm")
- mode=$(xenstore_read "$XENBUS_PATH/mode")
- mode=$(canonicalise_mode "$mode")
-
- if [ "$mode" != '!' ]
- then
- result=$(check_blktap_sharing "$file" "$mode")
- [ "$result" = 'ok' ] || ebusy "$file already in use by other domain"
- fi
-
- success
-fi
-
-exit 0
diff --git a/tools/hotplug/Linux/block b/tools/hotplug/Linux/block
index da26e22..8d2ee9d 100644
--- a/tools/hotplug/Linux/block
+++ b/tools/hotplug/Linux/block
@@ -206,6 +206,13 @@ and so cannot be mounted ${m2}${when}."
t=$(xenstore_read_default "$XENBUS_PATH/type" 'MISSING')
+p=$(xenstore_read "$XENBUS_PATH/params")
+mode=$(xenstore_read "$XENBUS_PATH/mode")
+if [ -b "$p" ]; then
+ truetype="phy"
+elif [ -f "$p" ]; then
+ truetype="file"
+fi
case "$command" in
add)
@@ -217,16 +224,11 @@ case "$command" in
exit 0
fi
- if [ -n "$t" ]
- then
- p=$(xenstore_read "$XENBUS_PATH/params")
- mode=$(xenstore_read "$XENBUS_PATH/mode")
- fi
FRONTEND_ID=$(xenstore_read "$XENBUS_PATH/frontend-id")
FRONTEND_UUID=$(xenstore_read_default \
"/local/domain/$FRONTEND_ID/vm" 'unknown')
- case $t in
+ case $truetype in
phy)
dev=$(expand_dev $p)
@@ -319,7 +321,7 @@ mount it read-write in a guest domain."
;;
remove)
- case $t in
+ case $truetype in
phy)
exit 0
;;
diff --git a/tools/hotplug/Linux/block-common.sh b/tools/hotplug/Linux/block-common.sh
index cc374ef..ee95009 100644
--- a/tools/hotplug/Linux/block-common.sh
+++ b/tools/hotplug/Linux/block-common.sh
@@ -11,8 +11,7 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#
diff --git a/tools/hotplug/Linux/block-drbd-probe b/tools/hotplug/Linux/block-drbd-probe
index 247a9d0..635d9f9 100755
--- a/tools/hotplug/Linux/block-drbd-probe
+++ b/tools/hotplug/Linux/block-drbd-probe
@@ -12,8 +12,7 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#
# Usage:
# block-drbd-probe devicename
diff --git a/tools/hotplug/Linux/block-tap b/tools/hotplug/Linux/block-tap
new file mode 100755
index 0000000..8924792
--- /dev/null
+++ b/tools/hotplug/Linux/block-tap
@@ -0,0 +1,123 @@
+#!/bin/bash -e
+#
+# tapdisk Xen block device hotplug script
+#
+# Author George Dunlap <george.dunlap at eu.citrix.com>
+#
+# Based on block-iscsi by Roger Pau Monné <roger.pau at citrix.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; version 2.1 only, with the special
+# exception on linking described in file LICENSE.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# Usage:
+#
+# Target should be specified using the following syntax:
+#
+# script=block-tap,vdev=xvda,target=<type>:<file>
+#
+# Type is either "aio" (for raw files), or "vhd"
+
+dir=$(dirname "$0")
+. "$dir/block-common.sh"
+
+remove_label()
+{
+ echo $1 | sed "s/^\("$2"\)//"
+}
+
+check_tools()
+{
+ if ! command -v tap-ctl > /dev/null 2>&1; then
+ fatal "Unable to find tap-ctl tool"
+ fi
+ modprobe blktap
+ if ! tap-ctl check >& /dev/null ; then
+ fatal "Blocktap kernel module not available"
+ fi
+}
+
+# Sets the following global variables based on the params field passed in as
+# a parameter: type file
+parse_target()
+{
+ params=($(echo "$1" | tr ":" "\n"))
+
+ type=${params[0]}
+ file=${params[1]}
+ if [ -z "$type" ] || [ -z "$file" ]; then
+ fatal "Cannot parse required parameters"
+ fi
+}
+
+# Sets $pid and $minor to point to the device associated with the target
+find_device()
+{
+ local info
+ local param
+
+ if [ -z "$type" ] || [ -z "$file" ]; then
+ fatal "required parameters not set"
+ fi
+
+ info=$(tap-ctl list -t $type -f $file)
+
+ for param in $(echo "$info" | tr "," "\n")
+ do
+ case $param in
+ pid=*)
+ pid=$(remove_label $param "pid=")
+ ;;
+ minor=*)
+ minor=$(remove_label $param "minor=")
+ ;;
+ esac
+ done
+
+ if [ -z "$pid" ] || [ -z "$minor" ]; then
+ fatal "cannot find required parameters"
+ fi
+}
+
+# Attaches the device and writes xenstore backend entries to connect
+# the device
+add()
+{
+ dev=$(tap-ctl create -a $target)
+ write_dev $dev
+}
+
+# Disconnects the device
+remove()
+{
+ find_device
+ do_or_die tap-ctl destroy -p ${pid} -m ${minor} > /dev/null
+}
+
+command=$1
+target=$(xenstore-read $XENBUS_PATH/params || true)
+if [ -z "$target" ]; then
+ fatal "No information about the target"
+fi
+
+parse_target "$target"
+
+check_tools || exit 1
+
+case $command in
+add)
+ add
+ ;;
+remove)
+ remove
+ ;;
+*)
+ exit 1
+ ;;
+esac
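Per the header comment, block-tap is selected through the disk target
syntax rather than a dedicated disk type. A hypothetical xl.cfg disk line
using it (the image path is illustrative):

    disk = [ 'vdev=xvda,script=block-tap,target=aio:/srv/images/guest.raw' ]

xl stores the target string in the backend's params node, which
parse_target() then splits on the colon into the tapdisk type and the
file path.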
diff --git a/tools/hotplug/Linux/external-device-migrate b/tools/hotplug/Linux/external-device-migrate
index a411348..f5942a6 100644
--- a/tools/hotplug/Linux/external-device-migrate
+++ b/tools/hotplug/Linux/external-device-migrate
@@ -12,8 +12,7 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#
set -x
diff --git a/tools/hotplug/Linux/init.d/sysconfig.xencommons.in b/tools/hotplug/Linux/init.d/sysconfig.xencommons.in
index c12fc8a..acc7309 100644
--- a/tools/hotplug/Linux/init.d/sysconfig.xencommons.in
+++ b/tools/hotplug/Linux/init.d/sysconfig.xencommons.in
@@ -21,6 +21,14 @@
#XENSTORED=@XENSTORED@
## Type: string
+## Default: ""
+#
+# Additional commandline arguments to start xenstored,
+# like "--trace-file /var/log/xen/xenstored-trace.log"
+# See "@sbindir@/xenstored --help" for possible options.
+XENSTORED_ARGS=
+
+## Type: string
## Default: Not defined, tracing off
#
# Log xenstored messages
@@ -39,4 +47,4 @@
#XENBACKENDD_DEBUG=[yes|on|1]
# qemu path
-#QEMU_XEN=@LIBEXEC_BIN@/qemu-system-i386
+#QEMU_XEN=@qemu_xen_path@
diff --git a/tools/hotplug/Linux/init.d/xen-watchdog.in b/tools/hotplug/Linux/init.d/xen-watchdog.in
index 32c7268..c05f1f6 100644
--- a/tools/hotplug/Linux/init.d/xen-watchdog.in
+++ b/tools/hotplug/Linux/init.d/xen-watchdog.in
@@ -19,7 +19,7 @@
. @XEN_SCRIPT_DIR@/hotplugpath.sh
-DAEMON=${SBINDIR}/xenwatchdogd
+DAEMON=${sbindir}/xenwatchdogd
base=$(basename $DAEMON)
# Source function library.
diff --git a/tools/hotplug/Linux/init.d/xencommons.in b/tools/hotplug/Linux/init.d/xencommons.in
index a1095c2..21e9133 100644
--- a/tools/hotplug/Linux/init.d/xencommons.in
+++ b/tools/hotplug/Linux/init.d/xencommons.in
@@ -62,7 +62,7 @@ do_start () {
mkdir -p ${XEN_RUN_DIR}
mkdir -p ${XEN_LOCK_DIR}
- if ! `${BINDIR}/xenstore-read -s / >/dev/null 2>&1`
+ if ! `${bindir}/xenstore-read -s / >/dev/null 2>&1`
then
test -z "$XENSTORED_ROOTDIR" && XENSTORED_ROOTDIR="@XEN_LIB_STORED@"
rm -f "$XENSTORED_ROOTDIR"/tdb* &>/dev/null
@@ -77,7 +77,7 @@ do_start () {
fi
# Wait for xenstored to actually come up, timing out after 30 seconds
- while [ $time -lt $timeout ] && ! `${BINDIR}/xenstore-read -s / >/dev/null 2>&1` ; do
+ while [ $time -lt $timeout ] && ! `${bindir}/xenstore-read -s / >/dev/null 2>&1` ; do
echo -n .
time=$(($time+1))
sleep 1
@@ -96,9 +96,9 @@ do_start () {
echo Starting xenconsoled...
test -z "$XENCONSOLED_TRACE" || XENCONSOLED_ARGS=" --log=$XENCONSOLED_TRACE"
- ${SBINDIR}/xenconsoled --pid-file=$XENCONSOLED_PIDFILE $XENCONSOLED_ARGS
+ ${sbindir}/xenconsoled --pid-file=$XENCONSOLED_PIDFILE $XENCONSOLED_ARGS
echo Starting QEMU as disk backend for dom0
- test -z "$QEMU_XEN" && QEMU_XEN="${LIBEXEC_BIN}/qemu-system-i386"
+ test -z "$QEMU_XEN" && QEMU_XEN="@qemu_xen_path@"
$QEMU_XEN -xen-domid 0 -xen-attach -name dom0 -nographic -M xenpv -daemonize \
-monitor /dev/null -serial /dev/null -parallel /dev/null \
-pidfile $QEMU_PIDFILE
@@ -126,7 +126,7 @@ case "$1" in
do_start
;;
status)
- ${BINDIR}/xenstore-read -s /
+ ${bindir}/xenstore-read -s /
;;
stop)
do_stop
diff --git a/tools/hotplug/Linux/init.d/xendriverdomain.in b/tools/hotplug/Linux/init.d/xendriverdomain.in
new file mode 100644
index 0000000..dd5f3a3
--- /dev/null
+++ b/tools/hotplug/Linux/init.d/xendriverdomain.in
@@ -0,0 +1,84 @@
+#!/bin/bash
+#
+# xendriverdomain Script to start services needed in a Xen driver domain
+#
+# NOTE: This initscript is not needed on dom0.
+
+# chkconfig: 2345 70 10
+# description: Starts and stops xen driver domain daemon
+### BEGIN INIT INFO
+# Provides: xendriverdomain
+# Required-Start: $syslog $remote_fs
+# Should-Start:
+# Required-Stop: $syslog $remote_fs
+# Should-Stop:
+# Default-Start: 2 3 5
+# Default-Stop: 0 1 6
+# Short-Description: Start/stop xen driver domain daemon
+# Description: Starts and stops the daemons needed for a Xen driver domain
+### END INIT INFO
+
+. @XEN_SCRIPT_DIR@/hotplugpath.sh
+
+xendriverdomain_config=@CONFIG_DIR@/@CONFIG_LEAF_DIR@
+
+test -f $xendriverdomain_config/xendriverdomain && . $xendriverdomain_config/xendriverdomain
+
+XLDEVD_PIDFILE=/var/run/xldevd.pid
+
+# not running in Xen dom0 or domU
+if ! test -d /proc/xen ; then
+ exit 0
+fi
+
+# mount xenfs in dom0 or domU with a pv_ops kernel
+if test "x$1" = xstart && \
+ ! test -f /proc/xen/capabilities && \
+ ! grep '^xenfs ' /proc/mounts >/dev/null;
+then
+ mount -t xenfs xenfs /proc/xen
+fi
+
+# run this script only in domU:
+# no capabilities file in xenlinux domU kernel
+# empty capabilities file in pv_ops domU kernel
+if ! test -f /proc/xen/capabilities || \
+ grep -q "control_d" /proc/xen/capabilities ; then
+ exit 0
+fi
+
+do_start () {
+ echo Starting xl devd...
+ ${sbindir}/xl devd --pidfile=$XLDEVD_PIDFILE $XLDEVD_ARGS
+}
+do_stop () {
+ echo Stopping xl devd...
+ if read 2>/dev/null <$XLDEVD_PIDFILE pid; then
+ kill $pid
+ while kill -9 $pid >/dev/null 2>&1; do sleep 0.1; done
+ rm -f $XLDEVD_PIDFILE
+ fi
+}
+
+case "$1" in
+ start)
+ do_start
+ ;;
+ stop)
+ do_stop
+ ;;
+ reload)
+ echo >&2 'Reload not available; use force-reload'; exit 1
+ ;;
+ force-reload|restart)
+ do_stop
+ do_start
+ ;;
+ *)
+ # do not advertise unreasonable commands that there is no reason
+ # to use with this service
+ echo $"Usage: $0 {start|stop|restart|force-reload}"
+ exit 1
+esac
+
+exit $?
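The detection logic above hinges on /proc/xen/capabilities: in dom0 it
contains control_d, while in a domU it is empty (pv_ops kernels) or absent
(classic xenlinux kernels). A quick check on the intended driver domain:

    $ cat /proc/xen/capabilities     # expect empty output in a domU
    $ grep -q control_d /proc/xen/capabilities && echo "this is dom0"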
diff --git a/tools/hotplug/Linux/locking.sh b/tools/hotplug/Linux/locking.sh
index 122bcfb..c6a7e96 100644
--- a/tools/hotplug/Linux/locking.sh
+++ b/tools/hotplug/Linux/locking.sh
@@ -12,8 +12,7 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#
#
diff --git a/tools/hotplug/Linux/logging.sh b/tools/hotplug/Linux/logging.sh
index c1bc699..3e94df1 100644
--- a/tools/hotplug/Linux/logging.sh
+++ b/tools/hotplug/Linux/logging.sh
@@ -11,8 +11,7 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#
log() {
diff --git a/tools/hotplug/Linux/systemd/Makefile b/tools/hotplug/Linux/systemd/Makefile
index 51c10fe..83e3b32 100644
--- a/tools/hotplug/Linux/systemd/Makefile
+++ b/tools/hotplug/Linux/systemd/Makefile
@@ -28,6 +28,9 @@ all: $(ALL_XEN_SYSTEMD)
clean:
rm -f $(XEN_SYSTEMD_MODULES)
+.PHONY: distclean
+distclean: clean
+
.PHONY: install
install: $(ALL_XEN_SYSTEMD)
[ -d $(DESTDIR)$(XEN_SYSTEMD_DIR) ] || \
diff --git a/tools/hotplug/Linux/systemd/xen-qemu-dom0-disk-backend.service.in b/tools/hotplug/Linux/systemd/xen-qemu-dom0-disk-backend.service.in
index 274cec0..acf61a8 100644
--- a/tools/hotplug/Linux/systemd/xen-qemu-dom0-disk-backend.service.in
+++ b/tools/hotplug/Linux/systemd/xen-qemu-dom0-disk-backend.service.in
@@ -11,7 +11,7 @@ Type=simple
PIDFile=@XEN_RUN_DIR@/qemu-dom0.pid
ExecStartPre=/bin/grep -q control_d /proc/xen/capabilities
ExecStartPre=/bin/mkdir -p @XEN_RUN_DIR@
-ExecStart=@LIBEXEC_BIN@/qemu-system-i386 -xen-domid 0 \
+ExecStart=@qemu_xen_systemd@ -xen-domid 0 \
-xen-attach -name dom0 -nographic -M xenpv -daemonize \
-monitor /dev/null -serial /dev/null -parallel /dev/null \
-pidfile @XEN_RUN_DIR@/qemu-dom0.pid
diff --git a/tools/hotplug/Linux/systemd/xenstored.service.in b/tools/hotplug/Linux/systemd/xenstored.service.in
index 0f0ac58..a5f836b 100644
--- a/tools/hotplug/Linux/systemd/xenstored.service.in
+++ b/tools/hotplug/Linux/systemd/xenstored.service.in
@@ -8,6 +8,7 @@ ConditionPathExists=/proc/xen/capabilities
[Service]
Type=notify
+KillMode=none
Environment=XENSTORED_ARGS=
Environment=XENSTORED=@XENSTORED@
EnvironmentFile=-@CONFIG_DIR@/@CONFIG_LEAF_DIR@/xencommons
diff --git a/tools/hotplug/Linux/vif-common.sh b/tools/hotplug/Linux/vif-common.sh
index 28ddae5..6e8d584 100644
--- a/tools/hotplug/Linux/vif-common.sh
+++ b/tools/hotplug/Linux/vif-common.sh
@@ -11,8 +11,7 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#
@@ -207,7 +206,7 @@ dom0_ip()
local result=$(ip_of "$nd")
if [ -z "$result" ]
then
- fatal
+ fatal \
"$netdev is not up. Bring it up or specify another interface with " \
"netdev=<if> as a parameter to $0."
fi
diff --git a/tools/hotplug/Linux/xen-backend.rules.in b/tools/hotplug/Linux/xen-backend.rules.in
deleted file mode 100644
index 7d2f914..0000000
--- a/tools/hotplug/Linux/xen-backend.rules.in
+++ /dev/null
@@ -1,15 +0,0 @@
-SUBSYSTEM=="xen-backend", KERNEL=="tap*", ENV{UDEV_CALL}="1", RUN+="@XEN_SCRIPT_DIR@/blktap $env{ACTION}"
-SUBSYSTEM=="xen-backend", KERNEL=="vbd*", ENV{UDEV_CALL}="1", RUN+="@XEN_SCRIPT_DIR@/block $env{ACTION}"
-SUBSYSTEM=="xen-backend", KERNEL=="vif2-*", RUN+="@XEN_SCRIPT_DIR@/vif2 $env{ACTION}"
-SUBSYSTEM=="xen-backend", KERNEL=="vif-*", ENV{UDEV_CALL}="1", ACTION=="online", RUN+="@XEN_SCRIPT_DIR@/vif-setup online type_if=vif"
-SUBSYSTEM=="xen-backend", KERNEL=="vif-*", ENV{UDEV_CALL}="1", ACTION=="offline", RUN+="@XEN_SCRIPT_DIR@/vif-setup offline type_if=vif"
-SUBSYSTEM=="xen-backend", KERNEL=="vscsi*", RUN+="@XEN_SCRIPT_DIR@/vscsi $env{ACTION}"
-SUBSYSTEM=="xen-backend", ACTION=="remove", ENV{UDEV_CALL}="1", RUN+="@XEN_SCRIPT_DIR@/xen-hotplug-cleanup"
-KERNEL=="evtchn", NAME="xen/%k"
-SUBSYSTEM=="xen", KERNEL=="blktap[0-9]*", NAME="xen/%k", MODE="0600"
-SUBSYSTEM=="blktap2", KERNEL=="blktap[0-9]*", NAME="xen/blktap-2/%k", MODE="0600"
-KERNEL=="blktap-control", NAME="xen/blktap-2/control", MODE="0600"
-KERNEL=="gntdev", NAME="xen/%k", MODE="0600"
-KERNEL=="pci_iomul", NAME="xen/%k", MODE="0600"
-KERNEL=="tapdev[a-z]*", NAME="xen/blktap-2/tapdev%m", MODE="0600"
-SUBSYSTEM=="net", KERNEL=="vif*-emu", ACTION=="add", ENV{UDEV_CALL}="1", RUN+="@XEN_SCRIPT_DIR@/vif-setup $env{ACTION} type_if=tap"
diff --git a/tools/hotplug/Linux/xen-hotplug-common.sh.in b/tools/hotplug/Linux/xen-hotplug-common.sh.in
index 1deafe1..d5d0b69 100644
--- a/tools/hotplug/Linux/xen-hotplug-common.sh.in
+++ b/tools/hotplug/Linux/xen-hotplug-common.sh.in
@@ -11,17 +11,9 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#
-# Hack to prevent the execution of hotplug scripts from udev if the domain
-# has been launched from libxl
-if [ -n "${UDEV_CALL}" ] && \
- xenstore-read "libxl/disable_udev" >/dev/null 2>&1; then
- exit 0
-fi
-
dir=$(dirname "$0")
. "$dir/hotplugpath.sh"
. "$dir/logging.sh"
@@ -30,8 +22,8 @@ dir=$(dirname "$0")
exec 2>>/var/log/xen/xen-hotplug.log
-export PATH="${BINDIR}:${SBINDIR}:${LIBEXEC_BIN}:/sbin:/bin:/usr/bin:/usr/sbin:$PATH"
-export LD_LIBRARY_PATH="${LIBDIR}${LD_LIBRARY_PATH+:}$LD_LIBRARY_PATH"
+export PATH="${bindir}:${sbindir}:${LIBEXEC_BIN}:/sbin:/bin:/usr/bin:/usr/sbin:$PATH"
+export LD_LIBRARY_PATH="${libdir}${LD_LIBRARY_PATH+:}$LD_LIBRARY_PATH"
export LANG="POSIX"
unset $(set | grep ^LC_ | cut -d= -f1)
diff --git a/tools/hotplug/Linux/xen-network-common.sh b/tools/hotplug/Linux/xen-network-common.sh
index 3c63c55..92ffa60 100644
--- a/tools/hotplug/Linux/xen-network-common.sh
+++ b/tools/hotplug/Linux/xen-network-common.sh
@@ -11,8 +11,7 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#
diff --git a/tools/hotplug/Linux/xen-script-common.sh b/tools/hotplug/Linux/xen-script-common.sh
index f6841ac..370a50a 100644
--- a/tools/hotplug/Linux/xen-script-common.sh
+++ b/tools/hotplug/Linux/xen-script-common.sh
@@ -11,8 +11,7 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#
diff --git a/tools/hotplug/Linux/xendomains.in b/tools/hotplug/Linux/xendomains.in
index 2e65ac6..0603842 100644
--- a/tools/hotplug/Linux/xendomains.in
+++ b/tools/hotplug/Linux/xendomains.in
@@ -29,7 +29,7 @@
. @XEN_SCRIPT_DIR@/hotplugpath.sh
-CMD=${SBINDIR}/xl
+CMD=${sbindir}/xl
HEADCOMP="Xen saved domain"
$CMD list &> /dev/null
if test $? -ne 0
diff --git a/tools/hotplug/Makefile b/tools/hotplug/Makefile
index 14ae9a8..c1a82c1 100644
--- a/tools/hotplug/Makefile
+++ b/tools/hotplug/Makefile
@@ -6,5 +6,5 @@ SUBDIRS-$(CONFIG_NetBSD) += NetBSD
SUBDIRS-$(CONFIG_Linux) += Linux
SUBDIRS-$(CONFIG_FreeBSD) += FreeBSD
-.PHONY: all clean install
-all clean install: %: subdirs-%
+.PHONY: all clean install distclean
+all clean install distclean: %: subdirs-%
diff --git a/tools/hotplug/NetBSD/Makefile b/tools/hotplug/NetBSD/Makefile
index 0a370b8..d01aabf 100644
--- a/tools/hotplug/NetBSD/Makefile
+++ b/tools/hotplug/NetBSD/Makefile
@@ -8,7 +8,7 @@ XEN_SCRIPTS += vif-bridge
XEN_SCRIPTS += vif-ip
XEN_SCRIPT_DATA =
-XEN_RCD_PROG = rc.d/xencommons rc.d/xendomains rc.d/xen-watchdog
+XEN_RCD_PROG = rc.d/xencommons rc.d/xendomains rc.d/xen-watchdog rc.d/xendriverdomain
.PHONY: all
all:
@@ -42,3 +42,6 @@ install-rcd:
.PHONY: clean
clean:
+
+.PHONY: distclean
+distclean: clean
diff --git a/tools/hotplug/NetBSD/block b/tools/hotplug/NetBSD/block
index b2e9af5..32c20b6 100644
--- a/tools/hotplug/NetBSD/block
+++ b/tools/hotplug/NetBSD/block
@@ -7,7 +7,7 @@
DIR=$(dirname "$0")
. "${DIR}/hotplugpath.sh"
-PATH=${BINDIR}:${SBINDIR}:${LIBEXEC_BIN}:/bin:/usr/bin:/sbin:/usr/sbin
+PATH=${bindir}:${sbindir}:${LIBEXEC_BIN}:/bin:/usr/bin:/sbin:/usr/sbin
export PATH
error() {
diff --git a/tools/hotplug/NetBSD/rc.d/xen-watchdog b/tools/hotplug/NetBSD/rc.d/xen-watchdog
index ff4a52c..d2e2ab8 100644
--- a/tools/hotplug/NetBSD/rc.d/xen-watchdog
+++ b/tools/hotplug/NetBSD/rc.d/xen-watchdog
@@ -11,13 +11,13 @@
DIR=$(dirname "$0")
. "${DIR}/xen-hotplugpath.sh"
-LD_LIBRARY_PATH="${LIBDIR}"
+LD_LIBRARY_PATH="${libdir}"
export LD_LIBRARY_PATH
name="xenwatchdog"
rcvar=$name
-command="${SBINDIR}/xenwatchdogd"
-start_cmd="echo Starting ${name}. && PATH=${PATH}:${SBINDIR} ${command} 30 15"
+command="${sbindir}/xenwatchdogd"
+start_cmd="echo Starting ${name}. && PATH=${PATH}:${sbindir} ${command} 30 15"
load_rc_config $name
run_rc_command "$1"
diff --git a/tools/hotplug/NetBSD/rc.d/xencommons.in b/tools/hotplug/NetBSD/rc.d/xencommons.in
index db03f22..d7552cd 100644
--- a/tools/hotplug/NetBSD/rc.d/xencommons.in
+++ b/tools/hotplug/NetBSD/rc.d/xencommons.in
@@ -8,7 +8,7 @@
DIR=$(dirname "$0")
. "${DIR}/xen-hotplugpath.sh"
-LD_LIBRARY_PATH="${LIBDIR}"
+LD_LIBRARY_PATH="${libdir}"
export LD_LIBRARY_PATH
name="xencommons"
@@ -37,7 +37,7 @@ xen_startcmd()
local time=0
local timeout=30
- xenstored_pid=$(check_pidfile ${XENSTORED_PIDFILE} ${SBINDIR}/xenstored)
+ xenstored_pid=$(check_pidfile ${XENSTORED_PIDFILE} ${sbindir}/xenstored)
if test -z "$xenstored_pid"; then
printf "Cleaning xenstore database.\n"
if [ -z "${XENSTORED_ROOTDIR}" ]; then
@@ -49,8 +49,8 @@ xen_startcmd()
if [ -n "${XENSTORED_TRACE}" ]; then
XENSTORED_ARGS="${XENSTORED_ARGS} -T /var/log/xen/xenstored-trace.log"
fi
- ${SBINDIR}/xenstored ${XENSTORED_ARGS}
- while [ $time -lt $timeout ] && ! `${BINDIR}/xenstore-read -s / >/dev/null 2>&1` ; do
+ ${sbindir}/xenstored ${XENSTORED_ARGS}
+ while [ $time -lt $timeout ] && ! `${bindir}/xenstore-read -s / >/dev/null 2>&1` ; do
printf "."
time=$(($time+1))
sleep 1
@@ -64,7 +64,7 @@ xen_startcmd()
XENCONSOLED_ARGS="${XENCONSOLED_ARGS} --log=${XENCONSOLED_TRACE}"
fi
- ${SBINDIR}/xenconsoled ${XENCONSOLED_ARGS}
+ ${sbindir}/xenconsoled ${XENCONSOLED_ARGS}
printf "\n"
@@ -78,7 +78,7 @@ xen_stop()
printf "Stopping xencommons.\n"
printf "WARNING: Not stopping xenstored, as it cannot be restarted.\n"
- rc_pid=$(check_pidfile ${XENCONSOLED_PIDFILE} ${SBINDIR}/xenconsoled)
+ rc_pid=$(check_pidfile ${XENCONSOLED_PIDFILE} ${sbindir}/xenconsoled)
pids="$pids $rc_pid"
kill -${sig_stop:-TERM} $pids
@@ -87,12 +87,12 @@ xen_stop()
xen_status()
{
- xenstored_pid=$(check_pidfile ${XENSTORED_PIDFILE} ${SBINDIR}/xenstored)
+ xenstored_pid=$(check_pidfile ${XENSTORED_PIDFILE} ${sbindir}/xenstored)
if test -n ${xenstored_pid}; then
pids="$pids $xenstored_pid"
fi
- xenconsoled_pid=$(check_pidfile ${XENCONSOLED_PIDFILE} ${SBINDIR}/xenconsoled)
+ xenconsoled_pid=$(check_pidfile ${XENCONSOLED_PIDFILE} ${sbindir}/xenconsoled)
if test -n ${xenconsoled_pid}; then
pids="$pids $xenconsoled_pid"
fi
diff --git a/tools/hotplug/NetBSD/rc.d/xendomains b/tools/hotplug/NetBSD/rc.d/xendomains
index b0e7111..edc4495 100644
--- a/tools/hotplug/NetBSD/rc.d/xendomains
+++ b/tools/hotplug/NetBSD/rc.d/xendomains
@@ -30,11 +30,11 @@
DIR=$(dirname "$0")
. "${DIR}/xen-hotplugpath.sh"
-LD_LIBRARY_PATH="${LIBDIR}"
+LD_LIBRARY_PATH="${libdir}"
export LD_LIBRARY_PATH
name="xendomains"
-ctl_command="${SBINDIR}/xl"
+ctl_command="${sbindir}/xl"
start_cmd="xendomains_start"
stop_cmd="xendomains_stop"
list_cmd="xendomains_list"
diff --git a/tools/hotplug/NetBSD/rc.d/xendriverdomain.in b/tools/hotplug/NetBSD/rc.d/xendriverdomain.in
new file mode 100644
index 0000000..5062a71
--- /dev/null
+++ b/tools/hotplug/NetBSD/rc.d/xendriverdomain.in
@@ -0,0 +1,49 @@
+#!/bin/sh
+#
+# PROVIDE: xendriverdomain
+# REQUIRE: DAEMON
+#
+# Should be run in a driver domain, but not in domain 0.
+
+. /etc/rc.subr
+
+DIR=$(dirname "$0")
+. "${DIR}/xen-hotplugpath.sh"
+
+LD_LIBRARY_PATH="${libdir}"
+export LD_LIBRARY_PATH
+
+name="xendriverdomain"
+start_precmd="xendriverdomain_precmd"
+start_cmd="xendriverdomain_startcmd"
+stop_cmd="xendriverdomain_stop"
+extra_commands=""
+
+XLDEVD_PIDFILE="/var/run/xldevd.pid"
+
+xendriverdomain_precmd()
+{
+ :
+}
+
+xendriverdomain_startcmd()
+{
+ printf "Starting xenservices: xl devd."
+
+ ${sbindir}/xl devd --pidfile=$XLDEVD_PIDFILE ${XLDEVD_ARGS}
+
+ printf "\n"
+}
+
+xendriverdomain_stop()
+{
+ printf "Stopping xl devd.\n"
+
+ rc_pid=$(check_pidfile ${XLDEVD_PIDFILE} ${sbindir}/xl)
+
+ kill -${sig_stop:-TERM} $rc_pid
+ wait_for_pids $rc_pid
+}
+
+load_rc_config $name
+run_rc_command "$1"
diff --git a/tools/hotplug/NetBSD/vif-bridge b/tools/hotplug/NetBSD/vif-bridge
index 960303e..b58e922 100644
--- a/tools/hotplug/NetBSD/vif-bridge
+++ b/tools/hotplug/NetBSD/vif-bridge
@@ -7,7 +7,7 @@
DIR=$(dirname "$0")
. "${DIR}/hotplugpath.sh"
-PATH=${BINDIR}:${SBINDIR}:${LIBEXEC_BIN}:/bin:/usr/bin:/sbin:/usr/sbin
+PATH=${bindir}:${sbindir}:${LIBEXEC_BIN}:/bin:/usr/bin:/sbin:/usr/sbin
export PATH
xpath=$1
diff --git a/tools/hotplug/NetBSD/vif-ip b/tools/hotplug/NetBSD/vif-ip
index 28c43de..83cbfe2 100644
--- a/tools/hotplug/NetBSD/vif-ip
+++ b/tools/hotplug/NetBSD/vif-ip
@@ -7,7 +7,7 @@
DIR=$(dirname "$0")
. "${DIR}/hotplugpath.sh"
-PATH=${BINDIR}:${SBINDIR}:${LIBEXEC_BIN}:/bin:/usr/bin:/sbin:/usr/sbin
+PATH=${bindir}:${sbindir}:${LIBEXEC_BIN}:/bin:/usr/bin:/sbin:/usr/sbin
export PATH
xpath=$1
diff --git a/tools/hotplug/common/Makefile b/tools/hotplug/common/Makefile
index 5623abb..b53b55c 100644
--- a/tools/hotplug/common/Makefile
+++ b/tools/hotplug/common/Makefile
@@ -37,3 +37,6 @@ install-scripts: build
.PHONY: clean
clean:
rm -f $(HOTPLUGPATH)
+
+.PHONY: distclean
+distclean: clean
diff --git a/tools/include/Makefile b/tools/include/Makefile
index f7a6256..dec8b3d 100644
--- a/tools/include/Makefile
+++ b/tools/include/Makefile
@@ -1,8 +1,11 @@
XEN_ROOT = $(CURDIR)/../..
include $(XEN_ROOT)/tools/Rules.mk
+# Relative to $(XEN_ROOT)/xen/xsm/flask
+FLASK_H_DEPEND := policy/initial_sids
+
.PHONY: all
-all: xen-foreign xen/.dir
+all: xen-foreign xen/.dir xen-xsm/.dir
.PHONY: xen-foreign
xen-foreign:
@@ -19,34 +22,46 @@ xen/.dir:
ln -s ../xen-foreign xen/foreign
touch $@
+# Not xen/xsm, as that clashes with the link to
+# $(XEN_ROOT)/xen/include/public/xsm above.
+xen-xsm/.dir: $(XEN_ROOT)/xen/xsm/flask/policy/mkflask.sh \
+ $(patsubst %,$(XEN_ROOT)/xen/xsm/flask/%,$(FLASK_H_DEPEND))
+ mkdir -p xen-xsm/flask
+ cd $(XEN_ROOT)/xen/xsm/flask/ && \
+ $(SHELL) policy/mkflask.sh $(AWK) $(CURDIR)/xen-xsm/flask $(FLASK_H_DEPEND)
+ touch $@
+
.PHONY: install
install: all
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)/xen/arch-x86
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)/xen/arch-x86/hvm
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)/xen/arch-arm
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)/xen/arch-arm/hvm
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)/xen/foreign
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)/xen/hvm
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)/xen/io
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)/xen/sys
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)/xen/xsm
- $(INSTALL_DATA) xen/COPYING $(DESTDIR)$(INCLUDEDIR)/xen
- $(INSTALL_DATA) xen/*.h $(DESTDIR)$(INCLUDEDIR)/xen
- $(INSTALL_DATA) xen/arch-x86/*.h $(DESTDIR)$(INCLUDEDIR)/xen/arch-x86
- $(INSTALL_DATA) xen/arch-x86/hvm/*.h $(DESTDIR)$(INCLUDEDIR)/xen/arch-x86/hvm
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)/xen/arch-x86
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)/xen/arch-x86/hvm
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)/xen/arch-arm
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)/xen/arch-arm/hvm
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)/xen/foreign
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)/xen/hvm
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)/xen/io
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)/xen/sys
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)/xen/xsm
+ $(INSTALL_DATA) xen/COPYING $(DESTDIR)$(includedir)/xen
+ $(INSTALL_DATA) xen/*.h $(DESTDIR)$(includedir)/xen
+ $(INSTALL_DATA) xen/arch-x86/*.h $(DESTDIR)$(includedir)/xen/arch-x86
+ $(INSTALL_DATA) xen/arch-x86/hvm/*.h $(DESTDIR)$(includedir)/xen/arch-x86/hvm
# xen/arch-arm doesn't contain headers for now. Uncomment the line
# as soon as a new header is added
-# $(INSTALL_DATA) xen/arch-arm/*.h $(DESTDIR)$(INCLUDEDIR)/xen/arch-arm
- $(INSTALL_DATA) xen/arch-arm/hvm/*.h $(DESTDIR)$(INCLUDEDIR)/xen/arch-arm/hvm
- $(INSTALL_DATA) xen/foreign/*.h $(DESTDIR)$(INCLUDEDIR)/xen/foreign
- $(INSTALL_DATA) xen/hvm/*.h $(DESTDIR)$(INCLUDEDIR)/xen/hvm
- $(INSTALL_DATA) xen/io/*.h $(DESTDIR)$(INCLUDEDIR)/xen/io
- $(INSTALL_DATA) xen/sys/*.h $(DESTDIR)$(INCLUDEDIR)/xen/sys
- $(INSTALL_DATA) xen/xsm/*.h $(DESTDIR)$(INCLUDEDIR)/xen/xsm
+# $(INSTALL_DATA) xen/arch-arm/*.h $(DESTDIR)$(includedir)/xen/arch-arm
+ $(INSTALL_DATA) xen/arch-arm/hvm/*.h $(DESTDIR)$(includedir)/xen/arch-arm/hvm
+ $(INSTALL_DATA) xen/foreign/*.h $(DESTDIR)$(includedir)/xen/foreign
+ $(INSTALL_DATA) xen/hvm/*.h $(DESTDIR)$(includedir)/xen/hvm
+ $(INSTALL_DATA) xen/io/*.h $(DESTDIR)$(includedir)/xen/io
+ $(INSTALL_DATA) xen/sys/*.h $(DESTDIR)$(includedir)/xen/sys
+ $(INSTALL_DATA) xen/xsm/*.h $(DESTDIR)$(includedir)/xen/xsm
.PHONY: clean
clean:
- rm -rf xen
+ rm -rf xen xen-xsm
$(MAKE) -C xen-foreign clean
+
+.PHONY: distclean
+distclean: clean
diff --git a/tools/include/xen-external/bsd-sys-queue-h-seddery b/tools/include/xen-external/bsd-sys-queue-h-seddery
index 7a957e3..3f8716d 100755
--- a/tools/include/xen-external/bsd-sys-queue-h-seddery
+++ b/tools/include/xen-external/bsd-sys-queue-h-seddery
@@ -69,4 +69,6 @@ s/\b struct \s+ type \b/type/xg;
s,^\#include.*sys/cdefs.*,/* $& */,xg;
+s,\b __offsetof \b ,offsetof,xg;
+
s/\b( NULL )/0/xg;
diff --git a/tools/include/xen-foreign/Makefile b/tools/include/xen-foreign/Makefile
index 06b844c..80a446a 100644
--- a/tools/include/xen-foreign/Makefile
+++ b/tools/include/xen-foreign/Makefile
@@ -6,7 +6,7 @@ ROOT = $(XEN_ROOT)/xen/include/public
architectures := arm32 arm64 x86_32 x86_64
headers := $(patsubst %, %.h, $(architectures))
-.PHONY: all clean check-headers
+.PHONY: all clean distclean check-headers
all: $(headers) check-headers
clean:
@@ -14,8 +14,10 @@ clean:
rm -f checker checker.c
rm -f *.pyc *.o *~
+distclean: clean
+
checker: checker.c $(headers)
- $(HOSTCC) $(HOSTCFLAGS) -o $@ $<
+ $(HOSTCC) $(HOSTCFLAGS) -D__XEN_TOOLS__ -o $@ $<
check-headers: checker
./checker > tmp.size
diff --git a/tools/include/xen-foreign/reference.size b/tools/include/xen-foreign/reference.size
index 60ee262..a0409db 100644
--- a/tools/include/xen-foreign/reference.size
+++ b/tools/include/xen-foreign/reference.size
@@ -9,6 +9,6 @@ vcpu_guest_context | 344 344 2800 5168
arch_vcpu_info | 0 0 24 16
vcpu_time_info | 32 32 32 32
vcpu_info | 48 48 64 64
-arch_shared_info | 0 0 268 280
-shared_info | 1088 1088 2584 3368
+arch_shared_info | 0 0 28 48
+shared_info | 1088 1088 2344 3136
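The four size columns correspond to the architectures list in the Makefile
above (arm32, arm64, x86_32, x86_64), so this change records the smaller
arch_shared_info and shared_info layouts in the 4.6 ABI. The check-headers
target regenerates and compares the table; roughly, once checker is built:

    $ ./checker > tmp.size
    $ diff -u reference.size tmp.size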
diff --git a/tools/include/xen-sys/NetBSDRump/evtchn.h b/tools/include/xen-sys/NetBSDRump/evtchn.h
new file mode 100644
index 0000000..2d8a1f9
--- /dev/null
+++ b/tools/include/xen-sys/NetBSDRump/evtchn.h
@@ -0,0 +1,86 @@
+/* $NetBSD: evtchn.h,v 1.1.1.1 2007/06/14 19:39:45 bouyer Exp $ */
+/******************************************************************************
+ * evtchn.h
+ *
+ * Interface to /dev/xen/evtchn.
+ *
+ * Copyright (c) 2003-2005, K A Fraser
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __NetBSD_EVTCHN_H__
+#define __NetBSD_EVTCHN_H__
+
+/*
+ * Bind a fresh port to VIRQ @virq.
+ */
+#define IOCTL_EVTCHN_BIND_VIRQ \
+ _IOWR('E', 4, struct ioctl_evtchn_bind_virq)
+struct ioctl_evtchn_bind_virq {
+ unsigned int virq;
+ unsigned int port;
+};
+
+/*
+ * Bind a fresh port to remote <@remote_domain, @remote_port>.
+ */
+#define IOCTL_EVTCHN_BIND_INTERDOMAIN \
+ _IOWR('E', 5, struct ioctl_evtchn_bind_interdomain)
+struct ioctl_evtchn_bind_interdomain {
+ unsigned int remote_domain, remote_port;
+ unsigned int port;
+};
+
+/*
+ * Allocate a fresh port for binding to @remote_domain.
+ */
+#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \
+ _IOWR('E', 6, struct ioctl_evtchn_bind_unbound_port)
+struct ioctl_evtchn_bind_unbound_port {
+ unsigned int remote_domain;
+ unsigned int port;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_UNBIND \
+ _IOW('E', 7, struct ioctl_evtchn_unbind)
+struct ioctl_evtchn_unbind {
+ unsigned int port;
+};
+
+/*
+ * Send event to previously allocated @port.
+ */
+#define IOCTL_EVTCHN_NOTIFY \
+ _IOW('E', 8, struct ioctl_evtchn_notify)
+struct ioctl_evtchn_notify {
+ unsigned int port;
+};
+
+/* Clear and reinitialise the event buffer. Clear error condition. */
+#define IOCTL_EVTCHN_RESET \
+ _IO('E', 9)
+
+#endif /* __NetBSD_EVTCHN_H__ */
diff --git a/tools/include/xen-sys/NetBSDRump/privcmd.h b/tools/include/xen-sys/NetBSDRump/privcmd.h
index efdcae9..1296b30 100644
--- a/tools/include/xen-sys/NetBSDRump/privcmd.h
+++ b/tools/include/xen-sys/NetBSDRump/privcmd.h
@@ -1,6 +1,36 @@
+/* $NetBSD: xenio.h,v 1.3 2005/05/24 12:07:12 yamt Exp $ */
-#ifndef __NetBSDRump_PRIVCMD_H__
-#define __NetBSDRump_PRIVCMD_H__
+/******************************************************************************
+ * privcmd.h
+ *
+ * Copyright (c) 2003-2004, K A Fraser
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __NetBSD_PRIVCMD_H__
+#define __NetBSD_PRIVCMD_H__
+
+/* Interface to /proc/xen/privcmd */
typedef struct privcmd_hypercall
{
@@ -28,4 +58,49 @@ typedef struct privcmd_mmapbatch {
unsigned long *arr; /* array of mfns - top nibble set on err */
} privcmd_mmapbatch_t;
-#endif
+typedef struct privcmd_blkmsg
+{
+ unsigned long op;
+ void *buf;
+ int buf_size;
+} privcmd_blkmsg_t;
+
+/*
+ * @cmd: IOCTL_PRIVCMD_HYPERCALL
+ * @arg: &privcmd_hypercall_t
+ * Return: Value returned from execution of the specified hypercall.
+ */
+#define IOCTL_PRIVCMD_HYPERCALL \
+ _IOWR('P', 0, privcmd_hypercall_t)
+
+#if defined(_KERNEL)
+/* compat */
+#define IOCTL_PRIVCMD_INITDOMAIN_EVTCHN_OLD \
+ _IO('P', 1)
+#endif /* defined(_KERNEL) */
+
+#define IOCTL_PRIVCMD_MMAP \
+ _IOW('P', 2, privcmd_mmap_t)
+#define IOCTL_PRIVCMD_MMAPBATCH \
+ _IOW('P', 3, privcmd_mmapbatch_t)
+#define IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN \
+ _IOR('P', 4, unsigned long)
+
+/*
+ * @cmd: IOCTL_PRIVCMD_INITDOMAIN_EVTCHN
+ * @arg: n/a
+ * Return: Port associated with domain-controller end of control event channel
+ * for the initial domain.
+ */
+#define IOCTL_PRIVCMD_INITDOMAIN_EVTCHN \
+ _IOR('P', 5, int)
+
+/* Interface to /dev/xenevt */
+/* EVTCHN_RESET: Clear and reinit the event buffer. Clear error condition. */
+#define EVTCHN_RESET _IO('E', 1)
+/* EVTCHN_BIND: Bind to the specified event-channel port. */
+#define EVTCHN_BIND _IOW('E', 2, unsigned long)
+/* EVTCHN_UNBIND: Unbind from the specified event-channel port. */
+#define EVTCHN_UNBIND _IOW('E', 3, unsigned long)
+
+#endif /* __NetBSD_PRIVCMD_H__ */
diff --git a/tools/libfsimage/Rules.mk b/tools/libfsimage/Rules.mk
index 8a23655..a0c6504 100644
--- a/tools/libfsimage/Rules.mk
+++ b/tools/libfsimage/Rules.mk
@@ -6,7 +6,7 @@ LDFLAGS += -L../common/
PIC_OBJS := $(patsubst %.c,%.opic,$(LIB_SRCS-y))
-FSDIR = $(LIBDIR)/fs
+FSDIR = $(libdir)/fs
FSLIB = fsimage.so
diff --git a/tools/libfsimage/common/Makefile b/tools/libfsimage/common/Makefile
index fb306f4..4840bc2 100644
--- a/tools/libfsimage/common/Makefile
+++ b/tools/libfsimage/common/Makefile
@@ -22,14 +22,14 @@ all: $(LIB)
.PHONY: install
install: all
- $(INSTALL_DIR) $(DESTDIR)$(LIBDIR)
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_PROG) libfsimage.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)
- ln -sf libfsimage.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libfsimage.so.$(MAJOR)
- ln -sf libfsimage.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libfsimage.so
- $(INSTALL_DATA) fsimage.h $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_DATA) fsimage_plugin.h $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_DATA) fsimage_grub.h $(DESTDIR)$(INCLUDEDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(libdir)
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)
+ $(INSTALL_PROG) libfsimage.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)
+ ln -sf libfsimage.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)/libfsimage.so.$(MAJOR)
+ ln -sf libfsimage.so.$(MAJOR) $(DESTDIR)$(libdir)/libfsimage.so
+ $(INSTALL_DATA) fsimage.h $(DESTDIR)$(includedir)
+ $(INSTALL_DATA) fsimage_plugin.h $(DESTDIR)$(includedir)
+ $(INSTALL_DATA) fsimage_grub.h $(DESTDIR)$(includedir)
clean distclean::
rm -f $(LIB)
diff --git a/tools/libfsimage/ext2fs-lib/Makefile b/tools/libfsimage/ext2fs-lib/Makefile
index 671fbff..0e00fde 100644
--- a/tools/libfsimage/ext2fs-lib/Makefile
+++ b/tools/libfsimage/ext2fs-lib/Makefile
@@ -6,8 +6,8 @@ FS = ext2fs-lib
FS_LIBDEPS = $(EXTFS_LIBS)
-# Include configure output (config.h) to headers search path
-CFLAGS += -I$(XEN_ROOT)/tools
+# Include configure output (config.h)
+CFLAGS += -include $(XEN_ROOT)/tools/config.h
.PHONY: all
all: fs-all
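Switching from an include path to -include makes the compiler inject
config.h at the top of every translation unit, which is why the explicit
#include in ext2fs-lib.c can be dropped in the next hunk. With GCC this is
equivalent to writing the #include first in each source file:

    $ cc -include $XEN_ROOT/tools/config.h -c ext2fs-lib.c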
diff --git a/tools/libfsimage/ext2fs-lib/ext2fs-lib.c b/tools/libfsimage/ext2fs-lib/ext2fs-lib.c
index ed47146..84b6d1e 100644
--- a/tools/libfsimage/ext2fs-lib/ext2fs-lib.c
+++ b/tools/libfsimage/ext2fs-lib/ext2fs-lib.c
@@ -21,9 +21,6 @@
* Use is subject to license terms.
*/
-/* Include output from configure */
-#include <config.h>
-
#include <fsimage_plugin.h>
#include INCLUDE_EXTFS_H
#include <errno.h>
diff --git a/tools/libfsimage/ext2fs/fsys_ext2fs.c b/tools/libfsimage/ext2fs/fsys_ext2fs.c
index 75c4cbe..ba53ff4 100644
--- a/tools/libfsimage/ext2fs/fsys_ext2fs.c
+++ b/tools/libfsimage/ext2fs/fsys_ext2fs.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <fsimage_grub.h>
diff --git a/tools/libfsimage/fat/fat.h b/tools/libfsimage/fat/fat.h
index f72b72a..2abb430 100644
--- a/tools/libfsimage/fat/fat.h
+++ b/tools/libfsimage/fat/fat.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
diff --git a/tools/libfsimage/fat/fsys_fat.c b/tools/libfsimage/fat/fsys_fat.c
index d22d243..b8129a8 100644
--- a/tools/libfsimage/fat/fsys_fat.c
+++ b/tools/libfsimage/fat/fsys_fat.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <limits.h>
diff --git a/tools/libfsimage/iso9660/fsys_iso9660.c b/tools/libfsimage/iso9660/fsys_iso9660.c
index b991cfc..5dbf100 100644
--- a/tools/libfsimage/iso9660/fsys_iso9660.c
+++ b/tools/libfsimage/iso9660/fsys_iso9660.c
@@ -15,8 +15,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* References:
diff --git a/tools/libfsimage/iso9660/iso9660.h b/tools/libfsimage/iso9660/iso9660.h
index 83d0019..4ea9b3e 100644
--- a/tools/libfsimage/iso9660/iso9660.h
+++ b/tools/libfsimage/iso9660/iso9660.h
@@ -15,8 +15,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* References:
diff --git a/tools/libfsimage/reiserfs/fsys_reiserfs.c b/tools/libfsimage/reiserfs/fsys_reiserfs.c
index 4aa9bfb..92c3b2b 100644
--- a/tools/libfsimage/reiserfs/fsys_reiserfs.c
+++ b/tools/libfsimage/reiserfs/fsys_reiserfs.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <fsimage_grub.h>
diff --git a/tools/libfsimage/ufs/fsys_ufs.c b/tools/libfsimage/ufs/fsys_ufs.c
index be51411..cf7e22a 100644
--- a/tools/libfsimage/ufs/fsys_ufs.c
+++ b/tools/libfsimage/ufs/fsys_ufs.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c
index 0261a52..f39d096 100644
--- a/tools/libfsimage/xfs/fsys_xfs.c
+++ b/tools/libfsimage/xfs/fsys_xfs.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <fsimage_grub.h>
diff --git a/tools/libfsimage/xfs/xfs.h b/tools/libfsimage/xfs/xfs.h
index 02f8dcd..4069928 100644
--- a/tools/libfsimage/xfs/xfs.h
+++ b/tools/libfsimage/xfs/xfs.h
@@ -20,8 +20,7 @@
* other software, or any other product whatsoever.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ * with this program; If not, see <http://www.gnu.org/licenses/>.
*
* Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
* Mountain View, CA 94043, or:
diff --git a/tools/libfsimage/zfs/Makefile b/tools/libfsimage/zfs/Makefile
index 9fb0aab..4b9f131 100644
--- a/tools/libfsimage/zfs/Makefile
+++ b/tools/libfsimage/zfs/Makefile
@@ -13,8 +13,7 @@
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
#
diff --git a/tools/libfsimage/zfs/filesys.h b/tools/libfsimage/zfs/filesys.h
index 1fae8e8..36cc720 100644
--- a/tools/libfsimage/zfs/filesys.h
+++ b/tools/libfsimage/zfs/filesys.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/fsi_zfs.c b/tools/libfsimage/zfs/fsi_zfs.c
index 05148bc..7514587 100644
--- a/tools/libfsimage/zfs/fsi_zfs.c
+++ b/tools/libfsimage/zfs/fsi_zfs.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/fsi_zfs.h b/tools/libfsimage/zfs/fsi_zfs.h
index c849147..fb4dc23 100644
--- a/tools/libfsimage/zfs/fsi_zfs.h
+++ b/tools/libfsimage/zfs/fsi_zfs.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/fsys_zfs.c b/tools/libfsimage/zfs/fsys_zfs.c
index 863232b..e881fbb 100644
--- a/tools/libfsimage/zfs/fsys_zfs.c
+++ b/tools/libfsimage/zfs/fsys_zfs.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/fsys_zfs.h b/tools/libfsimage/zfs/fsys_zfs.h
index 36aa1e7..5cd627d 100644
--- a/tools/libfsimage/zfs/fsys_zfs.h
+++ b/tools/libfsimage/zfs/fsys_zfs.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/mb_info.h b/tools/libfsimage/zfs/mb_info.h
index 1e1e63b..16e9013 100644
--- a/tools/libfsimage/zfs/mb_info.h
+++ b/tools/libfsimage/zfs/mb_info.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
diff --git a/tools/libfsimage/zfs/shared.h b/tools/libfsimage/zfs/shared.h
index 0bf6751..e4a239a 100644
--- a/tools/libfsimage/zfs/shared.h
+++ b/tools/libfsimage/zfs/shared.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/dmu.h b/tools/libfsimage/zfs/zfs-include/dmu.h
index 7faa708..b39a951 100644
--- a/tools/libfsimage/zfs/zfs-include/dmu.h
+++ b/tools/libfsimage/zfs/zfs-include/dmu.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/dmu_objset.h b/tools/libfsimage/zfs/zfs-include/dmu_objset.h
index 8d1cf8e..2680fcf 100644
--- a/tools/libfsimage/zfs/zfs-include/dmu_objset.h
+++ b/tools/libfsimage/zfs/zfs-include/dmu_objset.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/dnode.h b/tools/libfsimage/zfs/zfs-include/dnode.h
index 5f6e4bb..dd90a87 100644
--- a/tools/libfsimage/zfs/zfs-include/dnode.h
+++ b/tools/libfsimage/zfs/zfs-include/dnode.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/dsl_dataset.h b/tools/libfsimage/zfs/zfs-include/dsl_dataset.h
index a8c60c4..470b607 100644
--- a/tools/libfsimage/zfs/zfs-include/dsl_dataset.h
+++ b/tools/libfsimage/zfs/zfs-include/dsl_dataset.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/dsl_dir.h b/tools/libfsimage/zfs/zfs-include/dsl_dir.h
index 591e197..0e71b6b 100644
--- a/tools/libfsimage/zfs/zfs-include/dsl_dir.h
+++ b/tools/libfsimage/zfs/zfs-include/dsl_dir.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/sa_impl.h b/tools/libfsimage/zfs/zfs-include/sa_impl.h
index 4d93558..0679c93 100644
--- a/tools/libfsimage/zfs/zfs-include/sa_impl.h
+++ b/tools/libfsimage/zfs/zfs-include/sa_impl.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/spa.h b/tools/libfsimage/zfs/zfs-include/spa.h
index 4c94331..ee708ef 100644
--- a/tools/libfsimage/zfs/zfs-include/spa.h
+++ b/tools/libfsimage/zfs/zfs-include/spa.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/uberblock_impl.h b/tools/libfsimage/zfs/zfs-include/uberblock_impl.h
index 9e70ac0..282d3b9 100644
--- a/tools/libfsimage/zfs/zfs-include/uberblock_impl.h
+++ b/tools/libfsimage/zfs/zfs-include/uberblock_impl.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/vdev_impl.h b/tools/libfsimage/zfs/zfs-include/vdev_impl.h
index ba427dd..f3823b2 100644
--- a/tools/libfsimage/zfs/zfs-include/vdev_impl.h
+++ b/tools/libfsimage/zfs/zfs-include/vdev_impl.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/zap_impl.h b/tools/libfsimage/zfs/zfs-include/zap_impl.h
index ebb57a6..7adfc34 100644
--- a/tools/libfsimage/zfs/zfs-include/zap_impl.h
+++ b/tools/libfsimage/zfs/zfs-include/zap_impl.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/zap_leaf.h b/tools/libfsimage/zfs/zfs-include/zap_leaf.h
index 4ff3515..ecf3faa 100644
--- a/tools/libfsimage/zfs/zfs-include/zap_leaf.h
+++ b/tools/libfsimage/zfs/zfs-include/zap_leaf.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/zfs.h b/tools/libfsimage/zfs/zfs-include/zfs.h
index c0887d5..592d23a 100644
--- a/tools/libfsimage/zfs/zfs-include/zfs.h
+++ b/tools/libfsimage/zfs/zfs-include/zfs.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/zfs_acl.h b/tools/libfsimage/zfs/zfs-include/zfs_acl.h
index 77ebb8d..6a685e0 100644
--- a/tools/libfsimage/zfs/zfs-include/zfs_acl.h
+++ b/tools/libfsimage/zfs/zfs-include/zfs_acl.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/zfs_znode.h b/tools/libfsimage/zfs/zfs-include/zfs_znode.h
index ea37c08..902c497 100644
--- a/tools/libfsimage/zfs/zfs-include/zfs_znode.h
+++ b/tools/libfsimage/zfs/zfs-include/zfs_znode.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/zil.h b/tools/libfsimage/zfs/zfs-include/zil.h
index 87c1dc5..f34173c 100644
--- a/tools/libfsimage/zfs/zfs-include/zil.h
+++ b/tools/libfsimage/zfs/zfs-include/zil.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/zio.h b/tools/libfsimage/zfs/zfs-include/zio.h
index 298017a..513f321 100644
--- a/tools/libfsimage/zfs/zfs-include/zio.h
+++ b/tools/libfsimage/zfs/zfs-include/zio.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs-include/zio_checksum.h b/tools/libfsimage/zfs/zfs-include/zio_checksum.h
index 28ba792..5f22f29 100644
--- a/tools/libfsimage/zfs/zfs-include/zio_checksum.h
+++ b/tools/libfsimage/zfs/zfs-include/zio_checksum.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs_fletcher.c b/tools/libfsimage/zfs/zfs_fletcher.c
index 34a034e..cc9e8c4 100644
--- a/tools/libfsimage/zfs/zfs_fletcher.c
+++ b/tools/libfsimage/zfs/zfs_fletcher.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs_lzjb.c b/tools/libfsimage/zfs/zfs_lzjb.c
index c617362..0b11eee 100644
--- a/tools/libfsimage/zfs/zfs_lzjb.c
+++ b/tools/libfsimage/zfs/zfs_lzjb.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libfsimage/zfs/zfs_sha256.c b/tools/libfsimage/zfs/zfs_sha256.c
index 393eaee..616fc0a 100644
--- a/tools/libfsimage/zfs/zfs_sha256.c
+++ b/tools/libfsimage/zfs/zfs_sha256.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
diff --git a/tools/libvchan/Makefile b/tools/libvchan/Makefile
index d768df4..3c50fe6 100644
--- a/tools/libvchan/Makefile
+++ b/tools/libvchan/Makefile
@@ -42,13 +42,13 @@ vchan-node2: $(NODE2_OBJS) libxenvchan.so
.PHONY: install
install: all
- $(INSTALL_DIR) $(DESTDIR)$(LIBDIR)
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_PROG) libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)
- ln -sf libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxenvchan.so.$(MAJOR)
- ln -sf libxenvchan.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxenvchan.so
- $(INSTALL_DATA) libxenvchan.h $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_DATA) libxenvchan.a $(DESTDIR)$(LIBDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(libdir)
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)
+ $(INSTALL_PROG) libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)
+ ln -sf libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)/libxenvchan.so.$(MAJOR)
+ ln -sf libxenvchan.so.$(MAJOR) $(DESTDIR)$(libdir)/libxenvchan.so
+ $(INSTALL_DATA) libxenvchan.h $(DESTDIR)$(includedir)
+ $(INSTALL_DATA) libxenvchan.a $(DESTDIR)$(libdir)
.PHONY: clean
clean:
diff --git a/tools/libvchan/init.c b/tools/libvchan/init.c
index de10817..77be4e7 100644
--- a/tools/libvchan/init.c
+++ b/tools/libvchan/init.c
@@ -21,8 +21,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* @section DESCRIPTION
*
diff --git a/tools/libvchan/io.c b/tools/libvchan/io.c
index e66bc4e..8a9629b 100644
--- a/tools/libvchan/io.c
+++ b/tools/libvchan/io.c
@@ -21,8 +21,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* @section DESCRIPTION
*
diff --git a/tools/libvchan/libxenvchan.h b/tools/libvchan/libxenvchan.h
index 6365d36..0944a0e 100644
--- a/tools/libvchan/libxenvchan.h
+++ b/tools/libvchan/libxenvchan.h
@@ -21,8 +21,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* @section DESCRIPTION
*
diff --git a/tools/libvchan/node-select.c b/tools/libvchan/node-select.c
index 13c5822..0394644 100644
--- a/tools/libvchan/node-select.c
+++ b/tools/libvchan/node-select.c
@@ -21,8 +21,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* @section DESCRIPTION
*
@@ -39,7 +38,7 @@
#include <libxenvchan.h>
-void usage(char** argv)
+static void usage(char** argv)
{
fprintf(stderr, "usage:\n"
"\t%s [client|server] domainid nodepath [rbufsiz wbufsiz]\n",
@@ -54,10 +53,12 @@ int insiz = 0;
int outsiz = 0;
struct libxenvchan *ctrl = 0;
-void vchan_wr() {
+static void vchan_wr(void) {
+ int ret;
+
if (!insiz)
return;
- int ret = libxenvchan_write(ctrl, inbuf, insiz);
+ ret = libxenvchan_write(ctrl, inbuf, insiz);
if (ret < 0) {
fprintf(stderr, "vchan write failed\n");
exit(1);
@@ -68,10 +69,12 @@ void vchan_wr() {
}
}
-void stdout_wr() {
+static void stdout_wr(void) {
+ int ret;
+
if (!outsiz)
return;
- int ret = write(1, outbuf, outsiz);
+ ret = write(1, outbuf, outsiz);
if (ret < 0 && errno != EAGAIN)
exit(1);
if (ret > 0) {
diff --git a/tools/libvchan/node.c b/tools/libvchan/node.c
index cab8368..f1638f0 100644
--- a/tools/libvchan/node.c
+++ b/tools/libvchan/node.c
@@ -21,8 +21,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* @section DESCRIPTION
*
diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
index bd2ca6c..a0f899b 100644
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -1,10 +1,16 @@
XEN_ROOT = $(CURDIR)/../..
include $(XEN_ROOT)/tools/Rules.mk
-MAJOR = 4.5
+MAJOR = 4.6
MINOR = 0
+ifeq ($(CONFIG_LIBXC_MINIOS),y)
+# Save/restore of a domain is currently incompatible with a stubdom environment
+override CONFIG_MIGRATE := n
+endif
+
CTRL_SRCS-y :=
+CTRL_SRCS-y += xc_altp2m.c
CTRL_SRCS-y += xc_core.c
CTRL_SRCS-$(CONFIG_X86) += xc_core_x86.c
CTRL_SRCS-$(CONFIG_ARM) += xc_core_arm.c
@@ -16,7 +22,6 @@ CTRL_SRCS-y += xc_misc.c
CTRL_SRCS-y += xc_flask.c
CTRL_SRCS-y += xc_physdev.c
CTRL_SRCS-y += xc_private.c
-CTRL_SRCS-y += xc_sedf.c
CTRL_SRCS-y += xc_csched.c
CTRL_SRCS-y += xc_csched2.c
CTRL_SRCS-y += xc_arinc653.c
@@ -26,7 +31,8 @@ CTRL_SRCS-y += xc_pm.c
CTRL_SRCS-y += xc_cpu_hotplug.c
CTRL_SRCS-y += xc_resume.c
CTRL_SRCS-y += xc_tmem.c
-CTRL_SRCS-y += xc_mem_event.c
+CTRL_SRCS-y += xc_vm_event.c
+CTRL_SRCS-y += xc_monitor.c
CTRL_SRCS-y += xc_mem_paging.c
CTRL_SRCS-y += xc_mem_access.c
CTRL_SRCS-y += xc_memshr.c
@@ -42,12 +48,21 @@ CTRL_SRCS-$(CONFIG_Linux) += xc_linux.c xc_linux_osdep.c
CTRL_SRCS-$(CONFIG_FreeBSD) += xc_freebsd.c xc_freebsd_osdep.c
CTRL_SRCS-$(CONFIG_SunOS) += xc_solaris.c
CTRL_SRCS-$(CONFIG_NetBSD) += xc_netbsd.c
+CTRL_SRCS-$(CONFIG_NetBSDRump) += xc_netbsd.c
CTRL_SRCS-$(CONFIG_MiniOS) += xc_minios.c
GUEST_SRCS-y :=
GUEST_SRCS-y += xg_private.c xc_suspend.c
ifeq ($(CONFIG_MIGRATE),y)
-GUEST_SRCS-y += xc_domain_restore.c xc_domain_save.c
+GUEST_SRCS-y += xc_sr_common.c
+GUEST_SRCS-$(CONFIG_X86) += xc_sr_common_x86.c
+GUEST_SRCS-$(CONFIG_X86) += xc_sr_common_x86_pv.c
+GUEST_SRCS-$(CONFIG_X86) += xc_sr_restore_x86_pv.c
+GUEST_SRCS-$(CONFIG_X86) += xc_sr_restore_x86_hvm.c
+GUEST_SRCS-$(CONFIG_X86) += xc_sr_save_x86_pv.c
+GUEST_SRCS-$(CONFIG_X86) += xc_sr_save_x86_hvm.c
+GUEST_SRCS-y += xc_sr_restore.c
+GUEST_SRCS-y += xc_sr_save.c
GUEST_SRCS-y += xc_offline_page.c xc_compression.c
else
GUEST_SRCS-y += xc_nomigrate.c
@@ -145,18 +160,18 @@ libs: $(LIB)
.PHONY: install
install: build
- $(INSTALL_DIR) $(DESTDIR)$(LIBDIR)
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_SHLIB) libxenctrl.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)
- $(INSTALL_DATA) libxenctrl.a $(DESTDIR)$(LIBDIR)
- $(SYMLINK_SHLIB) libxenctrl.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxenctrl.so.$(MAJOR)
- $(SYMLINK_SHLIB) libxenctrl.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxenctrl.so
- $(INSTALL_DATA) include/xenctrl.h include/xenctrlosdep.h include/xentoollog.h $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_SHLIB) libxenguest.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)
- $(INSTALL_DATA) libxenguest.a $(DESTDIR)$(LIBDIR)
- $(SYMLINK_SHLIB) libxenguest.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxenguest.so.$(MAJOR)
- $(SYMLINK_SHLIB) libxenguest.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxenguest.so
- $(INSTALL_DATA) include/xenguest.h $(DESTDIR)$(INCLUDEDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(libdir)
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)
+ $(INSTALL_SHLIB) libxenctrl.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)
+ $(INSTALL_DATA) libxenctrl.a $(DESTDIR)$(libdir)
+ $(SYMLINK_SHLIB) libxenctrl.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)/libxenctrl.so.$(MAJOR)
+ $(SYMLINK_SHLIB) libxenctrl.so.$(MAJOR) $(DESTDIR)$(libdir)/libxenctrl.so
+ $(INSTALL_DATA) include/xenctrl.h include/xenctrlosdep.h include/xentoollog.h $(DESTDIR)$(includedir)
+ $(INSTALL_SHLIB) libxenguest.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)
+ $(INSTALL_DATA) libxenguest.a $(DESTDIR)$(libdir)
+ $(SYMLINK_SHLIB) libxenguest.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)/libxenguest.so.$(MAJOR)
+ $(SYMLINK_SHLIB) libxenguest.so.$(MAJOR) $(DESTDIR)$(libdir)/libxenguest.so
+ $(INSTALL_DATA) include/xenguest.h $(DESTDIR)$(includedir)
.PHONY: TAGS
TAGS:
@@ -170,6 +185,9 @@ clean:
$(GUEST_LIB_OBJS) $(GUEST_PIC_OBJS) \
$(OSDEP_LIB_OBJS) $(OSDEP_PIC_OBJS)
+.PHONY: distclean
+distclean: clean
+
.PHONY: rpm
rpm: build
rm -rf staging
diff --git a/tools/libxc/include/xc_dom.h b/tools/libxc/include/xc_dom.h
index 07d7224..600aef6 100644
--- a/tools/libxc/include/xc_dom.h
+++ b/tools/libxc/include/xc_dom.h
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/libelf/libelf.h>
@@ -119,8 +118,10 @@ struct xc_dom_image {
/* physical memory
*
- * An x86 PV guest has a single contiguous block of physical RAM,
- * consisting of total_pages starting at rambase_pfn.
+ * An x86 PV guest has one or more blocks of physical RAM,
+ * consisting of total_pages starting at rambase_pfn. The start
+ * address and size of each block is controlled by vNUMA
+ * structures.
*
* An ARM guest has GUEST_RAM_BANKS regions of RAM, with
* rambank_size[i] pages in each. The lowest RAM address
@@ -129,6 +130,7 @@ struct xc_dom_image {
*/
xen_pfn_t rambase_pfn;
xen_pfn_t total_pages;
+ xen_pfn_t p2m_size; /* number of pfns covered by p2m */
struct xc_dom_phys *phys_pages;
int realmodearea_log;
#if defined (__arm__) || defined(__aarch64__)
@@ -167,6 +169,12 @@ struct xc_dom_image {
struct xc_dom_loader *kernel_loader;
void *private_loader;
+ /* vNUMA information */
+ xen_vmemrange_t *vmemranges;
+ unsigned int nr_vmemranges;
+ unsigned int *vnode_to_pnode;
+ unsigned int nr_vnodes;
+
/* kernel loader */
struct xc_dom_arch *arch_hooks;
/* allocate up to virt_alloc_end */
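For orientation, a hedged sketch of filling in the new vNUMA fields for the simplest case of a single vnode covering all guest RAM; the xen_vmemrange_t field names (start, end, flags, nid) and XC_PAGE_SHIFT are assumptions based on the Xen 4.6 public interface:

    /* Sketch only: one vnode spanning all of guest RAM, mapped to
     * physical node 0. Allocation and error handling elided. */
    static xen_vmemrange_t range;
    static unsigned int v2p;            /* vnode 0 -> pnode 0 */

    static void dom_set_single_vnode(struct xc_dom_image *dom)
    {
        range.start = (uint64_t)dom->rambase_pfn << XC_PAGE_SHIFT;
        range.end   = range.start +
                      ((uint64_t)dom->total_pages << XC_PAGE_SHIFT);
        range.flags = 0;
        range.nid   = 0;

        dom->vmemranges     = &range;
        dom->nr_vmemranges  = 1;
        dom->vnode_to_pnode = &v2p;
        dom->nr_vnodes      = 1;
    }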
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 0ad8b8d..37205c2 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -19,8 +19,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef XENCTRL_H
@@ -34,6 +33,7 @@
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
+#include <stdbool.h>
#include <xen/xen.h>
#include <xen/domctl.h>
#include <xen/physdev.h>
@@ -269,7 +269,7 @@ typedef struct xc_hypercall_buffer xc_hypercall_buffer_t;
* transparently converted to the hypercall buffer as necessary.
*/
#define DECLARE_HYPERCALL_BUFFER(_type, _name) \
- _type *_name = NULL; \
+ _type *(_name) = NULL; \
xc_hypercall_buffer_t XC__HYPERCALL_BUFFER_NAME(_name) = { \
.hbuf = NULL, \
.param_shadow = NULL, \
@@ -287,10 +287,11 @@ typedef struct xc_hypercall_buffer xc_hypercall_buffer_t;
* required.
*/
#define DECLARE_HYPERCALL_BUFFER_SHADOW(_type, _name, _hbuf) \
- _type *_name = _hbuf->hbuf; \
+ _type *(_name) = (_hbuf)->hbuf; \
+ __attribute__((unused)) \
xc_hypercall_buffer_t XC__HYPERCALL_BUFFER_NAME(_name) = { \
.hbuf = (void *)-1, \
- .param_shadow = _hbuf, \
+ .param_shadow = (_hbuf), \
HYPERCALL_BUFFER_INIT_NO_BOUNCE \
}
@@ -301,7 +302,7 @@ typedef struct xc_hypercall_buffer xc_hypercall_buffer_t;
#define DECLARE_HYPERCALL_BUFFER_ARGUMENT(_name) \
xc_hypercall_buffer_t XC__HYPERCALL_BUFFER_NAME(_name) = { \
.hbuf = (void *)-1, \
- .param_shadow = _name, \
+ .param_shadow = (_name), \
HYPERCALL_BUFFER_INIT_NO_BOUNCE \
}
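The added parentheses around _name and _hbuf are routine macro hygiene rather than a behavioural change: an argument containing an operator would otherwise expand with surprising precedence. A generic illustration, not tied to any Xen interface:

    #define BAD(p)  *p       /* BAD(a + 1)  expands to *a + 1     */
    #define GOOD(p) (*(p))   /* GOOD(a + 1) expands to (*(a + 1)) */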
@@ -321,16 +322,24 @@ typedef struct xc_hypercall_buffer xc_hypercall_buffer_t;
* Set a xen_guest_handle in a type safe manner, ensuring that the
* data pointer has been correctly allocated.
*/
-#undef set_xen_guest_handle
-#define set_xen_guest_handle(_hnd, _val) \
+#define set_xen_guest_handle_impl(_hnd, _val, _byte_off) \
do { \
xc_hypercall_buffer_t _hcbuf_hnd1; \
typeof(XC__HYPERCALL_BUFFER_NAME(_val)) *_hcbuf_hnd2 = \
HYPERCALL_BUFFER(_val); \
(void) (&_hcbuf_hnd1 == _hcbuf_hnd2); \
- set_xen_guest_handle_raw(_hnd, (_hcbuf_hnd2)->hbuf); \
+ set_xen_guest_handle_raw(_hnd, \
+ (_hcbuf_hnd2)->hbuf + (_byte_off)); \
} while (0)
+#undef set_xen_guest_handle
+#define set_xen_guest_handle(_hnd, _val) \
+ set_xen_guest_handle_impl(_hnd, _val, 0)
+
+#define set_xen_guest_handle_offset(_hnd, _val, _off) \
+ set_xen_guest_handle_impl(_hnd, _val, \
+ ((sizeof(*_val)*(_off))))
+
/* Use with set_xen_guest_handle in place of NULL */
extern xc_hypercall_buffer_t XC__HYPERCALL_BUFFER_NAME(HYPERCALL_BUFFER_NULL);
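set_xen_guest_handle_offset() treats _off as an element index, converting it to bytes via sizeof(*_val). A hedged caller-side sketch; op.u.example.handle is a placeholder, not a real hypercall field:

    DECLARE_HYPERCALL_BUFFER(uint64_t, data);

    data = xc_hypercall_buffer_alloc(xch, data, 16 * sizeof(*data));
    /* Point the guest handle at element 4 of the buffer, i.e. at
     * byte offset 4 * sizeof(uint64_t). */
    set_xen_guest_handle_offset(op.u.example.handle, data, 4);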
@@ -348,7 +357,12 @@ void xc__hypercall_buffer_free(xc_interface *xch, xc_hypercall_buffer_t *b);
void *xc__hypercall_buffer_alloc_pages(xc_interface *xch, xc_hypercall_buffer_t *b, int nr_pages);
#define xc_hypercall_buffer_alloc_pages(_xch, _name, _nr) xc__hypercall_buffer_alloc_pages(_xch, HYPERCALL_BUFFER(_name), _nr)
void xc__hypercall_buffer_free_pages(xc_interface *xch, xc_hypercall_buffer_t *b, int nr_pages);
-#define xc_hypercall_buffer_free_pages(_xch, _name, _nr) xc__hypercall_buffer_free_pages(_xch, HYPERCALL_BUFFER(_name), _nr)
+#define xc_hypercall_buffer_free_pages(_xch, _name, _nr) \
+ do { \
+ if ( _name ) \
+ xc__hypercall_buffer_free_pages(_xch, HYPERCALL_BUFFER(_name), \
+ _nr); \
+ } while (0)
/*
* Array of hypercall buffers.
@@ -361,7 +375,7 @@ void xc__hypercall_buffer_free_pages(xc_interface *xch, xc_hypercall_buffer_t *b
* buffer and call xc_hypercall_buffer_array_get().
*
* Destroy the array with xc_hypercall_buffer_array_destroy() to free
- * the array and all its alocated hypercall buffers.
+ * the array and all its allocated hypercall buffers.
*/
struct xc_hypercall_buffer_array;
typedef struct xc_hypercall_buffer_array xc_hypercall_buffer_array_t;
@@ -394,6 +408,15 @@ int xc_get_cpumap_size(xc_interface *xch);
/* allocate a cpumap */
xc_cpumap_t xc_cpumap_alloc(xc_interface *xch);
+/* clear a CPU from the cpumap. */
+void xc_cpumap_clearcpu(int cpu, xc_cpumap_t map);
+
+/* set a CPU in the cpumap. */
+void xc_cpumap_setcpu(int cpu, xc_cpumap_t map);
+
+/* Test whether a CPU is set in the cpumap. */
+int xc_cpumap_testcpu(int cpu, xc_cpumap_t map);
+
/*
* NODEMAP handling
*/
@@ -477,18 +500,20 @@ typedef union
} start_info_any_t;
#endif
+
+typedef struct xen_arch_domainconfig xc_domain_configuration_t;
+int xc_domain_create_config(xc_interface *xch,
+ uint32_t ssidref,
+ xen_domain_handle_t handle,
+ uint32_t flags,
+ uint32_t *pdomid,
+ xc_domain_configuration_t *config);
int xc_domain_create(xc_interface *xch,
uint32_t ssidref,
xen_domain_handle_t handle,
uint32_t flags,
uint32_t *pdomid);
-#if defined(__arm__) || defined(__aarch64__)
-typedef xen_domctl_arm_configuredomain_t xc_domain_configuration_t;
-
-int xc_domain_configure(xc_interface *xch, uint32_t domid,
- xc_domain_configuration_t *config);
-#endif
/* Functions to produce a dump of a given domain
* xc_domain_dumpcore - produces a dump to a specified file
@@ -849,18 +874,6 @@ int xc_shadow_control(xc_interface *xch,
uint32_t mode,
xc_shadow_op_stats_t *stats);
-int xc_sedf_domain_set(xc_interface *xch,
- uint32_t domid,
- uint64_t period, uint64_t slice,
- uint64_t latency, uint16_t extratime,
- uint16_t weight);
-
-int xc_sedf_domain_get(xc_interface *xch,
- uint32_t domid,
- uint64_t* period, uint64_t *slice,
- uint64_t *latency, uint16_t *extratime,
- uint16_t *weight);
-
int xc_sched_credit_domain_set(xc_interface *xch,
uint32_t domid,
struct xen_domctl_sched_credit *sdom);
@@ -1226,8 +1239,10 @@ int xc_readconsolering(xc_interface *xch,
int xc_send_debug_keys(xc_interface *xch, char *keys);
typedef xen_sysctl_physinfo_t xc_physinfo_t;
-typedef xen_sysctl_topologyinfo_t xc_topologyinfo_t;
+typedef xen_sysctl_cputopo_t xc_cputopo_t;
typedef xen_sysctl_numainfo_t xc_numainfo_t;
+typedef xen_sysctl_meminfo_t xc_meminfo_t;
+typedef xen_sysctl_pcitopoinfo_t xc_pcitopoinfo_t;
typedef uint32_t xc_cpu_to_node_t;
typedef uint32_t xc_cpu_to_socket_t;
@@ -1237,8 +1252,12 @@ typedef uint64_t xc_node_to_memfree_t;
typedef uint32_t xc_node_to_node_dist_t;
int xc_physinfo(xc_interface *xch, xc_physinfo_t *info);
-int xc_topologyinfo(xc_interface *xch, xc_topologyinfo_t *info);
-int xc_numainfo(xc_interface *xch, xc_numainfo_t *info);
+int xc_cputopoinfo(xc_interface *xch, unsigned *max_cpus,
+ xc_cputopo_t *cputopo);
+int xc_numainfo(xc_interface *xch, unsigned *max_nodes,
+ xc_meminfo_t *meminfo, uint32_t *distance);
+int xc_pcitopoinfo(xc_interface *xch, unsigned num_devs,
+ physdev_pci_device_t *devs, uint32_t *nodes);
int xc_sched_id(xc_interface *xch,
int *sched_id);
@@ -1253,7 +1272,7 @@ int xc_getcpuinfo(xc_interface *xch, int max_cpus,
int xc_domain_setmaxmem(xc_interface *xch,
uint32_t domid,
- unsigned int max_memkb);
+ uint64_t max_memkb);
int xc_domain_set_memmap_limit(xc_interface *xch,
uint32_t domid,
@@ -1268,6 +1287,24 @@ int xc_domain_setvnuma(xc_interface *xch,
unsigned int *vdistance,
unsigned int *vcpu_to_vnode,
unsigned int *vnode_to_pnode);
+/*
+ * Retrieve vNUMA configuration.
+ * domid: IN, target domid
+ * nr_vnodes: IN/OUT, number of vnodes, must not be NULL
+ * nr_vmemranges: IN/OUT, number of vmemranges, must not be NULL
+ * nr_vcpus: IN/OUT, number of vcpus, must not be NULL
+ * vmemranges: OUT, an array of length nr_vmemranges
+ * vdistance: OUT, an array of length nr_vnodes * nr_vnodes
+ * vcpu_to_vnode: OUT, an array of length nr_vcpus
+ */
+int xc_domain_getvnuma(xc_interface *xch,
+ uint32_t domid,
+ uint32_t *nr_vnodes,
+ uint32_t *nr_vmemranges,
+ uint32_t *nr_vcpus,
+ xen_vmemrange_t *vmemrange,
+ unsigned int *vdistance,
+ unsigned int *vcpu_to_vnode);
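A hedged usage sketch for xc_domain_getvnuma(); it assumes the caller already knows safe upper bounds for the three counts (the IN/OUT parameters let the hypervisor report the actual sizes back):

    uint32_t nr_vnodes = 4, nr_vmemranges = 8, nr_vcpus = 16;
    xen_vmemrange_t ranges[8];
    unsigned int distance[4 * 4], vcpu_to_vnode[16];

    if ( xc_domain_getvnuma(xch, domid, &nr_vnodes, &nr_vmemranges,
                            &nr_vcpus, ranges, distance,
                            vcpu_to_vnode) )
        perror("xc_domain_getvnuma");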
#if defined(__i386__) || defined(__x86_64__)
/*
@@ -1295,6 +1332,14 @@ int xc_get_machine_memory_map(xc_interface *xch,
struct e820entry entries[],
uint32_t max_entries);
#endif
+
+int xc_reserved_device_memory_map(xc_interface *xch,
+ uint32_t flags,
+ uint16_t seg,
+ uint8_t bus,
+ uint8_t devfn,
+ struct xen_reserved_device_memory entries[],
+ uint32_t *max_entries);
int xc_domain_set_time_offset(xc_interface *xch,
uint32_t domid,
int32_t time_offset_seconds);
@@ -1315,7 +1360,9 @@ int xc_domain_get_tsc_info(xc_interface *xch,
int xc_domain_disable_migrate(xc_interface *xch, uint32_t domid);
-int xc_domain_maximum_gpfn(xc_interface *xch, domid_t domid);
+int xc_domain_maximum_gpfn(xc_interface *xch, domid_t domid, xen_pfn_t *gpfns);
+
+int xc_domain_nr_gpfns(xc_interface *xch, domid_t domid, xen_pfn_t *gpfns);
int xc_domain_increase_reservation(xc_interface *xch,
uint32_t domid,
@@ -1509,7 +1556,7 @@ int xc_mmuext_op(xc_interface *xch, struct mmuext_op *op, unsigned int nr_ops,
domid_t dom);
/* System wide memory properties */
-long xc_maximum_ram_page(xc_interface *xch);
+int xc_maximum_ram_page(xc_interface *xch, unsigned long *max_mfn);
/* Get current total pages allocated to a domain. */
long xc_get_tot_pages(xc_interface *xch, uint32_t domid);
@@ -1574,7 +1621,7 @@ int xc_tbuf_set_size(xc_interface *xch, unsigned long size);
*/
int xc_tbuf_get_size(xc_interface *xch, unsigned long *size);
-int xc_tbuf_set_cpu_mask(xc_interface *xch, uint32_t mask);
+int xc_tbuf_set_cpu_mask(xc_interface *xch, xc_cpumap_t mask);
int xc_tbuf_set_evt_mask(xc_interface *xch, uint32_t mask);
@@ -1899,7 +1946,8 @@ int xc_get_hvm_param(xc_interface *handle, domid_t dom, int param, unsigned long
*
* @parm xch a handle to an open hypervisor interface.
* @parm domid the domain id to be serviced
- * @parm handle_bufioreq should the IOREQ Server handle buffered requests?
+ * @parm handle_bufioreq how should the IOREQ Server handle buffered requests
+ * (HVM_IOREQSRV_BUFIOREQ_*)?
* @parm id pointer to an ioservid_t to receive the IOREQ Server id.
* @return 0 on success, -1 on failure.
*/
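With this change handle_bufioreq is no longer a plain boolean. A hedged sketch, assuming the HVM_IOREQSRV_BUFIOREQ_* values come from the HVM public headers and that the declaration being documented is xc_hvm_create_ioreq_server():

    ioservid_t id;

    if ( xc_hvm_create_ioreq_server(xch, domid,
                                    HVM_IOREQSRV_BUFIOREQ_ATOMIC, &id) )
        perror("create ioreq server");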
@@ -2036,22 +2084,33 @@ int xc_hvm_destroy_ioreq_server(xc_interface *xch,
/* HVM guest pass-through */
int xc_assign_device(xc_interface *xch,
uint32_t domid,
- uint32_t machine_bdf);
+ uint32_t machine_sbdf,
+ uint32_t flag);
int xc_get_device_group(xc_interface *xch,
uint32_t domid,
- uint32_t machine_bdf,
+ uint32_t machine_sbdf,
uint32_t max_sdevs,
uint32_t *num_sdevs,
uint32_t *sdev_array);
int xc_test_assign_device(xc_interface *xch,
uint32_t domid,
- uint32_t machine_bdf);
+ uint32_t machine_sbdf);
int xc_deassign_device(xc_interface *xch,
uint32_t domid,
- uint32_t machine_bdf);
+ uint32_t machine_sbdf);
+
+int xc_assign_dt_device(xc_interface *xch,
+ uint32_t domid,
+ char *path);
+int xc_test_assign_dt_device(xc_interface *xch,
+ uint32_t domid,
+ char *path);
+int xc_deassign_dt_device(xc_interface *xch,
+ uint32_t domid,
+ char *path);
int xc_domain_memory_mapping(xc_interface *xch,
uint32_t domid,
@@ -2110,6 +2169,16 @@ int xc_domain_bind_pt_isa_irq(xc_interface *xch,
uint32_t domid,
uint8_t machine_irq);
+int xc_domain_bind_pt_spi_irq(xc_interface *xch,
+ uint32_t domid,
+ uint16_t vspi,
+ uint16_t spi);
+
+int xc_domain_unbind_pt_spi_irq(xc_interface *xch,
+ uint32_t domid,
+ uint16_t vspi,
+ uint16_t spi);
+
int xc_domain_set_machine_address_size(xc_interface *xch,
uint32_t domid,
unsigned int width);
@@ -2244,16 +2313,12 @@ int xc_disable_turbo(xc_interface *xch, int cpuid);
* tmem operations
*/
-struct tmem_oid {
- uint64_t oid[3];
-};
-
int xc_tmem_control_oid(xc_interface *xch, int32_t pool_id, uint32_t subop,
uint32_t cli_id, uint32_t arg1, uint32_t arg2,
- struct tmem_oid oid, void *buf);
+ struct xen_tmem_oid oid, void *buf);
int xc_tmem_control(xc_interface *xch,
int32_t pool_id, uint32_t subop, uint32_t cli_id,
- uint32_t arg1, uint32_t arg2, uint64_t arg3, void *buf);
+ uint32_t arg1, uint32_t arg2, void *buf);
int xc_tmem_auth(xc_interface *xch, int cli_id, char *uuid_str, int arg1);
int xc_tmem_save(xc_interface *xch, int dom, int live, int fd, int field_marker);
int xc_tmem_save_extra(xc_interface *xch, int dom, int fd, int field_marker);
@@ -2261,6 +2326,28 @@ void xc_tmem_save_done(xc_interface *xch, int dom);
int xc_tmem_restore(xc_interface *xch, int dom, int fd);
int xc_tmem_restore_extra(xc_interface *xch, int dom, int fd);
+/**
+ * altp2m operations
+ */
+
+int xc_altp2m_get_domain_state(xc_interface *handle, domid_t dom, bool *state);
+int xc_altp2m_set_domain_state(xc_interface *handle, domid_t dom, bool state);
+int xc_altp2m_set_vcpu_enable_notify(xc_interface *handle, domid_t domid,
+ uint32_t vcpuid, xen_pfn_t gfn);
+int xc_altp2m_create_view(xc_interface *handle, domid_t domid,
+ xenmem_access_t default_access, uint16_t *view_id);
+int xc_altp2m_destroy_view(xc_interface *handle, domid_t domid,
+ uint16_t view_id);
+/* Switch all vCPUs of the domain to the specified altp2m view */
+int xc_altp2m_switch_to_view(xc_interface *handle, domid_t domid,
+ uint16_t view_id);
+int xc_altp2m_set_mem_access(xc_interface *handle, domid_t domid,
+ uint16_t view_id, xen_pfn_t gfn,
+ xenmem_access_t access);
+int xc_altp2m_change_gfn(xc_interface *handle, domid_t domid,
+ uint16_t view_id, xen_pfn_t old_gfn,
+ xen_pfn_t new_gfn);
+
/**
* Mem paging operations.
* Paging is supported only on the x86 architecture in 64 bit mode, with
@@ -2269,12 +2356,13 @@ int xc_tmem_restore_extra(xc_interface *xch, int dom, int fd);
*/
int xc_mem_paging_enable(xc_interface *xch, domid_t domain_id, uint32_t *port);
int xc_mem_paging_disable(xc_interface *xch, domid_t domain_id);
+int xc_mem_paging_resume(xc_interface *xch, domid_t domain_id);
int xc_mem_paging_nominate(xc_interface *xch, domid_t domain_id,
- unsigned long gfn);
-int xc_mem_paging_evict(xc_interface *xch, domid_t domain_id, unsigned long gfn);
-int xc_mem_paging_prep(xc_interface *xch, domid_t domain_id, unsigned long gfn);
-int xc_mem_paging_load(xc_interface *xch, domid_t domain_id,
- unsigned long gfn, void *buffer);
+ uint64_t gfn);
+int xc_mem_paging_evict(xc_interface *xch, domid_t domain_id, uint64_t gfn);
+int xc_mem_paging_prep(xc_interface *xch, domid_t domain_id, uint64_t gfn);
+int xc_mem_paging_load(xc_interface *xch, domid_t domain_id,
+ uint64_t gfn, void *buffer);
/**
* Access tracking operations.
@@ -2282,17 +2370,6 @@ int xc_mem_paging_load(xc_interface *xch, domid_t domain_id,
*/
/*
- * Enables mem_access and returns the mapped ring page.
- * Will return NULL on error.
- * Caller has to unmap this page when done.
- */
-void *xc_mem_access_enable(xc_interface *xch, domid_t domain_id, uint32_t *port);
-void *xc_mem_access_enable_introspection(xc_interface *xch, domid_t domain_id,
- uint32_t *port);
-int xc_mem_access_disable(xc_interface *xch, domid_t domain_id);
-int xc_mem_access_resume(xc_interface *xch, domid_t domain_id);
-
-/*
* Set a range of memory to a specific access.
* Allowed types are XENMEM_access_default, XENMEM_access_n, any combination of
* XENMEM_access_ + (rwx), and XENMEM_access_rx2rw
@@ -2307,6 +2384,47 @@ int xc_set_mem_access(xc_interface *xch, domid_t domain_id,
int xc_get_mem_access(xc_interface *xch, domid_t domain_id,
uint64_t pfn, xenmem_access_t *access);
+/*
+ * Instructions causing a mem_access violation can be emulated by Xen
+ * to progress the execution without having to relax the mem_access
+ * permissions.
+ * This feature has to be enabled first; then, in the vm_event
+ * response to a mem_access event, it can be indicated whether the
+ * instruction should be emulated.
+ */
+int xc_mem_access_enable_emulate(xc_interface *xch, domid_t domain_id);
+int xc_mem_access_disable_emulate(xc_interface *xch, domid_t domain_id);
+
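A hedged sketch of the enable/disable pair; whether a given violation is actually emulated is signalled per event in the vm_event response, which is outside this header:

    if ( xc_mem_access_enable_emulate(xch, domid) )
        perror("enable mem_access emulation");
    /* ... handle mem_access events, opting into emulation in
     *     individual vm_event responses ... */
    xc_mem_access_disable_emulate(xch, domid);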
+/***
+ * Monitor control operations.
+ *
+ * Enables the VM event monitor ring and returns the mapped ring page.
+ * This ring is used to deliver mem_access events, as well as a set of additional
+ * events that can be enabled with the xc_monitor_* functions.
+ *
+ * Will return NULL on error.
+ * Caller has to unmap this page when done.
+ */
+void *xc_monitor_enable(xc_interface *xch, domid_t domain_id, uint32_t *port);
+int xc_monitor_disable(xc_interface *xch, domid_t domain_id);
+int xc_monitor_resume(xc_interface *xch, domid_t domain_id);
+/*
+ * Get a bitmap of supported monitor events in the form
+ * (1 << XEN_DOMCTL_MONITOR_EVENT_*).
+ */
+int xc_monitor_get_capabilities(xc_interface *xch, domid_t domain_id,
+ uint32_t *capabilities);
+int xc_monitor_write_ctrlreg(xc_interface *xch, domid_t domain_id,
+ uint16_t index, bool enable, bool sync,
+ bool onchangeonly);
+int xc_monitor_mov_to_msr(xc_interface *xch, domid_t domain_id, bool enable,
+ bool extended_capture);
+int xc_monitor_singlestep(xc_interface *xch, domid_t domain_id, bool enable);
+int xc_monitor_software_breakpoint(xc_interface *xch, domid_t domain_id,
+ bool enable);
+int xc_monitor_guest_request(xc_interface *xch, domid_t domain_id,
+ bool enable, bool sync);
+
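A hedged sketch of the monitor lifecycle described above; munmap() with XC_PAGE_SIZE is an assumption about how "caller has to unmap this page" is satisfied:

    uint32_t port;
    void *ring = xc_monitor_enable(xch, domid, &port);

    if ( !ring )
        perror("xc_monitor_enable");
    else
    {
        xc_monitor_singlestep(xch, domid, true);  /* opt into one event */
        /* ... consume events from the ring, synchronising via the
         *     event channel bound to 'port' ... */
        xc_monitor_singlestep(xch, domid, false);
        xc_monitor_disable(xch, domid);
        munmap(ring, XC_PAGE_SIZE);               /* <sys/mman.h> */
    }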
/***
* Memory sharing operations.
*
@@ -2688,20 +2806,39 @@ int xc_resource_op(xc_interface *xch, uint32_t nr_ops, xc_resource_op_t *ops);
#if defined(__i386__) || defined(__x86_64__)
enum xc_psr_cmt_type {
XC_PSR_CMT_L3_OCCUPANCY,
+ XC_PSR_CMT_TOTAL_MEM_COUNT,
+ XC_PSR_CMT_LOCAL_MEM_COUNT,
};
typedef enum xc_psr_cmt_type xc_psr_cmt_type;
+
+enum xc_psr_cat_type {
+ XC_PSR_CAT_L3_CBM = 1,
+};
+typedef enum xc_psr_cat_type xc_psr_cat_type;
+
int xc_psr_cmt_attach(xc_interface *xch, uint32_t domid);
int xc_psr_cmt_detach(xc_interface *xch, uint32_t domid);
int xc_psr_cmt_get_domain_rmid(xc_interface *xch, uint32_t domid,
- uint32_t *rmid);
+ uint32_t *rmid);
int xc_psr_cmt_get_total_rmid(xc_interface *xch, uint32_t *total_rmid);
int xc_psr_cmt_get_l3_upscaling_factor(xc_interface *xch,
- uint32_t *upscaling_factor);
+ uint32_t *upscaling_factor);
+int xc_psr_cmt_get_l3_event_mask(xc_interface *xch, uint32_t *event_mask);
int xc_psr_cmt_get_l3_cache_size(xc_interface *xch, uint32_t cpu,
- uint32_t *l3_cache_size);
-int xc_psr_cmt_get_data(xc_interface *xch, uint32_t rmid,
- uint32_t cpu, uint32_t psr_cmt_type, uint64_t *monitor_data);
+ uint32_t *l3_cache_size);
+int xc_psr_cmt_get_data(xc_interface *xch, uint32_t rmid, uint32_t cpu,
+ uint32_t psr_cmt_type, uint64_t *monitor_data,
+ uint64_t *tsc);
int xc_psr_cmt_enabled(xc_interface *xch);
+
+int xc_psr_cat_set_domain_data(xc_interface *xch, uint32_t domid,
+ xc_psr_cat_type type, uint32_t target,
+ uint64_t data);
+int xc_psr_cat_get_domain_data(xc_interface *xch, uint32_t domid,
+ xc_psr_cat_type type, uint32_t target,
+ uint64_t *data);
+int xc_psr_cat_get_l3_info(xc_interface *xch, uint32_t socket,
+ uint32_t *cos_max, uint32_t *cbm_len);
#endif
#endif /* XENCTRL_H */
diff --git a/tools/libxc/include/xenctrlosdep.h b/tools/libxc/include/xenctrlosdep.h
index e97944b..5121d9b 100644
--- a/tools/libxc/include/xenctrlosdep.h
+++ b/tools/libxc/include/xenctrlosdep.h
@@ -15,8 +15,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
/*
diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index 40bbac8..1a1a185 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -16,18 +16,20 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef XENGUEST_H
#define XENGUEST_H
+#define XC_NUMA_NO_NODE (~0U)
+
#define XCFLAGS_LIVE (1 << 0)
#define XCFLAGS_DEBUG (1 << 1)
#define XCFLAGS_HVM (1 << 2)
#define XCFLAGS_STDVGA (1 << 3)
#define XCFLAGS_CHECKPOINT_COMPRESS (1 << 4)
+#define XCFLAGS_CHECKPOINTED (1 << 5)
#define X86_64_B_SIZE 64
#define X86_32_B_SIZE 32
@@ -64,14 +66,6 @@ struct save_callbacks {
/* Enable qemu-dm logging dirty pages to xen */
int (*switch_qemu_logdirty)(int domid, unsigned enable, void *data); /* HVM only */
- /* Save toolstack specific data
- * @param buf the buffer with the data to be saved
- * @param len the length of the buffer
- * The callee allocates the buffer, the caller frees it (buffer must
- * be free'able).
- */
- int (*toolstack_save)(uint32_t domid, uint8_t **buf, uint32_t *len, void *data);
-
/* to be provided as the last argument to each callback function */
void* data;
};
@@ -88,12 +82,14 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iter
uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
struct save_callbacks* callbacks, int hvm);
-
/* callbacks provided by xc_domain_restore */
struct restore_callbacks {
- /* callback to restore toolstack specific data */
- int (*toolstack_restore)(uint32_t domid, const uint8_t *buf,
- uint32_t size, void* data);
+ /* A checkpoint record has been found in the stream.
+ * Returns one of: */
+#define XGR_CHECKPOINT_ERROR 0 /* Terminate processing */
+#define XGR_CHECKPOINT_SUCCESS 1 /* Continue reading more data from the stream */
+#define XGR_CHECKPOINT_FAILOVER 2 /* Failover and resume VM */
+ int (*checkpoint)(void* data);
/* to be provided as the last argument to each callback function */
void* data;
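A hedged sketch of wiring up the new callback; more_data_expected() and my_state are hypothetical, standing in for application-specific checkpoint logic:

    static int my_checkpoint(void *data)
    {
        /* Keep reading checkpoints, or fail over to the local copy. */
        return more_data_expected(data) ? XGR_CHECKPOINT_SUCCESS
                                        : XGR_CHECKPOINT_FAILOVER;
    }

    struct restore_callbacks cb = {
        .checkpoint = my_checkpoint,
        .data       = &my_state,    /* handed back to the callback */
    };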
@@ -124,14 +120,6 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
unsigned int hvm, unsigned int pae, int superpages,
int checkpointed_stream,
struct restore_callbacks *callbacks);
-/**
- * xc_domain_restore writes a file to disk that contains the device
- * model saved state.
- * The pathname of this file is XC_DEVICE_MODEL_RESTORE_FILE; The domid
- * of the new domain is automatically appended to the filename,
- * separated by a ".".
- */
-#define XC_DEVICE_MODEL_RESTORE_FILE "/var/lib/xen/qemu-resume"
/**
* This function will create a domain for a paravirtualized Linux
@@ -230,6 +218,17 @@ struct xc_hvm_build_args {
struct xc_hvm_firmware_module smbios_module;
/* Whether to use claim hypercall (1 - enable, 0 - disable). */
int claim_enabled;
+
+ /* vNUMA information */
+ xen_vmemrange_t *vmemranges;
+ unsigned int nr_vmemranges;
+ unsigned int *vnode_to_pnode;
+ unsigned int nr_vnodes;
+
+ /* Out parameters */
+ uint64_t lowmem_end;
+ uint64_t highmem_end;
+ uint64_t mmio_start;
};
/**
diff --git a/tools/libxc/include/xentoollog.h b/tools/libxc/include/xentoollog.h
index 85d3da9..853e9c7 100644
--- a/tools/libxc/include/xentoollog.h
+++ b/tools/libxc/include/xentoollog.h
@@ -15,8 +15,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef XENTOOLLOG_H
diff --git a/tools/libxc/xc_altp2m.c b/tools/libxc/xc_altp2m.c
new file mode 100644
index 0000000..87a0fdf
--- /dev/null
+++ b/tools/libxc/xc_altp2m.c
@@ -0,0 +1,247 @@
+/******************************************************************************
+ *
+ * xc_altp2m.c
+ *
+ * Interface to altp2m related HVMOPs
+ *
+ * Copyright (c) 2015 Tamas K Lengyel (tamas at tklengyel.com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "xc_private.h"
+#include <stdbool.h>
+#include <xen/hvm/hvm_op.h>
+
+int xc_altp2m_get_domain_state(xc_interface *handle, domid_t dom, bool *state)
+{
+ int rc;
+ DECLARE_HYPERCALL;
+ DECLARE_HYPERCALL_BUFFER(xen_hvm_altp2m_op_t, arg);
+
+ arg = xc_hypercall_buffer_alloc(handle, arg, sizeof(*arg));
+ if ( arg == NULL )
+ return -1;
+
+ hypercall.op = __HYPERVISOR_hvm_op;
+ hypercall.arg[0] = HVMOP_altp2m;
+ hypercall.arg[1] = HYPERCALL_BUFFER_AS_ARG(arg);
+
+ arg->version = HVMOP_ALTP2M_INTERFACE_VERSION;
+ arg->cmd = HVMOP_altp2m_get_domain_state;
+ arg->domain = dom;
+
+ rc = do_xen_hypercall(handle, &hypercall);
+
+ if ( !rc )
+ *state = arg->u.domain_state.state;
+
+ xc_hypercall_buffer_free(handle, arg);
+ return rc;
+}
+
+int xc_altp2m_set_domain_state(xc_interface *handle, domid_t dom, bool state)
+{
+ int rc;
+ DECLARE_HYPERCALL;
+ DECLARE_HYPERCALL_BUFFER(xen_hvm_altp2m_op_t, arg);
+
+ arg = xc_hypercall_buffer_alloc(handle, arg, sizeof(*arg));
+ if ( arg == NULL )
+ return -1;
+
+ hypercall.op = __HYPERVISOR_hvm_op;
+ hypercall.arg[0] = HVMOP_altp2m;
+ hypercall.arg[1] = HYPERCALL_BUFFER_AS_ARG(arg);
+
+ arg->version = HVMOP_ALTP2M_INTERFACE_VERSION;
+ arg->cmd = HVMOP_altp2m_set_domain_state;
+ arg->domain = dom;
+ arg->u.domain_state.state = state;
+
+ rc = do_xen_hypercall(handle, &hypercall);
+
+ xc_hypercall_buffer_free(handle, arg);
+ return rc;
+}
+
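Every function in this new file follows the shape shown above: allocate a hypercall buffer for xen_hvm_altp2m_op_t, fill in version, cmd and domain, issue HVMOP_altp2m, then free the buffer. From the caller's side, a minimal sketch using the two state calls:

    bool enabled;

    if ( xc_altp2m_get_domain_state(xch, domid, &enabled) == 0 &&
         !enabled &&
         xc_altp2m_set_domain_state(xch, domid, true) )
        perror("enable altp2m");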
+/* It is a bit odd that this acts on current. */
+int xc_altp2m_set_vcpu_enable_notify(xc_interface *handle, domid_t domid,
+ uint32_t vcpuid, xen_pfn_t gfn)
+{
+ int rc;
+ DECLARE_HYPERCALL;
+ DECLARE_HYPERCALL_BUFFER(xen_hvm_altp2m_op_t, arg);
+
+ arg = xc_hypercall_buffer_alloc(handle, arg, sizeof(*arg));
+ if ( arg == NULL )
+ return -1;
+
+ hypercall.op = __HYPERVISOR_hvm_op;
+ hypercall.arg[0] = HVMOP_altp2m;
+ hypercall.arg[1] = HYPERCALL_BUFFER_AS_ARG(arg);
+
+ arg->version = HVMOP_ALTP2M_INTERFACE_VERSION;
+ arg->cmd = HVMOP_altp2m_vcpu_enable_notify;
+ arg->domain = domid;
+ arg->u.enable_notify.vcpu_id = vcpuid;
+ arg->u.enable_notify.gfn = gfn;
+
+ rc = do_xen_hypercall(handle, &hypercall);
+
+ xc_hypercall_buffer_free(handle, arg);
+ return rc;
+}
+
+int xc_altp2m_create_view(xc_interface *handle, domid_t domid,
+ xenmem_access_t default_access, uint16_t *view_id)
+{
+ int rc;
+ DECLARE_HYPERCALL;
+ DECLARE_HYPERCALL_BUFFER(xen_hvm_altp2m_op_t, arg);
+
+ arg = xc_hypercall_buffer_alloc(handle, arg, sizeof(*arg));
+ if ( arg == NULL )
+ return -1;
+
+ hypercall.op = __HYPERVISOR_hvm_op;
+ hypercall.arg[0] = HVMOP_altp2m;
+ hypercall.arg[1] = HYPERCALL_BUFFER_AS_ARG(arg);
+
+ arg->version = HVMOP_ALTP2M_INTERFACE_VERSION;
+ arg->cmd = HVMOP_altp2m_create_p2m;
+ arg->domain = domid;
+ arg->u.view.view = -1;
+ arg->u.view.hvmmem_default_access = default_access;
+
+ rc = do_xen_hypercall(handle, &hypercall);
+
+ if ( !rc )
+ *view_id = arg->u.view.view;
+
+ xc_hypercall_buffer_free(handle, arg);
+ return rc;
+}
+
+int xc_altp2m_destroy_view(xc_interface *handle, domid_t domid,
+ uint16_t view_id)
+{
+ int rc;
+ DECLARE_HYPERCALL;
+ DECLARE_HYPERCALL_BUFFER(xen_hvm_altp2m_op_t, arg);
+
+ arg = xc_hypercall_buffer_alloc(handle, arg, sizeof(*arg));
+ if ( arg == NULL )
+ return -1;
+
+ hypercall.op = __HYPERVISOR_hvm_op;
+ hypercall.arg[0] = HVMOP_altp2m;
+ hypercall.arg[1] = HYPERCALL_BUFFER_AS_ARG(arg);
+
+ arg->version = HVMOP_ALTP2M_INTERFACE_VERSION;
+ arg->cmd = HVMOP_altp2m_destroy_p2m;
+ arg->domain = domid;
+ arg->u.view.view = view_id;
+
+ rc = do_xen_hypercall(handle, &hypercall);
+
+ xc_hypercall_buffer_free(handle, arg);
+ return rc;
+}
+
+/* Switch all vCPUs of the domain to the specified altp2m view */
+int xc_altp2m_switch_to_view(xc_interface *handle, domid_t domid,
+ uint16_t view_id)
+{
+ int rc;
+ DECLARE_HYPERCALL;
+ DECLARE_HYPERCALL_BUFFER(xen_hvm_altp2m_op_t, arg);
+
+ arg = xc_hypercall_buffer_alloc(handle, arg, sizeof(*arg));
+ if ( arg == NULL )
+ return -1;
+
+ hypercall.op = __HYPERVISOR_hvm_op;
+ hypercall.arg[0] = HVMOP_altp2m;
+ hypercall.arg[1] = HYPERCALL_BUFFER_AS_ARG(arg);
+
+ arg->version = HVMOP_ALTP2M_INTERFACE_VERSION;
+ arg->cmd = HVMOP_altp2m_switch_p2m;
+ arg->domain = domid;
+ arg->u.view.view = view_id;
+
+ rc = do_xen_hypercall(handle, &hypercall);
+
+ xc_hypercall_buffer_free(handle, arg);
+ return rc;
+}
+
+int xc_altp2m_set_mem_access(xc_interface *handle, domid_t domid,
+ uint16_t view_id, xen_pfn_t gfn,
+ xenmem_access_t access)
+{
+ int rc;
+ DECLARE_HYPERCALL;
+ DECLARE_HYPERCALL_BUFFER(xen_hvm_altp2m_op_t, arg);
+
+ arg = xc_hypercall_buffer_alloc(handle, arg, sizeof(*arg));
+ if ( arg == NULL )
+ return -1;
+
+ hypercall.op = __HYPERVISOR_hvm_op;
+ hypercall.arg[0] = HVMOP_altp2m;
+ hypercall.arg[1] = HYPERCALL_BUFFER_AS_ARG(arg);
+
+ arg->version = HVMOP_ALTP2M_INTERFACE_VERSION;
+ arg->cmd = HVMOP_altp2m_set_mem_access;
+ arg->domain = domid;
+ arg->u.set_mem_access.view = view_id;
+ arg->u.set_mem_access.hvmmem_access = access;
+ arg->u.set_mem_access.gfn = gfn;
+
+ rc = do_xen_hypercall(handle, &hypercall);
+
+ xc_hypercall_buffer_free(handle, arg);
+ return rc;
+}
+
+int xc_altp2m_change_gfn(xc_interface *handle, domid_t domid,
+ uint16_t view_id, xen_pfn_t old_gfn,
+ xen_pfn_t new_gfn)
+{
+ int rc;
+ DECLARE_HYPERCALL;
+ DECLARE_HYPERCALL_BUFFER(xen_hvm_altp2m_op_t, arg);
+
+ arg = xc_hypercall_buffer_alloc(handle, arg, sizeof(*arg));
+ if ( arg == NULL )
+ return -1;
+
+ hypercall.op = __HYPERVISOR_hvm_op;
+ hypercall.arg[0] = HVMOP_altp2m;
+ hypercall.arg[1] = HYPERCALL_BUFFER_AS_ARG(arg);
+
+ arg->version = HVMOP_ALTP2M_INTERFACE_VERSION;
+ arg->cmd = HVMOP_altp2m_change_gfn;
+ arg->domain = domid;
+ arg->u.change_gfn.view = view_id;
+ arg->u.change_gfn.old_gfn = old_gfn;
+ arg->u.change_gfn.new_gfn = new_gfn;
+
+ rc = do_xen_hypercall(handle, &hypercall);
+
+ xc_hypercall_buffer_free(handle, arg);
+ return rc;
+}
+
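The eight wrappers above all follow the same pattern: allocate a hypercall buffer, fill in a xen_hvm_altp2m_op_t, issue HVMOP_altp2m, copy out any result, and free the buffer. For illustration only (not part of this patch; error paths trimmed, and assuming an open xc_interface *xch), a minimal caller that enables altp2m, builds a view that traps writes to one gfn, and switches to it:

#include <stdbool.h>
#include <xenctrl.h>

/* Illustrative sketch built on the functions defined above. */
static int restrict_gfn(xc_interface *xch, domid_t dom, xen_pfn_t gfn)
{
    uint16_t view_id;
    int rc;

    rc = xc_altp2m_set_domain_state(xch, dom, true);    /* enable altp2m */
    if ( rc )
        return rc;

    rc = xc_altp2m_create_view(xch, dom, XENMEM_access_rwx, &view_id);
    if ( rc )
        return rc;

    /* Read/execute only in the new view; writes will trap. */
    rc = xc_altp2m_set_mem_access(xch, dom, view_id, gfn, XENMEM_access_rx);
    if ( rc )
        return rc;

    return xc_altp2m_switch_to_view(xch, dom, view_id);
}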
diff --git a/tools/libxc/xc_bitops.h b/tools/libxc/xc_bitops.h
index dfce3b8..cd749f4 100644
--- a/tools/libxc/xc_bitops.h
+++ b/tools/libxc/xc_bitops.h
@@ -26,6 +26,11 @@ static inline unsigned long *bitmap_alloc(int nr_bits)
return calloc(1, bitmap_size(nr_bits));
}
+static inline void bitmap_set(unsigned long *addr, int nr_bits)
+{
+ memset(addr, 0xff, bitmap_size(nr_bits));
+}
+
static inline void bitmap_clear(unsigned long *addr, int nr_bits)
{
memset(addr, 0, bitmap_size(nr_bits));
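bitmap_set() completes the existing bitmap_alloc()/bitmap_clear() pair; all of them act on the whole allocation, since bitmap_size() rounds nr_bits up to full unsigned longs. A trivial illustration (not from this patch):

unsigned long *map = bitmap_alloc(128);   /* zeroed by calloc() */
if ( map )
{
    bitmap_set(map, 128);                 /* every bit -> 1 */
    bitmap_clear(map, 128);               /* every bit -> 0 again */
    free(map);
}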
diff --git a/tools/libxc/xc_compression.c b/tools/libxc/xc_compression.c
index d42e651..b1b16e8 100644
--- a/tools/libxc/xc_compression.c
+++ b/tools/libxc/xc_compression.c
@@ -22,8 +22,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/tools/libxc/xc_core.c b/tools/libxc/xc_core.c
index dfa424b..011336c 100644
--- a/tools/libxc/xc_core.c
+++ b/tools/libxc/xc_core.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
/*
diff --git a/tools/libxc/xc_core.h b/tools/libxc/xc_core.h
index 5867030..ffbe490 100644
--- a/tools/libxc/xc_core.h
+++ b/tools/libxc/xc_core.h
@@ -13,8 +13,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/tools/libxc/xc_core_arm.c b/tools/libxc/xc_core_arm.c
index 16508e7..d8570fd 100644
--- a/tools/libxc/xc_core_arm.c
+++ b/tools/libxc/xc_core_arm.c
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2011 Citrix Systems
*
@@ -30,12 +29,6 @@ xc_core_arch_gpfn_may_present(struct xc_core_arch_context *arch_ctxt,
return 0;
}
-
-static int nr_gpfns(xc_interface *xch, domid_t domid)
-{
- return xc_domain_maximum_gpfn(xch, domid) + 1;
-}
-
int
xc_core_arch_auto_translated_physmap(const xc_dominfo_t *info)
{
@@ -48,9 +41,12 @@ xc_core_arch_memory_map_get(xc_interface *xch, struct xc_core_arch_context *unus
xc_core_memory_map_t **mapp,
unsigned int *nr_entries)
{
- unsigned long p2m_size = nr_gpfns(xch, info->domid);
+ xen_pfn_t p2m_size = 0;
xc_core_memory_map_t *map;
+ if ( xc_domain_nr_gpfns(xch, info->domid, &p2m_size) < 0 )
+ return -1;
+
map = malloc(sizeof(*map));
if ( map == NULL )
{
diff --git a/tools/libxc/xc_core_arm.h b/tools/libxc/xc_core_arm.h
index 24781eb..162f7a7 100644
--- a/tools/libxc/xc_core_arm.h
+++ b/tools/libxc/xc_core_arm.h
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2012 Citrix Systems
*
diff --git a/tools/libxc/xc_core_x86.c b/tools/libxc/xc_core_x86.c
index d8846f1..679e753 100644
--- a/tools/libxc/xc_core_x86.c
+++ b/tools/libxc/xc_core_x86.c
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp>
* VA Linux Systems Japan K.K.
@@ -35,12 +34,6 @@ xc_core_arch_gpfn_may_present(struct xc_core_arch_context *arch_ctxt,
return 1;
}
-
-static int nr_gpfns(xc_interface *xch, domid_t domid)
-{
- return xc_domain_maximum_gpfn(xch, domid) + 1;
-}
-
int
xc_core_arch_auto_translated_physmap(const xc_dominfo_t *info)
{
@@ -53,9 +46,12 @@ xc_core_arch_memory_map_get(xc_interface *xch, struct xc_core_arch_context *unus
xc_core_memory_map_t **mapp,
unsigned int *nr_entries)
{
- unsigned long p2m_size = nr_gpfns(xch, info->domid);
+ xen_pfn_t p2m_size = 0;
xc_core_memory_map_t *map;
+ if ( xc_domain_nr_gpfns(xch, info->domid, &p2m_size) < 0 )
+ return -1;
+
map = malloc(sizeof(*map));
if ( map == NULL )
{
@@ -88,7 +84,12 @@ xc_core_arch_map_p2m_rw(xc_interface *xch, struct domain_info_context *dinfo, xc
int err;
int i;
- dinfo->p2m_size = nr_gpfns(xch, info->domid);
+ if ( xc_domain_nr_gpfns(xch, info->domid, &dinfo->p2m_size) < 0 )
+ {
+ ERROR("Could not get maximum GPFN!");
+ goto out;
+ }
+
if ( dinfo->p2m_size < info->nr_pages )
{
ERROR("p2m_size < nr_pages - 1 (%lx < %lx)", dinfo->p2m_size, info->nr_pages - 1);
@@ -210,16 +211,7 @@ int
xc_core_arch_get_scratch_gpfn(xc_interface *xch, domid_t domid,
xen_pfn_t *gpfn)
{
- int rc;
-
- rc = xc_domain_maximum_gpfn(xch, domid);
-
- if ( rc < 0 )
- return rc;
-
- *gpfn = (xen_pfn_t)rc + 1;
-
- return 0;
+ return xc_domain_nr_gpfns(xch, domid, gpfn);
}
/*
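Both per-arch copies of the static nr_gpfns() helper are dropped in favour of xc_domain_nr_gpfns(), added to xc_domain.c later in this patch. The old helper returned the count through an int, so it could neither report hypercall failure cleanly nor represent large frame numbers. The replacement pattern used in the hunks above, for reference (assuming an open xc_interface *xch and a valid domid):

xen_pfn_t p2m_size = 0;

if ( xc_domain_nr_gpfns(xch, domid, &p2m_size) < 0 )
    return -1;            /* hypercall failed; errno is set */
/* p2m_size now holds maximum_gpfn + 1 */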
diff --git a/tools/libxc/xc_core_x86.h b/tools/libxc/xc_core_x86.h
index d5e04e7..867146b 100644
--- a/tools/libxc/xc_core_x86.h
+++ b/tools/libxc/xc_core_x86.h
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp>
* VA Linux Systems Japan K.K.
diff --git a/tools/libxc/xc_cpu_hotplug.c b/tools/libxc/xc_cpu_hotplug.c
index e4659c0..58c2a0f 100644
--- a/tools/libxc/xc_cpu_hotplug.c
+++ b/tools/libxc/xc_cpu_hotplug.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/tools/libxc/xc_cpufeature.h b/tools/libxc/xc_cpufeature.h
index 66cc82e..c3ddc80 100644
--- a/tools/libxc/xc_cpufeature.h
+++ b/tools/libxc/xc_cpufeature.h
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __LIBXC_CPUFEATURE_H
diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index a18b1ff..e146a3e 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -16,8 +16,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include <stdlib.h>
@@ -109,6 +108,7 @@ static void amd_xc_cpuid_policy(
regs[3] &= (0x0183f3ff | /* features shared with 0x00000001:EDX */
bitmaskof(X86_FEATURE_NX) |
bitmaskof(X86_FEATURE_LM) |
+ bitmaskof(X86_FEATURE_PAGE1GB) |
bitmaskof(X86_FEATURE_SYSCALL) |
bitmaskof(X86_FEATURE_MP) |
bitmaskof(X86_FEATURE_MMXEXT) |
@@ -192,6 +192,7 @@ static void intel_xc_cpuid_policy(
bitmaskof(X86_FEATURE_ABM));
regs[3] &= (bitmaskof(X86_FEATURE_NX) |
bitmaskof(X86_FEATURE_LM) |
+ bitmaskof(X86_FEATURE_PAGE1GB) |
bitmaskof(X86_FEATURE_SYSCALL) |
bitmaskof(X86_FEATURE_RDTSCP));
break;
@@ -386,6 +387,7 @@ static void xc_cpuid_hvm_policy(
clear_bit(X86_FEATURE_LM, regs[3]);
clear_bit(X86_FEATURE_NX, regs[3]);
clear_bit(X86_FEATURE_PSE36, regs[3]);
+ clear_bit(X86_FEATURE_PAGE1GB, regs[3]);
}
break;
diff --git a/tools/libxc/xc_cpupool.c b/tools/libxc/xc_cpupool.c
index 6393cfb..c42273e 100644
--- a/tools/libxc/xc_cpupool.c
+++ b/tools/libxc/xc_cpupool.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2009, J Gross.
*/
@@ -190,11 +189,11 @@ xc_cpumap_t xc_cpupool_freeinfo(xc_interface *xch)
err = do_sysctl_save(xch, &sysctl);
if ( err < 0 )
- goto out;
+ goto out;
cpumap = xc_cpumap_alloc(xch);
if (cpumap == NULL)
- goto out;
+ goto out;
memcpy(cpumap, local, mapsize);
diff --git a/tools/libxc/xc_csched.c b/tools/libxc/xc_csched.c
index 390c645..bf03bfc 100644
--- a/tools/libxc/xc_csched.c
+++ b/tools/libxc/xc_csched.c
@@ -18,8 +18,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_csched2.c b/tools/libxc/xc_csched2.c
index 6da6a46..ed99605 100644
--- a/tools/libxc/xc_csched2.c
+++ b/tools/libxc/xc_csched2.c
@@ -18,8 +18,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c
index 9b31b1f..aeaba54 100644
--- a/tools/libxc/xc_dom_arm.c
+++ b/tools/libxc/xc_dom_arm.c
@@ -12,8 +12,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2011, Citrix Systems
*/
@@ -26,9 +25,10 @@
#include "xg_private.h"
#include "xc_dom.h"
-#define NR_MAGIC_PAGES 2
+#define NR_MAGIC_PAGES 3
#define CONSOLE_PFN_OFFSET 0
#define XENSTORE_PFN_OFFSET 1
+#define MEMACCESS_PFN_OFFSET 2
#define LPAE_SHIFT 9
@@ -87,10 +87,13 @@ static int alloc_magic_pages(struct xc_dom_image *dom)
xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn);
xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn);
+ xc_clear_domain_page(dom->xch, dom->guest_domid, base + MEMACCESS_PFN_OFFSET);
xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_PFN,
dom->console_pfn);
xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_PFN,
dom->xenstore_pfn);
+ xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_MONITOR_RING_PFN,
+ base + MEMACCESS_PFN_OFFSET);
/* allocated by toolstack */
xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_EVTCHN,
dom->console_evtchn);
@@ -449,14 +452,15 @@ int arch_setup_meminit(struct xc_dom_image *dom)
assert(dom->rambank_size[0] != 0);
assert(ramsize == 0); /* Too much RAM is rejected above */
+ dom->p2m_size = p2m_size;
dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) * p2m_size);
if ( dom->p2m_host == NULL )
return -EINVAL;
for ( pfn = 0; pfn < p2m_size; pfn++ )
- dom->p2m_host[pfn] = INVALID_MFN;
+ dom->p2m_host[pfn] = INVALID_P2M_ENTRY;
/* setup initial p2m and allocate guest memory */
- for ( i = 0; dom->rambank_size[i] && i < GUEST_RAM_BANKS; i++ )
+ for ( i = 0; i < GUEST_RAM_BANKS && dom->rambank_size[i]; i++ )
{
if ((rc = populate_guest_memory(dom,
bankbase[i] >> XC_PAGE_SHIFT,
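With NR_MAGIC_PAGES grown from 2 to 3, ARM guests gain a third magic page for the monitor (mem-access) ring, advertised through HVM_PARAM_MONITOR_RING_PFN alongside the console and xenstore pages. A hypothetical consumer fetches it with the generic accessor (illustrative only, not part of this patch):

uint64_t ring_pfn;

if ( xc_hvm_param_get(xch, domid, HVM_PARAM_MONITOR_RING_PFN, &ring_pfn) )
    return -1;            /* errno set */
/* ring_pfn can then be mapped with xc_map_foreign_range(). */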
diff --git a/tools/libxc/xc_dom_armzimageloader.c b/tools/libxc/xc_dom_armzimageloader.c
index 2b28781..0df8c2a 100644
--- a/tools/libxc/xc_dom_armzimageloader.c
+++ b/tools/libxc/xc_dom_armzimageloader.c
@@ -16,8 +16,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/tools/libxc/xc_dom_binloader.c b/tools/libxc/xc_dom_binloader.c
index e1de5b5..740601a 100644
--- a/tools/libxc/xc_dom_binloader.c
+++ b/tools/libxc/xc_dom_binloader.c
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* Some of the field descriptions were copied from "The Multiboot
* Specification", Copyright 1995, 96 Bryan Ford <baford at cs.utah.edu>,
diff --git a/tools/libxc/xc_dom_boot.c b/tools/libxc/xc_dom_boot.c
index a141eb5..8e06406 100644
--- a/tools/libxc/xc_dom_boot.c
+++ b/tools/libxc/xc_dom_boot.c
@@ -18,8 +18,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* written 2006 by Gerd Hoffmann <kraxel at suse.de>.
*
@@ -57,9 +56,9 @@ static int setup_hypercall_page(struct xc_dom_image *dom)
domctl.u.hypercall_init.gmfn = xc_dom_p2m_guest(dom, pfn);
rc = do_domctl(dom->xch, &domctl);
if ( rc != 0 )
- xc_dom_panic(dom->xch,
- XC_INTERNAL_ERROR, "%s: HYPERCALL_INIT failed (rc=%d)",
- __FUNCTION__, rc);
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: HYPERCALL_INIT failed: %d - %s",
+ __FUNCTION__, errno, strerror(errno));
return rc;
}
diff --git a/tools/libxc/xc_dom_bzimageloader.c b/tools/libxc/xc_dom_bzimageloader.c
index 964ebdc..7fde42a 100644
--- a/tools/libxc/xc_dom_bzimageloader.c
+++ b/tools/libxc/xc_dom_bzimageloader.c
@@ -18,8 +18,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* written 2006 by Gerd Hoffmann <kraxel at suse.de>.
* written 2007 by Jeremy Fitzhardinge <jeremy at xensource.com>
@@ -185,8 +184,9 @@ static int xc_try_bzip2_decode(
static int xc_try_bzip2_decode(
struct xc_dom_image *dom, void **blob, size_t *size)
{
- DOMPRINTF("%s: BZIP2 decompress support unavailable",
- __FUNCTION__);
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: BZIP2 decompress support unavailable",
+ __FUNCTION__);
return -1;
}
@@ -367,16 +367,18 @@ static int xc_try_lzma_decode(
static int xc_try_xz_decode(
struct xc_dom_image *dom, void **blob, size_t *size)
{
- DOMPRINTF("%s: XZ decompress support unavailable",
- __FUNCTION__);
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: XZ decompress support unavailable",
+ __FUNCTION__);
return -1;
}
static int xc_try_lzma_decode(
struct xc_dom_image *dom, void **blob, size_t *size)
{
- DOMPRINTF("%s: LZMA decompress support unavailable",
- __FUNCTION__);
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: LZMA decompress support unavailable",
+ __FUNCTION__);
return -1;
}
@@ -577,8 +579,9 @@ static int xc_try_lzo1x_decode(
static int xc_try_lzo1x_decode(
struct xc_dom_image *dom, void **blob, size_t *size)
{
- DOMPRINTF("%s: LZO1x decompress support unavailable\n",
- __FUNCTION__);
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: LZO1x decompress support unavailable\n",
+ __FUNCTION__);
return -1;
}
diff --git a/tools/libxc/xc_dom_compat_linux.c b/tools/libxc/xc_dom_compat_linux.c
index 2c14a0f..a3abb99 100644
--- a/tools/libxc/xc_dom_compat_linux.c
+++ b/tools/libxc/xc_dom_compat_linux.c
@@ -16,8 +16,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* written 2006 by Gerd Hoffmann <kraxel at suse.de>.
*
@@ -91,6 +90,8 @@ int xc_linux_build_mem(xc_interface *xch, uint32_t domid,
xc_dom_loginit(xch);
dom = xc_dom_allocate(xch, cmdline, features);
+ if (dom == NULL)
+ return -1;
if ( (rc = xc_dom_kernel_mem(dom, image_buffer, image_size)) != 0 )
goto out;
if ( initrd && ((rc = xc_dom_ramdisk_mem(dom, initrd, initrd_len)) != 0) )
@@ -123,6 +124,8 @@ int xc_linux_build(xc_interface *xch, uint32_t domid,
xc_dom_loginit(xch);
dom = xc_dom_allocate(xch, cmdline, features);
+ if (dom == NULL)
+ return -1;
if ( (rc = xc_dom_kernel_file(dom, image_name)) != 0 )
goto out;
if ( initrd_name && strlen(initrd_name) &&
@@ -146,6 +149,8 @@ int xc_get_bit_size(xc_interface *xch,
int rc;
*bit_size = 0;
dom = xc_dom_allocate(xch, cmdline, features);
+ if (dom == NULL)
+ return -1;
if ( (rc = xc_dom_kernel_file(dom, image_name)) != 0 )
goto out;
if ( (rc = xc_dom_parse_image(dom)) != 0 )
diff --git a/tools/libxc/xc_dom_core.c b/tools/libxc/xc_dom_core.c
index ecbf981..8466677 100644
--- a/tools/libxc/xc_dom_core.c
+++ b/tools/libxc/xc_dom_core.c
@@ -17,8 +17,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* written 2006 by Gerd Hoffmann <kraxel at suse.de>.
*
@@ -931,9 +930,9 @@ int xc_dom_update_guest_p2m(struct xc_dom_image *dom)
{
case 4:
DOMPRINTF("%s: dst 32bit, pages 0x%" PRIpfn "",
- __FUNCTION__, dom->total_pages);
+ __FUNCTION__, dom->p2m_size);
p2m_32 = dom->p2m_guest;
- for ( i = 0; i < dom->total_pages; i++ )
+ for ( i = 0; i < dom->p2m_size; i++ )
if ( dom->p2m_host[i] != INVALID_P2M_ENTRY )
p2m_32[i] = dom->p2m_host[i];
else
@@ -941,9 +940,9 @@ int xc_dom_update_guest_p2m(struct xc_dom_image *dom)
break;
case 8:
DOMPRINTF("%s: dst 64bit, pages 0x%" PRIpfn "",
- __FUNCTION__, dom->total_pages);
+ __FUNCTION__, dom->p2m_size);
p2m_64 = dom->p2m_guest;
- for ( i = 0; i < dom->total_pages; i++ )
+ for ( i = 0; i < dom->p2m_size; i++ )
if ( dom->p2m_host[i] != INVALID_P2M_ENTRY )
p2m_64[i] = dom->p2m_host[i];
else
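These loops now iterate over the new p2m_size field instead of total_pages because, with sparse layouts (vNUMA holes, or the ARM bank layout above), the p2m array can be longer than the number of populated pages. The invariant the builders maintain, sketched as a check (illustrative only; assert.h assumed):

/* Unpopulated slots hold INVALID_P2M_ENTRY and are skipped above. */
xen_pfn_t i, populated = 0;

for ( i = 0; i < dom->p2m_size; i++ )
    if ( dom->p2m_host[i] != INVALID_P2M_ENTRY )
        populated++;
assert(populated <= dom->total_pages);   /* == for fully populated guests */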
diff --git a/tools/libxc/xc_dom_elfloader.c b/tools/libxc/xc_dom_elfloader.c
index 9843b1f..66ea9d6 100644
--- a/tools/libxc/xc_dom_elfloader.c
+++ b/tools/libxc/xc_dom_elfloader.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* written 2006 by Gerd Hoffmann <kraxel at suse.de>.
*
@@ -128,6 +127,8 @@ static elf_errorstatus xc_dom_load_elf_symtab(struct xc_dom_image *dom,
return 0;
}
+ size = elf->bsd_symtab_pend - elf->bsd_symtab_pstart;
+
if ( load )
{
char *hdr_ptr;
@@ -135,11 +136,10 @@ static elf_errorstatus xc_dom_load_elf_symtab(struct xc_dom_image *dom,
if ( !dom->bsd_symtab_start )
return 0;
- size = dom->kernel_seg.vend - dom->bsd_symtab_start;
hdr_ptr = xc_dom_vaddr_to_ptr(dom, dom->bsd_symtab_start, &allow_size);
if ( hdr_ptr == NULL )
{
- DOMPRINTF("%s/load: xc_dom_vaddr_to_ptr(dom,dom->bsd_symtab_start"
+ DOMPRINTF("%s: xc_dom_vaddr_to_ptr(dom,dom->bsd_symtab_start"
" => NULL", __FUNCTION__);
return -1;
}
@@ -152,8 +152,6 @@ static elf_errorstatus xc_dom_load_elf_symtab(struct xc_dom_image *dom,
{
char *hdr_ptr;
- size = sizeof(unsigned) + elf_size(elf, elf->ehdr) +
- elf_shdr_count(elf) * elf_size(elf, shdr);
hdr_ptr = xc_dom_malloc(dom, size);
if ( hdr_ptr == NULL )
return 0;
@@ -161,6 +159,8 @@ static elf_errorstatus xc_dom_load_elf_symtab(struct xc_dom_image *dom,
elf->caller_xdest_size = size;
hdr = ELF_REALPTR2PTRVAL(hdr_ptr);
dom->bsd_symtab_start = elf_round_up(elf, dom->kernel_seg.vend);
+ dom->kernel_seg.vend = elf_round_up(elf, dom->bsd_symtab_start + size);
+ return 0;
}
elf_memcpy_safe(elf, hdr + sizeof(unsigned),
@@ -189,9 +189,8 @@ static elf_errorstatus xc_dom_load_elf_symtab(struct xc_dom_image *dom,
}
if ( elf->caller_xdest_size < sizeof(unsigned) )
{
- DOMPRINTF("%s/%s: header size %"PRIx64" too small",
- __FUNCTION__, load ? "load" : "parse",
- (uint64_t)elf->caller_xdest_size);
+ DOMPRINTF("%s: header size %"PRIx64" too small",
+ __FUNCTION__, (uint64_t)elf->caller_xdest_size);
return -1;
}
if ( elf_init(&syms, elf->caller_xdest_base + sizeof(unsigned),
@@ -219,10 +218,9 @@ static elf_errorstatus xc_dom_load_elf_symtab(struct xc_dom_image *dom,
maxaddr = elf_round_up(&syms, symtab + elf_size(&syms, syms.ehdr) +
elf_shdr_count(&syms) * elf_size(&syms, shdr));
- DOMPRINTF("%s/%s: bsd_symtab_start=%" PRIx64 ", kernel.end=0x%" PRIx64
+ DOMPRINTF("%s: bsd_symtab_start=%" PRIx64 ", kernel.end=0x%" PRIx64
" -- symtab=0x%" PRIx64 ", maxaddr=0x%" PRIx64 "",
- __FUNCTION__, load ? "load" : "parse",
- dom->bsd_symtab_start, dom->kernel_seg.vend,
+ __FUNCTION__, dom->bsd_symtab_start, dom->kernel_seg.vend,
symtab, maxaddr);
count = elf_shdr_count(&syms);
@@ -279,13 +277,10 @@ static elf_errorstatus xc_dom_load_elf_symtab(struct xc_dom_image *dom,
type == SHT_SYMTAB ? "symtab" : "strtab",
size, maxaddr);
- if ( load )
- {
- shdr2 = elf_shdr_by_index(elf, h);
- elf_memcpy_safe(elf, elf_section_start(&syms, shdr),
- elf_section_start(elf, shdr2),
- size);
- }
+ shdr2 = elf_shdr_by_index(elf, h);
+ elf_memcpy_safe(elf, elf_section_start(&syms, shdr),
+ elf_section_start(elf, shdr2),
+ size);
}
/* Name is NULL. */
@@ -308,8 +303,7 @@ static elf_errorstatus xc_dom_load_elf_symtab(struct xc_dom_image *dom,
dom->bsd_symtab_start = 0;
return 0;
}
- if ( !load )
- dom->kernel_seg.vend = maxaddr;
+
return 0;
}
diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c
index bf06fe4..3d40fa4 100644
--- a/tools/libxc/xc_dom_x86.c
+++ b/tools/libxc/xc_dom_x86.c
@@ -16,8 +16,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* written 2006 by Gerd Hoffmann <kraxel at suse.de>.
*
@@ -42,6 +41,7 @@
#define SUPERPAGE_PFN_SHIFT 9
#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
+#define SUPERPAGE_BATCH_SIZE 512
#define bits_to_mask(bits) (((xen_vaddr_t)1 << (bits))-1)
#define round_down(addr, mask) ((addr) & ~(mask))
@@ -122,11 +122,11 @@ static int count_pgtables(struct xc_dom_image *dom, int pae,
try_pfn_end = (try_virt_end - dom->parms.virt_base) >> PAGE_SHIFT_X86;
- if ( try_pfn_end > dom->total_pages )
+ if ( try_pfn_end > dom->p2m_size )
{
xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
"%s: not enough memory for initial mapping (%#"PRIpfn" > %#"PRIpfn")",
- __FUNCTION__, try_pfn_end, dom->total_pages);
+ __FUNCTION__, try_pfn_end, dom->p2m_size);
return -ENOMEM;
}
@@ -440,10 +440,11 @@ pfn_error:
static int alloc_magic_pages(struct xc_dom_image *dom)
{
- size_t p2m_size = dom->total_pages * dom->arch_hooks->sizeof_pfn;
+ size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn;
/* allocate phys2mach table */
- if ( xc_dom_alloc_segment(dom, &dom->p2m_seg, "phys2mach", 0, p2m_size) )
+ if ( xc_dom_alloc_segment(dom, &dom->p2m_seg, "phys2mach",
+ 0, p2m_alloc_size) )
return -1;
dom->p2m_guest = xc_dom_seg_to_ptr(dom, &dom->p2m_seg);
if ( dom->p2m_guest == NULL )
@@ -759,7 +760,13 @@ static int x86_shadow(xc_interface *xch, domid_t domid)
int arch_setup_meminit(struct xc_dom_image *dom)
{
int rc;
- xen_pfn_t pfn, allocsz, i, j, mfn;
+ xen_pfn_t pfn, allocsz, mfn, total, pfn_base;
+ int i, j, k;
+ xen_vmemrange_t dummy_vmemrange[1];
+ unsigned int dummy_vnode_to_pnode[1];
+ xen_vmemrange_t *vmemranges;
+ unsigned int *vnode_to_pnode;
+ unsigned int nr_vmemranges, nr_vnodes;
rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type);
if ( rc )
@@ -772,15 +779,17 @@ int arch_setup_meminit(struct xc_dom_image *dom)
return rc;
}
- dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) * dom->total_pages);
- if ( dom->p2m_host == NULL )
- return -EINVAL;
-
if ( dom->superpages )
{
int count = dom->total_pages >> SUPERPAGE_PFN_SHIFT;
xen_pfn_t extents[count];
+ dom->p2m_size = dom->total_pages;
+ dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) *
+ dom->p2m_size);
+ if ( dom->p2m_host == NULL )
+ return -EINVAL;
+
DOMPRINTF("Populating memory with %d superpages", count);
for ( pfn = 0; pfn < count; pfn++ )
extents[pfn] = pfn << SUPERPAGE_PFN_SHIFT;
@@ -808,21 +817,135 @@ int arch_setup_meminit(struct xc_dom_image *dom)
if ( rc )
return rc;
}
- /* setup initial p2m */
- for ( pfn = 0; pfn < dom->total_pages; pfn++ )
- dom->p2m_host[pfn] = pfn;
-
+
+ /* Setup dummy vNUMA information if it's not provided. Note
+ * that this is a valid state if libxl doesn't provide any
+ * vNUMA information.
+ *
+ * The dummy values make libxc allocate all pages from
+ * arbitrary physical nodes. This is the expected behaviour if
+ * no vNUMA configuration is provided to libxc.
+ *
+ * Note that the following hunk is just for the convenience of
+ * allocation code. No defaulting happens in libxc.
+ */
+ if ( dom->nr_vmemranges == 0 )
+ {
+ nr_vmemranges = 1;
+ vmemranges = dummy_vmemrange;
+ vmemranges[0].start = 0;
+ vmemranges[0].end = (uint64_t)dom->total_pages << PAGE_SHIFT;
+ vmemranges[0].flags = 0;
+ vmemranges[0].nid = 0;
+
+ nr_vnodes = 1;
+ vnode_to_pnode = dummy_vnode_to_pnode;
+ vnode_to_pnode[0] = XC_NUMA_NO_NODE;
+ }
+ else
+ {
+ nr_vmemranges = dom->nr_vmemranges;
+ nr_vnodes = dom->nr_vnodes;
+ vmemranges = dom->vmemranges;
+ vnode_to_pnode = dom->vnode_to_pnode;
+ }
+
+ total = dom->p2m_size = 0;
+ for ( i = 0; i < nr_vmemranges; i++ )
+ {
+ total += ((vmemranges[i].end - vmemranges[i].start)
+ >> PAGE_SHIFT);
+ dom->p2m_size =
+ dom->p2m_size > (vmemranges[i].end >> PAGE_SHIFT) ?
+ dom->p2m_size : (vmemranges[i].end >> PAGE_SHIFT);
+ }
+ if ( total != dom->total_pages )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: vNUMA page count mismatch (0x%"PRIpfn" != 0x%"PRIpfn")",
+ __func__, total, dom->total_pages);
+ return -EINVAL;
+ }
+
+ dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) *
+ dom->p2m_size);
+ if ( dom->p2m_host == NULL )
+ return -EINVAL;
+ for ( pfn = 0; pfn < dom->p2m_size; pfn++ )
+ dom->p2m_host[pfn] = INVALID_P2M_ENTRY;
+
/* allocate guest memory */
- for ( i = rc = allocsz = 0;
- (i < dom->total_pages) && !rc;
- i += allocsz )
+ for ( i = 0; i < nr_vmemranges; i++ )
{
- allocsz = dom->total_pages - i;
- if ( allocsz > 1024*1024 )
- allocsz = 1024*1024;
- rc = xc_domain_populate_physmap_exact(
- dom->xch, dom->guest_domid, allocsz,
- 0, 0, &dom->p2m_host[i]);
+ unsigned int memflags;
+ uint64_t pages, super_pages;
+ unsigned int pnode = vnode_to_pnode[vmemranges[i].nid];
+ xen_pfn_t extents[SUPERPAGE_BATCH_SIZE];
+ xen_pfn_t pfn_base_idx;
+
+ memflags = 0;
+ if ( pnode != XC_NUMA_NO_NODE )
+ memflags |= XENMEMF_exact_node(pnode);
+
+ pages = (vmemranges[i].end - vmemranges[i].start)
+ >> PAGE_SHIFT;
+ super_pages = pages >> SUPERPAGE_PFN_SHIFT;
+ pfn_base = vmemranges[i].start >> PAGE_SHIFT;
+
+ for ( pfn = pfn_base; pfn < pfn_base+pages; pfn++ )
+ dom->p2m_host[pfn] = pfn;
+
+ pfn_base_idx = pfn_base;
+ while ( super_pages )
+ {
+ uint64_t count = min_t(uint64_t, super_pages, SUPERPAGE_BATCH_SIZE);
+ super_pages -= count;
+
+ for ( pfn = pfn_base_idx, j = 0;
+ pfn < pfn_base_idx + (count << SUPERPAGE_PFN_SHIFT);
+ pfn += SUPERPAGE_NR_PFNS, j++ )
+ extents[j] = dom->p2m_host[pfn];
+ rc = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count,
+ SUPERPAGE_PFN_SHIFT, memflags,
+ extents);
+ if ( rc < 0 )
+ return rc;
+
+ /* Expand the returned mfns into the p2m array. */
+ pfn = pfn_base_idx;
+ for ( j = 0; j < rc; j++ )
+ {
+ mfn = extents[j];
+ for ( k = 0; k < SUPERPAGE_NR_PFNS; k++, pfn++ )
+ dom->p2m_host[pfn] = mfn + k;
+ }
+ pfn_base_idx = pfn;
+ }
+
+ for ( j = pfn_base_idx - pfn_base; j < pages; j += allocsz )
+ {
+ allocsz = pages - j;
+ if ( allocsz > 1024*1024 )
+ allocsz = 1024*1024;
+
+ rc = xc_domain_populate_physmap_exact(dom->xch,
+ dom->guest_domid, allocsz, 0, memflags,
+ &dom->p2m_host[pfn_base+j]);
+
+ if ( rc )
+ {
+ if ( pnode != XC_NUMA_NO_NODE )
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: failed to allocate 0x%"PRIx64" pages (v=%d, p=%d)",
+ __func__, pages, i, pnode);
+ else
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: failed to allocate 0x%"PRIx64" pages",
+ __func__, pages);
+ return rc;
+ }
+ }
+ rc = 0;
}
/* Ensure no unclaimed pages are left unused.
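The rewritten arch_setup_meminit() therefore allocates memory per vmemrange, pinning each range to its physical node and preferring batches of up to SUPERPAGE_BATCH_SIZE 2MB superpages before falling back to 4k allocations. A hedged sketch of how a caller might describe a single-vnode layout before invoking the builder (field names as used above; sizes illustrative; dom is the struct xc_dom_image under construction):

static xen_vmemrange_t ranges[1];
static unsigned int vnode_to_pnode[1];

ranges[0].start = 0;
ranges[0].end   = 512ULL << 20;   /* one 512MiB range */
ranges[0].flags = 0;
ranges[0].nid   = 0;              /* belongs to vnode 0 */
vnode_to_pnode[0] = 0;            /* vnode 0 pinned to pnode 0 */

dom->nr_vmemranges  = 1;
dom->vmemranges     = ranges;
dom->nr_vnodes      = 1;
dom->vnode_to_pnode = vnode_to_pnode;
/* dom->total_pages must match the pages covered by all ranges, or the
 * vNUMA page count check above panics. */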
@@ -855,7 +978,7 @@ static int map_grant_table_frames(struct xc_dom_image *dom)
{
rc = xc_domain_add_to_physmap(dom->xch, dom->guest_domid,
XENMAPSPACE_grant_table,
- i, dom->total_pages + i);
+ i, dom->p2m_size + i);
if ( rc != 0 )
{
if ( (i > 0) && (errno == EINVAL) )
@@ -865,7 +988,8 @@ static int map_grant_table_frames(struct xc_dom_image *dom)
}
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: mapping grant tables failed " "(pfn=0x%" PRIpfn
- ", rc=%d)", __FUNCTION__, dom->total_pages + i, rc);
+ ", rc=%d, errno=%d)", __FUNCTION__, dom->p2m_size + i,
+ rc, errno);
return rc;
}
}
@@ -918,8 +1042,8 @@ int arch_setup_bootlate(struct xc_dom_image *dom)
if ( rc != 0 )
{
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, "%s: mapping"
- " shared_info failed (pfn=0x%" PRIpfn ", rc=%d)",
- __FUNCTION__, dom->shared_info_pfn, rc);
+ " shared_info failed (pfn=0x%" PRIpfn ", rc=%d, errno=%d)",
+ __FUNCTION__, dom->shared_info_pfn, rc, errno);
return rc;
}
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index eb88eee..09ef748 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2003, K A Fraser.
*/
@@ -27,11 +26,12 @@
#include <xen/memory.h>
#include <xen/hvm/hvm_op.h>
-int xc_domain_create(xc_interface *xch,
- uint32_t ssidref,
- xen_domain_handle_t handle,
- uint32_t flags,
- uint32_t *pdomid)
+int xc_domain_create_config(xc_interface *xch,
+ uint32_t ssidref,
+ xen_domain_handle_t handle,
+ uint32_t flags,
+ uint32_t *pdomid,
+ xc_domain_configuration_t *config)
{
int err;
DECLARE_DOMCTL;
@@ -41,32 +41,40 @@ int xc_domain_create(xc_interface *xch,
domctl.u.createdomain.ssidref = ssidref;
domctl.u.createdomain.flags = flags;
memcpy(domctl.u.createdomain.handle, handle, sizeof(xen_domain_handle_t));
+ /* xc_domain_configuration_t is an alias of arch_domainconfig_t */
+ memcpy(&domctl.u.createdomain.config, config, sizeof(*config));
if ( (err = do_domctl(xch, &domctl)) != 0 )
return err;
*pdomid = (uint16_t)domctl.domain;
+ memcpy(config, &domctl.u.createdomain.config, sizeof(*config));
+
return 0;
}
-#if defined(__arm__) || defined(__aarch64__)
-int xc_domain_configure(xc_interface *xch, uint32_t domid,
- xc_domain_configuration_t *config)
+int xc_domain_create(xc_interface *xch,
+ uint32_t ssidref,
+ xen_domain_handle_t handle,
+ uint32_t flags,
+ uint32_t *pdomid)
{
- int rc;
- DECLARE_DOMCTL;
+ xc_domain_configuration_t config;
- domctl.cmd = XEN_DOMCTL_arm_configure_domain;
- domctl.domain = (domid_t)domid;
- /* xc_domain_configure_t is an alias of xen_domctl_arm_configuredomain */
- memcpy(&domctl.u.configuredomain, config, sizeof(*config));
+ memset(&config, 0, sizeof(config));
- rc = do_domctl(xch, &domctl);
- if ( !rc )
- memcpy(config, &domctl.u.configuredomain, sizeof(*config));
+#if defined (__i386) || defined(__x86_64__)
+ /* No arch-specific configuration for now */
+#elif defined (__arm__) || defined(__aarch64__)
+ config.gic_version = XEN_DOMCTL_CONFIG_GIC_NATIVE;
+ config.nr_spis = 0;
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
- return rc;
+ return xc_domain_create_config(xch, ssidref, handle,
+ flags, pdomid, &config);
}
-#endif
int xc_domain_cacheflush(xc_interface *xch, uint32_t domid,
xen_pfn_t start_pfn, xen_pfn_t nr_pfns)
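xc_domain_create() is now a thin wrapper that fills in a default xc_domain_configuration_t, while callers with arch-specific needs use xc_domain_create_config() directly; the config is copied back on success, so the caller learns what Xen actually chose (e.g. the GIC version behind GIC_NATIVE). An illustrative ARM-aware caller (constants assumed from xen/domctl.h; xch assumed open):

xc_domain_configuration_t config = { 0 };
xen_domain_handle_t handle = { 0 };
uint32_t domid;

#if defined(__arm__) || defined(__aarch64__)
config.gic_version = XEN_DOMCTL_CONFIG_GIC_NATIVE;  /* let Xen pick */
#endif
if ( xc_domain_create_config(xch, 0 /* ssidref */, handle,
                             0 /* flags */, &domid, &config) )
    return -1;            /* errno set */
/* On ARM, config.gic_version now reports the version Xen selected. */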
@@ -112,14 +120,10 @@ int xc_domain_unpause(xc_interface *xch,
int xc_domain_destroy(xc_interface *xch,
uint32_t domid)
{
- int ret;
DECLARE_DOMCTL;
domctl.cmd = XEN_DOMCTL_destroydomain;
domctl.domain = (domid_t)domid;
- do {
- ret = do_domctl(xch, &domctl);
- } while ( ret && (errno == EAGAIN) );
- return ret;
+ return do_domctl(xch, &domctl);
}
int xc_domain_shutdown(xc_interface *xch,
@@ -630,7 +634,7 @@ int xc_shadow_control(xc_interface *xch,
int xc_domain_setmaxmem(xc_interface *xch,
uint32_t domid,
- unsigned int max_memkb)
+ uint64_t max_memkb)
{
DECLARE_DOMCTL;
domctl.cmd = XEN_DOMCTL_max_mem;
@@ -679,6 +683,7 @@ int xc_domain_set_memory_map(xc_interface *xch,
return rc;
}
+
int xc_get_machine_memory_map(xc_interface *xch,
struct e820entry entries[],
uint32_t max_entries)
@@ -725,6 +730,41 @@ int xc_domain_set_memmap_limit(xc_interface *xch,
}
#endif
+int xc_reserved_device_memory_map(xc_interface *xch,
+ uint32_t flags,
+ uint16_t seg,
+ uint8_t bus,
+ uint8_t devfn,
+ struct xen_reserved_device_memory entries[],
+ uint32_t *max_entries)
+{
+ int rc;
+ struct xen_reserved_device_memory_map xrdmmap = {
+ .flags = flags,
+ .dev.pci.seg = seg,
+ .dev.pci.bus = bus,
+ .dev.pci.devfn = devfn,
+ .nr_entries = *max_entries
+ };
+ DECLARE_HYPERCALL_BOUNCE(entries,
+ sizeof(struct xen_reserved_device_memory) *
+ *max_entries, XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+
+ if ( xc_hypercall_bounce_pre(xch, entries) )
+ return -1;
+
+ set_xen_guest_handle(xrdmmap.buffer, entries);
+
+ rc = do_memory_op(xch, XENMEM_reserved_device_memory_map,
+ &xrdmmap, sizeof(xrdmmap));
+
+ xc_hypercall_bounce_post(xch, entries);
+
+ *max_entries = xrdmmap.nr_entries;
+
+ return rc;
+}
+
int xc_domain_set_time_offset(xc_interface *xch,
uint32_t domid,
int32_t time_offset_seconds)
@@ -755,10 +795,10 @@ int xc_domain_set_tsc_info(xc_interface *xch,
DECLARE_DOMCTL;
domctl.cmd = XEN_DOMCTL_settscinfo;
domctl.domain = (domid_t)domid;
- domctl.u.tsc_info.info.tsc_mode = tsc_mode;
- domctl.u.tsc_info.info.elapsed_nsec = elapsed_nsec;
- domctl.u.tsc_info.info.gtsc_khz = gtsc_khz;
- domctl.u.tsc_info.info.incarnation = incarnation;
+ domctl.u.tsc_info.tsc_mode = tsc_mode;
+ domctl.u.tsc_info.elapsed_nsec = elapsed_nsec;
+ domctl.u.tsc_info.gtsc_khz = gtsc_khz;
+ domctl.u.tsc_info.incarnation = incarnation;
return do_domctl(xch, &domctl);
}
@@ -771,31 +811,41 @@ int xc_domain_get_tsc_info(xc_interface *xch,
{
int rc;
DECLARE_DOMCTL;
- DECLARE_HYPERCALL_BUFFER(xen_guest_tsc_info_t, info);
-
- info = xc_hypercall_buffer_alloc(xch, info, sizeof(*info));
- if ( info == NULL )
- return -ENOMEM;
domctl.cmd = XEN_DOMCTL_gettscinfo;
domctl.domain = (domid_t)domid;
- set_xen_guest_handle(domctl.u.tsc_info.out_info, info);
rc = do_domctl(xch, &domctl);
if ( rc == 0 )
{
- *tsc_mode = info->tsc_mode;
- *elapsed_nsec = info->elapsed_nsec;
- *gtsc_khz = info->gtsc_khz;
- *incarnation = info->incarnation;
+ *tsc_mode = domctl.u.tsc_info.tsc_mode;
+ *elapsed_nsec = domctl.u.tsc_info.elapsed_nsec;
+ *gtsc_khz = domctl.u.tsc_info.gtsc_khz;
+ *incarnation = domctl.u.tsc_info.incarnation;
}
- xc_hypercall_buffer_free(xch, info);
return rc;
}
-int xc_domain_maximum_gpfn(xc_interface *xch, domid_t domid)
+int xc_domain_maximum_gpfn(xc_interface *xch, domid_t domid, xen_pfn_t *gpfns)
+{
+ int rc = do_memory_op(xch, XENMEM_maximum_gpfn, &domid, sizeof(domid));
+
+ if ( rc >= 0 )
+ {
+ *gpfns = rc;
+ rc = 0;
+ }
+ return rc;
+}
+
+int xc_domain_nr_gpfns(xc_interface *xch, domid_t domid, xen_pfn_t *gpfns)
{
- return do_memory_op(xch, XENMEM_maximum_gpfn, &domid, sizeof(domid));
+ int rc = xc_domain_maximum_gpfn(xch, domid, gpfns);
+
+ if ( rc >= 0 )
+ *gpfns += 1;
+
+ return rc;
}
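The out-parameter rework matters because a gpfn value does not safely fit the old int return on large hosts: a big frame number would have gone negative and been indistinguishable from an error. Status and value are now separate, as in this sketch (PERROR/DPRINTF assumed from xc_private.h):

xen_pfn_t max_gpfn;
int rc = xc_domain_maximum_gpfn(xch, domid, &max_gpfn);

if ( rc < 0 )
    PERROR("XENMEM_maximum_gpfn failed");               /* a real error */
else
    DPRINTF("max gpfn 0x%lx", (unsigned long)max_gpfn); /* any value is valid */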
int xc_domain_increase_reservation(xc_interface *xch,
@@ -1295,11 +1345,32 @@ int xc_domain_send_trigger(xc_interface *xch,
return do_domctl(xch, &domctl);
}
+static inline int xc_hvm_param_deprecated_check(uint32_t param)
+{
+ switch ( param )
+ {
+ case HVM_PARAM_MEMORY_EVENT_CR0:
+ case HVM_PARAM_MEMORY_EVENT_CR3:
+ case HVM_PARAM_MEMORY_EVENT_CR4:
+ case HVM_PARAM_MEMORY_EVENT_INT3:
+ case HVM_PARAM_MEMORY_EVENT_SINGLE_STEP:
+ case HVM_PARAM_MEMORY_EVENT_MSR:
+ return -EOPNOTSUPP;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
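The helper makes the retired HVM_PARAM_MEMORY_EVENT_* controls fail fast from both the set and get paths below, e.g. (illustrative; the intended replacement is the new xc_monitor_* interface, an assumption based on this release's monitor rework):

int rc = xc_hvm_param_set(xch, domid, HVM_PARAM_MEMORY_EVENT_INT3, 1);
/* rc == -EOPNOTSUPP: the param is gone; switch to xc_monitor_*(). */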
int xc_hvm_param_set(xc_interface *handle, domid_t dom, uint32_t param, uint64_t value)
{
DECLARE_HYPERCALL;
DECLARE_HYPERCALL_BUFFER(xen_hvm_param_t, arg);
- int rc;
+ int rc = xc_hvm_param_deprecated_check(param);
+
+ if ( rc )
+ return rc;
arg = xc_hypercall_buffer_alloc(handle, arg, sizeof(*arg));
if ( arg == NULL )
@@ -1320,7 +1391,10 @@ int xc_hvm_param_get(xc_interface *handle, domid_t dom, uint32_t param, uint64_t
{
DECLARE_HYPERCALL;
DECLARE_HYPERCALL_BUFFER(xen_hvm_param_t, arg);
- int rc;
+ int rc = xc_hvm_param_deprecated_check(param);
+
+ if ( rc )
+ return rc;
arg = xc_hypercall_buffer_alloc(handle, arg, sizeof(*arg));
if ( arg == NULL )
@@ -1372,7 +1446,7 @@ int xc_hvm_create_ioreq_server(xc_interface *xch,
hypercall.arg[1] = HYPERCALL_BUFFER_AS_ARG(arg);
arg->domid = domid;
- arg->handle_bufioreq = !!handle_bufioreq;
+ arg->handle_bufioreq = handle_bufioreq;
rc = do_xen_hypercall(xch, &hypercall);
@@ -1622,13 +1696,16 @@ int xc_domain_setdebugging(xc_interface *xch,
int xc_assign_device(
xc_interface *xch,
uint32_t domid,
- uint32_t machine_sbdf)
+ uint32_t machine_sbdf,
+ uint32_t flag)
{
DECLARE_DOMCTL;
domctl.cmd = XEN_DOMCTL_assign_device;
domctl.domain = domid;
- domctl.u.assign_device.machine_sbdf = machine_sbdf;
+ domctl.u.assign_device.dev = XEN_DOMCTL_DEV_PCI;
+ domctl.u.assign_device.u.pci.machine_sbdf = machine_sbdf;
+ domctl.u.assign_device.flag = flag;
return do_domctl(xch, &domctl);
}
@@ -1677,7 +1754,8 @@ int xc_test_assign_device(
domctl.cmd = XEN_DOMCTL_test_assign_device;
domctl.domain = domid;
- domctl.u.assign_device.machine_sbdf = machine_sbdf;
+ domctl.u.assign_device.dev = XEN_DOMCTL_DEV_PCI;
+ domctl.u.assign_device.u.pci.machine_sbdf = machine_sbdf;
return do_domctl(xch, &domctl);
}
@@ -1691,11 +1769,101 @@ int xc_deassign_device(
domctl.cmd = XEN_DOMCTL_deassign_device;
domctl.domain = domid;
- domctl.u.assign_device.machine_sbdf = machine_sbdf;
-
+ domctl.u.assign_device.dev = XEN_DOMCTL_DEV_PCI;
+ domctl.u.assign_device.u.pci.machine_sbdf = machine_sbdf;
+
return do_domctl(xch, &domctl);
}
+int xc_assign_dt_device(
+ xc_interface *xch,
+ uint32_t domid,
+ char *path)
+{
+ int rc;
+ size_t size = strlen(path);
+ DECLARE_DOMCTL;
+ DECLARE_HYPERCALL_BOUNCE(path, size, XC_HYPERCALL_BUFFER_BOUNCE_IN);
+
+ if ( xc_hypercall_bounce_pre(xch, path) )
+ return -1;
+
+ domctl.cmd = XEN_DOMCTL_assign_device;
+ domctl.domain = (domid_t)domid;
+
+ domctl.u.assign_device.dev = XEN_DOMCTL_DEV_DT;
+ domctl.u.assign_device.u.dt.size = size;
+ /*
+ * DT devices own no RDM (reserved device memory), so no assignment
+ * flag is meaningful here; always pass 0.
+ */
+ domctl.u.assign_device.flag = 0;
+ set_xen_guest_handle(domctl.u.assign_device.u.dt.path, path);
+
+ rc = do_domctl(xch, &domctl);
+
+ xc_hypercall_bounce_post(xch, path);
+
+ return rc;
+}
+
+int xc_test_assign_dt_device(
+ xc_interface *xch,
+ uint32_t domid,
+ char *path)
+{
+ int rc;
+ size_t size = strlen(path);
+ DECLARE_DOMCTL;
+ DECLARE_HYPERCALL_BOUNCE(path, size, XC_HYPERCALL_BUFFER_BOUNCE_IN);
+
+ if ( xc_hypercall_bounce_pre(xch, path) )
+ return -1;
+
+ domctl.cmd = XEN_DOMCTL_test_assign_device;
+ domctl.domain = (domid_t)domid;
+
+ domctl.u.assign_device.dev = XEN_DOMCTL_DEV_DT;
+ domctl.u.assign_device.u.dt.size = size;
+ set_xen_guest_handle(domctl.u.assign_device.u.dt.path, path);
+
+ rc = do_domctl(xch, &domctl);
+
+ xc_hypercall_bounce_post(xch, path);
+
+ return rc;
+}
+
+int xc_deassign_dt_device(
+ xc_interface *xch,
+ uint32_t domid,
+ char *path)
+{
+ int rc;
+ size_t size = strlen(path);
+ DECLARE_DOMCTL;
+ DECLARE_HYPERCALL_BOUNCE(path, size, XC_HYPERCALL_BUFFER_BOUNCE_IN);
+
+ if ( xc_hypercall_bounce_pre(xch, path) )
+ return -1;
+
+ domctl.cmd = XEN_DOMCTL_deassign_device;
+ domctl.domain = (domid_t)domid;
+
+ domctl.u.assign_device.dev = XEN_DOMCTL_DEV_DT;
+ domctl.u.assign_device.u.dt.size = size;
+ set_xen_guest_handle(domctl.u.assign_device.u.dt.path, path);
+
+ rc = do_domctl(xch, &domctl);
+
+ xc_hypercall_bounce_post(xch, path);
+
+ return rc;
+}
+
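The three device-tree wrappers bounce the node path into hypercall-safe memory and differ only in the domctl command they issue. A hypothetical caller passing a UART node through (the path is illustrative):

char path[] = "/soc/serial@ff1a0000";     /* hypothetical DT node */

if ( xc_test_assign_dt_device(xch, domid, path) )
    return -1;                            /* device is not assignable */
if ( xc_assign_dt_device(xch, domid, path) )
    return -1;                            /* assignment failed; errno set */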
int xc_domain_update_msi_irq(
xc_interface *xch,
uint32_t domid,
@@ -1751,15 +1919,16 @@ int xc_domain_unbind_msi_irq(
}
/* Pass-through: binds machine irq to guests irq */
-int xc_domain_bind_pt_irq(
+static int xc_domain_bind_pt_irq_int(
xc_interface *xch,
uint32_t domid,
- uint8_t machine_irq,
+ uint32_t machine_irq,
uint8_t irq_type,
uint8_t bus,
uint8_t device,
uint8_t intx,
- uint8_t isa_irq)
+ uint8_t isa_irq,
+ uint16_t spi)
{
int rc;
xen_domctl_bind_pt_irq_t * bind;
@@ -1783,6 +1952,9 @@ int xc_domain_bind_pt_irq(
case PT_IRQ_TYPE_ISA:
bind->u.isa.isa_irq = isa_irq;
break;
+ case PT_IRQ_TYPE_SPI:
+ bind->u.spi.spi = spi;
+ break;
default:
errno = EINVAL;
return -1;
@@ -1792,7 +1964,7 @@ int xc_domain_bind_pt_irq(
return rc;
}
-int xc_domain_unbind_pt_irq(
+int xc_domain_bind_pt_irq(
xc_interface *xch,
uint32_t domid,
uint8_t machine_irq,
@@ -1802,6 +1974,21 @@ int xc_domain_unbind_pt_irq(
uint8_t intx,
uint8_t isa_irq)
{
+ return xc_domain_bind_pt_irq_int(xch, domid, machine_irq, irq_type,
+ bus, device, intx, isa_irq, 0);
+}
+
+static int xc_domain_unbind_pt_irq_int(
+ xc_interface *xch,
+ uint32_t domid,
+ uint32_t machine_irq,
+ uint8_t irq_type,
+ uint8_t bus,
+ uint8_t device,
+ uint8_t intx,
+ uint8_t isa_irq,
+ uint8_t spi)
+{
int rc;
xen_domctl_bind_pt_irq_t * bind;
DECLARE_DOMCTL;
@@ -1824,6 +2011,9 @@ int xc_domain_unbind_pt_irq(
case PT_IRQ_TYPE_ISA:
bind->u.isa.isa_irq = isa_irq;
break;
+ case PT_IRQ_TYPE_SPI:
+ bind->u.spi.spi = spi;
+ break;
default:
errno = EINVAL;
return -1;
@@ -1833,6 +2023,20 @@ int xc_domain_unbind_pt_irq(
return rc;
}
+int xc_domain_unbind_pt_irq(
+ xc_interface *xch,
+ uint32_t domid,
+ uint8_t machine_irq,
+ uint8_t irq_type,
+ uint8_t bus,
+ uint8_t device,
+ uint8_t intx,
+ uint8_t isa_irq)
+{
+ return xc_domain_unbind_pt_irq_int(xch, domid, machine_irq, irq_type,
+ bus, device, intx, isa_irq, 0);
+}
+
int xc_domain_bind_pt_pci_irq(
xc_interface *xch,
uint32_t domid,
@@ -1856,6 +2060,25 @@ int xc_domain_bind_pt_isa_irq(
PT_IRQ_TYPE_ISA, 0, 0, 0, machine_irq));
}
+int xc_domain_bind_pt_spi_irq(
+ xc_interface *xch,
+ uint32_t domid,
+ uint16_t vspi,
+ uint16_t spi)
+{
+ return (xc_domain_bind_pt_irq_int(xch, domid, vspi,
+ PT_IRQ_TYPE_SPI, 0, 0, 0, 0, spi));
+}
+
+int xc_domain_unbind_pt_spi_irq(xc_interface *xch,
+ uint32_t domid,
+ uint16_t vspi,
+ uint16_t spi)
+{
+ return (xc_domain_unbind_pt_irq_int(xch, domid, vspi,
+ PT_IRQ_TYPE_SPI, 0, 0, 0, 0, spi));
+}
+
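PT_IRQ_TYPE_SPI extends the pass-through bind/unbind paths to ARM shared peripheral interrupts; the public PCI/ISA entry points keep their old uint8_t prototypes and simply forward to the widened internal helpers. The new SPI wrappers in use (IRQ numbers illustrative):

/* Route physical SPI 40 to the guest as virtual SPI 40 ... */
if ( xc_domain_bind_pt_spi_irq(xch, domid, 40 /* vspi */, 40 /* spi */) )
    return -1;
/* ... and tear the route down again on deassignment. */
if ( xc_domain_unbind_pt_spi_irq(xch, domid, 40, 40) )
    return -1;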
int xc_unmap_domain_meminfo(xc_interface *xch, struct xc_domain_meminfo *minfo)
{
struct domain_info_context _di = { .guest_width = minfo->guest_width,
@@ -2013,7 +2236,7 @@ int xc_domain_memory_mapping(
max_batch_sz = nr_mfns;
do
{
- nr = min(nr_mfns - done, max_batch_sz);
+ nr = min_t(unsigned long, nr_mfns - done, max_batch_sz);
domctl.u.memory_mapping.nr_mfns = nr;
domctl.u.memory_mapping.first_gfn = first_gfn + done;
domctl.u.memory_mapping.first_mfn = first_mfn + done;
@@ -2228,8 +2451,7 @@ int xc_domain_setvnuma(xc_interface *xch,
XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
errno = EINVAL;
- if ( nr_vnodes == 0 || nr_vmemranges == 0 ||
- nr_vmemranges < nr_vnodes || nr_vcpus == 0 )
+ if ( nr_vnodes == 0 || nr_vmemranges == 0 || nr_vcpus == 0 )
return -1;
if ( !vdistance || !vcpu_to_vnode || !vmemrange || !vnode_to_pnode )
@@ -2271,6 +2493,60 @@ int xc_domain_setvnuma(xc_interface *xch,
return rc;
}
+int xc_domain_getvnuma(xc_interface *xch,
+ uint32_t domid,
+ uint32_t *nr_vnodes,
+ uint32_t *nr_vmemranges,
+ uint32_t *nr_vcpus,
+ xen_vmemrange_t *vmemrange,
+ unsigned int *vdistance,
+ unsigned int *vcpu_to_vnode)
+{
+ int rc;
+ DECLARE_HYPERCALL_BOUNCE(vmemrange, sizeof(*vmemrange) * *nr_vmemranges,
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+ DECLARE_HYPERCALL_BOUNCE(vdistance, sizeof(*vdistance) *
+ *nr_vnodes * *nr_vnodes,
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+ DECLARE_HYPERCALL_BOUNCE(vcpu_to_vnode, sizeof(*vcpu_to_vnode) * *nr_vcpus,
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+
+ struct xen_vnuma_topology_info vnuma_topo;
+
+ if ( xc_hypercall_bounce_pre(xch, vmemrange) ||
+ xc_hypercall_bounce_pre(xch, vdistance) ||
+ xc_hypercall_bounce_pre(xch, vcpu_to_vnode) )
+ {
+ rc = -1;
+ errno = ENOMEM;
+ goto vnumaget_fail;
+ }
+
+ set_xen_guest_handle(vnuma_topo.vmemrange.h, vmemrange);
+ set_xen_guest_handle(vnuma_topo.vdistance.h, vdistance);
+ set_xen_guest_handle(vnuma_topo.vcpu_to_vnode.h, vcpu_to_vnode);
+
+ vnuma_topo.nr_vnodes = *nr_vnodes;
+ vnuma_topo.nr_vcpus = *nr_vcpus;
+ vnuma_topo.nr_vmemranges = *nr_vmemranges;
+ vnuma_topo.domid = domid;
+ vnuma_topo.pad = 0;
+
+ rc = do_memory_op(xch, XENMEM_get_vnumainfo, &vnuma_topo,
+ sizeof(vnuma_topo));
+
+ *nr_vnodes = vnuma_topo.nr_vnodes;
+ *nr_vcpus = vnuma_topo.nr_vcpus;
+ *nr_vmemranges = vnuma_topo.nr_vmemranges;
+
+ vnumaget_fail:
+ xc_hypercall_bounce_post(xch, vmemrange);
+ xc_hypercall_bounce_post(xch, vdistance);
+ xc_hypercall_bounce_post(xch, vcpu_to_vnode);
+
+ return rc;
+}
+
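xc_domain_getvnuma() mirrors the existing setter: the caller passes its buffer sizes in the nr_* parameters and Xen writes back the real counts, so an undersized query can be detected and retried. A hedged single-vnode retrieval (sizes assumed known, e.g. from an earlier xc_domain_setvnuma()):

uint32_t nr_vnodes = 1, nr_vmemranges = 1, nr_vcpus = 4;
xen_vmemrange_t vmemrange[1];
unsigned int vdistance[1];       /* nr_vnodes * nr_vnodes entries */
unsigned int vcpu_to_vnode[4];

if ( xc_domain_getvnuma(xch, domid, &nr_vnodes, &nr_vmemranges, &nr_vcpus,
                        vmemrange, vdistance, vcpu_to_vnode) )
    return -1;   /* on failure the nr_* values hold Xen's actual counts */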
/*
* Local variables:
* mode: C
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
deleted file mode 100644
index a382701..0000000
--- a/tools/libxc/xc_domain_restore.c
+++ /dev/null
@@ -1,2399 +0,0 @@
-/******************************************************************************
- * xc_domain_restore.c
- *
- * Restore the state of a guest session.
- *
- * Copyright (c) 2003, K A Fraser.
- * Copyright (c) 2006, Intel Corporation
- * Copyright (c) 2007, XenSource Inc.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- */
-
-/*
- * The superpages flag in restore has two different meanings depending on
- * the type of domain.
- *
- * For an HVM domain, the flag means to look for properly aligned contiguous
- * pages and try to allocate a superpage to satisfy it. If that fails,
- * fall back to small pages.
- *
- * For a PV domain, the flag means allocate all memory as superpages. If that
- * fails, the restore fails. This behavior is required for PV guests who
- * want to use superpages.
- */
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <inttypes.h>
-
-#include "xg_private.h"
-#include "xg_save_restore.h"
-#include "xc_dom.h"
-
-#include <xen/hvm/ioreq.h>
-#include <xen/hvm/params.h>
-
-struct restore_ctx {
- unsigned long max_mfn; /* max mfn of the current host machine */
- unsigned long hvirt_start; /* virtual starting address of the hypervisor */
- unsigned int pt_levels; /* #levels of page tables used by the current guest */
- unsigned long nr_pfns; /* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */
- xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
- xen_pfn_t *p2m; /* A table mapping each PFN to its new MFN. */
- xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region. */
- xen_pfn_t *p2m_saved_batch; /* Copy of p2m_batch array for pv superpage alloc */
- int superpages; /* Superpage allocation has been requested */
- int hvm; /* This is an hvm domain */
- int completed; /* Set when a consistent image is available */
- int last_checkpoint; /* Set when we should commit to the current checkpoint when it completes. */
- int compressing; /* Set when sender signals that pages would be sent compressed (for Remus) */
- struct domain_info_context dinfo;
-};
-
-#define HEARTBEAT_MS 1000
-
-#ifndef __MINIOS__
-static ssize_t rdexact(xc_interface *xch, struct restore_ctx *ctx,
- int fd, void* buf, size_t size)
-{
- size_t offset = 0;
- ssize_t len;
- struct timeval tv;
- fd_set rfds;
-
- while ( offset < size )
- {
- if ( ctx->completed ) {
- /* expect a heartbeat every HEARTBEAT_MS ms at most */
- tv.tv_sec = HEARTBEAT_MS / 1000;
- tv.tv_usec = (HEARTBEAT_MS % 1000) * 1000;
-
- FD_ZERO(&rfds);
- FD_SET(fd, &rfds);
- len = select(fd + 1, &rfds, NULL, NULL, &tv);
- if ( len == -1 && errno == EINTR )
- continue;
- if ( !FD_ISSET(fd, &rfds) ) {
- ERROR("%s failed (select returned %zd)", __func__, len);
- errno = ETIMEDOUT;
- return -1;
- }
- }
-
- len = read(fd, buf + offset, size - offset);
- if ( (len == -1) && ((errno == EINTR) || (errno == EAGAIN)) )
- continue;
- if ( len == 0 ) {
- ERROR("0-length read");
- errno = 0;
- }
- if ( len <= 0 ) {
- ERROR("%s failed (read rc: %zd, errno: %d)", __func__, len, errno);
- return -1;
- }
- offset += len;
- }
-
- return 0;
-}
-
-#define RDEXACT(fd,buf,size) rdexact(xch, ctx, fd, buf, size)
-#else
-#define RDEXACT read_exact
-#endif
-
-#define SUPERPAGE_PFN_SHIFT 9
-#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
-#define SUPERPAGE(_pfn) ((_pfn) & (~(SUPERPAGE_NR_PFNS-1)))
-#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
-
-/*
-** When we're restoring into a pv superpage-allocated guest, we take
-** a copy of the p2m_batch array to preserve the pfn, then allocate the
-** corresponding superpages. We then fill in the p2m array using the saved
-** pfns.
-*/
-static int alloc_superpage_mfns(
- xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, int nr_mfns)
-{
- int i, j, max = 0;
- unsigned long pfn, base_pfn, mfn;
-
- for (i = 0; i < nr_mfns; i++)
- {
- pfn = ctx->p2m_batch[i];
- base_pfn = SUPERPAGE(pfn);
- if (ctx->p2m[base_pfn] != (INVALID_P2M_ENTRY-2))
- {
- ctx->p2m_saved_batch[max] = base_pfn;
- ctx->p2m_batch[max] = base_pfn;
- max++;
- ctx->p2m[base_pfn] = INVALID_P2M_ENTRY-2;
- }
- }
- if (xc_domain_populate_physmap_exact(xch, dom, max, SUPERPAGE_PFN_SHIFT,
- 0, ctx->p2m_batch) != 0)
- return 1;
-
- for (i = 0; i < max; i++)
- {
- mfn = ctx->p2m_batch[i];
- pfn = ctx->p2m_saved_batch[i];
- for (j = 0; j < SUPERPAGE_NR_PFNS; j++)
- ctx->p2m[pfn++] = mfn++;
- }
- return 0;
-}
-/*
-** In the state file (or during transfer), all page-table pages are
-** converted into a 'canonical' form where references to actual mfns
-** are replaced with references to the corresponding pfns.
-** This function inverts that operation, replacing the pfn values with
-** the (now known) appropriate mfn values.
-*/
-static int uncanonicalize_pagetable(
- xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, void *page)
-{
- int i, rc, pte_last, nr_mfns = 0;
- unsigned long pfn;
- uint64_t pte;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- pte_last = PAGE_SIZE / 8;
-
- /* First pass: work out how many (if any) MFNs we need to alloc */
- for ( i = 0; i < pte_last; i++ )
- {
- pte = ((uint64_t *)page)[i];
-
- /* XXX SMH: below needs fixing for PROT_NONE etc */
- if ( !(pte & _PAGE_PRESENT) )
- continue;
-
- pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
-
- if ( pfn >= dinfo->p2m_size )
- {
- /* This "page table page" is probably not one; bail. */
- ERROR("Frame number in page table is out of range: "
- "i=%d pfn=0x%lx p2m_size=%lu",
- i, pfn, dinfo->p2m_size);
- return 0;
- }
-
- if ( ctx->p2m[pfn] == INVALID_P2M_ENTRY )
- {
- /* Have a 'valid' PFN without a matching MFN - need to alloc */
- ctx->p2m_batch[nr_mfns++] = pfn;
- ctx->p2m[pfn]--;
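- /* p2m[pfn] is now INVALID_P2M_ENTRY-1, a sentinel meaning
- * "allocation pending"; the second pass below matches on it. */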
- }
- }
-
- /* Allocate the requisite number of mfns. */
- if (nr_mfns)
- {
- if (!ctx->hvm && ctx->superpages)
- rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
- else
- rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
- ctx->p2m_batch);
-
- if (rc)
- {
- ERROR("Failed to allocate memory for batch.!\n");
- errno = ENOMEM;
- return 0;
- }
- }
-
- /* Second pass: uncanonicalize each present PTE */
- nr_mfns = 0;
- for ( i = 0; i < pte_last; i++ )
- {
- pte = ((uint64_t *)page)[i];
-
- /* XXX SMH: below needs fixing for PROT_NONE etc */
- if ( !(pte & _PAGE_PRESENT) )
- continue;
-
- pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
-
- if ( ctx->p2m[pfn] == (INVALID_P2M_ENTRY-1) )
- ctx->p2m[pfn] = ctx->p2m_batch[nr_mfns++];
-
- pte &= ~MADDR_MASK_X86;
- pte |= (uint64_t)ctx->p2m[pfn] << PAGE_SHIFT;
-
- ((uint64_t *)page)[i] = (uint64_t)pte;
- }
-
- return 1;
-}
-
-
-/* Load the p2m frame list, plus potential extended info chunk */
-static xen_pfn_t *load_p2m_frame_list(
- xc_interface *xch, struct restore_ctx *ctx,
- int io_fd, int *pae_extended_cr3, int *ext_vcpucontext,
- uint32_t *vcpuextstate_size)
-{
- xen_pfn_t *p2m_frame_list;
- vcpu_guest_context_any_t ctxt;
- xen_pfn_t p2m_fl_zero;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- /* Read first entry of P2M list, or extended-info signature (~0UL). */
- if ( RDEXACT(io_fd, &p2m_fl_zero, sizeof(long)) )
- {
- PERROR("read extended-info signature failed");
- return NULL;
- }
-
- if ( p2m_fl_zero == ~0UL )
- {
- uint32_t tot_bytes;
-
- /* Next 4 bytes: total size of following extended info. */
- if ( RDEXACT(io_fd, &tot_bytes, sizeof(tot_bytes)) )
- {
- PERROR("read extended-info size failed");
- return NULL;
- }
-
- while ( tot_bytes )
- {
- uint32_t chunk_bytes;
- char chunk_sig[4];
-
- /* 4-character chunk signature + 4-byte remaining chunk size. */
- if ( RDEXACT(io_fd, chunk_sig, sizeof(chunk_sig)) ||
- RDEXACT(io_fd, &chunk_bytes, sizeof(chunk_bytes)) ||
- (tot_bytes < (chunk_bytes + 8)) )
- {
- PERROR("read extended-info chunk signature failed");
- return NULL;
- }
- tot_bytes -= 8;
-
- /* VCPU context structure? */
- if ( !strncmp(chunk_sig, "vcpu", 4) )
- {
- /* Pick a guest word-size and PT depth from the ctxt size */
- if ( chunk_bytes == sizeof (ctxt.x32) )
- {
- dinfo->guest_width = 4;
- ctx->pt_levels = 3;
- }
- else if ( chunk_bytes == sizeof (ctxt.x64) )
- {
- dinfo->guest_width = 8;
- ctx->pt_levels = 4;
- }
- else
- {
- ERROR("bad extended-info context size %d", chunk_bytes);
- return NULL;
- }
-
- if ( RDEXACT(io_fd, &ctxt, chunk_bytes) )
- {
- PERROR("read extended-info vcpu context failed");
- return NULL;
- }
- tot_bytes -= chunk_bytes;
- chunk_bytes = 0;
-
- if ( GET_FIELD(&ctxt, vm_assist, dinfo->guest_width)
- & (1UL << VMASST_TYPE_pae_extended_cr3) )
- *pae_extended_cr3 = 1;
- }
- else if ( !strncmp(chunk_sig, "extv", 4) )
- {
- *ext_vcpucontext = 1;
- }
- else if ( !strncmp(chunk_sig, "xcnt", 4) )
- {
- if ( RDEXACT(io_fd, vcpuextstate_size, sizeof(*vcpuextstate_size)) )
- {
- PERROR("read extended vcpu state size failed");
- return NULL;
- }
- tot_bytes -= chunk_bytes;
- chunk_bytes = 0;
- }
-
- /* Any remaining bytes of this chunk: read and discard. */
- while ( chunk_bytes )
- {
- unsigned long sz = min_t(unsigned long, chunk_bytes, sizeof(xen_pfn_t));
- if ( RDEXACT(io_fd, &p2m_fl_zero, sz) )
- {
- PERROR("read-and-discard extended-info chunk bytes failed");
- return NULL;
- }
- chunk_bytes -= sz;
- tot_bytes -= sz;
- }
- }
-
- /* Now read the real first entry of P2M list. */
- if ( RDEXACT(io_fd, &p2m_fl_zero, sizeof(xen_pfn_t)) )
- {
- PERROR("read first entry of p2m_frame_list failed");
- return NULL;
- }
- }
-
- /* Now that we know the guest's word-size, we can safely allocate
- * the p2m frame list */
- if ( (p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) == NULL )
- {
- ERROR("Couldn't allocate p2m_frame_list array");
- return NULL;
- }
-
- /* First entry has already been read. */
- p2m_frame_list[0] = p2m_fl_zero;
- if ( RDEXACT(io_fd, &p2m_frame_list[1],
- (P2M_FL_ENTRIES - 1) * sizeof(xen_pfn_t)) )
- {
- PERROR("read p2m_frame_list failed");
- free(p2m_frame_list);
- return NULL;
- }
-
- return p2m_frame_list;
-}
-
-typedef struct {
- int ishvm;
- union {
- struct tailbuf_pv {
- unsigned int pfncount;
- unsigned long* pfntab;
- unsigned int vcpucount;
- unsigned char* vcpubuf;
- unsigned char shared_info_page[PAGE_SIZE];
- } pv;
- struct tailbuf_hvm {
- uint64_t magicpfns[3];
- uint32_t hvmbufsize, reclen;
- uint8_t* hvmbuf;
- struct {
- uint32_t magic;
- uint32_t version;
- uint64_t len;
- } qemuhdr;
- uint32_t qemubufsize;
- uint8_t* qemubuf;
- } hvm;
- } u;
-} tailbuf_t;
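-
-/*
-** The image tail (pfn table, vcpu contexts and shared info for PV;
-** magic pfns, HVM params and QEMU state for HVM) is buffered in full
-** before being applied, so that an incomplete Remus checkpoint can be
-** discarded in favour of the last complete one.
-*/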
-
-/* read stream until EOF, growing buffer as necessary */
-static int compat_buffer_qemu(xc_interface *xch, struct restore_ctx *ctx,
- int fd, struct tailbuf_hvm *buf)
-{
- uint8_t *qbuf, *tmp;
- int blen = 0, dlen = 0;
- int rc;
-
- /* currently save records tend to be about 7K */
- blen = 8192;
- if ( !(qbuf = malloc(blen)) ) {
- ERROR("Error allocating QEMU buffer");
- return -1;
- }
-
- while( (rc = read(fd, qbuf+dlen, blen-dlen)) > 0 ) {
- DPRINTF("Read %d bytes of QEMU data\n", rc);
- dlen += rc;
-
- if (dlen == blen) {
- DPRINTF("%d-byte QEMU buffer full, reallocating...\n", dlen);
- blen += 4096;
- tmp = realloc(qbuf, blen);
- if ( !tmp ) {
- ERROR("Error growing QEMU buffer to %d bytes", blen);
- free(qbuf);
- return -1;
- }
- qbuf = tmp;
- }
- }
-
- if ( rc < 0 ) {
- ERROR("Error reading QEMU data");
- free(qbuf);
- return -1;
- }
-
- if ( memcmp(qbuf, "QEVM", 4) ) {
- ERROR("Invalid QEMU magic: 0x%08"PRIx32, *(uint32_t*)qbuf);
- free(qbuf);
- return -1;
- }
-
- buf->qemubuf = qbuf;
- buf->qemubufsize = dlen;
-
- return 0;
-}
-
-static int buffer_qemu(xc_interface *xch, struct restore_ctx *ctx,
- int fd, struct tailbuf_hvm *buf)
-{
- uint32_t qlen;
- uint8_t *tmp;
-
- if ( RDEXACT(fd, &qlen, sizeof(qlen)) ) {
- PERROR("Error reading QEMU header length");
- return -1;
- }
-
- if ( qlen > buf->qemubufsize ) {
- if ( buf->qemubuf) {
- tmp = realloc(buf->qemubuf, qlen);
- if ( tmp )
- buf->qemubuf = tmp;
- else {
- ERROR("Error reallocating QEMU state buffer");
- return -1;
- }
- } else {
- buf->qemubuf = malloc(qlen);
- if ( !buf->qemubuf ) {
- ERROR("Error allocating QEMU state buffer");
- return -1;
- }
- }
- }
- buf->qemubufsize = qlen;
-
- if ( RDEXACT(fd, buf->qemubuf, buf->qemubufsize) ) {
- PERROR("Error reading QEMU state");
- return -1;
- }
-
- return 0;
-}
-
-static int dump_qemu(xc_interface *xch, uint32_t dom, struct tailbuf_hvm *buf)
-{
- int saved_errno;
- char path[256];
- FILE *fp;
-
- sprintf(path, XC_DEVICE_MODEL_RESTORE_FILE".%u", dom);
- fp = fopen(path, "wb");
- if ( !fp )
- return -1;
-
- DPRINTF("Writing %d bytes of QEMU data\n", buf->qemubufsize);
- if ( fwrite(buf->qemubuf, 1, buf->qemubufsize, fp) != buf->qemubufsize) {
- saved_errno = errno;
- fclose(fp);
- errno = saved_errno;
- return -1;
- }
-
- fclose(fp);
-
- return 0;
-}
-
-static int buffer_tail_hvm(xc_interface *xch, struct restore_ctx *ctx,
- struct tailbuf_hvm *buf, int fd,
- unsigned int max_vcpu_id, uint64_t *vcpumap,
- int ext_vcpucontext,
- uint32_t vcpuextstate_size)
-{
- uint8_t *tmp;
- unsigned char qemusig[21];
-
- if ( RDEXACT(fd, buf->magicpfns, sizeof(buf->magicpfns)) ) {
- PERROR("Error reading magic PFNs");
- return -1;
- }
-
- if ( RDEXACT(fd, &buf->reclen, sizeof(buf->reclen)) ) {
- PERROR("Error reading HVM params size");
- return -1;
- }
-
- if ( buf->reclen > buf->hvmbufsize ) {
- if ( buf->hvmbuf) {
- tmp = realloc(buf->hvmbuf, buf->reclen);
- if ( tmp ) {
- buf->hvmbuf = tmp;
- buf->hvmbufsize = buf->reclen;
- } else {
- ERROR("Error reallocating HVM param buffer");
- return -1;
- }
- } else {
- buf->hvmbuf = malloc(buf->reclen);
- if ( !buf->hvmbuf ) {
- ERROR("Error allocating HVM param buffer");
- return -1;
- }
- buf->hvmbufsize = buf->reclen;
- }
- }
-
- if ( RDEXACT(fd, buf->hvmbuf, buf->reclen) ) {
- PERROR("Error reading HVM params");
- return -1;
- }
-
- if ( RDEXACT(fd, qemusig, sizeof(qemusig)) ) {
- PERROR("Error reading QEMU signature");
- return -1;
- }
-
- /* The legacy live-migration QEMU record has no length information.
- * Short of reimplementing the QEMU parser, we're forced to just read
- * until EOF.
- *
- * We get around this by sending different signatures for the new
- * live-migration QEMU record and for Remus, each of which includes
- * a length prefix.
- */
- if ( !memcmp(qemusig, "QemuDeviceModelRecord", sizeof(qemusig)) )
- return compat_buffer_qemu(xch, ctx, fd, buf);
- else if ( !memcmp(qemusig, "DeviceModelRecord0002", sizeof(qemusig)) ||
- !memcmp(qemusig, "RemusDeviceModelState", sizeof(qemusig)) )
- return buffer_qemu(xch, ctx, fd, buf);
-
- qemusig[20] = '\0';
- ERROR("Invalid QEMU signature: %s", qemusig);
- return -1;
-}
-
-static int buffer_tail_pv(xc_interface *xch, struct restore_ctx *ctx,
- struct tailbuf_pv *buf, int fd,
- unsigned int max_vcpu_id, uint64_t *vcpumap,
- int ext_vcpucontext,
- uint32_t vcpuextstate_size)
-{
- unsigned int i;
- size_t pfnlen, vcpulen;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- /* TODO: handle changing pfntab and vcpu counts */
- /* PFN tab */
- if ( RDEXACT(fd, &buf->pfncount, sizeof(buf->pfncount)) ||
- (buf->pfncount > (1U << 28)) ) /* up to 1TB of address space */
- {
- PERROR("Error when reading pfn count");
- return -1;
- }
- pfnlen = sizeof(unsigned long) * buf->pfncount;
- if ( !(buf->pfntab) ) {
- if ( !(buf->pfntab = malloc(pfnlen)) ) {
- ERROR("Error allocating PFN tail buffer");
- return -1;
- }
- }
- // DPRINTF("Reading PFN tab: %d bytes\n", pfnlen);
- if ( RDEXACT(fd, buf->pfntab, pfnlen) ) {
- PERROR("Error when reading pfntab");
- goto free_pfntab;
- }
-
- /* VCPU contexts */
- buf->vcpucount = 0;
- for (i = 0; i <= max_vcpu_id; i++) {
- // DPRINTF("vcpumap: %llx, cpu: %d, bit: %llu\n", vcpumap[i/64], i, (vcpumap[i/64] & (1ULL << (i%64))));
- if ( (!(vcpumap[i/64] & (1ULL << (i%64)))) )
- continue;
- buf->vcpucount++;
- }
- // DPRINTF("VCPU count: %d\n", buf->vcpucount);
- vcpulen = ((dinfo->guest_width == 8) ? sizeof(vcpu_guest_context_x86_64_t)
- : sizeof(vcpu_guest_context_x86_32_t)) * buf->vcpucount;
- if ( ext_vcpucontext )
- vcpulen += 128 * buf->vcpucount;
- vcpulen += vcpuextstate_size * buf->vcpucount;
-
- if ( !(buf->vcpubuf) ) {
- if ( !(buf->vcpubuf = malloc(vcpulen)) ) {
- ERROR("Error allocating VCPU ctxt tail buffer");
- goto free_pfntab;
- }
- }
- // DPRINTF("Reading VCPUS: %d bytes\n", vcpulen);
- if ( RDEXACT(fd, buf->vcpubuf, vcpulen) ) {
- PERROR("Error when reading ctxt");
- goto free_vcpus;
- }
-
- /* load shared_info_page */
- // DPRINTF("Reading shared info: %lu bytes\n", PAGE_SIZE);
- if ( RDEXACT(fd, buf->shared_info_page, PAGE_SIZE) ) {
- PERROR("Error when reading shared info page");
- goto free_vcpus;
- }
-
- return 0;
-
- free_vcpus:
- if (buf->vcpubuf) {
- free (buf->vcpubuf);
- buf->vcpubuf = NULL;
- }
- free_pfntab:
- if (buf->pfntab) {
- free (buf->pfntab);
- buf->pfntab = NULL;
- }
-
- return -1;
-}
-
-static int buffer_tail(xc_interface *xch, struct restore_ctx *ctx,
- tailbuf_t *buf, int fd, unsigned int max_vcpu_id,
- uint64_t *vcpumap, int ext_vcpucontext,
- uint32_t vcpuextstate_size)
-{
- if ( buf->ishvm )
- return buffer_tail_hvm(xch, ctx, &buf->u.hvm, fd, max_vcpu_id, vcpumap,
- ext_vcpucontext, vcpuextstate_size);
- else
- return buffer_tail_pv(xch, ctx, &buf->u.pv, fd, max_vcpu_id, vcpumap,
- ext_vcpucontext, vcpuextstate_size);
-}
-
-static void tailbuf_free_hvm(struct tailbuf_hvm *buf)
-{
- if ( buf->hvmbuf ) {
- free(buf->hvmbuf);
- buf->hvmbuf = NULL;
- }
- if ( buf->qemubuf ) {
- free(buf->qemubuf);
- buf->qemubuf = NULL;
- }
-}
-
-static void tailbuf_free_pv(struct tailbuf_pv *buf)
-{
- if ( buf->vcpubuf ) {
- free(buf->vcpubuf);
- buf->vcpubuf = NULL;
- }
- if ( buf->pfntab ) {
- free(buf->pfntab);
- buf->pfntab = NULL;
- }
-}
-
-static void tailbuf_free(tailbuf_t *buf)
-{
- if ( buf->ishvm )
- tailbuf_free_hvm(&buf->u.hvm);
- else
- tailbuf_free_pv(&buf->u.pv);
-}
-
-struct toolstack_data_t {
- uint8_t *data;
- uint32_t len;
-};
-
-typedef struct {
- void* pages;
- /* pages is of length nr_physpages, pfn_types is of length nr_pages */
- unsigned int nr_physpages, nr_pages;
-
- /* checkpoint compression state */
- int compressing;
- unsigned long compbuf_pos, compbuf_size;
-
- /* Types of the pfns in the current region */
- unsigned long* pfn_types;
-
- int verify;
-
- int new_ctxt_format;
- int max_vcpu_id;
- uint64_t vcpumap[XC_SR_MAX_VCPUS/64];
- uint64_t identpt;
- uint64_t paging_ring_pfn;
- uint64_t access_ring_pfn;
- uint64_t sharing_ring_pfn;
- uint64_t vm86_tss;
- uint64_t console_pfn;
- uint64_t acpi_ioport_location;
- uint64_t viridian;
- uint64_t vm_generationid_addr;
- uint64_t ioreq_server_pfn;
- uint64_t nr_ioreq_server_pages;
-
- struct toolstack_data_t tdata;
-} pagebuf_t;
-
-static int pagebuf_init(pagebuf_t* buf)
-{
- memset(buf, 0, sizeof(*buf));
- return 0;
-}
-
-static void pagebuf_free(pagebuf_t* buf)
-{
- if (buf->tdata.data != NULL) {
- free(buf->tdata.data);
- buf->tdata.data = NULL;
- }
- if (buf->pages) {
- free(buf->pages);
- buf->pages = NULL;
- }
- if(buf->pfn_types) {
- free(buf->pfn_types);
- buf->pfn_types = NULL;
- }
-}
-
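-/*
-** Returns a positive value while more chunks follow, 0 once the
-** end-of-stream marker (a zero batch size) is seen, or -1 on error;
-** pagebuf_get() below loops until one of the latter two.
-*/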
-static int pagebuf_get_one(xc_interface *xch, struct restore_ctx *ctx,
- pagebuf_t* buf, int fd, uint32_t dom)
-{
- int count, countpages, oldcount, i;
- void* ptmp;
- unsigned long compbuf_size;
-
- if ( RDEXACT(fd, &count, sizeof(count)) )
- {
- PERROR("Error when reading batch size");
- return -1;
- }
-
- // DPRINTF("reading batch of %d pages\n", count);
-
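- /* A positive count is a batch of that many pages; the negative
- * XC_SAVE_ID_* values below are out-of-band control chunks, each of
- * which consumes its payload and then tail-calls pagebuf_get_one()
- * for the next chunk. */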
- switch ( count )
- {
- case 0:
- // DPRINTF("Last batch read\n");
- return 0;
-
- case XC_SAVE_ID_ENABLE_VERIFY_MODE:
- DPRINTF("Entering page verify mode\n");
- buf->verify = 1;
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_VCPU_INFO:
- buf->new_ctxt_format = 1;
- if ( RDEXACT(fd, &buf->max_vcpu_id, sizeof(buf->max_vcpu_id)) ||
- buf->max_vcpu_id >= XC_SR_MAX_VCPUS ||
- RDEXACT(fd, buf->vcpumap, vcpumap_sz(buf->max_vcpu_id)) ) {
- PERROR("Error when reading max_vcpu_id");
- return -1;
- }
- // DPRINTF("Max VCPU ID: %d, vcpumap: %llx\n", buf->max_vcpu_id, buf->vcpumap[0]);
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_HVM_IDENT_PT:
- /* Skip padding 4 bytes then read the EPT identity PT location. */
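- /* Both reads target the same 64-bit field: the first 4-byte read
- * merely consumes the padding, the second stores the real value.
- * The other 64-bit chunks below use the same idiom. */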
- if ( RDEXACT(fd, &buf->identpt, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->identpt, sizeof(uint64_t)) )
- {
- PERROR("error read the address of the EPT identity map");
- return -1;
- }
- // DPRINTF("EPT identity map address: %llx\n", buf->identpt);
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_HVM_PAGING_RING_PFN:
- /* Skip padding 4 bytes then read the paging ring location. */
- if ( RDEXACT(fd, &buf->paging_ring_pfn, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->paging_ring_pfn, sizeof(uint64_t)) )
- {
- PERROR("error read the paging ring pfn");
- return -1;
- }
- // DPRINTF("paging ring pfn address: %llx\n", buf->paging_ring_pfn);
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_HVM_ACCESS_RING_PFN:
- /* Skip padding 4 bytes then read the mem access ring location. */
- if ( RDEXACT(fd, &buf->access_ring_pfn, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->access_ring_pfn, sizeof(uint64_t)) )
- {
- PERROR("error read the access ring pfn");
- return -1;
- }
- // DPRINTF("access ring pfn address: %llx\n", buf->access_ring_pfn);
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_HVM_SHARING_RING_PFN:
- /* Skip padding 4 bytes then read the sharing ring location. */
- if ( RDEXACT(fd, &buf->sharing_ring_pfn, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->sharing_ring_pfn, sizeof(uint64_t)) )
- {
- PERROR("error read the sharing ring pfn");
- return -1;
- }
- // DPRINTF("sharing ring pfn address: %llx\n", buf->sharing_ring_pfn);
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_HVM_VM86_TSS:
- /* Skip padding 4 bytes then read the vm86 TSS location. */
- if ( RDEXACT(fd, &buf->vm86_tss, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->vm86_tss, sizeof(uint64_t)) )
- {
- PERROR("error read the address of the vm86 TSS");
- return -1;
- }
- // DPRINTF("VM86 TSS location: %llx\n", buf->vm86_tss);
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_TMEM:
- DPRINTF("xc_domain_restore start tmem\n");
- if ( xc_tmem_restore(xch, dom, fd) ) {
- PERROR("error reading/restoring tmem");
- return -1;
- }
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_TMEM_EXTRA:
- if ( xc_tmem_restore_extra(xch, dom, fd) ) {
- PERROR("error reading/restoring tmem extra");
- return -1;
- }
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_TSC_INFO:
- {
- uint32_t tsc_mode, khz, incarn;
- uint64_t nsec;
- if ( RDEXACT(fd, &tsc_mode, sizeof(uint32_t)) ||
- RDEXACT(fd, &nsec, sizeof(uint64_t)) ||
- RDEXACT(fd, &khz, sizeof(uint32_t)) ||
- RDEXACT(fd, &incarn, sizeof(uint32_t)) ||
- xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) {
- PERROR("error reading/restoring tsc info");
- return -1;
- }
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
- }
-
- case XC_SAVE_ID_HVM_CONSOLE_PFN :
- /* Skip padding 4 bytes then read the console pfn location. */
- if ( RDEXACT(fd, &buf->console_pfn, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->console_pfn, sizeof(uint64_t)) )
- {
- PERROR("error read the address of the console pfn");
- return -1;
- }
- // DPRINTF("console pfn location: %llx\n", buf->console_pfn);
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_LAST_CHECKPOINT:
- ctx->last_checkpoint = 1;
- // DPRINTF("last checkpoint indication received");
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION:
- /* Skip padding 4 bytes then read the acpi ioport location. */
- if ( RDEXACT(fd, &buf->acpi_ioport_location, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->acpi_ioport_location, sizeof(uint64_t)) )
- {
- PERROR("error read the acpi ioport location");
- return -1;
- }
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_HVM_VIRIDIAN:
- /* Skip padding 4 bytes then read the viridian enlightenments. */
- if ( RDEXACT(fd, &buf->viridian, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->viridian, sizeof(uint64_t)) )
- {
- PERROR("error reading the viridian enlightenments");
- return -1;
- }
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_TOOLSTACK:
- {
- if ( RDEXACT(fd, &buf->tdata.len, sizeof(buf->tdata.len)) )
- {
- PERROR("error read toolstack id size");
- return -1;
- }
- buf->tdata.data = (uint8_t*) realloc(buf->tdata.data, buf->tdata.len);
- if ( buf->tdata.data == NULL )
- {
- PERROR("error memory allocation");
- return -1;
- }
- if ( RDEXACT(fd, buf->tdata.data, buf->tdata.len) )
- {
- PERROR("error read toolstack id");
- return -1;
- }
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
- }
-
- case XC_SAVE_ID_ENABLE_COMPRESSION:
- /* We cannot set the compression flag directly in the pagebuf
- * structure, since this pagebuf still holds uncompressed pages that
- * have yet to be applied. We enable the compression field in the
- * pagebuf structure only after receiving the first tailbuf.
- */
- ctx->compressing = 1;
- // DPRINTF("compression flag received");
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_COMPRESSED_DATA:
-
- /* read the length of compressed chunk coming in */
- if ( RDEXACT(fd, &compbuf_size, sizeof(unsigned long)) )
- {
- PERROR("Error when reading compbuf_size");
- return -1;
- }
- if (!compbuf_size) return 1;
-
- buf->compbuf_size += compbuf_size;
- if (!(ptmp = realloc(buf->pages, buf->compbuf_size))) {
- ERROR("Could not (re)allocate compression buffer");
- return -1;
- }
- buf->pages = ptmp;
-
- if ( RDEXACT(fd, buf->pages + (buf->compbuf_size - compbuf_size),
- compbuf_size) ) {
- PERROR("Error when reading compression buffer");
- return -1;
- }
- return compbuf_size;
-
- case XC_SAVE_ID_HVM_GENERATION_ID_ADDR:
- /* Skip padding 4 bytes then read the generation id buffer location. */
- if ( RDEXACT(fd, &buf->vm_generationid_addr, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->vm_generationid_addr, sizeof(uint64_t)) )
- {
- PERROR("error read the generation id buffer location");
- return -1;
- }
- DPRINTF("read generation id buffer address");
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_HVM_IOREQ_SERVER_PFN:
- /* Skip padding 4 bytes then read the ioreq server gmfn base. */
- if ( RDEXACT(fd, &buf->ioreq_server_pfn, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->ioreq_server_pfn, sizeof(uint64_t)) )
- {
- PERROR("error read the ioreq server gmfn base");
- return -1;
- }
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- case XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES:
- /* Skip padding 4 bytes then read the ioreq server gmfn count. */
- if ( RDEXACT(fd, &buf->nr_ioreq_server_pages, sizeof(uint32_t)) ||
- RDEXACT(fd, &buf->nr_ioreq_server_pages, sizeof(uint64_t)) )
- {
- PERROR("error read the ioreq server gmfn count");
- return -1;
- }
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- default:
- if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
- ERROR("Max batch size exceeded (%d). Giving up.", count);
- errno = EMSGSIZE;
- return -1;
- }
- break;
- }
-
- oldcount = buf->nr_pages;
- buf->nr_pages += count;
- if (!buf->pfn_types) {
- if (!(buf->pfn_types = malloc(buf->nr_pages * sizeof(*(buf->pfn_types))))) {
- ERROR("Could not allocate PFN type buffer");
- return -1;
- }
- } else {
- if (!(ptmp = realloc(buf->pfn_types, buf->nr_pages * sizeof(*(buf->pfn_types))))) {
- ERROR("Could not reallocate PFN type buffer");
- return -1;
- }
- buf->pfn_types = ptmp;
- }
- if ( RDEXACT(fd, buf->pfn_types + oldcount, count * sizeof(*(buf->pfn_types)))) {
- PERROR("Error when reading region pfn types");
- return -1;
- }
-
- countpages = count;
- for (i = oldcount; i < buf->nr_pages; ++i)
- {
- unsigned long pagetype;
-
- pagetype = buf->pfn_types[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ||
- pagetype == XEN_DOMCTL_PFINFO_BROKEN ||
- pagetype == XEN_DOMCTL_PFINFO_XALLOC )
- --countpages;
- }
-
- if (!countpages)
- return count;
-
- /* If Remus Checkpoint Compression is turned on, we will only be
- * receiving the pfn lists now. The compressed pages will come in later,
- * following a <XC_SAVE_ID_COMPRESSED_DATA, compressedChunkSize> tuple.
- */
- if (buf->compressing)
- return pagebuf_get_one(xch, ctx, buf, fd, dom);
-
- oldcount = buf->nr_physpages;
- buf->nr_physpages += countpages;
- if (!buf->pages) {
- if (!(buf->pages = malloc(buf->nr_physpages * PAGE_SIZE))) {
- ERROR("Could not allocate page buffer");
- return -1;
- }
- } else {
- if (!(ptmp = realloc(buf->pages, buf->nr_physpages * PAGE_SIZE))) {
- ERROR("Could not reallocate page buffer");
- return -1;
- }
- buf->pages = ptmp;
- }
- if ( RDEXACT(fd, buf->pages + oldcount * PAGE_SIZE, countpages * PAGE_SIZE) ) {
- PERROR("Error when reading pages");
- return -1;
- }
-
- return count;
-}
-
-static int pagebuf_get(xc_interface *xch, struct restore_ctx *ctx,
- pagebuf_t* buf, int fd, uint32_t dom)
-{
- int rc;
-
- buf->nr_physpages = buf->nr_pages = 0;
- buf->compbuf_pos = buf->compbuf_size = 0;
-
- do {
- rc = pagebuf_get_one(xch, ctx, buf, fd, dom);
- } while (rc > 0);
-
- if (rc < 0)
- pagebuf_free(buf);
-
- return rc;
-}
-
-static int apply_batch(xc_interface *xch, uint32_t dom, struct restore_ctx *ctx,
- xen_pfn_t* region_mfn, unsigned long* pfn_type, int pae_extended_cr3,
- struct xc_mmu* mmu,
- pagebuf_t* pagebuf, int curbatch, int *invalid_pages)
-{
- int i, j, curpage, nr_mfns;
- int k, scount;
- unsigned long superpage_start=INVALID_P2M_ENTRY;
- /* used by debug verify code */
- unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
- /* Our mapping of the current region (batch) */
- char *region_base;
- /* A temporary mapping, and a copy, of one frame of guest memory. */
- unsigned long *page = NULL;
- int nraces = 0;
- struct domain_info_context *dinfo = &ctx->dinfo;
- int* pfn_err = NULL;
- int rc = -1;
- int local_invalid_pages = 0;
- /* We have handled curbatch pages before this batch, and there are
- * *invalid_pages pages that are not in pagebuf->pages. So the first
- * page of this batch is page (curbatch - *invalid_pages).
- */
- int first_page = curbatch - *invalid_pages;
-
- unsigned long mfn, pfn, pagetype;
-
- j = pagebuf->nr_pages - curbatch;
- if (j > MAX_BATCH_SIZE)
- j = MAX_BATCH_SIZE;
-
- /* First pass for this batch: work out how much memory to alloc, and detect superpages */
- nr_mfns = scount = 0;
- for ( i = 0; i < j; i++ )
- {
- unsigned long pfn, pagetype;
- pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- /* For allocation purposes, treat XEN_DOMCTL_PFINFO_XALLOC as a normal page */
- if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) &&
- (ctx->p2m[pfn] == INVALID_P2M_ENTRY) )
- {
- /* Have a live PFN which hasn't had an MFN allocated */
-
- /* Logic if we're in the middle of detecting a candidate superpage */
- if ( superpage_start != INVALID_P2M_ENTRY )
- {
- /* Is this the next expected continuation? */
- if ( pfn == superpage_start + scount )
- {
- if ( !ctx->superpages )
- {
- ERROR("Unexpexted codepath with no superpages");
- return -1;
- }
-
- scount++;
-
- /* If we've found a whole superpage, allocate it and update p2m */
- if ( scount == SUPERPAGE_NR_PFNS )
- {
- unsigned long supermfn;
-
-
- supermfn=superpage_start;
- if ( xc_domain_populate_physmap_exact(xch, dom, 1,
- SUPERPAGE_PFN_SHIFT, 0, &supermfn) != 0 )
- {
- DPRINTF("No 2M page available for pfn 0x%lx, fall back to 4K page.\n",
- superpage_start);
- /* If we're falling back from a failed allocation, subtract one
- * from count, since the last page == pfn, which will behandled
- * anyway. */
- scount--;
- goto fallback;
- }
-
- DPRINTF("Mapping superpage (%d) pfn %lx, mfn %lx\n", scount, superpage_start, supermfn);
- for (k=0; k<scount; k++)
- {
- /* We just allocated a new mfn above; update p2m */
- ctx->p2m[superpage_start+k] = supermfn+k;
- ctx->nr_pfns++;
- /* region_mfn[] will be set below */
- }
- superpage_start=INVALID_P2M_ENTRY;
- scount=0;
- }
- continue;
- }
-
- fallback:
- DPRINTF("Falling back %d pages pfn %lx\n", scount, superpage_start);
- for (k=0; k<scount; k++)
- {
- ctx->p2m_batch[nr_mfns++] = superpage_start+k;
- ctx->p2m[superpage_start+k]--;
- }
- superpage_start = INVALID_P2M_ENTRY;
- scount=0;
- }
-
- /* Are we ready to start a new superpage candidate? */
- if ( ctx->hvm && ctx->superpages && SUPER_PAGE_START(pfn) )
- {
- superpage_start=pfn;
- scount++;
- }
- else
- {
- /* Add the current pfn to the p2m_batch */
- ctx->p2m_batch[nr_mfns++] = pfn;
- ctx->p2m[pfn]--;
- }
- }
- }
-
- /* Clean up any partial superpage candidates */
- if ( superpage_start != INVALID_P2M_ENTRY )
- {
- DPRINTF("Falling back %d pages pfn %lx\n", scount, superpage_start);
- for (k=0; k<scount; k++)
- {
- ctx->p2m_batch[nr_mfns++] = superpage_start+k;
- ctx->p2m[superpage_start+k]--;
- }
- superpage_start = INVALID_P2M_ENTRY;
- }
-
- /* Now allocate a bunch of mfns for this batch */
- if ( nr_mfns )
- {
- DPRINTF("Mapping order 0, %d; first pfn %lx\n", nr_mfns, ctx->p2m_batch[0]);
-
- if (!ctx->hvm && ctx->superpages)
- rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
- else
- rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
- ctx->p2m_batch);
-
- if (rc)
- {
- ERROR("Failed to allocate memory for batch.!\n");
- errno = ENOMEM;
- return -1;
- }
- }
-
- /* Second pass for this batch: update p2m[] and region_mfn[] */
- nr_mfns = 0;
- for ( i = 0; i < j; i++ )
- {
- unsigned long pfn, pagetype;
- pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- if ( pagetype != XEN_DOMCTL_PFINFO_XTAB
- && ctx->p2m[pfn] == (INVALID_P2M_ENTRY-1) )
- {
- /* We just allocated a new mfn above; update p2m */
- ctx->p2m[pfn] = ctx->p2m_batch[nr_mfns++];
- ctx->nr_pfns++;
- }
-
- /* setup region_mfn[] for batch map, if necessary.
- * For HVM guests, this interface takes PFNs, not MFNs */
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB
- || pagetype == XEN_DOMCTL_PFINFO_XALLOC )
- region_mfn[i] = ~0UL; /* map will fail but we don't care */
- else
- region_mfn[i] = ctx->hvm ? pfn : ctx->p2m[pfn];
- }
-
- /* Map relevant mfns */
- pfn_err = calloc(j, sizeof(*pfn_err));
- if ( pfn_err == NULL )
- {
- PERROR("allocation for pfn_err failed");
- return -1;
- }
- region_base = xc_map_foreign_bulk(
- xch, dom, PROT_WRITE, region_mfn, pfn_err, j);
-
- if ( region_base == NULL )
- {
- PERROR("map batch failed");
- free(pfn_err);
- return -1;
- }
-
- for ( i = 0, curpage = -1; i < j; i++ )
- {
- pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB
- || pagetype == XEN_DOMCTL_PFINFO_XALLOC)
- {
- local_invalid_pages++;
- /* a bogus/unmapped/allocate-only page: skip it */
- continue;
- }
-
- if ( pagetype == XEN_DOMCTL_PFINFO_BROKEN )
- {
- if ( xc_set_broken_page_p2m(xch, dom, pfn) )
- {
- ERROR("Set p2m for broken page failed, "
- "dom=%d, pfn=%lx\n", dom, pfn);
- goto err_mapped;
- }
-
- local_invalid_pages++;
- continue;
- }
-
- if (pfn_err[i])
- {
- ERROR("unexpected PFN mapping failure pfn %lx map_mfn %lx p2m_mfn %lx",
- pfn, region_mfn[i], ctx->p2m[pfn]);
- goto err_mapped;
- }
-
- ++curpage;
-
- if ( pfn > dinfo->p2m_size )
- {
- ERROR("pfn out of range");
- goto err_mapped;
- }
-
- pfn_type[pfn] = pagetype;
-
- mfn = ctx->p2m[pfn];
-
- /* In verify mode, we use a copy; otherwise we work in place */
- page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE);
-
- /* Remus - page decompression */
- if (pagebuf->compressing)
- {
- if (xc_compression_uncompress_page(xch, pagebuf->pages,
- pagebuf->compbuf_size,
- &pagebuf->compbuf_pos,
- (char *)page))
- {
- ERROR("Failed to uncompress page (pfn=%lx)\n", pfn);
- goto err_mapped;
- }
- }
- else
- memcpy(page, pagebuf->pages + (first_page + curpage) * PAGE_SIZE,
- PAGE_SIZE);
-
- pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
- (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /*
- ** A page table page - need to 'uncanonicalize' it, i.e.
- ** replace all the references to pfns with the corresponding
- ** mfns for the new domain.
- **
- ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
- ** so we may need to update the p2m after the main loop.
- ** Hence we defer uncanonicalization of L1s until then.
- */
- if ((ctx->pt_levels != 3) ||
- pae_extended_cr3 ||
- (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
-
- if (!uncanonicalize_pagetable(xch, dom, ctx, page)) {
- /*
- ** Failing to uncanonicalize a page table can be ok
- ** under live migration since the pages type may have
- ** changed by now (and we'll get an update later).
- */
- DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
- pagetype >> 28, pfn, mfn);
- nraces++;
- continue;
- }
- }
- }
- else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
- {
- ERROR("Bogus page type %lx page table is out of range: "
- "i=%d p2m_size=%lu", pagetype, i, dinfo->p2m_size);
- goto err_mapped;
- }
-
- if ( pagebuf->verify )
- {
- int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
- if ( res )
- {
- int v;
-
- DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
- "actualcs=%08lx\n", pfn, pfn_type[pfn],
- csum_page(region_base + i * PAGE_SIZE),
- csum_page(buf));
-
- for ( v = 0; v < 4; v++ )
- {
- unsigned long *p = (unsigned long *)
- (region_base + i*PAGE_SIZE);
- if ( buf[v] != p[v] )
- DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
- }
- }
- }
-
- if ( !ctx->hvm &&
- xc_add_mmu_update(xch, mmu,
- (((unsigned long long)mfn) << PAGE_SHIFT)
- | MMU_MACHPHYS_UPDATE, pfn) )
- {
- PERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
- goto err_mapped;
- }
- } /* end of 'batch' for loop */
-
- rc = nraces;
- *invalid_pages += local_invalid_pages;
-
- err_mapped:
- munmap(region_base, j*PAGE_SIZE);
- free(pfn_err);
-
- return rc;
-}
-
-int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
- unsigned int store_evtchn, unsigned long *store_mfn,
- domid_t store_domid, unsigned int console_evtchn,
- unsigned long *console_mfn, domid_t console_domid,
- unsigned int hvm, unsigned int pae, int superpages,
- int checkpointed_stream,
- struct restore_callbacks *callbacks)
-{
- DECLARE_DOMCTL;
- xc_dominfo_t info;
- int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
- uint32_t vcpuextstate_size = 0;
- unsigned long mfn, pfn;
- int nraces = 0;
-
- /* The new domain's shared-info frame number. */
- unsigned long shared_info_frame;
- unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
- shared_info_any_t *old_shared_info =
- (shared_info_any_t *)shared_info_page;
- shared_info_any_t *new_shared_info;
-
- /* A copy of the CPU context of the guest. */
- DECLARE_HYPERCALL_BUFFER(vcpu_guest_context_any_t, ctxt);
-
- /* A copy of the CPU eXtended States of the guest. */
- DECLARE_HYPERCALL_BUFFER(void, buffer);
-
- /* A table containing the type of each PFN (/not/ MFN!). */
- unsigned long *pfn_type = NULL;
-
- /* A table of MFNs to map in the current region */
- xen_pfn_t *region_mfn = NULL;
-
- /* A copy of the pfn-to-mfn table frame list. */
- xen_pfn_t *p2m_frame_list = NULL;
-
- /* A temporary mapping of the guest's start_info page. */
- start_info_any_t *start_info;
-
- /* Our mapping of the current region (batch) */
- char *region_base;
-
- struct xc_mmu *mmu = NULL;
-
- struct mmuext_op pin[MAX_PIN_BATCH];
- unsigned int nr_pins;
-
- uint64_t vcpumap[XC_SR_MAX_VCPUS/64] = { 1ULL };
- unsigned int max_vcpu_id = 0;
- int new_ctxt_format = 0;
-
- pagebuf_t pagebuf;
- tailbuf_t tailbuf, tmptail;
- struct toolstack_data_t tdata, tdatatmp;
- void* vcpup;
- uint64_t console_pfn = 0;
-
- int orig_io_fd_flags;
-
- struct restore_ctx _ctx;
- struct restore_ctx *ctx = &_ctx;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- DPRINTF("%s: starting restore of new domid %u", __func__, dom);
-
- pagebuf_init(&pagebuf);
- memset(&tailbuf, 0, sizeof(tailbuf));
- tailbuf.ishvm = hvm;
- memset(&tdata, 0, sizeof(tdata));
-
- memset(ctx, 0, sizeof(*ctx));
-
- ctx->superpages = superpages;
- ctx->hvm = hvm;
- ctx->last_checkpoint = !checkpointed_stream;
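- /* A plain save/restore stream is a single checkpoint; a checkpointed
- * (Remus) stream keeps sending checkpoints until the sender signals
- * XC_SAVE_ID_LAST_CHECKPOINT or the channel fails. */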
-
- ctxt = xc_hypercall_buffer_alloc(xch, ctxt, sizeof(*ctxt));
-
- if ( ctxt == NULL )
- {
- PERROR("Unable to allocate VCPU ctxt buffer");
- return 1;
- }
-
-
- if ( (orig_io_fd_flags = fcntl(io_fd, F_GETFL, 0)) < 0 ) {
- PERROR("unable to read IO FD flags");
- goto out;
- }
-
- if ( RDEXACT(io_fd, &dinfo->p2m_size, sizeof(unsigned long)) )
- {
- PERROR("read: p2m_size");
- goto out;
- }
- DPRINTF("%s: p2m_size = %lx\n", __func__, dinfo->p2m_size);
-
- if ( !get_platform_info(xch, dom,
- &ctx->max_mfn, &ctx->hvirt_start, &ctx->pt_levels, &dinfo->guest_width) )
- {
- ERROR("Unable to get platform info.");
- return 1;
- }
-
- /* The *current* word size of the guest isn't very interesting; for now
- * assume the guest will be the same as we are. We'll fix that later
- * if we discover otherwise. */
- dinfo->guest_width = sizeof(unsigned long);
- ctx->pt_levels = (dinfo->guest_width == 8) ? 4 : 3;
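- /* For PV guests these defaults may be overridden below, once
- * load_p2m_frame_list() has parsed the extended-info vcpu chunk. */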
-
- if ( !hvm )
- {
- /* Load the p2m frame list, plus potential extended info chunk */
- p2m_frame_list = load_p2m_frame_list(xch, ctx,
- io_fd, &pae_extended_cr3, &ext_vcpucontext,
- &vcpuextstate_size);
-
- if ( !p2m_frame_list )
- goto out;
-
- /* Now that we know the word size, tell Xen about it */
- memset(&domctl, 0, sizeof(domctl));
- domctl.domain = dom;
- domctl.cmd = XEN_DOMCTL_set_address_size;
- domctl.u.address_size.size = dinfo->guest_width * 8;
- frc = do_domctl(xch, &domctl);
- if ( frc != 0 )
- {
- PERROR("Unable to set guest address size.");
- goto out;
- }
- }
-
- /* We want zeroed memory so use calloc rather than malloc. */
- ctx->p2m = calloc(dinfo->p2m_size, sizeof(xen_pfn_t));
- pfn_type = calloc(dinfo->p2m_size, sizeof(unsigned long));
-
- region_mfn = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
- ctx->p2m_batch = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
- if (!ctx->hvm && ctx->superpages)
- {
- ctx->p2m_saved_batch =
- malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
- if ( ctx->p2m_saved_batch == NULL )
- {
- ERROR("saved batch memory alloc failed");
- errno = ENOMEM;
- goto out;
- }
- }
-
- if ( (ctx->p2m == NULL) || (pfn_type == NULL) ||
- (region_mfn == NULL) || (ctx->p2m_batch == NULL) )
- {
- ERROR("memory alloc failed");
- errno = ENOMEM;
- goto out;
- }
-
- memset(region_mfn, 0,
- ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
- memset(ctx->p2m_batch, 0,
- ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
-
- /* Get the domain's shared-info frame. */
- if ( xc_domain_getinfo(xch, (domid_t)dom, 1, &info) != 1 )
- {
- PERROR("Could not get information on new domain");
- goto out;
- }
- shared_info_frame = info.shared_info_frame;
-
- /* Mark all PFNs as invalid; we allocate on demand */
- for ( pfn = 0; pfn < dinfo->p2m_size; pfn++ )
- ctx->p2m[pfn] = INVALID_P2M_ENTRY;
-
- mmu = xc_alloc_mmu_updates(xch, dom);
- if ( mmu == NULL )
- {
- PERROR("Could not initialise for MMU updates");
- goto out;
- }
-
- xc_report_progress_start(xch, "Reloading memory pages", dinfo->p2m_size);
-
- /*
- * Now simply read each saved frame into its new machine frame.
- * We uncanonicalise page tables as we go.
- */
-
- n = m = 0;
- loadpages:
- for ( ; ; )
- {
- int j, curbatch, invalid_pages;
-
- xc_report_progress_step(xch, n, dinfo->p2m_size);
-
- if ( !ctx->completed ) {
- pagebuf.nr_physpages = pagebuf.nr_pages = 0;
- pagebuf.compbuf_pos = pagebuf.compbuf_size = 0;
- if ( pagebuf_get_one(xch, ctx, &pagebuf, io_fd, dom) < 0 ) {
- PERROR("Error when reading batch");
- goto out;
- }
- }
- j = pagebuf.nr_pages;
-
- DBGPRINTF("batch %d\n",j);
-
- if ( j == 0 ) {
- /* catch vcpu updates */
- if (pagebuf.new_ctxt_format) {
- max_vcpu_id = pagebuf.max_vcpu_id;
- memcpy(vcpumap, pagebuf.vcpumap, vcpumap_sz(max_vcpu_id));
- }
- /* should this be deferred? does it change? */
- if ( pagebuf.identpt )
- xc_hvm_param_set(xch, dom, HVM_PARAM_IDENT_PT, pagebuf.identpt);
- if ( pagebuf.paging_ring_pfn )
- xc_hvm_param_set(xch, dom, HVM_PARAM_PAGING_RING_PFN, pagebuf.paging_ring_pfn);
- if ( pagebuf.access_ring_pfn )
- xc_hvm_param_set(xch, dom, HVM_PARAM_ACCESS_RING_PFN, pagebuf.access_ring_pfn);
- if ( pagebuf.sharing_ring_pfn )
- xc_hvm_param_set(xch, dom, HVM_PARAM_SHARING_RING_PFN, pagebuf.sharing_ring_pfn);
- if ( pagebuf.vm86_tss )
- xc_hvm_param_set(xch, dom, HVM_PARAM_VM86_TSS, pagebuf.vm86_tss);
- if ( pagebuf.console_pfn )
- console_pfn = pagebuf.console_pfn;
- if ( pagebuf.vm_generationid_addr )
- xc_hvm_param_set(xch, dom, HVM_PARAM_VM_GENERATION_ID_ADDR,
- pagebuf.vm_generationid_addr);
-
- break; /* our work here is done */
- }
-
- /* break pagebuf into batches */
- curbatch = 0;
- invalid_pages = 0;
- while ( curbatch < j ) {
- int brc;
-
- brc = apply_batch(xch, dom, ctx, region_mfn, pfn_type,
- pae_extended_cr3, mmu, &pagebuf, curbatch,
- &invalid_pages);
- if ( brc < 0 )
- goto out;
-
- nraces += brc;
-
- curbatch += MAX_BATCH_SIZE;
- }
-
- pagebuf.nr_physpages = pagebuf.nr_pages = 0;
- pagebuf.compbuf_pos = pagebuf.compbuf_size = 0;
-
- n += j; /* crude stats */
-
- /*
- * Discard cache for portion of file read so far up to last
- * page boundary every 16MB or so.
- */
- m += j;
- if ( m > MAX_PAGECACHE_USAGE )
- {
- discard_file_cache(xch, io_fd, 0 /* no flush */);
- m = 0;
- }
- }
-
- /*
- * Ensure we flush all machphys updates before potential PAE-specific
- * reallocations below.
- */
- if ( !hvm && xc_flush_mmu_updates(xch, mmu) )
- {
- PERROR("Error doing flush_mmu_updates()");
- goto out;
- }
-
- // DPRINTF("Received all pages (%d races)\n", nraces);
-
- if ( !ctx->completed ) {
-
- if ( buffer_tail(xch, ctx, &tailbuf, io_fd, max_vcpu_id, vcpumap,
- ext_vcpucontext, vcpuextstate_size) < 0 ) {
- ERROR ("error buffering image tail");
- goto out;
- }
-
- ctx->completed = 1;
-
- /*
- * If more checkpoints are expected then shift into
- * nonblocking mode for the remainder.
- */
- if ( !ctx->last_checkpoint )
- fcntl(io_fd, F_SETFL, orig_io_fd_flags | O_NONBLOCK);
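- /* With ctx->completed set, rdexact() also bounds each wait with
- * select(), so a dead sender surfaces as ETIMEDOUT rather than
- * a hang. */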
-
- /*
- * If sender had sent enable compression flag, switch to compressed
- * checkpoints mode once the first checkpoint is received.
- */
- if (ctx->compressing)
- pagebuf.compressing = 1;
- }
-
- if (pagebuf.viridian != 0)
- xc_hvm_param_set(xch, dom, HVM_PARAM_VIRIDIAN, pagebuf.viridian);
-
- /*
- * If we are migrating in from a host that does not support
- * secondary emulators then nr_ioreq_server_pages will be 0, since
- * there will be no XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES chunk in
- * the image.
- * If we are migrating from a host that does support secondary
- * emulators then the XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES chunk
- * will exist and is guaranteed to have a non-zero value. The
- * existence of that chunk also implies the existence of the
- * XC_SAVE_ID_HVM_IOREQ_SERVER_PFN chunk, which is also guaranteed
- * to have a non-zero value.
- */
- if (!pagebuf.nr_ioreq_server_pages ^ !pagebuf.ioreq_server_pfn) {
- ERROR("Inconsistent IOREQ Server settings (nr=%"PRIx64", pfn=%"PRIx64")",
- pagebuf.nr_ioreq_server_pages, pagebuf.ioreq_server_pfn);
- } else {
- if (pagebuf.nr_ioreq_server_pages != 0 &&
- pagebuf.ioreq_server_pfn != 0) {
- xc_hvm_param_set(xch, dom, HVM_PARAM_NR_IOREQ_SERVER_PAGES,
- pagebuf.nr_ioreq_server_pages);
- xc_hvm_param_set(xch, dom, HVM_PARAM_IOREQ_SERVER_PFN,
- pagebuf.ioreq_server_pfn);
- }
- }
-
- if (pagebuf.acpi_ioport_location == 1) {
- DBGPRINTF("Use new firmware ioport from the checkpoint\n");
- xc_hvm_param_set(xch, dom, HVM_PARAM_ACPI_IOPORTS_LOCATION, 1);
- } else if (pagebuf.acpi_ioport_location == 0) {
- DBGPRINTF("Use old firmware ioport from the checkpoint\n");
- } else {
- ERROR("Error, unknow acpi ioport location (%"PRId64")", pagebuf.acpi_ioport_location);
- }
-
- tdatatmp = tdata;
- tdata = pagebuf.tdata;
- pagebuf.tdata = tdatatmp;
-
- if ( ctx->last_checkpoint )
- {
- // DPRINTF("Last checkpoint, finishing\n");
- goto finish;
- }
-
- // DPRINTF("Buffered checkpoint\n");
-
- if ( pagebuf_get(xch, ctx, &pagebuf, io_fd, dom) ) {
- PERROR("error when buffering batch, finishing");
- /*
- * Remus: discard the current incomplete checkpoint and restore
- * backup from the last complete checkpoint.
- */
- goto finish;
- }
- memset(&tmptail, 0, sizeof(tmptail));
- tmptail.ishvm = hvm;
- if ( buffer_tail(xch, ctx, &tmptail, io_fd, max_vcpu_id, vcpumap,
- ext_vcpucontext, vcpuextstate_size) < 0 ) {
- ERROR ("error buffering image tail, finishing");
- /*
- * Remus: discard the current incomplete checkpoint and restore
- * backup from the last complete checkpoint.
- */
- goto finish;
- }
- tailbuf_free(&tailbuf);
- memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
-
- goto loadpages;
-
- /* With Remus: restore from last complete checkpoint */
- finish:
- if ( hvm )
- goto finish_hvm;
-
- if ( (ctx->pt_levels == 3) && !pae_extended_cr3 )
- {
- /*
- ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
- ** is a little awkward and involves (a) finding all such PGDs and
- ** replacing them with 'lowmem' versions; (b) updating the p2m[]
- ** with the new info; and (c) uncanonicalizing all the L1s using the
- ** (potentially updated) p2m[].
- **
- ** This is relatively slow (and currently involves two passes through
- ** the pfn_type[] array), but at least seems to be correct. May wish
- ** to consider more complex approaches to optimize this later.
- */
-
- int j, k;
-
- /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
- for ( i = 0; i < dinfo->p2m_size; i++ )
- {
- if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
- XEN_DOMCTL_PFINFO_L3TAB) &&
- (ctx->p2m[i] > 0xfffffUL) )
- {
- unsigned long new_mfn;
- uint64_t l3ptes[4];
- uint64_t *l3tab;
-
- l3tab = (uint64_t *)
- xc_map_foreign_range(xch, dom, PAGE_SIZE,
- PROT_READ, ctx->p2m[i]);
- if ( l3tab == NULL )
- {
- PERROR("xc_map_foreign_range failed (for l3tab)");
- goto out;
- }
-
- for ( j = 0; j < 4; j++ )
- l3ptes[j] = l3tab[j];
-
- munmap(l3tab, PAGE_SIZE);
-
- new_mfn = xc_make_page_below_4G(xch, dom, ctx->p2m[i]);
- if ( !new_mfn )
- {
- PERROR("Couldn't get a page below 4GB :-(");
- goto out;
- }
-
- ctx->p2m[i] = new_mfn;
- if ( xc_add_mmu_update(xch, mmu,
- (((unsigned long long)new_mfn)
- << PAGE_SHIFT) |
- MMU_MACHPHYS_UPDATE, i) )
- {
- PERROR("Couldn't m2p on PAE root pgdir");
- goto out;
- }
-
- l3tab = (uint64_t *)
- xc_map_foreign_range(xch, dom, PAGE_SIZE,
- PROT_READ | PROT_WRITE, ctx->p2m[i]);
- if ( l3tab == NULL )
- {
- PERROR("xc_map_foreign_range failed (for l3tab, 2nd)");
- goto out;
- }
-
- for ( j = 0; j < 4; j++ )
- l3tab[j] = l3ptes[j];
-
- munmap(l3tab, PAGE_SIZE);
- }
- }
-
- /* Second pass: find all L1TABs and uncanonicalize them */
- j = 0;
-
- for ( i = 0; i < dinfo->p2m_size; i++ )
- {
- if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
- XEN_DOMCTL_PFINFO_L1TAB) )
- {
- region_mfn[j] = ctx->p2m[i];
- j++;
- }
-
- if ( (i == (dinfo->p2m_size-1)) || (j == MAX_BATCH_SIZE) )
- {
- region_base = xc_map_foreign_pages(
- xch, dom, PROT_READ | PROT_WRITE, region_mfn, j);
- if ( region_base == NULL )
- {
- PERROR("map batch failed");
- goto out;
- }
-
- for ( k = 0; k < j; k++ )
- {
- if ( !uncanonicalize_pagetable(
- xch, dom, ctx,
- region_base + k*PAGE_SIZE) )
- {
- ERROR("failed uncanonicalize pt!");
- goto out;
- }
- }
-
- munmap(region_base, j*PAGE_SIZE);
- j = 0;
- }
- }
-
- if ( xc_flush_mmu_updates(xch, mmu) )
- {
- PERROR("Error doing xc_flush_mmu_updates()");
- goto out;
- }
- }
-
- /*
- * Pin page tables. Do this after writing to them as otherwise Xen
- * will barf when doing the type-checking.
- */
- nr_pins = 0;
- for ( i = 0; i < dinfo->p2m_size; i++ )
- {
- if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
- continue;
-
- switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
- {
- case XEN_DOMCTL_PFINFO_L1TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
- break;
-
- case XEN_DOMCTL_PFINFO_L2TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
- break;
-
- case XEN_DOMCTL_PFINFO_L3TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
- break;
-
- case XEN_DOMCTL_PFINFO_L4TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
- break;
-
- default:
- continue;
- }
-
- pin[nr_pins].arg1.mfn = ctx->p2m[i];
- nr_pins++;
-
- /* Batch full? Then flush. */
- if ( nr_pins == MAX_PIN_BATCH )
- {
- if ( xc_mmuext_op(xch, pin, nr_pins, dom) < 0 )
- {
- PERROR("Failed to pin batch of %d page tables", nr_pins);
- goto out;
- }
- nr_pins = 0;
- }
- }
-
- /* Flush final partial batch. */
- if ( (nr_pins != 0) && (xc_mmuext_op(xch, pin, nr_pins, dom) < 0) )
- {
- PERROR("Failed to pin batch of %d page tables", nr_pins);
- goto out;
- }
-
- DPRINTF("Memory reloaded (%ld pages)\n", ctx->nr_pfns);
-
- /* Get the list of PFNs that are not in the pseudo-phys map */
- {
- int nr_frees = 0;
-
- for ( i = 0; i < tailbuf.u.pv.pfncount; i++ )
- {
- unsigned long pfn = tailbuf.u.pv.pfntab[i];
-
- if ( ctx->p2m[pfn] != INVALID_P2M_ENTRY )
- {
- /* pfn is not in physmap now, but was at some point during
- the save/migration process - need to free it */
- tailbuf.u.pv.pfntab[nr_frees++] = ctx->p2m[pfn];
- ctx->p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
- }
- }
-
- if ( nr_frees > 0 )
- {
- if ( (frc = xc_domain_decrease_reservation(xch, dom, nr_frees, 0, tailbuf.u.pv.pfntab)) != nr_frees )
- {
- PERROR("Could not decrease reservation : %d", frc);
- goto out;
- }
- else
- DPRINTF("Decreased reservation by %d pages\n", tailbuf.u.pv.pfncount);
- }
- }
-
- vcpup = tailbuf.u.pv.vcpubuf;
- for ( i = 0; i <= max_vcpu_id; i++ )
- {
- if ( !(vcpumap[i/64] & (1ULL << (i%64))) )
- continue;
-
- memcpy(ctxt, vcpup, ((dinfo->guest_width == 8) ? sizeof(ctxt->x64)
- : sizeof(ctxt->x32)));
- vcpup += (dinfo->guest_width == 8) ? sizeof(ctxt->x64) : sizeof(ctxt->x32);
-
- DPRINTF("read VCPU %d\n", i);
-
- if ( !new_ctxt_format )
- SET_FIELD(ctxt, flags,
- GET_FIELD(ctxt, flags, dinfo->guest_width) | VGCF_online,
- dinfo->guest_width);
-
- if ( i == 0 )
- {
- /*
- * Uncanonicalise the start info frame number and poke in
- * updated values into the start info itself.
- *
- * The start info MFN is the 3rd argument to the
- * HYPERVISOR_sched_op hypercall when op==SCHEDOP_shutdown
- * and reason==SHUTDOWN_suspend, it is canonicalised in
- * xc_domain_save and therefore the PFN is found in the
- * edx register.
- */
- pfn = GET_FIELD(ctxt, user_regs.edx, dinfo->guest_width);
- if ( (pfn >= dinfo->p2m_size) ||
- (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
- {
- ERROR("Suspend record frame number is bad");
- goto out;
- }
- mfn = ctx->p2m[pfn];
- SET_FIELD(ctxt, user_regs.edx, mfn, dinfo->guest_width);
- start_info = xc_map_foreign_range(
- xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
- if ( start_info == NULL )
- {
- PERROR("xc_map_foreign_range failed (for start_info)");
- goto out;
- }
-
- SET_FIELD(start_info, nr_pages, dinfo->p2m_size, dinfo->guest_width);
- SET_FIELD(start_info, shared_info, shared_info_frame<<PAGE_SHIFT, dinfo->guest_width);
- SET_FIELD(start_info, flags, 0, dinfo->guest_width);
- if ( GET_FIELD(start_info, store_mfn, dinfo->guest_width) > dinfo->p2m_size )
- {
- ERROR("Suspend record xenstore frame number is bad");
- munmap(start_info, PAGE_SIZE);
- goto out;
- }
- *store_mfn = ctx->p2m[GET_FIELD(start_info, store_mfn, dinfo->guest_width)];
- SET_FIELD(start_info, store_mfn, *store_mfn, dinfo->guest_width);
- SET_FIELD(start_info, store_evtchn, store_evtchn, dinfo->guest_width);
- if ( GET_FIELD(start_info, console.domU.mfn, dinfo->guest_width) > dinfo->p2m_size )
- {
- ERROR("Suspend record console frame number is bad");
- munmap(start_info, PAGE_SIZE);
- goto out;
- }
- *console_mfn = ctx->p2m[GET_FIELD(start_info, console.domU.mfn, dinfo->guest_width)];
- SET_FIELD(start_info, console.domU.mfn, *console_mfn, dinfo->guest_width);
- SET_FIELD(start_info, console.domU.evtchn, console_evtchn, dinfo->guest_width);
- munmap(start_info, PAGE_SIZE);
- }
- /* Uncanonicalise each GDT frame number. */
- if ( GET_FIELD(ctxt, gdt_ents, dinfo->guest_width) > 8192 )
- {
- ERROR("GDT entry count out of range");
- goto out;
- }
-
- for ( j = 0; (512*j) < GET_FIELD(ctxt, gdt_ents, dinfo->guest_width); j++ )
- {
- pfn = GET_FIELD(ctxt, gdt_frames[j], dinfo->guest_width);
- if ( (pfn >= dinfo->p2m_size) ||
- (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
- {
- ERROR("GDT frame number %i (0x%lx) is bad",
- j, (unsigned long)pfn);
- goto out;
- }
- SET_FIELD(ctxt, gdt_frames[j], ctx->p2m[pfn], dinfo->guest_width);
- }
- /* Uncanonicalise the page table base pointer. */
- pfn = UNFOLD_CR3(GET_FIELD(ctxt, ctrlreg[3], dinfo->guest_width));
-
- if ( pfn >= dinfo->p2m_size )
- {
- ERROR("PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
- pfn, dinfo->p2m_size, pfn_type[pfn]);
- goto out;
- }
-
- if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
- ((unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
- {
- ERROR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
- pfn, dinfo->p2m_size, pfn_type[pfn],
- (unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
- goto out;
- }
- SET_FIELD(ctxt, ctrlreg[3], FOLD_CR3(ctx->p2m[pfn]), dinfo->guest_width);
-
- /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
- if ( (ctx->pt_levels == 4) && (ctxt->x64.ctrlreg[1] & 1) )
- {
- pfn = UNFOLD_CR3(ctxt->x64.ctrlreg[1] & ~1);
- if ( pfn >= dinfo->p2m_size )
- {
- ERROR("User PT base is bad: pfn=%lu p2m_size=%lu",
- pfn, dinfo->p2m_size);
- goto out;
- }
- if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
- ((unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
- {
- ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
- pfn, dinfo->p2m_size, pfn_type[pfn],
- (unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
- goto out;
- }
- ctxt->x64.ctrlreg[1] = FOLD_CR3(ctx->p2m[pfn]);
- }
- frc = xc_vcpu_setcontext(xch, dom, i, ctxt);
- if ( frc != 0 )
- {
- PERROR("Couldn't build vcpu%d", i);
- goto out;
- }
-
- if ( !ext_vcpucontext )
- goto vcpu_ext_state_restore;
- memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
- vcpup += 128;
- domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
- domctl.domain = dom;
- frc = xc_domctl(xch, &domctl);
- if ( frc != 0 )
- {
- PERROR("Couldn't set extended vcpu%d info", i);
- goto out;
- }
-
- vcpu_ext_state_restore:
- if ( !vcpuextstate_size )
- continue;
-
- memcpy(&domctl.u.vcpuextstate.xfeature_mask, vcpup,
- sizeof(domctl.u.vcpuextstate.xfeature_mask));
- vcpup += sizeof(domctl.u.vcpuextstate.xfeature_mask);
- memcpy(&domctl.u.vcpuextstate.size, vcpup,
- sizeof(domctl.u.vcpuextstate.size));
- vcpup += sizeof(domctl.u.vcpuextstate.size);
-
- buffer = xc_hypercall_buffer_alloc(xch, buffer,
- domctl.u.vcpuextstate.size);
- if ( !buffer )
- {
- PERROR("Could not allocate buffer to restore eXtended States");
- goto out;
- }
- memcpy(buffer, vcpup, domctl.u.vcpuextstate.size);
- vcpup += domctl.u.vcpuextstate.size;
-
- domctl.cmd = XEN_DOMCTL_setvcpuextstate;
- domctl.domain = dom;
- domctl.u.vcpuextstate.vcpu = i;
- set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
- frc = xc_domctl(xch, &domctl);
- if ( frc != 0 )
- {
- PERROR("Couldn't set eXtended States for vcpu%d", i);
- goto out;
- }
- xc_hypercall_buffer_free(xch, buffer);
- }
-
- memcpy(shared_info_page, tailbuf.u.pv.shared_info_page, PAGE_SIZE);
-
- DPRINTF("Completed checkpoint load\n");
-
- /* Restore contents of shared-info page. No checking needed. */
- new_shared_info = xc_map_foreign_range(
- xch, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
- if ( new_shared_info == NULL )
- {
- PERROR("xc_map_foreign_range failed (for new_shared_info)");
- goto out;
- }
-
- /* restore saved vcpu_info and arch specific info */
- MEMCPY_FIELD(new_shared_info, old_shared_info, vcpu_info, dinfo->guest_width);
- MEMCPY_FIELD(new_shared_info, old_shared_info, arch, dinfo->guest_width);
-
- /* clear any pending events and the selector */
- MEMSET_ARRAY_FIELD(new_shared_info, evtchn_pending, 0, dinfo->guest_width);
- for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
- SET_FIELD(new_shared_info, vcpu_info[i].evtchn_pending_sel, 0, dinfo->guest_width);
-
- /* mask event channels */
- MEMSET_ARRAY_FIELD(new_shared_info, evtchn_mask, 0xff, dinfo->guest_width);
-
- /* leave wallclock time. set by hypervisor */
- munmap(new_shared_info, PAGE_SIZE);
-
- /* Uncanonicalise the pfn-to-mfn table frame-number list. */
- for ( i = 0; i < P2M_FL_ENTRIES; i++ )
- {
- pfn = p2m_frame_list[i];
- if ( (pfn >= dinfo->p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
- {
- ERROR("PFN-to-MFN frame number %i (%#lx) is bad", i, pfn);
- goto out;
- }
- p2m_frame_list[i] = ctx->p2m[pfn];
- }
-
- /* Copy the P2M we've constructed to the 'live' P2M */
- if ( !(ctx->live_p2m = xc_map_foreign_pages(xch, dom, PROT_WRITE,
- p2m_frame_list, P2M_FL_ENTRIES)) )
- {
- PERROR("Couldn't map p2m table");
- goto out;
- }
-
- /* If the domain we're restoring has a different word size to ours,
- * we need to adjust the live_p2m assignment appropriately */
- if ( dinfo->guest_width > sizeof (xen_pfn_t) )
- for ( i = dinfo->p2m_size - 1; i >= 0; i-- )
- ((int64_t *)ctx->live_p2m)[i] = (long)ctx->p2m[i];
- else if ( dinfo->guest_width < sizeof (xen_pfn_t) )
- for ( i = 0; i < dinfo->p2m_size; i++ )
- ((uint32_t *)ctx->live_p2m)[i] = ctx->p2m[i];
- else
- memcpy(ctx->live_p2m, ctx->p2m, dinfo->p2m_size * sizeof(xen_pfn_t));
- munmap(ctx->live_p2m, P2M_FL_ENTRIES * PAGE_SIZE);
-
- frc = xc_dom_gnttab_seed(xch, dom, *console_mfn, *store_mfn,
- console_domid, store_domid);
- if (frc != 0)
- {
- ERROR("error seeding grant table");
- goto out;
- }
-
- DPRINTF("Domain ready to be built.\n");
- rc = 0;
- goto out;
-
- finish_hvm:
- if ( tdata.data != NULL )
- {
- if ( callbacks != NULL && callbacks->toolstack_restore != NULL )
- {
- frc = callbacks->toolstack_restore(dom, tdata.data, tdata.len,
- callbacks->data);
- free(tdata.data);
- if ( frc < 0 )
- {
- PERROR("error calling toolstack_restore");
- goto out;
- }
- } else {
- rc = -1;
- ERROR("toolstack data available but no callback provided\n");
- free(tdata.data);
- goto out;
- }
- }
-
- /* Dump the QEMU state to a state file for QEMU to load */
- if ( dump_qemu(xch, dom, &tailbuf.u.hvm) ) {
- PERROR("Error dumping QEMU state to file");
- goto out;
- }
-
- /* These comms pages need to be zeroed at the start of day */
- if ( xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[0]) ||
- xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[1]) ||
- xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[2]) )
- {
- PERROR("error zeroing magic pages");
- goto out;
- }
-
- if ( (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_IOREQ_PFN, tailbuf.u.hvm.magicpfns[0]))
- || (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_BUFIOREQ_PFN, tailbuf.u.hvm.magicpfns[1]))
- || (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_STORE_PFN, tailbuf.u.hvm.magicpfns[2]))
- || (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_PAE_ENABLED, pae))
- || (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_STORE_EVTCHN,
- store_evtchn)) )
- {
- PERROR("error setting HVM params: %i", frc);
- goto out;
- }
- *store_mfn = tailbuf.u.hvm.magicpfns[2];
-
- if ( console_pfn ) {
- if ( xc_clear_domain_page(xch, dom, console_pfn) ) {
- PERROR("error zeroing console page");
- goto out;
- }
- if ( (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_CONSOLE_PFN, console_pfn)) ) {
- PERROR("error setting HVM param: %i", frc);
- goto out;
- }
- *console_mfn = console_pfn;
- }
-
- frc = xc_domain_hvm_setcontext(xch, dom, tailbuf.u.hvm.hvmbuf,
- tailbuf.u.hvm.reclen);
- if ( frc )
- {
- PERROR("error setting the HVM context");
- goto out;
- }
-
- frc = xc_dom_gnttab_hvm_seed(xch, dom, *console_mfn, *store_mfn,
- console_domid, store_domid);
- if (frc != 0)
- {
- ERROR("error seeding grant table");
- goto out;
- }
-
- /* HVM success! */
- rc = 0;
-
- out:
- if ( (rc != 0) && (dom != 0) )
- xc_domain_destroy(xch, dom);
- xc_hypercall_buffer_free(xch, ctxt);
- free(mmu);
- free(ctx->p2m);
- free(pfn_type);
- free(region_mfn);
- free(ctx->p2m_batch);
- pagebuf_free(&pagebuf);
- tailbuf_free(&tailbuf);
-
- /* discard cache for save file */
- discard_file_cache(xch, io_fd, 1 /*flush*/);
-
- fcntl(io_fd, F_SETFL, orig_io_fd_flags);
-
- DPRINTF("Restore exit of domid %u with rc=%d\n", dom, rc);
-
- return rc;
-}
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
deleted file mode 100644
index 254fdb3..0000000
--- a/tools/libxc/xc_domain_save.c
+++ /dev/null
@@ -1,2192 +0,0 @@
-/******************************************************************************
- * xc_linux_save.c
- *
- * Save the state of a running Linux session.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Copyright (c) 2003, K A Fraser.
- */
-
-#include <inttypes.h>
-#include <time.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <assert.h>
-
-#include "xc_private.h"
-#include "xc_bitops.h"
-#include "xc_dom.h"
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-#include <xen/hvm/params.h>
-
-/*
-** Default values for important tuning parameters. These can be overridden
-** by passing non-zero replacement values to xc_domain_save().
-**
-** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
-**
-*/
-#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
-#define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
-
-struct save_ctx {
- unsigned long hvirt_start; /* virtual starting address of the hypervisor */
- unsigned int pt_levels; /* #levels of page tables used by the current guest */
- unsigned long max_mfn; /* max mfn of the whole machine */
- xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
- xen_pfn_t *live_m2p; /* Live mapping of system MFN to PFN table. */
- unsigned long m2p_mfn0;
- struct domain_info_context dinfo;
-};
-
-/* buffer for output */
-struct outbuf {
- void* buf;
- size_t size;
- size_t pos;
- int write_count;
-};
-
-#define OUTBUF_SIZE (16384 * 1024)
-
-/* grep fodder: machine_to_phys */
-
-#define mfn_to_pfn(_mfn) (ctx->live_m2p[(_mfn)])
-
-#define pfn_to_mfn(_pfn) \
- ((xen_pfn_t) ((dinfo->guest_width==8) \
- ? (((uint64_t *)ctx->live_p2m)[(_pfn)]) \
- : ((((uint32_t *)ctx->live_p2m)[(_pfn)]) == 0xffffffffU \
- ? (-1UL) : (((uint32_t *)ctx->live_p2m)[(_pfn)]))))
-
-/*
- * Returns TRUE if the given machine frame number has a unique mapping
- * in the guest's pseudophysical map.
- */
-#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
- (((_mfn) < (ctx->max_mfn)) && \
- ((mfn_to_pfn(_mfn) < (dinfo->p2m_size)) && \
- (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))
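-
-/*
- * Illustrative note (not in the original source): the check above rejects
- * stale M2P entries by requiring the round trip mfn -> pfn -> mfn to land
- * back on the same frame, e.g.:
- *
- *     xen_pfn_t pfn = mfn_to_pfn(mfn);
- *     int ok = (pfn < dinfo->p2m_size) && (pfn_to_mfn(pfn) == mfn);
- */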
-
-#define SUPERPAGE_PFN_SHIFT 9
-#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
-
-#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
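-
-/*
- * Illustrative note (not in the original source): with
- * SUPERPAGE_PFN_SHIFT == 9, a superpage spans 512 4kB frames (2MB on x86),
- * so SUPER_PAGE_START() holds exactly for pfns 0x0, 0x200, 0x400, ...
- */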
-
-static uint64_t tv_to_us(struct timeval *new)
-{
- return (new->tv_sec * 1000000) + new->tv_usec;
-}
-
-static uint64_t llgettimeofday(void)
-{
- struct timeval now;
- gettimeofday(&now, NULL);
- return tv_to_us(&now);
-}
-
-static uint64_t tv_delta(struct timeval *new, struct timeval *old)
-{
- return (((new->tv_sec - old->tv_sec)*1000000) +
- (new->tv_usec - old->tv_usec));
-}
-
-static int noncached_write(xc_interface *xch,
- struct outbuf* ob,
- int fd, void *buffer, int len)
-{
- int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;
-
- ob->write_count += len;
- if ( ob->write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
- {
-        /* Time to discard cache - don't care if this fails */
- int saved_errno = errno;
- discard_file_cache(xch, fd, 0 /* no flush */);
- errno = saved_errno;
- ob->write_count = 0;
- }
-
- return rc;
-}
-
-static int outbuf_init(xc_interface *xch, struct outbuf* ob, size_t size)
-{
- memset(ob, 0, sizeof(*ob));
-
- if ( !(ob->buf = malloc(size)) ) {
- DPRINTF("error allocating output buffer of size %zu\n", size);
- return -1;
- }
-
- ob->size = size;
-
- return 0;
-}
-
-static int outbuf_free(struct outbuf *ob)
-{
- free(ob->buf);
- ob->buf = NULL;
- return 0;
-}
-
-static inline int outbuf_write(xc_interface *xch,
- struct outbuf* ob, void* buf, size_t len)
-{
- if ( len > ob->size - ob->pos ) {
- errno = ERANGE;
- DBGPRINTF("outbuf_write: %zu > %zu@%zu\n", len, ob->size - ob->pos, ob->pos);
- return -1;
- }
-
- memcpy(ob->buf + ob->pos, buf, len);
- ob->pos += len;
-
- return 0;
-}
-
-/* prep for nonblocking I/O */
-static int outbuf_flush(xc_interface *xch, struct outbuf* ob, int fd)
-{
- int rc;
- int cur = 0;
-
- if ( !ob->pos )
- return 0;
-
- rc = write(fd, ob->buf, ob->pos);
- while (rc < 0 || cur + rc < ob->pos) {
- if (rc < 0 && errno != EAGAIN && errno != EINTR) {
- DPRINTF("error flushing output: %d\n", errno);
- return -1;
- }
- if (rc > 0)
- cur += rc;
-
- rc = write(fd, ob->buf + cur, ob->pos - cur);
- }
-
- ob->pos = 0;
-
- return 0;
-}
-
-/* if there's no room in the buffer, flush it and try again. */
-static inline int outbuf_hardwrite(xc_interface *xch,
- struct outbuf* ob, int fd, void* buf,
- size_t len)
-{
- if ( !len )
- return 0;
-
- if ( !outbuf_write(xch, ob, buf, len) )
- return 0;
-
- if ( outbuf_flush(xch, ob, fd) < 0 )
- return -1;
-
- return outbuf_write(xch, ob, buf, len);
-}
-
-/* start buffering output once we've reached checkpoint mode. */
-static inline int write_buffer(xc_interface *xch,
- int dobuf, struct outbuf* ob, int fd, void* buf,
- size_t len)
-{
- if ( dobuf )
- return outbuf_hardwrite(xch, ob, fd, buf, len);
- else
- return write_exact(fd, buf, len);
-}
-
-/* like write_buffer for noncached, which returns number of bytes written */
-static inline int write_uncached(xc_interface *xch,
- int dobuf, struct outbuf* ob, int fd,
- void* buf, size_t len)
-{
- if ( dobuf )
- return outbuf_hardwrite(xch, ob, fd, buf, len) ? -1 : len;
- else
- return noncached_write(xch, ob, fd, buf, len);
-}
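-
-/*
- * Usage sketch for the outbuf helpers above (illustrative only; assumes an
- * open xc_interface *xch, an output fd, and some buf/len to emit):
- *
- *     struct outbuf ob;
- *     if ( outbuf_init(xch, &ob, OUTBUF_SIZE) < 0 )
- *         return -1;
- *     outbuf_hardwrite(xch, &ob, fd, buf, len);  (buffers, flushing if full)
- *     outbuf_flush(xch, &ob, fd);                (push out any buffered bytes)
- *     outbuf_free(&ob);
- */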
-
-static int write_compressed(xc_interface *xch, comp_ctx *compress_ctx,
- int dobuf, struct outbuf* ob, int fd)
-{
- int rc = 0;
- int header = sizeof(int) + sizeof(unsigned long);
- int marker = XC_SAVE_ID_COMPRESSED_DATA;
- unsigned long compbuf_len = 0;
-
- for(;;)
- {
-        /* check for available space (at least 8k) */
- if ((ob->pos + header + XC_PAGE_SIZE * 2) > ob->size)
- {
- if (outbuf_flush(xch, ob, fd) < 0)
- {
- ERROR("Error when flushing outbuf intermediate");
- return -1;
- }
- }
-
- rc = xc_compression_compress_pages(xch, compress_ctx,
- ob->buf + ob->pos + header,
- ob->size - ob->pos - header,
- &compbuf_len);
- if (!rc)
- break;
-
- if (outbuf_hardwrite(xch, ob, fd, &marker, sizeof(marker)) < 0)
- {
- PERROR("Error when writing marker (errno %d)", errno);
- return -1;
- }
-
- if (outbuf_hardwrite(xch, ob, fd, &compbuf_len, sizeof(compbuf_len)) < 0)
- {
- PERROR("Error when writing compbuf_len (errno %d)", errno);
- return -1;
- }
-
- ob->pos += (size_t) compbuf_len;
- if (!dobuf && outbuf_flush(xch, ob, fd) < 0)
- {
- ERROR("Error when writing compressed chunk");
- return -1;
- }
- }
-
- return 0;
-}
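-
-/*
- * Illustrative summary (not in the original source) of each compressed
- * chunk emitted above: an int XC_SAVE_ID_COMPRESSED_DATA marker, an
- * unsigned long compbuf_len, then compbuf_len bytes of compressed page
- * data produced by xc_compression_compress_pages().
- */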
-
-struct time_stats {
- struct timeval wall;
- long long d0_cpu, d1_cpu;
-};
-
-static int print_stats(xc_interface *xch, uint32_t domid, int pages_sent,
- struct time_stats *last,
- xc_shadow_op_stats_t *stats, int print)
-{
- struct time_stats now;
-
- gettimeofday(&now.wall, NULL);
-
- now.d0_cpu = xc_domain_get_cpu_usage(xch, 0, /* FIXME */ 0)/1000;
- now.d1_cpu = xc_domain_get_cpu_usage(xch, domid, /* FIXME */ 0)/1000;
-
- if ( (now.d0_cpu == -1) || (now.d1_cpu == -1) )
- DPRINTF("ARRHHH!!\n");
-
- if ( print )
- {
- long long wall_delta;
- long long d0_cpu_delta;
- long long d1_cpu_delta;
-
- wall_delta = tv_delta(&now.wall,&last->wall)/1000;
- if ( wall_delta == 0 )
- wall_delta = 1;
-
- d0_cpu_delta = (now.d0_cpu - last->d0_cpu)/1000;
- d1_cpu_delta = (now.d1_cpu - last->d1_cpu)/1000;
-
- DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
- "dirtied %dMb/s %" PRId32 " pages\n",
- wall_delta,
- (int)((d0_cpu_delta*100)/wall_delta),
- (int)((d1_cpu_delta*100)/wall_delta),
- (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
- (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
- stats->dirty_count);
- }
-
- *last = now;
-
- return 0;
-}
-
-
-static int analysis_phase(xc_interface *xch, uint32_t domid, struct save_ctx *ctx,
- xc_hypercall_buffer_t *arr, int runs)
-{
- long long start, now;
- xc_shadow_op_stats_t stats;
- int j;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- start = llgettimeofday();
-
- for ( j = 0; j < runs; j++ )
- {
- int i;
-
- xc_shadow_control(xch, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
- arr, dinfo->p2m_size, NULL, 0, NULL);
- DPRINTF("#Flush\n");
- for ( i = 0; i < 40; i++ )
- {
- usleep(50000);
- now = llgettimeofday();
- xc_shadow_control(xch, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
- NULL, 0, NULL, 0, &stats);
- DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
- ((now-start)+500)/1000,
- stats.fault_count, stats.dirty_count);
- }
- }
-
- return -1;
-}
-
-static int suspend_and_state(int (*suspend)(void*), void* data,
- xc_interface *xch, int io_fd, int dom,
- xc_dominfo_t *info)
-{
- if ( !(*suspend)(data) )
- {
- ERROR("Suspend request failed");
- return -1;
- }
-
- if ( (xc_domain_getinfo(xch, dom, 1, info) != 1) ||
- !info->shutdown || (info->shutdown_reason != SHUTDOWN_suspend) )
- {
- ERROR("Domain not in suspended state");
- return -1;
- }
-
- return 0;
-}
-
-/*
-** Map the top-level page of MFNs from the guest. The guest might not have
-** finished resuming from a previous restore operation, so we wait a while for
-** it to update the MFN to a reasonable value.
-*/
-static void *map_frame_list_list(xc_interface *xch, uint32_t dom,
- struct save_ctx *ctx,
- shared_info_any_t *shinfo)
-{
- int count = 100;
- void *p;
- struct domain_info_context *dinfo = &ctx->dinfo;
- uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list, dinfo->guest_width);
-
- while ( count-- && (fll == 0) )
- {
- usleep(10000);
- fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list, dinfo->guest_width);
- }
-
- if ( fll == 0 )
- {
- ERROR("Timed out waiting for frame list updated.");
- return NULL;
- }
-
- p = xc_map_foreign_range(xch, dom, PAGE_SIZE, PROT_READ, fll);
- if ( p == NULL )
- PERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
-
- return p;
-}
-
-/*
-** During transfer (or in the state file), all page-table pages must be
-** converted into a 'canonical' form where references to actual mfns
-** are replaced with references to the corresponding pfns.
-**
-** This function performs the appropriate conversion, taking into account
-** which entries do not require canonicalization (in particular, those
-** entries which map the virtual address reserved for the hypervisor).
-*/
-static int canonicalize_pagetable(struct save_ctx *ctx,
- unsigned long type, unsigned long pfn,
- const void *spage, void *dpage)
-{
- struct domain_info_context *dinfo = &ctx->dinfo;
- int i, pte_last, xen_start, xen_end, race = 0;
- uint64_t pte;
-
- /*
- ** We need to determine which entries in this page table hold
- ** reserved hypervisor mappings. This depends on the current
- ** page table type as well as the number of paging levels.
- */
- xen_start = xen_end = pte_last = PAGE_SIZE / 8;
-
- if ( (ctx->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
- xen_start = L3_PAGETABLE_ENTRIES_PAE;
-
- /*
- ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
-    ** We can spot this by looking for the guest's mapping of the m2p.
- ** Guests must ensure that this check will fail for other L2s.
- */
- if ( (ctx->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
- {
- int hstart;
- uint64_t he;
-
- hstart = (ctx->hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
- he = ((const uint64_t *) spage)[hstart];
-
- if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == ctx->m2p_mfn0 )
- {
- /* hvirt starts with xen stuff... */
- xen_start = hstart;
- }
- else if ( ctx->hvirt_start != 0xf5800000 )
- {
- /* old L2s from before hole was shrunk... */
- hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
- he = ((const uint64_t *) spage)[hstart];
- if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == ctx->m2p_mfn0 )
- xen_start = hstart;
- }
- }
-
- if ( (ctx->pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /*
- ** XXX SMH: should compute these from hvirt_start (which we have)
- ** and hvirt_end (which we don't)
- */
- xen_start = 256;
- xen_end = 272;
- }
-
- /* Now iterate through the page table, canonicalizing each PTE */
- for (i = 0; i < pte_last; i++ )
- {
- unsigned long pfn, mfn;
-
- pte = ((const uint64_t*)spage)[i];
-
- if ( (i >= xen_start) && (i < xen_end) )
- pte = 0;
-
- if ( pte & _PAGE_PRESENT )
- {
- mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
- {
-                /* This will happen if the type info is stale, which
-                   is quite feasible under live migration */
- pfn = 0; /* zap it - we'll retransmit this page later */
- /* XXX: We can't spot Xen mappings in compat-mode L2es
- * from 64-bit tools, but the only thing in them is the
- * compat m2p, so we quietly zap them. This doesn't
- * count as a race, so don't report it. */
- if ( !(type == XEN_DOMCTL_PFINFO_L2TAB
- && sizeof (unsigned long) > dinfo->guest_width) )
- race = 1; /* inform the caller; fatal if !live */
- }
- else
- pfn = mfn_to_pfn(mfn);
-
- pte &= ~MADDR_MASK_X86;
- pte |= (uint64_t)pfn << PAGE_SHIFT;
-
- /*
- * PAE guest L3Es can contain these flags when running on
- * a 64bit hypervisor. We zap these here to avoid any
- * surprise at restore time...
- */
- if ( (ctx->pt_levels == 3) &&
- (type == XEN_DOMCTL_PFINFO_L3TAB) &&
- (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
- pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
- }
-
- ((uint64_t*)dpage)[i] = pte;
- }
-
- return race;
-}
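-
-/*
- * Worked example (illustrative, not in the original source): a present
- * 64-bit PTE whose address bits carry machine frame mfn is rewritten so
- * they carry the corresponding pfn instead:
- *
- *     pte &= ~MADDR_MASK_X86;
- *     pte |= (uint64_t)mfn_to_pfn(mfn) << PAGE_SHIFT;
- *
- * The restore side applies the inverse substitution against the new
- * host's p2m.
- */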
-
-xen_pfn_t *xc_map_m2p(xc_interface *xch,
- unsigned long max_mfn,
- int prot,
- unsigned long *mfn0)
-{
- privcmd_mmap_entry_t *entries;
- unsigned long m2p_chunks, m2p_size;
- xen_pfn_t *m2p;
- xen_pfn_t *extent_start;
- int i;
-
- m2p = NULL;
- m2p_size = M2P_SIZE(max_mfn);
- m2p_chunks = M2P_CHUNKS(max_mfn);
-
- extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t));
- if ( !extent_start )
- {
- ERROR("failed to allocate space for m2p mfns");
- goto err0;
- }
-
- if ( xc_machphys_mfn_list(xch, m2p_chunks, extent_start) )
- {
- PERROR("xc_get_m2p_mfns");
- goto err1;
- }
-
- entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t));
- if (entries == NULL)
- {
- ERROR("failed to allocate space for mmap entries");
- goto err1;
- }
-
- for ( i = 0; i < m2p_chunks; i++ )
- entries[i].mfn = extent_start[i];
-
- m2p = xc_map_foreign_ranges(xch, DOMID_XEN,
- m2p_size, prot, M2P_CHUNK_SIZE,
- entries, m2p_chunks);
- if (m2p == NULL)
- {
- PERROR("xc_mmap_foreign_ranges failed");
- goto err2;
- }
-
- if (mfn0)
- *mfn0 = entries[0].mfn;
-
-err2:
- free(entries);
-err1:
- free(extent_start);
-
-err0:
- return m2p;
-}
-
-
-static xen_pfn_t *map_and_save_p2m_table(xc_interface *xch,
- int io_fd,
- uint32_t dom,
- struct save_ctx *ctx,
- shared_info_any_t *live_shinfo)
-{
- vcpu_guest_context_any_t ctxt;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- /* Double and single indirect references to the live P2M table */
- void *live_p2m_frame_list_list = NULL;
- void *live_p2m_frame_list = NULL;
-
- /* Copies of the above. */
- xen_pfn_t *p2m_frame_list_list = NULL;
- xen_pfn_t *p2m_frame_list = NULL;
-
- /* The mapping of the live p2m table itself */
- xen_pfn_t *p2m = NULL;
-
- int i, success = 0;
-
- live_p2m_frame_list_list = map_frame_list_list(xch, dom, ctx,
- live_shinfo);
- if ( !live_p2m_frame_list_list )
- goto out;
-
- /* Get a local copy of the live_P2M_frame_list_list */
- if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) )
- {
- ERROR("Couldn't allocate p2m_frame_list_list array");
- goto out;
- }
- memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
-
- /* Canonicalize guest's unsigned long vs ours */
- if ( dinfo->guest_width > sizeof(unsigned long) )
- for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
- if ( i < PAGE_SIZE/dinfo->guest_width )
- p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
- else
- p2m_frame_list_list[i] = 0;
- else if ( dinfo->guest_width < sizeof(unsigned long) )
- for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
- p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
-
- live_p2m_frame_list =
- xc_map_foreign_pages(xch, dom, PROT_READ,
- p2m_frame_list_list,
- P2M_FLL_ENTRIES);
- if ( !live_p2m_frame_list )
- {
- PERROR("Couldn't map p2m_frame_list");
- goto out;
- }
-
- /* Get a local copy of the live_P2M_frame_list */
- if ( !(p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) )
- {
- ERROR("Couldn't allocate p2m_frame_list array");
- goto out;
- }
- memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE);
- memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE);
-
- munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
- live_p2m_frame_list = NULL;
-
- /* Canonicalize guest's unsigned long vs ours */
- if ( dinfo->guest_width > sizeof(unsigned long) )
- for ( i = 0; i < P2M_FL_ENTRIES; i++ )
- p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
- else if ( dinfo->guest_width < sizeof(unsigned long) )
- for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
- p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
-
-
-    /* Map all the frames of the pfn->mfn table. For migration to succeed,
-       the guest must not change which frames are used for this purpose.
-       (It's not clear why it would want to change them, and we'll be OK
-       from a safety POV anyhow.) */
-
- p2m = xc_map_foreign_pages(xch, dom, PROT_READ,
- p2m_frame_list,
- P2M_FL_ENTRIES);
- if ( !p2m )
- {
- PERROR("Couldn't map p2m table");
- goto out;
- }
- ctx->live_p2m = p2m; /* So that translation macros will work */
-
- /* Canonicalise the pfn-to-mfn table frame-number list. */
- for ( i = 0; i < dinfo->p2m_size; i += FPP )
- {
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) )
- {
- ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
- ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx",
- i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], ctx->max_mfn);
- if ( p2m_frame_list[i/FPP] < ctx->max_mfn )
- {
- ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64,
- (uint64_t)p2m_frame_list[i/FPP],
- (uint64_t)ctx->live_m2p[p2m_frame_list[i/FPP]]);
- ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64,
- (uint64_t)ctx->live_m2p[p2m_frame_list[i/FPP]],
- (uint64_t)p2m[ctx->live_m2p[p2m_frame_list[i/FPP]]]);
-
- }
- goto out;
- }
- p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
- }
-
- if ( xc_vcpu_getcontext(xch, dom, 0, &ctxt) )
- {
- PERROR("Could not get vcpu context");
- goto out;
- }
-
- /*
- * Write an extended-info structure to inform the restore code that
- * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
- * slow paths in the restore code.
- */
- {
- unsigned long signature = ~0UL;
- uint32_t chunk1_sz = ((dinfo->guest_width==8)
- ? sizeof(ctxt.x64)
- : sizeof(ctxt.x32));
- uint32_t chunk2_sz = 0;
- uint32_t chunk3_sz = 4;
- uint32_t xcnt_size = 0;
- uint32_t tot_sz;
- DECLARE_DOMCTL;
-
- domctl.cmd = XEN_DOMCTL_getvcpuextstate;
- domctl.domain = dom;
- domctl.u.vcpuextstate.vcpu = 0;
- domctl.u.vcpuextstate.size = 0;
- domctl.u.vcpuextstate.xfeature_mask = 0;
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("No extended context for VCPU%d", i);
- goto out;
- }
- xcnt_size = domctl.u.vcpuextstate.size + 2 * sizeof(uint64_t);
-
- tot_sz = (chunk1_sz + 8) + (chunk2_sz + 8);
- if ( domctl.u.vcpuextstate.xfeature_mask )
- tot_sz += chunk3_sz + 8;
-
- if ( write_exact(io_fd, &signature, sizeof(signature)) ||
- write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
- write_exact(io_fd, "vcpu", 4) ||
- write_exact(io_fd, &chunk1_sz, sizeof(chunk1_sz)) ||
- write_exact(io_fd, &ctxt, chunk1_sz) ||
- write_exact(io_fd, "extv", 4) ||
- write_exact(io_fd, &chunk2_sz, sizeof(chunk2_sz)) ||
-         ((domctl.u.vcpuextstate.xfeature_mask) ?
-          (write_exact(io_fd, "xcnt", 4) ||
-           write_exact(io_fd, &chunk3_sz, sizeof(chunk3_sz)) ||
-           write_exact(io_fd, &xcnt_size, 4)) :
-          0) )
- {
- PERROR("write: extended info");
- goto out;
- }
- }
-
- if ( write_exact(io_fd, p2m_frame_list,
- P2M_FL_ENTRIES * sizeof(xen_pfn_t)) )
- {
- PERROR("write: p2m_frame_list");
- goto out;
- }
-
- success = 1;
-
- out:
-
- if ( !success && p2m )
- munmap(p2m, P2M_FL_ENTRIES * PAGE_SIZE);
-
- if ( live_p2m_frame_list_list )
- munmap(live_p2m_frame_list_list, PAGE_SIZE);
-
- if ( live_p2m_frame_list )
- munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
-
- free(p2m_frame_list_list);
-
- free(p2m_frame_list);
-
- return success ? p2m : NULL;
-}
-
-/* must be done AFTER suspend_and_state() */
-static int save_tsc_info(xc_interface *xch, uint32_t dom, int io_fd)
-{
- int marker = XC_SAVE_ID_TSC_INFO;
- uint32_t tsc_mode, khz, incarn;
- uint64_t nsec;
-
- if ( xc_domain_get_tsc_info(xch, dom, &tsc_mode,
- &nsec, &khz, &incarn) < 0 ||
- write_exact(io_fd, &marker, sizeof(marker)) ||
- write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) ||
- write_exact(io_fd, &nsec, sizeof(nsec)) ||
- write_exact(io_fd, &khz, sizeof(khz)) ||
- write_exact(io_fd, &incarn, sizeof(incarn)) )
- return -1;
- return 0;
-}
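-
-/*
- * Illustrative summary (not in the original source) of the record written
- * above: an int XC_SAVE_ID_TSC_INFO marker followed by uint32_t tsc_mode,
- * uint64_t nsec, uint32_t khz and uint32_t incarn, exactly as returned by
- * xc_domain_get_tsc_info().
- */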
-
-int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags,
- struct save_callbacks* callbacks, int hvm)
-{
- xc_dominfo_t info;
- DECLARE_DOMCTL;
-
- int rc, frc, i, j, last_iter = 0, iter = 0;
- int live = (flags & XCFLAGS_LIVE);
- int debug = (flags & XCFLAGS_DEBUG);
- int superpages = !!hvm;
- int race = 0, sent_last_iter, skip_this_iter = 0;
- unsigned int sent_this_iter = 0;
- int tmem_saved = 0;
-
- /* The new domain's shared-info frame number. */
- unsigned long shared_info_frame;
-
- /* A copy of the CPU context of the guest. */
- vcpu_guest_context_any_t ctxt;
-
- /* A table containing the type of each PFN (/not/ MFN!). */
- xen_pfn_t *pfn_type = NULL;
- unsigned long *pfn_batch = NULL;
- int *pfn_err = NULL;
-
- /* A copy of one frame of guest memory. */
- char page[PAGE_SIZE];
-
- /* Live mapping of shared info structure */
- shared_info_any_t *live_shinfo = NULL;
-
- /* base of the region in which domain memory is mapped */
- unsigned char *region_base = NULL;
-
- /* A copy of the CPU eXtended States of the guest. */
- DECLARE_HYPERCALL_BUFFER(void, buffer);
-
- /* bitmap of pages:
- - that should be sent this iteration (unless later marked as skip);
- - to skip this iteration because already dirty;
- - to fixup by sending at the end if not already resent; */
- DECLARE_HYPERCALL_BUFFER(unsigned long, to_skip);
- DECLARE_HYPERCALL_BUFFER(unsigned long, to_send);
- unsigned long *to_fix = NULL;
-
- struct time_stats time_stats;
- xc_shadow_op_stats_t shadow_stats;
-
- unsigned long needed_to_fix = 0;
- unsigned long total_sent = 0;
-
- uint64_t vcpumap[XC_SR_MAX_VCPUS/64] = { 1ULL };
-
- /* HVM: a buffer for holding HVM context */
- uint32_t hvm_buf_size = 0;
- uint8_t *hvm_buf = NULL;
-
- /* HVM: magic frames for ioreqs and xenstore comms. */
- uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
-
- unsigned long mfn;
-
- /* Without checkpoint compression, the dirty pages, pfn arrays
- * and tailbuf (vcpu ctx, shared info page, etc.) are written
- * directly to outbuf. All of this is done while the domain is
- * suspended.
- *
- * When checkpoint compression is enabled, the dirty pages are
- * buffered, compressed "after" the domain is resumed and then
- * written to outbuf. Since tailbuf data are collected while a
- * domain is suspended, they cannot be directly written to the
-     * outbuf as there is no dirty page data preceding the tailbuf.
-     *
-     * So, two output buffers are maintained. Tailbuf data goes into
- * ob_tailbuf. The dirty pages are compressed after resuming the
- * domain and written to ob_pagebuf. ob_tailbuf is then appended
- * to ob_pagebuf and finally flushed out.
- */
- struct outbuf ob_pagebuf, ob_tailbuf, *ob = NULL;
- struct save_ctx _ctx;
- struct save_ctx *ctx = &_ctx;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- /* Compression context */
- comp_ctx *compress_ctx= NULL;
- /* Even if XCFLAGS_CHECKPOINT_COMPRESS is set, we enable compression only
- * after sending XC_SAVE_ID_ENABLE_COMPRESSION and the tailbuf for
- * first time.
- */
- int compressing = 0;
-
- int completed = 0;
-
- DPRINTF("%s: starting save of domid %u", __func__, dom);
-
- if ( hvm && !callbacks->switch_qemu_logdirty )
- {
- ERROR("No switch_qemu_logdirty callback provided.");
- errno = EINVAL;
- goto exit;
- }
-
- outbuf_init(xch, &ob_pagebuf, OUTBUF_SIZE);
-
- memset(ctx, 0, sizeof(*ctx));
-
- /* If no explicit control parameters given, use defaults */
- max_iters = max_iters ? : DEF_MAX_ITERS;
- max_factor = max_factor ? : DEF_MAX_FACTOR;
-
- if ( !get_platform_info(xch, dom,
- &ctx->max_mfn, &ctx->hvirt_start, &ctx->pt_levels, &dinfo->guest_width) )
- {
- ERROR("Unable to get platform info.");
- goto exit;
- }
-
- if ( xc_domain_getinfo(xch, dom, 1, &info) != 1 )
- {
- PERROR("Could not get domain info");
- goto exit;
- }
-
- shared_info_frame = info.shared_info_frame;
-
- /* Map the shared info frame */
- if ( !hvm )
- {
- live_shinfo = xc_map_foreign_range(xch, dom, PAGE_SIZE,
- PROT_READ, shared_info_frame);
- if ( !live_shinfo )
- {
- PERROR("Couldn't map live_shinfo");
- goto out;
- }
- }
-
- /* Get the size of the P2M table */
- dinfo->p2m_size = xc_domain_maximum_gpfn(xch, dom) + 1;
-
- if ( dinfo->p2m_size > ~XEN_DOMCTL_PFINFO_LTAB_MASK )
- {
- errno = E2BIG;
- ERROR("Cannot save this big a guest");
- goto out;
- }
-
- /* Domain is still running at this point */
- if ( live )
- {
- /* Live suspend. Enable log-dirty mode. */
- if ( xc_shadow_control(xch, dom,
- XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL, 0, NULL) < 0 )
- {
- /* log-dirty already enabled? There's no test op,
- so attempt to disable then reenable it */
- frc = xc_shadow_control(xch, dom, XEN_DOMCTL_SHADOW_OP_OFF,
- NULL, 0, NULL, 0, NULL);
- if ( frc >= 0 )
- {
- frc = xc_shadow_control(xch, dom,
- XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL, 0, NULL);
- }
-
- if ( frc < 0 )
- {
- PERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno );
- goto out;
- }
- }
-
- /* Enable qemu-dm logging dirty pages to xen */
- if ( hvm && callbacks->switch_qemu_logdirty(dom, 1, callbacks->data) )
- {
- PERROR("Couldn't enable qemu log-dirty mode (errno %d)", errno);
- goto out;
- }
- }
- else
- {
-        /* This is a non-live suspend. Suspend the domain. */
- if ( suspend_and_state(callbacks->suspend, callbacks->data, xch,
- io_fd, dom, &info) )
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
- }
-
- if ( flags & XCFLAGS_CHECKPOINT_COMPRESS )
- {
- if (!(compress_ctx = xc_compression_create_context(xch, dinfo->p2m_size)))
- {
- ERROR("Failed to create compression context");
- goto out;
- }
- outbuf_init(xch, &ob_tailbuf, OUTBUF_SIZE/4);
- }
-
- last_iter = !live;
-
- /* pretend we sent all the pages last iteration */
- sent_last_iter = dinfo->p2m_size;
-
- /* Setup to_send / to_fix and to_skip bitmaps */
- to_send = xc_hypercall_buffer_alloc_pages(xch, to_send, NRPAGES(bitmap_size(dinfo->p2m_size)));
- to_skip = xc_hypercall_buffer_alloc_pages(xch, to_skip, NRPAGES(bitmap_size(dinfo->p2m_size)));
- to_fix = calloc(1, bitmap_size(dinfo->p2m_size));
-
- if ( !to_send || !to_fix || !to_skip )
- {
- errno = ENOMEM;
- ERROR("Couldn't allocate to_send array");
- goto out;
- }
-
- memset(to_send, 0xff, bitmap_size(dinfo->p2m_size));
-
- if ( hvm )
- {
- /* Need another buffer for HVM context */
- hvm_buf_size = xc_domain_hvm_getcontext(xch, dom, 0, 0);
- if ( hvm_buf_size == -1 )
- {
- PERROR("Couldn't get HVM context size from Xen");
- goto out;
- }
- hvm_buf = malloc(hvm_buf_size);
- if ( !hvm_buf )
- {
- errno = ENOMEM;
- ERROR("Couldn't allocate memory");
- goto out;
- }
- }
-
- analysis_phase(xch, dom, ctx, HYPERCALL_BUFFER(to_skip), 0);
-
- pfn_type = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
- pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
- pfn_err = malloc(MAX_BATCH_SIZE * sizeof(*pfn_err));
- if ( (pfn_type == NULL) || (pfn_batch == NULL) || (pfn_err == NULL) )
- {
- ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
- errno = ENOMEM;
- goto out;
- }
- memset(pfn_type, 0,
- ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
-
- /* Setup the mfn_to_pfn table mapping */
- if ( !(ctx->live_m2p = xc_map_m2p(xch, ctx->max_mfn, PROT_READ, &ctx->m2p_mfn0)) )
- {
- PERROR("Failed to map live M2P table");
- goto out;
- }
-
- /* Start writing out the saved-domain record. */
- if ( write_exact(io_fd, &dinfo->p2m_size, sizeof(unsigned long)) )
- {
- PERROR("write: p2m_size");
- goto out;
- }
-
- if ( !hvm )
- {
- int err = 0;
-
- /* Map the P2M table, and write the list of P2M frames */
- ctx->live_p2m = map_and_save_p2m_table(xch, io_fd, dom, ctx, live_shinfo);
- if ( ctx->live_p2m == NULL )
- {
- PERROR("Failed to map/save the p2m frame list");
- goto out;
- }
-
- /*
- * Quick belt and braces sanity check.
- */
-
- for ( i = 0; i < dinfo->p2m_size; i++ )
- {
- mfn = pfn_to_mfn(i);
- if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
- {
- DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
- mfn, mfn_to_pfn(mfn));
- err++;
- }
- }
- DPRINTF("Had %d unexplained entries in p2m table\n", err);
- }
-
- print_stats(xch, dom, 0, &time_stats, &shadow_stats, 0);
-
- tmem_saved = xc_tmem_save(xch, dom, io_fd, live, XC_SAVE_ID_TMEM);
- if ( tmem_saved == -1 )
- {
- PERROR("Error when writing to state file (tmem)");
- goto out;
- }
-
- if ( !live && save_tsc_info(xch, dom, io_fd) < 0 )
- {
- PERROR("Error when writing to state file (tsc)");
- goto out;
- }
-
- copypages:
-#define wrexact(fd, buf, len) write_buffer(xch, last_iter, ob, (fd), (buf), (len))
-#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, ob, (fd), (buf), (len))
-#define wrcompressed(fd) write_compressed(xch, compress_ctx, last_iter, ob, (fd))
-
- ob = &ob_pagebuf; /* Holds pfn_types, pages/compressed pages */
- /* Now write out each data page, canonicalising page tables as we go... */
- for ( ; ; )
- {
- unsigned int N, batch, run;
- char reportbuf[80];
-
- snprintf(reportbuf, sizeof(reportbuf),
- "Saving memory: iter %d (last sent %u skipped %u)",
- iter, sent_this_iter, skip_this_iter);
-
- xc_report_progress_start(xch, reportbuf, dinfo->p2m_size);
-
- iter++;
- sent_this_iter = 0;
- skip_this_iter = 0;
- N = 0;
-
- while ( N < dinfo->p2m_size )
- {
- xc_report_progress_step(xch, N, dinfo->p2m_size);
-
- if ( !last_iter )
- {
- /* Slightly wasteful to peek the whole array every time,
- but this is fast enough for the moment. */
- frc = xc_shadow_control(
- xch, dom, XEN_DOMCTL_SHADOW_OP_PEEK, HYPERCALL_BUFFER(to_skip),
- dinfo->p2m_size, NULL, 0, NULL);
- if ( frc != dinfo->p2m_size )
- {
- ERROR("Error peeking shadow bitmap");
- goto out;
- }
- }
-
- /* load pfn_type[] with the mfn of all the pages we're doing in
- this batch. */
- for ( batch = 0;
- (batch < MAX_BATCH_SIZE) && (N < dinfo->p2m_size);
- N++ )
- {
- int n = N;
-
- if ( debug )
- {
- DPRINTF("%d pfn= %08lx mfn= %08lx %d",
- iter, (unsigned long)n,
- hvm ? 0 : pfn_to_mfn(n),
- test_bit(n, to_send));
- if ( !hvm && is_mapped(pfn_to_mfn(n)) )
- DPRINTF(" [mfn]= %08lx",
- mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
- DPRINTF("\n");
- }
-
- if ( completed )
- {
- /* for sparse bitmaps, word-by-word may save time */
- if ( !to_send[N >> ORDER_LONG] )
- {
- /* incremented again in for loop! */
- N += BITS_PER_LONG - 1;
- continue;
- }
-
- if ( !test_bit(n, to_send) )
- continue;
-
- pfn_batch[batch] = n;
- if ( hvm )
- pfn_type[batch] = n;
- else
- pfn_type[batch] = pfn_to_mfn(n);
- }
- else
- {
- int dont_skip = (last_iter || (superpages && iter==1));
-
- if ( !dont_skip &&
- test_bit(n, to_send) &&
- test_bit(n, to_skip) )
- skip_this_iter++; /* stats keeping */
-
- if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
- (test_bit(n, to_send) && dont_skip) ||
- (test_bit(n, to_fix) && last_iter)) )
- continue;
-
- /* First time through, try to keep superpages in the same batch */
- if ( superpages && iter == 1
- && SUPER_PAGE_START(n)
- && batch + SUPERPAGE_NR_PFNS > MAX_BATCH_SIZE )
- break;
-
- /*
- ** we get here if:
- ** 1. page is marked to_send & hasn't already been re-dirtied
- ** 2. (ignore to_skip in first and last iterations)
- ** 3. add in pages that still need fixup (net bufs)
- */
-
- pfn_batch[batch] = n;
-
- /* Hypercall interfaces operate in PFNs for HVM guests
- * and MFNs for PV guests */
- if ( hvm )
- pfn_type[batch] = n;
- else
- pfn_type[batch] = pfn_to_mfn(n);
-
- if ( !is_mapped(pfn_type[batch]) )
- {
- /*
-                        ** not currently in pseudo-physical map -- set bit
-                        ** in to_fix since we must send this page in last_iter
-                        ** unless it's sent sooner anyhow, or it never enters
-                        ** the pseudo-physical map (e.g. for ballooned-down doms)
- */
- set_bit(n, to_fix);
- continue;
- }
-
- if ( last_iter &&
- test_bit(n, to_fix) &&
- !test_bit(n, to_send) )
- {
- needed_to_fix++;
- DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
- iter, n, pfn_type[batch]);
- }
-
- clear_bit(n, to_fix);
- }
-
- batch++;
- }
-
- if ( batch == 0 )
- goto skip; /* vanishingly unlikely... */
-
- region_base = xc_map_foreign_bulk(
- xch, dom, PROT_READ, pfn_type, pfn_err, batch);
- if ( region_base == NULL )
- {
- PERROR("map batch failed");
- goto out;
- }
-
- /* Get page types */
- if ( xc_get_pfn_type_batch(xch, dom, batch, pfn_type) )
- {
- PERROR("get_pfn_type_batch failed");
- goto out;
- }
-
- for ( run = j = 0; j < batch; j++ )
- {
- unsigned long gmfn = pfn_batch[j];
-
- if ( !hvm )
- gmfn = pfn_to_mfn(gmfn);
-
- if ( pfn_type[j] == XEN_DOMCTL_PFINFO_BROKEN )
- {
- pfn_type[j] |= pfn_batch[j];
- ++run;
- continue;
- }
-
- if ( pfn_err[j] )
- {
- if ( pfn_type[j] == XEN_DOMCTL_PFINFO_XTAB )
- continue;
-
- DPRINTF("map fail: page %i mfn %08lx err %d\n",
- j, gmfn, pfn_err[j]);
- pfn_type[j] = XEN_DOMCTL_PFINFO_XTAB;
- continue;
- }
-
- if ( pfn_type[j] == XEN_DOMCTL_PFINFO_XTAB )
- {
- DPRINTF("type fail: page %i mfn %08lx\n", j, gmfn);
- continue;
- }
-
- if ( superpages && iter==1 && test_bit(gmfn, to_skip))
- pfn_type[j] = XEN_DOMCTL_PFINFO_XALLOC;
-
- /* canonicalise mfn->pfn */
- pfn_type[j] |= pfn_batch[j];
- ++run;
-
- if ( debug )
- {
- if ( hvm )
- DPRINTF("%d pfn=%08lx sum=%08lx\n",
- iter,
- pfn_type[j],
- csum_page(region_base + (PAGE_SIZE*j)));
- else
- DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
- " sum= %08lx\n",
- iter,
- pfn_type[j],
- gmfn,
- mfn_to_pfn(gmfn),
- csum_page(region_base + (PAGE_SIZE*j)));
- }
- }
-
- if ( !run )
- {
- munmap(region_base, batch*PAGE_SIZE);
- continue; /* bail on this batch: no valid pages */
- }
-
- if ( wrexact(io_fd, &batch, sizeof(unsigned int)) )
- {
- PERROR("Error when writing to state file (2)");
- goto out;
- }
-
- if ( sizeof(unsigned long) < sizeof(*pfn_type) )
- for ( j = 0; j < batch; j++ )
- ((unsigned long *)pfn_type)[j] = pfn_type[j];
- if ( wrexact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
- {
- PERROR("Error when writing to state file (3)");
- goto out;
- }
- if ( sizeof(unsigned long) < sizeof(*pfn_type) )
- while ( --j >= 0 )
- pfn_type[j] = ((unsigned long *)pfn_type)[j];
-
- /* entering this loop, pfn_type is now in pfns (Not mfns) */
- run = 0;
- for ( j = 0; j < batch; j++ )
- {
- unsigned long pfn, pagetype;
- void *spage = (char *)region_base + (PAGE_SIZE*j);
-
- pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- if ( pagetype != 0 )
- {
- /* If the page is not a normal data page, write out any
-                       run of pages we may have previously accumulated */
- if ( !compressing && run )
- {
- if ( wruncached(io_fd, live,
- (char*)region_base+(PAGE_SIZE*(j-run)),
- PAGE_SIZE*run) != PAGE_SIZE*run )
- {
- PERROR("Error when writing to state file (4a)"
- " (errno %d)", errno);
- goto out;
- }
- run = 0;
- }
- }
-
- /*
- * skip pages that aren't present,
- * or are broken, or are alloc-only
- */
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB
- || pagetype == XEN_DOMCTL_PFINFO_BROKEN
- || pagetype == XEN_DOMCTL_PFINFO_XALLOC )
- continue;
-
- pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
- (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /* We have a pagetable page: need to rewrite it. */
- race =
- canonicalize_pagetable(ctx, pagetype, pfn, spage, page);
-
- if ( race && !live )
- {
- ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
- pagetype);
- goto out;
- }
-
- if (compressing)
- {
- int c_err;
- /* Mark pagetable page to be sent uncompressed */
- c_err = xc_compression_add_page(xch, compress_ctx, page,
- pfn, 1 /* raw page */);
- if (c_err == -2) /* OOB PFN */
- {
- ERROR("Could not add pagetable page "
- "(pfn:%" PRIpfn "to page buffer\n", pfn);
- goto out;
- }
-
- if (c_err == -1)
- {
-                            /*
-                             * We are out of buffer space to hold dirty
-                             * pages. Compress and flush the current buffer
-                             * to make space. This is a corner case that
-                             * slows down checkpointing, as the compression
-                             * happens while the domain is suspended. It
-                             * happens seldom; if you find it occurring
-                             * frequently, increase PAGE_BUFFER_SIZE in
-                             * xc_compression.c.
-                             */
- if (wrcompressed(io_fd) < 0)
- {
- ERROR("Error when writing compressed"
- " data (4b)\n");
- goto out;
- }
- }
- }
- else if ( wruncached(io_fd, live, page,
- PAGE_SIZE) != PAGE_SIZE )
- {
- PERROR("Error when writing to state file (4b)"
- " (errno %d)", errno);
- goto out;
- }
- }
- else
- {
- /* We have a normal page: accumulate it for writing. */
- if (compressing)
- {
- int c_err;
- /* For checkpoint compression, accumulate the page in the
- * page buffer, to be compressed later.
- */
- c_err = xc_compression_add_page(xch, compress_ctx, spage,
- pfn, 0 /* not raw page */);
-
- if (c_err == -2) /* OOB PFN */
- {
- ERROR("Could not add page "
- "(pfn:%" PRIpfn "to page buffer\n", pfn);
- goto out;
- }
-
- if (c_err == -1)
- {
- if (wrcompressed(io_fd) < 0)
- {
- ERROR("Error when writing compressed"
- " data (4c)\n");
- goto out;
- }
- }
- }
- else
- run++;
- }
- } /* end of the write out for this batch */
-
- if ( run )
- {
- /* write out the last accumulated run of pages */
- if ( wruncached(io_fd, live,
- (char*)region_base+(PAGE_SIZE*(j-run)),
- PAGE_SIZE*run) != PAGE_SIZE*run )
- {
- PERROR("Error when writing to state file (4c)"
- " (errno %d)", errno);
- goto out;
- }
- }
-
- sent_this_iter += batch;
-
- munmap(region_base, batch*PAGE_SIZE);
-
- } /* end of this while loop for this iteration */
-
- skip:
-
- xc_report_progress_step(xch, dinfo->p2m_size, dinfo->p2m_size);
-
- total_sent += sent_this_iter;
-
- if ( last_iter )
- {
- print_stats( xch, dom, sent_this_iter, &time_stats, &shadow_stats, 1);
-
- DPRINTF("Total pages sent= %ld (%.2fx)\n",
- total_sent, ((float)total_sent)/dinfo->p2m_size );
- DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
- }
-
- if ( last_iter && debug )
- {
- int id = XC_SAVE_ID_ENABLE_VERIFY_MODE;
- memset(to_send, 0xff, bitmap_size(dinfo->p2m_size));
- debug = 0;
- DPRINTF("Entering debug resend-all mode\n");
-
- /* send "-1" to put receiver into debug mode */
- if ( wrexact(io_fd, &id, sizeof(int)) )
- {
- PERROR("Error when writing to state file (6)");
- goto out;
- }
-
- continue;
- }
-
- if ( last_iter )
- break;
-
- if ( live )
- {
- if ( (iter >= max_iters) ||
- (sent_this_iter+skip_this_iter < 50) ||
- (total_sent > dinfo->p2m_size*max_factor) )
- {
- DPRINTF("Start last iteration\n");
- last_iter = 1;
-
- if ( suspend_and_state(callbacks->suspend, callbacks->data,
- xch, io_fd, dom, &info) )
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
-
- DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
- if ( (tmem_saved > 0) &&
- (xc_tmem_save_extra(xch,dom,io_fd,XC_SAVE_ID_TMEM_EXTRA) == -1) )
- {
- PERROR("Error when writing to state file (tmem)");
- goto out;
- }
-
- if ( save_tsc_info(xch, dom, io_fd) < 0 )
- {
- PERROR("Error when writing to state file (tsc)");
- goto out;
- }
-
-
- }
-
- if ( xc_shadow_control(xch, dom,
- XEN_DOMCTL_SHADOW_OP_CLEAN, HYPERCALL_BUFFER(to_send),
- dinfo->p2m_size, NULL, 0, &shadow_stats) != dinfo->p2m_size )
- {
- PERROR("Error flushing shadow PT");
- goto out;
- }
-
- sent_last_iter = sent_this_iter;
-
- print_stats(xch, dom, sent_this_iter, &time_stats, &shadow_stats, 1);
-
- }
- } /* end of infinite for loop */
-
- DPRINTF("All memory is saved\n");
-
- /* After last_iter, buffer the rest of pagebuf & tailbuf data into a
- * separate output buffer and flush it after the compressed page chunks.
- */
- if (compressing)
- {
- ob = &ob_tailbuf;
- ob->pos = 0;
- }
-
- {
- struct chunk {
- int id;
- int max_vcpu_id;
- uint64_t vcpumap[XC_SR_MAX_VCPUS/64];
- } chunk = { XC_SAVE_ID_VCPU_INFO, info.max_vcpu_id };
-
- if ( info.max_vcpu_id >= XC_SR_MAX_VCPUS )
- {
- errno = E2BIG;
- ERROR("Too many VCPUS in guest!");
- goto out;
- }
-
- for ( i = 1; i <= info.max_vcpu_id; i++ )
- {
- xc_vcpuinfo_t vinfo;
- if ( (xc_vcpu_getinfo(xch, dom, i, &vinfo) == 0) &&
- vinfo.online )
- vcpumap[i/64] |= 1ULL << (i%64);
- }
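-        /*
-         * Illustrative note (not in the original source): each online vcpu
-         * sets one bit in the map, e.g. vcpu 70 sets bit 6 of vcpumap[1]
-         * (70 / 64 == 1, 70 % 64 == 6).
-         */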
-
- memcpy(chunk.vcpumap, vcpumap, vcpumap_sz(info.max_vcpu_id));
- if ( wrexact(io_fd, &chunk, offsetof(struct chunk, vcpumap)
- + vcpumap_sz(info.max_vcpu_id)) )
- {
- PERROR("Error when writing to state file");
- goto out;
- }
- }
-
- if ( hvm )
- {
- struct {
- int id;
- uint32_t pad;
- uint64_t data;
- } chunk = { 0, };
-
- chunk.id = XC_SAVE_ID_HVM_GENERATION_ID_ADDR;
- xc_hvm_param_get(xch, dom, HVM_PARAM_VM_GENERATION_ID_ADDR, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the generation id buffer location for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_IDENT_PT;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_IDENT_PT, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the ident_pt for EPT guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_PAGING_RING_PFN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_PAGING_RING_PFN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the paging ring pfn for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_ACCESS_RING_PFN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_ACCESS_RING_PFN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the access ring pfn for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_SHARING_RING_PFN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_SHARING_RING_PFN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the sharing ring pfn for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_VM86_TSS;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_VM86_TSS, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the vm86 TSS for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_CONSOLE_PFN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_CONSOLE_PFN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the console pfn for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_ACPI_IOPORTS_LOCATION, &chunk.data);
-
- if ((chunk.data != 0) && wrexact(io_fd, &chunk, sizeof(chunk)))
- {
- PERROR("Error when writing the firmware ioport version");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_VIRIDIAN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_VIRIDIAN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the viridian flag");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_IOREQ_SERVER_PFN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_IOREQ_SERVER_PFN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the ioreq server gmfn base");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_NR_IOREQ_SERVER_PAGES, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the ioreq server gmfn count");
- goto out;
- }
- }
-
- if ( callbacks != NULL && callbacks->toolstack_save != NULL )
- {
- int id = XC_SAVE_ID_TOOLSTACK;
- uint8_t *buf;
- uint32_t len;
-
- if ( callbacks->toolstack_save(dom, &buf, &len, callbacks->data) < 0 )
- {
- PERROR("Error calling toolstack_save");
- goto out;
- }
- wrexact(io_fd, &id, sizeof(id));
- wrexact(io_fd, &len, sizeof(len));
- wrexact(io_fd, buf, len);
- free(buf);
- }
-
- if ( !callbacks->checkpoint )
- {
- /*
- * If this is not a checkpointed save then this must be the first and
- * last checkpoint.
- */
- i = XC_SAVE_ID_LAST_CHECKPOINT;
- if ( wrexact(io_fd, &i, sizeof(int)) )
- {
- PERROR("Error when writing last checkpoint chunk");
- goto out;
- }
- }
-
- /* Enable compression logic on both sides by sending this
-     * one-time marker.
-     * NOTE: We could have simplified this procedure by sending
-     * the enable/disable compression flag before the beginning of
-     * the main for loop. But this would break live-migration
-     * compatibility with older versions of Xen. So we have
- * to enable it after the last_iter, when the XC_SAVE_ID_*
- * elements are sent.
- */
- if (!compressing && (flags & XCFLAGS_CHECKPOINT_COMPRESS))
- {
- i = XC_SAVE_ID_ENABLE_COMPRESSION;
- if ( wrexact(io_fd, &i, sizeof(int)) )
- {
- PERROR("Error when writing enable_compression marker");
- goto out;
- }
- }
-
- /* Zero terminate */
- i = 0;
- if ( wrexact(io_fd, &i, sizeof(int)) )
- {
- PERROR("Error when writing to state file (6')");
- goto out;
- }
-
- if ( hvm )
- {
- uint32_t rec_size;
-
- /* Save magic-page locations. */
- memset(magic_pfns, 0, sizeof(magic_pfns));
- xc_hvm_param_get(xch, dom, HVM_PARAM_IOREQ_PFN, &magic_pfns[0]);
- xc_hvm_param_get(xch, dom, HVM_PARAM_BUFIOREQ_PFN, &magic_pfns[1]);
- xc_hvm_param_get(xch, dom, HVM_PARAM_STORE_PFN, &magic_pfns[2]);
- if ( wrexact(io_fd, magic_pfns, sizeof(magic_pfns)) )
- {
- PERROR("Error when writing to state file (7)");
- goto out;
- }
-
- /* Get HVM context from Xen and save it too */
- if ( (rec_size = xc_domain_hvm_getcontext(xch, dom, hvm_buf,
- hvm_buf_size)) == -1 )
- {
- PERROR("HVM:Could not get hvm buffer");
- goto out;
- }
-
- if ( wrexact(io_fd, &rec_size, sizeof(uint32_t)) )
- {
- PERROR("error write hvm buffer size");
- goto out;
- }
-
- if ( wrexact(io_fd, hvm_buf, rec_size) )
- {
- PERROR("write HVM info failed!");
- goto out;
- }
-
- /* HVM guests are done now */
- goto success;
- }
-
- /* PV guests only from now on */
-
- /* Send through a list of all the PFNs that were not in map at the close */
- {
- unsigned int i,j;
- unsigned long pfntab[1024];
-
- for ( i = 0, j = 0; i < dinfo->p2m_size; i++ )
- {
- if ( !is_mapped(pfn_to_mfn(i)) )
- j++;
- }
-
- if ( wrexact(io_fd, &j, sizeof(unsigned int)) )
- {
- PERROR("Error when writing to state file (6a)");
- goto out;
- }
-
- for ( i = 0, j = 0; i < dinfo->p2m_size; )
- {
- if ( !is_mapped(pfn_to_mfn(i)) )
- pfntab[j++] = i;
-
- i++;
- if ( (j == 1024) || (i == dinfo->p2m_size) )
- {
- if ( wrexact(io_fd, &pfntab, sizeof(unsigned long)*j) )
- {
- PERROR("Error when writing to state file (6b)");
- goto out;
- }
- j = 0;
- }
- }
- }
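-    /*
-     * Illustrative summary (not in the original source) of what the block
-     * above emits: one unsigned int count of unmapped PFNs, then the PFNs
-     * themselves as unsigned longs in chunks of at most 1024 entries.
-     */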
-
- if ( xc_vcpu_getcontext(xch, dom, 0, &ctxt) )
- {
- PERROR("Could not get vcpu context");
- goto out;
- }
-
- /*
- * Canonicalise the start info frame number.
- *
- * The start info MFN is the 3rd argument to the
- * HYPERVISOR_sched_op hypercall when op==SCHEDOP_shutdown and
- * reason==SHUTDOWN_suspend and is therefore found in the edx
- * register.
- */
- mfn = GET_FIELD(&ctxt, user_regs.edx, dinfo->guest_width);
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
- {
- errno = ERANGE;
- ERROR("Suspend record is not in range of pseudophys map");
- goto out;
- }
- SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn), dinfo->guest_width);
-
- for ( i = 0; i <= info.max_vcpu_id; i++ )
- {
- if ( !(vcpumap[i/64] & (1ULL << (i%64))) )
- continue;
-
- if ( (i != 0) && xc_vcpu_getcontext(xch, dom, i, &ctxt) )
- {
- PERROR("No context for VCPU%d", i);
- goto out;
- }
-
- /* Canonicalise each GDT frame number. */
- for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents, dinfo->guest_width); j++ )
- {
- mfn = GET_FIELD(&ctxt, gdt_frames[j], dinfo->guest_width);
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
- {
- errno = ERANGE;
- ERROR("GDT frame is not in range of pseudophys map");
- goto out;
- }
- SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn), dinfo->guest_width);
- }
-
- /* Canonicalise the page table base pointer. */
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(
- UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3], dinfo->guest_width))) )
- {
- errno = ERANGE;
- ERROR("PT base is not in range of pseudophys map");
- goto out;
- }
- SET_FIELD(&ctxt, ctrlreg[3],
- FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(
- GET_FIELD(&ctxt, ctrlreg[3], dinfo->guest_width)
- ))), dinfo->guest_width);
-
- /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
- if ( (ctx->pt_levels == 4) && ctxt.x64.ctrlreg[1] )
- {
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(ctxt.x64.ctrlreg[1])) )
- {
- errno = ERANGE;
- ERROR("PT base is not in range of pseudophys map");
- goto out;
- }
- /* Least-significant bit means 'valid PFN'. */
- ctxt.x64.ctrlreg[1] = 1 |
- FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(ctxt.x64.ctrlreg[1])));
- }
-
- if ( wrexact(io_fd, &ctxt, ((dinfo->guest_width==8)
- ? sizeof(ctxt.x64)
- : sizeof(ctxt.x32))) )
- {
- PERROR("Error when writing to state file (1)");
- goto out;
- }
-
- domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
- domctl.domain = dom;
- memset(&domctl.u, 0, sizeof(domctl.u));
- domctl.u.ext_vcpucontext.vcpu = i;
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("No extended context for VCPU%d", i);
- goto out;
- }
- if ( wrexact(io_fd, &domctl.u.ext_vcpucontext, 128) )
- {
- PERROR("Error when writing to state file (2)");
- goto out;
- }
-
- /* Check there are no PV MSRs in use. */
- domctl.cmd = XEN_DOMCTL_get_vcpu_msrs;
- domctl.domain = dom;
- memset(&domctl.u, 0, sizeof(domctl.u));
- domctl.u.vcpu_msrs.vcpu = i;
- domctl.u.vcpu_msrs.msr_count = 0;
- set_xen_guest_handle_raw(domctl.u.vcpu_msrs.msrs, (void*)1);
-
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- if ( errno == ENOBUFS )
- {
- errno = EOPNOTSUPP;
- PERROR("Unable to migrate PV guest using MSRs (yet)");
- }
- else
- PERROR("Error querying maximum number of MSRs for VCPU%d", i);
- goto out;
- }
-
- /* Start to fetch CPU eXtended States */
- /* Get buffer size first */
- domctl.cmd = XEN_DOMCTL_getvcpuextstate;
- domctl.domain = dom;
- domctl.u.vcpuextstate.vcpu = i;
- domctl.u.vcpuextstate.xfeature_mask = 0;
- domctl.u.vcpuextstate.size = 0;
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("No eXtended states (XSAVE) for VCPU%d", i);
- goto out;
- }
-
- if ( !domctl.u.vcpuextstate.xfeature_mask )
- continue;
-
- /* Getting eXtended states data */
- buffer = xc_hypercall_buffer_alloc(xch, buffer, domctl.u.vcpuextstate.size);
- if ( !buffer )
- {
- PERROR("Insufficient memory for getting eXtended states for"
- "VCPU%d", i);
- goto out;
- }
- set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("No eXtended states (XSAVE) for VCPU%d", i);
- xc_hypercall_buffer_free(xch, buffer);
- goto out;
- }
-
- if ( wrexact(io_fd, &domctl.u.vcpuextstate.xfeature_mask,
- sizeof(domctl.u.vcpuextstate.xfeature_mask)) ||
- wrexact(io_fd, &domctl.u.vcpuextstate.size,
- sizeof(domctl.u.vcpuextstate.size)) ||
- wrexact(io_fd, buffer, domctl.u.vcpuextstate.size) )
- {
- PERROR("Error when writing to state file VCPU extended state");
- xc_hypercall_buffer_free(xch, buffer);
- goto out;
- }
- xc_hypercall_buffer_free(xch, buffer);
- }
-
- /*
- * Reset the MFN to be a known-invalid value. See map_frame_list_list().
- */
- memcpy(page, live_shinfo, PAGE_SIZE);
- SET_FIELD(((shared_info_any_t *)page),
- arch.pfn_to_mfn_frame_list_list, 0, dinfo->guest_width);
- if ( wrexact(io_fd, page, PAGE_SIZE) )
- {
- PERROR("Error when writing to state file (1)");
- goto out;
- }
-
- /* Flush last write and check for errors. */
- if ( fsync(io_fd) && errno != EINVAL )
- {
- PERROR("Error when flushing state file");
- goto out;
- }
-
- /* Success! */
- success:
- rc = errno = 0;
- goto out_rc;
-
- out:
- rc = errno;
- assert(rc);
- out_rc:
- completed = 1;
-
- if ( !rc && callbacks->postcopy )
- callbacks->postcopy(callbacks->data);
-
- /* guest has been resumed. Now we can compress data
- * at our own pace.
- */
- if (!rc && compressing)
- {
- ob = &ob_pagebuf;
- if (wrcompressed(io_fd) < 0)
- {
- ERROR("Error when writing compressed data, after postcopy\n");
- goto out;
- }
- /* Append the tailbuf data to the main outbuf */
- if ( wrexact(io_fd, ob_tailbuf.buf, ob_tailbuf.pos) )
- {
- PERROR("Error when copying tailbuf into outbuf");
- goto out;
- }
- }
-
- /* Flush last write and discard cache for file. */
- if ( ob && outbuf_flush(xch, ob, io_fd) < 0 ) {
- PERROR("Error when flushing output buffer");
- if (!rc)
- rc = errno;
- }
-
- discard_file_cache(xch, io_fd, 1 /* flush */);
-
- /* Enable compression now, finally */
- compressing = (flags & XCFLAGS_CHECKPOINT_COMPRESS);
-
- /* checkpoint_cb can spend arbitrarily long in between rounds */
- if (!rc && callbacks->checkpoint &&
- callbacks->checkpoint(callbacks->data) > 0)
- {
- /* reset stats timer */
- print_stats(xch, dom, 0, &time_stats, &shadow_stats, 0);
-
- /* last_iter = 1; */
- if ( suspend_and_state(callbacks->suspend, callbacks->data, xch,
- io_fd, dom, &info) )
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
- DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
- print_stats(xch, dom, 0, &time_stats, &shadow_stats, 1);
-
- if ( xc_shadow_control(xch, dom,
- XEN_DOMCTL_SHADOW_OP_CLEAN, HYPERCALL_BUFFER(to_send),
- dinfo->p2m_size, NULL, 0, &shadow_stats) != dinfo->p2m_size )
- {
- PERROR("Error flushing shadow PT");
- }
-
- goto copypages;
- }
-
- if ( tmem_saved != 0 && live )
- xc_tmem_save_done(xch, dom);
-
- if ( live )
- {
- if ( xc_shadow_control(xch, dom,
- XEN_DOMCTL_SHADOW_OP_OFF,
- NULL, 0, NULL, 0, NULL) < 0 )
- DPRINTF("Warning - couldn't disable shadow mode");
- if ( hvm && callbacks->switch_qemu_logdirty(dom, 0, callbacks->data) )
- DPRINTF("Warning - couldn't disable qemu log-dirty mode");
- }
-
- if (compress_ctx)
- xc_compression_free_context(xch, compress_ctx);
-
- if ( live_shinfo )
- munmap(live_shinfo, PAGE_SIZE);
-
- if ( ctx->live_p2m )
- munmap(ctx->live_p2m, P2M_FL_ENTRIES * PAGE_SIZE);
-
- if ( ctx->live_m2p )
- munmap(ctx->live_m2p, M2P_SIZE(ctx->max_mfn));
-
- xc_hypercall_buffer_free_pages(xch, to_send, NRPAGES(bitmap_size(dinfo->p2m_size)));
- xc_hypercall_buffer_free_pages(xch, to_skip, NRPAGES(bitmap_size(dinfo->p2m_size)));
-
- free(pfn_type);
- free(pfn_batch);
- free(pfn_err);
- free(to_fix);
- free(hvm_buf);
- outbuf_free(&ob_pagebuf);
-
- errno = rc;
-exit:
- DPRINTF("Save exit of domid %u with errno=%d\n", dom, errno);
-
- return !!errno;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
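
For context, the legacy save path removed above funnels every failure through one exit protocol: error paths set errno and jump to out, which latches it into rc; the success path clears both and skips to out_rc; cleanup then runs unconditionally. A minimal stand-alone sketch of that pattern (label names as in the removed code; the I/O call is purely illustrative):

    #include <assert.h>
    #include <errno.h>
    #include <unistd.h>

    static int save_exit_protocol(int io_fd)
    {
        int rc;

        if ( write(io_fd, "x", 1) < 0 )
            goto out;            /* errno already describes the failure */

        rc = errno = 0;          /* success: scrub any stale errno */
        goto out_rc;

     out:
        rc = errno;
        assert(rc);              /* every error path must have set errno */
     out_rc:
        /* unconditional cleanup (munmap, free, ...) would go here */
        errno = rc;
        return !!errno;          /* 0 on success, 1 on any failure */
    }
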
diff --git a/tools/libxc/xc_efi.h b/tools/libxc/xc_efi.h
index 734da98..dbe105b 100644
--- a/tools/libxc/xc_efi.h
+++ b/tools/libxc/xc_efi.h
@@ -13,8 +13,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) 1999 VA Linux Systems
* Copyright (C) 1999 Walt Drummond <drummond at valinux.com>
diff --git a/tools/libxc/xc_elf.h b/tools/libxc/xc_elf.h
index cb616d6..acbc028 100644
--- a/tools/libxc/xc_elf.h
+++ b/tools/libxc/xc_elf.h
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/libelf/elfstructs.h>
diff --git a/tools/libxc/xc_evtchn.c b/tools/libxc/xc_evtchn.c
index 2e0679e..15f0580 100644
--- a/tools/libxc/xc_evtchn.c
+++ b/tools/libxc/xc_evtchn.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2004, K A Fraser.
*/
diff --git a/tools/libxc/xc_flask.c b/tools/libxc/xc_flask.c
index bb117f7..b533656 100644
--- a/tools/libxc/xc_flask.c
+++ b/tools/libxc/xc_flask.c
@@ -12,8 +12,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
@@ -191,6 +190,12 @@ int xc_flask_getbool_byname(xc_interface *xch, char *name, int *curr, int *pend)
DECLARE_FLASK_OP;
DECLARE_HYPERCALL_BOUNCE(name, strlen(name), XC_HYPERCALL_BUFFER_BOUNCE_IN);
+ if ( xc_hypercall_bounce_pre(xch, name) )
+ {
+ PERROR("Could not bounce memory for flask op hypercall");
+ return -1;
+ }
+
op.cmd = FLASK_GETBOOL;
op.u.boolean.bool_id = -1;
op.u.boolean.size = strlen(name);
@@ -217,6 +222,12 @@ int xc_flask_setbool(xc_interface *xch, char *name, int value, int commit)
DECLARE_FLASK_OP;
DECLARE_HYPERCALL_BOUNCE(name, strlen(name), XC_HYPERCALL_BUFFER_BOUNCE_IN);
+ if ( xc_hypercall_bounce_pre(xch, name) )
+ {
+ PERROR("Could not bounce memory for flask op hypercall");
+ return -1;
+ }
+
op.cmd = FLASK_SETBOOL;
op.u.boolean.bool_id = -1;
op.u.boolean.new_value = value;
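
The two hunks above add the previously missing xc_hypercall_bounce_pre() call before the string is handed to Xen. A minimal sketch of libxc's bounce-buffer lifecycle as used here, with the flask op itself elided (macros from xc_private.h):

    static int flask_bounce_sketch(xc_interface *xch, char *name)
    {
        int rc;
        DECLARE_HYPERCALL_BOUNCE(name, strlen(name),
                                 XC_HYPERCALL_BUFFER_BOUNCE_IN);

        /* Copy 'name' into hypercall-safe memory before issuing the op. */
        if ( xc_hypercall_bounce_pre(xch, name) )
        {
            PERROR("Could not bounce memory for flask op hypercall");
            return -1;
        }

        /* ... fill in the flask op and issue the hypercall here ... */
        rc = 0;

        /* Release the bounce buffer; OUT/BOTH directions copy back here. */
        xc_hypercall_bounce_post(xch, name);
        return rc;
    }
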
diff --git a/tools/libxc/xc_foreign_memory.c b/tools/libxc/xc_foreign_memory.c
index 43abf01..f42d140 100644
--- a/tools/libxc/xc_foreign_memory.c
+++ b/tools/libxc/xc_foreign_memory.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_freebsd.c b/tools/libxc/xc_freebsd.c
index 8e70a91..9dd48a3 100644
--- a/tools/libxc/xc_freebsd.c
+++ b/tools/libxc/xc_freebsd.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_freebsd_osdep.c b/tools/libxc/xc_freebsd_osdep.c
index 151d3bf..4d31a1e 100644
--- a/tools/libxc/xc_freebsd_osdep.c
+++ b/tools/libxc/xc_freebsd_osdep.c
@@ -17,8 +17,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <errno.h>
@@ -125,10 +124,13 @@ static void freebsd_privcmd_free_hypercall_buffer(xc_interface *xch,
int npages)
{
+ int saved_errno = errno;
/* Unlock pages */
munlock(ptr, npages * XC_PAGE_SIZE);
munmap(ptr, npages * XC_PAGE_SIZE);
+ /* We MUST propagate the hypercall's errno, not the munmap call's. */
+ errno = saved_errno;
}
static int freebsd_privcmd_hypercall(xc_interface *xch, xc_osdep_handle h,
@@ -157,7 +159,7 @@ static void *freebsd_privcmd_map_foreign_bulk(xc_interface *xch,
addr = mmap(NULL, num << XC_PAGE_SHIFT, prot, MAP_SHARED, fd, 0);
if ( addr == MAP_FAILED )
{
- PERROR("xc_map_foreign_batch: mmap failed");
+ PERROR("xc_map_foreign_bulk: mmap failed");
return NULL;
}
@@ -171,7 +173,7 @@ static void *freebsd_privcmd_map_foreign_bulk(xc_interface *xch,
if ( rc < 0 )
{
int saved_errno = errno;
- PERROR("xc_map_foreign_batch: ioctl failed");
+ PERROR("xc_map_foreign_bulk: ioctl failed");
(void)munmap(addr, num << XC_PAGE_SHIFT);
errno = saved_errno;
return NULL;
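
Both errno-related hunks above serve the same goal: cleanup syscalls run after a failed hypercall must not overwrite the errno the caller is about to inspect. The idiom in isolation:

    #include <errno.h>
    #include <stddef.h>
    #include <sys/mman.h>

    static void free_buffer_preserving_errno(void *ptr, size_t len)
    {
        int saved_errno = errno;    /* remember the hypercall's errno */

        munlock(ptr, len);          /* either call may overwrite errno... */
        munmap(ptr, len);

        errno = saved_errno;        /* ...so restore it for the caller */
    }
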
diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c
index 4076e47..60335d8 100644
--- a/tools/libxc/xc_gnttab.c
+++ b/tools/libxc/xc_gnttab.c
@@ -13,8 +13,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_hcall_buf.c b/tools/libxc/xc_hcall_buf.c
index e762a93..6e3c958 100644
--- a/tools/libxc/xc_hcall_buf.c
+++ b/tools/libxc/xc_hcall_buf.c
@@ -12,8 +12,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdlib.h>
@@ -33,16 +32,22 @@ pthread_mutex_t hypercall_buffer_cache_mutex = PTHREAD_MUTEX_INITIALIZER;
static void hypercall_buffer_cache_lock(xc_interface *xch)
{
+ int saved_errno = errno;
if ( xch->flags & XC_OPENFLAG_NON_REENTRANT )
return;
pthread_mutex_lock(&hypercall_buffer_cache_mutex);
+ /* Ignore pthread errors. */
+ errno = saved_errno;
}
static void hypercall_buffer_cache_unlock(xc_interface *xch)
{
+ int saved_errno = errno;
if ( xch->flags & XC_OPENFLAG_NON_REENTRANT )
return;
pthread_mutex_unlock(&hypercall_buffer_cache_mutex);
+ /* Ignore pthread errors. */
+ errno = saved_errno;
}
static void *hypercall_buffer_cache_alloc(xc_interface *xch, int nr_pages)
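
The same errno discipline is applied to the buffer-cache mutex above: locking must be invisible to callers, and interfaces opened with XC_OPENFLAG_NON_REENTRANT skip the mutex entirely. A condensed sketch of the guard:

    #include <errno.h>
    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t cache_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void cache_lock(bool non_reentrant)
    {
        int saved_errno = errno;

        if ( non_reentrant )          /* the XC_OPENFLAG_NON_REENTRANT case */
            return;

        pthread_mutex_lock(&cache_mutex);
        errno = saved_errno;          /* pthread errors deliberately ignored */
    }
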
diff --git a/tools/libxc/xc_hvm_build_arm.c b/tools/libxc/xc_hvm_build_arm.c
index ff66689..14f7c45 100644
--- a/tools/libxc/xc_hvm_build_arm.c
+++ b/tools/libxc/xc_hvm_build_arm.c
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2011, Citrix Systems
*/
diff --git a/tools/libxc/xc_hvm_build_x86.c b/tools/libxc/xc_hvm_build_x86.c
index c81a25b..ea250dd 100644
--- a/tools/libxc/xc_hvm_build_x86.c
+++ b/tools/libxc/xc_hvm_build_x86.c
@@ -12,8 +12,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stddef.h>
@@ -88,25 +87,18 @@ static int modules_init(struct xc_hvm_build_args *args,
return 0;
}
-static void build_hvm_info(void *hvm_info_page, uint64_t mem_size,
- uint64_t mmio_start, uint64_t mmio_size)
+static void build_hvm_info(void *hvm_info_page,
+ struct xc_hvm_build_args *args)
{
struct hvm_info_table *hvm_info = (struct hvm_info_table *)
(((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
- uint64_t lowmem_end = mem_size, highmem_end = 0;
uint8_t sum;
int i;
- if ( lowmem_end > mmio_start )
- {
- highmem_end = (1ull<<32) + (lowmem_end - mmio_start);
- lowmem_end = mmio_start;
- }
-
memset(hvm_info_page, 0, PAGE_SIZE);
/* Fill in the header. */
- strncpy(hvm_info->signature, "HVM INFO", 8);
+ memcpy(hvm_info->signature, "HVM INFO", sizeof(hvm_info->signature));
hvm_info->length = sizeof(struct hvm_info_table);
/* Sensible defaults: these can be overridden by the caller. */
@@ -115,8 +107,8 @@ static void build_hvm_info(void *hvm_info_page, uint64_t mem_size,
memset(hvm_info->vcpu_online, 0xff, sizeof(hvm_info->vcpu_online));
/* Memory parameters. */
- hvm_info->low_mem_pgend = lowmem_end >> PAGE_SHIFT;
- hvm_info->high_mem_pgend = highmem_end >> PAGE_SHIFT;
+ hvm_info->low_mem_pgend = args->lowmem_end >> PAGE_SHIFT;
+ hvm_info->high_mem_pgend = args->highmem_end >> PAGE_SHIFT;
hvm_info->reserved_mem_pgstart = ioreq_server_pfn(0);
/* Finish with the checksum. */
@@ -244,10 +236,9 @@ static int setup_guest(xc_interface *xch,
char *image, unsigned long image_size)
{
xen_pfn_t *page_array = NULL;
- unsigned long i, nr_pages = args->mem_size >> PAGE_SHIFT;
+ unsigned long i, vmemid, nr_pages = args->mem_size >> PAGE_SHIFT;
+ unsigned long p2m_size;
unsigned long target_pages = args->mem_target >> PAGE_SHIFT;
- uint64_t mmio_start = (1ull << 32) - args->mmio_size;
- uint64_t mmio_size = args->mmio_size;
unsigned long entry_eip, cur_pages, cur_pfn;
void *hvm_info_page;
uint32_t *ident_pt;
@@ -258,17 +249,23 @@ static int setup_guest(xc_interface *xch,
xen_capabilities_info_t caps;
unsigned long stat_normal_pages = 0, stat_2mb_pages = 0,
stat_1gb_pages = 0;
- int pod_mode = 0;
+ unsigned int memflags = 0;
int claim_enabled = args->claim_enabled;
xen_pfn_t special_array[NR_SPECIAL_PAGES];
xen_pfn_t ioreq_server_array[NR_IOREQ_SERVER_PAGES];
-
- if ( nr_pages > target_pages )
- pod_mode = XENMEMF_populate_on_demand;
+ uint64_t total_pages;
+ xen_vmemrange_t dummy_vmemrange[2];
+ unsigned int dummy_vnode_to_pnode[1];
+ xen_vmemrange_t *vmemranges;
+ unsigned int *vnode_to_pnode;
+ unsigned int nr_vmemranges, nr_vnodes;
memset(&elf, 0, sizeof(elf));
if ( elf_init(&elf, image, image_size) != 0 )
+ {
+ PERROR("Could not initialise ELF image");
goto error_out;
+ }
xc_elf_set_logfile(xch, &elf, 1);
@@ -276,6 +273,72 @@ static int setup_guest(xc_interface *xch,
v_start = 0;
v_end = args->mem_size;
+ if ( nr_pages > target_pages )
+ memflags |= XENMEMF_populate_on_demand;
+
+ if ( args->nr_vmemranges == 0 )
+ {
+ /* Build dummy vnode information
+ *
+ * Guest physical address space layout:
+ * [0, hole_start) [hole_start, 4G) [4G, highmem_end)
+ *
+ * Of course if there is no high memory, the second vmemrange
+ * has no effect on the actual result.
+ */
+
+ dummy_vmemrange[0].start = 0;
+ dummy_vmemrange[0].end = args->lowmem_end;
+ dummy_vmemrange[0].flags = 0;
+ dummy_vmemrange[0].nid = 0;
+ nr_vmemranges = 1;
+
+ if ( args->highmem_end > (1ULL << 32) )
+ {
+ dummy_vmemrange[1].start = 1ULL << 32;
+ dummy_vmemrange[1].end = args->highmem_end;
+ dummy_vmemrange[1].flags = 0;
+ dummy_vmemrange[1].nid = 0;
+
+ nr_vmemranges++;
+ }
+
+ dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE;
+ nr_vnodes = 1;
+ vmemranges = dummy_vmemrange;
+ vnode_to_pnode = dummy_vnode_to_pnode;
+ }
+ else
+ {
+ if ( nr_pages > target_pages )
+ {
+ PERROR("Cannot enable vNUMA and PoD at the same time");
+ goto error_out;
+ }
+
+ nr_vmemranges = args->nr_vmemranges;
+ nr_vnodes = args->nr_vnodes;
+ vmemranges = args->vmemranges;
+ vnode_to_pnode = args->vnode_to_pnode;
+ }
+
+ total_pages = 0;
+ p2m_size = 0;
+ for ( i = 0; i < nr_vmemranges; i++ )
+ {
+ total_pages += ((vmemranges[i].end - vmemranges[i].start)
+ >> PAGE_SHIFT);
+ p2m_size = p2m_size > (vmemranges[i].end >> PAGE_SHIFT) ?
+ p2m_size : (vmemranges[i].end >> PAGE_SHIFT);
+ }
+
+ if ( total_pages != (args->mem_size >> PAGE_SHIFT) )
+ {
+ PERROR("vNUMA memory pages mismatch (0x%"PRIx64" != 0x%"PRIx64")",
+ total_pages, args->mem_size >> PAGE_SHIFT);
+ goto error_out;
+ }
+
if ( xc_version(xch, XENVER_capabilities, &caps) != 0 )
{
PERROR("Could not get Xen capabilities");
@@ -294,16 +357,23 @@ static int setup_guest(xc_interface *xch,
DPRINTF(" TOTAL: %016"PRIx64"->%016"PRIx64"\n", v_start, v_end);
DPRINTF(" ENTRY: %016"PRIx64"\n", elf_uval(&elf, elf.ehdr, e_entry));
- if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
+ if ( (page_array = malloc(p2m_size * sizeof(xen_pfn_t))) == NULL )
{
PERROR("Could not allocate memory.");
goto error_out;
}
- for ( i = 0; i < nr_pages; i++ )
- page_array[i] = i;
- for ( i = mmio_start >> PAGE_SHIFT; i < nr_pages; i++ )
- page_array[i] += mmio_size >> PAGE_SHIFT;
+ for ( i = 0; i < p2m_size; i++ )
+ page_array[i] = ((xen_pfn_t)-1);
+ for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
+ {
+ uint64_t pfn;
+
+ for ( pfn = vmemranges[vmemid].start >> PAGE_SHIFT;
+ pfn < vmemranges[vmemid].end >> PAGE_SHIFT;
+ pfn++ )
+ page_array[pfn] = pfn;
+ }
/*
* Try to claim pages for early warning of insufficient memory available.
@@ -320,7 +390,7 @@ static int setup_guest(xc_interface *xch,
}
}
- if ( pod_mode )
+ if ( memflags & XENMEMF_populate_on_demand )
{
/*
* Subtract VGA_HOLE_SIZE from target_pages for the VGA
@@ -349,103 +419,139 @@ static int setup_guest(xc_interface *xch,
* ensure that we can be preempted and hence dom0 remains responsive.
*/
rc = xc_domain_populate_physmap_exact(
- xch, dom, 0xa0, 0, pod_mode, &page_array[0x00]);
- cur_pages = 0xc0;
- stat_normal_pages = 0xc0;
+ xch, dom, 0xa0, 0, memflags, &page_array[0x00]);
- while ( (rc == 0) && (nr_pages > cur_pages) )
+ stat_normal_pages = 0;
+ for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
{
- /* Clip count to maximum 1GB extent. */
- unsigned long count = nr_pages - cur_pages;
- unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS;
+ unsigned int new_memflags = memflags;
+ uint64_t end_pages;
+ unsigned int vnode = vmemranges[vmemid].nid;
+ unsigned int pnode = vnode_to_pnode[vnode];
- if ( count > max_pages )
- count = max_pages;
+ if ( pnode != XC_NUMA_NO_NODE )
+ new_memflags |= XENMEMF_exact_node(pnode);
- cur_pfn = page_array[cur_pages];
-
- /* Take care the corner cases of super page tails */
- if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
- (count > (-cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1))) )
- count = -cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1);
- else if ( ((count & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
- (count > SUPERPAGE_1GB_NR_PFNS) )
- count &= ~(SUPERPAGE_1GB_NR_PFNS - 1);
-
- /* Attemp to allocate 1GB super page. Because in each pass we only
- * allocate at most 1GB, we don't have to clip super page boundaries.
+ end_pages = vmemranges[vmemid].end >> PAGE_SHIFT;
+ /*
+ * Treat the VGA hole as belonging to the vmemrange that covers
+ * 0xA0000-0xC0000. Note that 0x00000-0xA0000 is populated just
+ * before this loop.
*/
- if ( ((count | cur_pfn) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 &&
- /* Check if there exists MMIO hole in the 1GB memory range */
- !check_mmio_hole(cur_pfn << PAGE_SHIFT,
- SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT,
- mmio_start, mmio_size) )
+ if ( vmemranges[vmemid].start == 0 )
{
- long done;
- unsigned long nr_extents = count >> SUPERPAGE_1GB_SHIFT;
- xen_pfn_t sp_extents[nr_extents];
-
- for ( i = 0; i < nr_extents; i++ )
- sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)];
-
- done = xc_domain_populate_physmap(xch, dom, nr_extents, SUPERPAGE_1GB_SHIFT,
- pod_mode, sp_extents);
-
- if ( done > 0 )
- {
- stat_1gb_pages += done;
- done <<= SUPERPAGE_1GB_SHIFT;
- cur_pages += done;
- count -= done;
- }
+ cur_pages = 0xc0;
+ stat_normal_pages += 0xc0;
}
+ else
+ cur_pages = vmemranges[vmemid].start >> PAGE_SHIFT;
- if ( count != 0 )
+ while ( (rc == 0) && (end_pages > cur_pages) )
{
- /* Clip count to maximum 8MB extent. */
- max_pages = SUPERPAGE_2MB_NR_PFNS * 4;
+ /* Clip count to maximum 1GB extent. */
+ unsigned long count = end_pages - cur_pages;
+ unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS;
+
if ( count > max_pages )
count = max_pages;
-
- /* Clip partial superpage extents to superpage boundaries. */
- if ( ((cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
- (count > (-cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1))) )
- count = -cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1);
- else if ( ((count & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
- (count > SUPERPAGE_2MB_NR_PFNS) )
- count &= ~(SUPERPAGE_2MB_NR_PFNS - 1); /* clip non-s.p. tail */
-
- /* Attempt to allocate superpage extents. */
- if ( ((count | cur_pfn) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 )
+
+ cur_pfn = page_array[cur_pages];
+
+ /* Take care of the corner cases of super page tails */
+ if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
+ (count > (-cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1))) )
+ count = -cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1);
+ else if ( ((count & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
+ (count > SUPERPAGE_1GB_NR_PFNS) )
+ count &= ~(SUPERPAGE_1GB_NR_PFNS - 1);
+
+ /* Attempt to allocate 1GB super page. Because in each pass
+ * we only allocate at most 1GB, we don't have to clip
+ * super page boundaries.
+ */
+ if ( ((count | cur_pfn) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 &&
+ /* Check whether an MMIO hole exists in the 1GB memory
+ * range */
+ !check_mmio_hole(cur_pfn << PAGE_SHIFT,
+ SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT,
+ args->mmio_start, args->mmio_size) )
{
long done;
- unsigned long nr_extents = count >> SUPERPAGE_2MB_SHIFT;
+ unsigned long nr_extents = count >> SUPERPAGE_1GB_SHIFT;
xen_pfn_t sp_extents[nr_extents];
for ( i = 0; i < nr_extents; i++ )
- sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)];
+ sp_extents[i] =
+ page_array[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)];
- done = xc_domain_populate_physmap(xch, dom, nr_extents, SUPERPAGE_2MB_SHIFT,
- pod_mode, sp_extents);
+ done = xc_domain_populate_physmap(xch, dom, nr_extents,
+ SUPERPAGE_1GB_SHIFT,
+ new_memflags,
+ sp_extents);
if ( done > 0 )
{
- stat_2mb_pages += done;
- done <<= SUPERPAGE_2MB_SHIFT;
+ stat_1gb_pages += done;
+ done <<= SUPERPAGE_1GB_SHIFT;
cur_pages += done;
count -= done;
}
}
- }
- /* Fall back to 4kB extents. */
- if ( count != 0 )
- {
- rc = xc_domain_populate_physmap_exact(
- xch, dom, count, 0, pod_mode, &page_array[cur_pages]);
- cur_pages += count;
- stat_normal_pages += count;
+ if ( count != 0 )
+ {
+ /* Clip count to maximum 8MB extent. */
+ max_pages = SUPERPAGE_2MB_NR_PFNS * 4;
+ if ( count > max_pages )
+ count = max_pages;
+
+ /* Clip partial superpage extents to superpage
+ * boundaries. */
+ if ( ((cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
+ (count > (-cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1))) )
+ count = -cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1);
+ else if ( ((count & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
+ (count > SUPERPAGE_2MB_NR_PFNS) )
+ count &= ~(SUPERPAGE_2MB_NR_PFNS - 1); /* clip non-s.p. tail */
+
+ /* Attempt to allocate superpage extents. */
+ if ( ((count | cur_pfn) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 )
+ {
+ long done;
+ unsigned long nr_extents = count >> SUPERPAGE_2MB_SHIFT;
+ xen_pfn_t sp_extents[nr_extents];
+
+ for ( i = 0; i < nr_extents; i++ )
+ sp_extents[i] =
+ page_array[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)];
+
+ done = xc_domain_populate_physmap(xch, dom, nr_extents,
+ SUPERPAGE_2MB_SHIFT,
+ new_memflags,
+ sp_extents);
+
+ if ( done > 0 )
+ {
+ stat_2mb_pages += done;
+ done <<= SUPERPAGE_2MB_SHIFT;
+ cur_pages += done;
+ count -= done;
+ }
+ }
+ }
+
+ /* Fall back to 4kB extents. */
+ if ( count != 0 )
+ {
+ rc = xc_domain_populate_physmap_exact(
+ xch, dom, count, 0, new_memflags, &page_array[cur_pages]);
+ cur_pages += count;
+ stat_normal_pages += count;
+ }
}
+
+ if ( rc != 0 )
+ break;
}
if ( rc != 0 )
@@ -460,16 +566,25 @@ static int setup_guest(xc_interface *xch,
DPRINTF(" 1GB PAGES: 0x%016lx\n", stat_1gb_pages);
if ( loadelfimage(xch, &elf, dom, page_array) != 0 )
+ {
+ PERROR("Could not load ELF image");
goto error_out;
+ }
if ( loadmodules(xch, args, m_start, m_end, dom, page_array) != 0 )
- goto error_out;
+ {
+ PERROR("Could not load ACPI modules");
+ goto error_out;
+ }
if ( (hvm_info_page = xc_map_foreign_range(
xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
HVM_INFO_PFN)) == NULL )
+ {
+ PERROR("Could not map hvm info page");
goto error_out;
- build_hvm_info(hvm_info_page, v_end, mmio_start, mmio_size);
+ }
+ build_hvm_info(hvm_info_page, args);
munmap(hvm_info_page, PAGE_SIZE);
/* Allocate and clear special pages. */
@@ -485,7 +600,10 @@ static int setup_guest(xc_interface *xch,
}
if ( xc_clear_domain_pages(xch, dom, special_pfn(0), NR_SPECIAL_PAGES) )
- goto error_out;
+ {
+ PERROR("Could not clear special pages");
+ goto error_out;
+ }
xc_hvm_param_set(xch, dom, HVM_PARAM_STORE_PFN,
special_pfn(SPECIALPAGE_XENSTORE));
@@ -497,7 +615,7 @@ static int setup_guest(xc_interface *xch,
special_pfn(SPECIALPAGE_CONSOLE));
xc_hvm_param_set(xch, dom, HVM_PARAM_PAGING_RING_PFN,
special_pfn(SPECIALPAGE_PAGING));
- xc_hvm_param_set(xch, dom, HVM_PARAM_ACCESS_RING_PFN,
+ xc_hvm_param_set(xch, dom, HVM_PARAM_MONITOR_RING_PFN,
special_pfn(SPECIALPAGE_ACCESS));
xc_hvm_param_set(xch, dom, HVM_PARAM_SHARING_RING_PFN,
special_pfn(SPECIALPAGE_SHARING));
@@ -518,7 +636,10 @@ static int setup_guest(xc_interface *xch,
}
if ( xc_clear_domain_pages(xch, dom, ioreq_server_pfn(0), NR_IOREQ_SERVER_PAGES) )
- goto error_out;
+ {
+ PERROR("Could not clear ioreq page");
+ goto error_out;
+ }
/* Tell the domain where the pages are and how many there are */
xc_hvm_param_set(xch, dom, HVM_PARAM_IOREQ_SERVER_PFN,
@@ -533,7 +654,10 @@ static int setup_guest(xc_interface *xch,
if ( (ident_pt = xc_map_foreign_range(
xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
special_pfn(SPECIALPAGE_IDENT_PT))) == NULL )
+ {
+ PERROR("Could not map special page ident_pt");
goto error_out;
+ }
for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
@@ -548,7 +672,10 @@ static int setup_guest(xc_interface *xch,
char *page0 = xc_map_foreign_range(
xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, 0);
if ( page0 == NULL )
+ {
+ PERROR("Could not map page0");
goto error_out;
+ }
page0[0] = 0xe9;
*(uint32_t *)&page0[1] = entry_eip - 5;
munmap(page0, PAGE_SIZE);
@@ -585,12 +712,6 @@ int xc_hvm_build(xc_interface *xch, uint32_t domid,
if ( args.image_file_name == NULL )
return -1;
- if ( args.mem_target == 0 )
- args.mem_target = args.mem_size;
-
- if ( args.mmio_size == 0 )
- args.mmio_size = HVM_BELOW_4G_MMIO_LENGTH;
-
/* An HVM guest must be initialised with at least 2MB memory. */
if ( args.mem_size < (2ull << 20) || args.mem_target < (2ull << 20) )
return -1;
@@ -634,6 +755,8 @@ int xc_hvm_build_target_mem(xc_interface *xch,
args.mem_size = (uint64_t)memsize << 20;
args.mem_target = (uint64_t)target << 20;
args.image_file_name = image_name;
+ if ( args.mmio_size == 0 )
+ args.mmio_size = HVM_BELOW_4G_MMIO_LENGTH;
return xc_hvm_build(xch, domid, &args);
}
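
The vNUMA plumbing added above rests on two invariants checked in setup_guest(): the vmemranges must add up to exactly args->mem_size, and p2m_size becomes the highest range end in frames rather than the plain page count. A stand-alone sketch of that accounting (field names mirror xen_vmemrange_t; PAGE_SHIFT assumed to be 12 as on x86):

    #include <stdint.h>

    #define PAGE_SHIFT 12

    struct vmemrange { uint64_t start, end; };  /* byte addresses, end exclusive */

    static int check_vmemranges(const struct vmemrange *r, unsigned int nr,
                                uint64_t mem_size, uint64_t *p2m_size)
    {
        uint64_t total_pages = 0, max_end = 0;
        unsigned int i;

        for ( i = 0; i < nr; i++ )
        {
            total_pages += (r[i].end - r[i].start) >> PAGE_SHIFT;
            if ( (r[i].end >> PAGE_SHIFT) > max_end )
                max_end = r[i].end >> PAGE_SHIFT;
        }

        *p2m_size = max_end;                    /* frames, not a byte count */
        return total_pages == (mem_size >> PAGE_SHIFT) ? 0 : -1;
    }
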
diff --git a/tools/libxc/xc_linux.c b/tools/libxc/xc_linux.c
index 6121d80..c67c71c 100644
--- a/tools/libxc/xc_linux.c
+++ b/tools/libxc/xc_linux.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c
index a19e4b6..76c55ff 100644
--- a/tools/libxc/xc_linux_osdep.c
+++ b/tools/libxc/xc_linux_osdep.c
@@ -17,8 +17,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <errno.h>
@@ -122,10 +121,13 @@ out:
static void linux_privcmd_free_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, void *ptr, int npages)
{
+ int saved_errno = errno;
/* Recover the VMA flags. Maybe it's not necessary */
madvise(ptr, npages * XC_PAGE_SIZE, MADV_DOFORK);
munmap(ptr, npages * XC_PAGE_SIZE);
+ /* We MUST propagate the hypercall's errno, not the munmap call's. */
+ errno = saved_errno;
}
static int linux_privcmd_hypercall(xc_interface *xch, xc_osdep_handle h, privcmd_hypercall_t *hypercall)
@@ -316,6 +318,7 @@ static void *linux_privcmd_map_foreign_bulk(xc_interface *xch, xc_osdep_handle h
if ( pfn == MAP_FAILED )
{
PERROR("xc_map_foreign_bulk: mmap of pfn array failed");
+ (void)munmap(addr, (unsigned long)num << XC_PAGE_SHIFT);
return NULL;
}
}
@@ -739,7 +742,7 @@ static int linux_gnttab_munmap(xc_gnttab *xcg, xc_osdep_handle h,
}
/* Next, unmap the memory. */
- if ( (rc = munmap(start_address, count * getpagesize())) )
+ if ( (rc = munmap(start_address, count * XC_PAGE_SIZE)) )
return rc;
/* Finally, unmap the driver slots used to store the grant information. */
diff --git a/tools/libxc/xc_mem_access.c b/tools/libxc/xc_mem_access.c
index 55d0e9f..3634c39 100644
--- a/tools/libxc/xc_mem_access.c
+++ b/tools/libxc/xc_mem_access.c
@@ -17,45 +17,12 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
#include <xen/memory.h>
-void *xc_mem_access_enable(xc_interface *xch, domid_t domain_id, uint32_t *port)
-{
- return xc_mem_event_enable(xch, domain_id, HVM_PARAM_ACCESS_RING_PFN,
- port, 0);
-}
-
-void *xc_mem_access_enable_introspection(xc_interface *xch, domid_t domain_id,
- uint32_t *port)
-{
- return xc_mem_event_enable(xch, domain_id, HVM_PARAM_ACCESS_RING_PFN,
- port, 1);
-}
-
-int xc_mem_access_disable(xc_interface *xch, domid_t domain_id)
-{
- return xc_mem_event_control(xch, domain_id,
- XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE,
- XEN_DOMCTL_MEM_EVENT_OP_ACCESS,
- NULL);
-}
-
-int xc_mem_access_resume(xc_interface *xch, domid_t domain_id)
-{
- xen_mem_access_op_t mao =
- {
- .op = XENMEM_access_op_resume,
- .domid = domain_id
- };
-
- return do_memory_op(xch, XENMEM_access_op, &mao, sizeof(mao));
-}
-
int xc_set_mem_access(xc_interface *xch,
domid_t domain_id,
xenmem_access_t access,
@@ -95,6 +62,30 @@ int xc_get_mem_access(xc_interface *xch,
return rc;
}
+int xc_mem_access_enable_emulate(xc_interface *xch,
+ domid_t domain_id)
+{
+ xen_mem_access_op_t mao =
+ {
+ .op = XENMEM_access_op_enable_emulate,
+ .domid = domain_id,
+ };
+
+ return do_memory_op(xch, XENMEM_access_op, &mao, sizeof(mao));
+}
+
+int xc_mem_access_disable_emulate(xc_interface *xch,
+ domid_t domain_id)
+{
+ xen_mem_access_op_t mao =
+ {
+ .op = XENMEM_access_op_disable_emulate,
+ .domid = domain_id,
+ };
+
+ return do_memory_op(xch, XENMEM_access_op, &mao, sizeof(mao));
+}
+
/*
* Local variables:
* mode: C
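
The replacement functions above all reduce to the same XENMEM_access_op memop shape. A hypothetical caller toggling the new emulation switches might look like this (toggle_emulation is an illustrative helper, not part of the API; error handling follows the libxc convention of a negative return with errno set):

    #include <stdbool.h>

    static int toggle_emulation(xc_interface *xch, domid_t domid, bool on)
    {
        int rc = on ? xc_mem_access_enable_emulate(xch, domid)
                    : xc_mem_access_disable_emulate(xch, domid);

        if ( rc < 0 )
            PERROR("Toggling mem_access emulation for domain %u failed", domid);

        return rc;
    }
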
diff --git a/tools/libxc/xc_mem_paging.c b/tools/libxc/xc_mem_paging.c
index 8aa7d4d..28611f4 100644
--- a/tools/libxc/xc_mem_paging.c
+++ b/tools/libxc/xc_mem_paging.c
@@ -17,12 +17,25 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
+static int xc_mem_paging_memop(xc_interface *xch, domid_t domain_id,
+ unsigned int op, uint64_t gfn, void *buffer)
+{
+ xen_mem_paging_op_t mpo;
+
+ memset(&mpo, 0, sizeof(mpo));
+
+ mpo.op = op;
+ mpo.domain = domain_id;
+ mpo.gfn = gfn;
+ mpo.buffer = (unsigned long) buffer;
+
+ return do_memory_op(xch, XENMEM_paging_op, &mpo, sizeof(mpo));
+}
int xc_mem_paging_enable(xc_interface *xch, domid_t domain_id,
uint32_t *port)
@@ -32,47 +45,52 @@ int xc_mem_paging_enable(xc_interface *xch, domid_t domain_id,
errno = EINVAL;
return -1;
}
-
- return xc_mem_event_control(xch, domain_id,
- XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE,
- XEN_DOMCTL_MEM_EVENT_OP_PAGING,
- port);
+
+ return xc_vm_event_control(xch, domain_id,
+ XEN_VM_EVENT_ENABLE,
+ XEN_DOMCTL_VM_EVENT_OP_PAGING,
+ port);
}
int xc_mem_paging_disable(xc_interface *xch, domid_t domain_id)
{
- return xc_mem_event_control(xch, domain_id,
- XEN_DOMCTL_MEM_EVENT_OP_PAGING_DISABLE,
- XEN_DOMCTL_MEM_EVENT_OP_PAGING,
- NULL);
+ return xc_vm_event_control(xch, domain_id,
+ XEN_VM_EVENT_DISABLE,
+ XEN_DOMCTL_VM_EVENT_OP_PAGING,
+ NULL);
}
-int xc_mem_paging_nominate(xc_interface *xch, domid_t domain_id, unsigned long gfn)
+int xc_mem_paging_resume(xc_interface *xch, domid_t domain_id)
{
- return xc_mem_event_memop(xch, domain_id,
- XENMEM_paging_op_nominate,
- XENMEM_paging_op,
- gfn, NULL);
+ return xc_vm_event_control(xch, domain_id,
+ XEN_VM_EVENT_RESUME,
+ XEN_DOMCTL_VM_EVENT_OP_PAGING,
+ NULL);
}
-int xc_mem_paging_evict(xc_interface *xch, domid_t domain_id, unsigned long gfn)
+int xc_mem_paging_nominate(xc_interface *xch, domid_t domain_id, uint64_t gfn)
{
- return xc_mem_event_memop(xch, domain_id,
- XENMEM_paging_op_evict,
- XENMEM_paging_op,
- gfn, NULL);
+ return xc_mem_paging_memop(xch, domain_id,
+ XENMEM_paging_op_nominate,
+ gfn, NULL);
}
-int xc_mem_paging_prep(xc_interface *xch, domid_t domain_id, unsigned long gfn)
+int xc_mem_paging_evict(xc_interface *xch, domid_t domain_id, uint64_t gfn)
{
- return xc_mem_event_memop(xch, domain_id,
- XENMEM_paging_op_prep,
- XENMEM_paging_op,
- gfn, NULL);
+ return xc_mem_paging_memop(xch, domain_id,
+ XENMEM_paging_op_evict,
+ gfn, NULL);
}
-int xc_mem_paging_load(xc_interface *xch, domid_t domain_id,
- unsigned long gfn, void *buffer)
+int xc_mem_paging_prep(xc_interface *xch, domid_t domain_id, uint64_t gfn)
+{
+ return xc_mem_paging_memop(xch, domain_id,
+ XENMEM_paging_op_prep,
+ gfn, NULL);
+}
+
+int xc_mem_paging_load(xc_interface *xch, domid_t domain_id,
+ uint64_t gfn, void *buffer)
{
int rc, old_errno;
@@ -86,11 +104,10 @@ int xc_mem_paging_load(xc_interface *xch, domid_t domain_id,
if ( mlock(buffer, XC_PAGE_SIZE) )
return -1;
-
- rc = xc_mem_event_memop(xch, domain_id,
- XENMEM_paging_op_prep,
- XENMEM_paging_op,
- gfn, buffer);
+
+ rc = xc_mem_paging_memop(xch, domain_id,
+ XENMEM_paging_op_prep,
+ gfn, buffer);
old_errno = errno;
munlock(buffer, XC_PAGE_SIZE);
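
xc_mem_paging_load() above brackets the memop with mlock()/munlock() because the user buffer must stay resident while Xen reads it, and the unlock must not disturb the memop's errno. A sketch of the bracket in isolation (the op callback stands in for the actual XENMEM_paging_op call):

    #include <errno.h>
    #include <stddef.h>
    #include <sys/mman.h>

    static int with_locked_buffer(void *buffer, size_t size, int (*op)(void *))
    {
        int rc, old_errno;

        if ( mlock(buffer, size) )   /* pin the buffer; may need privilege */
            return -1;

        rc = op(buffer);             /* the actual memop goes here */

        old_errno = errno;           /* munlock must not clobber it */
        munlock(buffer, size);
        errno = old_errno;

        return rc;
    }
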
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index d6a9539..deb0aa4 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -17,8 +17,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
@@ -51,20 +50,20 @@ int xc_memshr_ring_enable(xc_interface *xch,
errno = EINVAL;
return -1;
}
-
- return xc_mem_event_control(xch, domid,
- XEN_DOMCTL_MEM_EVENT_OP_SHARING_ENABLE,
- XEN_DOMCTL_MEM_EVENT_OP_SHARING,
- port);
+
+ return xc_vm_event_control(xch, domid,
+ XEN_VM_EVENT_ENABLE,
+ XEN_DOMCTL_VM_EVENT_OP_SHARING,
+ port);
}
int xc_memshr_ring_disable(xc_interface *xch,
domid_t domid)
{
- return xc_mem_event_control(xch, domid,
- XEN_DOMCTL_MEM_EVENT_OP_SHARING_DISABLE,
- XEN_DOMCTL_MEM_EVENT_OP_SHARING,
- NULL);
+ return xc_vm_event_control(xch, domid,
+ XEN_VM_EVENT_DISABLE,
+ XEN_DOMCTL_VM_EVENT_OP_SHARING,
+ NULL);
}
static int xc_memshr_memop(xc_interface *xch, domid_t domid,
@@ -185,13 +184,10 @@ int xc_memshr_add_to_physmap(xc_interface *xch,
int xc_memshr_domain_resume(xc_interface *xch,
domid_t domid)
{
- xen_mem_sharing_op_t mso;
-
- memset(&mso, 0, sizeof(mso));
-
- mso.op = XENMEM_sharing_op_resume;
-
- return xc_memshr_memop(xch, domid, &mso);
+ return xc_vm_event_control(xch, domid,
+ XEN_VM_EVENT_RESUME,
+ XEN_DOMCTL_VM_EVENT_OP_SHARING,
+ NULL);
}
int xc_memshr_debug_gfn(xc_interface *xch,
diff --git a/tools/libxc/xc_minios.c b/tools/libxc/xc_minios.c
index e703684..4f6498f 100644
--- a/tools/libxc/xc_minios.c
+++ b/tools/libxc/xc_minios.c
@@ -15,8 +15,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#undef NDEBUG
diff --git a/tools/libxc/xc_misc.c b/tools/libxc/xc_misc.c
index e253a58..c613545 100644
--- a/tools/libxc/xc_misc.c
+++ b/tools/libxc/xc_misc.c
@@ -14,10 +14,10 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
+#include "xc_bitops.h"
#include "xc_private.h"
#include <xen/hvm/hvm_op.h>
@@ -93,6 +93,31 @@ xc_cpumap_t xc_cpumap_alloc(xc_interface *xch)
return calloc(1, sz);
}
+/*
+ * xc_bitops.h has macros that do this as well - however they assume that
+ * the bitmask is word aligned but xc_cpumap_t is only guaranteed to be
+ * byte aligned and so we need byte versions for architectures which do
+ * not support misaligned accesses (which is basically everyone
+ * but x86, although even on x86 it can be inefficient).
+ */
+#define BITS_PER_CPUMAP(map) (sizeof(*map) * 8)
+#define CPUMAP_ENTRY(cpu, map) ((map))[(cpu) / BITS_PER_CPUMAP(map)]
+#define CPUMAP_SHIFT(cpu, map) ((cpu) % BITS_PER_CPUMAP(map))
+void xc_cpumap_clearcpu(int cpu, xc_cpumap_t map)
+{
+ CPUMAP_ENTRY(cpu, map) &= ~(1U << CPUMAP_SHIFT(cpu, map));
+}
+
+void xc_cpumap_setcpu(int cpu, xc_cpumap_t map)
+{
+ CPUMAP_ENTRY(cpu, map) |= (1U << CPUMAP_SHIFT(cpu, map));
+}
+
+int xc_cpumap_testcpu(int cpu, xc_cpumap_t map)
+{
+ return (CPUMAP_ENTRY(cpu, map) >> CPUMAP_SHIFT(cpu, map)) & 1;
+}
+
xc_nodemap_t xc_nodemap_alloc(xc_interface *xch)
{
int sz;
@@ -177,42 +202,106 @@ int xc_physinfo(xc_interface *xch,
return 0;
}
-int xc_topologyinfo(xc_interface *xch,
- xc_topologyinfo_t *put_info)
+int xc_cputopoinfo(xc_interface *xch, unsigned *max_cpus,
+ xc_cputopo_t *cputopo)
{
int ret;
DECLARE_SYSCTL;
+ DECLARE_HYPERCALL_BOUNCE(cputopo, *max_cpus * sizeof(*cputopo),
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
- sysctl.cmd = XEN_SYSCTL_topologyinfo;
+ if ( (ret = xc_hypercall_bounce_pre(xch, cputopo)) )
+ goto out;
- memcpy(&sysctl.u.topologyinfo, put_info, sizeof(*put_info));
+ sysctl.u.cputopoinfo.num_cpus = *max_cpus;
+ set_xen_guest_handle(sysctl.u.cputopoinfo.cputopo, cputopo);
+
+ sysctl.cmd = XEN_SYSCTL_cputopoinfo;
if ( (ret = do_sysctl(xch, &sysctl)) != 0 )
- return ret;
+ goto out;
- memcpy(put_info, &sysctl.u.topologyinfo, sizeof(*put_info));
+ *max_cpus = sysctl.u.cputopoinfo.num_cpus;
- return 0;
+out:
+ xc_hypercall_bounce_post(xch, cputopo);
+
+ return ret;
}
-int xc_numainfo(xc_interface *xch,
- xc_numainfo_t *put_info)
+int xc_numainfo(xc_interface *xch, unsigned *max_nodes,
+ xc_meminfo_t *meminfo, uint32_t *distance)
{
int ret;
DECLARE_SYSCTL;
+ DECLARE_HYPERCALL_BOUNCE(meminfo, *max_nodes * sizeof(*meminfo),
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+ DECLARE_HYPERCALL_BOUNCE(distance,
+ *max_nodes * *max_nodes * sizeof(*distance),
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+
+ if ( (ret = xc_hypercall_bounce_pre(xch, meminfo)) )
+ goto out;
+ if ( (ret = xc_hypercall_bounce_pre(xch, distance)) )
+ goto out;
+
+ sysctl.u.numainfo.num_nodes = *max_nodes;
+ set_xen_guest_handle(sysctl.u.numainfo.meminfo, meminfo);
+ set_xen_guest_handle(sysctl.u.numainfo.distance, distance);
sysctl.cmd = XEN_SYSCTL_numainfo;
- memcpy(&sysctl.u.numainfo, put_info, sizeof(*put_info));
+ if ( (ret = do_sysctl(xch, &sysctl)) != 0 )
+ goto out;
- if ((ret = do_sysctl(xch, &sysctl)) != 0)
- return ret;
+ *max_nodes = sysctl.u.numainfo.num_nodes;
- memcpy(put_info, &sysctl.u.numainfo, sizeof(*put_info));
+out:
+ xc_hypercall_bounce_post(xch, meminfo);
+ xc_hypercall_bounce_post(xch, distance);
- return 0;
+ return ret;
}
+int xc_pcitopoinfo(xc_interface *xch, unsigned num_devs,
+ physdev_pci_device_t *devs,
+ uint32_t *nodes)
+{
+ int ret = 0;
+ unsigned processed = 0;
+ DECLARE_SYSCTL;
+ DECLARE_HYPERCALL_BOUNCE(devs, num_devs * sizeof(*devs),
+ XC_HYPERCALL_BUFFER_BOUNCE_IN);
+ DECLARE_HYPERCALL_BOUNCE(nodes, num_devs * sizeof(*nodes),
+ XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+
+ if ( (ret = xc_hypercall_bounce_pre(xch, devs)) )
+ goto out;
+ if ( (ret = xc_hypercall_bounce_pre(xch, nodes)) )
+ goto out;
+
+ sysctl.cmd = XEN_SYSCTL_pcitopoinfo;
+
+ while ( processed < num_devs )
+ {
+ sysctl.u.pcitopoinfo.num_devs = num_devs - processed;
+ set_xen_guest_handle_offset(sysctl.u.pcitopoinfo.devs, devs,
+ processed);
+ set_xen_guest_handle_offset(sysctl.u.pcitopoinfo.nodes, nodes,
+ processed);
+
+ if ( (ret = do_sysctl(xch, &sysctl)) != 0 )
+ break;
+
+ processed += sysctl.u.pcitopoinfo.num_devs;
+ }
+
+ out:
+ xc_hypercall_bounce_post(xch, devs);
+ xc_hypercall_bounce_post(xch, nodes);
+
+ return ret;
+}
int xc_sched_id(xc_interface *xch,
int *sched_id)
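
A short usage sketch for the new byte-aligned cpumap helpers added above; xc_cpumap_alloc() sizes the map from the host CPU count, so the helpers stay within the allocation for valid CPU numbers, and the map is calloc()ed, hence freed with free():

    static void mark_even_cpus(xc_interface *xch, int max_cpu)
    {
        xc_cpumap_t map = xc_cpumap_alloc(xch);
        int cpu;

        if ( !map )
            return;

        for ( cpu = 0; cpu < max_cpu; cpu += 2 )
            xc_cpumap_setcpu(cpu, map);          /* set bit 'cpu' bytewise */

        /* ... hand 'map' to any interface taking an xc_cpumap_t ... */

        free(map);
    }
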
diff --git a/tools/libxc/xc_monitor.c b/tools/libxc/xc_monitor.c
new file mode 100644
index 0000000..065669c
--- /dev/null
+++ b/tools/libxc/xc_monitor.c
@@ -0,0 +1,145 @@
+/******************************************************************************
+ *
+ * xc_monitor.c
+ *
+ * Interface to VM event monitor
+ *
+ * Copyright (c) 2015 Tamas K Lengyel (tamas at tklengyel.com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "xc_private.h"
+
+void *xc_monitor_enable(xc_interface *xch, domid_t domain_id, uint32_t *port)
+{
+ return xc_vm_event_enable(xch, domain_id, HVM_PARAM_MONITOR_RING_PFN,
+ port);
+}
+
+int xc_monitor_disable(xc_interface *xch, domid_t domain_id)
+{
+ return xc_vm_event_control(xch, domain_id,
+ XEN_VM_EVENT_DISABLE,
+ XEN_DOMCTL_VM_EVENT_OP_MONITOR,
+ NULL);
+}
+
+int xc_monitor_resume(xc_interface *xch, domid_t domain_id)
+{
+ return xc_vm_event_control(xch, domain_id,
+ XEN_VM_EVENT_RESUME,
+ XEN_DOMCTL_VM_EVENT_OP_MONITOR,
+ NULL);
+}
+
+int xc_monitor_get_capabilities(xc_interface *xch, domid_t domain_id,
+ uint32_t *capabilities)
+{
+ int rc;
+ DECLARE_DOMCTL;
+
+ if ( !capabilities )
+ {
+ errno = EINVAL;
+ return -1;
+ }
+
+ domctl.cmd = XEN_DOMCTL_monitor_op;
+ domctl.domain = domain_id;
+ domctl.u.monitor_op.op = XEN_DOMCTL_MONITOR_OP_GET_CAPABILITIES;
+
+ rc = do_domctl(xch, &domctl);
+ if ( rc )
+ return rc;
+
+ *capabilities = domctl.u.monitor_op.event;
+ return 0;
+}
+
+int xc_monitor_write_ctrlreg(xc_interface *xch, domid_t domain_id,
+ uint16_t index, bool enable, bool sync,
+ bool onchangeonly)
+{
+ DECLARE_DOMCTL;
+
+ domctl.cmd = XEN_DOMCTL_monitor_op;
+ domctl.domain = domain_id;
+ domctl.u.monitor_op.op = enable ? XEN_DOMCTL_MONITOR_OP_ENABLE
+ : XEN_DOMCTL_MONITOR_OP_DISABLE;
+ domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_WRITE_CTRLREG;
+ domctl.u.monitor_op.u.mov_to_cr.index = index;
+ domctl.u.monitor_op.u.mov_to_cr.sync = sync;
+ domctl.u.monitor_op.u.mov_to_cr.onchangeonly = onchangeonly;
+
+ return do_domctl(xch, &domctl);
+}
+
+int xc_monitor_mov_to_msr(xc_interface *xch, domid_t domain_id, bool enable,
+ bool extended_capture)
+{
+ DECLARE_DOMCTL;
+
+ domctl.cmd = XEN_DOMCTL_monitor_op;
+ domctl.domain = domain_id;
+ domctl.u.monitor_op.op = enable ? XEN_DOMCTL_MONITOR_OP_ENABLE
+ : XEN_DOMCTL_MONITOR_OP_DISABLE;
+ domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_MOV_TO_MSR;
+ domctl.u.monitor_op.u.mov_to_msr.extended_capture = extended_capture;
+
+ return do_domctl(xch, &domctl);
+}
+
+int xc_monitor_software_breakpoint(xc_interface *xch, domid_t domain_id,
+ bool enable)
+{
+ DECLARE_DOMCTL;
+
+ domctl.cmd = XEN_DOMCTL_monitor_op;
+ domctl.domain = domain_id;
+ domctl.u.monitor_op.op = enable ? XEN_DOMCTL_MONITOR_OP_ENABLE
+ : XEN_DOMCTL_MONITOR_OP_DISABLE;
+ domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_SOFTWARE_BREAKPOINT;
+
+ return do_domctl(xch, &domctl);
+}
+
+int xc_monitor_singlestep(xc_interface *xch, domid_t domain_id,
+ bool enable)
+{
+ DECLARE_DOMCTL;
+
+ domctl.cmd = XEN_DOMCTL_monitor_op;
+ domctl.domain = domain_id;
+ domctl.u.monitor_op.op = enable ? XEN_DOMCTL_MONITOR_OP_ENABLE
+ : XEN_DOMCTL_MONITOR_OP_DISABLE;
+ domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP;
+
+ return do_domctl(xch, &domctl);
+}
+
+int xc_monitor_guest_request(xc_interface *xch, domid_t domain_id, bool enable,
+ bool sync)
+{
+ DECLARE_DOMCTL;
+
+ domctl.cmd = XEN_DOMCTL_monitor_op;
+ domctl.domain = domain_id;
+ domctl.u.monitor_op.op = enable ? XEN_DOMCTL_MONITOR_OP_ENABLE
+ : XEN_DOMCTL_MONITOR_OP_DISABLE;
+ domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_GUEST_REQUEST;
+ domctl.u.monitor_op.u.guest_request.sync = sync;
+
+ return do_domctl(xch, &domctl);
+}
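
A hypothetical consumer of the new monitor interface: enable the ring, then subscribe to control-register writes. Mapping the ring page and binding the event channel are elided; 'index' selects the register per the VM_EVENT_X86_* constants in the public headers, and sync/onchangeonly pass straight through to the domctl shown above:

    #include <stdbool.h>

    static int watch_ctrlreg(xc_interface *xch, domid_t domid, uint16_t index)
    {
        uint32_t port;
        void *ring = xc_monitor_enable(xch, domid, &port);  /* maps ring page */

        if ( !ring )
            return -1;

        /* sync=true pauses the vCPU until the event is acknowledged;
         * onchangeonly=true suppresses rewrites of the same value. */
        return xc_monitor_write_ctrlreg(xch, domid, index,
                                        true /* enable */,
                                        true /* sync */,
                                        true /* onchangeonly */);
    }
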
diff --git a/tools/libxc/xc_msr_x86.h b/tools/libxc/xc_msr_x86.h
index 7c3e1a3..7f100e7 100644
--- a/tools/libxc/xc_msr_x86.h
+++ b/tools/libxc/xc_msr_x86.h
@@ -20,6 +20,7 @@
#ifndef XC_MSR_X86_H
#define XC_MSR_X86_H
+#define MSR_IA32_TSC 0x00000010
#define MSR_IA32_CMT_EVTSEL 0x00000c8d
#define MSR_IA32_CMT_CTR 0x00000c8e
diff --git a/tools/libxc/xc_netbsd.c b/tools/libxc/xc_netbsd.c
index 8a90ef3..54f1d7b 100644
--- a/tools/libxc/xc_netbsd.c
+++ b/tools/libxc/xc_netbsd.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_nomigrate.c b/tools/libxc/xc_nomigrate.c
index 76978a0..902429e 100644
--- a/tools/libxc/xc_nomigrate.c
+++ b/tools/libxc/xc_nomigrate.c
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2011, Citrix Systems
*/
diff --git a/tools/libxc/xc_offline_page.c b/tools/libxc/xc_offline_page.c
index 3147203..7bb522f 100644
--- a/tools/libxc/xc_offline_page.c
+++ b/tools/libxc/xc_offline_page.c
@@ -17,8 +17,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <inttypes.h>
@@ -58,12 +57,14 @@ int xc_mark_page_online(xc_interface *xch, unsigned long start,
int ret = -1;
if ( !status || (end < start) )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
if ( xc_hypercall_bounce_pre(xch, status) )
{
ERROR("Could not bounce memory for xc_mark_page_online\n");
- return -EINVAL;
+ return -1;
}
sysctl.cmd = XEN_SYSCTL_page_offline_op;
@@ -86,12 +87,14 @@ int xc_mark_page_offline(xc_interface *xch, unsigned long start,
int ret = -1;
if ( !status || (end < start) )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
if ( xc_hypercall_bounce_pre(xch, status) )
{
ERROR("Could not bounce memory for xc_mark_page_offline");
- return -EINVAL;
+ return -1;
}
sysctl.cmd = XEN_SYSCTL_page_offline_op;
@@ -114,12 +117,14 @@ int xc_query_page_offline_status(xc_interface *xch, unsigned long start,
int ret = -1;
if ( !status || (end < start) )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
if ( xc_hypercall_bounce_pre(xch, status) )
{
ERROR("Could not bounce memory for xc_query_page_offline_status\n");
- return -EINVAL;
+ return -1;
}
sysctl.cmd = XEN_SYSCTL_page_offline_op;
@@ -390,6 +395,65 @@ static int is_page_exchangable(xc_interface *xch, int domid, xen_pfn_t mfn,
return 1;
}
+xen_pfn_t *xc_map_m2p(xc_interface *xch,
+ unsigned long max_mfn,
+ int prot,
+ unsigned long *mfn0)
+{
+ privcmd_mmap_entry_t *entries;
+ unsigned long m2p_chunks, m2p_size;
+ xen_pfn_t *m2p;
+ xen_pfn_t *extent_start;
+ int i;
+
+ m2p = NULL;
+ m2p_size = M2P_SIZE(max_mfn);
+ m2p_chunks = M2P_CHUNKS(max_mfn);
+
+ extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t));
+ if ( !extent_start )
+ {
+ ERROR("failed to allocate space for m2p mfns");
+ goto err0;
+ }
+
+ if ( xc_machphys_mfn_list(xch, m2p_chunks, extent_start) )
+ {
+ PERROR("xc_get_m2p_mfns");
+ goto err1;
+ }
+
+ entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t));
+ if (entries == NULL)
+ {
+ ERROR("failed to allocate space for mmap entries");
+ goto err1;
+ }
+
+ for ( i = 0; i < m2p_chunks; i++ )
+ entries[i].mfn = extent_start[i];
+
+ m2p = xc_map_foreign_ranges(xch, DOMID_XEN,
+ m2p_size, prot, M2P_CHUNK_SIZE,
+ entries, m2p_chunks);
+ if (m2p == NULL)
+ {
+ PERROR("xc_mmap_foreign_ranges failed");
+ goto err2;
+ }
+
+ if (mfn0)
+ *mfn0 = entries[0].mfn;
+
+err2:
+ free(entries);
+err1:
+ free(extent_start);
+
+err0:
+ return m2p;
+}
+
/* The domain should be suspended when called here */
int xc_exchange_page(xc_interface *xch, int domid, xen_pfn_t mfn)
{
@@ -406,32 +470,32 @@ int xc_exchange_page(xc_interface *xch, int domid, xen_pfn_t mfn)
uint32_t status;
xen_pfn_t new_mfn, gpfn;
xen_pfn_t *m2p_table;
- int max_mfn;
+ unsigned long max_mfn;
if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 )
{
ERROR("Could not get domain info");
- return -EFAULT;
+ return -1;
}
if (!info.shutdown || info.shutdown_reason != SHUTDOWN_suspend)
{
+ errno = EINVAL;
ERROR("Can't exchange page unless domain is suspended\n");
- return -EINVAL;
+ return -1;
}
-
if (!is_page_exchangable(xch, domid, mfn, &info))
{
ERROR("Could not exchange page\n");
- return -EINVAL;
+ return -1;
}
/* Map M2P and obtain gpfn */
- max_mfn = xc_maximum_ram_page(xch);
- if ( !(m2p_table = xc_map_m2p(xch, max_mfn, PROT_READ, NULL)) )
+ rc = xc_maximum_ram_page(xch, &max_mfn);
+ if ( rc || !(m2p_table = xc_map_m2p(xch, max_mfn, PROT_READ, NULL)) )
{
PERROR("Failed to map live M2P table");
- return -EFAULT;
+ return -1;
}
gpfn = m2p_table[mfn];
@@ -440,7 +504,7 @@ int xc_exchange_page(xc_interface *xch, int domid, xen_pfn_t mfn)
if ( xc_map_domain_meminfo(xch, domid, &minfo) )
{
PERROR("Could not map domain's memory information\n");
- return -EFAULT;
+ return -1;
}
/* For translation macros */
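
The MFN-to-GPFN translation that xc_exchange_page() performs above is simply an index into the mapped M2P table. A condensed sketch using the xc_map_m2p() helper introduced in this hunk (M2P_SIZE as used elsewhere in this file):

    #include <sys/mman.h>

    static int mfn_to_gpfn(xc_interface *xch, xen_pfn_t mfn, xen_pfn_t *gpfn)
    {
        unsigned long max_mfn;
        xen_pfn_t *m2p;

        if ( xc_maximum_ram_page(xch, &max_mfn) )
            return -1;

        m2p = xc_map_m2p(xch, max_mfn, PROT_READ, NULL);
        if ( !m2p )
            return -1;

        *gpfn = m2p[mfn];                   /* caller guarantees mfn < max_mfn */

        munmap(m2p, M2P_SIZE(max_mfn));
        return 0;
    }
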
diff --git a/tools/libxc/xc_pagetab.c b/tools/libxc/xc_pagetab.c
index 8525967..ec97890 100644
--- a/tools/libxc/xc_pagetab.c
+++ b/tools/libxc/xc_pagetab.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_physdev.c b/tools/libxc/xc_physdev.c
index cf02d85..23108d6 100644
--- a/tools/libxc/xc_physdev.c
+++ b/tools/libxc/xc_physdev.c
@@ -17,8 +17,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
@@ -43,8 +42,10 @@ int xc_physdev_map_pirq(xc_interface *xch,
struct physdev_map_pirq map;
if ( !pirq )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
memset(&map, 0, sizeof(struct physdev_map_pirq));
map.domid = domid;
map.type = MAP_PIRQ_TYPE_GSI;
@@ -72,8 +73,10 @@ int xc_physdev_map_pirq_msi(xc_interface *xch,
struct physdev_map_pirq map;
if ( !pirq )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
memset(&map, 0, sizeof(struct physdev_map_pirq));
map.domid = domid;
map.type = MAP_PIRQ_TYPE_MSI;
diff --git a/tools/libxc/xc_pm.c b/tools/libxc/xc_pm.c
index e4e0fb9..5b38cf1 100644
--- a/tools/libxc/xc_pm.c
+++ b/tools/libxc/xc_pm.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
*/
@@ -51,8 +50,10 @@ int xc_pm_get_pxstat(xc_interface *xch, int cpuid, struct xc_px_stat *pxpt)
int max_px, ret;
if ( !pxpt->trans_pt || !pxpt->pt )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
if ( (ret = xc_pm_get_max_px(xch, cpuid, &max_px)) != 0)
return ret;
@@ -219,8 +220,10 @@ int xc_get_cpufreq_para(xc_interface *xch, int cpuid,
if ( (!user_para->affected_cpus) ||
(!user_para->scaling_available_frequencies) ||
(!user_para->scaling_available_governors) )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
if ( xc_hypercall_bounce_pre(xch, affected_cpus) )
goto unlock_1;
if ( xc_hypercall_bounce_pre(xch, scaling_available_frequencies) )
@@ -293,8 +296,10 @@ int xc_set_cpufreq_gov(xc_interface *xch, int cpuid, char *govname)
char *scaling_governor = sysctl.u.pm_op.u.set_gov.scaling_governor;
if ( !xch || !govname )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
sysctl.cmd = XEN_SYSCTL_pm_op;
sysctl.u.pm_op.cmd = SET_CPUFREQ_GOV;
sysctl.u.pm_op.cpuid = cpuid;
@@ -310,8 +315,10 @@ int xc_set_cpufreq_para(xc_interface *xch, int cpuid,
DECLARE_SYSCTL;
if ( !xch )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
sysctl.cmd = XEN_SYSCTL_pm_op;
sysctl.u.pm_op.cmd = SET_CPUFREQ_PARA;
sysctl.u.pm_op.cpuid = cpuid;
@@ -327,8 +334,10 @@ int xc_get_cpufreq_avgfreq(xc_interface *xch, int cpuid, int *avg_freq)
DECLARE_SYSCTL;
if ( !xch || !avg_freq )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
sysctl.cmd = XEN_SYSCTL_pm_op;
sysctl.u.pm_op.cmd = GET_CPUFREQ_AVGFREQ;
sysctl.u.pm_op.cpuid = cpuid;
@@ -392,8 +401,10 @@ int xc_get_cpuidle_max_cstate(xc_interface *xch, uint32_t *value)
DECLARE_SYSCTL;
if ( !xch || !value )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
sysctl.cmd = XEN_SYSCTL_pm_op;
sysctl.u.pm_op.cmd = XEN_SYSCTL_pm_op_get_max_cstate;
sysctl.u.pm_op.cpuid = 0;
@@ -409,8 +420,10 @@ int xc_set_cpuidle_max_cstate(xc_interface *xch, uint32_t value)
DECLARE_SYSCTL;
if ( !xch )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
sysctl.cmd = XEN_SYSCTL_pm_op;
sysctl.u.pm_op.cmd = XEN_SYSCTL_pm_op_set_max_cstate;
sysctl.u.pm_op.cpuid = 0;
@@ -424,8 +437,10 @@ int xc_enable_turbo(xc_interface *xch, int cpuid)
DECLARE_SYSCTL;
if ( !xch )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
sysctl.cmd = XEN_SYSCTL_pm_op;
sysctl.u.pm_op.cmd = XEN_SYSCTL_pm_op_enable_turbo;
sysctl.u.pm_op.cpuid = cpuid;
@@ -437,8 +452,10 @@ int xc_disable_turbo(xc_interface *xch, int cpuid)
DECLARE_SYSCTL;
if ( !xch )
- return -EINVAL;
-
+ {
+ errno = EINVAL;
+ return -1;
+ }
sysctl.cmd = XEN_SYSCTL_pm_op;
sysctl.u.pm_op.cmd = XEN_SYSCTL_pm_op_disable_turbo;
sysctl.u.pm_op.cpuid = cpuid;
diff --git a/tools/libxc/xc_private.c b/tools/libxc/xc_private.c
index df6cd9b..7c39897 100644
--- a/tools/libxc/xc_private.c
+++ b/tools/libxc/xc_private.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
@@ -388,18 +387,26 @@ void xc_osdep_log(xc_interface *xch, xentoollog_level level, int code, const cha
va_end(args);
}
-void xc_report_progress_start(xc_interface *xch, const char *doing,
- unsigned long total) {
+const char *xc_set_progress_prefix(xc_interface *xch, const char *doing)
+{
+ const char *old = xch->currently_progress_reporting;
+
xch->currently_progress_reporting = doing;
- xtl_progress(xch->error_handler, "xc", xch->currently_progress_reporting,
- 0, total);
+ return old;
+}
+
+void xc_report_progress_single(xc_interface *xch, const char *doing)
+{
+ assert(doing);
+ xtl_progress(xch->error_handler, "xc", doing, 0, 0);
}
void xc_report_progress_step(xc_interface *xch,
- unsigned long done, unsigned long total) {
+ unsigned long done, unsigned long total)
+{
assert(xch->currently_progress_reporting);
- xtl_progress(xch->error_handler, "xc", xch->currently_progress_reporting,
- done, total);
+ xtl_progress(xch->error_handler, "xc",
+ xch->currently_progress_reporting, done, total);
}
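
A hypothetical usage sketch of the reworked progress API (declarations appear in xc_private.h below): xc_set_progress_prefix() returns the previous prefix, so nested operations can restore it when they finish.

    static void demo_progress(xc_interface *xch,
                              unsigned long done, unsigned long total)
    {
        /* Save the old prefix so it can be restored for the outer caller. */
        const char *old = xc_set_progress_prefix(xch, "Memory pages");

        xc_report_progress_step(xch, done, total);

        xc_set_progress_prefix(xch, old);
    }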
int xc_get_pfn_type_batch(xc_interface *xch, uint32_t dom,
@@ -427,7 +434,7 @@ int xc_mmuext_op(
{
DECLARE_HYPERCALL;
DECLARE_HYPERCALL_BOUNCE(op, nr_ops*sizeof(*op), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
- long ret = -EINVAL;
+ long ret = -1;
if ( xc_hypercall_bounce_pre(xch, op) )
{
@@ -516,7 +523,7 @@ int do_memory_op(xc_interface *xch, int cmd, void *arg, size_t len)
{
DECLARE_HYPERCALL;
DECLARE_HYPERCALL_BOUNCE(arg, len, XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
- long ret = -EINVAL;
+ long ret = -1;
if ( xc_hypercall_bounce_pre(xch, arg) )
{
@@ -535,9 +542,16 @@ int do_memory_op(xc_interface *xch, int cmd, void *arg, size_t len)
return ret;
}
-long xc_maximum_ram_page(xc_interface *xch)
+int xc_maximum_ram_page(xc_interface *xch, unsigned long *max_mfn)
{
- return do_memory_op(xch, XENMEM_maximum_ram_page, NULL, 0);
+ long rc = do_memory_op(xch, XENMEM_maximum_ram_page, NULL, 0);
+
+ if ( rc >= 0 )
+ {
+ *max_mfn = rc;
+ rc = 0;
+ }
+ return rc;
}
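
A hypothetical caller sketch of the reworked xc_maximum_ram_page(), which now reports the MFN via an out-parameter and returns 0 or -1 (matching its use in x86_pv_map_m2p() later in this patch):

    static void demo_max_mfn(xc_interface *xch)
    {
        unsigned long max_mfn;

        if ( xc_maximum_ram_page(xch, &max_mfn) < 0 )
            PERROR("Failed to get maximum ram page");
        else
            DPRINTF("max_mfn %#lx", max_mfn);
    }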
long long xc_domain_get_cpu_usage( xc_interface *xch, domid_t domid, int vcpu )
@@ -860,6 +874,91 @@ int write_exact(int fd, const void *data, size_t size)
return 0;
}
+#if defined(__MINIOS__)
+/*
+ * MiniOS's libc doesn't know about writev(). Implement it as multiple write()s.
+ */
+int writev_exact(int fd, const struct iovec *iov, int iovcnt)
+{
+ int rc, i;
+
+ for ( i = 0; i < iovcnt; ++i )
+ {
+ rc = write_exact(fd, iov[i].iov_base, iov[i].iov_len);
+ if ( rc )
+ return rc;
+ }
+
+ return 0;
+}
+#else
+int writev_exact(int fd, const struct iovec *iov, int iovcnt)
+{
+ struct iovec *local_iov = NULL;
+ int rc = 0, iov_idx = 0, saved_errno = 0;
+ ssize_t len;
+
+ while ( iov_idx < iovcnt )
+ {
+ /*
+ * Skip over iov[] entries with 0 length.
+ *
+ * This is needed to cover the case where we took a partial write and
+ * all remaining vectors are of 0 length. In such a case, the results
+ * from writev() are indistinguishable from EOF.
+ */
+ while ( iov[iov_idx].iov_len == 0 )
+ if ( ++iov_idx == iovcnt )
+ goto out;
+
+ len = writev(fd, &iov[iov_idx], min(iovcnt - iov_idx, IOV_MAX));
+ saved_errno = errno;
+
+ if ( (len == -1) && (errno == EINTR) )
+ continue;
+ if ( len <= 0 )
+ {
+ rc = -1;
+ goto out;
+ }
+
+ /* Check iov[] to see whether we had a partial or complete write. */
+ while ( (len > 0) && (iov_idx < iovcnt) )
+ {
+ if ( len >= iov[iov_idx].iov_len )
+ len -= iov[iov_idx++].iov_len;
+ else
+ {
+ /* Partial write of iov[iov_idx]. Copy iov so we can adjust
+ * element iov_idx and resubmit the rest. */
+ if ( !local_iov )
+ {
+ local_iov = malloc(iovcnt * sizeof(*iov));
+ if ( !local_iov )
+ {
+ saved_errno = ENOMEM;
+ goto out;
+ }
+
+ iov = memcpy(local_iov, iov, iovcnt * sizeof(*iov));
+ }
+
+ local_iov[iov_idx].iov_base += len;
+ local_iov[iov_idx].iov_len -= len;
+ break;
+ }
+ }
+ }
+
+ saved_errno = 0;
+
+ out:
+ free(local_iov);
+ errno = saved_errno;
+ return rc;
+}
+#endif
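
A minimal, hypothetical usage sketch of writev_exact(): gather a fixed header and a payload into a single stream write, relying on the function to retry EINTR and resubmit partial writes internally. The record type value is illustrative only.

    #include <stdint.h>
    #include <sys/uio.h>

    static int demo_send_blob(int fd, const void *data, uint32_t len)
    {
        struct { uint32_t type, len; } hdr = { 0x1234, len };
        struct iovec iov[] = {
            { &hdr, sizeof(hdr) },
            { (void *)data, len }, /* Zero-length entries are skipped. */
        };

        /* Returns 0, or -1 with errno set; EINTR is retried internally. */
        return writev_exact(fd, iov, 2);
    }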
+
int xc_ffs8(uint8_t x)
{
int i;
diff --git a/tools/libxc/xc_private.h b/tools/libxc/xc_private.h
index 45b8644..2df1d59 100644
--- a/tools/libxc/xc_private.h
+++ b/tools/libxc/xc_private.h
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef XC_PRIVATE_H
@@ -42,6 +41,19 @@
#define VALGRIND_MAKE_MEM_UNDEFINED(addr, len) /* addr, len */
#endif
+#if defined(__MINIOS__)
+/*
+ * MiniOS's libc doesn't know about sys/uio.h or writev().
+ * Declare enough of sys/uio.h to compile.
+ */
+struct iovec {
+ void *iov_base;
+ size_t iov_len;
+};
+#else
+#include <sys/uio.h>
+#endif
+
#define DECLARE_HYPERCALL privcmd_hypercall_t hypercall
#define DECLARE_DOMCTL struct xen_domctl domctl
#define DECLARE_SYSCTL struct xen_sysctl sysctl
@@ -120,8 +132,8 @@ void xc_report(xc_interface *xch, xentoollog_logger *lg, xentoollog_level,
int code, const char *fmt, ...)
__attribute__((format(printf,5,6)));
-void xc_report_progress_start(xc_interface *xch, const char *doing,
- unsigned long total);
+const char *xc_set_progress_prefix(xc_interface *xch, const char *doing);
+void xc_report_progress_single(xc_interface *xch, const char *doing);
void xc_report_progress_step(xc_interface *xch,
unsigned long done, unsigned long total);
@@ -395,6 +407,7 @@ int xc_flush_mmu_updates(xc_interface *xch, struct xc_mmu *mmu);
/* Return 0 on success; -1 on error setting errno. */
int read_exact(int fd, void *data, size_t size); /* EOF => -1, errno=0 */
int write_exact(int fd, const void *data, size_t size);
+int writev_exact(int fd, const struct iovec *iov, int iovcnt);
int xc_ffs8(uint8_t x);
int xc_ffs16(uint16_t x);
@@ -421,18 +434,15 @@ int xc_ffs64(uint64_t x);
#define DOMPRINTF_CALLED(xch) xc_dom_printf((xch), "%s: called", __FUNCTION__)
/**
- * mem_event operations. Internal use only.
+ * vm_event operations. Internal use only.
*/
-int xc_mem_event_control(xc_interface *xch, domid_t domain_id, unsigned int op,
- unsigned int mode, uint32_t *port);
-int xc_mem_event_memop(xc_interface *xch, domid_t domain_id,
- unsigned int op, unsigned int mode,
- uint64_t gfn, void *buffer);
+int xc_vm_event_control(xc_interface *xch, domid_t domain_id, unsigned int op,
+ unsigned int mode, uint32_t *port);
/*
- * Enables mem_event and returns the mapped ring page indicated by param.
+ * Enables vm_event and returns the mapped ring page indicated by param.
* param can be HVM_PARAM_PAGING/ACCESS/SHARING_RING_PFN
*/
-void *xc_mem_event_enable(xc_interface *xch, domid_t domain_id, int param,
- uint32_t *port, int enable_introspection);
+void *xc_vm_event_enable(xc_interface *xch, domid_t domain_id, int param,
+ uint32_t *port);
#endif /* __XC_PRIVATE_H__ */
diff --git a/tools/libxc/xc_psr.c b/tools/libxc/xc_psr.c
index 872e6dc..d8b3a51 100644
--- a/tools/libxc/xc_psr.c
+++ b/tools/libxc/xc_psr.c
@@ -17,12 +17,15 @@
* GNU Lesser General Public License for more details.
*/
+#include <assert.h>
#include "xc_private.h"
#include "xc_msr_x86.h"
#define IA32_CMT_CTR_ERROR_MASK (0x3ull << 62)
#define EVTID_L3_OCCUPANCY 0x1
+#define EVTID_TOTAL_MEM_COUNT 0x2
+#define EVTID_LOCAL_MEM_COUNT 0x3
int xc_psr_cmt_attach(xc_interface *xch, uint32_t domid)
{
@@ -47,7 +50,7 @@ int xc_psr_cmt_detach(xc_interface *xch, uint32_t domid)
}
int xc_psr_cmt_get_domain_rmid(xc_interface *xch, uint32_t domid,
- uint32_t *rmid)
+ uint32_t *rmid)
{
int rc;
DECLARE_DOMCTL;
@@ -88,7 +91,7 @@ int xc_psr_cmt_get_total_rmid(xc_interface *xch, uint32_t *total_rmid)
}
int xc_psr_cmt_get_l3_upscaling_factor(xc_interface *xch,
- uint32_t *upscaling_factor)
+ uint32_t *upscaling_factor)
{
static int val = 0;
int rc;
@@ -112,8 +115,25 @@ int xc_psr_cmt_get_l3_upscaling_factor(xc_interface *xch,
return rc;
}
+int xc_psr_cmt_get_l3_event_mask(xc_interface *xch, uint32_t *event_mask)
+{
+ int rc;
+ DECLARE_SYSCTL;
+
+ sysctl.cmd = XEN_SYSCTL_psr_cmt_op;
+ sysctl.u.psr_cmt_op.cmd =
+ XEN_SYSCTL_PSR_CMT_get_l3_event_mask;
+ sysctl.u.psr_cmt_op.flags = 0;
+
+ rc = xc_sysctl(xch, &sysctl);
+ if ( !rc )
+ *event_mask = sysctl.u.psr_cmt_op.u.data;
+
+ return rc;
+}
+
int xc_psr_cmt_get_l3_cache_size(xc_interface *xch, uint32_t cpu,
- uint32_t *l3_cache_size)
+ uint32_t *l3_cache_size)
{
static int val = 0;
int rc;
@@ -138,12 +158,14 @@ int xc_psr_cmt_get_l3_cache_size(xc_interface *xch, uint32_t cpu,
return rc;
}
-int xc_psr_cmt_get_data(xc_interface *xch, uint32_t rmid,
- uint32_t cpu, xc_psr_cmt_type type, uint64_t *monitor_data)
+int xc_psr_cmt_get_data(xc_interface *xch, uint32_t rmid, uint32_t cpu,
+ xc_psr_cmt_type type, uint64_t *monitor_data,
+ uint64_t *tsc)
{
xc_resource_op_t op;
- xc_resource_entry_t entries[2];
- uint32_t evtid;
+ xc_resource_entry_t entries[3];
+ xc_resource_entry_t *tsc_entry = NULL;
+ uint32_t evtid, nr = 0;
int rc;
switch ( type )
@@ -151,33 +173,56 @@ int xc_psr_cmt_get_data(xc_interface *xch, uint32_t rmid,
case XC_PSR_CMT_L3_OCCUPANCY:
evtid = EVTID_L3_OCCUPANCY;
break;
+ case XC_PSR_CMT_TOTAL_MEM_COUNT:
+ evtid = EVTID_TOTAL_MEM_COUNT;
+ break;
+ case XC_PSR_CMT_LOCAL_MEM_COUNT:
+ evtid = EVTID_LOCAL_MEM_COUNT;
+ break;
default:
return -1;
}
- entries[0].u.cmd = XEN_RESOURCE_OP_MSR_WRITE;
- entries[0].idx = MSR_IA32_CMT_EVTSEL;
- entries[0].val = (uint64_t)rmid << 32 | evtid;
- entries[0].rsvd = 0;
+ entries[nr].u.cmd = XEN_RESOURCE_OP_MSR_WRITE;
+ entries[nr].idx = MSR_IA32_CMT_EVTSEL;
+ entries[nr].val = (uint64_t)rmid << 32 | evtid;
+ entries[nr].rsvd = 0;
+ nr++;
+
+ entries[nr].u.cmd = XEN_RESOURCE_OP_MSR_READ;
+ entries[nr].idx = MSR_IA32_CMT_CTR;
+ entries[nr].val = 0;
+ entries[nr].rsvd = 0;
+ nr++;
- entries[1].u.cmd = XEN_RESOURCE_OP_MSR_READ;
- entries[1].idx = MSR_IA32_CMT_CTR;
- entries[1].val = 0;
- entries[1].rsvd = 0;
+ if ( tsc != NULL )
+ {
+ tsc_entry = &entries[nr];
+ entries[nr].u.cmd = XEN_RESOURCE_OP_MSR_READ;
+ entries[nr].idx = MSR_IA32_TSC;
+ entries[nr].val = 0;
+ entries[nr].rsvd = 0;
+ nr++;
+ }
+
+ assert(nr <= ARRAY_SIZE(entries));
op.cpu = cpu;
- op.nr_entries = 2;
+ op.nr_entries = nr;
op.entries = entries;
rc = xc_resource_op(xch, 1, &op);
if ( rc < 0 )
return rc;
- if ( op.result !=2 || entries[1].val & IA32_CMT_CTR_ERROR_MASK )
+ if ( op.result != nr || entries[1].val & IA32_CMT_CTR_ERROR_MASK )
return -1;
*monitor_data = entries[1].val;
+ if ( tsc_entry != NULL )
+ *tsc = tsc_entry->val;
+
return 0;
}
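
A hypothetical caller sketch of the extended xc_psr_cmt_get_data(): the new tsc out-parameter is optional, and passing NULL skips the extra MSR_IA32_TSC read.

    static int demo_read_occupancy(xc_interface *xch, uint32_t rmid,
                                   uint32_t cpu)
    {
        uint64_t occupancy, tsc;

        /* Three MSR ops: event select, counter read, TSC read. */
        if ( xc_psr_cmt_get_data(xch, rmid, cpu, XC_PSR_CMT_L3_OCCUPANCY,
                                 &occupancy, &tsc) )
            return -1;

        /* Passing NULL for tsc issues only the two CMT MSR ops. */
        return xc_psr_cmt_get_data(xch, rmid, cpu, XC_PSR_CMT_L3_OCCUPANCY,
                                   &occupancy, NULL);
    }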
@@ -203,6 +248,82 @@ int xc_psr_cmt_enabled(xc_interface *xch)
return 0;
}
+int xc_psr_cat_set_domain_data(xc_interface *xch, uint32_t domid,
+ xc_psr_cat_type type, uint32_t target,
+ uint64_t data)
+{
+ DECLARE_DOMCTL;
+ uint32_t cmd;
+
+ switch ( type )
+ {
+ case XC_PSR_CAT_L3_CBM:
+ cmd = XEN_DOMCTL_PSR_CAT_OP_SET_L3_CBM;
+ break;
+ default:
+ errno = EINVAL;
+ return -1;
+ }
+
+ domctl.cmd = XEN_DOMCTL_psr_cat_op;
+ domctl.domain = (domid_t)domid;
+ domctl.u.psr_cat_op.cmd = cmd;
+ domctl.u.psr_cat_op.target = target;
+ domctl.u.psr_cat_op.data = data;
+
+ return do_domctl(xch, &domctl);
+}
+
+int xc_psr_cat_get_domain_data(xc_interface *xch, uint32_t domid,
+ xc_psr_cat_type type, uint32_t target,
+ uint64_t *data)
+{
+ int rc;
+ DECLARE_DOMCTL;
+ uint32_t cmd;
+
+ switch ( type )
+ {
+ case XC_PSR_CAT_L3_CBM:
+ cmd = XEN_DOMCTL_PSR_CAT_OP_GET_L3_CBM;
+ break;
+ default:
+ errno = EINVAL;
+ return -1;
+ }
+
+ domctl.cmd = XEN_DOMCTL_psr_cat_op;
+ domctl.domain = (domid_t)domid;
+ domctl.u.psr_cat_op.cmd = cmd;
+ domctl.u.psr_cat_op.target = target;
+
+ rc = do_domctl(xch, &domctl);
+
+ if ( !rc )
+ *data = domctl.u.psr_cat_op.data;
+
+ return rc;
+}
+
+int xc_psr_cat_get_l3_info(xc_interface *xch, uint32_t socket,
+ uint32_t *cos_max, uint32_t *cbm_len)
+{
+ int rc;
+ DECLARE_SYSCTL;
+
+ sysctl.cmd = XEN_SYSCTL_psr_cat_op;
+ sysctl.u.psr_cat_op.cmd = XEN_SYSCTL_PSR_CAT_get_l3_info;
+ sysctl.u.psr_cat_op.target = socket;
+
+ rc = xc_sysctl(xch, &sysctl);
+ if ( !rc )
+ {
+ *cos_max = sysctl.u.psr_cat_op.u.l3_info.cos_max;
+ *cbm_len = sysctl.u.psr_cat_op.u.l3_info.cbm_len;
+ }
+
+ return rc;
+}
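
A hypothetical sketch combining the new CAT calls: query a socket's L3 capabilities, then program a domain's cache bit mask within the reported length.

    static int demo_grant_full_cbm(xc_interface *xch, uint32_t domid,
                                   uint32_t socket)
    {
        uint32_t cos_max, cbm_len;

        if ( xc_psr_cat_get_l3_info(xch, socket, &cos_max, &cbm_len) )
            return -1;

        /* Grant the full mask for the demo; real callers would narrow it. */
        return xc_psr_cat_set_domain_data(xch, domid, XC_PSR_CAT_L3_CBM,
                                          socket, (1ull << cbm_len) - 1);
    }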
/*
* Local variables:
diff --git a/tools/libxc/xc_resume.c b/tools/libxc/xc_resume.c
index e67bebd..87d4324 100644
--- a/tools/libxc/xc_resume.c
+++ b/tools/libxc/xc_resume.c
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_rt.c b/tools/libxc/xc_rt.c
index b2d1cc5..d59e5ce 100644
--- a/tools/libxc/xc_rt.c
+++ b/tools/libxc/xc_rt.c
@@ -19,8 +19,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_sedf.c b/tools/libxc/xc_sedf.c
deleted file mode 100644
index db372ca..0000000
--- a/tools/libxc/xc_sedf.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/******************************************************************************
- * xc_sedf.c
- *
- * API for manipulating parameters of the Simple EDF scheduler.
- *
- * changes by Stephan Diestelhorst
- * based on code
- * by Mark Williamson, Copyright (c) 2004 Intel Research Cambridge.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "xc_private.h"
-
-int xc_sedf_domain_set(
- xc_interface *xch,
- uint32_t domid,
- uint64_t period,
- uint64_t slice,
- uint64_t latency,
- uint16_t extratime,
- uint16_t weight)
-{
- DECLARE_DOMCTL;
- struct xen_domctl_sched_sedf *p = &domctl.u.scheduler_op.u.sedf;
-
- domctl.cmd = XEN_DOMCTL_scheduler_op;
- domctl.domain = (domid_t)domid;
- domctl.u.scheduler_op.sched_id = XEN_SCHEDULER_SEDF;
- domctl.u.scheduler_op.cmd = XEN_DOMCTL_SCHEDOP_putinfo;
-
- p->period = period;
- p->slice = slice;
- p->latency = latency;
- p->extratime = extratime;
- p->weight = weight;
- return do_domctl(xch, &domctl);
-}
-
-int xc_sedf_domain_get(
- xc_interface *xch,
- uint32_t domid,
- uint64_t *period,
- uint64_t *slice,
- uint64_t *latency,
- uint16_t *extratime,
- uint16_t *weight)
-{
- DECLARE_DOMCTL;
- int ret;
- struct xen_domctl_sched_sedf *p = &domctl.u.scheduler_op.u.sedf;
-
- domctl.cmd = XEN_DOMCTL_scheduler_op;
- domctl.domain = (domid_t)domid;
- domctl.u.scheduler_op.sched_id = XEN_SCHEDULER_SEDF;
- domctl.u.scheduler_op.cmd = XEN_DOMCTL_SCHEDOP_getinfo;
-
- ret = do_domctl(xch, &domctl);
-
- *period = p->period;
- *slice = p->slice;
- *latency = p->latency;
- *extratime = p->extratime;
- *weight = p->weight;
- return ret;
-}
diff --git a/tools/libxc/xc_solaris.c b/tools/libxc/xc_solaris.c
index 7257a54..7e5d847 100644
--- a/tools/libxc/xc_solaris.c
+++ b/tools/libxc/xc_solaris.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
diff --git a/tools/libxc/xc_sr_common.c b/tools/libxc/xc_sr_common.c
new file mode 100644
index 0000000..945cfa6
--- /dev/null
+++ b/tools/libxc/xc_sr_common.c
@@ -0,0 +1,114 @@
+#include <assert.h>
+
+#include "xc_sr_common.h"
+
+static const char *dhdr_types[] =
+{
+ [DHDR_TYPE_X86_PV] = "x86 PV",
+ [DHDR_TYPE_X86_HVM] = "x86 HVM",
+ [DHDR_TYPE_X86_PVH] = "x86 PVH",
+ [DHDR_TYPE_ARM] = "ARM",
+};
+
+const char *dhdr_type_to_str(uint32_t type)
+{
+ if ( type < ARRAY_SIZE(dhdr_types) && dhdr_types[type] )
+ return dhdr_types[type];
+
+ return "Reserved";
+}
+
+static const char *mandatory_rec_types[] =
+{
+ [REC_TYPE_END] = "End",
+ [REC_TYPE_PAGE_DATA] = "Page data",
+ [REC_TYPE_X86_PV_INFO] = "x86 PV info",
+ [REC_TYPE_X86_PV_P2M_FRAMES] = "x86 PV P2M frames",
+ [REC_TYPE_X86_PV_VCPU_BASIC] = "x86 PV vcpu basic",
+ [REC_TYPE_X86_PV_VCPU_EXTENDED] = "x86 PV vcpu extended",
+ [REC_TYPE_X86_PV_VCPU_XSAVE] = "x86 PV vcpu xsave",
+ [REC_TYPE_SHARED_INFO] = "Shared info",
+ [REC_TYPE_TSC_INFO] = "TSC info",
+ [REC_TYPE_HVM_CONTEXT] = "HVM context",
+ [REC_TYPE_HVM_PARAMS] = "HVM params",
+ [REC_TYPE_TOOLSTACK] = "Toolstack",
+ [REC_TYPE_X86_PV_VCPU_MSRS] = "x86 PV vcpu msrs",
+ [REC_TYPE_VERIFY] = "Verify",
+ [REC_TYPE_CHECKPOINT] = "Checkpoint",
+};
+
+const char *rec_type_to_str(uint32_t type)
+{
+ if ( !(type & REC_TYPE_OPTIONAL) )
+ {
+ if ( (type < ARRAY_SIZE(mandatory_rec_types)) &&
+ (mandatory_rec_types[type]) )
+ return mandatory_rec_types[type];
+ }
+
+ return "Reserved";
+}
+
+int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec,
+ void *buf, size_t sz)
+{
+ static const char zeroes[(1u << REC_ALIGN_ORDER) - 1] = { 0 };
+
+ xc_interface *xch = ctx->xch;
+ typeof(rec->length) combined_length = rec->length + sz;
+ size_t record_length = ROUNDUP(combined_length, REC_ALIGN_ORDER);
+ struct iovec parts[] =
+ {
+ { &rec->type, sizeof(rec->type) },
+ { &combined_length, sizeof(combined_length) },
+ { rec->data, rec->length },
+ { buf, sz },
+ { (void*)zeroes, record_length - combined_length },
+ };
+
+ if ( record_length > REC_LENGTH_MAX )
+ {
+ ERROR("Record (0x%08x, %s) length %#x exceeds max (%#x)", rec->type,
+ rec_type_to_str(rec->type), rec->length, REC_LENGTH_MAX);
+ return -1;
+ }
+
+ if ( rec->length )
+ assert(rec->data);
+ if ( sz )
+ assert(buf);
+
+ if ( writev_exact(ctx->fd, parts, ARRAY_SIZE(parts)) )
+ goto err;
+
+ return 0;
+
+ err:
+ PERROR("Unable to write record to stream");
+ return -1;
+}
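
A hypothetical caller sketch of write_split_record(): a record header and its blob commonly live in separate buffers, and the function merges them and pads the result to the 8-byte record alignment. The record type here is an example only.

    static int demo_send_blob_record(struct xc_sr_context *ctx,
                                     void *hdr, uint32_t hdrsz,
                                     void *blob, size_t blobsz)
    {
        struct xc_sr_record rec = {
            .type   = REC_TYPE_X86_PV_VCPU_BASIC,
            .length = hdrsz,
            .data   = hdr,
        };

        return write_split_record(ctx, &rec, blob, blobsz);
    }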
+
+static void __attribute__((unused)) build_assertions(void)
+{
+ XC_BUILD_BUG_ON(sizeof(struct xc_sr_ihdr) != 24);
+ XC_BUILD_BUG_ON(sizeof(struct xc_sr_dhdr) != 16);
+ XC_BUILD_BUG_ON(sizeof(struct xc_sr_rhdr) != 8);
+
+ XC_BUILD_BUG_ON(sizeof(struct xc_sr_rec_page_data_header) != 8);
+ XC_BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_info) != 8);
+ XC_BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_p2m_frames) != 8);
+ XC_BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_vcpu_hdr) != 8);
+ XC_BUILD_BUG_ON(sizeof(struct xc_sr_rec_tsc_info) != 24);
+ XC_BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params_entry) != 16);
+ XC_BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params) != 8);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
new file mode 100644
index 0000000..64f6082
--- /dev/null
+++ b/tools/libxc/xc_sr_common.h
@@ -0,0 +1,375 @@
+#ifndef __COMMON__H
+#define __COMMON__H
+
+#include <stdbool.h>
+
+#include "xg_private.h"
+#include "xg_save_restore.h"
+#include "xc_dom.h"
+#include "xc_bitops.h"
+
+#include "xc_sr_stream_format.h"
+
+/* String representation of Domain Header types. */
+const char *dhdr_type_to_str(uint32_t type);
+
+/* String representation of Record types. */
+const char *rec_type_to_str(uint32_t type);
+
+struct xc_sr_context;
+struct xc_sr_record;
+
+/**
+ * Save operations. To be implemented for each type of guest, for use by the
+ * common save algorithm.
+ *
+ * Every function must be implemented, even if only with a no-op stub.
+ */
+struct xc_sr_save_ops
+{
+ /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */
+ xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
+
+ /**
+ * Optionally transform the contents of a page from being specific to the
+ * sending environment, to being generic for the stream.
+ *
+ * The page of data at the end of 'page' may be a read-only mapping of a
+ * running guest; it must not be modified. If no transformation is
+ * required, the callee should leave '*page' untouched.
+ *
+ * If a transformation is required, the callee should allocate itself
+ * a local page using malloc() and return it via '*page'.
+ *
+ * The caller shall free() '*page' in all cases. In the case that the
+ * callee encounters an error, it should *NOT* free() the memory it
+ * allocated for '*page'.
+ *
+ * It is valid to fail with EAGAIN if the transformation cannot be
+ * completed at this point. The page shall be retried later.
+ *
+ * @returns 0 for success, -1 for failure, with errno appropriately set.
+ */
+ int (*normalise_page)(struct xc_sr_context *ctx, xen_pfn_t type,
+ void **page);
+
+ /**
+ * Set up local environment to save a domain. This is called before
+ * any records are written to the stream. (Typically querying running
+ * domain state, setting up mappings etc.)
+ */
+ int (*setup)(struct xc_sr_context *ctx);
+
+ /**
+ * Send records which need to be at the start of the stream. This is
+ * called once, after the Image and Domain headers are written.
+ */
+ int (*start_of_stream)(struct xc_sr_context *ctx);
+
+ /**
+ * Send records which need to be at the start of a checkpoint. This is
+ * called once, or once per checkpoint in a checkpointed stream, and is
+ * ahead of memory data.
+ */
+ int (*start_of_checkpoint)(struct xc_sr_context *ctx);
+
+ /**
+ * Send records which need to be at the end of the checkpoint. This is
+ * called once, or once per checkpoint in a checkpointed stream, and is
+ * after the memory data.
+ */
+ int (*end_of_checkpoint)(struct xc_sr_context *ctx);
+
+ /**
+ * Clean up the local environment. Will be called exactly once, either
+ * after a successful save, or upon encountering an error.
+ */
+ int (*cleanup)(struct xc_sr_context *ctx);
+};
+
+
+/**
+ * Restore operations. To be implemented for each type of guest, for use by
+ * the common restore algorithm.
+ *
+ * Every function must be implemented, even if only with a no-op stub.
+ */
+struct xc_sr_restore_ops
+{
+ /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */
+ xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
+
+ /* Check to see whether a PFN is valid. */
+ bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
+
+ /* Set the GFN of a PFN. */
+ void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);
+
+ /* Set the type of a PFN. */
+ void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
+ xen_pfn_t type);
+
+ /**
+ * Optionally transform the contents of a page from being generic in the
+ * stream, to being specific to the restoring environment.
+ *
+ * 'page' is expected to be modified in-place if a transformation is
+ * required.
+ *
+ * @returns 0 for success, -1 for failure, with errno appropriately set.
+ */
+ int (*localise_page)(struct xc_sr_context *ctx, uint32_t type, void *page);
+
+ /**
+ * Set up local environment to restore a domain. This is called before
+ * any records are read from the stream.
+ */
+ int (*setup)(struct xc_sr_context *ctx);
+
+ /**
+ * Process an individual record from the stream. The caller shall take
+ * care of processing common records (e.g. END, PAGE_DATA).
+ *
+ * @return 0 for success, -1 for failure, or the following sentinels:
+ * - RECORD_NOT_PROCESSED
+ * - BROKEN_CHANNEL: under Remus/COLO, this means master may be dead, and
+ * a failover is needed.
+ */
+#define RECORD_NOT_PROCESSED 1
+#define BROKEN_CHANNEL 2
+ int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
+
+ /**
+ * Perform any actions required after the stream has been finished. Called
+ * after the END record has been received.
+ */
+ int (*stream_complete)(struct xc_sr_context *ctx);
+
+ /**
+ * Clean up the local environment. Will be called exactly once, either
+ * after a successful restore, or upon encountering an error.
+ */
+ int (*cleanup)(struct xc_sr_context *ctx);
+};
+
+/* x86 PV per-vcpu storage structure for blobs heading Xen-wards. */
+struct xc_sr_x86_pv_restore_vcpu
+{
+ void *basic, *extd, *xsave, *msr;
+ size_t basicsz, extdsz, xsavesz, msrsz;
+};
+
+struct xc_sr_context
+{
+ xc_interface *xch;
+ uint32_t domid;
+ int fd;
+
+ xc_dominfo_t dominfo;
+
+ union /* Common save or restore data. */
+ {
+ struct /* Save data. */
+ {
+ struct xc_sr_save_ops ops;
+ struct save_callbacks *callbacks;
+
+ /* Live migrate vs non live suspend. */
+ bool live;
+
+ /* Plain VM, or checkpoints over time. */
+ bool checkpointed;
+
+ /* Further debugging information in the stream. */
+ bool debug;
+
+ /* Parameters for tweaking live migration. */
+ unsigned max_iterations;
+ unsigned dirty_threshold;
+
+ unsigned long p2m_size;
+
+ xen_pfn_t *batch_pfns;
+ unsigned nr_batch_pfns;
+ unsigned long *deferred_pages;
+ unsigned long nr_deferred_pages;
+ xc_hypercall_buffer_t dirty_bitmap_hbuf;
+ } save;
+
+ struct /* Restore data. */
+ {
+ struct xc_sr_restore_ops ops;
+ struct restore_callbacks *callbacks;
+
+ /* From Image Header. */
+ uint32_t format_version;
+
+ /* From Domain Header. */
+ uint32_t guest_type;
+ uint32_t guest_page_size;
+
+ /* Plain VM, or checkpoints over time. */
+ bool checkpointed;
+
+ /* Currently buffering records between checkpoints. */
+ bool buffer_all_records;
+
+/*
+ * With Remus, we buffer the records sent by the primary at each
+ * checkpoint so that, if the primary fails, we can recover from the
+ * last complete checkpoint state.
+ * This should be enough for most cases because the primary only sends
+ * dirty pages at each checkpoint.
+ */
+#define DEFAULT_BUF_RECORDS 1024
+ struct xc_sr_record *buffered_records;
+ unsigned allocated_rec_num;
+ unsigned buffered_rec_num;
+
+ /*
+ * Xenstore and Console parameters.
+ * INPUT: evtchn & domid
+ * OUTPUT: gfn
+ */
+ xen_pfn_t xenstore_gfn, console_gfn;
+ unsigned int xenstore_evtchn, console_evtchn;
+ domid_t xenstore_domid, console_domid;
+
+ /* Bitmap of currently populated PFNs during restore. */
+ unsigned long *populated_pfns;
+ xen_pfn_t max_populated_pfn;
+
+ /* Sender has invoked verify mode on the stream. */
+ bool verify;
+ } restore;
+ };
+
+ union /* Guest-arch specific data. */
+ {
+ struct /* x86 PV guest. */
+ {
+ /* 4 or 8; 32 or 64 bit domain */
+ unsigned int width;
+ /* 3 or 4 pagetable levels */
+ unsigned int levels;
+
+ /* Maximum Xen frame */
+ xen_pfn_t max_mfn;
+ /* Read-only machine to phys map */
+ xen_pfn_t *m2p;
+ /* first mfn of the compat m2p (Only needed for 32bit PV guests) */
+ xen_pfn_t compat_m2p_mfn0;
+ /* Number of m2p frames mapped */
+ unsigned long nr_m2p_frames;
+
+ /* Maximum guest frame */
+ xen_pfn_t max_pfn;
+
+ /* Number of frames making up the p2m */
+ unsigned int p2m_frames;
+ /* Guest's phys to machine map. Mapped read-only (save) or
+ * allocated locally (restore). Uses guest unsigned longs. */
+ void *p2m;
+ /* The guest pfns containing the p2m leaves */
+ xen_pfn_t *p2m_pfns;
+
+ /* Read-only mapping of guests shared info page */
+ shared_info_any_t *shinfo;
+
+ union
+ {
+ struct
+ {
+ /* State machine for the order of received records. */
+ bool seen_pv_info;
+
+ /* Types for each page (bounded by max_pfn). */
+ uint32_t *pfn_types;
+
+ /* Vcpu context blobs. */
+ struct xc_sr_x86_pv_restore_vcpu *vcpus;
+ unsigned nr_vcpus;
+ } restore;
+ };
+ } x86_pv;
+
+ struct /* x86 HVM guest. */
+ {
+ union
+ {
+ struct
+ {
+ /* Whether qemu enabled logdirty mode, and we should
+ * disable on cleanup. */
+ bool qemu_enabled_logdirty;
+ } save;
+
+ struct
+ {
+ /* HVM context blob. */
+ void *context;
+ size_t contextsz;
+ } restore;
+ };
+ } x86_hvm;
+ };
+};
+
+extern struct xc_sr_save_ops save_ops_x86_pv;
+extern struct xc_sr_save_ops save_ops_x86_hvm;
+
+extern struct xc_sr_restore_ops restore_ops_x86_pv;
+extern struct xc_sr_restore_ops restore_ops_x86_hvm;
+
+struct xc_sr_record
+{
+ uint32_t type;
+ uint32_t length;
+ void *data;
+};
+
+/*
+ * Writes a split record to the stream, applying correct padding where
+ * appropriate. It is common when sending records containing blobs from Xen
+ * that the header and blob data are separate. This function accepts a second
+ * buffer and length, and will merge it with the main record when sending.
+ *
+ * Records with a non-zero length must provide a valid data field; records
+ * with a 0 length shall have their data field ignored.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec,
+ void *buf, size_t sz);
+
+/*
+ * Writes a record to the stream, applying correct padding where appropriate.
+ * Records with a non-zero length must provide a valid data field; records
+ * with a 0 length shall have their data field ignored.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+static inline int write_record(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ return write_split_record(ctx, rec, NULL, 0);
+}
+
+/*
+ * This would ideally be private in restore.c, but is needed by
+ * x86_pv_localise_page() if we receive pagetables frames ahead of the
+ * contents of the frames they point at.
+ */
+int populate_pfns(struct xc_sr_context *ctx, unsigned count,
+ const xen_pfn_t *original_pfns, const uint32_t *types);
+
+#endif
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_common_x86.c b/tools/libxc/xc_sr_common_x86.c
new file mode 100644
index 0000000..98f1cef
--- /dev/null
+++ b/tools/libxc/xc_sr_common_x86.c
@@ -0,0 +1,54 @@
+#include "xc_sr_common_x86.h"
+
+int write_tsc_info(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_tsc_info tsc = { 0 };
+ struct xc_sr_record rec =
+ {
+ .type = REC_TYPE_TSC_INFO,
+ .length = sizeof(tsc),
+ .data = &tsc
+ };
+
+ if ( xc_domain_get_tsc_info(xch, ctx->domid, &tsc.mode,
+ &tsc.nsec, &tsc.khz, &tsc.incarnation) < 0 )
+ {
+ PERROR("Unable to obtain TSC information");
+ return -1;
+ }
+
+ return write_record(ctx, &rec);
+}
+
+int handle_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_tsc_info *tsc = rec->data;
+
+ if ( rec->length != sizeof(*tsc) )
+ {
+ ERROR("TSC_INFO record wrong size: length %u, expected %zu",
+ rec->length, sizeof(*tsc));
+ return -1;
+ }
+
+ if ( xc_domain_set_tsc_info(xch, ctx->domid, tsc->mode,
+ tsc->nsec, tsc->khz, tsc->incarnation) )
+ {
+ PERROR("Unable to set TSC information");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_common_x86.h b/tools/libxc/xc_sr_common_x86.h
new file mode 100644
index 0000000..1d42da9
--- /dev/null
+++ b/tools/libxc/xc_sr_common_x86.h
@@ -0,0 +1,26 @@
+#ifndef __COMMON_X86__H
+#define __COMMON_X86__H
+
+#include "xc_sr_common.h"
+
+/*
+ * Obtains a domain's TSC information from Xen and writes a TSC_INFO record
+ * into the stream.
+ */
+int write_tsc_info(struct xc_sr_context *ctx);
+
+/*
+ * Parses a TSC_INFO record and applies the result to the domain.
+ */
+int handle_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec);
+
+#endif
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_common_x86_pv.c b/tools/libxc/xc_sr_common_x86_pv.c
new file mode 100644
index 0000000..eb68c07
--- /dev/null
+++ b/tools/libxc/xc_sr_common_x86_pv.c
@@ -0,0 +1,210 @@
+#include <assert.h>
+
+#include "xc_sr_common_x86_pv.h"
+
+xen_pfn_t mfn_to_pfn(struct xc_sr_context *ctx, xen_pfn_t mfn)
+{
+ assert(mfn <= ctx->x86_pv.max_mfn);
+ return ctx->x86_pv.m2p[mfn];
+}
+
+bool mfn_in_pseudophysmap(struct xc_sr_context *ctx, xen_pfn_t mfn)
+{
+ return ( (mfn <= ctx->x86_pv.max_mfn) &&
+ (mfn_to_pfn(ctx, mfn) <= ctx->x86_pv.max_pfn) &&
+ (xc_pfn_to_mfn(mfn_to_pfn(ctx, mfn), ctx->x86_pv.p2m,
+ ctx->x86_pv.width) == mfn) );
+}
+
+void dump_bad_pseudophysmap_entry(struct xc_sr_context *ctx, xen_pfn_t mfn)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t pfn = ~0UL;
+
+ ERROR("mfn %#lx, max %#lx", mfn, ctx->x86_pv.max_mfn);
+
+ if ( (mfn != ~0UL) && (mfn <= ctx->x86_pv.max_mfn) )
+ {
+ pfn = ctx->x86_pv.m2p[mfn];
+ ERROR(" m2p[%#lx] = %#lx, max_pfn %#lx",
+ mfn, pfn, ctx->x86_pv.max_pfn);
+ }
+
+ if ( (pfn != ~0UL) && (pfn <= ctx->x86_pv.max_pfn) )
+ ERROR(" p2m[%#lx] = %#lx",
+ pfn, xc_pfn_to_mfn(pfn, ctx->x86_pv.p2m, ctx->x86_pv.width));
+}
+
+xen_pfn_t cr3_to_mfn(struct xc_sr_context *ctx, uint64_t cr3)
+{
+ if ( ctx->x86_pv.width == 8 )
+ return cr3 >> 12;
+ else
+ {
+ /* 32bit guests can't represent mfns wider than 32 bits */
+ if ( cr3 & 0xffffffff00000000UL )
+ return ~0UL;
+ else
+ return (uint32_t)((cr3 >> 12) | (cr3 << 20));
+ }
+}
+
+uint64_t mfn_to_cr3(struct xc_sr_context *ctx, xen_pfn_t _mfn)
+{
+ uint64_t mfn = _mfn;
+
+ if ( ctx->x86_pv.width == 8 )
+ return mfn << 12;
+ else
+ {
+ /* 32bit guests can't represent mfns wider than 32 bits */
+ if ( mfn & 0xffffffff00000000UL )
+ return ~0UL;
+ else
+ return (uint32_t)((mfn << 12) | (mfn >> 20));
+ }
+}
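
A worked example (hypothetical values) of the extended-cr3 packing used by these helpers: for a 32bit guest the mfn is effectively rotated left by 12 bits, so a 44bit machine address survives in a 32bit architectural cr3 field.

    static void demo_cr3_roundtrip(struct xc_sr_context *ctx)
    {
        /* Assumes ctx->x86_pv.width == 4, i.e. a 32bit guest. */
        uint64_t cr3 = mfn_to_cr3(ctx, 0xABCDE123UL); /* == 0xDE123ABC */

        assert(cr3_to_mfn(ctx, cr3) == 0xABCDE123UL); /* Round-trips. */
    }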
+
+int x86_pv_domain_info(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned int guest_width, guest_levels, fpp;
+ xen_pfn_t max_pfn;
+
+ /* Get the domain width */
+ if ( xc_domain_get_guest_width(xch, ctx->domid, &guest_width) )
+ {
+ PERROR("Unable to determine dom%d's width", ctx->domid);
+ return -1;
+ }
+
+ if ( guest_width == 4 )
+ guest_levels = 3;
+ else if ( guest_width == 8 )
+ guest_levels = 4;
+ else
+ {
+ ERROR("Invalid guest width %d. Expected 32 or 64", guest_width * 8);
+ return -1;
+ }
+ ctx->x86_pv.width = guest_width;
+ ctx->x86_pv.levels = guest_levels;
+ fpp = PAGE_SIZE / ctx->x86_pv.width;
+
+ DPRINTF("%d bits, %d levels", guest_width * 8, guest_levels);
+
+ /* Get the domain's size */
+ if ( xc_domain_maximum_gpfn(xch, ctx->domid, &max_pfn) < 0 )
+ {
+ PERROR("Unable to obtain guests max pfn");
+ return -1;
+ }
+
+ if ( max_pfn > 0 )
+ {
+ ctx->x86_pv.max_pfn = max_pfn;
+ ctx->x86_pv.p2m_frames = (ctx->x86_pv.max_pfn + fpp) / fpp;
+
+ DPRINTF("max_pfn %#lx, p2m_frames %d", max_pfn, ctx->x86_pv.p2m_frames);
+ }
+
+ return 0;
+}
+
+int x86_pv_map_m2p(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t m2p_chunks, m2p_size, max_page;
+ privcmd_mmap_entry_t *entries = NULL;
+ xen_pfn_t *extents_start = NULL;
+ int rc = -1, i;
+
+ if ( xc_maximum_ram_page(xch, &max_page) < 0 )
+ {
+ PERROR("Failed to get maximum ram page");
+ goto err;
+ }
+
+ ctx->x86_pv.max_mfn = max_page;
+ m2p_size = M2P_SIZE(ctx->x86_pv.max_mfn);
+ m2p_chunks = M2P_CHUNKS(ctx->x86_pv.max_mfn);
+
+ extents_start = malloc(m2p_chunks * sizeof(xen_pfn_t));
+ if ( !extents_start )
+ {
+ ERROR("Unable to allocate %lu bytes for m2p mfns",
+ m2p_chunks * sizeof(xen_pfn_t));
+ goto err;
+ }
+
+ if ( xc_machphys_mfn_list(xch, m2p_chunks, extents_start) )
+ {
+ PERROR("Failed to get m2p mfn list");
+ goto err;
+ }
+
+ entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t));
+ if ( !entries )
+ {
+ ERROR("Unable to allocate %lu bytes for m2p mapping mfns",
+ m2p_chunks * sizeof(privcmd_mmap_entry_t));
+ goto err;
+ }
+
+ for ( i = 0; i < m2p_chunks; ++i )
+ entries[i].mfn = extents_start[i];
+
+ ctx->x86_pv.m2p = xc_map_foreign_ranges(
+ xch, DOMID_XEN, m2p_size, PROT_READ,
+ M2P_CHUNK_SIZE, entries, m2p_chunks);
+
+ if ( !ctx->x86_pv.m2p )
+ {
+ PERROR("Failed to mmap() m2p ranges");
+ goto err;
+ }
+
+ ctx->x86_pv.nr_m2p_frames = (M2P_CHUNK_SIZE >> PAGE_SHIFT) * m2p_chunks;
+
+#ifdef __i386__
+ /* 32 bit toolstacks automatically get the compat m2p */
+ ctx->x86_pv.compat_m2p_mfn0 = entries[0].mfn;
+#else
+ /* 64 bit toolstacks need to ask Xen specially for it */
+ {
+ struct xen_machphys_mfn_list xmml = {
+ .max_extents = 1,
+ .extent_start = { &ctx->x86_pv.compat_m2p_mfn0 }
+ };
+
+ rc = do_memory_op(xch, XENMEM_machphys_compat_mfn_list,
+ &xmml, sizeof(xmml));
+ if ( rc || xmml.nr_extents != 1 )
+ {
+ PERROR("Failed to get compat mfn list from Xen");
+ rc = -1;
+ goto err;
+ }
+ }
+#endif
+
+ /* All Done */
+ rc = 0;
+ DPRINTF("max_mfn %#lx", ctx->x86_pv.max_mfn);
+
+err:
+ free(entries);
+ free(extents_start);
+
+ return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_common_x86_pv.h b/tools/libxc/xc_sr_common_x86_pv.h
new file mode 100644
index 0000000..3234944
--- /dev/null
+++ b/tools/libxc/xc_sr_common_x86_pv.h
@@ -0,0 +1,102 @@
+#ifndef __COMMON_X86_PV_H
+#define __COMMON_X86_PV_H
+
+#include "xc_sr_common_x86.h"
+
+/*
+ * Convert an mfn to a pfn, given Xen's m2p table.
+ *
+ * Caller must ensure that the requested mfn is in range.
+ */
+xen_pfn_t mfn_to_pfn(struct xc_sr_context *ctx, xen_pfn_t mfn);
+
+/*
+ * Query whether a particular mfn is valid in the physmap of a guest.
+ */
+bool mfn_in_pseudophysmap(struct xc_sr_context *ctx, xen_pfn_t mfn);
+
+/*
+ * Debug a particular mfn by walking the p2m and m2p.
+ */
+void dump_bad_pseudophysmap_entry(struct xc_sr_context *ctx, xen_pfn_t mfn);
+
+/*
+ * Convert a PV cr3 field to an mfn.
+ *
+ * Adjusts for Xen's extended-cr3 format to pack a 44bit physical address into
+ * a 32bit architectural cr3.
+ */
+xen_pfn_t cr3_to_mfn(struct xc_sr_context *ctx, uint64_t cr3);
+
+/*
+ * Convert an mfn to a PV cr3 field.
+ *
+ * Adjusts for Xen's extended-cr3 format to pack a 44bit physical address into
+ * a 32bit architectural cr3.
+ */
+uint64_t mfn_to_cr3(struct xc_sr_context *ctx, xen_pfn_t mfn);
+
+/* Bits 12 through 51 of a PTE point at the frame */
+#define PTE_FRAME_MASK 0x000ffffffffff000ULL
+
+/*
+ * Extract an mfn from a Pagetable Entry. May return INVALID_MFN if the pte
+ * would overflow a 32bit xen_pfn_t.
+ */
+static inline xen_pfn_t pte_to_frame(uint64_t pte)
+{
+ uint64_t frame = (pte & PTE_FRAME_MASK) >> PAGE_SHIFT;
+
+#ifdef __i386__
+ if ( frame >= INVALID_MFN )
+ return INVALID_MFN;
+#endif
+
+ return frame;
+}
+
+/*
+ * Change the frame in a Pagetable Entry while leaving the flags alone.
+ */
+static inline uint64_t merge_pte(uint64_t pte, xen_pfn_t mfn)
+{
+ return (pte & ~PTE_FRAME_MASK) | ((uint64_t)mfn << PAGE_SHIFT);
+}
+
+/*
+ * Get current domain information.
+ *
+ * Fills ctx->x86_pv
+ * - .width
+ * - .levels
+ * - .max_pfn
+ * - .p2m_frames
+ *
+ * Used by the save side to create the X86_PV_INFO record, and by the restore
+ * side to verify the incoming stream.
+ *
+ * Returns 0 on success and non-zero on error.
+ */
+int x86_pv_domain_info(struct xc_sr_context *ctx);
+
+/*
+ * Maps the Xen M2P.
+ *
+ * Fills ctx->x86_pv.
+ * - .max_mfn
+ * - .m2p
+ *
+ * Returns 0 on success and non-zero on error.
+ */
+int x86_pv_map_m2p(struct xc_sr_context *ctx);
+
+#endif
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c
new file mode 100644
index 0000000..f48e7fc
--- /dev/null
+++ b/tools/libxc/xc_sr_restore.c
@@ -0,0 +1,802 @@
+#include <arpa/inet.h>
+
+#include <assert.h>
+
+#include "xc_sr_common.h"
+
+/*
+ * Read and validate the Image and Domain headers.
+ */
+static int read_headers(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_ihdr ihdr;
+ struct xc_sr_dhdr dhdr;
+
+ if ( read_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
+ {
+ PERROR("Failed to read Image Header from stream");
+ return -1;
+ }
+
+ ihdr.id = ntohl(ihdr.id);
+ ihdr.version = ntohl(ihdr.version);
+ ihdr.options = ntohs(ihdr.options);
+
+ if ( ihdr.marker != IHDR_MARKER )
+ {
+ ERROR("Invalid marker: Got 0x%016"PRIx64, ihdr.marker);
+ return -1;
+ }
+ else if ( ihdr.id != IHDR_ID )
+ {
+ ERROR("Invalid ID: Expected 0x%08x, Got 0x%08x", IHDR_ID, ihdr.id);
+ return -1;
+ }
+ else if ( ihdr.version != IHDR_VERSION )
+ {
+ ERROR("Invalid Version: Expected %d, Got %d",
+ ihdr.version, IHDR_VERSION);
+ return -1;
+ }
+ else if ( ihdr.options & IHDR_OPT_BIG_ENDIAN )
+ {
+ ERROR("Unable to handle big endian streams");
+ return -1;
+ }
+
+ ctx->restore.format_version = ihdr.version;
+
+ if ( read_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
+ {
+ PERROR("Failed to read Domain Header from stream");
+ return -1;
+ }
+
+ ctx->restore.guest_type = dhdr.type;
+ ctx->restore.guest_page_size = (1U << dhdr.page_shift);
+
+ if ( dhdr.xen_major == 0 )
+ {
+ IPRINTF("Found %s domain, converted from legacy stream format",
+ dhdr_type_to_str(dhdr.type));
+ DPRINTF(" Legacy conversion script version %u", dhdr.xen_minor);
+ }
+ else
+ IPRINTF("Found %s domain from Xen %u.%u",
+ dhdr_type_to_str(dhdr.type), dhdr.xen_major, dhdr.xen_minor);
+ return 0;
+}
+
+/*
+ * Reads a record from the stream, and fills in the record structure.
+ *
+ * Returns 0 on success and non-zero on failure.
+ *
+ * On success, the record's type and size shall be valid.
+ * - If size is 0, data shall be NULL.
+ * - If size is non-0, data shall be a buffer allocated by malloc() which must
+ * be passed to free() by the caller.
+ *
+ * On failure, the contents of the record structure are undefined.
+ */
+static int read_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rhdr rhdr;
+ size_t datasz;
+
+ if ( read_exact(ctx->fd, &rhdr, sizeof(rhdr)) )
+ {
+ PERROR("Failed to read Record Header from stream");
+ return -1;
+ }
+ else if ( rhdr.length > REC_LENGTH_MAX )
+ {
+ ERROR("Record (0x%08x, %s) length %#x exceeds max (%#x)", rhdr.type,
+ rec_type_to_str(rhdr.type), rhdr.length, REC_LENGTH_MAX);
+ return -1;
+ }
+
+ datasz = ROUNDUP(rhdr.length, REC_ALIGN_ORDER);
+
+ if ( datasz )
+ {
+ rec->data = malloc(datasz);
+
+ if ( !rec->data )
+ {
+ ERROR("Unable to allocate %zu bytes for record data (0x%08x, %s)",
+ datasz, rhdr.type, rec_type_to_str(rhdr.type));
+ return -1;
+ }
+
+ if ( read_exact(ctx->fd, rec->data, datasz) )
+ {
+ free(rec->data);
+ rec->data = NULL;
+ PERROR("Failed to read %zu bytes of data for record (0x%08x, %s)",
+ datasz, rhdr.type, rec_type_to_str(rhdr.type));
+ return -1;
+ }
+ }
+ else
+ rec->data = NULL;
+
+ rec->type = rhdr.type;
+ rec->length = rhdr.length;
+
+ return 0;
+}
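
A hypothetical caller sketch of read_record(), observing the documented ownership rule that rec.data is malloc()ed by the callee and freed by the caller:

    static int demo_consume_one(struct xc_sr_context *ctx)
    {
        struct xc_sr_record rec;

        if ( read_record(ctx, &rec) )
            return -1;

        /* ... dispatch on rec.type, using rec.length bytes at rec.data ... */

        free(rec.data); /* NULL for zero-length records, so always safe. */
        return 0;
    }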
+
+/*
+ * Is a pfn populated?
+ */
+static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ if ( pfn > ctx->restore.max_populated_pfn )
+ return false;
+ return test_bit(pfn, ctx->restore.populated_pfns);
+}
+
+/*
+ * Set a pfn as populated, expanding the tracking structures if needed. To
+ * avoid realloc()ing too excessively, the size is increased to the nearest
+ * power of two large enough to contain the required pfn.
+ */
+static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ xc_interface *xch = ctx->xch;
+
+ if ( pfn > ctx->restore.max_populated_pfn )
+ {
+ xen_pfn_t new_max;
+ size_t old_sz, new_sz;
+ unsigned long *p;
+
+ /* Round up to one less than the next power of two above pfn. */
+ new_max = pfn;
+ new_max |= new_max >> 1;
+ new_max |= new_max >> 2;
+ new_max |= new_max >> 4;
+ new_max |= new_max >> 8;
+ new_max |= new_max >> 16;
+#ifdef __x86_64__
+ new_max |= new_max >> 32;
+#endif
+
+ old_sz = bitmap_size(ctx->restore.max_populated_pfn + 1);
+ new_sz = bitmap_size(new_max + 1);
+ p = realloc(ctx->restore.populated_pfns, new_sz);
+ if ( !p )
+ {
+ ERROR("Failed to realloc populated bitmap");
+ errno = ENOMEM;
+ return -1;
+ }
+
+ memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz);
+
+ ctx->restore.populated_pfns = p;
+ ctx->restore.max_populated_pfn = new_max;
+ }
+
+ assert(!test_bit(pfn, ctx->restore.populated_pfns));
+ set_bit(pfn, ctx->restore.populated_pfns);
+
+ return 0;
+}
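
A worked example (hypothetical value) of the shift-and-OR cascade above: it smears the top set bit downwards, producing one less than the next power of two, so the bitmap roughly doubles instead of growing on every new pfn.

    xen_pfn_t new_max = 0x12345;

    new_max |= new_max >> 1;
    new_max |= new_max >> 2;
    new_max |= new_max >> 4;
    new_max |= new_max >> 8;
    new_max |= new_max >> 16;
    /* new_max == 0x1ffff, i.e. (1 << 17) - 1, covering pfn 0x12345. */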
+
+/*
+ * Given a set of pfns, obtain memory from Xen to fill the physmap for the
+ * unpopulated subset. If types is NULL, no page type checking is performed
+ * and all unpopulated pfns are populated.
+ */
+int populate_pfns(struct xc_sr_context *ctx, unsigned count,
+ const xen_pfn_t *original_pfns, const uint32_t *types)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
+ *pfns = malloc(count * sizeof(*pfns));
+ unsigned i, nr_pfns = 0;
+ int rc = -1;
+
+ if ( !mfns || !pfns )
+ {
+ ERROR("Failed to allocate %zu bytes for populating the physmap",
+ 2 * count * sizeof(*mfns));
+ goto err;
+ }
+
+ for ( i = 0; i < count; ++i )
+ {
+ if ( (!types ||
+ (types[i] != XEN_DOMCTL_PFINFO_XTAB &&
+ types[i] != XEN_DOMCTL_PFINFO_BROKEN)) &&
+ !pfn_is_populated(ctx, original_pfns[i]) )
+ {
+ rc = pfn_set_populated(ctx, original_pfns[i]);
+ if ( rc )
+ goto err;
+ pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
+ ++nr_pfns;
+ }
+ }
+
+ if ( nr_pfns )
+ {
+ rc = xc_domain_populate_physmap_exact(
+ xch, ctx->domid, nr_pfns, 0, 0, mfns);
+ if ( rc )
+ {
+ PERROR("Failed to populate physmap");
+ goto err;
+ }
+
+ for ( i = 0; i < nr_pfns; ++i )
+ {
+ if ( mfns[i] == INVALID_MFN )
+ {
+ ERROR("Populate physmap failed for pfn %u", i);
+ rc = -1;
+ goto err;
+ }
+
+ ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
+ }
+ }
+
+ rc = 0;
+
+ err:
+ free(pfns);
+ free(mfns);
+
+ return rc;
+}
+
+/*
+ * Given a list of pfns, their types, and a block of page data from the
+ * stream, populate and record their types, map the relevant subset and copy
+ * the data into the guest.
+ */
+static int process_page_data(struct xc_sr_context *ctx, unsigned count,
+ xen_pfn_t *pfns, uint32_t *types, void *page_data)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t *mfns = malloc(count * sizeof(*mfns));
+ int *map_errs = malloc(count * sizeof(*map_errs));
+ int rc;
+ void *mapping = NULL, *guest_page = NULL;
+ unsigned i, /* i indexes the pfns from the record. */
+ j, /* j indexes the subset of pfns we decide to map. */
+ nr_pages = 0;
+
+ if ( !mfns || !map_errs )
+ {
+ rc = -1;
+ ERROR("Failed to allocate %zu bytes to process page data",
+ count * (sizeof(*mfns) + sizeof(*map_errs)));
+ goto err;
+ }
+
+ rc = populate_pfns(ctx, count, pfns, types);
+ if ( rc )
+ {
+ ERROR("Failed to populate pfns for batch of %u pages", count);
+ goto err;
+ }
+
+ for ( i = 0; i < count; ++i )
+ {
+ ctx->restore.ops.set_page_type(ctx, pfns[i], types[i]);
+
+ switch ( types[i] )
+ {
+ case XEN_DOMCTL_PFINFO_NOTAB:
+
+ case XEN_DOMCTL_PFINFO_L1TAB:
+ case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+ case XEN_DOMCTL_PFINFO_L2TAB:
+ case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+ case XEN_DOMCTL_PFINFO_L3TAB:
+ case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+ case XEN_DOMCTL_PFINFO_L4TAB:
+ case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+ mfns[nr_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, pfns[i]);
+ break;
+ }
+ }
+
+ /* Nothing to do? */
+ if ( nr_pages == 0 )
+ goto done;
+
+ mapping = guest_page = xc_map_foreign_bulk(
+ xch, ctx->domid, PROT_READ | PROT_WRITE,
+ mfns, map_errs, nr_pages);
+ if ( !mapping )
+ {
+ rc = -1;
+ PERROR("Unable to map %u mfns for %u pages of data",
+ nr_pages, count);
+ goto err;
+ }
+
+ for ( i = 0, j = 0; i < count; ++i )
+ {
+ switch ( types[i] )
+ {
+ case XEN_DOMCTL_PFINFO_XTAB:
+ case XEN_DOMCTL_PFINFO_BROKEN:
+ case XEN_DOMCTL_PFINFO_XALLOC:
+ /* No page data to deal with. */
+ continue;
+ }
+
+ if ( map_errs[j] )
+ {
+ rc = -1;
+ ERROR("Mapping pfn %lx (mfn %lx, type %#x)failed with %d",
+ pfns[i], mfns[j], types[i], map_errs[j]);
+ goto err;
+ }
+
+ /* Undo page normalisation done by the saver. */
+ rc = ctx->restore.ops.localise_page(ctx, types[i], page_data);
+ if ( rc )
+ {
+ ERROR("Failed to localise pfn %lx (type %#x)",
+ pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+ goto err;
+ }
+
+ if ( ctx->restore.verify )
+ {
+ /* Verify mode - compare incoming data to what we already have. */
+ if ( memcmp(guest_page, page_data, PAGE_SIZE) )
+ ERROR("verify pfn %lx failed (type %#x)",
+ pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+ }
+ else
+ {
+ /* Regular mode - copy incoming data into place. */
+ memcpy(guest_page, page_data, PAGE_SIZE);
+ }
+
+ ++j;
+ guest_page += PAGE_SIZE;
+ page_data += PAGE_SIZE;
+ }
+
+ done:
+ rc = 0;
+
+ err:
+ if ( mapping )
+ munmap(mapping, nr_pages * PAGE_SIZE);
+
+ free(map_errs);
+ free(mfns);
+
+ return rc;
+}
+
+/*
+ * Validate a PAGE_DATA record from the stream, and pass the results to
+ * process_page_data() to actually perform the legwork.
+ */
+static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_page_data_header *pages = rec->data;
+ unsigned i, pages_of_data = 0;
+ int rc = -1;
+
+ xen_pfn_t *pfns = NULL, pfn;
+ uint32_t *types = NULL, type;
+
+ if ( rec->length < sizeof(*pages) )
+ {
+ ERROR("PAGE_DATA record truncated: length %u, min %zu",
+ rec->length, sizeof(*pages));
+ goto err;
+ }
+ else if ( pages->count < 1 )
+ {
+ ERROR("Expected at least 1 pfn in PAGE_DATA record");
+ goto err;
+ }
+ else if ( rec->length < sizeof(*pages) + (pages->count * sizeof(uint64_t)) )
+ {
+ ERROR("PAGE_DATA record (length %u) too short to contain %u"
+ " pfns worth of information", rec->length, pages->count);
+ goto err;
+ }
+
+ pfns = malloc(pages->count * sizeof(*pfns));
+ types = malloc(pages->count * sizeof(*types));
+ if ( !pfns || !types )
+ {
+ ERROR("Unable to allocate enough memory for %u pfns",
+ pages->count);
+ goto err;
+ }
+
+ for ( i = 0; i < pages->count; ++i )
+ {
+ pfn = pages->pfn[i] & PAGE_DATA_PFN_MASK;
+ if ( !ctx->restore.ops.pfn_is_valid(ctx, pfn) )
+ {
+ ERROR("pfn %#lx (index %u) outside domain maximum", pfn, i);
+ goto err;
+ }
+
+ type = (pages->pfn[i] & PAGE_DATA_TYPE_MASK) >> 32;
+ if ( ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) >= 5) &&
+ ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) <= 8) )
+ {
+ ERROR("Invalid type %#x for pfn %#lx (index %u)", type, pfn, i);
+ goto err;
+ }
+ else if ( type < XEN_DOMCTL_PFINFO_BROKEN )
+ /* NOTAB and all L1 through L4 tables (including pinned) should
+ * have a page worth of data in the record. */
+ pages_of_data++;
+
+ pfns[i] = pfn;
+ types[i] = type;
+ }
+
+ if ( rec->length != (sizeof(*pages) +
+ (sizeof(uint64_t) * pages->count) +
+ (PAGE_SIZE * pages_of_data)) )
+ {
+ ERROR("PAGE_DATA record wrong size: length %u, expected "
+ "%zu + %zu + %lu", rec->length, sizeof(*pages),
+ (sizeof(uint64_t) * pages->count), (PAGE_SIZE * pages_of_data));
+ goto err;
+ }
+
+ rc = process_page_data(ctx, pages->count, pfns, types,
+ &pages->pfn[pages->count]);
+ err:
+ free(types);
+ free(pfns);
+
+ return rc;
+}
+
+static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec);
+
+static int handle_checkpoint(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc = 0, ret;
+ unsigned i;
+
+ if ( !ctx->restore.checkpointed )
+ {
+ ERROR("Found checkpoint in non-checkpointed stream");
+ rc = -1;
+ goto err;
+ }
+
+ ret = ctx->restore.callbacks->checkpoint(ctx->restore.callbacks->data);
+ switch ( ret )
+ {
+ case XGR_CHECKPOINT_SUCCESS:
+ break;
+
+ case XGR_CHECKPOINT_FAILOVER:
+ rc = BROKEN_CHANNEL;
+ goto err;
+
+ default: /* Other fatal error */
+ rc = -1;
+ goto err;
+ }
+
+ if ( ctx->restore.buffer_all_records )
+ {
+ IPRINTF("All records buffered");
+
+ for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
+ {
+ rc = process_record(ctx, &ctx->restore.buffered_records[i]);
+ if ( rc )
+ goto err;
+ }
+ ctx->restore.buffered_rec_num = 0;
+ IPRINTF("All records processed");
+ }
+ else
+ ctx->restore.buffer_all_records = true;
+
+ err:
+ return rc;
+}
+
+static int buffer_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned new_alloc_num;
+ struct xc_sr_record *p;
+
+ if ( ctx->restore.buffered_rec_num >= ctx->restore.allocated_rec_num )
+ {
+ new_alloc_num = ctx->restore.allocated_rec_num + DEFAULT_BUF_RECORDS;
+ p = realloc(ctx->restore.buffered_records,
+ new_alloc_num * sizeof(struct xc_sr_record));
+ if ( !p )
+ {
+ ERROR("Failed to realloc memory for buffered records");
+ return -1;
+ }
+
+ ctx->restore.buffered_records = p;
+ ctx->restore.allocated_rec_num = new_alloc_num;
+ }
+
+ memcpy(&ctx->restore.buffered_records[ctx->restore.buffered_rec_num++],
+ rec, sizeof(*rec));
+
+ return 0;
+}
+
+static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ int rc = 0;
+
+ switch ( rec->type )
+ {
+ case REC_TYPE_END:
+ break;
+
+ case REC_TYPE_PAGE_DATA:
+ rc = handle_page_data(ctx, rec);
+ break;
+
+ case REC_TYPE_VERIFY:
+ DPRINTF("Verify mode enabled");
+ ctx->restore.verify = true;
+ break;
+
+ case REC_TYPE_CHECKPOINT:
+ rc = handle_checkpoint(ctx);
+ break;
+
+ default:
+ rc = ctx->restore.ops.process_record(ctx, rec);
+ break;
+ }
+
+ free(rec->data);
+ rec->data = NULL;
+
+ return rc;
+}
+
+static int setup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ rc = ctx->restore.ops.setup(ctx);
+ if ( rc )
+ goto err;
+
+ ctx->restore.max_populated_pfn = (32 * 1024 / 4) - 1;
+ ctx->restore.populated_pfns = bitmap_alloc(
+ ctx->restore.max_populated_pfn + 1);
+ if ( !ctx->restore.populated_pfns )
+ {
+ ERROR("Unable to allocate memory for populated_pfns bitmap");
+ rc = -1;
+ goto err;
+ }
+
+ ctx->restore.buffered_records = malloc(
+ DEFAULT_BUF_RECORDS * sizeof(struct xc_sr_record));
+ if ( !ctx->restore.buffered_records )
+ {
+ ERROR("Unable to allocate memory for buffered records");
+ rc = -1;
+ goto err;
+ }
+ ctx->restore.allocated_rec_num = DEFAULT_BUF_RECORDS;
+
+ err:
+ return rc;
+}
+
+static void cleanup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned i;
+
+ for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
+ free(ctx->restore.buffered_records[i].data);
+
+ free(ctx->restore.buffered_records);
+ free(ctx->restore.populated_pfns);
+ if ( ctx->restore.ops.cleanup(ctx) )
+ PERROR("Failed to clean up");
+}
+
+/*
+ * Restore a domain.
+ */
+static int restore(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_record rec;
+ int rc, saved_rc = 0, saved_errno = 0;
+
+ IPRINTF("Restoring domain");
+
+ rc = setup(ctx);
+ if ( rc )
+ goto err;
+
+ do
+ {
+ rc = read_record(ctx, &rec);
+ if ( rc )
+ {
+ if ( ctx->restore.buffer_all_records )
+ goto remus_failover;
+ else
+ goto err;
+ }
+
+ if ( ctx->restore.buffer_all_records &&
+ rec.type != REC_TYPE_END &&
+ rec.type != REC_TYPE_CHECKPOINT )
+ {
+ rc = buffer_record(ctx, &rec);
+ if ( rc )
+ goto err;
+ }
+ else
+ {
+ rc = process_record(ctx, &rec);
+ if ( rc == RECORD_NOT_PROCESSED )
+ {
+ if ( rec.type & REC_TYPE_OPTIONAL )
+ DPRINTF("Ignoring optional record %#x (%s)",
+ rec.type, rec_type_to_str(rec.type));
+ else
+ {
+ ERROR("Mandatory record %#x (%s) not handled",
+ rec.type, rec_type_to_str(rec.type));
+ rc = -1;
+ goto err;
+ }
+ }
+ else if ( rc == BROKEN_CHANNEL )
+ goto remus_failover;
+ else if ( rc )
+ goto err;
+ }
+
+ } while ( rec.type != REC_TYPE_END );
+
+ remus_failover:
+ /*
+ * With Remus, if we reach here there must have been an error on the
+ * primary, so fail over from the last checkpoint state.
+ */
+ rc = ctx->restore.ops.stream_complete(ctx);
+ if ( rc )
+ goto err;
+
+ IPRINTF("Restore successful");
+ goto done;
+
+ err:
+ saved_errno = errno;
+ saved_rc = rc;
+ PERROR("Restore failed");
+
+ done:
+ cleanup(ctx);
+
+ if ( saved_rc )
+ {
+ rc = saved_rc;
+ errno = saved_errno;
+ }
+
+ return rc;
+}
+
+int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
+ unsigned int store_evtchn, unsigned long *store_mfn,
+ domid_t store_domid, unsigned int console_evtchn,
+ unsigned long *console_gfn, domid_t console_domid,
+ unsigned int hvm, unsigned int pae, int superpages,
+ int checkpointed_stream,
+ struct restore_callbacks *callbacks)
+{
+ struct xc_sr_context ctx =
+ {
+ .xch = xch,
+ .fd = io_fd,
+ };
+
+ /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions. */
+ ctx.restore.console_evtchn = console_evtchn;
+ ctx.restore.console_domid = console_domid;
+ ctx.restore.xenstore_evtchn = store_evtchn;
+ ctx.restore.xenstore_domid = store_domid;
+ ctx.restore.checkpointed = checkpointed_stream;
+ ctx.restore.callbacks = callbacks;
+
+ /* Sanity checks for callbacks. */
+ if ( checkpointed_stream )
+ assert(callbacks->checkpoint);
+
+ DPRINTF("fd %d, dom %u, hvm %u, pae %u, superpages %d"
+ ", checkpointed_stream %d", io_fd, dom, hvm, pae,
+ superpages, checkpointed_stream);
+
+ if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
+ {
+ PERROR("Failed to get domain info");
+ return -1;
+ }
+
+ if ( ctx.dominfo.domid != dom )
+ {
+ ERROR("Domain %u does not exist", dom);
+ return -1;
+ }
+
+ ctx.domid = dom;
+
+ if ( read_headers(&ctx) )
+ return -1;
+
+ if ( ctx.dominfo.hvm )
+ {
+ ctx.restore.ops = restore_ops_x86_hvm;
+ if ( restore(&ctx) )
+ return -1;
+ }
+ else
+ {
+ ctx.restore.ops = restore_ops_x86_pv;
+ if ( restore(&ctx) )
+ return -1;
+ }
+
+ IPRINTF("XenStore: mfn %#lx, dom %d, evt %u",
+ ctx.restore.xenstore_gfn,
+ ctx.restore.xenstore_domid,
+ ctx.restore.xenstore_evtchn);
+
+ IPRINTF("Console: mfn %#lx, dom %d, evt %u",
+ ctx.restore.console_gfn,
+ ctx.restore.console_domid,
+ ctx.restore.console_evtchn);
+
+ *console_gfn = ctx.restore.console_gfn;
+ *store_mfn = ctx.restore.xenstore_gfn;
+
+ return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_restore_x86_hvm.c b/tools/libxc/xc_sr_restore_x86_hvm.c
new file mode 100644
index 0000000..49d22c7
--- /dev/null
+++ b/tools/libxc/xc_sr_restore_x86_hvm.c
@@ -0,0 +1,233 @@
+#include <assert.h>
+#include <arpa/inet.h>
+
+#include "xc_sr_common_x86.h"
+
+/*
+ * Process an HVM_CONTEXT record from the stream.
+ */
+static int handle_hvm_context(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ void *p;
+
+ p = malloc(rec->length);
+ if ( !p )
+ {
+ ERROR("Unable to allocate %u bytes for hvm context", rec->length);
+ return -1;
+ }
+
+ free(ctx->x86_hvm.restore.context);
+
+ ctx->x86_hvm.restore.context = memcpy(p, rec->data, rec->length);
+ ctx->x86_hvm.restore.contextsz = rec->length;
+
+ return 0;
+}
+
+/*
+ * Process an HVM_PARAMS record from the stream.
+ */
+static int handle_hvm_params(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_hvm_params *hdr = rec->data;
+ struct xc_sr_rec_hvm_params_entry *entry = hdr->param;
+ unsigned int i;
+ int rc;
+
+ if ( rec->length < sizeof(*hdr)
+ || rec->length < sizeof(*hdr) + hdr->count * sizeof(*entry) )
+ {
+ ERROR("hvm_params record is too short");
+ return -1;
+ }
+
+ for ( i = 0; i < hdr->count; i++, entry++ )
+ {
+ switch ( entry->index )
+ {
+ case HVM_PARAM_CONSOLE_PFN:
+ ctx->restore.console_gfn = entry->value;
+ xc_clear_domain_page(xch, ctx->domid, entry->value);
+ break;
+ case HVM_PARAM_STORE_PFN:
+ ctx->restore.xenstore_gfn = entry->value;
+ xc_clear_domain_page(xch, ctx->domid, entry->value);
+ break;
+ case HVM_PARAM_IOREQ_PFN:
+ case HVM_PARAM_BUFIOREQ_PFN:
+ xc_clear_domain_page(xch, ctx->domid, entry->value);
+ break;
+ }
+
+ rc = xc_hvm_param_set(xch, ctx->domid, entry->index, entry->value);
+ if ( rc < 0 )
+ {
+ PERROR("set HVM param %"PRId64" = 0x%016"PRIx64,
+ entry->index, entry->value);
+ return rc;
+ }
+ }
+ return 0;
+}
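
The two-clause length check above is deliberate: hdr->count lives inside the
header, so the header must be known to fit before count can be trusted to
size the entry array. A standalone sketch of the same validation (hypothetical
struct names; the sketch additionally widens the multiplication to 64 bits so
a huge count cannot wrap on a 32-bit build):

    #include <stdbool.h>
    #include <stdint.h>

    struct ex_params_hdr   { uint32_t count; uint32_t _res; };
    struct ex_params_entry { uint64_t index; uint64_t value; };

    static bool ex_record_fits(uint32_t length, const struct ex_params_hdr *hdr)
    {
        /* First: does the header itself fit?  Only then is 'count' valid. */
        return length >= sizeof(*hdr) &&
               length >= sizeof(*hdr) +
                         (uint64_t)hdr->count * sizeof(struct ex_params_entry);
    }
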
+
+/* restore_ops function. */
+static bool x86_hvm_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ return true;
+}
+
+/* restore_ops function. */
+static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
+ xen_pfn_t pfn)
+{
+ return pfn;
+}
+
+/* restore_ops function. */
+static void x86_hvm_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
+ xen_pfn_t gfn)
+{
+ /* no-op */
+}
+
+/* restore_ops function. */
+static void x86_hvm_set_page_type(struct xc_sr_context *ctx,
+ xen_pfn_t pfn, xen_pfn_t type)
+{
+ /* no-op */
+}
+
+/* restore_ops function. */
+static int x86_hvm_localise_page(struct xc_sr_context *ctx,
+ uint32_t type, void *page)
+{
+ /* no-op */
+ return 0;
+}
+
+/*
+ * restore_ops function. Confirms the stream matches the domain.
+ */
+static int x86_hvm_setup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+
+ if ( ctx->restore.guest_type != DHDR_TYPE_X86_HVM )
+ {
+ ERROR("Unable to restore %s domain into an x86_hvm domain",
+ dhdr_type_to_str(ctx->restore.guest_type));
+ return -1;
+ }
+ else if ( ctx->restore.guest_page_size != PAGE_SIZE )
+ {
+ ERROR("Invalid page size %u for x86_hvm domains",
+ ctx->restore.guest_page_size);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * restore_ops function.
+ */
+static int x86_hvm_process_record(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ switch ( rec->type )
+ {
+ case REC_TYPE_TSC_INFO:
+ return handle_tsc_info(ctx, rec);
+
+ case REC_TYPE_HVM_CONTEXT:
+ return handle_hvm_context(ctx, rec);
+
+ case REC_TYPE_HVM_PARAMS:
+ return handle_hvm_params(ctx, rec);
+
+ default:
+ return RECORD_NOT_PROCESSED;
+ }
+}
+
+/*
+ * restore_ops function. Sets extra hvm parameters and seeds the grant table.
+ */
+static int x86_hvm_stream_complete(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ rc = xc_hvm_param_set(xch, ctx->domid, HVM_PARAM_STORE_EVTCHN,
+ ctx->restore.xenstore_evtchn);
+ if ( rc )
+ {
+ PERROR("Failed to set HVM_PARAM_STORE_EVTCHN");
+ return rc;
+ }
+
+ rc = xc_hvm_param_set(xch, ctx->domid, HVM_PARAM_CONSOLE_EVTCHN,
+ ctx->restore.console_evtchn);
+ if ( rc )
+ {
+ PERROR("Failed to set HVM_PARAM_CONSOLE_EVTCHN");
+ return rc;
+ }
+
+ rc = xc_domain_hvm_setcontext(xch, ctx->domid,
+ ctx->x86_hvm.restore.context,
+ ctx->x86_hvm.restore.contextsz);
+ if ( rc < 0 )
+ {
+ PERROR("Unable to restore HVM context");
+ return rc;
+ }
+
+ rc = xc_dom_gnttab_hvm_seed(xch, ctx->domid,
+ ctx->restore.console_gfn,
+ ctx->restore.xenstore_gfn,
+ ctx->restore.console_domid,
+ ctx->restore.xenstore_domid);
+ if ( rc )
+ {
+ PERROR("Failed to seed grant table");
+ return rc;
+ }
+
+ return rc;
+}
+
+static int x86_hvm_cleanup(struct xc_sr_context *ctx)
+{
+ free(ctx->x86_hvm.restore.context);
+
+ return 0;
+}
+
+struct xc_sr_restore_ops restore_ops_x86_hvm =
+{
+ .pfn_is_valid = x86_hvm_pfn_is_valid,
+ .pfn_to_gfn = x86_hvm_pfn_to_gfn,
+ .set_gfn = x86_hvm_set_gfn,
+ .set_page_type = x86_hvm_set_page_type,
+ .localise_page = x86_hvm_localise_page,
+ .setup = x86_hvm_setup,
+ .process_record = x86_hvm_process_record,
+ .stream_complete = x86_hvm_stream_complete,
+ .cleanup = x86_hvm_cleanup,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_restore_x86_pv.c b/tools/libxc/xc_sr_restore_x86_pv.c
new file mode 100644
index 0000000..bc604b3
--- /dev/null
+++ b/tools/libxc/xc_sr_restore_x86_pv.c
@@ -0,0 +1,1165 @@
+#include <assert.h>
+
+#include "xc_sr_common_x86_pv.h"
+
+static xen_pfn_t pfn_to_mfn(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ assert(pfn <= ctx->x86_pv.max_pfn);
+
+ return xc_pfn_to_mfn(pfn, ctx->x86_pv.p2m, ctx->x86_pv.width);
+}
+
+/*
+ * Expand our local tracking information for the p2m table and domains maximum
+ * size. Normally this will be called once to expand from 0 to max_pfn, but
+ * is liable to expand multiple times if the domain grows on the sending side
+ * after migration has started.
+ */
+static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned long old_max = ctx->x86_pv.max_pfn, i;
+ unsigned int fpp = PAGE_SIZE / ctx->x86_pv.width;
+ unsigned long end_frame = (max_pfn / fpp) + 1;
+ unsigned long old_end_frame = (old_max / fpp) + 1;
+ xen_pfn_t *p2m = NULL, *p2m_pfns = NULL;
+ uint32_t *pfn_types = NULL;
+ size_t p2msz, p2m_pfnsz, pfn_typesz;
+
+ assert(max_pfn > old_max);
+
+ p2msz = (max_pfn + 1) * ctx->x86_pv.width;
+ p2m = realloc(ctx->x86_pv.p2m, p2msz);
+ if ( !p2m )
+ {
+ ERROR("Failed to (re)alloc %zu bytes for p2m", p2msz);
+ return -1;
+ }
+ ctx->x86_pv.p2m = p2m;
+
+ pfn_typesz = (max_pfn + 1) * sizeof(*pfn_types);
+ pfn_types = realloc(ctx->x86_pv.restore.pfn_types, pfn_typesz);
+ if ( !pfn_types )
+ {
+ ERROR("Failed to (re)alloc %zu bytes for pfn_types", pfn_typesz);
+ return -1;
+ }
+ ctx->x86_pv.restore.pfn_types = pfn_types;
+
+ p2m_pfnsz = (end_frame + 1) * sizeof(*p2m_pfns);
+ p2m_pfns = realloc(ctx->x86_pv.p2m_pfns, p2m_pfnsz);
+ if ( !p2m_pfns )
+ {
+ ERROR("Failed to (re)alloc %zu bytes for p2m frame list", p2m_pfnsz);
+ return -1;
+ }
+ ctx->x86_pv.p2m_frames = end_frame;
+ ctx->x86_pv.p2m_pfns = p2m_pfns;
+
+ ctx->x86_pv.max_pfn = max_pfn;
+ for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
+ {
+ ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN);
+ ctx->restore.ops.set_page_type(ctx, i, 0);
+ }
+
+ for ( i = (old_end_frame ? old_end_frame + 1 : 0); i <= end_frame; ++i )
+ ctx->x86_pv.p2m_pfns[i] = INVALID_MFN;
+
+ DPRINTF("Changed max_pfn from %#lx to %#lx", old_max, max_pfn);
+ return 0;
+}
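
The sizing arithmetic in expand_p2m is driven by fpp, the number of p2m
entries per page: one page holds PAGE_SIZE / width entries, where width is the
guest's pointer size in bytes. A worked standalone example of those formulas
(values are illustrative):

    #include <stdio.h>

    #define EX_PAGE_SIZE 4096

    int main(void)
    {
        unsigned int width = 8;                    /* 64-bit guest entries */
        unsigned int fpp = EX_PAGE_SIZE / width;   /* 512 entries per frame */
        unsigned long max_pfn = 0x1ffff;           /* ~512 MiB guest, say */
        unsigned long end_frame = (max_pfn / fpp) + 1;

        /* 131071 / 512 = 255, so end_frame = 256. */
        printf("fpp = %u, end_frame = %lu\n", fpp, end_frame);
        return 0;
    }
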
+
+/*
+ * Pin all of the pagetables.
+ */
+static int pin_pagetables(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned long i, nr_pins;
+ struct mmuext_op pin[MAX_PIN_BATCH];
+
+ for ( i = nr_pins = 0; i <= ctx->x86_pv.max_pfn; ++i )
+ {
+ if ( (ctx->x86_pv.restore.pfn_types[i] &
+ XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
+ continue;
+
+ switch ( (ctx->x86_pv.restore.pfn_types[i] &
+ XEN_DOMCTL_PFINFO_LTABTYPE_MASK) )
+ {
+ case XEN_DOMCTL_PFINFO_L1TAB:
+ pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
+ break;
+ case XEN_DOMCTL_PFINFO_L2TAB:
+ pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
+ break;
+ case XEN_DOMCTL_PFINFO_L3TAB:
+ pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
+ break;
+ case XEN_DOMCTL_PFINFO_L4TAB:
+ pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
+ break;
+ default:
+ continue;
+ }
+
+ pin[nr_pins].arg1.mfn = pfn_to_mfn(ctx, i);
+ nr_pins++;
+
+ if ( nr_pins == MAX_PIN_BATCH )
+ {
+ if ( xc_mmuext_op(xch, pin, nr_pins, ctx->domid) != 0 )
+ {
+ PERROR("Failed to pin batch of pagetables");
+ return -1;
+ }
+ nr_pins = 0;
+ }
+ }
+
+ if ( (nr_pins > 0) && (xc_mmuext_op(xch, pin, nr_pins, ctx->domid) < 0) )
+ {
+ PERROR("Failed to pin batch of pagetables");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Update details in a guest's start_info structure.
+ */
+static int process_start_info(struct xc_sr_context *ctx,
+ vcpu_guest_context_any_t *vcpu)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t pfn, mfn;
+ start_info_any_t *guest_start_info = NULL;
+ int rc = -1;
+
+ pfn = GET_FIELD(vcpu, user_regs.edx, ctx->x86_pv.width);
+
+ if ( pfn > ctx->x86_pv.max_pfn )
+ {
+ ERROR("Start Info pfn %#lx out of range", pfn);
+ goto err;
+ }
+ else if ( ctx->x86_pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
+ {
+ ERROR("Start Info pfn %#lx has bad type %u", pfn,
+ (ctx->x86_pv.restore.pfn_types[pfn] >>
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT));
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Start Info has bad mfn");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ SET_FIELD(vcpu, user_regs.edx, mfn, ctx->x86_pv.width);
+ guest_start_info = xc_map_foreign_range(
+ xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
+ if ( !guest_start_info )
+ {
+ PERROR("Failed to map Start Info at mfn %#lx", mfn);
+ goto err;
+ }
+
+ /* Deal with xenstore stuff */
+ pfn = GET_FIELD(guest_start_info, store_mfn, ctx->x86_pv.width);
+ if ( pfn > ctx->x86_pv.max_pfn )
+ {
+ ERROR("XenStore pfn %#lx out of range", pfn);
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("XenStore pfn has bad mfn");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ ctx->restore.xenstore_gfn = mfn;
+ SET_FIELD(guest_start_info, store_mfn, mfn, ctx->x86_pv.width);
+ SET_FIELD(guest_start_info, store_evtchn,
+ ctx->restore.xenstore_evtchn, ctx->x86_pv.width);
+
+ /* Deal with console stuff */
+ pfn = GET_FIELD(guest_start_info, console.domU.mfn, ctx->x86_pv.width);
+ if ( pfn > ctx->x86_pv.max_pfn )
+ {
+ ERROR("Console pfn %#lx out of range", pfn);
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Console pfn has bad mfn");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ ctx->restore.console_gfn = mfn;
+ SET_FIELD(guest_start_info, console.domU.mfn, mfn, ctx->x86_pv.width);
+ SET_FIELD(guest_start_info, console.domU.evtchn,
+ ctx->restore.console_evtchn, ctx->x86_pv.width);
+
+ /* Set other information */
+ SET_FIELD(guest_start_info, nr_pages,
+ ctx->x86_pv.max_pfn + 1, ctx->x86_pv.width);
+ SET_FIELD(guest_start_info, shared_info,
+ ctx->dominfo.shared_info_frame << PAGE_SHIFT, ctx->x86_pv.width);
+ SET_FIELD(guest_start_info, flags, 0, ctx->x86_pv.width);
+
+ rc = 0;
+
+err:
+ if ( guest_start_info )
+ munmap(guest_start_info, PAGE_SIZE);
+
+ return rc;
+}
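
GET_FIELD/SET_FIELD hide the fact that 32-bit and 64-bit guests lay out
start_info differently; each access dispatches on the guest width. A sketch of
the general shape, assuming a union of per-width layouts (the stand-in type
and macros below are illustrative, not the real libxc definitions):

    #include <stdint.h>

    union ex_start_info_any {
        struct { uint64_t store_mfn; } x64;
        struct { uint32_t store_mfn; } x32;
    };

    #define EX_GET_FIELD(p, fld, width) \
        (((width) == 8) ? (p)->x64.fld : (p)->x32.fld)

    #define EX_SET_FIELD(p, fld, val, width)              \
        do {                                              \
            if ( (width) == 8 ) (p)->x64.fld = (val);     \
            else                (p)->x32.fld = (val);     \
        } while ( 0 )
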
+
+/*
+ * Process one stashed vcpu worth of basic state and send to Xen.
+ */
+static int process_vcpu_basic(struct xc_sr_context *ctx,
+ unsigned int vcpuid)
+{
+ xc_interface *xch = ctx->xch;
+ vcpu_guest_context_any_t vcpu;
+ xen_pfn_t pfn, mfn;
+ unsigned i, gdt_count;
+ int rc = -1;
+
+ memcpy(&vcpu, ctx->x86_pv.restore.vcpus[vcpuid].basic,
+ ctx->x86_pv.restore.vcpus[vcpuid].basicsz);
+
+ /* Vcpu 0 is special: Convert the suspend record to an mfn. */
+ if ( vcpuid == 0 )
+ {
+ rc = process_start_info(ctx, &vcpu);
+ if ( rc )
+ return rc;
+ rc = -1;
+ }
+
+ SET_FIELD(&vcpu, flags,
+ GET_FIELD(&vcpu, flags, ctx->x86_pv.width) | VGCF_online,
+ ctx->x86_pv.width);
+
+ gdt_count = GET_FIELD(&vcpu, gdt_ents, ctx->x86_pv.width);
+ if ( gdt_count > FIRST_RESERVED_GDT_ENTRY )
+ {
+ ERROR("GDT entry count (%u) out of range (max %u)",
+ gdt_count, FIRST_RESERVED_GDT_ENTRY);
+ errno = ERANGE;
+ goto err;
+ }
+ gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */
+
+ /* Convert GDT frames to mfns. */
+ for ( i = 0; i < gdt_count; ++i )
+ {
+ pfn = GET_FIELD(&vcpu, gdt_frames[i], ctx->x86_pv.width);
+ if ( pfn > ctx->x86_pv.max_pfn )
+ {
+ ERROR("GDT frame %u (pfn %#lx) out of range", i, pfn);
+ goto err;
+ }
+ else if ( (ctx->x86_pv.restore.pfn_types[pfn] !=
+ XEN_DOMCTL_PFINFO_NOTAB) )
+ {
+ ERROR("GDT frame %u (pfn %#lx) has bad type %u", i, pfn,
+ (ctx->x86_pv.restore.pfn_types[pfn] >>
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT));
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("GDT frame %u has bad mfn", i);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ SET_FIELD(&vcpu, gdt_frames[i], mfn, ctx->x86_pv.width);
+ }
+
+ /* Convert CR3 to an mfn. */
+ pfn = cr3_to_mfn(ctx, GET_FIELD(&vcpu, ctrlreg[3], ctx->x86_pv.width));
+ if ( pfn > ctx->x86_pv.max_pfn )
+ {
+ ERROR("cr3 (pfn %#lx) out of range", pfn);
+ goto err;
+ }
+ else if ( (ctx->x86_pv.restore.pfn_types[pfn] &
+ XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
+ (((xen_pfn_t)ctx->x86_pv.levels) <<
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
+ {
+ ERROR("cr3 (pfn %#lx) has bad type %u, expected %u", pfn,
+ (ctx->x86_pv.restore.pfn_types[pfn] >>
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT),
+ ctx->x86_pv.levels);
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("cr3 has bad mfn");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ SET_FIELD(&vcpu, ctrlreg[3], mfn_to_cr3(ctx, mfn), ctx->x86_pv.width);
+
+ /* 64bit guests: Convert CR1 (guest pagetables) to mfn. */
+ if ( ctx->x86_pv.levels == 4 && (vcpu.x64.ctrlreg[1] & 1) )
+ {
+ pfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT;
+
+ if ( pfn > ctx->x86_pv.max_pfn )
+ {
+ ERROR("cr1 (pfn %#lx) out of range", pfn);
+ goto err;
+ }
+ else if ( (ctx->x86_pv.restore.pfn_types[pfn] &
+ XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
+ (((xen_pfn_t)ctx->x86_pv.levels) <<
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
+ {
+ ERROR("cr1 (pfn %#lx) has bad type %u, expected %u", pfn,
+ (ctx->x86_pv.restore.pfn_types[pfn] >>
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT),
+ ctx->x86_pv.levels);
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("cr1 has bad mfn");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ vcpu.x64.ctrlreg[1] = (uint64_t)mfn << PAGE_SHIFT;
+ }
+
+ if ( xc_vcpu_setcontext(xch, ctx->domid, vcpuid, &vcpu) )
+ {
+ PERROR("Failed to set vcpu%u's basic info", vcpuid);
+ goto err;
+ }
+
+ rc = 0;
+
+ err:
+ return rc;
+}
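
The GDT conversion above relies on each frame holding 4096 / 8 = 512
descriptors, so the entry count is rounded up to whole frames with the usual
(n + 511) / 512 ceiling division. A standalone check of that arithmetic:

    #include <stdio.h>

    int main(void)
    {
        unsigned int gdt_ents = 515;                  /* example entry count */
        unsigned int frames = (gdt_ents + 511) / 512; /* ceiling division */

        printf("%u entries -> %u frame(s)\n", gdt_ents, frames);  /* 2 */
        return 0;
    }
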
+
+/*
+ * Process one stashed vcpu worth of extended state and send to Xen.
+ */
+static int process_vcpu_extended(struct xc_sr_context *ctx,
+ unsigned int vcpuid)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_x86_pv_restore_vcpu *vcpu =
+ &ctx->x86_pv.restore.vcpus[vcpuid];
+ DECLARE_DOMCTL;
+
+ domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
+ domctl.domain = ctx->domid;
+ memcpy(&domctl.u.ext_vcpucontext, vcpu->extd, vcpu->extdsz);
+
+ if ( xc_domctl(xch, &domctl) != 0 )
+ {
+ PERROR("Failed to set vcpu%u's extended info", vcpuid);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Process one stashed vcpu worth of xsave state and send to Xen.
+ */
+static int process_vcpu_xsave(struct xc_sr_context *ctx,
+ unsigned int vcpuid)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_x86_pv_restore_vcpu *vcpu =
+ &ctx->x86_pv.restore.vcpus[vcpuid];
+ int rc;
+ DECLARE_DOMCTL;
+ DECLARE_HYPERCALL_BUFFER(void, buffer);
+
+ buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->xsavesz);
+ if ( !buffer )
+ {
+ ERROR("Unable to allocate %zu bytes for xsave hypercall buffer",
+ vcpu->xsavesz);
+ return -1;
+ }
+
+ domctl.cmd = XEN_DOMCTL_setvcpuextstate;
+ domctl.domain = ctx->domid;
+ domctl.u.vcpuextstate.vcpu = vcpuid;
+ domctl.u.vcpuextstate.size = vcpu->xsavesz;
+ set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
+
+ memcpy(buffer, vcpu->xsave, vcpu->xsavesz);
+
+ rc = xc_domctl(xch, &domctl);
+ if ( rc )
+ PERROR("Failed to set vcpu%u's xsave info", vcpuid);
+
+ xc_hypercall_buffer_free(xch, buffer);
+
+ return rc;
+}
+
+/*
+ * Process one stashed vcpu worth of msr state and send to Xen.
+ */
+static int process_vcpu_msrs(struct xc_sr_context *ctx,
+ unsigned int vcpuid)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_x86_pv_restore_vcpu *vcpu =
+ &ctx->x86_pv.restore.vcpus[vcpuid];
+ int rc;
+ DECLARE_DOMCTL;
+ DECLARE_HYPERCALL_BUFFER(void, buffer);
+
+ buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->msrsz);
+ if ( !buffer )
+ {
+ ERROR("Unable to allocate %zu bytes for msr hypercall buffer",
+ vcpu->msrsz);
+ return -1;
+ }
+
+ domctl.cmd = XEN_DOMCTL_set_vcpu_msrs;
+ domctl.domain = ctx->domid;
+ domctl.u.vcpu_msrs.vcpu = vcpuid;
+ domctl.u.vcpu_msrs.msr_count = vcpu->msrsz / sizeof(xen_domctl_vcpu_msr_t);
+ set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);
+
+ memcpy(buffer, vcpu->msr, vcpu->msrsz);
+
+ rc = xc_domctl(xch, &domctl);
+ if ( rc )
+ PERROR("Failed to set vcpu%u's msrs", vcpuid);
+
+ xc_hypercall_buffer_free(xch, buffer);
+
+ return rc;
+}
+
+/*
+ * Process all stashed vcpu context and send to Xen.
+ */
+static int update_vcpu_context(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_x86_pv_restore_vcpu *vcpu;
+ unsigned i;
+ int rc = 0;
+
+ for ( i = 0; i < ctx->x86_pv.restore.nr_vcpus; ++i )
+ {
+ vcpu = &ctx->x86_pv.restore.vcpus[i];
+
+ if ( vcpu->basic )
+ {
+ rc = process_vcpu_basic(ctx, i);
+ if ( rc )
+ return rc;
+ }
+ else if ( i == 0 )
+ {
+ ERROR("Sender didn't send vcpu0's basic state");
+ return -1;
+ }
+
+ if ( vcpu->extd )
+ {
+ rc = process_vcpu_extended(ctx, i);
+ if ( rc )
+ return rc;
+ }
+
+ if ( vcpu->xsave )
+ {
+ rc = process_vcpu_xsave(ctx, i);
+ if ( rc )
+ return rc;
+ }
+
+ if ( vcpu->msr )
+ {
+ rc = process_vcpu_msrs(ctx, i);
+ if ( rc )
+ return rc;
+ }
+ }
+
+ return rc;
+}
+
+/*
+ * Copy the p2m which has been constructed locally as memory has been
+ * allocated, over the p2m in guest, so the guest can find its memory again on
+ * resume.
+ */
+static int update_guest_p2m(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t mfn, pfn, *guest_p2m = NULL;
+ unsigned i;
+ int rc = -1;
+
+ for ( i = 0; i < ctx->x86_pv.p2m_frames; ++i )
+ {
+ pfn = ctx->x86_pv.p2m_pfns[i];
+
+ if ( pfn > ctx->x86_pv.max_pfn )
+ {
+ ERROR("pfn (%#lx) for p2m_frame_list[%u] out of range",
+ pfn, i);
+ goto err;
+ }
+ else if ( (ctx->x86_pv.restore.pfn_types[pfn] !=
+ XEN_DOMCTL_PFINFO_NOTAB) )
+ {
+ ERROR("pfn (%#lx) for p2m_frame_list[%u] has bad type %u", pfn, i,
+ (ctx->x86_pv.restore.pfn_types[pfn] >>
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT));
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("p2m_frame_list[%u] has bad mfn", i);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ ctx->x86_pv.p2m_pfns[i] = mfn;
+ }
+
+ guest_p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_WRITE,
+ ctx->x86_pv.p2m_pfns,
+ ctx->x86_pv.p2m_frames);
+ if ( !guest_p2m )
+ {
+ PERROR("Failed to map p2m frames");
+ goto err;
+ }
+
+ memcpy(guest_p2m, ctx->x86_pv.p2m,
+ (ctx->x86_pv.max_pfn + 1) * ctx->x86_pv.width);
+ rc = 0;
+ err:
+ if ( guest_p2m )
+ munmap(guest_p2m, ctx->x86_pv.p2m_frames * PAGE_SIZE);
+
+ return rc;
+}
+
+/*
+ * Process an X86_PV_INFO record.
+ */
+static int handle_x86_pv_info(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_x86_pv_info *info = rec->data;
+
+ if ( ctx->x86_pv.restore.seen_pv_info )
+ {
+ ERROR("Already received X86_PV_INFO record");
+ return -1;
+ }
+
+ if ( rec->length < sizeof(*info) )
+ {
+ ERROR("X86_PV_INFO record truncated: length %u, expected %zu",
+ rec->length, sizeof(*info));
+ return -1;
+ }
+ else if ( info->guest_width != 4 &&
+ info->guest_width != 8 )
+ {
+ ERROR("Unexpected guest width %u, Expected 4 or 8",
+ info->guest_width);
+ return -1;
+ }
+ else if ( info->guest_width != ctx->x86_pv.width )
+ {
+ int rc;
+ struct xen_domctl domctl;
+
+ /* Try to set address size, domain is always created 64 bit. */
+ memset(&domctl, 0, sizeof(domctl));
+ domctl.domain = ctx->domid;
+ domctl.cmd = XEN_DOMCTL_set_address_size;
+ domctl.u.address_size.size = info->guest_width * 8;
+ rc = do_domctl(xch, &domctl);
+ if ( rc != 0 )
+ {
+ ERROR("Width of guest in stream (%u"
+ " bits) differs with existing domain (%u bits)",
+ info->guest_width * 8, ctx->x86_pv.width * 8);
+ return -1;
+ }
+
+ /* Domain's information changed, better to refresh. */
+ rc = x86_pv_domain_info(ctx);
+ if ( rc != 0 )
+ {
+ ERROR("Unable to refresh guest information");
+ return -1;
+ }
+ }
+ else if ( info->pt_levels != 3 &&
+ info->pt_levels != 4 )
+ {
+ ERROR("Unexpected guest levels %u, Expected 3 or 4",
+ info->pt_levels);
+ return -1;
+ }
+ else if ( info->pt_levels != ctx->x86_pv.levels )
+ {
+ ERROR("Levels of guest in stream (%u"
+ ") differs with existing domain (%u)",
+ info->pt_levels, ctx->x86_pv.levels);
+ return -1;
+ }
+
+ ctx->x86_pv.restore.seen_pv_info = true;
+ return 0;
+}
+
+/*
+ * Process an X86_PV_P2M_FRAMES record. Takes care of expanding the local p2m
+ * state if needed.
+ */
+static int handle_x86_pv_p2m_frames(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_x86_pv_p2m_frames *data = rec->data;
+ unsigned start, end, x, fpp = PAGE_SIZE / ctx->x86_pv.width;
+ int rc;
+
+ if ( !ctx->x86_pv.restore.seen_pv_info )
+ {
+ ERROR("Not yet received X86_PV_INFO record");
+ return -1;
+ }
+
+ if ( rec->length < sizeof(*data) )
+ {
+ ERROR("X86_PV_P2M_FRAMES record truncated: length %u, min %zu",
+ rec->length, sizeof(*data) + sizeof(uint64_t));
+ return -1;
+ }
+ else if ( data->start_pfn > data->end_pfn )
+ {
+ ERROR("End pfn in stream (%#x) exceeds Start (%#x)",
+ data->end_pfn, data->start_pfn);
+ return -1;
+ }
+
+ start = data->start_pfn / fpp;
+ end = data->end_pfn / fpp + 1;
+
+ if ( rec->length != sizeof(*data) + ((end - start) * sizeof(uint64_t)) )
+ {
+ ERROR("X86_PV_P2M_FRAMES record wrong size: start_pfn %#x"
+ ", end_pfn %#x, length %u, expected %zu + (%u - %u) * %zu",
+ data->start_pfn, data->end_pfn, rec->length,
+ sizeof(*data), end, start, sizeof(uint64_t));
+ return -1;
+ }
+
+ if ( data->end_pfn > ctx->x86_pv.max_pfn )
+ {
+ rc = expand_p2m(ctx, data->end_pfn);
+ if ( rc )
+ return rc;
+ }
+
+ for ( x = 0; x < (end - start); ++x )
+ ctx->x86_pv.p2m_pfns[start + x] = data->p2m_pfns[x];
+
+ return 0;
+}
+
+/*
+ * Processes X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} records from the stream.
+ * The blobs are all stashed to one side as they need to be deferred until the
+ * very end of the stream, rather than being sent to Xen at the point they
+ * arrive in the stream. It performs all pre-hypercall size validation.
+ */
+static int handle_x86_pv_vcpu_blob(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_x86_pv_vcpu_hdr *vhdr = rec->data;
+ struct xc_sr_x86_pv_restore_vcpu *vcpu;
+ const char *rec_name;
+ size_t blobsz;
+ void *blob;
+ int rc = -1;
+
+ switch ( rec->type )
+ {
+ case REC_TYPE_X86_PV_VCPU_BASIC:
+ rec_name = "X86_PV_VCPU_BASIC";
+ break;
+
+ case REC_TYPE_X86_PV_VCPU_EXTENDED:
+ rec_name = "X86_PV_VCPU_EXTENDED";
+ break;
+
+ case REC_TYPE_X86_PV_VCPU_XSAVE:
+ rec_name = "X86_PV_VCPU_XSAVE";
+ break;
+
+ case REC_TYPE_X86_PV_VCPU_MSRS:
+ rec_name = "X86_PV_VCPU_MSRS";
+ break;
+
+ default:
+ ERROR("Unrecognised vcpu blob record %s (%u)",
+ rec_type_to_str(rec->type), rec->type);
+ goto out;
+ }
+
+ /* Confirm that there is a complete header. */
+ if ( rec->length <= sizeof(*vhdr) )
+ {
+ ERROR("%s record truncated: length %u, min %zu",
+ rec_name, rec->length, sizeof(*vhdr) + 1);
+ goto out;
+ }
+
+ blobsz = rec->length - sizeof(*vhdr);
+
+ /* Check that the vcpu id is within range. */
+ if ( vhdr->vcpu_id >= ctx->x86_pv.restore.nr_vcpus )
+ {
+ ERROR("%s record vcpu_id (%u) exceeds domain max (%u)",
+ rec_name, vhdr->vcpu_id, ctx->x86_pv.restore.nr_vcpus - 1);
+ goto out;
+ }
+
+ vcpu = &ctx->x86_pv.restore.vcpus[vhdr->vcpu_id];
+
+ /* Further per-record checks, where possible. */
+ switch ( rec->type )
+ {
+ case REC_TYPE_X86_PV_VCPU_BASIC:
+ {
+ size_t vcpusz = ctx->x86_pv.width == 8 ?
+ sizeof(vcpu_guest_context_x86_64_t) :
+ sizeof(vcpu_guest_context_x86_32_t);
+
+ if ( blobsz != vcpusz )
+ {
+ ERROR("%s record wrong size: expected %zu, got %u",
+ rec_name, sizeof(*vhdr) + vcpusz, rec->length);
+ goto out;
+ }
+ break;
+ }
+
+ case REC_TYPE_X86_PV_VCPU_EXTENDED:
+ if ( blobsz > 128 )
+ {
+ ERROR("%s record too long: max %zu, got %u",
+ rec_name, sizeof(*vhdr) + 128, rec->length);
+ goto out;
+ }
+ break;
+
+ case REC_TYPE_X86_PV_VCPU_MSRS:
+ if ( blobsz % sizeof(xen_domctl_vcpu_msr_t) != 0 )
+ {
+ ERROR("%s record payload size %zu expected to be a multiple of %zu",
+ rec_name, blobsz, sizeof(xen_domctl_vcpu_msr_t));
+ goto out;
+ }
+ break;
+ }
+
+ /* Allocate memory. */
+ blob = malloc(blobsz);
+ if ( !blob )
+ {
+ ERROR("Unable to allocate %zu bytes for vcpu%u %s blob",
+ blobsz, vhdr->vcpu_id, rec_name);
+ goto out;
+ }
+
+ memcpy(blob, &vhdr->context, blobsz);
+
+ /* Stash sideways for later. */
+ switch ( rec->type )
+ {
+#define RECSTORE(x, y) case REC_TYPE_X86_PV_ ## x: \
+ free(y); (y) = blob; (y ## sz) = blobsz; break
+
+ RECSTORE(VCPU_BASIC, vcpu->basic);
+ RECSTORE(VCPU_EXTENDED, vcpu->extd);
+ RECSTORE(VCPU_XSAVE, vcpu->xsave);
+ RECSTORE(VCPU_MSRS, vcpu->msr);
+#undef RECSTORE
+ }
+
+ rc = 0;
+
+ out:
+ return rc;
+}
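
The RECSTORE macro uses token pasting to derive the matching size field from
the pointer field. For instance, RECSTORE(VCPU_BASIC, vcpu->basic) expands
(reformatted here for readability) to:

    case REC_TYPE_X86_PV_VCPU_BASIC:
        free(vcpu->basic); (vcpu->basic) = blob; (vcpu->basicsz) = blobsz; break;

The ## operator pastes sz onto the final preprocessing token of the second
argument, which is how vcpu->basic yields the companion field vcpu->basicsz.
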
+
+/*
+ * Process a SHARED_INFO record from the stream.
+ */
+static int handle_shared_info(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned i;
+ int rc = -1;
+ shared_info_any_t *guest_shinfo = NULL;
+ const shared_info_any_t *old_shinfo = rec->data;
+
+ if ( !ctx->x86_pv.restore.seen_pv_info )
+ {
+ ERROR("Not yet received X86_PV_INFO record");
+ return -1;
+ }
+
+ if ( rec->length != PAGE_SIZE )
+ {
+ ERROR("X86_PV_SHARED_INFO record wrong size: length %u"
+ ", expected 4096", rec->length);
+ goto err;
+ }
+
+ guest_shinfo = xc_map_foreign_range(
+ xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
+ ctx->dominfo.shared_info_frame);
+ if ( !guest_shinfo )
+ {
+ PERROR("Failed to map Shared Info at mfn %#lx",
+ ctx->dominfo.shared_info_frame);
+ goto err;
+ }
+
+ MEMCPY_FIELD(guest_shinfo, old_shinfo, vcpu_info, ctx->x86_pv.width);
+ MEMCPY_FIELD(guest_shinfo, old_shinfo, arch, ctx->x86_pv.width);
+
+ SET_FIELD(guest_shinfo, arch.pfn_to_mfn_frame_list_list,
+ 0, ctx->x86_pv.width);
+
+ MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_pending, 0, ctx->x86_pv.width);
+ for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
+ SET_FIELD(guest_shinfo, vcpu_info[i].evtchn_pending_sel,
+ 0, ctx->x86_pv.width);
+
+ MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_mask, 0xff, ctx->x86_pv.width);
+
+ rc = 0;
+ err:
+
+ if ( guest_shinfo )
+ munmap(guest_shinfo, PAGE_SIZE);
+
+ return rc;
+}
+
+/* restore_ops function. */
+static bool x86_pv_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ return pfn <= ctx->x86_pv.max_pfn;
+}
+
+/* restore_ops function. */
+static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn,
+ unsigned long type)
+{
+ assert(pfn <= ctx->x86_pv.max_pfn);
+
+ ctx->x86_pv.restore.pfn_types[pfn] = type;
+}
+
+/* restore_ops function. */
+static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
+ xen_pfn_t mfn)
+{
+ assert(pfn <= ctx->x86_pv.max_pfn);
+
+ if ( ctx->x86_pv.width == sizeof(uint64_t) )
+ /* 64 bit guest. Need to expand INVALID_MFN for 32 bit toolstacks. */
+ ((uint64_t *)ctx->x86_pv.p2m)[pfn] = mfn == INVALID_MFN ? ~0ULL : mfn;
+ else
+ /* 32 bit guest. Can truncate INVALID_MFN for 64 bit toolstacks. */
+ ((uint32_t *)ctx->x86_pv.p2m)[pfn] = mfn;
+}
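
The asymmetry in x86_pv_set_gfn exists because INVALID_MFN is all-ones at the
toolstack's native width: a 32-bit toolstack writing a 64-bit guest's p2m must
widen it to ~0ULL, while truncating it for a 32-bit guest loses nothing. A
standalone illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned long invalid = ~0UL;  /* INVALID_MFN at toolstack width */
        uint64_t wide = (invalid == ~0UL) ? ~0ULL : (uint64_t)invalid;
        uint32_t narrow = (uint32_t)invalid;

        /* Even on a 32-bit build, 'wide' has all 64 bits set. */
        printf("wide = %#llx, narrow = %#x\n",
               (unsigned long long)wide, narrow);
        return 0;
    }
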
+
+/*
+ * restore_ops function. Convert pfns back to mfns in pagetables. Possibly
+ * needs to populate new frames if a PTE is found referring to a frame which
+ * hasn't yet been seen from PAGE_DATA records.
+ */
+static int x86_pv_localise_page(struct xc_sr_context *ctx,
+ uint32_t type, void *page)
+{
+ xc_interface *xch = ctx->xch;
+ uint64_t *table = page;
+ uint64_t pte;
+ unsigned i, to_populate;
+ xen_pfn_t pfns[(PAGE_SIZE / sizeof(uint64_t))];
+
+ type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+ /* Only page tables need localisation. */
+ if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
+ return 0;
+
+ /* Check to see whether we need to populate any new frames. */
+ for ( i = 0, to_populate = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
+ {
+ pte = table[i];
+
+ if ( pte & _PAGE_PRESENT )
+ {
+ xen_pfn_t pfn = pte_to_frame(pte);
+
+#ifdef __i386__
+ if ( pfn == INVALID_MFN )
+ {
+ ERROR("PTE truncation detected. L%u[%u] = %016"PRIx64,
+ type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
+ errno = E2BIG;
+ return -1;
+ }
+#endif
+
+ if ( pfn_to_mfn(ctx, pfn) == INVALID_MFN )
+ pfns[to_populate++] = pfn;
+ }
+ }
+
+ if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) )
+ return -1;
+
+ for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
+ {
+ pte = table[i];
+
+ if ( pte & _PAGE_PRESENT )
+ {
+ xen_pfn_t mfn, pfn;
+
+ pfn = pte_to_frame(pte);
+ mfn = pfn_to_mfn(ctx, pfn);
+
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Bad mfn for L%u[%u] - pte %"PRIx64,
+ type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ return -1;
+ }
+
+ table[i] = merge_pte(pte, mfn);
+ }
+ }
+
+ return 0;
+}
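
pte_to_frame and merge_pte (defined elsewhere in libxc) swap the frame number
held in a 64-bit PTE while preserving the flag bits. A sketch of the usual
x86-64 layout, where the frame number occupies bits 12-51 (the mask below is
an assumption for illustration, not the library's definition):

    #include <stdint.h>

    #define EX_FRAME_MASK 0x000ffffffffff000ULL  /* bits 12..51, assumed */

    static uint64_t ex_pte_to_frame(uint64_t pte)
    {
        return (pte & EX_FRAME_MASK) >> 12;
    }

    static uint64_t ex_merge_pte(uint64_t pte, uint64_t mfn)
    {
        /* Keep the flag bits, replace only the frame number. */
        return (pte & ~EX_FRAME_MASK) | ((mfn << 12) & EX_FRAME_MASK);
    }
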
+
+/*
+ * restore_ops function. Confirm that the incoming stream matches the type of
+ * domain we are attempting to restore into.
+ */
+static int x86_pv_setup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ if ( ctx->restore.guest_type != DHDR_TYPE_X86_PV )
+ {
+ ERROR("Unable to restore %s domain into an x86_pv domain",
+ dhdr_type_to_str(ctx->restore.guest_type));
+ return -1;
+ }
+ else if ( ctx->restore.guest_page_size != PAGE_SIZE )
+ {
+ ERROR("Invalid page size %d for x86_pv domains",
+ ctx->restore.guest_page_size);
+ return -1;
+ }
+
+ rc = x86_pv_domain_info(ctx);
+ if ( rc )
+ return rc;
+
+ ctx->x86_pv.restore.nr_vcpus = ctx->dominfo.max_vcpu_id + 1;
+ ctx->x86_pv.restore.vcpus = calloc(sizeof(struct xc_sr_x86_pv_restore_vcpu),
+ ctx->x86_pv.restore.nr_vcpus);
+ if ( !ctx->x86_pv.restore.vcpus )
+ {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ rc = x86_pv_map_m2p(ctx);
+ if ( rc )
+ return rc;
+
+ return rc;
+}
+
+/*
+ * restore_ops function.
+ */
+static int x86_pv_process_record(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ switch ( rec->type )
+ {
+ case REC_TYPE_X86_PV_INFO:
+ return handle_x86_pv_info(ctx, rec);
+
+ case REC_TYPE_X86_PV_P2M_FRAMES:
+ return handle_x86_pv_p2m_frames(ctx, rec);
+
+ case REC_TYPE_X86_PV_VCPU_BASIC:
+ case REC_TYPE_X86_PV_VCPU_EXTENDED:
+ case REC_TYPE_X86_PV_VCPU_XSAVE:
+ case REC_TYPE_X86_PV_VCPU_MSRS:
+ return handle_x86_pv_vcpu_blob(ctx, rec);
+
+ case REC_TYPE_SHARED_INFO:
+ return handle_shared_info(ctx, rec);
+
+ case REC_TYPE_TSC_INFO:
+ return handle_tsc_info(ctx, rec);
+
+ default:
+ return RECORD_NOT_PROCESSED;
+ }
+}
+
+/*
+ * restore_ops function. Update the vcpu context in Xen, pin the pagetables,
+ * rewrite the p2m and seed the grant table.
+ */
+static int x86_pv_stream_complete(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ rc = update_vcpu_context(ctx);
+ if ( rc )
+ return rc;
+
+ rc = pin_pagetables(ctx);
+ if ( rc )
+ return rc;
+
+ rc = update_guest_p2m(ctx);
+ if ( rc )
+ return rc;
+
+ rc = xc_dom_gnttab_seed(xch, ctx->domid,
+ ctx->restore.console_gfn,
+ ctx->restore.xenstore_gfn,
+ ctx->restore.console_domid,
+ ctx->restore.xenstore_domid);
+ if ( rc )
+ {
+ PERROR("Failed to seed grant table");
+ return rc;
+ }
+
+ return rc;
+}
+
+/*
+ * restore_ops function.
+ */
+static int x86_pv_cleanup(struct xc_sr_context *ctx)
+{
+ free(ctx->x86_pv.p2m);
+ free(ctx->x86_pv.p2m_pfns);
+
+ if ( ctx->x86_pv.restore.vcpus )
+ {
+ unsigned i;
+
+ for ( i = 0; i < ctx->x86_pv.restore.nr_vcpus; ++i )
+ {
+ struct xc_sr_x86_pv_restore_vcpu *vcpu =
+ &ctx->x86_pv.restore.vcpus[i];
+
+ free(vcpu->basic);
+ free(vcpu->extd);
+ free(vcpu->xsave);
+ free(vcpu->msr);
+ }
+
+ free(ctx->x86_pv.restore.vcpus);
+ }
+
+ free(ctx->x86_pv.restore.pfn_types);
+
+ if ( ctx->x86_pv.m2p )
+ munmap(ctx->x86_pv.m2p, ctx->x86_pv.nr_m2p_frames * PAGE_SIZE);
+
+ return 0;
+}
+
+struct xc_sr_restore_ops restore_ops_x86_pv =
+{
+ .pfn_is_valid = x86_pv_pfn_is_valid,
+ .pfn_to_gfn = pfn_to_mfn,
+ .set_page_type = x86_pv_set_page_type,
+ .set_gfn = x86_pv_set_gfn,
+ .localise_page = x86_pv_localise_page,
+ .setup = x86_pv_setup,
+ .process_record = x86_pv_process_record,
+ .stream_complete = x86_pv_stream_complete,
+ .cleanup = x86_pv_cleanup,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_save.c b/tools/libxc/xc_sr_save.c
new file mode 100644
index 0000000..7dc3a48
--- /dev/null
+++ b/tools/libxc/xc_sr_save.c
@@ -0,0 +1,906 @@
+#include <assert.h>
+#include <arpa/inet.h>
+
+#include "xc_sr_common.h"
+
+/*
+ * Writes an Image header and Domain header into the stream.
+ */
+static int write_headers(struct xc_sr_context *ctx, uint16_t guest_type)
+{
+ xc_interface *xch = ctx->xch;
+ int32_t xen_version = xc_version(xch, XENVER_version, NULL);
+ struct xc_sr_ihdr ihdr =
+ {
+ .marker = IHDR_MARKER,
+ .id = htonl(IHDR_ID),
+ .version = htonl(IHDR_VERSION),
+ .options = htons(IHDR_OPT_LITTLE_ENDIAN),
+ };
+ struct xc_sr_dhdr dhdr =
+ {
+ .type = guest_type,
+ .page_shift = XC_PAGE_SHIFT,
+ .xen_major = (xen_version >> 16) & 0xffff,
+ .xen_minor = (xen_version) & 0xffff,
+ };
+
+ if ( xen_version < 0 )
+ {
+ PERROR("Unable to obtain Xen Version");
+ return -1;
+ }
+
+ if ( write_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
+ {
+ PERROR("Unable to write Image Header to stream");
+ return -1;
+ }
+
+ if ( write_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
+ {
+ PERROR("Unable to write Domain Header to stream");
+ return -1;
+ }
+
+ return 0;
+}
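
Note that the ihdr's id, version and options are converted with htonl/htons,
so those fields are fixed-endian on the wire and a receiver validates them
with the matching ntoh* calls whatever its host byte order. A sketch of the
reading side (the struct layout and expected values are stand-ins):

    #include <arpa/inet.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct ex_ihdr {
        uint64_t marker;
        uint32_t id;
        uint32_t version;
        uint16_t options;
        uint16_t _res1;
        uint32_t _res2;
    };

    static bool ex_ihdr_valid(const struct ex_ihdr *ihdr,
                              uint32_t expected_id, uint32_t expected_version)
    {
        /* id/version were written with htonl(), so compare via ntohl(). */
        return ntohl(ihdr->id) == expected_id &&
               ntohl(ihdr->version) == expected_version;
    }
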
+
+/*
+ * Writes an END record into the stream.
+ */
+static int write_end_record(struct xc_sr_context *ctx)
+{
+ struct xc_sr_record end = { REC_TYPE_END, 0, NULL };
+
+ return write_record(ctx, &end);
+}
+
+/*
+ * Writes a CHECKPOINT record into the stream.
+ */
+static int write_checkpoint_record(struct xc_sr_context *ctx)
+{
+ struct xc_sr_record checkpoint = { REC_TYPE_CHECKPOINT, 0, NULL };
+
+ return write_record(ctx, &checkpoint);
+}
+
+/*
+ * Writes a batch of memory as a PAGE_DATA record into the stream. The batch
+ * is constructed in ctx->save.batch_pfns.
+ *
+ * This function:
+ * - gets the types for each pfn in the batch.
+ * - for each pfn with real data:
+ * - maps and attempts to localise the pages.
+ * - construct and writes a PAGE_DATA record into the stream.
+ */
+static int write_batch(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t *mfns = NULL, *types = NULL;
+ void *guest_mapping = NULL;
+ void **guest_data = NULL;
+ void **local_pages = NULL;
+ int *errors = NULL, rc = -1;
+ unsigned i, p, nr_pages = 0, nr_pages_mapped = 0;
+ unsigned nr_pfns = ctx->save.nr_batch_pfns;
+ void *page, *orig_page;
+ uint64_t *rec_pfns = NULL;
+ struct iovec *iov = NULL;
+ int iovcnt = 0;
+ struct xc_sr_rec_page_data_header hdr = { 0 };
+ struct xc_sr_record rec =
+ {
+ .type = REC_TYPE_PAGE_DATA,
+ };
+
+ assert(nr_pfns != 0);
+
+ /* Mfns of the batch pfns. */
+ mfns = malloc(nr_pfns * sizeof(*mfns));
+ /* Types of the batch pfns. */
+ types = malloc(nr_pfns * sizeof(*types));
+ /* Errors from attempting to map the gfns. */
+ errors = malloc(nr_pfns * sizeof(*errors));
+ /* Pointers to page data to send. Mapped gfns or local allocations. */
+ guest_data = calloc(nr_pfns, sizeof(*guest_data));
+ /* Pointers to locally allocated pages. Need freeing. */
+ local_pages = calloc(nr_pfns, sizeof(*local_pages));
+ /* iovec[] for writev(). */
+ iov = malloc((nr_pfns + 4) * sizeof(*iov));
+
+ if ( !mfns || !types || !errors || !guest_data || !local_pages || !iov )
+ {
+ ERROR("Unable to allocate arrays for a batch of %u pages",
+ nr_pfns);
+ goto err;
+ }
+
+ for ( i = 0; i < nr_pfns; ++i )
+ {
+ types[i] = mfns[i] = ctx->save.ops.pfn_to_gfn(ctx,
+ ctx->save.batch_pfns[i]);
+
+ /* Likely a ballooned page. */
+ if ( mfns[i] == INVALID_MFN )
+ {
+ set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
+ ++ctx->save.nr_deferred_pages;
+ }
+ }
+
+ rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, types);
+ if ( rc )
+ {
+ PERROR("Failed to get types for pfn batch");
+ goto err;
+ }
+ rc = -1;
+
+ for ( i = 0; i < nr_pfns; ++i )
+ {
+ switch ( types[i] )
+ {
+ case XEN_DOMCTL_PFINFO_BROKEN:
+ case XEN_DOMCTL_PFINFO_XALLOC:
+ case XEN_DOMCTL_PFINFO_XTAB:
+ continue;
+ }
+
+ mfns[nr_pages++] = mfns[i];
+ }
+
+ if ( nr_pages > 0 )
+ {
+ guest_mapping = xc_map_foreign_bulk(
+ xch, ctx->domid, PROT_READ, mfns, errors, nr_pages);
+ if ( !guest_mapping )
+ {
+ PERROR("Failed to map guest pages");
+ goto err;
+ }
+ nr_pages_mapped = nr_pages;
+
+ for ( i = 0, p = 0; i < nr_pfns; ++i )
+ {
+ switch ( types[i] )
+ {
+ case XEN_DOMCTL_PFINFO_BROKEN:
+ case XEN_DOMCTL_PFINFO_XALLOC:
+ case XEN_DOMCTL_PFINFO_XTAB:
+ continue;
+ }
+
+ if ( errors[p] )
+ {
+ ERROR("Mapping of pfn %#lx (mfn %#lx) failed %d",
+ ctx->save.batch_pfns[i], mfns[p], errors[p]);
+ goto err;
+ }
+
+ orig_page = page = guest_mapping + (p * PAGE_SIZE);
+ rc = ctx->save.ops.normalise_page(ctx, types[i], &page);
+
+ if ( orig_page != page )
+ local_pages[i] = page;
+
+ if ( rc )
+ {
+ if ( rc == -1 && errno == EAGAIN )
+ {
+ set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
+ ++ctx->save.nr_deferred_pages;
+ types[i] = XEN_DOMCTL_PFINFO_XTAB;
+ --nr_pages;
+ }
+ else
+ goto err;
+ }
+ else
+ guest_data[i] = page;
+
+ rc = -1;
+ ++p;
+ }
+ }
+
+ rec_pfns = malloc(nr_pfns * sizeof(*rec_pfns));
+ if ( !rec_pfns )
+ {
+ ERROR("Unable to allocate %zu bytes of memory for page data pfn list",
+ nr_pfns * sizeof(*rec_pfns));
+ goto err;
+ }
+
+ hdr.count = nr_pfns;
+
+ rec.length = sizeof(hdr);
+ rec.length += nr_pfns * sizeof(*rec_pfns);
+ rec.length += nr_pages * PAGE_SIZE;
+
+ for ( i = 0; i < nr_pfns; ++i )
+ rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->save.batch_pfns[i];
+
+ iov[0].iov_base = &rec.type;
+ iov[0].iov_len = sizeof(rec.type);
+
+ iov[1].iov_base = &rec.length;
+ iov[1].iov_len = sizeof(rec.length);
+
+ iov[2].iov_base = &hdr;
+ iov[2].iov_len = sizeof(hdr);
+
+ iov[3].iov_base = rec_pfns;
+ iov[3].iov_len = nr_pfns * sizeof(*rec_pfns);
+
+ iovcnt = 4;
+
+ if ( nr_pages )
+ {
+ for ( i = 0; i < nr_pfns; ++i )
+ {
+ if ( guest_data[i] )
+ {
+ iov[iovcnt].iov_base = guest_data[i];
+ iov[iovcnt].iov_len = PAGE_SIZE;
+ iovcnt++;
+ --nr_pages;
+ }
+ }
+ }
+
+ if ( writev_exact(ctx->fd, iov, iovcnt) )
+ {
+ PERROR("Failed to write page data to stream");
+ goto err;
+ }
+
+ /* Sanity check we have sent all the pages we expected to. */
+ assert(nr_pages == 0);
+ rc = ctx->save.nr_batch_pfns = 0;
+
+ err:
+ free(rec_pfns);
+ if ( guest_mapping )
+ munmap(guest_mapping, nr_pages_mapped * PAGE_SIZE);
+ for ( i = 0; local_pages && i < nr_pfns; ++i )
+ free(local_pages[i]);
+ free(iov);
+ free(local_pages);
+ free(guest_data);
+ free(errors);
+ free(types);
+ free(mfns);
+
+ return rc;
+}
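
The record length computed above has three parts: the fixed page-data header,
one 8-byte type/pfn word for every pfn in the batch, and a full page of data
only for pfns with real backing. A worked example (header size assumed to be
8 bytes):

    #include <stdint.h>
    #include <stdio.h>

    #define EX_PAGE_SIZE 4096

    int main(void)
    {
        unsigned int nr_pfns = 1024;   /* pfns in the batch */
        unsigned int nr_pages = 1000;  /* of which 1000 carry real data */
        uint64_t length = 8                           /* header, assumed */
                        + nr_pfns * sizeof(uint64_t)  /* type/pfn words */
                        + (uint64_t)nr_pages * EX_PAGE_SIZE;

        printf("PAGE_DATA record length = %llu bytes\n",
               (unsigned long long)length);           /* 4104200 */
        return 0;
    }
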
+
+/*
+ * Flush a batch of pfns into the stream.
+ */
+static int flush_batch(struct xc_sr_context *ctx)
+{
+ int rc = 0;
+
+ if ( ctx->save.nr_batch_pfns == 0 )
+ return rc;
+
+ rc = write_batch(ctx);
+
+ if ( !rc )
+ {
+ VALGRIND_MAKE_MEM_UNDEFINED(ctx->save.batch_pfns,
+ MAX_BATCH_SIZE *
+ sizeof(*ctx->save.batch_pfns));
+ }
+
+ return rc;
+}
+
+/*
+ * Add a single pfn to the batch, flushing the batch if full.
+ */
+static int add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ int rc = 0;
+
+ if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE )
+ rc = flush_batch(ctx);
+
+ if ( rc == 0 )
+ ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn;
+
+ return rc;
+}
+
+/*
+ * Pause/suspend the domain, and refresh ctx->dominfo if required.
+ */
+static int suspend_domain(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+
+ /* TODO: Properly specify the return value from this callback. All
+ * implementations currently appear to return 1 for success, whereas
+ * the legacy code checks for != 0. */
+ int cb_rc = ctx->save.callbacks->suspend(ctx->save.callbacks->data);
+
+ if ( cb_rc == 0 )
+ {
+ ERROR("save callback suspend() failed: %d", cb_rc);
+ return -1;
+ }
+
+ /* Refresh domain information. */
+ if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) ||
+ (ctx->dominfo.domid != ctx->domid) )
+ {
+ PERROR("Unable to refresh domain information");
+ return -1;
+ }
+
+ /* Confirm the domain has actually been paused. */
+ if ( !ctx->dominfo.shutdown ||
+ (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) )
+ {
+ ERROR("Domain has not been suspended: shutdown %d, reason %d",
+ ctx->dominfo.shutdown, ctx->dominfo.shutdown_reason);
+ return -1;
+ }
+
+ xc_report_progress_single(xch, "Domain now suspended");
+
+ return 0;
+}
+
+/*
+ * Send a subset of pages in the guest's p2m, according to the dirty bitmap.
+ * Used for each subsequent iteration of the live migration loop.
+ *
+ * Bitmap is bounded by p2m_size.
+ */
+static int send_dirty_pages(struct xc_sr_context *ctx,
+ unsigned long entries)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t p;
+ unsigned long written;
+ int rc;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ for ( p = 0, written = 0; p < ctx->save.p2m_size; ++p )
+ {
+ if ( !test_bit(p, dirty_bitmap) )
+ continue;
+
+ rc = add_to_batch(ctx, p);
+ if ( rc )
+ return rc;
+
+ /* Update progress every 4MB worth of memory sent. */
+ if ( (written & ((1U << (22 - 12)) - 1)) == 0 )
+ xc_report_progress_step(xch, written, entries);
+
+ ++written;
+ }
+
+ rc = flush_batch(ctx);
+ if ( rc )
+ return rc;
+
+ if ( written > entries )
+ DPRINTF("Bitmap contained more entries than expected...");
+
+ xc_report_progress_step(xch, entries, entries);
+ return 0;
+}
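
The progress mask works because 22 - 12 = 10: 1 << 10 is 1024 pages, and 1024
pages of 4 KiB is exactly 4 MiB, so the masked test fires once per 4 MiB
sent. A standalone check:

    #include <stdio.h>

    int main(void)
    {
        unsigned long written;

        for ( written = 0; written < 3000; ++written )
            if ( (written & ((1U << (22 - 12)) - 1)) == 0 )
                printf("report at page %lu\n", written); /* 0, 1024, 2048 */

        return 0;
    }
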
+
+/*
+ * Send all pages in the guest's p2m. Used as the first iteration of the live
+ * migration loop, and for a non-live save.
+ */
+static int send_all_pages(struct xc_sr_context *ctx)
+{
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ bitmap_set(dirty_bitmap, ctx->save.p2m_size);
+
+ return send_dirty_pages(ctx, ctx->save.p2m_size);
+}
+
+static int enable_logdirty(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int on1 = 0, off = 0, on2 = 0;
+ int rc;
+
+ /* This juggling is required if logdirty is enabled for VRAM tracking. */
+ rc = xc_shadow_control(xch, ctx->domid,
+ XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+ NULL, 0, NULL, 0, NULL);
+ if ( rc < 0 )
+ {
+ on1 = errno;
+ rc = xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
+ NULL, 0, NULL, 0, NULL);
+ if ( rc < 0 )
+ off = errno;
+ else
+ {
+ rc = xc_shadow_control(xch, ctx->domid,
+ XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+ NULL, 0, NULL, 0, NULL);
+ if ( rc < 0 )
+ on2 = errno;
+ }
+ if ( rc < 0 )
+ {
+ PERROR("Failed to enable logdirty: %d,%d,%d", on1, off, on2);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static int update_progress_string(struct xc_sr_context *ctx,
+ char **str, unsigned iter)
+{
+ xc_interface *xch = ctx->xch;
+ char *new_str = NULL;
+
+ if ( asprintf(&new_str, "Frames iteration %u of %u",
+ iter, ctx->save.max_iterations) == -1 )
+ {
+ PERROR("Unable to allocate new progress string");
+ return -1;
+ }
+
+ free(*str);
+ *str = new_str;
+
+ xc_set_progress_prefix(xch, *str);
+ return 0;
+}
+
+/*
+ * Send memory while guest is running.
+ */
+static int send_memory_live(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
+ char *progress_str = NULL;
+ unsigned x;
+ int rc;
+
+ rc = update_progress_string(ctx, &progress_str, 0);
+ if ( rc )
+ goto out;
+
+ rc = send_all_pages(ctx);
+ if ( rc )
+ goto out;
+
+ for ( x = 1;
+ ((x < ctx->save.max_iterations) &&
+ (stats.dirty_count > ctx->save.dirty_threshold)); ++x )
+ {
+ if ( xc_shadow_control(
+ xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+ &ctx->save.dirty_bitmap_hbuf, ctx->save.p2m_size,
+ NULL, 0, &stats) != ctx->save.p2m_size )
+ {
+ PERROR("Failed to retrieve logdirty bitmap");
+ rc = -1;
+ goto out;
+ }
+
+ if ( stats.dirty_count == 0 )
+ break;
+
+ rc = update_progress_string(ctx, &progress_str, x);
+ if ( rc )
+ goto out;
+
+ rc = send_dirty_pages(ctx, stats.dirty_count);
+ if ( rc )
+ goto out;
+ }
+
+ out:
+ xc_set_progress_prefix(xch, NULL);
+ free(progress_str);
+ return rc;
+}
+
+/*
+ * Suspend the domain and send dirty memory.
+ * This is the last iteration of the live migration and the
+ * heart of the checkpointed stream.
+ */
+static int suspend_and_send_dirty(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
+ char *progress_str = NULL;
+ int rc;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ rc = suspend_domain(ctx);
+ if ( rc )
+ goto out;
+
+ if ( xc_shadow_control(
+ xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+ HYPERCALL_BUFFER(dirty_bitmap), ctx->save.p2m_size,
+ NULL, 0, &stats) != ctx->save.p2m_size )
+ {
+ PERROR("Failed to retrieve logdirty bitmap");
+ rc = -1;
+ goto out;
+ }
+
+ if ( ctx->save.live )
+ {
+ rc = update_progress_string(ctx, &progress_str,
+ ctx->save.max_iterations);
+ if ( rc )
+ goto out;
+ }
+ else
+ xc_set_progress_prefix(xch, "Checkpointed save");
+
+ bitmap_or(dirty_bitmap, ctx->save.deferred_pages, ctx->save.p2m_size);
+
+ rc = send_dirty_pages(ctx, stats.dirty_count + ctx->save.nr_deferred_pages);
+ if ( rc )
+ goto out;
+
+ bitmap_clear(ctx->save.deferred_pages, ctx->save.p2m_size);
+ ctx->save.nr_deferred_pages = 0;
+
+ out:
+ xc_set_progress_prefix(xch, NULL);
+ free(progress_str);
+ return rc;
+}
+
+static int verify_frames(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
+ int rc;
+ struct xc_sr_record rec =
+ {
+ .type = REC_TYPE_VERIFY,
+ .length = 0,
+ };
+
+ DPRINTF("Enabling verify mode");
+
+ rc = write_record(ctx, &rec);
+ if ( rc )
+ goto out;
+
+ xc_set_progress_prefix(xch, "Frames verify");
+ rc = send_all_pages(ctx);
+ if ( rc )
+ goto out;
+
+ if ( xc_shadow_control(
+ xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_PEEK,
+ &ctx->save.dirty_bitmap_hbuf, ctx->save.p2m_size,
+ NULL, 0, &stats) != ctx->save.p2m_size )
+ {
+ PERROR("Failed to retrieve logdirty bitmap");
+ rc = -1;
+ goto out;
+ }
+
+ DPRINTF(" Further stats: faults %u, dirty %u",
+ stats.fault_count, stats.dirty_count);
+
+ out:
+ return rc;
+}
+
+/*
+ * Send all domain memory. This is the heart of the live migration loop.
+ */
+static int send_domain_memory_live(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ rc = enable_logdirty(ctx);
+ if ( rc )
+ goto out;
+
+ rc = send_memory_live(ctx);
+ if ( rc )
+ goto out;
+
+ rc = suspend_and_send_dirty(ctx);
+ if ( rc )
+ goto out;
+
+ if ( ctx->save.debug && !ctx->save.checkpointed )
+ {
+ rc = verify_frames(ctx);
+ if ( rc )
+ goto out;
+ }
+
+ out:
+ return rc;
+}
+
+/*
+ * Checkpointed save.
+ */
+static int send_domain_memory_checkpointed(struct xc_sr_context *ctx)
+{
+ return suspend_and_send_dirty(ctx);
+}
+
+/*
+ * Send all domain memory, pausing the domain first. Generally used for
+ * suspend-to-file.
+ */
+static int send_domain_memory_nonlive(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ rc = suspend_domain(ctx);
+ if ( rc )
+ goto err;
+
+ xc_set_progress_prefix(xch, "Frames");
+
+ rc = send_all_pages(ctx);
+ if ( rc )
+ goto err;
+
+ err:
+ return rc;
+}
+
+static int setup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ dirty_bitmap = xc_hypercall_buffer_alloc_pages(
+ xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->save.p2m_size)));
+ ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE *
+ sizeof(*ctx->save.batch_pfns));
+ ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size));
+
+ if ( !ctx->save.batch_pfns || !dirty_bitmap || !ctx->save.deferred_pages )
+ {
+ ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and"
+ " deferred pages");
+ rc = -1;
+ errno = ENOMEM;
+ goto err;
+ }
+
+ rc = ctx->save.ops.setup(ctx);
+ if ( rc )
+ goto err;
+
+ rc = 0;
+
+ err:
+ return rc;
+}
+
+static void cleanup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
+ NULL, 0, NULL, 0, NULL);
+
+ if ( ctx->save.ops.cleanup(ctx) )
+ PERROR("Failed to clean up");
+
+ xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
+ NRPAGES(bitmap_size(ctx->save.p2m_size)));
+ free(ctx->save.deferred_pages);
+ free(ctx->save.batch_pfns);
+}
+
+/*
+ * Save a domain.
+ */
+static int save(struct xc_sr_context *ctx, uint16_t guest_type)
+{
+ xc_interface *xch = ctx->xch;
+ int rc, saved_rc = 0, saved_errno = 0;
+
+ IPRINTF("Saving domain %d, type %s",
+ ctx->domid, dhdr_type_to_str(guest_type));
+
+ rc = setup(ctx);
+ if ( rc )
+ goto err;
+
+ xc_report_progress_single(xch, "Start of stream");
+
+ rc = write_headers(ctx, guest_type);
+ if ( rc )
+ goto err;
+
+ rc = ctx->save.ops.start_of_stream(ctx);
+ if ( rc )
+ goto err;
+
+ do {
+ rc = ctx->save.ops.start_of_checkpoint(ctx);
+ if ( rc )
+ goto err;
+
+ if ( ctx->save.live )
+ rc = send_domain_memory_live(ctx);
+ else if ( ctx->save.checkpointed )
+ rc = send_domain_memory_checkpointed(ctx);
+ else
+ rc = send_domain_memory_nonlive(ctx);
+
+ if ( rc )
+ goto err;
+
+ if ( !ctx->dominfo.shutdown ||
+ (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) )
+ {
+ ERROR("Domain has not been suspended");
+ rc = -1;
+ goto err;
+ }
+
+ rc = ctx->save.ops.end_of_checkpoint(ctx);
+ if ( rc )
+ goto err;
+
+ if ( ctx->save.checkpointed )
+ {
+ /*
+ * We have now completed the initial live portion of the checkpoint
+ * process. Therefore switch into periodically sending synchronous
+ * batches of pages.
+ */
+ ctx->save.live = false;
+
+ rc = write_checkpoint_record(ctx);
+ if ( rc )
+ goto err;
+
+ ctx->save.callbacks->postcopy(ctx->save.callbacks->data);
+
+ rc = ctx->save.callbacks->checkpoint(ctx->save.callbacks->data);
+ if ( rc <= 0 )
+ ctx->save.checkpointed = false;
+ }
+ } while ( ctx->save.checkpointed );
+
+ xc_report_progress_single(xch, "End of stream");
+
+ rc = write_end_record(ctx);
+ if ( rc )
+ goto err;
+
+ xc_report_progress_single(xch, "Complete");
+ goto done;
+
+ err:
+ saved_errno = errno;
+ saved_rc = rc;
+ PERROR("Save failed");
+
+ done:
+ cleanup(ctx);
+
+ if ( saved_rc )
+ {
+ rc = saved_rc;
+ errno = saved_errno;
+ }
+
+ return rc;
+}
+
+int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
+ uint32_t max_iters, uint32_t max_factor, uint32_t flags,
+ struct save_callbacks* callbacks, int hvm)
+{
+ xen_pfn_t nr_pfns;
+ struct xc_sr_context ctx =
+ {
+ .xch = xch,
+ .fd = io_fd,
+ };
+
+ /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions. */
+ ctx.save.callbacks = callbacks;
+ ctx.save.live = !!(flags & XCFLAGS_LIVE);
+ ctx.save.debug = !!(flags & XCFLAGS_DEBUG);
+ ctx.save.checkpointed = !!(flags & XCFLAGS_CHECKPOINTED);
+
+ /*
+ * TODO: Find some time to better tweak the live migration algorithm.
+ *
+ * These parameters are better than the legacy algorithm especially for
+ * busy guests.
+ */
+ ctx.save.max_iterations = 5;
+ ctx.save.dirty_threshold = 50;
+
+ /* Sanity checks for callbacks. */
+ if ( hvm )
+ assert(callbacks->switch_qemu_logdirty);
+ if ( ctx.save.checkpointed )
+ assert(callbacks->checkpoint && callbacks->postcopy);
+
+ DPRINTF("fd %d, dom %u, max_iters %u, max_factor %u, flags %u, hvm %d",
+ io_fd, dom, max_iters, max_factor, flags, hvm);
+
+ if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
+ {
+ PERROR("Failed to get domain info");
+ return -1;
+ }
+
+ if ( ctx.dominfo.domid != dom )
+ {
+ ERROR("Domain %u does not exist", dom);
+ return -1;
+ }
+
+ ctx.domid = dom;
+
+ if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 )
+ {
+ PERROR("Unable to obtain the guest p2m size");
+ return -1;
+ }
+
+ ctx.save.p2m_size = nr_pfns;
+
+ if ( ctx.save.p2m_size > ~XEN_DOMCTL_PFINFO_LTAB_MASK )
+ {
+ errno = E2BIG;
+ ERROR("Cannot save this big a guest");
+ return -1;
+ }
+
+ if ( ctx.dominfo.hvm )
+ {
+ ctx.save.ops = save_ops_x86_hvm;
+ return save(&ctx, DHDR_TYPE_X86_HVM);
+ }
+ else
+ {
+ ctx.save.ops = save_ops_x86_pv;
+ return save(&ctx, DHDR_TYPE_X86_PV);
+ }
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_save_x86_hvm.c b/tools/libxc/xc_sr_save_x86_hvm.c
new file mode 100644
index 0000000..cdee774
--- /dev/null
+++ b/tools/libxc/xc_sr_save_x86_hvm.c
@@ -0,0 +1,220 @@
+#include <assert.h>
+
+#include "xc_sr_common_x86.h"
+
+#include <xen/hvm/params.h>
+
+/*
+ * Query for the HVM context and write an HVM_CONTEXT record into the stream.
+ */
+static int write_hvm_context(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc, hvm_buf_size;
+ struct xc_sr_record hvm_rec =
+ {
+ .type = REC_TYPE_HVM_CONTEXT,
+ };
+
+ hvm_buf_size = xc_domain_hvm_getcontext(xch, ctx->domid, 0, 0);
+ if ( hvm_buf_size < 0 )
+ {
+ PERROR("Couldn't get HVM context size from Xen");
+ rc = -1;
+ goto out;
+ }
+
+ hvm_rec.data = malloc(hvm_buf_size);
+ if ( !hvm_rec.data )
+ {
+ PERROR("Couldn't allocate memory");
+ rc = -1;
+ goto out;
+ }
+
+ hvm_buf_size = xc_domain_hvm_getcontext(xch, ctx->domid,
+ hvm_rec.data, hvm_buf_size);
+ if ( hvm_buf_size < 0 )
+ {
+ PERROR("Couldn't get HVM context from Xen");
+ rc = -1;
+ goto out;
+ }
+
+ hvm_rec.length = hvm_buf_size;
+ rc = write_record(ctx, &hvm_rec);
+ if ( rc < 0 )
+ {
+ PERROR("error write HVM_CONTEXT record");
+ goto out;
+ }
+
+ out:
+ free(hvm_rec.data);
+ return rc;
+}
+
+/*
+ * Query for a range of HVM parameters and write an HVM_PARAMS record into the
+ * stream.
+ */
+static int write_hvm_params(struct xc_sr_context *ctx)
+{
+ static const unsigned int params[] = {
+ HVM_PARAM_STORE_PFN,
+ HVM_PARAM_IOREQ_PFN,
+ HVM_PARAM_BUFIOREQ_PFN,
+ HVM_PARAM_PAGING_RING_PFN,
+ HVM_PARAM_MONITOR_RING_PFN,
+ HVM_PARAM_SHARING_RING_PFN,
+ HVM_PARAM_VM86_TSS,
+ HVM_PARAM_CONSOLE_PFN,
+ HVM_PARAM_ACPI_IOPORTS_LOCATION,
+ HVM_PARAM_VIRIDIAN,
+ HVM_PARAM_IDENT_PT,
+ HVM_PARAM_PAE_ENABLED,
+ HVM_PARAM_VM_GENERATION_ID_ADDR,
+ HVM_PARAM_IOREQ_SERVER_PFN,
+ HVM_PARAM_NR_IOREQ_SERVER_PAGES,
+ };
+
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_hvm_params_entry entries[ARRAY_SIZE(params)];
+ struct xc_sr_rec_hvm_params hdr = {
+ .count = 0,
+ };
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_HVM_PARAMS,
+ .length = sizeof(hdr),
+ .data = &hdr,
+ };
+ unsigned int i;
+ int rc;
+
+ for ( i = 0; i < ARRAY_SIZE(params); i++ )
+ {
+ uint32_t index = params[i];
+ uint64_t value;
+
+ rc = xc_hvm_param_get(xch, ctx->domid, index, &value);
+ if ( rc )
+ {
+ PERROR("Failed to get HVMPARAM at index %u", index);
+ return rc;
+ }
+
+ if ( value != 0 )
+ {
+ entries[hdr.count].index = index;
+ entries[hdr.count].value = value;
+ hdr.count++;
+ }
+ }
+
+ rc = write_split_record(ctx, &rec, entries, hdr.count * sizeof(*entries));
+ if ( rc )
+ PERROR("Failed to write HVM_PARAMS record");
+
+ return rc;
+}
+
+static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
+ xen_pfn_t pfn)
+{
+ /* identity map */
+ return pfn;
+}
+
+static int x86_hvm_normalise_page(struct xc_sr_context *ctx,
+ xen_pfn_t type, void **page)
+{
+ /* no-op */
+ return 0;
+}
+
+static int x86_hvm_setup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+
+ if ( ctx->save.callbacks->switch_qemu_logdirty(
+ ctx->domid, 1, ctx->save.callbacks->data) )
+ {
+ PERROR("Couldn't enable qemu log-dirty mode");
+ return -1;
+ }
+
+ ctx->x86_hvm.save.qemu_enabled_logdirty = true;
+
+ return 0;
+}
+
+static int x86_hvm_start_of_stream(struct xc_sr_context *ctx)
+{
+ /* no-op */
+ return 0;
+}
+
+static int x86_hvm_start_of_checkpoint(struct xc_sr_context *ctx)
+{
+ /* no-op */
+ return 0;
+}
+
+static int x86_hvm_end_of_checkpoint(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ /* Write the TSC record. */
+ rc = write_tsc_info(ctx);
+ if ( rc )
+ return rc;
+
+ /* Write the HVM_CONTEXT record. */
+ rc = write_hvm_context(ctx);
+ if ( rc )
+ return rc;
+
+ /* Write the HVM_PARAMS record containing applicable HVM params. */
+ rc = write_hvm_params(ctx);
+ if ( rc )
+ return rc;
+
+ return 0;
+}
+
+static int x86_hvm_cleanup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+
+ /* If qemu successfully enabled logdirty mode, attempt to disable. */
+ if ( ctx->x86_hvm.save.qemu_enabled_logdirty &&
+ ctx->save.callbacks->switch_qemu_logdirty(
+ ctx->domid, 0, ctx->save.callbacks->data) )
+ {
+ PERROR("Couldn't disable qemu log-dirty mode");
+ return -1;
+ }
+
+ return 0;
+}
+
+struct xc_sr_save_ops save_ops_x86_hvm =
+{
+ .pfn_to_gfn = x86_hvm_pfn_to_gfn,
+ .normalise_page = x86_hvm_normalise_page,
+ .setup = x86_hvm_setup,
+ .start_of_stream = x86_hvm_start_of_stream,
+ .start_of_checkpoint = x86_hvm_start_of_checkpoint,
+ .end_of_checkpoint = x86_hvm_end_of_checkpoint,
+ .cleanup = x86_hvm_cleanup,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_save_x86_pv.c b/tools/libxc/xc_sr_save_x86_pv.c
new file mode 100644
index 0000000..f63f40b
--- /dev/null
+++ b/tools/libxc/xc_sr_save_x86_pv.c
@@ -0,0 +1,894 @@
+#include <assert.h>
+#include <limits.h>
+
+#include "xc_sr_common_x86_pv.h"
+
+/*
+ * Maps the guest's shared info page.
+ */
+static int map_shinfo(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+
+ ctx->x86_pv.shinfo = xc_map_foreign_range(
+ xch, ctx->domid, PAGE_SIZE, PROT_READ, ctx->dominfo.shared_info_frame);
+ if ( !ctx->x86_pv.shinfo )
+ {
+ PERROR("Failed to map shared info frame at mfn %#lx",
+ ctx->dominfo.shared_info_frame);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy a list of mfns from a guest, accounting for differences between guest
+ * and toolstack width. Can fail if truncation would occur.
+ */
+static int copy_mfns_from_guest(const struct xc_sr_context *ctx,
+ xen_pfn_t *dst, const void *src, size_t count)
+{
+ size_t x;
+
+ if ( ctx->x86_pv.width == sizeof(unsigned long) )
+ memcpy(dst, src, count * sizeof(*dst));
+ else
+ {
+ for ( x = 0; x < count; ++x )
+ {
+#ifdef __x86_64__
+ /* 64bit toolstack, 32bit guest. Expand any INVALID_MFN. */
+ uint32_t s = ((uint32_t *)src)[x];
+
+ dst[x] = s == ~0U ? INVALID_MFN : s;
+#else
+ /*
+ * 32bit toolstack, 64bit guest. Truncate INVALID_MFN, but bail
+ * if any other truncation would occur.
+ *
+ * This will only occur on hosts where a PV guest has ram above
+ * the 16TB boundary. A 32bit dom0 is unlikely to have
+ * successfully booted on a system this large.
+ */
+ uint64_t s = ((uint64_t *)src)[x];
+
+ if ( (s != ~0ULL) && ((s >> 32) != 0) )
+ {
+ errno = E2BIG;
+ return -1;
+ }
+
+ dst[x] = s;
+#endif
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Walk the guest's frame list list and frame list to identify and map the
+ * frames making up the guest's p2m table. Construct a list of pfns making up
+ * the table.
+ */
+static int map_p2m(struct xc_sr_context *ctx)
+{
+ /* Terminology:
+ *
+ * fll - frame list list, top level p2m, list of fl mfns
+ * fl - frame list, mid level p2m, list of leaf mfns
+ * local - own allocated buffers, adjusted for bitness
+ * guest - mappings into the domain
+ */
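+ /* Illustrative example (not used by the code): a 64bit guest (width 8)
+ * has fpp = 4096/8 = 512, so with max_pfn = 0xfffff (4GiB of 4k pages)
+ * fll_entries = 0xfffff/(512*512) + 1 = 4 and
+ * fl_entries = 0xfffff/512 + 1 = 2048.
+ */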
+ xc_interface *xch = ctx->xch;
+ int rc = -1;
+ unsigned x, fpp, fll_entries, fl_entries;
+ xen_pfn_t fll_mfn;
+
+ xen_pfn_t *local_fll = NULL;
+ void *guest_fll = NULL;
+ size_t local_fll_size;
+
+ xen_pfn_t *local_fl = NULL;
+ void *guest_fl = NULL;
+ size_t local_fl_size;
+
+ fpp = PAGE_SIZE / ctx->x86_pv.width;
+ fll_entries = (ctx->x86_pv.max_pfn / (fpp * fpp)) + 1;
+ fl_entries = (ctx->x86_pv.max_pfn / fpp) + 1;
+
+ fll_mfn = GET_FIELD(ctx->x86_pv.shinfo, arch.pfn_to_mfn_frame_list_list,
+ ctx->x86_pv.width);
+ if ( fll_mfn == 0 || fll_mfn > ctx->x86_pv.max_mfn )
+ {
+ ERROR("Bad mfn %#lx for p2m frame list list", fll_mfn);
+ goto err;
+ }
+
+ /* Map the guest top p2m. */
+ guest_fll = xc_map_foreign_range(xch, ctx->domid, PAGE_SIZE,
+ PROT_READ, fll_mfn);
+ if ( !guest_fll )
+ {
+ PERROR("Failed to map p2m frame list list at %#lx", fll_mfn);
+ goto err;
+ }
+
+ local_fll_size = fll_entries * sizeof(*local_fll);
+ local_fll = malloc(local_fll_size);
+ if ( !local_fll )
+ {
+ ERROR("Cannot allocate %zu bytes for local p2m frame list list",
+ local_fll_size);
+ goto err;
+ }
+
+ if ( copy_mfns_from_guest(ctx, local_fll, guest_fll, fll_entries) )
+ {
+ ERROR("Truncation detected copying p2m frame list list");
+ goto err;
+ }
+
+ /* Check for bad mfns in frame list list. */
+ for ( x = 0; x < fll_entries; ++x )
+ {
+ if ( local_fll[x] == 0 || local_fll[x] > ctx->x86_pv.max_mfn )
+ {
+ ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list list",
+ local_fll[x], x, fll_entries);
+ goto err;
+ }
+ }
+
+ /* Map the guest mid p2m frames. */
+ guest_fl = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
+ local_fll, fll_entries);
+ if ( !guest_fl )
+ {
+ PERROR("Failed to map p2m frame list");
+ goto err;
+ }
+
+ local_fl_size = fl_entries * sizeof(*local_fl);
+ local_fl = malloc(local_fl_size);
+ if ( !local_fl )
+ {
+ ERROR("Cannot allocate %zu bytes for local p2m frame list",
+ local_fl_size);
+ goto err;
+ }
+
+ if ( copy_mfns_from_guest(ctx, local_fl, guest_fl, fl_entries) )
+ {
+ ERROR("Truncation detected copying p2m frame list");
+ goto err;
+ }
+
+ for ( x = 0; x < fl_entries; ++x )
+ {
+ if ( local_fl[x] == 0 || local_fl[x] > ctx->x86_pv.max_mfn )
+ {
+ ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list",
+ local_fl[x], x, fl_entries);
+ goto err;
+ }
+ }
+
+ /* Map the p2m leaves themselves. */
+ ctx->x86_pv.p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
+ local_fl, fl_entries);
+ if ( !ctx->x86_pv.p2m )
+ {
+ PERROR("Failed to map p2m frames");
+ goto err;
+ }
+
+ ctx->x86_pv.p2m_frames = fl_entries;
+ ctx->x86_pv.p2m_pfns = malloc(local_fl_size);
+ if ( !ctx->x86_pv.p2m_pfns )
+ {
+ ERROR("Cannot allocate %zu bytes for p2m pfns list",
+ local_fl_size);
+ goto err;
+ }
+
+ /* Convert leaf frames from mfns to pfns. */
+ for ( x = 0; x < fl_entries; ++x )
+ {
+ if ( !mfn_in_pseudophysmap(ctx, local_fl[x]) )
+ {
+ ERROR("Bad mfn in p2m_frame_list[%u]", x);
+ dump_bad_pseudophysmap_entry(ctx, local_fl[x]);
+ errno = ERANGE;
+ goto err;
+ }
+
+ ctx->x86_pv.p2m_pfns[x] = mfn_to_pfn(ctx, local_fl[x]);
+ }
+
+ rc = 0;
+err:
+
+ free(local_fl);
+ if ( guest_fl )
+ munmap(guest_fl, fll_entries * PAGE_SIZE);
+
+ free(local_fll);
+ if ( guest_fll )
+ munmap(guest_fll, PAGE_SIZE);
+
+ return rc;
+}
+
+/*
+ * Obtain a specific vcpu's basic state and write an X86_PV_VCPU_BASIC record
+ * into the stream. Performs mfn->pfn conversion on architectural state.
+ */
+static int write_one_vcpu_basic(struct xc_sr_context *ctx, uint32_t id)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t mfn, pfn;
+ unsigned i, gdt_count;
+ int rc = -1;
+ vcpu_guest_context_any_t vcpu;
+ struct xc_sr_rec_x86_pv_vcpu_hdr vhdr =
+ {
+ .vcpu_id = id,
+ };
+ struct xc_sr_record rec =
+ {
+ .type = REC_TYPE_X86_PV_VCPU_BASIC,
+ .length = sizeof(vhdr),
+ .data = &vhdr,
+ };
+
+ if ( xc_vcpu_getcontext(xch, ctx->domid, id, &vcpu) )
+ {
+ PERROR("Failed to get vcpu%u context", id);
+ goto err;
+ }
+
+ /* Vcpu0 is special: Convert the suspend record to a pfn. */
+ if ( id == 0 )
+ {
+ mfn = GET_FIELD(&vcpu, user_regs.edx, ctx->x86_pv.width);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Bad mfn for suspend record");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ goto err;
+ }
+ SET_FIELD(&vcpu, user_regs.edx, mfn_to_pfn(ctx, mfn),
+ ctx->x86_pv.width);
+ }
+
+ gdt_count = GET_FIELD(&vcpu, gdt_ents, ctx->x86_pv.width);
+ if ( gdt_count > FIRST_RESERVED_GDT_ENTRY )
+ {
+ ERROR("GDT entry count (%u) out of range (max %u)",
+ gdt_count, FIRST_RESERVED_GDT_ENTRY);
+ errno = ERANGE;
+ goto err;
+ }
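+ /* A 4k frame holds 512 8-byte descriptors, hence the division below. */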
+ gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */
+
+ /* Convert GDT frames to pfns. */
+ for ( i = 0; i < gdt_count; ++i )
+ {
+ mfn = GET_FIELD(&vcpu, gdt_frames[i], ctx->x86_pv.width);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Bad mfn for frame %u of vcpu%u's GDT", i, id);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ goto err;
+ }
+ SET_FIELD(&vcpu, gdt_frames[i], mfn_to_pfn(ctx, mfn),
+ ctx->x86_pv.width);
+ }
+
+ /* Convert CR3 to a pfn. */
+ mfn = cr3_to_mfn(ctx, GET_FIELD(&vcpu, ctrlreg[3], ctx->x86_pv.width));
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Bad mfn for vcpu%u's cr3", id);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ goto err;
+ }
+ pfn = mfn_to_pfn(ctx, mfn);
+ SET_FIELD(&vcpu, ctrlreg[3], mfn_to_cr3(ctx, pfn), ctx->x86_pv.width);
+
+ /* 64bit guests: Convert CR1 (guest pagetables) to pfn. */
+ if ( ctx->x86_pv.levels == 4 && vcpu.x64.ctrlreg[1] )
+ {
+ mfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT;
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Bad mfn for vcpu%u's cr1", id);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ goto err;
+ }
+ pfn = mfn_to_pfn(ctx, mfn);
+ vcpu.x64.ctrlreg[1] = 1 | ((uint64_t)pfn << PAGE_SHIFT);
+ }
+
+ if ( ctx->x86_pv.width == 8 )
+ rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x64));
+ else
+ rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x32));
+
+ err:
+ return rc;
+}
+
+/*
+ * Obtain a specific vcpu's extended state and write an X86_PV_VCPU_EXTENDED
+ * record into the stream.
+ */
+static int write_one_vcpu_extended(struct xc_sr_context *ctx, uint32_t id)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_x86_pv_vcpu_hdr vhdr =
+ {
+ .vcpu_id = id,
+ };
+ struct xc_sr_record rec =
+ {
+ .type = REC_TYPE_X86_PV_VCPU_EXTENDED,
+ .length = sizeof(vhdr),
+ .data = &vhdr,
+ };
+ struct xen_domctl domctl =
+ {
+ .cmd = XEN_DOMCTL_get_ext_vcpucontext,
+ .domain = ctx->domid,
+ .u.ext_vcpucontext.vcpu = id,
+ };
+
+ if ( xc_domctl(xch, &domctl) < 0 )
+ {
+ PERROR("Unable to get vcpu%u extended context", id);
+ return -1;
+ }
+
+ return write_split_record(ctx, &rec, &domctl.u.ext_vcpucontext,
+ domctl.u.ext_vcpucontext.size);
+}
+
+/*
+ * Query to see whether a specific vcpu has xsave state and if so, write an
+ * X86_PV_VCPU_XSAVE record into the stream.
+ */
+static int write_one_vcpu_xsave(struct xc_sr_context *ctx, uint32_t id)
+{
+ xc_interface *xch = ctx->xch;
+ int rc = -1;
+ DECLARE_HYPERCALL_BUFFER(void, buffer);
+ struct xc_sr_rec_x86_pv_vcpu_hdr vhdr =
+ {
+ .vcpu_id = id,
+ };
+ struct xc_sr_record rec =
+ {
+ .type = REC_TYPE_X86_PV_VCPU_XSAVE,
+ .length = sizeof(vhdr),
+ .data = &vhdr,
+ };
+ struct xen_domctl domctl =
+ {
+ .cmd = XEN_DOMCTL_getvcpuextstate,
+ .domain = ctx->domid,
+ .u.vcpuextstate.vcpu = id,
+ };
+
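+ /* The first domctl, issued with no buffer handle set, reports the xsave
+ * area size and feature mask, used to size the buffer below. */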
+ if ( xc_domctl(xch, &domctl) < 0 )
+ {
+ PERROR("Unable to get vcpu%u's xsave context", id);
+ goto err;
+ }
+
+ /* No xsave state? skip this record. */
+ if ( !domctl.u.vcpuextstate.xfeature_mask )
+ goto out;
+
+ buffer = xc_hypercall_buffer_alloc(xch, buffer, domctl.u.vcpuextstate.size);
+ if ( !buffer )
+ {
+ ERROR("Unable to allocate %"PRIx64" bytes for vcpu%u's xsave context",
+ domctl.u.vcpuextstate.size, id);
+ goto err;
+ }
+
+ set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
+ if ( xc_domctl(xch, &domctl) < 0 )
+ {
+ PERROR("Unable to get vcpu%u's xsave context", id);
+ goto err;
+ }
+
+ rc = write_split_record(ctx, &rec, buffer, domctl.u.vcpuextstate.size);
+ if ( rc )
+ goto err;
+
+ out:
+ rc = 0;
+
+ err:
+ xc_hypercall_buffer_free(xch, buffer);
+
+ return rc;
+}
+
+/*
+ * Query to see whether a specific vcpu has msr state and if so, write an
+ * X86_PV_VCPU_MSRS record into the stream.
+ */
+static int write_one_vcpu_msrs(struct xc_sr_context *ctx, uint32_t id)
+{
+ xc_interface *xch = ctx->xch;
+ int rc = -1;
+ size_t buffersz;
+ DECLARE_HYPERCALL_BUFFER(void, buffer);
+ struct xc_sr_rec_x86_pv_vcpu_hdr vhdr =
+ {
+ .vcpu_id = id,
+ };
+ struct xc_sr_record rec =
+ {
+ .type = REC_TYPE_X86_PV_VCPU_MSRS,
+ .length = sizeof(vhdr),
+ .data = &vhdr,
+ };
+ struct xen_domctl domctl =
+ {
+ .cmd = XEN_DOMCTL_get_vcpu_msrs,
+ .domain = ctx->domid,
+ .u.vcpu_msrs.vcpu = id,
+ };
+
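+ /* The first domctl leaves the msrs handle unset and just reports the
+ * number of MSRs, used to size the hypercall buffer below. */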
+ if ( xc_domctl(xch, &domctl) < 0 )
+ {
+ PERROR("Unable to get vcpu%u's msrs", id);
+ goto err;
+ }
+
+ /* No MSRs? skip this record. */
+ if ( !domctl.u.vcpu_msrs.msr_count )
+ goto out;
+
+ buffersz = domctl.u.vcpu_msrs.msr_count * sizeof(xen_domctl_vcpu_msr_t);
+ buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz);
+ if ( !buffer )
+ {
+ ERROR("Unable to allocate %zu bytes for vcpu%u's msrs",
+ buffersz, id);
+ goto err;
+ }
+
+ set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);
+ if ( xc_domctl(xch, &domctl) < 0 )
+ {
+ PERROR("Unable to get vcpu%u's msrs", id);
+ goto err;
+ }
+
+ rc = write_split_record(ctx, &rec, buffer,
+ domctl.u.vcpu_msrs.msr_count *
+ sizeof(xen_domctl_vcpu_msr_t));
+ if ( rc )
+ goto err;
+
+ out:
+ rc = 0;
+
+ err:
+ xc_hypercall_buffer_free(xch, buffer);
+
+ return rc;
+}
+
+/*
+ * For each vcpu, if it is online, write its state into the stream.
+ */
+static int write_all_vcpu_information(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xc_vcpuinfo_t vinfo;
+ unsigned int i;
+ int rc;
+
+ for ( i = 0; i <= ctx->dominfo.max_vcpu_id; ++i )
+ {
+ rc = xc_vcpu_getinfo(xch, ctx->domid, i, &vinfo);
+ if ( rc )
+ {
+ PERROR("Failed to get vcpu%u information", i);
+ return rc;
+ }
+
+ /* Vcpu offline? skip all these records. */
+ if ( !vinfo.online )
+ continue;
+
+ rc = write_one_vcpu_basic(ctx, i);
+ if ( rc )
+ return rc;
+
+ rc = write_one_vcpu_extended(ctx, i);
+ if ( rc )
+ return rc;
+
+ rc = write_one_vcpu_xsave(ctx, i);
+ if ( rc )
+ return rc;
+
+ rc = write_one_vcpu_msrs(ctx, i);
+ if ( rc )
+ return rc;
+ }
+
+ return 0;
+}
+
+/*
+ * Writes an X86_PV_INFO record into the stream.
+ */
+static int write_x86_pv_info(struct xc_sr_context *ctx)
+{
+ struct xc_sr_rec_x86_pv_info info =
+ {
+ .guest_width = ctx->x86_pv.width,
+ .pt_levels = ctx->x86_pv.levels,
+ };
+ struct xc_sr_record rec =
+ {
+ .type = REC_TYPE_X86_PV_INFO,
+ .length = sizeof(info),
+ .data = &info
+ };
+
+ return write_record(ctx, &rec);
+}
+
+/*
+ * Writes an X86_PV_P2M_FRAMES record into the stream. This contains the list
+ * of pfns making up the p2m table.
+ */
+static int write_x86_pv_p2m_frames(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+ unsigned i;
+ size_t datasz = ctx->x86_pv.p2m_frames * sizeof(uint64_t);
+ uint64_t *data = NULL;
+ struct xc_sr_rec_x86_pv_p2m_frames hdr =
+ {
+ .start_pfn = 0,
+ .end_pfn = ctx->x86_pv.max_pfn,
+ };
+ struct xc_sr_record rec =
+ {
+ .type = REC_TYPE_X86_PV_P2M_FRAMES,
+ .length = sizeof(hdr),
+ .data = &hdr,
+ };
+
+ /* No need to translate if sizeof(uint64_t) == sizeof(xen_pfn_t). */
+ if ( sizeof(uint64_t) != sizeof(*ctx->x86_pv.p2m_pfns) )
+ {
+ if ( !(data = malloc(datasz)) )
+ {
+ ERROR("Cannot allocate %zu bytes for X86_PV_P2M_FRAMES data",
+ datasz);
+ return -1;
+ }
+
+ for ( i = 0; i < ctx->x86_pv.p2m_frames; ++i )
+ data[i] = ctx->x86_pv.p2m_pfns[i];
+ }
+ else
+ data = (uint64_t *)ctx->x86_pv.p2m_pfns;
+
+ rc = write_split_record(ctx, &rec, data, datasz);
+
+ if ( data != (uint64_t *)ctx->x86_pv.p2m_pfns )
+ free(data);
+
+ return rc;
+}
+
+/*
+ * Writes a SHARED_INFO record into the stream.
+ */
+static int write_shared_info(struct xc_sr_context *ctx)
+{
+ struct xc_sr_record rec =
+ {
+ .type = REC_TYPE_SHARED_INFO,
+ .length = PAGE_SIZE,
+ .data = ctx->x86_pv.shinfo,
+ };
+
+ return write_record(ctx, &rec);
+}
+
+/*
+ * Normalise a pagetable for the migration stream. Performs pfn->mfn
+ * conversions on the ptes.
+ */
+static int normalise_pagetable(struct xc_sr_context *ctx, const uint64_t *src,
+ uint64_t *dst, unsigned long type)
+{
+ xc_interface *xch = ctx->xch;
+ uint64_t pte;
+ unsigned i, xen_first = -1, xen_last = -1; /* Indices of Xen mappings. */
+
+ type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+ if ( ctx->x86_pv.levels == 4 )
+ {
+ /* 64bit guests only have Xen mappings in their L4 tables. */
+ if ( type == XEN_DOMCTL_PFINFO_L4TAB )
+ {
+ xen_first = 256;
+ xen_last = 271;
+ }
+ }
+ else
+ {
+ switch ( type )
+ {
+ case XEN_DOMCTL_PFINFO_L4TAB:
+ ERROR("??? Found L4 table for 32bit guest");
+ errno = EINVAL;
+ return -1;
+
+ case XEN_DOMCTL_PFINFO_L3TAB:
+ /* 32bit guests can only use the first 4 entries of their L3 tables.
+ * All others are potentially used by Xen. */
+ xen_first = 4;
+ xen_last = 512;
+ break;
+
+ case XEN_DOMCTL_PFINFO_L2TAB:
+ /* It is hard to spot Xen mappings in a 32bit guest's L2. Most
+ * tables are normal, but a few will contain Xen mappings.
+ *
+ * 428 = (HYPERVISOR_VIRT_START_PAE >> L2_PAGETABLE_SHIFT_PAE)&0x1ff
+ *
+ * ...which is conveniently unavailable to us in a 64bit build.
+ */
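+ /* (Illustratively, assuming HYPERVISOR_VIRT_START_PAE is 0xf5800000
+ * and L2_PAGETABLE_SHIFT_PAE is 21:
+ * (0xf5800000 >> 21) & 0x1ff == 428.) */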
+ if ( pte_to_frame(src[428]) == ctx->x86_pv.compat_m2p_mfn0 )
+ {
+ xen_first = 428;
+ xen_last = 512;
+ }
+ break;
+ }
+ }
+
+ for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
+ {
+ xen_pfn_t mfn;
+
+ pte = src[i];
+
+ /* Remove Xen mappings: Xen will reconstruct on the other side. */
+ if ( i >= xen_first && i <= xen_last )
+ pte = 0;
+
+ /*
+ * Errors during the live part of migration are expected as a result
+ * of split pagetable updates, page type changes, active grant
+ * mappings etc. The pagetable will need to be resent after pausing.
+ * In such cases we fail with EAGAIN.
+ *
+ * For domains which are already paused, errors are fatal.
+ */
+ if ( pte & _PAGE_PRESENT )
+ {
+ mfn = pte_to_frame(pte);
+
+#ifdef __i386__
+ if ( mfn == INVALID_MFN )
+ {
+ ERROR("PTE truncation detected. L%lu[%u] = %016"PRIx64,
+ type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
+ errno = E2BIG;
+ return -1;
+ }
+#endif
+
+ if ( (type > XEN_DOMCTL_PFINFO_L1TAB) && (pte & _PAGE_PSE) )
+ {
+ if ( !ctx->dominfo.paused )
+ errno = EAGAIN;
+ else
+ {
+ ERROR("Cannot migrate superpage (L%lu[%u]: 0x%016"PRIx64")",
+ type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
+ errno = E2BIG;
+ }
+ return -1;
+ }
+
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ if ( !ctx->dominfo.paused )
+ errno = EAGAIN;
+ else
+ {
+ ERROR("Bad mfn for L%lu[%u]",
+ type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ }
+ return -1;
+ }
+
+ pte = merge_pte(pte, mfn_to_pfn(ctx, mfn));
+ }
+
+ dst[i] = pte;
+ }
+
+ return 0;
+}
+
+/* save_ops function. */
+static xen_pfn_t x86_pv_pfn_to_gfn(const struct xc_sr_context *ctx,
+ xen_pfn_t pfn)
+{
+ assert(pfn <= ctx->x86_pv.max_pfn);
+
+ return xc_pfn_to_mfn(pfn, ctx->x86_pv.p2m, ctx->x86_pv.width);
+}
+
+
+/*
+ * save_ops function. Performs pagetable normalisation on appropriate pages.
+ */
+static int x86_pv_normalise_page(struct xc_sr_context *ctx, xen_pfn_t type,
+ void **page)
+{
+ xc_interface *xch = ctx->xch;
+ void *local_page;
+ int rc;
+
+ type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+ if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
+ return 0;
+
+ local_page = malloc(PAGE_SIZE);
+ if ( !local_page )
+ {
+ ERROR("Unable to allocate scratch page");
+ rc = -1;
+ goto out;
+ }
+
+ rc = normalise_pagetable(ctx, *page, local_page, type);
+ *page = local_page;
+
+ out:
+ return rc;
+}
+
+/*
+ * save_ops function. Queries domain information and maps the Xen m2p and the
+ * guest's shinfo and p2m table.
+ */
+static int x86_pv_setup(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ rc = x86_pv_domain_info(ctx);
+ if ( rc )
+ return rc;
+
+ rc = x86_pv_map_m2p(ctx);
+ if ( rc )
+ return rc;
+
+ rc = map_shinfo(ctx);
+ if ( rc )
+ return rc;
+
+ rc = map_p2m(ctx);
+ if ( rc )
+ return rc;
+
+ return 0;
+}
+
+/*
+ * save_ops function. Writes PV header records into the stream.
+ */
+static int x86_pv_start_of_stream(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ rc = write_x86_pv_info(ctx);
+ if ( rc )
+ return rc;
+
+ /*
+ * Ideally the guest would be free to change its P2M during migration.
+ * Currently, corruption will occur if the contents or location of the
+ * P2M change during the live migration loop. If one is very lucky, the
+ * breakage will not be subtle.
+ */
+ rc = write_x86_pv_p2m_frames(ctx);
+ if ( rc )
+ return rc;
+
+ return 0;
+}
+
+static int x86_pv_start_of_checkpoint(struct xc_sr_context *ctx)
+{
+ return 0;
+}
+
+static int x86_pv_end_of_checkpoint(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ rc = write_tsc_info(ctx);
+ if ( rc )
+ return rc;
+
+ rc = write_shared_info(ctx);
+ if ( rc )
+ return rc;
+
+ rc = write_all_vcpu_information(ctx);
+ if ( rc )
+ return rc;
+
+ return 0;
+}
+
+/*
+ * save_ops function. Cleanup.
+ */
+static int x86_pv_cleanup(struct xc_sr_context *ctx)
+{
+ free(ctx->x86_pv.p2m_pfns);
+
+ if ( ctx->x86_pv.p2m )
+ munmap(ctx->x86_pv.p2m, ctx->x86_pv.p2m_frames * PAGE_SIZE);
+
+ if ( ctx->x86_pv.shinfo )
+ munmap(ctx->x86_pv.shinfo, PAGE_SIZE);
+
+ if ( ctx->x86_pv.m2p )
+ munmap(ctx->x86_pv.m2p, ctx->x86_pv.nr_m2p_frames * PAGE_SIZE);
+
+ return 0;
+}
+
+struct xc_sr_save_ops save_ops_x86_pv =
+{
+ .pfn_to_gfn = x86_pv_pfn_to_gfn,
+ .normalise_page = x86_pv_normalise_page,
+ .setup = x86_pv_setup,
+ .start_of_stream = x86_pv_start_of_stream,
+ .start_of_checkpoint = x86_pv_start_of_checkpoint,
+ .end_of_checkpoint = x86_pv_end_of_checkpoint,
+ .cleanup = x86_pv_cleanup,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_sr_stream_format.h b/tools/libxc/xc_sr_stream_format.h
new file mode 100644
index 0000000..6d0f8fd
--- /dev/null
+++ b/tools/libxc/xc_sr_stream_format.h
@@ -0,0 +1,149 @@
+#ifndef __STREAM_FORMAT__H
+#define __STREAM_FORMAT__H
+
+/*
+ * C structures for the Migration v2 stream format.
+ * See docs/specs/libxc-migration-stream.pandoc
+ */
+
+#include <inttypes.h>
+
+/*
+ * Image Header
+ */
+struct xc_sr_ihdr
+{
+ uint64_t marker;
+ uint32_t id;
+ uint32_t version;
+ uint16_t options;
+ uint16_t _res1;
+ uint32_t _res2;
+};
+
+#define IHDR_MARKER 0xffffffffffffffffULL
+#define IHDR_ID 0x58454E46U
+#define IHDR_VERSION 2
+
+#define _IHDR_OPT_ENDIAN 0
+#define IHDR_OPT_LITTLE_ENDIAN (0 << _IHDR_OPT_ENDIAN)
+#define IHDR_OPT_BIG_ENDIAN (1 << _IHDR_OPT_ENDIAN)
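+
+/*
+ * Illustrative sketch (not part of this header), assuming, as the spec
+ * describes, that the image header fields are written in network byte
+ * order:
+ *
+ * struct xc_sr_ihdr ihdr = {
+ * .marker = IHDR_MARKER,
+ * .id = htonl(IHDR_ID),
+ * .version = htonl(IHDR_VERSION),
+ * .options = htons(IHDR_OPT_LITTLE_ENDIAN),
+ * };
+ */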
+
+/*
+ * Domain Header
+ */
+struct xc_sr_dhdr
+{
+ uint32_t type;
+ uint16_t page_shift;
+ uint16_t _res1;
+ uint32_t xen_major;
+ uint32_t xen_minor;
+};
+
+#define DHDR_TYPE_X86_PV 0x00000001U
+#define DHDR_TYPE_X86_HVM 0x00000002U
+#define DHDR_TYPE_X86_PVH 0x00000003U
+#define DHDR_TYPE_ARM 0x00000004U
+
+/*
+ * Record Header
+ */
+struct xc_sr_rhdr
+{
+ uint32_t type;
+ uint32_t length;
+};
+
+/* All records must be aligned up to an 8-octet boundary */
+#define REC_ALIGN_ORDER (3U)
+/* Somewhat arbitrary - 8MB */
+#define REC_LENGTH_MAX (8U << 20)
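+
+/*
+ * Illustrative: the zero padding needed after a record body of 'length'
+ * octets is (-length) & ((1U << REC_ALIGN_ORDER) - 1), e.g. a 10-octet
+ * body is followed by 6 octets of padding.
+ */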
+
+#define REC_TYPE_END 0x00000000U
+#define REC_TYPE_PAGE_DATA 0x00000001U
+#define REC_TYPE_X86_PV_INFO 0x00000002U
+#define REC_TYPE_X86_PV_P2M_FRAMES 0x00000003U
+#define REC_TYPE_X86_PV_VCPU_BASIC 0x00000004U
+#define REC_TYPE_X86_PV_VCPU_EXTENDED 0x00000005U
+#define REC_TYPE_X86_PV_VCPU_XSAVE 0x00000006U
+#define REC_TYPE_SHARED_INFO 0x00000007U
+#define REC_TYPE_TSC_INFO 0x00000008U
+#define REC_TYPE_HVM_CONTEXT 0x00000009U
+#define REC_TYPE_HVM_PARAMS 0x0000000aU
+#define REC_TYPE_TOOLSTACK 0x0000000bU
+#define REC_TYPE_X86_PV_VCPU_MSRS 0x0000000cU
+#define REC_TYPE_VERIFY 0x0000000dU
+#define REC_TYPE_CHECKPOINT 0x0000000eU
+
+#define REC_TYPE_OPTIONAL 0x80000000U
+
+/* PAGE_DATA */
+struct xc_sr_rec_page_data_header
+{
+ uint32_t count;
+ uint32_t _res1;
+ uint64_t pfn[0];
+};
+
+#define PAGE_DATA_PFN_MASK 0x000fffffffffffffULL
+#define PAGE_DATA_TYPE_MASK 0xf000000000000000ULL
+
+/* X86_PV_INFO */
+struct xc_sr_rec_x86_pv_info
+{
+ uint8_t guest_width;
+ uint8_t pt_levels;
+ uint8_t _res[6];
+};
+
+/* X86_PV_P2M_FRAMES */
+struct xc_sr_rec_x86_pv_p2m_frames
+{
+ uint32_t start_pfn;
+ uint32_t end_pfn;
+ uint64_t p2m_pfns[0];
+};
+
+/* X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} */
+struct xc_sr_rec_x86_pv_vcpu_hdr
+{
+ uint32_t vcpu_id;
+ uint32_t _res1;
+ uint8_t context[0];
+};
+
+/* TSC_INFO */
+struct xc_sr_rec_tsc_info
+{
+ uint32_t mode;
+ uint32_t khz;
+ uint64_t nsec;
+ uint32_t incarnation;
+ uint32_t _res1;
+};
+
+/* HVM_PARAMS */
+struct xc_sr_rec_hvm_params_entry
+{
+ uint64_t index;
+ uint64_t value;
+};
+
+struct xc_sr_rec_hvm_params
+{
+ uint32_t count;
+ uint32_t _res1;
+ struct xc_sr_rec_hvm_params_entry param[0];
+};
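+
+/*
+ * Illustrative: an HVM_PARAMS record carrying N params occupies
+ * sizeof(struct xc_sr_rec_hvm_params) +
+ * N * sizeof(struct xc_sr_rec_hvm_params_entry) = 8 + 16*N octets
+ * of record data.
+ */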
+
+#endif
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xc_suspend.c b/tools/libxc/xc_suspend.c
index e22f4ac..bba36e7 100644
--- a/tools/libxc/xc_suspend.c
+++ b/tools/libxc/xc_suspend.c
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <unistd.h>
diff --git a/tools/libxc/xc_tbuf.c b/tools/libxc/xc_tbuf.c
index 8777492..695939a 100644
--- a/tools/libxc/xc_tbuf.c
+++ b/tools/libxc/xc_tbuf.c
@@ -19,8 +19,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
@@ -113,15 +112,30 @@ int xc_tbuf_disable(xc_interface *xch)
return tbuf_enable(xch, 0);
}
-int xc_tbuf_set_cpu_mask(xc_interface *xch, uint32_t mask)
+int xc_tbuf_set_cpu_mask(xc_interface *xch, xc_cpumap_t mask)
{
DECLARE_SYSCTL;
- DECLARE_HYPERCALL_BUFFER(uint8_t, bytemap);
+ DECLARE_HYPERCALL_BOUNCE(mask, 0, XC_HYPERCALL_BUFFER_BOUNCE_IN);
int ret = -1;
- uint64_t mask64 = mask;
+ int bits, cpusize;
- bytemap = xc_hypercall_buffer_alloc(xch, bytemap, sizeof(mask64));
- if ( bytemap == NULL )
+ cpusize = xc_get_cpumap_size(xch);
+ if (cpusize <= 0)
+ {
+ PERROR("Could not get number of cpus");
+ return -1;
+ }
+
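+ /*
+ * Size the bounce declaration to the reported cpumap size; the
+ * bounce_pre call below copies the caller's mask into hypercall-safe
+ * memory, and bounce_post releases it.
+ */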
+ HYPERCALL_BOUNCE_SET_SIZE(mask, cpusize);
+
+ bits = xc_get_max_cpus(xch);
+ if (bits <= 0)
+ {
+ PERROR("Could not get number of bits");
+ return -1;
+ }
+
+ if ( xc_hypercall_bounce_pre(xch, mask) )
{
PERROR("Could not allocate memory for xc_tbuf_set_cpu_mask hypercall");
goto out;
@@ -131,14 +145,12 @@ int xc_tbuf_set_cpu_mask(xc_interface *xch, uint32_t mask)
sysctl.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
sysctl.u.tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_cpu_mask;
- bitmap_64_to_byte(bytemap, &mask64, sizeof (mask64) * 8);
-
- set_xen_guest_handle(sysctl.u.tbuf_op.cpu_mask.bitmap, bytemap);
- sysctl.u.tbuf_op.cpu_mask.nr_bits = sizeof(bytemap) * 8;
+ set_xen_guest_handle(sysctl.u.tbuf_op.cpu_mask.bitmap, mask);
+ sysctl.u.tbuf_op.cpu_mask.nr_bits = bits;
ret = do_sysctl(xch, &sysctl);
- xc_hypercall_buffer_free(xch, bytemap);
+ xc_hypercall_bounce_post(xch, mask);
out:
return ret;
diff --git a/tools/libxc/xc_tmem.c b/tools/libxc/xc_tmem.c
index 3261e10..8f4c0cc 100644
--- a/tools/libxc/xc_tmem.c
+++ b/tools/libxc/xc_tmem.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
@@ -48,44 +47,46 @@ static int do_tmem_op(xc_interface *xch, tmem_op_t *op)
int xc_tmem_control(xc_interface *xch,
int32_t pool_id,
- uint32_t subop,
+ uint32_t cmd,
uint32_t cli_id,
uint32_t arg1,
uint32_t arg2,
- uint64_t arg3,
void *buf)
{
- tmem_op_t op;
+ DECLARE_SYSCTL;
DECLARE_HYPERCALL_BOUNCE(buf, arg1, XC_HYPERCALL_BUFFER_BOUNCE_OUT);
int rc;
- op.cmd = TMEM_CONTROL;
- op.pool_id = pool_id;
- op.u.ctrl.subop = subop;
- op.u.ctrl.cli_id = cli_id;
- op.u.ctrl.arg1 = arg1;
- op.u.ctrl.arg2 = arg2;
- /* use xc_tmem_control_oid if arg3 is required */
- op.u.ctrl.oid[0] = 0;
- op.u.ctrl.oid[1] = 0;
- op.u.ctrl.oid[2] = 0;
-
- if ( subop == TMEMC_LIST && arg1 != 0 )
+ sysctl.cmd = XEN_SYSCTL_tmem_op;
+ sysctl.u.tmem_op.pool_id = pool_id;
+ sysctl.u.tmem_op.cmd = cmd;
+ sysctl.u.tmem_op.cli_id = cli_id;
+ sysctl.u.tmem_op.arg1 = arg1;
+ sysctl.u.tmem_op.arg2 = arg2;
+ sysctl.u.tmem_op.pad = 0;
+ sysctl.u.tmem_op.oid.oid[0] = 0;
+ sysctl.u.tmem_op.oid.oid[1] = 0;
+ sysctl.u.tmem_op.oid.oid[2] = 0;
+
+ if ( cmd == XEN_SYSCTL_TMEM_OP_LIST && arg1 != 0 )
{
if ( buf == NULL )
- return -EINVAL;
+ {
+ errno = EINVAL;
+ return -1;
+ }
if ( xc_hypercall_bounce_pre(xch, buf) )
{
PERROR("Could not bounce buffer for tmem control hypercall");
- return -ENOMEM;
+ return -1;
}
}
- set_xen_guest_handle(op.u.ctrl.buf, buf);
+ set_xen_guest_handle(sysctl.u.tmem_op.buf, buf);
- rc = do_tmem_op(xch, &op);
+ rc = do_sysctl(xch, &sysctl);
- if (subop == TMEMC_LIST && arg1 != 0)
+ if ( cmd == XEN_SYSCTL_TMEM_OP_LIST && arg1 != 0 )
xc_hypercall_bounce_post(xch, buf);
return rc;
@@ -93,44 +94,45 @@ int xc_tmem_control(xc_interface *xch,
int xc_tmem_control_oid(xc_interface *xch,
int32_t pool_id,
- uint32_t subop,
+ uint32_t cmd,
uint32_t cli_id,
uint32_t arg1,
uint32_t arg2,
- struct tmem_oid oid,
+ struct xen_tmem_oid oid,
void *buf)
{
- tmem_op_t op;
+ DECLARE_SYSCTL;
DECLARE_HYPERCALL_BOUNCE(buf, arg1, XC_HYPERCALL_BUFFER_BOUNCE_OUT);
int rc;
- op.cmd = TMEM_CONTROL;
- op.pool_id = pool_id;
- op.u.ctrl.subop = subop;
- op.u.ctrl.cli_id = cli_id;
- set_xen_guest_handle(op.u.ctrl.buf,buf);
- op.u.ctrl.arg1 = arg1;
- op.u.ctrl.arg2 = arg2;
- op.u.ctrl.oid[0] = oid.oid[0];
- op.u.ctrl.oid[1] = oid.oid[1];
- op.u.ctrl.oid[2] = oid.oid[2];
-
- if ( subop == TMEMC_LIST && arg1 != 0 )
+ sysctl.cmd = XEN_SYSCTL_tmem_op;
+ sysctl.u.tmem_op.pool_id = pool_id;
+ sysctl.u.tmem_op.cmd = cmd;
+ sysctl.u.tmem_op.cli_id = cli_id;
+ sysctl.u.tmem_op.arg1 = arg1;
+ sysctl.u.tmem_op.arg2 = arg2;
+ sysctl.u.tmem_op.pad = 0;
+ sysctl.u.tmem_op.oid = oid;
+
+ if ( cmd == XEN_SYSCTL_TMEM_OP_LIST && arg1 != 0 )
{
if ( buf == NULL )
- return -EINVAL;
+ {
+ errno = EINVAL;
+ return -1;
+ }
if ( xc_hypercall_bounce_pre(xch, buf) )
{
PERROR("Could not bounce buffer for tmem control (OID) hypercall");
- return -ENOMEM;
+ return -1;
}
}
- set_xen_guest_handle(op.u.ctrl.buf, buf);
+ set_xen_guest_handle(sysctl.u.tmem_op.buf, buf);
- rc = do_tmem_op(xch, &op);
+ rc = do_sysctl(xch, &sysctl);
- if (subop == TMEMC_LIST && arg1 != 0)
+ if ( cmd == XEN_SYSCTL_TMEM_OP_LIST && arg1 != 0 )
xc_hypercall_bounce_post(xch, buf);
return rc;
@@ -216,28 +218,28 @@ int xc_tmem_save(xc_interface *xch,
uint32_t minusone = -1;
struct tmem_handle *h;
- if ( xc_tmem_control(xch,0,TMEMC_SAVE_BEGIN,dom,live,0,0,NULL) <= 0 )
+ if ( xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SAVE_BEGIN,dom,live,0,NULL) <= 0 )
return 0;
if ( write_exact(io_fd, &marker, sizeof(marker)) )
return -1;
- version = xc_tmem_control(xch,0,TMEMC_SAVE_GET_VERSION,0,0,0,0,NULL);
+ version = xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SAVE_GET_VERSION,0,0,0,NULL);
if ( write_exact(io_fd, &version, sizeof(version)) )
return -1;
- max_pools = xc_tmem_control(xch,0,TMEMC_SAVE_GET_MAXPOOLS,0,0,0,0,NULL);
+ max_pools = xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SAVE_GET_MAXPOOLS,0,0,0,NULL);
if ( write_exact(io_fd, &max_pools, sizeof(max_pools)) )
return -1;
if ( version == -1 || max_pools == -1 )
return -1;
if ( write_exact(io_fd, &minusone, sizeof(minusone)) )
return -1;
- flags = xc_tmem_control(xch,0,TMEMC_SAVE_GET_CLIENT_FLAGS,dom,0,0,0,NULL);
+ flags = xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_FLAGS,dom,0,0,NULL);
if ( write_exact(io_fd, &flags, sizeof(flags)) )
return -1;
- weight = xc_tmem_control(xch,0,TMEMC_SAVE_GET_CLIENT_WEIGHT,dom,0,0,0,NULL);
+ weight = xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_WEIGHT,dom,0,0,NULL);
if ( write_exact(io_fd, &weight, sizeof(weight)) )
return -1;
- cap = xc_tmem_control(xch,0,TMEMC_SAVE_GET_CLIENT_CAP,dom,0,0,0,NULL);
+ cap = xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_CAP,dom,0,0,NULL);
if ( write_exact(io_fd, &cap, sizeof(cap)) )
return -1;
if ( flags == -1 || weight == -1 || cap == -1 )
@@ -254,14 +256,14 @@ int xc_tmem_save(xc_interface *xch,
int checksum = 0;
/* get pool id, flags, pagesize, n_pages, uuid */
- flags = xc_tmem_control(xch,i,TMEMC_SAVE_GET_POOL_FLAGS,dom,0,0,0,NULL);
+ flags = xc_tmem_control(xch,i,XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_FLAGS,dom,0,0,NULL);
if ( flags != -1 )
{
pool_id = i;
- n_pages = xc_tmem_control(xch,i,TMEMC_SAVE_GET_POOL_NPAGES,dom,0,0,0,NULL);
+ n_pages = xc_tmem_control(xch,i,XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_NPAGES,dom,0,0,NULL);
if ( !(flags & TMEM_POOL_PERSIST) )
n_pages = 0;
- (void)xc_tmem_control(xch,i,TMEMC_SAVE_GET_POOL_UUID,dom,sizeof(uuid),0,0,&uuid);
+ (void)xc_tmem_control(xch,i,XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_UUID,dom,sizeof(uuid),0,&uuid);
if ( write_exact(io_fd, &pool_id, sizeof(pool_id)) )
return -1;
if ( write_exact(io_fd, &flags, sizeof(flags)) )
@@ -285,8 +287,8 @@ int xc_tmem_save(xc_interface *xch,
{
int ret;
if ( (ret = xc_tmem_control(xch, pool_id,
- TMEMC_SAVE_GET_NEXT_PAGE, dom,
- bufsize, 0, 0, buf)) > 0 )
+ XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_PAGE, dom,
+ bufsize, 0, buf)) > 0 )
{
h = (struct tmem_handle *)buf;
if ( write_exact(io_fd, &h->oid, sizeof(h->oid)) )
@@ -302,7 +304,7 @@ int xc_tmem_save(xc_interface *xch,
} else {
/* page list terminator */
h = (struct tmem_handle *)buf;
- h->oid[0] = h->oid[1] = h->oid[2] = -1L;
+ h->oid.oid[0] = h->oid.oid[1] = h->oid.oid[2] = -1L;
if ( write_exact(io_fd, &h->oid, sizeof(h->oid)) )
return -1;
break;
@@ -330,8 +332,8 @@ int xc_tmem_save_extra(xc_interface *xch, int dom, int io_fd, int field_marker)
if ( write_exact(io_fd, &marker, sizeof(marker)) )
return -1;
- while ( xc_tmem_control(xch, 0, TMEMC_SAVE_GET_NEXT_INV, dom,
- sizeof(handle),0,0,&handle) > 0 ) {
+ while ( xc_tmem_control(xch, 0, XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_INV, dom,
+ sizeof(handle),0,&handle) > 0 ) {
if ( write_exact(io_fd, &handle.pool_id, sizeof(handle.pool_id)) )
return -1;
if ( write_exact(io_fd, &handle.oid, sizeof(handle.oid)) )
@@ -339,8 +341,8 @@ int xc_tmem_save_extra(xc_interface *xch, int dom, int io_fd, int field_marker)
if ( write_exact(io_fd, &handle.index, sizeof(handle.index)) )
return -1;
count++;
- checksum += handle.pool_id + handle.oid[0] + handle.oid[1] +
- handle.oid[2] + handle.index;
+ checksum += handle.pool_id + handle.oid.oid[0] + handle.oid.oid[1] +
+ handle.oid.oid[2] + handle.index;
}
if ( count )
DPRINTF("needed %d tmem invalidates, check=%d\n",count,checksum);
@@ -353,7 +355,7 @@ int xc_tmem_save_extra(xc_interface *xch, int dom, int io_fd, int field_marker)
/* only called for live migration */
void xc_tmem_save_done(xc_interface *xch, int dom)
{
- xc_tmem_control(xch,0,TMEMC_SAVE_END,dom,0,0,0,NULL);
+ xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SAVE_END,dom,0,0,NULL);
}
/* restore routines */
@@ -387,7 +389,7 @@ int xc_tmem_restore(xc_interface *xch, int dom, int io_fd)
uint32_t weight, cap, flags;
int checksum = 0;
- save_version = xc_tmem_control(xch,0,TMEMC_SAVE_GET_VERSION,dom,0,0,0,NULL);
+ save_version = xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SAVE_GET_VERSION,dom,0,0,NULL);
if ( save_version == -1 )
return -1; /* domain doesn't exist */
if ( read_exact(io_fd, &this_version, sizeof(this_version)) )
@@ -399,23 +401,23 @@ int xc_tmem_restore(xc_interface *xch, int dom, int io_fd)
return -1;
if ( minusone != -1 )
return -1;
- if ( xc_tmem_control(xch,0,TMEMC_RESTORE_BEGIN,dom,0,0,0,NULL) < 0 )
+ if ( xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN,dom,0,0,NULL) < 0 )
return -1;
if ( read_exact(io_fd, &flags, sizeof(flags)) )
return -1;
if ( flags & TMEM_CLIENT_COMPRESS )
- if ( xc_tmem_control(xch,0,TMEMC_SET_COMPRESS,dom,1,0,0,NULL) < 0 )
+ if ( xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SET_COMPRESS,dom,1,0,NULL) < 0 )
return -1;
if ( flags & TMEM_CLIENT_FROZEN )
- if ( xc_tmem_control(xch,0,TMEMC_FREEZE,dom,0,0,0,NULL) < 0 )
+ if ( xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_FREEZE,dom,0,0,NULL) < 0 )
return -1;
if ( read_exact(io_fd, &weight, sizeof(weight)) )
return -1;
- if ( xc_tmem_control(xch,0,TMEMC_SET_WEIGHT,dom,0,0,0,NULL) < 0 )
+ if ( xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SET_WEIGHT,dom,0,0,NULL) < 0 )
return -1;
if ( read_exact(io_fd, &cap, sizeof(cap)) )
return -1;
- if ( xc_tmem_control(xch,0,TMEMC_SET_CAP,dom,0,0,0,NULL) < 0 )
+ if ( xc_tmem_control(xch,0,XEN_SYSCTL_TMEM_OP_SET_CAP,dom,0,0,NULL) < 0 )
return -1;
if ( read_exact(io_fd, &minusone, sizeof(minusone)) )
return -1;
@@ -449,7 +451,7 @@ int xc_tmem_restore(xc_interface *xch, int dom, int io_fd)
}
for ( j = n_pages; j > 0; j-- )
{
- struct tmem_oid oid;
+ struct xen_tmem_oid oid;
uint32_t index;
int rc;
if ( read_exact(io_fd, &oid, sizeof(oid)) )
@@ -462,7 +464,7 @@ int xc_tmem_restore(xc_interface *xch, int dom, int io_fd)
return -1;
checksum += *buf;
if ( (rc = xc_tmem_control_oid(xch, pool_id,
- TMEMC_RESTORE_PUT_PAGE, dom,
+ XEN_SYSCTL_TMEM_OP_RESTORE_PUT_PAGE, dom,
bufsize, index, oid, buf)) <= 0 )
{
DPRINTF("xc_tmem_restore: putting page failed, rc=%d\n",rc);
@@ -483,7 +485,7 @@ int xc_tmem_restore(xc_interface *xch, int dom, int io_fd)
int xc_tmem_restore_extra(xc_interface *xch, int dom, int io_fd)
{
uint32_t pool_id;
- struct tmem_oid oid;
+ struct xen_tmem_oid oid;
uint32_t index;
int count = 0;
int checksum = 0;
@@ -494,7 +496,7 @@ int xc_tmem_restore_extra(xc_interface *xch, int dom, int io_fd)
return -1;
if ( read_exact(io_fd, &index, sizeof(index)) )
return -1;
- if ( xc_tmem_control_oid(xch, pool_id, TMEMC_RESTORE_FLUSH_PAGE, dom,
+ if ( xc_tmem_control_oid(xch, pool_id, XEN_SYSCTL_TMEM_OP_RESTORE_FLUSH_PAGE, dom,
0,index,oid,NULL) <= 0 )
return -1;
count++;
diff --git a/tools/libxc/xc_mem_event.c b/tools/libxc/xc_vm_event.c
similarity index 67%
rename from tools/libxc/xc_mem_event.c
rename to tools/libxc/xc_vm_event.c
index 8c0be4e..2fef96a 100644
--- a/tools/libxc/xc_mem_event.c
+++ b/tools/libxc/xc_vm_event.c
@@ -1,6 +1,6 @@
/******************************************************************************
*
- * xc_mem_event.c
+ * xc_vm_event.c
*
* Interface to low-level memory event functionality.
*
@@ -17,47 +17,30 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
-int xc_mem_event_control(xc_interface *xch, domid_t domain_id, unsigned int op,
- unsigned int mode, uint32_t *port)
+int xc_vm_event_control(xc_interface *xch, domid_t domain_id, unsigned int op,
+ unsigned int mode, uint32_t *port)
{
DECLARE_DOMCTL;
int rc;
- domctl.cmd = XEN_DOMCTL_mem_event_op;
+ domctl.cmd = XEN_DOMCTL_vm_event_op;
domctl.domain = domain_id;
- domctl.u.mem_event_op.op = op;
- domctl.u.mem_event_op.mode = mode;
-
+ domctl.u.vm_event_op.op = op;
+ domctl.u.vm_event_op.mode = mode;
+
rc = do_domctl(xch, &domctl);
if ( !rc && port )
- *port = domctl.u.mem_event_op.port;
+ *port = domctl.u.vm_event_op.port;
return rc;
}
-int xc_mem_event_memop(xc_interface *xch, domid_t domain_id,
- unsigned int op, unsigned int mode,
- uint64_t gfn, void *buffer)
-{
- xen_mem_event_op_t meo;
-
- memset(&meo, 0, sizeof(meo));
-
- meo.op = op;
- meo.domain = domain_id;
- meo.gfn = gfn;
- meo.buffer = (unsigned long) buffer;
-
- return do_memory_op(xch, mode, &meo, sizeof(meo));
-}
-
-void *xc_mem_event_enable(xc_interface *xch, domid_t domain_id, int param,
- uint32_t *port, int enable_introspection)
+void *xc_vm_event_enable(xc_interface *xch, domid_t domain_id, int param,
+ uint32_t *port)
{
void *ring_page = NULL;
uint64_t pfn;
@@ -115,26 +98,23 @@ void *xc_mem_event_enable(xc_interface *xch, domid_t domain_id, int param,
switch ( param )
{
case HVM_PARAM_PAGING_RING_PFN:
- op = XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE;
- mode = XEN_DOMCTL_MEM_EVENT_OP_PAGING;
+ op = XEN_VM_EVENT_ENABLE;
+ mode = XEN_DOMCTL_VM_EVENT_OP_PAGING;
break;
- case HVM_PARAM_ACCESS_RING_PFN:
- if ( enable_introspection )
- op = XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE_INTROSPECTION;
- else
- op = XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE;
- mode = XEN_DOMCTL_MEM_EVENT_OP_ACCESS;
+ case HVM_PARAM_MONITOR_RING_PFN:
+ op = XEN_VM_EVENT_ENABLE;
+ mode = XEN_DOMCTL_VM_EVENT_OP_MONITOR;
break;
case HVM_PARAM_SHARING_RING_PFN:
- op = XEN_DOMCTL_MEM_EVENT_OP_SHARING_ENABLE;
- mode = XEN_DOMCTL_MEM_EVENT_OP_SHARING;
+ op = XEN_VM_EVENT_ENABLE;
+ mode = XEN_DOMCTL_VM_EVENT_OP_SHARING;
break;
/*
* This is for the outside chance that the HVM_PARAM is valid but is invalid
- * as far as mem_event goes.
+ * as far as vm_event goes.
*/
default:
errno = EINVAL;
@@ -142,10 +122,10 @@ void *xc_mem_event_enable(xc_interface *xch, domid_t domain_id, int param,
goto out;
}
- rc1 = xc_mem_event_control(xch, domain_id, op, mode, port);
+ rc1 = xc_vm_event_control(xch, domain_id, op, mode, port);
if ( rc1 != 0 )
{
- PERROR("Failed to enable mem_event\n");
+ PERROR("Failed to enable vm_event\n");
goto out;
}
diff --git a/tools/libxc/xg_private.c b/tools/libxc/xg_private.c
index c52cb44..67946e1 100644
--- a/tools/libxc/xg_private.c
+++ b/tools/libxc/xg_private.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdlib.h>
@@ -37,7 +36,7 @@ char *xc_read_image(xc_interface *xch,
if ( (kernel_fd = open(filename, O_RDONLY)) < 0 )
{
- PERROR("Could not open kernel image");
+ PERROR("Could not open kernel image '%s'", filename);
goto out;
}
diff --git a/tools/libxc/xg_private.h b/tools/libxc/xg_private.h
index 1910361..07eeb67 100644
--- a/tools/libxc/xg_private.h
+++ b/tools/libxc/xg_private.h
@@ -10,8 +10,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef XG_PRIVATE_H
diff --git a/tools/libxc/xg_save_restore.h b/tools/libxc/xg_save_restore.h
index bdd9009..303081d 100644
--- a/tools/libxc/xg_save_restore.h
+++ b/tools/libxc/xg_save_restore.h
@@ -12,8 +12,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xc_private.h"
@@ -22,248 +21,6 @@
#include <xen/foreign/x86_64.h>
/*
- * SAVE/RESTORE/MIGRATE PROTOCOL
- * =============================
- *
- * The general form of a stream of chunks is a header followed by a
- * body consisting of a variable number of chunks (terminated by a
- * chunk with type 0) followed by a trailer.
- *
- * For a rolling/checkpoint (e.g. remus) migration then the body and
- * trailer phases can be repeated until an external event
- * (e.g. failure) causes the process to terminate and commit to the
- * most recent complete checkpoint.
- *
- * HEADER
- * ------
- *
- * unsigned long : p2m_size
- *
- * extended-info (PV-only, optional):
- *
- * If first unsigned long == ~0UL then extended info is present,
- * otherwise unsigned long is part of p2m. Note that p2m_size above
- * does not include the length of the extended info.
- *
- * extended-info:
- *
- * unsigned long : signature == ~0UL
- * uint32_t : number of bytes remaining in extended-info
- *
- * 1 or more extended-info blocks of form:
- * char[4] : block identifier
- * uint32_t : block data size
- * bytes : block data
- *
- * defined extended-info blocks:
- * "vcpu" : VCPU context info containing vcpu_guest_context_t.
- * The precise variant of the context structure
- * (e.g. 32 vs 64 bit) is distinguished by
- * the block size.
- * "extv" : Presence indicates use of extended VCPU context in
- * tail, data size is 0.
- *
- * p2m (PV-only):
- *
- * consists of p2m_size bytes comprising an array of xen_pfn_t sized entries.
- *
- * BODY PHASE - Format A (for live migration or Remus without compression)
- * ----------
- *
- * A series of chunks with a common header:
- * int : chunk type
- *
- * If the chunk type is +ve then chunk contains guest memory data, and the
- * type contains the number of pages in the batch:
- *
- * unsigned long[] : PFN array, length == number of pages in batch
- * Each entry consists of XEN_DOMCTL_PFINFO_*
- * in bits 31-28 and the PFN number in bits 27-0.
- * page data : PAGE_SIZE bytes for each page marked present in PFN
- * array
- *
- * If the chunk type is -ve then chunk consists of one of a number of
- * metadata types. See definitions of XC_SAVE_ID_* below.
- *
- * If chunk type is 0 then body phase is complete.
- *
- *
- * BODY PHASE - Format B (for Remus with compression)
- * ----------
- *
- * A series of chunks with a common header:
- * int : chunk type
- *
- * If the chunk type is +ve then chunk contains array of PFNs corresponding
- * to guest memory and type contains the number of PFNs in the batch:
- *
- * unsigned long[] : PFN array, length == number of pages in batch
- * Each entry consists of XEN_DOMCTL_PFINFO_*
- * in bits 31-28 and the PFN number in bits 27-0.
- *
- * If the chunk type is -ve then chunk consists of one of a number of
- * metadata types. See definitions of XC_SAVE_ID_* below.
- *
- * If the chunk type is -ve and equals XC_SAVE_ID_COMPRESSED_DATA, then the
- * chunk consists of compressed page data, in the following format:
- *
- * unsigned long : Size of the compressed chunk to follow
- * compressed data : variable length data of size indicated above.
- * This chunk consists of compressed page data.
- * The number of pages in one chunk depends on
- * the amount of space available in the sender's
- * output buffer.
- *
- * Format of compressed data:
- * compressed_data = <deltas>*
- * delta = <marker, run*>
- * marker = (RUNFLAG|SKIPFLAG) bitwise-or RUNLEN [1 byte marker]
- * RUNFLAG = 0
- * SKIPFLAG = 1 << 7
- * RUNLEN = 7-bit unsigned value indicating number of WORDS in the run
- * run = string of bytes of length sizeof(WORD) * RUNLEN
- *
- * If marker contains RUNFLAG, then RUNLEN * sizeof(WORD) bytes of data following
- * the marker is copied into the target page at the appropriate offset indicated by
- * the offset_ptr
- * If marker contains SKIPFLAG, then the offset_ptr is advanced
- * by RUNLEN * sizeof(WORD).
- *
- * If chunk type is 0 then body phase is complete.
- *
- * There can be one or more chunks with type XC_SAVE_ID_COMPRESSED_DATA,
- * containing compressed pages. The compressed chunks are collated to form
- * one single compressed chunk for the entire iteration. The number of pages
- * present in this final compressed chunk will be equal to the total number
- * of valid PFNs specified by the +ve chunks.
- *
- * At the sender side, compressed pages are inserted into the output stream
- * in the same order as they would have been if compression logic was absent.
- *
- * Until last iteration, the BODY is sent in Format A, to maintain live
- * migration compatibility with receivers of older Xen versions.
- * At the last iteration, if Remus compression was enabled, the sender sends
- * a trigger, XC_SAVE_ID_ENABLE_COMPRESSION to tell the receiver to parse the
- * BODY in Format B from the next iteration onwards.
- *
- * An example sequence of chunks received in Format B:
- * +16 +ve chunk
- * unsigned long[16] PFN array
- * +100 +ve chunk
- * unsigned long[100] PFN array
- * +50 +ve chunk
- * unsigned long[50] PFN array
- *
- * XC_SAVE_ID_COMPRESSED_DATA TAG
- * N Length of compressed data
- * N bytes of DATA Decompresses to 166 pages
- *
- * XC_SAVE_ID_* other xc save chunks
- * 0 END BODY TAG
- *
- * Corner case with checkpoint compression:
- * At sender side, after pausing the domain, dirty pages are usually
- * copied out to a temporary buffer. After the domain is resumed,
- * compression is done and the compressed chunk(s) are sent, followed by
- * other XC_SAVE_ID_* chunks.
- * If the temporary buffer gets full while scanning for dirty pages,
- * the sender stops buffering of dirty pages, compresses the temporary
- * buffer and sends the compressed data with XC_SAVE_ID_COMPRESSED_DATA.
- * The sender then resumes the buffering of dirty pages and continues
- * scanning for the dirty pages.
- * For e.g., assume that the temporary buffer can hold 4096 pages and
- * there are 5000 dirty pages. The following is the sequence of chunks
- * that the receiver will see:
- *
- * +1024 +ve chunk
- * unsigned long[1024] PFN array
- * +1024 +ve chunk
- * unsigned long[1024] PFN array
- * +1024 +ve chunk
- * unsigned long[1024] PFN array
- * +1024 +ve chunk
- * unsigned long[1024] PFN array
- *
- * XC_SAVE_ID_COMPRESSED_DATA TAG
- * N Length of compressed data
- * N bytes of DATA Decompresses to 4096 pages
- *
- * +4 +ve chunk
- * unsigned long[4] PFN array
- *
- * XC_SAVE_ID_COMPRESSED_DATA TAG
- * M Length of compressed data
- * M bytes of DATA Decompresses to 4 pages
- *
- * XC_SAVE_ID_* other xc save chunks
- * 0 END BODY TAG
- *
- * In other words, XC_SAVE_ID_COMPRESSED_DATA can be interleaved with
- * +ve chunks arbitrarily. But at the receiver end, the following condition
- * always holds true until the end of BODY PHASE:
- * num(PFN entries +ve chunks) >= num(pages received in compressed form)
- *
- * TAIL PHASE
- * ----------
- *
- * Content differs for PV and HVM guests.
- *
- * HVM TAIL:
- *
- * "Magic" pages:
- * uint64_t : I/O req PFN
- * uint64_t : Buffered I/O req PFN
- * uint64_t : Store PFN
- * Xen HVM Context:
- * uint32_t : Length of context in bytes
- * bytes : Context data
- * Qemu context:
- * char[21] : Signature:
- * "QemuDeviceModelRecord" : Read Qemu save data until EOF
- * "DeviceModelRecord0002" : uint32_t length field followed by that many
- * bytes of Qemu save data
- * "RemusDeviceModelState" : Currently the same as "DeviceModelRecord0002".
- *
- * PV TAIL:
- *
- * Unmapped PFN list : list of all the PFNs that were not in map at the close
- * unsigned int : Number of unmapped pages
- * unsigned long[] : PFNs of unmapped pages
- *
- * VCPU context data : A series of VCPU records, one per present VCPU
- * Maximum and present map supplied in XC_SAVE_ID_VCPUINFO
- * bytes: : VCPU context structure. Size is determined by size
- * provided in extended-info header
- * bytes[128] : Extended VCPU context (present IFF "extv" block
- * present in extended-info header)
- *
- * Shared Info Page : 4096 bytes of shared info page
- */
-
-#define XC_SAVE_ID_ENABLE_VERIFY_MODE -1 /* Switch to validation phase. */
-#define XC_SAVE_ID_VCPU_INFO -2 /* Additional VCPU info */
-#define XC_SAVE_ID_HVM_IDENT_PT -3 /* (HVM-only) */
-#define XC_SAVE_ID_HVM_VM86_TSS -4 /* (HVM-only) */
-#define XC_SAVE_ID_TMEM -5
-#define XC_SAVE_ID_TMEM_EXTRA -6
-#define XC_SAVE_ID_TSC_INFO -7
-#define XC_SAVE_ID_HVM_CONSOLE_PFN -8 /* (HVM-only) */
-#define XC_SAVE_ID_LAST_CHECKPOINT -9 /* Commit to restoring after completion of current iteration. */
-#define XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION -10
-#define XC_SAVE_ID_HVM_VIRIDIAN -11
-#define XC_SAVE_ID_COMPRESSED_DATA -12 /* Marker to indicate arrival of compressed data */
-#define XC_SAVE_ID_ENABLE_COMPRESSION -13 /* Marker to enable compression logic at receiver side */
-#define XC_SAVE_ID_HVM_GENERATION_ID_ADDR -14
-/* Markers for the pfn's hosting these mem event rings */
-#define XC_SAVE_ID_HVM_PAGING_RING_PFN -15
-#define XC_SAVE_ID_HVM_ACCESS_RING_PFN -16
-#define XC_SAVE_ID_HVM_SHARING_RING_PFN -17
-#define XC_SAVE_ID_TOOLSTACK -18 /* Optional toolstack specific info */
-/* These are a pair; it is an error for one to exist without the other */
-#define XC_SAVE_ID_HVM_IOREQ_SERVER_PFN -19
-#define XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES -20
-
-/*
** We process save/restore/migrate in batches of pages; the below
** determines how many pages we (at maximum) deal with in each batch.
*/
@@ -272,11 +29,6 @@
/* When pinning page tables at the end of restore, we also use batching. */
#define MAX_PIN_BATCH 1024
-/* Maximum #VCPUs currently supported for save/restore. */
-#define XC_SR_MAX_VCPUS 4096
-#define vcpumap_sz(max_id) (((max_id)/64+1)*sizeof(uint64_t))
-
-
/*
** Determine various platform information required for save/restore, in
** particular:
@@ -311,7 +63,8 @@ static inline int get_platform_info(xc_interface *xch, uint32_t dom,
if (xc_version(xch, XENVER_capabilities, &xen_caps) != 0)
return 0;
- *max_mfn = xc_maximum_ram_page(xch);
+ if (xc_maximum_ram_page(xch, max_mfn))
+ return 0;
*hvirt_start = xen_params.virt_start;
diff --git a/tools/libxc/xtl_core.c b/tools/libxc/xtl_core.c
index 326b97e..c4724a0 100644
--- a/tools/libxc/xtl_core.c
+++ b/tools/libxc/xtl_core.c
@@ -17,8 +17,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xentoollog.h"
@@ -66,13 +65,14 @@ void xtl_log(struct xentoollog_logger *logger,
void xtl_progress(struct xentoollog_logger *logger,
const char *context, const char *doing_what,
unsigned long done, unsigned long total) {
- int percent;
+ int percent = 0;
if (!logger->progress) return;
- percent = (total < LONG_MAX/100)
- ? (done * 100) / total
- : done / ((total + 99) / 100);
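+ /* Avoid a division by zero when no total has been reported. */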
+ if ( total )
+ percent = (total < LONG_MAX/100)
+ ? (done * 100) / total
+ : done / ((total + 99) / 100);
logger->progress(logger, context, doing_what, percent, done, total);
}
diff --git a/tools/libxc/xtl_logger_stdio.c b/tools/libxc/xtl_logger_stdio.c
index d8646e0..0cd9206 100644
--- a/tools/libxc/xtl_logger_stdio.c
+++ b/tools/libxc/xtl_logger_stdio.c
@@ -17,8 +17,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "xentoollog.h"
@@ -61,10 +60,13 @@ static void stdiostream_vmessage(xentoollog_logger *logger_in,
struct tm lt_buf;
time_t now = time(0);
struct tm *lt= localtime_r(&now, <_buf);
- fprintf(lg->f, "%04d-%02d-%02d %02d:%02d:%02d %s ",
- lt->tm_year+1900, lt->tm_mon+1, lt->tm_mday,
- lt->tm_hour, lt->tm_min, lt->tm_sec,
- tzname[!!lt->tm_isdst]);
+ if (lt != NULL)
+ fprintf(lg->f, "%04d-%02d-%02d %02d:%02d:%02d %s ",
+ lt->tm_year+1900, lt->tm_mon+1, lt->tm_mday,
+ lt->tm_hour, lt->tm_min, lt->tm_sec,
+ tzname[!!lt->tm_isdst]);
+ else
+ fprintf(lg->f, "[localtime_r failed: %d] ", errno);
}
if (lg->flags & XTL_STDIOSTREAM_SHOW_PID)
fprintf(lg->f, "[%lu] ", (unsigned long)getpid());
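
localtime_r() may fail and return NULL (e.g. for a wildly out-of-range time_t), which the old code would have dereferenced. A standalone version of the new defensive pattern:

    #include <errno.h>
    #include <stdio.h>
    #include <time.h>

    static void print_timestamp(FILE *f)
    {
        struct tm lt_buf;
        time_t now = time(NULL);
        struct tm *lt = localtime_r(&now, &lt_buf);

        if (lt != NULL)
            fprintf(f, "%04d-%02d-%02d %02d:%02d:%02d ",
                    lt->tm_year + 1900, lt->tm_mon + 1, lt->tm_mday,
                    lt->tm_hour, lt->tm_min, lt->tm_sec);
        else
            fprintf(f, "[localtime_r failed: %d] ", errno);  /* keep logging */
    }
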
diff --git a/tools/libxl/CODING_STYLE b/tools/libxl/CODING_STYLE
index f5b5890..a65efb3 100644
--- a/tools/libxl/CODING_STYLE
+++ b/tools/libxl/CODING_STYLE
@@ -62,6 +62,7 @@ whenever they are applicable. For example:
libxl__ctx_[un]lock CTX_LOCK, CTX_UNLOCK
gc=...; ao=...; EGC_GC, AO_GC, STATE_AO_GC
explicit gc creation GC_INIT, GC_FREE
+ memset(..,0,sizeof..) FILLZERO
ERROR HANDLING
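
The FILLZERO entry added to the table above presumably abbreviates the usual zero-the-whole-object idiom; a plausible definition and use (the actual libxl macro may differ in detail):

    #include <string.h>

    #define FILLZERO(object) (memset(&(object), 0, sizeof(object)))

    struct example { int a; char b[16]; };

    static void demo(void)
    {
        struct example e;
        FILLZERO(e);   /* same as memset(&e, 0, sizeof(e)), without the
                        * risk of sizeof'ing the wrong thing */
    }
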
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index 6a8575b..c5ecec1 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -5,10 +5,10 @@
XEN_ROOT = $(CURDIR)/../..
include $(XEN_ROOT)/tools/Rules.mk
-MAJOR = 4.5
+MAJOR = 4.6
MINOR = 0
-XLUMAJOR = 4.3
+XLUMAJOR = 4.6
XLUMINOR = 0
CFLAGS += -Werror -Wno-format-zero-length -Wmissing-declarations \
@@ -56,10 +56,16 @@ else
LIBXL_OBJS-y += libxl_nonetbuffer.o
endif
+ifeq ($(CONFIG_X86),y)
+LIBXL_OBJS-y += libxl_convert_callout.o
+else
+LIBXL_OBJS-y += libxl_no_convert_callout.o
+endif
+
LIBXL_OBJS-y += libxl_remus_device.o libxl_remus_disk_drbd.o
LIBXL_OBJS-$(CONFIG_X86) += libxl_cpuid.o libxl_x86.o libxl_psr.o
-LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o
+LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o libxl_libfdt_compat.o
ifeq ($(CONFIG_NetBSD),y)
LIBXL_OBJS-y += libxl_netbsd.o
@@ -93,13 +99,18 @@ LIBXL_LIBS += -lyajl
LIBXL_OBJS = flexarray.o libxl.o libxl_create.o libxl_dm.o libxl_pci.o \
libxl_dom.o libxl_exec.o libxl_xshelp.o libxl_device.o \
libxl_internal.o libxl_utils.o libxl_uuid.o \
- libxl_json.o libxl_aoutils.o libxl_numa.o \
+ libxl_json.o libxl_aoutils.o libxl_numa.o libxl_vnuma.o \
+ libxl_stream_read.o libxl_stream_write.o \
libxl_save_callout.o _libxl_save_msgs_callout.o \
- libxl_qmp.o libxl_event.o libxl_fork.o $(LIBXL_OBJS-y)
+ libxl_qmp.o libxl_event.o libxl_fork.o \
+ libxl_dom_suspend.o $(LIBXL_OBJS-y)
LIBXL_OBJS += libxl_genid.o
LIBXL_OBJS += _libxl_types.o libxl_flask.o _libxl_types_internal.o
LIBXL_TESTS += timedereg
+LIBXL_TESTS_PROGS = $(LIBXL_TESTS) fdderegrace
+LIBXL_TESTS_INSIDE = $(LIBXL_TESTS) fdevent
+
# Each entry FOO in LIBXL_TESTS has two main .c files:
# libxl_test_FOO.c "inside libxl" code to support the test case
# test_FOO.c "outside libxl" code to exercise the test case
@@ -109,11 +120,17 @@ LIBXL_TESTS += timedereg
# "outside libxl" file is compiled exactly like a piece of application
# code. They must share information via explicit libxl entrypoints.
# Unlike proper parts of libxl, it is permissible for libxl_test_FOO.c
-# to use private global variables for its state.
+# to use private global variables for its state. Note that all the
+# "inside" parts are compiled into a single test library, so their
+# symbol names must be unique.
+#
+# To run these tests, either use LD_PRELOAD to get libxenlight_test.so
+# loaded, or rename it to libxenlight.so so it is the target of the
+# appropriate symlinks.
-LIBXL_TEST_OBJS += $(foreach t, $(LIBXL_TESTS),libxl_test_$t.o)
-TEST_PROG_OBJS += $(foreach t, $(LIBXL_TESTS),test_$t.o) test_common.o
-TEST_PROGS += $(foreach t, $(LIBXL_TESTS),test_$t)
+LIBXL_TEST_OBJS += $(foreach t, $(LIBXL_TESTS_INSIDE),libxl_test_$t.o)
+TEST_PROG_OBJS += $(foreach t, $(LIBXL_TESTS_PROGS),test_$t.o) test_common.o
+TEST_PROGS += $(foreach t, $(LIBXL_TESTS_PROGS),test_$t)
$(LIBXL_OBJS) $(LIBXL_TEST_OBJS): CFLAGS += $(CFLAGS_LIBXL) -include $(XEN_ROOT)/tools/config.h
@@ -143,13 +160,15 @@ $(XEN_INIT_DOM0_OBJS): CFLAGS += $(CFLAGS_libxenstore)
SAVE_HELPER_OBJS = libxl_save_helper.o _libxl_save_msgs_helper.o
$(SAVE_HELPER_OBJS): CFLAGS += $(CFLAGS_libxenctrl)
+PKG_CONFIG = xenlight.pc xlutil.pc
+
testidl.o: CFLAGS += $(CFLAGS_libxenctrl) $(CFLAGS_libxenlight)
testidl.c: libxl_types.idl gentest.py libxl.h $(AUTOINCS)
$(PYTHON) gentest.py libxl_types.idl testidl.c.new
mv testidl.c.new testidl.c
.PHONY: all
-all: $(CLIENTS) $(TEST_PROGS) \
+all: $(CLIENTS) $(TEST_PROGS) $(PKG_CONFIG) \
libxenlight.so libxenlight.a libxlutil.so libxlutil.a \
$(AUTOSRCS) $(AUTOINCS)
@@ -248,34 +267,44 @@ libxl-save-helper: $(SAVE_HELPER_OBJS) libxenlight.so
testidl: testidl.o libxlutil.so libxenlight.so
$(CC) $(LDFLAGS) -o $@ testidl.o libxlutil.so $(LDLIBS_libxenlight) $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
+$(PKG_CONFIG): % : %.in Makefile
+ @sed -e 's/@@version@@/$(MAJOR).$(MINOR)/g' < $< > $@.new
+ @mv -f $@.new $@
+
.PHONY: install
install: all
- $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
- $(INSTALL_DIR) $(DESTDIR)$(LIBDIR)
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(sbindir)
+ $(INSTALL_DIR) $(DESTDIR)$(libdir)
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)
$(INSTALL_DIR) $(DESTDIR)$(BASH_COMPLETION_DIR)
$(INSTALL_DIR) $(DESTDIR)$(LIBEXEC_BIN)
- $(INSTALL_PROG) xl $(DESTDIR)$(SBINDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(SHAREDIR)/pkgconfig
+ $(INSTALL_PROG) xl $(DESTDIR)$(sbindir)
$(INSTALL_PROG) xen-init-dom0 $(DESTDIR)$(LIBEXEC_BIN)
$(INSTALL_PROG) libxl-save-helper $(DESTDIR)$(LIBEXEC_BIN)
- $(INSTALL_SHLIB) libxenlight.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)
- $(SYMLINK_SHLIB) libxenlight.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxenlight.so.$(MAJOR)
- $(SYMLINK_SHLIB) libxenlight.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxenlight.so
- $(INSTALL_DATA) libxenlight.a $(DESTDIR)$(LIBDIR)
- $(INSTALL_SHLIB) libxlutil.so.$(XLUMAJOR).$(XLUMINOR) $(DESTDIR)$(LIBDIR)
- $(SYMLINK_SHLIB) libxlutil.so.$(XLUMAJOR).$(XLUMINOR) $(DESTDIR)$(LIBDIR)/libxlutil.so.$(XLUMAJOR)
- $(SYMLINK_SHLIB) libxlutil.so.$(XLUMAJOR) $(DESTDIR)$(LIBDIR)/libxlutil.so
- $(INSTALL_DATA) libxlutil.a $(DESTDIR)$(LIBDIR)
- $(INSTALL_DATA) libxl.h libxl_event.h libxl_json.h _libxl_types.h _libxl_types_json.h _libxl_list.h libxl_utils.h libxl_uuid.h libxlutil.h $(DESTDIR)$(INCLUDEDIR)
+ $(INSTALL_SHLIB) libxenlight.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)
+ $(SYMLINK_SHLIB) libxenlight.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)/libxenlight.so.$(MAJOR)
+ $(SYMLINK_SHLIB) libxenlight.so.$(MAJOR) $(DESTDIR)$(libdir)/libxenlight.so
+ $(INSTALL_DATA) libxenlight.a $(DESTDIR)$(libdir)
+ $(INSTALL_SHLIB) libxlutil.so.$(XLUMAJOR).$(XLUMINOR) $(DESTDIR)$(libdir)
+ $(SYMLINK_SHLIB) libxlutil.so.$(XLUMAJOR).$(XLUMINOR) $(DESTDIR)$(libdir)/libxlutil.so.$(XLUMAJOR)
+ $(SYMLINK_SHLIB) libxlutil.so.$(XLUMAJOR) $(DESTDIR)$(libdir)/libxlutil.so
+ $(INSTALL_DATA) libxlutil.a $(DESTDIR)$(libdir)
+ $(INSTALL_DATA) libxl.h libxl_event.h libxl_json.h _libxl_types.h _libxl_types_json.h _libxl_list.h libxl_utils.h libxl_uuid.h libxlutil.h $(DESTDIR)$(includedir)
$(INSTALL_DATA) bash-completion $(DESTDIR)$(BASH_COMPLETION_DIR)/xl.sh
+ $(INSTALL_DATA) xenlight.pc $(DESTDIR)$(SHAREDIR)/pkgconfig/
+ $(INSTALL_DATA) xlutil.pc $(DESTDIR)$(SHAREDIR)/pkgconfig/
.PHONY: clean
clean:
$(RM) -f _*.h *.o *.so* *.a $(CLIENTS) $(DEPS)
$(RM) -f _*.c *.pyc _paths.*.tmp _*.api-for-check
$(RM) -f testidl.c.new testidl.c *.api-ok
+ $(RM) -f xenlight.pc
+ $(RM) -f xlutil.pc
distclean: clean
+ $(RM) -f xenlight.pc.in xlutil.pc.in
realclean: distclean
$(RM) -f $(AUTOSRCS) $(AUTOINCS)
diff --git a/tools/libxl/gentest.py b/tools/libxl/gentest.py
index 95323d1..989959f 100644
--- a/tools/libxl/gentest.py
+++ b/tools/libxl/gentest.py
@@ -30,9 +30,10 @@ def gen_rand_init(ty, v, indent = " ", parent = None):
elif isinstance(ty, idl.Array):
if parent is None:
raise Exception("Array type must have a parent")
- s += "%s = rand()%%8;\n" % (parent + ty.lenvar.name)
+ s += "%s = test_rand(8);\n" % (parent + ty.lenvar.name)
s += "%s = calloc(%s, sizeof(*%s));\n" % \
(v, parent + ty.lenvar.name, v)
+ s += "assert(%s);\n" % (v, )
s += "{\n"
s += " int i;\n"
s += " for (i=0; i<%s; i++)\n" % (parent + ty.lenvar.name)
@@ -63,13 +64,13 @@ def gen_rand_init(ty, v, indent = " ", parent = None):
elif ty.typename in ["libxl_uuid", "libxl_mac", "libxl_hwcap", "libxl_ms_vm_genid"]:
s += "rand_bytes((uint8_t *)%s, sizeof(*%s));\n" % (v,v)
elif ty.typename in ["libxl_domid", "libxl_devid"] or isinstance(ty, idl.Number):
- s += "%s = rand() %% (sizeof(%s)*8);\n" % \
+ s += "%s = test_rand(sizeof(%s) * 8);\n" % \
(ty.pass_arg(v, parent is None),
ty.pass_arg(v, parent is None))
elif ty.typename in ["bool"]:
- s += "%s = rand() %% 2;\n" % v
+ s += "%s = test_rand(2);\n" % v
elif ty.typename in ["libxl_defbool"]:
- s += "libxl_defbool_set(%s, !!rand() %% 1);\n" % v
+ s += "libxl_defbool_set(%s, test_rand(2));\n" % v
elif ty.typename in ["char *"]:
s += "%s = rand_str();\n" % v
elif ty.private:
@@ -98,16 +99,24 @@ if __name__ == '__main__':
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <assert.h>
#include "libxl.h"
#include "libxl_utils.h"
+static int test_rand(unsigned max)
+{
+ /* We are not using rand() for its cryptographic properties. */
+ return rand() % max;
+}
+
static char *rand_str(void)
{
- int i, sz = rand() % 32;
+ int i, sz = test_rand(32);
char *s = malloc(sz+1);
+ assert(s);
for (i=0; i<sz; i++)
- s[i] = 'a' + (rand() % 26);
+ s[i] = 'a' + test_rand(26);
s[i] = '\\0';
return s;
}
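
The test_rand() helper above centralises the rand() % max idiom, which is entirely adequate for generating fuzz data. As an aside, not part of the patch: if uniformity ever mattered, a rejection loop would remove the modulo bias (sketch, assuming 0 < max <= RAND_MAX):

    #include <stdlib.h>

    static int test_rand_unbiased(unsigned max)
    {
        /* Reject the first (RAND_MAX+1) % max values; the surviving
         * contiguous range has length k*max, so r % max is uniform. */
        unsigned reject = ((unsigned)RAND_MAX + 1u) % max;
        unsigned r;
        do
            r = (unsigned)rand();
        while (r < reject);
        return r % max;
    }
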
@@ -116,16 +125,17 @@ static void rand_bytes(uint8_t *p, size_t sz)
{
int i;
for (i=0; i<sz; i++)
- p[i] = rand() % 256;
+ p[i] = test_rand(256);
}
static void libxl_bitmap_rand_init(libxl_bitmap *bitmap)
{
int i;
- bitmap->size = rand() % 16;
+ bitmap->size = test_rand(16);
bitmap->map = calloc(bitmap->size, sizeof(*bitmap->map));
+ assert(bitmap->map);
libxl_for_each_bit(i, *bitmap) {
- if (rand() % 2)
+ if (test_rand(2))
libxl_bitmap_set(bitmap, i);
else
libxl_bitmap_reset(bitmap, i);
@@ -134,12 +144,13 @@ static void libxl_bitmap_rand_init(libxl_bitmap *bitmap)
static void libxl_key_value_list_rand_init(libxl_key_value_list *pkvl)
{
- int i, nr_kvp = rand() % 16;
+ int i, nr_kvp = test_rand(16);
libxl_key_value_list kvl = calloc(nr_kvp+1, 2*sizeof(char *));
+ assert(kvl);
for (i = 0; i<2*nr_kvp; i += 2) {
kvl[i] = rand_str();
- if (rand() % 8)
+ if (test_rand(8))
kvl[i+1] = rand_str();
else
kvl[i+1] = NULL;
@@ -151,7 +162,7 @@ static void libxl_key_value_list_rand_init(libxl_key_value_list *pkvl)
static void libxl_cpuid_policy_list_rand_init(libxl_cpuid_policy_list *pp)
{
- int i, nr_policies = rand() % 16;
+ int i, nr_policies = test_rand(16);
struct {
const char *n;
int w;
@@ -184,8 +195,8 @@ static void libxl_cpuid_policy_list_rand_init(libxl_cpuid_policy_list *pp)
libxl_cpuid_policy_list p = NULL;
for (i = 0; i < nr_policies; i++) {
- int opt = rand() % nr_options;
- int val = rand() % (1<<options[opt].w);
+ int opt = test_rand(nr_options);
+ int val = test_rand(1<<options[opt].w);
snprintf(buf, 64, \"%s=%#x\", options[opt].n, val);
libxl_cpuid_parse_config(&p, buf);
}
@@ -194,8 +205,9 @@ static void libxl_cpuid_policy_list_rand_init(libxl_cpuid_policy_list *pp)
static void libxl_string_list_rand_init(libxl_string_list *p)
{
- int i, nr = rand() % 16;
+ int i, nr = test_rand(16);
libxl_string_list l = calloc(nr+1, sizeof(char *));
+ assert(l);
for (i = 0; i<nr; i++) {
l[i] = rand_str();
@@ -229,7 +241,7 @@ int main(int argc, char **argv)
(ty.typename, ty.typename, ty.typename))
f.write("""
int rc;
- char *s, *new_s;
+ char *s, *new_s, *json_string;
xentoollog_logger_stdiostream *logger;
libxl_ctx *ctx;
@@ -249,8 +261,11 @@ int main(int argc, char **argv)
f.write(" %s_rand_init(%s);\n" % (ty.typename, \
ty.pass_arg(arg, isref=False, passby=idl.PASS_BY_REFERENCE)))
if not isinstance(ty, idl.Enumeration):
- f.write(" %s_init(%s_new);\n" % (ty.typename, \
- ty.pass_arg(arg, isref=False, passby=idl.PASS_BY_REFERENCE)))
+ iters = random.randrange(1,10)
+ while iters > 0:
+ f.write(" %s_init(%s_new);\n" % (ty.typename, \
+ ty.pass_arg(arg, isref=False, passby=idl.PASS_BY_REFERENCE)))
+ iters -= 1
f.write(" s = %s_to_json(ctx, %s);\n" % \
(ty.typename, ty.pass_arg(arg, isref=False)))
f.write(" printf(\"%%s: %%s\\n\", \"%s\", s);\n" % ty.typename)
@@ -269,8 +284,11 @@ int main(int argc, char **argv)
f.write(" free(s);\n")
f.write(" free(new_s);\n")
if ty.dispose_fn is not None:
+ iters = random.randrange(1,10)
f.write(" %s(&%s_val);\n" % (ty.dispose_fn, ty.typename))
- f.write(" %s(&%s_val_new);\n" % (ty.dispose_fn, ty.typename))
+ while iters > 0:
+ f.write(" %s(&%s_val_new);\n" % (ty.dispose_fn, ty.typename))
+ iters -= 1
f.write("\n")
f.write(" printf(\"Testing TYPE_copy()\\n\");\n")
@@ -323,9 +341,13 @@ int main(int argc, char **argv)
f.write(" printf(\"%s -- to JSON:\\n\");\n" % (ty.typename))
for v in ty.values:
+ f.write(" json_string = %s_to_json(ctx, %s);\n" % \
+ (ty.typename, v.name))
f.write(" printf(\"\\t%s = %%d = %%s\", " \
- "%s, %s_to_json(ctx, %s));\n" %\
- (v.valuename, v.name, ty.typename, v.name))
+ "%s, json_string);\n" %\
+ (v.valuename, v.name))
+ f.write(" free(json_string);\n");
+ f.write(" json_string = NULL;\n");
f.write("\n")
f.write(" printf(\"%s -- from string:\\n\");\n" % (ty.typename))
diff --git a/tools/libxl/gentypes.py b/tools/libxl/gentypes.py
index 3e73821..00816c0 100644
--- a/tools/libxl/gentypes.py
+++ b/tools/libxl/gentypes.py
@@ -432,8 +432,7 @@ def libxl_C_type_parse_json(ty, w, v, indent = " ", parent = None, discrimina
for f in [f for f in ty.fields if not f.const and not f.type.private]:
saved_var_name = "saved_%s" % f.name
s += "{\n"
- s += " const libxl__json_object *%s = NULL;\n" % saved_var_name
- s += " %s = x;\n" % saved_var_name
+ s += " const libxl__json_object *%s = x;\n" % saved_var_name
if isinstance(f.type, idl.KeyedUnion):
for x in f.type.fields:
s += " x = libxl__json_map_get(\"%s\", %s, JSON_MAP);\n" % \
@@ -637,15 +636,15 @@ if __name__ == '__main__':
#include "libxl_internal.h"
-#define LIBXL_DTOR_POISON 0xa5
""" % " ".join(sys.argv))
for ty in [t for t in types if t.dispose_fn is not None and t.autogenerate_dispose_fn]:
f.write("void %s(%s)\n" % (ty.dispose_fn, ty.make_arg("p")))
f.write("{\n")
+ f.write(" if (!p) return;\n")
f.write(libxl_C_type_dispose(ty, "p"))
- f.write(" memset(p, LIBXL_DTOR_POISON, sizeof(*p));\n")
+ f.write(" memset(p, 0, sizeof(*p));\n")
f.write("}\n")
f.write("\n")
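
With this change the generated dispose functions become NULL-tolerant and idempotent: they free owned members and then zero the struct (previously poisoned with 0xa5), so disposing twice, or disposing a freshly-init'd value repeatedly as the updated gentest.py now does, is safe. Illustrative shape of one generated function:

    #include <stdlib.h>
    #include <string.h>

    typedef struct { char *name; } example_type;   /* stand-in IDL type */

    void example_type_dispose(example_type *p)
    {
        if (!p) return;            /* new NULL guard */
        free(p->name);
        memset(p, 0, sizeof(*p));  /* leaves *p valid for re-dispose */
    }
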
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 372dd3b..d38d0c7 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -59,8 +59,10 @@ int libxl_ctx_alloc(libxl_ctx **pctx, int version,
ctx->osevent_hooks = 0;
+ ctx->poller_app = 0;
LIBXL_LIST_INIT(&ctx->pollers_event);
LIBXL_LIST_INIT(&ctx->pollers_idle);
+ LIBXL_LIST_INIT(&ctx->pollers_fds_changed);
LIBXL_LIST_INIT(&ctx->efds);
LIBXL_TAILQ_INIT(&ctx->etimes);
@@ -73,6 +75,8 @@ int libxl_ctx_alloc(libxl_ctx **pctx, int version,
LIBXL_LIST_INIT(&ctx->evtchns_waiting);
libxl__ev_fd_init(&ctx->evtchn_efd);
+ LIBXL_LIST_INIT(&ctx->aos_inprogress);
+
LIBXL_TAILQ_INIT(&ctx->death_list);
libxl__ev_xswatch_init(&ctx->death_watch);
@@ -101,8 +105,11 @@ int libxl_ctx_alloc(libxl_ctx **pctx, int version,
rc = libxl__atfork_init(ctx);
if (rc) goto out;
- rc = libxl__poller_init(gc, &ctx->poller_app);
- if (rc) goto out;
+ ctx->poller_app = libxl__poller_get(gc);
+ if (!ctx->poller_app) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
ctx->xch = xc_interface_open(lg,lg,0);
if (!ctx->xch) {
@@ -146,11 +153,13 @@ int libxl_ctx_free(libxl_ctx *ctx)
{
if (!ctx) return 0;
- assert(!ctx->osevent_in_hook);
-
int i;
GC_INIT(ctx);
+ CTX_LOCK;
+ assert(!ctx->osevent_in_hook);
+ CTX->osevent_in_hook += 1000; /* make violations easier to debug */
+
/* Deregister all libxl__ev_KINDs: */
free_disable_deaths(gc, &CTX->death_list);
@@ -172,14 +181,17 @@ int libxl_ctx_free(libxl_ctx *ctx)
assert(LIBXL_LIST_EMPTY(&ctx->efds));
assert(LIBXL_TAILQ_EMPTY(&ctx->etimes));
assert(LIBXL_LIST_EMPTY(&ctx->evtchns_waiting));
+ assert(LIBXL_LIST_EMPTY(&ctx->aos_inprogress));
if (ctx->xch) xc_interface_close(ctx->xch);
libxl_version_info_dispose(&ctx->version_info);
if (ctx->xsh) xs_daemon_close(ctx->xsh);
if (ctx->xce) xc_evtchn_close(ctx->xce);
- libxl__poller_dispose(&ctx->poller_app);
+ libxl__poller_put(ctx, ctx->poller_app);
+ ctx->poller_app = NULL;
assert(LIBXL_LIST_EMPTY(&ctx->pollers_event));
+ assert(LIBXL_LIST_EMPTY(&ctx->pollers_fds_changed));
libxl__poller *poller, *poller_tmp;
LIBXL_LIST_FOREACH_SAFE(poller, &ctx->pollers_idle, entry, poller_tmp) {
libxl__poller_dispose(poller);
@@ -196,6 +208,7 @@ int libxl_ctx_free(libxl_ctx *ctx)
libxl__sigchld_notneeded(gc);
libxl__pipe_close(ctx->sigchld_selfpipe);
+ CTX_UNLOCK;
pthread_mutex_destroy(&ctx->lock);
GC_FREE;
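
ctx->poller_app is now a pointer obtained from libxl__poller_get() and handed back via libxl__poller_put(), i.e. pollers are recycled through an idle list rather than embedded in the ctx. The get/put pattern reduced to its essentials (names and fields are illustrative; real libxl pollers also carry fds and rely on the ctx lock):

    #include <stdlib.h>

    struct poller { struct poller *next; /* ... */ };
    static struct poller *idle_list;

    static struct poller *poller_get(void)
    {
        struct poller *p = idle_list;
        if (p) { idle_list = p->next; return p; }
        return calloc(1, sizeof(*p));   /* none idle: allocate (may be NULL) */
    }

    static void poller_put(struct poller *p)
    {
        if (!p) return;
        p->next = idle_list;            /* park on the idle list for reuse */
        idle_list = p;
    }
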
@@ -211,9 +224,12 @@ void libxl_string_list_dispose(libxl_string_list *psl)
if (!sl)
return;
- for (i = 0; sl[i] != NULL; i++)
+ for (i = 0; sl[i] != NULL; i++) {
free(sl[i]);
+ sl[i] = NULL;
+ }
free(sl);
+ *psl = NULL;
}
void libxl_string_list_copy(libxl_ctx *ctx,
@@ -273,10 +289,14 @@ void libxl_key_value_list_dispose(libxl_key_value_list *pkvl)
for (i = 0; kvl[i] != NULL; i += 2) {
free(kvl[i]);
- if (kvl[i + 1])
+ kvl[i] = NULL;
+ if (kvl[i + 1]) {
free(kvl[i + 1]);
+ kvl[i+1] = NULL;
+ }
}
free(kvl);
+ *pkvl = NULL;
}
void libxl_key_value_list_copy(libxl_ctx *ctx,
@@ -445,7 +465,7 @@ int libxl__domain_rename(libxl__gc *gc, uint32_t domid,
/* update /vm/<uuid>/name */
rc = libxl_domain_info(ctx, &info, domid);
if (rc)
- goto x_fail;
+ goto x_rc;
uuid = GCSPRINTF(LIBXL_UUID_FMT, LIBXL_UUID_BYTES(info.uuid));
vm_name_path = GCSPRINTF("/vm/%s/name", uuid);
@@ -500,39 +520,6 @@ int libxl_domain_rename(libxl_ctx *ctx, uint32_t domid,
return rc;
}
-int libxl__domain_resume(libxl__gc *gc, uint32_t domid, int suspend_cancel)
-{
- int rc = 0;
-
- if (xc_domain_resume(CTX->xch, domid, suspend_cancel)) {
- LOGE(ERROR, "xc_domain_resume failed for domain %u", domid);
- rc = ERROR_FAIL;
- goto out;
- }
-
- libxl_domain_type type = libxl__domain_type(gc, domid);
- if (type == LIBXL_DOMAIN_TYPE_INVALID) {
- rc = ERROR_FAIL;
- goto out;
- }
-
- if (type == LIBXL_DOMAIN_TYPE_HVM) {
- rc = libxl__domain_resume_device_model(gc, domid);
- if (rc) {
- LOG(ERROR, "failed to resume device model for domain %u:%d",
- domid, rc);
- goto out;
- }
- }
-
- if (!xs_resume_domain(CTX->xsh, domid)) {
- LOGE(ERROR, "xs_resume_domain failed for domain %u", domid);
- rc = ERROR_FAIL;
- }
-out:
- return rc;
-}
-
int libxl_domain_resume(libxl_ctx *ctx, uint32_t domid, int suspend_cancel,
const libxl_asyncop_how *ao_how)
{
@@ -688,13 +675,18 @@ int libxl_domain_info(libxl_ctx *ctx, libxl_dominfo *info_r,
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "getting domain info list");
return ERROR_FAIL;
}
- if (ret==0 || xcinfo.domain != domid) return ERROR_INVAL;
+ if (ret==0 || xcinfo.domain != domid) return ERROR_DOMAIN_NOTFOUND;
if (info_r)
xcinfo2xlinfo(ctx, &xcinfo, info_r);
return 0;
}
+/* Returns:
+ * 0 - success
+ * ERROR_FAIL + errno == ENOENT - no entry found
+ * ERROR_$FOO + errno != ENOENT - other failure
+ */
static int cpupool_info(libxl__gc *gc,
libxl_cpupoolinfo *info,
uint32_t poolid,
@@ -707,7 +699,7 @@ static int cpupool_info(libxl__gc *gc,
if (xcinfo == NULL)
{
if (exact || errno != ENOENT)
- LOGE(ERROR, "failed to get info for cpupool%d\n", poolid);
+ LOGE(ERROR, "failed to get info for cpupool%d", poolid);
return ERROR_FAIL;
}
@@ -750,7 +742,8 @@ int libxl_cpupool_info(libxl_ctx *ctx,
libxl_cpupoolinfo * libxl_list_cpupool(libxl_ctx *ctx, int *nb_pool_out)
{
GC_INIT(ctx);
- libxl_cpupoolinfo info, *ptr, *tmp;
+ libxl_cpupoolinfo info, *ptr;
+
int i;
uint32_t poolid;
@@ -758,24 +751,29 @@ libxl_cpupoolinfo * libxl_list_cpupool(libxl_ctx *ctx, int *nb_pool_out)
poolid = 0;
for (i = 0;; i++) {
- if (cpupool_info(gc, &info, poolid, false))
+ libxl_cpupoolinfo_init(&info);
+ if (cpupool_info(gc, &info, poolid, false)) {
+ libxl_cpupoolinfo_dispose(&info);
+ if (errno != ENOENT) goto out;
break;
- tmp = realloc(ptr, (i + 1) * sizeof(libxl_cpupoolinfo));
- if (!tmp) {
- LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "allocating cpupool info");
- libxl_cpupoolinfo_list_free(ptr, i);
- ptr = NULL;
- goto out;
}
- ptr = tmp;
+
+ ptr = libxl__realloc(NOGC, ptr, (i+1) * sizeof(libxl_cpupoolinfo));
ptr[i] = info;
poolid = info.poolid + 1;
+ /* Don't dispose of info because it will be returned to caller */
}
*nb_pool_out = i;
-out:
+
GC_FREE;
return ptr;
+
+out:
+ libxl_cpupoolinfo_list_free(ptr, i);
+ *nb_pool_out = 0;
+ GC_FREE;
+ return NULL;
}
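
The rewritten loop enumerates pools until a lookup fails, treating ENOENT as the clean end of the list and anything else as an error; the result array grows one slot per entry and ownership of each returned info moves to the caller. The skeleton, with fetch_info() and info_t as hypothetical stand-ins:

    #include <errno.h>
    #include <stdlib.h>

    typedef struct { unsigned id; } info_t;
    extern int fetch_info(unsigned idx, info_t *out);  /* hypothetical */

    static info_t *list_all(int *n_out)
    {
        info_t *ptr = NULL, *tmp, info;
        int i;

        for (i = 0;; i++) {
            if (fetch_info(i, &info)) {
                if (errno != ENOENT) { free(ptr); *n_out = 0; return NULL; }
                break;                      /* no more entries */
            }
            tmp = realloc(ptr, (i + 1) * sizeof(info_t));
            if (!tmp) { free(ptr); *n_out = 0; return NULL; }
            ptr = tmp;
            ptr[i] = info;                  /* ownership moves to the array */
        }
        *n_out = i;
        return ptr;
    }
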
/* this API call only list VM running on this host. A VM can
@@ -815,10 +813,12 @@ out:
return ptr;
}
-static void libxl__remus_setup_done(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc);
-static void libxl__remus_setup_failed(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc);
+static void libxl__remus_setup(libxl__egc *egc,
+ libxl__domain_suspend_state *dss);
+static void remus_setup_done(libxl__egc *egc,
+ libxl__remus_devices_state *rds, int rc);
+static void remus_setup_failed(libxl__egc *egc,
+ libxl__remus_devices_state *rds, int rc);
static void remus_failover_cb(libxl__egc *egc,
libxl__domain_suspend_state *dss, int rc);
@@ -867,13 +867,26 @@ int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
assert(info);
+ /* Point of no return */
+ libxl__remus_setup(egc, dss);
+ return AO_INPROGRESS;
+
+ out:
+ return AO_CREATE_FAIL(rc);
+}
+
+static void libxl__remus_setup(libxl__egc *egc,
+ libxl__domain_suspend_state *dss)
+{
/* Convenience aliases */
libxl__remus_devices_state *const rds = &dss->rds;
+ const libxl_domain_remus_info *const info = dss->remus;
+
+ STATE_AO_GC(dss->ao);
if (libxl_defbool_val(info->netbuf)) {
if (!libxl__netbuffer_enabled(gc)) {
LOG(ERROR, "Remus: No support for network buffering");
- rc = ERROR_FAIL;
goto out;
}
rds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VIF);
@@ -883,36 +896,35 @@ int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
rds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VBD);
rds->ao = ao;
- rds->domid = domid;
- rds->callback = libxl__remus_setup_done;
+ rds->domid = dss->domid;
+ rds->callback = remus_setup_done;
- /* Point of no return */
libxl__remus_devices_setup(egc, rds);
- return AO_INPROGRESS;
+ return;
- out:
- return AO_ABORT(rc);
+out:
+ dss->callback(egc, dss, ERROR_FAIL);
}
-static void libxl__remus_setup_done(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc)
+static void remus_setup_done(libxl__egc *egc,
+ libxl__remus_devices_state *rds, int rc)
{
libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
STATE_AO_GC(dss->ao);
if (!rc) {
- libxl__domain_suspend(egc, dss);
+ libxl__domain_save(egc, dss);
return;
}
LOG(ERROR, "Remus: failed to setup device for guest with domid %u, rc %d",
dss->domid, rc);
- rds->callback = libxl__remus_setup_failed;
+ rds->callback = remus_setup_failed;
libxl__remus_devices_teardown(egc, rds);
}
-static void libxl__remus_setup_failed(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc)
+static void remus_setup_failed(libxl__egc *egc,
+ libxl__remus_devices_state *rds, int rc)
{
libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
STATE_AO_GC(dss->ao);
@@ -940,6 +952,12 @@ static void domain_suspend_cb(libxl__egc *egc,
libxl__domain_suspend_state *dss, int rc)
{
STATE_AO_GC(dss->ao);
+ int flrc;
+
+ flrc = libxl__fd_flags_restore(gc, dss->fd, dss->fdfl);
+ /* If suspend has failed already then report that error not this one. */
+ if (flrc && !rc) rc = flrc;
+
libxl__ao_complete(egc,ao,rc);
}
@@ -968,11 +986,16 @@ int libxl_domain_suspend(libxl_ctx *ctx, uint32_t domid, int fd, int flags,
dss->live = flags & LIBXL_SUSPEND_LIVE;
dss->debug = flags & LIBXL_SUSPEND_DEBUG;
- libxl__domain_suspend(egc, dss);
+ rc = libxl__fd_flags_modify_save(gc, dss->fd,
+ ~(O_NONBLOCK|O_NDELAY), 0,
+ &dss->fdfl);
+ if (rc < 0) goto out_err;
+
+ libxl__domain_save(egc, dss);
return AO_INPROGRESS;
out_err:
- return AO_ABORT(rc);
+ return AO_CREATE_FAIL(rc);
}
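
libxl_domain_suspend() now records the migration fd's flags, forces it into blocking mode for the duration of the save, and restores the original flags in domain_suspend_cb() above (preferring the suspend error over the restore error). Assuming the helpers wrap the usual F_GETFL/F_SETFL dance, the underlying mechanics look like:

    #include <fcntl.h>

    static int fd_flags_clear_nonblock(int fd, int *saved)
    {
        int fl = fcntl(fd, F_GETFL);
        if (fl < 0) return -1;
        *saved = fl;                        /* remember for later restore */
        return fcntl(fd, F_SETFL, fl & ~(O_NONBLOCK | O_NDELAY));
    }

    static int fd_flags_restore(int fd, int saved)
    {
        return fcntl(fd, F_SETFL, saved);
    }
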
int libxl_domain_pause(libxl_ctx *ctx, uint32_t domid)
@@ -1012,8 +1035,6 @@ out:
int libxl_domain_unpause(libxl_ctx *ctx, uint32_t domid)
{
GC_INIT(ctx);
- char *path;
- char *state;
int ret, rc = 0;
libxl_domain_type type = libxl__domain_type(gc, domid);
@@ -1023,12 +1044,11 @@ int libxl_domain_unpause(libxl_ctx *ctx, uint32_t domid)
}
if (type == LIBXL_DOMAIN_TYPE_HVM) {
- path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/state", domid);
- state = libxl__xs_read(gc, XBT_NULL, path);
- if (state != NULL && !strcmp(state, "paused")) {
- libxl__qemu_traditional_cmd(gc, domid, "continue");
- libxl__wait_for_device_model_deprecated(gc, domid, "running",
- NULL, NULL, NULL);
+ rc = libxl__domain_resume_device_model(gc, domid);
+ if (rc < 0) {
+ LOG(ERROR, "failed to unpause device model for domain %u:%d",
+ domid, rc);
+ goto out;
}
}
ret = xc_domain_unpause(ctx->xch, domid);
@@ -1158,22 +1178,20 @@ static void domain_death_xswatch_callback(libxl__egc *egc, libxl__ev_xswatch *w,
const char *wpath, const char *epath) {
EGC_GC;
libxl_evgen_domain_death *evg;
- uint32_t domid;
int rc;
CTX_LOCK;
evg = LIBXL_TAILQ_FIRST(&CTX->death_list);
- if (!evg) goto out;
-
- domid = evg->domid;
for (;;) {
+ if (!evg) goto out;
+
int nentries = LIBXL_TAILQ_NEXT(evg, entry) ? 200 : 1;
xc_domaininfo_t domaininfos[nentries];
const xc_domaininfo_t *got = domaininfos, *gotend;
- rc = xc_domain_getinfolist(CTX->xch, domid, nentries, domaininfos);
+ rc = xc_domain_getinfolist(CTX->xch, evg->domid, nentries, domaininfos);
if (rc == -1) {
LIBXL__EVENT_DISASTER(egc, "xc_domain_getinfolist failed while"
" processing @releaseDomain watch event",
@@ -1183,8 +1201,10 @@ static void domain_death_xswatch_callback(libxl__egc *egc, libxl__ev_xswatch *w,
gotend = &domaininfos[rc];
LIBXL__LOG(CTX, LIBXL__LOG_DEBUG, "[evg=%p:%"PRIu32"]"
- " from domid=%"PRIu32" nentries=%d rc=%d",
- evg, evg->domid, domid, nentries, rc);
+ " nentries=%d rc=%d %ld..%ld",
+ evg, evg->domid, nentries, rc,
+ rc>0 ? (long)domaininfos[0].domain : 0,
+ rc>0 ? (long)domaininfos[rc-1].domain : 0);
for (;;) {
if (!evg) {
@@ -1247,7 +1267,6 @@ static void domain_death_xswatch_callback(libxl__egc *egc, libxl__ev_xswatch *w,
}
assert(rc); /* rc==0 results in us eating all evgs and quitting */
- domid = gotend[-1].domain;
}
all_reported:
out:
@@ -1549,6 +1568,10 @@ static void devices_destroy_cb(libxl__egc *egc,
libxl__devices_remove_state *drs,
int rc);
+static void domain_destroy_domid_cb(libxl__egc *egc,
+ libxl__ev_child *destroyer,
+ pid_t pid, int status);
+
void libxl__destroy_domid(libxl__egc *egc, libxl__destroy_domid_state *dis)
{
STATE_AO_GC(dis->ao);
@@ -1558,11 +1581,13 @@ void libxl__destroy_domid(libxl__egc *egc, libxl__destroy_domid_state *dis)
char *pid;
int rc, dm_present;
+ libxl__ev_child_init(&dis->destroyer);
+
rc = libxl_domain_info(ctx, NULL, domid);
switch(rc) {
case 0:
break;
- case ERROR_INVAL:
+ case ERROR_DOMAIN_NOTFOUND:
LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "non-existent domain %d", domid);
default:
goto out;
@@ -1627,7 +1652,7 @@ static void devices_destroy_cb(libxl__egc *egc,
uint32_t domid = dis->domid;
char *dom_path;
char *vm_path;
- libxl__domain_userdata_lock *lock = NULL;
+ libxl__domain_userdata_lock *lock;
dom_path = libxl__xs_get_dompath(gc, domid);
if (!dom_path) {
@@ -1649,8 +1674,6 @@ static void devices_destroy_cb(libxl__egc *egc,
xs_rm(ctx->xsh, XBT_NULL, libxl__xs_libxl_path(gc, domid));
xs_rm(ctx->xsh, XBT_NULL, libxl__sprintf(gc,
- "/local/domain/0/device-model/%d", domid));
- xs_rm(ctx->xsh, XBT_NULL, libxl__sprintf(gc,
"/local/domain/%d/hvmloader", domid));
/* This is async operation, we already hold CTX lock */
@@ -1661,18 +1684,70 @@ static void devices_destroy_cb(libxl__egc *egc,
}
libxl__userdata_destroyall(gc, domid);
- rc = xc_domain_destroy(ctx->xch, domid);
- if (rc < 0) {
- LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, rc, "xc_domain_destroy failed for %d", domid);
+ libxl__unlock_domain_userdata(lock);
+
+ /* Clean up qemu-save and qemu-resume files. They are
+ * intermediate files created by libxc. Unfortunately they
+ * don't fit into the existing userdata scheme very well.
+ */
+ rc = libxl__remove_file(gc, libxl__device_model_savefile(gc, domid));
+ if (rc < 0) goto out;
+ rc = libxl__remove_file(gc,
+ GCSPRINTF(LIBXL_DEVICE_MODEL_RESTORE_FILE".%u", domid));
+ if (rc < 0) goto out;
+
+ rc = libxl__ev_child_fork(gc, &dis->destroyer, domain_destroy_domid_cb);
+ if (rc < 0) goto out;
+ if (!rc) { /* child */
+ ctx->xch = xc_interface_open(ctx->lg,0,0);
+ if (!ctx->xch) goto badchild;
+
+ rc = xc_domain_destroy(ctx->xch, domid);
+ if (rc < 0) goto badchild;
+ _exit(0);
+
+ badchild:
+ if (errno > 0 && errno < 126) {
+ _exit(errno);
+ } else {
+ LOGE(ERROR,
+ "xc_domain_destroy failed for %d (with difficult errno value %d)",
+ domid, errno);
+ _exit(-1);
+ }
+ }
+ LOG(DEBUG, "forked pid %ld for destroy of domain %d", (long)rc, domid);
+
+ return;
+
+out:
+ dis->callback(egc, dis, rc);
+ return;
+}
+
+static void domain_destroy_domid_cb(libxl__egc *egc,
+ libxl__ev_child *destroyer,
+ pid_t pid, int status)
+{
+ libxl__destroy_domid_state *dis = CONTAINER_OF(destroyer, *dis, destroyer);
+ STATE_AO_GC(dis->ao);
+ int rc;
+
+ if (status) {
+ if (WIFEXITED(status) && WEXITSTATUS(status)<126) {
+ LOGEV(ERROR, WEXITSTATUS(status),
+ "xc_domain_destroy failed for %"PRIu32"", dis->domid);
+ } else {
+ libxl_report_child_exitstatus(CTX, XTL_ERROR,
+ "async domain destroy", pid, status);
+ }
rc = ERROR_FAIL;
goto out;
}
rc = 0;
-out:
- if (lock) libxl__unlock_domain_userdata(lock);
+ out:
dis->callback(egc, dis, rc);
- return;
}
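
xc_domain_destroy() can take a long time for a large domain, so it now runs in a forked child; small errno values are smuggled out through the 8-bit exit status and decoded in the callback above. The pattern in isolation (do_blocking_op() is a hypothetical stand-in):

    #include <errno.h>
    #include <sys/wait.h>
    #include <unistd.h>

    extern int do_blocking_op(void);   /* hypothetical; sets errno on failure */

    static pid_t start_async_op(void)
    {
        pid_t pid = fork();
        if (pid) return pid;           /* parent, or -1 on fork failure */

        if (do_blocking_op() < 0)      /* child: run the slow operation */
            _exit(errno > 0 && errno < 126 ? errno : 125);
        _exit(0);
    }

    static int reap_async_op(pid_t pid)   /* returns 0, an errno, or -1 */
    {
        int status;
        if (waitpid(pid, &status, 0) < 0) return -1;
        if (!WIFEXITED(status)) return -1;        /* killed by a signal etc. */
        return WEXITSTATUS(status);               /* 0 on success */
    }
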
int libxl_console_exec(libxl_ctx *ctx, uint32_t domid, int cons_num,
@@ -1734,7 +1809,7 @@ int libxl_console_get_tty(libxl_ctx *ctx, uint32_t domid, int cons_num,
}
tty = libxl__xs_read(gc, XBT_NULL, tty_path);
- if (!tty) {
+ if (!tty || tty[0] == '\0') {
LOGE(ERROR,"unable to read console tty path `%s'",tty_path);
rc = ERROR_FAIL;
goto out;
@@ -1892,8 +1967,10 @@ int libxl__get_domid(libxl__gc *gc, uint32_t *domid)
const char *xs_domid;
rc = libxl__xs_read_checked(gc, XBT_NULL, DOMID_XS_PATH, &xs_domid);
- if (rc || !xs_domid) {
- rc = rc ? rc : ERROR_FAIL;
+ if (rc) goto out;
+ if (!xs_domid) {
+ LOG(ERROR, "failed to get own domid (%s)", DOMID_XS_PATH);
+ rc = ERROR_FAIL;
goto out;
}
@@ -2023,7 +2100,7 @@ void libxl__device_vtpm_add(libxl__egc *egc, uint32_t domid,
flexarray_append(back, "online");
flexarray_append(back, "1");
flexarray_append(back, "state");
- flexarray_append(back, GCSPRINTF("%d", 1));
+ flexarray_append(back, GCSPRINTF("%d", XenbusStateInitialising));
flexarray_append(back, "handle");
flexarray_append(back, GCSPRINTF("%d", vtpm->devid));
@@ -2035,7 +2112,7 @@ void libxl__device_vtpm_add(libxl__egc *egc, uint32_t domid,
flexarray_append(front, "backend-id");
flexarray_append(front, GCSPRINTF("%d", vtpm->backend_domid));
flexarray_append(front, "state");
- flexarray_append(front, GCSPRINTF("%d", 1));
+ flexarray_append(front, GCSPRINTF("%d", XenbusStateInitialising));
flexarray_append(front, "handle");
flexarray_append(front, GCSPRINTF("%d", vtpm->devid));
@@ -2195,7 +2272,7 @@ int libxl_device_vtpm_getinfo(libxl_ctx *ctx,
val = libxl__xs_read(gc, XBT_NULL,
GCSPRINTF("%s/uuid", vtpminfo->backend));
if(val == NULL) {
- LOG(ERROR, "%s/uuid does not exist!\n", vtpminfo->backend);
+ LOG(ERROR, "%s/uuid does not exist!", vtpminfo->backend);
goto err;
}
if(libxl_uuid_from_string(&(vtpminfo->uuid), val)) {
@@ -2286,8 +2363,8 @@ int libxl__device_from_disk(libxl__gc *gc, uint32_t domid,
device->backend_kind = LIBXL__DEVICE_KIND_QDISK;
break;
default:
- LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "unrecognized disk backend type: %d\n",
- disk->backend);
+ LIBXL__LOG(ctx, LIBXL__LOG_ERROR,
+ "unrecognized disk backend type: %d", disk->backend);
return ERROR_INVAL;
}
@@ -2416,9 +2493,9 @@ static void device_disk_add(libxl__egc *egc, uint32_t domid,
if (!disk->script &&
disk->backend_domid == LIBXL_TOOLSTACK_DOMID) {
int major, minor;
- libxl__device_physdisk_major_minor(dev, &major, &minor);
- flexarray_append_pair(back, "physical-device",
- libxl__sprintf(gc, "%x:%x", major, minor));
+ if (!libxl__device_physdisk_major_minor(dev, &major, &minor))
+ flexarray_append_pair(back, "physical-device",
+ libxl__sprintf(gc, "%x:%x", major, minor));
}
assert(device->backend_kind == LIBXL__DEVICE_KIND_VBD);
@@ -2429,7 +2506,7 @@ static void device_disk_add(libxl__egc *egc, uint32_t domid,
dev = libxl__blktap_devpath(gc, disk->pdev_path,
disk->format);
if (!dev) {
- LOG(ERROR, "failed to get blktap devpath for %p\n",
+ LOG(ERROR, "failed to get blktap devpath for %p",
disk->pdev_path);
rc = ERROR_FAIL;
goto out;
@@ -2453,7 +2530,9 @@ static void device_disk_add(libxl__egc *egc, uint32_t domid,
assert(device->backend_kind == LIBXL__DEVICE_KIND_QDISK);
break;
default:
- LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "unrecognized disk backend type: %d\n", disk->backend);
+ LIBXL__LOG(ctx, LIBXL__LOG_ERROR,
+ "unrecognized disk backend type: %d",
+ disk->backend);
rc = ERROR_INVAL;
goto out;
}
@@ -2467,7 +2546,7 @@ static void device_disk_add(libxl__egc *egc, uint32_t domid,
flexarray_append(back, "bootable");
flexarray_append(back, libxl__sprintf(gc, "%d", 1));
flexarray_append(back, "state");
- flexarray_append(back, libxl__sprintf(gc, "%d", 1));
+ flexarray_append(back, GCSPRINTF("%d", XenbusStateInitialising));
flexarray_append(back, "dev");
flexarray_append(back, disk->vdev);
flexarray_append(back, "type");
@@ -2487,7 +2566,7 @@ static void device_disk_add(libxl__egc *egc, uint32_t domid,
flexarray_append(front, "backend-id");
flexarray_append(front, libxl__sprintf(gc, "%d", disk->backend_domid));
flexarray_append(front, "state");
- flexarray_append(front, libxl__sprintf(gc, "%d", 1));
+ flexarray_append(front, GCSPRINTF("%d", XenbusStateInitialising));
flexarray_append(front, "virtual-device");
flexarray_append(front, libxl__sprintf(gc, "%d", device->devid));
flexarray_append(front, "device-type");
@@ -2941,7 +3020,7 @@ out:
if (lock) libxl__unlock_domain_userdata(lock);
- if (rc) return AO_ABORT(rc);
+ if (rc) return AO_CREATE_FAIL(rc);
return AO_INPROGRESS;
}
@@ -2986,7 +3065,6 @@ void libxl__device_disk_local_initiate_attach(libxl__egc *egc,
libxl__disk_local_state *dls)
{
STATE_AO_GC(dls->ao);
- libxl_ctx *ctx = CTX;
char *dev = NULL;
int rc;
const libxl_device_disk *in_disk = dls->in_disk;
@@ -3004,55 +3082,21 @@ void libxl__device_disk_local_initiate_attach(libxl__egc *egc,
rc = libxl__device_disk_setdefault(gc, disk);
if (rc) goto out;
- switch (disk->backend) {
- case LIBXL_DISK_BACKEND_PHY:
- LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "locally attaching PHY disk %s",
- disk->pdev_path);
- dev = disk->pdev_path;
- break;
- case LIBXL_DISK_BACKEND_TAP:
- switch (disk->format) {
- case LIBXL_DISK_FORMAT_RAW:
- /* optimise away the early tapdisk attach in this case */
- LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "locally attaching"
- " tap disk %s directly (ie without using blktap)",
- disk->pdev_path);
- dev = disk->pdev_path;
- break;
- case LIBXL_DISK_FORMAT_VHD:
- dev = libxl__blktap_devpath(gc, disk->pdev_path,
- disk->format);
- break;
- case LIBXL_DISK_FORMAT_QCOW:
- case LIBXL_DISK_FORMAT_QCOW2:
- abort(); /* prevented by libxl__device_disk_set_backend */
- default:
- LIBXL__LOG(ctx, LIBXL__LOG_ERROR,
- "unrecognized disk format: %d", disk->format);
- rc = ERROR_FAIL;
- goto out;
- }
- break;
- case LIBXL_DISK_BACKEND_QDISK:
- if (disk->format != LIBXL_DISK_FORMAT_RAW) {
- libxl__prepare_ao_device(ao, &dls->aodev);
- dls->aodev.callback = local_device_attach_cb;
- device_disk_add(egc, LIBXL_TOOLSTACK_DOMID, disk,
- &dls->aodev, libxl__alloc_vdev,
- (void *) blkdev_start);
- return;
- } else {
- dev = disk->pdev_path;
- }
- LOG(DEBUG, "locally attaching qdisk %s", dev);
- break;
- default:
- LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "unrecognized disk backend "
- "type: %d", disk->backend);
- rc = ERROR_FAIL;
- goto out;
+ /* If this is in a driver domain, or it's not a raw format, or it involves
+ * running a script, we have to do a local attach. */
+ if (disk->backend_domname != NULL
+ || disk->format != LIBXL_DISK_FORMAT_RAW
+ || disk->script != NULL) {
+ libxl__prepare_ao_device(ao, &dls->aodev);
+ dls->aodev.callback = local_device_attach_cb;
+ device_disk_add(egc, LIBXL_TOOLSTACK_DOMID, disk, &dls->aodev,
+ libxl__alloc_vdev, (void *) blkdev_start);
+ return;
}
+ LOG(DEBUG, "locally attaching RAW disk %s", disk->pdev_path);
+ dev = disk->pdev_path;
+
if (dev != NULL)
dls->diskpath = libxl__strdup(gc, dev);
@@ -3085,13 +3129,13 @@ static void local_device_attach_cb(libxl__egc *egc, libxl__ao_device *aodev)
}
dev = GCSPRINTF("/dev/%s", disk->vdev);
- LOG(DEBUG, "locally attaching qdisk %s", dev);
+ LOG(DEBUG, "locally attaching disk %s", dev);
rc = libxl__device_from_disk(gc, LIBXL_TOOLSTACK_DOMID, disk, &device);
if (rc < 0)
goto out;
be_path = libxl__device_backend_path(gc, &device);
- rc = libxl__wait_for_backend(gc, be_path, "4");
+ rc = libxl__wait_for_backend(gc, be_path, GCSPRINTF("%d", XenbusStateConnected));
if (rc < 0)
goto out;
@@ -3124,29 +3168,18 @@ void libxl__device_disk_local_initiate_detach(libxl__egc *egc,
if (!dls->diskpath) goto out;
- switch (disk->backend) {
- case LIBXL_DISK_BACKEND_QDISK:
- if (disk->vdev != NULL) {
- GCNEW(device);
- rc = libxl__device_from_disk(gc, LIBXL_TOOLSTACK_DOMID,
- disk, device);
- if (rc != 0) goto out;
-
- aodev->action = LIBXL__DEVICE_ACTION_REMOVE;
- aodev->dev = device;
- aodev->callback = local_device_detach_cb;
- aodev->force = 0;
- libxl__initiate_device_remove(egc, aodev);
- return;
- }
- /* disk->vdev == NULL; fall through */
- default:
- /*
- * Nothing to do for PHYSTYPE_PHY.
- * For other device types assume that the blktap2 process is
- * needed by the soon to be started domain and do nothing.
- */
- goto out;
+ if (disk->vdev != NULL) {
+ GCNEW(device);
+ rc = libxl__device_from_disk(gc, LIBXL_TOOLSTACK_DOMID,
+ disk, device);
+ if (rc != 0) goto out;
+
+ aodev->action = LIBXL__DEVICE_ACTION_REMOVE;
+ aodev->dev = device;
+ aodev->callback = local_device_detach_cb;
+ aodev->force = 0;
+ libxl__initiate_device_remove(egc, aodev);
+ return;
}
out:
@@ -3185,7 +3218,6 @@ out:
int libxl__device_nic_setdefault(libxl__gc *gc, libxl_device_nic *nic,
uint32_t domid)
{
- int run_hotplug_scripts;
int rc;
if (!nic->mtu)
@@ -3216,12 +3248,6 @@ int libxl__device_nic_setdefault(libxl__gc *gc, libxl_device_nic *nic,
libxl__xen_script_dir_path()) < 0 )
return ERROR_FAIL;
- run_hotplug_scripts = libxl__hotplug_settings(gc, XBT_NULL);
- if (run_hotplug_scripts < 0) {
- LOG(ERROR, "unable to get current hotplug scripts execution setting");
- return run_hotplug_scripts;
- }
-
rc = libxl__resolve_domid(gc, nic->backend_domname, &nic->backend_domid);
if (rc < 0) return rc;
@@ -3301,7 +3327,7 @@ void libxl__device_nic_add(libxl__egc *egc, uint32_t domid,
flexarray_append(back, "online");
flexarray_append(back, "1");
flexarray_append(back, "state");
- flexarray_append(back, libxl__sprintf(gc, "%d", 1));
+ flexarray_append(back, GCSPRINTF("%d", XenbusStateInitialising));
if (nic->script)
flexarray_append_pair(back, "script",
libxl__abs_path(gc, nic->script,
@@ -3342,7 +3368,7 @@ void libxl__device_nic_add(libxl__egc *egc, uint32_t domid,
flexarray_append(front, "backend-id");
flexarray_append(front, libxl__sprintf(gc, "%d", nic->backend_domid));
flexarray_append(front, "state");
- flexarray_append(front, libxl__sprintf(gc, "%d", 1));
+ flexarray_append(front, GCSPRINTF("%d", XenbusStateInitialising));
flexarray_append(front, "handle");
flexarray_append(front, libxl__sprintf(gc, "%d", nic->devid));
flexarray_append(front, "mac");
@@ -3628,7 +3654,7 @@ int libxl__device_console_add(libxl__gc *gc, uint32_t domid,
flexarray_append(back, "online");
flexarray_append(back, "1");
flexarray_append(back, "state");
- flexarray_append(back, libxl__sprintf(gc, "%d", 1));
+ flexarray_append(back, GCSPRINTF("%d", XenbusStateInitialising));
flexarray_append(back, "protocol");
flexarray_append(back, LIBXL_XENCONSOLE_PROTOCOL);
@@ -3667,7 +3693,7 @@ int libxl__device_console_add(libxl__gc *gc, uint32_t domid,
flexarray_append(ro_front, libxl__sprintf(gc, "%lu", state->console_mfn));
} else {
flexarray_append(front, "state");
- flexarray_append(front, libxl__sprintf(gc, "%d", 1));
+ flexarray_append(front, GCSPRINTF("%d", XenbusStateInitialising));
flexarray_append(front, "protocol");
flexarray_append(front, LIBXL_XENCONSOLE_PROTOCOL);
}
@@ -3964,12 +3990,12 @@ int libxl__device_vkb_add(libxl__gc *gc, uint32_t domid,
flexarray_append(back, "online");
flexarray_append(back, "1");
flexarray_append(back, "state");
- flexarray_append(back, libxl__sprintf(gc, "%d", 1));
+ flexarray_append(back, GCSPRINTF("%d", XenbusStateInitialising));
flexarray_append(front, "backend-id");
flexarray_append(front, libxl__sprintf(gc, "%d", vkb->backend_domid));
flexarray_append(front, "state");
- flexarray_append(front, libxl__sprintf(gc, "%d", 1));
+ flexarray_append(front, GCSPRINTF("%d", XenbusStateInitialising));
libxl__device_generic_add(gc, XBT_NULL, &device,
libxl__xs_kvs_of_flexarray(gc, back, back->count),
@@ -4060,7 +4086,7 @@ int libxl__device_vfb_add(libxl__gc *gc, uint32_t domid, libxl_device_vfb *vfb)
flexarray_append_pair(back, "frontend-id", libxl__sprintf(gc, "%d", domid));
flexarray_append_pair(back, "online", "1");
- flexarray_append_pair(back, "state", libxl__sprintf(gc, "%d", 1));
+ flexarray_append_pair(back, "state", GCSPRINTF("%d", XenbusStateInitialising));
flexarray_append_pair(back, "vnc",
libxl_defbool_val(vfb->vnc.enable) ? "1" : "0");
flexarray_append_pair(back, "vnclisten", vfb->vnc.listen);
@@ -4082,7 +4108,7 @@ int libxl__device_vfb_add(libxl__gc *gc, uint32_t domid, libxl_device_vfb *vfb)
flexarray_append_pair(front, "backend-id",
libxl__sprintf(gc, "%d", vfb->backend_domid));
- flexarray_append_pair(front, "state", libxl__sprintf(gc, "%d", 1));
+ flexarray_append_pair(front, "state", GCSPRINTF("%d", XenbusStateInitialising));
libxl__device_generic_add(gc, XBT_NULL, &device,
libxl__xs_kvs_of_flexarray(gc, back, back->count),
@@ -4131,7 +4157,7 @@ out:
libxl__initiate_device_remove(egc, aodev); \
\
out: \
- if (rc) return AO_ABORT(rc); \
+ if (rc) return AO_CREATE_FAIL(rc); \
return AO_INPROGRESS; \
}
@@ -4395,32 +4421,36 @@ static void backend_watch_callback(libxl__egc *egc, libxl__ev_xswatch *watch,
libxl__ao *nested_ao = libxl__nested_ao_create(ddomain->ao);
STATE_AO_GC(nested_ao);
char *p, *path;
- const char *sstate;
- int state, rc, num_devs;
+ const char *sstate, *sonline;
+ int state, online, rc, num_devs;
libxl__device *dev = NULL;
libxl__ddomain_device *ddev = NULL;
libxl__ddomain_guest *dguest = NULL;
bool free_ao = false;
- /* Check if event_path ends with "state" and truncate it */
- if (strlen(event_path) < strlen("state"))
- goto skip;
-
+ /* Check if event_path ends with "state" or "online" and truncate it. */
path = libxl__strdup(gc, event_path);
- p = path + strlen(path) - strlen("state") - 1;
- if (*p != '/')
+ p = strrchr(path, '/');
+ if (p == NULL)
goto skip;
- *p = '\0';
- p++;
- if (strcmp(p, "state") != 0)
+ if (strcmp(p, "/state") != 0 && strcmp(p, "/online") != 0)
goto skip;
+ /* Truncate the string so it points to the backend directory. */
+ *p = '\0';
- /* Check if the state is 1 (XenbusStateInitialising) or greater */
- rc = libxl__xs_read_checked(gc, XBT_NULL, event_path, &sstate);
+ /* Fetch the value of the state and online nodes. */
+ rc = libxl__xs_read_checked(gc, XBT_NULL, GCSPRINTF("%s/state", path),
+ &sstate);
if (rc || !sstate)
goto skip;
state = atoi(sstate);
+ rc = libxl__xs_read_checked(gc, XBT_NULL, GCSPRINTF("%s/online", path),
+ &sonline);
+ if (rc || !sonline)
+ goto skip;
+ online = atoi(sonline);
+
dev = libxl__zalloc(NOGC, sizeof(*dev));
rc = libxl__parse_backend_path(gc, path, dev);
if (rc)
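
The watch callback now accepts events on both the state and online nodes, using strrchr() to test the final path component and truncating back to the backend directory. That parsing step, standalone:

    #include <string.h>

    /* Returns 1 and truncates path to its directory if the last
     * component is "state" or "online"; returns 0 otherwise. */
    static int is_state_or_online(char *path)
    {
        char *p = strrchr(path, '/');
        if (p == NULL)
            return 0;
        if (strcmp(p, "/state") != 0 && strcmp(p, "/online") != 0)
            return 0;
        *p = '\0';
        return 1;
    }
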
@@ -4462,7 +4492,7 @@ static void backend_watch_callback(libxl__egc *egc, libxl__ev_xswatch *watch,
rc = add_device(egc, nested_ao, dguest, ddev);
if (rc > 0)
free_ao = true;
- } else if (state == XenbusStateClosed) {
+ } else if (state == XenbusStateClosed && online == 0) {
/*
* Removal of an active device, remove it from the list and
* free its data structures if they are no longer needed.
@@ -4524,12 +4554,6 @@ int libxl_device_events_handler(libxl_ctx *ctx,
goto out;
}
- rc = libxl__xs_write_checked(gc, XBT_NULL, DISABLE_UDEV_PATH, "1");
- if (rc) {
- LOGE(ERROR, "unable to write %s = 1", DISABLE_UDEV_PATH);
- goto out;
- }
-
/*
* We use absolute paths because we want xswatch to also return
* absolute paths that can be parsed by libxl__parse_backend_path.
@@ -4537,11 +4561,12 @@ int libxl_device_events_handler(libxl_ctx *ctx,
be_path = GCSPRINTF("/local/domain/%u/backend", domid);
rc = libxl__ev_xswatch_register(gc, &ddomain.watch, backend_watch_callback,
be_path);
+ if (rc) goto out;
-out:
- GC_FREE;
- if (rc) return AO_ABORT(rc);
return AO_INPROGRESS;
+
+out:
+ return AO_CREATE_FAIL(rc);
}
/******************************************************************************/
@@ -4553,10 +4578,21 @@ int libxl_domain_setmaxmem(libxl_ctx *ctx, uint32_t domid, uint32_t max_memkb)
uint32_t memorykb;
char *dompath = libxl__xs_get_dompath(gc, domid);
int rc = 1;
+ libxl__domain_userdata_lock *lock = NULL;
+
+ CTX_LOCK;
+
+ lock = libxl__lock_domain_userdata(gc, domid);
+ if (!lock) {
+ rc = ERROR_LOCK_FAIL;
+ goto out;
+ }
mem = libxl__xs_read(gc, XBT_NULL, libxl__sprintf(gc, "%s/memory/target", dompath));
if (!mem) {
- LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "cannot get memory info from %s/memory/target\n", dompath);
+ LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
+ "cannot get memory info from %s/memory/target",
+ dompath);
goto out;
}
memorykb = strtoul(mem, &endptr, 10);
@@ -4566,7 +4602,8 @@ int libxl_domain_setmaxmem(libxl_ctx *ctx, uint32_t domid, uint32_t max_memkb)
}
if (max_memkb < memorykb) {
- LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "memory_static_max must be greater than or or equal to memory_dynamic_max\n");
+ LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
+ "memory_static_max must be greater than or equal to memory_dynamic_max");
goto out;
}
rc = xc_domain_setmaxmem(ctx->xch, domid, max_memkb + LIBXL_MAXMEM_CONSTANT);
@@ -4579,6 +4616,8 @@ int libxl_domain_setmaxmem(libxl_ctx *ctx, uint32_t domid, uint32_t max_memkb)
rc = 0;
out:
+ if (lock) libxl__unlock_domain_userdata(lock);
+ CTX_UNLOCK;
GC_FREE;
return rc;
}
@@ -4589,13 +4628,11 @@ static int libxl__fill_dom0_memory_info(libxl__gc *gc, uint32_t *target_memkb,
int rc;
libxl_dominfo info;
libxl_physinfo physinfo;
- char *target = NULL, *staticmax = NULL, *freememslack = NULL, *endptr = NULL;
+ char *target = NULL, *staticmax = NULL, *endptr = NULL;
char *target_path = "/local/domain/0/memory/target";
char *max_path = "/local/domain/0/memory/static-max";
- char *free_mem_slack_path = "/local/domain/0/memory/freemem-slack";
xs_transaction_t t;
libxl_ctx *ctx = libxl__gc_owner(gc);
- uint32_t free_mem_slack_kb = 0;
libxl_dominfo_init(&info);
@@ -4604,8 +4641,7 @@ retry_transaction:
target = libxl__xs_read(gc, t, target_path);
staticmax = libxl__xs_read(gc, t, max_path);
- freememslack = libxl__xs_read(gc, t, free_mem_slack_path);
- if (target && staticmax && freememslack) {
+ if (target && staticmax) {
rc = 0;
goto out;
}
@@ -4652,15 +4688,6 @@ retry_transaction:
*max_memkb = (uint32_t) info.max_memkb;
}
- if (freememslack == NULL) {
- free_mem_slack_kb = (uint32_t) (PAGE_TO_MEMKB(physinfo.total_pages) -
- info.current_memkb);
- /* From empirical measurements the free_mem_slack shouldn't be more
- * than 15% of the total memory present on the system. */
- if (free_mem_slack_kb > PAGE_TO_MEMKB(physinfo.total_pages) * 0.15)
- free_mem_slack_kb = PAGE_TO_MEMKB(physinfo.total_pages) * 0.15;
- libxl__xs_write(gc, t, free_mem_slack_path, "%"PRIu32, free_mem_slack_kb);
- }
rc = 0;
out:
@@ -4675,39 +4702,13 @@ out:
return rc;
}
-/* returns how much memory should be left free in the system */
-static int libxl__get_free_memory_slack(libxl__gc *gc, uint32_t *free_mem_slack)
-{
- int rc;
- char *free_mem_slack_path = "/local/domain/0/memory/freemem-slack";
- char *free_mem_slack_s, *endptr;
- uint32_t target_memkb, max_memkb;
-
-retry:
- free_mem_slack_s = libxl__xs_read(gc, XBT_NULL, free_mem_slack_path);
- if (!free_mem_slack_s) {
- rc = libxl__fill_dom0_memory_info(gc, &target_memkb, &max_memkb);
- if (rc < 0)
- return rc;
- goto retry;
- } else {
- *free_mem_slack = strtoul(free_mem_slack_s, &endptr, 10);
- if (*endptr != '\0') {
- LIBXL__LOG_ERRNO(gc->owner, LIBXL__LOG_ERROR,
- "invalid free_mem_slack %s from %s\n",
- free_mem_slack_s, free_mem_slack_path);
- return ERROR_FAIL;
- }
- }
- return 0;
-}
-
int libxl_set_memory_target(libxl_ctx *ctx, uint32_t domid,
int32_t target_memkb, int relative, int enforce)
{
GC_INIT(ctx);
int rc = 1, abort_transaction = 0;
- uint32_t memorykb = 0, videoram = 0;
+ uint64_t memorykb;
+ uint32_t videoram = 0;
uint32_t current_target_memkb = 0, new_target_memkb = 0;
uint32_t current_max_memkb = 0;
char *memmax, *endptr, *videoram_s = NULL, *target = NULL;
@@ -4716,6 +4717,15 @@ int libxl_set_memory_target(libxl_ctx *ctx, uint32_t domid,
libxl_dominfo ptr;
char *uuid;
xs_transaction_t t;
+ libxl__domain_userdata_lock *lock;
+
+ CTX_LOCK;
+
+ lock = libxl__lock_domain_userdata(gc, domid);
+ if (!lock) {
+ rc = ERROR_LOCK_FAIL;
+ goto out_no_transaction;
+ }
retry_transaction:
t = xs_transaction_start(ctx->xsh);
@@ -4732,8 +4742,8 @@ retry_transaction:
goto retry_transaction;
} else if (!target) {
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
- "cannot get target memory info from %s/memory/target\n",
- dompath);
+ "cannot get target memory info from %s/memory/target",
+ dompath);
abort_transaction = 1;
goto out;
} else {
@@ -4750,8 +4760,8 @@ retry_transaction:
"%s/memory/static-max", dompath));
if (!memmax) {
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
- "cannot get memory info from %s/memory/static-max\n",
- dompath);
+ "cannot get memory info from %s/memory/static-max",
+ dompath);
abort_transaction = 1;
goto out;
}
@@ -4764,13 +4774,17 @@ retry_transaction:
goto out;
}
+ videoram_s = libxl__xs_read(gc, t, libxl__sprintf(gc,
+ "%s/memory/videoram", dompath));
+ videoram = videoram_s ? atoi(videoram_s) : 0;
+
if (relative) {
if (target_memkb < 0 && abs(target_memkb) > current_target_memkb)
new_target_memkb = 0;
else
new_target_memkb = current_target_memkb + target_memkb;
} else
- new_target_memkb = target_memkb;
+ new_target_memkb = target_memkb - videoram;
if (new_target_memkb > memorykb) {
LIBXL__LOG(ctx, LIBXL__LOG_ERROR,
"memory_dynamic_max must be less than or equal to"
@@ -4781,29 +4795,25 @@ retry_transaction:
if (!domid && new_target_memkb < LIBXL_MIN_DOM0_MEM) {
LIBXL__LOG(ctx, LIBXL__LOG_ERROR,
- "new target %d for dom0 is below the minimum threshold\n",
- new_target_memkb);
+ "new target %d for dom0 is below the minimum threshold",
+ new_target_memkb);
abort_transaction = 1;
goto out;
}
- videoram_s = libxl__xs_read(gc, t, libxl__sprintf(gc,
- "%s/memory/videoram", dompath));
- videoram = videoram_s ? atoi(videoram_s) : 0;
if (enforce) {
- memorykb = new_target_memkb;
+ memorykb = new_target_memkb + videoram;
rc = xc_domain_setmaxmem(ctx->xch, domid, memorykb +
LIBXL_MAXMEM_CONSTANT);
if (rc != 0) {
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
- "xc_domain_setmaxmem domid=%d memkb=%d failed "
+ "xc_domain_setmaxmem domid=%u memkb=%"PRIu64" failed "
"rc=%d\n", domid, memorykb + LIBXL_MAXMEM_CONSTANT, rc);
abort_transaction = 1;
goto out;
}
}
- new_target_memkb -= videoram;
rc = xc_domain_set_pod_target(ctx->xch, domid,
new_target_memkb / 4, NULL, NULL, NULL);
if (rc != 0) {
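
With the reordered videoram handling, the balloon target now excludes videoram while the enforced maximum adds it back, so guest RAM plus videoram stays within the caller's request. Worked numbers under assumed values (SLACK stands in for LIBXL_MAXMEM_CONSTANT):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint64_t SLACK = 1024;        /* illustrative constant, KiB */
        uint64_t target_memkb = 1048576;    /* caller's request, KiB */
        uint64_t videoram     = 16384;      /* from %s/memory/videoram */

        uint64_t new_target = target_memkb - videoram;   /* balloon goal */
        uint64_t maxmem     = new_target + videoram + SLACK;

        printf("balloon target %" PRIu64 " KiB, maxmem %" PRIu64 " KiB\n",
               new_target, maxmem);
        return 0;
    }
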
@@ -4837,6 +4847,8 @@ out:
goto retry_transaction;
out_no_transaction:
+ if (lock) libxl__unlock_domain_userdata(lock);
+ CTX_UNLOCK;
GC_FREE;
return rc;
}
@@ -4864,12 +4876,12 @@ static int libxl__get_memory_target(libxl__gc *gc, uint32_t domid,
goto out;
} else if (!target) {
LIBXL__LOG_ERRNO(CTX, LIBXL__LOG_ERROR,
- "cannot get target memory info from %s/memory/target\n",
- dompath);
+ "cannot get target memory info from %s/memory/target",
+ dompath);
goto out;
} else if (!static_max) {
LIBXL__LOG_ERRNO(CTX, LIBXL__LOG_ERROR,
- "cannot get target memory info from %s/memory/static-max\n",
+ "cannot get target memory info from %s/memory/static-max",
dompath);
goto out;
} else {
@@ -4950,20 +4962,13 @@ int libxl_get_free_memory(libxl_ctx *ctx, uint32_t *memkb)
{
int rc = 0;
libxl_physinfo info;
- uint32_t freemem_slack;
GC_INIT(ctx);
rc = libxl_get_physinfo(ctx, &info);
if (rc < 0)
goto out;
- rc = libxl__get_free_memory_slack(gc, &freemem_slack);
- if (rc < 0)
- goto out;
- if ((info.free_pages + info.scrub_pages) * 4 > freemem_slack)
- *memkb = (info.free_pages + info.scrub_pages) * 4 - freemem_slack;
- else
- *memkb = 0;
+ *memkb = (info.free_pages + info.scrub_pages) * 4;
out:
GC_FREE;
@@ -4975,18 +4980,13 @@ int libxl_wait_for_free_memory(libxl_ctx *ctx, uint32_t domid, uint32_t
{
int rc = 0;
libxl_physinfo info;
- uint32_t freemem_slack;
GC_INIT(ctx);
- rc = libxl__get_free_memory_slack(gc, &freemem_slack);
- if (rc < 0)
- goto out;
while (wait_secs > 0) {
rc = libxl_get_physinfo(ctx, &info);
if (rc < 0)
goto out;
- if (info.free_pages * 4 >= freemem_slack &&
- info.free_pages * 4 - freemem_slack >= memory_kb) {
+ if (info.free_pages * 4 >= memory_kb) {
rc = 0;
goto out;
}
@@ -5004,26 +5004,41 @@ int libxl_wait_for_memory_target(libxl_ctx *ctx, uint32_t domid, int wait_secs)
{
int rc = 0;
uint32_t target_memkb = 0;
+ uint64_t current_memkb, prev_memkb;
libxl_dominfo info;
+ rc = libxl_get_memory_target(ctx, domid, &target_memkb);
+ if (rc < 0)
+ return rc;
+
libxl_dominfo_init(&info);
+ prev_memkb = UINT64_MAX;
do {
- wait_secs--;
- sleep(1);
-
- rc = libxl_get_memory_target(ctx, domid, &target_memkb);
- if (rc < 0)
- goto out;
+ sleep(2);
libxl_dominfo_dispose(&info);
libxl_dominfo_init(&info);
rc = libxl_domain_info(ctx, &info, domid);
if (rc < 0)
goto out;
- } while (wait_secs > 0 && (info.current_memkb + info.outstanding_memkb) > target_memkb);
- if ((info.current_memkb + info.outstanding_memkb) <= target_memkb)
+ current_memkb = info.current_memkb + info.outstanding_memkb;
+
+ if (current_memkb > prev_memkb)
+ {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+ else if (current_memkb == prev_memkb)
+ wait_secs -= 2;
+ /* if current_memkb < prev_memkb, loop for free as progress has
+ * been made */
+
+ prev_memkb = current_memkb;
+ } while (wait_secs > 0 && current_memkb > target_memkb);
+
+ if (current_memkb <= target_memkb)
rc = 0;
else
rc = ERROR_FAIL;
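
The wait loop above now charges the timeout only when ballooning stalls, fails immediately if memory moves away from the target, and loops for free while progress is being made. Its control flow in isolation (read_current_kb() is a hypothetical stand-in for the dominfo query):

    #include <stdint.h>
    #include <unistd.h>

    extern uint64_t read_current_kb(void);   /* hypothetical */

    static int wait_for_target(uint64_t target_kb, int wait_secs)
    {
        uint64_t prev = UINT64_MAX, cur;

        do {
            sleep(2);
            cur = read_current_kb();
            if (cur > prev)
                return -1;        /* ballooning regressed: give up */
            if (cur == prev)
                wait_secs -= 2;   /* no progress: charge the timeout */
            prev = cur;           /* progress is free */
        } while (wait_secs > 0 && cur > target_kb);

        return cur <= target_kb ? 0 : -1;
    }
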
@@ -5054,7 +5069,7 @@ int libxl_get_physinfo(libxl_ctx *ctx, libxl_physinfo *physinfo)
physinfo->scrub_pages = xcphysinfo.scrub_pages;
physinfo->outstanding_pages = xcphysinfo.outstanding_pages;
l = xc_sharing_freed_pages(ctx->xch);
- if (l == -ENOSYS) {
+ if (l < 0 && errno == ENOSYS) {
l = 0;
} else if (l < 0) {
LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, l,
@@ -5063,7 +5078,7 @@ int libxl_get_physinfo(libxl_ctx *ctx, libxl_physinfo *physinfo)
}
physinfo->sharing_freed_pages = l;
l = xc_sharing_used_frames(ctx->xch);
- if (l == -ENOSYS) {
+ if (l < 0 && errno == ENOSYS) {
l = 0;
} else if (l < 0) {
LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, l,
@@ -5084,64 +5099,83 @@ int libxl_get_physinfo(libxl_ctx *ctx, libxl_physinfo *physinfo)
libxl_cputopology *libxl_get_cpu_topology(libxl_ctx *ctx, int *nb_cpu_out)
{
GC_INIT(ctx);
- xc_topologyinfo_t tinfo;
- DECLARE_HYPERCALL_BUFFER(xc_cpu_to_core_t, coremap);
- DECLARE_HYPERCALL_BUFFER(xc_cpu_to_socket_t, socketmap);
- DECLARE_HYPERCALL_BUFFER(xc_cpu_to_node_t, nodemap);
+ xc_cputopo_t *cputopo;
libxl_cputopology *ret = NULL;
int i;
- int max_cpus;
+ unsigned num_cpus = 0;
- max_cpus = libxl_get_max_cpus(ctx);
- if (max_cpus < 0)
+ /* Setting buffer to NULL makes the call return number of CPUs */
+ if (xc_cputopoinfo(ctx->xch, &num_cpus, NULL))
{
- LIBXL__LOG(ctx, XTL_ERROR, "Unable to determine number of CPUS");
- ret = NULL;
+ LOGE(ERROR, "Unable to determine number of CPUS");
goto out;
}
- coremap = xc_hypercall_buffer_alloc
- (ctx->xch, coremap, sizeof(*coremap) * max_cpus);
- socketmap = xc_hypercall_buffer_alloc
- (ctx->xch, socketmap, sizeof(*socketmap) * max_cpus);
- nodemap = xc_hypercall_buffer_alloc
- (ctx->xch, nodemap, sizeof(*nodemap) * max_cpus);
- if ((coremap == NULL) || (socketmap == NULL) || (nodemap == NULL)) {
- LIBXL__LOG_ERRNOVAL(ctx, XTL_ERROR, ENOMEM,
- "Unable to allocate hypercall arguments");
- goto fail;
+ cputopo = libxl__zalloc(gc, sizeof(*cputopo) * num_cpus);
+
+ if (xc_cputopoinfo(ctx->xch, &num_cpus, cputopo)) {
+ LOGE(ERROR, "CPU topology info hypercall failed");
+ goto out;
}
- set_xen_guest_handle(tinfo.cpu_to_core, coremap);
- set_xen_guest_handle(tinfo.cpu_to_socket, socketmap);
- set_xen_guest_handle(tinfo.cpu_to_node, nodemap);
- tinfo.max_cpu_index = max_cpus - 1;
- if (xc_topologyinfo(ctx->xch, &tinfo) != 0) {
- LIBXL__LOG_ERRNO(ctx, XTL_ERROR, "Topology info hypercall failed");
- goto fail;
+ ret = libxl__zalloc(NOGC, sizeof(libxl_cputopology) * num_cpus);
+
+ for (i = 0; i < num_cpus; i++) {
+#define V(map, i, invalid) ( cputopo[i].map == invalid) ? \
+ LIBXL_CPUTOPOLOGY_INVALID_ENTRY : cputopo[i].map
+ ret[i].core = V(core, i, XEN_INVALID_CORE_ID);
+ ret[i].socket = V(socket, i, XEN_INVALID_SOCKET_ID);
+ ret[i].node = V(node, i, XEN_INVALID_NODE_ID);
+#undef V
}
- if (tinfo.max_cpu_index < max_cpus - 1)
- max_cpus = tinfo.max_cpu_index + 1;
+ *nb_cpu_out = num_cpus;
+
+ out:
+ GC_FREE;
+ return ret;
+}
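For callers outside libxl the two-call allocation pattern above (first call with a NULL buffer to learn the count, then a second call to fill it) stays hidden. A minimal consumer sketch, assuming only an initialised libxl_ctx named ctx:

    /* Print the core/socket/node of every CPU, then free the list. */
    int nb_cpus, i;
    libxl_cputopology *topo = libxl_get_cpu_topology(ctx, &nb_cpus);
    if (topo) {
        for (i = 0; i < nb_cpus; i++)
            printf("cpu%d: core %u socket %u node %u\n",
                   i, topo[i].core, topo[i].socket, topo[i].node);
        libxl_cputopology_list_free(topo, nb_cpus);
    }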
+
+libxl_pcitopology *libxl_get_pci_topology(libxl_ctx *ctx, int *num_devs)
+{
+ GC_INIT(ctx);
+ physdev_pci_device_t *devs;
+ uint32_t *nodes;
+ libxl_pcitopology *ret = NULL;
+ int i, rc;
+
+ *num_devs = libxl__pci_numdevs(gc);
+ if (*num_devs < 0) {
+ LOG(ERROR, "Unable to determine number of PCI devices, rc %d",
+ *num_devs);
+ goto out;
+ }
- ret = libxl__zalloc(NOGC, sizeof(libxl_cputopology) * max_cpus);
+ devs = libxl__zalloc(gc, sizeof(*devs) * *num_devs);
+ nodes = libxl__zalloc(gc, sizeof(*nodes) * *num_devs);
- for (i = 0; i < max_cpus; i++) {
-#define V(map, i) (map[i] == INVALID_TOPOLOGY_ID) ? \
- LIBXL_CPUTOPOLOGY_INVALID_ENTRY : map[i]
- ret[i].core = V(coremap, i);
- ret[i].socket = V(socketmap, i);
- ret[i].node = V(nodemap, i);
-#undef V
+ rc = libxl__pci_topology_init(gc, devs, *num_devs);
+ if (rc) {
+ LOG(ERROR, "Cannot initialize PCI hypercall structure, rc %d", rc);
+ goto out;
+ }
+
+ if (xc_pcitopoinfo(ctx->xch, *num_devs, devs, nodes) != 0) {
+ LOGE(ERROR, "PCI topology info hypercall failed");
+ goto out;
}
-fail:
- xc_hypercall_buffer_free(ctx->xch, coremap);
- xc_hypercall_buffer_free(ctx->xch, socketmap);
- xc_hypercall_buffer_free(ctx->xch, nodemap);
+ ret = libxl__zalloc(NOGC, sizeof(libxl_pcitopology) * *num_devs);
+
+ for (i = 0; i < *num_devs; i++) {
+ ret[i].seg = devs[i].seg;
+ ret[i].bus = devs[i].bus;
+ ret[i].devfn = devs[i].devfn;
+ ret[i].node = ((nodes[i] == XEN_INVALID_NODE_ID) ||
+ (nodes[i] == XEN_INVALID_DEV)) ?
+ LIBXL_PCITOPOLOGY_INVALID_ENTRY : nodes[i];
+ }
- if (ret)
- *nb_cpu_out = max_cpus;
out:
GC_FREE;
return ret;
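A matching consumer sketch for the new PCI topology call (hypothetical caller; ctx assumed valid; devfn is decoded into device/function the usual PCI way):

    /* Print the NUMA node of every PCI device, then free the list. */
    int num_devs, i;
    libxl_pcitopology *pt = libxl_get_pci_topology(ctx, &num_devs);
    if (pt) {
        for (i = 0; i < num_devs; i++)
            printf("%04x:%02x:%02x.%u -> node %u\n",
                   pt[i].seg, pt[i].bus, pt[i].devfn >> 3,
                   pt[i].devfn & 7, pt[i].node);
        libxl_pcitopology_list_free(pt, num_devs);
    }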
@@ -5150,66 +5184,43 @@ fail:
libxl_numainfo *libxl_get_numainfo(libxl_ctx *ctx, int *nr)
{
GC_INIT(ctx);
- xc_numainfo_t ninfo;
- DECLARE_HYPERCALL_BUFFER(xc_node_to_memsize_t, memsize);
- DECLARE_HYPERCALL_BUFFER(xc_node_to_memfree_t, memfree);
- DECLARE_HYPERCALL_BUFFER(uint32_t, node_dists);
+ xc_meminfo_t *meminfo;
+ uint32_t *distance;
libxl_numainfo *ret = NULL;
- int i, j, max_nodes;
+ int i, j;
+ unsigned num_nodes = 0;
- max_nodes = libxl_get_max_nodes(ctx);
- if (max_nodes < 0)
- {
- LIBXL__LOG(ctx, XTL_ERROR, "Unable to determine number of NODES");
- ret = NULL;
+ if (xc_numainfo(ctx->xch, &num_nodes, NULL, NULL)) {
+ LOGE(ERROR, "Unable to determine number of nodes");
goto out;
}
- memsize = xc_hypercall_buffer_alloc
- (ctx->xch, memsize, sizeof(*memsize) * max_nodes);
- memfree = xc_hypercall_buffer_alloc
- (ctx->xch, memfree, sizeof(*memfree) * max_nodes);
- node_dists = xc_hypercall_buffer_alloc
- (ctx->xch, node_dists, sizeof(*node_dists) * max_nodes * max_nodes);
- if ((memsize == NULL) || (memfree == NULL) || (node_dists == NULL)) {
- LIBXL__LOG_ERRNOVAL(ctx, XTL_ERROR, ENOMEM,
- "Unable to allocate hypercall arguments");
- goto fail;
- }
-
- set_xen_guest_handle(ninfo.node_to_memsize, memsize);
- set_xen_guest_handle(ninfo.node_to_memfree, memfree);
- set_xen_guest_handle(ninfo.node_to_node_distance, node_dists);
- ninfo.max_node_index = max_nodes - 1;
- if (xc_numainfo(ctx->xch, &ninfo) != 0) {
- LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "getting numainfo");
- goto fail;
- }
-
- if (ninfo.max_node_index < max_nodes - 1)
- max_nodes = ninfo.max_node_index + 1;
-
- *nr = max_nodes;
-
- ret = libxl__zalloc(NOGC, sizeof(libxl_numainfo) * max_nodes);
- for (i = 0; i < max_nodes; i++)
- ret[i].dists = libxl__calloc(NOGC, max_nodes, sizeof(*node_dists));
-
- for (i = 0; i < max_nodes; i++) {
-#define V(mem, i) (mem[i] == INVALID_NUMAINFO_ID) ? \
- LIBXL_NUMAINFO_INVALID_ENTRY : mem[i]
- ret[i].size = V(memsize, i);
- ret[i].free = V(memfree, i);
- ret[i].num_dists = max_nodes;
- for (j = 0; j < ret[i].num_dists; j++)
- ret[i].dists[j] = V(node_dists, i * max_nodes + j);
-#undef V
+ meminfo = libxl__zalloc(gc, sizeof(*meminfo) * num_nodes);
+ distance = libxl__zalloc(gc, sizeof(*distance) * num_nodes * num_nodes);
+
+ if (xc_numainfo(ctx->xch, &num_nodes, meminfo, distance)) {
+ LOGE(ERROR, "getting numainfo");
+ goto out;
}
- fail:
- xc_hypercall_buffer_free(ctx->xch, memsize);
- xc_hypercall_buffer_free(ctx->xch, memfree);
- xc_hypercall_buffer_free(ctx->xch, node_dists);
+ *nr = num_nodes;
+
+ ret = libxl__zalloc(NOGC, sizeof(libxl_numainfo) * num_nodes);
+ for (i = 0; i < num_nodes; i++)
+ ret[i].dists = libxl__calloc(NOGC, num_nodes, sizeof(*distance));
+
+ for (i = 0; i < num_nodes; i++) {
+#define V(val, invalid) (val == invalid) ? \
+ LIBXL_NUMAINFO_INVALID_ENTRY : val
+ ret[i].size = V(meminfo[i].memsize, XEN_INVALID_MEM_SZ);
+ ret[i].free = V(meminfo[i].memfree, XEN_INVALID_MEM_SZ);
+ ret[i].num_dists = num_nodes;
+ for (j = 0; j < ret[i].num_dists; j++) {
+ unsigned idx = i * num_nodes + j;
+ ret[i].dists[j] = V(distance[idx], XEN_INVALID_NODE_DIST);
+ }
+#undef V
+ }
out:
GC_FREE;
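As with the topology calls, a caller only ever sees the filled array. A sketch, assuming a valid ctx (PRIu64 is from inttypes.h):

    /* Print per-node memory and the node distance matrix. */
    int nr, i, j;
    libxl_numainfo *ni = libxl_get_numainfo(ctx, &nr);
    if (ni) {
        for (i = 0; i < nr; i++) {
            printf("node%d: size %"PRIu64" free %"PRIu64" dists:",
                   i, ni[i].size, ni[i].free);
            for (j = 0; j < ni[i].num_dists; j++)
                printf(" %u", ni[i].dists[j]);
            printf("\n");
        }
        libxl_numainfo_list_free(ni, nr);
    }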
@@ -5440,25 +5451,19 @@ int libxl_domain_get_nodeaffinity(libxl_ctx *ctx, uint32_t domid,
}
static int libxl__set_vcpuonline_xenstore(libxl__gc *gc, uint32_t domid,
- libxl_bitmap *cpumap)
+ libxl_bitmap *cpumap,
+ const libxl_dominfo *info)
{
- libxl_dominfo info;
char *dompath;
xs_transaction_t t;
int i, rc = ERROR_FAIL;
- libxl_dominfo_init(&info);
-
- if (libxl_domain_info(CTX, &info, domid) < 0) {
- LOGE(ERROR, "getting domain info list");
- goto out;
- }
if (!(dompath = libxl__xs_get_dompath(gc, domid)))
goto out;
retry_transaction:
t = xs_transaction_start(CTX->xsh);
- for (i = 0; i <= info.vcpu_max_id; i++)
+ for (i = 0; i <= info->vcpu_max_id; i++)
libxl__xs_write(gc, t,
libxl__sprintf(gc, "%s/cpu/%u/availability", dompath, i),
"%s", libxl_bitmap_test(cpumap, i) ? "online" : "offline");
@@ -5468,24 +5473,16 @@ retry_transaction:
} else
rc = 0;
out:
- libxl_dominfo_dispose(&info);
return rc;
}
static int libxl__set_vcpuonline_qmp(libxl__gc *gc, uint32_t domid,
- libxl_bitmap *cpumap)
+ libxl_bitmap *cpumap,
+ const libxl_dominfo *info)
{
- libxl_dominfo info;
int i;
- libxl_dominfo_init(&info);
-
- if (libxl_domain_info(CTX, &info, domid) < 0) {
- LOGE(ERROR, "getting domain info list");
- libxl_dominfo_dispose(&info);
- return ERROR_FAIL;
- }
- for (i = 0; i <= info.vcpu_max_id; i++) {
+ for (i = 0; i <= info->vcpu_max_id; i++) {
if (libxl_bitmap_test(cpumap, i)) {
/* Return value is ignored because it does not say anything useful
* about the completion of the command.
@@ -5495,33 +5492,53 @@ static int libxl__set_vcpuonline_qmp(libxl__gc *gc, uint32_t domid,
libxl__qmp_cpu_add(gc, domid, i);
}
}
- libxl_dominfo_dispose(&info);
return 0;
}
int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap)
{
GC_INIT(ctx);
- int rc;
+ int rc, maxcpus;
+ libxl_dominfo info;
+
+ libxl_dominfo_init(&info);
+
+ rc = libxl_domain_info(CTX, &info, domid);
+ if (rc < 0) {
+ LOGE(ERROR, "getting domain info list");
+ goto out;
+ }
+
+ maxcpus = libxl_bitmap_count_set(cpumap);
+ if (maxcpus > info.vcpu_max_id + 1)
+ {
+ LOGE(ERROR, "Requested %d VCPUs, however maxcpus is %d!",
+ maxcpus, info.vcpu_max_id + 1);
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
switch (libxl__domain_type(gc, domid)) {
case LIBXL_DOMAIN_TYPE_HVM:
switch (libxl__device_model_version_running(gc, domid)) {
case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL:
- rc = libxl__set_vcpuonline_xenstore(gc, domid, cpumap);
+ rc = libxl__set_vcpuonline_xenstore(gc, domid, cpumap, &info);
break;
case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
- rc = libxl__set_vcpuonline_qmp(gc, domid, cpumap);
+ rc = libxl__set_vcpuonline_qmp(gc, domid, cpumap, &info);
break;
default:
rc = ERROR_INVAL;
}
break;
case LIBXL_DOMAIN_TYPE_PV:
- rc = libxl__set_vcpuonline_xenstore(gc, domid, cpumap);
+ rc = libxl__set_vcpuonline_xenstore(gc, domid, cpumap, &info);
break;
default:
rc = ERROR_INVAL;
}
+out:
+ libxl_dominfo_dispose(&info);
GC_FREE;
return rc;
}
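A hedged usage sketch of the validation added above: the bitmap must not name more vcpus than the domain was built with. libxl_cpu_bitmap_alloc and the bitmap helpers come from libxl_utils.h; ctx and domid are assumed from the caller:

    /* Bring vcpus 0 and 1 online, leaving all others offline. */
    libxl_bitmap map;
    libxl_bitmap_init(&map);
    if (!libxl_cpu_bitmap_alloc(ctx, &map, 0)) {
        libxl_bitmap_set(&map, 0);
        libxl_bitmap_set(&map, 1);
        if (libxl_set_vcpuonline(ctx, domid, &map))
            fprintf(stderr, "failed to set vcpus online\n");
    }
    libxl_bitmap_dispose(&map);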
@@ -5657,7 +5674,7 @@ int libxl_sched_credit_params_set(libxl_ctx *ctx, uint32_t poolid,
}
if (scinfo->ratelimit_us > scinfo->tslice_ms*1000) {
LIBXL__LOG(ctx, LIBXL__LOG_ERROR,
- "Ratelimit cannot be greater than timeslice\n");
+ "Ratelimit cannot be greater than timeslice");
return ERROR_INVAL;
}
@@ -5725,73 +5742,6 @@ static int sched_credit2_domain_set(libxl__gc *gc, uint32_t domid,
return 0;
}
-static int sched_sedf_domain_get(libxl__gc *gc, uint32_t domid,
- libxl_domain_sched_params *scinfo)
-{
- uint64_t period;
- uint64_t slice;
- uint64_t latency;
- uint16_t extratime;
- uint16_t weight;
- int rc;
-
- rc = xc_sedf_domain_get(CTX->xch, domid, &period, &slice, &latency,
- &extratime, &weight);
- if (rc != 0) {
- LOGE(ERROR, "getting domain sched sedf");
- return ERROR_FAIL;
- }
-
- libxl_domain_sched_params_init(scinfo);
- scinfo->sched = LIBXL_SCHEDULER_SEDF;
- scinfo->period = period / 1000000;
- scinfo->slice = slice / 1000000;
- scinfo->latency = latency / 1000000;
- scinfo->extratime = extratime;
- scinfo->weight = weight;
-
- return 0;
-}
-
-static int sched_sedf_domain_set(libxl__gc *gc, uint32_t domid,
- const libxl_domain_sched_params *scinfo)
-{
- uint64_t period;
- uint64_t slice;
- uint64_t latency;
- uint16_t extratime;
- uint16_t weight;
-
- int ret;
-
- ret = xc_sedf_domain_get(CTX->xch, domid, &period, &slice, &latency,
- &extratime, &weight);
- if (ret != 0) {
- LOGE(ERROR, "getting domain sched sedf");
- return ERROR_FAIL;
- }
-
- if (scinfo->period != LIBXL_DOMAIN_SCHED_PARAM_PERIOD_DEFAULT)
- period = (uint64_t)scinfo->period * 1000000;
- if (scinfo->slice != LIBXL_DOMAIN_SCHED_PARAM_SLICE_DEFAULT)
- slice = (uint64_t)scinfo->slice * 1000000;
- if (scinfo->latency != LIBXL_DOMAIN_SCHED_PARAM_LATENCY_DEFAULT)
- latency = (uint64_t)scinfo->latency * 1000000;
- if (scinfo->extratime != LIBXL_DOMAIN_SCHED_PARAM_EXTRATIME_DEFAULT)
- extratime = scinfo->extratime;
- if (scinfo->weight != LIBXL_DOMAIN_SCHED_PARAM_WEIGHT_DEFAULT)
- weight = scinfo->weight;
-
- ret = xc_sedf_domain_set(CTX->xch, domid, period, slice, latency,
- extratime, weight);
- if ( ret < 0 ) {
- LOGE(ERROR, "setting domain sched sedf");
- return ERROR_FAIL;
- }
-
- return 0;
-}
-
static int sched_rtds_domain_get(libxl__gc *gc, uint32_t domid,
libxl_domain_sched_params *scinfo)
{
@@ -5870,7 +5820,8 @@ int libxl_domain_sched_params_set(libxl_ctx *ctx, uint32_t domid,
switch (sched) {
case LIBXL_SCHEDULER_SEDF:
- ret=sched_sedf_domain_set(gc, domid, scinfo);
+ LOG(ERROR, "SEDF scheduler no longer available");
+ ret=ERROR_FEATURE_REMOVED;
break;
case LIBXL_SCHEDULER_CREDIT:
ret=sched_credit_domain_set(gc, domid, scinfo);
@@ -5906,7 +5857,8 @@ int libxl_domain_sched_params_get(libxl_ctx *ctx, uint32_t domid,
switch (scinfo->sched) {
case LIBXL_SCHEDULER_SEDF:
- ret=sched_sedf_domain_get(gc, domid, scinfo);
+ LOG(ERROR, "SEDF scheduler no longer available");
+ ret=ERROR_FEATURE_REMOVED;
break;
case LIBXL_SCHEDULER_CREDIT:
ret=sched_credit_domain_get(gc, domid, scinfo);
@@ -6108,8 +6060,8 @@ char *libxl_tmem_list(libxl_ctx *ctx, uint32_t domid, int use_long)
int rc;
char _buf[32768];
- rc = xc_tmem_control(ctx->xch, -1, TMEMC_LIST, domid, 32768, use_long,
- 0, _buf);
+ rc = xc_tmem_control(ctx->xch, -1, XEN_SYSCTL_TMEM_OP_LIST, domid,
+ 32768, use_long, _buf);
if (rc < 0) {
LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, rc,
"Can not get tmem list");
@@ -6123,8 +6075,8 @@ int libxl_tmem_freeze(libxl_ctx *ctx, uint32_t domid)
{
int rc;
- rc = xc_tmem_control(ctx->xch, -1, TMEMC_FREEZE, domid, 0, 0,
- 0, NULL);
+ rc = xc_tmem_control(ctx->xch, -1, XEN_SYSCTL_TMEM_OP_FREEZE, domid, 0, 0,
+ NULL);
if (rc < 0) {
LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, rc,
"Can not freeze tmem pools");
@@ -6138,8 +6090,8 @@ int libxl_tmem_thaw(libxl_ctx *ctx, uint32_t domid)
{
int rc;
- rc = xc_tmem_control(ctx->xch, -1, TMEMC_THAW, domid, 0, 0,
- 0, NULL);
+ rc = xc_tmem_control(ctx->xch, -1, XEN_SYSCTL_TMEM_OP_THAW, domid, 0, 0,
+ NULL);
if (rc < 0) {
LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, rc,
"Can not thaw tmem pools");
@@ -6152,11 +6104,11 @@ int libxl_tmem_thaw(libxl_ctx *ctx, uint32_t domid)
static int32_t tmem_setop_from_string(char *set_name)
{
if (!strcmp(set_name, "weight"))
- return TMEMC_SET_WEIGHT;
+ return XEN_SYSCTL_TMEM_OP_SET_WEIGHT;
else if (!strcmp(set_name, "cap"))
- return TMEMC_SET_CAP;
+ return XEN_SYSCTL_TMEM_OP_SET_CAP;
else if (!strcmp(set_name, "compress"))
- return TMEMC_SET_COMPRESS;
+ return XEN_SYSCTL_TMEM_OP_SET_COMPRESS;
else
return -1;
}
@@ -6171,7 +6123,7 @@ int libxl_tmem_set(libxl_ctx *ctx, uint32_t domid, char* name, uint32_t set)
"Invalid set, valid sets are <weight|cap|compress>");
return ERROR_INVAL;
}
- rc = xc_tmem_control(ctx->xch, -1, subop, domid, set, 0, 0, NULL);
+ rc = xc_tmem_control(ctx->xch, -1, subop, domid, set, 0, NULL);
if (rc < 0) {
LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, rc,
"Can not set tmem %s", name);
@@ -6200,7 +6152,7 @@ int libxl_tmem_freeable(libxl_ctx *ctx)
{
int rc;
- rc = xc_tmem_control(ctx->xch, -1, TMEMC_QUERY_FREEABLE_MB, -1, 0, 0, 0, 0);
+ rc = xc_tmem_control(ctx->xch, -1, XEN_SYSCTL_TMEM_OP_QUERY_FREEABLE_MB, -1, 0, 0, 0);
if (rc < 0) {
LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, rc,
"Can not get tmem freeable memory");
@@ -6388,15 +6340,33 @@ out:
int libxl_cpupool_cpuadd(libxl_ctx *ctx, uint32_t poolid, int cpu)
{
- int rc;
+ GC_INIT(ctx);
+ int rc = 0;
rc = xc_cpupool_addcpu(ctx->xch, poolid, cpu);
if (rc) {
- LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, rc,
- "Error moving cpu to cpupool");
- return ERROR_FAIL;
+ LOGE(ERROR, "Error moving cpu %d to cpupool", cpu);
+ rc = ERROR_FAIL;
}
- return 0;
+
+ GC_FREE;
+ return rc;
+}
+
+int libxl_cpupool_cpuadd_cpumap(libxl_ctx *ctx, uint32_t poolid,
+ const libxl_bitmap *cpumap)
+{
+ int c, ncpus = 0, rc = 0;
+
+ libxl_for_each_set_bit(c, *cpumap) {
+ if (!libxl_cpupool_cpuadd(ctx, poolid, c))
+ ncpus++;
+ }
+
+ if (ncpus != libxl_bitmap_count_set(cpumap))
+ rc = ERROR_FAIL;
+
+ return rc;
}
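For example, a caller can move a whole set of cpus in one call rather than looping itself (a sketch; poolid and the cpu numbers are placeholders):

    /* Add cpus 2 and 3 to the pool; rc is ERROR_FAIL if any add failed. */
    int rc = ERROR_FAIL;
    libxl_bitmap map;
    libxl_bitmap_init(&map);
    if (!libxl_cpu_bitmap_alloc(ctx, &map, 0)) {
        libxl_bitmap_set(&map, 2);
        libxl_bitmap_set(&map, 3);
        rc = libxl_cpupool_cpuadd_cpumap(ctx, poolid, &map);
    }
    libxl_bitmap_dispose(&map);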
int libxl_cpupool_cpuadd_node(libxl_ctx *ctx, uint32_t poolid, int node, int *cpus)
@@ -6433,15 +6403,33 @@ out:
int libxl_cpupool_cpuremove(libxl_ctx *ctx, uint32_t poolid, int cpu)
{
- int rc;
+ GC_INIT(ctx);
+ int rc = 0;
rc = xc_cpupool_removecpu(ctx->xch, poolid, cpu);
if (rc) {
- LIBXL__LOG_ERRNOVAL(ctx, LIBXL__LOG_ERROR, rc,
- "Error removing cpu from cpupool");
- return ERROR_FAIL;
+ LOGE(ERROR, "Error removing cpu %d from cpupool", cpu);
+ rc = ERROR_FAIL;
}
- return 0;
+
+ GC_FREE;
+ return rc;
+}
+
+int libxl_cpupool_cpuremove_cpumap(libxl_ctx *ctx, uint32_t poolid,
+ const libxl_bitmap *cpumap)
+{
+ int c, ncpus = 0, rc = 0;
+
+ libxl_for_each_set_bit(c, *cpumap) {
+ if (!libxl_cpupool_cpuremove(ctx, poolid, c))
+ ncpus++;
+ }
+
+ if (ncpus != libxl_bitmap_count_set(cpumap))
+ rc = ERROR_FAIL;
+
+ return rc;
}
int libxl_cpupool_cpuremove_node(libxl_ctx *ctx, uint32_t poolid, int node, int *cpus)
@@ -6534,6 +6522,60 @@ int libxl_fd_set_cloexec(libxl_ctx *ctx, int fd, int cloexec)
int libxl_fd_set_nonblock(libxl_ctx *ctx, int fd, int nonblock)
{ return fd_set_flags(ctx,fd, F_GETFL,F_SETFL,"FL", O_NONBLOCK, nonblock); }
+int libxl__fd_flags_modify_save(libxl__gc *gc, int fd,
+ int mask, int val, int *r_oldflags)
+{
+ int rc, ret, fdfl;
+
+ fdfl = fcntl(fd, F_GETFL);
+ if (fdfl < 0) {
+ LOGE(ERROR, "failed to fcntl.F_GETFL for fd %d", fd);
+ rc = ERROR_FAIL;
+ goto out_err;
+ }
+
+ LOG(DEBUG, "fnctl F_GETFL flags for fd %d are 0x%x", fd, fdfl);
+
+ if (r_oldflags)
+ *r_oldflags = fdfl;
+
+ fdfl &= mask;
+ fdfl |= val;
+
+ LOG(DEBUG, "fnctl F_SETFL of fd %d to 0x%x", fd, fdfl);
+
+ ret = fcntl(fd, F_SETFL, fdfl);
+ if (ret < 0) {
+ LOGE(ERROR, "failed to fcntl.F_SETFL for fd %d", fd);
+ rc = ERROR_FAIL;
+ goto out_err;
+ }
+
+ rc = 0;
+
+out_err:
+ return rc;
+}
+
+int libxl__fd_flags_restore(libxl__gc *gc, int fd, int fdfl)
+{
+ int ret, rc;
+
+ LOG(DEBUG, "fnctl F_SETFL of fd %d to 0x%x", fd, fdfl);
+
+ ret = fcntl(fd, F_SETFL, fdfl);
+ if (ret < 0) {
+ LOGE(ERROR, "failed to fcntl.F_SETFL for fd %d", fd);
+ rc = ERROR_FAIL;
+ goto out_err;
+ }
+
+ rc = 0;
+
+out_err:
+ return rc;
+
+}
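These two internal helpers pair up as save-and-modify followed by restore. A sketch of the intended pattern, with gc and fd assumed from the enclosing libxl function:

    /* Temporarily make fd non-blocking, then put the old flags back. */
    int fdfl, rc;
    rc = libxl__fd_flags_modify_save(gc, fd, ~O_NONBLOCK, O_NONBLOCK, &fdfl);
    if (rc) goto out;
    /* ... non-blocking I/O on fd ... */
    rc = libxl__fd_flags_restore(gc, fd, fdfl);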
void libxl_hwcap_copy(libxl_ctx *ctx,libxl_hwcap *dst, libxl_hwcap *src)
{
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 0a123f1..fa5aedd 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -67,6 +67,13 @@
* the same $(XEN_VERSION) (e.g. throughout a major release).
*/
+/* LIBXL_HAVE_VNUMA
+ *
+ * If this is defined the type libxl_vnode_info exists, and a
+ * field 'vnuma_nodes' is present in libxl_domain_build_info.
+ */
+#define LIBXL_HAVE_VNUMA 1
+
/* LIBXL_HAVE_USERDATA_UNLINK
*
* If it is defined, libxl has a library function called
@@ -83,6 +90,24 @@
*/
#define LIBXL_HAVE_CPUPOOL_QUALIFIER_TO_CPUPOOLID 1
+/* LIBXL_HAVE_CPUPOOL_ADD_REM_CPUMAP
+ *
+ * If this is defined, libxl has two library functions called
+ * libxl_cpupool_cpuadd_cpumap and libxl_cpupool_cpuremove_cpumap,
+ * which allow adding to, or removing from, a cpupool all the cpus
+ * specified in a bitmap.
+ */
+#define LIBXL_HAVE_CPUPOOL_ADD_REM_CPUMAP 1
+
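Applications that must build against several libxl versions typically guard on these macros at compile time; a minimal sketch (ctx, poolid, map and rc assumed from the surrounding code):

#ifdef LIBXL_HAVE_CPUPOOL_ADD_REM_CPUMAP
    rc = libxl_cpupool_cpuadd_cpumap(ctx, poolid, &map);
#else
    /* Older libxl: fall back to adding cpus one at a time. */
    int c;
    libxl_for_each_set_bit(c, map)
        rc = libxl_cpupool_cpuadd(ctx, poolid, c);
#endif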
+/*
+ *
+ * LIBXL_HAVE_BITMAP_AND_OR
+ *
+ * If this is defined, libxl has two library functions, libxl_bitmap_and
+ * and libxl_bitmap_or, which compute the logical AND and OR of two bitmaps.
+ */
+#define LIBXL_HAVE_BITMAP_AND_OR 1
+
/*
* LIBXL_HAVE_FIRMWARE_PASSTHROUGH indicates the feature for
* passing in SMBIOS and ACPI firmware to HVM guests is present
@@ -163,6 +188,23 @@
#define LIBXL_HAVE_BUILDINFO_HVM_MMIO_HOLE_MEMKB 1
/*
+ * libxl_domain_info returns ERROR_DOMAIN_NOTFOUND if the domain
+ * is not present, instead of ERROR_INVAL.
+ */
+#define LIBXL_HAVE_ERROR_DOMAIN_NOTFOUND 1
+
+/*
+ * libxl_domain_build_info has device_tree and libxl_device_dtdev
+ * exists. This mean Device Tree passthrough is supported for ARM
+ */
+#define LIBXL_HAVE_DEVICETREE_PASSTHROUGH 1
+
+/*
+ * libxl_domain_build_info has the arm.gic_version field.
+ */
+#define LIBXL_HAVE_BUILDINFO_ARM_GIC_VERSION 1
+
+/*
* libxl ABI compatibility
*
* The only guarantee which libxl makes regarding ABI compatibility
@@ -308,8 +350,7 @@
* once afterwards, to clean up, regardless of whether operations on
* this object succeeded or failed. See the xl code for examples.
*
- * "init" is idempotent. We intend that "dispose" will become
- * idempotent, but this is not currently the case.
+ * "init" and "dispose" are idempotent.
*
* void libxl_<type>_init(<type> *p):
*
@@ -506,6 +547,16 @@ typedef struct libxl__ctx libxl_ctx;
#define LIBXL_HAVE_DOMINFO_OUTSTANDING_MEMKB 1
/*
+ * LIBXL_HAVE_QXL
+ *
+ * If defined, then libxl_vga_interface_type contains the additional value
+ * "QXL", indicating that the QXL VGA device is supported.
+ *
+ * If this is not defined, QXL VGA support is absent.
+ */
+#define LIBXL_HAVE_QXL 1
+
+/*
* LIBXL_HAVE_SPICE_VDAGENT
*
* If defined, then the libxl_spice_info structure will contain a boolean type:
@@ -528,6 +579,36 @@ typedef struct libxl__ctx libxl_ctx;
#define LIBXL_HAVE_SPICE_USBREDIREDIRECTION 1
/*
+ * LIBXL_HAVE_SPICE_IMAGECOMPRESSION
+ *
+ * If defined, then the libxl_spice_info structure will contain a string type
+ * field: image_compression. This value defines what Spice image compression
+ * is used.
+ *
+ * If this is not defined, the Spice image compression setting is not supported.
+ */
+#define LIBXL_HAVE_SPICE_IMAGECOMPRESSION 1
+
+/*
+ * LIBXL_HAVE_SPICE_STREAMINGVIDEO
+ *
+ * If defined, then the libxl_spice_info structure will contain a string type
+ * field: streaming_video. This value defines what Spice streaming video setting
+ * is used.
+ *
+ * If this is not defined, the Spice streaming video setting is not supported.
+ */
+#define LIBXL_HAVE_SPICE_STREAMINGVIDEO 1
+
+/*
+ * LIBXL_HAVE_HVM_HDTYPE
+ *
+ * If defined, then the u.hvm structure will contain an enum field
+ * hdtype.
+ */
+#define LIBXL_HAVE_HVM_HDTYPE 1
+
+/*
* LIBXL_HAVE_DOMAIN_CREATE_RESTORE_PARAMS 1
*
* If this is defined, libxl_domain_create_restore()'s API has changed to
@@ -620,6 +701,11 @@ typedef struct libxl__ctx libxl_ctx;
*/
#define LIBXL_HAVE_DEVICE_CHANNEL 1
+/*
+ * LIBXL_HAVE_AO_ABORT indicates the availability of libxl_ao_abort
+ */
+#define LIBXL_HAVE_AO_ABORT 1
+
/* Functions annotated with LIBXL_EXTERNAL_CALLERS_ONLY may not be
* called from within libxl itself. Callers outside libxl, who
* do not #include libxl_internal.h, are fine. */
@@ -672,6 +758,12 @@ typedef struct libxl__ctx libxl_ctx;
#define LIBXL_HAVE_BUILDINFO_SERIAL_LIST 1
/*
+ * LIBXL_HAVE_ALTP2M
+ * If this is defined, then libxl supports alternate p2m functionality.
+ */
+#define LIBXL_HAVE_ALTP2M 1
+
+/*
* LIBXL_HAVE_REMUS
* If this is defined, then libxl supports remus.
*/
@@ -690,8 +782,57 @@ void libxl_mac_copy(libxl_ctx *ctx, libxl_mac *dst, libxl_mac *src);
* If this is defined, the Cache Monitoring Technology feature is supported.
*/
#define LIBXL_HAVE_PSR_CMT 1
+
+/*
+ * LIBXL_HAVE_PSR_MBM
+ *
+ * If this is defined, the Memory Bandwidth Monitoring feature is supported.
+ */
+#define LIBXL_HAVE_PSR_MBM 1
+
+/*
+ * LIBXL_HAVE_PSR_CAT
+ *
+ * If this is defined, the Cache Allocation Technology feature is supported.
+ */
+#define LIBXL_HAVE_PSR_CAT 1
#endif
+/*
+ * LIBXL_HAVE_PCITOPOLOGY
+ *
+ * If this is defined, then interface to query hypervisor about PCI device
+ * topology is available.
+ */
+#define LIBXL_HAVE_PCITOPOLOGY 1
+
+/*
+ * LIBXL_HAVE_SOCKET_BITMAP
+ *
+ * If this is defined, then libxl_socket_bitmap_alloc and
+ * libxl_get_online_socketmap exist.
+ */
+#define LIBXL_HAVE_SOCKET_BITMAP 1
+
+/*
+ * LIBXL_HAVE_SRM_V2
+ *
+ * If this is defined, then the libxl_domain_create_restore() interface takes
+ * a "stream_version" parameter and supports a value of 2.
+ *
+ * libxl_domain_suspend() will produce a v2 stream.
+ */
+#define LIBXL_HAVE_SRM_V2 1
+
+/*
+ * LIBXL_HAVE_SRM_V1
+ *
+ * In the case that LIBXL_HAVE_SRM_V2 is set, LIBXL_HAVE_SRM_V1
+ * indicates that libxl_domain_create_restore() can handle a "stream_version"
+ * parameter of 1, and convert the stream format automatically.
+ */
+#define LIBXL_HAVE_SRM_V1 1
+
typedef char **libxl_string_list;
void libxl_string_list_dispose(libxl_string_list *sl);
int libxl_string_list_length(const libxl_string_list *sl);
@@ -775,6 +916,12 @@ const char *libxl_defbool_to_string(libxl_defbool b);
#define LIBXL_TIMER_MODE_DEFAULT -1
#define LIBXL_MEMKB_DEFAULT ~0ULL
+/*
+ * We'd like to set a memory boundary to determine if we need to check
+ * any overlap with reserved device memory.
+ */
+#define LIBXL_RDM_MEM_BOUNDARY_MEMKB_DEFAULT (2048 * 1024)
+
#define LIBXL_MS_VM_GENID_LEN 16
typedef struct {
uint8_t bytes[LIBXL_MS_VM_GENID_LEN];
@@ -882,6 +1029,59 @@ typedef struct {
void *for_callback; /* passed to callback */
} libxl_asyncprogress_how;
+/*
+ * It is sometimes possible to abort an asynchronous operation.
+ *
+ * libxl_ao_abort searches for an ongoing asynchronous operation whose
+ * ao_how is identical to *how, and tries to abort it. The return
+ * values from libxl_ao_abort are as follows:
+ *
+ * 0
+ *
+ * The operation was found, and attempts are being made to cut it
+ * short. However, it may still take some time to stop. It is
+ * also possible that the operation will nevertheless complete
+ * successfully.
+ *
+ * ERROR_NOTFOUND
+ *
+ * No matching ongoing operation was found. This might happen
+ * for an actual operation if the operation has already completed
+ * (perhaps on another thread). The call to libxl_ao_abort has
+ * had no effect.
+ *
+ * ERROR_ABORTED
+ *
+ * The operation has already been the subject of at least one
+ * call to libxl_ao_abort.
+ *
+ * If the operation was indeed cut short due to the abort request, it
+ * will complete, at some point in the future, with ERROR_ABORTED. In
+ * that case, depending on the operation, it may have performed some of
+ * the work in question and left the operation half-done. Consult the
+ * documentation for individual operations.
+ *
+ * Note that an aborted operation might still fail for other reasons
+ * even after the abort was requested.
+ *
+ * If your application is multithreaded you must not reuse an
+ * ao_how->for_event or ao_how->for_callback value (with a particular
+ * ao_how->callback) unless you are sure that none of your other
+ * threads are going to abort the previous operation using that
+ * value; otherwise you risk aborting the wrong operation if the
+ * intended target of the abort request completes in the meantime.
+ *
+ * It is possible to abort even an operation which is being performed
+ * synchronously, but since in that case how==NULL you had better only
+ * have one such operation, because it is not possible to tell them
+ * apart (and libxl_ao_abort will abort only the first one it finds).
+ * (And, if you want to do this, obviously the abort would have to be
+ * requested on a different thread.)
+ */
+int libxl_ao_abort(libxl_ctx *ctx, const libxl_asyncop_how *how)
+ LIBXL_EXTERNAL_CALLERS_ONLY;
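A hedged sketch of the calling pattern described above; my_done_cb and op_cookie are hypothetical application names, and d_config/domid come from the caller:

    /* Start the operation with a distinctive how, so it can be aborted. */
    libxl_asyncop_how how = {
        .callback = my_done_cb,
        .u = { .for_callback = op_cookie },
    };
    libxl_domain_create_new(ctx, &d_config, &domid, &how, NULL);
    /* ... later, e.g. on user cancel ... */
    rc = libxl_ao_abort(ctx, &how);
    if (rc == ERROR_NOTFOUND)
        ; /* the operation had already completed */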
+
+
#define LIBXL_VERSION 0
/* context functions */
@@ -892,6 +1092,10 @@ int libxl_ctx_free(libxl_ctx *ctx /* 0 is OK */);
/* domain related functions */
+/* If the result is ERROR_ABORTED, the domain may or may not exist
+ * (in a half-created state). *domid will be valid and will be the
+ * domain id, or -1, as appropriate */
+
int libxl_domain_create_new(libxl_ctx *ctx, libxl_domain_config *d_config,
uint32_t *domid,
const libxl_asyncop_how *ao_how,
@@ -1022,7 +1226,19 @@ int libxl_domain_need_memory(libxl_ctx *ctx, libxl_domain_build_info *b_info,
int libxl_get_free_memory(libxl_ctx *ctx, uint32_t *memkb);
/* wait for a given amount of memory to be free in the system */
int libxl_wait_for_free_memory(libxl_ctx *ctx, uint32_t domid, uint32_t memory_kb, int wait_secs);
-/* wait for the memory target of a domain to be reached */
+/*
+ * Wait for the memory target of a domain to be reached. Does not
+ * decrement wait_secs if the domain is making progress toward reaching
+ * the target. If the domain is not making progress, wait_secs is
+ * decremented. If the timeout expires before the target is reached, the
+ * function returns ERROR_FAIL.
+ *
+ * Older versions of this function (Xen 4.5 and older), decremented
+ * wait_secs even if the domain was making progress, resulting in far
+ * lower overall wait times. To make sure that your calling routine
+ * works with new and old implementations of the function, pass enough
+ * time for the guest to reach its target as an argument.
+ */
int libxl_wait_for_memory_target(libxl_ctx *ctx, uint32_t domid, int wait_secs);
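For instance (a sketch; the 30-second figure is an arbitrary stall budget, not a total deadline, per the semantics above):

    /* Allow up to 30s of *stalled* ballooning before giving up. */
    if (libxl_wait_for_memory_target(ctx, domid, 30))
        fprintf(stderr, "domain %u did not reach its memory target\n", domid);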
int libxl_vncviewer_exec(libxl_ctx *ctx, uint32_t domid, int autopass);
@@ -1048,7 +1264,9 @@ int libxl_console_get_tty(libxl_ctx *ctx, uint32_t domid, int cons_num,
*/
int libxl_primary_console_get_tty(libxl_ctx *ctx, uint32_t domid_vm, char **path);
-/* May be called with info_r == NULL to check for domain's existance */
+/* May be called with info_r == NULL to check for domain's existence.
+ * Returns ERROR_DOMAIN_NOTFOUND if the domain does not exist (earlier
+ * versions returned ERROR_INVAL in this case). */
int libxl_domain_info(libxl_ctx*, libxl_dominfo *info_r,
uint32_t domid);
@@ -1070,6 +1288,10 @@ void libxl_vminfo_list_free(libxl_vminfo *list, int nb_vm);
libxl_cputopology *libxl_get_cpu_topology(libxl_ctx *ctx, int *nb_cpu_out);
void libxl_cputopology_list_free(libxl_cputopology *, int nb_cpu);
+#define LIBXL_PCITOPOLOGY_INVALID_ENTRY (~(uint32_t)0)
+libxl_pcitopology *libxl_get_pci_topology(libxl_ctx *ctx, int *num_devs);
+void libxl_pcitopology_list_free(libxl_pcitopology *, int num_devs);
+
#define LIBXL_NUMAINFO_INVALID_ENTRY (~(uint32_t)0)
libxl_numainfo *libxl_get_numainfo(libxl_ctx *ctx, int *nr);
void libxl_numainfo_list_free(libxl_numainfo *, int nr);
@@ -1263,6 +1485,9 @@ libxl_device_pci *libxl_device_pci_list(libxl_ctx *ctx, uint32_t domid,
* From a libxl API point of view, this starts a long-running
* operation. That operation consists of "being a driver domain"
* and never completes.
+ *
+ * Attempting to abort this operation is not advisable; proper
+ * shutdown of the driver domain task is not supported.
*/
int libxl_device_events_handler(libxl_ctx *ctx,
const libxl_asyncop_how *ao_how)
@@ -1428,8 +1653,12 @@ int libxl_cpupool_destroy(libxl_ctx *ctx, uint32_t poolid);
int libxl_cpupool_rename(libxl_ctx *ctx, const char *name, uint32_t poolid);
int libxl_cpupool_cpuadd(libxl_ctx *ctx, uint32_t poolid, int cpu);
int libxl_cpupool_cpuadd_node(libxl_ctx *ctx, uint32_t poolid, int node, int *cpus);
+int libxl_cpupool_cpuadd_cpumap(libxl_ctx *ctx, uint32_t poolid,
+ const libxl_bitmap *cpumap);
int libxl_cpupool_cpuremove(libxl_ctx *ctx, uint32_t poolid, int cpu);
int libxl_cpupool_cpuremove_node(libxl_ctx *ctx, uint32_t poolid, int node, int *cpus);
+int libxl_cpupool_cpuremove_cpumap(libxl_ctx *ctx, uint32_t poolid,
+ const libxl_bitmap *cpumap);
int libxl_cpupool_movedomain(libxl_ctx *ctx, uint32_t poolid, uint32_t domid);
int libxl_cpupool_info(libxl_ctx *ctx, libxl_cpupoolinfo *info, uint32_t poolid);
@@ -1454,10 +1683,51 @@ int libxl_psr_cmt_detach(libxl_ctx *ctx, uint32_t domid);
int libxl_psr_cmt_domain_attached(libxl_ctx *ctx, uint32_t domid);
int libxl_psr_cmt_enabled(libxl_ctx *ctx);
int libxl_psr_cmt_get_total_rmid(libxl_ctx *ctx, uint32_t *total_rmid);
-int libxl_psr_cmt_get_l3_cache_size(libxl_ctx *ctx, uint32_t socketid,
- uint32_t *l3_cache_size);
-int libxl_psr_cmt_get_cache_occupancy(libxl_ctx *ctx, uint32_t domid,
- uint32_t socketid, uint32_t *l3_cache_occupancy);
+int libxl_psr_cmt_get_l3_cache_size(libxl_ctx *ctx,
+ uint32_t socketid,
+ uint32_t *l3_cache_size);
+int libxl_psr_cmt_get_cache_occupancy(libxl_ctx *ctx,
+ uint32_t domid,
+ uint32_t socketid,
+ uint32_t *l3_cache_occupancy);
+#endif
+
+#ifdef LIBXL_HAVE_PSR_MBM
+int libxl_psr_cmt_type_supported(libxl_ctx *ctx, libxl_psr_cmt_type type);
+int libxl_psr_cmt_get_sample(libxl_ctx *ctx,
+ uint32_t domid,
+ libxl_psr_cmt_type type,
+ uint64_t scope,
+ uint64_t *sample_r,
+ uint64_t *tsc_r);
+#endif
+
+#ifdef LIBXL_HAVE_PSR_CAT
+/*
+ * Function to set a domain's cbm. It operates on a single or multiple
+ * target(s) defined in 'target_map'. The definition of 'target_map' is
+ * related to 'type':
+ * 'L3_CBM': 'target_map' specifies all the sockets to be operated on.
+ */
+int libxl_psr_cat_set_cbm(libxl_ctx *ctx, uint32_t domid,
+ libxl_psr_cbm_type type, libxl_bitmap *target_map,
+ uint64_t cbm);
+/*
+ * Function to get a domain's cbm. It operates on a single 'target'.
+ * The definition of 'target' is related to 'type':
+ * 'L3_CBM': 'target' specifies which socket to be operated on.
+ */
+int libxl_psr_cat_get_cbm(libxl_ctx *ctx, uint32_t domid,
+ libxl_psr_cbm_type type, uint32_t target,
+ uint64_t *cbm_r);
+
+/*
+ * On success, the function returns an array of elements in 'info',
+ * and the length in 'nr'.
+ */
+int libxl_psr_cat_get_l3_info(libxl_ctx *ctx, libxl_psr_cat_info **info,
+ int *nr);
+void libxl_psr_cat_info_list_free(libxl_psr_cat_info *list, int nr);
#endif
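A hedged usage sketch of the CAT calls above; the socket bitmap 'sockets' and the CBM value 0x3 are placeholders chosen for illustration:

    /* Restrict the domain to the two lowest-order L3 cache ways on
     * every socket set in 'sockets', then read one socket back. */
    rc = libxl_psr_cat_set_cbm(ctx, domid, LIBXL_PSR_CBM_TYPE_L3_CBM,
                               &sockets, 0x3);
    if (!rc) {
        uint64_t cbm;
        rc = libxl_psr_cat_get_cbm(ctx, domid, LIBXL_PSR_CBM_TYPE_L3_CBM,
                                   0 /* socket */, &cbm);
    }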
/* misc */
diff --git a/tools/libxl/libxl_aoutils.c b/tools/libxl/libxl_aoutils.c
index b10d2e1..d5fbc4d 100644
--- a/tools/libxl/libxl_aoutils.c
+++ b/tools/libxl/libxl_aoutils.c
@@ -46,7 +46,7 @@ int libxl__xswait_start(libxl__gc *gc, libxl__xswait_state *xswa)
{
int rc;
- rc = libxl__ev_time_register_rel(gc, &xswa->time_ev,
+ rc = libxl__ev_time_register_rel(xswa->ao, &xswa->time_ev,
xswait_timeout_callback, xswa->timeout_ms);
if (rc) goto err;
@@ -80,12 +80,13 @@ void xswait_xswatch_callback(libxl__egc *egc, libxl__ev_xswatch *xsw,
}
void xswait_timeout_callback(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs)
+ const struct timeval *requested_abs,
+ int rc)
{
EGC_GC;
libxl__xswait_state *xswa = CONTAINER_OF(ev, *xswa, time_ev);
LOG(DEBUG, "%s: xswait timeout (path=%s)", xswa->what, xswa->path);
- xswait_report_error(egc, xswa, ERROR_TIMEDOUT);
+ xswait_report_error(egc, xswa, rc);
}
static void xswait_report_error(libxl__egc *egc, libxl__xswait_state *xswa,
@@ -102,6 +103,7 @@ static void xswait_report_error(libxl__egc *egc, libxl__xswait_state *xswa,
void libxl__datacopier_init(libxl__datacopier_state *dc)
{
assert(dc->ao);
+ libxl__ao_abortable_init(&dc->abrt);
libxl__ev_fd_init(&dc->toread);
libxl__ev_fd_init(&dc->towrite);
LIBXL_TAILQ_INIT(&dc->bufs);
@@ -112,6 +114,7 @@ void libxl__datacopier_kill(libxl__datacopier_state *dc)
STATE_AO_GC(dc->ao);
libxl__datacopier_buf *buf, *tbuf;
+ libxl__ao_abortable_deregister(&dc->abrt);
libxl__ev_fd_deregister(gc, &dc->toread);
libxl__ev_fd_deregister(gc, &dc->towrite);
LIBXL_TAILQ_FOREACH_SAFE(buf, &dc->bufs, entry, tbuf)
@@ -120,10 +123,10 @@ void libxl__datacopier_kill(libxl__datacopier_state *dc)
}
static void datacopier_callback(libxl__egc *egc, libxl__datacopier_state *dc,
- int onwrite, int errnoval)
+ int rc, int onwrite, int errnoval)
{
libxl__datacopier_kill(dc);
- dc->callback(egc, dc, onwrite, errnoval);
+ dc->callback(egc, dc, rc, onwrite, errnoval);
}
static void datacopier_writable(libxl__egc *egc, libxl__ev_fd *ev,
@@ -134,20 +137,21 @@ static void datacopier_check_state(libxl__egc *egc, libxl__datacopier_state *dc)
STATE_AO_GC(dc->ao);
int rc;
- if (dc->used) {
+ if (dc->used && !dc->readbuf) {
if (!libxl__ev_fd_isregistered(&dc->towrite)) {
rc = libxl__ev_fd_register(gc, &dc->towrite, datacopier_writable,
dc->writefd, POLLOUT);
if (rc) {
LOG(ERROR, "unable to establish write event on %s"
" during copy of %s", dc->writewhat, dc->copywhat);
- datacopier_callback(egc, dc, -1, 0);
+ datacopier_callback(egc, dc, ERROR_FAIL, -1, EIO);
return;
}
}
- } else if (!libxl__ev_fd_isregistered(&dc->toread)) {
+ } else if (!libxl__ev_fd_isregistered(&dc->toread) ||
+ dc->bytes_to_read == 0) {
/* we have had eof */
- datacopier_callback(egc, dc, 0, 0);
+ datacopier_callback(egc, dc, 0, 0, 0);
return;
} else {
/* nothing buffered, but still reading */
@@ -160,6 +164,8 @@ void libxl__datacopier_prefixdata(libxl__egc *egc, libxl__datacopier_state *dc,
{
EGC_GC;
libxl__datacopier_buf *buf;
+ const uint8_t *ptr;
+
/*
* It is safe for this to be called immediately after _start, as
* is documented in the public comment. _start's caller must have
@@ -170,75 +176,118 @@ void libxl__datacopier_prefixdata(libxl__egc *egc, libxl__datacopier_state *dc,
assert(len < dc->maxsz - dc->used);
- buf = libxl__zalloc(NOGC, sizeof(*buf));
- buf->used = len;
- memcpy(buf->buf, data, len);
+ for (ptr = data; len; len -= buf->used, ptr += buf->used) {
+ buf = libxl__malloc(NOGC, sizeof(*buf));
+ buf->used = min(len, sizeof(buf->buf));
+ memcpy(buf->buf, ptr, buf->used);
- dc->used += len;
- LIBXL_TAILQ_INSERT_TAIL(&dc->bufs, buf, entry);
+ dc->used += buf->used;
+ LIBXL_TAILQ_INSERT_TAIL(&dc->bufs, buf, entry);
+ }
}
static int datacopier_pollhup_handled(libxl__egc *egc,
libxl__datacopier_state *dc,
- short revents, int onwrite)
+ int fd, short revents, int onwrite)
{
STATE_AO_GC(dc->ao);
if (dc->callback_pollhup && (revents & POLLHUP)) {
- LOG(DEBUG, "received POLLHUP on %s during copy of %s",
- onwrite ? dc->writewhat : dc->readwhat,
- dc->copywhat);
+ LOG(DEBUG, "received POLLHUP on fd %d: %s during copy of %s",
+ fd, onwrite ? dc->writewhat : dc->readwhat, dc->copywhat);
libxl__datacopier_kill(dc);
- dc->callback_pollhup(egc, dc, onwrite, -1);
+ dc->callback_pollhup(egc, dc, ERROR_FAIL, onwrite, -1);
return 1;
}
return 0;
}
+static void datacopier_abort(libxl__egc *egc, libxl__ao_abortable *abrt,
+ int rc)
+{
+ libxl__datacopier_state *dc = CONTAINER_OF(abrt, *dc, abrt);
+ STATE_AO_GC(dc->ao);
+
+ datacopier_callback(egc, dc, rc, -1, 0);
+}
+
static void datacopier_readable(libxl__egc *egc, libxl__ev_fd *ev,
int fd, short events, short revents) {
libxl__datacopier_state *dc = CONTAINER_OF(ev, *dc, toread);
STATE_AO_GC(dc->ao);
- if (datacopier_pollhup_handled(egc, dc, revents, 0))
+ if (datacopier_pollhup_handled(egc, dc, fd, revents, 0))
return;
- if (revents & ~POLLIN) {
- LOG(ERROR, "unexpected poll event 0x%x (should be POLLIN)"
- " on %s during copy of %s", revents, dc->readwhat, dc->copywhat);
- datacopier_callback(egc, dc, -1, 0);
+ if (revents & ~(POLLIN|POLLHUP)) {
+ LOG(ERROR, "unexpected poll event 0x%x on fd %d (expected POLLIN "
+ "and/or POLLHUP) reading %s during copy of %s",
+ revents, fd, dc->readwhat, dc->copywhat);
+ datacopier_callback(egc, dc, ERROR_FAIL, -1, EIO);
return;
}
- assert(revents & POLLIN);
+ assert(revents & (POLLIN|POLLHUP));
for (;;) {
- while (dc->used >= dc->maxsz) {
- libxl__datacopier_buf *rm = LIBXL_TAILQ_FIRST(&dc->bufs);
- dc->used -= rm->used;
- assert(dc->used >= 0);
- LIBXL_TAILQ_REMOVE(&dc->bufs, rm, entry);
- free(rm);
- }
+ libxl__datacopier_buf *buf = NULL;
+ int r;
+
+ if (dc->readbuf) {
+ r = read(ev->fd, dc->readbuf + dc->used, dc->bytes_to_read);
+ } else {
+ while (dc->used >= dc->maxsz) {
+ libxl__datacopier_buf *rm = LIBXL_TAILQ_FIRST(&dc->bufs);
+ dc->used -= rm->used;
+ assert(dc->used >= 0);
+ LIBXL_TAILQ_REMOVE(&dc->bufs, rm, entry);
+ free(rm);
+ }
- libxl__datacopier_buf *buf =
- LIBXL_TAILQ_LAST(&dc->bufs, libxl__datacopier_bufs);
- if (!buf || buf->used >= sizeof(buf->buf)) {
- buf = malloc(sizeof(*buf));
- if (!buf) libxl__alloc_failed(CTX, __func__, 1, sizeof(*buf));
- buf->used = 0;
- LIBXL_TAILQ_INSERT_TAIL(&dc->bufs, buf, entry);
+ buf = LIBXL_TAILQ_LAST(&dc->bufs, libxl__datacopier_bufs);
+ if (!buf || buf->used >= sizeof(buf->buf)) {
+ buf = libxl__malloc(NOGC, sizeof(*buf));
+ buf->used = 0;
+ LIBXL_TAILQ_INSERT_TAIL(&dc->bufs, buf, entry);
+ }
+ r = read(ev->fd, buf->buf + buf->used,
+ min_t(size_t, sizeof(buf->buf) - buf->used,
+ (dc->bytes_to_read == -1) ? SIZE_MAX : dc->bytes_to_read));
}
- int r = read(ev->fd,
- buf->buf + buf->used,
- sizeof(buf->buf) - buf->used);
if (r < 0) {
if (errno == EINTR) continue;
- if (errno == EWOULDBLOCK) break;
+ assert(errno);
+ if (errno == EWOULDBLOCK) {
+ if (revents & POLLHUP) {
+ LOG(ERROR,
+ "poll reported HUP but fd read gave EWOULDBLOCK"
+ " on %s during copy of %s",
+ dc->readwhat, dc->copywhat);
+ datacopier_callback(egc, dc, ERROR_FAIL, -1, 0);
+ return;
+ }
+ break;
+ }
LOGE(ERROR, "error reading %s during copy of %s",
dc->readwhat, dc->copywhat);
- datacopier_callback(egc, dc, 0, errno);
+ datacopier_callback(egc, dc, ERROR_FAIL, 0, errno);
return;
}
if (r == 0) {
+ if (dc->callback_pollhup) {
+ /* It might be that this "eof" is actually a HUP. If
+ * the caller cares about the difference,
+ * double-check using poll(2). */
+ struct pollfd hupchk;
+ hupchk.fd = ev->fd;
+ hupchk.events = POLLIN;
+ hupchk.revents = 0;
+ r = poll(&hupchk, 1, 0);
+ if (r < 0)
+ LIBXL__EVENT_DISASTER(egc,
+ "unexpected failure polling fd for datacopier eof hup check",
+ errno, 0);
+ if (datacopier_pollhup_handled(egc, dc, fd, hupchk.revents, 0))
+ return;
+ }
libxl__ev_fd_deregister(gc, &dc->toread);
break;
}
@@ -248,13 +297,19 @@ static void datacopier_readable(libxl__egc *egc, libxl__ev_fd *ev,
assert(ferror(dc->log));
assert(errno);
LOGE(ERROR, "error logging %s", dc->copywhat);
- datacopier_callback(egc, dc, 0, errno);
+ datacopier_callback(egc, dc, ERROR_FAIL, 0, errno);
return;
}
}
- buf->used += r;
+ if (!dc->readbuf) {
+ buf->used += r;
+ assert(buf->used <= sizeof(buf->buf));
+ }
dc->used += r;
- assert(buf->used <= sizeof(buf->buf));
+ if (dc->bytes_to_read > 0)
+ dc->bytes_to_read -= r;
+ if (dc->bytes_to_read == 0)
+ break;
}
datacopier_check_state(egc, dc);
}
@@ -264,13 +319,14 @@ static void datacopier_writable(libxl__egc *egc, libxl__ev_fd *ev,
libxl__datacopier_state *dc = CONTAINER_OF(ev, *dc, towrite);
STATE_AO_GC(dc->ao);
- if (datacopier_pollhup_handled(egc, dc, revents, 1))
+ if (datacopier_pollhup_handled(egc, dc, fd, revents, 1))
return;
if (revents & ~POLLOUT) {
- LOG(ERROR, "unexpected poll event 0x%x (should be POLLOUT)"
- " on %s during copy of %s", revents, dc->writewhat, dc->copywhat);
- datacopier_callback(egc, dc, -1, 0);
+ LOG(ERROR, "unexpected poll event 0x%x on fd %d (should be POLLOUT)"
+ " writing %s during copy of %s",
+ revents, fd, dc->writewhat, dc->copywhat);
+ datacopier_callback(egc, dc, ERROR_FAIL, -1, EIO);
return;
}
assert(revents & POLLOUT);
@@ -287,9 +343,10 @@ static void datacopier_writable(libxl__egc *egc, libxl__ev_fd *ev,
if (r < 0) {
if (errno == EINTR) continue;
if (errno == EWOULDBLOCK) break;
+ assert(errno);
LOGE(ERROR, "error writing to %s during copy of %s",
dc->writewhat, dc->copywhat);
- datacopier_callback(egc, dc, 1, errno);
+ datacopier_callback(egc, dc, ERROR_FAIL, 1, errno);
return;
}
assert(r > 0);
@@ -309,14 +366,26 @@ int libxl__datacopier_start(libxl__datacopier_state *dc)
libxl__datacopier_init(dc);
- rc = libxl__ev_fd_register(gc, &dc->toread, datacopier_readable,
- dc->readfd, POLLIN);
- if (rc) goto out;
+ assert(dc->readfd >= 0 || dc->writefd >= 0);
+ assert(!(dc->readbuf && dc->bytes_to_read == -1));
- rc = libxl__ev_fd_register(gc, &dc->towrite, datacopier_writable,
- dc->writefd, POLLOUT);
+ dc->abrt.ao = ao;
+ dc->abrt.callback = datacopier_abort;
+ rc = libxl__ao_abortable_register(&dc->abrt);
if (rc) goto out;
+ if (dc->readfd >= 0) {
+ rc = libxl__ev_fd_register(gc, &dc->toread, datacopier_readable,
+ dc->readfd, POLLIN);
+ if (rc) goto out;
+ }
+
+ if (dc->writefd >= 0) {
+ rc = libxl__ev_fd_register(gc, &dc->towrite, datacopier_writable,
+ dc->writefd, POLLOUT);
+ if (rc) goto out;
+ }
+
return 0;
out:
@@ -451,13 +520,19 @@ int libxl__openptys(libxl__openpty_state *op,
return rc;
}
+/*----- async exec -----*/
+
static void async_exec_timeout(libxl__egc *egc,
libxl__ev_time *ev,
- const struct timeval *requested_abs)
+ const struct timeval *requested_abs,
+ int rc)
{
libxl__async_exec_state *aes = CONTAINER_OF(ev, *aes, time);
STATE_AO_GC(aes->ao);
+ if (!aes->rc)
+ aes->rc = rc;
+
libxl__ev_time_deregister(gc, &aes->time);
assert(libxl__ev_child_inuse(&aes->child));
@@ -481,11 +556,12 @@ static void async_exec_done(libxl__egc *egc,
libxl__ev_time_deregister(gc, &aes->time);
if (status) {
- libxl_report_child_exitstatus(CTX, LIBXL__LOG_ERROR,
- aes->what, pid, status);
+ if (!aes->rc)
+ libxl_report_child_exitstatus(CTX, LIBXL__LOG_ERROR,
+ aes->what, pid, status);
}
- aes->callback(egc, aes, status);
+ aes->callback(egc, aes, aes->rc, status);
}
void libxl__async_exec_init(libxl__async_exec_state *aes)
@@ -494,16 +570,20 @@ void libxl__async_exec_init(libxl__async_exec_state *aes)
libxl__ev_child_init(&aes->child);
}
-int libxl__async_exec_start(libxl__gc *gc, libxl__async_exec_state *aes)
+int libxl__async_exec_start(libxl__async_exec_state *aes)
{
pid_t pid;
/* Convenience aliases */
+ libxl__ao *ao = aes->ao;
+ AO_GC;
libxl__ev_child *const child = &aes->child;
char ** const args = aes->args;
+ aes->rc = 0;
+
/* Set execution timeout */
- if (libxl__ev_time_register_rel(gc, &aes->time,
+ if (libxl__ev_time_register_rel(ao, &aes->time,
async_exec_timeout,
aes->timeout_ms)) {
LOG(ERROR, "unable to register timeout for executing: %s", aes->what);
@@ -540,3 +620,18 @@ bool libxl__async_exec_inuse(const libxl__async_exec_state *aes)
assert(time_inuse == child_inuse);
return child_inuse;
}
+
+void libxl__kill(libxl__gc *gc, pid_t pid, int sig, const char *what)
+{
+ int r = kill(pid, sig);
+ if (r) LOGE(WARN, "failed to kill() %s [%lu] (signal %d)",
+ what, (unsigned long)pid, sig);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h
index d3bc136..bd030b6 100644
--- a/tools/libxl/libxl_arch.h
+++ b/tools/libxl/libxl_arch.h
@@ -15,16 +15,51 @@
#ifndef LIBXL_ARCH_H
#define LIBXL_ARCH_H
+/* fill the arch specific configuration for the domain */
+_hidden
+int libxl__arch_domain_prepare_config(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ xc_domain_configuration_t *xc_config);
+
+/* save the arch specific configuration for the domain */
+_hidden
+int libxl__arch_domain_save_config(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ const xc_domain_configuration_t *xc_config);
+
/* arch specific internal domain creation function */
+_hidden
int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
uint32_t domid);
/* setup arch specific hardware description, i.e. DTB on ARM */
+_hidden
int libxl__arch_domain_init_hw_description(libxl__gc *gc,
libxl_domain_build_info *info,
+ libxl__domain_build_state *state,
struct xc_dom_image *dom);
/* finalize arch specific hardware description. */
+_hidden
int libxl__arch_domain_finalise_hw_description(libxl__gc *gc,
libxl_domain_build_info *info,
struct xc_dom_image *dom);
+
+/* build vNUMA vmemrange with arch specific information */
+_hidden
+int libxl__arch_vnuma_build_vmemrange(libxl__gc *gc,
+ uint32_t domid,
+ libxl_domain_build_info *b_info,
+ libxl__domain_build_state *state);
+
+/* arch specific irq map function */
+_hidden
+int libxl__arch_domain_map_irq(libxl__gc *gc, uint32_t domid, int irq);
+
+/* arch specific to construct memory mapping function */
+_hidden
+int libxl__arch_domain_construct_memmap(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ uint32_t domid,
+ struct xc_hvm_build_args *args);
+
#endif
diff --git a/tools/libxl/libxl_arm.c b/tools/libxl/libxl_arm.c
index 448ac07..0af8010 100644
--- a/tools/libxl/libxl_arm.c
+++ b/tools/libxl/libxl_arm.c
@@ -1,5 +1,6 @@
#include "libxl_internal.h"
#include "libxl_arch.h"
+#include "libxl_libfdt_compat.h"
#include <xc_dom.h>
#include <stdbool.h>
@@ -35,6 +36,69 @@ static const char *gicv_to_string(uint8_t gic_version)
}
}
+int libxl__arch_domain_prepare_config(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ xc_domain_configuration_t *xc_config)
+{
+ uint32_t nr_spis = 0;
+ unsigned int i;
+
+ for (i = 0; i < d_config->b_info.num_irqs; i++) {
+ uint32_t irq = d_config->b_info.irqs[i];
+ uint32_t spi;
+
+ if (irq < 32)
+ continue;
+
+ spi = irq - 32;
+
+ if (nr_spis <= spi)
+ nr_spis = spi + 1;
+ }
+
+ LOG(DEBUG, "Configure the domain");
+
+ xc_config->nr_spis = nr_spis;
+ LOG(DEBUG, " - Allocate %u SPIs", nr_spis);
+
+ switch (d_config->b_info.arch_arm.gic_version) {
+ case LIBXL_GIC_VERSION_DEFAULT:
+ xc_config->gic_version = XEN_DOMCTL_CONFIG_GIC_NATIVE;
+ break;
+ case LIBXL_GIC_VERSION_V2:
+ xc_config->gic_version = XEN_DOMCTL_CONFIG_GIC_V2;
+ break;
+ case LIBXL_GIC_VERSION_V3:
+ xc_config->gic_version = XEN_DOMCTL_CONFIG_GIC_V3;
+ break;
+ default:
+ LOG(ERROR, "Unknown GIC version %d",
+ d_config->b_info.arch_arm.gic_version);
+ return ERROR_FAIL;
+ }
+
+ return 0;
+}
+
+int libxl__arch_domain_save_config(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ const xc_domain_configuration_t *xc_config)
+{
+ switch (xc_config->gic_version) {
+ case XEN_DOMCTL_CONFIG_GIC_V2:
+ d_config->b_info.arch_arm.gic_version = LIBXL_GIC_VERSION_V2;
+ break;
+ case XEN_DOMCTL_CONFIG_GIC_V3:
+ d_config->b_info.arch_arm.gic_version = LIBXL_GIC_VERSION_V3;
+ break;
+ default:
+ LOG(ERROR, "Unexpected gic version %u", xc_config->gic_version);
+ return ERROR_FAIL;
+ }
+
+ return 0;
+}
+
int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
uint32_t domid)
{
@@ -50,10 +114,11 @@ static struct arch_info {
{"xen-3.0-aarch64", "arm,armv8-timer", "arm,armv8" },
};
-enum {
- PHANDLE_NONE = 0,
- PHANDLE_GIC,
-};
+/*
+ * The device tree compiler (DTC) allocates phandles from 1 onwards.
+ * Reserve a high value for the GIC phandle.
+ */
+#define PHANDLE_GIC (65000)
typedef uint32_t be32;
typedef be32 gic_interrupt[3];
@@ -195,6 +260,7 @@ static int make_root_properties(libxl__gc *gc,
}
static int make_chosen_node(libxl__gc *gc, void *fdt, bool ramdisk,
+ libxl__domain_build_state *state,
const libxl_domain_build_info *info)
{
int res;
@@ -203,8 +269,9 @@ static int make_chosen_node(libxl__gc *gc, void *fdt, bool ramdisk,
res = fdt_begin_node(fdt, "chosen");
if (res) return res;
- if (info->cmdline) {
- res = fdt_property_string(fdt, "bootargs", info->cmdline);
+ if (state->pv_cmdline) {
+ LOG(DEBUG, "/chosen/bootargs = %s", state->pv_cmdline);
+ res = fdt_property_string(fdt, "bootargs", state->pv_cmdline);
if (res) return res;
}
@@ -227,6 +294,7 @@ static int make_cpus_node(libxl__gc *gc, void *fdt, int nr_cpus,
const struct arch_info *ainfo)
{
int res, i;
+ uint64_t mpidr_aff;
res = fdt_begin_node(fdt, "cpus");
if (res) return res;
@@ -238,7 +306,16 @@ static int make_cpus_node(libxl__gc *gc, void *fdt, int nr_cpus,
if (res) return res;
for (i = 0; i < nr_cpus; i++) {
- const char *name = GCSPRINTF("cpu@%d", i);
+ const char *name;
+
+ /*
+ * According to the ARM CPUs binding, the reg field should match
+ * the MPIDR's affinity bits. For now we use only AFF0 and AFF1
+ * when constructing the guest's reg value, which is enough for
+ * the current maximum vcpu count (e.g. vcpu 17 has AFF0=1 and
+ * AFF1=1, giving reg 0x101).
+ */
+ mpidr_aff = (i & 0x0f) | (((i >> 4) & 0xff) << 8);
+ name = GCSPRINTF("cpu@%"PRIx64, mpidr_aff);
res = fdt_begin_node(fdt, name);
if (res) return res;
@@ -252,7 +329,7 @@ static int make_cpus_node(libxl__gc *gc, void *fdt, int nr_cpus,
res = fdt_property_string(fdt, "enable-method", "psci");
if (res) return res;
- res = fdt_property_regs(gc, fdt, 1, 0, 1, (uint64_t)i);
+ res = fdt_property_regs(gc, fdt, 1, 0, 1, mpidr_aff);
if (res) return res;
res = fdt_end_node(fdt);
@@ -412,7 +489,9 @@ static int make_gicv3_node(libxl__gc *gc, void *fdt)
return 0;
}
-static int make_timer_node(libxl__gc *gc, void *fdt, const struct arch_info *ainfo)
+static int make_timer_node(libxl__gc *gc, void *fdt,
+ const struct arch_info *ainfo,
+ uint32_t frequency)
{
int res;
gic_interrupt ints[3];
@@ -430,6 +509,9 @@ static int make_timer_node(libxl__gc *gc, void *fdt, const struct arch_info *ain
res = fdt_property_interrupts(gc, fdt, ints, 3);
if (res) return res;
+ if ( frequency )
+ fdt_property_u32(fdt, "clock-frequency", frequency);
+
res = fdt_end_node(fdt);
if (res) return res;
@@ -484,7 +566,7 @@ static const struct arch_info *get_arch_info(libxl__gc *gc,
if (!strcmp(dom->guest_type, info->guest_type))
return info;
}
- LOG(ERROR, "Unable to find arch FDT info for %s\n", dom->guest_type);
+ LOG(ERROR, "Unable to find arch FDT info for %s", dom->guest_type);
return NULL;
}
@@ -512,20 +594,176 @@ out:
}
}
+#ifdef ENABLE_PARTIAL_DEVICE_TREE
+
+static int check_partial_fdt(libxl__gc *gc, void *fdt, size_t size)
+{
+ int r;
+
+ if (fdt_magic(fdt) != FDT_MAGIC) {
+ LOG(ERROR, "Partial FDT is not a valid Flat Device Tree");
+ return ERROR_FAIL;
+ }
+
+ r = fdt_check_header(fdt);
+ if (r) {
+ LOG(ERROR, "Failed to check the partial FDT (%d)", r);
+ return ERROR_FAIL;
+ }
+
+ if (fdt_totalsize(fdt) > size) {
+ LOG(ERROR, "Partial FDT totalsize is too big");
+ return ERROR_FAIL;
+ }
+
+ return 0;
+}
+
+static int copy_properties(libxl__gc *gc, void *fdt, void *pfdt,
+ int nodeoff)
+{
+ int propoff, nameoff, r;
+ const struct fdt_property *prop;
+
+ for (propoff = fdt_first_property_offset(pfdt, nodeoff);
+ propoff >= 0;
+ propoff = fdt_next_property_offset(pfdt, propoff)) {
+
+ if (!(prop = fdt_get_property_by_offset(pfdt, propoff, NULL))) {
+ return -FDT_ERR_INTERNAL;
+ }
+
+ nameoff = fdt32_to_cpu(prop->nameoff);
+ r = fdt_property(fdt, fdt_string(pfdt, nameoff),
+ prop->data, fdt32_to_cpu(prop->len));
+ if (r) return r;
+ }
+
+ /* FDT_ERR_NOTFOUND => there are no more properties for this node */
+ return (propoff != -FDT_ERR_NOTFOUND)? propoff : 0;
+}
+
+/* Copy a node from the partial device tree to the guest device tree */
+static int copy_node(libxl__gc *gc, void *fdt, void *pfdt,
+ int nodeoff, int depth)
+{
+ int r;
+
+ r = fdt_begin_node(fdt, fdt_get_name(pfdt, nodeoff, NULL));
+ if (r) return r;
+
+ r = copy_properties(gc, fdt, pfdt, nodeoff);
+ if (r) return r;
+
+ for (nodeoff = fdt_first_subnode(pfdt, nodeoff);
+ nodeoff >= 0;
+ nodeoff = fdt_next_subnode(pfdt, nodeoff)) {
+ r = copy_node(gc, fdt, pfdt, nodeoff, depth + 1);
+ if (r) return r;
+ }
+
+ if (nodeoff != -FDT_ERR_NOTFOUND)
+ return nodeoff;
+
+ r = fdt_end_node(fdt);
+ if (r) return r;
+
+ return 0;
+}
+
+static int copy_node_by_path(libxl__gc *gc, const char *path,
+ void *fdt, void *pfdt)
+{
+ int nodeoff, r;
+ const char *name = strrchr(path, '/');
+
+ if (!name)
+ return -FDT_ERR_INTERNAL;
+
+ name++;
+
+ /*
+ * The FDT lookup function does not take the unit address (i.e.
+ * anything after '@') into account when searching by name. Check
+ * that the name matches exactly.
+ */
+ nodeoff = fdt_path_offset(pfdt, path);
+ if (nodeoff < 0)
+ return nodeoff;
+
+ if (strcmp(fdt_get_name(pfdt, nodeoff, NULL), name))
+ return -FDT_ERR_NOTFOUND;
+
+ r = copy_node(gc, fdt, pfdt, nodeoff, 0);
+ if (r) return r;
+
+ return 0;
+}
+
+/*
+ * The partial device tree is not copied entirely. Only the relevant bits are
+ * copied to the guest device tree:
+ * - /passthrough node
+ * - /aliases node
+ */
+static int copy_partial_fdt(libxl__gc *gc, void *fdt, void *pfdt)
+{
+ int r;
+
+ r = copy_node_by_path(gc, "/passthrough", fdt, pfdt);
+ if (r < 0) {
+ LOG(ERROR, "Can't copy the node \"/passthrough\" from the partial FDT");
+ return r;
+ }
+
+ r = copy_node_by_path(gc, "/aliases", fdt, pfdt);
+ if (r < 0 && r != -FDT_ERR_NOTFOUND) {
+ LOG(ERROR, "Can't copy the node \"/aliases\" from the partial FDT");
+ return r;
+ }
+
+ return 0;
+}
+
+#else
+
+static int check_partial_fdt(libxl__gc *gc, void *fdt, size_t size)
+{
+ LOG(ERROR, "partial device tree not supported");
+
+ return ERROR_FAIL;
+}
+
+static int copy_partial_fdt(libxl__gc *gc, void *fdt, void *pfdt)
+{
+ /*
+ * We should never be here when the partial device tree is not
+ * supported.
+ */
+ return -FDT_ERR_INTERNAL;
+}
+
+#endif /* ENABLE_PARTIAL_DEVICE_TREE */
+
#define FDT_MAX_SIZE (1<<20)
int libxl__arch_domain_init_hw_description(libxl__gc *gc,
libxl_domain_build_info *info,
+ libxl__domain_build_state *state,
struct xc_dom_image *dom)
{
- xc_domain_configuration_t config;
void *fdt = NULL;
+ void *pfdt = NULL;
int rc, res;
size_t fdt_size = 0;
+ int pfdt_size = 0;
const libxl_version_info *vers;
const struct arch_info *ainfo;
+ /* convenience aliases */
+ xc_domain_configuration_t *xc_config = &state->config;
+
assert(info->type == LIBXL_DOMAIN_TYPE_PV);
vers = libxl_get_version_info(CTX);
@@ -534,19 +772,28 @@ int libxl__arch_domain_init_hw_description(libxl__gc *gc,
ainfo = get_arch_info(gc, dom);
if (ainfo == NULL) return ERROR_FAIL;
- LOG(DEBUG, "configure the domain");
- config.gic_version = XEN_DOMCTL_CONFIG_GIC_DEFAULT;
- if (xc_domain_configure(CTX->xch, dom->guest_domid, &config) != 0) {
- LOG(ERROR, "couldn't configure the domain");
- return ERROR_FAIL;
- }
-
LOG(DEBUG, "constructing DTB for Xen version %d.%d guest",
vers->xen_version_major, vers->xen_version_minor);
- LOG(DEBUG, " - vGIC version: %s", gicv_to_string(config.gic_version));
+ LOG(DEBUG, " - vGIC version: %s", gicv_to_string(xc_config->gic_version));
+
+ if (info->device_tree) {
+ LOG(DEBUG, " - Partial device tree provided: %s", info->device_tree);
+
+ rc = libxl_read_file_contents(CTX, info->device_tree,
+ &pfdt, &pfdt_size);
+ if (rc) {
+ LOGEV(ERROR, rc, "failed to read the partial device file %s",
+ info->device_tree);
+ return ERROR_FAIL;
+ }
+ libxl__ptr_add(gc, pfdt);
+
+ if (check_partial_fdt(gc, pfdt, pfdt_size))
+ return ERROR_FAIL;
+ }
/*
- * Call "call" handling FDR_ERR_*. Will either:
+ * Call "call" handling FDT_ERR_*. Will either:
* - loop back to retry_resize
* - set rc and goto out
* - fall through successfully
@@ -586,13 +833,13 @@ next_resize:
FDT( fdt_begin_node(fdt, "") );
FDT( make_root_properties(gc, vers, fdt) );
- FDT( make_chosen_node(gc, fdt, !!dom->ramdisk_blob, info) );
+ FDT( make_chosen_node(gc, fdt, !!dom->ramdisk_blob, state, info) );
FDT( make_cpus_node(gc, fdt, info->max_vcpus, ainfo) );
FDT( make_psci_node(gc, fdt) );
FDT( make_memory_nodes(gc, fdt, dom) );
- switch (config.gic_version) {
+ switch (xc_config->gic_version) {
case XEN_DOMCTL_CONFIG_GIC_V2:
FDT( make_gicv2_node(gc, fdt,
GUEST_GICD_BASE, GUEST_GICD_SIZE,
@@ -602,14 +849,18 @@ next_resize:
FDT( make_gicv3_node(gc, fdt) );
break;
default:
- LOG(ERROR, "Unknown GIC version %d", config.gic_version);
+ LOG(ERROR, "Unknown GIC version %s",
+ gicv_to_string(xc_config->gic_version));
rc = ERROR_FAIL;
goto out;
}
- FDT( make_timer_node(gc, fdt, ainfo) );
+ FDT( make_timer_node(gc, fdt, ainfo, xc_config->clock_frequency) );
FDT( make_hypervisor_node(gc, fdt, vers) );
+ if (pfdt)
+ FDT( copy_partial_fdt(gc, fdt, pfdt) );
+
FDT( fdt_end_node(fdt) );
FDT( fdt_finish(fdt) );
@@ -706,3 +957,32 @@ int libxl__arch_domain_finalise_hw_description(libxl__gc *gc,
return 0;
}
+
+int libxl__arch_vnuma_build_vmemrange(libxl__gc *gc,
+ uint32_t domid,
+ libxl_domain_build_info *info,
+ libxl__domain_build_state *state)
+{
+ return libxl__vnuma_build_vmemrange_pv_generic(gc, domid, info, state);
+}
+
+int libxl__arch_domain_map_irq(libxl__gc *gc, uint32_t domid, int irq)
+{
+ return xc_domain_bind_pt_spi_irq(CTX->xch, domid, irq, irq);
+}
+
+int libxl__arch_domain_construct_memmap(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ uint32_t domid,
+ struct xc_hvm_build_args *args)
+{
+ return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxl_bootloader.c b/tools/libxl/libxl_bootloader.c
index 79947d4..95dde98 100644
--- a/tools/libxl/libxl_bootloader.c
+++ b/tools/libxl/libxl_bootloader.c
@@ -30,10 +30,11 @@
static void bootloader_gotptys(libxl__egc *egc, libxl__openpty_state *op);
static void bootloader_keystrokes_copyfail(libxl__egc *egc,
- libxl__datacopier_state *dc, int onwrite, int errnoval);
+ libxl__datacopier_state *dc, int rc, int onwrite, int errnoval);
static void bootloader_display_copyfail(libxl__egc *egc,
- libxl__datacopier_state *dc, int onwrite, int errnoval);
-static void bootloader_domaindeath(libxl__egc*, libxl__domaindeathcheck *dc);
+ libxl__datacopier_state *dc, int rc, int onwrite, int errnoval);
+static void bootloader_domaindeath(libxl__egc*, libxl__domaindeathcheck *dc,
+ int rc);
static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child,
pid_t pid, int status);
@@ -496,7 +497,7 @@ static void bootloader_gotptys(libxl__egc *egc, libxl__openpty_state *op)
bl->deathcheck.what = "stopping bootloader";
bl->deathcheck.domid = bl->domid;
bl->deathcheck.callback = bootloader_domaindeath;
- rc = libxl__domaindeathcheck_start(gc, &bl->deathcheck);
+ rc = libxl__domaindeathcheck_start(ao, &bl->deathcheck);
if (rc) goto out;
if (bl->console_available)
@@ -516,6 +517,7 @@ static void bootloader_gotptys(libxl__egc *egc, libxl__openpty_state *op)
bl->keystrokes.ao = ao;
bl->keystrokes.maxsz = BOOTLOADER_BUF_OUT;
+ bl->keystrokes.bytes_to_read = -1;
bl->keystrokes.copywhat =
GCSPRINTF("bootloader input for domain %"PRIu32, bl->domid);
bl->keystrokes.callback = bootloader_keystrokes_copyfail;
@@ -527,6 +529,7 @@ static void bootloader_gotptys(libxl__egc *egc, libxl__openpty_state *op)
bl->display.ao = ao;
bl->display.maxsz = BOOTLOADER_BUF_IN;
+ bl->display.bytes_to_read = -1;
bl->display.copywhat =
GCSPRINTF("bootloader output for domain %"PRIu32, bl->domid);
bl->display.callback = bootloader_display_copyfail;
@@ -576,10 +579,10 @@ static void bootloader_gotptys(libxl__egc *egc, libxl__openpty_state *op)
/* perhaps one of these will be called, but perhaps not */
static void bootloader_copyfail(libxl__egc *egc, const char *which,
- libxl__bootloader_state *bl, int ondisplay, int onwrite, int errnoval)
+ libxl__bootloader_state *bl, int ondisplay,
+ int rc, int onwrite, int errnoval)
{
STATE_AO_GC(bl->ao);
- int rc = ERROR_FAIL;
if (errnoval==-1) {
/* POLLHUP */
@@ -590,28 +593,32 @@ static void bootloader_copyfail(libxl__egc *egc, const char *which,
LOG(ERROR, "unexpected POLLHUP on %s", which);
}
}
- if (!onwrite && !errnoval)
+ if (!rc) {
LOG(ERROR, "unexpected eof copying %s", which);
+ rc = ERROR_FAIL;
+ }
bootloader_stop(egc, bl, rc);
}
static void bootloader_keystrokes_copyfail(libxl__egc *egc,
- libxl__datacopier_state *dc, int onwrite, int errnoval)
+ libxl__datacopier_state *dc, int rc, int onwrite, int errnoval)
{
libxl__bootloader_state *bl = CONTAINER_OF(dc, *bl, keystrokes);
- bootloader_copyfail(egc, "bootloader input", bl, 0, onwrite, errnoval);
+ bootloader_copyfail(egc, "bootloader input", bl, 0, rc, onwrite, errnoval);
}
static void bootloader_display_copyfail(libxl__egc *egc,
- libxl__datacopier_state *dc, int onwrite, int errnoval)
+ libxl__datacopier_state *dc, int rc, int onwrite, int errnoval)
{
libxl__bootloader_state *bl = CONTAINER_OF(dc, *bl, display);
- bootloader_copyfail(egc, "bootloader output", bl, 1, onwrite, errnoval);
+ bootloader_copyfail(egc, "bootloader output", bl, 1, rc, onwrite, errnoval);
}
-static void bootloader_domaindeath(libxl__egc *egc, libxl__domaindeathcheck *dc)
+static void bootloader_domaindeath(libxl__egc *egc,
+ libxl__domaindeathcheck *dc,
+ int rc)
{
libxl__bootloader_state *bl = CONTAINER_OF(dc, *bl, deathcheck);
- bootloader_stop(egc, bl, ERROR_FAIL);
+ bootloader_stop(egc, bl, rc);
}
static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child,
diff --git a/tools/libxl/libxl_convert_callout.c b/tools/libxl/libxl_convert_callout.c
new file mode 100644
index 0000000..5e5678b
--- /dev/null
+++ b/tools/libxl/libxl_convert_callout.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2014 Citrix Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only, with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h"
+
+#include "libxl_internal.h"
+
+/*
+ * Infrastructure for converting a legacy migration stream into a
+ * libxl v2 stream.
+ *
+ * This is done by fork()ing the python conversion script, which takes
+ * in a legacy stream, and puts out a suitably-formatted v2 stream.
+ */
+
+static void helper_exited(libxl__egc *egc, libxl__ev_child *ch,
+ pid_t pid, int status);
+static void helper_stop(libxl__egc *egc, libxl__ao_abortable *abrt, int rc);
+static void helper_done(libxl__egc *egc,
+ libxl__conversion_helper_state *chs);
+
+/*----- Entrypoints -----*/
+
+void libxl__conversion_helper_init(libxl__conversion_helper_state *chs)
+{
+ assert(chs->ao);
+
+ chs->v2_carefd = NULL;
+ chs->rc = 0;
+ libxl__ao_abortable_init(&chs->abrt);
+ libxl__ev_child_init(&chs->child);
+}
+
+int libxl__convert_legacy_stream(libxl__egc *egc,
+ libxl__conversion_helper_state *chs)
+{
+ STATE_AO_GC(chs->ao);
+ libxl__carefd *child_in = NULL, *child_out = NULL;
+ int rc = 0;
+
+ chs->abrt.ao = chs->ao;
+ chs->abrt.callback = helper_stop;
+ rc = libxl__ao_abortable_register(&chs->abrt);
+ if (rc) goto err;
+
+ libxl__carefd_begin();
+ int fds[2];
+ if (libxl_pipe(CTX, fds)) {
+ rc = ERROR_FAIL;
+ libxl__carefd_unlock();
+ goto err;
+ }
+ child_out = libxl__carefd_record(CTX, fds[0]);
+ child_in = libxl__carefd_record(CTX, fds[1]);
+ libxl__carefd_unlock();
+
+ pid_t pid = libxl__ev_child_fork(gc, &chs->child, helper_exited);
+ if (!pid) {
+ char * const args[] =
+ {
+ getenv("LIBXL_CONVERT_HELPER") ?:
+ LIBEXEC_BIN "/convert-legacy-stream",
+ "--in", GCSPRINTF("%d", chs->legacy_fd),
+ "--out", GCSPRINTF("%d", fds[1]),
+ /*
+ * The width calculation is an assumption for the common
+ * case. The conversion script needs to know the width of
+ * the toolstack which saved the legacy stream.
+ *
+ * In the overwhelming majority of cases, the width of the
+ * saving toolstack will be the same as our current
+ * width. To avoid extending the libxl API with a
+ * parameter intended to disappear shortly, this option
+ * has not been exposed to the caller.
+ *
+ * If more complicated conversion is required, the
+ * conversion script can be instantiated manually, which
+ * will bypass all of this conversion logic.
+ */
+ "--width", sizeof(unsigned long) == 8 ? "64" : "32",
+
+ "--guest", chs->hvm ? "hvm" : "pv",
+ "--format", "libxl",
+ /* "--verbose", */
+ NULL,
+ };
+
+ libxl_fd_set_cloexec(CTX, chs->legacy_fd, 0);
+ libxl_fd_set_cloexec(CTX, libxl__carefd_fd(child_in), 0);
+
+ libxl__exec(gc,
+ -1, -1, -1,
+ args[0], args, NULL);
+ }
+
+ libxl__carefd_close(child_in);
+ chs->v2_carefd = child_out;
+
+ assert(!rc);
+ return rc;
+
+ err:
+ libxl__ao_abortable_deregister(&chs->abrt);
+ assert(rc);
+ return rc;
+}
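The fork branch above is the classic fork/exec-with-pipe arrangement: the parent keeps the read end, the child inherits the write end, and both fd numbers travel on the helper's command line. A self-contained sketch of the underlying pattern (the helper path is illustrative, not the real conversion script):

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Returns the fd the parent should read from, or -1 on error. */
    static int spawn_filter(const char *helper, int in_fd, pid_t *pid_out)
    {
        int fds[2];
        if (pipe(fds) < 0) return -1;

        pid_t pid = fork();
        if (pid < 0) { close(fds[0]); close(fds[1]); return -1; }
        if (pid == 0) {                           /* child */
            char in[16], out[16];
            snprintf(in, sizeof(in), "%d", in_fd);
            snprintf(out, sizeof(out), "%d", fds[1]);
            execlp(helper, helper, "--in", in, "--out", out, (char *)NULL);
            _exit(127);                           /* exec failed */
        }
        close(fds[1]);                            /* parent: drop write end */
        *pid_out = pid;
        return fds[0];
    }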
+
+void libxl__conversion_helper_abort(libxl__egc *egc,
+ libxl__conversion_helper_state *chs,
+ int rc)
+{
+ STATE_AO_GC(chs->ao);
+ assert(rc);
+
+ if (libxl__conversion_helper_inuse(chs)) {
+
+ if (!chs->rc)
+ chs->rc = rc;
+
+ libxl__kill(gc, chs->child.pid, SIGTERM, "conversion helper");
+ }
+}
+
+/*----- State handling -----*/
+
+static void helper_stop(libxl__egc *egc, libxl__ao_abortable *abrt, int rc)
+{
+ libxl__conversion_helper_state *chs = CONTAINER_OF(abrt, *chs, abrt);
+ STATE_AO_GC(chs->ao);
+
+ libxl__conversion_helper_abort(egc, chs, rc);
+}
+
+static void helper_exited(libxl__egc *egc, libxl__ev_child *ch,
+ pid_t pid, int status)
+{
+ libxl__conversion_helper_state *chs = CONTAINER_OF(ch, *chs, child);
+ STATE_AO_GC(chs->ao);
+
+ if (status) {
+ libxl_report_child_exitstatus(
+ CTX, chs->rc ? XTL_DEBUG : XTL_ERROR,
+ "conversion helper", pid, status);
+
+ if (!chs->rc)
+ chs->rc = ERROR_FAIL;
+ }
+
+ helper_done(egc, chs);
+}
+
+static void helper_done(libxl__egc *egc,
+ libxl__conversion_helper_state *chs)
+{
+ STATE_AO_GC(chs->ao);
+
+ assert(!libxl__conversion_helper_inuse(chs));
+
+ libxl__ao_abortable_deregister(&chs->abrt);
+
+ chs->completion_callback(egc, chs, chs->rc);
+}
diff --git a/tools/libxl/libxl_cpuid.c b/tools/libxl/libxl_cpuid.c
index 7cfa6b7..c66e912 100644
--- a/tools/libxl/libxl_cpuid.c
+++ b/tools/libxl/libxl_cpuid.c
@@ -28,10 +28,13 @@ void libxl_cpuid_dispose(libxl_cpuid_policy_list *p_cpuid_list)
return;
for (i = 0; cpuid_list[i].input[0] != XEN_CPUID_INPUT_UNUSED; i++) {
for (j = 0; j < 4; j++)
- if (cpuid_list[i].policy[j] != NULL)
+ if (cpuid_list[i].policy[j] != NULL) {
free(cpuid_list[i].policy[j]);
+ cpuid_list[i].policy[j] = NULL;
+ }
}
free(cpuid_list);
+ *p_cpuid_list = NULL;
return;
}
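The two added assignments make the dispose idempotent: every freed element is NULLed and the caller's pointer is cleared, so a second dispose call is a harmless no-op. A generic sketch of the idiom, with illustrative types rather than the libxl ones:

    #include <stdlib.h>

    typedef struct { char **items; size_t n; } strlist;

    static void strlist_dispose(strlist **plist)
    {
        strlist *l = *plist;
        size_t i;
        if (!l) return;
        for (i = 0; i < l->n; i++) {
            free(l->items[i]);
            l->items[i] = NULL;     /* guard against stale reuse */
        }
        free(l->items);
        free(l);
        *plist = NULL;              /* makes a second dispose a no-op */
    }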
@@ -220,9 +223,6 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str)
}
entry = cpuid_find_match(cpuid, flag->leaf, flag->subleaf);
resstr = entry->policy[flag->reg - 1];
- if (resstr == NULL) {
- resstr = strdup("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
- }
num = strtoull(val, &endptr, 0);
flags[flag->length] = 0;
if (endptr != val) {
@@ -239,6 +239,11 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str)
return 3;
}
}
+
+ if (resstr == NULL) {
+ resstr = strdup("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
/* the family and model entry is potentially split up across
* two fields in Fn0000_0001_EAX, so handle them here separately.
*/
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 6f87d1c..f5771da 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -25,6 +25,8 @@
#include <xen/hvm/hvm_info_table.h>
#include <xen/hvm/e820.h>
+#include <xen-xsm/flask/flask.h>
+
int libxl__domain_create_info_setdefault(libxl__gc *gc,
libxl_domain_create_info *c_info)
{
@@ -42,62 +44,20 @@ int libxl__domain_create_info_setdefault(libxl__gc *gc,
libxl_defbool_setdefault(&c_info->run_hotplug_scripts, true);
libxl_defbool_setdefault(&c_info->driver_domain, false);
+ if (!c_info->ssidref)
+ c_info->ssidref = SECINITSID_DOMU;
+
return 0;
}
-static int sched_params_valid(libxl__gc *gc,
- uint32_t domid, libxl_domain_sched_params *scp)
+void libxl__rdm_setdefault(libxl__gc *gc, libxl_domain_build_info *b_info)
{
- int has_weight = scp->weight != LIBXL_DOMAIN_SCHED_PARAM_WEIGHT_DEFAULT;
- int has_period = scp->period != LIBXL_DOMAIN_SCHED_PARAM_PERIOD_DEFAULT;
- int has_slice = scp->slice != LIBXL_DOMAIN_SCHED_PARAM_SLICE_DEFAULT;
- int has_extratime =
- scp->extratime != LIBXL_DOMAIN_SCHED_PARAM_EXTRATIME_DEFAULT;
-
- /* The sedf scheduler needs some more consistency checking */
- if (libxl__domain_scheduler(gc, domid) == LIBXL_SCHEDULER_SEDF) {
- if (has_weight && (has_period || has_slice))
- return 0;
- /* If you want a real-time domain, with its own period and
- * slice, please, do provide both! */
- if (has_period != has_slice)
- return 0;
-
- /*
- * Idea is, if we specify a weight, then both period and
- * slice has to be zero. OTOH, if we do specify a period and
- * slice, it is weight that should be zeroed. See
- * docs/misc/sedf_scheduler_mini-HOWTO.txt for more details
- * on the meaningful combinations and their meanings.
- */
- if (has_weight) {
- scp->slice = 0;
- scp->period = 0;
- }
- else if (!has_period) {
- /* No weight nor slice/period means best effort. Parameters needs
- * some mangling in order to properly ask for that, though. */
-
- /*
- * Providing no weight does not make any sense if we do not allow
- * the domain to run in extra time. On the other hand, if we have
- * extra time, weight will be ignored (and zeroed) by Xen, but it
- * can't be zero here, or the call for setting the scheduling
- * parameters will fail. So, avoid the latter by setting a random
- * weight (namely, 1), as it will be ignored anyway.
- */
-
- /* We can setup a proper best effort domain (extra time only)
- * iff we either already have or are asking for some extra time. */
- scp->weight = has_extratime ? scp->extratime : 1;
- scp->period = 0;
- } else {
- /* Real-time domain: will get slice CPU time over every period */
- scp->weight = 0;
- }
- }
+ if (b_info->u.hvm.rdm.policy == LIBXL_RDM_RESERVE_POLICY_INVALID)
+ b_info->u.hvm.rdm.policy = LIBXL_RDM_RESERVE_POLICY_RELAXED;
- return 1;
+ if (b_info->u.hvm.rdm_mem_boundary_memkb == LIBXL_MEMKB_DEFAULT)
+ b_info->u.hvm.rdm_mem_boundary_memkb =
+ LIBXL_RDM_MEM_BOUNDARY_MEMKB_DEFAULT;
}
int libxl__domain_build_info_setdefault(libxl__gc *gc,
@@ -111,6 +71,10 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
libxl_defbool_setdefault(&b_info->device_model_stubdomain, false);
+ if (libxl_defbool_val(b_info->device_model_stubdomain) &&
+ !b_info->device_model_ssidref)
+ b_info->device_model_ssidref = SECINITSID_DOMDM;
+
if (!b_info->device_model_version) {
if (b_info->type == LIBXL_DOMAIN_TYPE_HVM) {
if (libxl_defbool_val(b_info->device_model_stubdomain)) {
@@ -170,6 +134,14 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
break;
default:abort();
}
+
+ /* Check HVM direct boot parameters, we should honour ->ramdisk and
+ * ->cmdline iff ->kernel is set.
+ */
+ if (!b_info->kernel && (b_info->ramdisk || b_info->cmdline)) {
+ LOG(ERROR, "direct boot parameters specified but kernel missing");
+ return ERROR_INVAL;
+ }
}
if (b_info->type == LIBXL_DOMAIN_TYPE_HVM &&
@@ -233,6 +205,9 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
if (!b_info->u.hvm.vga.kind)
b_info->u.hvm.vga.kind = LIBXL_VGA_INTERFACE_TYPE_CIRRUS;
+ if (!b_info->u.hvm.hdtype)
+ b_info->u.hvm.hdtype = LIBXL_HDTYPE_IDE;
+
switch (b_info->device_model_version) {
case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL:
switch (b_info->u.hvm.vga.kind) {
@@ -240,6 +215,10 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
b_info->video_memkb = 0;
break;
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
+ LOG(ERROR,"qemu upstream required for qxl vga");
+ return ERROR_INVAL;
+ break;
case LIBXL_VGA_INTERFACE_TYPE_STD:
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
b_info->video_memkb = 8 * 1024;
@@ -264,6 +243,15 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
b_info->video_memkb = 0;
break;
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
+ if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT) {
+ b_info->video_memkb = (128 * 1024);
+ } else if (b_info->video_memkb < (128 * 1024)) {
+ LOG(ERROR,
+ "128 Mib videoram is the minimum for qxl default");
+ return ERROR_INVAL;
+ }
+ break;
case LIBXL_VGA_INTERFACE_TYPE_STD:
if (b_info->video_memkb == LIBXL_MEMKB_DEFAULT)
b_info->video_memkb = 16 * 1024;
@@ -299,6 +287,7 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
libxl_defbool_setdefault(&b_info->u.hvm.hpet, true);
libxl_defbool_setdefault(&b_info->u.hvm.vpt_align, true);
libxl_defbool_setdefault(&b_info->u.hvm.nested_hvm, false);
+ libxl_defbool_setdefault(&b_info->u.hvm.altp2m, false);
libxl_defbool_setdefault(&b_info->u.hvm.usb, false);
libxl_defbool_setdefault(&b_info->u.hvm.xen_platform_pci, true);
@@ -354,6 +343,7 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
libxl_defbool_setdefault(&b_info->u.hvm.gfx_passthru, false);
+ libxl__rdm_setdefault(gc, b_info);
break;
case LIBXL_DOMAIN_TYPE_PV:
libxl_defbool_setdefault(&b_info->u.pv.e820_host, false);
@@ -422,7 +412,7 @@ int libxl__domain_build(libxl__gc *gc,
switch (info->type) {
case LIBXL_DOMAIN_TYPE_HVM:
- ret = libxl__build_hvm(gc, domid, info, state);
+ ret = libxl__build_hvm(gc, domid, d_config, state);
if (ret)
goto out;
@@ -489,8 +479,8 @@ out:
return ret;
}
-int libxl__domain_make(libxl__gc *gc, libxl_domain_create_info *info,
- uint32_t *domid)
+int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
+ uint32_t *domid, xc_domain_configuration_t *xc_config)
{
libxl_ctx *ctx = libxl__gc_owner(gc);
int flags, ret, rc, nb_vm;
@@ -503,6 +493,8 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_create_info *info,
xen_domain_handle_t handle;
libxl_vminfo *vm_list;
+ /* convenience aliases */
+ libxl_domain_create_info *info = &d_config->c_info;
assert(!libxl_domid_valid_guest(*domid));
@@ -531,13 +523,26 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_create_info *info,
/* Ultimately, handle is an array of 16 uint8_t, same as uuid */
libxl_uuid_copy(ctx, (libxl_uuid *)handle, &info->uuid);
- ret = xc_domain_create(ctx->xch, info->ssidref, handle, flags, domid);
+ ret = libxl__arch_domain_prepare_config(gc, d_config, xc_config);
+ if (ret < 0) {
+ LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "failed to get domain config");
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ ret = xc_domain_create_config(ctx->xch, info->ssidref,
+ handle, flags, domid,
+ xc_config);
if (ret < 0) {
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "domain creation fail");
rc = ERROR_FAIL;
goto out;
}
+ rc = libxl__arch_domain_save_config(gc, d_config, xc_config);
+ if (rc < 0)
+ goto out;
+
ret = xc_cpupool_movedomain(ctx->xch, info->poolid, *domid);
if (ret < 0) {
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "domain move fail");
@@ -641,33 +646,6 @@ retry_transaction:
goto out;
}
libxl_vminfo_list_free(vm_list, nb_vm);
- int hotplug_setting = libxl__hotplug_settings(gc, t);
- if (hotplug_setting < 0) {
- LOG(ERROR, "unable to get current hotplug scripts execution setting");
- rc = ERROR_FAIL;
- goto out;
- }
- if (libxl_defbool_val(info->run_hotplug_scripts) != hotplug_setting &&
- (nb_vm - 1)) {
- LOG(ERROR, "cannot change hotplug execution option once set, "
- "please shutdown all guests before changing it");
- rc = ERROR_FAIL;
- goto out;
- }
-
- if (libxl_defbool_val(info->run_hotplug_scripts)) {
- rc = libxl__xs_write_checked(gc, t, DISABLE_UDEV_PATH, "1");
- if (rc) {
- LOGE(ERROR, "unable to write %s = 1", DISABLE_UDEV_PATH);
- goto out;
- }
- } else {
- rc = libxl__xs_rm_checked(gc, t, DISABLE_UDEV_PATH);
- if (rc) {
- LOGE(ERROR, "unable to delete %s", DISABLE_UDEV_PATH);
- goto out;
- }
- }
xs_write(ctx->xsh, t, libxl__sprintf(gc, "%s/uuid", vm_path), uuid_string, strlen(uuid_string));
xs_write(ctx->xsh, t, libxl__sprintf(gc, "%s/name", vm_path), info->name, strlen(info->name));
@@ -706,6 +684,27 @@ static int store_libxl_entry(libxl__gc *gc, uint32_t domid,
libxl_device_model_version_to_string(b_info->device_model_version));
}
+/*----- remus asynchronous checkpoint callback -----*/
+
+static void remus_checkpoint_stream_done(
+ libxl__egc *egc, libxl__stream_read_state *srs, int rc);
+
+static void libxl__remus_domain_restore_checkpoint_callback(void *data)
+{
+ libxl__save_helper_state *shs = data;
+ libxl__domain_create_state *dcs = shs->caller_state;
+ libxl__egc *egc = shs->egc;
+ STATE_AO_GC(dcs->ao);
+
+ libxl__stream_read_start_checkpoint(egc, &dcs->srs);
+}
+
+static void remus_checkpoint_stream_done(
+ libxl__egc *egc, libxl__stream_read_state *stream, int rc)
+{
+ libxl__xc_domain_saverestore_async_callback_done(egc, &stream->shs, rc);
+}
+
/*----- main domain creation -----*/
/* We have a linear control flow; only one event callback is
@@ -732,10 +731,16 @@ static void domcreate_attach_vtpms(libxl__egc *egc, libxl__multidev *multidev,
int ret);
static void domcreate_attach_pci(libxl__egc *egc, libxl__multidev *aodevs,
int ret);
+static void domcreate_attach_dtdev(libxl__egc *egc,
+ libxl__domain_create_state *dcs);
static void domcreate_console_available(libxl__egc *egc,
libxl__domain_create_state *dcs);
+static void domcreate_stream_done(libxl__egc *egc,
+ libxl__stream_read_state *srs,
+ int ret);
+
static void domcreate_rebuild_done(libxl__egc *egc,
libxl__domain_create_state *dcs,
int ret);
@@ -764,6 +769,7 @@ static void initiate_domain_create(libxl__egc *egc,
/* convenience aliases */
libxl_domain_config *const d_config = dcs->guest_config;
+ libxl__domain_build_state *const state = &dcs->build_state;
const int restore_fd = dcs->restore_fd;
memset(&dcs->build_state, 0, sizeof(dcs->build_state));
@@ -845,10 +851,32 @@ static void initiate_domain_create(libxl__egc *egc,
goto error_out;
}
+ /* Disallow PoD and vNUMA to be enabled at the same time because PoD
+ * pool is not vNUMA-aware yet.
+ */
+ if (pod_enabled && d_config->b_info.num_vnuma_nodes) {
+ ret = ERROR_INVAL;
+ LOG(ERROR, "Cannot enable PoD and vNUMA at the same time");
+ goto error_out;
+ }
+
+ /* PV vNUMA is not yet supported because there is an issue with
+ * cpuid handling.
+ */
+ if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_PV &&
+ d_config->b_info.num_vnuma_nodes) {
+ ret = ERROR_INVAL;
+ LOG(ERROR, "PV vNUMA is not yet supported");
+ goto error_out;
+ }
+
ret = libxl__domain_create_info_setdefault(gc, &d_config->c_info);
- if (ret) goto error_out;
+ if (ret) {
+ LOG(ERROR, "Unable to set domain create info defaults");
+ goto error_out;
+ }
- ret = libxl__domain_make(gc, &d_config->c_info, &domid);
+ ret = libxl__domain_make(gc, d_config, &domid, &state->config);
if (ret) {
LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "cannot make domain: %d", ret);
dcs->guest_domid = domid;
@@ -860,17 +888,24 @@ static void initiate_domain_create(libxl__egc *egc,
dcs->dmss.dm.guest_domid = 0; /* means we haven't spawned */
ret = libxl__domain_build_info_setdefault(gc, &d_config->b_info);
- if (ret) goto error_out;
+ if (ret) {
+ LOG(ERROR, "Unable to set domain build info defaults");
+ goto error_out;
+ }
- if (!sched_params_valid(gc, domid, &d_config->b_info.sched_params)) {
- LOG(ERROR, "Invalid scheduling parameters\n");
- ret = ERROR_INVAL;
+ if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_HVM &&
+ (libxl_defbool_val(d_config->b_info.u.hvm.nested_hvm) &&
+ libxl_defbool_val(d_config->b_info.u.hvm.altp2m))) {
+ LOG(ERROR, "nestedhvm and altp2mhvm cannot be used together");
goto error_out;
}
for (i = 0; i < d_config->num_disks; i++) {
ret = libxl__device_disk_setdefault(gc, &d_config->disks[i]);
- if (ret) goto error_out;
+ if (ret) {
+ LOG(ERROR, "Unable to set disk defaults for disk %d", i);
+ goto error_out;
+ }
}
dcs->bl.ao = ao;
@@ -890,7 +925,10 @@ static void initiate_domain_create(libxl__egc *egc,
* but qemu needs the nic information to be complete.
*/
ret = libxl__device_nic_setdefault(gc, &d_config->nics[i], domid);
- if (ret) goto error_out;
+ if (ret) {
+ LOG(ERROR, "Unable to set nic defaults for nic %d", i);
+ goto error_out;
+ }
if (d_config->nics[i].devid > last_devid)
last_devid = d_config->nics[i].devid;
@@ -949,11 +987,10 @@ static void domcreate_bootloader_done(libxl__egc *egc,
/* convenience aliases */
const uint32_t domid = dcs->guest_domid;
libxl_domain_config *const d_config = dcs->guest_config;
- libxl_domain_build_info *const info = &d_config->b_info;
const int restore_fd = dcs->restore_fd;
libxl__domain_build_state *const state = &dcs->build_state;
libxl__srm_restore_autogen_callbacks *const callbacks =
- &dcs->shs.callbacks.restore.a;
+ &dcs->srs.shs.callbacks.restore.a;
if (rc) {
domcreate_rebuild_done(egc, dcs, rc);
@@ -981,42 +1018,31 @@ static void domcreate_bootloader_done(libxl__egc *egc,
}
/* Restore */
+ callbacks->checkpoint = libxl__remus_domain_restore_checkpoint_callback;
rc = libxl__build_pre(gc, domid, d_config, state);
if (rc)
goto out;
- /* read signature */
- int hvm, pae, superpages;
- switch (info->type) {
- case LIBXL_DOMAIN_TYPE_HVM:
- hvm = 1;
- superpages = 1;
- pae = libxl_defbool_val(info->u.hvm.pae);
- callbacks->toolstack_restore = libxl__toolstack_restore;
- break;
- case LIBXL_DOMAIN_TYPE_PV:
- hvm = 0;
- superpages = 0;
- pae = 1;
- break;
- default:
- rc = ERROR_INVAL;
- goto out;
- }
- libxl__xc_domain_restore(egc, dcs,
- hvm, pae, superpages);
+ dcs->srs.ao = ao;
+ dcs->srs.dcs = dcs;
+ dcs->srs.fd = restore_fd;
+ dcs->srs.legacy = (dcs->restore_params.stream_version == 1);
+ dcs->srs.completion_callback = domcreate_stream_done;
+ dcs->srs.checkpoint_callback = remus_checkpoint_stream_done;
+
+ libxl__stream_read_start(egc, &dcs->srs);
return;
out:
- libxl__xc_domain_restore_done(egc, dcs, rc, 0, 0);
+ domcreate_stream_done(egc, &dcs->srs, rc);
}
void libxl__srm_callout_callback_restore_results(unsigned long store_mfn,
unsigned long console_mfn, void *user)
{
libxl__save_helper_state *shs = user;
- libxl__domain_create_state *dcs = CONTAINER_OF(shs, *dcs, shs);
+ libxl__domain_create_state *dcs = shs->caller_state;
STATE_AO_GC(dcs->ao);
libxl__domain_build_state *const state = &dcs->build_state;
@@ -1025,10 +1051,11 @@ void libxl__srm_callout_callback_restore_results(unsigned long store_mfn,
shs->need_results = 0;
}
-void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
- int ret, int retval, int errnoval)
+static void domcreate_stream_done(libxl__egc *egc,
+ libxl__stream_read_state *srs,
+ int ret)
{
- libxl__domain_create_state *dcs = dcs_void;
+ libxl__domain_create_state *dcs = srs->dcs;
STATE_AO_GC(dcs->ao);
libxl_ctx *ctx = libxl__gc_owner(gc);
char **vments = NULL, **localents = NULL;
@@ -1045,12 +1072,6 @@ void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
if (ret)
goto out;
- if (retval) {
- LOGEV(ERROR, errnoval, "restoring domain");
- ret = ERROR_FAIL;
- goto out;
- }
-
gettimeofday(&start_time, NULL);
switch (info->type) {
@@ -1091,7 +1112,7 @@ void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
if (info->type == LIBXL_DOMAIN_TYPE_HVM) {
state->saved_state = GCSPRINTF(
- XC_DEVICE_MODEL_RESTORE_FILE".%d", domid);
+ LIBXL_DEVICE_MODEL_RESTORE_FILE".%d", domid);
}
out:
@@ -1176,11 +1197,9 @@ static void domcreate_launch_dm(libxl__egc *egc, libxl__multidev *multidev,
LOG(DEBUG, "dom%d irq %d", domid, irq);
- ret = irq >= 0 ? xc_physdev_map_pirq(CTX->xch, domid, irq, &irq)
+ ret = irq >= 0 ? libxl__arch_domain_map_irq(gc, domid, irq)
: -EOVERFLOW;
- if (!ret)
- ret = xc_domain_irq_permission(CTX->xch, domid, irq, 1);
- if (ret < 0) {
+ if (ret) {
LOGE(ERROR, "failed give dom%d access to irq %d", domid, irq);
ret = ERROR_FAIL;
goto error_out;
@@ -1417,6 +1436,36 @@ static void domcreate_attach_pci(libxl__egc *egc, libxl__multidev *multidev,
}
}
+ domcreate_attach_dtdev(egc, dcs);
+ return;
+
+error_out:
+ assert(ret);
+ domcreate_complete(egc, dcs, ret);
+}
+
+static void domcreate_attach_dtdev(libxl__egc *egc,
+ libxl__domain_create_state *dcs)
+{
+ STATE_AO_GC(dcs->ao);
+ int i;
+ int ret;
+ int domid = dcs->guest_domid;
+
+ /* convenience aliases */
+ libxl_domain_config *const d_config = dcs->guest_config;
+
+ for (i = 0; i < d_config->num_dtdevs; i++) {
+ const libxl_device_dtdev *dtdev = &d_config->dtdevs[i];
+
+ LOG(DEBUG, "Assign device \"%s\" to dom%u", dtdev->path, domid);
+ ret = xc_assign_dt_device(CTX->xch, domid, dtdev->path);
+ if (ret < 0) {
+ LOG(ERROR, "xc_assign_dtdevice failed: %d", ret);
+ goto error_out;
+ }
+ }
+
domcreate_console_available(egc, dcs);
domcreate_complete(egc, dcs, 0);
@@ -1438,7 +1487,9 @@ static void domcreate_complete(libxl__egc *egc,
if (!rc && d_config->b_info.exec_ssidref)
rc = xc_flask_relabel_domain(CTX->xch, dcs->guest_domid, d_config->b_info.exec_ssidref);
- if (!rc) {
+ bool retain_domain = !rc || rc == ERROR_ABORTED;
+
+ if (retain_domain) {
libxl__domain_userdata_lock *lock;
/* Note that we hold CTX lock at this point so only need to
@@ -1449,16 +1500,18 @@ static void domcreate_complete(libxl__egc *egc,
rc = ERROR_LOCK_FAIL;
} else {
libxl__update_domain_configuration(gc, d_config_saved, d_config);
- rc = libxl__set_domain_configuration(gc, dcs->guest_domid,
- d_config_saved);
+ int cfg_rc = libxl__set_domain_configuration
+ (gc, dcs->guest_domid, d_config_saved);
+ if (!rc)
+ rc = cfg_rc;
libxl__unlock_domain_userdata(lock);
}
}
libxl_domain_config_dispose(d_config_saved);
- if (rc) {
- if (dcs->guest_domid) {
+ if (!retain_domain) {
+ if (dcs->guest_domid > 0) {
dcs->dds.ao = ao;
dcs->dds.domid = dcs->guest_domid;
dcs->dds.callback = domcreate_destruction_cb;
@@ -1496,28 +1549,39 @@ static void domain_create_cb(libxl__egc *egc,
int rc, uint32_t domid);
static int do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
- uint32_t *domid,
- int restore_fd, int checkpointed_stream,
+ uint32_t *domid, int restore_fd,
+ const libxl_domain_restore_params *params,
const libxl_asyncop_how *ao_how,
const libxl_asyncprogress_how *aop_console_how)
{
AO_CREATE(ctx, 0, ao_how);
libxl__app_domain_create_state *cdcs;
+ int rc;
GCNEW(cdcs);
cdcs->dcs.ao = ao;
cdcs->dcs.guest_config = d_config;
libxl_domain_config_init(&cdcs->dcs.guest_config_saved);
libxl_domain_config_copy(ctx, &cdcs->dcs.guest_config_saved, d_config);
- cdcs->dcs.restore_fd = restore_fd;
+ cdcs->dcs.restore_fd = cdcs->dcs.libxc_fd = restore_fd;
+ if (restore_fd > -1) {
+ cdcs->dcs.restore_params = *params;
+ rc = libxl__fd_flags_modify_save(gc, cdcs->dcs.restore_fd,
+ ~(O_NONBLOCK|O_NDELAY), 0,
+ &cdcs->dcs.restore_fdfl);
+ if (rc < 0) goto out_err;
+ }
cdcs->dcs.callback = domain_create_cb;
- cdcs->dcs.checkpointed_stream = checkpointed_stream;
libxl__ao_progress_gethow(&cdcs->dcs.aop_console_how, aop_console_how);
cdcs->domid_out = domid;
initiate_domain_create(egc, &cdcs->dcs);
return AO_INPROGRESS;
+
+ out_err:
+ return AO_CREATE_FAIL(rc);
+
}
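libxl__fd_flags_modify_save() and libxl__fd_flags_restore() are libxl internals; their effect here is to force the restore fd into blocking mode for the duration of the operation and to put the original flags back afterwards. Roughly the following fcntl() dance (a sketch under that assumption, not the libxl implementation):

    #include <fcntl.h>

    /* Save the old flags, then apply (flags & mask) | val; the caller
     * above passes mask = ~(O_NONBLOCK|O_NDELAY) and val = 0. */
    static int fd_flags_modify_save(int fd, int mask, int val, int *saved)
    {
        int flags = fcntl(fd, F_GETFL);
        if (flags < 0) return -1;
        *saved = flags;
        return fcntl(fd, F_SETFL, (flags & mask) | val) < 0 ? -1 : 0;
    }

    static int fd_flags_restore(int fd, int saved)
    {
        return fcntl(fd, F_SETFL, saved) < 0 ? -1 : 0;
    }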
static void domain_create_cb(libxl__egc *egc,
@@ -1525,10 +1589,20 @@ static void domain_create_cb(libxl__egc *egc,
int rc, uint32_t domid)
{
libxl__app_domain_create_state *cdcs = CONTAINER_OF(dcs, *cdcs, dcs);
+ int flrc;
STATE_AO_GC(cdcs->dcs.ao);
- if (!rc)
- *cdcs->domid_out = domid;
+ *cdcs->domid_out = domid;
+
+ if (dcs->restore_fd > -1) {
+ flrc = libxl__fd_flags_restore(gc,
+ dcs->restore_fd, dcs->restore_fdfl);
+ /*
+ * If the restore has already failed, report that error, not
+ * this one.
+ */
+ if (flrc && !rc) rc = flrc;
+ }
libxl__ao_complete(egc, ao, rc);
}
@@ -1538,7 +1612,7 @@ int libxl_domain_create_new(libxl_ctx *ctx, libxl_domain_config *d_config,
const libxl_asyncop_how *ao_how,
const libxl_asyncprogress_how *aop_console_how)
{
- return do_domain_create(ctx, d_config, domid, -1, 0,
+ return do_domain_create(ctx, d_config, domid, -1, NULL,
ao_how, aop_console_how);
}
@@ -1548,8 +1622,8 @@ int libxl_domain_create_restore(libxl_ctx *ctx, libxl_domain_config *d_config,
const libxl_asyncop_how *ao_how,
const libxl_asyncprogress_how *aop_console_how)
{
- return do_domain_create(ctx, d_config, domid, restore_fd,
- params->checkpointed_stream, ao_how, aop_console_how);
+ return do_domain_create(ctx, d_config, domid, restore_fd, params,
+ ao_how, aop_console_how);
}
/*
diff --git a/tools/libxl/libxl_device.c b/tools/libxl/libxl_device.c
index 4b51ded..8bb5e93 100644
--- a/tools/libxl/libxl_device.c
+++ b/tools/libxl/libxl_device.c
@@ -332,6 +332,8 @@ int libxl__device_physdisk_major_minor(const char *physpath, int *major, int *mi
struct stat buf;
if (stat(physpath, &buf) < 0)
return -1;
+ if (!S_ISBLK(buf.st_mode))
+ return -1;
*major = major(buf.st_rdev);
*minor = minor(buf.st_rdev);
return 0;
@@ -448,7 +450,7 @@ void libxl__prepare_ao_device(libxl__ao *ao, libxl__ao_device *aodev)
* Initialize xs_watch, because it's not used on all possible
* execution paths, but it's unconditionally destroyed when finished.
*/
- libxl__ev_xswatch_init(&aodev->xs_watch);
+ libxl__xswait_init(&aodev->xswait);
aodev->active = 1;
/* We init this here because we might call device_hotplug_done
* without actually calling any hotplug script */
@@ -589,15 +591,15 @@ int libxl__device_destroy(libxl__gc *gc, libxl__device *dev)
if (domid == LIBXL_TOOLSTACK_DOMID) {
/*
- * The toolstack domain is in charge for removing both the
- * frontend and the backend path
+ * The toolstack domain is in charge of removing the
+ * frontend path.
*/
libxl__xs_path_cleanup(gc, t, fe_path);
- libxl__xs_path_cleanup(gc, t, be_path);
- } else if (dev->backend_domid == domid) {
+ }
+ if (dev->backend_domid == domid) {
/*
- * The driver domain is in charge for removing what it can
- * from the backend path
+ * The driver domain is in charge of removing what it can
+ * from the backend path.
*/
libxl__xs_path_cleanup(gc, t, be_path);
}
@@ -715,7 +717,7 @@ out:
/* This callback is part of the Qemu devices Badge */
static void device_qemu_timeout(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs);
+ const struct timeval *requested_abs, int rc);
static void device_backend_callback(libxl__egc *egc, libxl__ev_devstate *ds,
int rc);
@@ -727,15 +729,11 @@ static void device_hotplug(libxl__egc *egc, libxl__ao_device *aodev);
static void device_hotplug_child_death_cb(libxl__egc *egc,
libxl__async_exec_state *aes,
- int status);
-
-static void device_destroy_be_timeout_cb(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs);
+ int rc, int status);
static void device_destroy_be_watch_cb(libxl__egc *egc,
- libxl__ev_xswatch *watch,
- const char *watch_path,
- const char *event_path);
+ libxl__xswait_state *xswait,
+ int rc, const char *data);
static void device_hotplug_done(libxl__egc *egc, libxl__ao_device *aodev);
@@ -760,7 +758,7 @@ void libxl__wait_device_connection(libxl__egc *egc, libxl__ao_device *aodev)
return;
}
- rc = libxl__ev_devstate_wait(gc, &aodev->backend_ds,
+ rc = libxl__ev_devstate_wait(ao, &aodev->backend_ds,
device_backend_callback,
state_path, XenbusStateInitWait,
LIBXL_INIT_TIMEOUT * 1000);
@@ -810,7 +808,7 @@ void libxl__initiate_device_remove(libxl__egc *egc,
* TODO: 4.2 Bodge due to QEMU, see comment on top of
* libxl__initiate_device_remove in libxl_internal.h
*/
- rc = libxl__ev_time_register_rel(gc, &aodev->timeout,
+ rc = libxl__ev_time_register_rel(ao, &aodev->timeout,
device_qemu_timeout,
LIBXL_QEMU_BODGE_TIMEOUT * 1000);
if (rc) {
@@ -818,7 +816,7 @@ void libxl__initiate_device_remove(libxl__egc *egc,
be_path);
goto out;
}
- return;
+ goto out_success;
}
}
@@ -839,17 +837,16 @@ void libxl__initiate_device_remove(libxl__egc *egc,
goto out;
}
+ rc = libxl__xs_write_checked(gc, t, online_path, "0");
+ if (rc)
+ goto out;
+
/*
* Check if device is already in "closed" state, in which case
* it should not be changed.
*/
if (state && atoi(state) != XenbusStateClosed) {
- rc = libxl__xs_write_checked(gc, t, online_path, "0");
- if (rc) {
- LOG(ERROR, "unable to write to xenstore path %s", online_path);
- goto out;
- }
- rc = libxl__xs_write_checked(gc, t, state_path, "5");
+ rc = libxl__xs_write_checked(gc, t, state_path, GCSPRINTF("%d", XenbusStateClosing));
if (rc) {
LOG(ERROR, "unable to write to xenstore path %s", state_path);
goto out;
@@ -861,7 +858,7 @@ void libxl__initiate_device_remove(libxl__egc *egc,
if (rc < 0) goto out;
}
- rc = libxl__ev_devstate_wait(gc, &aodev->backend_ds,
+ rc = libxl__ev_devstate_wait(ao, &aodev->backend_ds,
device_backend_callback,
state_path, XenbusStateClosed,
LIBXL_DESTROY_TIMEOUT * 1000);
@@ -870,6 +867,7 @@ void libxl__initiate_device_remove(libxl__egc *egc,
goto out;
}
+out_success:
libxl_dominfo_dispose(&info);
return;
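The reordered hunk above always forces the backend's online node to 0 first and then, unless the device is already Closed, requests XenbusStateClosing. A sketch of the same handshake against the raw libxenstore API (libxl really goes through libxl__xs_write_checked inside a transaction retry loop; error handling here is simplified):

    #include <stdbool.h>
    #include <stdio.h>
    #include <xenstore.h>

    /* "5" is XenbusStateClosing. */
    static bool request_backend_close(struct xs_handle *xsh,
                                      const char *be_path)
    {
        char online[256], state[256];
        snprintf(online, sizeof(online), "%s/online", be_path);
        snprintf(state, sizeof(state), "%s/state", be_path);

        xs_transaction_t t = xs_transaction_start(xsh);
        if (!t) return false;
        if (!xs_write(xsh, t, online, "0", 1) ||
            !xs_write(xsh, t, state, "5", 1)) {
            xs_transaction_end(xsh, t, true);     /* abort */
            return false;
        }
        return xs_transaction_end(xsh, t, false); /* commit */
    }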
@@ -882,7 +880,7 @@ out:
}
static void device_qemu_timeout(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs)
+ const struct timeval *requested_abs, int rc)
{
libxl__ao_device *aodev = CONTAINER_OF(ev, *aodev, timeout);
STATE_AO_GC(aodev->ao);
@@ -890,7 +888,9 @@ static void device_qemu_timeout(libxl__egc *egc, libxl__ev_time *ev,
char *state_path = GCSPRINTF("%s/state", be_path);
const char *xs_state;
xs_transaction_t t = 0;
- int rc = 0;
+
+ if (rc != ERROR_TIMEDOUT)
+ goto out;
libxl__ev_time_deregister(gc, &aodev->timeout);
@@ -911,7 +911,7 @@ static void device_qemu_timeout(libxl__egc *egc, libxl__ev_time *ev,
if (rc) goto out;
if (xs_state && atoi(xs_state) != XenbusStateClosed) {
- rc = libxl__xs_write_checked(gc, XBT_NULL, state_path, "6");
+ rc = libxl__xs_write_checked(gc, XBT_NULL, state_path, GCSPRINTF("%d", XenbusStateClosed));
if (rc) goto out;
}
@@ -934,11 +934,13 @@ static void device_backend_callback(libxl__egc *egc, libxl__ev_devstate *ds,
libxl__ao_device *aodev = CONTAINER_OF(ds, *aodev, backend_ds);
STATE_AO_GC(aodev->ao);
+ LOG(DEBUG, "calling device_backend_cleanup");
device_backend_cleanup(gc, aodev);
if (rc == ERROR_TIMEDOUT &&
aodev->action == LIBXL__DEVICE_ACTION_REMOVE &&
!aodev->force) {
+ LOG(DEBUG, "Timeout reached, initiating forced remove");
aodev->force = 1;
libxl__initiate_device_remove(egc, aodev);
return;
@@ -981,27 +983,30 @@ static void device_hotplug(libxl__egc *egc, libxl__ao_device *aodev)
* hotplug scripts
*/
rc = libxl__get_domid(gc, &domid);
- if (rc) goto out;
+ if (rc) {
+ LOG(ERROR, "Failed to get domid");
+ goto out;
+ }
if (aodev->dev->backend_domid != domid) {
- if (aodev->action != LIBXL__DEVICE_ACTION_REMOVE)
- goto out;
+ LOG(DEBUG, "Backend domid %d, domid %d, assuming driver domains",
+ aodev->dev->backend_domid, domid);
- rc = libxl__ev_time_register_rel(gc, &aodev->timeout,
- device_destroy_be_timeout_cb,
- LIBXL_DESTROY_TIMEOUT * 1000);
- if (rc) {
- LOG(ERROR, "setup of xs watch timeout failed");
+ if (aodev->action != LIBXL__DEVICE_ACTION_REMOVE) {
+ LOG(DEBUG, "Not a remove, not executing hotplug scripts");
goto out;
}
- rc = libxl__ev_xswatch_register(gc, &aodev->xs_watch,
- device_destroy_be_watch_cb,
- be_path);
+ aodev->xswait.ao = ao;
+ aodev->xswait.what = "removal of backend path";
+ aodev->xswait.path = be_path;
+ aodev->xswait.timeout_ms = LIBXL_DESTROY_TIMEOUT * 1000;
+ aodev->xswait.callback = device_destroy_be_watch_cb;
+ rc = libxl__xswait_start(gc, &aodev->xswait);
if (rc) {
- LOG(ERROR, "setup of xs watch for %s failed", be_path);
- libxl__ev_time_deregister(gc, &aodev->timeout);
+ LOG(ERROR, "Setup of backend removal watch failed (path %s)", be_path);
goto out;
}
+
return;
}
@@ -1013,6 +1018,7 @@ static void device_hotplug(libxl__egc *egc, libxl__ao_device *aodev)
switch (hotplug) {
case 0:
/* no hotplug script to execute */
+ LOG(DEBUG, "No hotplug script to execute");
goto out;
case 1:
/* execute hotplug script */
@@ -1044,7 +1050,7 @@ static void device_hotplug(libxl__egc *egc, libxl__ao_device *aodev)
aes->stdfds[1] = 2;
aes->stdfds[2] = -1;
- rc = libxl__async_exec_start(gc, aes);
+ rc = libxl__async_exec_start(aes);
if (rc)
goto out;
@@ -1062,7 +1068,7 @@ out:
static void device_hotplug_child_death_cb(libxl__egc *egc,
libxl__async_exec_state *aes,
- int status)
+ int rc, int status)
{
libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
STATE_AO_GC(aodev->ao);
@@ -1071,12 +1077,17 @@ static void device_hotplug_child_death_cb(libxl__egc *egc,
device_hotplug_clean(gc, aodev);
- if (status) {
+ if (status && !rc) {
hotplug_error = libxl__xs_read(gc, XBT_NULL,
GCSPRINTF("%s/hotplug-error", be_path));
if (hotplug_error)
LOG(ERROR, "script: %s", hotplug_error);
- aodev->rc = ERROR_FAIL;
+ rc = ERROR_FAIL;
+ }
+
+ if (rc) {
+ if (!aodev->rc)
+ aodev->rc = rc;
if (aodev->action == LIBXL__DEVICE_ACTION_ADD)
/*
* Only fail on device connection, on disconnection
@@ -1099,37 +1110,21 @@ error:
device_hotplug_done(egc, aodev);
}
-static void device_destroy_be_timeout_cb(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs)
-{
- libxl__ao_device *aodev = CONTAINER_OF(ev, *aodev, timeout);
- STATE_AO_GC(aodev->ao);
-
- LOG(ERROR, "timed out while waiting for %s to be removed",
- libxl__device_backend_path(gc, aodev->dev));
-
- aodev->rc = ERROR_TIMEDOUT;
-
- device_hotplug_done(egc, aodev);
- return;
-}
-
static void device_destroy_be_watch_cb(libxl__egc *egc,
- libxl__ev_xswatch *watch,
- const char *watch_path,
- const char *event_path)
+ libxl__xswait_state *xswait,
+ int rc, const char *dir)
{
- libxl__ao_device *aodev = CONTAINER_OF(watch, *aodev, xs_watch);
+ libxl__ao_device *aodev = CONTAINER_OF(xswait, *aodev, xswait);
STATE_AO_GC(aodev->ao);
- const char *dir;
- int rc;
- rc = libxl__xs_read_checked(gc, XBT_NULL, watch_path, &dir);
if (rc) {
- LOG(ERROR, "unable to read backend path: %s", watch_path);
+ if (rc == ERROR_TIMEDOUT)
+ LOG(ERROR, "timed out while waiting for %s to be removed",
+ xswait->path);
aodev->rc = rc;
goto out;
}
+
if (dir) {
/* backend path still exists, wait a little longer... */
return;
@@ -1162,7 +1157,7 @@ static void device_hotplug_clean(libxl__gc *gc, libxl__ao_device *aodev)
{
/* Clean events and check reentrancy */
libxl__ev_time_deregister(gc, &aodev->timeout);
- libxl__ev_xswatch_deregister(gc, &aodev->xs_watch);
+ libxl__xswait_stop(gc, &aodev->xswait);
assert(!libxl__async_exec_inuse(&aodev->aes));
}
@@ -1186,7 +1181,9 @@ int libxl__wait_for_device_model_deprecated(libxl__gc *gc,
void *check_callback_userdata)
{
char *path;
- path = GCSPRINTF("/local/domain/0/device-model/%d/state", domid);
+ uint32_t dm_domid = libxl_get_stubdom_id(CTX, domid);
+
+ path = libxl__device_model_xs_path(gc, dm_domid, domid, "/state");
return libxl__xenstore_child_wait_deprecated(gc, domid,
LIBXL_DEVICE_MODEL_START_TIMEOUT,
"Device Model", path, state, spawning,
diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index 094a133..89b3bb7 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -36,19 +36,17 @@ const char *libxl__device_model_savefile(libxl__gc *gc, uint32_t domid)
static const char *qemu_xen_path(libxl__gc *gc)
{
-#ifdef QEMU_XEN_PATH
return QEMU_XEN_PATH;
-#else
- return libxl__abs_path(gc, "qemu-system-i386", libxl__private_bindir_path());
-#endif
}
static int libxl__create_qemu_logfile(libxl__gc *gc, char *name)
{
char *logfile;
- int logfile_w;
+ int rc, logfile_w;
+
+ rc = libxl_create_logfile(CTX, name, &logfile);
+ if (rc) return rc;
- libxl_create_logfile(CTX, name, &logfile);
logfile_w = open(logfile, O_WRONLY|O_CREAT|O_APPEND, 0644);
free(logfile);
@@ -81,7 +79,7 @@ const char *libxl__domain_device_model(libxl__gc *gc,
break;
default:
LIBXL__LOG(ctx, LIBXL__LOG_ERROR,
- "invalid device model version %d\n",
+ "invalid device model version %d",
info->device_model_version);
dm = NULL;
break;
@@ -90,6 +88,281 @@ const char *libxl__domain_device_model(libxl__gc *gc,
return dm;
}
+static int
+libxl__xc_device_get_rdm(libxl__gc *gc,
+ uint32_t flags,
+ uint16_t seg,
+ uint8_t bus,
+ uint8_t devfn,
+ unsigned int *nr_entries,
+ struct xen_reserved_device_memory **xrdm)
+{
+ int rc = 0, r;
+
+ /*
+ * We can't know in advance how many entries we will get.
+ */
+ *nr_entries = 0;
+ r = xc_reserved_device_memory_map(CTX->xch, flags, seg, bus, devfn,
+ NULL, nr_entries);
+ assert(r <= 0);
+ /* "0" means we have no any rdm entry. */
+ if (!r) goto out;
+
+ if (errno != ENOBUFS) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ GCNEW_ARRAY(*xrdm, *nr_entries);
+ r = xc_reserved_device_memory_map(CTX->xch, flags, seg, bus, devfn,
+ *xrdm, nr_entries);
+ if (r)
+ rc = ERROR_FAIL;
+
+ out:
+ if (rc) {
+ *nr_entries = 0;
+ *xrdm = NULL;
+ LOG(ERROR, "Could not get reserved device memory maps.");
+ }
+ return rc;
+}
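libxl__xc_device_get_rdm() follows the common two-call idiom: probe with a NULL buffer to learn the entry count (the call fails with ENOBUFS), size the array, then call again to fill it. A self-contained sketch of the idiom with a stand-in for the hypercall:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in: reports 3 entries; fails with ENOBUFS when the supplied
     * buffer is absent or too small. */
    static int get_entries(int *buf, unsigned int *n)
    {
        const int data[3] = { 10, 20, 30 };
        unsigned int i, avail = 3;
        if (!buf || *n < avail) { *n = avail; errno = ENOBUFS; return -1; }
        for (i = 0; i < avail; i++) buf[i] = data[i];
        *n = avail;
        return 0;
    }

    int main(void)
    {
        unsigned int n = 0;
        int *buf;
        if (get_entries(NULL, &n) == 0) return 0; /* nothing to report */
        if (errno != ENOBUFS) return 1;           /* real failure */
        buf = calloc(n, sizeof(*buf));            /* now sized correctly */
        if (!buf || get_entries(buf, &n)) return 1;
        printf("fetched %u entries\n", n);
        free(buf);
        return 0;
    }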
+
+/*
+ * Check whether an RDM region overlaps the specified memory range.
+ * Returns true if it does, false otherwise.
+ */
+static bool overlaps_rdm(uint64_t start, uint64_t memsize,
+ uint64_t rdm_start, uint64_t rdm_size)
+{
+ return (start + memsize > rdm_start) && (start < rdm_start + rdm_size);
+}
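overlaps_rdm() is the standard half-open interval overlap test. A tiny self-contained harness (illustrative values only) shows the boundary behaviour:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static bool overlaps_rdm(uint64_t start, uint64_t memsize,
                             uint64_t rdm_start, uint64_t rdm_size)
    {
        return (start + memsize > rdm_start) &&
               (start < rdm_start + rdm_size);
    }

    int main(void)
    {
        /* RAM [0, 2G) vs a 1M RDM at 1G: conflict. */
        assert(overlaps_rdm(0, 2ULL << 30, 1ULL << 30, 1 << 20));
        /* RAM [0, 1G) vs an RDM starting exactly at 1G: no conflict,
         * because the ranges are treated as half-open intervals. */
        assert(!overlaps_rdm(0, 1ULL << 30, 1ULL << 30, 1 << 20));
        return 0;
    }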
+
+static void
+add_rdm_entry(libxl__gc *gc, libxl_domain_config *d_config,
+ uint64_t rdm_start, uint64_t rdm_size, int rdm_policy)
+{
+ d_config->rdms = libxl__realloc(NOGC, d_config->rdms,
+ (d_config->num_rdms+1) * sizeof(libxl_device_rdm));
+
+ d_config->rdms[d_config->num_rdms].start = rdm_start;
+ d_config->rdms[d_config->num_rdms].size = rdm_size;
+ d_config->rdms[d_config->num_rdms].policy = rdm_policy;
+ d_config->num_rdms++;
+}
+
+/*
+ * Check reported RDM regions and handle potential gfn conflicts
+ * according to the user's preferred policy.
+ *
+ * RDM can theoretically reside in address space beyond 4G, but we
+ * never see this in the real world, so in order to avoid breaking the
+ * highmem layout we don't resolve highmem conflicts. Note this means a
+ * highmem RMRR can still be supported as long as it doesn't conflict.
+ *
+ * In the lowmem case, however, RDM entries may be scattered across the
+ * whole RAM space, and multiple RDM entries would make the memory
+ * layout complicated and hard to describe to hvmloader via
+ * hvm_info_table{}. So we aim for a simple scheme that avoids breaking
+ * the existing layout. When a conflict occurs:
+ *
+ * #1. Above a predefined boundary (default 2G)
+ * - Move lowmem_end below the reserved region to resolve the conflict;
+ *
+ * #2. Below a predefined boundary (default 2G)
+ * - Check the strict/relaxed policy:
+ * "strict" policy causes libxl to fail.
+ * "relaxed" policy issues a warning message and also marks this entry
+ * INVALID to indicate we shouldn't expose it to hvmloader.
+ * Note when both policies are specified on a given region, the
+ * per-device policy overrides the global policy.
+ */
+int libxl__domain_device_construct_rdm(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ uint64_t rdm_mem_boundary,
+ struct xc_hvm_build_args *args)
+{
+ int i, j, conflict, rc;
+ struct xen_reserved_device_memory *xrdm = NULL;
+ uint32_t strategy = d_config->b_info.u.hvm.rdm.strategy;
+ uint16_t seg;
+ uint8_t bus, devfn;
+ uint64_t rdm_start, rdm_size;
+ uint64_t highmem_end = args->highmem_end ? args->highmem_end : (1ull<<32);
+
+ /*
+ * We only want to construct the RDM list once: it is specific to
+ * the platform, so it never changes afterwards.
+ */
+ if (d_config->num_rdms)
+ return 0;
+
+ /* We may not need to expose any RDM at all. */
+ if (strategy == LIBXL_RDM_RESERVE_STRATEGY_IGNORE &&
+ !d_config->num_pcidevs)
+ return 0;
+
+ /* Query all RDM entries in this platform */
+ if (strategy == LIBXL_RDM_RESERVE_STRATEGY_HOST) {
+ unsigned int nr_entries;
+
+ /* Collect all RDM info, if any exists. */
+ rc = libxl__xc_device_get_rdm(gc, XENMEM_RDM_ALL,
+ 0, 0, 0, &nr_entries, &xrdm);
+ if (rc)
+ goto out;
+ if (!nr_entries)
+ return 0;
+
+ assert(xrdm);
+
+ for (i = 0; i < nr_entries; i++)
+ {
+ add_rdm_entry(gc, d_config,
+ pfn_to_paddr(xrdm[i].start_pfn),
+ pfn_to_paddr(xrdm[i].nr_pages),
+ d_config->b_info.u.hvm.rdm.policy);
+ }
+ }
+
+ /* Query RDM entries per-device */
+ for (i = 0; i < d_config->num_pcidevs; i++) {
+ unsigned int nr_entries;
+ bool new = true;
+
+ seg = d_config->pcidevs[i].domain;
+ bus = d_config->pcidevs[i].bus;
+ devfn = PCI_DEVFN(d_config->pcidevs[i].dev,
+ d_config->pcidevs[i].func);
+ nr_entries = 0;
+ rc = libxl__xc_device_get_rdm(gc, 0,
+ seg, bus, devfn, &nr_entries, &xrdm);
+ if (rc)
+ goto out;
+ /* No RDM associated with this device. */
+ if (!nr_entries)
+ continue;
+
+ assert(xrdm);
+
+ /*
+ * Check whether this entry is already saved in the array. This
+ * can happen in two cases:
+ *
+ * - the user may have asked for all RDMs on this platform, which
+ * were already queried before this point
+ * - or two assigned devices may share one RDM entry
+ *
+ * Different policies may be configured on the same RDM in these
+ * two cases, but we don't currently allow assigning such a group
+ * of devices, so this doesn't arise in practice.
+ */
+ for (j = 0; j < d_config->num_rdms; j++) {
+ if (d_config->rdms[j].start == pfn_to_paddr(xrdm[0].start_pfn))
+ {
+ /*
+ * The per-device policy always overrides the global
+ * policy in this case.
+ */
+ d_config->rdms[j].policy = d_config->pcidevs[i].rdm_policy;
+ new = false;
+ break;
+ }
+ }
+
+ if (new) {
+ add_rdm_entry(gc, d_config,
+ pfn_to_paddr(xrdm[0].start_pfn),
+ pfn_to_paddr(xrdm[0].nr_pages),
+ d_config->pcidevs[i].rdm_policy);
+ }
+ }
+
+ /*
+ * The next step is to check for and avoid potential conflicts
+ * between RDM entries and guest RAM. To avoid intrusive changes to
+ * the existing memory layout {lowmem, mmio, highmem}, which is
+ * passed around various function blocks, the following conflicts
+ * are not handled; they are rare, and handling them would lead to
+ * a more scattered layout:
+ * - RDM in highmem area (>4G)
+ * - RDM lower than a defined memory boundary (e.g. 2G)
+ * Otherwise for conflicts between boundary and 4G, we'll simply
+ * move lowmem end below reserved region to solve conflict.
+ *
+ * If a conflict is detected on a given RDM entry, an error will
+ * be returned if 'strict' policy is specified. Instead, if
+ * 'relaxed' policy specified, this conflict is treated just as a
+ * warning, but we mark this RDM entry as INVALID to indicate that
+ * this entry shouldn't be exposed to hvmloader.
+ *
+ * First, check the rdm < 4G case, because we may need to expand
+ * highmem_end.
+ */
+ for (i = 0; i < d_config->num_rdms; i++) {
+ rdm_start = d_config->rdms[i].start;
+ rdm_size = d_config->rdms[i].size;
+ conflict = overlaps_rdm(0, args->lowmem_end, rdm_start, rdm_size);
+
+ if (!conflict)
+ continue;
+
+ /* Only act on RDMs that start above our memory boundary. */
+ if (rdm_start > rdm_mem_boundary) {
+ /*
+ * We are about to move lowmem_end down, so we have to
+ * expand highmem_end by the displaced amount.
+ */
+ highmem_end += (args->lowmem_end - rdm_start);
+ /* Now move lowmem_end down. */
+ args->lowmem_end = rdm_start;
+ }
+ }
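To make the arithmetic concrete (illustrative numbers): with args->lowmem_end at 3G and a conflicting RDM starting at 2.5G, the loop above grows highmem_end by the displaced 512M and lowers lowmem_end to 2.5G, preserving the guest's total RAM while leaving the reserved region unmapped in lowmem.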
+
+ /* Sync highmem_end. */
+ args->highmem_end = highmem_end;
+
+ /*
+ * Finally, apply the same policy check to lowmem (below the
+ * boundary, default 2G) and to the highmem adjusted above.
+ */
+ for (i = 0; i < d_config->num_rdms; i++) {
+ rdm_start = d_config->rdms[i].start;
+ rdm_size = d_config->rdms[i].size;
+ /* Does this entry conflict with lowmem? */
+ conflict = overlaps_rdm(0, args->lowmem_end,
+ rdm_start, rdm_size);
+ /* Does this entry conflict with highmem? */
+ conflict |= overlaps_rdm((1ULL<<32),
+ args->highmem_end - (1ULL<<32),
+ rdm_start, rdm_size);
+
+ if (!conflict)
+ continue;
+
+ if (d_config->rdms[i].policy == LIBXL_RDM_RESERVE_POLICY_STRICT) {
+ LOG(ERROR, "RDM conflict at 0x%"PRIx64".\n",
+ d_config->rdms[i].start);
+ goto out;
+ } else {
+ LOG(WARN, "Ignoring RDM conflict at 0x%"PRIx64".\n",
+ d_config->rdms[i].start);
+
+ /*
+ * Mark this entry INVALID to indicate it shouldn't be
+ * exposed to hvmloader.
+ */
+ d_config->rdms[i].policy = LIBXL_RDM_RESERVE_POLICY_INVALID;
+ }
+ }
+
+ return 0;
+
+ out:
+ return ERROR_FAIL;
+}
+
const libxl_vnc_info *libxl__dm_vnc(const libxl_domain_config *guest_config)
{
const libxl_vnc_info *vnc = NULL;
@@ -122,9 +395,10 @@ static const char *dm_keymap(const libxl_domain_config *guest_config)
return NULL;
}
-static char ** libxl__build_device_model_args_old(libxl__gc *gc,
+static int libxl__build_device_model_args_old(libxl__gc *gc,
const char *dm, int domid,
const libxl_domain_config *guest_config,
+ char ***args, char ***envs,
const libxl__domain_build_state *state)
{
const libxl_domain_create_info *c_info = &guest_config->c_info;
@@ -135,8 +409,9 @@ static char ** libxl__build_device_model_args_old(libxl__gc *gc,
const int num_nics = guest_config->num_nics;
const char *keymap = dm_keymap(guest_config);
int i;
- flexarray_t *dm_args;
+ flexarray_t *dm_args, *dm_envs;
dm_args = flexarray_make(gc, 16, 1);
+ dm_envs = flexarray_make(gc, 16, 1);
flexarray_vappend(dm_args, dm,
"-d", libxl__sprintf(gc, "%d", domid), NULL);
@@ -161,7 +436,7 @@ static char ** libxl__build_device_model_args_old(libxl__gc *gc,
if (strchr(vnc->listen, ':') != NULL) {
if (vnc->display) {
LOG(ERROR, "vncdisplay set, vnclisten contains display");
- return NULL;
+ return ERROR_INVAL;
}
vncarg = vnc->listen;
} else {
@@ -194,7 +469,10 @@ static char ** libxl__build_device_model_args_old(libxl__gc *gc,
if (!libxl_defbool_val(sdl->opengl)) {
flexarray_append(dm_args, "-disable-opengl");
}
- /* XXX sdl->{display,xauthority} into $DISPLAY/$XAUTHORITY */
+ if (sdl->display)
+ flexarray_append_pair(dm_envs, "DISPLAY", sdl->display);
+ if (sdl->xauthority)
+ flexarray_append_pair(dm_envs, "XAUTHORITY", sdl->xauthority);
}
if (keymap) {
flexarray_vappend(dm_args, "-k", keymap, NULL);
@@ -207,14 +485,14 @@ static char ** libxl__build_device_model_args_old(libxl__gc *gc,
if (b_info->kernel) {
LOG(ERROR, "HVM direct kernel boot is not supported by "
"qemu-xen-traditional");
- return NULL;
+ return ERROR_INVAL;
}
if (b_info->u.hvm.serial || b_info->u.hvm.serial_list) {
if ( b_info->u.hvm.serial && b_info->u.hvm.serial_list )
{
LOG(ERROR, "Both serial and serial_list set");
- return NULL;
+ return ERROR_INVAL;
}
if (b_info->u.hvm.serial) {
flexarray_vappend(dm_args,
@@ -251,6 +529,8 @@ static char ** libxl__build_device_model_args_old(libxl__gc *gc,
case LIBXL_VGA_INTERFACE_TYPE_NONE:
flexarray_append_pair(dm_args, "-vga", "none");
break;
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
+ break;
}
if (b_info->u.hvm.boot) {
@@ -262,7 +542,7 @@ static char ** libxl__build_device_model_args_old(libxl__gc *gc,
if ( b_info->u.hvm.usbdevice && b_info->u.hvm.usbdevice_list )
{
LOG(ERROR, "Both usbdevice and usbdevice_list set");
- return NULL;
+ return ERROR_INVAL;
}
flexarray_append(dm_args, "-usb");
if (b_info->u.hvm.usbdevice) {
@@ -353,7 +633,11 @@ static char ** libxl__build_device_model_args_old(libxl__gc *gc,
abort();
}
flexarray_append(dm_args, NULL);
- return (char **) flexarray_contents(dm_args);
+ *args = (char **) flexarray_contents(dm_args);
+ flexarray_append(dm_envs, NULL);
+ if (envs)
+ *envs = (char **) flexarray_contents(dm_envs);
+ return 0;
}
static const char *qemu_disk_format_string(libxl_disk_format format)
@@ -405,12 +689,21 @@ static char *dm_spice_options(libxl__gc *gc,
if (!libxl_defbool_val(spice->clipboard_sharing))
opt = libxl__sprintf(gc, "%s,disable-copy-paste", opt);
+ if (spice->image_compression)
+ opt = libxl__sprintf(gc, "%s,image-compression=%s", opt,
+ spice->image_compression);
+
+ if (spice->streaming_video)
+ opt = libxl__sprintf(gc, "%s,streaming-video=%s", opt,
+ spice->streaming_video);
+
return opt;
}
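For example (hypothetical values): with image_compression set to "auto_glz" and streaming_video to "filter", the two additions above extend the option string with ",image-compression=auto_glz,streaming-video=filter".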
-static char ** libxl__build_device_model_args_new(libxl__gc *gc,
+static int libxl__build_device_model_args_new(libxl__gc *gc,
const char *dm, int guest_domid,
const libxl_domain_config *guest_config,
+ char ***args, char ***envs,
const libxl__domain_build_state *state,
int *dm_state_fd)
{
@@ -425,12 +718,13 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
const libxl_sdl_info *sdl = dm_sdl(guest_config);
const char *keymap = dm_keymap(guest_config);
char *machinearg;
- flexarray_t *dm_args;
+ flexarray_t *dm_args, *dm_envs;
int i, connection, devid;
uint64_t ram_size;
const char *path, *chardev;
dm_args = flexarray_make(gc, 16, 1);
+ dm_envs = flexarray_make(gc, 16, 1);
flexarray_vappend(dm_args, dm,
"-xen-domid",
@@ -442,9 +736,19 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
"path=%s/qmp-libxl-%d,server,nowait",
libxl__run_dir_path(), guest_domid));
+ flexarray_append(dm_args, "-no-shutdown");
flexarray_append(dm_args, "-mon");
flexarray_append(dm_args, "chardev=libxl-cmd,mode=control");
+ flexarray_append(dm_args, "-chardev");
+ flexarray_append(dm_args,
+ libxl__sprintf(gc, "socket,id=libxenstat-cmd,"
+ "path=%s/qmp-libxenstat-%d,server,nowait",
+ libxl__run_dir_path(), guest_domid));
+
+ flexarray_append(dm_args, "-mon");
+ flexarray_append(dm_args, "chardev=libxenstat-cmd,mode=control");
+
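With a hypothetical guest_domid of 7 and /var/run/xen as the run directory, the appends above yield the QEMU arguments "-chardev socket,id=libxl-cmd,path=/var/run/xen/qmp-libxl-7,server,nowait -mon chardev=libxl-cmd,mode=control", followed by an equivalent libxenstat-cmd pair on a second QMP socket.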
for (i = 0; i < guest_config->num_channels; i++) {
connection = guest_config->channels[i].connection;
devid = guest_config->channels[i].devid;
@@ -461,7 +765,7 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
/* We've forgotten to add the clause */
LOG(ERROR, "%s: unknown channel connection %d",
__func__, connection);
- return NULL;
+ return ERROR_INVAL;
}
flexarray_append(dm_args, "-chardev");
flexarray_append(dm_args, (void*)chardev);
@@ -498,7 +802,7 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
if (strchr(vnc->listen, ':') != NULL) {
if (vnc->display) {
LOG(ERROR, "vncdisplay set, vnclisten contains display");
- return NULL;
+ return ERROR_INVAL;
}
vncarg = vnc->listen;
} else {
@@ -534,7 +838,10 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
if (sdl) {
flexarray_append(dm_args, "-sdl");
- /* XXX sdl->{display,xauthority} into $DISPLAY/$XAUTHORITY */
+ if (sdl->display)
+ flexarray_append_pair(dm_envs, "DISPLAY", sdl->display);
+ if (sdl->xauthority)
+ flexarray_append_pair(dm_envs, "XAUTHORITY", sdl->xauthority);
}
if (keymap) {
@@ -557,7 +864,7 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
if ( b_info->u.hvm.serial && b_info->u.hvm.serial_list )
{
LOG(ERROR, "Both serial and serial_list set");
- return NULL;
+ return ERROR_INVAL;
}
if (b_info->u.hvm.serial) {
flexarray_vappend(dm_args,
@@ -582,7 +889,7 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
const libxl_spice_info *spice = &b_info->u.hvm.spice;
char *spiceoptions = dm_spice_options(gc, spice);
if (!spiceoptions)
- return NULL;
+ return ERROR_INVAL;
flexarray_append(dm_args, "-spice");
flexarray_append(dm_args, spiceoptions);
@@ -607,6 +914,12 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
break;
case LIBXL_VGA_INTERFACE_TYPE_NONE:
break;
+ case LIBXL_VGA_INTERFACE_TYPE_QXL:
+ /* QXL has two RAM regions, ram and vram */
+ flexarray_append_pair(dm_args, "-device",
+ GCSPRINTF("qxl-vga,vram_size_mb=%"PRIu64",ram_size_mb=%"PRIu64,
+ (b_info->video_memkb/2/1024), (b_info->video_memkb/2/1024) ) );
+ break;
}
if (b_info->u.hvm.boot) {
@@ -619,7 +932,7 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
if ( b_info->u.hvm.usbdevice && b_info->u.hvm.usbdevice_list )
{
LOG(ERROR, "Both usbdevice and usbdevice_list set");
- return NULL;
+ return ERROR_INVAL;
}
flexarray_append(dm_args, "-usb");
if (b_info->u.hvm.usbdevice) {
@@ -657,7 +970,7 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
default:
LOG(ERROR, "usbversion parameter is invalid, "
"must be between 1 and 3");
- return NULL;
+ return ERROR_INVAL;
}
if (b_info->u.hvm.spice.usbredirection >= 0 &&
b_info->u.hvm.spice.usbredirection < 5) {
@@ -669,7 +982,7 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
} else {
LOG(ERROR, "usbredirection parameter is invalid, "
"it must be between 1 and 4");
- return NULL;
+ return ERROR_INVAL;
}
}
if (b_info->u.hvm.soundhw) {
@@ -778,6 +1091,8 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
flexarray_append(dm_args, libxl__sprintf(gc, "%"PRId64, ram_size));
if (b_info->type == LIBXL_DOMAIN_TYPE_HVM) {
+ if (b_info->u.hvm.hdtype == LIBXL_HDTYPE_AHCI)
+ flexarray_append_pair(dm_args, "-device", "ahci,id=ahci0");
for (i = 0; i < num_disks; i++) {
int disk, part;
int dev_number =
@@ -795,13 +1110,18 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
if (disks[i].is_cdrom) {
if (disks[i].format == LIBXL_DISK_FORMAT_EMPTY)
drive = libxl__sprintf
- (gc, "if=ide,index=%d,media=cdrom,cache=writeback,id=ide-%i",
- disk, dev_number);
+ (gc, "if=ide,index=%d,readonly=%s,media=cdrom,cache=writeback,id=ide-%i",
+ disk, disks[i].readwrite ? "off" : "on", dev_number);
else
drive = libxl__sprintf
- (gc, "file=%s,if=ide,index=%d,media=cdrom,format=%s,cache=writeback,id=ide-%i",
- disks[i].pdev_path, disk, format, dev_number);
+ (gc, "file=%s,if=ide,index=%d,readonly=%s,media=cdrom,format=%s,cache=writeback,id=ide-%i",
+ disks[i].pdev_path, disk, disks[i].readwrite ? "off" : "on", format, dev_number);
} else {
+ if (!disks[i].readwrite) {
+ LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "qemu-xen doesn't support read-only disk drivers");
+ return ERROR_INVAL;
+ }
+
if (disks[i].format == LIBXL_DISK_FORMAT_EMPTY) {
LIBXL__LOG(ctx, LIBXL__LOG_WARNING, "cannot support"
" empty disk format for %s", disks[i].vdev);
@@ -832,7 +1152,14 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
drive = libxl__sprintf
(gc, "file=%s,if=scsi,bus=0,unit=%d,format=%s,cache=writeback",
pdev_path, disk, format);
- else if (disk < 4)
+ else if (disk < 6 && b_info->u.hvm.hdtype == LIBXL_HDTYPE_AHCI) {
+ flexarray_vappend(dm_args, "-drive",
+ GCSPRINTF("file=%s,if=none,id=ahcidisk-%d,format=%s,cache=writeback",
+ pdev_path, disk, format),
+ "-device", GCSPRINTF("ide-hd,bus=ahci0.%d,unit=0,drive=ahcidisk-%d",
+ disk, disk), NULL);
+ continue;
+ } else if (disk < 4)
drive = libxl__sprintf
(gc, "file=%s,if=ide,index=%d,media=disk,format=%s,cache=writeback",
pdev_path, disk, format);
@@ -854,12 +1181,17 @@ static char ** libxl__build_device_model_args_new(libxl__gc *gc,
}
}
flexarray_append(dm_args, NULL);
- return (char **) flexarray_contents(dm_args);
+ *args = (char **) flexarray_contents(dm_args);
+ flexarray_append(dm_envs, NULL);
+ if (envs)
+ *envs = (char **) flexarray_contents(dm_envs);
+ return 0;
}
-static char ** libxl__build_device_model_args(libxl__gc *gc,
+static int libxl__build_device_model_args(libxl__gc *gc,
const char *dm, int guest_domid,
const libxl_domain_config *guest_config,
+ char ***args, char ***envs,
const libxl__domain_build_state *state,
int *dm_state_fd)
/* dm_state_fd may be NULL iff caller knows we are using old stubdom
@@ -871,17 +1203,19 @@ static char ** libxl__build_device_model_args(libxl__gc *gc,
case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL:
return libxl__build_device_model_args_old(gc, dm,
guest_domid, guest_config,
+ args, envs,
state);
case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
assert(dm_state_fd != NULL);
assert(*dm_state_fd < 0);
return libxl__build_device_model_args_new(gc, dm,
guest_domid, guest_config,
+ args, envs,
state, dm_state_fd);
default:
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "unknown device model version %d",
guest_config->b_info.device_model_version);
- return NULL;
+ return ERROR_INVAL;
}
}
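
After this conversion, each builder reports failure through its int return value and hands the argument and environment vectors back through out-parameters, instead of overloading a NULL return. A minimal sketch of the new calling convention, with hypothetical standalone names mirroring the libxl__spawn_local_dm changes further down:

    #include <stdio.h>
    #include <stdlib.h>

    /* Sketch: rc carries the error, the vectors come back via out-params. */
    static int build_args(char ***args, char ***envs)
    {
        *args = calloc(3, sizeof(char *));
        *envs = calloc(3, sizeof(char *));
        if (!*args || !*envs) return -1;
        (*args)[0] = "qemu-system-i386";
        (*args)[1] = "-xen-domid";
        (*envs)[0] = "DISPLAY";
        (*envs)[1] = ":0";
        return 0;
    }

    int main(void)
    {
        char **args, **envs;
        int rc = build_args(&args, &envs);
        if (rc) {
            fprintf(stderr, "build failed: %d\n", rc);
            return 1;
        }
        printf("%s %s, %s=%s\n", args[0], args[1], envs[0], envs[1]);
        free(args);
        free(envs);
        return 0;
    }
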
@@ -954,7 +1288,7 @@ static int libxl__write_stub_dmargs(libxl__gc *gc,
i++;
}
dmargs_size++;
- dmargs = (char *) malloc(dmargs_size);
+ dmargs = (char *) libxl__malloc(gc, dmargs_size);
i = 1;
dmargs[0] = '\0';
while (args[i] != NULL) {
@@ -974,7 +1308,6 @@ retry_transaction:
if (!xs_transaction_end(ctx->xsh, t, 0))
if (errno == EAGAIN)
goto retry_transaction;
- free(dmargs);
return 0;
}
@@ -989,9 +1322,8 @@ static void stubdom_pvqemu_cb(libxl__egc *egc,
libxl__multidev *aodevs,
int rc);
-static void spaw_stubdom_pvqemu_destroy_cb(libxl__egc *egc,
- libxl__destroy_domid_state *dis,
- int rc);
+static void stubdom_xswait_cb(libxl__egc *egc, libxl__xswait_state *xswait,
+ int rc, const char *p);
char *libxl__stub_dm_name(libxl__gc *gc, const char *guest_name)
{
@@ -1040,7 +1372,8 @@ void libxl__spawn_stub_dm(libxl__egc *egc, libxl__stub_dm_spawn_state *sdss)
libxl_domain_build_info_init_type(&dm_config->b_info, LIBXL_DOMAIN_TYPE_PV);
dm_config->b_info.max_vcpus = 1;
- dm_config->b_info.max_memkb = 32 * 1024;
+ dm_config->b_info.max_memkb = 28 * 1024 +
+ guest_config->b_info.video_memkb;
dm_config->b_info.target_memkb = dm_config->b_info.max_memkb;
dm_config->b_info.u.pv.features = "";
@@ -1080,7 +1413,8 @@ void libxl__spawn_stub_dm(libxl__egc *egc, libxl__stub_dm_spawn_state *sdss)
stubdom_state->pv_ramdisk.path = "";
/* fixme: this function can leak the stubdom if it fails */
- ret = libxl__domain_make(gc, &dm_config->c_info, &sdss->pvqemu.guest_domid);
+ ret = libxl__domain_make(gc, dm_config, &sdss->pvqemu.guest_domid,
+ &stubdom_state->config);
if (ret)
goto out;
uint32_t dm_domid = sdss->pvqemu.guest_domid;
@@ -1088,9 +1422,10 @@ void libxl__spawn_stub_dm(libxl__egc *egc, libxl__stub_dm_spawn_state *sdss)
if (ret)
goto out;
- args = libxl__build_device_model_args(gc, "stubdom-dm", guest_domid,
- guest_config, d_state, NULL);
- if (!args) {
+ ret = libxl__build_device_model_args(gc, "stubdom-dm", guest_domid,
+ guest_config, &args, NULL,
+ d_state, NULL);
+ if (ret) {
ret = ERROR_FAIL;
goto out;
}
@@ -1121,9 +1456,10 @@ void libxl__spawn_stub_dm(libxl__egc *egc, libxl__stub_dm_spawn_state *sdss)
retry_transaction:
t = xs_transaction_start(ctx->xsh);
xs_mkdir(ctx->xsh, t,
- libxl__sprintf(gc, "/local/domain/0/device-model/%d", guest_domid));
+ libxl__device_model_xs_path(gc, dm_domid, guest_domid, ""));
xs_set_permissions(ctx->xsh, t,
- libxl__sprintf(gc, "/local/domain/0/device-model/%d", guest_domid),
+ libxl__device_model_xs_path(gc, dm_domid,
+ guest_domid, ""),
perm, ARRAY_SIZE(perm));
if (!xs_transaction_end(ctx->xsh, t, 0))
if (errno == EAGAIN)
@@ -1183,10 +1519,6 @@ static void spawn_stub_launch_dm(libxl__egc *egc,
num_console++;
console = libxl__calloc(gc, num_console, sizeof(libxl__device_console));
- if (!console) {
- ret = ERROR_NOMEM;
- goto out;
- }
for (i = 0; i < num_console; i++) {
libxl__device device;
@@ -1202,7 +1534,8 @@ static void spawn_stub_launch_dm(libxl__egc *egc,
case STUBDOM_CONSOLE_LOGGING:
name = libxl__sprintf(gc, "qemu-dm-%s",
libxl_domid_to_name(ctx, guest_domid));
- libxl_create_logfile(ctx, name, &filename);
+ ret = libxl_create_logfile(ctx, name, &filename);
+ if (ret) goto out;
console[i].output = libxl__sprintf(gc, "file:%s", filename);
free(filename);
break;
@@ -1273,6 +1606,8 @@ static void stubdom_pvqemu_cb(libxl__egc *egc,
STATE_AO_GC(sdss->dm.spawn.ao);
uint32_t dm_domid = sdss->pvqemu.guest_domid;
+ libxl__xswait_init(&sdss->xswait);
+
if (rc) {
LOGE(ERROR, "error connecting nics devices");
goto out;
@@ -1281,30 +1616,41 @@ static void stubdom_pvqemu_cb(libxl__egc *egc,
rc = libxl_domain_unpause(CTX, dm_domid);
if (rc) goto out;
+ sdss->xswait.ao = ao;
+ sdss->xswait.what = GCSPRINTF("Stubdom %u for %u startup",
+ dm_domid, sdss->dm.guest_domid);
+ sdss->xswait.path =
+ libxl__device_model_xs_path(gc, dm_domid, sdss->dm.guest_domid,
+ "/state");
+ sdss->xswait.timeout_ms = LIBXL_STUBDOM_START_TIMEOUT * 1000;
+ sdss->xswait.callback = stubdom_xswait_cb;
+ rc = libxl__xswait_start(gc, &sdss->xswait);
+ if (rc) goto out;
+
+ return;
+
out:
- if (rc) {
- if (dm_domid) {
- sdss->dis.ao = ao;
- sdss->dis.domid = dm_domid;
- sdss->dis.callback = spaw_stubdom_pvqemu_destroy_cb;
- libxl__destroy_domid(egc, &sdss->dis);
- return;
- }
- }
- sdss->callback(egc, &sdss->dm, rc);
+ stubdom_xswait_cb(egc, &sdss->xswait, rc, NULL);
}
-static void spaw_stubdom_pvqemu_destroy_cb(libxl__egc *egc,
- libxl__destroy_domid_state *dis,
- int rc)
+static void stubdom_xswait_cb(libxl__egc *egc, libxl__xswait_state *xswait,
+ int rc, const char *p)
{
- libxl__stub_dm_spawn_state *sdss = CONTAINER_OF(dis, *sdss, dis);
- STATE_AO_GC(sdss->dis.ao);
+ EGC_GC;
+ libxl__stub_dm_spawn_state *sdss = CONTAINER_OF(xswait, *sdss, xswait);
- if (rc)
- LOG(ERROR, "destruction of domain %u after failed creation failed",
- sdss->pvqemu.guest_domid);
+ if (rc) {
+ if (rc == ERROR_TIMEDOUT)
+ LOG(ERROR, "%s: startup timed out", xswait->what);
+ goto out;
+ }
+ if (!p) return;
+
+ if (strcmp(p, "running"))
+ return;
+ out:
+ libxl__xswait_stop(gc, xswait);
sdss->callback(egc, &sdss->dm, rc);
}
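
stubdom_xswait_cb fires on every change of the stubdom device model's .../state node and only completes once it reads "running" (or on error or timeout). For flavour, a simplified synchronous version of that wait against the public libxenstore watch API, minus the timeout handling; the path is an example, libxl computes the real one via libxl__device_model_xs_path():

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <xenstore.h>

    int main(void)
    {
        struct xs_handle *xsh = xs_open(0);
        const char *path = "/local/domain/0/device-model/1/state";

        if (!xsh || !xs_watch(xsh, path, "dm-state"))
            return 1;

        for (;;) {
            unsigned int num, len;
            char **ev = xs_read_watch(xsh, &num);   /* blocks for an event */
            char *val = xs_read(xsh, XBT_NULL, path, &len);
            free(ev);
            if (val && !strcmp(val, "running")) {
                free(val);
                break;
            }
            free(val);
        }
        xs_close(xsh);
        return 0;
    }
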
@@ -1312,7 +1658,8 @@ static void spaw_stubdom_pvqemu_destroy_cb(libxl__egc *egc,
static void device_model_confirm(libxl__egc *egc, libxl__spawn_state *spawn,
const char *xsdata);
static void device_model_startup_failed(libxl__egc *egc,
- libxl__spawn_state *spawn);
+ libxl__spawn_state *spawn,
+ int rc);
static void device_model_detached(libxl__egc *egc,
libxl__spawn_state *spawn);
@@ -1338,7 +1685,7 @@ void libxl__spawn_local_dm(libxl__egc *egc, libxl__dm_spawn_state *dmss)
char *path;
int logfile_w, null;
int rc;
- char **args, **arg;
+ char **args, **arg, **envs;
xs_transaction_t t;
char *vm_path;
char **pass_stuff;
@@ -1360,12 +1707,11 @@ void libxl__spawn_local_dm(libxl__egc *egc, libxl__dm_spawn_state *dmss)
rc = ERROR_FAIL;
goto out;
}
- args = libxl__build_device_model_args(gc, dm, domid, guest_config, state,
+ rc = libxl__build_device_model_args(gc, dm, domid, guest_config,
+ &args, &envs, state,
&dm_state_fd);
- if (!args) {
- rc = ERROR_FAIL;
+ if (rc)
goto out;
- }
if (b_info->type == LIBXL_DOMAIN_TYPE_HVM) {
path = xs_get_domain_path(ctx->xsh, domid);
@@ -1373,17 +1719,19 @@ void libxl__spawn_local_dm(libxl__egc *egc, libxl__dm_spawn_state *dmss)
libxl__sprintf(gc, "%s/hvmloader/bios", path),
"%s", libxl_bios_type_to_string(b_info->u.hvm.bios));
/* Disable relocating memory to make the MMIO hole larger
- * unless we're running qemu-traditional */
+ * unless we're running qemu-traditional and vNUMA is not
+ * configured. */
libxl__xs_write(gc, XBT_NULL,
libxl__sprintf(gc,
"%s/hvmloader/allow-memory-relocate",
path),
"%d",
- b_info->device_model_version==LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL);
+ b_info->device_model_version==LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL &&
+ !libxl__vnuma_configured(b_info));
free(path);
}
- path = libxl__sprintf(gc, "/local/domain/0/device-model/%d", domid);
+ path = libxl__device_model_xs_path(gc, LIBXL_TOOLSTACK_DOMID, domid, "");
xs_mkdir(ctx->xsh, XBT_NULL, path);
if (b_info->type == LIBXL_DOMAIN_TYPE_HVM &&
@@ -1430,9 +1778,15 @@ retry_transaction:
LIBXL__LOG(CTX, XTL_DEBUG, "Spawning device-model %s with arguments:", dm);
for (arg = args; *arg; arg++)
LIBXL__LOG(CTX, XTL_DEBUG, " %s", *arg);
+ if (*envs) {
+ LOG(DEBUG, "Spawning device-model %s with additional environment:", dm);
+ for (arg = envs; *arg; arg += 2)
+ LOG(DEBUG, " %s=%s", arg[0], arg[1]);
+ }
spawn->what = GCSPRINTF("domain %d device model", domid);
- spawn->xspath = GCSPRINTF("/local/domain/0/device-model/%d/state", domid);
+ spawn->xspath = libxl__device_model_xs_path(gc, LIBXL_TOOLSTACK_DOMID,
+ domid, "/state");
spawn->timeout_ms = LIBXL_DEVICE_MODEL_START_TIMEOUT * 1000;
spawn->pidpath = GCSPRINTF("%s/image/device-model-pid", dom_path);
spawn->midproc_cb = libxl__spawn_record_pid;
@@ -1445,7 +1799,7 @@ retry_transaction:
goto out_close;
if (!rc) { /* inner child */
setsid();
- libxl__exec(gc, null, logfile_w, logfile_w, dm, args, NULL);
+ libxl__exec(gc, null, logfile_w, logfile_w, dm, args, envs);
}
rc = 0;
@@ -1475,10 +1829,11 @@ static void device_model_confirm(libxl__egc *egc, libxl__spawn_state *spawn,
}
static void device_model_startup_failed(libxl__egc *egc,
- libxl__spawn_state *spawn)
+ libxl__spawn_state *spawn,
+ int rc)
{
libxl__dm_spawn_state *dmss = CONTAINER_OF(spawn, *dmss, spawn);
- device_model_spawn_outcome(egc, dmss, ERROR_FAIL);
+ device_model_spawn_outcome(egc, dmss, rc);
}
static void device_model_detached(libxl__egc *egc,
@@ -1520,7 +1875,7 @@ void libxl__spawn_qdisk_backend(libxl__egc *egc, libxl__dm_spawn_state *dmss)
flexarray_t *dm_args;
char **args;
const char *dm;
- int logfile_w, null, rc;
+ int logfile_w, null = -1, rc;
uint32_t domid = dmss->guest_domid;
/* Always use qemu-xen as device model */
@@ -1546,6 +1901,10 @@ void libxl__spawn_qdisk_backend(libxl__egc *egc, libxl__dm_spawn_state *dmss)
goto error;
}
null = open("/dev/null", O_RDONLY);
+ if (null < 0) {
+ rc = ERROR_FAIL;
+ goto error;
+ }
dmss->guest_config = NULL;
/*
@@ -1580,6 +1939,8 @@ void libxl__spawn_qdisk_backend(libxl__egc *egc, libxl__dm_spawn_state *dmss)
error:
assert(rc);
+ if (logfile_w >= 0) close(logfile_w);
+ if (null >= 0) close(null);
dmss->callback(egc, dmss, rc);
return;
}
@@ -1637,6 +1998,11 @@ out:
int libxl__destroy_device_model(libxl__gc *gc, uint32_t domid)
{
+ char *path = libxl__device_model_xs_path(gc, LIBXL_TOOLSTACK_DOMID,
+ domid, "");
+ if (!xs_rm(CTX->xsh, XBT_NULL, path))
+ LOG(ERROR, "xs_rm failed for %s", path);
+ /* We should try to destroy the device model anyway. */
return kill_device_model(gc,
GCSPRINTF("/local/domain/%d/image/device-model-pid", domid));
}
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 1d33a18..b514377 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -24,6 +24,7 @@
#include <xen/hvm/hvm_info_table.h>
#include <xen/hvm/hvm_xs_strings.h>
#include <xen/hvm/e820.h>
+#include <xen/errno.h>
libxl_domain_type libxl__domain_type(libxl__gc *gc, uint32_t domid)
{
@@ -42,23 +43,6 @@ libxl_domain_type libxl__domain_type(libxl__gc *gc, uint32_t domid)
return LIBXL_DOMAIN_TYPE_PV;
}
-int libxl__domain_shutdown_reason(libxl__gc *gc, uint32_t domid)
-{
- libxl_ctx *ctx = libxl__gc_owner(gc);
- xc_domaininfo_t info;
- int ret;
-
- ret = xc_domain_getinfolist(ctx->xch, domid, 1, &info);
- if (ret != 1)
- return -1;
- if (info.domain != domid)
- return -1;
- if (!(info.flags & XEN_DOMINF_shutdown))
- return -1;
-
- return (info.flags >> XEN_DOMINF_shutdownshift) & XEN_DOMINF_shutdownmask;
-}
-
int libxl__domain_cpupool(libxl__gc *gc, uint32_t domid)
{
xc_domaininfo_t info;
@@ -67,7 +51,7 @@ int libxl__domain_cpupool(libxl__gc *gc, uint32_t domid)
ret = xc_domain_getinfolist(CTX->xch, domid, 1, &info);
if (ret != 1)
{
- LOGE(ERROR, "getinfolist failed %d\n", ret);
+ LOGE(ERROR, "getinfolist failed %d", ret);
return ERROR_FAIL;
}
if (info.domain != domid)
@@ -142,6 +126,7 @@ static int numa_place_domain(libxl__gc *gc, uint32_t domid,
libxl__numa_candidate_init(&candidate);
libxl_bitmap_init(&cpupool_nodemap);
+ libxl_cpupoolinfo_init(&cpupool_info);
/*
* Extract the cpumap from the cpupool the domain belongs to. In fact,
* it only makes sense to consider the cpus/nodes that are in there
* for placement.
@@ -150,10 +135,10 @@ static int numa_place_domain(libxl__gc *gc, uint32_t domid,
*/
rc = cpupool = libxl__domain_cpupool(gc, domid);
if (rc < 0)
- return rc;
+ goto out;
rc = libxl_cpupool_info(CTX, &cpupool_info, cpupool);
if (rc)
- return rc;
+ goto out;
rc = libxl_domain_need_memory(CTX, info, &memkb);
if (rc)
@@ -263,6 +248,9 @@ static int hvm_set_viridian_features(libxl__gc *gc, uint32_t domid,
if (libxl_bitmap_test(&enlightenments, LIBXL_VIRIDIAN_ENLIGHTENMENT_TIME_REF_COUNT))
mask |= HVMPV_time_ref_count;
+ if (libxl_bitmap_test(&enlightenments, LIBXL_VIRIDIAN_ENLIGHTENMENT_REFERENCE_TSC))
+ mask |= HVMPV_reference_tsc;
+
if (mask != 0 &&
xc_hvm_param_set(CTX->xch,
domid,
@@ -297,6 +285,8 @@ static void hvm_set_conf_params(xc_interface *handle, uint32_t domid,
libxl_defbool_val(info->u.hvm.vpt_align));
xc_hvm_param_set(handle, domid, HVM_PARAM_NESTEDHVM,
libxl_defbool_val(info->u.hvm.nested_hvm));
+ xc_hvm_param_set(handle, domid, HVM_PARAM_ALTP2M,
+ libxl_defbool_val(info->u.hvm.altp2m));
}
int libxl__build_pre(libxl__gc *gc, uint32_t domid,
@@ -327,53 +317,60 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
* reflect the placement result if that is the case
*/
if (libxl_defbool_val(info->numa_placement)) {
- libxl_bitmap cpumap_soft;
-
- if (info->cpumap.size ||
- info->num_vcpu_hard_affinity || info->num_vcpu_soft_affinity) {
- LOG(ERROR, "Can run NUMA placement only if no vcpu "
- "(hard or soft) affinity is specified explicitly");
- return ERROR_INVAL;
- }
- if (info->nodemap.size) {
- LOG(ERROR, "Can run NUMA placement only if the domain does not "
- "have any NUMA node affinity set already");
- return ERROR_INVAL;
- }
-
- rc = libxl_node_bitmap_alloc(ctx, &info->nodemap, 0);
- if (rc)
- return rc;
- libxl_bitmap_set_any(&info->nodemap);
+ if (info->cpumap.size || info->num_vcpu_hard_affinity ||
+ info->num_vcpu_soft_affinity)
+ LOG(WARN, "Can't run NUMA placement, as an (hard or soft) "
+ "affinity has been specified explicitly");
+ else if (info->nodemap.size)
+ LOG(WARN, "Can't run NUMA placement, as the domain has "
+ "NUMA node affinity set already");
+ else {
+ libxl_bitmap cpumap_soft;
+
+ rc = libxl_node_bitmap_alloc(ctx, &info->nodemap, 0);
+ if (rc)
+ return rc;
+ libxl_bitmap_set_any(&info->nodemap);
+
+ rc = libxl_cpu_bitmap_alloc(ctx, &cpumap_soft, 0);
+ if (rc)
+ return rc;
+
+ rc = numa_place_domain(gc, domid, info);
+ if (rc) {
+ libxl_bitmap_dispose(&cpumap_soft);
+ return rc;
+ }
- rc = libxl_cpu_bitmap_alloc(ctx, &cpumap_soft, 0);
- if (rc)
- return rc;
+ /*
+ * All we need to do now is convert the result of automatic
+ * placement from a nodemap to a cpumap, and then use that
+ * cpumap as the soft affinity for all the vcpus of the domain.
+ *
+ * When calling libxl_set_vcpuaffinity_all(), it is OK to pass
+ * NULL as the hard affinity: we know we don't have one, or we
+ * wouldn't be here.
+ */
+ libxl_nodemap_to_cpumap(ctx, &info->nodemap, &cpumap_soft);
+ libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus,
+ NULL, &cpumap_soft);
- rc = numa_place_domain(gc, domid, info);
- if (rc) {
libxl_bitmap_dispose(&cpumap_soft);
- return rc;
- }
-
- /*
- * All we need to do now is converting the result of automatic
- * placement from nodemap to cpumap, and then use such cpumap as
- * the soft affinity for all the vcpus of the domain.
- *
- * When calling libxl_set_vcpuaffinity_all(), it is ok to use NULL
- * as hard affinity, as we know we don't have one, or we won't be
- * here.
- */
- libxl_nodemap_to_cpumap(ctx, &info->nodemap, &cpumap_soft);
- libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus,
- NULL, &cpumap_soft);
- libxl_bitmap_dispose(&cpumap_soft);
+ /*
+ * Placement has run, so prevent it from being re-run if the
+ * config we are using and building here is ever re-used.
+ * This means that people re-using configs will get the same
+ * results, consistently, across every re-use, which is what
+ * we expect most people to want.
+ */
+ libxl_defbool_set(&info->numa_placement, false);
+ }
}
+
if (info->nodemap.size)
libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap);
- /* As mentioned in libxl.h, vcpu_hard_array takes precedence */
+
if (info->num_vcpu_hard_affinity || info->num_vcpu_soft_affinity) {
libxl_bitmap *hard_affinity, *soft_affinity;
int i, n_vcpus;
@@ -436,6 +433,49 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
return rc;
}
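
The surviving placement branch above ends by projecting the chosen nodemap onto a cpumap and installing it as soft affinity. In terms of public libxl calls only, that final step looks roughly like this (a sketch, not the in-tree helper; error handling trimmed, ctx/domid assumed set up):

    #include <libxl.h>
    #include <libxl_utils.h>

    /* Project a node affinity onto cpus and set it as soft affinity. */
    static int apply_soft_affinity(libxl_ctx *ctx, uint32_t domid,
                                   libxl_bitmap *nodemap,
                                   unsigned int max_vcpus)
    {
        libxl_bitmap cpumap_soft;
        int rc;

        rc = libxl_cpu_bitmap_alloc(ctx, &cpumap_soft, 0);
        if (rc) return rc;

        libxl_nodemap_to_cpumap(ctx, nodemap, &cpumap_soft);
        /* NULL hard affinity: none was configured, or placement
         * would not have run at all. */
        rc = libxl_set_vcpuaffinity_all(ctx, domid, max_vcpus,
                                        NULL, &cpumap_soft);
        libxl_bitmap_dispose(&cpumap_soft);
        return rc;
    }
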
+static int set_vnuma_affinity(libxl__gc *gc, uint32_t domid,
+ libxl_domain_build_info *info)
+{
+ libxl_bitmap cpumap;
+ libxl_vnode_info *v;
+ unsigned int i, j;
+ int rc = 0;
+
+ libxl_bitmap_init(&cpumap);
+
+ rc = libxl_cpu_bitmap_alloc(CTX, &cpumap, 0);
+ if (rc) {
+ LOG(ERROR, "Can't allocate nodemap");
+ goto out;
+ }
+
+ /*
+ * For each vcpu in each vnode, set its soft affinity to
+ * the pcpus belonging to the pnode the vnode is on
+ */
+ for (i = 0; i < info->num_vnuma_nodes; i++) {
+ v = &info->vnuma_nodes[i];
+
+ rc = libxl_node_to_cpumap(CTX, v->pnode, &cpumap);
+ if (rc) {
+ LOG(ERROR, "Can't get cpumap for vnode %d", i);
+ goto out;
+ }
+
+ libxl_for_each_set_bit(j, v->vcpus) {
+ rc = libxl_set_vcpuaffinity(CTX, domid, j, NULL, &cpumap);
+ if (rc) {
+ LOG(ERROR, "Can't set cpu affinity for %d", j);
+ goto out;
+ }
+ }
+ }
+
+out:
+ libxl_bitmap_dispose(&cpumap);
+ return rc;
+}
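
For each vnode, the loop above resolves the backing pnode to a cpumap and soft-pins the vnode's vcpus to it. A condensed restatement of the per-vcpu step, using only public API calls (illustrative helper, not the in-tree code, which iterates v->vcpus):

    #include <libxl.h>
    #include <libxl_utils.h>

    /* Soft-pin one vcpu to the pcpus of one physical NUMA node. */
    static int soft_pin_vcpu(libxl_ctx *ctx, uint32_t domid,
                             uint32_t vcpu, int pnode)
    {
        libxl_bitmap cpumap;
        int rc;

        libxl_bitmap_init(&cpumap);
        rc = libxl_cpu_bitmap_alloc(ctx, &cpumap, 0);
        if (rc) return rc;

        rc = libxl_node_to_cpumap(ctx, pnode, &cpumap);
        if (!rc)
            /* NULL hard affinity: only the soft map is being set. */
            rc = libxl_set_vcpuaffinity(ctx, domid, vcpu, NULL, &cpumap);

        libxl_bitmap_dispose(&cpumap);
        return rc;
    }
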
+
int libxl__build_post(libxl__gc *gc, uint32_t domid,
libxl_domain_build_info *info,
libxl__domain_build_state *state,
@@ -447,6 +487,12 @@ int libxl__build_post(libxl__gc *gc, uint32_t domid,
char **ents;
int i, rc;
+ if (info->num_vnuma_nodes && !info->num_vcpu_soft_affinity) {
+ rc = set_vnuma_affinity(gc, domid, info);
+ if (rc)
+ return rc;
+ }
+
rc = libxl_domain_sched_params_set(CTX, domid, &info->sched_params);
if (rc)
return rc;
@@ -512,6 +558,51 @@ retry_transaction:
return 0;
}
+static int set_vnuma_info(libxl__gc *gc, uint32_t domid,
+ const libxl_domain_build_info *info,
+ const libxl__domain_build_state *state)
+{
+ int rc = 0;
+ unsigned int i, nr_vdistance;
+ unsigned int *vcpu_to_vnode, *vnode_to_pnode, *vdistance = NULL;
+
+ vcpu_to_vnode = libxl__calloc(gc, info->max_vcpus,
+ sizeof(unsigned int));
+ vnode_to_pnode = libxl__calloc(gc, info->num_vnuma_nodes,
+ sizeof(unsigned int));
+
+ nr_vdistance = info->num_vnuma_nodes * info->num_vnuma_nodes;
+ vdistance = libxl__calloc(gc, nr_vdistance, sizeof(unsigned int));
+
+ for (i = 0; i < info->num_vnuma_nodes; i++) {
+ libxl_vnode_info *v = &info->vnuma_nodes[i];
+ int j;
+
+ /* vnode to pnode mapping */
+ vnode_to_pnode[i] = v->pnode;
+
+ /* vcpu to vnode mapping */
+ libxl_for_each_set_bit(j, v->vcpus)
+ vcpu_to_vnode[j] = i;
+
+ /* node distances */
+ assert(info->num_vnuma_nodes == v->num_distances);
+ memcpy(vdistance + (i * info->num_vnuma_nodes),
+ v->distances,
+ v->num_distances * sizeof(unsigned int));
+ }
+
+ if (xc_domain_setvnuma(CTX->xch, domid, info->num_vnuma_nodes,
+ state->num_vmemranges, info->max_vcpus,
+ state->vmemranges, vdistance,
+ vcpu_to_vnode, vnode_to_pnode) < 0) {
+ LOGE(ERROR, "xc_domain_setvnuma failed");
+ rc = ERROR_FAIL;
+ }
+
+ return rc;
+}
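
The distance handling above is plain row-major flattening: vdistance[i * n + j] holds the distance from vnode i to vnode j, as xc_domain_setvnuma() expects. A tiny self-contained illustration with made-up distances:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        enum { NR = 2 };
        /* Hypothetical per-vnode distance rows: local 10, remote 20. */
        unsigned int d0[NR] = { 10, 20 }, d1[NR] = { 20, 10 };
        unsigned int *rows[NR] = { d0, d1 };
        unsigned int vdistance[NR * NR];

        for (int i = 0; i < NR; i++)
            memcpy(vdistance + i * NR, rows[i], NR * sizeof(unsigned int));

        /* vdistance[i * NR + j] is the distance from vnode i to vnode j. */
        for (int i = 0; i < NR; i++)
            for (int j = 0; j < NR; j++)
                printf("d(%d,%d)=%u\n", i, j, vdistance[i * NR + j]);
        return 0;
    }
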
+
int libxl__build_pv(libxl__gc *gc, uint32_t domid,
libxl_domain_build_info *info, libxl__domain_build_state *state)
{
@@ -569,6 +660,38 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
dom->xenstore_domid = state->store_domid;
dom->claim_enabled = libxl_defbool_val(info->claim_mode);
+ if (info->num_vnuma_nodes != 0) {
+ unsigned int i;
+
+ ret = libxl__vnuma_build_vmemrange_pv(gc, domid, info, state);
+ if (ret) {
+ LOGE(ERROR, "cannot build vmemranges");
+ goto out;
+ }
+ ret = libxl__vnuma_config_check(gc, info, state);
+ if (ret) goto out;
+
+ ret = set_vnuma_info(gc, domid, info, state);
+ if (ret) goto out;
+
+ dom->nr_vmemranges = state->num_vmemranges;
+ dom->vmemranges = xc_dom_malloc(dom, sizeof(*dom->vmemranges) *
+ dom->nr_vmemranges);
+
+ for (i = 0; i < dom->nr_vmemranges; i++) {
+ dom->vmemranges[i].start = state->vmemranges[i].start;
+ dom->vmemranges[i].end = state->vmemranges[i].end;
+ dom->vmemranges[i].flags = state->vmemranges[i].flags;
+ dom->vmemranges[i].nid = state->vmemranges[i].nid;
+ }
+
+ dom->nr_vnodes = info->num_vnuma_nodes;
+ dom->vnode_to_pnode = xc_dom_malloc(dom, sizeof(*dom->vnode_to_pnode) *
+ dom->nr_vnodes);
+ for (i = 0; i < info->num_vnuma_nodes; i++)
+ dom->vnode_to_pnode[i] = info->vnuma_nodes[i].pnode;
+ }
+
if ( (ret = xc_dom_boot_xen_init(dom, ctx->xch, domid)) != 0 ) {
LOGE(ERROR, "xc_dom_boot_xen_init failed");
goto out;
@@ -583,7 +706,7 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
LOGE(ERROR, "xc_dom_parse_image failed");
goto out;
}
- if ( (ret = libxl__arch_domain_init_hw_description(gc, info, dom)) != 0 ) {
+ if ( (ret = libxl__arch_domain_init_hw_description(gc, info, state, dom)) != 0 ) {
LOGE(ERROR, "libxl__arch_domain_init_hw_description failed");
goto out;
}
@@ -644,7 +767,7 @@ static int hvm_build_set_params(xc_interface *handle, uint32_t domid,
XC_PAGE_SIZE, PROT_READ | PROT_WRITE,
HVM_INFO_PFN);
if (va_map == NULL)
- return -1;
+ return ERROR_FAIL;
va_hvm = (struct hvm_info_table *)(va_map + HVM_INFO_OFFSET);
va_hvm->apic_mode = libxl_defbool_val(info->u.hvm.apic);
@@ -785,12 +908,14 @@ out:
}
int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
- libxl_domain_build_info *info,
+ libxl_domain_config *d_config,
libxl__domain_build_state *state)
{
libxl_ctx *ctx = libxl__gc_owner(gc);
struct xc_hvm_build_args args = {};
- int ret, rc = ERROR_FAIL;
+ int ret, rc;
+ uint64_t mmio_start, lowmem_end, highmem_end;
+ libxl_domain_build_info *const info = &d_config->b_info;
memset(&args, 0, sizeof(struct xc_hvm_build_args));
/* The params from the configuration file are in Mb, which are then
@@ -808,34 +933,98 @@ int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
if (max_ram_below_4g < HVM_BELOW_4G_MMIO_START)
args.mmio_size = info->u.hvm.mmio_hole_memkb << 10;
}
- if (libxl__domain_firmware(gc, info, &args)) {
+
+ rc = libxl__domain_firmware(gc, info, &args);
+ if (rc != 0) {
LOG(ERROR, "initializing domain firmware failed");
goto out;
}
+ if (args.mem_target == 0)
+ args.mem_target = args.mem_size;
+ if (args.mmio_size == 0)
+ args.mmio_size = HVM_BELOW_4G_MMIO_LENGTH;
+ lowmem_end = args.mem_size;
+ highmem_end = 0;
+ mmio_start = (1ull << 32) - args.mmio_size;
+ if (lowmem_end > mmio_start)
+ {
+ highmem_end = (1ull << 32) + (lowmem_end - mmio_start);
+ lowmem_end = mmio_start;
+ }
+ args.lowmem_end = lowmem_end;
+ args.highmem_end = highmem_end;
+ args.mmio_start = mmio_start;
+
+ rc = libxl__domain_device_construct_rdm(gc, d_config,
+ info->u.hvm.rdm_mem_boundary_memkb*1024,
+ &args);
+ if (rc) {
+ LOG(ERROR, "checking reserved device memory failed");
+ goto out;
+ }
+
+ if (info->num_vnuma_nodes != 0) {
+ int i;
+
+ rc = libxl__vnuma_build_vmemrange_hvm(gc, domid, info, state, &args);
+ if (rc != 0) {
+ LOG(ERROR, "hvm build vmemranges failed");
+ goto out;
+ }
+ rc = libxl__vnuma_config_check(gc, info, state);
+ if (rc != 0) goto out;
+ rc = set_vnuma_info(gc, domid, info, state);
+ if (rc != 0) goto out;
+
+ args.nr_vmemranges = state->num_vmemranges;
+ args.vmemranges = libxl__malloc(gc, sizeof(*args.vmemranges) *
+ args.nr_vmemranges);
+
+ for (i = 0; i < args.nr_vmemranges; i++) {
+ args.vmemranges[i].start = state->vmemranges[i].start;
+ args.vmemranges[i].end = state->vmemranges[i].end;
+ args.vmemranges[i].flags = state->vmemranges[i].flags;
+ args.vmemranges[i].nid = state->vmemranges[i].nid;
+ }
+
+ args.nr_vnodes = info->num_vnuma_nodes;
+ args.vnode_to_pnode = libxl__malloc(gc, sizeof(*args.vnode_to_pnode) *
+ args.nr_vnodes);
+ for (i = 0; i < args.nr_vnodes; i++)
+ args.vnode_to_pnode[i] = info->vnuma_nodes[i].pnode;
+ }
ret = xc_hvm_build(ctx->xch, domid, &args);
if (ret) {
LOGEV(ERROR, ret, "hvm building failed");
+ rc = ERROR_FAIL;
goto out;
}
- ret = hvm_build_set_params(ctx->xch, domid, info, state->store_port,
+ rc = libxl__arch_domain_construct_memmap(gc, d_config, domid, &args);
+ if (rc != 0) {
+ LOG(ERROR, "setting domain memory map failed");
+ goto out;
+ }
+
+ rc = hvm_build_set_params(ctx->xch, domid, info, state->store_port,
&state->store_mfn, state->console_port,
&state->console_mfn, state->store_domid,
state->console_domid);
- if (ret) {
- LOGEV(ERROR, ret, "hvm build set params failed");
+ if (rc != 0) {
+ LOG(ERROR, "hvm build set params failed");
goto out;
}
- ret = hvm_build_set_xs_values(gc, domid, &args);
- if (ret) {
- LOG(ERROR, "hvm build set xenstore values failed (ret=%d)", ret);
+ rc = hvm_build_set_xs_values(gc, domid, &args);
+ if (rc != 0) {
+ LOG(ERROR, "hvm build set xenstore values failed");
goto out;
}
return 0;
out:
+ assert(rc != 0);
return rc;
}
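
The lowmem/highmem computation near the top of libxl__build_hvm carves the MMIO hole out of the top of the 32-bit address space and pushes any overlapping RAM above 4GiB. A worked example of that arithmetic with illustrative inputs (a 6GiB guest and a 256MiB hole):

    #include <stdio.h>
    #include <inttypes.h>

    int main(void)
    {
        const uint64_t GiB = 1ull << 30;
        uint64_t mem_size = 6 * GiB;        /* example guest size */
        uint64_t mmio_size = 256ull << 20;  /* example hole size */

        uint64_t lowmem_end = mem_size;
        uint64_t highmem_end = 0;
        uint64_t mmio_start = (1ull << 32) - mmio_size;

        if (lowmem_end > mmio_start) {
            highmem_end = (1ull << 32) + (lowmem_end - mmio_start);
            lowmem_end = mmio_start;
        }

        /* lowmem_end = 0xf0000000, highmem_end = 0x190000000 here */
        printf("lowmem_end=%#" PRIx64 " mmio_start=%#" PRIx64
               " highmem_end=%#" PRIx64 "\n",
               lowmem_end, mmio_start, highmem_end);
        return 0;
    }
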
@@ -843,91 +1032,87 @@ int libxl__qemu_traditional_cmd(libxl__gc *gc, uint32_t domid,
const char *cmd)
{
char *path = NULL;
- path = GCSPRINTF("/local/domain/0/device-model/%d/command", domid);
+ uint32_t dm_domid = libxl_get_stubdom_id(CTX, domid);
+ path = libxl__device_model_xs_path(gc, dm_domid, domid, "/command");
return libxl__xs_write(gc, XBT_NULL, path, "%s", cmd);
}
-struct libxl__physmap_info {
- uint64_t phys_offset;
- uint64_t start_addr;
- uint64_t size;
- uint32_t namelen;
- char name[];
-};
+/*
+ * Inspect the buffer between start and end, and return a pointer to the
+ * character following the NUL terminator of start, or NULL if start is not
+ * terminated before end.
+ */
+static const char *next_string(const char *start, const char *end)
+{
+ if (start >= end) return NULL;
-#define TOOLSTACK_SAVE_VERSION 1
+ size_t total_len = end - start;
+ size_t len = strnlen(start, total_len);
-static inline char *restore_helper(libxl__gc *gc, uint32_t domid,
- uint64_t phys_offset, char *node)
-{
- return GCSPRINTF("/local/domain/0/device-model/%d/physmap/%"PRIx64"/%s",
- domid, phys_offset, node);
+ if (len == total_len)
+ return NULL;
+ else
+ return start + len + 1;
}
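
next_string() makes it safe to walk the record's NUL-separated strings without running off the end of the buffer. A quick standalone demonstration, reusing the helper verbatim on a made-up key/value buffer:

    #include <stdio.h>
    #include <string.h>

    static const char *next_string(const char *start, const char *end)
    {
        if (start >= end) return NULL;
        size_t total_len = end - start;
        size_t len = strnlen(start, total_len);
        return len == total_len ? NULL : start + len + 1;
    }

    int main(void)
    {
        /* key\0val\0key\0val\0, as in the emulator xenstore record */
        const char buf[] =
            "physmap/0/size\0" "0x800000\0" "physmap/0/name\0" "vram";
        const char *end = buf + sizeof(buf);
        const char *p = buf;

        while (p < end) {
            const char *key = p, *val;
            p = next_string(p, end);
            if (!p) break;
            val = p;
            p = next_string(p, end);
            if (!p) break;
            printf("%s = %s\n", key, val);
        }
        return 0;
    }
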
-int libxl__toolstack_restore(uint32_t domid, const uint8_t *buf,
- uint32_t size, void *user)
+int libxl__restore_emulator_xenstore_data(libxl__domain_create_state *dcs,
+ const char *ptr, uint32_t size)
{
- libxl__save_helper_state *shs = user;
- libxl__domain_create_state *dcs = CONTAINER_OF(shs, *dcs, shs);
STATE_AO_GC(dcs->ao);
- int i, ret;
- const uint8_t *ptr = buf;
- uint32_t count = 0, version = 0;
- struct libxl__physmap_info* pi;
- char *xs_path;
+ const char *next = ptr, *end = ptr + size, *key, *val;
+ int rc;
- LOG(DEBUG,"domain=%"PRIu32" toolstack data size=%"PRIu32, domid, size);
+ const uint32_t domid = dcs->guest_domid;
+ const uint32_t dm_domid = libxl_get_stubdom_id(CTX, domid);
+ const char *xs_root = libxl__device_model_xs_path(gc, dm_domid, domid, "");
- if (size < sizeof(version) + sizeof(count)) {
- LOG(ERROR, "wrong size");
- return -1;
- }
+ while (next < end) {
+ key = next;
+ next = next_string(next, end);
- memcpy(&version, ptr, sizeof(version));
- ptr += sizeof(version);
+ /* Sanitise 'key'. */
+ if (!next) {
+ rc = ERROR_FAIL;
+ LOG(ERROR, "Key in xenstore data not NUL terminated");
+ goto out;
+ }
+ if (key[0] == '\0') {
+ rc = ERROR_FAIL;
+ LOG(ERROR, "empty key found in xenstore data");
+ goto out;
+ }
+ if (key[0] == '/') {
+ rc = ERROR_FAIL;
+ LOG(ERROR, "Key in xenstore data not relative");
+ goto out;
+ }
- if (version != TOOLSTACK_SAVE_VERSION) {
- LOG(ERROR, "wrong version");
- return -1;
- }
+ val = next;
+ next = next_string(next, end);
- memcpy(&count, ptr, sizeof(count));
- ptr += sizeof(count);
+ /* Sanitise 'val'. */
+ if (!next) {
+ rc = ERROR_FAIL;
+ LOG(ERROR, "Val in xenstore data not NUL terminated");
+ goto out;
+ }
- if (size < sizeof(version) + sizeof(count) +
- count * (sizeof(struct libxl__physmap_info))) {
- LOG(ERROR, "wrong size");
- return -1;
+ libxl__xs_write(gc, XBT_NULL,
+ GCSPRINTF("%s/%s", xs_root, key), "%s", val);
}
- for (i = 0; i < count; i++) {
- pi = (struct libxl__physmap_info*) ptr;
- ptr += sizeof(struct libxl__physmap_info) + pi->namelen;
+ rc = 0;
- xs_path = restore_helper(gc, domid, pi->phys_offset, "start_addr");
- ret = libxl__xs_write(gc, 0, xs_path, "%"PRIx64, pi->start_addr);
- if (ret)
- return -1;
- xs_path = restore_helper(gc, domid, pi->phys_offset, "size");
- ret = libxl__xs_write(gc, 0, xs_path, "%"PRIx64, pi->size);
- if (ret)
- return -1;
- if (pi->namelen > 0) {
- xs_path = restore_helper(gc, domid, pi->phys_offset, "name");
- ret = libxl__xs_write(gc, 0, xs_path, "%s", pi->name);
- if (ret)
- return -1;
- }
- }
- return 0;
+ out:
+ return rc;
}
/*==================== Domain suspend (save) ====================*/
-static void domain_suspend_done(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int rc);
-static void domain_suspend_callback_common_done(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int ok);
+static void stream_done(libxl__egc *egc,
+ libxl__stream_write_state *sws, int rc);
+static void domain_save_done(libxl__egc *egc,
+ libxl__domain_suspend_state *dss, int rc);
/*----- complicated callback, called by xc_domain_save -----*/
@@ -940,11 +1125,12 @@ static void domain_suspend_callback_common_done(libxl__egc *egc,
*/
static void switch_logdirty_timeout(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs);
+ const struct timeval *requested_abs,
+ int rc);
static void switch_logdirty_xswatch(libxl__egc *egc, libxl__ev_xswatch*,
const char *watch_path, const char *event_path);
static void switch_logdirty_done(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int ok);
+ libxl__domain_suspend_state *dss, int rc);
static void logdirty_init(libxl__logdirty_switch *lds)
{
@@ -958,7 +1144,7 @@ static void domain_suspend_switch_qemu_xen_traditional_logdirty
libxl__save_helper_state *shs)
{
libxl__egc *egc = shs->egc;
- libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+ libxl__domain_suspend_state *dss = shs->caller_state;
libxl__logdirty_switch *lds = &dss->logdirty;
STATE_AO_GC(dss->ao);
int rc;
@@ -966,10 +1152,11 @@ static void domain_suspend_switch_qemu_xen_traditional_logdirty
const char *got;
if (!lds->cmd_path) {
- lds->cmd_path = GCSPRINTF(
- "/local/domain/0/device-model/%u/logdirty/cmd", domid);
- lds->ret_path = GCSPRINTF(
- "/local/domain/0/device-model/%u/logdirty/ret", domid);
+ uint32_t dm_domid = libxl_get_stubdom_id(CTX, domid);
+ lds->cmd_path = libxl__device_model_xs_path(gc, dm_domid, domid,
+ "/logdirty/cmd");
+ lds->ret_path = libxl__device_model_xs_path(gc, dm_domid, domid,
+ "/logdirty/ret");
}
lds->cmd = enable ? "enable" : "disable";
@@ -977,7 +1164,7 @@ static void domain_suspend_switch_qemu_xen_traditional_logdirty
switch_logdirty_xswatch, lds->ret_path);
if (rc) goto out;
- rc = libxl__ev_time_register_rel(gc, &lds->timeout,
+ rc = libxl__ev_time_register_rel(ao, &lds->timeout,
switch_logdirty_timeout, 10*1000);
if (rc) goto out;
@@ -1019,9 +1206,9 @@ static void domain_suspend_switch_qemu_xen_traditional_logdirty
return;
out:
- LOG(ERROR,"logdirty switch failed (rc=%d), aborting suspend",rc);
+ LOG(ERROR,"logdirty switch failed (rc=%d), abandoning suspend",rc);
libxl__xs_transaction_abort(gc, &t);
- switch_logdirty_done(egc,dss,-1);
+ switch_logdirty_done(egc,dss,rc);
}
static void domain_suspend_switch_qemu_xen_logdirty
@@ -1029,7 +1216,7 @@ static void domain_suspend_switch_qemu_xen_logdirty
libxl__save_helper_state *shs)
{
libxl__egc *egc = shs->egc;
- libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+ libxl__domain_suspend_state *dss = shs->caller_state;
STATE_AO_GC(dss->ao);
int rc;
@@ -1037,7 +1224,8 @@ static void domain_suspend_switch_qemu_xen_logdirty
if (!rc) {
libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
} else {
- LOG(ERROR,"logdirty switch failed (rc=%d), aborting suspend",rc);
+ LOG(ERROR,"logdirty switch failed (rc=%d), abandoning suspend",rc);
+ dss->rc = rc;
libxl__xc_domain_saverestore_async_callback_done(egc, shs, -1);
}
}
@@ -1047,7 +1235,7 @@ void libxl__domain_suspend_common_switch_qemu_logdirty
{
libxl__save_helper_state *shs = user;
libxl__egc *egc = shs->egc;
- libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+ libxl__domain_suspend_state *dss = shs->caller_state;
STATE_AO_GC(dss->ao);
switch (libxl__device_model_version_running(gc, domid)) {
@@ -1059,17 +1247,19 @@ void libxl__domain_suspend_common_switch_qemu_logdirty
break;
default:
LOG(ERROR,"logdirty switch failed"
- ", no valid device model version found, aborting suspend");
+ ", no valid device model version found, abandoning suspend");
+ dss->rc = ERROR_FAIL;
libxl__xc_domain_saverestore_async_callback_done(egc, shs, -1);
}
}
static void switch_logdirty_timeout(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs)
+ const struct timeval *requested_abs,
+ int rc)
{
libxl__domain_suspend_state *dss = CONTAINER_OF(ev, *dss, logdirty.timeout);
STATE_AO_GC(dss->ao);
LOG(ERROR,"logdirty switch: wait for device model timed out");
- switch_logdirty_done(egc,dss,-1);
+ switch_logdirty_done(egc,dss,ERROR_FAIL);
}
static void switch_logdirty_xswatch(libxl__egc *egc, libxl__ev_xswatch *watch,
@@ -1121,17 +1311,16 @@ static void switch_logdirty_xswatch(libxl__egc *egc, libxl__ev_xswatch *watch,
*/
libxl__xs_transaction_abort(gc, &t);
- if (!rc) {
- switch_logdirty_done(egc,dss,0);
- } else if (rc < 0) {
- LOG(ERROR,"logdirty switch: failed (rc=%d)",rc);
- switch_logdirty_done(egc,dss,-1);
+ if (rc <= 0) {
+ if (rc < 0)
+ LOG(ERROR,"logdirty switch: failed (rc=%d)",rc);
+ switch_logdirty_done(egc,dss,rc);
}
}
static void switch_logdirty_done(libxl__egc *egc,
libxl__domain_suspend_state *dss,
- int broke)
+ int rc)
{
STATE_AO_GC(dss->ao);
libxl__logdirty_switch *lds = &dss->logdirty;
@@ -1139,453 +1328,81 @@ static void switch_logdirty_done(libxl__egc *egc,
libxl__ev_xswatch_deregister(gc, &lds->watch);
libxl__ev_time_deregister(gc, &lds->timeout);
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, broke);
-}
-
-/*----- callbacks, called by xc_domain_save -----*/
-
-int libxl__domain_suspend_device_model(libxl__gc *gc,
- libxl__domain_suspend_state *dss)
-{
- int ret = 0;
- uint32_t const domid = dss->domid;
- const char *const filename = dss->dm_savefile;
-
- switch (libxl__device_model_version_running(gc, domid)) {
- case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: {
- LOG(DEBUG, "Saving device model state to %s", filename);
- libxl__qemu_traditional_cmd(gc, domid, "save");
- libxl__wait_for_device_model_deprecated(gc, domid, "paused", NULL, NULL, NULL);
- break;
- }
- case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
- if (libxl__qmp_stop(gc, domid))
- return ERROR_FAIL;
- /* Save DM state into filename */
- ret = libxl__qmp_save(gc, domid, filename);
- if (ret)
- unlink(filename);
- break;
- default:
- return ERROR_INVAL;
- }
-
- return ret;
-}
-
-int libxl__domain_resume_device_model(libxl__gc *gc, uint32_t domid)
-{
-
- switch (libxl__device_model_version_running(gc, domid)) {
- case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: {
- libxl__qemu_traditional_cmd(gc, domid, "continue");
- libxl__wait_for_device_model_deprecated(gc, domid, "running", NULL, NULL, NULL);
- break;
- }
- case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
- if (libxl__qmp_resume(gc, domid))
- return ERROR_FAIL;
- break;
- default:
- return ERROR_INVAL;
- }
-
- return 0;
-}
-
-static void domain_suspend_common_wait_guest(libxl__egc *egc,
- libxl__domain_suspend_state *dss);
-static void domain_suspend_common_guest_suspended(libxl__egc *egc,
- libxl__domain_suspend_state *dss);
-
-static void domain_suspend_common_pvcontrol_suspending(libxl__egc *egc,
- libxl__xswait_state *xswa, int rc, const char *state);
-static void domain_suspend_common_wait_guest_evtchn(libxl__egc *egc,
- libxl__ev_evtchn *evev);
-static void suspend_common_wait_guest_watch(libxl__egc *egc,
- libxl__ev_xswatch *xsw, const char *watch_path, const char *event_path);
-static void suspend_common_wait_guest_check(libxl__egc *egc,
- libxl__domain_suspend_state *dss);
-static void suspend_common_wait_guest_timeout(libxl__egc *egc,
- libxl__ev_time *ev, const struct timeval *requested_abs);
-
-static void domain_suspend_common_failed(libxl__egc *egc,
- libxl__domain_suspend_state *dss);
-static void domain_suspend_common_done(libxl__egc *egc,
- libxl__domain_suspend_state *dss,
- bool ok);
-
-static bool domain_suspend_pvcontrol_acked(const char *state) {
- /* any value other than "suspend", including ENOENT (i.e. !state), is OK */
- if (!state) return 1;
- return strcmp(state,"suspend");
-}
-
-/* calls dss->callback_common_done when done */
-static void domain_suspend_callback_common(libxl__egc *egc,
- libxl__domain_suspend_state *dss)
-{
- STATE_AO_GC(dss->ao);
- uint64_t hvm_s_state = 0, hvm_pvdrv = 0;
- int ret, rc;
-
- /* Convenience aliases */
- const uint32_t domid = dss->domid;
-
- if (dss->hvm) {
- xc_hvm_param_get(CTX->xch, domid, HVM_PARAM_CALLBACK_IRQ, &hvm_pvdrv);
- xc_hvm_param_get(CTX->xch, domid, HVM_PARAM_ACPI_S_STATE, &hvm_s_state);
- }
-
- if ((hvm_s_state == 0) && (dss->guest_evtchn.port >= 0)) {
- LOG(DEBUG, "issuing %s suspend request via event channel",
- dss->hvm ? "PVHVM" : "PV");
- ret = xc_evtchn_notify(CTX->xce, dss->guest_evtchn.port);
- if (ret < 0) {
- LOG(ERROR, "xc_evtchn_notify failed ret=%d", ret);
- goto err;
- }
-
- dss->guest_evtchn.callback = domain_suspend_common_wait_guest_evtchn;
- rc = libxl__ev_evtchn_wait(gc, &dss->guest_evtchn);
- if (rc) goto err;
-
- rc = libxl__ev_time_register_rel(gc, &dss->guest_timeout,
- suspend_common_wait_guest_timeout,
- 60*1000);
- if (rc) goto err;
-
- return;
- }
-
- if (dss->hvm && (!hvm_pvdrv || hvm_s_state)) {
- LOG(DEBUG, "Calling xc_domain_shutdown on HVM domain");
- ret = xc_domain_shutdown(CTX->xch, domid, SHUTDOWN_suspend);
- if (ret < 0) {
- LOGE(ERROR, "xc_domain_shutdown failed");
- goto err;
- }
- /* The guest does not (need to) respond to this sort of request. */
- dss->guest_responded = 1;
- domain_suspend_common_wait_guest(egc, dss);
- return;
+ int broke;
+ if (rc) {
+ broke = -1;
+ dss->rc = rc;
+ } else {
+ broke = 0;
}
-
- LOG(DEBUG, "issuing %s suspend request via XenBus control node",
- dss->hvm ? "PVHVM" : "PV");
-
- libxl__domain_pvcontrol_write(gc, XBT_NULL, domid, "suspend");
-
- dss->pvcontrol.path = libxl__domain_pvcontrol_xspath(gc, domid);
- if (!dss->pvcontrol.path) goto err;
-
- dss->pvcontrol.ao = ao;
- dss->pvcontrol.what = "guest acknowledgement of suspend request";
- dss->pvcontrol.timeout_ms = 60 * 1000;
- dss->pvcontrol.callback = domain_suspend_common_pvcontrol_suspending;
- libxl__xswait_start(gc, &dss->pvcontrol);
- return;
-
- err:
- domain_suspend_common_failed(egc, dss);
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->sws.shs, broke);
}
-static void domain_suspend_common_wait_guest_evtchn(libxl__egc *egc,
- libxl__ev_evtchn *evev)
-{
- libxl__domain_suspend_state *dss = CONTAINER_OF(evev, *dss, guest_evtchn);
- STATE_AO_GC(dss->ao);
- /* If we should be done waiting, suspend_common_wait_guest_check
- * will end up calling domain_suspend_common_guest_suspended or
- * domain_suspend_common_failed, both of which cancel the evtchn
- * wait. So re-enable it now. */
- libxl__ev_evtchn_wait(gc, &dss->guest_evtchn);
- suspend_common_wait_guest_check(egc, dss);
-}
+/*----- callbacks, called by xc_domain_save -----*/
-static void domain_suspend_common_pvcontrol_suspending(libxl__egc *egc,
- libxl__xswait_state *xswa, int rc, const char *state)
+/*
+ * Expand the buffer 'buf' of length 'len' to append 'str', including its NUL
+ * terminator.
+ */
+static void append_string(libxl__gc *gc, char **buf, uint32_t *len,
+ const char *str)
{
- libxl__domain_suspend_state *dss = CONTAINER_OF(xswa, *dss, pvcontrol);
- STATE_AO_GC(dss->ao);
- xs_transaction_t t = 0;
-
- if (!rc && !domain_suspend_pvcontrol_acked(state))
- /* keep waiting */
- return;
-
- libxl__xswait_stop(gc, &dss->pvcontrol);
-
- if (rc == ERROR_TIMEDOUT) {
- /*
- * Guest appears to not be responding. Cancel the suspend
- * request.
- *
- * We re-read the suspend node and clear it within a
- * transaction in order to handle the case where we race
- * against the guest catching up and acknowledging the request
- * at the last minute.
- */
- for (;;) {
- rc = libxl__xs_transaction_start(gc, &t);
- if (rc) goto err;
-
- rc = libxl__xs_read_checked(gc, t, xswa->path, &state);
- if (rc) goto err;
-
- if (domain_suspend_pvcontrol_acked(state))
- /* last minute ack */
- break;
+ size_t extralen = strlen(str) + 1;
+ char *new = libxl__realloc(gc, *buf, *len + extralen);
- rc = libxl__xs_write_checked(gc, t, xswa->path, "");
- if (rc) goto err;
-
- rc = libxl__xs_transaction_commit(gc, &t);
- if (!rc) {
- LOG(ERROR,
- "guest didn't acknowledge suspend, cancelling request");
- goto err;
- }
- if (rc<0) goto err;
- }
- } else if (rc) {
- /* some error in xswait's read of xenstore, already logged */
- goto err;
- }
-
- assert(domain_suspend_pvcontrol_acked(state));
- LOG(DEBUG, "guest acknowledged suspend request");
-
- libxl__xs_transaction_abort(gc, &t);
- dss->guest_responded = 1;
- domain_suspend_common_wait_guest(egc,dss);
- return;
-
- err:
- libxl__xs_transaction_abort(gc, &t);
- domain_suspend_common_failed(egc, dss);
- return;
+ *buf = new;
+ memcpy(new + *len, str, extralen);
+ *len += extralen;
}
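
append_string() is the mirror image of next_string(): it grows the record by one NUL-terminated string at a time. A gc-free sketch of the same step plus typical use (standalone variant, not the in-tree helper, which relies on libxl__realloc never failing):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>

    static int append_string(char **buf, uint32_t *len, const char *str)
    {
        size_t extralen = strlen(str) + 1;       /* keep the NUL */
        char *new = realloc(*buf, *len + extralen);
        if (!new) return -1;
        memcpy(new + *len, str, extralen);
        *buf = new;
        *len += extralen;
        return 0;
    }

    int main(void)
    {
        char *buf = NULL;
        uint32_t len = 0;

        append_string(&buf, &len, "physmap/0/start_addr");
        append_string(&buf, &len, "0xf0000000");
        printf("record is %u bytes (two NUL-terminated strings)\n", len);
        free(buf);
        return 0;
    }
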
-static void domain_suspend_common_wait_guest(libxl__egc *egc,
- libxl__domain_suspend_state *dss)
+int libxl__save_emulator_xenstore_data(libxl__domain_suspend_state *dss,
+ char **callee_buf,
+ uint32_t *callee_len)
{
STATE_AO_GC(dss->ao);
+ const char *xs_root;
+ char **entries, *buf = NULL;
+ unsigned int nr_entries, i, j, len = 0;
int rc;
- LOG(DEBUG, "wait for the guest to suspend");
-
- rc = libxl__ev_xswatch_register(gc, &dss->guest_watch,
- suspend_common_wait_guest_watch,
- "@releaseDomain");
- if (rc) goto err;
-
- rc = libxl__ev_time_register_rel(gc, &dss->guest_timeout,
- suspend_common_wait_guest_timeout,
- 60*1000);
- if (rc) goto err;
- return;
-
- err:
- domain_suspend_common_failed(egc, dss);
-}
-
-static void suspend_common_wait_guest_watch(libxl__egc *egc,
- libxl__ev_xswatch *xsw, const char *watch_path, const char *event_path)
-{
- libxl__domain_suspend_state *dss = CONTAINER_OF(xsw, *dss, guest_watch);
- suspend_common_wait_guest_check(egc, dss);
-}
-
-static void suspend_common_wait_guest_check(libxl__egc *egc,
- libxl__domain_suspend_state *dss)
-{
- STATE_AO_GC(dss->ao);
- xc_domaininfo_t info;
- int ret;
- int shutdown_reason;
-
- /* Convenience aliases */
const uint32_t domid = dss->domid;
+ const uint32_t dm_domid = libxl_get_stubdom_id(CTX, domid);
- ret = xc_domain_getinfolist(CTX->xch, domid, 1, &info);
- if (ret < 0) {
- LOGE(ERROR, "unable to check for status of guest %"PRId32"", domid);
- goto err;
- }
-
- if (!(ret == 1 && info.domain == domid)) {
- LOGE(ERROR, "guest %"PRId32" we were suspending has been destroyed",
- domid);
- goto err;
- }
-
- if (!(info.flags & XEN_DOMINF_shutdown))
- /* keep waiting */
- return;
+ xs_root = libxl__device_model_xs_path(gc, dm_domid, domid, "");
- shutdown_reason = (info.flags >> XEN_DOMINF_shutdownshift)
- & XEN_DOMINF_shutdownmask;
- if (shutdown_reason != SHUTDOWN_suspend) {
- LOG(DEBUG, "guest %"PRId32" we were suspending has shut down"
- " with unexpected reason code %d", domid, shutdown_reason);
- goto err;
- }
+ entries = libxl__xs_directory(gc, 0, GCSPRINTF("%s/physmap", xs_root),
+ &nr_entries);
+ if (!entries || nr_entries == 0) { rc = 0; goto out; }
- LOG(DEBUG, "guest has suspended");
- domain_suspend_common_guest_suspended(egc, dss);
- return;
+ for (i = 0; i < nr_entries; ++i) {
+ static const char *const physmap_subkeys[] = {
+ "start_addr", "size", "name"
+ };
- err:
- domain_suspend_common_failed(egc, dss);
-}
+ for (j = 0; j < ARRAY_SIZE(physmap_subkeys); ++j) {
+ const char *key = GCSPRINTF("physmap/%s/%s",
+ entries[i], physmap_subkeys[j]);
-static void suspend_common_wait_guest_timeout(libxl__egc *egc,
- libxl__ev_time *ev, const struct timeval *requested_abs)
-{
- libxl__domain_suspend_state *dss = CONTAINER_OF(ev, *dss, guest_timeout);
- STATE_AO_GC(dss->ao);
- LOG(ERROR, "guest did not suspend, timed out");
- domain_suspend_common_failed(egc, dss);
-}
-
-static void domain_suspend_common_guest_suspended(libxl__egc *egc,
- libxl__domain_suspend_state *dss)
-{
- STATE_AO_GC(dss->ao);
- int ret;
+ const char *val =
+ libxl__xs_read(gc, XBT_NULL,
+ GCSPRINTF("%s/%s", xs_root, key));
- libxl__ev_evtchn_cancel(gc, &dss->guest_evtchn);
- libxl__ev_xswatch_deregister(gc, &dss->guest_watch);
- libxl__ev_time_deregister(gc, &dss->guest_timeout);
+ if (!val) { rc = ERROR_FAIL; goto out; }
- if (dss->hvm) {
- ret = libxl__domain_suspend_device_model(gc, dss);
- if (ret) {
- LOG(ERROR, "libxl__domain_suspend_device_model failed ret=%d", ret);
- domain_suspend_common_failed(egc, dss);
- return;
+ append_string(gc, &buf, &len, key);
+ append_string(gc, &buf, &len, val);
}
}
- domain_suspend_common_done(egc, dss, 1);
-}
-
-static void domain_suspend_common_failed(libxl__egc *egc,
- libxl__domain_suspend_state *dss)
-{
- domain_suspend_common_done(egc, dss, 0);
-}
-
-static void domain_suspend_common_done(libxl__egc *egc,
- libxl__domain_suspend_state *dss,
- bool ok)
-{
- EGC_GC;
- assert(!libxl__xswait_inuse(&dss->pvcontrol));
- libxl__ev_evtchn_cancel(gc, &dss->guest_evtchn);
- libxl__ev_xswatch_deregister(gc, &dss->guest_watch);
- libxl__ev_time_deregister(gc, &dss->guest_timeout);
- dss->callback_common_done(egc, dss, ok);
-}
-
-static inline char *physmap_path(libxl__gc *gc, uint32_t domid,
- char *phys_offset, char *node)
-{
- return GCSPRINTF("/local/domain/0/device-model/%d/physmap/%s/%s",
- domid, phys_offset, node);
-}
-
-int libxl__toolstack_save(uint32_t domid, uint8_t **buf,
- uint32_t *len, void *dss_void)
-{
- libxl__domain_suspend_state *dss = dss_void;
- STATE_AO_GC(dss->ao);
- int i = 0;
- char *start_addr = NULL, *size = NULL, *phys_offset = NULL, *name = NULL;
- unsigned int num = 0;
- uint32_t count = 0, version = TOOLSTACK_SAVE_VERSION, namelen = 0;
- uint8_t *ptr = NULL;
- char **entries = NULL;
- struct libxl__physmap_info *pi;
-
- entries = libxl__xs_directory(gc, 0, GCSPRINTF(
- "/local/domain/0/device-model/%d/physmap", domid), &num);
- count = num;
-
- *len = sizeof(version) + sizeof(count);
- *buf = calloc(1, *len);
- ptr = *buf;
- if (*buf == NULL)
- return -1;
-
- memcpy(ptr, &version, sizeof(version));
- ptr += sizeof(version);
- memcpy(ptr, &count, sizeof(count));
- ptr += sizeof(count);
-
- for (i = 0; i < count; i++) {
- unsigned long offset;
- char *xs_path;
- phys_offset = entries[i];
- if (phys_offset == NULL) {
- LOG(ERROR, "phys_offset %d is NULL", i);
- return -1;
- }
-
- xs_path = physmap_path(gc, domid, phys_offset, "start_addr");
- start_addr = libxl__xs_read(gc, 0, xs_path);
- if (start_addr == NULL) {
- LOG(ERROR, "%s is NULL", xs_path);
- return -1;
- }
-
- xs_path = physmap_path(gc, domid, phys_offset, "size");
- size = libxl__xs_read(gc, 0, xs_path);
- if (size == NULL) {
- LOG(ERROR, "%s is NULL", xs_path);
- return -1;
- }
- xs_path = physmap_path(gc, domid, phys_offset, "name");
- name = libxl__xs_read(gc, 0, xs_path);
- if (name == NULL)
- namelen = 0;
- else
- namelen = strlen(name) + 1;
- *len += namelen + sizeof(struct libxl__physmap_info);
- offset = ptr - (*buf);
- *buf = realloc(*buf, *len);
- if (*buf == NULL)
- return -1;
- ptr = (*buf) + offset;
- pi = (struct libxl__physmap_info *) ptr;
- pi->phys_offset = strtoll(phys_offset, NULL, 16);
- pi->start_addr = strtoll(start_addr, NULL, 16);
- pi->size = strtoll(size, NULL, 16);
- pi->namelen = namelen;
- memcpy(pi->name, name, namelen);
- ptr += sizeof(struct libxl__physmap_info) + namelen;
- }
-
- LOG(DEBUG,"domain=%"PRIu32" toolstack data size=%"PRIu32, domid, *len);
-
- return 0;
-}
-
-static void libxl__domain_suspend_callback(void *data)
-{
- libxl__save_helper_state *shs = data;
- libxl__egc *egc = shs->egc;
- libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+ rc = 0;
- dss->callback_common_done = domain_suspend_callback_common_done;
- domain_suspend_callback_common(egc, dss);
-}
+ out:
+ if (!rc) {
+ *callee_buf = buf;
+ *callee_len = len;
+ }
-static void domain_suspend_callback_common_done(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int ok)
-{
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+ return rc;
}
/*----- remus callbacks -----*/
@@ -1602,16 +1419,16 @@ static void libxl__remus_domain_suspend_callback(void *data)
{
libxl__save_helper_state *shs = data;
libxl__egc *egc = shs->egc;
- libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+ libxl__domain_suspend_state *dss = shs->caller_state;
dss->callback_common_done = remus_domain_suspend_callback_common_done;
- domain_suspend_callback_common(egc, dss);
+ libxl__domain_suspend(egc, dss);
}
static void remus_domain_suspend_callback_common_done(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int ok)
+ libxl__domain_suspend_state *dss, int rc)
{
- if (!ok)
+ if (rc)
goto out;
libxl__remus_devices_state *const rds = &dss->rds;
@@ -1620,30 +1437,32 @@ static void remus_domain_suspend_callback_common_done(libxl__egc *egc,
return;
out:
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+ dss->rc = rc;
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->sws.shs, !rc);
}
static void remus_devices_postsuspend_cb(libxl__egc *egc,
libxl__remus_devices_state *rds,
int rc)
{
- int ok = 0;
libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
if (rc)
goto out;
- ok = 1;
+ rc = 0;
out:
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+ if (rc)
+ dss->rc = rc;
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->sws.shs, !rc);
}
static void libxl__remus_domain_resume_callback(void *data)
{
libxl__save_helper_state *shs = data;
libxl__egc *egc = shs->egc;
- libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+ libxl__domain_suspend_state *dss = shs->caller_state;
STATE_AO_GC(dss->ao);
libxl__remus_devices_state *const rds = &dss->rds;
@@ -1655,7 +1474,6 @@ static void remus_devices_preresume_cb(libxl__egc *egc,
libxl__remus_devices_state *rds,
int rc)
{
- int ok = 0;
libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
STATE_AO_GC(dss->ao);
@@ -1667,40 +1485,40 @@ static void remus_devices_preresume_cb(libxl__egc *egc,
if (rc)
goto out;
- ok = 1;
+ rc = 0;
out:
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+ if (rc)
+ dss->rc = rc;
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->sws.shs, !rc);
}
/*----- remus asynchronous checkpoint callback -----*/
-static void remus_checkpoint_dm_saved(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int rc);
+static void remus_checkpoint_stream_written(
+ libxl__egc *egc, libxl__stream_write_state *sws, int rc);
static void remus_devices_commit_cb(libxl__egc *egc,
libxl__remus_devices_state *rds,
int rc);
static void remus_next_checkpoint(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs);
+ const struct timeval *requested_abs,
+ int rc);
-static void libxl__remus_domain_checkpoint_callback(void *data)
+static void libxl__remus_domain_save_checkpoint_callback(void *data)
{
libxl__save_helper_state *shs = data;
- libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
- libxl__egc *egc = dss->shs.egc;
+ libxl__domain_suspend_state *dss = shs->caller_state;
+ libxl__egc *egc = shs->egc;
STATE_AO_GC(dss->ao);
- /* This would go into tailbuf. */
- if (dss->hvm) {
- libxl__domain_save_device_model(egc, dss, remus_checkpoint_dm_saved);
- } else {
- remus_checkpoint_dm_saved(egc, dss, 0);
- }
+ libxl__stream_write_start_checkpoint(egc, &dss->sws);
}
-static void remus_checkpoint_dm_saved(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int rc)
+static void remus_checkpoint_stream_written(
+ libxl__egc *egc, libxl__stream_write_state *sws, int rc)
{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(sws, *dss, sws);
+
/* Convenience aliases */
libxl__remus_devices_state *const rds = &dss->rds;
@@ -1717,7 +1535,7 @@ static void remus_checkpoint_dm_saved(libxl__egc *egc,
return;
out:
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->sws.shs, 0);
}
static void remus_devices_commit_cb(libxl__egc *egc,
@@ -1742,7 +1560,7 @@ static void remus_devices_commit_cb(libxl__egc *egc,
*/
/* Set checkpoint interval timeout */
- rc = libxl__ev_time_register_rel(gc, &dss->checkpoint_timeout,
+ rc = libxl__ev_time_register_rel(ao, &dss->checkpoint_timeout,
remus_next_checkpoint,
dss->interval);
@@ -1752,32 +1570,40 @@ static void remus_devices_commit_cb(libxl__egc *egc,
return;
out:
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->sws.shs, 0);
}
static void remus_next_checkpoint(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs)
+ const struct timeval *requested_abs,
+ int rc)
{
libxl__domain_suspend_state *dss =
CONTAINER_OF(ev, *dss, checkpoint_timeout);
STATE_AO_GC(dss->ao);
+ if (rc == ERROR_TIMEDOUT) /* As intended */
+ rc = 0;
+
/*
* Time to checkpoint the guest again. We return 1 to libxc
* (xc_domain_save.c), in order to continue executing the infinite loop
* (suspend, checkpoint, resume) in xc_domain_save().
*/
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 1);
+
+ if (rc)
+ dss->rc = rc;
+
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->sws.shs, !rc);
}
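
All of the Remus callbacks in this hunk share one completion convention; a
minimal sketch of it, reusing the names visible above (the wrapper function
itself is hypothetical, not part of the patch):

    /* Record any failure for domain_save_done() to pick up later, then
     * answer libxc: !rc is 1 ("run another suspend/checkpoint/resume
     * iteration") on success and 0 ("stop the loop") on failure. */
    static void checkpoint_step_done(libxl__egc *egc,
                                     libxl__domain_suspend_state *dss,
                                     int rc)
    {
        if (rc)
            dss->rc = rc;
        libxl__xc_domain_saverestore_async_callback_done(egc,
                                                         &dss->sws.shs, !rc);
    }
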
-/*----- main code for suspending, in order of execution -----*/
+/*----- main code for saving, in order of execution -----*/
-void libxl__domain_suspend(libxl__egc *egc, libxl__domain_suspend_state *dss)
+void libxl__domain_save(libxl__egc *egc, libxl__domain_suspend_state *dss)
{
STATE_AO_GC(dss->ao);
int port;
- int rc = ERROR_FAIL;
+ int rc, ret;
/* Convenience aliases */
const uint32_t domid = dss->domid;
@@ -1786,8 +1612,10 @@ void libxl__domain_suspend(libxl__egc *egc, libxl__domain_suspend_state *dss)
const int debug = dss->debug;
const libxl_domain_remus_info *const r_info = dss->remus;
libxl__srm_save_autogen_callbacks *const callbacks =
- &dss->shs.callbacks.save.a;
+ &dss->sws.shs.callbacks.save.a;
+ unsigned int nr_vnodes = 0, nr_vmemranges = 0, nr_vcpus = 0;
+ dss->rc = 0;
logdirty_init(&dss->logdirty);
libxl__xswait_init(&dss->pvcontrol);
libxl__ev_evtchn_init(&dss->guest_evtchn);
@@ -1810,6 +1638,21 @@ void libxl__domain_suspend(libxl__egc *egc, libxl__domain_suspend_state *dss)
| (debug ? XCFLAGS_DEBUG : 0)
| (dss->hvm ? XCFLAGS_HVM : 0);
+ /* Disallow saving a guest with vNUMA configured because the
+ * migration stream does not preserve node information.
+ *
+ * Reject any domain which has vnuma enabled, even if the
+ * configuration is empty. Only domains which have no vnuma
+ * configuration at all are supported.
+ */
+ ret = xc_domain_getvnuma(CTX->xch, domid, &nr_vnodes, &nr_vmemranges,
+ &nr_vcpus, NULL, NULL, NULL);
+ if (ret != -1 || errno != XEN_EOPNOTSUPP) {
+ LOG(ERROR, "Cannot save a guest with vNUMA configured");
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
dss->guest_evtchn.port = -1;
dss->guest_evtchn_lockfd = -1;
dss->guest_responded = 0;
@@ -1817,6 +1660,7 @@ void libxl__domain_suspend(libxl__egc *egc, libxl__domain_suspend_state *dss)
if (r_info != NULL) {
dss->interval = r_info->interval;
+ dss->xcflags |= XCFLAGS_CHECKPOINTED;
if (libxl_defbool_val(r_info->compression))
dss->xcflags |= XCFLAGS_CHECKPOINT_COMPRESS;
}
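
The vNUMA check in the hunk above works by probing rather than via a
dedicated query; a sketch of that probe, under the assumption stated in the
comment that only a clean -1/XEN_EOPNOTSUPP result means "no vNUMA at all"
(the helper name is illustrative):

    /* Pass NULL output buffers: we only care whether the hypervisor
     * says "this domain has no vNUMA" (XEN_EOPNOTSUPP).  Success, or
     * any other errno, means vNUMA data exists (or we cannot tell),
     * and the caller refuses to save. */
    static bool guest_has_no_vnuma(xc_interface *xch, uint32_t domid)
    {
        unsigned int nr_vnodes = 0, nr_vmemranges = 0, nr_vcpus = 0;
        int ret = xc_domain_getvnuma(xch, domid, &nr_vnodes,
                                     &nr_vmemranges, &nr_vcpus,
                                     NULL, NULL, NULL);
        return ret == -1 && errno == XEN_EOPNOTSUPP;
    }
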
@@ -1842,60 +1686,33 @@ void libxl__domain_suspend(libxl__egc *egc, libxl__domain_suspend_state *dss)
if (r_info != NULL) {
callbacks->suspend = libxl__remus_domain_suspend_callback;
callbacks->postcopy = libxl__remus_domain_resume_callback;
- callbacks->checkpoint = libxl__remus_domain_checkpoint_callback;
+ callbacks->checkpoint = libxl__remus_domain_save_checkpoint_callback;
+ dss->sws.checkpoint_callback = remus_checkpoint_stream_written;
} else
callbacks->suspend = libxl__domain_suspend_callback;
callbacks->switch_qemu_logdirty = libxl__domain_suspend_common_switch_qemu_logdirty;
- dss->shs.callbacks.save.toolstack_save = libxl__toolstack_save;
- libxl__xc_domain_save(egc, dss);
+ dss->sws.ao = dss->ao;
+ dss->sws.dss = dss;
+ dss->sws.fd = dss->fd;
+ dss->sws.completion_callback = stream_done;
+
+ libxl__stream_write_start(egc, &dss->sws);
return;
out:
- domain_suspend_done(egc, dss, rc);
+ domain_save_done(egc, dss, rc);
}
-void libxl__xc_domain_save_done(libxl__egc *egc, void *dss_void,
- int rc, int retval, int errnoval)
+static void stream_done(libxl__egc *egc,
+ libxl__stream_write_state *sws, int rc)
{
- libxl__domain_suspend_state *dss = dss_void;
- STATE_AO_GC(dss->ao);
-
- /* Convenience aliases */
- const libxl_domain_type type = dss->type;
-
- if (rc)
- goto out;
-
- if (retval) {
- LOGEV(ERROR, errnoval, "saving domain: %s",
- dss->guest_responded ?
- "domain responded to suspend request" :
- "domain did not respond to suspend request");
- if ( !dss->guest_responded )
- rc = ERROR_GUEST_TIMEDOUT;
- else
- rc = ERROR_FAIL;
- goto out;
- }
-
- if (type == LIBXL_DOMAIN_TYPE_HVM) {
- rc = libxl__domain_suspend_device_model(gc, dss);
- if (rc) goto out;
-
- libxl__domain_save_device_model(egc, dss, domain_suspend_done);
- return;
- }
-
- rc = 0;
-
-out:
- domain_suspend_done(egc, dss, rc);
+ domain_save_done(egc, sws->dss, rc);
}
static void save_device_model_datacopier_done(libxl__egc *egc,
- libxl__datacopier_state *dc, int onwrite, int errnoval);
+ libxl__datacopier_state *dc, int rc, int onwrite, int errnoval);
void libxl__domain_save_device_model(libxl__egc *egc,
libxl__domain_suspend_state *dss,
@@ -1919,6 +1736,7 @@ void libxl__domain_save_device_model(libxl__egc *egc,
dc->readfd = -1;
dc->writefd = fd;
dc->maxsz = INT_MAX;
+ dc->bytes_to_read = -1;
dc->copywhat = GCSPRINTF("qemu save file for domain %"PRIu32, dss->domid);
dc->writewhat = "save/migration stream";
dc->callback = save_device_model_datacopier_done;
@@ -1926,17 +1744,20 @@ void libxl__domain_save_device_model(libxl__egc *egc,
dc->readfd = open(filename, O_RDONLY);
if (dc->readfd < 0) {
LOGE(ERROR, "unable to open %s", dc->readwhat);
+ rc = ERROR_FAIL;
goto out;
}
if (fstat(dc->readfd, &st))
{
LOGE(ERROR, "unable to fstat %s", dc->readwhat);
+ rc = ERROR_FAIL;
goto out;
}
if (!S_ISREG(st.st_mode)) {
LOG(ERROR, "%s is not a plain file!", dc->readwhat);
+ rc = ERROR_FAIL;
goto out;
}
@@ -1954,11 +1775,11 @@ void libxl__domain_save_device_model(libxl__egc *egc,
return;
out:
- save_device_model_datacopier_done(egc, dc, -1, 0);
+ save_device_model_datacopier_done(egc, dc, rc, -1, EIO);
}
static void save_device_model_datacopier_done(libxl__egc *egc,
- libxl__datacopier_state *dc, int onwrite, int errnoval)
+ libxl__datacopier_state *dc, int our_rc, int onwrite, int errnoval)
{
libxl__domain_suspend_state *dss =
CONTAINER_OF(dc, *dss, save_dm_datacopier);
@@ -1966,14 +1787,10 @@ static void save_device_model_datacopier_done(libxl__egc *egc,
/* Convenience aliases */
const char *const filename = dss->dm_savefile;
- int our_rc = 0;
int rc;
libxl__datacopier_kill(dc);
- if (onwrite || errnoval)
- our_rc = ERROR_FAIL;
-
if (dc->readfd >= 0) {
close(dc->readfd);
dc->readfd = -1;
@@ -1985,12 +1802,15 @@ static void save_device_model_datacopier_done(libxl__egc *egc,
dss->save_dm_callback(egc, dss, our_rc);
}
+static void libxl__remus_teardown(libxl__egc *egc,
+ libxl__domain_suspend_state *dss,
+ int rc);
static void remus_teardown_done(libxl__egc *egc,
libxl__remus_devices_state *rds,
int rc);
-static void domain_suspend_done(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int rc)
+static void domain_save_done(libxl__egc *egc,
+ libxl__domain_suspend_state *dss, int rc)
{
STATE_AO_GC(dss->ao);
@@ -2003,17 +1823,26 @@ static void domain_suspend_done(libxl__egc *egc,
xc_suspend_evtchn_release(CTX->xch, CTX->xce, domid,
dss->guest_evtchn.port, &dss->guest_evtchn_lockfd);
- if (!dss->remus) {
- remus_teardown_done(egc, &dss->rds, rc);
+ if (dss->remus) {
+ /*
+ * With Remus, if we reach this point, it means either the
+ * backup died or some network error occurred, preventing us
+ * from sending checkpoints. Tear down the network buffers and
+ * release netlink resources. This is an async op.
+ */
+ libxl__remus_teardown(egc, dss, rc);
return;
}
- /*
- * With Remus, if we reach this point, it means either
- * backup died or some network error occurred preventing us
- * from sending checkpoints. Teardown the network buffers and
- * release netlink resources. This is an async op.
- */
+ dss->callback(egc, dss, rc);
+}
+
+static void libxl__remus_teardown(libxl__egc *egc,
+ libxl__domain_suspend_state *dss,
+ int rc)
+{
+ EGC_GC;
+
LOG(WARN, "Remus: Domain suspend terminated with rc %d,"
" teardown Remus devices...", rc);
dss->rds.callback = remus_teardown_done;
diff --git a/tools/libxl/libxl_dom_suspend.c b/tools/libxl/libxl_dom_suspend.c
new file mode 100644
index 0000000..4cc01ad
--- /dev/null
+++ b/tools/libxl/libxl_dom_suspend.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2009 Citrix Ltd.
+ * Author Vincent Hanquez <vincent.hanquez at eu.citrix.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+
+/*====================== Domain suspend =======================*/
+
+/*----- callbacks, called by xc_domain_save -----*/
+
+int libxl__domain_suspend_device_model(libxl__gc *gc,
+ libxl__domain_suspend_state *dss)
+{
+ int ret = 0;
+ uint32_t const domid = dss->domid;
+ const char *const filename = dss->dm_savefile;
+
+ switch (libxl__device_model_version_running(gc, domid)) {
+ case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: {
+ LOG(DEBUG, "Saving device model state to %s", filename);
+ libxl__qemu_traditional_cmd(gc, domid, "save");
+ libxl__wait_for_device_model_deprecated(gc, domid, "paused", NULL, NULL, NULL);
+ break;
+ }
+ case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
+ if (libxl__qmp_stop(gc, domid))
+ return ERROR_FAIL;
+ /* Save DM state into filename */
+ ret = libxl__qmp_save(gc, domid, filename);
+ if (ret)
+ unlink(filename);
+ break;
+ default:
+ return ERROR_INVAL;
+ }
+
+ return ret;
+}
+
+static void domain_suspend_common_wait_guest(libxl__egc *egc,
+ libxl__domain_suspend_state *dss);
+static void domain_suspend_common_guest_suspended(libxl__egc *egc,
+ libxl__domain_suspend_state *dss);
+
+static void domain_suspend_common_pvcontrol_suspending(libxl__egc *egc,
+ libxl__xswait_state *xswa, int rc, const char *state);
+static void domain_suspend_common_wait_guest_evtchn(libxl__egc *egc,
+ libxl__ev_evtchn *evev);
+static void suspend_common_wait_guest_watch(libxl__egc *egc,
+ libxl__ev_xswatch *xsw, const char *watch_path, const char *event_path);
+static void suspend_common_wait_guest_check(libxl__egc *egc,
+ libxl__domain_suspend_state *dss);
+static void suspend_common_wait_guest_timeout(libxl__egc *egc,
+ libxl__ev_time *ev, const struct timeval *requested_abs, int rc);
+
+static void domain_suspend_common_done(libxl__egc *egc,
+ libxl__domain_suspend_state *dss,
+ int rc);
+
+static void domain_suspend_callback_common(libxl__egc *egc,
+ libxl__domain_suspend_state *dss);
+static void domain_suspend_callback_common_done(libxl__egc *egc,
+ libxl__domain_suspend_state *dss, int rc);
+
+/* calls dss->callback_common_done when done */
+void libxl__domain_suspend(libxl__egc *egc,
+ libxl__domain_suspend_state *dss)
+{
+ domain_suspend_callback_common(egc, dss);
+}
+
+static bool domain_suspend_pvcontrol_acked(const char *state) {
+ /* any value other than "suspend", including ENOENT (i.e. !state), is OK */
+ if (!state) return 1;
+ return strcmp(state,"suspend");
+}
+
+/* calls dss->callback_common_done when done */
+static void domain_suspend_callback_common(libxl__egc *egc,
+ libxl__domain_suspend_state *dss)
+{
+ STATE_AO_GC(dss->ao);
+ uint64_t hvm_s_state = 0, hvm_pvdrv = 0;
+ int ret, rc;
+
+ /* Convenience aliases */
+ const uint32_t domid = dss->domid;
+
+ if (dss->hvm) {
+ xc_hvm_param_get(CTX->xch, domid, HVM_PARAM_CALLBACK_IRQ, &hvm_pvdrv);
+ xc_hvm_param_get(CTX->xch, domid, HVM_PARAM_ACPI_S_STATE, &hvm_s_state);
+ }
+
+ if ((hvm_s_state == 0) && (dss->guest_evtchn.port >= 0)) {
+ LOG(DEBUG, "issuing %s suspend request via event channel",
+ dss->hvm ? "PVHVM" : "PV");
+ ret = xc_evtchn_notify(CTX->xce, dss->guest_evtchn.port);
+ if (ret < 0) {
+ LOG(ERROR, "xc_evtchn_notify failed ret=%d", ret);
+ rc = ERROR_FAIL;
+ goto err;
+ }
+
+ dss->guest_evtchn.callback = domain_suspend_common_wait_guest_evtchn;
+ rc = libxl__ev_evtchn_wait(gc, &dss->guest_evtchn);
+ if (rc) goto err;
+
+ rc = libxl__ev_time_register_rel(ao, &dss->guest_timeout,
+ suspend_common_wait_guest_timeout,
+ 60*1000);
+ if (rc) goto err;
+
+ return;
+ }
+
+ if (dss->hvm && (!hvm_pvdrv || hvm_s_state)) {
+ LOG(DEBUG, "Calling xc_domain_shutdown on HVM domain");
+ ret = xc_domain_shutdown(CTX->xch, domid, SHUTDOWN_suspend);
+ if (ret < 0) {
+ LOGE(ERROR, "xc_domain_shutdown failed");
+ rc = ERROR_FAIL;
+ goto err;
+ }
+ /* The guest does not (need to) respond to this sort of request. */
+ dss->guest_responded = 1;
+ domain_suspend_common_wait_guest(egc, dss);
+ return;
+ }
+
+ LOG(DEBUG, "issuing %s suspend request via XenBus control node",
+ dss->hvm ? "PVHVM" : "PV");
+
+ libxl__domain_pvcontrol_write(gc, XBT_NULL, domid, "suspend");
+
+ dss->pvcontrol.path = libxl__domain_pvcontrol_xspath(gc, domid);
+ if (!dss->pvcontrol.path) { rc = ERROR_FAIL; goto err; }
+
+ dss->pvcontrol.ao = ao;
+ dss->pvcontrol.what = "guest acknowledgement of suspend request";
+ dss->pvcontrol.timeout_ms = 60 * 1000;
+ dss->pvcontrol.callback = domain_suspend_common_pvcontrol_suspending;
+ libxl__xswait_start(gc, &dss->pvcontrol);
+ return;
+
+ err:
+ domain_suspend_common_done(egc, dss, rc);
+}
+
+static void domain_suspend_common_wait_guest_evtchn(libxl__egc *egc,
+ libxl__ev_evtchn *evev)
+{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(evev, *dss, guest_evtchn);
+ STATE_AO_GC(dss->ao);
+ /* If we should be done waiting, suspend_common_wait_guest_check
+ * will end up calling domain_suspend_common_guest_suspended or
+ * domain_suspend_common_done, both of which cancel the evtchn
+ * wait as needed. So re-enable it now. */
+ libxl__ev_evtchn_wait(gc, &dss->guest_evtchn);
+ suspend_common_wait_guest_check(egc, dss);
+}
+
+static void domain_suspend_common_pvcontrol_suspending(libxl__egc *egc,
+ libxl__xswait_state *xswa, int rc, const char *state)
+{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(xswa, *dss, pvcontrol);
+ STATE_AO_GC(dss->ao);
+ xs_transaction_t t = 0;
+
+ if (!rc && !domain_suspend_pvcontrol_acked(state))
+ /* keep waiting */
+ return;
+
+ libxl__xswait_stop(gc, &dss->pvcontrol);
+
+ if (rc == ERROR_TIMEDOUT) {
+ /*
+ * Guest appears to not be responding. Cancel the suspend
+ * request.
+ *
+ * We re-read the suspend node and clear it within a
+ * transaction in order to handle the case where we race
+ * against the guest catching up and acknowledging the request
+ * at the last minute.
+ */
+ for (;;) {
+ rc = libxl__xs_transaction_start(gc, &t);
+ if (rc) goto err;
+
+ rc = libxl__xs_read_checked(gc, t, xswa->path, &state);
+ if (rc) goto err;
+
+ if (domain_suspend_pvcontrol_acked(state))
+ /* last minute ack */
+ break;
+
+ rc = libxl__xs_write_checked(gc, t, xswa->path, "");
+ if (rc) goto err;
+
+ rc = libxl__xs_transaction_commit(gc, &t);
+ if (!rc) {
+ LOG(ERROR,
+ "guest didn't acknowledge suspend, cancelling request");
+ goto err;
+ }
+ if (rc<0) goto err;
+ }
+ } else if (rc) {
+ /* some error in xswait's read of xenstore, already logged */
+ goto err;
+ }
+
+ assert(domain_suspend_pvcontrol_acked(state));
+ LOG(DEBUG, "guest acknowledged suspend request");
+
+ libxl__xs_transaction_abort(gc, &t);
+ dss->guest_responded = 1;
+ domain_suspend_common_wait_guest(egc,dss);
+ return;
+
+ err:
+ libxl__xs_transaction_abort(gc, &t);
+ domain_suspend_common_done(egc, dss, rc);
+ return;
+}
+
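
The retry loop above is the standard libxl xenstore-transaction idiom:
commit returns 0 on success, +1 if the transaction raced and must be
retried, and a negative rc on error. The same idiom isolated as a sketch,
assuming the helpers declared in libxl_internal.h (the function name is
illustrative):

    static int clear_suspend_node_unless_acked(libxl__gc *gc,
                                               const char *path)
    {
        xs_transaction_t t = 0;
        const char *state;
        int rc;

        for (;;) {
            rc = libxl__xs_transaction_start(gc, &t);
            if (rc) break;
            rc = libxl__xs_read_checked(gc, t, path, &state);
            if (rc) break;
            if (domain_suspend_pvcontrol_acked(state)) {
                rc = 0;                            /* last minute ack */
                break;
            }
            rc = libxl__xs_write_checked(gc, t, path, "");
            if (rc) break;
            rc = libxl__xs_transaction_commit(gc, &t);
            if (!rc) { rc = ERROR_FAIL; break; }   /* cleared: no ack */
            if (rc < 0) break;
            /* rc > 0: we raced with the guest; go round again */
        }
        libxl__xs_transaction_abort(gc, &t);       /* no-op if t == 0 */
        return rc;
    }
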
+static void domain_suspend_common_wait_guest(libxl__egc *egc,
+ libxl__domain_suspend_state *dss)
+{
+ STATE_AO_GC(dss->ao);
+ int rc;
+
+ LOG(DEBUG, "wait for the guest to suspend");
+
+ rc = libxl__ev_xswatch_register(gc, &dss->guest_watch,
+ suspend_common_wait_guest_watch,
+ "@releaseDomain");
+ if (rc) goto err;
+
+ rc = libxl__ev_time_register_rel(ao, &dss->guest_timeout,
+ suspend_common_wait_guest_timeout,
+ 60*1000);
+ if (rc) goto err;
+ return;
+
+ err:
+ domain_suspend_common_done(egc, dss, rc);
+}
+
+static void suspend_common_wait_guest_watch(libxl__egc *egc,
+ libxl__ev_xswatch *xsw, const char *watch_path, const char *event_path)
+{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(xsw, *dss, guest_watch);
+ suspend_common_wait_guest_check(egc, dss);
+}
+
+static void suspend_common_wait_guest_check(libxl__egc *egc,
+ libxl__domain_suspend_state *dss)
+{
+ STATE_AO_GC(dss->ao);
+ xc_domaininfo_t info;
+ int ret;
+ int shutdown_reason;
+
+ /* Convenience aliases */
+ const uint32_t domid = dss->domid;
+
+ ret = xc_domain_getinfolist(CTX->xch, domid, 1, &info);
+ if (ret < 0) {
+ LOGE(ERROR, "unable to check for status of guest %"PRId32"", domid);
+ goto err;
+ }
+
+ if (!(ret == 1 && info.domain == domid)) {
+ LOGE(ERROR, "guest %"PRId32" we were suspending has been destroyed",
+ domid);
+ goto err;
+ }
+
+ if (!(info.flags & XEN_DOMINF_shutdown))
+ /* keep waiting */
+ return;
+
+ shutdown_reason = (info.flags >> XEN_DOMINF_shutdownshift)
+ & XEN_DOMINF_shutdownmask;
+ if (shutdown_reason != SHUTDOWN_suspend) {
+ LOG(DEBUG, "guest %"PRId32" we were suspending has shut down"
+ " with unexpected reason code %d", domid, shutdown_reason);
+ goto err;
+ }
+
+ LOG(DEBUG, "guest has suspended");
+ domain_suspend_common_guest_suspended(egc, dss);
+ return;
+
+ err:
+ domain_suspend_common_done(egc, dss, ERROR_FAIL);
+}
+
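
For reference, the bitfield decoded by the check above: xc_domaininfo_t.flags
packs the shutdown code next to the XEN_DOMINF_shutdown bit. A small sketch
(the helper name is illustrative):

    /* Returns the SHUTDOWN_* code, or -1 if the domain has not shut
     * down at all. */
    static int shutdown_reason_of(const xc_domaininfo_t *info)
    {
        if (!(info->flags & XEN_DOMINF_shutdown))
            return -1;
        return (info->flags >> XEN_DOMINF_shutdownshift)
               & XEN_DOMINF_shutdownmask;    /* e.g. SHUTDOWN_suspend */
    }
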
+static void suspend_common_wait_guest_timeout(libxl__egc *egc,
+ libxl__ev_time *ev, const struct timeval *requested_abs, int rc)
+{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(ev, *dss, guest_timeout);
+ STATE_AO_GC(dss->ao);
+ if (rc == ERROR_TIMEDOUT) {
+ LOG(ERROR, "guest did not suspend, timed out");
+ rc = ERROR_GUEST_TIMEDOUT;
+ }
+ domain_suspend_common_done(egc, dss, rc);
+}
+
+static void domain_suspend_common_guest_suspended(libxl__egc *egc,
+ libxl__domain_suspend_state *dss)
+{
+ STATE_AO_GC(dss->ao);
+ int rc;
+
+ libxl__ev_evtchn_cancel(gc, &dss->guest_evtchn);
+ libxl__ev_xswatch_deregister(gc, &dss->guest_watch);
+ libxl__ev_time_deregister(gc, &dss->guest_timeout);
+
+ if (dss->hvm) {
+ rc = libxl__domain_suspend_device_model(gc, dss);
+ if (rc) {
+ LOG(ERROR, "libxl__domain_suspend_device_model failed ret=%d", rc);
+ domain_suspend_common_done(egc, dss, rc);
+ return;
+ }
+ }
+ domain_suspend_common_done(egc, dss, 0);
+}
+
+static void domain_suspend_common_done(libxl__egc *egc,
+ libxl__domain_suspend_state *dss,
+ int rc)
+{
+ EGC_GC;
+ assert(!libxl__xswait_inuse(&dss->pvcontrol));
+ libxl__ev_evtchn_cancel(gc, &dss->guest_evtchn);
+ libxl__ev_xswatch_deregister(gc, &dss->guest_watch);
+ libxl__ev_time_deregister(gc, &dss->guest_timeout);
+ dss->callback_common_done(egc, dss, rc);
+}
+
+void libxl__domain_suspend_callback(void *data)
+{
+ libxl__save_helper_state *shs = data;
+ libxl__egc *egc = shs->egc;
+ libxl__domain_suspend_state *dss = shs->caller_state;
+
+ dss->callback_common_done = domain_suspend_callback_common_done;
+ domain_suspend_callback_common(egc, dss);
+}
+
+static void domain_suspend_callback_common_done(libxl__egc *egc,
+ libxl__domain_suspend_state *dss, int rc)
+{
+ dss->rc = rc;
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->sws.shs, !rc);
+}
+
+/*======================= Domain resume ========================*/
+
+int libxl__domain_resume_device_model(libxl__gc *gc, uint32_t domid)
+{
+ const char *path, *state;
+
+ switch (libxl__device_model_version_running(gc, domid)) {
+ case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: {
+ uint32_t dm_domid = libxl_get_stubdom_id(CTX, domid);
+
+ path = libxl__device_model_xs_path(gc, dm_domid, domid, "/state");
+ state = libxl__xs_read(gc, XBT_NULL, path);
+ if (state != NULL && !strcmp(state, "paused")) {
+ libxl__qemu_traditional_cmd(gc, domid, "continue");
+ libxl__wait_for_device_model_deprecated(gc, domid, "running",
+ NULL, NULL, NULL);
+ }
+ break;
+ }
+ case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
+ if (libxl__qmp_resume(gc, domid))
+ return ERROR_FAIL;
+ break;
+ default:
+ return ERROR_INVAL;
+ }
+
+ return 0;
+}
+
+int libxl__domain_resume(libxl__gc *gc, uint32_t domid, int suspend_cancel)
+{
+ int rc = 0;
+
+ if (xc_domain_resume(CTX->xch, domid, suspend_cancel)) {
+ LOGE(ERROR, "xc_domain_resume failed for domain %u", domid);
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ libxl_domain_type type = libxl__domain_type(gc, domid);
+ if (type == LIBXL_DOMAIN_TYPE_INVALID) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ if (type == LIBXL_DOMAIN_TYPE_HVM) {
+ rc = libxl__domain_resume_device_model(gc, domid);
+ if (rc) {
+ LOG(ERROR, "failed to resume device model for domain %u:%d",
+ domid, rc);
+ goto out;
+ }
+ }
+
+ if (!xs_resume_domain(CTX->xsh, domid)) {
+ LOGE(ERROR, "xs_resume_domain failed for domain %u", domid);
+ rc = ERROR_FAIL;
+ }
+out:
+ return rc;
+}
+
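
A usage sketch for the function above (the caller is hypothetical, not part
of this patch): passing suspend_cancel=1 asks Xen to resume the domain as if
its suspend request had been cancelled, so the guest continues from the
suspend hypercall; 0 performs an ordinary resume.

    static int resume_after_failed_save(libxl__gc *gc, uint32_t domid)
    {
        /* Illustrative only: 1 == cancel the pending suspend. */
        return libxl__domain_resume(gc, domid, 1);
    }
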
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxl_event.c b/tools/libxl/libxl_event.c
index 0d874d9..bfb6b31 100644
--- a/tools/libxl/libxl_event.c
+++ b/tools/libxl/libxl_event.c
@@ -31,6 +31,11 @@
#define DBG(args, ...) LIBXL__DBG_LOG(CTX, args, __VA_ARGS__)
+static libxl__ao *ao_nested_root(libxl__ao *ao);
+
+static void ao__check_destroy(libxl_ctx *ctx, libxl__ao *ao);
+
+
/*
* The counter osevent_in_hook is used to ensure that the application
* honours the reentrancy restriction documented in libxl_event.h.
@@ -220,6 +225,7 @@ int libxl__ev_fd_modify(libxl__gc *gc, libxl__ev_fd *ev, short events)
void libxl__ev_fd_deregister(libxl__gc *gc, libxl__ev_fd *ev)
{
CTX_LOCK;
+ libxl__poller *poller;
if (!libxl__ev_fd_isregistered(ev)) {
DBG("ev_fd=%p deregister unregistered",ev);
@@ -232,10 +238,36 @@ void libxl__ev_fd_deregister(libxl__gc *gc, libxl__ev_fd *ev)
LIBXL_LIST_REMOVE(ev, entry);
ev->fd = -1;
+ LIBXL_LIST_FOREACH(poller, &CTX->pollers_fds_changed, fds_changed_entry)
+ poller->fds_changed = 1;
+
out:
CTX_UNLOCK;
}
+short libxl__fd_poll_recheck(libxl__egc *egc, int fd, short events) {
+ struct pollfd check;
+ int r;
+
+ for (;;) {
+ check.fd = fd;
+ check.events = events;
+ r = poll(&check, 1, 0);
+ DBG("poll recheck fd=%d r=%d revents=%#x", fd, r, check.revents);
+ if (!r)
+ break;
+ if (r==1)
+ break;
+ assert(r<0);
+ if (errno != EINTR) {
+ LIBXL__EVENT_DISASTER(egc, "failed poll to check for fd", errno, 0);
+ return 0;
+ }
+ }
+ assert(!!r == !!check.revents);
+ return check.revents;
+}
+
/*
* timeouts
*/
@@ -287,6 +319,8 @@ static int time_register_finite(libxl__gc *gc, libxl__ev_time *ev,
static void time_deregister(libxl__gc *gc, libxl__ev_time *ev)
{
+ libxl__ao_abortable_deregister(&ev->abrt);
+
if (!ev->infinite) {
struct timeval right_away = { 0, 0 };
if (ev->nexus) /* only set if app provided hooks */
@@ -309,10 +343,28 @@ static void time_done_debug(libxl__gc *gc, const char *func,
#endif
}
-int libxl__ev_time_register_abs(libxl__gc *gc, libxl__ev_time *ev,
+static void time_aborted(libxl__egc *egc, libxl__ao_abortable *abrt, int rc)
+{
+ libxl__ev_time *ev = CONTAINER_OF(abrt, *ev, abrt);
+ EGC_GC;
+
+ time_deregister(gc, ev);
+ DBG("ev_time=%p aborted", ev);
+ ev->func(egc, ev, &ev->abs, rc);
+}
+
+static int time_register_abortable(libxl__ao *ao, libxl__ev_time *ev)
+{
+ ev->abrt.ao = ao;
+ ev->abrt.callback = time_aborted;
+ return libxl__ao_abortable_register(&ev->abrt);
+}
+
+int libxl__ev_time_register_abs(libxl__ao *ao, libxl__ev_time *ev,
libxl__ev_time_callback *func,
struct timeval absolute)
{
+ AO_GC;
int rc;
CTX_LOCK;
@@ -320,6 +372,9 @@ int libxl__ev_time_register_abs(libxl__gc *gc, libxl__ev_time *ev,
DBG("ev_time=%p register abs=%lu.%06lu",
ev, (unsigned long)absolute.tv_sec, (unsigned long)absolute.tv_usec);
+ rc = time_register_abortable(ao, ev);
+ if (rc) goto out;
+
rc = time_register_finite(gc, ev, absolute);
if (rc) goto out;
@@ -327,16 +382,18 @@ int libxl__ev_time_register_abs(libxl__gc *gc, libxl__ev_time *ev,
rc = 0;
out:
+ libxl__ao_abortable_deregister(&ev->abrt);
time_done_debug(gc,__func__,ev,rc);
CTX_UNLOCK;
return rc;
}
-int libxl__ev_time_register_rel(libxl__gc *gc, libxl__ev_time *ev,
+int libxl__ev_time_register_rel(libxl__ao *ao, libxl__ev_time *ev,
libxl__ev_time_callback *func,
int milliseconds /* as for poll(2) */)
{
+ AO_GC;
struct timeval absolute;
int rc;
@@ -344,6 +401,9 @@ int libxl__ev_time_register_rel(libxl__gc *gc, libxl__ev_time *ev,
DBG("ev_time=%p register ms=%d", ev, milliseconds);
+ rc = time_register_abortable(ao, ev);
+ if (rc) goto out;
+
if (milliseconds < 0) {
ev->infinite = 1;
} else {
@@ -358,6 +418,8 @@ int libxl__ev_time_register_rel(libxl__gc *gc, libxl__ev_time *ev,
rc = 0;
out:
+ if (!libxl__ev_time_isregistered(ev))
+ libxl__ao_abortable_deregister(&ev->abrt);
time_done_debug(gc,__func__,ev,rc);
CTX_UNLOCK;
return rc;
@@ -381,7 +443,7 @@ void libxl__ev_time_deregister(libxl__gc *gc, libxl__ev_time *ev)
return;
}
-static void time_occurs(libxl__egc *egc, libxl__ev_time *etime)
+static void time_occurs(libxl__egc *egc, libxl__ev_time *etime, int rc)
{
DBG("ev_time=%p occurs abs=%lu.%06lu",
etime, (unsigned long)etime->abs.tv_sec,
@@ -389,7 +451,7 @@ static void time_occurs(libxl__egc *egc, libxl__ev_time *etime)
libxl__ev_time_callback *func = etime->func;
etime->func = 0;
- func(egc, etime, &etime->abs);
+ func(egc, etime, &etime->abs, rc);
}
@@ -661,9 +723,8 @@ static void evtchn_fd_callback(libxl__egc *egc, libxl__ev_fd *ev,
{
EGC_GC;
libxl__ev_evtchn *evev;
- int r, rc;
+ int rc;
evtchn_port_or_error_t port;
- struct pollfd recheck;
rc = evtchn_revents_check(egc, revents);
if (rc) return;
@@ -674,21 +735,10 @@ static void evtchn_fd_callback(libxl__egc *egc, libxl__ev_fd *ev,
* held continuously since someone noticed the fd. Normally
* this wouldn't be a problem but evtchn devices don't always
* honour O_NONBLOCK (see xenctrl.h). */
-
- recheck.fd = fd;
- recheck.events = POLLIN;
- recheck.revents = 0;
- r = poll(&recheck, 1, 0);
- DBG("ev_evtchn recheck r=%d revents=%#x", r, recheck.revents);
- if (r < 0) {
- LIBXL__EVENT_DISASTER(egc,
- "unexpected failure polling event channel fd for recheck",
- errno, 0);
- return;
- }
- if (r == 0)
+ revents = libxl__fd_poll_recheck(egc,fd,POLLIN);
+ if (!revents)
break;
- rc = evtchn_revents_check(egc, recheck.revents);
+ rc = evtchn_revents_check(egc, revents);
if (rc) return;
/* OK, that's that workaround done. We can actually check for
@@ -805,68 +855,59 @@ void libxl__ev_evtchn_cancel(libxl__gc *gc, libxl__ev_evtchn *evev)
* waiting for device state
*/
-static void devstate_watch_callback(libxl__egc *egc, libxl__ev_xswatch *watch,
- const char *watch_path, const char *event_path)
+static void devstate_callback(libxl__egc *egc, libxl__xswait_state *xsw,
+ int rc, const char *sstate)
{
EGC_GC;
- libxl__ev_devstate *ds = CONTAINER_OF(watch, *ds, watch);
- int rc;
+ libxl__ev_devstate *ds = CONTAINER_OF(xsw, *ds, w);
- char *sstate = libxl__xs_read(gc, XBT_NULL, watch_path);
+ if (rc) {
+ if (rc == ERROR_TIMEDOUT)
+ LIBXL__LOG(CTX, LIBXL__LOG_DEBUG, "backend %s wanted state %d "
+ " timed out", ds->w.path, ds->wanted);
+ goto out;
+ }
if (!sstate) {
- if (errno == ENOENT) {
- LIBXL__LOG(CTX, LIBXL__LOG_DEBUG, "backend %s wanted state %d"
- " but it was removed", watch_path, ds->wanted);
- rc = ERROR_INVAL;
- } else {
- LIBXL__LOG_ERRNO(CTX, LIBXL__LOG_ERROR, "backend %s wanted state"
- " %d but read failed", watch_path, ds->wanted);
- rc = ERROR_FAIL;
- }
+ LIBXL__LOG(CTX, LIBXL__LOG_DEBUG, "backend %s wanted state %d"
+ " but it was removed", ds->w.path, ds->wanted);
+ rc = ERROR_INVAL;
+ goto out;
+ }
+
+ int got = atoi(sstate);
+ if (got == ds->wanted) {
+ LIBXL__LOG(CTX, LIBXL__LOG_DEBUG, "backend %s wanted state %d ok",
+ ds->w.path, ds->wanted);
+ rc = 0;
} else {
- int got = atoi(sstate);
- if (got == ds->wanted) {
- LIBXL__LOG(CTX, LIBXL__LOG_DEBUG, "backend %s wanted state %d ok",
- watch_path, ds->wanted);
- rc = 0;
- } else {
- LIBXL__LOG(CTX, LIBXL__LOG_DEBUG, "backend %s wanted state %d"
- " still waiting state %d", watch_path, ds->wanted, got);
- return;
- }
+ LIBXL__LOG(CTX, LIBXL__LOG_DEBUG, "backend %s wanted state %d"
+ " still waiting state %d", ds->w.path, ds->wanted, got);
+ return;
}
- libxl__ev_devstate_cancel(gc, ds);
- ds->callback(egc, ds, rc);
-}
-static void devstate_timeout(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs)
-{
- EGC_GC;
- libxl__ev_devstate *ds = CONTAINER_OF(ev, *ds, timeout);
- LIBXL__LOG(CTX, LIBXL__LOG_DEBUG, "backend %s wanted state %d "
- " timed out", ds->watch.path, ds->wanted);
+ out:
libxl__ev_devstate_cancel(gc, ds);
- ds->callback(egc, ds, ERROR_TIMEDOUT);
+ ds->callback(egc, ds, rc);
}
-int libxl__ev_devstate_wait(libxl__gc *gc, libxl__ev_devstate *ds,
+int libxl__ev_devstate_wait(libxl__ao *ao, libxl__ev_devstate *ds,
libxl__ev_devstate_callback cb,
const char *state_path, int state, int milliseconds)
{
+ AO_GC;
int rc;
- libxl__ev_time_init(&ds->timeout);
- libxl__ev_xswatch_init(&ds->watch);
+ libxl__xswait_init(&ds->w);
ds->wanted = state;
ds->callback = cb;
- rc = libxl__ev_time_register_rel(gc, &ds->timeout, devstate_timeout,
- milliseconds);
- if (rc) goto out;
-
- rc = libxl__ev_xswatch_register(gc, &ds->watch, devstate_watch_callback,
- state_path);
+ ds->w.ao = ao;
+ ds->w.what = GCSPRINTF("backend %s (hoping for state change to %d)",
+ state_path, state);
+ ds->w.path = state_path;
+ ds->w.timeout_ms = milliseconds;
+ ds->w.callback = devstate_callback;
+ rc = libxl__xswait_start(gc, &ds->w);
if (rc) goto out;
return 0;
@@ -896,6 +937,18 @@ int libxl__ev_devstate_wait(libxl__gc *gc, libxl__ev_devstate *ds,
* futile.
*/
+void libxl__domaindeathcheck_init(libxl__domaindeathcheck *dc)
+{
+ libxl__ao_abortable_init(&dc->abrt);
+ libxl__ev_xswatch_init(&dc->watch);
+}
+
+void libxl__domaindeathcheck_stop(libxl__gc *gc, libxl__domaindeathcheck *dc)
+{
+ libxl__ao_abortable_deregister(&dc->abrt);
+ libxl__ev_xswatch_deregister(gc,&dc->watch);
+}
+
static void domaindeathcheck_callback(libxl__egc *egc, libxl__ev_xswatch *w,
const char *watch_path, const char *event_path)
{
@@ -904,6 +957,8 @@ static void domaindeathcheck_callback(libxl__egc *egc, libxl__ev_xswatch *w,
const char *p = libxl__xs_read(gc, XBT_NULL, watch_path);
if (p) return;
+ libxl__domaindeathcheck_stop(gc,dc);
+
if (errno!=ENOENT) {
LIBXL__EVENT_DISASTER(egc,"failed to read xenstore"
" for domain detach check", errno, 0);
@@ -912,15 +967,43 @@ static void domaindeathcheck_callback(libxl__egc *egc, libxl__ev_xswatch *w,
LOG(ERROR,"%s: domain %"PRIu32" removed (%s no longer in xenstore)",
dc->what, dc->domid, watch_path);
- dc->callback(egc, dc);
+ dc->callback(egc, dc, ERROR_DOMAIN_DESTROYED);
}
-int libxl__domaindeathcheck_start(libxl__gc *gc,
+static void domaindeathcheck_abort(libxl__egc *egc,
+ libxl__ao_abortable *abrt,
+ int rc)
+{
+ libxl__domaindeathcheck *dc = CONTAINER_OF(abrt, *dc, abrt);
+ EGC_GC;
+
+ libxl__domaindeathcheck_stop(gc,dc);
+ dc->callback(egc, dc, rc);
+}
+
+int libxl__domaindeathcheck_start(libxl__ao *ao,
libxl__domaindeathcheck *dc)
{
+ AO_GC;
+ int rc;
const char *path = GCSPRINTF("/local/domain/%"PRIu32, dc->domid);
- return libxl__ev_xswatch_register(gc, &dc->watch,
- domaindeathcheck_callback, path);
+
+ libxl__domaindeathcheck_init(dc);
+
+ dc->abrt.ao = ao;
+ dc->abrt.callback = domaindeathcheck_abort;
+ rc = libxl__ao_abortable_register(&dc->abrt);
+ if (rc) goto out;
+
+ rc = libxl__ev_xswatch_register(gc, &dc->watch,
+ domaindeathcheck_callback, path);
+ if (rc) goto out;
+
+ return 0;
+
+ out:
+ libxl__domaindeathcheck_stop(gc,dc);
+ return rc;
}
/*
@@ -1031,6 +1114,8 @@ static int beforepoll_internal(libxl__gc *gc, libxl__poller *poller,
*nfds_io = used;
+ poller->fds_changed = 0;
+
libxl__ev_time *etime = LIBXL_TAILQ_FIRST(&CTX->etimes);
if (etime) {
int our_timeout;
@@ -1059,7 +1144,7 @@ int libxl_osevent_beforepoll(libxl_ctx *ctx, int *nfds_io,
{
EGC_INIT(ctx);
CTX_LOCK;
- int rc = beforepoll_internal(gc, &ctx->poller_app,
+ int rc = beforepoll_internal(gc, ctx->poller_app,
nfds_io, fds, timeout_upd, now);
CTX_UNLOCK;
EGC_FREE;
@@ -1095,7 +1180,7 @@ static int afterpoll_check_fd(libxl__poller *poller,
/* again, stale slot entry */
continue;
- assert(!(fds[slot].revents & POLLNVAL));
+ assert(poller->fds_changed || !(fds[slot].revents & POLLNVAL));
/* we mask in case requested events have changed */
int slot_revents = fds[slot].revents & events;
@@ -1110,6 +1195,17 @@ static int afterpoll_check_fd(libxl__poller *poller,
return revents;
}
+static void fd_occurs(libxl__egc *egc, libxl__ev_fd *efd, short revents_ign)
+{
+ short revents_current = libxl__fd_poll_recheck(egc, efd->fd, efd->events);
+
+ DBG("ev_fd=%p occurs fd=%d events=%x revents_ign=%x revents_current=%x",
+ efd, efd->fd, efd->events, revents_ign, revents_current);
+
+ if (revents_current)
+ efd->func(egc, efd, efd->fd, efd->events, revents_current);
+}
+
static void afterpoll_internal(libxl__egc *egc, libxl__poller *poller,
int nfds, const struct pollfd *fds,
struct timeval now)
@@ -1172,10 +1268,7 @@ static void afterpoll_internal(libxl__egc *egc, libxl__poller *poller,
break;
found_fd_event:
- DBG("ev_fd=%p occurs fd=%d events=%x revents=%x",
- efd, efd->fd, efd->events, revents);
-
- efd->func(egc, efd, efd->fd, efd->events, revents);
+ fd_occurs(egc, efd, revents);
}
if (afterpoll_check_fd(poller,fds,nfds, poller->wakeup_pipe[0],POLLIN)) {
@@ -1195,7 +1288,7 @@ static void afterpoll_internal(libxl__egc *egc, libxl__poller *poller,
time_deregister(gc, etime);
- time_occurs(egc, etime);
+ time_occurs(egc, etime, ERROR_TIMEDOUT);
}
}
@@ -1204,7 +1297,7 @@ void libxl_osevent_afterpoll(libxl_ctx *ctx, int nfds, const struct pollfd *fds,
{
EGC_INIT(ctx);
CTX_LOCK;
- afterpoll_internal(egc, &ctx->poller_app, nfds, fds, now);
+ afterpoll_internal(egc, ctx->poller_app, nfds, fds, now);
CTX_UNLOCK;
EGC_FREE;
}
@@ -1239,24 +1332,7 @@ void libxl_osevent_occurred_fd(libxl_ctx *ctx, void *for_libxl,
if (!ev) goto out;
if (ev->fd != fd) goto out;
- struct pollfd check;
- for (;;) {
- check.fd = fd;
- check.events = ev->events;
- int r = poll(&check, 1, 0);
- if (!r)
- goto out;
- if (r==1)
- break;
- assert(r<0);
- if (errno != EINTR) {
- LIBXL__EVENT_DISASTER(egc, "failed poll to check for fd", errno, 0);
- goto out;
- }
- }
-
- if (check.revents)
- ev->func(egc, ev, fd, ev->events, check.revents);
+ fd_occurs(egc, ev, revents_ign);
out:
CTX_UNLOCK;
@@ -1279,7 +1355,7 @@ void libxl_osevent_occurred_timeout(libxl_ctx *ctx, void *for_libxl)
LIBXL_TAILQ_REMOVE(&CTX->etimes, ev, entry);
- time_occurs(egc, ev);
+ time_occurs(egc, ev, ERROR_TIMEDOUT);
out:
CTX_UNLOCK;
@@ -1335,6 +1411,7 @@ static void egc_run_callbacks(libxl__egc *egc)
aop->how->callback(CTX, aop->ev, aop->how->for_callback);
CTX_LOCK;
+ assert(aop->ao->magic == LIBXL__AO_MAGIC);
aop->ao->progress_reports_outstanding--;
libxl__ao_complete_check_progress_reports(egc, aop->ao);
CTX_UNLOCK;
@@ -1348,8 +1425,7 @@ static void egc_run_callbacks(libxl__egc *egc)
ao->how.callback(CTX, ao->rc, ao->how.u.for_callback);
CTX_LOCK;
ao->notified = 1;
- if (!ao->in_initiator)
- libxl__ao__destroy(CTX, ao);
+ ao__check_destroy(CTX, ao);
CTX_UNLOCK;
}
}
@@ -1531,6 +1607,7 @@ int libxl__poller_init(libxl__gc *gc, libxl__poller *p)
int rc;
p->fd_polls = 0;
p->fd_rindices = 0;
+ p->fds_changed = 0;
rc = libxl__pipe_nonblock(CTX, p->wakeup_pipe);
if (rc) goto out;
@@ -1557,23 +1634,25 @@ libxl__poller *libxl__poller_get(libxl__gc *gc)
libxl__poller *p = LIBXL_LIST_FIRST(&CTX->pollers_idle);
if (p) {
LIBXL_LIST_REMOVE(p, entry);
- return p;
- }
-
- p = libxl__zalloc(NOGC, sizeof(*p));
+ } else {
+ p = libxl__zalloc(NOGC, sizeof(*p));
- rc = libxl__poller_init(gc, p);
- if (rc) {
- free(p);
- return NULL;
+ rc = libxl__poller_init(gc, p);
+ if (rc) {
+ free(p);
+ return NULL;
+ }
}
+ LIBXL_LIST_INSERT_HEAD(&CTX->pollers_fds_changed, p,
+ fds_changed_entry);
return p;
}
void libxl__poller_put(libxl_ctx *ctx, libxl__poller *p)
{
if (!p) return;
+ LIBXL_LIST_REMOVE(p, fds_changed_entry);
LIBXL_LIST_INSERT_HEAD(&ctx->pollers_idle, p, entry);
}
@@ -1730,6 +1809,33 @@ int libxl_event_wait(libxl_ctx *ctx, libxl_event **event_r,
* - destroy the ao
*/
+
+/*
+ * A "manip" is a libxl public function manipulating this ao, which
+ * has a pointer to it. We have to not destroy it while that's the
+ * case, obviously. Callers must have the ctx locked, obviously.
+ */
+static void ao__manip_enter(libxl__ao *ao)
+{
+ assert(ao->manip_refcnt < INT_MAX);
+ ao->manip_refcnt++;
+}
+
+static void ao__manip_leave(libxl_ctx *ctx, libxl__ao *ao)
+{
+ assert(ao->manip_refcnt > 0);
+ ao->manip_refcnt--;
+ ao__check_destroy(ctx, ao);
+}
+
+static void ao__check_destroy(libxl_ctx *ctx, libxl__ao *ao)
+{
+ if (!ao->manip_refcnt && ao->notified) {
+ assert(ao->complete);
+ libxl__ao__destroy(ctx, ao);
+ }
+}
+
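
The enter/leave pair above brackets every public function that holds a
pointer to the ao; a sketch of the intended pattern (the wrapper is
hypothetical):

    static void example_ao_manip(libxl_ctx *ctx, libxl__ao *ao)
    {
        ao__manip_enter(ao);       /* pin: we hold a pointer to ao */
        /* ... manipulate the ao under the ctx lock ... */
        ao__manip_leave(ctx, ao);  /* unpin; frees it iff complete
                                    * and the owner was notified */
    }
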
void libxl__ao__destroy(libxl_ctx *ctx, libxl__ao *ao)
{
AO_GC;
@@ -1741,19 +1847,22 @@ void libxl__ao__destroy(libxl_ctx *ctx, libxl__ao *ao)
free(ao);
}
-void libxl__ao_abort(libxl__ao *ao)
+void libxl__ao_create_fail(libxl__ao *ao)
{
AO_GC;
- LOG(DEBUG,"ao %p: abort",ao);
+ LOG(DEBUG,"ao %p: create fail",ao);
assert(ao->magic == LIBXL__AO_MAGIC);
assert(ao->in_initiator);
assert(!ao->complete);
assert(!ao->progress_reports_outstanding);
+ assert(!ao->aborting);
+ LIBXL_LIST_REMOVE(ao, inprogress_entry);
libxl__ao__destroy(CTX, ao);
}
libxl__gc *libxl__ao_inprogress_gc(libxl__ao *ao)
{
+ assert(ao);
assert(ao->magic == LIBXL__AO_MAGIC);
assert(!ao->complete);
return &ao->gc;
@@ -1765,26 +1874,33 @@ void libxl__ao_complete(libxl__egc *egc, libxl__ao *ao, int rc)
LOG(DEBUG,"ao %p: complete, rc=%d",ao,rc);
assert(ao->magic == LIBXL__AO_MAGIC);
assert(!ao->complete);
- assert(!ao->nested);
+ assert(!ao->nested_root);
+ assert(!ao->nested_progeny);
ao->complete = 1;
ao->rc = rc;
-
+ LIBXL_LIST_REMOVE(ao, inprogress_entry);
libxl__ao_complete_check_progress_reports(egc, ao);
}
-void libxl__ao_complete_check_progress_reports(libxl__egc *egc, libxl__ao *ao)
+static bool ao_work_outstanding(libxl__ao *ao)
{
/*
* We don't consider an ao complete if it has any outstanding
* callbacks. These callbacks might be outstanding on other
* threads, queued up in the other threads' egc's. Those threads
* will, after making the callback, take out the lock again,
- * decrement progress_reports_outstanding, and call us again.
+ * decrement progress_reports_outstanding, and call
+ * libxl__ao_complete_check_progress_reports.
*/
+ return !ao->complete || ao->progress_reports_outstanding;
+}
+
+void libxl__ao_complete_check_progress_reports(libxl__egc *egc, libxl__ao *ao)
+{
libxl_ctx *ctx = libxl__gc_owner(&egc->gc);
assert(ao->progress_reports_outstanding >= 0);
- if (!ao->complete || ao->progress_reports_outstanding)
+ if (ao_work_outstanding(ao))
return;
if (ao->poller) {
@@ -1804,8 +1920,8 @@ void libxl__ao_complete_check_progress_reports(libxl__egc *egc, libxl__ao *ao)
}
ao->notified = 1;
}
- if (!ao->in_initiator && ao->notified)
- libxl__ao__destroy(ctx, ao);
+
+ ao__check_destroy(ctx, ao);
}
libxl__ao *libxl__ao_create(libxl_ctx *ctx, uint32_t domid,
@@ -1820,6 +1936,7 @@ libxl__ao *libxl__ao_create(libxl_ctx *ctx, uint32_t domid,
ao->magic = LIBXL__AO_MAGIC;
ao->constructing = 1;
ao->in_initiator = 1;
+ ao__manip_enter(ao);
ao->poller = 0;
ao->domid = domid;
LIBXL_INIT_GC(ao->gc, ctx);
@@ -1834,6 +1951,8 @@ libxl__ao *libxl__ao_create(libxl_ctx *ctx, uint32_t domid,
"ao %p: create: how=%p callback=%p poller=%p",
ao, how, ao->how.callback, ao->poller);
+ LIBXL_LIST_INSERT_HEAD(&ctx->aos_inprogress, ao, inprogress_entry);
+
return ao;
out:
@@ -1871,7 +1990,7 @@ int libxl__ao_inprogress(libxl__ao *ao,
for (;;) {
assert(ao->magic == LIBXL__AO_MAGIC);
- if (ao->complete) {
+ if (!ao_work_outstanding(ao)) {
rc = ao->rc;
ao->notified = 1;
break;
@@ -1887,8 +2006,8 @@ int libxl__ao_inprogress(libxl__ao *ao,
sleep(1);
/* It's either this or return ERROR_I_DONT_KNOW_WHETHER
* _THE_THING_YOU_ASKED_FOR_WILL_BE_DONE_LATER_WHEN
- * _YOU_DIDNT_EXPECT_IT, since we don't have any kind of
- * cancellation ability. */
+ * _YOU_DIDNT_EXPECT_IT, since we don't have a
+ * synchronous cancellation ability. */
}
CTX_UNLOCK;
@@ -1900,15 +2019,144 @@ int libxl__ao_inprogress(libxl__ao *ao,
}
ao->in_initiator = 0;
+ ao__manip_leave(CTX, ao);
- if (ao->notified) {
- assert(ao->complete);
- libxl__ao__destroy(CTX,ao);
+ return rc;
+}
+
+
+/* abort requests */
+
+static int ao__abort(libxl_ctx *ctx, libxl__ao *parent)
+/* Temporarily unlocks ctx, which must be locked exactly once on entry. */
+{
+ int rc;
+ ao__manip_enter(parent);
+
+ if (parent->aborting) {
+ rc = ERROR_ABORTED;
+ goto out;
+ }
+
+ parent->aborting = 1;
+
+ if (LIBXL_LIST_EMPTY(&parent->abortables)) {
+ LIBXL__LOG(ctx, XTL_DEBUG,
+ "ao %p: abort requested and noted, but no-one interested",
+ parent);
+ rc = 0;
+ goto out;
}
+ /* We keep calling abort hooks until there are none left */
+ while (!LIBXL_LIST_EMPTY(&parent->abortables)) {
+ libxl__egc egc;
+ LIBXL_INIT_EGC(egc,ctx);
+
+ assert(!parent->complete);
+
+ libxl__ao_abortable *abrt = LIBXL_LIST_FIRST(&parent->abortables);
+ assert(parent == ao_nested_root(abrt->ao));
+
+ LIBXL_LIST_REMOVE(abrt, entry);
+ abrt->registered = 0;
+
+ LIBXL__LOG(ctx, XTL_DEBUG, "ao %p: abrt=%p: aborting",
+ parent, abrt->ao);
+ abrt->callback(&egc, abrt, ERROR_ABORTED);
+
+ libxl__ctx_unlock(ctx);
+ libxl__egc_cleanup(&egc);
+ libxl__ctx_lock(ctx);
+ }
+
+ rc = 0;
+
+ out:
+ ao__manip_leave(ctx, parent);
return rc;
}
+int libxl_ao_abort(libxl_ctx *ctx, const libxl_asyncop_how *how)
+{
+ libxl__ao *search;
+ libxl__ctx_lock(ctx);
+ int rc;
+
+ LIBXL_LIST_FOREACH(search, &ctx->aos_inprogress, inprogress_entry) {
+ if (how) {
+ /* looking for ao to be reported by callback or event */
+ if (search->poller)
+ /* sync */
+ continue;
+ if (how->callback != search->how.callback)
+ continue;
+ if (how->callback
+ ? (how->u.for_callback != search->how.u.for_callback)
+ : (how->u.for_event != search->how.u.for_event))
+ continue;
+ } else {
+ /* looking for synchronous call */
+ if (!search->poller)
+ /* async */
+ continue;
+ }
+ goto found;
+ }
+ rc = ERROR_NOTFOUND;
+ goto out;
+
+ found:
+ rc = ao__abort(ctx, search);
+ out:
+ libxl__ctx_unlock(ctx);
+ return rc;
+}
+
+int libxl__ao_aborting(libxl__ao *ao)
+{
+ libxl__ao *root = ao_nested_root(ao);
+ if (root->aborting) {
+ DBG("ao=%p: aborting at explicit check (root=%p)", ao, root);
+ return ERROR_ABORTED;
+ }
+
+ return 0;
+}
+
+int libxl__ao_abortable_register(libxl__ao_abortable *abrt)
+{
+ libxl__ao *ao = abrt->ao;
+ libxl__ao *root = ao_nested_root(ao);
+ AO_GC;
+
+ if (root->aborting) {
+ DBG("ao=%p: preemptively aborting ao_abortable registration %p (root=%p)",
+ ao, abrt, root);
+ return ERROR_ABORTED;
+ }
+
+ DBG("ao=%p, abrt=%p: registering (root=%p)", ao, abrt, root);
+ LIBXL_LIST_INSERT_HEAD(&root->abortables, abrt, entry);
+ abrt->registered = 1;
+
+ return 0;
+}
+
+_hidden void libxl__ao_abortable_deregister(libxl__ao_abortable *abrt)
+{
+ if (!abrt->registered)
+ return;
+
+ libxl__ao *ao = abrt->ao;
+ libxl__ao *root __attribute__((unused)) = ao_nested_root(ao);
+ AO_GC;
+
+ DBG("ao=%p, abrt=%p: deregistering (root=%p)", ao, abrt, root);
+ LIBXL_LIST_REMOVE(abrt, entry);
+ abrt->registered = 0;
+}
+
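
A registration sketch, modelled on time_register_abortable() earlier in this
patch: the sub-operation fills in .ao and .callback before registering, and
an ERROR_ABORTED result means the root ao is already aborting, so the
operation must not start (the wrapper name is illustrative):

    static int my_op_arm_abort(libxl__ao *ao, libxl__ao_abortable *abrt,
                               libxl__ao_abortable_callback *cb)
    {
        abrt->ao = ao;          /* the (possibly nested) child ao */
        abrt->callback = cb;    /* invoked with ERROR_ABORTED on abort */
        return libxl__ao_abortable_register(abrt);
    }
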
/* progress reporting */
@@ -1930,7 +2178,7 @@ void libxl__ao_progress_report(libxl__egc *egc, libxl__ao *ao,
const libxl_asyncprogress_how *how, libxl_event *ev)
{
AO_GC;
- assert(!ao->nested);
+ assert(!ao->nested_root);
if (how->callback == dummy_asyncprogress_callback_ignore) {
LOG(DEBUG,"ao %p: progress report: ignored",ao);
libxl_event_free(CTX,ev);
@@ -1953,21 +2201,25 @@ void libxl__ao_progress_report(libxl__egc *egc, libxl__ao *ao,
/* nested ao */
+static libxl__ao *ao_nested_root(libxl__ao *ao) {
+ libxl__ao *root = ao->nested_root ? : ao;
+ assert(!root->nested_root);
+ return root;
+}
+
_hidden libxl__ao *libxl__nested_ao_create(libxl__ao *parent)
{
- /* We only use the parent to get the ctx. However, we require the
- * caller to provide us with an ao, not just a ctx, to prove that
- * they are already in an asynchronous operation. That will avoid
- * people using this to (for example) make an ao in a non-ao_how
- * function somewhere in the middle of libxl. */
- libxl__ao *child = NULL;
+ libxl__ao *child = NULL, *root;
libxl_ctx *ctx = libxl__gc_owner(&parent->gc);
assert(parent->magic == LIBXL__AO_MAGIC);
+ root = ao_nested_root(parent);
child = libxl__zalloc(&ctx->nogc_gc, sizeof(*child));
child->magic = LIBXL__AO_MAGIC;
- child->nested = 1;
+ child->nested_root = root;
+ assert(root->nested_progeny < INT_MAX);
+ root->nested_progeny++;
LIBXL_INIT_GC(child->gc, ctx);
libxl__gc *gc = &child->gc;
@@ -1978,7 +2230,10 @@ _hidden libxl__ao *libxl__nested_ao_create(libxl__ao *parent)
_hidden void libxl__nested_ao_free(libxl__ao *child)
{
assert(child->magic == LIBXL__AO_MAGIC);
- assert(child->nested);
+ libxl__ao *root = child->nested_root;
+ assert(root);
+ assert(root->nested_progeny > 0);
+ root->nested_progeny--;
libxl_ctx *ctx = libxl__gc_owner(&child->gc);
libxl__ao__destroy(ctx, child);
}
diff --git a/tools/libxl/libxl_event.h b/tools/libxl/libxl_event.h
index 3c6fcfe..fad4c14 100644
--- a/tools/libxl/libxl_event.h
+++ b/tools/libxl/libxl_event.h
@@ -213,7 +213,7 @@ void libxl_evdisable_disk_eject(libxl_ctx *ctx, libxl_evgen_disk_eject*);
* libxl_osevent_afterpoll(...);
* for (;;) {
* r = libxl_event_check(...);
- * if (r==LIBXL_NOT_READY) break;
+ * if (r==ERROR_NOT_READY) break;
* if (r) goto error_out;
* do something with the event;
* }
diff --git a/tools/libxl/libxl_exec.c b/tools/libxl/libxl_exec.c
index 478b4c2..ecb30cf 100644
--- a/tools/libxl/libxl_exec.c
+++ b/tools/libxl/libxl_exec.c
@@ -238,11 +238,11 @@ err:
/*
* Full set of possible states of a libxl__spawn_state and its _detachable:
*
- *                     detaching  failed  mid     timeout      xswatch
+ *                     detaching  rc      mid     timeout      xswatch
*  - Undefined          undef      undef   -       undef        undef
*  - Idle               any        any     Idle    Idle         Idle
*  - Attached OK        0          0       Active  Active       Active
- *  - Attached Failed  0          1       Active  Idle         Idle
+ *  - Attached Failed  0          non-0   Active  Idle         Idle
*  - Detaching          1          maybe   Active  Idle         Idle
*  - Partial            any        any     Idle    Active/Idle  Active/Idle
*
@@ -267,7 +267,7 @@ static void spawn_cleanup(libxl__gc *gc, libxl__spawn_state *ss);
/* Precondition: Attached or Detaching; caller has logged failure reason.
* Results: Detaching, or Attached Failed */
-static void spawn_fail(libxl__egc *egc, libxl__spawn_state *ss);
+static void spawn_fail(libxl__egc *egc, libxl__spawn_state *ss, int rc);
void libxl__spawn_init(libxl__spawn_state *ss)
{
@@ -283,7 +283,7 @@ int libxl__spawn_spawn(libxl__egc *egc, libxl__spawn_state *ss)
int status, rc;
libxl__spawn_init(ss);
- ss->failed = ss->detaching = 0;
+ ss->rc = ss->detaching = 0;
ss->xswait.ao = ao;
ss->xswait.what = GCSPRINTF("%s startup", ss->what);
@@ -352,12 +352,13 @@ static void spawn_cleanup(libxl__gc *gc, libxl__spawn_state *ss)
static void spawn_detach(libxl__gc *gc, libxl__spawn_state *ss)
/* Precondition: Attached or Detaching, but caller must have just set
- * at least one of detaching or failed.
+ * at least one of detaching or rc.
* Results: Detaching or Attached Failed */
{
int r;
assert(libxl__ev_child_inuse(&ss->mid));
+ assert(ss->detaching || ss->rc);
libxl__xswait_stop(gc, &ss->xswait);
pid_t child = ss->mid.pid;
@@ -373,12 +374,13 @@ void libxl__spawn_initiate_detach(libxl__gc *gc, libxl__spawn_state *ss)
spawn_detach(gc, ss);
}
-static void spawn_fail(libxl__egc *egc, libxl__spawn_state *ss)
+static void spawn_fail(libxl__egc *egc, libxl__spawn_state *ss, int rc)
/* Caller must have logged. Must be last thing in calling function,
* as it may make the callback. Precondition: Attached or Detaching. */
{
EGC_GC;
- ss->failed = 1;
+ assert(rc);
+ ss->rc = rc;
spawn_detach(gc, ss);
}
@@ -391,9 +393,10 @@ static void spawn_watch_event(libxl__egc *egc, libxl__xswait_state *xswa,
if (rc) {
if (rc == ERROR_TIMEDOUT)
LOG(ERROR, "%s: startup timed out", ss->what);
- spawn_fail(egc, ss); /* must be last */
+ spawn_fail(egc, ss, rc); /* must be last */
return;
}
+ LOG(DEBUG, "%s: spawn watch p=%s", ss->what, p);
ss->confirm_cb(egc, ss, p); /* must be last */
}
@@ -404,16 +407,18 @@ static void spawn_middle_death(libxl__egc *egc, libxl__ev_child *childw,
EGC_GC;
libxl__spawn_state *ss = CONTAINER_OF(childw, *ss, mid);
- if ((ss->failed || ss->detaching) &&
+ if ((ss->rc || ss->detaching) &&
((WIFEXITED(status) && WEXITSTATUS(status)==0) ||
(WIFSIGNALED(status) && WTERMSIG(status)==SIGKILL))) {
/* as expected */
+ const char *what = GCSPRINTF("%s (dying as expected)", ss->what);
+ libxl_report_child_exitstatus(CTX, XTL_DEBUG, what, pid, status);
} else if (!WIFEXITED(status)) {
int loglevel = ss->detaching ? XTL_WARN : XTL_ERROR;
const char *what =
GCSPRINTF("%s intermediate process (startup monitor)", ss->what);
libxl_report_child_exitstatus(CTX, loglevel, what, pid, status);
- ss->failed = 1;
+ ss->rc = ERROR_FAIL;
} else {
if (!status)
LOG(ERROR, "%s [%ld]: unexpectedly exited with exit status 0,"
@@ -432,19 +437,20 @@ static void spawn_middle_death(libxl__egc *egc, libxl__ev_child *childw,
LOG(ERROR, "%s [%ld]: died during startup due to unknown fatal"
" signal number %d", ss->what, (unsigned long)pid, sig);
}
- ss->failed = 1;
+ ss->rc = ERROR_FAIL;
}
spawn_cleanup(gc, ss);
- if (ss->failed && !ss->detaching) {
- ss->failure_cb(egc, ss); /* must be last */
+ if (ss->rc && !ss->detaching) {
+ ss->failure_cb(egc, ss, ss->rc); /* must be last */
return;
}
- if (ss->failed && ss->detaching)
- LOG(WARN,"%s underlying machinery seemed to fail,"
- " but its function seems to have been successful", ss->what);
+ if (ss->rc && ss->detaching)
+ LOG(WARN,"%s underlying machinery seemed to fail (%d),"
+ " but its function seems to have been successful",
+ ss->what, ss->rc);
assert(ss->detaching);
ss->detached_cb(egc, ss);
diff --git a/tools/libxl/libxl_flask.c b/tools/libxl/libxl_flask.c
index 23f2476..38347a3 100644
--- a/tools/libxl/libxl_flask.c
+++ b/tools/libxl/libxl_flask.c
@@ -1,10 +1,15 @@
/*
- *
* Author: Machon Gregory, <mbgrego at tycho.ncsc.mil>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2,
- * as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
*/
#include "libxl_osdeps.h" /* must come before any other headers */
diff --git a/tools/libxl/libxl_fork.c b/tools/libxl/libxl_fork.c
index 144208a..024c1e2 100644
--- a/tools/libxl/libxl_fork.c
+++ b/tools/libxl/libxl_fork.c
@@ -112,9 +112,12 @@ libxl__carefd *libxl__carefd_record(libxl_ctx *ctx, int fd)
libxl__carefd *libxl__carefd_opened(libxl_ctx *ctx, int fd)
{
libxl__carefd *cf = 0;
+ int saved_errno = errno;
- cf = libxl__carefd_record(ctx, fd);
+ if (fd >= 0)
+ cf = libxl__carefd_record(ctx, fd);
libxl__carefd_unlock();
+ errno = saved_errno;
return cf;
}
@@ -447,9 +450,10 @@ static int perhaps_sigchld_needed(libxl__gc *gc, bool creating)
static void childproc_reaped_ours(libxl__egc *egc, libxl__ev_child *ch,
int status)
{
+ pid_t pid = ch->pid;
LIBXL_LIST_REMOVE(ch, entry);
ch->pid = -1;
- ch->callback(egc, ch, ch->pid, status);
+ ch->callback(egc, ch, pid, status);
}
static int childproc_reaped(libxl__egc *egc, pid_t pid, int status)
diff --git a/tools/libxl/libxl_freebsd.c b/tools/libxl/libxl_freebsd.c
index e8b88b3..47c3391 100644
--- a/tools/libxl/libxl_freebsd.c
+++ b/tools/libxl/libxl_freebsd.c
@@ -131,3 +131,15 @@ libxl_device_model_version libxl__default_device_model(libxl__gc *gc)
{
return LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN;
}
+
+int libxl__pci_numdevs(libxl__gc *gc)
+{
+ return ERROR_NI;
+}
+
+int libxl__pci_topology_init(libxl__gc *gc,
+ physdev_pci_device_t *devs,
+ int num_devs)
+{
+ return ERROR_NI;
+}
diff --git a/tools/libxl/libxl_internal.c b/tools/libxl/libxl_internal.c
index 00c3b1e..366ea05 100644
--- a/tools/libxl/libxl_internal.c
+++ b/tools/libxl/libxl_internal.c
@@ -127,21 +127,33 @@ void *libxl__realloc(libxl__gc *gc, void *ptr, size_t new_size)
return new_ptr;
}
-char *libxl__sprintf(libxl__gc *gc, const char *fmt, ...)
+char *libxl__vsprintf(libxl__gc *gc, const char *fmt, va_list ap)
{
char *s;
- va_list ap;
+ va_list aq;
int ret;
- va_start(ap, fmt);
- ret = vsnprintf(NULL, 0, fmt, ap);
- va_end(ap);
+ va_copy(aq, ap);
+ ret = vsnprintf(NULL, 0, fmt, aq);
+ va_end(aq);
assert(ret >= 0);
s = libxl__zalloc(gc, ret + 1);
+ va_copy(aq, ap);
+ ret = vsnprintf(s, ret + 1, fmt, aq);
+ va_end(aq);
+
+ return s;
+}
+
+char *libxl__sprintf(libxl__gc *gc, const char *fmt, ...)
+{
+ char *s;
+ va_list ap;
+
va_start(ap, fmt);
- ret = vsnprintf(s, ret + 1, fmt, ap);
+ s = libxl__vsprintf(gc, fmt, ap);
va_end(ap);
return s;
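
The refactor above depends on the standard two-pass vsnprintf idiom: a
va_list can only be traversed once, so each pass must work on its own
va_copy. The same idiom in plain C, free of libxl types (the function name
is hypothetical):

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    char *xvasprintf(const char *fmt, va_list ap)
    {
        va_list aq;
        char *s;
        int n;

        va_copy(aq, ap);
        n = vsnprintf(NULL, 0, fmt, aq);    /* sizing pass */
        va_end(aq);
        if (n < 0) return NULL;

        s = malloc(n + 1);
        if (!s) return NULL;

        va_copy(aq, ap);
        vsnprintf(s, n + 1, fmt, aq);       /* formatting pass */
        va_end(aq);
        return s;
    }
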
@@ -149,7 +161,11 @@ char *libxl__sprintf(libxl__gc *gc, const char *fmt, ...)
char *libxl__strdup(libxl__gc *gc, const char *c)
{
- char *s = strdup(c);
+ char *s;
+
+ if (!c) return NULL;
+
+ s = strdup(c);
if (!s) libxl__alloc_failed(CTX, __func__, strlen(c), 1);
@@ -160,7 +176,11 @@ char *libxl__strdup(libxl__gc *gc, const char *c)
char *libxl__strndup(libxl__gc *gc, const char *c, size_t n)
{
- char *s = strndup(c, n);
+ char *s;
+
+ if (!c) return NULL;
+
+ s = strndup(c, n);
if (!s) libxl__alloc_failed(CTX, __func__, n, 1);
@@ -221,8 +241,7 @@ void libxl__log(libxl_ctx *ctx, xentoollog_level msglevel, int errnoval,
char *libxl__abs_path(libxl__gc *gc, const char *s, const char *path)
{
- if (!s || s[0] == '/')
- return libxl__strdup(gc, s);
+ if (s[0] == '/') return libxl__strdup(gc, s);
return libxl__sprintf(gc, "%s/%s", path, s);
}
@@ -321,19 +340,18 @@ _hidden int libxl__init_recursive_mutex(libxl_ctx *ctx, pthread_mutex_t *lock)
int rc = 0;
if (pthread_mutexattr_init(&attr) != 0) {
- LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
- "Failed to init mutex attributes\n");
+ LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
+ "Failed to init mutex attributes");
return ERROR_FAIL;
}
if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE) != 0) {
- LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
- "Failed to set mutex attributes\n");
+ LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
+ "Failed to set mutex attributes");
rc = ERROR_FAIL;
goto out;
}
if (pthread_mutex_init(lock, &attr) != 0) {
- LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
- "Failed to init mutex\n");
+ LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "Failed to init mutex");
rc = ERROR_FAIL;
goto out;
}
@@ -364,25 +382,6 @@ int libxl__device_model_version_running(libxl__gc *gc, uint32_t domid)
return value;
}
-int libxl__hotplug_settings(libxl__gc *gc, xs_transaction_t t)
-{
- int rc = 0;
- char *val;
-
- val = libxl__xs_read(gc, t, DISABLE_UDEV_PATH);
- if (!val && errno != ENOENT) {
- LOGE(ERROR, "cannot read %s from xenstore", DISABLE_UDEV_PATH);
- rc = ERROR_FAIL;
- goto out;
- }
- if (!val) val = "0";
-
- rc = !!atoi(val);
-
-out:
- return rc;
-}
-
/* Portability note: this lock utilises flock(2) so a proper implementation of
* flock(2) is required.
*/
@@ -405,7 +404,7 @@ libxl__domain_userdata_lock *libxl__lock_domain_userdata(libxl__gc *gc,
fd = open(lockfile, O_RDWR|O_CREAT, 0666);
if (fd < 0)
LOGE(ERROR, "cannot open lockfile %s, errno=%d", lockfile, errno);
- lock->lock_carefd = libxl__carefd_opened(CTX, fd);
+ lock->carefd = libxl__carefd_opened(CTX, fd);
if (fd < 0) goto out;
/* Lock the file in exclusive mode, wait indefinitely to
@@ -440,7 +439,7 @@ libxl__domain_userdata_lock *libxl__lock_domain_userdata(libxl__gc *gc,
break;
}
- libxl__carefd_close(lock->lock_carefd);
+ libxl__carefd_close(lock->carefd);
}
/* Check the domain is still there, if not we should release the
@@ -458,8 +457,22 @@ out:
void libxl__unlock_domain_userdata(libxl__domain_userdata_lock *lock)
{
+ /* It's important to unlink the file before closing the fd, to
+ * avoid the following race (if we close before unlinking):
+ *
+ * P1 LOCK P2 UNLOCK
+ * fd1 = open(lockfile)
+ * close(fd2)
+ * flock(fd1)
+ * fstat and stat check success
+ * unlink(lockfile)
+ * return lock
+ *
+ * In the above case P1 thinks it has got hold of the lock, but
+ * actually the lock has been released by P2 (lockfile unlinked).
+ */
if (lock->path) unlink(lock->path);
- if (lock->lock_carefd) libxl__carefd_close(lock->lock_carefd);
+ if (lock->carefd) libxl__carefd_close(lock->carefd);
free(lock->path);
free(lock);
}
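
The comment above describes the unlock side; the matching acquire side
(partially visible in the earlier hunk: open, flock, then validate) follows
this shape. A self-contained sketch under the usual POSIX assumptions (the
helper name is illustrative):

    #include <fcntl.h>
    #include <sys/file.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static int lock_file_carefully(const char *path)
    {
        for (;;) {
            struct stat stab, fstab;
            int fd = open(path, O_RDWR | O_CREAT, 0666);
            if (fd < 0) return -1;
            if (flock(fd, LOCK_EX)) { close(fd); return -1; }
            /* Did the file we locked get unlinked (and perhaps
             * recreated) while we were blocked in flock()? */
            if (!fstat(fd, &fstab) && !stat(path, &stab) &&
                fstab.st_dev == stab.st_dev &&
                fstab.st_ino == stab.st_ino)
                return fd;      /* still the file on disk: locked */
            close(fd);          /* raced with an unlocker; retry */
        }
    }
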
@@ -541,6 +554,22 @@ void libxl__update_domain_configuration(libxl__gc *gc,
dst->b_info.video_memkb = src->b_info.video_memkb;
}
+char *libxl__device_model_xs_path(libxl__gc *gc, uint32_t dm_domid,
+ uint32_t domid, const char *format, ...)
+{
+ char *s, *fmt;
+ va_list ap;
+
+ fmt = GCSPRINTF("/local/domain/%u/device-model/%u%s", dm_domid,
+ domid, format);
+
+ va_start(ap, format);
+ s = libxl__vsprintf(gc, fmt, ap);
+ va_end(ap);
+
+ return s;
+}
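
As a usage illustration only (values hypothetical): with a stub device model in domain 5 serving guest domain 7, a call such as the following yields "/local/domain/5/device-model/7/state", per the GCSPRINTF format above.

    /* Hypothetical call; the caller's format is appended to the prefix. */
    const char *path = libxl__device_model_xs_path(gc, 5, 7, "/state");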
+
/*
* Local variables:
* mode: C
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 6dac0f8..1699f32 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -19,6 +19,8 @@
#include "libxl_osdeps.h" /* must come before any other headers */
+#include "libxl_sr_stream_format.h"
+
#include <assert.h>
#include <dirent.h>
#include <errno.h>
@@ -84,8 +86,12 @@
#define LIBXL_INIT_TIMEOUT 10
#define LIBXL_DESTROY_TIMEOUT 10
-#define LIBXL_HOTPLUG_TIMEOUT 10
-#define LIBXL_DEVICE_MODEL_START_TIMEOUT 10
+#define LIBXL_HOTPLUG_TIMEOUT 40
+/* QEMU may be slow to load and start due to a bug in Linux where the I/O
+ * subsystem sometimes produces high latency under load. */
+#define LIBXL_DEVICE_MODEL_START_TIMEOUT 60
+#define LIBXL_DEVICE_MODEL_RESTORE_FILE "/var/lib/xen/qemu-resume" /* .$domid */
+#define LIBXL_STUBDOM_START_TIMEOUT 30
#define LIBXL_QEMU_BODGE_TIMEOUT 2
#define LIBXL_XENCONSOLE_LIMIT 1048576
#define LIBXL_XENCONSOLE_PROTOCOL "vt100"
@@ -103,11 +109,29 @@
#define STUBDOM_CONSOLE_SERIAL 3
#define STUBDOM_SPECIAL_CONSOLES 3
#define TAP_DEVICE_SUFFIX "-emu"
-#define DISABLE_UDEV_PATH "libxl/disable_udev"
#define DOMID_XS_PATH "domid"
#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
+#define ROUNDUP(_val, _order) \
+ (((unsigned long)(_val)+(1UL<<(_order))-1) & ~((1UL<<(_order))-1))
+
+#define min(X, Y) ({ \
+ const typeof (X) _x = (X); \
+ const typeof (Y) _y = (Y); \
+ (void) (&_x == &_y); \
+ (_x < _y) ? _x : _y; })
+#define max(X, Y) ({ \
+ const typeof (X) _x = (X); \
+ const typeof (Y) _y = (Y); \
+ (void) (&_x == &_y); \
+ (_x > _y) ? _x : _y; })
+
+#define min_t(type, x, y) \
+ ({ const type _x = (x); const type _y = (y); _x < _y ? _x: _y; })
+#define max_t(type, x, y) \
+ ({ const type _x = (x); const type _y = (y); _x > _y ? _x: _y; })
+
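The (void) (&_x == &_y) lines above are a compile-time type check in the Linux kernel style: comparing pointers to distinct types provokes a warning, so min()/max() reject mixed-type operands, while min_t()/max_t() settle the type explicitly. ROUNDUP rounds up to a multiple of 2^_order. A worked illustration (values chosen for the example only):

    unsigned long nbytes = 5;
    uint64_t cap = 9;
    /* min(nbytes, cap) would warn: &nbytes and &cap differ in type. */
    uint64_t n = min_t(uint64_t, nbytes, cap);   /* n == 5 */
    unsigned long sz = ROUNDUP(nbytes, 12);      /* (5+4095) & ~4095 == 4096 */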
#define LIBXL__LOGGING_ENABLED
#ifdef LIBXL__LOGGING_ENABLED
@@ -121,6 +145,9 @@
#endif
/* all of these macros preserve errno (saving and restoring) */
+/* Convert pfn to physical address space. */
+#define pfn_to_paddr(x) ((uint64_t)(x) << XC_PAGE_SHIFT)
+
/* logging */
_hidden void libxl__logv(libxl_ctx *ctx, xentoollog_level msglevel, int errnoval,
const char *file /* may be 0 */, int line /* ignored if !file */,
@@ -160,6 +187,9 @@ typedef void libxl__ev_fd_callback(libxl__egc *egc, libxl__ev_fd *ev,
*
* It is not permitted to listen for the same or overlapping events
* on the same fd using multiple different libxl__ev_fd's.
+ *
+ * (Spurious wakeups, and spurious bits set in revents, are
+ * suppressed by the libxl event core.)
*/
struct libxl__ev_fd {
/* caller should include this in their own struct */
@@ -173,9 +203,46 @@ struct libxl__ev_fd {
};
+typedef struct libxl__ao_abortable libxl__ao_abortable;
+typedef void libxl__ao_abortable_callback(libxl__egc *egc,
+ libxl__ao_abortable *ao_abortable, int rc /* ABORTED */);
+
+struct libxl__ao_abortable {
+ /* caller must fill this in and it must remain valid */
+ libxl__ao *ao;
+ libxl__ao_abortable_callback *callback;
+ /* remainder is private for abort machinery */
+ bool registered;
+ LIBXL_LIST_ENTRY(libxl__ao_abortable) entry;
+ /*
+ * For nested aos:
+ * Semantically, abort affects the whole tree of aos,
+ * not just the parent.
+ * libxl__ao_abortable.ao refers to the child, so
+ * that the child callback sees the right ao. (After all,
+ * it was code dealing with the child that set .ao.)
+ * But, the abortable is recorded on the "abortables" list
+ * for the ultimate root ao, so that every possible child
+ * abort occurs as a result of the abort of the parent.
+ * We set ao->aborting only in the root.
+ */
+};
+
+_hidden int libxl__ao_abortable_register(libxl__ao_abortable*);
+_hidden void libxl__ao_abortable_deregister(libxl__ao_abortable*);
+
+static inline void libxl__ao_abortable_init
+ (libxl__ao_abortable *c) { c->registered = 0; }
+static inline bool libxl__ao_abortable_isregistered
+ (const libxl__ao_abortable *c) { return c->registered; }
+
+int libxl__ao_aborting(libxl__ao *ao); /* -> 0 or ERROR_ABORTED */
+
+
typedef struct libxl__ev_time libxl__ev_time;
typedef void libxl__ev_time_callback(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs);
+ const struct timeval *requested_abs,
+ int rc); /* TIMEDOUT or ABORTED */
struct libxl__ev_time {
/* caller should include this in their own struct */
/* read-only for caller, who may read only when registered: */
@@ -185,6 +252,7 @@ struct libxl__ev_time {
LIBXL_TAILQ_ENTRY(libxl__ev_time) entry;
struct timeval abs;
libxl__osevent_hook_nexus *nexus;
+ libxl__ao_abortable abrt;
};
typedef struct libxl__ev_xswatch libxl__ev_xswatch;
@@ -304,6 +372,18 @@ struct libxl__poller {
int (*fd_rindices)[3]; /* see libxl_event.c:beforepoll_internal */
int wakeup_pipe[2]; /* 0 means no fd allocated */
+
+ /*
+ * We also use the poller to record whether any fds have been
+ * deregistered since we entered poll. Each poller which is not
+ * idle is on the list pollers_fds_changed. fds_changed is
+ * cleared by beforepoll, and tested by afterpoll. Whenever an fd
+ * event is deregistered, we set the fds_changed of all non-idle
+ * pollers. So afterpoll can tell whether any POLLNVAL is
+ * plausibly due to an fd being closed and reopened.
+ */
+ LIBXL_LIST_ENTRY(libxl__poller) fds_changed_entry;
+ bool fds_changed;
};
struct libxl__gc {
@@ -343,8 +423,9 @@ struct libxl__ctx {
/* See the comment for OSEVENT_HOOK_INTERN in libxl_event.c
* for restrictions on the use of the osevent fields. */
- libxl__poller poller_app; /* libxl_osevent_beforepoll and _afterpoll */
+ libxl__poller *poller_app; /* libxl_osevent_beforepoll and _afterpoll */
LIBXL_LIST_HEAD(, libxl__poller) pollers_event, pollers_idle;
+ LIBXL_LIST_HEAD(, libxl__poller) pollers_fds_changed;
LIBXL_SLIST_HEAD(libxl__osevent_hook_nexi, libxl__osevent_hook_nexus)
hook_fd_nexi_idle, hook_timeout_nexi_idle;
@@ -361,6 +442,8 @@ struct libxl__ctx {
LIBXL_LIST_HEAD(, libxl__ev_evtchn) evtchns_waiting;
libxl__ev_fd evtchn_efd;
+ LIBXL_LIST_HEAD(, libxl__ao) aos_inprogress;
+
LIBXL_TAILQ_HEAD(libxl__evgen_domain_death_list, libxl_evgen_domain_death)
death_list /* sorted by domid */,
death_reported;
@@ -447,9 +530,15 @@ struct libxl__ao {
* only in libxl__ao_complete.)
*/
uint32_t magic;
- unsigned constructing:1, in_initiator:1, complete:1, notified:1, nested:1;
+ unsigned constructing:1, in_initiator:1, complete:1, notified:1,
+ aborting:1;
+ int manip_refcnt;
+ libxl__ao *nested_root;
+ int nested_progeny;
int progress_reports_outstanding;
int rc;
+ LIBXL_LIST_HEAD(, libxl__ao_abortable) abortables;
+ LIBXL_LIST_ENTRY(libxl__ao) inprogress_entry;
libxl__gc gc;
libxl_asyncop_how how;
libxl__poller *poller;
@@ -521,10 +610,14 @@ _hidden void *libxl__realloc(libxl__gc *gc_opt, void *ptr, size_t new_size) NN1;
/* print @fmt into an allocated string large enough to contain the result.
* (similar to gc'd asprintf(3)). */
_hidden char *libxl__sprintf(libxl__gc *gc_opt, const char *fmt, ...) PRINTF_ATTRIBUTE(2, 3) NN1;
+_hidden char *libxl__vsprintf(libxl__gc *gc, const char *format, va_list ap);
/* duplicate the string @c (similar to a gc'd strdup(3)). */
-_hidden char *libxl__strdup(libxl__gc *gc_opt, const char *c) NN1;
+_hidden char *libxl__strdup(libxl__gc *gc_opt,
+ const char *c /* may be NULL */) NN1;
/* duplicate at most @n bytes of string @c (similar to a gc'd strndup(3)). */
-_hidden char *libxl__strndup(libxl__gc *gc_opt, const char *c, size_t n) NN1;
+_hidden char *libxl__strndup(libxl__gc *gc_opt,
+ const char *c /* may be NULL */,
+ size_t n) NN1;
/* strip the last path component from @s and return as a newly allocated
* string. (similar to a gc'd dirname(3)). */
_hidden char *libxl__dirname(libxl__gc *gc_opt, const char *s) NN1;
@@ -537,6 +630,17 @@ _hidden int libxl__pipe_nonblock(libxl_ctx *ctx, int fds[2]);
* `not open'. Ignores any errors. Sets fds[] to -1. */
_hidden void libxl__pipe_close(int fds[2]);
+/* Change the flags for the file description associated with fd to
+ * (flags & mask) | val.
+ * If r_oldflags != NULL then sets *r_oldflags to the original set of
+ * flags.
+ */
+_hidden int libxl__fd_flags_modify_save(libxl__gc *gc, int fd,
+ int mask, int val, int *r_oldflags);
+/* Restores the flags for the file description associated with fd
+ * to the previous value (returned by libxl__fd_flags_modify_save)
+ */
+_hidden int libxl__fd_flags_restore(libxl__gc *gc, int fd, int old_flags);
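
A plausible pairing of these two helpers (usage sketch; per the mask/val semantics documented above, mask ~0 with val O_NONBLOCK adds the flag while preserving the rest):

    int fdfl, rc;
    rc = libxl__fd_flags_modify_save(gc, fd, ~0, O_NONBLOCK, &fdfl);
    if (rc) goto out;
    /* ... perform the non-blocking I/O ... */
    rc = libxl__fd_flags_restore(gc, fd, fdfl);  /* put original flags back */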
/* Each of these logs errors and returns a libxl error code.
* They do not mind if path is already removed.
@@ -770,10 +874,10 @@ static inline void libxl__ev_fd_init(libxl__ev_fd *efd)
static inline int libxl__ev_fd_isregistered(const libxl__ev_fd *efd)
{ return efd->fd >= 0; }
-_hidden int libxl__ev_time_register_rel(libxl__gc*, libxl__ev_time *ev_out,
+_hidden int libxl__ev_time_register_rel(libxl__ao*, libxl__ev_time *ev_out,
libxl__ev_time_callback*,
int milliseconds /* as for poll(2) */);
-_hidden int libxl__ev_time_register_abs(libxl__gc*, libxl__ev_time *ev_out,
+_hidden int libxl__ev_time_register_abs(libxl__ao*, libxl__ev_time *ev_out,
libxl__ev_time_callback*,
struct timeval);
_hidden int libxl__ev_time_modify_rel(libxl__gc*, libxl__ev_time *ev,
@@ -782,7 +886,7 @@ _hidden int libxl__ev_time_modify_abs(libxl__gc*, libxl__ev_time *ev,
struct timeval);
_hidden void libxl__ev_time_deregister(libxl__gc*, libxl__ev_time *ev);
static inline void libxl__ev_time_init(libxl__ev_time *ev)
- { ev->func = 0; }
+ { ev->func = 0; libxl__ao_abortable_init(&ev->abrt); }
static inline int libxl__ev_time_isregistered(const libxl__ev_time *ev)
{ return !!ev->func; }
@@ -950,7 +1054,6 @@ _hidden int libxl__file_reference_unmap(libxl__file_reference *f);
/* from xl_dom */
_hidden libxl_domain_type libxl__domain_type(libxl__gc *gc, uint32_t domid);
-_hidden int libxl__domain_shutdown_reason(libxl__gc *gc, uint32_t domid);
_hidden int libxl__domain_cpupool(libxl__gc *gc, uint32_t domid);
_hidden libxl_scheduler libxl__domain_scheduler(libxl__gc *gc, uint32_t domid);
_hidden int libxl__sched_set_params(libxl__gc *gc, uint32_t domid,
@@ -973,6 +1076,11 @@ typedef struct {
libxl__file_reference pv_ramdisk;
const char * pv_cmdline;
bool pvh_enabled;
+
+ xen_vmemrange_t *vmemranges;
+ uint32_t num_vmemranges;
+
+ xc_domain_configuration_t config;
} libxl__domain_build_state;
_hidden int libxl__build_pre(libxl__gc *gc, uint32_t domid,
@@ -985,7 +1093,7 @@ _hidden int libxl__build_post(libxl__gc *gc, uint32_t domid,
_hidden int libxl__build_pv(libxl__gc *gc, uint32_t domid,
libxl_domain_build_info *info, libxl__domain_build_state *state);
_hidden int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
- libxl_domain_build_info *info,
+ libxl_domain_config *d_config,
libxl__domain_build_state *state);
_hidden int libxl__qemu_traditional_cmd(libxl__gc *gc, uint32_t domid,
@@ -994,8 +1102,6 @@ _hidden int libxl__domain_rename(libxl__gc *gc, uint32_t domid,
const char *old_name, const char *new_name,
xs_transaction_t trans);
-_hidden int libxl__toolstack_restore(uint32_t domid, const uint8_t *buf,
- uint32_t size, void *data);
_hidden int libxl__domain_resume_device_model(libxl__gc *gc, uint32_t domid);
_hidden const char *libxl__userdata_path(libxl__gc *gc, uint32_t domid,
@@ -1082,6 +1188,8 @@ _hidden int libxl__device_vtpm_setdefault(libxl__gc *gc, libxl_device_vtpm *vtpm
_hidden int libxl__device_vfb_setdefault(libxl__gc *gc, libxl_device_vfb *vfb);
_hidden int libxl__device_vkb_setdefault(libxl__gc *gc, libxl_device_vkb *vkb);
_hidden int libxl__device_pci_setdefault(libxl__gc *gc, libxl_device_pci *pci);
+_hidden void libxl__rdm_setdefault(libxl__gc *gc,
+ libxl_domain_build_info *b_info);
_hidden const char *libxl__device_nic_devname(libxl__gc *gc,
uint32_t domid,
@@ -1090,6 +1198,61 @@ _hidden const char *libxl__device_nic_devname(libxl__gc *gc,
_hidden int libxl__get_domid(libxl__gc *gc, uint32_t *domid);
+/*----- xswait: wait for a xenstore node to be suitable -----*/
+
+typedef struct libxl__xswait_state libxl__xswait_state;
+
+/*
+ * rc describes the circumstances of this callback:
+ *
+ * rc==0
+ *
+ * The xenstore path (may have) changed. It has been read for
+ * you. The result is in data (allocated from the ao gc).
+ * data may be NULL, which means that the xenstore read gave
+ * ENOENT.
+ *
+ * If you are satisfied, you MUST call libxl__xswait_stop.
+ * Otherwise, xswait will continue waiting and watching and
+ * will call you back later.
+ *
+ * rc==ERROR_TIMEDOUT, rc==ERROR_ABORTED
+ *
+ * The specified timeout was reached, or the operation was aborted.
+ * This has NOT been logged (except to the debug log).
+ * xswait will not continue (but calling libxl__xswait_stop is OK).
+ *
+ * rc!=0, !=ERROR_TIMEDOUT, !=ERROR_ABORTED
+ *
+ * Some other error occurred.
+ * This HAS been logged.
+ * xswait will not continue (but calling libxl__xswait_stop is OK).
+ *
+ * xswait.path may start with '@', in which case no read is done
+ * and the callback will always get data==0.
+ */
+typedef void libxl__xswait_callback(libxl__egc *egc,
+ libxl__xswait_state *xswa, int rc, const char *data);
+
+struct libxl__xswait_state {
+ /* caller must fill these in, and they must all remain valid */
+ libxl__ao *ao;
+ const char *what; /* for error msgs: noun phrase, what we're waiting for */
+ const char *path;
+ int timeout_ms; /* as for poll(2) */
+ libxl__xswait_callback *callback;
+ /* remaining fields are private to xswait */
+ libxl__ev_time time_ev;
+ libxl__ev_xswatch watch_ev;
+};
+
+void libxl__xswait_init(libxl__xswait_state*);
+void libxl__xswait_stop(libxl__gc*, libxl__xswait_state*); /*idempotent*/
+bool libxl__xswait_inuse(const libxl__xswait_state *ss);
+
+int libxl__xswait_start(libxl__gc*, libxl__xswait_state*);
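
A callback conforming to the contract above might look like this sketch (the "ready" condition, frob name, and surrounding operation are hypothetical):

    static void frob_xswait_cb(libxl__egc *egc, libxl__xswait_state *xswa,
                               int rc, const char *data)
    {
        STATE_AO_GC(xswa->ao);
        if (rc) goto out;              /* TIMEDOUT/ABORTED/other: all final */
        if (!data || strcmp(data, "ready"))
            return;                    /* not satisfied yet: keep waiting */
        libxl__xswait_stop(gc, xswa);  /* satisfied: MUST stop the wait */
    out:
        /* ... complete or fail the surrounding operation using rc ... */;
    }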
+
+
/*
* libxl__ev_devstate - waits a given time for a device to
* reach a given state. Follows the libxl_ev_* conventions.
@@ -1099,8 +1262,9 @@ _hidden int libxl__get_domid(libxl__gc *gc, uint32_t *domid);
typedef struct libxl__ev_devstate libxl__ev_devstate;
typedef void libxl__ev_devstate_callback(libxl__egc *egc, libxl__ev_devstate*,
int rc);
- /* rc will be 0, ERROR_TIMEDOUT, ERROR_INVAL (meaning path was removed),
- * or ERROR_FAIL if other stuff went wrong (in which latter case, logged) */
+ /* rc will be 0, ERROR_TIMEDOUT, ERROR_ABORTED, ERROR_INVAL
+ * (meaning path was removed), or ERROR_FAIL if other stuff went
+ * wrong (in which latter case, logged) */
struct libxl__ev_devstate {
/* read-only for caller, who may read only when waiting: */
@@ -1108,24 +1272,21 @@ struct libxl__ev_devstate {
libxl__ev_devstate_callback *callback;
/* as for the remainder, read-only public parts may also be
* read by the caller (notably, watch.path), but only when waiting: */
- libxl__ev_xswatch watch;
- libxl__ev_time timeout;
+ libxl__xswait_state w;
};
static inline void libxl__ev_devstate_init(libxl__ev_devstate *ds)
{
- libxl__ev_time_init(&ds->timeout);
- libxl__ev_xswatch_init(&ds->watch);
+ libxl__xswait_init(&ds->w);
}
static inline void libxl__ev_devstate_cancel(libxl__gc *gc,
libxl__ev_devstate *ds)
{
- libxl__ev_time_deregister(gc,&ds->timeout);
- libxl__ev_xswatch_deregister(gc,&ds->watch);
+ libxl__xswait_stop(gc,&ds->w);
}
-_hidden int libxl__ev_devstate_wait(libxl__gc *gc, libxl__ev_devstate *ds,
+_hidden int libxl__ev_devstate_wait(libxl__ao *ao, libxl__ev_devstate *ds,
libxl__ev_devstate_callback cb,
const char *state_path,
int state, int milliseconds);
@@ -1138,7 +1299,8 @@ _hidden int libxl__ev_devstate_wait(libxl__gc *gc, libxl__ev_devstate *ds,
typedef struct libxl__domaindeathcheck libxl__domaindeathcheck;
typedef void libxl___domaindeathcheck_callback(libxl__egc *egc,
- libxl__domaindeathcheck*);
+ libxl__domaindeathcheck*,
+ int rc /* DESTROYED or ABORTED */);
struct libxl__domaindeathcheck {
/* must be filled in by caller, and remain valid: */
@@ -1146,16 +1308,15 @@ struct libxl__domaindeathcheck {
uint32_t domid;
libxl___domaindeathcheck_callback *callback;
/* private */
+ libxl__ao_abortable abrt;
libxl__ev_xswatch watch;
};
-_hidden int libxl__domaindeathcheck_start(libxl__gc *gc,
+_hidden int libxl__domaindeathcheck_start(libxl__ao *ao,
libxl__domaindeathcheck *dc);
-static inline void libxl__domaindeathcheck_init
- (libxl__domaindeathcheck *dc) { libxl__ev_xswatch_init(&dc->watch); }
-static inline void libxl__domaindeathcheck_stop(libxl__gc *gc,
- libxl__domaindeathcheck *dc) { libxl__ev_xswatch_deregister(gc,&dc->watch); }
+void libxl__domaindeathcheck_init(libxl__domaindeathcheck *dc);
+void libxl__domaindeathcheck_stop(libxl__gc *gc, libxl__domaindeathcheck *dc);
/*
@@ -1170,6 +1331,11 @@ _hidden int libxl__try_phy_backend(mode_t st_mode);
_hidden char *libxl__devid_to_localdev(libxl__gc *gc, int devid);
+_hidden int libxl__pci_numdevs(libxl__gc *gc);
+_hidden int libxl__pci_topology_init(libxl__gc *gc,
+ physdev_pci_device_t *devs,
+ int num_devs);
+
/* from libxl_pci */
_hidden int libxl__device_pci_add(libxl__gc *gc, uint32_t domid, libxl_device_pci *pcidev, int starting);
@@ -1177,59 +1343,10 @@ _hidden int libxl__create_pci_backend(libxl__gc *gc, uint32_t domid,
libxl_device_pci *pcidev, int num);
_hidden int libxl__device_pci_destroy_all(libxl__gc *gc, uint32_t domid);
-/*----- xswait: wait for a xenstore node to be suitable -----*/
-
-typedef struct libxl__xswait_state libxl__xswait_state;
-
-/*
- * rc describes the circumstances of this callback:
- *
- * rc==0
- *
- * The xenstore path (may have) changed. It has been read for
- * you. The result is in data (allocated from the ao gc).
- * data may be NULL, which means that the xenstore read gave
- * ENOENT.
- *
- * If you are satisfied, you MUST call libxl__xswait_stop.
- * Otherwise, xswait will continue waiting and watching and
- * will call you back later.
- *
- * rc==ERROR_TIMEDOUT
- *
- * The specified timeout was reached.
- * This has NOT been logged (except to the debug log).
- * xswait will not continue (but calling libxl__xswait_stop is OK).
- *
- * rc!=0, !=ERROR_TIMEDOUT
- *
- * Some other error occurred.
- * This HAS been logged.
- * xswait will not continue (but calling libxl__xswait_stop is OK).
- *
- * xswait.path may start with with '@', in which case no read is done
- * and the callback will always get data==0.
- */
-typedef void libxl__xswait_callback(libxl__egc *egc,
- libxl__xswait_state *xswa, int rc, const char *data);
-
-struct libxl__xswait_state {
- /* caller must fill these in, and they must all remain valid */
- libxl__ao *ao;
- const char *what; /* for error msgs: noun phrase, what we're waiting for */
- const char *path;
- int timeout_ms; /* as for poll(2) */
- libxl__xswait_callback *callback;
- /* remaining fields are private to xswait */
- libxl__ev_time time_ev;
- libxl__ev_xswatch watch_ev;
-};
-
-void libxl__xswait_init(libxl__xswait_state*);
-void libxl__xswait_stop(libxl__gc*, libxl__xswait_state*); /*idempotent*/
-bool libxl__xswait_inuse(const libxl__xswait_state *ss);
+/* from libxl_dtdev */
-int libxl__xswait_start(libxl__gc*, libxl__xswait_state*);
+_hidden int libxl__device_dt_add(libxl__gc *gc, uint32_t domid,
+ const libxl_device_dtdev *dtdev);
/*
*----- spawn -----
@@ -1329,7 +1446,8 @@ libxl__spawn_midproc_cb(libxl__gc*, libxl__spawn_state*, pid_t inner);
* The spawn state will be Idle on entry to the callback (and
* it may be reused immediately if desired).
*/
-typedef void libxl__spawn_failure_cb(libxl__egc*, libxl__spawn_state*);
+typedef void libxl__spawn_failure_cb(libxl__egc*, libxl__spawn_state*,
+ int rc);
/*
* Called when the xspath watch triggers. xspath will have been read
@@ -1370,7 +1488,7 @@ struct libxl__spawn_state {
/* remaining fields are private to libxl_spawn_... */
int detaching; /* we are in Detaching */
- int failed; /* might be true whenever we are not Idle */
+ int rc; /* might be non-0 whenever we are not Idle */
libxl__ev_child mid; /* always in use whenever we are not Idle */
libxl__xswait_state xswait;
};
@@ -1393,7 +1511,7 @@ _hidden int libxl__spawn_record_pid(libxl__gc*, libxl__spawn_state*,
* This is NOT a function for waiting for ordinary child processes.
* If you want to run (fork/exec/wait) subprocesses from libxl:
* - Make your libxl entrypoint use the ao machinery
- * - Use libxl__ev_fork, and use the callback programming style
+ * - Use libxl__ev_child_fork, and use the callback programming style
*
* This function is intended for interprocess communication with a
* service process. If the service process does not respond quickly,
@@ -1462,8 +1580,9 @@ _hidden void libxl__exec(libxl__gc *gc, int stdinfd, int stdoutfd,
/* on entry, libxl_domid_valid_guest(domid) must be false;
* on exit (even error exit), domid may be valid and refer to a domain */
_hidden int libxl__domain_make(libxl__gc *gc,
- libxl_domain_create_info *info,
- uint32_t *domid);
+ libxl_domain_config *d_config,
+ uint32_t *domid,
+ xc_domain_configuration_t *xc_config);
_hidden int libxl__domain_build(libxl__gc *gc,
libxl_domain_config *d_config,
@@ -1480,6 +1599,15 @@ _hidden int libxl__need_xenpv_qemu(libxl__gc *gc,
int nr_channels, libxl_device_channel *channels);
/*
+ * This function will fix reserved device memory conflict
+ * according to user's configuration.
+ */
+_hidden int libxl__domain_device_construct_rdm(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ uint64_t rdm_mem_guard,
+ struct xc_hvm_build_args *args);
+
+/*
* This function will cause the whole libxl process to hang
* if the device model does not respond. It is deprecated.
*
@@ -1563,6 +1691,10 @@ _hidden int libxl__device_from_disk(libxl__gc *gc, uint32_t domid,
libxl_device_disk *disk,
libxl__device *device);
+/* Calls poll() again - useful to check whether a signaled condition
+ * is still true. Cannot fail. Returns currently-true revents. */
+_hidden short libxl__fd_poll_recheck(libxl__egc *egc, int fd, short events);
+
_hidden char *libxl__uuid2string(libxl__gc *gc, const libxl_uuid uuid);
struct libxl__xen_console_reader {
@@ -1794,9 +1926,9 @@ _hidden libxl__json_object *libxl__json_parse(libxl__gc *gc_opt, const char *s);
_hidden int libxl__device_model_version_running(libxl__gc *gc, uint32_t domid);
/* Return the system-wide default device model */
_hidden libxl_device_model_version libxl__default_device_model(libxl__gc *gc);
-
-/* Check how executes hotplug script currently */
-int libxl__hotplug_settings(libxl__gc *gc, xs_transaction_t t);
+_hidden char *libxl__device_model_xs_path(libxl__gc *gc, uint32_t dm_domid,
+ uint32_t domid,
+ const char *format, ...) PRINTF_ATTRIBUTE(4, 5);
/*
* Calling context and GC for event-generating functions:
@@ -1864,8 +1996,9 @@ _hidden void libxl__egc_cleanup(libxl__egc *egc);
* All "slow" functions (see below for the exact definition) need to
* use the asynchronous operation ("ao") machinery. The function
* should take a parameter const libxl_asyncop_how *ao_how and must
- * start with a call to AO_INITIATOR_ENTRY. These functions MAY NOT
- * be called from inside libxl, because they can cause reentrancy
+ * start with a call to AO_CREATE or equivalent. These functions MAY
+ * NOT be called from inside libxl (regardless of what is passed for
+ * ao_how), because they can cause reentrancy hazards due to
* callbacks.
*
* For the same reason functions taking an ao_how may make themselves
@@ -1909,28 +2042,41 @@ _hidden void libxl__egc_cleanup(libxl__egc *egc);
* must be copied into the per-operation structure using
* libxl__ao_progress_gethow.
*
- * - If initiation is successful, the initiating function needs
- * to run libxl__ao_inprogress right before unlocking and
- * returning, and return whatever it returns (AO_INPROGRESS macro).
- *
* - If the initiation is unsuccessful, the initiating function must
- * call libxl__ao_abort before unlocking and returning whatever
- * error code is appropriate (AO_ABORT macro).
+ * call libxl__ao_create_fail before unlocking and returning whatever
+ * error code is appropriate (AO_CREATE_FAIL macro).
+ *
+ * If initiation is successful:
+ *
+ * - The initiating function must run libxl__ao_inprogress right
+ * before unlocking and returning, and return whatever it returns.
+ * This is best achieved with the AO_INPROGRESS macro.
*
* - If the operation supports progress reports, it may generate
* suitable events with NEW_EVENT and report them with
* libxl__ao_progress_report (with the ctx locked).
*
- * - Later, some callback function, whose callback has been requested
- * directly or indirectly, should call libxl__ao_complete (with the
- * ctx locked, as it will generally already be in any event callback
- * function). This must happen exactly once for each ao (and not if
- * the ao has been destroyed, obviously).
+ * - Eventually, some callback function, whose callback has been
+ * requested directly or indirectly, should call libxl__ao_complete
+ * (with the ctx locked, as it will generally already be in any
+ * event callback function). This must happen exactly once for each
+ * ao, as the last thing that happens with that ao.
+ *
+ * - However, it is permissible for the initiating function to call
+ * libxl__ao_inprogress and/or libxl__ao_complete (directly or
+ * indirectly), before it uses AO_INPROGRESS to return. (The ao
+ * infrastructure will arrange to defer destruction of the ao, etc.,
+ * until the proper time.) An initiating function should do this
+ * if it takes a codepath which completes synchronously.
+ *
+ * - Conversely it is forbidden to call libxl__ao_complete in the
+ * initiating function _after_ AO_INPROGRESS, because
+ * libxl__ao_complete requires the ctx to be locked.
*
* - Note that during callback functions, two gcs are available:
* - The one in egc, whose lifetime is only this callback
* - The one in ao, whose lifetime is the asynchronous operation
- * Usually callback function should use CONTAINER_OF to obtain its
+ * Usually a callback function should use CONTAINER_OF to obtain its
* own state structure, containing a pointer to the ao. It should
* then obtain the ao and use the ao's gc; this is most easily done
* using the convenience macro STATE_AO_GC.
@@ -1955,18 +2101,31 @@ _hidden void libxl__egc_cleanup(libxl__egc *egc);
(ao__rc); \
})
-#define AO_ABORT(rc) ({ \
+#define AO_CREATE_FAIL(rc) ({ \
libxl_ctx *ao__ctx = libxl__gc_owner(&ao->gc); \
assert(rc); \
- libxl__ao_abort(ao); \
+ libxl__ao_create_fail(ao); \
libxl__ctx_unlock(ao__ctx); /* gc is now invalid */ \
EGC_FREE; \
(rc); \
})
+
+/*
+ * Given, in scope,
+ * libxl__ao *ao;
+ * produces, in scope,
+ * libxl__gc *gc;
+ */
#define AO_GC \
libxl__gc *const gc __attribute__((unused)) = &ao->gc
+/*
+ * void STATE_AO_GC(libxl__ao *ao_spec);
+ * // Produces, in scope:
+ * libxl__ao *ao; // set from ao_spec
+ * libxl__gc *gc;
+ */
#define STATE_AO_GC(op_ao) \
libxl__ao *const ao = (op_ao); \
libxl__gc *const gc __attribute__((unused)) = libxl__ao_inprogress_gc(ao)
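
Putting these rules together, an initiating function typically reads like the following skeleton (sketch only; libxl_frob and frob_start are hypothetical, AO_CREATE is the usual entry macro referred to above):

    int libxl_frob(libxl_ctx *ctx, uint32_t domid,
                   const libxl_asyncop_how *ao_how)
    {
        AO_CREATE(ctx, domid, ao_how);      /* locks ctx; provides ao, gc */
        int rc = frob_start(ao);            /* arranges eventual callbacks */
        if (rc) return AO_CREATE_FAIL(rc);  /* initiation failed */
        return AO_INPROGRESS;               /* unlocks; some later callback
                                             * must call libxl__ao_complete
                                             * exactly once */
    }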
@@ -1979,7 +2138,7 @@ _hidden libxl__ao *libxl__ao_create(libxl_ctx*, uint32_t domid,
const char *file, int line, const char *func);
_hidden int libxl__ao_inprogress(libxl__ao *ao,
const char *file, int line, const char *func); /* temporarily unlocks */
-_hidden void libxl__ao_abort(libxl__ao *ao);
+_hidden void libxl__ao_create_fail(libxl__ao *ao);
_hidden void libxl__ao_complete(libxl__egc *egc, libxl__ao *ao, int rc);
_hidden libxl__gc *libxl__ao_inprogress_gc(libxl__ao *ao);
@@ -2008,7 +2167,7 @@ _hidden void libxl__ao_complete_check_progress_reports(libxl__egc*, libxl__ao*);
* The returned sub-ao is suitable for passing to gc-related functions
* and macros such as libxl__ao_inprogress_gc, AO_GC, and STATE_AO_GC.
*
- * It MUST NOT be used with AO_INPROGRESS, AO_ABORT,
+ * It MUST NOT be used with AO_INPROGRESS, AO_CREATE_FAIL,
* libxl__ao_complete, libxl__ao_progress_report, and so on.
*
* The caller must ensure that all of the sub-ao's are freed before
@@ -2068,7 +2227,7 @@ _hidden void libxl__carefd_unlock(void);
_hidden libxl__carefd *libxl__carefd_record(libxl_ctx *ctx, int fd);
/* Combines _record and _unlock in a single call. If fd==-1,
- * still does the unlock, but returns 0. Cannot fail. */
+ * still does the unlock, but returns 0. */
_hidden libxl__carefd *libxl__carefd_opened(libxl_ctx *ctx, int fd);
/* Works just like close(2). You may pass NULL, in which case it's
@@ -2091,7 +2250,16 @@ _hidden const char *libxl__run_dir_path(void);
typedef struct libxl__async_exec_state libxl__async_exec_state;
typedef void libxl__async_exec_callback(libxl__egc *egc,
- libxl__async_exec_state *aes, int status);
+ libxl__async_exec_state *aes, int rc, int status);
+/*
+ * Meaning of status and rc:
+ * rc==0, status==0 all went well
+ * rc==0, status!=0 everything OK except child exited nonzero (logged)
+ * rc!=0 something else went wrong (status is real
+ * exit status; maybe reflecting SIGKILL, and
+ * therefore not very interesting, if aes code
+ * killed the child). Logged unless ABORTED.
+ */
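
A callback honouring this contract commonly folds a nonzero exit status into rc, exactly as the netbuffer callbacks later in this patch do; schematically (my_exec_cb is a hypothetical example):

    static void my_exec_cb(libxl__egc *egc, libxl__async_exec_state *aes,
                           int rc, int status)
    {
        if (status && !rc) rc = ERROR_FAIL;  /* child exited nonzero */
        if (rc) { /* ... fail the operation ... */ }
        /* ... success path ... */
    }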
struct libxl__async_exec_state {
/* caller must fill these in */
@@ -2107,12 +2275,15 @@ struct libxl__async_exec_state {
/* private */
libxl__ev_time time;
libxl__ev_child child;
+ int rc;
};
void libxl__async_exec_init(libxl__async_exec_state *aes);
-int libxl__async_exec_start(libxl__gc *gc, libxl__async_exec_state *aes);
+int libxl__async_exec_start(libxl__async_exec_state *aes);
bool libxl__async_exec_inuse(const libxl__async_exec_state *aes);
+_hidden void libxl__kill(libxl__gc *gc, pid_t pid, int sig, const char *what);
+
/*----- device addition/removal -----*/
typedef struct libxl__ao_device libxl__ao_device;
@@ -2152,7 +2323,7 @@ struct libxl__ao_device {
/* Bodge for Qemu devices */
libxl__ev_time timeout;
/* xenstore watch for backend path of driver domains */
- libxl__ev_xswatch xs_watch;
+ libxl__xswait_state xswait;
int num_exec;
/* for calling hotplug scripts */
libxl__async_exec_state aes;
@@ -2324,7 +2495,7 @@ struct libxl__multidev {
* any stale entry
* for loop -- xs transaction
* open xs transaction
- * check device existence, abort if it exists
+ * check device existence, bail if it exists
* write in-memory json config to disk
* commit xs transaction
* end for loop
@@ -2506,16 +2677,21 @@ _hidden void libxl__device_disk_local_initiate_detach(libxl__egc *egc,
typedef struct libxl__datacopier_state libxl__datacopier_state;
typedef struct libxl__datacopier_buf libxl__datacopier_buf;
-/* onwrite==1 means failure happened when writing, logged, errnoval is valid
- * onwrite==0 means failure happened when reading
- * errnoval==0 means we got eof and all data was written
- * errnoval!=0 means we had a read error, logged
- * onwrite==-1 means some other internal failure, errnoval not valid, logged
- * If we get POLLHUP, we call callback_pollhup(..., onwrite, -1);
- * or if callback_pollhup==0 this is an internal failure, as above.
+/* onwrite==1 means problem happened when writing
+ * rc==FAIL errnoval >0 we had a write error, logged
+ * onwrite==0 means problem happened when reading
+ * rc==0 errnoval==0 we got eof and all data was written
+ * rc==FAIL errnoval >0 we had a read error, logged
+ * onwrite==-1 means some other internal problem
+ * rc==FAIL errnoval==EIO some other internal failure, logged
+ * rc==ABORTED errnoval==0 abort requested, not logged
+ * If we get POLLHUP, we call callback_pollhup with
+ * rc==FAIL errnoval==-1 POLLHUP signalled
+ * or if callback_pollhup==0 this is treated as eof (if POLLIN|POLLHUP
+ * on the reading fd) or an internal failure (otherwise), as above.
* In all cases copier is killed before calling this callback */
typedef void libxl__datacopier_callback(libxl__egc *egc,
- libxl__datacopier_state *dc, int onwrite, int errnoval);
+ libxl__datacopier_state *dc, int rc, int onwrite, int errnoval);
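
Dispatching on the documented cases might look like this (copy_done is a hypothetical consumer):

    static void copy_done(libxl__egc *egc, libxl__datacopier_state *dc,
                          int rc, int onwrite, int errnoval)
    {
        if (!rc) {
            /* eof reached, all data written (onwrite==0, errnoval==0) */
        } else if (rc == ERROR_ABORTED) {
            /* abort was requested; nothing has been logged */
        } else {
            /* read/write/internal failure, already logged; onwrite says
             * which side failed and errnoval carries the detail */
        }
    }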
struct libxl__datacopier_buf {
/* private to datacopier */
@@ -2529,11 +2705,16 @@ struct libxl__datacopier_state {
libxl__ao *ao;
int readfd, writefd;
ssize_t maxsz;
+ ssize_t bytes_to_read; /* set to -1 to read until EOF */
const char *copywhat, *readwhat, *writewhat; /* for error msgs */
FILE *log; /* gets a copy of everything */
libxl__datacopier_callback *callback;
libxl__datacopier_callback *callback_pollhup;
+ void *readbuf; /* Set this to read data into it without writing to an
+ fd. The buffer should be at least as large as the
+ bytes_to_read parameter, which should not be -1. */
/* remaining fields are private to datacopier */
+ libxl__ao_abortable abrt;
libxl__ev_fd toread, towrite;
ssize_t used;
LIBXL_TAILQ_HEAD(libxl__datacopier_bufs, libxl__datacopier_buf) bufs;
@@ -2554,8 +2735,6 @@ _hidden void libxl__datacopier_prefixdata(libxl__egc*, libxl__datacopier_state*,
typedef struct libxl__srm_save_callbacks {
libxl__srm_save_autogen_callbacks a;
- int (*toolstack_save)(uint32_t domid, uint8_t **buf,
- uint32_t *len, void *data);
} libxl__srm_save_callbacks;
typedef struct libxl__srm_restore_callbacks {
@@ -2583,11 +2762,11 @@ typedef struct libxl__save_helper_state {
int rc;
int completed; /* retval/errnoval valid iff completed */
int retval, errnoval; /* from xc_domain_save / xc_domain_restore */
+ libxl__ao_abortable abrt;
libxl__carefd *pipes[2]; /* 0 = helper's stdin, 1 = helper's stdout */
libxl__ev_fd readable;
libxl__ev_child child;
const char *stdin_what, *stdout_what;
- FILE *toolstack_data_file;
libxl__egc *egc; /* valid only for duration of each event callback;
* is here in this struct for the benefit of the
@@ -2771,7 +2950,44 @@ _hidden void libxl__remus_devices_commit(libxl__egc *egc,
libxl__remus_devices_state *rds);
_hidden int libxl__netbuffer_enabled(libxl__gc *gc);
+/*----- Legacy conversion helper -----*/
+typedef struct libxl__conversion_helper_state libxl__conversion_helper_state;
+
+struct libxl__conversion_helper_state {
+ /* Public - Must be filled by caller unless noted. */
+ libxl__ao *ao;
+ int legacy_fd; /* fd to read the legacy stream from. */
+ bool hvm; /* pv or hvm domain? */
+ libxl__carefd *v2_carefd; /* Filled by successful call to
+ * libxl__convert_legacy_stream(). Caller
+ * assumes ownership of the fd. */
+ void (*completion_callback)(
+ libxl__egc *egc, libxl__conversion_helper_state *chs, int rc);
+ /* private */
+ int rc;
+ libxl__ao_abortable abrt;
+ libxl__ev_child child;
+};
+
+_hidden void libxl__conversion_helper_init
+ (libxl__conversion_helper_state *chs);
+_hidden int libxl__convert_legacy_stream(libxl__egc *egc,
+ libxl__conversion_helper_state *chs);
+_hidden void libxl__conversion_helper_abort(libxl__egc *egc,
+ libxl__conversion_helper_state *chs, int rc);
+static inline bool libxl__conversion_helper_inuse
+ (const libxl__conversion_helper_state *chs)
+{ return libxl__ev_child_inuse(&chs->child); }
+
+
/*----- Domain suspend (save) state structure -----*/
+/*
+ * "suspend" refers to quiescing the VM, so pausing qemu, making a
+ * remote_shutdown(SHUTDOWN_suspend) hypercall etc.
+ *
+ * "save" refers to the actions involved in actually shuffling the
+ * state of the VM, so xc_domain_save() etc.
+ */
typedef struct libxl__domain_suspend_state libxl__domain_suspend_state;
@@ -2780,6 +2996,55 @@ typedef void libxl__domain_suspend_cb(libxl__egc*,
typedef void libxl__save_device_model_cb(libxl__egc*,
libxl__domain_suspend_state*, int rc);
+/* State for writing a libxl migration v2 stream */
+typedef struct libxl__stream_write_state libxl__stream_write_state;
+typedef void (*sws_record_done_cb)(libxl__egc *egc,
+ libxl__stream_write_state *sws);
+struct libxl__stream_write_state {
+ /* filled by the user */
+ libxl__ao *ao;
+ libxl__domain_suspend_state *dss;
+ int fd;
+ void (*completion_callback)(libxl__egc *egc,
+ libxl__stream_write_state *sws,
+ int rc);
+ void (*checkpoint_callback)(libxl__egc *egc,
+ libxl__stream_write_state *sws,
+ int rc);
+ /* Private */
+ int rc;
+ bool running;
+ bool in_checkpoint;
+ bool sync_teardown; /* Only used to coordinate shutdown on error path. */
+ libxl__save_helper_state shs;
+
+ /* Main stream-writing data. */
+ libxl__datacopier_state dc;
+ sws_record_done_cb record_done_callback;
+
+ /* Only used when constructing EMULATOR records. */
+ libxl__datacopier_state emu_dc;
+ libxl__carefd *emu_carefd;
+ libxl__sr_rec_hdr emu_rec_hdr;
+ libxl__sr_emulator_hdr emu_sub_hdr;
+ void *emu_body;
+};
+
+_hidden void libxl__stream_write_init(libxl__stream_write_state *stream);
+_hidden void libxl__stream_write_start(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+_hidden void
+libxl__stream_write_start_checkpoint(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+_hidden void libxl__stream_write_abort(libxl__egc *egc,
+ libxl__stream_write_state *stream,
+ int rc);
+static inline bool
+libxl__stream_write_inuse(const libxl__stream_write_state *stream)
+{
+ return stream->running;
+}
+
typedef struct libxl__logdirty_switch {
const char *cmd;
const char *cmd_path;
@@ -2789,17 +3054,19 @@ typedef struct libxl__logdirty_switch {
} libxl__logdirty_switch;
struct libxl__domain_suspend_state {
- /* set by caller of libxl__domain_suspend */
+ /* set by caller of libxl__domain_save */
libxl__ao *ao;
libxl__domain_suspend_cb *callback;
uint32_t domid;
int fd;
+ int fdfl; /* original flags on fd */
libxl_domain_type type;
int live;
int debug;
const libxl_domain_remus_info *remus;
/* private */
+ int rc;
libxl__ev_evtchn guest_evtchn;
int guest_evtchn_lockfd;
int hvm;
@@ -2812,7 +3079,7 @@ struct libxl__domain_suspend_state {
libxl__remus_devices_state rds;
libxl__ev_time checkpoint_timeout; /* used for Remus checkpoint */
int interval; /* checkpoint interval (for Remus) */
- libxl__save_helper_state shs;
+ libxl__stream_write_state sws;
libxl__logdirty_switch logdirty;
void (*callback_common_done)(libxl__egc*,
struct libxl__domain_suspend_state*, int ok);
@@ -2957,6 +3224,7 @@ struct libxl__destroy_domid_state {
libxl__domid_destroy_cb *callback;
/* private to implementation */
libxl__devices_remove_state drs;
+ libxl__ev_child destroyer;
};
struct libxl__domain_destroy_state {
@@ -3044,6 +3312,7 @@ typedef struct {
libxl__dm_spawn_state pvqemu;
libxl__destroy_domid_state dis;
libxl__multidev multidev;
+ libxl__xswait_state xswait;
} libxl__stub_dm_spawn_state;
_hidden void libxl__spawn_stub_dm(libxl__egc *egc, libxl__stub_dm_spawn_state*);
@@ -3064,23 +3333,87 @@ typedef void libxl__domain_create_cb(libxl__egc *egc,
libxl__domain_create_state*,
int rc, uint32_t domid);
+/* State for manipulating a libxl migration v2 stream */
+typedef struct libxl__stream_read_state libxl__stream_read_state;
+
+typedef struct libxl__sr_record_buf {
+ /* private to stream read helper */
+ LIBXL_STAILQ_ENTRY(struct libxl__sr_record_buf) entry;
+ libxl__sr_rec_hdr hdr;
+ void *body; /* iff hdr.length != 0 */
+} libxl__sr_record_buf;
+
+struct libxl__stream_read_state {
+ /* filled by the user */
+ libxl__ao *ao;
+ libxl__domain_create_state *dcs;
+ int fd;
+ bool legacy;
+ void (*completion_callback)(libxl__egc *egc,
+ libxl__stream_read_state *srs,
+ int rc);
+ void (*checkpoint_callback)(libxl__egc *egc,
+ libxl__stream_read_state *srs,
+ int rc);
+ /* Private */
+ int rc;
+ bool running;
+ bool in_checkpoint;
+ bool sync_teardown; /* Only used to coordinate shutdown on error path. */
+ libxl__save_helper_state shs;
+ libxl__conversion_helper_state chs;
+
+ /* Main stream-reading data. */
+ libxl__datacopier_state dc; /* Only used when reading a record */
+ libxl__sr_hdr hdr;
+ LIBXL_STAILQ_HEAD(, libxl__sr_record_buf) record_queue; /* NOGC */
+ enum {
+ SRS_PHASE_NORMAL,
+ SRS_PHASE_BUFFERING,
+ SRS_PHASE_UNBUFFERING,
+ } phase;
+ bool recursion_guard;
+
+ /* Only used while actively reading a record from the stream. */
+ libxl__sr_record_buf *incoming_record; /* NOGC */
+
+ /* Both only used when processing an EMULATOR record. */
+ libxl__datacopier_state emu_dc;
+ libxl__carefd *emu_carefd;
+};
+
+_hidden void libxl__stream_read_init(libxl__stream_read_state *stream);
+_hidden void libxl__stream_read_start(libxl__egc *egc,
+ libxl__stream_read_state *stream);
+_hidden void libxl__stream_read_start_checkpoint(libxl__egc *egc,
+ libxl__stream_read_state *stream);
+_hidden void libxl__stream_read_abort(libxl__egc *egc,
+ libxl__stream_read_state *stream, int rc);
+static inline bool
+libxl__stream_read_inuse(const libxl__stream_read_state *stream)
+{
+ return stream->running;
+}
+
+
struct libxl__domain_create_state {
/* filled in by user */
libxl__ao *ao;
libxl_domain_config *guest_config;
libxl_domain_config guest_config_saved; /* vanilla config */
- int restore_fd;
+ int restore_fd, libxc_fd;
+ int restore_fdfl; /* original flags of restore_fd */
+ libxl_domain_restore_params restore_params;
libxl__domain_create_cb *callback;
libxl_asyncprogress_how aop_console_how;
/* private to domain_create */
int guest_domid;
- int checkpointed_stream;
libxl__domain_build_state build_state;
libxl__bootloader_state bl;
libxl__stub_dm_spawn_state dmss;
/* If we're not doing stubdom, we use only dmss.dm,
* for the non-stubdom device model. */
- libxl__save_helper_state shs;
+ libxl__stream_read_state srs;
/* necessary if the domain creation failed and we have to destroy it */
libxl__domain_destroy_state dds;
libxl__multidev multidev;
@@ -3089,12 +3422,14 @@ struct libxl__domain_create_state {
/*----- Domain suspend (save) functions -----*/
/* calls dss->callback when done */
-_hidden void libxl__domain_suspend(libxl__egc *egc,
- libxl__domain_suspend_state *dss);
+_hidden void libxl__domain_save(libxl__egc *egc,
+ libxl__domain_suspend_state *dss);
/* calls libxl__xc_domain_suspend_done when done */
-_hidden void libxl__xc_domain_save(libxl__egc*, libxl__domain_suspend_state*);
+_hidden void libxl__xc_domain_save(libxl__egc *egc,
+ libxl__domain_suspend_state *dss,
+ libxl__save_helper_state *shs);
/* If rc==0 then retval is the return value from xc_domain_save
* and errnoval is the errno value it provided.
* If rc!=0, retval and errnoval are undefined. */
@@ -3111,13 +3446,16 @@ void libxl__xc_domain_saverestore_async_callback_done(libxl__egc *egc,
_hidden void libxl__domain_suspend_common_switch_qemu_logdirty
(int domid, unsigned int enable, void *data);
-_hidden int libxl__toolstack_save(uint32_t domid, uint8_t **buf,
- uint32_t *len, void *data);
+_hidden int libxl__save_emulator_xenstore_data(libxl__domain_suspend_state *dss,
+ char **buf, uint32_t *len);
+_hidden int libxl__restore_emulator_xenstore_data
+ (libxl__domain_create_state *dcs, const char *ptr, uint32_t size);
/* calls libxl__xc_domain_restore_done when done */
_hidden void libxl__xc_domain_restore(libxl__egc *egc,
libxl__domain_create_state *dcs,
+ libxl__save_helper_state *shs,
int hvm, int pae, int superpages);
/* If rc==0 then retval is the return value from xc_domain_save
* and errnoval is the errno value it provided.
@@ -3125,6 +3463,15 @@ _hidden void libxl__xc_domain_restore(libxl__egc *egc,
_hidden void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
int rc, int retval, int errnoval);
+_hidden void libxl__save_helper_init(libxl__save_helper_state *shs);
+_hidden void libxl__save_helper_abort(libxl__egc *egc,
+ libxl__save_helper_state *shs);
+
+static inline bool libxl__save_helper_inuse(const libxl__save_helper_state *shs)
+{
+ return libxl__ev_child_inuse(&shs->child);
+}
+
/* Each time the dm needs to be saved, we must call suspend and then save */
_hidden int libxl__domain_suspend_device_model(libxl__gc *gc,
libxl__domain_suspend_state *dss);
@@ -3134,6 +3481,12 @@ _hidden void libxl__domain_save_device_model(libxl__egc *egc,
_hidden const char *libxl__device_model_savefile(libxl__gc *gc, uint32_t domid);
+/* calls dss->callback_common_done when done */
+_hidden void libxl__domain_suspend(libxl__egc *egc,
+ libxl__domain_suspend_state *dss);
+/* used by libxc to suspend the guest during migration */
+_hidden void libxl__domain_suspend_callback(void *data);
+
/*
* Convenience macros.
@@ -3169,6 +3522,9 @@ _hidden const char *libxl__device_model_savefile(libxl__gc *gc, uint32_t domid);
})
+#define FILLZERO LIBXL_FILLZERO
+
+
/*
* All of these assume (or define)
* libxl__gc *gc;
@@ -3391,6 +3747,27 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
}
+/* Check if vNUMA config is valid. Returns 0 if valid,
+ * ERROR_VNUMA_CONFIG_INVALID otherwise.
+ */
+int libxl__vnuma_config_check(libxl__gc *gc,
+ const libxl_domain_build_info *b_info,
+ const libxl__domain_build_state *state);
+int libxl__vnuma_build_vmemrange_pv_generic(libxl__gc *gc,
+ uint32_t domid,
+ libxl_domain_build_info *b_info,
+ libxl__domain_build_state *state);
+int libxl__vnuma_build_vmemrange_pv(libxl__gc *gc,
+ uint32_t domid,
+ libxl_domain_build_info *b_info,
+ libxl__domain_build_state *state);
+int libxl__vnuma_build_vmemrange_hvm(libxl__gc *gc,
+ uint32_t domid,
+ libxl_domain_build_info *b_info,
+ libxl__domain_build_state *state,
+ struct xc_hvm_build_args *args);
+bool libxl__vnuma_configured(const libxl_domain_build_info *b_info);
+
_hidden int libxl__ms_vm_genid_set(libxl__gc *gc, uint32_t domid,
const libxl_ms_vm_genid *id);
@@ -3524,9 +3901,10 @@ int libxl__cpuid_policy_is_empty(libxl_cpuid_policy_list *pl);
/* Portability note: a proper flock(2) implementation is required */
typedef struct {
- libxl__carefd *lock_carefd;
+ libxl__carefd *carefd;
char *path; /* path of the lock file itself */
} libxl__domain_userdata_lock;
+/* The CTX_LOCK must be held around uses of this lock */
libxl__domain_userdata_lock *libxl__lock_domain_userdata(libxl__gc *gc,
uint32_t domid);
void libxl__unlock_domain_userdata(libxl__domain_userdata_lock *lock);
@@ -3630,6 +4008,8 @@ static inline void libxl__update_config_vtpm(libxl__gc *gc,
*/
void libxl__bitmap_copy_best_effort(libxl__gc *gc, libxl_bitmap *dptr,
const libxl_bitmap *sptr);
+
+int libxl__count_physical_sockets(libxl__gc *gc, int *sockets);
#endif
/*
diff --git a/tools/libxl/libxl_json.c b/tools/libxl/libxl_json.c
index ceb014a..3b695dd 100644
--- a/tools/libxl/libxl_json.c
+++ b/tools/libxl/libxl_json.c
@@ -59,8 +59,8 @@ struct libxl__yajl_ctx {
const unsigned char *buf = NULL; \
size_t len = 0; \
yajl_gen_get_buf((yajl_ctx)->g, &buf, &len); \
- LIBXL__LOG(libxl__gc_owner((yajl_ctx)->gc), \
- LIBXL__LOG_DEBUG, "response:\n%s", buf); \
+ LIBXL__LOG(libxl__gc_owner((yajl_ctx)->gc), LIBXL__LOG_DEBUG, \
+ "response:\n%s", buf); \
yajl_gen_free((yajl_ctx)->g); \
(yajl_ctx)->g = NULL; \
} while (0)
@@ -247,7 +247,7 @@ int libxl__key_value_list_parse_json(libxl__gc *gc, const libxl__json_object *o,
maps = libxl__json_object_get_map(o);
size = maps->count * 2;
- kvl = *p = libxl__calloc(NOGC, size, sizeof(char *));
+ kvl = *p = libxl__calloc(NOGC, size+1, sizeof(char *));
for (i = 0; i < maps->count; i++) {
int idx = i * 2;
@@ -487,7 +487,7 @@ int libxl__json_object_append_to(libxl__gc *gc, libxl__json_object *obj,
break;
default:
LIBXL__LOG(libxl__gc_owner(gc), LIBXL__LOG_ERROR,
- "Try append an object is not a map/array (%i)\n",
+ "Try append an object is not a map/array (%i)",
dst->type);
return ERROR_FAIL;
}
@@ -1013,7 +1013,7 @@ out:
yajl_gen_status libxl__uint64_gen_json(yajl_gen hand, uint64_t val)
{
char *num;
- unsigned int len;
+ int len;
yajl_gen_status s;
diff --git a/tools/libxl/libxl_libfdt_compat.c b/tools/libxl/libxl_libfdt_compat.c
new file mode 100644
index 0000000..02b8f74
--- /dev/null
+++ b/tools/libxl/libxl_libfdt_compat.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ *
+ * This file is part of libxl, and was originally taken from libfdt.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * Additionally, this particular file is dual licensed. That is,
+ * alternatively, at your option:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Note that this applies only to this file, and other files with a
+ * similar notice. Also, note that when the same code is distributed
+ * along with the rest of libxl, you must comply with the terms of the
+ * LGPLv2.1 for the whole of libxl including this file.
+ *
+ * The intent is to permit, in particular, upstream libfdt to
+ * incorporate improvements to this file within upstream libfdt. At
+ * the time of writing, upstream libfdt is dual licensed: 2-clause BSD
+ * (as above) and GPLv2-or-later. The 2-clause BSD licence is
+ * compatible with both GPLv2-or-later and LGPLv2.1-only; this permits
+ * copying in both directions, and the optional licence upgrade to a
+ * copyleft licence by libfdt upstream or the Xen Project,
+ * respectively.
+ */
+
+#include <libfdt.h>
+
+#include "libxl_libfdt_compat.h"
+
+#ifndef HAVE_FDT_FIRST_SUBNODE
+_hidden int fdt_first_subnode(const void *fdt, int offset)
+{
+ int depth = 0;
+
+ offset = fdt_next_node(fdt, offset, &depth);
+ if (offset < 0 || depth != 1)
+ return -FDT_ERR_NOTFOUND;
+
+ return offset;
+}
+#endif
+
+#ifndef HAVE_FDT_NEXT_SUBNODE
+_hidden int fdt_next_subnode(const void *fdt, int offset)
+{
+ int depth = 1;
+
+ /*
+ * With respect to the parent, the depth of the next subnode will be
+ * the same as the last.
+ */
+ do {
+ offset = fdt_next_node(fdt, offset, &depth);
+ if (offset < 0 || depth < 1)
+ return -FDT_ERR_NOTFOUND;
+ } while (depth > 1);
+
+ return offset;
+}
+#endif
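
Together the two shims allow the standard subnode iteration idiom (the same walk newer libfdt wraps in fdt_for_each_subnode); usage sketch, with fdt and parent assumed in scope:

    int node;
    for (node = fdt_first_subnode(fdt, parent);
         node >= 0;
         node = fdt_next_subnode(fdt, node)) {
        /* ... examine each direct child of parent ... */
    }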
diff --git a/tools/libxl/libxl_libfdt_compat.h b/tools/libxl/libxl_libfdt_compat.h
new file mode 100644
index 0000000..23230b5
--- /dev/null
+++ b/tools/libxl/libxl_libfdt_compat.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2006 David Gibson, IBM Corporation.
+ *
+ * This file is part of libxl, and was originally taken from libfdt.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * Additionally, this particular file is dual licensed. That is,
+ * alternatively, at your option:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Note that this applies only to this file, and other files with a
+ * similar notice. Also, note that when the same code is distributed
+ * along with the rest of libxl, you must comply with the terms of the
+ * LGPLv2.1 for the whole of libxl including this file.
+ *
+ * The intent is to permit, in particular, upstream libfdt to
+ * incorporate improvements to this file within upstream libfdt. At
+ * the time of writing, upstream libfdt is dual licensed: 2-clause BSD
+ * (as above) and GPLv2-or-later. The 2-clause BSD licence is
+ * compatible with both GPLv2-or-later and LGPLv2.1-only; this permits
+ * copying in both directions, and the optional licence upgrade to a
+ * copyleft licence by libfdt upstream or the Xen Project,
+ * respectively.
+ */
+
+#ifndef LIBXL_LIBFDT_COMPAT_H
+#define LIBXL_LIBFDT_COMPAT_H
+
+#include "libxl_internal.h"
+#include <libfdt.h>
+
+#if !HAVE_DECL_FDT_FIRST_SUBNODE
+_hidden int fdt_first_subnode(const void *fdt, int offset);
+#endif
+
+#if !HAVE_DECL_FDT_NEXT_SUBNODE
+_hidden int fdt_next_subnode(const void *fdt, int offset);
+#endif
+
+#if !HAVE_DECL_FDT_PROPERTY_U32
+static inline int fdt_property_u32(void *fdt, const char *name, uint32_t val)
+{
+ uint32_t tmp = cpu_to_fdt32(val);
+ return fdt_property(fdt, name, &tmp, sizeof(tmp));
+}
+#endif
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxl_linux.c b/tools/libxl/libxl_linux.c
index ea5d8c1..be4afc6 100644
--- a/tools/libxl/libxl_linux.c
+++ b/tools/libxl/libxl_linux.c
@@ -19,11 +19,11 @@
int libxl__try_phy_backend(mode_t st_mode)
{
- if (!S_ISBLK(st_mode)) {
- return 0;
+ if (S_ISBLK(st_mode) || S_ISREG(st_mode)) {
+ return 1;
}
- return 1;
+ return 0;
}
#define EXT_SHIFT 28
@@ -214,6 +214,7 @@ static int libxl__hotplug_disk(libxl__gc *gc, libxl__device *dev,
*env = get_hotplug_env(gc, script, dev);
if (!*env) {
+ LOG(ERROR, "Failed to get hotplug environment");
rc = ERROR_FAIL;
goto error;
}
@@ -225,6 +226,7 @@ static int libxl__hotplug_disk(libxl__gc *gc, libxl__device *dev,
(*args)[nr++] = NULL;
assert(nr == arraysize);
+ LOG(DEBUG, "Args and environment ready");
rc = 1;
error:
@@ -236,18 +238,12 @@ int libxl__get_hotplug_script_info(libxl__gc *gc, libxl__device *dev,
libxl__device_action action,
int num_exec)
{
- char *disable_udev = libxl__xs_read(gc, XBT_NULL, DISABLE_UDEV_PATH);
int rc;
- /* Check if we have to run hotplug scripts */
- if (!disable_udev) {
- rc = 0;
- goto out;
- }
-
switch (dev->backend_kind) {
case LIBXL__DEVICE_KIND_VBD:
if (num_exec != 0) {
+ LOG(DEBUG, "num_exec %d, not running hotplug scripts", num_exec);
rc = 0;
goto out;
}
@@ -260,6 +256,7 @@ int libxl__get_hotplug_script_info(libxl__gc *gc, libxl__device *dev,
*/
if ((num_exec > 1) ||
(libxl_get_stubdom_id(CTX, dev->domid) && num_exec)) {
+ LOG(DEBUG, "num_exec %d, not running hotplug scripts", num_exec);
rc = 0;
goto out;
}
@@ -267,6 +264,7 @@ int libxl__get_hotplug_script_info(libxl__gc *gc, libxl__device *dev,
break;
default:
/* No need to execute any hotplug scripts */
+ LOG(DEBUG, "backend_kind %d, no need to execute scripts", dev->backend_kind);
rc = 0;
break;
}
@@ -279,3 +277,73 @@ libxl_device_model_version libxl__default_device_model(libxl__gc *gc)
{
return LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN;
}
+
+int libxl__pci_numdevs(libxl__gc *gc)
+{
+ DIR *dir;
+ struct dirent *entry;
+ int num_devs = 0;
+
+ dir = opendir("/sys/bus/pci/devices");
+ if (!dir) {
+ LOGE(ERROR, "Cannot open /sys/bus/pci/devices");
+ return ERROR_FAIL;
+ }
+
+ while ((entry = readdir(dir))) {
+ if (entry->d_name[0] == '.')
+ continue;
+ num_devs++;
+ }
+ closedir(dir);
+
+ return num_devs;
+}
+
+int libxl__pci_topology_init(libxl__gc *gc,
+ physdev_pci_device_t *devs,
+ int num_devs)
+{
+
+ DIR *dir;
+ struct dirent *entry;
+ int i, err = 0;
+
+ dir = opendir("/sys/bus/pci/devices");
+ if (!dir) {
+ LOGE(ERROR, "Cannot open /sys/bus/pci/devices");
+ return ERROR_FAIL;
+ }
+
+ i = 0;
+ while ((entry = readdir(dir))) {
+ unsigned int dom, bus, dev, func;
+
+ if (entry->d_name[0] == '.')
+ continue;
+
+ if (i == num_devs) {
+ LOG(ERROR, "Too many devices");
+ err = ERROR_FAIL;
+ errno = ENOSPC;
+ goto out;
+ }
+
+ if (sscanf(entry->d_name, "%x:%x:%x.%d", &dom, &bus, &dev, &func) < 4) {
+ LOGE(ERROR, "Error processing /sys/bus/pci/devices");
+ err = ERROR_FAIL;
+ goto out;
+ }
+
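+ /* e.g. "0000:00:1f.3" yields seg 0, bus 0, devfn 0xfb ((0x1f << 3) | 3) */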
+ devs[i].seg = dom;
+ devs[i].bus = bus;
+ devs[i].devfn = ((dev & 0x1f) << 3) | (func & 7);
+
+ i++;
+ }
+
+ out:
+ closedir(dir);
+
+ return err;
+}
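+
+/*
+ * Usage sketch (hypothetical caller, error handling elided): the two
+ * helpers above are intended to be used as a pair, sizing the array
+ * with libxl__pci_numdevs() before filling it in:
+ *
+ *   int n = libxl__pci_numdevs(gc);
+ *   physdev_pci_device_t *devs = libxl__zalloc(gc, n * sizeof(*devs));
+ *   libxl__pci_topology_init(gc, devs, n);
+ */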
diff --git a/tools/libxl/libxl_netbsd.c b/tools/libxl/libxl_netbsd.c
index 898e160..096c057 100644
--- a/tools/libxl/libxl_netbsd.c
+++ b/tools/libxl/libxl_netbsd.c
@@ -64,15 +64,8 @@ int libxl__get_hotplug_script_info(libxl__gc *gc, libxl__device *dev,
libxl__device_action action,
int num_exec)
{
- char *disable_udev = libxl__xs_read(gc, XBT_NULL, DISABLE_UDEV_PATH);
int rc;
- /* Check if we have to run hotplug scripts */
- if (!disable_udev || num_exec > 0) {
- rc = 0;
- goto out;
- }
-
switch (dev->backend_kind) {
case LIBXL__DEVICE_KIND_VBD:
case LIBXL__DEVICE_KIND_VIF:
@@ -95,3 +88,15 @@ libxl_device_model_version libxl__default_device_model(libxl__gc *gc)
{
return LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL;
}
+
+int libxl__pci_numdevs(libxl__gc *gc)
+{
+ return ERROR_NI;
+}
+
+int libxl__pci_topology_init(libxl__gc *gc,
+ physdev_pci_device_t *devs,
+ int num_devs)
+{
+ return ERROR_NI;
+}
diff --git a/tools/libxl/libxl_netbuffer.c b/tools/libxl/libxl_netbuffer.c
index edc6843..107e867 100644
--- a/tools/libxl/libxl_netbuffer.c
+++ b/tools/libxl/libxl_netbuffer.c
@@ -219,10 +219,10 @@ out:
static void netbuf_setup_script_cb(libxl__egc *egc,
libxl__async_exec_state *aes,
- int status);
+ int rc, int status);
static void netbuf_teardown_script_cb(libxl__egc *egc,
libxl__async_exec_state *aes,
- int status);
+ int rc, int status);
/*
* the script needs the following env & args
@@ -310,7 +310,7 @@ static void nic_setup(libxl__egc *egc, libxl__remus_device *dev)
}
setup_async_exec(dev, "setup");
- rc = libxl__async_exec_start(gc, &dev->aodev.aes);
+ rc = libxl__async_exec_start(&dev->aodev.aes);
if (rc)
goto out;
@@ -327,14 +327,13 @@ out:
*/
static void netbuf_setup_script_cb(libxl__egc *egc,
libxl__async_exec_state *aes,
- int status)
+ int rc, int status)
{
libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
libxl__remus_device *dev = CONTAINER_OF(aodev, *dev, aodev);
libxl__remus_device_nic *remus_nic = dev->concrete_data;
libxl__remus_devices_state *rds = dev->rds;
const char *out_path_base, *hotplug_error = NULL;
- int rc;
STATE_AO_GC(rds->ao);
@@ -344,6 +343,11 @@ static void netbuf_setup_script_cb(libxl__egc *egc,
const char *const vif = remus_nic->vif;
const char **const ifb = &remus_nic->ifb;
+ if (status && !rc)
+ rc = ERROR_FAIL;
+ if (rc)
+ goto out;
+
/*
* we need to get ifb first because it's needed for teardown
*/
@@ -398,7 +402,7 @@ static void nic_teardown(libxl__egc *egc, libxl__remus_device *dev)
setup_async_exec(dev, "teardown");
- rc = libxl__async_exec_start(gc, &dev->aodev.aes);
+ rc = libxl__async_exec_start(&dev->aodev.aes);
if (rc)
goto out;
@@ -411,17 +415,14 @@ out:
static void netbuf_teardown_script_cb(libxl__egc *egc,
libxl__async_exec_state *aes,
- int status)
+ int rc, int status)
{
- int rc;
libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
libxl__remus_device *dev = CONTAINER_OF(aodev, *dev, aodev);
libxl__remus_device_nic *remus_nic = dev->concrete_data;
- if (status)
+ if (status && !rc)
rc = ERROR_FAIL;
- else
- rc = 0;
free_qdisc(remus_nic);
diff --git a/tools/libxl/libxl_no_convert_callout.c b/tools/libxl/libxl_no_convert_callout.c
new file mode 100644
index 0000000..6ba4d92
--- /dev/null
+++ b/tools/libxl/libxl_no_convert_callout.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2015 Citrix Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h"
+
+#include "libxl_internal.h"
+
+void libxl__conversion_helper_init(libxl__conversion_helper_state *chs)
+{
+ libxl__ev_child_init(&chs->child);
+}
+
+int libxl__convert_legacy_stream(libxl__egc *egc,
+ libxl__conversion_helper_state *chs)
+{
+ return ERROR_FAIL;
+}
+
+void libxl__conversion_helper_abort(libxl__egc *egc,
+ libxl__conversion_helper_state *chs,
+ int rc)
+{
+ /* no op */
+}
diff --git a/tools/libxl/libxl_osdeps.h b/tools/libxl/libxl_osdeps.h
index 08eaf0c..d9661c9 100644
--- a/tools/libxl/libxl_osdeps.h
+++ b/tools/libxl/libxl_osdeps.h
@@ -42,6 +42,7 @@
#define SYSFS_PCIBACK_DRIVER "/dev/null"
#define NETBACK_NIC_NAME "xnb%u.%d"
#include <libutil.h>
+#include <sys/endian.h>
#endif
#ifndef SYSFS_PCIBACK_DRIVER
@@ -58,6 +59,42 @@ int asprintf(char **buffer, char *fmt, ...);
int vasprintf(char **buffer, const char *fmt, va_list ap);
#endif /*NEED_OWN_ASPRINTF*/
+#ifndef htobe32 /* glibc < 2.9 */
+# include <byteswap.h>
+
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+# define htobe16(x) __bswap_16(x)
+# define htole16(x) (x)
+# define be16toh(x) __bswap_16(x)
+# define le16toh(x) (x)
+
+# define htobe32(x) __bswap_32(x)
+# define htole32(x) (x)
+# define be32toh(x) __bswap_32(x)
+# define le32toh(x) (x)
+
+# define htobe64(x) __bswap_64(x)
+# define htole64(x) (x)
+# define be64toh(x) __bswap_64(x)
+# define le64toh(x) (x)
+# else
+# define htobe16(x) (x)
+# define htole16(x) __bswap_16(x)
+# define be16toh(x) (x)
+# define le16toh(x) __bswap_16(x)
+
+# define htobe32(x) (x)
+# define htole32(x) __bswap_32(x)
+# define be32toh(x) (x)
+# define le32toh(x) __bswap_32(x)
+
+# define htobe64(x) (x)
+# define htole64(x) __bswap_64(x)
+# define be64toh(x) (x)
+# define le64toh(x) __bswap_64(x)
+# endif
+#endif
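+
+/*
+ * Sanity example for the fallbacks above: on a little-endian host,
+ * htobe32(0x11223344) == 0x44332211 while htole32(x) == x; on a
+ * big-endian host the roles are reversed.
+ */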
+
#endif
/*
diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c
index f3ae132..19c597e 100644
--- a/tools/libxl/libxl_pci.c
+++ b/tools/libxl/libxl_pci.c
@@ -61,7 +61,7 @@ static void libxl_create_pci_backend_device(libxl__gc *gc, flexarray_t *back, in
libxl__sprintf(gc, "msitranslate=%d,power_mgmt=%d,permissive=%d",
pcidev->msitranslate, pcidev->power_mgmt,
pcidev->permissive));
- flexarray_append_pair(back, libxl__sprintf(gc, "state-%d", num), libxl__sprintf(gc, "%d", 1));
+ flexarray_append_pair(back, libxl__sprintf(gc, "state-%d", num), GCSPRINTF("%d", XenbusStateInitialising));
}
static int libxl__device_from_pcidev(libxl__gc *gc, uint32_t domid,
@@ -99,7 +99,7 @@ int libxl__create_pci_backend(libxl__gc *gc, uint32_t domid,
flexarray_append_pair(back, "frontend-id", libxl__sprintf(gc, "%d", domid));
flexarray_append_pair(back, "online", "1");
- flexarray_append_pair(back, "state", libxl__sprintf(gc, "%d", 1));
+ flexarray_append_pair(back, "state", GCSPRINTF("%d", XenbusStateInitialising));
flexarray_append_pair(back, "domain", libxl__domid_to_name(gc, domid));
for (i = 0; i < num; i++, pcidev++)
@@ -107,7 +107,7 @@ int libxl__create_pci_backend(libxl__gc *gc, uint32_t domid,
flexarray_append_pair(back, "num_devs", libxl__sprintf(gc, "%d", num));
flexarray_append_pair(front, "backend-id", libxl__sprintf(gc, "%d", 0));
- flexarray_append_pair(front, "state", libxl__sprintf(gc, "%d", 1));
+ flexarray_append_pair(front, "state", GCSPRINTF("%d", XenbusStateInitialising));
libxl__device_generic_add(gc, XBT_NULL, &device,
libxl__xs_kvs_of_flexarray(gc, back, back->count),
@@ -123,7 +123,7 @@ static int libxl__device_pci_add_xenstore(libxl__gc *gc, uint32_t domid, libxl_d
flexarray_t *back;
char *num_devs, *be_path;
int num = 0;
- xs_transaction_t t;
+ xs_transaction_t t = XBT_NULL;
libxl__device *device;
int rc;
libxl_domain_config d_config;
@@ -144,7 +144,7 @@ static int libxl__device_pci_add_xenstore(libxl__gc *gc, uint32_t domid, libxl_d
return ERROR_FAIL;
if (!starting && domtype == LIBXL_DOMAIN_TYPE_PV) {
- if (libxl__wait_for_backend(gc, be_path, "4") < 0)
+ if (libxl__wait_for_backend(gc, be_path, GCSPRINTF("%d", XenbusStateConnected)) < 0)
return ERROR_FAIL;
}
@@ -155,7 +155,7 @@ static int libxl__device_pci_add_xenstore(libxl__gc *gc, uint32_t domid, libxl_d
libxl_create_pci_backend_device(gc, back, num, pcidev);
flexarray_append_pair(back, "num_devs", libxl__sprintf(gc, "%d", num + 1));
if (!starting)
- flexarray_append_pair(back, "state", libxl__sprintf(gc, "%d", 7));
+ flexarray_append_pair(back, "state", GCSPRINTF("%d", XenbusStateReconfiguring));
GCNEW(device);
libxl__device_from_pcidev(gc, domid, pcidev, device);
@@ -213,7 +213,7 @@ static int libxl__device_pci_remove_xenstore(libxl__gc *gc, uint32_t domid, libx
return ERROR_FAIL;
if (domtype == LIBXL_DOMAIN_TYPE_PV) {
- if (libxl__wait_for_backend(gc, be_path, "4") < 0) {
+ if (libxl__wait_for_backend(gc, be_path, GCSPRINTF("%d", XenbusStateConnected)) < 0) {
LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "pci backend at %s is not ready", be_path);
return ERROR_FAIL;
}
@@ -235,14 +235,14 @@ static int libxl__device_pci_remove_xenstore(libxl__gc *gc, uint32_t domid, libx
retry_transaction:
t = xs_transaction_start(ctx->xsh);
- xs_write(ctx->xsh, t, libxl__sprintf(gc, "%s/state-%d", be_path, i), "5", strlen("5"));
- xs_write(ctx->xsh, t, libxl__sprintf(gc, "%s/state", be_path), "7", strlen("7"));
+ xs_write(ctx->xsh, t, libxl__sprintf(gc, "%s/state-%d", be_path, i), GCSPRINTF("%d", XenbusStateClosing), 1);
+ xs_write(ctx->xsh, t, libxl__sprintf(gc, "%s/state", be_path), GCSPRINTF("%d", XenbusStateReconfiguring), 1);
if (!xs_transaction_end(ctx->xsh, t, 0))
if (errno == EAGAIN)
goto retry_transaction;
if (domtype == LIBXL_DOMAIN_TYPE_PV) {
- if (libxl__wait_for_backend(gc, be_path, "4") < 0) {
+ if (libxl__wait_for_backend(gc, be_path, GCSPRINTF("%d", XenbusStateConnected)) < 0) {
LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "pci backend at %s is not ready", be_path);
return ERROR_FAIL;
}
@@ -850,11 +850,12 @@ static int qemu_pci_add_xenstore(libxl__gc *gc, uint32_t domid,
int rc = 0;
char *path;
char *state, *vdevfn;
+ uint32_t dm_domid;
- path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/state", domid);
+ dm_domid = libxl_get_stubdom_id(CTX, domid);
+ path = libxl__device_model_xs_path(gc, dm_domid, domid, "/state");
state = libxl__xs_read(gc, XBT_NULL, path);
- path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/parameter",
- domid);
+ path = libxl__device_model_xs_path(gc, dm_domid, domid, "/parameter");
if (pcidev->vdevfn) {
libxl__xs_write(gc, XBT_NULL, path, PCI_BDF_VDEVFN","PCI_OPTIONS,
pcidev->domain, pcidev->bus, pcidev->dev,
@@ -869,11 +870,9 @@ static int qemu_pci_add_xenstore(libxl__gc *gc, uint32_t domid,
libxl__qemu_traditional_cmd(gc, domid, "pci-ins");
rc = libxl__wait_for_device_model_deprecated(gc, domid, NULL, NULL,
pci_ins_check, state);
- path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/parameter",
- domid);
+ path = libxl__device_model_xs_path(gc, dm_domid, domid, "/parameter");
vdevfn = libxl__xs_read(gc, XBT_NULL, path);
- path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/state",
- domid);
+ path = libxl__device_model_xs_path(gc, dm_domid, domid, "/state");
if ( rc < 0 )
LIBXL__LOG(ctx, LIBXL__LOG_ERROR,
"qemu refused to add device: %s", vdevfn);
@@ -895,6 +894,7 @@ static int do_pci_add(libxl__gc *gc, uint32_t domid, libxl_device_pci *pcidev, i
FILE *f;
unsigned long long start, end, flags, size;
int irq, i, rc, hvm = 0;
+ uint32_t flag = XEN_DOMCTL_DEV_RDM_RELAXED;
if (type == LIBXL_DOMAIN_TYPE_INVALID)
return ERROR_FAIL;
@@ -988,7 +988,13 @@ static int do_pci_add(libxl__gc *gc, uint32_t domid, libxl_device_pci *pcidev, i
out:
if (!libxl_is_stubdom(ctx, domid, NULL)) {
- rc = xc_assign_device(ctx->xch, domid, pcidev_encode_bdf(pcidev));
+ if (pcidev->rdm_policy == LIBXL_RDM_RESERVE_POLICY_STRICT) {
+ flag &= ~XEN_DOMCTL_DEV_RDM_RELAXED;
+ } else if (pcidev->rdm_policy != LIBXL_RDM_RESERVE_POLICY_RELAXED) {
+ LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "unknown rdm check flag.");
+ return ERROR_FAIL;
+ }
+ rc = xc_assign_device(ctx->xch, domid, pcidev_encode_bdf(pcidev), flag);
if (rc < 0 && (hvm || errno != ENOSYS)) {
LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "xc_assign_device failed");
return ERROR_FAIL;
@@ -1040,6 +1046,9 @@ static int libxl__device_pci_reset(libxl__gc *gc, unsigned int domain, unsigned
int libxl__device_pci_setdefault(libxl__gc *gc, libxl_device_pci *pci)
{
+ /* By default, strictly reserve RDMs that are specific to a device. */
+ if (pci->rdm_policy == LIBXL_RDM_RESERVE_POLICY_INVALID)
+ pci->rdm_policy = LIBXL_RDM_RESERVE_POLICY_STRICT;
return 0;
}
@@ -1175,10 +1184,13 @@ static int qemu_pci_remove_xenstore(libxl__gc *gc, uint32_t domid,
libxl_ctx *ctx = libxl__gc_owner(gc);
char *state;
char *path;
+ uint32_t dm_domid;
+
+ dm_domid = libxl_get_stubdom_id(CTX, domid);
- path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/state", domid);
+ path = libxl__device_model_xs_path(gc, dm_domid, domid, "/state");
state = libxl__xs_read(gc, XBT_NULL, path);
- path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/parameter", domid);
+ path = libxl__device_model_xs_path(gc, dm_domid, domid, "/parameter");
libxl__xs_write(gc, XBT_NULL, path, PCI_BDF, pcidev->domain,
pcidev->bus, pcidev->dev, pcidev->func);
@@ -1196,7 +1208,7 @@ static int qemu_pci_remove_xenstore(libxl__gc *gc, uint32_t domid,
return ERROR_FAIL;
}
}
- path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/state", domid);
+ path = libxl__device_model_xs_path(gc, dm_domid, domid, "/state");
xs_write(ctx->xsh, XBT_NULL, path, state, strlen(state));
return 0;
diff --git a/tools/libxl/libxl_psr.c b/tools/libxl/libxl_psr.c
index 0437465..000d748 100644
--- a/tools/libxl/libxl_psr.c
+++ b/tools/libxl/libxl_psr.c
@@ -19,14 +19,37 @@
#define IA32_QM_CTR_ERROR_MASK (0x3ul << 62)
-static void libxl__psr_cmt_log_err_msg(libxl__gc *gc, int err)
+static void libxl__psr_log_err_msg(libxl__gc *gc, int err)
{
char *msg;
switch (err) {
case ENOSYS:
+ case EOPNOTSUPP:
msg = "unsupported operation";
break;
+ case ESRCH:
+ msg = "invalid domain ID";
+ break;
+ case ENOTSOCK:
+ msg = "socket is not supported";
+ break;
+ case EFAULT:
+ msg = "failed to exchange data with Xen";
+ break;
+ default:
+ msg = "unknown error";
+ break;
+ }
+
+ LOGE(ERROR, "%s", msg);
+}
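+
+/*
+ * The CMT- and CAT-specific loggers below translate their
+ * feature-specific errno values and defer anything else to the
+ * generic handler above.
+ */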
+
+static void libxl__psr_cmt_log_err_msg(libxl__gc *gc, int err)
+{
+ char *msg;
+
+ switch (err) {
case ENODEV:
msg = "CMT is not supported in this system";
break;
@@ -36,18 +59,38 @@ static void libxl__psr_cmt_log_err_msg(libxl__gc *gc, int err)
case ENOENT:
msg = "CMT is not attached to this domain";
break;
- case EUSERS:
+ case EOVERFLOW:
msg = "no free RMID available";
break;
- case ESRCH:
- msg = "invalid domain ID";
+ default:
+ libxl__psr_log_err_msg(gc, err);
+ return;
+ }
+
+ LOGE(ERROR, "%s", msg);
+}
+
+static void libxl__psr_cat_log_err_msg(libxl__gc *gc, int err)
+{
+ char *msg;
+
+ switch (err) {
+ case ENODEV:
+ msg = "CAT is not supported in this system";
break;
- case EFAULT:
- msg = "failed to exchange data with Xen";
+ case ENOENT:
+ msg = "CAT is not enabled on the socket";
break;
- default:
- msg = "unknown error";
+ case EOVERFLOW:
+ msg = "no free COS available";
+ break;
+ case EEXIST:
+ msg = "The same CBM is already set to this domain";
break;
+
+ default:
+ libxl__psr_log_err_msg(gc, err);
+ return;
}
LOGE(ERROR, "%s", msg);
@@ -135,8 +178,9 @@ int libxl_psr_cmt_get_total_rmid(libxl_ctx *ctx, uint32_t *total_rmid)
return rc;
}
-int libxl_psr_cmt_get_l3_cache_size(libxl_ctx *ctx, uint32_t socketid,
- uint32_t *l3_cache_size)
+int libxl_psr_cmt_get_l3_cache_size(libxl_ctx *ctx,
+ uint32_t socketid,
+ uint32_t *l3_cache_size)
{
GC_INIT(ctx);
@@ -160,16 +204,36 @@ out:
return rc;
}
-int libxl_psr_cmt_get_cache_occupancy(libxl_ctx *ctx, uint32_t domid,
- uint32_t socketid, uint32_t *l3_cache_occupancy)
+int libxl_psr_cmt_type_supported(libxl_ctx *ctx, libxl_psr_cmt_type type)
{
GC_INIT(ctx);
+ uint32_t event_mask;
+ int rc;
+ rc = xc_psr_cmt_get_l3_event_mask(ctx->xch, &event_mask);
+ if (rc < 0) {
+ libxl__psr_cmt_log_err_msg(gc, errno);
+ rc = 0;
+ } else {
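+ /* CMT event type IDs are 1-based, so bit (type - 1) flags support. */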
+ rc = event_mask & (1 << (type - 1));
+ }
+
+ GC_FREE;
+ return rc;
+}
+
+int libxl_psr_cmt_get_sample(libxl_ctx *ctx,
+ uint32_t domid,
+ libxl_psr_cmt_type type,
+ uint64_t scope,
+ uint64_t *sample_r,
+ uint64_t *tsc_r)
+{
+ GC_INIT(ctx);
unsigned int rmid;
uint32_t upscaling_factor;
uint64_t monitor_data;
int cpu, rc;
- xc_psr_cmt_type type;
rc = xc_psr_cmt_get_domain_rmid(ctx->xch, domid, &rmid);
if (rc < 0 || rmid == 0) {
@@ -179,15 +243,15 @@ int libxl_psr_cmt_get_cache_occupancy(libxl_ctx *ctx, uint32_t domid,
goto out;
}
- cpu = libxl__pick_socket_cpu(gc, socketid);
+ cpu = libxl__pick_socket_cpu(gc, scope);
if (cpu < 0) {
LOGE(ERROR, "failed to get socket cpu");
rc = ERROR_FAIL;
goto out;
}
- type = XC_PSR_CMT_L3_OCCUPANCY;
- rc = xc_psr_cmt_get_data(ctx->xch, rmid, cpu, type, &monitor_data);
+ rc = xc_psr_cmt_get_data(ctx->xch, rmid, cpu, type - 1,
+ &monitor_data, tsc_r);
if (rc < 0) {
LOGE(ERROR, "failed to get monitoring data");
rc = ERROR_FAIL;
@@ -201,13 +265,130 @@ int libxl_psr_cmt_get_cache_occupancy(libxl_ctx *ctx, uint32_t domid,
goto out;
}
- *l3_cache_occupancy = upscaling_factor * monitor_data / 1024;
- rc = 0;
+ *sample_r = monitor_data * upscaling_factor;
out:
GC_FREE;
return rc;
}
+int libxl_psr_cmt_get_cache_occupancy(libxl_ctx *ctx,
+ uint32_t domid,
+ uint32_t socketid,
+ uint32_t *l3_cache_occupancy)
+{
+ uint64_t data;
+ int rc;
+
+ rc = libxl_psr_cmt_get_sample(ctx, domid,
+ LIBXL_PSR_CMT_TYPE_CACHE_OCCUPANCY,
+ socketid, &data, NULL);
+ if (rc < 0)
+ goto out;
+
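+ /* The sample is scaled to bytes; report L3 occupancy in KiB. */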
+ *l3_cache_occupancy = data / 1024;
+out:
+ return rc;
+}
+
+int libxl_psr_cat_set_cbm(libxl_ctx *ctx, uint32_t domid,
+ libxl_psr_cbm_type type, libxl_bitmap *target_map,
+ uint64_t cbm)
+{
+ GC_INIT(ctx);
+ int rc;
+ int socketid, nr_sockets;
+
+ rc = libxl__count_physical_sockets(gc, &nr_sockets);
+ if (rc) {
+ LOGE(ERROR, "failed to get system socket count");
+ goto out;
+ }
+
+ libxl_for_each_set_bit(socketid, *target_map) {
+ if (socketid >= nr_sockets)
+ break;
+ if (xc_psr_cat_set_domain_data(ctx->xch, domid, type, socketid, cbm)) {
+ libxl__psr_cat_log_err_msg(gc, errno);
+ rc = ERROR_FAIL;
+ }
+ }
+
+out:
+ GC_FREE;
+ return rc;
+}
+
+int libxl_psr_cat_get_cbm(libxl_ctx *ctx, uint32_t domid,
+ libxl_psr_cbm_type type, uint32_t target,
+ uint64_t *cbm_r)
+{
+ GC_INIT(ctx);
+ int rc = 0;
+
+ if (xc_psr_cat_get_domain_data(ctx->xch, domid, type, target, cbm_r)) {
+ libxl__psr_cat_log_err_msg(gc, errno);
+ rc = ERROR_FAIL;
+ }
+
+ GC_FREE;
+ return rc;
+}
+
+int libxl_psr_cat_get_l3_info(libxl_ctx *ctx, libxl_psr_cat_info **info,
+ int *nr)
+{
+ GC_INIT(ctx);
+ int rc;
+ int i = 0, socketid, nr_sockets;
+ libxl_bitmap socketmap;
+ libxl_psr_cat_info *ptr;
+
+ libxl_bitmap_init(&socketmap);
+
+ rc = libxl__count_physical_sockets(gc, &nr_sockets);
+ if (rc) {
+ LOGE(ERROR, "failed to get system socket count");
+ goto out;
+ }
+
+ libxl_socket_bitmap_alloc(ctx, &socketmap, nr_sockets);
+ rc = libxl_get_online_socketmap(ctx, &socketmap);
+ if (rc < 0) {
+ LOGE(ERROR, "failed to get available sockets");
+ goto out;
+ }
+
+ ptr = libxl__malloc(NOGC, nr_sockets * sizeof(libxl_psr_cat_info));
+
+ libxl_for_each_set_bit(socketid, socketmap) {
+ ptr[i].id = socketid;
+ if (xc_psr_cat_get_l3_info(ctx->xch, socketid, &ptr[i].cos_max,
+ &ptr[i].cbm_len)) {
+ libxl__psr_cat_log_err_msg(gc, errno);
+ rc = ERROR_FAIL;
+ free(ptr);
+ goto out;
+ }
+ i++;
+ }
+
+ *info = ptr;
+ *nr = i;
+out:
+ libxl_bitmap_dispose(&socketmap);
+ GC_FREE;
+ return rc;
+}
+
+void libxl_psr_cat_info_list_free(libxl_psr_cat_info *list, int nr)
+{
+ int i;
+
+ for (i = 0; i < nr; i++)
+ libxl_psr_cat_info_dispose(&list[i]);
+ free(list);
+}
+
/*
* Local variables:
* mode: C
diff --git a/tools/libxl/libxl_qmp.c b/tools/libxl/libxl_qmp.c
index c7324e6..f798de7 100644
--- a/tools/libxl/libxl_qmp.c
+++ b/tools/libxl/libxl_qmp.c
@@ -357,22 +357,32 @@ static libxl__qmp_handler *qmp_init_handler(libxl__gc *gc, uint32_t domid)
static int qmp_open(libxl__qmp_handler *qmp, const char *qmp_socket_path,
int timeout)
{
- int ret;
+ int ret = -1;
int i = 0;
qmp->qmp_fd = socket(AF_UNIX, SOCK_STREAM, 0);
if (qmp->qmp_fd < 0) {
- return -1;
+ goto out;
}
ret = libxl_fd_set_nonblock(qmp->ctx, qmp->qmp_fd, 1);
- if (ret) return -1;
+ if (ret) {
+ ret = -1;
+ goto out;
+ }
ret = libxl_fd_set_cloexec(qmp->ctx, qmp->qmp_fd, 1);
- if (ret) return -1;
+ if (ret) {
+ ret = -1;
+ goto out;
+ }
+ if (sizeof (qmp->addr.sun_path) <= strlen(qmp_socket_path)) {
+ ret = -1;
+ goto out;
+ }
memset(&qmp->addr, 0, sizeof (qmp->addr));
qmp->addr.sun_family = AF_UNIX;
strncpy(qmp->addr.sun_path, qmp_socket_path,
- sizeof (qmp->addr.sun_path));
+ sizeof (qmp->addr.sun_path)-1);
do {
ret = connect(qmp->qmp_fd, (struct sockaddr *) &qmp->addr,
@@ -384,9 +394,13 @@ static int qmp_open(libxl__qmp_handler *qmp, const char *qmp_socket_path,
* ECONNREFUSED : Leftover socket hasn't been removed yet */
continue;
}
- return -1;
+ ret = -1;
+ goto out;
} while ((++i / 5 <= timeout) && (usleep(200 * 1000) <= 0));
+out:
+ if (ret == -1 && qmp->qmp_fd > -1) close(qmp->qmp_fd);
+
return ret;
}
@@ -475,7 +489,7 @@ static int qmp_next(libxl__gc *gc, libxl__qmp_handler *qmp)
if (o) {
rc = qmp_handle_response(gc, qmp, o);
} else {
- LOG(ERROR, "Parse error of : %s\n", s);
+ LOG(ERROR, "Parse error of : %s", s);
return -1;
}
@@ -680,6 +694,7 @@ libxl__qmp_handler *libxl__qmp_initialize(libxl__gc *gc, uint32_t domid)
char *qmp_socket;
qmp = qmp_init_handler(gc, domid);
+ if (!qmp) return NULL;
qmp_socket = GCSPRINTF("%s/qmp-libxl-%d", libxl__run_dir_path(), domid);
if ((ret = qmp_open(qmp, qmp_socket, QMP_SOCKET_CONNECT_TIMEOUT)) < 0) {
@@ -723,6 +738,13 @@ void libxl__qmp_cleanup(libxl__gc *gc, uint32_t domid)
LOGE(ERROR, "Failed to remove QMP socket file %s", qmp_socket);
}
}
+
+ qmp_socket = GCSPRINTF("%s/qmp-libxenstat-%d", libxl__run_dir_path(), domid);
+ if (unlink(qmp_socket) == -1) {
+ if (errno != ENOENT) {
+ LOGE(ERROR, "Failed to remove QMP socket file %s", qmp_socket);
+ }
+ }
}
int libxl__qmp_query_serial(libxl__qmp_handler *qmp)
@@ -828,6 +850,18 @@ int libxl__qmp_pci_add(libxl__gc *gc, int domid, libxl_device_pci *pcidev)
QMP_PARAMETERS_SPRINTF(&args, "addr", "%x.%x",
PCI_SLOT(pcidev->vdevfn), PCI_FUNC(pcidev->vdevfn));
}
+ /*
+ * Versions of QEMU prior to the XSA-131 fix did not support this
+ * property and were effectively always in permissive mode. The
+ * fix for XSA-131 switched the default to be restricted by
+ * default and added the permissive property.
+ *
+ * Therefore in order to support both old and new QEMU we only set
+ * the permissive flag if it is true. Users of older QEMU have no
+ * reason to set the flag, so this is OK.
+ */
+ if (pcidev->permissive)
+ qmp_parameters_add_bool(gc, &args, "permissive", true);
rc = qmp_synchronous_send(qmp, "device_add", args,
NULL, NULL, qmp->timeout);
diff --git a/tools/libxl/libxl_remus_disk_drbd.c b/tools/libxl/libxl_remus_disk_drbd.c
index 3215f93..1c3a88a 100644
--- a/tools/libxl/libxl_remus_disk_drbd.c
+++ b/tools/libxl/libxl_remus_disk_drbd.c
@@ -47,7 +47,7 @@ static void drbd_async_call(libxl__egc *egc,
void func(libxl__remus_device *),
libxl__ev_child_callback callback)
{
- int pid = -1, rc;
+ int pid, rc;
libxl__ao_device *aodev = &dev->aodev;
STATE_AO_GC(dev->rds->ao);
@@ -78,7 +78,7 @@ out:
/* callbacks */
static void match_async_exec_cb(libxl__egc *egc,
libxl__async_exec_state *aes,
- int status);
+ int rc, int status);
/* implementations */
@@ -120,7 +120,7 @@ static void match_async_exec(libxl__egc *egc, libxl__remus_device *dev)
aes->stdfds[1] = -1;
aes->stdfds[2] = -1;
- rc = libxl__async_exec_start(gc, aes);
+ rc = libxl__async_exec_start(aes);
if (rc)
goto out;
@@ -133,9 +133,8 @@ out:
static void match_async_exec_cb(libxl__egc *egc,
libxl__async_exec_state *aes,
- int status)
+ int rc, int status)
{
- int rc;
libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
libxl__remus_device *dev = CONTAINER_OF(aodev, *dev, aodev);
libxl__remus_drbd_disk *drbd_disk;
@@ -143,8 +142,13 @@ static void match_async_exec_cb(libxl__egc *egc,
STATE_AO_GC(aodev->ao);
+ if (rc)
+ goto out;
+
if (status) {
rc = ERROR_REMUS_DEVOPS_DOES_NOT_MATCH;
+ /* BUG: seems to assume that any exit status means `no match' */
+ /* BUG: exit status will have been logged as an error */
goto out;
}
diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c
index 40b25e4..3af99af 100644
--- a/tools/libxl/libxl_save_callout.c
+++ b/tools/libxl/libxl_save_callout.c
@@ -32,6 +32,7 @@ static void run_helper(libxl__egc *egc, libxl__save_helper_state *shs,
const unsigned long *argnums, int num_argnums);
static void helper_failed(libxl__egc*, libxl__save_helper_state *shs, int rc);
+static void helper_stop(libxl__egc *egc, libxl__ao_abortable*, int rc);
static void helper_stdout_readable(libxl__egc *egc, libxl__ev_fd *ev,
int fd, short events, short revents);
static void helper_exited(libxl__egc *egc, libxl__ev_child *ch,
@@ -41,17 +42,18 @@ static void helper_done(libxl__egc *egc, libxl__save_helper_state *shs);
/*----- entrypoints -----*/
void libxl__xc_domain_restore(libxl__egc *egc, libxl__domain_create_state *dcs,
+ libxl__save_helper_state *shs,
int hvm, int pae, int superpages)
{
STATE_AO_GC(dcs->ao);
/* Convenience aliases */
const uint32_t domid = dcs->guest_domid;
- const int restore_fd = dcs->restore_fd;
+ const int restore_fd = dcs->libxc_fd;
libxl__domain_build_state *const state = &dcs->build_state;
- unsigned cbflags = libxl__srm_callout_enumcallbacks_restore
- (&dcs->shs.callbacks.restore.a);
+ unsigned cbflags =
+ libxl__srm_callout_enumcallbacks_restore(&shs->callbacks.restore.a);
const unsigned long argnums[] = {
domid,
@@ -59,81 +61,44 @@ void libxl__xc_domain_restore(libxl__egc *egc, libxl__domain_create_state *dcs,
state->store_domid, state->console_port,
state->console_domid,
hvm, pae, superpages,
- cbflags, dcs->checkpointed_stream,
+ cbflags, dcs->restore_params.checkpointed_stream,
};
- dcs->shs.ao = ao;
- dcs->shs.domid = domid;
- dcs->shs.recv_callback = libxl__srm_callout_received_restore;
- dcs->shs.completion_callback = libxl__xc_domain_restore_done;
- dcs->shs.caller_state = dcs;
- dcs->shs.need_results = 1;
- dcs->shs.toolstack_data_file = 0;
+ shs->ao = ao;
+ shs->domid = domid;
+ shs->recv_callback = libxl__srm_callout_received_restore;
+ shs->completion_callback = libxl__xc_domain_restore_done;
+ shs->caller_state = dcs;
+ shs->need_results = 1;
- run_helper(egc, &dcs->shs, "--restore-domain", restore_fd, 0,0,
+ run_helper(egc, shs, "--restore-domain", restore_fd, 0, 0,
argnums, ARRAY_SIZE(argnums));
}
-void libxl__xc_domain_save(libxl__egc *egc, libxl__domain_suspend_state *dss)
+void libxl__xc_domain_save(libxl__egc *egc, libxl__domain_suspend_state *dss,
+ libxl__save_helper_state *shs)
{
STATE_AO_GC(dss->ao);
- int r, rc, toolstack_data_fd = -1;
- uint32_t toolstack_data_len = 0;
- /* Resources we need to free */
- uint8_t *toolstack_data_buf = 0;
-
- unsigned cbflags = libxl__srm_callout_enumcallbacks_save
- (&dss->shs.callbacks.save.a);
-
- if (dss->shs.callbacks.save.toolstack_save) {
- r = dss->shs.callbacks.save.toolstack_save
- (dss->domid, &toolstack_data_buf, &toolstack_data_len, dss);
- if (r) { rc = ERROR_FAIL; goto out; }
-
- dss->shs.toolstack_data_file = tmpfile();
- if (!dss->shs.toolstack_data_file) {
- LOGE(ERROR, "cannot create toolstack data tmpfile");
- rc = ERROR_FAIL;
- goto out;
- }
- toolstack_data_fd = fileno(dss->shs.toolstack_data_file);
-
- r = libxl_write_exactly(CTX, toolstack_data_fd,
- toolstack_data_buf, toolstack_data_len,
- "toolstack data tmpfile", 0);
- if (r) { rc = ERROR_FAIL; goto out; }
-
- /* file position must be reset before passing to libxl-save-helper. */
- r = lseek(toolstack_data_fd, 0, SEEK_SET);
- if (r) { rc = ERROR_FAIL; goto out; }
- }
+ unsigned cbflags =
+ libxl__srm_callout_enumcallbacks_save(&shs->callbacks.save.a);
const unsigned long argnums[] = {
dss->domid, 0, 0, dss->xcflags, dss->hvm,
- toolstack_data_fd, toolstack_data_len,
cbflags,
};
- dss->shs.ao = ao;
- dss->shs.domid = dss->domid;
- dss->shs.recv_callback = libxl__srm_callout_received_save;
- dss->shs.completion_callback = libxl__xc_domain_save_done;
- dss->shs.caller_state = dss;
- dss->shs.need_results = 0;
+ shs->ao = ao;
+ shs->domid = dss->domid;
+ shs->recv_callback = libxl__srm_callout_received_save;
+ shs->completion_callback = libxl__xc_domain_save_done;
+ shs->caller_state = dss;
+ shs->need_results = 0;
- free(toolstack_data_buf);
-
- run_helper(egc, &dss->shs, "--save-domain", dss->fd,
- &toolstack_data_fd, 1,
+ run_helper(egc, shs, "--save-domain", dss->fd,
+ NULL, 0,
argnums, ARRAY_SIZE(argnums));
return;
-
- out:
- free(toolstack_data_buf);
- if (dss->shs.toolstack_data_file) fclose(dss->shs.toolstack_data_file);
-
- libxl__xc_domain_save_done(egc, dss, rc, 0, 0);
}
@@ -145,6 +110,13 @@ void libxl__xc_domain_saverestore_async_callback_done(libxl__egc *egc,
shs->egc = 0;
}
+void libxl__save_helper_init(libxl__save_helper_state *shs)
+{
+ libxl__ao_abortable_init(&shs->abrt);
+ libxl__ev_fd_init(&shs->readable);
+ libxl__ev_child_init(&shs->child);
+}
+
/*----- helper execution -----*/
static void run_helper(libxl__egc *egc, libxl__save_helper_state *shs,
@@ -166,8 +138,12 @@ static void run_helper(libxl__egc *egc, libxl__save_helper_state *shs,
shs->rc = 0;
shs->completed = 0;
shs->pipes[0] = shs->pipes[1] = 0;
- libxl__ev_fd_init(&shs->readable);
- libxl__ev_child_init(&shs->child);
+ libxl__save_helper_init(shs);
+
+ shs->abrt.ao = shs->ao;
+ shs->abrt.callback = helper_stop;
+ rc = libxl__ao_abortable_register(&shs->abrt);
+ if (rc) goto out;
shs->stdin_what = GCSPRINTF("domain %"PRIu32" save/restore helper"
" stdin pipe", domid);
@@ -248,14 +224,34 @@ static void helper_failed(libxl__egc *egc, libxl__save_helper_state *shs,
libxl__ev_fd_deregister(gc, &shs->readable);
- if (!libxl__ev_child_inuse(&shs->child)) {
+ if (!libxl__save_helper_inuse(shs)) {
helper_done(egc, shs);
return;
}
- int r = kill(shs->child.pid, SIGKILL);
- if (r) LOGE(WARN, "failed to kill save/restore helper [%lu]",
- (unsigned long)shs->child.pid);
+ libxl__kill(gc, shs->child.pid, SIGKILL, "save/restore helper");
+}
+
+static void helper_stop(libxl__egc *egc, libxl__ao_abortable *abrt, int rc)
+{
+ libxl__save_helper_state *shs = CONTAINER_OF(abrt, *shs, abrt);
+ STATE_AO_GC(shs->ao);
+
+ if (!libxl__save_helper_inuse(shs)) {
+ helper_failed(egc, shs, rc);
+ return;
+ }
+
+ if (!shs->rc)
+ shs->rc = rc;
+
+ libxl__kill(gc, shs->child.pid, SIGTERM, "save/restore helper");
+}
+
+void libxl__save_helper_abort(libxl__egc *egc,
+ libxl__save_helper_state *shs)
+{
+ helper_stop(egc, &shs->abrt, ERROR_FAIL);
}
static void helper_stdout_readable(libxl__egc *egc, libxl__ev_fd *ev,
@@ -305,19 +301,22 @@ static void helper_exited(libxl__egc *egc, libxl__ev_child *ch,
if (status) {
libxl_report_child_exitstatus(CTX, XTL_ERROR, what, pid, status);
- shs->rc = ERROR_FAIL;
+ if (!shs->rc)
+ shs->rc = ERROR_FAIL;
}
if (shs->need_results) {
- if (!shs->rc)
+ if (!shs->rc) {
LOG(ERROR,"%s exited without providing results",what);
- shs->rc = ERROR_FAIL;
+ shs->rc = ERROR_FAIL;
+ }
}
if (!shs->completed) {
- if (!shs->rc)
+ if (!shs->rc) {
LOG(ERROR,"%s exited without signaling completion",what);
- shs->rc = ERROR_FAIL;
+ shs->rc = ERROR_FAIL;
+ }
}
helper_done(egc, shs);
@@ -328,11 +327,11 @@ static void helper_done(libxl__egc *egc, libxl__save_helper_state *shs)
{
STATE_AO_GC(shs->ao);
+ libxl__ao_abortable_deregister(&shs->abrt);
libxl__ev_fd_deregister(gc, &shs->readable);
libxl__carefd_close(shs->pipes[0]); shs->pipes[0] = 0;
libxl__carefd_close(shs->pipes[1]); shs->pipes[1] = 0;
- assert(!libxl__ev_child_inuse(&shs->child));
- if (shs->toolstack_data_file) fclose(shs->toolstack_data_file);
+ assert(!libxl__save_helper_inuse(shs));
shs->egc = egc;
shs->completion_callback(egc, shs->caller_state,
diff --git a/tools/libxl/libxl_save_helper.c b/tools/libxl/libxl_save_helper.c
index 74826a1..57ae978 100644
--- a/tools/libxl/libxl_save_helper.c
+++ b/tools/libxl/libxl_save_helper.c
@@ -40,8 +40,11 @@
#include <unistd.h>
#include <assert.h>
#include <inttypes.h>
+#include <fcntl.h>
+#include <signal.h>
#include "libxl.h"
+#include "libxl_utils.h"
#include "xenctrl.h"
#include "xenguest.h"
@@ -85,6 +88,7 @@ static xentoollog_logger logger = {
tellparent_destroy,
};
static xc_interface *xch;
+static int io_fd;
/*----- error handling -----*/
@@ -119,6 +123,58 @@ static void *xmalloc(size_t sz)
return r;
}
+/*----- signal handling -----*/
+
+static int unwriteable_fd;
+
+static void save_signal_handler(int num)
+{
+ /*
+ * We want to be able to interrupt save. But the code in libxc
+ * which does the actual saving is straight-through, and we need
+ * to execute its error path to put the guest back to sanity.
+ *
+ * So what we do is this: when we get the signal, we dup2
+ * the result of open("/dev/null",O_RDONLY) onto the output fd.
+ *
+ * This is guaranteed to 1. interrupt libxc's write (causing it to
+ * return short, or maybe EINTR); 2. make the next write give
+ * EBADF, so that: 3. at latest, libxc will notice when it next
+ * tries to write data and will then go into its cleanup path.
+ *
+ * We make no effort here to sanitise the resulting errors.
+ * That's libxl's job.
+ */
+ int esave = errno;
+
+ int r = dup2(unwriteable_fd, io_fd);
+ assert(r == io_fd); /* if not we can't write an xtl message because we
+ * might end up interleaving on our control stream */
+
+ errno = esave;
+}
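+
+/* Note: dup2() is async-signal-safe per POSIX, which is what makes the
+ * handler above legitimate. */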
+
+static void setup_signals(void (*handler)(int))
+{
+ struct sigaction sa;
+ sigset_t spmask;
+ int r;
+
+ unwriteable_fd = open("/dev/null",O_RDONLY);
+ if (unwriteable_fd < 0) fail(errno,"open /dev/null for reading");
+
+ LIBXL_FILLZERO(sa);
+ sa.sa_handler = handler;
+ sigemptyset(&sa.sa_mask);
+ r = sigaction(SIGTERM, &sa, 0);
+ if (r) fail(errno,"sigaction SIGTERM failed");
+
+ sigemptyset(&spmask);
+ sigaddset(&spmask,SIGTERM);
+ r = sigprocmask(SIG_UNBLOCK,&spmask,0);
+ if (r) fail(errno,"sigprocmask unblock SIGTERM failed");
+}
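+
+/*
+ * The save path installs save_signal_handler so that SIGTERM redirects
+ * io_fd to /dev/null; the restore path passes SIG_DFL so the default
+ * action (termination) applies and libxl reaps the helper.
+ */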
+
/*----- helper functions called by autogenerated stubs -----*/
unsigned char * helper_allocbuf(int len, void *user)
@@ -157,32 +213,8 @@ int helper_getreply(void *user)
/*----- other callbacks -----*/
-static int toolstack_save_fd;
-static uint32_t toolstack_save_len;
static struct save_callbacks helper_save_callbacks;
-static int toolstack_save_cb(uint32_t domid, uint8_t **buf,
- uint32_t *len, void *data)
-{
- int r;
-
- assert(toolstack_save_fd > 0);
-
- /* This is a hack for remus */
- if (helper_save_callbacks.checkpoint) {
- r = lseek(toolstack_save_fd, 0, SEEK_SET);
- if (r) fail(errno,"rewind toolstack data tmpfile");
- }
-
- *buf = xmalloc(toolstack_save_len);
- r = read_exactly(toolstack_save_fd, *buf, toolstack_save_len);
- if (r<0) fail(errno,"read toolstack data");
- if (r==0) fail(0,"read toolstack data eof");
-
- *len = toolstack_save_len;
- return 0;
-}
-
static void startup(const char *op) {
xtl_log(&logger,XTL_DEBUG,0,program,"starting %s",op);
@@ -211,30 +243,27 @@ int main(int argc, char **argv)
if (!strcmp(mode,"--save-domain")) {
- int io_fd = atoi(NEXTARG);
+ io_fd = atoi(NEXTARG);
uint32_t dom = strtoul(NEXTARG,0,10);
uint32_t max_iters = strtoul(NEXTARG,0,10);
uint32_t max_factor = strtoul(NEXTARG,0,10);
uint32_t flags = strtoul(NEXTARG,0,10);
int hvm = atoi(NEXTARG);
- toolstack_save_fd = atoi(NEXTARG);
- toolstack_save_len = strtoul(NEXTARG,0,10);
unsigned cbflags = strtoul(NEXTARG,0,10);
assert(!*++argv);
- if (toolstack_save_fd >= 0)
- helper_save_callbacks.toolstack_save = toolstack_save_cb;
-
helper_setcallbacks_save(&helper_save_callbacks, cbflags);
startup("save");
+ setup_signals(save_signal_handler);
+
r = xc_domain_save(xch, io_fd, dom, max_iters, max_factor, flags,
&helper_save_callbacks, hvm);
complete(r);
} else if (!strcmp(mode,"--restore-domain")) {
- int io_fd = atoi(NEXTARG);
+ io_fd = atoi(NEXTARG);
uint32_t dom = strtoul(NEXTARG,0,10);
unsigned store_evtchn = strtoul(NEXTARG,0,10);
domid_t store_domid = strtoul(NEXTARG,0,10);
@@ -253,6 +282,8 @@ int main(int argc, char **argv)
unsigned long console_mfn = 0;
startup("restore");
+ setup_signals(SIG_DFL);
+
r = xc_domain_restore(xch, io_fd, dom, store_evtchn, &store_mfn,
store_domid, console_evtchn, &console_mfn,
console_domid, hvm, pae, superpages,
diff --git a/tools/libxl/libxl_save_msgs_gen.pl b/tools/libxl/libxl_save_msgs_gen.pl
index 6b4b65e..d6d2967 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -25,15 +25,12 @@ our @msgs = (
'unsigned long', 'total'] ],
[ 3, 'scxA', "suspend", [] ],
[ 4, 'scxA', "postcopy", [] ],
- [ 5, 'scxA', "checkpoint", [] ],
+ [ 5, 'srcxA', "checkpoint", [] ],
[ 6, 'scxA', "switch_qemu_logdirty", [qw(int domid
unsigned enable)] ],
- # toolstack_save done entirely `by hand'
- [ 7, 'rcxW', "toolstack_restore", [qw(uint32_t domid
- BLOCK tsdata)] ],
- [ 8, 'r', "restore_results", ['unsigned long', 'store_mfn',
+ [ 7, 'r', "restore_results", ['unsigned long', 'store_mfn',
'unsigned long', 'console_mfn'] ],
- [ 9, 'srW', "complete", [qw(int retval
+ [ 8, 'srW', "complete", [qw(int retval
int errnoval)] ],
);
diff --git a/tools/libxl/libxl_sr_stream_format.h b/tools/libxl/libxl_sr_stream_format.h
new file mode 100644
index 0000000..54da360
--- /dev/null
+++ b/tools/libxl/libxl_sr_stream_format.h
@@ -0,0 +1,58 @@
+#ifndef LIBXL__SR_STREAM_FORMAT_H
+#define LIBXL__SR_STREAM_FORMAT_H
+
+/*
+ * C structures for the Migration v2 stream format.
+ * See docs/specs/libxl-migration-stream.pandoc
+ */
+
+#include <stdint.h>
+
+typedef struct libxl__sr_hdr
+{
+ uint64_t ident;
+ uint32_t version;
+ uint32_t options;
+} libxl__sr_hdr;
+
+#define RESTORE_STREAM_IDENT 0x4c6962786c466d74UL
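+/* The ident spells "LibxlFmt" in ASCII when read big-endian. */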
+#define RESTORE_STREAM_VERSION 0x00000002U
+
+#define RESTORE_OPT_BIG_ENDIAN (1u << 0)
+#define RESTORE_OPT_LEGACY (1u << 1)
+
+
+typedef struct libxl__sr_rec_hdr
+{
+ uint32_t type;
+ uint32_t length;
+} libxl__sr_rec_hdr;
+
+/* All records must be aligned up to an 8 octet boundary */
+#define REC_ALIGN_ORDER 3U
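+/*
+ * e.g. a record with a 13 octet body occupies
+ * ROUNDUP(13, REC_ALIGN_ORDER) == 16 octets on the wire.
+ */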
+
+#define REC_TYPE_END 0x00000000U
+#define REC_TYPE_LIBXC_CONTEXT 0x00000001U
+#define REC_TYPE_EMULATOR_XENSTORE_DATA 0x00000002U
+#define REC_TYPE_EMULATOR_CONTEXT 0x00000003U
+#define REC_TYPE_CHECKPOINT_END 0x00000004U
+
+typedef struct libxl__sr_emulator_hdr
+{
+ uint32_t id;
+ uint32_t index;
+} libxl__sr_emulator_hdr;
+
+#define EMULATOR_UNKNOWN 0x00000000U
+#define EMULATOR_QEMU_TRADITIONAL 0x00000001U
+#define EMULATOR_QEMU_UPSTREAM 0x00000002U
+
+#endif /* LIBXL__SR_STREAM_FORMAT_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxl_stream_read.c b/tools/libxl/libxl_stream_read.c
new file mode 100644
index 0000000..4ec29da
--- /dev/null
+++ b/tools/libxl/libxl_stream_read.c
@@ -0,0 +1,829 @@
+/*
+ * Copyright (C) 2015 Citrix Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only, with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+
+/*
+ * Infrastructure for reading and acting on the contents of a libxl
+ * migration stream. There are a lot of moving parts here.
+ *
+ * The logic revolves around two actions: reading another record from
+ * the stream, and processing the records. The stream_continue()
+ * function is responsible for choosing the next action to perform.
+ *
+ * The exact order of reading and processing is controlled by 'phase'.
+ * All complete records are held in the record_queue before being
+ * processed, and all records will be processed in queue order.
+ *
+ * Internal states:
+ *                running  phase        in_checkpoint  record_queue  incoming_record
+ *
+ *   Undefined    undef    undef        undef          undef         undef
+ *   Idle         false    undef        false          0             0
+ *   Active       true     NORMAL       false          0/1           0/partial
+ *   Active       true     BUFFERING    true           any           0/partial
+ *   Active       true     UNBUFFERING  true           any           0
+ *
+ * While reading data from the stream, 'dc' is active and a callback
+ * is expected. Most actions in process_record() start a callback of
+ * their own. Those which don't simply return, and stream_continue()
+ * sets up the next action.
+ *
+ * PHASE_NORMAL:
+ * This phase is used for regular migration or resume from file.
+ * Records are read one at a time and immediately processed. (The
+ * record queue will not contain more than a single record.)
+ *
+ * PHASE_BUFFERING:
+ * This phase is used in checkpointed streams, when libxc signals
+ * the presence of a checkpoint in the stream. Records are read and
+ * buffered until a CHECKPOINT_END record has been read.
+ *
+ * PHASE_UNBUFFERING:
+ * Once a CHECKPOINT_END record has been read, all buffered records
+ * are processed.
+ *
+ * Note:
+ * Record buffers are not allocated from a GC; they are allocated
+ * and tracked manually. This is to avoid OOM with Remus where the
+ * AO lives for the lifetime of the process. Per-checkpoint AO's
+ * might be an avenue to explore.
+ *
+ * Entry points from outside:
+ * - libxl__stream_read_init()
+ * - Initialises state. Must be called once before _start()
+ * - libxl__stream_read_start()
+ * - Starts reading records from the stream, and acting on them.
+ * - libxl__stream_read_start_checkpoint()
+ * - Starts buffering records at a checkpoint. Must be called on
+ * a running stream.
+ *
+ * There are several chains of events:
+ *
+ * 1) Starting a stream follows:
+ * - libxl__stream_read_start()
+ * - stream_header_done()
+ * - stream_continue()
+ *
+ * 2) Reading a record follows:
+ * - stream_continue()
+ * - record_header_done()
+ * - record_body_done()
+ * - stream_continue()
+ *
+ * 3) Processing a record has several chains to follow, depending on
+ * the record in question.
+ * 3a) "Simple" record:
+ * - process_record()
+ * - stream_continue()
+ * 3b) LIBXC record:
+ * - process_record()
+ * - libxl__xc_domain_restore()
+ * - libxl__xc_domain_restore_done()
+ * - stream_continue()
+ * 3c) EMULATOR record:
+ * - process_record()
+ * - stream_write_emulator()
+ * - stream_write_emulator_done()
+ * - stream_continue()
+ *
+ * Depending on the contents of the stream, there are likely to be several
+ * parallel tasks being managed. check_all_finished() is used to join all
+ * tasks in both success and error cases.
+ */
+
+/* Success/error/cleanup handling. */
+static void stream_complete(libxl__egc *egc,
+ libxl__stream_read_state *stream, int rc);
+static void checkpoint_done(libxl__egc *egc,
+ libxl__stream_read_state *stream, int rc);
+static void stream_done(libxl__egc *egc,
+ libxl__stream_read_state *stream, int rc);
+static void conversion_done(libxl__egc *egc,
+ libxl__conversion_helper_state *chs, int rc);
+static void check_all_finished(libxl__egc *egc,
+ libxl__stream_read_state *stream, int rc);
+
+/* Event chain for first iteration, from _start(). */
+static void stream_header_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval);
+static void stream_continue(libxl__egc *egc,
+ libxl__stream_read_state *stream);
+static void setup_read_record(libxl__egc *egc,
+ libxl__stream_read_state *stream);
+static void record_header_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval);
+static void record_body_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval);
+static bool process_record(libxl__egc *egc,
+ libxl__stream_read_state *stream);
+
+/* Event chain for processing an emulator blob. */
+static void write_emulator_blob(libxl__egc *egc,
+ libxl__stream_read_state *stream,
+ libxl__sr_record_buf *rec);
+static void write_emulator_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval);
+
+/*----- Helpers -----*/
+
+/* Helper to set up reading some data from the stream. */
+static int setup_read(libxl__stream_read_state *stream,
+ const char *what, void *ptr, size_t nr_bytes,
+ libxl__datacopier_callback cb)
+{
+ libxl__datacopier_state *dc = &stream->dc;
+
+ dc->readwhat = what;
+ dc->readbuf = ptr;
+ dc->bytes_to_read = nr_bytes;
+ dc->used = 0;
+ dc->callback = cb;
+
+ return libxl__datacopier_start(dc);
+}
+
+static void free_record(libxl__sr_record_buf *rec)
+{
+ if (rec) {
+ free(rec->body);
+ free(rec);
+ }
+}
+
+/*----- Entrypoints -----*/
+
+void libxl__stream_read_init(libxl__stream_read_state *stream)
+{
+ assert(stream->ao);
+
+ stream->shs.ao = stream->ao;
+ libxl__save_helper_init(&stream->shs);
+
+ stream->chs.ao = stream->ao;
+ libxl__conversion_helper_init(&stream->chs);
+
+ stream->rc = 0;
+ stream->running = false;
+ stream->in_checkpoint = false;
+ stream->sync_teardown = false;
+ FILLZERO(stream->dc);
+ FILLZERO(stream->hdr);
+ LIBXL_STAILQ_INIT(&stream->record_queue);
+ stream->phase = SRS_PHASE_NORMAL;
+ stream->recursion_guard = false;
+ stream->incoming_record = NULL;
+ FILLZERO(stream->emu_dc);
+ stream->emu_carefd = NULL;
+}
+
+void libxl__stream_read_start(libxl__egc *egc,
+ libxl__stream_read_state *stream)
+{
+ libxl__datacopier_state *dc = &stream->dc;
+ STATE_AO_GC(stream->ao);
+ int rc = 0;
+
+ libxl__stream_read_init(stream);
+
+ stream->running = true;
+ stream->phase = SRS_PHASE_NORMAL;
+
+ if (stream->legacy) {
+ /* Convert the legacy stream. */
+ libxl__conversion_helper_state *chs = &stream->chs;
+
+ chs->legacy_fd = stream->fd;
+ chs->hvm =
+ (stream->dcs->guest_config->b_info.type == LIBXL_DOMAIN_TYPE_HVM);
+ chs->completion_callback = conversion_done;
+
+ rc = libxl__convert_legacy_stream(egc, &stream->chs);
+
+ if (rc) {
+ LOG(ERROR, "Failed to start the legacy stream conversion helper");
+ goto err;
+ }
+
+ assert(stream->chs.v2_carefd);
+ stream->fd = libxl__carefd_fd(stream->chs.v2_carefd);
+ stream->dcs->libxc_fd = stream->fd;
+ }
+ /* stream->fd is now a v2 stream. */
+
+ dc->ao = stream->ao;
+ dc->copywhat = "restore v2 stream";
+ dc->readfd = stream->fd;
+ dc->writefd = -1;
+
+ /* Start reading the stream header. */
+ rc = setup_read(stream, "stream header",
+ &stream->hdr, sizeof(stream->hdr),
+ stream_header_done);
+ if (rc)
+ goto err;
+
+ assert(!rc);
+ return;
+
+ err:
+ assert(rc);
+ stream_complete(egc, stream, rc);
+}
+
+void libxl__stream_read_start_checkpoint(libxl__egc *egc,
+ libxl__stream_read_state *stream)
+{
+ assert(stream->running);
+ assert(!stream->in_checkpoint);
+
+ stream->in_checkpoint = true;
+ stream->phase = SRS_PHASE_BUFFERING;
+
+ /*
+ * Libxc has handed control of the fd to us. Start reading some
+ * libxl records out of it.
+ */
+ stream_continue(egc, stream);
+}
+
+void libxl__stream_read_abort(libxl__egc *egc,
+ libxl__stream_read_state *stream, int rc)
+{
+ assert(rc);
+
+ if (stream->running)
+ stream_complete(egc, stream, rc);
+}
+
+/*----- Event logic -----*/
+
+static void stream_header_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval)
+{
+ libxl__stream_read_state *stream = CONTAINER_OF(dc, *stream, dc);
+ libxl__sr_hdr *hdr = &stream->hdr;
+ STATE_AO_GC(dc->ao);
+
+ if (rc)
+ goto err;
+
+ hdr->ident = be64toh(hdr->ident);
+ hdr->version = be32toh(hdr->version);
+ hdr->options = be32toh(hdr->options);
+
+ if (hdr->ident != RESTORE_STREAM_IDENT) {
+ rc = ERROR_FAIL;
+ LOG(ERROR,
+ "Invalid ident: expected 0x%016"PRIx64", got 0x%016"PRIx64,
+ RESTORE_STREAM_IDENT, hdr->ident);
+ goto err;
+ }
+ if (hdr->version != RESTORE_STREAM_VERSION) {
+ rc = ERROR_FAIL;
+ LOG(ERROR, "Unexpected Version: expected %"PRIu32", got %"PRIu32,
+ RESTORE_STREAM_VERSION, hdr->version);
+ goto err;
+ }
+ if (hdr->options & RESTORE_OPT_BIG_ENDIAN) {
+ rc = ERROR_FAIL;
+ LOG(ERROR, "Unable to handle big endian streams");
+ goto err;
+ }
+
+ LOG(DEBUG, "Stream v%"PRIu32"%s", hdr->version,
+ hdr->options & RESTORE_OPT_LEGACY ? " (from legacy)" : "");
+
+ stream_continue(egc, stream);
+ return;
+
+ err:
+ assert(rc);
+ stream_complete(egc, stream, rc);
+}
+
+static void stream_continue(libxl__egc *egc,
+ libxl__stream_read_state *stream)
+{
+ STATE_AO_GC(stream->ao);
+
+ /*
+ * Must not mutually recurse with process_record().
+ *
+ * For records whose processing function is synchronous
+ * (e.g. TOOLSTACK), process_record() does not start another async
+ * operation, and a further operation should be started.
+ *
+ * A naive solution, which would function in general, would be for
+ * process_record() to call stream_continue(). However, this
+ * would allow the content of the stream to cause mutual
+ * recursion, and possibly for us to fall off our stack.
+ *
+ * Instead, process_record() indicates with its return value
+ * whether a further operation needs to start, and the
+ * recursion_guard is in place to catch any code paths which get
+ * this wrong.
+ */
+ assert(stream->recursion_guard == false);
+ stream->recursion_guard = true;
+
+ switch (stream->phase) {
+ case SRS_PHASE_NORMAL:
+ /*
+ * Normal phase (regular migration or restore from file):
+ *
+ * logically:
+ * do { read_record(); process_record(); } while ( not END );
+ *
+ * Alternate between reading a record from the stream, and
+ * processing the record. There should never be two records
+ * in the queue.
+ */
+ if (LIBXL_STAILQ_EMPTY(&stream->record_queue))
+ setup_read_record(egc, stream);
+ else {
+ if (process_record(egc, stream))
+ setup_read_record(egc, stream);
+
+ /*
+ * process_record() had better have consumed the one and
+ * only record in the queue.
+ */
+ assert(LIBXL_STAILQ_EMPTY(&stream->record_queue));
+ }
+ break;
+
+ case SRS_PHASE_BUFFERING: {
+ /*
+ * Buffering phase (checkpointed streams only):
+ *
+ * logically:
+ * do { read_record(); } while ( not CHECKPOINT_END );
+ *
+ * Read and buffer all records from the stream until a
+ * CHECKPOINT_END record is encountered. We need to peek at
+ * the tail to spot the CHECKPOINT_END record, and switch to
+ * the unbuffering phase.
+ */
+ libxl__sr_record_buf *rec = LIBXL_STAILQ_LAST(
+ &stream->record_queue, libxl__sr_record_buf, entry);
+
+ assert(stream->in_checkpoint);
+
+ if (!rec || (rec->hdr.type != REC_TYPE_CHECKPOINT_END)) {
+ setup_read_record(egc, stream);
+ break;
+ }
+
+ /*
+ * There are now some number of buffered records, with a
+ * CHECKPOINT_END at the end. Start processing them all.
+ */
+ stream->phase = SRS_PHASE_UNBUFFERING;
+ }
+ /* FALLTHROUGH */
+ case SRS_PHASE_UNBUFFERING:
+ /*
+ * Unbuffering phase (checkpointed streams only):
+ *
+ * logically:
+ * do { process_record(); } while ( not CHECKPOINT_END );
+ *
+ * Process all records collected during the buffering phase.
+ */
+ assert(stream->in_checkpoint);
+
+ while (process_record(egc, stream))
+ ; /*
+ * Nothing! process_record() helpfully tells us if no specific
+ * further actions have been set up, in which case we want to go
+ * ahead and process the next record.
+ */
+ break;
+
+ default:
+ abort();
+ }
+
+ assert(stream->recursion_guard == true);
+ stream->recursion_guard = false;
+}
+
+static void setup_read_record(libxl__egc *egc,
+ libxl__stream_read_state *stream)
+{
+ libxl__sr_record_buf *rec = NULL;
+ STATE_AO_GC(stream->ao);
+ int rc;
+
+ assert(stream->incoming_record == NULL);
+ stream->incoming_record = rec = libxl__zalloc(NOGC, sizeof(*rec));
+
+ rc = setup_read(stream, "record header",
+ &rec->hdr, sizeof(rec->hdr),
+ record_header_done);
+ if (rc)
+ goto err;
+ return;
+
+ err:
+ assert(rc);
+ stream_complete(egc, stream, rc);
+}
+
+static void record_header_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval)
+{
+ libxl__stream_read_state *stream = CONTAINER_OF(dc, *stream, dc);
+ libxl__sr_record_buf *rec = stream->incoming_record;
+ STATE_AO_GC(dc->ao);
+
+ if (rc)
+ goto err;
+
+ /* No body? All done. */
+ if (rec->hdr.length == 0) {
+ record_body_done(egc, dc, 0, 0, 0);
+ return;
+ }
+
+ size_t bytes_to_read = ROUNDUP(rec->hdr.length, REC_ALIGN_ORDER);
+ rec->body = libxl__malloc(NOGC, bytes_to_read);
+
+ rc = setup_read(stream, "record body",
+ rec->body, bytes_to_read,
+ record_body_done);
+ if (rc)
+ goto err;
+ return;
+
+ err:
+ assert(rc);
+ stream_complete(egc, stream, rc);
+}
+
+static void record_body_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval)
+{
+ libxl__stream_read_state *stream = CONTAINER_OF(dc, *stream, dc);
+ libxl__sr_record_buf *rec = stream->incoming_record;
+ STATE_AO_GC(dc->ao);
+
+ if (rc)
+ goto err;
+
+ LIBXL_STAILQ_INSERT_TAIL(&stream->record_queue, rec, entry);
+ stream->incoming_record = NULL;
+
+ stream_continue(egc, stream);
+ return;
+
+ err:
+ assert(rc);
+ stream_complete(egc, stream, rc);
+}
+
+/*
+ * Returns a boolean indicating whether a further action should be set
+ * up by the caller. This is needed to prevent mutual recursion with
+ * stream_continue().
+ *
+ * It is a bug for this function to ever call stream_continue() or
+ * setup_read_record().
+ */
+static bool process_record(libxl__egc *egc,
+ libxl__stream_read_state *stream)
+{
+ STATE_AO_GC(stream->ao);
+ libxl__domain_create_state *dcs = stream->dcs;
+ libxl__sr_record_buf *rec;
+ bool further_action_needed = false;
+ int rc = 0;
+
+ /* Pop a record from the head of the queue. */
+ assert(!LIBXL_STAILQ_EMPTY(&stream->record_queue));
+ rec = LIBXL_STAILQ_FIRST(&stream->record_queue);
+ LIBXL_STAILQ_REMOVE_HEAD(&stream->record_queue, entry);
+
+ LOG(DEBUG, "Record: %u, length %u", rec->hdr.type, rec->hdr.length);
+
+ switch (rec->hdr.type) {
+
+ case REC_TYPE_END:
+ stream_complete(egc, stream, 0);
+ break;
+
+ case REC_TYPE_LIBXC_CONTEXT:
+ libxl__xc_domain_restore(egc, dcs, &stream->shs, 0, 0, 0);
+ break;
+
+ case REC_TYPE_EMULATOR_XENSTORE_DATA:
+ if (rec->hdr.length < sizeof(libxl__sr_emulator_hdr)) {
+ rc = ERROR_FAIL;
+ LOG(ERROR,
+ "Emulator xenstore data record too short to contain header");
+ goto err;
+ }
+
+ rc = libxl__restore_emulator_xenstore_data(dcs,
+ rec->body + sizeof(libxl__sr_emulator_hdr),
+ rec->hdr.length - sizeof(libxl__sr_emulator_hdr));
+ if (rc)
+ goto err;
+
+ /*
+ * libxl__restore_emulator_xenstore_data() is a synchronous function.
+ * Request that our caller queues another action for us.
+ */
+ further_action_needed = true;
+ break;
+
+ case REC_TYPE_EMULATOR_CONTEXT:
+ write_emulator_blob(egc, stream, rec);
+ break;
+
+ case REC_TYPE_CHECKPOINT_END:
+ if (!stream->in_checkpoint) {
+ LOG(ERROR, "Unexpected CHECKPOINT_END record in stream");
+ rc = ERROR_FAIL;
+ goto err;
+ }
+ checkpoint_done(egc, stream, 0);
+ break;
+
+ default:
+ LOG(ERROR, "Unrecognised record 0x%08x", rec->hdr.type);
+ rc = ERROR_FAIL;
+ goto err;
+ }
+
+ assert(!rc);
+ free_record(rec);
+ return further_action_needed;
+
+ err:
+ assert(rc);
+ free_record(rec);
+ stream_complete(egc, stream, rc);
+ return false;
+}
+
+static void write_emulator_blob(libxl__egc *egc,
+ libxl__stream_read_state *stream,
+ libxl__sr_record_buf *rec)
+{
+ libxl__domain_create_state *dcs = stream->dcs;
+ libxl__datacopier_state *dc = &stream->emu_dc;
+ libxl__sr_emulator_hdr *emu_hdr;
+ STATE_AO_GC(stream->ao);
+ char path[256];
+ int rc = 0, writefd;
+
+ if (rec->hdr.length < sizeof(*emu_hdr)) {
+ rc = ERROR_FAIL;
+ LOG(ERROR, "Emulator record too short to contain header");
+ goto err;
+ }
+ emu_hdr = rec->body;
+
+ sprintf(path, LIBXL_DEVICE_MODEL_RESTORE_FILE".%u", dcs->guest_domid);
+
+ assert(stream->emu_carefd == NULL);
+ libxl__carefd_begin();
+ writefd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600);
+ stream->emu_carefd = libxl__carefd_opened(CTX, writefd);
+
+ if (writefd == -1) {
+ rc = ERROR_FAIL;
+ LOGE(ERROR, "unable to open %s", path);
+ goto err;
+ }
+
+ FILLZERO(*dc);
+ dc->ao = stream->ao;
+ dc->writewhat = "qemu save file";
+ dc->copywhat = "restore v2 stream";
+ dc->writefd = writefd;
+ dc->readfd = -1;
+ dc->maxsz = -1;
+ dc->callback = write_emulator_done;
+
+ rc = libxl__datacopier_start(dc);
+ if (rc)
+ goto err;
+
+ libxl__datacopier_prefixdata(egc, dc,
+ rec->body + sizeof(*emu_hdr),
+ rec->hdr.length - sizeof(*emu_hdr));
+ return;
+
+ err:
+ assert(rc);
+ stream_complete(egc, stream, rc);
+}
+
+static void write_emulator_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval)
+{
+ libxl__stream_read_state *stream = CONTAINER_OF(dc, *stream, emu_dc);
+ STATE_AO_GC(dc->ao);
+
+ libxl__carefd_close(stream->emu_carefd);
+ stream->emu_carefd = NULL;
+
+ if (rc)
+ goto err;
+
+ stream_continue(egc, stream);
+ return;
+
+ err:
+ assert(rc);
+ stream_complete(egc, stream, rc);
+}
+
+/*----- Success/error/cleanup handling. -----*/
+
+static void stream_complete(libxl__egc *egc,
+ libxl__stream_read_state *stream, int rc)
+{
+ assert(stream->running);
+
+ if (stream->in_checkpoint) {
+ assert(rc);
+
+ /*
+ * If an error is encountered while in a checkpoint, pass it
+ * back to libxc. The failure will come back around to us via
+ * libxl__xc_domain_restore_done().
+ */
+ checkpoint_done(egc, stream, rc);
+ return;
+ }
+
+ stream_done(egc, stream, rc);
+}
+
+static void checkpoint_done(libxl__egc *egc,
+ libxl__stream_read_state *stream, int rc)
+{
+ int ret;
+
+ assert(stream->in_checkpoint);
+
+ if (rc == 0)
+ ret = XGR_CHECKPOINT_SUCCESS;
+ else if (stream->phase == SRS_PHASE_BUFFERING)
+ ret = XGR_CHECKPOINT_FAILOVER;
+ else
+ ret = XGR_CHECKPOINT_ERROR;
+
+ stream->checkpoint_callback(egc, stream, ret);
+
+ stream->in_checkpoint = false;
+ stream->phase = SRS_PHASE_NORMAL;
+}
+
+static void stream_done(libxl__egc *egc,
+ libxl__stream_read_state *stream, int rc)
+{
+ libxl__sr_record_buf *rec, *trec;
+
+ assert(stream->running);
+ assert(!stream->in_checkpoint);
+ stream->running = false;
+
+ if (stream->incoming_record)
+ free_record(stream->incoming_record);
+
+ if (stream->emu_carefd)
+ libxl__carefd_close(stream->emu_carefd);
+
+ /* If we started a conversion helper, we took ownership of its carefd. */
+ if (stream->chs.v2_carefd)
+ libxl__carefd_close(stream->chs.v2_carefd);
+
+ /* The record queue had better be empty if the stream believes
+ * itself to have been successful. */
+ assert(LIBXL_STAILQ_EMPTY(&stream->record_queue) || stream->rc);
+
+ LIBXL_STAILQ_FOREACH_SAFE(rec, &stream->record_queue, entry, trec)
+ free_record(rec);
+
+ check_all_finished(egc, stream, rc);
+}
+
+void libxl__xc_domain_restore_done(libxl__egc *egc, void *dcs_void,
+ int rc, int retval, int errnoval)
+{
+ libxl__domain_create_state *dcs = dcs_void;
+ libxl__stream_read_state *stream = &dcs->srs;
+ STATE_AO_GC(dcs->ao);
+
+ if (rc)
+ goto err;
+
+ if (retval) {
+ LOGEV(ERROR, errnoval, "restoring domain");
+ rc = ERROR_FAIL;
+ goto err;
+ }
+
+ err:
+ check_all_finished(egc, stream, rc);
+
+ /*
+ * This function is the callback associated with the save helper
+ * task, not the stream task. We do not know whether the stream is
+ * alive, and check_all_finished() may have torn it down around us.
+ * If the stream is not still alive, we must not continue any work.
+ */
+ if (libxl__stream_read_inuse(stream)) {
+ /*
+ * Libxc has indicated that it is done with the stream. Resume reading
+ * libxl records from it.
+ */
+ stream_continue(egc, stream);
+ }
+}
+
+static void conversion_done(libxl__egc *egc,
+ libxl__conversion_helper_state *chs, int rc)
+{
+ libxl__stream_read_state *stream = CONTAINER_OF(chs, *stream, chs);
+
+ check_all_finished(egc, stream, rc);
+}
+
+static void check_all_finished(libxl__egc *egc,
+ libxl__stream_read_state *stream, int rc)
+{
+ STATE_AO_GC(stream->ao);
+
+ /*
+ * In the case of a failure, the _abort()s below might cancel
+ * synchronously on top of us, or asynchronously at a later point.
+ *
+ * We must avoid the situation where all the _abort()s cancel
+ * synchronously and the completion_callback() gets called twice:
+ * once by the first error and once by the final stacked abort(),
+ * both of which will find that all of the tasks have stopped.
+ *
+ * To avoid this problem, any stacked re-entry into this function is
+ * ineligible to fire the completion callback. The outermost
+ * instance will take care of completing, once the stack has
+ * unwound.
+ */
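+ /*
+ * A sketch of the hazard (hypothetical call stack) that the
+ * sync_teardown guard below prevents:
+ *
+ *     check_all_finished(rc)      <- outermost; performs teardown
+ *       libxl__save_helper_abort()
+ *         check_all_finished(rc') <- stacked; must return early here
+ */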
+ if (stream->sync_teardown)
+ return;
+
+ if (!stream->rc && rc) {
+ /* First reported failure. Tear everything down. */
+ stream->rc = rc;
+ stream->sync_teardown = true;
+
+ libxl__stream_read_abort(egc, stream, rc);
+ libxl__save_helper_abort(egc, &stream->shs);
+ libxl__conversion_helper_abort(egc, &stream->chs, rc);
+
+ stream->sync_teardown = false;
+ }
+
+ /* Don't fire the callback until all our parallel tasks have stopped. */
+ if (libxl__stream_read_inuse(stream) ||
+ libxl__save_helper_inuse(&stream->shs) ||
+ libxl__conversion_helper_inuse(&stream->chs))
+ return;
+
+ stream->completion_callback(egc, stream, stream->rc);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxl_stream_write.c b/tools/libxl/libxl_stream_write.c
new file mode 100644
index 0000000..52a60d7
--- /dev/null
+++ b/tools/libxl/libxl_stream_write.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (C) 2015 Citrix Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only, with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+
+/*
+ * Infrastructure for writing a domain to a libxl migration v2 stream.
+ *
+ * Entry points from outside:
+ * - libxl__stream_write_start()
+ * - Start writing a stream from the start.
+ * - libxl__stream_write_start_checkpoint()
+ * - Write the records which form a checkpoint into a stream.
+ *
+ * In normal operation, there are two tasks running at once; this
+ * stream processing, and the libxl-save-helper. check_all_finished()
+ * is used to join all the tasks in both success and error cases.
+ *
+ * Nomenclature for event callbacks:
+ * - $FOO_done(): Completion callback for $FOO
+ * - write_$FOO(): Set up the datacopier to write a $FOO
+ * - $BAR_header(): A $BAR record header only
+ * - $BAR_record(): A complete $BAR record with header and content
+ *
+ * The main loop for a plain VM writes:
+ * - Stream header
+ * - Libxc record
+ * - (optional) Emulator xenstore record
+ * - if (hvm)
+ * - Emulator context record
+ * - End record
+ *
+ * For a checkpointed stream, there is a second loop which is triggered by a
+ * save-helper checkpoint callback. It writes:
+ * - (optional) Emulator xenstore record
+ * - if (hvm)
+ * - Emulator context record
+ * - Checkpoint end record
+ */
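+/*
+ * A sketch of the on-the-wire shape of a single record, as assembled
+ * by setup_generic_write() below. The emulator sub-header is optional
+ * and, when present, is counted in the header's length field; the
+ * padding is zeroes, up to a (1 << REC_ALIGN_ORDER)-byte boundary:
+ *
+ *     +------------------------+
+ *     | libxl__sr_rec_hdr      |  type, length
+ *     +------------------------+
+ *     | libxl__sr_emulator_hdr |  optional
+ *     +------------------------+
+ *     | body ...               |
+ *     +------------------------+
+ *     | zero padding           |
+ *     +------------------------+
+ */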
+
+/* Success/error/cleanup handling. */
+static void stream_success(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+static void stream_complete(libxl__egc *egc,
+ libxl__stream_write_state *stream, int rc);
+static void stream_done(libxl__egc *egc,
+ libxl__stream_write_state *stream, int rc);
+static void checkpoint_done(libxl__egc *egc,
+ libxl__stream_write_state *stream,
+ int rc);
+static void check_all_finished(libxl__egc *egc,
+ libxl__stream_write_state *stream, int rc);
+
+/* Event chain for a plain VM. */
+static void stream_header_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval);
+static void libxc_header_done(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+/* libxl__xc_domain_save_done() lives here, event-order wise. */
+static void write_emulator_xenstore_record(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+static void emulator_xenstore_record_done(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+static void write_emulator_context_record(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+static void emulator_context_read_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval);
+static void emulator_context_record_done(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+static void write_end_record(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+
+/* Event chain unique to checkpointed streams. */
+static void write_checkpoint_end_record(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+static void checkpoint_end_record_done(libxl__egc *egc,
+ libxl__stream_write_state *stream);
+
+/*----- Helpers -----*/
+
+static void write_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval);
+
+/* Generic helper to set up writing some data to the stream. */
+static void setup_generic_write(libxl__egc *egc,
+ libxl__stream_write_state *stream,
+ const char *what,
+ libxl__sr_rec_hdr *hdr,
+ libxl__sr_emulator_hdr *emu_hdr,
+ void *body,
+ sws_record_done_cb cb)
+{
+ static const uint8_t zero_padding[1U << REC_ALIGN_ORDER] = { 0 };
+
+ libxl__datacopier_state *dc = &stream->dc;
+ int rc;
+
+ assert(stream->record_done_callback == NULL);
+
+ dc->writewhat = what;
+ dc->used = 0;
+ dc->callback = write_done;
+ rc = libxl__datacopier_start(dc);
+
+ if (rc) {
+ stream_complete(egc, stream, rc);
+ return;
+ }
+
+ size_t padsz = ROUNDUP(hdr->length, REC_ALIGN_ORDER) - hdr->length;
+ uint32_t length = hdr->length;
+
+ /* Insert header */
+ libxl__datacopier_prefixdata(egc, dc, hdr, sizeof(*hdr));
+
+ /* Optional emulator sub-header */
+ if (emu_hdr) {
+ assert(length >= sizeof(*emu_hdr));
+ libxl__datacopier_prefixdata(egc, dc, emu_hdr, sizeof(*emu_hdr));
+ length -= sizeof(*emu_hdr);
+ }
+
+ /* Optional body */
+ if (body)
+ libxl__datacopier_prefixdata(egc, dc, body, length);
+
+ /* Any required padding */
+ if (padsz > 0)
+ libxl__datacopier_prefixdata(egc, dc,
+ zero_padding, padsz);
+ stream->record_done_callback = cb;
+}
+
+/* Helper to set up writing a regular record to the stream. */
+static void setup_write(libxl__egc *egc,
+ libxl__stream_write_state *stream,
+ const char *what,
+ libxl__sr_rec_hdr *hdr,
+ void *body,
+ sws_record_done_cb cb)
+{
+ setup_generic_write(egc, stream, what, hdr, NULL, body, cb);
+}
+
+/* Helper to set up writing a record with an emulator prefix to the stream. */
+static void setup_emulator_write(libxl__egc *egc,
+ libxl__stream_write_state *stream,
+ const char *what,
+ libxl__sr_rec_hdr *hdr,
+ libxl__sr_emulator_hdr *emu_hdr,
+ void *body,
+ sws_record_done_cb cb)
+{
+ setup_generic_write(egc, stream, what, hdr, emu_hdr, body, cb);
+}
+
+
+static void write_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval)
+{
+ libxl__stream_write_state *stream = CONTAINER_OF(dc, *stream, dc);
+ STATE_AO_GC(stream->ao);
+ sws_record_done_cb cb = stream->record_done_callback;
+
+ stream->record_done_callback = NULL;
+
+ if (onwrite || errnoval)
+ stream_complete(egc, stream, rc ?: ERROR_FAIL);
+ else
+ cb(egc, stream);
+}
+
+/*----- Entrypoints -----*/
+
+void libxl__stream_write_init(libxl__stream_write_state *stream)
+{
+ assert(stream->ao);
+
+ stream->shs.ao = stream->ao;
+ libxl__save_helper_init(&stream->shs);
+
+ stream->rc = 0;
+ stream->running = false;
+ stream->in_checkpoint = false;
+ stream->sync_teardown = false;
+ FILLZERO(stream->dc);
+ stream->record_done_callback = NULL;
+ FILLZERO(stream->emu_dc);
+ stream->emu_carefd = NULL;
+ FILLZERO(stream->emu_rec_hdr);
+ FILLZERO(stream->emu_sub_hdr);
+ stream->emu_body = NULL;
+}
+
+void libxl__stream_write_start(libxl__egc *egc,
+ libxl__stream_write_state *stream)
+{
+ libxl__datacopier_state *dc = &stream->dc;
+ libxl__domain_suspend_state *dss = stream->dss;
+ STATE_AO_GC(stream->ao);
+ struct libxl__sr_hdr hdr;
+ int rc = 0;
+
+ libxl__stream_write_init(stream);
+
+ stream->running = true;
+
+ if (dss->type == LIBXL_DOMAIN_TYPE_HVM) {
+ switch (libxl__device_model_version_running(gc, dss->domid)) {
+ case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL:
+ stream->emu_sub_hdr.id = EMULATOR_QEMU_TRADITIONAL;
+ break;
+
+ case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
+ stream->emu_sub_hdr.id = EMULATOR_QEMU_UPSTREAM;
+ break;
+
+ default:
+ rc = ERROR_FAIL;
+ LOG(ERROR, "Unknown emulator for HVM domain");
+ goto err;
+ }
+ stream->emu_sub_hdr.index = 0;
+ }
+
+ dc->ao = ao;
+ dc->readfd = -1;
+ dc->writewhat = "stream header";
+ dc->copywhat = "save v2 stream";
+ dc->writefd = stream->fd;
+ dc->maxsz = -1;
+ dc->callback = stream_header_done;
+
+ rc = libxl__datacopier_start(dc);
+ if (rc)
+ goto err;
+
+ FILLZERO(hdr);
+ hdr.ident = htobe64(RESTORE_STREAM_IDENT);
+ hdr.version = htobe32(RESTORE_STREAM_VERSION);
+ hdr.options = htobe32(0);
+
+ libxl__datacopier_prefixdata(egc, dc, &hdr, sizeof(hdr));
+ return;
+
+ err:
+ assert(rc);
+ stream_complete(egc, stream, rc);
+}
+
+void libxl__stream_write_start_checkpoint(libxl__egc *egc,
+ libxl__stream_write_state *stream)
+{
+ assert(stream->running);
+ assert(!stream->in_checkpoint);
+ stream->in_checkpoint = true;
+
+ write_emulator_xenstore_record(egc, stream);
+}
+
+void libxl__stream_write_abort(libxl__egc *egc,
+ libxl__stream_write_state *stream, int rc)
+{
+ assert(rc);
+
+ if (stream->running)
+ stream_complete(egc, stream, rc);
+}
+
+/*----- Event logic -----*/
+
+static void stream_header_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval)
+{
+ libxl__stream_write_state *stream = CONTAINER_OF(dc, *stream, dc);
+ STATE_AO_GC(stream->ao);
+ struct libxl__sr_rec_hdr rec;
+
+ if (rc || errnoval) {
+ stream_complete(egc, stream, rc ?: ERROR_FAIL);
+ return;
+ }
+
+ FILLZERO(rec);
+ rec.type = REC_TYPE_LIBXC_CONTEXT;
+
+ setup_write(egc, stream, "libxc header",
+ &rec, NULL, libxc_header_done);
+}
+
+static void libxc_header_done(libxl__egc *egc,
+ libxl__stream_write_state *stream)
+{
+ libxl__xc_domain_save(egc, stream->dss, &stream->shs);
+}
+
+void libxl__xc_domain_save_done(libxl__egc *egc, void *dss_void,
+ int rc, int retval, int errnoval)
+{
+ libxl__domain_suspend_state *dss = dss_void;
+ libxl__stream_write_state *stream = &dss->sws;
+ STATE_AO_GC(dss->ao);
+
+ if (rc)
+ goto err;
+
+ if (retval) {
+ LOGEV(ERROR, errnoval, "saving domain: %s",
+ dss->guest_responded ?
+ "domain responded to suspend request" :
+ "domain did not respond to suspend request");
+ if (!dss->guest_responded)
+ rc = ERROR_GUEST_TIMEDOUT;
+ else if (dss->rc)
+ rc = dss->rc;
+ else
+ rc = ERROR_FAIL;
+ goto err;
+ }
+
+ err:
+ check_all_finished(egc, stream, rc);
+
+ /*
+ * This function is the callback associated with the save helper
+ * task, not the stream task. We do not know whether the stream is
+ * alive, and check_all_finished() may have torn it down around us.
+ * If the stream is not still alive, we must not continue any work.
+ */
+ if (libxl__stream_write_inuse(stream))
+ write_emulator_xenstore_record(egc, stream);
+}
+
+static void write_emulator_xenstore_record(libxl__egc *egc,
+ libxl__stream_write_state *stream)
+{
+ libxl__domain_suspend_state *dss = stream->dss;
+ STATE_AO_GC(stream->ao);
+ struct libxl__sr_rec_hdr rec;
+ int rc;
+ char *buf = NULL;
+ uint32_t len = 0;
+
+ rc = libxl__save_emulator_xenstore_data(dss, &buf, &len);
+ if (rc)
+ goto err;
+
+ /* No record? - All done. */
+ if (len == 0) {
+ emulator_xenstore_record_done(egc, stream);
+ return;
+ }
+
+ FILLZERO(rec);
+ rec.type = REC_TYPE_EMULATOR_XENSTORE_DATA;
+ rec.length = len + sizeof(stream->emu_sub_hdr);
+
+ setup_emulator_write(egc, stream, "emulator xenstore record",
+ &rec, &stream->emu_sub_hdr, buf,
+ emulator_xenstore_record_done);
+ return;
+
+ err:
+ assert(rc);
+ stream_complete(egc, stream, rc);
+}
+
+static void emulator_xenstore_record_done(libxl__egc *egc,
+ libxl__stream_write_state *stream)
+{
+ libxl__domain_suspend_state *dss = stream->dss;
+
+ if (dss->type == LIBXL_DOMAIN_TYPE_HVM)
+ write_emulator_context_record(egc, stream);
+ else {
+ if (stream->in_checkpoint)
+ write_checkpoint_end_record(egc, stream);
+ else
+ write_end_record(egc, stream);
+ }
+}
+
+static void write_emulator_context_record(libxl__egc *egc,
+ libxl__stream_write_state *stream)
+{
+ libxl__domain_suspend_state *dss = stream->dss;
+ libxl__datacopier_state *dc = &stream->emu_dc;
+ STATE_AO_GC(stream->ao);
+ struct libxl__sr_rec_hdr *rec = &stream->emu_rec_hdr;
+ struct stat st;
+ int rc;
+
+ assert(dss->type == LIBXL_DOMAIN_TYPE_HVM);
+
+ /* Convenience aliases */
+ const char *const filename = dss->dm_savefile;
+
+ libxl__carefd_begin();
+ int readfd = open(filename, O_RDONLY);
+ stream->emu_carefd = libxl__carefd_opened(CTX, readfd);
+ if (readfd == -1) {
+ rc = ERROR_FAIL;
+ LOGE(ERROR, "unable to open %s", filename);
+ goto err;
+ }
+
+ if (fstat(readfd, &st)) {
+ rc = ERROR_FAIL;
+ LOGE(ERROR, "unable to fstat %s", filename);
+ goto err;
+ }
+
+ if (!S_ISREG(st.st_mode)) {
+ rc = ERROR_FAIL;
+ LOG(ERROR, "%s is not a plain file!", filename);
+ goto err;
+ }
+
+ rec->type = REC_TYPE_EMULATOR_CONTEXT;
+ rec->length = st.st_size + sizeof(stream->emu_sub_hdr);
+ stream->emu_body = libxl__malloc(NOGC, st.st_size);
+
+ FILLZERO(*dc);
+ dc->ao = stream->ao;
+ dc->readwhat = "qemu save file";
+ dc->copywhat = "save v2 stream";
+ dc->readfd = readfd;
+ dc->writefd = -1;
+ dc->maxsz = -1;
+ dc->readbuf = stream->emu_body;
+ dc->bytes_to_read = st.st_size;
+ dc->callback = emulator_context_read_done;
+
+ rc = libxl__datacopier_start(dc);
+ if (rc)
+ goto err;
+
+ return;
+
+ err:
+ assert(rc);
+ stream_complete(egc, stream, rc);
+}
+
+static void emulator_context_read_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int rc, int onwrite, int errnoval)
+{
+ libxl__stream_write_state *stream = CONTAINER_OF(dc, *stream, emu_dc);
+ STATE_AO_GC(stream->ao);
+
+ if (rc || onwrite || errnoval) {
+ stream_complete(egc, stream, rc ?: ERROR_FAIL);
+ return;
+ }
+
+ libxl__carefd_close(stream->emu_carefd);
+ stream->emu_carefd = NULL;
+
+ setup_emulator_write(egc, stream, "emulator record",
+ &stream->emu_rec_hdr,
+ &stream->emu_sub_hdr,
+ stream->emu_body,
+ emulator_context_record_done);
+}
+
+static void emulator_context_record_done(libxl__egc *egc,
+ libxl__stream_write_state *stream)
+{
+ free(stream->emu_body);
+ stream->emu_body = NULL;
+
+ if (stream->in_checkpoint)
+ write_checkpoint_end_record(egc, stream);
+ else
+ write_end_record(egc, stream);
+}
+
+static void write_end_record(libxl__egc *egc,
+ libxl__stream_write_state *stream)
+{
+ struct libxl__sr_rec_hdr rec;
+
+ FILLZERO(rec);
+ rec.type = REC_TYPE_END;
+
+ setup_write(egc, stream, "end record",
+ &rec, NULL, stream_success);
+}
+
+static void write_checkpoint_end_record(libxl__egc *egc,
+ libxl__stream_write_state *stream)
+{
+ struct libxl__sr_rec_hdr rec;
+
+ FILLZERO(rec);
+ rec.type = REC_TYPE_CHECKPOINT_END;
+
+ setup_write(egc, stream, "checkpoint end record",
+ &rec, NULL, checkpoint_end_record_done);
+}
+
+static void checkpoint_end_record_done(libxl__egc *egc,
+ libxl__stream_write_state *stream)
+{
+ checkpoint_done(egc, stream, 0);
+}
+
+/*----- Success/error/cleanup handling. -----*/
+
+static void stream_success(libxl__egc *egc, libxl__stream_write_state *stream)
+{
+ stream_complete(egc, stream, 0);
+}
+
+static void stream_complete(libxl__egc *egc,
+ libxl__stream_write_state *stream, int rc)
+{
+ assert(stream->running);
+
+ if (stream->in_checkpoint) {
+ assert(rc);
+
+ /*
+ * If an error is encountered while in a checkpoint, pass it
+ * back to libxc. The failure will come back around to us via
+ * libxl__xc_domain_save_done().
+ */
+ checkpoint_done(egc, stream, rc);
+ return;
+ }
+
+ stream_done(egc, stream, rc);
+}
+
+static void stream_done(libxl__egc *egc,
+ libxl__stream_write_state *stream, int rc)
+{
+ assert(stream->running);
+ stream->running = false;
+
+ if (stream->emu_carefd)
+ libxl__carefd_close(stream->emu_carefd);
+ free(stream->emu_body);
+
+ check_all_finished(egc, stream, rc);
+}
+
+static void checkpoint_done(libxl__egc *egc,
+ libxl__stream_write_state *stream,
+ int rc)
+{
+ assert(stream->in_checkpoint);
+
+ stream->in_checkpoint = false;
+ stream->checkpoint_callback(egc, stream, rc);
+}
+
+static void check_all_finished(libxl__egc *egc,
+ libxl__stream_write_state *stream,
+ int rc)
+{
+ STATE_AO_GC(stream->ao);
+
+ /*
+ * In the case of a failure, the _abort()s below might cancel
+ * synchronously on top of us, or asynchronously at a later point.
+ *
+ * We must avoid the situation where all the _abort()s cancel
+ * synchronously and the completion_callback() gets called twice:
+ * once by the first error and once by the final stacked abort(),
+ * both of which will find that all of the tasks have stopped.
+ *
+ * To avoid this problem, any stacked re-entry into this function is
+ * ineligible to fire the completion callback. The outermost
+ * instance will take care of completing, once the stack has
+ * unwound.
+ */
+ if (stream->sync_teardown)
+ return;
+
+ if (!stream->rc && rc) {
+ /* First reported failure. Tear everything down. */
+ stream->rc = rc;
+ stream->sync_teardown = true;
+
+ libxl__stream_write_abort(egc, stream, rc);
+ libxl__save_helper_abort(egc, &stream->shs);
+
+ stream->sync_teardown = false;
+ }
+
+ /* Don't fire the callback until all our parallel tasks have stopped. */
+ if (libxl__stream_write_inuse(stream) ||
+ libxl__save_helper_inuse(&stream->shs))
+ return;
+
+ stream->completion_callback(egc, stream, stream->rc);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxl_test_fdevent.c b/tools/libxl/libxl_test_fdevent.c
new file mode 100644
index 0000000..2d875d9
--- /dev/null
+++ b/tools/libxl/libxl_test_fdevent.c
@@ -0,0 +1,79 @@
+/*
+ * fdevent test helper for the libxl event system
+ */
+
+#include "libxl_internal.h"
+
+#include "libxl_test_fdevent.h"
+
+typedef struct {
+ libxl__ao *ao;
+ libxl__ev_fd fd;
+ libxl__ao_abortable abrt;
+} libxl__test_fdevent;
+
+static void fdevent_complete(libxl__egc *egc, libxl__test_fdevent *tfe,
+ int rc);
+
+static void tfe_init(libxl__test_fdevent *tfe, libxl__ao *ao)
+{
+ tfe->ao = ao;
+ libxl__ev_fd_init(&tfe->fd);
+ libxl__ao_abortable_init(&tfe->abrt);
+}
+
+static void tfe_cleanup(libxl__gc *gc, libxl__test_fdevent *tfe)
+{
+ libxl__ev_fd_deregister(gc, &tfe->fd);
+ libxl__ao_abortable_deregister(&tfe->abrt);
+}
+
+static void tfe_fd_cb(libxl__egc *egc, libxl__ev_fd *ev,
+ int fd, short events, short revents)
+{
+ libxl__test_fdevent *tfe = CONTAINER_OF(ev,*tfe,fd);
+ STATE_AO_GC(tfe->ao);
+ fdevent_complete(egc, tfe, 0);
+}
+
+static void tfe_abrt_cb(libxl__egc *egc, libxl__ao_abortable *abrt,
+ int rc)
+{
+ libxl__test_fdevent *tfe = CONTAINER_OF(abrt,*tfe,abrt);
+ STATE_AO_GC(tfe->ao);
+ fdevent_complete(egc, tfe, rc);
+}
+
+static void fdevent_complete(libxl__egc *egc, libxl__test_fdevent *tfe,
+ int rc)
+{
+ STATE_AO_GC(tfe->ao);
+ tfe_cleanup(gc, tfe);
+ libxl__ao_complete(egc, ao, rc);
+}
+
+int libxl_test_fdevent(libxl_ctx *ctx, int fd, short events,
+ libxl_asyncop_how *ao_how)
+{
+ int rc;
+ libxl__test_fdevent *tfe;
+
+ AO_CREATE(ctx, 0, ao_how);
+ GCNEW(tfe);
+
+ tfe_init(tfe, ao);
+
+ rc = libxl__ev_fd_register(gc, &tfe->fd, tfe_fd_cb, fd, events);
+ if (rc) goto out;
+
+ tfe->abrt.ao = ao;
+ tfe->abrt.callback = tfe_abrt_cb;
+ rc = libxl__ao_abortable_register(&tfe->abrt);
+ if (rc) goto out;
+
+ return AO_INPROGRESS;
+
+ out:
+ tfe_cleanup(gc, tfe);
+ return AO_CREATE_FAIL(rc);
+}
diff --git a/tools/libxl/libxl_test_fdevent.h b/tools/libxl/libxl_test_fdevent.h
new file mode 100644
index 0000000..82a307e
--- /dev/null
+++ b/tools/libxl/libxl_test_fdevent.h
@@ -0,0 +1,12 @@
+#ifndef TEST_FDEVENT_H
+#define TEST_FDEVENT_H
+
+#include <pthread.h>
+
+int libxl_test_fdevent(libxl_ctx *ctx, int fd, short events,
+ libxl_asyncop_how *ao_how)
+ LIBXL_EXTERNAL_CALLERS_ONLY;
+/* This operation waits for one of the poll events to occur on fd, and
+ * then completes successfully. (Or, it can be aborted.) */
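+
+/* Hypothetical usage (a sketch; error handling elided):
+ *
+ *     rc = libxl_test_fdevent(ctx, fd, POLLIN, ao_how);
+ *
+ * completes once one of the requested poll events occurs on fd,
+ * or with the abort rc if the operation is aborted. */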
+
+#endif /*TEST_FDEVENT_H*/
diff --git a/tools/libxl/libxl_test_timedereg.c b/tools/libxl/libxl_test_timedereg.c
index a44639f..a567db6 100644
--- a/tools/libxl/libxl_test_timedereg.c
+++ b/tools/libxl/libxl_test_timedereg.c
@@ -28,14 +28,15 @@ static libxl__ao *tao;
static int seq;
static void occurs(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs);
+ const struct timeval *requested_abs, int rc);
-static void regs(libxl__gc *gc, int j)
+static void regs(libxl__ao *ao, int j)
{
+ AO_GC;
int rc, i;
LOG(DEBUG,"regs(%d)", j);
for (i=0; i<NTIMES; i++) {
- rc = libxl__ev_time_register_rel(gc, &et[j][i], occurs, ms[j][i]);
+ rc = libxl__ev_time_register_rel(ao, &et[j][i], occurs, ms[j][i]);
assert(!rc);
}
}
@@ -52,26 +53,28 @@ int libxl_test_timedereg(libxl_ctx *ctx, libxl_asyncop_how *ao_how)
libxl__ev_time_init(&et[1][i]);
}
- regs(gc, 0);
+ regs(ao, 0);
return AO_INPROGRESS;
}
static void occurs(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs)
+ const struct timeval *requested_abs, int rc)
{
EGC_GC;
int i;
int off = ev - &et[0][0];
- LOG(DEBUG,"occurs[%d][%d] seq=%d", off/NTIMES, off%NTIMES, seq);
+ LOG(DEBUG,"occurs[%d][%d] seq=%d rc=%d", off/NTIMES, off%NTIMES, seq, rc);
+
+ assert(rc == ERROR_TIMEDOUT);
switch (seq) {
case 0:
assert(ev == &et[0][1]);
libxl__ev_time_deregister(gc, &et[0][0]);
libxl__ev_time_deregister(gc, &et[0][2]);
- regs(gc, 1);
+ regs(tao, 1);
libxl__ev_time_deregister(gc, &et[0][1]);
break;
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index f7fc695..502a148 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -63,6 +63,12 @@ libxl_error = Enumeration("error", [
(-17, "DEVICE_EXISTS"),
(-18, "REMUS_DEVOPS_DOES_NOT_MATCH"),
(-19, "REMUS_DEVICE_NOT_SUPPORTED"),
+ (-20, "VNUMA_CONFIG_INVALID"),
+ (-21, "DOMAIN_NOTFOUND"),
+ (-22, "ABORTED"),
+ (-23, "NOTFOUND"),
+ (-24, "DOMAIN_DESTROYED"), # Target domain ceased to exist during op
+ (-25, "FEATURE_REMOVED"), # For functionality that has been removed
], value_namespace = "")
libxl_domain_type = Enumeration("domain_type", [
@@ -71,6 +77,17 @@ libxl_domain_type = Enumeration("domain_type", [
(2, "PV"),
], init_val = "LIBXL_DOMAIN_TYPE_INVALID")
+libxl_rdm_reserve_strategy = Enumeration("rdm_reserve_strategy", [
+ (0, "ignore"),
+ (1, "host"),
+ ])
+
+libxl_rdm_reserve_policy = Enumeration("rdm_reserve_policy", [
+ (-1, "invalid"),
+ (0, "strict"),
+ (1, "relaxed"),
+ ], init_val = "LIBXL_RDM_RESERVE_POLICY_INVALID")
+
libxl_channel_connection = Enumeration("channel_connection", [
(0, "UNKNOWN"),
(1, "PTY"),
@@ -181,6 +198,7 @@ libxl_vga_interface_type = Enumeration("vga_interface_type", [
(1, "CIRRUS"),
(2, "STD"),
(3, "NONE"),
+ (4, "QXL"),
], init_val = "LIBXL_VGA_INTERFACE_TYPE_CIRRUS")
libxl_vendor_device = Enumeration("vendor_device", [
@@ -192,8 +210,14 @@ libxl_viridian_enlightenment = Enumeration("viridian_enlightenment", [
(0, "base"),
(1, "freq"),
(2, "time_ref_count"),
+ (3, "reference_tsc"),
])
+libxl_hdtype = Enumeration("hdtype", [
+ (1, "IDE"),
+ (2, "AHCI"),
+ ], init_val = "LIBXL_HDTYPE_IDE")
+
#
# Complex libxl types
#
@@ -240,6 +264,8 @@ libxl_spice_info = Struct("spice_info", [
("vdagent", libxl_defbool),
("clipboard_sharing", libxl_defbool),
("usbredirection", integer),
+ ("image_compression", string),
+ ("streaming_video", string),
])
libxl_sdl_info = Struct("sdl_info", [
@@ -340,6 +366,7 @@ libxl_domain_create_info = Struct("domain_create_info",[
libxl_domain_restore_params = Struct("domain_restore_params", [
("checkpointed_stream", integer),
+ ("stream_version", uint32, {'init_val': '1'}),
])
libxl_domain_sched_params = Struct("domain_sched_params",[
@@ -347,10 +374,33 @@ libxl_domain_sched_params = Struct("domain_sched_params",[
("weight", integer, {'init_val': 'LIBXL_DOMAIN_SCHED_PARAM_WEIGHT_DEFAULT'}),
("cap", integer, {'init_val': 'LIBXL_DOMAIN_SCHED_PARAM_CAP_DEFAULT'}),
("period", integer, {'init_val': 'LIBXL_DOMAIN_SCHED_PARAM_PERIOD_DEFAULT'}),
+ ("budget", integer, {'init_val': 'LIBXL_DOMAIN_SCHED_PARAM_BUDGET_DEFAULT'}),
+
+ # The following three parameters ('slice', 'latency' and 'extratime') are deprecated,
+ # and will have no effect if used, since the SEDF scheduler has been removed.
+ # Note that 'period' was an SEDF parameter too, but it is still effective as it is
+ # now used (together with 'budget') by the RTDS scheduler.
("slice", integer, {'init_val': 'LIBXL_DOMAIN_SCHED_PARAM_SLICE_DEFAULT'}),
("latency", integer, {'init_val': 'LIBXL_DOMAIN_SCHED_PARAM_LATENCY_DEFAULT'}),
("extratime", integer, {'init_val': 'LIBXL_DOMAIN_SCHED_PARAM_EXTRATIME_DEFAULT'}),
- ("budget", integer, {'init_val': 'LIBXL_DOMAIN_SCHED_PARAM_BUDGET_DEFAULT'}),
+ ])
+
+libxl_vnode_info = Struct("vnode_info", [
+ ("memkb", MemKB),
+ ("distances", Array(uint32, "num_distances")), # distances from this node to other nodes
+ ("pnode", uint32), # physical node of this node
+ ("vcpus", libxl_bitmap), # vcpus in this node
+ ])
+
+libxl_gic_version = Enumeration("gic_version", [
+ (0, "DEFAULT"),
+ (0x20, "v2"),
+ (0x30, "v3")
+ ], init_val = "LIBXL_GIC_VERSION_DEFAULT")
+
+libxl_rdm_reserve = Struct("rdm_reserve", [
+ ("strategy", libxl_rdm_reserve_strategy),
+ ("policy", libxl_rdm_reserve_policy),
])
libxl_domain_build_info = Struct("domain_build_info",[
@@ -373,6 +423,8 @@ libxl_domain_build_info = Struct("domain_build_info",[
("disable_migrate", libxl_defbool),
("cpuid", libxl_cpuid_policy_list),
("blkdev_start", string),
+
+ ("vnuma_nodes", Array(libxl_vnode_info, "num_vnuma_nodes")),
("device_model_version", libxl_device_model_version),
("device_model_stubdomain", libxl_defbool),
@@ -398,6 +450,12 @@ libxl_domain_build_info = Struct("domain_build_info",[
("kernel", string),
("cmdline", string),
("ramdisk", string),
+ # Given the complexity of verifying the validity of a device tree,
+ # libxl doesn't do any security check on it. It's the responsibility
+ # of the caller to provide only a trusted device tree.
+ # Note that the partial device tree should avoid using phandle
+ # 65000, which is reserved by the toolstack.
+ ("device_tree", string),
("u", KeyedUnion(None, libxl_domain_type, "type",
[("hvm", Struct(None, [("firmware", string),
("bios", libxl_bios_type),
@@ -416,8 +474,10 @@ libxl_domain_build_info = Struct("domain_build_info",[
("mmio_hole_memkb", MemKB),
("timer_mode", libxl_timer_mode),
("nested_hvm", libxl_defbool),
+ ("altp2m", libxl_defbool),
("smbios_firmware", string),
("acpi_firmware", string),
+ ("hdtype", libxl_hdtype),
("nographic", libxl_defbool),
("vga", libxl_vga_interface_info),
("vnc", libxl_vnc_info),
@@ -443,6 +503,8 @@ libxl_domain_build_info = Struct("domain_build_info",[
# See libxl_ms_vm_genid_generate()
("ms_vm_genid", libxl_ms_vm_genid),
("serial_list", libxl_string_list),
+ ("rdm", libxl_rdm_reserve),
+ ("rdm_mem_boundary_memkb", MemKB),
])),
("pv", Struct(None, [("kernel", string),
("slack_memkb", MemKB),
@@ -456,6 +518,11 @@ libxl_domain_build_info = Struct("domain_build_info",[
])),
("invalid", None),
], keyvar_init_val = "LIBXL_DOMAIN_TYPE_INVALID")),
+
+
+ ("arch_arm", Struct(None, [("gic_version", libxl_gic_version),
+ ])),
+
], dir=DIR_IN
)
@@ -518,6 +585,17 @@ libxl_device_pci = Struct("device_pci", [
("power_mgmt", bool),
("permissive", bool),
("seize", bool),
+ ("rdm_policy", libxl_rdm_reserve_policy),
+ ])
+
+libxl_device_rdm = Struct("device_rdm", [
+ ("start", uint64),
+ ("size", uint64),
+ ("policy", libxl_rdm_reserve_policy),
+ ])
+
+libxl_device_dtdev = Struct("device_dtdev", [
+ ("path", string),
])
libxl_device_vtpm = Struct("device_vtpm", [
@@ -546,6 +624,8 @@ libxl_domain_config = Struct("domain_config", [
("disks", Array(libxl_device_disk, "num_disks")),
("nics", Array(libxl_device_nic, "num_nics")),
("pcidevs", Array(libxl_device_pci, "num_pcidevs")),
+ ("rdms", Array(libxl_device_rdm, "num_rdms")),
+ ("dtdevs", Array(libxl_device_dtdev, "num_dtdevs")),
("vfbs", Array(libxl_device_vfb, "num_vfbs")),
("vkbs", Array(libxl_device_vkb, "num_vkbs")),
("vtpms", Array(libxl_device_vtpm, "num_vtpms")),
@@ -642,6 +722,13 @@ libxl_cputopology = Struct("cputopology", [
("node", uint32),
], dir=DIR_OUT)
+libxl_pcitopology = Struct("pcitopology", [
+ ("seg", uint16),
+ ("bus", uint8),
+ ("devfn", uint8),
+ ("node", uint32),
+ ], dir=DIR_OUT)
+
libxl_sched_credit_params = Struct("sched_credit_params", [
("tslice_ms", integer),
("ratelimit_us", integer),
@@ -693,4 +780,17 @@ libxl_event = Struct("event",[
libxl_psr_cmt_type = Enumeration("psr_cmt_type", [
(1, "CACHE_OCCUPANCY"),
+ (2, "TOTAL_MEM_COUNT"),
+ (3, "LOCAL_MEM_COUNT"),
+ ])
+
+libxl_psr_cbm_type = Enumeration("psr_cbm_type", [
+ (0, "UNKNOWN"),
+ (1, "L3_CBM"),
+ ])
+
+libxl_psr_cat_info = Struct("psr_cat_info", [
+ ("id", uint32),
+ ("cos_max", uint32),
+ ("cbm_len", uint32),
])
diff --git a/tools/libxl/libxl_utils.c b/tools/libxl/libxl_utils.c
index 7095b58..408ec85 100644
--- a/tools/libxl/libxl_utils.c
+++ b/tools/libxl/libxl_utils.c
@@ -604,7 +604,12 @@ void libxl_bitmap_init(libxl_bitmap *map)
void libxl_bitmap_dispose(libxl_bitmap *map)
{
+ if (!map)
+ return;
+
free(map->map);
+ map->map = NULL;
+ map->size = 0;
}
void libxl_bitmap_copy(libxl_ctx *ctx, libxl_bitmap *dptr,
@@ -686,6 +691,76 @@ void libxl_bitmap_reset(libxl_bitmap *bitmap, int bit)
bitmap->map[bit / 8] &= ~(1 << (bit & 7));
}
+int libxl_bitmap_or(libxl_ctx *ctx, libxl_bitmap *or_map,
+ const libxl_bitmap *map1, const libxl_bitmap *map2)
+{
+ GC_INIT(ctx);
+ int rc;
+ uint32_t i;
+ const libxl_bitmap *large_map;
+ const libxl_bitmap *small_map;
+
+ if (map1->size > map2->size) {
+ large_map = map1;
+ small_map = map2;
+ } else {
+ large_map = map2;
+ small_map = map1;
+ }
+
+ rc = libxl_bitmap_alloc(ctx, or_map, large_map->size * 8);
+ if (rc)
+ goto out;
+
+ /*
+ * If the bitmaps aren't the same size, their union (logical OR)
+ * will be the size of the larger bitmap. Any bit past the end
+ * of the smaller bitmap matches the larger one.
+ */
+ for (i = 0; i < small_map->size; i++)
+ or_map->map[i] = (small_map->map[i] | large_map->map[i]);
+
+ for (i = small_map->size; i < large_map->size; i++)
+ or_map->map[i] = large_map->map[i];
+
+out:
+ GC_FREE;
+ return rc;
+}
+
+int libxl_bitmap_and(libxl_ctx *ctx, libxl_bitmap *and_map,
+ const libxl_bitmap *map1, const libxl_bitmap *map2)
+{
+ GC_INIT(ctx);
+ int rc;
+ uint32_t i;
+ const libxl_bitmap *large_map;
+ const libxl_bitmap *small_map;
+
+ if (map1->size > map2->size) {
+ large_map = map1;
+ small_map = map2;
+ } else {
+ large_map = map2;
+ small_map = map1;
+ }
+
+ rc = libxl_bitmap_alloc(ctx, and_map, small_map->size * 8);
+ if (rc)
+ goto out;
+
+ /*
+ * If the bitmaps aren't the same size, their intersection
+ * (logical AND) will be the size of the smaller bitmap.
+ */
+ for (i = 0; i < and_map->size; i++)
+ and_map->map[i] = (large_map->map[i] & small_map->map[i]);
+
+out:
+ GC_FREE;
+ return rc;
+}
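+
+/*
+ * Hypothetical usage (a sketch; 'a' and 'b' are already-allocated
+ * bitmaps, error handling elided):
+ *
+ *     libxl_bitmap u;
+ *     libxl_bitmap_init(&u);
+ *     rc = libxl_bitmap_or(ctx, &u, &a, &b);
+ *     ...on success u holds every bit set in either a or b...
+ *     libxl_bitmap_dispose(&u);
+ */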
+
int libxl_bitmap_count_set(const libxl_bitmap *bitmap)
{
int i, nr_set_bits = 0;
@@ -765,6 +840,74 @@ int libxl_node_bitmap_alloc(libxl_ctx *ctx, libxl_bitmap *nodemap,
return rc;
}
+int libxl__count_physical_sockets(libxl__gc *gc, int *sockets)
+{
+ int rc;
+ libxl_physinfo info;
+
+ libxl_physinfo_init(&info);
+
+ rc = libxl_get_physinfo(CTX, &info);
+ if (rc)
+ return rc;
+
+ *sockets = info.nr_cpus / info.threads_per_core
+ / info.cores_per_socket;
+
+ libxl_physinfo_dispose(&info);
+ return 0;
+}
+
+int libxl_socket_bitmap_alloc(libxl_ctx *ctx, libxl_bitmap *socketmap,
+ int max_sockets)
+{
+ GC_INIT(ctx);
+ int rc = 0;
+
+ if (max_sockets < 0) {
+ rc = ERROR_INVAL;
+ LOG(ERROR, "invalid number of sockets provided");
+ goto out;
+ }
+
+ if (max_sockets == 0) {
+ rc = libxl__count_physical_sockets(gc, &max_sockets);
+ if (rc) {
+ LOGE(ERROR, "failed to get system socket count");
+ goto out;
+ }
+ }
+ /* This can't fail: no need to check and log */
+ libxl_bitmap_alloc(ctx, socketmap, max_sockets);
+
+ out:
+ GC_FREE;
+ return rc;
+}
+
+int libxl_get_online_socketmap(libxl_ctx *ctx, libxl_bitmap *socketmap)
+{
+ libxl_cputopology *tinfo = NULL;
+ int nr_cpus = 0, i, rc = 0;
+
+ tinfo = libxl_get_cpu_topology(ctx, &nr_cpus);
+ if (tinfo == NULL) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ libxl_bitmap_set_none(socketmap);
+ for (i = 0; i < nr_cpus; i++)
+ if (tinfo[i].socket != XEN_INVALID_SOCKET_ID
+ && !libxl_bitmap_test(socketmap, tinfo[i].socket))
+ libxl_bitmap_set(socketmap, tinfo[i].socket);
+
+ out:
+ libxl_cputopology_list_free(tinfo, nr_cpus);
+ return rc;
+}
+
int libxl_nodemap_to_cpumap(libxl_ctx *ctx,
const libxl_bitmap *nodemap,
libxl_bitmap *cpumap)
@@ -877,6 +1020,14 @@ void libxl_cputopology_list_free(libxl_cputopology *list, int nr)
free(list);
}
+void libxl_pcitopology_list_free(libxl_pcitopology *list, int nr)
+{
+ int i;
+ for (i = 0; i < nr; i++)
+ libxl_pcitopology_dispose(&list[i]);
+ free(list);
+}
+
void libxl_numainfo_list_free(libxl_numainfo *list, int nr)
{
int i;
diff --git a/tools/libxl/libxl_utils.h b/tools/libxl/libxl_utils.h
index acacdd9..339ebdf 100644
--- a/tools/libxl/libxl_utils.h
+++ b/tools/libxl/libxl_utils.h
@@ -89,8 +89,14 @@ int libxl_bitmap_is_empty(const libxl_bitmap *bitmap);
int libxl_bitmap_test(const libxl_bitmap *bitmap, int bit);
void libxl_bitmap_set(libxl_bitmap *bitmap, int bit);
void libxl_bitmap_reset(libxl_bitmap *bitmap, int bit);
-int libxl_bitmap_count_set(const libxl_bitmap *cpumap);
-char *libxl_bitmap_to_hex_string(libxl_ctx *ctx, const libxl_bitmap *cpumap);
+int libxl_bitmap_count_set(const libxl_bitmap *bitmap);
+int libxl_bitmap_or(libxl_ctx *ctx, libxl_bitmap *or_map,
+ const libxl_bitmap *map1,
+ const libxl_bitmap *map2);
+int libxl_bitmap_and(libxl_ctx *ctx, libxl_bitmap *and_map,
+ const libxl_bitmap *map1,
+ const libxl_bitmap *map2);
+char *libxl_bitmap_to_hex_string(libxl_ctx *ctx, const libxl_bitmap *bitmap);
static inline void libxl_bitmap_set_any(libxl_bitmap *bitmap)
{
memset(bitmap->map, -1, bitmap->size);
@@ -135,6 +141,10 @@ static inline int libxl_bitmap_equal(const libxl_bitmap *ba,
int libxl_cpu_bitmap_alloc(libxl_ctx *ctx, libxl_bitmap *cpumap, int max_cpus);
int libxl_node_bitmap_alloc(libxl_ctx *ctx, libxl_bitmap *nodemap,
int max_nodes);
+int libxl_socket_bitmap_alloc(libxl_ctx *ctx, libxl_bitmap *socketmap,
+ int max_sockets);
+/* Fill socketmap with the CPU topology information on the system. */
+int libxl_get_online_socketmap(libxl_ctx *ctx, libxl_bitmap *socketmap);
/* Populate cpumap with the cpus spanned by the nodes in nodemap */
int libxl_nodemap_to_cpumap(libxl_ctx *ctx,
@@ -145,7 +155,7 @@ int libxl_node_to_cpumap(libxl_ctx *ctx, int node,
libxl_bitmap *cpumap);
/* Populate nodemap with the nodes of the cpus in cpumap */
int libxl_cpumap_to_nodemap(libxl_ctx *ctx,
- const libxl_bitmap *cpuemap,
+ const libxl_bitmap *cpumap,
libxl_bitmap *nodemap);
static inline uint32_t libxl__sizekb_to_mb(uint32_t s) {
@@ -154,6 +164,9 @@ int libxl_cpumap_to_nodemap(libxl_ctx *ctx,
void libxl_string_copy(libxl_ctx *ctx, char **dst, char **src);
+
+#define LIBXL_FILLZERO(object) (memset(&(object), 0, sizeof((object))))
+
#endif
/*
diff --git a/tools/libxl/libxl_vnuma.c b/tools/libxl/libxl_vnuma.c
new file mode 100644
index 0000000..56856d2
--- /dev/null
+++ b/tools/libxl/libxl_vnuma.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (C) 2014 Citrix Ltd.
+ * Author Wei Liu <wei.liu2 at citrix.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only, with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+#include "libxl_osdeps.h" /* must come before any other headers */
+#include "libxl_internal.h"
+#include "libxl_arch.h"
+#include <stdlib.h>
+
+bool libxl__vnuma_configured(const libxl_domain_build_info *b_info)
+{
+ return b_info->num_vnuma_nodes != 0;
+}
+
+/* Sort vmemranges in ascending order by "start" */
+static int compare_vmemrange(const void *a, const void *b)
+{
+ const xen_vmemrange_t *x = a, *y = b;
+ if (x->start < y->start)
+ return -1;
+ if (x->start > y->start)
+ return 1;
+ return 0;
+}
+
+/* Check if a vcpu has a hard (or soft) affinity set in such
+ * a way that it does not match the pnode to which the vcpu itself
+ * is assigned.
+ */
+static int check_vnuma_affinity(libxl__gc *gc,
+ unsigned int vcpu,
+ unsigned int pnode,
+ unsigned int num_affinity,
+ const libxl_bitmap *affinity,
+ const char *kind)
+{
+ libxl_bitmap nodemap;
+ int rc = 0;
+
+ libxl_bitmap_init(&nodemap);
+
+ rc = libxl_node_bitmap_alloc(CTX, &nodemap, 0);
+ if (rc) {
+ LOG(ERROR, "Can't allocate nodemap");
+ goto out;
+ }
+
+ rc = libxl_cpumap_to_nodemap(CTX, affinity, &nodemap);
+ if (rc) {
+ LOG(ERROR, "Can't convert Vcpu %d affinity to nodemap", vcpu);
+ goto out;
+ }
+
+ if (libxl_bitmap_count_set(&nodemap) != 1 ||
+ !libxl_bitmap_test(&nodemap, pnode))
+ LOG(WARN, "Vcpu %d %s affinity and vnuma info mismatch", vcpu, kind);
+
+out:
+ libxl_bitmap_dispose(&nodemap);
+ return rc;
+}
+
+/* Check if vNUMA configuration is valid:
+ * 1. all pnodes inside vnode_to_pnode array are valid
+ * 2. each vcpu belongs to one and only one vnode
+ * 3. each vmemrange is valid and doesn't overlap with any other
+ * 4. local distance cannot be larger than remote distance
+ *
+ * Also check, if any hard or soft affinity is specified, whether
+ * it matches the vNUMA-related bits (namely the vcpu-to-vnode
+ * mappings and the vnode-to-pnode associations). If it does not,
+ * just print a warning, as this has "only" performance
+ * implications.
+ */
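+/*
+ * For instance (hypothetical configuration): two 512MiB vnodes with
+ * maxmem set to 2GiB fail the total-memory check below, and a vcpu
+ * listed in two vnodes' vcpu maps violates rule 2.
+ */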
+int libxl__vnuma_config_check(libxl__gc *gc,
+ const libxl_domain_build_info *b_info,
+ const libxl__domain_build_state *state)
+{
+ int nr_nodes = 0, rc = ERROR_VNUMA_CONFIG_INVALID;
+ unsigned int i, j;
+ libxl_numainfo *ninfo = NULL;
+ uint64_t total_memkb = 0;
+ libxl_bitmap cpumap;
+ libxl_vnode_info *v;
+
+ libxl_bitmap_init(&cpumap);
+
+ /* Check pnode specified is valid */
+ ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+ if (!ninfo) {
+ LOG(ERROR, "libxl_get_numainfo failed");
+ goto out;
+ }
+
+ for (i = 0; i < b_info->num_vnuma_nodes; i++) {
+ uint32_t pnode;
+
+ v = &b_info->vnuma_nodes[i];
+ pnode = v->pnode;
+
+ /* Reject the configuration if the pnode specified is not valid. */
+ if (pnode >= nr_nodes) {
+ LOG(ERROR, "Invalid pnode %"PRIu32" specified", pnode);
+ goto out;
+ }
+
+ total_memkb += v->memkb;
+ }
+
+ if (total_memkb != b_info->max_memkb) {
+ LOG(ERROR, "Amount of memory mismatch (0x%"PRIx64" != 0x%"PRIx64")",
+ total_memkb, b_info->max_memkb);
+ goto out;
+ }
+
+ /* Check vcpu mapping */
+ libxl_cpu_bitmap_alloc(CTX, &cpumap, b_info->max_vcpus);
+ for (i = 0; i < b_info->num_vnuma_nodes; i++) {
+ v = &b_info->vnuma_nodes[i];
+ libxl_for_each_set_bit(j, v->vcpus) {
+ if (!libxl_bitmap_test(&cpumap, j))
+ libxl_bitmap_set(&cpumap, j);
+ else {
+ LOG(ERROR, "Vcpu %d assigned more than once", j);
+ goto out;
+ }
+ }
+ }
+
+ for (i = 0; i < b_info->max_vcpus; i++) {
+ if (!libxl_bitmap_test(&cpumap, i)) {
+ LOG(ERROR, "Vcpu %d is not assigned to any vnode", i);
+ goto out;
+ }
+ }
+
+ /* Check whether vcpu affinity (if any) matches vnuma configuration */
+ for (i = 0; i < b_info->num_vnuma_nodes; i++) {
+ v = &b_info->vnuma_nodes[i];
+ libxl_for_each_set_bit(j, v->vcpus) {
+ if (b_info->num_vcpu_hard_affinity > j)
+ check_vnuma_affinity(gc, j, v->pnode,
+ b_info->num_vcpu_hard_affinity,
+ &b_info->vcpu_hard_affinity[j],
+ "hard");
+ if (b_info->num_vcpu_soft_affinity > j)
+ check_vnuma_affinity(gc, j, v->pnode,
+ b_info->num_vcpu_soft_affinity,
+ &b_info->vcpu_soft_affinity[j],
+ "soft");
+ }
+ }
+
+ /* Check vmemranges */
+ qsort(state->vmemranges, state->num_vmemranges, sizeof(xen_vmemrange_t),
+ compare_vmemrange);
+
+ for (i = 0; i < state->num_vmemranges; i++) {
+ if (state->vmemranges[i].end < state->vmemranges[i].start) {
+ LOG(ERROR, "Vmemrange end < start");
+ goto out;
+ }
+ }
+
+ for (i = 0; i < state->num_vmemranges - 1; i++) {
+ if (state->vmemranges[i].end > state->vmemranges[i+1].start) {
+ LOG(ERROR,
+ "Vmemranges overlapped, 0x%"PRIx64"-0x%"PRIx64", 0x%"PRIx64"-0x%"PRIx64,
+ state->vmemranges[i].start, state->vmemranges[i].end,
+ state->vmemranges[i+1].start, state->vmemranges[i+1].end);
+ goto out;
+ }
+ }
+
+ /* Check vdistances */
+ for (i = 0; i < b_info->num_vnuma_nodes; i++) {
+ uint32_t local_distance, remote_distance;
+
+ v = &b_info->vnuma_nodes[i];
+ local_distance = v->distances[i];
+
+ for (j = 0; j < v->num_distances; j++) {
+ if (i == j) continue;
+ remote_distance = v->distances[j];
+ if (local_distance > remote_distance) {
+ LOG(ERROR,
+ "Distance from %u to %u smaller than %u's local distance",
+ i, j, i);
+ goto out;
+ }
+ }
+ }
+
+ rc = 0;
+out:
+ libxl_numainfo_list_free(ninfo, nr_nodes);
+ libxl_bitmap_dispose(&cpumap);
+ return rc;
+}
+
+int libxl__vnuma_build_vmemrange_pv_generic(libxl__gc *gc,
+ uint32_t domid,
+ libxl_domain_build_info *b_info,
+ libxl__domain_build_state *state)
+{
+ int i;
+ uint64_t next;
+ xen_vmemrange_t *v = NULL;
+
+ /* Generate one vmemrange for each virtual node. */
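+ /* e.g. (illustrative sizes): two 1GiB vnodes yield [0, 1GiB) with
+ * nid 0 and [1GiB, 2GiB) with nid 1, packed with no holes. */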
+ GCREALLOC_ARRAY(v, b_info->num_vnuma_nodes);
+ next = 0;
+ for (i = 0; i < b_info->num_vnuma_nodes; i++) {
+ libxl_vnode_info *p = &b_info->vnuma_nodes[i];
+
+ v[i].start = next;
+ v[i].end = next + (p->memkb << 10);
+ v[i].flags = 0;
+ v[i].nid = i;
+
+ next = v[i].end;
+ }
+
+ state->vmemranges = v;
+ state->num_vmemranges = i;
+
+ return 0;
+}
+
+/* Build vmemranges for PV guest */
+int libxl__vnuma_build_vmemrange_pv(libxl__gc *gc,
+ uint32_t domid,
+ libxl_domain_build_info *b_info,
+ libxl__domain_build_state *state)
+{
+ assert(state->vmemranges == NULL);
+ return libxl__arch_vnuma_build_vmemrange(gc, domid, b_info, state);
+}
+
+/* Build vmemranges for HVM guest */
+int libxl__vnuma_build_vmemrange_hvm(libxl__gc *gc,
+ uint32_t domid,
+ libxl_domain_build_info *b_info,
+ libxl__domain_build_state *state,
+ struct xc_hvm_build_args *args)
+{
+ uint64_t hole_start, hole_end, next;
+ int nid, nr_vmemrange;
+ xen_vmemrange_t *vmemranges;
+ int rc;
+
+ /* Derive vmemranges from vnode size and memory hole.
+ *
+ * Guest physical address space layout:
+ * [0, hole_start) [hole_start, hole_end) [hole_end, highmem_end)
+ */
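+ /*
+ * Illustrative example (hypothetical values): with lowmem_end at
+ * 3GiB and an MMIO hole of [3GiB, 4GiB), a single 4GiB vnode is
+ * split by the loop below into [0, 3GiB) and [4GiB, 5GiB), both
+ * with nid 0.
+ */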
+ hole_start = args->lowmem_end < args->mmio_start ?
+ args->lowmem_end : args->mmio_start;
+ hole_end = (args->mmio_start + args->mmio_size) > (1ULL << 32) ?
+ (args->mmio_start + args->mmio_size) : (1ULL << 32);
+
+ assert(state->vmemranges == NULL);
+
+ next = 0;
+ nr_vmemrange = 0;
+ vmemranges = NULL;
+ for (nid = 0; nid < b_info->num_vnuma_nodes; nid++) {
+ libxl_vnode_info *p = &b_info->vnuma_nodes[nid];
+ uint64_t remaining_bytes = p->memkb << 10;
+
+ /* Video RAM is considered to belong to vnode 0 */
+ if (nid == 0) {
+ if (p->memkb < b_info->video_memkb) {
+ LOG(ERROR, "vnode 0 too small to contain video ram");
+ rc = ERROR_INVAL;
+ goto out;
+ }
+ remaining_bytes -= (b_info->video_memkb << 10);
+ }
+
+ while (remaining_bytes > 0) {
+ uint64_t count = remaining_bytes;
+
+ if (next >= hole_start && next < hole_end)
+ next = hole_end;
+ if ((next < hole_start) && (next + remaining_bytes >= hole_start))
+ count = hole_start - next;
+
+ GCREALLOC_ARRAY(vmemranges, nr_vmemrange+1);
+ vmemranges[nr_vmemrange].start = next;
+ vmemranges[nr_vmemrange].end = next + count;
+ vmemranges[nr_vmemrange].flags = 0;
+ vmemranges[nr_vmemrange].nid = nid;
+
+ nr_vmemrange++;
+ remaining_bytes -= count;
+ next += count;
+ }
+ }
+
+ state->vmemranges = vmemranges;
+ state->num_vmemranges = nr_vmemrange;
+
+ rc = 0;
+out:
+ return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index 7589060..896f34c 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -1,6 +1,22 @@
#include "libxl_internal.h"
#include "libxl_arch.h"
+int libxl__arch_domain_prepare_config(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ xc_domain_configuration_t *xc_config)
+{
+ /* No specific configuration right now */
+
+ return 0;
+}
+
+int libxl__arch_domain_save_config(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ const xc_domain_configuration_t *xc_config)
+{
+ return 0;
+}
+
static const char *e820_names(int type)
{
switch (type) {
@@ -207,6 +223,25 @@ static int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
return 0;
}
+static int e820_host_sanitize(libxl__gc *gc,
+ libxl_domain_build_info *b_info,
+ struct e820entry map[],
+ uint32_t *nr)
+{
+ int rc;
+
+ rc = xc_get_machine_memory_map(CTX->xch, map, *nr);
+ if (rc < 0)
+ return ERROR_FAIL;
+
+ *nr = rc;
+
+ rc = e820_sanitize(CTX, map, nr, b_info->target_memkb,
+ (b_info->max_memkb - b_info->target_memkb) +
+ b_info->u.pv.slack_memkb);
+ return rc;
+}
+
static int libxl__e820_alloc(libxl__gc *gc, uint32_t domid,
libxl_domain_config *d_config)
{
@@ -223,24 +258,16 @@ static int libxl__e820_alloc(libxl__gc *gc, uint32_t domid,
if (!libxl_defbool_val(b_info->u.pv.e820_host))
return ERROR_INVAL;
- rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
- if (rc < 0) {
- errno = rc;
- return ERROR_FAIL;
- }
- nr = rc;
- rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
- (b_info->max_memkb - b_info->target_memkb) +
- b_info->u.pv.slack_memkb);
+ nr = E820MAX;
+ rc = e820_host_sanitize(gc, b_info, map, &nr);
if (rc)
return ERROR_FAIL;
rc = xc_domain_set_memory_map(ctx->xch, domid, map, nr);
- if (rc < 0) {
- errno = rc;
+ if (rc < 0)
return ERROR_FAIL;
- }
+
return 0;
}
@@ -279,10 +306,16 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
rtc_timeoffset = d_config->b_info.rtc_timeoffset;
if (libxl_defbool_val(d_config->b_info.localtime)) {
time_t t;
- struct tm *tm;
+ struct tm *tm, result;
t = time(NULL);
- tm = localtime(&t);
+ tm = localtime_r(&t, &result);
+
+ if (!tm) {
+ LOGE(ERROR, "Failed to call localtime_r");
+ ret = ERROR_FAIL;
+ goto out;
+ }
rtc_timeoffset += tm->tm_gmtoff;
}
@@ -308,11 +341,13 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
}
}
+out:
return ret;
}
int libxl__arch_domain_init_hw_description(libxl__gc *gc,
libxl_domain_build_info *info,
+ libxl__domain_build_state *state,
struct xc_dom_image *dom)
{
return 0;
@@ -324,3 +359,185 @@ int libxl__arch_domain_finalise_hw_description(libxl__gc *gc,
{
return 0;
}
+
+/* Return 0 on success, ERROR_* on failure. */
+int libxl__arch_vnuma_build_vmemrange(libxl__gc *gc,
+ uint32_t domid,
+ libxl_domain_build_info *b_info,
+ libxl__domain_build_state *state)
+{
+ int nid, nr_vmemrange, rc;
+ uint32_t nr_e820, e820_count;
+ struct e820entry map[E820MAX];
+ xen_vmemrange_t *vmemranges;
+ unsigned int array_size;
+
+ /* If e820_host is not set, call the generic function */
+ if (!(b_info->type == LIBXL_DOMAIN_TYPE_PV &&
+ libxl_defbool_val(b_info->u.pv.e820_host)))
+ return libxl__vnuma_build_vmemrange_pv_generic(gc, domid, b_info,
+ state);
+
+ assert(state->vmemranges == NULL);
+
+ nr_e820 = E820MAX;
+ rc = e820_host_sanitize(gc, b_info, map, &nr_e820);
+ if (rc) goto out;
+
+ e820_count = 0;
+ nr_vmemrange = 0;
+ vmemranges = NULL;
+ array_size = 0;
+ for (nid = 0; nid < b_info->num_vnuma_nodes; nid++) {
+ libxl_vnode_info *p = &b_info->vnuma_nodes[nid];
+ uint64_t remaining_bytes = (p->memkb << 10), bytes;
+
+ while (remaining_bytes > 0) {
+ if (e820_count >= nr_e820) {
+ rc = ERROR_NOMEM;
+ goto out;
+ }
+
+ /* Skip non-RAM regions */
+ if (map[e820_count].type != E820_RAM) {
+ e820_count++;
+ continue;
+ }
+
+ if (nr_vmemrange >= array_size) {
+ array_size += 32;
+ GCREALLOC_ARRAY(vmemranges, array_size);
+ }
+
+ bytes = map[e820_count].size >= remaining_bytes ?
+ remaining_bytes : map[e820_count].size;
+
+ vmemranges[nr_vmemrange].start = map[e820_count].addr;
+ vmemranges[nr_vmemrange].end = map[e820_count].addr + bytes;
+
+ if (map[e820_count].size >= remaining_bytes) {
+ map[e820_count].addr += bytes;
+ map[e820_count].size -= bytes;
+ } else {
+ e820_count++;
+ }
+
+ remaining_bytes -= bytes;
+
+ vmemranges[nr_vmemrange].flags = 0;
+ vmemranges[nr_vmemrange].nid = nid;
+ nr_vmemrange++;
+ }
+ }
+
+ state->vmemranges = vmemranges;
+ state->num_vmemranges = nr_vmemrange;
+
+ rc = 0;
+out:
+ return rc;
+}
+
+int libxl__arch_domain_map_irq(libxl__gc *gc, uint32_t domid, int irq)
+{
+ int ret;
+
+ ret = xc_physdev_map_pirq(CTX->xch, domid, irq, &irq);
+ if (ret)
+ return ret;
+
+ ret = xc_domain_irq_permission(CTX->xch, domid, irq, 1);
+
+ return ret;
+}
+
+/*
+ * Here we're just trying to set these kinds of e820 mappings:
+ *
+ * #1. Low memory region
+ *
+ * Low RAM starts at 1MB at the earliest, to make sure all standard
+ * regions of the PC memory map (BIOS, VGA memory-mapped I/O,
+ * vgabios and so on) have enough space.
+ * Note: the regions below 1MB are still constructed with multiple
+ * e820 entries by hvmloader; at this point we don't change anything.
+ *
+ * #2. RDM region if it exists
+ *
+ * #3. High memory region if it exists
+ *
+ * Note: these regions do not overlap, since we have already checked
+ * and adjusted them; please refer to libxl__domain_device_construct_rdm().
+ */
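+/*
+ * Illustrative resulting map (hypothetical sizes): [1MiB, lowmem_end)
+ * as E820_RAM, each valid RDM [start, start + size) as E820_RESERVED,
+ * and, when highmem exists, [4GiB, highmem_end) as E820_RAM.
+ */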
+#define GUEST_LOW_MEM_START_DEFAULT 0x100000
+int libxl__arch_domain_construct_memmap(libxl__gc *gc,
+ libxl_domain_config *d_config,
+ uint32_t domid,
+ struct xc_hvm_build_args *args)
+{
+ int rc = 0;
+ unsigned int nr = 0, i;
+ /* We always own at least one lowmem entry. */
+ unsigned int e820_entries = 1;
+ struct e820entry *e820 = NULL;
+ uint64_t highmem_size =
+ args->highmem_end ? args->highmem_end - (1ull << 32) : 0;
+
+ /* Add all rdm entries. */
+ for (i = 0; i < d_config->num_rdms; i++)
+ if (d_config->rdms[i].policy != LIBXL_RDM_RESERVE_POLICY_INVALID)
+ e820_entries++;
+
+ /* One more entry if there is a highmem range. */
+ if (highmem_size)
+ e820_entries++;
+
+ if (e820_entries >= E820MAX) {
+ LOG(ERROR, "Ooops! Too many entries in the memory map!");
+ rc = ERROR_INVAL;
+ goto out;
+ }
+
+ e820 = libxl__malloc(gc, sizeof(struct e820entry) * e820_entries);
+
+ /* Low memory */
+ e820[nr].addr = GUEST_LOW_MEM_START_DEFAULT;
+ e820[nr].size = args->lowmem_end - GUEST_LOW_MEM_START_DEFAULT;
+ e820[nr].type = E820_RAM;
+ nr++;
+
+ /* RDM mapping */
+ for (i = 0; i < d_config->num_rdms; i++) {
+ if (d_config->rdms[i].policy == LIBXL_RDM_RESERVE_POLICY_INVALID)
+ continue;
+
+ e820[nr].addr = d_config->rdms[i].start;
+ e820[nr].size = d_config->rdms[i].size;
+ e820[nr].type = E820_RESERVED;
+ nr++;
+ }
+
+ /* High memory */
+ if (highmem_size) {
+ e820[nr].addr = ((uint64_t)1 << 32);
+ e820[nr].size = highmem_size;
+ e820[nr].type = E820_RAM;
+ }
+
+ if (xc_domain_set_memory_map(CTX->xch, domid, e820, e820_entries) != 0) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+out:
+ return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxlu_cfg.c b/tools/libxl/libxlu_cfg.c
index 22adcb0..1d70909 100644
--- a/tools/libxl/libxlu_cfg.c
+++ b/tools/libxl/libxlu_cfg.c
@@ -131,20 +131,35 @@ int xlu_cfg_readdata(XLU_Config *cfg, const char *data, int length) {
return ctx.err;
}
-void xlu__cfg_set_free(XLU_ConfigSetting *set) {
+void xlu__cfg_value_free(XLU_ConfigValue *value)
+{
int i;
+ if (!value) return;
+
+ switch (value->type) {
+ case XLU_STRING:
+ free(value->u.string);
+ break;
+ case XLU_LIST:
+ for (i = 0; i < value->u.list.nvalues; i++)
+ xlu__cfg_value_free(value->u.list.values[i]);
+ free(value->u.list.values);
+ }
+ free(value);
+}
+
+void xlu__cfg_set_free(XLU_ConfigSetting *set) {
if (!set) return;
free(set->name);
- for (i=0; i<set->nvalues; i++)
- free(set->values[i]);
- free(set->values);
+ xlu__cfg_value_free(set->value);
free(set);
}
void xlu_cfg_destroy(XLU_Config *cfg) {
XLU_ConfigSetting *set, *set_next;
+ if (!cfg) return;
for (set= cfg->settings;
set;
set= set_next) {
@@ -173,7 +188,7 @@ static int find_atom(const XLU_Config *cfg, const char *n,
set= find(cfg,n);
if (!set) return ESRCH;
- if (set->avalues!=1) {
+ if (set->value->type!=XLU_STRING) {
if (!dont_warn)
fprintf(cfg->report,
"%s:%d: warning: parameter `%s' is"
@@ -185,13 +200,60 @@ static int find_atom(const XLU_Config *cfg, const char *n,
return 0;
}
+
+enum XLU_ConfigValueType xlu_cfg_value_type(const XLU_ConfigValue *value)
+{
+ return value->type;
+}
+
+int xlu_cfg_value_get_string(const XLU_Config *cfg, XLU_ConfigValue *value,
+ char **value_r, int dont_warn)
+{
+ if (value->type != XLU_STRING) {
+ if (!dont_warn)
+ fprintf(cfg->report,
+ "%s:%d:%d: warning: value is not a string\n",
+ cfg->config_source, value->loc.first_line,
+ value->loc.first_column);
+ *value_r = NULL;
+ return EINVAL;
+ }
+
+ *value_r = value->u.string;
+ return 0;
+}
+
+int xlu_cfg_value_get_list(const XLU_Config *cfg, XLU_ConfigValue *value,
+ XLU_ConfigList **value_r, int dont_warn)
+{
+ if (value->type != XLU_LIST) {
+ if (!dont_warn)
+ fprintf(cfg->report,
+ "%s:%d:%d: warning: value is not a list\n",
+ cfg->config_source, value->loc.first_line,
+ value->loc.first_column);
+ *value_r = NULL;
+ return EINVAL;
+ }
+
+ *value_r = &value->u.list;
+ return 0;
+}
+
+XLU_ConfigValue *xlu_cfg_get_listitem2(const XLU_ConfigList *list,
+ int entry)
+{
+ if (entry < 0 || entry >= list->nvalues) return NULL;
+ return list->values[entry];
+}
+
int xlu_cfg_get_string(const XLU_Config *cfg, const char *n,
const char **value_r, int dont_warn) {
XLU_ConfigSetting *set;
int e;
e= find_atom(cfg,n,&set,dont_warn); if (e) return e;
- *value_r= set->values[0];
+ *value_r= set->value->u.string;
return 0;
}
@@ -202,7 +264,7 @@ int xlu_cfg_replace_string(const XLU_Config *cfg, const char *n,
e= find_atom(cfg,n,&set,dont_warn); if (e) return e;
free(*value_r);
- *value_r= strdup(set->values[0]);
+ *value_r= strdup(set->value->u.string);
return 0;
}
@@ -214,7 +276,7 @@ int xlu_cfg_get_long(const XLU_Config *cfg, const char *n,
char *ep;
e= find_atom(cfg,n,&set,dont_warn); if (e) return e;
- errno= 0; l= strtol(set->values[0], &ep, 0);
+ errno= 0; l= strtol(set->value->u.string, &ep, 0);
e= errno;
if (errno) {
e= errno;
@@ -226,7 +288,7 @@ int xlu_cfg_get_long(const XLU_Config *cfg, const char *n,
cfg->config_source, set->lineno, n, strerror(e));
return e;
}
- if (*ep || ep==set->values[0]) {
+ if (*ep || ep==set->value->u.string) {
if (!dont_warn)
fprintf(cfg->report,
"%s:%d: warning: parameter `%s' is not a valid number\n",
@@ -253,7 +315,7 @@ int xlu_cfg_get_list(const XLU_Config *cfg, const char *n,
XLU_ConfigList **list_r, int *entries_r, int dont_warn) {
XLU_ConfigSetting *set;
set= find(cfg,n); if (!set) return ESRCH;
- if (set->avalues==1) {
+ if (set->value->type!=XLU_LIST) {
if (!dont_warn) {
fprintf(cfg->report,
"%s:%d: warning: parameter `%s' is a single value"
@@ -262,8 +324,8 @@ int xlu_cfg_get_list(const XLU_Config *cfg, const char *n,
}
return EINVAL;
}
- if (list_r) *list_r= set;
- if (entries_r) *entries_r= set->nvalues;
+ if (list_r) *list_r= &set->value->u.list;
+ if (entries_r) *entries_r= set->value->u.list.nvalues;
return 0;
}
@@ -290,72 +352,118 @@ int xlu_cfg_get_list_as_string_list(const XLU_Config *cfg, const char *n,
return 0;
}
-const char *xlu_cfg_get_listitem(const XLU_ConfigList *set, int entry) {
- if (entry < 0 || entry >= set->nvalues) return 0;
- return set->values[entry];
+const char *xlu_cfg_get_listitem(const XLU_ConfigList *list, int entry) {
+ if (entry < 0 || entry >= list->nvalues) return 0;
+ if (list->values[entry]->type != XLU_STRING) return 0;
+ return list->values[entry]->u.string;
}
-XLU_ConfigSetting *xlu__cfg_set_mk(CfgParseContext *ctx,
- int alloc, char *atom) {
- XLU_ConfigSetting *set= 0;
+XLU_ConfigValue *xlu__cfg_string_mk(CfgParseContext *ctx, char *atom,
+ YYLTYPE *loc)
+{
+ XLU_ConfigValue *value = NULL;
if (ctx->err) goto x;
- assert(!!alloc == !!atom);
- set= malloc(sizeof(*set));
- if (!set) goto xe;
+ value = malloc(sizeof(*value));
+ if (!value) goto xe;
+ value->type = XLU_STRING;
+ value->u.string = atom;
+ memcpy(&value->loc, loc, sizeof(*loc));
- set->name= 0; /* tbd */
- set->avalues= alloc;
+ return value;
- if (!alloc) {
- set->nvalues= 0;
- set->values= 0;
- } else {
- set->values= malloc(sizeof(*set->values) * alloc);
- if (!set->values) goto xe;
+ xe:
+ ctx->err= errno;
+ x:
+ free(value);
+ free(atom);
+ return NULL;
+}
- set->nvalues= 1;
- set->values[0]= atom;
- }
- return set;
+XLU_ConfigValue *xlu__cfg_list_mk(CfgParseContext *ctx,
+ XLU_ConfigValue *val,
+ YYLTYPE *loc)
+{
+ XLU_ConfigValue *value = NULL;
+ XLU_ConfigValue **values = NULL;
+
+ if (ctx->err) goto x;
+
+ values = malloc(sizeof(*values));
+ if (!values) goto xe;
+ values[0] = val;
+
+ value = malloc(sizeof(*value));
+ if (!value) goto xe;
+ value->type = XLU_LIST;
+ value->u.list.nvalues = !!val;
+ value->u.list.avalues = 1;
+ value->u.list.values = values;
+ memcpy(&value->loc, loc, sizeof(*loc));
+
+ return value;
xe:
ctx->err= errno;
x:
- free(set);
- free(atom);
- return 0;
+ free(value);
+ free(values);
+ xlu__cfg_value_free(val);
+ return NULL;
}
-void xlu__cfg_set_add(CfgParseContext *ctx, XLU_ConfigSetting *set,
- char *atom) {
+void xlu__cfg_list_append(CfgParseContext *ctx,
+ XLU_ConfigValue *list,
+ XLU_ConfigValue *val)
+{
if (ctx->err) return;
- assert(atom);
+ assert(val);
+ assert(list->type == XLU_LIST);
- if (set->nvalues >= set->avalues) {
+ if (list->u.list.nvalues >= list->u.list.avalues) {
int new_avalues;
- char **new_values;
-
- if (set->avalues > INT_MAX / 100) { ctx->err= ERANGE; return; }
- new_avalues= set->avalues * 4;
- new_values= realloc(set->values,
- sizeof(*new_values) * new_avalues);
- if (!new_values) { ctx->err= errno; free(atom); return; }
- set->values= new_values;
- set->avalues= new_avalues;
+ XLU_ConfigValue **new_values = NULL;
+
+ if (list->u.list.avalues > INT_MAX / 100) {
+ ctx->err = ERANGE;
+ xlu__cfg_value_free(val);
+ return;
+ }
+
+ new_avalues = list->u.list.avalues * 4;
+ new_values = realloc(list->u.list.values,
+ sizeof(*new_values) * new_avalues);
+ if (!new_values) {
+ ctx->err = errno;
+ xlu__cfg_value_free(val);
+ return;
+ }
+
+ list->u.list.avalues = new_avalues;
+ list->u.list.values = new_values;
}
- set->values[set->nvalues++]= atom;
+
+ list->u.list.values[list->u.list.nvalues] = val;
+ list->u.list.nvalues++;
}
void xlu__cfg_set_store(CfgParseContext *ctx, char *name,
- XLU_ConfigSetting *set, int lineno) {
+ XLU_ConfigValue *val, int lineno) {
+ XLU_ConfigSetting *set;
+
if (ctx->err) return;
assert(name);
+ set = malloc(sizeof(*set));
+ if (!set) {
+ ctx->err = errno;
+ return;
+ }
set->name= name;
+ set->value = val;
set->lineno= lineno;
set->next= ctx->cfg->settings;
ctx->cfg->settings= set;
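
The accessors added above (xlu_cfg_value_type, xlu_cfg_value_get_string,
xlu_cfg_value_get_list and xlu_cfg_get_listitem2) let callers walk nested
lists. A minimal sketch, not part of this patch, assuming config was populated
with xlu_cfg_readfile() from a file containing e.g.
vnuma = [ [ "pnode=0", "size=512" ], [ "pnode=1", "size=512" ] ]:

    #include <stdio.h>
    #include "libxlutil.h"

    static void dump_vnuma_options(XLU_Config *config)
    {
        XLU_ConfigList *vnuma, *options;
        XLU_ConfigValue *node, *item;
        int nr_nodes, i, j;

        if (xlu_cfg_get_list(config, "vnuma", &vnuma, &nr_nodes, 0))
            return;                          /* absent, or not a list */

        for (i = 0; i < nr_nodes; i++) {
            node = xlu_cfg_get_listitem2(vnuma, i);
            if (xlu_cfg_value_type(node) != XLU_LIST ||
                xlu_cfg_value_get_list(config, node, &options, 1))
                continue;                    /* skip non-list elements */

            for (j = 0; (item = xlu_cfg_get_listitem2(options, j)); j++) {
                char *opt;
                if (!xlu_cfg_value_get_string(config, item, &opt, 1))
                    printf("node %d: %s\n", i, opt); /* opt is not a copy */
            }
        }
    }
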
diff --git a/tools/libxl/libxlu_cfg_i.h b/tools/libxl/libxlu_cfg_i.h
index 54d033c..1b59b33 100644
--- a/tools/libxl/libxlu_cfg_i.h
+++ b/tools/libxl/libxlu_cfg_i.h
@@ -23,11 +23,17 @@
#include "libxlu_cfg_y.h"
void xlu__cfg_set_free(XLU_ConfigSetting *set);
-XLU_ConfigSetting *xlu__cfg_set_mk(CfgParseContext*, int alloc, char *atom);
-void xlu__cfg_set_add(CfgParseContext*, XLU_ConfigSetting *set, char *atom);
void xlu__cfg_set_store(CfgParseContext*, char *name,
- XLU_ConfigSetting *set, int lineno);
-
+ XLU_ConfigValue *val, int lineno);
+XLU_ConfigValue *xlu__cfg_string_mk(CfgParseContext *ctx,
+ char *atom, YYLTYPE *loc);
+XLU_ConfigValue *xlu__cfg_list_mk(CfgParseContext *ctx,
+ XLU_ConfigValue *val,
+ YYLTYPE *loc);
+void xlu__cfg_list_append(CfgParseContext *ctx,
+ XLU_ConfigValue *list,
+ XLU_ConfigValue *val);
+void xlu__cfg_value_free(XLU_ConfigValue *value);
char *xlu__cfgl_strdup(CfgParseContext*, const char *src);
char *xlu__cfgl_dequote(CfgParseContext*, const char *src);
diff --git a/tools/libxl/libxlu_cfg_y.c b/tools/libxl/libxlu_cfg_y.c
index 07b5a1d..fbfdd0f 100644
--- a/tools/libxl/libxlu_cfg_y.c
+++ b/tools/libxl/libxlu_cfg_y.c
@@ -126,7 +126,7 @@ typedef union YYSTYPE
#line 25 "libxlu_cfg_y.y"
char *string;
- XLU_ConfigSetting *setting;
+ XLU_ConfigValue *value;
@@ -377,7 +377,7 @@ union yyalloc
/* YYFINAL -- State number of the termination state. */
#define YYFINAL 3
/* YYLAST -- Last index in YYTABLE. */
-#define YYLAST 24
+#define YYLAST 25
/* YYNTOKENS -- Number of terminals. */
#define YYNTOKENS 12
@@ -444,8 +444,8 @@ static const yytype_int8 yyrhs[] =
15, -1, 16, 17, -1, 17, -1, 1, 6, -1,
3, 7, 18, -1, 6, -1, 8, -1, 19, -1,
9, 22, 20, 10, -1, 4, -1, 5, -1, -1,
- 21, -1, 21, 11, 22, -1, 19, 22, -1, 21,
- 11, 22, 19, 22, -1, -1, 22, 6, -1
+ 21, -1, 21, 11, 22, -1, 18, 22, -1, 21,
+ 11, 22, 18, 22, -1, -1, 22, 6, -1
};
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
@@ -517,14 +517,14 @@ static const yytype_int8 yydefgoto[] =
static const yytype_int8 yypact[] =
{
-18, 4, 0, -18, -1, 6, -18, -18, -18, 3,
- -18, -18, 11, -18, -18, -18, -18, -18, -18, 13,
- -18, -18, 12, 10, 17, -18, -18, 13, -18, 17
+ -18, -18, 14, -18, -18, -18, -18, -18, -18, 11,
+ -18, -18, 12, 10, 18, -18, -18, 11, -18, 18
};
/* YYPGOTO[NTERM-NUM]. */
static const yytype_int8 yypgoto[] =
{
- -18, -18, -18, -18, -18, 15, -18, -17, -18, -18,
+ -18, -18, -18, -18, -18, 16, -17, -18, -18, -18,
-14
};
@@ -535,8 +535,8 @@ static const yytype_int8 yypgoto[] =
static const yytype_int8 yytable[] =
{
-2, 4, 21, 5, 3, 11, 6, 24, 7, 6,
- 28, 7, 27, 12, 29, 14, 15, 14, 15, 20,
- 16, 26, 25, 20, 13
+ 28, 7, 27, 12, 29, 14, 15, 20, 14, 15,
+ 16, 26, 25, 16, 20, 13
};
#define yypact_value_is_default(yystate) \
@@ -548,8 +548,8 @@ static const yytype_int8 yytable[] =
static const yytype_uint8 yycheck[] =
{
0, 1, 19, 3, 0, 6, 6, 21, 8, 6,
- 27, 8, 26, 7, 28, 4, 5, 4, 5, 6,
- 9, 11, 10, 6, 9
+ 27, 8, 26, 7, 28, 4, 5, 6, 4, 5,
+ 9, 11, 10, 9, 6, 9
};
/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
@@ -558,7 +558,7 @@ static const yytype_uint8 yystos[] =
{
0, 13, 14, 0, 1, 3, 6, 8, 15, 16,
17, 6, 7, 17, 4, 5, 9, 18, 19, 22,
- 6, 19, 20, 21, 22, 10, 11, 22, 19, 22
+ 6, 18, 20, 21, 22, 10, 11, 22, 18, 22
};
#define yyerrok (yyerrstatus = 0)
@@ -1148,7 +1148,7 @@ yydestruct (yymsg, yytype, yyvaluep, yylocationp, ctx)
/* Line 1391 of yacc.c */
#line 43 "libxlu_cfg_y.y"
- { xlu__cfg_set_free((yyvaluep->setting)); };
+ { xlu__cfg_value_free((yyvaluep->value)); };
/* Line 1391 of yacc.c */
#line 1155 "libxlu_cfg_y.c"
@@ -1166,7 +1166,7 @@ yydestruct (yymsg, yytype, yyvaluep, yylocationp, ctx)
/* Line 1391 of yacc.c */
#line 43 "libxlu_cfg_y.y"
- { xlu__cfg_set_free((yyvaluep->setting)); };
+ { xlu__cfg_value_free((yyvaluep->value)); };
/* Line 1391 of yacc.c */
#line 1173 "libxlu_cfg_y.c"
@@ -1175,7 +1175,7 @@ yydestruct (yymsg, yytype, yyvaluep, yylocationp, ctx)
/* Line 1391 of yacc.c */
#line 43 "libxlu_cfg_y.y"
- { xlu__cfg_set_free((yyvaluep->setting)); };
+ { xlu__cfg_value_free((yyvaluep->value)); };
/* Line 1391 of yacc.c */
#line 1182 "libxlu_cfg_y.c"
@@ -1508,21 +1508,21 @@ yyreduce:
/* Line 1806 of yacc.c */
#line 57 "libxlu_cfg_y.y"
- { xlu__cfg_set_store(ctx,(yyvsp[(1) - (3)].string),(yyvsp[(3) - (3)].setting),(yylsp[(3) - (3)]).first_line); }
+ { xlu__cfg_set_store(ctx,(yyvsp[(1) - (3)].string),(yyvsp[(3) - (3)].value),(yylsp[(3) - (3)]).first_line); }
break;
case 12:
/* Line 1806 of yacc.c */
#line 62 "libxlu_cfg_y.y"
- { (yyval.setting)= xlu__cfg_set_mk(ctx,1,(yyvsp[(1) - (1)].string)); }
+ { (yyval.value)= xlu__cfg_string_mk(ctx,(yyvsp[(1) - (1)].string),&(yylsp[(1) - (1)])); }
break;
case 13:
/* Line 1806 of yacc.c */
#line 63 "libxlu_cfg_y.y"
- { (yyval.setting)= (yyvsp[(3) - (4)].setting); }
+ { (yyval.value)= (yyvsp[(3) - (4)].value); }
break;
case 14:
@@ -1543,35 +1543,35 @@ yyreduce:
/* Line 1806 of yacc.c */
#line 68 "libxlu_cfg_y.y"
- { (yyval.setting)= xlu__cfg_set_mk(ctx,0,0); }
+ { (yyval.value)= xlu__cfg_list_mk(ctx,NULL,&yylloc); }
break;
case 17:
/* Line 1806 of yacc.c */
#line 69 "libxlu_cfg_y.y"
- { (yyval.setting)= (yyvsp[(1) - (1)].setting); }
+ { (yyval.value)= (yyvsp[(1) - (1)].value); }
break;
case 18:
/* Line 1806 of yacc.c */
#line 70 "libxlu_cfg_y.y"
- { (yyval.setting)= (yyvsp[(1) - (3)].setting); }
+ { (yyval.value)= (yyvsp[(1) - (3)].value); }
break;
case 19:
/* Line 1806 of yacc.c */
#line 72 "libxlu_cfg_y.y"
- { (yyval.setting)= xlu__cfg_set_mk(ctx,2,(yyvsp[(1) - (2)].string)); }
+ { (yyval.value)= xlu__cfg_list_mk(ctx,(yyvsp[(1) - (2)].value),&(yylsp[(1) - (2)])); }
break;
case 20:
/* Line 1806 of yacc.c */
#line 73 "libxlu_cfg_y.y"
- { xlu__cfg_set_add(ctx,(yyvsp[(1) - (5)].setting),(yyvsp[(4) - (5)].string)); (yyval.setting)= (yyvsp[(1) - (5)].setting); }
+ { xlu__cfg_list_append(ctx,(yyvsp[(1) - (5)].value),(yyvsp[(4) - (5)].value)); (yyval.value)= (yyvsp[(1) - (5)].value); }
break;
diff --git a/tools/libxl/libxlu_cfg_y.h b/tools/libxl/libxlu_cfg_y.h
index d7dfaf2..37e8213 100644
--- a/tools/libxl/libxlu_cfg_y.h
+++ b/tools/libxl/libxlu_cfg_y.h
@@ -54,7 +54,7 @@ typedef union YYSTYPE
#line 25 "libxlu_cfg_y.y"
char *string;
- XLU_ConfigSetting *setting;
+ XLU_ConfigValue *value;
diff --git a/tools/libxl/libxlu_cfg_y.y b/tools/libxl/libxlu_cfg_y.y
index 5acd438..a923f76 100644
--- a/tools/libxl/libxlu_cfg_y.y
+++ b/tools/libxl/libxlu_cfg_y.y
@@ -24,7 +24,7 @@
%union {
char *string;
- XLU_ConfigSetting *setting;
+ XLU_ConfigValue *value;
}
%locations
@@ -39,8 +39,8 @@
%type <string> atom
%destructor { free($$); } atom IDENT STRING NUMBER
-%type <setting> value valuelist values
-%destructor { xlu__cfg_set_free($$); } value valuelist values
+%type <value> value valuelist values
+%destructor { xlu__cfg_value_free($$); } value valuelist values
%%
@@ -59,18 +59,18 @@ assignment: IDENT '=' value { xlu__cfg_set_store(ctx,$1,$3, at 3.first_line); }
endstmt: NEWLINE
| ';'
-value: atom { $$= xlu__cfg_set_mk(ctx,1,$1); }
+value: atom { $$= xlu__cfg_string_mk(ctx,$1,&@1); }
| '[' nlok valuelist ']' { $$= $3; }
atom: STRING { $$= $1; }
| NUMBER { $$= $1; }
-valuelist: /* empty */ { $$= xlu__cfg_set_mk(ctx,0,0); }
+valuelist: /* empty */ { $$= xlu__cfg_list_mk(ctx,NULL,&yylloc); }
| values { $$= $1; }
| values ',' nlok { $$= $1; }
-values: atom nlok { $$= xlu__cfg_set_mk(ctx,2,$1); }
- | values ',' nlok atom nlok { xlu__cfg_set_add(ctx,$1,$4); $$= $1; }
+values: value nlok { $$= xlu__cfg_list_mk(ctx,$1,&@1); }
+ | values ',' nlok value nlok { xlu__cfg_list_append(ctx,$1,$4); $$= $1; }
nlok:
/* nothing */
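
The net effect of the grammar change is that list elements are now full values
rather than bare atoms, so lists may nest. For instance, the new
"values: value nlok" rule accepts a setting like the following (an
illustrative example in xl config syntax), which the old atom-only rule
rejected:

    vnuma = [ [ "pnode=0", "size=512" ],
              [ "pnode=1", "size=512" ] ]
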
diff --git a/tools/libxl/libxlu_internal.h b/tools/libxl/libxlu_internal.h
index 7579158..0acdde3 100644
--- a/tools/libxl/libxlu_internal.h
+++ b/tools/libxl/libxlu_internal.h
@@ -23,17 +23,38 @@
#include <assert.h>
#include <regex.h>
-#define XLU_ConfigList XLU_ConfigSetting
-
#include "libxlutil.h"
-struct XLU_ConfigSetting { /* transparent */
+struct XLU_ConfigList {
+ int avalues; /* available slots */
+ int nvalues; /* actual occupied slots */
+ XLU_ConfigValue **values;
+};
+
+typedef struct YYLTYPE
+{
+ int first_line;
+ int first_column;
+ int last_line;
+ int last_column;
+} YYLTYPE;
+#define YYLTYPE_IS_DECLARED
+
+struct XLU_ConfigValue {
+ enum XLU_ConfigValueType type;
+ union {
+ char *string;
+ XLU_ConfigList list;
+ } u;
+ YYLTYPE loc;
+};
+
+typedef struct XLU_ConfigSetting { /* transparent */
struct XLU_ConfigSetting *next;
char *name;
- int nvalues, avalues; /* lists have avalues>1 */
- char **values;
+ XLU_ConfigValue *value;
int lineno;
-};
+} XLU_ConfigSetting;
struct XLU_Config {
XLU_ConfigSetting *settings;
diff --git a/tools/libxl/libxlu_pci.c b/tools/libxl/libxlu_pci.c
index 26fb143..2cd793d 100644
--- a/tools/libxl/libxlu_pci.c
+++ b/tools/libxl/libxlu_pci.c
@@ -42,6 +42,9 @@ static int pcidev_struct_fill(libxl_device_pci *pcidev, unsigned int domain,
#define STATE_OPTIONS_K 6
#define STATE_OPTIONS_V 7
#define STATE_TERMINAL 8
+#define STATE_TYPE 9
+#define STATE_RDM_STRATEGY 10
+#define STATE_RESERVE_POLICY 11
int xlu_pci_parse_bdf(XLU_Config *cfg, libxl_device_pci *pcidev, const char *str)
{
unsigned state = STATE_DOMAIN;
@@ -143,7 +146,18 @@ int xlu_pci_parse_bdf(XLU_Config *cfg, libxl_device_pci *pcidev, const char *str
pcidev->permissive = atoi(tok);
}else if ( !strcmp(optkey, "seize") ) {
pcidev->seize = atoi(tok);
- }else{
+ } else if (!strcmp(optkey, "rdm_policy")) {
+ if (!strcmp(tok, "strict")) {
+ pcidev->rdm_policy = LIBXL_RDM_RESERVE_POLICY_STRICT;
+ } else if (!strcmp(tok, "relaxed")) {
+ pcidev->rdm_policy = LIBXL_RDM_RESERVE_POLICY_RELAXED;
+ } else {
+ XLU__PCI_ERR(cfg, "%s is not an valid PCI RDM property"
+ " policy: 'strict' or 'relaxed'.",
+ tok);
+ goto parse_error;
+ }
+ } else {
XLU__PCI_ERR(cfg, "Unknown PCI BDF option: %s", optkey);
}
tok = ptr + 1;
@@ -153,17 +167,95 @@ int xlu_pci_parse_bdf(XLU_Config *cfg, libxl_device_pci *pcidev, const char *str
}
}
- free(buf2);
-
if ( tok != ptr || state != STATE_TERMINAL )
goto parse_error;
/* Just a pretty way to fill in the values */
pcidev_struct_fill(pcidev, dom, bus, dev, func, vslot << 3);
+ free(buf2);
+
return 0;
parse_error:
+ free(buf2);
+ return ERROR_INVAL;
+}
+
+int xlu_rdm_parse(XLU_Config *cfg, libxl_rdm_reserve *rdm, const char *str)
+{
+ unsigned state = STATE_TYPE;
+ char *buf2, *tok, *ptr, *end;
+
+ if (NULL == (buf2 = ptr = strdup(str)))
+ return ERROR_NOMEM;
+
+ for (tok = ptr, end = ptr + strlen(ptr) + 1; ptr < end; ptr++) {
+ switch(state) {
+ case STATE_TYPE:
+ if (*ptr == '=') {
+ state = STATE_RDM_STRATEGY;
+ *ptr = '\0';
+ if (strcmp(tok, "strategy")) {
+ XLU__PCI_ERR(cfg, "Unknown RDM state option: %s", tok);
+ goto parse_error;
+ }
+ tok = ptr + 1;
+ }
+ break;
+ case STATE_RDM_STRATEGY:
+ if (*ptr == '\0' || *ptr == ',') {
+ state = STATE_RESERVE_POLICY;
+ *ptr = '\0';
+ if (!strcmp(tok, "host")) {
+ rdm->strategy = LIBXL_RDM_RESERVE_STRATEGY_HOST;
+ } else {
+ XLU__PCI_ERR(cfg, "Unknown RDM strategy option: %s", tok);
+ goto parse_error;
+ }
+ tok = ptr + 1;
+ }
+ break;
+ case STATE_RESERVE_POLICY:
+ if (*ptr == '=') {
+ state = STATE_OPTIONS_V;
+ *ptr = '\0';
+ if (strcmp(tok, "policy")) {
+ XLU__PCI_ERR(cfg, "Unknown RDM property value: %s", tok);
+ goto parse_error;
+ }
+ tok = ptr + 1;
+ }
+ break;
+ case STATE_OPTIONS_V:
+ if (*ptr == ',' || *ptr == '\0') {
+ state = STATE_TERMINAL;
+ *ptr = '\0';
+ if (!strcmp(tok, "strict")) {
+ rdm->policy = LIBXL_RDM_RESERVE_POLICY_STRICT;
+ } else if (!strcmp(tok, "relaxed")) {
+ rdm->policy = LIBXL_RDM_RESERVE_POLICY_RELAXED;
+ } else {
+ XLU__PCI_ERR(cfg, "Unknown RDM property policy value: %s",
+ tok);
+ goto parse_error;
+ }
+ tok = ptr + 1;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (tok != ptr || state != STATE_TERMINAL)
+ goto parse_error;
+
+ free(buf2);
+
+ return 0;
+
+parse_error:
+ free(buf2);
return ERROR_INVAL;
}
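
For reference, the option strings these two parsers accept look as follows;
the BDF is a made-up example, but the key and value spellings come from the
state machines above:

    # global default, parsed by xlu_rdm_parse()
    rdm = "strategy=host,policy=relaxed"

    # per-device override, parsed by xlu_pci_parse_bdf()
    pci = [ '01:00.0,rdm_policy=strict' ]
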
diff --git a/tools/libxl/libxlutil.h b/tools/libxl/libxlutil.h
index 0333e55..e81b644 100644
--- a/tools/libxl/libxlutil.h
+++ b/tools/libxl/libxlutil.h
@@ -20,9 +20,15 @@
#include "libxl.h"
+enum XLU_ConfigValueType {
+ XLU_STRING,
+ XLU_LIST,
+};
+
/* Unless otherwise stated, all functions return an errno value. */
typedef struct XLU_Config XLU_Config;
typedef struct XLU_ConfigList XLU_ConfigList;
+typedef struct XLU_ConfigValue XLU_ConfigValue;
XLU_Config *xlu_cfg_init(FILE *report, const char *report_filename);
/* 0 means we got ENOMEM. */
@@ -66,6 +72,13 @@ const char *xlu_cfg_get_listitem(const XLU_ConfigList*, int entry);
/* xlu_cfg_get_listitem cannot fail, except that if entry is
* out of range it returns 0 (not setting errno) */
+enum XLU_ConfigValueType xlu_cfg_value_type(const XLU_ConfigValue *value);
+int xlu_cfg_value_get_string(const XLU_Config *cfg, XLU_ConfigValue *value,
+ char **value_r, int dont_warn);
+int xlu_cfg_value_get_list(const XLU_Config *cfg, XLU_ConfigValue *value,
+ XLU_ConfigList **value_r, int dont_warn);
+XLU_ConfigValue *xlu_cfg_get_listitem2(const XLU_ConfigList *list,
+ int entry);
/*
* Disk specification parsing.
@@ -93,6 +106,10 @@ int xlu_disk_parse(XLU_Config *cfg, int nspecs, const char *const *specs,
*/
int xlu_pci_parse_bdf(XLU_Config *cfg, libxl_device_pci *pcidev, const char *str);
+/*
+ * RDM parsing
+ */
+int xlu_rdm_parse(XLU_Config *cfg, libxl_rdm_reserve *rdm, const char *str);
/*
* Vif rate parsing.
diff --git a/tools/libxl/test_common.c b/tools/libxl/test_common.c
index 83b94eb..c6bbbab 100644
--- a/tools/libxl/test_common.c
+++ b/tools/libxl/test_common.c
@@ -12,4 +12,46 @@ void test_common_setup(int level)
int rc = libxl_ctx_alloc(&ctx, LIBXL_VERSION, 0, logger);
assert(!rc);
-}
+}
+
+struct timeval now;
+
+void test_common_get_now(void)
+{
+ int r = gettimeofday(&now, 0); assert(!r);
+}
+
+int poll_nfds, poll_nfds_allocd;
+struct pollfd *poll_fds;
+int poll_timeout;
+
+void test_common_beforepoll(void)
+{
+ for (;;) {
+ test_common_get_now();
+
+ poll_timeout = -1;
+ poll_nfds = poll_nfds_allocd;
+ int rc = libxl_osevent_beforepoll(ctx, &poll_nfds, poll_fds,
+ &poll_timeout, now);
+ if (!rc) return;
+ assert(rc == ERROR_BUFFERFULL);
+
+ assert(poll_nfds > poll_nfds_allocd);
+ poll_fds = realloc(poll_fds, poll_nfds * sizeof(poll_fds[0]));
+ assert(poll_fds);
+ poll_nfds_allocd = poll_nfds;
+ }
+}
+
+void test_common_dopoll(void) {
+ errno = 0;
+ int r = poll(poll_fds, poll_nfds, poll_timeout);
+ fprintf(stderr, "poll: r=%d errno=%s\n", r, strerror(errno));
+}
+
+void test_common_afterpoll(void)
+{
+ test_common_get_now();
+ libxl_osevent_afterpoll(ctx, poll_nfds, poll_fds, now);
+}
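
The three helpers above encapsulate the standard libxl osevent integration
pattern. A minimal sketch of the loop a test (or any embedding application)
builds from them, with the exit condition and error handling elided:

    for (;;) {
        test_common_beforepoll();  /* ask libxl which fds/timeout to poll */
        test_common_dopoll();      /* plain poll(2) on that set */
        test_common_afterpoll();   /* let libxl service whatever is ready */
    }
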
diff --git a/tools/libxl/test_common.h b/tools/libxl/test_common.h
index 8b2471e..10c7166 100644
--- a/tools/libxl/test_common.h
+++ b/tools/libxl/test_common.h
@@ -6,9 +6,24 @@
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
void test_common_setup(int level);
extern libxl_ctx *ctx;
+void test_common_get_now(void);
+
+extern struct timeval now;
+
+void test_common_beforepoll(void);
+void test_common_dopoll(void);
+void test_common_afterpoll(void);
+
+extern int poll_nfds, poll_nfds_allocd;
+extern struct pollfd *poll_fds;
+extern int poll_timeout;
+
#endif /*TEST_COMMON_H*/
diff --git a/tools/libxl/test_fdderegrace.c b/tools/libxl/test_fdderegrace.c
new file mode 100644
index 0000000..f57965f
--- /dev/null
+++ b/tools/libxl/test_fdderegrace.c
@@ -0,0 +1,56 @@
+#include "test_common.h"
+#include "libxl_test_fdevent.h"
+
+int main(int argc, char **argv) {
+ int rc, i;
+ libxl_asyncop_how how;
+ libxl_event *event;
+
+ test_common_setup(XTL_DEBUG);
+
+ how.callback = NULL;
+ how.u.for_event = 1;
+
+ int fd = open("/dev/null", O_RDONLY);
+ assert(fd > 0);
+
+ rc = libxl_test_fdevent(ctx, fd, POLLIN, &how);
+ assert(!rc);
+
+ test_common_beforepoll();
+
+ rc = libxl_ao_abort(ctx, &how);
+ assert(!rc);
+
+ rc = libxl_event_check(ctx, &event, LIBXL_EVENTMASK_ALL, 0,0);
+ assert(!rc);
+ assert(event);
+ assert(event->for_user == how.u.for_event);
+ assert(event->type == LIBXL_EVENT_TYPE_OPERATION_COMPLETE);
+ assert(event->u.operation_complete.rc == ERROR_ABORTED);
+
+ close(fd);
+
+ test_common_dopoll();
+
+ for (i=0; i<poll_nfds; i++) {
+ if (poll_fds[i].fd == fd && (poll_fds[i].revents & POLLNVAL)) {
+ fprintf(stderr, "POLLNVAL on fd=%d in slot i=%d as expected\n",
+ fd, i);
+ goto found;
+ }
+ }
+ abort();
+ found:;
+
+ int fd2 = open("/dev/null", O_RDONLY);
+ assert(fd2 == fd);
+
+ how.u.for_event++;
+ rc = libxl_test_fdevent(ctx, fd, POLLIN, &how);
+ assert(!rc);
+
+ test_common_afterpoll();
+
+ fprintf(stderr, "complete\n");
+}
diff --git a/tools/libxl/xenlight.pc.in.in b/tools/libxl/xenlight.pc.in.in
new file mode 100644
index 0000000..c27872e
--- /dev/null
+++ b/tools/libxl/xenlight.pc.in.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+includedir=@includedir@
+libdir=@libdir@
+xenfirmwaredir=@XENFIRMWAREDIR@
+libexec_bin=@LIBEXEC_BIN@
+
+Name: Xenlight
+Description: The Xenlight library for the Xen hypervisor
+Version: @@version@@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lxenlight
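
Once the build system substitutes @@version@@ and the file is installed on the
pkg-config search path, consumers can obtain the flags in the usual way, e.g.
`pkg-config --cflags --libs xenlight`, which expands to the "-I${includedir}"
and "-L${libdir} -lxenlight" values declared above.
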
diff --git a/tools/libxl/xl.c b/tools/libxl/xl.c
index f014306..5316ad9 100644
--- a/tools/libxl/xl.c
+++ b/tools/libxl/xl.c
@@ -118,7 +118,7 @@ static void parse_global_config(const char *configfile,
}
if (!lockfile) {
- fprintf(stderr, "failed to allocate lockdir\n");
+ fprintf(stderr, "failed to allocate lockfile\n");
exit(1);
}
diff --git a/tools/libxl/xl.h b/tools/libxl/xl.h
index 5bc138c..13bccba 100644
--- a/tools/libxl/xl.h
+++ b/tools/libxl/xl.h
@@ -66,7 +66,6 @@ int main_memmax(int argc, char **argv);
int main_memset(int argc, char **argv);
int main_sched_credit(int argc, char **argv);
int main_sched_credit2(int argc, char **argv);
-int main_sched_sedf(int argc, char **argv);
int main_sched_rtds(int argc, char **argv);
int main_domid(int argc, char **argv);
int main_domname(int argc, char **argv);
@@ -113,10 +112,15 @@ int main_remus(int argc, char **argv);
#endif
int main_devd(int argc, char **argv);
#ifdef LIBXL_HAVE_PSR_CMT
+int main_psr_hwinfo(int argc, char **argv);
int main_psr_cmt_attach(int argc, char **argv);
int main_psr_cmt_detach(int argc, char **argv);
int main_psr_cmt_show(int argc, char **argv);
#endif
+#ifdef LIBXL_HAVE_PSR_CAT
+int main_psr_cat_cbm_set(int argc, char **argv);
+int main_psr_cat_show(int argc, char **argv);
+#endif
void help(const char *command);
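
The new prototypes correspond to xl subcommands; the names below follow the
main_* naming convention used throughout xl and are inferred rather than
quoted from this hunk:

    xl psr-hwinfo        # show platform shared resource hardware info
    xl psr-cat-cbm-set   # set a domain's cache allocation bit mask
    xl psr-cat-show      # show current cache allocation settings
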
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index ed0d478..9a2870e 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -109,7 +109,10 @@ static const char migrate_report[]=
*/
#define XL_MANDATORY_FLAG_JSON (1U << 0) /* config data is in JSON format */
-#define XL_MANDATORY_FLAG_ALL (XL_MANDATORY_FLAG_JSON)
+#define XL_MANDATORY_FLAG_STREAMv2 (1U << 1) /* stream is v2 */
+#define XL_MANDATORY_FLAG_ALL (XL_MANDATORY_FLAG_JSON | \
+ XL_MANDATORY_FLAG_STREAMv2)
+
struct save_file_header {
char magic[32]; /* savefileheader_magic */
/* All uint32_ts are in domain's byte order. */
@@ -151,7 +154,7 @@ struct domain_create {
int console_autoconnect;
int checkpointed_stream;
const char *config_file;
- const char *extra_config; /* extra config string */
+ char *extra_config; /* extra config string */
const char *restore_file;
int migrate_fd; /* -1 means none */
char **migration_domname_r; /* from malloc */
@@ -289,6 +292,16 @@ static void *xmalloc(size_t sz) {
return r;
}
+static void *xcalloc(size_t n, size_t sz) __attribute__((unused));
+static void *xcalloc(size_t n, size_t sz) {
+ void *r = calloc(n, sz);
+ if (!r) {
+ fprintf(stderr,"xl: Unable to calloc %zu bytes.\n", sz*n);
+ exit(-ERROR_FAIL);
+ }
+ return r;
+}
+
static void *xrealloc(void *ptr, size_t sz) {
void *r;
if (!sz) { free(ptr); return 0; }
@@ -313,16 +326,24 @@ static char *xstrdup(const char *x)
return r;
}
-#define ARRAY_EXTEND_INIT(array,count,initfn) \
+#define ARRAY_EXTEND_INIT__CORE(array,count,initfn,more) \
({ \
typeof((count)) array_extend_old_count = (count); \
(count)++; \
(array) = xrealloc((array), sizeof(*array) * (count)); \
(initfn)(&(array)[array_extend_old_count]); \
- (array)[array_extend_old_count].devid = array_extend_old_count; \
+ more; \
&(array)[array_extend_old_count]; \
})
+#define ARRAY_EXTEND_INIT(array,count,initfn) \
+ ARRAY_EXTEND_INIT__CORE((array),(count),(initfn), ({ \
+ (array)[array_extend_old_count].devid = array_extend_old_count; \
+ }))
+
+#define ARRAY_EXTEND_INIT_NODEVID(array,count,initfn) \
+ ARRAY_EXTEND_INIT__CORE((array),(count),(initfn), /* nothing */ )
+
#define LOG(_f, _a...) dolog(__FILE__, __LINE__, __func__, _f "\n", ##_a)
static void dolog(const char *file, int line, const char *func, char *fmt, ...)
@@ -344,6 +365,27 @@ static void dolog(const char *file, int line, const char *func, char *fmt, ...)
free(s);
}
+static void xvasprintf(char **strp, const char *fmt, va_list ap)
+ __attribute__((format(printf,2,0)));
+static void xvasprintf(char **strp, const char *fmt, va_list ap)
+{
+ int r = vasprintf(strp, fmt, ap);
+ if (r == -1) {
+ perror("asprintf failed");
+ exit(-ERROR_FAIL);
+ }
+}
+
+static void xasprintf(char **strp, const char *fmt, ...)
+ __attribute__((format(printf,2,3)));
+static void xasprintf(char **strp, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ xvasprintf(strp, fmt, ap);
+ va_end(ap);
+}
+
static yajl_gen_status printf_info_one_json(yajl_gen hand, int domid,
libxl_domain_config *d_config)
{
@@ -379,6 +421,20 @@ static yajl_gen_status printf_info_one_json(yajl_gen hand, int domid,
out:
return s;
}
+
+static void flush_stream(FILE *fh)
+{
+ const char *fh_name =
+ fh == stdout ? "stdout" :
+ fh == stderr ? "stderr" :
+ (abort(), (const char*)0);
+
+ if (ferror(fh) || fflush(fh)) {
+ perror(fh_name);
+ exit(-1);
+ }
+}
+
static void printf_info(enum output_format output_format,
int domid,
libxl_domain_config *d_config, FILE *fh)
@@ -414,16 +470,10 @@ out:
fprintf(stderr,
"unable to format domain config as JSON (YAJL:%d)\n", s);
- if (ferror(fh) || fflush(fh)) {
- if (fh == stdout)
- perror("stdout");
- else
- perror("stderr");
- exit(-1);
- }
+ flush_stream(fh);
}
-static int do_daemonize(char *name)
+static int do_daemonize(char *name, const char *pidfile)
{
char *fullname;
pid_t child1;
@@ -455,6 +505,33 @@ static int do_daemonize(char *name)
CHK_SYSCALL(daemon(0, 1));
+ if (pidfile) {
+ int fd = open(pidfile, O_RDWR | O_CREAT, S_IRUSR|S_IWUSR);
+ char *pid = NULL;
+
+ if (fd == -1) {
+ perror("Unable to open pidfile");
+ exit(1);
+ }
+
+ if (asprintf(&pid, "%ld\n", (long)getpid()) == -1) {
+ perror("Formatting pid");
+ exit(1);
+ }
+
+ if (write(fd, pid, strlen(pid)) < 0) {
+ perror("Writing pid");
+ exit(1);
+ }
+
+ if (close(fd) < 0) {
+ perror("Closing pidfile");
+ exit(1);
+ }
+
+ free(pid);
+ }
+
out:
return ret;
}
@@ -600,26 +677,25 @@ typedef int (*char_predicate_t)(const int c);
static void trim(char_predicate_t predicate, const char *input, char **output)
{
- char *p, *q, *tmp;
+ const char *first, *after;
- *output = NULL;
- if (*input == '\000')
- return;
- /* Input has length >= 1 */
-
- p = tmp = xstrdup(input);
- /* Skip past the characters for which predicate is true */
- while ((*p != '\000') && (predicate((unsigned char)*p)))
- p ++;
- q = p + strlen(p) - 1;
- /* q points to the last non-NULL character */
- while ((q > p) && (predicate((unsigned char)*q)))
- q --;
- /* q points to the last character we want */
- q ++;
- *q = '\000';
- *output = xstrdup(p);
- free(tmp);
+ for (first = input;
+ *first && predicate((unsigned char)first[0]);
+ first++)
+ ;
+
+ for (after = first + strlen(first);
+ after > first && predicate((unsigned char)after[-1]);
+ after--)
+ ;
+
+ size_t len_nonnull = after - first;
+ char *result = xmalloc(len_nonnull + 1);
+
+ memcpy(result, first, len_nonnull);
+ result[len_nonnull] = 0;
+
+ *output = result;
}
static int split_string_into_pair(const char *str,
@@ -760,7 +836,7 @@ static int update_cpumap_range(const char *str, libxl_bitmap *cpumap)
* single cpus or as entire NUMA nodes) and turns it into the
* corresponding libxl_bitmap (in cpumap).
*/
-static int vcpupin_parse(const char *cpu, libxl_bitmap *cpumap)
+static int cpurange_parse(const char *cpu, libxl_bitmap *cpumap)
{
char *ptr, *saveptr = NULL, *buf = strdup(cpu);
int rc = 0;
@@ -813,9 +889,10 @@ static char *parse_cmdline(XLU_Config *config)
fprintf(stderr, "Warning: ignoring root= and extra= "
"in favour of cmdline=\n");
} else {
- if (root) {
- if (asprintf(&cmdline, "root=%s %s", root, extra) == -1)
- cmdline = NULL;
+ if (root && extra) {
+ xasprintf(&cmdline, "root=%s %s", root, extra);
+ } else if (root) {
+ xasprintf(&cmdline, "root=%s", root);
} else if (extra) {
cmdline = strdup(extra);
}
@@ -870,7 +947,7 @@ static void parse_vcpu_affinity(libxl_domain_build_info *b_info,
exit(1);
}
- if (vcpupin_parse(buf, &vcpu_affinity_array[j]))
+ if (cpurange_parse(buf, &vcpu_affinity_array[j]))
exit(1);
j++;
@@ -887,7 +964,7 @@ static void parse_vcpu_affinity(libxl_domain_build_info *b_info,
exit(1);
}
- if (vcpupin_parse(buf, &vcpu_affinity_array[0]))
+ if (cpurange_parse(buf, &vcpu_affinity_array[0]))
exit(1);
for (i = 1; i < b_info->max_vcpus; i++) {
@@ -910,16 +987,282 @@ static void replace_string(char **str, const char *val)
*str = xstrdup(val);
}
+static int match_option_size(const char *prefix, size_t len,
+ char *arg, char **argopt)
+{
+ int rc = strncmp(prefix, arg, len);
+ if (!rc) *argopt = arg+len;
+ return !rc;
+}
+#define MATCH_OPTION(prefix, arg, oparg) \
+ match_option_size((prefix "="), sizeof((prefix)), (arg), &(oparg))
+
+/* Parses network data and fills in the nic structure.
+ * Returns 1 if the input token does not match one of the keys
+ * or a parsed value is not correct. A successful parse returns 0. */
+static int parse_nic_config(libxl_device_nic *nic, XLU_Config **config, char *token)
+{
+ char *endptr, *oparg;
+ int i;
+ unsigned int val;
+
+ if (MATCH_OPTION("type", token, oparg)) {
+ if (!strcmp("vif", oparg)) {
+ nic->nictype = LIBXL_NIC_TYPE_VIF;
+ } else if (!strcmp("ioemu", oparg)) {
+ nic->nictype = LIBXL_NIC_TYPE_VIF_IOEMU;
+ } else {
+ fprintf(stderr, "Invalid parameter `type'.\n");
+ return 1;
+ }
+ } else if (MATCH_OPTION("mac", token, oparg)) {
+ for (i = 0; i < 6; i++) {
+ val = strtoul(oparg, &endptr, 16);
+ if ((oparg == endptr) || (val > 255)) {
+ fprintf(stderr, "Invalid parameter `mac'.\n");
+ return 1;
+ }
+ nic->mac[i] = val;
+ oparg = endptr + 1;
+ }
+ } else if (MATCH_OPTION("bridge", token, oparg)) {
+ replace_string(&nic->bridge, oparg);
+ } else if (MATCH_OPTION("netdev", token, oparg)) {
+ fprintf(stderr, "the netdev parameter is deprecated, "
+ "please use gatewaydev instead\n");
+ replace_string(&nic->gatewaydev, oparg);
+ } else if (MATCH_OPTION("gatewaydev", token, oparg)) {
+ replace_string(&nic->gatewaydev, oparg);
+ } else if (MATCH_OPTION("ip", token, oparg)) {
+ replace_string(&nic->ip, oparg);
+ } else if (MATCH_OPTION("script", token, oparg)) {
+ replace_string(&nic->script, oparg);
+ } else if (MATCH_OPTION("backend", token, oparg)) {
+ replace_string(&nic->backend_domname, oparg);
+ } else if (MATCH_OPTION("vifname", token, oparg)) {
+ replace_string(&nic->ifname, oparg);
+ } else if (MATCH_OPTION("model", token, oparg)) {
+ replace_string(&nic->model, oparg);
+ } else if (MATCH_OPTION("rate", token, oparg)) {
+ parse_vif_rate(config, oparg, nic);
+ } else if (MATCH_OPTION("accel", token, oparg)) {
+ fprintf(stderr, "the accel parameter for vifs is currently not supported\n");
+ } else {
+ fprintf(stderr, "unrecognized argument `%s'\n", token);
+ return 1;
+ }
+ return 0;
+}
+
+static unsigned long parse_ulong(const char *str)
+{
+ char *endptr;
+ unsigned long val;
+
+ val = strtoul(str, &endptr, 10);
+ if (endptr == str || val == ULONG_MAX) {
+ fprintf(stderr, "xl: failed to convert \"%s\" to number\n", str);
+ exit(1);
+ }
+ return val;
+}
+
+static void parse_vnuma_config(const XLU_Config *config,
+ libxl_domain_build_info *b_info)
+{
+ libxl_physinfo physinfo;
+ uint32_t nr_nodes;
+ XLU_ConfigList *vnuma;
+ int i, j, len, num_vnuma;
+ unsigned long max_vcpus = 0, max_memkb = 0;
+ /* Temporary storage for parsed vcpu information, to avoid
+ * parsing the config twice. This array has num_vnuma elements.
+ */
+ libxl_bitmap *vcpu_parsed;
+
+ libxl_physinfo_init(&physinfo);
+ if (libxl_get_physinfo(ctx, &physinfo) != 0) {
+ libxl_physinfo_dispose(&physinfo);
+ fprintf(stderr, "libxl_get_physinfo failed\n");
+ exit(1);
+ }
+
+ nr_nodes = physinfo.nr_nodes;
+ libxl_physinfo_dispose(&physinfo);
+
+ if (xlu_cfg_get_list(config, "vnuma", &vnuma, &num_vnuma, 1))
+ return;
+
+ if (!num_vnuma)
+ return;
+
+ b_info->num_vnuma_nodes = num_vnuma;
+ b_info->vnuma_nodes = xcalloc(num_vnuma, sizeof(libxl_vnode_info));
+ vcpu_parsed = xcalloc(num_vnuma, sizeof(libxl_bitmap));
+ for (i = 0; i < num_vnuma; i++) {
+ libxl_bitmap_init(&vcpu_parsed[i]);
+ if (libxl_cpu_bitmap_alloc(ctx, &vcpu_parsed[i], b_info->max_vcpus)) {
+ fprintf(stderr, "libxl_node_bitmap_alloc failed.\n");
+ exit(1);
+ }
+ }
+
+ for (i = 0; i < b_info->num_vnuma_nodes; i++) {
+ libxl_vnode_info *p = &b_info->vnuma_nodes[i];
+
+ libxl_vnode_info_init(p);
+ p->distances = xcalloc(b_info->num_vnuma_nodes,
+ sizeof(*p->distances));
+ p->num_distances = b_info->num_vnuma_nodes;
+ }
+
+ for (i = 0; i < num_vnuma; i++) {
+ XLU_ConfigValue *vnode_spec, *conf_option;
+ XLU_ConfigList *vnode_config_list;
+ int conf_count;
+ libxl_vnode_info *p = &b_info->vnuma_nodes[i];
+
+ vnode_spec = xlu_cfg_get_listitem2(vnuma, i);
+ assert(vnode_spec);
+
+ xlu_cfg_value_get_list(config, vnode_spec, &vnode_config_list, 0);
+ if (!vnode_config_list) {
+ fprintf(stderr, "xl: cannot get vnode config option list\n");
+ exit(1);
+ }
+
+ for (conf_count = 0;
+ (conf_option =
+ xlu_cfg_get_listitem2(vnode_config_list, conf_count));
+ conf_count++) {
+
+ if (xlu_cfg_value_type(conf_option) == XLU_STRING) {
+ char *buf, *option_untrimmed, *value_untrimmed;
+ char *option, *value;
+ unsigned long val;
+
+ xlu_cfg_value_get_string(config, conf_option, &buf, 0);
+
+ if (!buf) continue;
+
+ if (split_string_into_pair(buf, "=",
+ &option_untrimmed,
+ &value_untrimmed)) {
+ fprintf(stderr, "xl: failed to split \"%s\" into pair\n",
+ buf);
+ exit(1);
+ }
+ trim(isspace, option_untrimmed, &option);
+ trim(isspace, value_untrimmed, &value);
+
+ if (!strcmp("pnode", option)) {
+ val = parse_ulong(value);
+ if (val >= nr_nodes) {
+ fprintf(stderr,
+ "xl: invalid pnode number: %lu\n", val);
+ exit(1);
+ }
+ p->pnode = val;
+ libxl_defbool_set(&b_info->numa_placement, false);
+ } else if (!strcmp("size", option)) {
+ val = parse_ulong(value);
+ p->memkb = val << 10;
+ max_memkb += p->memkb;
+ } else if (!strcmp("vcpus", option)) {
+ libxl_string_list cpu_spec_list;
+ unsigned long s, e;
+
+ split_string_into_string_list(value, ",", &cpu_spec_list);
+ len = libxl_string_list_length(&cpu_spec_list);
+
+ for (j = 0; j < len; j++) {
+ parse_range(cpu_spec_list[j], &s, &e);
+ for (; s <= e; s++) {
+ /*
+ * Note that if we try to set a bit beyond
+ * the size of the bitmap, libxl_bitmap_set
+ * has no effect. The resulting bitmap
+ * doesn't reflect what the user wants. The
+ * fallout is dealt with later, after
+ * parsing.
+ */
+ libxl_bitmap_set(&vcpu_parsed[i], s);
+ max_vcpus++;
+ }
+ }
+
+ libxl_string_list_dispose(&cpu_spec_list);
+ } else if (!strcmp("vdistances", option)) {
+ libxl_string_list vdist;
+
+ split_string_into_string_list(value, ",", &vdist);
+ len = libxl_string_list_length(&vdist);
+
+ for (j = 0; j < len; j++) {
+ val = parse_ulong(vdist[j]);
+ p->distances[j] = val;
+ }
+ libxl_string_list_dispose(&vdist);
+ }
+ free(option);
+ free(value);
+ free(option_untrimmed);
+ free(value_untrimmed);
+ }
+ }
+ }
+
+ /* User has specified maxvcpus= */
+ if (b_info->max_vcpus != 0) {
+ if (b_info->max_vcpus != max_vcpus) {
+ fprintf(stderr, "xl: vnuma vcpus and maxvcpus= mismatch\n");
+ exit(1);
+ }
+ } else {
+ int host_cpus = libxl_get_online_cpus(ctx);
+
+ if (host_cpus < 0) {
+ fprintf(stderr, "Failed to get online cpus\n");
+ exit(1);
+ }
+
+ if (host_cpus < max_vcpus) {
+ fprintf(stderr, "xl: vnuma specifies more vcpus than pcpus, "\
+ "use maxvcpus= to override this check.\n");
+ exit(1);
+ }
+
+ b_info->max_vcpus = max_vcpus;
+ }
+
+ /* User has specified maxmem= */
+ if (b_info->max_memkb != LIBXL_MEMKB_DEFAULT &&
+ b_info->max_memkb != max_memkb) {
+ fprintf(stderr, "xl: maxmem and vnuma memory size mismatch\n");
+ exit(1);
+ } else
+ b_info->max_memkb = max_memkb;
+
+ for (i = 0; i < b_info->num_vnuma_nodes; i++) {
+ libxl_vnode_info *p = &b_info->vnuma_nodes[i];
+
+ libxl_bitmap_copy_alloc(ctx, &p->vcpus, &vcpu_parsed[i]);
+ libxl_bitmap_dispose(&vcpu_parsed[i]);
+ }
+
+ free(vcpu_parsed);
+}
+
static void parse_config_data(const char *config_source,
const char *config_data,
int config_len,
libxl_domain_config *d_config)
{
const char *buf;
- long l;
+ long l, vcpus = 0;
XLU_Config *config;
XLU_ConfigList *cpus, *vbds, *nics, *pcis, *cvfbs, *cpuids, *vtpms;
- XLU_ConfigList *channels, *ioports, *irqs, *iomem, *viridian;
+ XLU_ConfigList *channels, *ioports, *irqs, *iomem, *viridian, *dtdevs;
int num_ioports, num_irqs, num_iomem, num_cpus, num_viridian;
int pci_power_mgmt = 0;
int pci_msitranslate = 0;
@@ -1003,9 +1346,14 @@ static void parse_config_data(const char *config_source,
if (!xlu_cfg_get_long (config, "extratime", &l, 0))
b_info->sched_params.extratime = l;
- if (!xlu_cfg_get_long (config, "vcpus", &l, 0)) {
- b_info->max_vcpus = l;
+ if (!xlu_cfg_get_long (config, "memory", &l, 0))
+ b_info->target_memkb = l * 1024;
+
+ if (!xlu_cfg_get_long (config, "maxmem", &l, 0))
+ b_info->max_memkb = l * 1024;
+ if (!xlu_cfg_get_long (config, "vcpus", &l, 0)) {
+ vcpus = l;
if (libxl_cpu_bitmap_alloc(ctx, &b_info->avail_vcpus, l)) {
fprintf(stderr, "Unable to allocate cpumap\n");
exit(1);
@@ -1018,6 +1366,21 @@ static void parse_config_data(const char *config_source,
if (!xlu_cfg_get_long (config, "maxvcpus", &l, 0))
b_info->max_vcpus = l;
+ parse_vnuma_config(config, b_info);
+
+ /* Set max_memkb to target_memkb and max_vcpus to avail_vcpus if
+ * they are not set by user specified config option or vnuma.
+ */
+ if (b_info->max_memkb == LIBXL_MEMKB_DEFAULT)
+ b_info->max_memkb = b_info->target_memkb;
+ if (b_info->max_vcpus == 0)
+ b_info->max_vcpus = vcpus;
+
+ if (b_info->max_vcpus < vcpus) {
+ fprintf(stderr, "xl: maxvcpus < vcpus\n");
+ exit(1);
+ }
+
buf = NULL;
if (!xlu_cfg_get_list (config, "cpus", &cpus, &num_cpus, 1) ||
!xlu_cfg_get_string (config, "cpus", &buf, 0))
@@ -1028,14 +1391,6 @@ static void parse_config_data(const char *config_source,
!xlu_cfg_get_string (config, "cpus_soft", &buf, 0))
parse_vcpu_affinity(b_info, cpus, buf, num_cpus, false);
- if (!xlu_cfg_get_long (config, "memory", &l, 0)) {
- b_info->max_memkb = l * 1024;
- b_info->target_memkb = b_info->max_memkb;
- }
-
- if (!xlu_cfg_get_long (config, "maxmem", &l, 0))
- b_info->max_memkb = l * 1024;
-
libxl_defbool_set(&b_info->claim_mode, claim_mode);
if (xlu_cfg_get_string (config, "on_poweroff", &buf, 0))
@@ -1117,6 +1472,7 @@ static void parse_config_data(const char *config_source,
xlu_cfg_replace_string (config, "kernel", &b_info->kernel, 0);
xlu_cfg_replace_string (config, "ramdisk", &b_info->ramdisk, 0);
+ xlu_cfg_replace_string (config, "device_tree", &b_info->device_tree, 0);
b_info->cmdline = parse_cmdline(config);
xlu_cfg_get_defbool(config, "driver_domain", &c_info->driver_domain, 0);
@@ -1237,6 +1593,8 @@ static void parse_config_data(const char *config_source,
xlu_cfg_get_defbool(config, "nestedhvm", &b_info->u.hvm.nested_hvm, 0);
+ xlu_cfg_get_defbool(config, "altp2mhvm", &b_info->u.hvm.altp2m, 0);
+
xlu_cfg_replace_string(config, "smbios_firmware",
&b_info->u.hvm.smbios_firmware, 0);
xlu_cfg_replace_string(config, "acpi_firmware",
@@ -1256,6 +1614,9 @@ static void parse_config_data(const char *config_source,
exit(1);
}
}
+
+ if (!xlu_cfg_get_long (config, "rdm_mem_boundary", &l, 0))
+ b_info->u.hvm.rdm_mem_boundary_memkb = l * 1024;
break;
case LIBXL_DOMAIN_TYPE_PV:
{
@@ -1412,12 +1773,12 @@ static void parse_config_data(const char *config_source,
libxl_device_disk *disk;
char *buf2 = strdup(buf);
- d_config->disks = (libxl_device_disk *) realloc(d_config->disks, sizeof (libxl_device_disk) * (d_config->num_disks + 1));
- disk = d_config->disks + d_config->num_disks;
+ disk = ARRAY_EXTEND_INIT_NODEVID(d_config->disks,
+ d_config->num_disks,
+ libxl_device_disk_init);
parse_disk_config(&config, buf2, disk);
free(buf2);
- d_config->num_disks++;
}
}
@@ -1430,11 +1791,9 @@ static void parse_config_data(const char *config_source,
char *p, *p2;
bool got_backend = false;
- d_config->vtpms = (libxl_device_vtpm *) realloc(d_config->vtpms,
- sizeof(libxl_device_vtpm) * (d_config->num_vtpms+1));
- vtpm = d_config->vtpms + d_config->num_vtpms;
- libxl_device_vtpm_init(vtpm);
- vtpm->devid = d_config->num_vtpms;
+ vtpm = ARRAY_EXTEND_INIT(d_config->vtpms,
+ d_config->num_vtpms,
+ libxl_device_vtpm_init);
p = strtok(buf2, ",");
if(p) {
@@ -1464,7 +1823,6 @@ static void parse_config_data(const char *config_source,
exit(1);
}
free(buf2);
- d_config->num_vtpms++;
}
}
@@ -1552,12 +1910,11 @@ static void parse_config_data(const char *config_source,
while ((buf = xlu_cfg_get_listitem (nics, d_config->num_nics)) != NULL) {
libxl_device_nic *nic;
char *buf2 = strdup(buf);
- char *p, *p2;
+ char *p;
- d_config->nics = (libxl_device_nic *) realloc(d_config->nics, sizeof (libxl_device_nic) * (d_config->num_nics+1));
- nic = d_config->nics + d_config->num_nics;
- libxl_device_nic_init(nic);
- nic->devid = d_config->num_nics;
+ nic = ARRAY_EXTEND_INIT(d_config->nics,
+ d_config->num_nics,
+ libxl_device_nic_init);
set_default_nic_values(nic);
p = strtok(buf2, ",");
@@ -1566,68 +1923,10 @@ static void parse_config_data(const char *config_source,
do {
while (*p == ' ')
p++;
- if ((p2 = strchr(p, '=')) == NULL)
- break;
- *p2 = '\0';
- if (!strcmp(p, "model")) {
- free(nic->model);
- nic->model = strdup(p2 + 1);
- } else if (!strcmp(p, "mac")) {
- char *p3 = p2 + 1;
- *(p3 + 2) = '\0';
- nic->mac[0] = strtol(p3, NULL, 16);
- p3 = p3 + 3;
- *(p3 + 2) = '\0';
- nic->mac[1] = strtol(p3, NULL, 16);
- p3 = p3 + 3;
- *(p3 + 2) = '\0';
- nic->mac[2] = strtol(p3, NULL, 16);
- p3 = p3 + 3;
- *(p3 + 2) = '\0';
- nic->mac[3] = strtol(p3, NULL, 16);
- p3 = p3 + 3;
- *(p3 + 2) = '\0';
- nic->mac[4] = strtol(p3, NULL, 16);
- p3 = p3 + 3;
- *(p3 + 2) = '\0';
- nic->mac[5] = strtol(p3, NULL, 16);
- } else if (!strcmp(p, "bridge")) {
- free(nic->bridge);
- nic->bridge = strdup(p2 + 1);
- } else if (!strcmp(p, "type")) {
- if (!strcmp(p2 + 1, "ioemu"))
- nic->nictype = LIBXL_NIC_TYPE_VIF_IOEMU;
- else
- nic->nictype = LIBXL_NIC_TYPE_VIF;
- } else if (!strcmp(p, "ip")) {
- free(nic->ip);
- nic->ip = strdup(p2 + 1);
- } else if (!strcmp(p, "script")) {
- free(nic->script);
- nic->script = strdup(p2 + 1);
- } else if (!strcmp(p, "vifname")) {
- free(nic->ifname);
- nic->ifname = strdup(p2 + 1);
- } else if (!strcmp(p, "backend")) {
- free(nic->backend_domname);
- nic->backend_domname = strdup(p2 + 1);
- } else if (!strcmp(p, "rate")) {
- parse_vif_rate(&config, (p2 + 1), nic);
- } else if (!strcmp(p, "accel")) {
- fprintf(stderr, "the accel parameter for vifs is currently not supported\n");
- } else if (!strcmp(p, "netdev")) {
- fprintf(stderr, "the netdev parameter is deprecated, "
- "please use gatewaydev instead\n");
- free(nic->gatewaydev);
- nic->gatewaydev = strdup(p2 + 1);
- } else if (!strcmp(p, "gatewaydev")) {
- free(nic->gatewaydev);
- nic->gatewaydev = strdup(p2 + 1);
- }
+ parse_nic_config(nic, &config, p);
} while ((p = strtok(NULL, ",")) != NULL);
skip_nic:
free(buf2);
- d_config->num_nics++;
}
}
@@ -1714,27 +2013,62 @@ skip_vfb:
xlu_cfg_get_defbool(config, "e820_host", &b_info->u.pv.e820_host, 0);
}
+ if (!xlu_cfg_get_string(config, "rdm", &buf, 0)) {
+ libxl_rdm_reserve rdm;
+ if (!xlu_rdm_parse(config, &rdm, buf)) {
+ b_info->u.hvm.rdm.strategy = rdm.strategy;
+ b_info->u.hvm.rdm.policy = rdm.policy;
+ }
+ }
+
if (!xlu_cfg_get_list (config, "pci", &pcis, 0, 0)) {
d_config->num_pcidevs = 0;
d_config->pcidevs = NULL;
for(i = 0; (buf = xlu_cfg_get_listitem (pcis, i)) != NULL; i++) {
libxl_device_pci *pcidev;
- d_config->pcidevs = (libxl_device_pci *) realloc(d_config->pcidevs, sizeof (libxl_device_pci) * (d_config->num_pcidevs + 1));
- pcidev = d_config->pcidevs + d_config->num_pcidevs;
- libxl_device_pci_init(pcidev);
-
+ pcidev = ARRAY_EXTEND_INIT_NODEVID(d_config->pcidevs,
+ d_config->num_pcidevs,
+ libxl_device_pci_init);
pcidev->msitranslate = pci_msitranslate;
pcidev->power_mgmt = pci_power_mgmt;
pcidev->permissive = pci_permissive;
pcidev->seize = pci_seize;
- if (!xlu_pci_parse_bdf(config, pcidev, buf))
- d_config->num_pcidevs++;
+ /*
+ * Like other pci options, the per-device policy always follows
+ * the global policy by default.
+ */
+ pcidev->rdm_policy = b_info->u.hvm.rdm.policy;
+ e = xlu_pci_parse_bdf(config, pcidev, buf);
+ if (e) {
+ fprintf(stderr,
+ "unable to parse PCI BDF `%s' for passthrough\n",
+ buf);
+ exit(-e);
+ }
}
if (d_config->num_pcidevs && c_info->type == LIBXL_DOMAIN_TYPE_PV)
libxl_defbool_set(&b_info->u.pv.e820_host, true);
}
+ if (!xlu_cfg_get_list (config, "dtdev", &dtdevs, 0, 0)) {
+ d_config->num_dtdevs = 0;
+ d_config->dtdevs = NULL;
+ for (i = 0; (buf = xlu_cfg_get_listitem(dtdevs, i)) != NULL; i++) {
+ libxl_device_dtdev *dtdev;
+
+ dtdev = ARRAY_EXTEND_INIT_NODEVID(d_config->dtdevs,
+ d_config->num_dtdevs,
+ libxl_device_dtdev_init);
+
+ dtdev->path = strdup(buf);
+ if (dtdev->path == NULL) {
+ fprintf(stderr, "unable to duplicate string for dtdevs\n");
+ exit(-1);
+ }
+ }
+ }
+
switch (xlu_cfg_get_list(config, "cpuid", &cpuids, 0, 1)) {
case 0:
{
@@ -1910,6 +2244,8 @@ skip_vfb:
b_info->u.hvm.vga.kind = LIBXL_VGA_INTERFACE_TYPE_CIRRUS;
} else if (!strcmp(buf, "none")) {
b_info->u.hvm.vga.kind = LIBXL_VGA_INTERFACE_TYPE_NONE;
+ } else if (!strcmp(buf, "qxl")) {
+ b_info->u.hvm.vga.kind = LIBXL_VGA_INTERFACE_TYPE_QXL;
} else {
fprintf(stderr, "Unknown vga \"%s\" specified\n", buf);
exit(1);
@@ -1918,6 +2254,13 @@ skip_vfb:
b_info->u.hvm.vga.kind = l ? LIBXL_VGA_INTERFACE_TYPE_STD :
LIBXL_VGA_INTERFACE_TYPE_CIRRUS;
+ if (!xlu_cfg_get_string(config, "hdtype", &buf, 0) &&
+ libxl_hdtype_from_string(buf, &b_info->u.hvm.hdtype)) {
+ fprintf(stderr, "ERROR: invalid value \"%s\" for \"hdtype\"\n",
+ buf);
+ exit (1);
+ }
+
xlu_cfg_replace_string (config, "keymap", &b_info->u.hvm.keymap, 0);
xlu_cfg_get_defbool (config, "spice", &b_info->u.hvm.spice.enable, 0);
if (!xlu_cfg_get_long (config, "spiceport", &l, 0))
@@ -1938,6 +2281,10 @@ skip_vfb:
&b_info->u.hvm.spice.clipboard_sharing, 0);
if (!xlu_cfg_get_long (config, "spiceusbredirection", &l, 0))
b_info->u.hvm.spice.usbredirection = l;
+ xlu_cfg_replace_string (config, "spice_image_compression",
+ &b_info->u.hvm.spice.image_compression, 0);
+ xlu_cfg_replace_string (config, "spice_streaming_video",
+ &b_info->u.hvm.spice.streaming_video, 0);
xlu_cfg_get_defbool(config, "nographic", &b_info->u.hvm.nographic, 0);
xlu_cfg_get_defbool(config, "gfx_passthru",
&b_info->u.hvm.gfx_passthru, 0);
@@ -2010,6 +2357,15 @@ skip_vfb:
}
}
+ if (!xlu_cfg_get_string (config, "gic_version", &buf, 1)) {
+ e = libxl_gic_version_from_string(buf, &b_info->arch_arm.gic_version);
+ if (e) {
+ fprintf(stderr,
+ "Unknown gic_version \"%s\" specified\n", buf);
+ exit(-ERROR_FAIL);
+ }
+ }
+
xlu_cfg_destroy(config);
}
@@ -2092,14 +2448,11 @@ static int handle_domain_death(uint32_t *r_domid,
char *corefile;
int rc;
- if (asprintf(&corefile, "/var/xen/dump/%s", d_config->c_info.name) < 0) {
- LOG("failed to construct core dump path");
- } else {
- LOG("dumping core to %s", corefile);
- rc=libxl_domain_core_dump(ctx, *r_domid, corefile, NULL);
- if (rc) LOG("core dump failed (rc=%d).", rc);
- free(corefile);
- }
+ xasprintf(&corefile, XEN_DUMP_DIR "/%s", d_config->c_info.name);
+ LOG("dumping core to %s", corefile);
+ rc = libxl_domain_core_dump(ctx, *r_domid, corefile, NULL);
+ if (rc) LOG("core dump failed (rc=%d).", rc);
+ free(corefile);
/* No point crying over spilled milk, continue on failure. */
if (action == LIBXL_ACTION_ON_SHUTDOWN_COREDUMP_DESTROY)
@@ -2137,17 +2490,6 @@ static int handle_domain_death(uint32_t *r_domid,
return restart;
}
-/* for now used only by main_networkattach, but can be reused elsewhere */
-static int match_option_size(const char *prefix, size_t len,
- char *arg, char **argopt)
-{
- int rc = strncmp(prefix, arg, len);
- if (!rc) *argopt = arg+len;
- return !rc;
-}
-#define MATCH_OPTION(prefix, arg, oparg) \
- match_option_size((prefix "="), sizeof((prefix)), (arg), &(oparg))
-
/* Preserve a copy of a domain under a new name. Updates *r_domid */
static int preserve_domain(uint32_t *r_domid, libxl_event *event,
libxl_domain_config *d_config)
@@ -2217,15 +2559,9 @@ static int freemem(uint32_t domid, libxl_domain_build_info *b_info)
if (rc < 0)
return rc;
- rc = libxl_wait_for_free_memory(ctx, domid, need_memkb, 10);
- if (!rc)
- return 0;
- else if (rc != ERROR_NOMEM)
- return rc;
-
- /* the memory target has been reached but the free memory is still
- * not enough: loop over again */
- rc = libxl_wait_for_memory_target(ctx, 0, 1);
+ /* wait until dom0 reaches its target, as long as we are making
+ * progress */
+ rc = libxl_wait_for_memory_target(ctx, 0, 10);
if (rc < 0)
return rc;
@@ -2318,6 +2654,7 @@ static uint32_t create_domain(struct domain_create *dom_info)
void *config_data = 0;
int config_len = 0;
int restore_fd = -1;
+ int restore_fd_to_close = -1;
const libxl_asyncprogress_how *autoconnect_console_how;
struct save_file_header hdr;
@@ -2341,6 +2678,7 @@ static uint32_t create_domain(struct domain_create *dom_info)
fprintf(stderr, "Can't open restore file: %s\n", strerror(errno));
return ERROR_INVAL;
}
+ restore_fd_to_close = restore_fd;
rc = libxl_fd_set_cloexec(ctx, restore_fd, 1);
if (rc) return rc;
}
@@ -2419,7 +2757,7 @@ static uint32_t create_domain(struct domain_create *dom_info)
}
if (!restoring && extra_config && strlen(extra_config)) {
if (config_len > INT_MAX - (strlen(extra_config) + 2 + 1)) {
- fprintf(stderr, "Failed to attach extra configration\n");
+ fprintf(stderr, "Failed to attach extra configuration\n");
return ERROR_FAIL;
}
/* allocate space for the extra config plus two EOLs plus \0 */
@@ -2463,18 +2801,28 @@ static uint32_t create_domain(struct domain_create *dom_info)
common_domname = d_config.c_info.name;
d_config.c_info.name = 0; /* steals allocation from config */
- if (asprintf(&d_config.c_info.name,
- "%s--incoming", common_domname) < 0) {
- fprintf(stderr, "Failed to allocate memory in asprintf\n");
+ xasprintf(&d_config.c_info.name, "%s--incoming", common_domname);
+ *dom_info->migration_domname_r = strdup(d_config.c_info.name);
+ }
+ }
+
+ if (debug || dom_info->dryrun) {
+ FILE *cfg_print_fh = (debug && !dom_info->dryrun) ? stderr : stdout;
+ if (default_output_format == OUTPUT_FORMAT_SXP) {
+ printf_info_sexp(-1, &d_config, cfg_print_fh);
+ } else {
+ char *json = libxl_domain_config_to_json(ctx, &d_config);
+ if (!json) {
+ fprintf(stderr,
+ "Failed to convert domain configuration to JSON\n");
exit(1);
}
- *dom_info->migration_domname_r = strdup(d_config.c_info.name);
+ fputs(json, cfg_print_fh);
+ free(json);
+ flush_stream(cfg_print_fh);
}
}
- if (debug || dom_info->dryrun)
- printf_info(default_output_format, -1, &d_config,
- debug ? stderr : stdout);
ret = 0;
if (dom_info->dryrun)
@@ -2508,6 +2856,9 @@ start:
libxl_domain_restore_params_init(¶ms);
params.checkpointed_stream = dom_info->checkpointed_stream;
+ params.stream_version =
+ (hdr.mandatory_flags & XL_MANDATORY_FLAG_STREAMv2) ? 2 : 1;
+
ret = libxl_domain_create_restore(ctx, &d_config,
&domid, restore_fd,
¶ms,
@@ -2529,6 +2880,13 @@ start:
release_lock();
+ if (restore_fd_to_close >= 0) {
+ if (close(restore_fd_to_close))
+ fprintf(stderr, "Failed to close restoring file, fd %d, errno %d\n",
+ restore_fd_to_close, errno);
+ restore_fd_to_close = -1;
+ }
+
if (!paused)
libxl_domain_unpause(ctx, domid);
@@ -2542,11 +2900,8 @@ start:
if (need_daemon) {
char *name;
- if (asprintf(&name, "xl-%s", d_config.c_info.name) < 0) {
- LOG("Failed to allocate memory in asprintf");
- exit(1);
- }
- ret = do_daemonize(name);
+ xasprintf(&name, "xl-%s", d_config.c_info.name);
+ ret = do_daemonize(name, NULL);
free(name);
if (ret) {
ret = (ret == 1) ? domid : ret;
@@ -2738,11 +3093,14 @@ static int64_t parse_mem_size_kb(const char *mem)
switch (tolower((uint8_t)*endptr)) {
case 't':
kbytes <<= 10;
+ /* fallthrough */
case 'g':
kbytes <<= 10;
+ /* fallthrough */
case '\0':
case 'm':
kbytes <<= 10;
+ /* fallthrough */
case 'k':
break;
case 'b':
@@ -2755,7 +3113,9 @@ static int64_t parse_mem_size_kb(const char *mem)
return kbytes;
}
-#define COMMON_LONG_OPTS {"help", 0, 0, 'h'}
+/* Must be last in list */
+#define COMMON_LONG_OPTS {"help", 0, 0, 'h'}, \
+ {0, 0, 0, 0}
/*
* Callers should use SWITCH_FOREACH_OPT in preference to calling this
@@ -2768,8 +3128,7 @@ static int def_getopt(int argc, char * const argv[],
{
int opt;
const struct option def_options[] = {
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
if (!longopts)
@@ -2803,14 +3162,15 @@ static int def_getopt(int argc, char * const argv[],
* Wraps def_getopt into a convenient loop+switch to process all
* arguments. This macro is intended to be called from main_XXX().
*
- * SWITCH_FOREACH_OPT(int *opt, const char *opts,
+ * SWITCH_FOREACH_OPT(int *opt, "OPTS",
* const struct option *longopts,
* const char *commandname,
* int num_opts_req) { ...
*
* opt: pointer to an int variable, holds the current option
* during processing.
- * opts: short options, as per getopt_long(3)'s optstring argument.
+ * OPTS: short options, as per getopt_long(3)'s optstring argument.
+ * do not include "h"; will be provided automatically
* longopts: long options, as per getopt_long(3)'s longopts argument.
* May be null.
* commandname: name of this command, for usage string.
@@ -2854,7 +3214,7 @@ static int def_getopt(int argc, char * const argv[],
*/
#define SWITCH_FOREACH_OPT(opt, opts, longopts, \
commandname, num_required_opts) \
- while (((opt) = def_getopt(argc, argv, (opts), (longopts), \
+ while (((opt) = def_getopt(argc, argv, "h" opts, (longopts), \
(commandname), (num_required_opts))) != -1) \
switch (opt)
@@ -2935,11 +3295,8 @@ static int cd_insert(uint32_t domid, const char *virtdev, char *phys)
struct stat b;
int rc = 0;
- if (asprintf(&buf, "vdev=%s,access=r,devtype=cdrom,target=%s",
- virtdev, phys ? phys : "") < 0) {
- fprintf(stderr, "out of memory\n");
- return 1;
- }
+ xasprintf(&buf, "vdev=%s,access=r,devtype=cdrom,target=%s",
+ virtdev, phys ? phys : "");
parse_disk_config(&config, buf, &disk);
@@ -3037,13 +3394,12 @@ int main_vncviewer(int argc, char **argv)
static const struct option opts[] = {
{"autopass", 0, 0, 'a'},
{"vncviewer-autopass", 0, 0, 'a'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
uint32_t domid;
int opt, autopass = 0;
- SWITCH_FOREACH_OPT(opt, "ah", opts, "vncviewer", 1) {
+ SWITCH_FOREACH_OPT(opt, "a", opts, "vncviewer", 1) {
case 'a':
autopass = 1;
break;
@@ -3503,8 +3859,8 @@ static void print_bitmap(uint8_t *map, int maplen, FILE *stream)
}
}
-static void list_domains(int verbose, int context, int claim, int numa,
- const libxl_dominfo *info, int nb_domain)
+static void list_domains(bool verbose, bool context, bool claim, bool numa,
+ bool cpupool, const libxl_dominfo *info, int nb_domain)
{
int i;
static const char shutdown_reason_letters[]= "-rscw";
@@ -3518,6 +3874,7 @@ static void list_domains(int verbose, int context, int claim, int numa,
if (verbose) printf(" UUID Reason-Code\tSecurity Label");
if (context && !verbose) printf(" Security Label");
if (claim) printf(" Claimed");
+ if (cpupool) printf(" Cpupool");
if (numa) {
if (libxl_node_bitmap_alloc(ctx, &nodemap, 0)) {
fprintf(stderr, "libxl_node_bitmap_alloc_failed.\n");
@@ -3562,6 +3919,11 @@ static void list_domains(int verbose, int context, int claim, int numa,
printf(" %5lu", (unsigned long)info[i].outstanding_memkb / 1024);
if (verbose || context)
printf(" %16s", info[i].ssid_label ? : "-");
+ if (cpupool) {
+ char *poolname = libxl_cpupoolid_to_name(ctx, info[i].cpupool);
+ printf("%16s", poolname);
+ free(poolname);
+ }
if (numa) {
libxl_domain_get_nodeaffinity(ctx, info[i].domid, &nodemap);
@@ -3659,6 +4021,7 @@ static void save_domain_core_writeconfig(int fd, const char *source,
memset(&hdr, 0, sizeof(hdr));
memcpy(hdr.magic, savefileheader_magic, sizeof(hdr.magic));
hdr.byteorder = SAVEFILE_BYTEORDER_VALUE;
+ hdr.mandatory_flags = XL_MANDATORY_FLAG_STREAMv2;
optdata_begin= 0;
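This is the save-side half of the stream-version handshake: the v2 flag stamped into the header here is exactly what the restore path earlier in this patch tests when choosing a parser:

    /* Restore side, from the libxl_domain_create_restore() hunk above: */
    params.stream_version =
        (hdr.mandatory_flags & XL_MANDATORY_FLAG_STREAMv2) ? 2 : 1;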
@@ -3737,7 +4100,7 @@ static pid_t create_migration_child(const char *rune, int *send_fd,
int *recv_fd)
{
int sendpipe[2], recvpipe[2];
- pid_t child = -1;
+ pid_t child;
if (!rune || !send_fd || !recv_fd)
return -1;
@@ -3934,8 +4297,7 @@ static void migrate_domain(uint32_t domid, const char *rune, int debug,
fprintf(stderr, "migration sender: Target has acknowledged transfer.\n");
if (common_domname) {
- if (asprintf(&away_domname, "%s--migratedaway", common_domname) < 0)
- goto failed_resume;
+ xasprintf(&away_domname, "%s--migratedaway", common_domname);
rc = libxl_domain_rename(ctx, domid, common_domname, away_domname);
if (rc) goto failed_resume;
}
@@ -4165,11 +4527,10 @@ int main_restore(int argc, char **argv)
static struct option opts[] = {
{"vncviewer", 0, 0, 'V'},
{"vncviewer-autopass", 0, 0, 'A'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
- SWITCH_FOREACH_OPT(opt, "FhcpdeVA", opts, "restore", 1) {
+ SWITCH_FOREACH_OPT(opt, "FcpdeVA", opts, "restore", 1) {
case 'c':
console_autoconnect = 1;
break;
@@ -4297,8 +4658,8 @@ int main_migrate(int argc, char **argv)
int opt, daemonize = 1, monitor = 1, debug = 0;
static struct option opts[] = {
{"debug", 0, 0, 0x100},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ {"live", 0, 0, 0x200},
+ COMMON_LONG_OPTS
};
SWITCH_FOREACH_OPT(opt, "FC:s:e", opts, "migrate", 2) {
@@ -4315,9 +4676,12 @@ int main_migrate(int argc, char **argv)
daemonize = 0;
monitor = 0;
break;
- case 0x100:
+ case 0x100: /* --debug */
debug = 1;
break;
+ case 0x200: /* --live */
+ /* ignored for compatibility with xm */
+ break;
}
domid = find_domain(argv[optind]);
@@ -4339,13 +4703,12 @@ int main_migrate(int argc, char **argv)
} else {
verbose_len = (minmsglevel_default - minmsglevel) + 2;
}
- if (asprintf(&rune, "exec %s %s xl%s%.*s migrate-receive%s%s",
- ssh_command, host,
- pass_tty_arg ? " -t" : "",
- verbose_len, verbose_buf,
- daemonize ? "" : " -e",
- debug ? " -d" : "") < 0)
- return 1;
+ xasprintf(&rune, "exec %s %s xl%s%.*s migrate-receive%s%s",
+ ssh_command, host,
+ pass_tty_arg ? " -t" : "",
+ verbose_len, verbose_buf,
+ daemonize ? "" : " -e",
+ debug ? " -d" : "");
}
migrate_domain(domid, rune, debug, config_filename);
@@ -4418,8 +4781,7 @@ static int main_shutdown_or_reboot(int do_reboot, int argc, char **argv)
static struct option opts[] = {
{"all", 0, 0, 'a'},
{"wait", 0, 0, 'w'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
SWITCH_FOREACH_OPT(opt, "awF", opts, what, 0) {
@@ -4457,8 +4819,10 @@ static int main_shutdown_or_reboot(int do_reboot, int argc, char **argv)
fallback_trigger);
}
- if (wait_for_it)
+ if (wait_for_it) {
wait_for_domain_deaths(deathws, nb_domain - 1 /* not dom 0 */);
+ free(deathws);
+ }
libxl_dominfo_list_free(dominfo, nb_domain);
} else {
@@ -4487,38 +4851,45 @@ int main_reboot(int argc, char **argv)
int main_list(int argc, char **argv)
{
- int opt, verbose = 0;
- int context = 0;
- int details = 0;
- int numa = 0;
+ int opt;
+ bool verbose = false;
+ bool context = false;
+ bool details = false;
+ bool cpupool = false;
+ bool numa = false;
static struct option opts[] = {
{"long", 0, 0, 'l'},
{"verbose", 0, 0, 'v'},
{"context", 0, 0, 'Z'},
+ {"cpupool", 0, 0, 'c'},
{"numa", 0, 0, 'n'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
libxl_dominfo info_buf;
libxl_dominfo *info, *info_free=0;
int nb_domain, rc;
- SWITCH_FOREACH_OPT(opt, "lvhZn", opts, "list", 0) {
+ SWITCH_FOREACH_OPT(opt, "lvhZcn", opts, "list", 0) {
case 'l':
- details = 1;
+ details = true;
break;
case 'v':
- verbose = 1;
+ verbose = true;
break;
case 'Z':
- context = 1;
+ context = true;
+ break;
+ case 'c':
+ cpupool = true;
break;
case 'n':
- numa = 1;
+ numa = true;
break;
}
+ libxl_dominfo_init(&info_buf);
+
if (optind >= argc) {
info = libxl_list_domain(ctx, &nb_domain);
if (!info) {
@@ -4529,7 +4900,7 @@ int main_list(int argc, char **argv)
} else if (optind == argc-1) {
uint32_t domid = find_domain(argv[optind]);
rc = libxl_domain_info(ctx, &info_buf, domid);
- if (rc == ERROR_INVAL) {
+ if (rc == ERROR_DOMAIN_NOTFOUND) {
fprintf(stderr, "Error: Domain \'%s\' does not exist.\n",
argv[optind]);
return -rc;
@@ -4548,12 +4919,13 @@ int main_list(int argc, char **argv)
if (details)
list_domains_details(info, nb_domain);
else
- list_domains(verbose, context, 0 /* claim */, numa, info, nb_domain);
+ list_domains(verbose, context, false /* claim */, numa, cpupool,
+ info, nb_domain);
if (info_free)
libxl_dominfo_list_free(info, nb_domain);
- else
- libxl_dominfo_dispose(info);
+
+ libxl_dominfo_dispose(&info_buf);
return 0;
}
@@ -4570,11 +4942,25 @@ int main_vm_list(int argc, char **argv)
return 0;
}
+static void string_realloc_append(char **accumulate, const char *more)
+{
+ /* Appends more to accumulate. Accumulate is either NULL, or
+ * points (always) to a malloc'd nul-terminated string. */
+
+ size_t oldlen = *accumulate ? strlen(*accumulate) : 0;
+ size_t morelen = strlen(more) + 1/*nul*/;
+ if (oldlen > SSIZE_MAX || morelen > SSIZE_MAX - oldlen) {
+ fprintf(stderr,"Additional config data far too large\n");
+ exit(-ERROR_FAIL);
+ }
+
+ *accumulate = xrealloc(*accumulate, oldlen + morelen);
+ memcpy(*accumulate + oldlen, more, morelen);
+}
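As used by main_create() and main_config_update() below, the helper grows the extra-config buffer on demand, replacing the old fixed 1024-byte arrays. An illustrative fragment mirroring those loops:

    char *extra = NULL;   /* must start out NULL or malloc'd */

    string_realloc_append(&extra, "vcpus=2");
    string_realloc_append(&extra, "\n");
    string_realloc_append(&extra, "memory=1024");
    string_realloc_append(&extra, "\n");
    /* extra now holds "vcpus=2\nmemory=1024\n" */
    free(extra);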
+
int main_create(int argc, char **argv)
{
const char *filename = NULL;
- char *p;
- char extra_config[1024];
struct domain_create dom_info;
int paused = 0, debug = 0, daemonize = 1, console_autoconnect = 0,
quiet = 0, monitor = 1, vnc = 0, vncautopass = 0;
@@ -4585,16 +4971,17 @@ int main_create(int argc, char **argv)
{"defconfig", 1, 0, 'f'},
{"vncviewer", 0, 0, 'V'},
{"vncviewer-autopass", 0, 0, 'A'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
+ dom_info.extra_config = NULL;
+
if (argv[1] && argv[1][0] != '-' && !strchr(argv[1], '=')) {
filename = argv[1];
argc--; argv++;
}
- SWITCH_FOREACH_OPT(opt, "Fhnqf:pcdeVA", opts, "create", 0) {
+ SWITCH_FOREACH_OPT(opt, "Fnqf:pcdeVA", opts, "create", 0) {
case 'f':
filename = optarg;
break;
@@ -4628,20 +5015,21 @@ int main_create(int argc, char **argv)
break;
}
- extra_config[0] = '\0';
- for (p = extra_config; optind < argc; optind++) {
+ memset(&dom_info, 0, sizeof(dom_info));
+
+ for (; optind < argc; optind++) {
if (strchr(argv[optind], '=') != NULL) {
- p += snprintf(p, sizeof(extra_config) - (p - extra_config),
- "%s\n", argv[optind]);
+ string_realloc_append(&dom_info.extra_config, argv[optind]);
+ string_realloc_append(&dom_info.extra_config, "\n");
} else if (!filename) {
filename = argv[optind];
} else {
help("create");
+ free(dom_info.extra_config);
return 2;
}
}
- memset(&dom_info, 0, sizeof(dom_info));
dom_info.debug = debug;
dom_info.daemonize = daemonize;
dom_info.monitor = monitor;
@@ -4649,16 +5037,18 @@ int main_create(int argc, char **argv)
dom_info.dryrun = dryrun_only;
dom_info.quiet = quiet;
dom_info.config_file = filename;
- dom_info.extra_config = extra_config;
dom_info.migrate_fd = -1;
dom_info.vnc = vnc;
dom_info.vncautopass = vncautopass;
dom_info.console_autoconnect = console_autoconnect;
rc = create_domain(&dom_info);
- if (rc < 0)
+ if (rc < 0) {
+ free(dom_info.extra_config);
return -rc;
+ }
+ free(dom_info.extra_config);
return 0;
}
@@ -4666,8 +5056,7 @@ int main_config_update(int argc, char **argv)
{
uint32_t domid;
const char *filename = NULL;
- char *p;
- char extra_config[1024];
+ char *extra_config = NULL;
void *config_data = 0;
int config_len = 0;
libxl_domain_config d_config;
@@ -4675,8 +5064,7 @@ int main_config_update(int argc, char **argv)
int debug = 0;
static struct option opts[] = {
{"defconfig", 1, 0, 'f'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
if (argc < 2) {
@@ -4696,7 +5084,7 @@ int main_config_update(int argc, char **argv)
argc--; argv++;
}
- SWITCH_FOREACH_OPT(opt, "dhqf:", opts, "config_update", 0) {
+ SWITCH_FOREACH_OPT(opt, "dqf:", opts, "config_update", 0) {
case 'd':
debug = 1;
break;
@@ -4705,15 +5093,15 @@ int main_config_update(int argc, char **argv)
break;
}
- extra_config[0] = '\0';
- for (p = extra_config; optind < argc; optind++) {
+ for (; optind < argc; optind++) {
if (strchr(argv[optind], '=') != NULL) {
- p += snprintf(p, sizeof(extra_config) - (p - extra_config),
- "%s\n", argv[optind]);
+ string_realloc_append(&extra_config, argv[optind]);
+ string_realloc_append(&extra_config, "\n");
} else if (!filename) {
filename = argv[optind];
} else {
help("create");
+ free(extra_config);
return 2;
}
}
@@ -4722,10 +5110,11 @@ int main_config_update(int argc, char **argv)
rc = libxl_read_file_contents(ctx, filename,
&config_data, &config_len);
if (rc) { fprintf(stderr, "Failed to read config file: %s: %s\n",
- filename, strerror(errno)); return ERROR_FAIL; }
- if (strlen(extra_config)) {
+ filename, strerror(errno));
+ free(extra_config); return ERROR_FAIL; }
+ if (extra_config && strlen(extra_config)) {
if (config_len > INT_MAX - (strlen(extra_config) + 2 + 1)) {
- fprintf(stderr, "Failed to attach extra configration\n");
+ fprintf(stderr, "Failed to attach extra configuration\n");
exit(1);
}
/* allocate space for the extra config plus two EOLs plus \0 */
@@ -4763,7 +5152,7 @@ int main_config_update(int argc, char **argv)
libxl_domain_config_dispose(&d_config);
free(config_data);
-
+ free(extra_config);
return 0;
}
@@ -4949,7 +5338,7 @@ int main_vcpupin(int argc, char **argv)
*/
if (!strcmp(hard_str, "-"))
hard = NULL;
- else if (vcpupin_parse(hard_str, hard))
+ else if (cpurange_parse(hard_str, hard))
goto out;
/*
* Soft affinity is handled similarly. Only difference: we also want
@@ -4957,7 +5346,7 @@ int main_vcpupin(int argc, char **argv)
*/
if (argc <= optind+3 || !strcmp(soft_str, "-"))
soft = NULL;
- else if (vcpupin_parse(soft_str, soft))
+ else if (cpurange_parse(soft_str, soft))
goto out;
if (dryrun_only) {
@@ -5010,16 +5399,18 @@ int main_vcpupin(int argc, char **argv)
return rc;
}
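The two hunks above switch main_vcpupin() to cpurange_parse(), the renamed and generalised vcpupin_parse(); the cpupool commands later in this patch reuse the same parser for their CPU arguments. For background (taken from xl's documentation of CPU lists, not introduced by this hunk), the accepted forms include:

    /* "all"       every CPU
     * "0-3,7"     lists and ranges
     * "0-7,^3"    exclusion with ^
     * "node:1"    all CPUs of NUMA node 1
     */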
-static void vcpuset(uint32_t domid, const char* nr_vcpus, int check_host)
+static int vcpuset(uint32_t domid, const char* nr_vcpus, int check_host)
{
char *endptr;
unsigned int max_vcpus, i;
libxl_bitmap cpumap;
+ int rc;
+ libxl_bitmap_init(&cpumap);
max_vcpus = strtoul(nr_vcpus, &endptr, 10);
if (nr_vcpus == endptr) {
fprintf(stderr, "Error: Invalid argument.\n");
- return;
+ return 1;
}
/*
@@ -5028,33 +5419,46 @@ static void vcpuset(uint32_t domid, const char* nr_vcpus, int check_host)
*/
if (check_host) {
unsigned int host_cpu = libxl_get_max_cpus(ctx);
- if (max_vcpus > host_cpu) {
- fprintf(stderr, "You are overcommmitting! You have %d physical " \
- " CPUs and want %d vCPUs! Aborting, use --ignore-host to " \
+ libxl_dominfo dominfo;
+
+ rc = libxl_domain_info(ctx, &dominfo, domid);
+ if (rc)
+ return 1;
+
+ if (max_vcpus > dominfo.vcpu_online && max_vcpus > host_cpu) {
+ fprintf(stderr, "You are overcommmitting! You have %d physical" \
+ " CPUs and want %d vCPUs! Aborting, use --ignore-host to" \
" continue\n", host_cpu, max_vcpus);
- return;
+ rc = 1;
}
- /* NB: This also limits how many are set in the bitmap */
- max_vcpus = (max_vcpus > host_cpu ? host_cpu : max_vcpus);
+ libxl_dominfo_dispose(&dominfo);
+ if (rc)
+ return 1;
}
- if (libxl_cpu_bitmap_alloc(ctx, &cpumap, max_vcpus)) {
- fprintf(stderr, "libxl_cpu_bitmap_alloc failed\n");
- return;
+ rc = libxl_cpu_bitmap_alloc(ctx, &cpumap, max_vcpus);
+ if (rc) {
+ fprintf(stderr, "libxl_cpu_bitmap_alloc failed, rc: %d\n", rc);
+ return 1;
}
for (i = 0; i < max_vcpus; i++)
libxl_bitmap_set(&cpumap, i);
- if (libxl_set_vcpuonline(ctx, domid, &cpumap) < 0)
- fprintf(stderr, "libxl_set_vcpuonline failed domid=%d max_vcpus=%d\n", domid, max_vcpus);
+ rc = libxl_set_vcpuonline(ctx, domid, &cpumap);
+ if (rc == ERROR_DOMAIN_NOTFOUND)
+ fprintf(stderr, "Domain %u does not exist.\n", domid);
+ else if (rc)
+ fprintf(stderr, "libxl_set_vcpuonline failed domid=%d max_vcpus=%d," \
+ " rc: %d\n", domid, max_vcpus, rc);
libxl_bitmap_dispose(&cpumap);
+ return rc ? 1 : 0;
}
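The reworked check only rejects an increase past both the domain's current online vCPU count and the host CPU count, so shrinking an already-overcommitted domain no longer needs --ignore-host. With illustrative numbers:

    /* host_cpu == 4, dominfo.vcpu_online == 8 (already overcommitted):
     *   vcpuset(dom, "6",  1) -> allowed: 6 <= 8, a reduction
     *   vcpuset(dom, "12", 1) -> rejected: 12 > 8 and 12 > 4
     */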
int main_vcpuset(int argc, char **argv)
{
static struct option opts[] = {
{"ignore-host", 0, 0, 'i'},
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
int opt, check_host = 1;
@@ -5066,8 +5470,7 @@ int main_vcpuset(int argc, char **argv)
break;
}
- vcpuset(find_domain(argv[optind]), argv[optind + 1], check_host);
- return 0;
+ return vcpuset(find_domain(argv[optind]), argv[optind + 1], check_host);
}
static void output_xeninfo(void)
@@ -5196,12 +5599,15 @@ static void output_numainfo(void)
static void output_topologyinfo(void)
{
- libxl_cputopology *info;
+ libxl_cputopology *cpuinfo;
int i, nr;
+ libxl_pcitopology *pciinfo;
+ int valid_devs = 0;
- info = libxl_get_cpu_topology(ctx, &nr);
- if (info == NULL) {
- fprintf(stderr, "libxl_get_topologyinfo failed.\n");
+
+ cpuinfo = libxl_get_cpu_topology(ctx, &nr);
+ if (cpuinfo == NULL) {
+ fprintf(stderr, "libxl_get_cpu_topology failed.\n");
return;
}
@@ -5209,12 +5615,35 @@ static void output_topologyinfo(void)
printf("cpu: core socket node\n");
for (i = 0; i < nr; i++) {
- if (info[i].core != LIBXL_CPUTOPOLOGY_INVALID_ENTRY)
+ if (cpuinfo[i].core != LIBXL_CPUTOPOLOGY_INVALID_ENTRY)
printf("%3d: %4d %4d %4d\n", i,
- info[i].core, info[i].socket, info[i].node);
+ cpuinfo[i].core, cpuinfo[i].socket, cpuinfo[i].node);
}
- libxl_cputopology_list_free(info, nr);
+ libxl_cputopology_list_free(cpuinfo, nr);
+
+ pciinfo = libxl_get_pci_topology(ctx, &nr);
+ if (pciinfo == NULL) {
+ fprintf(stderr, "libxl_get_pci_topology failed.\n");
+ return;
+ }
+
+ printf("device topology :\n");
+ printf("device node\n");
+ for (i = 0; i < nr; i++) {
+ if (pciinfo[i].node != LIBXL_PCITOPOLOGY_INVALID_ENTRY) {
+ printf("%04x:%02x:%02x.%01x %d\n", pciinfo[i].seg,
+ pciinfo[i].bus,
+ ((pciinfo[i].devfn >> 3) & 0x1f), (pciinfo[i].devfn & 7),
+ pciinfo[i].node);
+ valid_devs++;
+ }
+ }
+
+ if (valid_devs == 0)
+ printf("No device topology data available\n");
+
+ libxl_pcitopology_list_free(pciinfo, nr);
return;
}
@@ -5241,12 +5670,11 @@ int main_info(int argc, char **argv)
int opt;
static struct option opts[] = {
{"numa", 0, 0, 'n'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
int numa = 0;
- SWITCH_FOREACH_OPT(opt, "hn", opts, "info", 0) {
+ SWITCH_FOREACH_OPT(opt, "n", opts, "info", 0) {
case 'n':
numa = 1;
break;
@@ -5298,7 +5726,7 @@ int main_sharing(int argc, char **argv)
} else if (optind == argc-1) {
uint32_t domid = find_domain(argv[optind]);
rc = libxl_domain_info(ctx, &info_buf, domid);
- if (rc == ERROR_INVAL) {
+ if (rc == ERROR_DOMAIN_NOTFOUND) {
fprintf(stderr, "Error: Domain \'%s\' does not exist.\n",
argv[optind]);
return -rc;
@@ -5387,6 +5815,8 @@ static int sched_credit_domain_output(int domid)
printf("%-33s %4s %6s %4s\n", "Name", "ID", "Weight", "Cap");
return 0;
}
+
+ libxl_domain_sched_params_init(&scinfo);
rc = sched_domain_get(LIBXL_SCHEDULER_CREDIT, domid, &scinfo);
if (rc)
return rc;
@@ -5433,6 +5863,8 @@ static int sched_credit2_domain_output(
printf("%-33s %4s %6s\n", "Name", "ID", "Weight");
return 0;
}
+
+ libxl_domain_sched_params_init(&scinfo);
rc = sched_domain_get(LIBXL_SCHEDULER_CREDIT2, domid, &scinfo);
if (rc)
return rc;
@@ -5446,35 +5878,6 @@ static int sched_credit2_domain_output(
return 0;
}
-static int sched_sedf_domain_output(
- int domid)
-{
- char *domname;
- libxl_domain_sched_params scinfo;
- int rc;
-
- if (domid < 0) {
- printf("%-33s %4s %6s %-6s %7s %5s %6s\n", "Name", "ID", "Period",
- "Slice", "Latency", "Extra", "Weight");
- return 0;
- }
- rc = sched_domain_get(LIBXL_SCHEDULER_SEDF, domid, &scinfo);
- if (rc)
- return rc;
- domname = libxl_domid_to_name(ctx, domid);
- printf("%-33s %4d %6d %6d %7d %5d %6d\n",
- domname,
- domid,
- scinfo.period,
- scinfo.slice,
- scinfo.latency,
- scinfo.extratime,
- scinfo.weight);
- free(domname);
- libxl_domain_sched_params_dispose(&scinfo);
- return 0;
-}
-
static int sched_rtds_domain_output(
int domid)
{
@@ -5605,11 +6008,10 @@ int main_sched_credit(int argc, char **argv)
{"tslice_ms", 1, 0, 't'},
{"ratelimit_us", 1, 0, 'r'},
{"cpupool", 1, 0, 'p'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
- SWITCH_FOREACH_OPT(opt, "d:w:c:p:t:r:hs", opts, "sched-credit", 0) {
+ SWITCH_FOREACH_OPT(opt, "d:w:c:p:t:r:s", opts, "sched-credit", 0) {
case 'd':
dom = optarg;
break;
@@ -5721,11 +6123,10 @@ int main_sched_credit2(int argc, char **argv)
{"domain", 1, 0, 'd'},
{"weight", 1, 0, 'w'},
{"cpupool", 1, 0, 'p'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
- SWITCH_FOREACH_OPT(opt, "d:w:p:h", opts, "sched-credit2", 0) {
+ SWITCH_FOREACH_OPT(opt, "d:w:p:", opts, "sched-credit2", 0) {
case 'd':
dom = optarg;
break;
@@ -5775,114 +6176,6 @@ int main_sched_credit2(int argc, char **argv)
return 0;
}
-int main_sched_sedf(int argc, char **argv)
-{
- const char *dom = NULL;
- const char *cpupool = NULL;
- int period = 0, opt_p = 0;
- int slice = 0, opt_s = 0;
- int latency = 0, opt_l = 0;
- int extra = 0, opt_e = 0;
- int weight = 0, opt_w = 0;
- int opt, rc;
- static struct option opts[] = {
- {"period", 1, 0, 'p'},
- {"slice", 1, 0, 's'},
- {"latency", 1, 0, 'l'},
- {"extra", 1, 0, 'e'},
- {"weight", 1, 0, 'w'},
- {"cpupool", 1, 0, 'c'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
- };
-
- SWITCH_FOREACH_OPT(opt, "d:p:s:l:e:w:c:h", opts, "sched-sedf", 0) {
- case 'd':
- dom = optarg;
- break;
- case 'p':
- period = strtol(optarg, NULL, 10);
- opt_p = 1;
- break;
- case 's':
- slice = strtol(optarg, NULL, 10);
- opt_s = 1;
- break;
- case 'l':
- latency = strtol(optarg, NULL, 10);
- opt_l = 1;
- break;
- case 'e':
- extra = strtol(optarg, NULL, 10);
- opt_e = 1;
- break;
- case 'w':
- weight = strtol(optarg, NULL, 10);
- opt_w = 1;
- break;
- case 'c':
- cpupool = optarg;
- break;
- }
-
- if (cpupool && (dom || opt_p || opt_s || opt_l || opt_e || opt_w)) {
- fprintf(stderr, "Specifying a cpupool is not allowed with other "
- "options.\n");
- return 1;
- }
- if (!dom && (opt_p || opt_s || opt_l || opt_e || opt_w)) {
- fprintf(stderr, "Must specify a domain.\n");
- return 1;
- }
- if (opt_w && (opt_p || opt_s)) {
- fprintf(stderr, "Specifying a weight AND period or slice is not "
- "allowed.\n");
- }
-
- if (!dom) { /* list all domain's credit scheduler info */
- return -sched_domain_output(LIBXL_SCHEDULER_SEDF,
- sched_sedf_domain_output,
- sched_default_pool_output,
- cpupool);
- } else {
- uint32_t domid = find_domain(dom);
-
- if (!opt_p && !opt_s && !opt_l && !opt_e && !opt_w) {
- /* output sedf scheduler info */
- sched_sedf_domain_output(-1);
- return -sched_sedf_domain_output(domid);
- } else { /* set sedf scheduler paramaters */
- libxl_domain_sched_params scinfo;
- libxl_domain_sched_params_init(&scinfo);
- scinfo.sched = LIBXL_SCHEDULER_SEDF;
-
- if (opt_p) {
- scinfo.period = period;
- scinfo.weight = 0;
- }
- if (opt_s) {
- scinfo.slice = slice;
- scinfo.weight = 0;
- }
- if (opt_l)
- scinfo.latency = latency;
- if (opt_e)
- scinfo.extratime = extra;
- if (opt_w) {
- scinfo.weight = weight;
- scinfo.period = 0;
- scinfo.slice = 0;
- }
- rc = sched_domain_set(domid, &scinfo);
- libxl_domain_sched_params_dispose(&scinfo);
- if (rc)
- return -rc;
- }
- }
-
- return 0;
-}
-
/*
* <nothing> : List all domain paramters and sched params
* -d [domid] : List domain params for domain
@@ -5902,11 +6195,10 @@ int main_sched_rtds(int argc, char **argv)
{"period", 1, 0, 'p'},
{"budget", 1, 0, 'b'},
{"cpupool", 1, 0, 'c'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
- SWITCH_FOREACH_OPT(opt, "d:p:b:c:h", opts, "sched-rtds", 0) {
+ SWITCH_FOREACH_OPT(opt, "d:p:b:c:", opts, "sched-rtds", 0) {
case 'd':
dom = optarg;
break;
@@ -6159,20 +6451,11 @@ int main_networkattach(int argc, char **argv)
int opt;
libxl_device_nic nic;
XLU_Config *config = 0;
- char *endptr, *oparg;
- const char *tok;
- int i;
- unsigned int val;
SWITCH_FOREACH_OPT(opt, "", NULL, "network-attach", 1) {
/* No options */
}
- if (argc-optind > 11) {
- help("network-attach");
- return 0;
- }
-
domid = find_domain(argv[optind]);
config= xlu_cfg_init(stderr, "command line");
@@ -6185,50 +6468,8 @@ int main_networkattach(int argc, char **argv)
set_default_nic_values(&nic);
for (argv += optind+1, argc -= optind+1; argc > 0; ++argv, --argc) {
- if (MATCH_OPTION("type", *argv, oparg)) {
- if (!strcmp("vif", oparg)) {
- nic.nictype = LIBXL_NIC_TYPE_VIF;
- } else if (!strcmp("ioemu", oparg)) {
- nic.nictype = LIBXL_NIC_TYPE_VIF_IOEMU;
- } else {
- fprintf(stderr, "Invalid parameter `type'.\n");
- return 1;
- }
- } else if (MATCH_OPTION("mac", *argv, oparg)) {
- tok = strtok(oparg, ":");
- for (i = 0; tok && i < 6; tok = strtok(NULL, ":"), ++i) {
- val = strtoul(tok, &endptr, 16);
- if ((tok == endptr) || (val > 255)) {
- fprintf(stderr, "Invalid parameter `mac'.\n");
- return 1;
- }
- nic.mac[i] = val;
- }
- } else if (MATCH_OPTION("bridge", *argv, oparg)) {
- replace_string(&nic.bridge, oparg);
- } else if (MATCH_OPTION("netdev", *argv, oparg)) {
- fprintf(stderr, "the netdev parameter is deprecated, "
- "please use gatewaydev instead\n");
- replace_string(&nic.gatewaydev, oparg);
- } else if (MATCH_OPTION("gatewaydev", *argv, oparg)) {
- replace_string(&nic.gatewaydev, oparg);
- } else if (MATCH_OPTION("ip", *argv, oparg)) {
- replace_string(&nic.ip, oparg);
- } else if (MATCH_OPTION("script", *argv, oparg)) {
- replace_string(&nic.script, oparg);
- } else if (MATCH_OPTION("backend", *argv, oparg)) {
- replace_string(&nic.backend_domname, oparg);
- } else if (MATCH_OPTION("vifname", *argv, oparg)) {
- replace_string(&nic.ifname, oparg);
- } else if (MATCH_OPTION("model", *argv, oparg)) {
- replace_string(&nic.model, oparg);
- } else if (MATCH_OPTION("rate", *argv, oparg)) {
- parse_vif_rate(&config, oparg, &nic);
- } else if (MATCH_OPTION("accel", *argv, oparg)) {
- } else {
- fprintf(stderr, "unrecognized argument `%s'\n", *argv);
+ if (parse_nic_config(&nic, &config, *argv))
return 1;
- }
}
if (dryrun_only) {
@@ -6588,7 +6829,6 @@ static char *uptime_to_string(unsigned long uptime, int short_mode)
{
int sec, min, hour, day;
char *time_string;
- int ret;
day = (int)(uptime / 86400);
uptime -= (day * 86400);
@@ -6600,21 +6840,19 @@ static char *uptime_to_string(unsigned long uptime, int short_mode)
if (short_mode)
if (day > 1)
- ret = asprintf(&time_string, "%d days, %2d:%02d", day, hour, min);
+ xasprintf(&time_string, "%d days, %2d:%02d", day, hour, min);
else if (day == 1)
- ret = asprintf(&time_string, "%d day, %2d:%02d", day, hour, min);
+ xasprintf(&time_string, "%d day, %2d:%02d", day, hour, min);
else
- ret = asprintf(&time_string, "%2d:%02d", hour, min);
+ xasprintf(&time_string, "%2d:%02d", hour, min);
else
if (day > 1)
- ret = asprintf(&time_string, "%d days, %2d:%02d:%02d", day, hour, min, sec);
+ xasprintf(&time_string, "%d days, %2d:%02d:%02d", day, hour, min, sec);
else if (day == 1)
- ret = asprintf(&time_string, "%d day, %2d:%02d:%02d", day, hour, min, sec);
+ xasprintf(&time_string, "%d day, %2d:%02d:%02d", day, hour, min, sec);
else
- ret = asprintf(&time_string, "%2d:%02d:%02d", hour, min, sec);
+ xasprintf(&time_string, "%2d:%02d:%02d", hour, min, sec);
- if (ret < 0)
- return NULL;
return time_string;
}
@@ -6637,8 +6875,8 @@ int main_claims(int argc, char **argv)
return 1;
}
- list_domains(0 /* verbose */, 0 /* context */, 1 /* claim */,
- 0 /* numa */, info, nb_domain);
+ list_domains(false /* verbose */, false /* context */, true /* claim */,
+ false /* numa */, false /* cpupool */, info, nb_domain);
libxl_dominfo_list_free(info, nb_domain);
return 0;
@@ -7020,13 +7258,12 @@ int main_cpupoolcreate(int argc, char **argv)
{
const char *filename = NULL, *config_src=NULL;
const char *p;
- char extra_config[1024];
+ char *extra_config = NULL;
int opt;
static struct option opts[] = {
{"defconfig", 1, 0, 'f'},
{"dryrun", 0, 0, 'n'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
int ret;
char *config_data = 0;
@@ -7043,9 +7280,9 @@ int main_cpupoolcreate(int argc, char **argv)
libxl_bitmap cpumap;
libxl_uuid uuid;
libxl_cputopology *topology;
- int rc = -ERROR_FAIL;
+ int rc = 1;
- SWITCH_FOREACH_OPT(opt, "hnf:", opts, "cpupool-create", 0) {
+ SWITCH_FOREACH_OPT(opt, "nf:", opts, "cpupool-create", 0) {
case 'f':
filename = optarg;
break;
@@ -7054,13 +7291,13 @@ int main_cpupoolcreate(int argc, char **argv)
break;
}
- memset(extra_config, 0, sizeof(extra_config));
+ libxl_bitmap_init(&freemap);
+ libxl_bitmap_init(&cpumap);
+
while (optind < argc) {
if ((p = strchr(argv[optind], '='))) {
- if (strlen(extra_config) + 1 + strlen(argv[optind]) < sizeof(extra_config)) {
- strcat(extra_config, "\n");
- strcat(extra_config, argv[optind]);
- }
+ string_realloc_append(&extra_config, "\n");
+ string_realloc_append(&extra_config, argv[optind]);
} else if (!filename) {
filename = argv[optind];
} else {
@@ -7083,9 +7320,9 @@ int main_cpupoolcreate(int argc, char **argv)
else
config_src="command line";
- if (strlen(extra_config)) {
+ if (extra_config && strlen(extra_config)) {
if (config_len > INT_MAX - (strlen(extra_config) + 2)) {
- fprintf(stderr, "Failed to attach extra configration\n");
+ fprintf(stderr, "Failed to attach extra configuration\n");
goto out;
}
config_data = xrealloc(config_data,
@@ -7172,18 +7409,29 @@ int main_cpupoolcreate(int argc, char **argv)
fprintf(stderr, "no free cpu found\n");
goto out_cfg;
}
- } else if (!xlu_cfg_get_list(config, "cpus", &cpus, 0, 0)) {
+ } else if (!xlu_cfg_get_list(config, "cpus", &cpus, 0, 1)) {
n_cpus = 0;
while ((buf = xlu_cfg_get_listitem(cpus, n_cpus)) != NULL) {
i = atoi(buf);
- if ((i < 0) || (i >= freemap.size * 8) ||
- !libxl_bitmap_test(&freemap, i)) {
+ if ((i < 0) || !libxl_bitmap_test(&freemap, i)) {
fprintf(stderr, "cpu %d illegal or not free\n", i);
goto out_cfg;
}
libxl_bitmap_set(&cpumap, i);
n_cpus++;
}
+ } else if (!xlu_cfg_get_string(config, "cpus", &buf, 0)) {
+ if (cpurange_parse(buf, &cpumap))
+ goto out_cfg;
+
+ n_cpus = 0;
+ libxl_for_each_set_bit(i, cpumap) {
+ if (!libxl_bitmap_test(&freemap, i)) {
+ fprintf(stderr, "cpu %d illegal or not free\n", i);
+ goto out_cfg;
+ }
+ n_cpus++;
+ }
} else
n_cpus = 0;
@@ -7207,8 +7455,11 @@ int main_cpupoolcreate(int argc, char **argv)
out_cfg:
xlu_cfg_destroy(config);
out:
+ libxl_bitmap_dispose(&freemap);
+ libxl_bitmap_dispose(&cpumap);
free(name);
free(config_data);
+ free(extra_config);
return rc;
}
@@ -7217,8 +7468,7 @@ int main_cpupoollist(int argc, char **argv)
int opt;
static struct option opts[] = {
{"cpus", 0, 0, 'c'},
- COMMON_LONG_OPTS,
- {0, 0, 0, 0}
+ COMMON_LONG_OPTS
};
int opt_cpus = 0;
const char *pool = NULL;
@@ -7226,9 +7476,8 @@ int main_cpupoollist(int argc, char **argv)
int n_pools, p, c, n;
uint32_t poolid;
char *name;
- int ret = 0;
- SWITCH_FOREACH_OPT(opt, "hc", opts, "cpupool-list", 0) {
+ SWITCH_FOREACH_OPT(opt, "c", opts, "cpupool-list", 0) {
case 'c':
opt_cpus = 1;
break;
@@ -7238,14 +7487,14 @@ int main_cpupoollist(int argc, char **argv)
pool = argv[optind];
if (libxl_name_to_cpupoolid(ctx, pool, &poolid)) {
fprintf(stderr, "Pool \'%s\' does not exist\n", pool);
- return -ERROR_FAIL;
+ return 1;
}
}
poolinfo = libxl_list_cpupool(ctx, &n_pools);
if (!poolinfo) {
fprintf(stderr, "error getting cpupool info\n");
- return -ERROR_NOMEM;
+ return 1;
}
printf("%-19s", "Name");
@@ -7255,7 +7504,7 @@ int main_cpupoollist(int argc, char **argv)
printf("CPUs Sched Active Domain count\n");
for (p = 0; p < n_pools; p++) {
- if (!ret && (!pool || (poolinfo[p].poolid == poolid))) {
+ if (!pool || (poolinfo[p].poolid == poolid)) {
name = poolinfo[p].pool_name;
printf("%-19s", name);
n = 0;
@@ -7276,7 +7525,7 @@ int main_cpupoollist(int argc, char **argv)
libxl_cpupoolinfo_list_free(poolinfo, n_pools);
- return ret;
+ return 0;
}
int main_cpupooldestroy(int argc, char **argv)
@@ -7293,11 +7542,14 @@ int main_cpupooldestroy(int argc, char **argv)
if (libxl_cpupool_qualifier_to_cpupoolid(ctx, pool, &poolid, NULL) ||
!libxl_cpupoolid_is_valid(ctx, poolid)) {
- fprintf(stderr, "unknown cpupool \'%s\'\n", pool);
- return -ERROR_FAIL;
+ fprintf(stderr, "unknown cpupool '%s'\n", pool);
+ return 1;
}
- return -libxl_cpupool_destroy(ctx, poolid);
+ if (libxl_cpupool_destroy(ctx, poolid))
+ return 1;
+
+ return 0;
}
int main_cpupoolrename(int argc, char **argv)
@@ -7315,14 +7567,14 @@ int main_cpupoolrename(int argc, char **argv)
if (libxl_cpupool_qualifier_to_cpupoolid(ctx, pool, &poolid, NULL) ||
!libxl_cpupoolid_is_valid(ctx, poolid)) {
- fprintf(stderr, "unknown cpupool \'%s\'\n", pool);
- return -ERROR_FAIL;
+ fprintf(stderr, "unknown cpupool '%s'\n", pool);
+ return 1;
}
new_name = argv[optind];
if (libxl_cpupool_rename(ctx, new_name, poolid)) {
- fprintf(stderr, "Can't rename cpupool '%s'.\n", pool);
+ fprintf(stderr, "Can't rename cpupool '%s'\n", pool);
return 1;
}
@@ -7334,44 +7586,37 @@ int main_cpupoolcpuadd(int argc, char **argv)
int opt;
const char *pool;
uint32_t poolid;
- int cpu;
- int node;
- int n;
+ libxl_bitmap cpumap;
+ int rc = 1;
SWITCH_FOREACH_OPT(opt, "", NULL, "cpupool-cpu-add", 2) {
/* No options */
}
- pool = argv[optind++];
- node = -1;
- cpu = -1;
- if (strncmp(argv[optind], "node:", 5) == 0) {
- node = atoi(argv[optind] + 5);
- } else {
- cpu = atoi(argv[optind]);
+ libxl_bitmap_init(&cpumap);
+ if (libxl_cpu_bitmap_alloc(ctx, &cpumap, 0)) {
+ fprintf(stderr, "Unable to allocate cpumap");
+ return 1;
}
+ pool = argv[optind++];
+ if (cpurange_parse(argv[optind], &cpumap))
+ goto out;
+
if (libxl_cpupool_qualifier_to_cpupoolid(ctx, pool, &poolid, NULL) ||
!libxl_cpupoolid_is_valid(ctx, poolid)) {
fprintf(stderr, "unknown cpupool \'%s\'\n", pool);
- return -ERROR_FAIL;
- }
-
- if (cpu >= 0) {
- return -libxl_cpupool_cpuadd(ctx, poolid, cpu);
+ goto out;
}
- if (libxl_cpupool_cpuadd_node(ctx, poolid, node, &n)) {
- fprintf(stderr, "libxl_cpupool_cpuadd_node failed\n");
- return -ERROR_FAIL;
- }
+ if (libxl_cpupool_cpuadd_cpumap(ctx, poolid, &cpumap))
+ fprintf(stderr, "some cpus may not have been added to %s\n", pool);
- if (n > 0) {
- return 0;
- }
+ rc = 0;
- fprintf(stderr, "no free cpu found\n");
- return -ERROR_FAIL;
+out:
+ libxl_bitmap_dispose(&cpumap);
+ return rc;
}
int main_cpupoolcpuremove(int argc, char **argv)
@@ -7379,44 +7624,37 @@ int main_cpupoolcpuremove(int argc, char **argv)
int opt;
const char *pool;
uint32_t poolid;
- int cpu;
- int node;
- int n;
+ libxl_bitmap cpumap;
+ int rc = 1;
+
+ libxl_bitmap_init(&cpumap);
+ if (libxl_cpu_bitmap_alloc(ctx, &cpumap, 0)) {
+ fprintf(stderr, "Unable to allocate cpumap");
+ return 1;
+ }
SWITCH_FOREACH_OPT(opt, "", NULL, "cpupool-cpu-remove", 2) {
/* No options */
}
pool = argv[optind++];
- node = -1;
- cpu = -1;
- if (strncmp(argv[optind], "node:", 5) == 0) {
- node = atoi(argv[optind] + 5);
- } else {
- cpu = atoi(argv[optind]);
- }
+ if (cpurange_parse(argv[optind], &cpumap))
+ goto out;
if (libxl_cpupool_qualifier_to_cpupoolid(ctx, pool, &poolid, NULL) ||
!libxl_cpupoolid_is_valid(ctx, poolid)) {
fprintf(stderr, "unknown cpupool \'%s\'\n", pool);
- return -ERROR_FAIL;
+ goto out;
}
- if (cpu >= 0) {
- return -libxl_cpupool_cpuremove(ctx, poolid, cpu);
- }
+ if (libxl_cpupool_cpuremove_cpumap(ctx, poolid, &cpumap))
+ fprintf(stderr, "some cpus may not have been removed from %s\n", pool);
- if (libxl_cpupool_cpuremove_node(ctx, poolid, node, &n)) {
- fprintf(stderr, "libxl_cpupool_cpuremove_node failed\n");
- return -ERROR_FAIL;
- }
-
- if (n == 0) {
- fprintf(stderr, "no cpu of node found in cpupool\n");
- return -ERROR_FAIL;
- }
+ rc = 0;
- return 0;
+out:
+ libxl_bitmap_dispose(&cpumap);
+ return rc;
}
int main_cpupoolmigrate(int argc, char **argv)
@@ -7436,22 +7674,25 @@ int main_cpupoolmigrate(int argc, char **argv)
if (libxl_domain_qualifier_to_domid(ctx, dom, &domid) ||
!libxl_domid_to_name(ctx, domid)) {
- fprintf(stderr, "unknown domain \'%s\'\n", dom);
- return -ERROR_FAIL;
+ fprintf(stderr, "unknown domain '%s'\n", dom);
+ return 1;
}
if (libxl_cpupool_qualifier_to_cpupoolid(ctx, pool, &poolid, NULL) ||
!libxl_cpupoolid_is_valid(ctx, poolid)) {
- fprintf(stderr, "unknown cpupool \'%s\'\n", pool);
- return -ERROR_FAIL;
+ fprintf(stderr, "unknown cpupool '%s'\n", pool);
+ return 1;
}
- return -libxl_cpupool_movedomain(ctx, poolid, domid);
+ if (libxl_cpupool_movedomain(ctx, poolid, domid))
+ return 1;
+
+ return 0;
}
int main_cpupoolnumasplit(int argc, char **argv)
{
- int ret;
+ int rc;
int opt;
int p;
int c;
@@ -7461,7 +7702,7 @@ int main_cpupoolnumasplit(int argc, char **argv)
int n_pools;
int node;
int n_cpus;
- char name[16];
+ char *name = NULL;
libxl_uuid uuid;
libxl_bitmap cpumap;
libxl_cpupoolinfo *poolinfo;
@@ -7472,33 +7713,34 @@ int main_cpupoolnumasplit(int argc, char **argv)
/* No options */
}
- ret = 0;
+ libxl_dominfo_init(&info);
+ rc = 1;
+
+ libxl_bitmap_init(&cpumap);
poolinfo = libxl_list_cpupool(ctx, &n_pools);
if (!poolinfo) {
fprintf(stderr, "error getting cpupool info\n");
- return -ERROR_NOMEM;
+ return 1;
}
poolid = poolinfo[0].poolid;
sched = poolinfo[0].sched;
- for (p = 0; p < n_pools; p++) {
- libxl_cpupoolinfo_dispose(poolinfo + p);
- }
+ libxl_cpupoolinfo_list_free(poolinfo, n_pools);
+
if (n_pools > 1) {
fprintf(stderr, "splitting not possible, already cpupools in use\n");
- return -ERROR_FAIL;
+ return 1;
}
topology = libxl_get_cpu_topology(ctx, &n_cpus);
if (topology == NULL) {
fprintf(stderr, "libxl_get_topologyinfo failed\n");
- return -ERROR_FAIL;
+ return 1;
}
if (libxl_cpu_bitmap_alloc(ctx, &cpumap, 0)) {
fprintf(stderr, "Failed to allocate cpumap\n");
- libxl_cputopology_list_free(topology, n_cpus);
- return -ERROR_FAIL;
+ goto out;
}
/* Reset Pool-0 to 1st node: first add cpus, then remove cpus to avoid
@@ -7507,12 +7749,11 @@ int main_cpupoolnumasplit(int argc, char **argv)
node = topology[0].node;
if (libxl_cpupool_cpuadd_node(ctx, 0, node, &n)) {
fprintf(stderr, "error on adding cpu to Pool 0\n");
- return -ERROR_FAIL;
+ goto out;
}
- snprintf(name, 15, "Pool-node%d", node);
- ret = -libxl_cpupool_rename(ctx, name, 0);
- if (ret) {
+ xasprintf(&name, "Pool-node%d", node);
+ if (libxl_cpupool_rename(ctx, name, 0)) {
fprintf(stderr, "error on renaming Pool 0\n");
goto out;
}
@@ -7530,6 +7771,12 @@ int main_cpupoolnumasplit(int argc, char **argv)
goto out;
}
for (c = 0; c < 10; c++) {
+ /* We've called libxl_dominfo_init before the loop and will
+ * call libxl_dominfo_dispose after the loop when we're done
+ * with info.
+ */
+ libxl_dominfo_dispose(&info);
+ libxl_dominfo_init(&info);
if (libxl_domain_info(ctx, &info, 0)) {
fprintf(stderr, "error on getting info for Domain-0\n");
goto out;
@@ -7551,23 +7798,21 @@ int main_cpupoolnumasplit(int argc, char **argv)
}
node = topology[c].node;
- ret = -libxl_cpupool_cpuremove_node(ctx, 0, node, &n);
- if (ret) {
+ if (libxl_cpupool_cpuremove_node(ctx, 0, node, &n)) {
fprintf(stderr, "error on removing cpu from Pool 0\n");
goto out;
}
- snprintf(name, 15, "Pool-node%d", node);
+ free(name);
+ xasprintf(&name, "Pool-node%d", node);
libxl_uuid_generate(&uuid);
poolid = 0;
- ret = -libxl_cpupool_create(ctx, name, sched, cpumap, &uuid, &poolid);
- if (ret) {
+ if (libxl_cpupool_create(ctx, name, sched, cpumap, &uuid, &poolid)) {
fprintf(stderr, "error on creating cpupool\n");
goto out;
}
- ret = -libxl_cpupool_cpuadd_node(ctx, poolid, node, &n);
- if (ret) {
+ if (libxl_cpupool_cpuadd_node(ctx, poolid, node, &n)) {
fprintf(stderr, "error on adding cpus to cpupool\n");
goto out;
}
@@ -7579,11 +7824,15 @@ int main_cpupoolnumasplit(int argc, char **argv)
}
}
+ rc = 0;
+
out:
libxl_cputopology_list_free(topology, n_cpus);
libxl_bitmap_dispose(&cpumap);
+ libxl_dominfo_dispose(&info);
+ free(name);
- return ret;
+ return rc;
}
int main_getenforce(int argc, char **argv)
@@ -7608,7 +7857,7 @@ int main_getenforce(int argc, char **argv)
int main_setenforce(int argc, char **argv)
{
- int ret, mode = -1;
+ int ret, mode;
const char *p = NULL;
if (optind >= argc) {
@@ -7647,7 +7896,7 @@ int main_setenforce(int argc, char **argv)
int main_loadpolicy(int argc, char **argv)
{
const char *polFName;
- int polFd = 0;
+ int polFd = -1;
void *polMemCp = NULL;
struct stat info;
int ret;
@@ -7659,7 +7908,7 @@ int main_loadpolicy(int argc, char **argv)
polFName = argv[optind];
polFd = open(polFName, O_RDONLY);
- if ( polFd < 0 ) {
+ if (polFd < 0) {
fprintf(stderr, "Error occurred opening policy file '%s': %s\n",
polFName, strerror(errno));
ret = -1;
@@ -7667,7 +7916,7 @@ int main_loadpolicy(int argc, char **argv)
}
ret = stat(polFName, &info);
- if ( ret < 0 ) {
+ if (ret < 0) {
fprintf(stderr, "Error occurred retrieving information about"
"policy file '%s': %s\n", polFName, strerror(errno));
goto done;
@@ -7699,7 +7948,7 @@ int main_loadpolicy(int argc, char **argv)
done:
free(polMemCp);
- if ( polFd > 0 )
+ if (polFd >= 0)
close(polFd);
return ret;
@@ -7770,10 +8019,9 @@ int main_remus(int argc, char **argv)
if (!ssh_command[0]) {
rune = host;
} else {
- if (asprintf(&rune, "exec %s %s xl migrate-receive -r %s",
- ssh_command, host,
- daemonize ? "" : " -e") < 0)
- return 1;
+ xasprintf(&rune, "exec %s %s xl migrate-receive -r %s",
+ ssh_command, host,
+ daemonize ? "" : " -e");
}
save_domain_core_begin(domid, NULL, &config_data, &config_len);
@@ -7824,15 +8072,24 @@ int main_remus(int argc, char **argv)
int main_devd(int argc, char **argv)
{
int ret = 0, opt = 0, daemonize = 1;
+ const char *pidfile = NULL;
+ static const struct option opts[] = {
+ {"pidfile", 1, 0, 'p'},
+ COMMON_LONG_OPTS,
+ {0, 0, 0, 0}
+ };
- SWITCH_FOREACH_OPT(opt, "F", NULL, "devd", 0) {
+ SWITCH_FOREACH_OPT(opt, "Fp:", opts, "devd", 0) {
case 'F':
daemonize = 0;
break;
+ case 'p':
+ pidfile = optarg;
+ break;
}
if (daemonize) {
- ret = do_daemonize("xldevd");
+ ret = do_daemonize("xldevd", pidfile);
if (ret) {
ret = (ret == 1) ? 0 : ret;
goto out;
@@ -7846,12 +8103,91 @@ out:
}
#ifdef LIBXL_HAVE_PSR_CMT
-static void psr_cmt_print_domain_cache_occupancy(libxl_dominfo *dominfo,
- uint32_t nr_sockets)
+static int psr_cmt_hwinfo(void)
+{
+ int rc;
+ int enabled;
+ uint32_t total_rmid;
+
+ printf("Cache Monitoring Technology (CMT):\n");
+
+ enabled = libxl_psr_cmt_enabled(ctx);
+ printf("%-16s: %s\n", "Enabled", enabled ? "1" : "0");
+ if (!enabled)
+ return 0;
+
+ rc = libxl_psr_cmt_get_total_rmid(ctx, &total_rmid);
+ if (rc) {
+ fprintf(stderr, "Failed to get max RMID value\n");
+ return rc;
+ }
+ printf("%-16s: %u\n", "Total RMID", total_rmid);
+
+ printf("Supported monitor types:\n");
+ if (libxl_psr_cmt_type_supported(ctx, LIBXL_PSR_CMT_TYPE_CACHE_OCCUPANCY))
+ printf("cache-occupancy\n");
+ if (libxl_psr_cmt_type_supported(ctx, LIBXL_PSR_CMT_TYPE_TOTAL_MEM_COUNT))
+ printf("total-mem-bandwidth\n");
+ if (libxl_psr_cmt_type_supported(ctx, LIBXL_PSR_CMT_TYPE_LOCAL_MEM_COUNT))
+ printf("local-mem-bandwidth\n");
+
+ return rc;
+}
+
+#define MBM_SAMPLE_RETRY_MAX 4
+static int psr_cmt_get_mem_bandwidth(uint32_t domid,
+ libxl_psr_cmt_type type,
+ uint32_t socketid,
+ uint64_t *bandwidth_r)
+{
+ uint64_t sample1, sample2;
+ uint64_t tsc1, tsc2;
+ int retry_attempts = 0;
+ int rc;
+
+ while (1) {
+ rc = libxl_psr_cmt_get_sample(ctx, domid, type, socketid,
+ &sample1, &tsc1);
+ if (rc < 0)
+ return rc;
+
+ usleep(10000);
+
+ rc = libxl_psr_cmt_get_sample(ctx, domid, type, socketid,
+ &sample2, &tsc2);
+ if (rc < 0)
+ return rc;
+
+ if (tsc2 <= tsc1)
+ return -1;
+
+ /*
+ * Hardware guarantees at most 1 overflow can happen if the duration
+ * between two samples is less than 1 second. Note that the tsc value
+ * returned from the hypervisor is already-scaled time (ns).
+ */
+ if (tsc2 - tsc1 < 1000000000 && sample2 >= sample1)
+ break;
+
+ if (retry_attempts < MBM_SAMPLE_RETRY_MAX) {
+ retry_attempts++;
+ } else {
+ fprintf(stderr, "event counter overflowed\n");
+ return -1;
+ }
+ }
+
+ *bandwidth_r = (sample2 - sample1) * 1000000000 / (tsc2 - tsc1) / 1024;
+ return 0;
+}
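A worked check of the scaling (assuming the samples are byte counts, which the final /1024 KB conversion implies): 10 MiB counted over 10 ms of scaled time is 1024000 KB/s, i.e. 1000 MiB/s.

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-alone check of the bandwidth arithmetic above. */
    int main(void)
    {
        uint64_t sample1 = 0, sample2 = 10ull << 20;    /* bytes */
        uint64_t tsc1 = 0, tsc2 = 10ull * 1000 * 1000;  /* ns */
        uint64_t kbps =
            (sample2 - sample1) * 1000000000 / (tsc2 - tsc1) / 1024;

        printf("%llu KB/s\n", (unsigned long long)kbps);  /* 1024000 */
        return 0;
    }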
+
+static void psr_cmt_print_domain_info(libxl_dominfo *dominfo,
+ libxl_psr_cmt_type type,
+ libxl_bitmap *socketmap)
{
char *domain_name;
uint32_t socketid;
- uint32_t l3_cache_occupancy;
+ uint64_t monitor_data;
if (!libxl_psr_cmt_domain_attached(ctx, dominfo->domid))
return;
@@ -7860,20 +8196,32 @@ static void psr_cmt_print_domain_cache_occupancy(libxl_dominfo *dominfo,
printf("%-40s %5d", domain_name, dominfo->domid);
free(domain_name);
- for (socketid = 0; socketid < nr_sockets; socketid++) {
- if ( !libxl_psr_cmt_get_cache_occupancy(ctx, dominfo->domid,
- socketid, &l3_cache_occupancy) )
- printf("%13u KB", l3_cache_occupancy);
+ libxl_for_each_set_bit(socketid, *socketmap) {
+ switch (type) {
+ case LIBXL_PSR_CMT_TYPE_CACHE_OCCUPANCY:
+ if (!libxl_psr_cmt_get_sample(ctx, dominfo->domid, type, socketid,
+ &monitor_data, NULL))
+ printf("%13"PRIu64" KB", monitor_data / 1024);
+ break;
+ case LIBXL_PSR_CMT_TYPE_TOTAL_MEM_COUNT:
+ case LIBXL_PSR_CMT_TYPE_LOCAL_MEM_COUNT:
+ if (!psr_cmt_get_mem_bandwidth(dominfo->domid, type, socketid,
+ &monitor_data))
+ printf("%11"PRIu64" KB/s", monitor_data);
+ break;
+ default:
+ return;
+ }
}
printf("\n");
}
-static int psr_cmt_show_cache_occupancy(uint32_t domid)
+static int psr_cmt_show(libxl_psr_cmt_type type, uint32_t domid)
{
- uint32_t i, socketid, nr_sockets, total_rmid;
+ uint32_t i, socketid, total_rmid;
uint32_t l3_cache_size;
- libxl_physinfo info;
+ libxl_bitmap socketmap;
int rc, nr_domains;
if (!libxl_psr_cmt_enabled(ctx)) {
@@ -7881,64 +8229,80 @@ static int psr_cmt_show_cache_occupancy(uint32_t domid)
return -1;
}
- libxl_physinfo_init(&info);
- rc = libxl_get_physinfo(ctx, &info);
- if (rc < 0) {
- fprintf(stderr, "Failed getting physinfo, rc: %d\n", rc);
- libxl_physinfo_dispose(&info);
+ if (!libxl_psr_cmt_type_supported(ctx, type)) {
+ fprintf(stderr, "Monitor type '%s' is not supported in the system\n",
+ libxl_psr_cmt_type_to_string(type));
return -1;
}
- nr_sockets = info.nr_cpus / info.threads_per_core / info.cores_per_socket;
- libxl_physinfo_dispose(&info);
+
+ libxl_bitmap_init(&socketmap);
+ libxl_socket_bitmap_alloc(ctx, &socketmap, 0);
+ rc = libxl_get_online_socketmap(ctx, &socketmap);
+ if (rc < 0) {
+ fprintf(stderr, "Failed getting available sockets, rc: %d\n", rc);
+ goto out;
+ }
rc = libxl_psr_cmt_get_total_rmid(ctx, &total_rmid);
if (rc < 0) {
fprintf(stderr, "Failed to get max RMID value\n");
- return -1;
+ goto out;
}
printf("Total RMID: %d\n", total_rmid);
/* Header */
printf("%-40s %5s", "Name", "ID");
- for (socketid = 0; socketid < nr_sockets; socketid++)
+ libxl_for_each_set_bit(socketid, socketmap)
printf("%14s %d", "Socket", socketid);
printf("\n");
- /* Total L3 cache size */
- printf("%-46s", "Total L3 Cache Size");
- for (socketid = 0; socketid < nr_sockets; socketid++) {
- rc = libxl_psr_cmt_get_l3_cache_size(ctx, socketid, &l3_cache_size);
- if (rc < 0) {
- fprintf(stderr, "Failed to get system l3 cache size for socket:%d\n",
+ if (type == LIBXL_PSR_CMT_TYPE_CACHE_OCCUPANCY) {
+ /* Total L3 cache size */
+ printf("%-46s", "Total L3 Cache Size");
+ libxl_for_each_set_bit(socketid, socketmap) {
+ rc = libxl_psr_cmt_get_l3_cache_size(ctx, socketid,
+ &l3_cache_size);
+ if (rc < 0) {
+ fprintf(stderr,
+ "Failed to get system l3 cache size for socket:%d\n",
socketid);
- return -1;
- }
- printf("%13u KB", l3_cache_size);
+ goto out;
+ }
+ printf("%13u KB", l3_cache_size);
+ }
+ printf("\n");
}
- printf("\n");
/* Each domain */
if (domid != INVALID_DOMID) {
libxl_dominfo dominfo;
+
+ libxl_dominfo_init(&dominfo);
if (libxl_domain_info(ctx, &dominfo, domid)) {
fprintf(stderr, "Failed to get domain info for %d\n", domid);
- return -1;
+ rc = -1;
+ goto out;
}
- psr_cmt_print_domain_cache_occupancy(&dominfo, nr_sockets);
+ psr_cmt_print_domain_info(&dominfo, type, &socketmap);
+ libxl_dominfo_dispose(&dominfo);
}
else
{
libxl_dominfo *list;
if (!(list = libxl_list_domain(ctx, &nr_domains))) {
fprintf(stderr, "Failed to get domain info for domain list.\n");
- return -1;
+ rc = -1;
+ goto out;
}
for (i = 0; i < nr_domains; i++)
- psr_cmt_print_domain_cache_occupancy(list + i, nr_sockets);
+ psr_cmt_print_domain_info(list + i, type, &socketmap);
libxl_dominfo_list_free(list, nr_domains);
}
- return 0;
+
+out:
+ libxl_bitmap_dispose(&socketmap);
+ return rc;
}
int main_psr_cmt_attach(int argc, char **argv)
@@ -7981,7 +8345,16 @@ int main_psr_cmt_show(int argc, char **argv)
/* No options */
}
- libxl_psr_cmt_type_from_string(argv[optind], &type);
+ if (!strcmp(argv[optind], "cache-occupancy"))
+ type = LIBXL_PSR_CMT_TYPE_CACHE_OCCUPANCY;
+ else if (!strcmp(argv[optind], "total-mem-bandwidth"))
+ type = LIBXL_PSR_CMT_TYPE_TOTAL_MEM_COUNT;
+ else if (!strcmp(argv[optind], "local-mem-bandwidth"))
+ type = LIBXL_PSR_CMT_TYPE_LOCAL_MEM_COUNT;
+ else {
+ help("psr-cmt-show");
+ return 2;
+ }
if (optind + 1 >= argc)
domid = INVALID_DOMID;
@@ -7992,17 +8365,231 @@ int main_psr_cmt_show(int argc, char **argv)
return 2;
}
- switch (type) {
- case LIBXL_PSR_CMT_TYPE_CACHE_OCCUPANCY:
- ret = psr_cmt_show_cache_occupancy(domid);
+ ret = psr_cmt_show(type, domid);
+
+ return ret;
+}
+#endif
+
+#ifdef LIBXL_HAVE_PSR_CAT
+static int psr_cat_hwinfo(void)
+{
+ int rc;
+ int i, nr;
+ uint32_t l3_cache_size;
+ libxl_psr_cat_info *info;
+
+ printf("Cache Allocation Technology (CAT):\n");
+
+ rc = libxl_psr_cat_get_l3_info(ctx, &info, &nr);
+ if (rc) {
+ fprintf(stderr, "Failed to get cat info\n");
+ return rc;
+ }
+
+ for (i = 0; i < nr; i++) {
+ rc = libxl_psr_cmt_get_l3_cache_size(ctx, info[i].id, &l3_cache_size);
+ if (rc) {
+ fprintf(stderr, "Failed to get l3 cache size for socket:%d\n",
+ info[i].id);
+ goto out;
+ }
+ printf("%-16s: %u\n", "Socket ID", info[i].id);
+ printf("%-16s: %uKB\n", "L3 Cache", l3_cache_size);
+ printf("%-16s: %u\n", "Maximum COS", info[i].cos_max);
+ printf("%-16s: %u\n", "CBM length", info[i].cbm_len);
+ printf("%-16s: %#llx\n", "Default CBM",
+ (1ull << info[i].cbm_len) - 1);
+ }
+
+out:
+ libxl_psr_cat_info_list_free(info, nr);
+ return rc;
+}
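The "Default CBM" printed above is simply a mask with cbm_len low bits set, i.e. a capacity bitmask covering the whole cache. A quick check with an illustrative cbm_len of 20 (not read from hardware):

    #include <assert.h>

    int main(void)
    {
        unsigned cbm_len = 20;
        assert(((1ull << cbm_len) - 1) == 0xfffffull);
        return 0;
    }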
+
+static void psr_cat_print_one_domain_cbm(uint32_t domid, uint32_t socketid)
+{
+ char *domain_name;
+ uint64_t cbm;
+
+ domain_name = libxl_domid_to_name(ctx, domid);
+ printf("%5d%25s", domid, domain_name);
+ free(domain_name);
+
+ if (!libxl_psr_cat_get_cbm(ctx, domid, LIBXL_PSR_CBM_TYPE_L3_CBM,
+ socketid, &cbm))
+ printf("%#16"PRIx64, cbm);
+
+ printf("\n");
+}
+
+static int psr_cat_print_domain_cbm(uint32_t domid, uint32_t socketid)
+{
+ int i, nr_domains;
+ libxl_dominfo *list;
+
+ if (domid != INVALID_DOMID) {
+ psr_cat_print_one_domain_cbm(domid, socketid);
+ return 0;
+ }
+
+ if (!(list = libxl_list_domain(ctx, &nr_domains))) {
+ fprintf(stderr, "Failed to get domain list for cbm display\n");
+ return -1;
+ }
+
+ for (i = 0; i < nr_domains; i++)
+ psr_cat_print_one_domain_cbm(list[i].domid, socketid);
+ libxl_dominfo_list_free(list, nr_domains);
+
+ return 0;
+}
+
+static int psr_cat_print_socket(uint32_t domid, libxl_psr_cat_info *info)
+{
+ int rc;
+ uint32_t l3_cache_size;
+
+ rc = libxl_psr_cmt_get_l3_cache_size(ctx, info->id, &l3_cache_size);
+ if (rc) {
+ fprintf(stderr, "Failed to get l3 cache size for socket:%d\n",
+ info->id);
+ return -1;
+ }
+
+ printf("%-16s: %u\n", "Socket ID", info->id);
+ printf("%-16s: %uKB\n", "L3 Cache", l3_cache_size);
+ printf("%-16s: %#llx\n", "Default CBM", (1ull << info->cbm_len) - 1);
+ printf("%5s%25s%16s\n", "ID", "NAME", "CBM");
+
+ return psr_cat_print_domain_cbm(domid, info->id);
+}
+
+static int psr_cat_show(uint32_t domid)
+{
+ int i, nr;
+ int rc;
+ libxl_psr_cat_info *info;
+
+ rc = libxl_psr_cat_get_l3_info(ctx, &info, &nr);
+ if (rc) {
+ fprintf(stderr, "Failed to get cat info\n");
+ return rc;
+ }
+
+ for (i = 0; i < nr; i++) {
+ rc = psr_cat_print_socket(domid, info + i);
+ if (rc)
+ goto out;
+ }
+
+out:
+ libxl_psr_cat_info_list_free(info, nr);
+ return rc;
+}
+
+int main_psr_cat_cbm_set(int argc, char **argv)
+{
+ uint32_t domid;
+ libxl_psr_cbm_type type = LIBXL_PSR_CBM_TYPE_L3_CBM;
+ uint64_t cbm;
+ int ret, opt = 0;
+ libxl_bitmap target_map;
+ char *value;
+ libxl_string_list socket_list;
+ unsigned long start, end;
+ int i, j, len;
+
+ static struct option opts[] = {
+ {"socket", 1, 0, 's'},
+ COMMON_LONG_OPTS
+ };
+
+ libxl_socket_bitmap_alloc(ctx, &target_map, 0);
+ libxl_bitmap_set_none(&target_map);
+
+ SWITCH_FOREACH_OPT(opt, "s:", opts, "psr-cat-cbm-set", 2) {
+ case 's':
+ trim(isspace, optarg, &value);
+ split_string_into_string_list(value, ",", &socket_list);
+ len = libxl_string_list_length(&socket_list);
+ for (i = 0; i < len; i++) {
+ parse_range(socket_list[i], &start, &end);
+ for (j = start; j <= end; j++)
+ libxl_bitmap_set(&target_map, j);
+ }
+
+ libxl_string_list_dispose(&socket_list);
+ free(value);
break;
- default:
- help("psr-cmt-show");
+ }
+
+ if (libxl_bitmap_is_empty(&target_map))
+ libxl_bitmap_set_any(&target_map);
+
+ if (argc != optind + 2) {
+ help("psr-cat-cbm-set");
+ return 2;
+ }
+
+ domid = find_domain(argv[optind]);
+ cbm = strtoll(argv[optind + 1], NULL, 0);
+
+ ret = libxl_psr_cat_set_cbm(ctx, domid, type, &target_map, cbm);
+
+ libxl_bitmap_dispose(&target_map);
+ return ret;
+}
+
+int main_psr_cat_show(int argc, char **argv)
+{
+ int opt;
+ uint32_t domid;
+
+ SWITCH_FOREACH_OPT(opt, "", NULL, "psr-cat-show", 0) {
+ /* No options */
+ }
+
+ if (optind >= argc)
+ domid = INVALID_DOMID;
+ else if (optind == argc - 1)
+ domid = find_domain(argv[optind]);
+ else {
+ help("psr-cat-show");
return 2;
}
+ return psr_cat_show(domid);
+}
+
+int main_psr_hwinfo(int argc, char **argv)
+{
+ int opt, ret = 0;
+ bool all = true, cmt = false, cat = false;
+ static struct option opts[] = {
+ {"cmt", 0, 0, 'm'},
+ {"cat", 0, 0, 'a'},
+ COMMON_LONG_OPTS
+ };
+
+ SWITCH_FOREACH_OPT(opt, "ma", opts, "psr-hwinfo", 0) {
+ case 'm':
+ all = false; cmt = true;
+ break;
+ case 'a':
+ all = false; cat = true;
+ break;
+ }
+
+ if (!ret && (all || cmt))
+ ret = psr_cmt_hwinfo();
+
+ if (!ret && (all || cat))
+ ret = psr_cat_hwinfo();
+
return ret;
}
+
#endif
/*
diff --git a/tools/libxl/xl_cmdtable.c b/tools/libxl/xl_cmdtable.c
index 4b30d3d..0071f12 100644
--- a/tools/libxl/xl_cmdtable.c
+++ b/tools/libxl/xl_cmdtable.c
@@ -30,6 +30,7 @@ struct cmd_spec cmd_table[] = {
"-n, --dryrun Dry run - prints the resulting configuration\n"
" (deprecated in favour of global -N option).\n"
"-d Enable debug messages.\n"
+ "-F Run in foreground until death of the domain.\n"
"-e Do not wait in the background for the death of the domain.\n"
"-V, --vncviewer Connect to the VNC display after the domain is created.\n"
"-A, --vncviewer-autopass\n"
@@ -53,6 +54,7 @@ struct cmd_spec cmd_table[] = {
"-l, --long Output all VM details\n"
"-v, --verbose Prints out UUIDs and security context\n"
"-Z, --context Prints out security context\n"
+ "-c, --cpupool Prints the cpupool the domain is in\n"
"-n, --numa Prints out NUMA node affinity"
},
{ "destroy",
@@ -263,22 +265,6 @@ struct cmd_spec cmd_table[] = {
"-w WEIGHT, --weight=WEIGHT Weight (int)\n"
"-p CPUPOOL, --cpupool=CPUPOOL Restrict output to CPUPOOL"
},
- { "sched-sedf",
- &main_sched_sedf, 0, 1,
- "Get/set sedf scheduler parameters",
- "[options]",
- "-d DOMAIN, --domain=DOMAIN Domain to modify\n"
- "-p MS, --period=MS Relative deadline(ms)\n"
- "-s MS, --slice=MS Worst-case execution time(ms).\n"
- " (slice < period)\n"
- "-l MS, --latency=MS Scaled period (ms) when domain\n"
- " performs heavy I/O\n"
- "-e FLAG, --extra=FLAG Flag (0 or 1) controls if domain\n"
- " can run in extra time\n"
- "-w FLOAT, --weight=FLOAT CPU Period/slice (do not set with\n"
- " --period/--slice)\n"
- "-c CPUPOOL, --cpupool=CPUPOOL Restrict output to CPUPOOL"
- },
{ "sched-rtds",
&main_sched_rtds, 0, 1,
"Get/set rtds scheduler parameters",
@@ -520,9 +506,17 @@ struct cmd_spec cmd_table[] = {
&main_devd, 0, 1,
"Daemon that listens for devices and launches backends",
"[options]",
- "-F Run in the foreground",
+ "-F Run in the foreground.\n"
+ "-p, --pidfile [FILE] Write PID to pidfile when daemonizing.",
},
#ifdef LIBXL_HAVE_PSR_CMT
+ { "psr-hwinfo",
+ &main_psr_hwinfo, 0, 1,
+ "Show hardware information for Platform Shared Resource",
+ "[options]",
+ "-m, --cmt Show Cache Monitoring Technology (CMT) hardware info\n"
+ "-a, --cat Show Cache Allocation Technology (CAT) hardware info\n"
+ },
{ "psr-cmt-attach",
&main_psr_cmt_attach, 0, 1,
"Attach Cache Monitoring Technology service to a domain",
@@ -538,9 +532,25 @@ struct cmd_spec cmd_table[] = {
"Show Cache Monitoring Technology information",
"<PSR-CMT-Type> <Domain>",
"Available monitor types:\n"
- "\"cache_occupancy\": Show L3 cache occupancy\n",
+ "\"cache-occupancy\": Show L3 cache occupancy(KB)\n"
+ "\"total-mem-bandwidth\": Show total memory bandwidth(KB/s)\n"
+ "\"local-mem-bandwidth\": Show local memory bandwidth(KB/s)\n",
},
#endif
+#ifdef LIBXL_HAVE_PSR_CAT
+ { "psr-cat-cbm-set",
+ &main_psr_cat_cbm_set, 0, 1,
+ "Set cache capacity bitmasks(CBM) for a domain",
+ "[options] <Domain> <CBM>",
+ "-s <socket> Specify the socket to process, otherwise all sockets are processed\n"
+ },
+ { "psr-cat-show",
+ &main_psr_cat_show, 0, 1,
+ "Show Cache Allocation Technology information",
+ "<Domain>",
+ },
+
+#endif
};
int cmdtable_len = sizeof(cmd_table)/sizeof(struct cmd_spec);
diff --git a/tools/libxl/xlutil.pc.in.in b/tools/libxl/xlutil.pc.in.in
new file mode 100644
index 0000000..e7dc14d
--- /dev/null
+++ b/tools/libxl/xlutil.pc.in.in
@@ -0,0 +1,9 @@
+prefix=@prefix@
+includedir=@includedir@
+libdir=@libdir@
+
+Name: Xlutil
+Description: The xl utility library for Xen hypervisor
+Version: @@version@@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lxlutil
diff --git a/tools/memshr/Makefile b/tools/memshr/Makefile
index 2c34f96..ce86f20 100644
--- a/tools/memshr/Makefile
+++ b/tools/memshr/Makefile
@@ -39,6 +39,9 @@ install: all
clean:
rm -rf *.a *.o *~ $(DEPS)
-.PHONY: all build clean install
+.PHONY: distclean
+distclean: clean
+
+.PHONY: all build clean install distclean
-include $(DEPS)
diff --git a/tools/memshr/bidir-daemon.c b/tools/memshr/bidir-daemon.c
index a601837..ddb7c00 100644
--- a/tools/memshr/bidir-daemon.c
+++ b/tools/memshr/bidir-daemon.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <pthread.h>
#include <inttypes.h>
diff --git a/tools/memshr/bidir-daemon.h b/tools/memshr/bidir-daemon.h
index 29c3dcd..f683280 100644
--- a/tools/memshr/bidir-daemon.h
+++ b/tools/memshr/bidir-daemon.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __BIDIR_DAEMON_H__
diff --git a/tools/memshr/bidir-hash.c b/tools/memshr/bidir-hash.c
index 3d34637..c5cc71e 100644
--- a/tools/memshr/bidir-hash.c
+++ b/tools/memshr/bidir-hash.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <assert.h>
#include <errno.h>
diff --git a/tools/memshr/bidir-hash.h b/tools/memshr/bidir-hash.h
index cc9166f..d32c8e4 100644
--- a/tools/memshr/bidir-hash.h
+++ b/tools/memshr/bidir-hash.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __BIDIR_HASH_H__
#define __BIDIR_HASH_H__
diff --git a/tools/memshr/bidir-namedefs.h b/tools/memshr/bidir-namedefs.h
index 2694f82..6ab4b3d 100644
--- a/tools/memshr/bidir-namedefs.h
+++ b/tools/memshr/bidir-namedefs.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include "memshr-priv.h"
diff --git a/tools/memshr/interface.c b/tools/memshr/interface.c
index 1c39dfa..ba43c3d 100644
--- a/tools/memshr/interface.c
+++ b/tools/memshr/interface.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <string.h>
#include <inttypes.h>
diff --git a/tools/memshr/memshr-priv.h b/tools/memshr/memshr-priv.h
index 66ab477..f2c3f1f 100644
--- a/tools/memshr/memshr-priv.h
+++ b/tools/memshr/memshr-priv.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __MEMSHR_PRIV_H__
#define __MEMSHR_PRIV_H__
diff --git a/tools/memshr/memshr.h b/tools/memshr/memshr.h
index 32c1d81..f3f898c 100644
--- a/tools/memshr/memshr.h
+++ b/tools/memshr/memshr.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __MEMSHR_H__
#define __MEMSHR_H__
diff --git a/tools/memshr/shm.c b/tools/memshr/shm.c
index 30975dc..00e9a51 100644
--- a/tools/memshr/shm.c
+++ b/tools/memshr/shm.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <assert.h>
#include <stdlib.h>
diff --git a/tools/memshr/shm.h b/tools/memshr/shm.h
index 3ce6758..adb6eb9 100644
--- a/tools/memshr/shm.h
+++ b/tools/memshr/shm.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __SHM_H__
#define __SHM_H__
diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index 7a2bfd2..c4490f3 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -2,54 +2,70 @@ XEN_ROOT=$(CURDIR)/../..
include $(XEN_ROOT)/tools/Rules.mk
CFLAGS += -Werror
-
+# Include configure output (config.h)
+CFLAGS += -include $(XEN_ROOT)/tools/config.h
CFLAGS += $(CFLAGS_libxenctrl)
CFLAGS += $(CFLAGS_xeninclude)
CFLAGS += $(CFLAGS_libxenstore)
-# xen-hptool.c and xen-mfndump.c incorrectly use libxc internals
-CFLAGS += -I$(XEN_ROOT)/tools/libxc
-
-HDRS = $(wildcard *.h)
-TARGETS-y := xenperf xenpm xen-tmem-list-parse gtraceview gtracestat xenlockprof xenwatchdogd xencov
-TARGETS-$(CONFIG_X86) += xen-detect xen-hvmctx xen-hvmcrash xen-lowmemd xen-mfndump
-TARGETS-$(CONFIG_MIGRATE) += xen-hptool
-TARGETS := $(TARGETS-y)
+# Everything to be installed in regular bin/
+INSTALL_BIN-$(CONFIG_X86) += xen-detect
+INSTALL_BIN += xencons
+INSTALL_BIN += xencov_split
+INSTALL_BIN += $(INSTALL_BIN-y)
-INSTALL_BIN-y := xencons xencov_split
-INSTALL_BIN-$(CONFIG_X86) += xen-detect
-INSTALL_BIN := $(INSTALL_BIN-y)
-
-INSTALL_SBIN-y := xen-bugtool xenperf xenpm xen-tmem-list-parse gtraceview \
- gtracestat xenlockprof xenwatchdogd xen-ringwatch xencov
-INSTALL_SBIN-$(CONFIG_X86) += xen-hvmctx xen-hvmcrash xen-lowmemd xen-mfndump
+# Everything to be installed in regular sbin/
+INSTALL_SBIN += gtracestat
+INSTALL_SBIN += gtraceview
+INSTALL_SBIN += xen-bugtool
INSTALL_SBIN-$(CONFIG_MIGRATE) += xen-hptool
-INSTALL_SBIN := $(INSTALL_SBIN-y)
-
-INSTALL_PRIVBIN-y := xenpvnetboot
-INSTALL_PRIVBIN := $(INSTALL_PRIVBIN-y)
-
-# Include configure output (config.h) to headers search path
-CFLAGS += -I$(XEN_ROOT)/tools
-
-.PHONY: all
-all: build
-
-.PHONY: build
-build: $(TARGETS)
+INSTALL_SBIN-$(CONFIG_X86) += xen-hvmcrash
+INSTALL_SBIN-$(CONFIG_X86) += xen-hvmctx
+INSTALL_SBIN-$(CONFIG_X86) += xen-lowmemd
+INSTALL_SBIN-$(CONFIG_X86) += xen-mfndump
+INSTALL_SBIN += xen-ringwatch
+INSTALL_SBIN += xen-tmem-list-parse
+INSTALL_SBIN += xencov
+INSTALL_SBIN += xenlockprof
+INSTALL_SBIN += xenperf
+INSTALL_SBIN += xenpm
+INSTALL_SBIN += xenwatchdogd
+INSTALL_SBIN += $(INSTALL_SBIN-y)
+
+# Everything to be installed in a private bin/
+INSTALL_PRIVBIN += xenpvnetboot
+
+# Everything to be installed
+TARGETS_ALL := $(INSTALL_BIN) $(INSTALL_SBIN) $(INSTALL_PRIVBIN)
+
+# Everything which only needs copying to install
+TARGETS_COPY += xen-bugtool
+TARGETS_COPY += xen-ringwatch
+TARGETS_COPY += xencons
+TARGETS_COPY += xencov_split
+TARGETS_COPY += xenpvnetboot
+
+# Everything which needs to be built
+TARGETS_BUILD := $(filter-out $(TARGETS_COPY),$(TARGETS_ALL))
+
+.PHONY: all build
+all build: $(TARGETS_BUILD)
.PHONY: install
install: build
- $(INSTALL_DIR) $(DESTDIR)$(BINDIR)
- $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(bindir)
+ $(INSTALL_DIR) $(DESTDIR)$(sbindir)
$(INSTALL_DIR) $(DESTDIR)$(LIBEXEC_BIN)
- $(INSTALL_PYTHON_PROG) $(INSTALL_BIN) $(DESTDIR)$(BINDIR)
- $(INSTALL_PYTHON_PROG) $(INSTALL_SBIN) $(DESTDIR)$(SBINDIR)
+ $(INSTALL_PYTHON_PROG) $(INSTALL_BIN) $(DESTDIR)$(bindir)
+ $(INSTALL_PYTHON_PROG) $(INSTALL_SBIN) $(DESTDIR)$(sbindir)
$(INSTALL_PYTHON_PROG) $(INSTALL_PRIVBIN) $(DESTDIR)$(LIBEXEC_BIN)
.PHONY: clean
clean:
- $(RM) *.o $(TARGETS) *~ $(DEPS)
+ $(RM) *.o $(TARGETS_BUILD) *~ $(DEPS)
+
+.PHONY: distclean
+distclean: clean
xen-hvmctx: xen-hvmctx.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
@@ -69,9 +85,13 @@ gtracestat: gtracestat.o
xenlockprof: xenlockprof.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
+# xen-hptool incorrectly uses libxc internals
+xen-hptool.o: CFLAGS += -I$(XEN_ROOT)/tools/libxc
xen-hptool: xen-hptool.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenguest) $(LDLIBS_libxenstore) $(APPEND_LDFLAGS)
+# xen-mfndump incorrectly uses libxc internals
+xen-mfndump.o: CFLAGS += -I$(XEN_ROOT)/tools/libxc
xen-mfndump: xen-mfndump.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenguest) $(APPEND_LDFLAGS)
@@ -82,7 +102,7 @@ xen-lowmemd: xen-lowmemd.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenstore) $(APPEND_LDFLAGS)
gtraceview: gtraceview.o
- $(CC) $(LDFLAGS) -o $@ $< $(CURSES_LIBS) $(APPEND_LDFLAGS)
+ $(CC) $(LDFLAGS) -o $@ $< $(CURSES_LIBS) $(TINFO_LIBS) $(APPEND_LDFLAGS)
xencov: xencov.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
diff --git a/tools/misc/gtracestat.c b/tools/misc/gtracestat.c
index 874a043..5164397 100644
--- a/tools/misc/gtracestat.c
+++ b/tools/misc/gtracestat.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
@@ -167,6 +166,7 @@ int main(int argc, char *argv[])
tsc2phase = atoll(optarg);
if (tsc2phase <= 0)
tsc2phase = 55800000UL;
+ break;
case 'd':
is_digest = 1;
break;
diff --git a/tools/misc/gtraceview.c b/tools/misc/gtraceview.c
index cf9287c..52190a5 100644
--- a/tools/misc/gtraceview.c
+++ b/tools/misc/gtraceview.c
@@ -12,13 +12,9 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
-/* Include output from configure */
-#include <config.h>
-
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -1097,6 +1093,7 @@ void choose_cpus(void)
this->init();
return;
}
+ /* fallthrough */
case KEY_F(4):
exit(EXIT_SUCCESS);
}
diff --git a/tools/misc/mkrpm b/tools/misc/mkrpm
index 9b8c6d9..f9363a1 100644
--- a/tools/misc/mkrpm
+++ b/tools/misc/mkrpm
@@ -17,9 +17,7 @@ xenroot="$1"
# version and release. Default to "0" if there isn't a release.
v=(${2/-/ })
version=${v[0]}
-release=${v[1]}
-
-[[ -n "$release" ]] || release="0"
+release="${v[1]:-0}${PKG_RELEASE:+.$PKG_RELEASE}"
cd $xenroot
diff --git a/tools/misc/mktarball b/tools/misc/mktarball
index aad1096..73282b5 100755
--- a/tools/misc/mktarball
+++ b/tools/misc/mktarball
@@ -6,7 +6,7 @@
set -ex
function git_archive_into {
- mkdir "$2"
+ mkdir -p "$2"
git --git-dir="$1"/.git \
archive --format=tar HEAD | \
@@ -33,6 +33,8 @@ git_archive_into $xen_root/tools/qemu-xen-dir-remote $tdir/xen-$desc/tools/qemu-
git_archive_into $xen_root/tools/qemu-xen-traditional-dir-remote $tdir/xen-$desc/tools/qemu-xen-traditional
+git_archive_into $xen_root/extras/mini-os-remote $tdir/xen-$desc/extras/mini-os
+
GZIP=-9v tar cz -f $xen_root/dist/xen-$desc.tar.gz -C $tdir xen-$desc
echo "Source tarball in $xen_root/dist/xen-$desc.tar.gz"
diff --git a/tools/misc/sbdf2devicepath b/tools/misc/sbdf2devicepath
deleted file mode 100644
index 690834a..0000000
--- a/tools/misc/sbdf2devicepath
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env python
-# -*- mode: python; -*-
-#============================================================================
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of version 2.1 of the GNU Lesser General Public
-# License as published by the Free Software Foundation.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-#============================================================================
-# Copyright (c) 2009, NEC Corporation.
-#============================================================================
-# This script converts SBDF into device path.
-# 'SBDF' format is "[SEG#:]BUS#:DEV#.FUNC#"
-# ex) 0000:0a:1f.3
-# Device path format is "HID[:UID]-DEV#.FUNC#[-DEV#.FUNC#[...]]"
-# ex) PNP0A08:0-2.0-0.0
-#=============================================================================
-
-import sys
-import os
-
-# add fallback path for non-native python path installs if needed
-sys.path.append('/usr/lib/python')
-sys.path.append('/usr/lib64/python')
-from xen.util.pci import *
-
-SYSFS_ACPI_DEVS_PATH = '/firmware/acpi/namespace/ACPI/_SB'
-
-def find_hid_uid(dom, b, d, f):
- obj_list = os.listdir(sb_path)
- for obj in obj_list:
- obj_path = sb_path + '/' + obj.strip() + '/'
- if os.path.exists(obj_path + 'seg') and \
- os.path.exists(obj_path + 'bbn'):
- seg = open(obj_path + 'seg').read()
- bbn = open(obj_path + 'bbn').read()
- if int(seg) == dom and int(bbn) == b:
- hid = open(obj_path + 'hid').read()
- if os.path.exists(obj_path + 'uid') is False:
- path_str = hid.strip()
- else:
- uid = open(obj_path + 'uid').read()
- path_str = hid.strip() + ':' + uid.strip()
- return path_str
- return None
-
-def make_device_path(dom, b, d, f):
- dev = PciDevice(dom, b, d, f)
- parent = dev.find_parent()
- if parent is None:
- path_str = find_hid_uid(dom, b, d, f)
- path_str = path_str + '-' + hex(d).replace('0x', '') + '.' + \
- hex(f).replace('0x', '')
- return path_str
- (pdom, pb, pd, pf) = parent
- path_str = make_device_path(pdom, pb, pd, pf)
- path_str = path_str + '-' + hex(d).replace('0x', '') + '.' + \
- hex(f).replace('0x', '')
- return path_str
-
-# main
-if len(sys.argv) <> 2:
- print 'Usage: sbdf2devicepath SBDF\n'
-else:
- sb_path = find_sysfs_mnt() + SYSFS_ACPI_DEVS_PATH
- if os.path.exists(sb_path):
- path = os.environ['PATH']
- os.environ['PATH'] = path + ':/sbin' + ':/user/sbin'
- sbdf = sys.argv[1]
- (dom, b, d, f) = parse_pci_name(sbdf)
- path_str = make_device_path(dom, b, d, f)
- print path_str
- else:
- print sb_path + ' not found.\n'
- print 'This command is only for linux 2.6.18.8 xen kernel.\n'
diff --git a/tools/misc/xen-hptool.c b/tools/misc/xen-hptool.c
index 1134603..c7561a9 100644
--- a/tools/misc/xen-hptool.c
+++ b/tools/misc/xen-hptool.c
@@ -49,7 +49,7 @@ static int hp_mem_online_func(int argc, char *argv[])
ret = xc_mark_page_online(xch, mfn, mfn, &status);
if (ret < 0)
- fprintf(stderr, "Onlining page mfn %lx failed, error %x", mfn, ret);
+ fprintf(stderr, "Onlining page mfn %lx failed, error %x", mfn, errno);
else if (status & (PG_ONLINE_FAILED |PG_ONLINE_BROKEN)) {
fprintf(stderr, "Onlining page mfn %lx is broken, "
"Memory online failed\n", mfn);
@@ -80,7 +80,7 @@ static int hp_mem_query_func(int argc, char *argv[])
ret = xc_query_page_offline_status(xch, mfn, mfn, &status);
if (ret < 0)
- fprintf(stderr, "Querying page mfn %lx failed, error %x", mfn, ret);
+ fprintf(stderr, "Querying page mfn %lx failed, error %x", mfn, errno);
else
{
printf("Memory Status %x: [", status);
@@ -160,7 +160,7 @@ static int hp_mem_offline_func(int argc, char *argv[])
printf("Prepare to offline MEMORY mfn %lx\n", mfn);
ret = xc_mark_page_offline(xch, mfn, mfn, &status);
if (ret < 0) {
- fprintf(stderr, "Offlining page mfn %lx failed, error %x\n", mfn, ret);
+ fprintf(stderr, "Offlining page mfn %lx failed, error %x\n", mfn, errno);
if (status & (PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED))
fprintf(stderr, "XEN_PAGE is not permitted be offlined\n");
else if (status & (PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM))
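The three hunks above switch the failure messages from printing the hypercall's return value to printing errno, matching the libxc convention that calls return a negative value and leave the detail in errno. A minimal hedged sketch of the reporting pattern (strerror() is added here for readability and is not part of this patch):

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <xenctrl.h>

    /* Hedged sketch: libxc calls return < 0 on failure and leave the
     * actual error in errno, so report errno rather than the return value. */
    static void online_page(xc_interface *xch, unsigned long mfn)
    {
        uint32_t status;

        if ( xc_mark_page_online(xch, mfn, mfn, &status) < 0 )
            fprintf(stderr, "Onlining page mfn %lx failed, error %x (%s)\n",
                    mfn, errno, strerror(errno));
    }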
diff --git a/tools/misc/xen-mfndump.c b/tools/misc/xen-mfndump.c
index 0761f6e..ceeeaa9 100644
--- a/tools/misc/xen-mfndump.c
+++ b/tools/misc/xen-mfndump.c
@@ -31,7 +31,7 @@ int help_func(int argc, char *argv[])
int dump_m2p_func(int argc, char *argv[])
{
unsigned long i;
- long max_mfn;
+ unsigned long max_mfn;
xen_pfn_t *m2p_table;
if ( argc > 0 )
@@ -41,8 +41,7 @@ int dump_m2p_func(int argc, char *argv[])
}
/* Map M2P and obtain gpfn */
- max_mfn = xc_maximum_ram_page(xch);
- if ( max_mfn < 0 )
+ if ( xc_maximum_ram_page(xch, &max_mfn) < 0 )
{
ERROR("Failed to get the maximum mfn");
return -1;
@@ -183,8 +182,8 @@ int dump_ptes_func(int argc, char *argv[])
}
/* Map M2P and obtain gpfn */
- max_mfn = xc_maximum_ram_page(xch);
- if ( (mfn > max_mfn) ||
+ rc = xc_maximum_ram_page(xch, &max_mfn);
+ if ( rc || (mfn > max_mfn) ||
!(m2p_table = xc_map_m2p(xch, max_mfn, PROT_READ, NULL)) )
{
xc_unmap_domain_meminfo(xch, &minfo);
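Both hunks adapt to the changed xc_maximum_ram_page(), which now returns a status code and hands the MFN back through an out parameter instead of encoding it in a (signed) return value. A minimal sketch of the new calling convention, assuming only the signature implied by the hunks:

    #include <stdio.h>
    #include <xenctrl.h>

    /* Hedged sketch: xc_maximum_ram_page() as used above returns < 0 on
     * failure and writes the maximum MFN through its out parameter. */
    static int print_max_mfn(xc_interface *xch)
    {
        unsigned long max_mfn;

        if ( xc_maximum_ram_page(xch, &max_mfn) < 0 )
        {
            fprintf(stderr, "Failed to get the maximum mfn\n");
            return -1;
        }
        printf("Maximum MFN: %lx\n", max_mfn);
        return 0;
    }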
diff --git a/tools/misc/xen-ringwatch b/tools/misc/xen-ringwatch
index b81db77..e6f5361 100644
--- a/tools/misc/xen-ringwatch
+++ b/tools/misc/xen-ringwatch
@@ -12,9 +12,7 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
-# USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#
"""Overview:
diff --git a/tools/misc/xencov.c b/tools/misc/xencov.c
index fb4b2ff..2aafb1d 100644
--- a/tools/misc/xencov.c
+++ b/tools/misc/xencov.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xenctrl.h>
diff --git a/tools/misc/xencov_split b/tools/misc/xencov_split
index 2e5aa80..b11f27c 100755
--- a/tools/misc/xencov_split
+++ b/tools/misc/xencov_split
@@ -16,8 +16,7 @@
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
use strict;
use File::Path qw(mkpath);
diff --git a/tools/misc/xenpm.c b/tools/misc/xenpm.c
index e43924c..08f2242 100644
--- a/tools/misc/xenpm.c
+++ b/tools/misc/xenpm.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#define MAX_NR_CPU 512
@@ -355,16 +354,18 @@ static void signal_int_handler(int signo)
int i, j, k;
struct timeval tv;
int cx_cap = 0, px_cap = 0;
- DECLARE_HYPERCALL_BUFFER(uint32_t, cpu_to_core);
- DECLARE_HYPERCALL_BUFFER(uint32_t, cpu_to_socket);
- DECLARE_HYPERCALL_BUFFER(uint32_t, cpu_to_node);
- xc_topologyinfo_t info = { 0 };
+ xc_cputopo_t *cputopo = NULL;
+ unsigned max_cpus = 0;
- cpu_to_core = xc_hypercall_buffer_alloc(xc_handle, cpu_to_core, sizeof(*cpu_to_core) * MAX_NR_CPU);
- cpu_to_socket = xc_hypercall_buffer_alloc(xc_handle, cpu_to_socket, sizeof(*cpu_to_socket) * MAX_NR_CPU);
- cpu_to_node = xc_hypercall_buffer_alloc(xc_handle, cpu_to_node, sizeof(*cpu_to_node) * MAX_NR_CPU);
+ if ( xc_cputopoinfo(xc_handle, &max_cpus, NULL) != 0 )
+ {
+ fprintf(stderr, "failed to discover number of CPUs: %s\n",
+ strerror(errno));
+ goto out;
+ }
- if ( cpu_to_core == NULL || cpu_to_socket == NULL || cpu_to_node == NULL )
+ cputopo = calloc(max_cpus, sizeof(*cputopo));
+ if ( cputopo == NULL )
{
fprintf(stderr, "failed to allocate hypercall buffers\n");
goto out;
@@ -448,47 +449,42 @@ static void signal_int_handler(int signo)
printf(" Avg freq\t%d\tKHz\n", avgfreq[i]);
}
- set_xen_guest_handle(info.cpu_to_core, cpu_to_core);
- set_xen_guest_handle(info.cpu_to_socket, cpu_to_socket);
- set_xen_guest_handle(info.cpu_to_node, cpu_to_node);
- info.max_cpu_index = MAX_NR_CPU - 1;
-
- if ( cx_cap && !xc_topologyinfo(xc_handle, &info) )
+ if ( cx_cap && !xc_cputopoinfo(xc_handle, &max_cpus, cputopo) )
{
uint32_t socket_ids[MAX_NR_CPU];
uint32_t core_ids[MAX_NR_CPU];
uint32_t socket_nr = 0;
uint32_t core_nr = 0;
- if ( info.max_cpu_index > MAX_NR_CPU - 1 )
- info.max_cpu_index = MAX_NR_CPU - 1;
+ if ( max_cpus > MAX_NR_CPU )
+ max_cpus = MAX_NR_CPU;
/* check validity */
- for ( i = 0; i <= info.max_cpu_index; i++ )
+ for ( i = 0; i < max_cpus; i++ )
{
- if ( cpu_to_core[i] == INVALID_TOPOLOGY_ID ||
- cpu_to_socket[i] == INVALID_TOPOLOGY_ID )
+ if ( cputopo[i].core == XEN_INVALID_CORE_ID ||
+ cputopo[i].socket == XEN_INVALID_SOCKET_ID )
break;
}
- if ( i > info.max_cpu_index )
+ if ( i >= max_cpus )
{
/* find socket nr & core nr per socket */
- for ( i = 0; i <= info.max_cpu_index; i++ )
+ for ( i = 0; i < max_cpus; i++ )
{
for ( j = 0; j < socket_nr; j++ )
- if ( cpu_to_socket[i] == socket_ids[j] )
+ if ( cputopo[i].socket == socket_ids[j] )
break;
if ( j == socket_nr )
{
- socket_ids[j] = cpu_to_socket[i];
+ socket_ids[j] = cputopo[i].socket;
socket_nr++;
}
for ( j = 0; j < core_nr; j++ )
- if ( cpu_to_core[i] == core_ids[j] )
+ if ( cputopo[i].core == core_ids[j] )
break;
if ( j == core_nr )
{
- core_ids[j] = cpu_to_core[i];
+ core_ids[j] = cputopo[i].core;
core_nr++;
}
}
@@ -499,9 +495,9 @@ static void signal_int_handler(int signo)
unsigned int n;
uint64_t res;
- for ( j = 0; j <= info.max_cpu_index; j++ )
+ for ( j = 0; j < max_cpus; j++ )
{
- if ( cpu_to_socket[j] == socket_ids[i] )
+ if ( cputopo[j].socket == socket_ids[i] )
break;
}
printf("\nSocket %d\n", socket_ids[i]);
@@ -518,10 +514,10 @@ static void signal_int_handler(int signo)
}
for ( k = 0; k < core_nr; k++ )
{
- for ( j = 0; j <= info.max_cpu_index; j++ )
+ for ( j = 0; j < max_cpus; j++ )
{
- if ( cpu_to_socket[j] == socket_ids[i] &&
- cpu_to_core[j] == core_ids[k] )
+ if ( cputopo[j].socket == socket_ids[i] &&
+ cputopo[j].core == core_ids[k] )
break;
}
printf("\t Core %d CPU %d\n", core_ids[k], j);
@@ -556,9 +552,7 @@ static void signal_int_handler(int signo)
free(sum);
free(avgfreq);
out:
- xc_hypercall_buffer_free(xc_handle, cpu_to_core);
- xc_hypercall_buffer_free(xc_handle, cpu_to_socket);
- xc_hypercall_buffer_free(xc_handle, cpu_to_node);
+ free(cputopo);
xc_interface_close(xc_handle);
exit(0);
}
@@ -965,28 +959,27 @@ void scaling_governor_func(int argc, char *argv[])
void cpu_topology_func(int argc, char *argv[])
{
- DECLARE_HYPERCALL_BUFFER(uint32_t, cpu_to_core);
- DECLARE_HYPERCALL_BUFFER(uint32_t, cpu_to_socket);
- DECLARE_HYPERCALL_BUFFER(uint32_t, cpu_to_node);
- xc_topologyinfo_t info = { 0 };
- int i, rc = ENOMEM;
+ xc_cputopo_t *cputopo = NULL;
+ unsigned max_cpus = 0;
+ int i, rc;
- cpu_to_core = xc_hypercall_buffer_alloc(xc_handle, cpu_to_core, sizeof(*cpu_to_core) * MAX_NR_CPU);
- cpu_to_socket = xc_hypercall_buffer_alloc(xc_handle, cpu_to_socket, sizeof(*cpu_to_socket) * MAX_NR_CPU);
- cpu_to_node = xc_hypercall_buffer_alloc(xc_handle, cpu_to_node, sizeof(*cpu_to_node) * MAX_NR_CPU);
+ if ( xc_cputopoinfo(xc_handle, &max_cpus, NULL) != 0 )
+ {
+ rc = errno;
+ fprintf(stderr, "failed to discover number of CPUs (%d - %s)\n",
+ errno, strerror(errno));
+ goto out;
+ }
- if ( cpu_to_core == NULL || cpu_to_socket == NULL || cpu_to_node == NULL )
+ cputopo = calloc(max_cpus, sizeof(*cputopo));
+ if ( cputopo == NULL )
{
+ rc = ENOMEM;
fprintf(stderr, "failed to allocate hypercall buffers\n");
goto out;
}
- set_xen_guest_handle(info.cpu_to_core, cpu_to_core);
- set_xen_guest_handle(info.cpu_to_socket, cpu_to_socket);
- set_xen_guest_handle(info.cpu_to_node, cpu_to_node);
- info.max_cpu_index = MAX_NR_CPU-1;
-
- if ( xc_topologyinfo(xc_handle, &info) )
+ if ( xc_cputopoinfo(xc_handle, &max_cpus, cputopo) )
{
rc = errno;
fprintf(stderr, "Cannot get Xen CPU topology (%d - %s)\n",
@@ -994,22 +987,17 @@ void cpu_topology_func(int argc, char *argv[])
goto out;
}
- if ( info.max_cpu_index > (MAX_NR_CPU-1) )
- info.max_cpu_index = MAX_NR_CPU-1;
-
printf("CPU\tcore\tsocket\tnode\n");
- for ( i = 0; i <= info.max_cpu_index; i++ )
+ for ( i = 0; i < max_cpus; i++ )
{
- if ( cpu_to_core[i] == INVALID_TOPOLOGY_ID )
+ if ( cputopo[i].core == XEN_INVALID_CORE_ID )
continue;
printf("CPU%d\t %d\t %d\t %d\n",
- i, cpu_to_core[i], cpu_to_socket[i], cpu_to_node[i]);
+ i, cputopo[i].core, cputopo[i].socket, cputopo[i].node);
}
rc = 0;
out:
- xc_hypercall_buffer_free(xc_handle, cpu_to_core);
- xc_hypercall_buffer_free(xc_handle, cpu_to_socket);
- xc_hypercall_buffer_free(xc_handle, cpu_to_node);
+ free(cputopo);
if ( rc )
exit(rc);
}
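Both converted call sites in xenpm.c follow the same two-step pattern for the new xc_cputopoinfo(): call once with a NULL buffer to learn the CPU count, allocate the array, then call again to fill it. A hedged, self-contained sketch of that pattern, using only the signature and fields visible in the hunks above:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <xenctrl.h>

    /* Hedged sketch of the query-allocate-fetch pattern used above. */
    static int show_topology(xc_interface *xch)
    {
        xc_cputopo_t *cputopo = NULL;
        unsigned int i, max_cpus = 0;

        /* First call: NULL buffer, only discover the number of CPUs. */
        if ( xc_cputopoinfo(xch, &max_cpus, NULL) != 0 )
        {
            fprintf(stderr, "cannot get CPU count: %s\n", strerror(errno));
            return -1;
        }

        cputopo = calloc(max_cpus, sizeof(*cputopo));
        if ( cputopo == NULL )
            return -1;

        /* Second call: fill the array; max_cpus is in/out. */
        if ( xc_cputopoinfo(xch, &max_cpus, cputopo) != 0 )
        {
            free(cputopo);
            return -1;
        }

        for ( i = 0; i < max_cpus; i++ )
            if ( cputopo[i].core != XEN_INVALID_CORE_ID )
                printf("CPU%u\tcore %u\tsocket %u\tnode %u\n", i,
                       cputopo[i].core, cputopo[i].socket, cputopo[i].node);

        free(cputopo);
        return 0;
    }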
diff --git a/tools/misc/xenpvnetboot b/tools/misc/xenpvnetboot
index 98413f0..be972b9 100755
--- a/tools/misc/xenpvnetboot
+++ b/tools/misc/xenpvnetboot
@@ -8,9 +8,7 @@
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details. You should have received a copy of the GNU
-# General Public License along with this program; if not, write to the Free
-# Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 021110-1307,
-# USA.
+# General Public License along with this program; If not, see <http://www.gnu.org/licenses/>.
import sys
import os
diff --git a/tools/ocaml/LICENSE b/tools/ocaml/LICENSE
index 80fe144..b9e62cd 100644
--- a/tools/ocaml/LICENSE
+++ b/tools/ocaml/LICENSE
@@ -181,8 +181,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+License along with this library; If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
diff --git a/tools/ocaml/Makefile b/tools/ocaml/Makefile
index 5710a5f..cfa931a 100644
--- a/tools/ocaml/Makefile
+++ b/tools/ocaml/Makefile
@@ -20,3 +20,6 @@ install: subdirs-install
.PHONY: clean
clean: subdirs-clean
+
+.PHONY: distclean
+distclean: subdirs-distclean
diff --git a/tools/ocaml/Makefile.rules b/tools/ocaml/Makefile.rules
index 0745e83..1796060 100644
--- a/tools/ocaml/Makefile.rules
+++ b/tools/ocaml/Makefile.rules
@@ -47,6 +47,8 @@ ALL_OCAML_OBJ_SOURCES=$(addsuffix .ml, $(ALL_OCAML_OBJS))
clean: $(CLEAN_HOOKS)
$(Q)rm -f .*.d *.o *.so *.a *.cmo *.cmi *.cma *.cmx *.cmxa *.annot *.spot *.spit $(LIBS) $(PROGRAMS) $(GENERATED_FILES) .ocamldep.make META
+distclean: clean
+
quiet-command = $(if $(V),$1,@printf " %-8s %s\n" "$2" "$3" && $1)
mk-caml-lib-native = $(call quiet-command, $(OCAMLOPT) $(OCAMLOPTFLAGS) -a -o $1 $2 $3,MLA,$1)
diff --git a/tools/ocaml/libs/Makefile b/tools/ocaml/libs/Makefile
index 3afdc89..f7c3c0e 100644
--- a/tools/ocaml/libs/Makefile
+++ b/tools/ocaml/libs/Makefile
@@ -15,3 +15,6 @@ install: subdirs-install
.PHONY: clean
clean: subdirs-clean
+
+.PHONY: distclean
+distclean: subdirs-distclean
diff --git a/tools/ocaml/libs/xb/op.ml b/tools/ocaml/libs/xb/op.ml
index 0ee8666..69346d8 100644
--- a/tools/ocaml/libs/xb/op.ml
+++ b/tools/ocaml/libs/xb/op.ml
@@ -19,7 +19,8 @@ type operation = Debug | Directory | Read | Getperms |
Transaction_end | Introduce | Release |
Getdomainpath | Write | Mkdir | Rm |
Setperms | Watchevent | Error | Isintroduced |
- Resume | Set_target | Restrict | Invalid
+ Resume | Set_target | Restrict | Reset_watches |
+ Invalid
let operation_c_mapping =
[| Debug; Directory; Read; Getperms;
@@ -27,7 +28,7 @@ let operation_c_mapping =
Transaction_end; Introduce; Release;
Getdomainpath; Write; Mkdir; Rm;
Setperms; Watchevent; Error; Isintroduced;
- Resume; Set_target; Restrict |]
+ Resume; Set_target; Restrict; Reset_watches |]
let size = Array.length operation_c_mapping
let array_search el a =
@@ -68,4 +69,5 @@ let to_string ty =
| Resume -> "RESUME"
| Set_target -> "SET_TARGET"
| Restrict -> "RESTRICT"
+ | Reset_watches -> "RESET_WATCHES"
| Invalid -> "INVALID"
diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli
index 4e1f833..6c242da 100644
--- a/tools/ocaml/libs/xb/xb.mli
+++ b/tools/ocaml/libs/xb/xb.mli
@@ -23,6 +23,7 @@ module Op :
| Resume
| Set_target
| Restrict
+ | Reset_watches
| Invalid
val operation_c_mapping : operation array
val size : int
diff --git a/tools/ocaml/libs/xb/xs_ring_stubs.c b/tools/ocaml/libs/xb/xs_ring_stubs.c
index fc9b0c5..fd561a2 100644
--- a/tools/ocaml/libs/xb/xs_ring_stubs.c
+++ b/tools/ocaml/libs/xb/xs_ring_stubs.c
@@ -55,7 +55,7 @@ CAMLprim value ml_interface_read(value ml_interface,
cons = *(volatile uint32_t*)&intf->req_cons;
prod = *(volatile uint32_t*)&intf->req_prod;
- connection = *(volatile uint32*)&intf->connection;
+ connection = *(volatile uint32_t*)&intf->connection;
if (connection != XENSTORE_CONNECTED)
caml_raise_constant(*caml_named_value("Xb.Reconnect"));
@@ -105,7 +105,7 @@ CAMLprim value ml_interface_write(value ml_interface,
cons = *(volatile uint32_t*)&intf->rsp_cons;
prod = *(volatile uint32_t*)&intf->rsp_prod;
- connection = *(volatile uint32*)&intf->connection;
+ connection = *(volatile uint32_t*)&intf->connection;
if (connection != XENSTORE_CONNECTED)
caml_raise_constant(*caml_named_value("Xb.Reconnect"));
diff --git a/tools/ocaml/libs/xc/xenctrl_stubs.c b/tools/ocaml/libs/xc/xenctrl_stubs.c
index f0810eb..b7de615 100644
--- a/tools/ocaml/libs/xc/xenctrl_stubs.c
+++ b/tools/ocaml/libs/xc/xenctrl_stubs.c
@@ -51,21 +51,22 @@
i1 = (uint32_t) Int64_val(Field(input, 0)); \
i2 = ((Field(input, 1) == Val_none) ? 0xffffffff : (uint32_t) Int64_val(Field(Field(input, 1), 0)));
-#define ERROR_STRLEN 1024
-void failwith_xc(xc_interface *xch)
+static void Noreturn failwith_xc(xc_interface *xch)
{
- static char error_str[ERROR_STRLEN];
+ char error_str[256];
if (xch) {
const xc_error *error = xc_get_last_error(xch);
if (error->code == XC_ERROR_NONE)
- snprintf(error_str, ERROR_STRLEN, "%d: %s", errno, strerror(errno));
+ snprintf(error_str, sizeof(error_str),
+ "%d: %s", errno, strerror(errno));
else
- snprintf(error_str, ERROR_STRLEN, "%d: %s: %s",
- error->code,
+ snprintf(error_str, sizeof(error_str),
+ "%d: %s: %s", error->code,
xc_error_code_to_desc(error->code),
error->message);
} else {
- snprintf(error_str, ERROR_STRLEN, "Unable to open XC interface");
+ snprintf(error_str, sizeof(error_str),
+ "Unable to open XC interface");
}
caml_raise_with_string(*caml_named_value("xc.error"), error_str);
}
@@ -457,6 +458,9 @@ CAMLprim value stub_xc_vcpu_getaffinity(value xch, value domid,
int i, len = xc_get_max_cpus(_H(xch));
int retval;
+ if (len < 1)
+ failwith_xc(_H(xch));
+
c_cpumap = xc_cpumap_alloc(_H(xch));
if (c_cpumap == NULL)
failwith_xc(_H(xch));
@@ -526,26 +530,65 @@ CAMLprim value stub_xc_evtchn_reset(value xch, value domid)
}
-#define RING_SIZE 32768
-static char ring[RING_SIZE];
-
CAMLprim value stub_xc_readconsolering(value xch)
{
- unsigned int size = RING_SIZE - 1;
- char *ring_ptr = ring;
- int retval;
+ /* Safe to use outside of blocking sections because of the OCaml GC lock. */
+ static unsigned int conring_size = 16384 + 1;
+
+ unsigned int count = conring_size, size = count, index = 0;
+ char *str = NULL, *ptr;
+ int ret;
CAMLparam1(xch);
+ CAMLlocal1(ring);
+
+ str = malloc(size);
+ if (!str)
+ caml_raise_out_of_memory();
+ /* Hopefully our conring_size guess is sufficient */
caml_enter_blocking_section();
- retval = xc_readconsolering(_H(xch), ring_ptr, &size, 0, 0, NULL);
+ ret = xc_readconsolering(_H(xch), str, &count, 0, 0, &index);
caml_leave_blocking_section();
- if (retval)
+ if (ret < 0) {
+ free(str);
failwith_xc(_H(xch));
+ }
+
+ while (count == size && ret >= 0) {
+ size += count - 1;
+ if (size < count)
+ break;
+
+ ptr = realloc(str, size);
+ if (!ptr)
+ break;
+
+ str = ptr + count;
+ count = size - count;
+
+ caml_enter_blocking_section();
+ ret = xc_readconsolering(_H(xch), str, &count, 0, 1, &index);
+ caml_leave_blocking_section();
- ring[size] = '\0';
- CAMLreturn(caml_copy_string(ring));
+ count += str - ptr;
+ str = ptr;
+ }
+
+ /*
+ * If we didn't break out due to a size overflow, and we needed to
+ * realloc() more space, update our tracking of the real console
+ * ring size.
+ */
+ if (size > conring_size)
+ conring_size = size;
+
+ ring = caml_alloc_string(count);
+ memcpy(String_val(ring), str, count);
+ free(str);
+
+ CAMLreturn(ring);
}
CAMLprim value stub_xc_send_debug_keys(value xch, value keys)
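The rewritten stub above replaces a fixed static ring buffer with a grow-and-retry loop: read into a guessed buffer and, as long as the hypercall fills it completely, enlarge the buffer and continue incrementally from the saved ring index. A hedged, simplified sketch of the same pattern (tracking a running total instead of the pointer arithmetic above), assuming xc_readconsolering(xch, buf, &count, clear, incremental, &index) with count as an in/out byte count, as called in this diff:

    #include <stdint.h>
    #include <stdlib.h>
    #include <xenctrl.h>

    /* Hedged sketch: returns a malloc()ed buffer with *len bytes of
     * console output, or NULL on failure.  The caller frees it. */
    static char *read_whole_conring(xc_interface *xch, unsigned int *len)
    {
        unsigned int size = 16384, total = 0, count;
        uint32_t index = 0;
        char *buf = malloc(size), *tmp;

        if ( !buf )
            return NULL;

        for ( ;; )
        {
            count = size - total;   /* free space left in buf */
            /* The first pass reads from the start; later passes continue
             * incrementally from the saved index. */
            if ( xc_readconsolering(xch, buf + total, &count,
                                    0, total ? 1 : 0, &index) < 0 )
                break;
            total += count;
            if ( total < size )     /* ring exhausted */
                break;
            size *= 2;              /* buffer was filled: grow and retry */
            tmp = realloc(buf, size);
            if ( !tmp )
                break;              /* keep what we already have */
            buf = tmp;
        }

        if ( total == 0 )
        {
            free(buf);
            return NULL;
        }
        *len = total;
        return buf;
    }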
@@ -821,6 +864,12 @@ CAMLprim value stub_xc_version_version(value xch)
caml_enter_blocking_section();
packed = xc_version(_H(xch), XENVER_version, NULL);
+ caml_leave_blocking_section();
+
+ if (packed < 0)
+ failwith_xc(_H(xch));
+
+ caml_enter_blocking_section();
retval = xc_version(_H(xch), XENVER_extraversion, &extra);
caml_leave_blocking_section();
@@ -1123,12 +1172,17 @@ CAMLprim value stub_xc_domain_test_assign_device(value xch, value domid, value d
CAMLreturn(Val_bool(ret == 0));
}
-CAMLprim value stub_xc_domain_assign_device(value xch, value domid, value desc)
+static int domain_assign_device_rdm_flag_table[] = {
+ XEN_DOMCTL_DEV_RDM_RELAXED,
+};
+
+CAMLprim value stub_xc_domain_assign_device(value xch, value domid, value desc,
+ value rflag)
{
- CAMLparam3(xch, domid, desc);
+ CAMLparam4(xch, domid, desc, rflag);
int ret;
int domain, bus, dev, func;
- uint32_t sbdf;
+ uint32_t sbdf, flag;
domain = Int_val(Field(desc, 0));
bus = Int_val(Field(desc, 1));
@@ -1136,7 +1190,10 @@ CAMLprim value stub_xc_domain_assign_device(value xch, value domid, value desc)
func = Int_val(Field(desc, 3));
sbdf = encode_sbdf(domain, bus, dev, func);
- ret = xc_assign_device(_H(xch), _D(domid), sbdf);
+ ret = Int_val(Field(rflag, 0));
+ flag = domain_assign_device_rdm_flag_table[ret];
+
+ ret = xc_assign_device(_H(xch), _D(domid), sbdf, flag);
if (ret < 0)
failwith_xc(_H(xch));
diff --git a/tools/ocaml/libs/xl/genwrap.py b/tools/ocaml/libs/xl/genwrap.py
index 402e489..1c8ad81 100644
--- a/tools/ocaml/libs/xl/genwrap.py
+++ b/tools/ocaml/libs/xl/genwrap.py
@@ -87,7 +87,10 @@ def ocaml_type_of(ty):
elif isinstance(ty,idl.KeyedUnion):
return ty.union_name
elif isinstance(ty,idl.Aggregate):
- return ty.rawname.capitalize() + ".t"
+ if ty.rawname is None:
+ return ty.anon_struct
+ else:
+ return ty.rawname.capitalize() + ".t"
else:
return ty.rawname
@@ -111,14 +114,14 @@ def ocaml_instance_of_field(f):
name = f.name
return "%s : %s" % (munge_name(name), ocaml_type_of(f.type))
-def gen_struct(ty):
+def gen_struct(ty, indent):
s = ""
for f in ty.fields:
if f.type.private:
continue
x = ocaml_instance_of_field(f)
- x = x.replace("\n", "\n\t\t")
- s += "\t\t" + x + ";\n"
+ x = x.replace("\n", "\n"+indent)
+ s += indent + x + ";\n"
return s
def gen_ocaml_keyedunions(ty, interface, indent, parent = None):
@@ -140,7 +143,7 @@ def gen_ocaml_keyedunions(ty, interface, indent, parent = None):
if isinstance(f.type, idl.Struct) and not f.type.has_fields(): continue
s += "\ntype %s_%s =\n" % (nparent,f.name)
s += "{\n"
- s += gen_struct(f.type)
+ s += gen_struct(f.type, indent + "\t")
s += "}\n"
name = "%s__union" % ty.keyvar.name
@@ -169,6 +172,23 @@ def gen_ocaml_keyedunions(ty, interface, indent, parent = None):
return None, None
return s.replace("\n", "\n%s" % indent), union_type
+def gen_ocaml_anonstruct(ty, interface, indent, parent = None):
+ s= ""
+
+ if ty.rawname is not None:
+ # Non-anonymous types need no special handling
+ pass
+ elif isinstance(ty, idl.Struct):
+ name = "%s__anon" % parent
+ s += "type %s = {\n" % name
+ s += gen_struct(ty, indent)
+ s += "}\n"
+ ty.anon_struct = name
+ if s == "":
+ return None
+ s = indent + s
+ return s.replace("\n", "\n%s" % indent)
+
def gen_ocaml_ml(ty, interface, indent=""):
if interface:
@@ -212,9 +232,16 @@ def gen_ocaml_ml(ty, interface, indent=""):
if union_type is not None:
union_types.append(union_type)
+ # Handle anonymous structs...
+ for f in ty.fields:
+ anon = gen_ocaml_anonstruct(f.type, interface, "\t", f.name)
+ if anon is not None:
+ s += anon
+ s += "\n"
+
s += "\ttype t =\n"
s += "\t{\n"
- s += gen_struct(ty)
+ s += gen_struct(ty, "\t\t")
s += "\t}\n"
if ty.init_fn is not None:
diff --git a/tools/ocaml/libs/xs/xs.ml b/tools/ocaml/libs/xs/xs.ml
index 5757571..7e14487 100644
--- a/tools/ocaml/libs/xs/xs.ml
+++ b/tools/ocaml/libs/xs/xs.ml
@@ -162,7 +162,13 @@ let daemon_open () =
with _ -> raise Failed_to_connect
let domain_open () =
- let path = "/proc/xen/xenbus" in
+ let path = try
+ let devpath = "/dev/xen/xenbus" in
+ Unix.access devpath [ Unix.F_OK ];
+ devpath
+ with Unix.Unix_error(_, _, _) ->
+ "/proc/xen/xenbus" in
+
let fd = Unix.openfile path [ Unix.O_RDWR ] 0o550 in
Unix.set_close_on_exec fd;
make fd
diff --git a/tools/ocaml/xenstored/Makefile b/tools/ocaml/xenstored/Makefile
index 48f1079..59875f7 100644
--- a/tools/ocaml/xenstored/Makefile
+++ b/tools/ocaml/xenstored/Makefile
@@ -2,7 +2,8 @@ XEN_ROOT = $(CURDIR)/../../..
OCAML_TOPLEVEL = $(CURDIR)/..
include $(OCAML_TOPLEVEL)/common.make
-CFLAGS += -I$(XEN_ROOT)/tools/
+# Include configure output (config.h)
+CFLAGS += -include $(XEN_ROOT)/tools/config.h
CFLAGS-$(CONFIG_SYSTEMD) += $(SYSTEMD_CFLAGS)
LDFLAGS-$(CONFIG_SYSTEMD) += $(SYSTEMD_LIBS)
@@ -29,6 +30,8 @@ systemd_OBJS = systemd
systemd_C_OBJS = systemd_stubs
OCAML_LIBRARY += systemd
+LIBS_systemd += $(LDFLAGS-y)
+
OBJS = define \
stdext \
trie \
@@ -77,8 +80,8 @@ bins: $(PROGRAMS)
libs: $(LIBS)
install: all
- $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
- $(INSTALL_PROG) oxenstored $(DESTDIR)$(SBINDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(sbindir)
+ $(INSTALL_PROG) oxenstored $(DESTDIR)$(sbindir)
$(INSTALL_DIR) $(DESTDIR)$(XEN_CONFIG_DIR)
$(INSTALL_DATA) oxenstored.conf $(DESTDIR)$(XEN_CONFIG_DIR)
diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml
index b4dc9cb..0a2c481 100644
--- a/tools/ocaml/xenstored/connection.ml
+++ b/tools/ocaml/xenstored/connection.ml
@@ -186,6 +186,13 @@ let del_watch con path token =
con.nb_watches <- con.nb_watches - 1;
apath, w
+let del_watches con =
+ Hashtbl.clear con.watches;
+ con.nb_watches <- 0
+
+let del_transactions con =
+ Hashtbl.clear con.transactions
+
let list_watches con =
let ll = Hashtbl.fold
(fun _ watches acc -> List.map (fun watch -> watch.path, watch.token) watches :: acc)
diff --git a/tools/ocaml/xenstored/logging.ml b/tools/ocaml/xenstored/logging.ml
index 665b922..4c90032 100644
--- a/tools/ocaml/xenstored/logging.ml
+++ b/tools/ocaml/xenstored/logging.ml
@@ -242,6 +242,7 @@ let string_of_access_type = function
| Xenbus.Xb.Op.Rm -> "rm "
| Xenbus.Xb.Op.Setperms -> "setperms "
| Xenbus.Xb.Op.Restrict -> "restrict "
+ | Xenbus.Xb.Op.Reset_watches -> "reset watches"
| Xenbus.Xb.Op.Set_target -> "settarget"
| Xenbus.Xb.Op.Error -> "error "
diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml
index 0620585..e827678 100644
--- a/tools/ocaml/xenstored/process.ml
+++ b/tools/ocaml/xenstored/process.ml
@@ -272,6 +272,11 @@ let do_restrict con t domains cons data =
in
Connection.restrict con domid
+(* only in xen >= 4.2 *)
+let do_reset_watches con t domains cons data =
+ Connection.del_watches con;
+ Connection.del_transactions con
+
(* only in xen >= 3.3 *)
(* we ensure backward compatibility with restrict by counting the number of arguments of set_target ... *)
(* This is not very elegant, but it is safe as 'restrict' only restricts permissions of dom0 connections *)
@@ -324,6 +329,7 @@ let function_of_type ty =
| Xenbus.Xb.Op.Resume -> reply_ack do_resume
| Xenbus.Xb.Op.Set_target -> reply_ack do_set_target
| Xenbus.Xb.Op.Restrict -> reply_ack do_restrict
+ | Xenbus.Xb.Op.Reset_watches -> reply_ack do_reset_watches
| Xenbus.Xb.Op.Invalid -> reply_ack do_error
| _ -> reply_ack do_error
diff --git a/tools/ocaml/xenstored/systemd.ml b/tools/ocaml/xenstored/systemd.ml
index 2aa39ea..732446d 100644
--- a/tools/ocaml/xenstored/systemd.ml
+++ b/tools/ocaml/xenstored/systemd.ml
@@ -13,5 +13,5 @@
*)
external sd_listen_fds: string -> Unix.file_descr = "ocaml_sd_listen_fds"
-external sd_booted: unit -> bool = "ocaml_sd_booted"
+external launched_by_systemd: unit -> bool = "ocaml_launched_by_systemd"
external sd_notify_ready: unit -> unit = "ocaml_sd_notify_ready"
diff --git a/tools/ocaml/xenstored/systemd.mli b/tools/ocaml/xenstored/systemd.mli
index 85c9f2e..538fc5e 100644
--- a/tools/ocaml/xenstored/systemd.mli
+++ b/tools/ocaml/xenstored/systemd.mli
@@ -17,8 +17,8 @@
* us do sanity checks on the expected sockets *)
val sd_listen_fds: string -> Unix.file_descr
-(** Tells us whether or not systemd support was compiled in *)
-val sd_booted: unit -> bool
+(** Tells us whether the process was launched by systemd *)
+val launched_by_systemd: unit -> bool
(** Tells systemd we're ready *)
external sd_notify_ready: unit -> unit = "ocaml_sd_notify_ready"
diff --git a/tools/ocaml/xenstored/systemd_stubs.c b/tools/ocaml/xenstored/systemd_stubs.c
index 623592c..1bd5dea 100644
--- a/tools/ocaml/xenstored/systemd_stubs.c
+++ b/tools/ocaml/xenstored/systemd_stubs.c
@@ -22,7 +22,6 @@
#include <caml/custom.h>
#include <caml/signals.h>
#include <caml/fail.h>
-#include <config.h>
#if defined(HAVE_SYSTEMD)
@@ -93,14 +92,14 @@ CAMLprim value ocaml_sd_listen_fds(value connect_to)
CAMLreturn(sock_ret);
}
-CAMLprim value ocaml_sd_booted(value ignore)
+CAMLprim value ocaml_launched_by_systemd(value ignore)
{
CAMLparam1(ignore);
CAMLlocal1(ret);
ret = Val_false;
- if (sd_booted())
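+ /* sd_listen_fds() returns the number of fds handed to us by systemd;
+ * a positive count means we really were launched (socket-activated) by
+ * it, whereas sd_booted() only reports that the system runs systemd. */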
+ if (sd_listen_fds(0) > 0)
ret = Val_true;
CAMLreturn(ret);
@@ -130,7 +129,7 @@ CAMLprim value ocaml_sd_listen_fds(value connect_to)
CAMLreturn(sock_ret);
}
-CAMLprim value ocaml_sd_booted(value ignore)
+CAMLprim value ocaml_launched_by_systemd(value ignore)
{
CAMLparam1(ignore);
CAMLlocal1(ret);
diff --git a/tools/ocaml/xenstored/utils.ml b/tools/ocaml/xenstored/utils.ml
index 61321c6..9f82c1c 100644
--- a/tools/ocaml/xenstored/utils.ml
+++ b/tools/ocaml/xenstored/utils.ml
@@ -84,7 +84,7 @@ let create_regular_unix_socket name =
sock
let create_unix_socket name =
- if Systemd.sd_booted() then
+ if Systemd.launched_by_systemd() then
Systemd.sd_listen_fds name
else
create_regular_unix_socket name
diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml
index bfe689b..42b8183 100644
--- a/tools/ocaml/xenstored/xenstored.ml
+++ b/tools/ocaml/xenstored/xenstored.ml
@@ -428,11 +428,11 @@ let _ =
process_domains store cons domains
in
+ if Systemd.launched_by_systemd () then
+ Systemd.sd_notify_ready ();
while not !quit
do
try
- if Systemd.sd_booted() then
- Systemd.sd_notify_ready ();
main_loop ()
with exc ->
error "caught exception %s" (Printexc.to_string exc);
diff --git a/tools/pygrub/Makefile b/tools/pygrub/Makefile
index 3dff608..fe8e03b 100644
--- a/tools/pygrub/Makefile
+++ b/tools/pygrub/Makefile
@@ -2,25 +2,30 @@
XEN_ROOT = $(CURDIR)/../..
include $(XEN_ROOT)/tools/Rules.mk
+PY_CFLAGS = $(CFLAGS) $(PY_NOOPT_CFLAGS) $(APPEND_LDFLAGS)
+
.PHONY: all
all: build
.PHONY: build
build:
- CC="$(CC)" CFLAGS="$(CFLAGS) $(APPEND_LDFLAGS)" $(PYTHON) setup.py build
+ CC="$(CC)" CFLAGS="$(PY_CFLAGS)" $(PYTHON) setup.py build
.PHONY: install
install: all
- CC="$(CC)" CFLAGS="$(CFLAGS) $(APPEND_LDFLAGS)" $(PYTHON) setup.py install \
+ CC="$(CC)" CFLAGS="$(PY_CFLAGS)" $(PYTHON) setup.py install \
$(PYTHON_PREFIX_ARG) --root="$(DESTDIR)" \
--install-scripts=$(LIBEXEC_BIN) --force
- set -e; if [ $(BINDIR) != $(LIBEXEC_BIN) -a \
- "`readlink -f $(DESTDIR)/$(BINDIR)`" != \
+ set -e; if [ $(bindir) != $(LIBEXEC_BIN) -a \
+ "`readlink -f $(DESTDIR)/$(bindir)`" != \
"`readlink -f $(LIBEXEC_BIN)`" ]; then \
- ln -sf $(LIBEXEC_BIN)/pygrub $(DESTDIR)/$(BINDIR); \
+ ln -sf $(LIBEXEC_BIN)/pygrub $(DESTDIR)/$(bindir); \
fi
.PHONY: clean
clean:
rm -rf build tmp *.pyc *.pyo *.o *.a *~ a.out $(DEPS)
+.PHONY: distclean
+distclean: clean
+
-include $(DEPS)
diff --git a/tools/pygrub/examples/ubuntu-14.04-lts.grub2 b/tools/pygrub/examples/ubuntu-14.04-lts.grub2
new file mode 100644
index 0000000..a14c8df
--- /dev/null
+++ b/tools/pygrub/examples/ubuntu-14.04-lts.grub2
@@ -0,0 +1,234 @@
+#
+# DO NOT EDIT THIS FILE
+#
+# It is automatically generated by grub-mkconfig using templates
+# from /etc/grub.d and settings from /etc/default/grub
+#
+
+### BEGIN /etc/grub.d/00_header ###
+if [ -s $prefix/grubenv ]; then
+ set have_grubenv=true
+ load_env
+fi
+if [ "${next_entry}" ] ; then
+ set default="${next_entry}"
+ set next_entry=
+ save_env next_entry
+ set boot_once=true
+else
+ set default="0"
+fi
+
+if [ x"${feature_menuentry_id}" = xy ]; then
+ menuentry_id_option="--id"
+else
+ menuentry_id_option=""
+fi
+
+export menuentry_id_option
+
+if [ "${prev_saved_entry}" ]; then
+ set saved_entry="${prev_saved_entry}"
+ save_env saved_entry
+ set prev_saved_entry=
+ save_env prev_saved_entry
+ set boot_once=true
+fi
+
+function savedefault {
+ if [ -z "${boot_once}" ]; then
+ saved_entry="${chosen}"
+ save_env saved_entry
+ fi
+}
+function recordfail {
+ set recordfail=1
+ if [ -n "${have_grubenv}" ]; then if [ -z "${boot_once}" ]; then save_env recordfail; fi; fi
+}
+function load_video {
+ if [ x$feature_all_video_module = xy ]; then
+ insmod all_video
+ else
+ insmod efi_gop
+ insmod efi_uga
+ insmod ieee1275_fb
+ insmod vbe
+ insmod vga
+ insmod video_bochs
+ insmod video_cirrus
+ fi
+}
+
+if [ x$feature_default_font_path = xy ] ; then
+ font=unicode
+else
+insmod part_msdos
+insmod lvm
+insmod ext2
+set root='lvmid/VFRfK8-JAgW-a2Rt-svO9-f06E-Frur-fzowWw/2zIHcW-s2DX-h7hm-V32p-6nz5-bH1A-fNVEWg'
+if [ x$feature_platform_search_hint = xy ]; then
+ search --no-floppy --fs-uuid --set=root --hint='lvmid/VFRfK8-JAgW-a2Rt-svO9-f06E-Frur-fzowWw/2zIHcW-s2DX-h7hm-V32p-6nz5-bH1A-fNVEWg' c7a4b4ca-71da-4d03-b374-a6b76ebcfc90
+else
+ search --no-floppy --fs-uuid --set=root c7a4b4ca-71da-4d03-b374-a6b76ebcfc90
+fi
+ font="/usr/share/grub/unicode.pf2"
+fi
+
+if loadfont $font ; then
+ set gfxmode=auto
+ load_video
+ insmod gfxterm
+fi
+terminal_output gfxterm
+if [ "${recordfail}" = 1 ] ; then
+ set timeout=-1
+else
+ if [ x$feature_timeout_style = xy ] ; then
+ set timeout_style=menu
+ set timeout=5
+ # Fallback normal timeout code in case the timeout_style feature is
+ # unavailable.
+ else
+ set timeout=5
+ fi
+fi
+### END /etc/grub.d/00_header ###
+
+### BEGIN /etc/grub.d/05_debian_theme ###
+set menu_color_normal=white/black
+set menu_color_highlight=black/light-gray
+### END /etc/grub.d/05_debian_theme ###
+
+### BEGIN /etc/grub.d/10_linux ###
+function gfxmode {
+ set gfxpayload="${1}"
+ if [ "${1}" = "keep" ]; then
+ set vt_handoff=vt.handoff=7
+ else
+ set vt_handoff=
+ fi
+}
+if [ "${recordfail}" != 1 ]; then
+ if [ -e ${prefix}/gfxblacklist.txt ]; then
+ if hwmatch ${prefix}/gfxblacklist.txt 3; then
+ if [ ${match} = 0 ]; then
+ set linux_gfx_mode=keep
+ else
+ set linux_gfx_mode=text
+ fi
+ else
+ set linux_gfx_mode=text
+ fi
+ else
+ set linux_gfx_mode=keep
+ fi
+else
+ set linux_gfx_mode=text
+fi
+export linux_gfx_mode
+menuentry 'Ubuntu' --class ubuntu --class gnu-linux --class gnu --class os --unrestricted $menuentry_id_option 'gnulinux-simple-c7a4b4ca-71da-4d03-b374-a6b76ebcfc90' {
+ load_video
+ gfxmode $linux_gfx_mode
+ insmod gzio
+ insmod part_msdos
+ insmod ext2
+ if [ x$feature_platform_search_hint = xy ]; then
+ search --no-floppy --fs-uuid --set=root 86ba9198-4319-4809-908d-6dbe6938b19a
+ else
+ search --no-floppy --fs-uuid --set=root 86ba9198-4319-4809-908d-6dbe6938b19a
+ fi
+ linux /vmlinuz-3.13.0-44-generic root=/dev/mapper/BoxenSys00-root ro biosdevname=0 quiet
+ initrd /initrd.img-3.13.0-44-generic
+}
+submenu 'Advanced options for Ubuntu' --unrestricted $menuentry_id_option 'gnulinux-advanced-c7a4b4ca-71da-4d03-b374-a6b76ebcfc90' {
+ menuentry 'Ubuntu, with Linux 3.13.0-44-generic' --class ubuntu --class gnu-linux --class gnu --class os --unrestricted $menuentry_id_option 'gnulinux-3.13.0-44-generic-advanced-c7a4b4ca-71da-4d03-b374-a6b76ebcfc90' {
+ load_video
+ gfxmode $linux_gfx_mode
+ insmod gzio
+ insmod part_msdos
+ insmod ext2
+ if [ x$feature_platform_search_hint = xy ]; then
+ search --no-floppy --fs-uuid --set=root 86ba9198-4319-4809-908d-6dbe6938b19a
+ else
+ search --no-floppy --fs-uuid --set=root 86ba9198-4319-4809-908d-6dbe6938b19a
+ fi
+ echo 'Loading Linux 3.13.0-44-generic ...'
+ linux /vmlinuz-3.13.0-44-generic root=/dev/mapper/BoxenSys00-root ro biosdevname=0 quiet
+ echo 'Loading initial ramdisk ...'
+ initrd /initrd.img-3.13.0-44-generic
+ }
+ menuentry 'Ubuntu, with Linux 3.13.0-44-generic (recovery mode)' --class ubuntu --class gnu-linux --class gnu --class os --unrestricted $menuentry_id_option 'gnulinux-3.13.0-44-generic-recovery-c7a4b4ca-71da-4d03-b374-a6b76ebcfc90' {
+ load_video
+ insmod gzio
+ insmod part_msdos
+ insmod ext2
+ if [ x$feature_platform_search_hint = xy ]; then
+ search --no-floppy --fs-uuid --set=root 86ba9198-4319-4809-908d-6dbe6938b19a
+ else
+ search --no-floppy --fs-uuid --set=root 86ba9198-4319-4809-908d-6dbe6938b19a
+ fi
+ echo 'Loading Linux 3.13.0-44-generic ...'
+ linux /vmlinuz-3.13.0-44-generic root=/dev/mapper/BoxenSys00-root ro recovery nomodeset biosdevname=0
+ echo 'Loading initial ramdisk ...'
+ initrd /initrd.img-3.13.0-44-generic
+ }
+}
+
+### END /etc/grub.d/10_linux ###
+
+### BEGIN /etc/grub.d/20_linux_xen ###
+
+### END /etc/grub.d/20_linux_xen ###
+
+### BEGIN /etc/grub.d/20_memtest86+ ###
+menuentry 'Memory test (memtest86+)' {
+ insmod part_msdos
+ insmod ext2
+ if [ x$feature_platform_search_hint = xy ]; then
+ search --no-floppy --fs-uuid --set=root 86ba9198-4319-4809-908d-6dbe6938b19a
+ else
+ search --no-floppy --fs-uuid --set=root 86ba9198-4319-4809-908d-6dbe6938b19a
+ fi
+ knetbsd /memtest86+.elf
+}
+menuentry 'Memory test (memtest86+, serial console 115200)' {
+ insmod part_msdos
+ insmod ext2
+ if [ x$feature_platform_search_hint = xy ]; then
+ search --no-floppy --fs-uuid --set=root 86ba9198-4319-4809-908d-6dbe6938b19a
+ else
+ search --no-floppy --fs-uuid --set=root 86ba9198-4319-4809-908d-6dbe6938b19a
+ fi
+ linux16 /memtest86+.bin console=ttyS0,115200n8
+}
+### END /etc/grub.d/20_memtest86+ ###
+
+### BEGIN /etc/grub.d/30_os-prober ###
+### END /etc/grub.d/30_os-prober ###
+
+### BEGIN /etc/grub.d/30_uefi-firmware ###
+### END /etc/grub.d/30_uefi-firmware ###
+
+### BEGIN /etc/grub.d/40_custom ###
+# This file provides an easy way to add custom menu entries. Simply type the
+# menu entries you want to add after this comment. Be careful not to change
+# the 'exec tail' line above.
+set superusers="root"
+password_pbkdf2 root grub.pbkdf2.sha512.
+### END /etc/grub.d/40_custom ###
+
+### BEGIN /etc/grub.d/40_custom.bakPW ###
+# This file provides an easy way to add custom menu entries. Simply type the
+# menu entries you want to add after this comment. Be careful not to change
+# the 'exec tail' line above.
+set superusers="root"
+password_pbkdf2 root dummy
+### END /etc/grub.d/40_custom.bakPW ###
+
+### BEGIN /etc/grub.d/41_custom ###
+if [ -f ${config_directory}/custom.cfg ]; then
+ source ${config_directory}/custom.cfg
+elif [ -z "${config_directory}" -a -f $prefix/custom.cfg ]; then
+ source $prefix/custom.cfg;
+fi
+### END /etc/grub.d/41_custom ###
diff --git a/tools/pygrub/src/ExtLinuxConf.py b/tools/pygrub/src/ExtLinuxConf.py
index 510099b..d1789bf 100644
--- a/tools/pygrub/src/ExtLinuxConf.py
+++ b/tools/pygrub/src/ExtLinuxConf.py
@@ -7,8 +7,7 @@
# general public license.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
import sys, re, os
diff --git a/tools/pygrub/src/GrubConf.py b/tools/pygrub/src/GrubConf.py
index dea7044..dc810d5 100644
--- a/tools/pygrub/src/GrubConf.py
+++ b/tools/pygrub/src/GrubConf.py
@@ -9,8 +9,7 @@
# general public license.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
import os, sys
diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub
index 3ec52fd..e4aedda 100755
--- a/tools/pygrub/src/pygrub
+++ b/tools/pygrub/src/pygrub
@@ -9,8 +9,7 @@
# general public license.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
import os, sys, string, struct, tempfile, re, traceback, stat, errno
diff --git a/tools/python/Makefile b/tools/python/Makefile
index 533d3de..0395e50 100644
--- a/tools/python/Makefile
+++ b/tools/python/Makefile
@@ -4,6 +4,8 @@ include $(XEN_ROOT)/tools/Rules.mk
.PHONY: all
all: build
+PY_CFLAGS = $(CFLAGS) $(PY_NOOPT_CFLAGS) $(LDFLAGS) $(APPEND_LDFLAGS)
+
.PHONY: build
build: genwrap.py $(XEN_ROOT)/tools/libxl/libxl_types.idl \
$(XEN_ROOT)/tools/libxl/idl.py
@@ -11,13 +13,18 @@ build: genwrap.py $(XEN_ROOT)/tools/libxl/libxl_types.idl \
$(XEN_ROOT)/tools/libxl/libxl_types.idl \
xen/lowlevel/xl/_pyxl_types.h \
xen/lowlevel/xl/_pyxl_types.c
- CC="$(CC)" CFLAGS="$(CFLAGS) $(LDFLAGS) $(APPEND_LDFLAGS)" $(PYTHON) setup.py build
+ CC="$(CC)" CFLAGS="$(PY_CFLAGS)" $(PYTHON) setup.py build
.PHONY: install
install:
- CC="$(CC)" CFLAGS="$(CFLAGS) $(LDFLAGS) $(APPEND_LDFLAGS)" $(PYTHON) setup.py install \
+ $(INSTALL_DIR) $(DESTDIR)$(LIBEXEC_BIN)
+
+ CC="$(CC)" CFLAGS="$(PY_CFLAGS)" $(PYTHON) setup.py install \
$(PYTHON_PREFIX_ARG) --root="$(DESTDIR)" --force
+ $(INSTALL_PROG) scripts/convert-legacy-stream $(DESTDIR)$(LIBEXEC_BIN)
+ $(INSTALL_PROG) scripts/verify-stream-v2 $(DESTDIR)$(LIBEXEC_BIN)
+
.PHONY: test
test:
export LD_LIBRARY_PATH=$$(readlink -f ../libxc):$$(readlink -f ../xenstore); $(PYTHON) test.py -b -u
@@ -28,4 +35,7 @@ clean:
rm -rf build/
rm -f $(DEPS)
+.PHONY: distclean
+distclean: clean
+
-include $(DEPS)
diff --git a/tools/python/scripts/convert-legacy-stream b/tools/python/scripts/convert-legacy-stream
new file mode 100755
index 0000000..41fee10
--- /dev/null
+++ b/tools/python/scripts/convert-legacy-stream
@@ -0,0 +1,730 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Convert a legacy migration stream to a v2 stream.
+"""
+
+import sys
+import os, os.path
+import syslog
+import traceback
+
+from struct import calcsize, unpack, pack
+
+from xen.migration import legacy, public, libxc, libxl, xl
+
+__version__ = 1
+
+fin = None # Input file/fd
+fout = None # Output file/fd
+twidth = 0 # Legacy toolstack bitness (32 or 64)
+pv = None # Boolean (pv or hvm)
+qemu = True # Boolean - process qemu record?
+log_to_syslog = False # Boolean - Log to syslog instead of stdout/err?
+verbose = False # Boolean - Summarise stream contents
+
+def stream_read(_ = None):
+ """Read from the input"""
+ return fin.read(_)
+
+def stream_write(_):
+ """Write to the output"""
+ return fout.write(_)
+
+def info(msg):
+ """Info message, routed to appropriate destination"""
+ if verbose:
+ if log_to_syslog:
+ for line in msg.split("\n"):
+ syslog.syslog(syslog.LOG_INFO, line)
+ else:
+ print msg
+
+def err(msg):
+ """Error message, routed to appropriate destination"""
+ if log_to_syslog:
+ for line in msg.split("\n"):
+ syslog.syslog(syslog.LOG_ERR, line)
+ print >> sys.stderr, msg
+
+class StreamError(StandardError):
+ """Error with the incoming migration stream"""
+ pass
+
+class VM(object):
+ """Container of VM parameters"""
+
+ def __init__(self, fmt):
+ # Common
+ self.p2m_size = 0
+
+ # PV
+ self.max_vcpu_id = 0
+ self.online_vcpu_map = []
+ self.width = 0
+ self.levels = 0
+ self.basic_len = 0
+ self.extd = False
+ self.xsave_len = 0
+
+ # libxl
+ self.libxl = fmt == "libxl"
+ self.emu_xenstore = "" # NUL terminated key&val pairs from "toolstack" records
+
+def write_libxc_ihdr():
+ stream_write(pack(libxc.IHDR_FORMAT,
+ libxc.IHDR_MARKER, # Marker
+ libxc.IHDR_IDENT, # Ident
+ libxc.IHDR_VERSION, # Version
+ libxc.IHDR_OPT_LE, # Options
+ 0, 0)) # Reserved
+
+def write_libxc_dhdr():
+ if pv:
+ dtype = libxc.DHDR_TYPE_x86_pv
+ else:
+ dtype = libxc.DHDR_TYPE_x86_hvm
+
+ stream_write(pack(libxc.DHDR_FORMAT,
+ dtype, # Type
+ 12, # Page shift (4096-byte pages)
+ 0, # Reserved
+ 0, # Xen major (converted)
+ __version__)) # Xen minor (converted)
+
+def write_libxl_hdr():
+ stream_write(pack(libxl.HDR_FORMAT,
+ libxl.HDR_IDENT, # Ident
+ libxl.HDR_VERSION, # Version 2
+ libxl.HDR_OPT_LE | # Options
+ libxl.HDR_OPT_LEGACY # Little Endian and Legacy
+ ))
+
+def write_record(rt, *argl):
+ alldata = ''.join(argl)
+ length = len(alldata)
+
+ record = pack(libxc.RH_FORMAT, rt, length) + alldata
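+ # Pad the record with NUL bytes up to the next 8-byte boundary
+ # (plen is always in the range 0..7).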
+ plen = (8 - (length & 7)) & 7
+ record += '\x00' * plen
+
+ stream_write(record)
+
+def write_libxc_pv_info(vm):
+ write_record(libxc.REC_TYPE_x86_pv_info,
+ pack(libxc.X86_PV_INFO_FORMAT,
+ vm.width, vm.levels, 0, 0))
+
+def write_libxc_pv_p2m_frames(vm, pfns):
+ write_record(libxc.REC_TYPE_x86_pv_p2m_frames,
+ pack(libxc.X86_PV_P2M_FRAMES_FORMAT,
+ 0, vm.p2m_size - 1),
+ pack("Q" * len(pfns), *pfns))
+
+def write_libxc_pv_vcpu_basic(vcpu_id, data):
+ write_record(libxc.REC_TYPE_x86_pv_vcpu_basic,
+ pack(libxc.X86_PV_VCPU_HDR_FORMAT, vcpu_id, 0), data)
+
+def write_libxc_pv_vcpu_extd(vcpu_id, data):
+ write_record(libxc.REC_TYPE_x86_pv_vcpu_extended,
+ pack(libxc.X86_PV_VCPU_HDR_FORMAT, vcpu_id, 0), data)
+
+def write_libxc_pv_vcpu_xsave(vcpu_id, data):
+ write_record(libxc.REC_TYPE_x86_pv_vcpu_xsave,
+ pack(libxc.X86_PV_VCPU_HDR_FORMAT, vcpu_id, 0), data)
+
+def write_page_data(pfns, pages):
+ if fout is None: # Save copying 1M buffers around for no reason
+ return
+
+ new_pfns = [(((x & 0xf0000000) << 32) | (x & 0x0fffffff)) for x in pfns]
+
+ # Optimise the needless buffer copying in write_record()
+ stream_write(pack(libxc.RH_FORMAT,
+ libxc.REC_TYPE_page_data,
+ 8 + (len(new_pfns) * 8) + len(pages)))
+ stream_write(pack(libxc.PAGE_DATA_FORMAT, len(new_pfns), 0))
+ stream_write(pack("Q" * len(new_pfns), *new_pfns))
+ stream_write(pages)
+
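The list comprehension in write_page_data() converts the legacy pfn encoding (XEN_DOMCTL_PFINFO type in bits 28-31, frame number in bits 0-27) into the v2 encoding, which keeps the type nibble in bits 60-63. An illustrative sketch with a made-up value:

    legacy = 0x80001234                # type nibble 0x8, pfn 0x1234
    ptype = legacy & 0xf0000000        # type, bits 28-31
    pfn = legacy & 0x0fffffff          # frame number, bits 0-27
    v2 = (ptype << 32) | pfn           # type now in bits 60-63
    assert v2 == (((legacy & 0xf0000000) << 32) | (legacy & 0x0fffffff))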
+def write_libxc_tsc_info(mode, khz, nsec, incarn):
+ write_record(libxc.REC_TYPE_tsc_info,
+ pack(libxc.TSC_INFO_FORMAT,
+ mode, khz, nsec, incarn, 0))
+
+def write_libxc_hvm_params(params):
+ if pv:
+ raise StreamError("HVM-only param in PV stream")
+ elif len(params) % 2:
+ raise RuntimeError("Expected even length list of hvm parameters")
+
+ write_record(libxc.REC_TYPE_hvm_params,
+ pack(libxc.HVM_PARAMS_FORMAT, len(params) / 2, 0),
+ pack("Q" * len(params), *params))
+
+def write_libxl_end():
+ write_record(libxl.REC_TYPE_end, "")
+
+def write_libxl_libxc_context():
+ write_record(libxl.REC_TYPE_libxc_context, "")
+
+def write_libxl_emulator_xenstore_data(data):
+ write_record(libxl.REC_TYPE_emulator_xenstore_data,
+ pack(libxl.EMULATOR_HEADER_FORMAT,
+ libxl.EMULATOR_ID_unknown, 0) + data)
+
+def write_libxl_emulator_context(blob):
+ write_record(libxl.REC_TYPE_emulator_context,
+ pack(libxl.EMULATOR_HEADER_FORMAT,
+ libxl.EMULATOR_ID_unknown, 0) + blob)
+
+def rdexact(nr_bytes):
+ """Read exactly nr_bytes from fin"""
+ _ = stream_read(nr_bytes)
+ if len(_) != nr_bytes:
+ raise IOError("Stream truncated")
+ return _
+
+def unpack_exact(fmt):
+ """Unpack a format from fin"""
+ sz = calcsize(fmt)
+ return unpack(fmt, rdexact(sz))
+
+def unpack_ulongs(nr_ulongs):
+ if twidth == 32:
+ return unpack_exact("I" * nr_ulongs)
+ else:
+ return unpack_exact("Q" * nr_ulongs)
+
+def read_pv_extended_info(vm):
+
+ marker, = unpack_ulongs(1)
+
+ if twidth == 32:
+ expected = 0xffffffff
+ else:
+ expected = 0xffffffffffffffff
+
+ if marker != expected:
+ raise StreamError("Unexpected extended info marker 0x%x" % (marker, ))
+
+ total_length, = unpack_exact("I")
+ so_far = 0
+
+ info("Extended Info: length 0x%x" % (total_length, ))
+
+ while so_far < total_length:
+
+ blkid, datasz = unpack_exact("=4sI")
+ so_far += 8
+
+ info(" Record type: %s, size 0x%x" % (blkid, datasz))
+
+ data = rdexact(datasz)
+ so_far += datasz
+
+ # Eww, but this is how it is done :(
+ if blkid == "vcpu":
+
+ vm.basic_len = datasz
+
+ if datasz == 0x1430:
+ vm.width = 8
+ vm.levels = 4
+ info(" 64bit domain, 4 levels")
+ elif datasz == 0xaf0:
+ vm.width = 4
+ vm.levels = 3
+ info(" 32bit domain, 3 levels")
+ else:
+ raise StreamError("Unable to determine guest width/level")
+
+ write_libxc_pv_info(vm)
+
+ elif blkid == "extv":
+ vm.extd = True
+
+ elif blkid == "xcnt":
+ vm.xsave_len, = unpack("I", data[:4])
+ info("xcnt sz 0x%x" % (vm.xsave_len, ))
+
+ else:
+ raise StreamError("Unrecognised extended block")
+
+
+ if so_far != total_length:
+ raise StreamError("Overshot Extended Info size by %d bytes"
+ % (so_far - total_length,))
+
+def read_pv_p2m_frames(vm):
+ fpp = 4096 / vm.width
+ p2m_frame_len = (vm.p2m_size - 1) / fpp + 1
+
+ info("P2M frames: fpp %d, p2m_frame_len %d" % (fpp, p2m_frame_len))
+ write_libxc_pv_p2m_frames(vm, unpack_ulongs(p2m_frame_len))
+
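Worked example of the frame-list arithmetic above, with hypothetical values: a 64-bit guest (width 8) fits 4096 / 8 = 512 p2m entries per page, so a p2m with 0x20000 entries needs ceil(0x20000 / 512) = 256 frames. Using floor division to mirror the Python 2 `/` above:

    width, p2m_size = 8, 0x20000
    fpp = 4096 // width                    # p2m entries per 4k page
    p2m_frame_len = (p2m_size - 1) // fpp + 1
    assert (fpp, p2m_frame_len) == (512, 256)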
+def read_pv_tail(vm):
+
+ nr_unmapped_pfns, = unpack_exact("I")
+
+ if nr_unmapped_pfns != 0:
+ # "Unmapped" pfns are bogus
+ _ = unpack_ulongs(nr_unmapped_pfns)
+ info("discarding %d bogus 'unmapped pfns'" % (nr_unmapped_pfns, ))
+
+ for vcpu_id in vm.online_vcpu_map:
+
+ basic = rdexact(vm.basic_len)
+ info("Got VCPU basic (size 0x%x)" % (vm.basic_len, ))
+ write_libxc_pv_vcpu_basic(vcpu_id, basic)
+
+ if vm.extd:
+ extd = rdexact(128)
+ info("Got VCPU extd (size 0x%x)" % (128, ))
+ write_libxc_pv_vcpu_extd(vcpu_id, extd)
+
+ if vm.xsave_len:
+ mask, size = unpack_exact("QQ")
+ assert vm.xsave_len - 16 == size
+
+ xsave = rdexact(size)
+ info("Got VCPU xsave (mask 0x%x, size 0x%x)" % (mask, size))
+ write_libxc_pv_vcpu_xsave(vcpu_id, xsave)
+
+ shinfo = rdexact(4096)
+ info("Got shinfo")
+
+ write_record(libxc.REC_TYPE_shared_info, shinfo)
+ write_record(libxc.REC_TYPE_end, "")
+
+
+def read_libxl_toolstack(vm, data):
+
+ if len(data) < 8:
+ raise StreamError("Overly short libxl toolstack data")
+
+ ver, count = unpack("=II", data[:8])
+ data = data[8:]
+
+ if ver != 1:
+ raise StreamError("Cannot decode libxl toolstack version %u" % (ver, ))
+ info(" Version %u, count %u" % (ver, count))
+
+ for x in range(count):
+
+ if len(data) < 28:
+ raise StreamError("Remaining data too short for physmap header")
+
+ phys, start, size, namelen = unpack("=QQQI", data[:28])
+ data = data[28:]
+
+ if namelen == 0:
+ raise StreamError("No physmap info name")
+
+ # 64bit leaked 4 bytes of padding onto the end of name
+ if twidth == 64:
+ namelen += 4
+
+ if len(data) < namelen:
+ raise StreamError("Remaining data too short for physmap name")
+
+ name = data[:namelen]
+ data = data[namelen:]
+
+ # Strip padding off the end of name
+ if twidth == 64:
+ name = name[:-4]
+
+ if name[-1] != '\x00':
+ raise StreamError("physmap name not NUL terminated")
+
+ root = "physmap/%x" % (phys,)
+ kv = [root + "/start_addr", "%x" % (start, ),
+ root + "/size", "%x" % (size, ),
+ root + "/name", name[:-1]]
+
+ for key, val in zip(kv[0::2], kv[1::2]):
+ info(" '%s' = '%s'" % (key, val))
+
+ vm.emu_xenstore += '\x00'.join(kv) + '\x00'
+
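Each physmap entry parsed above becomes three key/value pairs under a physmap/<phys> prefix, serialised as a run of NUL-terminated strings. A sketch of the encoding with made-up values:

    phys, start, size, name = 0xf0000, 0xf0000000, 0x10000, "vram"
    root = "physmap/%x" % (phys, )
    kv = [root + "/start_addr", "%x" % (start, ),
          root + "/size", "%x" % (size, ),
          root + "/name", name]
    blob = '\x00'.join(kv) + '\x00'    # alternating key, value, key, ...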
+
+def read_chunks(vm):
+
+ hvm_params = []
+
+ while True:
+
+ marker, = unpack_exact("=i")
+ if marker <= 0:
+ info("Chunk: %d - %s" %
+ (marker, legacy.chunk_type_to_str.get(marker, "unknown")))
+
+ if marker == legacy.CHUNK_end:
+ info(" End")
+
+ if hvm_params:
+ write_libxc_hvm_params(hvm_params)
+
+ return
+
+ elif marker > 0:
+
+ if marker > legacy.MAX_BATCH:
+ raise StreamError("Page batch (%d) exceeded MAX_BATCH (%d)"
+ % (marker, legacy.MAX_BATCH))
+ pfns = unpack_ulongs(marker)
+
+ # xc_domain_save() leaves many XEN_DOMCTL_PFINFO_XTAB records for
+ # sequences of pfns it can't map. Drop these.
+ pfns = [ x for x in pfns if x != 0xf0000000 ]
+
+ if len(set(pfns)) != len(pfns):
+ raise StreamError("Duplicate pfns in batch")
+
+ nr_pages = len([x for x in pfns if (x & 0xf0000000) < 0xd0000000])
+ pages = rdexact(nr_pages * 4096)
+
+ write_page_data(pfns, pages)
+
+ elif marker == legacy.CHUNK_enable_verify_mode:
+ # For debugging purposes only. Will not be seen in real migration
+ raise RuntimeError("Unable to convert a debug stream")
+
+ elif marker == legacy.CHUNK_vcpu_info:
+ max_id, = unpack_exact("i")
+
+ if max_id > legacy.MAX_VCPU_ID:
+ raise StreamError("Vcpu max_id out of range: %d > %d"
+ % (max_id, legacy.MAX_VCPU_ID))
+
+ vm.max_vcpu_id = max_id
+ bitmap = unpack_exact("Q" * ((max_id/64) + 1))
+
+ for idx, word in enumerate(bitmap):
+ bit_idx = 0
+
+ while word > 0:
+ if word & 1:
+ vm.online_vcpu_map.append((idx * 64) + bit_idx)
+
+ bit_idx += 1
+ word >>= 1
+
+ info(" Vcpu info: max_id %d, online map %s"
+ % (vm.max_vcpu_id, vm.online_vcpu_map))
+
+ elif marker == legacy.CHUNK_hvm_ident_pt:
+ _, ident_pt = unpack_exact("=IQ")
+ info(" EPT Identity Pagetable: 0x%x" % (ident_pt, ))
+ hvm_params.extend([public.HVM_PARAM_IDENT_PT, ident_pt])
+
+ elif marker == legacy.CHUNK_hvm_vm86_tss:
+ _, vm86_tss = unpack_exact("=IQ")
+ info(" VM86 TSS: 0x%x" % (vm86_tss, ))
+ hvm_params.extend([public.HVM_PARAM_VM86_TSS, vm86_tss])
+
+ elif marker == legacy.CHUNK_tmem:
+ raise RuntimeError("todo")
+
+ elif marker == legacy.CHUNK_tmem_extra:
+ raise RuntimeError("todo")
+
+ elif marker == legacy.CHUNK_tsc_info:
+ mode, nsec, khz, incarn = unpack_exact("=IQII")
+ info(" TSC_INFO: mode %s, %d ns, %d khz, %d incarn"
+ % (mode, nsec, khz, incarn))
+ write_libxc_tsc_info(mode, khz, nsec, incarn)
+
+ elif marker == legacy.CHUNK_hvm_console_pfn:
+ _, console_pfn = unpack_exact("=IQ")
+ info(" Console pfn: 0x%x" % (console_pfn, ))
+ hvm_params.extend([public.HVM_PARAM_CONSOLE_PFN, console_pfn])
+
+ elif marker == legacy.CHUNK_last_checkpoint:
+ info(" Last Checkpoint")
+ # Nothing to do
+
+ elif marker == legacy.CHUNK_hvm_acpi_ioports_location:
+ _, loc = unpack_exact("=IQ")
+ info(" ACPI ioport location: 0x%x" % (loc, ))
+ hvm_params.extend([public.HVM_PARAM_ACPI_IOPORTS_LOCATION, loc])
+
+ elif marker == legacy.CHUNK_hvm_viridian:
+ _, loc = unpack_exact("=IQ")
+ info(" Viridian location: 0x%x" % (loc, ))
+ hvm_params.extend([public.HVM_PARAM_VIRIDIAN, loc])
+
+ elif marker == legacy.CHUNK_compressed_data:
+ sz, = unpack_exact("I")
+ data = rdexact(sz)
+ info(" Compressed Data: sz 0x%x" % (sz, ))
+ raise RuntimeError("todo")
+
+ elif marker == legacy.CHUNK_enable_compression:
+ raise RuntimeError("todo")
+
+ elif marker == legacy.CHUNK_hvm_generation_id_addr:
+ _, genid_loc = unpack_exact("=IQ")
+ info(" Generation ID Address: 0x%x" % (genid_loc, ))
+ hvm_params.extend(
+ [public.HVM_PARAM_VM_GENERATION_ID_ADDR, genid_loc])
+
+ elif marker == legacy.CHUNK_hvm_paging_ring_pfn:
+ _, pfn = unpack_exact("=IQ")
+ info(" Paging ring pfn: 0x%x" % (pfn, ))
+ hvm_params.extend([public.HVM_PARAM_PAGING_RING_PFN, pfn])
+
+ elif marker == legacy.CHUNK_hvm_monitor_ring_pfn:
+ _, pfn = unpack_exact("=IQ")
+ info(" Monitor ring pfn: 0x%x" % (pfn, ))
+ hvm_params.extend([public.HVM_PARAM_MONITOR_RING_PFN, pfn])
+
+ elif marker == legacy.CHUNK_hvm_sharing_ring_pfn:
+ _, pfn = unpack_exact("=IQ")
+ info(" Sharing ring pfn: 0x%x" % (pfn, ))
+ hvm_params.extend([public.HVM_PARAM_SHARING_RING_PFN, pfn])
+
+ elif marker == legacy.CHUNK_toolstack:
+ sz, = unpack_exact("I")
+
+ if sz:
+ data = rdexact(sz)
+ info(" Toolstack Data: sz 0x%x" % (sz, ))
+
+ if vm.libxl:
+ read_libxl_toolstack(vm, data)
+ else:
+ info(" Discarding")
+
+ elif marker == legacy.CHUNK_hvm_ioreq_server_pfn:
+ _, pfn = unpack_exact("=IQ")
+ info(" IOREQ server pfn: 0x%x" % (pfn, ))
+ hvm_params.extend([public.HVM_PARAM_IOREQ_SERVER_PFN, pfn])
+
+ elif marker == legacy.CHUNK_hvm_nr_ioreq_server_pages:
+ _, nr_pages = unpack_exact("=IQ")
+ info(" IOREQ server pages: %d" % (nr_pages, ))
+ hvm_params.extend(
+ [public.HVM_PARAM_NR_IOREQ_SERVER_PAGES, nr_pages])
+
+ else:
+ raise StreamError("Unrecognised chunk %d" % (marker,))
+
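The CHUNK_vcpu_info handler in read_chunks() above decodes a little-endian bitmap of online VCPUs, 64 bits per word. A standalone sketch of the same loop with a made-up bitmap:

    bitmap = [0b1011, 0x1]             # vcpus 0, 1, 3 and vcpu 64 online
    online = []
    for idx, word in enumerate(bitmap):
        bit_idx = 0
        while word > 0:
            if word & 1:
                online.append((idx * 64) + bit_idx)
            bit_idx += 1
            word >>= 1
    assert online == [0, 1, 3, 64]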
+def read_hvm_tail(vm):
+
+ io, bufio, store = unpack_exact("QQQ")
+ info("Magic pfns: 0x%x 0x%x 0x%x" % (io, bufio, store))
+ write_libxc_hvm_params([public.HVM_PARAM_IOREQ_PFN, io,
+ public.HVM_PARAM_BUFIOREQ_PFN, bufio,
+ public.HVM_PARAM_STORE_PFN, store])
+
+ blobsz, = unpack_exact("I")
+ info("Got HVM Context (0x%x bytes)" % (blobsz, ))
+ blob = rdexact(blobsz)
+
+ write_record(libxc.REC_TYPE_hvm_context, blob)
+ write_record(libxc.REC_TYPE_end, "")
+
+
+
+def read_qemu(vm):
+
+ rawsig = rdexact(21)
+ sig, = unpack("21s", rawsig)
+ info("Qemu signature: %s" % (sig, ))
+
+ if sig == "DeviceModelRecord0002":
+ rawsz = rdexact(4)
+ sz, = unpack("I", rawsz)
+ qdata = rdexact(sz)
+
+ if vm.libxl:
+ write_libxl_emulator_context(qdata)
+ else:
+ stream_write(rawsig)
+ stream_write(rawsz)
+ stream_write(qdata)
+
+ else:
+ raise RuntimeError("Unrecognised Qemu sig '%s'" % (sig, ))
+
+
+def skip_xl_header(fmt):
+ """Skip over an xl header in the stream"""
+
+ hdr = rdexact(len(xl.MAGIC))
+ if hdr != xl.MAGIC:
+ raise StreamError("No xl header")
+
+ byteorder, mflags, oflags, optlen = unpack_exact(xl.HEADER_FORMAT)
+
+ if fmt == "libxl":
+ mflags |= xl.MANDATORY_FLAG_STREAMV2
+
+ opts = pack(xl.HEADER_FORMAT, byteorder, mflags, oflags, optlen)
+
+ optdata = rdexact(optlen)
+
+ info("Processed xl header")
+
+ stream_write(hdr)
+ stream_write(opts)
+ stream_write(optdata)
+
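skip_xl_header() above passes the xl header through unchanged except that, for libxl-format output, it sets the mandatory STREAMv2 flag so the restoring xl expects a v2 body. A sketch of the rewrite, assuming the "=IIII" header layout and flag value 2 that verify-stream-v2 checks for:

    from struct import pack, unpack
    HEADER_FORMAT = "=IIII"            # byteorder, mandatory, optional, optlen
    raw = pack(HEADER_FORMAT, 1, 0, 0, 0)
    byteorder, mflags, oflags, optlen = unpack(HEADER_FORMAT, raw)
    mflags |= 2                        # XL_MANDATORY_FLAG_STREAMv2
    rewritten = pack(HEADER_FORMAT, byteorder, mflags, oflags, optlen)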
+def read_legacy_stream(vm):
+
+ try:
+ vm.p2m_size, = unpack_ulongs(1)
+ info("P2M Size: 0x%x" % (vm.p2m_size,))
+
+ if vm.libxl:
+ write_libxl_hdr()
+ write_libxl_libxc_context()
+
+ write_libxc_ihdr()
+ write_libxc_dhdr()
+
+ if pv:
+ read_pv_extended_info(vm)
+ read_pv_p2m_frames(vm)
+
+ read_chunks(vm)
+
+ if pv:
+ read_pv_tail(vm)
+ else:
+ read_hvm_tail(vm)
+
+ if vm.libxl and len(vm.emu_xenstore):
+ write_libxl_emulator_xenstore_data(vm.emu_xenstore)
+
+ if not pv and (vm.libxl or qemu):
+ read_qemu(vm)
+
+ if vm.libxl:
+ write_libxl_end()
+
+ except (IOError, StreamError):
+ err("Stream Error:")
+ err(traceback.format_exc())
+ return 1
+
+ except RuntimeError:
+ err("Script Error:")
+ err(traceback.format_exc())
+ err("Please fix me")
+ return 2
+ return 0
+
+def open_file_or_fd(val, mode):
+ """
+ If 'val' looks like a decimal integer, open it as an fd. If not, try to
+ open it as a regular file.
+ """
+
+ fd = -1
+ try:
+ # Does it look like an integer?
+ try:
+ fd = int(val, 10)
+ except ValueError:
+ pass
+
+ # Try to open it...
+ if fd != -1:
+ return os.fdopen(fd, mode, 0)
+ else:
+ return open(val, mode, 0)
+
+ except StandardError, e:
+ if fd != -1:
+ err("Unable to open fd %d: %s: %s" %
+ (fd, e.__class__.__name__, e))
+ else:
+ err("Unable to open file '%s': %s: %s" %
+ (val, e.__class__.__name__, e))
+
+ raise SystemExit(1)
+
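Hypothetical usage of the helper above: a numeric string opens an inherited file descriptor (as when libxl execs the converter with the stream already on an fd), while anything else is treated as a path:

    fin = open_file_or_fd("0", "rb")                  # stdin, by fd
    fout = open_file_or_fd("/tmp/stream.v2", "wb")    # made-up path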
+
+def main():
+ from optparse import OptionParser
+ global fin, fout, twidth, pv, qemu, verbose
+
+ # Change stdout to be line-buffered.
+ sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 1)
+
+ parser = OptionParser(version = str(__version__),
+ usage = ("%prog [options] -i INPUT -o OUTPUT"
+ " -w WIDTH -g GUEST"),
+ description =
+ "Convert a legacy stream to a v2 stream")
+
+ # Required options
+ parser.add_option("-i", "--in", dest = "fin", metavar = "<FD or FILE>",
+ help = "Legacy input to convert")
+ parser.add_option("-o", "--out", dest = "fout", metavar = "<FD or FILE>",
+ help = "v2 format output")
+ parser.add_option("-w", "--width", dest = "twidth",
+ metavar = "<32/64>", choices = ["32", "64"],
+ help = "Legacy toolstack bitness")
+ parser.add_option("-g", "--guest-type", dest = "gtype",
+ metavar = "<pv/hvm>", choices = ["pv", "hvm"],
+ help = "Type of guest in stream")
+
+ # Optional options
+ parser.add_option("-f", "--format", dest = "format",
+ metavar = "<libxc|libxl>", default = "libxc",
+ choices = ["libxc", "libxl"],
+ help = "Desired format of the outgoing stream " \
+ "(defaults to libxc)")
+ parser.add_option("-v", "--verbose", action = "store_true", default = False,
+ help = "Summarise stream contents")
+ parser.add_option("-x", "--xl", action = "store_true", default = False,
+ help = ("Is an `xl` header present in the stream?"
+ " (default no)"))
+ parser.add_option("--skip-qemu", action = "store_true", default = False,
+ help = ("Skip processing of the qemu tail?"
+ " (default no)"))
+ parser.add_option("--syslog", action = "store_true", default = False,
+ help = "Log to syslog instead of stdout")
+
+ opts, _ = parser.parse_args()
+
+ if (opts.fin is None or opts.fout is None or
+ opts.twidth is None or opts.gtype is None):
+
+ parser.print_help(sys.stderr)
+ raise SystemExit(1)
+
+ if opts.syslog:
+ global log_to_syslog
+
+ syslog.openlog("convert-legacy-stream", syslog.LOG_PID)
+ log_to_syslog = True
+
+ fin = open_file_or_fd(opts.fin, "rb")
+ fout = open_file_or_fd(opts.fout, "wb")
+ twidth = int(opts.twidth)
+ pv = opts.gtype == "pv"
+ verbose = opts.verbose
+ if opts.skip_qemu:
+ qemu = False
+
+ if opts.xl:
+ skip_xl_header(opts.format)
+
+ rc = read_legacy_stream(VM(opts.format))
+ fout.close()
+
+ return rc
+
+if __name__ == "__main__":
+ try:
+ sys.exit(main())
+ except SystemExit, e:
+ sys.exit(e.code)
+ except KeyboardInterrupt:
+ sys.exit(1)
diff --git a/tools/python/scripts/verify-stream-v2 b/tools/python/scripts/verify-stream-v2
new file mode 100755
index 0000000..3daf257
--- /dev/null
+++ b/tools/python/scripts/verify-stream-v2
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+""" Verify a v2 format migration stream """
+
+import sys
+import struct
+import os, os.path
+import syslog
+import traceback
+
+from xen.migration.verify import StreamError, RecordError
+from xen.migration.libxc import VerifyLibxc
+from xen.migration.libxl import VerifyLibxl
+
+fin = None # Input file/fd
+log_to_syslog = False # Boolean - Log to syslog instead of stdout/err?
+verbose = False # Boolean - Summarise stream contents
+quiet = False # Boolean - Suppress error printing
+
+def info(msg):
+ """Info message, routed to appropriate destination"""
+ if not quiet and verbose:
+ if log_to_syslog:
+ for line in msg.split("\n"):
+ syslog.syslog(syslog.LOG_INFO, line)
+ else:
+ print msg
+
+def err(msg):
+ """Error message, routed to appropriate destination"""
+ if not quiet:
+ if log_to_syslog:
+ for line in msg.split("\n"):
+ syslog.syslog(syslog.LOG_ERR, line)
+ print >> sys.stderr, msg
+
+def stream_read(_ = None):
+ """Read from input"""
+ return fin.read(_)
+
+def rdexact(nr_bytes):
+ """Read exactly nr_bytes from fin"""
+ _ = stream_read(nr_bytes)
+ if len(_) != nr_bytes:
+ raise IOError("Stream truncated")
+ return _
+
+def unpack_exact(fmt):
+ """Unpack a format from fin"""
+ sz = struct.calcsize(fmt)
+ return struct.unpack(fmt, rdexact(sz))
+
+
+def skip_xl_header():
+ """Skip over an xl header in the stream"""
+
+ hdr = rdexact(32)
+ if hdr != "Xen saved domain, xl format\n \0 \r":
+ raise StreamError("No xl header")
+
+ _, mflags, _, optlen = unpack_exact("=IIII")
+ _ = rdexact(optlen)
+
+ info("Processed xl header")
+
+ if mflags & 2: # XL_MANDATORY_FLAG_STREAMv2
+ return "libxl"
+ else:
+ return "libxc"
+
+def read_stream(fmt):
+ """ Read an entire stream """
+
+ try:
+ if fmt == "xl":
+ fmt = skip_xl_header()
+
+ if fmt == "libxc":
+ VerifyLibxc(info, stream_read).verify()
+ else:
+ VerifyLibxl(info, stream_read).verify()
+
+ except (IOError, StreamError, RecordError):
+ err("Stream Error:")
+ err(traceback.format_exc())
+ return 1
+
+ except StandardError:
+ err("Script Error:")
+ err(traceback.format_exc())
+ err("Please fix me")
+ return 2
+
+ return 0
+
+def open_file_or_fd(val, mode, buffering):
+ """
+ If 'val' looks like a decimal integer, open it as an fd. If not, try to
+ open it as a regular file.
+ """
+
+ fd = -1
+ try:
+ # Does it look like an integer?
+ try:
+ fd = int(val, 10)
+ except ValueError:
+ pass
+
+ # Try to open it...
+ if fd != -1:
+ return os.fdopen(fd, mode, buffering)
+ else:
+ return open(val, mode, buffering)
+
+ except StandardError, e:
+ if fd != -1:
+ err("Unable to open fd %d: %s: %s" %
+ (fd, e.__class__.__name__, e))
+ else:
+ err("Unable to open file '%s': %s: %s" %
+ (val, e.__class__.__name__, e))
+
+ raise SystemExit(2)
+
+def main():
+ """ main """
+ from optparse import OptionParser
+ global fin, quiet, verbose
+
+ # Change stdout to be line-buffered.
+ sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 1)
+
+ parser = OptionParser(usage = "%prog [options]",
+ description =
+ "Verify a stream according to the v2 spec")
+
+ # Optional options
+ parser.add_option("-i", "--in", dest = "fin", metavar = "<FD or FILE>",
+ default = "0",
+ help = "Stream to verify (defaults to stdin)")
+ parser.add_option("-v", "--verbose", action = "store_true", default = False,
+ help = "Summarise stream contents")
+ parser.add_option("-q", "--quiet", action = "store_true", default = False,
+ help = "Suppress all logging/errors")
+ parser.add_option("-f", "--format", dest = "format",
+ metavar = "<libxc|libxl|xl>", default = "libxc",
+ choices = ["libxc", "libxl", "xl"],
+ help = "Format of the incoming stream (defaults to libxc)")
+ parser.add_option("--syslog", action = "store_true", default = False,
+ help = "Log to syslog instead of stdout")
+
+ opts, _ = parser.parse_args()
+
+ if opts.syslog:
+ global log_to_syslog
+
+ syslog.openlog("verify-stream-v2", syslog.LOG_PID)
+ log_to_syslog = True
+
+ verbose = opts.verbose
+ quiet = opts.quiet
+ fin = open_file_or_fd(opts.fin, "rb", 0)
+
+ return read_stream(opts.format)
+
+if __name__ == "__main__":
+ try:
+ sys.exit(main())
+ except SystemExit, e:
+ sys.exit(e.code)
+ except KeyboardInterrupt:
+ sys.exit(2)
diff --git a/tools/python/setup.py b/tools/python/setup.py
index 439c429..5bf81be 100644
--- a/tools/python/setup.py
+++ b/tools/python/setup.py
@@ -43,6 +43,7 @@ setup(name = 'xen',
version = '3.0',
description = 'Xen',
packages = ['xen',
+ 'xen.migration',
'xen.lowlevel',
],
ext_package = "xen.lowlevel",
diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c
index 2aa0dc7..9ab53fb 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -667,7 +667,7 @@ static PyObject *pyxc_assign_device(XcObject *self,
sbdf |= (dev & 0x1f) << 3;
sbdf |= (func & 0x7);
- if ( xc_assign_device(self->xc_handle, dom, sbdf) != 0 )
+ if ( xc_assign_device(self->xc_handle, dom, sbdf, 0) != 0 )
{
if (errno == ENOSYS)
sbdf = -1;
@@ -1210,6 +1210,7 @@ static PyObject *pyxc_getcpuinfo(XcObject *self, PyObject *args, PyObject *kwds)
for (i = 0; i < nr_cpus; i++) {
cpuinfo_obj = Py_BuildValue("{s:k}", "idletime", cpuinfo_ptr->idletime);
PyList_Append(cpuinfo_list_obj, cpuinfo_obj);
+ Py_DECREF(cpuinfo_obj);
cpuinfo_ptr++;
}
@@ -1220,78 +1221,62 @@ static PyObject *pyxc_getcpuinfo(XcObject *self, PyObject *args, PyObject *kwds)
static PyObject *pyxc_topologyinfo(XcObject *self)
{
-#define MAX_CPU_INDEX 255
- xc_topologyinfo_t tinfo = { 0 };
- int i, max_cpu_index;
+ xc_cputopo_t *cputopo = NULL;
+ unsigned i, num_cpus = 0;
PyObject *ret_obj = NULL;
PyObject *cpu_to_core_obj, *cpu_to_socket_obj, *cpu_to_node_obj;
- DECLARE_HYPERCALL_BUFFER(xc_cpu_to_core_t, coremap);
- DECLARE_HYPERCALL_BUFFER(xc_cpu_to_socket_t, socketmap);
- DECLARE_HYPERCALL_BUFFER(xc_cpu_to_node_t, nodemap);
- coremap = xc_hypercall_buffer_alloc(self->xc_handle, coremap, sizeof(*coremap) * (MAX_CPU_INDEX+1));
- if ( coremap == NULL )
- goto out;
- socketmap = xc_hypercall_buffer_alloc(self->xc_handle, socketmap, sizeof(*socketmap) * (MAX_CPU_INDEX+1));
- if ( socketmap == NULL )
- goto out;
- nodemap = xc_hypercall_buffer_alloc(self->xc_handle, nodemap, sizeof(*nodemap) * (MAX_CPU_INDEX+1));
- if ( nodemap == NULL )
+ if ( xc_cputopoinfo(self->xc_handle, &num_cpus, NULL) != 0 )
goto out;
- set_xen_guest_handle(tinfo.cpu_to_core, coremap);
- set_xen_guest_handle(tinfo.cpu_to_socket, socketmap);
- set_xen_guest_handle(tinfo.cpu_to_node, nodemap);
- tinfo.max_cpu_index = MAX_CPU_INDEX;
+ cputopo = calloc(num_cpus, sizeof(*cputopo));
+ if ( cputopo == NULL )
+ goto out;
- if ( xc_topologyinfo(self->xc_handle, &tinfo) != 0 )
+ if ( xc_cputopoinfo(self->xc_handle, &num_cpus, cputopo) != 0 )
goto out;
- max_cpu_index = tinfo.max_cpu_index;
- if ( max_cpu_index > MAX_CPU_INDEX )
- max_cpu_index = MAX_CPU_INDEX;
-
/* Construct cpu-to-* lists. */
cpu_to_core_obj = PyList_New(0);
cpu_to_socket_obj = PyList_New(0);
cpu_to_node_obj = PyList_New(0);
- for ( i = 0; i <= max_cpu_index; i++ )
+ for ( i = 0; i < num_cpus; i++ )
{
- if ( coremap[i] == INVALID_TOPOLOGY_ID )
+ if ( cputopo[i].core == XEN_INVALID_CORE_ID )
{
PyList_Append(cpu_to_core_obj, Py_None);
}
else
{
- PyObject *pyint = PyInt_FromLong(coremap[i]);
+ PyObject *pyint = PyInt_FromLong(cputopo[i].core);
PyList_Append(cpu_to_core_obj, pyint);
Py_DECREF(pyint);
}
- if ( socketmap[i] == INVALID_TOPOLOGY_ID )
+ if ( cputopo[i].socket == XEN_INVALID_SOCKET_ID )
{
PyList_Append(cpu_to_socket_obj, Py_None);
}
else
{
- PyObject *pyint = PyInt_FromLong(socketmap[i]);
+ PyObject *pyint = PyInt_FromLong(cputopo[i].socket);
PyList_Append(cpu_to_socket_obj, pyint);
Py_DECREF(pyint);
}
- if ( nodemap[i] == INVALID_TOPOLOGY_ID )
+ if ( cputopo[i].node == XEN_INVALID_NODE_ID )
{
PyList_Append(cpu_to_node_obj, Py_None);
}
else
{
- PyObject *pyint = PyInt_FromLong(nodemap[i]);
+ PyObject *pyint = PyInt_FromLong(cputopo[i].node);
PyList_Append(cpu_to_node_obj, pyint);
Py_DECREF(pyint);
}
}
- ret_obj = Py_BuildValue("{s:i}", "max_cpu_index", max_cpu_index);
+ ret_obj = Py_BuildValue("{s:i}", "max_cpu_index", num_cpus + 1);
PyDict_SetItemString(ret_obj, "cpu_to_core", cpu_to_core_obj);
Py_DECREF(cpu_to_core_obj);
@@ -1303,64 +1288,48 @@ static PyObject *pyxc_topologyinfo(XcObject *self)
Py_DECREF(cpu_to_node_obj);
out:
- xc_hypercall_buffer_free(self->xc_handle, coremap);
- xc_hypercall_buffer_free(self->xc_handle, socketmap);
- xc_hypercall_buffer_free(self->xc_handle, nodemap);
+ free(cputopo);
return ret_obj ? ret_obj : pyxc_error_to_exception(self->xc_handle);
-#undef MAX_CPU_INDEX
}
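From Python, the reworked binding keeps the same result shape: a dict with "max_cpu_index" plus "cpu_to_core", "cpu_to_socket" and "cpu_to_node" lists holding ints, or None for invalid entries. A hypothetical usage sketch:

    from xen.lowlevel import xc
    info = xc.xc().topologyinfo()
    for cpu, core in enumerate(info["cpu_to_core"]):
        print "cpu%d -> core %s" % (cpu, core)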
static PyObject *pyxc_numainfo(XcObject *self)
{
-#define MAX_NODE_INDEX 31
- xc_numainfo_t ninfo = { 0 };
- int i, j, max_node_index;
+ unsigned i, j, num_nodes = 0;
uint64_t free_heap;
PyObject *ret_obj = NULL, *node_to_node_dist_list_obj;
PyObject *node_to_memsize_obj, *node_to_memfree_obj;
PyObject *node_to_dma32_mem_obj, *node_to_node_dist_obj;
- DECLARE_HYPERCALL_BUFFER(xc_node_to_memsize_t, node_memsize);
- DECLARE_HYPERCALL_BUFFER(xc_node_to_memfree_t, node_memfree);
- DECLARE_HYPERCALL_BUFFER(xc_node_to_node_dist_t, nodes_dist);
+ xc_meminfo_t *meminfo = NULL;
+ uint32_t *distance = NULL;
- node_memsize = xc_hypercall_buffer_alloc(self->xc_handle, node_memsize, sizeof(*node_memsize)*(MAX_NODE_INDEX+1));
- if ( node_memsize == NULL )
- goto out;
- node_memfree = xc_hypercall_buffer_alloc(self->xc_handle, node_memfree, sizeof(*node_memfree)*(MAX_NODE_INDEX+1));
- if ( node_memfree == NULL )
- goto out;
- nodes_dist = xc_hypercall_buffer_alloc(self->xc_handle, nodes_dist, sizeof(*nodes_dist)*(MAX_NODE_INDEX+1)*(MAX_NODE_INDEX+1));
- if ( nodes_dist == NULL )
+ if ( xc_numainfo(self->xc_handle, &num_nodes, NULL, NULL) != 0 )
goto out;
- set_xen_guest_handle(ninfo.node_to_memsize, node_memsize);
- set_xen_guest_handle(ninfo.node_to_memfree, node_memfree);
- set_xen_guest_handle(ninfo.node_to_node_distance, nodes_dist);
- ninfo.max_node_index = MAX_NODE_INDEX;
-
- if ( xc_numainfo(self->xc_handle, &ninfo) != 0 )
+ meminfo = calloc(num_nodes, sizeof(*meminfo));
+ distance = calloc(num_nodes * num_nodes, sizeof(*distance));
+ if ( (meminfo == NULL) || (distance == NULL) )
goto out;
- max_node_index = ninfo.max_node_index;
- if ( max_node_index > MAX_NODE_INDEX )
- max_node_index = MAX_NODE_INDEX;
+ if ( xc_numainfo(self->xc_handle, &num_nodes, meminfo, distance) != 0 )
+ goto out;
/* Construct node-to-* lists. */
node_to_memsize_obj = PyList_New(0);
node_to_memfree_obj = PyList_New(0);
node_to_dma32_mem_obj = PyList_New(0);
node_to_node_dist_list_obj = PyList_New(0);
- for ( i = 0; i <= max_node_index; i++ )
+ for ( i = 0; i < num_nodes; i++ )
{
PyObject *pyint;
+ unsigned invalid_node;
/* Total Memory */
- pyint = PyInt_FromLong(node_memsize[i] >> 20); /* MB */
+ pyint = PyInt_FromLong(meminfo[i].memsize >> 20); /* MB */
PyList_Append(node_to_memsize_obj, pyint);
Py_DECREF(pyint);
/* Free Memory */
- pyint = PyInt_FromLong(node_memfree[i] >> 20); /* MB */
+ pyint = PyInt_FromLong(meminfo[i].memfree >> 20); /* MB */
PyList_Append(node_to_memfree_obj, pyint);
Py_DECREF(pyint);
@@ -1372,10 +1341,11 @@ static PyObject *pyxc_numainfo(XcObject *self)
/* Node to Node Distance */
node_to_node_dist_obj = PyList_New(0);
- for ( j = 0; j <= max_node_index; j++ )
+ invalid_node = (meminfo[i].memsize == XEN_INVALID_MEM_SZ);
+ for ( j = 0; j < num_nodes; j++ )
{
- uint32_t dist = nodes_dist[i*(max_node_index+1) + j];
- if ( dist == INVALID_TOPOLOGY_ID )
+ uint32_t dist = distance[i * num_nodes + j];
+ if ( invalid_node || (dist == XEN_INVALID_NODE_DIST) )
{
PyList_Append(node_to_node_dist_obj, Py_None);
}
@@ -1390,7 +1360,7 @@ static PyObject *pyxc_numainfo(XcObject *self)
Py_DECREF(node_to_node_dist_obj);
}
- ret_obj = Py_BuildValue("{s:i}", "max_node_index", max_node_index);
+ ret_obj = Py_BuildValue("{s:i}", "max_node_index", num_nodes + 1);
PyDict_SetItemString(ret_obj, "node_memsize", node_to_memsize_obj);
Py_DECREF(node_to_memsize_obj);
@@ -1406,11 +1376,9 @@ static PyObject *pyxc_numainfo(XcObject *self)
Py_DECREF(node_to_node_dist_list_obj);
out:
- xc_hypercall_buffer_free(self->xc_handle, node_memsize);
- xc_hypercall_buffer_free(self->xc_handle, node_memfree);
- xc_hypercall_buffer_free(self->xc_handle, nodes_dist);
+ free(meminfo);
+ free(distance);
return ret_obj ? ret_obj : pyxc_error_to_exception(self->xc_handle);
-#undef MAX_NODE_INDEX
}
static PyObject *pyxc_xeninfo(XcObject *self)
@@ -1466,51 +1434,6 @@ static PyObject *pyxc_xeninfo(XcObject *self)
"cc_compile_date", xen_cc.compile_date);
}
-
-static PyObject *pyxc_sedf_domain_set(XcObject *self,
- PyObject *args,
- PyObject *kwds)
-{
- uint32_t domid;
- uint64_t period, slice, latency;
- uint16_t extratime, weight;
- static char *kwd_list[] = { "domid", "period", "slice",
- "latency", "extratime", "weight",NULL };
-
- if( !PyArg_ParseTupleAndKeywords(args, kwds, "iLLLhh", kwd_list,
- &domid, &period, &slice,
- &latency, &extratime, &weight) )
- return NULL;
- if ( xc_sedf_domain_set(self->xc_handle, domid, period,
- slice, latency, extratime,weight) != 0 )
- return pyxc_error_to_exception(self->xc_handle);
-
- Py_INCREF(zero);
- return zero;
-}
-
-static PyObject *pyxc_sedf_domain_get(XcObject *self, PyObject *args)
-{
- uint32_t domid;
- uint64_t period, slice,latency;
- uint16_t weight, extratime;
-
- if(!PyArg_ParseTuple(args, "i", &domid))
- return NULL;
-
- if (xc_sedf_domain_get(self->xc_handle, domid, &period,
- &slice,&latency,&extratime,&weight))
- return pyxc_error_to_exception(self->xc_handle);
-
- return Py_BuildValue("{s:i,s:L,s:L,s:L,s:i,s:i}",
- "domid", domid,
- "period", period,
- "slice", slice,
- "latency", latency,
- "extratime", extratime,
- "weight", weight);
-}
-
static PyObject *pyxc_shadow_control(PyObject *self,
PyObject *args,
PyObject *kwds)
@@ -1876,36 +1799,35 @@ static PyObject *pyxc_tmem_control(XcObject *self,
uint32_t cli_id;
uint32_t arg1;
uint32_t arg2;
- uint64_t arg3;
char *buf;
char _buffer[32768], *buffer = _buffer;
int rc;
- static char *kwd_list[] = { "pool_id", "subop", "cli_id", "arg1", "arg2", "arg3", "buf", NULL };
+ static char *kwd_list[] = { "pool_id", "subop", "cli_id", "arg1", "arg2", "buf", NULL };
if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiiiis", kwd_list,
- &pool_id, &subop, &cli_id, &arg1, &arg2, &arg3, &buf) )
+ &pool_id, &subop, &cli_id, &arg1, &arg2, &buf) )
return NULL;
- if ( (subop == TMEMC_LIST) && (arg1 > 32768) )
+ if ( (subop == XEN_SYSCTL_TMEM_OP_LIST) && (arg1 > 32768) )
arg1 = 32768;
- if ( (rc = xc_tmem_control(self->xc_handle, pool_id, subop, cli_id, arg1, arg2, arg3, buffer)) < 0 )
+ if ( (rc = xc_tmem_control(self->xc_handle, pool_id, subop, cli_id, arg1, arg2, buffer)) < 0 )
return Py_BuildValue("i", rc);
switch (subop) {
- case TMEMC_LIST:
+ case XEN_SYSCTL_TMEM_OP_LIST:
return Py_BuildValue("s", buffer);
- case TMEMC_FLUSH:
+ case XEN_SYSCTL_TMEM_OP_FLUSH:
return Py_BuildValue("i", rc);
- case TMEMC_QUERY_FREEABLE_MB:
+ case XEN_SYSCTL_TMEM_OP_QUERY_FREEABLE_MB:
return Py_BuildValue("i", rc);
- case TMEMC_THAW:
- case TMEMC_FREEZE:
- case TMEMC_DESTROY:
- case TMEMC_SET_WEIGHT:
- case TMEMC_SET_CAP:
- case TMEMC_SET_COMPRESS:
+ case XEN_SYSCTL_TMEM_OP_THAW:
+ case XEN_SYSCTL_TMEM_OP_FREEZE:
+ case XEN_SYSCTL_TMEM_OP_DESTROY:
+ case XEN_SYSCTL_TMEM_OP_SET_WEIGHT:
+ case XEN_SYSCTL_TMEM_OP_SET_CAP:
+ case XEN_SYSCTL_TMEM_OP_SET_COMPRESS:
default:
break;
}
@@ -2523,30 +2445,6 @@ static PyMethodDef pyxc_methods[] = {
"Get the current scheduler type in use.\n"
"Returns: [int] sched_id.\n" },
- { "sedf_domain_set",
- (PyCFunction)pyxc_sedf_domain_set,
- METH_KEYWORDS, "\n"
- "Set the scheduling parameters for a domain when running with Atropos.\n"
- " dom [int]: domain to set\n"
- " period [long]: domain's scheduling period\n"
- " slice [long]: domain's slice per period\n"
- " latency [long]: domain's wakeup latency hint\n"
- " extratime [int]: domain aware of extratime?\n"
- "Returns: [int] 0 on success; -1 on error.\n" },
-
- { "sedf_domain_get",
- (PyCFunction)pyxc_sedf_domain_get,
- METH_VARARGS, "\n"
- "Get the current scheduling parameters for a domain when running with\n"
- "the Atropos scheduler."
- " dom [int]: domain to query\n"
- "Returns: [dict]\n"
- " domain [int]: domain ID\n"
- " period [long]: scheduler period\n"
- " slice [long]: CPU reservation per period\n"
- " latency [long]: domain's wakeup latency hint\n"
- " extratime [int]: domain aware of extratime?\n"},
-
{ "sched_credit_domain_set",
(PyCFunction)pyxc_sched_credit_domain_set,
METH_KEYWORDS, "\n"
@@ -3066,7 +2964,6 @@ PyMODINIT_FUNC initxc(void)
PyModule_AddObject(m, "Error", xc_error_obj);
/* Expose some libxc constants to Python */
- PyModule_AddIntConstant(m, "XEN_SCHEDULER_SEDF", XEN_SCHEDULER_SEDF);
PyModule_AddIntConstant(m, "XEN_SCHEDULER_CREDIT", XEN_SCHEDULER_CREDIT);
PyModule_AddIntConstant(m, "XEN_SCHEDULER_CREDIT2", XEN_SCHEDULER_CREDIT2);
diff --git a/tools/python/xen/lowlevel/xl/xl.c b/tools/python/xen/lowlevel/xl/xl.c
index 32f982a..20423a3 100644
--- a/tools/python/xen/lowlevel/xl/xl.c
+++ b/tools/python/xen/lowlevel/xl/xl.c
@@ -14,8 +14,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/tools/python/xen/lowlevel/xs/xs.c b/tools/python/xen/lowlevel/xs/xs.c
index ec364bb..76ae3ac 100644
--- a/tools/python/xen/lowlevel/xs/xs.c
+++ b/tools/python/xen/lowlevel/xs/xs.c
@@ -11,8 +11,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) 2005 Mike Wray Hewlett-Packard
* Copyright (C) 2005 Christian Limpach <Christian.Limpach at cl.cam.ac.uk>
diff --git a/tools/python/xen/migration/__init__.py b/tools/python/xen/migration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/python/xen/migration/legacy.py b/tools/python/xen/migration/legacy.py
new file mode 100644
index 0000000..6456d61
--- /dev/null
+++ b/tools/python/xen/migration/legacy.py
@@ -0,0 +1,315 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Legacy migration stream information.
+
+Documentation and record structures for legacy migration, for both libxc
+and libxl.
+"""
+
+"""
+Libxc:
+
+SAVE/RESTORE/MIGRATE PROTOCOL
+=============================
+
+The general form of a stream of chunks is a header followed by a
+body consisting of a variable number of chunks (terminated by a
+chunk with type 0) followed by a trailer.
+
+For a rolling/checkpoint (e.g. remus) migration, the body and
+trailer phases can be repeated until an external event
+(e.g. failure) causes the process to terminate and commit to the
+most recent complete checkpoint.
+
+HEADER
+------
+
+unsigned long : p2m_size
+
+extended-info (PV-only, optional):
+
+ If first unsigned long == ~0UL then extended info is present,
+ otherwise unsigned long is part of p2m. Note that p2m_size above
+ does not include the length of the extended info.
+
+ extended-info:
+
+ unsigned long : signature == ~0UL
+ uint32_t : number of bytes remaining in extended-info
+
+ 1 or more extended-info blocks of form:
+ char[4] : block identifier
+ uint32_t : block data size
+ bytes : block data
+
+ defined extended-info blocks:
+ "vcpu" : VCPU context info containing vcpu_guest_context_t.
+ The precise variant of the context structure
+ (e.g. 32 vs 64 bit) is distinguished by
+ the block size.
+ "extv" : Presence indicates use of extended VCPU context in
+ tail, data size is 0.
+
+p2m (PV-only):
+
+ consists of p2m_size bytes comprising an array of xen_pfn_t sized entries.
+
+BODY PHASE - Format A (for live migration or Remus without compression)
+----------
+
+A series of chunks with a common header:
+ int : chunk type
+
+If the chunk type is +ve then chunk contains guest memory data, and the
+type contains the number of pages in the batch:
+
+ unsigned long[] : PFN array, length == number of pages in batch
+ Each entry consists of XEN_DOMCTL_PFINFO_*
+ in bits 31-28 and the PFN number in bits 27-0.
+ page data : PAGE_SIZE bytes for each page marked present in PFN
+ array
+
+If the chunk type is -ve then chunk consists of one of a number of
+metadata types. See definitions of XC_SAVE_ID_* below.
+
+If chunk type is 0 then body phase is complete.
+
+
+BODY PHASE - Format B (for Remus with compression)
+----------
+
+A series of chunks with a common header:
+ int : chunk type
+
+If the chunk type is +ve then chunk contains array of PFNs corresponding
+to guest memory and type contains the number of PFNs in the batch:
+
+ unsigned long[] : PFN array, length == number of pages in batch
+ Each entry consists of XEN_DOMCTL_PFINFO_*
+ in bits 31-28 and the PFN number in bits 27-0.
+
+If the chunk type is -ve then chunk consists of one of a number of
+metadata types. See definitions of XC_SAVE_ID_* below.
+
+If the chunk type is -ve and equals XC_SAVE_ID_COMPRESSED_DATA, then the
+chunk consists of compressed page data, in the following format:
+
+ unsigned long : Size of the compressed chunk to follow
+ compressed data : variable length data of size indicated above.
+ This chunk consists of compressed page data.
+ The number of pages in one chunk depends on
+ the amount of space available in the sender's
+ output buffer.
+
+Format of compressed data:
+ compressed_data = <deltas>*
+ delta = <marker, run*>
+ marker = (RUNFLAG|SKIPFLAG) bitwise-or RUNLEN [1 byte marker]
+ RUNFLAG = 0
+ SKIPFLAG = 1 << 7
+ RUNLEN = 7-bit unsigned value indicating number of WORDS in the run
+ run = string of bytes of length sizeof(WORD) * RUNLEN
+
+ If marker contains RUNFLAG, then RUNLEN * sizeof(WORD) bytes of data following
+ the marker are copied into the target page at the appropriate offset indicated by
+ the offset_ptr.
+ If marker contains SKIPFLAG, then the offset_ptr is advanced
+ by RUNLEN * sizeof(WORD).
+
+If chunk type is 0 then body phase is complete.
+
+There can be one or more chunks with type XC_SAVE_ID_COMPRESSED_DATA,
+containing compressed pages. The compressed chunks are collated to form
+one single compressed chunk for the entire iteration. The number of pages
+present in this final compressed chunk will be equal to the total number
+of valid PFNs specified by the +ve chunks.
+
+At the sender side, compressed pages are inserted into the output stream
+in the same order as they would have been if compression logic was absent.
+
+Until last iteration, the BODY is sent in Format A, to maintain live
+migration compatibility with receivers of older Xen versions.
+At the last iteration, if Remus compression was enabled, the sender sends
+a trigger, XC_SAVE_ID_ENABLE_COMPRESSION to tell the receiver to parse the
+BODY in Format B from the next iteration onwards.
+
+An example sequence of chunks received in Format B:
+ +16 +ve chunk
+ unsigned long[16] PFN array
+ +100 +ve chunk
+ unsigned long[100] PFN array
+ +50 +ve chunk
+ unsigned long[50] PFN array
+
+ XC_SAVE_ID_COMPRESSED_DATA TAG
+ N Length of compressed data
+ N bytes of DATA Decompresses to 166 pages
+
+ XC_SAVE_ID_* other xc save chunks
+ 0 END BODY TAG
+
+Corner case with checkpoint compression:
+ At sender side, after pausing the domain, dirty pages are usually
+ copied out to a temporary buffer. After the domain is resumed,
+ compression is done and the compressed chunk(s) are sent, followed by
+ other XC_SAVE_ID_* chunks.
+ If the temporary buffer gets full while scanning for dirty pages,
+ the sender stops buffering of dirty pages, compresses the temporary
+ buffer and sends the compressed data with XC_SAVE_ID_COMPRESSED_DATA.
+ The sender then resumes the buffering of dirty pages and continues
+ scanning for the dirty pages.
+ For example, assume that the temporary buffer can hold 4096 pages and
+ there are 5000 dirty pages. The following is the sequence of chunks
+ that the receiver will see:
+
+ +1024 +ve chunk
+ unsigned long[1024] PFN array
+ +1024 +ve chunk
+ unsigned long[1024] PFN array
+ +1024 +ve chunk
+ unsigned long[1024] PFN array
+ +1024 +ve chunk
+ unsigned long[1024] PFN array
+
+ XC_SAVE_ID_COMPRESSED_DATA TAG
+ N Length of compressed data
+ N bytes of DATA Decompresses to 4096 pages
+
+ +4 +ve chunk
+ unsigned long[4] PFN array
+
+ XC_SAVE_ID_COMPRESSED_DATA TAG
+ M Length of compressed data
+ M bytes of DATA Decompresses to 4 pages
+
+ XC_SAVE_ID_* other xc save chunks
+ 0 END BODY TAG
+
+ In other words, XC_SAVE_ID_COMPRESSED_DATA can be interleaved with
+ +ve chunks arbitrarily. But at the receiver end, the following condition
+ always holds true until the end of BODY PHASE:
+ num(PFN entries +ve chunks) >= num(pages received in compressed form)
+
+TAIL PHASE
+----------
+
+Content differs for PV and HVM guests.
+
+HVM TAIL:
+
+ "Magic" pages:
+ uint64_t : I/O req PFN
+ uint64_t : Buffered I/O req PFN
+ uint64_t : Store PFN
+ Xen HVM Context:
+ uint32_t : Length of context in bytes
+ bytes : Context data
+ Qemu context:
+ char[21] : Signature:
+ "QemuDeviceModelRecord" : Read Qemu save data until EOF
+ "DeviceModelRecord0002" : uint32_t length field followed by that many
+ bytes of Qemu save data
+ "RemusDeviceModelState" : Currently the same as "DeviceModelRecord0002".
+
+PV TAIL:
+
+ Unmapped PFN list : list of all the PFNs that were not in map at the close
+ unsigned int : Number of unmapped pages
+ unsigned long[] : PFNs of unmapped pages
+
+ VCPU context data : A series of VCPU records, one per present VCPU
+ Maximum and present map supplied in XC_SAVE_ID_VCPUINFO
+ bytes: : VCPU context structure. Size is determined by size
+ provided in extended-info header
+ bytes[128] : Extended VCPU context (present IFF "extv" block
+ present in extended-info header)
+
+ Shared Info Page : 4096 bytes of shared info page
+"""
+
+CHUNK_end = 0
+CHUNK_enable_verify_mode = -1
+CHUNK_vcpu_info = -2
+CHUNK_hvm_ident_pt = -3
+CHUNK_hvm_vm86_tss = -4
+CHUNK_tmem = -5
+CHUNK_tmem_extra = -6
+CHUNK_tsc_info = -7
+CHUNK_hvm_console_pfn = -8
+CHUNK_last_checkpoint = -9
+CHUNK_hvm_acpi_ioports_location = -10
+CHUNK_hvm_viridian = -11
+CHUNK_compressed_data = -12
+CHUNK_enable_compression = -13
+CHUNK_hvm_generation_id_addr = -14
+CHUNK_hvm_paging_ring_pfn = -15
+CHUNK_hvm_monitor_ring_pfn = -16
+CHUNK_hvm_sharing_ring_pfn = -17
+CHUNK_toolstack = -18
+CHUNK_hvm_ioreq_server_pfn = -19
+CHUNK_hvm_nr_ioreq_server_pages = -20
+
+chunk_type_to_str = {
+ CHUNK_end : "end",
+ CHUNK_enable_verify_mode : "enable_verify_mode",
+ CHUNK_vcpu_info : "vcpu_info",
+ CHUNK_hvm_ident_pt : "hvm_ident_pt",
+ CHUNK_hvm_vm86_tss : "hvm_vm86_tss",
+ CHUNK_tmem : "tmem",
+ CHUNK_tmem_extra : "tmem_extra",
+ CHUNK_tsc_info : "tsc_info",
+ CHUNK_hvm_console_pfn : "hvm_console_pfn",
+ CHUNK_last_checkpoint : "last_checkpoint",
+ CHUNK_hvm_acpi_ioports_location : "hvm_acpi_ioports_location",
+ CHUNK_hvm_viridian : "hvm_viridian",
+ CHUNK_compressed_data : "compressed_data",
+ CHUNK_enable_compression : "enable_compression",
+ CHUNK_hvm_generation_id_addr : "hvm_generation_id_addr",
+ CHUNK_hvm_paging_ring_pfn : "hvm_paging_ring_pfn",
+ CHUNK_hvm_monitor_ring_pfn : "hvm_monitor_ring_pfn",
+ CHUNK_hvm_sharing_ring_pfn : "hvm_sharing_ring_pfn",
+ CHUNK_toolstack : "toolstack",
+ CHUNK_hvm_ioreq_server_pfn : "hvm_ioreq_server_pfn",
+ CHUNK_hvm_nr_ioreq_server_pages : "hvm_nr_ioreq_server_pages",
+}
+
+# Up to 1024 pages (4MB) at a time
+MAX_BATCH = 1024
+
+# Maximum #VCPUs currently supported for save/restore
+MAX_VCPU_ID = 4095
+
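Putting the constants above to use: a legacy chunk header is a single signed 32-bit integer, where positive values announce a page batch (bounded by MAX_BATCH) and negative values select a metadata chunk. A small dispatch sketch with made-up input:

    import struct

    def classify(raw):
        marker, = struct.unpack("=i", raw)
        if marker == CHUNK_end:
            return "end"
        if marker > 0:
            assert marker <= MAX_BATCH
            return "page batch of %d pfns" % (marker, )
        return chunk_type_to_str.get(marker, "unknown")

    assert classify(struct.pack("=i", -7)) == "tsc_info"
    assert classify(struct.pack("=i", 16)) == "page batch of 16 pfns"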
+
+"""
+Libxl:
+
+Legacy "toolstack" record layout:
+
+Version 1:
+ uint32_t version
+ QEMU physmap data:
+ uint32_t count
+ libxl__physmap_info * count
+
+The problem is that libxl__physmap_info was declared as:
+
+struct libxl__physmap_info {
+ uint64_t phys_offset;
+ uint64_t start_addr;
+ uint64_t size;
+ uint32_t namelen;
+ char name[];
+};
+
+This has 4 bytes of padding at the end in a 64bit build, so the layout is
+not the same between 32 and 64bit builds.
+
+Because of the pointer arithmetic used to construct the record, the 'name' was
+shifted up to start at the padding, leaving 4 erroneous bytes at the end
+of the name string, after the NUL terminator.
+
+Instead, the information described here has been changed to fit in a new
+EMULATOR_XENSTORE_DATA record made of NUL terminated strings.
+"""
diff --git a/tools/python/xen/migration/libxc.py b/tools/python/xen/migration/libxc.py
new file mode 100644
index 0000000..b0255ac
--- /dev/null
+++ b/tools/python/xen/migration/libxc.py
@@ -0,0 +1,446 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Libxc Migration v2 streams
+
+Record structures as per docs/specs/libxc-migration-stream.pandoc, and
+verification routines.
+"""
+
+import sys
+
+from struct import calcsize, unpack
+
+from xen.migration.verify import StreamError, RecordError, VerifyBase
+
+# Image Header
+IHDR_FORMAT = "!QIIHHI"
+
+IHDR_MARKER = 0xffffffffffffffff
+IHDR_IDENT = 0x58454E46 # "XENF" in ASCII
+IHDR_VERSION = 2
+
+IHDR_OPT_BIT_ENDIAN = 0
+IHDR_OPT_LE = (0 << IHDR_OPT_BIT_ENDIAN)
+IHDR_OPT_BE = (1 << IHDR_OPT_BIT_ENDIAN)
+
+IHDR_OPT_RESZ_MASK = 0xfffe
+
+# Domain Header
+DHDR_FORMAT = "IHHII"
+
+DHDR_TYPE_x86_pv = 0x00000001
+DHDR_TYPE_x86_hvm = 0x00000002
+DHDR_TYPE_x86_pvh = 0x00000003
+DHDR_TYPE_arm = 0x00000004
+
+dhdr_type_to_str = {
+ DHDR_TYPE_x86_pv : "x86 PV",
+ DHDR_TYPE_x86_hvm : "x86 HVM",
+ DHDR_TYPE_x86_pvh : "x86 PVH",
+ DHDR_TYPE_arm : "ARM",
+}
+
+# Records
+RH_FORMAT = "II"
+
+REC_TYPE_end = 0x00000000
+REC_TYPE_page_data = 0x00000001
+REC_TYPE_x86_pv_info = 0x00000002
+REC_TYPE_x86_pv_p2m_frames = 0x00000003
+REC_TYPE_x86_pv_vcpu_basic = 0x00000004
+REC_TYPE_x86_pv_vcpu_extended = 0x00000005
+REC_TYPE_x86_pv_vcpu_xsave = 0x00000006
+REC_TYPE_shared_info = 0x00000007
+REC_TYPE_tsc_info = 0x00000008
+REC_TYPE_hvm_context = 0x00000009
+REC_TYPE_hvm_params = 0x0000000a
+REC_TYPE_toolstack = 0x0000000b
+REC_TYPE_x86_pv_vcpu_msrs = 0x0000000c
+REC_TYPE_verify = 0x0000000d
+REC_TYPE_checkpoint = 0x0000000e
+
+rec_type_to_str = {
+ REC_TYPE_end : "End",
+ REC_TYPE_page_data : "Page data",
+ REC_TYPE_x86_pv_info : "x86 PV info",
+ REC_TYPE_x86_pv_p2m_frames : "x86 PV P2M frames",
+ REC_TYPE_x86_pv_vcpu_basic : "x86 PV vcpu basic",
+ REC_TYPE_x86_pv_vcpu_extended : "x86 PV vcpu extended",
+ REC_TYPE_x86_pv_vcpu_xsave : "x86 PV vcpu xsave",
+ REC_TYPE_shared_info : "Shared info",
+ REC_TYPE_tsc_info : "TSC info",
+ REC_TYPE_hvm_context : "HVM context",
+ REC_TYPE_hvm_params : "HVM params",
+ REC_TYPE_toolstack : "Toolstack",
+ REC_TYPE_x86_pv_vcpu_msrs : "x86 PV vcpu msrs",
+ REC_TYPE_verify : "Verify",
+ REC_TYPE_checkpoint : "Checkpoint",
+}
+
+# page_data
+PAGE_DATA_FORMAT = "II"
+PAGE_DATA_PFN_MASK = (1L << 52) - 1
+PAGE_DATA_PFN_RESZ_MASK = ((1L << 60) - 1) & ~((1L << 52) - 1)
+
+# flags from xen/public/domctl.h: XEN_DOMCTL_PFINFO_* shifted by 32 bits
+PAGE_DATA_TYPE_SHIFT = 60
+PAGE_DATA_TYPE_LTABTYPE_MASK = (0x7L << PAGE_DATA_TYPE_SHIFT)
+PAGE_DATA_TYPE_LTAB_MASK = (0xfL << PAGE_DATA_TYPE_SHIFT)
+PAGE_DATA_TYPE_LPINTAB = (0x8L << PAGE_DATA_TYPE_SHIFT) # Pinned pagetable
+
+PAGE_DATA_TYPE_NOTAB = (0x0L << PAGE_DATA_TYPE_SHIFT) # Regular page
+PAGE_DATA_TYPE_L1TAB = (0x1L << PAGE_DATA_TYPE_SHIFT) # L1 pagetable
+PAGE_DATA_TYPE_L2TAB = (0x2L << PAGE_DATA_TYPE_SHIFT) # L2 pagetable
+PAGE_DATA_TYPE_L3TAB = (0x3L << PAGE_DATA_TYPE_SHIFT) # L3 pagetable
+PAGE_DATA_TYPE_L4TAB = (0x4L << PAGE_DATA_TYPE_SHIFT) # L4 pagetable
+PAGE_DATA_TYPE_BROKEN = (0xdL << PAGE_DATA_TYPE_SHIFT) # Broken
+PAGE_DATA_TYPE_XALLOC = (0xeL << PAGE_DATA_TYPE_SHIFT) # Allocate-only
+PAGE_DATA_TYPE_XTAB = (0xfL << PAGE_DATA_TYPE_SHIFT) # Invalid
+
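Decoding a v2 pfn field with the masks above: the frame number occupies bits 0-51 and the page type bits 60-63. With a hypothetical field value:

    field = PAGE_DATA_TYPE_L1TAB | 0x1234
    ptype = field & PAGE_DATA_TYPE_LTAB_MASK
    pfn = field & PAGE_DATA_PFN_MASK
    assert ptype == PAGE_DATA_TYPE_L1TAB and pfn == 0x1234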
+# x86_pv_info
+X86_PV_INFO_FORMAT = "BBHI"
+
+X86_PV_P2M_FRAMES_FORMAT = "II"
+
+# x86_pv_vcpu_{basic,extended,xsave,msrs}
+X86_PV_VCPU_HDR_FORMAT = "II"
+
+# tsc_info
+TSC_INFO_FORMAT = "IIQII"
+
+# hvm_params
+HVM_PARAMS_ENTRY_FORMAT = "QQ"
+HVM_PARAMS_FORMAT = "II"
+
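An hvm_params record is a count/reserved header followed by count (index, value) pairs, so its exact length is computable up front; verify_record_hvm_params() below relies on this. For example:

    from struct import calcsize
    count = 3
    expected = (calcsize(HVM_PARAMS_FORMAT) +
                count * calcsize(HVM_PARAMS_ENTRY_FORMAT))
    assert expected == 8 + 3 * 16      # header plus three 16-byte entries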
+class VerifyLibxc(VerifyBase):
+ """ Verify a Libxc v2 stream """
+
+ def __init__(self, info, read):
+ VerifyBase.__init__(self, info, read)
+
+ self.squashed_pagedata_records = 0
+
+
+ def verify(self):
+ """ Verity a libxc stream """
+
+ self.verify_ihdr()
+ self.verify_dhdr()
+
+ while self.verify_record() != REC_TYPE_end:
+ pass
+
+
+ def verify_ihdr(self):
+ """ Verify an Image Header """
+ marker, ident, version, options, res1, res2 = \
+ self.unpack_exact(IHDR_FORMAT)
+
+ if marker != IHDR_MARKER:
+ raise StreamError("Bad image marker: Expected 0x%x, got 0x%x"
+ % (IHDR_MARKER, marker))
+
+ if ident != IHDR_IDENT:
+ raise StreamError("Bad image id: Expected 0x%x, got 0x%x"
+ % (IHDR_IDENT, ident))
+
+ if version != IHDR_VERSION:
+ raise StreamError("Unknown image version: Expected %d, got %d"
+ % (IHDR_VERSION, version))
+
+ if options & IHDR_OPT_RESZ_MASK:
+ raise StreamError("Reserved bits set in image options field: 0x%x"
+ % (options & IHDR_OPT_RESZ_MASK))
+
+ if res1 != 0 or res2 != 0:
+ raise StreamError("Reserved bits set in image header: 0x%04x:0x%08x"
+ % (res1, res2))
+
+ if ( (sys.byteorder == "little") and
+ ((options & IHDR_OPT_BE) != IHDR_OPT_LE) ):
+ raise StreamError(
+ "Stream is not native endianness - unable to validate")
+
+ endian = ["little", "big"][options & IHDR_OPT_LE]
+ self.info("Libxc Image Header: %s endian" % (endian, ))
+
+
+ def verify_dhdr(self):
+ """ Verify a domain header """
+
+ gtype, page_shift, res1, major, minor = \
+ self.unpack_exact(DHDR_FORMAT)
+
+ if gtype not in dhdr_type_to_str:
+ raise StreamError("Unrecognised domain type 0x%x" % (gtype, ))
+
+ if res1 != 0:
+ raise StreamError("Reserved bits set in domain header 0x%04x"
+ % (res1, ))
+
+ if page_shift != 12:
+ raise StreamError("Page shift expected to be 12. Got %d"
+ % (page_shift, ))
+
+ if major == 0:
+ self.info("Domain Header: legacy converted %s"
+ % (dhdr_type_to_str[gtype], ))
+ else:
+ self.info("Domain Header: %s from Xen %d.%d"
+ % (dhdr_type_to_str[gtype], major, minor))
+
+
+ def verify_record(self):
+ """ Verify an individual record """
+
+ rtype, length = self.unpack_exact(RH_FORMAT)
+
+ if rtype not in rec_type_to_str:
+ raise StreamError("Unrecognised record type 0x%x" % (rtype, ))
+
+ contentsz = (length + 7) & ~7
+ content = self.rdexact(contentsz)
+
+ if rtype != REC_TYPE_page_data:
+
+ if self.squashed_pagedata_records > 0:
+ self.info("Squashed %d Page Data records together"
+ % (self.squashed_pagedata_records, ))
+ self.squashed_pagedata_records = 0
+
+ self.info("Libxc Record: %s, length %d"
+ % (rec_type_to_str[rtype], length))
+
+ else:
+ self.squashed_pagedata_records += 1
+
+ padding = content[length:]
+ if padding != "\x00" * len(padding):
+ raise StreamError("Padding containing non0 bytes found")
+
+ if rtype not in record_verifiers:
+ raise RuntimeError("No verification function for libxc record '%s'"
+ % rec_type_to_str[rtype])
+ else:
+ record_verifiers[rtype](self, content[:length])
+
+ return rtype
+
+
+ def verify_record_end(self, content):
+ """ End record """
+
+ if len(content) != 0:
+ raise RecordError("End record with non-zero length")
+
+
+ def verify_record_page_data(self, content):
+ """ Page Data record """
+ minsz = calcsize(PAGE_DATA_FORMAT)
+
+ if len(content) <= minsz:
+ raise RecordError("PAGE_DATA record must be at least %d bytes long"
+ % (minsz, ))
+
+ count, res1 = unpack(PAGE_DATA_FORMAT, content[:minsz])
+
+ if res1 != 0:
+ raise StreamError("Reserved bits set in PAGE_DATA record 0x%04x"
+ % (res1, ))
+
+ pfnsz = count * 8
+ if (len(content) - minsz) < pfnsz:
+ raise RecordError("PAGE_DATA record must contain a pfn record for "
+ "each count")
+
+ pfns = list(unpack("=%dQ" % (count,), content[minsz:minsz + pfnsz]))
+
+ nr_pages = 0
+ for idx, pfn in enumerate(pfns):
+
+ if pfn & PAGE_DATA_PFN_RESZ_MASK:
+ raise RecordError("Reserved bits set in pfn[%d]: 0x%016x",
+ idx, pfn & PAGE_DATA_PFN_RESZ_MASK)
+
+ if pfn >> PAGE_DATA_TYPE_SHIFT in (5, 6, 7, 8):
+ raise RecordError("Invalid type value in pfn[%d]: 0x%016x",
+ idx, pfn & PAGE_DATA_TYPE_LTAB_MASK)
+
+ # We expect page data for each normal page or pagetable
+ if PAGE_DATA_TYPE_NOTAB <= (pfn & PAGE_DATA_TYPE_LTABTYPE_MASK) \
+ <= PAGE_DATA_TYPE_L4TAB:
+ nr_pages += 1
+
+ pagesz = nr_pages * 4096
+ if len(content) != minsz + pfnsz + pagesz:
+ raise RecordError("Expected %u + %u + %u, got %u"
+ % (minsz, pfnsz, pagesz, len(content)))
+
+
+ def verify_record_x86_pv_info(self, content):
+ """ x86 PV Info record """
+
+ expectedsz = calcsize(X86_PV_INFO_FORMAT)
+ if len(content) != expectedsz:
+ raise RecordError("x86_pv_info: expected length of %d, got %d"
+ % (expectedsz, len(content)))
+
+ width, levels, res1, res2 = unpack(X86_PV_INFO_FORMAT, content)
+
+ if width not in (4, 8):
+ raise RecordError("Expected width of 4 or 8, got %d" % (width, ))
+
+ if levels not in (3, 4):
+ raise RecordError("Expected levels of 3 or 4, got %d" % (levels, ))
+
+ if res1 != 0 or res2 != 0:
+ raise StreamError("Reserved bits set in X86_PV_INFO: 0x%04x 0x%08x"
+ % (res1, res2))
+
+ bitness = {4:32, 8:64}[width]
+ self.info(" %sbit guest, %d levels of pagetables" % (bitness, levels))
+
+
+ def verify_record_x86_pv_p2m_frames(self, content):
+ """ x86 PV p2m frames record """
+
+ if len(content) % 8 != 0:
+ raise RecordError("Length expected to be a multiple of 8, not %d"
+ % (len(content), ))
+
+ start, end = unpack("=II", content[:8])
+ self.info(" Start pfn 0x%x, End 0x%x" % (start, end))
+
+
+ def verify_record_x86_pv_vcpu_generic(self, content, name):
+ """ Generic for all REC_TYPE_x86_pv_vcpu_{basic,extended,xsave,msrs} """
+ minsz = calcsize(X86_PV_VCPU_HDR_FORMAT)
+
+ if len(content) <= minsz:
+ raise RecordError("X86_PV_VCPU_%s record length must be at least %d"
+ " bytes long" % (name, minsz))
+
+ vcpuid, res1 = unpack(X86_PV_VCPU_HDR_FORMAT, content[:minsz])
+
+ if res1 != 0:
+ raise StreamError(
+ "Reserved bits set in x86_pv_vcpu_%s record 0x%04x"
+ % (name, res1))
+
+ self.info(" vcpu%d %s context, %d bytes"
+ % (vcpuid, name, len(content) - minsz))
+
+
+ def verify_record_shared_info(self, content):
+ """ shared info record """
+
+ if len(content) != 4096:
+ raise RecordError("Length expected to be 4906 bytes, not %d"
+ % (len(content), ))
+
+
+ def verify_record_tsc_info(self, content):
+ """ tsc info record """
+
+ sz = calcsize(TSC_INFO_FORMAT)
+
+ if len(content) != sz:
+ raise RecordError("Length should be %u bytes" % (sz, ))
+
+ mode, khz, nsec, incarn, res1 = unpack(TSC_INFO_FORMAT, content)
+
+ if res1 != 0:
+ raise StreamError("Reserved bits set in TSC_INFO: 0x%08x"
+ % (res1, ))
+
+ self.info(" Mode %u, %u kHz, %u ns, incarnation %d"
+ % (mode, khz, nsec, incarn))
+
+
+ def verify_record_hvm_context(self, content):
+ """ hvm context record """
+
+ if len(content) == 0:
+ raise RecordError("Zero length HVM context")
+
+
+ def verify_record_hvm_params(self, content):
+ """ hvm params record """
+
+ sz = calcsize(HVM_PARAMS_FORMAT)
+
+ if len(content) < sz:
+ raise RecordError("Length should be at least %u bytes" % (sz, ))
+
+ count, rsvd = unpack(HVM_PARAMS_FORMAT, content[:sz])
+
+ if rsvd != 0:
+ raise RecordError("Reserved field not zero (0x%04x)" % (rsvd, ))
+
+ sz += count * calcsize(HVM_PARAMS_ENTRY_FORMAT)
+
+ if len(content) != sz:
+ raise RecordError("Length should be %u bytes" % (sz, ))
+
+
+ def verify_record_toolstack(self, _):
+ """ toolstack record """
+ raise DeprecationWarning("Found Toolstack record in stream")
+
+
+ def verify_record_verify(self, content):
+ """ verify record """
+
+ if len(content) != 0:
+ raise RecordError("Verify record with non-zero length")
+
+
+ def verify_record_checkpoint(self, content):
+ """ checkpoint record """
+
+ if len(content) != 0:
+ raise RecordError("Checkpoint record with non-zero length")
+
+
+record_verifiers = {
+ REC_TYPE_end:
+ VerifyLibxc.verify_record_end,
+ REC_TYPE_page_data:
+ VerifyLibxc.verify_record_page_data,
+
+ REC_TYPE_x86_pv_info:
+ VerifyLibxc.verify_record_x86_pv_info,
+ REC_TYPE_x86_pv_p2m_frames:
+ VerifyLibxc.verify_record_x86_pv_p2m_frames,
+
+ REC_TYPE_x86_pv_vcpu_basic:
+ lambda s, x:
+ VerifyLibxc.verify_record_x86_pv_vcpu_generic(s, x, "basic"),
+ REC_TYPE_x86_pv_vcpu_extended:
+ lambda s, x:
+ VerifyLibxc.verify_record_x86_pv_vcpu_generic(s, x, "extended"),
+ REC_TYPE_x86_pv_vcpu_xsave:
+ lambda s, x:
+ VerifyLibxc.verify_record_x86_pv_vcpu_generic(s, x, "xsave"),
+ REC_TYPE_x86_pv_vcpu_msrs:
+ lambda s, x:
+ VerifyLibxc.verify_record_x86_pv_vcpu_generic(s, x, "msrs"),
+
+ REC_TYPE_shared_info:
+ VerifyLibxc.verify_record_shared_info,
+ REC_TYPE_tsc_info:
+ VerifyLibxc.verify_record_tsc_info,
+
+ REC_TYPE_hvm_context:
+ VerifyLibxc.verify_record_hvm_context,
+ REC_TYPE_hvm_params:
+ VerifyLibxc.verify_record_hvm_params,
+ REC_TYPE_toolstack:
+ VerifyLibxc.verify_record_toolstack,
+ REC_TYPE_verify:
+ VerifyLibxc.verify_record_verify,
+ REC_TYPE_checkpoint:
+ VerifyLibxc.verify_record_checkpoint,
+ }
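+
+# Dispatch sketch (illustrative, mirroring the libxl-side verify_record()):
+# the verifier looks each record type up in this table and invokes the
+# handler with the VerifyLibxc instance as `self`, e.g.
+#   record_verifiers[rtype](self, content)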
diff --git a/tools/python/xen/migration/libxl.py b/tools/python/xen/migration/libxl.py
new file mode 100644
index 0000000..fc0acf6
--- /dev/null
+++ b/tools/python/xen/migration/libxl.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Libxl Migration v2 streams
+
+Record structures as per docs/specs/libxl-migration-stream.pandoc, and
+verification routines.
+"""
+
+import sys
+
+from struct import calcsize, unpack, unpack_from
+from xen.migration.verify import StreamError, RecordError, VerifyBase
+from xen.migration.libxc import VerifyLibxc
+
+# Header
+HDR_FORMAT = "!QII"
+
+HDR_IDENT = 0x4c6962786c466d74 # "LibxlFmt" in ASCII
+HDR_VERSION = 2
+
+HDR_OPT_BIT_ENDIAN = 0
+HDR_OPT_BIT_LEGACY = 1
+
+HDR_OPT_LE = (0 << HDR_OPT_BIT_ENDIAN)
+HDR_OPT_BE = (1 << HDR_OPT_BIT_ENDIAN)
+HDR_OPT_LEGACY = (1 << HDR_OPT_BIT_LEGACY)
+
+HDR_OPT_RESZ_MASK = 0xfffc
+
+# Records
+RH_FORMAT = "II"
+
+REC_TYPE_end = 0x00000000
+REC_TYPE_libxc_context = 0x00000001
+REC_TYPE_emulator_xenstore_data = 0x00000002
+REC_TYPE_emulator_context = 0x00000003
+REC_TYPE_checkpoint_end = 0x00000004
+
+rec_type_to_str = {
+ REC_TYPE_end : "End",
+ REC_TYPE_libxc_context : "Libxc context",
+ REC_TYPE_emulator_xenstore_data : "Emulator xenstore data",
+ REC_TYPE_emulator_context : "Emulator context",
+ REC_TYPE_checkpoint_end : "Checkpoint end",
+}
+
+# emulator_* header
+EMULATOR_HEADER_FORMAT = "II"
+
+EMULATOR_ID_unknown = 0x00000000
+EMULATOR_ID_qemu_trad = 0x00000001
+EMULATOR_ID_qemu_upstream = 0x00000002
+
+emulator_id_to_str = {
+ EMULATOR_ID_unknown : "Unknown",
+ EMULATOR_ID_qemu_trad : "Qemu Traditional",
+ EMULATOR_ID_qemu_upstream : "Qemu Upstream",
+}
+
+
+#
+# libxl format
+#
+
+LIBXL_QEMU_SIGNATURE = "DeviceModelRecord0002"
+LIBXL_QEMU_RECORD_HDR = "=%dsI" % (len(LIBXL_QEMU_SIGNATURE), )
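+# Illustrative only: unpack_from() splits such a device model header into
+# the signature and the payload length, e.g.
+#   sig, length = unpack_from(LIBXL_QEMU_RECORD_HDR, blob)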
+
+class VerifyLibxl(VerifyBase):
+ """ Verify a Libxl v2 stream """
+
+ def __init__(self, info, read):
+ VerifyBase.__init__(self, info, read)
+
+
+ def verify(self):
+ """ Verity a libxl stream """
+
+ self.verify_hdr()
+
+ while self.verify_record() != REC_TYPE_end:
+ pass
+
+
+ def verify_hdr(self):
+ """ Verify a Header """
+ ident, version, options = self.unpack_exact(HDR_FORMAT)
+
+ if ident != HDR_IDENT:
+ raise StreamError("Bad image id: Expected 0x%x, got 0x%x"
+ % (HDR_IDENT, ident))
+
+ if version != HDR_VERSION:
+ raise StreamError("Unknown image version: Expected %d, got %d"
+ % (HDR_VERSION, version))
+
+ if options & HDR_OPT_RESZ_MASK:
+ raise StreamError("Reserved bits set in image options field: 0x%x"
+ % (options & HDR_OPT_RESZ_MASK))
+
+ if ( (sys.byteorder == "little") and
+ ((options & HDR_OPT_BE) != HDR_OPT_LE) ):
+ raise StreamError(
+ "Stream is not native endianness - unable to validate")
+
+ endian = ["little", "big"][options & HDR_OPT_BE]
+
+ if options & HDR_OPT_LEGACY:
+ self.info("Libxl Header: %s endian, legacy converted" % (endian, ))
+ else:
+ self.info("Libxl Header: %s endian" % (endian, ))
+
+
+ def verify_record(self):
+ """ Verify an individual record """
+ rtype, length = self.unpack_exact(RH_FORMAT)
+
+ if rtype not in rec_type_to_str:
+ raise StreamError("Unrecognised record type %x" % (rtype, ))
+
+ self.info("Libxl Record: %s, length %d"
+ % (rec_type_to_str[rtype], length))
+
+ contentsz = (length + 7) & ~7
+ content = self.rdexact(contentsz)
+
+ padding = content[length:]
+ if padding != "\x00" * len(padding):
+ raise StreamError("Padding containing non0 bytes found")
+
+ if rtype not in record_verifiers:
+ raise RuntimeError("No verification function for libxl record '%s'"
+ % rec_type_to_str[rtype])
+
+ record_verifiers[rtype](self, content[:length])
+
+ return rtype
+
+
+ def verify_record_end(self, content):
+ """ End record """
+
+ if len(content) != 0:
+ raise RecordError("End record with non-zero length")
+
+
+ def verify_record_libxc_context(self, content):
+ """ Libxc context record """
+
+ if len(content) != 0:
+ raise RecordError("Libxc context record with non-zero length")
+
+ # Verify the libxc stream, as we can't seek forwards through it
+ VerifyLibxc(self.info, self.read).verify()
+
+
+ def verify_record_emulator_xenstore_data(self, content):
+ """ Emulator Xenstore Data record """
+ minsz = calcsize(EMULATOR_HEADER_FORMAT)
+
+ if len(content) < minsz:
+ raise RecordError("Length must be at least %d bytes, got %d"
+ % (minsz, len(content)))
+
+ emu_id, emu_idx = unpack(EMULATOR_HEADER_FORMAT, content[:minsz])
+
+ if emu_id not in emulator_id_to_str:
+ raise RecordError("Unrecognised emulator id 0x%x" % (emu_id, ))
+
+ self.info("Emulator Xenstore Data (%s, idx %d)"
+ % (emulator_id_to_str[emu_id], emu_idx))
+
+ # Chop off the emulator header
+ content = content[minsz:]
+
+ if len(content):
+
+ if content[-1] != '\x00':
+ raise RecordError("Data not NUL terminated")
+
+ # Split without the final NUL, to get an even number of parts
+ parts = content[:-1].split("\x00")
+
+ if (len(parts) % 2) != 0:
+ raise RecordError("Expected an even number of strings, got %d"
+ % (len(parts), ))
+
+ for key, val in zip(parts[0::2], parts[1::2]):
+ self.info(" '%s' = '%s'" % (key, val))
+
+
+ def verify_record_emulator_context(self, content):
+ """ Emulator Context record """
+ minsz = calcsize(EMULATOR_HEADER_FORMAT)
+
+ if len(content) < minsz:
+ raise RecordError("Length must be at least %d bytes, got %d"
+ % (minsz, len(content)))
+
+ emu_id, emu_idx = unpack(EMULATOR_HEADER_FORMAT, content[:minsz])
+
+ if emu_id not in emulator_id_to_str:
+ raise RecordError("Unrecognised emulator id 0x%x" % (emu_id, ))
+
+ self.info(" Index %d, type %s" % (emu_idx, emulator_id_to_str[emu_id]))
+
+
+ def verify_record_checkpoint_end(self, content):
+ """ Checkpoint end record """
+
+ if len(content) != 0:
+ raise RecordError("Checkpoint end record with non-zero length")
+
+
+record_verifiers = {
+ REC_TYPE_end:
+ VerifyLibxl.verify_record_end,
+ REC_TYPE_libxc_context:
+ VerifyLibxl.verify_record_libxc_context,
+ REC_TYPE_emulator_xenstore_data:
+ VerifyLibxl.verify_record_emulator_xenstore_data,
+ REC_TYPE_emulator_context:
+ VerifyLibxl.verify_record_emulator_context,
+ REC_TYPE_checkpoint_end:
+ VerifyLibxl.verify_record_checkpoint_end,
+}
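+
+# Minimal driver sketch (an assumption, not shipped with these tools):
+# `info` is any callable taking a string, and `read` must behave like
+# file.read.
+#
+#   import sys
+#   from xen.migration.libxl import VerifyLibxl
+#
+#   def info(msg):
+#       sys.stdout.write(msg + "\n")
+#
+#   with open("stream.bin", "rb") as f:
+#       VerifyLibxl(info, f.read).verify()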
diff --git a/tools/python/xen/migration/public.py b/tools/python/xen/migration/public.py
new file mode 100644
index 0000000..fab2f84
--- /dev/null
+++ b/tools/python/xen/migration/public.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Xen public ABI constants, used in migration
+"""
+
+HVM_PARAM_STORE_PFN = 1
+HVM_PARAM_IOREQ_PFN = 5
+HVM_PARAM_BUFIOREQ_PFN = 6
+HVM_PARAM_VIRIDIAN = 9
+HVM_PARAM_IDENT_PT = 12
+HVM_PARAM_VM86_TSS = 15
+HVM_PARAM_CONSOLE_PFN = 17
+HVM_PARAM_ACPI_IOPORTS_LOCATION = 19
+HVM_PARAM_PAGING_RING_PFN = 27
+HVM_PARAM_MONITOR_RING_PFN = 28
+HVM_PARAM_SHARING_RING_PFN = 29
+HVM_PARAM_IOREQ_SERVER_PFN = 32
+HVM_PARAM_NR_IOREQ_SERVER_PAGES = 33
+HVM_PARAM_VM_GENERATION_ID_ADDR = 34
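+
+# These mirror the HVM_PARAM_* indexes in xen/include/public/hvm/params.h.
+# Illustrative only: a reverse map for labelling (index, value) pairs from
+# an HVM_PARAMS record could be built as
+#   hvm_param_names = dict((v, k) for k, v in globals().items()
+#                          if k.startswith("HVM_PARAM_"))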
diff --git a/tools/python/xen/migration/tests.py b/tools/python/xen/migration/tests.py
new file mode 100644
index 0000000..026cf38
--- /dev/null
+++ b/tools/python/xen/migration/tests.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Unit tests for migration v2 streams
+"""
+
+import unittest
+
+from struct import calcsize
+
+from xen.migration import libxc, libxl
+
+class TestLibxc(unittest.TestCase):
+
+ def test_format_sizes(self):
+
+ for fmt, sz in ( (libxc.IHDR_FORMAT, 24),
+ (libxc.DHDR_FORMAT, 16),
+ (libxc.RH_FORMAT, 8),
+
+ (libxc.PAGE_DATA_FORMAT, 8),
+ (libxc.X86_PV_INFO_FORMAT, 8),
+ (libxc.X86_PV_P2M_FRAMES_FORMAT, 8),
+ (libxc.X86_PV_VCPU_HDR_FORMAT, 8),
+ (libxc.TSC_INFO_FORMAT, 24),
+ (libxc.HVM_PARAMS_ENTRY_FORMAT, 16),
+ (libxc.HVM_PARAMS_FORMAT, 8),
+ ):
+ self.assertEqual(calcsize(fmt), sz)
+
+
+class TestLibxl(unittest.TestCase):
+
+ def test_format_sizes(self):
+
+ for fmt, sz in ( (libxl.HDR_FORMAT, 16),
+ (libxl.RH_FORMAT, 8),
+
+ (libxl.EMULATOR_HEADER_FORMAT, 8),
+ ):
+ self.assertEqual(calcsize(fmt), sz)
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+
+ suite.addTest(unittest.makeSuite(TestLibxc))
+ suite.addTest(unittest.makeSuite(TestLibxl))
+
+ return suite
+
+if __name__ == "__main__":
+ unittest.main()
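+
+# Example invocations (assuming the xen.migration package is importable):
+#   python tests.py
+# or, through the test_suite() hook:
+#   python -c "import unittest; from xen.migration.tests import test_suite; \
+#       unittest.TextTestRunner().run(test_suite())"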
diff --git a/tools/python/xen/migration/verify.py b/tools/python/xen/migration/verify.py
new file mode 100644
index 0000000..7a42dbf
--- /dev/null
+++ b/tools/python/xen/migration/verify.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Common verification infrastructure for v2 streams
+"""
+
+from struct import calcsize, unpack
+
+class StreamError(StandardError):
+ """Error with the stream"""
+ pass
+
+class RecordError(StandardError):
+ """Error with a record in the stream"""
+ pass
+
+
+class VerifyBase(object):
+
+ def __init__(self, info, read):
+
+ self.info = info
+ self.read = read
+
+ def rdexact(self, nr_bytes):
+ """Read exactly nr_bytes from the stream"""
+ data = self.read(nr_bytes)
+ if len(data) != nr_bytes:
+ raise IOError("Stream truncated")
+ return data
+
+ def unpack_exact(self, fmt):
+ """Unpack a struct format string from the stream"""
+ sz = calcsize(fmt)
+ return unpack(fmt, self.rdexact(sz))
+
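+# Minimal subclass sketch (illustrative): rdexact() and unpack_exact()
+# consume bytes from the supplied `read` callable, so a concrete verifier
+# only needs to drive them, e.g.
+#
+#   class VerifyExample(VerifyBase):
+#       def verify(self):
+#           ident, version, options = self.unpack_exact("!QII")
+#           self.info("ident 0x%x, version %u" % (ident, version))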
diff --git a/tools/python/xen/migration/xl.py b/tools/python/xen/migration/xl.py
new file mode 100644
index 0000000..978e744
--- /dev/null
+++ b/tools/python/xen/migration/xl.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+XL migration stream format
+"""
+
+MAGIC = "Xen saved domain, xl format\n \0 \r"
+
+HEADER_FORMAT = "=IIII"
+
+MANDATORY_FLAG_STREAMV2 = 2
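+
+# Layout sketch (an assumption based only on the constants above): an xl
+# save file starts with the MAGIC string, followed by four native-endian
+# uint32 fields packed as HEADER_FORMAT.
+#
+#   from struct import calcsize, unpack
+#
+#   def read_xl_header(read):
+#       if read(len(MAGIC)) != MAGIC:
+#           raise ValueError("Not an xl-format stream")
+#       return unpack(HEADER_FORMAT, read(calcsize(HEADER_FORMAT)))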
diff --git a/tools/tests/mce-test/Makefile b/tools/tests/mce-test/Makefile
index 7c80eb3..07a774a 100644
--- a/tools/tests/mce-test/Makefile
+++ b/tools/tests/mce-test/Makefile
@@ -1,7 +1,10 @@
-.PHONY: all clean
+.PHONY: all clean distclean
all:
$(MAKE) -C tools
clean:
$(MAKE) -C tools clean
+
+distclean:
+ $(MAKE) -C tools distclean
diff --git a/tools/tests/mce-test/cases/srao_llc/dom0/cases.sh b/tools/tests/mce-test/cases/srao_llc/dom0/cases.sh
index 8f63eaa..c540f64 100644
--- a/tools/tests/mce-test/cases/srao_llc/dom0/cases.sh
+++ b/tools/tests/mce-test/cases/srao_llc/dom0/cases.sh
@@ -12,8 +12,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/cases/srao_llc/guest/cases.sh b/tools/tests/mce-test/cases/srao_llc/guest/cases.sh
index 16b0b8e..47a7ee4 100644
--- a/tools/tests/mce-test/cases/srao_llc/guest/cases.sh
+++ b/tools/tests/mce-test/cases/srao_llc/guest/cases.sh
@@ -12,8 +12,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/cases/srao_llc/xen/cases.sh b/tools/tests/mce-test/cases/srao_llc/xen/cases.sh
index 49e3e38..1d8e02f 100644
--- a/tools/tests/mce-test/cases/srao_llc/xen/cases.sh
+++ b/tools/tests/mce-test/cases/srao_llc/xen/cases.sh
@@ -12,8 +12,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/cases/srao_mem/dom0/cases.sh b/tools/tests/mce-test/cases/srao_mem/dom0/cases.sh
index ea896d5..22d4a00 100644
--- a/tools/tests/mce-test/cases/srao_mem/dom0/cases.sh
+++ b/tools/tests/mce-test/cases/srao_mem/dom0/cases.sh
@@ -12,8 +12,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/cases/srao_mem/guest/cases.sh b/tools/tests/mce-test/cases/srao_mem/guest/cases.sh
index 2d6e054..7ab4523 100644
--- a/tools/tests/mce-test/cases/srao_mem/guest/cases.sh
+++ b/tools/tests/mce-test/cases/srao_mem/guest/cases.sh
@@ -12,8 +12,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/cases/srao_mem/xen/cases.sh b/tools/tests/mce-test/cases/srao_mem/xen/cases.sh
index 7f0e213..7ae49a8 100644
--- a/tools/tests/mce-test/cases/srao_mem/xen/cases.sh
+++ b/tools/tests/mce-test/cases/srao_mem/xen/cases.sh
@@ -12,8 +12,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/cases/ucna_llc/dom0/cases.sh b/tools/tests/mce-test/cases/ucna_llc/dom0/cases.sh
index ab30ffc..808f007 100644
--- a/tools/tests/mce-test/cases/ucna_llc/dom0/cases.sh
+++ b/tools/tests/mce-test/cases/ucna_llc/dom0/cases.sh
@@ -12,8 +12,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/cases/ucna_llc/guest/cases.sh b/tools/tests/mce-test/cases/ucna_llc/guest/cases.sh
index 5212770..0ca4e2c 100644
--- a/tools/tests/mce-test/cases/ucna_llc/guest/cases.sh
+++ b/tools/tests/mce-test/cases/ucna_llc/guest/cases.sh
@@ -12,8 +12,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/cases/ucna_llc/xen/cases.sh b/tools/tests/mce-test/cases/ucna_llc/xen/cases.sh
index d1f6db5..c73a2f6 100644
--- a/tools/tests/mce-test/cases/ucna_llc/xen/cases.sh
+++ b/tools/tests/mce-test/cases/ucna_llc/xen/cases.sh
@@ -12,8 +12,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/config/setup.conf b/tools/tests/mce-test/config/setup.conf
index 40db017..05f754d 100644
--- a/tools/tests/mce-test/config/setup.conf
+++ b/tools/tests/mce-test/config/setup.conf
@@ -14,8 +14,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/lib/xen-mceinj-tool.sh b/tools/tests/mce-test/lib/xen-mceinj-tool.sh
index 1d25d2a..c0a3b29 100644
--- a/tools/tests/mce-test/lib/xen-mceinj-tool.sh
+++ b/tools/tests/mce-test/lib/xen-mceinj-tool.sh
@@ -14,8 +14,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#
# Author: Xudong Hao <xudong.hao at intel.com>
#
diff --git a/tools/tests/mce-test/tools/Makefile b/tools/tests/mce-test/tools/Makefile
index aba7177..924e4dd 100644
--- a/tools/tests/mce-test/tools/Makefile
+++ b/tools/tests/mce-test/tools/Makefile
@@ -13,11 +13,14 @@ CFLAGS += $(CFLAGS_xeninclude)
all: xen-mceinj
install: xen-mceinj
- $(INSTALL_PROG) xen-mceinj $(DESTDIR)$(SBINDIR)
+ $(INSTALL_PROG) xen-mceinj $(DESTDIR)$(sbindir)
.PHONY: clean
clean:
$(RM) *.o xen-mceinj
+.PHONY: distclean
+distclean: clean
+
xen-mceinj: xen-mceinj.o Makefile
$(CC) -o $@ $< $(LDFLAGS) $(LDLIBS_libxenctrl) $(LDLIBS_libxenguest) $(LDLIBS_libxenstore)
diff --git a/tools/tests/mce-test/tools/xen-mceinj.c b/tools/tests/mce-test/tools/xen-mceinj.c
index 8ad045f..e2e49cb 100644
--- a/tools/tests/mce-test/tools/xen-mceinj.c
+++ b/tools/tests/mce-test/tools/xen-mceinj.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Authors: Yunhong Jiang <yunhong.jiang at intel.com>
* Haicheng Li <haicheng.li at intel.com>
diff --git a/tools/tests/mem-sharing/Makefile b/tools/tests/mem-sharing/Makefile
index 1354502..d89e283 100644
--- a/tools/tests/mem-sharing/Makefile
+++ b/tools/tests/mem-sharing/Makefile
@@ -20,6 +20,9 @@ build: $(TARGETS)
clean:
$(RM) *.o $(TARGETS) *~ $(DEPS)
+.PHONY: distclean
+distclean: clean
+
memshrtool: memshrtool.o
$(CC) -o $@ $< $(LDFLAGS) $(LDLIBS_libxenctrl)
diff --git a/tools/tests/mem-sharing/memshrtool.c b/tools/tests/mem-sharing/memshrtool.c
index db44294..6454bc3 100644
--- a/tools/tests/mem-sharing/memshrtool.c
+++ b/tools/tests/mem-sharing/memshrtool.c
@@ -55,11 +55,19 @@ int main(int argc, const char** argv)
if( !strcasecmp(cmd, "info") )
{
+ long rc;
if( argc != 2 )
return usage(argv[0]);
- printf("used = %ld\n", xc_sharing_used_frames(xch));
- printf("freed = %ld\n", xc_sharing_freed_pages(xch));
+ rc = xc_sharing_used_frames(xch);
+ if ( rc < 0 )
+ return 1;
+
+ printf("used = %ld\n", rc);
+ rc = xc_sharing_freed_pages(xch);
+ if ( rc < 0 )
+ return 1;
+ printf("freed = %ld\n", rc);
}
else if( !strcasecmp(cmd, "enable") )
{
diff --git a/tools/tests/regression/Makefile b/tools/tests/regression/Makefile
index 6e91023..70d524a 100644
--- a/tools/tests/regression/Makefile
+++ b/tools/tests/regression/Makefile
@@ -22,8 +22,8 @@ check-python-syntax: runtime-environment
.PHONY: runtime-environment
runtime-environment: $(PYTHON_VERSIONS)
-.PHONY: dist-clean
-dist-clean:
+.PHONY: distclean
+distclean:
rm -fr $(REG_TEST_DIR)/installed $(REG_TEST_DIR)/downloads \
$(REG_TEST_DIR)/build
diff --git a/tools/tests/utests/run_all_tests.py b/tools/tests/utests/run_all_tests.py
index 3e302dd..7318c81 100644
--- a/tools/tests/utests/run_all_tests.py
+++ b/tools/tests/utests/run_all_tests.py
@@ -9,8 +9,7 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# License along with this library; If not, see <http://www.gnu.org/licenses/>.
#============================================================================
# Copyright (C) 2009 flonatel GmbH & Co. KG
#============================================================================
diff --git a/tools/tests/vhpet/Makefile b/tools/tests/vhpet/Makefile
index 763409d..cb88dd0 100644
--- a/tools/tests/vhpet/Makefile
+++ b/tools/tests/vhpet/Makefile
@@ -26,6 +26,9 @@ $(TARGET): hpet.c main.c hpet.h emul.h Makefile
clean:
rm -rf $(TARGET) $(TARGET).out *.o *~ core* hpet.h hpet.c
+.PHONY: distclean
+distclean: clean
+
.PHONY: install
install:
diff --git a/tools/tests/vhpet/emul.h b/tools/tests/vhpet/emul.h
index 09e4611..383acff 100644
--- a/tools/tests/vhpet/emul.h
+++ b/tools/tests/vhpet/emul.h
@@ -237,11 +237,11 @@ typedef int (*hvm_mmio_write_t)(struct vcpu *v,
typedef int (*hvm_mmio_check_t)(struct vcpu *v, unsigned long addr);
-struct hvm_mmio_handler
+struct hvm_mmio_ops
{
- hvm_mmio_check_t check_handler;
- hvm_mmio_read_t read_handler;
- hvm_mmio_write_t write_handler;
+ hvm_mmio_check_t check;
+ hvm_mmio_read_t read;
+ hvm_mmio_write_t write;
};
/* Marshalling and unmarshalling uses a buffer with size and cursor. */
diff --git a/tools/tests/vhpet/main.c b/tools/tests/vhpet/main.c
index fbd7510..6fe65ea 100644
--- a/tools/tests/vhpet/main.c
+++ b/tools/tests/vhpet/main.c
@@ -70,7 +70,7 @@ static int skip_error_on_load;
static char *global_thousep;
-extern const struct hvm_mmio_handler hpet_mmio_handler;
+extern const struct hvm_mmio_ops hpet_mmio_ops;
struct domain dom1;
struct vcpu vcpu0;
@@ -297,13 +297,13 @@ void udelay(int w)
unsigned int hpet_readl(unsigned long a)
{
unsigned long ret = 0;
- hpet_mmio_handler.read_handler(current, a, 4, &ret);
+ hpet_mmio_ops.read(current, a, 4, &ret);
return ret;
}
void hpet_writel(unsigned long d, unsigned long a)
{
- hpet_mmio_handler.write_handler(current, a, 4, d);
+ hpet_mmio_ops.write(current, a, 4, d);
return;
}
diff --git a/tools/tests/x86_emulator/Makefile b/tools/tests/x86_emulator/Makefile
index 73517b7..b52f227 100644
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -34,6 +34,9 @@ $(TARGET): x86_emulate.o test_x86_emulator.o
clean:
rm -rf $(TARGET) *.o *~ core blowfish.h blowfish.bin x86_emulate
+.PHONY: distclean
+distclean: clean
+
.PHONY: install
install:
diff --git a/tools/tests/x86_emulator/blowfish.c b/tools/tests/x86_emulator/blowfish.c
index 8b9280c..8f0939c 100644
--- a/tools/tests/x86_emulator/blowfish.c
+++ b/tools/tests/x86_emulator/blowfish.c
@@ -12,8 +12,7 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdint.h>
@@ -21,7 +20,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
uint64_t blowfish_test(uint64_t input);
asm (
- ".globl _start\n"
+ "\t.text\n"
+ "\t.globl _start\n"
"_start:\n"
#if defined(__i386__)
"push %edx; push %eax; "
diff --git a/tools/tests/x86_emulator/test_x86_emulator.c b/tools/tests/x86_emulator/test_x86_emulator.c
index 6f67fc7..1b78bf7 100644
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -288,7 +288,7 @@ int main(int argc, char **argv)
rc = x86_emulate(&ctxt, &emulops);
if ( (rc != X86EMUL_OKAY) ||
(*res != 0x923456AA) ||
- ((regs.eflags&0x240) != 0x200) ||
+ ((regs.eflags & 0xad5) != 0xa91) ||
(regs.eax != 0xAABBCCAA) ||
(regs.ecx != 0xFF) ||
(regs.eip != (unsigned long)&instr[4]) )
@@ -934,3 +934,12 @@ int main(int argc, char **argv)
printf("failed!\n");
return 1;
}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/tests/x86_emulator/x86_emulate.c b/tools/tests/x86_emulator/x86_emulate.c
index ef9bfe9..0b3b34a 100644
--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -17,4 +17,8 @@ typedef bool bool_t;
#define __packed __attribute__((packed))
#include "x86_emulate/x86_emulate.h"
+
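+/*
+ * In this test harness, stubs execute directly from their buffer:
+ * get_stub() records the buffer address and put_stub() is a no-op.
+ */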
+#define get_stub(stb) ((void *)((stb).addr = (uintptr_t)(stb).buf))
+#define put_stub(stb)
+
#include "x86_emulate/x86_emulate.c"
diff --git a/tools/tests/xen-access/Makefile b/tools/tests/xen-access/Makefile
index 65eef99..f810543 100644
--- a/tools/tests/xen-access/Makefile
+++ b/tools/tests/xen-access/Makefile
@@ -7,8 +7,8 @@ CFLAGS += $(CFLAGS_libxenctrl)
CFLAGS += $(CFLAGS_libxenguest)
CFLAGS += $(CFLAGS_xeninclude)
-TARGETS-y :=
-TARGETS-$(CONFIG_X86) += xen-access
+TARGETS-y :=
+TARGETS-$(HAS_MEM_ACCESS) := xen-access
TARGETS := $(TARGETS-y)
.PHONY: all
@@ -21,6 +21,9 @@ build: $(TARGETS)
clean:
$(RM) *.o $(TARGETS) *~ $(DEPS)
+.PHONY: distclean
+distclean: clean
+
xen-access: xen-access.o Makefile
$(CC) -o $@ $< $(LDFLAGS) $(LDLIBS_libxenctrl) $(LDLIBS_libxenguest)
diff --git a/tools/tests/xen-access/xen-access.c b/tools/tests/xen-access/xen-access.c
index 6cb382d..a52ca6e 100644
--- a/tools/tests/xen-access/xen-access.c
+++ b/tools/tests/xen-access/xen-access.c
@@ -39,78 +39,34 @@
#include <sys/poll.h>
#include <xenctrl.h>
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
+
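+/*
+ * Guest RAM on ARM starts at GUEST_RAM0_BASE rather than at pfn 0, so
+ * derive the first pfn from it; x86 guests start at pfn 0.
+ */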
+#if defined(__arm__) || defined(__aarch64__)
+#include <xen/arch-arm.h>
+#define START_PFN (GUEST_RAM0_BASE >> 12)
+#elif defined(__i386__) || defined(__x86_64__)
+#define START_PFN 0ULL
+#endif
#define DPRINTF(a, b...) fprintf(stderr, a, ## b)
#define ERROR(a, b...) fprintf(stderr, a "\n", ## b)
#define PERROR(a, b...) fprintf(stderr, a ": %s\n", ## b, strerror(errno))
-/* Spinlock and mem event definitions */
-
-#define SPIN_LOCK_UNLOCKED 0
-
-#define ADDR (*(volatile long *) addr)
-/**
- * test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
- */
-static inline int test_and_set_bit(int nr, volatile void *addr)
-{
- int oldbit;
-
- asm volatile (
- "btsl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
- return oldbit;
-}
-
-typedef int spinlock_t;
-
-static inline void spin_lock(spinlock_t *lock)
-{
- while ( test_and_set_bit(1, lock) );
-}
-
-static inline void spin_lock_init(spinlock_t *lock)
-{
- *lock = SPIN_LOCK_UNLOCKED;
-}
-
-static inline void spin_unlock(spinlock_t *lock)
-{
- *lock = SPIN_LOCK_UNLOCKED;
-}
-
-static inline int spin_trylock(spinlock_t *lock)
-{
- return !test_and_set_bit(1, lock);
-}
-
-#define mem_event_ring_lock_init(_m) spin_lock_init(&(_m)->ring_lock)
-#define mem_event_ring_lock(_m) spin_lock(&(_m)->ring_lock)
-#define mem_event_ring_unlock(_m) spin_unlock(&(_m)->ring_lock)
-
-typedef struct mem_event {
+typedef struct vm_event {
domid_t domain_id;
xc_evtchn *xce_handle;
int port;
- mem_event_back_ring_t back_ring;
+ vm_event_back_ring_t back_ring;
uint32_t evtchn_port;
void *ring_page;
- spinlock_t ring_lock;
-} mem_event_t;
+} vm_event_t;
typedef struct xenaccess {
xc_interface *xc_handle;
- xc_domaininfo_t *domain_info;
+ xen_pfn_t max_gpfn;
- mem_event_t mem_event;
+ vm_event_t vm_event;
} xenaccess_t;
static int interrupted;
@@ -170,37 +126,40 @@ int xenaccess_teardown(xc_interface *xch, xenaccess_t *xenaccess)
return 0;
/* Tear down domain xenaccess in Xen */
- if ( xenaccess->mem_event.ring_page )
- munmap(xenaccess->mem_event.ring_page, XC_PAGE_SIZE);
+ if ( xenaccess->vm_event.ring_page )
+ munmap(xenaccess->vm_event.ring_page, XC_PAGE_SIZE);
if ( mem_access_enable )
{
- rc = xc_mem_access_disable(xenaccess->xc_handle,
- xenaccess->mem_event.domain_id);
+ rc = xc_monitor_disable(xenaccess->xc_handle,
+ xenaccess->vm_event.domain_id);
if ( rc != 0 )
{
ERROR("Error tearing down domain xenaccess in xen");
+ return rc;
}
}
/* Unbind VIRQ */
if ( evtchn_bind )
{
- rc = xc_evtchn_unbind(xenaccess->mem_event.xce_handle,
- xenaccess->mem_event.port);
+ rc = xc_evtchn_unbind(xenaccess->vm_event.xce_handle,
+ xenaccess->vm_event.port);
if ( rc != 0 )
{
ERROR("Error unbinding event port");
+ return rc;
}
}
/* Close event channel */
if ( evtchn_open )
{
- rc = xc_evtchn_close(xenaccess->mem_event.xce_handle);
+ rc = xc_evtchn_close(xenaccess->vm_event.xce_handle);
if ( rc != 0 )
{
ERROR("Error closing event channel");
+ return rc;
}
}
@@ -209,10 +168,10 @@ int xenaccess_teardown(xc_interface *xch, xenaccess_t *xenaccess)
if ( rc != 0 )
{
ERROR("Error closing connection to xen");
+ return rc;
}
xenaccess->xc_handle = NULL;
- free(xenaccess->domain_info);
free(xenaccess);
return 0;
@@ -239,17 +198,14 @@ xenaccess_t *xenaccess_init(xc_interface **xch_r, domid_t domain_id)
xenaccess->xc_handle = xch;
/* Set domain id */
- xenaccess->mem_event.domain_id = domain_id;
-
- /* Initialise lock */
- mem_event_ring_lock_init(&xenaccess->mem_event);
+ xenaccess->vm_event.domain_id = domain_id;
/* Enable mem_access */
- xenaccess->mem_event.ring_page =
- xc_mem_access_enable(xenaccess->xc_handle,
- xenaccess->mem_event.domain_id,
- &xenaccess->mem_event.evtchn_port);
- if ( xenaccess->mem_event.ring_page == NULL )
+ xenaccess->vm_event.ring_page =
+ xc_monitor_enable(xenaccess->xc_handle,
+ xenaccess->vm_event.domain_id,
+ &xenaccess->vm_event.evtchn_port);
+ if ( xenaccess->vm_event.ring_page == NULL )
{
switch ( errno ) {
case EBUSY:
@@ -267,8 +223,8 @@ xenaccess_t *xenaccess_init(xc_interface **xch_r, domid_t domain_id)
mem_access_enable = 1;
/* Open event channel */
- xenaccess->mem_event.xce_handle = xc_evtchn_open(NULL, 0);
- if ( xenaccess->mem_event.xce_handle == NULL )
+ xenaccess->vm_event.xce_handle = xc_evtchn_open(NULL, 0);
+ if ( xenaccess->vm_event.xce_handle == NULL )
{
ERROR("Failed to open event channel");
goto err;
@@ -276,58 +232,71 @@ xenaccess_t *xenaccess_init(xc_interface **xch_r, domid_t domain_id)
evtchn_open = 1;
/* Bind event notification */
- rc = xc_evtchn_bind_interdomain(xenaccess->mem_event.xce_handle,
- xenaccess->mem_event.domain_id,
- xenaccess->mem_event.evtchn_port);
+ rc = xc_evtchn_bind_interdomain(xenaccess->vm_event.xce_handle,
+ xenaccess->vm_event.domain_id,
+ xenaccess->vm_event.evtchn_port);
if ( rc < 0 )
{
ERROR("Failed to bind event channel");
goto err;
}
evtchn_bind = 1;
- xenaccess->mem_event.port = rc;
+ xenaccess->vm_event.port = rc;
/* Initialise ring */
- SHARED_RING_INIT((mem_event_sring_t *)xenaccess->mem_event.ring_page);
- BACK_RING_INIT(&xenaccess->mem_event.back_ring,
- (mem_event_sring_t *)xenaccess->mem_event.ring_page,
+ SHARED_RING_INIT((vm_event_sring_t *)xenaccess->vm_event.ring_page);
+ BACK_RING_INIT(&xenaccess->vm_event.back_ring,
+ (vm_event_sring_t *)xenaccess->vm_event.ring_page,
XC_PAGE_SIZE);
- /* Get domaininfo */
- xenaccess->domain_info = malloc(sizeof(xc_domaininfo_t));
- if ( xenaccess->domain_info == NULL )
- {
- ERROR("Error allocating memory for domain info");
- goto err;
- }
+ /* Get max_gpfn */
+ rc = xc_domain_maximum_gpfn(xenaccess->xc_handle,
+ xenaccess->vm_event.domain_id,
+ &xenaccess->max_gpfn);
- rc = xc_domain_getinfolist(xenaccess->xc_handle, domain_id, 1,
- xenaccess->domain_info);
- if ( rc != 1 )
+ if ( rc )
{
- ERROR("Error getting domain info");
+ ERROR("Failed to get max gpfn");
goto err;
}
- DPRINTF("max_pages = %"PRIx64"\n", xenaccess->domain_info->max_pages);
+ DPRINTF("max_gpfn = %"PRI_xen_pfn"\n", xenaccess->max_gpfn);
return xenaccess;
err:
- xenaccess_teardown(xch, xenaccess);
+ rc = xenaccess_teardown(xch, xenaccess);
+ if ( rc )
+ {
+ ERROR("Failed to teardown xenaccess structure!\n");
+ }
err_iface:
return NULL;
}
-int get_request(mem_event_t *mem_event, mem_event_request_t *req)
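+/* Toggle hardware singlestepping of one vcpu via the debug domctl. */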
+static inline
+int control_singlestep(
+ xc_interface *xch,
+ domid_t domain_id,
+ unsigned long vcpu,
+ bool enable)
{
- mem_event_back_ring_t *back_ring;
- RING_IDX req_cons;
+ uint32_t op = enable ?
+ XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON : XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF;
- mem_event_ring_lock(mem_event);
+ return xc_domain_debug_control(xch, domain_id, op, vcpu);
+}
- back_ring = &mem_event->back_ring;
+/*
+ * Note that this function is not thread safe.
+ */
+static void get_request(vm_event_t *vm_event, vm_event_request_t *req)
+{
+ vm_event_back_ring_t *back_ring;
+ RING_IDX req_cons;
+
+ back_ring = &vm_event->back_ring;
req_cons = back_ring->req_cons;
/* Copy request */
@@ -337,20 +306,17 @@ int get_request(mem_event_t *mem_event, mem_event_request_t *req)
/* Update ring */
back_ring->req_cons = req_cons;
back_ring->sring->req_event = req_cons + 1;
-
- mem_event_ring_unlock(mem_event);
-
- return 0;
}
-static int put_response(mem_event_t *mem_event, mem_event_response_t *rsp)
+/*
+ * Note that this function is not thread safe.
+ */
+static void put_response(vm_event_t *vm_event, vm_event_response_t *rsp)
{
- mem_event_back_ring_t *back_ring;
+ vm_event_back_ring_t *back_ring;
RING_IDX rsp_prod;
- mem_event_ring_lock(mem_event);
-
- back_ring = &mem_event->back_ring;
+ back_ring = &vm_event->back_ring;
rsp_prod = back_ring->rsp_prod_pvt;
/* Copy response */
@@ -360,39 +326,19 @@ static int put_response(mem_event_t *mem_event, mem_event_response_t *rsp)
/* Update ring */
back_ring->rsp_prod_pvt = rsp_prod;
RING_PUSH_RESPONSES(back_ring);
-
- mem_event_ring_unlock(mem_event);
-
- return 0;
-}
-
-static int xenaccess_resume_page(xenaccess_t *paging, mem_event_response_t *rsp)
-{
- int ret;
-
- /* Put the page info on the ring */
- ret = put_response(&paging->mem_event, rsp);
- if ( ret != 0 )
- goto out;
-
- /* Tell Xen page is ready */
- ret = xc_mem_access_resume(paging->xc_handle, paging->mem_event.domain_id);
- ret = xc_evtchn_notify(paging->mem_event.xce_handle,
- paging->mem_event.port);
-
- out:
- return ret;
}
void usage(char* progname)
{
- fprintf(stderr,
- "Usage: %s [-m] <domain_id> write|exec|int3\n"
+ fprintf(stderr, "Usage: %s [-m] <domain_id> write|exec", progname);
+#if defined(__i386__) || defined(__x86_64__)
+ fprintf(stderr, "|breakpoint|altp2m_write|altp2m_exec");
+#endif
+ fprintf(stderr,
"\n"
- "Logs first page writes, execs, or int3 traps that occur on the domain.\n"
+ "Logs first page writes, execs, or breakpoint traps that occur on the domain.\n"
"\n"
- "-m requires this program to run, or else the domain may pause\n",
- progname);
+ "-m requires this program to run, or else the domain may pause\n");
}
int main(int argc, char *argv[])
@@ -400,16 +346,18 @@ int main(int argc, char *argv[])
struct sigaction act;
domid_t domain_id;
xenaccess_t *xenaccess;
- mem_event_request_t req;
- mem_event_response_t rsp;
+ vm_event_request_t req;
+ vm_event_response_t rsp;
int rc = -1;
int rc1;
xc_interface *xch;
xenmem_access_t default_access = XENMEM_access_rwx;
xenmem_access_t after_first_access = XENMEM_access_rwx;
int required = 0;
- int int3 = 0;
+ int breakpoint = 0;
int shutting_down = 0;
+ int altp2m = 0;
+ uint16_t altp2m_view_id = 0;
char* progname = argv[0];
argv++;
@@ -448,10 +396,22 @@ int main(int argc, char *argv[])
default_access = XENMEM_access_rw;
after_first_access = XENMEM_access_rwx;
}
- else if ( !strcmp(argv[0], "int3") )
+#if defined(__i386__) || defined(__x86_64__)
+ else if ( !strcmp(argv[0], "breakpoint") )
+ {
+ breakpoint = 1;
+ }
+ else if ( !strcmp(argv[0], "altp2m_write") )
{
- int3 = 1;
+ default_access = XENMEM_access_rx;
+ altp2m = 1;
}
+ else if ( !strcmp(argv[0], "altp2m_exec") )
+ {
+ default_access = XENMEM_access_rw;
+ altp2m = 1;
+ }
+#endif
else
{
usage(argv[0]);
@@ -484,31 +444,83 @@ int main(int argc, char *argv[])
goto exit;
}
- /* Set the default access type and convert all pages to it */
- rc = xc_set_mem_access(xch, domain_id, default_access, ~0ull, 0);
- if ( rc < 0 )
+ /* With altp2m we just create a new, restricted view of the memory */
+ if ( altp2m )
{
- ERROR("Error %d setting default mem access type\n", rc);
- goto exit;
+ xen_pfn_t gfn = 0;
+ unsigned long perm_set = 0;
+
+ rc = xc_altp2m_set_domain_state( xch, domain_id, 1 );
+ if ( rc < 0 )
+ {
+ ERROR("Error %d enabling altp2m on domain!\n", rc);
+ goto exit;
+ }
+
+ rc = xc_altp2m_create_view( xch, domain_id, default_access, &altp2m_view_id );
+ if ( rc < 0 )
+ {
+ ERROR("Error %d creating altp2m view!\n", rc);
+ goto exit;
+ }
+
+ DPRINTF("altp2m view created with id %u\n", altp2m_view_id);
+ DPRINTF("Setting altp2m mem_access permissions.. ");
+
+ for ( ; gfn < xenaccess->max_gpfn; ++gfn )
+ {
+ rc = xc_altp2m_set_mem_access( xch, domain_id, altp2m_view_id, gfn,
+ default_access);
+ if ( !rc )
+ perm_set++;
+ }
+
+ DPRINTF("done! Permissions set on %lu pages.\n", perm_set);
+
+ rc = xc_altp2m_switch_to_view( xch, domain_id, altp2m_view_id );
+ if ( rc < 0 )
+ {
+ ERROR("Error %d switching to altp2m view!\n", rc);
+ goto exit;
+ }
+
+ rc = xc_monitor_singlestep( xch, domain_id, 1 );
+ if ( rc < 0 )
+ {
+ ERROR("Error %d failed to enable singlestep monitoring!\n", rc);
+ goto exit;
+ }
}
- rc = xc_set_mem_access(xch, domain_id, default_access, 0,
- xenaccess->domain_info->max_pages);
- if ( rc < 0 )
+ if ( !altp2m )
{
- ERROR("Error %d setting all memory to access type %d\n", rc,
- default_access);
- goto exit;
+ /* Set the default access type and convert all pages to it */
+ rc = xc_set_mem_access(xch, domain_id, default_access, ~0ull, 0);
+ if ( rc < 0 )
+ {
+ ERROR("Error %d setting default mem access type\n", rc);
+ goto exit;
+ }
+
+ rc = xc_set_mem_access(xch, domain_id, default_access, START_PFN,
+ (xenaccess->max_gpfn - START_PFN) );
+
+ if ( rc < 0 )
+ {
+ ERROR("Error %d setting all memory to access type %d\n", rc,
+ default_access);
+ goto exit;
+ }
}
- if ( int3 )
- rc = xc_hvm_param_set(xch, domain_id, HVM_PARAM_MEMORY_EVENT_INT3, HVMPME_mode_sync);
- else
- rc = xc_hvm_param_set(xch, domain_id, HVM_PARAM_MEMORY_EVENT_INT3, HVMPME_mode_disabled);
- if ( rc < 0 )
+ if ( breakpoint )
{
- ERROR("Error %d setting int3 mem_event\n", rc);
- goto exit;
+ rc = xc_monitor_software_breakpoint(xch, domain_id, 1);
+ if ( rc < 0 )
+ {
+ ERROR("Error %d setting breakpoint trapping with vm_event\n", rc);
+ goto exit;
+ }
}
/* Wait for access */
@@ -516,18 +528,34 @@ int main(int argc, char *argv[])
{
if ( interrupted )
{
+ /* Unregister for every event */
DPRINTF("xenaccess shutting down on signal %d\n", interrupted);
- /* Unregister for every event */
- rc = xc_set_mem_access(xch, domain_id, XENMEM_access_rwx, ~0ull, 0);
- rc = xc_set_mem_access(xch, domain_id, XENMEM_access_rwx, 0,
- xenaccess->domain_info->max_pages);
- rc = xc_hvm_param_set(xch, domain_id, HVM_PARAM_MEMORY_EVENT_INT3, HVMPME_mode_disabled);
+ if ( breakpoint )
+ rc = xc_monitor_software_breakpoint(xch, domain_id, 0);
+
+ if ( altp2m )
+ {
+ uint32_t vcpu_id;
+
+ rc = xc_altp2m_switch_to_view( xch, domain_id, 0 );
+ rc = xc_altp2m_destroy_view(xch, domain_id, altp2m_view_id);
+ rc = xc_altp2m_set_domain_state(xch, domain_id, 0);
+ rc = xc_monitor_singlestep(xch, domain_id, 0);
+
+ for ( vcpu_id = 0; vcpu_id < XEN_LEGACY_MAX_VCPUS; vcpu_id++ )
+ rc = control_singlestep(xch, domain_id, vcpu_id, 0);
+
+ } else {
+ rc = xc_set_mem_access(xch, domain_id, XENMEM_access_rwx, ~0ull, 0);
+ rc = xc_set_mem_access(xch, domain_id, XENMEM_access_rwx, START_PFN,
+ (xenaccess->max_gpfn - START_PFN) );
+ }
shutting_down = 1;
}
- rc = xc_wait_for_event_or_timeout(xch, xenaccess->mem_event.xce_handle, 100);
+ rc = xc_wait_for_event_or_timeout(xch, xenaccess->vm_event.xce_handle, 100);
if ( rc < -1 )
{
ERROR("Error getting event");
@@ -539,25 +567,27 @@ int main(int argc, char *argv[])
DPRINTF("Got event from Xen\n");
}
- while ( RING_HAS_UNCONSUMED_REQUESTS(&xenaccess->mem_event.back_ring) )
+ while ( RING_HAS_UNCONSUMED_REQUESTS(&xenaccess->vm_event.back_ring) )
{
xenmem_access_t access;
- rc = get_request(&xenaccess->mem_event, &req);
- if ( rc != 0 )
+ get_request(&xenaccess->vm_event, &req);
+
+ if ( req.version != VM_EVENT_INTERFACE_VERSION )
{
- ERROR("Error getting request");
+ ERROR("Error: vm_event interface version mismatch!\n");
interrupted = -1;
continue;
}
memset( &rsp, 0, sizeof (rsp) );
+ rsp.version = VM_EVENT_INTERFACE_VERSION;
rsp.vcpu_id = req.vcpu_id;
rsp.flags = req.flags;
switch (req.reason) {
- case MEM_EVENT_REASON_VIOLATION:
- rc = xc_get_mem_access(xch, domain_id, req.gfn, &access);
+ case VM_EVENT_REASON_MEM_ACCESS:
+ rc = xc_get_mem_access(xch, domain_id, req.u.mem_access.gfn, &access);
if (rc < 0)
{
ERROR("Error %d getting mem_access event\n", rc);
@@ -566,22 +596,33 @@ int main(int argc, char *argv[])
}
printf("PAGE ACCESS: %c%c%c for GFN %"PRIx64" (offset %06"
- PRIx64") gla %016"PRIx64" (valid: %c; fault in gpt: %c; fault with gla: %c) (vcpu %u)\n",
- req.access_r ? 'r' : '-',
- req.access_w ? 'w' : '-',
- req.access_x ? 'x' : '-',
- req.gfn,
- req.offset,
- req.gla,
- req.gla_valid ? 'y' : 'n',
- req.fault_in_gpt ? 'y' : 'n',
- req.fault_with_gla ? 'y': 'n',
- req.vcpu_id);
+ PRIx64") gla %016"PRIx64" (valid: %c; fault in gpt: %c; fault with gla: %c) (vcpu %u, altp2m view %u)\n",
+ (req.u.mem_access.flags & MEM_ACCESS_R) ? 'r' : '-',
+ (req.u.mem_access.flags & MEM_ACCESS_W) ? 'w' : '-',
+ (req.u.mem_access.flags & MEM_ACCESS_X) ? 'x' : '-',
+ req.u.mem_access.gfn,
+ req.u.mem_access.offset,
+ req.u.mem_access.gla,
+ (req.u.mem_access.flags & MEM_ACCESS_GLA_VALID) ? 'y' : 'n',
+ (req.u.mem_access.flags & MEM_ACCESS_FAULT_IN_GPT) ? 'y' : 'n',
+ (req.u.mem_access.flags & MEM_ACCESS_FAULT_WITH_GLA) ? 'y': 'n',
+ req.vcpu_id,
+ req.altp2m_idx);
+
+ if ( altp2m && (req.flags & VM_EVENT_FLAG_ALTERNATE_P2M) )
+ {
+ DPRINTF("\tSwitching back to default view!\n");
+
+ rsp.reason = req.reason;
+ rsp.flags = req.flags;
+ rsp.altp2m_idx = 0;
- if ( default_access != after_first_access )
+ control_singlestep(xch, domain_id, rsp.vcpu_id, 1);
+ }
+ else if ( default_access != after_first_access )
{
rc = xc_set_mem_access(xch, domain_id, after_first_access,
- req.gfn, 1);
+ req.u.mem_access.gfn, 1);
if (rc < 0)
{
ERROR("Error %d setting gfn to access_type %d\n", rc,
@@ -591,14 +632,12 @@ int main(int argc, char *argv[])
}
}
-
- rsp.gfn = req.gfn;
- rsp.p2mt = req.p2mt;
+ rsp.u.mem_access.gfn = req.u.mem_access.gfn;
break;
- case MEM_EVENT_REASON_INT3:
- printf("INT3: rip=%016"PRIx64", gfn=%"PRIx64" (vcpu %d)\n",
- req.gla,
- req.gfn,
+ case VM_EVENT_REASON_SOFTWARE_BREAKPOINT:
+ printf("Breakpoint: rip=%016"PRIx64", gfn=%"PRIx64" (vcpu %d)\n",
+ req.data.regs.x86.rip,
+ req.u.software_breakpoint.gfn,
req.vcpu_id);
/* Reinject */
@@ -607,23 +646,45 @@ int main(int argc, char *argv[])
HVMOP_TRAP_sw_exc, -1, 0, 0);
if (rc < 0)
{
- ERROR("Error %d injecting int3\n", rc);
+ ERROR("Error %d injecting breakpoint\n", rc);
interrupted = -1;
continue;
}
break;
+ case VM_EVENT_REASON_SINGLESTEP:
+ printf("Singlestep: rip=%016"PRIx64", vcpu %d\n",
+ req.data.regs.x86.rip,
+ req.vcpu_id);
+
+ if ( altp2m )
+ {
+ printf("\tSwitching altp2m to view %u!\n", altp2m_view_id);
+
+ rsp.reason = req.reason;
+ rsp.flags |= VM_EVENT_FLAG_ALTERNATE_P2M;
+ rsp.altp2m_idx = altp2m_view_id;
+ }
+
+ control_singlestep(xch, domain_id, req.vcpu_id, 0);
+
+ break;
default:
fprintf(stderr, "UNKNOWN REASON CODE %d\n", req.reason);
}
- rc = xenaccess_resume_page(xenaccess, &rsp);
- if ( rc != 0 )
- {
- ERROR("Error resuming page");
- interrupted = -1;
- continue;
- }
+ /* Put the response on the ring */
+ put_response(&xenaccess->vm_event, &rsp);
+ }
+
+ /* Tell Xen page is ready */
+ rc = xc_evtchn_notify(xenaccess->vm_event.xce_handle,
+ xenaccess->vm_event.port);
+
+ if ( rc != 0 )
+ {
+ ERROR("Error resuming page");
+ interrupted = -1;
}
if ( shutting_down )
diff --git a/tools/xcutils/Makefile b/tools/xcutils/Makefile
index 1c5237c..fff519d 100644
--- a/tools/xcutils/Makefile
+++ b/tools/xcutils/Makefile
@@ -42,4 +42,7 @@ clean:
$(RM) *.o $(PROGRAMS)
$(RM) $(DEPS)
+.PHONY: distclean
+distclean: clean
+
-include $(DEPS)
diff --git a/tools/xenbackendd/Makefile b/tools/xenbackendd/Makefile
index 31e5c62..f52be74 100644
--- a/tools/xenbackendd/Makefile
+++ b/tools/xenbackendd/Makefile
@@ -25,13 +25,16 @@ build: xenbackendd
.PHONY: install
install: build
- $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
- $(INSTALL_PROG) xenbackendd $(DESTDIR)$(SBINDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(sbindir)
+ $(INSTALL_PROG) xenbackendd $(DESTDIR)$(sbindir)
.PHONY: clean
clean:
$(RM) *.a *.so *.o $(DEPS) xenbackendd
+.PHONY: distclean
+distclean: clean
+
xenbackendd: xenbackendd.o
$(CC) $(LDFLAGS) $< -o $@ $(LDLIBS) $(APPEND_LDFLAGS)
diff --git a/tools/xenbackendd/xenbackendd.c b/tools/xenbackendd/xenbackendd.c
index f1eb1f5..e21464b 100644
--- a/tools/xenbackendd/xenbackendd.c
+++ b/tools/xenbackendd/xenbackendd.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/types.h>
diff --git a/tools/xenmon/COPYING b/tools/xenmon/COPYING
index 5b6e7c6..ec0f60d 100644
--- a/tools/xenmon/COPYING
+++ b/tools/xenmon/COPYING
@@ -304,8 +304,7 @@ the "copyright" line and a pointer to where the full notice is found.
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
diff --git a/tools/xenmon/Makefile b/tools/xenmon/Makefile
index 3fe87ba..20ea100 100644
--- a/tools/xenmon/Makefile
+++ b/tools/xenmon/Makefile
@@ -27,12 +27,10 @@ build: xentrace_setmask xenbaked
.PHONY: install
install: build
- $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
- $(INSTALL_PROG) xenbaked $(DESTDIR)$(SBINDIR)/xenbaked
- $(INSTALL_PROG) xentrace_setmask $(DESTDIR)$(SBINDIR)/xentrace_setmask
- $(INSTALL_PROG) xenmon.py $(DESTDIR)$(SBINDIR)/xenmon.py
- $(INSTALL_DIR) $(DESTDIR)$(DOCDIR)
- $(INSTALL_DATA) README $(DESTDIR)$(DOCDIR)/README.xenmon
+ $(INSTALL_DIR) $(DESTDIR)$(sbindir)
+ $(INSTALL_PROG) xenbaked $(DESTDIR)$(sbindir)/xenbaked
+ $(INSTALL_PROG) xentrace_setmask $(DESTDIR)$(sbindir)/xentrace_setmask
+ $(INSTALL_PROG) xenmon.py $(DESTDIR)$(sbindir)/xenmon.py
.PHONY: clean
clean:
@@ -40,6 +38,9 @@ clean:
$(RM) -f xenbaked xenbaked.o
$(RM) -f xentrace_setmask setmask.o
+.PHONY: distclean
+distclean: clean
+
xenbaked: xenbaked.o Makefile
$(CC) $(LDFLAGS) $< -o $@ $(LDLIBS) $(APPEND_LDFLAGS)
diff --git a/tools/xenmon/setmask.c b/tools/xenmon/setmask.c
index 2cc20d5..7e7b7b8 100644
--- a/tools/xenmon/setmask.c
+++ b/tools/xenmon/setmask.c
@@ -20,8 +20,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdlib.h>
diff --git a/tools/xenmon/xenbaked.c b/tools/xenmon/xenbaked.c
index dc61d14..1ddb29b 100644
--- a/tools/xenmon/xenbaked.c
+++ b/tools/xenmon/xenbaked.c
@@ -26,8 +26,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <time.h>
diff --git a/tools/xenmon/xenbaked.h b/tools/xenmon/xenbaked.h
index af97f2d..9eeacbb 100644
--- a/tools/xenmon/xenbaked.h
+++ b/tools/xenmon/xenbaked.h
@@ -19,8 +19,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __QOS_H__
diff --git a/tools/xenmon/xenmon.py b/tools/xenmon/xenmon.py
index 1883107..2a948cd 100644
--- a/tools/xenmon/xenmon.py
+++ b/tools/xenmon/xenmon.py
@@ -20,8 +20,7 @@
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# along with this program; If not, see <http://www.gnu.org/licenses/>.
#####################################################################
import mmap
diff --git a/tools/xenpaging/Makefile b/tools/xenpaging/Makefile
index c742e62..2407a30 100644
--- a/tools/xenpaging/Makefile
+++ b/tools/xenpaging/Makefile
@@ -31,7 +31,9 @@ install: all
clean:
rm -f *.o *~ $(DEPS) xen TAGS $(IBINS) $(LIB)
-.PHONY: clean install
+distclean: clean
+
+.PHONY: clean install distclean
.PHONY: TAGS
TAGS:
diff --git a/tools/xenpaging/file_ops.c b/tools/xenpaging/file_ops.c
index 9bc14b1..8210f42 100644
--- a/tools/xenpaging/file_ops.c
+++ b/tools/xenpaging/file_ops.c
@@ -15,8 +15,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
diff --git a/tools/xenpaging/file_ops.h b/tools/xenpaging/file_ops.h
index 53c9b6e..ee3fd7d 100644
--- a/tools/xenpaging/file_ops.h
+++ b/tools/xenpaging/file_ops.h
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
diff --git a/tools/xenpaging/pagein.c b/tools/xenpaging/pagein.c
index b3bcef7..7cb0f33 100644
--- a/tools/xenpaging/pagein.c
+++ b/tools/xenpaging/pagein.c
@@ -63,7 +63,7 @@ void page_in_trigger(void)
void create_page_in_thread(struct xenpaging *paging)
{
- page_in_args.dom = paging->mem_event.domain_id;
+ page_in_args.dom = paging->vm_event.domain_id;
page_in_args.pagein_queue = paging->pagein_queue;
page_in_args.xch = paging->xc_handle;
if (pthread_create(&page_in_thread, NULL, page_in, &page_in_args) == 0)
diff --git a/tools/xenpaging/policy.h b/tools/xenpaging/policy.h
index 455931a..b1425a9 100644
--- a/tools/xenpaging/policy.h
+++ b/tools/xenpaging/policy.h
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
diff --git a/tools/xenpaging/policy_default.c b/tools/xenpaging/policy_default.c
index 99cd5f8..3324835 100644
--- a/tools/xenpaging/policy_default.c
+++ b/tools/xenpaging/policy_default.c
@@ -15,8 +15,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
diff --git a/tools/xenpaging/xenpaging.c b/tools/xenpaging/xenpaging.c
index 82c1ee4..b5ffee6 100644
--- a/tools/xenpaging/xenpaging.c
+++ b/tools/xenpaging/xenpaging.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#define _GNU_SOURCE
@@ -63,7 +62,7 @@ static void close_handler(int sig)
static void xenpaging_mem_paging_flush_ioemu_cache(struct xenpaging *paging)
{
struct xs_handle *xsh = paging->xs_handle;
- domid_t domain_id = paging->mem_event.domain_id;
+ domid_t domain_id = paging->vm_event.domain_id;
char path[80];
sprintf(path, "/local/domain/0/device-model/%u/command", domain_id);
@@ -74,7 +73,7 @@ static void xenpaging_mem_paging_flush_ioemu_cache(struct xenpaging *paging)
static int xenpaging_wait_for_event_or_timeout(struct xenpaging *paging)
{
xc_interface *xch = paging->xc_handle;
- xc_evtchn *xce = paging->mem_event.xce_handle;
+ xc_evtchn *xce = paging->vm_event.xce_handle;
char **vec, *val;
unsigned int num;
struct pollfd fd[2];
@@ -111,7 +110,7 @@ static int xenpaging_wait_for_event_or_timeout(struct xenpaging *paging)
if ( strcmp(vec[XS_WATCH_TOKEN], watch_token) == 0 )
{
/* If our guest disappeared, set interrupt flag and fall through */
- if ( xs_is_domain_introduced(paging->xs_handle, paging->mem_event.domain_id) == false )
+ if ( xs_is_domain_introduced(paging->xs_handle, paging->vm_event.domain_id) == false )
{
xs_unwatch(paging->xs_handle, "@releaseDomain", watch_token);
interrupted = SIGQUIT;
@@ -171,7 +170,7 @@ static int xenpaging_get_tot_pages(struct xenpaging *paging)
xc_domaininfo_t domain_info;
int rc;
- rc = xc_domain_getinfolist(xch, paging->mem_event.domain_id, 1, &domain_info);
+ rc = xc_domain_getinfolist(xch, paging->vm_event.domain_id, 1, &domain_info);
if ( rc != 1 )
{
PERROR("Error getting domain info");
@@ -231,7 +230,7 @@ static int xenpaging_getopts(struct xenpaging *paging, int argc, char *argv[])
{
switch(ch) {
case 'd':
- paging->mem_event.domain_id = atoi(optarg);
+ paging->vm_event.domain_id = atoi(optarg);
break;
case 'f':
filename = strdup(optarg);
@@ -264,7 +263,7 @@ static int xenpaging_getopts(struct xenpaging *paging, int argc, char *argv[])
}
/* Set domain id */
- if ( !paging->mem_event.domain_id )
+ if ( !paging->vm_event.domain_id )
{
printf("Numerical <domain_id> missing!\n");
return 1;
@@ -312,7 +311,7 @@ static struct xenpaging *xenpaging_init(int argc, char *argv[])
}
/* write domain ID to watch so we can ignore other domain shutdowns */
- snprintf(watch_token, sizeof(watch_token), "%u", paging->mem_event.domain_id);
+ snprintf(watch_token, sizeof(watch_token), "%u", paging->vm_event.domain_id);
if ( xs_watch(paging->xs_handle, "@releaseDomain", watch_token) == false )
{
PERROR("Could not bind to shutdown watch\n");
@@ -320,7 +319,7 @@ static struct xenpaging *xenpaging_init(int argc, char *argv[])
}
/* Watch xenpagings working target */
- dom_path = xs_get_domain_path(paging->xs_handle, paging->mem_event.domain_id);
+ dom_path = xs_get_domain_path(paging->xs_handle, paging->vm_event.domain_id);
if ( !dom_path )
{
PERROR("Could not find domain path\n");
@@ -339,17 +338,17 @@ static struct xenpaging *xenpaging_init(int argc, char *argv[])
}
/* Map the ring page */
- xc_get_hvm_param(xch, paging->mem_event.domain_id,
+ xc_get_hvm_param(xch, paging->vm_event.domain_id,
HVM_PARAM_PAGING_RING_PFN, &ring_pfn);
mmap_pfn = ring_pfn;
- paging->mem_event.ring_page =
- xc_map_foreign_batch(xch, paging->mem_event.domain_id,
+ paging->vm_event.ring_page =
+ xc_map_foreign_batch(xch, paging->vm_event.domain_id,
PROT_READ | PROT_WRITE, &mmap_pfn, 1);
if ( mmap_pfn & XEN_DOMCTL_PFINFO_XTAB )
{
/* Map failed, populate ring page */
rc = xc_domain_populate_physmap_exact(paging->xc_handle,
- paging->mem_event.domain_id,
+ paging->vm_event.domain_id,
1, 0, 0, &ring_pfn);
if ( rc != 0 )
{
@@ -358,8 +357,8 @@ static struct xenpaging *xenpaging_init(int argc, char *argv[])
}
mmap_pfn = ring_pfn;
- paging->mem_event.ring_page =
- xc_map_foreign_batch(xch, paging->mem_event.domain_id,
+ paging->vm_event.ring_page =
+ xc_map_foreign_batch(xch, paging->vm_event.domain_id,
PROT_READ | PROT_WRITE, &mmap_pfn, 1);
if ( mmap_pfn & XEN_DOMCTL_PFINFO_XTAB )
{
@@ -369,8 +368,8 @@ static struct xenpaging *xenpaging_init(int argc, char *argv[])
}
/* Initialise Xen */
- rc = xc_mem_paging_enable(xch, paging->mem_event.domain_id,
- &paging->mem_event.evtchn_port);
+ rc = xc_mem_paging_enable(xch, paging->vm_event.domain_id,
+ &paging->vm_event.evtchn_port);
if ( rc != 0 )
{
switch ( errno ) {
@@ -394,40 +393,40 @@ static struct xenpaging *xenpaging_init(int argc, char *argv[])
}
/* Open event channel */
- paging->mem_event.xce_handle = xc_evtchn_open(NULL, 0);
- if ( paging->mem_event.xce_handle == NULL )
+ paging->vm_event.xce_handle = xc_evtchn_open(NULL, 0);
+ if ( paging->vm_event.xce_handle == NULL )
{
PERROR("Failed to open event channel");
goto err;
}
/* Bind event notification */
- rc = xc_evtchn_bind_interdomain(paging->mem_event.xce_handle,
- paging->mem_event.domain_id,
- paging->mem_event.evtchn_port);
+ rc = xc_evtchn_bind_interdomain(paging->vm_event.xce_handle,
+ paging->vm_event.domain_id,
+ paging->vm_event.evtchn_port);
if ( rc < 0 )
{
PERROR("Failed to bind event channel");
goto err;
}
- paging->mem_event.port = rc;
+ paging->vm_event.port = rc;
/* Initialise ring */
- SHARED_RING_INIT((mem_event_sring_t *)paging->mem_event.ring_page);
- BACK_RING_INIT(&paging->mem_event.back_ring,
- (mem_event_sring_t *)paging->mem_event.ring_page,
+ SHARED_RING_INIT((vm_event_sring_t *)paging->vm_event.ring_page);
+ BACK_RING_INIT(&paging->vm_event.back_ring,
+ (vm_event_sring_t *)paging->vm_event.ring_page,
PAGE_SIZE);
/* Now that the ring is set, remove it from the guest's physmap */
if ( xc_domain_decrease_reservation_exact(xch,
- paging->mem_event.domain_id, 1, 0, &ring_pfn) )
+ paging->vm_event.domain_id, 1, 0, &ring_pfn) )
PERROR("Failed to remove ring from guest physmap");
/* Get max_pages from guest if not provided via cmdline */
if ( !paging->max_pages )
{
- rc = xc_domain_getinfolist(xch, paging->mem_event.domain_id, 1,
+ rc = xc_domain_getinfolist(xch, paging->vm_event.domain_id, 1,
&domain_info);
if ( rc != 1 )
{
@@ -497,9 +496,9 @@ static struct xenpaging *xenpaging_init(int argc, char *argv[])
free(paging->paging_buffer);
}
- if ( paging->mem_event.ring_page )
+ if ( paging->vm_event.ring_page )
{
- munmap(paging->mem_event.ring_page, PAGE_SIZE);
+ munmap(paging->vm_event.ring_page, PAGE_SIZE);
}
free(dom_path);
@@ -524,28 +523,28 @@ static void xenpaging_teardown(struct xenpaging *paging)
paging->xc_handle = NULL;
/* Tear down domain paging in Xen */
- munmap(paging->mem_event.ring_page, PAGE_SIZE);
- rc = xc_mem_paging_disable(xch, paging->mem_event.domain_id);
+ munmap(paging->vm_event.ring_page, PAGE_SIZE);
+ rc = xc_mem_paging_disable(xch, paging->vm_event.domain_id);
if ( rc != 0 )
{
PERROR("Error tearing down domain paging in xen");
}
/* Unbind VIRQ */
- rc = xc_evtchn_unbind(paging->mem_event.xce_handle, paging->mem_event.port);
+ rc = xc_evtchn_unbind(paging->vm_event.xce_handle, paging->vm_event.port);
if ( rc != 0 )
{
PERROR("Error unbinding event port");
}
- paging->mem_event.port = -1;
+ paging->vm_event.port = -1;
/* Close event channel */
- rc = xc_evtchn_close(paging->mem_event.xce_handle);
+ rc = xc_evtchn_close(paging->vm_event.xce_handle);
if ( rc != 0 )
{
PERROR("Error closing event channel");
}
- paging->mem_event.xce_handle = NULL;
+ paging->vm_event.xce_handle = NULL;
/* Close connection to xenstore */
xs_close(paging->xs_handle);
@@ -558,12 +557,12 @@ static void xenpaging_teardown(struct xenpaging *paging)
}
}
-static void get_request(struct mem_event *mem_event, mem_event_request_t *req)
+static void get_request(struct vm_event *vm_event, vm_event_request_t *req)
{
- mem_event_back_ring_t *back_ring;
+ vm_event_back_ring_t *back_ring;
RING_IDX req_cons;
- back_ring = &mem_event->back_ring;
+ back_ring = &vm_event->back_ring;
req_cons = back_ring->req_cons;
/* Copy request */
@@ -575,12 +574,12 @@ static void get_request(struct mem_event *mem_event, mem_event_request_t *req)
back_ring->sring->req_event = req_cons + 1;
}
-static void put_response(struct mem_event *mem_event, mem_event_response_t *rsp)
+static void put_response(struct vm_event *vm_event, vm_event_response_t *rsp)
{
- mem_event_back_ring_t *back_ring;
+ vm_event_back_ring_t *back_ring;
RING_IDX rsp_prod;
- back_ring = &mem_event->back_ring;
+ back_ring = &vm_event->back_ring;
rsp_prod = back_ring->rsp_prod_pvt;
/* Copy response */
@@ -607,7 +606,7 @@ static int xenpaging_evict_page(struct xenpaging *paging, unsigned long gfn, int
DECLARE_DOMCTL;
/* Nominate page */
- ret = xc_mem_paging_nominate(xch, paging->mem_event.domain_id, gfn);
+ ret = xc_mem_paging_nominate(xch, paging->vm_event.domain_id, gfn);
if ( ret < 0 )
{
/* unpageable gfn is indicated by EBUSY */
@@ -619,7 +618,7 @@ static int xenpaging_evict_page(struct xenpaging *paging, unsigned long gfn, int
}
/* Map page */
- page = xc_map_foreign_pages(xch, paging->mem_event.domain_id, PROT_READ, &victim, 1);
+ page = xc_map_foreign_pages(xch, paging->vm_event.domain_id, PROT_READ, &victim, 1);
if ( page == NULL )
{
PERROR("Error mapping page %lx", gfn);
@@ -641,7 +640,7 @@ static int xenpaging_evict_page(struct xenpaging *paging, unsigned long gfn, int
munmap(page, PAGE_SIZE);
/* Tell Xen to evict page */
- ret = xc_mem_paging_evict(xch, paging->mem_event.domain_id, gfn);
+ ret = xc_mem_paging_evict(xch, paging->vm_event.domain_id, gfn);
if ( ret < 0 )
{
/* A gfn in use is indicated by EBUSY */
@@ -671,10 +670,10 @@ static int xenpaging_evict_page(struct xenpaging *paging, unsigned long gfn, int
return ret;
}
-static int xenpaging_resume_page(struct xenpaging *paging, mem_event_response_t *rsp, int notify_policy)
+static int xenpaging_resume_page(struct xenpaging *paging, vm_event_response_t *rsp, int notify_policy)
{
/* Put the page info on the ring */
- put_response(&paging->mem_event, rsp);
+ put_response(&paging->vm_event, rsp);
/* Notify policy of page being paged in */
if ( notify_policy )
@@ -684,16 +683,16 @@ static int xenpaging_resume_page(struct xenpaging *paging, mem_event_response_t
* This allows page-out of these gfns if the target grows again.
*/
if (paging->num_paged_out > paging->policy_mru_size)
- policy_notify_paged_in(rsp->gfn);
+ policy_notify_paged_in(rsp->u.mem_paging.gfn);
else
- policy_notify_paged_in_nomru(rsp->gfn);
+ policy_notify_paged_in_nomru(rsp->u.mem_paging.gfn);
/* Record number of resumed pages */
paging->num_paged_out--;
}
/* Tell Xen page is ready */
- return xc_evtchn_notify(paging->mem_event.xce_handle, paging->mem_event.port);
+ return xc_evtchn_notify(paging->vm_event.xce_handle, paging->vm_event.port);
}
static int xenpaging_populate_page(struct xenpaging *paging, unsigned long gfn, int i)
@@ -715,7 +714,7 @@ static int xenpaging_populate_page(struct xenpaging *paging, unsigned long gfn,
do
{
/* Tell Xen to allocate a page for the domain */
- ret = xc_mem_paging_load(xch, paging->mem_event.domain_id, gfn, paging->paging_buffer);
+ ret = xc_mem_paging_load(xch, paging->vm_event.domain_id, gfn, paging->paging_buffer);
if ( ret < 0 )
{
if ( errno == ENOMEM )
@@ -857,8 +856,8 @@ int main(int argc, char *argv[])
{
struct sigaction act;
struct xenpaging *paging;
- mem_event_request_t req;
- mem_event_response_t rsp;
+ vm_event_request_t req;
+ vm_event_response_t rsp;
int num, prev_num = 0;
int slot;
int tot_pages;
@@ -874,7 +873,8 @@ int main(int argc, char *argv[])
}
xch = paging->xc_handle;
- DPRINTF("starting %s for domain_id %u with pagefile %s\n", argv[0], paging->mem_event.domain_id, filename);
+ DPRINTF("starting %s for domain_id %u with pagefile %s\n",
+ argv[0], paging->vm_event.domain_id, filename);
/* ensure that if we get a signal, we'll do cleanup, then exit */
act.sa_handler = close_handler;
@@ -903,56 +903,59 @@ int main(int argc, char *argv[])
DPRINTF("Got event from Xen\n");
}
- while ( RING_HAS_UNCONSUMED_REQUESTS(&paging->mem_event.back_ring) )
+ while ( RING_HAS_UNCONSUMED_REQUESTS(&paging->vm_event.back_ring) )
{
/* Indicate possible error */
rc = 1;
- get_request(&paging->mem_event, &req);
+ get_request(&paging->vm_event, &req);
- if ( req.gfn > paging->max_pages )
+ if ( req.u.mem_paging.gfn > paging->max_pages )
{
- ERROR("Requested gfn %"PRIx64" higher than max_pages %x\n", req.gfn, paging->max_pages);
+ ERROR("Requested gfn %"PRIx64" higher than max_pages %x\n",
+ req.u.mem_paging.gfn, paging->max_pages);
goto out;
}
/* Check if the page has already been paged in */
- if ( test_and_clear_bit(req.gfn, paging->bitmap) )
+ if ( test_and_clear_bit(req.u.mem_paging.gfn, paging->bitmap) )
{
/* Find where in the paging file to read from */
- slot = paging->gfn_to_slot[req.gfn];
+ slot = paging->gfn_to_slot[req.u.mem_paging.gfn];
/* Sanity check */
- if ( paging->slot_to_gfn[slot] != req.gfn )
+ if ( paging->slot_to_gfn[slot] != req.u.mem_paging.gfn )
{
- ERROR("Expected gfn %"PRIx64" in slot %d, but found gfn %lx\n", req.gfn, slot, paging->slot_to_gfn[slot]);
+ ERROR("Expected gfn %"PRIx64" in slot %d, but found gfn %lx\n",
+ req.u.mem_paging.gfn, slot, paging->slot_to_gfn[slot]);
goto out;
}
- if ( req.flags & MEM_EVENT_FLAG_DROP_PAGE )
+ if ( req.u.mem_paging.flags & MEM_PAGING_DROP_PAGE )
{
- DPRINTF("drop_page ^ gfn %"PRIx64" pageslot %d\n", req.gfn, slot);
+ DPRINTF("drop_page ^ gfn %"PRIx64" pageslot %d\n",
+ req.u.mem_paging.gfn, slot);
/* Notify policy of page being dropped */
- policy_notify_dropped(req.gfn);
+ policy_notify_dropped(req.u.mem_paging.gfn);
}
else
{
/* Populate the page */
- if ( xenpaging_populate_page(paging, req.gfn, slot) < 0 )
+ if ( xenpaging_populate_page(paging, req.u.mem_paging.gfn, slot) < 0 )
{
- ERROR("Error populating page %"PRIx64"", req.gfn);
+ ERROR("Error populating page %"PRIx64"", req.u.mem_paging.gfn);
goto out;
}
}
/* Prepare the response */
- rsp.gfn = req.gfn;
+ rsp.u.mem_paging.gfn = req.u.mem_paging.gfn;
rsp.vcpu_id = req.vcpu_id;
rsp.flags = req.flags;
if ( xenpaging_resume_page(paging, &rsp, 1) < 0 )
{
- PERROR("Error resuming page %"PRIx64"", req.gfn);
+ PERROR("Error resuming page %"PRIx64"", req.u.mem_paging.gfn);
goto out;
}
@@ -966,22 +969,23 @@ int main(int argc, char *argv[])
{
DPRINTF("page %s populated (domain = %d; vcpu = %d;"
" gfn = %"PRIx64"; paused = %d; evict_fail = %d)\n",
- req.flags & MEM_EVENT_FLAG_EVICT_FAIL ? "not" : "already",
- paging->mem_event.domain_id, req.vcpu_id, req.gfn,
- !!(req.flags & MEM_EVENT_FLAG_VCPU_PAUSED) ,
- !!(req.flags & MEM_EVENT_FLAG_EVICT_FAIL) );
+ req.u.mem_paging.flags & MEM_PAGING_EVICT_FAIL ? "not" : "already",
+ paging->vm_event.domain_id, req.vcpu_id, req.u.mem_paging.gfn,
+ !!(req.flags & VM_EVENT_FLAG_VCPU_PAUSED) ,
+ !!(req.u.mem_paging.flags & MEM_PAGING_EVICT_FAIL) );
/* Tell Xen to resume the vcpu */
- if (( req.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) || ( req.flags & MEM_EVENT_FLAG_EVICT_FAIL ))
+ if (( req.flags & VM_EVENT_FLAG_VCPU_PAUSED ) ||
+ ( req.u.mem_paging.flags & MEM_PAGING_EVICT_FAIL ))
{
/* Prepare the response */
- rsp.gfn = req.gfn;
+ rsp.u.mem_paging.gfn = req.u.mem_paging.gfn;
rsp.vcpu_id = req.vcpu_id;
rsp.flags = req.flags;
if ( xenpaging_resume_page(paging, &rsp, 0) < 0 )
{
- PERROR("Error resuming page %"PRIx64"", req.gfn);
+ PERROR("Error resuming page %"PRIx64"", req.u.mem_paging.gfn);
goto out;
}
}
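
The xenpaging.c hunks above convert the tool from the retired mem_event interface to the generalized vm_event one: requests are drained from a shared back ring, the gfn now lives in the req.u.mem_paging union member, and each response must echo the request's gfn, vcpu_id and flags before Xen is notified. A minimal sketch of that consume-and-respond pattern, using the vm_event types and the get_request()/put_response() helpers as they stand after this patch (handle_gfn() is a hypothetical per-page handler, not part of the patch):

    /* Sketch only: drain the vm_event back ring and acknowledge each
     * request, mirroring the loop in main() above. */
    static int drain_vm_event_ring(struct xenpaging *paging)
    {
        vm_event_request_t req;
        vm_event_response_t rsp;

        while ( RING_HAS_UNCONSUMED_REQUESTS(&paging->vm_event.back_ring) )
        {
            get_request(&paging->vm_event, &req);

            if ( handle_gfn(paging, req.u.mem_paging.gfn) < 0 )
                return -1;

            /* The response must echo the request's identifying fields. */
            rsp.u.mem_paging.gfn = req.u.mem_paging.gfn;
            rsp.vcpu_id = req.vcpu_id;
            rsp.flags = req.flags;
            put_response(&paging->vm_event, &rsp);

            /* Kick Xen so a paused vcpu can resume. */
            if ( xc_evtchn_notify(paging->vm_event.xce_handle,
                                  paging->vm_event.port) < 0 )
                return -1;
        }
        return 0;
    }
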
diff --git a/tools/xenpaging/xenpaging.h b/tools/xenpaging/xenpaging.h
index 877db2f..c6ab77c 100644
--- a/tools/xenpaging/xenpaging.h
+++ b/tools/xenpaging/xenpaging.h
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
@@ -27,15 +26,15 @@
#include <xc_private.h>
#include <xen/event_channel.h>
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
#define XENPAGING_PAGEIN_QUEUE_SIZE 64
-struct mem_event {
+struct vm_event {
domid_t domain_id;
xc_evtchn *xce_handle;
int port;
- mem_event_back_ring_t back_ring;
+ vm_event_back_ring_t back_ring;
uint32_t evtchn_port;
void *ring_page;
};
@@ -51,7 +50,7 @@ struct xenpaging {
void *paging_buffer;
- struct mem_event mem_event;
+ struct vm_event vm_event;
int fd;
/* number of pages for which data structures were allocated */
int max_pages;
diff --git a/tools/xenpmd/Makefile b/tools/xenpmd/Makefile
index 2d925df..55e8fc5 100644
--- a/tools/xenpmd/Makefile
+++ b/tools/xenpmd/Makefile
@@ -11,13 +11,16 @@ all: xenpmd
.PHONY: install
install: all
- $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
- $(INSTALL_PROG) xenpmd $(DESTDIR)$(SBINDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(sbindir)
+ $(INSTALL_PROG) xenpmd $(DESTDIR)$(sbindir)
.PHONY: clean
clean:
$(RM) -f xenpmd xenpmd.o $(DEPS)
+.PHONY: distclean
+distclean: clean
+
xenpmd: xenpmd.o Makefile
$(CC) $(LDFLAGS) $< -o $@ $(LDLIBS) $(APPEND_LDFLAGS)
diff --git a/tools/xenpmd/xenpmd.c b/tools/xenpmd/xenpmd.c
index 5dffff8..b3a3106 100644
--- a/tools/xenpmd/xenpmd.c
+++ b/tools/xenpmd/xenpmd.c
@@ -18,8 +18,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/* Xen extended power management support provides HVM guest power management
diff --git a/tools/xenstat/Makefile b/tools/xenstat/Makefile
index 904ccdd..901be4a 100644
--- a/tools/xenstat/Makefile
+++ b/tools/xenstat/Makefile
@@ -11,6 +11,6 @@ SUBDIRS += xentop
endif
endif
-.PHONY: all install clean
+.PHONY: all install clean distclean
-all install clean: %: subdirs-%
+all install clean distclean: %: subdirs-%
diff --git a/tools/xenstat/libxenstat/COPYING b/tools/xenstat/libxenstat/COPYING
index 2d2d780..480e5f1 100644
--- a/tools/xenstat/libxenstat/COPYING
+++ b/tools/xenstat/libxenstat/COPYING
@@ -489,8 +489,7 @@ notice is found.
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
diff --git a/tools/xenstat/libxenstat/Makefile b/tools/xenstat/libxenstat/Makefile
index 86068cb..850d24a 100644
--- a/tools/xenstat/libxenstat/Makefile
+++ b/tools/xenstat/libxenstat/Makefile
@@ -24,7 +24,7 @@ MINOR=0
LIB=src/libxenstat.a
SHLIB=src/libxenstat.so.$(MAJOR).$(MINOR)
SHLIB_LINKS=src/libxenstat.so.$(MAJOR) src/libxenstat.so
-OBJECTS-y=src/xenstat.o
+OBJECTS-y=src/xenstat.o src/xenstat_qmp.o
OBJECTS-$(CONFIG_Linux) += src/xenstat_linux.o
OBJECTS-$(CONFIG_SunOS) += src/xenstat_solaris.o
OBJECTS-$(CONFIG_NetBSD) += src/xenstat_netbsd.o
@@ -32,7 +32,7 @@ OBJECTS-$(CONFIG_FreeBSD) += src/xenstat_freebsd.o
SONAME_FLAGS=-Wl,$(SONAME_LDFLAG) -Wl,libxenstat.so.$(MAJOR)
CFLAGS+=-fPIC
-CFLAGS+=-Isrc $(CFLAGS_libxenctrl) $(CFLAGS_libxenstore) $(CFLAGS_xeninclude)
+CFLAGS+=-Isrc $(CFLAGS_libxenctrl) $(CFLAGS_libxenstore) $(CFLAGS_xeninclude) -include $(XEN_ROOT)/tools/config.h
LDLIBS-y = $(LDLIBS_libxenstore) $(LDLIBS_libxenctrl)
LDLIBS-$(CONFIG_SunOS) += -lkstat
@@ -56,11 +56,11 @@ src/libxenstat.so: src/libxenstat.so.$(MAJOR)
.PHONY: install
install: all
- $(INSTALL_DATA) src/xenstat.h $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_DATA) $(LIB) $(DESTDIR)$(LIBDIR)/libxenstat.a
- $(INSTALL_PROG) src/libxenstat.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)
- ln -sf libxenstat.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxenstat.so.$(MAJOR)
- ln -sf libxenstat.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxenstat.so
+ $(INSTALL_DATA) src/xenstat.h $(DESTDIR)$(includedir)
+ $(INSTALL_DATA) $(LIB) $(DESTDIR)$(libdir)/libxenstat.a
+ $(INSTALL_PROG) src/libxenstat.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)
+ ln -sf libxenstat.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)/libxenstat.so.$(MAJOR)
+ ln -sf libxenstat.so.$(MAJOR) $(DESTDIR)$(libdir)/libxenstat.so
PYLIB=bindings/swig/python/_xenstat.so
PYMOD=bindings/swig/python/xenstat.py
@@ -137,4 +137,7 @@ clean:
rm -f $(LIB) $(SHLIB) $(SHLIB_LINKS) $(OBJECTS-y) \
$(BINDINGS) $(BINDINGSRC) $(DEPS)
+.PHONY: distclean
+distclean: clean
+
-include $(DEPS)
diff --git a/tools/xenstat/libxenstat/src/xenstat.c b/tools/xenstat/libxenstat/src/xenstat.c
index 8072a90..3495f3f 100644
--- a/tools/xenstat/libxenstat/src/xenstat.c
+++ b/tools/xenstat/libxenstat/src/xenstat.c
@@ -149,8 +149,8 @@ void domain_get_tmem_stats(xenstat_handle * handle, xenstat_domain * domain)
{
char buffer[4096];
- if (xc_tmem_control(handle->xc_handle,-1,TMEMC_LIST,domain->id,
- sizeof(buffer),-1,-1,buffer) < 0)
+ if (xc_tmem_control(handle->xc_handle,-1,XEN_SYSCTL_TMEM_OP_LIST,domain->id,
+ sizeof(buffer),-1,buffer) < 0)
return;
domain->tmem_stats.curr_eph_pages = parse(buffer,"Ec");
domain->tmem_stats.succ_eph_gets = parse(buffer,"Ge");
@@ -166,6 +166,7 @@ xenstat_node *xenstat_get_node(xenstat_handle * handle, unsigned int flags)
xc_domaininfo_t domaininfo[DOMAIN_CHUNK_SIZE];
int new_domains;
unsigned int i;
+ int rc;
/* Create the node */
node = (xenstat_node *) calloc(1, sizeof(xenstat_node));
@@ -189,9 +190,9 @@ xenstat_node *xenstat_get_node(xenstat_handle * handle, unsigned int flags)
node->free_mem = ((unsigned long long)physinfo.free_pages)
* handle->page_size;
- node->freeable_mb = (long)xc_tmem_control(handle->xc_handle, -1,
- TMEMC_QUERY_FREEABLE_MB, -1, 0, 0, 0, NULL);
-
+ rc = xc_tmem_control(handle->xc_handle, -1,
+ XEN_SYSCTL_TMEM_OP_QUERY_FREEABLE_MB, -1, 0, 0, NULL);
+ node->freeable_mb = (rc < 0) ? 0 : rc;
/* malloc(0) is not portable, so allocate a single domain. This will
* be resized below. */
node->domains = malloc(sizeof(xenstat_domain));
@@ -657,6 +658,27 @@ static void xenstat_uninit_xen_version(xenstat_handle * handle)
* VBD functions
*/
+/* Save VBD information */
+xenstat_vbd *xenstat_save_vbd(xenstat_domain *domain, xenstat_vbd *vbd)
+{
+ xenstat_vbd *vbds = domain->vbds;
+
+ domain->num_vbds++;
+ domain->vbds = realloc(domain->vbds,
+ domain->num_vbds *
+ sizeof(xenstat_vbd));
+
+ if (domain->vbds == NULL) {
+ domain->num_vbds = 0;
+ free(vbds);
+ }
+ else {
+ domain->vbds[domain->num_vbds - 1] = *vbd;
+ }
+
+ return domain->vbds;
+}
+
/* Free VBD information */
static void xenstat_free_vbds(xenstat_node * node)
{
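
The new xenstat_save_vbd() helper above centralizes the grow-by-realloc logic that xenstat_linux.c used to open-code; on allocation failure it frees the old array and resets num_vbds so the domain is left in a consistent state. A caller sketch under those semantics (the field values are illustrative only):

    /* Illustrative only: append one VBD record to a domain. */
    static int add_one_vbd(xenstat_domain *domain)
    {
        xenstat_vbd vbd;

        memset(&vbd, 0, sizeof(vbd));
        vbd.dev = 51712;   /* hypothetical device id (xvda) */

        if (xenstat_save_vbd(domain, &vbd) == NULL) {
            perror("Allocation error");
            return 0;      /* array freed, num_vbds reset to 0 */
        }
        return 1;
    }
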
diff --git a/tools/xenstat/libxenstat/src/xenstat_linux.c b/tools/xenstat/libxenstat/src/xenstat_linux.c
index 7fdf70a..2cc9c7f 100644
--- a/tools/xenstat/libxenstat/src/xenstat_linux.c
+++ b/tools/xenstat/libxenstat/src/xenstat_linux.c
@@ -417,6 +417,9 @@ int xenstat_collect_vbds(xenstat_node * node)
}
}
+ /* Get qdisk statistics */
+ read_attributes_qdisk(node);
+
rewinddir(priv->sysfsvbd);
for(dp = readdir(priv->sysfsvbd); dp != NULL ;
@@ -477,18 +480,10 @@ int xenstat_collect_vbds(xenstat_node * node)
continue;
}
- if (domain->vbds == NULL) {
- domain->num_vbds = 1;
- domain->vbds = malloc(sizeof(xenstat_vbd));
- } else {
- domain->num_vbds++;
- domain->vbds = realloc(domain->vbds,
- domain->num_vbds *
- sizeof(xenstat_vbd));
- }
- if (domain->vbds == NULL)
+ if ((xenstat_save_vbd(domain, &vbd)) == NULL) {
+ perror("Allocation error");
return 0;
- domain->vbds[domain->num_vbds - 1] = vbd;
+ }
}
return 1;
diff --git a/tools/xenstat/libxenstat/src/xenstat_priv.h b/tools/xenstat/libxenstat/src/xenstat_priv.h
index 8490e23..74e0774 100644
--- a/tools/xenstat/libxenstat/src/xenstat_priv.h
+++ b/tools/xenstat/libxenstat/src/xenstat_priv.h
@@ -109,5 +109,7 @@ extern int xenstat_collect_networks(xenstat_node * node);
extern void xenstat_uninit_networks(xenstat_handle * handle);
extern int xenstat_collect_vbds(xenstat_node * node);
extern void xenstat_uninit_vbds(xenstat_handle * handle);
+extern void read_attributes_qdisk(xenstat_node * node);
+extern xenstat_vbd *xenstat_save_vbd(xenstat_domain * domain, xenstat_vbd * vbd);
#endif /* XENSTAT_PRIV_H */
diff --git a/tools/xenstat/libxenstat/src/xenstat_qmp.c b/tools/xenstat/libxenstat/src/xenstat_qmp.c
new file mode 100644
index 0000000..5e261af
--- /dev/null
+++ b/tools/xenstat/libxenstat/src/xenstat_qmp.c
@@ -0,0 +1,448 @@
+/* libxenstat: statistics-collection library for Xen
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/poll.h>
+#include <sys/un.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <xenctrl.h>
+
+#include "xenstat_priv.h"
+
+#ifdef HAVE_YAJL_YAJL_VERSION_H
+# include <yajl/yajl_version.h>
+#endif
+
+/* YAJL version check */
+#if defined(YAJL_MAJOR) && (YAJL_MAJOR > 1)
+# define HAVE_YAJL_V2 1
+#endif
+
+#ifdef HAVE_YAJL_V2
+
+#include <yajl/yajl_tree.h>
+
+static unsigned char *qmp_query(int, char *);
+
+enum query_blockstats {
+ QMP_STATS_RETURN = 0,
+ QMP_STATS_DEVICE = 1,
+ QMP_STATS = 2,
+ QMP_RD_BYTES = 3,
+ QMP_WR_BYTES = 4,
+ QMP_RD_OPERATIONS = 5,
+ QMP_WR_OPERATIONS = 6,
+};
+
+enum query_block {
+ QMP_BLOCK_RETURN = 0,
+ QMP_BLOCK_DEVICE = 1,
+ QMP_INSERTED = 2,
+ QMP_FILE = 3,
+};
+
+
+/* Given the qmp device name, get the image filename associated with it
+ QMP Syntax for querying block information:
+ In: { "execute": "query-block" }
+ Out: {"return": [{
+ "device": 'str, "locked": 'bool', "removable": bool,
+ "inserted": {
+ "iops_rd": 'int',
+ "image": {
+ "virtual-size": 'int', "filename": 'str', "cluster-size": 'int',
+ "format": 'str', "actual-size": 'int', "dirty-flag": 'bool'
+ },
+ "iops_wr": 'int', "ro": 'bool', "backing_file_depth": 'int',
+ "drv": 'str', "iops": 'int', "bps_wr": 'int', "encrypted": 'bool',
+ "bps": 'int', "bps_rd": 'int',
+ "file": 'str', "encryption_key_missing": 'bool'
+ },
+ "type": 'str'
+ }]}
+*/
+static char *qmp_get_block_image(xenstat_node *node, char *qmp_devname, int qfd)
+{
+ char *tmp, *file = NULL;
+ char *query_block_cmd = "{ \"execute\": \"query-block\" }";
+ static const char *const qblock[] = {
+ [ QMP_BLOCK_RETURN ] = "return",
+ [ QMP_BLOCK_DEVICE ] = "device",
+ [ QMP_INSERTED ] = "inserted",
+ [ QMP_FILE ] = "file",
+ };
+ const char *ptr[] = {0, 0};
+ unsigned char *qmp_stats;
+ yajl_val info, ret_obj, dev_obj, n;
+ int i;
+
+ if ((qmp_stats = qmp_query(qfd, query_block_cmd)) == NULL)
+ return NULL;
+
+ /* Use libyajl version 2.0.3 or newer for the tree parser feature with bug fixes */
+ info = yajl_tree_parse((char *)qmp_stats, NULL, 0);
+ free(qmp_stats);
+ if (info == NULL)
+ return NULL;
+
+ ptr[0] = qblock[QMP_BLOCK_RETURN]; /* "return" */
+ if ((ret_obj = yajl_tree_get(info, ptr, yajl_t_array)) == NULL)
+ goto done;
+
+ for (i=0; i<YAJL_GET_ARRAY(ret_obj)->len; i++) {
+ n = YAJL_GET_ARRAY(ret_obj)->values[i];
+
+ ptr[0] = qblock[QMP_BLOCK_DEVICE]; /* "device" */
+ if ((dev_obj = yajl_tree_get(n, ptr, yajl_t_any)) != NULL) {
+ tmp = YAJL_GET_STRING(dev_obj);
+ if (!tmp || strcmp(qmp_devname, tmp))
+ continue;
+ }
+ else
+ continue;
+
+ ptr[0] = qblock[QMP_INSERTED]; /* "inserted" */
+ n = yajl_tree_get(n, ptr, yajl_t_any);
+ if (n) {
+ ptr[0] = qblock[QMP_FILE]; /* "file" */
+ n = yajl_tree_get(n, ptr, yajl_t_any);
+ if (n && YAJL_IS_STRING(n)) {
+ tmp = YAJL_GET_STRING(n);
+ file = malloc(strlen(tmp)+1);
+ if (file != NULL)
+ strcpy(file, tmp);
+ goto done;
+ }
+ }
+ }
+done:
+ yajl_tree_free(info);
+ return file;
+}
+
+
+/* Given a QMP device name, lookup the associated xenstore qdisk device id */
+static void lookup_xenstore_devid(xenstat_node * node, unsigned int domid, char *qmp_devname,
+ int qfd, unsigned int *dev, unsigned int *sector_size)
+{
+ char **dev_ids, *tmp, *ptr, *image, path[80];
+ unsigned int num_dev_ids;
+ int i, devid;
+
+ /* Get all the qdisk dev IDs associated with this VM */
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i", domid);
+ dev_ids = xs_directory(node->handle->xshandle, XBT_NULL, path, &num_dev_ids);
+ if (dev_ids == NULL) {
+ return;
+ }
+
+ /* Get the filename of the image associated with this QMP device */
+ image = qmp_get_block_image(node, qmp_devname, qfd);
+ if (image == NULL) {
+ free(dev_ids);
+ return;
+ }
+
+ /* Look for a matching image in xenstore */
+ for (i=0; i<num_dev_ids; i++) {
+ devid = atoi(dev_ids[i]);
+ /* Get the xenstore name of the image */
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i/%i/params", domid, devid);
+ if ((ptr = xs_read(node->handle->xshandle, XBT_NULL, path, NULL)) == NULL)
+ continue;
+
+ /* Get to actual path in string */
+ if ((tmp = strchr(ptr, '/')) == NULL)
+ tmp = ptr;
+ if (!strcmp(tmp,image)) {
+ *dev = devid;
+ free(ptr);
+
+ /* Get the xenstore sector size of the image while we're here */
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i/%i/sector-size", domid, devid);
+ if ((ptr = xs_read(node->handle->xshandle, XBT_NULL, path, NULL)) != NULL) {
+ *sector_size = atoi((char *)ptr);
+ free(ptr);
+ }
+ break;
+ }
+ free(ptr);
+ }
+
+ free(image);
+ free(dev_ids);
+}
+
+/* Parse the stats buffer which contains I/O data for all the disks belonging to domid */
+static void qmp_parse_stats(xenstat_node *node, unsigned int domid, unsigned char *stats_buf, int qfd)
+{
+ char *qmp_devname;
+ static const char *const qstats[] = {
+ [ QMP_STATS_RETURN ] = "return",
+ [ QMP_STATS_DEVICE ] = "device",
+ [ QMP_STATS ] = "stats",
+ [ QMP_RD_BYTES ] = "rd_bytes",
+ [ QMP_WR_BYTES ] = "wr_bytes",
+ [ QMP_RD_OPERATIONS ] = "rd_operations",
+ [ QMP_WR_OPERATIONS ] = "wr_operations",
+ };
+ const char *ptr[] = {0, 0};
+ yajl_val info, ret_obj, stats_obj, n;
+ xenstat_vbd vbd;
+ xenstat_domain *domain;
+ unsigned int sector_size = 512;
+ int i, j;
+
+ /* Use libyajl version 2.0.3 or newer for the tree parser feature */
+ if ((info = yajl_tree_parse((char *)stats_buf, NULL, 0)) == NULL)
+ return;
+
+ ptr[0] = qstats[QMP_STATS_RETURN]; /* "return" */
+ if ((ret_obj = yajl_tree_get(info, ptr, yajl_t_array)) == NULL)
+ goto done;
+
+ /* Array of devices */
+ for (i=0; i<YAJL_GET_ARRAY(ret_obj)->len; i++) {
+ memset(&vbd, 0, sizeof(xenstat_vbd));
+ qmp_devname = NULL;
+ stats_obj = YAJL_GET_ARRAY(ret_obj)->values[i];
+
+ ptr[0] = qstats[QMP_STATS_DEVICE]; /* "device" */
+ if ((n = yajl_tree_get(stats_obj, ptr, yajl_t_any)) != NULL)
+ qmp_devname = YAJL_GET_STRING(n);
+
+ ptr[0] = qstats[QMP_STATS]; /* "stats" */
+ stats_obj = yajl_tree_get(stats_obj, ptr, yajl_t_object);
+ if (stats_obj && YAJL_IS_OBJECT(stats_obj)) {
+ for (j=3; j<7; j++) {
+ ptr[0] = qstats[j];
+ n = yajl_tree_get(stats_obj, ptr, yajl_t_number);
+ if (n && YAJL_IS_NUMBER(n)) {
+ switch(j) {
+ case QMP_RD_BYTES: /* "rd_bytes" */
+ vbd.rd_sects = YAJL_GET_INTEGER(n) / sector_size;
+ break;
+ case QMP_WR_BYTES: /* "wr_bytes" */
+ vbd.wr_sects = YAJL_GET_INTEGER(n) / sector_size;
+ break;
+ case QMP_RD_OPERATIONS: /* "rd_operations" */
+ vbd.rd_reqs = YAJL_GET_INTEGER(n);
+ break;
+ case QMP_WR_OPERATIONS: /* "wr_operations" */
+ vbd.wr_reqs = YAJL_GET_INTEGER(n);
+ break;
+ }
+ }
+ }
+ /* With the QMP device name, look up the xenstore qdisk device ID and set vbd.dev */
+ if (qmp_devname)
+ lookup_xenstore_devid(node, domid, qmp_devname, qfd, &vbd.dev, &sector_size);
+ if ((domain = xenstat_node_domain(node, domid)) == NULL)
+ continue;
+ if ((xenstat_save_vbd(domain, &vbd)) == NULL)
+ goto done;
+ }
+ }
+done:
+ yajl_tree_free(info);
+}
+
+/* Write a command via the QMP. Returns number of bytes written */
+static size_t qmp_write(int qfd, char *cmd, size_t cmd_len)
+{
+ size_t pos = 0;
+ ssize_t res;
+
+ while (cmd_len > pos) {
+ res = write(qfd, cmd + pos, cmd_len - pos);
+ switch (res) {
+ case -1:
+ if (errno == EINTR || errno == EAGAIN)
+ continue;
+ return 0;
+ case 0:
+ errno = EPIPE;
+ return pos;
+ default:
+ pos += (size_t)res;
+ }
+ }
+ return pos;
+}
+
+/* Read the data sent in response to a QMP execute query. Returns 1 for success */
+static int qmp_read(int qfd, unsigned char **qstats)
+{
+ unsigned char buf[1024], *ptr;
+ struct pollfd pfd[1];
+ int n, qsize = 0;
+
+ *qstats = NULL;
+ pfd[0].fd = qfd;
+ pfd[0].events = POLLIN;
+ while ((n = poll(pfd, 1, 10)) > 0) {
+ if (pfd[0].revents & POLLIN) {
+ if ((n = read(qfd, buf, sizeof(buf))) < 0) {
+ free(*qstats);
+ return 0;
+ }
+ ptr = realloc(*qstats, qsize+n+1);
+ if (ptr == NULL) {
+ free(*qstats);
+ return 0;
+ }
+ memcpy(&ptr[qsize], buf, n);
+ qsize += n;
+ ptr[qsize] = 0;
+ *qstats = ptr;
+ }
+ }
+ return 1;
+}
+
+/* With the given cmd, query QMP for requested data. Returns allocated buffer containing data or NULL */
+static unsigned char *qmp_query(int qfd, char *cmd)
+{
+ unsigned char *qstats = NULL;
+ int n;
+
+ n = strlen(cmd);
+ if (qmp_write(qfd, cmd, n) != n)
+ return NULL;
+ if (!qmp_read(qfd, &qstats))
+ return NULL;
+ return qstats;
+}
+
+/* Returns a socket connected to the QMP socket. Returns -1 on failure. */
+static int qmp_connect(char *path)
+{
+ struct sockaddr_un sun;
+ int s;
+
+ if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
+ return -1;
+ (void)fcntl(s, F_SETFD, 1);
+
+ memset(&sun, 0, sizeof(struct sockaddr_un));
+ sun.sun_family = AF_UNIX;
+
+ if (strlen(path) >= sizeof(sun.sun_path)) {
+ close(s);
+ return -1;
+ }
+
+ strcpy(sun.sun_path, path);
+ if (connect(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) < 0) {
+ close(s);
+ return -1;
+ }
+
+ return s;
+}
+
+/* Get up to 1024 active domains */
+static xc_domaininfo_t *get_domain_ids(xc_interface *xc_handle, int *num_doms)
+{
+ xc_domaininfo_t *dominfo;
+
+ dominfo = calloc(1024, sizeof(xc_domaininfo_t));
+ if (dominfo == NULL)
+ return NULL;
+ *num_doms = xc_domain_getinfolist(xc_handle, 0, 1024, dominfo);
+ return dominfo;
+}
+
+/* Gather the qdisk statistics by querying QMP
+ Resources: http://wiki.qemu.org/QMP and qmp-commands.hx from the qemu code
+ QMP Syntax for entering command mode. This command must be issued before
+ issuing any other command:
+ In: {"execute": "qmp_capabilities"}
+ Out: {"return": {}}
+ QMP Syntax for querying block statistics:
+ In: { "execute": "query-blockstats" }
+ Out: {"return": [{
+ "device": 'str',
+ "parent": {
+ "stats": {
+ "flush_total_time_ns": 'int', "wr_highest_offset": 'int',
+ "wr_total_time_ns": 'int', "wr_bytes": 'int',
+ "rd_total_time_ns": 'int', "flush_operations": 'int',
+ "wr_operations": 'int', "rd_bytes": 'int', "rd_operations": 'int'
+ }
+ },
+ "stats": {
+ "flush_total_time_ns": 'int', "wr_highest_offset": 'int',
+ "wr_total_time_ns": 'int', "wr_bytes": 'int',
+ "rd_total_time_ns": 'int', "flush_operations": 'int',
+ "wr_operations": 'int', "rd_bytes": 'int', "rd_operations": 'int'
+ }
+ }]}
+*/
+void read_attributes_qdisk(xenstat_node * node)
+{
+ char *cmd_mode = "{ \"execute\": \"qmp_capabilities\" }";
+ char *query_blockstats_cmd = "{ \"execute\": \"query-blockstats\" }";
+ xc_domaininfo_t *dominfo = NULL;
+ unsigned char *qmp_stats, *val;
+ char path[80];
+ int i, qfd, num_doms;
+
+ dominfo = get_domain_ids(node->handle->xc_handle, &num_doms);
+ if (dominfo == NULL)
+ return;
+
+ for (i=0; i<num_doms; i++) {
+ if (dominfo[i].domain <= 0)
+ continue;
+
+ /* Verify that qdisk disks are used with this VM */
+ snprintf(path, sizeof(path),"/local/domain/0/backend/qdisk/%i", dominfo[i].domain);
+ if ((val = xs_read(node->handle->xshandle, XBT_NULL, path, NULL)) == NULL)
+ continue;
+ free(val);
+
+ /* Connect to this VM's QMP socket */
+ snprintf(path, sizeof(path), "/var/run/xen/qmp-libxenstat-%i", dominfo[i].domain);
+ if ((qfd = qmp_connect(path)) < 0) {
+ continue;
+ }
+
+ /* First enable QMP capabilities so that we can query for data */
+ if ((qmp_stats = qmp_query(qfd, cmd_mode)) != NULL) {
+ free(qmp_stats);
+ /* Query QMP for this VM's blockstats */
+ if ((qmp_stats = qmp_query(qfd, query_blockstats_cmd)) != NULL) {
+ qmp_parse_stats(node, dominfo[i].domain, qmp_stats, qfd);
+ free(qmp_stats);
+ }
+ }
+ close(qfd);
+ }
+
+ free(dominfo);
+}
+
+#else /* !HAVE_YAJL_V2 */
+
+/* Statistics gathering for qdisks requires at least yajl v2 */
+void read_attributes_qdisk(xenstat_node * node)
+{
+}
+
+#endif /* !HAVE_YAJL_V2 */
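
Tying the new file together: a QMP session connects to the per-domain UNIX socket, must issue qmp_capabilities before any other command, and then exchanges JSON strings. A minimal sketch of that handshake built from the helpers defined above (the socket path follows the /var/run/xen/qmp-libxenstat-<domid> convention used in read_attributes_qdisk()):

    /* Sketch: open a QMP session for domid and return the raw
     * query-blockstats JSON, or NULL on any failure. Caller frees. */
    static unsigned char *fetch_blockstats(int domid)
    {
        char path[80];
        unsigned char *reply, *stats = NULL;
        int qfd;

        snprintf(path, sizeof(path), "/var/run/xen/qmp-libxenstat-%i", domid);
        if ((qfd = qmp_connect(path)) < 0)
            return NULL;

        /* QMP rejects queries until capabilities negotiation succeeds. */
        reply = qmp_query(qfd, "{ \"execute\": \"qmp_capabilities\" }");
        if (reply != NULL) {
            free(reply);
            stats = qmp_query(qfd, "{ \"execute\": \"query-blockstats\" }");
        }

        close(qfd);
        return stats;
    }
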
diff --git a/tools/xenstat/xentop/Makefile b/tools/xenstat/xentop/Makefile
index 076e44c..1cc393f 100644
--- a/tools/xenstat/xentop/Makefile
+++ b/tools/xenstat/xentop/Makefile
@@ -19,22 +19,20 @@ all install xentop:
else
CFLAGS += -DGCC_PRINTF -Werror $(CFLAGS_libxenstat)
-LDLIBS += $(LDLIBS_libxenstat) $(CURSES_LIBS) $(SOCKET_LIBS) -lm
+LDLIBS += $(LDLIBS_libxenstat) $(CURSES_LIBS) $(TINFO_LIBS) $(SOCKET_LIBS) -lm -lyajl
CFLAGS += -DHOST_$(XEN_OS)
-# Include configure output (config.h) to headers search path
-CFLAGS += -I$(XEN_ROOT)/tools
+# Include configure output (config.h)
+CFLAGS += -include $(XEN_ROOT)/tools/config.h
LDFLAGS += $(APPEND_LDFLAGS)
.PHONY: all
all: xentop
.PHONY: install
-install: xentop xentop.1
- $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
- $(INSTALL_PROG) xentop $(DESTDIR)$(SBINDIR)/xentop
- $(INSTALL_DIR) $(DESTDIR)$(MAN1DIR)
- $(INSTALL_DATA) xentop.1 $(DESTDIR)$(MAN1DIR)/xentop.1
+install: xentop
+ $(INSTALL_DIR) $(DESTDIR)$(sbindir)
+ $(INSTALL_PROG) xentop $(DESTDIR)$(sbindir)/xentop
endif
@@ -42,4 +40,7 @@ endif
clean:
rm -f xentop xentop.o $(DEPS)
+.PHONY: distclean
+distclean: clean
+
-include $(DEPS)
diff --git a/tools/xenstat/xentop/xentop.1 b/tools/xenstat/xentop/xentop.1
deleted file mode 100644
index 8f6ab01..0000000
--- a/tools/xenstat/xentop/xentop.1
+++ /dev/null
@@ -1,104 +0,0 @@
-.\" Copyright (C) International Business Machines Corp., 2005
-.\" Author: Josh Triplett <josh at kernel.org>
-.\"
-.\" This program is free software; you can redistribute it and/or modify
-.\" it under the terms of the GNU General Public License as published by
-.\" the Free Software Foundation; under version 2 of the License.
-.\"
-.\" This program is distributed in the hope that it will be useful,
-.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
-.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-.\" GNU General Public License for more details.
-.\"
-.\" You should have received a copy of the GNU General Public License
-.\" along with this program; if not, write to the Free Software
-.\" Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-.TH xentop 1 "August 2005"
-.SH NAME
-\fBxentop\fR \- displays real-time information about a Xen system and domains
-
-.SH SYNOPSIS
-.B xentop
-[\fB\-h\fR]
-[\fB\-V\fR]
-[\fB\-d\fRSECONDS]
-[\fB\-n\fR]
-[\fB\-r\fR]
-[\fB\-v\fR]
-[\fB\-f\fR]
-[\fB\-b\fR]
-[\fB\-i\fRITERATIONS]
-
-.SH DESCRIPTION
-\fBxentop\fR displays information about the Xen system and domains, in a
-continually-updating manner. Command-line options and interactive commands
-can change the detail and format of the information displayed by \fBxentop\fR.
-
-.SH OPTIONS
-.TP
-\fB\-h\fR, \fB\-\-help\fR
-display help and exit
-.TP
-\fB\-V\fR, \fB\-\-version\fR
-output version information and exit
-.TP
-\fB\-d\fR, \fB\-\-delay\fR=\fISECONDS\fR
-seconds between updates (default 3)
-.TP
-\fB\-n\fR, \fB\-\-networks\fR
-output network information
-.TP
-\fB\-x\fR, \fB\-\-vbds\fR
-output vbd block device data
-.TP
-\fB\-r\fR, \fB\-\-repeat\-header\fR
-repeat table header before each domain
-.TP
-\fB\-v\fR, \fB\-\-vcpus\fR
-output VCPU data
-.TP
-\fB\-f\fR, \fB\-\-full\-name\fR
-output the full domain name (not truncated)
-.TP
-\fB\-b\fR, \fB\-\-batch\fR
-output data in batch mode (to stdout)
-.TP
-\fB\-i\fR, \fB\-\-iterations\fR=\fIITERATIONS\fR
-maximum number of iterations xentop should produce before ending
-
-
-.SH "INTERACTIVE COMMANDS"
-All interactive commands are case-insensitive.
-.TP
-.B D
-set delay between updates
-.TP
-.B N
-toggle display of network information
-.TP
-.B Q, Esc
-quit
-.TP
-.B R
-toggle table header before each domain
-.TP
-.B S
-cycle sort order
-.TP
-.B V
-toggle display of VCPU information
-.TP
-.B Arrows
-scroll domain display
-
-.SH AUTHORS
-Written by Judy Fischbach, David Hendricks, and Josh Triplett
-
-.SH "REPORTING BUGS"
-Report bugs to <xen-devel at lists.xen.org>.
-
-.SH COPYRIGHT
-Copyright \(co 2005 International Business Machines Corp
-.br
-This is free software; see the source for copying conditions. There is NO
-warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
diff --git a/tools/xenstat/xentop/xentop.c b/tools/xenstat/xentop/xentop.c
index 3062cb5..2fd2b67 100644
--- a/tools/xenstat/xentop/xentop.c
+++ b/tools/xenstat/xentop/xentop.c
@@ -15,13 +15,9 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
-/* Include output from configure */
-#include <config.h>
-
/* get curses header from configure */
#include INCLUDE_CURSES_H
@@ -407,6 +403,7 @@ static int handle_key(int ch)
case KEY_BACKSPACE:
if(prompt_val_len > 0)
prompt_val[--prompt_val_len] = '\0';
+ break;
default:
if((prompt_val_len+1) < PROMPT_VAL_LEN
&& isprint(ch)) {
diff --git a/tools/xenstore/COPYING b/tools/xenstore/COPYING
index 5a6237b..c764b2e 100644
--- a/tools/xenstore/COPYING
+++ b/tools/xenstore/COPYING
@@ -494,8 +494,7 @@ notice is found.
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
diff --git a/tools/xenstore/Makefile b/tools/xenstore/Makefile
index 11b6a06..1b4a494 100644
--- a/tools/xenstore/Makefile
+++ b/tools/xenstore/Makefile
@@ -6,7 +6,8 @@ MINOR = 3
CFLAGS += -Werror
CFLAGS += -I.
-CFLAGS += -I$(XEN_ROOT)/tools/
+# Include configure output (config.h)
+CFLAGS += -include $(XEN_ROOT)/tools/config.h
CFLAGS += -I./include
CFLAGS += $(CFLAGS_libxenctrl)
CFLAGS += -DXEN_LIB_STORED="\"$(XEN_LIB_STORED)\""
@@ -115,6 +116,9 @@ clean:
rm -f xenstore $(CLIENTS)
$(RM) $(DEPS)
+.PHONY: distclean
+distclean: clean
+
.PHONY: TAGS
TAGS:
etags `find . -name '*.[ch]'`
@@ -125,37 +129,37 @@ tarball: clean
.PHONY: install
install: all
- $(INSTALL_DIR) $(DESTDIR)$(BINDIR)
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)/xenstore-compat
+ $(INSTALL_DIR) $(DESTDIR)$(bindir)
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)
+ $(INSTALL_DIR) $(DESTDIR)$(includedir)/xenstore-compat
ifeq ($(XENSTORE_XENSTORED),y)
- $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(sbindir)
$(INSTALL_DIR) $(DESTDIR)$(XEN_LIB_STORED)
- $(INSTALL_PROG) xenstored $(DESTDIR)$(SBINDIR)
+ $(INSTALL_PROG) xenstored $(DESTDIR)$(sbindir)
endif
- $(INSTALL_PROG) xenstore-control $(DESTDIR)$(BINDIR)
- $(INSTALL_PROG) xenstore $(DESTDIR)$(BINDIR)
+ $(INSTALL_PROG) xenstore-control $(DESTDIR)$(bindir)
+ $(INSTALL_PROG) xenstore $(DESTDIR)$(bindir)
set -e ; for c in $(CLIENTS) ; do \
- ln -f $(DESTDIR)$(BINDIR)/xenstore $(DESTDIR)$(BINDIR)/$${c} ; \
+ ln -f $(DESTDIR)$(bindir)/xenstore $(DESTDIR)$(bindir)/$${c} ; \
done
- $(INSTALL_DIR) $(DESTDIR)$(LIBDIR)
- $(INSTALL_SHLIB) libxenstore.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)
- ln -sf libxenstore.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxenstore.so.$(MAJOR)
- ln -sf libxenstore.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxenstore.so
- $(INSTALL_DATA) libxenstore.a $(DESTDIR)$(LIBDIR)
- $(INSTALL_DATA) include/xenstore.h $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_DATA) include/xenstore_lib.h $(DESTDIR)$(INCLUDEDIR)
- $(INSTALL_DATA) include/compat/xs.h $(DESTDIR)$(INCLUDEDIR)/xenstore-compat/xs.h
- $(INSTALL_DATA) include/compat/xs_lib.h $(DESTDIR)$(INCLUDEDIR)/xenstore-compat/xs_lib.h
- ln -sf xenstore-compat/xs.h $(DESTDIR)$(INCLUDEDIR)/xs.h
- ln -sf xenstore-compat/xs_lib.h $(DESTDIR)$(INCLUDEDIR)/xs_lib.h
+ $(INSTALL_DIR) $(DESTDIR)$(libdir)
+ $(INSTALL_SHLIB) libxenstore.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)
+ ln -sf libxenstore.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)/libxenstore.so.$(MAJOR)
+ ln -sf libxenstore.so.$(MAJOR) $(DESTDIR)$(libdir)/libxenstore.so
+ $(INSTALL_DATA) libxenstore.a $(DESTDIR)$(libdir)
+ $(INSTALL_DATA) include/xenstore.h $(DESTDIR)$(includedir)
+ $(INSTALL_DATA) include/xenstore_lib.h $(DESTDIR)$(includedir)
+ $(INSTALL_DATA) include/compat/xs.h $(DESTDIR)$(includedir)/xenstore-compat/xs.h
+ $(INSTALL_DATA) include/compat/xs_lib.h $(DESTDIR)$(includedir)/xenstore-compat/xs_lib.h
+ ln -sf xenstore-compat/xs.h $(DESTDIR)$(includedir)/xs.h
+ ln -sf xenstore-compat/xs_lib.h $(DESTDIR)$(includedir)/xs_lib.h
.PHONY: clients-install
clients-install: clients
- $(INSTALL_DIR) $(DESTDIR)$(BINDIR)
- $(INSTALL_PROG) xenstore $(DESTDIR)$(BINDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(bindir)
+ $(INSTALL_PROG) xenstore $(DESTDIR)$(bindir)
set -e ; for c in $(CLIENTS) ; do \
- ln -f $(DESTDIR)$(BINDIR)/xenstore $(DESTDIR)$(BINDIR)/$${c} ; \
+ ln -f $(DESTDIR)$(bindir)/xenstore $(DESTDIR)$(bindir)/$${c} ; \
done
-include $(DEPS)
diff --git a/tools/xenstore/include/xenstore.h b/tools/xenstore/include/xenstore.h
index b4b113e..42c0dc7 100644
--- a/tools/xenstore/include/xenstore.h
+++ b/tools/xenstore/include/xenstore.h
@@ -13,8 +13,7 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef XENSTORE_H
@@ -149,8 +148,31 @@ struct xs_permissions *xs_get_permissions(struct xs_handle *h,
xs_transaction_t t,
const char *path, unsigned int *num);
-/* Set permissions of node (must be owner).
- * Returns false on failure.
+/* Set permissions of node (must be owner). Returns false on failure.
+ *
+ * Domain 0 may read / write anywhere in the store, regardless of
+ * permission settings.
+ *
+ * Note:
+ * The perms array is a list of (domid, permissions) pairs. The first
+ * element in the list specifies the owner of the path, plus the flags
+ * for every domain not explicitly specified subsequently. The
+ * subsequent entries are normal capabilities.
+ *
+ * Example C code:
+ *
+ * struct xs_permissions perms[2];
+ *
+ * perms[0].id = dm_domid;
+ * perms[0].perms = XS_PERM_NONE;
+ * perms[1].id = guest_domid;
+ * perms[1].perms = XS_PERM_READ;
+ *
+ * This means the owner of the path is domain $dm_domid (so it always
+ * has read and write permission); every other domain, unless named
+ * in a subsequent pair, can neither read from nor write to the
+ * path. The second pair then grants domain $guest_domid read access
+ * to the path.
*/
bool xs_set_permissions(struct xs_handle *h, xs_transaction_t t,
const char *path, struct xs_permissions *perms,
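
To round off the example in the comment above: with the two-entry perms array filled in, the call passes the array and its length (h, path, dm_domid and guest_domid are assumed to be in scope; XBT_NULL means no transaction):

    /* Hypothetical usage: make dm_domid the owner and give
     * guest_domid read-only access to path. */
    struct xs_permissions perms[2];

    perms[0].id = dm_domid;
    perms[0].perms = XS_PERM_NONE;
    perms[1].id = guest_domid;
    perms[1].perms = XS_PERM_READ;

    if (!xs_set_permissions(h, XBT_NULL, path, perms, 2))
        perror("xs_set_permissions");   /* permissions left unchanged */
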
diff --git a/tools/xenstore/include/xenstore_lib.h b/tools/xenstore/include/xenstore_lib.h
index 5c2baf6..5a10c6c 100644
--- a/tools/xenstore/include/xenstore_lib.h
+++ b/tools/xenstore/include/xenstore_lib.h
@@ -13,8 +13,7 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef XENSTORE_LIB_H
diff --git a/tools/xenstore/talloc.c b/tools/xenstore/talloc.c
index a3d85e3..d7edcf3 100644
--- a/tools/xenstore/talloc.c
+++ b/tools/xenstore/talloc.c
@@ -22,8 +22,7 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
/*
@@ -1102,13 +1101,16 @@ char *talloc_vasprintf(const void *t, const char *fmt, va_list ap)
/* this call looks strange, but it makes it work on older solaris boxes */
if ((len = vsnprintf(&c, 1, fmt, ap2)) < 0) {
+ va_end(ap2);
return NULL;
}
+ va_end(ap2);
ret = _talloc(t, len+1);
if (ret) {
VA_COPY(ap2, ap);
vsnprintf(ret, len+1, fmt, ap2);
+ va_end(ap2);
talloc_set_name_const(ret, ret);
}
@@ -1162,8 +1164,10 @@ static char *talloc_vasprintf_append(char *s, const char *fmt, va_list ap)
* the original string. Most current callers of this
* function expect it to never return NULL.
*/
+ va_end(ap2);
return s;
}
+ va_end(ap2);
s = talloc_realloc(NULL, s, char, s_len + len+1);
if (!s) return NULL;
@@ -1171,6 +1175,7 @@ static char *talloc_vasprintf_append(char *s, const char *fmt, va_list ap)
VA_COPY(ap2, ap);
vsnprintf(s+s_len, len+1, fmt, ap2);
+ va_end(ap2);
talloc_set_name_const(s, s);
return s;
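
The talloc.c hunks all fix the same leak class: every VA_COPY(ap2, ap) must be paired with va_end(ap2) on every exit path, early returns included. The discipline as a standalone sketch of the measure-then-format idiom used by talloc_vasprintf() (plain standard C, not part of the patch):

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    char *xvasprintf(const char *fmt, va_list ap)
    {
        va_list ap2;
        char *buf;
        int len;

        va_copy(ap2, ap);
        len = vsnprintf(NULL, 0, fmt, ap2); /* measure */
        va_end(ap2);                        /* pair it, even on error */
        if (len < 0)
            return NULL;

        buf = malloc(len + 1);
        if (!buf)
            return NULL;

        va_copy(ap2, ap);
        vsnprintf(buf, len + 1, fmt, ap2);  /* format */
        va_end(ap2);
        return buf;
    }
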
diff --git a/tools/xenstore/talloc.h b/tools/xenstore/talloc.h
index 1001a95..c849bf6 100644
--- a/tools/xenstore/talloc.h
+++ b/tools/xenstore/talloc.h
@@ -21,8 +21,7 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
/* this is only needed for compatibility with the old talloc */
diff --git a/tools/xenstore/tdb.c b/tools/xenstore/tdb.c
index 3ecd3fc..0bb53a9 100644
--- a/tools/xenstore/tdb.c
+++ b/tools/xenstore/tdb.c
@@ -22,8 +22,7 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
diff --git a/tools/xenstore/tdb.h b/tools/xenstore/tdb.h
index 84d2df3..4187274 100644
--- a/tools/xenstore/tdb.h
+++ b/tools/xenstore/tdb.h
@@ -23,8 +23,7 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifdef __cplusplus
diff --git a/tools/xenstore/xenstore_client.c b/tools/xenstore/xenstore_client.c
index 1054f18..3d14d37 100644
--- a/tools/xenstore/xenstore_client.c
+++ b/tools/xenstore/xenstore_client.c
@@ -87,6 +87,7 @@ usage(enum mode mode, int incl_mode, const char *progname)
errx(1, "Usage: %s %s[-h] [-s] [-t] key [...]", progname, mstr);
case MODE_exists:
mstr = incl_mode ? "exists " : "";
+ /* fallthrough */
case MODE_list:
mstr = mstr ? : incl_mode ? "list " : "";
errx(1, "Usage: %s %s[-h] [-p] [-s] key [...]", progname, mstr);
diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
index 4eaff57..25a548d 100644
--- a/tools/xenstore/xenstored_core.c
+++ b/tools/xenstore/xenstored_core.c
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/types.h>
@@ -40,7 +39,6 @@
#include <signal.h>
#include <assert.h>
#include <setjmp.h>
-#include <config.h>
#include "utils.h"
#include "list.h"
@@ -89,9 +87,14 @@ static void check_store(void);
#define log(...) \
do { \
char *s = talloc_asprintf(NULL, __VA_ARGS__); \
- trace("%s\n", s); \
- syslog(LOG_ERR, "%s", s); \
- talloc_free(s); \
+ if (s) { \
+ trace("%s\n", s); \
+ syslog(LOG_ERR, "%s", s); \
+ talloc_free(s); \
+ } else { \
+ trace("talloc failure during logging\n"); \
+ syslog(LOG_ERR, "talloc failure during logging\n"); \
+ } \
} while (0)
@@ -1479,13 +1482,35 @@ static void manual_node(const char *name, const char *child)
talloc_free(node);
}
+static void tdb_logger(TDB_CONTEXT *tdb, int level, const char * fmt, ...)
+{
+ va_list ap;
+ char *s;
+
+ va_start(ap, fmt);
+ s = talloc_vasprintf(NULL, fmt, ap);
+ va_end(ap);
+
+ if (s) {
+ trace("TDB: %s\n", s);
+ syslog(LOG_ERR, "TDB: %s", s);
+ if (verbose)
+ xprintf("TDB: %s", s);
+ talloc_free(s);
+ } else {
+ trace("talloc failure during logging\n");
+ syslog(LOG_ERR, "talloc failure during logging\n");
+ }
+}
+
static void setup_structure(void)
{
char *tdbname;
tdbname = talloc_strdup(talloc_autofree_context(), xs_daemon_tdb());
if (!(tdb_flags & TDB_INTERNAL))
- tdb_ctx = tdb_open(tdbname, 0, tdb_flags, O_RDWR, 0);
+ tdb_ctx = tdb_open_ex(tdbname, 0, tdb_flags, O_RDWR, 0,
+ &tdb_logger, NULL);
if (tdb_ctx) {
/* XXX When we make xenstored able to restart, this will have
@@ -1516,8 +1541,8 @@ static void setup_structure(void)
talloc_free(tlocal);
}
else {
- tdb_ctx = tdb_open(tdbname, 7919, tdb_flags, O_RDWR|O_CREAT,
- 0640);
+ tdb_ctx = tdb_open_ex(tdbname, 7919, tdb_flags, O_RDWR|O_CREAT,
+ 0640, &tdb_logger, NULL);
if (!tdb_ctx)
barf_perror("Could not create tdb file %s", tdbname);
@@ -1756,7 +1781,10 @@ static int xs_validate_active_socket(const char *connect_to)
return xs_get_sd_fd(connect_to);
}
-static void xen_claim_active_sockets(int **psock, int **pro_sock)
+/* Return true if started by systemd and false if not. Exit with
+ * error if things go wrong.
+ */
+static bool systemd_checkin(int **psock, int **pro_sock)
{
int *sock, *ro_sock;
const char *soc_str = xs_daemon_socket();
@@ -1764,7 +1792,11 @@ static void xen_claim_active_sockets(int **psock, int **pro_sock)
int n;
n = sd_listen_fds(0);
- if (n <= 0) {
+
+ if (n == 0)
+ return false;
+
+ if (n < 0) {
sd_notifyf(0, "STATUS=Failed to get any active sockets: %s\n"
"ERRNO=%i",
strerror(errno),
@@ -1791,6 +1823,8 @@ static void xen_claim_active_sockets(int **psock, int **pro_sock)
talloc_set_destructor(sock, destroy_fd);
talloc_set_destructor(ro_sock, destroy_fd);
+
+ return true;
}
#endif
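
systemd_checkin() distinguishes three sd_listen_fds() outcomes: zero means the daemon was not socket-activated (fall back to init_sockets()), negative is a hard error, and positive means systemd already opened the listening sockets. A condensed sketch of that decision, assuming the sd_listen_fds(3) API (which returns a negative errno-style value on failure and hands over fds starting at SD_LISTEN_FDS_START):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <systemd/sd-daemon.h>

    /* Sketch: classify the socket-activation state at startup. */
    static bool socket_activated(int *first_fd, int *n_fds)
    {
        int n = sd_listen_fds(0);

        if (n == 0)
            return false;      /* not started by systemd: open our own */
        if (n < 0) {
            fprintf(stderr, "sd_listen_fds: %s\n", strerror(-n));
            exit(1);           /* hard error, as in systemd_checkin() */
        }
        *first_fd = SD_LISTEN_FDS_START;   /* fd 3 onward */
        *n_fds = n;
        return true;
    }
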
@@ -1852,21 +1886,21 @@ static void usage(void)
"\n"
"where options may include:\n"
"\n"
-" --no-domain-init to state that xenstored should not initialise dom0,\n"
-" --pid-file <file> giving a file for the daemon's pid to be written,\n"
-" --help to output this message,\n"
-" --no-fork to request that the daemon does not fork,\n"
-" --output-pid to request that the pid of the daemon is output,\n"
-" --trace-file <file> giving the file for logging, and\n"
-" --entry-nb <nb> limit the number of entries per domain,\n"
-" --entry-size <size> limit the size of entry per domain, and\n"
-" --watch-nb <nb> limit the number of watches per domain,\n"
-" --transaction <nb> limit the number of transaction allowed per domain,\n"
-" --no-recovery to request that no recovery should be attempted when\n"
-" the store is corrupted (debug only),\n"
-" --internal-db store database in memory, not on disk\n"
-" --preserve-local to request that /local is preserved on start-up,\n"
-" --verbose to request verbose execution.\n");
+" -D, --no-domain-init to state that xenstored should not initialise dom0,\n"
+" -F, --pid-file <file> giving a file for the daemon's pid to be written,\n"
+" -H, --help to output this message,\n"
+" -N, --no-fork to request that the daemon does not fork,\n"
+" -P, --output-pid to request that the pid of the daemon is output,\n"
+" -T, --trace-file <file> giving the file for logging, and\n"
+" -E, --entry-nb <nb> limit the number of entries per domain,\n"
+" -S, --entry-size <size> limit the size of entry per domain, and\n"
+" -W, --watch-nb <nb> limit the number of watches per domain,\n"
+" -t, --transaction <nb> limit the number of transaction allowed per domain,\n"
+" -R, --no-recovery to request that no recovery should be attempted when\n"
+" the store is corrupted (debug only),\n"
+" -I, --internal-db store database in memory, not on disk\n"
+" -L, --preserve-local to request that /local is preserved on start-up,\n"
+" -V, --verbose to request verbose execution.\n");
}
@@ -1897,13 +1931,16 @@ int priv_domid = 0;
int main(int argc, char *argv[])
{
- int opt, *sock, *ro_sock;
+ int opt, *sock = NULL, *ro_sock = NULL;
int sock_pollfd_idx = -1, ro_sock_pollfd_idx = -1;
bool dofork = true;
bool outputpid = false;
bool no_domain_init = false;
const char *pidfile = NULL;
int timeout;
+#if defined(XEN_SYSTEMD_ENABLED)
+ bool systemd;
+#endif
while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:T:RLVW:", options,
NULL)) != -1) {
@@ -1965,10 +2002,11 @@ int main(int argc, char *argv[])
barf("%s: No arguments desired", argv[0]);
#if defined(XEN_SYSTEMD_ENABLED)
- if (sd_booted()) {
+ systemd = systemd_checkin(&sock, &ro_sock);
+ if (systemd) {
dofork = false;
if (pidfile)
- barf("%s: PID file not needed on systemd", argv[0]);
+ xprintf("%s: PID file not needed on systemd", argv[0]);
pidfile = NULL;
}
#endif
@@ -1995,9 +2033,7 @@ int main(int argc, char *argv[])
signal(SIGPIPE, SIG_IGN);
#if defined(XEN_SYSTEMD_ENABLED)
- if (sd_booted())
- xen_claim_active_sockets(&sock, &ro_sock);
- else
+ if (!systemd)
#endif
init_sockets(&sock, &ro_sock);
@@ -2032,7 +2068,7 @@ int main(int argc, char *argv[])
xenbus_notify_running();
#if defined(XEN_SYSTEMD_ENABLED)
- if (sd_booted()) {
+ if (systemd) {
sd_notify(1, "READY=1");
fprintf(stderr, SD_NOTICE "xenstored is ready\n");
}
diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h
index dcf95b5..8c853c9 100644
--- a/tools/xenstore/xenstored_core.h
+++ b/tools/xenstore/xenstored_core.h
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XENSTORED_CORE_H
diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c
index 6d0394d..dcd6581 100644
--- a/tools/xenstore/xenstored_domain.c
+++ b/tools/xenstore/xenstored_domain.c
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
@@ -172,7 +171,7 @@ static void *map_interface(domid_t domid, unsigned long mfn)
GNTTAB_RESERVED_XENSTORE, PROT_READ|PROT_WRITE);
} else {
return xc_map_foreign_range(*xc_handle, domid,
- getpagesize(), PROT_READ|PROT_WRITE, mfn);
+ XC_PAGE_SIZE, PROT_READ|PROT_WRITE, mfn);
}
}
@@ -181,7 +180,7 @@ static void unmap_interface(void *interface)
if (*xcg_handle != NULL)
xc_gnttab_munmap(*xcg_handle, interface, 1);
else
- munmap(interface, getpagesize());
+ munmap(interface, XC_PAGE_SIZE);
}
static int destroy_domain(void *_domain)
diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h
index 9e2afae..83488ed 100644
--- a/tools/xenstore/xenstored_domain.h
+++ b/tools/xenstore/xenstored_domain.h
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XENSTORED_DOMAIN_H
diff --git a/tools/xenstore/xenstored_minios.c b/tools/xenstore/xenstored_minios.c
index f9c921e..b686e1c 100644
--- a/tools/xenstore/xenstored_minios.c
+++ b/tools/xenstore/xenstored_minios.c
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/types.h>
#include <sys/mman.h>
diff --git a/tools/xenstore/xenstored_posix.c b/tools/xenstore/xenstored_posix.c
index ecf4386..1f9603f 100644
--- a/tools/xenstore/xenstored_posix.c
+++ b/tools/xenstore/xenstored_posix.c
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/types.h>
diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c
index 50a32fb..d0e4739 100644
--- a/tools/xenstore/xenstored_transaction.c
+++ b/tools/xenstore/xenstored_transaction.c
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h
index b3cc9ac..cfeeae1 100644
--- a/tools/xenstore/xenstored_transaction.h
+++ b/tools/xenstore/xenstored_transaction.h
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XENSTORED_TRANSACTION_H
#define _XENSTORED_TRANSACTION_H
diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c
index 2ac498a..8543999 100644
--- a/tools/xenstore/xenstored_watch.c
+++ b/tools/xenstore/xenstored_watch.c
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
diff --git a/tools/xenstore/xenstored_watch.h b/tools/xenstore/xenstored_watch.h
index 6125ff3..5bc4f88 100644
--- a/tools/xenstore/xenstored_watch.h
+++ b/tools/xenstore/xenstored_watch.h
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XENSTORED_WATCH_H
diff --git a/tools/xenstore/xs.c b/tools/xenstore/xs.c
index 968141d..d1e01ba 100644
--- a/tools/xenstore/xs.c
+++ b/tools/xenstore/xs.c
@@ -13,8 +13,7 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/types.h>
diff --git a/tools/xenstore/xs_lib.c b/tools/xenstore/xs_lib.c
index 4795162..0c7744e 100644
--- a/tools/xenstore/xs_lib.c
+++ b/tools/xenstore/xs_lib.c
@@ -13,8 +13,7 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include <unistd.h>
@@ -82,6 +81,8 @@ const char *xs_domain_dev(void)
#if defined(__RUMPUSER_XEN__) || defined(__RUMPRUN__)
return "/dev/xen/xenbus";
#elif defined(__linux__)
+ if (access("/dev/xen/xenbus", F_OK) == 0)
+ return "/dev/xen/xenbus";
return "/proc/xen/xenbus";
#elif defined(__NetBSD__)
return "/kern/xen/xenbus";
diff --git a/tools/xenstore/xs_tdb_dump.c b/tools/xenstore/xs_tdb_dump.c
index b91cdef..9f636f9 100644
--- a/tools/xenstore/xs_tdb_dump.c
+++ b/tools/xenstore/xs_tdb_dump.c
@@ -33,6 +33,15 @@ static char perm_to_char(enum xs_perm_type perm)
'?';
}
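+
+/* Editor's note: tdb_open_ex() below expects a logging callback; this
+ * adapter simply forwards TDB's formatted messages to stderr. */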
+static void tdb_logger(TDB_CONTEXT *tdb, int level, const char * fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+}
+
int main(int argc, char *argv[])
{
TDB_DATA key;
@@ -41,7 +50,8 @@ int main(int argc, char *argv[])
if (argc != 2)
barf("Usage: xs_tdb_dump <tdbfile>");
- tdb = tdb_open(talloc_strdup(NULL, argv[1]), 0, 0, O_RDONLY, 0);
+ tdb = tdb_open_ex(talloc_strdup(NULL, argv[1]), 0, 0, O_RDONLY, 0,
+ &tdb_logger, NULL);
if (!tdb)
barf_perror("Could not open %s", argv[1]);
diff --git a/tools/xentrace/Makefile b/tools/xentrace/Makefile
index b821952..6c13cd1 100644
--- a/tools/xentrace/Makefile
+++ b/tools/xentrace/Makefile
@@ -4,35 +4,38 @@ include $(XEN_ROOT)/tools/Rules.mk
CFLAGS += -Werror
CFLAGS += $(CFLAGS_libxenctrl)
-LDLIBS += $(LDLIBS_libxenctrl)
+LDLIBS += $(LDLIBS_libxenctrl) $(ARGP_LDFLAGS)
-BIN = xentrace xentrace_setsize
+BIN-$(CONFIG_X86) = xenalyze
+BIN = $(BIN-y)
+SBIN = xentrace xentrace_setsize
LIBBIN = xenctx
SCRIPTS = xentrace_format
-MAN1 = $(wildcard *.1)
-MAN8 = $(wildcard *.8)
.PHONY: all
all: build
.PHONY: build
-build: $(BIN) $(LIBBIN)
+build: $(BIN) $(SBIN) $(LIBBIN)
.PHONY: install
install: build
- $(INSTALL_DIR) $(DESTDIR)$(BINDIR)
+ $(INSTALL_DIR) $(DESTDIR)$(bindir)
+ $(INSTALL_DIR) $(DESTDIR)$(sbindir)
[ -z "$(LIBBIN)" ] || $(INSTALL_DIR) $(DESTDIR)$(LIBEXEC_BIN)
- $(INSTALL_DIR) $(DESTDIR)$(MAN1DIR)
- $(INSTALL_DIR) $(DESTDIR)$(MAN8DIR)
- $(INSTALL_PROG) $(BIN) $(DESTDIR)$(BINDIR)
- $(INSTALL_PYTHON_PROG) $(SCRIPTS) $(DESTDIR)$(BINDIR)
+ifneq ($(BIN),)
+ $(INSTALL_PROG) $(BIN) $(DESTDIR)$(bindir)
+endif
+ $(INSTALL_PROG) $(SBIN) $(DESTDIR)$(sbindir)
+ $(INSTALL_PYTHON_PROG) $(SCRIPTS) $(DESTDIR)$(bindir)
[ -z "$(LIBBIN)" ] || $(INSTALL_PROG) $(LIBBIN) $(DESTDIR)$(LIBEXEC_BIN)
- $(INSTALL_DATA) $(MAN1) $(DESTDIR)$(MAN1DIR)
- $(INSTALL_DATA) $(MAN8) $(DESTDIR)$(MAN8DIR)
.PHONY: clean
clean:
- $(RM) *.a *.so *.o *.rpm $(BIN) $(LIBBIN) $(DEPS)
+ $(RM) *.a *.so *.o *.rpm $(BIN) $(SBIN) $(LIBBIN) $(DEPS)
+
+.PHONY: distclean
+distclean: clean
xentrace: xentrace.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS) $(APPEND_LDFLAGS)
@@ -43,5 +46,8 @@ xenctx: xenctx.o
xentrace_setsize: setsize.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS) $(APPEND_LDFLAGS)
+xenalyze: xenalyze.o mread.o
+ $(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS) $(APPEND_LDFLAGS)
+
-include $(DEPS)
diff --git a/tools/xentrace/analyze.h b/tools/xentrace/analyze.h
new file mode 100644
index 0000000..40ee551
--- /dev/null
+++ b/tools/xentrace/analyze.h
@@ -0,0 +1,107 @@
+#ifndef __ANALYZE_H
+# define __ANALYZE_H
+
+#include <stdint.h>
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+#define TRC_GEN_MAIN 0
+#define TRC_SCHED_MAIN 1
+#define TRC_DOM0OP_MAIN 2
+#define TRC_HVM_MAIN 3
+#define TRC_MEM_MAIN 4
+#define TRC_PV_MAIN 5
+#define TRC_SHADOW_MAIN 6
+#define TRC_HW_MAIN 7
+
+#define TRC_LOST_RECORDS_END (TRC_GEN + 50)
+
+#define NR_CPUS 128
+#if __x86_64__
+# define BITS_PER_LONG 64
+#else
+# define BITS_PER_LONG 32
+#endif
+
+#define BITS_TO_LONGS(bits) \
+ (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+#define DECLARE_BITMAP(name,bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+typedef struct cpumask{ DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
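+
+/* Illustrative helpers (editor's sketch, not part of the original patch):
+ * with the macros above, a cpumask works like a Linux-style bitmap.
+ * cpu_set()/cpu_isset() are hypothetical names showing the intended
+ * bit arithmetic. */
+static inline void cpu_set(int cpu, cpumask_t *mask)
+{
+ mask->bits[cpu / BITS_PER_LONG] |= 1UL << (cpu % BITS_PER_LONG);
+}
+static inline int cpu_isset(int cpu, const cpumask_t *mask)
+{
+ return (mask->bits[cpu / BITS_PER_LONG] >> (cpu % BITS_PER_LONG)) & 1;
+}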
+
+enum {
+ TRCE_SFLAG_SET_AD,
+ TRCE_SFLAG_SET_A,
+ TRCE_SFLAG_SHADOW_L1_GET_REF,
+ TRCE_SFLAG_SHADOW_L1_PUT_REF,
+ TRCE_SFLAG_L2_PROPAGATE,
+ TRCE_SFLAG_SET_CHANGED,
+ TRCE_SFLAG_SET_FLUSH,
+ TRCE_SFLAG_SET_ERROR,
+ TRCE_SFLAG_DEMOTE,
+ TRCE_SFLAG_PROMOTE,
+ TRCE_SFLAG_WRMAP,
+ TRCE_SFLAG_WRMAP_GUESS_FOUND,
+ TRCE_SFLAG_WRMAP_BRUTE_FORCE,
+ TRCE_SFLAG_EARLY_UNSHADOW,
+ TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN,
+ TRCE_SFLAG_EMULATION_LAST_FAILED,
+ TRCE_SFLAG_EMULATE_FULL_PT,
+ TRCE_SFLAG_PREALLOC_UNPIN,
+ TRCE_SFLAG_PREALLOC_UNHOOK
+};
+
+#define TRC_HVM_OP_DESTROY_PROC (TRC_HVM_HANDLER + 0x100)
+
+typedef unsigned long long tsc_t;
+
+/* -- on-disk trace buffer definitions -- */
+struct trace_record {
+ union {
+ struct {
+ unsigned event:28,
+ extra_words:3,
+ cycle_flag:1;
+ union {
+ struct {
+ uint32_t tsc_lo, tsc_hi;
+ uint32_t data[7];
+ } tsc;
+ struct {
+ uint32_t data[7];
+ } notsc;
+ } u;
+ };
+ uint32_t raw[8];
+ };
+};
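+
+/* Layout note (editor's addition): each record on disk is one 32-bit
+ * header word (event:28, extra_words:3, cycle_flag:1), followed by an
+ * optional 64-bit TSC when cycle_flag is set, followed by extra_words
+ * further 32-bit data words: 4 + (cycle_flag ? 8 : 0) + 4*extra_words
+ * bytes in total. */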
+
+/* -- General info about a current record -- */
+struct time_struct {
+ unsigned long long time;
+ unsigned int s, ns;
+};
+
+#define DUMP_HEADER_MAX 256
+
+struct record_info {
+ int cpu;
+ tsc_t tsc;
+ union {
+ unsigned event;
+ struct {
+ unsigned minor:12,
+ sub:4,
+ main:12,
+ unused:4;
+ } evt;
+ };
+ int extra_words;
+ int size;
+ uint32_t *d;
+ char dump_header[DUMP_HEADER_MAX];
+ struct time_struct t;
+ struct trace_record rec;
+};
+
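+/* Worked example (editor's note, not in the original patch): the evt
+ * bitfield splits the event word into minor (bits 0-11), sub (12-15)
+ * and main (16-27), with main used one-hot. For the IOPORT_READ event
+ * 0x00082016 from tools/xentrace/formats:
+ * evt.main = 0x008 (bit 3 set => TRC_HVM_MAIN)
+ * evt.sub = 0x2
+ * evt.minor = 0x016 */
+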
+#endif
diff --git a/tools/xentrace/formats b/tools/xentrace/formats
index da658bf..5d7b72a 100644
--- a/tools/xentrace/formats
+++ b/tools/xentrace/formats
@@ -68,10 +68,10 @@
0x00082014 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INVLPG [ is invlpga? = %(1)d, virt = 0x%(2)08x ]
0x00082114 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INVLPG [ is invlpga? = %(1)d, virt = 0x%(3)08x%(2)08x ]
0x00082015 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MCE
-0x00082016 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IOPORT_READ [ data = 0x%(1)04x ]
-0x00082216 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IOPORT_WRITE [ data = 0x%(1)04x ]
-0x00082017 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MMIO_READ [ data = 0x%(1)04x ]
-0x00082217 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MMIO_WRITE [ data = 0x%(1)04x ]
+0x00082016 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IOPORT_READ [ port = 0x%(1)04x, data = 0x%(2)08x ]
+0x00082216 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IOPORT_WRITE [ port = 0x%(1)04x, data = 0x%(2)08x ]
+0x00082017 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MMIO_READ [ port = 0x%(1)08x, data = 0x%(2)08x ]
+0x00082217 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MMIO_WRITE [ port = 0x%(1)08x, data = 0x%(2)08x ]
0x00082018 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CLTS
0x00082019 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) LMSW [ value = 0x%(1)08x ]
0x00082119 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) LMSW [ value = 0x%(2)08x%(1)08x ]
diff --git a/tools/xentrace/mread.c b/tools/xentrace/mread.c
new file mode 100644
index 0000000..a22c4ea
--- /dev/null
+++ b/tools/xentrace/mread.c
@@ -0,0 +1,160 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include "mread.h"
+
+mread_handle_t mread_init(int fd)
+{
+ struct stat s;
+ mread_handle_t h;
+
+ h=malloc(sizeof(struct mread_ctrl));
+
+ if (!h)
+ {
+ perror("malloc");
+ exit(1);
+ }
+
+ bzero(h, sizeof(struct mread_ctrl));
+
+ h->fd = fd;
+
+ fstat(fd, &s);
+ h->file_size = s.st_size;
+
+ return h;
+}
+
+ssize_t mread64(mread_handle_t h, void *rec, ssize_t len, off_t offset)
+{
+ /* Idea: have a "cache" of N mmaped regions. If the offset is
+ * in one of the regions, just copy it. If not, evict one of the
+ * regions and map the appropriate range.
+ *
+ * Basic algorithm:
+ * - See if the offset is in one of the regions
+ * - If not, map it
+ * - evict an old region
+ * - map the new region
+ * - Copy
+ */
+ char * b=NULL;
+ int bind=-1;
+ off_t boffset=0;
+ ssize_t bsize;
+
+#define dprintf(x...)
+//#define dprintf fprintf
+
+ dprintf(warn, "%s: offset %llx len %d\n", __func__,
+ offset, len);
+ if ( offset > h->file_size )
+ {
+ dprintf(warn, " offset > file size %llx, returning 0\n",
+ h->file_size);
+ return 0;
+ }
+ if ( offset + len > h->file_size )
+ {
+ dprintf(warn, " offset+len > file size %llx, truncating\n",
+ h->file_size);
+ len = h->file_size - offset;
+ }
+
+ /* Try to find the offset in our range */
+ dprintf(warn, " Trying last, %d\n", last);
+ if ( h->map[h->last].buffer
+ && (offset & MREAD_BUF_MASK) == h->map[h->last].start_offset )
+ {
+ bind=h->last;
+ goto copy;
+ }
+
+ /* Scan to see if it's anywhere else */
+ dprintf(warn, " Scanning\n");
+ for(bind=0; bind<MREAD_MAPS; bind++)
+ if ( h->map[bind].buffer
+ && (offset & MREAD_BUF_MASK) == h->map[bind].start_offset )
+ {
+ dprintf(warn, " Found, index %d\n", bind);
+ break;
+ }
+
+ /* If we didn't find it, evict someone and map it */
+ if ( bind == MREAD_MAPS )
+ {
+ dprintf(warn, " Clock\n");
+ while(1)
+ {
+ h->clock++;
+ if(h->clock >= MREAD_MAPS)
+ h->clock=0;
+ dprintf(warn, " %d\n", h->clock);
+ if(h->map[h->clock].buffer == NULL)
+ {
+ dprintf(warn, " Buffer null, using\n");
+ break;
+ }
+ if(!h->map[h->clock].accessed)
+ {
+ dprintf(warn, " Not accessed, using\n");
+ break;
+ }
+ h->map[h->clock].accessed=0;
+ }
+ if(h->map[h->clock].buffer)
+ {
+ dprintf(warn, " Unmapping\n");
+ munmap(h->map[h->clock].buffer, MREAD_BUF_SIZE);
+ }
+ /* FIXME: Try MAP_HUGETLB? */
+ /* FIXME: Make sure this works on large files... */
+ h->map[h->clock].start_offset = offset & MREAD_BUF_MASK;
+ dprintf(warn, " Mapping %llx from offset %llx\n",
+ MREAD_BUF_SIZE, h->map[h->clock].start_offset);
+ h->map[h->clock].buffer = mmap(NULL, MREAD_BUF_SIZE, PROT_READ,
+ MAP_SHARED,
+ h->fd,
+ h->map[h->clock].start_offset);
+ dprintf(warn, " mmap returned %p\n", h->map[h->clock].buffer);
+ if ( h->map[h->clock].buffer == MAP_FAILED )
+ {
+ h->map[h->clock].buffer = NULL;
+ perror("mmap");
+ exit(1);
+ }
+ bind = h->clock;
+ }
+
+ h->last=bind;
+copy:
+ h->map[bind].accessed=1;
+ b=h->map[bind].buffer;
+ boffset=offset - h->map[bind].start_offset;
+ if ( boffset + len > MREAD_BUF_SIZE )
+ bsize = MREAD_BUF_SIZE - boffset;
+ else
+ bsize = len;
+ dprintf(warn, " Using index %d, buffer at %p, buffer offset %llx len %d\n",
+ bind, b, boffset, bsize);
+
+ bcopy(b+boffset, rec, bsize);
+
+ /* Handle the boundary case; make sure this is done after any use of
+ * the per-handle map state above. */
+ if ( len > bsize )
+ {
+ dprintf(warn, " Finishing up by reading l %d o %llx\n",
+ len-bsize, offset+bsize);
+ mread64(h, rec+bsize, len-bsize, offset+bsize);
+ }
+
+ /* FIXME: ?? */
+ return len;
+#undef dprintf
+}
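+
+/* Illustrative usage (editor's sketch, not part of the original patch;
+ * kept un-compiled). Read a word from an arbitrary trace-file offset
+ * without a read()/lseek() pair per access: */
+#if 0
+static void mread_example(const char *path)
+{
+ int fd = open(path, O_RDONLY); /* assumes <fcntl.h> */
+ mread_handle_t h = mread_init(fd);
+ uint32_t hdr;
+
+ if ( mread64(h, &hdr, sizeof(hdr), 0x1000) == sizeof(hdr) )
+ printf("word at 0x1000: %08x\n", hdr);
+}
+#endif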
diff --git a/tools/xentrace/mread.h b/tools/xentrace/mread.h
new file mode 100644
index 0000000..443814f
--- /dev/null
+++ b/tools/xentrace/mread.h
@@ -0,0 +1,18 @@
+#define MREAD_MAPS 8
+#define MREAD_BUF_SHIFT 9
+#define PAGE_SHIFT 12
+#define MREAD_BUF_SIZE (1ULL<<(PAGE_SHIFT+MREAD_BUF_SHIFT))
+#define MREAD_BUF_MASK (~(MREAD_BUF_SIZE-1))
+typedef struct mread_ctrl {
+ int fd;
+ off_t file_size;
+ struct mread_buffer {
+ char * buffer;
+ off_t start_offset;
+ int accessed;
+ } map[MREAD_MAPS];
+ int clock, last;
+} *mread_handle_t;
+
+mread_handle_t mread_init(int fd);
+ssize_t mread64(mread_handle_t h, void *dst, ssize_t len, off_t offset);
diff --git a/tools/xentrace/pv.h b/tools/xentrace/pv.h
new file mode 100644
index 0000000..3e6ad77
--- /dev/null
+++ b/tools/xentrace/pv.h
@@ -0,0 +1,41 @@
+/*
+ * PV event decoding.
+ *
+ * Copyright (C) 2012 Citrix Systems R&D Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+#ifndef __PV_H
+
+#include "analyze.h"
+#include <xen/trace.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ARG_MISSING 0x0
+#define ARG_32BIT 0x1
+#define ARG_64BIT 0x2
+
+#define MMU_UPDATE_PREEMPTED (~(~0U>>1))
+
+static inline uint32_t pv_hypercall_op(const struct record_info *ri)
+{
+ return ri->d[0] & ~TRC_PV_HYPERCALL_V2_ARG_MASK;
+}
+
+static inline int pv_hypercall_arg_present(const struct record_info *ri, int arg)
+{
+ return (ri->d[0] >> (20 + 2*arg)) & 0x3;
+}
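+
+/* Worked example (editor's note, not in the original patch): each
+ * argument i of a V2 hypercall record carries a two-bit presence code
+ * at bits 20+2*i of d[0] (ARG_MISSING/ARG_32BIT/ARG_64BIT). With
+ * d[0] = 0x00600011, pv_hypercall_arg_present(ri, 1) evaluates
+ * (0x00600011 >> 22) & 0x3 = ARG_32BIT, i.e. argument 1 is present
+ * as a 32-bit value. */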
+
+void pv_hypercall_gather_args(const struct record_info *ri, uint64_t *args);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c
new file mode 100644
index 0000000..5a2735c
--- /dev/null
+++ b/tools/xentrace/xenalyze.c
@@ -0,0 +1,10407 @@
+/*
+ * xenalyze.c: Analyzing xentrace output
+ *
+ * Written by George Dunlap.
+ *
+ * Copyright (c) 2006-2007, XenSource Inc.
+ * Copyright (c) 2007-2008, Citrix Systems R&D Ltd, UK
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+#define _XOPEN_SOURCE 600
+#include <stdio.h>
+#include <stdlib.h>
+#include <argp.h>
+#include <inttypes.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <xen/trace.h>
+#include "analyze.h"
+#include "mread.h"
+#include "pv.h"
+#include <errno.h>
+#include <strings.h>
+#include <string.h>
+#include <assert.h>
+
+struct mread_ctrl;
+
+
+#define DEFAULT_CPU_HZ 2400000000LL
+#define QHZ_FROM_HZ(_hz) (((_hz) << 10)/ 1000000000)
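+/* Editor's note (not in the original patch): QHZ_FROM_HZ pre-computes a
+ * fixed-point "quantized Hz" so cycle counts can be turned into
+ * nanoseconds with a shift and a divide: ns = (cycles << 10) / qhz.
+ * For the default 2.4 GHz clock, qhz = (2400000000 << 10) / 10^9 = 2457,
+ * so 2400 cycles => (2400 << 10) / 2457 ~= 1000 ns (1 us), as expected. */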
+
+#define ADDR_SPACE_BITS 48
+#define DEFAULT_SAMPLE_SIZE 10240
+#define DEFAULT_INTERVAL_LENGTH 1000
+
+struct array_struct {
+ unsigned long long *values;
+ int count;
+};
+
+#define warn_once(_x...) \
+ do { \
+ static int _w=1; \
+ if ( _w ) { \
+ _w=0; \
+ fprintf(warn, ##_x); \
+ } \
+ } while(0)
+
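+/* Usage note (editor's addition): warn_once() keeps one static flag per
+ * call site, so a condition inside the record-processing loop is
+ * reported only the first time it fires at that site, e.g.
+ * warn_once("Unexpected record size %d!\n", ri->size); (hypothetical) */
+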
+/* -- Global variables -- */
+struct {
+ int fd;
+ struct mread_ctrl *mh;
+ struct symbol_struct * symbols;
+ char * symbol_file;
+ char * trace_file;
+ int output_defined;
+ off_t file_size;
+ struct {
+ off_t update_offset;
+ int pipe[2];
+ FILE* out;
+ int pid;
+ } progress;
+} G = {
+ .fd=-1,
+ .symbols = NULL,
+ .symbol_file = NULL,
+ .trace_file = NULL,
+ .output_defined = 0,
+ .file_size = 0,
+ .progress = { .update_offset = 0 },
+};
+
+/*
+ Kinds of errors:
+ Unexpected values
+ - RIP with information in high bits (not all 0 or 1)
+ - exit reason too high
+ Unexpected record layout
+ - x64 bit set in PIO,PV_PTWR_EMULATION_PAE,
+ - Unknown minor type (PV_PTWR_EMULATION, RUNSTATE_CHANGE
+ - Wrong record size
+ - More than one bit set in evt.main field
+ Unexpected sequences
+ - wake tsc tracking
+ - TSC dependency loop
+ - Mismatch between non-running old event states
+ - Runstate continue while running on another pcpu
+ - lost_record_end seen in non-lost pcpu
+ - Unexpected non-CPU_CHANGE record during new_pcpu scan
+ - record tsc < interval start tsc
+ - lost_record tsc !> order tsc
+ Limited resources
+ - interrupt interval slots
+ - record cpu > MAX_CPUS
+ Algorithm asserts
+ - Duplicate CR3/domain values
+ - Logic holes
+ - domain runstates
+ - runstate / tsc skew
+ - vcpu_{prev,next}_update p->current{==,!=}null
+ - vcpu start conditions
+ - lost_cpu count higher than # of seen cpus / < 0
+ - lost cpu has non-null p->current
+ Symbol file
+ -file doesn't open
+ -file not ordered
+ System
+ - short read
+ - malloc failed
+ Args
+ - Invalid cpu_hz value / suffix
+ - No trace file
+ - Can't open trace file
+*/
+enum error_level {
+ ERR_NONE=0,
+ ERR_STRICT, /* Be unreasonably picky */
+ ERR_WARN, /* Something mildly unexpected */
+ ERR_SANITY, /* Sanity checks: RIP with info in high bits */
+ ERR_RECORD, /* Something that keeps you from processing the record */
+ ERR_FILE, /* Probably caused by a corrupt file */
+ ERR_LIMIT, /* Exceeded limits; data will be lost */
+ ERR_MAX_TOLERABLE=ERR_LIMIT,
+ /* -- Unrecoverable past this point -- */
+ ERR_ASSERT, /* Algorithm assert */
+ ERR_SYSTEM, /* System error: cannot allocate memory, short read, &c */
+};
+
+int verbosity = 5;
+
+struct {
+ unsigned
+ scatterplot_interrupt_eip:1,
+ scatterplot_cpi:1,
+ scatterplot_unpin_promote:1,
+ scatterplot_cr3_switch:1,
+ scatterplot_wake_to_halt:1,
+ scatterplot_io:1,
+ scatterplot_vmexit_eip:1,
+ scatterplot_runstate:1,
+ scatterplot_runstate_time:1,
+ scatterplot_pcpu:1,
+ scatterplot_extint_cycles:1,
+ scatterplot_rdtsc:1,
+ scatterplot_irq:1,
+ histogram_interrupt_eip:1,
+ interval_mode:1,
+ dump_all:1,
+ dump_raw_process:1,
+ dump_raw_reads:1,
+ dump_no_processing:1,
+ dump_ipi_latency:1,
+ dump_trace_volume_on_lost_record:1,
+ dump_show_power_states:1,
+ with_cr3_enumeration:1,
+ with_pio_enumeration:1,
+ with_mmio_enumeration:1,
+ with_interrupt_eip_enumeration:1,
+ show_default_domain_summary:1,
+ mmio_enumeration_skip_vga:1,
+ progress:1,
+ svm_mode:1,
+ summary:1,
+ report_pcpu:1,
+ tsc_loop_fatal:1,
+ summary_info;
+ long long cpu_qhz, cpu_hz;
+ int scatterplot_interrupt_vector;
+ int scatterplot_extint_cycles_vector;
+ int scatterplot_io_port;
+ int histogram_interrupt_vector;
+ unsigned long long histogram_interrupt_increment;
+ int interrupt_eip_enumeration_vector;
+ int default_guest_paging_levels;
+ int sample_size;
+ enum error_level tolerance; /* Tolerate up to this level of error */
+ struct {
+ tsc_t cycles;
+ /* Used if interval is specified in seconds to delay calculating
+ * time_interval until all arguments have been processed (specifically,
+ * cpu_hz). */
+ unsigned msec;
+ enum {
+ INTERVAL_CR3_SCHEDULE_TIME,
+ INTERVAL_CR3_SCHEDULE_ORDERED,
+ INTERVAL_CR3_SHORT_SUMMARY,
+ INTERVAL_DOMAIN_TOTAL_TIME,
+ INTERVAL_DOMAIN_SHORT_SUMMARY,
+ INTERVAL_DOMAIN_GUEST_INTERRUPT,
+ INTERVAL_DOMAIN_GRANT_MAPS
+ } output;
+ enum {
+ INTERVAL_MODE_CUSTOM,
+ INTERVAL_MODE_ARRAY,
+ INTERVAL_MODE_LIST
+ } mode;
+ enum {
+ INTERVAL_CHECK_NONE,
+ INTERVAL_CHECK_CR3,
+ INTERVAL_CHECK_DOMAIN
+ } check;
+ /* Options for specific interval output types */
+ union {
+ struct array_struct array;
+ };
+ int count;
+ } interval;
+} opt = {
+ .scatterplot_interrupt_eip=0,
+ .scatterplot_cpi=0,
+ .scatterplot_unpin_promote=0,
+ .scatterplot_cr3_switch=0,
+ .scatterplot_wake_to_halt=0,
+ .scatterplot_vmexit_eip=0,
+ .scatterplot_runstate=0,
+ .scatterplot_runstate_time=0,
+ .scatterplot_pcpu=0,
+ .scatterplot_extint_cycles=0,
+ .scatterplot_rdtsc=0,
+ .scatterplot_irq=0,
+ .histogram_interrupt_eip=0,
+ .dump_all = 0,
+ .dump_raw_process = 0,
+ .dump_raw_reads = 0,
+ .dump_no_processing = 0,
+ .dump_ipi_latency = 0,
+ .dump_trace_volume_on_lost_record = 0,
+ .dump_show_power_states = 0,
+ .with_cr3_enumeration = 0,
+ .with_pio_enumeration = 1,
+ .with_mmio_enumeration = 0,
+ .with_interrupt_eip_enumeration = 0,
+ .show_default_domain_summary = 0,
+ .mmio_enumeration_skip_vga = 1,
+ .progress = 0,
+ .svm_mode = 0,
+ .summary = 0,
+ .report_pcpu = 0,
+ .tsc_loop_fatal = 0,
+ .cpu_hz = DEFAULT_CPU_HZ,
+ /* Pre-calculate a multiplier that makes the rest of the
+ * calculations easier */
+ .cpu_qhz = QHZ_FROM_HZ(DEFAULT_CPU_HZ),
+ .default_guest_paging_levels = 2,
+ .sample_size = DEFAULT_SAMPLE_SIZE,
+ .tolerance = ERR_SANITY,
+ .interval = { .msec = DEFAULT_INTERVAL_LENGTH },
+};
+
+FILE *warn = NULL;
+
+/* -- Summary data -- */
+struct cycle_framework {
+ tsc_t first_tsc, last_tsc, total_cycles;
+};
+
+struct interval_element {
+ int count;
+ long long cycles;
+ long long instructions;
+};
+
+struct event_cycle_summary {
+ int count, cycles_count;
+ long long cycles;
+ long long *cycles_sample;
+ struct interval_element interval;
+};
+
+struct cycle_summary {
+ int count;
+ unsigned long long cycles;
+ long long *sample;
+ struct interval_element interval;
+};
+
+struct weighted_cpi_summary {
+ int count;
+ unsigned long long instructions;
+ unsigned long long cycles;
+ float *cpi;
+ unsigned long long *cpi_weight;
+ struct interval_element interval;
+};
+
+/* -- Symbol list information -- */
+#define SYMBOL_ENTRIES_PER_STRUCT 1023
+#define SYMBOL_NAME_SIZE 124
+struct symbol_struct {
+ int count;
+ struct {
+ unsigned long long addr;
+ char name[SYMBOL_NAME_SIZE];
+ } symbols[SYMBOL_ENTRIES_PER_STRUCT];
+ struct symbol_struct *next;
+};
+
+void error(enum error_level l, struct record_info *ri);
+
+void parse_symbol_file(char *fn) {
+ unsigned long long last_addr = 0;
+ FILE * symbol_file;
+ struct symbol_struct ** p=&G.symbols;
+
+ if((symbol_file=fopen(fn, "rb"))==NULL) {
+ fprintf(stderr, "Could not open symbol file %s\n", fn);
+ perror("open");
+ error(ERR_SYSTEM, NULL);
+ }
+ while(!feof(symbol_file)) {
+ /* Allocate a new struct if we need it */
+ if(!*p) {
+ *p = malloc(sizeof(**p));
+ if(!*p) {
+ fprintf(stderr, "Malloc failed!\n");
+ error(ERR_SYSTEM, NULL);
+ }
+ (*p)->count=0;
+ (*p)->next=NULL;
+ }
+
+ /* FIXME -- use SYMBOL_NAME_SIZE */
+ /* FIXME -- use regexp. This won't work for symbols with spaces (yes they exist) */
+ (*p)->symbols[(*p)->count].addr = 0xDEADBEEF;
+ if ( fscanf(symbol_file, "%llx %123s",
+ &(*p)->symbols[(*p)->count].addr,
+ (*p)->symbols[(*p)->count].name) == 0 )
+ break;
+
+
+ if( ((*p)->symbols[(*p)->count].addr > 0)
+ && ((*p)->symbols[(*p)->count].addr < last_addr) ) {
+ fprintf(stderr, "Symbol file not properly ordered: %llx %s < %llx!\n",
+ (*p)->symbols[(*p)->count].addr,
+ (*p)->symbols[(*p)->count].name,
+ last_addr);
+ /* Could be recovered from; just free existing strings and set symbols to NULL */
+ error(ERR_ASSERT, NULL);
+ } else
+ last_addr = (*p)->symbols[(*p)->count].addr;
+
+ (*p)->count++;
+
+ /* If this struct is full, point to the next. It will be allocated
+ if needed. */
+ if((*p)->count == SYMBOL_ENTRIES_PER_STRUCT) {
+ p=&((*p)->next);
+ }
+ }
+}
+
+/* WARNING not thread safe */
+char * find_symbol(unsigned long long addr) {
+ struct symbol_struct * p=G.symbols;
+ int i;
+ char * lastname="ZERO";
+ unsigned long long offset=addr;
+ static char name[128];
+
+ if(!p) {
+ name[0]=0;
+ return name;
+ }
+
+ while(1) {
+ if(!p)
+ goto finish;
+ for(i=0; i<p->count; i++) {
+ if(p->symbols[i].addr > addr)
+ goto finish;
+ else {
+ lastname=p->symbols[i].name;
+ offset=addr - p->symbols[i].addr;
+ }
+ }
+ p=p->next;
+ }
+ finish:
+ snprintf(name, 128, "(%s +%llx)",
+ lastname, offset);
+ return name;
+}
+
+/* -- Eip list data -- */
+enum {
+ EIP_LIST_TYPE_NONE=0,
+ EIP_LIST_TYPE_MAX
+};
+
+struct eip_list_struct {
+ struct eip_list_struct *next;
+ unsigned long long eip;
+ struct event_cycle_summary summary;
+ int type;
+ void * extra;
+};
+
+struct {
+ void (*update)(struct eip_list_struct *, void *);
+ void (*new)(struct eip_list_struct *, void *);
+ void (*dump)(struct eip_list_struct *);
+} eip_list_type[EIP_LIST_TYPE_MAX] = {
+ [EIP_LIST_TYPE_NONE] = {
+ .update=NULL,
+ .new=NULL,
+ .dump=NULL },
+};
+
+
+/* --- HVM class of events --- */
+
+/*
+ * -- Algorithms --
+ *
+ * Interrupt Wake-to-halt detection
+ *
+ * Purpose: To correlate device interrupts to vcpu runtime.
+ *
+ * Diagram:
+ * ...
+ * blocked -> runnable <- set to waking
+ * ...
+ * runnable -> running
+ * inj_virq A <- Note "waking" interrupt
+ * vmenter <- Start tsc of "wake-to-halt" interval.
+ * Turn off 'waking'.
+ * ...
+ * inj_virq B <- Note alternate interrupt
+ * vmenter <- Start tsc of "interrupt-to-halt" interval
+ * ...
+ * vmexit <- End tsc of "x-to-halt" interval
+ * running -> blocked <- Process
+ *
+ * The "waking" interrupts we want to sub-classify into
+ * "wake-only" (when interrupt was the only interrupt from wake to halt) and
+ * "wake-all" (whether this was the only interrupt or not).
+ */
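+
+/* Minimal sketch of the bookkeeping described above (editor's addition,
+ * kept un-compiled -- the real state lives in struct hvm_data.w2h
+ * further down). On inj_virq, note the vector; on the first interrupt
+ * after a wake, also set 'waking' so the interval can be classified as
+ * wake-to-halt rather than interrupt-to-halt. */
+#if 0
+static void w2h_note_virq(struct hvm_data *h, int vector, int just_woke)
+{
+ h->w2h.vector = vector;
+ h->w2h.interrupts++;
+ if ( just_woke )
+ h->w2h.waking = 1;
+}
+#endif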
+
+/* VMX data */
+#define EXIT_REASON_EXCEPTION_NMI 0
+#define EXIT_REASON_EXTERNAL_INTERRUPT 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT 3
+#define EXIT_REASON_SIPI 4
+#define EXIT_REASON_IO_SMI 5
+#define EXIT_REASON_OTHER_SMI 6
+#define EXIT_REASON_PENDING_INTERRUPT 7
+#define EXIT_REASON_PENDING_VIRT_NMI 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_GETSEC 11
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_RSM 17
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMOFF 26
+#define EXIT_REASON_VMON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_IO_INSTRUCTION 30
+#define EXIT_REASON_MSR_READ 31
+#define EXIT_REASON_MSR_WRITE 32
+#define EXIT_REASON_INVALID_GUEST_STATE 33
+#define EXIT_REASON_MSR_LOADING 34
+#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_TRAP_FLAG 37
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION 40
+#define EXIT_REASON_MACHINE_CHECK 41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_ACCESS_GDTR_OR_IDTR 46
+#define EXIT_REASON_ACCESS_LDTR_OR_TR 47
+#define EXIT_REASON_EPT_VIOLATION 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+
+#define HVM_VMX_EXIT_REASON_MAX (EXIT_REASON_XSETBV+1)
+
+char * hvm_vmx_exit_reason_name[HVM_VMX_EXIT_REASON_MAX] = {
+ [0] = "NONE",
+ [EXIT_REASON_EXCEPTION_NMI]="EXCEPTION_NMI",
+ [EXIT_REASON_EXTERNAL_INTERRUPT]="EXTERNAL_INTERRUPT",
+ [EXIT_REASON_TRIPLE_FAULT]="TRIPLE_FAULT",
+ [EXIT_REASON_INIT]="INIT",
+ [EXIT_REASON_SIPI]="SIPI",
+ [EXIT_REASON_IO_SMI]="IO_SMI",
+ [EXIT_REASON_OTHER_SMI]="OTHER_SMI",
+ [EXIT_REASON_PENDING_INTERRUPT]="PENDING_INTERRUPT",
+ [EXIT_REASON_PENDING_VIRT_NMI]="PENDING_VIRT_NMI",
+ [EXIT_REASON_TASK_SWITCH]="TASK_SWITCH",
+ [EXIT_REASON_CPUID]="CPUID",
+ [EXIT_REASON_GETSEC]="GETSEC",
+ [EXIT_REASON_HLT]="HLT",
+ [EXIT_REASON_INVD]="INVD",
+ [EXIT_REASON_INVLPG]="INVLPG",
+ [EXIT_REASON_RDPMC]="RDPMC",
+ [EXIT_REASON_RDTSC]="RDTSC",
+ [EXIT_REASON_RSM]="RSM",
+ [EXIT_REASON_VMCALL]="VMCALL",
+ [EXIT_REASON_VMCLEAR]="VMCLEAR",
+ [EXIT_REASON_VMLAUNCH]="VMLAUNCH",
+ [EXIT_REASON_VMPTRLD]="VMPTRLD",
+ [EXIT_REASON_VMPTRST]="VMPTRST",
+ [EXIT_REASON_VMREAD]="VMREAD",
+ [EXIT_REASON_VMRESUME]="VMRESUME",
+ [EXIT_REASON_VMWRITE]="VMWRITE",
+ [EXIT_REASON_VMOFF]="VMOFF",
+ [EXIT_REASON_VMON]="VMON",
+ [EXIT_REASON_CR_ACCESS]="CR_ACCESS",
+ [EXIT_REASON_DR_ACCESS]="DR_ACCESS",
+ [EXIT_REASON_IO_INSTRUCTION]="IO_INSTRUCTION",
+ [EXIT_REASON_MSR_READ]="MSR_READ",
+ [EXIT_REASON_MSR_WRITE]="MSR_WRITE",
+ [EXIT_REASON_INVALID_GUEST_STATE]="INVALID_GUEST_STATE",
+ [EXIT_REASON_MSR_LOADING]="MSR_LOADING",
+ [EXIT_REASON_MWAIT_INSTRUCTION]="MWAIT_INSTRUCTION",
+ [EXIT_REASON_MONITOR_TRAP_FLAG]="MONITOR_TRAP_FLAG",
+ [EXIT_REASON_MONITOR_INSTRUCTION]="MONITOR_INSTRUCTION",
+ [EXIT_REASON_PAUSE_INSTRUCTION]="PAUSE_INSTRUCTION",
+ [EXIT_REASON_MACHINE_CHECK]="MACHINE_CHECK",
+ [EXIT_REASON_TPR_BELOW_THRESHOLD]="TPR_BELOW_THRESHOLD",
+ [EXIT_REASON_APIC_ACCESS]="APIC_ACCESS",
+ [EXIT_REASON_EPT_VIOLATION]="EPT_VIOLATION",
+ [EXIT_REASON_EPT_MISCONFIG]="EPT_MISCONFIG",
+ [EXIT_REASON_INVEPT]="INVEPT",
+ [EXIT_REASON_RDTSCP]="RDTSCP",
+ [EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED]="VMX_PREEMPTION_TIMER_EXPIRED",
+ [EXIT_REASON_INVVPID]="INVVPID",
+ [EXIT_REASON_WBINVD]="WBINVD",
+ [EXIT_REASON_XSETBV]="XSETBV",
+};
+
+/* SVM data */
+enum VMEXIT_EXITCODE
+{
+ /* control register read exitcodes */
+ VMEXIT_CR0_READ = 0,
+ VMEXIT_CR1_READ = 1,
+ VMEXIT_CR2_READ = 2,
+ VMEXIT_CR3_READ = 3,
+ VMEXIT_CR4_READ = 4,
+ VMEXIT_CR5_READ = 5,
+ VMEXIT_CR6_READ = 6,
+ VMEXIT_CR7_READ = 7,
+ VMEXIT_CR8_READ = 8,
+ VMEXIT_CR9_READ = 9,
+ VMEXIT_CR10_READ = 10,
+ VMEXIT_CR11_READ = 11,
+ VMEXIT_CR12_READ = 12,
+ VMEXIT_CR13_READ = 13,
+ VMEXIT_CR14_READ = 14,
+ VMEXIT_CR15_READ = 15,
+
+ /* control register write exitcodes */
+ VMEXIT_CR0_WRITE = 16,
+ VMEXIT_CR1_WRITE = 17,
+ VMEXIT_CR2_WRITE = 18,
+ VMEXIT_CR3_WRITE = 19,
+ VMEXIT_CR4_WRITE = 20,
+ VMEXIT_CR5_WRITE = 21,
+ VMEXIT_CR6_WRITE = 22,
+ VMEXIT_CR7_WRITE = 23,
+ VMEXIT_CR8_WRITE = 24,
+ VMEXIT_CR9_WRITE = 25,
+ VMEXIT_CR10_WRITE = 26,
+ VMEXIT_CR11_WRITE = 27,
+ VMEXIT_CR12_WRITE = 28,
+ VMEXIT_CR13_WRITE = 29,
+ VMEXIT_CR14_WRITE = 30,
+ VMEXIT_CR15_WRITE = 31,
+
+ /* debug register read exitcodes */
+ VMEXIT_DR0_READ = 32,
+ VMEXIT_DR1_READ = 33,
+ VMEXIT_DR2_READ = 34,
+ VMEXIT_DR3_READ = 35,
+ VMEXIT_DR4_READ = 36,
+ VMEXIT_DR5_READ = 37,
+ VMEXIT_DR6_READ = 38,
+ VMEXIT_DR7_READ = 39,
+ VMEXIT_DR8_READ = 40,
+ VMEXIT_DR9_READ = 41,
+ VMEXIT_DR10_READ = 42,
+ VMEXIT_DR11_READ = 43,
+ VMEXIT_DR12_READ = 44,
+ VMEXIT_DR13_READ = 45,
+ VMEXIT_DR14_READ = 46,
+ VMEXIT_DR15_READ = 47,
+
+ /* debug register write exitcodes */
+ VMEXIT_DR0_WRITE = 48,
+ VMEXIT_DR1_WRITE = 49,
+ VMEXIT_DR2_WRITE = 50,
+ VMEXIT_DR3_WRITE = 51,
+ VMEXIT_DR4_WRITE = 52,
+ VMEXIT_DR5_WRITE = 53,
+ VMEXIT_DR6_WRITE = 54,
+ VMEXIT_DR7_WRITE = 55,
+ VMEXIT_DR8_WRITE = 56,
+ VMEXIT_DR9_WRITE = 57,
+ VMEXIT_DR10_WRITE = 58,
+ VMEXIT_DR11_WRITE = 59,
+ VMEXIT_DR12_WRITE = 60,
+ VMEXIT_DR13_WRITE = 61,
+ VMEXIT_DR14_WRITE = 62,
+ VMEXIT_DR15_WRITE = 63,
+
+ /* processor exception exitcodes (VMEXIT_EXCP[0-31]) */
+ VMEXIT_EXCEPTION_DE = 64, /* divide-by-zero-error */
+ VMEXIT_EXCEPTION_DB = 65, /* debug */
+ VMEXIT_EXCEPTION_NMI = 66, /* non-maskable-interrupt */
+ VMEXIT_EXCEPTION_BP = 67, /* breakpoint */
+ VMEXIT_EXCEPTION_OF = 68, /* overflow */
+ VMEXIT_EXCEPTION_BR = 69, /* bound-range */
+ VMEXIT_EXCEPTION_UD = 70, /* invalid-opcode*/
+ VMEXIT_EXCEPTION_NM = 71, /* device-not-available */
+ VMEXIT_EXCEPTION_DF = 72, /* double-fault */
+ VMEXIT_EXCEPTION_09 = 73, /* unsupported (reserved) */
+ VMEXIT_EXCEPTION_TS = 74, /* invalid-tss */
+ VMEXIT_EXCEPTION_NP = 75, /* segment-not-present */
+ VMEXIT_EXCEPTION_SS = 76, /* stack */
+ VMEXIT_EXCEPTION_GP = 77, /* general-protection */
+ VMEXIT_EXCEPTION_PF = 78, /* page-fault */
+ VMEXIT_EXCEPTION_15 = 79, /* reserved */
+ VMEXIT_EXCEPTION_MF = 80, /* x87 floating-point exception-pending */
+ VMEXIT_EXCEPTION_AC = 81, /* alignment-check */
+ VMEXIT_EXCEPTION_MC = 82, /* machine-check */
+ VMEXIT_EXCEPTION_XF = 83, /* simd floating-point */
+
+ /* exceptions 20-31 (exitcodes 84-95) are reserved */
+
+ /* ...and the rest of the #VMEXITs */
+ VMEXIT_INTR = 96,
+ VMEXIT_NMI = 97,
+ VMEXIT_SMI = 98,
+ VMEXIT_INIT = 99,
+ VMEXIT_VINTR = 100,
+ VMEXIT_CR0_SEL_WRITE = 101,
+ VMEXIT_IDTR_READ = 102,
+ VMEXIT_GDTR_READ = 103,
+ VMEXIT_LDTR_READ = 104,
+ VMEXIT_TR_READ = 105,
+ VMEXIT_IDTR_WRITE = 106,
+ VMEXIT_GDTR_WRITE = 107,
+ VMEXIT_LDTR_WRITE = 108,
+ VMEXIT_TR_WRITE = 109,
+ VMEXIT_RDTSC = 110,
+ VMEXIT_RDPMC = 111,
+ VMEXIT_PUSHF = 112,
+ VMEXIT_POPF = 113,
+ VMEXIT_CPUID = 114,
+ VMEXIT_RSM = 115,
+ VMEXIT_IRET = 116,
+ VMEXIT_SWINT = 117,
+ VMEXIT_INVD = 118,
+ VMEXIT_PAUSE = 119,
+ VMEXIT_HLT = 120,
+ VMEXIT_INVLPG = 121,
+ VMEXIT_INVLPGA = 122,
+ VMEXIT_IOIO = 123,
+ VMEXIT_MSR = 124,
+ VMEXIT_TASK_SWITCH = 125,
+ VMEXIT_FERR_FREEZE = 126,
+ VMEXIT_SHUTDOWN = 127,
+ VMEXIT_VMRUN = 128,
+ VMEXIT_VMMCALL = 129,
+ VMEXIT_VMLOAD = 130,
+ VMEXIT_VMSAVE = 131,
+ VMEXIT_STGI = 132,
+ VMEXIT_CLGI = 133,
+ VMEXIT_SKINIT = 134,
+ VMEXIT_RDTSCP = 135,
+ VMEXIT_ICEBP = 136,
+ VMEXIT_WBINVD = 137,
+ VMEXIT_MONITOR = 138,
+ VMEXIT_MWAIT = 139,
+ VMEXIT_MWAIT_CONDITIONAL= 140,
+ VMEXIT_NPF = 1024, /* nested paging fault */
+ VMEXIT_INVALID = -1
+};
+
+#define HVM_SVM_EXIT_REASON_MAX 1025
+char * hvm_svm_exit_reason_name[HVM_SVM_EXIT_REASON_MAX] = {
+ /* 0-15 */
+ "VMEXIT_CR0_READ",
+ "VMEXIT_CR1_READ",
+ "VMEXIT_CR2_READ",
+ "VMEXIT_CR3_READ",
+ "VMEXIT_CR4_READ",
+ "VMEXIT_CR5_READ",
+ "VMEXIT_CR6_READ",
+ "VMEXIT_CR7_READ",
+ "VMEXIT_CR8_READ",
+ "VMEXIT_CR9_READ",
+ "VMEXIT_CR10_READ",
+ "VMEXIT_CR11_READ",
+ "VMEXIT_CR12_READ",
+ "VMEXIT_CR13_READ",
+ "VMEXIT_CR14_READ",
+ "VMEXIT_CR15_READ",
+ /* 16-31 */
+ "VMEXIT_CR0_WRITE",
+ "VMEXIT_CR1_WRITE",
+ "VMEXIT_CR2_WRITE",
+ "VMEXIT_CR3_WRITE",
+ "VMEXIT_CR4_WRITE",
+ "VMEXIT_CR5_WRITE",
+ "VMEXIT_CR6_WRITE",
+ "VMEXIT_CR7_WRITE",
+ "VMEXIT_CR8_WRITE",
+ "VMEXIT_CR9_WRITE",
+ "VMEXIT_CR10_WRITE",
+ "VMEXIT_CR11_WRITE",
+ "VMEXIT_CR12_WRITE",
+ "VMEXIT_CR13_WRITE",
+ "VMEXIT_CR14_WRITE",
+ "VMEXIT_CR15_WRITE",
+ /* 32-47 */
+ "VMEXIT_DR0_READ",
+ "VMEXIT_DR1_READ",
+ "VMEXIT_DR2_READ",
+ "VMEXIT_DR3_READ",
+ "VMEXIT_DR4_READ",
+ "VMEXIT_DR5_READ",
+ "VMEXIT_DR6_READ",
+ "VMEXIT_DR7_READ",
+ "VMEXIT_DR8_READ",
+ "VMEXIT_DR9_READ",
+ "VMEXIT_DR10_READ",
+ "VMEXIT_DR11_READ",
+ "VMEXIT_DR12_READ",
+ "VMEXIT_DR13_READ",
+ "VMEXIT_DR14_READ",
+ "VMEXIT_DR15_READ",
+ /* 48-63 */
+ "VMEXIT_DR0_WRITE",
+ "VMEXIT_DR1_WRITE",
+ "VMEXIT_DR2_WRITE",
+ "VMEXIT_DR3_WRITE",
+ "VMEXIT_DR4_WRITE",
+ "VMEXIT_DR5_WRITE",
+ "VMEXIT_DR6_WRITE",
+ "VMEXIT_DR7_WRITE",
+ "VMEXIT_DR8_WRITE",
+ "VMEXIT_DR9_WRITE",
+ "VMEXIT_DR10_WRITE",
+ "VMEXIT_DR11_WRITE",
+ "VMEXIT_DR12_WRITE",
+ "VMEXIT_DR13_WRITE",
+ "VMEXIT_DR14_WRITE",
+ "VMEXIT_DR15_WRITE",
+ /* 64-83 */
+ "VMEXIT_EXCEPTION_DE",
+ "VMEXIT_EXCEPTION_DB",
+ "VMEXIT_EXCEPTION_NMI",
+ "VMEXIT_EXCEPTION_BP",
+ "VMEXIT_EXCEPTION_OF",
+ "VMEXIT_EXCEPTION_BR",
+ "VMEXIT_EXCEPTION_UD",
+ "VMEXIT_EXCEPTION_NM",
+ "VMEXIT_EXCEPTION_DF",
+ "VMEXIT_EXCEPTION_09",
+ "VMEXIT_EXCEPTION_TS",
+ "VMEXIT_EXCEPTION_NP",
+ "VMEXIT_EXCEPTION_SS",
+ "VMEXIT_EXCEPTION_GP",
+ "VMEXIT_EXCEPTION_PF",
+ "VMEXIT_EXCEPTION_15",
+ "VMEXIT_EXCEPTION_MF",
+ "VMEXIT_EXCEPTION_AC",
+ "VMEXIT_EXCEPTION_MC",
+ "VMEXIT_EXCEPTION_XF",
+ /* 84-95 */
+ "VMEXIT_EXCEPTION_20",
+ "VMEXIT_EXCEPTION_21",
+ "VMEXIT_EXCEPTION_22",
+ "VMEXIT_EXCEPTION_23",
+ "VMEXIT_EXCEPTION_24",
+ "VMEXIT_EXCEPTION_25",
+ "VMEXIT_EXCEPTION_26",
+ "VMEXIT_EXCEPTION_27",
+ "VMEXIT_EXCEPTION_28",
+ "VMEXIT_EXCEPTION_29",
+ "VMEXIT_EXCEPTION_30",
+ "VMEXIT_EXCEPTION_31",
+ /* 96-99 */
+ "VMEXIT_INTR",
+ "VMEXIT_NMI",
+ "VMEXIT_SMI",
+ "VMEXIT_INIT",
+ /* 100-109 */
+ "VMEXIT_VINTR",
+ "VMEXIT_CR0_SEL_WRITE",
+ "VMEXIT_IDTR_READ",
+ "VMEXIT_GDTR_READ",
+ "VMEXIT_LDTR_READ",
+ "VMEXIT_TR_READ",
+ "VMEXIT_IDTR_WRITE",
+ "VMEXIT_GDTR_WRITE",
+ "VMEXIT_LDTR_WRITE",
+ "VMEXIT_TR_WRITE",
+ /* 110-119 */
+ "VMEXIT_RDTSC",
+ "VMEXIT_RDPMC",
+ "VMEXIT_PUSHF",
+ "VMEXIT_POPF",
+ "VMEXIT_CPUID",
+ "VMEXIT_RSM",
+ "VMEXIT_IRET",
+ "VMEXIT_SWINT",
+ "VMEXIT_INVD",
+ "VMEXIT_PAUSE",
+ /* 120-129 */
+ "VMEXIT_HLT",
+ "VMEXIT_INVLPG",
+ "VMEXIT_INVLPGA",
+ "VMEXIT_IOIO",
+ "VMEXIT_MSR",
+ "VMEXIT_TASK_SWITCH",
+ "VMEXIT_FERR_FREEZE",
+ "VMEXIT_SHUTDOWN",
+ "VMEXIT_VMRUN",
+ "VMEXIT_VMMCALL",
+ /* 130-139 */
+ "VMEXIT_VMLOAD",
+ "VMEXIT_VMSAVE",
+ "VMEXIT_STGI",
+ "VMEXIT_CLGI",
+ "VMEXIT_SKINIT",
+ "VMEXIT_RDTSCP",
+ "VMEXIT_ICEBP",
+ "VMEXIT_WBINVD",
+ "VMEXIT_MONITOR",
+ "VMEXIT_MWAIT",
+ /* 140 */
+ "VMEXIT_MWAIT_CONDITIONAL",
+ [VMEXIT_NPF] = "VMEXIT_NPF", /* nested paging fault */
+};
+
+
+#if ( HVM_VMX_EXIT_REASON_MAX > HVM_SVM_EXIT_REASON_MAX )
+# define HVM_EXIT_REASON_MAX HVM_VMX_EXIT_REASON_MAX
+# error - Strange!
+#else
+# define HVM_EXIT_REASON_MAX HVM_SVM_EXIT_REASON_MAX
+#endif
+
+/* General hvm information */
+#define SPURIOUS_APIC_VECTOR 0xff
+#define ERROR_APIC_VECTOR 0xfe
+#define INVALIDATE_TLB_VECTOR 0xfd
+#define EVENT_CHECK_VECTOR 0xfc
+#define CALL_FUNCTION_VECTOR 0xfb
+#define THERMAL_APIC_VECTOR 0xfa
+#define LOCAL_TIMER_VECTOR 0xf9
+
+#define EXTERNAL_INTERRUPT_MAX 256
+
+/* Stringify numbers */
+char * hvm_extint_vector_name[EXTERNAL_INTERRUPT_MAX] = {
+ [SPURIOUS_APIC_VECTOR] = "SPURIOUS_APIC",
+ [ERROR_APIC_VECTOR] = "ERROR_APIC",
+ [INVALIDATE_TLB_VECTOR]= "INVALIDATE_TLB",
+ [EVENT_CHECK_VECTOR]= "EVENT_CHECK",
+ [CALL_FUNCTION_VECTOR]= "CALL_FUNCTION",
+ [THERMAL_APIC_VECTOR]= "THERMAL_APIC",
+ [LOCAL_TIMER_VECTOR] = "LOCAL_TIMER",
+};
+
+#define HVM_TRAP_MAX 20
+
+char * hvm_trap_name[HVM_TRAP_MAX] = {
+ [0] = "Divide",
+ [1] = "RESERVED",
+ [2] = "NMI",
+ [3] = "Breakpoint",
+ [4] = "Overflow",
+ [5] = "BOUND",
+ [6] = "Invalid Op",
+ [7] = "Coprocessor not present",
+ [8] = "Double Fault",
+ [9] = "Coprocessor segment overrun",
+ [10] = "TSS",
+ [11] = "Segment not present",
+ [12] = "Stack-segment fault",
+ [13] = "GP",
+ [14] = "Page fault",
+ [15] = "RESERVED",
+ [16] = "FPU",
+ [17] = "Alignment check",
+ [18] = "Machine check",
+ [19] = "SIMD",
+};
+
+
+enum {
+ HVM_EVENT_HANDLER_NONE = 0,
+ HVM_EVENT_HANDLER_PF_XEN = 1,
+ HVM_EVENT_HANDLER_PF_INJECT,
+ HVM_EVENT_HANDLER_INJ_EXC,
+ HVM_EVENT_HANDLER_INJ_VIRQ,
+ HVM_EVENT_HANDLER_REINJ_VIRQ,
+ HVM_EVENT_HANDLER_IO_READ,
+ HVM_EVENT_HANDLER_IO_WRITE,
+ HVM_EVENT_HANDLER_CR_READ, /* 8 */
+ HVM_EVENT_HANDLER_CR_WRITE,
+ HVM_EVENT_HANDLER_DR_READ,
+ HVM_EVENT_HANDLER_DR_WRITE,
+ HVM_EVENT_HANDLER_MSR_READ,
+ HVM_EVENT_HANDLER_MSR_WRITE,
+ HVM_EVENT_HANDLER_CPUID,
+ HVM_EVENT_HANDLER_INTR,
+ HVM_EVENT_HANDLER_NMI, /* 16 */
+ HVM_EVENT_HANDLER_SMI,
+ HVM_EVENT_HANDLER_VMCALL,
+ HVM_EVENT_HANDLER_HLT,
+ HVM_EVENT_HANDLER_INVLPG,
+ HVM_EVENT_HANDLER_MCE,
+ HVM_EVENT_HANDLER_IO_ASSIST,
+ HVM_EVENT_HANDLER_MMIO_ASSIST,
+ HVM_EVENT_HANDLER_CLTS,
+ HVM_EVENT_HANDLER_LMSW,
+ HVM_EVENT_RDTSC,
+ HVM_EVENT_INTR_WINDOW=0x20, /* Oops... skipped 0x1b-1f */
+ HVM_EVENT_NPF,
+ HVM_EVENT_REALMODE_EMULATE,
+ HVM_EVENT_TRAP,
+ HVM_EVENT_TRAP_DEBUG,
+ HVM_EVENT_VLAPIC,
+ HVM_EVENT_HANDLER_MAX
+};
+char * hvm_event_handler_name[HVM_EVENT_HANDLER_MAX] = {
+ "(no handler)",
+ "pf_xen",
+ "pf_inject",
+ "inj_exc",
+ "inj_virq",
+ "reinj_virq",
+ "io_read",
+ "io_write",
+ "cr_read", /* 8 */
+ "cr_write",
+ "dr_read",
+ "dr_write",
+ "msr_read",
+ "msr_write",
+ "cpuid",
+ "intr",
+ "nmi", /* 16 */
+ "smi",
+ "vmcall",
+ "hlt",
+ "invlpg",
+ "mce",
+ "io_assist",
+ "mmio_assist",
+ "clts", /* 24 */
+ "lmsw",
+ "rdtsc",
+ [HVM_EVENT_INTR_WINDOW]="intr_window",
+ "npf",
+ "realmode_emulate",
+ "trap",
+ "trap_debug",
+ "vlapic"
+};
+
+enum {
+ HVM_VOL_VMENTRY,
+ HVM_VOL_VMEXIT,
+ HVM_VOL_HANDLER,
+ HVM_VOL_MAX
+};
+
+enum {
+ GUEST_INTERRUPT_CASE_NONE,
+ /* This interrupt woke, no other interrupts until halt */
+ GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ALONE,
+ /* This interrupt woke, maybe another interrupt before halt */
+ GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ANY,
+ /* Time from interrupt (running) to halt */
+ GUEST_INTERRUPT_CASE_INTERRUPT_TO_HALT,
+ GUEST_INTERRUPT_CASE_MAX,
+};
+
+char *guest_interrupt_case_name[] = {
+ [GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ALONE]="wake to halt alone",
+ /* This interrupt woke, maybe another interrupt before halt */
+ [GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ANY] ="wake to halt any ",
+ /* Time from interrupt (running) to halt */
+ [GUEST_INTERRUPT_CASE_INTERRUPT_TO_HALT] ="intr to halt ",
+};
+
+char *hvm_vol_name[HVM_VOL_MAX] = {
+ [HVM_VOL_VMENTRY]="vmentry",
+ [HVM_VOL_VMEXIT] ="vmexit",
+ [HVM_VOL_HANDLER]="handler",
+};
+
+enum {
+ HYPERCALL_set_trap_table = 0,
+ HYPERCALL_mmu_update,
+ HYPERCALL_set_gdt,
+ HYPERCALL_stack_switch,
+ HYPERCALL_set_callbacks,
+ HYPERCALL_fpu_taskswitch,
+ HYPERCALL_sched_op_compat,
+ HYPERCALL_platform_op,
+ HYPERCALL_set_debugreg,
+ HYPERCALL_get_debugreg,
+ HYPERCALL_update_descriptor,
+ HYPERCALL_memory_op=12,
+ HYPERCALL_multicall,
+ HYPERCALL_update_va_mapping,
+ HYPERCALL_set_timer_op,
+ HYPERCALL_event_channel_op_compat,
+ HYPERCALL_xen_version,
+ HYPERCALL_console_io,
+ HYPERCALL_physdev_op_compat,
+ HYPERCALL_grant_table_op,
+ HYPERCALL_vm_assist,
+ HYPERCALL_update_va_mapping_otherdomain,
+ HYPERCALL_iret,
+ HYPERCALL_vcpu_op,
+ HYPERCALL_set_segment_base,
+ HYPERCALL_mmuext_op,
+ HYPERCALL_acm_op,
+ HYPERCALL_nmi_op,
+ HYPERCALL_sched_op,
+ HYPERCALL_callback_op,
+ HYPERCALL_xenoprof_op,
+ HYPERCALL_event_channel_op,
+ HYPERCALL_physdev_op,
+ HYPERCALL_hvm_op,
+ HYPERCALL_sysctl,
+ HYPERCALL_domctl,
+ HYPERCALL_kexec_op,
+ HYPERCALL_MAX
+};
+
+char *hypercall_name[HYPERCALL_MAX] = {
+ [HYPERCALL_set_trap_table]="set_trap_table",
+ [HYPERCALL_mmu_update]="mmu_update",
+ [HYPERCALL_set_gdt]="set_gdt",
+ [HYPERCALL_stack_switch]="stack_switch",
+ [HYPERCALL_set_callbacks]="set_callbacks",
+ [HYPERCALL_fpu_taskswitch]="fpu_taskswitch",
+ [HYPERCALL_sched_op_compat]="sched_op(compat)",
+ [HYPERCALL_platform_op]="platform_op",
+ [HYPERCALL_set_debugreg]="set_debugreg",
+ [HYPERCALL_get_debugreg]="get_debugreg",
+ [HYPERCALL_update_descriptor]="update_descriptor",
+ [HYPERCALL_memory_op]="memory_op",
+ [HYPERCALL_multicall]="multicall",
+ [HYPERCALL_update_va_mapping]="update_va_mapping",
+ [HYPERCALL_set_timer_op]="set_timer_op",
+ [HYPERCALL_event_channel_op_compat]="evtchn_op(compat)",
+ [HYPERCALL_xen_version]="xen_version",
+ [HYPERCALL_console_io]="console_io",
+ [HYPERCALL_physdev_op_compat]="physdev_op(compat)",
+ [HYPERCALL_grant_table_op]="grant_table_op",
+ [HYPERCALL_vm_assist]="vm_assist",
+ [HYPERCALL_update_va_mapping_otherdomain]="update_va_mapping_otherdomain",
+ [HYPERCALL_iret]="iret",
+ [HYPERCALL_vcpu_op]="vcpu_op",
+ [HYPERCALL_set_segment_base]="set_segment_base",
+ [HYPERCALL_mmuext_op]="mmuext_op",
+ [HYPERCALL_acm_op]="acm_op",
+ [HYPERCALL_nmi_op]="nmi_op",
+ [HYPERCALL_sched_op]="sched_op",
+ [HYPERCALL_callback_op]="callback_op",
+ [HYPERCALL_xenoprof_op]="xenoprof_op",
+ [HYPERCALL_event_channel_op]="evtchn_op",
+ [HYPERCALL_physdev_op]="physdev_op",
+ [HYPERCALL_hvm_op]="hvm_op",
+ [HYPERCALL_sysctl]="sysctl",
+ [HYPERCALL_domctl]="domctl",
+ [HYPERCALL_kexec_op]="kexec_op"
+};
+
+enum {
+ PF_XEN_EMUL_LVL_0,
+ PF_XEN_EMUL_LVL_1,
+ PF_XEN_EMUL_LVL_2,
+ PF_XEN_EMUL_LVL_3,
+ PF_XEN_EMUL_LVL_4,
+ PF_XEN_EMUL_EARLY_UNSHADOW,
+ PF_XEN_EMUL_SET_CHANGED,
+ PF_XEN_EMUL_SET_UNCHANGED,
+ PF_XEN_EMUL_SET_FLUSH,
+ PF_XEN_EMUL_SET_ERROR,
+ PF_XEN_EMUL_PROMOTE,
+ PF_XEN_EMUL_DEMOTE,
+ PF_XEN_EMUL_PREALLOC_UNPIN,
+ PF_XEN_EMUL_PREALLOC_UNHOOK,
+ PF_XEN_EMUL_MAX,
+};
+
+char * pf_xen_emul_name[PF_XEN_EMUL_MAX] = {
+ [PF_XEN_EMUL_LVL_0]="non-linmap",
+ [PF_XEN_EMUL_LVL_1]="linmap l1",
+ [PF_XEN_EMUL_LVL_2]="linmap l2",
+ [PF_XEN_EMUL_LVL_3]="linmap l3",
+ [PF_XEN_EMUL_LVL_4]="linmap l4",
+ [PF_XEN_EMUL_EARLY_UNSHADOW]="early unshadow",
+ [PF_XEN_EMUL_SET_UNCHANGED]="set unchanged",
+ [PF_XEN_EMUL_SET_CHANGED]="set changed",
+ [PF_XEN_EMUL_SET_FLUSH]="set flush",
+ [PF_XEN_EMUL_SET_ERROR]="set error",
+ [PF_XEN_EMUL_PROMOTE]="promote",
+ [PF_XEN_EMUL_DEMOTE]="demote",
+ [PF_XEN_EMUL_PREALLOC_UNPIN]="unpin",
+ [PF_XEN_EMUL_PREALLOC_UNHOOK]="unhook",
+};
+
+/* Rio only */
+enum {
+ PF_XEN_NON_EMUL_VA_USER,
+ PF_XEN_NON_EMUL_VA_KERNEL,
+ PF_XEN_NON_EMUL_EIP_USER,
+ PF_XEN_NON_EMUL_EIP_KERNEL,
+ PF_XEN_NON_EMUL_MAX,
+};
+
+char * pf_xen_non_emul_name[PF_XEN_NON_EMUL_MAX] = {
+ [PF_XEN_NON_EMUL_VA_USER]="va user",
+ [PF_XEN_NON_EMUL_VA_KERNEL]="va kernel",
+ [PF_XEN_NON_EMUL_EIP_USER]="eip user",
+ [PF_XEN_NON_EMUL_EIP_KERNEL]="eip kernel",
+};
+
+enum {
+ PF_XEN_FIXUP_PREALLOC_UNPIN,
+ PF_XEN_FIXUP_PREALLOC_UNHOOK,
+ PF_XEN_FIXUP_UNSYNC,
+ PF_XEN_FIXUP_OOS_ADD,
+ PF_XEN_FIXUP_OOS_EVICT,
+ PF_XEN_FIXUP_PROMOTE,
+ PF_XEN_FIXUP_UPDATE_ONLY,
+ PF_XEN_FIXUP_WRMAP,
+ PF_XEN_FIXUP_BRUTE_FORCE,
+ PF_XEN_FIXUP_MAX,
+};
+
+char * pf_xen_fixup_name[PF_XEN_FIXUP_MAX] = {
+ [PF_XEN_FIXUP_PREALLOC_UNPIN] = "unpin",
+ [PF_XEN_FIXUP_PREALLOC_UNHOOK] = "unhook",
+ [PF_XEN_FIXUP_UNSYNC] = "unsync",
+ [PF_XEN_FIXUP_OOS_ADD] = "oos-add",
+ [PF_XEN_FIXUP_OOS_EVICT] = "oos-evict",
+ [PF_XEN_FIXUP_PROMOTE] = "promote",
+ [PF_XEN_FIXUP_UPDATE_ONLY] = "update",
+ [PF_XEN_FIXUP_WRMAP] = "wrmap",
+ [PF_XEN_FIXUP_BRUTE_FORCE] = "wrmap-bf",
+};
+
+enum {
+ PF_XEN_NOT_SHADOW = 1,
+ PF_XEN_FAST_PROPAGATE,
+ PF_XEN_FAST_MMIO,
+ PF_XEN_FALSE_FAST_PATH,
+ PF_XEN_MMIO,
+ PF_XEN_FIXUP,
+ PF_XEN_DOMF_DYING,
+ PF_XEN_EMULATE,
+ PF_XEN_EMULATE_UNSHADOW_USER,
+ PF_XEN_EMULATE_UNSHADOW_EVTINJ,
+ PF_XEN_EMULATE_UNSHADOW_UNHANDLED,
+ PF_XEN_LAST_FAULT=PF_XEN_EMULATE_UNSHADOW_UNHANDLED,
+ PF_XEN_NON_EMULATE,
+ PF_XEN_NO_HANDLER,
+ PF_XEN_MAX,
+};
+
+#define SHADOW_WRMAP_BF 12
+#define SHADOW_PREALLOC_UNPIN 13
+#define SHADOW_RESYNC_FULL 14
+#define SHADOW_RESYNC_ONLY 15
+
+char * pf_xen_name[PF_XEN_MAX] = {
+ [PF_XEN_NOT_SHADOW]="propagate",
+ [PF_XEN_FAST_PROPAGATE]="fast propagate",
+ [PF_XEN_FAST_MMIO]="fast mmio",
+ [PF_XEN_FALSE_FAST_PATH]="false fast path",
+ [PF_XEN_MMIO]="mmio",
+ [PF_XEN_FIXUP]="fixup",
+ [PF_XEN_DOMF_DYING]="dom dying",
+ [PF_XEN_EMULATE]="emulate",
+ [PF_XEN_EMULATE_UNSHADOW_USER]="unshadow:user-mode",
+ [PF_XEN_EMULATE_UNSHADOW_EVTINJ]="unshadow:evt inj",
+ [PF_XEN_EMULATE_UNSHADOW_UNHANDLED]="unshadow:unhandled instr",
+ [PF_XEN_NON_EMULATE]="fixup|mmio",
+ [PF_XEN_NO_HANDLER]="(no handler)",
+};
+
+#define CORR_VA_INVALID (0ULL-1)
+
+enum {
+ NONPF_MMIO_APIC,
+ NONPF_MMIO_NPF,
+ NONPF_MMIO_UNKNOWN,
+ NONPF_MMIO_MAX
+};
+
+struct mmio_info {
+ unsigned long long gpa;
+ unsigned long long va; /* Filled only by shadow */
+ unsigned data;
+ unsigned data_valid:1, is_write:1;
+};
+
+struct pf_xen_extra {
+ unsigned long long va;
+ union {
+ unsigned flags;
+ struct {
+ unsigned flag_set_ad:1,
+ flag_set_a:1,
+ flag_shadow_l1_get_ref:1,
+ flag_shadow_l1_put_ref:1,
+ flag_l2_propagate:1,
+ flag_set_changed:1,
+ flag_set_flush:1,
+ flag_set_error:1,
+ flag_demote:1,
+ flag_promote:1,
+ flag_wrmap:1,
+ flag_wrmap_guess_found:1,
+ flag_wrmap_brute_force:1,
+ flag_early_unshadow:1,
+ flag_emulation_2nd_pt_written:1,
+ flag_emulation_last_failed:1,
+ flag_emulate_full_pt:1,
+ flag_prealloc_unhook:1,
+ flag_unsync:1,
+ flag_oos_fixup_add:1,
+ flag_oos_fixup_evict:1;
+ };
+ }; /* Miami + ; fixup & emulate */
+ unsigned int error_code; /* Rio only */
+
+ /* Calculated */
+ int pf_case; /* Rio */
+
+ /* MMIO only */
+ unsigned long long gpa;
+ unsigned int data;
+
+ /* Emulate only */
+ unsigned long long gl1e; /* Miami + */
+ unsigned long long wval; /* Miami */
+ unsigned long long corresponding_va;
+ unsigned int pt_index[5], pt_is_lo;
+ int pt_level;
+
+ /* Other */
+ unsigned long long gfn;
+
+ /* Flags */
+ unsigned corr_valid:1,
+ corr_is_kernel:1,
+ va_is_kernel:1;
+};
+
+struct pcpu_info;
+
+#define GUEST_INTERRUPT_MAX 350
+#define FAKE_VECTOR 349
+#define CR_MAX 9
+#define RESYNCS_MAX 17
+#define PF_XEN_FIXUP_UNSYNC_RESYNC_MAX 2
+
+struct hvm_data;
+
+struct hvm_summary_handler_node {
+ void (*handler)(struct hvm_data *, void* data);
+ void *data;
+ struct hvm_summary_handler_node *next;
+};
+
+struct hvm_data {
+ /* Summary information */
+ int init;
+ int vmexit_valid;
+ int summary_info;
+ struct vcpu_data *v; /* up-pointer */
+
+ /* SVM / VMX compatibility. FIXME - should be global */
+ char ** exit_reason_name;
+ int exit_reason_max;
+ struct hvm_summary_handler_node *exit_reason_summary_handler_list[HVM_EXIT_REASON_MAX];
+
+ /* Information about particular exit reasons */
+ struct {
+ struct event_cycle_summary exit_reason[HVM_EXIT_REASON_MAX];
+ int extint[EXTERNAL_INTERRUPT_MAX+1];
+ int *extint_histogram;
+ struct event_cycle_summary trap[HVM_TRAP_MAX];
+ struct event_cycle_summary pf_xen[PF_XEN_MAX];
+ struct event_cycle_summary pf_xen_emul[PF_XEN_EMUL_MAX];
+ struct event_cycle_summary pf_xen_emul_early_unshadow[5];
+ struct event_cycle_summary pf_xen_non_emul[PF_XEN_NON_EMUL_MAX];
+ struct event_cycle_summary pf_xen_fixup[PF_XEN_FIXUP_MAX];
+ struct event_cycle_summary pf_xen_fixup_unsync_resync[PF_XEN_FIXUP_UNSYNC_RESYNC_MAX+1];
+ struct event_cycle_summary cr_write[CR_MAX];
+ struct event_cycle_summary cr3_write_resyncs[RESYNCS_MAX+1];
+ struct event_cycle_summary vmcall[HYPERCALL_MAX+1];
+ struct event_cycle_summary generic[HVM_EVENT_HANDLER_MAX];
+ struct event_cycle_summary mmio[NONPF_MMIO_MAX];
+ struct hvm_gi_struct {
+ int count;
+ struct cycle_summary runtime[GUEST_INTERRUPT_CASE_MAX];
+ /* OK, not summary info, but still... */
+ int is_wake;
+ tsc_t start_tsc;
+ } guest_interrupt[GUEST_INTERRUPT_MAX + 1];
+ /* IPI Latency */
+ struct event_cycle_summary ipi_latency;
+ int ipi_count[256];
+ struct {
+ struct io_address *mmio, *pio;
+ } io;
+ } summary;
+
+ /* In-flight accumulation information */
+ struct {
+ union {
+ struct {
+ unsigned port:31,
+ is_write:1;
+ unsigned int val;
+ } io;
+ struct pf_xen_extra pf_xen;
+ struct {
+ unsigned cr;
+ unsigned long long val;
+ int repromote;
+ } cr_write;
+ struct {
+ unsigned addr;
+ unsigned long long val;
+ } msr;
+ struct {
+ unsigned int event;
+ uint32_t d[4];
+ } generic;
+ struct {
+ unsigned eax;
+ } vmcall;
+ struct {
+ unsigned vec;
+ } intr;
+ };
+ /* MMIO gets its separate area, since many exits may use it */
+ struct mmio_info mmio;
+ }inflight;
+ int resyncs;
+ void (*post_process)(struct hvm_data *);
+ tsc_t exit_tsc, arc_cycles, entry_tsc;
+ unsigned long long rip;
+ unsigned exit_reason, event_handler;
+ int short_summary_done:1, prealloc_unpin:1, wrmap_bf:1;
+
+ /* Immediate processing */
+ void *d;
+
+ /* Wake-to-halt detection. See comment above. */
+ struct {
+ unsigned waking:1;
+ /* Wake vector: keep track of time from vmentry until:
+ next halt, or next interrupt */
+ int vector, interrupts, interrupts_wanting_tsc;
+ } w2h;
+
+ /* Historical info */
+ tsc_t last_rdtsc;
+};
+
+enum {
+ HVM_SHORT_SUMMARY_EMULATE,
+ HVM_SHORT_SUMMARY_UNSYNC,
+ HVM_SHORT_SUMMARY_FIXUP,
+ HVM_SHORT_SUMMARY_MMIO,
+ HVM_SHORT_SUMMARY_PROPAGATE,
+ HVM_SHORT_SUMMARY_CR3,
+ HVM_SHORT_SUMMARY_VMCALL,
+ HVM_SHORT_SUMMARY_INTERRUPT,
+ HVM_SHORT_SUMMARY_HLT,
+ HVM_SHORT_SUMMARY_OTHER,
+ HVM_SHORT_SUMMARY_MAX,
+};
+
+char *hvm_short_summary_name[HVM_SHORT_SUMMARY_MAX] = {
+ [HVM_SHORT_SUMMARY_EMULATE] ="emulate",
+ [HVM_SHORT_SUMMARY_UNSYNC] ="unsync",
+ [HVM_SHORT_SUMMARY_FIXUP] ="fixup",
+ [HVM_SHORT_SUMMARY_MMIO] ="mmio",
+ [HVM_SHORT_SUMMARY_PROPAGATE]="propagate",
+ [HVM_SHORT_SUMMARY_CR3] ="cr3",
+ [HVM_SHORT_SUMMARY_VMCALL] ="vmcall",
+ [HVM_SHORT_SUMMARY_INTERRUPT]="intr",
+ [HVM_SHORT_SUMMARY_HLT] ="hlt",
+ [HVM_SHORT_SUMMARY_OTHER] ="other",
+};
+
+struct hvm_short_summary_struct {
+ struct cycle_summary s[HVM_SHORT_SUMMARY_MAX];
+};
+
+void init_hvm_data(struct hvm_data *h, struct vcpu_data *v) {
+ int i;
+
+ if(h->init)
+ return;
+
+ h->v = v;
+
+ h->init = 1;
+
+ if(opt.svm_mode) {
+ h->exit_reason_max = HVM_SVM_EXIT_REASON_MAX;
+ h->exit_reason_name = hvm_svm_exit_reason_name;
+ } else {
+ h->exit_reason_max = HVM_VMX_EXIT_REASON_MAX;
+ h->exit_reason_name = hvm_vmx_exit_reason_name;
+ }
+
+ if(opt.histogram_interrupt_eip) {
+ int count = ((1ULL<<ADDR_SPACE_BITS)/opt.histogram_interrupt_increment);
+ size_t size = count * sizeof(int);
+ h->summary.extint_histogram = malloc(size);
+ if(h->summary.extint_histogram)
+ bzero(h->summary.extint_histogram, size);
+ else {
+ fprintf(stderr, "FATAL: Could not allocate %zd bytes for interrupt histogram!\n",
+ size);
+ error(ERR_SYSTEM, NULL);
+ }
+
+ }
+ for(i=0; i<GUEST_INTERRUPT_MAX+1; i++)
+ h->summary.guest_interrupt[i].count=0;
+}
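+
+/* Sizing note (editor's addition): with ADDR_SPACE_BITS = 48 the
+ * histogram has (1ULL<<48)/opt.histogram_interrupt_increment buckets of
+ * sizeof(int) each; e.g. an increment of 0x1000000 (16 MiB of address
+ * space per bucket) gives 2^24 buckets = 64 MiB of memory, which is why
+ * allocation failure is treated as fatal above. */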
+
+/* PV data */
+enum {
+ PV_HYPERCALL=1,
+ PV_TRAP=3,
+ PV_PAGE_FAULT,
+ PV_FORCED_INVALID_OP,
+ PV_EMULATE_PRIVOP,
+ PV_EMULATE_4GB,
+ PV_MATH_STATE_RESTORE,
+ PV_PAGING_FIXUP,
+ PV_GDT_LDT_MAPPING_FAULT,
+ PV_PTWR_EMULATION,
+ PV_PTWR_EMULATION_PAE,
+ PV_HYPERCALL_V2 = 13,
+ PV_HYPERCALL_SUBCALL = 14,
+ PV_MAX
+};
+
+char *pv_name[PV_MAX] = {
+ [PV_HYPERCALL]="hypercall",
+ [PV_TRAP]="trap",
+ [PV_PAGE_FAULT]="page_fault",
+ [PV_FORCED_INVALID_OP]="forced_invalid_op",
+ [PV_EMULATE_PRIVOP]="emulate privop",
+ [PV_EMULATE_4GB]="emulate 4g",
+ [PV_MATH_STATE_RESTORE]="math state restore",
+ [PV_PAGING_FIXUP]="paging fixup",
+ [PV_GDT_LDT_MAPPING_FAULT]="gdt/ldt mapping fault",
+ [PV_PTWR_EMULATION]="ptwr",
+ [PV_PTWR_EMULATION_PAE]="ptwr(pae)",
+ [PV_HYPERCALL_V2]="hypercall",
+ [PV_HYPERCALL_SUBCALL]="hypercall (subcall)",
+};
+
+#define PV_HYPERCALL_MAX 56
+#define PV_TRAP_MAX 20
+
+struct pv_data {
+ unsigned summary_info:1;
+ int count[PV_MAX];
+ int hypercall_count[PV_HYPERCALL_MAX];
+ int trap_count[PV_TRAP_MAX];
+};
+
+/* Sched data */
+
+enum {
+ SCHED_DOM_ADD=1,
+ SCHED_DOM_REM,
+ SCHED_SLEEP,
+ SCHED_WAKE,
+ SCHED_YIELD,
+ SCHED_BLOCK,
+ SCHED_SHUTDOWN,
+ SCHED_CTL,
+ SCHED_ADJDOM,
+ SCHED_SWITCH,
+ SCHED_S_TIMER_FN,
+ SCHED_T_TIMER_FN,
+ SCHED_DOM_TIMER_FN,
+ SCHED_SWITCH_INFPREV,
+ SCHED_SWITCH_INFNEXT,
+ SCHED_SHUTDOWN_CODE,
+ SCHED_MAX
+};
+
+enum {
+ RUNSTATE_RUNNING=0,
+ RUNSTATE_RUNNABLE,
+ RUNSTATE_BLOCKED,
+ RUNSTATE_OFFLINE,
+ RUNSTATE_LOST,
+ RUNSTATE_QUEUED,
+ RUNSTATE_INIT,
+ RUNSTATE_MAX
+};
+
+int runstate_graph[RUNSTATE_MAX] =
+{
+ [RUNSTATE_BLOCKED]=0,
+ [RUNSTATE_OFFLINE]=1,
+ [RUNSTATE_RUNNABLE]=2,
+ [RUNSTATE_RUNNING]=3,
+ [RUNSTATE_LOST]=-1,
+ [RUNSTATE_QUEUED]=-2,
+ [RUNSTATE_INIT]=-2,
+};
+
+char * runstate_name[RUNSTATE_MAX]={
+ [RUNSTATE_RUNNING]= "running",
+ [RUNSTATE_RUNNABLE]="runnable",
+ [RUNSTATE_BLOCKED]= "blocked", /* to be blocked */
+ [RUNSTATE_OFFLINE]= "offline",
+ [RUNSTATE_QUEUED]= "queued",
+ [RUNSTATE_INIT]= "init",
+ [RUNSTATE_LOST]= "lost",
+};
+
+enum {
+ RUNNABLE_STATE_INVALID,
+ RUNNABLE_STATE_WAKE,
+ RUNNABLE_STATE_PREEMPT,
+ RUNNABLE_STATE_OTHER,
+ RUNNABLE_STATE_MAX
+};
+
+char * runnable_state_name[RUNNABLE_STATE_MAX]={
+ [RUNNABLE_STATE_INVALID]="invalid", /* Should never show up */
+ [RUNNABLE_STATE_WAKE]="wake",
+ [RUNNABLE_STATE_PREEMPT]="preempt",
+ [RUNNABLE_STATE_OTHER]="other",
+};
+
+/* Memory data */
+enum {
+ MEM_PAGE_GRANT_MAP = 1,
+ MEM_PAGE_GRANT_UNMAP,
+ MEM_PAGE_GRANT_TRANSFER,
+ MEM_SET_P2M_ENTRY,
+ MEM_DECREASE_RESERVATION,
+ MEM_POD_POPULATE = 16,
+ MEM_POD_ZERO_RECLAIM,
+ MEM_POD_SUPERPAGE_SPLINTER,
+ MEM_MAX
+};
+
+char *mem_name[MEM_MAX] = {
+ [MEM_PAGE_GRANT_MAP] = "grant-map",
+ [MEM_PAGE_GRANT_UNMAP] = "grant-unmap",
+ [MEM_PAGE_GRANT_TRANSFER] = "grant-transfer",
+ [MEM_SET_P2M_ENTRY] = "set-p2m",
+ [MEM_DECREASE_RESERVATION] = "decrease-reservation",
+ [MEM_POD_POPULATE] = "pod-populate",
+ [MEM_POD_ZERO_RECLAIM] = "pod-zero-reclaim",
+ [MEM_POD_SUPERPAGE_SPLINTER] = "pod-superpage-splinter",
+};
+
+/* Per-unit information. */
+
+struct cr3_value_struct {
+ struct cr3_value_struct * next;
+ struct cr3_value_struct * gnext;
+ unsigned long long gmfn;
+ int cr3_id;
+ unsigned long long first_time, last_time, run_time;
+ struct cycle_summary total_time, guest_time, hv_time;
+ int switch_count, flush_count;
+
+ struct hvm_short_summary_struct hvm;
+
+ struct {
+ int now;
+ int count;
+ } prealloc_unpin;
+
+ struct {
+ unsigned callback:1;
+ unsigned flush_count, switch_count;
+ unsigned fixup_user, emulate_corr_user;
+ } destroy;
+};
+
+#ifndef MAX_CPUS
+#define MAX_CPUS 256
+#endif
+typedef uint32_t cpu_mask_t;
+
+#define IDLE_DOMAIN 32767
+#define DEFAULT_DOMAIN 32768
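+/* 32767 (0x7fff) is Xen's idle-domain id; DEFAULT_DOMAIN appears to be
+ * a synthetic id for records not yet attributable to a real domain. */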
+
+#define MAX_VLAPIC_LIST 8
+struct vlapic_struct {
+ struct {
+ struct outstanding_ipi {
+ tsc_t first_tsc;
+ int vec, count;
+ int injected, valid;
+ } list[MAX_VLAPIC_LIST];
+ } outstanding;
+};
+
+struct vcpu_data {
+ int vid;
+ struct domain_data *d; /* up-pointer */
+ unsigned activated:1;
+
+ int guest_paging_levels;
+
+ /* Schedule info */
+ struct {
+ int state;
+ int runnable_state; /* Only valid when state==RUNSTATE_RUNNABLE */
+ tsc_t tsc;
+ /* TSC skew detection/correction */
+ struct last_oldstate_struct {
+ int wrong, actual, pid;
+ tsc_t tsc;
+ } last_oldstate;
+ /* Performance counters */
+ unsigned long long p1_start, p2_start;
+ } runstate;
+ struct pcpu_info *p;
+ tsc_t pcpu_tsc;
+
+ /* Hardware tracking */
+ struct {
+ long long val;
+ tsc_t start_time;
+ struct cr3_value_struct *data;
+ } cr3;
+
+ /* IPI latency tracking */
+ struct vlapic_struct vlapic;
+
+ /* Summary info */
+ struct cycle_framework f;
+ struct cycle_summary runstates[RUNSTATE_MAX];
+ struct cycle_summary runnable_states[RUNNABLE_STATE_MAX];
+ struct weighted_cpi_summary cpi;
+ struct cycle_summary cpu_affinity_all,
+ cpu_affinity_pcpu[MAX_CPUS];
+ enum {
+ VCPU_DATA_NONE=0,
+ VCPU_DATA_HVM,
+ VCPU_DATA_PV
+ } data_type;
+ union {
+ struct hvm_data hvm;
+ struct pv_data pv;
+ };
+};
+
+enum {
+ DOMAIN_RUNSTATE_BLOCKED=0,
+ DOMAIN_RUNSTATE_PARTIAL_RUN,
+ DOMAIN_RUNSTATE_FULL_RUN,
+ DOMAIN_RUNSTATE_PARTIAL_CONTENTION,
+ DOMAIN_RUNSTATE_CONCURRENCY_HAZARD,
+ DOMAIN_RUNSTATE_FULL_CONTENTION,
+ DOMAIN_RUNSTATE_LOST,
+ DOMAIN_RUNSTATE_MAX
+};
+
+char * domain_runstate_name[] = {
+ [DOMAIN_RUNSTATE_BLOCKED]="blocked",
+ [DOMAIN_RUNSTATE_PARTIAL_RUN]="partial run",
+ [DOMAIN_RUNSTATE_FULL_RUN]="full run",
+ [DOMAIN_RUNSTATE_PARTIAL_CONTENTION]="partial contention",
+ [DOMAIN_RUNSTATE_CONCURRENCY_HAZARD]="concurrency_hazard",
+ [DOMAIN_RUNSTATE_FULL_CONTENTION]="full_contention",
+ [DOMAIN_RUNSTATE_LOST]="lost",
+};
+
+enum {
+ POD_RECLAIM_CONTEXT_UNKNOWN=0,
+ POD_RECLAIM_CONTEXT_FAULT,
+ POD_RECLAIM_CONTEXT_BALLOON,
+ POD_RECLAIM_CONTEXT_MAX
+};
+
+char * pod_reclaim_context_name[] = {
+ [POD_RECLAIM_CONTEXT_UNKNOWN]="unknown",
+ [POD_RECLAIM_CONTEXT_FAULT]="fault",
+ [POD_RECLAIM_CONTEXT_BALLOON]="balloon",
+};
+
+#define POD_ORDER_MAX 4
+
+struct domain_data {
+ struct domain_data *next;
+ int did;
+ struct vcpu_data *vcpu[MAX_CPUS];
+
+ int max_vid;
+
+ int runstate;
+ tsc_t runstate_tsc;
+ struct cycle_summary total_time;
+ struct cycle_summary runstates[DOMAIN_RUNSTATE_MAX];
+ struct cr3_value_struct *cr3_value_head;
+ struct eip_list_struct *emulate_eip_list;
+ struct eip_list_struct *interrupt_eip_list;
+
+ int guest_interrupt[GUEST_INTERRUPT_MAX+1];
+ struct hvm_short_summary_struct hvm_short;
+ struct {
+ int done[MEM_MAX];
+ int done_interval[MEM_MAX];
+
+ int done_for[MEM_MAX];
+ int done_for_interval[MEM_MAX];
+ } memops;
+
+ struct {
+ int reclaim_order[POD_ORDER_MAX];
+ int reclaim_context[POD_RECLAIM_CONTEXT_MAX];
+ int reclaim_context_order[POD_RECLAIM_CONTEXT_MAX][POD_ORDER_MAX];
+ /* FIXME: Do a full cycle summary */
+ int populate_order[POD_ORDER_MAX];
+ } pod;
+};
+
+struct domain_data * domain_list=NULL;
+
+struct domain_data default_domain;
+
+enum {
+ TOPLEVEL_GEN=0,
+ TOPLEVEL_SCHED,
+ TOPLEVEL_DOM0OP,
+ TOPLEVEL_HVM,
+ TOPLEVEL_MEM,
+ TOPLEVEL_PV,
+ TOPLEVEL_SHADOW,
+ TOPLEVEL_HW,
+ TOPLEVEL_MAX=TOPLEVEL_HW+1,
+};
+
+char * toplevel_name[TOPLEVEL_MAX] = {
+ [TOPLEVEL_GEN]="gen",
+ [TOPLEVEL_SCHED]="sched",
+ [TOPLEVEL_DOM0OP]="dom0op",
+ [TOPLEVEL_HVM]="hvm",
+ [TOPLEVEL_MEM]="mem",
+ [TOPLEVEL_PV]="pv",
+ [TOPLEVEL_SHADOW]="shadow",
+ [TOPLEVEL_HW]="hw",
+};
+
+struct trace_volume {
+ unsigned long long toplevel[TOPLEVEL_MAX];
+ unsigned long long sched_verbose;
+ unsigned long long hvm[HVM_VOL_MAX];
+} volume;
+
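+/* Charge _s bytes of trace data to both the cumulative and the
+ * current-buffer counters of a pcpu's volume accounting (see
+ * struct pcpu_info below). */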
+#define UPDATE_VOLUME(_p,_x,_s) \
+ do { \
+ (_p)->volume.total._x += _s; \
+ (_p)->volume.last_buffer._x += _s; \
+ } while(0)
+
+void volume_clear(struct trace_volume *vol)
+{
+ bzero(vol, sizeof(*vol));
+}
+
+void volume_summary(struct trace_volume *vol)
+{
+ int j, k;
+ for(j=0; j<TOPLEVEL_MAX; j++)
+ if(vol->toplevel[j]) {
+ printf(" %-6s: %10lld\n",
+ toplevel_name[j], vol->toplevel[j]);
+ switch(j) {
+ case TOPLEVEL_SCHED:
+ if(vol->sched_verbose)
+ printf(" +-verbose: %10lld\n",
+ vol->sched_verbose);
+ break;
+ case TOPLEVEL_HVM:
+ for(k=0; k<HVM_VOL_MAX; k++) {
+ if(vol->hvm[k])
+ printf(" +-%-7s: %10lld\n",
+ hvm_vol_name[k], vol->hvm[k]);
+ }
+
+ break;
+ }
+ }
+}
+
+struct pcpu_info {
+ /* Information about this pcpu */
+ unsigned active:1, summary:1;
+ int pid;
+
+ /* Information related to scanning thru the file */
+ tsc_t first_tsc, last_tsc, order_tsc;
+ off_t file_offset;
+ off_t next_cpu_change_offset;
+ struct record_info ri;
+ int last_cpu_change_pid;
+ int power_state;
+
+ /* Information related to tsc skew detection / correction */
+ struct {
+ tsc_t offset;
+ cpu_mask_t downstream; /* To detect cycles in dependencies */
+ } tsc_skew;
+
+ /* Information related to domain tracking */
+ struct vcpu_data * current;
+ struct {
+ unsigned active:1,
+ domain_valid:1,
+ seen_valid_schedule:1; /* Seen an actual schedule since lost records */
+ unsigned did:16,vid:16;
+ tsc_t tsc;
+ } lost_record;
+
+ /* Record volume */
+ struct {
+ tsc_t buffer_first_tsc,
+ buffer_dom0_runstate_tsc,
+ buffer_dom0_runstate_cycles[RUNSTATE_MAX];
+ int buffer_dom0_runstate;
+ unsigned buffer_size;
+ struct trace_volume total, last_buffer;
+ } volume;
+
+ /* Time report */
+ struct {
+ tsc_t tsc;
+ struct cycle_summary idle, running, lost;
+ } time;
+};
+
+void __fill_in_record_info(struct pcpu_info *p);
+
+#define INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX 10
+
+struct {
+ int max_active_pcpu;
+ off_t last_epoch_offset;
+ int early_eof;
+ int lost_cpus;
+ tsc_t now;
+ struct cycle_framework f;
+ tsc_t buffer_trace_virq_tsc;
+ struct pcpu_info pcpu[MAX_CPUS];
+
+ struct {
+ int id;
+ /* Invariant: head null => tail null; head !null => tail valid */
+ struct cr3_value_struct *head, **tail;
+ } cr3;
+
+ struct {
+ tsc_t start_tsc;
+ /* Information about specific interval output types */
+ union {
+ struct {
+ struct interval_element ** values;
+ int count;
+ } array;
+ struct {
+ struct interval_list *head, **tail;
+ } list;
+ struct cr3_value_struct *cr3;
+ struct {
+ struct domain_data *d;
+ int guest_vector[INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX];
+ } domain;
+ };
+ } interval;
+} P = { 0 };
+
+/* Function prototypes */
+char * pcpu_string(int pcpu);
+void pcpu_string_draw(struct pcpu_info *p);
+void process_generic(struct record_info *ri);
+void dump_generic(FILE *f, struct record_info *ri);
+ssize_t __read_record(struct trace_record *rec, off_t offset);
+void error(enum error_level l, struct record_info *ri);
+void update_io_address(struct io_address ** list, unsigned int pa, int dir,
+ tsc_t arc_cycles, unsigned int va);
+int check_extra_words(struct record_info *ri, int expected_size, const char *record);
+int vcpu_set_data_type(struct vcpu_data *v, int type);
+
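+/* Note that cpu_mask_t is only 32 bits wide: these helpers can track
+ * at most pcpus 0-31, even though MAX_CPUS may be larger, and shifting
+ * by a larger cpu number is undefined behaviour. */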
+void cpumask_init(cpu_mask_t *c) {
+ *c = 0UL;
+}
+
+void cpumask_clear(cpu_mask_t *c, int cpu) {
+ *c &= ~(1UL << cpu);
+}
+
+void cpumask_set(cpu_mask_t *c, int cpu) {
+ *c |= (1UL << cpu);
+}
+
+int cpumask_isset(const cpu_mask_t *c, int cpu) {
+ if(*c & (1UL<<cpu))
+ return 1;
+ else
+ return 0;
+}
+
+void cpumask_union(cpu_mask_t *d, const cpu_mask_t *s) {
+ *d |= *s;
+}
+
+/* -- Time code -- */
+
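+/* opt.cpu_qhz appears to hold the cpu frequency pre-scaled as
+ * (cpu_hz << 10) / 10^9, so (cycles << 10) / cpu_qhz yields
+ * nanoseconds while staying within 64-bit arithmetic. */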
+void cycles_to_time(unsigned long long c, struct time_struct *t) {
+ t->time = ((c - P.f.first_tsc) << 10) / opt.cpu_qhz;
+ t->s = t->time / 1000000000;
+ t->ns = t->time - (t->s * 1000000000);
+}
+
+void abs_cycles_to_time(unsigned long long ac, struct time_struct *t) {
+ if(ac > P.f.first_tsc) {
+ /* t->time = ((ac - P.f.first_tsc) * 1000) / (opt.cpu_hz / 1000000 ); */
+ /* t->s = t->time / 1000000000; */
+ /* t->ns = t->time % 1000000000; */
+ t->time = ((ac - P.f.first_tsc) << 10) / opt.cpu_qhz;
+ t->s = t->time / 1000000000;
+ t->ns = t->time - (t->s * 1000000000);
+ } else {
+ t->time = t->s = t->ns = 0;
+ }
+}
+
+tsc_t abs_cycles_to_global(unsigned long long ac) {
+ if(ac > P.f.first_tsc)
+ return ac - P.f.first_tsc;
+ else
+ return 0;
+}
+
+void scatterplot_vs_time(tsc_t atsc, long long y) {
+ struct time_struct t;
+
+ abs_cycles_to_time(atsc, &t);
+
+ printf("%u.%09u %lld\n", t.s, t.ns, y);
+}
+
+/* -- Summary Code -- */
+
+/* With compliments to "Numerical Recipes in C", which provided the algorithm
+ * and basic template for this function. */
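+/* This is quickselect: partition the array around A[K], then narrow
+ * [L,R] to whichever side still contains the target index K=N*ple/100.
+ * Note that A is partially reordered in place. */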
+long long percentile(long long * A, int N, int ple) {
+ int I, J, L, R, K;
+
+ long long X, W;
+
+ /* No samples! */
+ if ( N == 0 )
+ return 0;
+
+ /* Find K, the element # we want */
+ K=N*ple/100;
+
+ /* Set the left and right boundaries of the current search space */
+ L=0; R=N-1;
+
+ while(L < R) {
+ /* X: The value to order everything higher / lower than */
+ X=A[K];
+
+ /* Starting at the left and the right... */
+ I=L;
+ J=R;
+
+ do {
+ /* Find the first element on the left that is out-of-order w/ X */
+ while(A[I]<X)
+ I++;
+ /* Find the first element on the right that is out-of-order w/ X */
+ while(X<A[J])
+ J--;
+
+ /* If we found something out-of-order */
+ if(I<=J) {
+ /* Switch the values */
+ W=A[I];
+ A[I]=A[J];
+ A[J]=W;
+
+ /* And move on */
+ I++; J--;
+ }
+ } while (I <= J); /* Keep going until our pointers meet or pass */
+
+ /* Re-adjust L and R, based on which element we're looking for */
+ if(J<K)
+ L=I;
+ if(K<I)
+ R=J;
+ }
+
+ return A[K];
+}
+
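+/* Weighted variant of the above: selects the value at which the
+ * cumulative weight first reaches ple% of the total weight. For
+ * example, values {1,2,3} with weights {1,1,8} have a 50th
+ * percentile of 3, since 3 carries 80% of the weight. */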
+float weighted_percentile(float * A, /* values */
+ unsigned long long * w, /* weights */
+ int N, /* total */
+ int ple) /* percentile */
+{
+ int L, R, I, J, K;
+ unsigned long long L_weight, R_weight, I_weight, J_weight,
+ K_weight, N_weight;
+
+ float X, t1;
+ unsigned long long t2;
+
+ /* Calculate total weight */
+ N_weight=0;
+
+ for(I=0; I<N; I++) {
+ assert(w[I]!=0);
+ N_weight += w[I];
+ }
+
+ /* Find K_weight, the target weight we want */
+ K_weight = N_weight * ple / 100;
+
+ /* Set the left and right boundaries of the current search space */
+ L=0;
+ L_weight = 0;
+ R=N-1;
+ R_weight = N_weight - w[R];
+
+ /* Search between L and R, narrowing down until we're done */
+ while(L < R) {
+ /* Choose an ordering value from right in the middle */
+ K = (L + R) >> 1;
+ /* X: The value to order everything higher / lower than */
+ X=A[K];
+
+ /* Starting at the left and the right... */
+ I=L; I_weight = L_weight;
+ J=R; J_weight = R_weight;
+
+ do {
+ /* Find the first element on the left that is out-of-order w/ X */
+ while(A[I]<X) {
+ I_weight += w[I];
+ I++;
+ }
+ /* Find the first element on the right that is out-of-order w/ X */
+ while(X<A[J]) {
+ J_weight -= w[J];
+ J--;
+ }
+
+ /* If we actually found something... */
+ if(I<=J) {
+ /* Switch the values */
+ t1=A[I];
+ A[I]=A[J];
+ A[J]=t1;
+
+ t2=w[I];
+ w[I]=w[J];
+ w[J]=t2;
+
+ /* And move in */
+ I_weight += w[I];
+ I++;
+
+ J_weight -= w[J];
+ J--;
+ }
+ } while (I <= J); /* Keep going until our pointers meet or pass */
+
+ /* Re-adjust L and R, based on which element we're looking for */
+ if(J_weight<K_weight)
+ L=I; L_weight = I_weight;
+ if(K_weight<I_weight)
+ R=J; R_weight = J_weight;
+ }
+
+ return A[L];
+}
+
+long long self_weighted_percentile(long long * A,
+ int N, /* total */
+ int ple) /* percentile */
+{
+ int L, R, I, J, K;
+ long long L_weight, R_weight, I_weight, J_weight,
+ K_weight, N_weight;
+
+ long long X, t1;
+
+ /* Calculate total weight */
+ N_weight=0;
+
+ for(I=0; I<N; I++) {
+ if(A[I] < 0)
+ fprintf(warn, "%s: Value %lld less than zero!\n",
+ __func__, A[I]);
+ assert(A[I]!=0);
+ N_weight += A[I];
+ }
+
+ /* Find K_weight, the target weight we want */
+ K_weight = N_weight * ple / 100;
+
+ /* Set the left and right boundaries of the current search space */
+ L=0;
+ L_weight = 0;
+ R=N-1;
+ R_weight = N_weight - A[R];
+
+ /* Search between L and R, narrowing down until we're done */
+ while(L < R) {
+ /* Choose an ordering value from right in the middle */
+ K = (L + R) >> 1;
+ /* X: The value to order everything higher / lower than */
+ X=A[K];
+
+ /* Starting at the left and the right... */
+ I=L; I_weight = L_weight;
+ J=R; J_weight = R_weight;
+
+ do {
+ /* Find the first element on the left that is out-of-order w/ X */
+ while(A[I]<X) {
+ I_weight += A[I];
+ I++;
+ }
+ /* Find the first element on the right that is out-of-order w/ X */
+ while(X<A[J]) {
+ J_weight -= A[J];
+ J--;
+ }
+
+ /* If we actually found something... */
+ if(I<=J) {
+ /* Switch the values */
+ t1=A[I];
+ A[I]=A[J];
+ A[J]=t1;
+
+ /* And move in */
+ I_weight += A[I];
+ I++;
+
+ J_weight -= A[J];
+ J--;
+ }
+ } while (I <= J); /* Keep going until our pointers meet or pass */
+
+ /* Re-adjust L and R, based on which element we're looking for */
+ if(J_weight<K_weight)
+ L=I; L_weight = I_weight;
+ if(K_weight<I_weight)
+ R=J; R_weight = J_weight;
+ }
+
+ return A[L];
+}
+
+static inline double __cycles_percent(long long cycles, long long total) {
+ return (double)(cycles*100) / total;
+}
+
+static inline double __summary_percent(struct event_cycle_summary *s,
+ struct cycle_framework *f) {
+ return __cycles_percent(s->cycles, f->total_cycles);
+}
+
+static inline double summary_percent_global(struct event_cycle_summary *s) {
+ return __summary_percent(s, &P.f);
+}
+
+static inline void update_summary(struct event_cycle_summary *s, long long c) {
+/* We don't know ahead of time how many samples there are, and working
+ * with dynamic stuff is a pain, and unnecessary. This algorithm will
+ * generate a sample set that approximates an even sample. We can
+ * then take the percentiles on this, and get an approximate value. */
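+/* Concretely, with opt.sample_size == 10: events 0-9 fill slots 0-9;
+ * events 10-19 (lap 2) overwrite slots 0,2,4,6,8; events 20-29 (lap 3)
+ * overwrite slots 1,4,7; each pass touches a sparser, staggered
+ * subset, approximating a uniform sample of the whole stream. */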
+ if(c) {
+ if(opt.sample_size) {
+ int lap = (s->cycles_count/opt.sample_size)+1,
+ index =s->cycles_count % opt.sample_size;
+ if((index - (lap/3))%lap == 0) {
+ if(!s->cycles_sample) {
+ s->cycles_sample = malloc(sizeof(*s->cycles_sample) * opt.sample_size);
+ if(!s->cycles_sample) {
+ fprintf(stderr, "%s: malloc failed!\n", __func__);
+ error(ERR_SYSTEM, NULL);
+ }
+ }
+ s->cycles_sample[index]=c;
+ }
+ }
+ s->cycles_count++;
+ s->cycles += c;
+
+ s->interval.count++;
+ s->interval.cycles += c;
+ }
+ s->count++;
+}
+
+static inline void clear_interval_summary(struct event_cycle_summary *s) {
+ s->interval.count = 0;
+ s->interval.cycles = 0;
+}
+
+static inline void update_cycles(struct cycle_summary *s, long long c) {
+/* We don't know ahead of time how many samples there are, and working
+ * with dynamic stuff is a pain, and unnecessary. This algorithm will
+ * generate a sample set that approximates an even sample. We can
+ * then take the percentiles on this, and get an approximate value. */
+ int lap, index;
+
+ if ( c == 0 )
+ {
+ fprintf(warn, "%s: cycles 0! Not updating...\n",
+ __func__);
+ return;
+ }
+
+ if ( opt.sample_size ) {
+ lap = (s->count/opt.sample_size)+1;
+ index =s->count % opt.sample_size;
+
+ if((index - (lap/3))%lap == 0) {
+ if(!s->sample) {
+ s->sample = malloc(sizeof(*s->sample) * opt.sample_size);
+ if(!s->sample) {
+ fprintf(stderr, "%s: malloc failed!\n", __func__);
+ error(ERR_SYSTEM, NULL);
+ }
+ }
+ s->sample[index] = c;
+ }
+ }
+
+ if(c > 0) {
+ s->cycles += c;
+ s->interval.cycles += c;
+ } else {
+ s->cycles += -c;
+ s->interval.cycles += -c;
+ }
+ s->count++;
+ s->interval.count++;
+}
+
+static inline void clear_interval_cycles(struct interval_element *e) {
+ e->cycles = 0;
+ e->count = 0;
+ e->instructions = 0;
+}
+
+static inline void update_cpi(struct weighted_cpi_summary *s,
+ unsigned long long i,
+ unsigned long long c) {
+/* We don't know ahead of time how many samples there are, and working
+ * with dynamic stuff is a pain, and unnecessary. This algorithm will
+ * generate a sample set that approximates an even sample. We can
+ * then take the percentiles on this, and get an approximate value. */
+ int lap, index;
+
+ if ( opt.sample_size ) {
+ lap = (s->count/opt.sample_size)+1;
+ index =s->count % opt.sample_size;
+
+ if((index - (lap/3))%lap == 0) {
+ if(!s->cpi) {
+ assert(!s->cpi_weight);
+
+ s->cpi = malloc(sizeof(*s->cpi) * opt.sample_size);
+ s->cpi_weight = malloc(sizeof(*s->cpi_weight) * opt.sample_size);
+ if(!s->cpi || !s->cpi_weight) {
+ fprintf(stderr, "%s: malloc failed!\n", __func__);
+ error(ERR_SYSTEM, NULL);
+ }
+ }
+ assert(s->cpi_weight);
+
+ s->cpi[index] = (float) c / i;
+ s->cpi_weight[index]=c;
+ }
+ }
+
+ s->instructions += i;
+ s->cycles += c;
+ s->count++;
+
+ s->interval.instructions += i;
+ s->interval.cycles += c;
+ s->interval.count++;
+}
+
+static inline void clear_interval_cpi(struct weighted_cpi_summary *s) {
+ s->interval.cycles = 0;
+ s->interval.count = 0;
+ s->interval.instructions = 0;
+}
+
+static inline void print_cpu_affinity(struct cycle_summary *s, char *p) {
+ if(s->count) {
+ long long avg;
+
+ avg = s->cycles / s->count;
+
+ if ( opt.sample_size ) {
+ long long p5, p50, p95;
+ int data_size = s->count;
+ if(data_size > opt.sample_size)
+ data_size = opt.sample_size;
+
+ p50 = percentile(s->sample, data_size, 50);
+ p5 = percentile(s->sample, data_size, 5);
+ p95 = percentile(s->sample, data_size, 95);
+
+ printf("%s: %7d %6lld {%6lld|%6lld|%6lld}\n",
+ p, s->count, avg, p5, p50, p95);
+ } else {
+ printf("%s: %7d %6lld\n",
+ p, s->count, avg);
+ }
+ }
+}
+
+static inline void print_cpi_summary(struct weighted_cpi_summary *s) {
+ if(s->count) {
+ float avg;
+
+ avg = (float)s->cycles / s->instructions;
+
+ if ( opt.sample_size ) {
+ float p5, p50, p95;
+ int data_size = s->count;
+
+ if(data_size > opt.sample_size)
+ data_size = opt.sample_size;
+
+ p50 = weighted_percentile(s->cpi, s->cpi_weight, data_size, 50);
+ p5 = weighted_percentile(s->cpi, s->cpi_weight, data_size, 5);
+ p95 = weighted_percentile(s->cpi, s->cpi_weight, data_size, 95);
+
+ printf(" CPI summary: %2.2f {%2.2f|%2.2f|%2.2f}\n",
+ avg, p5, p50, p95);
+ } else {
+ printf(" CPI summary: %2.2f\n", avg);
+ }
+ }
+}
+
+static inline void print_cycle_percent_summary(struct cycle_summary *s,
+ tsc_t total, char *p) {
+ if(s->count) {
+ long long avg;
+ double percent, seconds;
+
+ avg = s->cycles / s->count;
+
+ seconds = ((double)s->cycles) / opt.cpu_hz;
+
+ percent = ((double)(s->cycles * 100)) / total;
+
+ if ( opt.sample_size ) {
+ long long p5, p50, p95;
+ int data_size = s->count;
+
+ if(data_size > opt.sample_size)
+ data_size = opt.sample_size;
+
+ p50 = self_weighted_percentile(s->sample, data_size, 50);
+ p5 = self_weighted_percentile(s->sample, data_size, 5);
+ p95 = self_weighted_percentile(s->sample, data_size, 95);
+
+ printf("%s: %7d %5.2lfs %5.2lf%% %6lld {%6lld|%6lld|%6lld}\n",
+ p, s->count,
+ seconds,
+ percent,
+ avg, p5, p50, p95);
+ } else {
+ printf("%s: %7d %5.2lfs %5.2lf%% %6lld\n",
+ p, s->count,
+ seconds,
+ percent,
+ avg);
+ }
+ }
+}
+
+static inline void print_cycle_summary(struct cycle_summary *s, char *p) {
+ if(s->count) {
+ long long avg;
+
+ avg = s->cycles / s->count;
+
+ if ( opt.sample_size ) {
+ long long p5, p50, p95;
+ int data_size = s->count;
+
+ if(data_size > opt.sample_size)
+ data_size = opt.sample_size;
+
+ p50 = self_weighted_percentile(s->sample, data_size, 50);
+ p5 = self_weighted_percentile(s->sample, data_size, 5);
+ p95 = self_weighted_percentile(s->sample, data_size, 95);
+
+ printf("%s: %7d %5.2lfs %6lld {%6lld|%6lld|%6lld}\n",
+ p, s->count, ((double)s->cycles)/opt.cpu_hz,
+ avg, p5, p50, p95);
+ } else {
+ printf("%s: %7d %5.2lfs %6lld\n",
+ p, s->count, ((double)s->cycles)/opt.cpu_hz, avg);
+ }
+ }
+}
+
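+/* Prints: event count, total seconds, percentage of all traced
+ * cycles, average cycles per event, and (when sampling is enabled)
+ * the {5th|50th|95th} percentile cycle counts. */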
+#define PRINT_SUMMARY(_s, _p...) \
+ do { \
+ if((_s).count) { \
+ if ( opt.sample_size ) { \
+ unsigned long long p5, p50, p95; \
+ int data_size=(_s).cycles_count; \
+ if(data_size > opt.sample_size) \
+ data_size=opt.sample_size; \
+ p50=percentile((_s).cycles_sample, data_size, 50); \
+ p5=percentile((_s).cycles_sample, data_size, 5); \
+ p95=percentile((_s).cycles_sample, data_size, 95); \
+ printf(_p); \
+ printf(" %7d %5.2lfs %5.2lf%% %5lld cyc {%5lld|%5lld|%5lld}\n", \
+ (_s).count, \
+ ((double)(_s).cycles)/opt.cpu_hz, \
+ summary_percent_global(&(_s)), \
+ (_s).cycles_count ? (_s).cycles / (_s).cycles_count:0, \
+ p5, p50, p95); \
+ } else { \
+ printf(_p); \
+ printf(" %7d %5.2lfs %5.2lf%% %5lld cyc\n", \
+ (_s).count, \
+ ((double)(_s).cycles)/opt.cpu_hz, \
+ summary_percent_global(&(_s)), \
+ (_s).cycles_count ? (_s).cycles / (_s).cycles_count:0); \
+ } \
+ } \
+ } while(0)
+
+#define INTERVAL_DESC_MAX 31
+struct interval_list {
+ struct interval_element *elem;
+ struct interval_list *next;
+ char desc[INTERVAL_DESC_MAX+1]; /* +1 for the null terminator */
+};
+
+void __interval_cycle_percent_output(struct interval_element *e, tsc_t cycles) {
+ printf(" %.02lf",
+ __cycles_percent(e->cycles, cycles));
+ clear_interval_cycles(e);
+}
+
+void interval_cycle_percent_output(struct interval_element *e) {
+ __interval_cycle_percent_output(e, opt.interval.cycles);
+}
+
+void interval_time_output(void) {
+ struct time_struct t;
+ abs_cycles_to_time(P.interval.start_tsc, &t);
+
+ printf("%u.%09u", t.s, t.ns);
+}
+
+void interval_table_output(void) {
+ int i;
+
+ interval_time_output();
+
+ if(opt.interval.mode == INTERVAL_MODE_ARRAY) {
+ for(i=0; i<P.interval.array.count; i++) {
+ struct interval_element *e = P.interval.array.values[i];
+ if(e) {
+ interval_cycle_percent_output(e);
+ } else {
+ printf(" 0.0");
+ }
+ }
+ } else if(opt.interval.mode == INTERVAL_MODE_LIST) {
+ struct interval_list *p;
+ for(p = P.interval.list.head; p; p = p->next)
+ interval_cycle_percent_output(p->elem);
+ }
+ printf("\n");
+}
+
+void interval_table_tail(void) {
+ struct interval_list *p;
+
+ printf("time");
+
+ for(p=P.interval.list.head; p; p = p->next)
+ printf(" %s", p->desc);
+
+ printf("\n");
+}
+
+void interval_table_alloc(int count) {
+ P.interval.array.count = count;
+ P.interval.array.values = malloc(count * sizeof(*P.interval.array.values));
+
+ if(!P.interval.array.values) {
+ fprintf(stderr, "Malloc failed!\n");
+ error(ERR_SYSTEM, NULL);
+ }
+
+ bzero(P.interval.array.values, count*sizeof(*P.interval.array.values));
+}
+
+void interval_list_add(struct interval_element *e, char *desc) {
+ struct interval_list *p;
+
+ fprintf(warn, "%s: Adding element '%s'\n", __func__, desc);
+
+ if((p=malloc(sizeof(*p)))==NULL) {
+ fprintf(stderr, "malloc() failed.\n");
+ error(ERR_SYSTEM, NULL);
+ }
+
+ bzero(p, sizeof(*p));
+
+ p->elem = e;
+ strncpy(p->desc, desc, INTERVAL_DESC_MAX);
+
+ p->next=NULL;
+
+ if(P.interval.list.head)
+ *P.interval.list.tail = p;
+ else
+ P.interval.list.head = p;
+ P.interval.list.tail = &p->next;
+}
+
+void interval_cr3_schedule_time_header(void) {
+ if( opt.interval.mode == INTERVAL_MODE_ARRAY ) {
+ int i;
+
+ printf("time");
+ for(i=0; i<opt.interval.array.count; i++) {
+ printf(" %llx", opt.interval.array.values[i]);
+ }
+ printf("\n");
+ }
+ /* Can't see into the future, so no header if cr3 values are
+ not specified. */
+}
+
+void interval_cr3_value_check(struct cr3_value_struct *cr3) {
+ if( opt.interval.mode == INTERVAL_MODE_ARRAY ) {
+ int i;
+
+ for(i=0; i<opt.interval.array.count; i++) {
+ if(cr3->gmfn == opt.interval.array.values[i]) {
+ if(P.interval.array.values[i]) {
+ fprintf(stderr, "Fatal: duplicate cr3 value %llx!\n",
+ cr3->gmfn);
+ error(ERR_ASSERT, NULL);
+ }
+ fprintf(stderr, "%s: found gmfn %llx\n",
+ __func__, cr3->gmfn);
+
+ P.interval.array.values[i] = &cr3->total_time.interval;
+ }
+ }
+ } else if(opt.interval.mode == INTERVAL_MODE_LIST) {
+ char desc[32];
+ snprintf(desc, 32, "%llx", cr3->gmfn);
+ interval_list_add(&cr3->total_time.interval, desc);
+ } else {
+ /* Custom */
+ if(cr3->gmfn == opt.interval.array.values[0])
+ P.interval.cr3 = cr3;
+ }
+}
+
+void interval_cr3_schedule_ordered_output(void) {
+ struct cr3_value_struct *p;
+ int i;
+
+ struct cr3_value_struct **qsort_array;
+ int N=0;
+
+ int cr3_time_compare(const void *_a, const void *_b) {
+ struct cr3_value_struct *a=*(typeof(&a))_a;
+ struct cr3_value_struct *b=*(typeof(&a))_b;
+
+ if(a->total_time.interval.cycles < b->total_time.interval.cycles)
+ return 1;
+ else if(b->total_time.interval.cycles == a->total_time.interval.cycles) {
+ if(a->total_time.interval.count < b->total_time.interval.count)
+ return 1;
+ else if(a->total_time.interval.count == b->total_time.interval.count)
+ return 0;
+ else
+ return -1;
+ } else
+ return -1;
+ }
+
+ for(p=P.cr3.head; p; p=p->gnext)
+ N++;
+
+ if(!N)
+ return;
+
+ qsort_array = malloc(N * sizeof(*qsort_array));
+ if(!qsort_array) {
+ fprintf(stderr, "%s: malloc failed!\n", __func__);
+ error(ERR_SYSTEM, NULL);
+ }
+
+ for(i=0, p=P.cr3.head; p; p=p->gnext, i++)
+ qsort_array[i]=p;
+
+ qsort(qsort_array, N, sizeof(*qsort_array),
+ cr3_time_compare);
+
+ interval_time_output();
+
+ for(i=0; i<N; i++) {
+ p = qsort_array[i];
+ /* Rounding down means entries in (0..1]% may print as 0.00 */
+ if(p->total_time.interval.cycles > 0) {
+ printf(" %8llx: %.02lf %c\n",
+ p->gmfn,
+ __cycles_percent(p->total_time.interval.cycles,
+ opt.interval.cycles),
+ (p->first_time > P.interval.start_tsc)?'*':' ');
+ }
+ clear_interval_cycles(&p->total_time.interval);
+ }
+
+ free(qsort_array);
+}
+
+void interval_cr3_short_summary_header(void) {
+ int i;
+
+ printf("time guest");
+ for(i=0; i<HVM_SHORT_SUMMARY_MAX; i++)
+ printf(" %s", hvm_short_summary_name[i]);
+ printf("\n");
+}
+
+void interval_cr3_short_summary_output(void) {
+ struct cycle_summary *hss_array;
+ int i;
+
+ if(P.interval.cr3) {
+ struct cr3_value_struct *p = P.interval.cr3;
+
+ interval_time_output();
+
+ hss_array = p->hvm.s;
+
+ printf(" %.02lf",
+ __cycles_percent(p->total_time.interval.cycles,
+ opt.interval.cycles));
+
+ for(i=0; i<HVM_SHORT_SUMMARY_MAX; i++)
+ __interval_cycle_percent_output(&hss_array[i].interval,
+ p->total_time.interval.cycles);
+
+ clear_interval_cycles(&p->total_time.interval);
+
+ printf("\n");
+ }
+}
+
+void interval_domain_value_check(struct domain_data *d) {
+ if( opt.interval.mode == INTERVAL_MODE_ARRAY ) {
+ int i;
+
+ for(i=0; i<opt.interval.array.count; i++) {
+ if(d->did == opt.interval.array.values[i]) {
+ if(P.interval.array.values[i]) {
+ fprintf(stderr, "Fatal: duplicate domain value %d!\n",
+ d->did);
+ error(ERR_ASSERT, NULL);
+ }
+
+ P.interval.array.values[i] = &d->total_time.interval;
+ }
+ }
+ } else if(opt.interval.mode == INTERVAL_MODE_LIST) {
+ char desc[32];
+ snprintf(desc, 32, "%d", d->did);
+ interval_list_add(&d->total_time.interval, desc);
+ } else {
+ if(d->did == opt.interval.array.values[0])
+ P.interval.domain.d = d;
+ }
+}
+
+void interval_domain_short_summary_header(void) {
+ int i;
+
+ printf("time running");
+ for(i=0; i<HVM_SHORT_SUMMARY_MAX; i++)
+ printf(" %s", hvm_short_summary_name[i]);
+ printf("\n");
+}
+
+void interval_domain_short_summary_output(void) {
+
+ if(P.interval.domain.d) {
+ struct domain_data *d;
+ int i;
+
+ d=P.interval.domain.d;
+
+ interval_time_output();
+
+ interval_cycle_percent_output(&d->total_time.interval);
+
+ for(i=0; i<HVM_SHORT_SUMMARY_MAX; i++)
+ interval_cycle_percent_output(&d->hvm_short.s[i].interval);
+
+ printf("\n");
+ }
+}
+
+void interval_domain_guest_interrupt(struct hvm_data *h, int vector) {
+ struct domain_data *d = h->v->d;
+ int i;
+
+ /* Check to see if this vector is in the "print list" */
+ for(i=0; i<INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX; i++) {
+ if(P.interval.domain.guest_vector[i] == 0) {
+ P.interval.domain.guest_vector[i] = vector;
+ break;
+ }
+ if(P.interval.domain.guest_vector[i] == vector)
+ break;
+ }
+
+ if(i == INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX) {
+ fprintf(stderr, "FATAL: used up all %d guest interrupt slots!\n",
+ INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX);
+ error(ERR_LIMIT, NULL);
+ } else {
+ d->guest_interrupt[vector]++;
+ }
+}
+
+void interval_domain_guest_interrupt_tail(void) {
+ int i;
+
+ printf("time running");
+ for(i=0; i<INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX; i++) {
+ if(P.interval.domain.guest_vector[i] == 0)
+ break;
+ printf(" %d", P.interval.domain.guest_vector[i]);
+ }
+ printf("\n");
+}
+
+void interval_domain_guest_interrupt_output(void) {
+
+ if(P.interval.domain.d) {
+ struct domain_data *d;
+ int i;
+
+ d=P.interval.domain.d;
+
+ interval_time_output();
+
+ for(i=0; i<INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX; i++) {
+ int v = P.interval.domain.guest_vector[i];
+
+ if(v == 0)
+ break;
+
+ printf(" %d", d->guest_interrupt[v]);
+
+ d->guest_interrupt[v]=0;
+ }
+
+ printf("\n");
+ }
+
+}
+
+void interval_domain_grant_maps_output(void) {
+
+ if(P.interval.domain.d) {
+ struct domain_data *d;
+
+ d=P.interval.domain.d;
+
+ interval_time_output();
+
+ printf(" %d", d->memops.done_for_interval[MEM_PAGE_GRANT_MAP]);
+
+ d->memops.done_for_interval[MEM_PAGE_GRANT_MAP] = 0;
+
+ printf("\n");
+ }
+}
+
+/* General interval gateways */
+
+void interval_callback(void) {
+ /* First, see if we're in generic mode. */
+ switch(opt.interval.mode) {
+ case INTERVAL_MODE_LIST:
+ case INTERVAL_MODE_ARRAY:
+ interval_table_output();
+ return;
+ default:
+ break;
+ }
+
+ switch(opt.interval.output) {
+ case INTERVAL_CR3_SCHEDULE_ORDERED:
+ interval_cr3_schedule_ordered_output();
+ break;
+ case INTERVAL_CR3_SHORT_SUMMARY:
+ interval_cr3_short_summary_output();
+ break;
+ case INTERVAL_DOMAIN_SHORT_SUMMARY:
+ interval_domain_short_summary_output();
+ break;
+ case INTERVAL_DOMAIN_GUEST_INTERRUPT:
+ interval_domain_guest_interrupt_output();
+ break;
+ case INTERVAL_DOMAIN_GRANT_MAPS:
+ interval_domain_grant_maps_output();
+ break;
+ default:
+ break;
+ }
+}
+
+void interval_header(void) {
+ switch(opt.interval.output) {
+ case INTERVAL_CR3_SHORT_SUMMARY:
+ interval_cr3_short_summary_header();
+ break;
+ case INTERVAL_DOMAIN_SHORT_SUMMARY:
+ interval_domain_short_summary_header();
+ break;
+ default:
+ break;
+ }
+}
+
+void interval_tail(void) {
+ if(opt.interval.mode == INTERVAL_MODE_LIST) {
+ interval_table_tail();
+ return;
+ }
+
+ switch(opt.interval.output) {
+ case INTERVAL_DOMAIN_GUEST_INTERRUPT:
+ interval_domain_guest_interrupt_tail();
+ break;
+ default:
+ break;
+ }
+}
+
+/* -- Eip list data -- */
+
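+/* update_eip() keeps each list sorted by increasing eip, creating
+ * entries on demand; eip_list_type[] supplies optional per-type
+ * new/update/dump hooks. */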
+void update_eip(struct eip_list_struct **head, unsigned long long eip,
+ unsigned long long cycles, int type, void * extra) {
+ struct eip_list_struct *p, **last=head;
+
+ for(p=*head; p; last = (&p->next), p=p->next)
+ if(p->eip >= eip)
+ break;
+
+ if(!p || p->eip != eip) {
+ p=malloc(sizeof(*p));
+ if(!p) {
+ perror("malloc failed");
+ error(ERR_SYSTEM, NULL);
+ }
+
+ bzero(p, sizeof(*p));
+
+ p->eip=eip;
+ p->type = type;
+
+ if(eip_list_type[type].new) {
+ eip_list_type[type].new(p, extra);
+ }
+ p->next = *last;
+ *last=p;
+ } else if(p->type != type) {
+ fprintf(stderr, "WARNING, mixed types! %d %d\n", p->type, type);
+ } else if(eip_list_type[type].update) {
+ eip_list_type[type].update(p, extra);
+ }
+
+ update_summary(&p->summary, cycles);
+}
+
+void dump_eip(struct eip_list_struct *head) {
+ struct eip_list_struct *p;
+ int i;
+ int total = 0;
+
+ struct eip_list_struct **qsort_array;
+ int N=0;
+
+ int eip_compare(const void *_a, const void *_b) {
+ struct eip_list_struct *a=*(typeof(&a))_a;
+ struct eip_list_struct *b=*(typeof(&a))_b;
+
+ if(a->summary.cycles < b->summary.cycles)
+ return 1;
+ else if(b->summary.cycles == a->summary.cycles) {
+ if(a->summary.count < b->summary.count)
+ return 1;
+ else if(a->summary.count == b->summary.count)
+ return 0;
+ else
+ return -1;
+ } else
+ return -1;
+ }
+
+ for(p=head; p; p=p->next)
+ {
+ total += p->summary.count;
+ N++;
+ }
+
+ if(!N)
+ return;
+
+ qsort_array = malloc(N * sizeof(struct eip_list_struct *));
+ if(!qsort_array) {
+ fprintf(stderr, "%s: malloc failed!\n", __func__);
+ error(ERR_SYSTEM, NULL);
+ }
+
+ for(i=0, p=head; p; p=p->next, i++)
+ qsort_array[i]=p;
+
+ qsort(qsort_array, N, sizeof(struct eip_list_struct *),
+ eip_compare);
+
+ /* WARNING: don't use N after this point unless you copy this variable */
+#if 0
+ if(opt.summary_eip_limit && opt.summary_eip_limit < N)
+ N=opt.summary_eip_limit;
+#endif
+
+ printf(" Total samples: %d\n", total);
+
+ for(i=0; i<N; i++) {
+ p = qsort_array[i];
+ if ( p->summary.cycles )
+ PRINT_SUMMARY(p->summary, " %12llx%-45s: ",
+ p->eip,
+ find_symbol(p->eip));
+ else
+ {
+ printf(" %12llx%-45s: ",
+ p->eip,
+ find_symbol(p->eip));
+ printf(" %7d %5.2lf%%\n",
+ p->summary.count,
+ ((double)p->summary.count*100)/total);
+ }
+
+
+ if(eip_list_type[p->type].dump) {
+ eip_list_type[p->type].dump(p);
+ }
+ }
+
+ free(qsort_array);
+}
+
+/* -- HVM code -- */
+struct hvm_pf_xen_record {
+ //unsigned vcpu:16, domain:16;
+ union {
+ struct {
+ unsigned long long va;
+ unsigned int error_code;
+ } x64;
+ struct {
+ unsigned int va;
+ unsigned int error_code;
+ } x32;
+ };
+};
+
+void hvm_update_short_summary(struct hvm_data *h, int element) {
+ struct vcpu_data *v = h->v;
+
+ if(v->cr3.data)
+ update_cycles(&v->cr3.data->hvm.s[element], h->arc_cycles);
+
+ update_cycles(&v->d->hvm_short.s[element], h->arc_cycles);
+
+ h->short_summary_done=1;
+}
+
+void hvm_short_summary(struct hvm_short_summary_struct *hss,
+ tsc_t total, char *prefix) {
+ char desc[80];
+ int i;
+
+ for(i=0; i<HVM_SHORT_SUMMARY_MAX; i++) {
+ snprintf(desc, 80, "%s%s", prefix, hvm_short_summary_name[i]);
+ print_cycle_percent_summary(hss->s + i, total, desc);
+ }
+}
+
+/* Wrapper to try to make sure this is only called once per
+ * call site, rather than walking through the list each time */
+#define hvm_set_summary_handler(_h, _s, _d) \
+ do { \
+ static int done=0; \
+ int ret; \
+ if(!done) { \
+ if ((ret=__hvm_set_summary_handler(_h, _s, _d))) \
+ fprintf(stderr, "%s: hvm_set_summary_handler returned %d\n", \
+ __func__, ret); \
+ done=1; \
+ } \
+ } while(0)
+
+int __hvm_set_summary_handler(struct hvm_data *h, void (*s)(struct hvm_data *h, void*d), void*d) {
+ /* Set summary handler */
+ if(h->exit_reason < h->exit_reason_max)
+ {
+ struct hvm_summary_handler_node *p, **q;
+
+ /* Find the end of the list, checking to make sure there are no
+ * duplicates along the way */
+ q=&h->exit_reason_summary_handler_list[h->exit_reason];
+ p = *q;
+ while(p)
+ {
+ if(p->handler == s && p->data == d)
+ {
+ fprintf(stderr, "%s: Unexpected duplicate handler %p,%p\n",
+ __func__, s, d);
+ error(ERR_STRICT, NULL);
+ return -EBUSY;
+ }
+ q=&p->next;
+ p=*q;
+ }
+
+ assert(p==NULL);
+
+ /* Insert the new handler */
+ p=malloc(sizeof(*p));
+ if (!p) {
+ fprintf(stderr, "%s: Malloc failed!\n", __func__);
+ error(ERR_SYSTEM, NULL);
+ }
+ p->handler=s;
+ p->data = d;
+ p->next=*q;
+ *q=p;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+void hvm_generic_postprocess(struct hvm_data *h);
+
+static int hvm_set_postprocess(struct hvm_data *h, void (*s)(struct hvm_data *h))
+{
+ if ( h->post_process == NULL
+ || h->post_process == hvm_generic_postprocess )
+ {
+ h->post_process = s;
+ return 0;
+ }
+ else
+ return 1;
+}
+
+#define SIGN_EXTENDED_BITS (~((1ULL<<48)-1))
+#define HIGH_BIT(_v) ((_v) & (1ULL<<47))
+static inline int is_valid_addr64(unsigned long long va)
+{
+ if(HIGH_BIT(va))
+ return ((va & SIGN_EXTENDED_BITS) == SIGN_EXTENDED_BITS);
+ else
+ return ((va & SIGN_EXTENDED_BITS) == 0);
+}
+
+void hvm_pf_xen_summary(struct hvm_data *h, void *d) {
+ int i,j, k;
+
+ printf(" page_fault\n");
+ for(i=0; i<PF_XEN_MAX; i++)
+ {
+ if( pf_xen_name[i] )
+ {
+ PRINT_SUMMARY(h->summary.pf_xen[i],
+ " %-25s ", pf_xen_name[i]);
+ }
+ else
+ {
+ PRINT_SUMMARY(h->summary.pf_xen[i],
+ " [%23d] ", i);
+ }
+ switch(i){
+ case PF_XEN_NON_EMULATE:
+ for(j=0; j<PF_XEN_NON_EMUL_MAX; j++)
+ PRINT_SUMMARY(h->summary.pf_xen_non_emul[j],
+ " *%-13s ", pf_xen_non_emul_name[j]);
+ break;
+ case PF_XEN_EMULATE:
+ for(j=0; j<PF_XEN_EMUL_MAX; j++) {
+ PRINT_SUMMARY(h->summary.pf_xen_emul[j],
+ " *%-13s ", pf_xen_emul_name[j]);
+ if(j == PF_XEN_EMUL_EARLY_UNSHADOW) {
+ int k;
+ for(k=0; k<5; k++) {
+ PRINT_SUMMARY(h->summary.pf_xen_emul_early_unshadow[k],
+ " +[%d] ", k);
+ }
+ }
+ }
+ break;
+ case PF_XEN_FIXUP:
+ for(j=0; j<PF_XEN_FIXUP_MAX; j++) {
+ PRINT_SUMMARY(h->summary.pf_xen_fixup[j],
+ " *%-13s ", pf_xen_fixup_name[j]);
+ if(j == PF_XEN_FIXUP_UNSYNC ) {
+ for(k=0; k<PF_XEN_FIXUP_UNSYNC_RESYNC_MAX; k++) {
+ PRINT_SUMMARY(h->summary.pf_xen_fixup_unsync_resync[k],
+ " +[%3d] ", k);
+ }
+ PRINT_SUMMARY(h->summary.pf_xen_fixup_unsync_resync[k],
+ " +[max] ");
+ }
+ }
+ break;
+ }
+ }
+}
+
+void pf_preprocess(struct pf_xen_extra *e, int guest_paging_levels)
+{
+ switch(guest_paging_levels) {
+ /* Select a subfield of _bits bits starting at bit _shift from _x */
+#define _SUBFIELD(_bits, _shift, _x) \
+ (((_x)>>(_shift)) & ((1ULL<<(_bits))-1))
+ case 4:
+ /* Verify sign-extension */
+ if((HIGH_BIT(e->va)
+ &&((e->va & SIGN_EXTENDED_BITS) != SIGN_EXTENDED_BITS))
+ || (!HIGH_BIT(e->va)
+ && ((e->va & SIGN_EXTENDED_BITS) != 0))) {
+ fprintf(warn, "Strange, va %llx not properly sign extended for 4-level pagetables\n",
+ e->va);
+ }
+ e->pt_index[4]=_SUBFIELD(9,39,e->va);
+ e->pt_index[3]=_SUBFIELD(9,30,e->va);
+ e->pt_index[2]=_SUBFIELD(9,21,e->va);
+ e->pt_index[1]=_SUBFIELD(9,12,e->va);
+ /* These are only useful for the linear-pagetable code */
+ e->pt_index[0]=_SUBFIELD(9,3,e->va);
+ if(e->va & 0x4)
+ e->pt_is_lo=0;
+ break;
+ case 3:
+ e->pt_index[3]=_SUBFIELD(2,30,e->va);
+ e->pt_index[2]=_SUBFIELD(9,21,e->va);
+ e->pt_index[1]=_SUBFIELD(9,12,e->va);
+ /* These are only useful for the linear-pagetable code */
+ e->pt_index[0]=_SUBFIELD(9,3,e->va);
+ if(e->va & 0x4)
+ e->pt_is_lo=0;
+ break;
+ case 2:
+ e->pt_index[2]=_SUBFIELD(10,22,e->va);
+ e->pt_index[1]=_SUBFIELD(10,12,e->va);
+ /* This is only useful for the linear pagetable code */
+ e->pt_index[0]=_SUBFIELD(10,2,e->va);
+ break;
+ case 0:
+ break;
+ default:
+ fprintf(warn, "Don't know how to handle %d-level pagetables\n",
+ guest_paging_levels);
+ }
+
+ e->corresponding_va = CORR_VA_INVALID;
+ e->pt_level = 0;
+
+ /* Detect accesses to Windows linear pagetables */
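+ /* The magic indices below are Windows' self-map slots: PDE 768
+ * maps the pagetables at va 0xc0000000 in 2-level mode; in PAE
+ * mode the same region is PDPT entry 3, PD entries 0-3; in long
+ * mode the self-map PML4 slot is 0x1ed. */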
+ switch(guest_paging_levels)
+ {
+ case 2:
+ if(e->pt_index[2] == 768) {
+ if(e->pt_index[1] == 768)
+ {
+ e->pt_level = 2;
+ e->corresponding_va=((1UL<<22)-1)
+ | e->pt_index[0]<<22;
+ }
+ else
+ {
+ e->pt_level = 1;
+ e->corresponding_va = ((1UL<<12)-1)
+ | e->pt_index[1]<<22
+ | e->pt_index[0]<<12;
+ }
+ }
+ break;
+ case 3:
+ if(e->pt_index[3]==3 && (e->pt_index[2]>>2==0))
+ {
+ if(e->pt_index[2]==3 && e->pt_index[1]>>2==0)
+ {
+ if(e->pt_index[1] == 3 && e->pt_index[0]>>2==0)
+ {
+ e->pt_level = 3;
+ e->corresponding_va=((1UL<<30)-1)
+ | e->pt_index[0]<<30;
+ }
+ else
+ {
+ e->pt_level = 2;
+ e->corresponding_va=((1UL<<21)-1)
+ | e->pt_index[1]<<30
+ | e->pt_index[2]<<21;
+ }
+ }
+ else
+ {
+ e->pt_level = 1;
+ e->corresponding_va = ((1UL<<12)-1)
+ | e->pt_index[0]<<12
+ | e->pt_index[1]<<21
+ | e->pt_index[2]<<30;
+ }
+ }
+ break;
+ case 4:
+ if(e->pt_index[4] == 0x1ed)
+ {
+ if(e->pt_index[3] == 0x1ed)
+ {
+ if(e->pt_index[2] == 0x1ed)
+ {
+ if(e->pt_index[1] == 0x1ed)
+ {
+ e->pt_level = 4;
+ e->corresponding_va = ((1ULL<<39)-1)
+ | (unsigned long long)e->pt_index[0]<<39;
+ }
+ else
+ {
+ e->pt_level = 3;
+ e->corresponding_va = ((1ULL<<30)-1)
+ | (unsigned long long)e->pt_index[0]<<30
+ | (unsigned long long)e->pt_index[1]<<39;
+ }
+ }
+ else
+ {
+ e->pt_level = 2;
+ e->corresponding_va = ((1ULL<<21)-1)
+ | (unsigned long long)e->pt_index[0]<<21
+ | (unsigned long long)e->pt_index[1]<<30
+ | (unsigned long long)e->pt_index[2]<<39;
+ }
+ }
+ else
+ {
+ e->pt_level = 1;
+ e->corresponding_va = ((1ULL<<12)-1)
+ | (unsigned long long)e->pt_index[0]<<12
+ | (unsigned long long)e->pt_index[1]<<21
+ | (unsigned long long)e->pt_index[2]<<30
+ | (unsigned long long)e->pt_index[3]<<39;
+ }
+
+ if(HIGH_BIT(e->corresponding_va))
+ e->corresponding_va |= SIGN_EXTENDED_BITS;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+void hvm_pf_xen_preprocess(unsigned event, struct hvm_data *h) {
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+ struct mmio_info *m = &h->inflight.mmio;
+ struct hvm_pf_xen_record *r = (typeof(r))h->d;
+
+ if(event == TRC_HVM_PF_XEN64)
+ {
+ if(!is_valid_addr64(r->x64.va))
+ fprintf(warn, "%s: invalid va %llx",
+ __func__, r->x64.va);
+ e->va = r->x64.va;
+ e->error_code = r->x64.error_code;
+ }
+ else
+ {
+ e->va = r->x32.va;
+ e->error_code = r->x32.error_code;
+ }
+
+ if(m->data_valid)
+ e->pf_case = PF_XEN_MMIO;
+ else
+ {
+ pf_preprocess(e, h->v->guest_paging_levels);
+
+ /* On rio traces, we try to infer emulation by looking for accesses
+ in the linear pagetable */
+ if(e->pt_level > 0)
+ e->pf_case = PF_XEN_EMULATE;
+ else
+ e->pf_case = PF_XEN_NON_EMULATE;
+ }
+}
+
+static inline int is_kernel(int paging_levels, unsigned long long va) {
+ switch(paging_levels) {
+ case 2:
+ case 3:
+ if(va & 0x80000000)
+ return 1;
+ else
+ return 0;
+ break;
+ case 4:
+ if(HIGH_BIT(va))
+ return 1;
+ else return 0;
+ default:
+ return 0;
+ }
+
+}
+
+void hvm_pf_xen_postprocess(struct hvm_data *h) {
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+
+ if(opt.summary_info) {
+ if(e->pf_case)
+ update_summary(&h->summary.pf_xen[e->pf_case],
+ h->arc_cycles);
+ else
+ fprintf(warn, "Strange, pf_case 0!\n");
+ switch(e->pf_case)
+ {
+ case PF_XEN_EMULATE:
+ update_eip(&h->v->d->emulate_eip_list,
+ h->rip,
+ h->arc_cycles,
+ 0, NULL);
+ break;
+ case PF_XEN_NON_EMULATE:
+ if(is_kernel(h->v->guest_paging_levels, h->rip))
+ update_summary(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_EIP_KERNEL],
+ h->arc_cycles);
+ else
+ update_summary(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_EIP_USER],
+ h->arc_cycles);
+ if(is_kernel(h->v->guest_paging_levels, e->va))
+ update_summary(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_VA_KERNEL],
+ h->arc_cycles);
+
+ else
+ update_summary(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_VA_USER],
+ h->arc_cycles);
+ }
+
+ /* Set summary handler */
+ hvm_set_summary_handler(h, hvm_pf_xen_summary, NULL);
+ }
+}
+
+void hvm_pf_xen_process(struct record_info *ri, struct hvm_data *h) {
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+
+ if(ri->event == TRC_HVM_PF_XEN64
+ && h->v->guest_paging_levels != 4)
+ fprintf(warn, "Strange, PF_XEN64 but guest_paging_levels %d!\n",
+ h->v->guest_paging_levels);
+ else if(ri->event == TRC_HVM_PF_XEN
+ && h->v->guest_paging_levels == 4)
+ fprintf(warn, "Strange, PF_XEN but guest_paging_levels %d!\n",
+ h->v->guest_paging_levels);
+
+ hvm_pf_xen_preprocess(ri->event, h);
+
+ if(opt.dump_all)
+ {
+ if(e->pf_case == PF_XEN_EMULATE)
+ printf("]%s pf_xen:emulate va %llx ec %x level %d corr %llx e->pt_index[%d %d %d %d %d]\n",
+ ri->dump_header, e->va, e->error_code,
+ e->pt_level, e->corresponding_va,
+ e->pt_index[0], e->pt_index[1], e->pt_index[2],
+ e->pt_index[3],
+ e->pt_index[4]);
+ else
+ printf("]%s pf_xen va %llx ec %x e->pt_index[%d %d %d %d %d]\n",
+ ri->dump_header, e->va, e->error_code,
+ e->pt_index[0], e->pt_index[1], e->pt_index[2],
+ e->pt_index[3],
+ e->pt_index[4]);
+ }
+
+ if ( hvm_set_postprocess(h, hvm_pf_xen_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+
+char * hvm_vlapic_icr_dest_shorthand_name[4] = {
+ "dest_field", "self", "all-inc", "all-exc"
+};
+
+void hvm_vlapic_vmentry_cleanup(struct vcpu_data *v, tsc_t tsc)
+{
+ int i;
+
+ struct vlapic_struct *vla = &v->vlapic;
+
+ for(i=0; i<MAX_VLAPIC_LIST; i++)
+ {
+ unsigned long long lat=0;
+ struct outstanding_ipi *o = vla->outstanding.list + i;
+
+ if(!(o->valid && o->injected))
+ continue;
+
+ if(tsc >= o->first_tsc)
+ lat = tsc - o->first_tsc;
+ else
+ fprintf(warn, "Strange, vec %d first_tsc %lld > ri->tsc %lld!\n",
+ o->vec, o->first_tsc, tsc);
+
+ if(opt.dump_ipi_latency
+ || (opt.dump_all && o->count > 1)) {
+ struct time_struct t;
+ cycles_to_time(lat, &t);
+ printf(" [vla] d%dv%d vec %d ipis %d, latency %lld (%u.%09u s)\n",
+ v->d->did, v->vid, o->vec, o->count, lat,
+ t.s, t.ns);
+ }
+
+#if 0
+ /* FIXME: make general somehow */
+ if(opt.summary_info)
+ {
+ update_summary(&h->summary.ipi_latency, lat);
+ h->summary.ipi_count[vla->outstanding_ipis]++;
+ }
+#endif
+
+ o->vec = o->count = o->injected = o->valid = o->first_tsc = 0;
+ }
+}
+
+void hvm_vlapic_clear(struct vlapic_struct *vla)
+{
+ bzero(vla, sizeof(*vla));
+}
+
+struct outstanding_ipi *find_vec(struct vlapic_struct *vla, int vec)
+{
+ struct outstanding_ipi *o = NULL;
+ int i;
+
+ /* Find the entry for this vector, or the first empty one. */
+ for(i=0; i<MAX_VLAPIC_LIST; i++)
+ {
+ if(vla->outstanding.list[i].valid && vla->outstanding.list[i].vec == vec)
+ {
+ o = vla->outstanding.list + i;
+ break;
+ } else if(!vla->outstanding.list[i].valid && !o)
+ o = vla->outstanding.list + i;
+ }
+
+ if(!o->valid) {
+ o->vec = vec;
+ o->valid = 1;
+ }
+
+ return o;
+}
+
+void hvm_vlapic_icr_handler(struct hvm_data *h)
+{
+ struct mmio_info *m = &h->inflight.mmio;
+ union {
+ unsigned int val;
+ struct {
+ unsigned vec:8,
+ delivery_mode:3,
+ dest_mode:1,
+ delivery_status:1,
+ _res1:1,
+ level:1,
+ trigger:1,
+ _res2:2,
+ dest_shorthand:2;
+ };
+ } icr = { .val = m->data };
+
+ void ipi_send(struct vcpu_data *ov, int vec)
+ {
+ struct vlapic_struct *vla;
+ struct outstanding_ipi *o = NULL;
+
+ if(ov->runstate.state == RUNSTATE_LOST) {
+ if(opt.dump_all)
+ fprintf(warn, "%s: v%d in state RUNSTATE_LOST, not counting ipi\n",
+ __func__, ov->vid);
+ return;
+ }
+
+ vla = &ov->vlapic;
+
+ o = find_vec(vla, vec);
+
+ if(!o)
+ {
+ fprintf(warn, "%s: Couldn't find an open slot!\n",
+ __func__);
+ return;
+ }
+
+ if(!o->first_tsc)
+ o->first_tsc = P.now;
+
+ if(opt.dump_all && o->count == 0 && o->injected)
+ printf(" [vla] Pre-injection\n");
+
+ o->count++;
+
+ if((opt.dump_all)
+#if 0
+ && (ov->runstate.state != RUNSTATE_RUNNING
+ || ov->hvm.vmexit_valid)
+#endif
+ )
+ printf(" [vla] d%dv%d vec %d state %s (outstanding ipis %d)\n",
+ ov->d->did, ov->vid,
+ o->vec,
+ runstate_name[ov->runstate.state],
+ o->count);
+ }
+
+ if(m->is_write) {
+ if(opt.dump_all) {
+ printf(" [vla] d%dv%d icr vec %d %s\n",
+ h->v->d->did, h->v->vid,
+ icr.vec,
+ hvm_vlapic_icr_dest_shorthand_name[icr.dest_shorthand]);
+ }
+
+ if(icr.dest_shorthand == 3)
+ {
+ struct vcpu_data *ov, *v = h->v;
+ struct domain_data *d = v->d;
+ int i;
+
+ for(i=0; i<MAX_CPUS; i++)
+ {
+ ov = d->vcpu[i];
+ if(!ov || ov == v)
+ continue;
+
+ ipi_send(ov, icr.vec);
+
+ }
+ } else if(icr.dest_shorthand != 1) {
+#if 0
+ fprintf(warn, "Strange, vlapic icr %s vec %d!\n",
+ hvm_vlapic_icr_dest_shorthand_name[icr.dest_shorthand],
+ icr.vec);
+#endif
+ }
+ } else {
+ /* Read */
+ if(opt.dump_all) {
+ printf(" [vla] d%dv%d icr status %s\n",
+ h->v->d->did, h->v->vid,
+ icr.delivery_status?"pending":"idle");
+ }
+ }
+
+}
+
+void hvm_vlapic_inject(struct vcpu_data *v, int vec)
+{
+ struct vlapic_struct *vla = &v->vlapic;
+ struct outstanding_ipi *o = NULL;
+
+ o = find_vec(vla, vec);
+
+ if(o) {
+ if(opt.dump_all)
+ printf(" [vla] d%dv%d vec %d injecting\n",
+ v->d->did, v->vid, vec);
+ o->injected=1;
+ } else {
+ fprintf(stderr, "%s: Couldn't find an open ipi slot!\n",
+ __func__);
+ }
+}
+
+void hvm_vlapic_eoi_handler(struct hvm_data *h) {
+ if(opt.dump_all)
+ printf(" [vla] d%dv%d eoi\n",
+ h->v->d->did, h->v->vid);
+}
+
+void hvm_vlapic_handler(struct hvm_data *h)
+{
+ struct mmio_info *m = &h->inflight.mmio;
+
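+ /* Offsets within the local APIC page at 0xfee00000: +0x300 is the
+ * low word of the ICR, +0xb0 the EOI register. */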
+ switch(m->gpa) {
+ case 0xfee00300:
+ hvm_vlapic_icr_handler(h);
+ break;
+ case 0xfee000b0:
+ hvm_vlapic_eoi_handler(h);
+ break;
+ }
+
+}
+
+/* Also called by shadow_mmio_postprocess */
+#define MMIO_VGA_START (0xa0000)
+#define MMIO_VGA_END (0xbffff)
+void enumerate_mmio(struct hvm_data *h)
+{
+ struct mmio_info *m = &h->inflight.mmio;
+
+ /* Skip vga area */
+ if ( opt.mmio_enumeration_skip_vga
+ && m->gpa >= MMIO_VGA_START
+ && m->gpa < MMIO_VGA_END)
+ {
+ warn_once("WARNING: Not enumerationg MMIO in VGA range. Use --mmio-enumeration-skip-vga=0 to override.\n");
+ return;
+ }
+
+ if ( m->data_valid )
+ update_io_address(&h->summary.io.mmio, m->gpa, m->is_write, h->arc_cycles, m->va);
+}
+
+void hvm_mmio_summary(struct hvm_data *h, void *data)
+{
+ long reason=(long)data;
+
+ PRINT_SUMMARY(h->summary.mmio[reason],
+ " mmio ");
+}
+
+void hvm_mmio_assist_postprocess(struct hvm_data *h)
+{
+ long reason;
+
+ switch(h->exit_reason)
+ {
+ case VMEXIT_NPF:
+ case EXIT_REASON_EPT_VIOLATION:
+ reason=NONPF_MMIO_NPF;
+ hvm_set_summary_handler(h, hvm_mmio_summary, (void *)reason);
+ break;
+ case EXIT_REASON_APIC_ACCESS:
+ reason=NONPF_MMIO_APIC;
+ hvm_set_summary_handler(h, hvm_mmio_summary, (void *)reason);
+ break;
+ default:
+ {
+ static int warned = 0;
+ if (!warned)
+ {
+ fprintf(warn, "%s: Strange, MMIO with unexpected exit reason %d\n",
+ __func__, h->exit_reason);
+ warned=1;
+ }
+ reason=NONPF_MMIO_UNKNOWN;
+ hvm_set_summary_handler(h, hvm_mmio_summary, (void *)reason);
+ break;
+ }
+ }
+
+ if(opt.summary_info)
+ {
+ update_summary(&h->summary.mmio[reason],
+ h->arc_cycles);
+ }
+
+ if ( opt.with_mmio_enumeration )
+ enumerate_mmio(h);
+}
+
+#define HVM_IO_ASSIST_WRITE 0x200
+void hvm_mmio_assist_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct mmio_info *m = &h->inflight.mmio;
+ union {
+ struct {
+ unsigned int gpa;
+ unsigned int data;
+ } x32;
+ struct {
+ unsigned long long gpa;
+ unsigned int data;
+ } x64;
+ } *r = (typeof(r))h->d;
+
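+ /* The event number encodes the record layout: low 8 bits are the
+ * minor number, bit 8 flags the 64-bit variant, and bits 9-10 the
+ * write flag (assuming the usual LSB-first bitfield layout). */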
+ union {
+ unsigned event;
+ struct {
+ unsigned minor:8,
+ x64:1,
+ write:2;
+ };
+ } mevt = { .event = ri->event };
+
+ if(mevt.x64) {
+ m->gpa = r->x64.gpa;
+ m->data = r->x64.data;
+ if(ri->extra_words*(sizeof(unsigned int))==sizeof(r->x64))
+ m->data_valid=1;
+ } else {
+ m->gpa = r->x32.gpa;
+ m->data = r->x32.data;
+ if(ri->extra_words*(sizeof(unsigned int))==sizeof(r->x32))
+ m->data_valid=1;
+ }
+
+ m->is_write = mevt.write;
+
+ if(opt.dump_all)
+ {
+ if(m->data_valid)
+ printf("]%s mmio_assist %c gpa %llx data %x\n",
+ ri->dump_header,
+ mevt.write?'w':'r',
+ m->gpa, m->data);
+ else
+ printf("]%s mmio_assist %c gpa %llx (no data)\n", ri->dump_header,
+ mevt.write?'w':'r', m->gpa);
+ }
+
+ if((m->gpa & 0xfffff000) == 0xfee00000)
+ hvm_vlapic_handler(h);
+
+ /* Catch MMIOs that don't go through the shadow code; tolerate
+ * failures to set (probably shadow_mmio) */
+ hvm_set_postprocess(h, hvm_mmio_assist_postprocess);
+}
+
+void hvm_inj_virq_process(struct record_info *ri, struct hvm_data *h) {
+ struct {
+ int vector, fake;
+ } *r = (typeof(r))h->d;
+
+ if(opt.dump_all) {
+ printf(" %s inj_virq vec %u %s\n",
+ ri->dump_header,
+ r->vector, r->fake?"fake":"real");
+ }
+
+ if(opt.summary_info)
+ {
+ int vector = r->vector;
+
+ if(vector >= GUEST_INTERRUPT_MAX)
+ vector = GUEST_INTERRUPT_MAX;
+ h->summary.guest_interrupt[vector].count++;
+
+ if(opt.interval.output == INTERVAL_DOMAIN_GUEST_INTERRUPT)
+ interval_domain_guest_interrupt(h, vector);
+ }
+
+ /* If we're waking, make this the wake vector */
+ if(r->vector < GUEST_INTERRUPT_MAX ) {
+ int vector = r->vector;
+ if ( h->w2h.waking && h->w2h.vector == 0 ) {
+ if(h->summary.guest_interrupt[vector].start_tsc) {
+ fprintf(warn, "Strange, d%dv%d waking && wake_vector 0 but vec %d start_tsc %lld!\n",
+ h->v->d->did, h->v->vid,
+ vector,
+ h->summary.guest_interrupt[vector].start_tsc);
+ error(ERR_WARN, NULL);
+ }
+ if(h->w2h.interrupts)
+ fprintf(warn, "Strange, waking && wake_vector 0 but interrupts_this_wait_to_halt %d!\n",
+ h->w2h.interrupts);
+
+ if(opt.dump_all)
+ printf(" [w2h] d%dv%d Setting wake_vector %d\n",
+ h->v->d->did, h->v->vid, vector);
+
+ /* In svm mode, vector information is invalid */
+ if ( opt.svm_mode && r->fake )
+ h->w2h.vector = FAKE_VECTOR;
+ else
+ h->w2h.vector = vector;
+ h->summary.guest_interrupt[vector].is_wake = 1;
+ }
+
+ if( h->summary.guest_interrupt[vector].start_tsc == 0 ) {
+ /* Note that we want start_tsc set at the next vmentry */
+ h->summary.guest_interrupt[vector].start_tsc = 1;
+ h->w2h.interrupts_wanting_tsc++;
+ h->w2h.interrupts++;
+
+ if(opt.dump_all)
+ printf(" [w2h] d%dv%d Starting vec %d\n",
+ h->v->d->did, h->v->vid, vector);
+ }
+ }
+
+ hvm_vlapic_inject(h->v, r->vector);
+}
+
+/* I/O Handling */
+struct io_address {
+ struct io_address *next;
+ unsigned int pa;
+ unsigned int va;
+ struct event_cycle_summary summary[2];
+};
+
+void update_io_address(struct io_address ** list, unsigned int pa, int dir,
+ tsc_t arc_cycles, unsigned int va) {
+ struct io_address *p, *q=NULL;
+
+ /* Keep list in order */
+ for(p=*list; p && (p->pa != pa) && (p->pa < pa); q=p, p=p->next);
+
+ /* If we didn't find it, make a new element. */
+ if(!p || (p->pa != pa)) {
+ if((p=malloc(sizeof(*p)))==NULL) {
+ fprintf(stderr, "malloc() failed.\n");
+ error(ERR_SYSTEM, NULL);
+ }
+
+ bzero(p, sizeof(*p));
+
+ p->pa=pa;
+ p->va=va;
+
+ /* If we stopped in the middle or at the end, add it in */
+ if(q) {
+ p->next=q->next;
+ q->next=p;
+ } else {
+ /* Otherwise, we stopped before the first element; insert it at the head */
+ p->next = *list;
+ *list = p;
+ }
+ }
+ update_summary(&p->summary[dir], arc_cycles);
+}
+
+void hvm_io_address_summary(struct io_address *list, char * s) {
+ if(!list)
+ return;
+
+ printf("%s\n", s);
+
+ for(; list; list=list->next) {
+ if ( list->va )
+ {
+ PRINT_SUMMARY(list->summary[0], "%8x@%8x:[r] ", list->pa, list->va);
+ PRINT_SUMMARY(list->summary[1], "%8x@%8x:[w] ", list->pa, list->va);
+ }
+ else
+ {
+ PRINT_SUMMARY(list->summary[0], "%8x:[r] ", list->pa);
+ PRINT_SUMMARY(list->summary[1], "%8x:[w] ", list->pa);
+ }
+ }
+}
+
+void hvm_io_write_postprocess(struct hvm_data *h)
+{
+ if(opt.with_pio_enumeration)
+ update_io_address(&h->summary.io.pio, h->inflight.io.port, 1, h->arc_cycles, 0);
+}
+
+void hvm_io_read_postprocess(struct hvm_data *h)
+{
+ if(opt.with_pio_enumeration)
+ update_io_address(&h->summary.io.pio, h->inflight.io.port, 0, h->arc_cycles, 0);
+ if(opt.scatterplot_io && h->inflight.io.port == opt.scatterplot_io_port)
+ scatterplot_vs_time(h->exit_tsc, P.now - h->exit_tsc);
+}
+
+void hvm_io_assist_process(struct record_info *ri, struct hvm_data *h)
+{
+ union {
+ struct {
+ unsigned int port;
+ unsigned int data;
+ } x32;
+ } *r = (typeof(r))h->d;
+
+ union {
+ unsigned event;
+ struct {
+ unsigned minor:8,
+ x64:1,
+ write:2;
+ };
+ } mevt = { .event = ri->event };
+
+ if(mevt.x64) {
+ fprintf(stderr, "FATAL: Unexpected 64-bit PIO\n");
+ error(ERR_RECORD, ri);
+ return;
+ }
+
+ h->inflight.io.port = r->x32.port;
+ h->inflight.io.val = r->x32.data;
+
+ if(mevt.write) {
+ h->inflight.io.is_write = 1;
+ if ( hvm_set_postprocess(h, hvm_io_write_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+ } else {
+ h->inflight.io.is_write = 0;
+ if ( hvm_set_postprocess(h, hvm_io_read_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+ }
+
+ if(opt.dump_all)
+ {
+ printf(" %s io %s port %x val %x\n",
+ ri->dump_header,
+ mevt.write?"write":"read",
+ r->x32.port,
+ r->x32.data);
+ }
+}
+
+/* cr_write */
+/* CR3 list */
+void cr3_switch(unsigned long long val, struct hvm_data *h) {
+ struct vcpu_data *v = h->v;
+ /* Really only need absolute tsc here. Later change to global time. */
+ unsigned long long now = P.now;
+ unsigned long long gmfn = val >> 12;
+
+ if ( !h->init )
+ return;
+
+ if(opt.with_cr3_enumeration) {
+ if(v->cr3.data) {
+ struct cr3_value_struct *cur = v->cr3.data;
+ unsigned long long cycles = now - v->cr3.start_time;
+
+ if(opt.summary_info)
+ update_cycles(&cur->total_time, cycles);
+
+ cur->last_time = now;
+ }
+
+ if(gmfn) {
+ struct cr3_value_struct *p, **last=&v->d->cr3_value_head;
+
+ /* Always add to tail, so that we get consistent interval
+ output as the number of cr3s grows */
+ for(p=*last; p; last = (&p->next), p=p->next)
+ if(p->gmfn == gmfn)
+ break;
+
+ if(!p) {
+ if((p=malloc(sizeof(*p)))==NULL) {
+ fprintf(stderr, "malloc() failed.\n");
+ error(ERR_SYSTEM, NULL);
+ }
+
+ bzero(p, sizeof(*p));
+
+ p->gmfn = gmfn;
+ p->cr3_id = P.cr3.id;
+ p->first_time = now;
+
+ p->next=*last;
+ *last=p;
+
+ p->gnext = NULL;
+ if(P.cr3.head)
+ *P.cr3.tail = p;
+ else
+ P.cr3.head = p;
+ P.cr3.tail = &p->gnext;
+
+ P.cr3.id++;
+
+ /* Add to the interval list if appropriate */
+ if(opt.interval.check == INTERVAL_CHECK_CR3
+ && v->d->did != DEFAULT_DOMAIN)
+ interval_cr3_value_check(p);
+ }
+
+ if(p->prealloc_unpin.now) {
+ fprintf(warn, "Re-promoting previously unpinned cr3 %llx!\n",
+ p->gmfn);
+ p->prealloc_unpin.now = 0;
+ h->inflight.cr_write.repromote = 1;
+ }
+
+ /* Accounting for new toplevel */
+ v->cr3.start_time = now;
+ p->switch_count++;
+ if(p->destroy.callback)
+ p->destroy.switch_count++;
+ v->cr3.data = p;
+ } else {
+ v->cr3.data = NULL;
+ }
+
+ if (opt.scatterplot_cr3_switch) {
+ scatterplot_vs_time(h->exit_tsc,
+ v->cr3.data ? (v->cr3.data->cr3_id) : 0);
+ }
+ } else {
+ if (opt.scatterplot_cr3_switch)
+ scatterplot_vs_time(h->exit_tsc, gmfn);
+ }
+
+ v->cr3.val = val;
+};
+
+void cr3_prealloc_unpin(struct vcpu_data *v, unsigned long long gmfn) {
+ struct cr3_value_struct *cr3;
+
+ /* Look for it in the list */
+ for(cr3 = v->d->cr3_value_head; cr3; cr3=cr3->next)
+ if(cr3->gmfn == gmfn)
+ break;
+
+ if(!cr3)
+ return;
+
+ if(cr3->prealloc_unpin.now)
+ fprintf(warn, "Strange, gmfn %llx multiple unpins w/o access!\n",
+ gmfn);
+
+ cr3->prealloc_unpin.now = 1;
+ cr3->prealloc_unpin.count++;
+
+ if(opt.dump_all)
+ printf(" cr3 %llx unpinned %d times\n",
+ gmfn, cr3->prealloc_unpin.count);
+}
+
+void cr3_dump_list(struct cr3_value_struct *head){
+ struct cr3_value_struct *p;
+ struct cr3_value_struct **qsort_array;
+ int i, N=0;
+
+ int cr3_compare_total(const void *_a, const void *_b) {
+ struct cr3_value_struct *a=*(typeof(&a))_a;
+ struct cr3_value_struct *b=*(typeof(&a))_b;
+
+ if(a->total_time.cycles < b->total_time.cycles)
+ return 1;
+ else if(b->total_time.cycles == a->total_time.cycles) {
+ if(a->total_time.count < b->total_time.count)
+ return 1;
+ else if(a->total_time.count == b->total_time.count)
+ return 0;
+ else
+ return -1;
+ } else
+ return -1;
+ }
+
+ int cr3_compare_start(const void *_a, const void *_b) {
+ struct cr3_value_struct *a=*(typeof(&a))_a;
+ struct cr3_value_struct *b=*(typeof(&a))_b;
+
+ if(a->first_time > b->first_time)
+ return 1;
+ else if(b->first_time == a->first_time)
+ return 0;
+ else
+ return -1;
+ }
+
+ if(!head)
+ return;
+
+ /* Count the number of elements */
+ for(p=head; p; p=p->next)
+ N++;
+
+ if(!N)
+ return;
+
+ /* Alloc an array of the right size */
+ qsort_array = malloc(N * sizeof(struct cr3_value_struct *));
+ if(!qsort_array) {
+ fprintf(stderr, "malloc() failed.\n");
+ error(ERR_SYSTEM, NULL);
+ }
+
+ /* Point the array into it */
+ for(i=0, p=head; p; p=p->next, i++)
+ qsort_array[i]=p;
+
+ /* Sort the array by first-seen time */
+ qsort(qsort_array, N, sizeof(struct cr3_value_struct *),
+ cr3_compare_start);
+
+ /* WARNING: don't use N after this point unless you copy this variable */
+#if 0
+ if(opt.summary_eip_limit && opt.summary_eip_limit < N)
+ N=opt.summary_eip_limit;
+#endif
+
+ /* Now print the results */
+ printf(" cr3 values:\n");
+ for(i=0; i<N; i++) {
+ char desc[30];
+ struct time_struct first, last;
+
+ p = qsort_array[i];
+
+ abs_cycles_to_time(p->first_time, &first);
+ abs_cycles_to_time(p->last_time, &last);
+
+
+ snprintf(desc, 30, " %8llx (id %d)", p->gmfn, p->cr3_id);
+ print_cycle_summary(&p->total_time, desc);
+ snprintf(desc, 30, " guest");
+ print_cycle_percent_summary(&p->guest_time, p->run_time, desc);
+ snprintf(desc, 30, " hv ");
+ print_cycle_percent_summary(&p->hv_time, p->run_time, desc);
+
+ hvm_short_summary(&p->hvm, p->run_time, " + ");
+ printf(" Seen: %4u.%09u-%4u.%09u switch %d flush %d\n",
+ first.s, first.ns,
+ last.s, last.ns,
+ p->switch_count, p->flush_count);
+ if(p->destroy.callback)
+ printf(" destroy: flush %u switch %u fixup %u emulate %u\n",
+ p->destroy.flush_count,
+ p->destroy.switch_count,
+ p->destroy.fixup_user,
+ p->destroy.emulate_corr_user);
+ }
+
+ free(qsort_array);
+}
+
+void hvm_cr3_write_summary(struct hvm_data *h) {
+ int j;
+
+ for(j=0; j<RESYNCS_MAX; j++)
+ PRINT_SUMMARY(h->summary.cr3_write_resyncs[j],
+ " *[%3d] ", j);
+ PRINT_SUMMARY(h->summary.cr3_write_resyncs[j],
+ " *[MAX] ");
+}
+
+void hvm_cr_write_summary(struct hvm_data *h, void *data)
+{
+ long cr=(long)data;
+
+ PRINT_SUMMARY(h->summary.cr_write[cr],
+ " cr%ld ", cr);
+ if ( cr==3 )
+ hvm_cr3_write_summary(h);
+}
+
+void hvm_cr_write_postprocess(struct hvm_data *h)
+{
+ if(h->inflight.cr_write.cr == 3) {
+ struct vcpu_data *v = h->v;
+ unsigned long long new_val = h->inflight.cr_write.val;
+ unsigned long long oval;
+ int flush=0;
+
+ if(v->cr3.val) {
+ oval = v->cr3.val;
+
+ if(new_val == oval) {
+ if(v->cr3.data) {
+ v->cr3.data->flush_count++;
+ if(v->cr3.data->destroy.callback)
+ v->cr3.data->destroy.flush_count++;
+ }
+ flush=1;
+ }
+ }
+
+ if(opt.summary_info) {
+ int resyncs = h->resyncs;
+
+ if(resyncs > RESYNCS_MAX)
+ resyncs = RESYNCS_MAX;
+
+ update_summary(&h->summary.cr3_write_resyncs[resyncs],
+ h->arc_cycles);
+
+ update_summary(&h->summary.cr_write[3],
+ h->arc_cycles);
+
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_CR3);
+ }
+
+ if(!flush)
+ cr3_switch(new_val, h);
+ } else {
+ if(opt.summary_info)
+ {
+ if(h->inflight.cr_write.cr < CR_MAX)
+ update_summary(&h->summary.cr_write[h->inflight.cr_write.cr],
+ h->arc_cycles);
+
+ }
+ }
+
+ /* Set summary handler */
+ /* FIXME - deal with cr_read_summary */
+ if(h->exit_reason < h->exit_reason_max)
+ {
+ /* Want a different "set" for each cr */
+ switch(h->inflight.cr_write.cr)
+ {
+#define case_cr(_x) \
+ case (_x): \
+ hvm_set_summary_handler(h, hvm_cr_write_summary, (void *)(_x)); \
+ break
+ case_cr(0);
+ case_cr(1);
+ case_cr(2);
+ case_cr(3);
+ case_cr(4);
+ case_cr(5);
+ case_cr(6);
+ case_cr(7);
+ case_cr(8);
+ case_cr(9);
+ case_cr(10);
+ case_cr(11);
+ case_cr(12);
+ case_cr(13);
+ case_cr(14);
+ case_cr(15);
+#undef case_cr
+ default:
+ fprintf(stderr, "Unexpected cr: %d\n", h->inflight.cr_write.cr);
+ error(ERR_SANITY, NULL);
+ break;
+ }
+ }
+}
+
+void hvm_cr_write_process(struct record_info *ri, struct hvm_data *h)
+{
+ union {
+ struct {
+ unsigned cr;
+ unsigned int val;
+ } x32;
+ struct {
+ unsigned cr;
+ unsigned long long val;
+ } __attribute__((packed)) x64;
+ } *r = (typeof(r))h->d;
+ unsigned cr;
+ unsigned long long val;
+
+ if(ri->event & TRC_64_FLAG) {
+ h->inflight.cr_write.cr = cr = r->x64.cr;
+ h->inflight.cr_write.val = val = r->x64.val;
+ } else {
+ h->inflight.cr_write.cr = cr = r->x32.cr;
+ h->inflight.cr_write.val = val = r->x32.val;
+ }
+
+ /* In vmx, cr accesses in real mode may cause EXCEPTION_NMI vmexits.
+ * Account them under that heading; otherwise, complain */
+ if ( hvm_set_postprocess(h, hvm_cr_write_postprocess) )
+ fprintf(warn, "%s: Strange, h->postprocess already set!\n",
+ __func__);
+
+ if(opt.dump_all)
+ {
+ if(cr == 3 && h->v->cr3.val) {
+ printf("]%s cr_write cr3 val %llx oval %llx %s\n",
+ ri->dump_header,
+ val,
+ h->v->cr3.val,
+ (h->v->cr3.val == val)?"flush":"switch");
+ } else {
+ printf(" %s cr_write cr%d val %llx\n",
+ ri->dump_header,
+ cr, val);
+
+ }
+ }
+
+}
+
+/* msr_write */
+void hvm_msr_write_summary(struct hvm_data *h, void *d)
+{
+}
+
+void hvm_msr_write_postprocess(struct hvm_data *h)
+{
+ if(opt.summary_info) {
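+ /* Placeholder: no per-MSR write statistics are collected yet. */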
+ }
+
+ /* Set summary handler */
+ hvm_set_summary_handler(h, hvm_msr_write_summary, NULL);
+}
+
+void hvm_msr_write_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct {
+ unsigned int addr;
+ unsigned long long val;
+ } __attribute__((packed)) *r = (typeof(r))h->d;
+
+ if(check_extra_words(ri, sizeof(*r), "msr_write"))
+ return;
+
+ h->inflight.msr.addr = r->addr;
+ h->inflight.msr.val = r->val;
+
+ if(opt.dump_all)
+ {
+ printf(" %s msr_write addr %x val %llx\n",
+ ri->dump_header,
+ r->addr, r->val);
+ }
+
+ if ( hvm_set_postprocess(h, hvm_msr_write_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+
+/* msr_read */
+void hvm_msr_read_summary(struct hvm_data *h, void *d)
+{
+}
+
+void hvm_msr_read_postprocess(struct hvm_data *h)
+{
+ if(opt.summary_info) {
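+ /* Placeholder: no per-MSR read statistics are collected yet. */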
+ }
+
+ /* Set summary handler */
+ hvm_set_summary_handler(h, hvm_msr_read_summary, NULL);
+}
+
+void hvm_msr_read_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct {
+ unsigned int addr;
+ unsigned long long val;
+ } __attribute__((packed)) *r = (typeof(r))h->d;
+
+ if(check_extra_words(ri, sizeof(*r), "msr_read"))
+ return;
+
+ h->inflight.msr.addr = r->addr;
+ h->inflight.msr.val = r->val;
+
+ if(opt.dump_all)
+ {
+ printf(" %s msr_read addr %x val %llx\n",
+ ri->dump_header,
+ r->addr, r->val);
+ }
+
+ if ( hvm_set_postprocess(h, hvm_msr_read_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+
+void hvm_vmcall_summary(struct hvm_data *h, void *d)
+{
+ int i;
+
+ for ( i=0; i<HYPERCALL_MAX ; i++)
+ {
+ PRINT_SUMMARY(h->summary.vmcall[i],
+ " [%10s] ", hypercall_name[i]);
+ }
+ PRINT_SUMMARY(h->summary.vmcall[HYPERCALL_MAX],
+ " [%10s] ", "max");
+}
+
+void hvm_vmcall_postprocess(struct hvm_data *h)
+{
+ unsigned eax = h->inflight.vmcall.eax;
+
+ if(opt.summary)
+ {
+ if ( eax < HYPERCALL_MAX )
+ update_summary(&h->summary.vmcall[eax],
+ h->arc_cycles);
+ else
+ update_summary(&h->summary.vmcall[HYPERCALL_MAX],
+ h->arc_cycles);
+ hvm_set_summary_handler(h, hvm_vmcall_summary, NULL);
+ }
+}
+
+void hvm_vmcall_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct {
+ unsigned int eax;
+ } *r = (typeof(r))h->d;
+
+ if(opt.dump_all) {
+ if(r->eax < HYPERCALL_MAX)
+ printf(" %s vmcall %2x (%s)\n",
+ ri->dump_header,
+ r->eax,
+ hypercall_name[r->eax]);
+ else
+ printf(" %s vmcall %2x\n",
+ ri->dump_header,
+ r->eax);
+ }
+
+ h->inflight.vmcall.eax = r->eax;
+
+ if ( hvm_set_postprocess(h, hvm_vmcall_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+
+void hvm_inj_exc_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct {
+ unsigned vec, ec;
+ } *r = (typeof(r))h->d;
+
+ if ( opt.dump_all )
+ {
+ if(r->vec < HVM_TRAP_MAX)
+ printf(" %3u.%09u %s inj_exc trap %s ec %x\n",
+ ri->t.s, ri->t.ns, pcpu_string(ri->cpu),
+ hvm_trap_name[r->vec], r->ec);
+ else
+ printf(" %3u.%09u %s inj_exc trap %u ec %x\n",
+ ri->t.s, ri->t.ns, pcpu_string(ri->cpu),
+ r->vec, r->ec);
+ }
+
+}
+
+void hvm_intr_summary(struct hvm_data *h, void *d)
+{
+ int i;
+
+ for(i=0; i<EXTERNAL_INTERRUPT_MAX; i++)
+ if(h->summary.extint[i])
+ {
+ if(hvm_extint_vector_name[i])
+ printf(" %10s(%3d): %d\n",
+ hvm_extint_vector_name[i],
+ i,
+ h->summary.extint[i]);
+ else
+ printf(" [%3d]: %d\n",
+ i,
+ h->summary.extint[i]);
+ }
+ if(h->summary.extint[EXTERNAL_INTERRUPT_MAX])
+ printf(" Other: : %d\n",
+ h->summary.extint[EXTERNAL_INTERRUPT_MAX]);
+}
+
+
+void hvm_intr_process(struct record_info *ri, struct hvm_data *h)
+{
+ unsigned vec = *(unsigned *)h->d;
+
+ /* Vector is difficult to get in SVM mode */
+ if ( opt.svm_mode )
+ vec = 0;
+
+ if( (h->rip >> ADDR_SPACE_BITS) != 0
+ && (h->rip >> ADDR_SPACE_BITS) != ((0ULL-1)>> ADDR_SPACE_BITS) ) {
+ fprintf(stderr, "Unexpected rip %llx (shift %llx)\n",
+ h->rip,
+ h->rip >> ADDR_SPACE_BITS);
+ error(ERR_RECORD, NULL);
+ /* Can process with strange rip */
+ }
+
+ h->inflight.intr.vec = vec;
+
+ if ( opt.dump_all )
+ {
+ if ( vec < EXTERNAL_INTERRUPT_MAX &&
+ hvm_extint_vector_name[vec] )
+ printf(" %s intr vec %s(%x)\n",
+ ri->dump_header,
+ hvm_extint_vector_name[vec],
+ vec);
+ else
+ printf(" %s intr vec %x\n",
+ ri->dump_header, vec);
+ }
+
+ if(opt.scatterplot_interrupt_eip
+ && vec == opt.scatterplot_interrupt_vector)
+ {
+ struct time_struct t;
+ /* Truncate to 40 bits */
+ unsigned long long rip = h->rip & ((1ULL << ADDR_SPACE_BITS)-1);
+
+ /* Want absolute tsc to global tsc */
+ abs_cycles_to_time(h->exit_tsc, &t);
+ printf("d%dv%d %u.%09u %lld\n",
+ h->v->d->did, h->v->vid,
+ t.s, t.ns,
+ rip);
+ }
+
+ if(opt.histogram_interrupt_eip
+ && vec == opt.histogram_interrupt_vector)
+ {
+ /* Truncate to 40 bits */
+ unsigned long long rip = h->rip & ((1ULL << ADDR_SPACE_BITS)-1);
+ unsigned index = rip / opt.histogram_interrupt_increment;
+
+ h->summary.extint_histogram[index]++;
+ }
+
+ if(opt.with_interrupt_eip_enumeration
+ && vec == opt.interrupt_eip_enumeration_vector)
+ {
+ /* Truncate to 40 bits */
+ unsigned long long rip = h->rip & ((1ULL << ADDR_SPACE_BITS)-1);
+
+ /* Want absolute tsc to global tsc */
+ update_eip(&h->v->d->interrupt_eip_list, rip, 0, 0, NULL);
+ }
+
+ /* Disable generic postprocessing */
+ /* FIXME: Do the summary stuff in a post-processor */
+ h->post_process = NULL;
+
+ if(opt.summary_info) {
+ if(opt.summary)
+ hvm_set_summary_handler(h, hvm_intr_summary, NULL);
+
+ if(vec < EXTERNAL_INTERRUPT_MAX)
+ h->summary.extint[vec]++;
+ else
+ h->summary.extint[EXTERNAL_INTERRUPT_MAX]++;
+ }
+}
+
+
+void hvm_intr_window_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct {
+ uint32_t vector;
+ uint32_t source;
+ int32_t intr;
+ } *r = (typeof(r))h->d;
+
+ static const char *intsrc_name[] = {
+ "none",
+ "pic",
+ "lapic",
+ "nmi",
+ "mce",
+ "vector"
+ };
+
+ if ( opt.dump_all )
+ {
+ printf(" %s intr_window vec %u src %u(%s) ",
+ ri->dump_header,
+ (unsigned)r->vector,
+ (unsigned)r->source,
+ r->source < 6 ? intsrc_name[r->source]: "?");
+
+ if ( r->intr > 0 )
+ printf("intr %x\n",
+ (unsigned)r->intr);
+ else
+ printf("intr #\n");
+ }
+}
+
+void hvm_pf_inject_process(struct record_info *ri, struct hvm_data *h)
+{
+ union {
+ struct {
+ unsigned ec;
+ unsigned int cr2;
+ } x32;
+ struct {
+ unsigned ec;
+ unsigned long long cr2;
+ } __attribute__((packed)) x64;
+ } *r = (typeof(r))h->d;
+ unsigned int ec;
+ unsigned long long cr2;
+ int is_64 = 0;
+
+ if(ri->event & TRC_64_FLAG) {
+ is_64 = 1;
+ cr2 = r->x64.cr2;
+ ec = r->x64.ec;
+ } else {
+ cr2 = r->x32.cr2;
+ ec = r->x32.ec;
+ }
+
+ if ( opt.dump_all )
+ {
+ printf(" %3u.%09u %s pf_inject%s guest_cr2 %llx guest_ec %x\n",
+ ri->t.s, ri->t.ns, pcpu_string(ri->cpu),
+ is_64?"64":"",
+ cr2, ec);
+ }
+}
+
+void hvm_generic_postprocess_init(struct record_info *ri, struct hvm_data *h);
+
+void hvm_npf_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct {
+ uint64_t gpa;
+ uint64_t mfn;
+ uint32_t qualification;
+ uint32_t p2mt;
+ } *r = (typeof(r))h->d;
+
+ if ( opt.dump_all )
+ printf(" %s npf gpa %llx q %x mfn %llx t %d\n",
+ ri->dump_header,
+ (unsigned long long)r->gpa, r->qualification,
+ (unsigned long long)r->mfn, r->p2mt);
+
+ if ( opt.summary_info )
+ hvm_generic_postprocess_init(ri, h);
+}
+
+void hvm_rdtsc_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct {
+ unsigned long long tsc;
+ } *r = (typeof(r))h->d;
+
+ if ( opt.dump_all )
+ printf(" %s rdtsc %llx %lld %s\n",
+ ri->dump_header,
+ (unsigned long long)r->tsc,
+ (unsigned long long)r->tsc,
+ h->last_rdtsc > r->tsc ? "BACKWARDS" : "");
+
+ if ( opt.scatterplot_rdtsc )
+ {
+ struct time_struct t;
+
+ abs_cycles_to_time(ri->tsc, &t);
+
+ printf("%dv%d %u.%09u %llu\n",
+ h->v->d->did, h->v->vid,
+ t.s, t.ns,
+ r->tsc);
+ }
+
+ h->last_rdtsc = r->tsc;
+}
+
+void hvm_generic_summary(struct hvm_data *h, void *data)
+{
+ long evt = (long)data;
+
+ assert(evt < HVM_EVENT_HANDLER_MAX);
+
+ PRINT_SUMMARY(h->summary.generic[evt],
+ " %s ", hvm_event_handler_name[evt]);
+
+}
+
+void hvm_generic_postprocess_init(struct record_info *ri, struct hvm_data *h)
+{
+ if ( h->post_process != hvm_generic_postprocess )
+ fprintf(warn, "%s: Strange, h->postprocess set!\n",
+ __func__);
+ h->inflight.generic.event = ri->event;
+ bcopy(h->d, h->inflight.generic.d, sizeof(unsigned int) * 4);
+}
+
+void hvm_generic_postprocess(struct hvm_data *h)
+{
+ long evt = 0;
+ static unsigned registered[HVM_EVENT_HANDLER_MAX] = { 0 };
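+ /* registered[evt] records (exit_reason + 1) of the vmexit this
+ event was first seen under, 0 meaning "not yet registered", so a
+ later occurrence under a different exit reason can be flagged. */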
+
+ if ( h->inflight.generic.event )
+ evt = (h->inflight.generic.event - TRC_HVM_HANDLER)
+ & ~(TRC_64_FLAG|HVM_IO_ASSIST_WRITE);
+ else {
+ static unsigned warned[HVM_EXIT_REASON_MAX] = { 0 };
+ /* Some exits we don't expect a handler; just return */
+ if(opt.svm_mode)
+ {
+ }
+ else
+ {
+ switch(h->exit_reason)
+ {
+ /* These just need us to go through the return path */
+ case EXIT_REASON_PENDING_INTERRUPT:
+ case EXIT_REASON_TPR_BELOW_THRESHOLD:
+ /* Not much to log now; may need later */
+ case EXIT_REASON_WBINVD:
+ return;
+ default:
+ break;
+ }
+ }
+ if ( !warned[h->exit_reason] )
+ {
+ /* If we aren't a known exception, warn and log results */
+ fprintf(warn, "%s: Strange, exit %x(%s) missing a handler\n",
+ __func__, h->exit_reason,
+ (h->exit_reason >= h->exit_reason_max)
+ ? "[clipped]"
+ : h->exit_reason_name[h->exit_reason]);
+ warned[h->exit_reason]=1;
+ }
+ }
+
+ if ( evt >= HVM_EVENT_HANDLER_MAX || evt < 0)
+ {
+ fprintf(warn, "%s: invalid hvm event %lx(%x)\n",
+ __func__, evt, h->inflight.generic.event);
+ error(ERR_RECORD, NULL);
+ return;
+ }
+
+ if(opt.summary_info) {
+ update_summary(&h->summary.generic[evt],
+ h->arc_cycles);
+
+ /* NB that h->exit_reason may be 0, so we offset by 1 */
+ if ( registered[evt] )
+ {
+ static unsigned warned[HVM_EXIT_REASON_MAX] = { 0 };
+ if ( registered[evt] != h->exit_reason+1 && !warned[h->exit_reason])
+ {
+ fprintf(warn, "%s: HVM evt %lx in %x and %x!\n",
+ __func__, evt, registered[evt]-1, h->exit_reason);
+ warned[h->exit_reason]=1;
+ }
+ }
+ else
+ {
+ int ret;
+ if((ret=__hvm_set_summary_handler(h, hvm_generic_summary, (void *)evt)))
+ fprintf(stderr, "%s: hvm_set_summary_handler returned %d\n",
+ __func__, ret);
+ registered[evt]=h->exit_reason+1;
+ }
+ /* HLT checked at hvm_vmexit_close() */
+ }
+}
+
+void hvm_generic_dump(struct record_info *ri, char * prefix)
+{
+ struct {
+ unsigned vcpu:16, domain:16;
+ unsigned d[4];
+ } *cr = (typeof(cr))ri->d;
+
+ char *evt_string, evt_number[256];
+ int i, evt, is_64 = 0;
+
+ evt = ri->event - TRC_HVM_HANDLER;
+
+ if(evt & TRC_64_FLAG) {
+ evt &= ~(TRC_64_FLAG);
+ is_64=1;
+ }
+
+ if(evt < HVM_EVENT_HANDLER_MAX)
+ {
+ evt_string = hvm_event_handler_name[evt];
+ }
+ else
+ {
+ snprintf(evt_number, 256, "hvm_handler %d", evt);
+ evt_string = evt_number;
+ }
+
+ printf("%s%s %s%s [",
+ prefix,
+ ri->dump_header,
+ evt_string,
+ is_64?"64":"");
+
+ for(i=0; i<ri->extra_words; i++) {
+ printf(" %x", ri->d[i]);
+ }
+
+ printf(" ]\n");
+}
+
+void hvm_handler_process(struct record_info *ri, struct hvm_data *h) {
+ /* Wait for first vmexit to initialize */
+ if(!h->init)
+ {
+ if(opt.dump_all)
+ hvm_generic_dump(ri,"!");
+ return;
+ }
+
+ h->d = ri->d;
+
+ /* Handle things that don't need a vmexit */
+ switch(ri->event) {
+ default:
+ goto needs_vmexit;
+ /* Records about changing guest state */
+ case TRC_HVM_PF_INJECT:
+ case TRC_HVM_PF_INJECT64:
+ hvm_pf_inject_process(ri, h);
+ break;
+ case TRC_HVM_REINJ_VIRQ:
+ if ( opt.dump_all )
+ {
+ printf(" %3u.%09u %s inj_virq vec %u\n",
+ ri->t.s, ri->t.ns, pcpu_string(ri->cpu),
+ *(unsigned*)h->d);
+ }
+ break;
+ case TRC_HVM_INJ_EXC:
+ hvm_inj_exc_process(ri, h);
+ break;
+ case TRC_HVM_INJ_VIRQ:
+ hvm_inj_virq_process(ri, h);
+ break;
+ case TRC_HVM_INTR_WINDOW:
+ hvm_intr_window_process(ri, h);
+ break;
+ case TRC_HVM_OP_DESTROY_PROC:
+ if(h->v->cr3.data) {
+ struct cr3_value_struct *cur = h->v->cr3.data;
+ if(cur->destroy.callback)
+ fprintf(warn, "Strange, double callback for cr3 gmfn %llx!\n",
+ cur->gmfn);
+ cur->destroy.callback = 1;
+ } else if(opt.with_cr3_enumeration) {
+ fprintf(warn, "Warning: destroy_proc: don't know current cr3\n");
+ }
+ if ( opt.dump_all )
+ {
+ printf(" %3u.%09u %s destroy_proc cur_cr3 %llx\n",
+ ri->t.s, ri->t.ns, pcpu_string(ri->cpu), h->v->cr3.val);
+ }
+ break;
+ }
+
+ return;
+
+needs_vmexit:
+ /* Wait for the next vmexit */
+ if(!h->vmexit_valid)
+ {
+ if(opt.dump_all)
+ hvm_generic_dump(ri,"!");
+ return;
+ }
+
+ /* Keep generic "event handler" info */
+ h->event_handler = ri->event - TRC_HVM_HANDLER;
+
+ switch(ri->event) {
+ /* Records adding to the vmexit reason */
+ case TRC_HVM_INTR:
+ hvm_intr_process(ri, h);
+ break;
+ case TRC_HVM_PF_XEN:
+ case TRC_HVM_PF_XEN64:
+ hvm_pf_xen_process(ri, h);
+ break;
+ case TRC_HVM_IOPORT_READ:
+ case TRC_HVM_IOPORT_WRITE:
+ hvm_io_assist_process(ri, h);
+ break;
+ case TRC_HVM_IOMEM_READ:
+ case TRC_HVM_IOMEM_WRITE:
+ case TRC_HVM_IOMEM_READ|TRC_64_FLAG:
+ case TRC_HVM_IOMEM_WRITE|TRC_64_FLAG:
+ hvm_mmio_assist_process(ri, h);
+ break;
+ case TRC_HVM_CR_WRITE:
+ case TRC_HVM_CR_WRITE64:
+ hvm_cr_write_process(ri, h);
+ break;
+ case TRC_HVM_MSR_WRITE:
+ hvm_msr_write_process(ri, h);
+ break;
+ case TRC_HVM_MSR_READ:
+ hvm_msr_read_process(ri, h);
+ break;
+ case TRC_HVM_VMMCALL:
+ hvm_vmcall_process(ri, h);
+ break;
+ case TRC_HVM_NPF:
+ hvm_npf_process(ri, h);
+ break;
+ case TRC_HVM_RDTSC:
+ hvm_rdtsc_process(ri, h);
+ break;
+ case TRC_HVM_DR_READ:
+ case TRC_HVM_DR_WRITE:
+ case TRC_HVM_CPUID:
+ case TRC_HVM_SMI:
+ case TRC_HVM_HLT:
+ case TRC_HVM_INVLPG:
+ case TRC_HVM_INVLPG64:
+ case TRC_HVM_MCE:
+ case TRC_HVM_CLTS:
+ case TRC_HVM_LMSW:
+ case TRC_HVM_LMSW64:
+ case TRC_HVM_NMI:
+ case TRC_HVM_REALMODE_EMULATE:
+ case TRC_HVM_TRAP:
+ case TRC_HVM_TRAP_DEBUG:
+ case TRC_HVM_CR_READ:
+ case TRC_HVM_CR_READ64:
+ default:
+ if(opt.dump_all)
+ hvm_generic_dump(ri, "]");
+ if(opt.summary_info)
+ hvm_generic_postprocess_init(ri, h);
+ break;
+ }
+}
+
+void vcpu_next_update(struct pcpu_info *p, struct vcpu_data *next, tsc_t tsc);
+void vcpu_prev_update(struct pcpu_info *p, struct vcpu_data *prev,
+ tsc_t tsc, int new_runstate);
+struct vcpu_data * vcpu_find(int did, int vid);
+void lose_vcpu(struct vcpu_data *v, tsc_t tsc);
+
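+/* Classify the domain-wide runstate from the per-vcpu runstates: e.g.
+ * "concurrency hazard" means some vcpus running while others sit
+ * runnable (a preempted-lock-holder risk); "partial run" means running
+ * alongside blocked/offline vcpus; "full contention" means every
+ * active vcpu is runnable but none is running. */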
+int domain_runstate(struct domain_data *d) {
+ int i;
+ int runstates[RUNSTATE_MAX];
+ int ret=-1;
+ int max_vcpus = 0;
+
+ if(d->did == DEFAULT_DOMAIN)
+ return 0;
+
+ for(i=0; i<RUNSTATE_MAX; i++)
+ runstates[i]=0;
+
+ for(i=0; i<=d->max_vid; i++)
+ if(d->vcpu[i] && d->vcpu[i]->runstate.state != RUNSTATE_INIT) {
+ max_vcpus++;
+ runstates[d->vcpu[i]->runstate.state]++;
+ }
+
+ if(runstates[RUNSTATE_LOST] == max_vcpus)
+ ret=DOMAIN_RUNSTATE_LOST;
+ else if(runstates[RUNSTATE_RUNNING])
+ {
+ if(runstates[RUNSTATE_RUNNABLE])
+ ret=DOMAIN_RUNSTATE_CONCURRENCY_HAZARD;
+ else if(runstates[RUNSTATE_BLOCKED]||runstates[RUNSTATE_OFFLINE])
+ ret= DOMAIN_RUNSTATE_PARTIAL_RUN;
+ else
+ ret= DOMAIN_RUNSTATE_FULL_RUN;
+ }
+ else if(runstates[RUNSTATE_RUNNABLE])
+ {
+ if(runstates[RUNSTATE_BLOCKED]||runstates[RUNSTATE_OFFLINE])
+ ret= DOMAIN_RUNSTATE_PARTIAL_CONTENTION;
+ else
+ ret= DOMAIN_RUNSTATE_FULL_CONTENTION;
+ }
+ else if(runstates[RUNSTATE_BLOCKED]||runstates[RUNSTATE_OFFLINE])
+ {
+ ret= DOMAIN_RUNSTATE_BLOCKED;
+ } else {
+ fprintf(warn, "Strange, no meaningful runstates for d%d!\n",
+ d->did);
+ }
+
+ if ( ret < 0 )
+ {
+ printf(" Max vid: %d (max_vcpus %d)\n", d->max_vid, max_vcpus);
+ for(i=0; i<=d->max_vid; i++)
+ if(d->vcpu[i])
+ fprintf(warn, " v%d: %s\n",
+ i, runstate_name[d->vcpu[i]->runstate.state]);
+
+ for(i=0; i<RUNSTATE_MAX; i++)
+ fprintf(warn, " %s: %d\n",
+ runstate_name[i], runstates[i]);
+ }
+
+ if(ret >= 0)
+ return ret;
+
+ error(ERR_ASSERT, NULL);
+ return -1; /* Never happens */
+}
+
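+/* Close out cycle accounting for the vcpu's previous runstate, note
+ * why it became runnable (preempted vs woken), then recompute and
+ * account the domain-wide runstate. */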
+static inline void runstate_update(struct vcpu_data *v, int new_runstate,
+ tsc_t tsc)
+{
+ struct domain_data *d = v->d;
+
+ if ( opt.scatterplot_runstate )
+ {
+ struct time_struct t;
+
+ abs_cycles_to_time(tsc, &t);
+
+ printf("%dv%d %u.%09u %d\n",
+ d->did, v->vid,
+ t.s, t.ns,
+ runstate_graph[v->runstate.state]);
+ printf("%dv%d %u.%09u %d\n",
+ d->did, v->vid,
+ t.s, t.ns,
+ runstate_graph[new_runstate]);
+ }
+
+ if(v->runstate.tsc > 0 && v->runstate.tsc < tsc) {
+ update_cycles(v->runstates + v->runstate.state, tsc - v->runstate.tsc);
+
+ if ( opt.scatterplot_runstate_time )
+ {
+ struct time_struct t, dt;
+
+ abs_cycles_to_time(tsc, &t);
+ cycles_to_time(tsc - v->runstate.tsc, &dt);
+
+ printf("%dv%d %u.%09u %u.%09u\n",
+ d->did, v->vid,
+ t.s, t.ns,
+ dt.s, dt.ns);
+ }
+
+ if(v->runstate.state == RUNSTATE_RUNNING)
+ update_cycles(&v->d->total_time, tsc - v->runstate.tsc);
+
+ if(v->runstate.state == RUNSTATE_RUNNABLE)
+ update_cycles(v->runnable_states + v->runstate.runnable_state, tsc - v->runstate.tsc);
+
+ /* How much did dom0 run this buffer? */
+ if(v->d->did == 0) {
+ int i;
+ for(i=0; i<MAX_CPUS; i++) {
+ struct pcpu_info * p = P.pcpu + i;
+ tsc_t start_tsc;
+ if(!p->active)
+ continue;
+ start_tsc = (p->volume.buffer_first_tsc > v->runstate.tsc) ?
+ p->volume.buffer_first_tsc :
+ v->runstate.tsc;
+ p->volume.buffer_dom0_runstate_cycles[v->runstate.state]
+ += tsc - start_tsc;
+#if 0
+ printf(" - updated p%d dom0_runstate %s to %lld cycles (+%lld)\n",
+ p->pid, runstate_name[v->runstate.state],
+ p->volume.buffer_dom0_runstate_cycles[v->runstate.state],
+ tsc - start_tsc);
+#endif
+ p->volume.buffer_dom0_runstate = new_runstate;
+ p->volume.buffer_dom0_runstate_tsc = tsc;
+ }
+ }
+ }
+
+ /* Detect "runnable" states */
+ if ( new_runstate == RUNSTATE_RUNNABLE )
+ {
+ switch(v->runstate.state)
+ {
+ case RUNSTATE_RUNNING:
+ v->runstate.runnable_state=RUNNABLE_STATE_PREEMPT;
+ break;
+ case RUNSTATE_BLOCKED:
+ case RUNSTATE_OFFLINE:
+ v->runstate.runnable_state=RUNNABLE_STATE_WAKE;
+ break;
+ default:
+ v->runstate.runnable_state=RUNNABLE_STATE_OTHER;
+ break;
+ }
+ } else
+ v->runstate.runnable_state=RUNNABLE_STATE_INVALID;
+
+ v->runstate.state = new_runstate;
+ v->runstate.tsc = tsc;
+
+ /* Determine the domain runstate */
+ if(d->runstate_tsc > 0 && d->runstate_tsc < tsc)
+ update_cycles(d->runstates + d->runstate, tsc - d->runstate_tsc);
+
+ d->runstate = domain_runstate(d);
+
+ d->runstate_tsc = tsc;
+}
+
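+/* Start of a vmexit arc: record the exit reason and guest rip. Guest
+ * paging levels are inferred from whether the 64-bit record variant
+ * was used, and corrected if they disagree with what we assumed. */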
+void hvm_vmexit_process(struct record_info *ri, struct hvm_data *h,
+ struct vcpu_data *v) {
+ struct {
+ union {
+ struct {
+ unsigned int exit_reason;
+ unsigned long long rip;
+ } __attribute__((packed)) x64;
+ struct {
+ unsigned int exit_reason;
+ unsigned int eip;
+ } x32;
+ };
+ } *r;
+
+ if ( ri->event & TRC_64_FLAG )
+ {
+ if (check_extra_words(ri, sizeof(r->x64), "vmexit"))
+ return;
+ }
+ else
+ {
+ if (check_extra_words(ri, sizeof(r->x32), "vmexit"))
+ return;
+ }
+
+ r = (typeof(r))ri->d;
+
+ if(!h->init)
+ init_hvm_data(h, v);
+
+ h->vmexit_valid=1;
+ bzero(&h->inflight, sizeof(h->inflight));
+
+ if(ri->event == TRC_HVM_VMEXIT64) {
+ if(v->guest_paging_levels != 4)
+ {
+ if ( verbosity >= 6 )
+ fprintf(warn, "%s: VMEXIT64, but guest_paging_levels %d. Switching to 4.\n",
+ __func__, v->guest_paging_levels);
+ v->guest_paging_levels = 4;
+ }
+ if(!is_valid_addr64(r->x64.rip))
+ fprintf(warn, "%s: invalid va %llx\n",
+ __func__, r->x64.rip);
+ h->rip = r->x64.rip;
+ h->exit_reason = r->x64.exit_reason;
+ } else {
+ if(v->guest_paging_levels == 4)
+ {
+ int new_paging_levels = opt.default_guest_paging_levels;
+
+ if(new_paging_levels == 4)
+ new_paging_levels = 2; /* Wild guess */
+
+ if ( verbosity >= 6 )
+ fprintf(warn, "%s: VMEXIT, but guest_paging_levels %d. Switching to %d(default).\n",
+ __func__, v->guest_paging_levels, new_paging_levels);
+
+ v->guest_paging_levels = new_paging_levels;
+ }
+ h->rip = r->x32.eip;
+ h->exit_reason = r->x32.exit_reason;
+ }
+
+ if(opt.scatterplot_vmexit_eip)
+ scatterplot_vs_time(ri->tsc, h->rip);
+
+ if(h->exit_reason >= h->exit_reason_max)
+ {
+ fprintf(warn, "h->exit_reason %x >= exit_reason_max %x!\n",
+ (unsigned int)h->exit_reason,
+ (unsigned int)h->exit_reason_max);
+ error(ERR_RECORD, ri);
+ return;
+ }
+
+ if(opt.dump_all) {
+ if ( h->exit_reason < h->exit_reason_max
+ && h->exit_reason_name[h->exit_reason] != NULL)
+ printf("]%s vmexit exit_reason %s eip %llx%s\n",
+ ri->dump_header,
+ h->exit_reason_name[h->exit_reason],
+ h->rip,
+ find_symbol(h->rip));
+ else
+ printf("]%s vmexit exit_reason %x eip %llx%s\n",
+ ri->dump_header,
+ h->exit_reason,
+ h->rip,
+ find_symbol(h->rip));
+ }
+
+ if(h->v->cr3.data && h->entry_tsc) {
+ update_cycles(&h->v->cr3.data->guest_time,
+ ri->tsc - h->entry_tsc);
+ h->v->cr3.data->run_time += (ri->tsc - h->entry_tsc);
+ }
+
+ h->exit_tsc = ri->tsc;
+ h->entry_tsc = 0;
+ h->resyncs = 0;
+ h->prealloc_unpin = 0;
+ h->wrmap_bf = 0;
+ h->short_summary_done = 0;
+
+ h->post_process = hvm_generic_postprocess;
+ h->inflight.generic.event = 0;
+}
+
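+/* End of a vmexit arc, normally at the matching vmentry: arc_cycles is
+ * the time spent handling this exit. Run the per-exit postprocess
+ * hook and fold the arc into the summaries. */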
+void hvm_close_vmexit(struct hvm_data *h, tsc_t tsc) {
+
+ if(h->exit_tsc) {
+ if(h->exit_tsc > tsc)
+ h->arc_cycles = 0;
+ else {
+ h->arc_cycles = tsc - h->exit_tsc;
+
+ if(opt.summary_info) {
+ update_summary(&h->summary.exit_reason[h->exit_reason],
+ h->arc_cycles);
+ h->summary_info = 1;
+ }
+
+ if ( opt.scatterplot_extint_cycles
+ && h->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT
+ && h->inflight.intr.vec == opt.scatterplot_extint_cycles_vector )
+ {
+ struct time_struct t;
+
+ abs_cycles_to_time(tsc, &t);
+
+ printf("d%dv%d %u.%09u %lld\n",
+ h->v->d->did,
+ h->v->vid,
+ t.s, t.ns,
+ h->arc_cycles);
+ }
+ }
+ }
+
+ if(h->post_process)
+ (h->post_process)(h);
+
+ if(h->arc_cycles) {
+ if(opt.summary_info && !h->short_summary_done) {
+ switch(h->event_handler) {
+ case HVM_EVENT_HANDLER_VMCALL:
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_VMCALL);
+ break;
+ case HVM_EVENT_HANDLER_INTR:
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_INTERRUPT);
+ break;
+ case HVM_EVENT_HANDLER_HLT:
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_HLT);
+ break;
+ default:
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_OTHER);
+ break;
+ }
+ }
+
+
+ if(h->v->cr3.data) {
+ h->v->cr3.data->run_time += h->arc_cycles;
+
+ if(opt.summary_info)
+ update_cycles(&h->v->cr3.data->hv_time,
+ h->arc_cycles);
+ }
+ }
+
+ h->exit_tsc = 0;
+ h->vmexit_valid = 0;
+ h->post_process = NULL;
+
+}
+
+void hvm_vmentry_process(struct record_info *ri, struct hvm_data *h) {
+ if(!h->init)
+ {
+ if(opt.dump_all)
+ printf("!%s vmentry\n",
+ ri->dump_header);
+ return;
+ }
+
+ /* Vista bug
+ * This has to be done here because irqs are injected on the path out
+ * to the guest, just before vmentry. */
+ hvm_vlapic_vmentry_cleanup(h->v, ri->tsc);
+
+ if(h->w2h.waking && opt.dump_all)
+ printf(" [w2h] d%dv%d Finishing waking\n",
+ h->v->d->did, h->v->vid);
+
+ h->w2h.waking = 0;
+
+ if ( h->w2h.interrupts_wanting_tsc ) {
+ int i;
+ for(i=0; i<GUEST_INTERRUPT_MAX; i++)
+ {
+ if ( h->summary.guest_interrupt[i].start_tsc == 1 )
+ {
+ if(opt.dump_all)
+ printf(" [w2h] d%dv%d Setting vec %d tsc to %lld\n",
+ h->v->d->did, h->v->vid, i, ri->tsc);
+ h->summary.guest_interrupt[i].start_tsc = ri->tsc;
+ h->w2h.interrupts_wanting_tsc--;
+ if ( h->w2h.interrupts_wanting_tsc == 0 )
+ break;
+ }
+ }
+ }
+
+ if(!h->vmexit_valid)
+ {
+ if(opt.dump_all)
+ printf("!%s vmentry\n",
+ ri->dump_header);
+ return;
+ }
+
+ if(opt.dump_all) {
+ unsigned long long arc_cycles = ri->tsc - h->exit_tsc;
+ printf("]%s vmentry cycles %lld %s\n",
+ ri->dump_header, arc_cycles, (arc_cycles>10000)?"!":"");
+ }
+
+ hvm_close_vmexit(h, ri->tsc);
+ h->entry_tsc = ri->tsc;
+}
+
+void hvm_process(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+ struct vcpu_data *v = p->current;
+ struct hvm_data *h = &v->hvm;
+
+ assert(p->current);
+
+ if(vcpu_set_data_type(p->current, VCPU_DATA_HVM))
+ return;
+
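+ /* Subclass 2 holds the per-event "handler" records; anything else
+ here should be a vmexit/vmentry record. */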
+ if(ri->evt.sub == 2)
+ {
+ UPDATE_VOLUME(p, hvm[HVM_VOL_HANDLER], ri->size);
+ hvm_handler_process(ri, h);
+ }
+ else
+ {
+ switch(ri->event) {
+ /* HVM */
+ case TRC_HVM_VMEXIT:
+ case TRC_HVM_VMEXIT64:
+ UPDATE_VOLUME(p, hvm[HVM_VOL_VMEXIT], ri->size);
+ hvm_vmexit_process(ri, h, v);
+ break;
+ case TRC_HVM_VMENTRY:
+ UPDATE_VOLUME(p, hvm[HVM_VOL_VMENTRY], ri->size);
+ hvm_vmentry_process(ri, &p->current->hvm);
+ break;
+ default:
+ fprintf(warn, "Unknown hvm event: %x\n", ri->event);
+ }
+ }
+}
+
+void hvm_summary(struct hvm_data *h) {
+ int i;
+
+ if(!h->summary_info)
+ return;
+
+ printf("Exit reasons:\n");
+ for(i=0; i<h->exit_reason_max; i++) {
+ struct hvm_summary_handler_node *p;
+
+ if ( h->exit_reason_name[i] )
+ PRINT_SUMMARY(h->summary.exit_reason[i],
+ " %-20s ", h->exit_reason_name[i]);
+ else
+ PRINT_SUMMARY(h->summary.exit_reason[i],
+ " %20d ", i);
+
+ p=h->exit_reason_summary_handler_list[i];
+ while(p)
+ {
+ p->handler(h, p->data);
+ p=p->next;
+ }
+ }
+
+ printf("Guest interrupt counts:\n");
+ for(i=0; i<GUEST_INTERRUPT_MAX; i++)
+ if(h->summary.guest_interrupt[i].count) {
+ int j;
+ printf(" [%3d] %d\n",
+ i, h->summary.guest_interrupt[i].count);
+ for(j=1; j<GUEST_INTERRUPT_CASE_MAX; j++) {
+ char desc[80];
+ snprintf(desc, 80, " * %s", guest_interrupt_case_name[j]);
+ print_cycle_summary(h->summary.guest_interrupt[i].runtime+j, desc);
+ }
+ }
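+ /* Here i == GUEST_INTERRUPT_MAX: the catch-all bucket for vectors
+ above the tracked range. */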
+ if(h->summary.guest_interrupt[i].count)
+ printf(" [%d+] %d\n",
+ i, h->summary.guest_interrupt[i].count);
+
+ if(opt.histogram_interrupt_eip)
+ {
+ unsigned max = ((1ULL<<ADDR_SPACE_BITS)/opt.histogram_interrupt_increment);
+ printf("Interrupt eip histogram:\n");
+ for(i=0; i<max; i++)
+ if(h->summary.extint_histogram[i])
+ {
+ printf("[%llx-%llx]: %d\n",
+ opt.histogram_interrupt_increment * i,
+ (opt.histogram_interrupt_increment * (i+1)) - 1,
+ h->summary.extint_histogram[i]);
+ }
+ }
+
+ PRINT_SUMMARY(h->summary.ipi_latency,
+ "IPI latency \n");
+ for(i=0; i<256; i++)
+ if(h->summary.ipi_count[i])
+ printf(" [%3d] %10d\n",
+ i, h->summary.ipi_count[i]);
+ hvm_io_address_summary(h->summary.io.pio, "IO address summary:");
+ hvm_io_address_summary(h->summary.io.mmio, "MMIO address summary:");
+}
+
+/* ---- Shadow records ---- */
+union shadow_event
+{
+ unsigned event;
+ struct {
+ unsigned minor:8,
+ paging_levels:4;
+ };
+};
+
+/* WARNING - not thread safe */
+#define FLAGSTRING(_name, _char) \
+ if(e->flag_ ## _name) \
+ flagstring[i] = _char; \
+ i++;
+
+char * flag_string(struct pf_xen_extra *e)
+{
+ static char flagstring[32];
+ int i=0;
+
+ for(i=0; i<32; i++)
+ flagstring[i]='-';
+
+ i=0;
+
+ if(e->flag_set_ad)
+ flagstring[i]='d';
+ else if(e->flag_set_a)
+ flagstring[i]='a';
+ i++;
+
+ FLAGSTRING(shadow_l1_get_ref, 'g');
+ FLAGSTRING(shadow_l1_put_ref, 'p');
+ //FLAGSTRING(l2_propagate, '2');
+ FLAGSTRING(demote, 'D');
+ FLAGSTRING(promote, 'P');
+ FLAGSTRING(wrmap, 'w');
+ FLAGSTRING(wrmap_guess_found, 'G');
+ //FLAGSTRING(wrmap_brute_force, 'b');
+ FLAGSTRING(early_unshadow, 'e');
+ FLAGSTRING(prealloc_unhook, 'H');
+ FLAGSTRING(unsync, 'u');
+ FLAGSTRING(oos_fixup_add, 'a');
+ FLAGSTRING(oos_fixup_evict, 'x');
+
+ flagstring[i]=0;
+
+ return flagstring;
+}
+
+void shadow_emulate_postprocess(struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+
+ if ( opt.summary_info )
+ {
+ update_eip(&h->v->d->emulate_eip_list,
+ h->rip,
+ h->arc_cycles,
+ 0, NULL);
+ update_summary(&h->summary.pf_xen[PF_XEN_EMULATE], h->arc_cycles);
+ update_summary(&h->summary.pf_xen_emul[e->pt_level], h->arc_cycles);
+ if(h->prealloc_unpin)
+ update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_PREALLOC_UNPIN], h->arc_cycles);
+ if(e->flag_prealloc_unhook)
+ update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_PREALLOC_UNHOOK], h->arc_cycles);
+ if(e->flag_early_unshadow)
+ update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_EARLY_UNSHADOW], h->arc_cycles);
+ if(e->flag_set_changed)
+ update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_CHANGED], h->arc_cycles);
+ else
+ update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_UNCHANGED], h->arc_cycles);
+ if(e->flag_set_flush)
+ update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_FLUSH], h->arc_cycles);
+ if(e->flag_set_error)
+ update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_ERROR], h->arc_cycles);
+ if(e->flag_promote)
+ update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_PROMOTE], h->arc_cycles);
+ if(e->flag_demote)
+ update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_DEMOTE], h->arc_cycles);
+ /* more summary info */
+
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_EMULATE);
+ }
+
+ if(opt.scatterplot_unpin_promote) {
+ if(e->flag_early_unshadow)
+ scatterplot_vs_time(h->exit_tsc, -10);
+ if(h->prealloc_unpin)
+ scatterplot_vs_time(h->exit_tsc, 0);
+ if(e->flag_promote) {
+ if(opt.with_cr3_enumeration) {
+ if(h->v->cr3.data)
+ scatterplot_vs_time(h->exit_tsc, h->v->cr3.data->cr3_id);
+ } else
+ scatterplot_vs_time(h->exit_tsc, 2);
+ }
+ }
+
+
+}
+
+void shadow_emulate_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+ union {
+ /* for PAE, guest_l1e may be 64 bits while guest_va may be 32;
+ so put it first for alignment's sake. */
+ struct {
+ unsigned gl1e, write_val;
+ unsigned va;
+ unsigned flags:29, emulation_count:3;
+ } gpl2;
+ struct {
+ unsigned long long gl1e, write_val;
+ unsigned va;
+ unsigned flags:29, emulation_count:3;
+ } gpl3;
+ struct {
+ unsigned long long gl1e, write_val;
+ unsigned long long va;
+ unsigned flags:29, emulation_count:3;
+ } gpl4;
+ } *r = (typeof(r))ri->d;
+
+ union shadow_event sevt = { .event = ri->event };
+ int rec_gpl = sevt.paging_levels + 2;
+
+ if ( rec_gpl != h->v->guest_paging_levels )
+ {
+ fprintf(warn, "%s: record paging levels %d, guest paging levels %d. Switching.\n",
+ __func__, rec_gpl, h->v->guest_paging_levels);
+ h->v->guest_paging_levels = rec_gpl;
+ }
+
+ /* Fill in extended information */
+ switch(rec_gpl)
+ {
+ case 2:
+ if(sizeof(r->gpl2) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl2), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = r->gpl2.va;
+ e->flags = r->gpl2.flags;
+ e->gl1e = r->gpl2.gl1e;
+ e->wval = r->gpl2.write_val;
+ break;
+ case 3:
+ if(sizeof(r->gpl3) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl3), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = r->gpl3.va;
+ e->flags = r->gpl3.flags;
+ e->gl1e = r->gpl3.gl1e;
+ e->wval = r->gpl3.write_val;
+ break;
+ case 4:
+ if(sizeof(r->gpl4) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl4), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = r->gpl4.va;
+ e->flags = r->gpl4.flags;
+ e->gl1e = r->gpl4.gl1e;
+ e->wval = r->gpl4.write_val;
+ break;
+ }
+
+ pf_preprocess(e,rec_gpl);
+
+ if(opt.dump_all)
+ printf("]%s emulate va %llx gl1e %8llx wval %8llx flags %s(%x) pt_level %d corr %8llx\n",
+ ri->dump_header,
+ e->va,
+ e->gl1e, e->wval,
+ flag_string(e), e->flags,
+ e->pt_level, e->corresponding_va);
+
+ if ( hvm_set_postprocess(h, shadow_emulate_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+
+struct shadow_emulate_other {
+ unsigned long long gfn, va;
+};
+
+#define SHADOW_OTHER_LOGS_GFN_NOT_GMFN 1
+
+void shadow_parse_other(struct record_info *ri,
+ struct shadow_emulate_other *o,
+ struct hvm_data *h) {
+ union {
+ /* for PAE, guest_l1e may be 64 bits while guest_va may be 32;
+ so put it first for alignment's sake. */
+#if SHADOW_OTHER_LOGS_GFN_NOT_GMFN
+ /* D'OH! Accidentally used mfn_t in the struct, so gmfns are always
+ 64-bit... :-/ */
+ struct {
+ unsigned int gfn, va;
+ } gpl2;
+#endif
+ struct {
+ unsigned long long gfn;
+ unsigned int va;
+ } gpl3;
+ struct {
+ unsigned long long gfn, va;
+ } gpl4;
+ } *r = (typeof(r))ri->d;
+
+
+ union shadow_event sevt = { .event = ri->event };
+ int rec_gpl = sevt.paging_levels + 2;
+
+ if ( rec_gpl != h->v->guest_paging_levels )
+ {
+ fprintf(warn, "%s: record paging levels %d, guest paging levels %d. Switching.\n",
+ __func__, rec_gpl, h->v->guest_paging_levels);
+ h->v->guest_paging_levels = rec_gpl;
+ }
+
+ switch(rec_gpl)
+ {
+#if SHADOW_OTHER_LOGS_GFN_NOT_GMFN
+ case 2:
+ if(sizeof(r->gpl2) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl2), rec_gpl,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ o->va = r->gpl2.va;
+ o->gfn = r->gpl2.gfn;
+ break;
+#else
+ case 2:
+ /* FALLTHRU */
+#endif
+ case 3:
+ if(sizeof(r->gpl3) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl3), rec_gpl,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ o->va = r->gpl3.va;
+ o->gfn = r->gpl3.gfn;
+ break;
+ case 4:
+ if(sizeof(r->gpl4) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl4), rec_gpl,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ o->va = r->gpl4.va;
+ o->gfn = r->gpl4.gfn;
+ break;
+ }
+}
+
+#if 0
+void shadow_unsync_postprocess(struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+
+ if(h->resyncs > 1)
+ fprintf(warn, "Strange, %d resyncs for an unsync!\n",
+ h->resyncs);
+
+ if(opt.summary_info) {
+ update_summary(&h->summary.pf_xen[PF_XEN_EMULATE_UNSYNC],
+ h->arc_cycles);
+ if(h->resyncs <= 1)
+ update_summary(&h->summary.pf_xen_unsync[h->resyncs],
+ h->arc_cycles);
+ }
+}
+
+
+void shadow_unsync_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+ struct shadow_emulate_other r;
+
+ shadow_parse_other(ri, &r, h);
+
+ e->gfn = r.gfn;
+ e->va = r.va;
+
+ pf_preprocess(e, h->v->guest_paging_levels);
+
+ if(opt.dump_all)
+ printf("]%s shadow unsync gmfn %llx va %llx pt_level %d corr %llx\n",
+ ri->dump_header,
+ e->gmfn,
+ e->va,
+ e->pt_level,
+ e->corresponding_va);
+
+ if ( hvm_set_postprocess(h, shadow_unsync_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+#endif
+
+void shadow_fault_generic_postprocess(struct hvm_data *h);
+
+void shadow_emulate_other_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+ struct shadow_emulate_other r;
+ union shadow_event sevt = { .event = ri->event };
+
+ shadow_parse_other(ri, &r, h);
+
+ e->gfn = r.gfn;
+ e->va = r.va;
+ e->pf_case = sevt.minor;
+
+ pf_preprocess(e, h->v->guest_paging_levels);
+
+ if(opt.dump_all)
+ printf("]%s shadow %s gfn %llx va %llx\n",
+ ri->dump_header,
+ pf_xen_name[sevt.minor],
+ e->gfn,
+ e->va);
+
+ if ( hvm_set_postprocess(h, shadow_fault_generic_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+
+void shadow_fixup_postprocess(struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+
+ if ( opt.summary_info )
+ {
+ update_summary(&h->summary.pf_xen[PF_XEN_FIXUP], h->arc_cycles);
+ if(h->prealloc_unpin) {
+ update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_PREALLOC_UNPIN], h->arc_cycles);
+ }
+ if(e->flag_unsync) {
+ update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_UNSYNC], h->arc_cycles);
+ if(h->resyncs < PF_XEN_FIXUP_UNSYNC_RESYNC_MAX)
+ update_summary(&h->summary.pf_xen_fixup_unsync_resync[h->resyncs],
+ h->arc_cycles);
+ else
+ update_summary(&h->summary.pf_xen_fixup_unsync_resync[PF_XEN_FIXUP_UNSYNC_RESYNC_MAX],
+ h->arc_cycles);
+ }
+ if(e->flag_oos_fixup_add)
+ update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_OOS_ADD], h->arc_cycles);
+ if(e->flag_oos_fixup_evict)
+ update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_OOS_EVICT], h->arc_cycles);
+ if(e->flag_promote)
+ update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_PROMOTE], h->arc_cycles);
+ if(e->flag_wrmap) {
+ update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_WRMAP], h->arc_cycles);
+ if(e->flag_wrmap_brute_force || h->wrmap_bf)
+ update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_BRUTE_FORCE], h->arc_cycles);
+ } else if(e->flag_wrmap_brute_force || h->wrmap_bf) {
+ fprintf(warn, "Strange: wrmap_bf but not wrmap!\n");
+ }
+
+
+ if(!(e->flag_promote || h->prealloc_unpin || e->flag_unsync))
+ update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_UPDATE_ONLY], h->arc_cycles);
+ /* more summary info */
+
+ if(e->flag_unsync)
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_UNSYNC);
+ else
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_FIXUP);
+ }
+
+ if(opt.scatterplot_unpin_promote) {
+ if(h->prealloc_unpin)
+ scatterplot_vs_time(h->exit_tsc, 0);
+ if(e->flag_promote) {
+ if(opt.with_cr3_enumeration) {
+ if(h->v->cr3.data)
+ scatterplot_vs_time(h->exit_tsc, h->v->cr3.data->cr3_id);
+ } else
+ scatterplot_vs_time(h->exit_tsc, 2);
+ }
+ }
+}
+
+void shadow_fixup_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+ union {
+ /* for PAE, guest_l1e may be 64 bits while guest_va may be 32;
+ so put it first for alignment's sake. */
+ struct {
+ unsigned int gl1e, va, flags;
+ } gpl2;
+ struct {
+ unsigned long long gl1e;
+ unsigned int va, flags;
+ } gpl3;
+ struct {
+ unsigned long long gl1e, va;
+ unsigned int flags;
+ } gpl4;
+ } *r = (typeof(r))ri->d;
+ union shadow_event sevt = { .event = ri->event };
+ int rec_gpl = sevt.paging_levels + 2;
+
+ if ( rec_gpl != h->v->guest_paging_levels )
+ {
+ fprintf(warn, "%s: record paging levels %d, guest paging levels %d. Switching.\n",
+ __func__, rec_gpl, h->v->guest_paging_levels);
+ h->v->guest_paging_levels = rec_gpl;
+ }
+
+ switch(rec_gpl)
+ {
+ case 2:
+ if(sizeof(r->gpl2) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl2), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = r->gpl2.va;
+ e->flags = r->gpl2.flags;
+ e->gl1e = r->gpl2.gl1e;
+ break;
+ case 3:
+ if(sizeof(r->gpl3) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl3), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = r->gpl3.va;
+ e->flags = r->gpl3.flags;
+ e->gl1e = r->gpl3.gl1e;
+ break;
+ case 4:
+ if(sizeof(r->gpl4) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl4), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = r->gpl4.va;
+ e->flags = r->gpl4.flags;
+ e->gl1e = r->gpl4.gl1e;
+ break;
+ }
+
+ pf_preprocess(e,rec_gpl);
+
+ if(opt.dump_all)
+ {
+ if ( e->flag_unsync )
+ printf("]%s fixup:unsync va %llx gl1e %llx corr %llx flags (%x)%s\n",
+ ri->dump_header,
+ e->va, e->gl1e,
+ e->corresponding_va,
+ e->flags,
+ flag_string(e));
+ else
+ printf("]%s fixup va %llx gl1e %llx flags (%x)%s\n",
+ ri->dump_header,
+ e->va, e->gl1e, e->flags,
+ flag_string(e));
+ }
+
+ if ( hvm_set_postprocess(h, shadow_fixup_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+
+void shadow_mmio_postprocess(struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+ if ( opt.summary_info )
+ {
+ if(e->pf_case)
+ update_summary(&h->summary.pf_xen[e->pf_case],
+ h->arc_cycles);
+ else
+ fprintf(warn, "Strange, pf_case 0!\n");
+
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_MMIO);
+ }
+
+ if(opt.with_mmio_enumeration)
+ enumerate_mmio(h);
+}
+
+void shadow_mmio_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+ struct mmio_info *m = &h->inflight.mmio;
+ union {
+ /* for PAE, guest_l1e may be 64 bits while guest_va may be 32;
+ so put it first for alignment's sake. */
+ struct {
+ unsigned int va;
+ } gpl2;
+ struct {
+ unsigned long long va;
+ } gpl4;
+ } *r = (typeof(r))ri->d;
+ union shadow_event sevt = { .event = ri->event };
+ int rec_gpl = sevt.paging_levels + 2;
+
+ if ( rec_gpl != h->v->guest_paging_levels )
+ {
+ fprintf(warn, "%s: record paging levels %d, guest paging levels %d. Switching.\n",
+ __func__, rec_gpl, h->v->guest_paging_levels);
+ h->v->guest_paging_levels = rec_gpl;
+ }
+
+ switch(rec_gpl)
+ {
+ case 2:
+ case 3:
+ if(sizeof(r->gpl2) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl2), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = m->va = r->gpl2.va;
+ break;
+ case 4:
+ if(sizeof(r->gpl4) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl4), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = m->va = r->gpl4.va;
+ break;
+ }
+
+ if(opt.dump_all)
+ printf("]%s %smmio va %llx\n",
+ ri->dump_header,
+ (e->pf_case==PF_XEN_FAST_MMIO)?"fast ":"",
+ e->va);
+
+ if ( hvm_set_postprocess(h, shadow_mmio_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+
+void shadow_propagate_postprocess(struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+
+ if ( opt.summary_info )
+ {
+ if(e->pf_case)
+ update_summary(&h->summary.pf_xen[e->pf_case],
+ h->arc_cycles);
+ else
+ fprintf(warn, "Strange, pf_case 0!\n");
+
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_PROPAGATE);
+ }
+}
+
+void shadow_propagate_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+ union {
+ /* for PAE, guest_l1e may be 64 bits while guest_va may be 32;
+ so put it first for alignment's sake. */
+ struct {
+ unsigned int gl1e, va, flags;
+ } gpl2;
+ struct {
+ unsigned long long gl1e;
+ unsigned int va, flags;
+ } gpl3;
+ struct {
+ unsigned long long gl1e, va;
+ unsigned int flags;
+ } gpl4;
+ } *r = (typeof(r))ri->d;
+ union shadow_event sevt = { .event = ri->event };
+ int rec_gpl = sevt.paging_levels + 2;
+
+ if ( rec_gpl != h->v->guest_paging_levels )
+ {
+ fprintf(warn, "%s: record paging levels %d, guest paging levels %d. Switching.\n",
+ __func__, rec_gpl, h->v->guest_paging_levels);
+ h->v->guest_paging_levels = rec_gpl;
+ }
+
+ switch(rec_gpl)
+ {
+ case 2:
+ if(sizeof(r->gpl2) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl2), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = r->gpl2.va;
+ e->flags = r->gpl2.flags;
+ e->gl1e = r->gpl2.gl1e;
+ break;
+ case 3:
+ if(sizeof(r->gpl3) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl3), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = r->gpl3.va;
+ e->flags = r->gpl3.flags;
+ e->gl1e = r->gpl3.gl1e;
+ break;
+ case 4:
+ if(sizeof(r->gpl4) != ri->extra_words * 4)
+ {
+ fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
+ __func__, sizeof(r->gpl4), h->v->guest_paging_levels,
+ ri->extra_words * 4);
+ error(ERR_RECORD, ri);
+ return;
+ }
+ e->va = r->gpl4.va;
+ e->flags = r->gpl4.flags;
+ e->gl1e = r->gpl4.gl1e;
+ break;
+ }
+
+ if(opt.dump_all)
+ printf("]%s propagate va %llx gl1e %llx flags (%x)%s\n",
+ ri->dump_header,
+ e->va, e->gl1e, e->flags,
+ flag_string(e));
+
+ if ( hvm_set_postprocess(h, shadow_propagate_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+
+void shadow_fault_generic_dump(unsigned int event, uint32_t *d, char *prefix,
+ char * dump_header)
+{
+ char *evt_string, evt_number[10];
+ union shadow_event sevt = { .event = event };
+ int i;
+
+ if(sevt.minor < PF_XEN_MAX && pf_xen_name[sevt.minor])
+ {
+ evt_string = pf_xen_name[sevt.minor];
+ }
+ else
+ {
+ snprintf(evt_number, 10, "%d", sevt.minor);
+ evt_string = evt_number;
+ }
+
+ printf("%s%s shadow %s gl %d [",
+ prefix,
+ dump_header,
+ evt_string,
+ sevt.paging_levels);
+
+ for(i=0; i<4; i++)
+ {
+ printf(" %x", d[i]);
+ }
+
+ printf(" ]\n");
+}
+
+void shadow_fault_generic_postprocess(struct hvm_data *h)
+{
+ struct pf_xen_extra *e = &h->inflight.pf_xen;
+ if ( e->pf_case < PF_XEN_NOT_SHADOW || e->pf_case > PF_XEN_LAST_FAULT )
+ {
+ fprintf(warn, "%s: Strange, unexpected case %d\n",
+ __func__, e->pf_case);
+ return;
+ }
+
+ if(opt.summary_info) {
+ update_summary(&h->summary.pf_xen[e->pf_case],
+ h->arc_cycles);
+
+ hvm_update_short_summary(h, HVM_SHORT_SUMMARY_PROPAGATE);
+ }
+}
+
+void shadow_fault_generic_process(struct record_info *ri, struct hvm_data *h)
+{
+ union shadow_event sevt = { .event = ri->event };
+
+ /* pf-case traces, vs others */
+ h->inflight.generic.event = ri->event;
+ bcopy(ri->d, h->inflight.generic.d, sizeof(unsigned int) * 4);
+
+ if(opt.dump_all)
+ shadow_fault_generic_dump(h->inflight.generic.event,
+ h->inflight.generic.d,
+ "]", ri->dump_header);
+
+ h->inflight.pf_xen.pf_case = sevt.minor;
+ if ( hvm_set_postprocess(h, shadow_fault_generic_postprocess) )
+ fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
+}
+
+void shadow_resync_process(struct record_info *ri, struct hvm_data *h)
+{
+ struct {
+ unsigned long long gfn;
+ } *r = (typeof(r))ri->d;
+
+ if(opt.dump_all)
+ printf(" %s oos resync %s gfn %llx\n",
+ ri->dump_header,
+ (ri->event == TRC_SHADOW_RESYNC_FULL)?"full":"only",
+ r->gfn);
+
+ h->resyncs++;
+}
+
+void shadow_prealloc_unpin_process(struct record_info *ri, struct hvm_data *h) {
+ struct {
+ unsigned long long gfn;
+ } *r = (typeof(r))ri->d;
+
+ if(opt.dump_all)
+ printf(" %s prealloc-unpin gfn %llx\n",
+ ri->dump_header, r->gfn);
+
+ if(h->prealloc_unpin)
+ fprintf(warn, "Strange, more than one prealloc_unpin per arc!\n");
+
+ h->prealloc_unpin = 1;
+
+ if(opt.with_cr3_enumeration)
+ cr3_prealloc_unpin(h->v, r->gfn);
+}
+
+void shadow_wrmap_bf_process(struct record_info *ri, struct hvm_data *h) {
+ struct {
+ unsigned long long gfn;
+ } *r = (typeof(r))ri->d;
+
+ if(opt.dump_all)
+ printf(" %s wrmap-bf gfn %llx\n",
+ ri->dump_header, r->gfn);
+
+ h->wrmap_bf = 1;
+}
+
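+/* Dispatch shadow-pagetable trace records. Minor codes up to
+ * PF_XEN_LAST_FAULT annotate the page fault currently in flight; the
+ * paging-level bits of the event select which record layout to use. */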
+void shadow_process(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+ struct hvm_data *h;
+
+ union shadow_event sevt = { .event = ri->event };
+
+ int gpl = sevt.paging_levels + 2;
+
+ assert(p->current);
+ if(vcpu_set_data_type(p->current, VCPU_DATA_HVM))
+ return;
+
+ h = &p->current->hvm;
+
+ if(!h->init || !h->vmexit_valid)
+ {
+ if(opt.dump_all)
+ shadow_fault_generic_dump(ri->event,
+ ri->d,
+ "!", ri->dump_header);
+ return;
+ }
+
+ if(sevt.minor <= PF_XEN_NOT_SHADOW) {
+ if(p->current->guest_paging_levels != gpl)
+ {
+ fprintf(warn, "%s: Changing guest paging levels to %d\n",
+ __func__, gpl);
+ p->current->guest_paging_levels = gpl;
+ }
+ }
+
+ if(sevt.minor <= PF_XEN_LAST_FAULT) {
+ h->inflight.pf_xen.pf_case = sevt.minor;
+ if(opt.summary) {
+ hvm_set_summary_handler(h, hvm_pf_xen_summary, NULL);
+ }
+ }
+
+ /* FIXME - mask out paging levels */
+ switch(sevt.minor)
+ {
+ case PF_XEN_NOT_SHADOW:
+ shadow_propagate_process(ri, h);
+ break;
+ case PF_XEN_EMULATE:
+ shadow_emulate_process(ri, h);
+ break;
+ case PF_XEN_FIXUP:
+ shadow_fixup_process(ri, h);
+ break;
+ case PF_XEN_MMIO:
+ case PF_XEN_FAST_MMIO:
+ shadow_mmio_process(ri, h);
+ break;
+ case PF_XEN_EMULATE_UNSHADOW_USER:
+ case PF_XEN_EMULATE_UNSHADOW_EVTINJ:
+ case PF_XEN_EMULATE_UNSHADOW_UNHANDLED:
+ shadow_emulate_other_process(ri, h);
+ break;
+#if 0
+ case PF_XEN_EMULATE_UNSYNC:
+ shadow_unsync_process(ri, h);
+ break;
+#endif
+ case SHADOW_RESYNC_FULL:
+ case SHADOW_RESYNC_ONLY:
+ shadow_resync_process(ri, h);
+ break;
+ case SHADOW_PREALLOC_UNPIN:
+ shadow_prealloc_unpin_process(ri, h);
+ break;
+ case SHADOW_WRMAP_BF:
+ shadow_wrmap_bf_process(ri, h);
+ break;
+ default:
+ if(sevt.minor <= PF_XEN_LAST_FAULT) {
+ shadow_fault_generic_process(ri, h);
+ } else {
+ warn_once("Warning: processing shadow as generic\n");
+ process_generic(ri);
+ }
+ break;
+ }
+}
+
+/* ---- PV guests ---- */
+union pv_event {
+ unsigned event;
+ struct {
+ unsigned minor:8,
+ x64:1,
+ unused1:3,
+ sub:4,
+ main:12,
+ unused:4;
+ };
+};
+
+void pv_hypercall_process(struct record_info *ri, struct pv_data *pv) {
+ union {
+ struct {
+ uint32_t eip, eax;
+ } x32;
+ struct {
+ uint64_t eip;
+ uint32_t eax;
+ } x64;
+ } * r = (typeof(r)) ri->d;
+ union pv_event pevt = { .event = ri->event };
+ unsigned long long eip;
+ unsigned int eax;
+
+ if(pevt.x64) {
+ eip = r->x64.eip;
+ eax = r->x64.eax;
+ } else {
+ eip = r->x32.eip;
+ eax = r->x32.eax;
+ }
+
+ if(opt.summary_info) {
+ if(eax < PV_HYPERCALL_MAX)
+ pv->hypercall_count[eax]++;
+ }
+
+ if(opt.dump_all) {
+ if(eax < HYPERCALL_MAX)
+ printf(" %s hypercall %2x (%s) eip %llx\n",
+ ri->dump_header, eax,
+ hypercall_name[eax], eip);
+ else
+ printf(" %s hypercall %x eip %llx\n",
+ ri->dump_header, eax, eip);
+ }
+}
+
+void pv_trap_process(struct record_info *ri, struct pv_data *pv) {
+ union {
+ struct {
+ unsigned int eip;
+ unsigned trapnr:15,
+ use_error_code:1,
+ error_code:16;
+ } x32;
+ struct {
+ unsigned long long eip;
+ unsigned trapnr:15,
+ use_error_code:1,
+ error_code:16;
+ } x64;
+ } * r = (typeof(r)) ri->d;
+ union pv_event pevt = { .event = ri->event };
+ unsigned long long eip;
+ unsigned trapnr, use_ec, ec;
+
+ if(pevt.x64) {
+ eip = r->x64.eip;
+ trapnr = r->x64.trapnr;
+ use_ec = r->x64.use_error_code;
+ ec = r->x64.error_code;
+ } else {
+ eip = r->x32.eip;
+ trapnr = r->x32.trapnr;
+ use_ec = r->x32.use_error_code;
+ ec = r->x32.error_code;
+ }
+
+ if(opt.summary_info) {
+ if(trapnr < PV_TRAP_MAX)
+ pv->trap_count[trapnr]++;
+ }
+
+ if(opt.dump_all) {
+ printf(" %s trap %x eip %llx",
+ ri->dump_header, trapnr, eip);
+ if(use_ec)
+ printf(" ec %x\n", ec);
+ else
+ printf("\n");
+ }
+}
+
+void pv_ptwr_emulation_process(struct record_info *ri, struct pv_data *pv) {
+ union pv_event pevt = { .event = ri->event };
+ union {
+ /* gpl2 is deprecated */
+ struct {
+ unsigned long long pte;
+ unsigned int addr, eip;
+ } gpl3;
+ struct {
+ unsigned long long pte;
+ unsigned long long addr, eip;
+ } gpl4;
+ } *r = (typeof(r))ri->d;
+ struct {
+ unsigned long long pte, addr, eip;
+ } e;
+
+ switch ( pevt.minor ) {
+ case PV_PTWR_EMULATION_PAE:
+ if ( pevt.x64 )
+ {
+ fprintf(warn, "Strange: PV_PTWR_EMULATION, but x64! %x\n",
+ ri->event);
+ error(ERR_RECORD, ri);
+ }
+ e.pte = r->gpl3.pte;
+ e.addr = r->gpl3.addr;
+ e.eip = r->gpl3.eip;
+ break;
+ case PV_PTWR_EMULATION:
+ if ( !pevt.x64 )
+ {
+ fprintf(warn, "Strange: PV_PTWR_EMULATION, but !x64! %x\n",
+ ri->event);
+ error(ERR_RECORD, ri);
+ }
+ e.pte = r->gpl4.pte;
+ e.addr = r->gpl4.addr;
+ e.eip = r->gpl4.eip;
+ break;
+ default:
+ fprintf(warn, "ERROR: Unknown PV_PTRW minor type %d!\n",
+ pevt.minor);
+ error(ERR_RECORD, ri);
+ return;
+ }
+
+ if ( opt.dump_all )
+ {
+ printf(" %s ptwr l1e %llx eip %llx addr %llx\n",
+ ri->dump_header,
+ e.pte, e.eip, e.addr);
+ }
+}
+
+void pv_generic_process(struct record_info *ri, struct pv_data *pv) {
+ union pv_event pevt = { .event = ri->event };
+ if ( opt.dump_all ) {
+ if(pevt.minor < PV_MAX && pv_name[pevt.minor])
+ printf(" %s %s",
+ ri->dump_header,
+ pv_name[pevt.minor]);
+ else
+ printf(" %s PV-%d ",
+ ri->dump_header, pevt.minor);
+ if (ri->extra_words) {
+ int i;
+ printf("[ ");
+ for(i=0; i<ri->extra_words; i++) {
+ printf("%x ", (unsigned)ri->d[i]);
+ }
+ printf("]");
+
+ }
+ printf("\n");
+ }
+}
+
+void pv_summary(struct pv_data *pv) {
+ int i, j;
+
+ if(!pv->summary_info)
+ return;
+
+ printf("PV events:\n");
+ for(i=0; i<PV_MAX; i++) {
+ int count;
+
+ count = pv->count[i];
+ if (i == PV_HYPERCALL_V2)
+ count += pv->count[PV_HYPERCALL_SUBCALL];
+
+ if (count == 0)
+ continue;
+
+ printf(" %s %d\n", pv_name[i], count);
+
+ switch(i) {
+ case PV_HYPERCALL:
+ case PV_HYPERCALL_V2:
+ for(j=0; j<PV_HYPERCALL_MAX; j++) {
+ if(pv->hypercall_count[j])
+ printf(" %-29s[%2d]: %6d\n",
+ hypercall_name[j],
+ j,
+ pv->hypercall_count[j]);
+ }
+ break;
+ case PV_TRAP:
+ for(j=0; j<PV_TRAP_MAX; j++) {
+ if(pv->trap_count[j])
+ printf(" [%d] %d\n",
+ j, pv->trap_count[j]);
+ }
+ break;
+ }
+ }
+}
+
+static const char *grant_table_op_str[] = {
+ "map_grant_ref", "unmap_grant_ref", "setup_table", "dump_table",
+ "transfer", "copy", "query_size", "unmap_and_replace",
+ "set_version", "get_status_frames", "get_version", "swap_grant_ref",
+};
+
+static const char *vcpu_op_str[] = {
+ "initialise", "up", "down", "is_up", "get_runstate_info",
+ "register_runstate_memory_area", "set_periodic_timer",
+ "stop_periodic_timer", "set_singleshot_timer", "stop_singleshot_timer",
+ "register_vcpu_info", "send_nmi", "get_physid",
+ "register_vcpu_time_memory_area",
+};
+
+static const char *sched_op_str[] = {
+ "yield", "block", "shutdown", "poll", "remote_shutdown", "shutdown_code",
+ "watchdog",
+};
+
+static const char *cmd_to_str(const char *strings[], size_t n, uint32_t cmd)
+{
+ static char buf[32];
+
+ if (cmd < n)
+ return strings[cmd];
+
+ snprintf(buf, sizeof(buf), "unknown (%u)", cmd);
+ return buf;
+}
+
+#define CMD_TO_STR(op) \
+ static const char * op ## _to_str(uint32_t cmd) { \
+ return cmd_to_str(op ## _str, ARRAY_SIZE(op ## _str), cmd); \
+ }
+
+CMD_TO_STR(grant_table_op);
+CMD_TO_STR(vcpu_op);
+CMD_TO_STR(sched_op);
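+
+/* Illustrative usage sketch: each CMD_TO_STR() expansion above defines a
+ * bounds-checked lookup into the matching _str table. The command
+ * numbers here are hypothetical. */
+#if 0
+static void cmd_to_str_sketch(void)
+{
+ printf("%s\n", sched_op_to_str(0)); /* "yield" */
+ printf("%s\n", sched_op_to_str(99)); /* "unknown (99)" */
+}
+#endif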
+
+void pv_hypercall_gather_args(const struct record_info *ri, uint64_t *args)
+{
+ int i, word;
+
+ /* Missing arguments are zeroed. */
+ memset(args, 0, 6 * sizeof(uint64_t));
+
+ for (i = 0, word = 1; i < 6 && word < ri->extra_words; i++) {
+ int present = pv_hypercall_arg_present(ri, i);
+
+ switch (present) {
+ case ARG_32BIT:
+ args[i] = ri->d[word];
+ break;
+ case ARG_64BIT:
+ args[i] = ((uint64_t)ri->d[word + 1] << 32) | ri->d[word];
+ break;
+ }
+
+ /* Skip over any words for this argument. */
+ word += present;
+ }
+}
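+
+/* Illustrative sketch of the reassembly step above: a 64-bit argument
+ * arrives as two consecutive 32-bit trace words, low word first. The
+ * word values are hypothetical. */
+#if 0
+static void gather_arg_sketch(void)
+{
+ uint32_t lo = 0xdeadbeef, hi = 0x00000001;
+ uint64_t arg = ((uint64_t)hi << 32) | lo;
+
+ printf("%016"PRIx64"\n", arg); /* 00000001deadbeef */
+}
+#endif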
+
+static void pv_hypercall_print_args(const struct record_info *ri)
+{
+ int i, word;
+
+ for (i = 0, word = 1; i < 6 && word < ri->extra_words; i++) {
+ int present = pv_hypercall_arg_present(ri, i);
+
+ switch (present) {
+ case ARG_MISSING:
+ printf(" ??");
+ break;
+ case ARG_32BIT:
+ printf(" %08x", ri->d[word]);
+ break;
+ case ARG_64BIT:
+ printf(" %016"PRIu64"", ((uint64_t)ri->d[word + 1] << 32) | ri->d[word]);
+ break;
+ }
+
+ word += present;
+ }
+}
+
+void pv_hypercall_v2_process(struct record_info *ri, struct pv_data *pv,
+ const char *indent)
+{
+ int op = pv_hypercall_op(ri);
+
+ if(opt.summary_info) {
+ if(op < PV_HYPERCALL_MAX)
+ pv->hypercall_count[op]++;
+ }
+
+ if(opt.dump_all) {
+ uint64_t args[6];
+
+ if(op < HYPERCALL_MAX)
+ printf(" %s%s hypercall %2x (%s)",
+ ri->dump_header, indent, op, hypercall_name[op]);
+ else
+ printf(" %s%s hypercall %2x",
+ ri->dump_header, indent, op);
+
+ switch(op) {
+ case HYPERCALL_mmu_update:
+ pv_hypercall_gather_args(ri, args);
+ printf(" %d updates%s", (uint32_t)args[1] & ~MMU_UPDATE_PREEMPTED,
+ (args[1] & MMU_UPDATE_PREEMPTED) ? " (preempted)" : "");
+ break;
+ case HYPERCALL_multicall:
+ pv_hypercall_gather_args(ri, args);
+ printf(" %d calls", (uint32_t)args[1]);
+ break;
+ case HYPERCALL_grant_table_op:
+ pv_hypercall_gather_args(ri, args);
+ printf(" %s %d ops", grant_table_op_to_str(args[0]), (uint32_t)args[2]);
+ break;
+ case HYPERCALL_vcpu_op:
+ pv_hypercall_gather_args(ri, args);
+ printf(" %s vcpu %d", vcpu_op_to_str(args[0]), (uint32_t)args[1]);
+ break;
+ case HYPERCALL_mmuext_op:
+ pv_hypercall_gather_args(ri, args);
+ printf(" %d ops", (uint32_t)args[1]);
+ break;
+ case HYPERCALL_sched_op:
+ pv_hypercall_gather_args(ri, args);
+ printf(" %s", sched_op_to_str(args[0]));
+ break;
+ default:
+ pv_hypercall_print_args(ri);
+ break;
+ }
+ printf("\n");
+ }
+}
+
+void pv_process(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+ struct vcpu_data *v = p->current;
+ struct pv_data *pv = &v->pv;
+
+ union pv_event pevt = { .event = ri->event };
+
+ if(vcpu_set_data_type(p->current, VCPU_DATA_PV))
+ return;
+
+ if(opt.summary_info) {
+ pv->summary_info=1;
+
+ if(pevt.minor == PV_PTWR_EMULATION_PAE)
+ pv->count[PV_PTWR_EMULATION]++;
+ else
+ pv->count[pevt.minor]++;
+ }
+
+ switch(pevt.minor)
+ {
+ case PV_HYPERCALL:
+ pv_hypercall_process(ri, pv);
+ break;
+ case PV_TRAP:
+ pv_trap_process(ri, pv);
+ break;
+ case PV_PTWR_EMULATION:
+ case PV_PTWR_EMULATION_PAE:
+ pv_ptwr_emulation_process(ri, pv);
+ break;
+ case PV_HYPERCALL_V2:
+ pv_hypercall_v2_process(ri, pv, "");
+ break;
+ case PV_HYPERCALL_SUBCALL:
+ pv_hypercall_v2_process(ri, pv, " ");
+ break;
+ default:
+ pv_generic_process(ri, pv);
+ break;
+ }
+}
+
+/* ---- Schedule ---- */
+struct vcpu_data * vcpu_create(struct domain_data *d, int vid)
+{
+ struct vcpu_data *v;
+
+ assert(d->vcpu[vid] == NULL);
+
+ fprintf(warn, "Creating vcpu %d for dom %d\n", vid, d->did);
+
+ if((v=malloc(sizeof(*v)))==NULL)
+ {
+ fprintf(stderr, "%s: malloc %zd failed!\n", __func__, sizeof(*d));
+ error(ERR_SYSTEM, NULL);
+ }
+
+ bzero(v, sizeof(*v));
+
+ v->vid = vid;
+ v->d = d;
+ v->p = NULL;
+ v->runstate.state = RUNSTATE_INIT;
+ v->runstate.last_oldstate.wrong = RUNSTATE_INIT;
+
+ d->vcpu[vid] = v;
+
+ assert(v == v->d->vcpu[v->vid]);
+
+ if(vid > d->max_vid)
+ d->max_vid = vid;
+
+ return v;
+}
+
+/* Called by both domain_create and sched_default_domain_init */
+void domain_init(struct domain_data *d, int did)
+{
+ bzero(d, sizeof(*d));
+
+ d->did = did;
+ d->next = NULL;
+
+ if(opt.interval.check == INTERVAL_CHECK_DOMAIN)
+ interval_domain_value_check(d);
+}
+
+struct domain_data * domain_create(int did)
+{
+ struct domain_data *d;
+
+ fprintf(warn, "Creating domain %d\n", did);
+
+ if((d=malloc(sizeof(*d)))==NULL)
+ {
+ fprintf(stderr, "%s: malloc %zd failed!\n", __func__, sizeof(*d));
+ error(ERR_SYSTEM, NULL);
+ }
+
+ /* Initialize domain & vcpus */
+ domain_init(d, did);
+
+ return d;
+}
+
+struct domain_data * domain_find(int did)
+{
+ struct domain_data *d, *n, **q;
+
+ /* Look for domain, keeping track of the last pointer so we can add
+ a domain if we need to. */
+ for ( d = domain_list, q=&domain_list ;
+ d && (d->did < did) ;
+ q = &d->next, d=d->next ) ;
+
+ if(d && d->did == did)
+ return d;
+
+ /* Make a new domain */
+ n = domain_create(did);
+
+ /* Insert it into the list */
+ n->next = d;
+ *q = n;
+
+ return n;
+}
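+
+/* Illustrative sketch of the list idiom used in domain_find() above:
+ * walking with both a node pointer and a pointer-to-link means the new
+ * node can be spliced into the sorted singly-linked list with no special
+ * case for insertion at the head. Types here are hypothetical stand-ins. */
+#if 0
+struct node { int key; struct node *next; };
+
+static void sorted_insert_sketch(struct node **head, struct node *n)
+{
+ struct node *d, **q;
+
+ for ( d = *head, q = head ; d && d->key < n->key ; q = &d->next, d = d->next )
+ ;
+
+ n->next = d; /* n now precedes the first node with key >= n->key */
+ *q = n; /* and the previous link points at n */
+}
+#endif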
+
+struct vcpu_data * vcpu_find(int did, int vid)
+{
+ struct domain_data *d;
+ struct vcpu_data *v;
+
+ d = domain_find(did);
+
+ v = d->vcpu[vid];
+
+ if(!v)
+ v = vcpu_create(d, vid);
+
+ return v;
+}
+
+void pcpu_runstate_update(struct pcpu_info *p, tsc_t tsc)
+{
+ if ( p->time.tsc )
+ {
+ if ( p->current->d->did == IDLE_DOMAIN )
+ update_cycles(&p->time.idle, tsc - p->time.tsc);
+ else
+ update_cycles(&p->time.running, tsc - p->time.tsc);
+ p->time.tsc = 0;
+ }
+}
+
+void vcpu_prev_update(struct pcpu_info *p, struct vcpu_data *prev,
+ tsc_t tsc, int new_runstate)
+{
+ assert(prev == prev->d->vcpu[prev->vid]);
+
+ if(prev->p != p)
+ {
+ fprintf(warn, "Strange, sched_switch on pcpu %d, prev->pcpu %d!\n",
+ p->pid, prev->p->pid);
+ prev->runstate.tsc = 0;
+ goto set;
+ }
+
+ //assert(p->current);
+
+ if ( !p->current )
+ {
+ fprintf(warn, "%s: FATAL: p->current NULL!\n", __func__);
+ error(ERR_ASSERT, NULL);
+ }
+
+ if(p->current != prev)
+ {
+ fprintf(warn, "Strange, sched_switch prev d%dv%d, pcpu %d current d%dv%d!\n",
+ prev->d->did, prev->vid,
+ p->pid, p->current->d->did, p->current->vid);
+ prev->runstate.tsc = 0;
+ goto set;
+ }
+
+ if(prev->runstate.state != RUNSTATE_RUNNING)
+ {
+ fprintf(warn, "Strange, prev d%dv%d not running!\n",
+ prev->d->did, prev->vid);
+ prev->runstate.tsc = 0;
+ goto set;
+ }
+
+set:
+ pcpu_runstate_update(p, tsc);
+ p->current = NULL;
+ pcpu_string_draw(p);
+ runstate_update(prev, new_runstate, tsc);
+}
+
+void vcpu_next_update(struct pcpu_info *p, struct vcpu_data *next, tsc_t tsc)
+{
+ assert(next == next->d->vcpu[next->vid]);
+ //assert(p->current == NULL);
+
+ if ( p->current != NULL )
+ {
+ if ( p->lost_record.seen_valid_schedule == 0 )
+ {
+ fprintf(warn, "%s: p->current non-NULL, but seen_valid_schedule 0. Ignoring.\n",
+ __func__);
+ runstate_update(p->current, RUNSTATE_LOST, tsc);
+ p->current = NULL;
+ }
+ else
+ {
+ fprintf(warn, "%s: FATAL: p->current not NULL! (d%dv%d, runstate %s)\n",
+ __func__,
+ p->current->d->did,
+ p->current->vid,
+ runstate_name[p->current->runstate.state]);
+ error(ERR_ASSERT, NULL);
+ }
+ }
+
+ if(next->activated)
+ {
+ /* We may get lost records at start-of-day, so ignore
+ setting runstate of default vcpus */
+ if(next->runstate.state == RUNSTATE_RUNNING
+ && next->d->did != DEFAULT_DOMAIN)
+ {
+ fprintf(warn, "Strange, next d%dv%d already running on proc %d!\n",
+ next->d->did, next->vid,
+ next->p->pid);
+ next->runstate.tsc = 0;
+ }
+
+ /* If we're moving from one pcpu to another, record change & update tsc */
+ if(next->p != p) {
+ if(next->pcpu_tsc)
+ {
+ update_cycles(&next->cpu_affinity_all, tsc - next->pcpu_tsc);
+ update_cycles(&next->cpu_affinity_pcpu[p->pid], tsc - next->pcpu_tsc);
+ }
+ next->pcpu_tsc = tsc;
+ }
+ }
+ else
+ {
+ next->guest_paging_levels = opt.default_guest_paging_levels;
+ next->activated = 1;
+ next->pcpu_tsc = tsc;
+ }
+
+ runstate_update(next, RUNSTATE_RUNNING, tsc);
+
+ if ( opt.scatterplot_pcpu
+ && next->d->did != IDLE_DOMAIN
+ && next->d->did != DEFAULT_DOMAIN )
+ {
+ struct time_struct t;
+
+ abs_cycles_to_time(tsc, &t);
+
+ if ( next->p )
+ printf("%dv%d %u.%09u %d\n",
+ next->d->did, next->vid,
+ t.s, t.ns,
+ next->p->pid);
+ printf("%dv%d %u.%09u %d\n",
+ next->d->did, next->vid,
+ t.s, t.ns,
+ p->pid);
+ }
+
+ next->p = p;
+ p->current = next;
+ pcpu_string_draw(p);
+ p->time.tsc = tsc;
+ p->lost_record.seen_valid_schedule = 1;
+}
+
+/* If current is the default domain, we're fixing up from something
+ * like start-of-day. Update what we can. */
+void vcpu_start(struct pcpu_info *p, struct vcpu_data *v) {
+ /* If vcpus are created, or first show up, in a "dead zone", this will
+ * fail. */
+ if( !p->current || p->current->d->did != DEFAULT_DOMAIN) {
+ fprintf(stderr, "Strange, p->current not default domain!\n");
+ error(ERR_FILE, NULL);
+ return;
+ }
+
+ if(!p->first_tsc) {
+ fprintf(stderr, "Strange, p%d first_tsc 0!\n", p->pid);
+ error(ERR_FILE, NULL);
+ }
+
+ if(p->first_tsc <= p->current->runstate.tsc) {
+ fprintf(stderr, "Strange, first_tsc %llx < default_domain runstate tsc %llx!\n",
+ p->first_tsc,
+ p->current->runstate.tsc);
+ error(ERR_FILE, NULL);
+ }
+
+ /* Change default domain to 'queued' */
+ runstate_update(p->current, RUNSTATE_QUEUED, p->first_tsc);
+
+ /* FIXME: Copy over data from the default domain this interval */
+ fprintf(warn, "Using first_tsc for d%dv%d (%lld cycles)\n",
+ v->d->did, v->vid, p->last_tsc - p->first_tsc);
+
+ /* Simulate the time since the first tsc */
+ runstate_update(v, RUNSTATE_RUNNING, p->first_tsc);
+ p->time.tsc = p->first_tsc;
+ p->current = v;
+ pcpu_string_draw(p);
+ v->p = p;
+}
+
+void sched_runstate_process(struct pcpu_info *p)
+{
+ enum {
+ CHANGE=0,
+ CONTINUE
+ } type;
+ struct vcpu_data *v;
+ struct record_info *ri = &p->ri;
+ struct {
+ unsigned vcpu:16, dom:16;
+ unsigned long long p1, p2;
+ } __attribute__((packed)) * r = (typeof(r))ri->d;
+ union {
+ unsigned int event;
+ struct {
+ unsigned lo:4,
+ new_runstate:4,
+ old_runstate:4,
+ sub:4,
+ main:12,
+ unused:4;
+ };
+ } _sevt = { .event = ri->event };
+ struct {
+ int new_runstate, old_runstate;
+ } sevt;
+ int perfctrs;
+ struct last_oldstate_struct last_oldstate;
+
+ switch(_sevt.lo)
+ {
+ case 1:
+ type = CHANGE;
+ sevt.new_runstate = _sevt.new_runstate;
+ sevt.old_runstate = _sevt.old_runstate;
+ break;
+ case 2:
+ type = CONTINUE;
+ sevt.new_runstate = sevt.old_runstate = RUNSTATE_RUNNING;
+ break;
+ default:
+ fprintf(warn, "FATAL: Unexpected runstate change type %d!\n",
+ _sevt.lo);
+ error(ERR_RECORD, NULL);
+ return;
+ }
+
+ perfctrs = (ri->extra_words == 5);
+
+ if(opt.dump_all) {
+ if( perfctrs ) {
+ printf(" %s %s {%lld,%lld} d%uv%u %s->%s\n",
+ ri->dump_header,
+ type?"runstate_continue":"runstate_change",
+ r->p1, r->p2,
+ r->dom, r->vcpu,
+ runstate_name[sevt.old_runstate],
+ runstate_name[sevt.new_runstate]);
+ } else {
+ printf(" %s %s d%uv%u %s->%s\n",
+ ri->dump_header,
+ type?"runstate_continue":"runstate_change",
+ r->dom, r->vcpu,
+ runstate_name[sevt.old_runstate],
+ runstate_name[sevt.new_runstate]);
+ }
+ }
+
+ /* Sanity check: expected transitions */
+ if ( type == CHANGE )
+ {
+ if( (sevt.new_runstate == RUNSTATE_RUNNING
+ && sevt.old_runstate != RUNSTATE_RUNNABLE)
+ || (sevt.new_runstate == RUNSTATE_BLOCKED
+ && sevt.old_runstate == RUNSTATE_RUNNABLE ) )
+ {
+ fprintf(warn, "Strange, d%dv%d unexpected runstate transition %s->%s\n",
+ r->dom, r->vcpu,
+ runstate_name[sevt.old_runstate],
+ runstate_name[sevt.new_runstate]);
+ }
+ }
+
+ if(r->vcpu >= MAX_CPUS)
+ {
+ fprintf(warn, "%s: vcpu %u >= MAX_CPUS %d!\n",
+ __func__, r->vcpu, MAX_CPUS);
+ return;
+ }
+
+ v = vcpu_find(r->dom, r->vcpu);
+
+ /* We want last_oldstate reset every time; so copy the last one and use
+ * that locally, clobbering the one in the vcpu struct. If it needs to
+ * be reset, it will be reset below. */
+ last_oldstate = v->runstate.last_oldstate;
+ v->runstate.last_oldstate.wrong = RUNSTATE_INIT;
+
+ /* Close vmexits when the putative reason for blocking / &c stops.
+ * This way, we don't account cpu contention to some other overhead. */
+ if(sevt.new_runstate == RUNSTATE_RUNNABLE
+ && v->data_type == VCPU_DATA_HVM
+ && v->hvm.vmexit_valid) {
+ hvm_close_vmexit(&v->hvm, ri->tsc);
+ }
+
+ /* Track waking state */
+ if ( v->data_type == VCPU_DATA_HVM && v->runstate.state != RUNSTATE_LOST ) {
+ if ( sevt.new_runstate == RUNSTATE_RUNNABLE
+ && sevt.old_runstate == RUNSTATE_BLOCKED )
+ {
+ /* Hmm... want to make sure we're not in some weird
+ vmexit state... have to look later. */
+ if(opt.dump_all)
+ printf(" [w2h] d%dv%d Setting waking\n", v->d->did, v->vid);
+ v->hvm.w2h.waking = 1;
+ }
+ else if ( sevt.new_runstate != RUNSTATE_RUNNING
+ || sevt.old_runstate != RUNSTATE_RUNNABLE )
+ {
+ if( v->hvm.w2h.waking
+ && sevt.old_runstate == RUNSTATE_RUNNING
+ && sevt.new_runstate != RUNSTATE_OFFLINE )
+ {
+ /* NB: This is printed a lot unnecessarily when there is TSC skew */
+ if ( sevt.old_runstate != v->runstate.state )
+ fprintf(warn, "Strange, unexpected waking transition for d%dv%d: %s -> %s\n",
+ v->d->did, v->vid,
+ runstate_name[sevt.old_runstate],
+ runstate_name[sevt.new_runstate]);
+ v->hvm.w2h.waking = 0;
+ }
+
+ /* Close wake-to-halt summary */
+ /* FIXME: Need to think about handling preemption. */
+ if (sevt.new_runstate == RUNSTATE_BLOCKED
+ && sevt.old_runstate == RUNSTATE_RUNNING
+ && v->hvm.w2h.interrupts ) {
+ int i;
+ for(i=0; i<GUEST_INTERRUPT_MAX; i++) {
+ struct hvm_gi_struct *g=v->hvm.summary.guest_interrupt + i;
+ tsc_t start_tsc = g->start_tsc;
+ if(start_tsc) {
+ tsc_t t = (start_tsc == 1) ? 0 : ri->tsc - start_tsc;
+ if(opt.dump_all)
+ printf(" [w2h] Halting vec %d is_wake %d time %lld\n",
+ i,
+ g->is_wake,
+ t);
+
+ if(opt.scatterplot_wake_to_halt
+ && t
+ && g->is_wake)
+ scatterplot_vs_time(ri->tsc, t);
+
+ if(opt.summary && t) {
+ if(g->is_wake) {
+ if(v->hvm.w2h.interrupts==1)
+ update_cycles(&g->runtime[GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ALONE],
+ t);
+ update_cycles(&g->runtime[GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ANY],
+ t);
+ } else {
+ update_cycles(&g->runtime[GUEST_INTERRUPT_CASE_INTERRUPT_TO_HALT],
+ t);
+ }
+ }
+ g->start_tsc = 0;
+ g->is_wake = 0;
+ }
+ }
+ v->hvm.w2h.interrupts = 0;
+ v->hvm.w2h.vector = 0;
+ }
+ }
+ }
+
+ /* Sanity checks / tsc skew detection */
+ if( v->runstate.state != sevt.old_runstate
+ && v->runstate.state != RUNSTATE_INIT )
+ {
+ if(v->runstate.state == RUNSTATE_LOST) {
+ if( sevt.new_runstate == RUNSTATE_RUNNING )
+ goto update;
+ else if(opt.dump_all)
+ fprintf(warn, "%s: d%dv%d in runstate lost, not updating to %s\n",
+ __func__, v->d->did, v->vid,
+ runstate_name[sevt.new_runstate]);
+ goto no_update;
+ } else if (last_oldstate.wrong == sevt.new_runstate
+ && last_oldstate.actual == sevt.old_runstate) {
+ tsc_t lag, old_offset;
+ struct pcpu_info *p2;
+
+ if(ri->tsc < last_oldstate.tsc) {
+ fprintf(warn, "WARNING: new tsc %lld < detected runstate tsc %lld! Not updating\n",
+ ri->tsc, last_oldstate.tsc);
+ goto no_update;
+ }
+
+ p2 = P.pcpu + last_oldstate.pid;
+
+ lag = ri->tsc
+ - last_oldstate.tsc;
+
+ old_offset = p2->tsc_skew.offset;
+
+ cpumask_union(&p2->tsc_skew.downstream, &p->tsc_skew.downstream);
+ cpumask_set(&p2->tsc_skew.downstream, p->pid);
+
+ if(cpumask_isset(&p2->tsc_skew.downstream, p2->pid)) {
+ if ( opt.tsc_loop_fatal )
+ {
+ fprintf(stderr, "FATAL: tsc skew dependency loop detected!\n");
+ error(ERR_FILE, NULL);
+ }
+ else
+ {
+ int i;
+ fprintf(warn, "Tsc skew dependency loop detected! Resetting...\n");
+ for ( i=0; i<=P.max_active_pcpu; i++)
+ {
+ struct pcpu_info *p = P.pcpu + i;
+
+ p->tsc_skew.offset = 0;
+ cpumask_init(&p->tsc_skew.downstream);
+ }
+ goto no_update;
+ }
+ }
+
+ p2->tsc_skew.offset += lag * 2;
+
+ fprintf(warn, "TSC skew detected p%d->p%d, %lld cycles. Changing p%d offset from %lld to %lld\n",
+ p->pid, p2->pid, lag,
+ p2->pid,
+ old_offset,
+ p2->tsc_skew.offset);
+
+ goto no_update;
+ } else {
+ fprintf(warn, "runstate_change old_runstate %s, d%dv%d runstate %s. Possible tsc skew.\n",
+ runstate_name[sevt.old_runstate],
+ v->d->did, v->vid,
+ runstate_name[v->runstate.state]);
+
+ v->runstate.last_oldstate.wrong = sevt.old_runstate;
+ v->runstate.last_oldstate.actual = v->runstate.state;
+ v->runstate.last_oldstate.tsc = ri->tsc;
+ v->runstate.last_oldstate.pid = p->pid;
+
+ if ( v->runstate.state == RUNSTATE_RUNNING )
+ {
+ fprintf(warn, " Not updating.\n");
+ goto no_update;
+ }
+ goto update;
+ }
+ fprintf(stderr, "FATAL: Logic hole in %s\n", __func__);
+ error(ERR_ASSERT, NULL);
+ }
+
+update:
+ /* Actually update the runstate. Special things to do if we're starting
+ * or stopping actually running on a physical cpu. */
+ if ( type == CONTINUE )
+ {
+ if( v->runstate.state == RUNSTATE_INIT ) {
+ /* Start-of-day; account first tsc -> now to v */
+ vcpu_start(p, v);
+ } else {
+ /* Continue running. First, do some sanity checks */
+ if ( v->runstate.state == RUNSTATE_LOST ) {
+ fprintf(warn, "WARNING: continue with d%dv%d in RUNSTATE_LOST. Resetting current.\n",
+ v->d->did, v->vid);
+ if ( p->current )
+ vcpu_prev_update(p, p->current, ri->tsc, RUNSTATE_LOST);
+ vcpu_next_update(p, v, ri->tsc);
+ }
+ else if( v->runstate.state != RUNSTATE_RUNNING ) {
+ /* This should never happen. */
+ fprintf(warn, "FATAL: sevt.old_runstate running, but d%dv%d runstate %s!\n",
+ v->d->did, v->vid, runstate_name[v->runstate.state]);
+ error(ERR_FILE, NULL);
+ } else if ( v->p != p ) {
+ fprintf(warn, "FATAL: continue on p%d, but d%dv%d p%d!\n",
+ p->pid, v->d->did, v->vid,
+ v->p ? v->p->pid : -1);
+ error(ERR_FILE, NULL);
+ }
+
+ runstate_update(v, RUNSTATE_RUNNING, ri->tsc);
+ }
+ }
+ else if ( sevt.old_runstate == RUNSTATE_RUNNING
+ || v->runstate.state == RUNSTATE_RUNNING )
+ {
+#if 0
+ /* A lot of traces include cpi that shouldn't... */
+ if(perfctrs && v->runstate.tsc) {
+ unsigned long long run_cycles, run_instr;
+ double cpi;
+
+ //run_cycles = r->p1 - v->runstate_p1_start;
+ run_cycles = ri->tsc - v->runstate.tsc;
+ run_instr = r->p2 - v->runstate.p2_start;
+
+ cpi = ((double)run_cycles) / run_instr;
+
+ if(opt.dump_all) {
+ printf(" cpi: %2.2lf ( %lld / %lld )\n",
+ cpi, run_cycles, run_instr);
+ }
+
+ if(opt.scatterplot_cpi && v->d->did == 1)
+ printf("%lld,%2.2lf\n",
+ ri->tsc, cpi);
+
+ if(opt.summary_info)
+ update_cpi(&v->cpi, run_instr, run_cycles);
+ }
+#endif
+ /*
+ * Cases:
+ * old running, v running:
+ * normal (prev update p, lost record check)
+ * v running, old ! running:
+ * tsc skew (prev update v->p, lost record check)
+ * old running, v init:
+ * start-of-day (fake update, prev p, lost record)
+ * old running, v !{running,init}:
+ * # (should never happen)
+ */
+ if( sevt.old_runstate == RUNSTATE_RUNNING ) {
+ if( v->runstate.state == RUNSTATE_INIT ) {
+ /* Start-of-day; account first tsc -> now to v */
+ vcpu_start(p, v);
+ } else if( v->runstate.state != RUNSTATE_RUNNING
+ && v->runstate.state != RUNSTATE_LOST ) {
+ /* This should never happen. */
+ fprintf(warn, "FATAL: sevt.old_runstate running, but d%dv%d runstate %s!\n",
+ v->d->did, v->vid, runstate_name[v->runstate.state]);
+ error(ERR_FILE, NULL);
+ }
+
+ vcpu_prev_update(p, v, ri->tsc, sevt.new_runstate);
+ } else {
+ vcpu_prev_update(v->p, v, ri->tsc, sevt.new_runstate);
+ }
+
+ if(P.lost_cpus && v->d->did != IDLE_DOMAIN) {
+ if(opt.dump_all)
+ fprintf(warn, "%s: %d lost cpus, setting d%dv%d runstate to RUNSTATE_LOST\n",
+ __func__, P.lost_cpus, v->d->did, v->vid);
+ lose_vcpu(v, ri->tsc);
+ }
+ }
+ else if ( sevt.new_runstate == RUNSTATE_RUNNING )
+ {
+ if(perfctrs) {
+ v->runstate.p1_start = r->p1;
+ v->runstate.p2_start = r->p2;
+ }
+
+ vcpu_next_update(p, v, ri->tsc);
+ }
+ else if ( v->runstate.state != RUNSTATE_INIT )
+ {
+ /* TSC skew at start-of-day is hard to deal with. Don't
+ * bring a vcpu out of INIT until it's seen to be actually
+ * running somewhere. */
+ runstate_update(v, sevt.new_runstate, ri->tsc);
+ }
+
+no_update:
+ return;
+}
+
+void sched_switch_process(struct pcpu_info *p)
+{
+ struct vcpu_data *prev, *next;
+ struct record_info *ri = &p->ri;
+ struct {
+ unsigned int prev_dom, prev_vcpu, next_dom, next_vcpu;
+ } * r = (typeof(r))ri->d;
+
+ if(opt.dump_all)
+ printf("%s sched_switch prev d%uv%u next d%uv%u\n",
+ ri->dump_header,
+ r->prev_dom, r->prev_vcpu,
+ r->next_dom, r->next_vcpu);
+
+ if(r->prev_vcpu >= MAX_CPUS)
+ {
+ fprintf(warn, "%s: prev_vcpu %u >= MAX_CPUS %d!\n",
+ __func__, r->prev_vcpu, MAX_CPUS);
+ return;
+ }
+
+ if(r->next_vcpu >= MAX_CPUS)
+ {
+ fprintf(warn, "%s: next_vcpu %u >= MAX_CPUS %d!\n",
+ __func__, r->next_vcpu, MAX_CPUS);
+ return;
+ }
+
+ prev = vcpu_find(r->prev_dom, r->prev_vcpu);
+ next = vcpu_find(r->next_dom, r->next_vcpu);
+
+ vcpu_prev_update(p, prev, ri->tsc, RUNSTATE_QUEUED); /* FIXME */
+
+ vcpu_next_update(p, next, ri->tsc);
+}
+
+void sched_default_vcpu_activate(struct pcpu_info *p)
+{
+ struct vcpu_data *v = default_domain.vcpu[p->pid];
+
+ if(!v)
+ v = vcpu_create(&default_domain, p->pid);
+
+ assert(v == v->d->vcpu[v->vid]);
+
+ v->activated = 1;
+ v->guest_paging_levels = opt.default_guest_paging_levels;
+ v->p = p;
+ v->runstate.state = RUNSTATE_RUNNING;
+
+ p->current = v;
+ pcpu_string_draw(p);
+}
+
+void sched_default_domain_init(void)
+{
+ struct domain_data *d = &default_domain;
+
+ domain_init(d, DEFAULT_DOMAIN);
+}
+
+void runstate_clear(tsc_t * runstate_cycles)
+{
+ int i;
+ for(i=0; i<RUNSTATE_MAX; i++)
+ runstate_cycles[i]=0;
+}
+
+void runstate_summary(tsc_t * runstate_cycles)
+{
+ int i;
+ for(i=0; i<RUNSTATE_MAX; i++)
+ if(runstate_cycles[i]) {
+ struct time_struct t;
+ cycles_to_time(runstate_cycles[i], &t);
+ printf(" %s: %u.%09u s\n",
+ runstate_name[i], t.s, t.ns);
+ }
+}
+
+void sched_summary_vcpu(struct vcpu_data *v)
+{
+ int i;
+ char desc[30];
+
+ /* FIXME: Update all records like this */
+ if ( v->pcpu_tsc )
+ {
+ update_cycles(&v->cpu_affinity_all, P.f.last_tsc - v->pcpu_tsc);
+ update_cycles(&v->cpu_affinity_pcpu[v->p->pid], P.f.last_tsc - v->pcpu_tsc);
+ }
+
+ printf(" Runstates:\n");
+ for(i=0; i<RUNSTATE_MAX; i++) {
+ snprintf(desc,30, " %8s", runstate_name[i]);
+ print_cycle_summary(v->runstates+i, desc);
+ if ( i==RUNSTATE_RUNNABLE )
+ {
+ int j;
+ for(j=0; j<RUNNABLE_STATE_MAX; j++) {
+ if ( j == RUNNABLE_STATE_INVALID )
+ continue;
+ snprintf(desc,30, " %8s", runnable_state_name[j]);
+ print_cycle_summary(v->runnable_states+j, desc);
+ }
+ }
+ }
+ print_cpi_summary(&v->cpi);
+ print_cpu_affinity(&v->cpu_affinity_all, " cpu affinity");
+ for ( i = 0; i < MAX_CPUS ; i++)
+ {
+ snprintf(desc,30, " [%d]", i);
+ print_cpu_affinity(v->cpu_affinity_pcpu+i, desc);
+ }
+}
+
+void sched_summary_domain(struct domain_data *d)
+{
+ int i;
+ char desc[30];
+
+ printf(" Runstates:\n");
+ for(i=0; i<DOMAIN_RUNSTATE_MAX; i++) {
+ snprintf(desc,30, " %8s", domain_runstate_name[i]);
+ print_cycle_summary(d->runstates+i, desc);
+ }
+}
+
+
+void sched_process(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+
+ if(ri->evt.sub == 0xf) {
+ switch(ri->event)
+ {
+ case TRC_SCHED_SWITCH:
+ sched_switch_process(p);
+ break;
+ default:
+ process_generic(&p->ri);
+ }
+ } else {
+ if(ri->evt.sub == 1)
+ sched_runstate_process(p);
+ else {
+ UPDATE_VOLUME(p, sched_verbose, ri->size);
+ process_generic(&p->ri);
+ }
+ }
+}
+
+/* ---- Memory ---- */
+void mem_summary_domain(struct domain_data *d) {
+ int i, j;
+
+ printf(" Grant table ops:\n");
+
+ printf(" Done by:\n");
+ for(i=0; i<MEM_MAX; i++)
+ if(d->memops.done[i])
+ printf(" %-14s: %d\n",
+ mem_name[i],
+ d->memops.done[i]);
+
+ printf(" Done for:\n");
+ for(i=0; i<MEM_MAX; i++)
+ if(d->memops.done_for[i])
+ printf(" %-14s: %d\n",
+ mem_name[i],
+ d->memops.done_for[i]);
+
+ printf(" Populate-on-demand:\n");
+ printf(" Populated:\n");
+ for(i=0; i<4; i++)
+ {
+ if ( d->pod.populate_order[i] )
+ printf(" [%d] %d\n", i,
+ d->pod.populate_order[i]);
+ }
+ printf(" Reclaim order:\n");
+ for(i=0; i<4; i++)
+ {
+ if ( d->pod.reclaim_order[i] )
+ printf(" [%d] %d\n", i,
+ d->pod.reclaim_order[i]);
+ }
+ printf(" Reclaim contexts:\n");
+ for(j=0; j<POD_RECLAIM_CONTEXT_MAX; j++)
+ {
+ if ( d->pod.reclaim_context[j] )
+ {
+ printf(" * [%s] %d\n",
+ pod_reclaim_context_name[j],
+ d->pod.reclaim_context[j]);
+ for(i=0; i<4; i++)
+ {
+ if ( d->pod.reclaim_context_order[j][i] )
+ printf(" [%d] %d\n", i,
+ d->pod.reclaim_context_order[j][i]);
+ }
+ }
+ }
+}
+
+int p2m_canonical_order(int order)
+{
+ if ( order % 9
+ || (order / 9) > 2 )
+ {
+ fprintf(warn, "%s: Strange, non-canonical order %d\n",
+ __func__, order);
+ order = 4;
+ } else {
+ order /= 9;
+ }
+ return order;
+}
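+
+/* Illustrative sketch: on x86, p2m orders arrive as page orders (0 for
+ * 4KiB, 9 for 2MiB, 18 for 1GiB) and are folded to indexes 0-2; any
+ * other value is reported as strange and mapped to 4. */
+#if 0
+static void p2m_order_sketch(void)
+{
+ assert(p2m_canonical_order(0) == 0); /* 4KiB */
+ assert(p2m_canonical_order(9) == 1); /* 2MiB */
+ assert(p2m_canonical_order(18) == 2); /* 1GiB */
+ assert(p2m_canonical_order(5) == 4); /* non-canonical */
+}
+#endif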
+
+void mem_pod_zero_reclaim_process(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+ int context = POD_RECLAIM_CONTEXT_UNKNOWN;
+ struct vcpu_data *v = p->current;
+
+ struct {
+ uint64_t gfn, mfn;
+ int d:16,order:16;
+ } *r = (typeof(r))ri->d;
+
+ if ( v && v->hvm.vmexit_valid )
+ {
+ switch(v->hvm.exit_reason)
+ {
+ case EXIT_REASON_EPT_VIOLATION:
+ case EXIT_REASON_EXCEPTION_NMI:
+ context = POD_RECLAIM_CONTEXT_FAULT;
+ break;
+ case EXIT_REASON_VMCALL:
+ context = POD_RECLAIM_CONTEXT_BALLOON;
+ break;
+ }
+ }
+
+ if ( opt.dump_all )
+ {
+ printf(" %s pod_zero_reclaim d%d o%d g %llx m %llx ctx %s\n",
+ ri->dump_header,
+ r->d, r->order,
+ (unsigned long long)r->gfn, (unsigned long long)r->mfn,
+ pod_reclaim_context_name[context]);
+
+ }
+
+ if ( opt.summary_info )
+ {
+ struct domain_data *d;
+
+ if ( v && (d=v->d) )
+ {
+ int order;
+
+ order = p2m_canonical_order(r->order);
+
+ d->pod.reclaim_order[order]++;
+ d->pod.reclaim_context[context]++;
+ d->pod.reclaim_context_order[context][order]++;
+ }
+ }
+}
+
+void mem_pod_populate_process(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+
+ struct {
+ uint64_t gfn, mfn;
+ int d:16,order:16;
+ } *r = (typeof(r))ri->d;
+
+ if ( opt.dump_all )
+ {
+ printf(" %s pod_populate d%d o%d g %llx m %llx\n",
+ ri->dump_header,
+ r->d, r->order,
+ (unsigned long long)r->gfn, (unsigned long long)r->mfn);
+ }
+
+ if ( opt.summary_info )
+ {
+ struct vcpu_data *v = p->current;
+ struct domain_data *d;
+
+ if ( v && (d=v->d) )
+ {
+ int order;
+
+ order = p2m_canonical_order(r->order);
+
+ d->pod.populate_order[order]++;
+ }
+ }
+}
+
+void mem_pod_superpage_splinter_process(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+
+ struct {
+ uint64_t gfn;
+ int d:16;
+ } *r = (typeof(r))ri->d;
+
+ if ( opt.dump_all )
+ {
+ printf(" %s pod_spage_splinter d%d g %llx\n",
+ ri->dump_header,
+ r->d, (unsigned long long)r->gfn);
+ }
+}
+
+void mem_page_grant(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+
+ struct {
+ unsigned domain;
+ } *r = (typeof(r))ri->d;
+ union pv_event pevt = { .event = ri->event };
+
+ if ( opt.dump_all )
+ {
+ printf(" %s %s domain %u\n", ri->dump_header, mem_name[pevt.minor], r->domain);
+ }
+}
+
+void mem_set_p2m_entry_process(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+
+ struct {
+ uint64_t gfn, mfn;
+ int p2mt;
+ int d:16,order:16;
+ } *r = (typeof(r))ri->d;
+
+ if ( opt.dump_all )
+ {
+ printf(" %s set_p2m_entry d%d o%d t %d g %llx m %llx\n",
+ ri->dump_header,
+ r->d, r->order,
+ r->p2mt,
+ (unsigned long long)r->gfn, (unsigned long long)r->mfn);
+ }
+}
+
+void mem_decrease_reservation_process(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+
+ struct {
+ uint64_t gfn;
+ int d:16,order:16;
+ } *r = (typeof(r))ri->d;
+
+ if ( opt.dump_all )
+ {
+ printf(" %s decrease_reservation d%d o%d g %llx\n",
+ ri->dump_header,
+ r->d, r->order,
+ (unsigned long long)r->gfn);
+ }
+}
+
+void mem_process(struct pcpu_info *p) {
+ struct record_info *ri = &p->ri;
+ struct {
+ int dom;
+ } *r = (typeof(r))ri->d;
+
+ int minor = ri->evt.minor;
+
+ switch ( minor )
+ {
+ case MEM_PAGE_GRANT_MAP:
+ case MEM_PAGE_GRANT_UNMAP:
+ case MEM_PAGE_GRANT_TRANSFER:
+ mem_page_grant(p);
+ break;
+ case MEM_SET_P2M_ENTRY:
+ mem_set_p2m_entry_process(p);
+ break;
+ case MEM_DECREASE_RESERVATION:
+ mem_decrease_reservation_process(p);
+ break;
+ case MEM_POD_POPULATE:
+ mem_pod_populate_process(p);
+ break;
+ case MEM_POD_ZERO_RECLAIM:
+ mem_pod_zero_reclaim_process(p);
+ break;
+ case MEM_POD_SUPERPAGE_SPLINTER:
+ mem_pod_superpage_splinter_process(p);
+ break;
+ default:
+ if(opt.dump_all) {
+ dump_generic(stdout, ri);
+ }
+
+ if(opt.summary_info && minor < MEM_MAX) {
+ struct domain_data *d;
+
+ if(p->current) {
+ if (p->current->d) {
+ p->current->d->memops.done[minor]++;
+ p->current->d->memops.done_interval[minor]++;
+ }
+ if((d=domain_find(r->dom))) {
+ d->memops.done_for[minor]++;
+ d->memops.done_for_interval[minor]++;
+ }
+ }
+ }
+ break;
+ }
+
+}
+
+/* ---- PM ---- */
+#define CSTATE_MAX 5
+#define CSTATE_INVALID ((CSTATE_MAX)+1)
+void pm_process(struct pcpu_info *p) {
+ struct record_info *ri = &p->ri;
+
+ switch ( ri->event )
+ {
+ case TRC_PM_FREQ_CHANGE:
+ if (opt.dump_all )
+ printf(" %s pm_freq_change o%d n%d\n",
+ ri->dump_header,
+ ri->d[0],
+ ri->d[1]);
+ break;
+ case TRC_PM_IDLE_ENTRY:
+ if (opt.dump_all )
+ printf(" %s pm_idle_start c%d\n",
+ ri->dump_header,
+ ri->d[0]);
+ if ( ri->d[0] <= CSTATE_MAX )
+ {
+ p->power_state=ri->d[0];
+ pcpu_string_draw(p);
+ }
+ break;
+ case TRC_PM_IDLE_EXIT:
+ if (opt.dump_all )
+ printf(" %s pm_idle_end c%d\n",
+ ri->dump_header,
+ ri->d[0]);
+ if ( p->power_state != ri->d[0]
+ && p->power_state != CSTATE_INVALID )
+ printf("Strange, pm_idle_end %d, power_state %d!\n",
+ ri->d[0], p->power_state);
+ p->power_state = 0;
+ pcpu_string_draw(p);
+ break;
+ default:
+ if(opt.dump_all) {
+ dump_generic(stdout, ri);
+ }
+ break;
+ }
+
+}
+
+/*
+ * IRQ related stuff
+ */
+
+#define MAX_VECTOR 256
+int global_vector_used[MAX_VECTOR] = {0};
+struct pci_dev {
+ uint8_t bus;
+ uint8_t devfn;
+ int vector_used[MAX_VECTOR];
+ struct pci_dev *next;
+} *pdev_list;
+
+#define MAX_IRQ 512
+struct irq_desc {
+ enum {
+ IRQ_NONE,
+ IRQ_MSI,
+ IRQ_GSI
+ } type;
+ struct pci_dev *dev;
+} irq_table[MAX_IRQ];
+
+struct pci_dev * pdev_find(uint8_t bus, uint8_t devfn)
+{
+ struct pci_dev *d, *n, **q;
+
+ /* Look for the device, keeping track of the last pointer so we can
+ add a device if we need to. */
+ for ( d = pdev_list, q=&pdev_list ;
+ d && ( (d->bus < bus)
+ || (d->bus == bus && d->devfn < devfn) ) ;
+ q = &d->next, d=d->next ) ;
+
+ if(d && d->bus == bus && d->devfn == devfn)
+ return d;
+
+ /* Make a new device */
+ fprintf(warn, "Creating pdev %02x:%02x.%x\n", bus, devfn>>3, devfn&7);
+
+ if((n=malloc(sizeof(*n)))==NULL)
+ {
+ fprintf(stderr, "%s: malloc %zd failed!\n", __func__, sizeof(*n));
+ error(ERR_SYSTEM, NULL);
+ }
+
+ bzero(n, sizeof(*n));
+
+ n->bus=bus;
+ n->devfn=devfn;
+
+ /* Insert it into the list */
+ n->next = d;
+ *q = n;
+
+ return n;
+}
+
+void irq_process(struct pcpu_info *p) {
+ struct record_info *ri = &p->ri;
+
+ switch ( ri->event )
+ {
+ case TRC_HW_IRQ_BIND_VECTOR:
+ {
+ struct {
+ int irq, vec;
+ unsigned mask[4];
+ } *r = (typeof(r))ri->d;
+ if ( opt.dump_all )
+ {
+ printf(" %s irq_bind_vector irq %x vec %x mask %04x %04x %04x %04x\n",
+ ri->dump_header,
+ r->irq, r->vec,
+ r->mask[3],
+ r->mask[2],
+ r->mask[1],
+ r->mask[0]);
+ }
+ break;
+ }
+ case TRC_HW_IRQ_HANDLED:
+ {
+ struct {
+ int irq, start_tsc, end_tsc;
+ } *r = (typeof(r))ri->d;
+ int arctime;
+
+ arctime = r->end_tsc - r->start_tsc;
+ if ( opt.dump_all )
+ {
+ printf(" %s irq_handled irq %x %d (%d,%d)\n",
+ ri->dump_header,
+ r->irq, arctime, r->start_tsc, r->end_tsc);
+ }
+ if ( opt.scatterplot_irq )
+ {
+ struct time_struct t;
+
+ abs_cycles_to_time(ri->tsc, &t);
+
+ printf("i%x %u.%09u %d\n",
+ (unsigned)r->irq,
+ t.s, t.ns,
+ p->pid);
+ }
+ break;
+ }
+ case TRC_HW_IRQ_ASSIGN_VECTOR:
+ {
+ struct {
+ int irq, vec;
+ unsigned mask[4];
+ } *r = (typeof(r))ri->d;
+ if ( opt.dump_all )
+ {
+ printf(" %s irq_assign_vector irq %x vec %x mask %04x %04x %04x %04x\n",
+ ri->dump_header,
+ r->irq, r->vec,
+ r->mask[3],
+ r->mask[2],
+ r->mask[1],
+ r->mask[0]);
+ }
+ if ( r->irq < MAX_IRQ
+ && r->vec < MAX_VECTOR )
+ {
+ if ( irq_table[r->irq].type == IRQ_MSI )
+ {
+ if(global_vector_used[r->vec])
+ fprintf(warn, " Vector collision on global table!\n");
+ global_vector_used[r->vec]=1;
+ }
+ if( irq_table[r->irq].dev )
+ {
+ struct pci_dev * pdev=irq_table[r->irq].dev;
+
+ if(pdev->vector_used[r->vec])
+ fprintf(warn, " Vector collision on %02x.%02x!\n",
+ pdev->bus, pdev->devfn);
+ pdev->vector_used[r->vec]=1;
+ }
+ }
+ break;
+ }
+ case TRC_HW_IRQ_MOVE_CLEANUP_DELAY:
+ {
+ struct {
+ int irq, vec, cpu;
+ } *r = (typeof(r))ri->d;
+
+ if ( opt.dump_all )
+ {
+ printf(" %s irq_move_cleanup_delay irq %x vec %x cpu %d\n",
+ ri->dump_header,
+ r->irq, r->vec, r->cpu);
+ }
+ break;
+ }
+ case TRC_HW_IRQ_MOVE_CLEANUP:
+ {
+ struct {
+ int irq;
+ int vec;
+ int cpu;
+ } *r = (typeof(r))ri->d;
+
+ if ( opt.dump_all )
+ {
+ printf(" %s irq_move_cleanup irq %x vec %x cpu %d\n",
+ ri->dump_header,
+ r->irq, r->vec, r->cpu);
+ }
+ if ( r->irq < MAX_IRQ
+ && r->vec < MAX_VECTOR )
+ {
+ if ( irq_table[r->irq].type == IRQ_MSI )
+ {
+ if(!global_vector_used[r->vec])
+ fprintf(warn," Strange, cleanup on non-used vector\n");
+ global_vector_used[r->vec]=0;
+ }
+ if ( irq_table[r->irq].dev )
+ {
+ struct pci_dev * pdev=irq_table[r->irq].dev;
+
+ if(!pdev->vector_used[r->vec])
+ fprintf(warn," Strange, cleanup on non-used vector\n");
+ pdev->vector_used[r->vec]=0;
+ }
+ }
+ break;
+ }
+ case TRC_HW_IRQ_UNMAPPED_VECTOR:
+ {
+ struct {
+ int vec;
+ } *r = (typeof(r))ri->d;
+
+ if ( opt.dump_all )
+ {
+ printf(" %s irq_unmapped_vector vec %x\n",
+ ri->dump_header,
+ r->vec);
+ }
+ break;
+ }
+ case TRC_HW_IRQ_CLEAR_VECTOR:
+ case TRC_HW_IRQ_MOVE_FINISH:
+ default:
+ if(opt.dump_all) {
+ dump_generic(stdout, ri);
+ }
+ break;
+ }
+}
+
+#define TRC_HW_SUB_PM 1
+#define TRC_HW_SUB_IRQ 2
+void hw_process(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+
+ switch(ri->evt.sub)
+ {
+ case TRC_HW_SUB_PM:
+ pm_process(p);
+ break;
+ case TRC_HW_SUB_IRQ:
+ irq_process(p);
+ break;
+ }
+
+}
+
+/* ---- Base ---- */
+void dump_generic(FILE * f, struct record_info *ri)
+{
+ int i;
+
+ fprintf(f, "]%s %7x(%x:%x:%x) %u [",
+ ri->dump_header,
+ ri->event,
+ ri->evt.main,
+ ri->evt.sub,
+ ri->evt.minor,
+ ri->extra_words);
+
+ for(i=0; i<ri->extra_words; i++) {
+ fprintf(f, " %x", ri->d[i]);
+ }
+
+ fprintf(f, " ]\n");
+}
+
+void dump_raw(char * s, struct record_info *ri)
+{
+ int i;
+
+ if(ri->rec.cycle_flag)
+ printf("%s %7x %d %14lld [",
+ s, ri->event, ri->extra_words, ri->tsc);
+ else
+ printf("%s %7x %d %14s [",
+ s, ri->event, ri->extra_words, "-");
+
+ for(i=0; i<7; i++) {
+ if ( i < ri->extra_words )
+ printf(" %8x", ri->d[i]);
+ else
+ printf(" ");
+ }
+
+ printf(" ] | ");
+
+ for (i=0; i<8; i++) {
+ printf(" %08x", ri->rec.raw[i]);
+ }
+
+ printf(" |\n");
+}
+
+void error(enum error_level l, struct record_info *ri)
+{
+ if ( l > opt.tolerance )
+ {
+ if ( ri )
+ dump_generic(warn, ri);
+ exit(1);
+ }
+}
+
+int check_extra_words(struct record_info *ri,
+ int expected_size,
+ const char *record)
+{
+ static int off_by_one = 0;
+ int expected_extra = expected_size / sizeof(unsigned int);
+
+ if(ri->extra_words != expected_extra
+ && !(off_by_one && ri->extra_words == expected_extra + 1) )
+ {
+ if ( !off_by_one && ri->extra_words == expected_extra + 1 )
+ {
+ fprintf(warn, "Detected off-by-one bug; relaxing expectations\n");
+ off_by_one=1;
+ }
+ else {
+ fprintf(warn, "ERROR: %s extra_words %d, expected %d!\n",
+ record,
+ ri->extra_words, expected_extra);
+ error(ERR_RECORD, ri);
+ return 1;
+ }
+ }
+ return 0;
+}
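+
+/* Illustrative usage sketch: a caller sizes its expectation from the
+ * payload struct it is about to cast ri->d to. The record layout and
+ * name here are hypothetical. */
+#if 0
+static void check_extra_words_sketch(struct record_info *ri)
+{
+ struct { uint32_t gfn_lo, gfn_hi, flags; } *r; /* 12 bytes = 3 words */
+
+ if(check_extra_words(ri, sizeof(*r), "my_record"))
+ return; /* size mismatch already warned about */
+
+ r = (typeof(r))ri->d;
+ printf("gfn %x%08x flags %x\n", r->gfn_hi, r->gfn_lo, r->flags);
+}
+#endif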
+
+void process_generic(struct record_info *ri) {
+
+ error(ERR_STRICT, ri);
+
+ if(opt.dump_all) {
+ dump_generic(stdout, ri);
+ }
+}
+
+int vcpu_set_data_type(struct vcpu_data *v, int type)
+{
+ if (v->data_type == VCPU_DATA_NONE )
+ {
+ v->data_type = type;
+ switch(type)
+ {
+ case VCPU_DATA_HVM:
+ init_hvm_data(&v->hvm, v);
+ break;
+ default:
+ break;
+ }
+ }
+ else
+ assert(v->data_type == type);
+ return 0;
+}
+
+
+void lose_vcpu(struct vcpu_data *v, tsc_t tsc)
+{
+ if(v->data_type == VCPU_DATA_HVM)
+ v->hvm.vmexit_valid=0;
+ runstate_update(v, RUNSTATE_LOST, tsc);
+ hvm_vlapic_clear(&v->vlapic);
+
+ if(v->data_type == VCPU_DATA_HVM) {
+ int i;
+ if(opt.dump_all)
+ printf(" [w2h] Clearing w2h state for d%dv%d\n",
+ v->d->did, v->vid);
+ v->hvm.w2h.interrupts=0;
+ v->hvm.w2h.vector=0;
+ v->hvm.w2h.waking = 0;
+ for(i=0; i<GUEST_INTERRUPT_MAX; i++) {
+ if(opt.dump_all && v->hvm.summary.guest_interrupt[i].start_tsc) {
+ printf(" Interrupt %d clearing start_tsc %lld\n",
+ i, v->hvm.summary.guest_interrupt[i].start_tsc);
+ }
+ v->hvm.summary.guest_interrupt[i].start_tsc = 0;
+ }
+ }
+}
+
+struct lost_record_struct {
+ int lost_records;
+ unsigned did:16,vid:16;
+ tsc_t first_tsc;
+};
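+
+/* Note: assuming a 64-bit tsc_t, the payload above is 4 + 4 + 8 = 16
+ * bytes, i.e. exactly the four extra words that process_lost_records()
+ * checks for below. */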
+
+void process_lost_records(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+ struct lost_record_struct *r = (typeof(r))ri->d;
+ tsc_t first_tsc; /* TSC of first record that was lost */
+
+ /* Sanity checks */
+ if(ri->extra_words != 4)
+ {
+ fprintf(warn, "FATAL: Lost record has unexpected extra words %d!\n",
+ ri->extra_words);
+ error(ERR_RECORD, ri);
+ return;
+ }
+
+ first_tsc = r->first_tsc;
+
+ if(opt.dump_all)
+ {
+ if(p->current)
+ printf(" %s lost_records count %d d%uv%u (cur d%dv%d) first_tsc %lld\n",
+ ri->dump_header, r->lost_records,
+ r->did, r->vid,
+ p->current->d->did, p->current->vid,
+ r->first_tsc);
+ else
+ printf(" %s lost_records count %d d%uv%u (cur X) first_tsc %lld\n",
+ ri->dump_header, r->lost_records,
+ r->did, r->vid,
+ r->first_tsc);
+ }
+
+#if 0
+ if(opt.dump_trace_volume_on_lost_record)
+ volume_summary(&p->volume.last_buffer);
+#endif
+
+ if ( p->current ) {
+
+ hvm_vlapic_clear(&p->current->vlapic);
+ if(p->current->data_type == VCPU_DATA_HVM) {
+ p->current->hvm.vmexit_valid=0;
+ cr3_switch(0, &p->current->hvm);
+ }
+
+ /* We may lose scheduling records; so we need to:
+ * - Point all records until now to the next schedule in the
+ * "default" domain
+ * - Make sure there are no warnings / strangeness with the
+ * current vcpu (if it gets scheduled elsewhere).
+ */
+ vcpu_prev_update(p, p->current, first_tsc, RUNSTATE_LOST);
+ }
+#if 0
+ vcpu_next_update(p, default_domain.vcpu[p->pid], first_tsc);
+ if(p->current->data_type == VCPU_DATA_HVM) {
+ p->current->hvm.vmexit_valid=0;
+ }
+#endif
+
+ /* The lost record trace is processed early -- i.e.,
+ * After the last good record, rather than when the next
+ * record is processed. Between the time it's processed and
+ * the time it actually went in, the vcpu may be scheduled on
+ * other processors. So we can't switch vcpus until the first
+ * TSC'd record after the lost record. */
+ if(!p->lost_record.active) {
+ P.lost_cpus++;
+ if(P.lost_cpus > P.max_active_pcpu + 1) {
+ fprintf(warn, "ERROR: P.lost_cpus %d > P.max_active_pcpu + 1 %d!\n",
+ P.lost_cpus, P.max_active_pcpu + 1);
+ error(ERR_ASSERT, NULL);
+ }
+ } else
+ fprintf(warn, "Strange, lost record for pcpu %d, but lost_record still active!\n",
+ p->pid);
+
+ p->lost_record.active = 1;
+ p->lost_record.tsc = first_tsc;
+ pcpu_string_draw(p);
+
+ {
+ /* Any vcpu which is not actively running may be scheduled on the
+ * lost cpu. To avoid mis-accounting, we need to reset them to
+ * RUNSTATE_LOST. */
+ struct domain_data *d;
+ int i;
+ for(d=domain_list ; d; d=d->next)
+ {
+ if(d->did != DEFAULT_DOMAIN) {
+ for(i=0; i<MAX_CPUS; i++)
+ if(d->vcpu[i] &&
+ d->vcpu[i]->runstate.state != RUNSTATE_RUNNING) {
+ if(opt.dump_all)
+ fprintf(warn, "%s: setting d%dv%d to RUNSTATE_LOST\n",
+ __func__, d->did, i);
+ lose_vcpu(d->vcpu[i], first_tsc);
+ }
+ }
+ }
+ }
+
+ p->lost_record.domain_valid=1;
+ p->lost_record.did=r->did;
+ p->lost_record.vid=r->vid;
+}
+
+
+void process_lost_records_end(struct pcpu_info *p)
+{
+ struct record_info *ri = &p->ri;
+ struct lost_record_struct *r = (typeof(r))ri->d;
+
+ if(!p->lost_record.active) {
+ fprintf(warn, "FATAL: lost_records_end but pid %d not lost!\n",
+ p->pid);
+ error(ERR_FILE, NULL);
+ return;
+ }
+
+ /* Lost records. If this is the first record on a pcpu after the loss,
+ * update the information. */
+ if(ri->tsc > p->lost_record.tsc)
+ {
+ if(opt.dump_all)
+ printf(" %s lost_records end ---\n",
+ pcpu_string(p->pid));
+
+ update_cycles(&p->time.lost, ri->tsc - p->lost_record.tsc);
+
+ if(p->lost_record.domain_valid) {
+ int did = p->lost_record.did,
+ vid = p->lost_record.vid;
+
+ if(opt.dump_all)
+ printf(" %s lost_records end d%dv%d---\n",
+ pcpu_string(p->pid),
+ did, vid);
+ if(p->current)
+ {
+ fprintf(warn, "FATAL: lost_record valid (d%dv%d), but current d%dv%d!\n",
+ did, vid,
+ p->current->d->did, p->current->vid);
+ error(ERR_FILE, NULL);
+ return;
+ }
+
+ if(opt.dump_all)
+ fprintf(warn, "Changing p%d current to d%dv%d\n",
+ p->pid, did, vid);
+ vcpu_next_update(p,
+ vcpu_find(did, vid),
+ ri->tsc);
+ p->lost_record.domain_valid=0;
+ p->lost_record.seen_valid_schedule=0; /* Let next vcpu_next_update know that
+ this one was inferred */
+ } else {
+ if(opt.dump_all)
+ printf(" %s lost_records end (domain invalid)---\n",
+ pcpu_string(p->pid));
+ }
+
+
+ p->lost_record.active = 0;
+ pcpu_string_draw(p);
+ P.lost_cpus--;
+ if(P.lost_cpus < 0) {
+ fprintf(warn, "ERROR: lost_cpus fell below 0 for pcpu %d!\n",
+ p->pid);
+ error(ERR_ASSERT, NULL);
+ }
+ }
+}
+
+void base_process(struct pcpu_info *p) {
+ struct record_info *ri = &p->ri;
+ switch(ri->event)
+ {
+ case TRC_TRACE_WRAP_BUFFER:
+ break;
+ case TRC_LOST_RECORDS:
+ process_lost_records(p);
+ break;
+ case TRC_LOST_RECORDS_END:
+ process_lost_records_end(p);
+ break;
+ default:
+ process_generic(ri);
+ }
+}
+
+
+
+/* Non-compat only */
+void record_order_insert(struct pcpu_info *new);
+void record_order_remove(struct pcpu_info *rem);
+void record_order_bubble(struct pcpu_info *last);
+
+struct cpu_change_data {
+ int cpu;
+ unsigned window_size;
+};
+
+void activate_early_eof(void) {
+ struct pcpu_info *p;
+ int i;
+
+ fprintf(warn, "Short cpu_change window, activating early_eof\n");
+
+ P.early_eof = 1;
+
+ for(i=0; i<=P.max_active_pcpu; i++) {
+ p = P.pcpu + i;
+ if(p->active && p->file_offset > P.last_epoch_offset) {
+ fprintf(warn, " deactivating pid %d\n",
+ p->pid);
+ p->active = 0;
+ }
+ }
+}
+
+off_t scan_for_new_pcpu(off_t offset) {
+ ssize_t r;
+ struct trace_record rec;
+ struct cpu_change_data *cd;
+
+ r=__read_record(&rec, offset);
+
+ if(r==0)
+ return 0;
+
+ if(rec.event != TRC_TRACE_CPU_CHANGE
+ || rec.cycle_flag)
+ {
+ fprintf(stderr, "%s: Unexpected record event %x!\n",
+ __func__, rec.event);
+ error(ERR_ASSERT, NULL); /* Actually file, but can't recover */
+ }
+
+ cd = (typeof(cd))rec.u.notsc.data;
+
+ if ( cd->cpu > MAX_CPUS )
+ {
+ fprintf(stderr, "%s: cpu %d exceeds MAX_CPU %d!\n",
+ __func__, cd->cpu, MAX_CPUS);
+ /* FIXME: Figure out if we could handle this more gracefully */
+ error(ERR_ASSERT, NULL);
+ }
+
+ if(cd->cpu > P.max_active_pcpu || !P.pcpu[cd->cpu].active) {
+ struct pcpu_info *p = P.pcpu + cd->cpu;
+
+ fprintf(warn, "%s: Activating pcpu %d at offset %lld\n",
+ __func__, cd->cpu, (unsigned long long)offset);
+
+ p->active = 1;
+ /* Process this cpu_change record first */
+ p->ri.rec = rec;
+ p->ri.size = r;
+ __fill_in_record_info(p);
+
+ p->file_offset = offset;
+ p->next_cpu_change_offset = offset;
+
+ record_order_insert(p);
+
+ offset += r + cd->window_size;
+
+ sched_default_vcpu_activate(p);
+
+ if ( cd->cpu > P.max_active_pcpu )
+ P.max_active_pcpu = cd->cpu;
+
+ return offset;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Conceptually, when we reach a cpu_change record that's not for our pcpu,
+ * we want to scan forward through the file until we reach one that's for us.
+ * However, looping through involves reading the file, which we'd rather
+ * do in one place. Because cpu_change records don't include a tsc,
+ * the same pcpu will be processed repeatedly until the cpu_change
+ * equals p->pid.
+ *
+ * There are two additional things we need to do in this algorithm:
+ * + Detect new pcpus as they come online
+ * + De-activate pcpus which don't have any more records
+ *
+ * Detecting new pcpus which are less than P.max_active_pcpu is straight-
+ * forward: when max_active_pcpu is searching for its next cpu window,
+ * it will pass by the new cpu's window, and can activate it then.
+ *
+ * Detecting new pcpus greater than P.max_active_pcpu is a little harder;
+ * When max_active_pcpu is scanning for its next cpu window, after it's found
+ * it, we need to scan one more window forward to see if it's an already-active
+ * pcpu; if not, activate it.
+ *
+ * We also need to deal with truncated files, where records from one pcpu may
+ * be present but not from another pcpu due to lack of disk space. The best
+ * thing to do is to find the last "epoch" and essentially truncate the file
+ * to that.
+ */
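+
+/* Illustrative sketch of the skip step described above, with the offset
+ * arithmetic pulled out on its own: a foreign pcpu's whole window is
+ * stepped over in one move rather than read record by record. The
+ * parameter values would come from the cpu_change record. */
+#if 0
+static off_t skip_foreign_window_sketch(off_t offset, ssize_t rec_size,
+ const struct cpu_change_data *cd)
+{
+ return offset + rec_size + cd->window_size;
+}
+#endif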
+void deactivate_pcpu(struct pcpu_info *p)
+{
+ if ( p->current )
+ {
+ pcpu_runstate_update(p, p->last_tsc);
+
+ fprintf(warn, "%s: setting d%dv%d to state LOST\n",
+ __func__, p->current->d->did,
+ p->current->vid);
+ lose_vcpu(p->current, p->last_tsc);
+ }
+ p->active = 0;
+
+ record_order_remove(p);
+
+ if ( p->pid == P.max_active_pcpu )
+ {
+ int i, max_active_pcpu = -1;
+ for(i=0; i<=P.max_active_pcpu; i++)
+ {
+ if(!P.pcpu[i].active)
+ continue;
+
+ max_active_pcpu = i;
+ }
+ P.max_active_pcpu = max_active_pcpu;
+ fprintf(warn, "%s: Setting max_active_pcpu to %d\n",
+ __func__, max_active_pcpu);
+ }
+
+}
+
+/* Helper function to process tsc-related record info */
+void process_record_tsc(tsc_t order_tsc, struct record_info *ri)
+{
+ /* Find the first tsc set */
+ if(ri->tsc && ri->tsc >= P.f.first_tsc) {
+ /* We use the order_tsc to account for the second processing of
+ * a lost record. */
+ tsc_t tsc = order_tsc;
+
+ if(P.f.first_tsc == 0) {
+ P.f.first_tsc = tsc;
+ if ( opt.interval_mode ) {
+ P.interval.start_tsc = tsc;
+ }
+ } else {
+ if ( opt.interval_mode ) {
+ if(P.interval.start_tsc > tsc) {
+ fprintf(warn, "FATAL: order_tsc %lld < interval.start_tsc %lld!\n",
+ tsc, P.interval.start_tsc);
+ error(ERR_FILE, NULL);
+ } else {
+ while ( tsc - P.interval.start_tsc > opt.interval.cycles ) {
+ interval_callback();
+ P.interval.start_tsc += opt.interval.cycles;
+ }
+ }
+ }
+ }
+
+ P.f.last_tsc=tsc;
+
+ P.f.total_cycles = P.f.last_tsc - P.f.first_tsc;
+
+ P.now = tsc;
+ }
+}
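+
+/* Illustrative sketch of the interval loop above with hypothetical
+ * numbers: a record 2.5 intervals past start_tsc fires the callback
+ * twice and leaves start_tsc half an interval behind the record. */
+#if 0
+static void interval_sketch(void)
+{
+ tsc_t start = 1000, cycles = 100, tsc = 1250;
+
+ while ( tsc - start > cycles )
+ {
+ /* interval_callback(); */
+ start += cycles; /* 1100, then 1200; exits with 1250 - 1200 = 50 */
+ }
+}
+#endif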
+
+/* Standardized part of dump output */
+void create_dump_header(struct record_info *ri, struct pcpu_info *p)
+{
+ char * c;
+ int len, r;
+
+ len = DUMP_HEADER_MAX;
+ c = ri->dump_header;
+
+ abs_cycles_to_time(ri->tsc, &ri->t);
+
+ if ( ri->t.time )
+ {
+ r=snprintf(c, len, "%3u.%09u", ri->t.s, ri->t.ns);
+ c+=r;
+ len-=r;
+ }
+ else
+ {
+ r=snprintf(c, len, "             ");
+ c+=r;
+ len-=r;
+ }
+
+ r = snprintf(c, len, " %s", pcpu_string(ri->cpu));
+ c+=r;
+ len-=r;
+
+ if ( p->current )
+ {
+ r = snprintf(c, len, " d%dv%d", p->current->d->did, p->current->vid);
+ c+=r;
+ len-=r;
+ }
+ else
+ {
+ r = snprintf(c, len, " d?v?");
+ c+=r;
+ len-=r;
+ }
+}
+
+int find_toplevel_event(struct record_info *ri)
+{
+ int toplevel=0, i, count;
+
+ for(i=0, count=0; i<TOPLEVEL_MAX; i++)
+ if(ri->evt.main & (1UL<<i))
+ {
+ toplevel=i;
+ count++;
+ }
+
+ /* Sanity check: One and only one bit should be set */
+ if(count != 1)
+ {
+ fprintf(warn, "FATAL: unexpected number bits(%d) in evt.main! event %x main %x sub %x minor %x\n",
+ count,
+ ri->event,
+ ri->evt.main, ri->evt.sub, ri->evt.minor);
+ error(ERR_RECORD, NULL);
+ return -1;
+ }
+
+ return toplevel;
+}
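+
+/* Illustrative sketch: evt.main is expected to be one-hot, so the set
+ * bit's index is the toplevel class. The values are hypothetical, and
+ * this assumes error() tolerates ERR_RECORD. */
+#if 0
+static void toplevel_sketch(struct record_info *ri)
+{
+ ri->evt.main = 0x004; /* only bit 2 set */
+ assert(find_toplevel_event(ri) == 2);
+
+ ri->evt.main = 0x006; /* bits 1 and 2 set: corrupt */
+ assert(find_toplevel_event(ri) == -1);
+}
+#endif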
+
+
+void process_cpu_change(struct pcpu_info *p) {
+ struct record_info *ri = &p->ri;
+ struct cpu_change_data *r = (typeof(r))ri->d;
+
+ if(opt.dump_all && verbosity >= 6) {
+ printf("]%s cpu_change this-cpu %u record-cpu %u window_size %u(0x%08x)\n",
+ ri->dump_header, p->pid, r->cpu, r->window_size,
+ r->window_size);
+ }
+
+ /* File sanity check */
+ if(p->file_offset != p->next_cpu_change_offset) {
+ fprintf(warn, "Strange, pcpu %d expected offet %llx, actual %llx!\n",
+ p->pid, (unsigned long long)p->next_cpu_change_offset,
+ (unsigned long long)p->file_offset);
+ }
+
+ if(r->cpu > MAX_CPUS)
+ {
+ fprintf(stderr, "FATAL: cpu %d > MAX_CPUS %d.\n",
+ r->cpu, MAX_CPUS);
+ /* Actually file, but takes some work to skip */
+ error(ERR_ASSERT, NULL);
+ }
+
+ /* Detect beginning of new "epoch" while scanning thru file */
+ if((p->last_cpu_change_pid > r->cpu)
+ && (p->file_offset > P.last_epoch_offset)) {
+ P.last_epoch_offset = p->file_offset;
+ }
+
+ /* If that pcpu has never been activated, activate it. */
+ if(!P.pcpu[r->cpu].active && P.pcpu[r->cpu].file_offset == 0)
+ {
+ struct pcpu_info * p2 = P.pcpu + r->cpu;
+
+ p2->active = 1;
+ if(r->cpu > P.max_active_pcpu)
+ P.max_active_pcpu = r->cpu;
+
+ /* Taking this record as the first record should make everything
+ * run swimmingly. */
+ p2->ri = *ri;
+ p2->ri.cpu = r->cpu;
+ p2->ri.d = p2->ri.rec.u.notsc.data;
+ p2->file_offset = p->file_offset;
+ p2->next_cpu_change_offset = p->file_offset;
+
+ fprintf(warn, "%s: Activating pcpu %d at offset %lld\n",
+ __func__, r->cpu, (unsigned long long)p->file_offset);
+
+ record_order_insert(p2);
+
+ sched_default_vcpu_activate(p2);
+ }
+
+ p->last_cpu_change_pid = r->cpu;
+
+ /* If this isn't the cpu we're looking for, skip the whole bunch */
+ if(p->pid != r->cpu)
+ {
+ p->file_offset += ri->size + r->window_size;
+ p->next_cpu_change_offset = p->file_offset;
+
+ if(p->file_offset > G.file_size) {
+ activate_early_eof();
+ } else if(P.early_eof && p->file_offset > P.last_epoch_offset) {
+ fprintf(warn, "%s: early_eof activated, pcpu %d past last_epoch_offset %llx, deactivating.\n",
+ __func__, p->pid, (unsigned long long)P.last_epoch_offset);
+ deactivate_pcpu(p);
+ }
+ }
+ else
+ {
+ /* Track information about dom0 scheduling and records */
+ if(opt.dump_trace_volume_on_lost_record) {
+ tsc_t cycles;
+ struct time_struct t;
+
+ /* Update dom0 runstates */
+ cycles = (p->volume.buffer_first_tsc > p->volume.buffer_dom0_runstate_tsc) ?
+ p->volume.buffer_first_tsc :
+ p->volume.buffer_dom0_runstate_tsc;
+ p->volume.buffer_dom0_runstate_cycles[p->volume.buffer_dom0_runstate]
+ += ri->tsc - cycles;
+
+ printf(" - updated p%d dom0_runstate %s to %lld cycles (+%lld)\n",
+ p->pid, runstate_name[p->volume.buffer_dom0_runstate],
+ p->volume.buffer_dom0_runstate_cycles[p->volume.buffer_dom0_runstate],
+ ri->tsc - cycles);
+
+ /* print info */
+ cycles = ri->tsc - p->volume.buffer_first_tsc;
+ cycles_to_time(cycles, &t);
+ printf("Buffer time: %u.%09u (%lld cycles)\n",
+ t.s, t.ns, cycles);
+ if(p->volume.buffer_size)
+ printf("Rate: %lld cycles / byte\n",
+ cycles / p->volume.buffer_size);
+ if(P.buffer_trace_virq_tsc)
+ {
+ cycles = ri->tsc - P.buffer_trace_virq_tsc;
+ cycles_to_time(cycles, &t);
+ printf("trace_virq latency: %u.%09u (%lld cycles)\n",
+ t.s, t.ns, cycles);
+ P.buffer_trace_virq_tsc = 0;
+ }
+ else
+ {
+ printf("No trace_virq record found.\n");
+ }
+ printf("Dom0 runstates this buffer:\n");
+ runstate_summary(p->volume.buffer_dom0_runstate_cycles);
+ volume_summary(&p->volume.last_buffer);
+
+ /* reset info */
+ p->volume.buffer_first_tsc = 0;
+ p->volume.buffer_size = r->window_size;
+ runstate_clear(p->volume.buffer_dom0_runstate_cycles);
+ volume_clear(&p->volume.last_buffer);
+ }
+
+ p->file_offset += ri->size;
+ p->next_cpu_change_offset = p->file_offset + r->window_size;
+
+ if(p->next_cpu_change_offset > G.file_size)
+ activate_early_eof();
+ else if(p->pid == P.max_active_pcpu)
+ scan_for_new_pcpu(p->next_cpu_change_offset);
+
+ }
+}
+
+struct tl_assert_mask {
+ unsigned p_current:1,
+ not_idle_domain:1;
+ int vcpu_data_mode;
+};
+static struct tl_assert_mask tl_assert_checks[TOPLEVEL_MAX] = {
+ [TRC_HVM_MAIN]={ .p_current=1, .not_idle_domain=1, .vcpu_data_mode=VCPU_DATA_HVM },
+ [TRC_SHADOW_MAIN]={ .p_current=1, .not_idle_domain=1, .vcpu_data_mode=VCPU_DATA_HVM },
+ [TRC_PV_MAIN]={ .p_current=1, .not_idle_domain=1, .vcpu_data_mode=VCPU_DATA_PV },
+};
+
+/* There are a lot of common assumptions for the various processing
+ * routines. Check them all in one place, doing something else if
+ * they don't pass. */
+int toplevel_assert_check(int toplevel, struct pcpu_info *p)
+{
+ struct tl_assert_mask mask;
+
+ mask = tl_assert_checks[toplevel];
+
+ if (mask.p_current && p->current == NULL)
+ {
+ fprintf(warn, "WARNING: p->current null! Not processing\n");
+ goto fail;
+ }
+
+ if( mask.not_idle_domain )
+ {
+ /* Can't do this check w/o first doing above check */
+ assert(mask.p_current);
+
+ if ( p->current->d->did == IDLE_DOMAIN) {
+ fprintf(warn, "WARNING: Unexpected record for idle domain! Not processing\n");
+ goto fail;
+ }
+ }
+
+ if ( mask.vcpu_data_mode )
+ {
+ struct vcpu_data *v;
+ assert(mask.p_current);
+
+ v = p->current;
+
+ if ( ! (v->data_type == VCPU_DATA_NONE
+ || v->data_type == mask.vcpu_data_mode) )
+ {
+ /* This may happen for track_dirty_vram, which causes a SHADOW_WRMAP_BF trace from dom0 */
+ fprintf(warn, "WARNING: Unexpected vcpu data type for d%dv%d on proc %d! Expected %d got %d. Not processing\n",
+ v->d->did, v->vid, p->pid,
+ mask.vcpu_data_mode,
+ v->data_type);
+ goto fail;
+ }
+ }
+
+ return 1;
+
+fail:
+ dump_generic(warn, &p->ri);
+ return 0;
+}
+
+void process_record(struct pcpu_info *p) {
+ struct record_info *ri = &p->ri;
+ int toplevel;
+
+ /* TRC_TRACE_CPU_CHANGE records are always handled, even with --dump-no-processing */
+ if(ri->event == TRC_TRACE_CPU_CHANGE) {
+ process_cpu_change(p);
+ return;
+ }
+
+ if ( opt.dump_no_processing )
+ goto out;
+
+ p->summary = 1;
+
+ if( opt.dump_raw_process )
+ dump_raw("* ", ri);
+
+ process_record_tsc(p->order_tsc, ri);
+
+ if(opt.dump_all)
+ create_dump_header(ri, p);
+
+
+ toplevel = find_toplevel_event(ri);
+ if ( toplevel < 0 )
+ return;
+
+ /* Unify toplevel assertions */
+ if ( toplevel_assert_check(toplevel, p) )
+ {
+ switch(toplevel) {
+ case TRC_GEN_MAIN:
+ base_process(p);
+ break;
+ case TRC_SCHED_MAIN:
+ sched_process(p);
+ break;
+ case TRC_HVM_MAIN:
+ hvm_process(p);
+ break;
+ case TRC_SHADOW_MAIN:
+ shadow_process(p);
+ break;
+ case TRC_PV_MAIN:
+ pv_process(p);
+ break;
+ case TRC_MEM_MAIN:
+ mem_process(p);
+ break;
+ case TRC_HW_MAIN:
+ hw_process(p);
+ break;
+ case TRC_DOM0OP_MAIN:
+ default:
+ process_generic(ri);
+ }
+ }
+
+ UPDATE_VOLUME(p, toplevel[toplevel], ri->size);
+
+ if(!p->volume.buffer_first_tsc)
+ p->volume.buffer_first_tsc = ri->tsc;
+
+ out:
+ /* Lost records get processed twice */
+ if(ri->event != TRC_LOST_RECORDS)
+ p->file_offset += ri->size;
+}
+
+static inline ssize_t get_rec_size(struct trace_record *rec) {
+ ssize_t s;
+
+ s = sizeof(uint32_t);
+
+ if(rec->cycle_flag)
+ s += sizeof(tsc_t);
+
+ s += rec->extra_words * sizeof(uint32_t);
+
+ return s;
+}
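+
+/* Worked example (illustrative): a record with cycle_flag set and
+ * extra_words == 3 occupies 4 (header) + 8 (the 64-bit tsc_t used in
+ * this file) + 3*4 = 24 bytes. */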
+
+#define STDIN 0
+
+void progress_child_exec(void) {
+ fclose(stdin);
+ dup2(G.progress.pipe[0], STDIN);
+
+ execlp("zenity", "zenity", "--progress", "--auto-close", "--title",
+ "Analyzing", "--text", G.trace_file, "--auto-kill", NULL);
+}
+
+void progress_init(void) {
+ int pid;
+
+ if (pipe(G.progress.pipe) < 0)
+ perror("pipe");
+
+ if(!(pid = fork())) {
+ progress_child_exec();
+
+ fprintf(stderr, "%s: exec failed (%s), disabling progress bar\n",
+ __func__, strerror(errno));
+ opt.progress = 0;
+ exit(1);
+ } else if( pid < 0 ) {
+ fprintf(stderr, "%s: could not fork: %s, disabling progress bar\n",
+ __func__, strerror(errno));
+ opt.progress = 0;
+ }
+
+ if( (G.progress.out = fdopen(G.progress.pipe[1], "w")) == NULL ) {
+ fprintf(stderr, "%s: could not fdopen pipe: %s, disabling progress bar\n",
+ __func__, strerror(errno));
+ opt.progress = 0;
+ }
+
+}
+
+void progress_update(off_t offset) {
+ long long p;
+
+ p = ( offset * 100 ) / G.file_size;
+
+ fprintf(G.progress.out, "%lld\n", p);
+ fflush(G.progress.out);
+
+ p += 1;
+
+ G.progress.update_offset = ( G.file_size * p ) / 100;
+
+#if 0
+ fprintf(stderr, "Progress: %lld %% Next update_offset: %lld\n",
+ p-1,
+ G.progress.update_offset);
+#endif
+}
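+
+/* Worked example (illustrative): with G.file_size == 1000 and
+ * offset == 250, "25" is written to the zenity pipe and
+ * update_offset becomes 260, so the next update fires at 26%. */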
+
+void progress_finish(void) {
+ int pid;
+
+ fprintf(G.progress.out, "100\n");
+ fflush(G.progress.out);
+ fclose(G.progress.out);
+
+ wait(NULL);
+
+ if(!(pid = fork())) {
+ /* Child */
+ char text[128];
+
+ snprintf(text, 128, "Finished analyzing %s",
+ G.trace_file);
+ execlp("zenity", "zenity", "--info", "--text", text, NULL);
+ }
+}
+
+ssize_t __read_record(struct trace_record *rec, off_t offset)
+{
+ ssize_t r, rsize;
+
+ r=mread64(G.mh, rec, sizeof(*rec), offset);
+
+ if(r < 0) {
+ /* Read error */
+ perror("read");
+ fprintf(stderr, "offset %llx\n", (unsigned long long)offset);
+ return 0;
+ } else if(r==0) {
+ /* End-of-file */
+ return 0;
+ } else if(r < sizeof(uint32_t)) {
+ /* Full header not read */
+ fprintf(stderr, "%s: short read (%zd bytes)\n",
+ __func__, r);
+ error(ERR_SYSTEM, NULL);
+ }
+
+ rsize=get_rec_size(rec);
+
+ if(r < rsize) {
+ /* Full record not read */
+ fprintf(stderr, "%s: short read (%zd, expected %zd)\n",
+ __func__, r, rsize);
+ return 0;
+ }
+
+ return rsize;
+}
+
+void __fill_in_record_info(struct pcpu_info *p)
+{
+ struct record_info *ri;
+ tsc_t tsc=0;
+
+ ri = &p->ri;
+
+ ri->event = ri->rec.event;
+ ri->extra_words = ri->rec.extra_words;
+
+ if(ri->rec.cycle_flag) {
+ tsc = (((tsc_t)ri->rec.u.tsc.tsc_hi) << 32)
+ | ri->rec.u.tsc.tsc_lo;
+
+ tsc += p->tsc_skew.offset;
+
+ ri->tsc = tsc;
+ ri->d = ri->rec.u.tsc.data;
+
+ if(p->first_tsc == 0)
+ p->first_tsc = tsc;
+
+ /* We process lost records twice: once at the first_tsc,
+ once at the time it was placed in the log */
+ if(ri->event == TRC_LOST_RECORDS && ri->extra_words == 4) {
+ struct lost_record_struct *r = (typeof(r))ri->d;
+ p->order_tsc = r->first_tsc + p->tsc_skew.offset;
+ } else
+ p->order_tsc = tsc;
+
+ p->last_tsc = tsc;
+ } else {
+ ri->tsc = p->last_tsc;
+ ri->d = ri->rec.u.notsc.data;
+ }
+
+ if ( opt.dump_raw_reads ) {
+ char s[256];
+ snprintf(s, 256, "R p%2d o%016llx ",
+ p->pid, (unsigned long long)p->file_offset);
+ dump_raw(s, ri);
+ }
+
+ /* Updated tracing uses CPU_CHANGE. If we hit one of these,
+ * it will be processed next (since the tsc isn't updated), and
+ * we'll skip forward appropriately. */
+ ri->cpu = p->pid;
+}
+
+ssize_t read_record(struct pcpu_info * p) {
+ off_t * offset;
+ struct record_info *ri;
+
+ offset = &p->file_offset;
+ ri = &p->ri;
+
+ ri->size = __read_record(&ri->rec, *offset);
+ if(ri->size)
+ {
+ __fill_in_record_info(p);
+ }
+ else
+ {
+ fprintf(warn, "%s: read returned zero, deactivating pcpu %d\n",
+ __func__, p->pid);
+ deactivate_pcpu(p);
+ }
+
+ return ri->size;
+}
+
+/*
+ * This function gets called for every record when doing a dump. Try to
+ * make it efficient by changing the minimum amount from the last
+ * call. Do this by:
+ * - Keeping track of the last pcpu drawn, so we can just redraw that one
+ * - Keeping track of how many pcpus we've "drawn", and only "drawing" new ones
+ * - Updating the current one
+ *
+ * FIXME: Need to deal with pcpu states changing...
+ *
+ * WARNING not thread-safe
+ */
+
+char __pcpu_string[MAX_CPUS+1] = { 0 };
+void pcpu_string_draw(struct pcpu_info *p)
+{
+ char *s = __pcpu_string;
+ int i=p->pid;
+
+ if(p->lost_record.active)
+ s[i]='l';
+ else if (!p->current)
+ s[i]=' ';
+ else if (p->current->d->did == DEFAULT_DOMAIN)
+ s[i]='.';
+ else if (p->current->d->did == IDLE_DOMAIN)
+ {
+ if ( opt.dump_show_power_states )
+ s[i]=p->power_state+'0';
+ else
+ s[i]='-';
+ }
+ else
+ s[i]='|';
+}
+
+char * pcpu_string(int pcpu)
+{
+ char *s = __pcpu_string;
+ static int max_active_pcpu=-1, last_pcpu=-1;
+
+ assert(P.max_active_pcpu < MAX_CPUS);
+ assert(pcpu <= P.max_active_pcpu);
+
+ if(last_pcpu >= 0)
+ pcpu_string_draw(P.pcpu+last_pcpu);
+
+ if(P.max_active_pcpu > max_active_pcpu)
+ {
+ int i;
+ for(i=max_active_pcpu + 1; i<= P.max_active_pcpu; i++)
+ pcpu_string_draw(P.pcpu+i);
+ max_active_pcpu=P.max_active_pcpu;
+ }
+
+ s[pcpu]='x';
+ last_pcpu = pcpu;
+
+ return s;
+}
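+
+/* Key to the characters drawn above (per pcpu_string_draw): 'x' the
+ * pcpu whose record is being dumped, '|' running a guest vcpu, '-'
+ * idle (or the C-state digit with --dump-show-power-states), '.' the
+ * default domain, 'l' inside a lost-record region, ' ' no current
+ * vcpu. */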
+
+/* Null terminated */
+struct pcpu_info *record_order[MAX_CPUS+1] = { 0 };
+
+/* In the case of identical tsc values, the old algorithm would favor the
+ * pcpu with the lowest number. By default the new algorithm favors the
+ * pcpu which has been processed most recently.
+ *
+ * I think the second way is better; but it's good to be able to use the
+ * old ordering, at the very least to verify that there are no (other) ordering
+ * differences. Enabling the below flag will cause the insertion / bubble
+ * routines to order by pcpu id as well as tsc, preserving the old order. */
+//#define PRESERVE_PCPU_ORDERING
+
+/* Steady state:
+ * + Entire list is in order, except (potentially) for the first entry
+ * + last is pointing to the first entry.
+ */
+void record_order_bubble(struct pcpu_info *last)
+{
+ int i;
+
+ /* Find the pcpu to "bubble". This is usually the
+ * first one, but if other pcpus have been activated, it may
+ * not be. */
+ for(i=0; record_order[i] && record_order[i]!=last; i++);
+
+ assert(record_order[i]);
+
+ /* Now bubble it down */
+ for( ;
+ record_order[i+1]
+ && ( record_order[i+1]->order_tsc < last->order_tsc
+#ifdef PRESERVE_PCPU_ORDERING
+ || ( record_order[i+1]->order_tsc == last->order_tsc
+ && record_order[i+1]->pid < last->pid )
+#endif
+ ) ;
+ i++)
+ record_order[i]=record_order[i+1];
+ record_order[i]=last;
+}
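+
+/* Worked example (illustrative): with order_tsc values [5, 2, 3, 4]
+ * and last pointing at the first entry, the entry with tsc 5 sinks
+ * past 2, 3, and 4, restoring the sorted order [2, 3, 4, 5]. */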
+
+void record_order_insert(struct pcpu_info *new)
+{
+ int i;
+ struct pcpu_info *p=NULL, *t=NULL;
+
+ /* Sanity check: Make sure it's not already in there */
+ for(i=0; record_order[i]; i++)
+ assert(record_order[i]!=new);
+
+ /* Find where to insert it */
+ for(i=0;
+ record_order[i]
+ && ( record_order[i]->order_tsc < new->order_tsc
+#ifdef PRESERVE_PCPU_ORDERING
+ || ( record_order[i]->order_tsc == new->order_tsc
+ && record_order[i]->pid < new->pid )
+#endif
+ ) ;
+ i++)
+ ;
+
+ /* And insert it */
+ for( p=new; p ; i++)
+ {
+ t=record_order[i];
+ record_order[i]=p;
+ p=t;
+ }
+}
+
+void record_order_remove(struct pcpu_info *rem)
+{
+ int i;
+
+ /* Find where the record is */
+ for(i=0; record_order[i] && record_order[i]!=rem; i++)
+ ;
+
+ /* Sanity check: Make sure it's actually there! */
+ assert(record_order[i]);
+
+ /* And move everyone forward */
+ for(; (record_order[i]=record_order[i+1]); i++)
+ ;
+}
+
+struct pcpu_info * choose_next_record(void)
+{
+ struct pcpu_info *min_p=NULL;
+
+ min_p=record_order[0];
+
+ if(opt.progress && min_p && min_p->file_offset >= G.progress.update_offset)
+ progress_update(min_p->file_offset);
+
+ /* If there are active pcpus, make sure we chose one */
+ assert(min_p || (P.max_active_pcpu==-1));
+
+ return min_p;
+}
+
+void process_records(void) {
+ while(1) {
+ struct pcpu_info *p = NULL;
+
+ if(!(p=choose_next_record()))
+ return;
+
+ process_record(p);
+
+ /* Lost records get processed twice. */
+ if(p->ri.event == TRC_LOST_RECORDS) {
+ p->ri.event = TRC_LOST_RECORDS_END;
+ if(p->ri.tsc > p->order_tsc)
+ p->order_tsc = p->ri.tsc;
+ else {
+ fprintf(warn, "Strange, lost_record ri->tsc %lld !> p->order_tsc %lld!\n",
+ p->ri.tsc, p->order_tsc);
+ error(ERR_FILE, NULL);
+ }
+ }
+ else
+ read_record(p);
+
+ /* Update this pcpu in the processing order */
+ if ( p->active )
+ record_order_bubble(p);
+ }
+}
+
+void vcpu_summary(struct vcpu_data *v)
+{
+ printf("-- v%d --\n", v->vid);
+ sched_summary_vcpu(v);
+ switch(v->data_type) {
+ case VCPU_DATA_HVM:
+ hvm_summary(&v->hvm);
+ break;
+ case VCPU_DATA_PV:
+ pv_summary(&v->pv);
+ break;
+ default:
+ break;
+ }
+}
+
+void domain_summary(void)
+{
+ struct domain_data * d;
+ int i;
+
+ if(opt.show_default_domain_summary) {
+ d = &default_domain;
+ printf("|-- Default domain --|\n");
+
+ for( i = 0; i < MAX_CPUS ; i++ )
+ {
+ if(d->vcpu[i])
+ vcpu_summary(d->vcpu[i]);
+ }
+ }
+
+ for ( d = domain_list ; d ; d=d->next )
+ {
+ int i;
+ printf("|-- Domain %d --|\n", d->did);
+
+ sched_summary_domain(d);
+
+ mem_summary_domain(d);
+
+ for( i = 0; i < MAX_CPUS ; i++ )
+ {
+ if(d->vcpu[i])
+ vcpu_summary(d->vcpu[i]);
+ }
+
+ printf("Emulate eip list\n");
+ dump_eip(d->emulate_eip_list);
+
+ if ( opt.with_interrupt_eip_enumeration )
+ {
+ printf("Interrupt eip list (vector %d)\n",
+ opt.interrupt_eip_enumeration_vector);
+ dump_eip(d->interrupt_eip_list);
+ }
+
+ cr3_dump_list(d->cr3_value_head);
+ }
+}
+
+char * stringify_cpu_hz(long long cpu_hz);
+
+void summary(void) {
+ int i;
+ printf("Total time: %.2lf seconds (using cpu speed %s)\n",
+ ((double)(P.f.total_cycles))/opt.cpu_hz,
+ stringify_cpu_hz(opt.cpu_hz));
+ printf("--- Log volume summary ---\n");
+ for(i=0; i<MAX_CPUS; i++)
+ {
+ struct pcpu_info *p = P.pcpu+i;
+ if(!p->summary)
+ continue;
+ printf(" - cpu %d -\n", i);
+ volume_summary(&p->volume.total);
+ }
+ domain_summary();
+}
+
+void report_pcpu(void) {
+ int i, active=0;
+
+ for(i=0; i<MAX_CPUS; i++)
+ {
+ struct pcpu_info *p = P.pcpu+i;
+ if(!p->summary)
+ continue;
+ printf("pcpu %d\n", i);
+
+ print_cycle_summary(&p->time.running, " running");
+ print_cycle_summary(&p->time.idle, " idle");
+ print_cycle_summary(&p->time.lost, " lost");
+
+ if ( p->time.running.count )
+ active++;
+ }
+ printf("Total active cpus: %d\n", active);
+
+}
+
+void init_pcpus(void) {
+ int i=0;
+ off_t offset = 0;
+
+ for(i=0; i<MAX_CPUS; i++)
+ {
+ P.pcpu[i].pid=i;
+ P.pcpu[i].lost_record.seen_valid_schedule=1;
+ P.pcpu[i].power_state=CSTATE_INVALID;
+ }
+
+ P.max_active_pcpu = -1;
+
+ sched_default_domain_init();
+
+ /* Scan through the cpu_change recs until we see a duplicate */
+ do {
+ offset = scan_for_new_pcpu(offset);
+
+ if(!offset) {
+ fprintf(warn, "%s: through first trace write, done for now.\n",
+ __func__);
+ }
+ } while(offset);
+
+}
+
+enum {
+ OPT_NULL=0,
+ /* Dumping info */
+ OPT_DUMP_RAW_READS,
+ OPT_DUMP_RAW_PROCESS,
+ OPT_DUMP_NO_PROCESSING,
+ OPT_DUMP_IPI_LATENCY,
+ OPT_DUMP_TRACE_VOLUME_ON_LOST_RECORD,
+ OPT_DUMP_SHOW_POWER_STATES,
+ /* Extra tracking functionality */
+ OPT_WITH_CR3_ENUMERATION,
+ OPT_WITH_PIO_ENUMERATION,
+ OPT_WITH_MMIO_ENUMERATION,
+ OPT_WITH_INTERRUPT_EIP_ENUMERATION,
+ OPT_SCATTERPLOT_INTERRUPT_EIP,
+ OPT_SCATTERPLOT_CPI,
+ OPT_SCATTERPLOT_UNPIN_PROMOTE,
+ OPT_SCATTERPLOT_CR3_SWITCH,
+ OPT_SCATTERPLOT_WAKE_TO_HALT,
+ OPT_SCATTERPLOT_IO,
+ OPT_SCATTERPLOT_VMEXIT_EIP,
+ OPT_SCATTERPLOT_RUNSTATE,
+ OPT_SCATTERPLOT_RUNSTATE_TIME,
+ OPT_SCATTERPLOT_PCPU,
+ OPT_SCATTERPLOT_EXTINT_CYCLES,
+ OPT_SCATTERPLOT_RDTSC,
+ OPT_SCATTERPLOT_IRQ,
+ OPT_HISTOGRAM_INTERRUPT_EIP,
+ /* Interval options */
+ OPT_INTERVAL_CR3_SCHEDULE_TIME,
+ OPT_INTERVAL_CR3_SCHEDULE_TIME_ALL,
+ OPT_INTERVAL_CR3_SCHEDULE_ORDERED,
+ OPT_INTERVAL_CR3_SHORT_SUMMARY,
+ OPT_INTERVAL_DOMAIN_TOTAL_TIME,
+ OPT_INTERVAL_DOMAIN_TOTAL_TIME_ALL,
+ OPT_INTERVAL_DOMAIN_SHORT_SUMMARY,
+ OPT_INTERVAL_DOMAIN_GUEST_INTERRUPT,
+ OPT_INTERVAL_DOMAIN_GRANT_MAPS,
+ /* Summary info */
+ OPT_SHOW_DEFAULT_DOMAIN_SUMMARY,
+ OPT_MMIO_ENUMERATION_SKIP_VGA,
+ OPT_SAMPLE_SIZE,
+ OPT_REPORT_PCPU,
+ /* Guest info */
+ OPT_DEFAULT_GUEST_PAGING_LEVELS,
+ OPT_SYMBOL_FILE,
+ /* Hardware info */
+ OPT_SVM_MODE,
+ OPT_CPU_HZ,
+ /* Misc */
+ OPT_PROGRESS,
+ OPT_TOLERANCE,
+ OPT_TSC_LOOP_FATAL,
+ /* Specific letters */
+ OPT_DUMP_ALL='a',
+ OPT_INTERVAL_LENGTH='i',
+ OPT_SUMMARY='s',
+};
+
+enum {
+ OPT_GROUP_SUMMARY=1,
+ OPT_GROUP_DUMP,
+ OPT_GROUP_INTERVAL,
+ OPT_GROUP_EXTRA,
+ OPT_GROUP_GUEST,
+ OPT_GROUP_HARDWARE
+};
+
+#define xstr(x) str(x)
+#define str(x) #x
+
+#define GHZ 1000000000LL
+#define MHZ 1000000LL
+#define KHZ 1000LL
+
+void parse_cpu_hz(char * arg) {
+ float hz_base;
+ char * next_ptr;
+
+ hz_base=strtof(arg, &next_ptr);
+ if(next_ptr == arg) {
+ fprintf(stderr, "Invalid cpu_hz %s\n", arg);
+ exit(1);
+ }
+ switch(*next_ptr) {
+ case '\0':
+ opt.cpu_hz=(long long)hz_base;
+ break;
+ case 'G':
+ opt.cpu_hz= hz_base * GHZ;
+ break;
+ case 'M':
+ opt.cpu_hz=hz_base * MHZ;
+ break;
+ case 'K':
+ opt.cpu_hz=hz_base * KHZ;
+ break;
+ default:
+ fprintf(stderr, "Unknown suffix %c\n", *next_ptr);
+ exit(1);
+ }
+ /* Just a convenient pre-calculation */
+ opt.cpu_qhz = QHZ_FROM_HZ(opt.cpu_hz);
+}
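+
+/* Illustrative inputs (assumed, based on the parsing above):
+ * "2.4G" -> ~2400000000 (float rounding may perturb the low digits),
+ * "800M" -> 800000000, and a bare "1666000000" is taken as Hz.
+ * Any suffix other than G, M, or K is fatal. */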
+
+/* WARNING not thread-safe */
+char * stringify_cpu_hz(long long cpu_hz) {
+ static char cpu_string[20], suffix;
+ float hz;
+
+ if(cpu_hz > GHZ) {
+ hz = (float)cpu_hz / GHZ;
+ suffix = 'G';
+ } else if(cpu_hz > MHZ) {
+ hz = (float)cpu_hz / MHZ;
+ suffix = 'M';
+ } else if(cpu_hz > KHZ) {
+ hz = (float)cpu_hz / KHZ;
+ suffix = 'k';
+ } else {
+ hz = cpu_hz;
+ suffix = ' ';
+ }
+
+ snprintf(cpu_string, 20, "%1.2lf %cHz", hz, suffix);
+
+ return cpu_string;
+}
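+
+/* Example (illustrative): 2400000000 -> "2.40 GHz", 33000000 ->
+ * "33.00 MHz". The result lives in a static buffer, so each call
+ * overwrites the previous string. */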
+
+int parse_array(char *arg, struct array_struct *a) {
+ char *p, *q;
+ int n=1, i;
+
+ /* Count the number of commas (and thus the number of elements) */
+ for(p=arg; *p; p++)
+ if(*p == ',')
+ n++;
+
+ fprintf(warn, "%s: Found %d elements\n", __func__, n);
+ fflush(warn);
+ a->count = n;
+ a->values = malloc(n * sizeof(unsigned long long));
+
+ if(!a->values) {
+ fprintf(stderr, "Malloc failed!\n");
+ error(ERR_SYSTEM, NULL);
+ }
+
+ /* Now parse the elements */
+ p = q = arg;
+ for(i=0; i<n; i++) {
+ a->values[i] = strtoull(p, &q, 0);
+ if(p == q) {
+ fprintf(stderr, "Bad format: %s\n", q);
+ return -1;
+ }
+ fprintf(warn, "%s: Found element 0x%llx (%lld)\n",
+ __func__, a->values[i],
+ a->values[i]);
+ fflush(warn);
+ if(*q == ',')
+ q++;
+ else if(*q != '\0') {
+ fprintf(stderr, "Bad format: %s\n", q);
+ return -1;
+ }
+ p=q;
+ }
+
+ return n;
+}
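+
+/* Example (illustrative): parse_array("0x10,32", &a) yields
+ * a.count == 2 with a.values[0] == 0x10 and a.values[1] == 32;
+ * strtoull with base 0 accepts hex, octal, and decimal. */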
+
+error_t cmd_parser(int key, char *arg, struct argp_state *state)
+{
+ switch (key)
+ {
+ /* Dump group */
+ case OPT_DUMP_ALL:
+ opt.dump_all = 1;
+ G.output_defined = 1;
+ break;
+ case OPT_DUMP_RAW_READS:
+ opt.dump_raw_reads = 1;
+ G.output_defined = 1;
+ break;
+ case OPT_DUMP_NO_PROCESSING:
+ opt.dump_no_processing = 1;
+ opt.dump_raw_reads = 1;
+ G.output_defined = 1;
+ break;
+ case OPT_DUMP_RAW_PROCESS:
+ opt.dump_raw_process = 1;
+ G.output_defined = 1;
+ break;
+ case OPT_DUMP_IPI_LATENCY:
+ opt.dump_ipi_latency = 1;
+ break;
+ case OPT_DUMP_TRACE_VOLUME_ON_LOST_RECORD:
+ opt.dump_trace_volume_on_lost_record = 1;
+ break;
+ case OPT_DUMP_SHOW_POWER_STATES:
+ opt.dump_show_power_states = 1;
+ break;
+ /* Extra group */
+ case OPT_WITH_CR3_ENUMERATION:
+ opt.with_cr3_enumeration=1;
+ break;
+ case OPT_WITH_PIO_ENUMERATION:
+ opt.with_pio_enumeration=1;
+ break;
+ case OPT_WITH_MMIO_ENUMERATION:
+ opt.with_mmio_enumeration=1;
+ break;
+ case OPT_SHOW_DEFAULT_DOMAIN_SUMMARY:
+ opt.show_default_domain_summary=1;
+ break;
+ case OPT_SAMPLE_SIZE:
+ {
+ char * inval;
+ opt.sample_size = (int)strtol(arg, &inval, 0);
+ if( inval == arg )
+ argp_usage(state);
+ break;
+ }
+ case OPT_MMIO_ENUMERATION_SKIP_VGA:
+ {
+ char * inval;
+ opt.mmio_enumeration_skip_vga = (int)strtol(arg, &inval, 0);
+ if( inval == arg )
+ argp_usage(state);
+ break;
+ }
+ case OPT_SCATTERPLOT_INTERRUPT_EIP:
+ {
+ char * inval;
+ G.output_defined = 1;
+ opt.scatterplot_interrupt_eip=1;
+ opt.scatterplot_interrupt_vector = (int)strtol(arg, &inval, 0);
+ if( inval == arg )
+ argp_usage(state);
+ }
+ break;
+ case OPT_WITH_INTERRUPT_EIP_ENUMERATION:
+ {
+ char * inval;
+ opt.with_interrupt_eip_enumeration=1;
+ opt.interrupt_eip_enumeration_vector = (int)strtol(arg, &inval, 0);
+ if( inval == arg )
+ argp_usage(state);
+ }
+ break;
+ case OPT_SCATTERPLOT_CPI:
+ G.output_defined = 1;
+ opt.scatterplot_cpi=1;
+ break;
+ case OPT_SCATTERPLOT_UNPIN_PROMOTE:
+ G.output_defined = 1;
+ opt.scatterplot_unpin_promote=1;
+ break;
+ case OPT_SCATTERPLOT_CR3_SWITCH:
+ G.output_defined = 1;
+ opt.scatterplot_cr3_switch=1;
+ break;
+ case OPT_SCATTERPLOT_WAKE_TO_HALT:
+ G.output_defined = 1;
+ opt.scatterplot_wake_to_halt=1;
+ break;
+ case OPT_SCATTERPLOT_VMEXIT_EIP:
+ G.output_defined = 1;
+ opt.scatterplot_vmexit_eip=1;
+ break;
+ case OPT_SCATTERPLOT_EXTINT_CYCLES:
+ {
+ char * inval;
+ G.output_defined = 1;
+ opt.scatterplot_extint_cycles=1;
+ opt.scatterplot_extint_cycles_vector = (int)strtol(arg, &inval, 0);
+ if( inval == arg )
+ argp_usage(state);
+ }
+ break;
+ case OPT_SCATTERPLOT_RDTSC:
+ G.output_defined = 1;
+ opt.scatterplot_rdtsc=1;
+ break;
+ case OPT_SCATTERPLOT_IRQ:
+ G.output_defined = 1;
+ opt.scatterplot_irq=1;
+ break;
+ case OPT_SCATTERPLOT_IO:
+ {
+ char * inval;
+ G.output_defined = 1;
+ opt.scatterplot_io=1;
+ opt.scatterplot_io_port = (int)strtol(arg, &inval, 0);
+ if( inval == arg )
+ argp_usage(state);
+ }
+ break;
+ case OPT_SCATTERPLOT_RUNSTATE:
+ G.output_defined = 1;
+ opt.scatterplot_runstate=1;
+ break;
+ case OPT_SCATTERPLOT_RUNSTATE_TIME:
+ G.output_defined = 1;
+ opt.scatterplot_runstate_time=1;
+ break;
+ case OPT_SCATTERPLOT_PCPU:
+ G.output_defined = 1;
+ opt.scatterplot_pcpu=1;
+ break;
+ case OPT_HISTOGRAM_INTERRUPT_EIP:
+ {
+ char * inval, *p;
+
+ opt.histogram_interrupt_eip=1;
+ opt.histogram_interrupt_vector = (int)strtol(arg, &inval, 0);
+
+ if( inval == arg )
+ argp_usage(state);
+
+ p = inval;
+
+ if(*p == ',')
+ opt.histogram_interrupt_increment = (unsigned long long)strtoull(p+1, &inval, 0);
+ else
+ opt.histogram_interrupt_increment = 0x1000000;
+
+ printf("Making histogram of eips at interrupt %d, increment %llx\n",
+ opt.histogram_interrupt_vector,
+ opt.histogram_interrupt_increment);
+ }
+ break;
+
+ case OPT_INTERVAL_LENGTH:
+ {
+ char * inval;
+
+ opt.interval.msec = (unsigned) (strtof(arg, &inval) * 1000);
+
+ if ( inval == arg )
+ argp_usage(state);
+
+ break;
+ }
+
+ case OPT_INTERVAL_CR3_SCHEDULE_TIME:
+ {
+ if(parse_array(arg, &opt.interval.array) < 0)
+ goto usage;
+ interval_table_alloc(opt.interval.array.count);
+ opt.interval.output = INTERVAL_CR3_SCHEDULE_TIME;
+ opt.interval.check = INTERVAL_CHECK_CR3;
+ opt.interval.mode = INTERVAL_MODE_ARRAY;
+ opt.interval_mode = 1;
+ opt.summary_info = 1;
+ opt.with_cr3_enumeration = 1;
+ G.output_defined = 1;
+ break;
+ usage:
+ fprintf(stderr, "Invalid input for cr3_schedule_time\n");
+ argp_usage(state);
+ break;
+ }
+
+ case OPT_INTERVAL_CR3_SCHEDULE_TIME_ALL:
+ opt.interval.output = INTERVAL_CR3_SCHEDULE_TIME;
+ opt.interval.check = INTERVAL_CHECK_CR3;
+ opt.interval.mode = INTERVAL_MODE_LIST;
+ opt.interval_mode = 1;
+ opt.summary_info = 1;
+ opt.with_cr3_enumeration = 1;
+ G.output_defined = 1;
+ break;
+
+ case OPT_INTERVAL_CR3_SCHEDULE_ORDERED:
+ opt.interval.output = INTERVAL_CR3_SCHEDULE_ORDERED;
+ opt.interval.check = INTERVAL_CHECK_CR3;
+ opt.interval_mode = 1;
+ opt.summary_info = 1;
+ opt.with_cr3_enumeration = 1;
+ G.output_defined = 1;
+ break;
+
+ case OPT_INTERVAL_CR3_SHORT_SUMMARY:
+ {
+ if(parse_array(arg, &opt.interval.array) < 0
+ || opt.interval.array.count != 1)
+ goto usage;
+ opt.interval.output = INTERVAL_CR3_SHORT_SUMMARY;
+ opt.interval.check = INTERVAL_CHECK_CR3;
+ opt.interval_mode = 1;
+ opt.summary_info = 1;
+ opt.with_cr3_enumeration = 1;
+ G.output_defined = 1;
+ break;
+ }
+
+ case OPT_INTERVAL_DOMAIN_TOTAL_TIME:
+ {
+ if(parse_array(arg, &opt.interval.array) < 0)
+ goto idtt_usage;
+ interval_table_alloc(opt.interval.array.count);
+ opt.interval.output = INTERVAL_DOMAIN_TOTAL_TIME;
+ opt.interval.check = INTERVAL_CHECK_DOMAIN;
+ opt.interval.mode = INTERVAL_MODE_ARRAY;
+ opt.interval_mode = 1;
+ opt.summary_info = 1;
+ G.output_defined = 1;
+ break;
+ idtt_usage:
+ fprintf(stderr, "Invalid input for domain_total_time\n");
+ argp_usage(state);
+ break;
+ }
+
+ case OPT_INTERVAL_DOMAIN_TOTAL_TIME_ALL:
+ opt.interval.output = INTERVAL_DOMAIN_TOTAL_TIME;
+ opt.interval.check = INTERVAL_CHECK_DOMAIN;
+ opt.interval.mode = INTERVAL_MODE_LIST;
+ opt.interval_mode = 1;
+ opt.summary_info = 1;
+ G.output_defined = 1;
+ break;
+
+ case OPT_INTERVAL_DOMAIN_SHORT_SUMMARY:
+ {
+ if((parse_array(arg, &opt.interval.array) < 0)
+ || opt.interval.array.count != 1)
+ argp_usage(state);
+
+ opt.interval.output = INTERVAL_DOMAIN_SHORT_SUMMARY;
+ opt.interval.check = INTERVAL_CHECK_DOMAIN;
+ opt.interval_mode = 1;
+ opt.summary_info = 1;
+ G.output_defined = 1;
+ break;
+ }
+
+ case OPT_INTERVAL_DOMAIN_GUEST_INTERRUPT:
+ {
+ if((parse_array(arg, &opt.interval.array) < 0)
+ || opt.interval.array.count != 1)
+ argp_usage(state);
+
+ opt.interval.output = INTERVAL_DOMAIN_GUEST_INTERRUPT;
+ opt.interval.check = INTERVAL_CHECK_DOMAIN;
+ opt.interval_mode = 1;
+ opt.summary_info = 1;
+ G.output_defined = 1;
+ break;
+ }
+
+ case OPT_INTERVAL_DOMAIN_GRANT_MAPS:
+ {
+ if((parse_array(arg, &opt.interval.array) < 0)
+ || opt.interval.array.count != 1)
+ argp_usage(state);
+
+ opt.interval.output = INTERVAL_DOMAIN_GRANT_MAPS;
+ opt.interval.check = INTERVAL_CHECK_DOMAIN;
+ opt.interval_mode = 1;
+ opt.summary_info = 1;
+ G.output_defined = 1;
+ break;
+ }
+
+ /* Summary group */
+ case OPT_SUMMARY:
+ opt.summary = 1;
+ opt.summary_info = 1;
+ G.output_defined = 1;
+ break;
+ case OPT_REPORT_PCPU:
+ opt.report_pcpu = 1;
+ //opt.summary_info = 1;
+ G.output_defined = 1;
+ break;
+ /* Guest info group */
+ case OPT_DEFAULT_GUEST_PAGING_LEVELS:
+ {
+ char *inval;
+ opt.default_guest_paging_levels = (int)strtol(arg, &inval, 0);
+ if ( inval == arg )
+ argp_usage(state);
+ }
+ break;
+ case OPT_SYMBOL_FILE:
+ /* FIXME - strcpy */
+ G.symbol_file = arg;
+ break;
+ /* Hardware info group */
+ case OPT_SVM_MODE:
+ opt.svm_mode = 1;
+ break;
+ case OPT_CPU_HZ:
+ parse_cpu_hz(arg);
+ break;
+
+ case OPT_TOLERANCE:
+ {
+ char * inval;
+
+ opt.tolerance = (int)strtol(arg, &inval, 0);
+
+ if( inval == arg )
+ argp_usage(state);
+
+ if ( opt.tolerance > ERR_MAX_TOLERABLE )
+ {
+ fprintf(stderr, "ERROR: Max tolerable error %d\n",
+ ERR_MAX_TOLERABLE);
+ exit(1);
+ }
+
+ printf("Tolerating errors at or below %d\n",
+ opt.tolerance);
+ }
+ break;
+
+ case OPT_PROGRESS:
+ opt.progress = 1;
+ break;
+
+ case OPT_TSC_LOOP_FATAL:
+ opt.tsc_loop_fatal = 1;
+ break;
+
+ case ARGP_KEY_ARG:
+ {
+ /* FIXME - strcpy */
+ if (state->arg_num == 0)
+ G.trace_file = arg;
+ else
+ argp_usage(state);
+ }
+ break;
+ case ARGP_KEY_END:
+ {
+ if(opt.interval_mode) {
+ opt.interval.cycles = ( opt.interval.msec * opt.cpu_hz ) / 1000 ;
+ interval_header();
+ }
+
+ if(!G.output_defined)
+ {
+ fprintf(stderr, "No output defined, using summary.\n");
+ opt.summary = 1;
+ opt.summary_info = 1;
+ }
+ fprintf(stderr, "Using %s hardware-assisted virtualization.\n",
+ opt.svm_mode?"SVM":"VMX");
+ }
+ break;
+
+ default:
+ return ARGP_ERR_UNKNOWN;
+ }
+
+ return 0;
+}
+
+const struct argp_option cmd_opts[] = {
+ /* Dump group */
+ { .name = "dump-all",
+ .key = OPT_DUMP_ALL,
+ .group = OPT_GROUP_DUMP,
+ .doc = "Dump all records as they come in.", },
+
+ { .name = "dump-raw-reads",
+ .key = OPT_DUMP_RAW_READS,
+ .group = OPT_GROUP_DUMP,
+ .doc = "Dump raw data as it's read from disk. Useful mainly for debugging the analysis tool.", },
+
+ { .name = "dump-no-processing",
+ .key = OPT_DUMP_NO_PROCESSING,
+ .group = OPT_GROUP_DUMP,
+ .doc = "Don't do any processing on records other than cpu changes. Implies dump-raw-reads (or you wouldn't get anything).", },
+
+ { .name = "dump-raw-process",
+ .key = OPT_DUMP_RAW_PROCESS,
+ .group = OPT_GROUP_DUMP,
+ .doc = "Dump raw data as it's processed. Useful mainly for debugging the analysis tool.", },
+
+ { .name = "dump-ipi-latency",
+ .key = OPT_DUMP_IPI_LATENCY,
+ .group = OPT_GROUP_DUMP,
+ .doc = "Dump IPI latency info as IPIs are delivered (vector 0xd1 only).", },
+
+ { .name = "dump-trace-volume-on-lost-record",
+ .key = OPT_DUMP_TRACE_VOLUME_ON_LOST_RECORD,
+ .group = OPT_GROUP_DUMP,
+ .doc = "Dump the volume of trace types in the previous cpu buffer when a lost record is created.", },
+
+ { .name = "dump-show-power-states",
+ .key = OPT_DUMP_SHOW_POWER_STATES,
+ .group = OPT_GROUP_DUMP,
+ .doc = "Show the power-state of the physical cpu when dumping output.", },
+
+ /* Extra processing group */
+ { .name = "with-cr3-enumeration",
+ .key = OPT_WITH_CR3_ENUMERATION,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Keep track of per-cr3 values", },
+
+ { .name = "with-pio-enumeration",
+ .key = OPT_WITH_PIO_ENUMERATION,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Report summary info on indiviaul IO addresses", },
+
+ { .name = "with-mmio-enumeration",
+ .key = OPT_WITH_MMIO_ENUMERATION,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Report summary info on indiviaul MMIO addresses.", },
+
+ { .name = "with-interrupt-eip-enumeration",
+ .key = OPT_WITH_INTERRUPT_EIP_ENUMERATION,
+ .arg = "vector",
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Report a summary on eips interrupted by specified vector.", },
+
+ { .name = "scatterplot-interrupt-eip",
+ .key = OPT_SCATTERPLOT_INTERRUPT_EIP,
+ .arg = "vector",
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of eips as a function of time.", },
+
+ { .name = "scatterplot-extint-cycles",
+ .key = OPT_SCATTERPLOT_EXTINT_CYCLES,
+ .arg = "vector",
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output a scatterplot of vmexit cycles for external interrupts of the given vector as a funciton of time.", },
+
+ { .name = "scatterplot-cpi",
+ .key = OPT_SCATTERPLOT_CPI,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of cpi.", },
+
+ { .name = "scatterplot-unpin-promote",
+ .key = OPT_SCATTERPLOT_UNPIN_PROMOTE,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of unpins and promotions. If " \
+ "--with-cr3-enumeration is included, promotions include current cr3.", },
+
+ { .name = "scatterplot-cr3-switch",
+ .key = OPT_SCATTERPLOT_CR3_SWITCH,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of cr3 switches.", },
+
+ { .name = "scatterplot-wake-to-halt",
+ .key = OPT_SCATTERPLOT_WAKE_TO_HALT,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of wake-to-halt.", },
+
+ { .name = "scatterplot-vmexit-eip",
+ .key = OPT_SCATTERPLOT_VMEXIT_EIP,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of vmexit eips.", },
+
+ { .name = "scatterplot-io",
+ .key = OPT_SCATTERPLOT_IO,
+ .arg = "port",
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of io latencies for givein address as a function of time.", },
+
+ { .name = "scatterplot-runstate",
+ .key = OPT_SCATTERPLOT_RUNSTATE,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of runstate.", },
+
+ { .name = "scatterplot-runstate-time",
+ .key = OPT_SCATTERPLOT_RUNSTATE_TIME,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of time in a runstate.", },
+
+ { .name = "scatterplot-pcpu",
+ .key = OPT_SCATTERPLOT_PCPU,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of which pcpu vcpus are run on.", },
+
+ { .name = "scatterplot-rdtsc",
+ .key = OPT_SCATTERPLOT_RDTSC,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of rdtsc values.", },
+
+ { .name = "scatterplot-irq",
+ .key = OPT_SCATTERPLOT_IRQ,
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output scatterplot of irqs on pcpus.", },
+
+ { .name = "histogram-interrupt-eip",
+ .key = OPT_HISTOGRAM_INTERRUPT_EIP,
+ .arg = "vector[,increment]",
+ .group = OPT_GROUP_EXTRA,
+ .doc = "Output histograms of eips.", },
+
+ { .name = "interval",
+ .key = OPT_INTERVAL_LENGTH,
+ .arg = "sec",
+ .group = OPT_GROUP_INTERVAL,
+ .doc = "Interval length to do time-based graphs, in seconds", },
+
+ { .name = "interval-cr3-schedule-time",
+ .key = OPT_INTERVAL_CR3_SCHEDULE_TIME,
+ .arg = "gmfn[,gmfn...]",
+ .group = OPT_GROUP_INTERVAL,
+ .doc = "Print a csv with the listed cr3 value(s) every interval.", },
+
+ { .name = "interval-cr3-schedule-time-all",
+ .key = OPT_INTERVAL_CR3_SCHEDULE_TIME_ALL,
+ .group = OPT_GROUP_INTERVAL,
+ .doc = "Print a csv with all cr3 values every interval.", },
+
+ { .name = "interval-cr3-schedule-ordered",
+ .key = OPT_INTERVAL_CR3_SCHEDULE_ORDERED,
+ .group = OPT_GROUP_INTERVAL,
+ .doc = "Print summary with the top 10 cr3 values every interval.", },
+
+ { .name = "interval-cr3-short-summary",
+ .key = OPT_INTERVAL_CR3_SHORT_SUMMARY,
+ .arg = "gmfn",
+ .group = OPT_GROUP_INTERVAL,
+ .doc = "Print a csv with the hvm short summary of cr3 value every interval.", },
+
+ { .name = "interval-domain-total-time",
+ .key = OPT_INTERVAL_DOMAIN_TOTAL_TIME,
+ .arg = "domain[,domain...]",
+ .group = OPT_GROUP_INTERVAL,
+ .doc = "Print a csv with the listed domain(s) total runtime every interval.", },
+
+ { .name = "interval-domain-total-time-all",
+ .key = OPT_INTERVAL_DOMAIN_TOTAL_TIME_ALL,
+ .group = OPT_GROUP_INTERVAL,
+ .doc = "Print a csv with all domains every interval.", },
+
+ { .name = "interval-domain-short-summary",
+ .key = OPT_INTERVAL_DOMAIN_SHORT_SUMMARY,
+ .arg = "domain-id",
+ .group = OPT_GROUP_INTERVAL,
+ .doc = "Print a csv with the hvm short summary of given domain every interval.", },
+
+ { .name = "interval-domain-guest-interrupt",
+ .key = OPT_INTERVAL_DOMAIN_GUEST_INTERRUPT,
+ .arg = "domain-id",
+ .group = OPT_GROUP_INTERVAL,
+ .doc = "Print a csv with the guest interrupt count of given domain every interval.", },
+
+ { .name = "interval-domain-grant-maps",
+ .key = OPT_INTERVAL_DOMAIN_GRANT_MAPS,
+ .arg = "domain-id",
+ .group = OPT_GROUP_INTERVAL,
+ .doc = "Print a csv with the grant maps done on behalf of a given domain every interval.", },
+
+ /* Summary group */
+ { .name = "show-default-domain-summary",
+ .key = OPT_SHOW_DEFAULT_DOMAIN_SUMMARY,
+ .group = OPT_GROUP_SUMMARY,
+ .doc = "Show default domain information on summary", },
+
+ { .name = "mmio-enumeration-skip-vga",
+ .key = OPT_MMIO_ENUMERATION_SKIP_VGA,
+ .arg = "[0|1]",
+ .group = OPT_GROUP_SUMMARY,
+ .doc = "Control whether we enumerate MMIO accesses to the VGA area, which can be extremly high during boot. Default: 0", },
+
+ { .name = "sample-size",
+ .key = OPT_SAMPLE_SIZE,
+ .arg = "size",
+ .group = OPT_GROUP_SUMMARY,
+ .doc = "Keep [size] samples for percentile purposes. Enter 0 to " \
+ "disable. Default 10240.", },
+
+ { .name = "summary",
+ .key = OPT_SUMMARY,
+ .group = OPT_GROUP_SUMMARY,
+ .doc = "Output a summary", },
+
+ { .name = "report-pcpu",
+ .key = OPT_REPORT_PCPU,
+ .group = OPT_GROUP_SUMMARY,
+ .doc = "Report utilization for pcpus", },
+
+ /* Guest info */
+ { .name = "default-guest-paging-levels",
+ .key = OPT_DEFAULT_GUEST_PAGING_LEVELS,
+ .group = OPT_GROUP_GUEST,
+ .arg = "L",
+ .doc = "Default guest paging levels. Mainly necessary for Rio, as Miami traces include guest paging levels where appropriate.", },
+
+ { .name = "symbol-file",
+ .key = OPT_SYMBOL_FILE,
+ .group = OPT_GROUP_GUEST,
+ .arg = "filename",
+ .doc = "A symbol file for interpreting guest eips.", },
+
+ /* Hardware info */
+ { .name = "cpu-hz",
+ .key = OPT_CPU_HZ,
+ .group = OPT_GROUP_HARDWARE,
+ .arg = "HZ",
+ .doc = "Cpu speed of the tracing host, used to convert tsc into seconds.", },
+
+ { .name = "svm-mode",
+ .key = OPT_SVM_MODE,
+ .group = OPT_GROUP_HARDWARE,
+ .doc = "Assume AMD SVM-style vmexit error codes. (Default is Intel VMX.)", },
+
+ { .name = "progress",
+ .key = OPT_PROGRESS,
+ .doc = "Progress dialog. Requires the zenity (GTK+) executable.", },
+
+ { .name = "tsc-loop-fatal",
+ .key = OPT_TSC_LOOP_FATAL,
+ .doc = "Stop processing and exit if tsc skew tracking detects a dependency loop.", },
+
+ { .name = "tolerance",
+ .key = OPT_TOLERANCE,
+ .arg = "errlevel",
+ .doc = "Sets tolerance for errors found in the file. Default is 3; max is 6.", },
+
+
+ { 0 },
+};
+
+const struct argp parser_def = {
+ .options = cmd_opts,
+ .parser = cmd_parser,
+ .args_doc = "[trace file]",
+ .doc = "",
+};
+
+const char *argp_program_bug_address = "George Dunlap <george.dunlap at eu.citrix.com>";
+
+
+int main(int argc, char *argv[]) {
+ /* Start with warn at stderr. */
+ warn = stderr;
+
+ argp_parse(&parser_def, argc, argv, 0, NULL, NULL);
+
+ if (G.trace_file == NULL)
+ exit(1);
+
+ if ( (G.fd = open(G.trace_file, O_RDONLY)) < 0) {
+ perror("open");
+ error(ERR_SYSTEM, NULL);
+ } else {
+ struct stat s;
+ fstat(G.fd, &s);
+ G.file_size = s.st_size;
+ }
+
+ if ( (G.mh = mread_init(G.fd)) == NULL )
+ perror("mread");
+
+ if (G.symbol_file != NULL)
+ parse_symbol_file(G.symbol_file);
+
+ if(opt.dump_all)
+ warn = stdout;
+
+ init_pcpus();
+
+ if(opt.progress)
+ progress_init();
+
+ process_records();
+
+ if(opt.interval_mode)
+ interval_tail();
+
+ if(opt.summary)
+ summary();
+
+ if(opt.report_pcpu)
+ report_pcpu();
+
+ if(opt.progress)
+ progress_finish();
+
+ return 0;
+}
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/xentrace/xentrace.c b/tools/xentrace/xentrace.c
index 8a38e32..4ee1458 100644
--- a/tools/xentrace/xentrace.c
+++ b/tools/xentrace/xentrace.c
@@ -23,6 +23,7 @@
#include <string.h>
#include <getopt.h>
#include <assert.h>
+#include <ctype.h>
#include <sys/poll.h>
#include <sys/statvfs.h>
@@ -52,7 +53,7 @@ typedef struct settings_st {
char *outfile;
unsigned long poll_sleep; /* milliseconds to sleep between polls */
uint32_t evt_mask;
- uint32_t cpu_mask;
+ char *cpu_mask_str;
unsigned long tbuf_size;
unsigned long disk_rsvd;
unsigned long timeout;
@@ -521,23 +522,52 @@ static struct t_struct *map_tbufs(unsigned long tbufs_mfn, unsigned int num,
return &tbufs;
}
+void print_cpu_mask(xc_cpumap_t map)
+{
+ unsigned int v, had_printed = 0;
+ int i;
+
+ fprintf(stderr, "change cpumask to 0x");
+
+ for ( i = xc_get_cpumap_size(xc_handle) - 1; i >= 0; i-- )
+ {
+ v = map[i];
+ if ( v || had_printed || !i ) {
+ if (had_printed)
+ fprintf(stderr,"%02x", v);
+ else
+ fprintf(stderr,"%x", v);
+ had_printed = 1;
+ }
+ }
+ fprintf(stderr, "\n");
+}
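+
+/* Example (illustrative): a two-byte map { 0xff, 0x03 } (CPUs 0-9
+ * set) prints "change cpumask to 0x3ff"; leading zero bytes are
+ * skipped. */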
+
+static int set_cpu_mask(xc_cpumap_t map)
+{
+ int ret = xc_tbuf_set_cpu_mask(xc_handle, map);
+
+ if ( ret == 0 )
+ {
+ print_cpu_mask(map);
+ return 0;
+ }
+ PERROR("Failure to get trace buffer pointer from Xen and set the new mask");
+ return EXIT_FAILURE;
+}
+
/**
- * set_mask - set the cpu/event mask in HV
+ * set_evt_mask - set the event mask in HV
* @mask: the new mask
* @type: the new mask type,0-event mask, 1-cpu mask
*
*/
-static void set_mask(uint32_t mask, int type)
+static void set_evt_mask(uint32_t mask)
{
int ret = 0;
- if (type == 1) {
- ret = xc_tbuf_set_cpu_mask(xc_handle, mask);
- fprintf(stderr, "change cpumask to 0x%x\n", mask);
- } else if (type == 0) {
- ret = xc_tbuf_set_evt_mask(xc_handle, mask);
- fprintf(stderr, "change evtmask to 0x%x\n", mask);
- }
+ ret = xc_tbuf_set_evt_mask(xc_handle, mask);
+ fprintf(stderr, "change evtmask to 0x%x\n", mask);
if ( ret != 0 )
{
@@ -774,7 +804,8 @@ static void usage(void)
"Usage: xentrace [OPTION...] [output file]\n" \
"Tool to capture Xen trace buffer data\n" \
"\n" \
-" -c, --cpu-mask=c Set cpu-mask\n" \
+" -c, --cpu-mask=c Set cpu-mask, using either hex, CPU ranges, or\n" \
+" for all CPUs\n" \
" -e, --evt-mask=e Set evt-mask\n" \
" -s, --poll-sleep=p Set sleep time, p, in milliseconds between\n" \
" polling the trace buffer for new data\n" \
@@ -906,6 +937,134 @@ static int parse_evtmask(char *arg)
return 0;
}
+#define ZERO_DIGIT '0'
+
+#define is_terminator(c) ((c)=='\0' || (c)==',')
+
+static int parse_cpumask_range(const char *mask_str, xc_cpumap_t map)
+{
+ unsigned int a, b;
+ int nmaskbits;
+ char c;
+ int in_range;
+ const char *s;
+
+ nmaskbits = xc_get_max_cpus(xc_handle);
+ if ( nmaskbits <= 0 )
+ {
+ fprintf(stderr, "Failed to get max number of CPUs! rc: %d\n", nmaskbits);
+ return EXIT_FAILURE;
+ }
+
+ c = 0;
+ s = mask_str;
+ do {
+ in_range = 0;
+ a = b = 0;
+
+ /* Process until we find a range terminator */
+ for ( c=*s++; !is_terminator(c); c=*s++ )
+ {
+ if ( c == '-' )
+ {
+ if ( in_range )
+ goto err_out;
+ b = 0;
+ in_range = 1;
+ continue;
+ }
+
+ if ( !isdigit(c) )
+ {
+ fprintf(stderr, "Invalid character in cpumask: %s\n", mask_str);
+ goto err_out;
+ }
+
+ b = b * 10 + (c - ZERO_DIGIT);
+ if ( !in_range )
+ a = b;
+ }
+
+ /* Syntax: '<digit>-' (open-ended range) - expand to the last CPU. */
+ if ( b == 0 && in_range )
+ b = nmaskbits-1;
+
+ if ( a > b )
+ {
+ fprintf(stderr, "Wrong order of %d and %d\n", a, b);
+ goto err_out;
+ }
+
+ if ( b >= nmaskbits )
+ {
+ fprintf(stderr, "Specified higher value then there are CPUS!\n");
+ goto err_out;
+ }
+
+ while ( a <= b )
+ {
+ xc_cpumap_setcpu(a, map);
+ a++;
+ }
+ } while ( c );
+
+ return 0;
+ err_out:
+ errno = EINVAL;
+ return EXIT_FAILURE;
+}
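+
+/* Illustrative inputs (assumed, matching the parser above):
+ * "0-3,7" sets CPUs 0-3 and 7; "4-" is open-ended and expands to
+ * CPUs 4 through nmaskbits-1; "2" sets CPU 2 alone. */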
+
+/**
+ * Figure out which of the CPU mask formats the user has provided - either
+ * the hex variant, the cpu-list, or 'all'. Once done, set the CPU mask.
+ */
+static int parse_cpu_mask(void)
+{
+ int i, ret = EXIT_FAILURE;
+ xc_cpumap_t map;
+
+ map = xc_cpumap_alloc(xc_handle);
+ if ( !map )
+ goto out;
+
+ if ( strlen(opts.cpu_mask_str) < 1 )
+ {
+ errno = ENOSPC;
+ goto out;
+ }
+
+ ret = 0;
+ if ( strncmp("0x", opts.cpu_mask_str, 2) == 0 )
+ {
+ uint32_t v;
+
+ v = argtol(opts.cpu_mask_str, 0);
+ /*
+ * If mask is set, copy the bits out of it. This still works for
+ * systems with more than 32 cpus, as the shift will just shift
+ * mask down to zero.
+ */
+ for ( i = 0; i < sizeof(uint32_t); i++ )
+ map[i] = (v >> (i * 8)) & 0xff;
+ }
+ else if ( strcmp("all", opts.cpu_mask_str) == 0 )
+ {
+ for ( i = 0; i < xc_get_cpumap_size(xc_handle); i++ )
+ map[i] = 0xff;
+ }
+ else
+ ret = parse_cpumask_range(opts.cpu_mask_str, map);
+
+ if ( !ret )
+ ret = set_cpu_mask(map);
+ out:
+ /* We don't use them past this point. */
+ free(map);
+ free(opts.cpu_mask_str);
+ opts.cpu_mask_str = NULL;
+ return ret;
+}
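+
+/* Illustrative command lines (assumed, one per accepted format):
+ * xentrace -c 0x3 out.bin (hex mask: CPUs 0 and 1),
+ * xentrace -c 0-1,4 out.bin (range list: CPUs 0, 1 and 4),
+ * xentrace -c all out.bin (every CPU). */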
+
/* parse command line arguments */
static void parse_args(int argc, char **argv)
{
@@ -936,10 +1095,9 @@ static void parse_args(int argc, char **argv)
opts.poll_sleep = argtol(optarg, 0);
break;
- case 'c': /* set new cpu mask for filtering*/
- opts.cpu_mask = argtol(optarg, 0);
+ case 'c': /* set new cpu mask for filtering (when xch is set). */
+ opts.cpu_mask_str = strdup(optarg);
break;
-
case 'e': /* set new event mask for filtering*/
parse_evtmask(optarg);
break;
@@ -1002,7 +1160,7 @@ int main(int argc, char **argv)
opts.outfile = 0;
opts.poll_sleep = POLL_SLEEP_MILLIS;
opts.evt_mask = 0;
- opts.cpu_mask = 0;
+ opts.cpu_mask_str = NULL;
opts.disk_rsvd = 0;
opts.disable_tracing = 1;
opts.start_disabled = 0;
@@ -1018,10 +1176,13 @@ int main(int argc, char **argv)
}
if ( opts.evt_mask != 0 )
- set_mask(opts.evt_mask, 0);
+ set_evt_mask(opts.evt_mask);
- if ( opts.cpu_mask != 0 )
- set_mask(opts.cpu_mask, 1);
+ if ( opts.cpu_mask_str )
+ {
+ if ( parse_cpu_mask() )
+ exit(EXIT_FAILURE);
+ }
if ( opts.timeout != 0 )
alarm(opts.timeout);
diff --git a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c
index 35a2819..b72e37a 100644
--- a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c
+++ b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c
@@ -350,11 +350,13 @@ int xen_irq_init(struct pci_dev *pdev)
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
SA_SHIRQ | SA_SAMPLE_RANDOM | SA_INTERRUPT,
#else
- IRQF_SHARED |
#ifdef IRQF_SAMPLE_RANDOM
IRQF_SAMPLE_RANDOM |
#endif
- IRQF_DISABLED,
+#ifdef IRQF_DISABLED
+ IRQF_DISABLED |
+#endif
+ IRQF_SHARED,
#endif
"xen-platform-pci", pdev);
}
diff --git a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c
index 163b168..ffbc577 100644
--- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c
+++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c
@@ -15,8 +15,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h
index 2537213..b834700 100644
--- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h
+++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h
@@ -15,8 +15,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XEN_PLATFORM_PCI_H
diff --git a/unmodified_drivers/linux-2.6/platform-pci/xen_support.c b/unmodified_drivers/linux-2.6/platform-pci/xen_support.c
index 510a2b7..7818c0c 100644
--- a/unmodified_drivers/linux-2.6/platform-pci/xen_support.c
+++ b/unmodified_drivers/linux-2.6/platform-pci/xen_support.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/xen/COPYING b/xen/COPYING
index 43f972e..ddb3494 100644
--- a/xen/COPYING
+++ b/xen/COPYING
@@ -331,8 +331,7 @@ the "copyright" line and a pointer to where the full notice is found.
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
diff --git a/xen/Makefile b/xen/Makefile
index 5720393..4c54e9b 100644
--- a/xen/Makefile
+++ b/xen/Makefile
@@ -1,13 +1,16 @@
# This is the correct place to edit the build version.
# All other places this is stored (eg. compile.h) should be autogenerated.
export XEN_VERSION = 4
-export XEN_SUBVERSION = 5
-export XEN_EXTRAVERSION ?= .1-rc1$(XEN_VENDORVERSION)
+export XEN_SUBVERSION = 6
+export XEN_EXTRAVERSION ?= .0$(XEN_VENDORVERSION)
export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
-include xen-version
export XEN_WHOAMI ?= $(USER)
export XEN_DOMAIN ?= $(shell ([ -x /bin/dnsdomainname ] && /bin/dnsdomainname) || ([ -x /bin/domainname ] && /bin/domainname || echo [unknown]))
+export XEN_BUILD_DATE ?= $(shell LC_ALL=C date)
+export XEN_BUILD_TIME ?= $(shell LC_ALL=C date +%T)
+export XEN_BUILD_HOST ?= $(shell hostname)
export BASEDIR := $(CURDIR)
export XEN_ROOT := $(BASEDIR)/..
@@ -41,7 +44,8 @@ _install: $(TARGET)$(CONFIG_XEN_INSTALL_SUFFIX)
ln -f -s $(T)-$(XEN_FULLVERSION)$(Z) $(D)$(BOOT_DIR)/$(T)-$(XEN_VERSION).$(XEN_SUBVERSION)$(Z)
ln -f -s $(T)-$(XEN_FULLVERSION)$(Z) $(D)$(BOOT_DIR)/$(T)-$(XEN_VERSION)$(Z)
ln -f -s $(T)-$(XEN_FULLVERSION)$(Z) $(D)$(BOOT_DIR)/$(T)$(Z)
- $(INSTALL_DATA) $(TARGET)-syms $(D)$(BOOT_DIR)/$(T)-syms-$(XEN_FULLVERSION)
+ [ -d "$(D)$(DEBUG_DIR)" ] || $(INSTALL_DIR) $(D)$(DEBUG_DIR)
+ $(INSTALL_DATA) $(TARGET)-syms $(D)$(DEBUG_DIR)/$(T)-syms-$(XEN_FULLVERSION)
if [ -r $(TARGET).efi -a -n '$(EFI_DIR)' ]; then \
[ -d $(D)$(EFI_DIR) ] || $(INSTALL_DIR) $(D)$(EFI_DIR); \
$(INSTALL_DATA) $(TARGET).efi $(D)$(EFI_DIR)/$(T)-$(XEN_FULLVERSION).efi; \
@@ -64,7 +68,7 @@ _uninstall:
rm -f $(D)$(BOOT_DIR)/$(T)-$(XEN_VERSION).$(XEN_SUBVERSION)$(Z)
rm -f $(D)$(BOOT_DIR)/$(T)-$(XEN_VERSION)$(Z)
rm -f $(D)$(BOOT_DIR)/$(T)$(Z)
- rm -f $(D)$(BOOT_DIR)/$(T)-syms-$(XEN_FULLVERSION)
+ rm -f $(D)$(DEBUG_DIR)/$(T)-syms-$(XEN_FULLVERSION)
rm -f $(D)$(EFI_DIR)/$(T)-$(XEN_FULLVERSION).efi
rm -f $(D)$(EFI_DIR)/$(T)-$(XEN_VERSION).$(XEN_SUBVERSION).efi
rm -f $(D)$(EFI_DIR)/$(T)-$(XEN_VERSION).efi
@@ -126,11 +130,11 @@ delete-unfresh-files:
# compile.h contains dynamic build info. Rebuilt on every 'make' invocation.
include/xen/compile.h: include/xen/compile.h.in .banner
- @sed -e 's/@@date@@/$(shell LC_ALL=C date)/g' \
- -e 's/@@time@@/$(shell LC_ALL=C date +%T)/g' \
+ @sed -e 's/@@date@@/$(XEN_BUILD_DATE)/g' \
+ -e 's/@@time@@/$(XEN_BUILD_TIME)/g' \
-e 's/@@whoami@@/$(XEN_WHOAMI)/g' \
-e 's/@@domain@@/$(XEN_DOMAIN)/g' \
- -e 's/@@hostname@@/$(shell hostname)/g' \
+ -e 's/@@hostname@@/$(XEN_BUILD_HOST)/g' \
-e 's!@@compiler@@!$(shell $(CC) $(CFLAGS) --version 2>&1 | head -1)!g' \
-e 's/@@version@@/$(XEN_VERSION)/g' \
-e 's/@@subversion@@/$(XEN_SUBVERSION)/g' \
diff --git a/xen/Rules.mk b/xen/Rules.mk
index a97405c..feb08d6 100644
--- a/xen/Rules.mk
+++ b/xen/Rules.mk
@@ -170,7 +170,10 @@ _clean_%/: FORCE
%.o: %.S Makefile
$(CC) $(AFLAGS) -c $< -o $@
-SPECIAL_DATA_SECTIONS := rodata $(foreach n,1 2 4 8,rodata.str1.$(n)) \
+SPECIAL_DATA_SECTIONS := rodata $(foreach a,1 2 4 8 16, \
+ $(foreach w,1 2 4, \
+ rodata.str$(w).$(a)) \
+ rodata.cst$(a)) \
$(foreach r,rel rel.ro,data.$(r).local)
$(filter %.init.o,$(obj-y) $(obj-bin-y) $(extra-y)): %.init.o: %.o Makefile
diff --git a/xen/arch/arm/Makefile b/xen/arch/arm/Makefile
index 41aba2e..1ef39f7 100644
--- a/xen/arch/arm/Makefile
+++ b/xen/arch/arm/Makefile
@@ -12,7 +12,8 @@ obj-y += domctl.o
obj-y += sysctl.o
obj-y += domain_build.o
obj-y += gic.o gic-v2.o
-obj-$(CONFIG_ARM_64) += gic-v3.o
+obj-$(CONFIG_ARM_32) += gic-hip04.o
+obj-$(HAS_GICV3) += gic-v3.o
obj-y += io.o
obj-y += irq.o
obj-y += kernel.o
diff --git a/xen/arch/arm/README.LinuxPrimitives b/xen/arch/arm/README.LinuxPrimitives
index 7f33fc7..3115f51 100644
--- a/xen/arch/arm/README.LinuxPrimitives
+++ b/xen/arch/arm/README.LinuxPrimitives
@@ -25,16 +25,6 @@ linux/arch/arm64/include/asm/atomic.h xen/include/asm-arm/arm64/atomic.h
---------------------------------------------------------------------
-spinlocks: last sync @ v3.16-rc6 (last commit: 95c4189689f9)
-
-linux/arch/arm64/include/asm/spinlock.h xen/include/asm-arm/arm64/spinlock.h
-
-Skipped:
- 5686b06 arm64: lockref: add support for lockless lockrefs using cmpxchg
- 52ea2a5 arm64: locks: introduce ticket-based spinlock implementation
-
----------------------------------------------------------------------
-
mem*: last sync @ v3.16-rc6 (last commit: d875c9b37240)
linux/arch/arm64/lib/memchr.S xen/arch/arm/arm64/lib/memchr.S
@@ -103,24 +93,6 @@ linux/arch/arm/include/asm/atomic.h xen/include/asm-arm/arm32/atomic.h
---------------------------------------------------------------------
-spinlocks: last sync: 15e7e5c1ebf5
-
-linux/arch/arm/include/asm/spinlock.h xen/include/asm-arm/arm32/spinlock.h
-
-*** Linux has switched to ticket locks but we still use bitlocks.
-
-resync to v3.14-rc7:
-
- 7c8746a ARM: 7955/1: spinlock: ensure we have a compiler barrier before sev
- 0cbad9c ARM: 7854/1: lockref: add support for lockless lockrefs using cmpxchg64
- 9bb17be ARM: locks: prefetch the destination word for write prior to strex
- 27a8479 ARM: smp_on_up: move inline asm ALT_SMP patching macro out of spinlock.
- 00efaa0 ARM: 7812/1: rwlocks: retry trylock operation if strex fails on free lo
- afa31d8 ARM: 7811/1: locks: use early clobber in arch_spin_trylock
- 73a6fdc ARM: spinlock: use inner-shareable dsb variant prior to sev instruction
-
----------------------------------------------------------------------
-
mem*: last sync @ v3.16-rc6 (last commit: d98b90ea22b0)
linux/arch/arm/lib/copy_template.S xen/arch/arm/arm32/lib/copy_template.S
diff --git a/xen/arch/arm/Rules.mk b/xen/arch/arm/Rules.mk
index 4ee51a9..b31770c 100644
--- a/xen/arch/arm/Rules.mk
+++ b/xen/arch/arm/Rules.mk
@@ -38,76 +38,51 @@ ifneq ($(call cc-option,$(CC),-fvisibility=hidden,n),n)
CFLAGS += -DGCC_HAS_VISIBILITY_ATTRIBUTE
endif
+CFLAGS-$(HAS_GICV3) += -DHAS_GICV3
+
EARLY_PRINTK := n
ifeq ($(debug),y)
-# Early printk for versatile express
-ifeq ($(CONFIG_EARLY_PRINTK), vexpress)
-EARLY_PRINTK_INC := pl011
-EARLY_UART_BASE_ADDRESS := 0x1c090000
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), fastmodel)
-EARLY_PRINTK_INC := pl011
+# See docs/misc/arm/early-printk.txt for syntax
+
+EARLY_PRINTK_brcm := 8250,0xF040AB00,2
+EARLY_PRINTK_dra7 := 8250,0x4806A000,2
+EARLY_PRINTK_fastmodel := pl011,0x1c090000,115200
+EARLY_PRINTK_exynos5250 := exynos4210,0x12c20000
+EARLY_PRINTK_hip04-d01 := 8250,0xE4007000,2
+EARLY_PRINTK_juno := pl011,0x7ff80000
+EARLY_PRINTK_lager := scif,0xe6e60000
+EARLY_PRINTK_midway := pl011,0xfff36000
+EARLY_PRINTK_omap5432 := 8250,0x48020000,2
+EARLY_PRINTK_seattle := pl011,0xe1010000
+EARLY_PRINTK_sun6i := 8250,0x01c28000,2
+EARLY_PRINTK_sun7i := 8250,0x01c28000,2
+EARLY_PRINTK_thunderx := pl011,0x87e024000000
+EARLY_PRINTK_vexpress := pl011,0x1c090000
+EARLY_PRINTK_xgene-mcdivitt := 8250,0x1c021000,2
+EARLY_PRINTK_xgene-storm := 8250,0x1c020000,2
+EARLY_PRINTK_zynqmp := cadence,0xff000000
+
+ifneq ($(EARLY_PRINTK_$(CONFIG_EARLY_PRINTK)),)
+EARLY_PRINTK_CFG := $(subst $(comma), ,$(EARLY_PRINTK_$(CONFIG_EARLY_PRINTK)))
+else
+EARLY_PRINTK_CFG := $(subst $(comma), ,$(CONFIG_EARLY_PRINTK))
+endif
+
+# Extract configuration from string
+EARLY_PRINTK_INC := $(word 1,$(EARLY_PRINTK_CFG))
+EARLY_UART_BASE_ADDRESS := $(word 2,$(EARLY_PRINTK_CFG))
+
+# UART specific options
+ifeq ($(EARLY_PRINTK_INC),8250)
+EARLY_UART_REG_SHIFT := $(word 3,$(EARLY_PRINTK_CFG))
+endif
+ifeq ($(EARLY_PRINTK_INC),pl011)
+ifneq ($(word 3,$(EARLY_PRINTK_CFG)),)
EARLY_PRINTK_INIT_UART := y
-EARLY_PRINTK_BAUD := 115200
-EARLY_UART_BASE_ADDRESS := 0x1c090000
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), exynos5250)
-EARLY_PRINTK_INC := exynos4210
-EARLY_UART_BASE_ADDRESS := 0x12c20000
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), midway)
-EARLY_PRINTK_INC := pl011
-EARLY_UART_BASE_ADDRESS := 0xfff36000
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), omap5432)
-EARLY_PRINTK_INC := 8250
-EARLY_UART_BASE_ADDRESS := 0x48020000
-EARLY_UART_REG_SHIFT := 2
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), dra7)
-EARLY_PRINTK_INC := 8250
-EARLY_UART_BASE_ADDRESS := 0x4806A000
-EARLY_UART_REG_SHIFT := 2
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), sun6i)
-EARLY_PRINTK_INC := 8250
-EARLY_UART_BASE_ADDRESS := 0x01c28000
-EARLY_UART_REG_SHIFT := 2
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), sun7i)
-EARLY_PRINTK_INC := 8250
-EARLY_UART_BASE_ADDRESS := 0x01c28000
-EARLY_UART_REG_SHIFT := 2
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), brcm)
-EARLY_PRINTK_INC := 8250
-EARLY_UART_BASE_ADDRESS := 0xF040AB00
-EARLY_UART_REG_SHIFT := 2
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), xgene-storm)
-EARLY_PRINTK_INC := 8250
-EARLY_UART_BASE_ADDRESS := 0x1c020000
-EARLY_UART_REG_SHIFT := 2
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), xgene-mcdivitt)
-EARLY_PRINTK_INC := 8250
-EARLY_UART_BASE_ADDRESS := 0x1c021000
-EARLY_UART_REG_SHIFT := 2
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), juno)
-EARLY_PRINTK_INC := pl011
-EARLY_UART_BASE_ADDRESS := 0x7ff80000
-endif
-ifeq ($(CONFIG_EARLY_PRINTK), hip04-d01)
-EARLY_PRINTK_INC := 8250
-EARLY_UART_BASE_ADDRESS := 0xE4007000
-EARLY_UART_REG_SHIFT := 2
+EARLY_PRINTK_BAUD := $(word 3,$(EARLY_PRINTK_CFG))
endif
-ifeq ($(CONFIG_EARLY_PRINTK), seattle)
-EARLY_PRINTK_INC := pl011
-EARLY_UART_BASE_ADDRESS := 0xe1010000
endif
ifneq ($(EARLY_PRINTK_INC),)
diff --git a/xen/arch/arm/arm32/debug-8250.inc b/xen/arch/arm/arm32/debug-8250.inc
index eb25882..757ffd8 100644
--- a/xen/arch/arm/arm32/debug-8250.inc
+++ b/xen/arch/arm/arm32/debug-8250.inc
@@ -21,9 +21,9 @@
* rc: scratch register */
.macro early_uart_ready rb rc
1:
- ldr \rc, [\rb, #(UART_LSR << EARLY_UART_REG_SHIFT)] /* Read LSR */
- tst \rc, #UART_LSR_THRE /* Check Xmit holding register flag */
- beq 1b /* Wait for the UART to be ready */
+ ldr \rc, [\rb, #(UART_LSR << EARLY_UART_REG_SHIFT)] /* Read LSR */
+ tst \rc, #UART_LSR_THRE /* Check Xmit holding register flag */
+ beq 1b /* Wait for the UART to be ready */
.endm
/* 8250 UART transmit character
diff --git a/xen/arch/arm/arm32/debug-scif.inc b/xen/arch/arm/arm32/debug-scif.inc
new file mode 100644
index 0000000..ce85752
--- /dev/null
+++ b/xen/arch/arm/arm32/debug-scif.inc
@@ -0,0 +1,49 @@
+/*
+ * xen/arch/arm/arm32/debug-scif.inc
+ *
+ * SCIF specific debug code
+ *
+ * Oleksandr Tyshchenko <oleksandr.tyshchenko at globallogic.com>
+ * Copyright (C) 2014, Globallogic.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <asm/scif-uart.h>
+
+/* SCIF UART: wait for the UART to be ready to transmit
+ * rb: register which contains the UART base address
+ * rc: scratch register
+ */
+.macro early_uart_ready rb rc
+1:
+ ldrh \rc, [\rb, #SCIF_SCFSR] /* <- SCFSR (status register) */
+ tst \rc, #SCFSR_TDFE /* Check TDFE bit */
+ beq 1b /* Wait for the UART to be ready */
+.endm
+
+/* SCIF UART transmit character
+ * rb: register which contains the UART base address
+ * rt: register which contains the character to transmit
+ */
+.macro early_uart_transmit rb rt
+ strb \rt, [\rb, #SCIF_SCFTDR] /* -> SCFTDR (data register) */
+ ldrh \rt, [\rb, #SCIF_SCFSR] /* <- SCFSR (status register) */
+ and \rt, \rt, #(~(SCFSR_TEND | SCFSR_TDFE)) /* Clear TEND and TDFE bits */
+ strh \rt, [\rb, #SCIF_SCFSR] /* -> SCFSR (status register) */
+.endm
+
+/*
+ * Local variables:
+ * mode: ASM
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/arm/arm32/lib/lib1funcs.S b/xen/arch/arm/arm32/lib/lib1funcs.S
index 95ee312..1a2e6d2 100644
--- a/xen/arch/arm/arm32/lib/lib1funcs.S
+++ b/xen/arch/arm/arm32/lib/lib1funcs.S
@@ -28,9 +28,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
-along with this program; see the file COPYING. If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA. */
+along with this program; see the file COPYING. If not, see <http://www.gnu.org/licenses/>. */
#include <xen/config.h>
diff --git a/xen/arch/arm/arm32/lib/lshrdi3.S b/xen/arch/arm/arm32/lib/lshrdi3.S
index 3e8887e..35bd4d1 100644
--- a/xen/arch/arm/arm32/lib/lshrdi3.S
+++ b/xen/arch/arm/arm32/lib/lshrdi3.S
@@ -21,9 +21,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
-along with this program; see the file COPYING. If not, write to
-the Free Software Foundation, 51 Franklin Street, Fifth Floor,
-Boston, MA 02110-1301, USA. */
+along with this program; see the file COPYING. If not, see <http://www.gnu.org/licenses/>. */
#include <xen/config.h>
diff --git a/xen/arch/arm/arm64/debug-cadence.inc b/xen/arch/arm/arm64/debug-cadence.inc
new file mode 100644
index 0000000..84dee4c
--- /dev/null
+++ b/xen/arch/arm/arm64/debug-cadence.inc
@@ -0,0 +1,45 @@
+/*
+ * xen/arch/arm/arm64/debug-cadence.inc
+ *
+ * Cadence UART specific debug code
+ *
+ * Copyright (c) 2015 Xilinx Inc.
+ * Written by Edgar E. Iglesias.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <asm/asm_defns.h>
+#include <asm/cadence-uart.h>
+
+/* Cadence UART: wait for the UART to be ready to transmit
+ * xb: register which contains the UART base address
+ * c: scratch register number */
+.macro early_uart_ready xb, c
+1:
+ ldrh w\c, [\xb, #R_UART_SR]
+ tst w\c, #UART_SR_INTR_TFUL
+ b.ne 1b
+.endm
+
+/* Cadence UART transmit character
+ * xb: register which contains the UART base address
+ * wt: register which contains the character to transmit */
+.macro early_uart_transmit xb, wt
+ strb \wt, [\xb, #R_UART_TX]
+.endm
+
+/*
+ * Local variables:
+ * mode: ASM
+ * indent-tabs-mode: nil
+ * End:
+ */
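As with the SCIF variant, a short C sketch of the Cadence sequence (register names from asm/cadence-uart.h; the uintptr_t accessors are illustrative). Note the ready test is inverted with respect to the 8250 and SCIF paths: it spins while the TX FIFO is full rather than waiting for an empty flag.

    /* Polled Cadence transmit, mirroring the two macros above. */
    static void cadence_putc(uintptr_t base, char c)
    {
        volatile uint16_t *sr = (volatile uint16_t *)(base + R_UART_SR);
        volatile uint8_t  *tx = (volatile uint8_t  *)(base + R_UART_TX);

        while ( *sr & UART_SR_INTR_TFUL )
            ;                       /* spin while the TX FIFO is full */
        *tx = c;
    }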
diff --git a/xen/arch/arm/arm64/head.S b/xen/arch/arm/arm64/head.S
index 75b7ee1..9ed9a93 100644
--- a/xen/arch/arm/arm64/head.S
+++ b/xen/arch/arm/arm64/head.S
@@ -519,8 +519,7 @@ paging:
cbnz x22, 1f
/* Add UART to the fixmap table */
- ldr x1, =xen_fixmap
- add x1, x1, x20 /* x1 := paddr (xen_fixmap) */
+ ldr x1, =xen_fixmap /* x1 := vaddr (xen_fixmap) */
lsr x2, x23, #THIRD_SHIFT
lsl x2, x2, #THIRD_SHIFT /* 4K aligned paddr of UART */
mov x3, #PT_DEV_L3
diff --git a/xen/arch/arm/arm64/smpboot.c b/xen/arch/arm/arm64/smpboot.c
index 341cc77..62e6abb 100644
--- a/xen/arch/arm/arm64/smpboot.c
+++ b/xen/arch/arm/arm64/smpboot.c
@@ -38,7 +38,7 @@ static int __init smp_spin_table_cpu_up(int cpu)
sev();
- return cpu_up_send_sgi(cpu);
+ return 0;
}
static void __init smp_spin_table_init(int cpu, struct dt_device_node *dn)
diff --git a/xen/arch/arm/arm64/traps.c b/xen/arch/arm/arm64/traps.c
index 1693b5d..5a90cfa 100644
--- a/xen/arch/arm/arm64/traps.c
+++ b/xen/arch/arm/arm64/traps.c
@@ -24,11 +24,6 @@
#include <public/xen.h>
-asmlinkage void do_trap_serror(struct cpu_user_regs *regs)
-{
- panic("Unhandled serror trap");
-}
-
static const char *handler[]= {
"Synchronous Abort",
"IRQ",
@@ -38,11 +33,14 @@ static const char *handler[]= {
asmlinkage void do_bad_mode(struct cpu_user_regs *regs, int reason)
{
- uint64_t esr = READ_SYSREG64(ESR_EL2);
- printk("Bad mode in %s handler detected, code 0x%08"PRIx64"\n",
- handler[reason], esr);
+ union hsr hsr = { .bits = READ_SYSREG32(ESR_EL2) };
+
+ printk("Bad mode in %s handler detected\n", handler[reason]);
+ printk("ESR=0x%08"PRIx32": EC=%"PRIx32", IL=%"PRIx32", ISS=%"PRIx32"\n",
+ hsr.bits, hsr.ec, hsr.len, hsr.iss);
local_irq_disable();
+ show_execution_state(regs);
panic("bad mode");
}
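The rewritten do_bad_mode pulls the exception class, instruction length and syndrome out of ESR_EL2 via Xen's union hsr instead of dumping the raw register. A stand-alone sketch of that decode, assuming the architectural ESR layout (EC in bits [31:26], IL in bit [25], ISS in bits [24:0]), which is what the hsr bitfields model:

    /* Illustrative ESR_EL2 decode matching the printk above. */
    static void dump_esr(uint32_t esr)
    {
        uint32_t ec  = (esr >> 26) & 0x3f;      /* exception class */
        uint32_t il  = (esr >> 25) & 0x1;       /* 1 = 32-bit instruction */
        uint32_t iss = esr & 0x1ffffff;         /* instruction-specific syndrome */

        printk("ESR=0x%08"PRIx32": EC=%"PRIx32", IL=%"PRIx32", ISS=%"PRIx32"\n",
               esr, ec, il, iss);
    }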
diff --git a/xen/arch/arm/bootfdt.c b/xen/arch/arm/bootfdt.c
index e100233..74d208b 100644
--- a/xen/arch/arm/bootfdt.c
+++ b/xen/arch/arm/bootfdt.c
@@ -100,6 +100,7 @@ static int __init device_tree_for_each_node(const void *fdt,
node = fdt_next_node(fdt, node, &depth) )
{
const char *name = fdt_get_name(fdt, node, NULL);
+ u32 as, ss;
if ( depth >= DEVICE_TREE_MAX_DEPTH )
{
@@ -108,14 +109,15 @@ static int __init device_tree_for_each_node(const void *fdt,
continue;
}
- address_cells[depth] = device_tree_get_u32(fdt, node, "#address-cells",
- depth > 0 ? address_cells[depth-1] : 0);
- size_cells[depth] = device_tree_get_u32(fdt, node, "#size-cells",
- depth > 0 ? size_cells[depth-1] : 0);
+ as = depth > 0 ? address_cells[depth-1] : 0;
+ ss = depth > 0 ? size_cells[depth-1] : 0;
+ address_cells[depth] = device_tree_get_u32(fdt, node,
+ "#address-cells", as);
+ size_cells[depth] = device_tree_get_u32(fdt, node,
+ "#size-cells", ss);
- ret = func(fdt, node, name, depth,
- address_cells[depth-1], size_cells[depth-1], data);
+ ret = func(fdt, node, name, depth, as, ss, data);
if ( ret != 0 )
return ret;
}
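The point of this bootfdt.c change is subtle: a node's "reg" property is parsed with its parent's #address-cells/#size-cells, so the callback must see the values inherited from the parent (as/ss) rather than the ones the node declares for its own children. A hedged sketch of the parsing rule, using Xen's dt_next_cell helper:

    /* A "reg" entry is (as + ss) cells wide, where as/ss come from the
     * parent node -- the values now handed to func() above. */
    static void parse_reg_entry(const __be32 **cell, u32 as, u32 ss,
                                u64 *addr, u64 *size)
    {
        *addr = dt_next_cell(as, cell);
        *size = dt_next_cell(ss, cell);
    }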
diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
index 9d237f8..c6f49a5 100644
--- a/xen/arch/arm/decode.c
+++ b/xen/arch/arm/decode.c
@@ -78,7 +78,7 @@ static int decode_thumb2(register_t pc, struct hsr_dabt *dabt, uint16_t hw1)
return 0;
bad_thumb2:
- gdprintk(XENLOG_ERR, "unhandled THUMB2 instruction 0x%x%x\n", hw1, hw2);
+ gprintk(XENLOG_ERR, "unhandled THUMB2 instruction 0x%x%x\n", hw1, hw2);
return 1;
}
@@ -145,7 +145,7 @@ static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
return 0;
bad_thumb:
- gdprintk(XENLOG_ERR, "unhandled THUMB instruction 0x%x\n", instr);
+ gprintk(XENLOG_ERR, "unhandled THUMB instruction 0x%x\n", instr);
return 1;
}
@@ -155,7 +155,7 @@ int decode_instruction(const struct cpu_user_regs *regs, struct hsr_dabt *dabt)
return decode_thumb(regs->pc, dabt);
/* TODO: Handle ARM instruction */
- gdprintk(XENLOG_ERR, "unhandled ARM instruction\n");
+ gprintk(XENLOG_ERR, "unhandled ARM instruction\n");
return 1;
}
diff --git a/xen/arch/arm/device.c b/xen/arch/arm/device.c
index 59e94c0..0b53f6a 100644
--- a/xen/arch/arm/device.c
+++ b/xen/arch/arm/device.c
@@ -23,39 +23,22 @@
extern const struct device_desc _sdevice[], _edevice[];
-static bool_t __init device_is_compatible(const struct device_desc *desc,
- const struct dt_device_node *dev)
-{
- const char *const *compat;
-
- if ( !desc->compatible )
- return 0;
-
- for ( compat = desc->compatible; *compat; compat++ )
- {
- if ( dt_device_is_compatible(dev, *compat) )
- return 1;
- }
-
- return 0;
-}
-
-int __init device_init(struct dt_device_node *dev, enum device_type type,
+int __init device_init(struct dt_device_node *dev, enum device_class class,
const void *data)
{
const struct device_desc *desc;
ASSERT(dev != NULL);
- if ( !dt_device_is_available(dev) )
+ if ( !dt_device_is_available(dev) || dt_device_for_passthrough(dev) )
return -ENODEV;
for ( desc = _sdevice; desc != _edevice; desc++ )
{
- if ( desc->type != type )
+ if ( desc->class != class )
continue;
- if ( device_is_compatible(desc, dev) )
+ if ( dt_match_node(desc->dt_match, dev) )
{
ASSERT(desc->init != NULL);
@@ -67,7 +50,7 @@ int __init device_init(struct dt_device_node *dev, enum device_type type,
return -EBADF;
}
-enum device_type device_get_type(const struct dt_device_node *dev)
+enum device_class device_get_class(const struct dt_device_node *dev)
{
const struct device_desc *desc;
@@ -75,8 +58,8 @@ enum device_type device_get_type(const struct dt_device_node *dev)
for ( desc = _sdevice; desc != _edevice; desc++ )
{
- if ( device_is_compatible(desc, dev) )
- return desc->type;
+ if ( dt_match_node(desc->dt_match, dev) )
+ return desc->class;
}
return DEVICE_UNKNOWN;
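With device_is_compatible gone, drivers match through a standard dt_device_match table and are tagged with a device class. A sketch of what a registration might look like under the new scheme; the driver name, compatible string and init function are hypothetical, while DT_DEVICE_START/DT_DEVICE_END are the descriptor macros used by existing Xen drivers:

    static const struct dt_device_match myuart_dt_match[] __initconst =
    {
        DT_MATCH_COMPATIBLE("vendor,myuart"),   /* hypothetical binding */
        { /* sentinel */ },
    };

    static int __init myuart_init(struct dt_device_node *dev, const void *data)
    {
        return 0;                               /* probe the device here */
    }

    DT_DEVICE_START(myuart, "MYUART", DEVICE_SERIAL)
        .dt_match = myuart_dt_match,
        .init = myuart_init,
    DT_DEVICE_END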
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index d486632..b2bfc7d 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -501,11 +501,7 @@ int vcpu_initialise(struct vcpu *v)
v->arch.sctlr = SCTLR_GUEST_INIT;
- /*
- * By default exposes an SMP system with AFF0 set to the VCPU ID
- * TODO: Handle multi-threading processor and cluster
- */
- v->arch.vmpidr = MPIDR_SMP | (v->vcpu_id << MPIDR_AFF0_SHIFT);
+ v->arch.vmpidr = MPIDR_SMP | vcpuid_to_vaffinity(v->vcpu_id);
v->arch.actlr = READ_SYSREG32(ACTLR_EL1);
@@ -531,7 +527,8 @@ void vcpu_destroy(struct vcpu *v)
free_xenheap_pages(v->arch.stack, STACK_ORDER);
}
-int arch_domain_create(struct domain *d, unsigned int domcr_flags)
+int arch_domain_create(struct domain *d, unsigned int domcr_flags,
+ struct xen_arch_domainconfig *config)
{
int rc;
@@ -541,6 +538,7 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
if ( is_idle_domain(d) )
return 0;
+ ASSERT(config != NULL);
if ( (rc = p2m_init(d)) != 0 )
goto fail;
@@ -561,19 +559,57 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
if ( (rc = p2m_alloc_table(d)) != 0 )
goto fail;
- if ( (rc = gicv_setup(d)) != 0 )
+ switch ( config->gic_version )
+ {
+ case XEN_DOMCTL_CONFIG_GIC_NATIVE:
+ switch ( gic_hw_version () )
+ {
+ case GIC_V2:
+ config->gic_version = XEN_DOMCTL_CONFIG_GIC_V2;
+ d->arch.vgic.version = GIC_V2;
+ break;
+
+ case GIC_V3:
+ config->gic_version = XEN_DOMCTL_CONFIG_GIC_V3;
+ d->arch.vgic.version = GIC_V3;
+ break;
+
+ default:
+ BUG();
+ }
+ break;
+
+ case XEN_DOMCTL_CONFIG_GIC_V2:
+ d->arch.vgic.version = GIC_V2;
+ break;
+
+ case XEN_DOMCTL_CONFIG_GIC_V3:
+ d->arch.vgic.version = GIC_V3;
+ break;
+
+ default:
+ rc = -EOPNOTSUPP;
goto fail;
+ }
- if ( (rc = domain_vgic_init(d)) != 0 )
+ if ( (rc = domain_vgic_init(d, config->nr_spis)) != 0 )
goto fail;
- if ( (rc = domain_vtimer_init(d)) != 0 )
+ if ( (rc = domain_vtimer_init(d, config)) != 0 )
goto fail;
- if ( d->domain_id )
+ /*
+ * The hardware domain will get a PPI later in
+ * arch/arm/domain_build.c depending on the
+ * interrupt map of the hardware.
+ */
+ if ( !is_hardware_domain(d) )
+ {
d->arch.evtchn_irq = GUEST_EVTCHN_PPI;
- else
- d->arch.evtchn_irq = platform_dom0_evtchn_ppi();
+ /* At this stage vgic_reserve_virq should never fail */
+ if ( !vgic_reserve_virq(d, GUEST_EVTCHN_PPI) )
+ BUG();
+ }
/*
* Virtual UART is only used by linux early printk and decompress code.
@@ -761,8 +797,12 @@ int domain_relinquish_resources(struct domain *d)
switch ( d->arch.relmem )
{
case RELMEM_not_started:
+ ret = iommu_release_dt_devices(d);
+ if ( ret )
+ return ret;
+
d->arch.relmem = RELMEM_xen;
- /* Falltrough */
+ /* Fallthrough */
case RELMEM_xen:
ret = relinquish_memory(d, &d->xenpage_list);
@@ -804,7 +844,7 @@ void arch_dump_domain_info(struct domain *d)
}
-long do_arm_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
+long do_arm_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
{
switch ( cmd )
{
@@ -850,6 +890,20 @@ void vcpu_block_unless_event_pending(struct vcpu *v)
vcpu_unblock(current);
}
+unsigned int domain_max_vcpus(const struct domain *d)
+{
+ /*
+ * Since evtchn_init would call domain_max_vcpus for poll_mask
+ * allocation when the vgic_ops haven't been initialised yet,
+ * we return MAX_VIRT_CPUS if d->arch.vgic.handler is null.
+ */
+ if ( !d->arch.vgic.handler )
+ return MAX_VIRT_CPUS;
+ else
+ return min_t(unsigned int, MAX_VIRT_CPUS,
+ d->arch.vgic.handler->max_vcpus);
+}
+
/*
* Local variables:
* mode: C
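arch_domain_create now takes its GIC and SPI configuration from a xen_arch_domainconfig instead of hard-wiring the host layout. A minimal sketch of a caller, assuming only the fields visible above (gic_version, nr_spis); passing XEN_DOMCTL_CONFIG_GIC_NATIVE asks Xen to pick the host's GIC version and write the chosen one back:

    /* Hypothetical wrapper showing how the config above is consumed. */
    static int create_domain_with_native_gic(struct domain *d)
    {
        struct xen_arch_domainconfig config = {
            .gic_version = XEN_DOMCTL_CONFIG_GIC_NATIVE, /* let Xen choose */
            .nr_spis     = 0,                            /* no extra SPIs */
        };
        int rc = arch_domain_create(d, 0 /* domcr_flags */, &config);

        /* On success, config.gic_version reports the emulated version
         * (XEN_DOMCTL_CONFIG_GIC_V2 or XEN_DOMCTL_CONFIG_GIC_V3). */
        return rc;
    }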
diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index da868e3..a059de6 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -21,6 +21,7 @@
#include <asm/gic.h>
#include <xen/irq.h>
+#include <xen/grant_table.h>
#include "kernel.h"
static unsigned int __initdata opt_dom0_max_vcpus;
@@ -402,7 +403,7 @@ static int write_properties(struct domain *d, struct kernel_info *kinfo,
const struct dt_device_node *node)
{
const char *bootargs = NULL;
- const struct dt_property *prop;
+ const struct dt_property *prop, *status = NULL;
int res = 0;
int had_dom0_bootargs = 0;
@@ -424,6 +425,7 @@ static int write_properties(struct domain *d, struct kernel_info *kinfo,
* bootargs (from module #1, above).
* * remove bootargs, xen,dom0-bootargs, xen,xen-bootargs,
* linux,initrd-start and linux,initrd-end.
+ * * remove stdout-path.
* * remove bootargs, linux,uefi-system-table,
* linux,uefi-mmap-start, linux,uefi-mmap-size,
* linux,uefi-mmap-desc-size, and linux,uefi-mmap-desc-ver
@@ -434,6 +436,7 @@ static int write_properties(struct domain *d, struct kernel_info *kinfo,
if ( dt_property_name_is_equal(prop, "xen,xen-bootargs") ||
dt_property_name_is_equal(prop, "linux,initrd-start") ||
dt_property_name_is_equal(prop, "linux,initrd-end") ||
+ dt_property_name_is_equal(prop, "stdout-path") ||
dt_property_name_is_equal(prop, "linux,uefi-system-table") ||
dt_property_name_is_equal(prop, "linux,uefi-mmap-start") ||
dt_property_name_is_equal(prop, "linux,uefi-mmap-size") ||
@@ -455,6 +458,17 @@ static int write_properties(struct domain *d, struct kernel_info *kinfo,
}
}
+ /* Don't expose the property "xen,passthrough" to the guest */
+ if ( dt_property_name_is_equal(prop, "xen,passthrough") )
+ continue;
+
+ /* Remember and skip the status property as Xen may modify it later */
+ if ( dt_property_name_is_equal(prop, "status") )
+ {
+ status = prop;
+ continue;
+ }
+
res = fdt_property(kinfo->fdt, prop->name, prop_data, prop_len);
xfree(new_data);
@@ -463,6 +477,19 @@ static int write_properties(struct domain *d, struct kernel_info *kinfo,
return res;
}
+ /*
+ * Override the property "status" to disable the device when it's
+ * marked for passthrough.
+ */
+ if ( dt_device_for_passthrough(node) )
+ res = fdt_property_string(kinfo->fdt, "status", "disabled");
+ else if ( status )
+ res = fdt_property(kinfo->fdt, "status", status->value,
+ status->length);
+
+ if ( res )
+ return res;
+
if ( dt_node_path_is_equal(node, "/chosen") )
{
const struct bootmodule *mod = kinfo->initrd_bootmodule;
@@ -579,8 +606,8 @@ static int make_memory_node(const struct domain *d,
return res;
}
-static int make_hypervisor_node(struct domain *d,
- void *fdt, const struct dt_device_node *parent)
+static int make_hypervisor_node(const struct kernel_info *kinfo,
+ const struct dt_device_node *parent)
{
const char compat[] =
"xen,xen-"__stringify(XEN_VERSION)"."__stringify(XEN_SUBVERSION)"\0"
@@ -589,9 +616,10 @@ static int make_hypervisor_node(struct domain *d,
gic_interrupt_t intr;
__be32 *cells;
int res;
+ /* Convenience alias */
int addrcells = dt_n_addr_cells(parent);
int sizecells = dt_n_size_cells(parent);
- paddr_t gnttab_start, gnttab_size;
+ void *fdt = kinfo->fdt;
DPRINT("Create hypervisor node\n");
@@ -613,28 +641,19 @@ static int make_hypervisor_node(struct domain *d,
if ( res )
return res;
- platform_dom0_gnttab(&gnttab_start, &gnttab_size);
- DPRINT(" Grant table range: %#"PRIpaddr"-%#"PRIpaddr"\n",
- gnttab_start, gnttab_start + gnttab_size);
/* reg 0 is grant table space */
cells = &reg[0];
- dt_set_range(&cells, parent, gnttab_start, gnttab_size);
+ dt_set_range(&cells, parent, kinfo->gnttab_start, kinfo->gnttab_size);
res = fdt_property(fdt, "reg", reg,
dt_cells_to_size(addrcells + sizecells));
if ( res )
return res;
/*
- * interrupts is evtchn upcall:
- * - Active-low level-sensitive
- * - All cpus
- *
- * TODO: Handle correctly the cpumask
+ * Placeholder for the event channel interrupt. The values will be
+ * replaced later.
*/
- DPRINT(" Event channel interrupt to %u\n", d->arch.evtchn_irq);
- set_interrupt_ppi(intr, d->arch.evtchn_irq, 0xf,
- DT_IRQ_TYPE_LEVEL_LOW);
-
+ set_interrupt_ppi(intr, ~0, 0xf, DT_IRQ_TYPE_INVALID);
res = fdt_property_interrupts(fdt, &intr, 1);
if ( res )
return res;
@@ -692,6 +711,7 @@ static int make_cpus_node(const struct domain *d, void *fdt,
char buf[15];
u32 clock_frequency;
bool_t clock_valid;
+ uint64_t mpidr_aff;
DPRINT("Create cpus node\n");
@@ -741,9 +761,16 @@ static int make_cpus_node(const struct domain *d, void *fdt,
for ( cpu = 0; cpu < d->max_vcpus; cpu++ )
{
- DPRINT("Create cpu@%u node\n", cpu);
+ /*
+ * According to the ARM CPUs bindings, the reg field should match
+ * the MPIDR's affinity bits. For now we use AFF0 and AFF1 when
+ * constructing the guest's reg value, as that is enough for the
+ * current maximum number of vCPUs.
+ */
+ mpidr_aff = vcpuid_to_vaffinity(cpu);
+ DPRINT("Create cpu@%"PRIx64" (logical CPUID: %d) node\n", mpidr_aff, cpu);
- snprintf(buf, sizeof(buf), "cpu@%u", cpu);
+ snprintf(buf, sizeof(buf), "cpu@%"PRIx64, mpidr_aff);
res = fdt_begin_node(fdt, buf);
if ( res )
return res;
@@ -756,7 +783,7 @@ static int make_cpus_node(const struct domain *d, void *fdt,
if ( res )
return res;
- res = fdt_property_cell(fdt, "reg", cpu);
+ res = fdt_property_cell(fdt, "reg", mpidr_aff);
if ( res )
return res;
@@ -788,8 +815,8 @@ static int make_gic_node(const struct domain *d, void *fdt,
{
const struct dt_device_node *gic = dt_interrupt_controller;
int res = 0;
- const void *addrcells;
- u32 addrcells_len;
+ const void *addrcells, *sizecells;
+ u32 addrcells_len, sizecells_len;
/*
* Xen currently supports only a single GIC. Discard any secondary
@@ -803,7 +830,7 @@ static int make_gic_node(const struct domain *d, void *fdt,
DPRINT("Create gic node\n");
- res = gic_make_node(d, node, fdt);
+ res = fdt_begin_node(fdt, "interrupt-controller");
if ( res )
return res;
@@ -827,6 +854,26 @@ static int make_gic_node(const struct domain *d, void *fdt,
return res;
}
+ sizecells = dt_get_property(gic, "#size-cells", &sizecells_len);
+ if ( sizecells )
+ {
+ res = fdt_property(fdt, "#size-cells", sizecells, sizecells_len);
+ if ( res )
+ return res;
+ }
+
+ res = fdt_property_cell(fdt, "#interrupt-cells", 3);
+ if ( res )
+ return res;
+
+ res = fdt_property(fdt, "interrupt-controller", NULL, 0);
+ if ( res )
+ return res;
+
+ res = gic_make_hwdom_dt_node(d, node, fdt);
+ if ( res )
+ return res;
+
res = fdt_end_node(fdt);
return res;
@@ -907,23 +954,164 @@ static int make_timer_node(const struct domain *d, void *fdt,
return res;
}
-/* Map the device in the domain */
-static int map_device(struct domain *d, struct dt_device_node *dev)
+static int map_irq_to_domain(const struct dt_device_node *dev,
+ struct domain *d, unsigned int irq)
+{
+ bool_t need_mapping = !dt_device_for_passthrough(dev);
+ int res;
+
+ res = irq_permit_access(d, irq);
+ if ( res )
+ {
+ printk(XENLOG_ERR "Unable to permit to dom%u access to IRQ %u\n",
+ d->domain_id, irq);
+ return res;
+ }
+
+ if ( need_mapping )
+ {
+ /*
+ * Checking the return of vgic_reserve_virq is not
+ * necessary. It should not fail except when we try to map
+ * the IRQ twice. This can legitimately happen if the IRQ is shared
+ */
+ vgic_reserve_virq(d, irq);
+
+ res = route_irq_to_guest(d, irq, irq, dt_node_name(dev));
+ if ( res < 0 )
+ {
+ printk(XENLOG_ERR "Unable to map IRQ%"PRId32" to dom%d\n",
+ irq, d->domain_id);
+ return res;
+ }
+ }
+
+ DPRINT(" - IRQ: %u\n", irq);
+ return 0;
+}
+
+static int map_dt_irq_to_domain(const struct dt_device_node *dev,
+ const struct dt_irq *dt_irq,
+ void *data)
+{
+ struct domain *d = data;
+ unsigned int irq = dt_irq->irq;
+ int res;
+
+ if ( irq < NR_LOCAL_IRQS )
+ {
+ printk(XENLOG_ERR "%s: IRQ%"PRId32" is not a SPI\n",
+ dt_node_name(dev), irq);
+ return -EINVAL;
+ }
+
+ /* Setup the IRQ type */
+ res = irq_set_spi_type(irq, dt_irq->type);
+ if ( res )
+ {
+ printk(XENLOG_ERR
+ "%s: Unable to setup IRQ%"PRId32" to dom%d\n",
+ dt_node_name(dev), irq, d->domain_id);
+ return res;
+ }
+
+ res = map_irq_to_domain(dev, d, irq);
+
+ return res;
+}
+
+static int map_range_to_domain(const struct dt_device_node *dev,
+ u64 addr, u64 len,
+ void *data)
+{
+ struct domain *d = data;
+ bool_t need_mapping = !dt_device_for_passthrough(dev);
+ int res;
+
+ res = iomem_permit_access(d, paddr_to_pfn(addr),
+ paddr_to_pfn(PAGE_ALIGN(addr + len - 1)));
+ if ( res )
+ {
+ printk(XENLOG_ERR "Unable to permit to dom%d access to"
+ " 0x%"PRIx64" - 0x%"PRIx64"\n",
+ d->domain_id,
+ addr & PAGE_MASK, PAGE_ALIGN(addr + len) - 1);
+ return res;
+ }
+
+ if ( need_mapping )
+ {
+ res = map_mmio_regions(d,
+ paddr_to_pfn(addr),
+ DIV_ROUND_UP(len, PAGE_SIZE),
+ paddr_to_pfn(addr));
+ if ( res < 0 )
+ {
+ printk(XENLOG_ERR "Unable to map 0x%"PRIx64
+ " - 0x%"PRIx64" in domain %d\n",
+ addr & PAGE_MASK, PAGE_ALIGN(addr + len) - 1,
+ d->domain_id);
+ return res;
+ }
+ }
+
+ DPRINT(" - MMIO: %010"PRIx64" - %010"PRIx64"\n", addr, addr + len);
+
+ return 0;
+}
+
+/*
+ * For a node which describes a discoverable bus (such as a PCI bus),
+ * we may need to perform additional mappings in order to make
+ * the child resources available to domain 0.
+ */
+static int map_device_children(struct domain *d,
+ const struct dt_device_node *dev)
+{
+ int ret;
+
+ if ( dt_device_type_is_equal(dev, "pci") )
+ {
+ DPRINT("Mapping children of %s to guest\n", dt_node_full_name(dev));
+
+ ret = dt_for_each_irq_map(dev, &map_dt_irq_to_domain, d);
+ if ( ret < 0 )
+ return ret;
+
+ ret = dt_for_each_range(dev, &map_range_to_domain, d);
+ if ( ret < 0 )
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * For a given device node:
+ * - Give permission to the guest to manage IRQs and MMIO ranges
+ * - Retrieve the IRQ configuration (i.e. edge/level) from the device tree
+ * When the device is not marked for guest passthrough:
+ * - Assign the device to the guest if it's protected by an IOMMU
+ * - Map the IRQs and iomem regions to DOM0
+ */
+static int handle_device(struct domain *d, struct dt_device_node *dev)
{
unsigned int nirq;
unsigned int naddr;
unsigned int i;
int res;
- unsigned int irq;
struct dt_raw_irq rirq;
u64 addr, size;
+ bool_t need_mapping = !dt_device_for_passthrough(dev);
nirq = dt_number_of_irq(dev);
naddr = dt_number_of_address(dev);
- DPRINT("%s nirq = %d naddr = %u\n", dt_node_full_name(dev), nirq, naddr);
+ DPRINT("%s passthrough = %d nirq = %d naddr = %u\n", dt_node_full_name(dev),
+ need_mapping, nirq, naddr);
- if ( dt_device_is_protected(dev) )
+ if ( dt_device_is_protected(dev) && need_mapping )
{
DPRINT("%s setup iommu\n", dt_node_full_name(dev));
res = iommu_assign_dt_device(d, dev);
@@ -935,7 +1123,7 @@ static int map_device(struct domain *d, struct dt_device_node *dev)
}
}
- /* Map IRQs */
+ /* Give permission and map IRQs */
for ( i = 0; i < nirq; i++ )
{
res = dt_device_get_raw_irq(dev, i, &rirq);
@@ -965,19 +1153,12 @@ static int map_device(struct domain *d, struct dt_device_node *dev)
return res;
}
- irq = res;
-
- DPRINT("irq %u = %u\n", i, irq);
- res = route_irq_to_guest(d, irq, dt_node_name(dev));
+ res = map_irq_to_domain(dev, d, res);
if ( res )
- {
- printk(XENLOG_ERR "Unable to route IRQ %u to domain %u\n",
- irq, d->domain_id);
return res;
- }
}
- /* Map the address ranges */
+ /* Give permission and map MMIOs */
for ( i = 0; i < naddr; i++ )
{
res = dt_device_get_address(dev, i, &addr, &size);
@@ -988,33 +1169,15 @@ static int map_device(struct domain *d, struct dt_device_node *dev)
return res;
}
- DPRINT("addr %u = 0x%"PRIx64" - 0x%"PRIx64"\n",
- i, addr, addr + size - 1);
-
- res = iomem_permit_access(d, paddr_to_pfn(addr & PAGE_MASK),
- paddr_to_pfn(PAGE_ALIGN(addr + size - 1)));
+ res = map_range_to_domain(dev, addr, size, d);
if ( res )
- {
- printk(XENLOG_ERR "Unable to permit to dom%d access to"
- " 0x%"PRIx64" - 0x%"PRIx64"\n",
- d->domain_id,
- addr & PAGE_MASK, PAGE_ALIGN(addr + size) - 1);
return res;
- }
- res = map_mmio_regions(d,
- paddr_to_pfn(addr & PAGE_MASK),
- DIV_ROUND_UP(size, PAGE_SIZE),
- paddr_to_pfn(addr & PAGE_MASK));
- if ( res )
- {
- printk(XENLOG_ERR "Unable to map 0x%"PRIx64
- " - 0x%"PRIx64" in domain %d\n",
- addr & PAGE_MASK, PAGE_ALIGN(addr + size) - 1,
- d->domain_id);
- return res;
- }
}
+ res = map_device_children(d, dev);
+ if ( res )
+ return res;
+
return 0;
}
@@ -1027,18 +1190,16 @@ static int handle_node(struct domain *d, struct kernel_info *kinfo,
DT_MATCH_COMPATIBLE("xen,multiboot-module"),
DT_MATCH_COMPATIBLE("multiboot,module"),
DT_MATCH_COMPATIBLE("arm,psci"),
+ DT_MATCH_COMPATIBLE("arm,psci-0.2"),
+ DT_MATCH_COMPATIBLE("arm,cortex-a7-pmu"),
+ DT_MATCH_COMPATIBLE("arm,cortex-a15-pmu"),
+ DT_MATCH_COMPATIBLE("arm,armv8-pmuv3"),
DT_MATCH_PATH("/cpus"),
DT_MATCH_TYPE("memory"),
/* The memory mapped timer is not supported by Xen. */
DT_MATCH_COMPATIBLE("arm,armv7-timer-mem"),
{ /* sentinel */ },
};
- static const struct dt_device_match gic_matches[] __initconst =
- {
- DT_MATCH_GIC_V2,
- DT_MATCH_GIC_V3,
- { /* sentinel */ },
- };
static const struct dt_device_match timer_matches[] __initconst =
{
DT_MATCH_TIMER,
@@ -1067,7 +1228,7 @@ static int handle_node(struct domain *d, struct kernel_info *kinfo,
/* Replace these nodes with our own. Note that the original may be
* used_by DOMID_XEN so this check comes first. */
- if ( dt_match_node(gic_matches, node) )
+ if ( device_get_class(node) == DEVICE_GIC )
return make_gic_node(d, kinfo->fdt, node);
if ( dt_match_node(timer_matches, node) )
return make_timer_node(d, kinfo->fdt, node);
@@ -1082,28 +1243,15 @@ static int handle_node(struct domain *d, struct kernel_info *kinfo,
/* Even if the IOMMU device is not used by Xen, it should not be
* passthrough to DOM0
*/
- if ( device_get_type(node) == DEVICE_IOMMU )
+ if ( device_get_class(node) == DEVICE_IOMMU )
{
DPRINT(" IOMMU, skip it\n");
return 0;
}
- /*
- * Some device doesn't need to be mapped in Xen:
- * - Memory: the guest will see a different view of memory. It will
- * be allocated later.
- * - Disabled device: Linux is able to cope with status="disabled"
- * property. Therefore these device doesn't need to be mapped. This
- * solution can be use later for pass through.
- */
- if ( !dt_device_type_is_equal(node, "memory") &&
- dt_device_is_available(node) )
- {
- res = map_device(d, node);
-
- if ( res )
- return res;
- }
+ res = handle_device(d, node);
+ if ( res )
+ return res;
/*
* The property "name" is used to have a different name on older FDT
@@ -1130,7 +1278,7 @@ static int handle_node(struct domain *d, struct kernel_info *kinfo,
if ( node == dt_host )
{
- res = make_hypervisor_node(d, kinfo->fdt, node);
+ res = make_hypervisor_node(kinfo, node);
if ( res )
return res;
@@ -1260,7 +1408,7 @@ static void initrd_load(struct kernel_info *kinfo)
return;
}
- dst = map_domain_page(ma>>PAGE_SHIFT);
+ dst = map_domain_page(_mfn(paddr_to_pfn(ma)));
copy_from_paddr(dst + s, paddr + offs, l);
@@ -1269,6 +1417,74 @@ static void initrd_load(struct kernel_info *kinfo)
}
}
+static void evtchn_fixup(struct domain *d, struct kernel_info *kinfo)
+{
+ int res, node;
+ gic_interrupt_t intr;
+
+ /*
+ * The allocation of the event channel IRQ has been deferred until
+ * now. At this time, all PPIs used by DOM0 have been registered.
+ */
+ res = vgic_allocate_ppi(d);
+ if ( res < 0 )
+ panic("Unable to allocate a PPI for the event channel interrupt\n");
+
+ d->arch.evtchn_irq = res;
+
+ printk("Allocating PPI %u for event channel interrupt\n",
+ d->arch.evtchn_irq);
+
+ /* Fix up "interrupts" in /hypervisor node */
+ node = fdt_path_offset(kinfo->fdt, "/hypervisor");
+ if ( node < 0 )
+ panic("Cannot find the /hypervisor node");
+
+ /* The event channel upcall interrupt:
+ * - Active-low level-sensitive
+ * - All CPUs
+ *
+ * TODO: Handle the cpumask properly
+ */
+ set_interrupt_ppi(intr, d->arch.evtchn_irq, 0xf,
+ DT_IRQ_TYPE_LEVEL_LOW);
+ res = fdt_setprop_inplace(kinfo->fdt, node, "interrupts",
+ &intr, sizeof(intr));
+ if ( res )
+ panic("Cannot fix up \"interrupts\" property of the hypervisor node");
+}
+
+static void __init find_gnttab_region(struct domain *d,
+ struct kernel_info *kinfo)
+{
+ /*
+ * The region used by Xen in memory will never be mapped into the
+ * DOM0 memory layout. Therefore it can be used for the grant table.
+ *
+ * Only use the text section, as it's always present and will contain
+ * enough space for a large grant table.
+ */
+ kinfo->gnttab_start = __pa(_stext);
+ kinfo->gnttab_size = (_etext - _stext) & PAGE_MASK;
+
+ /* Make sure the grant table will fit in the region */
+ if ( (kinfo->gnttab_size >> PAGE_SHIFT) < max_grant_frames )
+ panic("Cannot find a space for the grant table region\n");
+
+#ifdef CONFIG_ARM_32
+ /*
+ * The gnttab region must be under 4GB in order to work with DOM0
+ * using short page tables.
+ * In practice it's always the case because Xen is always located
+ * below 4GB, but be safe.
+ */
+ BUG_ON((kinfo->gnttab_start + kinfo->gnttab_size) > GB(4));
+#endif
+
+ printk("Grant table range: %#"PRIpaddr"-%#"PRIpaddr"\n",
+ kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size);
+}
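A quick worked example of the fit check above: one grant frame needs one page, so a region of S bytes backs S >> PAGE_SHIFT frames. With 4KiB pages, a 1MiB _stext.._etext section yields 256 frames, comfortably above a max_grant_frames of 32 (the assumed default):

    /* Sketch: frames a candidate grant-table region can back. */
    static inline unsigned long gnttab_frames_in(paddr_t size)
    {
        return size >> PAGE_SHIFT;      /* one grant frame per page */
    }
    /* e.g. gnttab_frames_in(SZ_1M) == 256 with 4KiB pages */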
+
int construct_dom0(struct domain *d)
{
struct kernel_info kinfo = {};
@@ -1306,6 +1522,7 @@ int construct_dom0(struct domain *d)
#endif
allocate_memory(d, &kinfo);
+ find_gnttab_region(d, &kinfo);
rc = prepare_dtb(d, &kinfo);
if ( rc < 0 )
@@ -1330,6 +1547,8 @@ int construct_dom0(struct domain *d)
kernel_load(&kinfo);
/* initrd_load will fix up the fdt, so call it before dtb_load */
initrd_load(&kinfo);
+ /* Allocate the event channel IRQ and fix up the device tree */
+ evtchn_fixup(d, &kinfo);
dtb_load(&kinfo);
/* Now that we are done restore the original p2m and current. */
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index d246e84..30453d8 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -10,8 +10,8 @@
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/hypercall.h>
-#include <asm/gic.h>
-#include <xen/guest_access.h>
+#include <xen/iocap.h>
+#include <xsm/xsm.h>
#include <public/domctl.h>
long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
@@ -32,42 +32,93 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
return p2m_cache_flush(d, s, e);
}
- case XEN_DOMCTL_arm_configure_domain:
+ case XEN_DOMCTL_bind_pt_irq:
{
- uint8_t gic_version;
+ int rc;
+ xen_domctl_bind_pt_irq_t *bind = &domctl->u.bind_pt_irq;
+ uint32_t irq = bind->u.spi.spi;
+ uint32_t virq = bind->machine_irq;
+
+ /* We only support PT_IRQ_TYPE_SPI */
+ if ( bind->irq_type != PT_IRQ_TYPE_SPI )
+ return -EOPNOTSUPP;
+
+ /*
+ * XXX: For now map the interrupt 1:1. Supporting anything else
+ * will require modifying the domain_pirq_to_irq macro.
+ */
+ if ( irq != virq )
+ return -EINVAL;
/*
- * Currently the vGIC is emulating the same version of the
- * hardware GIC. Only the value XEN_DOMCTL_CONFIG_GIC_DEFAULT
- * is allowed. The DOMCTL will return the actual version of the
- * GIC.
+ * ARM doesn't require separating IRQ assignment into 2
+ * hypercalls (PHYSDEVOP_map_pirq and DOMCTL_bind_pt_irq).
+ *
+ * Call xsm_map_domain_irq in order to keep the same XSM checks
+ * done by the 2 hypercalls for consistency with other
+ * architectures.
*/
- if ( domctl->u.configuredomain.gic_version != XEN_DOMCTL_CONFIG_GIC_DEFAULT )
+ rc = xsm_map_domain_irq(XSM_HOOK, d, irq, NULL);
+ if ( rc )
+ return rc;
+
+ rc = xsm_bind_pt_irq(XSM_HOOK, d, bind);
+ if ( rc )
+ return rc;
+
+ if ( !irq_access_permitted(current->domain, irq) )
+ return -EPERM;
+
+ if ( !vgic_reserve_virq(d, virq) )
+ return -EBUSY;
+
+ rc = route_irq_to_guest(d, virq, irq, "routed IRQ");
+ if ( rc )
+ vgic_free_virq(d, virq);
+
+ return rc;
+ }
+ case XEN_DOMCTL_unbind_pt_irq:
+ {
+ int rc;
+ xen_domctl_bind_pt_irq_t *bind = &domctl->u.bind_pt_irq;
+ uint32_t irq = bind->u.spi.spi;
+ uint32_t virq = bind->machine_irq;
+
+ /* We only support PT_IRQ_TYPE_SPI */
+ if ( bind->irq_type != PT_IRQ_TYPE_SPI )
return -EOPNOTSUPP;
- switch ( gic_hw_version() )
- {
- case GIC_V3:
- gic_version = XEN_DOMCTL_CONFIG_GIC_V3;
- break;
- case GIC_V2:
- gic_version = XEN_DOMCTL_CONFIG_GIC_V2;
- break;
- default:
- BUG();
- }
+ /* For now map the interrupt 1:1 */
+ if ( irq != virq )
+ return -EINVAL;
+
+ rc = xsm_unbind_pt_irq(XSM_HOOK, d, bind);
+ if ( rc )
+ return rc;
+
+ if ( !irq_access_permitted(current->domain, irq) )
+ return -EPERM;
- domctl->u.configuredomain.gic_version = gic_version;
+ rc = release_guest_irq(d, virq);
+ if ( rc )
+ return rc;
- /* TODO: Make the copy generic for all ARCH domctl */
- if ( __copy_to_guest(u_domctl, domctl, 1) )
- return -EFAULT;
+ vgic_free_virq(d, virq);
return 0;
}
-
default:
- return subarch_do_domctl(domctl, d, u_domctl);
+ {
+ int rc;
+
+ rc = subarch_do_domctl(domctl, d, u_domctl);
+
+ if ( rc == -ENOSYS )
+ rc = iommu_do_domctl(domctl, d, u_domctl);
+
+ return rc;
+ }
}
}
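These two DOMCTLs are what an ARM toolstack uses to route a host SPI into a guest. A sketch of the payload for binding host SPI 50, mirroring the checks above; the 1:1 restriction means machine_irq must equal u.spi.spi, and guest_domid is a hypothetical placeholder:

    struct xen_domctl domctl = {
        .cmd    = XEN_DOMCTL_bind_pt_irq,
        .domain = guest_domid,              /* hypothetical target domain */
        .u.bind_pt_irq = {
            .machine_irq = 50,              /* virtual IRQ seen by the guest */
            .irq_type    = PT_IRQ_TYPE_SPI, /* only SPIs are supported */
            .u.spi.spi   = 50,              /* host SPI; must equal machine_irq */
        },
    };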
diff --git a/xen/arch/arm/efi/efi-boot.h b/xen/arch/arm/efi/efi-boot.h
index 639942d..e427e5f 100644
--- a/xen/arch/arm/efi/efi-boot.h
+++ b/xen/arch/arm/efi/efi-boot.h
@@ -6,8 +6,10 @@
#include <xen/device_tree.h>
#include <xen/libfdt/libfdt.h>
#include <asm/setup.h>
+#include <asm/smp.h>
void noreturn efi_xen_start(void *fdt_ptr, uint32_t fdt_size);
+void __flush_dcache_area(const void *vaddr, unsigned long size);
#define DEVICE_TREE_GUID \
{0xb1b621d5, 0xf19c, 0x41a5, {0x83, 0x0b, 0xd9, 0x15, 0x2c, 0x69, 0xaa, 0xe0}}
@@ -104,7 +106,7 @@ static int __init fdt_set_reg(void *fdt, int node, int addr_cells,
static void __init *lookup_fdt_config_table(EFI_SYSTEM_TABLE *sys_table)
{
- const EFI_GUID fdt_guid = DEVICE_TREE_GUID;
+ static const EFI_GUID __initconst fdt_guid = DEVICE_TREE_GUID;
EFI_CONFIGURATION_TABLE *tables;
void *fdt = NULL;
int i;
@@ -131,19 +133,20 @@ static EFI_STATUS __init efi_process_memory_map_bootinfo(EFI_MEMORY_DESCRIPTOR *
for ( Index = 0; Index < (mmap_size / desc_size); Index++ )
{
- if ( desc_ptr->Type == EfiConventionalMemory
- || desc_ptr->Type == EfiBootServicesCode
- || desc_ptr->Type == EfiBootServicesData )
+ if ( desc_ptr->Type == EfiConventionalMemory ||
+ (!map_bs &&
+ (desc_ptr->Type == EfiBootServicesCode ||
+ desc_ptr->Type == EfiBootServicesData)) )
{
- bootinfo.mem.bank[i].start = desc_ptr->PhysicalStart;
- bootinfo.mem.bank[i].size = desc_ptr->NumberOfPages * EFI_PAGE_SIZE;
- if ( ++i >= NR_MEM_BANKS )
+ if ( i >= NR_MEM_BANKS )
{
- PrintStr(L"Warning: All ");
- DisplayUint(NR_MEM_BANKS, -1);
- PrintStr(L" bootinfo mem banks exhausted.\r\n");
+ PrintStr(L"Warning: All " __stringify(NR_MEM_BANKS)
+ " bootinfo mem banks exhausted.\r\n");
break;
}
+ bootinfo.mem.bank[i].start = desc_ptr->PhysicalStart;
+ bootinfo.mem.bank[i].size = desc_ptr->NumberOfPages * EFI_PAGE_SIZE;
+ ++i;
}
desc_ptr = NextMemoryDescriptor(desc_ptr, desc_size);
}
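The restructured loop changes which EFI regions count as reclaimable RAM: conventional memory always does, while boot-services code/data only count when the (assumed) map_bs option is off; when it is set, those regions stay reserved instead of being handed to the allocator. The predicate, as a sketch:

    /* RAM test used by the loop above. */
    static bool_t is_reclaimable_ram(UINT32 type, bool_t map_bs)
    {
        if ( type == EfiConventionalMemory )
            return 1;
        if ( map_bs )
            return 0;   /* keep boot-services regions reserved */
        return type == EfiBootServicesCode || type == EfiBootServicesData;
    }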
@@ -334,7 +337,7 @@ static void __init efi_arch_process_memory_map(EFI_SYSTEM_TABLE *SystemTable,
status = fdt_add_uefi_nodes(SystemTable, fdt, map, map_size, desc_size,
desc_ver);
if ( EFI_ERROR(status) )
- PrintErrMesg(L"Updating FDT failed\r\n", status);
+ PrintErrMesg(L"Updating FDT failed", status);
}
static void __init efi_arch_pre_exit_boot(void)
@@ -370,16 +373,14 @@ static void __init efi_arch_cfg_file_late(EFI_FILE_HANDLE dir_handle, char *sect
{
}
-static void *__init efi_arch_allocate_mmap_buffer(UINTN *map_size)
+static void *__init efi_arch_allocate_mmap_buffer(UINTN map_size)
{
void *ptr;
EFI_STATUS status;
- UINTN map_size_alloc = *map_size + EFI_PAGE_SIZE;
- status = efi_bs->AllocatePool(EfiLoaderData, map_size_alloc, &ptr);
+ status = efi_bs->AllocatePool(EfiLoaderData, map_size, &ptr);
if ( status != EFI_SUCCESS )
return NULL;
- *map_size = map_size_alloc;
return ptr;
}
@@ -408,7 +409,7 @@ static void __init efi_arch_handle_cmdline(CHAR16 *image_name,
status = efi_bs->AllocatePool(EfiBootServicesData, EFI_PAGE_SIZE, (void **)&buf);
if ( EFI_ERROR(status) )
- PrintErrMesg(L"Unable to allocate string buffer\r\n", status);
+ PrintErrMesg(L"Unable to allocate string buffer", status);
if ( image_name )
{
@@ -524,6 +525,11 @@ static void __init efi_arch_blexit(void)
efi_bs->FreePool(memmap);
}
+static void __init efi_arch_halt(void)
+{
+ stop_cpu();
+}
+
static void __init efi_arch_load_addr_check(EFI_LOADED_IMAGE *loaded_image)
{
if ( (unsigned long)loaded_image->ImageBase & ((1 << 12) - 1) )
@@ -566,6 +572,12 @@ static void __init efi_arch_video_init(EFI_GRAPHICS_OUTPUT_PROTOCOL *gop,
EFI_GRAPHICS_OUTPUT_MODE_INFORMATION *mode_info)
{
}
+
+static void efi_arch_flush_dcache_area(const void *vaddr, UINTN size)
+{
+ __flush_dcache_area(vaddr, size);
+}
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/arm/gic-v2.c b/xen/arch/arm/gic-hip04.c
similarity index 56%
copy from xen/arch/arm/gic-v2.c
copy to xen/arch/arm/gic-hip04.c
index 31fb81a..c5ed545 100644
--- a/xen/arch/arm/gic-v2.c
+++ b/xen/arch/arm/gic-hip04.c
@@ -1,7 +1,8 @@
/*
- * xen/arch/arm/gic-v2.c
+ * xen/arch/arm/gic-hip04.c
*
- * ARM Generic Interrupt Controller support v2
+ * Generic Interrupt Controller for HiSilicon Hip04 platform
+ * Based heavily on gic-v2.c (id 3bcf563fec26378f7f4cf1e2ad0d4d5b3f341919)
*
* Tim Deegan <tim at xen.org>
* Copyright (c) 2011 Citrix Systems.
@@ -28,6 +29,7 @@
#include <xen/list.h>
#include <xen/device_tree.h>
#include <xen/libfdt/libfdt.h>
+#include <xen/sizes.h>
#include <asm/p2m.h>
#include <asm/domain.h>
#include <asm/platform.h>
@@ -63,13 +65,9 @@
/* Global state */
static struct {
- paddr_t dbase; /* Address of distributor registers */
void __iomem * map_dbase; /* IO mapped Address of distributor registers */
- paddr_t cbase; /* Address of CPU interface registers */
void __iomem * map_cbase[2]; /* IO mapped Address of CPU interface registers */
- paddr_t hbase; /* Address of virtual interface registers */
void __iomem * map_hbase; /* IO Address of virtual interface registers */
- paddr_t vbase; /* Address of virtual cpu interface registers */
spinlock_t lock;
} gicv2;
@@ -79,16 +77,26 @@ static struct gic_info gicv2_info;
* logical CPU numbering. Let's use mapping as returned by the GIC
* itself
*/
-static DEFINE_PER_CPU(u8, gic_cpu_id);
+static DEFINE_PER_CPU(u16, gic_cpu_id);
/* Maximum cpu interface per GIC */
-#define NR_GIC_CPU_IF 8
+#define NR_GIC_CPU_IF 16
+
+#define HIP04_GICD_SGI_TARGET_SHIFT 8
+
+#define HIP04_GICH_APR 0x70
+#define HIP04_GICH_LR 0x80
static inline void writeb_gicd(uint8_t val, unsigned int offset)
{
writeb_relaxed(val, gicv2.map_dbase + offset);
}
+static inline void writew_gicd(uint16_t val, unsigned int offset)
+{
+ writew_relaxed(val, gicv2.map_dbase + offset);
+}
+
static inline void writel_gicd(uint32_t val, unsigned int offset)
{
writel_relaxed(val, gicv2.map_dbase + offset);
@@ -123,7 +131,7 @@ static inline uint32_t readl_gich(int unsigned offset)
return readl_relaxed(gicv2.map_hbase + offset);
}
-static unsigned int gicv2_cpu_mask(const cpumask_t *cpumask)
+static unsigned int hip04gic_cpu_mask(const cpumask_t *cpumask)
{
unsigned int cpu;
unsigned int mask = 0;
@@ -139,7 +147,7 @@ static unsigned int gicv2_cpu_mask(const cpumask_t *cpumask)
return mask;
}
-static void gicv2_save_state(struct vcpu *v)
+static void hip04gic_save_state(struct vcpu *v)
{
int i;
@@ -148,27 +156,27 @@ static void gicv2_save_state(struct vcpu *v)
* accessed simultaneously by another pCPU.
*/
for ( i = 0; i < gicv2_info.nr_lrs; i++ )
- v->arch.gic.v2.lr[i] = readl_gich(GICH_LR + i * 4);
+ v->arch.gic.v2.lr[i] = readl_gich(HIP04_GICH_LR + i * 4);
- v->arch.gic.v2.apr = readl_gich(GICH_APR);
+ v->arch.gic.v2.apr = readl_gich(HIP04_GICH_APR);
v->arch.gic.v2.vmcr = readl_gich(GICH_VMCR);
/* Disable until next VCPU scheduled */
writel_gich(0, GICH_HCR);
}
-static void gicv2_restore_state(const struct vcpu *v)
+static void hip04gic_restore_state(const struct vcpu *v)
{
int i;
for ( i = 0; i < gicv2_info.nr_lrs; i++ )
- writel_gich(v->arch.gic.v2.lr[i], GICH_LR + i * 4);
+ writel_gich(v->arch.gic.v2.lr[i], HIP04_GICH_LR + i * 4);
- writel_gich(v->arch.gic.v2.apr, GICH_APR);
+ writel_gich(v->arch.gic.v2.apr, HIP04_GICH_APR);
writel_gich(v->arch.gic.v2.vmcr, GICH_VMCR);
writel_gich(GICH_HCR_EN, GICH_HCR);
}
-static void gicv2_dump_state(const struct vcpu *v)
+static void hip04gic_dump_state(const struct vcpu *v)
{
int i;
@@ -176,7 +184,7 @@ static void gicv2_dump_state(const struct vcpu *v)
{
for ( i = 0; i < gicv2_info.nr_lrs; i++ )
printk(" HW_LR[%d]=%x\n", i,
- readl_gich(GICH_LR + i * 4));
+ readl_gich(HIP04_GICH_LR + i * 4));
}
else
{
@@ -185,20 +193,20 @@ static void gicv2_dump_state(const struct vcpu *v)
}
}
-static void gicv2_eoi_irq(struct irq_desc *irqd)
+static void hip04gic_eoi_irq(struct irq_desc *irqd)
{
int irq = irqd->irq;
/* Lower the priority */
writel_gicc(irq, GICC_EOIR);
}
-static void gicv2_dir_irq(struct irq_desc *irqd)
+static void hip04gic_dir_irq(struct irq_desc *irqd)
{
/* Deactivate */
writel_gicc(irqd->irq, GICC_DIR);
}
-static unsigned int gicv2_read_irq(void)
+static unsigned int hip04gic_read_irq(void)
{
return (readl_gicc(GICC_IAR) & GICC_IA_IRQ);
}
@@ -207,12 +215,12 @@ static unsigned int gicv2_read_irq(void)
* needs to be called with a valid cpu_mask, ie each cpu in the mask has
* already called gic_cpu_init
*/
-static void gicv2_set_irq_properties(struct irq_desc *desc,
+static void hip04gic_set_irq_properties(struct irq_desc *desc,
const cpumask_t *cpu_mask,
unsigned int priority)
{
- uint32_t cfg, edgebit;
- unsigned int mask = gicv2_cpu_mask(cpu_mask);
+ uint32_t cfg, actual, edgebit;
+ unsigned int mask = hip04gic_cpu_mask(cpu_mask);
unsigned int irq = desc->irq;
unsigned int type = desc->arch.type;
@@ -229,63 +237,80 @@ static void gicv2_set_irq_properties(struct irq_desc *desc,
cfg |= edgebit;
writel_gicd(cfg, GICD_ICFGR + (irq / 16) * 4);
+ actual = readl_gicd(GICD_ICFGR + (irq / 16) * 4);
+ if ( ( cfg & edgebit ) ^ ( actual & edgebit ) )
+ {
+ printk(XENLOG_WARNING "GIC-HIP04: WARNING: "
+ "CPU%d: Failed to configure IRQ%u as %s-triggered. "
+ "H/w forces to %s-triggered.\n",
+ smp_processor_id(), desc->irq,
+ cfg & edgebit ? "Edge" : "Level",
+ actual & edgebit ? "Edge" : "Level");
+ desc->arch.type = actual & edgebit ?
+ DT_IRQ_TYPE_EDGE_RISING :
+ DT_IRQ_TYPE_LEVEL_HIGH;
+ }
+
/* Set target CPU mask (RAZ/WI on uniprocessor) */
- writeb_gicd(mask, GICD_ITARGETSR + irq);
+ writew_gicd(mask, GICD_ITARGETSR + irq * 2);
/* Set priority */
writeb_gicd(priority, GICD_IPRIORITYR + irq);
spin_unlock(&gicv2.lock);
}
-static void __init gicv2_dist_init(void)
+static void __init hip04gic_dist_init(void)
{
uint32_t type;
uint32_t cpumask;
uint32_t gic_cpus;
+ unsigned int nr_lines;
int i;
- cpumask = readl_gicd(GICD_ITARGETSR) & 0xff;
- cpumask |= cpumask << 8;
+ cpumask = readl_gicd(GICD_ITARGETSR) & 0xffff;
cpumask |= cpumask << 16;
/* Disable the distributor */
writel_gicd(0, GICD_CTLR);
type = readl_gicd(GICD_TYPER);
- gicv2_info.nr_lines = 32 * ((type & GICD_TYPE_LINES) + 1);
- gic_cpus = 1 + ((type & GICD_TYPE_CPUS) >> 5);
- printk("GICv2: %d lines, %d cpu%s%s (IID %8.8x).\n",
- gicv2_info.nr_lines, gic_cpus, (gic_cpus == 1) ? "" : "s",
+ nr_lines = 32 * ((type & GICD_TYPE_LINES) + 1);
+ gic_cpus = 16;
+ printk("GIC-HIP04: %d lines, %d cpu%s%s (IID %8.8x).\n",
+ nr_lines, gic_cpus, (gic_cpus == 1) ? "" : "s",
(type & GICD_TYPE_SEC) ? ", secure" : "",
readl_gicd(GICD_IIDR));
/* Default all global IRQs to level, active low */
- for ( i = 32; i < gicv2_info.nr_lines; i += 16 )
+ for ( i = 32; i < nr_lines; i += 16 )
writel_gicd(0x0, GICD_ICFGR + (i / 16) * 4);
/* Route all global IRQs to this CPU */
- for ( i = 32; i < gicv2_info.nr_lines; i += 4 )
- writel_gicd(cpumask, GICD_ITARGETSR + (i / 4) * 4);
+ for ( i = 32; i < nr_lines; i += 2 )
+ writel_gicd(cpumask, GICD_ITARGETSR + (i / 2) * 4);
/* Default priority for global interrupts */
- for ( i = 32; i < gicv2_info.nr_lines; i += 4 )
+ for ( i = 32; i < nr_lines; i += 4 )
writel_gicd(GIC_PRI_IRQ << 24 | GIC_PRI_IRQ << 16 |
GIC_PRI_IRQ << 8 | GIC_PRI_IRQ,
GICD_IPRIORITYR + (i / 4) * 4);
/* Disable all global interrupts */
- for ( i = 32; i < gicv2_info.nr_lines; i += 32 )
+ for ( i = 32; i < nr_lines; i += 32 )
writel_gicd(~0x0, GICD_ICENABLER + (i / 32) * 4);
+ /* Only 1020 interrupts are supported */
+ gicv2_info.nr_lines = min(1020U, nr_lines);
+
/* Turn on the distributor */
writel_gicd(GICD_CTL_ENABLE, GICD_CTLR);
}
-static void __cpuinit gicv2_cpu_init(void)
+static void __cpuinit hip04gic_cpu_init(void)
{
int i;
- this_cpu(gic_cpu_id) = readl_gicd(GICD_ITARGETSR) & 0xff;
+ this_cpu(gic_cpu_id) = readl_gicd(GICD_ITARGETSR) & 0xffff;
/* The first 32 interrupts (PPI and SGI) are banked per-cpu, so
* even though they are controlled with GICD registers, they must
@@ -314,12 +339,12 @@ static void __cpuinit gicv2_cpu_init(void)
writel_gicc(GICC_CTL_ENABLE|GICC_CTL_EOI, GICC_CTLR);
}
-static void gicv2_cpu_disable(void)
+static void hip04gic_cpu_disable(void)
{
writel_gicc(0x0, GICC_CTLR);
}
-static void __cpuinit gicv2_hyp_init(void)
+static void __cpuinit hip04gic_hyp_init(void)
{
uint32_t vtr;
uint8_t nr_lrs;
@@ -327,28 +352,26 @@ static void __cpuinit gicv2_hyp_init(void)
vtr = readl_gich(GICH_VTR);
nr_lrs = (vtr & GICH_V2_VTR_NRLRGS) + 1;
gicv2_info.nr_lrs = nr_lrs;
-
- writel_gich(GICH_MISR_EOI, GICH_MISR);
}
-static void __cpuinit gicv2_hyp_disable(void)
+static void __cpuinit hip04gic_hyp_disable(void)
{
writel_gich(0, GICH_HCR);
}
-static int gicv2_secondary_cpu_init(void)
+static int hip04gic_secondary_cpu_init(void)
{
spin_lock(&gicv2.lock);
- gicv2_cpu_init();
- gicv2_hyp_init();
+ hip04gic_cpu_init();
+ hip04gic_hyp_init();
spin_unlock(&gicv2.lock);
return 0;
}
-static void gicv2_send_SGI(enum gic_sgi sgi, enum gic_sgi_mode irqmode,
+static void hip04gic_send_SGI(enum gic_sgi sgi, enum gic_sgi_mode irqmode,
const cpumask_t *cpu_mask)
{
unsigned int mask = 0;
@@ -364,9 +387,9 @@ static void gicv2_send_SGI(enum gic_sgi sgi, enum gic_sgi_mode irqmode,
break;
case SGI_TARGET_LIST:
cpumask_and(&online_mask, cpu_mask, &cpu_online_map);
- mask = gicv2_cpu_mask(&online_mask);
+ mask = hip04gic_cpu_mask(&online_mask);
writel_gicd(GICD_SGI_TARGET_LIST |
- (mask << GICD_SGI_TARGET_SHIFT) | sgi,
+ (mask << HIP04_GICD_SGI_TARGET_SHIFT) | sgi,
GICD_SGIR);
break;
default:
@@ -375,15 +398,15 @@ static void gicv2_send_SGI(enum gic_sgi sgi, enum gic_sgi_mode irqmode,
}
/* Shut down the per-CPU GIC interface */
-static void gicv2_disable_interface(void)
+static void hip04gic_disable_interface(void)
{
spin_lock(&gicv2.lock);
- gicv2_cpu_disable();
- gicv2_hyp_disable();
+ hip04gic_cpu_disable();
+ hip04gic_hyp_disable();
spin_unlock(&gicv2.lock);
}
-static void gicv2_update_lr(int lr, const struct pending_irq *p,
+static void hip04gic_update_lr(int lr, const struct pending_irq *p,
unsigned int state)
{
uint32_t lr_reg;
@@ -397,68 +420,22 @@ static void gicv2_update_lr(int lr, const struct pending_irq *p,
((p->irq & GICH_V2_LR_VIRTUAL_MASK) << GICH_V2_LR_VIRTUAL_SHIFT));
if ( p->desc != NULL )
- {
- if ( platform_has_quirk(PLATFORM_QUIRK_GUEST_PIRQ_NEED_EOI) )
- lr_reg |= GICH_V2_LR_MAINTENANCE_IRQ;
- else
- lr_reg |= GICH_V2_LR_HW | ((p->desc->irq & GICH_V2_LR_PHYSICAL_MASK )
- << GICH_V2_LR_PHYSICAL_SHIFT);
- }
-
- writel_gich(lr_reg, GICH_LR + lr * 4);
-}
+ lr_reg |= GICH_V2_LR_HW | ((p->desc->irq & GICH_V2_LR_PHYSICAL_MASK )
+ << GICH_V2_LR_PHYSICAL_SHIFT);
-static void gicv2_clear_lr(int lr)
-{
- writel_gich(0, GICH_LR + lr * 4);
+ writel_gich(lr_reg, HIP04_GICH_LR + lr * 4);
}
-static int gicv2v_setup(struct domain *d)
+static void hip04gic_clear_lr(int lr)
{
- int ret;
-
- /*
- * The hardware domain gets the hardware address.
- * Guests get the virtual platform layout.
- */
- if ( is_hardware_domain(d) )
- {
- d->arch.vgic.dbase = gicv2.dbase;
- d->arch.vgic.cbase = gicv2.cbase;
- }
- else
- {
- d->arch.vgic.dbase = GUEST_GICD_BASE;
- d->arch.vgic.cbase = GUEST_GICC_BASE;
- }
-
- /*
- * Map the gic virtual cpu interface in the gic cpu interface
- * region of the guest.
- *
- * The second page is always mapped at +4K irrespective of the
- * GIC_64K_STRIDE quirk. The DTB passed to the guest reflects this.
- */
- ret = map_mmio_regions(d, paddr_to_pfn(d->arch.vgic.cbase), 1,
- paddr_to_pfn(gicv2.vbase));
- if ( ret )
- return ret;
-
- if ( !platform_has_quirk(PLATFORM_QUIRK_GIC_64K_STRIDE) )
- ret = map_mmio_regions(d, paddr_to_pfn(d->arch.vgic.cbase + PAGE_SIZE),
- 2, paddr_to_pfn(gicv2.vbase + PAGE_SIZE));
- else
- ret = map_mmio_regions(d, paddr_to_pfn(d->arch.vgic.cbase + PAGE_SIZE),
- 2, paddr_to_pfn(gicv2.vbase + 16*PAGE_SIZE));
-
- return ret;
+ writel_gich(0, HIP04_GICH_LR + lr * 4);
}
-static void gicv2_read_lr(int lr, struct gic_lr *lr_reg)
+static void hip04gic_read_lr(int lr, struct gic_lr *lr_reg)
{
uint32_t lrv;
- lrv = readl_gich(GICH_LR + lr * 4);
+ lrv = readl_gich(HIP04_GICH_LR + lr * 4);
lr_reg->pirq = (lrv >> GICH_V2_LR_PHYSICAL_SHIFT) & GICH_V2_LR_PHYSICAL_MASK;
lr_reg->virq = (lrv >> GICH_V2_LR_VIRTUAL_SHIFT) & GICH_V2_LR_VIRTUAL_MASK;
lr_reg->priority = (lrv >> GICH_V2_LR_PRIORITY_SHIFT) & GICH_V2_LR_PRIORITY_MASK;
@@ -467,7 +444,7 @@ static void gicv2_read_lr(int lr, struct gic_lr *lr_reg)
lr_reg->grp = (lrv >> GICH_V2_LR_GRP_SHIFT) & GICH_V2_LR_GRP_MASK;
}
-static void gicv2_write_lr(int lr, const struct gic_lr *lr_reg)
+static void hip04gic_write_lr(int lr, const struct gic_lr *lr_reg)
{
uint32_t lrv = 0;
@@ -481,10 +458,10 @@ static void gicv2_write_lr(int lr, const struct gic_lr *lr_reg)
<< GICH_V2_LR_HW_SHIFT) |
((uint32_t)(lr_reg->grp & GICH_V2_LR_GRP_MASK) << GICH_V2_LR_GRP_SHIFT) );
- writel_gich(lrv, GICH_LR + lr * 4);
+ writel_gich(lrv, HIP04_GICH_LR + lr * 4);
}
-static void gicv2_hcr_status(uint32_t flag, bool_t status)
+static void hip04gic_hcr_status(uint32_t flag, bool_t status)
{
uint32_t hcr = readl_gich(GICH_HCR);
@@ -496,18 +473,18 @@ static void gicv2_hcr_status(uint32_t flag, bool_t status)
writel_gich(hcr, GICH_HCR);
}
-static unsigned int gicv2_read_vmcr_priority(void)
+static unsigned int hip04gic_read_vmcr_priority(void)
{
return ((readl_gich(GICH_VMCR) >> GICH_V2_VMCR_PRIORITY_SHIFT)
& GICH_V2_VMCR_PRIORITY_MASK);
}
-static unsigned int gicv2_read_apr(int apr_reg)
+static unsigned int hip04gic_read_apr(int apr_reg)
{
- return readl_gich(GICH_APR);
+ return readl_gich(HIP04_GICH_APR);
}
-static void gicv2_irq_enable(struct irq_desc *desc)
+static void hip04gic_irq_enable(struct irq_desc *desc)
{
unsigned long flags;
int irq = desc->irq;
@@ -522,7 +499,7 @@ static void gicv2_irq_enable(struct irq_desc *desc)
spin_unlock_irqrestore(&gicv2.lock, flags);
}
-static void gicv2_irq_disable(struct irq_desc *desc)
+static void hip04gic_irq_disable(struct irq_desc *desc)
{
unsigned long flags;
int irq = desc->irq;
@@ -536,39 +513,39 @@ static void gicv2_irq_disable(struct irq_desc *desc)
spin_unlock_irqrestore(&gicv2.lock, flags);
}
-static unsigned int gicv2_irq_startup(struct irq_desc *desc)
+static unsigned int hip04gic_irq_startup(struct irq_desc *desc)
{
- gicv2_irq_enable(desc);
+ hip04gic_irq_enable(desc);
return 0;
}
-static void gicv2_irq_shutdown(struct irq_desc *desc)
+static void hip04gic_irq_shutdown(struct irq_desc *desc)
{
- gicv2_irq_disable(desc);
+ hip04gic_irq_disable(desc);
}
-static void gicv2_irq_ack(struct irq_desc *desc)
+static void hip04gic_irq_ack(struct irq_desc *desc)
{
/* No ACK -- reading IAR has done this for us */
}
-static void gicv2_host_irq_end(struct irq_desc *desc)
+static void hip04gic_host_irq_end(struct irq_desc *desc)
{
/* Lower the priority */
- gicv2_eoi_irq(desc);
+ hip04gic_eoi_irq(desc);
/* Deactivate */
- gicv2_dir_irq(desc);
+ hip04gic_dir_irq(desc);
}
-static void gicv2_guest_irq_end(struct irq_desc *desc)
+static void hip04gic_guest_irq_end(struct irq_desc *desc)
{
/* Lower the priority of the IRQ */
- gicv2_eoi_irq(desc);
+ hip04gic_eoi_irq(desc);
/* Deactivation happens in maintenance interrupt / via GICV */
}
-static void gicv2_irq_set_affinity(struct irq_desc *desc, const cpumask_t *cpu_mask)
+static void hip04gic_irq_set_affinity(struct irq_desc *desc, const cpumask_t *cpu_mask)
{
unsigned int mask;
@@ -576,203 +553,203 @@ static void gicv2_irq_set_affinity(struct irq_desc *desc, const cpumask_t *cpu_m
spin_lock(&gicv2.lock);
- mask = gicv2_cpu_mask(cpu_mask);
+ mask = hip04gic_cpu_mask(cpu_mask);
/* Set target CPU mask (RAZ/WI on uniprocessor) */
- writeb_gicd(mask, GICD_ITARGETSR + desc->irq);
+ writew_gicd(mask, GICD_ITARGETSR + desc->irq * 2);
spin_unlock(&gicv2.lock);
}
-static int gicv2_make_dt_node(const struct domain *d,
- const struct dt_device_node *node, void *fdt)
+static int hip04gic_make_hwdom_dt_node(const struct domain *d,
+ const struct dt_device_node *node,
+ void *fdt)
{
const struct dt_device_node *gic = dt_interrupt_controller;
- const void *compatible = NULL;
+ const void *compatible;
u32 len;
- __be32 *new_cells, *tmp;
+ const __be32 *regs;
int res = 0;
- compatible = dt_get_property(gic, "compatible", &len);
- if ( !compatible )
- {
- dprintk(XENLOG_ERR, "Can't find compatible property for the gic node\n");
- return -FDT_ERR_XEN(ENOENT);
- }
-
- res = fdt_begin_node(fdt, "interrupt-controller");
- if ( res )
- return res;
+ /*
+ * Replace the compatibility string with a standard one, so dom0
+ * will see a compatible GIC. This works because the GICC is
+ * compatible with the standard one and the GICD (emulated by Xen)
+ * is compatible with the standard one. Otherwise we would have to
+ * implement the HIP04 GICD in the virtual GIC.
+ * This actually limits the CPU number to 8 for dom0.
+ */
+ compatible = DT_COMPAT_GIC_CORTEX_A15;
+ len = strlen((char*) compatible) + 1;
res = fdt_property(fdt, "compatible", compatible, len);
if ( res )
return res;
- res = fdt_property_cell(fdt, "#interrupt-cells", 3);
- if ( res )
- return res;
-
- res = fdt_property(fdt, "interrupt-controller", NULL, 0);
-
- if ( res )
- return res;
+ /*
+ * The DTB provides up to 4 regions to handle virtualization
+ * (in order: the GICD, GICC, GICH and GICV interfaces);
+ * however, dom0 just needs the GICD and GICC provided by Xen.
+ */
+ regs = dt_get_property(gic, "reg", &len);
+ if ( !regs )
+ {
+ dprintk(XENLOG_ERR, "Can't find reg property for the gic node\n");
+ return -FDT_ERR_XEN(ENOENT);
+ }
len = dt_cells_to_size(dt_n_addr_cells(node) + dt_n_size_cells(node));
- len *= 2; /* GIC has two memory regions: Distributor + CPU interface */
- new_cells = xzalloc_bytes(len);
- if ( new_cells == NULL )
- return -FDT_ERR_XEN(ENOMEM);
+ len *= 2;
- tmp = new_cells;
- dt_set_range(&tmp, node, d->arch.vgic.dbase, PAGE_SIZE);
- dt_set_range(&tmp, node, d->arch.vgic.cbase, PAGE_SIZE * 2);
-
- res = fdt_property(fdt, "reg", new_cells, len);
- xfree(new_cells);
+ res = fdt_property(fdt, "reg", regs, len);
return res;
}
/* XXX different for level vs edge */
-static hw_irq_controller gicv2_host_irq_type = {
- .typename = "gic-v2",
- .startup = gicv2_irq_startup,
- .shutdown = gicv2_irq_shutdown,
- .enable = gicv2_irq_enable,
- .disable = gicv2_irq_disable,
- .ack = gicv2_irq_ack,
- .end = gicv2_host_irq_end,
- .set_affinity = gicv2_irq_set_affinity,
-};
-
-static hw_irq_controller gicv2_guest_irq_type = {
- .typename = "gic-v2",
- .startup = gicv2_irq_startup,
- .shutdown = gicv2_irq_shutdown,
- .enable = gicv2_irq_enable,
- .disable = gicv2_irq_disable,
- .ack = gicv2_irq_ack,
- .end = gicv2_guest_irq_end,
- .set_affinity = gicv2_irq_set_affinity,
+static hw_irq_controller hip04gic_host_irq_type = {
+ .typename = "gic-hip04",
+ .startup = hip04gic_irq_startup,
+ .shutdown = hip04gic_irq_shutdown,
+ .enable = hip04gic_irq_enable,
+ .disable = hip04gic_irq_disable,
+ .ack = hip04gic_irq_ack,
+ .end = hip04gic_host_irq_end,
+ .set_affinity = hip04gic_irq_set_affinity,
};
-const static struct gic_hw_operations gicv2_ops = {
- .info = &gicv2_info,
- .secondary_init = gicv2_secondary_cpu_init,
- .save_state = gicv2_save_state,
- .restore_state = gicv2_restore_state,
- .dump_state = gicv2_dump_state,
- .gicv_setup = gicv2v_setup,
- .gic_host_irq_type = &gicv2_host_irq_type,
- .gic_guest_irq_type = &gicv2_guest_irq_type,
- .eoi_irq = gicv2_eoi_irq,
- .deactivate_irq = gicv2_dir_irq,
- .read_irq = gicv2_read_irq,
- .set_irq_properties = gicv2_set_irq_properties,
- .send_SGI = gicv2_send_SGI,
- .disable_interface = gicv2_disable_interface,
- .update_lr = gicv2_update_lr,
- .update_hcr_status = gicv2_hcr_status,
- .clear_lr = gicv2_clear_lr,
- .read_lr = gicv2_read_lr,
- .write_lr = gicv2_write_lr,
- .read_vmcr_priority = gicv2_read_vmcr_priority,
- .read_apr = gicv2_read_apr,
- .make_dt_node = gicv2_make_dt_node,
+static hw_irq_controller hip04gic_guest_irq_type = {
+ .typename = "gic-hip04",
+ .startup = hip04gic_irq_startup,
+ .shutdown = hip04gic_irq_shutdown,
+ .enable = hip04gic_irq_enable,
+ .disable = hip04gic_irq_disable,
+ .ack = hip04gic_irq_ack,
+ .end = hip04gic_guest_irq_end,
+ .set_affinity = hip04gic_irq_set_affinity,
};
-/* Set up the GIC */
-static int __init gicv2_init(struct dt_device_node *node, const void *data)
+static int __init hip04gic_init(void)
{
int res;
+ paddr_t hbase, dbase, cbase, vbase;
+ const struct dt_device_node *node = gicv2_info.node;
- dt_device_set_used_by(node, DOMID_XEN);
-
- res = dt_device_get_address(node, 0, &gicv2.dbase, NULL);
- if ( res || !gicv2.dbase || (gicv2.dbase & ~PAGE_MASK) )
- panic("GICv2: Cannot find a valid address for the distributor");
+ res = dt_device_get_address(node, 0, &dbase, NULL);
+ if ( res )
+ panic("GIC-HIP04: Cannot find a valid address for the distributor");
- res = dt_device_get_address(node, 1, &gicv2.cbase, NULL);
- if ( res || !gicv2.cbase || (gicv2.cbase & ~PAGE_MASK) )
- panic("GICv2: Cannot find a valid address for the CPU");
+ res = dt_device_get_address(node, 1, &cbase, NULL);
+ if ( res )
+ panic("GIC-HIP04: Cannot find a valid address for the CPU");
- res = dt_device_get_address(node, 2, &gicv2.hbase, NULL);
- if ( res || !gicv2.hbase || (gicv2.hbase & ~PAGE_MASK) )
- panic("GICv2: Cannot find a valid address for the hypervisor");
+ res = dt_device_get_address(node, 2, &hbase, NULL);
+ if ( res )
+ panic("GIC-HIP04: Cannot find a valid address for the hypervisor");
- res = dt_device_get_address(node, 3, &gicv2.vbase, NULL);
- if ( res || !gicv2.vbase || (gicv2.vbase & ~PAGE_MASK) )
- panic("GICv2: Cannot find a valid address for the virtual CPU");
+ res = dt_device_get_address(node, 3, &vbase, NULL);
+ if ( res )
+ panic("GIC-HIP04: Cannot find a valid address for the virtual CPU");
res = platform_get_irq(node, 0);
if ( res < 0 )
- panic("GICv2: Cannot find the maintenance IRQ");
+ panic("GIC-HIP04: Cannot find the maintenance IRQ");
gicv2_info.maintenance_irq = res;
- /* Set the GIC as the primary interrupt controller */
- dt_interrupt_controller = node;
-
/* TODO: Add check on distributor, cpu size */
- printk("GICv2 initialization:\n"
+ printk("GIC-HIP04 initialization:\n"
" gic_dist_addr=%"PRIpaddr"\n"
" gic_cpu_addr=%"PRIpaddr"\n"
" gic_hyp_addr=%"PRIpaddr"\n"
" gic_vcpu_addr=%"PRIpaddr"\n"
" gic_maintenance_irq=%u\n",
- gicv2.dbase, gicv2.cbase, gicv2.hbase, gicv2.vbase,
+ dbase, cbase, hbase, vbase,
gicv2_info.maintenance_irq);
- if ( (gicv2.dbase & ~PAGE_MASK) || (gicv2.cbase & ~PAGE_MASK) ||
- (gicv2.hbase & ~PAGE_MASK) || (gicv2.vbase & ~PAGE_MASK) )
- panic("GICv2 interfaces not page aligned");
+ if ( (dbase & ~PAGE_MASK) || (cbase & ~PAGE_MASK) ||
+ (hbase & ~PAGE_MASK) || (vbase & ~PAGE_MASK) )
+ panic("GIC-HIP04 interfaces not page aligned");
- gicv2.map_dbase = ioremap_nocache(gicv2.dbase, PAGE_SIZE);
+ gicv2.map_dbase = ioremap_nocache(dbase, PAGE_SIZE);
if ( !gicv2.map_dbase )
- panic("GICv2: Failed to ioremap for GIC distributor\n");
+ panic("GIC-HIP04: Failed to ioremap for GIC distributor\n");
- gicv2.map_cbase[0] = ioremap_nocache(gicv2.cbase, PAGE_SIZE);
+ gicv2.map_cbase[0] = ioremap_nocache(cbase, PAGE_SIZE);
if ( platform_has_quirk(PLATFORM_QUIRK_GIC_64K_STRIDE) )
- gicv2.map_cbase[1] = ioremap_nocache(gicv2.cbase + PAGE_SIZE * 0x10,
- PAGE_SIZE);
+ gicv2.map_cbase[1] = ioremap_nocache(cbase + SZ_64K, PAGE_SIZE);
else
- gicv2.map_cbase[1] = ioremap_nocache(gicv2.cbase + PAGE_SIZE, PAGE_SIZE);
+ gicv2.map_cbase[1] = ioremap_nocache(cbase + PAGE_SIZE, PAGE_SIZE);
if ( !gicv2.map_cbase[0] || !gicv2.map_cbase[1] )
- panic("GICv2: Failed to ioremap for GIC CPU interface\n");
+ panic("GIC-HIP04: Failed to ioremap for GIC CPU interface\n");
- gicv2.map_hbase = ioremap_nocache(gicv2.hbase, PAGE_SIZE);
+ gicv2.map_hbase = ioremap_nocache(hbase, PAGE_SIZE);
if ( !gicv2.map_hbase )
- panic("GICv2: Failed to ioremap for GIC Virtual interface\n");
+ panic("GIC-HIP04: Failed to ioremap for GIC Virtual interface\n");
+
+ vgic_v2_setup_hw(dbase, cbase, vbase);
/* Global settings: interrupt distributor */
spin_lock_init(&gicv2.lock);
spin_lock(&gicv2.lock);
- gicv2_dist_init();
- gicv2_cpu_init();
- gicv2_hyp_init();
+ hip04gic_dist_init();
+ hip04gic_cpu_init();
+ hip04gic_hyp_init();
spin_unlock(&gicv2.lock);
+ return 0;
+}
+
+static const struct gic_hw_operations hip04gic_ops = {
+ .info = &gicv2_info,
+ .init = hip04gic_init,
+ .secondary_init = hip04gic_secondary_cpu_init,
+ .save_state = hip04gic_save_state,
+ .restore_state = hip04gic_restore_state,
+ .dump_state = hip04gic_dump_state,
+ .gic_host_irq_type = &hip04gic_host_irq_type,
+ .gic_guest_irq_type = &hip04gic_guest_irq_type,
+ .eoi_irq = hip04gic_eoi_irq,
+ .deactivate_irq = hip04gic_dir_irq,
+ .read_irq = hip04gic_read_irq,
+ .set_irq_properties = hip04gic_set_irq_properties,
+ .send_SGI = hip04gic_send_SGI,
+ .disable_interface = hip04gic_disable_interface,
+ .update_lr = hip04gic_update_lr,
+ .update_hcr_status = hip04gic_hcr_status,
+ .clear_lr = hip04gic_clear_lr,
+ .read_lr = hip04gic_read_lr,
+ .write_lr = hip04gic_write_lr,
+ .read_vmcr_priority = hip04gic_read_vmcr_priority,
+ .read_apr = hip04gic_read_apr,
+ .make_hwdom_dt_node = hip04gic_make_hwdom_dt_node,
+};
+
+/* Set up the GIC */
+static int __init hip04gic_preinit(struct dt_device_node *node,
+ const void *data)
+{
gicv2_info.hw_version = GIC_V2;
- register_gic_ops(&gicv2_ops);
+ gicv2_info.node = node;
+ register_gic_ops(&hip04gic_ops);
+ dt_irq_xlate = gic_irq_xlate;
return 0;
}
-static const char * const gicv2_dt_compat[] __initconst =
+static const struct dt_device_match hip04gic_dt_match[] __initconst =
{
- DT_COMPAT_GIC_CORTEX_A15,
- DT_COMPAT_GIC_CORTEX_A7,
- DT_COMPAT_GIC_400,
- NULL
+ DT_MATCH_COMPATIBLE("hisilicon,hip04-intc"),
+ { /* sentinel */ },
};
-DT_DEVICE_START(gicv2, "GICv2:", DEVICE_GIC)
- .compatible = gicv2_dt_compat,
- .init = gicv2_init,
+DT_DEVICE_START(hip04gic, "GIC-HIP04", DEVICE_GIC)
+ .dt_match = hip04gic_dt_match,
+ .init = hip04gic_preinit,
DT_DEVICE_END
/*
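
The HIP04 driver above binds through a dt_device_match table rather than
a NULL-terminated list of compatible strings; an empty entry acts as the
sentinel. A minimal sketch of the idiom, with a made-up "vendor,my-intc"
compatible string and mygic_* names (the macros are the real ones used
above):

    static const struct dt_device_match mygic_dt_match[] __initconst =
    {
        DT_MATCH_COMPATIBLE("vendor,my-intc"),  /* hypothetical string */
        { /* sentinel */ },
    };

    DT_DEVICE_START(mygic, "MY-GIC", DEVICE_GIC)
        .dt_match = mygic_dt_match,
        .init = mygic_preinit,
    DT_DEVICE_END
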
diff --git a/xen/arch/arm/gic-v2.c b/xen/arch/arm/gic-v2.c
index 31fb81a..596126d 100644
--- a/xen/arch/arm/gic-v2.c
+++ b/xen/arch/arm/gic-v2.c
@@ -28,6 +28,7 @@
#include <xen/list.h>
#include <xen/device_tree.h>
#include <xen/libfdt/libfdt.h>
+#include <xen/sizes.h>
#include <asm/p2m.h>
#include <asm/domain.h>
#include <asm/platform.h>
@@ -63,13 +64,9 @@
/* Global state */
static struct {
- paddr_t dbase; /* Address of distributor registers */
void __iomem * map_dbase; /* IO mapped Address of distributor registers */
- paddr_t cbase; /* Address of CPU interface registers */
void __iomem * map_cbase[2]; /* IO mapped Address of CPU interface registers */
- paddr_t hbase; /* Address of virtual interface registers */
void __iomem * map_hbase; /* IO Address of virtual interface registers */
- paddr_t vbase; /* Address of virtual cpu interface registers */
spinlock_t lock;
} gicv2;
@@ -211,7 +208,7 @@ static void gicv2_set_irq_properties(struct irq_desc *desc,
const cpumask_t *cpu_mask,
unsigned int priority)
{
- uint32_t cfg, edgebit;
+ uint32_t cfg, actual, edgebit;
unsigned int mask = gicv2_cpu_mask(cpu_mask);
unsigned int irq = desc->irq;
unsigned int type = desc->arch.type;
@@ -229,6 +226,20 @@ static void gicv2_set_irq_properties(struct irq_desc *desc,
cfg |= edgebit;
writel_gicd(cfg, GICD_ICFGR + (irq / 16) * 4);
+ actual = readl_gicd(GICD_ICFGR + (irq / 16) * 4);
+ if ( ( cfg & edgebit ) ^ ( actual & edgebit ) )
+ {
+ printk(XENLOG_WARNING "GICv2: WARNING: "
+ "CPU%d: Failed to configure IRQ%u as %s-triggered. "
+ "H/w forces to %s-triggered.\n",
+ smp_processor_id(), desc->irq,
+ cfg & edgebit ? "Edge" : "Level",
+ actual & edgebit ? "Edge" : "Level");
+ desc->arch.type = actual & edgebit ?
+ DT_IRQ_TYPE_EDGE_RISING :
+ DT_IRQ_TYPE_LEVEL_HIGH;
+ }
+
/* Set target CPU mask (RAZ/WI on uniprocessor) */
writeb_gicd(mask, GICD_ITARGETSR + irq);
/* Set priority */
@@ -242,6 +253,7 @@ static void __init gicv2_dist_init(void)
uint32_t type;
uint32_t cpumask;
uint32_t gic_cpus;
+ unsigned int nr_lines;
int i;
cpumask = readl_gicd(GICD_ITARGETSR) & 0xff;
@@ -252,31 +264,34 @@ static void __init gicv2_dist_init(void)
writel_gicd(0, GICD_CTLR);
type = readl_gicd(GICD_TYPER);
- gicv2_info.nr_lines = 32 * ((type & GICD_TYPE_LINES) + 1);
+ nr_lines = 32 * ((type & GICD_TYPE_LINES) + 1);
gic_cpus = 1 + ((type & GICD_TYPE_CPUS) >> 5);
printk("GICv2: %d lines, %d cpu%s%s (IID %8.8x).\n",
- gicv2_info.nr_lines, gic_cpus, (gic_cpus == 1) ? "" : "s",
+ nr_lines, gic_cpus, (gic_cpus == 1) ? "" : "s",
(type & GICD_TYPE_SEC) ? ", secure" : "",
readl_gicd(GICD_IIDR));
/* Default all global IRQs to level, active low */
- for ( i = 32; i < gicv2_info.nr_lines; i += 16 )
+ for ( i = 32; i < nr_lines; i += 16 )
writel_gicd(0x0, GICD_ICFGR + (i / 16) * 4);
/* Route all global IRQs to this CPU */
- for ( i = 32; i < gicv2_info.nr_lines; i += 4 )
+ for ( i = 32; i < nr_lines; i += 4 )
writel_gicd(cpumask, GICD_ITARGETSR + (i / 4) * 4);
/* Default priority for global interrupts */
- for ( i = 32; i < gicv2_info.nr_lines; i += 4 )
+ for ( i = 32; i < nr_lines; i += 4 )
writel_gicd(GIC_PRI_IRQ << 24 | GIC_PRI_IRQ << 16 |
GIC_PRI_IRQ << 8 | GIC_PRI_IRQ,
GICD_IPRIORITYR + (i / 4) * 4);
/* Disable all global interrupts */
- for ( i = 32; i < gicv2_info.nr_lines; i += 32 )
+ for ( i = 32; i < nr_lines; i += 32 )
writel_gicd(~0x0, GICD_ICENABLER + (i / 32) * 4);
+ /* Only 1020 interrupts are supported */
+ gicv2_info.nr_lines = min(1020U, nr_lines);
+
/* Turn on the distributor */
writel_gicd(GICD_CTL_ENABLE, GICD_CTLR);
}
@@ -327,8 +342,6 @@ static void __cpuinit gicv2_hyp_init(void)
vtr = readl_gich(GICH_VTR);
nr_lrs = (vtr & GICH_V2_VTR_NRLRGS) + 1;
gicv2_info.nr_lrs = nr_lrs;
-
- writel_gich(GICH_MISR_EOI, GICH_MISR);
}
static void __cpuinit gicv2_hyp_disable(void)
@@ -397,13 +410,8 @@ static void gicv2_update_lr(int lr, const struct pending_irq *p,
((p->irq & GICH_V2_LR_VIRTUAL_MASK) << GICH_V2_LR_VIRTUAL_SHIFT));
if ( p->desc != NULL )
- {
- if ( platform_has_quirk(PLATFORM_QUIRK_GUEST_PIRQ_NEED_EOI) )
- lr_reg |= GICH_V2_LR_MAINTENANCE_IRQ;
- else
- lr_reg |= GICH_V2_LR_HW | ((p->desc->irq & GICH_V2_LR_PHYSICAL_MASK )
- << GICH_V2_LR_PHYSICAL_SHIFT);
- }
+ lr_reg |= GICH_V2_LR_HW | ((p->desc->irq & GICH_V2_LR_PHYSICAL_MASK )
+ << GICH_V2_LR_PHYSICAL_SHIFT);
writel_gich(lr_reg, GICH_LR + lr * 4);
}
@@ -413,47 +421,6 @@ static void gicv2_clear_lr(int lr)
writel_gich(0, GICH_LR + lr * 4);
}
-static int gicv2v_setup(struct domain *d)
-{
- int ret;
-
- /*
- * The hardware domain gets the hardware address.
- * Guests get the virtual platform layout.
- */
- if ( is_hardware_domain(d) )
- {
- d->arch.vgic.dbase = gicv2.dbase;
- d->arch.vgic.cbase = gicv2.cbase;
- }
- else
- {
- d->arch.vgic.dbase = GUEST_GICD_BASE;
- d->arch.vgic.cbase = GUEST_GICC_BASE;
- }
-
- /*
- * Map the gic virtual cpu interface in the gic cpu interface
- * region of the guest.
- *
- * The second page is always mapped at +4K irrespective of the
- * GIC_64K_STRIDE quirk. The DTB passed to the guest reflects this.
- */
- ret = map_mmio_regions(d, paddr_to_pfn(d->arch.vgic.cbase), 1,
- paddr_to_pfn(gicv2.vbase));
- if ( ret )
- return ret;
-
- if ( !platform_has_quirk(PLATFORM_QUIRK_GIC_64K_STRIDE) )
- ret = map_mmio_regions(d, paddr_to_pfn(d->arch.vgic.cbase + PAGE_SIZE),
- 2, paddr_to_pfn(gicv2.vbase + PAGE_SIZE));
- else
- ret = map_mmio_regions(d, paddr_to_pfn(d->arch.vgic.cbase + PAGE_SIZE),
- 2, paddr_to_pfn(gicv2.vbase + 16*PAGE_SIZE));
-
- return ret;
-}
-
static void gicv2_read_lr(int lr, struct gic_lr *lr_reg)
{
uint32_t lrv;
@@ -584,13 +551,14 @@ static void gicv2_irq_set_affinity(struct irq_desc *desc, const cpumask_t *cpu_m
spin_unlock(&gicv2.lock);
}
-static int gicv2_make_dt_node(const struct domain *d,
- const struct dt_device_node *node, void *fdt)
+static int gicv2_make_hwdom_dt_node(const struct domain *d,
+ const struct dt_device_node *node,
+ void *fdt)
{
const struct dt_device_node *gic = dt_interrupt_controller;
const void *compatible = NULL;
u32 len;
- __be32 *new_cells, *tmp;
+ const __be32 *regs;
int res = 0;
compatible = dt_get_property(gic, "compatible", &len);
@@ -600,35 +568,26 @@ static int gicv2_make_dt_node(const struct domain *d,
return -FDT_ERR_XEN(ENOENT);
}
- res = fdt_begin_node(fdt, "interrupt-controller");
- if ( res )
- return res;
-
res = fdt_property(fdt, "compatible", compatible, len);
if ( res )
return res;
- res = fdt_property_cell(fdt, "#interrupt-cells", 3);
- if ( res )
- return res;
-
- res = fdt_property(fdt, "interrupt-controller", NULL, 0);
-
- if ( res )
- return res;
+ /*
+ * DTB provides up to 4 regions to handle virtualization
+ * (in order: GICD, GICC, GICH and GICV interfaces);
+ * however, dom0 just needs the GICD and GICC provided by Xen.
+ */
+ regs = dt_get_property(gic, "reg", &len);
+ if ( !regs )
+ {
+ dprintk(XENLOG_ERR, "Can't find reg property for the gic node\n");
+ return -FDT_ERR_XEN(ENOENT);
+ }
len = dt_cells_to_size(dt_n_addr_cells(node) + dt_n_size_cells(node));
- len *= 2; /* GIC has two memory regions: Distributor + CPU interface */
- new_cells = xzalloc_bytes(len);
- if ( new_cells == NULL )
- return -FDT_ERR_XEN(ENOMEM);
+ len *= 2;
- tmp = new_cells;
- dt_set_range(&tmp, node, d->arch.vgic.dbase, PAGE_SIZE);
- dt_set_range(&tmp, node, d->arch.vgic.cbase, PAGE_SIZE * 2);
-
- res = fdt_property(fdt, "reg", new_cells, len);
- xfree(new_cells);
+ res = fdt_property(fdt, "reg", regs, len);
return res;
}
@@ -656,52 +615,26 @@ static hw_irq_controller gicv2_guest_irq_type = {
.set_affinity = gicv2_irq_set_affinity,
};
-const static struct gic_hw_operations gicv2_ops = {
- .info = &gicv2_info,
- .secondary_init = gicv2_secondary_cpu_init,
- .save_state = gicv2_save_state,
- .restore_state = gicv2_restore_state,
- .dump_state = gicv2_dump_state,
- .gicv_setup = gicv2v_setup,
- .gic_host_irq_type = &gicv2_host_irq_type,
- .gic_guest_irq_type = &gicv2_guest_irq_type,
- .eoi_irq = gicv2_eoi_irq,
- .deactivate_irq = gicv2_dir_irq,
- .read_irq = gicv2_read_irq,
- .set_irq_properties = gicv2_set_irq_properties,
- .send_SGI = gicv2_send_SGI,
- .disable_interface = gicv2_disable_interface,
- .update_lr = gicv2_update_lr,
- .update_hcr_status = gicv2_hcr_status,
- .clear_lr = gicv2_clear_lr,
- .read_lr = gicv2_read_lr,
- .write_lr = gicv2_write_lr,
- .read_vmcr_priority = gicv2_read_vmcr_priority,
- .read_apr = gicv2_read_apr,
- .make_dt_node = gicv2_make_dt_node,
-};
-
-/* Set up the GIC */
-static int __init gicv2_init(struct dt_device_node *node, const void *data)
+static int __init gicv2_init(void)
{
int res;
+ paddr_t hbase, dbase, cbase, vbase;
+ const struct dt_device_node *node = gicv2_info.node;
- dt_device_set_used_by(node, DOMID_XEN);
-
- res = dt_device_get_address(node, 0, &gicv2.dbase, NULL);
- if ( res || !gicv2.dbase || (gicv2.dbase & ~PAGE_MASK) )
+ res = dt_device_get_address(node, 0, &dbase, NULL);
+ if ( res )
panic("GICv2: Cannot find a valid address for the distributor");
- res = dt_device_get_address(node, 1, &gicv2.cbase, NULL);
- if ( res || !gicv2.cbase || (gicv2.cbase & ~PAGE_MASK) )
+ res = dt_device_get_address(node, 1, &cbase, NULL);
+ if ( res )
panic("GICv2: Cannot find a valid address for the CPU");
- res = dt_device_get_address(node, 2, &gicv2.hbase, NULL);
- if ( res || !gicv2.hbase || (gicv2.hbase & ~PAGE_MASK) )
+ res = dt_device_get_address(node, 2, &hbase, NULL);
+ if ( res )
panic("GICv2: Cannot find a valid address for the hypervisor");
- res = dt_device_get_address(node, 3, &gicv2.vbase, NULL);
- if ( res || !gicv2.vbase || (gicv2.vbase & ~PAGE_MASK) )
+ res = dt_device_get_address(node, 3, &vbase, NULL);
+ if ( res )
panic("GICv2: Cannot find a valid address for the virtual CPU");
res = platform_get_irq(node, 0);
@@ -709,9 +642,6 @@ static int __init gicv2_init(struct dt_device_node *node, const void *data)
panic("GICv2: Cannot find the maintenance IRQ");
gicv2_info.maintenance_irq = res;
- /* Set the GIC as the primary interrupt controller */
- dt_interrupt_controller = node;
-
/* TODO: Add check on distributor, cpu size */
printk("GICv2 initialization:\n"
@@ -720,32 +650,33 @@ static int __init gicv2_init(struct dt_device_node *node, const void *data)
" gic_hyp_addr=%"PRIpaddr"\n"
" gic_vcpu_addr=%"PRIpaddr"\n"
" gic_maintenance_irq=%u\n",
- gicv2.dbase, gicv2.cbase, gicv2.hbase, gicv2.vbase,
+ dbase, cbase, hbase, vbase,
gicv2_info.maintenance_irq);
- if ( (gicv2.dbase & ~PAGE_MASK) || (gicv2.cbase & ~PAGE_MASK) ||
- (gicv2.hbase & ~PAGE_MASK) || (gicv2.vbase & ~PAGE_MASK) )
+ if ( (dbase & ~PAGE_MASK) || (cbase & ~PAGE_MASK) ||
+ (hbase & ~PAGE_MASK) || (vbase & ~PAGE_MASK) )
panic("GICv2 interfaces not page aligned");
- gicv2.map_dbase = ioremap_nocache(gicv2.dbase, PAGE_SIZE);
+ gicv2.map_dbase = ioremap_nocache(dbase, PAGE_SIZE);
if ( !gicv2.map_dbase )
panic("GICv2: Failed to ioremap for GIC distributor\n");
- gicv2.map_cbase[0] = ioremap_nocache(gicv2.cbase, PAGE_SIZE);
+ gicv2.map_cbase[0] = ioremap_nocache(cbase, PAGE_SIZE);
if ( platform_has_quirk(PLATFORM_QUIRK_GIC_64K_STRIDE) )
- gicv2.map_cbase[1] = ioremap_nocache(gicv2.cbase + PAGE_SIZE * 0x10,
- PAGE_SIZE);
+ gicv2.map_cbase[1] = ioremap_nocache(cbase + SZ_64K, PAGE_SIZE);
else
- gicv2.map_cbase[1] = ioremap_nocache(gicv2.cbase + PAGE_SIZE, PAGE_SIZE);
+ gicv2.map_cbase[1] = ioremap_nocache(cbase + PAGE_SIZE, PAGE_SIZE);
if ( !gicv2.map_cbase[0] || !gicv2.map_cbase[1] )
panic("GICv2: Failed to ioremap for GIC CPU interface\n");
- gicv2.map_hbase = ioremap_nocache(gicv2.hbase, PAGE_SIZE);
+ gicv2.map_hbase = ioremap_nocache(hbase, PAGE_SIZE);
if ( !gicv2.map_hbase )
panic("GICv2: Failed to ioremap for GIC Virtual interface\n");
+ vgic_v2_setup_hw(dbase, cbase, vbase);
+
/* Global settings: interrupt distributor */
spin_lock_init(&gicv2.lock);
spin_lock(&gicv2.lock);
@@ -756,23 +687,54 @@ static int __init gicv2_init(struct dt_device_node *node, const void *data)
spin_unlock(&gicv2.lock);
+ return 0;
+}
+
+static const struct gic_hw_operations gicv2_ops = {
+ .info = &gicv2_info,
+ .init = gicv2_init,
+ .secondary_init = gicv2_secondary_cpu_init,
+ .save_state = gicv2_save_state,
+ .restore_state = gicv2_restore_state,
+ .dump_state = gicv2_dump_state,
+ .gic_host_irq_type = &gicv2_host_irq_type,
+ .gic_guest_irq_type = &gicv2_guest_irq_type,
+ .eoi_irq = gicv2_eoi_irq,
+ .deactivate_irq = gicv2_dir_irq,
+ .read_irq = gicv2_read_irq,
+ .set_irq_properties = gicv2_set_irq_properties,
+ .send_SGI = gicv2_send_SGI,
+ .disable_interface = gicv2_disable_interface,
+ .update_lr = gicv2_update_lr,
+ .update_hcr_status = gicv2_hcr_status,
+ .clear_lr = gicv2_clear_lr,
+ .read_lr = gicv2_read_lr,
+ .write_lr = gicv2_write_lr,
+ .read_vmcr_priority = gicv2_read_vmcr_priority,
+ .read_apr = gicv2_read_apr,
+ .make_hwdom_dt_node = gicv2_make_hwdom_dt_node,
+};
+
+/* Set up the GIC */
+static int __init gicv2_preinit(struct dt_device_node *node, const void *data)
+{
gicv2_info.hw_version = GIC_V2;
+ gicv2_info.node = node;
register_gic_ops(&gicv2_ops);
+ dt_irq_xlate = gic_irq_xlate;
return 0;
}
-static const char * const gicv2_dt_compat[] __initconst =
+static const struct dt_device_match gicv2_dt_match[] __initconst =
{
- DT_COMPAT_GIC_CORTEX_A15,
- DT_COMPAT_GIC_CORTEX_A7,
- DT_COMPAT_GIC_400,
- NULL
+ DT_MATCH_GIC_V2,
+ { /* sentinel */ },
};
-DT_DEVICE_START(gicv2, "GICv2:", DEVICE_GIC)
- .compatible = gicv2_dt_compat,
- .init = gicv2_init,
+DT_DEVICE_START(gicv2, "GICv2", DEVICE_GIC)
+ .dt_match = gicv2_dt_match,
+ .init = gicv2_preinit,
DT_DEVICE_END
/*
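
The gic-v2 rework above splits bring-up into two phases: the device-tree
probe hook becomes a light preinit that only records the node and
registers the ops table, while the heavy MMIO setup moves behind the new
.init member of struct gic_hw_operations and runs later from gic_init().
(Note also that cbase + SZ_64K simply replaces cbase + PAGE_SIZE * 0x10;
with 4K pages both equal 0x10000, so the 64K-stride quirk path is
unchanged in behaviour.) A minimal sketch of the two-phase pattern, with
hypothetical mygic_* names but the real hook signatures:

    static int __init mygic_preinit(struct dt_device_node *node,
                                    const void *data)
    {
        mygic_info.hw_version = GIC_V2;
        mygic_info.node = node;        /* stashed for the later .init */
        register_gic_ops(&mygic_ops);  /* exposes .init to gic_init() */
        dt_irq_xlate = gic_irq_xlate;
        return 0;
    }

    static int __init mygic_init(void)
    {
        const struct dt_device_node *node = mygic_info.node;
        /* fetch region addresses, ioremap them, program the GIC... */
        return 0;
    }
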
diff --git a/xen/arch/arm/gic-v3.c b/xen/arch/arm/gic-v3.c
index 47452ca..d1db1ce 100644
--- a/xen/arch/arm/gic-v3.c
+++ b/xen/arch/arm/gic-v3.c
@@ -33,6 +33,7 @@
#include <xen/device_tree.h>
#include <xen/sizes.h>
#include <xen/libfdt/libfdt.h>
+#include <xen/sort.h>
#include <asm/p2m.h>
#include <asm/domain.h>
#include <asm/io.h>
@@ -41,16 +42,8 @@
#include <asm/gic_v3_defs.h>
#include <asm/cpufeature.h>
-struct rdist_region {
- paddr_t base;
- paddr_t size;
- void __iomem *map_base;
-};
-
/* Global state */
static struct {
- paddr_t dbase; /* Address of distributor registers */
- paddr_t dbase_size;
void __iomem *map_dbase; /* Mapped address of distributor registers */
struct rdist_region *rdist_regions;
uint32_t rdist_stride;
@@ -254,7 +247,7 @@ static void gicv3_enable_sre(void)
uint32_t val;
val = READ_SYSREG32(ICC_SRE_EL2);
- val |= GICC_SRE_EL2_SRE | GICC_SRE_EL2_ENEL1;
+ val |= GICC_SRE_EL2_SRE;
WRITE_SYSREG32(val, ICC_SRE_EL2);
isb();
@@ -382,6 +375,19 @@ static void gicv3_save_state(struct vcpu *v)
static void gicv3_restore_state(const struct vcpu *v)
{
+ uint32_t val;
+
+ val = READ_SYSREG32(ICC_SRE_EL2);
+ /*
+ * Don't give access to system registers when the guest is using
+ * GICv2.
+ */
+ if ( v->domain->arch.vgic.version == GIC_V2 )
+ val &= ~GICC_SRE_EL2_ENEL1;
+ else
+ val |= GICC_SRE_EL2_ENEL1;
+ WRITE_SYSREG32(val, ICC_SRE_EL2);
+
WRITE_SYSREG32(v->arch.gic.v3.sre_el1, ICC_SRE_EL1);
WRITE_SYSREG32(v->arch.gic.v3.vmcr, ICH_VMCR_EL2);
restore_aprn_regs(&v->arch.gic);
@@ -465,7 +471,7 @@ static void gicv3_set_irq_properties(struct irq_desc *desc,
const cpumask_t *cpu_mask,
unsigned int priority)
{
- uint32_t cfg, edgebit;
+ uint32_t cfg, actual, edgebit;
uint64_t affinity;
void __iomem *base;
unsigned int cpu = gicv3_get_cpu_from_mask(cpu_mask);
@@ -492,6 +498,20 @@ static void gicv3_set_irq_properties(struct irq_desc *desc,
writel_relaxed(cfg, base);
+ actual = readl_relaxed(base);
+ if ( ( cfg & edgebit ) ^ ( actual & edgebit ) )
+ {
+ printk(XENLOG_WARNING "GICv3: WARNING: "
+ "CPU%d: Failed to configure IRQ%u as %s-triggered. "
+ "H/w forces to %s-triggered.\n",
+ smp_processor_id(), desc->irq,
+ cfg & edgebit ? "Edge" : "Level",
+ actual & edgebit ? "Edge" : "Level");
+ desc->arch.type = actual & edgebit ?
+ DT_IRQ_TYPE_EDGE_RISING :
+ DT_IRQ_TYPE_LEVEL_HIGH;
+ }
+
affinity = gicv3_mpidr_to_affinity(cpu);
/* Make sure we don't broadcast the interrupt */
affinity &= ~GICD_IROUTER_SPI_MODE_ANY;
@@ -513,23 +533,24 @@ static void __init gicv3_dist_init(void)
uint32_t type;
uint32_t priority;
uint64_t affinity;
+ unsigned int nr_lines;
int i;
/* Disable the distributor */
writel_relaxed(0, GICD + GICD_CTLR);
type = readl_relaxed(GICD + GICD_TYPER);
- gicv3_info.nr_lines = 32 * ((type & GICD_TYPE_LINES) + 1);
+ nr_lines = 32 * ((type & GICD_TYPE_LINES) + 1);
printk("GICv3: %d lines, (IID %8.8x).\n",
- gicv3_info.nr_lines, readl_relaxed(GICD + GICD_IIDR));
+ nr_lines, readl_relaxed(GICD + GICD_IIDR));
/* Default all global IRQs to level, active low */
- for ( i = NR_GIC_LOCAL_IRQS; i < gicv3_info.nr_lines; i += 16 )
+ for ( i = NR_GIC_LOCAL_IRQS; i < nr_lines; i += 16 )
writel_relaxed(0, GICD + GICD_ICFGR + (i / 16) * 4);
/* Default priority for global interrupts */
- for ( i = NR_GIC_LOCAL_IRQS; i < gicv3_info.nr_lines; i += 4 )
+ for ( i = NR_GIC_LOCAL_IRQS; i < nr_lines; i += 4 )
{
priority = (GIC_PRI_IRQ << 24 | GIC_PRI_IRQ << 16 |
GIC_PRI_IRQ << 8 | GIC_PRI_IRQ);
@@ -537,7 +558,7 @@ static void __init gicv3_dist_init(void)
}
/* Disable all global interrupts */
- for ( i = NR_GIC_LOCAL_IRQS; i < gicv3_info.nr_lines; i += 32 )
+ for ( i = NR_GIC_LOCAL_IRQS; i < nr_lines; i += 32 )
writel_relaxed(0xffffffff, GICD + GICD_ICENABLER + (i / 32) * 4);
gicv3_dist_wait_for_rwp();
@@ -551,8 +572,11 @@ static void __init gicv3_dist_init(void)
/* Make sure we don't broadcast the interrupt */
affinity &= ~GICD_IROUTER_SPI_MODE_ANY;
- for ( i = NR_GIC_LOCAL_IRQS; i < gicv3_info.nr_lines; i++ )
+ for ( i = NR_GIC_LOCAL_IRQS; i < nr_lines; i++ )
writeq_relaxed(affinity, GICD + GICD_IROUTER + i * 8);
+
+ /* Only 1020 interrupts are supported */
+ gicv3_info.nr_lines = min(1020U, nr_lines);
}
static int gicv3_enable_redist(void)
@@ -638,7 +662,7 @@ static int __init gicv3_populate_rdist(void)
ptr += gicv3.rdist_stride;
else
{
- ptr += SZ_64K * 2;
+ ptr += SZ_64K * 2; /* Skip RD_base + SGI_base */
if ( typer & GICR_TYPER_VLPIS )
ptr += SZ_64K * 2; /* Skip VLPI_base + reserved page */
}
@@ -695,7 +719,7 @@ static int __cpuinit gicv3_cpu_init(void)
/* Set priority mask register */
WRITE_SYSREG32(DEFAULT_PMR_VALUE, ICC_PMR_EL1);
- /* EOI drops priority too (mode 0) */
+ /* EOI drops priority, DIR deactivates the interrupt (mode 1) */
WRITE_SYSREG32(GICC_CTLR_EL1_EOImode_drop, ICC_CTLR_EL1);
/* Enable Group1 interrupts */
@@ -789,8 +813,7 @@ out:
return tlist;
}
-static void gicv3_send_sgi(enum gic_sgi sgi, enum gic_sgi_mode mode,
- const cpumask_t *cpumask)
+static void gicv3_send_sgi_list(enum gic_sgi sgi, const cpumask_t *cpumask)
{
int cpu = 0;
uint64_t val;
@@ -814,12 +837,34 @@ static void gicv3_send_sgi(enum gic_sgi sgi, enum gic_sgi_mode mode,
MPIDR_AFFINITY_LEVEL(cluster_id, 1) << 16 |
tlist);
- WRITE_SYSREG(val, ICC_SGI1R_EL1);
+ WRITE_SYSREG64(val, ICC_SGI1R_EL1);
}
/* Force above writes to ICC_SGI1R_EL1 */
isb();
}
+static void gicv3_send_sgi(enum gic_sgi sgi, enum gic_sgi_mode mode,
+ const cpumask_t *cpumask)
+{
+ switch ( mode )
+ {
+ case SGI_TARGET_OTHERS:
+ WRITE_SYSREG64(ICH_SGI_TARGET_OTHERS << ICH_SGI_IRQMODE_SHIFT |
+ (uint64_t)sgi << ICH_SGI_IRQ_SHIFT,
+ ICC_SGI1R_EL1);
+ isb();
+ break;
+ case SGI_TARGET_SELF:
+ gicv3_send_sgi_list(sgi, cpumask_of(smp_processor_id()));
+ break;
+ case SGI_TARGET_LIST:
+ gicv3_send_sgi_list(sgi, cpumask);
+ break;
+ default:
+ BUG();
+ }
+}
+
/* Shut down the per-CPU GIC interface */
static void gicv3_disable_interface(void)
{
@@ -834,13 +879,20 @@ static void gicv3_disable_interface(void)
static void gicv3_update_lr(int lr, const struct pending_irq *p,
unsigned int state)
{
- uint64_t grp = GICH_LR_GRP1;
uint64_t val = 0;
BUG_ON(lr >= gicv3_info.nr_lrs);
BUG_ON(lr < 0);
- val = (((uint64_t)state & 0x3) << GICH_LR_STATE_SHIFT) | grp;
+ val = (((uint64_t)state & 0x3) << GICH_LR_STATE_SHIFT);
+
+ /*
+ * When the guest is GICv3, all guest IRQs are Group 1, as Group 0
+ * would result in a FIQ in the guest, which it wouldn't expect.
+ */
+ if ( current->domain->arch.vgic.version == GIC_V3 )
+ val |= GICH_LR_GRP1;
+
val |= ((uint64_t)p->priority & 0xff) << GICH_LR_PRIORITY_SHIFT;
val |= ((uint64_t)p->irq & GICH_LR_VIRTUAL_MASK) << GICH_LR_VIRTUAL_SHIFT;
@@ -885,46 +937,6 @@ static void gicv3_write_lr(int lr_reg, const struct gic_lr *lr)
gicv3_ich_write_lr(lr_reg, lrv);
}
-static int gicv_v3_init(struct domain *d)
-{
- int i;
-
- /*
- * Domain 0 gets the hardware address.
- * Guests get the virtual platform layout.
- */
- if ( is_hardware_domain(d) )
- {
- d->arch.vgic.dbase = gicv3.dbase;
- d->arch.vgic.dbase_size = gicv3.dbase_size;
- for ( i = 0; i < gicv3.rdist_count; i++ )
- {
- d->arch.vgic.rbase[i] = gicv3.rdist_regions[i].base;
- d->arch.vgic.rbase_size[i] = gicv3.rdist_regions[i].size;
- }
- d->arch.vgic.rdist_stride = gicv3.rdist_stride;
- d->arch.vgic.rdist_count = gicv3.rdist_count;
- }
- else
- {
- d->arch.vgic.dbase = GUEST_GICV3_GICD_BASE;
- d->arch.vgic.dbase_size = GUEST_GICV3_GICD_SIZE;
-
- /* XXX: Only one Re-distributor region mapped for the guest */
- BUILD_BUG_ON(GUEST_GICV3_RDIST_REGIONS != 1);
-
- d->arch.vgic.rdist_count = GUEST_GICV3_RDIST_REGIONS;
- d->arch.vgic.rdist_stride = GUEST_GICV3_RDIST_STRIDE;
-
- /* The first redistributor should contain enough space for all CPUs */
- BUILD_BUG_ON((GUEST_GICV3_GICR0_SIZE / GUEST_GICV3_RDIST_STRIDE) < MAX_VIRT_CPUS);
- d->arch.vgic.rbase[0] = GUEST_GICV3_GICR0_BASE;
- d->arch.vgic.rbase_size[0] = GUEST_GICV3_GICR0_SIZE;
- }
-
- return 0;
-}
-
static void gicv3_hcr_status(uint32_t flag, bool_t status)
{
uint32_t hcr;
@@ -1041,16 +1053,14 @@ static void gicv3_irq_set_affinity(struct irq_desc *desc, const cpumask_t *mask)
spin_unlock(&gicv3.lock);
}
-static int gicv3_make_dt_node(const struct domain *d,
- const struct dt_device_node *node, void *fdt)
+static int gicv3_make_hwdom_dt_node(const struct domain *d,
+ const struct dt_device_node *node,
+ void *fdt)
{
const struct dt_device_node *gic = dt_interrupt_controller;
const void *compatible = NULL;
uint32_t len;
__be32 *new_cells, *tmp;
- uint32_t rd_stride = 0;
- uint32_t rd_count = 0;
-
int i, res = 0;
compatible = dt_get_property(gic, "compatible", &len);
@@ -1060,35 +1070,17 @@ static int gicv3_make_dt_node(const struct domain *d,
return -FDT_ERR_XEN(ENOENT);
}
- res = fdt_begin_node(fdt, "interrupt-controller");
- if ( res )
- return res;
-
res = fdt_property(fdt, "compatible", compatible, len);
if ( res )
return res;
- res = fdt_property_cell(fdt, "#interrupt-cells", 3);
- if ( res )
- return res;
-
- res = fdt_property(fdt, "interrupt-controller", NULL, 0);
- if ( res )
- return res;
-
- res = dt_property_read_u32(gic, "redistributor-stride", &rd_stride);
- if ( !res )
- rd_stride = 0;
-
- res = dt_property_read_u32(gic, "#redistributor-regions", &rd_count);
- if ( !res )
- rd_count = 1;
-
- res = fdt_property_cell(fdt, "redistributor-stride", rd_stride);
+ res = fdt_property_cell(fdt, "redistributor-stride",
+ d->arch.vgic.rdist_stride);
if ( res )
return res;
- res = fdt_property_cell(fdt, "#redistributor-regions", rd_count);
+ res = fdt_property_cell(fdt, "#redistributor-regions",
+ d->arch.vgic.nr_regions);
if ( res )
return res;
@@ -1098,18 +1090,18 @@ static int gicv3_make_dt_node(const struct domain *d,
* CPU interface and virtual CPU interfaces accessed as System registers
* So cells are created only for Distributor and rdist regions
*/
- len = len * (d->arch.vgic.rdist_count + 1);
+ len = len * (d->arch.vgic.nr_regions + 1);
new_cells = xzalloc_bytes(len);
if ( new_cells == NULL )
return -FDT_ERR_XEN(ENOMEM);
tmp = new_cells;
- dt_set_range(&tmp, node, d->arch.vgic.dbase, d->arch.vgic.dbase_size);
+ dt_set_range(&tmp, node, d->arch.vgic.dbase, SZ_64K);
- for ( i = 0; i < d->arch.vgic.rdist_count; i++ )
- dt_set_range(&tmp, node, d->arch.vgic.rbase[i],
- d->arch.vgic.rbase_size[i]);
+ for ( i = 0; i < d->arch.vgic.nr_regions; i++ )
+ dt_set_range(&tmp, node, d->arch.vgic.rdist_regions[i].base,
+ d->arch.vgic.rdist_regions[i].size);
res = fdt_property(fdt, "reg", new_cells, len);
xfree(new_cells);
@@ -1139,37 +1131,49 @@ static const hw_irq_controller gicv3_guest_irq_type = {
.set_affinity = gicv3_irq_set_affinity,
};
-static const struct gic_hw_operations gicv3_ops = {
- .info = &gicv3_info,
- .save_state = gicv3_save_state,
- .restore_state = gicv3_restore_state,
- .dump_state = gicv3_dump_state,
- .gicv_setup = gicv_v3_init,
- .gic_host_irq_type = &gicv3_host_irq_type,
- .gic_guest_irq_type = &gicv3_guest_irq_type,
- .eoi_irq = gicv3_eoi_irq,
- .deactivate_irq = gicv3_dir_irq,
- .read_irq = gicv3_read_irq,
- .set_irq_properties = gicv3_set_irq_properties,
- .send_SGI = gicv3_send_sgi,
- .disable_interface = gicv3_disable_interface,
- .update_lr = gicv3_update_lr,
- .update_hcr_status = gicv3_hcr_status,
- .clear_lr = gicv3_clear_lr,
- .read_lr = gicv3_read_lr,
- .write_lr = gicv3_write_lr,
- .read_vmcr_priority = gicv3_read_vmcr_priority,
- .read_apr = gicv3_read_apr,
- .secondary_init = gicv3_secondary_cpu_init,
- .make_dt_node = gicv3_make_dt_node,
-};
+static int __init cmp_rdist(const void *a, const void *b)
+{
+ const struct rdist_region *l = a, *r = b;
+
+ /* We assume that re-distributor regions can never overlap */
+ return ( l->base < r->base ) ? -1 : 0;
+}
+
+/* If the GICv3 supports GICv2, initialize it */
+static void __init gicv3_init_v2(const struct dt_device_node *node,
+ paddr_t dbase)
+{
+ int res;
+ paddr_t cbase, vbase;
+
+ /*
+ * For a GICv3 supporting GICv2, the GICC and GICV base addresses
+ * will be provided.
+ */
+ res = dt_device_get_address(node, 1 + gicv3.rdist_count,
+ &cbase, NULL);
+ if ( res )
+ return;
+
+ res = dt_device_get_address(node, 1 + gicv3.rdist_count + 2,
+ &vbase, NULL);
+ if ( res )
+ return;
+
+ printk("GICv3 compatible with GICv2 cbase %#"PRIpaddr" vbase %#"PRIpaddr"\n",
+ cbase, vbase);
+
+ vgic_v2_setup_hw(dbase, cbase, vbase);
+}
/* Set up the GIC */
-static int __init gicv3_init(struct dt_device_node *node, const void *data)
+static int __init gicv3_init(void)
{
struct rdist_region *rdist_regs;
int res, i;
uint32_t reg;
+ const struct dt_device_node *node = gicv3_info.node;
+ paddr_t dbase;
if ( !cpu_has_gicv3 )
{
@@ -1177,17 +1181,15 @@ static int __init gicv3_init(struct dt_device_node *node, const void *data)
return -ENODEV;
}
- dt_device_set_used_by(node, DOMID_XEN);
-
- res = dt_device_get_address(node, 0, &gicv3.dbase, &gicv3.dbase_size);
- if ( res || !gicv3.dbase )
+ res = dt_device_get_address(node, 0, &dbase, NULL);
+ if ( res )
panic("GICv3: Cannot find a valid distributor address");
- if ( (gicv3.dbase & ~PAGE_MASK) || (gicv3.dbase_size & ~PAGE_MASK) )
+ if ( (dbase & ~PAGE_MASK) )
panic("GICv3: Found unaligned distributor address %"PRIpaddr"",
- gicv3.dbase);
+ dbase);
- gicv3.map_dbase = ioremap_nocache(gicv3.dbase, gicv3.dbase_size);
+ gicv3.map_dbase = ioremap_nocache(dbase, SZ_64K);
if ( !gicv3.map_dbase )
panic("GICv3: Failed to ioremap for GIC distributor\n");
@@ -1212,14 +1214,16 @@ static int __init gicv3_init(struct dt_device_node *node, const void *data)
uint64_t rdist_base, rdist_size;
res = dt_device_get_address(node, 1 + i, &rdist_base, &rdist_size);
- if ( res || !rdist_base )
+ if ( res )
panic("GICv3: No rdist base found for region %d\n", i);
rdist_regs[i].base = rdist_base;
rdist_regs[i].size = rdist_size;
}
- /* If stride is not set in dt. Set default to 2 * SZ_64K */
+ /* The vGIC code requires the region to be sorted */
+ sort(rdist_regs, gicv3.rdist_count, sizeof(*rdist_regs), cmp_rdist, NULL);
+
if ( !dt_property_read_u32(node, "redistributor-stride", &gicv3.rdist_stride) )
gicv3.rdist_stride = 0;
@@ -1230,9 +1234,6 @@ static int __init gicv3_init(struct dt_device_node *node, const void *data)
panic("GICv3: Cannot find the maintenance IRQ");
gicv3_info.maintenance_irq = res;
- /* Set the GIC as the primary interrupt controller */
- dt_interrupt_controller = node;
-
for ( i = 0; i < gicv3.rdist_count; i++ )
{
/* map dbase & rdist regions */
@@ -1245,19 +1246,24 @@ static int __init gicv3_init(struct dt_device_node *node, const void *data)
}
printk("GICv3 initialization:\n"
- " gic_dist_addr=%"PRIpaddr"\n"
- " gic_dist_size=%"PRIpaddr"\n"
- " gic_dist_mapaddr=%p\n"
- " gic_rdist_regions=%d\n"
- " gic_rdist_stride=%x\n"
- " gic_rdist_base=%"PRIpaddr"\n"
- " gic_rdist_base_size=%"PRIpaddr"\n"
- " gic_rdist_base_mapaddr=%p\n"
- " gic_maintenance_irq=%u\n",
- gicv3.dbase, gicv3.dbase_size, gicv3.map_dbase, gicv3.rdist_count,
- gicv3.rdist_stride, gicv3.rdist_regions[0].base,
- gicv3.rdist_regions[0].size, gicv3.rdist_regions[0].map_base,
- gicv3_info.maintenance_irq);
+ " gic_dist_addr=%#"PRIpaddr"\n"
+ " gic_maintenance_irq=%u\n"
+ " gic_rdist_stride=%#x\n"
+ " gic_rdist_regions=%d\n",
+ dbase, gicv3_info.maintenance_irq,
+ gicv3.rdist_stride, gicv3.rdist_count);
+ printk(" redistributor regions:\n");
+ for ( i = 0; i < gicv3.rdist_count; i++ )
+ {
+ const struct rdist_region *r = &gicv3.rdist_regions[i];
+
+ printk(" - region %u: %#"PRIpaddr" - %#"PRIpaddr"\n",
+ i, r->base, r->base + r->size);
+ }
+
+ vgic_v3_setup_hw(dbase, gicv3.rdist_count, gicv3.rdist_regions,
+ gicv3.rdist_stride);
+ gicv3_init_v2(node, dbase);
spin_lock_init(&gicv3.lock);
@@ -1267,24 +1273,55 @@ static int __init gicv3_init(struct dt_device_node *node, const void *data)
res = gicv3_cpu_init();
gicv3_hyp_init();
- gicv3_info.hw_version = GIC_V3;
- /* Register hw ops*/
- register_gic_ops(&gicv3_ops);
-
spin_unlock(&gicv3.lock);
return res;
}
-static const char * const gicv3_dt_compat[] __initconst =
+static const struct gic_hw_operations gicv3_ops = {
+ .info = &gicv3_info,
+ .init = gicv3_init,
+ .save_state = gicv3_save_state,
+ .restore_state = gicv3_restore_state,
+ .dump_state = gicv3_dump_state,
+ .gic_host_irq_type = &gicv3_host_irq_type,
+ .gic_guest_irq_type = &gicv3_guest_irq_type,
+ .eoi_irq = gicv3_eoi_irq,
+ .deactivate_irq = gicv3_dir_irq,
+ .read_irq = gicv3_read_irq,
+ .set_irq_properties = gicv3_set_irq_properties,
+ .send_SGI = gicv3_send_sgi,
+ .disable_interface = gicv3_disable_interface,
+ .update_lr = gicv3_update_lr,
+ .update_hcr_status = gicv3_hcr_status,
+ .clear_lr = gicv3_clear_lr,
+ .read_lr = gicv3_read_lr,
+ .write_lr = gicv3_write_lr,
+ .read_vmcr_priority = gicv3_read_vmcr_priority,
+ .read_apr = gicv3_read_apr,
+ .secondary_init = gicv3_secondary_cpu_init,
+ .make_hwdom_dt_node = gicv3_make_hwdom_dt_node,
+};
+
+static int __init gicv3_preinit(struct dt_device_node *node, const void *data)
+{
+ gicv3_info.hw_version = GIC_V3;
+ gicv3_info.node = node;
+ register_gic_ops(&gicv3_ops);
+ dt_irq_xlate = gic_irq_xlate;
+
+ return 0;
+}
+
+static const struct dt_device_match gicv3_dt_match[] __initconst =
{
- DT_COMPAT_GIC_V3,
- NULL
+ DT_MATCH_GIC_V3,
+ { /* sentinel */ },
};
DT_DEVICE_START(gicv3, "GICv3", DEVICE_GIC)
- .compatible = gicv3_dt_compat,
- .init = gicv3_init,
+ .dt_match = gicv3_dt_match,
+ .init = gicv3_preinit,
DT_DEVICE_END
/*
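
Two details of the gic-v3 changes above deserve spelling out.
ICC_SGI1R_EL1 is a 64-bit register, so the explicit WRITE_SYSREG64
accessor makes the access width unambiguous; and SGI_TARGET_OTHERS uses
the interrupt-routing-mode field to broadcast to every PE except the
sender, so no affinity list has to be built. Reproduced in isolation
(a sketch of the fragment from the hunk above):

    case SGI_TARGET_OTHERS:
        /* IRM broadcast: deliver to all PEs other than the requester */
        WRITE_SYSREG64(ICH_SGI_TARGET_OTHERS << ICH_SGI_IRQMODE_SHIFT |
                       (uint64_t)sgi << ICH_SGI_IRQ_SHIFT,
                       ICC_SGI1R_EL1);
        isb();    /* complete the SGI write before returning */
        break;
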
diff --git a/xen/arch/arm/gic.c b/xen/arch/arm/gic.c
index e7a1af5..1757193 100644
--- a/xen/arch/arm/gic.c
+++ b/xen/arch/arm/gic.c
@@ -126,21 +126,86 @@ void gic_route_irq_to_xen(struct irq_desc *desc, const cpumask_t *cpu_mask,
/* Program the GIC to route an interrupt to a guest
* - desc.lock must be held
*/
-void gic_route_irq_to_guest(struct domain *d, struct irq_desc *desc,
- const cpumask_t *cpu_mask, unsigned int priority)
+int gic_route_irq_to_guest(struct domain *d, unsigned int virq,
+ struct irq_desc *desc, unsigned int priority)
{
- struct pending_irq *p;
+ unsigned long flags;
+ /* Use vcpu0 to retrieve the pending_irq struct. Given that we only
+ * route SPIs to guests, it doesn't make any difference. */
+ struct vcpu *v_target = vgic_get_target_vcpu(d->vcpu[0], virq);
+ struct vgic_irq_rank *rank = vgic_rank_irq(v_target, virq);
+ struct pending_irq *p = irq_to_pending(v_target, virq);
+ int res = -EBUSY;
+
ASSERT(spin_is_locked(&desc->lock));
+ /* Caller has already checked that the IRQ is an SPI */
+ ASSERT(virq >= 32);
+ ASSERT(virq < vgic_num_irqs(d));
+
+ vgic_lock_rank(v_target, rank, flags);
+
+ if ( p->desc ||
+ /* The VIRQ should not be already enabled by the guest */
+ test_bit(GIC_IRQ_GUEST_ENABLED, &p->status) )
+ goto out;
desc->handler = gic_hw_ops->gic_guest_irq_type;
set_bit(_IRQ_GUEST, &desc->status);
- gic_set_irq_properties(desc, cpumask_of(smp_processor_id()), GIC_PRI_IRQ);
+ gic_set_irq_properties(desc, cpumask_of(v_target->processor), priority);
- /* Use vcpu0 to retrieve the pending_irq struct. Given that we only
- * route SPIs to guests, it doesn't make any difference. */
- p = irq_to_pending(d->vcpu[0], desc->irq);
p->desc = desc;
+ res = 0;
+
+out:
+ vgic_unlock_rank(v_target, rank, flags);
+
+ return res;
+}
+
+/* This function only works with SPIs for now */
+int gic_remove_irq_from_guest(struct domain *d, unsigned int virq,
+ struct irq_desc *desc)
+{
+ struct vcpu *v_target = vgic_get_target_vcpu(d->vcpu[0], virq);
+ struct vgic_irq_rank *rank = vgic_rank_irq(v_target, virq);
+ struct pending_irq *p = irq_to_pending(v_target, virq);
+ unsigned long flags;
+
+ ASSERT(spin_is_locked(&desc->lock));
+ ASSERT(test_bit(_IRQ_GUEST, &desc->status));
+ ASSERT(p->desc == desc);
+
+ vgic_lock_rank(v_target, rank, flags);
+
+ if ( d->is_dying )
+ {
+ desc->handler->shutdown(desc);
+
+ /* EOI the IRQ if it has not been done by the guest */
+ if ( test_bit(_IRQ_INPROGRESS, &desc->status) )
+ gic_hw_ops->deactivate_irq(desc);
+ clear_bit(_IRQ_INPROGRESS, &desc->status);
+ }
+ else
+ {
+ /*
+ * TODO: Handle eviction from LRs. For now, deny
+ * removal if the IRQ is inflight or not disabled.
+ */
+ if ( test_bit(_IRQ_INPROGRESS, &desc->status) ||
+ !test_bit(_IRQ_DISABLED, &desc->status) )
+ return -EBUSY;
+ }
+
+ clear_bit(_IRQ_GUEST, &desc->status);
+ desc->handler = &no_irq_type;
+
+ p->desc = NULL;
+
+ vgic_unlock_rank(v_target, rank, flags);
+
+ return 0;
}
int gic_irq_xlate(const u32 *intspec, unsigned int intsize,
@@ -163,8 +228,10 @@ int gic_irq_xlate(const u32 *intspec, unsigned int intsize,
return 0;
}
-/* Set up the GIC */
-void __init gic_init(void)
+/* Find the interrupt controller and set up the callback to translate
+ * device tree IRQs.
+ */
+void __init gic_preinit(void)
{
int rc;
struct dt_device_node *node;
@@ -189,6 +256,16 @@ void __init gic_init(void)
if ( !num_gics )
panic("Unable to find compatible GIC in the device tree");
+ /* Set the GIC as the primary interrupt controller */
+ dt_interrupt_controller = node;
+ dt_device_set_used_by(node, DOMID_XEN);
+}
+
+/* Set up the GIC */
+void __init gic_init(void)
+{
+ if ( gic_hw_ops->init() )
+ panic("Failed to initialize the GIC drivers");
/* Clear LR mask for cpu0 */
clear_cpu_lr_mask();
}
@@ -368,11 +445,7 @@ static void gic_update_one_lr(struct vcpu *v, int i)
clear_bit(i, &this_cpu(lr_mask));
if ( p->desc != NULL )
- {
clear_bit(_IRQ_INPROGRESS, &p->desc->status);
- if ( platform_has_quirk(PLATFORM_QUIRK_GUEST_PIRQ_NEED_EOI) )
- gic_hw_ops->deactivate_irq(p->desc);
- }
clear_bit(GIC_IRQ_GUEST_VISIBLE, &p->status);
clear_bit(GIC_IRQ_GUEST_ACTIVE, &p->status);
p->lr = GIC_INVALID_LR;
@@ -536,6 +609,8 @@ static void do_sgi(struct cpu_user_regs *regs, enum gic_sgi sgi)
/* Lower the priority */
struct irq_desc *desc = irq_to_desc(sgi);
+ perfc_incr(ipis);
+
/* Lower the priority */
gic_hw_ops->eoi_irq(desc);
@@ -568,7 +643,7 @@ void gic_interrupt(struct cpu_user_regs *regs, int is_fiq)
/* Reading IRQ will ACK it */
irq = gic_hw_ops->read_irq();
- if ( likely(irq >= 16 && irq < 1021) )
+ if ( likely(irq >= 16 && irq < 1020) )
{
local_irq_enable();
do_IRQ(regs, irq, is_fiq);
@@ -586,11 +661,6 @@ void gic_interrupt(struct cpu_user_regs *regs, int is_fiq)
} while (1);
}
-int gicv_setup(struct domain *d)
-{
- return gic_hw_ops->gicv_setup(d);
-}
-
static void maintenance_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
{
/*
@@ -604,6 +674,7 @@ static void maintenance_interrupt(int irq, void *dev_id, struct cpu_user_regs *r
* GICH_HCR_UIE is cleared before reading GICC_IAR. As a consequence
* this handler is not called.
*/
+ perfc_incr(maintenance_irqs);
}
void gic_dump_info(struct vcpu *v)
@@ -615,7 +686,7 @@ void gic_dump_info(struct vcpu *v)
list_for_each_entry ( p, &v->arch.vgic.inflight_irqs, inflight )
{
- printk("Inflight irq=%d lr=%u\n", p->irq, p->lr);
+ printk("Inflight irq=%u lr=%u\n", p->irq, p->lr);
}
list_for_each_entry( p, &v->arch.vgic.lr_pending, lr_queue )
@@ -630,10 +701,11 @@ void __cpuinit init_maintenance_interrupt(void)
"irq-maintenance", NULL);
}
-int gic_make_node(const struct domain *d,const struct dt_device_node *node,
- void *fdt)
+int gic_make_hwdom_dt_node(const struct domain *d,
+ const struct dt_device_node *node,
+ void *fdt)
{
- return gic_hw_ops->make_dt_node(d, node, fdt);
+ return gic_hw_ops->make_hwdom_dt_node(d, node, fdt);
}
/*
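
The reworked gic_route_irq_to_guest() above can now fail: it takes the
vGIC rank lock and returns -EBUSY if the pending_irq already has a
physical descriptor or the guest has already enabled the vIRQ. Callers
therefore have to check the result and roll back; a sketch, with names
as in route_irq_to_guest() above and the rollback abbreviated:

    spin_lock_irqsave(&desc->lock, flags);
    retval = gic_route_irq_to_guest(d, virq, desc, GIC_PRI_IRQ);
    spin_unlock_irqrestore(&desc->lock, flags);

    if ( retval )
    {
        release_irq(desc->irq, info);  /* undo the earlier setup */
        xfree(info);
    }
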
diff --git a/xen/arch/arm/guestcopy.c b/xen/arch/arm/guestcopy.c
index 7dbaeca..ce1c3c3 100644
--- a/xen/arch/arm/guestcopy.c
+++ b/xen/arch/arm/guestcopy.c
@@ -1,10 +1,8 @@
-#include <xen/config.h>
#include <xen/lib.h>
#include <xen/domain_page.h>
+#include <xen/mm.h>
#include <xen/sched.h>
#include <asm/current.h>
-
-#include <asm/mm.h>
#include <asm/guest_access.h>
static unsigned long raw_copy_to_guest_helper(void *to, const void *from,
diff --git a/xen/arch/arm/irq.c b/xen/arch/arm/irq.c
index 25ecf1d..1f38605 100644
--- a/xen/arch/arm/irq.c
+++ b/xen/arch/arm/irq.c
@@ -31,6 +31,13 @@
static unsigned int local_irqs_type[NR_LOCAL_IRQS];
static DEFINE_SPINLOCK(local_irqs_type_lock);
+/* Describe an IRQ assigned to a guest */
+struct irq_guest
+{
+ struct domain *d;
+ unsigned int virq;
+};
+
static void ack_none(struct irq_desc *irq)
{
printk("unexpected IRQ trap at irq %02x\n", irq->irq);
@@ -122,18 +129,20 @@ void __cpuinit init_secondary_IRQ(void)
BUG_ON(init_local_irq_data() < 0);
}
-static inline struct domain *irq_get_domain(struct irq_desc *desc)
+static inline struct irq_guest *irq_get_guest_info(struct irq_desc *desc)
{
ASSERT(spin_is_locked(&desc->lock));
-
- if ( !test_bit(_IRQ_GUEST, &desc->status) )
- return dom_xen;
-
+ ASSERT(test_bit(_IRQ_GUEST, &desc->status));
ASSERT(desc->action != NULL);
return desc->action->dev_id;
}
+static inline struct domain *irq_get_domain(struct irq_desc *desc)
+{
+ return irq_get_guest_info(desc)->d;
+}
+
void irq_set_affinity(struct irq_desc *desc, const cpumask_t *cpu_mask)
{
if ( desc != NULL )
@@ -179,7 +188,14 @@ void do_IRQ(struct cpu_user_regs *regs, unsigned int irq, int is_fiq)
{
struct irq_desc *desc = irq_to_desc(irq);
- /* TODO: perfc_incr(irqs); */
+ perfc_incr(irqs);
+
+ ASSERT(irq >= 16); /* SGIs do not come down this path */
+
+ if ( irq < 32 )
+ perfc_incr(ppis);
+ else
+ perfc_incr(spis);
/* TODO: this_cpu(irq_count)++; */
@@ -197,16 +213,18 @@ void do_IRQ(struct cpu_user_regs *regs, unsigned int irq, int is_fiq)
if ( test_bit(_IRQ_GUEST, &desc->status) )
{
- struct domain *d = irq_get_domain(desc);
+ struct irq_guest *info = irq_get_guest_info(desc);
+ perfc_incr(guest_irqs);
desc->handler->end(desc);
set_bit(_IRQ_INPROGRESS, &desc->status);
- desc->arch.eoi_cpu = smp_processor_id();
- /* the irq cannot be a PPI, we only support delivery of SPIs to
- * guests */
- vgic_vcpu_inject_spi(d, irq);
+ /*
+ * The irq cannot be a PPI, we only support delivery of SPIs to
+ * guests.
+ */
+ vgic_vcpu_inject_spi(info->d, info->virq);
goto out_no_end;
}
@@ -370,26 +388,79 @@ err:
return rc;
}
-int route_irq_to_guest(struct domain *d, unsigned int irq,
- const char * devname)
+bool_t is_assignable_irq(unsigned int irq)
+{
+ /* For now, we can only route SPIs to the guest */
+ return ((irq >= NR_LOCAL_IRQS) && (irq < gic_number_lines()));
+}
+
+/*
+ * Route an IRQ to a specific guest.
+ * For now only SPIs are assignable to the guest.
+ */
+int route_irq_to_guest(struct domain *d, unsigned int virq,
+ unsigned int irq, const char * devname)
{
struct irqaction *action;
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_guest *info;
+ struct irq_desc *desc;
unsigned long flags;
int retval = 0;
+ if ( virq >= vgic_num_irqs(d) )
+ {
+ printk(XENLOG_G_ERR
+ "the vIRQ number %u is too high for domain %u (max = %u)\n",
+ virq, d->domain_id, vgic_num_irqs(d));
+ return -EINVAL;
+ }
+
+ /* Only routing to virtual SPIs is supported */
+ if ( virq < NR_LOCAL_IRQS )
+ {
+ printk(XENLOG_G_ERR "IRQ can only be routed to an SPI\n");
+ return -EINVAL;
+ }
+
+ if ( !is_assignable_irq(irq) )
+ {
+ printk(XENLOG_G_ERR "the IRQ%u is not routable\n", irq);
+ return -EINVAL;
+ }
+ desc = irq_to_desc(irq);
+
action = xmalloc(struct irqaction);
- if (!action)
+ if ( !action )
+ return -ENOMEM;
+
+ info = xmalloc(struct irq_guest);
+ if ( !info )
+ {
+ xfree(action);
return -ENOMEM;
+ }
+
+ info->d = d;
+ info->virq = virq;
- action->dev_id = d;
+ action->dev_id = info;
action->name = devname;
action->free_on_release = 1;
spin_lock_irqsave(&desc->lock, flags);
- /* If the IRQ is already used by someone
- * - If it's the same domain -> Xen doesn't need to update the IRQ desc
+ if ( desc->arch.type == DT_IRQ_TYPE_INVALID )
+ {
+ printk(XENLOG_G_ERR "IRQ %u has not been configured\n", irq);
+ retval = -EIO;
+ goto out;
+ }
+
+ /*
+ * If the IRQ is already used by someone
+ * - If it's the same domain -> Xen doesn't need to update the IRQ desc.
+ * For safety, check that we are not trying to assign the IRQ to a
+ * different vIRQ.
* - Otherwise -> For now, don't allow the IRQ to be shared between
* Xen and domains.
*/
@@ -398,13 +469,22 @@ int route_irq_to_guest(struct domain *d, unsigned int irq,
struct domain *ad = irq_get_domain(desc);
if ( test_bit(_IRQ_GUEST, &desc->status) && d == ad )
+ {
+ if ( irq_get_guest_info(desc)->virq != virq )
+ {
+ printk(XENLOG_G_ERR
+ "d%u: IRQ %u is already assigned to vIRQ %u\n",
+ d->domain_id, irq, irq_get_guest_info(desc)->virq);
+ retval = -EBUSY;
+ }
goto out;
+ }
if ( test_bit(_IRQ_GUEST, &desc->status) )
- printk(XENLOG_ERR "ERROR: IRQ %u is already used by domain %u\n",
+ printk(XENLOG_G_ERR "IRQ %u is already used by domain %u\n",
irq, ad->domain_id);
else
- printk(XENLOG_ERR "ERROR: IRQ %u is already used by Xen\n", irq);
+ printk(XENLOG_G_ERR "IRQ %u is already used by Xen\n", irq);
retval = -EBUSY;
goto out;
}
@@ -413,18 +493,73 @@ int route_irq_to_guest(struct domain *d, unsigned int irq,
if ( retval )
goto out;
- gic_route_irq_to_guest(d, desc, cpumask_of(smp_processor_id()),
- GIC_PRI_IRQ);
+ retval = gic_route_irq_to_guest(d, virq, desc, GIC_PRI_IRQ);
+
spin_unlock_irqrestore(&desc->lock, flags);
+
+ if ( retval )
+ {
+ release_irq(desc->irq, info);
+ goto free_info;
+ }
+
return 0;
out:
spin_unlock_irqrestore(&desc->lock, flags);
xfree(action);
+free_info:
+ xfree(info);
return retval;
}
+int release_guest_irq(struct domain *d, unsigned int virq)
+{
+ struct irq_desc *desc;
+ struct irq_guest *info;
+ unsigned long flags;
+ struct pending_irq *p;
+ int ret;
+
+ /* Only SPIs are supported */
+ if ( virq < NR_LOCAL_IRQS || virq >= vgic_num_irqs(d) )
+ return -EINVAL;
+
+ p = spi_to_pending(d, virq);
+ if ( !p->desc )
+ return -EINVAL;
+
+ desc = p->desc;
+
+ spin_lock_irqsave(&desc->lock, flags);
+
+ ret = -EINVAL;
+ if ( !test_bit(_IRQ_GUEST, &desc->status) )
+ goto unlock;
+
+ info = irq_get_guest_info(desc);
+ ret = -EINVAL;
+ if ( d != info->d )
+ goto unlock;
+
+ ret = gic_remove_irq_from_guest(d, virq, desc);
+ if ( ret )
+ goto unlock;
+
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ release_irq(desc->irq, info);
+ xfree(info);
+
+ return 0;
+
+unlock:
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return ret;
+}
+
/*
* pirq event channels. We don't use these on ARM, instead we use the
* features of the GIC to inject virtualised normal interrupts.
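
Because a physical IRQ may now be presented to the guest under a
different vIRQ number, desc->action->dev_id carries a small irq_guest
descriptor instead of a bare domain pointer. In isolation, the lookup
used by do_IRQ() above works like this (sketch):

    struct irq_guest
    {
        struct domain *d;     /* owning guest */
        unsigned int virq;    /* vIRQ this pIRQ is presented as */
    };

    /* only valid while desc->lock is held and _IRQ_GUEST is set */
    struct irq_guest *info = irq_get_guest_info(desc);
    vgic_vcpu_inject_spi(info->d, info->virq);
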
diff --git a/xen/arch/arm/kernel.c b/xen/arch/arm/kernel.c
index 209c3dd..f641b12 100644
--- a/xen/arch/arm/kernel.c
+++ b/xen/arch/arm/kernel.c
@@ -182,7 +182,7 @@ static void kernel_zimage_load(struct kernel_info *info)
return;
}
- dst = map_domain_page(ma>>PAGE_SHIFT);
+ dst = map_domain_page(_mfn(paddr_to_pfn(ma)));
copy_from_paddr(dst + s, paddr + offs, l);
diff --git a/xen/arch/arm/kernel.h b/xen/arch/arm/kernel.h
index 0050dfb..c1b07d4 100644
--- a/xen/arch/arm/kernel.h
+++ b/xen/arch/arm/kernel.h
@@ -22,6 +22,10 @@ struct kernel_info {
/* kernel entry point */
paddr_t entry;
+ /* grant table region */
+ paddr_t gnttab_start;
+ paddr_t gnttab_size;
+
/* boot blob load addresses */
const struct bootmodule *kernel_bootmodule, *initrd_bootmodule;
paddr_t dtb_paddr;
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index 7d4ba0c..b5d8c85 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -213,7 +213,7 @@ void dump_pt_walk(paddr_t ttbr, paddr_t addr,
else
root_table = 0;
- mapping = map_domain_page(root_pfn + root_table);
+ mapping = map_domain_page(_mfn(root_pfn + root_table));
for ( level = root_level; ; level++ )
{
@@ -230,7 +230,7 @@ void dump_pt_walk(paddr_t ttbr, paddr_t addr,
/* For next iteration */
unmap_domain_page(mapping);
- mapping = map_domain_page(pte.walk.base);
+ mapping = map_domain_page(_mfn(pte.walk.base));
}
unmap_domain_page(mapping);
@@ -271,7 +271,7 @@ void clear_fixmap(unsigned map)
}
#ifdef CONFIG_DOMAIN_PAGE
-void *map_domain_page_global(unsigned long mfn)
+void *map_domain_page_global(mfn_t mfn)
{
return vmap(&mfn, 1);
}
@@ -282,11 +282,11 @@ void unmap_domain_page_global(const void *va)
}
/* Map a page of domheap memory */
-void *map_domain_page(unsigned long mfn)
+void *map_domain_page(mfn_t mfn)
{
unsigned long flags;
lpae_t *map = this_cpu(xen_dommap);
- unsigned long slot_mfn = mfn & ~LPAE_ENTRY_MASK;
+ unsigned long slot_mfn = mfn_x(mfn) & ~LPAE_ENTRY_MASK;
vaddr_t va;
lpae_t pte;
int i, slot;
@@ -339,7 +339,7 @@ void *map_domain_page(unsigned long mfn)
va = (DOMHEAP_VIRT_START
+ (slot << SECOND_SHIFT)
- + ((mfn & LPAE_ENTRY_MASK) << THIRD_SHIFT));
+ + ((mfn_x(mfn) & LPAE_ENTRY_MASK) << THIRD_SHIFT));
/*
* We may not have flushed this specific subpage at map time,
@@ -386,7 +386,7 @@ unsigned long domain_page_map_to_mfn(const void *ptr)
void flush_page_to_ram(unsigned long mfn)
{
- void *v = map_domain_page(mfn);
+ void *v = map_domain_page(_mfn(mfn));
clean_and_invalidate_dcache_va_range(v, PAGE_SIZE);
unmap_domain_page(v);
@@ -399,7 +399,7 @@ void __init arch_init_memory(void)
* Any Xen-heap pages that we will allow to be mapped will have
* their domain field set to dom_xen.
*/
- dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
+ dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0, NULL);
BUG_ON(IS_ERR(dom_xen));
/*
@@ -407,14 +407,14 @@ void __init arch_init_memory(void)
* This domain owns I/O pages that are within the range of the page_info
* array. Mappings occur at the priv of the caller.
*/
- dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
+ dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0, NULL);
BUG_ON(IS_ERR(dom_io));
/*
* Initialise our COW domain.
* This domain owns sharable pages.
*/
- dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0);
+ dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0, NULL);
BUG_ON(IS_ERR(dom_cow));
}
@@ -794,10 +794,10 @@ void *__init arch_vmap_virt_end(void)
*/
void *ioremap_attr(paddr_t pa, size_t len, unsigned int attributes)
{
- unsigned long pfn = PFN_DOWN(pa);
+ mfn_t mfn = _mfn(PFN_DOWN(pa));
unsigned int offs = pa & (PAGE_SIZE - 1);
unsigned int nr = PFN_UP(offs + len);
- void *ptr = __vmap(&pfn, nr, 1, 1, attributes);
+ void *ptr = __vmap(&mfn, nr, 1, 1, attributes);
if ( ptr == NULL )
return NULL;
@@ -827,7 +827,8 @@ static int create_xen_table(lpae_t *entry)
enum xenmap_operation {
INSERT,
- REMOVE
+ REMOVE,
+ RESERVE
};
static int create_xen_entries(enum xenmap_operation op,
@@ -859,12 +860,15 @@ static int create_xen_entries(enum xenmap_operation op,
switch ( op ) {
case INSERT:
+ case RESERVE:
if ( third[third_table_offset(addr)].pt.valid )
{
printk("create_xen_entries: trying to replace an existing mapping addr=%lx mfn=%lx\n",
addr, mfn);
return -EINVAL;
}
+ if ( op == RESERVE )
+ break;
pte = mfn_to_xen_entry(mfn, ai);
pte.pt.table = 1;
write_pte(&third[third_table_offset(addr)], pte);
@@ -898,6 +902,13 @@ int map_pages_to_xen(unsigned long virt,
{
return create_xen_entries(INSERT, virt, mfn, nr_mfns, flags);
}
+
+int populate_pt_range(unsigned long virt, unsigned long mfn,
+ unsigned long nr_mfns)
+{
+ return create_xen_entries(RESERVE, virt, mfn, nr_mfns, 0);
+}
+
void destroy_xen_mappings(unsigned long v, unsigned long e)
{
create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0);
@@ -985,7 +996,7 @@ int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
unsigned long domain_get_maximum_gpfn(struct domain *d)
{
- return -ENOSYS;
+ return d->arch.p2m.max_mapped_gfn;
}
void share_xen_page_with_guest(struct page_info *page,
@@ -1037,7 +1048,7 @@ int xenmem_add_to_physmap_one(
switch ( space )
{
case XENMAPSPACE_grant_table:
- spin_lock(&d->grant_table->lock);
+ write_lock(&d->grant_table->lock);
if ( d->grant_table->gt_version == 0 )
d->grant_table->gt_version = 1;
@@ -1067,7 +1078,7 @@ int xenmem_add_to_physmap_one(
t = p2m_ram_rw;
- spin_unlock(&d->grant_table->lock);
+ write_unlock(&d->grant_table->lock);
break;
case XENMAPSPACE_shared_info:
if ( idx != 0 )
@@ -1103,7 +1114,6 @@ int xenmem_add_to_physmap_one(
page = get_page_from_gfn(od, idx, &p2mt, P2M_ALLOC);
if ( !page )
{
- dump_p2m_lookup(od, pfn_to_paddr(idx));
rcu_unlock_domain(od);
return -EINVAL;
}
@@ -1159,6 +1169,7 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg)
struct domain *page_get_owner_and_reference(struct page_info *page)
{
unsigned long x, y = page->count_info;
+ struct domain *owner;
do {
x = y;
@@ -1171,7 +1182,10 @@ struct domain *page_get_owner_and_reference(struct page_info *page)
}
while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
- return page_get_owner(page);
+ owner = page_get_owner(page);
+ ASSERT(owner);
+
+ return owner;
}
void put_page(struct page_info *page)
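
The map_domain_page() family above now takes mfn_t, a typesafe wrapper
around the raw frame number: conversions must go through _mfn() and
mfn_x() explicitly, so passing a guest frame number or a physical
address where a machine frame number is expected becomes a compile-time
error rather than a runtime surprise. Typical use, as in
flush_page_to_ram() above:

    void *v = map_domain_page(_mfn(mfn));  /* wrap the raw MFN */
    clean_and_invalidate_dcache_va_range(v, PAGE_SIZE);
    unmap_domain_page(v);
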
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 8809f5a..e396c40 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -5,6 +5,9 @@
#include <xen/errno.h>
#include <xen/domain_page.h>
#include <xen/bitops.h>
+#include <xen/vm_event.h>
+#include <xen/mem_access.h>
+#include <public/vm_event.h>
#include <asm/flushtlb.h>
#include <asm/gic.h>
#include <asm/event.h>
@@ -24,6 +27,8 @@ static unsigned int __read_mostly p2m_root_level;
#define P2M_ROOT_PAGES (1<<P2M_ROOT_ORDER)
+unsigned int __read_mostly p2m_ipa_bits;
+
static bool_t p2m_valid(lpae_t pte)
{
return pte.p2m.valid;
@@ -139,7 +144,7 @@ void flush_tlb_domain(struct domain *d)
* There are no processor functions to do a stage 2 only lookup therefore we
* do a a software walk.
*/
-paddr_t p2m_lookup(struct domain *d, paddr_t paddr, p2m_type_t *t)
+static paddr_t __p2m_lookup(struct domain *d, paddr_t paddr, p2m_type_t *t)
{
struct p2m_domain *p2m = &d->arch.p2m;
const unsigned int offsets[4] = {
@@ -157,6 +162,7 @@ paddr_t p2m_lookup(struct domain *d, paddr_t paddr, p2m_type_t *t)
p2m_type_t _t;
unsigned int level, root_table;
+ ASSERT(spin_is_locked(&p2m->lock));
BUILD_BUG_ON(THIRD_MASK != PAGE_MASK);
/* Allow t to be NULL */
@@ -179,8 +185,6 @@ paddr_t p2m_lookup(struct domain *d, paddr_t paddr, p2m_type_t *t)
else
root_table = 0;
- spin_lock(&p2m->lock);
-
map = __map_domain_page(p2m->root + root_table);
ASSERT(P2M_ROOT_LEVEL < 4);
@@ -202,7 +206,7 @@ paddr_t p2m_lookup(struct domain *d, paddr_t paddr, p2m_type_t *t)
/* Map for next level */
unmap_domain_page(map);
- map = map_domain_page(pte.p2m.base);
+ map = map_domain_page(_mfn(pte.p2m.base));
}
unmap_domain_page(map);
@@ -215,11 +219,22 @@ paddr_t p2m_lookup(struct domain *d, paddr_t paddr, p2m_type_t *t)
*t = pte.p2m.type;
}
- spin_unlock(&p2m->lock);
err:
return maddr;
}
+paddr_t p2m_lookup(struct domain *d, paddr_t paddr, p2m_type_t *t)
+{
+ paddr_t ret;
+ struct p2m_domain *p2m = &d->arch.p2m;
+
+ spin_lock(&p2m->lock);
+ ret = __p2m_lookup(d, paddr, t);
+ spin_unlock(&p2m->lock);
+
+ return ret;
+}
+
int guest_physmap_mark_populate_on_demand(struct domain *d,
unsigned long gfn,
unsigned int order)
@@ -305,7 +320,7 @@ static void p2m_set_permission(lpae_t *e, p2m_type_t t, p2m_access_t a)
}
static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr,
- p2m_type_t t)
+ p2m_type_t t, p2m_access_t a)
{
paddr_t pa = ((paddr_t) mfn) << PAGE_SHIFT;
/* sh, xn and write bit will be defined in the following switches
@@ -335,8 +350,7 @@ static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr,
break;
}
- /* We pass p2m_access_rwx as a placeholder for now. */
- p2m_set_permission(&e, t, p2m_access_rwx);
+ p2m_set_permission(&e, t, a);
ASSERT(!(pa & ~PAGE_MASK));
ASSERT(!(pa & ~PADDR_MASK));
@@ -394,7 +408,7 @@ static int p2m_create_table(struct domain *d, lpae_t *entry,
for ( i=0 ; i < LPAE_ENTRIES; i++ )
{
pte = mfn_to_p2m_entry(base_pfn + (i<<(level_shift-LPAE_SHIFT)),
- MATTR_MEM, t);
+ MATTR_MEM, t, p2m->default_access);
/*
* First and second level super pages set p2m.table = 0, but
@@ -414,19 +428,114 @@ static int p2m_create_table(struct domain *d, lpae_t *entry,
unmap_domain_page(p);
- pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid);
+ pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid,
+ p2m->default_access);
p2m_write_pte(entry, pte, flush_cache);
return 0;
}
+static int __p2m_get_mem_access(struct domain *d, gfn_t gfn,
+ xenmem_access_t *access)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ void *i;
+ unsigned int index;
+
+ static const xenmem_access_t memaccess[] = {
+#define ACCESS(ac) [p2m_access_##ac] = XENMEM_access_##ac
+ ACCESS(n),
+ ACCESS(r),
+ ACCESS(w),
+ ACCESS(rw),
+ ACCESS(x),
+ ACCESS(rx),
+ ACCESS(wx),
+ ACCESS(rwx),
+ ACCESS(rx2rw),
+ ACCESS(n2rwx),
+#undef ACCESS
+ };
+
+    /* If mem_access was never enabled, just return rwx. */
+ if ( !p2m->mem_access_enabled )
+ {
+ *access = XENMEM_access_rwx;
+ return 0;
+ }
+
+    /* If the request is for the default access. */
+ if ( gfn_x(gfn) == INVALID_GFN )
+ {
+ *access = memaccess[p2m->default_access];
+ return 0;
+ }
+
+ i = radix_tree_lookup(&p2m->mem_access_settings, gfn_x(gfn));
+
+ if ( !i )
+ {
+ /*
+ * No setting was found in the Radix tree. Check if the
+ * entry exists in the page-tables.
+ */
+ paddr_t maddr = p2m_lookup(d, gfn_x(gfn) << PAGE_SHIFT, NULL);
+ if ( INVALID_PADDR == maddr )
+ return -ESRCH;
+
+        /* If the entry exists then it's rwx. */
+ *access = XENMEM_access_rwx;
+ }
+ else
+ {
+ /* Setting was found in the Radix tree. */
+ index = radix_tree_ptr_to_int(i);
+ if ( index >= ARRAY_SIZE(memaccess) )
+ return -ERANGE;
+
+ *access = memaccess[index];
+ }
+
+ return 0;
+}
+
+static int p2m_mem_access_radix_set(struct p2m_domain *p2m, unsigned long pfn,
+ p2m_access_t a)
+{
+ int rc;
+
+ if ( !p2m->mem_access_enabled )
+ return 0;
+
+ if ( p2m_access_rwx == a )
+ {
+ radix_tree_delete(&p2m->mem_access_settings, pfn);
+ return 0;
+ }
+
+ rc = radix_tree_insert(&p2m->mem_access_settings, pfn,
+ radix_tree_int_to_ptr(a));
+ if ( rc == -EEXIST )
+ {
+ /* If a setting already exists, change it to the new one */
+ radix_tree_replace_slot(
+ radix_tree_lookup_slot(
+ &p2m->mem_access_settings, pfn),
+ radix_tree_int_to_ptr(a));
+ rc = 0;
+ }
+
+ return rc;
+}
+
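Note how p2m_mem_access_radix_set() treats p2m_access_rwx as "no entry", so
the tree only stores exceptions to the default, and how
radix_tree_int_to_ptr() packs the small access value straight into the slot
with no allocation. A short usage sketch of that insert-or-replace idiom,
assuming the in-tree radix-tree API exactly as used above:

#include <xen/radix-tree.h>   /* Xen's radix-tree API, as used above */
#include <xen/errno.h>

/* Store a small integer directly in a radix tree slot (no allocation). */
int store_setting(struct radix_tree_root *root, unsigned long pfn, int val)
{
    int rc = radix_tree_insert(root, pfn, radix_tree_int_to_ptr(val));

    if ( rc == -EEXIST ) /* slot already populated: overwrite in place */
    {
        radix_tree_replace_slot(radix_tree_lookup_slot(root, pfn),
                                radix_tree_int_to_ptr(val));
        rc = 0;
    }

    return rc;
}

int load_setting(struct radix_tree_root *root, unsigned long pfn, int dflt)
{
    void *p = radix_tree_lookup(root, pfn);

    return p ? radix_tree_ptr_to_int(p) : dflt; /* absent => default */
}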
enum p2m_operation {
INSERT,
ALLOCATE,
REMOVE,
RELINQUISH,
CACHEFLUSH,
+ MEMACCESS,
};
/* Put any references on the single 4K page referenced by pte. TODO:
@@ -537,7 +646,8 @@ static int apply_one_level(struct domain *d,
paddr_t *maddr,
bool_t *flush,
int mattr,
- p2m_type_t t)
+ p2m_type_t t,
+ p2m_access_t a)
{
const paddr_t level_size = level_sizes[level];
const paddr_t level_mask = level_masks[level];
@@ -559,14 +669,23 @@ static int apply_one_level(struct domain *d,
if ( p2m_valid(orig_pte) )
return P2M_ONE_DESCEND;
- if ( is_mapping_aligned(*addr, end_gpaddr, 0, level_size) )
+ if ( is_mapping_aligned(*addr, end_gpaddr, 0, level_size) &&
+ /* We only create superpages when mem_access is not in use. */
+ (level == 3 || (level < 3 && !p2m->mem_access_enabled)) )
{
struct page_info *page;
page = alloc_domheap_pages(d, level_shift - PAGE_SHIFT, 0);
if ( page )
{
- pte = mfn_to_p2m_entry(page_to_mfn(page), mattr, t);
+ rc = p2m_mem_access_radix_set(p2m, paddr_to_pfn(*addr), a);
+ if ( rc < 0 )
+ {
+ free_domheap_page(page);
+ return rc;
+ }
+
+ pte = mfn_to_p2m_entry(page_to_mfn(page), mattr, t, a);
if ( level < 3 )
pte.p2m.table = 0;
p2m_write_pte(entry, pte, flush_cache);
@@ -586,8 +705,8 @@ static int apply_one_level(struct domain *d,
/*
* If we get here then we failed to allocate a sufficiently
* large contiguous region for this level (which can't be
- * L3). Create a page table and continue to descend so we try
- * smaller allocations.
+ * L3) or mem_access is in use. Create a page table and
+ * continue to descend so we try smaller allocations.
*/
rc = p2m_create_table(d, entry, 0, flush_cache);
if ( rc < 0 )
@@ -597,11 +716,18 @@ static int apply_one_level(struct domain *d,
case INSERT:
if ( is_mapping_aligned(*addr, end_gpaddr, *maddr, level_size) &&
- /* We do not handle replacing an existing table with a superpage */
- (level == 3 || !p2m_table(orig_pte)) )
+ /*
+ * We do not handle replacing an existing table with a superpage
+ * or when mem_access is in use.
+ */
+ (level == 3 || (!p2m_table(orig_pte) && !p2m->mem_access_enabled)) )
{
+ rc = p2m_mem_access_radix_set(p2m, paddr_to_pfn(*addr), a);
+ if ( rc < 0 )
+ return rc;
+
/* New mapping is superpage aligned, make it */
- pte = mfn_to_p2m_entry(*maddr >> PAGE_SHIFT, mattr, t);
+ pte = mfn_to_p2m_entry(*maddr >> PAGE_SHIFT, mattr, t, a);
if ( level < 3 )
pte.p2m.table = 0; /* Superpage entry */
@@ -715,6 +841,7 @@ static int apply_one_level(struct domain *d,
memset(&pte, 0x00, sizeof(pte));
p2m_write_pte(entry, pte, flush_cache);
+ p2m_mem_access_radix_set(p2m, paddr_to_pfn(*addr), p2m_access_rwx);
*addr += level_size;
*maddr += level_size;
@@ -759,6 +886,44 @@ static int apply_one_level(struct domain *d,
*addr += PAGE_SIZE;
return P2M_ONE_PROGRESS_NOP;
}
+
+ case MEMACCESS:
+ if ( level < 3 )
+ {
+ if ( !p2m_valid(orig_pte) )
+ {
+ *addr += level_size;
+ return P2M_ONE_PROGRESS_NOP;
+ }
+
+ /* Shatter large pages as we descend */
+ if ( p2m_mapping(orig_pte) )
+ {
+ rc = p2m_shatter_page(d, entry, level, flush_cache);
+ if ( rc < 0 )
+ return rc;
+ } /* else: an existing table mapping -> descend */
+
+ return P2M_ONE_DESCEND;
+ }
+ else
+ {
+ pte = orig_pte;
+
+ if ( p2m_valid(pte) )
+ {
+ rc = p2m_mem_access_radix_set(p2m, paddr_to_pfn(*addr), a);
+ if ( rc < 0 )
+ return rc;
+
+ p2m_set_permission(&pte, pte.p2m.type, a);
+ p2m_write_pte(entry, pte, flush_cache);
+ }
+
+ *addr += level_size;
+ *flush = true;
+ return P2M_ONE_PROGRESS;
+ }
}
BUG(); /* Should never get here */
@@ -770,7 +935,9 @@ static int apply_p2m_changes(struct domain *d,
paddr_t end_gpaddr,
paddr_t maddr,
int mattr,
- p2m_type_t t)
+ uint32_t mask,
+ p2m_type_t t,
+ p2m_access_t a)
{
int rc, ret;
struct p2m_domain *p2m = &d->arch.p2m;
@@ -780,6 +947,10 @@ static int apply_p2m_changes(struct domain *d,
unsigned int cur_root_table = ~0;
unsigned int cur_offset[4] = { ~0, ~0, ~0, ~0 };
unsigned int count = 0;
+ const unsigned long sgfn = paddr_to_pfn(start_gpaddr),
+ egfn = paddr_to_pfn(end_gpaddr);
+ const unsigned int preempt_count_limit = (op == MEMACCESS) ? 1 : 0x2000;
+ const bool_t preempt = !is_idle_vcpu(current);
bool_t flush = false;
bool_t flush_pt;
@@ -807,21 +978,50 @@ static int apply_p2m_changes(struct domain *d,
};
/*
- * Arbitrarily, preempt every 512 operations or 8192 nops.
- * 512*P2M_ONE_PROGRESS == 8192*P2M_ONE_PROGRESS_NOP == 0x2000
- *
- * count is initialised to 0 above, so we are guaranteed to
- * always make at least one pass.
+ * Check if current iteration should be possibly preempted.
+ * Since count is initialised to 0 above we are guaranteed to
+ * always make at least one pass as long as preempt_count_limit is
+ * initialized with a value >= 1.
*/
-
- if ( op == RELINQUISH && count >= 0x2000 )
+ if ( preempt && count >= preempt_count_limit
+ && hypercall_preempt_check() )
{
- if ( hypercall_preempt_check() )
+ switch ( op )
{
+ case RELINQUISH:
+ /*
+ * Arbitrarily, preempt every 512 operations or 8192 nops.
+ * 512*P2M_ONE_PROGRESS == 8192*P2M_ONE_PROGRESS_NOP == 0x2000
+ * This is set in preempt_count_limit.
+                 */
p2m->lowest_mapped_gfn = addr >> PAGE_SHIFT;
rc = -ERESTART;
goto out;
+
+ case MEMACCESS:
+ {
+ /*
+ * Preempt setting mem_access permissions as required by XSA-89,
+ * if it's not the last iteration.
+ */
+ uint32_t progress = paddr_to_pfn(addr) - sgfn + 1;
+
+ if ( (egfn - sgfn) > progress && !(progress & mask) )
+ {
+ rc = progress;
+ goto tlbflush;
+ }
+ break;
}
+
+ default:
+ break;
+            }
+
+ /*
+ * Reset current iteration counter.
+ */
count = 0;
}
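
The preemption rework generalises the old RELINQUISH-only check: every
preempt_count_limit units of work the loop polls hypercall_preempt_check(),
and MEMACCESS additionally returns the number of GFNs completed so the
hypercall can be restarted from that point (the XSA-89 continuation scheme).
A minimal standalone sketch of a preemptible loop returning partial progress
(names hypothetical):

#include <stdbool.h>

extern bool preempt_check(void);   /* stand-in for hypercall_preempt_check() */
extern void process_one(unsigned long gfn);

/* Returns 0 when done, or the count of items processed so the caller
 * can re-invoke with start advanced by that amount (a continuation). */
long apply_range(unsigned long start, unsigned long nr)
{
    unsigned long i;

    for ( i = 0; i < nr; i++ )
    {
        process_one(start + i);

        /* Never preempt before making progress, nor on the last item. */
        if ( i + 1 < nr && preempt_check() )
            return i + 1;
    }

    return 0;
}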
@@ -863,7 +1063,7 @@ static int apply_p2m_changes(struct domain *d,
level, flush_pt, op,
start_gpaddr, end_gpaddr,
&addr, &maddr, &flush,
- mattr, t);
+ mattr, t, a);
if ( ret < 0 ) { rc = ret ; goto out; }
count += ret;
/* L3 had better have done something! We cannot descend any further */
@@ -878,7 +1078,7 @@ static int apply_p2m_changes(struct domain *d,
int i;
if ( mappings[level+1] )
unmap_domain_page(mappings[level+1]);
- mappings[level+1] = map_domain_page(entry->p2m.base);
+ mappings[level+1] = map_domain_page(_mfn(entry->p2m.base));
cur_offset[level] = offset;
/* Any mapping further down is now invalid */
for ( i = level+1; i < 4; i++ )
@@ -888,27 +1088,23 @@ static int apply_p2m_changes(struct domain *d,
}
}
- if ( flush )
- {
- unsigned long sgfn = paddr_to_pfn(start_gpaddr);
- unsigned long egfn = paddr_to_pfn(end_gpaddr);
-
- flush_tlb_domain(d);
- iommu_iotlb_flush(d, sgfn, egfn - sgfn);
- }
-
if ( op == ALLOCATE || op == INSERT )
{
- unsigned long sgfn = paddr_to_pfn(start_gpaddr);
- unsigned long egfn = paddr_to_pfn(end_gpaddr);
-
p2m->max_mapped_gfn = max(p2m->max_mapped_gfn, egfn);
p2m->lowest_mapped_gfn = min(p2m->lowest_mapped_gfn, sgfn);
}
rc = 0;
+tlbflush:
+ if ( flush )
+ {
+ flush_tlb_domain(d);
+ iommu_iotlb_flush(d, sgfn, egfn - sgfn);
+ }
+
out:
+
if ( rc < 0 && ( op == INSERT || op == ALLOCATE ) &&
addr != start_gpaddr )
{
@@ -921,7 +1117,7 @@ out:
*/
apply_p2m_changes(d, REMOVE,
start_gpaddr, addr + level_sizes[level], orig_maddr,
- mattr, p2m_invalid);
+ mattr, 0, p2m_invalid, d->arch.p2m.default_access);
}
for ( level = P2M_ROOT_LEVEL; level < 4; level ++ )
@@ -940,7 +1136,8 @@ int p2m_populate_ram(struct domain *d,
paddr_t end)
{
return apply_p2m_changes(d, ALLOCATE, start, end,
- 0, MATTR_MEM, p2m_ram_rw);
+ 0, MATTR_MEM, 0, p2m_ram_rw,
+ d->arch.p2m.default_access);
}
int map_mmio_regions(struct domain *d,
@@ -952,7 +1149,8 @@ int map_mmio_regions(struct domain *d,
pfn_to_paddr(start_gfn),
pfn_to_paddr(start_gfn + nr),
pfn_to_paddr(mfn),
- MATTR_DEV, p2m_mmio_direct);
+ MATTR_DEV, 0, p2m_mmio_direct,
+ d->arch.p2m.default_access);
}
int unmap_mmio_regions(struct domain *d,
@@ -964,7 +1162,8 @@ int unmap_mmio_regions(struct domain *d,
pfn_to_paddr(start_gfn),
pfn_to_paddr(start_gfn + nr),
pfn_to_paddr(mfn),
- MATTR_DEV, p2m_invalid);
+ MATTR_DEV, 0, p2m_invalid,
+ d->arch.p2m.default_access);
}
int guest_physmap_add_entry(struct domain *d,
@@ -976,7 +1175,8 @@ int guest_physmap_add_entry(struct domain *d,
return apply_p2m_changes(d, INSERT,
pfn_to_paddr(gpfn),
pfn_to_paddr(gpfn + (1 << page_order)),
- pfn_to_paddr(mfn), MATTR_MEM, t);
+ pfn_to_paddr(mfn), MATTR_MEM, 0, t,
+ d->arch.p2m.default_access);
}
void guest_physmap_remove_page(struct domain *d,
@@ -986,7 +1186,8 @@ void guest_physmap_remove_page(struct domain *d,
apply_p2m_changes(d, REMOVE,
pfn_to_paddr(gpfn),
pfn_to_paddr(gpfn + (1<<page_order)),
- pfn_to_paddr(mfn), MATTR_MEM, p2m_invalid);
+ pfn_to_paddr(mfn), MATTR_MEM, 0, p2m_invalid,
+ d->arch.p2m.default_access);
}
int p2m_alloc_table(struct domain *d)
@@ -1090,6 +1291,8 @@ void p2m_teardown(struct domain *d)
p2m_free_vmid(d);
+ radix_tree_destroy(&p2m->mem_access_settings, NULL);
+
spin_unlock(&p2m->lock);
}
@@ -1115,6 +1318,10 @@ int p2m_init(struct domain *d)
p2m->max_mapped_gfn = 0;
p2m->lowest_mapped_gfn = ULONG_MAX;
+ p2m->default_access = p2m_access_rwx;
+ p2m->mem_access_enabled = false;
+ radix_tree_init(&p2m->mem_access_settings);
+
err:
spin_unlock(&p2m->lock);
@@ -1129,7 +1336,8 @@ int relinquish_p2m_mapping(struct domain *d)
pfn_to_paddr(p2m->lowest_mapped_gfn),
pfn_to_paddr(p2m->max_mapped_gfn),
pfn_to_paddr(INVALID_MFN),
- MATTR_MEM, p2m_invalid);
+ MATTR_MEM, 0, p2m_invalid,
+ d->arch.p2m.default_access);
}
int p2m_cache_flush(struct domain *d, xen_pfn_t start_mfn, xen_pfn_t end_mfn)
@@ -1143,7 +1351,8 @@ int p2m_cache_flush(struct domain *d, xen_pfn_t start_mfn, xen_pfn_t end_mfn)
pfn_to_paddr(start_mfn),
pfn_to_paddr(end_mfn),
pfn_to_paddr(INVALID_MFN),
- MATTR_MEM, p2m_invalid);
+ MATTR_MEM, 0, p2m_invalid,
+ d->arch.p2m.default_access);
}
unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
@@ -1152,6 +1361,103 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
return p >> PAGE_SHIFT;
}
+/*
+ * If mem_access is in use it might have been the reason why get_page_from_gva
+ * failed to fetch the page, as it uses the MMU for the permission checking.
+ * Only in that case do we fall back to a software-based check, and fetch the
+ * page if we indeed find a conflicting mem_access setting.
+ */
+static struct page_info*
+p2m_mem_access_check_and_get_page(vaddr_t gva, unsigned long flag)
+{
+ long rc;
+ paddr_t ipa;
+ unsigned long maddr;
+ unsigned long mfn;
+ xenmem_access_t xma;
+ p2m_type_t t;
+ struct page_info *page = NULL;
+
+ rc = gva_to_ipa(gva, &ipa, flag);
+ if ( rc < 0 )
+ goto err;
+
+ /*
+ * We do this first as this is faster in the default case when no
+ * permission is set on the page.
+ */
+ rc = __p2m_get_mem_access(current->domain, _gfn(paddr_to_pfn(ipa)), &xma);
+ if ( rc < 0 )
+ goto err;
+
+ /* Let's check if mem_access limited the access. */
+ switch ( xma )
+ {
+ default:
+ case XENMEM_access_rwx:
+ case XENMEM_access_rw:
+ /*
+ * If mem_access contains no rw perm restrictions at all then the original
+ * fault was correct.
+ */
+ goto err;
+ case XENMEM_access_n2rwx:
+ case XENMEM_access_n:
+ case XENMEM_access_x:
+ /*
+ * If no r/w is permitted by mem_access, this was a fault caused by mem_access.
+ */
+ break;
+ case XENMEM_access_wx:
+ case XENMEM_access_w:
+ /*
+ * If this was a read then it was because of mem_access, but if it was
+ * a write then the original get_page_from_gva fault was correct.
+ */
+ if ( flag == GV2M_READ )
+ break;
+ else
+ goto err;
+ case XENMEM_access_rx2rw:
+ case XENMEM_access_rx:
+ case XENMEM_access_r:
+ /*
+ * If this was a write then it was because of mem_access, but if it was
+ * a read then the original get_page_from_gva fault was correct.
+ */
+ if ( flag == GV2M_WRITE )
+ break;
+ else
+ goto err;
+ }
+
+ /*
+ * We had a mem_access permission limiting the access, but the page type
+ * could also be limiting, so we need to check that as well.
+ */
+ maddr = __p2m_lookup(current->domain, ipa, &t);
+ if ( maddr == INVALID_PADDR )
+ goto err;
+
+ mfn = maddr >> PAGE_SHIFT;
+ if ( !mfn_valid(mfn) )
+ goto err;
+
+ /*
+ * Base type doesn't allow r/w
+ */
+ if ( t != p2m_ram_rw )
+ goto err;
+
+ page = mfn_to_page(mfn);
+
+ if ( unlikely(!get_page(page, current->domain)) )
+ page = NULL;
+
+err:
+ return page;
+}
+
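The switch above classifies each access setting by whether it can explain a
read or a write fault; only when mem_access is implicated is the slower p2m
walk and type check performed. A compact standalone restatement of that
classification (enum names hypothetical):

#include <stdbool.h>

enum access { A_N, A_R, A_W, A_X, A_RW, A_RX, A_WX, A_RWX };

/* Would this mem_access setting forbid the given read/write?  If so, the
 * permission fault was caused by mem_access rather than the underlying
 * page tables (mirrors the switch above). */
static bool blocked_by_mem_access(enum access a, bool is_write)
{
    switch ( a )
    {
    case A_RWX:
    case A_RW:
        return false;       /* both r and w allowed: not mem_access's fault */
    case A_N:
    case A_X:
        return true;        /* neither r nor w allowed */
    case A_WX:
    case A_W:
        return !is_write;   /* only reads are blocked */
    case A_RX:
    case A_R:
        return is_write;    /* only writes are blocked */
    default:
        return true;
    }
}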
struct page_info *get_page_from_gva(struct domain *d, vaddr_t va,
unsigned long flags)
{
@@ -1192,7 +1498,11 @@ struct page_info *get_page_from_gva(struct domain *d, vaddr_t va,
page = NULL;
err:
+ if ( !page && p2m->mem_access_enabled )
+ page = p2m_mem_access_check_and_get_page(va, flags);
+
spin_unlock(&p2m->lock);
+
return page;
}
@@ -1210,6 +1520,7 @@ void __init setup_virt_paging(void)
#ifdef CONFIG_ARM_32
printk("P2M: 40-bit IPA\n");
+ p2m_ipa_bits = 40;
val |= VTCR_T0SZ(0x18); /* 40 bit IPA */
val |= VTCR_SL0(0x1); /* P2M starts at first level */
#else /* CONFIG_ARM_64 */
@@ -1252,9 +1563,10 @@ void __init setup_virt_paging(void)
p2m_root_order = pa_range_info[pa_range].root_order;
p2m_root_level = 2 - pa_range_info[pa_range].sl0;
+ p2m_ipa_bits = 64 - pa_range_info[pa_range].t0sz;
printk("P2M: %d-bit IPA with %d-bit PA\n",
- 64 - pa_range_info[pa_range].t0sz,
+ p2m_ipa_bits,
pa_range_info[pa_range].pabits);
#endif
printk("P2M: %d levels with order-%d root, VTCR 0x%lx\n",
@@ -1265,6 +1577,212 @@ void __init setup_virt_paging(void)
smp_call_function(setup_virt_paging_one, (void *)val, 1);
}
+bool_t p2m_mem_access_check(paddr_t gpa, vaddr_t gla, const struct npfec npfec)
+{
+ int rc;
+ bool_t violation;
+ xenmem_access_t xma;
+ vm_event_request_t *req;
+ struct vcpu *v = current;
+ struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
+
+ /* Mem_access is not in use. */
+ if ( !p2m->mem_access_enabled )
+ return true;
+
+ rc = p2m_get_mem_access(v->domain, _gfn(paddr_to_pfn(gpa)), &xma);
+ if ( rc )
+ return true;
+
+ /* Now check for mem_access violation. */
+ switch ( xma )
+ {
+ case XENMEM_access_rwx:
+ violation = false;
+ break;
+ case XENMEM_access_rw:
+ violation = npfec.insn_fetch;
+ break;
+ case XENMEM_access_wx:
+ violation = npfec.read_access;
+ break;
+ case XENMEM_access_rx:
+ case XENMEM_access_rx2rw:
+ violation = npfec.write_access;
+ break;
+ case XENMEM_access_x:
+ violation = npfec.read_access || npfec.write_access;
+ break;
+ case XENMEM_access_w:
+ violation = npfec.read_access || npfec.insn_fetch;
+ break;
+ case XENMEM_access_r:
+ violation = npfec.write_access || npfec.insn_fetch;
+ break;
+ default:
+ case XENMEM_access_n:
+ case XENMEM_access_n2rwx:
+ violation = true;
+ break;
+ }
+
+ if ( !violation )
+ return true;
+
+ /* First, handle rx2rw and n2rwx conversion automatically. */
+ if ( npfec.write_access && xma == XENMEM_access_rx2rw )
+ {
+ rc = p2m_set_mem_access(v->domain, _gfn(paddr_to_pfn(gpa)), 1,
+ 0, ~0, XENMEM_access_rw);
+ return false;
+ }
+ else if ( xma == XENMEM_access_n2rwx )
+ {
+ rc = p2m_set_mem_access(v->domain, _gfn(paddr_to_pfn(gpa)), 1,
+ 0, ~0, XENMEM_access_rwx);
+ }
+
+ /* Otherwise, check if there is a vm_event monitor subscriber */
+ if ( !vm_event_check_ring(&v->domain->vm_event->monitor) )
+ {
+ /* No listener */
+ if ( p2m->access_required )
+ {
+ gdprintk(XENLOG_INFO, "Memory access permissions failure, "
+ "no vm_event listener VCPU %d, dom %d\n",
+ v->vcpu_id, v->domain->domain_id);
+ domain_crash(v->domain);
+ }
+ else
+ {
+ /* n2rwx was already handled */
+ if ( xma != XENMEM_access_n2rwx )
+ {
+ /* A listener is not required, so clear the access
+ * restrictions. */
+ rc = p2m_set_mem_access(v->domain, _gfn(paddr_to_pfn(gpa)), 1,
+ 0, ~0, XENMEM_access_rwx);
+ }
+ }
+
+ /* No need to reinject */
+ return false;
+ }
+
+ req = xzalloc(vm_event_request_t);
+ if ( req )
+ {
+ req->reason = VM_EVENT_REASON_MEM_ACCESS;
+
+ /* Pause the current VCPU */
+ if ( xma != XENMEM_access_n2rwx )
+ req->flags |= VM_EVENT_FLAG_VCPU_PAUSED;
+
+ /* Send request to mem access subscriber */
+ req->u.mem_access.gfn = gpa >> PAGE_SHIFT;
+ req->u.mem_access.offset = gpa & ((1 << PAGE_SHIFT) - 1);
+ if ( npfec.gla_valid )
+ {
+ req->u.mem_access.flags |= MEM_ACCESS_GLA_VALID;
+ req->u.mem_access.gla = gla;
+
+ if ( npfec.kind == npfec_kind_with_gla )
+ req->u.mem_access.flags |= MEM_ACCESS_FAULT_WITH_GLA;
+ else if ( npfec.kind == npfec_kind_in_gpt )
+ req->u.mem_access.flags |= MEM_ACCESS_FAULT_IN_GPT;
+ }
+ req->u.mem_access.flags |= npfec.read_access ? MEM_ACCESS_R : 0;
+ req->u.mem_access.flags |= npfec.write_access ? MEM_ACCESS_W : 0;
+ req->u.mem_access.flags |= npfec.insn_fetch ? MEM_ACCESS_X : 0;
+ req->vcpu_id = v->vcpu_id;
+
+ mem_access_send_req(v->domain, req);
+ xfree(req);
+ }
+
+ /* Pause the current VCPU */
+ if ( xma != XENMEM_access_n2rwx )
+ vm_event_vcpu_pause(v);
+
+ return false;
+}
+
+/*
+ * Set access type for a region of pfns.
+ * If gfn == INVALID_GFN, sets the default access type.
+ */
+long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
+ uint32_t start, uint32_t mask, xenmem_access_t access)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ p2m_access_t a;
+ long rc = 0;
+
+ static const p2m_access_t memaccess[] = {
+#define ACCESS(ac) [XENMEM_access_##ac] = p2m_access_##ac
+ ACCESS(n),
+ ACCESS(r),
+ ACCESS(w),
+ ACCESS(rw),
+ ACCESS(x),
+ ACCESS(rx),
+ ACCESS(wx),
+ ACCESS(rwx),
+ ACCESS(rx2rw),
+ ACCESS(n2rwx),
+#undef ACCESS
+ };
+
+ switch ( access )
+ {
+ case 0 ... ARRAY_SIZE(memaccess) - 1:
+ a = memaccess[access];
+ break;
+ case XENMEM_access_default:
+ a = p2m->default_access;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /*
+     * Flip mem_access_enabled to true when a permission is set, so as to
+     * prevent
+ * allocating or inserting super-pages.
+ */
+ p2m->mem_access_enabled = true;
+
+    /* If the request is to set the default access. */
+ if ( gfn_x(gfn) == INVALID_GFN )
+ {
+ p2m->default_access = a;
+ return 0;
+ }
+
+ rc = apply_p2m_changes(d, MEMACCESS,
+ pfn_to_paddr(gfn_x(gfn) + start),
+ pfn_to_paddr(gfn_x(gfn) + nr),
+ 0, MATTR_MEM, mask, 0, a);
+ if ( rc < 0 )
+ return rc;
+ else if ( rc > 0 )
+ return start + rc;
+
+ return 0;
+}
+
+int p2m_get_mem_access(struct domain *d, gfn_t gfn,
+ xenmem_access_t *access)
+{
+ int ret;
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+ spin_lock(&p2m->lock);
+ ret = __p2m_get_mem_access(d, gfn, access);
+ spin_unlock(&p2m->lock);
+
+ return ret;
+}
+
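Both conversion tables in this file use designated array initializers keyed
by the source enumeration, so the mapping survives any reordering of the enum
and ARRAY_SIZE() gives a natural bounds check on the input. A standalone
sketch of the idiom:

#include <stdio.h>

enum color { RED, GREEN, BLUE, NR_COLORS };

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const char *const color_name[] = {
    [RED]   = "red",     /* index = enum value, order-independent */
    [GREEN] = "green",
    [BLUE]  = "blue",
};

const char *name_of(unsigned int c)
{
    return (c < ARRAY_SIZE(color_name)) ? color_name[c] : "unknown";
}

int main(void)
{
    printf("%s\n", name_of(GREEN)); /* prints "green" */
    return 0;
}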
/*
* Local variables:
* mode: C
diff --git a/xen/arch/arm/platform.c b/xen/arch/arm/platform.c
index cb4cda8..0af6d57 100644
--- a/xen/arch/arm/platform.c
+++ b/xen/arch/arm/platform.c
@@ -44,17 +44,6 @@ static bool_t __init platform_is_compatible(const struct platform_desc *plat)
return 0;
}
-/* List of possible platform */
-static void dump_platform_table(void)
-{
- const struct platform_desc *p;
-
- printk("Available platform support:\n");
-
- for ( p = _splatform; p != _eplatform; p++ )
- printk(" - %s\n", p->name);
-}
-
void __init platform_init(void)
{
int res = 0;
@@ -72,9 +61,7 @@ void __init platform_init(void)
if ( platform == _eplatform )
{
/* TODO: dump DT machine compatible node */
- printk(XENLOG_WARNING "WARNING: Unrecognized/unsupported device tree "
- "compatible list\n");
- dump_platform_table();
+ printk(XENLOG_INFO "Platform: Generic System\n");
platform = NULL;
}
else
@@ -157,28 +144,7 @@ bool_t platform_device_is_blacklisted(const struct dt_device_node *node)
if ( platform && platform->blacklist_dev )
blacklist = platform->blacklist_dev;
- return dt_match_node(blacklist, node);
-}
-
-unsigned int platform_dom0_evtchn_ppi(void)
-{
- if ( platform && platform->dom0_evtchn_ppi )
- return platform->dom0_evtchn_ppi;
- return GUEST_EVTCHN_PPI;
-}
-
-void platform_dom0_gnttab(paddr_t *start, paddr_t *size)
-{
- if ( platform && platform->dom0_gnttab_size )
- {
- *start = platform->dom0_gnttab_start;
- *size = platform->dom0_gnttab_size;
- }
- else
- {
- *start = 0xb0000000;
- *size = 0x20000;
- }
+ return (dt_match_node(blacklist, node) != NULL);
}
/*
diff --git a/xen/arch/arm/platforms/Makefile b/xen/arch/arm/platforms/Makefile
index 8f47c16..e173fec 100644
--- a/xen/arch/arm/platforms/Makefile
+++ b/xen/arch/arm/platforms/Makefile
@@ -4,5 +4,6 @@ obj-$(CONFIG_ARM_32) += exynos5.o
obj-$(CONFIG_ARM_32) += midway.o
obj-$(CONFIG_ARM_32) += omap5.o
obj-$(CONFIG_ARM_32) += sunxi.o
+obj-$(CONFIG_ARM_32) += rcar2.o
obj-$(CONFIG_ARM_64) += seattle.o
obj-$(CONFIG_ARM_64) += xgene-storm.o
diff --git a/xen/arch/arm/platforms/midway.c b/xen/arch/arm/platforms/midway.c
index 42f7697..b221279 100644
--- a/xen/arch/arm/platforms/midway.c
+++ b/xen/arch/arm/platforms/midway.c
@@ -51,9 +51,6 @@ static const char * const midway_dt_compat[] __initconst =
PLATFORM_START(midway, "CALXEDA MIDWAY")
.compatible = midway_dt_compat,
.reset = midway_reset,
-
- .dom0_gnttab_start = 0xff800000,
- .dom0_gnttab_size = 0x20000,
PLATFORM_END
/*
diff --git a/xen/arch/arm/platforms/omap5.c b/xen/arch/arm/platforms/omap5.c
index 9d6e504..a49ba62 100644
--- a/xen/arch/arm/platforms/omap5.c
+++ b/xen/arch/arm/platforms/omap5.c
@@ -155,26 +155,12 @@ static const char * const dra7_dt_compat[] __initconst =
NULL
};
-static const struct dt_device_match dra7_blacklist_dev[] __initconst =
-{
- /* OMAP Linux kernel handles devices with status "disabled" in a
- * weird manner - tries to reset them. While their memory ranges
- * are not mapped, this leads to data aborts, so skip these devices
- * from DT for dom0.
- */
- DT_MATCH_NOT_AVAILABLE(),
- { /* sentinel */ },
-};
-
PLATFORM_START(omap5, "TI OMAP5")
.compatible = omap5_dt_compat,
.init_time = omap5_init_time,
.specific_mapping = omap5_specific_mapping,
.smp_init = omap5_smp_init,
.cpu_up = cpu_up_send_sgi,
-
- .dom0_gnttab_start = 0x4b000000,
- .dom0_gnttab_size = 0x20000,
PLATFORM_END
PLATFORM_START(dra7, "TI DRA7")
@@ -182,10 +168,6 @@ PLATFORM_START(dra7, "TI DRA7")
.init_time = omap5_init_time,
.cpu_up = cpu_up_send_sgi,
.smp_init = omap5_smp_init,
-
- .dom0_gnttab_start = 0x4b000000,
- .dom0_gnttab_size = 0x20000,
- .blacklist_dev = dra7_blacklist_dev,
PLATFORM_END
/*
diff --git a/xen/arch/arm/platforms/rcar2.c b/xen/arch/arm/platforms/rcar2.c
new file mode 100644
index 0000000..bb25751
--- /dev/null
+++ b/xen/arch/arm/platforms/rcar2.c
@@ -0,0 +1,68 @@
+/*
+ * xen/arch/arm/platforms/rcar2.c
+ *
+ * Renesas R-Car Gen2 specific settings
+ *
+ * Iurii Konovalenko <iurii.konovalenko at globallogic.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <xen/mm.h>
+#include <xen/vmap.h>
+#include <asm/platform.h>
+#include <asm/io.h>
+
+#define RCAR2_RAM_ADDR 0xE63C0000
+#define RCAR2_RAM_SIZE 0x1000
+#define RCAR2_SMP_START_OFFSET 0xFFC
+
+static int __init rcar2_smp_init(void)
+{
+ void __iomem *pram;
+
+ /* map ICRAM */
+ pram = ioremap_nocache(RCAR2_RAM_ADDR, RCAR2_RAM_SIZE);
+    if ( !pram )
+    {
+        dprintk(XENLOG_ERR, "Unable to map RCAR2 ICRAM\n");
+ return -ENOMEM;
+ }
+
+ /* setup reset vectors */
+ writel(__pa(init_secondary), pram + RCAR2_SMP_START_OFFSET);
+ iounmap(pram);
+
+ sev();
+
+ return 0;
+}
+
+static const char *const rcar2_dt_compat[] __initdata =
+{
+ "renesas,lager",
+ NULL
+};
+
+PLATFORM_START(rcar2, "Renesas R-Car Gen2")
+ .compatible = rcar2_dt_compat,
+ .cpu_up = cpu_up_send_sgi,
+ .smp_init = rcar2_smp_init,
+PLATFORM_END
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/arm/platforms/seattle.c b/xen/arch/arm/platforms/seattle.c
index 6cc5362..86dce91 100644
--- a/xen/arch/arm/platforms/seattle.c
+++ b/xen/arch/arm/platforms/seattle.c
@@ -45,9 +45,6 @@ PLATFORM_START(seattle, "SEATTLE")
.compatible = seattle_dt_compat,
.reset = seattle_system_reset,
.poweroff = seattle_system_off,
-
- .dom0_gnttab_start = 0xe1700000,
- .dom0_gnttab_size = 0x20000,
PLATFORM_END
/*
diff --git a/xen/arch/arm/platforms/sunxi.c b/xen/arch/arm/platforms/sunxi.c
index 89d8290..0ba7b3d 100644
--- a/xen/arch/arm/platforms/sunxi.c
+++ b/xen/arch/arm/platforms/sunxi.c
@@ -69,9 +69,6 @@ PLATFORM_START(sunxi, "Allwinner A20")
.compatible = sunxi_dt_compat,
.blacklist_dev = sunxi_blacklist_dev,
.reset = sunxi_reset,
-
- .dom0_gnttab_start = 0x01d00000,
- .dom0_gnttab_size = 0x20000,
PLATFORM_END
/*
diff --git a/xen/arch/arm/platforms/vexpress.c b/xen/arch/arm/platforms/vexpress.c
index ce66935..8e6a4ea 100644
--- a/xen/arch/arm/platforms/vexpress.c
+++ b/xen/arch/arm/platforms/vexpress.c
@@ -176,8 +176,6 @@ PLATFORM_START(vexpress, "VERSATILE EXPRESS")
#endif
.reset = vexpress_reset,
.blacklist_dev = vexpress_blacklist_dev,
- .dom0_gnttab_start = 0x10000000,
- .dom0_gnttab_size = 0x20000,
PLATFORM_END
/*
diff --git a/xen/arch/arm/platforms/xgene-storm.c b/xen/arch/arm/platforms/xgene-storm.c
index 0b3492d..8b05ed5 100644
--- a/xen/arch/arm/platforms/xgene-storm.c
+++ b/xen/arch/arm/platforms/xgene-storm.c
@@ -22,6 +22,7 @@
#include <asm/platform.h>
#include <xen/stdbool.h>
#include <xen/vmap.h>
+#include <xen/device_tree.h>
#include <asm/io.h>
#include <asm/gic.h>
@@ -35,147 +36,41 @@ static u64 reset_addr, reset_size;
static u32 reset_mask;
static bool reset_vals_valid = false;
-static uint32_t xgene_storm_quirks(void)
-{
- return PLATFORM_QUIRK_GIC_64K_STRIDE|PLATFORM_QUIRK_GUEST_PIRQ_NEED_EOI;
-}
+#define XGENE_SEC_GICV2_DIST_ADDR 0x78010000
-static int map_one_mmio(struct domain *d, const char *what,
- unsigned long start, unsigned long end)
+static void __init xgene_check_pirq_eoi(void)
{
- int ret;
-
- printk("Additional MMIO %lx-%lx (%s)\n",
- start, end, what);
- ret = map_mmio_regions(d, start, end - start, start);
- if ( ret )
- printk("Failed to map %s @ %lx to dom%d\n",
- what, start, d->domain_id);
- return ret;
-}
-
-static int map_one_spi(struct domain *d, const char *what,
- unsigned int spi, unsigned int type)
-{
- unsigned int irq;
- int ret;
-
- irq = spi + 32; /* SPIs start at IRQ 32 */
-
- ret = irq_set_spi_type(irq, type);
- if ( ret )
+ const struct dt_device_node *node;
+ int res;
+ paddr_t dbase;
+ const struct dt_device_match xgene_dt_int_ctrl_match[] =
{
- printk("Failed to set the type for IRQ%u\n", irq);
- return ret;
- }
-
- printk("Additional IRQ %u (%s)\n", irq, what);
-
- ret = route_irq_to_guest(d, irq, what);
- if ( ret )
- printk("Failed to route %s to dom%d\n", what, d->domain_id);
-
- return ret;
-}
-
-/* Creates MMIO mappings base..end as well as 4 SPIs from the given base. */
-static int xgene_storm_pcie_specific_mapping(struct domain *d,
- const struct dt_device_node *node,
- paddr_t base, paddr_t end,
- int base_spi)
-{
- int ret;
-
- printk("Mapping additional regions for PCIe device %s\n",
- dt_node_full_name(node));
-
- /* Map the PCIe bus resources */
- ret = map_one_mmio(d, "PCI MEMORY", paddr_to_pfn(base), paddr_to_pfn(end));
- if ( ret )
- goto err;
-
- ret = map_one_spi(d, "PCI#INTA", base_spi+0, DT_IRQ_TYPE_LEVEL_HIGH);
- if ( ret )
- goto err;
-
- ret = map_one_spi(d, "PCI#INTB", base_spi+1, DT_IRQ_TYPE_LEVEL_HIGH);
- if ( ret )
- goto err;
-
- ret = map_one_spi(d, "PCI#INTC", base_spi+2, DT_IRQ_TYPE_LEVEL_HIGH);
- if ( ret )
- goto err;
-
- ret = map_one_spi(d, "PCI#INTD", base_spi+3, DT_IRQ_TYPE_LEVEL_HIGH);
- if ( ret )
- goto err;
-
- ret = 0;
-err:
- return ret;
+ DT_MATCH_COMPATIBLE("arm,cortex-a15-gic"),
+ { /*sentinel*/ },
+ };
+
+ node = dt_find_interrupt_controller(xgene_dt_int_ctrl_match);
+ if ( !node )
+ panic("%s: Can not find interrupt controller node", __func__);
+
+ res = dt_device_get_address(node, 0, &dbase, NULL);
+    if ( res || !dbase )
+ panic("%s: Cannot find a valid address for the distributor", __func__);
+
+ /*
+     * Old X-Gene Storm firmware and DTs put the secure mode addresses in
+     * the GICv2 node, in which case hardware EOI won't work.
+ * We check the GIC Distributor Base Address to deny Xen booting
+ * with older firmware.
+ */
+ if ( dbase == XGENE_SEC_GICV2_DIST_ADDR )
+ panic("OLD X-Gene Firmware is not supported by Xen.\n"
+ "Please upgrade your firmware to the latest version");
}
-/*
- * Xen does not currently support mapping MMIO regions and interrupt
- * for bus child devices (referenced via the "ranges" and
- * "interrupt-map" properties to domain 0). Instead for now map the
- * necessary resources manually.
- */
-static int xgene_storm_specific_mapping(struct domain *d)
+static uint32_t xgene_storm_quirks(void)
{
- struct dt_device_node *node = NULL;
- int ret;
-
- while ( (node = dt_find_compatible_node(node, "pci", "apm,xgene-pcie")) )
- {
- u64 addr;
-
- /* Identify the bus via it's control register address */
- ret = dt_device_get_address(node, 0, &addr, NULL);
- if ( ret < 0 )
- return ret;
-
- if ( !dt_device_is_available(node) )
- continue;
-
- switch ( addr )
- {
- case 0x1f2b0000: /* PCIe0 */
- ret = xgene_storm_pcie_specific_mapping(d,
- node,
- 0x0e000000000UL, 0x10000000000UL, 0xc2);
- break;
- case 0x1f2c0000: /* PCIe1 */
- ret = xgene_storm_pcie_specific_mapping(d,
- node,
- 0x0d000000000UL, 0x0e000000000UL, 0xc8);
- break;
- case 0x1f2d0000: /* PCIe2 */
- ret = xgene_storm_pcie_specific_mapping(d,
- node,
- 0x09000000000UL, 0x0a000000000UL, 0xce);
- break;
- case 0x1f500000: /* PCIe3 */
- ret = xgene_storm_pcie_specific_mapping(d,
- node,
- 0x0a000000000UL, 0x0c000000000UL, 0xd4);
- break;
- case 0x1f510000: /* PCIe4 */
- ret = xgene_storm_pcie_specific_mapping(d,
- node,
- 0x0c000000000UL, 0x0d000000000UL, 0xda);
- break;
-
- default:
- printk("Ignoring unknown PCI bus %s\n", dt_node_full_name(node));
- continue;
- }
-
- if ( ret < 0 )
- return ret;
- }
-
- return 0;
+ return PLATFORM_QUIRK_GIC_64K_STRIDE;
}
static void xgene_storm_reset(void)
@@ -212,6 +107,8 @@ static int xgene_storm_init(void)
reset_mask = XGENE_RESET_MASK;
reset_vals_valid = true;
+ xgene_check_pirq_eoi();
+
return 0;
}
@@ -226,11 +123,6 @@ PLATFORM_START(xgene_storm, "APM X-GENE STORM")
.init = xgene_storm_init,
.reset = xgene_storm_reset,
.quirks = xgene_storm_quirks,
- .specific_mapping = xgene_storm_specific_mapping,
-
- .dom0_evtchn_ppi = 24,
- .dom0_gnttab_start = 0x1f800000,
- .dom0_gnttab_size = 0x20000,
PLATFORM_END
/*
diff --git a/xen/arch/arm/psci.c b/xen/arch/arm/psci.c
index 4066309..7ad6a43 100644
--- a/xen/arch/arm/psci.c
+++ b/xen/arch/arm/psci.c
@@ -102,7 +102,7 @@ int __init psci_init_0_2(void)
psci = dt_find_compatible_node(NULL, NULL, "arm,psci-0.2");
if ( !psci )
- return -EOPNOTSUPP;
+ return -EOPNOTSUPP;
ret = psci_is_smc_method(psci);
if ( ret )
diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
index 1e488ee..48f734f 100644
--- a/xen/arch/arm/setup.c
+++ b/xen/arch/arm/setup.c
@@ -71,6 +71,7 @@ static void __init init_idle_domain(void)
static const char * __initdata processor_implementers[] = {
['A'] = "ARM Limited",
['B'] = "Broadcom Corporation",
+ ['C'] = "Cavium Inc.",
['D'] = "Digital Equipment Corp",
['M'] = "Motorola, Freescale Semiconductor Inc.",
['P'] = "Applied Micro",
@@ -249,7 +250,7 @@ void __init discard_initial_modules(void)
struct bootmodules *mi = &bootinfo.modules;
int i;
- for ( i = 0; i <= mi->nr_mods; i++ )
+ for ( i = 0; i < mi->nr_mods; i++ )
{
paddr_t s = mi->module[i].start;
paddr_t e = s + PAGE_ALIGN(mi->module[i].size);
@@ -349,7 +350,7 @@ static paddr_t __init next_module(paddr_t s, paddr_t *end)
paddr_t lowest = ~(paddr_t)0;
int i;
- for ( i = 0; i <= mi->nr_mods; i++ )
+ for ( i = 0; i < mi->nr_mods; i++ )
{
paddr_t mod_s = mi->module[i].start;
paddr_t mod_e = mod_s + mi->module[i].size;
@@ -664,7 +665,6 @@ static void __init setup_mm(unsigned long dtb_paddr, size_t dtb_size)
xenheap_virt_end = XENHEAP_VIRT_START + ram_end - ram_start;
xenheap_mfn_start = ram_start >> PAGE_SHIFT;
xenheap_mfn_end = ram_end >> PAGE_SHIFT;
- xenheap_max_mfn(xenheap_mfn_end);
/*
* Need enough mapped pages for copying the DTB.
@@ -709,6 +709,7 @@ void __init start_xen(unsigned long boot_phys_offset,
const char *cmdline;
struct bootmodule *xen_bootmodule;
struct domain *dom0;
+ struct xen_arch_domainconfig config;
setup_cache();
@@ -751,10 +752,15 @@ void __init start_xen(unsigned long boot_phys_offset,
vm_init();
dt_unflatten_host_device_tree();
- dt_irq_xlate = gic_irq_xlate;
init_IRQ();
+ platform_init();
+
+ preinit_xen_time();
+
+ gic_preinit();
+
dt_uart_init();
console_init_preirq();
console_init_ring();
@@ -763,8 +769,6 @@ void __init start_xen(unsigned long boot_phys_offset,
processor_id();
- platform_init();
-
smp_init_cpus();
cpus = smp_get_max_cpus();
@@ -795,8 +799,6 @@ void __init start_xen(unsigned long boot_phys_offset,
local_irq_enable();
local_abort_enable();
- iommu_setup();
-
smp_prepare_cpus(cpus);
initialize_keytable();
@@ -820,10 +822,16 @@ void __init start_xen(unsigned long boot_phys_offset,
setup_virt_paging();
+ iommu_setup();
+
do_initcalls();
/* Create initial domain 0. */
- dom0 = domain_create(0, 0, 0);
+ /* The vGIC for DOM0 is exactly emulating the hardware GIC */
+ config.gic_version = XEN_DOMCTL_CONFIG_GIC_NATIVE;
+ config.nr_spis = gic_number_lines() - 32;
+
+ dom0 = domain_create(0, 0, 0, &config);
if ( IS_ERR(dom0) || (alloc_dom0_vcpu0(dom0) == NULL) )
panic("Error creating domain 0");
diff --git a/xen/arch/arm/shutdown.c b/xen/arch/arm/shutdown.c
index 4988b03..e53ec97 100644
--- a/xen/arch/arm/shutdown.c
+++ b/xen/arch/arm/shutdown.c
@@ -38,6 +38,10 @@ void machine_restart(unsigned int delay_millisecs)
{
int timeout = 10;
+ watchdog_disable();
+ console_start_sync();
+ spin_debug_disable();
+
local_irq_enable();
smp_call_function(halt_this_cpu, NULL, 0);
local_irq_disable();
diff --git a/xen/arch/arm/smpboot.c b/xen/arch/arm/smpboot.c
index 14054ae..a96cda2 100644
--- a/xen/arch/arm/smpboot.c
+++ b/xen/arch/arm/smpboot.c
@@ -357,6 +357,7 @@ int __init cpu_up_send_sgi(int cpu)
int __cpu_up(unsigned int cpu)
{
int rc;
+ s_time_t deadline;
printk("Bringing up CPU%d\n", cpu);
@@ -369,7 +370,7 @@ int __cpu_up(unsigned int cpu)
/* Tell the remote CPU which stack to boot on. */
init_data.stack = idle_vcpu[cpu]->arch.stack;
- /* Tell the remote CPU what is it's logical CPU ID */
+ /* Tell the remote CPU what its logical CPU ID is. */
init_data.cpuid = cpu;
/* Open the gate for this CPU */
@@ -386,12 +387,34 @@ int __cpu_up(unsigned int cpu)
return rc;
}
- while ( !cpu_online(cpu) )
+ deadline = NOW() + MILLISECS(1000);
+
+ while ( !cpu_online(cpu) && NOW() < deadline )
{
cpu_relax();
process_pending_softirqs();
}
+ /*
+ * Nuke start of day info before checking one last time if the CPU
+ * actually came online. If it is not online it may still be
+ * trying to come up and may show up later unexpectedly.
+ *
+ * This doesn't completely avoid the possibility of the supposedly
+     * failed CPU trying to progress with another CPU's stack settings
+ * etc, but better than nothing, hopefully.
+ */
+ init_data.stack = NULL;
+ init_data.cpuid = ~0;
+ smp_up_cpu = MPIDR_INVALID;
+ clean_dcache(smp_up_cpu);
+
+ if ( !cpu_online(cpu) )
+ {
+ printk("CPU%d never came online\n", cpu);
+ return -EIO;
+ }
+
return 0;
}
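
The bring-up now bounds the wait: spin until cpu_online() or a one-second
deadline, scrub the start-of-day data, and only then do the final online
check so a straggling CPU cannot adopt stale boot state. A standalone sketch
of polling with a monotonic deadline (assumes POSIX clock_gettime):

#include <stdbool.h>
#include <time.h>

static volatile bool ready;   /* set asynchronously, like cpu_online() */

static long long now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* Returns true if 'ready' became set within timeout_ms. */
bool wait_until_ready(unsigned int timeout_ms)
{
    const long long deadline = now_ns() + (long long)timeout_ms * 1000000LL;

    while ( !ready && now_ns() < deadline )
        ;   /* cpu_relax()/pause would go here in kernel code */

    return ready;   /* check one last time after the loop, as above */
}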
diff --git a/xen/arch/arm/time.c b/xen/arch/arm/time.c
index 0add494..5ded30c 100644
--- a/xen/arch/arm/time.c
+++ b/xen/arch/arm/time.c
@@ -42,6 +42,8 @@ uint64_t __read_mostly boot_count;
* register-mapped time source in the SoC. */
unsigned long __read_mostly cpu_khz; /* CPU clock frequency in kHz. */
+uint32_t __read_mostly timer_dt_clock_frequency;
+
static unsigned int timer_irq[MAX_TIMER_PPI];
unsigned int timer_get_irq(enum timer_ppi ppi)
@@ -61,56 +63,66 @@ unsigned int timer_get_irq(enum timer_ppi ppi)
return muldiv64(ns, 1000 * cpu_khz, SECONDS(1));
}
-/* Set up the timer on the boot CPU */
-int __init init_xen_time(void)
+static __initdata struct dt_device_node *timer;
+
+/* Set up the timer on the boot CPU (early init function) */
+void __init preinit_xen_time(void)
{
static const struct dt_device_match timer_ids[] __initconst =
{
DT_MATCH_TIMER,
{ /* sentinel */ },
};
- struct dt_device_node *dev;
int res;
- unsigned int i;
u32 rate;
- dev = dt_find_matching_node(NULL, timer_ids);
- if ( !dev )
+ timer = dt_find_matching_node(NULL, timer_ids);
+ if ( !timer )
panic("Unable to find a compatible timer in the device tree");
- dt_device_set_used_by(dev, DOMID_XEN);
+ dt_device_set_used_by(timer, DOMID_XEN);
+
+ res = platform_init_time();
+ if ( res )
+ panic("Timer: Cannot initialize platform timer");
+
+ res = dt_property_read_u32(timer, "clock-frequency", &rate);
+ if ( res )
+ {
+ cpu_khz = rate / 1000;
+ timer_dt_clock_frequency = rate;
+ }
+ else
+ cpu_khz = READ_SYSREG32(CNTFRQ_EL0) / 1000;
+
+ boot_count = READ_SYSREG64(CNTPCT_EL0);
+}
+
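preinit_xen_time() runs before the console and IRQ setup are available, so it
only locates the DT timer node (cached in the __initdata 'timer' pointer) and
derives cpu_khz; the IRQ plumbing stays in init_xen_time() later in boot. A
minimal sketch of such a two-phase init split (all names hypothetical):

#include <stddef.h>

struct device { int irq; unsigned int rate; };

static struct device *probed_dev;   /* cached between the two phases */

extern struct device *probe_device(void);     /* early, no console needed */
extern int request_device_irq(struct device *d);

/* Phase 1: runs very early; only discovers and caches the device. */
void preinit_device(void)
{
    probed_dev = probe_device();
}

/* Phase 2: runs once the rest of the system (IRQs, console) is up. */
int init_device(void)
{
    if ( !probed_dev )
        return -1;
    return request_device_irq(probed_dev);
}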
+/* Set up the timer on the boot CPU (late init function) */
+int __init init_xen_time(void)
+{
+ int res;
+ unsigned int i;
/* Retrieve all IRQs for the timer */
for ( i = TIMER_PHYS_SECURE_PPI; i < MAX_TIMER_PPI; i++ )
{
- res = platform_get_irq(dev, i);
+ res = platform_get_irq(timer, i);
if ( res < 0 )
panic("Timer: Unable to retrieve IRQ %u from the device tree", i);
timer_irq[i] = res;
}
- printk("Generic Timer IRQ: phys=%u hyp=%u virt=%u\n",
- timer_irq[TIMER_PHYS_NONSECURE_PPI],
- timer_irq[TIMER_HYP_PPI],
- timer_irq[TIMER_VIRT_PPI]);
-
- res = platform_init_time();
- if ( res )
- panic("Timer: Cannot initialize platform timer");
-
/* Check that this CPU supports the Generic Timer interface */
if ( !cpu_has_gentimer )
panic("CPU does not support the Generic Timer v1 interface");
- res = dt_property_read_u32(dev, "clock-frequency", &rate);
- if ( res )
- cpu_khz = rate / 1000;
- else
- cpu_khz = READ_SYSREG32(CNTFRQ_EL0) / 1000;
-
- boot_count = READ_SYSREG64(CNTPCT_EL0);
- printk("Using generic timer at %lu KHz\n", cpu_khz);
+ printk("Generic Timer IRQ: phys=%u hyp=%u virt=%u Freq: %lu KHz\n",
+ timer_irq[TIMER_PHYS_NONSECURE_PPI],
+ timer_irq[TIMER_HYP_PPI],
+ timer_irq[TIMER_VIRT_PPI],
+ cpu_khz);
return 0;
}
@@ -151,6 +163,7 @@ static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
if ( irq == (timer_irq[TIMER_HYP_PPI]) &&
READ_SYSREG32(CNTHP_CTL_EL2) & CNTx_CTL_PENDING )
{
+ perfc_incr(hyp_timer_irqs);
/* Signal the generic timer code to do its work */
raise_softirq(TIMER_SOFTIRQ);
/* Disable the timer to avoid more interrupts */
@@ -160,6 +173,7 @@ static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
if ( irq == (timer_irq[TIMER_PHYS_NONSECURE_PPI]) &&
READ_SYSREG32(CNTP_CTL_EL0) & CNTx_CTL_PENDING )
{
+ perfc_incr(phys_timer_irqs);
/* Signal the generic timer code to do its work */
raise_softirq(TIMER_SOFTIRQ);
/* Disable the timer to avoid more interrupts */
@@ -182,18 +196,46 @@ static void vtimer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
if ( unlikely(is_idle_vcpu(current)) )
return;
+ perfc_incr(virt_timer_irqs);
+
current->arch.virt_timer.ctl = READ_SYSREG32(CNTV_CTL_EL0);
WRITE_SYSREG32(current->arch.virt_timer.ctl | CNTx_CTL_MASK, CNTV_CTL_EL0);
vgic_vcpu_inject_irq(current, current->arch.virt_timer.irq);
}
+/*
+ * Arch timer interrupt really ought to be level triggered, since the
+ * design of the timer/comparator mechanism is based around that
+ * concept.
+ *
+ * However some firmware (incorrectly) describes the interrupts as
+ * edge triggered and, worse, some hardware allows us to program the
+ * interrupt controller as edge triggered.
+ *
+ * Check each interrupt and warn if we find ourselves in this situation.
+ */
+static void check_timer_irq_cfg(unsigned int irq, const char *which)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ /*
+ * The interrupt controller driver will update desc->arch.type with
+ * the actual type which ended up configured in the hardware.
+ */
+ if ( desc->arch.type & DT_IRQ_TYPE_LEVEL_MASK )
+ return;
+
+ printk(XENLOG_WARNING
+ "WARNING: %s-timer IRQ%u is not level triggered.\n", which, irq);
+}
+
/* Set up the timer interrupt on this CPU */
void __cpuinit init_timer_interrupt(void)
{
/* Sensible defaults */
WRITE_SYSREG64(0, CNTVOFF_EL2); /* No VM-specific offset */
/* Do not let the VMs program the physical timer, only read the physical counter */
- WRITE_SYSREG32(CNTHCTL_PA, CNTHCTL_EL2);
+ WRITE_SYSREG32(CNTHCTL_EL2_EL1PCTEN, CNTHCTL_EL2);
WRITE_SYSREG32(0, CNTP_CTL_EL0); /* Physical timer disabled */
WRITE_SYSREG32(0, CNTHP_CTL_EL2); /* Hypervisor's timer disabled */
isb();
@@ -204,6 +246,10 @@ void __cpuinit init_timer_interrupt(void)
"virtimer", NULL);
request_irq(timer_irq[TIMER_PHYS_NONSECURE_PPI], 0, timer_interrupt,
"phytimer", NULL);
+
+ check_timer_irq_cfg(timer_irq[TIMER_HYP_PPI], "hypervisor");
+ check_timer_irq_cfg(timer_irq[TIMER_VIRT_PPI], "virtual");
+ check_timer_irq_cfg(timer_irq[TIMER_PHYS_NONSECURE_PPI], "NS-physical");
}
/* Wait a set number of microseconds */
@@ -228,7 +274,7 @@ void update_vcpu_system_time(struct vcpu *v)
/* XXX update shared_info->wc_* */
}
-void domain_set_time_offset(struct domain *d, int32_t time_offset_seconds)
+void domain_set_time_offset(struct domain *d, int64_t time_offset_seconds)
{
d->time_offset_seconds = time_offset_seconds;
/* XXX update guest visible wallclock time */
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index 4063a80..9d2bd6a 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -30,6 +30,7 @@
#include <xen/hypercall.h>
#include <xen/softirq.h>
#include <xen/domain_page.h>
+#include <xen/perfc.h>
#include <public/sched.h>
#include <public/xen.h>
#include <asm/debugger.h>
@@ -39,6 +40,7 @@
#include <asm/psci.h>
#include <asm/mmio.h>
#include <asm/cpufeature.h>
+#include <asm/flushtlb.h>
#include "decode.h"
#include "vtimer.h"
@@ -62,6 +64,30 @@ static inline void check_stack_alignment_constraints(void) {
#endif
}
+/*
+ * GUEST_BUG_ON is intended for checking that the guest state has not been
+ * corrupted in hardware and/or that the hardware behaves as we
+ * believe it should (i.e. that certain traps can only occur when the
+ * guest is in a particular mode).
+ *
+ * The intention is to limit the damage such h/w bugs (or spec
+ * misunderstandings) can do by turning them into Denial of Service
+ * attacks instead of e.g. information leaks or privilege escalations.
+ *
+ * GUEST_BUG_ON *MUST* *NOT* be used to check for guest controllable state!
+ *
+ * Compared with regular BUG_ON it dumps the guest vcpu state instead
+ * of Xen's state.
+ */
+#define guest_bug_on_failed(p) \
+do { \
+ show_execution_state(guest_cpu_user_regs()); \
+ panic("Guest Bug: %pv: '%s', line %d, file %s\n", \
+ current, p, __LINE__, __FILE__); \
+} while (0)
+#define GUEST_BUG_ON(p) \
+ do { if ( unlikely(p) ) guest_bug_on_failed(#p); } while (0)
+
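GUEST_BUG_ON() is the classic assertion-macro construction: do { } while (0)
so the macro behaves as one statement, unlikely() to keep the failure path
off the hot path, and #p stringification so the report quotes the failed
predicate. A compilable sketch of the same construction (names hypothetical):

#include <stdio.h>
#include <stdlib.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

static void check_failed(const char *pred, int line, const char *file)
{
    fprintf(stderr, "Check failed: '%s', line %d, file %s\n",
            pred, line, file);
    abort();
}

/* do/while(0) keeps the macro usable as a single statement, e.g. inside
 * an un-braced if/else; #p captures the predicate's source text. */
#define CHECK_ON(p)                                             \
    do {                                                        \
        if ( unlikely(p) )                                      \
            check_failed(#p, __LINE__, __FILE__);               \
    } while (0)

int main(void)
{
    CHECK_ON(1 + 1 != 2);  /* predicate false: passes silently */
    CHECK_ON(1 + 1 == 2);  /* predicate true: triggers the failure path */
    return 0;
}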
#ifdef CONFIG_ARM_32
static int debug_stack_lines = 20;
#define stack_words_per_line 8
@@ -85,8 +111,13 @@ void __cpuinit init_traps(void)
/* Trap CP15 c15 used for implementation defined registers */
WRITE_SYSREG(HSTR_T(15), HSTR_EL2);
- /* Trap all coprocessor registers (0-13) except cp10 and cp11 for VFP
- * /!\ All processors except cp10 and cp11 cannot be used in Xen
+ /* Trap all coprocessor registers (0-13) except cp10 and
+ * cp11 for VFP.
+ *
+ * /!\ All coprocessors except cp10 and cp11 cannot be used in Xen.
+ *
+ * On ARM64 the TCPx bits which we set here (0..9,12,13) are all
+ * RES1, i.e. they would trap whether we did this write or not.
*/
WRITE_SYSREG((HCPTR_CP_MASK & ~(HCPTR_CP(10) | HCPTR_CP(11))) | HCPTR_TTA,
CPTR_EL2);
@@ -422,7 +453,7 @@ static vaddr_t exception_handler64(struct cpu_user_regs *regs, vaddr_t offset)
static void inject_undef64_exception(struct cpu_user_regs *regs, int instr_len)
{
vaddr_t handler;
- union hsr esr = {
+ const union hsr esr = {
.iss = 0,
.len = instr_len,
.ec = HSR_EC_UNKNOWN,
@@ -493,13 +524,13 @@ static void inject_iabt64_exception(struct cpu_user_regs *regs,
#endif
static void inject_undef_exception(struct cpu_user_regs *regs,
- int instr_len)
+ const union hsr hsr)
{
if ( is_32bit_domain(current->domain) )
inject_undef32_exception(regs);
#ifdef CONFIG_ARM_64
else
- inject_undef64_exception(regs, instr_len);
+ inject_undef64_exception(regs, hsr.len);
#endif
}
@@ -1031,7 +1062,7 @@ void vcpu_show_execution_state(struct vcpu *v)
vcpu_pause(v); /* acceptably dangerous */
vcpu_show_registers(v);
- if ( !usr_mode(&v->arch.cpu_info->guest_cpu_user_regs) )
+ if ( !psr_mode_is_user(&v->arch.cpu_info->guest_cpu_user_regs) )
show_guest_stack(v, &v->arch.cpu_info->guest_cpu_user_regs);
vcpu_unpause(v);
@@ -1116,7 +1147,7 @@ int do_bug_frame(struct cpu_user_regs *regs, vaddr_t pc)
}
#ifdef CONFIG_ARM_64
-static void do_trap_brk(struct cpu_user_regs *regs, union hsr hsr)
+static void do_trap_brk(struct cpu_user_regs *regs, const union hsr hsr)
{
/* HCR_EL2.TGE and MDCR_EL2.TDE are not set so we never receive
* software breakpoint exception for EL1 and EL0 here.
@@ -1140,6 +1171,22 @@ die:
}
#endif
+static register_t do_deprecated_hypercall(void)
+{
+ struct cpu_user_regs *regs = guest_cpu_user_regs();
+ const register_t op =
+#ifdef CONFIG_ARM_64
+ !is_32bit_domain(current->domain) ?
+ regs->x16
+ :
+#endif
+ regs->r12;
+
+ gdprintk(XENLOG_DEBUG, "%pv: deprecated hypercall %lu\n",
+ current, (unsigned long)op);
+ return -ENOSYS;
+}
+
typedef register_t (*arm_hypercall_fn_t)(
register_t, register_t, register_t, register_t, register_t);
@@ -1159,15 +1206,29 @@ typedef struct {
.fn = (arm_hypercall_fn_t) &do_arm_ ## _name, \
.nr_args = _nr_args, \
}
+/*
+ * Only use this for hypercalls which were deprecated (i.e. replaced
+ * by something else) before Xen on ARM was created, i.e. *not* for
+ * hypercalls which are simply not yet used on ARM.
+ */
+#define HYPERCALL_DEPRECATED(_name, _nr_args) \
+ [ __HYPERVISOR_##_name ] = { \
+ .fn = (arm_hypercall_fn_t) &do_deprecated_hypercall, \
+ .nr_args = _nr_args, \
+ }
+
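HYPERCALL_DEPRECATED() wires a shared stub into the dispatch table so retired
hypercall numbers fail cleanly with -ENOSYS plus a debug message, rather than
being indistinguishable from hypercalls that were never implemented. A
standalone sketch of a function-pointer table with such a stub (names
hypothetical):

#include <stdio.h>
#include <errno.h>

typedef long (*call_fn_t)(long arg);

static long do_real_op(long arg) { return arg + 1; }

static long do_deprecated(long arg)
{
    (void)arg;
    fprintf(stderr, "deprecated call\n"); /* log, then fail politely */
    return -ENOSYS;
}

enum { OP_REAL, OP_OLD, NR_OPS };

static const call_fn_t call_table[NR_OPS] = {
    [OP_REAL] = do_real_op,
    [OP_OLD]  = do_deprecated,   /* retired: stubbed, not removed */
};

long dispatch(unsigned int nr, long arg)
{
    if ( nr >= NR_OPS || !call_table[nr] )
        return -ENOSYS;          /* unknown or never-wired entry */
    return call_table[nr](arg);
}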
static arm_hypercall_t arm_hypercall_table[] = {
HYPERCALL(memory_op, 2),
HYPERCALL(domctl, 1),
HYPERCALL(sched_op, 2),
+ HYPERCALL_DEPRECATED(sched_op_compat, 2),
HYPERCALL(console_io, 3),
HYPERCALL(xen_version, 2),
HYPERCALL(xsm_op, 1),
HYPERCALL(event_channel_op, 2),
+ HYPERCALL_DEPRECATED(event_channel_op_compat, 1),
HYPERCALL(physdev_op, 2),
+ HYPERCALL_DEPRECATED(physdev_op_compat, 1),
HYPERCALL(sysctl, 2),
HYPERCALL(hvm_op, 2),
HYPERCALL(grant_table_op, 3),
@@ -1233,6 +1294,7 @@ static void do_trap_psci(struct cpu_user_regs *regs)
case PSCI_cpu_off:
{
uint32_t pstate = PSCI_ARG32(regs,1);
+ perfc_incr(vpsci_cpu_off);
PSCI_RESULT_REG(regs) = do_psci_cpu_off(pstate);
}
break;
@@ -1240,33 +1302,41 @@ static void do_trap_psci(struct cpu_user_regs *regs)
{
uint32_t vcpuid = PSCI_ARG32(regs,1);
register_t epoint = PSCI_ARG(regs,2);
+ perfc_incr(vpsci_cpu_on);
PSCI_RESULT_REG(regs) = do_psci_cpu_on(vcpuid, epoint);
}
break;
case PSCI_0_2_FN_PSCI_VERSION:
+ perfc_incr(vpsci_version);
PSCI_RESULT_REG(regs) = do_psci_0_2_version();
break;
case PSCI_0_2_FN_CPU_OFF:
+ perfc_incr(vpsci_cpu_off);
PSCI_RESULT_REG(regs) = do_psci_0_2_cpu_off();
break;
case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
+ perfc_incr(vpsci_migrate_info_type);
PSCI_RESULT_REG(regs) = do_psci_0_2_migrate_info_type();
break;
case PSCI_0_2_FN_MIGRATE_INFO_UP_CPU:
case PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU:
+ perfc_incr(vpsci_migrate_info_up_cpu);
if ( psci_mode_check(current->domain, fid) )
PSCI_RESULT_REG(regs) = do_psci_0_2_migrate_info_up_cpu();
break;
case PSCI_0_2_FN_SYSTEM_OFF:
+ perfc_incr(vpsci_system_off);
do_psci_0_2_system_off();
PSCI_RESULT_REG(regs) = PSCI_INTERNAL_FAILURE;
break;
case PSCI_0_2_FN_SYSTEM_RESET:
+ perfc_incr(vpsci_system_reset);
do_psci_0_2_system_reset();
PSCI_RESULT_REG(regs) = PSCI_INTERNAL_FAILURE;
break;
case PSCI_0_2_FN_CPU_ON:
case PSCI_0_2_FN64_CPU_ON:
+ perfc_incr(vpsci_cpu_on);
if ( psci_mode_check(current->domain, fid) )
{
register_t vcpuid = PSCI_ARG(regs,1);
@@ -1278,6 +1348,7 @@ static void do_trap_psci(struct cpu_user_regs *regs)
break;
case PSCI_0_2_FN_CPU_SUSPEND:
case PSCI_0_2_FN64_CPU_SUSPEND:
+ perfc_incr(vpsci_cpu_suspend);
if ( psci_mode_check(current->domain, fid) )
{
uint32_t pstate = PSCI_ARG32(regs,1);
@@ -1289,6 +1360,7 @@ static void do_trap_psci(struct cpu_user_regs *regs)
break;
case PSCI_0_2_FN_AFFINITY_INFO:
case PSCI_0_2_FN64_AFFINITY_INFO:
+ perfc_incr(vpsci_cpu_affinity_info);
if ( psci_mode_check(current->domain, fid) )
{
register_t taff = PSCI_ARG(regs,1);
@@ -1299,6 +1371,7 @@ static void do_trap_psci(struct cpu_user_regs *regs)
break;
case PSCI_0_2_FN_MIGRATE:
case PSCI_0_2_FN64_MIGRATE:
+ perfc_incr(vpsci_cpu_migrate);
if ( psci_mode_check(current->domain, fid) )
{
uint32_t tcpu = PSCI_ARG32(regs,1);
@@ -1337,15 +1410,19 @@ static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
register_t orig_pc = regs->pc;
#endif
+    BUILD_BUG_ON(NR_hypercalls < ARRAY_SIZE(arm_hypercall_table));
+
if ( iss != XEN_HYPERCALL_TAG )
domain_crash_synchronous();
if ( *nr >= ARRAY_SIZE(arm_hypercall_table) )
{
+ perfc_incr(invalid_hypercalls);
HYPERCALL_RESULT_REG(regs) = -ENOSYS;
return;
}
+ perfc_incra(hypercalls, *nr);
call = arm_hypercall_table[*nr].fn;
if ( call == NULL )
{
@@ -1447,7 +1524,8 @@ static const unsigned short cc_map[16] = {
0 /* NV */
};
-static int check_conditional_instr(struct cpu_user_regs *regs, union hsr hsr)
+static int check_conditional_instr(struct cpu_user_regs *regs,
+ const union hsr hsr)
{
unsigned long cpsr, cpsr_cond;
int cond;
@@ -1470,7 +1548,7 @@ static int check_conditional_instr(struct cpu_user_regs *regs, union hsr hsr)
{
unsigned long it;
- BUG_ON( !is_32bit_domain(current->domain) || !(cpsr&PSR_THUMB) );
+ BUG_ON( !psr_mode_is_32bit(regs->cpsr) || !(cpsr&PSR_THUMB) );
it = ( (cpsr >> (10-2)) & 0xfc) | ((cpsr >> 25) & 0x3 );
@@ -1479,26 +1557,28 @@ static int check_conditional_instr(struct cpu_user_regs *regs, union hsr hsr)
return 1;
/* The cond for this instruction works out as the top 4 bits. */
- cond = ( it >> 4 );
+ cond = ( it >> 4 );
}
cpsr_cond = cpsr >> 28;
if ( !((cc_map[cond] >> cpsr_cond) & 1) )
+ {
+ perfc_incr(trap_uncond);
return 0;
-
+ }
return 1;
}
-static void advance_pc(struct cpu_user_regs *regs, union hsr hsr)
+static void advance_pc(struct cpu_user_regs *regs, const union hsr hsr)
{
unsigned long itbits, cond, cpsr = regs->cpsr;
/* PSR_IT_MASK bits can only be set for 32-bit processors in Thumb mode. */
- BUG_ON( (!is_32bit_domain(current->domain)||!(cpsr&PSR_THUMB))
+ BUG_ON( (!psr_mode_is_32bit(cpsr)||!(cpsr&PSR_THUMB))
&& (cpsr&PSR_IT_MASK) );
- if ( is_32bit_domain(current->domain) && (cpsr&PSR_IT_MASK) )
+ if ( cpsr&PSR_IT_MASK )
{
/* The ITSTATE[7:0] block is contained in CPSR[15:10],CPSR[26:25]
*
@@ -1530,11 +1610,70 @@ static void advance_pc(struct cpu_user_regs *regs, union hsr hsr)
regs->pc += hsr.len ? 4 : 2;
}
+/* Read as zero and write ignore */
+static void handle_raz_wi(struct cpu_user_regs *regs,
+ register_t *reg,
+ bool_t read,
+ const union hsr hsr,
+ int min_el)
+{
+ ASSERT((min_el == 0) || (min_el == 1));
+
+ if ( min_el > 0 && psr_mode_is_user(regs) )
+ return inject_undef_exception(regs, hsr);
+
+ if ( read )
+ *reg = 0;
+ /* else: write ignored */
+
+ advance_pc(regs, hsr);
+}
+
+/* Write only as write ignore */
+static void handle_wo_wi(struct cpu_user_regs *regs,
+ register_t *reg,
+ bool_t read,
+ const union hsr hsr,
+ int min_el)
+{
+ ASSERT((min_el == 0) || (min_el == 1));
+
+ if ( min_el > 0 && psr_mode_is_user(regs) )
+ return inject_undef_exception(regs, hsr);
+
+ if ( read )
+ return inject_undef_exception(regs, hsr);
+ /* else: ignore */
+
+ advance_pc(regs, hsr);
+}
+
+/* Read only as read as zero */
+static void handle_ro_raz(struct cpu_user_regs *regs,
+ register_t *reg,
+ bool_t read,
+ const union hsr hsr,
+ int min_el)
+{
+ ASSERT((min_el == 0) || (min_el == 1));
+
+ if ( min_el > 0 && psr_mode_is_user(regs) )
+ return inject_undef_exception(regs, hsr);
+
+ if ( !read )
+ return inject_undef_exception(regs, hsr);
+ /* else: raz */
+
+ *reg = 0;
+
+ advance_pc(regs, hsr);
+}
+
static void do_cp15_32(struct cpu_user_regs *regs,
- union hsr hsr)
+ const union hsr hsr)
{
- struct hsr_cp32 cp32 = hsr.cp32;
- uint32_t *r = (uint32_t*)select_user_reg(regs, cp32.reg);
+ const struct hsr_cp32 cp32 = hsr.cp32;
+ register_t *r = select_user_reg(regs, cp32.reg);
struct vcpu *v = current;
if ( !check_conditional_instr(regs, hsr) )
@@ -1545,51 +1684,49 @@ static void do_cp15_32(struct cpu_user_regs *regs,
switch ( hsr.bits & HSR_CP32_REGS_MASK )
{
- case HSR_CPREG32(CLIDR):
- if ( !cp32.read )
- {
- dprintk(XENLOG_ERR,
- "attempt to write to read-only register CLIDR\n");
- domain_crash_synchronous();
- }
- *r = READ_SYSREG32(CLIDR_EL1);
- break;
- case HSR_CPREG32(CCSIDR):
- if ( !cp32.read )
- {
- dprintk(XENLOG_ERR,
- "attempt to write to read-only register CCSIDR\n");
- domain_crash_synchronous();
- }
- *r = READ_SYSREG32(CCSIDR_EL1);
- break;
- case HSR_CPREG32(DCCISW):
- if ( cp32.read )
- {
- dprintk(XENLOG_ERR,
- "attempt to read from write-only register DCCISW\n");
- domain_crash_synchronous();
- }
-#ifdef CONFIG_ARM_32
- WRITE_CP32(*r, DCCISW);
-#else
- asm volatile("dc cisw, %0;" : : "r" (*r) : "memory");
-#endif
- break;
+ /*
+ * !CNTHCTL_EL2.EL1PCEN / !CNTHCTL.PL1PCEN
+ *
+ * ARMv7 (DDI 0406C.b): B4.1.22
+ * ARMv8 (DDI 0487A.d): D1-1510 Table D1-60
+ */
case HSR_CPREG32(CNTP_CTL):
case HSR_CPREG32(CNTP_TVAL):
if ( !vtimer_emulate(regs, hsr) )
- {
- dprintk(XENLOG_ERR,
- "failed emulation of 32-bit vtimer CP register access\n");
- domain_crash_synchronous();
- }
+ return inject_undef_exception(regs, hsr);
break;
+
+ /*
+ * HCR_EL2.TACR / HCR.TAC
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.6
+ * ARMv8 (DDI 0487A.d): G6.2.1
+ */
case HSR_CPREG32(ACTLR):
+ if ( psr_mode_is_user(regs) )
+ return inject_undef_exception(regs, hsr);
if ( cp32.read )
*r = v->arch.actlr;
break;
+ /*
+ * MDCR_EL2.TPM
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.17
+ * ARMv8 (DDI 0487A.d): D1-1511 Table D1-61
+ *
+ * Unhandled:
+ * PMEVCNTR<n>
+ * PMEVTYPER<n>
+ * PMCCFILTR
+ *
+ * MDCR_EL2.TPMCR
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.17
+ * ARMv8 (DDI 0487A.d): D1-1511 Table D1-62
+ *
+ * NB: Both MDCR_EL2.TPM and MDCR_EL2.TPMCR cause trapping of PMCR.
+ */
/* We could trap ID_DFR0 and tell the guest we don't support
* performance monitoring, but Linux doesn't check the ID_DFR0.
* Therefore it will read PMCR.
@@ -1598,6 +1735,16 @@ static void do_cp15_32(struct cpu_user_regs *regs,
 * always support PMCCNTR (the cycle counter): we just RAZ/WI for all
 * PM registers, which at least doesn't crash the kernel
*/
+ case HSR_CPREG32(PMUSERENR):
+ /* RO at EL0. RAZ/WI at EL1 */
+ if ( psr_mode_is_user(regs) )
+ return handle_ro_raz(regs, r, cp32.read, hsr, 0);
+ else
+ return handle_raz_wi(regs, r, cp32.read, hsr, 1);
+ case HSR_CPREG32(PMINTENSET):
+ case HSR_CPREG32(PMINTENCLR):
+ /* EL1 only, however MDCR_EL2.TPM==1 means EL0 may trap here also. */
+ return handle_raz_wi(regs, r, cp32.read, hsr, 1);
case HSR_CPREG32(PMCR):
case HSR_CPREG32(PMCNTENSET):
case HSR_CPREG32(PMCNTENCLR):
@@ -1607,33 +1754,59 @@ static void do_cp15_32(struct cpu_user_regs *regs,
case HSR_CPREG32(PMCEID0):
case HSR_CPREG32(PMCEID1):
case HSR_CPREG32(PMCCNTR):
+ case HSR_CPREG32(PMXEVTYPER):
case HSR_CPREG32(PMXEVCNTR):
- case HSR_CPREG32(PMXEVCNR):
- case HSR_CPREG32(PMUSERENR):
- case HSR_CPREG32(PMINTENSET):
- case HSR_CPREG32(PMINTENCLR):
case HSR_CPREG32(PMOVSSET):
- if ( cp32.read )
- *r = 0;
- break;
+ /*
+ * Accessible at EL0 only if PMUSERENR_EL0.EN is set. We
+ * emulate that register as 0 above.
+ */
+ return handle_raz_wi(regs, r, cp32.read, hsr, 1);
+ /*
+ * HCR_EL2.TIDCP
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.3
+ * ARMv8 (DDI 0487A.d): D1-1501 Table D1-43
+ *
+ * - CRn==c9, opc1=={0-7}, CRm=={c0-c2, c5-c8}, opc2=={0-7}
+ * (Cache and TCM lockdown registers)
+ * - CRn==c10, opc1=={0-7}, CRm=={c0, c1, c4, c8}, opc2=={0-7}
+ * (VMSA CP15 c10 registers)
+ * - CRn==c11, opc1=={0-7}, CRm=={c0-c8, c15}, opc2=={0-7}
+ * (VMSA CP15 c11 registers)
+ *
+ * CPTR_EL2.T{0..9,12..13}
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.12
+ * ARMv8 (DDI 0487A.d): N/A
+ *
+ * - All accesses to coprocessors 0..9 and 12..13
+ *
+ * HSTR_EL2.T15
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.14
+ * ARMv8 (DDI 0487A.d): D1-1507 Table D1-55
+ *
+ * - All accesses to cp15, c15 registers.
+ *
+ * And all other unknown registers.
+ */
default:
-#ifndef NDEBUG
gdprintk(XENLOG_ERR,
"%s p15, %d, r%d, cr%d, cr%d, %d @ 0x%"PRIregister"\n",
cp32.read ? "mrc" : "mcr",
cp32.op1, cp32.reg, cp32.crn, cp32.crm, cp32.op2, regs->pc);
gdprintk(XENLOG_ERR, "unhandled 32-bit CP15 access %#x\n",
hsr.bits & HSR_CP32_REGS_MASK);
-#endif
- inject_undef_exception(regs, hsr.len);
+ inject_undef_exception(regs, hsr);
return;
}
advance_pc(regs, hsr);
}
static void do_cp15_64(struct cpu_user_regs *regs,
- union hsr hsr)
+ const union hsr hsr)
{
if ( !check_conditional_instr(regs, hsr) )
{
@@ -1643,18 +1816,37 @@ static void do_cp15_64(struct cpu_user_regs *regs,
switch ( hsr.bits & HSR_CP64_REGS_MASK )
{
- case HSR_CPREG64(CNTPCT):
+ /*
+ * !CNTHCTL_EL2.EL1PCEN / !CNTHCTL.PL1PCEN
+ *
+ * ARMv7 (DDI 0406C.b): B4.1.22
+ * ARMv8 (DDI 0487A.d): D1-1510 Table D1-60
+ */
+ case HSR_CPREG64(CNTP_CVAL):
if ( !vtimer_emulate(regs, hsr) )
- {
- dprintk(XENLOG_ERR,
- "failed emulation of 64-bit vtimer CP register access\n");
- domain_crash_synchronous();
- }
+ return inject_undef_exception(regs, hsr);
break;
+
+ /*
+ * CPTR_EL2.T{0..9,12..13}
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.12
+ * ARMv8 (DDI 0487A.d): N/A
+ *
+ * - All accesses to coprocessors 0..9 and 12..13
+ *
+ * HSTR_EL2.T15
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.14
+ * ARMv8 (DDI 0487A.d): D1-1507 Table D1-55
+ *
+ * - All accesses to cp15, c15 registers.
+ *
+ * And all other unknown registers.
+ */
default:
{
-#ifndef NDEBUG
- struct hsr_cp64 cp64 = hsr.cp64;
+ const struct hsr_cp64 cp64 = hsr.cp64;
gdprintk(XENLOG_ERR,
"%s p15, %d, r%d, r%d, cr%d @ 0x%"PRIregister"\n",
@@ -1662,18 +1854,17 @@ static void do_cp15_64(struct cpu_user_regs *regs,
cp64.op1, cp64.reg1, cp64.reg2, cp64.crm, regs->pc);
gdprintk(XENLOG_ERR, "unhandled 64-bit CP15 access %#x\n",
hsr.bits & HSR_CP64_REGS_MASK);
-#endif
- inject_undef_exception(regs, hsr.len);
+ inject_undef_exception(regs, hsr);
return;
}
}
advance_pc(regs, hsr);
}
-static void do_cp14_32(struct cpu_user_regs *regs, union hsr hsr)
+static void do_cp14_32(struct cpu_user_regs *regs, const union hsr hsr)
{
- struct hsr_cp32 cp32 = hsr.cp32;
- uint32_t *r = (uint32_t *)select_user_reg(regs, cp32.reg);
+ const struct hsr_cp32 cp32 = hsr.cp32;
+ register_t *r = select_user_reg(regs, cp32.reg);
struct domain *d = current->domain;
if ( !check_conditional_instr(regs, hsr) )
@@ -1684,11 +1875,50 @@ static void do_cp14_32(struct cpu_user_regs *regs, union hsr hsr)
switch ( hsr.bits & HSR_CP32_REGS_MASK )
{
- case HSR_CPREG32(DBGDIDR):
+ /*
+ * MDCR_EL2.TDOSA
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.15
+ * ARMv8 (DDI 0487A.d): D1-1509 Table D1-58
+ *
+ * Unhandled:
+ * DBGOSLSR
+ * DBGPRCR
+ */
+ case HSR_CPREG32(DBGOSLAR):
+ return handle_wo_wi(regs, r, cp32.read, hsr, 1);
+ case HSR_CPREG32(DBGOSDLR):
+ return handle_raz_wi(regs, r, cp32.read, hsr, 1);
- /* Read-only register */
+ /*
+ * MDCR_EL2.TDA
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.15
+ * ARMv8 (DDI 0487A.d): D1-1510 Table D1-59
+ *
+ * Unhandled:
+ * DBGDCCINT
+ * DBGDTRRXint
+ * DBGDTRTXint
+ * DBGWFAR
+ * DBGDTRTXext
+ * DBGDTRRXext
+ * DBGBXVR<n>
+ * DBGCLAIMSET
+ * DBGCLAIMCLR
+ * DBGAUTHSTATUS
+ * DBGDEVID
+ * DBGDEVID1
+ * DBGDEVID2
+ * DBGOSECCR
+ */
+ case HSR_CPREG32(DBGDIDR):
+ /*
+ * Read-only register. Accessible by EL0 if DBGDSCRext.UDCCdis
+ * is set to 0, which we emulate below.
+ */
if ( !cp32.read )
- goto bad_cp;
+ return inject_undef_exception(regs, hsr);
/* Implement the minimum requirements:
* - Number of watchpoints: 1
@@ -1701,49 +1931,64 @@ static void do_cp14_32(struct cpu_user_regs *regs, union hsr hsr)
break;
case HSR_CPREG32(DBGDSCRINT):
+ /*
+ * Read-only register. Accessible by EL0 if DBGDSCRext.UDCCdis
+ * is set to 0, which we emulate below.
+ */
+ return handle_ro_raz(regs, r, cp32.read, hsr, 1);
+
case HSR_CPREG32(DBGDSCREXT):
- /* Implement debug status and control register as RAZ/WI.
- * The OS won't use Hardware debug if MDBGen not set
+ /*
+ * Implement debug status and control register as RAZ/WI.
+ * The OS won't use Hardware debug if MDBGen not set.
*/
- if ( cp32.read )
- *r = 0;
- break;
+ return handle_raz_wi(regs, r, cp32.read, hsr, 1);
+
case HSR_CPREG32(DBGVCR):
- case HSR_CPREG32(DBGOSLAR):
case HSR_CPREG32(DBGBVR0):
case HSR_CPREG32(DBGBCR0):
case HSR_CPREG32(DBGWVR0):
case HSR_CPREG32(DBGWCR0):
case HSR_CPREG32(DBGBVR1):
case HSR_CPREG32(DBGBCR1):
- case HSR_CPREG32(DBGOSDLR):
- /* RAZ/WI */
- if ( cp32.read )
- *r = 0;
- break;
+ return handle_raz_wi(regs, r, cp32.read, hsr, 1);
+ /*
+ * CPTR_EL2.TTA
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.16
+ * ARMv8 (DDI 0487A.d): D1-1507 Table D1-54
+ *
+ * - All implemented trace registers.
+ *
+ * MDCR_EL2.TDRA
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.15
+ * ARMv8 (DDI 0487A.d): D1-1508 Table D1-57
+ *
+ * Unhandled:
+ * DBGDRAR (32-bit accesses)
+ * DBGDSAR (32-bit accesses)
+ *
+ * And all other unknown registers.
+ */
default:
-bad_cp:
-#ifndef NDEBUG
gdprintk(XENLOG_ERR,
"%s p14, %d, r%d, cr%d, cr%d, %d @ 0x%"PRIregister"\n",
cp32.read ? "mrc" : "mcr",
cp32.op1, cp32.reg, cp32.crn, cp32.crm, cp32.op2, regs->pc);
gdprintk(XENLOG_ERR, "unhandled 32-bit cp14 access %#x\n",
hsr.bits & HSR_CP32_REGS_MASK);
-#endif
- inject_undef_exception(regs, hsr.len);
+ inject_undef_exception(regs, hsr);
return;
}
advance_pc(regs, hsr);
}
-static void do_cp14_dbg(struct cpu_user_regs *regs, union hsr hsr)
+static void do_cp14_64(struct cpu_user_regs *regs, const union hsr hsr)
{
-#ifndef NDEBUG
- struct hsr_cp64 cp64 = hsr.cp64;
-#endif
+ const struct hsr_cp64 cp64 = hsr.cp64;
if ( !check_conditional_instr(regs, hsr) )
{
@@ -1751,22 +1996,69 @@ static void do_cp14_dbg(struct cpu_user_regs *regs, union hsr hsr)
return;
}
-#ifndef NDEBUG
+ /*
+ * CPTR_EL2.TTA
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.16
+ * ARMv8 (DDI 0487A.d): D1-1507 Table D1-54
+ *
+ * - All implemented trace registers.
+ *
+ * MDCR_EL2.TDRA
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.15
+ * ARMv8 (DDI 0487A.d): D1-1508 Table D1-57
+ *
+ * Unhandled:
+ * DBGDRAR (64-bit accesses)
+ * DBGDSAR (64-bit accesses)
+ *
+ * And all other unknown registers.
+ */
gdprintk(XENLOG_ERR,
"%s p14, %d, r%d, r%d, cr%d @ 0x%"PRIregister"\n",
cp64.read ? "mrrc" : "mcrr",
cp64.op1, cp64.reg1, cp64.reg2, cp64.crm, regs->pc);
gdprintk(XENLOG_ERR, "unhandled 64-bit CP14 access %#x\n",
hsr.bits & HSR_CP64_REGS_MASK);
-#endif
- inject_undef_exception(regs, hsr.len);
+ inject_undef_exception(regs, hsr);
}
-static void do_cp(struct cpu_user_regs *regs, union hsr hsr)
+static void do_cp14_dbg(struct cpu_user_regs *regs, const union hsr hsr)
{
-#ifndef NDEBUG
- struct hsr_cp cp = hsr.cp;
-#endif
+ const struct hsr_cp64 cp64 = hsr.cp64;
+
+ if ( !check_conditional_instr(regs, hsr) )
+ {
+ advance_pc(regs, hsr);
+ return;
+ }
+
+ /*
+ * MDCR_EL2.TDOSA
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.15
+ * ARMv8 (DDI 0487A.d): D1-1509 Table D1-58
+ *
+ * Unhandled:
+ * DBGDTRTXint
+ * DBGDTRRXint
+ *
+ * And all other unknown registers.
+ */
+ gdprintk(XENLOG_ERR,
+ "%s p14, %d, r%d, r%d, cr%d @ 0x%"PRIregister"\n",
+ cp64.read ? "mrrc" : "mcrr",
+ cp64.op1, cp64.reg1, cp64.reg2, cp64.crm, regs->pc);
+ gdprintk(XENLOG_ERR, "unhandled 64-bit CP14 DBG access %#x\n",
+ hsr.bits & HSR_CP64_REGS_MASK);
+
+ inject_undef_exception(regs, hsr);
+}
+
+static void do_cp(struct cpu_user_regs *regs, const union hsr hsr)
+{
+ const struct hsr_cp cp = hsr.cp;
if ( !check_conditional_instr(regs, hsr) )
{
@@ -1774,27 +2066,114 @@ static void do_cp(struct cpu_user_regs *regs, union hsr hsr)
return;
}
-#ifndef NDEBUG
ASSERT(!cp.tas); /* We don't trap SIMD instruction */
gdprintk(XENLOG_ERR, "unhandled CP%d access\n", cp.coproc);
-#endif
- inject_undef_exception(regs, hsr.len);
+ inject_undef_exception(regs, hsr);
}
#ifdef CONFIG_ARM_64
static void do_sysreg(struct cpu_user_regs *regs,
- union hsr hsr)
+ const union hsr hsr)
{
register_t *x = select_user_reg(regs, hsr.sysreg.reg);
+ struct vcpu *v = current;
switch ( hsr.bits & HSR_SYSREG_REGS_MASK )
{
- /* RAZ/WI registers: */
- /* - Debug */
+ /*
+ * HCR_EL2.TACR
+ *
+ * ARMv8 (DDI 0487A.d): D7.2.1
+ */
+ case HSR_SYSREG_ACTLR_EL1:
+ if ( psr_mode_is_user(regs) )
+ return inject_undef_exception(regs, hsr);
+ if ( hsr.sysreg.read )
+ *x = v->arch.actlr;
+ break;
+
+ /*
+ * MDCR_EL2.TDRA
+ *
+ * ARMv8 (DDI 0487A.d): D1-1508 Table D1-57
+ */
+ case HSR_SYSREG_MDRAR_EL1:
+ return handle_ro_raz(regs, x, hsr.sysreg.read, hsr, 1);
+
+ /*
+ * MDCR_EL2.TDOSA
+ *
+ * ARMv8 (DDI 0487A.d): D1-1509 Table D1-58
+ *
+ * Unhandled:
+ * OSLSR_EL1
+ * DBGPRCR_EL1
+ */
+ case HSR_SYSREG_OSLAR_EL1:
+ return handle_wo_wi(regs, x, hsr.sysreg.read, hsr, 1);
+ case HSR_SYSREG_OSDLR_EL1:
+ return handle_raz_wi(regs, x, hsr.sysreg.read, hsr, 1);
+
+ /*
+ * MDCR_EL2.TDA
+ *
+ * ARMv8 (DDI 0487A.d): D1-1510 Table D1-59
+ *
+ * Unhandled:
+ * MDCCINT_EL1
+ * DBGDTR_EL0
+ * DBGDTRRX_EL0
+ * DBGDTRTX_EL0
+ * OSDTRRX_EL1
+ * OSDTRTX_EL1
+ * OSECCR_EL1
+ * DBGCLAIMSET_EL1
+ * DBGCLAIMCLR_EL1
+ * DBGAUTHSTATUS_EL1
+ */
case HSR_SYSREG_MDSCR_EL1:
- /* - Perf monitors */
+ return handle_raz_wi(regs, x, hsr.sysreg.read, hsr, 1);
+ case HSR_SYSREG_MDCCSR_EL0:
+ /*
+ * Accessible at EL0 only if MDSCR_EL1.TDCC is set to 0; we emulate
+ * that register as RAZ/WI above. So MDCCSR_EL0 is RO at both EL0 and EL1.
+ */
+ return handle_ro_raz(regs, x, hsr.sysreg.read, hsr, 0);
+ HSR_SYSREG_DBG_CASES(DBGBVR):
+ HSR_SYSREG_DBG_CASES(DBGBCR):
+ HSR_SYSREG_DBG_CASES(DBGWVR):
+ HSR_SYSREG_DBG_CASES(DBGWCR):
+ return handle_raz_wi(regs, x, hsr.sysreg.read, hsr, 1);
+
+ /*
+ * MDCR_EL2.TPM
+ *
+ * ARMv8 (DDI 0487A.d): D1-1511 Table D1-61
+ *
+ * Unhandled:
+ * PMEVCNTR<n>_EL0
+ * PMEVTYPER<n>_EL0
+ * PMCCFILTR_EL0
+ *
+ * MDCR_EL2.TPMCR
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.17
+ * ARMv8 (DDI 0487A.d): D1-1511 Table D1-62
+ *
+ * NB: Both MDCR_EL2.TPM and MDCR_EL2.TPMCR cause trapping of PMCR.
+ */
case HSR_SYSREG_PMINTENSET_EL1:
case HSR_SYSREG_PMINTENCLR_EL1:
+ /*
+ * Accessible from EL1 only; a trap taken from EL0 is handled as
+ * undef.
+ */
+ return handle_raz_wi(regs, x, hsr.sysreg.read, hsr, 1);
+ case HSR_SYSREG_PMUSERENR_EL0:
+ /* RO at EL0. RAZ/WI at EL1 */
+ if ( psr_mode_is_user(regs) )
+ return handle_ro_raz(regs, x, hsr.sysreg.read, hsr, 0);
+ else
+ return handle_raz_wi(regs, x, hsr.sysreg.read, hsr, 1);
case HSR_SYSREG_PMCR_EL0:
case HSR_SYSREG_PMCNTENSET_EL0:
case HSR_SYSREG_PMCNTENCLR_EL0:
@@ -1806,42 +2185,37 @@ static void do_sysreg(struct cpu_user_regs *regs,
case HSR_SYSREG_PMCCNTR_EL0:
case HSR_SYSREG_PMXEVTYPER_EL0:
case HSR_SYSREG_PMXEVCNTR_EL0:
- case HSR_SYSREG_PMUSERENR_EL0:
case HSR_SYSREG_PMOVSSET_EL0:
- /* - Breakpoints */
- HSR_SYSREG_DBG_CASES(DBGBVR):
- HSR_SYSREG_DBG_CASES(DBGBCR):
- /* - Watchpoints */
- HSR_SYSREG_DBG_CASES(DBGWVR):
- HSR_SYSREG_DBG_CASES(DBGWCR):
- /* - Double Lock Register */
- case HSR_SYSREG_OSDLR_EL1:
- if ( hsr.sysreg.read )
- *x = 0;
- /* else: write ignored */
- break;
+ /*
+ * Accessible at EL0 only if PMUSERENR_EL0.EN is set. We
+ * emulate that register as 0 above.
+ */
+ return handle_raz_wi(regs, x, hsr.sysreg.read, hsr, 1);
- /* Write only, Write ignore registers: */
- case HSR_SYSREG_OSLAR_EL1:
- if ( hsr.sysreg.read )
- goto bad_sysreg;
- /* else: write ignored */
- break;
+ /*
+ * !CNTHCTL_EL2.EL1PCEN
+ *
+ * ARMv8 (DDI 0487A.d): D1-1510 Table D1-60
+ */
case HSR_SYSREG_CNTP_CTL_EL0:
case HSR_SYSREG_CNTP_TVAL_EL0:
+ case HSR_SYSREG_CNTP_CVAL_EL0:
if ( !vtimer_emulate(regs, hsr) )
- {
- dprintk(XENLOG_ERR,
- "failed emulation of 64-bit vtimer sysreg access\n");
- domain_crash_synchronous();
- }
+ return inject_undef_exception(regs, hsr);
break;
+
+ /*
+ * HCR_EL2.FMO or HCR_EL2.IMO
+ *
+ * ARMv8: GIC Architecture Specification (PRD03-GENC-010745 24.0)
+ * Section 4.6.8.
+ */
case HSR_SYSREG_ICC_SGI1R_EL1:
if ( !vgic_emulate(regs, hsr) )
{
dprintk(XENLOG_WARNING,
"failed emulation of sysreg ICC_SGI1R_EL1 access\n");
- inject_undef64_exception(regs, hsr.len);
+ return inject_undef64_exception(regs, hsr.len);
}
break;
case HSR_SYSREG_ICC_SGI0R_EL1:
@@ -1849,12 +2223,26 @@ static void do_sysreg(struct cpu_user_regs *regs,
/* TBD: Implement to support secure grp0/1 SGI forwarding */
dprintk(XENLOG_WARNING,
"Emulation of sysreg ICC_SGI0R_EL1/ASGI1R_EL1 not supported\n");
- inject_undef64_exception(regs, hsr.len);
+ return inject_undef64_exception(regs, hsr.len);
+
+ /*
+ * HCR_EL2.TIDCP
+ *
+ * ARMv8 (DDI 0487A.d): D1-1501 Table D1-43
+ *
+ * - Reserved control space for IMPLEMENTATION DEFINED functionality.
+ *
+ * CPTR_EL2.TTA
+ *
+ * ARMv8 (DDI 0487A.d): D1-1507 Table D1-54
+ *
+ * - All implemented trace registers.
+ *
+ * And all other unknown registers.
+ */
default:
- bad_sysreg:
{
- struct hsr_sysreg sysreg = hsr.sysreg;
-#ifndef NDEBUG
+ const struct hsr_sysreg sysreg = hsr.sysreg;
gdprintk(XENLOG_ERR,
"%s %d, %d, c%d, c%d, %d %s x%d @ 0x%"PRIregister"\n",
@@ -1866,8 +2254,7 @@ static void do_sysreg(struct cpu_user_regs *regs,
sysreg.reg, regs->pc);
gdprintk(XENLOG_ERR, "unhandled 64-bit sysreg access %#x\n",
hsr.bits & HSR_SYSREG_REGS_MASK);
-#endif
- inject_undef_exception(regs, sysreg.len);
+ inject_undef_exception(regs, hsr);
return;
}
}
@@ -1906,7 +2293,7 @@ void dump_guest_s1_walk(struct domain *d, vaddr_t addr)
printk("Failed TTBR0 maddr lookup\n");
goto done;
}
- first = map_domain_page(paddr>>PAGE_SHIFT);
+ first = map_domain_page(_mfn(paddr_to_pfn(paddr)));
offset = addr >> (12+10);
printk("1ST[0x%"PRIx32"] (0x%"PRIpaddr") = 0x%08"PRIx32"\n",
@@ -1922,7 +2309,7 @@ void dump_guest_s1_walk(struct domain *d, vaddr_t addr)
printk("Failed L1 entry maddr lookup\n");
goto done;
}
- second = map_domain_page(paddr>>PAGE_SHIFT);
+ second = map_domain_page(_mfn(paddr_to_pfn(paddr)));
offset = (addr >> 12) & 0x3FF;
printk("2ND[0x%"PRIx32"] (0x%"PRIpaddr") = 0x%08"PRIx32"\n",
offset, paddr, second[offset]);
@@ -1933,16 +2320,56 @@ done:
}
static void do_trap_instr_abort_guest(struct cpu_user_regs *regs,
- union hsr hsr)
+ const union hsr hsr)
{
- register_t addr = READ_SYSREG(FAR_EL2);
- inject_iabt_exception(regs, addr, hsr.len);
+ int rc;
+ register_t gva = READ_SYSREG(FAR_EL2);
+
+ switch ( hsr.iabt.ifsc & 0x3f )
+ {
+ case FSC_FLT_PERM ... FSC_FLT_PERM + 3:
+ {
+ paddr_t gpa;
+ const struct npfec npfec = {
+ .insn_fetch = 1,
+ .gla_valid = 1,
+ .kind = hsr.iabt.s1ptw ? npfec_kind_in_gpt : npfec_kind_with_gla
+ };
+
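+ /* For a fault taken on a stage 1 page-table walk, HPFAR_EL2 reports the faulting IPA. */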
+ if ( hsr.iabt.s1ptw )
+ gpa = READ_SYSREG(HPFAR_EL2);
+ else
+ {
+ /*
+ * Flush the TLB to make sure the DTLB is clear before
+ * doing GVA->IPA translation. If we got here because of
+ * an entry only present in the ITLB, this translation may
+ * still be inaccurate.
+ */
+ flush_tlb_local();
+
+ rc = gva_to_ipa(gva, &gpa, GV2M_READ);
+ if ( rc == -EFAULT )
+ goto bad_insn_abort;
+ }
+
+ rc = p2m_mem_access_check(gpa, gva, npfec);
+
+ /* Trap was triggered by mem_access, work here is done */
+ if ( !rc )
+ return;
+ }
+ break;
+ }
+
+bad_insn_abort:
+ inject_iabt_exception(regs, gva, hsr.len);
}
static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
- union hsr hsr)
+ const union hsr hsr)
{
- struct hsr_dabt dabt = hsr.dabt;
+ const struct hsr_dabt dabt = hsr.dabt;
int rc;
mmio_info_t info;
@@ -1959,11 +2386,36 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
info.gva = READ_SYSREG64(FAR_EL2);
#endif
- if (dabt.s1ptw)
- goto bad_data_abort;
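+ /* As for prefetch aborts, HPFAR_EL2 already holds the IPA when the fault hit a stage 1 walk. */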
+ if ( dabt.s1ptw )
+ info.gpa = READ_SYSREG(HPFAR_EL2);
+ else
+ {
+ rc = gva_to_ipa(info.gva, &info.gpa, GV2M_READ);
+ if ( rc == -EFAULT )
+ goto bad_data_abort;
+ }
+
+ switch ( dabt.dfsc & 0x3f )
+ {
+ case FSC_FLT_PERM ... FSC_FLT_PERM + 3:
+ {
+ const struct npfec npfec = {
+ .read_access = !dabt.write,
+ .write_access = dabt.write,
+ .gla_valid = 1,
+ .kind = dabt.s1ptw ? npfec_kind_in_gpt : npfec_kind_with_gla
+ };
+
+ rc = p2m_mem_access_check(info.gpa, info.gva, npfec);
- rc = gva_to_ipa(info.gva, &info.gpa);
- if ( rc == -EFAULT )
+ /* Trap was triggered by mem_access, work here is done */
+ if ( !rc )
+ return;
+ }
+ break;
+ }
+
+ if ( dabt.s1ptw )
goto bad_data_abort;
/* XXX: Decode the instruction if ISS is not valid */
@@ -1979,7 +2431,7 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
rc = decode_instruction(regs, &info.dabt);
if ( rc )
{
- gdprintk(XENLOG_DEBUG, "Unable to decode instruction\n");
+ gprintk(XENLOG_DEBUG, "Unable to decode instruction\n");
goto bad_data_abort;
}
}
@@ -1991,6 +2443,8 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
}
bad_data_abort:
+ gdprintk(XENLOG_DEBUG, "HSR=0x%x pc=%#"PRIregister" gva=%#"PRIvaddr
+ " gpa=%#"PRIpaddr"\n", hsr.bits, regs->pc, info.gva, info.gpa);
inject_dabt_exception(regs, info.gva, hsr.len);
}
@@ -2002,24 +2456,18 @@ static void enter_hypervisor_head(struct cpu_user_regs *regs)
asmlinkage void do_trap_hypervisor(struct cpu_user_regs *regs)
{
- union hsr hsr = { .bits = READ_SYSREG32(ESR_EL2) };
+ const union hsr hsr = { .bits = READ_SYSREG32(ESR_EL2) };
enter_hypervisor_head(regs);
- /*
- * We currently do not handle 32-bit userspace on 64-bit kernels
- * correctly (See XSA-102). Until that is resolved we treat any
- * trap from 32-bit userspace on 64-bit kernel as undefined.
- */
- if ( !hyp_mode(regs) && is_64bit_domain(current->domain) &&
- psr_mode_is_32bit(regs->cpsr) )
- {
- inject_undef_exception(regs, hsr.len);
- return;
- }
-
switch (hsr.ec) {
case HSR_EC_WFI_WFE:
+ /*
+ * HCR_EL2.TWI, HCR_EL2.TWE
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.9
+ * ARMv8 (DDI 0487A.d): D1-1505 Table D1-51
+ */
if ( !check_conditional_instr(regs, hsr) )
{
advance_pc(regs, hsr);
@@ -2027,42 +2475,59 @@ asmlinkage void do_trap_hypervisor(struct cpu_user_regs *regs)
}
if ( hsr.wfi_wfe.ti ) {
/* Yield the VCPU for WFE */
+ perfc_incr(trap_wfe);
vcpu_yield();
} else {
/* Block the VCPU for WFI */
+ perfc_incr(trap_wfi);
vcpu_block_unless_event_pending(current);
}
advance_pc(regs, hsr);
break;
case HSR_EC_CP15_32:
- if ( !is_32bit_domain(current->domain) )
- goto bad_trap;
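+ /* A 32-bit coprocessor trap can only come from a guest executing in AArch32 state. */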
+ GUEST_BUG_ON(!psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_cp15_32);
do_cp15_32(regs, hsr);
break;
case HSR_EC_CP15_64:
- if ( !is_32bit_domain(current->domain) )
- goto bad_trap;
+ GUEST_BUG_ON(!psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_cp15_64);
do_cp15_64(regs, hsr);
break;
case HSR_EC_CP14_32:
- if ( !is_32bit_domain(current->domain) )
- goto bad_trap;
+ GUEST_BUG_ON(!psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_cp14_32);
do_cp14_32(regs, hsr);
break;
+ case HSR_EC_CP14_64:
+ GUEST_BUG_ON(!psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_cp14_64);
+ do_cp14_64(regs, hsr);
+ break;
case HSR_EC_CP14_DBG:
- if ( !is_32bit_domain(current->domain) )
- goto bad_trap;
+ GUEST_BUG_ON(!psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_cp14_dbg);
do_cp14_dbg(regs, hsr);
break;
case HSR_EC_CP:
- if ( !is_32bit_domain(current->domain) )
- goto bad_trap;
+ GUEST_BUG_ON(!psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_cp);
do_cp(regs, hsr);
break;
case HSR_EC_SMC32:
+ /*
+ * HCR_EL2.TSC
+ *
+ * ARMv7 (DDI 0406C.b): B1.14.8
+ * ARMv8 (DDI 0487A.d): D1-1501 Table D1-44
+ */
+ GUEST_BUG_ON(!psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_smc32);
inject_undef32_exception(regs);
break;
case HSR_EC_HVC32:
+ GUEST_BUG_ON(!psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_hvc32);
#ifndef NDEBUG
if ( (hsr.iss & 0xff00) == 0xff00 )
return do_debug_trap(regs, hsr.iss & 0x00ff);
@@ -2073,6 +2538,8 @@ asmlinkage void do_trap_hypervisor(struct cpu_user_regs *regs)
break;
#ifdef CONFIG_ARM_64
case HSR_EC_HVC64:
+ GUEST_BUG_ON(psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_hvc64);
#ifndef NDEBUG
if ( (hsr.iss & 0xff00) == 0xff00 )
return do_debug_trap(regs, hsr.iss & 0x00ff);
@@ -2082,19 +2549,28 @@ asmlinkage void do_trap_hypervisor(struct cpu_user_regs *regs)
do_trap_hypercall(regs, &regs->x16, hsr.iss);
break;
case HSR_EC_SMC64:
+ /*
+ * HCR_EL2.TSC
+ *
+ * ARMv8 (DDI 0487A.d): D1-1501 Table D1-44
+ */
+ GUEST_BUG_ON(psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_smc64);
inject_undef64_exception(regs, hsr.len);
break;
case HSR_EC_SYSREG:
- if ( is_32bit_domain(current->domain) )
- goto bad_trap;
+ GUEST_BUG_ON(psr_mode_is_32bit(regs->cpsr));
+ perfc_incr(trap_sysreg);
do_sysreg(regs, hsr);
break;
#endif
case HSR_EC_INSTR_ABORT_LOWER_EL:
+ perfc_incr(trap_iabt);
do_trap_instr_abort_guest(regs, hsr);
break;
case HSR_EC_DATA_ABORT_LOWER_EL:
+ perfc_incr(trap_dabt);
do_trap_data_abort_guest(regs, hsr);
break;
@@ -2105,7 +2581,6 @@ asmlinkage void do_trap_hypervisor(struct cpu_user_regs *regs)
#endif
default:
- bad_trap:
printk("Hypervisor Trap. HSR=0x%x EC=0x%x IL=%x Syndrome=0x%"PRIx32"\n",
hsr.bits, hsr.ec, hsr.len, hsr.iss);
do_unexpected_trap("Hypervisor", regs);
diff --git a/xen/arch/arm/vgic-v2.c b/xen/arch/arm/vgic-v2.c
index 86d3628..fa71598 100644
--- a/xen/arch/arm/vgic-v2.c
+++ b/xen/arch/arm/vgic-v2.c
@@ -24,14 +24,32 @@
#include <xen/softirq.h>
#include <xen/irq.h>
#include <xen/sched.h>
+#include <xen/sizes.h>
#include <asm/current.h>
-#include <asm/device.h>
#include <asm/mmio.h>
-#include <asm/gic.h>
+#include <asm/platform.h>
#include <asm/vgic.h>
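+/* Host GICv2 region addresses, recorded by the GIC driver via vgic_v2_setup_hw(). */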
+static struct {
+ bool_t enabled;
+ /* Distributor interface address */
+ paddr_t dbase;
+ /* CPU interface address */
+ paddr_t cbase;
+ /* Virtual CPU interface address */
+ paddr_t vbase;
+} vgic_v2_hw;
+
+void vgic_v2_setup_hw(paddr_t dbase, paddr_t cbase, paddr_t vbase)
+{
+ vgic_v2_hw.enabled = 1;
+ vgic_v2_hw.dbase = dbase;
+ vgic_v2_hw.cbase = cbase;
+ vgic_v2_hw.vbase = vbase;
+}
+
static int vgic_v2_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
{
struct hsr_dabt dabt = info->dabt;
@@ -41,6 +59,8 @@ static int vgic_v2_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
int gicd_reg = (int)(info->gpa - v->domain->arch.vgic.dbase);
unsigned long flags;
+ perfc_incr(vgicd_reads);
+
switch ( gicd_reg )
{
case GICD_CTLR:
@@ -54,7 +74,7 @@ static int vgic_v2_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
/* No secure world support for guests. */
vgic_lock(v);
*r = ( ((v->domain->max_vcpus - 1) << GICD_TYPE_CPUS_SHIFT) )
- |( ((v->domain->arch.vgic.nr_spis / 32)) & GICD_TYPE_LINES );
+ | DIV_ROUND_UP(v->domain->arch.vgic.nr_spis, 32);
vgic_unlock(v);
return 1;
case GICD_IIDR:
@@ -92,41 +112,15 @@ static int vgic_v2_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
vgic_unlock_rank(v, rank, flags);
return 1;
+ /* Reading the pending status of an IRQ via GICD is not supported */
case GICD_ISPENDR ... GICD_ISPENDRN:
- if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicd_reg - GICD_ISPENDR, DABT_WORD);
- if ( rank == NULL) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = vgic_byte_read(rank->ipend, dabt.sign, gicd_reg);
- vgic_unlock_rank(v, rank, flags);
- return 1;
-
case GICD_ICPENDR ... GICD_ICPENDRN:
- if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 0, gicd_reg - GICD_ICPENDR, DABT_WORD);
- if ( rank == NULL) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = vgic_byte_read(rank->ipend, dabt.sign, gicd_reg);
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ goto read_as_zero;
+ /* Reading the active status of an IRQ via GICD is not supported */
case GICD_ISACTIVER ... GICD_ISACTIVERN:
- if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicd_reg - GICD_ISACTIVER, DABT_WORD);
- if ( rank == NULL) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = rank->iactive;
- vgic_unlock_rank(v, rank, flags);
- return 1;
-
case GICD_ICACTIVER ... GICD_ICACTIVERN:
- if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicd_reg - GICD_ICACTIVER, DABT_WORD);
- if ( rank == NULL) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = rank->iactive;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ goto read_as_zero;
case GICD_ITARGETSR ... GICD_ITARGETSRN:
if ( dabt.size != DABT_BYTE && dabt.size != DABT_WORD ) goto bad_width;
@@ -172,23 +166,10 @@ static int vgic_v2_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
*r = 0xdeadbeef;
return 1;
+ /* Setting/Clearing the SGI pending bit via GICD is not supported */
case GICD_CPENDSGIR ... GICD_CPENDSGIRN:
- if ( dabt.size != DABT_BYTE && dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicd_reg - GICD_CPENDSGIR, DABT_WORD);
- if ( rank == NULL) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = vgic_byte_read(rank->pendsgi, dabt.sign, gicd_reg);
- vgic_unlock_rank(v, rank, flags);
- return 1;
-
case GICD_SPENDSGIR ... GICD_SPENDSGIRN:
- if ( dabt.size != DABT_BYTE && dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicd_reg - GICD_SPENDSGIR, DABT_WORD);
- if ( rank == NULL) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = vgic_byte_read(rank->pendsgi, dabt.sign, gicd_reg);
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ goto read_as_zero;
/* Implementation defined -- read as zero */
case 0xfd0 ... 0xfe4:
@@ -237,16 +218,17 @@ static int vgic_v2_to_sgi(struct vcpu *v, register_t sgir)
int virq;
int irqmode;
enum gic_sgi_mode sgi_mode;
- unsigned long vcpu_mask = 0;
+ struct sgi_target target;
irqmode = (sgir & GICD_SGI_TARGET_LIST_MASK) >> GICD_SGI_TARGET_LIST_SHIFT;
virq = (sgir & GICD_SGI_INTID_MASK);
- vcpu_mask = (sgir & GICD_SGI_TARGET_MASK) >> GICD_SGI_TARGET_SHIFT;
/* Map GIC sgi value to enum value */
switch ( irqmode )
{
case GICD_SGI_TARGET_LIST_VAL:
+ sgi_target_init(&target);
+ target.list = (sgir & GICD_SGI_TARGET_MASK) >> GICD_SGI_TARGET_SHIFT;
sgi_mode = SGI_TARGET_LIST;
break;
case GICD_SGI_TARGET_OTHERS_VAL:
@@ -262,7 +244,7 @@ static int vgic_v2_to_sgi(struct vcpu *v, register_t sgir)
return 0;
}
- return vgic_to_sgi(v, sgir, sgi_mode, virq, vcpu_mask);
+ return vgic_to_sgi(v, sgir, sgi_mode, virq, &target);
}
static int vgic_v2_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
@@ -275,6 +257,8 @@ static int vgic_v2_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
uint32_t tr;
unsigned long flags;
+ perfc_incr(vgicd_writes);
+
switch ( gicd_reg )
{
case GICD_CTLR:
@@ -345,21 +329,17 @@ static int vgic_v2_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
case GICD_ISACTIVER ... GICD_ISACTIVERN:
if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicd_reg - GICD_ISACTIVER, DABT_WORD);
- if ( rank == NULL) goto write_ignore;
- vgic_lock_rank(v, rank, flags);
- rank->iactive &= ~*r;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ printk(XENLOG_G_ERR
+ "%pv: vGICD: unhandled word write %#"PRIregister" to ISACTIVER%d\n",
+ v, *r, gicd_reg - GICD_ISACTIVER);
+ return 0;
case GICD_ICACTIVER ... GICD_ICACTIVERN:
if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicd_reg - GICD_ICACTIVER, DABT_WORD);
- if ( rank == NULL) goto write_ignore;
- vgic_lock_rank(v, rank, flags);
- rank->iactive &= ~*r;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ printk(XENLOG_G_ERR
+ "%pv: vGICD: unhandled word write %#"PRIregister" to ICACTIVER%d\n",
+ v, *r, gicd_reg - GICD_ICACTIVER);
+ return 0;
case GICD_ITARGETSR ... GICD_ITARGETSR + 7:
/* SGI/PPI target is read only */
@@ -565,14 +545,50 @@ static int vgic_v2_vcpu_init(struct vcpu *v)
static int vgic_v2_domain_init(struct domain *d)
{
- int i;
+ int i, ret;
+
+ /*
+ * The hardware domain gets the hardware address.
+ * Guests get the virtual platform layout.
+ */
+ if ( is_hardware_domain(d) )
+ {
+ d->arch.vgic.dbase = vgic_v2_hw.dbase;
+ d->arch.vgic.cbase = vgic_v2_hw.cbase;
+ }
+ else
+ {
+ d->arch.vgic.dbase = GUEST_GICD_BASE;
+ d->arch.vgic.cbase = GUEST_GICC_BASE;
+ }
+
+ /*
+ * Map the gic virtual cpu interface in the gic cpu interface
+ * region of the guest.
+ *
+ * The second page is always mapped at +4K irrespective of the
+ * GIC_64K_STRIDE quirk. The DTB passed to the guest reflects this.
+ */
+ ret = map_mmio_regions(d, paddr_to_pfn(d->arch.vgic.cbase), 1,
+ paddr_to_pfn(vgic_v2_hw.vbase));
+ if ( ret )
+ return ret;
+
+ if ( !platform_has_quirk(PLATFORM_QUIRK_GIC_64K_STRIDE) )
+ ret = map_mmio_regions(d, paddr_to_pfn(d->arch.vgic.cbase + PAGE_SIZE),
+ 1, paddr_to_pfn(vgic_v2_hw.vbase + PAGE_SIZE));
+ else
+ ret = map_mmio_regions(d, paddr_to_pfn(d->arch.vgic.cbase + PAGE_SIZE),
+ 1, paddr_to_pfn(vgic_v2_hw.vbase + SZ_64K));
+
+ if ( ret )
+ return ret;
/* By default deliver to CPU0 */
for ( i = 0; i < DOMAIN_NR_RANKS(d); i++ )
memset(d->arch.vgic.shared_irqs[i].v2.itargets, 0x1,
sizeof(d->arch.vgic.shared_irqs[i].v2.itargets));
- /* We rely on gicv_setup() to initialize dbase(vGIC distributor base) */
register_mmio_handler(d, &vgic_v2_distr_mmio_handler, d->arch.vgic.dbase,
PAGE_SIZE);
@@ -584,10 +600,19 @@ static const struct vgic_ops vgic_v2_ops = {
.domain_init = vgic_v2_domain_init,
.get_irq_priority = vgic_v2_get_irq_priority,
.get_target_vcpu = vgic_v2_get_target_vcpu,
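+ /* The GICv2 target list is 8 bits wide, so at most 8 vCPUs can be addressed. */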
+ .max_vcpus = 8,
};
int vgic_v2_init(struct domain *d)
{
+ if ( !vgic_v2_hw.enabled )
+ {
+ printk(XENLOG_G_ERR
+ "d%d: vGICv2 is not supported on this platform.\n",
+ d->domain_id);
+ return -ENODEV;
+ }
+
register_vgic_ops(d, &vgic_v2_ops);
return 0;
diff --git a/xen/arch/arm/vgic-v3.c b/xen/arch/arm/vgic-v3.c
index d0f1ea1..f1c482d 100644
--- a/xen/arch/arm/vgic-v3.c
+++ b/xen/arch/arm/vgic-v3.c
@@ -27,10 +27,8 @@
#include <xen/sched.h>
#include <xen/sizes.h>
#include <asm/current.h>
-#include <asm/device.h>
#include <asm/mmio.h>
#include <asm/gic_v3_defs.h>
-#include <asm/gic.h>
#include <asm/vgic.h>
/* GICD_PIDRn register values for ARM implementations */
@@ -45,42 +43,64 @@
#define GICV3_GICR_PIDR2 GICV3_GICD_PIDR2
#define GICV3_GICR_PIDR4 GICV3_GICD_PIDR4
-static struct vcpu *vgic_v3_irouter_to_vcpu(struct vcpu *v, uint64_t irouter)
+/*
+ * GICD_CTLR default value:
+ * - No GICv2 compatibility => ARE = 1
+ */
+#define VGICD_CTLR_DEFAULT (GICD_CTLR_ARE_NS)
+
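+/* Host GICv3 layout, recorded by the GIC driver via vgic_v3_setup_hw(). */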
+static struct {
+ bool_t enabled;
+ /* Distributor interface address */
+ paddr_t dbase;
+ /* Re-distributor regions */
+ unsigned int nr_rdist_regions;
+ const struct rdist_region *regions;
+ uint32_t rdist_stride; /* Re-distributor stride */
+} vgic_v3_hw;
+
+void vgic_v3_setup_hw(paddr_t dbase,
+ unsigned int nr_rdist_regions,
+ const struct rdist_region *regions,
+ uint32_t rdist_stride)
{
- irouter &= ~(GICD_IROUTER_SPI_MODE_ANY);
- irouter = irouter & MPIDR_AFF0_MASK;
-
- return v->domain->vcpu[irouter];
+ vgic_v3_hw.enabled = 1;
+ vgic_v3_hw.dbase = dbase;
+ vgic_v3_hw.nr_rdist_regions = nr_rdist_regions;
+ vgic_v3_hw.regions = regions;
+ vgic_v3_hw.rdist_stride = rdist_stride;
}
-static uint64_t vgic_v3_vcpu_to_irouter(struct vcpu *v,
- unsigned int vcpu_id)
+static struct vcpu *vgic_v3_irouter_to_vcpu(struct domain *d, uint64_t irouter)
{
- uint64_t irq_affinity;
- struct vcpu *v_target;
+ unsigned int vcpu_id;
+
+ /*
+ * When the Interrupt Routing Mode is set, the IRQ may target any vCPU.
+ * For simplicity, the IRQ is always routed to vCPU0.
+ */
+ if ( irouter & GICD_IROUTER_SPI_MODE_ANY )
+ return d->vcpu[0];
- v_target = v->domain->vcpu[vcpu_id];
- irq_affinity = (MPIDR_AFFINITY_LEVEL(v_target->arch.vmpidr, 3) << 32 |
- MPIDR_AFFINITY_LEVEL(v_target->arch.vmpidr, 2) << 16 |
- MPIDR_AFFINITY_LEVEL(v_target->arch.vmpidr, 1) << 8 |
- MPIDR_AFFINITY_LEVEL(v_target->arch.vmpidr, 0));
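+ /* Convert the MPIDR-style affinity value into a linear vCPU id. */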
+ vcpu_id = vaffinity_to_vcpuid(irouter);
+ if ( vcpu_id >= d->max_vcpus )
+ return NULL;
- return irq_affinity;
+ return d->vcpu[vcpu_id];
}
static struct vcpu *vgic_v3_get_target_vcpu(struct vcpu *v, unsigned int irq)
{
- uint64_t target;
+ struct vcpu *v_target;
struct vgic_irq_rank *rank = vgic_rank_irq(v, irq);
ASSERT(spin_is_locked(&rank->lock));
- target = rank->v3.irouter[irq % 32];
- target &= ~(GICD_IROUTER_SPI_MODE_ANY);
- target &= MPIDR_AFF0_MASK;
- ASSERT(target >= 0 && target < v->domain->max_vcpus);
+ v_target = vgic_v3_irouter_to_vcpu(v->domain, rank->v3.irouter[irq % 32]);
+
+ ASSERT(v_target != NULL);
- return v->domain->vcpu[target];
+ return v_target;
}
static int __vgic_v3_rdistr_rd_mmio_read(struct vcpu *v, mmio_info_t *info,
@@ -95,7 +115,7 @@ static int __vgic_v3_rdistr_rd_mmio_read(struct vcpu *v, mmio_info_t *info,
{
case GICR_CTLR:
/* We have not implemented LPI's, read zero */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICR_IIDR:
if ( dabt.size != DABT_WORD ) goto bad_width;
*r = GICV3_GICR_IIDR_VAL;
@@ -108,13 +128,17 @@ static int __vgic_v3_rdistr_rd_mmio_read(struct vcpu *v, mmio_info_t *info,
MPIDR_AFFINITY_LEVEL(v->arch.vmpidr, 1) << 40 |
MPIDR_AFFINITY_LEVEL(v->arch.vmpidr, 0) << 32);
*r = aff;
+
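+ /* The last re-distributor of a region must advertise GICR_TYPER.Last. */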
+ if ( v->arch.vgic.flags & VGIC_V3_RDIST_LAST )
+ *r |= GICR_TYPER_LAST;
+
return 1;
case GICR_STATUSR:
/* Not implemented */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICR_WAKER:
/* Power management is not implemented */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICR_SETLPIR:
/* WO. Read as zero */
goto read_as_zero_64;
@@ -159,17 +183,17 @@ static int __vgic_v3_rdistr_rd_mmio_read(struct vcpu *v, mmio_info_t *info,
return 1;
case GICR_PIDR3:
/* Manufacture/customer defined */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICR_PIDR4:
if ( dabt.size != DABT_WORD ) goto bad_width;
*r = GICV3_GICR_PIDR4;
return 1;
case GICR_PIDR5 ... GICR_PIDR7:
/* Reserved0 */
- goto read_as_zero;
+ goto read_as_zero_32;
default:
printk(XENLOG_G_ERR
- "%pv: vGICR: read r%d offset %#08x\n not found",
+ "%pv: vGICR: unhandled read r%d offset %#08x\n",
v, dabt.reg, gicr_reg);
return 0;
}
@@ -184,7 +208,7 @@ read_as_zero_64:
*r = 0;
return 1;
-read_as_zero:
+read_as_zero_32:
if ( dabt.size != DABT_WORD ) goto bad_width;
*r = 0;
return 1;
@@ -201,19 +225,19 @@ static int __vgic_v3_rdistr_rd_mmio_write(struct vcpu *v, mmio_info_t *info,
{
case GICR_CTLR:
/* LPI's not implemented */
- goto write_ignore;
+ goto write_ignore_32;
case GICR_IIDR:
/* RO */
- goto write_ignore;
+ goto write_ignore_32;
case GICR_TYPER:
/* RO */
goto write_ignore_64;
case GICR_STATUSR:
/* Not implemented */
- goto write_ignore;
+ goto write_ignore_32;
case GICR_WAKER:
/* Power mgmt not implemented */
- goto write_ignore;
+ goto write_ignore_32;
case GICR_SETLPIR:
/* LPI is not implemented */
goto write_ignore_64;
@@ -234,7 +258,7 @@ static int __vgic_v3_rdistr_rd_mmio_write(struct vcpu *v, mmio_info_t *info,
goto write_ignore_64;
case GICR_SYNCR:
/* RO */
- goto write_ignore;
+ goto write_ignore_32;
case GICR_MOVLPIR:
/* LPI is not implemented */
goto write_ignore_64;
@@ -243,9 +267,9 @@ static int __vgic_v3_rdistr_rd_mmio_write(struct vcpu *v, mmio_info_t *info,
goto write_ignore_64;
case GICR_PIDR7... GICR_PIDR0:
/* RO */
- goto write_ignore;
+ goto write_ignore_32;
default:
- printk(XENLOG_G_ERR "%pv: vGICR: write r%d offset %#08x\n not found",
+ printk(XENLOG_G_ERR "%pv: vGICR: unhandled write r%d offset %#08x\n",
v, dabt.reg, gicr_reg);
return 0;
}
@@ -260,13 +284,13 @@ write_ignore_64:
if ( dabt.size != DABT_DOUBLE_WORD ) goto bad_width;
return 1;
-write_ignore:
+write_ignore_32:
if ( dabt.size != DABT_WORD ) goto bad_width;
return 1;
}
-static int __vgic_v3_distr_common_mmio_read(struct vcpu *v, mmio_info_t *info,
- uint32_t reg)
+static int __vgic_v3_distr_common_mmio_read(const char *name, struct vcpu *v,
+ mmio_info_t *info, uint32_t reg)
{
struct hsr_dabt dabt = info->dabt;
struct cpu_user_regs *regs = guest_cpu_user_regs();
@@ -278,6 +302,7 @@ static int __vgic_v3_distr_common_mmio_read(struct vcpu *v, mmio_info_t *info,
{
case GICD_IGROUPR ... GICD_IGROUPRN:
/* We do not implement security extensions for guests, read zero */
+ if ( dabt.size != DABT_WORD ) goto bad_width;
goto read_as_zero;
case GICD_ISENABLER ... GICD_ISENABLERN:
if ( dabt.size != DABT_WORD ) goto bad_width;
@@ -295,38 +320,16 @@ static int __vgic_v3_distr_common_mmio_read(struct vcpu *v, mmio_info_t *info,
*r = rank->ienable;
vgic_unlock_rank(v, rank, flags);
return 1;
+ /* Reading the pending status of an IRQ via GICD/GICR is not supported */
case GICD_ISPENDR ... GICD_ISPENDRN:
- if ( dabt.size != DABT_BYTE && dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, reg - GICD_ISPENDR, DABT_WORD);
- if ( rank == NULL ) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = vgic_byte_read(rank->ipend, dabt.sign, reg);
- vgic_unlock_rank(v, rank, flags);
- return 1;
case GICD_ICPENDR ... GICD_ICPENDRN:
- if ( dabt.size != DABT_BYTE && dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, reg - GICD_ICPENDR, DABT_WORD);
- if ( rank == NULL ) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = vgic_byte_read(rank->ipend, dabt.sign, reg);
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ goto read_as_zero;
+
+ /* Reading the active status of an IRQ via GICD/GICR is not supported */
case GICD_ISACTIVER ... GICD_ISACTIVERN:
- if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, reg - GICD_ISACTIVER, DABT_WORD);
- if ( rank == NULL ) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = rank->iactive;
- vgic_unlock_rank(v, rank, flags);
- return 1;
case GICD_ICACTIVER ... GICD_ICACTIVERN:
- if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, reg - GICD_ICACTIVER, DABT_WORD);
- if ( rank == NULL ) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = rank->iactive;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ goto read_as_zero;
+
case GICD_IPRIORITYR ... GICD_IPRIORITYRN:
if ( dabt.size != DABT_BYTE && dabt.size != DABT_WORD ) goto bad_width;
rank = vgic_rank_offset(v, 8, reg - GICD_IPRIORITYR, DABT_WORD);
@@ -349,26 +352,24 @@ static int __vgic_v3_distr_common_mmio_read(struct vcpu *v, mmio_info_t *info,
return 1;
default:
printk(XENLOG_G_ERR
- "%pv: vGICD/vGICR: unhandled read r%d offset %#08x\n",
- v, dabt.reg, reg);
+ "%pv: %s: unhandled read r%d offset %#08x\n",
+ v, name, dabt.reg, reg);
return 0;
}
bad_width:
- printk(XENLOG_G_ERR
- "%pv: vGICD/vGICR: bad read width %d r%d offset %#08x\n",
- v, dabt.size, dabt.reg, reg);
+ printk(XENLOG_G_ERR "%pv: %s: bad read width %d r%d offset %#08x\n",
+ v, name, dabt.size, dabt.reg, reg);
domain_crash_synchronous();
return 0;
read_as_zero:
- if ( dabt.size != DABT_WORD ) goto bad_width;
*r = 0;
return 1;
}
-static int __vgic_v3_distr_common_mmio_write(struct vcpu *v, mmio_info_t *info,
- uint32_t reg)
+static int __vgic_v3_distr_common_mmio_write(const char *name, struct vcpu *v,
+ mmio_info_t *info, uint32_t reg)
{
struct hsr_dabt dabt = info->dabt;
struct cpu_user_regs *regs = guest_cpu_user_regs();
@@ -381,7 +382,7 @@ static int __vgic_v3_distr_common_mmio_write(struct vcpu *v, mmio_info_t *info,
{
case GICD_IGROUPR ... GICD_IGROUPRN:
/* We do not implement security extensions for guests, write ignore */
- goto write_ignore;
+ goto write_ignore_32;
case GICD_ISENABLER ... GICD_ISENABLERN:
if ( dabt.size != DABT_WORD ) goto bad_width;
rank = vgic_rank_offset(v, 1, reg - GICD_ISENABLER, DABT_WORD);
@@ -406,36 +407,32 @@ static int __vgic_v3_distr_common_mmio_write(struct vcpu *v, mmio_info_t *info,
return 1;
case GICD_ISPENDR ... GICD_ISPENDRN:
if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, reg - GICD_ISPENDR, DABT_WORD);
- if ( rank == NULL ) goto write_ignore;
- vgic_lock_rank(v, rank, flags);
- rank->ipend = *r;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ printk(XENLOG_G_ERR
+ "%pv: %s: unhandled word write %#"PRIregister" to ISPENDR%d\n",
+ v, name, *r, reg - GICD_ISPENDR);
+ return 0;
+
case GICD_ICPENDR ... GICD_ICPENDRN:
if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, reg - GICD_ICPENDR, DABT_WORD);
- if ( rank == NULL ) goto write_ignore;
- vgic_lock_rank(v, rank, flags);
- rank->ipend &= ~*r;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ printk(XENLOG_G_ERR
+ "%pv: %s: unhandled word write %#"PRIregister" to ICPENDR%d\n",
+ v, name, *r, reg - GICD_ICPENDR);
+ return 0;
+
case GICD_ISACTIVER ... GICD_ISACTIVERN:
if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, reg - GICD_ISACTIVER, DABT_WORD);
- if ( rank == NULL ) goto write_ignore;
- vgic_lock_rank(v, rank, flags);
- rank->iactive &= ~*r;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ printk(XENLOG_G_ERR
+ "%pv: %s: unhandled word write %#"PRIregister" to ISACTIVER%d\n",
+ v, name, *r, reg - GICD_ISACTIVER);
+ return 0;
+
case GICD_ICACTIVER ... GICD_ICACTIVERN:
if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, reg - GICD_ICACTIVER, DABT_WORD);
- if ( rank == NULL ) goto write_ignore;
- vgic_lock_rank(v, rank, flags);
- rank->iactive &= ~*r;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ printk(XENLOG_G_ERR
+ "%pv: %s: unhandled word write %#"PRIregister" to ICACTIVER%d\n",
+ v, name, *r, reg - GICD_ICACTIVER);
+ return 0;
+
case GICD_IPRIORITYR ... GICD_IPRIORITYRN:
if ( dabt.size != DABT_BYTE && dabt.size != DABT_WORD ) goto bad_width;
rank = vgic_rank_offset(v, 8, reg - GICD_IPRIORITYR, DABT_WORD);
@@ -450,7 +447,7 @@ static int __vgic_v3_distr_common_mmio_write(struct vcpu *v, mmio_info_t *info,
vgic_unlock_rank(v, rank, flags);
return 1;
case GICD_ICFGR: /* Restricted to configure SGIs */
- goto write_ignore;
+ goto write_ignore_32;
case GICD_ICFGR + 4 ... GICD_ICFGRN: /* PPI + SPIs */
/* Whether ICFGR1 (for the PPIs) is programmable is
implementation defined. We choose to make it programmable */
@@ -463,20 +460,21 @@ static int __vgic_v3_distr_common_mmio_write(struct vcpu *v, mmio_info_t *info,
return 1;
default:
printk(XENLOG_G_ERR
- "%pv: vGICD/vGICR: unhandled write r%d=%"PRIregister" offset %#08x\n",
- v, dabt.reg, *r, reg);
+ "%pv: %s: unhandled write r%d=%"PRIregister" offset %#08x\n",
+ v, name, dabt.reg, *r, reg);
return 0;
}
bad_width:
printk(XENLOG_G_ERR
- "%pv: vGICD/vGICR: bad write width %d r%d=%"PRIregister" offset %#08x\n",
- v, dabt.size, dabt.reg, *r, reg);
+ "%pv: %s: bad write width %d r%d=%"PRIregister" offset %#08x\n",
+ v, name, dabt.size, dabt.reg, *r, reg);
domain_crash_synchronous();
return 0;
-write_ignore:
+write_ignore_32:
if ( dabt.size != DABT_WORD ) goto bad_width;
+write_ignore:
return 1;
}
@@ -486,14 +484,12 @@ static int vgic_v3_rdistr_sgi_mmio_read(struct vcpu *v, mmio_info_t *info,
struct hsr_dabt dabt = info->dabt;
struct cpu_user_regs *regs = guest_cpu_user_regs();
register_t *r = select_user_reg(regs, dabt.reg);
- struct vgic_irq_rank *rank;
- unsigned long flags;
switch ( gicr_reg )
{
case GICR_IGRPMODR0:
/* We do not implement security extensions for guests, read zero */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICR_IGROUPR0:
case GICR_ISENABLER0:
case GICR_ICENABLER0:
@@ -505,29 +501,21 @@ static int vgic_v3_rdistr_sgi_mmio_read(struct vcpu *v, mmio_info_t *info,
* The above register offsets are common with the GICD,
* so handle them in the common GICD code.
*/
- return __vgic_v3_distr_common_mmio_read(v, info, gicr_reg);
+ return __vgic_v3_distr_common_mmio_read("vGICR: SGI", v, info,
+ gicr_reg);
+
+ /* Reading the pending status of an SGI via GICR is not supported */
case GICR_ISPENDR0:
- if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicr_reg - GICR_ISPENDR0, DABT_WORD);
- if ( rank == NULL ) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = rank->pendsgi;
- vgic_unlock_rank(v, rank, flags);
- return 1;
case GICR_ICPENDR0:
- if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicr_reg - GICR_ICPENDR0, DABT_WORD);
- if ( rank == NULL ) goto read_as_zero;
- vgic_lock_rank(v, rank, flags);
- *r = rank->pendsgi;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ goto read_as_zero;
+
case GICR_NSACR:
- if ( dabt.size != DABT_WORD ) goto bad_width;
- return 1;
+ /* We do not implement security extensions for guests, read zero */
+ goto read_as_zero_32;
+
default:
printk(XENLOG_G_ERR
- "%pv: vGICR: SGI: read r%d offset %#08x\n not found",
+ "%pv: vGICR: SGI: unhandled read r%d offset %#08x\n",
v, dabt.reg, gicr_reg);
return 0;
}
@@ -537,8 +525,9 @@ bad_width:
domain_crash_synchronous();
return 0;
-read_as_zero:
+read_as_zero_32:
if ( dabt.size != DABT_WORD ) goto bad_width;
+read_as_zero:
*r = 0;
return 1;
}
@@ -549,14 +538,12 @@ static int vgic_v3_rdistr_sgi_mmio_write(struct vcpu *v, mmio_info_t *info,
struct hsr_dabt dabt = info->dabt;
struct cpu_user_regs *regs = guest_cpu_user_regs();
register_t *r = select_user_reg(regs, dabt.reg);
- struct vgic_irq_rank *rank;
- unsigned long flags;
switch ( gicr_reg )
{
case GICR_IGRPMODR0:
/* We do not implement security extensions for guests, write ignore */
- goto write_ignore;
+ goto write_ignore_32;
case GICR_IGROUPR0:
case GICR_ISENABLER0:
case GICR_ICENABLER0:
@@ -568,31 +555,28 @@ static int vgic_v3_rdistr_sgi_mmio_write(struct vcpu *v, mmio_info_t *info,
* The above register offsets are common with the GICD,
* so handle them in the common GICD code.
*/
- return __vgic_v3_distr_common_mmio_write(v, info, gicr_reg);
+ return __vgic_v3_distr_common_mmio_write("vGICR: SGI", v,
+ info, gicr_reg);
case GICR_ISPENDR0:
if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicr_reg - GICR_ISACTIVER0, DABT_WORD);
- if ( rank == NULL ) goto write_ignore;
- vgic_lock_rank(v, rank, flags);
- /* TODO: we just store the SGI pending status. Handle it properly */
- rank->pendsgi |= *r;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ printk(XENLOG_G_ERR
+ "%pv: vGICR: SGI: unhandled word write %#"PRIregister" to ISPENDR0\n",
+ v, *r);
+ return 0;
+
case GICR_ICPENDR0:
if ( dabt.size != DABT_WORD ) goto bad_width;
- rank = vgic_rank_offset(v, 1, gicr_reg - GICR_ISACTIVER0, DABT_WORD);
- if ( rank == NULL ) goto write_ignore;
- vgic_lock_rank(v, rank, flags);
- /* TODO: we just store the SGI pending status. Handle it properly */
- rank->pendsgi &= ~*r;
- vgic_unlock_rank(v, rank, flags);
- return 1;
+ printk(XENLOG_G_ERR
+ "%pv: vGICR: SGI: unhandled word write %#"PRIregister" to ICPENDR0\n",
+ v, *r);
+ return 0;
+
case GICR_NSACR:
/* We do not implement security extensions for guests, write ignore */
- goto write_ignore;
+ goto write_ignore_32;
default:
printk(XENLOG_G_ERR
- "%pv: vGICR: SGI: write r%d offset %#08x\n not found",
+ "%pv: vGICR: SGI: unhandled write r%d offset %#08x\n",
v, dabt.reg, gicr_reg);
return 0;
}
@@ -604,20 +588,61 @@ bad_width:
domain_crash_synchronous();
return 0;
-write_ignore:
+write_ignore_32:
if ( dabt.size != DABT_WORD ) goto bad_width;
return 1;
}
+static inline struct vcpu *get_vcpu_from_rdist(paddr_t gpa,
+ struct vcpu *v,
+ uint32_t *offset)
+{
+ struct domain *d = v->domain;
+ uint32_t stride = d->arch.vgic.rdist_stride;
+ paddr_t base;
+ int i, vcpu_id;
+ struct vgic_rdist_region *region;
+
+ *offset = gpa & (stride - 1);
+ base = gpa & ~((paddr_t)stride - 1);
+
+ /* Fast path: the VCPU is trying to access its re-distributor */
+ if ( likely(v->arch.vgic.rdist_base == base) )
+ return v;
+
+ /* Slow path: the VCPU is trying to access another re-distributor */
+
+ /*
+ * Find the region where the re-distributor lives. For this purpose,
+ * we look one region ahead, as only the MMIO ranges of
+ * re-distributors trap here.
+ * Note: the regions were sorted during GIC initialization.
+ */
+ for ( i = 1; i < d->arch.vgic.nr_regions; i++ )
+ {
+ if ( base < d->arch.vgic.rdist_regions[i].base )
+ break;
+ }
+
+ region = &d->arch.vgic.rdist_regions[i - 1];
+
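+ /* Each vCPU's re-distributor occupies one stride within its region. */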
+ vcpu_id = region->first_cpu + ((base - region->base) / stride);
+
+ if ( unlikely(vcpu_id >= d->max_vcpus) )
+ return NULL;
+
+ return d->vcpu[vcpu_id];
+}
+
static int vgic_v3_rdistr_mmio_read(struct vcpu *v, mmio_info_t *info)
{
uint32_t offset;
- if ( v->domain->arch.vgic.rdist_stride != 0 )
- offset = info->gpa & (v->domain->arch.vgic.rdist_stride - 1);
- else
- /* If stride is not set. Default 128K */
- offset = info->gpa & (SZ_128K - 1);
+ perfc_incr(vgicr_reads);
+
+ v = get_vcpu_from_rdist(info->gpa, v, &offset);
+ if ( unlikely(!v) )
+ return 0;
if ( offset < SZ_64K )
return __vgic_v3_rdistr_rd_mmio_read(v, info, offset);
@@ -635,11 +660,11 @@ static int vgic_v3_rdistr_mmio_write(struct vcpu *v, mmio_info_t *info)
{
uint32_t offset;
- if ( v->domain->arch.vgic.rdist_stride != 0 )
- offset = info->gpa & (v->domain->arch.vgic.rdist_stride - 1);
- else
- /* If stride is not set. Default 128K */
- offset = info->gpa & (SZ_128K - 1);
+ perfc_incr(vgicr_writes);
+
+ v = get_vcpu_from_rdist(info->gpa, v, &offset);
+ if ( unlikely(!v) )
+ return 0;
if ( offset < SZ_64K )
return __vgic_v3_rdistr_rd_mmio_write(v, info, offset);
@@ -660,10 +685,10 @@ static int vgic_v3_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
register_t *r = select_user_reg(regs, dabt.reg);
struct vgic_irq_rank *rank;
unsigned long flags;
- uint64_t irouter;
- unsigned int vcpu_id;
int gicd_reg = (int)(info->gpa - v->domain->arch.vgic.dbase);
+ perfc_incr(vgicd_reads);
+
switch ( gicd_reg )
{
case GICD_CTLR:
@@ -688,7 +713,7 @@ static int vgic_v3_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
if ( dabt.size != DABT_WORD ) goto bad_width;
/* No secure world support for guests. */
*r = ((ncpus - 1) << GICD_TYPE_CPUS_SHIFT |
- ((v->domain->arch.vgic.nr_spis / 32) & GICD_TYPE_LINES));
+ DIV_ROUND_UP(v->domain->arch.vgic.nr_spis, 32));
*r |= (irq_bits - 1) << GICD_TYPE_ID_BITS_SHIFT;
@@ -699,7 +724,7 @@ static int vgic_v3_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
* Optional, Not implemented for now.
* Update to support guest debugging.
*/
- goto read_as_zero;
+ goto read_as_zero_32;
case GICD_IIDR:
if ( dabt.size != DABT_WORD ) goto bad_width;
*r = GICV3_GICD_IIDR_VAL;
@@ -707,7 +732,7 @@ static int vgic_v3_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
case 0x020 ... 0x03c:
case 0xc000 ... 0xffcc:
/* Implementation defined -- read as zero */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICD_IGROUPR ... GICD_IGROUPRN:
case GICD_ISENABLER ... GICD_ISENABLERN:
case GICD_ICENABLER ... GICD_ICENABLERN:
@@ -720,7 +745,7 @@ static int vgic_v3_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
* All the above registers are common to the GICR and GICD,
* so manage them in the common code.
*/
- return __vgic_v3_distr_common_mmio_read(v, info, gicd_reg);
+ return __vgic_v3_distr_common_mmio_read("vGICD", v, info, gicd_reg);
case GICD_IROUTER ... GICD_IROUTER31:
/* SGI/PPI is RES0 */
goto read_as_zero_64;
@@ -730,31 +755,22 @@ static int vgic_v3_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
DABT_DOUBLE_WORD);
if ( rank == NULL ) goto read_as_zero;
vgic_lock_rank(v, rank, flags);
- irouter = rank->v3.irouter[REG_RANK_INDEX(64,
- (gicd_reg - GICD_IROUTER), DABT_DOUBLE_WORD)];
- /* XXX: bit[31] stores IRQ mode. Just return */
- if ( irouter & GICD_IROUTER_SPI_MODE_ANY )
- {
- *r = GICD_IROUTER_SPI_MODE_ANY;
- vgic_unlock_rank(v, rank, flags);
- return 1;
- }
- vcpu_id = irouter;
- *r = vgic_v3_vcpu_to_irouter(v, vcpu_id);
+ *r = rank->v3.irouter[REG_RANK_INDEX(64,
+ (gicd_reg - GICD_IROUTER), DABT_DOUBLE_WORD)];
vgic_unlock_rank(v, rank, flags);
return 1;
case GICD_NSACR ... GICD_NSACRN:
/* We do not implement security extensions for guests, read zero */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICD_SGIR:
/* SGIs are generated via the ICC_SGI1R_EL1 system register when SRE is set. So read as zero */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICD_CPENDSGIR ... GICD_CPENDSGIRN:
/* Replaced with GICR_ICPENDR0. So read as zero */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICD_SPENDSGIR ... GICD_SPENDSGIRN:
/* Replaced with GICR_ISPENDR0. So read as zero */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICD_PIDR0:
/* GICv3 identification value */
if ( dabt.size != DABT_WORD ) goto bad_width;
@@ -772,7 +788,7 @@ static int vgic_v3_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
return 1;
case GICD_PIDR3:
/* GICv3 identification value. Manufacturer/Customer defined */
- goto read_as_zero;
+ goto read_as_zero_32;
case GICD_PIDR4:
/* GICv3 identification value */
if ( dabt.size != DABT_WORD ) goto bad_width;
@@ -780,7 +796,7 @@ static int vgic_v3_distr_mmio_read(struct vcpu *v, mmio_info_t *info)
return 1;
case GICD_PIDR5 ... GICD_PIDR7:
/* Reserved0 */
- goto read_as_zero;
+ goto read_as_zero_32;
case 0x00c:
case 0x044:
case 0x04c:
@@ -809,10 +825,14 @@ read_as_zero_64:
*r = 0;
return 1;
-read_as_zero:
+read_as_zero_32:
if ( dabt.size != DABT_WORD ) goto bad_width;
*r = 0;
return 1;
+
+read_as_zero:
+ *r = 0;
+ return 1;
}
static int vgic_v3_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
@@ -822,45 +842,54 @@ static int vgic_v3_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
register_t *r = select_user_reg(regs, dabt.reg);
struct vgic_irq_rank *rank;
unsigned long flags;
- uint64_t new_irouter, new_target, old_target;
+ uint64_t new_irouter, old_irouter;
struct vcpu *old_vcpu, *new_vcpu;
int gicd_reg = (int)(info->gpa - v->domain->arch.vgic.dbase);
+ perfc_incr(vgicd_writes);
+
switch ( gicd_reg )
{
case GICD_CTLR:
if ( dabt.size != DABT_WORD ) goto bad_width;
- /* Ignore all but the enable bit */
- v->domain->arch.vgic.ctlr = (*r) & GICD_CTL_ENABLE;
+
+ vgic_lock(v);
+ /* Only EnableGrp1A can be changed */
+ if ( *r & GICD_CTLR_ENABLE_G1A )
+ v->domain->arch.vgic.ctlr |= GICD_CTLR_ENABLE_G1A;
+ else
+ v->domain->arch.vgic.ctlr &= ~GICD_CTLR_ENABLE_G1A;
+ vgic_unlock(v);
+
return 1;
case GICD_TYPER:
/* RO -- write ignored */
- goto write_ignore;
+ goto write_ignore_32;
case GICD_IIDR:
/* RO -- write ignored */
- goto write_ignore;
+ goto write_ignore_32;
case GICD_STATUSR:
/* RO -- write ignored */
- goto write_ignore;
+ goto write_ignore_32;
case GICD_SETSPI_NSR:
/* Message based SPI is not implemented */
- goto write_ignore;
+ goto write_ignore_32;
case GICD_CLRSPI_NSR:
/* Message based SPI is not implemented */
- goto write_ignore;
+ goto write_ignore_32;
case GICD_SETSPI_SR:
/* Message based SPI is not implemented */
- goto write_ignore;
+ goto write_ignore_32;
case GICD_CLRSPI_SR:
/* Message based SPI is not implemented */
- goto write_ignore;
+ goto write_ignore_32;
case 0x020 ... 0x03c:
case 0xc000 ... 0xffcc:
/* Implementation defined -- write ignored */
printk(XENLOG_G_DEBUG
"%pv: vGICD: WI on implementation defined register offset %#08x\n",
v, gicd_reg);
- goto write_ignore;
+ goto write_ignore_32;
case GICD_IGROUPR ... GICD_IGROUPRN:
case GICD_ISENABLER ... GICD_ISENABLERN:
case GICD_ICENABLER ... GICD_ICENABLERN:
@@ -872,7 +901,7 @@ static int vgic_v3_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
case GICD_ICFGR ... GICD_ICFGRN:
/* The above registers are common with the GICR and GICD;
* manage them in the common code */
- return __vgic_v3_distr_common_mmio_write(v, info, gicd_reg);
+ return __vgic_v3_distr_common_mmio_write("vGICD", v, info, gicd_reg);
case GICD_IROUTER ... GICD_IROUTER31:
/* SGI/PPI is RES0 */
goto write_ignore_64;
@@ -880,53 +909,41 @@ static int vgic_v3_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
if ( dabt.size != DABT_DOUBLE_WORD ) goto bad_width;
rank = vgic_rank_offset(v, 64, gicd_reg - GICD_IROUTER,
DABT_DOUBLE_WORD);
- if ( rank == NULL ) goto write_ignore_64;
- BUG_ON(v->domain->max_vcpus > 8);
+ if ( rank == NULL ) goto write_ignore;
new_irouter = *r;
vgic_lock_rank(v, rank, flags);
- old_target = rank->v3.irouter[REG_RANK_INDEX(64,
- (gicd_reg - GICD_IROUTER), DABT_DOUBLE_WORD)];
- old_target &= ~(GICD_IROUTER_SPI_MODE_ANY);
- if ( new_irouter & GICD_IROUTER_SPI_MODE_ANY )
+ old_irouter = rank->v3.irouter[REG_RANK_INDEX(64,
+ (gicd_reg - GICD_IROUTER),
+ DABT_DOUBLE_WORD)];
+ old_vcpu = vgic_v3_irouter_to_vcpu(v->domain, old_irouter);
+ new_vcpu = vgic_v3_irouter_to_vcpu(v->domain, new_irouter);
+
+ if ( !new_vcpu )
{
+ printk(XENLOG_G_DEBUG
+ "%pv: vGICD: wrong irouter at offset %#08x val %#"PRIregister,
+ v, gicd_reg, *r);
+ vgic_unlock_rank(v, rank, flags);
/*
- * IRQ routing mode set. Route any one processor in the entire
- * system. We chose vcpu 0 and set IRQ mode bit[31] in irouter.
+ * TODO: Don't inject a fault to the guest when the MPIDR is
+ * not valid. From the spec, the interrupt should be
+ * ignored.
*/
- new_target = 0;
- new_vcpu = v->domain->vcpu[0];
- new_irouter = GICD_IROUTER_SPI_MODE_ANY;
- }
- else
- {
- new_target = new_irouter & MPIDR_AFF0_MASK;
- if ( new_target >= v->domain->max_vcpus )
- {
- printk(XENLOG_G_DEBUG
- "%pv: vGICD: wrong irouter at offset %#08x\n val 0x%lx vcpu %x",
- v, gicd_reg, new_target, v->domain->max_vcpus);
- vgic_unlock_rank(v, rank, flags);
- return 0;
- }
- new_vcpu = vgic_v3_irouter_to_vcpu(v, new_irouter);
+ return 0;
}
-
rank->v3.irouter[REG_RANK_INDEX(64, (gicd_reg - GICD_IROUTER),
DABT_DOUBLE_WORD)] = new_irouter;
- if ( old_target != new_target )
- {
- old_vcpu = v->domain->vcpu[old_target];
+ if ( old_vcpu != new_vcpu )
vgic_migrate_irq(old_vcpu, new_vcpu, (gicd_reg - GICD_IROUTER)/8);
- }
vgic_unlock_rank(v, rank, flags);
return 1;
case GICD_NSACR ... GICD_NSACRN:
/* We do not implement security extensions for guests, write ignore */
- goto write_ignore;
+ goto write_ignore_32;
case GICD_SGIR:
/* it is accessed as system register in GICv3 */
- goto write_ignore;
+ goto write_ignore_32;
case GICD_CPENDSGIR ... GICD_CPENDSGIRN:
/* Replaced with GICR_ICPENDR0. So ignore write */
if ( dabt.size != DABT_WORD ) goto bad_width;
@@ -937,7 +954,7 @@ static int vgic_v3_distr_mmio_write(struct vcpu *v, mmio_info_t *info)
return 0;
case GICD_PIDR7... GICD_PIDR0:
/* RO -- write ignore */
- goto write_ignore;
+ goto write_ignore_32;
case 0x00c:
case 0x044:
case 0x04c:
@@ -963,13 +980,16 @@ bad_width:
domain_crash_synchronous();
return 0;
-write_ignore:
+write_ignore_32:
if ( dabt.size != DABT_WORD ) goto bad_width;
return 1;
write_ignore_64:
if ( dabt.size != DABT_DOUBLE_WORD ) goto bad_width;
return 1;
+
+write_ignore:
+ return 1;
}
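
[Editor's note] The split of the old catch-all read_as_zero/write_ignore labels into _32/_64 variants is the pattern worth noting in this hunk: the suffixed labels validate the trapped access width before completing the emulation, while the unsuffixed ones are only reachable from paths that have already checked it. A minimal standalone sketch of that convention (the DABT_* values and function names here are illustrative, not the actual Xen definitions):

    #include <stdint.h>

    enum { DABT_WORD = 2, DABT_DOUBLE_WORD = 3 };

    /* Emulate a read of a 32-bit RAZ/WI register: reject wrong widths. */
    static int read_as_zero_32(unsigned int size, uint64_t *r)
    {
        if ( size != DABT_WORD )
            return 0;   /* bad width -- caller injects a data abort */
        *r = 0;
        return 1;
    }

    /* Variant for paths that have already validated the access width. */
    static int read_as_zero(uint64_t *r)
    {
        *r = 0;
        return 1;
    }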
static int vgic_v3_to_sgi(struct vcpu *v, register_t sgir)
@@ -977,28 +997,30 @@ static int vgic_v3_to_sgi(struct vcpu *v, register_t sgir)
int virq;
int irqmode;
enum gic_sgi_mode sgi_mode;
- unsigned long vcpu_mask = 0;
+ struct sgi_target target;
irqmode = (sgir >> ICH_SGI_IRQMODE_SHIFT) & ICH_SGI_IRQMODE_MASK;
virq = (sgir >> ICH_SGI_IRQ_SHIFT ) & ICH_SGI_IRQ_MASK;
- /* SGI's are injected at Rdist level 0. ignoring affinity 1, 2, 3 */
- vcpu_mask = sgir & ICH_SGI_TARGETLIST_MASK;
/* Map GIC sgi value to enum value */
switch ( irqmode )
{
case ICH_SGI_TARGET_LIST:
+ sgi_target_init(&target);
+ /* We assume that only AFF1 is used in ICC_SGI1R_EL1. */
+ target.aff1 = (sgir >> ICH_SGI_AFFINITY_LEVEL(1)) & ICH_SGI_AFFx_MASK;
+ target.list = sgir & ICH_SGI_TARGETLIST_MASK;
sgi_mode = SGI_TARGET_LIST;
break;
case ICH_SGI_TARGET_OTHERS:
sgi_mode = SGI_TARGET_OTHERS;
break;
default:
- gdprintk(XENLOG_WARNING, "Wrong irq mode in SGI1R_EL1 register\n");
+ gprintk(XENLOG_WARNING, "Wrong irq mode in SGI1R_EL1 register\n");
return 0;
}
- return vgic_to_sgi(v, sgir, sgi_mode, virq, vcpu_mask);
+ return vgic_to_sgi(v, sgir, sgi_mode, virq, &target);
}
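
[Editor's note] The new sgi_target decoding replaces the flat vcpu_mask: a vCPU id is reconstructed as aff1 * 16 + bit from the affinity-1 field plus the 16-bit target list of ICC_SGI1R_EL1. A self-contained sketch of that arithmetic; the shift and mask values mirror the GICv3 register layout but are written out here as assumptions rather than taken from Xen's headers:

    #include <stdint.h>
    #include <stdio.h>

    #define SGI_AFF1_SHIFT      16        /* assumed ICH_SGI_AFFINITY_LEVEL(1) */
    #define SGI_AFF_MASK        0xffULL
    #define SGI_TARGETLIST_MASK 0xffffULL

    int main(void)
    {
        /* AFF1 = 2, target list = CPUs 0 and 2 within that cluster. */
        uint64_t sgir = (2ULL << SGI_AFF1_SHIFT) | 0x0005;
        unsigned int aff1 = (sgir >> SGI_AFF1_SHIFT) & SGI_AFF_MASK;
        uint16_t list = sgir & SGI_TARGETLIST_MASK;
        unsigned int i;

        for ( i = 0; i < 16; i++ )
            if ( list & (1u << i) )
                printf("inject SGI on vcpu %u\n", (aff1 << 4) + i);
        /* prints vcpu 32 and vcpu 34 */
        return 0;
    }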
static int vgic_v3_emulate_sysreg(struct cpu_user_regs *regs, union hsr hsr)
@@ -1009,6 +1031,11 @@ static int vgic_v3_emulate_sysreg(struct cpu_user_regs *regs, union hsr hsr)
ASSERT (hsr.ec == HSR_EC_SYSREG);
+ if ( sysreg.read )
+ perfc_incr(vgic_sysreg_reads);
+ else
+ perfc_incr(vgic_sysreg_writes);
+
switch ( hsr.bits & HSR_SYSREG_REGS_MASK )
{
case HSR_SYSREG_ICC_SGI1R_EL1:
@@ -1017,7 +1044,7 @@ static int vgic_v3_emulate_sysreg(struct cpu_user_regs *regs, union hsr hsr)
return vgic_v3_to_sgi(v, *r);
else
{
- gdprintk(XENLOG_WARNING, "Reading SGI1R_EL1 - WO register\n");
+ gprintk(XENLOG_WARNING, "Reading SGI1R_EL1 - WO register\n");
return 0;
}
default:
@@ -1051,6 +1078,13 @@ static int vgic_v3_vcpu_init(struct vcpu *v)
{
int i;
uint64_t affinity;
+ paddr_t rdist_base;
+ struct vgic_rdist_region *region;
+ unsigned int last_cpu;
+
+ /* Convenient alias */
+ struct domain *d = v->domain;
+ uint32_t rdist_stride = d->arch.vgic.rdist_stride;
/* For SGI and PPI the target is always this CPU */
affinity = (MPIDR_AFFINITY_LEVEL(v->arch.vmpidr, 3) << 32 |
@@ -1061,6 +1095,45 @@ static int vgic_v3_vcpu_init(struct vcpu *v)
for ( i = 0 ; i < 32 ; i++ )
v->arch.vgic.private_irqs->v3.irouter[i] = affinity;
+ /*
+ * Find the region where the re-distributor lives. For this purpose,
+ * we look one region ahead as we have only the first CPU in hand.
+ */
+ for ( i = 1; i < d->arch.vgic.nr_regions; i++ )
+ {
+ if ( v->vcpu_id < d->arch.vgic.rdist_regions[i].first_cpu )
+ break;
+ }
+
+ region = &d->arch.vgic.rdist_regions[i - 1];
+
+ /* Get the base address of the redistributor */
+ rdist_base = region->base;
+ rdist_base += (v->vcpu_id - region->first_cpu) * rdist_stride;
+
+ /* Check if a valid region was found for the re-distributor */
+ if ( (rdist_base < region->base) ||
+ ((rdist_base + rdist_stride) > (region->base + region->size)) )
+ {
+ dprintk(XENLOG_ERR,
+ "d%u: Unable to find a re-distributor for VCPU %u\n",
+ d->domain_id, v->vcpu_id);
+ return -EINVAL;
+ }
+
+ v->arch.vgic.rdist_base = rdist_base;
+
+ /*
+ * If the redistributor is the last one of the contiguous region, or
+ * the vCPU is the last of the domain, set the VGIC_V3_RDIST_LAST flag.
+ * Note that we are assuming max_vcpus will never change.
+ */
+ last_cpu = (region->size / rdist_stride) + region->first_cpu - 1;
+
+ if ( v->vcpu_id == last_cpu || (v->vcpu_id == (d->max_vcpus - 1)) )
+ v->arch.vgic.flags |= VGIC_V3_RDIST_LAST;
+
return 0;
}
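
[Editor's note] The per-vCPU redistributor base is pure arithmetic over the region table: base + (vcpu_id - first_cpu) * stride, with a bounds check that the whole stride fits inside the region, and VGIC_V3_RDIST_LAST for the region's last slot (size / stride + first_cpu - 1) or the domain's last vCPU. A worked, runnable example with a hypothetical 256K region and the default 2 x 64K stride:

    #include <stdint.h>
    #include <stdio.h>

    struct rdist_region { uint64_t base, size; unsigned int first_cpu; };

    int main(void)
    {
        struct rdist_region r = { 0x08100000, 0x40000 /* 256K */, 0 };
        uint64_t stride = 2 * 0x10000;   /* two 64K pages per vCPU */
        unsigned int last = r.size / stride + r.first_cpu - 1;  /* == 1 */
        unsigned int vcpu;

        for ( vcpu = 0; vcpu < 3; vcpu++ )
        {
            uint64_t rdist = r.base + (vcpu - r.first_cpu) * stride;
            int fits = (rdist + stride) <= (r.base + r.size);
            printf("vcpu%u: %#llx %s%s\n", vcpu, (unsigned long long)rdist,
                   fits ? "ok" : "out of region",
                   (fits && vcpu == last) ? " (RDIST_LAST)" : "");
        }
        return 0;
    }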
@@ -1068,24 +1141,78 @@ static int vgic_v3_domain_init(struct domain *d)
{
int i, idx;
+ /*
+ * Domain 0 gets the hardware address.
+ * Guests get the virtual platform layout.
+ */
+ if ( is_hardware_domain(d) )
+ {
+ unsigned int first_cpu = 0;
+
+ d->arch.vgic.dbase = vgic_v3_hw.dbase;
+
+ d->arch.vgic.rdist_stride = vgic_v3_hw.rdist_stride;
+ /*
+ * If the stride is not set, the default stride for GICv3 is 2 * 64K:
+ * - first 64k page for Control and Physical LPIs
+ * - second 64k page for Control and Generation of SGIs
+ */
+ if ( !d->arch.vgic.rdist_stride )
+ d->arch.vgic.rdist_stride = 2 * SZ_64K;
+
+ for ( i = 0; i < vgic_v3_hw.nr_rdist_regions; i++ )
+ {
+ paddr_t size = vgic_v3_hw.regions[i].size;
+
+ d->arch.vgic.rdist_regions[i].base = vgic_v3_hw.regions[i].base;
+ d->arch.vgic.rdist_regions[i].size = size;
+
+ /* Set the first CPU handled by this region */
+ d->arch.vgic.rdist_regions[i].first_cpu = first_cpu;
+
+ first_cpu += size / d->arch.vgic.rdist_stride;
+ }
+ d->arch.vgic.nr_regions = vgic_v3_hw.nr_rdist_regions;
+ }
+ else
+ {
+ d->arch.vgic.dbase = GUEST_GICV3_GICD_BASE;
+
+ /* XXX: Only one Re-distributor region mapped for the guest */
+ BUILD_BUG_ON(GUEST_GICV3_RDIST_REGIONS != 1);
+
+ d->arch.vgic.nr_regions = GUEST_GICV3_RDIST_REGIONS;
+ d->arch.vgic.rdist_stride = GUEST_GICV3_RDIST_STRIDE;
+
+ /* The first redistributor should contain enough space for all CPUs */
+ BUILD_BUG_ON((GUEST_GICV3_GICR0_SIZE / GUEST_GICV3_RDIST_STRIDE) < MAX_VIRT_CPUS);
+ d->arch.vgic.rdist_regions[0].base = GUEST_GICV3_GICR0_BASE;
+ d->arch.vgic.rdist_regions[0].size = GUEST_GICV3_GICR0_SIZE;
+ d->arch.vgic.rdist_regions[0].first_cpu = 0;
+ }
+
/* By default deliver to CPU0 */
for ( i = 0; i < DOMAIN_NR_RANKS(d); i++ )
{
for ( idx = 0; idx < 32; idx++ )
d->arch.vgic.shared_irqs[i].v3.irouter[idx] = 0;
}
- /* We rely on gicv init to get dbase and size */
+
+ /* Register mmio handle for the Distributor */
register_mmio_handler(d, &vgic_distr_mmio_handler, d->arch.vgic.dbase,
- d->arch.vgic.dbase_size);
+ SZ_64K);
/*
- * Register mmio handler per redistributor region but not for
- * every sgi rdist region which is per core.
- * The redistributor region encompasses per core sgi region.
+ * Register mmio handler per contiguous region occupied by the
+ * redistributors. The handler will take care of choosing which
+ * redistributor is targeted.
*/
- for ( i = 0; i < d->arch.vgic.rdist_count; i++ )
+ for ( i = 0; i < d->arch.vgic.nr_regions; i++ )
register_mmio_handler(d, &vgic_rdistr_mmio_handler,
- d->arch.vgic.rbase[i], d->arch.vgic.rbase_size[i]);
+ d->arch.vgic.rdist_regions[i].base,
+ d->arch.vgic.rdist_regions[i].size);
+
+ d->arch.vgic.ctlr = VGICD_CTLR_DEFAULT;
return 0;
}
@@ -1096,10 +1223,23 @@ static const struct vgic_ops v3_ops = {
.get_irq_priority = vgic_v3_get_irq_priority,
.get_target_vcpu = vgic_v3_get_target_vcpu,
.emulate_sysreg = vgic_v3_emulate_sysreg,
+ /*
+ * We use both AFF1 and AFF0 in (v)MPIDR. Thus, the maximum number of
+ * vCPUs that can be supported is 4096 (== 256 * 16) in theory.
+ */
+ .max_vcpus = 4096,
};
int vgic_v3_init(struct domain *d)
{
+ if ( !vgic_v3_hw.enabled )
+ {
+ printk(XENLOG_G_ERR
+ "d%d: vGICv3 is not supported on this platform.\n",
+ d->domain_id);
+ return -ENODEV;
+ }
+
register_vgic_ops(d, &v3_ops);
return 0;
diff --git a/xen/arch/arm/vgic.c b/xen/arch/arm/vgic.c
index 41d3e48..a6835a8 100644
--- a/xen/arch/arm/vgic.c
+++ b/xen/arch/arm/vgic.c
@@ -24,6 +24,7 @@
#include <xen/softirq.h>
#include <xen/irq.h>
#include <xen/sched.h>
+#include <xen/perfc.h>
#include <asm/current.h>
@@ -60,20 +61,29 @@ struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned int irq)
return vgic_get_rank(v, rank);
}
-int domain_vgic_init(struct domain *d)
+static void vgic_init_pending_irq(struct pending_irq *p, unsigned int virq)
+{
+ INIT_LIST_HEAD(&p->inflight);
+ INIT_LIST_HEAD(&p->lr_queue);
+ p->irq = virq;
+}
+
+int domain_vgic_init(struct domain *d, unsigned int nr_spis)
{
int i;
+ int ret;
d->arch.vgic.ctlr = 0;
- if ( is_hardware_domain(d) )
- d->arch.vgic.nr_spis = gic_number_lines() - 32;
- else
- d->arch.vgic.nr_spis = 0; /* We don't need SPIs for the guest */
+ /* Limit the number of virtual SPIs supported to (1020 - 32) = 988 */
+ if ( nr_spis > (1020 - NR_LOCAL_IRQS) )
+ return -EINVAL;
- switch ( gic_hw_version() )
+ d->arch.vgic.nr_spis = nr_spis;
+
+ switch ( d->arch.vgic.version )
{
-#ifdef CONFIG_ARM_64
+#ifdef HAS_GICV3
case GIC_V3:
if ( vgic_v3_init(d) )
return -ENODEV;
@@ -84,6 +94,8 @@ int domain_vgic_init(struct domain *d)
return -ENODEV;
break;
default:
+ printk(XENLOG_G_ERR "d%d: Unknown vGIC version %u\n",
+ d->domain_id, d->arch.vgic.version);
return -ENODEV;
}
@@ -100,14 +112,23 @@ int domain_vgic_init(struct domain *d)
return -ENOMEM;
for (i=0; i<d->arch.vgic.nr_spis; i++)
- {
- INIT_LIST_HEAD(&d->arch.vgic.pending_irqs[i].inflight);
- INIT_LIST_HEAD(&d->arch.vgic.pending_irqs[i].lr_queue);
- }
+ vgic_init_pending_irq(&d->arch.vgic.pending_irqs[i], i + 32);
+
for (i=0; i<DOMAIN_NR_RANKS(d); i++)
spin_lock_init(&d->arch.vgic.shared_irqs[i].lock);
- d->arch.vgic.handler->domain_init(d);
+ ret = d->arch.vgic.handler->domain_init(d);
+ if ( ret )
+ return ret;
+
+ d->arch.vgic.allocated_irqs =
+ xzalloc_array(unsigned long, BITS_TO_LONGS(vgic_num_irqs(d)));
+ if ( !d->arch.vgic.allocated_irqs )
+ return -ENOMEM;
+
+ /* vIRQ0-15 (SGIs) are reserved */
+ for ( i = 0; i < NR_GIC_SGI; i++ )
+ set_bit(i, d->arch.vgic.allocated_irqs);
return 0;
}
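
[Editor's note] allocated_irqs is a plain bitmap sized to the domain's vIRQ space; SGIs 0-15 are pre-set so the allocator never hands them out. A minimal sketch of the initialisation, with Xen's xzalloc_array/set_bit replaced by hypothetical standalone equivalents:

    #include <limits.h>
    #include <stdlib.h>

    #define BITS_PER_LONG    (CHAR_BIT * sizeof(unsigned long))
    #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
    #define NR_GIC_SGI       16

    static unsigned long *alloc_virq_bitmap(unsigned int nr_irqs)
    {
        unsigned long *bits = calloc(BITS_TO_LONGS(nr_irqs), sizeof(*bits));
        unsigned int i;

        if ( bits )
            for ( i = 0; i < NR_GIC_SGI; i++ )   /* vIRQ0-15 reserved */
                bits[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);
        return bits;
    }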
@@ -119,8 +140,25 @@ void register_vgic_ops(struct domain *d, const struct vgic_ops *ops)
void domain_vgic_free(struct domain *d)
{
+ int i;
+ int ret;
+
+ for ( i = 0; i < (d->arch.vgic.nr_spis); i++ )
+ {
+ struct pending_irq *p = spi_to_pending(d, i + 32);
+
+ if ( p->desc )
+ {
+ ret = release_guest_irq(d, p->irq);
+ if ( ret )
+ dprintk(XENLOG_G_WARNING, "d%u: Failed to release virq %u ret = %d\n",
+ d->domain_id, p->irq, ret);
+ }
+ }
+
xfree(d->arch.vgic.shared_irqs);
xfree(d->arch.vgic.pending_irqs);
+ xfree(d->arch.vgic.allocated_irqs);
}
int vcpu_vgic_init(struct vcpu *v)
@@ -137,10 +175,7 @@ int vcpu_vgic_init(struct vcpu *v)
memset(&v->arch.vgic.pending_irqs, 0, sizeof(v->arch.vgic.pending_irqs));
for (i = 0; i < 32; i++)
- {
- INIT_LIST_HEAD(&v->arch.vgic.pending_irqs[i].inflight);
- INIT_LIST_HEAD(&v->arch.vgic.pending_irqs[i].lr_queue);
- }
+ vgic_init_pending_irq(&v->arch.vgic.pending_irqs[i], i);
INIT_LIST_HEAD(&v->arch.vgic.inflight_irqs);
INIT_LIST_HEAD(&v->arch.vgic.lr_pending);
@@ -182,6 +217,8 @@ void vgic_migrate_irq(struct vcpu *old, struct vcpu *new, unsigned int irq)
if ( test_bit(GIC_IRQ_GUEST_MIGRATING, &p->status) )
return;
+ perfc_incr(vgic_irq_migrates);
+
spin_lock_irqsave(&old->arch.vgic.lock, flags);
if ( list_empty(&p->inflight) )
@@ -283,60 +320,56 @@ void vgic_enable_irqs(struct vcpu *v, uint32_t r, int n)
}
}
-/* TODO: unsigned long is used to fit vcpu_mask.*/
int vgic_to_sgi(struct vcpu *v, register_t sgir, enum gic_sgi_mode irqmode, int virq,
- unsigned long vcpu_mask)
+ const struct sgi_target *target)
{
struct domain *d = v->domain;
int vcpuid;
int i;
-
- ASSERT(d->max_vcpus < 8*sizeof(vcpu_mask));
+ unsigned int base;
+ unsigned long int bitmap;
ASSERT( virq < 16 );
switch ( irqmode )
{
case SGI_TARGET_LIST:
+ perfc_incr(vgic_sgi_list);
+ base = target->aff1 << 4;
+ bitmap = target->list;
+ for_each_set_bit( i, &bitmap, sizeof(target->list) * 8 )
+ {
+ vcpuid = base + i;
+ if ( d->vcpu[vcpuid] == NULL || !is_vcpu_online(d->vcpu[vcpuid]) )
+ {
+ gprintk(XENLOG_WARNING, "VGIC: write r=%"PRIregister" \
+ target->list=%hx, wrong CPUTargetList \n",
+ sgir, target->list);
+ continue;
+ }
+ vgic_vcpu_inject_irq(d->vcpu[vcpuid], virq);
+ }
break;
case SGI_TARGET_OTHERS:
- /*
- * We expect vcpu_mask to be 0 for SGI_TARGET_OTHERS and
- * SGI_TARGET_SELF mode. So Force vcpu_mask to 0
- */
- vcpu_mask = 0;
+ perfc_incr(vgic_sgi_others);
for ( i = 0; i < d->max_vcpus; i++ )
{
if ( i != current->vcpu_id && d->vcpu[i] != NULL &&
is_vcpu_online(d->vcpu[i]) )
- set_bit(i, &vcpu_mask);
+ vgic_vcpu_inject_irq(d->vcpu[i], virq);
}
break;
case SGI_TARGET_SELF:
- /*
- * We expect vcpu_mask to be 0 for SGI_TARGET_OTHERS and
- * SGI_TARGET_SELF mode. So Force vcpu_mask to 0
- */
- vcpu_mask = 0;
- set_bit(current->vcpu_id, &vcpu_mask);
+ perfc_incr(vgic_sgi_self);
+ vgic_vcpu_inject_irq(d->vcpu[current->vcpu_id], virq);
break;
default:
- gdprintk(XENLOG_WARNING,
- "vGICD:unhandled GICD_SGIR write %"PRIregister" \
- with wrong mode\n", sgir);
+ gprintk(XENLOG_WARNING,
+ "vGICD:unhandled GICD_SGIR write %"PRIregister" \
+ with wrong mode\n", sgir);
return 0;
}
- for_each_set_bit( vcpuid, &vcpu_mask, d->max_vcpus )
- {
- if ( d->vcpu[vcpuid] != NULL && !is_vcpu_online(d->vcpu[vcpuid]) )
- {
- gdprintk(XENLOG_WARNING, "VGIC: write r=%"PRIregister" \
- vcpu_mask=%lx, wrong CPUTargetList\n", sgir, vcpu_mask);
- continue;
- }
- vgic_vcpu_inject_irq(d->vcpu[vcpuid], virq);
- }
return 1;
}
@@ -352,6 +385,13 @@ struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq)
return n;
}
+struct pending_irq *spi_to_pending(struct domain *d, unsigned int irq)
+{
+ ASSERT(irq >= NR_LOCAL_IRQS);
+
+ return &d->arch.vgic.pending_irqs[irq - 32];
+}
+
void vgic_clear_pending_irqs(struct vcpu *v)
{
struct pending_irq *p, *t;
@@ -364,16 +404,16 @@ void vgic_clear_pending_irqs(struct vcpu *v)
spin_unlock_irqrestore(&v->arch.vgic.lock, flags);
}
-void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq)
+void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq)
{
uint8_t priority;
- struct vgic_irq_rank *rank = vgic_rank_irq(v, irq);
- struct pending_irq *iter, *n = irq_to_pending(v, irq);
+ struct vgic_irq_rank *rank = vgic_rank_irq(v, virq);
+ struct pending_irq *iter, *n = irq_to_pending(v, virq);
unsigned long flags;
bool_t running;
vgic_lock_rank(v, rank, flags);
- priority = v->domain->arch.vgic.handler->get_irq_priority(v, irq);
+ priority = v->domain->arch.vgic.handler->get_irq_priority(v, virq);
vgic_unlock_rank(v, rank, flags);
spin_lock_irqsave(&v->arch.vgic.lock, flags);
@@ -389,16 +429,15 @@ void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq)
if ( !list_empty(&n->inflight) )
{
- gic_raise_inflight_irq(v, irq);
+ gic_raise_inflight_irq(v, virq);
goto out;
}
- n->irq = irq;
n->priority = priority;
/* the irq is enabled */
if ( test_bit(GIC_IRQ_GUEST_ENABLED, &n->status) )
- gic_raise_guest_irq(v, irq, priority);
+ gic_raise_guest_irq(v, virq, priority);
list_for_each_entry ( iter, &v->arch.vgic.inflight_irqs, inflight )
{
@@ -415,18 +454,21 @@ out:
running = v->is_running;
vcpu_unblock(v);
if ( running && v != current )
+ {
+ perfc_incr(vgic_cross_cpu_intr_inject);
smp_send_event_check_mask(cpumask_of(v->processor));
+ }
}
-void vgic_vcpu_inject_spi(struct domain *d, unsigned int irq)
+void vgic_vcpu_inject_spi(struct domain *d, unsigned int virq)
{
struct vcpu *v;
/* the IRQ needs to be an SPI */
- ASSERT(irq >= 32 && irq <= gic_number_lines());
+ ASSERT(virq >= 32 && virq <= vgic_num_irqs(d));
- v = vgic_get_target_vcpu(d->vcpu[0], irq);
- vgic_vcpu_inject_irq(v, irq);
+ v = vgic_get_target_vcpu(d->vcpu[0], virq);
+ vgic_vcpu_inject_irq(v, virq);
}
void arch_evtchn_inject(struct vcpu *v)
@@ -443,6 +485,51 @@ int vgic_emulate(struct cpu_user_regs *regs, union hsr hsr)
return v->domain->arch.vgic.handler->emulate_sysreg(regs, hsr);
}
+bool_t vgic_reserve_virq(struct domain *d, unsigned int virq)
+{
+ if ( virq >= vgic_num_irqs(d) )
+ return 0;
+
+ return !test_and_set_bit(virq, d->arch.vgic.allocated_irqs);
+}
+
+int vgic_allocate_virq(struct domain *d, bool_t spi)
+{
+ int first, end;
+ unsigned int virq;
+
+ if ( !spi )
+ {
+ /* We only allocate PPIs. SGIs are all reserved */
+ first = 16;
+ end = 32;
+ }
+ else
+ {
+ first = 32;
+ end = vgic_num_irqs(d);
+ }
+
+ /*
+ * There is no spinlock to protect allocated_irqs, therefore
+ * test_and_set_bit may fail. If so, retry.
+ */
+ do
+ {
+ virq = find_next_zero_bit(d->arch.vgic.allocated_irqs, end, first);
+ if ( virq >= end )
+ return -1;
+ }
+ while ( test_and_set_bit(virq, d->arch.vgic.allocated_irqs) );
+
+ return virq;
+}
+
+void vgic_free_virq(struct domain *d, unsigned int virq)
+{
+ clear_bit(virq, d->arch.vgic.allocated_irqs);
+}
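
[Editor's note] vgic_allocate_virq() is lock-free: find_next_zero_bit() is only a hint, the atomic test_and_set_bit() is what actually claims the bit, and the loop retries when another CPU claims it first. A standalone sketch of the same pattern using the GCC __atomic builtins; the bitmap helpers reuse the BITS_PER_LONG definition from the sketch above:

    /* Returns the claimed bit index in [first, end), or -1 when exhausted. */
    static int claim_bit(unsigned long *bits, unsigned int first, unsigned int end)
    {
        unsigned int i;
        unsigned long mask, old;

        do {
            for ( i = first; i < end; i++ )
                if ( !(bits[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG))) )
                    break;
            if ( i >= end )
                return -1;
            mask = 1UL << (i % BITS_PER_LONG);
            /* Atomically set the bit; retry if someone else won the race. */
            old = __atomic_fetch_or(&bits[i / BITS_PER_LONG], mask,
                                    __ATOMIC_SEQ_CST);
        } while ( old & mask );

        return i;
    }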
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/arm/vpsci.c b/xen/arch/arm/vpsci.c
index 3f2a482..aebe1e2 100644
--- a/xen/arch/arm/vpsci.c
+++ b/xen/arch/arm/vpsci.c
@@ -32,13 +32,7 @@ static int do_common_cpu_on(register_t target_cpu, register_t entry_point,
int is_thumb = entry_point & 1;
register_t vcpuid;
- if( ver == XEN_PSCI_V_0_2 )
- vcpuid = (target_cpu & MPIDR_HWID_MASK);
- else
- vcpuid = target_cpu;
-
- if ( (vcpuid < 0) || (vcpuid >= MAX_VIRT_CPUS) )
- return PSCI_INVALID_PARAMETERS;
+ vcpuid = vaffinity_to_vcpuid(target_cpu);
if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
return PSCI_INVALID_PARAMETERS;
diff --git a/xen/arch/arm/vtimer.c b/xen/arch/arm/vtimer.c
index 2e95ceb..1418092 100644
--- a/xen/arch/arm/vtimer.c
+++ b/xen/arch/arm/vtimer.c
@@ -21,6 +21,7 @@
#include <xen/lib.h>
#include <xen/timer.h>
#include <xen/sched.h>
+#include <xen/perfc.h>
#include <asm/irq.h>
#include <asm/time.h>
#include <asm/gic.h>
@@ -30,12 +31,25 @@
extern s_time_t ticks_to_ns(uint64_t ticks);
extern uint64_t ns_to_ticks(s_time_t ns);
+/*
+ * Check whether regs is allowed access; user_gate is the tail end of a
+ * CNTKCTL_EL1_* bit name which gates user access.
+ */
+#define ACCESS_ALLOWED(regs, user_gate) \
+ ( !psr_mode_is_user(regs) || \
+ (READ_SYSREG(CNTKCTL_EL1) & CNTKCTL_EL1_##user_gate) )
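
[Editor's note] CNTKCTL_EL1 lets the guest kernel gate its own userspace's access to the timer registers; the macro allows an access either when the trap did not come from user mode or when the corresponding EL0*EN bit is set. A standalone sketch of the same check; the bit position is my reading of the ARMv8 spec, not taken from Xen's headers, so treat it as an assumption:

    #include <stdint.h>

    #define CNTKCTL_EL1_EL0PTEN (1u << 9)   /* assumed bit position */

    /* 1 if a physical-timer access from this context may proceed. */
    static int phys_timer_access_allowed(int from_user_mode, uint32_t cntkctl)
    {
        return !from_user_mode || (cntkctl & CNTKCTL_EL1_EL0PTEN);
    }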
+
static void phys_timer_expired(void *data)
{
struct vtimer *t = data;
t->ctl |= CNTx_CTL_PENDING;
if ( !(t->ctl & CNTx_CTL_MASK) )
+ {
+ perfc_incr(vtimer_phys_inject);
vgic_vcpu_inject_irq(t->v, t->irq);
+ }
+ else
+ perfc_incr(vtimer_phys_masked);
}
static void virt_timer_expired(void *data)
@@ -43,12 +57,40 @@ static void virt_timer_expired(void *data)
struct vtimer *t = data;
t->ctl |= CNTx_CTL_MASK;
vgic_vcpu_inject_irq(t->v, t->irq);
+ perfc_incr(vtimer_virt_inject);
}
-int domain_vtimer_init(struct domain *d)
+int domain_vtimer_init(struct domain *d, struct xen_arch_domainconfig *config)
{
d->arch.phys_timer_base.offset = NOW();
d->arch.virt_timer_base.offset = READ_SYSREG64(CNTPCT_EL0);
+
+ config->clock_frequency = timer_dt_clock_frequency;
+
+ /* At this stage vgic_reserve_virq can't fail */
+ if ( is_hardware_domain(d) )
+ {
+ if ( !vgic_reserve_virq(d, timer_get_irq(TIMER_PHYS_SECURE_PPI)) )
+ BUG();
+
+ if ( !vgic_reserve_virq(d, timer_get_irq(TIMER_PHYS_NONSECURE_PPI)) )
+ BUG();
+
+ if ( !vgic_reserve_virq(d, timer_get_irq(TIMER_VIRT_PPI)) )
+ BUG();
+ }
+ else
+ {
+ if ( !vgic_reserve_virq(d, GUEST_TIMER_PHYS_S_PPI) )
+ BUG();
+
+ if ( !vgic_reserve_virq(d, GUEST_TIMER_PHYS_NS_PPI) )
+ BUG();
+
+ if ( !vgic_reserve_virq(d, GUEST_TIMER_VIRT_PPI) )
+ BUG();
+ }
+
return 0;
}
@@ -122,9 +164,13 @@ int virt_timer_restore(struct vcpu *v)
return 0;
}
-static void vtimer_cntp_ctl(struct cpu_user_regs *regs, uint32_t *r, int read)
+static int vtimer_cntp_ctl(struct cpu_user_regs *regs, uint32_t *r, int read)
{
struct vcpu *v = current;
+
+ if ( !ACCESS_ALLOWED(regs, EL0PTEN) )
+ return 0;
+
if ( read )
{
*r = v->arch.phys_timer.ctl;
@@ -144,13 +190,17 @@ static void vtimer_cntp_ctl(struct cpu_user_regs *regs, uint32_t *r, int read)
else
stop_timer(&v->arch.phys_timer.timer);
}
+ return 1;
}
-static void vtimer_cntp_tval(struct cpu_user_regs *regs, uint32_t *r, int read)
+static int vtimer_cntp_tval(struct cpu_user_regs *regs, uint32_t *r, int read)
{
struct vcpu *v = current;
s_time_t now;
+ if ( !ACCESS_ALLOWED(regs, EL0PTEN) )
+ return 0;
+
now = NOW() - v->domain->arch.phys_timer_base.offset;
if ( read )
@@ -168,43 +218,51 @@ static void vtimer_cntp_tval(struct cpu_user_regs *regs, uint32_t *r, int read)
v->domain->arch.phys_timer_base.offset);
}
}
+ return 1;
}
-static int vtimer_cntpct(struct cpu_user_regs *regs, uint64_t *r, int read)
+static int vtimer_cntp_cval(struct cpu_user_regs *regs, uint64_t *r, int read)
{
struct vcpu *v = current;
- uint64_t ticks;
- s_time_t now;
+
+ if ( !ACCESS_ALLOWED(regs, EL0PTEN) )
+ return 0;
if ( read )
{
- now = NOW() - v->domain->arch.phys_timer_base.offset;
- ticks = ns_to_ticks(now);
- *r = ticks;
- return 1;
+ *r = ns_to_ticks(v->arch.phys_timer.cval);
}
else
{
- gdprintk(XENLOG_DEBUG, "WRITE to R/O CNTPCT\n");
- return 0;
+ v->arch.phys_timer.cval = ticks_to_ns(*r);
+ if ( v->arch.phys_timer.ctl & CNTx_CTL_ENABLE )
+ {
+ v->arch.phys_timer.ctl &= ~CNTx_CTL_PENDING;
+ set_timer(&v->arch.phys_timer.timer,
+ v->arch.phys_timer.cval +
+ v->domain->arch.phys_timer_base.offset);
+ }
}
+ return 1;
}
-
static int vtimer_emulate_cp32(struct cpu_user_regs *regs, union hsr hsr)
{
struct hsr_cp32 cp32 = hsr.cp32;
uint32_t *r = (uint32_t *)select_user_reg(regs, cp32.reg);
+ if ( cp32.read )
+ perfc_incr(vtimer_cp32_reads);
+ else
+ perfc_incr(vtimer_cp32_writes);
+
switch ( hsr.bits & HSR_CP32_REGS_MASK )
{
case HSR_CPREG32(CNTP_CTL):
- vtimer_cntp_ctl(regs, r, cp32.read);
- return 1;
+ return vtimer_cntp_ctl(regs, r, cp32.read);
case HSR_CPREG32(CNTP_TVAL):
- vtimer_cntp_tval(regs, r, cp32.read);
- return 1;
+ return vtimer_cntp_tval(regs, r, cp32.read);
default:
return 0;
@@ -216,24 +274,31 @@ static int vtimer_emulate_cp64(struct cpu_user_regs *regs, union hsr hsr)
struct hsr_cp64 cp64 = hsr.cp64;
uint32_t *r1 = (uint32_t *)select_user_reg(regs, cp64.reg1);
uint32_t *r2 = (uint32_t *)select_user_reg(regs, cp64.reg2);
- uint64_t x;
+ uint64_t x = (uint64_t)(*r1) | ((uint64_t)(*r2) << 32);
+
+ if ( cp64.read )
+ perfc_incr(vtimer_cp64_reads);
+ else
+ perfc_incr(vtimer_cp64_writes);
switch ( hsr.bits & HSR_CP64_REGS_MASK )
{
- case HSR_CPREG64(CNTPCT):
- if (!vtimer_cntpct(regs, &x, cp64.read))
+ case HSR_CPREG64(CNTP_CVAL):
+ if ( !vtimer_cntp_cval(regs, &x, cp64.read) )
return 0;
-
- if ( cp64.read )
- {
- *r1 = (uint32_t)(x & 0xffffffff);
- *r2 = (uint32_t)(x >> 32);
- }
- return 1;
+ break;
default:
return 0;
}
+
+ if ( cp64.read )
+ {
+ *r1 = (uint32_t)(x & 0xffffffff);
+ *r2 = (uint32_t)(x >> 32);
+ }
+
+ return 1;
}
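
[Editor's note] On 32-bit guests a 64-bit register such as CNTP_CVAL arrives as a cp64 access split across two GPRs, so the rework above combines r1/r2 into one value up front and splits the result back only on reads. The round trip, as a runnable example:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t r1 = 0xdeadbeef, r2 = 0x00c0ffee;   /* low, high halves */
        uint64_t x = (uint64_t)r1 | ((uint64_t)r2 << 32);

        /* ... the 64-bit emulation operates on x ... */

        assert((uint32_t)(x & 0xffffffff) == r1);
        assert((uint32_t)(x >> 32) == r2);
        return 0;
    }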
#ifdef CONFIG_ARM_64
@@ -243,21 +308,28 @@ static int vtimer_emulate_sysreg(struct cpu_user_regs *regs, union hsr hsr)
register_t *x = select_user_reg(regs, sysreg.reg);
uint32_t r = (uint32_t)*x;
+ if ( sysreg.read )
+ perfc_incr(vtimer_sysreg_reads);
+ else
+ perfc_incr(vtimer_sysreg_writes);
+
switch ( hsr.bits & HSR_SYSREG_REGS_MASK )
{
case HSR_SYSREG_CNTP_CTL_EL0:
- vtimer_cntp_ctl(regs, &r, sysreg.read);
+ if ( !vtimer_cntp_ctl(regs, &r, sysreg.read) )
+ return 0;
if ( sysreg.read )
*x = r;
return 1;
case HSR_SYSREG_CNTP_TVAL_EL0:
- vtimer_cntp_tval(regs, &r, sysreg.read);
+ if ( !vtimer_cntp_tval(regs, &r, sysreg.read) )
+ return 0;
if ( sysreg.read )
*x = r;
return 1;
- case HSR_SYSREG_CNTPCT_EL0:
- return vtimer_cntpct(regs, x, sysreg.read);
+ case HSR_SYSREG_CNTP_CVAL_EL0:
+ return vtimer_cntp_cval(regs, x, sysreg.read);
default:
return 0;
@@ -271,17 +343,11 @@ int vtimer_emulate(struct cpu_user_regs *regs, union hsr hsr)
switch (hsr.ec) {
case HSR_EC_CP15_32:
- if ( !is_32bit_domain(current->domain) )
- return 0;
return vtimer_emulate_cp32(regs, hsr);
case HSR_EC_CP15_64:
- if ( !is_32bit_domain(current->domain) )
- return 0;
return vtimer_emulate_cp64(regs, hsr);
#ifdef CONFIG_ARM_64
case HSR_EC_SYSREG:
- if ( is_32bit_domain(current->domain) )
- return 0;
return vtimer_emulate_sysreg(regs, hsr);
#endif
default:
diff --git a/xen/arch/arm/vtimer.h b/xen/arch/arm/vtimer.h
index 6d2e46e..99e8145 100644
--- a/xen/arch/arm/vtimer.h
+++ b/xen/arch/arm/vtimer.h
@@ -20,7 +20,8 @@
#ifndef __ARCH_ARM_VTIMER_H__
#define __ARCH_ARM_VTIMER_H__
-extern int domain_vtimer_init(struct domain *d);
+extern int domain_vtimer_init(struct domain *d,
+ struct xen_arch_domainconfig *config);
extern int vcpu_vtimer_init(struct vcpu *v);
extern int vtimer_emulate(struct cpu_user_regs *regs, union hsr hsr);
extern int virt_timer_save(struct vcpu *v);
diff --git a/xen/arch/arm/vuart.c b/xen/arch/arm/vuart.c
index e327c15..d9f4249 100644
--- a/xen/arch/arm/vuart.c
+++ b/xen/arch/arm/vuart.c
@@ -39,6 +39,7 @@
#include <xen/ctype.h>
#include <xen/serial.h>
#include <asm/mmio.h>
+#include <xen/perfc.h>
#include "vuart.h"
@@ -112,6 +113,8 @@ static int vuart_mmio_read(struct vcpu *v, mmio_info_t *info)
register_t *r = select_user_reg(regs, dabt.reg);
paddr_t offset = info->gpa - d->arch.vuart.info->base_addr;
+ perfc_incr(vuart_reads);
+
/* By default zeroed the register */
*r = 0;
@@ -130,6 +133,8 @@ static int vuart_mmio_write(struct vcpu *v, mmio_info_t *info)
register_t *r = select_user_reg(regs, dabt.reg);
paddr_t offset = info->gpa - d->arch.vuart.info->base_addr;
+ perfc_incr(vuart_writes);
+
if ( offset == d->arch.vuart.info->data_off )
/* ignore any status bits */
vuart_print_char(v, *r & 0xFF);
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index 86ca5f8..5f24951 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -36,6 +36,7 @@ obj-y += microcode_intel.o
# This must come after the vendor specific files.
obj-y += microcode.o
obj-y += mm.o
+obj-y += monitor.o
obj-y += mpparse.o
obj-y += nmi.o
obj-y += numa.o
@@ -59,6 +60,7 @@ obj-y += machine_kexec.o
obj-y += crash.o
obj-y += tboot.o
obj-y += hpet.o
+obj-y += vm_event.o
obj-y += xstate.o
obj-$(crash_debug) += gdbstub.o
diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk
index 6775cb5..4a04a8a 100644
--- a/xen/arch/x86/Rules.mk
+++ b/xen/arch/x86/Rules.mk
@@ -15,12 +15,6 @@ HAS_GDBSX := y
HAS_PDX := y
xenoprof := y
-#
-# If you change any of these configuration options then you must
-# 'make clean' before rebuilding.
-#
-supervisor_mode_kernel ?= n
-
CFLAGS += -I$(BASEDIR)/include
CFLAGS += -I$(BASEDIR)/include/asm-x86/mach-generic
CFLAGS += -I$(BASEDIR)/include/asm-x86/mach-default
@@ -34,17 +28,19 @@ $(call as-insn-check,CFLAGS,CC,"vmcall",-DHAVE_GAS_VMX)
$(call as-insn-check,CFLAGS,CC,"invept (%rax)$$(comma)%rax",-DHAVE_GAS_EPT)
$(call as-insn-check,CFLAGS,CC,"rdfsbase %rax",-DHAVE_GAS_FSGSBASE)
-ifeq ($(supervisor_mode_kernel),y)
-CFLAGS += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1
-endif
-
x86 := y
x86_32 := n
x86_64 := y
+shadow-paging ?= y
+bigmem ?= n
+
CFLAGS += -mno-red-zone -mno-sse -fpic
CFLAGS += -fno-asynchronous-unwind-tables
# -fvisibility=hidden reduces -fpic cost, if it's available
ifneq ($(call cc-option,$(CC),-fvisibility=hidden,n),n)
CFLAGS += -DGCC_HAS_VISIBILITY_ATTRIBUTE
endif
+
+CFLAGS-$(shadow-paging) += -DCONFIG_SHADOW_PAGING
+CFLAGS-$(bigmem) += -DCONFIG_BIGMEM
diff --git a/xen/arch/x86/acpi/boot.c b/xen/arch/x86/acpi/boot.c
index 903830b..fac36c6 100644
--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -17,8 +17,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
@@ -309,6 +308,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
hpet_address = hpet_tbl->address.address;
hpet_blockid = hpet_tbl->sequence;
+ hpet_flags = hpet_tbl->flags;
printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
hpet_tbl->id, hpet_address);
diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 2b2bcc6..15fe2e9 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -25,8 +25,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ * with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
@@ -62,6 +61,7 @@
#define GET_HW_RES_IN_NS(msr, val) \
do { rdmsrl(msr, val); val = tsc_ticks2ns(val); } while( 0 )
+#define GET_MC6_RES(val) GET_HW_RES_IN_NS(0x664, val) /* Atom E3000 only */
#define GET_PC2_RES(val) GET_HW_RES_IN_NS(0x60D, val) /* SNB onwards */
#define GET_PC3_RES(val) GET_HW_RES_IN_NS(0x3F8, val)
#define GET_PC6_RES(val) GET_HW_RES_IN_NS(0x3F9, val)
@@ -73,6 +73,7 @@
#define GET_CC3_RES(val) GET_HW_RES_IN_NS(0x3FC, val)
#define GET_CC6_RES(val) GET_HW_RES_IN_NS(0x3FD, val)
#define GET_CC7_RES(val) GET_HW_RES_IN_NS(0x3FE, val) /* SNB onwards */
+#define PHI_CC6_RES(val) GET_HW_RES_IN_NS(0x3FF, val) /* Xeon Phi only */
static void lapic_timer_nop(void) { }
void (*__read_mostly lapic_timer_off)(void);
@@ -113,6 +114,8 @@ struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];
struct hw_residencies
{
+ uint64_t mc0;
+ uint64_t mc6;
uint64_t pc2;
uint64_t pc3;
uint64_t pc4;
@@ -153,8 +156,11 @@ static void do_get_hw_residencies(void *arg)
case 0x3C:
case 0x3F:
case 0x46:
- /* future */
+ /* Broadwell */
case 0x3D:
+ case 0x4F:
+ case 0x56:
+ /* future */
case 0x4E:
GET_PC2_RES(hw_res->pc2);
GET_CC7_RES(hw_res->cc7);
@@ -174,6 +180,16 @@ static void do_get_hw_residencies(void *arg)
GET_CC3_RES(hw_res->cc3);
GET_CC6_RES(hw_res->cc6);
break;
+ /* next gen Xeon Phi */
+ case 0x57:
+ GET_CC3_RES(hw_res->mc0); /* abusing GET_CC3_RES */
+ GET_CC6_RES(hw_res->mc6); /* abusing GET_CC6_RES */
+ GET_PC2_RES(hw_res->pc2);
+ GET_PC3_RES(hw_res->pc3);
+ GET_PC6_RES(hw_res->pc6);
+ GET_PC7_RES(hw_res->pc7);
+ PHI_CC6_RES(hw_res->cc6);
+ break;
/* various Atoms */
case 0x27:
GET_PC3_RES(hw_res->pc2); /* abusing GET_PC3_RES */
@@ -182,10 +198,14 @@ static void do_get_hw_residencies(void *arg)
break;
/* Silvermont */
case 0x37:
+ GET_MC6_RES(hw_res->mc6);
+ /* fall through */
case 0x4A:
case 0x4D:
case 0x5A:
case 0x5D:
+ /* Airmont */
+ case 0x4C:
GET_PC7_RES(hw_res->pc6); /* abusing GET_PC7_RES */
GET_CC1_RES(hw_res->cc1);
GET_CC6_RES(hw_res->cc6);
@@ -209,6 +229,9 @@ static void print_hw_residencies(uint32_t cpu)
get_hw_residencies(cpu, &hw_res);
+ if ( hw_res.mc0 | hw_res.mc6 )
+ printk("MC0[%"PRIu64"] MC6[%"PRIu64"]\n",
+ hw_res.mc0, hw_res.mc6);
printk("PC2[%"PRIu64"] PC%d[%"PRIu64"] PC6[%"PRIu64"] PC7[%"PRIu64"]\n",
hw_res.pc2,
hw_res.pc4 ? 4 : 3, hw_res.pc4 ?: hw_res.pc3,
@@ -229,40 +252,74 @@ static char* acpi_cstate_method_name[] =
"HALT"
};
+static uint64_t get_stime_tick(void) { return (uint64_t)NOW(); }
+static uint64_t stime_ticks_elapsed(uint64_t t1, uint64_t t2) { return t2 - t1; }
+static uint64_t stime_tick_to_ns(uint64_t ticks) { return ticks; }
+
+static uint64_t get_acpi_pm_tick(void) { return (uint64_t)inl(pmtmr_ioport); }
+static uint64_t acpi_pm_ticks_elapsed(uint64_t t1, uint64_t t2)
+{
+ if ( t2 >= t1 )
+ return (t2 - t1);
+ else if ( !(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER) )
+ return (((0x00FFFFFF - t1) + t2 + 1) & 0x00FFFFFF);
+ else
+ return ((0xFFFFFFFF - t1) + t2 + 1);
+}
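
[Editor's note] The ACPI PM timer is a free-running 24- or 32-bit counter, so elapsed time across a wrap has to add back the counter period. A runnable check of the wrap arithmetic moved above (one tick across a 24-bit rollover):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t pm_ticks_elapsed(uint64_t t1, uint64_t t2, int is_32bit)
    {
        if ( t2 >= t1 )
            return t2 - t1;
        if ( !is_32bit )    /* 24-bit counter */
            return ((0x00FFFFFF - t1) + t2 + 1) & 0x00FFFFFF;
        return (0xFFFFFFFF - t1) + t2 + 1;
    }

    int main(void)
    {
        /* 0xFFFFFF -> 0x000000 is exactly one tick. */
        printf("%llu\n",
               (unsigned long long)pm_ticks_elapsed(0xFFFFFF, 0, 0));
        return 0;
    }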
+
+uint64_t (*__read_mostly cpuidle_get_tick)(void) = get_acpi_pm_tick;
+static uint64_t (*__read_mostly ticks_elapsed)(uint64_t, uint64_t)
+ = acpi_pm_ticks_elapsed;
+
static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
{
- uint32_t i, idle_usage = 0;
- uint64_t res, idle_res = 0;
- u32 usage;
- u8 last_state_idx;
+ uint64_t idle_res = 0, idle_usage = 0;
+ uint64_t last_state_update_tick, current_tick, current_stime;
+ uint64_t usage[ACPI_PROCESSOR_MAX_POWER] = { 0 };
+ uint64_t res_tick[ACPI_PROCESSOR_MAX_POWER] = { 0 };
+ unsigned int i;
+ signed int last_state_idx;
printk("==cpu%d==\n", cpu);
last_state_idx = power->last_state ? power->last_state->idx : -1;
printk("active state:\t\tC%d\n", last_state_idx);
printk("max_cstate:\t\tC%d\n", max_cstate);
printk("states:\n");
-
+
+ spin_lock_irq(&power->stat_lock);
+ current_tick = cpuidle_get_tick();
+ current_stime = NOW();
for ( i = 1; i < power->count; i++ )
{
- spin_lock_irq(&power->stat_lock);
- res = tick_to_ns(power->states[i].time);
- usage = power->states[i].usage;
- spin_unlock_irq(&power->stat_lock);
+ res_tick[i] = power->states[i].time;
+ usage[i] = power->states[i].usage;
+ }
+ last_state_update_tick = power->last_state_update_tick;
+ spin_unlock_irq(&power->stat_lock);
- idle_usage += usage;
- idle_res += res;
+ if ( last_state_idx >= 0 )
+ {
+ res_tick[last_state_idx] += ticks_elapsed(last_state_update_tick,
+ current_tick);
+ usage[last_state_idx]++;
+ }
+
+ for ( i = 1; i < power->count; i++ )
+ {
+ idle_usage += usage[i];
+ idle_res += tick_to_ns(res_tick[i]);
printk((last_state_idx == i) ? " *" : " ");
printk("C%d:\t", i);
printk("type[C%d] ", power->states[i].type);
printk("latency[%03d] ", power->states[i].latency);
- printk("usage[%08d] ", usage);
+ printk("usage[%08"PRIu64"] ", usage[i]);
printk("method[%5s] ", acpi_cstate_method_name[power->states[i].entry_method]);
- printk("duration[%"PRId64"]\n", res);
+ printk("duration[%"PRIu64"]\n", tick_to_ns(res_tick[i]));
}
printk((last_state_idx == 0) ? " *" : " ");
- printk("C0:\tusage[%08d] duration[%"PRId64"]\n",
- idle_usage, NOW() - idle_res);
+ printk("C0:\tusage[%08"PRIu64"] duration[%"PRIu64"]\n",
+ usage[0] + idle_usage, current_stime - idle_res);
print_hw_residencies(cpu);
}
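
[Editor's note] The restructured accounting snapshots the counters under stat_lock and then folds in the residency of the C-state the CPU is currently sitting in (last_state), which the per-state counters only receive on exit. The attribution step, as a short sketch (the real code uses ticks_elapsed() so the subtraction copes with PM-timer wrap):

    /* Sketch: fold the still-running C-state into a counter snapshot. */
    static void account_inflight(uint64_t *res_tick, uint64_t *usage,
                                 int last_idx,
                                 uint64_t last_update_tick, uint64_t now_tick)
    {
        if ( last_idx >= 0 )
        {
            res_tick[last_idx] += now_tick - last_update_tick;
            usage[last_idx]++;
        }
    }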
@@ -290,25 +347,6 @@ static int __init cpu_idle_key_init(void)
}
__initcall(cpu_idle_key_init);
-static uint64_t get_stime_tick(void) { return (uint64_t)NOW(); }
-static uint64_t stime_ticks_elapsed(uint64_t t1, uint64_t t2) { return t2 - t1; }
-static uint64_t stime_tick_to_ns(uint64_t ticks) { return ticks; }
-
-static uint64_t get_acpi_pm_tick(void) { return (uint64_t)inl(pmtmr_ioport); }
-static uint64_t acpi_pm_ticks_elapsed(uint64_t t1, uint64_t t2)
-{
- if ( t2 >= t1 )
- return (t2 - t1);
- else if ( !(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER) )
- return (((0x00FFFFFF - t1) + t2 + 1) & 0x00FFFFFF);
- else
- return ((0xFFFFFFFF - t1) + t2 +1);
-}
-
-uint64_t (*__read_mostly cpuidle_get_tick)(void) = get_acpi_pm_tick;
-static uint64_t (*__read_mostly ticks_elapsed)(uint64_t, uint64_t)
- = acpi_pm_ticks_elapsed;
-
/*
* The bit is set iff cpu use monitor/mwait to enter C state
* with this flag set, CPU can be waken up from C state
@@ -463,6 +501,17 @@ bool_t errata_c6_eoi_workaround(void)
return (fix_needed && cpu_has_pending_apic_eoi());
}
+void update_last_cx_stat(struct acpi_processor_power *power,
+ struct acpi_processor_cx *cx, uint64_t ticks)
+{
+ ASSERT(!local_irq_is_enabled());
+
+ spin_lock(&power->stat_lock);
+ power->last_state = cx;
+ power->last_state_update_tick = ticks;
+ spin_unlock(&power->stat_lock);
+}
+
void update_idle_stats(struct acpi_processor_power *power,
struct acpi_processor_cx *cx,
uint64_t before, uint64_t after)
@@ -478,6 +527,8 @@ void update_idle_stats(struct acpi_processor_power *power,
power->last_residency = tick_to_ns(sleep_ticks) / 1000UL;
cx->time += sleep_ticks;
}
+ power->last_state = &power->states[0];
+ power->last_state_update_tick = after;
spin_unlock(&power->stat_lock);
}
@@ -534,7 +585,6 @@ static void acpi_processor_idle(void)
if ( (cx->type == ACPI_STATE_C3) && errata_c6_eoi_workaround() )
cx = power->safe_state;
- power->last_state = cx;
/*
* Sleep:
@@ -551,6 +601,9 @@ static void acpi_processor_idle(void)
t1 = cpuidle_get_tick();
/* Trace cpu idle entry */
TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
+
+ update_last_cx_stat(power, cx, t1);
+
/* Invoke C2 */
acpi_idle_do_entry(cx);
/* Get end time (ticks) */
@@ -580,6 +633,8 @@ static void acpi_processor_idle(void)
/* Trace cpu idle entry */
TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
+ update_last_cx_stat(power, cx, t1);
+
/*
* disable bus master
* bm_check implies we need ARB_DIS
@@ -1149,7 +1204,9 @@ int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat)
{
struct acpi_processor_power *power = processor_powers[cpuid];
uint64_t idle_usage = 0, idle_res = 0;
- uint64_t usage[ACPI_PROCESSOR_MAX_POWER], res[ACPI_PROCESSOR_MAX_POWER];
+ uint64_t last_state_update_tick, current_stime, current_tick;
+ uint64_t usage[ACPI_PROCESSOR_MAX_POWER] = { 0 };
+ uint64_t res[ACPI_PROCESSOR_MAX_POWER] = { 0 };
unsigned int i, nr, nr_pc = 0, nr_cc = 0;
if ( power == NULL )
@@ -1162,7 +1219,6 @@ int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat)
return 0;
}
- stat->last = power->last_state ? power->last_state->idx : 0;
stat->idle_time = get_cpu_idle_time(cpuid);
nr = min(stat->nr, power->count);
@@ -1170,23 +1226,45 @@ int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat)
if ( pm_idle_save == NULL )
{
stat->nr = 2;
+ stat->last = power->last_state ? power->last_state->idx : 0;
usage[1] = idle_usage = 1;
res[1] = idle_res = stat->idle_time;
+
+ current_stime = NOW();
}
else
{
struct hw_residencies hw_res;
+ signed int last_state_idx;
stat->nr = power->count;
+ spin_lock_irq(&power->stat_lock);
+ current_tick = cpuidle_get_tick();
+ current_stime = NOW();
for ( i = 1; i < nr; i++ )
{
- spin_lock_irq(&power->stat_lock);
usage[i] = power->states[i].usage;
- res[i] = tick_to_ns(power->states[i].time);
- spin_unlock_irq(&power->stat_lock);
+ res[i] = power->states[i].time;
+ }
+ last_state_update_tick = power->last_state_update_tick;
+ last_state_idx = power->last_state ? power->last_state->idx : -1;
+ spin_unlock_irq(&power->stat_lock);
+ if ( last_state_idx >= 0 )
+ {
+ usage[last_state_idx]++;
+ res[last_state_idx] += ticks_elapsed(last_state_update_tick,
+ current_tick);
+ stat->last = last_state_idx;
+ }
+ else
+ stat->last = 0;
+
+ for ( i = 1; i < nr; i++ )
+ {
+ res[i] = tick_to_ns(res[i]);
idle_usage += usage[i];
idle_res += res[i];
}
@@ -1219,8 +1297,8 @@ int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat)
#undef PUT_xC
}
- usage[0] = idle_usage;
- res[0] = NOW() - idle_res;
+ usage[0] += idle_usage;
+ res[0] = current_stime - idle_res;
if ( copy_to_guest(stat->triggers, usage, nr) ||
copy_to_guest(stat->residencies, res, nr) )
diff --git a/xen/arch/x86/acpi/cpufreq/cpufreq.c b/xen/arch/x86/acpi/cpufreq/cpufreq.c
index fa3678d..ef79f77 100644
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c
@@ -22,8 +22,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ * with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/xen/arch/x86/acpi/cpufreq/powernow.c b/xen/arch/x86/acpi/cpufreq/powernow.c
index 2c9fea2..4de6f8d 100644
--- a/xen/arch/x86/acpi/cpufreq/powernow.c
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c
@@ -16,8 +16,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ * with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/xen/arch/x86/acpi/cpuidle_menu.c b/xen/arch/x86/acpi/cpuidle_menu.c
index 4afaa8d..0218b38 100644
--- a/xen/arch/x86/acpi/cpuidle_menu.c
+++ b/xen/arch/x86/acpi/cpuidle_menu.c
@@ -18,8 +18,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ * with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/xen/arch/x86/acpi/lib.c b/xen/arch/x86/acpi/lib.c
index 1f98c31..cc15ea3 100644
--- a/xen/arch/x86/acpi/lib.c
+++ b/xen/arch/x86/acpi/lib.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
index 2743792..46ac0fd 100644
--- a/xen/arch/x86/alternative.c
+++ b/xen/arch/x86/alternative.c
@@ -12,8 +12,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/types.h>
diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c
index 1de693f..2c9ae4e 100644
--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -434,7 +434,7 @@ void __init sync_Arb_IDs(void)
*/
void __init init_bsp_APIC(void)
{
- unsigned long value, ver;
+ unsigned long value;
/*
* Don't do the setup now if we have a SMP BIOS as the
@@ -443,9 +443,6 @@ void __init init_bsp_APIC(void)
if (smp_found_config || !cpu_has_apic)
return;
- value = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(value);
-
/*
* Do not trust the local APIC being empty at bootup.
*/
@@ -995,7 +992,7 @@ void __init init_apic_mappings(void)
apic_phys = mp_lapic_addr;
set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
+ apic_printk(APIC_VERBOSE, "mapped APIC to %08Lx (%08lx)\n", APIC_BASE,
apic_phys);
__next:
@@ -1151,7 +1148,7 @@ static int __init calibrate_APIC_clock(void)
* We wrapped around just now. Let's start:
*/
if (cpu_has_tsc)
- rdtscll(t1);
+ t1 = rdtsc();
tt1 = apic_read(APIC_TMCCT);
/*
@@ -1162,7 +1159,7 @@ static int __init calibrate_APIC_clock(void)
tt2 = apic_read(APIC_TMCCT);
if (cpu_has_tsc)
- rdtscll(t2);
+ t2 = rdtsc();
/*
* The APIC bus clock counter is 32 bits only, it
@@ -1324,7 +1321,18 @@ out: ;
void error_interrupt(struct cpu_user_regs *regs)
{
- unsigned long v, v1;
+ static const char *const esr_fields[] = {
+ "Send CS error",
+ "Receive CS error",
+ "Send accept error",
+ "Receive accept error",
+ "Redirectable IPI",
+ "Send illegal vector",
+ "Received illegal vector",
+ "Illegal register address",
+ };
+ unsigned int v, v1;
+ int i;
/* First tickle the hardware, only then report what went on. -- REW */
v = apic_read(APIC_ESR);
@@ -1332,18 +1340,12 @@ void error_interrupt(struct cpu_user_regs *regs)
v1 = apic_read(APIC_ESR);
ack_APIC_irq();
- /* Here is what the APIC error bits mean:
- 0: Send CS error
- 1: Receive CS error
- 2: Send accept error
- 3: Receive accept error
- 4: Reserved
- 5: Send illegal vector
- 6: Received illegal vector
- 7: Illegal register address
- */
- printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
+ printk(XENLOG_DEBUG "APIC error on CPU%u: %02x(%02x)",
smp_processor_id(), v , v1);
+ for ( i = 7; i >= 0; --i )
+ if ( v1 & (1 << i) )
+ printk(", %s", esr_fields[i]);
+ printk("\n");
}
/*
diff --git a/xen/arch/x86/bitops.c b/xen/arch/x86/bitops.c
index c037567..f6ee715 100644
--- a/xen/arch/x86/bitops.c
+++ b/xen/arch/x86/bitops.c
@@ -62,7 +62,7 @@ unsigned int __find_first_zero_bit(
" je 2f\n\t"
" xor -"STR(BITS_PER_LONG/8)"(%2),%3\n\t"
" jz 1b\n\t"
- " bsf %3,%0\n\t"
+ " rep; bsf %3,%0\n\t"
" lea -"STR(BITS_PER_LONG/8)"(%2),%2\n\t"
"2: sub %%ebx,%%edi\n\t"
" shl $3,%%edi\n\t"
diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
index c99f739..cfd59dc 100644
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -23,7 +23,7 @@ ENTRY(start)
jmp __start
.align 4
-/*** MULTIBOOT HEADER ****/
+multiboot1_header_start: /*** MULTIBOOT1 HEADER ****/
#define MULTIBOOT_HEADER_FLAGS (MULTIBOOT_HEADER_MODS_ALIGNED | \
MULTIBOOT_HEADER_WANT_MEMORY)
/* Magic number indicating a Multiboot header. */
@@ -32,6 +32,7 @@ ENTRY(start)
.long MULTIBOOT_HEADER_FLAGS
/* Checksum: must be the negated sum of the first two fields. */
.long -(MULTIBOOT_HEADER_MAGIC + MULTIBOOT_HEADER_FLAGS)
+multiboot1_header_end:
.section .init.rodata, "a", @progbits
.align 4
@@ -124,7 +125,7 @@ __start:
/* Initialize BSS (no nasty surprises!) */
mov $sym_phys(__bss_start),%edi
- mov $sym_phys(_end),%ecx
+ mov $sym_phys(__bss_end),%ecx
sub %edi,%ecx
xor %eax,%eax
rep stosb
@@ -212,27 +213,4 @@ ENTRY(trampoline_start)
#include "trampoline.S"
GLOBAL(trampoline_end)
- .text
-__high_start:
#include "x86_64.S"
-
- .section .data.page_aligned, "aw", @progbits
- .p2align PAGE_SHIFT
-/*
- * Mapping of first 2 megabytes of memory. This is mapped with 4kB mappings
- * to avoid type conflicts with fixed-range MTRRs covering the lowest megabyte
- * of physical memory. In any case the VGA hole should be mapped with type UC.
- */
-GLOBAL(l1_identmap)
- pfn = 0
- .rept L1_PAGETABLE_ENTRIES
- /* VGA hole (0xa0000-0xc0000) should be mapped UC. */
- .if pfn >= 0xa0 && pfn < 0xc0
- .long (pfn << PAGE_SHIFT) | PAGE_HYPERVISOR_NOCACHE | MAP_SMALL_PAGES
- .else
- .long (pfn << PAGE_SHIFT) | PAGE_HYPERVISOR | MAP_SMALL_PAGES
- .endif
- .long 0
- pfn = pfn + 1
- .endr
- .size l1_identmap, . - l1_identmap
diff --git a/xen/arch/x86/boot/reloc.c b/xen/arch/x86/boot/reloc.c
index f971920..63045c0 100644
--- a/xen/arch/x86/boot/reloc.c
+++ b/xen/arch/x86/boot/reloc.c
@@ -90,7 +90,6 @@ multiboot_info_t *reloc(multiboot_info_t *mbi_old)
/* Mask features we don't understand or don't relocate. */
mbi->flags &= (MBI_MEMLIMITS |
- MBI_BOOTDEV |
MBI_CMDLINE |
MBI_MODULES |
MBI_MEMMAP |
diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S
index bfbafd2..c8bf9d0 100644
--- a/xen/arch/x86/boot/x86_64.S
+++ b/xen/arch/x86/boot/x86_64.S
@@ -1,5 +1,7 @@
+ .text
.code64
+ENTRY(__high_start)
/* Install relocated data selectors. */
lgdt gdt_descr(%rip)
mov $(__HYPERVISOR_DS64),%ecx
@@ -80,6 +82,24 @@ GLOBAL(boot_cpu_compat_gdt_table)
.align PAGE_SIZE, 0
GLOBAL(__page_tables_start)
+/*
+ * Mapping of first 2 megabytes of memory. This is mapped with 4kB mappings
+ * to avoid type conflicts with fixed-range MTRRs covering the lowest megabyte
+ * of physical memory. In any case the VGA hole should be mapped with type UC.
+ */
+GLOBAL(l1_identmap)
+ pfn = 0
+ .rept L1_PAGETABLE_ENTRIES
+ /* VGA hole (0xa0000-0xc0000) should be mapped UC. */
+ .if pfn >= 0xa0 && pfn < 0xc0
+ .long (pfn << PAGE_SHIFT) | PAGE_HYPERVISOR_NOCACHE | MAP_SMALL_PAGES
+ .else
+ .long (pfn << PAGE_SHIFT) | PAGE_HYPERVISOR | MAP_SMALL_PAGES
+ .endif
+ .long 0
+ pfn = pfn + 1
+ .endr
+ .size l1_identmap, . - l1_identmap
/* Mapping of first 16 megabytes of memory. */
GLOBAL(l2_identmap)
diff --git a/xen/arch/x86/compat.c b/xen/arch/x86/compat.c
index 2d05867..2d4be2e 100644
--- a/xen/arch/x86/compat.c
+++ b/xen/arch/x86/compat.c
@@ -5,9 +5,10 @@
* hypercall after doing necessary argument munging.
*/
-#include <xen/config.h>
#include <xen/guest_access.h>
#include <xen/hypercall.h>
+#include <xen/trace.h>
+#include <public/sched.h>
#ifndef COMPAT
typedef long ret_t;
@@ -26,6 +27,28 @@ ret_t do_physdev_op_compat(XEN_GUEST_HANDLE(physdev_op_t) uop)
#ifndef COMPAT
+/* Legacy hypercall (as of 0x00030101). */
+long do_sched_op_compat(int cmd, unsigned long arg)
+{
+ switch ( cmd )
+ {
+ case SCHEDOP_yield:
+ case SCHEDOP_block:
+ return do_sched_op(cmd, guest_handle_from_ptr(NULL, void));
+
+ case SCHEDOP_shutdown:
+ TRACE_3D(TRC_SCHED_SHUTDOWN,
+ current->domain->domain_id, current->vcpu_id, arg);
+ domain_shutdown(current->domain, (u8)arg);
+ break;
+
+ default:
+ return -ENOSYS;
+ }
+
+ return 0;
+}
+
/* Legacy hypercall (as of 0x00030202). */
long do_event_channel_op_compat(XEN_GUEST_HANDLE_PARAM(evtchn_op_t) uop)
{
diff --git a/xen/arch/x86/cpu/Makefile b/xen/arch/x86/cpu/Makefile
index d73d93a..74f23ae 100644
--- a/xen/arch/x86/cpu/Makefile
+++ b/xen/arch/x86/cpu/Makefile
@@ -7,3 +7,4 @@ obj-y += common.o
obj-y += intel.o
obj-y += intel_cacheinfo.o
obj-y += mwait-idle.o
+obj-y += vpmu.o vpmu_amd.o vpmu_intel.o
diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 566cdac..ad5fd09 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -620,7 +620,7 @@ static void __devinit init_amd(struct cpuinfo_x86 *c)
check_syscfg_dram_mod_en();
}
-static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+static const struct cpu_dev amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
.c_init = init_amd,
diff --git a/xen/arch/x86/cpu/centaur.c b/xen/arch/x86/cpu/centaur.c
index 4aaa144..aaa0386 100644
--- a/xen/arch/x86/cpu/centaur.c
+++ b/xen/arch/x86/cpu/centaur.c
@@ -60,7 +60,7 @@ static void __init init_centaur(struct cpuinfo_x86 *c)
init_c3(c);
}
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+static const struct cpu_dev centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_init = init_centaur,
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 5c8d3c2..35ef21b 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -12,6 +12,7 @@
#include <asm/apic.h>
#include <mach_apic.h>
#include <asm/setup.h>
+#include <public/sysctl.h> /* for XEN_INVALID_{SOCKET,CORE}_ID */
#include "cpu.h"
@@ -34,7 +35,7 @@ integer_param("cpuid_mask_ext_ecx", opt_cpuid_mask_ext_ecx);
unsigned int __devinitdata opt_cpuid_mask_ext_edx = ~0u;
integer_param("cpuid_mask_ext_edx", opt_cpuid_mask_ext_edx);
-struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
+const struct cpu_dev *__read_mostly cpu_devs[X86_VENDOR_NUM] = {};
unsigned int paddr_bits __read_mostly = 36;
@@ -60,11 +61,11 @@ static void default_init(struct cpuinfo_x86 * c)
__clear_bit(X86_FEATURE_SEP, c->x86_capability);
}
-static struct cpu_dev default_cpu = {
+static const struct cpu_dev default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
};
-static struct cpu_dev * this_cpu = &default_cpu;
+static const struct cpu_dev *this_cpu = &default_cpu;
bool_t opt_cpu_info;
boolean_param("cpuinfo", opt_cpu_info);
@@ -125,9 +126,8 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
l2size, ecx & 0xFF);
}
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
+int get_cpu_vendor(const char v[], enum get_cpu_vendor mode)
{
- char *v = c->x86_vendor_id;
int i;
static int printed;
@@ -136,20 +136,22 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
(cpu_devs[i]->c_ident[1] &&
!strcmp(v,cpu_devs[i]->c_ident[1]))) {
- c->x86_vendor = i;
- if (!early)
+ if (mode == gcv_host_late)
this_cpu = cpu_devs[i];
- return;
+ return i;
}
}
}
+ if (mode == gcv_guest)
+ return X86_VENDOR_UNKNOWN;
if (!printed) {
printed++;
printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
printk(KERN_ERR "CPU: Your system may be unstable.\n");
}
- c->x86_vendor = X86_VENDOR_UNKNOWN;
this_cpu = &default_cpu;
+
+ return X86_VENDOR_UNKNOWN;
}
static inline u32 _phys_pkg_id(u32 cpuid_apic, int index_msb)
@@ -188,7 +190,7 @@ static void __init early_cpu_detect(void)
(int *)&c->x86_vendor_id[8],
(int *)&c->x86_vendor_id[4]);
- get_cpu_vendor(c, 1);
+ c->x86_vendor = get_cpu_vendor(c->x86_vendor_id, gcv_host_early);
cpuid(0x00000001, &tfms, &misc, &cap4, &cap0);
c->x86 = (tfms >> 8) & 15;
@@ -217,7 +219,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
(int *)&c->x86_vendor_id[8],
(int *)&c->x86_vendor_id[4]);
- get_cpu_vendor(c, 0);
+ c->x86_vendor = get_cpu_vendor(c->x86_vendor_id, gcv_host_late);
/* Initialize the standard set of capabilities */
/* Note that the vendor-specific code below might override */
@@ -277,9 +279,9 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_max_cores = 1;
c->x86_num_siblings = 1;
c->x86_clflush_size = 0;
- c->phys_proc_id = BAD_APICID;
- c->cpu_core_id = BAD_APICID;
- c->compute_unit_id = BAD_APICID;
+ c->phys_proc_id = XEN_INVALID_SOCKET_ID;
+ c->cpu_core_id = XEN_INVALID_CORE_ID;
+ c->compute_unit_id = INVALID_CUID;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
generic_identify(c);
@@ -636,3 +638,41 @@ void cpu_uninit(unsigned int cpu)
{
cpumask_clear_cpu(cpu, &cpu_initialized);
}
+
+/*
+ * x86_match_cpu - match the current CPU against an array of
+ * x86_cpu_ids
+ * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
+ * {}.
+ * Return the entry if the current CPU matches the entries in the
+ * passed x86_cpu_id match table. Otherwise NULL. The match table
+ * contains vendor (X86_VENDOR_*), family, model and feature bits or
+ * respective wildcard entries.
+ *
+ * A typical table entry would be to match a specific CPU
+ * { X86_VENDOR_INTEL, 6, 0x12 }
+ * or to match a specific CPU feature
+ * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * This always matches against the boot cpu, assuming models and
+ * features are consistent over all CPUs.
+ */
+const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id table[])
+{
+ const struct x86_cpu_id *m;
+ const struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ for (m = table; m->vendor | m->family | m->model | m->feature; m++) {
+ if (c->x86_vendor != m->vendor)
+ continue;
+ if (c->x86 != m->family)
+ continue;
+ if (c->x86_model != m->model)
+ continue;
+ if (!cpu_has(c, m->feature))
+ continue;
+ return m;
+ }
+ return NULL;
+}
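
[Editor's note] Typical usage, matching the field order the loop checks (vendor, family, model, feature) and assuming struct x86_cpu_id declares its members in that order; the all-zero sentinel ends the table, and a zero feature field effectively always matches since bit 0 (FPU) is set on every CPU. The model numbers below are illustrative:

    static const struct x86_cpu_id snb_ids[] = {
        { X86_VENDOR_INTEL, 6, 0x2A },   /* hypothetical: SNB client */
        { X86_VENDOR_INTEL, 6, 0x2D },   /* hypothetical: SNB server */
        { }
    };

    if ( x86_match_cpu(snb_ids) )
        /* the boot CPU matched one of the entries */;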
diff --git a/xen/arch/x86/cpu/cpu.h b/xen/arch/x86/cpu/cpu.h
index 68563bb..ed6cdf0 100644
--- a/xen/arch/x86/cpu/cpu.h
+++ b/xen/arch/x86/cpu/cpu.h
@@ -8,7 +8,7 @@ struct cpu_dev {
void (*c_init)(struct cpuinfo_x86 * c);
};
-extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
+extern const struct cpu_dev *cpu_devs[X86_VENDOR_NUM];
extern bool_t opt_arat;
extern unsigned int opt_cpuid_mask_ecx, opt_cpuid_mask_edx;
diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
index 9868cd5..53bfec8 100644
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -286,7 +286,7 @@ static void __devinit init_intel(struct cpuinfo_x86 *c)
set_bit(X86_FEATURE_ARAT, c->x86_capability);
}
-static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+static const struct cpu_dev intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
.c_init = init_intel,
diff --git a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
index 98a0f8d..8a80a9f 100644
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
diff --git a/xen/arch/x86/cpu/mcheck/mce-apei.c b/xen/arch/x86/cpu/mcheck/mce-apei.c
index 08f6401..3933c19 100644
--- a/xen/arch/x86/cpu/mcheck/mce-apei.c
+++ b/xen/arch/x86/cpu/mcheck/mce-apei.c
@@ -25,8 +25,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/kernel.h>
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
index 05a86fb..7c2cacc 100644
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -235,7 +235,7 @@ static void mca_init_bank(enum mca_source who,
if (who == MCA_CMCI_HANDLER) {
mib->mc_ctrl2 = mca_rdmsr(MSR_IA32_MC0_CTL2 + bank);
- rdtscll(mib->mc_tsc);
+ mib->mc_tsc = rdtsc();
}
}
@@ -1339,7 +1339,7 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_mc_t) u_xen_mc)
mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id);
mctelem_ack(which, cookie);
} else {
- if (!is_pv_32on64_vcpu(v)
+ if (!is_pv_32bit_vcpu(v)
? guest_handle_is_null(mc_fetch.nat->data)
: compat_handle_is_null(mc_fetch.cmp->data))
return x86_mcerr("do_mca fetch: guest buffer "
@@ -1347,7 +1347,7 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_mc_t) u_xen_mc)
if ((mctc = mctelem_consume_oldest_begin(which))) {
struct mc_info *mcip = mctelem_dataptr(mctc);
- if (!is_pv_32on64_vcpu(v)
+ if (!is_pv_32bit_vcpu(v)
? copy_to_guest(mc_fetch.nat->data, mcip, 1)
: copy_to_compat(mc_fetch.cmp->data,
mcip, 1)) {
@@ -1378,7 +1378,7 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_mc_t) u_xen_mc)
mc_physcpuinfo.nat = &op->u.mc_physcpuinfo;
nlcpu = num_online_cpus();
- if (!is_pv_32on64_vcpu(v)
+ if (!is_pv_32bit_vcpu(v)
? !guest_handle_is_null(mc_physcpuinfo.nat->info)
: !compat_handle_is_null(mc_physcpuinfo.cmp->info)) {
if (mc_physcpuinfo.nat->ncpus <= 0)
@@ -1389,7 +1389,7 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_mc_t) u_xen_mc)
if (log_cpus == NULL)
return x86_mcerr("do_mca cpuinfo", -ENOMEM);
on_each_cpu(do_mc_get_cpu_info, log_cpus, 1);
- if (!is_pv_32on64_vcpu(v)
+ if (!is_pv_32bit_vcpu(v)
? copy_to_guest(mc_physcpuinfo.nat->info,
log_cpus, nlcpu)
: copy_to_compat(mc_physcpuinfo.cmp->info,
diff --git a/xen/arch/x86/cpu/mcheck/mce_amd.c b/xen/arch/x86/cpu/mcheck/mce_amd.c
index 4e8ad38..599e465 100644
--- a/xen/arch/x86/cpu/mcheck/mce_amd.c
+++ b/xen/arch/x86/cpu/mcheck/mce_amd.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/* K8 common MCA documentation published at
diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
index 94db396..193366b 100644
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -492,6 +492,9 @@ static int do_cmci_discover(int i)
{
unsigned msr = MSR_IA32_MCx_CTL2(i);
u64 val;
+ unsigned int threshold, max_threshold;
+ static unsigned int cmci_threshold = 2;
+ integer_param("cmci-threshold", cmci_threshold);
rdmsrl(msr, val);
/* Some other CPU already owns this bank. */
@@ -500,15 +503,28 @@ static int do_cmci_discover(int i)
goto out;
}
- val &= ~CMCI_THRESHOLD_MASK;
- wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
- rdmsrl(msr, val);
+ if ( cmci_threshold )
+ {
+ wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK);
+ rdmsrl(msr, val);
+ }
if (!(val & CMCI_EN)) {
/* This bank does not support CMCI. Polling timer has to handle it. */
mcabanks_set(i, __get_cpu_var(no_cmci_banks));
+ wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK);
return 0;
}
+ max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK);
+ threshold = cmci_threshold;
+ if ( threshold > max_threshold )
+ {
+ mce_printk(MCE_QUIET,
+ "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n",
+ threshold, smp_processor_id(), i, max_threshold);
+ threshold = max_threshold;
+ }
+ wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold);
mcabanks_set(i, __get_cpu_var(mce_banks_owned));
out:
mcabanks_clear(i, __get_cpu_var(no_cmci_banks));
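
To summarize the new discovery flow: writing the threshold field as all-ones
and reading back which bits stick reveals the bank's maximum threshold, and
the requested "cmci-threshold" value is then clamped to that. A condensed,
standalone sketch of that step (the mask value here is assumed, not taken
from the headers):

    /* Sketch of the probe-then-clamp step above; mask layout assumed. */
    #define EX_CMCI_THRESHOLD_MASK 0x7fffULL

    static unsigned int clamp_cmci_threshold(uint64_t readback,
                                             unsigned int wanted)
    {
        /* Bits surviving the all-ones write give the bank's maximum. */
        unsigned int max = MASK_EXTR(readback, EX_CMCI_THRESHOLD_MASK);

        return wanted > max ? max : wanted;
    }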
diff --git a/xen/arch/x86/cpu/mcheck/mce_quirks.h b/xen/arch/x86/cpu/mcheck/mce_quirks.h
index 54cddd5..2c93a31 100644
--- a/xen/arch/x86/cpu/mcheck/mce_quirks.h
+++ b/xen/arch/x86/cpu/mcheck/mce_quirks.h
@@ -12,8 +12,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _MCE_QUIRK_H
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c b/xen/arch/x86/cpu/mcheck/mctelem.c
index b8da465..95e83c5 100644
--- a/xen/arch/x86/cpu/mcheck/mctelem.c
+++ b/xen/arch/x86/cpu/mcheck/mctelem.c
@@ -260,8 +260,7 @@ void __init mctelem_init(unsigned int datasz)
if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
MC_NENT)) == NULL ||
(datarr = xmalloc_bytes(MC_NENT * datasz)) == NULL) {
- if (mctctl.mctc_elems)
- xfree(mctctl.mctc_elems);
+ xfree(mctctl.mctc_elems);
printk("Allocations for MCA telemetry failed\n");
return;
}
diff --git a/xen/arch/x86/cpu/mcheck/vmce.c b/xen/arch/x86/cpu/mcheck/vmce.c
index ba1693c..b136320 100644
--- a/xen/arch/x86/cpu/mcheck/vmce.c
+++ b/xen/arch/x86/cpu/mcheck/vmce.c
@@ -15,8 +15,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/init.h>
diff --git a/xen/arch/x86/cpu/mcheck/x86_mca.h b/xen/arch/x86/cpu/mcheck/x86_mca.h
index a2cd37e..76467d6 100644
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef X86_MCA_H
@@ -87,8 +86,6 @@
#define K8_HWCR_MCi_STATUS_WREN (1ULL << 18)
/*Intel Specific bitfield*/
-#define CMCI_THRESHOLD 0x2
-
#define MCi_MISC_ADDRMOD_MASK (0x7UL << 6)
#define MCi_MISC_PHYSMOD (0x2UL << 6)
diff --git a/xen/arch/x86/cpu/mtrr/generic.c b/xen/arch/x86/cpu/mtrr/generic.c
index 493830b..935f0a0 100644
--- a/xen/arch/x86/cpu/mtrr/generic.c
+++ b/xen/arch/x86/cpu/mtrr/generic.c
@@ -182,6 +182,18 @@ static void __init print_mtrr_state(const char *level)
else
printk("%s %u disabled\n", level, i);
}
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
+ && boot_cpu_data.x86 >= 0xf) {
+ uint64_t syscfg, tom2;
+
+ rdmsrl(MSR_K8_SYSCFG, syscfg);
+ if (syscfg & (1 << 21)) {
+ rdmsrl(MSR_K8_TOP_MEM2, tom2);
+ printk("%sTOM2: %012"PRIx64"%s\n", level, tom2,
+ syscfg & (1 << 22) ? " (WB)" : "");
+ }
+ }
}
/* Some BIOS's are fucked and don't set all MTRRs the same! */
diff --git a/xen/arch/x86/cpu/mtrr/main.c b/xen/arch/x86/cpu/mtrr/main.c
index f5d5317..bf489e3 100644
--- a/xen/arch/x86/cpu/mtrr/main.c
+++ b/xen/arch/x86/cpu/mtrr/main.c
@@ -14,8 +14,7 @@
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
- License along with this library; if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ License along with this library; If not, see <http://www.gnu.org/licenses/>.
Richard Gooch may be reached by email at rgooch at atnf.csiro.au
The postal address is:
@@ -36,6 +35,7 @@
#include <xen/lib.h>
#include <xen/smp.h>
#include <xen/spinlock.h>
+#include <asm/atomic.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
diff --git a/xen/arch/x86/cpu/mwait-idle.c b/xen/arch/x86/cpu/mwait-idle.c
index 6dd5822..07d8794 100644
--- a/xen/arch/x86/cpu/mwait-idle.c
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
@@ -196,28 +195,22 @@ static const struct cpuidle_state byt_cstates[] = {
.target_residency = 1,
},
{
- .name = "C1E-BYT",
- .flags = MWAIT2flg(0x01),
- .exit_latency = 15,
- .target_residency = 30,
- },
- {
.name = "C6N-BYT",
.flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TLB_FLUSHED,
- .exit_latency = 40,
+ .exit_latency = 300,
.target_residency = 275,
},
{
.name = "C6S-BYT",
.flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
- .exit_latency = 140,
+ .exit_latency = 500,
.target_residency = 560,
},
{
.name = "C7-BYT",
.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 1200,
- .target_residency = 1500,
+ .target_residency = 4000,
},
{
.name = "C7S-BYT",
@@ -228,6 +221,40 @@ static const struct cpuidle_state byt_cstates[] = {
{}
};
+static const struct cpuidle_state cht_cstates[] = {
+ {
+ .name = "C1-CHT",
+ .flags = MWAIT2flg(0x00),
+ .exit_latency = 1,
+ .target_residency = 1,
+ },
+ {
+ .name = "C6N-CHT",
+ .flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 80,
+ .target_residency = 275,
+ },
+ {
+ .name = "C6S-CHT",
+ .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 200,
+ .target_residency = 560,
+ },
+ {
+ .name = "C7-CHT",
+ .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 1200,
+ .target_residency = 4000,
+ },
+ {
+ .name = "C7S-CHT",
+ .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 10000,
+ .target_residency = 20000,
+ },
+ {}
+};
+
static const struct cpuidle_state ivb_cstates[] = {
{
.name = "C1-IVB",
@@ -536,7 +563,6 @@ static void mwait_idle(void)
return;
}
- power->last_state = cx;
eax = cx->address;
cstate = ((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
@@ -555,6 +581,8 @@ static void mwait_idle(void)
before = cpuidle_get_tick();
TRACE_4D(TRC_PM_IDLE_ENTRY, cx->type, before, exp, pred);
+ update_last_cx_stat(power, cx, before);
+
if (cpu_is_haltable(cpu))
mwait_idle_with_hints(eax, MWAIT_ECX_INTERRUPT_BREAK);
@@ -565,15 +593,13 @@ static void mwait_idle(void)
TRACE_6D(TRC_PM_IDLE_EXIT, cx->type, after,
irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
+ /* Now back in C0. */
update_idle_stats(power, cx, before, after);
local_irq_enable();
if (!(lapic_timer_reliable_states & (1 << cstate)))
lapic_timer_on();
- /* Now back in C0. */
- power->last_state = &power->states[0];
-
sched_tick_resume();
cpufreq_dbs_timer_resume();
@@ -631,6 +657,12 @@ static const struct idle_cpu idle_cpu_byt = {
.byt_auto_demotion_disable_flag = 1,
};
+static const struct idle_cpu idle_cpu_cht = {
+ .state_table = cht_cstates,
+ .disable_promotion_to_c1e = 1,
+ .byt_auto_demotion_disable_flag = 1,
+};
+
static const struct idle_cpu idle_cpu_ivb = {
.state_table = ivb_cstates,
.disable_promotion_to_c1e = 1,
@@ -656,12 +688,11 @@ static const struct idle_cpu idle_cpu_avn = {
.disable_promotion_to_c1e = 1,
};
-#define ICPU(model, cpu) { 6, model, &idle_cpu_##cpu }
+#define ICPU(model, cpu) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, \
+ &idle_cpu_##cpu}
-static struct intel_idle_id {
- unsigned int family, model;
- const struct idle_cpu *data;
-} intel_idle_ids[] __initdata = {
+static const struct x86_cpu_id intel_idle_ids[] __initconst = {
ICPU(0x1a, nehalem),
ICPU(0x1e, nehalem),
ICPU(0x1f, nehalem),
@@ -675,6 +706,7 @@ static struct intel_idle_id {
ICPU(0x2d, snb),
ICPU(0x36, atom),
ICPU(0x37, byt),
+ ICPU(0x4c, cht),
ICPU(0x3a, ivb),
ICPU(0x3e, ivt),
ICPU(0x3c, hsw),
@@ -683,6 +715,7 @@ static struct intel_idle_id {
ICPU(0x46, hsw),
ICPU(0x4d, avn),
ICPU(0x3d, bdw),
+ ICPU(0x47, bdw),
ICPU(0x4f, bdw),
ICPU(0x56, bdw),
{}
@@ -722,23 +755,17 @@ static void __init mwait_idle_state_table_update(void)
static int __init mwait_idle_probe(void)
{
unsigned int eax, ebx, ecx;
- const struct intel_idle_id *id;
+ const struct x86_cpu_id *id = x86_match_cpu(intel_idle_ids);
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- !boot_cpu_has(X86_FEATURE_MWAIT) ||
- boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
- return -ENODEV;
-
- for (id = intel_idle_ids; id->family; ++id)
- if (id->family == boot_cpu_data.x86 &&
- id->model == boot_cpu_data.x86_model)
- break;
- if (!id->family) {
+ if (!id) {
pr_debug(PREFIX "does not run on family %d model %d\n",
boot_cpu_data.x86, boot_cpu_data.x86_model);
return -ENODEV;
}
+ if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return -ENODEV;
+
cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
@@ -753,7 +780,7 @@ static int __init mwait_idle_probe(void)
pr_debug(PREFIX "MWAIT substates: %#x\n", mwait_substates);
- icpu = id->data;
+ icpu = id->driver_data;
cpuidle_state_table = icpu->state_table;
if (boot_cpu_has(X86_FEATURE_ARAT))
diff --git a/xen/arch/x86/cpu/vpmu.c b/xen/arch/x86/cpu/vpmu.c
new file mode 100644
index 0000000..8af3df1
--- /dev/null
+++ b/xen/arch/x86/cpu/vpmu.c
@@ -0,0 +1,817 @@
+/*
+ * vpmu.c: PMU virtualization for HVM domain.
+ *
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Haitao Shan <haitao.shan at intel.com>
+ */
+#include <xen/config.h>
+#include <xen/sched.h>
+#include <xen/xenoprof.h>
+#include <xen/event.h>
+#include <xen/guest_access.h>
+#include <asm/regs.h>
+#include <asm/types.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+#include <asm/p2m.h>
+#include <asm/vpmu.h>
+#include <asm/hvm/support.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <asm/hvm/svm/svm.h>
+#include <asm/hvm/svm/vmcb.h>
+#include <asm/apic.h>
+#include <public/pmu.h>
+#include <xsm/xsm.h>
+
+#include <compat/pmu.h>
+CHECK_pmu_cntr_pair;
+CHECK_pmu_data;
+CHECK_pmu_params;
+
+/*
+ * "vpmu" : vpmu generally enabled
+ * "vpmu=off" : vpmu generally disabled
+ * "vpmu=bts" : vpmu enabled and Intel BTS feature switched on.
+ */
+static unsigned int __read_mostly opt_vpmu_enabled;
+unsigned int __read_mostly vpmu_mode = XENPMU_MODE_OFF;
+unsigned int __read_mostly vpmu_features = 0;
+static void parse_vpmu_param(char *s);
+custom_param("vpmu", parse_vpmu_param);
+
+static DEFINE_SPINLOCK(vpmu_lock);
+static unsigned vpmu_count;
+
+static DEFINE_PER_CPU(struct vcpu *, last_vcpu);
+
+static void __init parse_vpmu_param(char *s)
+{
+ switch ( parse_bool(s) )
+ {
+ case 0:
+ break;
+ default:
+ if ( !strcmp(s, "bts") )
+ vpmu_features |= XENPMU_FEATURE_INTEL_BTS;
+ else if ( *s )
+ {
+ printk("VPMU: unknown flag: %s - vpmu disabled!\n", s);
+ break;
+ }
+ /* fall through */
+ case 1:
+ /* Default VPMU mode */
+ vpmu_mode = XENPMU_MODE_SELF;
+ opt_vpmu_enabled = 1;
+ break;
+ }
+}
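
Spelled out, the handler above produces the following outcomes (a summary
inferred from the code, not normative documentation):

    /*
     * parse_vpmu_param() outcomes, as implemented above:
     *   vpmu        -> vpmu_mode = XENPMU_MODE_SELF, opt_vpmu_enabled = 1
     *   vpmu=off    -> parse_bool() == 0, everything stays XENPMU_MODE_OFF
     *   vpmu=bts    -> XENPMU_FEATURE_INTEL_BTS set, then falls through
     *                  to the enable path (mode SELF)
     *   vpmu=<junk> -> warning printed, VPMU left disabled
     */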
+
+void vpmu_lvtpc_update(uint32_t val)
+{
+ struct vpmu_struct *vpmu;
+ struct vcpu *curr = current;
+
+ if ( likely(vpmu_mode == XENPMU_MODE_OFF) )
+ return;
+
+ vpmu = vcpu_vpmu(curr);
+
+ vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR | (val & APIC_LVT_MASKED);
+
+ /* Postpone APIC updates for PV(H) guests if PMU interrupt is pending */
+ if ( is_hvm_vcpu(curr) || !vpmu->xenpmu_data ||
+ !vpmu_is_set(vpmu, VPMU_CACHED) )
+ apic_write(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
+}
+
+int vpmu_do_msr(unsigned int msr, uint64_t *msr_content,
+ uint64_t supported, bool_t is_write)
+{
+ struct vcpu *curr = current;
+ struct vpmu_struct *vpmu;
+ const struct arch_vpmu_ops *ops;
+ int ret = 0;
+
+ if ( likely(vpmu_mode == XENPMU_MODE_OFF) ||
+ ((vpmu_mode & XENPMU_MODE_ALL) &&
+ !is_hardware_domain(current->domain)) )
+ goto nop;
+
+ vpmu = vcpu_vpmu(curr);
+ ops = vpmu->arch_vpmu_ops;
+ if ( !ops )
+ goto nop;
+
+ if ( is_write && ops->do_wrmsr )
+ ret = ops->do_wrmsr(msr, *msr_content, supported);
+ else if ( !is_write && ops->do_rdmsr )
+ ret = ops->do_rdmsr(msr, msr_content);
+ else
+ goto nop;
+
+ /*
+ * We may have received a PMU interrupt while handling MSR access
+ * and since do_wr/rdmsr may load VPMU context we should save
+ * (and unload) it again.
+ */
+ if ( !is_hvm_vcpu(curr) && vpmu->xenpmu_data &&
+ vpmu_is_set(vpmu, VPMU_CACHED) )
+ {
+ vpmu_set(vpmu, VPMU_CONTEXT_SAVE);
+ ops->arch_vpmu_save(curr, 0);
+ vpmu_reset(vpmu, VPMU_CONTEXT_SAVE | VPMU_CONTEXT_LOADED);
+ }
+
+ return ret;
+
+ nop:
+ if ( !is_write )
+ *msr_content = 0;
+
+ return 0;
+}
+
+static inline struct vcpu *choose_hwdom_vcpu(void)
+{
+ unsigned idx;
+
+ if ( hardware_domain->max_vcpus == 0 )
+ return NULL;
+
+ idx = smp_processor_id() % hardware_domain->max_vcpus;
+
+ return hardware_domain->vcpu[idx];
+}
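
The modulo mapping above spreads samples over the hardware domain's vcpus;
for instance (illustrative numbers):

    /* E.g. with 4 hardware-domain vcpus, a sample taken on pcpu 5 is
     * delivered to hwdom vcpu 5 % 4 == 1. */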
+
+void vpmu_do_interrupt(struct cpu_user_regs *regs)
+{
+ struct vcpu *sampled = current, *sampling;
+ struct vpmu_struct *vpmu;
+ struct vlapic *vlapic;
+ u32 vlapic_lvtpc;
+
+ /*
+ * dom0 will handle interrupts for special domains (e.g. the idle domain) or,
+ * in XENPMU_MODE_ALL, for everyone.
+ */
+ if ( (vpmu_mode & XENPMU_MODE_ALL) ||
+ (sampled->domain->domain_id >= DOMID_FIRST_RESERVED) )
+ {
+ sampling = choose_hwdom_vcpu();
+ if ( !sampling )
+ return;
+ }
+ else
+ sampling = sampled;
+
+ vpmu = vcpu_vpmu(sampling);
+ if ( !vpmu->arch_vpmu_ops )
+ return;
+
+ /* PV(H) guest */
+ if ( !is_hvm_vcpu(sampling) || (vpmu_mode & XENPMU_MODE_ALL) )
+ {
+ const struct cpu_user_regs *cur_regs;
+ uint64_t *flags = &vpmu->xenpmu_data->pmu.pmu_flags;
+ domid_t domid;
+
+ if ( !vpmu->xenpmu_data )
+ return;
+
+ if ( is_pvh_vcpu(sampling) &&
+ !(vpmu_mode & XENPMU_MODE_ALL) &&
+ !vpmu->arch_vpmu_ops->do_interrupt(regs) )
+ return;
+
+ if ( vpmu_is_set(vpmu, VPMU_CACHED) )
+ return;
+
+ /* PV guest will be reading PMU MSRs from xenpmu_data */
+ vpmu_set(vpmu, VPMU_CONTEXT_SAVE | VPMU_CONTEXT_LOADED);
+ vpmu->arch_vpmu_ops->arch_vpmu_save(sampling, 1);
+ vpmu_reset(vpmu, VPMU_CONTEXT_SAVE | VPMU_CONTEXT_LOADED);
+
+ if ( has_hvm_container_vcpu(sampled) )
+ *flags = 0;
+ else
+ *flags = PMU_SAMPLE_PV;
+
+ if ( sampled == sampling )
+ domid = DOMID_SELF;
+ else
+ domid = sampled->domain->domain_id;
+
+ /* Store appropriate registers in xenpmu_data */
+ /* FIXME: 32-bit PVH should go here as well */
+ if ( is_pv_32bit_vcpu(sampling) )
+ {
+ /*
+ * 32-bit dom0 cannot process Xen's addresses (which are 64 bit)
+ * and therefore we treat it the same way as a non-privileged
+ * PV 32-bit domain.
+ */
+ struct compat_pmu_regs *cmp;
+
+ cur_regs = guest_cpu_user_regs();
+
+ cmp = (void *)&vpmu->xenpmu_data->pmu.r.regs;
+ cmp->ip = cur_regs->rip;
+ cmp->sp = cur_regs->rsp;
+ cmp->flags = cur_regs->eflags;
+ cmp->ss = cur_regs->ss;
+ cmp->cs = cur_regs->cs;
+ if ( (cmp->cs & 3) > 1 )
+ *flags |= PMU_SAMPLE_USER;
+ }
+ else
+ {
+ struct xen_pmu_regs *r = &vpmu->xenpmu_data->pmu.r.regs;
+
+ if ( (vpmu_mode & XENPMU_MODE_SELF) )
+ cur_regs = guest_cpu_user_regs();
+ else if ( !guest_mode(regs) &&
+ is_hardware_domain(sampling->domain) )
+ {
+ cur_regs = regs;
+ domid = DOMID_XEN;
+ }
+ else
+ cur_regs = guest_cpu_user_regs();
+
+ r->ip = cur_regs->rip;
+ r->sp = cur_regs->rsp;
+ r->flags = cur_regs->eflags;
+
+ if ( !has_hvm_container_vcpu(sampled) )
+ {
+ r->ss = cur_regs->ss;
+ r->cs = cur_regs->cs;
+ if ( !(sampled->arch.flags & TF_kernel_mode) )
+ *flags |= PMU_SAMPLE_USER;
+ }
+ else
+ {
+ struct segment_register seg;
+
+ hvm_get_segment_register(sampled, x86_seg_cs, &seg);
+ r->cs = seg.sel;
+ hvm_get_segment_register(sampled, x86_seg_ss, &seg);
+ r->ss = seg.sel;
+ r->cpl = seg.attr.fields.dpl;
+ if ( !(sampled->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
+ *flags |= PMU_SAMPLE_REAL;
+ }
+ }
+
+ vpmu->xenpmu_data->domain_id = domid;
+ vpmu->xenpmu_data->vcpu_id = sampled->vcpu_id;
+ if ( is_hardware_domain(sampling->domain) )
+ vpmu->xenpmu_data->pcpu_id = smp_processor_id();
+ else
+ vpmu->xenpmu_data->pcpu_id = sampled->vcpu_id;
+
+ vpmu->hw_lapic_lvtpc |= APIC_LVT_MASKED;
+ apic_write(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
+ *flags |= PMU_CACHED;
+ vpmu_set(vpmu, VPMU_CACHED);
+
+ send_guest_vcpu_virq(sampling, VIRQ_XENPMU);
+
+ return;
+ }
+
+ /* HVM guests */
+ vlapic = vcpu_vlapic(sampling);
+
+ /* We don't support (yet) HVM dom0 */
+ ASSERT(sampling == sampled);
+
+ if ( !vpmu->arch_vpmu_ops->do_interrupt(regs) ||
+ !is_vlapic_lvtpc_enabled(vlapic) )
+ return;
+
+ vlapic_lvtpc = vlapic_get_reg(vlapic, APIC_LVTPC);
+
+ switch ( GET_APIC_DELIVERY_MODE(vlapic_lvtpc) )
+ {
+ case APIC_MODE_FIXED:
+ vlapic_set_irq(vlapic, vlapic_lvtpc & APIC_VECTOR_MASK, 0);
+ break;
+ case APIC_MODE_NMI:
+ sampling->nmi_pending = 1;
+ break;
+ }
+}
+
+void vpmu_do_cpuid(unsigned int input,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ struct vpmu_struct *vpmu = vcpu_vpmu(current);
+
+ if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_cpuid )
+ vpmu->arch_vpmu_ops->do_cpuid(input, eax, ebx, ecx, edx);
+}
+
+static void vpmu_save_force(void *arg)
+{
+ struct vcpu *v = (struct vcpu *)arg;
+ struct vpmu_struct *vpmu = vcpu_vpmu(v);
+
+ if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
+ return;
+
+ vpmu_set(vpmu, VPMU_CONTEXT_SAVE);
+
+ if ( vpmu->arch_vpmu_ops )
+ (void)vpmu->arch_vpmu_ops->arch_vpmu_save(v, 0);
+
+ vpmu_reset(vpmu, VPMU_CONTEXT_SAVE);
+
+ per_cpu(last_vcpu, smp_processor_id()) = NULL;
+}
+
+void vpmu_save(struct vcpu *v)
+{
+ struct vpmu_struct *vpmu = vcpu_vpmu(v);
+ int pcpu = smp_processor_id();
+
+ if ( !vpmu_are_all_set(vpmu, VPMU_CONTEXT_ALLOCATED | VPMU_CONTEXT_LOADED) )
+ return;
+
+ vpmu->last_pcpu = pcpu;
+ per_cpu(last_vcpu, pcpu) = v;
+
+ if ( vpmu->arch_vpmu_ops )
+ if ( vpmu->arch_vpmu_ops->arch_vpmu_save(v, 0) )
+ vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
+
+ apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED);
+}
+
+int vpmu_load(struct vcpu *v, bool_t from_guest)
+{
+ struct vpmu_struct *vpmu = vcpu_vpmu(v);
+ int pcpu = smp_processor_id();
+ struct vcpu *prev = NULL;
+
+ if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) )
+ return 0;
+
+ /* First time this VCPU is running here */
+ if ( vpmu->last_pcpu != pcpu )
+ {
+ /*
+ * Get the context from the last pcpu that we ran on. Note that if
+ * another VCPU is running there it must have saved this VCPU's
+ * context before starting to run (see below).
+ * There should be no race since remote pcpu will disable interrupts
+ * before saving the context.
+ */
+ if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
+ {
+ on_selected_cpus(cpumask_of(vpmu->last_pcpu),
+ vpmu_save_force, (void *)v, 1);
+ vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
+ }
+ }
+
+ /* Prevent forced context save from remote CPU */
+ local_irq_disable();
+
+ prev = per_cpu(last_vcpu, pcpu);
+
+ if ( prev != v && prev )
+ {
+ vpmu = vcpu_vpmu(prev);
+
+ /* Someone ran here before us */
+ vpmu_save_force(prev);
+ vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
+
+ vpmu = vcpu_vpmu(v);
+ }
+
+ local_irq_enable();
+
+ /* Only load the PMU context immediately when the PMU is counting. */
+ if ( !vpmu_is_set(vpmu, VPMU_RUNNING) ||
+ (!is_hvm_vcpu(vpmu_vcpu(vpmu)) && vpmu_is_set(vpmu, VPMU_CACHED)) )
+ return 0;
+
+ if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_load )
+ {
+ int ret;
+
+ apic_write_around(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
+ /* Arch code needs to set VPMU_CONTEXT_LOADED */
+ ret = vpmu->arch_vpmu_ops->arch_vpmu_load(v, from_guest);
+ if ( ret )
+ {
+ apic_write_around(APIC_LVTPC,
+ vpmu->hw_lapic_lvtpc | APIC_LVT_MASKED);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+void vpmu_initialise(struct vcpu *v)
+{
+ struct vpmu_struct *vpmu = vcpu_vpmu(v);
+ uint8_t vendor = current_cpu_data.x86_vendor;
+ int ret;
+ bool_t is_priv_vpmu = is_hardware_domain(v->domain);
+
+ BUILD_BUG_ON(sizeof(struct xen_pmu_intel_ctxt) > XENPMU_CTXT_PAD_SZ);
+ BUILD_BUG_ON(sizeof(struct xen_pmu_amd_ctxt) > XENPMU_CTXT_PAD_SZ);
+ BUILD_BUG_ON(sizeof(struct xen_pmu_regs) > XENPMU_REGS_PAD_SZ);
+ BUILD_BUG_ON(sizeof(struct compat_pmu_regs) > XENPMU_REGS_PAD_SZ);
+
+ ASSERT(!vpmu->flags && !vpmu->context);
+
+ if ( !is_priv_vpmu )
+ {
+ /*
+ * Count active VPMUs so that we won't try to change vpmu_mode while
+ * they are in use.
+ * vpmu_mode can be safely updated while dom0's VPMUs are active and
+ * so we don't need to include it in the count.
+ */
+ spin_lock(&vpmu_lock);
+ vpmu_count++;
+ spin_unlock(&vpmu_lock);
+ }
+
+ switch ( vendor )
+ {
+ case X86_VENDOR_AMD:
+ ret = svm_vpmu_initialise(v);
+ break;
+
+ case X86_VENDOR_INTEL:
+ ret = vmx_vpmu_initialise(v);
+ break;
+
+ default:
+ if ( vpmu_mode != XENPMU_MODE_OFF )
+ {
+ printk(XENLOG_G_WARNING "VPMU: Unknown CPU vendor %d. "
+ "Disabling VPMU\n", vendor);
+ opt_vpmu_enabled = 0;
+ vpmu_mode = XENPMU_MODE_OFF;
+ }
+ return; /* Don't bother restoring vpmu_count, VPMU is off forever */
+ }
+
+ if ( ret )
+ printk(XENLOG_G_WARNING "VPMU: Initialization failed for %pv\n", v);
+
+ /* Intel needs to initialize VPMU ops even if VPMU is not in use */
+ if ( !is_priv_vpmu &&
+ (ret || (vpmu_mode == XENPMU_MODE_OFF) ||
+ (vpmu_mode == XENPMU_MODE_ALL)) )
+ {
+ spin_lock(&vpmu_lock);
+ vpmu_count--;
+ spin_unlock(&vpmu_lock);
+ }
+}
+
+static void vpmu_clear_last(void *arg)
+{
+ if ( this_cpu(last_vcpu) == arg )
+ this_cpu(last_vcpu) = NULL;
+}
+
+void vpmu_destroy(struct vcpu *v)
+{
+ struct vpmu_struct *vpmu = vcpu_vpmu(v);
+
+ if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) )
+ return;
+
+ /*
+ * Need to clear last_vcpu in case it points to v.
+ * We can check here non-atomically whether it is 'v' since
+ * last_vcpu can never become 'v' again at this point.
+ * We will test it again in vpmu_clear_last() with interrupts
+ * disabled to make sure we don't clear someone else's entry.
+ */
+ if ( per_cpu(last_vcpu, vpmu->last_pcpu) == v )
+ on_selected_cpus(cpumask_of(vpmu->last_pcpu),
+ vpmu_clear_last, v, 1);
+
+ if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_destroy )
+ {
+ /* Unload VPMU first. This will stop counters */
+ on_selected_cpus(cpumask_of(vcpu_vpmu(v)->last_pcpu),
+ vpmu_save_force, v, 1);
+ vpmu->arch_vpmu_ops->arch_vpmu_destroy(v);
+ }
+
+ spin_lock(&vpmu_lock);
+ if ( !is_hardware_domain(v->domain) )
+ vpmu_count--;
+ spin_unlock(&vpmu_lock);
+}
+
+static int pvpmu_init(struct domain *d, xen_pmu_params_t *params)
+{
+ struct vcpu *v;
+ struct vpmu_struct *vpmu;
+ struct page_info *page;
+ uint64_t gfn = params->val;
+
+ if ( (vpmu_mode == XENPMU_MODE_OFF) ||
+ ((vpmu_mode & XENPMU_MODE_ALL) && !is_hardware_domain(d)) )
+ return -EINVAL;
+
+ if ( (params->vcpu >= d->max_vcpus) || (d->vcpu[params->vcpu] == NULL) )
+ return -EINVAL;
+
+ page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
+ if ( !page )
+ return -EINVAL;
+
+ if ( !get_page_type(page, PGT_writable_page) )
+ {
+ put_page(page);
+ return -EINVAL;
+ }
+
+ v = d->vcpu[params->vcpu];
+ vpmu = vcpu_vpmu(v);
+
+ spin_lock(&vpmu->vpmu_lock);
+
+ if ( v->arch.vpmu.xenpmu_data )
+ {
+ spin_unlock(&vpmu->vpmu_lock);
+ put_page_and_type(page);
+ return -EEXIST;
+ }
+
+ v->arch.vpmu.xenpmu_data = __map_domain_page_global(page);
+ if ( !v->arch.vpmu.xenpmu_data )
+ {
+ spin_unlock(&vpmu->vpmu_lock);
+ put_page_and_type(page);
+ return -ENOMEM;
+ }
+
+ vpmu_initialise(v);
+
+ spin_unlock(&vpmu->vpmu_lock);
+
+ return 0;
+}
+
+static void pvpmu_finish(struct domain *d, xen_pmu_params_t *params)
+{
+ struct vcpu *v;
+ struct vpmu_struct *vpmu;
+ uint64_t mfn;
+ void *xenpmu_data;
+
+ if ( (params->vcpu >= d->max_vcpus) || (d->vcpu[params->vcpu] == NULL) )
+ return;
+
+ v = d->vcpu[params->vcpu];
+ if ( v != current )
+ vcpu_pause(v);
+
+ vpmu = vcpu_vpmu(v);
+ spin_lock(&vpmu->vpmu_lock);
+
+ vpmu_destroy(v);
+ xenpmu_data = vpmu->xenpmu_data;
+ vpmu->xenpmu_data = NULL;
+
+ spin_unlock(&vpmu->vpmu_lock);
+
+ if ( xenpmu_data )
+ {
+ mfn = domain_page_map_to_mfn(xenpmu_data);
+ ASSERT(mfn_valid(mfn));
+ unmap_domain_page_global(xenpmu_data);
+ put_page_and_type(mfn_to_page(mfn));
+ }
+
+ if ( v != current )
+ vcpu_unpause(v);
+}
+
+/* Dump some vpmu information on the console. Used in keyhandler dump_domains(). */
+void vpmu_dump(struct vcpu *v)
+{
+ struct vpmu_struct *vpmu = vcpu_vpmu(v);
+
+ if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_dump )
+ vpmu->arch_vpmu_ops->arch_vpmu_dump(v);
+}
+
+long do_xenpmu_op(unsigned int op, XEN_GUEST_HANDLE_PARAM(xen_pmu_params_t) arg)
+{
+ int ret;
+ struct vcpu *curr;
+ struct xen_pmu_params pmu_params = {.val = 0};
+ struct xen_pmu_data *xenpmu_data;
+ struct vpmu_struct *vpmu;
+
+ if ( !opt_vpmu_enabled )
+ return -EOPNOTSUPP;
+
+ ret = xsm_pmu_op(XSM_OTHER, current->domain, op);
+ if ( ret )
+ return ret;
+
+ /* Check major version when parameters are specified */
+ switch ( op )
+ {
+ case XENPMU_mode_set:
+ case XENPMU_feature_set:
+ case XENPMU_init:
+ case XENPMU_finish:
+ if ( copy_from_guest(&pmu_params, arg, 1) )
+ return -EFAULT;
+
+ if ( pmu_params.version.maj != XENPMU_VER_MAJ )
+ return -EINVAL;
+ }
+
+ switch ( op )
+ {
+ case XENPMU_mode_set:
+ {
+ if ( (pmu_params.val &
+ ~(XENPMU_MODE_SELF | XENPMU_MODE_HV | XENPMU_MODE_ALL)) ||
+ (hweight64(pmu_params.val) > 1) )
+ return -EINVAL;
+
+ /* 32-bit dom0 can only sample itself. */
+ if ( is_pv_32bit_vcpu(current) &&
+ (pmu_params.val & (XENPMU_MODE_HV | XENPMU_MODE_ALL)) )
+ return -EINVAL;
+
+ spin_lock(&vpmu_lock);
+
+ /*
+ * We can always safely switch between XENPMU_MODE_SELF and
+ * XENPMU_MODE_HV while other VPMUs are active.
+ */
+ if ( (vpmu_count == 0) ||
+ ((vpmu_mode ^ pmu_params.val) ==
+ (XENPMU_MODE_SELF | XENPMU_MODE_HV)) )
+ vpmu_mode = pmu_params.val;
+ else if ( vpmu_mode != pmu_params.val )
+ {
+ printk(XENLOG_WARNING
+ "VPMU: Cannot change mode while active VPMUs exist\n");
+ ret = -EBUSY;
+ }
+
+ spin_unlock(&vpmu_lock);
+
+ break;
+ }
+
+ case XENPMU_mode_get:
+ memset(&pmu_params, 0, sizeof(pmu_params));
+ pmu_params.val = vpmu_mode;
+
+ pmu_params.version.maj = XENPMU_VER_MAJ;
+ pmu_params.version.min = XENPMU_VER_MIN;
+
+ if ( copy_to_guest(arg, &pmu_params, 1) )
+ ret = -EFAULT;
+
+ break;
+
+ case XENPMU_feature_set:
+ if ( pmu_params.val & ~XENPMU_FEATURE_INTEL_BTS )
+ return -EINVAL;
+
+ spin_lock(&vpmu_lock);
+
+ if ( (vpmu_count == 0) || (vpmu_features == pmu_params.val) )
+ vpmu_features = pmu_params.val;
+ else
+ {
+ printk(XENLOG_WARNING "VPMU: Cannot change features while"
+ " active VPMUs exist\n");
+ ret = -EBUSY;
+ }
+
+ spin_unlock(&vpmu_lock);
+
+ break;
+
+ case XENPMU_feature_get:
+ pmu_params.val = vpmu_features;
+ if ( copy_field_to_guest(arg, &pmu_params, val) )
+ ret = -EFAULT;
+
+ break;
+
+ case XENPMU_init:
+ ret = pvpmu_init(current->domain, &pmu_params);
+ break;
+
+ case XENPMU_finish:
+ pvpmu_finish(current->domain, &pmu_params);
+ break;
+
+ case XENPMU_lvtpc_set:
+ xenpmu_data = current->arch.vpmu.xenpmu_data;
+ if ( xenpmu_data != NULL )
+ vpmu_lvtpc_update(xenpmu_data->pmu.l.lapic_lvtpc);
+ else
+ ret = -EINVAL;
+ break;
+
+ case XENPMU_flush:
+ curr = current;
+ vpmu = vcpu_vpmu(curr);
+ xenpmu_data = curr->arch.vpmu.xenpmu_data;
+ if ( xenpmu_data == NULL )
+ return -EINVAL;
+ xenpmu_data->pmu.pmu_flags &= ~PMU_CACHED;
+ vpmu_reset(vpmu, VPMU_CACHED);
+ vpmu_lvtpc_update(xenpmu_data->pmu.l.lapic_lvtpc);
+ if ( vpmu_load(curr, 1) )
+ {
+ xenpmu_data->pmu.pmu_flags |= PMU_CACHED;
+ vpmu_set(vpmu, VPMU_CACHED);
+ ret = -EIO;
+ }
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int __init vpmu_init(void)
+{
+ int vendor = current_cpu_data.x86_vendor;
+
+ if ( !opt_vpmu_enabled )
+ {
+ printk(XENLOG_INFO "VPMU: disabled\n");
+ return 0;
+ }
+
+ /* NMI watchdog uses LVTPC and HW counter */
+ if ( opt_watchdog && opt_vpmu_enabled )
+ {
+ printk(XENLOG_WARNING "NMI watchdog is enabled. Turning VPMU off.\n");
+ opt_vpmu_enabled = 0;
+ vpmu_mode = XENPMU_MODE_OFF;
+ return 0;
+ }
+
+ switch ( vendor )
+ {
+ case X86_VENDOR_AMD:
+ if ( amd_vpmu_init() )
+ vpmu_mode = XENPMU_MODE_OFF;
+ break;
+ case X86_VENDOR_INTEL:
+ if ( core2_vpmu_init() )
+ vpmu_mode = XENPMU_MODE_OFF;
+ break;
+ default:
+ printk(XENLOG_WARNING "VPMU: Unknown CPU vendor: %d. "
+ "Turning VPMU off.\n", vendor);
+ vpmu_mode = XENPMU_MODE_OFF;
+ break;
+ }
+
+ if ( vpmu_mode != XENPMU_MODE_OFF )
+ printk(XENLOG_INFO "VPMU: version " __stringify(XENPMU_VER_MAJ) "."
+ __stringify(XENPMU_VER_MIN) "\n");
+ else
+ opt_vpmu_enabled = 0;
+
+ return 0;
+}
+__initcall(vpmu_init);
diff --git a/xen/arch/x86/hvm/svm/vpmu.c b/xen/arch/x86/cpu/vpmu_amd.c
similarity index 56%
rename from xen/arch/x86/hvm/svm/vpmu.c
rename to xen/arch/x86/cpu/vpmu_amd.c
index 4c448bb..04da81a 100644
--- a/xen/arch/x86/hvm/svm/vpmu.c
+++ b/xen/arch/x86/cpu/vpmu_amd.c
@@ -17,8 +17,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
@@ -28,12 +27,9 @@
#include <xen/sched.h>
#include <xen/irq.h>
#include <asm/apic.h>
+#include <asm/vpmu.h>
#include <asm/hvm/vlapic.h>
-#include <asm/hvm/vpmu.h>
-
-#define F10H_NUM_COUNTERS 4
-#define F15H_NUM_COUNTERS 6
-#define MAX_NUM_COUNTERS F15H_NUM_COUNTERS
+#include <public/pmu.h>
#define MSR_F10H_EVNTSEL_GO_SHIFT 40
#define MSR_F10H_EVNTSEL_EN_SHIFT 22
@@ -49,6 +45,13 @@ static const u32 __read_mostly *counters;
static const u32 __read_mostly *ctrls;
static bool_t __read_mostly k7_counters_mirrored;
+/* Total size of PMU registers block (copied to/from PV(H) guest) */
+static unsigned int __read_mostly regs_sz;
+
+#define F10H_NUM_COUNTERS 4
+#define F15H_NUM_COUNTERS 6
+#define MAX_NUM_COUNTERS F15H_NUM_COUNTERS
+
/* PMU Counter MSRs. */
static const u32 AMD_F10H_COUNTERS[] = {
MSR_K7_PERFCTR0,
@@ -83,24 +86,38 @@ static const u32 AMD_F15H_CTRLS[] = {
MSR_AMD_FAM15H_EVNTSEL5
};
-/* storage for context switching */
-struct amd_vpmu_context {
- u64 counters[MAX_NUM_COUNTERS];
- u64 ctrls[MAX_NUM_COUNTERS];
- bool_t msr_bitmap_set;
-};
-
-static inline int get_pmu_reg_type(u32 addr)
+/* Bits [63:42], [39:36], 21 and 19 are reserved */
+#define CTRL_RSVD_MASK ((-1ULL & (~((1ULL << 42) - 1))) | \
+ (0xfULL << 36) | (1ULL << 21) | (1ULL << 19))
+static uint64_t __read_mostly ctrl_rsvd[MAX_NUM_COUNTERS];
+
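
For reference, the mask expression above collapses to a single constant; a
compile-time check along these lines (illustrative, not part of the patch;
Xen's BUILD_BUG_ON would live inside an init function) pins the bit layout
down:

    /* Bits 63..42, 39..36, 21 and 19, i.e. 0xfffffcf000280000. */
    BUILD_BUG_ON(CTRL_RSVD_MASK != 0xfffffcf000280000ULL);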
+/* Use private context as a flag for MSR bitmap */
+#define msr_bitmap_on(vpmu) do { \
+ (vpmu)->priv_context = (void *)-1L; \
+ } while (0)
+#define msr_bitmap_off(vpmu) do { \
+ (vpmu)->priv_context = NULL; \
+ } while (0)
+#define is_msr_bitmap_on(vpmu) ((vpmu)->priv_context != NULL)
+
+static inline int get_pmu_reg_type(u32 addr, unsigned int *idx)
{
if ( (addr >= MSR_K7_EVNTSEL0) && (addr <= MSR_K7_EVNTSEL3) )
+ {
+ *idx = addr - MSR_K7_EVNTSEL0;
return MSR_TYPE_CTRL;
+ }
if ( (addr >= MSR_K7_PERFCTR0) && (addr <= MSR_K7_PERFCTR3) )
+ {
+ *idx = addr - MSR_K7_PERFCTR0;
return MSR_TYPE_COUNTER;
+ }
if ( (addr >= MSR_AMD_FAM15H_EVNTSEL0) &&
(addr <= MSR_AMD_FAM15H_PERFCTR5 ) )
{
+ *idx = (addr - MSR_AMD_FAM15H_EVNTSEL0) >> 1;
if (addr & 1)
return MSR_TYPE_COUNTER;
else
@@ -138,11 +155,20 @@ static inline u32 get_fam15h_addr(u32 addr)
return addr;
}
+static void amd_vpmu_init_regs(struct xen_pmu_amd_ctxt *ctxt)
+{
+ unsigned i;
+ uint64_t *ctrl_regs = vpmu_reg_pointer(ctxt, ctrls);
+
+ memset(&ctxt->regs[0], 0, regs_sz);
+ for ( i = 0; i < num_counters; i++ )
+ ctrl_regs[i] = ctrl_rsvd[i];
+}
+
static void amd_vpmu_set_msr_bitmap(struct vcpu *v)
{
unsigned int i;
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct amd_vpmu_context *ctxt = vpmu->context;
for ( i = 0; i < num_counters; i++ )
{
@@ -150,14 +176,13 @@ static void amd_vpmu_set_msr_bitmap(struct vcpu *v)
svm_intercept_msr(v, ctrls[i], MSR_INTERCEPT_WRITE);
}
- ctxt->msr_bitmap_set = 1;
+ msr_bitmap_on(vpmu);
}
static void amd_vpmu_unset_msr_bitmap(struct vcpu *v)
{
unsigned int i;
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct amd_vpmu_context *ctxt = vpmu->context;
for ( i = 0; i < num_counters; i++ )
{
@@ -165,7 +190,7 @@ static void amd_vpmu_unset_msr_bitmap(struct vcpu *v)
svm_intercept_msr(v, ctrls[i], MSR_INTERCEPT_RW);
}
- ctxt->msr_bitmap_set = 0;
+ msr_bitmap_off(vpmu);
}
static int amd_vpmu_do_interrupt(struct cpu_user_regs *regs)
@@ -177,65 +202,102 @@ static inline void context_load(struct vcpu *v)
{
unsigned int i;
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct amd_vpmu_context *ctxt = vpmu->context;
+ struct xen_pmu_amd_ctxt *ctxt = vpmu->context;
+ uint64_t *counter_regs = vpmu_reg_pointer(ctxt, counters);
+ uint64_t *ctrl_regs = vpmu_reg_pointer(ctxt, ctrls);
for ( i = 0; i < num_counters; i++ )
{
- wrmsrl(counters[i], ctxt->counters[i]);
- wrmsrl(ctrls[i], ctxt->ctrls[i]);
+ wrmsrl(counters[i], counter_regs[i]);
+ wrmsrl(ctrls[i], ctrl_regs[i]);
}
}
-static void amd_vpmu_load(struct vcpu *v)
+static int amd_vpmu_load(struct vcpu *v, bool_t from_guest)
{
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct amd_vpmu_context *ctxt = vpmu->context;
+ struct xen_pmu_amd_ctxt *ctxt;
+ uint64_t *ctrl_regs;
+ unsigned int i;
vpmu_reset(vpmu, VPMU_FROZEN);
- if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
+ if ( !from_guest && vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
+ {
+ ctxt = vpmu->context;
+ ctrl_regs = vpmu_reg_pointer(ctxt, ctrls);
+
+ for ( i = 0; i < num_counters; i++ )
+ wrmsrl(ctrls[i], ctrl_regs[i]);
+
+ return 0;
+ }
+
+ if ( from_guest )
{
- unsigned int i;
+ bool_t is_running = 0;
+ struct xen_pmu_amd_ctxt *guest_ctxt = &vpmu->xenpmu_data->pmu.c.amd;
+
+ ASSERT(!is_hvm_vcpu(v));
+
+ ctxt = vpmu->context;
+ ctrl_regs = vpmu_reg_pointer(ctxt, ctrls);
+
+ memcpy(&ctxt->regs[0], &guest_ctxt->regs[0], regs_sz);
for ( i = 0; i < num_counters; i++ )
- wrmsrl(ctrls[i], ctxt->ctrls[i]);
+ {
+ if ( (ctrl_regs[i] & CTRL_RSVD_MASK) != ctrl_rsvd[i] )
+ {
+ /*
+ * Not necessary to re-init the context since we should never
+ * load it until the guest provides valid values. But just to be safe.
+ */
+ amd_vpmu_init_regs(ctxt);
+ return -EINVAL;
+ }
+
+ if ( is_pmu_enabled(ctrl_regs[i]) )
+ is_running = 1;
+ }
- return;
+ if ( is_running )
+ vpmu_set(vpmu, VPMU_RUNNING);
+ else
+ vpmu_reset(vpmu, VPMU_RUNNING);
}
vpmu_set(vpmu, VPMU_CONTEXT_LOADED);
context_load(v);
+
+ return 0;
}
static inline void context_save(struct vcpu *v)
{
unsigned int i;
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct amd_vpmu_context *ctxt = vpmu->context;
+ struct xen_pmu_amd_ctxt *ctxt = vpmu->context;
+ uint64_t *counter_regs = vpmu_reg_pointer(ctxt, counters);
/* No need to save controls -- they are saved in amd_vpmu_do_wrmsr */
for ( i = 0; i < num_counters; i++ )
- rdmsrl(counters[i], ctxt->counters[i]);
+ rdmsrl(counters[i], counter_regs[i]);
}
-static int amd_vpmu_save(struct vcpu *v)
+static int amd_vpmu_save(struct vcpu *v, bool_t to_guest)
{
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct amd_vpmu_context *ctx = vpmu->context;
unsigned int i;
- /*
- * Stop the counters. If we came here via vpmu_save_force (i.e.
- * when VPMU_CONTEXT_SAVE is set) counters are already stopped.
- */
+ /* Stop the counters. */
+ for ( i = 0; i < num_counters; i++ )
+ wrmsrl(ctrls[i], 0);
+
if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_SAVE) )
{
vpmu_set(vpmu, VPMU_FROZEN);
-
- for ( i = 0; i < num_counters; i++ )
- wrmsrl(ctrls[i], 0);
-
return 0;
}
@@ -244,9 +306,20 @@ static int amd_vpmu_save(struct vcpu *v)
context_save(v);
- if ( !vpmu_is_set(vpmu, VPMU_RUNNING) && ctx->msr_bitmap_set )
+ if ( !vpmu_is_set(vpmu, VPMU_RUNNING) &&
+ has_hvm_container_vcpu(v) && is_msr_bitmap_on(vpmu) )
amd_vpmu_unset_msr_bitmap(v);
+ if ( to_guest )
+ {
+ struct xen_pmu_amd_ctxt *guest_ctxt, *ctxt;
+
+ ASSERT(!is_hvm_vcpu(v));
+ ctxt = vpmu->context;
+ guest_ctxt = &vpmu->xenpmu_data->pmu.c.amd;
+ memcpy(&guest_ctxt->regs[0], &ctxt->regs[0], regs_sz);
+ }
+
return 1;
}
@@ -255,7 +328,9 @@ static void context_update(unsigned int msr, u64 msr_content)
unsigned int i;
struct vcpu *v = current;
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct amd_vpmu_context *ctxt = vpmu->context;
+ struct xen_pmu_amd_ctxt *ctxt = vpmu->context;
+ uint64_t *counter_regs = vpmu_reg_pointer(ctxt, counters);
+ uint64_t *ctrl_regs = vpmu_reg_pointer(ctxt, ctrls);
if ( k7_counters_mirrored &&
((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)) )
@@ -267,12 +342,12 @@ static void context_update(unsigned int msr, u64 msr_content)
{
if ( msr == ctrls[i] )
{
- ctxt->ctrls[i] = msr_content;
+ ctrl_regs[i] = msr_content;
return;
}
else if (msr == counters[i] )
{
- ctxt->counters[i] = msr_content;
+ counter_regs[i] = msr_content;
return;
}
}
@@ -283,39 +358,41 @@ static int amd_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
{
struct vcpu *v = current;
struct vpmu_struct *vpmu = vcpu_vpmu(v);
+ unsigned int idx = 0;
+ int type = get_pmu_reg_type(msr, &idx);
ASSERT(!supported);
+ if ( (type == MSR_TYPE_CTRL ) &&
+ ((msr_content & CTRL_RSVD_MASK) != ctrl_rsvd[idx]) )
+ return -EINVAL;
+
/* For all counters, enable guest only mode for HVM guest */
- if ( (get_pmu_reg_type(msr) == MSR_TYPE_CTRL) &&
- !(is_guest_mode(msr_content)) )
+ if ( has_hvm_container_vcpu(v) && (type == MSR_TYPE_CTRL) &&
+ !is_guest_mode(msr_content) )
{
set_guest_mode(msr_content);
}
/* check if the first counter is enabled */
- if ( (get_pmu_reg_type(msr) == MSR_TYPE_CTRL) &&
+ if ( (type == MSR_TYPE_CTRL) &&
is_pmu_enabled(msr_content) && !vpmu_is_set(vpmu, VPMU_RUNNING) )
{
if ( !acquire_pmu_ownership(PMU_OWNER_HVM) )
- return 1;
+ return 0;
vpmu_set(vpmu, VPMU_RUNNING);
- apic_write(APIC_LVTPC, PMU_APIC_VECTOR);
- vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR;
- if ( !((struct amd_vpmu_context *)vpmu->context)->msr_bitmap_set )
- amd_vpmu_set_msr_bitmap(v);
+ if ( has_hvm_container_vcpu(v) && is_msr_bitmap_on(vpmu) )
+ amd_vpmu_set_msr_bitmap(v);
}
/* stop saving & restore if guest stops first counter */
- if ( (get_pmu_reg_type(msr) == MSR_TYPE_CTRL) &&
+ if ( (type == MSR_TYPE_CTRL) &&
(is_pmu_enabled(msr_content) == 0) && vpmu_is_set(vpmu, VPMU_RUNNING) )
{
- apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED);
- vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR | APIC_LVT_MASKED;
vpmu_reset(vpmu, VPMU_RUNNING);
- if ( ((struct amd_vpmu_context *)vpmu->context)->msr_bitmap_set )
- amd_vpmu_unset_msr_bitmap(v);
+ if ( has_hvm_container_vcpu(v) && is_msr_bitmap_on(vpmu) )
+ amd_vpmu_unset_msr_bitmap(v);
release_pmu_ownship(PMU_OWNER_HVM);
}
@@ -332,7 +409,7 @@ static int amd_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
/* Write to hw counters */
wrmsrl(msr, msr_content);
- return 1;
+ return 0;
}
static int amd_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
@@ -350,52 +427,6 @@ static int amd_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
rdmsrl(msr, *msr_content);
- return 1;
-}
-
-static int amd_vpmu_initialise(struct vcpu *v)
-{
- struct amd_vpmu_context *ctxt;
- struct vpmu_struct *vpmu = vcpu_vpmu(v);
- uint8_t family = current_cpu_data.x86;
-
- if ( vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) )
- return 0;
-
- if ( counters == NULL )
- {
- switch ( family )
- {
- case 0x15:
- num_counters = F15H_NUM_COUNTERS;
- counters = AMD_F15H_COUNTERS;
- ctrls = AMD_F15H_CTRLS;
- k7_counters_mirrored = 1;
- break;
- case 0x10:
- case 0x12:
- case 0x14:
- case 0x16:
- default:
- num_counters = F10H_NUM_COUNTERS;
- counters = AMD_F10H_COUNTERS;
- ctrls = AMD_F10H_CTRLS;
- k7_counters_mirrored = 0;
- break;
- }
- }
-
- ctxt = xzalloc(struct amd_vpmu_context);
- if ( !ctxt )
- {
- gdprintk(XENLOG_WARNING, "Insufficient memory for PMU, "
- " PMU feature is unavailable on domain %d vcpu %d.\n",
- v->vcpu_id, v->domain->domain_id);
- return -ENOMEM;
- }
-
- vpmu->context = ctxt;
- vpmu_set(vpmu, VPMU_CONTEXT_ALLOCATED);
return 0;
}
@@ -403,24 +434,26 @@ static void amd_vpmu_destroy(struct vcpu *v)
{
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- if ( ((struct amd_vpmu_context *)vpmu->context)->msr_bitmap_set )
+ if ( has_hvm_container_vcpu(v) && is_msr_bitmap_on(vpmu) )
amd_vpmu_unset_msr_bitmap(v);
xfree(vpmu->context);
- vpmu_reset(vpmu, VPMU_CONTEXT_ALLOCATED);
+ vpmu->context = NULL;
+ vpmu->priv_context = NULL;
if ( vpmu_is_set(vpmu, VPMU_RUNNING) )
- {
- vpmu_reset(vpmu, VPMU_RUNNING);
release_pmu_ownship(PMU_OWNER_HVM);
- }
+
+ vpmu_clear(vpmu);
}
/* VPMU part of the 'q' keyhandler */
static void amd_vpmu_dump(const struct vcpu *v)
{
const struct vpmu_struct *vpmu = vcpu_vpmu(v);
- const struct amd_vpmu_context *ctxt = vpmu->context;
+ const struct xen_pmu_amd_ctxt *ctxt = vpmu->context;
+ const uint64_t *counter_regs = vpmu_reg_pointer(ctxt, counters);
+ const uint64_t *ctrl_regs = vpmu_reg_pointer(ctxt, ctrls);
unsigned int i;
printk(" VPMU state: 0x%x ", vpmu->flags);
@@ -450,8 +483,8 @@ static void amd_vpmu_dump(const struct vcpu *v)
rdmsrl(ctrls[i], ctrl);
rdmsrl(counters[i], cntr);
printk(" %#x: %#lx (%#lx in HW) %#x: %#lx (%#lx in HW)\n",
- ctrls[i], ctxt->ctrls[i], ctrl,
- counters[i], ctxt->counters[i], cntr);
+ ctrls[i], ctrl_regs[i], ctrl,
+ counters[i], counter_regs[i], cntr);
}
}
@@ -465,32 +498,92 @@ struct arch_vpmu_ops amd_vpmu_ops = {
.arch_vpmu_dump = amd_vpmu_dump
};
-int svm_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags)
+int svm_vpmu_initialise(struct vcpu *v)
{
+ struct xen_pmu_amd_ctxt *ctxt;
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- uint8_t family = current_cpu_data.x86;
- int ret = 0;
- /* vpmu enabled? */
- if ( !vpmu_flags )
+ if ( vpmu_mode == XENPMU_MODE_OFF )
return 0;
- switch ( family )
+ if ( !counters )
+ return -EINVAL;
+
+ ctxt = xmalloc_bytes(sizeof(*ctxt) + regs_sz);
+ if ( !ctxt )
+ {
+ printk(XENLOG_G_WARNING "Insufficient memory for PMU, "
+ " PMU feature is unavailable on domain %d vcpu %d.\n",
+ v->vcpu_id, v->domain->domain_id);
+ return -ENOMEM;
+ }
+
+ ctxt->counters = sizeof(*ctxt);
+ ctxt->ctrls = ctxt->counters + sizeof(uint64_t) * num_counters;
+ amd_vpmu_init_regs(ctxt);
+
+ vpmu->context = ctxt;
+ vpmu->priv_context = NULL;
+
+ if ( !is_hvm_vcpu(v) )
{
+ /* Copy register offsets to shared area */
+ ASSERT(vpmu->xenpmu_data);
+ memcpy(&vpmu->xenpmu_data->pmu.c.amd, ctxt,
+ offsetof(struct xen_pmu_amd_ctxt, regs));
+ }
+
+ vpmu->arch_vpmu_ops = &amd_vpmu_ops;
+
+ vpmu_set(vpmu, VPMU_CONTEXT_ALLOCATED);
+ return 0;
+}
+
+int __init amd_vpmu_init(void)
+{
+ unsigned int i;
+
+ switch ( current_cpu_data.x86 )
+ {
+ case 0x15:
+ num_counters = F15H_NUM_COUNTERS;
+ counters = AMD_F15H_COUNTERS;
+ ctrls = AMD_F15H_CTRLS;
+ k7_counters_mirrored = 1;
+ break;
case 0x10:
case 0x12:
case 0x14:
- case 0x15:
case 0x16:
- ret = amd_vpmu_initialise(v);
- if ( !ret )
- vpmu->arch_vpmu_ops = &amd_vpmu_ops;
- return ret;
+ num_counters = F10H_NUM_COUNTERS;
+ counters = AMD_F10H_COUNTERS;
+ ctrls = AMD_F10H_CTRLS;
+ k7_counters_mirrored = 0;
+ break;
+ default:
+ printk(XENLOG_WARNING "VPMU: Unsupported CPU family %#x\n",
+ current_cpu_data.x86);
+ return -EINVAL;
+ }
+
+ if ( sizeof(struct xen_pmu_data) +
+ 2 * sizeof(uint64_t) * num_counters > PAGE_SIZE )
+ {
+ printk(XENLOG_WARNING
+ "VPMU: Register bank does not fit into VPMU shared page\n");
+ counters = ctrls = NULL;
+ num_counters = 0;
+ return -ENOSPC;
}
- printk("VPMU: Initialization failed. "
- "AMD processor family %d has not "
- "been supported\n", family);
- return -EINVAL;
+ for ( i = 0; i < num_counters; i++ )
+ {
+ rdmsrl(ctrls[i], ctrl_rsvd[i]);
+ ctrl_rsvd[i] &= CTRL_RSVD_MASK;
+ }
+
+ regs_sz = 2 * sizeof(uint64_t) * num_counters;
+
+ return 0;
}
diff --git a/xen/arch/x86/hvm/vmx/vpmu_core2.c b/xen/arch/x86/cpu/vpmu_intel.c
similarity index 52%
rename from xen/arch/x86/hvm/vmx/vpmu_core2.c
rename to xen/arch/x86/cpu/vpmu_intel.c
index 590c2a9..12f80ae 100644
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c
+++ b/xen/arch/x86/cpu/vpmu_intel.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Author: Haitao Shan <haitao.shan at intel.com>
*/
@@ -27,16 +26,17 @@
#include <asm/regs.h>
#include <asm/types.h>
#include <asm/apic.h>
+#include <asm/traps.h>
#include <asm/msr.h>
#include <asm/msr-index.h>
+#include <asm/vpmu.h>
#include <asm/hvm/support.h>
#include <asm/hvm/vlapic.h>
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
#include <public/sched.h>
#include <public/hvm/save.h>
-#include <asm/hvm/vpmu.h>
-#include <asm/hvm/vmx/vpmu_core2.h>
+#include <public/pmu.h>
/*
* See Intel SDM Vol 2a Instruction Set Reference chapter 3 for CPUID
@@ -68,6 +68,34 @@
#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_A_PERFCTR0))
static bool_t __read_mostly full_width_write;
+/* Intel-specific VPMU features */
+#define VPMU_CPU_HAS_DS 0x100 /* Has Debug Store */
+#define VPMU_CPU_HAS_BTS 0x200 /* Has Branch Trace Store */
+
+/*
+ * MSR_CORE_PERF_FIXED_CTR_CTRL contains the configuration of all fixed
+ * counters. 4 bits for every counter.
+ */
+#define FIXED_CTR_CTRL_BITS 4
+#define FIXED_CTR_CTRL_MASK ((1 << FIXED_CTR_CTRL_BITS) - 1)
+
+#define ARCH_CNTR_ENABLED (1ULL << 22)
+
+/* Number of general-purpose and fixed performance counters */
+static unsigned int __read_mostly arch_pmc_cnt, fixed_pmc_cnt;
+
+/* Masks used for testing whether an MSR is valid */
+#define ARCH_CTRL_MASK (~((1ull << 32) - 1) | (1ull << 21))
+static uint64_t __read_mostly fixed_ctrl_mask, fixed_counters_mask;
+static uint64_t __read_mostly global_ovf_ctrl_mask;
+
+/* Total size of PMU registers block (copied to/from PV(H) guest) */
+static unsigned int __read_mostly regs_sz;
+/* Offset into context of the beginning of PMU register block */
+static const unsigned int regs_off =
+ sizeof(((struct xen_pmu_intel_ctxt *)0)->fixed_counters) +
+ sizeof(((struct xen_pmu_intel_ctxt *)0)->arch_counters);
+
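
The fixed_counters and arch_counters fields hold byte offsets into the shared
context structure, which the code below resolves through vpmu_reg_pointer().
A sketch of that accessor's assumed shape (the real definition lives in
asm/vpmu.h):

    /* Assumed shape of the offset-based accessor used throughout. */
    #define vpmu_reg_pointer(ctxt, field) \
        ((void *)((uintptr_t)(ctxt) + (uintptr_t)(ctxt)->field))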
/*
* QUIRK to workaround an issue on various family 6 cpus.
* The issue leads to endless PMC interrupt loops on the processor.
@@ -88,11 +116,8 @@ static void check_pmc_quirk(void)
is_pmc_quirk = 0;
}
-static int core2_get_pmc_count(void);
static void handle_pmc_quirk(u64 msr_content)
{
- int num_gen_pmc = core2_get_pmc_count();
- int num_fix_pmc = 3;
int i;
u64 val;
@@ -100,7 +125,7 @@ static void handle_pmc_quirk(u64 msr_content)
return;
val = msr_content;
- for ( i = 0; i < num_gen_pmc; i++ )
+ for ( i = 0; i < arch_pmc_cnt; i++ )
{
if ( val & 0x1 )
{
@@ -112,7 +137,7 @@ static void handle_pmc_quirk(u64 msr_content)
val >>= 1;
}
val = msr_content >> 32;
- for ( i = 0; i < num_fix_pmc; i++ )
+ for ( i = 0; i < fixed_pmc_cnt; i++ )
{
if ( val & 0x1 )
{
@@ -125,128 +150,89 @@ static void handle_pmc_quirk(u64 msr_content)
}
}
-static const u32 core2_fix_counters_msr[] = {
- MSR_CORE_PERF_FIXED_CTR0,
- MSR_CORE_PERF_FIXED_CTR1,
- MSR_CORE_PERF_FIXED_CTR2
-};
-
-/*
- * MSR_CORE_PERF_FIXED_CTR_CTRL contains the configuration of all fixed
- * counters. 4 bits for every counter.
- */
-#define FIXED_CTR_CTRL_BITS 4
-#define FIXED_CTR_CTRL_MASK ((1 << FIXED_CTR_CTRL_BITS) - 1)
-
-/* The index into the core2_ctrls_msr[] of this MSR used in core2_vpmu_dump() */
-#define MSR_CORE_PERF_FIXED_CTR_CTRL_IDX 0
-
-/* Core 2 Non-architectual Performance Control MSRs. */
-static const u32 core2_ctrls_msr[] = {
- MSR_CORE_PERF_FIXED_CTR_CTRL,
- MSR_IA32_PEBS_ENABLE,
- MSR_IA32_DS_AREA
-};
-
-struct pmumsr {
- unsigned int num;
- const u32 *msr;
-};
-
-static const struct pmumsr core2_fix_counters = {
- VPMU_CORE2_NUM_FIXED,
- core2_fix_counters_msr
-};
-
-static const struct pmumsr core2_ctrls = {
- VPMU_CORE2_NUM_CTRLS,
- core2_ctrls_msr
-};
-static int arch_pmc_cnt;
-
/*
* Read the number of general counters via CPUID.EAX[0xa].EAX[8..15]
*/
-static int core2_get_pmc_count(void)
+static int core2_get_arch_pmc_count(void)
{
- u32 eax, ebx, ecx, edx;
+ u32 eax;
- if ( arch_pmc_cnt == 0 )
- {
- cpuid(0xa, &eax, &ebx, &ecx, &edx);
- arch_pmc_cnt = (eax & PMU_GENERAL_NR_MASK) >> PMU_GENERAL_NR_SHIFT;
- }
-
- return arch_pmc_cnt;
+ eax = cpuid_eax(0xa);
+ return MASK_EXTR(eax, PMU_GENERAL_NR_MASK);
}
-static u64 core2_calc_intial_glb_ctrl_msr(void)
+/*
+ * Read the number of fixed counters via CPUID.EDX[0xa].EDX[0..4]
+ */
+static int core2_get_fixed_pmc_count(void)
{
- int arch_pmc_bits = (1 << core2_get_pmc_count()) - 1;
- u64 fix_pmc_bits = (1 << 3) - 1;
- return ((fix_pmc_bits << 32) | arch_pmc_bits);
+ u32 eax;
+
+ eax = cpuid_eax(0xa);
+ return MASK_EXTR(eax, PMU_FIXED_NR_MASK);
}
/* edx bits 5-12: Bit width of fixed-function performance counters */
static int core2_get_bitwidth_fix_count(void)
{
- u32 eax, ebx, ecx, edx;
+ u32 edx;
- cpuid(0xa, &eax, &ebx, &ecx, &edx);
- return ((edx & PMU_FIXED_WIDTH_MASK) >> PMU_FIXED_WIDTH_SHIFT);
+ edx = cpuid_edx(0xa);
+ return MASK_EXTR(edx, PMU_FIXED_WIDTH_MASK);
}
static int is_core2_vpmu_msr(u32 msr_index, int *type, int *index)
{
- int i;
u32 msr_index_pmc;
- for ( i = 0; i < core2_fix_counters.num; i++ )
+ switch ( msr_index )
{
- if ( core2_fix_counters.msr[i] == msr_index )
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ case MSR_IA32_DS_AREA:
+ case MSR_IA32_PEBS_ENABLE:
+ *type = MSR_TYPE_CTRL;
+ return 1;
+
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ *type = MSR_TYPE_GLOBAL;
+ return 1;
+
+ default:
+
+ if ( (msr_index >= MSR_CORE_PERF_FIXED_CTR0) &&
+ (msr_index < MSR_CORE_PERF_FIXED_CTR0 + fixed_pmc_cnt) )
{
+ *index = msr_index - MSR_CORE_PERF_FIXED_CTR0;
*type = MSR_TYPE_COUNTER;
- *index = i;
return 1;
}
- }
- for ( i = 0; i < core2_ctrls.num; i++ )
- {
- if ( core2_ctrls.msr[i] == msr_index )
+ if ( (msr_index >= MSR_P6_EVNTSEL(0)) &&
+ (msr_index < MSR_P6_EVNTSEL(arch_pmc_cnt)) )
{
- *type = MSR_TYPE_CTRL;
- *index = i;
+ *index = msr_index - MSR_P6_EVNTSEL(0);
+ *type = MSR_TYPE_ARCH_CTRL;
return 1;
}
- }
-
- if ( (msr_index == MSR_CORE_PERF_GLOBAL_CTRL) ||
- (msr_index == MSR_CORE_PERF_GLOBAL_STATUS) ||
- (msr_index == MSR_CORE_PERF_GLOBAL_OVF_CTRL) )
- {
- *type = MSR_TYPE_GLOBAL;
- return 1;
- }
-
- msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK;
- if ( (msr_index_pmc >= MSR_IA32_PERFCTR0) &&
- (msr_index_pmc < (MSR_IA32_PERFCTR0 + core2_get_pmc_count())) )
- {
- *type = MSR_TYPE_ARCH_COUNTER;
- *index = msr_index_pmc - MSR_IA32_PERFCTR0;
- return 1;
- }
- if ( (msr_index >= MSR_P6_EVNTSEL(0)) &&
- (msr_index < (MSR_P6_EVNTSEL(core2_get_pmc_count()))) )
- {
- *type = MSR_TYPE_ARCH_CTRL;
- *index = msr_index - MSR_P6_EVNTSEL(0);
- return 1;
+ msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK;
+ if ( (msr_index_pmc >= MSR_IA32_PERFCTR0) &&
+ (msr_index_pmc < (MSR_IA32_PERFCTR0 + arch_pmc_cnt)) )
+ {
+ *type = MSR_TYPE_ARCH_COUNTER;
+ *index = msr_index_pmc - MSR_IA32_PERFCTR0;
+ return 1;
+ }
+ return 0;
}
+}
- return 0;
+static inline int msraddr_to_bitpos(int x)
+{
+ ASSERT(x == (x & 0x1fff));
+ return x;
}
static void core2_vpmu_set_msr_bitmap(unsigned long *msr_bitmap)
@@ -254,13 +240,13 @@ static void core2_vpmu_set_msr_bitmap(unsigned long *msr_bitmap)
int i;
/* Allow Read/Write PMU Counters MSR Directly. */
- for ( i = 0; i < core2_fix_counters.num; i++ )
+ for ( i = 0; i < fixed_pmc_cnt; i++ )
{
- clear_bit(msraddr_to_bitpos(core2_fix_counters.msr[i]), msr_bitmap);
- clear_bit(msraddr_to_bitpos(core2_fix_counters.msr[i]),
+ clear_bit(msraddr_to_bitpos(MSR_CORE_PERF_FIXED_CTR0 + i), msr_bitmap);
+ clear_bit(msraddr_to_bitpos(MSR_CORE_PERF_FIXED_CTR0 + i),
msr_bitmap + 0x800/BYTES_PER_LONG);
}
- for ( i = 0; i < core2_get_pmc_count(); i++ )
+ for ( i = 0; i < arch_pmc_cnt; i++ )
{
clear_bit(msraddr_to_bitpos(MSR_IA32_PERFCTR0+i), msr_bitmap);
clear_bit(msraddr_to_bitpos(MSR_IA32_PERFCTR0+i),
@@ -275,26 +261,28 @@ static void core2_vpmu_set_msr_bitmap(unsigned long *msr_bitmap)
}
/* Allow Read PMU Non-global Controls Directly. */
- for ( i = 0; i < core2_ctrls.num; i++ )
- clear_bit(msraddr_to_bitpos(core2_ctrls.msr[i]), msr_bitmap);
- for ( i = 0; i < core2_get_pmc_count(); i++ )
- clear_bit(msraddr_to_bitpos(MSR_P6_EVNTSEL(i)), msr_bitmap);
+ for ( i = 0; i < arch_pmc_cnt; i++ )
+ clear_bit(msraddr_to_bitpos(MSR_P6_EVNTSEL(i)), msr_bitmap);
+
+ clear_bit(msraddr_to_bitpos(MSR_CORE_PERF_FIXED_CTR_CTRL), msr_bitmap);
+ clear_bit(msraddr_to_bitpos(MSR_IA32_PEBS_ENABLE), msr_bitmap);
+ clear_bit(msraddr_to_bitpos(MSR_IA32_DS_AREA), msr_bitmap);
}
static void core2_vpmu_unset_msr_bitmap(unsigned long *msr_bitmap)
{
int i;
- for ( i = 0; i < core2_fix_counters.num; i++ )
+ for ( i = 0; i < fixed_pmc_cnt; i++ )
{
- set_bit(msraddr_to_bitpos(core2_fix_counters.msr[i]), msr_bitmap);
- set_bit(msraddr_to_bitpos(core2_fix_counters.msr[i]),
+ set_bit(msraddr_to_bitpos(MSR_CORE_PERF_FIXED_CTR0 + i), msr_bitmap);
+ set_bit(msraddr_to_bitpos(MSR_CORE_PERF_FIXED_CTR0 + i),
msr_bitmap + 0x800/BYTES_PER_LONG);
}
- for ( i = 0; i < core2_get_pmc_count(); i++ )
+ for ( i = 0; i < arch_pmc_cnt; i++ )
{
- set_bit(msraddr_to_bitpos(MSR_IA32_PERFCTR0+i), msr_bitmap);
- set_bit(msraddr_to_bitpos(MSR_IA32_PERFCTR0+i),
+ set_bit(msraddr_to_bitpos(MSR_IA32_PERFCTR0 + i), msr_bitmap);
+ set_bit(msraddr_to_bitpos(MSR_IA32_PERFCTR0 + i),
msr_bitmap + 0x800/BYTES_PER_LONG);
if ( full_width_write )
@@ -305,129 +293,236 @@ static void core2_vpmu_unset_msr_bitmap(unsigned long *msr_bitmap)
}
}
- for ( i = 0; i < core2_ctrls.num; i++ )
- set_bit(msraddr_to_bitpos(core2_ctrls.msr[i]), msr_bitmap);
- for ( i = 0; i < core2_get_pmc_count(); i++ )
+ for ( i = 0; i < arch_pmc_cnt; i++ )
set_bit(msraddr_to_bitpos(MSR_P6_EVNTSEL(i)), msr_bitmap);
+
+ set_bit(msraddr_to_bitpos(MSR_CORE_PERF_FIXED_CTR_CTRL), msr_bitmap);
+ set_bit(msraddr_to_bitpos(MSR_IA32_PEBS_ENABLE), msr_bitmap);
+ set_bit(msraddr_to_bitpos(MSR_IA32_DS_AREA), msr_bitmap);
}
static inline void __core2_vpmu_save(struct vcpu *v)
{
int i;
- struct core2_vpmu_context *core2_vpmu_cxt = vcpu_vpmu(v)->context;
-
- for ( i = 0; i < core2_fix_counters.num; i++ )
- rdmsrl(core2_fix_counters.msr[i], core2_vpmu_cxt->fix_counters[i]);
- for ( i = 0; i < core2_get_pmc_count(); i++ )
- rdmsrl(MSR_IA32_PERFCTR0+i, core2_vpmu_cxt->arch_msr_pair[i].counter);
+ struct xen_pmu_intel_ctxt *core2_vpmu_cxt = vcpu_vpmu(v)->context;
+ uint64_t *fixed_counters = vpmu_reg_pointer(core2_vpmu_cxt, fixed_counters);
+ struct xen_pmu_cntr_pair *xen_pmu_cntr_pair =
+ vpmu_reg_pointer(core2_vpmu_cxt, arch_counters);
+
+ for ( i = 0; i < fixed_pmc_cnt; i++ )
+ rdmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, fixed_counters[i]);
+ for ( i = 0; i < arch_pmc_cnt; i++ )
+ rdmsrl(MSR_IA32_PERFCTR0 + i, xen_pmu_cntr_pair[i].counter);
+
+ if ( !has_hvm_container_vcpu(v) )
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, core2_vpmu_cxt->global_status);
}
-static int core2_vpmu_save(struct vcpu *v)
+static int core2_vpmu_save(struct vcpu *v, bool_t to_guest)
{
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_SAVE) )
- return 0;
+ if ( !has_hvm_container_vcpu(v) )
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
- if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
+ if ( !vpmu_are_all_set(vpmu, VPMU_CONTEXT_SAVE | VPMU_CONTEXT_LOADED) )
return 0;
__core2_vpmu_save(v);
/* Unset PMU MSR bitmap to trap lazy load. */
- if ( !vpmu_is_set(vpmu, VPMU_RUNNING) && cpu_has_vmx_msr_bitmap )
+ if ( !vpmu_is_set(vpmu, VPMU_RUNNING) &&
+ has_hvm_container_vcpu(v) && cpu_has_vmx_msr_bitmap )
core2_vpmu_unset_msr_bitmap(v->arch.hvm_vmx.msr_bitmap);
+ if ( to_guest )
+ {
+ ASSERT(!is_hvm_vcpu(v));
+ memcpy((void *)(&vpmu->xenpmu_data->pmu.c.intel) + regs_off,
+ vpmu->context + regs_off, regs_sz);
+ }
+
return 1;
}
static inline void __core2_vpmu_load(struct vcpu *v)
{
unsigned int i, pmc_start;
- struct core2_vpmu_context *core2_vpmu_cxt = vcpu_vpmu(v)->context;
+ struct xen_pmu_intel_ctxt *core2_vpmu_cxt = vcpu_vpmu(v)->context;
+ uint64_t *fixed_counters = vpmu_reg_pointer(core2_vpmu_cxt, fixed_counters);
+ struct xen_pmu_cntr_pair *xen_pmu_cntr_pair =
+ vpmu_reg_pointer(core2_vpmu_cxt, arch_counters);
- for ( i = 0; i < core2_fix_counters.num; i++ )
- wrmsrl(core2_fix_counters.msr[i], core2_vpmu_cxt->fix_counters[i]);
+ for ( i = 0; i < fixed_pmc_cnt; i++ )
+ wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, fixed_counters[i]);
if ( full_width_write )
pmc_start = MSR_IA32_A_PERFCTR0;
else
pmc_start = MSR_IA32_PERFCTR0;
- for ( i = 0; i < core2_get_pmc_count(); i++ )
- wrmsrl(pmc_start + i, core2_vpmu_cxt->arch_msr_pair[i].counter);
+ for ( i = 0; i < arch_pmc_cnt; i++ )
+ {
+ wrmsrl(pmc_start + i, xen_pmu_cntr_pair[i].counter);
+ wrmsrl(MSR_P6_EVNTSEL(i), xen_pmu_cntr_pair[i].control);
+ }
- for ( i = 0; i < core2_ctrls.num; i++ )
- wrmsrl(core2_ctrls.msr[i], core2_vpmu_cxt->ctrls[i]);
- for ( i = 0; i < core2_get_pmc_count(); i++ )
- wrmsrl(MSR_P6_EVNTSEL(i), core2_vpmu_cxt->arch_msr_pair[i].control);
+ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, core2_vpmu_cxt->fixed_ctrl);
+ wrmsrl(MSR_IA32_DS_AREA, core2_vpmu_cxt->ds_area);
+ wrmsrl(MSR_IA32_PEBS_ENABLE, core2_vpmu_cxt->pebs_enable);
+
+ if ( !has_hvm_container_vcpu(v) )
+ {
+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, core2_vpmu_cxt->global_ovf_ctrl);
+ core2_vpmu_cxt->global_ovf_ctrl = 0;
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
+ }
+}
+
+static int core2_vpmu_verify(struct vcpu *v)
+{
+ unsigned int i;
+ struct vpmu_struct *vpmu = vcpu_vpmu(v);
+ struct xen_pmu_intel_ctxt *core2_vpmu_cxt = vcpu_vpmu(v)->context;
+ uint64_t *fixed_counters = vpmu_reg_pointer(core2_vpmu_cxt, fixed_counters);
+ struct xen_pmu_cntr_pair *xen_pmu_cntr_pair =
+ vpmu_reg_pointer(core2_vpmu_cxt, arch_counters);
+ uint64_t fixed_ctrl;
+ uint64_t *priv_context = vpmu->priv_context;
+ uint64_t enabled_cntrs = 0;
+
+ if ( core2_vpmu_cxt->global_ovf_ctrl & global_ovf_ctrl_mask )
+ return -EINVAL;
+
+ fixed_ctrl = core2_vpmu_cxt->fixed_ctrl;
+ if ( fixed_ctrl & fixed_ctrl_mask )
+ return -EINVAL;
+
+ for ( i = 0; i < fixed_pmc_cnt; i++ )
+ {
+ if ( fixed_counters[i] & fixed_counters_mask )
+ return -EINVAL;
+ if ( (fixed_ctrl >> (i * FIXED_CTR_CTRL_BITS)) & 3 )
+ enabled_cntrs |= (1ULL << i);
+ }
+ enabled_cntrs <<= 32;
+
+ for ( i = 0; i < arch_pmc_cnt; i++ )
+ {
+ uint64_t control = xen_pmu_cntr_pair[i].control;
+
+ if ( control & ARCH_CTRL_MASK )
+ return -EINVAL;
+ if ( control & ARCH_CNTR_ENABLED )
+ enabled_cntrs |= (1ULL << i);
+ }
+
+ if ( vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) &&
+ !is_canonical_address(core2_vpmu_cxt->ds_area) )
+ return -EINVAL;
+
+ if ( (core2_vpmu_cxt->global_ctrl & enabled_cntrs) ||
+ (core2_vpmu_cxt->ds_area != 0) )
+ vpmu_set(vpmu, VPMU_RUNNING);
+ else
+ vpmu_reset(vpmu, VPMU_RUNNING);
+
+ *priv_context = enabled_cntrs;
+
+ return 0;
}
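A worked example of the enabled_cntrs encoding that core2_vpmu_verify() builds, assuming the layout mirrors IA32_PERF_GLOBAL_CTRL (general-purpose counters from bit 0, fixed counters from bit 32):

    /*
     * fixed_ctrl = 0x0b: the 4-bit field for fixed counter 0 is 0xb,
     * whose low two ring-enable bits equal 3 -> enabled; the field for
     * fixed counter 1 is 0 -> disabled. With no general-purpose
     * counters enabled this gives enabled_cntrs = 1ULL << 32.
     */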
-static void core2_vpmu_load(struct vcpu *v)
+static int core2_vpmu_load(struct vcpu *v, bool_t from_guest)
{
struct vpmu_struct *vpmu = vcpu_vpmu(v);
if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
- return;
+ return 0;
+
+ if ( from_guest )
+ {
+ int ret;
+
+ ASSERT(!is_hvm_vcpu(v));
+
+ memcpy(vpmu->context + regs_off,
+ (void *)&v->arch.vpmu.xenpmu_data->pmu.c.intel + regs_off,
+ regs_sz);
+
+ ret = core2_vpmu_verify(v);
+ if ( ret )
+ {
+ /*
+             * Not strictly necessary, since we should never load the
+             * context until the guest has provided valid values. But
+             * clear it anyway, just to be safe.
+ */
+ memset(vpmu->context + regs_off, 0, regs_sz);
+ return ret;
+ }
+ }
vpmu_set(vpmu, VPMU_CONTEXT_LOADED);
__core2_vpmu_load(v);
+
+ return 0;
}
static int core2_vpmu_alloc_resource(struct vcpu *v)
{
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct core2_vpmu_context *core2_vpmu_cxt;
- struct core2_pmu_enable *pmu_enable;
+ struct xen_pmu_intel_ctxt *core2_vpmu_cxt = NULL;
+ uint64_t *p = NULL;
if ( !acquire_pmu_ownership(PMU_OWNER_HVM) )
return 0;
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
- if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
- return 0;
-
- if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
- return 0;
- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
- core2_calc_intial_glb_ctrl_msr());
+ if ( has_hvm_container_vcpu(v) )
+ {
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+ goto out_err;
- pmu_enable = xzalloc_bytes(sizeof(struct core2_pmu_enable) +
- core2_get_pmc_count() - 1);
- if ( !pmu_enable )
- goto out1;
+ if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+ goto out_err;
+ vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ }
- core2_vpmu_cxt = xzalloc_bytes(sizeof(struct core2_vpmu_context) +
- (core2_get_pmc_count()-1)*sizeof(struct arch_msr_pair));
- if ( !core2_vpmu_cxt )
- goto out2;
- core2_vpmu_cxt->pmu_enable = pmu_enable;
- vpmu->context = (void *)core2_vpmu_cxt;
+ core2_vpmu_cxt = xzalloc_bytes(sizeof(*core2_vpmu_cxt) +
+ sizeof(uint64_t) * fixed_pmc_cnt +
+ sizeof(struct xen_pmu_cntr_pair) *
+ arch_pmc_cnt);
+ p = xzalloc(uint64_t);
+ if ( !core2_vpmu_cxt || !p )
+ goto out_err;
- return 1;
- out2:
- xfree(pmu_enable);
- out1:
- gdprintk(XENLOG_WARNING, "Insufficient memory for PMU, PMU feature is "
- "unavailable on domain %d vcpu %d.\n",
- v->vcpu_id, v->domain->domain_id);
- return 0;
-}
+ core2_vpmu_cxt->fixed_counters = sizeof(*core2_vpmu_cxt);
+ core2_vpmu_cxt->arch_counters = core2_vpmu_cxt->fixed_counters +
+ sizeof(uint64_t) * fixed_pmc_cnt;
-static void core2_vpmu_save_msr_context(struct vcpu *v, int type,
- int index, u64 msr_data)
-{
- struct core2_vpmu_context *core2_vpmu_cxt = vcpu_vpmu(v)->context;
+ vpmu->context = core2_vpmu_cxt;
+ vpmu->priv_context = p;
- switch ( type )
+ if ( !is_hvm_vcpu(v) )
{
- case MSR_TYPE_CTRL:
- core2_vpmu_cxt->ctrls[index] = msr_data;
- break;
- case MSR_TYPE_ARCH_CTRL:
- core2_vpmu_cxt->arch_msr_pair[index].control = msr_data;
- break;
+ /* Copy fixed/arch register offsets to shared area */
+ ASSERT(vpmu->xenpmu_data);
+ memcpy(&vpmu->xenpmu_data->pmu.c.intel, core2_vpmu_cxt, regs_off);
}
+
+ vpmu_set(vpmu, VPMU_CONTEXT_ALLOCATED);
+
+ return 1;
+
+out_err:
+ release_pmu_ownship(PMU_OWNER_HVM);
+
+ xfree(core2_vpmu_cxt);
+ xfree(p);
+
+ printk("Failed to allocate VPMU resources for domain %u vcpu %u\n",
+           v->domain->domain_id, v->vcpu_id);
+
+ return 0;
}
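The fixed_counters and arch_counters fields initialised above are byte offsets into the variable-sized tail of struct xen_pmu_intel_ctxt; vpmu_reg_pointer() (defined elsewhere in the tree) resolves them back into pointers, presumably along the lines of:

    #define vpmu_reg_pointer(ctxt, offs) \
        ((void *)((uintptr_t)(ctxt) + (uintptr_t)(ctxt)->offs))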
static int core2_vpmu_msr_common_check(u32 msr_index, int *type, int *index)
@@ -438,17 +533,16 @@ static int core2_vpmu_msr_common_check(u32 msr_index, int *type, int *index)
return 0;
if ( unlikely(!vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED)) &&
- (vpmu->context != NULL ||
- !core2_vpmu_alloc_resource(current)) )
+ !core2_vpmu_alloc_resource(current) )
return 0;
- vpmu_set(vpmu, VPMU_CONTEXT_ALLOCATED);
    /* Do the lazy load stuff. */
if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
{
__core2_vpmu_load(current);
vpmu_set(vpmu, VPMU_CONTEXT_LOADED);
- if ( cpu_has_vmx_msr_bitmap )
+ if ( has_hvm_container_vcpu(current) &&
+ cpu_has_vmx_msr_bitmap )
core2_vpmu_set_msr_bitmap(current->arch.hvm_vmx.msr_bitmap);
}
return 1;
@@ -457,13 +551,12 @@ static int core2_vpmu_msr_common_check(u32 msr_index, int *type, int *index)
static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
uint64_t supported)
{
- u64 global_ctrl, non_global_ctrl;
- char pmu_enable = 0;
int i, tmp;
int type = -1, index = -1;
struct vcpu *v = current;
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct core2_vpmu_context *core2_vpmu_cxt = NULL;
+ struct xen_pmu_intel_ctxt *core2_vpmu_cxt;
+ uint64_t *enabled_cntrs;
if ( !core2_vpmu_msr_common_check(msr, &type, &index) )
{
@@ -478,34 +571,42 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
IA32_DEBUGCTLMSR_BTS_OFF_USR;
if ( !(msr_content & ~supported) &&
vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
- return 1;
+ return 0;
if ( (msr_content & supported) &&
!vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
printk(XENLOG_G_WARNING
"%pv: Debug Store unsupported on this CPU\n",
current);
}
- return 0;
+ return -EINVAL;
}
ASSERT(!supported);
+ if ( (type == MSR_TYPE_COUNTER) && (msr_content & fixed_counters_mask) )
+ /* Writing unsupported bits to a fixed counter */
+ return -EINVAL;
+
core2_vpmu_cxt = vpmu->context;
+ enabled_cntrs = vpmu->priv_context;
switch ( msr )
{
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- core2_vpmu_cxt->global_ovf_status &= ~msr_content;
- return 1;
+ if ( msr_content & global_ovf_ctrl_mask )
+ return -EINVAL;
+ core2_vpmu_cxt->global_status &= ~msr_content;
+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, msr_content);
+ return 0;
case MSR_CORE_PERF_GLOBAL_STATUS:
        gdprintk(XENLOG_INFO, "Cannot write read-only MSR: "
"MSR_PERF_GLOBAL_STATUS(0x38E)!\n");
- hvm_inject_hw_exception(TRAP_gp_fault, 0);
- return 1;
+ return -EINVAL;
case MSR_IA32_PEBS_ENABLE:
if ( msr_content & 1 )
gdprintk(XENLOG_WARNING, "Guest is trying to enable PEBS, "
"which is not supported.\n");
- return 1;
+ core2_vpmu_cxt->pebs_enable = msr_content;
+ return 0;
case MSR_IA32_DS_AREA:
if ( vpmu_is_set(vpmu, VPMU_CPU_HAS_DS) )
{
@@ -514,112 +615,81 @@ static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
gdprintk(XENLOG_WARNING,
"Illegal address for IA32_DS_AREA: %#" PRIx64 "x\n",
msr_content);
- hvm_inject_hw_exception(TRAP_gp_fault, 0);
- return 1;
+ return -EINVAL;
}
- core2_vpmu_cxt->pmu_enable->ds_area_enable = msr_content ? 1 : 0;
+ core2_vpmu_cxt->ds_area = msr_content;
break;
}
gdprintk(XENLOG_WARNING, "Guest setting of DTS is ignored.\n");
- return 1;
+ return 0;
case MSR_CORE_PERF_GLOBAL_CTRL:
- global_ctrl = msr_content;
- for ( i = 0; i < core2_get_pmc_count(); i++ )
- {
- rdmsrl(MSR_P6_EVNTSEL(i), non_global_ctrl);
- core2_vpmu_cxt->pmu_enable->arch_pmc_enable[i] =
- global_ctrl & (non_global_ctrl >> 22) & 1;
- global_ctrl >>= 1;
- }
-
- rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, non_global_ctrl);
- global_ctrl = msr_content >> 32;
- for ( i = 0; i < core2_fix_counters.num; i++ )
- {
- core2_vpmu_cxt->pmu_enable->fixed_ctr_enable[i] =
- (global_ctrl & 1) & ((non_global_ctrl & 0x3)? 1: 0);
- non_global_ctrl >>= FIXED_CTR_CTRL_BITS;
- global_ctrl >>= 1;
- }
+ core2_vpmu_cxt->global_ctrl = msr_content;
break;
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- non_global_ctrl = msr_content;
- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
- global_ctrl >>= 32;
- for ( i = 0; i < core2_fix_counters.num; i++ )
+ if ( msr_content & fixed_ctrl_mask )
+ return -EINVAL;
+
+ if ( has_hvm_container_vcpu(v) )
+ vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
+ &core2_vpmu_cxt->global_ctrl);
+ else
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
+ *enabled_cntrs &= ~(((1ULL << fixed_pmc_cnt) - 1) << 32);
+ if ( msr_content != 0 )
{
- core2_vpmu_cxt->pmu_enable->fixed_ctr_enable[i] =
- (global_ctrl & 1) & ((non_global_ctrl & 0x3)? 1: 0);
- non_global_ctrl >>= 4;
- global_ctrl >>= 1;
+ u64 val = msr_content;
+ for ( i = 0; i < fixed_pmc_cnt; i++ )
+ {
+ if ( val & 3 )
+ *enabled_cntrs |= (1ULL << 32) << i;
+ val >>= FIXED_CTR_CTRL_BITS;
+ }
}
+
+ core2_vpmu_cxt->fixed_ctrl = msr_content;
break;
default:
tmp = msr - MSR_P6_EVNTSEL(0);
- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
- if ( tmp >= 0 && tmp < core2_get_pmc_count() )
- core2_vpmu_cxt->pmu_enable->arch_pmc_enable[tmp] =
- (global_ctrl >> tmp) & (msr_content >> 22) & 1;
- }
+ if ( tmp >= 0 && tmp < arch_pmc_cnt )
+ {
+ struct xen_pmu_cntr_pair *xen_pmu_cntr_pair =
+ vpmu_reg_pointer(core2_vpmu_cxt, arch_counters);
- for ( i = 0; i < core2_fix_counters.num; i++ )
- pmu_enable |= core2_vpmu_cxt->pmu_enable->fixed_ctr_enable[i];
- for ( i = 0; i < core2_get_pmc_count(); i++ )
- pmu_enable |= core2_vpmu_cxt->pmu_enable->arch_pmc_enable[i];
- pmu_enable |= core2_vpmu_cxt->pmu_enable->ds_area_enable;
- if ( pmu_enable )
- vpmu_set(vpmu, VPMU_RUNNING);
- else
- vpmu_reset(vpmu, VPMU_RUNNING);
+ if ( msr_content & ARCH_CTRL_MASK )
+ return -EINVAL;
- /* Setup LVTPC in local apic */
- if ( vpmu_is_set(vpmu, VPMU_RUNNING) &&
- is_vlapic_lvtpc_enabled(vcpu_vlapic(v)) )
- {
- apic_write_around(APIC_LVTPC, PMU_APIC_VECTOR);
- vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR;
- }
- else
- {
- apic_write_around(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED);
- vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR | APIC_LVT_MASKED;
+ if ( has_hvm_container_vcpu(v) )
+ vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL,
+ &core2_vpmu_cxt->global_ctrl);
+ else
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, core2_vpmu_cxt->global_ctrl);
+
+ if ( msr_content & ARCH_CNTR_ENABLED )
+ *enabled_cntrs |= 1ULL << tmp;
+ else
+ *enabled_cntrs &= ~(1ULL << tmp);
+
+ xen_pmu_cntr_pair[tmp].control = msr_content;
+ }
}
- core2_vpmu_save_msr_context(v, type, index, msr_content);
if ( type != MSR_TYPE_GLOBAL )
+ wrmsrl(msr, msr_content);
+ else
{
- u64 mask;
- int inject_gp = 0;
- switch ( type )
- {
- case MSR_TYPE_ARCH_CTRL: /* MSR_P6_EVNTSEL[0,...] */
- mask = ~((1ull << 32) - 1);
- if (msr_content & mask)
- inject_gp = 1;
- break;
- case MSR_TYPE_CTRL: /* IA32_FIXED_CTR_CTRL */
- if ( msr == MSR_IA32_DS_AREA )
- break;
- /* 4 bits per counter, currently 3 fixed counters implemented. */
- mask = ~((1ull << (VPMU_CORE2_NUM_FIXED * FIXED_CTR_CTRL_BITS)) - 1);
- if (msr_content & mask)
- inject_gp = 1;
- break;
- case MSR_TYPE_COUNTER: /* IA32_FIXED_CTR[0-2] */
- mask = ~((1ull << core2_get_bitwidth_fix_count()) - 1);
- if (msr_content & mask)
- inject_gp = 1;
- break;
- }
- if (inject_gp)
- hvm_inject_hw_exception(TRAP_gp_fault, 0);
+ if ( has_hvm_container_vcpu(v) )
+ vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
else
- wrmsrl(msr, msr_content);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
}
+
+ if ( (core2_vpmu_cxt->global_ctrl & *enabled_cntrs) ||
+ (core2_vpmu_cxt->ds_area != 0) )
+ vpmu_set(vpmu, VPMU_RUNNING);
else
- vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ vpmu_reset(vpmu, VPMU_RUNNING);
- return 1;
+ return 0;
}
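Note the return convention these handlers are switched to: 0 on success, -errno on failure, with #GP injection left to the caller instead of being done inline. Assuming that convention (the caller is not visible in this hunk), a call site would reduce to roughly:

    if ( vpmu_do_wrmsr(msr, msr_content, 0) )
        hvm_inject_hw_exception(TRAP_gp_fault, 0);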
static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
@@ -627,7 +697,7 @@ static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
int type = -1, index = -1;
struct vcpu *v = current;
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct core2_vpmu_context *core2_vpmu_cxt = NULL;
+ struct xen_pmu_intel_ctxt *core2_vpmu_cxt;
if ( core2_vpmu_msr_common_check(msr, &type, &index) )
{
@@ -638,28 +708,26 @@ static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
*msr_content = 0;
break;
case MSR_CORE_PERF_GLOBAL_STATUS:
- *msr_content = core2_vpmu_cxt->global_ovf_status;
+ *msr_content = core2_vpmu_cxt->global_status;
break;
case MSR_CORE_PERF_GLOBAL_CTRL:
- vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ if ( has_hvm_container_vcpu(v) )
+ vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+ else
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, *msr_content);
break;
default:
rdmsrl(msr, *msr_content);
}
}
- else
+ else if ( msr == MSR_IA32_MISC_ENABLE )
{
/* Extension for BTS */
- if ( msr == MSR_IA32_MISC_ENABLE )
- {
- if ( vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
- *msr_content &= ~MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
- }
- else
- return 0;
+ if ( vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) )
+ *msr_content &= ~MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
}
- return 1;
+ return 0;
}
static void core2_vpmu_do_cpuid(unsigned int input,
@@ -686,11 +754,13 @@ static void core2_vpmu_do_cpuid(unsigned int input,
static void core2_vpmu_dump(const struct vcpu *v)
{
const struct vpmu_struct *vpmu = vcpu_vpmu(v);
- int i, num;
- const struct core2_vpmu_context *core2_vpmu_cxt = NULL;
+ unsigned int i;
+ const struct xen_pmu_intel_ctxt *core2_vpmu_cxt = vpmu->context;
u64 val;
+ uint64_t *fixed_counters;
+ struct xen_pmu_cntr_pair *cntr_pair;
- if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) )
+ if ( !core2_vpmu_cxt || !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) )
return;
if ( !vpmu_is_set(vpmu, VPMU_RUNNING) )
@@ -703,28 +773,25 @@ static void core2_vpmu_dump(const struct vcpu *v)
}
printk(" vPMU running\n");
- core2_vpmu_cxt = vpmu->context;
- num = core2_get_pmc_count();
+
+ cntr_pair = vpmu_reg_pointer(core2_vpmu_cxt, arch_counters);
+ fixed_counters = vpmu_reg_pointer(core2_vpmu_cxt, fixed_counters);
+
/* Print the contents of the counter and its configuration msr. */
- for ( i = 0; i < num; i++ )
- {
- const struct arch_msr_pair *msr_pair = core2_vpmu_cxt->arch_msr_pair;
+ for ( i = 0; i < arch_pmc_cnt; i++ )
+ printk(" general_%d: 0x%016lx ctrl: 0x%016lx\n",
+ i, cntr_pair[i].counter, cntr_pair[i].control);
- if ( core2_vpmu_cxt->pmu_enable->arch_pmc_enable[i] )
- printk(" general_%d: 0x%016lx ctrl: 0x%016lx\n",
- i, msr_pair[i].counter, msr_pair[i].control);
- }
/*
* The configuration of the fixed counter is 4 bits each in the
* MSR_CORE_PERF_FIXED_CTR_CTRL.
*/
- val = core2_vpmu_cxt->ctrls[MSR_CORE_PERF_FIXED_CTR_CTRL_IDX];
- for ( i = 0; i < core2_fix_counters.num; i++ )
+ val = core2_vpmu_cxt->fixed_ctrl;
+ for ( i = 0; i < fixed_pmc_cnt; i++ )
{
- if ( core2_vpmu_cxt->pmu_enable->fixed_ctr_enable[i] )
- printk(" fixed_%d: 0x%016lx ctrl: %#lx\n",
- i, core2_vpmu_cxt->fix_counters[i],
- val & FIXED_CTR_CTRL_MASK);
+ printk(" fixed_%d: 0x%016lx ctrl: %#lx\n",
+ i, fixed_counters[i],
+ val & FIXED_CTR_CTRL_MASK);
val >>= FIXED_CTR_CTRL_BITS;
}
}
@@ -734,15 +801,15 @@ static int core2_vpmu_do_interrupt(struct cpu_user_regs *regs)
struct vcpu *v = current;
u64 msr_content;
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct core2_vpmu_context *core2_vpmu_cxt = vpmu->context;
+ struct xen_pmu_intel_ctxt *core2_vpmu_cxt = vpmu->context;
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, msr_content);
if ( msr_content )
{
if ( is_pmc_quirk )
handle_pmc_quirk(msr_content);
- core2_vpmu_cxt->global_ovf_status |= msr_content;
- msr_content = 0xC000000700000000 | ((1 << core2_get_pmc_count()) - 1);
+ core2_vpmu_cxt->global_status |= msr_content;
+ msr_content = 0xC000000700000000 | ((1 << arch_pmc_cnt) - 1);
wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, msr_content);
}
else
@@ -753,77 +820,21 @@ static int core2_vpmu_do_interrupt(struct cpu_user_regs *regs)
return 0;
}
- /* HW sets the MASK bit when performance counter interrupt occurs*/
- vpmu->hw_lapic_lvtpc = apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED;
- apic_write_around(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
-
return 1;
}
-static int core2_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags)
-{
- struct vpmu_struct *vpmu = vcpu_vpmu(v);
- u64 msr_content;
- static bool_t ds_warned;
-
- if ( !(vpmu_flags & VPMU_BOOT_BTS) )
- goto func_out;
- /* Check the 'Debug Store' feature in the CPUID.EAX[1]:EDX[21] */
- while ( boot_cpu_has(X86_FEATURE_DS) )
- {
- if ( !boot_cpu_has(X86_FEATURE_DTES64) )
- {
- if ( !ds_warned )
- printk(XENLOG_G_WARNING "CPU doesn't support 64-bit DS Area"
- " - Debug Store disabled for guests\n");
- break;
- }
- vpmu_set(vpmu, VPMU_CPU_HAS_DS);
- rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
- if ( msr_content & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL )
- {
- /* If BTS_UNAVAIL is set reset the DS feature. */
- vpmu_reset(vpmu, VPMU_CPU_HAS_DS);
- if ( !ds_warned )
- printk(XENLOG_G_WARNING "CPU has set BTS_UNAVAIL"
- " - Debug Store disabled for guests\n");
- break;
- }
-
- vpmu_set(vpmu, VPMU_CPU_HAS_BTS);
- if ( !ds_warned )
- {
- if ( !boot_cpu_has(X86_FEATURE_DSCPL) )
- printk(XENLOG_G_INFO
- "vpmu: CPU doesn't support CPL-Qualified BTS\n");
- printk("******************************************************\n");
- printk("** WARNING: Emulation of BTS Feature is switched on **\n");
- printk("** Using this processor feature in a virtualized **\n");
- printk("** environment is not 100%% safe. **\n");
- printk("** Setting the DS buffer address with wrong values **\n");
- printk("** may lead to hypervisor hangs or crashes. **\n");
- printk("** It is NOT recommended for production use! **\n");
- printk("******************************************************\n");
- }
- break;
- }
- ds_warned = 1;
- func_out:
- check_pmc_quirk();
- return 0;
-}
-
static void core2_vpmu_destroy(struct vcpu *v)
{
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- struct core2_vpmu_context *core2_vpmu_cxt = vpmu->context;
- xfree(core2_vpmu_cxt->pmu_enable);
xfree(vpmu->context);
- if ( cpu_has_vmx_msr_bitmap )
+ vpmu->context = NULL;
+ xfree(vpmu->priv_context);
+ vpmu->priv_context = NULL;
+ if ( has_hvm_container_vcpu(v) && cpu_has_vmx_msr_bitmap )
core2_vpmu_unset_msr_bitmap(v->arch.hvm_vmx.msr_bitmap);
release_pmu_ownship(PMU_OWNER_HVM);
- vpmu_reset(vpmu, VPMU_CONTEXT_ALLOCATED);
+ vpmu_clear(vpmu);
}
struct arch_vpmu_ops core2_vpmu_ops = {
@@ -863,9 +874,9 @@ static int core2_no_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
{
int type = -1, index = -1;
if ( !is_core2_vpmu_msr(msr, &type, &index) )
- return 0;
+ return -EINVAL;
*msr_content = 0;
- return 1;
+ return 0;
}
/*
@@ -876,26 +887,84 @@ struct arch_vpmu_ops core2_no_vpmu_ops = {
.do_cpuid = core2_no_vpmu_do_cpuid,
};
-int vmx_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags)
+int vmx_vpmu_initialise(struct vcpu *v)
{
struct vpmu_struct *vpmu = vcpu_vpmu(v);
- uint8_t family = current_cpu_data.x86;
- uint8_t cpu_model = current_cpu_data.x86_model;
- int ret = 0;
+ u64 msr_content;
+ static bool_t ds_warned;
vpmu->arch_vpmu_ops = &core2_no_vpmu_ops;
- if ( !vpmu_flags )
+ if ( vpmu_mode == XENPMU_MODE_OFF )
return 0;
- if ( family == 6 )
- {
- u64 caps;
+ if ( (arch_pmc_cnt + fixed_pmc_cnt) == 0 )
+ return -EINVAL;
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, caps);
- full_width_write = (caps >> 13) & 1;
+ if ( !(vpmu_features & XENPMU_FEATURE_INTEL_BTS) )
+ goto func_out;
+ /* Check the 'Debug Store' feature in the CPUID.EAX[1]:EDX[21] */
+ while ( boot_cpu_has(X86_FEATURE_DS) )
+ {
+ if ( !boot_cpu_has(X86_FEATURE_DTES64) )
+ {
+ if ( !ds_warned )
+ printk(XENLOG_G_WARNING "CPU doesn't support 64-bit DS Area"
+ " - Debug Store disabled for guests\n");
+ break;
+ }
+ vpmu_set(vpmu, VPMU_CPU_HAS_DS);
+ rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
+ if ( msr_content & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL )
+ {
+ /* If BTS_UNAVAIL is set reset the DS feature. */
+ vpmu_reset(vpmu, VPMU_CPU_HAS_DS);
+ if ( !ds_warned )
+ printk(XENLOG_G_WARNING "CPU has set BTS_UNAVAIL"
+ " - Debug Store disabled for guests\n");
+ break;
+ }
- switch ( cpu_model )
+ vpmu_set(vpmu, VPMU_CPU_HAS_BTS);
+ if ( !ds_warned )
{
+ if ( !boot_cpu_has(X86_FEATURE_DSCPL) )
+ printk(XENLOG_G_INFO
+ "vpmu: CPU doesn't support CPL-Qualified BTS\n");
+ printk("******************************************************\n");
+ printk("** WARNING: Emulation of BTS Feature is switched on **\n");
+ printk("** Using this processor feature in a virtualized **\n");
+ printk("** environment is not 100%% safe. **\n");
+ printk("** Setting the DS buffer address with wrong values **\n");
+ printk("** may lead to hypervisor hangs or crashes. **\n");
+ printk("** It is NOT recommended for production use! **\n");
+ printk("******************************************************\n");
+ }
+ break;
+ }
+ ds_warned = 1;
+ func_out:
+
+ /* PV domains can allocate resources immediately */
+ if ( is_pv_vcpu(v) && !core2_vpmu_alloc_resource(v) )
+ return -EIO;
+
+ vpmu->arch_vpmu_ops = &core2_vpmu_ops;
+
+ return 0;
+}
+
+int __init core2_vpmu_init(void)
+{
+ u64 caps;
+
+ if ( current_cpu_data.x86 != 6 )
+ {
+ printk(XENLOG_WARNING "VPMU: only family 6 is supported\n");
+ return -EINVAL;
+ }
+
+ switch ( current_cpu_data.x86_model )
+ {
/* Core2: */
case 0x0f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
case 0x16: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
@@ -913,7 +982,7 @@ int vmx_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags)
/* Westmere: */
case 0x25: /* 32 nm nehalem, "Clarkdale", "Arrandale" */
case 0x2c: /* 32 nm nehalem, "Gulftown", "Westmere-EP" */
- case 0x27: /* 32 nm Westmere-EX */
+ case 0x2f: /* 32 nm Westmere-EX */
case 0x3a: /* IvyBridge */
case 0x3e: /* IvyBridge EP */
@@ -924,19 +993,50 @@ int vmx_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags)
case 0x45:
case 0x46:
- /* future: */
+ /* Broadwell */
case 0x3d:
+ case 0x4f:
+ case 0x56:
+
+ /* future: */
case 0x4e:
- ret = core2_vpmu_initialise(v, vpmu_flags);
- if ( !ret )
- vpmu->arch_vpmu_ops = &core2_vpmu_ops;
- return ret;
- }
+
+ /* next gen Xeon Phi */
+ case 0x57:
+ break;
+
+ default:
+ printk(XENLOG_WARNING "VPMU: Unsupported CPU model %#x\n",
+ current_cpu_data.x86_model);
+ return -EINVAL;
}
- printk("VPMU: Initialization failed. "
- "Intel processor family %d model %d has not "
- "been supported\n", family, cpu_model);
- return -EINVAL;
+ arch_pmc_cnt = core2_get_arch_pmc_count();
+ fixed_pmc_cnt = core2_get_fixed_pmc_count();
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, caps);
+ full_width_write = (caps >> 13) & 1;
+
+ fixed_ctrl_mask = ~((1ull << (fixed_pmc_cnt * FIXED_CTR_CTRL_BITS)) - 1);
+ fixed_counters_mask = ~((1ull << core2_get_bitwidth_fix_count()) - 1);
+ global_ovf_ctrl_mask = ~(0xC000000000000000 |
+ (((1ULL << fixed_pmc_cnt) - 1) << 32) |
+ ((1ULL << arch_pmc_cnt) - 1));
+
+ regs_sz = (sizeof(struct xen_pmu_intel_ctxt) - regs_off) +
+ sizeof(uint64_t) * fixed_pmc_cnt +
+ sizeof(struct xen_pmu_cntr_pair) * arch_pmc_cnt;
+
+ check_pmc_quirk();
+
+ if ( sizeof(struct xen_pmu_data) + sizeof(uint64_t) * fixed_pmc_cnt +
+ sizeof(struct xen_pmu_cntr_pair) * arch_pmc_cnt > PAGE_SIZE )
+ {
+ printk(XENLOG_WARNING
+               "VPMU: Register bank does not fit into VPMU shared page\n");
+ arch_pmc_cnt = fixed_pmc_cnt = 0;
+ return -ENOSPC;
+ }
+
+ return 0;
}
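A worked instance of the masks computed in core2_vpmu_init(), assuming a part with fixed_pmc_cnt = 3 and arch_pmc_cnt = 4:

    /* fixed_ctrl_mask      = ~((1ull << 12) - 1) = 0xfffffffffffff000    */
    /* global_ovf_ctrl_mask = ~(0xC000000000000000 | (0x7ULL << 32) | 0xf)
     *                      = 0x3ffffff8fffffff0                          */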
diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
index eb7be9c..888a214 100644
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -140,13 +140,10 @@ static void nmi_shootdown_cpus(void)
* Ideally would be:
* exception_table[TRAP_nmi] = &do_nmi_crash;
*
- * but the exception_table is read only. Borrow an unused fixmap entry
- * to construct a writable mapping.
+ * but the exception_table is read only. Access it via its directmap
+ * mappings.
*/
- set_fixmap(FIX_TBOOT_MAP_ADDRESS, __pa(&exception_table[TRAP_nmi]));
- write_atomic((unsigned long *)
- (fix_to_virt(FIX_TBOOT_MAP_ADDRESS) +
- ((unsigned long)&exception_table[TRAP_nmi] & ~PAGE_MASK)),
+ write_atomic((unsigned long *)__va(__pa(&exception_table[TRAP_nmi])),
(unsigned long)&do_nmi_crash);
/* Ensure the new callback function is set before sending out the NMI. */
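The crash.c hunk replaces the borrowed-fixmap trick with the directmap alias of the read-only exception_table. A minimal sketch of the idiom, assuming the usual Xen __pa()/__va() helpers:

    static void patch_ro_ulong(unsigned long *ro, unsigned long val)
    {
        /* The directmap maps the same frame with write permission. */
        write_atomic((unsigned long *)__va(__pa(ro)), val);
    }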
diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c
index 435bd40..58cae22 100644
--- a/xen/arch/x86/debug.c
+++ b/xen/arch/x86/debug.c
@@ -11,9 +11,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -41,6 +39,9 @@
#define DBGP2(...) ((void)0)
#endif
+typedef unsigned long dbgva_t;
+typedef unsigned char dbgbyte_t;
+
/* Returns: mfn for the given (hvm guest) vaddr */
static unsigned long
dbg_hvm_va2mfn(dbgva_t vaddr, struct domain *dp, int toaddr,
@@ -105,7 +106,7 @@ dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
if ( pgd3val == 0 )
{
- l4t = map_domain_page(mfn);
+ l4t = map_domain_page(_mfn(mfn));
l4e = l4t[l4_table_offset(vaddr)];
unmap_domain_page(l4t);
mfn = l4e_get_pfn(l4e);
@@ -117,7 +118,7 @@ dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
return INVALID_MFN;
}
- l3t = map_domain_page(mfn);
+ l3t = map_domain_page(_mfn(mfn));
l3e = l3t[l3_table_offset(vaddr)];
unmap_domain_page(l3t);
mfn = l3e_get_pfn(l3e);
@@ -131,7 +132,7 @@ dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
}
}
- l2t = map_domain_page(mfn);
+ l2t = map_domain_page(_mfn(mfn));
l2e = l2t[l2_table_offset(vaddr)];
unmap_domain_page(l2t);
mfn = l2e_get_pfn(l2e);
@@ -143,7 +144,7 @@ dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
DBGP1("l2 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3);
return INVALID_MFN;
}
- l1t = map_domain_page(mfn);
+ l1t = map_domain_page(_mfn(mfn));
l1e = l1t[l1_table_offset(vaddr)];
unmap_domain_page(l1t);
mfn = l1e_get_pfn(l1e);
@@ -154,13 +155,14 @@ dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val)
}
/* Returns: number of bytes remaining to be copied */
-static int
-dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t *buf, int len, struct domain *dp,
- int toaddr, uint64_t pgd3)
+unsigned int dbg_rw_guest_mem(struct domain *dp, void * __user gaddr,
+ void * __user buf, unsigned int len,
+ bool_t toaddr, uint64_t pgd3)
{
while ( len > 0 )
{
char *va;
+ unsigned long addr = (unsigned long)gaddr;
unsigned long mfn, gfn = INVALID_GFN, pagecnt;
pagecnt = min_t(long, PAGE_SIZE - (addr & ~PAGE_MASK), len);
@@ -171,17 +173,17 @@ dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t *buf, int len, struct domain *dp,
if ( mfn == INVALID_MFN )
break;
- va = map_domain_page(mfn);
+ va = map_domain_page(_mfn(mfn));
va = va + (addr & (PAGE_SIZE-1));
if ( toaddr )
{
- memcpy(va, buf, pagecnt); /* va = buf */
+ copy_from_user(va, buf, pagecnt); /* va = buf */
paging_mark_dirty(dp, mfn);
}
else
{
- memcpy(buf, va, pagecnt); /* buf = va */
+ copy_to_user(buf, va, pagecnt); /* buf = va */
}
unmap_domain_page(va);
@@ -203,27 +205,30 @@ dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t *buf, int len, struct domain *dp,
* pgd3: value of init_mm.pgd[3] in guest. see above.
* Returns: number of bytes remaining to be copied.
*/
-int
-dbg_rw_mem(dbgva_t addr, dbgbyte_t *buf, int len, domid_t domid, int toaddr,
- uint64_t pgd3)
+unsigned int dbg_rw_mem(void * __user addr, void * __user buf,
+ unsigned int len, domid_t domid, bool_t toaddr,
+ uint64_t pgd3)
{
- struct domain *dp = get_domain_by_id(domid);
- int hyp = (domid == DOMID_IDLE);
+    DBGP2("gmem:addr:%p buf:%p len:$%u domid:%d toaddr:%x\n",
+ addr, buf, len, domid, toaddr);
- DBGP2("gmem:addr:%lx buf:%p len:$%d domid:%x toaddr:%x dp:%p\n",
- addr, buf, len, domid, toaddr, dp);
- if ( hyp )
+ if ( domid == DOMID_IDLE )
{
if ( toaddr )
- len = __copy_to_user((void *)addr, buf, len);
+ len = __copy_to_user(addr, buf, len);
else
- len = __copy_from_user(buf, (void *)addr, len);
+ len = __copy_from_user(buf, addr, len);
}
- else if ( dp )
+ else
{
- if ( !dp->is_dying ) /* make sure guest is still there */
- len= dbg_rw_guest_mem(addr, buf, len, dp, toaddr, pgd3);
- put_domain(dp);
+ struct domain *d = get_domain_by_id(domid);
+
+ if ( d )
+ {
+ if ( !d->is_dying )
+ len = dbg_rw_guest_mem(d, addr, buf, len, toaddr, pgd3);
+ put_domain(d);
+ }
}
DBGP2("gmem:exit:len:$%d\n", len);
diff --git a/xen/arch/x86/delay.c b/xen/arch/x86/delay.c
index bc1772e..ef6bc5d 100644
--- a/xen/arch/x86/delay.c
+++ b/xen/arch/x86/delay.c
@@ -21,10 +21,10 @@ void __udelay(unsigned long usecs)
unsigned long ticks = usecs * (cpu_khz / 1000);
unsigned long s, e;
- rdtscl(s);
+ s = rdtsc();
do
{
rep_nop();
- rdtscl(e);
+ e = rdtsc();
} while ((e-s) < ticks);
}
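The delay.c change is mechanical (rdtscl() becomes rdtsc()), but the tick arithmetic merits a worked example: with cpu_khz = 2400000, i.e. a 2.4 GHz TSC, cpu_khz / 1000 gives 2400 ticks per microsecond, so __udelay(100) spins for

    unsigned long ticks = 100 * (2400000 / 1000);    /* = 240000 TSC ticks */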
diff --git a/xen/arch/x86/dmi_scan.c b/xen/arch/x86/dmi_scan.c
index 500133a..8e07f8d 100644
--- a/xen/arch/x86/dmi_scan.c
+++ b/xen/arch/x86/dmi_scan.c
@@ -38,6 +38,18 @@ struct __packed smbios_eps {
struct dmi_eps dmi;
};
+struct __packed smbios3_eps {
+ char anchor[5]; /* "_SM3_" */
+ u8 checksum;
+ u8 length;
+ u8 major, minor;
+ u8 docrev;
+ u8 revision;
+ u8 _rsrvd_;
+ u32 max_size;
+ u64 address;
+};
+
struct dmi_header
{
u8 type;
@@ -45,6 +57,53 @@ struct dmi_header
u16 handle;
};
+enum dmi_entry_type {
+ DMI_ENTRY_BIOS = 0,
+ DMI_ENTRY_SYSTEM,
+ DMI_ENTRY_BASEBOARD,
+ DMI_ENTRY_CHASSIS,
+ DMI_ENTRY_PROCESSOR,
+ DMI_ENTRY_MEM_CONTROLLER,
+ DMI_ENTRY_MEM_MODULE,
+ DMI_ENTRY_CACHE,
+ DMI_ENTRY_PORT_CONNECTOR,
+ DMI_ENTRY_SYSTEM_SLOT,
+ DMI_ENTRY_ONBOARD_DEVICE,
+ DMI_ENTRY_OEMSTRINGS,
+ DMI_ENTRY_SYSCONF,
+ DMI_ENTRY_BIOS_LANG,
+ DMI_ENTRY_GROUP_ASSOC,
+ DMI_ENTRY_SYSTEM_EVENT_LOG,
+ DMI_ENTRY_PHYS_MEM_ARRAY,
+ DMI_ENTRY_MEM_DEVICE,
+ DMI_ENTRY_32_MEM_ERROR,
+ DMI_ENTRY_MEM_ARRAY_MAPPED_ADDR,
+ DMI_ENTRY_MEM_DEV_MAPPED_ADDR,
+ DMI_ENTRY_BUILTIN_POINTING_DEV,
+ DMI_ENTRY_PORTABLE_BATTERY,
+ DMI_ENTRY_SYSTEM_RESET,
+ DMI_ENTRY_HW_SECURITY,
+ DMI_ENTRY_SYSTEM_POWER_CONTROLS,
+ DMI_ENTRY_VOLTAGE_PROBE,
+ DMI_ENTRY_COOLING_DEV,
+ DMI_ENTRY_TEMP_PROBE,
+ DMI_ENTRY_ELECTRICAL_CURRENT_PROBE,
+ DMI_ENTRY_OOB_REMOTE_ACCESS,
+ DMI_ENTRY_BIS_ENTRY,
+ DMI_ENTRY_SYSTEM_BOOT,
+ DMI_ENTRY_MGMT_DEV,
+ DMI_ENTRY_MGMT_DEV_COMPONENT,
+ DMI_ENTRY_MGMT_DEV_THRES,
+ DMI_ENTRY_MEM_CHANNEL,
+ DMI_ENTRY_IPMI_DEV,
+ DMI_ENTRY_SYS_POWER_SUPPLY,
+ DMI_ENTRY_ADDITIONAL,
+ DMI_ENTRY_ONBOARD_DEV_EXT,
+ DMI_ENTRY_MGMT_CONTROLLER_HOST,
+ DMI_ENTRY_INACTIVE = 126,
+ DMI_ENTRY_END_OF_TABLE = 127,
+};
+
#undef DMI_DEBUG
#ifdef DMI_DEBUG
@@ -74,7 +133,8 @@ static char * __init dmi_string(struct dmi_header *dm, u8 s)
* pointing to completely the wrong place for example
*/
-static int __init dmi_table(u32 base, int len, int num, void (*decode)(struct dmi_header *))
+static int __init dmi_table(paddr_t base, u32 len, int num,
+ void (*decode)(struct dmi_header *))
{
u8 *buf;
struct dmi_header *dm;
@@ -88,11 +148,12 @@ static int __init dmi_table(u32 base, int len, int num, void (*decode)(struct dm
data = buf;
/*
- * Stop when we see all the items the table claimed to have
- * OR we run off the end of the table (also happens)
- */
-
- while(i<num && data-buf+sizeof(struct dmi_header)<=len)
+ * Stop when we have seen all the items the table claimed to have
+ * (SMBIOS < 3.0 only) OR we reach an end-of-table marker (SMBIOS
+ * >= 3.0 only) OR we run off the end of the table (should never
+     * happen, but sometimes does on bogus implementations).
+ */
+ while((num < 0 || i < num) && data-buf+sizeof(struct dmi_header)<=len)
{
dm=(struct dmi_header *)data;
/*
@@ -105,6 +166,16 @@ static int __init dmi_table(u32 base, int len, int num, void (*decode)(struct dm
data++;
if(data-buf<len-1)
decode(dm);
+ /*
+ * 7.45 End-of-Table (Type 127) [SMBIOS reference spec v3.0.0]
+ * For tables behind a 64-bit entry point, we have no item
+ * count and no exact table length, so stop on end-of-table
+ * marker. For tables behind a 32-bit entry point, we have
+ * seen OEM structures behind the end-of-table marker on
+ * some systems, so don't trust it.
+ */
+ if (num < 0 && dm->type == DMI_ENTRY_END_OF_TABLE)
+ break;
data+=2;
i++;
}
@@ -127,117 +198,232 @@ static inline bool_t __init dmi_checksum(const void __iomem *buf,
static u32 __initdata efi_dmi_address;
static u32 __initdata efi_dmi_size;
+static u32 __initdata efi_smbios_address;
+static u32 __initdata efi_smbios_size;
+static u64 __initdata efi_smbios3_address;
+static u32 __initdata efi_smbios3_size;
/*
* Important: This function gets called while still in EFI
* (pseudo-)physical mode.
*/
-void __init dmi_efi_get_table(void *smbios)
+void __init dmi_efi_get_table(const void *smbios, const void *smbios3)
{
- struct smbios_eps *eps = smbios;
-
- if (memcmp(eps->anchor, "_SM_", 4) &&
- dmi_checksum(eps, eps->length) &&
- memcmp(eps->dmi.anchor, "_DMI_", 5) == 0 &&
- dmi_checksum(&eps->dmi, sizeof(eps->dmi))) {
- efi_dmi_address = eps->dmi.address;
- efi_dmi_size = eps->dmi.size;
+ const struct smbios_eps *eps = smbios;
+ const struct smbios3_eps *eps3 = smbios3;
+
+ if (eps3 && memcmp(eps3->anchor, "_SM3_", 5) == 0 &&
+ eps3->length >= sizeof(*eps3) &&
+ dmi_checksum(eps3, eps3->length)) {
+ efi_smbios3_address = eps3->address;
+ efi_smbios3_size = eps3->max_size;
+ return;
+ }
+
+ if (eps && memcmp(eps->anchor, "_SM_", 4) == 0 &&
+ eps->length >= sizeof(*eps) &&
+ dmi_checksum(eps, eps->length)) {
+ efi_smbios_address = (u32)(long)eps;
+ efi_smbios_size = eps->length;
+
+ if (memcmp(eps->dmi.anchor, "_DMI_", 5) == 0 &&
+ dmi_checksum(&eps->dmi, sizeof(eps->dmi))) {
+ efi_dmi_address = eps->dmi.address;
+ efi_dmi_size = eps->dmi.size;
+ }
}
}
-int __init dmi_get_table(u32 *base, u32 *len)
+const char *__init dmi_get_table(paddr_t *base, u32 *len)
{
- struct dmi_eps eps;
- char __iomem *p, *q;
+ static unsigned int __initdata instance;
if (efi_enabled) {
- if (!efi_dmi_size)
- return -1;
- *base = efi_dmi_address;
- *len = efi_dmi_size;
- return 0;
- }
-
- p = maddr_to_virt(0xF0000);
- for (q = p; q < p + 0x10000; q += 16) {
- memcpy_fromio(&eps, q, 15);
- if (memcmp(eps.anchor, "_DMI_", 5) == 0 &&
- dmi_checksum(&eps, sizeof(eps))) {
- *base = eps.address;
- *len = eps.size;
- return 0;
+ if (efi_smbios3_size && !(instance & 1)) {
+ *base = efi_smbios3_address;
+ *len = efi_smbios3_size;
+ instance |= 1;
+ return "SMBIOSv3";
+ }
+ if (efi_dmi_size && !(instance & 2)) {
+ *base = efi_dmi_address;
+ *len = efi_dmi_size;
+ instance |= 2;
+ return "DMI";
+ }
+ if (efi_smbios_size && !(instance & 4)) {
+ *base = efi_smbios_address;
+ *len = efi_smbios_size;
+ instance |= 4;
+ return "SMBIOS";
+ }
+ } else {
+ char __iomem *p = maddr_to_virt(0xF0000), *q;
+ union {
+ struct dmi_eps dmi;
+ struct smbios3_eps smbios3;
+ } eps;
+
+ for (q = p; q <= p + 0x10000 - sizeof(eps.dmi); q += 16) {
+ memcpy_fromio(&eps, q, sizeof(eps.dmi));
+ if (!(instance & 1) &&
+ memcmp(eps.dmi.anchor, "_DMI_", 5) == 0 &&
+ dmi_checksum(&eps.dmi, sizeof(eps.dmi))) {
+ *base = eps.dmi.address;
+ *len = eps.dmi.size;
+ instance |= 1;
+ return "DMI";
+ }
+
+ BUILD_BUG_ON(sizeof(eps.smbios3) <= sizeof(eps.dmi));
+ if ((instance & 2) ||
+ q > p + 0x10000 - sizeof(eps.smbios3))
+ continue;
+ memcpy_fromio(&eps.dmi + 1, q + sizeof(eps.dmi),
+ sizeof(eps.smbios3) - sizeof(eps.dmi));
+ if (!memcmp(eps.smbios3.anchor, "_SM3_", 5) &&
+ eps.smbios3.length >= sizeof(eps.smbios3) &&
+ q <= p + 0x10000 - eps.smbios3.length &&
+ dmi_checksum(q, eps.smbios3.length)) {
+ *base = eps.smbios3.address;
+ *len = eps.smbios3.max_size;
+ instance |= 2;
+ return "SMBIOSv3";
+ }
}
}
- return -1;
+ return NULL;
}
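A usage sketch for the reworked dmi_get_table() (hypothetical caller, not part of the patch): it now enumerates, handing each discovered table back once, tracked in 'instance', and returning NULL when exhausted:

    paddr_t base;
    u32 len;
    const char *kind;

    while ( (kind = dmi_get_table(&base, &len)) != NULL )
        printk("%s table at %"PRIpaddr", %u bytes max\n", kind, base, len);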
+typedef union {
+ const struct smbios_eps __iomem *legacy;
+ const struct smbios3_eps __iomem *v3;
+} smbios_eps_u __attribute__((transparent_union));
+
static int __init _dmi_iterate(const struct dmi_eps *dmi,
- const struct smbios_eps __iomem *smbios,
+ const smbios_eps_u smbios,
void (*decode)(struct dmi_header *))
{
- u16 num = dmi->num_structures;
- u16 len = dmi->size;
- u32 base = dmi->address;
+ int num;
+ u32 len;
+ paddr_t base;
+
+ if (!dmi) {
+ num = -1;
+ len = smbios.v3->max_size;
+ base = smbios.v3->address;
+ printk(KERN_INFO "SMBIOS %d.%d present.\n",
+ smbios.v3->major, smbios.v3->minor);
+ dmi_printk((KERN_INFO "SMBIOS v3 table at 0x%"PRIpaddr".\n", base));
+ } else {
+ num = dmi->num_structures;
+ len = dmi->size;
+ base = dmi->address;
- /*
- * DMI version 0.0 means that the real version is taken from
- * the SMBIOS version, which we may not know at this point.
- */
- if (dmi->revision)
- printk(KERN_INFO "DMI %d.%d present.\n",
- dmi->revision >> 4, dmi->revision & 0x0f);
- else if (!smbios)
- printk(KERN_INFO "DMI present.\n");
- dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n",
- num, len));
- dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", base));
+ /*
+ * DMI version 0.0 means that the real version is taken from
+ * the SMBIOS version, which we may not know at this point.
+ */
+ if (dmi->revision)
+ printk(KERN_INFO "DMI %d.%d present.\n",
+ dmi->revision >> 4, dmi->revision & 0x0f);
+ else if (!smbios.legacy)
+ printk(KERN_INFO "DMI present.\n");
+ dmi_printk((KERN_INFO "%d structures occupying %u bytes.\n",
+ num, len));
+ dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", (u32)base));
+ }
return dmi_table(base, len, num, decode);
}
static int __init dmi_iterate(void (*decode)(struct dmi_header *))
{
- struct dmi_eps eps;
+ struct dmi_eps dmi;
+ struct smbios3_eps smbios3;
char __iomem *p, *q;
+ dmi.size = 0;
+ smbios3.length = 0;
+
p = maddr_to_virt(0xF0000);
for (q = p; q < p + 0x10000; q += 16) {
- memcpy_fromio(&eps, q, sizeof(eps));
- if (memcmp(eps.anchor, "_DMI_", 5) == 0 &&
- dmi_checksum(&eps, sizeof(eps)))
- return _dmi_iterate(&eps, NULL, decode);
+ if (!dmi.size) {
+ memcpy_fromio(&dmi, q, sizeof(dmi));
+ if (memcmp(dmi.anchor, "_DMI_", 5) ||
+ !dmi_checksum(&dmi, sizeof(dmi)))
+ dmi.size = 0;
+ }
+ if (!smbios3.length &&
+ q <= p + 0x10000 - sizeof(smbios3)) {
+ memcpy_fromio(&smbios3, q, sizeof(smbios3));
+ if (memcmp(smbios3.anchor, "_SM3_", 5) ||
+ smbios3.length < sizeof(smbios3) ||
+                    q > p + 0x10000 - smbios3.length ||
+ !dmi_checksum(q, smbios3.length))
+ smbios3.length = 0;
+ }
}
+
+ if (smbios3.length)
+ return _dmi_iterate(NULL, &smbios3, decode);
+ if (dmi.size)
+ return _dmi_iterate(&dmi, NULL, decode);
return -1;
}
static int __init dmi_efi_iterate(void (*decode)(struct dmi_header *))
{
- struct smbios_eps eps;
- const struct smbios_eps __iomem *p;
int ret = -1;
- if (efi.smbios == EFI_INVALID_TABLE_ADDR)
- return -1;
+ while (efi.smbios3 != EFI_INVALID_TABLE_ADDR) {
+ struct smbios3_eps eps;
+ const struct smbios3_eps __iomem *p;
- p = bt_ioremap(efi.smbios, sizeof(eps));
- if (!p)
- return -1;
- memcpy_fromio(&eps, p, sizeof(eps));
- bt_iounmap(p, sizeof(eps));
+ p = bt_ioremap(efi.smbios3, sizeof(eps));
+ if (!p)
+ break;
+ memcpy_fromio(&eps, p, sizeof(eps));
+ bt_iounmap(p, sizeof(eps));
- if (memcmp(eps.anchor, "_SM_", 4))
- return -1;
+ if (memcmp(eps.anchor, "_SM3_", 5) ||
+ eps.length < sizeof(eps))
+ break;
- p = bt_ioremap(efi.smbios, eps.length);
- if (!p)
- return -1;
- if (dmi_checksum(p, eps.length) &&
- memcmp(eps.dmi.anchor, "_DMI_", 5) == 0 &&
- dmi_checksum(&eps.dmi, sizeof(eps.dmi))) {
- printk(KERN_INFO "SMBIOS %d.%d present.\n",
- eps.major, eps.minor);
- ret = _dmi_iterate(&eps.dmi, p, decode);
+ p = bt_ioremap(efi.smbios3, eps.length);
+ if (!p)
+ break;
+ if (dmi_checksum(p, eps.length))
+ ret = _dmi_iterate(NULL, p, decode);
+ bt_iounmap(p, eps.length);
+ break;
+ }
+
+ if (ret != 0 && efi.smbios != EFI_INVALID_TABLE_ADDR) {
+ struct smbios_eps eps;
+ const struct smbios_eps __iomem *p;
+
+ p = bt_ioremap(efi.smbios, sizeof(eps));
+ if (!p)
+ return -1;
+ memcpy_fromio(&eps, p, sizeof(eps));
+ bt_iounmap(p, sizeof(eps));
+
+ if (memcmp(eps.anchor, "_SM_", 4) ||
+ eps.length < sizeof(eps))
+ return -1;
+
+ p = bt_ioremap(efi.smbios, eps.length);
+ if (!p)
+ return -1;
+ if (dmi_checksum(p, eps.length) &&
+ memcmp(eps.dmi.anchor, "_DMI_", 5) == 0 &&
+ dmi_checksum(&eps.dmi, sizeof(eps.dmi))) {
+ printk(KERN_INFO "SMBIOS %d.%d present.\n",
+ eps.major, eps.minor);
+ ret = _dmi_iterate(&eps.dmi, p, decode);
+ }
+ bt_iounmap(p, eps.length);
}
- bt_iounmap(p, eps.length);
return ret;
}
@@ -476,7 +662,7 @@ static void __init dmi_decode(struct dmi_header *dm)
switch(dm->type)
{
- case 0:
+ case DMI_ENTRY_BIOS:
dmi_printk(("BIOS Vendor: %s\n",
dmi_string(dm, data[4])));
dmi_save_ident(dm, DMI_BIOS_VENDOR, 4);
@@ -487,7 +673,7 @@ static void __init dmi_decode(struct dmi_header *dm)
dmi_string(dm, data[8])));
dmi_save_ident(dm, DMI_BIOS_DATE, 8);
break;
- case 1:
+ case DMI_ENTRY_SYSTEM:
dmi_printk(("System Vendor: %s\n",
dmi_string(dm, data[4])));
dmi_save_ident(dm, DMI_SYS_VENDOR, 4);
@@ -500,7 +686,7 @@ static void __init dmi_decode(struct dmi_header *dm)
dmi_printk(("Serial Number: %s\n",
dmi_string(dm, data[7])));
break;
- case 2:
+ case DMI_ENTRY_BASEBOARD:
dmi_printk(("Board Vendor: %s\n",
dmi_string(dm, data[4])));
dmi_save_ident(dm, DMI_BOARD_VENDOR, 4);
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index f1fc993..a877bab 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -204,6 +204,7 @@ smap_check_policy_t smap_policy_change(struct vcpu *v,
return old_policy;
}
+#ifndef CONFIG_BIGMEM
/*
* The hole may be at or above the 44-bit boundary, so we need to determine
* the total bit count until reaching 32 significant (not squashed out) bits
@@ -225,10 +226,14 @@ static unsigned int __init noinline _domain_struct_bits(void)
return bits;
}
+#endif
struct domain *alloc_domain_struct(void)
{
struct domain *d;
+#ifdef CONFIG_BIGMEM
+ const unsigned int bits = 0;
+#else
/*
* We pack the PDX of the domain structure into a 32-bit field within
* the page_info structure. Hence the MEMF_bits() restriction.
@@ -237,6 +242,7 @@ struct domain *alloc_domain_struct(void)
if ( unlikely(!bits) )
bits = _domain_struct_bits();
+#endif
BUILD_BUG_ON(sizeof(*d) > PAGE_SIZE);
d = alloc_xenheap_pages(0, MEMF_bits(bits));
@@ -285,14 +291,15 @@ struct vcpu_guest_context *alloc_vcpu_guest_context(void)
for ( i = 0; i < PFN_UP(sizeof(struct vcpu_guest_context)); ++i )
{
- struct page_info *pg = alloc_domheap_page(NULL, 0);
+ struct page_info *pg = alloc_domheap_page(current->domain,
+ MEMF_no_owner);
if ( unlikely(pg == NULL) )
{
free_vcpu_guest_context(NULL);
return NULL;
}
- __set_fixmap(idx - i, page_to_mfn(pg), __PAGE_HYPERVISOR);
+ __set_fixmap(idx - i, page_to_mfn(pg), __PAGE_HYPERVISOR_RW);
per_cpu(vgc_pages[i], cpu) = pg;
}
return (void *)fix_to_virt(idx);
@@ -310,7 +317,7 @@ void free_vcpu_guest_context(struct vcpu_guest_context *vgc)
{
if ( !per_cpu(vgc_pages[i], cpu) )
continue;
- __set_fixmap(idx - i, 0, 0);
+ clear_fixmap(idx - i);
free_domheap_page(per_cpu(vgc_pages[i], cpu));
per_cpu(vgc_pages[i], cpu) = NULL;
}
@@ -320,25 +327,17 @@ static int setup_compat_l4(struct vcpu *v)
{
struct page_info *pg;
l4_pgentry_t *l4tab;
- int rc;
- pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
+ pg = alloc_domheap_page(v->domain, MEMF_no_owner);
if ( pg == NULL )
return -ENOMEM;
- rc = setup_compat_arg_xlat(v);
- if ( rc )
- {
- free_domheap_page(pg);
- return rc;
- }
-
/* This page needs to look like a pagetable so that it can be shadowed */
pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
l4tab = __map_domain_page(pg);
clear_page(l4tab);
- init_guest_l4_table(l4tab, v->domain);
+ init_guest_l4_table(l4tab, v->domain, 1);
unmap_domain_page(l4tab);
v->arch.guest_table = pagetable_from_page(pg);
@@ -349,7 +348,6 @@ static int setup_compat_l4(struct vcpu *v)
static void release_compat_l4(struct vcpu *v)
{
- free_compat_arg_xlat(v);
free_domheap_page(pagetable_get_page(v->arch.guest_table));
v->arch.guest_table = pagetable_null();
v->arch.guest_table_user = pagetable_null();
@@ -362,21 +360,19 @@ static inline int may_switch_mode(struct domain *d)
int switch_native(struct domain *d)
{
- unsigned int vcpuid;
+ struct vcpu *v;
- if ( d == NULL )
- return -EINVAL;
if ( !may_switch_mode(d) )
return -EACCES;
- if ( !is_pv_32on64_domain(d) )
+ if ( !is_pv_32bit_domain(d) )
return 0;
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
- for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
+ for_each_vcpu( d, v )
{
- if (d->vcpu[vcpuid])
- release_compat_l4(d->vcpu[vcpuid]);
+ free_compat_arg_xlat(v);
+ release_compat_l4(v);
}
return 0;
@@ -384,10 +380,8 @@ int switch_native(struct domain *d)
int switch_compat(struct domain *d)
{
- unsigned int vcpuid;
-
- if ( d == NULL )
- return -EINVAL;
+ struct vcpu *v;
+ int rc;
if ( is_pvh_domain(d) )
{
@@ -398,15 +392,17 @@ int switch_compat(struct domain *d)
if ( !may_switch_mode(d) )
return -EACCES;
- if ( is_pv_32on64_domain(d) )
+ if ( is_pv_32bit_domain(d) )
return 0;
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
- for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ )
+ for_each_vcpu( d, v )
{
- if ( (d->vcpu[vcpuid] != NULL) &&
- (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
+ rc = setup_compat_arg_xlat(v);
+ if ( !rc )
+ rc = setup_compat_l4(v);
+ if ( rc )
goto undo_and_fail;
}
@@ -416,12 +412,15 @@ int switch_compat(struct domain *d)
undo_and_fail:
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
- while ( vcpuid-- != 0 )
+ for_each_vcpu( d, v )
{
- if ( d->vcpu[vcpuid] != NULL )
- release_compat_l4(d->vcpu[vcpuid]);
+ free_compat_arg_xlat(v);
+
+ if ( !pagetable_is_null(v->arch.guest_table) )
+ release_compat_l4(v);
}
- return -ENOMEM;
+
+ return rc;
}
int vcpu_initialise(struct vcpu *v)
@@ -432,18 +431,23 @@ int vcpu_initialise(struct vcpu *v)
v->arch.flags = TF_kernel_mode;
/* By default, do not emulate */
- v->arch.mem_event.emulate_flags = 0;
+ v->arch.vm_event.emulate_flags = 0;
rc = mapcache_vcpu_init(v);
if ( rc )
return rc;
- paging_vcpu_init(v);
+ if ( !is_idle_domain(d) )
+ {
+ paging_vcpu_init(v);
- if ( (rc = vcpu_init_fpu(v)) != 0 )
- return rc;
+ if ( (rc = vcpu_init_fpu(v)) != 0 )
+ return rc;
- vmce_init_vcpu(v);
+ vmce_init_vcpu(v);
+ }
+
+ spin_lock_init(&v->arch.vpmu.vpmu_lock);
if ( has_hvm_container_domain(d) )
{
@@ -488,7 +492,17 @@ int vcpu_initialise(struct vcpu *v)
v->arch.pv_vcpu.ctrlreg[4] = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
- rc = is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0;
+ if ( is_pv_32bit_domain(d) )
+ {
+ if ( (rc = setup_compat_arg_xlat(v)) )
+ goto done;
+
+ if ( (rc = setup_compat_l4(v)) )
+ {
+ free_compat_arg_xlat(v);
+ goto done;
+ }
+ }
done:
if ( rc )
{
@@ -503,8 +517,14 @@ int vcpu_initialise(struct vcpu *v)
void vcpu_destroy(struct vcpu *v)
{
- if ( is_pv_32on64_vcpu(v) )
+ xfree(v->arch.vm_event.emul_read_data);
+ v->arch.vm_event.emul_read_data = NULL;
+
+ if ( is_pv_32bit_vcpu(v) )
+ {
+ free_compat_arg_xlat(v);
release_compat_l4(v);
+ }
vcpu_destroy_fpu(v);
@@ -514,17 +534,12 @@ void vcpu_destroy(struct vcpu *v)
xfree(v->arch.pv_vcpu.trap_ctxt);
}
-int arch_domain_create(struct domain *d, unsigned int domcr_flags)
+int arch_domain_create(struct domain *d, unsigned int domcr_flags,
+ struct xen_arch_domainconfig *config)
{
int i, paging_initialised = 0;
int rc = -ENOMEM;
- d->arch.hvm_domain.hap_enabled =
- has_hvm_container_domain(d) &&
- hvm_funcs.hap_supported &&
- (domcr_flags & DOMCRF_hap);
- d->arch.hvm_domain.mem_sharing_enabled = 0;
-
d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity);
INIT_LIST_HEAD(&d->arch.pdev_list);
@@ -547,7 +562,12 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
}
if ( has_hvm_container_domain(d) )
+ {
+ d->arch.hvm_domain.hap_enabled =
+ hvm_funcs.hap_supported && (domcr_flags & DOMCRF_hap);
+
rc = create_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0, NULL, NULL);
+ }
else if ( is_idle_domain(d) )
rc = 0;
else
@@ -570,12 +590,12 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
HYPERVISOR_COMPAT_VIRT_START(d) =
is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u;
- if ( (rc = paging_domain_init(d, domcr_flags)) != 0 )
- goto fail;
- paging_initialised = 1;
-
if ( !is_idle_domain(d) )
{
+ if ( (rc = paging_domain_init(d, domcr_flags)) != 0 )
+ goto fail;
+ paging_initialised = 1;
+
d->arch.cpuids = xmalloc_array(cpuid_input_t, MAX_CPUID_INPUT);
rc = -ENOMEM;
if ( d->arch.cpuids == NULL )
@@ -586,6 +606,10 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
}
+ d->arch.x86_vendor = boot_cpu_data.x86_vendor;
+ d->arch.x86 = boot_cpu_data.x86;
+ d->arch.x86_model = boot_cpu_data.x86_model;
+
d->arch.ioport_caps =
rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
rc = -ENOMEM;
@@ -623,6 +647,9 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
/* 64-bit PV guest by default. */
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
+ if ( (rc = psr_domain_init(d)) != 0 )
+ goto fail;
+
/* initialize default tsc behavior in case tools don't */
tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
spin_lock_init(&d->arch.vtsc_lock);
@@ -641,11 +668,15 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
free_perdomain_mappings(d);
if ( is_pv_domain(d) )
free_xenheap_page(d->arch.pv_domain.gdt_ldt_l1tab);
+ psr_domain_free(d);
return rc;
}
void arch_domain_destroy(struct domain *d)
{
+ vfree(d->arch.event_write_data);
+ d->arch.event_write_data = NULL;
+
if ( has_hvm_container_domain(d) )
hvm_domain_destroy(d);
@@ -664,7 +695,7 @@ void arch_domain_destroy(struct domain *d)
free_xenheap_page(d->shared_info);
cleanup_domain_irq_mapping(d);
- psr_free_rmid(d);
+ psr_domain_free(d);
}
void arch_domain_shutdown(struct domain *d)
@@ -685,24 +716,47 @@ void arch_domain_unpause(struct domain *d)
viridian_time_ref_count_thaw(d);
}
-unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4)
+/*
+ * These are the masks of CR4 bits (subject to hardware availability) which a
+ * PV guest may not legitimiately attempt to modify.
+ * PV guest may not legitimately attempt to modify.
+static unsigned long __read_mostly pv_cr4_mask, compat_pv_cr4_mask;
+
+static int __init init_pv_cr4_masks(void)
{
- unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
+ unsigned long common_mask = ~X86_CR4_TSD;
- hv_cr4_mask = ~X86_CR4_TSD;
+ /*
+ * All PV guests may attempt to modify TSD, DE and OSXSAVE.
+ */
if ( cpu_has_de )
- hv_cr4_mask &= ~X86_CR4_DE;
- if ( cpu_has_fsgsbase && !is_pv_32bit_domain(v->domain) )
- hv_cr4_mask &= ~X86_CR4_FSGSBASE;
+ common_mask &= ~X86_CR4_DE;
if ( cpu_has_xsave )
- hv_cr4_mask &= ~X86_CR4_OSXSAVE;
+ common_mask &= ~X86_CR4_OSXSAVE;
+
+ pv_cr4_mask = compat_pv_cr4_mask = common_mask;
+
+ /*
+ * 64bit PV guests may attempt to modify FSGSBASE.
+ */
+ if ( cpu_has_fsgsbase )
+ pv_cr4_mask &= ~X86_CR4_FSGSBASE;
+
+ return 0;
+}
+__initcall(init_pv_cr4_masks);
+
+unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4)
+{
+ unsigned long hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
+ unsigned long mask = is_pv_32bit_vcpu(v) ? compat_pv_cr4_mask : pv_cr4_mask;
- if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
+ if ( (guest_cr4 & mask) != (hv_cr4 & mask) )
printk(XENLOG_G_WARNING
"d%d attempted to change %pv's CR4 flags %08lx -> %08lx\n",
current->domain->domain_id, v, hv_cr4, guest_cr4);
- return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask);
+ return (hv_cr4 & mask) | (guest_cr4 & ~mask);
}
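A worked case for pv_guest_cr4_fixup(): a 32-bit PV guest attempting to set CR4.FSGSBASE. FSGSBASE is cleared only from the 64-bit pv_cr4_mask, so for a compat guest the bit remains in 'mask' and the expression

    (hv_cr4 & mask) | (guest_cr4 & ~mask)

keeps the hypervisor's FSGSBASE value and discards the guest's attempt, with the mismatch logged by the printk above.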
#define xen_vcpu_guest_context vcpu_guest_context
@@ -724,12 +778,12 @@ int arch_set_info_guest(
/* The context is a compat-mode one if the target domain is compat-mode;
* we expect the tools to DTRT even in compat-mode callers. */
- compat = is_pv_32on64_domain(d);
+ compat = is_pv_32bit_domain(d);
#define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
flags = c(flags);
- if ( is_pv_vcpu(v) )
+ if ( is_pv_domain(d) )
{
if ( !compat )
{
@@ -770,25 +824,23 @@ int arch_set_info_guest(
(c(ldt_ents) > 8192) )
return -EINVAL;
}
- else if ( is_pvh_vcpu(v) )
+ else if ( is_pvh_domain(d) )
{
- /* PVH 32bitfixme */
- ASSERT(!compat);
-
if ( c(ctrlreg[0]) || c(ctrlreg[1]) || c(ctrlreg[2]) ||
c(ctrlreg[4]) || c(ctrlreg[5]) || c(ctrlreg[6]) ||
c(ctrlreg[7]) || c(ldt_base) || c(ldt_ents) ||
c(user_regs.cs) || c(user_regs.ss) || c(user_regs.es) ||
c(user_regs.ds) || c(user_regs.fs) || c(user_regs.gs) ||
- c(kernel_ss) || c(kernel_sp) || c.nat->gs_base_kernel ||
- c.nat->gdt_ents || c.nat->fs_base || c.nat->gs_base_user )
+ c(kernel_ss) || c(kernel_sp) || c(gdt_ents) ||
+ (!compat && (c.nat->gs_base_kernel ||
+ c.nat->fs_base || c.nat->gs_base_user)) )
return -EINVAL;
}
v->fpu_initialised = !!(flags & VGCF_I387_VALID);
v->arch.flags &= ~TF_kernel_mode;
- if ( (flags & VGCF_in_kernel) || has_hvm_container_vcpu(v)/*???*/ )
+ if ( (flags & VGCF_in_kernel) || has_hvm_container_domain(d)/*???*/ )
v->arch.flags |= TF_kernel_mode;
v->arch.vgc_flags = flags;
@@ -803,26 +855,29 @@ int arch_set_info_guest(
if ( !compat )
{
memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs));
- if ( is_pv_vcpu(v) )
+ if ( is_pv_domain(d) )
memcpy(v->arch.pv_vcpu.trap_ctxt, c.nat->trap_ctxt,
sizeof(c.nat->trap_ctxt));
}
else
{
XLAT_cpu_user_regs(&v->arch.user_regs, &c.cmp->user_regs);
- for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i )
- XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i,
- c.cmp->trap_ctxt + i);
+ if ( is_pv_domain(d) )
+ {
+ for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i )
+ XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i,
+ c.cmp->trap_ctxt + i);
+ }
}
- if ( has_hvm_container_vcpu(v) )
+ if ( has_hvm_container_domain(d) )
{
for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
v->arch.debugreg[i] = c(debugreg[i]);
hvm_set_info_guest(v);
- if ( is_hvm_vcpu(v) || v->is_initialised )
+ if ( is_hvm_domain(d) || v->is_initialised )
goto out;
/* NB: No need to use PV cr3 un-pickling macros */
@@ -830,7 +885,7 @@ int arch_set_info_guest(
cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
v->arch.cr3 = page_to_maddr(cr3_page);
- v->arch.hvm_vcpu.guest_cr[3] = c.nat->ctrlreg[3];
+ v->arch.hvm_vcpu.guest_cr[3] = c(ctrlreg[3]);
v->arch.guest_table = pagetable_from_page(cr3_page);
ASSERT(paging_mode_enabled(d));
@@ -871,7 +926,7 @@ int arch_set_info_guest(
fail |= xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[1];
}
} else {
- l4_pgentry_t *l4tab = map_domain_page(pfn);
+ l4_pgentry_t *l4tab = map_domain_page(_mfn(pfn));
pfn = l4e_get_pfn(*l4tab);
unmap_domain_page(l4tab);
@@ -908,7 +963,6 @@ int arch_set_info_guest(
v->arch.pv_vcpu.event_callback_cs = c(event_callback_cs);
v->arch.pv_vcpu.failsafe_callback_cs = c(failsafe_callback_cs);
}
- v->arch.pv_vcpu.vm_assist = c(vm_assist);
/* Only CR0.TS is modifiable by guest or admin. */
v->arch.pv_vcpu.ctrlreg[0] &= X86_CR0_TS;
@@ -978,7 +1032,11 @@ int arch_set_info_guest(
case -EINTR:
rc = -ERESTART;
case -ERESTART:
+ break;
case 0:
+ if ( !compat && !VM_ASSIST(d, m2p_strict) &&
+ !paging_mode_refcounts(d) )
+ fill_ro_mpt(cr3_gfn);
break;
default:
if ( cr3_page == current->arch.old_guest_table )
@@ -1013,7 +1071,10 @@ int arch_set_info_guest(
default:
if ( cr3_page == current->arch.old_guest_table )
cr3_page = NULL;
+ break;
case 0:
+ if ( VM_ASSIST(d, m2p_strict) )
+ zap_ro_mpt(cr3_gfn);
break;
}
}
@@ -1025,7 +1086,7 @@ int arch_set_info_guest(
{
l4_pgentry_t *l4tab;
- l4tab = map_domain_page(pagetable_get_pfn(v->arch.guest_table));
+ l4tab = map_domain_page(_mfn(pagetable_get_pfn(v->arch.guest_table)));
*l4tab = l4e_from_pfn(page_to_mfn(cr3_page),
_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
unmap_domain_page(l4tab);
@@ -1192,7 +1253,7 @@ static void load_segments(struct vcpu *n)
all_segs_okay &= loadsegment(gs, uregs->gs);
}
- if ( !is_pv_32on64_domain(n->domain) )
+ if ( !is_pv_32bit_vcpu(n) )
{
/* This can only be non-zero if selector is NULL. */
if ( n->arch.pv_vcpu.fs_base )
@@ -1221,7 +1282,7 @@ static void load_segments(struct vcpu *n)
(unsigned long *)pv->kernel_sp;
unsigned long cs_and_mask, rflags;
- if ( is_pv_32on64_domain(n->domain) )
+ if ( is_pv_32bit_vcpu(n) )
{
unsigned int *esp = ring_1(regs) ?
(unsigned int *)regs->rsp :
@@ -1252,8 +1313,8 @@ static void load_segments(struct vcpu *n)
put_user(uregs->es, esp-6) |
put_user(uregs->ds, esp-7) )
{
- gdprintk(XENLOG_ERR, "Error while creating compat "
- "failsafe callback frame.\n");
+ gprintk(XENLOG_ERR,
+ "error while creating compat failsafe callback frame\n");
domain_crash(n->domain);
}
@@ -1294,8 +1355,8 @@ static void load_segments(struct vcpu *n)
put_user(regs->r11, rsp-10) |
put_user(regs->rcx, rsp-11) )
{
- gdprintk(XENLOG_ERR, "Error while creating failsafe "
- "callback frame.\n");
+ gprintk(XENLOG_ERR,
+ "error while creating failsafe callback frame\n");
domain_crash(n->domain);
}
@@ -1337,7 +1398,7 @@ static void save_segments(struct vcpu *v)
if ( regs->es )
dirty_segment_mask |= DIRTY_ES;
- if ( regs->fs || is_pv_32on64_domain(v->domain) )
+ if ( regs->fs || is_pv_32bit_vcpu(v) )
{
dirty_segment_mask |= DIRTY_FS;
v->arch.pv_vcpu.fs_base = 0; /* != 0 selector kills fs_base */
@@ -1347,7 +1408,7 @@ static void save_segments(struct vcpu *v)
dirty_segment_mask |= DIRTY_FS_BASE;
}
- if ( regs->gs || is_pv_32on64_domain(v->domain) )
+ if ( regs->gs || is_pv_32bit_vcpu(v) )
{
dirty_segment_mask |= DIRTY_GS;
v->arch.pv_vcpu.gs_base_user = 0; /* != 0 selector kills gs_base_user */
@@ -1430,9 +1491,9 @@ static void _update_runstate_area(struct vcpu *v)
v->arch.pv_vcpu.need_update_runstate_area = 1;
}
-static inline int need_full_gdt(struct vcpu *v)
+static inline bool_t need_full_gdt(const struct domain *d)
{
- return (is_pv_vcpu(v) && !is_idle_vcpu(v));
+ return is_pv_domain(d) && !is_idle_domain(d);
}
static void __context_switch(void)
@@ -1441,18 +1502,17 @@ static void __context_switch(void)
unsigned int cpu = smp_processor_id();
struct vcpu *p = per_cpu(curr_vcpu, cpu);
struct vcpu *n = current;
+ struct domain *pd = p->domain, *nd = n->domain;
struct desc_struct *gdt;
struct desc_ptr gdt_desc;
ASSERT(p != n);
ASSERT(cpumask_empty(n->vcpu_dirty_cpumask));
- if ( !is_idle_vcpu(p) )
+ if ( !is_idle_domain(pd) )
{
memcpy(&p->arch.user_regs, stack_regs, CTXT_SWITCH_STACK_BYTES);
vcpu_save_fpu(p);
- if ( psr_cmt_enabled() )
- psr_assoc_rmid(0);
p->arch.ctxt_switch_from(p);
}
@@ -1461,11 +1521,11 @@ static void __context_switch(void)
* ctxt_switch_to(). This avoids a race on things like EPT flushing,
* which is synchronised on that function.
*/
- if ( p->domain != n->domain )
- cpumask_set_cpu(cpu, n->domain->domain_dirty_cpumask);
+ if ( pd != nd )
+ cpumask_set_cpu(cpu, nd->domain_dirty_cpumask);
cpumask_set_cpu(cpu, n->vcpu_dirty_cpumask);
- if ( !is_idle_vcpu(n) )
+ if ( !is_idle_domain(nd) )
{
memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES);
if ( cpu_has_xsave )
@@ -1477,17 +1537,16 @@ static void __context_switch(void)
}
vcpu_restore_fpu_eager(n);
n->arch.ctxt_switch_to(n);
-
- if ( psr_cmt_enabled() && n->domain->arch.psr_rmid > 0 )
- psr_assoc_rmid(n->domain->arch.psr_rmid);
}
- gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
- per_cpu(compat_gdt_table, cpu);
- if ( need_full_gdt(n) )
+ psr_ctxt_switch_to(nd);
+
+ gdt = !is_pv_32bit_domain(nd) ? per_cpu(gdt_table, cpu) :
+ per_cpu(compat_gdt_table, cpu);
+ if ( need_full_gdt(nd) )
{
unsigned long mfn = virt_to_mfn(gdt);
- l1_pgentry_t *pl1e = gdt_ldt_ptes(n->domain, n);
+ l1_pgentry_t *pl1e = gdt_ldt_ptes(nd, n);
unsigned int i;
for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
@@ -1495,8 +1554,8 @@ static void __context_switch(void)
l1e_from_pfn(mfn + i, __PAGE_HYPERVISOR));
}
- if ( need_full_gdt(p) &&
- ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
+ if ( need_full_gdt(pd) &&
+ ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(nd)) )
{
gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
@@ -1505,16 +1564,16 @@ static void __context_switch(void)
write_ptbase(n);
- if ( need_full_gdt(n) &&
- ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
+ if ( need_full_gdt(nd) &&
+ ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
{
gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
gdt_desc.base = GDT_VIRT_START(n);
asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
}
- if ( p->domain != n->domain )
- cpumask_clear_cpu(cpu, p->domain->domain_dirty_cpumask);
+ if ( pd != nd )
+ cpumask_clear_cpu(cpu, pd->domain_dirty_cpumask);
cpumask_clear_cpu(cpu, p->vcpu_dirty_cpumask);
per_cpu(curr_vcpu, cpu) = n;
@@ -1524,6 +1583,7 @@ static void __context_switch(void)
void context_switch(struct vcpu *prev, struct vcpu *next)
{
unsigned int cpu = smp_processor_id();
+ const struct domain *prevd = prev->domain, *nextd = next->domain;
cpumask_t dirty_mask;
ASSERT(local_irq_is_enabled());
@@ -1539,23 +1599,20 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
}
if ( prev != next )
- _update_runstate_area(prev);
-
- if ( is_hvm_vcpu(prev) )
{
- if (prev != next)
- vpmu_save(prev);
-
- if ( !list_empty(&prev->arch.hvm_vcpu.tm_list) )
- pt_save_timer(prev);
+ _update_runstate_area(prev);
+ vpmu_switch_from(prev);
}
+ if ( is_hvm_domain(prevd) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
+ pt_save_timer(prev);
+
local_irq_disable();
set_current(next);
if ( (per_cpu(curr_vcpu, cpu) == next) ||
- (is_idle_vcpu(next) && cpu_online(cpu)) )
+ (is_idle_domain(nextd) && cpu_online(cpu)) )
{
local_irq_enable();
}
@@ -1563,10 +1620,10 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
{
__context_switch();
- if ( is_pv_vcpu(next) &&
- (is_idle_vcpu(prev) ||
- has_hvm_container_vcpu(prev) ||
- is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
+ if ( is_pv_domain(nextd) &&
+ (is_idle_domain(prevd) ||
+ has_hvm_container_domain(prevd) ||
+ is_pv_32bit_domain(prevd) != is_pv_32bit_domain(nextd)) )
{
uint64_t efer = read_efer();
if ( !(efer & EFER_SCE) )
@@ -1576,26 +1633,27 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
/* Re-enable interrupts before restoring state which may fault. */
local_irq_enable();
- if ( is_pv_vcpu(next) )
+ if ( is_pv_domain(nextd) )
{
load_LDT(next);
load_segments(next);
}
- set_cpuid_faulting(is_pv_vcpu(next) &&
- !is_control_domain(next->domain) &&
- !is_hardware_domain(next->domain));
+ set_cpuid_faulting(is_pv_domain(nextd) &&
+ !is_control_domain(nextd) &&
+ !is_hardware_domain(nextd));
}
- if (is_hvm_vcpu(next) && (prev != next) )
- /* Must be done with interrupts enabled */
- vpmu_load(next);
-
context_saved(prev);
if ( prev != next )
+ {
_update_runstate_area(next);
+ /* Must be done with interrupts enabled */
+ vpmu_switch_to(next);
+ }
+
/* Ensure that the vcpu has an up-to-date time base. */
update_vcpu_system_time(next);
@@ -1675,7 +1733,6 @@ unsigned long hypercall_create_continuation(
unsigned int op, const char *format, ...)
{
struct mc_state *mcs = &current->mc_state;
- struct cpu_user_regs *regs;
const char *p = format;
unsigned long arg;
unsigned int i;
@@ -1689,26 +1746,23 @@ unsigned long hypercall_create_continuation(
for ( i = 0; *p != '\0'; i++ )
mcs->call.args[i] = next_arg(p, args);
- if ( is_pv_32on64_domain(current->domain) )
- {
- for ( ; i < 6; i++ )
- mcs->call.args[i] = 0;
- }
}
else
{
- regs = guest_cpu_user_regs();
- regs->eax = op;
+ struct cpu_user_regs *regs = guest_cpu_user_regs();
+ struct vcpu *curr = current;
+
+ regs->eax = op;
/* Ensure the hypercall trap instruction is re-executed. */
- if ( is_pv_vcpu(current) )
+ if ( is_pv_vcpu(curr) )
regs->eip -= 2; /* re-execute 'syscall' / 'int $xx' */
else
- current->arch.hvm_vcpu.hcall_preempted = 1;
+ curr->arch.hvm_vcpu.hcall_preempted = 1;
- if ( is_pv_vcpu(current) ?
- !is_pv_32on64_vcpu(current) :
- (hvm_guest_x86_mode(current) == 8) )
+ if ( is_pv_vcpu(curr) ?
+ !is_pv_32bit_vcpu(curr) :
+ curr->arch.hvm_vcpu.hcall_64bit )
{
for ( i = 0; *p != '\0'; i++ )
{
@@ -1726,9 +1780,6 @@ unsigned long hypercall_create_continuation(
}
else
{
- if ( supervisor_mode_kernel )
- regs->eip &= ~31; /* re-execute entire hypercall entry stub */
-
for ( i = 0; *p != '\0'; i++ )
{
arg = next_arg(p, args);
@@ -1762,9 +1813,8 @@ int hypercall_xlat_continuation(unsigned int *id, unsigned int nr,
ASSERT(nr <= ARRAY_SIZE(mcs->call.args));
ASSERT(!(mask >> nr));
-
- BUG_ON(id && *id >= nr);
- BUG_ON(id && (mask & (1U << *id)));
+ ASSERT(!id || *id < nr);
+ ASSERT(!id || !(mask & (1U << *id)));
va_start(args, mask);
@@ -1969,7 +2019,9 @@ int domain_relinquish_resources(struct domain *d)
switch ( d->arch.relmem )
{
case RELMEM_not_started:
- pci_release_devices(d);
+ ret = pci_release_devices(d);
+ if ( ret )
+ return ret;
/* Tear down paging-assistance stuff. */
ret = paging_teardown(d);
@@ -2079,8 +2131,7 @@ void arch_dump_vcpu_info(struct vcpu *v)
{
paging_dump_vcpu_info(v);
- if ( is_hvm_vcpu(v) )
- vpmu_dump(v);
+ vpmu_dump(v);
}
void domain_cpuid(
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 7993b17..bca6fe7 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -36,6 +36,7 @@
#include <asm/bzimage.h> /* for bzimage_parse */
#include <asm/io_apic.h>
#include <asm/hap.h>
+#include <asm/hpet.h>
#include <public/version.h>
@@ -77,8 +78,6 @@ static void __init parse_dom0_mem(const char *s)
dom0_max_nrpages = parse_amt(s+4, &s);
else
dom0_nrpages = parse_amt(s, &s);
- if ( *s != ',' )
- break;
} while ( *s++ == ',' );
}
custom_param("dom0_mem", parse_dom0_mem);
@@ -88,24 +87,85 @@ static unsigned int __initdata opt_dom0_max_vcpus_max = UINT_MAX;
static void __init parse_dom0_max_vcpus(const char *s)
{
- if (*s == '-') /* -M */
+ if ( *s == '-' ) /* -M */
opt_dom0_max_vcpus_max = simple_strtoul(s + 1, &s, 0);
- else /* N, N-, or N-M */
+ else /* N, N-, or N-M */
{
opt_dom0_max_vcpus_min = simple_strtoul(s, &s, 0);
- if (*s++ == '\0') /* N */
+ if ( opt_dom0_max_vcpus_min == 0 )
+ opt_dom0_max_vcpus_min = 1;
+ if ( !*s ) /* N */
opt_dom0_max_vcpus_max = opt_dom0_max_vcpus_min;
- else if (*s != '\0') /* N-M */
+ else if ( *s++ == '-' && *s ) /* N-M */
opt_dom0_max_vcpus_max = simple_strtoul(s, &s, 0);
}
}
custom_param("dom0_max_vcpus", parse_dom0_max_vcpus);
-unsigned int __init dom0_max_vcpus(void)
+static __initdata unsigned int dom0_nr_pxms;
+static __initdata unsigned int dom0_pxms[MAX_NUMNODES] =
+ { [0 ... MAX_NUMNODES - 1] = ~0 };
+static __initdata bool_t dom0_affinity_relaxed;
+
+static void __init parse_dom0_nodes(const char *s)
{
- unsigned max_vcpus;
+ do {
+ if ( isdigit(*s) )
+ dom0_pxms[dom0_nr_pxms] = simple_strtoul(s, &s, 0);
+ else if ( !strncmp(s, "relaxed", 7) && (!s[7] || s[7] == ',') )
+ {
+ dom0_affinity_relaxed = 1;
+ s += 7;
+ }
+ else if ( !strncmp(s, "strict", 6) && (!s[6] || s[6] == ',') )
+ {
+ dom0_affinity_relaxed = 0;
+ s += 6;
+ }
+ else
+ break;
+ } while ( ++dom0_nr_pxms < ARRAY_SIZE(dom0_pxms) && *s++ == ',' );
+}
+custom_param("dom0_nodes", parse_dom0_nodes);
+
+static cpumask_t __initdata dom0_cpus;
+
+static struct vcpu *__init setup_dom0_vcpu(struct domain *d,
+ unsigned int vcpu_id,
+ unsigned int cpu)
+{
+ struct vcpu *v = alloc_vcpu(d, vcpu_id, cpu);
+
+ if ( v )
+ {
+ if ( !d->is_pinned && !dom0_affinity_relaxed )
+ cpumask_copy(v->cpu_hard_affinity, &dom0_cpus);
+ cpumask_copy(v->cpu_soft_affinity, &dom0_cpus);
+ }
- max_vcpus = num_cpupool_cpus(cpupool0);
+ return v;
+}
+
+static nodemask_t __initdata dom0_nodes;
+
+unsigned int __init dom0_max_vcpus(void)
+{
+ unsigned int i, max_vcpus;
+ nodeid_t node;
+
+ for ( i = 0; i < dom0_nr_pxms; ++i )
+ if ( (node = pxm_to_node(dom0_pxms[i])) != NUMA_NO_NODE )
+ node_set(node, dom0_nodes);
+ nodes_and(dom0_nodes, dom0_nodes, node_online_map);
+ if ( nodes_empty(dom0_nodes) )
+ dom0_nodes = node_online_map;
+ for_each_node_mask ( node, dom0_nodes )
+ cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node));
+ cpumask_and(&dom0_cpus, &dom0_cpus, cpupool0->cpu_valid);
+ if ( cpumask_empty(&dom0_cpus) )
+ cpumask_copy(&dom0_cpus, cpupool0->cpu_valid);
+
+ max_vcpus = cpumask_weight(&dom0_cpus);
if ( opt_dom0_max_vcpus_min > max_vcpus )
max_vcpus = opt_dom0_max_vcpus_min;
if ( opt_dom0_max_vcpus_max < max_vcpus )
@@ -120,20 +180,30 @@ struct vcpu *__init alloc_dom0_vcpu0(struct domain *dom0)
{
unsigned int max_vcpus = dom0_max_vcpus();
+ dom0->node_affinity = dom0_nodes;
+ dom0->auto_node_affinity = !dom0_nr_pxms;
+
dom0->vcpu = xzalloc_array(struct vcpu *, max_vcpus);
if ( !dom0->vcpu )
return NULL;
dom0->max_vcpus = max_vcpus;
- return alloc_vcpu(dom0, 0, 0);
+ return setup_dom0_vcpu(dom0, 0, cpumask_first(&dom0_cpus));
}
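Taken together with parse_dom0_nodes() above, dom0 vcpu sizing and placement
become NUMA-aware. A condensed restatement of the flow, as far as it can be
read from these hunks:

    /*
     * dom0_nodes = online nodes named via dom0_nodes= (else all online);
     * dom0_cpus  = the CPUs of those nodes, intersected with cpupool0;
     * max vcpus  = weight(dom0_cpus), clamped by dom0_max_vcpus=;
     * each vcpu  : soft affinity = dom0_cpus, hard affinity too unless
     *              "relaxed" was given or the domain is pinned.
     */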
+#ifdef CONFIG_SHADOW_PAGING
static bool_t __initdata opt_dom0_shadow;
boolean_param("dom0_shadow", opt_dom0_shadow);
+#else
+#define opt_dom0_shadow 0
+#endif
static char __initdata opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
+static bool_t __initdata ro_hpet = 1;
+boolean_param("ro-hpet", ro_hpet);
+
/* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
#define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
@@ -150,7 +220,7 @@ static struct page_info * __init alloc_chunk(
struct domain *d, unsigned long max_pages)
{
static unsigned int __initdata last_order = MAX_ORDER;
- static unsigned int __initdata memflags = MEMF_no_dma;
+ static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
struct page_info *page;
unsigned int order = get_order_from_pages(max_pages), free_order;
@@ -184,7 +254,7 @@ static struct page_info * __init alloc_chunk(
if ( d->tot_pages + (1 << order) > d->max_pages )
continue;
- pg2 = alloc_domheap_pages(d, order, 0);
+ pg2 = alloc_domheap_pages(d, order, MEMF_exact_node);
if ( pg2 > page )
{
free_domheap_pages(page, free_order);
@@ -197,19 +267,33 @@ static struct page_info * __init alloc_chunk(
return page;
}
+static unsigned long __init dom0_paging_pages(const struct domain *d,
+ unsigned long nr_pages)
+{
+ /* Copied from: libxl_get_required_shadow_memory() */
+ unsigned long memkb = nr_pages * (PAGE_SIZE / 1024);
+
+ memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
+
+ return ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
+}
+
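dom0_paging_pages() mirrors libxl_get_required_shadow_memory(). A worked
example, assuming 4 KiB pages: for a 1 GiB dom0 (262144 pages) with 4 vcpus,
memkb = 262144 * 4 = 1048576, so the estimate is
4 * (256 * 4 + 2 * (1048576 / 1024)) = 4 * 3072 = 12288 KiB, which rounds up
to a whole number of MiB (12) and converts to 3072 pages of paging pool.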
static unsigned long __init compute_dom0_nr_pages(
struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
{
- unsigned long avail = avail_domheap_pages() + initial_images_nrpages();
- unsigned long nr_pages = dom0_nrpages;
- unsigned long min_pages = dom0_min_nrpages;
- unsigned long max_pages = dom0_max_nrpages;
+ nodeid_t node;
+ unsigned long avail = 0, nr_pages, min_pages, max_pages;
+ bool_t need_paging;
+
+ for_each_node_mask ( node, dom0_nodes )
+ avail += avail_domheap_pages_region(node, 0, 0) +
+ initial_images_nrpages(node);
/* Reserve memory for further dom0 vcpu-struct allocations... */
avail -= (d->max_vcpus - 1UL)
<< get_order_from_bytes(sizeof(struct vcpu));
/* ...and compat_l4's, if needed. */
- if ( is_pv_32on64_domain(d) )
+ if ( is_pv_32bit_domain(d) )
avail -= d->max_vcpus - 1;
/* Reserve memory for iommu_dom0_init() (rough estimate). */
@@ -221,23 +305,37 @@ static unsigned long __init compute_dom0_nr_pages(
avail -= max_pdx >> s;
}
- /*
- * If domain 0 allocation isn't specified, reserve 1/16th of available
- * memory for things like DMA buffers. This reservation is clamped to
- * a maximum of 128MB.
- */
- if ( nr_pages == 0 )
- nr_pages = -min(avail / 16, 128UL << (20 - PAGE_SHIFT));
+ need_paging = opt_dom0_shadow || (is_pvh_domain(d) && !iommu_hap_pt_share);
+ for ( ; ; need_paging = 0 )
+ {
+ nr_pages = dom0_nrpages;
+ min_pages = dom0_min_nrpages;
+ max_pages = dom0_max_nrpages;
+
+ /*
+ * If allocation isn't specified, reserve 1/16th of available memory
+ * for things like DMA buffers. This reservation is clamped to a
+ * maximum of 128MB.
+ */
+ if ( nr_pages == 0 )
+ nr_pages = -min(avail / 16, 128UL << (20 - PAGE_SHIFT));
- /* Negative memory specification means "all memory - specified amount". */
- if ( (long)nr_pages < 0 ) nr_pages += avail;
- if ( (long)min_pages < 0 ) min_pages += avail;
- if ( (long)max_pages < 0 ) max_pages += avail;
+ /* Negative specification means "all memory - specified amount". */
+ if ( (long)nr_pages < 0 ) nr_pages += avail;
+ if ( (long)min_pages < 0 ) min_pages += avail;
+ if ( (long)max_pages < 0 ) max_pages += avail;
- /* Clamp dom0 memory according to min/max limits and available memory. */
- nr_pages = max(nr_pages, min_pages);
- nr_pages = min(nr_pages, max_pages);
- nr_pages = min(nr_pages, avail);
+ /* Clamp according to min/max limits and available memory. */
+ nr_pages = max(nr_pages, min_pages);
+ nr_pages = min(nr_pages, max_pages);
+ nr_pages = min(nr_pages, avail);
+
+ if ( !need_paging )
+ break;
+
+ /* Reserve memory for shadow or HAP. */
+ avail -= dom0_paging_pages(d, nr_pages);
+ }
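The loop replacing the straight-line sizing runs at most twice: the first
pass sizes dom0 with no paging reservation; if shadow or HAP memory is
needed (dom0_shadow, or PVH without shared HAP/IOMMU tables), the estimate
is subtracted from avail and the clamping re-runs once. A simplified sketch
of the pattern, ignoring the negative-specification and 128 MB default
subtleties (names hypothetical):

    static unsigned long size_dom0(unsigned long want, unsigned long min_p,
                                   unsigned long max_p, unsigned long avail,
                                   unsigned long (*paging_est)(unsigned long),
                                   int need_paging)
    {
        unsigned long nr;

        for ( ; ; need_paging = 0 )
        {
            nr = want ? want : avail / 16;  /* default: 1/16th of memory */
            if (nr < min_p) nr = min_p;
            if (nr > max_p) nr = max_p;
            if (nr > avail) nr = avail;

            if (!need_paging)
                return nr;

            avail -= paging_est(nr);        /* reserve paging pool, retry */
        }
    }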
if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) )
@@ -319,11 +417,26 @@ static __init void pvh_add_mem_mapping(struct domain *d, unsigned long gfn,
unsigned long mfn, unsigned long nr_mfns)
{
unsigned long i;
+ p2m_access_t a;
+ mfn_t omfn;
+ p2m_type_t t;
int rc;
for ( i = 0; i < nr_mfns; i++ )
{
- if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i))) )
+ if ( !iomem_access_permitted(d, mfn + i, mfn + i) )
+ {
+ omfn = get_gfn_query_unlocked(d, gfn + i, &t);
+ guest_physmap_remove_page(d, gfn + i, mfn_x(omfn), PAGE_ORDER_4K);
+ continue;
+ }
+
+ if ( rangeset_contains_singleton(mmio_ro_ranges, mfn + i) )
+ a = p2m_access_r;
+ else
+ a = p2m_access_rw;
+
+ if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i), a)) )
panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
gfn, mfn, i, rc);
if ( !(i & 0xfffff) )
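For each frame in the range, the reworked loop now picks one of three
outcomes:

    /*
     * - access not permitted: drop any stale p2m entry at that gfn, skip;
     * - mfn in mmio_ro_ranges: map with p2m_access_r (read-only);
     * - otherwise:             map with p2m_access_rw.
     */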
@@ -495,7 +608,7 @@ static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
BUG_ON(rc);
return;
}
- if ( !is_pv_32on64_domain(d) )
+ if ( !is_pv_32bit_domain(d) )
((unsigned long *)vphysmap_s)[pfn] = mfn;
else
((unsigned int *)vphysmap_s)[pfn] = mfn;
@@ -517,7 +630,7 @@ static __init void pvh_fixup_page_tables_for_hap(struct vcpu *v,
ASSERT(paging_mode_enabled(v->domain));
- l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));
+ l4start = map_domain_page(_mfn(pagetable_get_pfn(v->arch.guest_table)));
/* Clear entries prior to guest L4 start */
pl4e = l4start + l4_table_offset(v_start);
@@ -605,7 +718,7 @@ static __init void mark_pv_pt_pages_rdonly(struct domain *d,
/* Top-level p.t. is pinned. */
if ( (page->u.inuse.type_info & PGT_type_mask) ==
- (!is_pv_32on64_domain(d) ?
+ (!is_pv_32bit_domain(d) ?
PGT_l4_page_table : PGT_l3_page_table) )
{
page->count_info += 1;
@@ -633,7 +746,7 @@ static __init void setup_pv_physmap(struct domain *d, unsigned long pgtbl_pfn,
unsigned long nr_pages)
{
struct page_info *page = NULL;
- l4_pgentry_t *pl4e, *l4start = map_domain_page(pgtbl_pfn);
+ l4_pgentry_t *pl4e, *l4start = map_domain_page(_mfn(pgtbl_pfn));
l3_pgentry_t *pl3e = NULL;
l2_pgentry_t *pl2e = NULL;
l1_pgentry_t *pl1e = NULL;
@@ -676,7 +789,7 @@ static __init void setup_pv_physmap(struct domain *d, unsigned long pgtbl_pfn,
clear_page(pl3e);
*pl4e = l4e_from_page(page, L4_PROT);
} else
- pl3e = map_domain_page(l4e_get_pfn(*pl4e));
+ pl3e = map_domain_page(_mfn(l4e_get_pfn(*pl4e)));
pl3e += l3_table_offset(vphysmap_start);
if ( !l3e_get_intpte(*pl3e) )
@@ -703,7 +816,7 @@ static __init void setup_pv_physmap(struct domain *d, unsigned long pgtbl_pfn,
*pl3e = l3e_from_page(page, L3_PROT);
}
else
- pl2e = map_domain_page(l3e_get_pfn(*pl3e));
+ pl2e = map_domain_page(_mfn(l3e_get_pfn(*pl3e)));
pl2e += l2_table_offset(vphysmap_start);
if ( !l2e_get_intpte(*pl2e) )
@@ -731,7 +844,7 @@ static __init void setup_pv_physmap(struct domain *d, unsigned long pgtbl_pfn,
*pl2e = l2e_from_page(page, L2_PROT);
}
else
- pl1e = map_domain_page(l2e_get_pfn(*pl2e));
+ pl1e = map_domain_page(_mfn(l2e_get_pfn(*pl2e)));
pl1e += l1_table_offset(vphysmap_start);
BUG_ON(l1e_get_intpte(*pl1e));
@@ -817,6 +930,8 @@ int __init construct_dom0(
BUG_ON(d->vcpu[0] == NULL);
BUG_ON(v->is_initialised);
+ process_pending_softirqs();
+
printk("*** LOADING DOMAIN 0 ***\n");
d->max_pages = ~0U;
@@ -933,7 +1048,7 @@ int __init construct_dom0(
vinitrd_end = vinitrd_start + initrd_len;
vphysmap_start = round_pgup(vinitrd_end);
}
- vphysmap_end = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
+ vphysmap_end = vphysmap_start + (nr_pages * (!is_pv_32bit_domain(d) ?
sizeof(unsigned long) :
sizeof(unsigned int)));
if ( parms.p2m_base != UNSET_ADDR )
@@ -961,9 +1076,9 @@ int __init construct_dom0(
#define NR(_l,_h,_s) \
(((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
((_l) & ~((1UL<<(_s))-1))) >> (_s))
- if ( (!is_pv_32on64_domain(d) + /* # L4 */
+ if ( (!is_pv_32bit_domain(d) + /* # L4 */
NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
- (!is_pv_32on64_domain(d) ?
+ (!is_pv_32bit_domain(d) ?
NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */
4) + /* # compat L2 */
NR(v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */
@@ -1054,12 +1169,14 @@ int __init construct_dom0(
_p(v_start), _p(v_end));
printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));
+ process_pending_softirqs();
+
mpt_alloc = (vpt_start - v_start) + pfn_to_paddr(alloc_spfn);
if ( vinitrd_start )
mpt_alloc -= PAGE_ALIGN(initrd_len);
/* Overlap with Xen protected area? */
- if ( !is_pv_32on64_domain(d) ?
+ if ( !is_pv_32bit_domain(d) ?
((v_start < HYPERVISOR_VIRT_END) &&
(v_end > HYPERVISOR_VIRT_START)) :
(v_end > HYPERVISOR_COMPAT_VIRT_START(d)) )
@@ -1069,21 +1186,21 @@ int __init construct_dom0(
goto out;
}
- if ( is_pv_32on64_domain(d) )
+ if ( is_pv_32bit_domain(d) )
{
v->arch.pv_vcpu.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS;
v->arch.pv_vcpu.event_callback_cs = FLAT_COMPAT_KERNEL_CS;
}
/* WARNING: The new domain must have its 'processor' field filled in! */
- if ( !is_pv_32on64_domain(d) )
+ if ( !is_pv_32bit_domain(d) )
{
maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
}
else
{
- page = alloc_domheap_page(NULL, 0);
+ page = alloc_domheap_page(d, MEMF_no_owner);
if ( !page )
panic("Not enough RAM for domain 0 PML4");
page->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
@@ -1092,9 +1209,9 @@ int __init construct_dom0(
l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
}
clear_page(l4tab);
- init_guest_l4_table(l4tab, d);
+ init_guest_l4_table(l4tab, d, 0);
v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
- if ( is_pv_32on64_domain(d) )
+ if ( is_pv_32bit_domain(d) )
v->arch.guest_table_user = v->arch.guest_table;
l4tab += l4_table_offset(v_start);
@@ -1140,7 +1257,7 @@ int __init construct_dom0(
mfn = pfn++;
else
mfn = initrd_mfn++;
- *l1tab = l1e_from_pfn(mfn, (!is_pv_32on64_domain(d) ?
+ *l1tab = l1e_from_pfn(mfn, (!is_pv_32bit_domain(d) ?
L1_PROT : COMPAT_L1_PROT));
l1tab++;
@@ -1153,7 +1270,7 @@ int __init construct_dom0(
}
}
- if ( is_pv_32on64_domain(d) )
+ if ( is_pv_32bit_domain(d) )
{
/* Ensure the first four L3 entries are all populated. */
for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
@@ -1185,11 +1302,11 @@ int __init construct_dom0(
printk("Dom0 has maximum %u VCPUs\n", d->max_vcpus);
- cpu = cpumask_first(cpupool0->cpu_valid);
+ cpu = v->processor;
for ( i = 1; i < d->max_vcpus; i++ )
{
- cpu = cpumask_cycle(cpu, cpupool0->cpu_valid);
- (void)alloc_vcpu(d, i, cpu);
+ cpu = cpumask_cycle(cpu, &dom0_cpus);
+ setup_dom0_vcpu(d, i, cpu);
}
/*
@@ -1266,14 +1383,7 @@ int __init construct_dom0(
}
if ( is_pvh_domain(d) )
- {
- unsigned long hap_pages, memkb = nr_pages * (PAGE_SIZE / 1024);
-
- /* Copied from: libxl_get_required_shadow_memory() */
- memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
- hap_pages = ( (memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
- hap_set_alloc_for_pvh_dom0(d, hap_pages);
- }
+ hap_set_alloc_for_pvh_dom0(d, dom0_paging_pages(d, nr_pages));
/*
* We enable paging mode again so guest_physmap_add_page will do the
@@ -1367,7 +1477,7 @@ int __init construct_dom0(
if ( is_pvh_domain(d) )
si->shared_info = shared_info_paddr;
- if ( is_pv_32on64_domain(d) )
+ if ( is_pv_32bit_domain(d) )
xlat_start_info(si, XLAT_start_info_console_dom0);
/* Return to idle domain's page tables. */
@@ -1389,16 +1499,17 @@ int __init construct_dom0(
*/
regs = &v->arch.user_regs;
regs->ds = regs->es = regs->fs = regs->gs =
- !is_pv_32on64_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS;
- regs->ss = (!is_pv_32on64_domain(d) ?
+ !is_pv_32bit_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS;
+ regs->ss = (!is_pv_32bit_domain(d) ?
FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS);
- regs->cs = (!is_pv_32on64_domain(d) ?
+ regs->cs = (!is_pv_32bit_domain(d) ?
FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS);
regs->eip = parms.virt_entry;
regs->esp = vstack_end;
regs->esi = vstartinfo_start;
regs->eflags = X86_EFLAGS_IF;
+#ifdef CONFIG_SHADOW_PAGING
if ( opt_dom0_shadow )
{
if ( is_pvh_domain(d) )
@@ -1409,25 +1520,14 @@ int __init construct_dom0(
if ( paging_enable(d, PG_SH_enable) == 0 )
paging_update_paging_modes(v);
}
+#endif
- if ( supervisor_mode_kernel )
- {
- v->arch.pv_vcpu.kernel_ss &= ~3;
- v->arch.user_regs.ss &= ~3;
- v->arch.user_regs.es &= ~3;
- v->arch.user_regs.ds &= ~3;
- v->arch.user_regs.fs &= ~3;
- v->arch.user_regs.gs &= ~3;
- printk("Dom0 runs in ring 0 (supervisor mode)\n");
- if ( !test_bit(XENFEAT_supervisor_mode_kernel,
- parms.f_supported) )
- panic("Dom0 does not support supervisor-mode execution");
- }
- else
- {
- if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) )
- panic("Dom0 requires supervisor-mode execution");
- }
+ /*
+ * PVH Fixme: XENFEAT_supervisor_mode_kernel has been reused in PVH with a
+ * different meaning.
+ */
+ if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) )
+ panic("Dom0 requires supervisor-mode execution");
rc = 0;
@@ -1492,6 +1592,20 @@ int __init construct_dom0(
rc |= iomem_deny_access(d, sfn, efn);
}
+ /* Prevent access to HPET */
+ if ( hpet_address )
+ {
+ u8 prot_flags = hpet_flags & ACPI_HPET_PAGE_PROTECT_MASK;
+
+ mfn = paddr_to_pfn(hpet_address);
+ if ( prot_flags == ACPI_HPET_PAGE_PROTECT4 )
+ rc |= iomem_deny_access(d, mfn, mfn);
+ else if ( prot_flags == ACPI_HPET_PAGE_PROTECT64 )
+ rc |= iomem_deny_access(d, mfn, mfn + 15);
+ else if ( ro_hpet )
+ rc |= rangeset_add_singleton(mmio_ro_ranges, mfn);
+ }
+
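The HPET block distinguishes three cases from the ACPI page-protection
flags:

    /*
     * PROTECT4  : deny dom0 the single 4 KiB page at hpet_address;
     * PROTECT64 : deny the whole 64 KiB region (mfn .. mfn + 15);
     * otherwise : with ro-hpet=1 (the default), grant read-only access
     *             via mmio_ro_ranges instead of denying it outright.
     */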
BUG_ON(rc != 0);
if ( elf_check_broken(&elf) )
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 158a164..d86f8fe 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -32,20 +32,25 @@ static inline struct vcpu *mapcache_current_vcpu(void)
return NULL;
/*
+ * When using efi runtime page tables, we have the equivalent of the idle
+ * domain's page tables but current may point at another domain's VCPU.
+ * Return NULL as though current is not properly set up yet.
+ */
+ if ( efi_enabled && efi_rs_using_pgtables() )
+ return NULL;
+
+ /*
* If guest_table is NULL, and we are running a paravirtualised guest,
* then it means we are running on the idle domain's page table and must
* therefore use its mapcache.
*/
if ( unlikely(pagetable_is_null(v->arch.guest_table)) && is_pv_vcpu(v) )
{
- unsigned long cr3;
-
/* If we really are idling, perform lazy context switch now. */
if ( (v = idle_vcpu[smp_processor_id()]) == current )
sync_local_execstate();
/* We must now be running on the idle page table. */
- ASSERT((cr3 = read_cr3()) == __pa(idle_pg_table) ||
- (efi_enabled && cr3 == efi_rs_page_table()));
+ ASSERT(read_cr3() == __pa(idle_pg_table));
}
return v;
@@ -61,7 +66,7 @@ void __init mapcache_override_current(struct vcpu *v)
#define MAPCACHE_L1ENT(idx) \
__linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
-void *map_domain_page(unsigned long mfn)
+void *map_domain_page(mfn_t mfn)
{
unsigned long flags;
unsigned int idx, i;
@@ -71,31 +76,31 @@ void *map_domain_page(unsigned long mfn)
struct vcpu_maphash_entry *hashent;
#ifdef NDEBUG
- if ( mfn <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
- return mfn_to_virt(mfn);
+ if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+ return mfn_to_virt(mfn_x(mfn));
#endif
v = mapcache_current_vcpu();
if ( !v || !is_pv_vcpu(v) )
- return mfn_to_virt(mfn);
+ return mfn_to_virt(mfn_x(mfn));
dcache = &v->domain->arch.pv_domain.mapcache;
vcache = &v->arch.pv_vcpu.mapcache;
if ( !dcache->inuse )
- return mfn_to_virt(mfn);
+ return mfn_to_virt(mfn_x(mfn));
perfc_incr(map_domain_page_count);
local_irq_save(flags);
- hashent = &vcache->hash[MAPHASH_HASHFN(mfn)];
- if ( hashent->mfn == mfn )
+ hashent = &vcache->hash[MAPHASH_HASHFN(mfn_x(mfn))];
+ if ( hashent->mfn == mfn_x(mfn) )
{
idx = hashent->idx;
ASSERT(idx < dcache->entries);
hashent->refcnt++;
ASSERT(hashent->refcnt);
- ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == mfn);
+ ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == mfn_x(mfn));
goto out;
}
@@ -130,7 +135,7 @@ void *map_domain_page(unsigned long mfn)
else
{
/* Replace a hash entry instead. */
- i = MAPHASH_HASHFN(mfn);
+ i = MAPHASH_HASHFN(mfn_x(mfn));
do {
hashent = &vcache->hash[i];
if ( hashent->idx != MAPHASHENT_NOTINUSE && !hashent->refcnt )
@@ -144,7 +149,7 @@ void *map_domain_page(unsigned long mfn)
}
if ( ++i == MAPHASH_ENTRIES )
i = 0;
- } while ( i != MAPHASH_HASHFN(mfn) );
+ } while ( i != MAPHASH_HASHFN(mfn_x(mfn)) );
}
BUG_ON(idx >= dcache->entries);
@@ -160,7 +165,7 @@ void *map_domain_page(unsigned long mfn)
spin_unlock(&dcache->lock);
- l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
+ l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR_RW));
out:
local_irq_restore(flags);
@@ -225,24 +230,6 @@ void unmap_domain_page(const void *ptr)
local_irq_restore(flags);
}
-void clear_domain_page(unsigned long mfn)
-{
- void *ptr = map_domain_page(mfn);
-
- clear_page(ptr);
- unmap_domain_page(ptr);
-}
-
-void copy_domain_page(unsigned long dmfn, unsigned long smfn)
-{
- const void *src = map_domain_page(smfn);
- void *dst = map_domain_page(dmfn);
-
- copy_page(dst, src);
- unmap_domain_page(dst);
- unmap_domain_page(src);
-}
-
int mapcache_domain_init(struct domain *d)
{
struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
@@ -315,13 +302,13 @@ int mapcache_vcpu_init(struct vcpu *v)
return 0;
}
-void *map_domain_page_global(unsigned long mfn)
+void *map_domain_page_global(mfn_t mfn)
{
ASSERT(!in_irq() && local_irq_is_enabled());
#ifdef NDEBUG
- if ( mfn <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
- return mfn_to_virt(mfn);
+ if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+ return mfn_to_virt(mfn_x(mfn));
#endif
return vmap(&mfn, 1);
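The domain_page.c sweep converts map_domain_page() and
map_domain_page_global() from raw unsigned long frame numbers to the typed
mfn_t. The idiom, reduced to its core -- a sketch only; the real definitions
come from Xen's TYPE_SAFE machinery and compile down to plain integers in
release builds:

    /* A one-member struct: mixing up MFNs, PFNs and plain integers
     * becomes a compile-time error instead of a latent bug. */
    typedef struct { unsigned long mfn; } mfn_t;

    static inline mfn_t _mfn(unsigned long m)  { return (mfn_t){ m }; }
    static inline unsigned long mfn_x(mfn_t m) { return m.mfn; }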
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 1585526..bf62a88 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1,6 +1,6 @@
/******************************************************************************
* Arch-specific domctl.c
- *
+ *
* Copyright (c) 2002-2006, K A Fraser
*/
@@ -30,21 +30,21 @@
#include <xen/hypercall.h> /* for arch_do_domctl */
#include <xsm/xsm.h>
#include <xen/iommu.h>
-#include <xen/mem_event.h>
-#include <public/mem_event.h>
+#include <xen/vm_event.h>
+#include <public/vm_event.h>
#include <asm/mem_sharing.h>
#include <asm/xstate.h>
#include <asm/debugger.h>
#include <asm/psr.h>
-static int gdbsx_guest_mem_io(
- domid_t domid, struct xen_domctl_gdbsx_memio *iop)
-{
- ulong l_uva = (ulong)iop->uva;
- iop->remain = dbg_rw_mem(
- (dbgva_t)iop->gva, (dbgbyte_t *)l_uva, iop->len, domid,
- iop->gwr, iop->pgd3val);
- return (iop->remain ? -EFAULT : 0);
+static int gdbsx_guest_mem_io(domid_t domid, struct xen_domctl_gdbsx_memio *iop)
+{
+ void * __user gva = (void *)iop->gva, * __user uva = (void *)iop->uva;
+
+ iop->remain = dbg_rw_mem(gva, uva, iop->len, domid,
+ !!iop->gwr, iop->pgd3val);
+
+ return iop->remain ? -EFAULT : 0;
}
#define MAX_IOPORTS 0x10000
@@ -53,6 +53,8 @@ long arch_do_domctl(
struct xen_domctl *domctl, struct domain *d,
XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
{
+ struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
long ret = 0;
bool_t copyback = 0;
unsigned long i;
@@ -61,15 +63,13 @@ long arch_do_domctl(
{
case XEN_DOMCTL_shadow_op:
- {
ret = paging_domctl(d, &domctl->u.shadow_op,
guest_handle_cast(u_domctl, void), 0);
if ( ret == -ERESTART )
return hypercall_create_continuation(__HYPERVISOR_arch_1,
"h", u_domctl);
copyback = 1;
- }
- break;
+ break;
case XEN_DOMCTL_ioport_permission:
{
@@ -79,8 +79,7 @@ long arch_do_domctl(
if ( (fp + np) <= fp || (fp + np) > MAX_IOPORTS )
ret = -EINVAL;
- else if ( !ioports_access_permitted(current->domain,
- fp, fp + np - 1) ||
+ else if ( !ioports_access_permitted(currd, fp, fp + np - 1) ||
xsm_ioport_permission(XSM_HOOK, d, fp, fp + np - 1, allow) )
ret = -EPERM;
else if ( allow )
@@ -89,239 +88,8 @@ long arch_do_domctl(
ret = ioports_deny_access(d, fp, fp + np - 1);
if ( !ret )
memory_type_changed(d);
+ break;
}
- break;
-
- case XEN_DOMCTL_getpageframeinfo:
- {
- struct page_info *page;
- unsigned long mfn = domctl->u.getpageframeinfo.gmfn;
-
- ret = -EINVAL;
- if ( unlikely(!mfn_valid(mfn)) )
- break;
-
- page = mfn_to_page(mfn);
-
- if ( likely(get_page(page, d)) )
- {
- ret = 0;
-
- domctl->u.getpageframeinfo.type = XEN_DOMCTL_PFINFO_NOTAB;
-
- if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
- {
- switch ( page->u.inuse.type_info & PGT_type_mask )
- {
- case PGT_l1_page_table:
- domctl->u.getpageframeinfo.type = XEN_DOMCTL_PFINFO_L1TAB;
- break;
- case PGT_l2_page_table:
- domctl->u.getpageframeinfo.type = XEN_DOMCTL_PFINFO_L2TAB;
- break;
- case PGT_l3_page_table:
- domctl->u.getpageframeinfo.type = XEN_DOMCTL_PFINFO_L3TAB;
- break;
- case PGT_l4_page_table:
- domctl->u.getpageframeinfo.type = XEN_DOMCTL_PFINFO_L4TAB;
- break;
- }
- }
-
- put_page(page);
- }
-
- copyback = 1;
- }
- break;
-
- case XEN_DOMCTL_getpageframeinfo3:
- if (!has_32bit_shinfo(current->domain))
- {
- unsigned int n, j;
- unsigned int num = domctl->u.getpageframeinfo3.num;
- struct page_info *page;
- xen_pfn_t *arr;
-
- if ( unlikely(num > 1024) ||
- unlikely(num != domctl->u.getpageframeinfo3.num) )
- {
- ret = -E2BIG;
- break;
- }
-
- page = alloc_domheap_page(NULL, 0);
- if ( !page )
- {
- ret = -ENOMEM;
- break;
- }
- arr = __map_domain_page(page);
-
- for ( n = ret = 0; n < num; )
- {
- unsigned int k = min_t(unsigned int, num - n,
- PAGE_SIZE / sizeof(*arr));
-
- if ( copy_from_guest_offset(arr,
- domctl->u.getpageframeinfo3.array,
- n, k) )
- {
- ret = -EFAULT;
- break;
- }
-
- for ( j = 0; j < k; j++ )
- {
- unsigned long type = 0;
- p2m_type_t t;
-
- page = get_page_from_gfn(d, arr[j], &t, P2M_ALLOC);
-
- if ( unlikely(!page) ||
- unlikely(is_xen_heap_page(page)) )
- {
- if ( p2m_is_broken(t) )
- type = XEN_DOMCTL_PFINFO_BROKEN;
- else
- type = XEN_DOMCTL_PFINFO_XTAB;
- }
- else
- {
- switch( page->u.inuse.type_info & PGT_type_mask )
- {
- case PGT_l1_page_table:
- type = XEN_DOMCTL_PFINFO_L1TAB;
- break;
- case PGT_l2_page_table:
- type = XEN_DOMCTL_PFINFO_L2TAB;
- break;
- case PGT_l3_page_table:
- type = XEN_DOMCTL_PFINFO_L3TAB;
- break;
- case PGT_l4_page_table:
- type = XEN_DOMCTL_PFINFO_L4TAB;
- break;
- }
-
- if ( page->u.inuse.type_info & PGT_pinned )
- type |= XEN_DOMCTL_PFINFO_LPINTAB;
-
- if ( page->count_info & PGC_broken )
- type = XEN_DOMCTL_PFINFO_BROKEN;
- }
-
- if ( page )
- put_page(page);
- arr[j] = type;
- }
-
- if ( copy_to_guest_offset(domctl->u.getpageframeinfo3.array,
- n, arr, k) )
- {
- ret = -EFAULT;
- break;
- }
-
- n += k;
- }
-
- page = mfn_to_page(domain_page_map_to_mfn(arr));
- unmap_domain_page(arr);
- free_domheap_page(page);
-
- break;
- }
- /* fall thru */
- case XEN_DOMCTL_getpageframeinfo2:
- {
- int n,j;
- int num = domctl->u.getpageframeinfo2.num;
- uint32_t *arr32;
-
- if ( unlikely(num > 1024) )
- {
- ret = -E2BIG;
- break;
- }
-
- arr32 = alloc_xenheap_page();
- if ( !arr32 )
- {
- ret = -ENOMEM;
- break;
- }
-
- ret = 0;
- for ( n = 0; n < num; )
- {
- int k = PAGE_SIZE / 4;
- if ( (num - n) < k )
- k = num - n;
-
- if ( copy_from_guest_offset(arr32,
- domctl->u.getpageframeinfo2.array,
- n, k) )
- {
- ret = -EFAULT;
- break;
- }
-
- for ( j = 0; j < k; j++ )
- {
- struct page_info *page;
- unsigned long gfn = arr32[j];
-
- page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
-
- if ( domctl->cmd == XEN_DOMCTL_getpageframeinfo3)
- arr32[j] = 0;
-
- if ( unlikely(!page) ||
- unlikely(is_xen_heap_page(page)) )
- arr32[j] |= XEN_DOMCTL_PFINFO_XTAB;
- else
- {
- unsigned long type = 0;
-
- switch( page->u.inuse.type_info & PGT_type_mask )
- {
- case PGT_l1_page_table:
- type = XEN_DOMCTL_PFINFO_L1TAB;
- break;
- case PGT_l2_page_table:
- type = XEN_DOMCTL_PFINFO_L2TAB;
- break;
- case PGT_l3_page_table:
- type = XEN_DOMCTL_PFINFO_L3TAB;
- break;
- case PGT_l4_page_table:
- type = XEN_DOMCTL_PFINFO_L4TAB;
- break;
- }
-
- if ( page->u.inuse.type_info & PGT_pinned )
- type |= XEN_DOMCTL_PFINFO_LPINTAB;
- arr32[j] |= type;
- }
-
- if ( page )
- put_page(page);
- }
-
- if ( copy_to_guest_offset(domctl->u.getpageframeinfo2.array,
- n, arr32, k) )
- {
- ret = -EFAULT;
- break;
- }
-
- n += k;
- }
-
- free_xenheap_page(arr32);
- }
- break;
case XEN_DOMCTL_getmemlist:
{
@@ -329,7 +97,8 @@ long arch_do_domctl(
uint64_t mfn;
struct page_info *page;
- if ( unlikely(d->is_dying) ) {
+ if ( unlikely(d->is_dying) )
+ {
ret = -EINVAL;
break;
}
@@ -346,7 +115,7 @@ long arch_do_domctl(
* rather than trying to fix it we restrict it for the time being.
*/
if ( /* No nested locks inside copy_to_guest_offset(). */
- paging_mode_external(current->domain) ||
+ paging_mode_external(currd) ||
/* Arbitrary limit capping processing time. */
max_pfns > GB(4) / PAGE_SIZE )
{
@@ -375,8 +144,83 @@ long arch_do_domctl(
domctl->u.getmemlist.num_pfns = i;
copyback = 1;
+ break;
+ }
+
+ case XEN_DOMCTL_getpageframeinfo3:
+ {
+ unsigned int num = domctl->u.getpageframeinfo3.num;
+ unsigned int width = has_32bit_shinfo(currd) ? 4 : 8;
+
+ /* Games to allow this code block to handle a compat guest. */
+ void __user *guest_handle = domctl->u.getpageframeinfo3.array.p;
+
+ if ( unlikely(num > 1024) ||
+ unlikely(num != domctl->u.getpageframeinfo3.num) )
+ {
+ ret = -E2BIG;
+ break;
+ }
+
+ for ( i = 0; i < num; ++i )
+ {
+ unsigned long gfn = 0, type = 0;
+ struct page_info *page;
+ p2m_type_t t;
+
+ if ( raw_copy_from_guest(&gfn, guest_handle + (i * width), width) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ page = get_page_from_gfn(d, gfn, &t, P2M_ALLOC);
+
+ if ( unlikely(!page) ||
+ unlikely(is_xen_heap_page(page)) )
+ {
+ if ( unlikely(p2m_is_broken(t)) )
+ type = XEN_DOMCTL_PFINFO_BROKEN;
+ else
+ type = XEN_DOMCTL_PFINFO_XTAB;
+ }
+ else
+ {
+ switch( page->u.inuse.type_info & PGT_type_mask )
+ {
+ case PGT_l1_page_table:
+ type = XEN_DOMCTL_PFINFO_L1TAB;
+ break;
+ case PGT_l2_page_table:
+ type = XEN_DOMCTL_PFINFO_L2TAB;
+ break;
+ case PGT_l3_page_table:
+ type = XEN_DOMCTL_PFINFO_L3TAB;
+ break;
+ case PGT_l4_page_table:
+ type = XEN_DOMCTL_PFINFO_L4TAB;
+ break;
+ }
+
+ if ( page->u.inuse.type_info & PGT_pinned )
+ type |= XEN_DOMCTL_PFINFO_LPINTAB;
+
+ if ( page->count_info & PGC_broken )
+ type = XEN_DOMCTL_PFINFO_BROKEN;
+ }
+
+ if ( page )
+ put_page(page);
+
+ if ( __raw_copy_to_guest(guest_handle + (i * width), &type, width) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ break;
}
- break;
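The "games" comment above refers to serving both toolstack ABIs from one
loop: a compat (32-bit) caller passes an array of 32-bit frame numbers, a
native caller 64-bit entries. Rather than duplicating the loop, the handle
is treated as a raw byte pointer and "width" bytes are copied per element:

    width = has_32bit_shinfo(currd) ? 4 : 8;
    raw_copy_from_guest(&gfn, guest_handle + (i * width), width);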
case XEN_DOMCTL_hypercall_init:
{
@@ -386,30 +230,33 @@ long arch_do_domctl(
page = get_page_from_gfn(d, gmfn, NULL, P2M_ALLOC);
- ret = -EACCES;
if ( !page || !get_page_type(page, PGT_writable_page) )
{
if ( page )
+ {
+ ret = -EPERM;
put_page(page);
+ }
+ else
+ ret = -EINVAL;
break;
}
- ret = 0;
-
hypercall_page = __map_domain_page(page);
hypercall_page_initialise(d, hypercall_page);
unmap_domain_page(hypercall_page);
put_page_and_type(page);
+ break;
}
- break;
case XEN_DOMCTL_sethvmcontext:
- {
+ {
struct hvm_domain_context c = { .size = domctl->u.hvmcontext.size };
ret = -EINVAL;
- if ( !is_hvm_domain(d) )
+ if ( (d == currd) || /* no domain_pause() */
+ !is_hvm_domain(d) )
goto sethvmcontext_out;
ret = -ENOMEM;
@@ -417,7 +264,7 @@ long arch_do_domctl(
goto sethvmcontext_out;
ret = -EFAULT;
- if ( copy_from_guest(c.data, domctl->u.hvmcontext.buffer, c.size) != 0)
+ if ( copy_from_guest(c.data, domctl->u.hvmcontext.buffer, c.size) != 0 )
goto sethvmcontext_out;
domain_pause(d);
@@ -425,17 +272,17 @@ long arch_do_domctl(
domain_unpause(d);
sethvmcontext_out:
- if ( c.data != NULL )
- xfree(c.data);
+ xfree(c.data);
+ break;
}
- break;
case XEN_DOMCTL_gethvmcontext:
- {
+ {
struct hvm_domain_context c = { 0 };
ret = -EINVAL;
- if ( !is_hvm_domain(d) )
+ if ( (d == currd) || /* no domain_pause() */
+ !is_hvm_domain(d) )
goto gethvmcontext_out;
c.size = hvm_save_size(d);
@@ -445,12 +292,12 @@ long arch_do_domctl(
/* Client is querying for the correct buffer size */
domctl->u.hvmcontext.size = c.size;
ret = 0;
- goto gethvmcontext_out;
+ goto gethvmcontext_out;
}
/* Check that the client has a big enough buffer */
ret = -ENOSPC;
- if ( domctl->u.hvmcontext.size < c.size )
+ if ( domctl->u.hvmcontext.size < c.size )
goto gethvmcontext_out;
/* Allocate our own marshalling buffer */
@@ -468,16 +315,14 @@ long arch_do_domctl(
gethvmcontext_out:
copyback = 1;
-
- if ( c.data != NULL )
- xfree(c.data);
+ xfree(c.data);
+ break;
}
- break;
case XEN_DOMCTL_gethvmcontext_partial:
- {
ret = -EINVAL;
- if ( !is_hvm_domain(d) )
+ if ( (d == currd) || /* no domain_pause() */
+ !is_hvm_domain(d) )
break;
domain_pause(d);
@@ -485,12 +330,9 @@ long arch_do_domctl(
domctl->u.hvmcontext_partial.instance,
domctl->u.hvmcontext_partial.buffer);
domain_unpause(d);
- }
- break;
-
+ break;
case XEN_DOMCTL_set_address_size:
- {
switch ( domctl->u.address_size.size )
{
case 32:
@@ -503,48 +345,30 @@ long arch_do_domctl(
ret = (domctl->u.address_size.size == BITS_PER_LONG) ? 0 : -EINVAL;
break;
}
- }
- break;
+ break;
case XEN_DOMCTL_get_address_size:
- {
domctl->u.address_size.size =
- is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
-
- ret = 0;
+ is_pv_32bit_domain(d) ? 32 : BITS_PER_LONG;
copyback = 1;
- }
- break;
+ break;
case XEN_DOMCTL_set_machine_address_size:
- {
- ret = -EBUSY;
if ( d->tot_pages > 0 )
- break;
-
- d->arch.physaddr_bitsize = domctl->u.address_size.size;
-
- ret = 0;
- }
- break;
+ ret = -EBUSY;
+ else
+ d->arch.physaddr_bitsize = domctl->u.address_size.size;
+ break;
case XEN_DOMCTL_get_machine_address_size:
- {
domctl->u.address_size.size = d->arch.physaddr_bitsize;
-
- ret = 0;
copyback = 1;
- }
- break;
+ break;
case XEN_DOMCTL_sendtrigger:
{
struct vcpu *v;
- ret = -EINVAL;
- if ( domctl->u.sendtrigger.vcpu >= MAX_VIRT_CPUS )
- break;
-
ret = -ESRCH;
if ( domctl->u.sendtrigger.vcpu >= d->max_vcpus ||
(v = d->vcpu[domctl->u.sendtrigger.vcpu]) == NULL )
@@ -553,40 +377,34 @@ long arch_do_domctl(
switch ( domctl->u.sendtrigger.trigger )
{
case XEN_DOMCTL_SENDTRIGGER_NMI:
- {
ret = 0;
if ( !test_and_set_bool(v->nmi_pending) )
vcpu_kick(v);
- }
- break;
+ break;
case XEN_DOMCTL_SENDTRIGGER_POWER:
- {
ret = -EINVAL;
- if ( is_hvm_domain(d) )
+ if ( is_hvm_domain(d) )
{
ret = 0;
hvm_acpi_power_button(d);
}
- }
- break;
+ break;
case XEN_DOMCTL_SENDTRIGGER_SLEEP:
- {
ret = -EINVAL;
- if ( is_hvm_domain(d) )
+ if ( is_hvm_domain(d) )
{
ret = 0;
hvm_acpi_sleep_button(d);
}
- }
- break;
+ break;
default:
ret = -ENOSYS;
}
+ break;
}
- break;
case XEN_DOMCTL_bind_pt_irq:
{
@@ -603,7 +421,7 @@ long arch_do_domctl(
irq = domain_pirq_to_irq(d, bind->machine_irq);
ret = -EPERM;
- if ( irq <= 0 || !irq_access_permitted(current->domain, irq) )
+ if ( irq <= 0 || !irq_access_permitted(currd, irq) )
break;
ret = -ESRCH;
@@ -616,8 +434,8 @@ long arch_do_domctl(
if ( ret < 0 )
printk(XENLOG_G_ERR "pt_irq_create_bind failed (%ld) for dom%d\n",
ret, d->domain_id);
+ break;
}
- break;
case XEN_DOMCTL_unbind_pt_irq:
{
@@ -625,7 +443,7 @@ long arch_do_domctl(
int irq = domain_pirq_to_irq(d, bind->machine_irq);
ret = -EPERM;
- if ( irq <= 0 || !irq_access_permitted(current->domain, irq) )
+ if ( irq <= 0 || !irq_access_permitted(currd, irq) )
break;
ret = xsm_unbind_pt_irq(XSM_HOOK, d, bind);
@@ -641,8 +459,8 @@ long arch_do_domctl(
if ( ret < 0 )
printk(XENLOG_G_ERR "pt_irq_destroy_bind failed (%ld) for dom%d\n",
ret, d->domain_id);
+ break;
}
- break;
case XEN_DOMCTL_ioport_mapping:
{
@@ -665,7 +483,7 @@ long arch_do_domctl(
}
ret = -EPERM;
- if ( !ioports_access_permitted(current->domain, fmp, fmp + np - 1) )
+ if ( !ioports_access_permitted(currd, fmp, fmp + np - 1) )
break;
ret = xsm_ioport_mapping(XSM_HOOK, d, fmp, fmp + np - 1, add);
@@ -721,33 +539,29 @@ long arch_do_domctl(
break;
}
ret = ioports_deny_access(d, fmp, fmp + np - 1);
- if ( ret && is_hardware_domain(current->domain) )
+ if ( ret && is_hardware_domain(currd) )
printk(XENLOG_ERR
"ioport_map: error %ld denying dom%d access to [%x,%x]\n",
ret, d->domain_id, fmp, fmp + np - 1);
}
if ( !ret )
memory_type_changed(d);
+ break;
}
- break;
case XEN_DOMCTL_pin_mem_cacheattr:
- {
ret = hvm_set_mem_pinned_cacheattr(
d, domctl->u.pin_mem_cacheattr.start,
domctl->u.pin_mem_cacheattr.end,
domctl->u.pin_mem_cacheattr.type);
- }
- break;
+ break;
case XEN_DOMCTL_set_ext_vcpucontext:
case XEN_DOMCTL_get_ext_vcpucontext:
{
- struct xen_domctl_ext_vcpucontext *evc;
+ struct xen_domctl_ext_vcpucontext *evc = &domctl->u.ext_vcpucontext;
struct vcpu *v;
- evc = &domctl->u.ext_vcpucontext;
-
ret = -ESRCH;
if ( (evc->vcpu >= d->max_vcpus) ||
((v = d->vcpu[evc->vcpu]) == NULL) )
@@ -755,7 +569,7 @@ long arch_do_domctl(
if ( domctl->cmd == XEN_DOMCTL_get_ext_vcpucontext )
{
- if ( v == current ) /* no vcpu_pause() */
+ if ( v == curr ) /* no vcpu_pause() */
break;
evc->size = sizeof(*evc);
@@ -796,7 +610,7 @@ long arch_do_domctl(
}
else
{
- if ( d == current->domain ) /* no domain_pause() */
+ if ( d == currd ) /* no domain_pause() */
break;
ret = -EINVAL;
if ( evc->size < offsetof(typeof(*evc), vmce) )
@@ -850,8 +664,8 @@ long arch_do_domctl(
domain_unpause(d);
}
+ break;
}
- break;
case XEN_DOMCTL_set_cpuid:
{
@@ -874,60 +688,80 @@ long arch_do_domctl(
(cpuid->input[1] == ctl->input[1])) )
break;
}
-
+
if ( i < MAX_CPUID_INPUT )
*cpuid = *ctl;
else if ( unused )
*unused = *ctl;
else
ret = -ENOENT;
- }
- break;
- case XEN_DOMCTL_gettscinfo:
- {
- xen_guest_tsc_info_t info = { 0 };
+ if ( !ret )
+ {
+ switch ( ctl->input[0] )
+ {
+ case 0: {
+ union {
+ typeof(boot_cpu_data.x86_vendor_id) str;
+ struct {
+ uint32_t ebx, edx, ecx;
+ } reg;
+ } vendor_id = {
+ .reg = {
+ .ebx = ctl->ebx,
+ .edx = ctl->edx,
+ .ecx = ctl->ecx
+ }
+ };
- ret = -EINVAL;
- if ( d == current->domain ) /* no domain_pause() */
- break;
+ d->arch.x86_vendor = get_cpu_vendor(vendor_id.str, gcv_guest);
+ break;
+ }
+ case 1:
+ d->arch.x86 = (ctl->eax >> 8) & 0xf;
+ if ( d->arch.x86 == 0xf )
+ d->arch.x86 += (ctl->eax >> 20) & 0xff;
+ d->arch.x86_model = (ctl->eax >> 4) & 0xf;
+ if ( d->arch.x86 >= 0x6 )
+ d->arch.x86_model |= (ctl->eax >> 12) & 0xf0;
+ break;
+ }
+ }
+ break;
+ }
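Two details of the new bookkeeping are worth spelling out: leaf 0's
ebx/edx/ecx triple concatenates to the 12-byte vendor string (e.g.
"GenuineIntel"), and leaf 1's eax packs family and model with extension
fields. A worked example with a hypothetical eax value:

    /*
     * eax = 0x000306c3:
     *   family = (eax >> 8) & 0xf    = 0x6
     *   model  = (eax >> 4) & 0xf    = 0xc
     *   family >= 0x6, so fold in the extended model bits:
     *   model |= (eax >> 12) & 0xf0  -> model = 0x3c
     * Only family 0xf additionally adds the extended family field,
     * (eax >> 20) & 0xff.
     */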
- domain_pause(d);
- tsc_get_info(d, &info.tsc_mode,
- &info.elapsed_nsec,
- &info.gtsc_khz,
- &info.incarnation);
- if ( copy_to_guest(domctl->u.tsc_info.out_info, &info, 1) )
- ret = -EFAULT;
+ case XEN_DOMCTL_gettscinfo:
+ if ( d == currd ) /* no domain_pause() */
+ ret = -EINVAL;
else
- ret = 0;
- domain_unpause(d);
- }
- break;
+ {
+ domain_pause(d);
+ tsc_get_info(d, &domctl->u.tsc_info.tsc_mode,
+ &domctl->u.tsc_info.elapsed_nsec,
+ &domctl->u.tsc_info.gtsc_khz,
+ &domctl->u.tsc_info.incarnation);
+ domain_unpause(d);
+ copyback = 1;
+ }
+ break;
case XEN_DOMCTL_settscinfo:
- {
- ret = -EINVAL;
- if ( d == current->domain ) /* no domain_pause() */
- break;
-
- domain_pause(d);
- tsc_set_info(d, domctl->u.tsc_info.info.tsc_mode,
- domctl->u.tsc_info.info.elapsed_nsec,
- domctl->u.tsc_info.info.gtsc_khz,
- domctl->u.tsc_info.info.incarnation);
- domain_unpause(d);
-
- ret = 0;
- }
- break;
+ if ( d == currd ) /* no domain_pause() */
+ ret = -EINVAL;
+ else
+ {
+ domain_pause(d);
+ tsc_set_info(d, domctl->u.tsc_info.tsc_mode,
+ domctl->u.tsc_info.elapsed_nsec,
+ domctl->u.tsc_info.gtsc_khz,
+ domctl->u.tsc_info.incarnation);
+ domain_unpause(d);
+ }
+ break;
case XEN_DOMCTL_suppress_spurious_page_faults:
- {
d->arch.suppress_spurious_page_faults = 1;
- ret = 0;
- }
- break;
+ break;
case XEN_DOMCTL_debug_op:
{
@@ -939,23 +773,20 @@ long arch_do_domctl(
break;
ret = -EINVAL;
- if ( !is_hvm_domain(d))
+ if ( (v == curr) || /* no vcpu_pause() */
+ !is_hvm_domain(d) )
break;
ret = hvm_debug_op(v, domctl->u.debug_op.op);
+ break;
}
- break;
case XEN_DOMCTL_gdbsx_guestmemio:
- {
- domctl->u.gdbsx_guest_memio.remain =
- domctl->u.gdbsx_guest_memio.len;
-
+ domctl->u.gdbsx_guest_memio.remain = domctl->u.gdbsx_guest_memio.len;
ret = gdbsx_guest_mem_io(domctl->domain, &domctl->u.gdbsx_guest_memio);
if ( !ret )
copyback = 1;
- }
- break;
+ break;
case XEN_DOMCTL_gdbsx_pausevcpu:
{
@@ -969,8 +800,8 @@ long arch_do_domctl(
(v = d->vcpu[domctl->u.gdbsx_pauseunp_vcpu.vcpu]) == NULL )
break;
ret = vcpu_pause_by_systemcontroller(v);
+ break;
}
- break;
case XEN_DOMCTL_gdbsx_unpausevcpu:
{
@@ -987,9 +818,9 @@ long arch_do_domctl(
if ( ret == -EINVAL )
printk(XENLOG_G_WARNING
"WARN: d%d attempting to unpause %pv which is not paused\n",
- current->domain->domain_id, v);
+ currd->domain_id, v);
+ break;
}
- break;
case XEN_DOMCTL_gdbsx_domstatus:
{
@@ -1011,29 +842,26 @@ long arch_do_domctl(
}
}
}
- ret = 0;
copyback = 1;
+ break;
}
- break;
case XEN_DOMCTL_setvcpuextstate:
case XEN_DOMCTL_getvcpuextstate:
{
- struct xen_domctl_vcpuextstate *evc;
+ struct xen_domctl_vcpuextstate *evc = &domctl->u.vcpuextstate;
struct vcpu *v;
uint32_t offset = 0;
#define PV_XSAVE_SIZE(xcr0) (2 * sizeof(uint64_t) + xstate_ctxt_size(xcr0))
- evc = &domctl->u.vcpuextstate;
-
ret = -ESRCH;
if ( (evc->vcpu >= d->max_vcpus) ||
((v = d->vcpu[evc->vcpu]) == NULL) )
goto vcpuextstate_out;
ret = -EINVAL;
- if ( v == current ) /* no vcpu_pause() */
+ if ( v == curr ) /* no vcpu_pause() */
goto vcpuextstate_out;
if ( domctl->cmd == XEN_DOMCTL_getvcpuextstate )
@@ -1139,31 +967,26 @@ long arch_do_domctl(
vcpuextstate_out:
if ( domctl->cmd == XEN_DOMCTL_getvcpuextstate )
copyback = 1;
+ break;
}
- break;
case XEN_DOMCTL_mem_sharing_op:
- {
ret = mem_sharing_domctl(d, &domctl->u.mem_sharing_op);
- }
- break;
+ break;
#if P2M_AUDIT
case XEN_DOMCTL_audit_p2m:
- {
- if ( d == current->domain )
- {
+ if ( d == currd )
ret = -EPERM;
- break;
+ else
+ {
+ audit_p2m(d,
+ &domctl->u.audit_p2m.orphans,
+ &domctl->u.audit_p2m.m2p_bad,
+ &domctl->u.audit_p2m.p2m_bad);
+ copyback = 1;
}
-
- audit_p2m(d,
- &domctl->u.audit_p2m.orphans,
- &domctl->u.audit_p2m.m2p_bad,
- &domctl->u.audit_p2m.p2m_bad);
- copyback = 1;
- }
- break;
+ break;
#endif /* P2M_AUDIT */
case XEN_DOMCTL_set_broken_page_p2m:
@@ -1178,8 +1001,8 @@ long arch_do_domctl(
ret = p2m_change_type_one(d, pfn, pt, p2m_ram_broken);
put_gfn(d, pfn);
+ break;
}
- break;
case XEN_DOMCTL_get_vcpu_msrs:
case XEN_DOMCTL_set_vcpu_msrs:
@@ -1195,7 +1018,7 @@ long arch_do_domctl(
break;
ret = -EINVAL;
- if ( (v == current) || /* no vcpu_pause() */
+ if ( (v == curr) || /* no vcpu_pause() */
!is_pv_domain(d) )
break;
@@ -1305,8 +1128,8 @@ long arch_do_domctl(
copyback = 1;
}
}
+ break;
}
- break;
case XEN_DOMCTL_psr_cmt_op:
if ( !psr_cmt_enabled() )
@@ -1320,22 +1143,45 @@ long arch_do_domctl(
case XEN_DOMCTL_PSR_CMT_OP_ATTACH:
ret = psr_alloc_rmid(d);
break;
+
case XEN_DOMCTL_PSR_CMT_OP_DETACH:
if ( d->arch.psr_rmid > 0 )
psr_free_rmid(d);
else
ret = -ENOENT;
break;
+
case XEN_DOMCTL_PSR_CMT_OP_QUERY_RMID:
domctl->u.psr_cmt_op.data = d->arch.psr_rmid;
copyback = 1;
break;
+
default:
ret = -ENOSYS;
break;
}
break;
+ case XEN_DOMCTL_psr_cat_op:
+ switch ( domctl->u.psr_cat_op.cmd )
+ {
+ case XEN_DOMCTL_PSR_CAT_OP_SET_L3_CBM:
+ ret = psr_set_l3_cbm(d, domctl->u.psr_cat_op.target,
+ domctl->u.psr_cat_op.data);
+ break;
+
+ case XEN_DOMCTL_PSR_CAT_OP_GET_L3_CBM:
+ ret = psr_get_l3_cbm(d, domctl->u.psr_cat_op.target,
+ &domctl->u.psr_cat_op.data);
+ copyback = 1;
+ break;
+
+ default:
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ break;
+
default:
ret = iommu_do_domctl(domctl, d, u_domctl);
break;
@@ -1356,10 +1202,11 @@ CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt);
void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
{
unsigned int i;
- bool_t compat = is_pv_32on64_domain(v->domain);
+ const struct domain *d = v->domain;
+ bool_t compat = is_pv_32bit_domain(d);
#define c(fld) (!compat ? (c.nat->fld) : (c.cmp->fld))
- if ( !is_pv_vcpu(v) )
+ if ( !is_pv_domain(d) )
memset(c.nat, 0, sizeof(*c.nat));
memcpy(&c.nat->fpu_ctxt, v->arch.fpu_ctxt, sizeof(c.nat->fpu_ctxt));
c(flags = v->arch.vgc_flags & ~(VGCF_i387_valid|VGCF_in_kernel));
@@ -1370,22 +1217,25 @@ void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
if ( !compat )
{
memcpy(&c.nat->user_regs, &v->arch.user_regs, sizeof(c.nat->user_regs));
- if ( is_pv_vcpu(v) )
+ if ( is_pv_domain(d) )
memcpy(c.nat->trap_ctxt, v->arch.pv_vcpu.trap_ctxt,
sizeof(c.nat->trap_ctxt));
}
else
{
XLAT_cpu_user_regs(&c.cmp->user_regs, &v->arch.user_regs);
- for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i )
- XLAT_trap_info(c.cmp->trap_ctxt + i,
- v->arch.pv_vcpu.trap_ctxt + i);
+ if ( is_pv_domain(d) )
+ {
+ for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i )
+ XLAT_trap_info(c.cmp->trap_ctxt + i,
+ v->arch.pv_vcpu.trap_ctxt + i);
+ }
}
for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
c(debugreg[i] = v->arch.debugreg[i]);
- if ( has_hvm_container_vcpu(v) )
+ if ( has_hvm_container_domain(d) )
{
struct segment_register sreg;
@@ -1446,13 +1296,12 @@ void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
c(event_callback_cs = v->arch.pv_vcpu.event_callback_cs);
c(failsafe_callback_cs = v->arch.pv_vcpu.failsafe_callback_cs);
}
- c(vm_assist = v->arch.pv_vcpu.vm_assist);
/* IOPL privileges are virtualised: merge back into returned eflags. */
BUG_ON((c(user_regs.eflags) & X86_EFLAGS_IOPL) != 0);
c(user_regs.eflags |= v->arch.pv_vcpu.iopl << 12);
- if ( !is_pv_32on64_domain(v->domain) )
+ if ( !compat )
{
c.nat->ctrlreg[3] = xen_pfn_to_cr3(
pagetable_get_pfn(v->arch.guest_table));
@@ -1467,7 +1316,7 @@ void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
else
{
const l4_pgentry_t *l4e =
- map_domain_page(pagetable_get_pfn(v->arch.guest_table));
+ map_domain_page(_mfn(pagetable_get_pfn(v->arch.guest_table)));
c.cmp->ctrlreg[3] = compat_pfn_to_cr3(l4e_get_pfn(*l4e));
unmap_domain_page(l4e);
@@ -1481,7 +1330,7 @@ void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
c(flags |= VGCF_in_kernel);
}
- c(vm_assist = v->domain->vm_assist);
+ c(vm_assist = d->vm_assist);
#undef c
}
diff --git a/xen/arch/x86/e820.c b/xen/arch/x86/e820.c
index bf84bae..3c64f19 100644
--- a/xen/arch/x86/e820.c
+++ b/xen/arch/x86/e820.c
@@ -74,20 +74,18 @@ static void __init add_memory_region(unsigned long long start,
{
int x;
- /*if (!efi_enabled)*/ {
- x = e820.nr_map;
+ x = e820.nr_map;
- if (x == E820MAX) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
- return;
- }
-
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
+ if (x == E820MAX) {
+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ return;
}
-} /* add_memory_region */
+
+ e820.map[x].addr = start;
+ e820.map[x].size = size;
+ e820.map[x].type = type;
+ e820.nr_map++;
+}
static void __init print_e820_memory_map(struct e820entry *map, unsigned int entries)
{
@@ -349,13 +347,6 @@ static unsigned long __init find_max_pfn(void)
int i;
unsigned long max_pfn = 0;
-#if 0
- if (efi_enabled) {
- efi_memmap_walk(efi_find_max_pfn, &max_pfn);
- return;
- }
-#endif
-
for (i = 0; i < e820.nr_map; i++) {
unsigned long start, end;
/* RAM? */
@@ -504,11 +495,19 @@ static uint64_t __init mtrr_top_of_ram(void)
static void __init reserve_dmi_region(void)
{
- u32 base, len;
- if ( (dmi_get_table(&base, &len) == 0) && ((base + len) > base) &&
- reserve_e820_ram(&e820, base, base + len) )
- printk("WARNING: DMI table located in E820 RAM %08x-%08x. Fixed.\n",
- base, base+len);
+ for ( ; ; )
+ {
+ paddr_t base;
+ u32 len;
+ const char *what = dmi_get_table(&base, &len);
+
+ if ( !what )
+ break;
+ if ( ((base + len) > base) &&
+ reserve_e820_ram(&e820, base, base + len) )
+ printk("WARNING: %s table located in E820 RAM %"PRIpaddr"-%"PRIpaddr". Fixed.\n",
+ what, base, base + len);
+ }
}
static void __init machine_specific_memory_setup(
diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h
index 3a3b4fe..4c7f383 100644
--- a/xen/arch/x86/efi/efi-boot.h
+++ b/xen/arch/x86/efi/efi-boot.h
@@ -148,12 +148,16 @@ static void __init efi_arch_process_memory_map(EFI_SYSTEM_TABLE *SystemTable,
switch ( desc->Type )
{
- default:
- type = E820_RESERVED;
- break;
- case EfiConventionalMemory:
case EfiBootServicesCode:
case EfiBootServicesData:
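+ /*
+ * Note the deliberate jump into the if-body below: memory types
+ * without a case label of their own enter at the "default" label
+ * and are always reserved, while boot-services regions are
+ * reserved only when map_bs is set (otherwise they fall through
+ * and are treated like conventional memory).
+ */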
+ if ( map_bs )
+ {
+ default:
+ type = E820_RESERVED;
+ break;
+ }
+ /* fall through */
+ case EfiConventionalMemory:
if ( !trampoline_phys && desc->PhysicalStart + len <= 0x100000 &&
len >= cfg.size && desc->PhysicalStart + len > cfg.addr )
cfg.addr = (desc->PhysicalStart + len - cfg.size) & PAGE_MASK;
@@ -190,10 +194,10 @@ static void __init efi_arch_process_memory_map(EFI_SYSTEM_TABLE *SystemTable,
}
-static void *__init efi_arch_allocate_mmap_buffer(UINTN *map_size)
+static void *__init efi_arch_allocate_mmap_buffer(UINTN map_size)
{
place_string(&mbi.mem_upper, NULL);
- mbi.mem_upper -= *map_size;
+ mbi.mem_upper -= map_size;
mbi.mem_upper &= -__alignof__(EFI_MEMORY_DESCRIPTOR);
if ( mbi.mem_upper < xen_phys_start )
return NULL;
@@ -614,6 +618,13 @@ static void __init efi_arch_blexit(void)
efi_bs->FreePages(ucode.addr, PFN_UP(ucode.size));
}
+static void __init efi_arch_halt(void)
+{
+ local_irq_disable();
+ for ( ; ; )
+ halt();
+}
+
static void __init efi_arch_load_addr_check(EFI_LOADED_IMAGE *loaded_image)
{
xen_phys_start = (UINTN)loaded_image->ImageBase;
@@ -629,6 +640,8 @@ static bool_t __init efi_arch_use_config_file(EFI_SYSTEM_TABLE *SystemTable)
return 1; /* x86 always uses a config file */
}
+static void efi_arch_flush_dcache_area(const void *vaddr, UINTN size) { }
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/x86/efi/runtime.h b/xen/arch/x86/efi/runtime.h
index 0eb2fb9..d9eb8f5 100644
--- a/xen/arch/x86/efi/runtime.h
+++ b/xen/arch/x86/efi/runtime.h
@@ -1,5 +1,12 @@
+#include <asm/atomic.h>
#include <asm/mc146818rtc.h>
#ifndef COMPAT
l4_pgentry_t *__read_mostly efi_l4_pgtable;
+
+void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t l4e)
+{
+ if ( efi_l4_pgtable )
+ l4e_write(efi_l4_pgtable + l4idx, l4e);
+}
#endif
diff --git a/xen/arch/x86/efi/stub.c b/xen/arch/x86/efi/stub.c
index b8f49f8..07c2bd0 100644
--- a/xen/arch/x86/efi/stub.c
+++ b/xen/arch/x86/efi/stub.c
@@ -2,6 +2,7 @@
#include <xen/errno.h>
#include <xen/init.h>
#include <xen/lib.h>
+#include <asm/page.h>
#ifndef efi_enabled
const bool_t efi_enabled = 0;
@@ -9,7 +10,9 @@ const bool_t efi_enabled = 0;
void __init efi_init_memory(void) { }
-paddr_t efi_rs_page_table(void)
+void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t l4e) { }
+
+bool_t efi_rs_using_pgtables(void)
{
BUG();
return 0;
diff --git a/xen/arch/x86/gdbstub.c b/xen/arch/x86/gdbstub.c
index 2390a1b..2a39189 100644
--- a/xen/arch/x86/gdbstub.c
+++ b/xen/arch/x86/gdbstub.c
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <asm/debugger.h>
diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
index 9a3cfd9..d894a98 100644
--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c
index 0b13f52..f78054d 100644
--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -52,6 +52,7 @@ DEFINE_PER_CPU(struct hpet_event_channel *, cpu_bc_channel);
unsigned long __initdata hpet_address;
u8 __initdata hpet_blockid;
+u8 __initdata hpet_flags;
/*
* force_hpet_broadcast: by default legacy hpet broadcast will be stopped
@@ -157,7 +158,7 @@ static void evt_do_broadcast(cpumask_t *mask)
{
unsigned int cpu = smp_processor_id();
- if ( cpumask_test_and_clear_cpu(cpu, mask) )
+ if ( __cpumask_test_and_clear_cpu(cpu, mask) )
raise_softirq(TIMER_SOFTIRQ);
cpuidle_wakeup_mwait(mask);
@@ -196,7 +197,7 @@ again:
continue;
if ( deadline <= now )
- cpumask_set_cpu(cpu, &mask);
+ __cpumask_set_cpu(cpu, &mask);
else if ( deadline < next_event )
next_event = deadline;
}
@@ -240,7 +241,7 @@ static void hpet_msi_unmask(struct irq_desc *desc)
cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
cfg |= HPET_TN_ENABLE;
hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
- ch->msi.msi_attrib.masked = 0;
+ ch->msi.msi_attrib.host_masked = 0;
}
static void hpet_msi_mask(struct irq_desc *desc)
@@ -251,7 +252,7 @@ static void hpet_msi_mask(struct irq_desc *desc)
cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
cfg &= ~HPET_TN_ENABLE;
hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
- ch->msi.msi_attrib.masked = 1;
+ ch->msi.msi_attrib.host_masked = 1;
}
static int hpet_msi_write(struct hpet_event_channel *ch, struct msi_msg *msg)
diff --git a/xen/arch/x86/hvm/Makefile b/xen/arch/x86/hvm/Makefile
index eea5555..794e793 100644
--- a/xen/arch/x86/hvm/Makefile
+++ b/xen/arch/x86/hvm/Makefile
@@ -3,6 +3,7 @@ subdir-y += vmx
obj-y += asid.o
obj-y += emulate.o
+obj-y += event.o
obj-y += hpet.o
obj-y += hvm.o
obj-y += i8254.o
@@ -22,4 +23,3 @@ obj-y += vlapic.o
obj-y += vmsi.o
obj-y += vpic.o
obj-y += vpt.o
-obj-y += vpmu.o
\ No newline at end of file
diff --git a/xen/arch/x86/hvm/asid.c b/xen/arch/x86/hvm/asid.c
index 21ec492..24f2f00 100644
--- a/xen/arch/x86/hvm/asid.c
+++ b/xen/arch/x86/hvm/asid.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 14c1847..30acb78 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -23,7 +23,7 @@
#include <asm/hvm/support.h>
#include <asm/hvm/svm/svm.h>
-static void hvmtrace_io_assist(int is_mmio, ioreq_t *p)
+static void hvmtrace_io_assist(const ioreq_t *p)
{
unsigned int size, event;
unsigned char buffer[12];
@@ -31,7 +31,7 @@ static void hvmtrace_io_assist(int is_mmio, ioreq_t *p)
if ( likely(!tb_init_done) )
return;
- if ( is_mmio )
+ if ( p->type == IOREQ_TYPE_COPY )
event = p->dir ? TRC_HVM_IOMEM_READ : TRC_HVM_IOMEM_WRITE;
else
event = p->dir ? TRC_HVM_IOPORT_READ : TRC_HVM_IOPORT_WRITE;
@@ -50,42 +50,69 @@ static void hvmtrace_io_assist(int is_mmio, ioreq_t *p)
trace_var(event, 0/*!cycles*/, size, buffer);
}
+static int null_read(const struct hvm_io_handler *io_handler,
+ uint64_t addr,
+ uint32_t size,
+ uint64_t *data)
+{
+ *data = ~0ul;
+ return X86EMUL_OKAY;
+}
+
+static int null_write(const struct hvm_io_handler *handler,
+ uint64_t addr,
+ uint32_t size,
+ uint64_t data)
+{
+ return X86EMUL_OKAY;
+}
+
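+/*
+ * Replace the data an emulated access would otherwise use with data
+ * supplied by a vm_event responder: copy up to the size the responder
+ * provided and zero-fill any remainder of the requested buffer.
+ */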
+static int set_context_data(void *buffer, unsigned int size)
+{
+ struct vcpu *curr = current;
+
+ if ( curr->arch.vm_event.emul_read_data )
+ {
+ unsigned int safe_size =
+ min(size, curr->arch.vm_event.emul_read_data->size);
+
+ memcpy(buffer, curr->arch.vm_event.emul_read_data->data, safe_size);
+ memset(buffer + safe_size, 0, size - safe_size);
+ return X86EMUL_OKAY;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
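+/*
+ * These null ops complete accesses that no device model claims: reads
+ * return all-ones and writes are silently discarded.
+ */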
+static const struct hvm_io_ops null_ops = {
+ .read = null_read,
+ .write = null_write
+};
+
+static const struct hvm_io_handler null_handler = {
+ .ops = &null_ops
+};
+
static int hvmemul_do_io(
- int is_mmio, paddr_t addr, unsigned long *reps, int size,
- paddr_t ram_gpa, int dir, int df, void *p_data)
+ bool_t is_mmio, paddr_t addr, unsigned long reps, unsigned int size,
+ uint8_t dir, bool_t df, bool_t data_is_addr, uintptr_t data)
{
struct vcpu *curr = current;
- struct hvm_vcpu_io *vio;
+ struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
ioreq_t p = {
.type = is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO,
.addr = addr,
.size = size,
+ .count = reps,
.dir = dir,
.df = df,
- .data = ram_gpa,
- .data_is_ptr = (p_data == NULL),
+ .data = data,
+ .data_is_ptr = data_is_addr, /* ioreq_t field name is misleading */
+ .state = STATE_IOREQ_READY,
};
- unsigned long ram_gfn = paddr_to_pfn(ram_gpa);
- p2m_type_t p2mt;
- struct page_info *ram_page;
+ void *p_data = (void *)data;
int rc;
- /* Check for paged out page */
- ram_page = get_page_from_gfn(curr->domain, ram_gfn, &p2mt, P2M_UNSHARE);
- if ( p2m_is_paging(p2mt) )
- {
- if ( ram_page )
- put_page(ram_page);
- p2m_mem_paging_populate(curr->domain, ram_gfn);
- return X86EMUL_RETRY;
- }
- if ( p2m_is_shared(p2mt) )
- {
- if ( ram_page )
- put_page(ram_page);
- return X86EMUL_RETRY;
- }
-
/*
* Weird-sized accesses have undefined behaviour: we discard writes
* and read all-ones.
@@ -93,208 +120,283 @@ static int hvmemul_do_io(
if ( unlikely((size > sizeof(long)) || (size & (size - 1))) )
{
gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size);
- ASSERT(p_data != NULL); /* cannot happen with a REP prefix */
- if ( dir == IOREQ_READ )
- memset(p_data, ~0, size);
- if ( ram_page )
- put_page(ram_page);
return X86EMUL_UNHANDLEABLE;
}
- if ( !p.data_is_ptr && (dir == IOREQ_WRITE) )
- {
- memcpy(&p.data, p_data, size);
- p_data = NULL;
- }
-
- vio = &curr->arch.hvm_vcpu.hvm_io;
-
- if ( is_mmio && !p.data_is_ptr )
+ switch ( vio->io_req.state )
{
- /* Part of a multi-cycle read or write? */
- if ( dir == IOREQ_WRITE )
- {
- paddr_t pa = vio->mmio_large_write_pa;
- unsigned int bytes = vio->mmio_large_write_bytes;
- if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) )
- {
- if ( ram_page )
- put_page(ram_page);
- return X86EMUL_OKAY;
- }
- }
- else
- {
- paddr_t pa = vio->mmio_large_read_pa;
- unsigned int bytes = vio->mmio_large_read_bytes;
- if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) )
- {
- memcpy(p_data, &vio->mmio_large_read[addr - pa],
- size);
- if ( ram_page )
- put_page(ram_page);
- return X86EMUL_OKAY;
- }
- }
- }
-
- switch ( vio->io_state )
- {
- case HVMIO_none:
+ case STATE_IOREQ_NONE:
break;
- case HVMIO_completed:
- vio->io_state = HVMIO_none;
- if ( p_data == NULL )
- {
- if ( ram_page )
- put_page(ram_page);
+ case STATE_IORESP_READY:
+ vio->io_req.state = STATE_IOREQ_NONE;
+ p = vio->io_req;
+
+ /* Verify the emulation request has been correctly re-issued */
+ if ( (p.type != (is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO)) ||
+ (p.addr != addr) ||
+ (p.size != size) ||
+ (p.count != reps) ||
+ (p.dir != dir) ||
+ (p.df != df) ||
+ (p.data_is_ptr != data_is_addr) )
+ domain_crash(curr->domain);
+
+ if ( data_is_addr )
return X86EMUL_UNHANDLEABLE;
- }
goto finish_access;
- case HVMIO_dispatched:
- /* May have to wait for previous cycle of a multi-write to complete. */
- if ( is_mmio && !p.data_is_ptr && (dir == IOREQ_WRITE) &&
- (addr == (vio->mmio_large_write_pa +
- vio->mmio_large_write_bytes)) )
- {
- if ( ram_page )
- put_page(ram_page);
- return X86EMUL_RETRY;
- }
default:
- if ( ram_page )
- put_page(ram_page);
return X86EMUL_UNHANDLEABLE;
}
- if ( hvm_io_pending(curr) )
+ if ( dir == IOREQ_WRITE )
{
- gdprintk(XENLOG_WARNING, "WARNING: io already pending?\n");
- if ( ram_page )
- put_page(ram_page);
- return X86EMUL_UNHANDLEABLE;
- }
-
- vio->io_state =
- (p_data == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
- vio->io_size = size;
-
- /*
- * When retrying a repeated string instruction, force exit to guest after
- * completion of the retried iteration to allow handling of interrupts.
- */
- if ( vio->mmio_retrying )
- *reps = 1;
+ if ( !data_is_addr )
+ memcpy(&p.data, p_data, size);
- p.count = *reps;
+ hvmtrace_io_assist(&p);
+ }
- if ( dir == IOREQ_WRITE )
- hvmtrace_io_assist(is_mmio, &p);
+ vio->io_req = p;
- if ( is_mmio )
- {
- rc = hvm_mmio_intercept(&p);
- if ( rc == X86EMUL_UNHANDLEABLE )
- rc = hvm_buffered_io_intercept(&p);
- }
- else
- {
- rc = hvm_portio_intercept(&p);
- }
+ rc = hvm_io_intercept(&p);
switch ( rc )
{
case X86EMUL_OKAY:
- case X86EMUL_RETRY:
- *reps = p.count;
- p.state = STATE_IORESP_READY;
- if ( !vio->mmio_retry )
- {
- hvm_io_assist(&p);
- vio->io_state = HVMIO_none;
- }
- else
- /* Defer hvm_io_assist() invocation to hvm_do_resume(). */
- vio->io_state = HVMIO_handle_mmio_awaiting_completion;
+ vio->io_req.state = STATE_IOREQ_NONE;
break;
case X86EMUL_UNHANDLEABLE:
- /* If there is no backing DM, just ignore accesses */
- if ( !hvm_has_dm(curr->domain) )
+ {
+ struct hvm_ioreq_server *s =
+ hvm_select_ioreq_server(curr->domain, &p);
+
+ /* If there is no suitable backing DM, just ignore accesses */
+ if ( !s )
{
- rc = X86EMUL_OKAY;
- vio->io_state = HVMIO_none;
+ rc = hvm_process_io_intercept(&null_handler, &p);
+ vio->io_req.state = STATE_IOREQ_NONE;
}
else
{
- rc = X86EMUL_RETRY;
- if ( !hvm_send_assist_req(&p) )
- vio->io_state = HVMIO_none;
- else if ( p_data == NULL )
+ rc = hvm_send_ioreq(s, &p, 0);
+ if ( rc != X86EMUL_RETRY || curr->domain->is_shutting_down )
+ vio->io_req.state = STATE_IOREQ_NONE;
+ else if ( data_is_addr )
rc = X86EMUL_OKAY;
}
break;
+ }
default:
BUG();
}
if ( rc != X86EMUL_OKAY )
- {
- if ( ram_page )
- put_page(ram_page);
return rc;
- }
finish_access:
if ( dir == IOREQ_READ )
- hvmtrace_io_assist(is_mmio, &p);
+ {
+ hvmtrace_io_assist(&p);
+
+ if ( !data_is_addr )
+ memcpy(p_data, &p.data, size);
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int hvmemul_do_io_buffer(
+ bool_t is_mmio, paddr_t addr, unsigned long *reps, unsigned int size,
+ uint8_t dir, bool_t df, void *buffer)
+{
+ int rc;
+
+ BUG_ON(buffer == NULL);
+
+ rc = hvmemul_do_io(is_mmio, addr, *reps, size, dir, df, 0,
+ (uintptr_t)buffer);
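+ /*
+ * If no handler claimed a read, hand back all-ones (matching
+ * null_read() above) rather than leaving the caller's buffer
+ * with stale contents; the failure is still reported via rc.
+ */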
+ if ( rc == X86EMUL_UNHANDLEABLE && dir == IOREQ_READ )
+ memset(buffer, 0xff, size);
+
+ return rc;
+}
+
+static int hvmemul_acquire_page(unsigned long gmfn, struct page_info **page)
+{
+ struct domain *curr_d = current->domain;
+ p2m_type_t p2mt;
+
+ *page = get_page_from_gfn(curr_d, gmfn, &p2mt, P2M_UNSHARE);
+
+ if ( *page == NULL )
+ return X86EMUL_UNHANDLEABLE;
- if ( p_data != NULL )
- memcpy(p_data, &vio->io_data, size);
+ if ( p2m_is_paging(p2mt) )
+ {
+ put_page(*page);
+ p2m_mem_paging_populate(curr_d, gmfn);
+ return X86EMUL_RETRY;
+ }
- if ( is_mmio && !p.data_is_ptr )
+ if ( p2m_is_shared(p2mt) )
{
- /* Part of a multi-cycle read or write? */
- if ( dir == IOREQ_WRITE )
- {
- paddr_t pa = vio->mmio_large_write_pa;
- unsigned int bytes = vio->mmio_large_write_bytes;
- if ( bytes == 0 )
- pa = vio->mmio_large_write_pa = addr;
- if ( addr == (pa + bytes) )
- vio->mmio_large_write_bytes += size;
- }
- else
- {
- paddr_t pa = vio->mmio_large_read_pa;
- unsigned int bytes = vio->mmio_large_read_bytes;
- if ( bytes == 0 )
- pa = vio->mmio_large_read_pa = addr;
- if ( (addr == (pa + bytes)) &&
- ((bytes + size) <= sizeof(vio->mmio_large_read)) )
- {
- memcpy(&vio->mmio_large_read[bytes], p_data, size);
- vio->mmio_large_read_bytes += size;
- }
- }
+ put_page(*page);
+ return X86EMUL_RETRY;
+ }
+
+ /* This code should not be reached if the gmfn is not RAM */
+ if ( p2m_is_mmio(p2mt) )
+ {
+ domain_crash(curr_d);
+
+ put_page(*page);
+ return X86EMUL_UNHANDLEABLE;
}
- if ( ram_page )
- put_page(ram_page);
return X86EMUL_OKAY;
}
-int hvmemul_do_pio(
- unsigned long port, unsigned long *reps, int size,
- paddr_t ram_gpa, int dir, int df, void *p_data)
+static inline void hvmemul_release_page(struct page_info *page)
{
- return hvmemul_do_io(0, port, reps, size, ram_gpa, dir, df, p_data);
+ put_page(page);
}
-static int hvmemul_do_mmio(
- paddr_t gpa, unsigned long *reps, int size,
- paddr_t ram_gpa, int dir, int df, void *p_data)
+static int hvmemul_do_io_addr(
+ bool_t is_mmio, paddr_t addr, unsigned long *reps,
+ unsigned int size, uint8_t dir, bool_t df, paddr_t ram_gpa)
+{
+ struct vcpu *v = current;
+ unsigned long ram_gmfn = paddr_to_pfn(ram_gpa);
+ unsigned int page_off = ram_gpa & (PAGE_SIZE - 1);
+ struct page_info *ram_page[2];
+ unsigned int nr_pages = 0;
+ unsigned long count;
+ int rc;
+
+ rc = hvmemul_acquire_page(ram_gmfn, &ram_page[nr_pages]);
+ if ( rc != X86EMUL_OKAY )
+ goto out;
+
+ nr_pages++;
+
+ /* Determine how many reps will fit within this page */
+ count = min_t(unsigned long,
+ *reps,
+ df ?
+ ((page_off + size - 1) & ~PAGE_MASK) / size :
+ (PAGE_SIZE - page_off) / size);
+
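+ /*
+ * Example (4k pages, forward op): with page_off 0xffc and size 4
+ * the clamp yields count == 1, i.e. only the final rep on this
+ * page; with page_off 0xffe and size 4 it yields 0, meaning even
+ * the first access straddles the page boundary and is handled by
+ * the two-page, single-rep path below.
+ */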
+ if ( count == 0 )
+ {
+ /*
+ * This access must span two pages, so grab a reference to
+ * the next page and do a single rep.
+ * It is safe to assume multiple pages are physically
+ * contiguous at this point as hvmemul_linear_to_phys() will
+ * ensure this is the case.
+ */
+ rc = hvmemul_acquire_page(df ? ram_gmfn - 1 : ram_gmfn + 1,
+ &ram_page[nr_pages]);
+ if ( rc != X86EMUL_OKAY )
+ goto out;
+
+ nr_pages++;
+ count = 1;
+ }
+
+ rc = hvmemul_do_io(is_mmio, addr, count, size, dir, df, 1,
+ ram_gpa);
+ if ( rc == X86EMUL_OKAY )
+ {
+ v->arch.hvm_vcpu.hvm_io.mmio_retry = (count < *reps);
+ *reps = count;
+ }
+
+ out:
+ while ( nr_pages )
+ hvmemul_release_page(ram_page[--nr_pages]);
+
+ return rc;
+}
+
+/*
+ * Perform I/O between <port> and <buffer>. <dir> indicates the
+ * direction: IOREQ_READ means a read from <port> to <buffer> and
+ * IOREQ_WRITE means a write from <buffer> to <port>. Each access has
+ * width <size>.
+ */
+int hvmemul_do_pio_buffer(uint16_t port,
+ unsigned int size,
+ uint8_t dir,
+ void *buffer)
+{
+ unsigned long one_rep = 1;
+
+ return hvmemul_do_io_buffer(0, port, &one_rep, size, dir, 0, buffer);
+}
+
+/*
+ * Perform I/O between <port> and guest RAM starting at <ram_addr>.
+ * <dir> indicates the direction: IOREQ_READ means a read from <port> to
+ * RAM and IOREQ_WRITE means a write from RAM to <port>. Each access has
+ * width <size> and up to *<reps> accesses will be performed. If
+ * X86EMUL_OKAY is returned then <reps> will be updated with the number
+ * of accesses actually performed.
+ * Each access will be done to/from successive RAM addresses, increasing
+ * if <df> is 0 or decreasing if <df> is 1.
+ */
+static int hvmemul_do_pio_addr(uint16_t port,
+ unsigned long *reps,
+ unsigned int size,
+ uint8_t dir,
+ bool_t df,
+ paddr_t ram_addr)
{
- return hvmemul_do_io(1, gpa, reps, size, ram_gpa, dir, df, p_data);
+ return hvmemul_do_io_addr(0, port, reps, size, dir, df, ram_addr);
+}
+
+/*
+ * Perform I/O between MMIO space starting at <mmio_gpa> and <buffer>.
+ * <dir> indicates the direction: IOREQ_READ means a read from MMIO to
+ * <buffer> and IOREQ_WRITE means a write from <buffer> to MMIO. Each
+ * access has width <size> and up to *<reps> accesses will be performed.
+ * If X86EMUL_OKAY is returned then <reps> will be updated with the number
+ * of accesses actually performed.
+ * Each access will be done to/from successive MMIO addresses, increasing
+ * if <df> is 0 or decreasing if <df> is 1.
+ *
+ * NOTE: If *<reps> is greater than 1, each access will use the
+ * <buffer> pointer; there is no implicit iteration over a
+ * block of memory starting at <buffer>.
+ */
+static int hvmemul_do_mmio_buffer(paddr_t mmio_gpa,
+ unsigned long *reps,
+ unsigned int size,
+ uint8_t dir,
+ bool_t df,
+ void *buffer)
+{
+ return hvmemul_do_io_buffer(1, mmio_gpa, reps, size, dir, df, buffer);
+}
+
+/*
+ * Perform I/O between MMIO space starting at <mmio_gpa> and guest RAM
+ * starting at <ram_gpa>. <dir> indicates the direction: IOREQ_READ
+ * means a read from MMIO to RAM and IOREQ_WRITE means a write from RAM
+ * to MMIO. Each access has width <size> and up to *<reps> accesses will
+ * be performed. If X86EMUL_OKAY is returned then <reps> will be updated
+ * with the number of accesses actually performed.
+ * Each access will be done to/from successive RAM *and* MMIO addresses,
+ * increasing if <df> is 0 or decreasing if <df> is 1.
+ */
+static int hvmemul_do_mmio_addr(paddr_t mmio_gpa,
+ unsigned long *reps,
+ unsigned int size,
+ uint8_t dir,
+ bool_t df,
+ paddr_t ram_gpa)
+{
+ return hvmemul_do_io_addr(1, mmio_gpa, reps, size, dir, df, ram_gpa);
}
/*
@@ -407,11 +509,11 @@ static int hvmemul_virtual_to_linear(
* The chosen maximum is very conservative but it's what we use in
* hvmemul_linear_to_phys() so there is no point in using a larger value.
* If introspection has been enabled for this domain, *reps should be
- * at most 1, since optimization might otherwise cause a single mem_event
+ * at most 1, since optimization might otherwise cause a single vm_event
* being triggered for repeated writes to a whole page.
*/
*reps = min_t(unsigned long, *reps,
- unlikely(current->domain->arch.hvm_domain.introspection_enabled)
+ unlikely(current->domain->arch.mem_access_emulate_enabled)
? 1 : 4096);
reg = hvmemul_get_seg_reg(seg, hvmemul_ctxt);
@@ -454,6 +556,178 @@ static int hvmemul_virtual_to_linear(
return X86EMUL_EXCEPTION;
}
+static int hvmemul_phys_mmio_access(
+ struct hvm_mmio_cache *cache, paddr_t gpa, unsigned int size, uint8_t dir,
+ uint8_t *buffer, unsigned int offset)
+{
+ unsigned long one_rep = 1;
+ unsigned int chunk;
+ int rc = X86EMUL_OKAY;
+
+ /* Accesses must fall within a page. */
+ BUG_ON((gpa & ~PAGE_MASK) + size > PAGE_SIZE);
+
+ /*
+ * hvmemul_do_io() cannot handle non-power-of-2 accesses or
+ * accesses larger than sizeof(long), so choose the highest power
+ * of 2 not exceeding sizeof(long) as the 'chunk' size.
+ */
+ ASSERT(size != 0);
+ chunk = 1u << (fls(size) - 1);
+ if ( chunk > sizeof (long) )
+ chunk = sizeof (long);
+
+ for ( ;; )
+ {
+ /* Have we already done this chunk? */
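+ /*
+ * (A replayed write must match the data recorded on the first
+ * pass; divergence means the instruction's source operand changed
+ * across re-emulation, which is treated as a guest error.)
+ */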
+ if ( offset < cache->size )
+ {
+ ASSERT((offset + chunk) <= cache->size);
+
+ if ( dir == IOREQ_READ )
+ memcpy(&buffer[offset], &cache->buffer[offset], chunk);
+ else if ( memcmp(&buffer[offset], &cache->buffer[offset], chunk) != 0 )
+ domain_crash(current->domain);
+ }
+ else
+ {
+ ASSERT(offset == cache->size);
+
+ rc = hvmemul_do_mmio_buffer(gpa, &one_rep, chunk, dir, 0,
+ &buffer[offset]);
+ if ( rc != X86EMUL_OKAY )
+ break;
+
+ /* Note that we have now done this chunk. */
+ memcpy(&cache->buffer[offset], &buffer[offset], chunk);
+ cache->size += chunk;
+ }
+
+ /* Advance to the next chunk. */
+ gpa += chunk;
+ offset += chunk;
+ size -= chunk;
+
+ if ( size == 0 )
+ break;
+
+ /*
+ * If the chunk now exceeds the remaining size, choose the next
+ * lowest power of 2 that will fit.
+ */
+ while ( chunk > size )
+ chunk >>= 1;
+ }
+
+ return rc;
+}
+
+/*
+ * Multi-cycle MMIO handling is based upon the assumption that emulation
+ * of the same instruction will not access the same MMIO region more
+ * than once. Hence we can deal with re-emulation (for secondary or
+ * subsequent cycles) by looking up the result or previous I/O in a
+ * cache indexed by linear MMIO address.
+ */
+static struct hvm_mmio_cache *hvmemul_find_mmio_cache(
+ struct hvm_vcpu_io *vio, unsigned long gla, uint8_t dir)
+{
+ unsigned int i;
+ struct hvm_mmio_cache *cache;
+
+ for ( i = 0; i < vio->mmio_cache_count; i++ )
+ {
+ cache = &vio->mmio_cache[i];
+
+ if ( gla == cache->gla &&
+ dir == cache->dir )
+ return cache;
+ }
+
+ i = vio->mmio_cache_count++;
+ if ( i == ARRAY_SIZE(vio->mmio_cache) )
+ {
+ domain_crash(current->domain);
+ return NULL;
+ }
+
+ cache = &vio->mmio_cache[i];
+ memset(cache, 0, sizeof (*cache));
+
+ cache->gla = gla;
+ cache->dir = dir;
+
+ return cache;
+}
+
+static int hvmemul_linear_mmio_access(
+ unsigned long gla, unsigned int size, uint8_t dir, void *buffer,
+ uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt, bool_t known_gpfn)
+{
+ struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
+ unsigned long offset = gla & ~PAGE_MASK;
+ struct hvm_mmio_cache *cache = hvmemul_find_mmio_cache(vio, gla, dir);
+ unsigned int chunk, buffer_offset = 0;
+ paddr_t gpa;
+ unsigned long one_rep = 1;
+ int rc;
+
+ if ( cache == NULL )
+ return X86EMUL_UNHANDLEABLE;
+
+ chunk = min_t(unsigned int, size, PAGE_SIZE - offset);
+
+ if ( known_gpfn )
+ gpa = pfn_to_paddr(vio->mmio_gpfn) | offset;
+ else
+ {
+ rc = hvmemul_linear_to_phys(gla, &gpa, chunk, &one_rep, pfec,
+ hvmemul_ctxt);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+ }
+
+ for ( ;; )
+ {
+ rc = hvmemul_phys_mmio_access(cache, gpa, chunk, dir, buffer, buffer_offset);
+ if ( rc != X86EMUL_OKAY )
+ break;
+
+ gla += chunk;
+ buffer_offset += chunk;
+ size -= chunk;
+
+ if ( size == 0 )
+ break;
+
+ chunk = min_t(unsigned int, size, PAGE_SIZE);
+ rc = hvmemul_linear_to_phys(gla, &gpa, chunk, &one_rep, pfec,
+ hvmemul_ctxt);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+ }
+
+ return rc;
+}
+
+static inline int hvmemul_linear_mmio_read(
+ unsigned long gla, unsigned int size, void *buffer,
+ uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt,
+ bool_t translate)
+{
+ return hvmemul_linear_mmio_access(gla, size, IOREQ_READ, buffer,
+ pfec, hvmemul_ctxt, translate);
+}
+
+static inline int hvmemul_linear_mmio_write(
+ unsigned long gla, unsigned int size, void *buffer,
+ uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt,
+ bool_t translate)
+{
+ return hvmemul_linear_mmio_access(gla, size, IOREQ_WRITE, buffer,
+ pfec, hvmemul_ctxt, translate);
+}
+
static int __hvmemul_read(
enum x86_segment seg,
unsigned long offset,
@@ -464,50 +738,19 @@ static int __hvmemul_read(
{
struct vcpu *curr = current;
unsigned long addr, reps = 1;
- unsigned int off, chunk = min(bytes, 1U << LONG_BYTEORDER);
uint32_t pfec = PFEC_page_present;
struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
- paddr_t gpa;
int rc;
rc = hvmemul_virtual_to_linear(
seg, offset, bytes, &reps, access_type, hvmemul_ctxt, &addr);
if ( rc != X86EMUL_OKAY )
return rc;
- off = addr & (PAGE_SIZE - 1);
- /*
- * We only need to handle sizes actual instruction operands can have. All
- * such sizes are either powers of 2 or the sum of two powers of 2. Thus
- * picking as initial chunk size the largest power of 2 not greater than
- * the total size will always result in only power-of-2 size requests
- * issued to hvmemul_do_mmio() (hvmemul_do_io() rejects non-powers-of-2).
- */
- while ( chunk & (chunk - 1) )
- chunk &= chunk - 1;
- if ( off + bytes > PAGE_SIZE )
- while ( off & (chunk - 1) )
- chunk >>= 1;
-
if ( ((access_type != hvm_access_insn_fetch
? vio->mmio_access.read_access
: vio->mmio_access.insn_fetch)) &&
(vio->mmio_gva == (addr & PAGE_MASK)) )
- {
- gpa = (((paddr_t)vio->mmio_gpfn << PAGE_SHIFT) | off);
- while ( (off + chunk) <= PAGE_SIZE )
- {
- rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_READ, 0, p_data);
- if ( rc != X86EMUL_OKAY || bytes == chunk )
- return rc;
- addr += chunk;
- off += chunk;
- gpa += chunk;
- p_data += chunk;
- bytes -= chunk;
- if ( bytes < chunk )
- chunk = bytes;
- }
- }
+ return hvmemul_linear_mmio_read(addr, bytes, p_data, pfec, hvmemul_ctxt, 1);
if ( (seg != x86_seg_none) &&
(hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3) )
@@ -526,29 +769,8 @@ static int __hvmemul_read(
case HVMCOPY_bad_gfn_to_mfn:
if ( access_type == hvm_access_insn_fetch )
return X86EMUL_UNHANDLEABLE;
- rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec,
- hvmemul_ctxt);
- while ( rc == X86EMUL_OKAY )
- {
- rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_READ, 0, p_data);
- if ( rc != X86EMUL_OKAY || bytes == chunk )
- break;
- addr += chunk;
- off += chunk;
- p_data += chunk;
- bytes -= chunk;
- if ( bytes < chunk )
- chunk = bytes;
- if ( off < PAGE_SIZE )
- gpa += chunk;
- else
- {
- rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec,
- hvmemul_ctxt);
- off = 0;
- }
- }
- return rc;
+
+ return hvmemul_linear_mmio_read(addr, bytes, p_data, pfec, hvmemul_ctxt, 0);
case HVMCOPY_gfn_paged_out:
case HVMCOPY_gfn_shared:
return X86EMUL_RETRY;
@@ -566,6 +788,12 @@ static int hvmemul_read(
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
+ struct hvm_emulate_ctxt *hvmemul_ctxt =
+ container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+
+ if ( unlikely(hvmemul_ctxt->set_context) )
+ return set_context_data(p_data, bytes);
+
return __hvmemul_read(
seg, offset, p_data, bytes, hvm_access_read,
container_of(ctxt, struct hvm_emulate_ctxt, ctxt));
@@ -614,42 +842,18 @@ static int hvmemul_write(
container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
struct vcpu *curr = current;
unsigned long addr, reps = 1;
- unsigned int off, chunk = min(bytes, 1U << LONG_BYTEORDER);
uint32_t pfec = PFEC_page_present | PFEC_write_access;
struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
- paddr_t gpa;
int rc;
rc = hvmemul_virtual_to_linear(
seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
if ( rc != X86EMUL_OKAY )
return rc;
- off = addr & (PAGE_SIZE - 1);
- /* See the respective comment in __hvmemul_read(). */
- while ( chunk & (chunk - 1) )
- chunk &= chunk - 1;
- if ( off + bytes > PAGE_SIZE )
- while ( off & (chunk - 1) )
- chunk >>= 1;
if ( vio->mmio_access.write_access &&
(vio->mmio_gva == (addr & PAGE_MASK)) )
- {
- gpa = (((paddr_t)vio->mmio_gpfn << PAGE_SHIFT) | off);
- while ( (off + chunk) <= PAGE_SIZE )
- {
- rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_WRITE, 0, p_data);
- if ( rc != X86EMUL_OKAY || bytes == chunk )
- return rc;
- addr += chunk;
- off += chunk;
- gpa += chunk;
- p_data += chunk;
- bytes -= chunk;
- if ( bytes < chunk )
- chunk = bytes;
- }
- }
+ return hvmemul_linear_mmio_write(addr, bytes, p_data, pfec, hvmemul_ctxt, 1);
if ( (seg != x86_seg_none) &&
(hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3) )
@@ -664,29 +868,7 @@ static int hvmemul_write(
case HVMCOPY_bad_gva_to_gfn:
return X86EMUL_EXCEPTION;
case HVMCOPY_bad_gfn_to_mfn:
- rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec,
- hvmemul_ctxt);
- while ( rc == X86EMUL_OKAY )
- {
- rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_WRITE, 0, p_data);
- if ( rc != X86EMUL_OKAY || bytes == chunk )
- break;
- addr += chunk;
- off += chunk;
- p_data += chunk;
- bytes -= chunk;
- if ( bytes < chunk )
- chunk = bytes;
- if ( off < PAGE_SIZE )
- gpa += chunk;
- else
- {
- rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec,
- hvmemul_ctxt);
- off = 0;
- }
- }
- return rc;
+ return hvmemul_linear_mmio_write(addr, bytes, p_data, pfec, hvmemul_ctxt, 0);
case HVMCOPY_gfn_paged_out:
case HVMCOPY_gfn_shared:
return X86EMUL_RETRY;
@@ -731,6 +913,17 @@ static int hvmemul_rep_movs_discard(
return X86EMUL_OKAY;
}
+static int hvmemul_rep_stos_discard(
+ void *p_data,
+ enum x86_segment seg,
+ unsigned long offset,
+ unsigned int bytes_per_rep,
+ unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ return X86EMUL_OKAY;
+}
+
static int hvmemul_rep_outs_discard(
enum x86_segment src_seg,
unsigned long src_offset,
@@ -793,6 +986,17 @@ static int hvmemul_cmpxchg(
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
+ struct hvm_emulate_ctxt *hvmemul_ctxt =
+ container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+
+ if ( unlikely(hvmemul_ctxt->set_context) )
+ {
+ int rc = set_context_data(p_new, bytes);
+
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+ }
+
/* Fix this in case the guest is really relying on r-m-w atomicity. */
return hvmemul_write(seg, offset, p_new, bytes, ctxt);
}
@@ -831,8 +1035,35 @@ static int hvmemul_rep_ins(
if ( p2mt == p2m_mmio_direct || p2mt == p2m_mmio_dm )
return X86EMUL_UNHANDLEABLE;
- return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ,
- !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
+ return hvmemul_do_pio_addr(src_port, reps, bytes_per_rep, IOREQ_READ,
+ !!(ctxt->regs->eflags & X86_EFLAGS_DF), gpa);
+}
+
+static int hvmemul_rep_outs_set_context(
+ enum x86_segment src_seg,
+ unsigned long src_offset,
+ uint16_t dst_port,
+ unsigned int bytes_per_rep,
+ unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ unsigned int bytes = *reps * bytes_per_rep;
+ char *buf;
+ int rc;
+
+ buf = xmalloc_array(char, bytes);
+
+ if ( buf == NULL )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = set_context_data(buf, bytes);
+
+ if ( rc == X86EMUL_OKAY )
+ rc = hvmemul_do_pio_buffer(dst_port, bytes, IOREQ_WRITE, buf);
+
+ xfree(buf);
+
+ return rc;
}
static int hvmemul_rep_outs(
@@ -851,6 +1082,10 @@ static int hvmemul_rep_outs(
p2m_type_t p2mt;
int rc;
+ if ( unlikely(hvmemul_ctxt->set_context) )
+ return hvmemul_rep_outs_set_context(src_seg, src_offset, dst_port,
+ bytes_per_rep, reps, ctxt);
+
rc = hvmemul_virtual_to_linear(
src_seg, src_offset, bytes_per_rep, reps, hvm_access_read,
hvmemul_ctxt, &addr);
@@ -869,8 +1104,8 @@ static int hvmemul_rep_outs(
if ( p2mt == p2m_mmio_direct || p2mt == p2m_mmio_dm )
return X86EMUL_UNHANDLEABLE;
- return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE,
- !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
+ return hvmemul_do_pio_addr(dst_port, reps, bytes_per_rep, IOREQ_WRITE,
+ !!(ctxt->regs->eflags & X86_EFLAGS_DF), gpa);
}
static int hvmemul_rep_movs(
@@ -926,12 +1161,12 @@ static int hvmemul_rep_movs(
return X86EMUL_UNHANDLEABLE;
if ( sp2mt == p2m_mmio_dm )
- return hvmemul_do_mmio(
- sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ, df, NULL);
+ return hvmemul_do_mmio_addr(
+ sgpa, reps, bytes_per_rep, IOREQ_READ, df, dgpa);
if ( dp2mt == p2m_mmio_dm )
- return hvmemul_do_mmio(
- dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE, df, NULL);
+ return hvmemul_do_mmio_addr(
+ dgpa, reps, bytes_per_rep, IOREQ_WRITE, df, sgpa);
/* RAM-to-RAM copy: emulate as equivalent of memmove(dgpa, sgpa, bytes). */
bytes = *reps * bytes_per_rep;
@@ -957,11 +1192,26 @@ static int hvmemul_rep_movs(
if ( buf == NULL )
return X86EMUL_UNHANDLEABLE;
- /*
- * We do a modicum of checking here, just for paranoia's sake and to
- * definitely avoid copying an unitialised buffer into guest address space.
- */
- rc = hvm_copy_from_guest_phys(buf, sgpa, bytes);
+ if ( unlikely(hvmemul_ctxt->set_context) )
+ {
+ rc = set_context_data(buf, bytes);
+
+ if ( rc != X86EMUL_OKAY )
+ {
+ xfree(buf);
+ return rc;
+ }
+
+ rc = HVMCOPY_okay;
+ }
+ else
+ /*
+ * We do a modicum of checking here, just for paranoia's sake and to
+ * definitely avoid copying an uninitialised buffer into guest address
+ * space.
+ */
+ rc = hvm_copy_from_guest_phys(buf, sgpa, bytes);
+
if ( rc == HVMCOPY_okay )
rc = hvm_copy_to_guest_phys(dgpa, buf, bytes);
@@ -982,6 +1232,113 @@ static int hvmemul_rep_movs(
return X86EMUL_OKAY;
}
+static int hvmemul_rep_stos(
+ void *p_data,
+ enum x86_segment seg,
+ unsigned long offset,
+ unsigned int bytes_per_rep,
+ unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct hvm_emulate_ctxt *hvmemul_ctxt =
+ container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+ unsigned long addr;
+ paddr_t gpa;
+ p2m_type_t p2mt;
+ bool_t df = !!(ctxt->regs->eflags & X86_EFLAGS_DF);
+ int rc = hvmemul_virtual_to_linear(seg, offset, bytes_per_rep, reps,
+ hvm_access_write, hvmemul_ctxt, &addr);
+
+ if ( rc == X86EMUL_OKAY )
+ {
+ uint32_t pfec = PFEC_page_present | PFEC_write_access;
+
+ if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
+ pfec |= PFEC_user_mode;
+
+ rc = hvmemul_linear_to_phys(
+ addr, &gpa, bytes_per_rep, reps, pfec, hvmemul_ctxt);
+ }
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ /* Check for MMIO op */
+ (void)get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, &p2mt);
+
+ switch ( p2mt )
+ {
+ unsigned long bytes;
+ void *buf;
+
+ default:
+ /* Allocate temporary buffer. */
+ for ( ; ; )
+ {
+ bytes = *reps * bytes_per_rep;
+ buf = xmalloc_bytes(bytes);
+ if ( buf || *reps <= 1 )
+ break;
+ *reps >>= 1;
+ }
+
+ if ( !buf )
+ buf = p_data;
+ else
+ switch ( bytes_per_rep )
+ {
+ unsigned long dummy;
+
+#define CASE(bits, suffix) \
+ case (bits) / 8: \
+ asm ( "rep stos" #suffix \
+ : "=m" (*(char (*)[bytes])buf), \
+ "=D" (dummy), "=c" (dummy) \
+ : "a" (*(const uint##bits##_t *)p_data), \
+ "1" (buf), "2" (*reps) ); \
+ break
+ CASE(8, b);
+ CASE(16, w);
+ CASE(32, l);
+ CASE(64, q);
+#undef CASE
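+ /*
+ * Each CASE(bits, suffix) above emits a "rep stos<suffix>"
+ * instruction that replicates the <bits>-bit pattern at p_data
+ * across the temporary buffer, e.g. CASE(32, l) uses "rep stosl".
+ */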
+
+ default:
+ ASSERT_UNREACHABLE();
+ xfree(buf);
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ /* Adjust address for reverse store. */
+ if ( df )
+ gpa -= bytes - bytes_per_rep;
+
+ rc = hvm_copy_to_guest_phys(gpa, buf, bytes);
+
+ if ( buf != p_data )
+ xfree(buf);
+
+ switch ( rc )
+ {
+ case HVMCOPY_gfn_paged_out:
+ case HVMCOPY_gfn_shared:
+ return X86EMUL_RETRY;
+ case HVMCOPY_okay:
+ return X86EMUL_OKAY;
+ }
+
+ gdprintk(XENLOG_WARNING,
+ "Failed REP STOS: gpa=%"PRIpaddr" reps=%lu bytes_per_rep=%u\n",
+ gpa, *reps, bytes_per_rep);
+ /* fall through */
+ case p2m_mmio_direct:
+ return X86EMUL_UNHANDLEABLE;
+
+ case p2m_mmio_dm:
+ return hvmemul_do_mmio_buffer(gpa, reps, bytes_per_rep, IOREQ_WRITE, df,
+ p_data);
+ }
+}
+
static int hvmemul_read_segment(
enum x86_segment seg,
struct segment_register *reg,
@@ -1015,9 +1372,15 @@ static int hvmemul_read_io(
unsigned long *val,
struct x86_emulate_ctxt *ctxt)
{
- unsigned long reps = 1;
+ struct hvm_emulate_ctxt *hvmemul_ctxt =
+ container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+
*val = 0;
- return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, val);
+
+ if ( unlikely(hvmemul_ctxt->set_context) )
+ return set_context_data(val, bytes);
+
+ return hvmemul_do_pio_buffer(port, bytes, IOREQ_READ, val);
}
static int hvmemul_write_io(
@@ -1026,8 +1389,7 @@ static int hvmemul_write_io(
unsigned long val,
struct x86_emulate_ctxt *ctxt)
{
- unsigned long reps = 1;
- return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_WRITE, 0, &val);
+ return hvmemul_do_pio_buffer(port, bytes, IOREQ_WRITE, &val);
}
static int hvmemul_read_cr(
@@ -1060,14 +1422,14 @@ static int hvmemul_write_cr(
switch ( reg )
{
case 0:
- return hvm_set_cr0(val);
+ return hvm_set_cr0(val, 1);
case 2:
current->arch.hvm_vcpu.guest_cr[2] = val;
return X86EMUL_OKAY;
case 3:
- return hvm_set_cr3(val);
+ return hvm_set_cr3(val, 1);
case 4:
- return hvm_set_cr4(val);
+ return hvm_set_cr4(val, 1);
default:
break;
}
@@ -1088,7 +1450,7 @@ static int hvmemul_write_msr(
uint64_t val,
struct x86_emulate_ctxt *ctxt)
{
- return hvm_msr_write_intercept(reg, val);
+ return hvm_msr_write_intercept(reg, val, 1);
}
static int hvmemul_wbinvd(
@@ -1231,6 +1593,18 @@ static int hvmemul_invlpg(
return rc;
}
+static int hvmemul_vmfunc(
+ struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ rc = hvm_funcs.altp2m_vcpu_emulate_vmfunc(ctxt->regs);
+ if ( rc != X86EMUL_OKAY )
+ hvmemul_inject_hw_exception(TRAP_invalid_op, 0, ctxt);
+
+ return rc;
+}
+
static const struct x86_emulate_ops hvm_emulate_ops = {
.read = hvmemul_read,
.insn_fetch = hvmemul_insn_fetch,
@@ -1239,6 +1613,7 @@ static const struct x86_emulate_ops hvm_emulate_ops = {
.rep_ins = hvmemul_rep_ins,
.rep_outs = hvmemul_rep_outs,
.rep_movs = hvmemul_rep_movs,
+ .rep_stos = hvmemul_rep_stos,
.read_segment = hvmemul_read_segment,
.write_segment = hvmemul_write_segment,
.read_io = hvmemul_read_io,
@@ -1253,7 +1628,8 @@ static const struct x86_emulate_ops hvm_emulate_ops = {
.inject_sw_interrupt = hvmemul_inject_sw_interrupt,
.get_fpu = hvmemul_get_fpu,
.put_fpu = hvmemul_put_fpu,
- .invlpg = hvmemul_invlpg
+ .invlpg = hvmemul_invlpg,
+ .vmfunc = hvmemul_vmfunc,
};
static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
@@ -1264,6 +1640,7 @@ static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
.rep_ins = hvmemul_rep_ins_discard,
.rep_outs = hvmemul_rep_outs_discard,
.rep_movs = hvmemul_rep_movs_discard,
+ .rep_stos = hvmemul_rep_stos_discard,
.read_segment = hvmemul_read_segment,
.write_segment = hvmemul_write_segment,
.read_io = hvmemul_read_io_discard,
@@ -1278,7 +1655,8 @@ static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
.inject_sw_interrupt = hvmemul_inject_sw_interrupt,
.get_fpu = hvmemul_get_fpu,
.put_fpu = hvmemul_put_fpu,
- .invlpg = hvmemul_invlpg
+ .invlpg = hvmemul_invlpg,
+ .vmfunc = hvmemul_vmfunc,
};
static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
@@ -1331,7 +1709,6 @@ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
}
hvmemul_ctxt->exn_pending = 0;
- vio->mmio_retrying = vio->mmio_retry;
vio->mmio_retry = 0;
if ( cpu_has_vmx )
@@ -1347,7 +1724,7 @@ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
rc = X86EMUL_RETRY;
if ( rc != X86EMUL_RETRY )
{
- vio->mmio_large_read_bytes = vio->mmio_large_write_bytes = 0;
+ vio->mmio_cache_count = 0;
vio->mmio_insn_bytes = 0;
}
else
@@ -1401,7 +1778,7 @@ int hvm_emulate_one_no_write(
return _hvm_emulate_one(hvmemul_ctxt, &hvm_emulate_ops_no_write);
}
-void hvm_mem_event_emulate_one(bool_t nowrite, unsigned int trapnr,
+void hvm_mem_access_emulate_one(enum emul_kind kind, unsigned int trapnr,
unsigned int errcode)
{
struct hvm_emulate_ctxt ctx = {{ 0 }};
@@ -1409,16 +1786,23 @@ void hvm_mem_event_emulate_one(bool_t nowrite, unsigned int trapnr,
hvm_emulate_prepare(&ctx, guest_cpu_user_regs());
- if ( nowrite )
+ switch ( kind )
+ {
+ case EMUL_KIND_NOWRITE:
rc = hvm_emulate_one_no_write(&ctx);
- else
+ break;
+ case EMUL_KIND_SET_CONTEXT:
+ ctx.set_context = 1;
+ /* Intentional fall-through. */
+ default:
rc = hvm_emulate_one(&ctx);
+ }
switch ( rc )
{
case X86EMUL_RETRY:
/*
- * This function is called when handling an EPT-related mem_event
+ * This function is called when handling an EPT-related vm_event
* reply. As such, nothing else needs to be done here, since simply
* returning makes the current instruction cause a page fault again,
* consistent with X86EMUL_RETRY.
@@ -1446,6 +1830,7 @@ void hvm_emulate_prepare(
hvmemul_ctxt->ctxt.force_writeback = 1;
hvmemul_ctxt->seg_reg_accessed = 0;
hvmemul_ctxt->seg_reg_dirty = 0;
+ hvmemul_ctxt->set_context = 0;
hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt);
hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt);
}
diff --git a/xen/arch/x86/hvm/event.c b/xen/arch/x86/hvm/event.c
new file mode 100644
index 0000000..4097af0
--- /dev/null
+++ b/xen/arch/x86/hvm/event.c
@@ -0,0 +1,189 @@
+/*
+* event.c: Common hardware virtual machine event abstractions.
+*
+* Copyright (c) 2004, Intel Corporation.
+* Copyright (c) 2005, International Business Machines Corporation.
+* Copyright (c) 2008, Citrix Systems, Inc.
+*
+* This program is free software; you can redistribute it and/or modify it
+* under the terms and conditions of the GNU General Public License,
+* version 2, as published by the Free Software Foundation.
+*
+* This program is distributed in the hope it will be useful, but WITHOUT
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+* more details.
+*
+* You should have received a copy of the GNU General Public License along with
+* this program; If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <xen/vm_event.h>
+#include <xen/paging.h>
+#include <asm/hvm/event.h>
+#include <asm/monitor.h>
+#include <public/vm_event.h>
+
+static void hvm_event_fill_regs(vm_event_request_t *req)
+{
+ const struct cpu_user_regs *regs = guest_cpu_user_regs();
+ const struct vcpu *curr = current;
+
+ req->data.regs.x86.rax = regs->eax;
+ req->data.regs.x86.rcx = regs->ecx;
+ req->data.regs.x86.rdx = regs->edx;
+ req->data.regs.x86.rbx = regs->ebx;
+ req->data.regs.x86.rsp = regs->esp;
+ req->data.regs.x86.rbp = regs->ebp;
+ req->data.regs.x86.rsi = regs->esi;
+ req->data.regs.x86.rdi = regs->edi;
+
+ req->data.regs.x86.r8 = regs->r8;
+ req->data.regs.x86.r9 = regs->r9;
+ req->data.regs.x86.r10 = regs->r10;
+ req->data.regs.x86.r11 = regs->r11;
+ req->data.regs.x86.r12 = regs->r12;
+ req->data.regs.x86.r13 = regs->r13;
+ req->data.regs.x86.r14 = regs->r14;
+ req->data.regs.x86.r15 = regs->r15;
+
+ req->data.regs.x86.rflags = regs->eflags;
+ req->data.regs.x86.rip = regs->eip;
+
+ req->data.regs.x86.msr_efer = curr->arch.hvm_vcpu.guest_efer;
+ req->data.regs.x86.cr0 = curr->arch.hvm_vcpu.guest_cr[0];
+ req->data.regs.x86.cr3 = curr->arch.hvm_vcpu.guest_cr[3];
+ req->data.regs.x86.cr4 = curr->arch.hvm_vcpu.guest_cr[4];
+}
+
+static int hvm_event_traps(uint8_t sync, vm_event_request_t *req)
+{
+ int rc;
+ struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
+
+ rc = vm_event_claim_slot(currd, &currd->vm_event->monitor);
+ switch ( rc )
+ {
+ case 0:
+ break;
+ case -ENOSYS:
+ /*
+ * If there was no ring to handle the event, then
+ * simply continue executing normally.
+ */
+ return 1;
+ default:
+ return rc;
+ }
+
+ if ( sync )
+ {
+ req->flags |= VM_EVENT_FLAG_VCPU_PAUSED;
+ vm_event_vcpu_pause(curr);
+ }
+
+ hvm_event_fill_regs(req);
+ vm_event_put_request(currd, &currd->vm_event->monitor, req);
+
+ return 1;
+}
+
+bool_t hvm_event_cr(unsigned int index, unsigned long value, unsigned long old)
+{
+ struct arch_domain *currad = &current->domain->arch;
+ unsigned int ctrlreg_bitmask = monitor_ctrlreg_bitmask(index);
+
+ if ( (currad->monitor.write_ctrlreg_enabled & ctrlreg_bitmask) &&
+ (!(currad->monitor.write_ctrlreg_onchangeonly & ctrlreg_bitmask) ||
+ value != old) )
+ {
+ vm_event_request_t req = {
+ .reason = VM_EVENT_REASON_WRITE_CTRLREG,
+ .vcpu_id = current->vcpu_id,
+ .u.write_ctrlreg.index = index,
+ .u.write_ctrlreg.new_value = value,
+ .u.write_ctrlreg.old_value = old
+ };
+
+ hvm_event_traps(currad->monitor.write_ctrlreg_sync & ctrlreg_bitmask,
+ &req);
+ return 1;
+ }
+
+ return 0;
+}
+
+void hvm_event_msr(unsigned int msr, uint64_t value)
+{
+ struct vcpu *curr = current;
+ vm_event_request_t req = {
+ .reason = VM_EVENT_REASON_MOV_TO_MSR,
+ .vcpu_id = curr->vcpu_id,
+ .u.mov_to_msr.msr = msr,
+ .u.mov_to_msr.value = value,
+ };
+
+ if ( curr->domain->arch.monitor.mov_to_msr_enabled )
+ hvm_event_traps(1, &req);
+}
+
+void hvm_event_guest_request(void)
+{
+ struct vcpu *curr = current;
+ struct arch_domain *currad = &curr->domain->arch;
+
+ if ( currad->monitor.guest_request_enabled )
+ {
+ vm_event_request_t req = {
+ .reason = VM_EVENT_REASON_GUEST_REQUEST,
+ .vcpu_id = curr->vcpu_id,
+ };
+
+ hvm_event_traps(currad->monitor.guest_request_sync, &req);
+ }
+}
+
+int hvm_event_int3(unsigned long gla)
+{
+ int rc = 0;
+ uint32_t pfec = PFEC_page_present;
+ struct vcpu *curr = current;
+ vm_event_request_t req = {
+ .reason = VM_EVENT_REASON_SOFTWARE_BREAKPOINT,
+ .vcpu_id = curr->vcpu_id,
+ .u.software_breakpoint.gfn = paging_gva_to_gfn(curr, gla, &pfec)
+ };
+
+ if ( curr->domain->arch.monitor.software_breakpoint_enabled )
+ rc = hvm_event_traps(1, &req);
+
+ return rc;
+}
+
+int hvm_event_single_step(unsigned long gla)
+{
+ int rc = 0;
+ uint32_t pfec = PFEC_page_present;
+ struct vcpu *curr = current;
+ vm_event_request_t req = {
+ .reason = VM_EVENT_REASON_SINGLESTEP,
+ .vcpu_id = curr->vcpu_id,
+ .u.singlestep.gfn = paging_gva_to_gfn(curr, gla, &pfec)
+ };
+
+ if ( curr->domain->arch.monitor.singlestep_enabled )
+ rc = hvm_event_traps(1, &req);
+
+ return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/hvm/hpet.c b/xen/arch/x86/hvm/hpet.c
index bdfc6fc..facab83 100644
--- a/xen/arch/x86/hvm/hpet.c
+++ b/xen/arch/x86/hvm/hpet.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <asm/hvm/vpt.h>
@@ -166,13 +165,19 @@ static inline int hpet_check_access_length(
}
static int hpet_read(
- struct vcpu *v, unsigned long addr, unsigned long length,
+ struct vcpu *v, unsigned long addr, unsigned int length,
unsigned long *pval)
{
HPETState *h = vcpu_vhpet(v);
unsigned long result;
uint64_t val;
+ if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] )
+ {
+ result = ~0ul;
+ goto out;
+ }
+
addr &= HPET_MMAP_SIZE-1;
if ( hpet_check_access_length(addr, length) != 0 )
@@ -295,7 +300,7 @@ static inline uint64_t hpet_fixup_reg(
static int hpet_write(
struct vcpu *v, unsigned long addr,
- unsigned long length, unsigned long val)
+ unsigned int length, unsigned long val)
{
HPETState *h = vcpu_vhpet(v);
uint64_t old_val, new_val;
@@ -309,6 +314,9 @@ static int hpet_write(
#define set_start_timer(n) (__set_bit((n), &start_timers))
#define set_restart_timer(n) (set_stop_timer(n),set_start_timer(n))
+ if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] )
+ goto out;
+
addr &= HPET_MMAP_SIZE-1;
if ( hpet_check_access_length(addr, length) != 0 )
@@ -491,26 +499,27 @@ static int hpet_write(
static int hpet_range(struct vcpu *v, unsigned long addr)
{
- return (v->domain->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] &&
- (addr >= HPET_BASE_ADDRESS) &&
- (addr < (HPET_BASE_ADDRESS + HPET_MMAP_SIZE)));
+ return ( (addr >= HPET_BASE_ADDRESS) &&
+ (addr < (HPET_BASE_ADDRESS + HPET_MMAP_SIZE)) );
}
-const struct hvm_mmio_handler hpet_mmio_handler = {
- .check_handler = hpet_range,
- .read_handler = hpet_read,
- .write_handler = hpet_write
+static const struct hvm_mmio_ops hpet_mmio_ops = {
+ .check = hpet_range,
+ .read = hpet_read,
+ .write = hpet_write
};
static int hpet_save(struct domain *d, hvm_domain_context_t *h)
{
HPETState *hp = domain_vhpet(d);
+ struct vcpu *v = pt_global_vcpu_target(d);
int rc;
uint64_t guest_time;
write_lock(&hp->lock);
- guest_time = guest_time_hpet(hp);
+ guest_time = (v->arch.hvm_vcpu.guest_time ?: hvm_get_guest_time(v)) /
+ STIME_PER_HPET_TICK;
/* Write the proper value into the main counter */
if ( hpet_enabled(hp) )
@@ -651,6 +660,8 @@ void hpet_init(struct domain *d)
h->hpet.comparator64[i] = ~0ULL;
h->pt[i].source = PTSRC_isa;
}
+
+ register_mmio_handler(d, &hpet_mmio_ops);
}
void hpet_deinit(struct domain *d)
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 55077f9..615fa89 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -15,8 +15,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -35,7 +34,6 @@
#include <xen/paging.h>
#include <xen/cpu.h>
#include <xen/wait.h>
-#include <xen/mem_event.h>
#include <xen/mem_access.h>
#include <xen/rangeset.h>
#include <asm/shadow.h>
@@ -52,21 +50,24 @@
#include <asm/xstate.h>
#include <asm/traps.h>
#include <asm/mc146818rtc.h>
-#include <asm/spinlock.h>
#include <asm/mce.h>
+#include <asm/monitor.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/vpt.h>
#include <asm/hvm/support.h>
#include <asm/hvm/cacheattr.h>
#include <asm/hvm/trace.h>
#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/event.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/altp2m.h>
#include <asm/mtrr.h>
#include <asm/apic.h>
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <public/version.h>
#include <public/memory.h>
-#include <public/mem_event.h>
+#include <public/vm_event.h>
#include <public/arch-x86/cpuid.h>
bool_t __read_mostly hvm_enabled;
@@ -78,9 +79,13 @@ integer_param("hvm_debug", opt_hvm_debug_level);
struct hvm_function_table hvm_funcs __read_mostly;
-/* I/O permission bitmap is globally shared by all HVM guests. */
-unsigned long __attribute__ ((__section__ (".bss.page_aligned")))
- hvm_io_bitmap[3*PAGE_SIZE/BYTES_PER_LONG];
+/*
+ * The I/O permission bitmap is globally shared by all HVM guests except
+ * the hardware domain which needs a more permissive one.
+ */
+#define HVM_IOBITMAP_SIZE (3 * PAGE_SIZE)
+unsigned long __section(".bss.page_aligned")
+ hvm_io_bitmap[HVM_IOBITMAP_SIZE / BYTES_PER_LONG];
/* Xen command-line option to enable HAP */
static bool_t __initdata opt_hap_enabled = 1;
@@ -91,6 +96,10 @@ bool_t opt_hvm_fep;
boolean_param("hvm_fep", opt_hvm_fep);
#endif
+/* Xen command-line option to enable altp2m */
+static bool_t __initdata opt_altp2m_enabled = 0;
+boolean_param("altp2m", opt_altp2m_enabled);
+
static int cpu_callback(
struct notifier_block *nfb, unsigned long action, void *hcpu)
{
@@ -157,6 +166,9 @@ static int __init hvm_enable(void)
if ( !fns->pvh_supported )
printk(XENLOG_INFO "HVM: PVH mode not supported on this platform\n");
+ if ( !opt_altp2m_enabled )
+ hvm_funcs.altp2m_supported = 0;
+
/*
* Allow direct access to the PC debug ports 0x80 and 0xed (they are
* often used for I/O delays, but the vmexits simply slow things down).
@@ -205,6 +217,16 @@ int hvm_event_needs_reinjection(uint8_t type, uint8_t vector)
*/
uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2)
{
+ const unsigned int contributory_exceptions =
+ (1 << TRAP_divide_error) |
+ (1 << TRAP_invalid_tss) |
+ (1 << TRAP_no_segment) |
+ (1 << TRAP_stack_error) |
+ (1 << TRAP_gp_fault);
+ const unsigned int page_faults =
+ (1 << TRAP_page_fault) |
+ (1 << TRAP_virtualisation);
+
/* Exception during double-fault delivery always causes a triple fault. */
if ( vec1 == TRAP_double_fault )
{
@@ -213,11 +235,12 @@ uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2)
}
/* Exception during page-fault delivery always causes a double fault. */
- if ( vec1 == TRAP_page_fault )
+ if ( (1u << vec1) & page_faults )
return TRAP_double_fault;
/* Discard the first exception if it's benign or if we now have a #PF. */
- if ( !((1u << vec1) & 0x7c01u) || (vec2 == TRAP_page_fault) )
+ if ( !((1u << vec1) & contributory_exceptions) ||
+ ((1u << vec2) & page_faults) )
return vec2;
/* Cannot combine the exceptions: double fault. */
@@ -281,7 +304,7 @@ void hvm_set_guest_tsc_fixed(struct vcpu *v, u64 guest_tsc, u64 at_tsc)
}
else
{
- rdtscll(tsc);
+ tsc = rdtsc();
}
delta_tsc = guest_tsc - tsc;
@@ -308,7 +331,6 @@ u64 hvm_get_guest_tsc_fixed(struct vcpu *v, uint64_t at_tsc)
{
tsc = hvm_get_guest_time_fixed(v, at_tsc);
tsc = gtime_to_gtsc(v->domain, tsc);
- v->domain->arch.vtsc_kerncount++;
}
else if ( at_tsc )
{
@@ -316,7 +338,7 @@ u64 hvm_get_guest_tsc_fixed(struct vcpu *v, uint64_t at_tsc)
}
else
{
- rdtscll(tsc);
+ tsc = rdtsc();
}
return tsc + v->arch.hvm_vcpu.cache_tsc_offset;
@@ -389,25 +411,57 @@ bool_t hvm_io_pending(struct vcpu *v)
&d->arch.hvm_domain.ioreq_server.list,
list_entry )
{
- ioreq_t *p = get_ioreq(s, v);
+ struct hvm_ioreq_vcpu *sv;
- if ( p->state != STATE_IOREQ_NONE )
- return 1;
+ list_for_each_entry ( sv,
+ &s->ioreq_vcpu_list,
+ list_entry )
+ {
+ if ( sv->vcpu == v && sv->pending )
+ return 1;
+ }
}
return 0;
}
+static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data)
+{
+ struct vcpu *v = sv->vcpu;
+ struct hvm_vcpu_io *vio = &v->arch.hvm_vcpu.hvm_io;
+
+ if ( hvm_vcpu_io_need_completion(vio) )
+ {
+ vio->io_req.state = STATE_IORESP_READY;
+ vio->io_req.data = data;
+ }
+ else
+ vio->io_req.state = STATE_IOREQ_NONE;
+
+ msix_write_completion(v);
+ vcpu_end_shutdown_deferral(v);
+
+ sv->pending = 0;
+}
+
static bool_t hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
{
- /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
- while ( p->state != STATE_IOREQ_NONE )
+ while ( sv->pending )
{
switch ( p->state )
{
+ case STATE_IOREQ_NONE:
+ /*
+ * The only reason we should see this case is when an
+ * emulator is dying and it races with an I/O being
+ * requested.
+ */
+ hvm_io_assist(sv, ~0ul);
+ break;
case STATE_IORESP_READY: /* IORESP_READY -> NONE */
rmb(); /* see IORESP_READY /then/ read contents of ioreq */
- hvm_io_assist(p);
+ p->state = STATE_IOREQ_NONE;
+ hvm_io_assist(sv, p->data);
break;
case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
case STATE_IOREQ_INPROCESS:
@@ -417,6 +471,7 @@ static bool_t hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
break;
default:
gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
+ sv->pending = 0;
domain_crash(sv->vcpu->domain);
return 0; /* bail */
}
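Keying the loop off sv->pending instead of the shared state field closes a race with a dying emulator: if the emulator resets p->state to STATE_IOREQ_NONE while the vCPU is still waiting, the new STATE_IOREQ_NONE case completes the request with all-ones data instead of spinning. For orientation, a hypothetical and much-simplified device-model side of this handshake (structure invented here; the real counterpart lives in QEMU):

    void dm_handle_one(volatile ioreq_t *p)
    {
        if ( p->state != STATE_IOREQ_READY )
            return;
        p->state = STATE_IOREQ_INPROCESS;
        /* ... emulate the access, producing a result for reads ... */
        p->data = 0;               /* placeholder result */
        __sync_synchronize();      /* publish data before the state flip */
        p->state = STATE_IORESP_READY;
        /* ... then notify the event channel bound to p->vp_eport ... */
    }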
@@ -427,12 +482,14 @@ static bool_t hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
void hvm_do_resume(struct vcpu *v)
{
+ struct hvm_vcpu_io *vio = &v->arch.hvm_vcpu.hvm_io;
struct domain *d = v->domain;
struct hvm_ioreq_server *s;
+ enum hvm_io_completion io_completion;
check_wakeup_from_wait();
- if ( is_hvm_vcpu(v) )
+ if ( is_hvm_domain(d) )
pt_restore_timer(v);
list_for_each_entry ( s,
@@ -445,7 +502,7 @@ void hvm_do_resume(struct vcpu *v)
&s->ioreq_vcpu_list,
list_entry )
{
- if ( sv->vcpu == v )
+ if ( sv->vcpu == v && sv->pending )
{
if ( !hvm_wait_for_io(sv, get_ioreq(s, v)) )
return;
@@ -455,6 +512,64 @@ void hvm_do_resume(struct vcpu *v)
}
}
+ io_completion = vio->io_completion;
+ vio->io_completion = HVMIO_no_completion;
+
+ switch ( io_completion )
+ {
+ case HVMIO_no_completion:
+ break;
+ case HVMIO_mmio_completion:
+ handle_mmio();
+ break;
+ case HVMIO_pio_completion:
+ (void)handle_pio(vio->io_req.addr, vio->io_req.size,
+ vio->io_req.dir);
+ break;
+ case HVMIO_realmode_completion:
+ {
+ struct hvm_emulate_ctxt ctxt;
+
+ hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
+ vmx_realmode_emulate_one(&ctxt);
+ hvm_emulate_writeback(&ctxt);
+
+ break;
+ }
+ default:
+ ASSERT_UNREACHABLE();
+ break;
+ }
+
+ if ( unlikely(d->arch.event_write_data) )
+ {
+ struct monitor_write_data *w = &d->arch.event_write_data[v->vcpu_id];
+
+ if ( w->do_write.msr )
+ {
+ hvm_msr_write_intercept(w->msr, w->value, 0);
+ w->do_write.msr = 0;
+ }
+
+ if ( w->do_write.cr0 )
+ {
+ hvm_set_cr0(w->cr0, 0);
+ w->do_write.cr0 = 0;
+ }
+
+ if ( w->do_write.cr4 )
+ {
+ hvm_set_cr4(w->cr4, 0);
+ w->do_write.cr4 = 0;
+ }
+
+ if ( w->do_write.cr3 )
+ {
+ hvm_set_cr3(w->cr3, 0);
+ w->do_write.cr3 = 0;
+ }
+ }
+
/* Inject pending hw/sw trap */
if ( v->arch.hvm_vcpu.inject_trap.vector != -1 )
{
@@ -486,7 +601,8 @@ static void hvm_free_ioreq_gmfn(struct domain *d, unsigned long gmfn)
{
unsigned int i = gmfn - d->arch.hvm_domain.ioreq_gmfn.base;
- clear_bit(i, &d->arch.hvm_domain.ioreq_gmfn.mask);
+ if ( gmfn != INVALID_GFN )
+ set_bit(i, &d->arch.hvm_domain.ioreq_gmfn.mask);
}
static void hvm_unmap_ioreq_page(struct hvm_ioreq_server *s, bool_t buf)
@@ -545,7 +661,7 @@ static int hvm_add_ioreq_gmfn(
}
static int hvm_print_line(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
struct domain *cd = current->domain;
char c = *val;
@@ -571,7 +687,7 @@ static int hvm_print_line(
}
static int hvm_access_cf8(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
struct domain *d = current->domain;
@@ -583,15 +699,14 @@ static int hvm_access_cf8(
}
static int handle_pvh_io(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
- struct vcpu *curr = current;
- struct cpu_user_regs *regs = guest_cpu_user_regs();
+ struct domain *currd = current->domain;
if ( dir == IOREQ_WRITE )
- guest_io_write(port, bytes, *val, curr, regs);
+ guest_io_write(port, bytes, *val, currd);
else
- *val = guest_io_read(port, bytes, curr, regs);
+ *val = guest_io_read(port, bytes, currd);
return X86EMUL_OKAY;
}
@@ -623,7 +738,8 @@ static int hvm_ioreq_server_add_vcpu(struct hvm_ioreq_server *s,
spin_lock(&s->lock);
- rc = alloc_unbound_xen_event_channel(v, s->domid, NULL);
+ rc = alloc_unbound_xen_event_channel(v->domain, v->vcpu_id, s->domid,
+ NULL);
if ( rc < 0 )
goto fail2;
@@ -633,7 +749,7 @@ static int hvm_ioreq_server_add_vcpu(struct hvm_ioreq_server *s,
{
struct domain *d = s->domain;
- rc = alloc_unbound_xen_event_channel(v, s->domid, NULL);
+ rc = alloc_unbound_xen_event_channel(v->domain, 0, s->domid, NULL);
if ( rc < 0 )
goto fail3;
@@ -654,7 +770,7 @@ static int hvm_ioreq_server_add_vcpu(struct hvm_ioreq_server *s,
return 0;
fail3:
- free_xen_event_channel(v, sv->ioreq_evtchn);
+ free_xen_event_channel(v->domain, sv->ioreq_evtchn);
fail2:
spin_unlock(&s->lock);
@@ -681,9 +797,9 @@ static void hvm_ioreq_server_remove_vcpu(struct hvm_ioreq_server *s,
list_del(&sv->list_entry);
if ( v->vcpu_id == 0 && s->bufioreq.va != NULL )
- free_xen_event_channel(v, s->bufioreq_evtchn);
+ free_xen_event_channel(v->domain, s->bufioreq_evtchn);
- free_xen_event_channel(v, sv->ioreq_evtchn);
+ free_xen_event_channel(v->domain, sv->ioreq_evtchn);
xfree(sv);
break;
@@ -708,9 +824,9 @@ static void hvm_ioreq_server_remove_all_vcpus(struct hvm_ioreq_server *s)
list_del(&sv->list_entry);
if ( v->vcpu_id == 0 && s->bufioreq.va != NULL )
- free_xen_event_channel(v, s->bufioreq_evtchn);
+ free_xen_event_channel(v->domain, s->bufioreq_evtchn);
- free_xen_event_channel(v, sv->ioreq_evtchn);
+ free_xen_event_channel(v->domain, sv->ioreq_evtchn);
xfree(sv);
}
@@ -719,62 +835,59 @@ static void hvm_ioreq_server_remove_all_vcpus(struct hvm_ioreq_server *s)
}
static int hvm_ioreq_server_map_pages(struct hvm_ioreq_server *s,
- bool_t is_default, bool_t handle_bufioreq)
+ unsigned long ioreq_pfn,
+ unsigned long bufioreq_pfn)
+{
+ int rc;
+
+ rc = hvm_map_ioreq_page(s, 0, ioreq_pfn);
+ if ( rc )
+ return rc;
+
+ if ( bufioreq_pfn != INVALID_GFN )
+ rc = hvm_map_ioreq_page(s, 1, bufioreq_pfn);
+
+ if ( rc )
+ hvm_unmap_ioreq_page(s, 0);
+
+ return rc;
+}
+
+static int hvm_ioreq_server_setup_pages(struct hvm_ioreq_server *s,
+ bool_t is_default,
+ bool_t handle_bufioreq)
{
struct domain *d = s->domain;
- unsigned long ioreq_pfn, bufioreq_pfn;
+ unsigned long ioreq_pfn = INVALID_GFN;
+ unsigned long bufioreq_pfn = INVALID_GFN;
int rc;
if ( is_default )
{
- ioreq_pfn = d->arch.hvm_domain.params[HVM_PARAM_IOREQ_PFN];
-
/*
* The default ioreq server must handle buffered ioreqs, for
* backwards compatibility.
*/
ASSERT(handle_bufioreq);
- bufioreq_pfn = d->arch.hvm_domain.params[HVM_PARAM_BUFIOREQ_PFN];
- }
- else
- {
- rc = hvm_alloc_ioreq_gmfn(d, &ioreq_pfn);
- if ( rc )
- goto fail1;
-
- if ( handle_bufioreq )
- {
- rc = hvm_alloc_ioreq_gmfn(d, &bufioreq_pfn);
- if ( rc )
- goto fail2;
- }
- }
-
- rc = hvm_map_ioreq_page(s, 0, ioreq_pfn);
- if ( rc )
- goto fail3;
-
- if ( handle_bufioreq )
- {
- rc = hvm_map_ioreq_page(s, 1, bufioreq_pfn);
- if ( rc )
- goto fail4;
+ return hvm_ioreq_server_map_pages(s,
+ d->arch.hvm_domain.params[HVM_PARAM_IOREQ_PFN],
+ d->arch.hvm_domain.params[HVM_PARAM_BUFIOREQ_PFN]);
}
- return 0;
+ rc = hvm_alloc_ioreq_gmfn(d, &ioreq_pfn);
-fail4:
- hvm_unmap_ioreq_page(s, 0);
+ if ( !rc && handle_bufioreq )
+ rc = hvm_alloc_ioreq_gmfn(d, &bufioreq_pfn);
-fail3:
- if ( !is_default && handle_bufioreq )
- hvm_free_ioreq_gmfn(d, bufioreq_pfn);
+ if ( !rc )
+ rc = hvm_ioreq_server_map_pages(s, ioreq_pfn, bufioreq_pfn);
-fail2:
- if ( !is_default )
+ if ( rc )
+ {
hvm_free_ioreq_gmfn(d, ioreq_pfn);
+ hvm_free_ioreq_gmfn(d, bufioreq_pfn);
+ }
-fail1:
return rc;
}
@@ -881,13 +994,6 @@ static void hvm_ioreq_server_enable(struct hvm_ioreq_server *s,
done:
spin_unlock(&s->lock);
-
- /* This check is protected by the domain ioreq server lock. */
- if ( d->arch.hvm_domain.ioreq_server.waiting )
- {
- d->arch.hvm_domain.ioreq_server.waiting = 0;
- domain_unpause(d);
- }
}
static void hvm_ioreq_server_disable(struct hvm_ioreq_server *s,
@@ -917,7 +1023,7 @@ static void hvm_ioreq_server_disable(struct hvm_ioreq_server *s,
static int hvm_ioreq_server_init(struct hvm_ioreq_server *s, struct domain *d,
domid_t domid, bool_t is_default,
- bool_t handle_bufioreq, ioservid_t id)
+ int bufioreq_handling, ioservid_t id)
{
struct vcpu *v;
int rc;
@@ -934,7 +1040,11 @@ static int hvm_ioreq_server_init(struct hvm_ioreq_server *s, struct domain *d,
if ( rc )
return rc;
- rc = hvm_ioreq_server_map_pages(s, is_default, handle_bufioreq);
+ if ( bufioreq_handling == HVM_IOREQSRV_BUFIOREQ_ATOMIC )
+ s->bufioreq_atomic = 1;
+
+ rc = hvm_ioreq_server_setup_pages(
+ s, is_default, bufioreq_handling != HVM_IOREQSRV_BUFIOREQ_OFF);
if ( rc )
goto fail_map;
@@ -960,6 +1070,7 @@ static int hvm_ioreq_server_init(struct hvm_ioreq_server *s, struct domain *d,
static void hvm_ioreq_server_deinit(struct hvm_ioreq_server *s,
bool_t is_default)
{
+ ASSERT(!s->enabled);
hvm_ioreq_server_remove_all_vcpus(s);
hvm_ioreq_server_unmap_pages(s, is_default);
hvm_ioreq_server_free_rangesets(s, is_default);
@@ -992,12 +1103,15 @@ static ioservid_t next_ioservid(struct domain *d)
}
static int hvm_create_ioreq_server(struct domain *d, domid_t domid,
- bool_t is_default, bool_t handle_bufioreq,
+ bool_t is_default, int bufioreq_handling,
ioservid_t *id)
{
struct hvm_ioreq_server *s;
int rc;
+ if ( bufioreq_handling > HVM_IOREQSRV_BUFIOREQ_ATOMIC )
+ return -EINVAL;
+
rc = -ENOMEM;
s = xzalloc(struct hvm_ioreq_server);
if ( !s )
@@ -1010,7 +1124,7 @@ static int hvm_create_ioreq_server(struct domain *d, domid_t domid,
if ( is_default && d->arch.hvm_domain.default_ioreq_server != NULL )
goto fail2;
- rc = hvm_ioreq_server_init(s, d, domid, is_default, handle_bufioreq,
+ rc = hvm_ioreq_server_init(s, d, domid, is_default, bufioreq_handling,
next_ioservid(d));
if ( rc )
goto fail3;
@@ -1062,6 +1176,8 @@ static int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id)
domain_pause(d);
+ hvm_ioreq_server_disable(s, 0);
+
list_del(&s->list_entry);
hvm_ioreq_server_deinit(s, 0);
@@ -1320,11 +1436,10 @@ static void hvm_destroy_all_ioreq_servers(struct domain *d)
{
bool_t is_default = (s == d->arch.hvm_domain.default_ioreq_server);
+ hvm_ioreq_server_disable(s, is_default);
+
if ( is_default )
- {
- hvm_ioreq_server_disable(s, 1);
d->arch.hvm_domain.default_ioreq_server = NULL;
- }
list_del(&s->list_entry);
@@ -1341,13 +1456,14 @@ static int hvm_replace_event_channel(struct vcpu *v, domid_t remote_domid,
{
int old_port, new_port;
- new_port = alloc_unbound_xen_event_channel(v, remote_domid, NULL);
+ new_port = alloc_unbound_xen_event_channel(v->domain, v->vcpu_id,
+ remote_domid, NULL);
if ( new_port < 0 )
return new_port;
/* xchg() ensures that only we call free_xen_event_channel(). */
old_port = xchg(p_port, new_port);
- free_xen_event_channel(v, old_port);
+ free_xen_event_channel(v->domain, old_port);
return 0;
}
@@ -1438,26 +1554,9 @@ int hvm_domain_initialise(struct domain *d)
spin_lock_init(&d->arch.hvm_domain.ioreq_server.lock);
INIT_LIST_HEAD(&d->arch.hvm_domain.ioreq_server.list);
-
- /*
- * In the case where a stub domain is providing emulation for
- * the guest, there is no interlock in the toolstack to prevent
- * the guest from running before the stub domain is ready.
- * Hence the domain must remain paused until at least one ioreq
- * server is created and enabled.
- */
- if ( !is_pvh_domain(d) )
- {
- domain_pause(d);
- d->arch.hvm_domain.ioreq_server.waiting = 1;
- }
-
spin_lock_init(&d->arch.hvm_domain.irq_lock);
spin_lock_init(&d->arch.hvm_domain.uc_lock);
- INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list);
- spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock);
-
hvm_init_cacheattr_region_list(d);
rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
@@ -1465,11 +1564,27 @@ int hvm_domain_initialise(struct domain *d)
goto fail0;
d->arch.hvm_domain.params = xzalloc_array(uint64_t, HVM_NR_PARAMS);
- d->arch.hvm_domain.io_handler = xmalloc(struct hvm_io_handler);
+ d->arch.hvm_domain.io_handler = xzalloc_array(struct hvm_io_handler,
+ NR_IO_HANDLERS);
rc = -ENOMEM;
if ( !d->arch.hvm_domain.params || !d->arch.hvm_domain.io_handler )
goto fail1;
- d->arch.hvm_domain.io_handler->num_slot = 0;
+
+ /* Set the default IO Bitmap. */
+ if ( is_hardware_domain(d) )
+ {
+ d->arch.hvm_domain.io_bitmap = _xmalloc(HVM_IOBITMAP_SIZE, PAGE_SIZE);
+ if ( d->arch.hvm_domain.io_bitmap == NULL )
+ {
+ rc = -ENOMEM;
+ goto fail1;
+ }
+ memset(d->arch.hvm_domain.io_bitmap, ~0, HVM_IOBITMAP_SIZE);
+ }
+ else
+ d->arch.hvm_domain.io_bitmap = hvm_io_bitmap;
+
+ register_dpci_portio_handler(d);
if ( is_pvh_domain(d) )
{
@@ -1492,6 +1607,8 @@ int hvm_domain_initialise(struct domain *d)
rtc_init(d);
+ msixtbl_init(d);
+
register_portio_handler(d, 0xe9, 1, hvm_print_line);
register_portio_handler(d, 0xcf8, 4, hvm_access_cf8);
@@ -1506,6 +1623,8 @@ int hvm_domain_initialise(struct domain *d)
stdvga_deinit(d);
vioapic_deinit(d);
fail1:
+ if ( is_hardware_domain(d) )
+ xfree(d->arch.hvm_domain.io_bitmap);
xfree(d->arch.hvm_domain.io_handler);
xfree(d->arch.hvm_domain.params);
fail0:
@@ -1700,20 +1819,65 @@ static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
return 0;
}
-static bool_t hvm_efer_valid(struct domain *d,
- uint64_t value, uint64_t efer_validbits)
+/* Return a string indicating the error, or NULL for valid. */
+static const char * hvm_efer_valid(const struct vcpu *v, uint64_t value,
+ signed int cr0_pg)
{
- if ( nestedhvm_enabled(d) && cpu_has_svm )
- efer_validbits |= EFER_SVME;
+ unsigned int ext1_ecx = 0, ext1_edx = 0;
+
+ if ( cr0_pg < 0 && !is_hardware_domain(v->domain) )
+ {
+ unsigned int level;
+
+ ASSERT(v == current);
+ hvm_cpuid(0x80000000, &level, NULL, NULL, NULL);
+ if ( level >= 0x80000001 )
+ {
+ unsigned int dummy;
+
+ level = 0x80000001;
+ hvm_funcs.cpuid_intercept(&level, &dummy, &ext1_ecx, &ext1_edx);
+ }
+ }
+ else
+ {
+ ext1_edx = boot_cpu_data.x86_capability[X86_FEATURE_LM / 32];
+ ext1_ecx = boot_cpu_data.x86_capability[X86_FEATURE_SVM / 32];
+ }
+
+ /*
+ * Guests may want to set EFER.SCE and EFER.LME at the same time, so we
+ * can't make the check depend on only X86_FEATURE_SYSCALL (which on VMX
+ * will be clear without the guest having entered 64-bit mode).
+ */
+ if ( (value & EFER_SCE) &&
+ !(ext1_edx & cpufeat_mask(X86_FEATURE_SYSCALL)) &&
+ (cr0_pg >= 0 || !(value & EFER_LME)) )
+ return "SCE without feature";
+
+ if ( (value & (EFER_LME | EFER_LMA)) &&
+ !(ext1_edx & cpufeat_mask(X86_FEATURE_LM)) )
+ return "LME/LMA without feature";
+
+ if ( (value & EFER_LMA) && (!(value & EFER_LME) || !cr0_pg) )
+ return "LMA/LME/CR0.PG inconsistency";
- return !((value & ~efer_validbits) ||
- ((sizeof(long) != 8) && (value & EFER_LME)) ||
- (!cpu_has_svm && (value & EFER_SVME)) ||
- (!cpu_has_nx && (value & EFER_NX)) ||
- (!cpu_has_syscall && (value & EFER_SCE)) ||
- (!cpu_has_lmsl && (value & EFER_LMSLE)) ||
- (!cpu_has_ffxsr && (value & EFER_FFXSE)) ||
- ((value & (EFER_LME|EFER_LMA)) == EFER_LMA));
+ if ( (value & EFER_NX) && !(ext1_edx & cpufeat_mask(X86_FEATURE_NX)) )
+ return "NX without feature";
+
+ if ( (value & EFER_SVME) &&
+ (!(ext1_ecx & cpufeat_mask(X86_FEATURE_SVM)) ||
+ !nestedhvm_enabled(v->domain)) )
+ return "SVME without nested virt";
+
+ if ( (value & EFER_LMSLE) && !cpu_has_lmsl )
+ return "LMSLE without support";
+
+ if ( (value & EFER_FFXSE) &&
+ !(ext1_edx & cpufeat_mask(X86_FEATURE_FFXSR)) )
+ return "FFXSE without feature";
+
+ return NULL;
}
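hvm_efer_valid() now reports what it rejected and, for guest-initiated writes (cr0_pg < 0), validates against the feature set the guest actually sees via CPUID leaf 0x80000001 rather than raw host capabilities. One of the consistency checks restated as a stand-alone toy (bit positions are the architectural ones, not taken from this patch):

    #include <stdio.h>

    #define EFER_LME (1u << 8)
    #define EFER_LMA (1u << 10)

    static const char *toy_efer_valid(unsigned int value, int cr0_pg)
    {
        if ( (value & EFER_LMA) && (!(value & EFER_LME) || !cr0_pg) )
            return "LMA/LME/CR0.PG inconsistency";
        return NULL;                       /* NULL means valid, as above */
    }

    int main(void)
    {
        const char *e1 = toy_efer_valid(EFER_LMA, 1);
        const char *e2 = toy_efer_valid(EFER_LMA | EFER_LME, 1);

        printf("%s\n", e1 ? e1 : "valid"); /* LMA without LME: rejected */
        printf("%s\n", e2 ? e2 : "valid"); /* LMA+LME with CR0.PG=1: valid */
        return 0;
    }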
/* These reserved bits in lower 32 remain 0 after any load of CR0 */
@@ -1791,7 +1955,7 @@ static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
struct vcpu *v;
struct hvm_hw_cpu ctxt;
struct segment_register seg;
- uint64_t efer_validbits;
+ const char *errstr;
/* Which vcpu is this? */
vcpuid = hvm_load_instance(h);
@@ -1822,12 +1986,11 @@ static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
return -EINVAL;
}
- efer_validbits = EFER_FFXSE | EFER_LMSLE | EFER_LME | EFER_LMA
- | EFER_NX | EFER_SCE;
- if ( !hvm_efer_valid(d, ctxt.msr_efer, efer_validbits) )
+ errstr = hvm_efer_valid(v, ctxt.msr_efer, MASK_EXTR(ctxt.cr0, X86_CR0_PG));
+ if ( errstr )
{
- printk(XENLOG_G_ERR "HVM%d restore: bad EFER %#" PRIx64 "\n",
- d->domain_id, ctxt.msr_efer);
+ printk(XENLOG_G_ERR "%pv: HVM restore: bad EFER %#" PRIx64 " - %s\n",
+ v, ctxt.msr_efer, errstr);
return -EINVAL;
}
@@ -2259,7 +2422,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
v->arch.hvm_vcpu.inject_trap.vector = -1;
- if ( is_pvh_vcpu(v) )
+ if ( is_pvh_domain(d) )
{
v->arch.hvm_vcpu.hcall_64bit = 1; /* PVH 32bitfixme. */
/* This is for hvm_long_mode_enabled(v). */
@@ -2311,9 +2474,10 @@ int hvm_vcpu_initialise(struct vcpu *v)
void hvm_vcpu_destroy(struct vcpu *v)
{
- struct domain *d = v->domain;
+ hvm_all_ioreq_servers_remove_vcpu(v->domain, v);
- hvm_all_ioreq_servers_remove_vcpu(d, v);
+ if ( hvm_altp2m_supported() )
+ altp2m_vcpu_destroy(v);
nestedhvm_vcpu_destroy(v);
@@ -2352,14 +2516,9 @@ void hvm_vcpu_down(struct vcpu *v)
}
}
-static struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
- ioreq_t *p)
+struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
+ ioreq_t *p)
{
-#define CF8_BDF(cf8) (((cf8) & 0x00ffff00) >> 8)
-#define CF8_ADDR_LO(cf8) ((cf8) & 0x000000fc)
-#define CF8_ADDR_HI(cf8) (((cf8) & 0x0f000000) >> 16)
-#define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000))
-
struct hvm_ioreq_server *s;
uint32_t cf8;
uint8_t type;
@@ -2388,9 +2547,19 @@ static struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
type = IOREQ_TYPE_PCI_CONFIG;
addr = ((uint64_t)sbdf << 32) |
- CF8_ADDR_HI(cf8) |
CF8_ADDR_LO(cf8) |
(p->addr & 3);
+ /* AMD extended configuration space access? */
+ if ( CF8_ADDR_HI(cf8) &&
+ d->arch.x86_vendor == X86_VENDOR_AMD &&
+ d->arch.x86 >= 0x10 && d->arch.x86 <= 0x17 )
+ {
+ uint64_t msr_val;
+
+ if ( !rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) &&
+ (msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
+ addr |= CF8_ADDR_HI(cf8);
+ }
}
else
{
@@ -2444,17 +2613,11 @@ static struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
}
return d->arch.hvm_domain.default_ioreq_server;
-
-#undef CF8_ADDR_ENABLED
-#undef CF8_ADDR_HI
-#undef CF8_ADDR_LO
-#undef CF8_BDF
}
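The CF8_* macros deleted above were moved elsewhere in the tree; for reference, this is the decode they perform on the 0xCF8 config-address latch (PCI configuration mechanism #1, with AMD's extended-register bits 27:24 folded into register-offset bits 11:8 by the hunk above):

    #define CF8_BDF(cf8)     (((cf8) & 0x00ffff00) >> 8)   /* bus/dev/fn    */
    #define CF8_ADDR_LO(cf8)  ((cf8) & 0x000000fc)         /* reg bits 7:2  */
    #define CF8_ADDR_HI(cf8) (((cf8) & 0x0f000000) >> 16)  /* reg bits 11:8 */
    #define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000))

    /* Example: cf8 = 0x80000a04 decodes to: enabled, bus 0, device 1,
     * function 2, register offset 0x04. */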
-int hvm_buffered_io_send(ioreq_t *p)
+static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, ioreq_t *p)
{
struct domain *d = current->domain;
- struct hvm_ioreq_server *s = hvm_select_ioreq_server(d, p);
struct hvm_ioreq_page *iorp;
buffered_iopage_t *pg;
buf_ioreq_t bp = { .data = p->data,
@@ -2467,14 +2630,11 @@ int hvm_buffered_io_send(ioreq_t *p)
/* Ensure buffered_iopage fits in a page */
BUILD_BUG_ON(sizeof(buffered_iopage_t) > PAGE_SIZE);
- if ( !s )
- return 0;
-
iorp = &s->bufioreq;
pg = iorp->va;
if ( !pg )
- return 0;
+ return X86EMUL_UNHANDLEABLE;
/*
* Return 0 for the cases we can't deal with:
@@ -2504,51 +2664,63 @@ int hvm_buffered_io_send(ioreq_t *p)
break;
default:
gdprintk(XENLOG_WARNING, "unexpected ioreq size: %u\n", p->size);
- return 0;
+ return X86EMUL_UNHANDLEABLE;
}
spin_lock(&s->bufioreq_lock);
- if ( (pg->write_pointer - pg->read_pointer) >=
+ if ( (pg->ptrs.write_pointer - pg->ptrs.read_pointer) >=
(IOREQ_BUFFER_SLOT_NUM - qw) )
{
/* The queue is full: send the iopacket through the normal path. */
spin_unlock(&s->bufioreq_lock);
- return 0;
+ return X86EMUL_UNHANDLEABLE;
}
- pg->buf_ioreq[pg->write_pointer % IOREQ_BUFFER_SLOT_NUM] = bp;
+ pg->buf_ioreq[pg->ptrs.write_pointer % IOREQ_BUFFER_SLOT_NUM] = bp;
if ( qw )
{
bp.data = p->data >> 32;
- pg->buf_ioreq[(pg->write_pointer+1) % IOREQ_BUFFER_SLOT_NUM] = bp;
+ pg->buf_ioreq[(pg->ptrs.write_pointer+1) % IOREQ_BUFFER_SLOT_NUM] = bp;
}
/* Make the ioreq_t visible /before/ write_pointer. */
wmb();
- pg->write_pointer += qw ? 2 : 1;
+ pg->ptrs.write_pointer += qw ? 2 : 1;
+
+ /* Canonicalize read/write pointers to prevent their overflow. */
+ while ( s->bufioreq_atomic && qw++ < IOREQ_BUFFER_SLOT_NUM &&
+ pg->ptrs.read_pointer >= IOREQ_BUFFER_SLOT_NUM )
+ {
+ union bufioreq_pointers old = pg->ptrs, new;
+ unsigned int n = old.read_pointer / IOREQ_BUFFER_SLOT_NUM;
+
+ new.read_pointer = old.read_pointer - n * IOREQ_BUFFER_SLOT_NUM;
+ new.write_pointer = old.write_pointer - n * IOREQ_BUFFER_SLOT_NUM;
+ cmpxchg(&pg->ptrs.full, old.full, new.full);
+ }
notify_via_xen_event_channel(d, s->bufioreq_evtchn);
spin_unlock(&s->bufioreq_lock);
- return 1;
-}
-
-bool_t hvm_has_dm(struct domain *d)
-{
- return !list_empty(&d->arch.hvm_domain.ioreq_server.list);
+ return X86EMUL_OKAY;
}
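With the switch to the ptrs union, the read/write pointers grow monotonically and are periodically pulled back in lockstep via cmpxchg, but only when the emulator advertised atomic consumption (s->bufioreq_atomic), since a legacy emulator may read the two pointers non-atomically. A worked example of the arithmetic, runnable stand-alone (IOREQ_BUFFER_SLOT_NUM is 511 in the public headers):

    #include <stdio.h>
    #include <stdint.h>

    #define IOREQ_BUFFER_SLOT_NUM 511

    int main(void)
    {
        uint32_t rp = 1022, wp = 1025;
        uint32_t n = rp / IOREQ_BUFFER_SLOT_NUM;       /* n = 2 */

        printf("slots before: rp %u, wp %u\n",
               rp % IOREQ_BUFFER_SLOT_NUM, wp % IOREQ_BUFFER_SLOT_NUM);
        rp -= n * IOREQ_BUFFER_SLOT_NUM;               /* 1022 -> 0 */
        wp -= n * IOREQ_BUFFER_SLOT_NUM;               /* 1025 -> 3 */
        printf("slots after : rp %u, wp %u (unchanged)\n",
               rp % IOREQ_BUFFER_SLOT_NUM, wp % IOREQ_BUFFER_SLOT_NUM);
        return 0;
    }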
-bool_t hvm_send_assist_req_to_ioreq_server(struct hvm_ioreq_server *s,
- ioreq_t *proto_p)
+int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
+ bool_t buffered)
{
struct vcpu *curr = current;
struct domain *d = curr->domain;
struct hvm_ioreq_vcpu *sv;
+ ASSERT(s);
+
+ if ( buffered )
+ return hvm_send_buffered_ioreq(s, proto_p);
+
if ( unlikely(!vcpu_start_shutdown_deferral(curr)) )
- return 0; /* implicitly bins the i/o operation */
+ return X86EMUL_RETRY;
list_for_each_entry ( sv,
&s->ioreq_vcpu_list,
@@ -2561,18 +2733,16 @@ bool_t hvm_send_assist_req_to_ioreq_server(struct hvm_ioreq_server *s,
if ( unlikely(p->state != STATE_IOREQ_NONE) )
{
- gdprintk(XENLOG_ERR,
- "Device model set bad IO state %d.\n",
- p->state);
- goto crash;
+ gprintk(XENLOG_ERR, "device model set bad IO state %d\n",
+ p->state);
+ break;
}
if ( unlikely(p->vp_eport != port) )
{
- gdprintk(XENLOG_ERR,
- "Device model set bad event channel %d.\n",
- p->vp_eport);
- goto crash;
+ gprintk(XENLOG_ERR, "device model set bad event channel %d\n",
+ p->vp_eport);
+ break;
}
proto_p->state = STATE_IOREQ_NONE;
@@ -2588,68 +2758,28 @@ bool_t hvm_send_assist_req_to_ioreq_server(struct hvm_ioreq_server *s,
*/
p->state = STATE_IOREQ_READY;
notify_via_xen_event_channel(d, port);
- break;
- }
- }
-
- return 1;
-
- crash:
- domain_crash(d);
- return 0;
-}
-static bool_t hvm_complete_assist_req(ioreq_t *p)
-{
- switch ( p->type )
- {
- case IOREQ_TYPE_COPY:
- case IOREQ_TYPE_PIO:
- if ( p->dir == IOREQ_READ )
- {
- if ( !p->data_is_ptr )
- p->data = ~0ul;
- else
- {
- int i, step = p->df ? -p->size : p->size;
- uint32_t data = ~0;
-
- for ( i = 0; i < p->count; i++ )
- hvm_copy_to_guest_phys(p->data + step * i, &data,
- p->size);
- }
+ sv->pending = 1;
+ return X86EMUL_RETRY;
}
- /* FALLTHRU */
- default:
- p->state = STATE_IORESP_READY;
- hvm_io_assist(p);
- break;
}
- return 1;
-}
-
-bool_t hvm_send_assist_req(ioreq_t *p)
-{
- struct hvm_ioreq_server *s = hvm_select_ioreq_server(current->domain, p);
-
- if ( !s )
- return hvm_complete_assist_req(p);
-
- return hvm_send_assist_req_to_ioreq_server(s, p);
+ return X86EMUL_UNHANDLEABLE;
}
-void hvm_broadcast_assist_req(ioreq_t *p)
+unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool_t buffered)
{
struct domain *d = current->domain;
struct hvm_ioreq_server *s;
-
- ASSERT(p->type == IOREQ_TYPE_INVALIDATE);
+ unsigned int failed = 0;
list_for_each_entry ( s,
&d->arch.hvm_domain.ioreq_server.list,
list_entry )
- (void) hvm_send_assist_req_to_ioreq_server(s, p);
+ if ( hvm_send_ioreq(s, p, buffered) == X86EMUL_UNHANDLEABLE )
+ failed++;
+
+ return failed;
}
void hvm_hlt(unsigned long rflags)
@@ -2667,7 +2797,7 @@ void hvm_hlt(unsigned long rflags)
if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
return hvm_vcpu_down(curr);
- do_sched_op_compat(SCHEDOP_block, 0);
+ do_sched_op(SCHEDOP_block, guest_handle_from_ptr(NULL, void));
HVMTRACE_1D(HLT, /* pending = */ vcpu_runnable(curr));
}
@@ -2678,9 +2808,9 @@ void hvm_triple_fault(void)
struct domain *d = v->domain;
u8 reason = d->arch.hvm_domain.params[HVM_PARAM_TRIPLE_FAULT_REASON];
- gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - "
- "invoking HVM shutdown action %"PRIu8".\n",
- v->vcpu_id, reason);
+ gprintk(XENLOG_INFO,
+ "Triple fault - invoking HVM shutdown action %d\n",
+ reason);
domain_shutdown(d, reason);
}
@@ -2741,19 +2871,21 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
p2m_type_t p2mt;
p2m_access_t p2ma;
mfn_t mfn;
- struct vcpu *v = current;
- struct p2m_domain *p2m;
+ struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
+ struct p2m_domain *p2m, *hostp2m;
int rc, fall_through = 0, paged = 0;
int sharing_enomem = 0;
- mem_event_request_t *req_ptr = NULL;
+ vm_event_request_t *req_ptr = NULL;
+ bool_t ap2m_active;
/* On Nested Virtualization, walk the guest page table.
* If this succeeds, all is fine.
* If this fails, inject a nested page fault into the guest.
*/
- if ( nestedhvm_enabled(v->domain)
- && nestedhvm_vcpu_in_guestmode(v)
- && nestedhvm_paging_mode_hap(v) )
+ if ( nestedhvm_enabled(currd)
+ && nestedhvm_vcpu_in_guestmode(curr)
+ && nestedhvm_paging_mode_hap(curr) )
{
int rv;
@@ -2765,7 +2897,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
* the same as for shadow paging.
*/
- rv = nestedhvm_hap_nested_page_fault(v, &gpa,
+ rv = nestedhvm_hap_nested_page_fault(curr, &gpa,
npfec.read_access,
npfec.write_access,
npfec.insn_fetch);
@@ -2795,8 +2927,8 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
* - 32-bit WinXP (& older Windows) on AMD CPUs for LAPIC accesses,
* - newer Windows (like Server 2012) for HPET accesses.
*/
- if ( !nestedhvm_vcpu_in_guestmode(v)
- && is_hvm_vcpu(v)
+ if ( !nestedhvm_vcpu_in_guestmode(curr)
+ && is_hvm_domain(currd)
&& hvm_mmio_internal(gpa) )
{
if ( !handle_mmio_with_translation(gla, gpa >> PAGE_SHIFT, npfec) )
@@ -2805,17 +2937,38 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
goto out;
}
- p2m = p2m_get_hostp2m(v->domain);
- mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma,
+ ap2m_active = altp2m_active(currd);
+
+ /*
+ * Take a lock on the host p2m speculatively, to avoid potential
+ * locking order problems later and to handle unshare etc.
+ */
+ hostp2m = p2m_get_hostp2m(currd);
+ mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma,
P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0),
NULL);
+ if ( ap2m_active )
+ {
+ if ( p2m_altp2m_lazy_copy(curr, gpa, gla, npfec, &p2m) )
+ {
+ /* entry was lazily copied from host -- retry */
+ __put_gfn(hostp2m, gfn);
+ rc = 1;
+ goto out;
+ }
+
+ mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL);
+ }
+ else
+ p2m = hostp2m;
+
/* Check access permissions first, then handle faults */
if ( mfn_x(mfn) != INVALID_MFN )
{
bool_t violation;
- /* If the access is against the permissions, then send to mem_event */
+ /* If the access is against the permissions, then send to vm_event */
switch (p2ma)
{
case p2m_access_n:
@@ -2849,6 +3002,20 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
if ( violation )
{
+ /* Should #VE be emulated for this fault? */
+ if ( p2m_is_altp2m(p2m) && !cpu_has_vmx_virt_exceptions )
+ {
+ bool_t sve;
+
+ p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, &sve);
+
+ if ( !sve && altp2m_vcpu_emulate_ve(curr) )
+ {
+ rc = 1;
+ goto out_put_gfn;
+ }
+ }
+
if ( p2m_mem_access_check(gpa, gla, npfec, &req_ptr) )
{
fall_through = 1;
@@ -2865,12 +3032,15 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
* to the mmio handler.
*/
if ( (p2mt == p2m_mmio_dm) ||
- (npfec.write_access && (p2mt == p2m_ram_ro)) )
+ (npfec.write_access &&
+ (p2m_is_discard_write(p2mt) || (p2mt == p2m_mmio_write_dm))) )
{
- put_gfn(p2m->domain, gfn);
+ __put_gfn(p2m, gfn);
+ if ( ap2m_active )
+ __put_gfn(hostp2m, gfn);
rc = 0;
- if ( unlikely(is_pvh_vcpu(v)) )
+ if ( unlikely(is_pvh_domain(currd)) )
goto out;
if ( !handle_mmio_with_translation(gla, gpa >> PAGE_SHIFT, npfec) )
@@ -2886,9 +3056,9 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
/* Mem sharing: unshare the page and try again */
if ( npfec.write_access && (p2mt == p2m_ram_shared) )
{
- ASSERT(!p2m_is_nestedp2m(p2m));
+ ASSERT(p2m_is_hostp2m(p2m));
sharing_enomem =
- (mem_sharing_unshare_page(p2m->domain, gfn, 0) < 0);
+ (mem_sharing_unshare_page(currd, gfn, 0) < 0);
rc = 1;
goto out_put_gfn;
}
@@ -2896,6 +3066,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
/* Spurious fault? PoD and log-dirty also take this path. */
if ( p2m_is_ram(p2mt) )
{
+ rc = 1;
/*
* Page log dirty is always done with order 0. If this mfn resides in
* a large page, we do not change other pages type within that large
@@ -2903,20 +3074,18 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
*/
if ( npfec.write_access )
{
- paging_mark_dirty(v->domain, mfn_x(mfn));
- p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
- }
- rc = 1;
- goto out_put_gfn;
- }
+ paging_mark_dirty(currd, mfn_x(mfn));
+ /*
+ * If p2m is really an altp2m, unlock here to avoid lock ordering
+ * violation when the change below is propagated from host p2m.
+ */
+ if ( ap2m_active )
+ __put_gfn(p2m, gfn);
+ p2m_change_type_one(currd, gfn, p2m_ram_logdirty, p2m_ram_rw);
+ __put_gfn(ap2m_active ? hostp2m : p2m, gfn);
- /* Shouldn't happen: Maybe the guest was writing to a r/o grant mapping? */
- if ( npfec.write_access && (p2mt == p2m_grant_map_ro) )
- {
- gdprintk(XENLOG_WARNING,
- "trying to write to read-only grant mapping\n");
- hvm_inject_hw_exception(TRAP_gp_fault, 0);
- rc = 1;
+ goto out;
+ }
goto out_put_gfn;
}
@@ -2926,28 +3095,30 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
rc = fall_through;
out_put_gfn:
- put_gfn(p2m->domain, gfn);
+ __put_gfn(p2m, gfn);
+ if ( ap2m_active )
+ __put_gfn(hostp2m, gfn);
out:
/* All of these are delayed until we exit, since we might
* sleep on event ring wait queues, and we must not hold
* locks in such circumstance */
if ( paged )
- p2m_mem_paging_populate(v->domain, gfn);
+ p2m_mem_paging_populate(currd, gfn);
if ( sharing_enomem )
{
int rv;
- if ( (rv = mem_sharing_notify_enomem(v->domain, gfn, 1)) < 0 )
+ if ( (rv = mem_sharing_notify_enomem(currd, gfn, 1)) < 0 )
{
gdprintk(XENLOG_ERR, "Domain %hu attempt to unshare "
"gfn %lx, ENOMEM and no helper (rc %d)\n",
- v->domain->domain_id, gfn, rv);
+ currd->domain_id, gfn, rv);
/* Crash the domain */
rc = 0;
}
}
if ( req_ptr )
{
- mem_access_send_req(v->domain, req_ptr);
+ mem_access_send_req(currd, req_ptr);
xfree(req_ptr);
}
return rc;
@@ -2956,11 +3127,14 @@ out:
int hvm_handle_xsetbv(u32 index, u64 new_bv)
{
struct segment_register sreg;
+ struct vcpu *curr = current;
- hvm_get_segment_register(current, x86_seg_ss, &sreg);
+ hvm_get_segment_register(curr, x86_seg_ss, &sreg);
if ( sreg.attr.fields.dpl != 0 )
goto err;
+ hvm_event_crX(XCR0, new_bv, curr->arch.xcr0);
+
if ( handle_xsetbv(index, new_bv) )
goto err;
@@ -2973,15 +3147,16 @@ err:
int hvm_set_efer(uint64_t value)
{
struct vcpu *v = current;
- uint64_t efer_validbits;
+ const char *errstr;
value &= ~EFER_LMA;
- efer_validbits = EFER_FFXSE | EFER_LMSLE | EFER_LME | EFER_NX | EFER_SCE;
- if ( !hvm_efer_valid(v->domain, value, efer_validbits) )
+ errstr = hvm_efer_valid(v, value, -1);
+ if ( errstr )
{
- gdprintk(XENLOG_WARNING, "Trying to set reserved bit in "
- "EFER: %#"PRIx64"\n", value);
+ printk(XENLOG_G_WARNING
+ "%pv: Invalid EFER update: %#"PRIx64" -> %#"PRIx64" - %s\n",
+ v, v->arch.hvm_vcpu.guest_efer, value, errstr);
hvm_inject_hw_exception(TRAP_gp_fault, 0);
return X86EMUL_EXCEPTION;
}
@@ -3053,13 +3228,13 @@ int hvm_mov_to_cr(unsigned int cr, unsigned int gpr)
switch ( cr )
{
case 0:
- return hvm_set_cr0(val);
+ return hvm_set_cr0(val, 1);
case 3:
- return hvm_set_cr3(val);
+ return hvm_set_cr3(val, 1);
case 4:
- return hvm_set_cr4(val);
+ return hvm_set_cr4(val, 1);
case 8:
vlapic_set_reg(vcpu_vlapic(curr), APIC_TASKPRI, ((val & 0x0f) << 4));
@@ -3156,12 +3331,13 @@ static void hvm_update_cr(struct vcpu *v, unsigned int cr, unsigned long value)
hvm_update_guest_cr(v, cr);
}
-int hvm_set_cr0(unsigned long value)
+int hvm_set_cr0(unsigned long value, bool_t may_defer)
{
struct vcpu *v = current;
struct domain *d = v->domain;
unsigned long gfn, old_value = v->arch.hvm_vcpu.guest_cr[0];
struct page_info *page;
+ struct arch_domain *currad = &v->domain->arch;
HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value);
@@ -3191,6 +3367,21 @@ int hvm_set_cr0(unsigned long value)
goto gpf;
}
+ if ( may_defer && unlikely(currad->monitor.write_ctrlreg_enabled &
+ monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) )
+ {
+ ASSERT(currad->event_write_data != NULL);
+
+ if ( hvm_event_crX(CR0, value, old_value) )
+ {
+ /* The actual write will occur in hvm_do_resume(), if permitted. */
+ currad->event_write_data[v->vcpu_id].do_write.cr0 = 1;
+ currad->event_write_data[v->vcpu_id].cr0 = value;
+
+ return X86EMUL_OKAY;
+ }
+ }
+
if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) )
{
if ( v->arch.hvm_vcpu.guest_efer & EFER_LME )
@@ -3257,7 +3448,6 @@ int hvm_set_cr0(unsigned long value)
hvm_funcs.handle_cd(v, value);
hvm_update_cr(v, 0, value);
- hvm_memory_event_cr0(value, old_value);
if ( (value ^ old_value) & X86_CR0_PG ) {
if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
@@ -3273,11 +3463,27 @@ int hvm_set_cr0(unsigned long value)
return X86EMUL_EXCEPTION;
}
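hvm_set_cr0/cr3/cr4 and hvm_msr_write_intercept all gain a may_defer flag: if a vm_event monitor subscribed to the write (monitor.write_ctrlreg_enabled), the new value is parked in event_write_data and only replayed by hvm_do_resume() once the monitor has had its say. The deferral pattern in miniature (a sketch, not Xen code):

    struct pending_write { int do_write; unsigned long value; };

    static void intercept(struct pending_write *w, unsigned long value)
    {
        w->do_write = 1;      /* stash the write instead of applying it */
        w->value = value;
    }

    static void resume(struct pending_write *w, unsigned long *reg)
    {
        if ( w->do_write )    /* monitor did not veto: replay the write */
        {
            *reg = w->value;
            w->do_write = 0;
        }
    }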
-int hvm_set_cr3(unsigned long value)
+int hvm_set_cr3(unsigned long value, bool_t may_defer)
{
struct vcpu *v = current;
struct page_info *page;
- unsigned long old;
+ unsigned long old = v->arch.hvm_vcpu.guest_cr[3];
+ struct arch_domain *currad = &v->domain->arch;
+
+ if ( may_defer && unlikely(currad->monitor.write_ctrlreg_enabled &
+ monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
+ {
+ ASSERT(currad->event_write_data != NULL);
+
+ if ( hvm_event_crX(CR3, value, old) )
+ {
+ /* The actual write will occur in hvm_do_resume(), if permitted. */
+ currad->event_write_data[v->vcpu_id].do_write.cr3 = 1;
+ currad->event_write_data[v->vcpu_id].cr3 = value;
+
+ return X86EMUL_OKAY;
+ }
+ }
if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
(value != v->arch.hvm_vcpu.guest_cr[3]) )
@@ -3295,10 +3501,8 @@ int hvm_set_cr3(unsigned long value)
HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
}
- old=v->arch.hvm_vcpu.guest_cr[3];
v->arch.hvm_vcpu.guest_cr[3] = value;
paging_update_cr3(v);
- hvm_memory_event_cr3(value, old);
return X86EMUL_OKAY;
bad_cr3:
@@ -3307,10 +3511,11 @@ int hvm_set_cr3(unsigned long value)
return X86EMUL_UNHANDLEABLE;
}
-int hvm_set_cr4(unsigned long value)
+int hvm_set_cr4(unsigned long value, bool_t may_defer)
{
struct vcpu *v = current;
unsigned long old_cr;
+ struct arch_domain *currad = &v->domain->arch;
if ( value & hvm_cr4_guest_reserved_bits(v, 0) )
{
@@ -3338,10 +3543,24 @@ int hvm_set_cr4(unsigned long value)
goto gpf;
}
- hvm_update_cr(v, 4, value);
- hvm_memory_event_cr4(value, old_cr);
+ if ( may_defer && unlikely(currad->monitor.write_ctrlreg_enabled &
+ monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) )
+ {
+ ASSERT(currad->event_write_data != NULL);
- /*
+ if ( hvm_event_crX(CR4, value, old_cr) )
+ {
+ /* The actual write will occur in hvm_do_resume(), if permitted. */
+ currad->event_write_data[v->vcpu_id].do_write.cr4 = 1;
+ currad->event_write_data[v->vcpu_id].cr4 = value;
+
+ return X86EMUL_OKAY;
+ }
+ }
+
+ hvm_update_cr(v, 4, value);
+
+ /*
* Modifying CR4.{PSE,PAE,PGE,SMEP}, or clearing CR4.PCIDE
* invalidate all TLB entries.
*/
@@ -3447,8 +3666,8 @@ int hvm_virtual_to_linear_addr(
/* On non-NULL return, we leave this function holding an additional
* ref on the underlying mfn, if any */
-static void *__hvm_map_guest_frame(unsigned long gfn, bool_t writable,
- bool_t permanent)
+static void *_hvm_map_guest_frame(unsigned long gfn, bool_t permanent,
+ bool_t *writable)
{
void *map;
p2m_type_t p2mt;
@@ -3471,7 +3690,12 @@ static void *__hvm_map_guest_frame(unsigned long gfn, bool_t writable,
}
if ( writable )
- paging_mark_dirty(d, page_to_mfn(page));
+ {
+ if ( !p2m_is_discard_write(p2mt) )
+ paging_mark_dirty(d, page_to_mfn(page));
+ else
+ *writable = 0;
+ }
if ( !permanent )
return __map_domain_page(page);
@@ -3483,14 +3707,16 @@ static void *__hvm_map_guest_frame(unsigned long gfn, bool_t writable,
return map;
}
-void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent)
+void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
+ bool_t *writable)
{
- return __hvm_map_guest_frame(gfn, 1, permanent);
+ *writable = 1;
+ return _hvm_map_guest_frame(gfn, permanent, writable);
}
void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent)
{
- return __hvm_map_guest_frame(gfn, 0, permanent);
+ return _hvm_map_guest_frame(gfn, permanent, NULL);
}
void hvm_unmap_guest_frame(void *p, bool_t permanent)
@@ -3510,7 +3736,7 @@ void hvm_unmap_guest_frame(void *p, bool_t permanent)
put_page(mfn_to_page(mfn));
}
-static void *hvm_map_entry(unsigned long va)
+static void *hvm_map_entry(unsigned long va, bool_t *writable)
{
unsigned long gfn;
uint32_t pfec;
@@ -3533,7 +3759,7 @@ static void *hvm_map_entry(unsigned long va)
if ( (pfec == PFEC_page_paged) || (pfec == PFEC_page_shared) )
goto fail;
- v = hvm_map_guest_frame_rw(gfn, 0);
+ v = hvm_map_guest_frame_rw(gfn, 0, writable);
if ( v == NULL )
goto fail;
@@ -3555,6 +3781,7 @@ static int hvm_load_segment_selector(
struct segment_register desctab, cs, segr;
struct desc_struct *pdesc, desc;
u8 dpl, rpl, cpl;
+ bool_t writable;
int fault_type = TRAP_invalid_tss;
struct cpu_user_regs *regs = guest_cpu_user_regs();
struct vcpu *v = current;
@@ -3591,7 +3818,7 @@ static int hvm_load_segment_selector(
if ( ((sel & 0xfff8) + 7) > desctab.limit )
goto fail;
- pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8));
+ pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8), &writable);
if ( pdesc == NULL )
goto hvm_map_fail;
@@ -3650,6 +3877,7 @@ static int hvm_load_segment_selector(
break;
}
} while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */
+ writable && /* except if we are to discard writes */
(cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) );
/* Force the Accessed flag in our local copy. */
@@ -3687,6 +3915,7 @@ void hvm_task_switch(
struct cpu_user_regs *regs = guest_cpu_user_regs();
struct segment_register gdt, tr, prev_tr, segr;
struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc;
+ bool_t otd_writable, ntd_writable;
unsigned long eflags;
int exn_raised, rc;
struct {
@@ -3713,11 +3942,12 @@ void hvm_task_switch(
goto out;
}
- optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8));
+ optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8),
+ &otd_writable);
if ( optss_desc == NULL )
goto out;
- nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8));
+ nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8), &ntd_writable);
if ( nptss_desc == NULL )
goto out;
@@ -3803,7 +4033,7 @@ void hvm_task_switch(
goto out;
- if ( hvm_set_cr3(tss.cr3) )
+ if ( hvm_set_cr3(tss.cr3, 1) )
goto out;
regs->eip = tss.eip;
@@ -3849,11 +4079,11 @@ void hvm_task_switch(
v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS;
hvm_update_guest_cr(v, 0);
- if ( (taskswitch_reason == TSW_iret) ||
- (taskswitch_reason == TSW_jmp) )
+ if ( (taskswitch_reason == TSW_iret ||
+ taskswitch_reason == TSW_jmp) && otd_writable )
clear_bit(41, optss_desc); /* clear B flag of old task */
- if ( taskswitch_reason != TSW_iret )
+ if ( taskswitch_reason != TSW_iret && ntd_writable )
set_bit(41, nptss_desc); /* set B flag of new task */
if ( errcode >= 0 )
@@ -3969,7 +4199,7 @@ static enum hvm_copy_result __hvm_copy(
if ( flags & HVMCOPY_to_guest )
{
- if ( p2mt == p2m_ram_ro )
+ if ( p2m_is_discard_write(p2mt) )
{
static unsigned long lastpage;
if ( xchg(&lastpage, gfn) != gfn )
@@ -4063,7 +4293,7 @@ static enum hvm_copy_result __hvm_clear(paddr_t addr, int size)
p = (char *)__map_domain_page(page) + (addr & ~PAGE_MASK);
- if ( p2mt == p2m_ram_ro )
+ if ( p2m_is_discard_write(p2mt) )
{
static unsigned long lastpage;
if ( xchg(&lastpage, gfn) != gfn )
@@ -4216,6 +4446,10 @@ void hvm_hypervisor_cpuid_leaf(uint32_t sub_idx,
* foreign pages) has valid IOMMU entries.
*/
*eax |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
+
+ /* Indicate presence of vcpu id and set it in ebx */
+ *eax |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
+ *ebx = current->vcpu_id;
}
}
@@ -4343,12 +4577,41 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
}
}
+static uint64_t _hvm_rdtsc_intercept(void)
+{
+ struct vcpu *curr = current;
+#if !defined(NDEBUG) || defined(PERF_COUNTERS)
+ struct domain *currd = curr->domain;
+
+ if ( currd->arch.vtsc )
+ switch ( hvm_guest_x86_mode(curr) )
+ {
+ struct segment_register sreg;
+
+ case 8:
+ case 4:
+ case 2:
+ hvm_get_segment_register(curr, x86_seg_ss, &sreg);
+ if ( unlikely(sreg.attr.fields.dpl) )
+ {
+ case 1:
+ currd->arch.vtsc_usercount++;
+ break;
+ }
+ /* fall through */
+ case 0:
+ currd->arch.vtsc_kerncount++;
+ break;
+ }
+#endif
+
+ return hvm_get_guest_tsc(curr);
+}
+
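_hvm_rdtsc_intercept() above places a case label inside an if () body: protected modes with a non-zero SS.DPL, and vm86 (which enters directly at case 1), bump the user counter, while real mode and DPL-0 reads fall through to the kernel counter. The same control-flow shape in isolation (valid, if unusual, C):

    static void classify(int mode, int dpl, int *user, int *kern)
    {
        switch ( mode )
        {
        case 8: case 4: case 2:
            if ( dpl )
            {
        case 1:
                ++*user;
                break;
            }
            /* fall through */
        case 0:
            ++*kern;
            break;
        }
    }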
void hvm_rdtsc_intercept(struct cpu_user_regs *regs)
{
- uint64_t tsc;
- struct vcpu *v = current;
+ uint64_t tsc = _hvm_rdtsc_intercept();
- tsc = hvm_get_guest_tsc(v);
regs->eax = (uint32_t)tsc;
regs->edx = (uint32_t)(tsc >> 32);
@@ -4376,7 +4639,7 @@ int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
break;
case MSR_IA32_TSC:
- *msr_content = hvm_get_guest_tsc(v);
+ *msr_content = _hvm_rdtsc_intercept();
break;
case MSR_IA32_TSC_ADJUST:
@@ -4472,12 +4735,14 @@ int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
goto out;
}
-int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
+int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
+ bool_t may_defer)
{
struct vcpu *v = current;
bool_t mtrr;
unsigned int edx, index;
int ret = X86EMUL_OKAY;
+ struct arch_domain *currad = &current->domain->arch;
HVMTRACE_3D(MSR_WRITE, msr,
(uint32_t)msr_content, (uint32_t)(msr_content >> 32));
@@ -4485,7 +4750,18 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
hvm_cpuid(1, NULL, NULL, NULL, &edx);
mtrr = !!(edx & cpufeat_mask(X86_FEATURE_MTRR));
- hvm_memory_event_msr(msr, msr_content);
+ if ( may_defer && unlikely(currad->monitor.mov_to_msr_enabled) )
+ {
+ ASSERT(currad->event_write_data != NULL);
+
+ /* The actual write will occur in hvm_do_resume() (if permitted). */
+ currad->event_write_data[v->vcpu_id].do_write.msr = 1;
+ currad->event_write_data[v->vcpu_id].msr = msr;
+ currad->event_write_data[v->vcpu_id].value = msr_content;
+
+ hvm_event_msr(msr, msr_content);
+ return X86EMUL_OKAY;
+ }
switch ( msr )
{
@@ -4662,7 +4938,6 @@ static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
switch ( cmd & MEMOP_CMD_MASK )
{
- case XENMEM_memory_map:
case XENMEM_machine_memory_map:
case XENMEM_machphys_mapping:
return -ENOSYS;
@@ -4692,7 +4967,7 @@ static long hvm_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
}
static long hvm_vcpu_op(
- int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
+ int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
{
long rc;
@@ -4738,7 +5013,6 @@ static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
switch ( cmd & MEMOP_CMD_MASK )
{
- case XENMEM_memory_map:
case XENMEM_machine_memory_map:
case XENMEM_machphys_mapping:
return -ENOSYS;
@@ -4751,7 +5025,7 @@ static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
}
static long hvm_vcpu_op_compat32(
- int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
+ int cmd, unsigned vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
{
long rc;
@@ -4848,12 +5122,16 @@ static hvm_hypercall_t *const pvh_hypercall64_table[NR_hypercalls] = {
HYPERCALL(hvm_op),
HYPERCALL(sysctl),
HYPERCALL(domctl),
+ HYPERCALL(xenpmu_op),
[ __HYPERVISOR_arch_1 ] = (hvm_hypercall_t *)paging_domctl_continuation
};
+extern const uint8_t hypercall_args_table[], compat_hypercall_args_table[];
+
int hvm_do_hypercall(struct cpu_user_regs *regs)
{
struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
struct segment_register sreg;
int mode = hvm_guest_x86_mode(curr);
uint32_t eax = regs->eax;
@@ -4874,12 +5152,12 @@ int hvm_do_hypercall(struct cpu_user_regs *regs)
break;
}
- if ( (eax & 0x80000000) && is_viridian_domain(curr->domain) )
+ if ( (eax & 0x80000000) && is_viridian_domain(currd) )
return viridian_hypercall(regs);
if ( (eax >= NR_hypercalls) ||
- (is_pvh_vcpu(curr) ? !pvh_hypercall64_table[eax]
- : !hvm_hypercall32_table[eax]) )
+ (is_pvh_domain(currd) ? !pvh_hypercall64_table[eax]
+ : !hvm_hypercall32_table[eax]) )
{
regs->eax = -ENOSYS;
return HVM_HCALL_completed;
@@ -4889,36 +5167,95 @@ int hvm_do_hypercall(struct cpu_user_regs *regs)
if ( mode == 8 )
{
+ unsigned long rdi = regs->rdi;
+ unsigned long rsi = regs->rsi;
+ unsigned long rdx = regs->rdx;
+ unsigned long r10 = regs->r10;
+ unsigned long r8 = regs->r8;
+ unsigned long r9 = regs->r9;
+
HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx, %lx)",
- eax, regs->rdi, regs->rsi, regs->rdx,
- regs->r10, regs->r8, regs->r9);
+ eax, rdi, rsi, rdx, r10, r8, r9);
+
+#ifndef NDEBUG
+ /* Deliberately corrupt parameter regs not used by this hypercall. */
+ switch ( hypercall_args_table[eax] )
+ {
+ case 0: rdi = 0xdeadbeefdeadf00dUL;
+ case 1: rsi = 0xdeadbeefdeadf00dUL;
+ case 2: rdx = 0xdeadbeefdeadf00dUL;
+ case 3: r10 = 0xdeadbeefdeadf00dUL;
+ case 4: r8 = 0xdeadbeefdeadf00dUL;
+ case 5: r9 = 0xdeadbeefdeadf00dUL;
+ }
+#endif
curr->arch.hvm_vcpu.hcall_64bit = 1;
- if ( is_pvh_vcpu(curr) )
- regs->rax = pvh_hypercall64_table[eax](regs->rdi, regs->rsi,
- regs->rdx, regs->r10,
- regs->r8, regs->r9);
- else
- regs->rax = hvm_hypercall64_table[eax](regs->rdi, regs->rsi,
- regs->rdx, regs->r10,
- regs->r8, regs->r9);
+ regs->rax = (is_pvh_domain(currd)
+ ? pvh_hypercall64_table
+ : hvm_hypercall64_table)[eax](rdi, rsi, rdx, r10, r8, r9);
curr->arch.hvm_vcpu.hcall_64bit = 0;
+
+#ifndef NDEBUG
+ if ( !curr->arch.hvm_vcpu.hcall_preempted )
+ {
+ /* Deliberately corrupt parameter regs used by this hypercall. */
+ switch ( hypercall_args_table[eax] )
+ {
+ case 6: regs->r9 = 0xdeadbeefdeadf00dUL;
+ case 5: regs->r8 = 0xdeadbeefdeadf00dUL;
+ case 4: regs->r10 = 0xdeadbeefdeadf00dUL;
+ case 3: regs->edx = 0xdeadbeefdeadf00dUL;
+ case 2: regs->esi = 0xdeadbeefdeadf00dUL;
+ case 1: regs->edi = 0xdeadbeefdeadf00dUL;
+ }
+ }
+#endif
}
- else if ( unlikely(is_pvh_vcpu(curr)) )
+ else if ( unlikely(is_pvh_domain(currd)) )
regs->_eax = -ENOSYS; /* PVH 32bitfixme. */
else
{
+ unsigned int ebx = regs->_ebx;
+ unsigned int ecx = regs->_ecx;
+ unsigned int edx = regs->_edx;
+ unsigned int esi = regs->_esi;
+ unsigned int edi = regs->_edi;
+ unsigned int ebp = regs->_ebp;
+
HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x, %x)", eax,
- (uint32_t)regs->ebx, (uint32_t)regs->ecx,
- (uint32_t)regs->edx, (uint32_t)regs->esi,
- (uint32_t)regs->edi, (uint32_t)regs->ebp);
+ ebx, ecx, edx, esi, edi, ebp);
+
+#ifndef NDEBUG
+ /* Deliberately corrupt parameter regs not used by this hypercall. */
+ switch ( compat_hypercall_args_table[eax] )
+ {
+ case 0: ebx = 0xdeadf00d;
+ case 1: ecx = 0xdeadf00d;
+ case 2: edx = 0xdeadf00d;
+ case 3: esi = 0xdeadf00d;
+ case 4: edi = 0xdeadf00d;
+ case 5: ebp = 0xdeadf00d;
+ }
+#endif
+
+ regs->_eax = hvm_hypercall32_table[eax](ebx, ecx, edx, esi, edi, ebp);
- regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx,
- (uint32_t)regs->ecx,
- (uint32_t)regs->edx,
- (uint32_t)regs->esi,
- (uint32_t)regs->edi,
- (uint32_t)regs->ebp);
+#ifndef NDEBUG
+ if ( !curr->arch.hvm_vcpu.hcall_preempted )
+ {
+ /* Deliberately corrupt parameter regs used by this hypercall. */
+ switch ( compat_hypercall_args_table[eax] )
+ {
+ case 6: regs->ebp = 0xdeadf00d;
+ case 5: regs->edi = 0xdeadf00d;
+ case 4: regs->esi = 0xdeadf00d;
+ case 3: regs->edx = 0xdeadf00d;
+ case 2: regs->ecx = 0xdeadf00d;
+ case 1: regs->ebx = 0xdeadf00d;
+ }
+ }
+#endif
}
HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx",
@@ -4927,8 +5264,8 @@ int hvm_do_hypercall(struct cpu_user_regs *regs)
if ( curr->arch.hvm_vcpu.hcall_preempted )
return HVM_HCALL_preempted;
- if ( unlikely(curr->domain->arch.hvm_domain.qemu_mapcache_invalidate) &&
- test_and_clear_bool(curr->domain->arch.hvm_domain.
+ if ( unlikely(currd->arch.hvm_domain.qemu_mapcache_invalidate) &&
+ test_and_clear_bool(currd->arch.hvm_domain.
qemu_mapcache_invalidate) )
return HVM_HCALL_invalidate;
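The new #ifndef NDEBUG blocks poison hypercall argument registers on purpose, and the switches deliberately omit break: entering at case n (the argument count from hypercall_args_table) clobbers arguments n+1..6 before the call, and the mirrored post-call switch clobbers arguments 1..n afterwards, so buggy callers that rely on preserved registers fail fast in debug builds. The fallthrough trick in isolation:

    #include <stdio.h>

    int main(void)
    {
        unsigned long a[6] = { 1, 2, 3, 4, 5, 6 };
        unsigned int nargs = 2;        /* pretend the hypercall takes 2 args */

        switch ( nargs )               /* no breaks: deliberate */
        {
        case 0: a[0] = 0xdeadbeefdeadf00dUL;
        case 1: a[1] = 0xdeadbeefdeadf00dUL;
        case 2: a[2] = 0xdeadbeefdeadf00dUL;
        case 3: a[3] = 0xdeadbeefdeadf00dUL;
        case 4: a[4] = 0xdeadbeefdeadf00dUL;
        case 5: a[5] = 0xdeadbeefdeadf00dUL;
        }

        for ( unsigned int i = 0; i < 6; i++ )
            printf("a[%u] = %#lx\n", i, a[i]);  /* a[0] and a[1] survive */
        return 0;
    }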
@@ -5321,7 +5658,7 @@ static int hvmop_create_ioreq_server(
goto out;
rc = hvm_create_ioreq_server(d, curr_d->domain_id, 0,
- !!op.handle_bufioreq, &op.id);
+ op.handle_bufioreq, &op.id);
if ( rc != 0 )
goto out;
@@ -5486,6 +5823,503 @@ static int hvmop_destroy_ioreq_server(
return rc;
}
+static int hvmop_set_evtchn_upcall_vector(
+ XEN_GUEST_HANDLE_PARAM(xen_hvm_evtchn_upcall_vector_t) uop)
+{
+ xen_hvm_evtchn_upcall_vector_t op;
+ struct domain *d = current->domain;
+ struct vcpu *v;
+
+ if ( copy_from_guest(&op, uop, 1) )
+ return -EFAULT;
+
+ if ( !is_hvm_domain(d) )
+ return -EINVAL;
+
+ if ( op.vector < 0x10 )
+ return -EINVAL;
+
+ if ( op.vcpu >= d->max_vcpus || (v = d->vcpu[op.vcpu]) == NULL )
+ return -ENOENT;
+
+ printk(XENLOG_G_INFO "%pv: upcall vector %02x\n", v, op.vector);
+
+ v->arch.hvm_vcpu.evtchn_upcall_vector = op.vector;
+ return 0;
+}
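The new hvmop_set_evtchn_upcall_vector handler lets a guest request event-channel upcalls as a plain per-vCPU interrupt vector instead of routing through HVM_PARAM_CALLBACK_IRQ; vectors below 0x10 are refused. Hypothetical guest-side use, assuming the usual HYPERVISOR_hvm_op wrapper and a my_vcpu_id() helper (both guest-OS specifics, not part of this patch):

    xen_hvm_evtchn_upcall_vector_t op = {
        .vcpu   = my_vcpu_id(),   /* hypothetical helper */
        .vector = 0xf3,           /* any free vector; must be >= 0x10 */
    };

    if ( HYPERVISOR_hvm_op(HVMOP_set_evtchn_upcall_vector, &op) )
        ;  /* fall back to HVM_PARAM_CALLBACK_IRQ delivery */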
+
+static int hvm_allow_set_param(struct domain *d,
+ const struct xen_hvm_param *a)
+{
+ uint64_t value = d->arch.hvm_domain.params[a->index];
+ int rc;
+
+ rc = xsm_hvm_param(XSM_TARGET, d, HVMOP_set_param);
+ if ( rc )
+ return rc;
+
+ switch ( a->index )
+ {
+ /* The following parameters can be set by the guest. */
+ case HVM_PARAM_CALLBACK_IRQ:
+ case HVM_PARAM_VM86_TSS:
+ case HVM_PARAM_ACPI_IOPORTS_LOCATION:
+ case HVM_PARAM_VM_GENERATION_ID_ADDR:
+ case HVM_PARAM_STORE_EVTCHN:
+ case HVM_PARAM_CONSOLE_EVTCHN:
+ break;
+ /*
+ * The following parameters must not be set by the guest
+ * since the domain may need to be paused.
+ */
+ case HVM_PARAM_IDENT_PT:
+ case HVM_PARAM_DM_DOMAIN:
+ case HVM_PARAM_ACPI_S_STATE:
+ /* The remaining parameters should not be set by the guest. */
+ default:
+ if ( d == current->domain )
+ rc = -EPERM;
+ break;
+ }
+
+ if ( rc )
+ return rc;
+
+ switch ( a->index )
+ {
+ /* The following parameters should only be changed once. */
+ case HVM_PARAM_VIRIDIAN:
+ case HVM_PARAM_IOREQ_SERVER_PFN:
+ case HVM_PARAM_NR_IOREQ_SERVER_PAGES:
+ case HVM_PARAM_ALTP2M:
+ if ( value != 0 && a->value != value )
+ rc = -EEXIST;
+ break;
+ default:
+ break;
+ }
+
+ return rc;
+}
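hvm_allow_set_param() splits the permission logic out of the old monolithic set_param path into a whitelist of guest-settable indices, a guest-forbidden group, and a write-once group. Note that the write-once latch tolerates an idempotent rewrite:

    /* value = current setting, a->value = requested setting */
    if ( value != 0 && a->value != value )
        rc = -EEXIST;

    /* So setting HVM_PARAM_VIRIDIAN to 3 twice succeeds, but 3 then 5
     * fails with -EEXIST; writing while the param is still 0 always
     * succeeds. */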
+
+static int hvmop_set_param(
+ XEN_GUEST_HANDLE_PARAM(xen_hvm_param_t) arg)
+{
+ struct domain *curr_d = current->domain;
+ struct xen_hvm_param a;
+ struct domain *d;
+ struct vcpu *v;
+ int rc;
+
+ if ( copy_from_guest(&a, arg, 1) )
+ return -EFAULT;
+
+ if ( a.index >= HVM_NR_PARAMS )
+ return -EINVAL;
+
+ d = rcu_lock_domain_by_any_id(a.domid);
+ if ( d == NULL )
+ return -ESRCH;
+
+ rc = -EINVAL;
+ if ( !has_hvm_container_domain(d) ||
+ (is_pvh_domain(d) && (a.index != HVM_PARAM_CALLBACK_IRQ)) )
+ goto out;
+
+ rc = hvm_allow_set_param(d, &a);
+ if ( rc )
+ goto out;
+
+ switch ( a.index )
+ {
+ case HVM_PARAM_CALLBACK_IRQ:
+ hvm_set_callback_via(d, a.value);
+ hvm_latch_shinfo_size(d);
+ break;
+ case HVM_PARAM_TIMER_MODE:
+ if ( a.value > HVMPTM_one_missed_tick_pending )
+ rc = -EINVAL;
+ break;
+ case HVM_PARAM_VIRIDIAN:
+ if ( (a.value & ~HVMPV_feature_mask) ||
+ !(a.value & HVMPV_base_freq) )
+ rc = -EINVAL;
+ break;
+ case HVM_PARAM_IDENT_PT:
+ /*
+ * Only actually required for VT-x lacking unrestricted_guest
+ * capabilities. Short circuit the pause if possible.
+ */
+ if ( !paging_mode_hap(d) || !cpu_has_vmx )
+ {
+ d->arch.hvm_domain.params[a.index] = a.value;
+ break;
+ }
+
+ /*
+ * Update GUEST_CR3 in each VMCS to point at identity map.
+ * All foreign updates to guest state must synchronise on
+ * the domctl_lock.
+ */
+ rc = -ERESTART;
+ if ( !domctl_lock_acquire() )
+ break;
+
+ rc = 0;
+ domain_pause(d);
+ d->arch.hvm_domain.params[a.index] = a.value;
+ for_each_vcpu ( d, v )
+ paging_update_cr3(v);
+ domain_unpause(d);
+
+ domctl_lock_release();
+ break;
+ case HVM_PARAM_DM_DOMAIN:
+ if ( a.value == DOMID_SELF )
+ a.value = curr_d->domain_id;
+
+ rc = hvm_set_dm_domain(d, a.value);
+ break;
+ case HVM_PARAM_ACPI_S_STATE:
+ rc = 0;
+ if ( a.value == 3 )
+ hvm_s3_suspend(d);
+ else if ( a.value == 0 )
+ hvm_s3_resume(d);
+ else
+ rc = -EINVAL;
+
+ break;
+ case HVM_PARAM_ACPI_IOPORTS_LOCATION:
+ rc = pmtimer_change_ioport(d, a.value);
+ break;
+ case HVM_PARAM_MEMORY_EVENT_CR0:
+ case HVM_PARAM_MEMORY_EVENT_CR3:
+ case HVM_PARAM_MEMORY_EVENT_CR4:
+ case HVM_PARAM_MEMORY_EVENT_INT3:
+ case HVM_PARAM_MEMORY_EVENT_SINGLE_STEP:
+ case HVM_PARAM_MEMORY_EVENT_MSR:
+ /* Deprecated */
+ rc = -EOPNOTSUPP;
+ break;
+ case HVM_PARAM_NESTEDHVM:
+ rc = xsm_hvm_param_nested(XSM_PRIV, d);
+ if ( rc )
+ break;
+ if ( a.value > 1 )
+ rc = -EINVAL;
+ /*
+ * Remove the check below once we have
+ * shadow-on-shadow.
+ */
+ if ( cpu_has_svm && !paging_mode_hap(d) && a.value )
+ rc = -EINVAL;
+ if ( a.value &&
+ d->arch.hvm_domain.params[HVM_PARAM_ALTP2M] )
+ rc = -EINVAL;
+ /* Set up NHVM state for any vcpus that are already up. */
+ if ( a.value &&
+ !d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM] )
+ for_each_vcpu(d, v)
+ if ( rc == 0 )
+ rc = nestedhvm_vcpu_initialise(v);
+ if ( !a.value || rc )
+ for_each_vcpu(d, v)
+ nestedhvm_vcpu_destroy(v);
+ break;
+ case HVM_PARAM_ALTP2M:
+ rc = xsm_hvm_param_altp2mhvm(XSM_PRIV, d);
+ if ( rc )
+ break;
+ if ( a.value > 1 )
+ rc = -EINVAL;
+ if ( a.value &&
+ d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM] )
+ rc = -EINVAL;
+ break;
+ case HVM_PARAM_BUFIOREQ_EVTCHN:
+ rc = -EINVAL;
+ break;
+ case HVM_PARAM_TRIPLE_FAULT_REASON:
+ if ( a.value > SHUTDOWN_MAX )
+ rc = -EINVAL;
+ break;
+ case HVM_PARAM_IOREQ_SERVER_PFN:
+ d->arch.hvm_domain.ioreq_gmfn.base = a.value;
+ break;
+ case HVM_PARAM_NR_IOREQ_SERVER_PAGES:
+ {
+ unsigned int i;
+
+ if ( a.value == 0 ||
+ a.value > sizeof(d->arch.hvm_domain.ioreq_gmfn.mask) * 8 )
+ {
+ rc = -EINVAL;
+ break;
+ }
+ for ( i = 0; i < a.value; i++ )
+ set_bit(i, &d->arch.hvm_domain.ioreq_gmfn.mask);
+
+ break;
+ }
+ }
+
+ if ( rc != 0 )
+ goto out;
+
+ d->arch.hvm_domain.params[a.index] = a.value;
+
+ HVM_DBG_LOG(DBG_LEVEL_HCALL, "set param %u = %"PRIx64,
+ a.index, a.value);
+
+ out:
+ rcu_unlock_domain(d);
+ return rc;
+}
+
+static int hvm_allow_get_param(struct domain *d,
+ const struct xen_hvm_param *a)
+{
+ int rc;
+
+ rc = xsm_hvm_param(XSM_TARGET, d, HVMOP_get_param);
+ if ( rc )
+ return rc;
+
+ switch ( a->index )
+ {
+ /* The following parameters can be read by the guest. */
+ case HVM_PARAM_CALLBACK_IRQ:
+ case HVM_PARAM_VM86_TSS:
+ case HVM_PARAM_ACPI_IOPORTS_LOCATION:
+ case HVM_PARAM_VM_GENERATION_ID_ADDR:
+ case HVM_PARAM_STORE_PFN:
+ case HVM_PARAM_STORE_EVTCHN:
+ case HVM_PARAM_CONSOLE_PFN:
+ case HVM_PARAM_CONSOLE_EVTCHN:
+ case HVM_PARAM_ALTP2M:
+ break;
+ /*
+ * The following parameters must not be read by the guest
+ * since the domain may need to be paused.
+ */
+ case HVM_PARAM_IOREQ_PFN:
+ case HVM_PARAM_BUFIOREQ_PFN:
+ case HVM_PARAM_BUFIOREQ_EVTCHN:
+ /* The remaining parameters should not be read by the guest. */
+ default:
+ if ( d == current->domain )
+ rc = -EPERM;
+ break;
+ }
+
+ return rc;
+}
+
+static int hvmop_get_param(
+ XEN_GUEST_HANDLE_PARAM(xen_hvm_param_t) arg)
+{
+ struct xen_hvm_param a;
+ struct domain *d;
+ int rc;
+
+ if ( copy_from_guest(&a, arg, 1) )
+ return -EFAULT;
+
+ if ( a.index >= HVM_NR_PARAMS )
+ return -EINVAL;
+
+ d = rcu_lock_domain_by_any_id(a.domid);
+ if ( d == NULL )
+ return -ESRCH;
+
+ rc = -EINVAL;
+ if ( !has_hvm_container_domain(d) ||
+ (is_pvh_domain(d) && (a.index != HVM_PARAM_CALLBACK_IRQ)) )
+ goto out;
+
+ rc = hvm_allow_get_param(d, &a);
+ if ( rc )
+ goto out;
+
+ switch ( a.index )
+ {
+ case HVM_PARAM_ACPI_S_STATE:
+ a.value = d->arch.hvm_domain.is_s3_suspended ? 3 : 0;
+ break;
+ case HVM_PARAM_IOREQ_PFN:
+ case HVM_PARAM_BUFIOREQ_PFN:
+ case HVM_PARAM_BUFIOREQ_EVTCHN:
+ {
+ domid_t domid;
+
+ /* May need to create server. */
+ domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
+ rc = hvm_create_ioreq_server(d, domid, 1,
+ HVM_IOREQSRV_BUFIOREQ_LEGACY, NULL);
+ if ( rc != 0 && rc != -EEXIST )
+ goto out;
+ }
+ /*FALLTHRU*/
+ default:
+ a.value = d->arch.hvm_domain.params[a.index];
+ break;
+ }
+
+ rc = __copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
+
+ HVM_DBG_LOG(DBG_LEVEL_HCALL, "get param %u = %"PRIx64,
+ a.index, a.value);
+
+ out:
+ rcu_unlock_domain(d);
+ return rc;
+}
+
+static int do_altp2m_op(
+ XEN_GUEST_HANDLE_PARAM(void) arg)
+{
+ struct xen_hvm_altp2m_op a;
+ struct domain *d = NULL;
+ int rc = 0;
+
+ if ( !hvm_altp2m_supported() )
+ return -EOPNOTSUPP;
+
+ if ( copy_from_guest(&a, arg, 1) )
+ return -EFAULT;
+
+ if ( a.pad1 || a.pad2 ||
+ (a.version != HVMOP_ALTP2M_INTERFACE_VERSION) ||
+ (a.cmd < HVMOP_altp2m_get_domain_state) ||
+ (a.cmd > HVMOP_altp2m_change_gfn) )
+ return -EINVAL;
+
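+ /* vcpu_enable_notify always operates on the calling vCPU's own domain. */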
+ d = (a.cmd != HVMOP_altp2m_vcpu_enable_notify) ?
+ rcu_lock_domain_by_any_id(a.domain) : rcu_lock_current_domain();
+
+ if ( d == NULL )
+ return -ESRCH;
+
+ if ( !is_hvm_domain(d) )
+ {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if ( (a.cmd != HVMOP_altp2m_get_domain_state) &&
+ (a.cmd != HVMOP_altp2m_set_domain_state) &&
+ !d->arch.altp2m_active )
+ {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if ( (rc = xsm_hvm_altp2mhvm_op(XSM_TARGET, d)) )
+ goto out;
+
+ switch ( a.cmd )
+ {
+ case HVMOP_altp2m_get_domain_state:
+ if ( !d->arch.hvm_domain.params[HVM_PARAM_ALTP2M] )
+ {
+ rc = -EINVAL;
+ break;
+ }
+
+ a.u.domain_state.state = altp2m_active(d);
+ rc = __copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
+ break;
+
+ case HVMOP_altp2m_set_domain_state:
+ {
+ struct vcpu *v;
+ bool_t ostate;
+
+ if ( !d->arch.hvm_domain.params[HVM_PARAM_ALTP2M] ||
+ nestedhvm_enabled(d) )
+ {
+ rc = -EINVAL;
+ break;
+ }
+
+ ostate = d->arch.altp2m_active;
+ d->arch.altp2m_active = !!a.u.domain_state.state;
+
+ /* If the alternate p2m state has changed, handle appropriately */
+ if ( d->arch.altp2m_active != ostate &&
+ (ostate || !(rc = p2m_init_altp2m_by_id(d, 0))) )
+ {
+ for_each_vcpu( d, v )
+ {
+ if ( !ostate )
+ altp2m_vcpu_initialise(v);
+ else
+ altp2m_vcpu_destroy(v);
+ }
+
+ if ( ostate )
+ p2m_flush_altp2m(d);
+ }
+ break;
+ }
+
+ case HVMOP_altp2m_vcpu_enable_notify:
+ {
+ struct vcpu *curr = current;
+ p2m_type_t p2mt;
+
+ if ( a.u.enable_notify.pad || a.domain != DOMID_SELF ||
+ a.u.enable_notify.vcpu_id != curr->vcpu_id ||
+ (gfn_x(vcpu_altp2m(curr).veinfo_gfn) != INVALID_GFN) ||
+ (mfn_x(get_gfn_query_unlocked(curr->domain,
+ a.u.enable_notify.gfn, &p2mt)) == INVALID_MFN) )
+ {
+ rc = -EINVAL;
+ break;
+ }
+
+ vcpu_altp2m(curr).veinfo_gfn = _gfn(a.u.enable_notify.gfn);
+ altp2m_vcpu_update_vmfunc_ve(curr);
+ break;
+ }
+
+ case HVMOP_altp2m_create_p2m:
+ if ( !(rc = p2m_init_next_altp2m(d, &a.u.view.view)) )
+ rc = __copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
+ break;
+
+ case HVMOP_altp2m_destroy_p2m:
+ rc = p2m_destroy_altp2m_by_id(d, a.u.view.view);
+ break;
+
+ case HVMOP_altp2m_switch_p2m:
+ rc = p2m_switch_domain_altp2m_by_id(d, a.u.view.view);
+ break;
+
+ case HVMOP_altp2m_set_mem_access:
+ if ( a.u.set_mem_access.pad )
+ rc = -EINVAL;
+ else
+ rc = p2m_set_altp2m_mem_access(d, a.u.set_mem_access.view,
+ _gfn(a.u.set_mem_access.gfn),
+ a.u.set_mem_access.hvmmem_access);
+ break;
+
+ case HVMOP_altp2m_change_gfn:
+ if ( a.u.change_gfn.pad1 || a.u.change_gfn.pad2 )
+ rc = -EINVAL;
+ else
+ rc = p2m_change_altp2m_gfn(d, a.u.change_gfn.view,
+ _gfn(a.u.change_gfn.old_gfn),
+ _gfn(a.u.change_gfn.new_gfn));
+ }
+
+ out:
+ rcu_unlock_domain(d);
+
+ return rc;
+}
+
/*
* Note that this value is effectively part of the ABI, even if we don't need
* to make it a formal part of it: A guest suspended for migration in the
@@ -5495,9 +6329,7 @@ static int hvmop_destroy_ioreq_server(
#define HVMOP_op_mask 0xff
long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
-
{
- struct domain *curr_d = current->domain;
unsigned long start_iter, mask;
long rc = 0;
@@ -5545,270 +6377,20 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
guest_handle_cast(arg, xen_hvm_destroy_ioreq_server_t));
break;
+ case HVMOP_set_evtchn_upcall_vector:
+ rc = hvmop_set_evtchn_upcall_vector(
+ guest_handle_cast(arg, xen_hvm_evtchn_upcall_vector_t));
+ break;
+
case HVMOP_set_param:
- case HVMOP_get_param:
- {
- struct xen_hvm_param a;
- struct domain *d;
- struct vcpu *v;
-
- if ( copy_from_guest(&a, arg, 1) )
- return -EFAULT;
-
- if ( a.index >= HVM_NR_PARAMS )
- return -EINVAL;
-
- d = rcu_lock_domain_by_any_id(a.domid);
- if ( d == NULL )
- return -ESRCH;
-
- rc = -EINVAL;
- if ( !has_hvm_container_domain(d) )
- goto param_fail;
-
- if ( is_pvh_domain(d)
- && (a.index != HVM_PARAM_CALLBACK_IRQ) )
- goto param_fail;
-
- rc = xsm_hvm_param(XSM_TARGET, d, op);
- if ( rc )
- goto param_fail;
-
- if ( op == HVMOP_set_param )
- {
- rc = 0;
-
- switch ( a.index )
- {
- case HVM_PARAM_CALLBACK_IRQ:
- hvm_set_callback_via(d, a.value);
- hvm_latch_shinfo_size(d);
- break;
- case HVM_PARAM_TIMER_MODE:
- if ( a.value > HVMPTM_one_missed_tick_pending )
- rc = -EINVAL;
- break;
- case HVM_PARAM_VIRIDIAN:
- /* This should only ever be set once by the tools and read by the guest. */
- rc = -EPERM;
- if ( curr_d == d )
- break;
-
- if ( a.value != d->arch.hvm_domain.params[a.index] )
- {
- rc = -EEXIST;
- if ( d->arch.hvm_domain.params[a.index] != 0 )
- break;
-
- rc = -EINVAL;
- if ( (a.value & ~HVMPV_feature_mask) ||
- !(a.value & HVMPV_base_freq) )
- break;
- }
-
- rc = 0;
- break;
- case HVM_PARAM_IDENT_PT:
- /* Not reflexive, as we must domain_pause(). */
- rc = -EPERM;
- if ( curr_d == d )
- break;
-
- rc = -EINVAL;
- if ( d->arch.hvm_domain.params[a.index] != 0 )
- break;
-
- rc = 0;
- if ( !paging_mode_hap(d) )
- break;
-
- /*
- * Update GUEST_CR3 in each VMCS to point at identity map.
- * All foreign updates to guest state must synchronise on
- * the domctl_lock.
- */
- rc = -ERESTART;
- if ( !domctl_lock_acquire() )
- break;
-
- rc = 0;
- domain_pause(d);
- d->arch.hvm_domain.params[a.index] = a.value;
- for_each_vcpu ( d, v )
- paging_update_cr3(v);
- domain_unpause(d);
-
- domctl_lock_release();
- break;
- case HVM_PARAM_DM_DOMAIN:
- /* Not reflexive, as we may need to domain_pause(). */
- rc = -EPERM;
- if ( curr_d == d )
- break;
-
- if ( a.value == DOMID_SELF )
- a.value = curr_d->domain_id;
-
- rc = hvm_set_dm_domain(d, a.value);
- break;
- case HVM_PARAM_ACPI_S_STATE:
- /* Not reflexive, as we must domain_pause(). */
- rc = -EPERM;
- if ( curr_d == d )
- break;
-
- rc = 0;
- if ( a.value == 3 )
- hvm_s3_suspend(d);
- else if ( a.value == 0 )
- hvm_s3_resume(d);
- else
- rc = -EINVAL;
-
- break;
- case HVM_PARAM_ACPI_IOPORTS_LOCATION:
- rc = pmtimer_change_ioport(d, a.value);
- break;
- case HVM_PARAM_MEMORY_EVENT_CR0:
- case HVM_PARAM_MEMORY_EVENT_CR3:
- case HVM_PARAM_MEMORY_EVENT_CR4:
- if ( d == current->domain )
- rc = -EPERM;
- break;
- case HVM_PARAM_MEMORY_EVENT_INT3:
- case HVM_PARAM_MEMORY_EVENT_SINGLE_STEP:
- case HVM_PARAM_MEMORY_EVENT_MSR:
- if ( d == current->domain )
- {
- rc = -EPERM;
- break;
- }
- if ( a.value & HVMPME_onchangeonly )
- rc = -EINVAL;
- break;
- case HVM_PARAM_NESTEDHVM:
- rc = xsm_hvm_param_nested(XSM_PRIV, d);
- if ( rc )
- break;
- if ( a.value > 1 )
- rc = -EINVAL;
- /* Remove the check below once we have
- * shadow-on-shadow.
- */
- if ( cpu_has_svm && !paging_mode_hap(d) && a.value )
- rc = -EINVAL;
- /* Set up NHVM state for any vcpus that are already up */
- if ( a.value &&
- !d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM] )
- for_each_vcpu(d, v)
- if ( rc == 0 )
- rc = nestedhvm_vcpu_initialise(v);
- if ( !a.value || rc )
- for_each_vcpu(d, v)
- nestedhvm_vcpu_destroy(v);
- break;
- case HVM_PARAM_BUFIOREQ_EVTCHN:
- rc = -EINVAL;
- break;
- case HVM_PARAM_TRIPLE_FAULT_REASON:
- if ( a.value > SHUTDOWN_MAX )
- rc = -EINVAL;
- break;
- case HVM_PARAM_IOREQ_SERVER_PFN:
- if ( d == current->domain )
- {
- rc = -EPERM;
- break;
- }
- d->arch.hvm_domain.ioreq_gmfn.base = a.value;
- break;
- case HVM_PARAM_NR_IOREQ_SERVER_PAGES:
- {
- unsigned int i;
-
- if ( d == current->domain )
- {
- rc = -EPERM;
- break;
- }
- if ( a.value == 0 ||
- a.value > sizeof(d->arch.hvm_domain.ioreq_gmfn.mask) * 8 )
- {
- rc = -EINVAL;
- break;
- }
- for ( i = 0; i < a.value; i++ )
- set_bit(i, &d->arch.hvm_domain.ioreq_gmfn.mask);
-
- break;
- }
- }
-
- if ( rc == 0 )
- {
- d->arch.hvm_domain.params[a.index] = a.value;
-
- switch( a.index )
- {
- case HVM_PARAM_MEMORY_EVENT_INT3:
- case HVM_PARAM_MEMORY_EVENT_SINGLE_STEP:
- {
- domain_pause(d);
- domain_unpause(d); /* Causes guest to latch new status */
- break;
- }
- case HVM_PARAM_MEMORY_EVENT_CR3:
- {
- for_each_vcpu ( d, v )
- hvm_funcs.update_guest_cr(v, 0); /* Latches new CR3 mask through CR0 code */
- break;
- }
- }
-
- }
-
- }
- else
- {
- switch ( a.index )
- {
- case HVM_PARAM_ACPI_S_STATE:
- a.value = d->arch.hvm_domain.is_s3_suspended ? 3 : 0;
- break;
- case HVM_PARAM_IOREQ_SERVER_PFN:
- case HVM_PARAM_NR_IOREQ_SERVER_PAGES:
- if ( d == current->domain )
- {
- rc = -EPERM;
- break;
- }
- case HVM_PARAM_IOREQ_PFN:
- case HVM_PARAM_BUFIOREQ_PFN:
- case HVM_PARAM_BUFIOREQ_EVTCHN: {
- domid_t domid;
-
- /* May need to create server */
- domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
- rc = hvm_create_ioreq_server(d, domid, 1, 1, NULL);
- if ( rc != 0 && rc != -EEXIST )
- goto param_fail;
- /*FALLTHRU*/
- }
- default:
- a.value = d->arch.hvm_domain.params[a.index];
- break;
- }
- rc = __copy_to_guest(arg, &a, 1) ? -EFAULT : 0;
- }
-
- HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64,
- op == HVMOP_set_param ? "set" : "get",
- a.index, a.value);
+ rc = hvmop_set_param(
+ guest_handle_cast(arg, xen_hvm_param_t));
+ break;
- param_fail:
- rcu_unlock_domain(d);
+ case HVMOP_get_param:
+ rc = hvmop_get_param(
+ guest_handle_cast(arg, xen_hvm_param_t));
break;
- }
case HVMOP_set_pci_intx_level:
rc = hvmop_set_pci_intx_level(
@@ -5916,7 +6498,7 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
paging_mark_dirty(d, page_to_mfn(page));
/* These are most probably not page tables any more */
/* don't take a long time and don't die either */
- sh_remove_shadows(d->vcpu[0], _mfn(page_to_mfn(page)), 1, 0);
+ sh_remove_shadows(d, _mfn(page_to_mfn(page)), 1, 0);
put_page(page);
}
@@ -5960,6 +6542,8 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
get_gfn_query_unlocked(d, a.pfn, &t);
if ( p2m_is_mmio(t) )
a.mem_type = HVMMEM_mmio_dm;
+ else if ( t == p2m_mmio_write_dm )
+ a.mem_type = HVMMEM_mmio_write_dm;
else if ( p2m_is_readonly(t) )
a.mem_type = HVMMEM_ram_ro;
else if ( p2m_is_ram(t) )
@@ -5987,7 +6571,8 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
static const p2m_type_t memtype[] = {
[HVMMEM_ram_rw] = p2m_ram_rw,
[HVMMEM_ram_ro] = p2m_ram_ro,
- [HVMMEM_mmio_dm] = p2m_mmio_dm
+ [HVMMEM_mmio_dm] = p2m_mmio_dm,
+ [HVMMEM_mmio_write_dm] = p2m_mmio_write_dm
};
if ( copy_from_guest(&a, arg, 1) )
@@ -6034,7 +6619,8 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
goto param_fail4;
}
if ( !p2m_is_ram(t) &&
- (!p2m_is_hole(t) || a.hvmmem_type != HVMMEM_mmio_dm) )
+ (!p2m_is_hole(t) || a.hvmmem_type != HVMMEM_mmio_dm) &&
+ (t != p2m_mmio_write_dm || a.hvmmem_type != HVMMEM_ram_rw) )
{
put_gfn(d, pfn);
goto param_fail4;
@@ -6156,6 +6742,17 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
break;
}
+ case HVMOP_guest_request_vm_event:
+ if ( guest_handle_is_null(arg) )
+ hvm_event_guest_request();
+ else
+ rc = -EINVAL;
+ break;
+
+ case HVMOP_altp2m:
+ rc = do_altp2m_op(arg);
+ break;
+
default:
{
gdprintk(XENLOG_DEBUG, "Bad HVM op %ld.\n", op);
@@ -6199,202 +6796,33 @@ int hvm_debug_op(struct vcpu *v, int32_t op)
return rc;
}
-static void hvm_mem_event_fill_regs(mem_event_request_t *req)
-{
- const struct cpu_user_regs *regs = guest_cpu_user_regs();
- const struct vcpu *curr = current;
-
- req->x86_regs.rax = regs->eax;
- req->x86_regs.rcx = regs->ecx;
- req->x86_regs.rdx = regs->edx;
- req->x86_regs.rbx = regs->ebx;
- req->x86_regs.rsp = regs->esp;
- req->x86_regs.rbp = regs->ebp;
- req->x86_regs.rsi = regs->esi;
- req->x86_regs.rdi = regs->edi;
-
- req->x86_regs.r8 = regs->r8;
- req->x86_regs.r9 = regs->r9;
- req->x86_regs.r10 = regs->r10;
- req->x86_regs.r11 = regs->r11;
- req->x86_regs.r12 = regs->r12;
- req->x86_regs.r13 = regs->r13;
- req->x86_regs.r14 = regs->r14;
- req->x86_regs.r15 = regs->r15;
-
- req->x86_regs.rflags = regs->eflags;
- req->x86_regs.rip = regs->eip;
-
- req->x86_regs.msr_efer = curr->arch.hvm_vcpu.guest_efer;
- req->x86_regs.cr0 = curr->arch.hvm_vcpu.guest_cr[0];
- req->x86_regs.cr3 = curr->arch.hvm_vcpu.guest_cr[3];
- req->x86_regs.cr4 = curr->arch.hvm_vcpu.guest_cr[4];
-}
-
-static int hvm_memory_event_traps(long p, uint32_t reason,
- unsigned long value, unsigned long old,
- bool_t gla_valid, unsigned long gla)
+void hvm_toggle_singlestep(struct vcpu *v)
{
- struct vcpu* v = current;
- struct domain *d = v->domain;
- mem_event_request_t req = { .reason = reason };
- int rc;
-
- if ( !(p & HVMPME_MODE_MASK) )
- return 0;
-
- if ( (p & HVMPME_onchangeonly) && (value == old) )
- return 1;
-
- rc = mem_event_claim_slot(d, &d->mem_event->access);
- if ( rc == -ENOSYS )
- {
- /* If there was no ring to handle the event, then
- * simple continue executing normally. */
- return 1;
- }
- else if ( rc < 0 )
- return rc;
-
- if ( (p & HVMPME_MODE_MASK) == HVMPME_mode_sync )
- {
- req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED;
- mem_event_vcpu_pause(v);
- }
-
- req.gfn = value;
- req.vcpu_id = v->vcpu_id;
- if ( gla_valid )
- {
- req.offset = gla & ((1 << PAGE_SHIFT) - 1);
- req.gla = gla;
- req.gla_valid = 1;
- }
- else
- {
- req.gla = old;
- }
-
- hvm_mem_event_fill_regs(&req);
- mem_event_put_request(d, &d->mem_event->access, &req);
-
- return 1;
-}
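+ /* The caller must hold the vCPU paused while the flag is flipped. */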
+ ASSERT(atomic_read(&v->pause_count));
-void hvm_memory_event_cr0(unsigned long value, unsigned long old)
-{
- hvm_memory_event_traps(current->domain->arch.hvm_domain
- .params[HVM_PARAM_MEMORY_EVENT_CR0],
- MEM_EVENT_REASON_CR0,
- value, old, 0, 0);
-}
-
-void hvm_memory_event_cr3(unsigned long value, unsigned long old)
-{
- hvm_memory_event_traps(current->domain->arch.hvm_domain
- .params[HVM_PARAM_MEMORY_EVENT_CR3],
- MEM_EVENT_REASON_CR3,
- value, old, 0, 0);
-}
-
-void hvm_memory_event_cr4(unsigned long value, unsigned long old)
-{
- hvm_memory_event_traps(current->domain->arch.hvm_domain
- .params[HVM_PARAM_MEMORY_EVENT_CR4],
- MEM_EVENT_REASON_CR4,
- value, old, 0, 0);
-}
-
-void hvm_memory_event_msr(unsigned long msr, unsigned long value)
-{
- hvm_memory_event_traps(current->domain->arch.hvm_domain
- .params[HVM_PARAM_MEMORY_EVENT_MSR],
- MEM_EVENT_REASON_MSR,
- value, ~value, 1, msr);
-}
-
-int hvm_memory_event_int3(unsigned long gla)
-{
- uint32_t pfec = PFEC_page_present;
- unsigned long gfn;
- gfn = paging_gva_to_gfn(current, gla, &pfec);
-
- return hvm_memory_event_traps(current->domain->arch.hvm_domain
- .params[HVM_PARAM_MEMORY_EVENT_INT3],
- MEM_EVENT_REASON_INT3,
- gfn, 0, 1, gla);
-}
-
-int hvm_memory_event_single_step(unsigned long gla)
-{
- uint32_t pfec = PFEC_page_present;
- unsigned long gfn;
- gfn = paging_gva_to_gfn(current, gla, &pfec);
-
- return hvm_memory_event_traps(current->domain->arch.hvm_domain
- .params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP],
- MEM_EVENT_REASON_SINGLESTEP,
- gfn, 0, 1, gla);
-}
-
-int nhvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
-{
- if (hvm_funcs.nhvm_vcpu_hostrestore)
- return hvm_funcs.nhvm_vcpu_hostrestore(v, regs);
- return -EOPNOTSUPP;
-}
-
-int nhvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs,
- uint64_t exitcode)
-{
- if (hvm_funcs.nhvm_vcpu_vmexit)
- return hvm_funcs.nhvm_vcpu_vmexit(v, regs, exitcode);
- return -EOPNOTSUPP;
-}
-
-int nhvm_vcpu_vmexit_trap(struct vcpu *v, struct hvm_trap *trap)
-{
- return hvm_funcs.nhvm_vcpu_vmexit_trap(v, trap);
-}
-
-uint64_t nhvm_vcpu_guestcr3(struct vcpu *v)
-{
- if (hvm_funcs.nhvm_vcpu_guestcr3)
- return hvm_funcs.nhvm_vcpu_guestcr3(v);
- return -EOPNOTSUPP;
-}
-
-uint64_t nhvm_vcpu_p2m_base(struct vcpu *v)
-{
- if ( hvm_funcs.nhvm_vcpu_p2m_base )
- return hvm_funcs.nhvm_vcpu_p2m_base(v);
- return -EOPNOTSUPP;
-}
+ if ( !hvm_is_singlestep_supported() )
+ return;
-uint32_t nhvm_vcpu_asid(struct vcpu *v)
-{
- if (hvm_funcs.nhvm_vcpu_asid)
- return hvm_funcs.nhvm_vcpu_asid(v);
- return -EOPNOTSUPP;
+ v->arch.hvm_vcpu.single_step = !v->arch.hvm_vcpu.single_step;
}
-int nhvm_vmcx_guest_intercepts_trap(struct vcpu *v, unsigned int trap, int errcode)
+void altp2m_vcpu_update_p2m(struct vcpu *v)
{
- if (hvm_funcs.nhvm_vmcx_guest_intercepts_trap)
- return hvm_funcs.nhvm_vmcx_guest_intercepts_trap(v, trap, errcode);
- return -EOPNOTSUPP;
+ if ( hvm_funcs.altp2m_vcpu_update_p2m )
+ hvm_funcs.altp2m_vcpu_update_p2m(v);
}
-bool_t nhvm_vmcx_hap_enabled(struct vcpu *v)
+void altp2m_vcpu_update_vmfunc_ve(struct vcpu *v)
{
- if (hvm_funcs.nhvm_vmcx_hap_enabled)
- return hvm_funcs.nhvm_vmcx_hap_enabled(v);
- return -EOPNOTSUPP;
+ if ( hvm_funcs.altp2m_vcpu_update_vmfunc_ve )
+ hvm_funcs.altp2m_vcpu_update_vmfunc_ve(v);
}
-enum hvm_intblk nhvm_interrupt_blocked(struct vcpu *v)
+bool_t altp2m_vcpu_emulate_ve(struct vcpu *v)
{
- return hvm_funcs.nhvm_intr_blocked(v);
+ if ( hvm_funcs.altp2m_vcpu_emulate_ve )
+ return hvm_funcs.altp2m_vcpu_emulate_ve(v);
+ return 0;
}
/*
diff --git a/xen/arch/x86/hvm/i8254.c b/xen/arch/x86/hvm/i8254.c
index 36a0a53..8a93c88 100644
--- a/xen/arch/x86/hvm/i8254.c
+++ b/xen/arch/x86/hvm/i8254.c
@@ -50,9 +50,9 @@
#define RW_STATE_WORD1 4
static int handle_pit_io(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val);
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val);
static int handle_speaker_io(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val);
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val);
#define get_guest_time(v) \
(is_hvm_vcpu(v) ? hvm_get_guest_time(v) : (u64)get_s_time())
@@ -479,7 +479,7 @@ void pit_deinit(struct domain *d)
/* the intercept action for PIT DM retval:0--not handled; 1--handled */
static int handle_pit_io(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
struct PITState *vpit = vcpu_vpit(current);
@@ -522,7 +522,7 @@ static uint32_t speaker_ioport_read(
}
static int handle_speaker_io(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
struct PITState *vpit = vcpu_vpit(current);
diff --git a/xen/arch/x86/hvm/intercept.c b/xen/arch/x86/hvm/intercept.c
index d52a48c..7096d74 100644
--- a/xen/arch/x86/hvm/intercept.c
+++ b/xen/arch/x86/hvm/intercept.c
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -32,371 +31,305 @@
#include <xen/event.h>
#include <xen/iommu.h>
-static const struct hvm_mmio_handler *const
-hvm_mmio_handlers[HVM_MMIO_HANDLER_NR] =
+static bool_t hvm_mmio_accept(const struct hvm_io_handler *handler,
+ const ioreq_t *p)
{
- &hpet_mmio_handler,
- &vlapic_mmio_handler,
- &vioapic_mmio_handler,
- &msixtbl_mmio_handler,
- &iommu_mmio_handler
-};
+ paddr_t first = hvm_mmio_first_byte(p);
+ paddr_t last = hvm_mmio_last_byte(p);
-static int hvm_mmio_access(struct vcpu *v,
- ioreq_t *p,
- hvm_mmio_read_t read_handler,
- hvm_mmio_write_t write_handler)
-{
- struct hvm_vcpu_io *vio = &v->arch.hvm_vcpu.hvm_io;
- unsigned long data;
- int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
+ BUG_ON(handler->type != IOREQ_TYPE_COPY);
- if ( !p->data_is_ptr )
- {
- if ( p->dir == IOREQ_READ )
- {
- if ( vio->mmio_retrying )
- {
- if ( vio->mmio_large_read_bytes != p->size )
- return X86EMUL_UNHANDLEABLE;
- memcpy(&data, vio->mmio_large_read, p->size);
- vio->mmio_large_read_bytes = 0;
- vio->mmio_retrying = 0;
- }
- else
- rc = read_handler(v, p->addr, p->size, &data);
- p->data = data;
- }
- else /* p->dir == IOREQ_WRITE */
- rc = write_handler(v, p->addr, p->size, p->data);
- return rc;
- }
+ if ( !handler->mmio.ops->check(current, first) )
+ return 0;
- if ( p->dir == IOREQ_READ )
- {
- for ( i = 0; i < p->count; i++ )
- {
- if ( vio->mmio_retrying )
- {
- if ( vio->mmio_large_read_bytes != p->size )
- return X86EMUL_UNHANDLEABLE;
- memcpy(&data, vio->mmio_large_read, p->size);
- vio->mmio_large_read_bytes = 0;
- vio->mmio_retrying = 0;
- }
- else
- {
- rc = read_handler(v, p->addr + step * i, p->size, &data);
- if ( rc != X86EMUL_OKAY )
- break;
- }
- switch ( hvm_copy_to_guest_phys(p->data + step * i,
- &data, p->size) )
- {
- case HVMCOPY_okay:
- break;
- case HVMCOPY_gfn_paged_out:
- case HVMCOPY_gfn_shared:
- rc = X86EMUL_RETRY;
- break;
- case HVMCOPY_bad_gfn_to_mfn:
- /* Drop the write as real hardware would. */
- continue;
- case HVMCOPY_bad_gva_to_gfn:
- ASSERT(0);
- /* fall through */
- default:
- rc = X86EMUL_UNHANDLEABLE;
- break;
- }
- if ( rc != X86EMUL_OKAY)
- break;
- }
+ /* Make sure the handler will accept the whole access */
+ if ( p->size > 1 &&
+ !handler->mmio.ops->check(current, last) )
+ domain_crash(current->domain);
- if ( rc == X86EMUL_RETRY )
- {
- vio->mmio_retry = 1;
- vio->mmio_large_read_bytes = p->size;
- memcpy(vio->mmio_large_read, &data, p->size);
- }
- }
- else
- {
- for ( i = 0; i < p->count; i++ )
- {
- switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
- p->size) )
- {
- case HVMCOPY_okay:
- break;
- case HVMCOPY_gfn_paged_out:
- case HVMCOPY_gfn_shared:
- rc = X86EMUL_RETRY;
- break;
- case HVMCOPY_bad_gfn_to_mfn:
- data = ~0;
- break;
- case HVMCOPY_bad_gva_to_gfn:
- ASSERT(0);
- /* fall through */
- default:
- rc = X86EMUL_UNHANDLEABLE;
- break;
- }
- if ( rc != X86EMUL_OKAY )
- break;
- rc = write_handler(v, p->addr + step * i, p->size, data);
- if ( rc != X86EMUL_OKAY )
- break;
- }
+ return 1;
+}
- if ( rc == X86EMUL_RETRY )
- vio->mmio_retry = 1;
- }
+static int hvm_mmio_read(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t *data)
+{
+ BUG_ON(handler->type != IOREQ_TYPE_COPY);
- if ( i != 0 )
- {
- p->count = i;
- rc = X86EMUL_OKAY;
- }
+ return handler->mmio.ops->read(current, addr, size, data);
+}
- return rc;
+static int hvm_mmio_write(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t data)
+{
+ BUG_ON(handler->type != IOREQ_TYPE_COPY);
+
+ return handler->mmio.ops->write(current, addr, size, data);
}
-bool_t hvm_mmio_internal(paddr_t gpa)
+static const struct hvm_io_ops mmio_ops = {
+ .accept = hvm_mmio_accept,
+ .read = hvm_mmio_read,
+ .write = hvm_mmio_write
+};
+
+static bool_t hvm_portio_accept(const struct hvm_io_handler *handler,
+ const ioreq_t *p)
{
- struct vcpu *curr = current;
- unsigned int i;
+ unsigned int start = handler->portio.port;
+ unsigned int end = start + handler->portio.size;
- for ( i = 0; i < HVM_MMIO_HANDLER_NR; ++i )
- if ( hvm_mmio_handlers[i]->check_handler(curr, gpa) )
- return 1;
+ BUG_ON(handler->type != IOREQ_TYPE_PIO);
- return 0;
+ return (p->addr >= start) && ((p->addr + p->size) <= end);
}
-int hvm_mmio_intercept(ioreq_t *p)
+static int hvm_portio_read(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t *data)
{
- struct vcpu *v = current;
- int i;
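+ /* Default to all-ones in case the handler does not fill in a value. */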
+ uint32_t val = ~0u;
+ int rc;
- for ( i = 0; i < HVM_MMIO_HANDLER_NR; i++ )
- {
- hvm_mmio_check_t check_handler =
- hvm_mmio_handlers[i]->check_handler;
+ BUG_ON(handler->type != IOREQ_TYPE_PIO);
- if ( check_handler(v, p->addr) )
- {
- if ( unlikely(p->count > 1) &&
- !check_handler(v, unlikely(p->df)
- ? p->addr - (p->count - 1L) * p->size
- : p->addr + (p->count - 1L) * p->size) )
- p->count = 1;
-
- return hvm_mmio_access(
- v, p,
- hvm_mmio_handlers[i]->read_handler,
- hvm_mmio_handlers[i]->write_handler);
- }
- }
+ rc = handler->portio.action(IOREQ_READ, addr, size, &val);
+ *data = val;
- return X86EMUL_UNHANDLEABLE;
+ return rc;
}
-static int process_portio_intercept(portio_action_t action, ioreq_t *p)
+static int hvm_portio_write(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t data)
{
- struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
- int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
- uint32_t data;
+ uint32_t val = data;
- if ( !p->data_is_ptr )
- {
- if ( p->dir == IOREQ_READ )
- {
- if ( vio->mmio_retrying )
- {
- if ( vio->mmio_large_read_bytes != p->size )
- return X86EMUL_UNHANDLEABLE;
- memcpy(&data, vio->mmio_large_read, p->size);
- vio->mmio_large_read_bytes = 0;
- vio->mmio_retrying = 0;
- }
- else
- rc = action(IOREQ_READ, p->addr, p->size, &data);
- p->data = data;
- }
- else
- {
- data = p->data;
- rc = action(IOREQ_WRITE, p->addr, p->size, &data);
- }
- return rc;
- }
+ BUG_ON(handler->type != IOREQ_TYPE_PIO);
+
+ return handler->portio.action(IOREQ_WRITE, addr, size, &val);
+}
+
+static const struct hvm_io_ops portio_ops = {
+ .accept = hvm_portio_accept,
+ .read = hvm_portio_read,
+ .write = hvm_portio_write
+};
+
+int hvm_process_io_intercept(const struct hvm_io_handler *handler,
+ ioreq_t *p)
+{
+ const struct hvm_io_ops *ops = handler->ops;
+ int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
+ uint64_t data;
+ uint64_t addr;
if ( p->dir == IOREQ_READ )
{
for ( i = 0; i < p->count; i++ )
{
- if ( vio->mmio_retrying )
- {
- if ( vio->mmio_large_read_bytes != p->size )
- return X86EMUL_UNHANDLEABLE;
- memcpy(&data, vio->mmio_large_read, p->size);
- vio->mmio_large_read_bytes = 0;
- vio->mmio_retrying = 0;
- }
- else
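+ /* MMIO reps advance the address by 'step' each iteration; port I/O reps stay on the same port. */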
+ addr = (p->type == IOREQ_TYPE_COPY) ?
+ p->addr + step * i :
+ p->addr;
+ rc = ops->read(handler, addr, p->size, &data);
+ if ( rc != X86EMUL_OKAY )
+ break;
+
+ if ( p->data_is_ptr )
{
- rc = action(IOREQ_READ, p->addr, p->size, &data);
+ switch ( hvm_copy_to_guest_phys(p->data + step * i,
+ &data, p->size) )
+ {
+ case HVMCOPY_okay:
+ break;
+ case HVMCOPY_bad_gfn_to_mfn:
+ /* Drop the write as real hardware would. */
+ continue;
+ case HVMCOPY_bad_gva_to_gfn:
+ case HVMCOPY_gfn_paged_out:
+ case HVMCOPY_gfn_shared:
+ ASSERT_UNREACHABLE();
+ /* fall through */
+ default:
+ rc = X86EMUL_UNHANDLEABLE;
+ break;
+ }
if ( rc != X86EMUL_OKAY )
break;
}
- switch ( hvm_copy_to_guest_phys(p->data + step * i,
- &data, p->size) )
- {
- case HVMCOPY_okay:
- break;
- case HVMCOPY_gfn_paged_out:
- case HVMCOPY_gfn_shared:
- rc = X86EMUL_RETRY;
- break;
- case HVMCOPY_bad_gfn_to_mfn:
- /* Drop the write as real hardware would. */
- continue;
- case HVMCOPY_bad_gva_to_gfn:
- ASSERT(0);
- /* fall through */
- default:
- rc = X86EMUL_UNHANDLEABLE;
- break;
- }
- if ( rc != X86EMUL_OKAY)
- break;
- }
-
- if ( rc == X86EMUL_RETRY )
- {
- vio->mmio_retry = 1;
- vio->mmio_large_read_bytes = p->size;
- memcpy(vio->mmio_large_read, &data, p->size);
+ else
+ p->data = data;
}
}
else /* p->dir == IOREQ_WRITE */
{
for ( i = 0; i < p->count; i++ )
{
- data = 0;
- switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
- p->size) )
+ if ( p->data_is_ptr )
{
- case HVMCOPY_okay:
- break;
- case HVMCOPY_gfn_paged_out:
- case HVMCOPY_gfn_shared:
- rc = X86EMUL_RETRY;
- break;
- case HVMCOPY_bad_gfn_to_mfn:
- data = ~0;
- break;
- case HVMCOPY_bad_gva_to_gfn:
- ASSERT(0);
- /* fall through */
- default:
- rc = X86EMUL_UNHANDLEABLE;
- break;
+ switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
+ p->size) )
+ {
+ case HVMCOPY_okay:
+ break;
+ case HVMCOPY_bad_gfn_to_mfn:
+ data = ~0;
+ break;
+ case HVMCOPY_bad_gva_to_gfn:
+ case HVMCOPY_gfn_paged_out:
+ case HVMCOPY_gfn_shared:
+ ASSERT_UNREACHABLE();
+ /* fall through */
+ default:
+ rc = X86EMUL_UNHANDLEABLE;
+ break;
+ }
+ if ( rc != X86EMUL_OKAY )
+ break;
}
- if ( rc != X86EMUL_OKAY )
- break;
- rc = action(IOREQ_WRITE, p->addr, p->size, &data);
+ else
+ data = p->data;
+
+ addr = (p->type == IOREQ_TYPE_COPY) ?
+ p->addr + step * i :
+ p->addr;
+ rc = ops->write(handler, addr, p->size, data);
if ( rc != X86EMUL_OKAY )
break;
}
-
- if ( rc == X86EMUL_RETRY )
- vio->mmio_retry = 1;
}
- if ( i != 0 )
+ if ( i != 0 && rc == X86EMUL_UNHANDLEABLE )
+ domain_crash(current->domain);
+
+ return rc;
+}
+
+const struct hvm_io_handler *hvm_find_io_handler(ioreq_t *p)
+{
+ struct domain *curr_d = current->domain;
+ unsigned int i;
+
+ BUG_ON((p->type != IOREQ_TYPE_PIO) &&
+ (p->type != IOREQ_TYPE_COPY));
+
+ for ( i = 0; i < curr_d->arch.hvm_domain.io_handler_count; i++ )
{
- p->count = i;
- rc = X86EMUL_OKAY;
+ const struct hvm_io_handler *handler =
+ &curr_d->arch.hvm_domain.io_handler[i];
+ const struct hvm_io_ops *ops = handler->ops;
+
+ if ( handler->type != p->type )
+ continue;
+
+ if ( ops->accept(handler, p) )
+ return handler;
}
+ return NULL;
+}
+
+int hvm_io_intercept(ioreq_t *p)
+{
+ const struct hvm_io_handler *handler;
+ const struct hvm_io_ops *ops;
+ int rc;
+
+ handler = hvm_find_io_handler(p);
+
+ if ( handler == NULL )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = hvm_process_io_intercept(handler, p);
+
+ ops = handler->ops;
+ if ( ops->complete != NULL )
+ ops->complete(handler);
+
return rc;
}
-/*
- * Check if the request is handled inside xen
- * return value: 0 --not handled; 1 --handled
- */
-int hvm_io_intercept(ioreq_t *p, int type)
+struct hvm_io_handler *hvm_next_io_handler(struct domain *d)
{
- struct vcpu *v = current;
- struct hvm_io_handler *handler = v->domain->arch.hvm_domain.io_handler;
- int i;
- unsigned long addr, size;
+ unsigned int i = d->arch.hvm_domain.io_handler_count++;
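+ /* Claim the next slot up front; running out is fatal for the domain. */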
- if ( type == HVM_PORTIO )
+ if ( i == NR_IO_HANDLERS )
{
- int rc = dpci_ioport_intercept(p);
- if ( (rc == X86EMUL_OKAY) || (rc == X86EMUL_RETRY) )
- return rc;
+ domain_crash(d);
+ return NULL;
}
- for ( i = 0; i < handler->num_slot; i++ )
- {
- if ( type != handler->hdl_list[i].type )
- continue;
- addr = handler->hdl_list[i].addr;
- size = handler->hdl_list[i].size;
- if ( (p->addr >= addr) &&
- ((p->addr + p->size) <= (addr + size)) )
- {
- if ( type == HVM_PORTIO )
- return process_portio_intercept(
- handler->hdl_list[i].action.portio, p);
+ return &d->arch.hvm_domain.io_handler[i];
+}
- if ( unlikely(p->count > 1) &&
- (unlikely(p->df)
- ? p->addr - (p->count - 1L) * p->size < addr
- : p->addr + p->count * 1L * p->size - 1 >= addr + size) )
- p->count = 1;
+void register_mmio_handler(struct domain *d,
+ const struct hvm_mmio_ops *ops)
+{
+ struct hvm_io_handler *handler = hvm_next_io_handler(d);
- return handler->hdl_list[i].action.mmio(p);
- }
- }
+ if ( handler == NULL )
+ return;
- return X86EMUL_UNHANDLEABLE;
+ handler->type = IOREQ_TYPE_COPY;
+ handler->ops = &mmio_ops;
+ handler->mmio.ops = ops;
}
-void register_io_handler(
- struct domain *d, unsigned long addr, unsigned long size,
- void *action, int type)
+void register_portio_handler(struct domain *d, unsigned int port,
+ unsigned int size, portio_action_t action)
{
- struct hvm_io_handler *handler = d->arch.hvm_domain.io_handler;
- int num = handler->num_slot;
+ struct hvm_io_handler *handler = hvm_next_io_handler(d);
- BUG_ON(num >= MAX_IO_HANDLER);
+ if ( handler == NULL )
+ return;
- handler->hdl_list[num].addr = addr;
- handler->hdl_list[num].size = size;
- handler->hdl_list[num].action.ptr = action;
- handler->hdl_list[num].type = type;
- handler->num_slot++;
+ handler->type = IOREQ_TYPE_PIO;
+ handler->ops = &portio_ops;
+ handler->portio.port = port;
+ handler->portio.size = size;
+ handler->portio.action = action;
}
-void relocate_io_handler(
- struct domain *d, unsigned long old_addr, unsigned long new_addr,
- unsigned long size, int type)
+void relocate_portio_handler(struct domain *d, unsigned int old_port,
+ unsigned int new_port, unsigned int size)
{
- struct hvm_io_handler *handler = d->arch.hvm_domain.io_handler;
- int i;
-
- for ( i = 0; i < handler->num_slot; i++ )
- if ( (handler->hdl_list[i].addr == old_addr) &&
- (handler->hdl_list[i].size == size) &&
- (handler->hdl_list[i].type == type) )
- handler->hdl_list[i].addr = new_addr;
+ unsigned int i;
+
+ for ( i = 0; i < d->arch.hvm_domain.io_handler_count; i++ )
+ {
+ struct hvm_io_handler *handler =
+ &d->arch.hvm_domain.io_handler[i];
+
+ if ( handler->type != IOREQ_TYPE_PIO )
+ continue;
+
+ if ( (handler->portio.port == old_port) &&
+ (handler->portio.size == size) )
+ {
+ handler->portio.port = new_port;
+ break;
+ }
+ }
+}
+
+bool_t hvm_mmio_internal(paddr_t gpa)
+{
+ const struct hvm_io_handler *handler;
+ const struct hvm_io_ops *ops;
+ ioreq_t p = {
+ .type = IOREQ_TYPE_COPY,
+ .addr = gpa,
+ .count = 1,
+ .size = 1,
+ };
+
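+ /* Probe with a minimal one-byte ioreq; a match means Xen emulates this range. */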
+ handler = hvm_find_io_handler(&p);
+
+ if ( handler == NULL )
+ return 0;
+
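+ /* Give the handler a chance to drop any state (e.g. a lock) taken by accept. */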
+ ops = handler->ops;
+ if ( ops->complete != NULL )
+ ops->complete(handler);
+
+ return 1;
}
/*
diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
index 68fb890..fee812a 100644
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -15,8 +15,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -60,8 +59,8 @@ void send_timeoffset_req(unsigned long timeoff)
if ( timeoff == 0 )
return;
- if ( !hvm_buffered_io_send(&p) )
- printk("Unsuccessful timeoffset update\n");
+ if ( hvm_broadcast_ioreq(&p, 1) != 0 )
+ gprintk(XENLOG_ERR, "Unsuccessful timeoffset update\n");
}
/* Ask ioemu mapcache to invalidate mappings. */
@@ -74,7 +73,8 @@ void send_invalidate_req(void)
.data = ~0UL, /* flush all */
};
- hvm_broadcast_assist_req(&p);
+ if ( hvm_broadcast_ioreq(&p, 0) != 0 )
+ gprintk(XENLOG_ERR, "Unsuccessful map-cache invalidate\n");
}
int handle_mmio(void)
@@ -90,10 +90,8 @@ int handle_mmio(void)
rc = hvm_emulate_one(&ctxt);
- if ( rc != X86EMUL_RETRY )
- vio->io_state = HVMIO_none;
- if ( vio->io_state == HVMIO_awaiting_completion )
- vio->io_state = HVMIO_handle_mmio_awaiting_completion;
+ if ( hvm_vcpu_io_need_completion(vio) || vio->mmio_retry )
+ vio->io_completion = HVMIO_mmio_completion;
else
vio->mmio_access = (struct npfec){};
@@ -132,7 +130,7 @@ int handle_pio(uint16_t port, unsigned int size, int dir)
{
struct vcpu *curr = current;
struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
- unsigned long data, reps = 1;
+ unsigned long data;
int rc;
ASSERT((size - 1) < 4 && size != 3);
@@ -140,7 +138,10 @@ int handle_pio(uint16_t port, unsigned int size, int dir)
if ( dir == IOREQ_WRITE )
data = guest_cpu_user_regs()->eax;
- rc = hvmemul_do_pio(port, &reps, size, 0, dir, 0, &data);
+ rc = hvmemul_do_pio_buffer(port, size, dir, &data);
+
+ if ( hvm_vcpu_io_need_completion(vio) )
+ vio->io_completion = HVMIO_pio_completion;
switch ( rc )
{
@@ -154,11 +155,10 @@ int handle_pio(uint16_t port, unsigned int size, int dir)
}
break;
case X86EMUL_RETRY:
- if ( vio->io_state != HVMIO_awaiting_completion )
+ /* We should not advance RIP/EIP if the domain is shutting down */
+ if ( curr->domain->is_shutting_down )
return 0;
- /* Completion in hvm_io_assist() with no re-emulation required. */
- ASSERT(dir == IOREQ_READ);
- vio->io_state = HVMIO_handle_pio_awaiting_completion;
+
break;
default:
gdprintk(XENLOG_ERR, "Weird HVM ioemulation status %d.\n", rc);
@@ -169,224 +169,98 @@ int handle_pio(uint16_t port, unsigned int size, int dir)
return 1;
}
-void hvm_io_assist(ioreq_t *p)
+static bool_t dpci_portio_accept(const struct hvm_io_handler *handler,
+ const ioreq_t *p)
{
struct vcpu *curr = current;
+ struct hvm_iommu *hd = domain_hvm_iommu(curr->domain);
struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
- enum hvm_io_state io_state;
-
- p->state = STATE_IOREQ_NONE;
-
- io_state = vio->io_state;
- vio->io_state = HVMIO_none;
-
- switch ( io_state )
- {
- case HVMIO_awaiting_completion:
- vio->io_state = HVMIO_completed;
- vio->io_data = p->data;
- break;
- case HVMIO_handle_mmio_awaiting_completion:
- vio->io_state = HVMIO_completed;
- vio->io_data = p->data;
- (void)handle_mmio();
- break;
- case HVMIO_handle_pio_awaiting_completion:
- if ( vio->io_size == 4 ) /* Needs zero extension. */
- guest_cpu_user_regs()->rax = (uint32_t)p->data;
- else
- memcpy(&guest_cpu_user_regs()->rax, &p->data, vio->io_size);
- break;
- default:
- break;
- }
-
- if ( p->state == STATE_IOREQ_NONE )
- {
- msix_write_completion(curr);
- vcpu_end_shutdown_deferral(curr);
- }
-}
-
-static int dpci_ioport_read(uint32_t mport, ioreq_t *p)
-{
- struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
- int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
- uint32_t data = 0;
+ struct g2m_ioport *g2m_ioport;
+ unsigned int start, end;
- for ( i = 0; i < p->count; i++ )
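+ /* Find the mapping covering this access and cache it for the read/write hooks. */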
+ list_for_each_entry( g2m_ioport, &hd->arch.g2m_ioport_list, list )
{
- if ( vio->mmio_retrying )
- {
- if ( vio->mmio_large_read_bytes != p->size )
- return X86EMUL_UNHANDLEABLE;
- memcpy(&data, vio->mmio_large_read, p->size);
- vio->mmio_large_read_bytes = 0;
- vio->mmio_retrying = 0;
- }
- else switch ( p->size )
- {
- case 1:
- data = inb(mport);
- break;
- case 2:
- data = inw(mport);
- break;
- case 4:
- data = inl(mport);
- break;
- default:
- BUG();
- }
-
- if ( p->data_is_ptr )
+ start = g2m_ioport->gport;
+ end = start + g2m_ioport->np;
+ if ( (p->addr >= start) && (p->addr + p->size <= end) )
{
- switch ( hvm_copy_to_guest_phys(p->data + step * i,
- &data, p->size) )
- {
- case HVMCOPY_okay:
- break;
- case HVMCOPY_gfn_paged_out:
- case HVMCOPY_gfn_shared:
- rc = X86EMUL_RETRY;
- break;
- case HVMCOPY_bad_gfn_to_mfn:
- /* Drop the write as real hardware would. */
- continue;
- case HVMCOPY_bad_gva_to_gfn:
- ASSERT(0);
- /* fall through */
- default:
- rc = X86EMUL_UNHANDLEABLE;
- break;
- }
- if ( rc != X86EMUL_OKAY)
- break;
+ vio->g2m_ioport = g2m_ioport;
+ return 1;
}
- else
- p->data = data;
- }
-
- if ( rc == X86EMUL_RETRY )
- {
- vio->mmio_retry = 1;
- vio->mmio_large_read_bytes = p->size;
- memcpy(vio->mmio_large_read, &data, p->size);
- }
-
- if ( i != 0 )
- {
- p->count = i;
- rc = X86EMUL_OKAY;
}
- return rc;
+ return 0;
}
-static int dpci_ioport_write(uint32_t mport, ioreq_t *p)
+static int dpci_portio_read(const struct hvm_io_handler *handler,
+ uint64_t addr,
+ uint32_t size,
+ uint64_t *data)
{
- int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
- uint32_t data;
-
- for ( i = 0; i < p->count; i++ )
- {
- data = p->data;
- if ( p->data_is_ptr )
- {
- switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
- p->size) )
- {
- case HVMCOPY_okay:
- break;
- case HVMCOPY_gfn_paged_out:
- case HVMCOPY_gfn_shared:
- rc = X86EMUL_RETRY;
- break;
- case HVMCOPY_bad_gfn_to_mfn:
- data = ~0;
- break;
- case HVMCOPY_bad_gva_to_gfn:
- ASSERT(0);
- /* fall through */
- default:
- rc = X86EMUL_UNHANDLEABLE;
- break;
- }
- if ( rc != X86EMUL_OKAY)
- break;
- }
-
- switch ( p->size )
- {
- case 1:
- outb(data, mport);
- break;
- case 2:
- outw(data, mport);
- break;
- case 4:
- outl(data, mport);
- break;
- default:
- BUG();
- }
- }
-
- if ( rc == X86EMUL_RETRY )
- current->arch.hvm_vcpu.hvm_io.mmio_retry = 1;
+ struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
+ const struct g2m_ioport *g2m_ioport = vio->g2m_ioport;
+ unsigned int mport = (addr - g2m_ioport->gport) + g2m_ioport->mport;
- if ( i != 0 )
+ switch ( size )
{
- p->count = i;
- rc = X86EMUL_OKAY;
+ case 1:
+ *data = inb(mport);
+ break;
+ case 2:
+ *data = inw(mport);
+ break;
+ case 4:
+ *data = inl(mport);
+ break;
+ default:
+ BUG();
}
- return rc;
+ return X86EMUL_OKAY;
}
-int dpci_ioport_intercept(ioreq_t *p)
+static int dpci_portio_write(const struct hvm_io_handler *handler,
+ uint64_t addr,
+ uint32_t size,
+ uint64_t data)
{
- struct domain *d = current->domain;
- struct hvm_iommu *hd = domain_hvm_iommu(d);
- struct g2m_ioport *g2m_ioport;
- unsigned int mport, gport = p->addr;
- unsigned int s = 0, e = 0;
- int rc;
+ struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
+ const struct g2m_ioport *g2m_ioport = vio->g2m_ioport;
+ unsigned int mport = (addr - g2m_ioport->gport) + g2m_ioport->mport;
- list_for_each_entry( g2m_ioport, &hd->arch.g2m_ioport_list, list )
+ switch ( size )
{
- s = g2m_ioport->gport;
- e = s + g2m_ioport->np;
- if ( (gport >= s) && (gport < e) )
- goto found;
+ case 1:
+ outb(data, mport);
+ break;
+ case 2:
+ outw(data, mport);
+ break;
+ case 4:
+ outl(data, mport);
+ break;
+ default:
+ BUG();
}
- return X86EMUL_UNHANDLEABLE;
+ return X86EMUL_OKAY;
+}
- found:
- mport = (gport - s) + g2m_ioport->mport;
+static const struct hvm_io_ops dpci_portio_ops = {
+ .accept = dpci_portio_accept,
+ .read = dpci_portio_read,
+ .write = dpci_portio_write
+};
- if ( !ioports_access_permitted(d, mport, mport + p->size - 1) )
- {
- gdprintk(XENLOG_ERR, "Error: access to gport=%#x denied!\n",
- (uint32_t)p->addr);
- return X86EMUL_UNHANDLEABLE;
- }
+void register_dpci_portio_handler(struct domain *d)
+{
+ struct hvm_io_handler *handler = hvm_next_io_handler(d);
- switch ( p->dir )
- {
- case IOREQ_READ:
- rc = dpci_ioport_read(mport, p);
- break;
- case IOREQ_WRITE:
- rc = dpci_ioport_write(mport, p);
- break;
- default:
- gdprintk(XENLOG_ERR, "Error: couldn't handle p->dir = %d", p->dir);
- rc = X86EMUL_UNHANDLEABLE;
- }
+ if ( handler == NULL )
+ return;
- return rc;
+ handler->type = IOREQ_TYPE_PIO;
+ handler->ops = &dpci_portio_ops;
}
/*
diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
index 35f4f94..50fcf73 100644
--- a/xen/arch/x86/hvm/irq.c
+++ b/xen/arch/x86/hvm/irq.c
@@ -15,8 +15,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -218,7 +217,13 @@ void hvm_assert_evtchn_irq(struct vcpu *v)
return;
}
- if ( is_hvm_pv_evtchn_vcpu(v) )
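+ /* A registered per-vCPU upcall vector takes precedence over the global callback. */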
+ if ( v->arch.hvm_vcpu.evtchn_upcall_vector != 0 )
+ {
+ uint8_t vector = v->arch.hvm_vcpu.evtchn_upcall_vector;
+
+ vlapic_set_irq(vcpu_vlapic(v), vector, 0);
+ }
+ else if ( is_hvm_pv_evtchn_vcpu(v) )
vcpu_kick(v);
else if ( v->vcpu_id == 0 )
hvm_set_callback_irq_level(v);
diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c
index ee18553..aa7adcf 100644
--- a/xen/arch/x86/hvm/mtrr.c
+++ b/xen/arch/x86/hvm/mtrr.c
@@ -13,18 +13,15 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <public/hvm/e820.h>
-#include <xen/types.h>
+#include <xen/domain_page.h>
#include <asm/e820.h>
#include <asm/iocap.h>
-#include <asm/mm.h>
#include <asm/paging.h>
#include <asm/p2m.h>
-#include <xen/domain_page.h>
#include <asm/mtrr.h>
#include <asm/hvm/support.h>
#include <asm/hvm/cacheattr.h>
@@ -791,7 +788,7 @@ HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr,
void memory_type_changed(struct domain *d)
{
- if ( iommu_enabled && d->vcpu && d->vcpu[0] )
+ if ( need_iommu(d) && d->vcpu && d->vcpu[0] )
{
p2m_memory_type_changed(d);
flush_all(FLUSH_CACHE);
diff --git a/xen/arch/x86/hvm/nestedhvm.c b/xen/arch/x86/hvm/nestedhvm.c
index 964f58f..caad525 100644
--- a/xen/arch/x86/hvm/nestedhvm.c
+++ b/xen/arch/x86/hvm/nestedhvm.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <asm/msr.h>
diff --git a/xen/arch/x86/hvm/pmtimer.c b/xen/arch/x86/hvm/pmtimer.c
index 6ad2797..c8229e0 100644
--- a/xen/arch/x86/hvm/pmtimer.c
+++ b/xen/arch/x86/hvm/pmtimer.c
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <asm/hvm/vpt.h>
@@ -142,7 +141,7 @@ static void pmt_timer_callback(void *opaque)
/* Handle port I/O to the PM1a_STS and PM1a_EN registers */
static int handle_evt_io(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
struct vcpu *v = current;
PMTState *s = &v->domain->arch.hvm_domain.pl_time.vpmt;
@@ -205,7 +204,7 @@ static int handle_evt_io(
/* Handle port I/O to the TMR_VAL register */
static int handle_pmt_io(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
struct vcpu *v = current;
PMTState *s = &v->domain->arch.hvm_domain.pl_time.vpmt;
@@ -250,10 +249,12 @@ static int pmtimer_save(struct domain *d, hvm_domain_context_t *h)
spin_lock(&s->lock);
- /* Update the counter to the guest's current time. We always save
- * with the domain paused, so the saved time should be after the
- * last_gtime, but just in case, make sure we only go forwards */
- x = ((s->vcpu->arch.hvm_vcpu.guest_time - s->last_gtime) * s->scale) >> 32;
+ /*
+ * Update the counter to the guest's current time. Make sure it only
+ * goes forwards.
+ */
+ x = (((s->vcpu->arch.hvm_vcpu.guest_time ?: hvm_get_guest_time(s->vcpu)) -
+ s->last_gtime) * s->scale) >> 32;
if ( x < 1UL<<31 )
s->pm.tmr_val += x;
if ( (s->pm.tmr_val & TMR_VAL_MSB) != msb )
diff --git a/xen/arch/x86/hvm/quirks.c b/xen/arch/x86/hvm/quirks.c
index 37357dd..efe666a 100644
--- a/xen/arch/x86/hvm/quirks.c
+++ b/xen/arch/x86/hvm/quirks.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/arch/x86/hvm/rtc.c b/xen/arch/x86/hvm/rtc.c
index 3448971..a9efeaf 100644
--- a/xen/arch/x86/hvm/rtc.c
+++ b/xen/arch/x86/hvm/rtc.c
@@ -697,7 +697,7 @@ static uint32_t rtc_ioport_read(RTCState *s, uint32_t addr)
}
static int handle_rtc_io(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
struct RTCState *vrtc = vcpu_vrtc(current);
diff --git a/xen/arch/x86/hvm/save.c b/xen/arch/x86/hvm/save.c
index 6af19be..4660beb 100644
--- a/xen/arch/x86/hvm/save.c
+++ b/xen/arch/x86/hvm/save.c
@@ -17,8 +17,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <asm/hvm/support.h>
@@ -36,7 +35,7 @@ void arch_hvm_save(struct domain *d, struct hvm_save_header *hdr)
hdr->gtsc_khz = d->arch.tsc_khz;
/* Time when saving started */
- rdtscll(d->arch.hvm_domain.sync_tsc);
+ d->arch.hvm_domain.sync_tsc = rdtsc();
}
int arch_hvm_load(struct domain *d, struct hvm_save_header *hdr)
@@ -71,7 +70,7 @@ int arch_hvm_load(struct domain *d, struct hvm_save_header *hdr)
hvm_set_rdtsc_exiting(d, 1);
/* Time when restore started */
- rdtscll(d->arch.hvm_domain.sync_tsc);
+ d->arch.hvm_domain.sync_tsc = rdtsc();
/* VGA state is not saved/restored, so we nobble the cache. */
d->arch.hvm_domain.stdvga.cache = 0;
diff --git a/xen/arch/x86/hvm/stdvga.c b/xen/arch/x86/hvm/stdvga.c
index 19e80ed..f50bff7 100644
--- a/xen/arch/x86/hvm/stdvga.c
+++ b/xen/arch/x86/hvm/stdvga.c
@@ -148,7 +148,7 @@ static int stdvga_outb(uint64_t addr, uint8_t val)
}
else if ( prev_stdvga && !s->stdvga )
{
- gdprintk(XENLOG_INFO, "leaving stdvga\n");
+ gdprintk(XENLOG_INFO, "leaving stdvga mode\n");
}
return rc;
@@ -173,7 +173,7 @@ static void stdvga_out(uint32_t port, uint32_t bytes, uint32_t val)
}
static int stdvga_intercept_pio(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga;
@@ -275,9 +275,10 @@ static uint8_t stdvga_mem_readb(uint64_t addr)
return ret;
}
-static uint64_t stdvga_mem_read(uint64_t addr, uint64_t size)
+static int stdvga_mem_read(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size, uint64_t *p_data)
{
- uint64_t data = 0;
+ uint64_t data = ~0ul;
switch ( size )
{
@@ -309,11 +310,12 @@ static uint64_t stdvga_mem_read(uint64_t addr, uint64_t size)
break;
default:
- gdprintk(XENLOG_WARNING, "invalid io size: %"PRId64"\n", size);
+ gdprintk(XENLOG_WARNING, "invalid io size: %u\n", size);
break;
}
- return data;
+ *p_data = data;
+ return X86EMUL_OKAY;
}
static void stdvga_mem_writeb(uint64_t addr, uint32_t val)
@@ -424,8 +426,24 @@ static void stdvga_mem_writeb(uint64_t addr, uint32_t val)
}
}
-static void stdvga_mem_write(uint64_t addr, uint64_t data, uint64_t size)
+static int stdvga_mem_write(const struct hvm_io_handler *handler,
+ uint64_t addr, uint32_t size,
+ uint64_t data)
{
+ struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga;
+ ioreq_t p = {
+ .type = IOREQ_TYPE_COPY,
+ .addr = addr,
+ .size = size,
+ .count = 1,
+ .dir = IOREQ_WRITE,
+ .data = data,
+ };
+ struct hvm_ioreq_server *srv;
+
+ if ( !s->cache || !s->stdvga )
+ goto done;
+
/* Intercept mmio write */
switch ( size )
{
@@ -457,136 +475,79 @@ static void stdvga_mem_write(uint64_t addr, uint64_t data, uint64_t size)
break;
default:
- gdprintk(XENLOG_WARNING, "invalid io size: %"PRId64"\n", size);
+ gdprintk(XENLOG_WARNING, "invalid io size: %u\n", size);
break;
}
-}
-
-static uint32_t read_data;
-
-static int mmio_move(struct hvm_hw_stdvga *s, ioreq_t *p)
-{
- int i;
- uint64_t addr = p->addr;
- p2m_type_t p2mt;
- struct domain *d = current->domain;
- if ( p->data_is_ptr )
- {
- uint64_t data = p->data, tmp;
- int step = p->df ? -p->size : p->size;
-
- if ( p->dir == IOREQ_READ )
- {
- for ( i = 0; i < p->count; i++ )
- {
- tmp = stdvga_mem_read(addr, p->size);
- if ( hvm_copy_to_guest_phys(data, &tmp, p->size) !=
- HVMCOPY_okay )
- {
- struct page_info *dp = get_page_from_gfn(
- d, data >> PAGE_SHIFT, &p2mt, P2M_ALLOC);
- /*
- * The only case we handle is vga_mem <-> vga_mem.
- * Anything else disables caching and leaves it to qemu-dm.
- */
- if ( (p2mt != p2m_mmio_dm) || (data < VGA_MEM_BASE) ||
- ((data + p->size) > (VGA_MEM_BASE + VGA_MEM_SIZE)) )
- {
- if ( dp )
- put_page(dp);
- return 0;
- }
- ASSERT(!dp);
- stdvga_mem_write(data, tmp, p->size);
- }
- data += step;
- addr += step;
- }
- }
- else
- {
- for ( i = 0; i < p->count; i++ )
- {
- if ( hvm_copy_from_guest_phys(&tmp, data, p->size) !=
- HVMCOPY_okay )
- {
- struct page_info *dp = get_page_from_gfn(
- d, data >> PAGE_SHIFT, &p2mt, P2M_ALLOC);
- if ( (p2mt != p2m_mmio_dm) || (data < VGA_MEM_BASE) ||
- ((data + p->size) > (VGA_MEM_BASE + VGA_MEM_SIZE)) )
- {
- if ( dp )
- put_page(dp);
- return 0;
- }
- ASSERT(!dp);
- tmp = stdvga_mem_read(data, p->size);
- }
- stdvga_mem_write(addr, tmp, p->size);
- data += step;
- addr += step;
- }
- }
- }
- else
- {
- ASSERT(p->count == 1);
- if ( p->dir == IOREQ_READ )
- p->data = stdvga_mem_read(addr, p->size);
- else
- stdvga_mem_write(addr, p->data, p->size);
- }
+ done:
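+ /* Hand the write to the ioreq server owning this range, as a buffered request. */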
+ srv = hvm_select_ioreq_server(current->domain, &p);
+ if ( !srv )
+ return X86EMUL_UNHANDLEABLE;
- read_data = p->data;
- return 1;
+ return hvm_send_ioreq(srv, &p, 1);
}
-static int stdvga_intercept_mmio(ioreq_t *p)
+static bool_t stdvga_mem_accept(const struct hvm_io_handler *handler,
+ const ioreq_t *p)
{
- struct domain *d = current->domain;
- struct hvm_hw_stdvga *s = &d->arch.hvm_domain.stdvga;
- int buf = 0, rc;
+ struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga;
- if ( p->size > 8 )
- {
- gdprintk(XENLOG_WARNING, "invalid mmio size %d\n", (int)p->size);
- return X86EMUL_UNHANDLEABLE;
- }
+ /*
+ * The range check must be done without taking the lock, to avoid
+ * deadlock when hvm_mmio_internal() is called from
+ * hvm_copy_to/from_guest_phys() in hvm_process_io_intercept().
+ */
+ if ( (hvm_mmio_first_byte(p) < VGA_MEM_BASE) ||
+ (hvm_mmio_last_byte(p) >= (VGA_MEM_BASE + VGA_MEM_SIZE)) )
+ return 0;
spin_lock(&s->lock);
- if ( s->stdvga && s->cache )
+ if ( p->dir == IOREQ_WRITE && p->count > 1 )
{
- switch ( p->type )
+ /*
+ * We cannot return X86EMUL_UNHANDLEABLE on anything other than the
+ * first cycle of an I/O. So, since we cannot guarantee to always be
+ * able to send buffered writes, we have to reject any multi-cycle
+ * I/O and, since we are rejecting an I/O, we must invalidate the
+ * cache.
+ * Single-cycle write transactions are accepted even if the cache is
+ * not active since we can assert, when in stdvga mode, that writes
+ * to VRAM have no side effect and thus we can try to buffer them.
+ */
+ if ( s->cache )
{
- case IOREQ_TYPE_COPY:
- buf = mmio_move(s, p);
- if ( !buf )
- s->cache = 0;
- break;
- default:
- gdprintk(XENLOG_WARNING, "unsupported mmio request type:%d "
- "addr:0x%04x data:0x%04x size:%d count:%d state:%d "
- "isptr:%d dir:%d df:%d\n",
- p->type, (int)p->addr, (int)p->data, (int)p->size,
- (int)p->count, p->state,
- p->data_is_ptr, p->dir, p->df);
+ gdprintk(XENLOG_INFO, "leaving caching mode\n");
s->cache = 0;
}
+
+ goto reject;
}
- else
- {
- buf = (p->dir == IOREQ_WRITE);
- }
+ else if ( p->dir == IOREQ_READ && (!s->cache || !s->stdvga) )
+ goto reject;
- rc = (buf && hvm_buffered_io_send(p));
+ /* s->lock intentionally held */
+ return 1;
+ reject:
spin_unlock(&s->lock);
+ return 0;
+}
+
+static void stdvga_mem_complete(const struct hvm_io_handler *handler)
+{
+ struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga;
- return rc ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
+ spin_unlock(&s->lock);
}
+static const struct hvm_io_ops stdvga_mem_ops = {
+ .accept = stdvga_mem_accept,
+ .read = stdvga_mem_read,
+ .write = stdvga_mem_write,
+ .complete = stdvga_mem_complete
+};
+
void stdvga_init(struct domain *d)
{
struct hvm_hw_stdvga *s = &d->arch.hvm_domain.stdvga;
@@ -599,7 +560,7 @@ void stdvga_init(struct domain *d)
for ( i = 0; i != ARRAY_SIZE(s->vram_page); i++ )
{
- pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
+ pg = alloc_domheap_page(d, MEMF_no_owner);
if ( pg == NULL )
break;
s->vram_page[i] = pg;
@@ -610,13 +571,21 @@ void stdvga_init(struct domain *d)
if ( i == ARRAY_SIZE(s->vram_page) )
{
+ struct hvm_io_handler *handler;
+
/* Sequencer registers. */
register_portio_handler(d, 0x3c4, 2, stdvga_intercept_pio);
/* Graphics registers. */
register_portio_handler(d, 0x3ce, 2, stdvga_intercept_pio);
- /* MMIO. */
- register_buffered_io_handler(
- d, VGA_MEM_BASE, VGA_MEM_SIZE, stdvga_intercept_mmio);
+
+ /* VGA memory */
+ handler = hvm_next_io_handler(d);
+
+ if ( handler == NULL )
+ return;
+
+ handler->type = IOREQ_TYPE_COPY;
+ handler->ops = &stdvga_mem_ops;
}
}
@@ -633,3 +602,13 @@ void stdvga_deinit(struct domain *d)
s->vram_page[i] = NULL;
}
}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/hvm/svm/Makefile b/xen/arch/x86/hvm/svm/Makefile
index a10a55e..760d295 100644
--- a/xen/arch/x86/hvm/svm/Makefile
+++ b/xen/arch/x86/hvm/svm/Makefile
@@ -6,4 +6,3 @@ obj-y += nestedsvm.o
obj-y += svm.o
obj-y += svmdebug.o
obj-y += vmcb.o
-obj-y += vpmu.o
diff --git a/xen/arch/x86/hvm/svm/asid.c b/xen/arch/x86/hvm/svm/asid.c
index 344e143..b65be66 100644
--- a/xen/arch/x86/hvm/svm/asid.c
+++ b/xen/arch/x86/hvm/svm/asid.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/arch/x86/hvm/svm/emulate.c b/xen/arch/x86/hvm/svm/emulate.c
index 37a1ece..e3eb714 100644
--- a/xen/arch/x86/hvm/svm/emulate.c
+++ b/xen/arch/x86/hvm/svm/emulate.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -27,8 +26,6 @@
#include <asm/hvm/svm/vmcb.h>
#include <asm/hvm/svm/emulate.h>
-#define MAX_INST_LEN 15
-
static unsigned int is_prefix(u8 opc)
{
switch ( opc )
diff --git a/xen/arch/x86/hvm/svm/entry.S b/xen/arch/x86/hvm/svm/entry.S
index ef4d5f4..e816d66 100644
--- a/xen/arch/x86/hvm/svm/entry.S
+++ b/xen/arch/x86/hvm/svm/entry.S
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/arch/x86/hvm/svm/intr.c b/xen/arch/x86/hvm/svm/intr.c
index 023151a..1d97c1b 100644
--- a/xen/arch/x86/hvm/svm/intr.c
+++ b/xen/arch/x86/hvm/svm/intr.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/arch/x86/hvm/svm/nestedsvm.c b/xen/arch/x86/hvm/svm/nestedsvm.c
index be5797a..46f2532 100644
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
@@ -75,10 +74,20 @@ int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr)
nv->nv_vvmcxaddr = VMCX_EADDR;
}
- if (nv->nv_vvmcx == NULL) {
- nv->nv_vvmcx = hvm_map_guest_frame_rw(vmcbaddr >> PAGE_SHIFT, 1);
- if (nv->nv_vvmcx == NULL)
+ if ( !nv->nv_vvmcx )
+ {
+ bool_t writable;
+ void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(vmcbaddr), 1,
+ &writable);
+
+ if ( !vvmcx )
+ return 0;
+ if ( !writable )
+ {
+ hvm_unmap_guest_frame(vvmcx, 1);
return 0;
+ }
+ nv->nv_vvmcx = vvmcx;
nv->nv_vvmcxaddr = vmcbaddr;
}
@@ -246,7 +255,7 @@ static int nsvm_vcpu_hostsave(struct vcpu *v, unsigned int inst_len)
return 0;
}
-int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
+static int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
{
struct nestedvcpu *nv = &vcpu_nestedhvm(v);
struct nestedsvm *svm = &vcpu_nestedsvm(v);
@@ -274,7 +283,7 @@ int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
/* CR4 */
v->arch.hvm_vcpu.guest_cr[4] = n1vmcb->_cr4;
- rc = hvm_set_cr4(n1vmcb->_cr4);
+ rc = hvm_set_cr4(n1vmcb->_cr4, 1);
if (rc != X86EMUL_OKAY)
gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);
@@ -283,7 +292,7 @@ int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
svm->ns_cr0, v->arch.hvm_vcpu.guest_cr[0]);
v->arch.hvm_vcpu.guest_cr[0] = n1vmcb->_cr0 | X86_CR0_PE;
n1vmcb->rflags &= ~X86_EFLAGS_VM;
- rc = hvm_set_cr0(n1vmcb->_cr0 | X86_CR0_PE);
+ rc = hvm_set_cr0(n1vmcb->_cr0 | X86_CR0_PE, 1);
if (rc != X86EMUL_OKAY)
gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
svm->ns_cr0 = v->arch.hvm_vcpu.guest_cr[0];
@@ -309,7 +318,7 @@ int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
v->arch.guest_table = pagetable_null();
/* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
}
- rc = hvm_set_cr3(n1vmcb->_cr3);
+ rc = hvm_set_cr3(n1vmcb->_cr3, 1);
if (rc != X86EMUL_OKAY)
gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
@@ -534,7 +543,7 @@ static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
/* CR4 */
v->arch.hvm_vcpu.guest_cr[4] = ns_vmcb->_cr4;
- rc = hvm_set_cr4(ns_vmcb->_cr4);
+ rc = hvm_set_cr4(ns_vmcb->_cr4, 1);
if (rc != X86EMUL_OKAY)
gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);
@@ -542,7 +551,7 @@ static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
svm->ns_cr0 = v->arch.hvm_vcpu.guest_cr[0];
cr0 = nestedsvm_fpu_vmentry(svm->ns_cr0, ns_vmcb, n1vmcb, n2vmcb);
v->arch.hvm_vcpu.guest_cr[0] = ns_vmcb->_cr0;
- rc = hvm_set_cr0(cr0);
+ rc = hvm_set_cr0(cr0, 1);
if (rc != X86EMUL_OKAY)
gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
@@ -558,7 +567,7 @@ static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
/* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
- rc = hvm_set_cr3(ns_vmcb->_cr3);
+ rc = hvm_set_cr3(ns_vmcb->_cr3, 1);
if (rc != X86EMUL_OKAY)
gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
} else if (paging_mode_hap(v->domain)) {
@@ -570,7 +579,7 @@ static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
* we assume it intercepts page faults.
*/
/* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
- rc = hvm_set_cr3(ns_vmcb->_cr3);
+ rc = hvm_set_cr3(ns_vmcb->_cr3, 1);
if (rc != X86EMUL_OKAY)
gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
} else {
@@ -761,7 +770,7 @@ nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs)
return 0;
}
-int
+static int
nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs,
uint64_t exitcode)
{
@@ -821,21 +830,11 @@ nsvm_vcpu_vmexit_trap(struct vcpu *v, struct hvm_trap *trap)
return NESTEDHVM_VMEXIT_DONE;
}
-uint64_t nsvm_vcpu_guestcr3(struct vcpu *v)
-{
- return vcpu_nestedsvm(v).ns_vmcb_guestcr3;
-}
-
uint64_t nsvm_vcpu_hostcr3(struct vcpu *v)
{
return vcpu_nestedsvm(v).ns_vmcb_hostcr3;
}
-uint32_t nsvm_vcpu_asid(struct vcpu *v)
-{
- return vcpu_nestedsvm(v).ns_guest_asid;
-}
-
static int
nsvm_vmcb_guest_intercepts_msr(unsigned long *msr_bitmap,
uint32_t msr, bool_t write)
@@ -911,7 +910,7 @@ nsvm_vmcb_guest_intercepts_ioio(paddr_t iopm_pa, uint64_t exitinfo1)
return NESTEDHVM_VMEXIT_INJECT;
}
-int
+static bool_t
nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v,
struct cpu_user_regs *regs, uint64_t exitcode)
{
@@ -994,7 +993,7 @@ nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v,
return 1;
}
-int
+bool_t
nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr, int errcode)
{
return nsvm_vmcb_guest_intercepts_exitcode(v,
@@ -1231,7 +1230,7 @@ enum hvm_intblk nsvm_intr_blocked(struct vcpu *v)
* Delay the injection because this would result in delivering
* an interrupt *within* the execution of an instruction.
*/
- if ( v->arch.hvm_vcpu.hvm_io.io_state != HVMIO_none )
+ if ( v->arch.hvm_vcpu.hvm_io.io_req.state != STATE_IOREQ_NONE )
return hvm_intblk_shadow;
if ( !nv->nv_vmexit_pending && n2vmcb->exitintinfo.bytes != 0 ) {
@@ -1409,7 +1408,7 @@ nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs)
if (rc)
ret = NESTEDHVM_VMEXIT_ERROR;
- rc = nhvm_vcpu_hostrestore(v, regs);
+ rc = nsvm_vcpu_hostrestore(v, regs);
if (rc)
ret = NESTEDHVM_VMEXIT_FATALERROR;
@@ -1461,7 +1460,7 @@ nestedsvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs,
/* Prepare for running the l1 guest. Make the actual
* modifications to the virtual VMCB/VMCS.
*/
- rc = nhvm_vcpu_vmexit(v, regs, exitcode);
+ rc = nsvm_vcpu_vmexit_inject(v, regs, exitcode);
/* If l1 guest uses shadow paging, update the paging mode. */
if (!nestedhvm_paging_mode_hap(v))
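The nestedsvm_vmcb_map() change reflects the new hvm_map_guest_frame_rw() contract: the mapping call now reports read-only mappings through a bool_t *writable out-parameter rather than failing outright, so callers that need write access must check the flag and unmap again on the read-only path. A standalone sketch of that map/check/unmap discipline; the map_frame_rw()/unmap_frame() stubs below are toy stand-ins, not the real hypervisor calls:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Toy "mapping" layer: a mapping is a malloc'd page, and pfn 0 is
     * arbitrarily treated as read-only. */
    static void *map_frame_rw(uint64_t pfn, bool *writable)
    {
        *writable = (pfn != 0);
        return malloc(4096);
    }

    static void unmap_frame(void *va)
    {
        free(va);
    }

    /* The pattern nestedsvm_vmcb_map() now follows: map first, then
     * check the writability flag, and unmap again before failing. */
    static void *map_vmcb(uint64_t vmcbaddr)
    {
        bool writable;
        void *va = map_frame_rw(vmcbaddr >> 12, &writable);

        if ( va == NULL )
            return NULL;
        if ( !writable )
        {
            unmap_frame(va);              /* don't leak the mapping */
            return NULL;
        }
        return va;
    }

    int main(void)
    {
        void *ok = map_vmcb(0x5000);      /* pfn 5: writable */
        void *bad = map_vmcb(0x0);        /* pfn 0: read-only, rejected */

        printf("writable map: %s, read-only map: %s\n",
               ok ? "mapped" : "rejected", bad ? "mapped" : "rejected");
        if ( ok )
            unmap_frame(ok);
        return 0;
    }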
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index a7655bd..8de41fa 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -36,12 +35,11 @@
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/amd.h>
-#include <asm/types.h>
+#include <asm/guest_access.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/i387.h>
#include <asm/iocap.h>
-#include <asm/spinlock.h>
#include <asm/hvm/emulate.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
@@ -106,7 +104,7 @@ void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
if ( unlikely(inst_len == 0) )
return;
- if ( unlikely(inst_len > 15) )
+ if ( unlikely(inst_len > MAX_INST_LEN) )
{
gdprintk(XENLOG_ERR, "Bad instruction length %u\n", inst_len);
svm_crash_or_fault(curr);
@@ -805,7 +803,7 @@ static void svm_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc)
if ( at_tsc )
host_tsc = at_tsc;
else
- rdtscll(host_tsc);
+ host_tsc = rdtsc();
offset = svm_get_tsc_offset(host_tsc, guest_tsc, vcpu_tsc_ratio(v));
}
@@ -859,7 +857,7 @@ static unsigned int svm_get_insn_bytes(struct vcpu *v, uint8_t *buf)
if ( len != 0 )
{
/* Latch and clear the cached instruction. */
- memcpy(buf, vmcb->guest_ins, 15);
+ memcpy(buf, vmcb->guest_ins, MAX_INST_LEN);
v->arch.hvm_svm.cached_insn_len = 0;
}
@@ -1166,7 +1164,9 @@ static int svm_vcpu_initialise(struct vcpu *v)
return rc;
}
- vpmu_initialise(v);
+ /* PVH's VPMU is initialized via hypercall */
+ if ( is_hvm_vcpu(v) )
+ vpmu_initialise(v);
svm_guest_osvw_init(v);
@@ -1707,7 +1707,8 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
case MSR_AMD_FAM15H_EVNTSEL3:
case MSR_AMD_FAM15H_EVNTSEL4:
case MSR_AMD_FAM15H_EVNTSEL5:
- vpmu_do_rdmsr(msr, msr_content);
+ if ( vpmu_do_rdmsr(msr, msr_content) )
+ goto gpf;
break;
case MSR_AMD64_DR0_ADDRESS_MASK:
@@ -1858,7 +1859,8 @@ static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
case MSR_AMD_FAM15H_EVNTSEL3:
case MSR_AMD_FAM15H_EVNTSEL4:
case MSR_AMD_FAM15H_EVNTSEL5:
- vpmu_do_wrmsr(msr, msr_content, 0);
+ if ( vpmu_do_wrmsr(msr, msr_content, 0) )
+ goto gpf;
break;
case MSR_IA32_MCx_MISC(4): /* Threshold register */
@@ -1946,7 +1948,7 @@ static void svm_do_msr_access(struct cpu_user_regs *regs)
if ( (inst_len = __get_instruction_length(v, INSTR_WRMSR)) == 0 )
return;
msr_content = ((uint64_t)regs->edx << 32) | (uint32_t)regs->eax;
- rc = hvm_msr_write_intercept(regs->ecx, msr_content);
+ rc = hvm_msr_write_intercept(regs->ecx, msr_content, 1);
}
if ( rc == X86EMUL_OKAY )
@@ -1989,7 +1991,7 @@ static void svm_vmexit_do_pause(struct cpu_user_regs *regs)
* Do something useful, like reschedule the guest
*/
perfc_incr(pauseloop_exits);
- do_sched_op_compat(SCHEDOP_yield, 0);
+ do_sched_op(SCHEDOP_yield, guest_handle_from_ptr(NULL, void));
}
static void
@@ -2275,12 +2277,8 @@ static struct hvm_function_table __initdata svm_function_table = {
.nhvm_vcpu_initialise = nsvm_vcpu_initialise,
.nhvm_vcpu_destroy = nsvm_vcpu_destroy,
.nhvm_vcpu_reset = nsvm_vcpu_reset,
- .nhvm_vcpu_hostrestore = nsvm_vcpu_hostrestore,
- .nhvm_vcpu_vmexit = nsvm_vcpu_vmexit_inject,
.nhvm_vcpu_vmexit_trap = nsvm_vcpu_vmexit_trap,
- .nhvm_vcpu_guestcr3 = nsvm_vcpu_guestcr3,
.nhvm_vcpu_p2m_base = nsvm_vcpu_hostcr3,
- .nhvm_vcpu_asid = nsvm_vcpu_asid,
.nhvm_vmcx_guest_intercepts_trap = nsvm_vmcb_guest_intercepts_trap,
.nhvm_vmcx_hap_enabled = nsvm_vmcb_hap_enabled,
.nhvm_intr_blocked = nsvm_intr_blocked,
@@ -2378,6 +2376,7 @@ void svm_vmexit_handler(struct cpu_user_regs *regs)
case NESTEDHVM_VMEXIT_ERROR:
break;
}
+ /* fallthrough */
case NESTEDHVM_VMEXIT_ERROR:
gdprintk(XENLOG_ERR,
"nestedsvm_check_intercepts() returned NESTEDHVM_VMEXIT_ERROR\n");
diff --git a/xen/arch/x86/hvm/svm/svmdebug.c b/xen/arch/x86/hvm/svm/svmdebug.c
index ce7d055..ded5d19 100644
--- a/xen/arch/x86/hvm/svm/svmdebug.c
+++ b/xen/arch/x86/hvm/svm/svmdebug.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/xen/arch/x86/hvm/svm/vmcb.c b/xen/arch/x86/hvm/svm/vmcb.c
index 21292bb..b5d7165 100644
--- a/xen/arch/x86/hvm/svm/vmcb.c
+++ b/xen/arch/x86/hvm/svm/vmcb.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
@@ -118,7 +117,7 @@ static int construct_vmcb(struct vcpu *v)
svm_disable_intercept_for_msr(v, MSR_AMD64_LWP_CBADDR);
vmcb->_msrpm_base_pa = (u64)virt_to_maddr(arch_svm->msrpm);
- vmcb->_iopm_base_pa = (u64)virt_to_maddr(hvm_io_bitmap);
+ vmcb->_iopm_base_pa = __pa(v->domain->arch.hvm_domain.io_bitmap);
/* Virtualise EFLAGS.IF and LAPIC TPR (CR8). */
vmcb->_vintr.fields.intr_masking = 1;
diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c
index d3c681b..d348235 100644
--- a/xen/arch/x86/hvm/vioapic.c
+++ b/xen/arch/x86/hvm/vioapic.c
@@ -18,8 +18,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Yunhong Jiang <yunhong.jiang at intel.com>
* Ported to xen by using virtual IRQ line.
@@ -46,27 +45,33 @@
static void vioapic_deliver(struct hvm_hw_vioapic *vioapic, int irq);
-static unsigned long vioapic_read_indirect(struct hvm_hw_vioapic *vioapic,
- unsigned long addr,
- unsigned long length)
+static uint32_t vioapic_read_indirect(const struct hvm_hw_vioapic *vioapic)
{
- unsigned long result = 0;
+ uint32_t result = 0;
switch ( vioapic->ioregsel )
{
case VIOAPIC_REG_VERSION:
- result = ((((VIOAPIC_NUM_PINS-1) & 0xff) << 16)
- | (VIOAPIC_VERSION_ID & 0xff));
+ result = ((union IO_APIC_reg_01){
+ .bits = { .version = VIOAPIC_VERSION_ID,
+ .entries = VIOAPIC_NUM_PINS - 1 }
+ }).raw;
break;
case VIOAPIC_REG_APIC_ID:
+ /*
+ * Using union IO_APIC_reg_02 for the ID register too, as
+ * union IO_APIC_reg_00's ID field is 8 bits wide for some reason.
+ */
case VIOAPIC_REG_ARB_ID:
- result = ((vioapic->id & 0xf) << 24);
+ result = ((union IO_APIC_reg_02){
+ .bits = { .arbitration = vioapic->id }
+ }).raw;
break;
default:
{
- uint32_t redir_index = (vioapic->ioregsel - 0x10) >> 1;
+ uint32_t redir_index = (vioapic->ioregsel - VIOAPIC_REG_RTE0) >> 1;
uint64_t redir_content;
if ( redir_index >= VIOAPIC_NUM_PINS )
@@ -77,9 +82,8 @@ static unsigned long vioapic_read_indirect(struct hvm_hw_vioapic *vioapic,
}
redir_content = vioapic->redirtbl[redir_index].bits;
- result = (vioapic->ioregsel & 0x1)?
- (redir_content >> 32) & 0xffffffff :
- redir_content & 0xffffffff;
+ result = (vioapic->ioregsel & 1) ? (redir_content >> 32)
+ : redir_content;
break;
}
}
@@ -89,23 +93,21 @@ static unsigned long vioapic_read_indirect(struct hvm_hw_vioapic *vioapic,
static int vioapic_read(
struct vcpu *v, unsigned long addr,
- unsigned long length, unsigned long *pval)
+ unsigned int length, unsigned long *pval)
{
- struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain);
+ const struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain);
uint32_t result;
HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "addr %lx", addr);
- addr &= 0xff;
-
- switch ( addr )
+ switch ( addr & 0xff )
{
case VIOAPIC_REG_SELECT:
result = vioapic->ioregsel;
break;
case VIOAPIC_REG_WINDOW:
- result = vioapic_read_indirect(vioapic, addr, length);
+ result = vioapic_read_indirect(vioapic);
break;
default:
@@ -169,7 +171,7 @@ static void vioapic_write_redirent(
}
static void vioapic_write_indirect(
- struct hvm_hw_vioapic *vioapic, unsigned long length, unsigned long val)
+ struct hvm_hw_vioapic *vioapic, uint32_t val)
{
switch ( vioapic->ioregsel )
{
@@ -178,7 +180,12 @@ static void vioapic_write_indirect(
break;
case VIOAPIC_REG_APIC_ID:
- vioapic->id = (val >> 24) & 0xf;
+ /*
+ * Presumably because we emulate an Intel IOAPIC which only has a
+ * 4 bit ID field (compared to 8 for AMD), using union IO_APIC_reg_02
+ * for the ID register (union IO_APIC_reg_00's ID field is 8 bits).
+ */
+ vioapic->id = ((union IO_APIC_reg_02){ .raw = val }).bits.arbitration;
break;
case VIOAPIC_REG_ARB_ID:
@@ -186,10 +193,10 @@ static void vioapic_write_indirect(
default:
{
- uint32_t redir_index = (vioapic->ioregsel - 0x10) >> 1;
+ uint32_t redir_index = (vioapic->ioregsel - VIOAPIC_REG_RTE0) >> 1;
- HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "change redir index %x val %lx",
- redir_index, val);
+ HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "rte[%02x].%s = %08x",
+ redir_index, vioapic->ioregsel & 1 ? "hi" : "lo", val);
if ( redir_index >= VIOAPIC_NUM_PINS )
{
@@ -207,20 +214,18 @@ static void vioapic_write_indirect(
static int vioapic_write(
struct vcpu *v, unsigned long addr,
- unsigned long length, unsigned long val)
+ unsigned int length, unsigned long val)
{
struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain);
- addr &= 0xff;
-
- switch ( addr )
+ switch ( addr & 0xff )
{
case VIOAPIC_REG_SELECT:
vioapic->ioregsel = val;
break;
case VIOAPIC_REG_WINDOW:
- vioapic_write_indirect(vioapic, length, val);
+ vioapic_write_indirect(vioapic, val);
break;
#if VIOAPIC_VERSION_ID >= 0x20
@@ -244,10 +249,10 @@ static int vioapic_range(struct vcpu *v, unsigned long addr)
(addr < vioapic->base_address + VIOAPIC_MEM_LENGTH)));
}
-const struct hvm_mmio_handler vioapic_mmio_handler = {
- .check_handler = vioapic_range,
- .read_handler = vioapic_read,
- .write_handler = vioapic_write
+static const struct hvm_mmio_ops vioapic_mmio_ops = {
+ .check = vioapic_range,
+ .read = vioapic_read,
+ .write = vioapic_write
};
static void ioapic_inj_irq(
@@ -380,7 +385,7 @@ void vioapic_irq_positive_edge(struct domain *d, unsigned int irq)
}
}
-void vioapic_update_EOI(struct domain *d, int vector)
+void vioapic_update_EOI(struct domain *d, u8 vector)
{
struct hvm_hw_vioapic *vioapic = domain_vioapic(d);
struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
@@ -450,6 +455,8 @@ int vioapic_init(struct domain *d)
d->arch.hvm_domain.vioapic->domain = d;
vioapic_reset(d);
+ register_mmio_handler(d, &vioapic_mmio_ops);
+
return 0;
}
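The vioapic_read_indirect() rewrite replaces open-coded shifts with the IO_APIC_reg_01/IO_APIC_reg_02 unions, but the wire format is unchanged: the VERSION register keeps the version ID in bits 0-7 and (number of pins - 1) in bits 16-23, while the ID/arbitration registers keep a 4-bit ID in bits 24-27. A quick standalone check of those encodings, assuming the usual VIOAPIC_VERSION_ID of 0x11 and 48 pins:

    #include <stdint.h>
    #include <stdio.h>

    #define VIOAPIC_VERSION_ID 0x11   /* assumed values for the example */
    #define VIOAPIC_NUM_PINS   48

    int main(void)
    {
        /* VERSION: version in bits 0-7, (entries - 1) in bits 16-23. */
        uint32_t version = ((uint32_t)(VIOAPIC_NUM_PINS - 1) << 16) |
                           VIOAPIC_VERSION_ID;

        /* ID/arbitration: 4-bit ID in bits 24-27. */
        uint32_t id = 7;
        uint32_t id_reg = (id & 0xf) << 24;

        printf("VERSION register: %#010x\n", (unsigned int)version);
        /* prints 0x002f0011 */
        printf("ID register:      %#010x\n", (unsigned int)id_reg);
        /* prints 0x07000000 */
        return 0;
    }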
diff --git a/xen/arch/x86/hvm/viridian.c b/xen/arch/x86/hvm/viridian.c
index 3197b6b..2f22783 100644
--- a/xen/arch/x86/hvm/viridian.c
+++ b/xen/arch/x86/hvm/viridian.c
@@ -9,6 +9,7 @@
#include <xen/perfc.h>
#include <xen/hypercall.h>
#include <xen/domain_page.h>
+#include <asm/guest_access.h>
#include <asm/paging.h>
#include <asm/p2m.h>
#include <asm/apic.h>
@@ -21,6 +22,7 @@
#define VIRIDIAN_MSR_HYPERCALL 0x40000001
#define VIRIDIAN_MSR_VP_INDEX 0x40000002
#define VIRIDIAN_MSR_TIME_REF_COUNT 0x40000020
+#define VIRIDIAN_MSR_REFERENCE_TSC 0x40000021
#define VIRIDIAN_MSR_TSC_FREQUENCY 0x40000022
#define VIRIDIAN_MSR_APIC_FREQUENCY 0x40000023
#define VIRIDIAN_MSR_EOI 0x40000070
@@ -40,6 +42,7 @@
#define CPUID3A_MSR_APIC_ACCESS (1 << 4)
#define CPUID3A_MSR_HYPERCALL (1 << 5)
#define CPUID3A_MSR_VP_INDEX (1 << 6)
+#define CPUID3A_MSR_REFERENCE_TSC (1 << 9)
#define CPUID3A_MSR_FREQ (1 << 11)
/* Viridian CPUID 4000004, Implementation Recommendations. */
@@ -95,6 +98,8 @@ int cpuid_viridian_leaves(unsigned int leaf, unsigned int *eax,
*eax |= CPUID3A_MSR_FREQ;
if ( viridian_feature_mask(d) & HVMPV_time_ref_count )
*eax |= CPUID3A_MSR_TIME_REF_COUNT;
+ if ( viridian_feature_mask(d) & HVMPV_reference_tsc )
+ *eax |= CPUID3A_MSR_REFERENCE_TSC;
break;
case 4:
/* Recommended hypercall usage. */
@@ -155,6 +160,17 @@ static void dump_apic_assist(const struct vcpu *v)
v, aa->fields.enabled, (unsigned long)aa->fields.pfn);
}
+static void dump_reference_tsc(const struct domain *d)
+{
+ const union viridian_reference_tsc *rt;
+
+ rt = &d->arch.hvm_domain.viridian.reference_tsc;
+
+ printk(XENLOG_G_INFO "d%d: VIRIDIAN REFERENCE_TSC: enabled: %x pfn: %lx\n",
+ d->domain_id,
+ rt->fields.enabled, (unsigned long)rt->fields.pfn);
+}
+
static void enable_hypercall_page(struct domain *d)
{
unsigned long gmfn = d->arch.hvm_domain.viridian.hypercall_gpa.fields.pfn;
@@ -224,6 +240,79 @@ static void initialize_apic_assist(struct vcpu *v)
put_page_and_type(page);
}
+static void update_reference_tsc(struct domain *d, bool_t initialize)
+{
+ unsigned long gmfn = d->arch.hvm_domain.viridian.reference_tsc.fields.pfn;
+ struct page_info *page = get_page_from_gfn(d, gmfn, NULL, P2M_ALLOC);
+ HV_REFERENCE_TSC_PAGE *p;
+
+ if ( !page || !get_page_type(page, PGT_writable_page) )
+ {
+ if ( page )
+ put_page(page);
+ gdprintk(XENLOG_WARNING, "Bad GMFN %lx (MFN %lx)\n", gmfn,
+ page ? page_to_mfn(page) : INVALID_MFN);
+ return;
+ }
+
+ p = __map_domain_page(page);
+
+ if ( initialize )
+ clear_page(p);
+
+ /*
+ * This enlightenment must be disabled if the host TSC is not invariant.
+ * However it is also disabled if vtsc is true (which means rdtsc is being
+ * emulated). This generally happens when guest TSC freq and host TSC freq
+ * don't match. The TscScale value could be adjusted to cope with this,
+ * allowing vtsc to be turned off, but support for this is not yet present
+ * in the hypervisor. Thus it is possible that migrating a Windows VM
+ * between hosts of differing TSC frequencies may result in large
+ * differences in guest performance.
+ */
+ if ( !host_tsc_is_safe() || d->arch.vtsc )
+ {
+ /*
+ * The specification states that valid values of TscSequence range
+ * from 0 to 0xFFFFFFFE. The value 0xFFFFFFFF is used to indicate
+ * this mechanism is no longer a reliable source of time and that
+ * the VM should fall back to a different source.
+ *
+ * Server 2012 (6.2 kernel) and 2012 R2 (6.3 kernel) actually violate
+ * the spec. and rely on a value of 0 to indicate that this
+ * enlightenment should no longer be used. These two kernel
+ * versions are currently the only ones to make use of this
+ * enlightenment, so just use 0 here.
+ */
+ p->TscSequence = 0;
+
+ printk(XENLOG_G_INFO "d%d: VIRIDIAN REFERENCE_TSC: invalidated\n",
+ d->domain_id);
+ goto out;
+ }
+
+ /*
+ * The guest will calculate reference time according to the following
+ * formula:
+ *
+ * ReferenceTime = ((RDTSC() * TscScale) >> 64) + TscOffset
+ *
+ * Windows uses a 100ns tick, so we need a scale which is cpu
+ * ticks per 100ns shifted left by 64.
+ */
+ p->TscScale = ((10000ul << 32) / d->arch.tsc_khz) << 32;
+
+ p->TscSequence++;
+ if ( p->TscSequence == 0xFFFFFFFF ||
+ p->TscSequence == 0 ) /* Avoid both 'invalid' values */
+ p->TscSequence = 1;
+
+ out:
+ unmap_domain_page(p);
+
+ put_page_and_type(page);
+}
+
int wrmsr_viridian_regs(uint32_t idx, uint64_t val)
{
struct vcpu *v = current;
@@ -282,6 +371,17 @@ int wrmsr_viridian_regs(uint32_t idx, uint64_t val)
initialize_apic_assist(v);
break;
+ case VIRIDIAN_MSR_REFERENCE_TSC:
+ if ( !(viridian_feature_mask(d) & HVMPV_reference_tsc) )
+ return 0;
+
+ perfc_incr(mshv_wrmsr_tsc_msr);
+ d->arch.hvm_domain.viridian.reference_tsc.raw = val;
+ dump_reference_tsc(d);
+ if ( d->arch.hvm_domain.viridian.reference_tsc.fields.enabled )
+ update_reference_tsc(d, 1);
+ break;
+
default:
return 0;
}
@@ -379,6 +479,14 @@ int rdmsr_viridian_regs(uint32_t idx, uint64_t *val)
*val = v->arch.hvm_vcpu.viridian.apic_assist.raw;
break;
+ case VIRIDIAN_MSR_REFERENCE_TSC:
+ if ( !(viridian_feature_mask(d) & HVMPV_reference_tsc) )
+ return 0;
+
+ perfc_incr(mshv_rdmsr_tsc_msr);
+ *val = d->arch.hvm_domain.viridian.reference_tsc.raw;
+ break;
+
case VIRIDIAN_MSR_TIME_REF_COUNT:
{
struct viridian_time_ref_count *trc;
@@ -454,7 +562,7 @@ int viridian_hypercall(struct cpu_user_regs *regs)
{
case HvNotifyLongSpinWait:
perfc_incr(mshv_call_long_wait);
- do_sched_op_compat(SCHEDOP_yield, 0);
+ do_sched_op(SCHEDOP_yield, guest_handle_from_ptr(NULL, void));
status = HV_STATUS_SUCCESS;
break;
default:
@@ -487,6 +595,7 @@ static int viridian_save_domain_ctxt(struct domain *d, hvm_domain_context_t *h)
ctxt.time_ref_count = d->arch.hvm_domain.viridian.time_ref_count.val;
ctxt.hypercall_gpa = d->arch.hvm_domain.viridian.hypercall_gpa.raw;
ctxt.guest_os_id = d->arch.hvm_domain.viridian.guest_os_id.raw;
+ ctxt.reference_tsc = d->arch.hvm_domain.viridian.reference_tsc.raw;
return (hvm_save_entry(VIRIDIAN_DOMAIN, 0, h, &ctxt) != 0);
}
@@ -501,6 +610,10 @@ static int viridian_load_domain_ctxt(struct domain *d, hvm_domain_context_t *h)
d->arch.hvm_domain.viridian.time_ref_count.val = ctxt.time_ref_count;
d->arch.hvm_domain.viridian.hypercall_gpa.raw = ctxt.hypercall_gpa;
d->arch.hvm_domain.viridian.guest_os_id.raw = ctxt.guest_os_id;
+ d->arch.hvm_domain.viridian.reference_tsc.raw = ctxt.reference_tsc;
+
+ if ( d->arch.hvm_domain.viridian.reference_tsc.fields.enabled )
+ update_reference_tsc(d, 0);
return 0;
}
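The TscScale value written by update_reference_tsc() is a 64.64 fixed-point count of Windows 100ns units per TSC tick, computed in two 32-bit halves to sidestep a 128-bit divide. A small standalone check of the arithmetic, assuming a 2 GHz TSC (tsc_khz = 2000000); one second of TSC ticks should come out as 1000ms of reference time:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t tsc_khz = 2000000;           /* assumed: 2 GHz TSC */

        /* Same expression as update_reference_tsc(): 10000 100ns units
         * per millisecond, divided by kHz, as a 64.64 fraction. */
        uint64_t scale = ((10000ull << 32) / tsc_khz) << 32;

        /* Guest-side formula: ReferenceTime = (TSC * TscScale) >> 64.
         * unsigned __int128 (a GCC/Clang extension) gives us the
         * 64x64->128 multiply. */
        uint64_t tsc = 2000000000ull;         /* one second of ticks */
        uint64_t ref = (uint64_t)(((unsigned __int128)tsc * scale) >> 64);

        printf("TscScale = %#llx\n", (unsigned long long)scale);
        printf("1s of TSC -> %llu x 100ns = %llu ms\n",
               (unsigned long long)ref,
               (unsigned long long)(ref / 10000));  /* expect ~1000 ms */
        return 0;
    }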
diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c
index 72b6509..b893b40 100644
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -33,6 +32,7 @@
#include <asm/page.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
+#include <asm/vpmu.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/io.h>
#include <asm/hvm/support.h>
@@ -420,18 +420,17 @@ void vlapic_EOI_set(struct vlapic *vlapic)
if ( hvm_funcs.handle_eoi )
hvm_funcs.handle_eoi(vector);
- if ( vlapic_test_and_clear_vector(vector, &vlapic->regs->data[APIC_TMR]) )
- vioapic_update_EOI(vlapic_domain(vlapic), vector);
-
- hvm_dpci_msi_eoi(current->domain, vector);
+ vlapic_handle_EOI(vlapic, vector);
}
-void vlapic_handle_EOI_induced_exit(struct vlapic *vlapic, int vector)
+void vlapic_handle_EOI(struct vlapic *vlapic, u8 vector)
{
+ struct domain *d = vlapic_domain(vlapic);
+
if ( vlapic_test_and_clear_vector(vector, &vlapic->regs->data[APIC_TMR]) )
- vioapic_update_EOI(vlapic_domain(vlapic), vector);
+ vioapic_update_EOI(d, vector);
- hvm_dpci_msi_eoi(current->domain, vector);
+ hvm_dpci_msi_eoi(d, vector);
}
static bool_t is_multicast_dest(struct vlapic *vlapic, unsigned int short_hand,
@@ -556,52 +555,42 @@ static void vlapic_set_tdcr(struct vlapic *vlapic, unsigned int val)
"timer_divisor: %d", vlapic->hw.timer_divisor);
}
-static void vlapic_read_aligned(
- struct vlapic *vlapic, unsigned int offset, unsigned int *result)
+static uint32_t vlapic_read_aligned(struct vlapic *vlapic, unsigned int offset)
{
switch ( offset )
{
case APIC_PROCPRI:
- *result = vlapic_get_ppr(vlapic);
- break;
+ return vlapic_get_ppr(vlapic);
case APIC_TMCCT: /* Timer CCR */
if ( !vlapic_lvtt_oneshot(vlapic) && !vlapic_lvtt_period(vlapic) )
- {
- *result = 0;
break;
- }
- *result = vlapic_get_tmcct(vlapic);
- break;
+ return vlapic_get_tmcct(vlapic);
case APIC_TMICT: /* Timer ICR */
if ( !vlapic_lvtt_oneshot(vlapic) && !vlapic_lvtt_period(vlapic) )
- {
- *result = 0;
break;
- }
+ /* fall through */
default:
- *result = vlapic_get_reg(vlapic, offset);
- break;
+ return vlapic_get_reg(vlapic, offset);
}
+
+ return 0;
}
static int vlapic_read(
struct vcpu *v, unsigned long address,
- unsigned long len, unsigned long *pval)
+ unsigned int len, unsigned long *pval)
{
- unsigned int alignment;
- unsigned int tmp;
- unsigned long result = 0;
struct vlapic *vlapic = vcpu_vlapic(v);
unsigned int offset = address - vlapic_base_address(vlapic);
+ unsigned int alignment = offset & 3, tmp, result = 0;
if ( offset > (APIC_TDCR + 0x3) )
goto out;
- alignment = offset & 0x3;
+ tmp = vlapic_read_aligned(vlapic, offset & ~3);
- vlapic_read_aligned(vlapic, offset & ~0x3, &tmp);
switch ( len )
{
case 1:
@@ -621,20 +610,20 @@ static int vlapic_read(
break;
default:
- gdprintk(XENLOG_ERR, "Local APIC read with len=%#lx, "
+ gdprintk(XENLOG_ERR, "Local APIC read with len=%#x, "
"should be 4 instead.\n", len);
goto exit_and_crash;
}
- HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "offset %#x with length %#lx, "
- "and the result is %#lx", offset, len, result);
+ HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "offset %#x with length %#x, "
+ "and the result is %#x", offset, len, result);
out:
*pval = result;
return X86EMUL_OKAY;
unaligned_exit_and_crash:
- gdprintk(XENLOG_ERR, "Unaligned LAPIC read len=%#lx at offset=%#x.\n",
+ gdprintk(XENLOG_ERR, "Unaligned LAPIC read len=%#x at offset=%#x.\n",
len, offset);
exit_and_crash:
domain_crash(v->domain);
@@ -643,45 +632,31 @@ static int vlapic_read(
int hvm_x2apic_msr_read(struct vcpu *v, unsigned int msr, uint64_t *msr_content)
{
+ static const unsigned long readable[] =
+ {
+#define REG(x) (1UL << (APIC_ ## x >> 4))
+ REG(ID) | REG(LVR) | REG(TASKPRI) | REG(PROCPRI) |
+ REG(LDR) | REG(SPIV) | REG(ESR) | REG(ICR) |
+ REG(CMCI) | REG(LVTT) | REG(LVTTHMR) | REG(LVTPC) |
+ REG(LVT0) | REG(LVT1) | REG(LVTERR) | REG(TMICT) |
+ REG(TMCCT) | REG(TDCR) |
+#undef REG
+#define REGBLOCK(x) (((1UL << (NR_VECTORS / 32)) - 1) << (APIC_ ## x >> 4))
+ REGBLOCK(ISR) | REGBLOCK(TMR) | REGBLOCK(IRR)
+#undef REGBLOCK
+ };
struct vlapic *vlapic = vcpu_vlapic(v);
- uint32_t low, high = 0, offset = (msr - MSR_IA32_APICBASE_MSR) << 4;
+ uint32_t high = 0, reg = msr - MSR_IA32_APICBASE_MSR, offset = reg << 4;
- if ( !vlapic_x2apic_mode(vlapic) )
+ if ( !vlapic_x2apic_mode(vlapic) ||
+ (reg >= sizeof(readable) * 8) || !test_bit(reg, readable) )
return X86EMUL_UNHANDLEABLE;
- switch ( offset )
- {
- case APIC_ICR:
- vlapic_read_aligned(vlapic, APIC_ICR2, &high);
- /* Fallthrough. */
- case APIC_ID:
- case APIC_LVR:
- case APIC_TASKPRI:
- case APIC_PROCPRI:
- case APIC_LDR:
- case APIC_SPIV:
- case APIC_ISR ... APIC_ISR + 0x70:
- case APIC_TMR ... APIC_TMR + 0x70:
- case APIC_IRR ... APIC_IRR + 0x70:
- case APIC_ESR:
- case APIC_CMCI:
- case APIC_LVTT:
- case APIC_LVTTHMR:
- case APIC_LVTPC:
- case APIC_LVT0:
- case APIC_LVT1:
- case APIC_LVTERR:
- case APIC_TMICT:
- case APIC_TMCCT:
- case APIC_TDCR:
- vlapic_read_aligned(vlapic, offset, &low);
- break;
+ if ( offset == APIC_ICR )
+ high = vlapic_read_aligned(vlapic, APIC_ICR2);
- default:
- return X86EMUL_UNHANDLEABLE;
- }
-
- *msr_content = (((uint64_t)high) << 32) | low;
+ *msr_content = ((uint64_t)high << 32) |
+ vlapic_read_aligned(vlapic, offset);
return X86EMUL_OKAY;
}
@@ -699,7 +674,7 @@ static void vlapic_tdt_pt_cb(struct vcpu *v, void *data)
}
static int vlapic_reg_write(struct vcpu *v,
- unsigned int offset, unsigned long val)
+ unsigned int offset, uint32_t val)
{
struct vlapic *vlapic = vcpu_vlapic(v);
int rc = X86EMUL_OKAY;
@@ -773,6 +748,7 @@ static int vlapic_reg_write(struct vcpu *v,
vlapic->hw.tdt_msr = 0;
}
vlapic->pt.irq = val & APIC_VECTOR_MASK;
+ /* fallthrough */
case APIC_LVTTHMR: /* LVT Thermal Monitor */
case APIC_LVTPC: /* LVT Performance Counter */
case APIC_LVT0: /* LVT LINT0 Reg */
@@ -789,6 +765,8 @@ static int vlapic_reg_write(struct vcpu *v,
}
if ( (offset == APIC_LVTT) && !(val & APIC_LVT_MASKED) )
pt_may_unmask_irq(NULL, &vlapic->pt);
+ if ( offset == APIC_LVTPC )
+ vpmu_lvtpc_update(val);
break;
case APIC_TMICT:
@@ -806,8 +784,7 @@ static int vlapic_reg_write(struct vcpu *v,
break;
}
- period = ((uint64_t)APIC_BUS_CYCLE_NS *
- (uint32_t)val * vlapic->hw.timer_divisor);
+ period = (uint64_t)APIC_BUS_CYCLE_NS * val * vlapic->hw.timer_divisor;
TRACE_2_LONG_3D(TRC_HVM_EMUL_LAPIC_START_TIMER, TRC_PAR_LONG(period),
TRC_PAR_LONG(vlapic_lvtt_period(vlapic) ? period : 0LL),
vlapic->pt.irq);
@@ -820,7 +797,7 @@ static int vlapic_reg_write(struct vcpu *v,
HVM_DBG_LOG(DBG_LEVEL_VLAPIC,
"bus cycle is %uns, "
- "initial count %lu, period %"PRIu64"ns",
+ "initial count %u, period %"PRIu64"ns",
APIC_BUS_CYCLE_NS, val, period);
}
break;
@@ -841,62 +818,56 @@ static int vlapic_reg_write(struct vcpu *v,
}
static int vlapic_write(struct vcpu *v, unsigned long address,
- unsigned long len, unsigned long val)
+ unsigned int len, unsigned long val)
{
struct vlapic *vlapic = vcpu_vlapic(v);
unsigned int offset = address - vlapic_base_address(vlapic);
int rc = X86EMUL_OKAY;
- if ( offset != 0xb0 )
+ if ( offset != APIC_EOI )
HVM_DBG_LOG(DBG_LEVEL_VLAPIC,
- "offset %#x with length %#lx, and value is %#lx",
+ "offset %#x with length %#x, and value is %#lx",
offset, len, val);
/*
* According to the IA32 Manual, all accesses should be 32 bits.
* Some OSes do 8- or 16-byte accesses, however.
*/
- val = (uint32_t)val;
- if ( len != 4 )
+ if ( unlikely(len != 4) )
{
- unsigned int tmp;
- unsigned char alignment;
-
- gdprintk(XENLOG_INFO, "Notice: Local APIC write with len = %lx\n",len);
-
- alignment = offset & 0x3;
- (void)vlapic_read_aligned(vlapic, offset & ~0x3, &tmp);
+ unsigned int tmp = vlapic_read_aligned(vlapic, offset & ~3);
+ unsigned char alignment = (offset & 3) * 8;
switch ( len )
{
case 1:
- val = ((tmp & ~(0xff << (8*alignment))) |
- ((val & 0xff) << (8*alignment)));
+ val = ((tmp & ~(0xffU << alignment)) |
+ ((val & 0xff) << alignment));
break;
case 2:
if ( alignment & 1 )
goto unaligned_exit_and_crash;
- val = ((tmp & ~(0xffff << (8*alignment))) |
- ((val & 0xffff) << (8*alignment)));
+ val = ((tmp & ~(0xffffU << alignment)) |
+ ((val & 0xffff) << alignment));
break;
default:
- gdprintk(XENLOG_ERR, "Local APIC write with len = %lx, "
- "should be 4 instead\n", len);
+ gprintk(XENLOG_ERR, "LAPIC write with len %u\n", len);
goto exit_and_crash;
}
+
+ gdprintk(XENLOG_INFO, "Notice: LAPIC write with len %u\n", len);
+ offset &= ~3;
}
- else if ( (offset & 0x3) != 0 )
+ else if ( unlikely(offset & 3) )
goto unaligned_exit_and_crash;
- offset &= ~0x3;
-
return vlapic_reg_write(v, offset, val);
unaligned_exit_and_crash:
- gdprintk(XENLOG_ERR, "Unaligned LAPIC write len=%#lx at offset=%#x.\n",
- len, offset);
+ gprintk(XENLOG_ERR, "Unaligned LAPIC write: len=%u offset=%#x.\n",
+ len, offset);
exit_and_crash:
domain_crash(v->domain);
return rc;
@@ -992,7 +963,7 @@ int hvm_x2apic_msr_write(struct vcpu *v, unsigned int msr, uint64_t msr_content)
return X86EMUL_UNHANDLEABLE;
}
- return vlapic_reg_write(v, offset, (uint32_t)msr_content);
+ return vlapic_reg_write(v, offset, msr_content);
}
static int vlapic_range(struct vcpu *v, unsigned long addr)
@@ -1005,10 +976,10 @@ static int vlapic_range(struct vcpu *v, unsigned long addr)
(offset < PAGE_SIZE);
}
-const struct hvm_mmio_handler vlapic_mmio_handler = {
- .check_handler = vlapic_range,
- .read_handler = vlapic_read,
- .write_handler = vlapic_write
+static const struct hvm_mmio_ops vlapic_mmio_ops = {
+ .check = vlapic_range,
+ .read = vlapic_read,
+ .write = vlapic_write
};
static void set_x2apic_id(struct vlapic *vlapic)
@@ -1425,7 +1396,6 @@ HVM_REGISTER_SAVE_RESTORE(LAPIC_REGS, lapic_save_regs, lapic_load_regs,
int vlapic_init(struct vcpu *v)
{
struct vlapic *vlapic = vcpu_vlapic(v);
- unsigned int memflags = MEMF_node(vcpu_to_node(v));
HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "%d", v->vcpu_id);
@@ -1439,7 +1409,7 @@ int vlapic_init(struct vcpu *v)
if (vlapic->regs_page == NULL)
{
- vlapic->regs_page = alloc_domheap_page(NULL, memflags);
+ vlapic->regs_page = alloc_domheap_page(v->domain, MEMF_no_owner);
if ( vlapic->regs_page == NULL )
{
dprintk(XENLOG_ERR, "alloc vlapic regs error: %d/%d\n",
@@ -1472,6 +1442,9 @@ int vlapic_init(struct vcpu *v)
vlapic_init_sipi_action,
(unsigned long)v);
+ if ( v->vcpu_id == 0 )
+ register_mmio_handler(v->domain, &vlapic_mmio_ops);
+
return 0;
}
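The hvm_x2apic_msr_read() rewrite replaces the long switch with a bitmap: each LAPIC register's index is its MMIO offset shifted right by 4 (matching the x2APIC MSR numbering), REG() sets one bit per register, and REGBLOCK() sets eight consecutive bits for the 256-vector ISR/TMR/IRR arrays. A cut-down standalone version of the same trick (register offsets abbreviated; a single 64-bit word suffices here):

    #include <stdint.h>
    #include <stdio.h>

    /* A few LAPIC register offsets; the bitmap index is offset >> 4. */
    #define APIC_ID      0x20
    #define APIC_TASKPRI 0x80
    #define APIC_ISR     0x100
    #define APIC_ICR     0x300
    #define NR_VECTORS   256

    #define REG(x)      (1ULL << ((x) >> 4))
    #define REGBLOCK(x) (((1ULL << (NR_VECTORS / 32)) - 1) << ((x) >> 4))

    int main(void)
    {
        /* One 64-bit word is enough for this subset: the highest index
         * used is APIC_ICR >> 4 == 0x30. */
        uint64_t readable = REG(APIC_ID) | REG(APIC_TASKPRI) |
                            REG(APIC_ICR) |
                            REGBLOCK(APIC_ISR);  /* 8 ISR words */

        unsigned int reg = APIC_ISR >> 4;        /* ISR word 0 */
        printf("ISR readable:  %d\n", !!(readable & (1ULL << reg)));

        reg = 0x90 >> 4;                         /* APR: not readable */
        printf("0x90 readable: %d\n", !!(readable & (1ULL << reg)));
        return 0;
    }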
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index ab4ac50..ac838a9 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -18,8 +18,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Support for virtual MSI logic
* Will be merged with virtual IOAPIC logic, since most of it is the same
@@ -153,12 +152,15 @@ struct msixtbl_entry
/* TODO: resolve the potential race by destruction of pdev */
struct pci_dev *pdev;
unsigned long gtable; /* gpa of msix table */
- unsigned long table_len;
- unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)];
+ DECLARE_BITMAP(table_flags, MAX_MSIX_TABLE_ENTRIES);
#define MAX_MSIX_ACC_ENTRIES 3
+ unsigned int table_len;
struct {
uint32_t msi_ad[3]; /* Shadow of address low, high and data */
} gentries[MAX_MSIX_ACC_ENTRIES];
+ DECLARE_BITMAP(acc_valid, 3 * MAX_MSIX_ACC_ENTRIES);
+#define acc_bit(what, ent, slot, idx) \
+ what##_bit((slot) * 3 + (idx), (ent)->acc_valid)
struct rcu_head rcu;
};
@@ -178,49 +180,35 @@ static struct msixtbl_entry *msixtbl_find_entry(
return NULL;
}
-static struct msi_desc *virt_to_msi_desc(struct pci_dev *dev, void *virt)
+static struct msi_desc *msixtbl_addr_to_desc(
+ const struct msixtbl_entry *entry, unsigned long addr)
{
+ unsigned int nr_entry;
struct msi_desc *desc;
- list_for_each_entry( desc, &dev->msi_list, list )
- if ( desc->msi_attrib.type == PCI_CAP_ID_MSIX &&
- virt >= desc->mask_base &&
- virt < desc->mask_base + PCI_MSIX_ENTRY_SIZE )
- return desc;
-
- return NULL;
-}
-
-static void __iomem *msixtbl_addr_to_virt(
- struct msixtbl_entry *entry, unsigned long addr)
-{
- unsigned int idx, nr_page;
-
if ( !entry || !entry->pdev )
return NULL;
- nr_page = (addr >> PAGE_SHIFT) -
- (entry->gtable >> PAGE_SHIFT);
+ nr_entry = (addr - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
- idx = entry->pdev->msix->table_idx[nr_page];
- if ( !idx )
- return NULL;
+ list_for_each_entry( desc, &entry->pdev->msi_list, list )
+ if ( desc->msi_attrib.type == PCI_CAP_ID_MSIX &&
+ desc->msi_attrib.entry_nr == nr_entry )
+ return desc;
- return (void *)(fix_to_virt(idx) +
- (addr & ((1UL << PAGE_SHIFT) - 1)));
+ return NULL;
}
static int msixtbl_read(
struct vcpu *v, unsigned long address,
- unsigned long len, unsigned long *pval)
+ unsigned int len, unsigned long *pval)
{
unsigned long offset;
struct msixtbl_entry *entry;
- void *virt;
unsigned int nr_entry, index;
int r = X86EMUL_UNHANDLEABLE;
- if ( len != 4 || (address & 3) )
+ if ( (len != 4 && len != 8) || (address & (len - 1)) )
return r;
rcu_read_lock(&msixtbl_rcu_lock);
@@ -233,17 +221,33 @@ static int msixtbl_read(
if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
{
nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
- if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
- goto out;
index = offset / sizeof(uint32_t);
+ if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ||
+ !acc_bit(test, entry, nr_entry, index) )
+ goto out;
*pval = entry->gentries[nr_entry].msi_ad[index];
+ if ( len == 8 )
+ {
+ if ( index )
+ offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+ else if ( acc_bit(test, entry, nr_entry, 1) )
+ *pval |= (u64)entry->gentries[nr_entry].msi_ad[1] << 32;
+ else
+ goto out;
+ }
}
- else
+ if ( offset == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
{
- virt = msixtbl_addr_to_virt(entry, address);
- if ( !virt )
+ const struct msi_desc *msi_desc = msixtbl_addr_to_desc(entry, address);
+
+ if ( !msi_desc )
goto out;
- *pval = readl(virt);
+ if ( len == 4 )
+ *pval = MASK_INSR(msi_desc->msi_attrib.guest_masked,
+ PCI_MSIX_VECTOR_BITMASK);
+ else
+ *pval |= (u64)MASK_INSR(msi_desc->msi_attrib.guest_masked,
+ PCI_MSIX_VECTOR_BITMASK) << 32;
}
r = X86EMUL_OKAY;
@@ -253,18 +257,17 @@ out:
}
static int msixtbl_write(struct vcpu *v, unsigned long address,
- unsigned long len, unsigned long val)
+ unsigned int len, unsigned long val)
{
unsigned long offset;
struct msixtbl_entry *entry;
const struct msi_desc *msi_desc;
- void *virt;
unsigned int nr_entry, index;
int r = X86EMUL_UNHANDLEABLE;
- unsigned long flags, orig;
+ unsigned long flags;
struct irq_desc *desc;
- if ( len != 4 || (address & 3) )
+ if ( (len != 4 && len != 8) || (address & (len - 1)) )
return r;
rcu_read_lock(&msixtbl_rcu_lock);
@@ -275,30 +278,34 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
- if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
+ if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
{
+ index = offset / sizeof(uint32_t);
if ( nr_entry < MAX_MSIX_ACC_ENTRIES )
{
- index = offset / sizeof(uint32_t);
entry->gentries[nr_entry].msi_ad[index] = val;
+ acc_bit(set, entry, nr_entry, index);
+ if ( len == 8 && !index )
+ {
+ entry->gentries[nr_entry].msi_ad[1] = val >> 32;
+ acc_bit(set, entry, nr_entry, 1);
+ }
}
set_bit(nr_entry, &entry->table_flags);
- goto out;
+ if ( len != 8 || !index )
+ goto out;
+ val >>= 32;
}
- /* exit to device model if address/data has been modified */
- if ( test_and_clear_bit(nr_entry, &entry->table_flags) )
+ /* Exit to device model when unmasking and address/data got modified. */
+ if ( !(val & PCI_MSIX_VECTOR_BITMASK) &&
+ test_and_clear_bit(nr_entry, &entry->table_flags) )
{
- if ( !(val & PCI_MSIX_VECTOR_BITMASK) )
- v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address;
+ v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address;
goto out;
}
- virt = msixtbl_addr_to_virt(entry, address);
- if ( !virt )
- goto out;
-
- msi_desc = virt_to_msi_desc(entry->pdev, virt);
+ msi_desc = msixtbl_addr_to_desc(entry, address);
if ( !msi_desc || msi_desc->irq < 0 )
goto out;
@@ -313,41 +320,12 @@ static int msixtbl_write(struct vcpu *v, unsigned long address,
ASSERT(msi_desc == desc->msi_desc);
- orig = readl(virt);
-
- /*
- * Do not allow guest to modify MSI-X control bit if it is masked
- * by Xen. We'll only handle the case where Xen thinks that
- * bit is unmasked, but hardware has silently masked the bit
- * (in case of SR-IOV VF reset, etc). On the other hand, if Xen
- * thinks that the bit is masked, but it's really not,
- * we log a warning.
- */
- if ( msi_desc->msi_attrib.masked )
- {
- if ( !(orig & PCI_MSIX_VECTOR_BITMASK) )
- printk(XENLOG_WARNING "MSI-X control bit is unmasked when"
- " it is expected to be masked [%04x:%02x:%02x.%u]\n",
- entry->pdev->seg, entry->pdev->bus,
- PCI_SLOT(entry->pdev->devfn),
- PCI_FUNC(entry->pdev->devfn));
-
- goto unlock;
- }
-
- /*
- * The mask bit is the only defined bit in the word. But we
- * ought to preserve the reserved bits. Clearing the reserved
- * bits can result in undefined behaviour (see PCI Local Bus
- * Specification revision 2.3).
- */
- val &= PCI_MSIX_VECTOR_BITMASK;
- val |= (orig & ~PCI_MSIX_VECTOR_BITMASK);
- writel(val, virt);
+ guest_mask_msi_irq(desc, !!(val & PCI_MSIX_VECTOR_BITMASK));
unlock:
spin_unlock_irqrestore(&desc->lock, flags);
- r = X86EMUL_OKAY;
+ if ( len == 4 )
+ r = X86EMUL_OKAY;
out:
rcu_read_unlock(&msixtbl_rcu_lock);
@@ -356,23 +334,19 @@ out:
static int msixtbl_range(struct vcpu *v, unsigned long addr)
{
- struct msixtbl_entry *entry;
- void *virt;
+ const struct msi_desc *desc;
rcu_read_lock(&msixtbl_rcu_lock);
-
- entry = msixtbl_find_entry(v, addr);
- virt = msixtbl_addr_to_virt(entry, addr);
-
+ desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr);
rcu_read_unlock(&msixtbl_rcu_lock);
- return !!virt;
+ return !!desc;
}
-const struct hvm_mmio_handler msixtbl_mmio_handler = {
- .check_handler = msixtbl_range,
- .read_handler = msixtbl_read,
- .write_handler = msixtbl_write
+static const struct hvm_mmio_ops msixtbl_mmio_ops = {
+ .check = msixtbl_range,
+ .read = msixtbl_read,
+ .write = msixtbl_write
};
static void add_msixtbl_entry(struct domain *d,
@@ -380,16 +354,11 @@ static void add_msixtbl_entry(struct domain *d,
uint64_t gtable,
struct msixtbl_entry *entry)
{
- u32 len;
-
- memset(entry, 0, sizeof(struct msixtbl_entry));
-
INIT_LIST_HEAD(&entry->list);
INIT_RCU_HEAD(&entry->rcu);
atomic_set(&entry->refcnt, 0);
- len = pci_msix_get_table_len(pdev);
- entry->table_len = len;
+ entry->table_len = pci_msix_get_table_len(pdev);
entry->pdev = pdev;
entry->gtable = (unsigned long) gtable;
@@ -426,7 +395,7 @@ int msixtbl_pt_register(struct domain *d, struct pirq *pirq, uint64_t gtable)
* xmalloc() with irq_disabled causes the failure of check_lock()
* for xenpool->lock. So we allocate an entry beforehand.
*/
- new_entry = xmalloc(struct msixtbl_entry);
+ new_entry = xzalloc(struct msixtbl_entry);
if ( !new_entry )
return -ENOMEM;
@@ -511,6 +480,14 @@ found:
spin_unlock_irq(&irq_desc->lock);
}
+void msixtbl_init(struct domain *d)
+{
+ INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list);
+ spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock);
+
+ register_mmio_handler(d, &msixtbl_mmio_ops);
+}
+
void msixtbl_pt_cleanup(struct domain *d)
{
struct msixtbl_entry *entry, *temp;
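The reworked msixtbl_entry shadows the three dwords of each accelerated MSI-X entry (address low/high, data) and tracks per-dword validity in acc_valid, so reads are served from the shadow only once the guest has actually written that dword; acc_bit(what, ent, slot, idx) simply expands to what##_bit(slot * 3 + idx, ...). A toy user-space model of that bookkeeping, using plain bit operations in place of Xen's set_bit/test_bit:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define MAX_MSIX_ACC_ENTRIES 3

    /* Cut-down shadow of the accelerated entries: three dwords per
     * entry plus one validity bit per dword (9 bits used in total). */
    struct msix_shadow {
        uint32_t msi_ad[MAX_MSIX_ACC_ENTRIES][3];
        uint32_t acc_valid;
    };

    #define acc_idx(slot, idx)  ((slot) * 3 + (idx))

    static void shadow_write(struct msix_shadow *s, unsigned int slot,
                             unsigned int idx, uint32_t val)
    {
        s->msi_ad[slot][idx] = val;
        s->acc_valid |= 1u << acc_idx(slot, idx); /* acc_bit(set, ...) */
    }

    static int shadow_read(const struct msix_shadow *s, unsigned int slot,
                           unsigned int idx, uint32_t *val)
    {
        if ( !(s->acc_valid & (1u << acc_idx(slot, idx))) )
            return 0;                             /* never written */
        *val = s->msi_ad[slot][idx];
        return 1;
    }

    int main(void)
    {
        struct msix_shadow s;
        uint32_t val;

        memset(&s, 0, sizeof(s));
        shadow_write(&s, 1, 0, 0xfee00000);       /* entry 1, addr low */

        printf("entry 1 addr lo: %s\n",
               shadow_read(&s, 1, 0, &val) ? "valid" : "invalid");
        printf("entry 1 data:    %s\n",
               shadow_read(&s, 1, 2, &val) ? "valid" : "invalid");
        return 0;
    }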
diff --git a/xen/arch/x86/hvm/vmx/Makefile b/xen/arch/x86/hvm/vmx/Makefile
index 373b3d9..04a29ce 100644
--- a/xen/arch/x86/hvm/vmx/Makefile
+++ b/xen/arch/x86/hvm/vmx/Makefile
@@ -3,5 +3,4 @@ obj-y += intr.o
obj-y += realmode.o
obj-y += vmcs.o
obj-y += vmx.o
-obj-y += vpmu_core2.o
obj-y += vvmx.o
diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S
index 664ed83..2a4ed57 100644
--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c
index 8507432..4189f89 100644
--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/arch/x86/hvm/vmx/realmode.c b/xen/arch/x86/hvm/vmx/realmode.c
index fe8b4a0..e83a61f 100644
--- a/xen/arch/x86/hvm/vmx/realmode.c
+++ b/xen/arch/x86/hvm/vmx/realmode.c
@@ -101,15 +101,19 @@ static void realmode_deliver_exception(
}
}
-static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
+void vmx_realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
{
struct vcpu *curr = current;
+ struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
int rc;
perfc_incr(realmode_emulations);
rc = hvm_emulate_one(hvmemul_ctxt);
+ if ( hvm_vcpu_io_need_completion(vio) || vio->mmio_retry )
+ vio->io_completion = HVMIO_realmode_completion;
+
if ( rc == X86EMUL_UNHANDLEABLE )
{
gdprintk(XENLOG_ERR, "Failed to emulate insn.\n");
@@ -177,9 +181,6 @@ void vmx_realmode(struct cpu_user_regs *regs)
hvm_emulate_prepare(&hvmemul_ctxt, regs);
- if ( vio->io_state == HVMIO_completed )
- realmode_emulate_one(&hvmemul_ctxt);
-
/* Only deliver interrupts into emulated real mode. */
if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) &&
(intr_info & INTR_INFO_VALID_MASK) )
@@ -190,8 +191,7 @@ void vmx_realmode(struct cpu_user_regs *regs)
curr->arch.hvm_vmx.vmx_emulate = 1;
while ( curr->arch.hvm_vmx.vmx_emulate &&
- !softirq_pending(smp_processor_id()) &&
- (vio->io_state == HVMIO_none) )
+ !softirq_pending(smp_processor_id()) )
{
/*
* Check for pending interrupts only every 16 instructions, because
@@ -203,7 +203,10 @@ void vmx_realmode(struct cpu_user_regs *regs)
hvm_local_events_need_delivery(curr) )
break;
- realmode_emulate_one(&hvmemul_ctxt);
+ vmx_realmode_emulate_one(&hvmemul_ctxt);
+
+ if ( vio->io_req.state != STATE_IOREQ_NONE || vio->mmio_retry )
+ break;
/* Stop emulating unless our segment state is not safe */
if ( curr->arch.hvm_vmx.vmx_realmode )
@@ -216,7 +219,7 @@ void vmx_realmode(struct cpu_user_regs *regs)
}
/* Need to emulate next time if we've started an IO operation */
- if ( vio->io_state != HVMIO_none )
+ if ( vio->io_req.state != STATE_IOREQ_NONE )
curr->arch.hvm_vmx.vmx_emulate = 1;
if ( !curr->arch.hvm_vmx.vmx_emulate && !curr->arch.hvm_vmx.vmx_realmode )
@@ -245,3 +248,13 @@ void vmx_realmode(struct cpu_user_regs *regs)
if ( intr_info & INTR_INFO_VALID_MASK )
__vmwrite(VM_ENTRY_INTR_INFO, intr_info);
}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 9d8033e..62e405f 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -25,7 +24,7 @@
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/keyhandler.h>
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
#include <asm/current.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
@@ -64,6 +63,40 @@ integer_param("ple_gap", ple_gap);
static unsigned int __read_mostly ple_window = 4096;
integer_param("ple_window", ple_window);
+static bool_t __read_mostly opt_pml_enabled = 0;
+static s8 __read_mostly opt_ept_ad = -1;
+
+/*
+ * The 'ept' parameter controls functionalities that depend on, or impact the
+ * EPT mechanism. Optional comma separated value may contain:
+ *
+ * pml Enable PML
+ * ad Use A/D bits
+ */
+static void __init parse_ept_param(char *s)
+{
+ char *ss;
+
+ do {
+ bool_t val = !!strncmp(s, "no-", 3);
+
+ if ( !val )
+ s += 3;
+
+ ss = strchr(s, ',');
+ if ( ss )
+ *ss = '\0';
+
+ if ( !strcmp(s, "pml") )
+ opt_pml_enabled = val;
+ else if ( !strcmp(s, "ad") )
+ opt_ept_ad = val;
+
+ s = ss + 1;
+ } while ( ss );
+}
+custom_param("ept", parse_ept_param);
+
/* Dynamic (run-time adjusted) execution control flags. */
u32 vmx_pin_based_exec_control __read_mostly;
u32 vmx_cpu_based_exec_control __read_mostly;
@@ -71,6 +104,8 @@ u32 vmx_secondary_exec_control __read_mostly;
u32 vmx_vmexit_control __read_mostly;
u32 vmx_vmentry_control __read_mostly;
u64 vmx_ept_vpid_cap __read_mostly;
+u64 vmx_vmfunc __read_mostly;
+bool_t vmx_virt_exception __read_mostly;
const u32 vmx_introspection_force_enabled_msrs[] = {
MSR_IA32_SYSENTER_EIP,
@@ -110,6 +145,9 @@ static void __init vmx_display_features(void)
P(cpu_has_vmx_virtual_intr_delivery, "Virtual Interrupt Delivery");
P(cpu_has_vmx_posted_intr_processing, "Posted Interrupt Processing");
P(cpu_has_vmx_vmcs_shadowing, "VMCS shadowing");
+ P(cpu_has_vmx_vmfunc, "VM Functions");
+ P(cpu_has_vmx_virt_exceptions, "Virtualisation Exceptions");
+ P(cpu_has_vmx_pml, "Page Modification Logging");
#undef P
if ( !printed )
@@ -154,6 +192,7 @@ static int vmx_init_vmcs_config(void)
u64 _vmx_misc_cap = 0;
u32 _vmx_vmexit_control;
u32 _vmx_vmentry_control;
+ u64 _vmx_vmfunc = 0;
bool_t mismatch = 0;
rdmsr(MSR_IA32_VMX_BASIC, vmx_basic_msr_low, vmx_basic_msr_high);
@@ -199,7 +238,9 @@ static int vmx_init_vmcs_config(void)
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
- SECONDARY_EXEC_ENABLE_INVPCID);
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
+ SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS);
rdmsrl(MSR_IA32_VMX_MISC, _vmx_misc_cap);
if ( _vmx_misc_cap & VMX_MISC_VMWRITE_ALL )
opt |= SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
@@ -207,6 +248,8 @@ static int vmx_init_vmcs_config(void)
opt |= SECONDARY_EXEC_ENABLE_VPID;
if ( opt_unrestricted_guest_enabled )
opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if ( opt_pml_enabled )
+ opt |= SECONDARY_EXEC_ENABLE_PML;
/*
* "APIC Register Virtualization" and "Virtual Interrupt Delivery"
@@ -229,6 +272,13 @@ static int vmx_init_vmcs_config(void)
{
rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, _vmx_ept_vpid_cap);
+ if ( !opt_ept_ad )
+ _vmx_ept_vpid_cap &= ~VMX_EPT_AD_BIT;
+ else if ( /* Work around Erratum AVR41 on Avoton processors. */
+ boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x4d &&
+ opt_ept_ad < 0 )
+ _vmx_ept_vpid_cap &= ~VMX_EPT_AD_BIT;
+
/*
* Additional sanity checking before using EPT:
* 1) the CPU we are running on must support EPT WB, as we will set
@@ -253,6 +303,10 @@ static int vmx_init_vmcs_config(void)
*/
if ( !(_vmx_ept_vpid_cap & VMX_VPID_INVVPID_ALL_CONTEXT) )
_vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+
+ /* EPT A/D bits is required for PML */
+ if ( !(_vmx_ept_vpid_cap & VMX_EPT_AD_BIT) )
+ _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
}
if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
@@ -273,6 +327,14 @@ static int vmx_init_vmcs_config(void)
SECONDARY_EXEC_UNRESTRICTED_GUEST);
}
+ /* PML cannot be supported if EPT is not used */
+ if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) )
+ _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* Turn off opt_pml_enabled if PML feature is not present */
+ if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML) )
+ opt_pml_enabled = 0;
+
if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) &&
ple_gap == 0 )
{
@@ -296,6 +358,24 @@ static int vmx_init_vmcs_config(void)
|| !(_vmx_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT) )
_vmx_pin_based_exec_control &= ~ PIN_BASED_POSTED_INTERRUPT;
+ /* The IA32_VMX_VMFUNC MSR exists only when VMFUNC is available */
+ if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VM_FUNCTIONS )
+ {
+ rdmsrl(MSR_IA32_VMX_VMFUNC, _vmx_vmfunc);
+
+ /*
+ * VMFUNC leaf 0 (EPTP switching) must be supported.
+ *
+ * Or we just don't use VMFUNC.
+ */
+ if ( !(_vmx_vmfunc & VMX_VMFUNC_EPTP_SWITCHING) )
+ _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VM_FUNCTIONS;
+ }
+
+ /* Virtualization exceptions are only enabled if VMFUNC is enabled */
+ if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VM_FUNCTIONS) )
+ _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS;
+
min = 0;
opt = VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_BNDCFGS;
_vmx_vmentry_control = adjust_vmx_controls(
@@ -316,6 +396,9 @@ static int vmx_init_vmcs_config(void)
vmx_vmentry_control = _vmx_vmentry_control;
vmx_basic_msr = ((u64)vmx_basic_msr_high << 32) |
vmx_basic_msr_low;
+ vmx_vmfunc = _vmx_vmfunc;
+ vmx_virt_exception = !!(_vmx_secondary_exec_control &
+ SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS);
vmx_display_features();
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
@@ -352,6 +435,9 @@ static int vmx_init_vmcs_config(void)
mismatch |= cap_check(
"EPT and VPID Capability",
vmx_ept_vpid_cap, _vmx_ept_vpid_cap);
+ mismatch |= cap_check(
+ "VMFUNC Capability",
+ vmx_vmfunc, _vmx_vmfunc);
if ( cpu_has_vmx_ins_outs_instr_info !=
!!(vmx_basic_msr_high & (VMX_BASIC_INS_OUT_INFO >> 32)) )
{
@@ -714,8 +800,9 @@ void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type)
if ( msr_bitmap == NULL )
return;
- if ( unlikely(d->arch.hvm_domain.introspection_enabled) &&
- mem_event_check_ring(&d->mem_event->access) )
+ if ( unlikely(d->arch.monitor.mov_to_msr_enabled &&
+ d->arch.monitor.mov_to_msr_extended) &&
+ vm_event_check_ring(&d->vm_event->monitor) )
{
unsigned int i;
@@ -921,6 +1008,11 @@ static int construct_vmcs(struct vcpu *v)
/* Do not enable Monitor Trap Flag unless start single step debug */
v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
+ /* Disable VMFUNC and #VE for now: they may be enabled later by altp2m. */
+ v->arch.hvm_vmx.secondary_exec_control &=
+ ~(SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
+ SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS);
+
if ( is_pvh_domain(d) )
{
/* Disable virtual apics, TPR */
@@ -930,6 +1022,10 @@ static int construct_vmcs(struct vcpu *v)
| SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW;
+ /* In turn, disable posted interrupts. */
+ __vmwrite(PIN_BASED_VM_EXEC_CONTROL,
+ vmx_pin_based_exec_control & ~PIN_BASED_POSTED_INTERRUPT);
+
/* Unrestricted guest (real mode for EPT) */
v->arch.hvm_vmx.secondary_exec_control &=
~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -986,8 +1082,8 @@ static int construct_vmcs(struct vcpu *v)
}
/* I/O access bitmap. */
- __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0));
- __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE));
+ __vmwrite(IO_BITMAP_A, __pa(d->arch.hvm_domain.io_bitmap));
+ __vmwrite(IO_BITMAP_B, __pa(d->arch.hvm_domain.io_bitmap) + PAGE_SIZE);
if ( cpu_has_vmx_virtual_intr_delivery )
{
@@ -1008,6 +1104,9 @@ static int construct_vmcs(struct vcpu *v)
__vmwrite(POSTED_INTR_NOTIFICATION_VECTOR, posted_intr_vector);
}
+ /* Disable PML anyway here as it will only be enabled in log dirty mode */
+ v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
/* Host data selectors. */
__vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
__vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
@@ -1154,7 +1253,7 @@ static int construct_vmcs(struct vcpu *v)
vmx_vmcs_exit(v);
/* PVH: paging mode is updated by arch_set_info_guest(). */
- if ( is_hvm_vcpu(v) )
+ if ( is_hvm_domain(d) )
{
/* will update HOST & GUEST_CR3 as reqd */
paging_update_paging_modes(v);
@@ -1201,64 +1300,62 @@ int vmx_write_guest_msr(u32 msr, u64 val)
return -ESRCH;
}
-int vmx_add_guest_msr(u32 msr)
+int vmx_add_msr(u32 msr, int type)
{
struct vcpu *curr = current;
- unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
- struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
+ unsigned int idx, *msr_count;
+ struct vmx_msr_entry **msr_area, *msr_area_elem;
- if ( msr_area == NULL )
+ if ( type == VMX_GUEST_MSR )
{
- if ( (msr_area = alloc_xenheap_page()) == NULL )
+ msr_count = &curr->arch.hvm_vmx.msr_count;
+ msr_area = &curr->arch.hvm_vmx.msr_area;
+ }
+ else
+ {
+ ASSERT(type == VMX_HOST_MSR);
+ msr_count = &curr->arch.hvm_vmx.host_msr_count;
+ msr_area = &curr->arch.hvm_vmx.host_msr_area;
+ }
+
+ if ( *msr_area == NULL )
+ {
+ if ( (*msr_area = alloc_xenheap_page()) == NULL )
return -ENOMEM;
- curr->arch.hvm_vmx.msr_area = msr_area;
- __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
- __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
+
+ if ( type == VMX_GUEST_MSR )
+ {
+ __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area));
+ __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
+ }
+ else
+ __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
}
- for ( i = 0; i < msr_count; i++ )
- if ( msr_area[i].index == msr )
+ for ( idx = 0; idx < *msr_count; idx++ )
+ if ( (*msr_area)[idx].index == msr )
return 0;
- if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
+ if ( *msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
return -ENOSPC;
- msr_area[msr_count].index = msr;
- msr_area[msr_count].mbz = 0;
- msr_area[msr_count].data = 0;
- curr->arch.hvm_vmx.msr_count = ++msr_count;
- __vmwrite(VM_EXIT_MSR_STORE_COUNT, msr_count);
- __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, msr_count);
-
- return 0;
-}
+ msr_area_elem = *msr_area + *msr_count;
+ msr_area_elem->index = msr;
+ msr_area_elem->mbz = 0;
-int vmx_add_host_load_msr(u32 msr)
-{
- struct vcpu *curr = current;
- unsigned int i, msr_count = curr->arch.hvm_vmx.host_msr_count;
- struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.host_msr_area;
+ ++*msr_count;
- if ( msr_area == NULL )
+ if ( type == VMX_GUEST_MSR )
{
- if ( (msr_area = alloc_xenheap_page()) == NULL )
- return -ENOMEM;
- curr->arch.hvm_vmx.host_msr_area = msr_area;
- __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
+ msr_area_elem->data = 0;
+ __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count);
+ __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count);
+ }
+ else
+ {
+ rdmsrl(msr, msr_area_elem->data);
+ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
}
-
- for ( i = 0; i < msr_count; i++ )
- if ( msr_area[i].index == msr )
- return 0;
-
- if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
- return -ENOSPC;
-
- msr_area[msr_count].index = msr;
- msr_area[msr_count].mbz = 0;
- rdmsrl(msr, msr_area[msr_count].data);
- curr->arch.hvm_vmx.host_msr_count = ++msr_count;
- __vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count);
return 0;
}
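The unified helper folds the old vmx_add_guest_msr()/vmx_add_host_load_msr() pair into one entry point, selected by the VMX_GUEST_MSR/VMX_HOST_MSR type. A usage sketch with a hypothetical caller (MSR_STAR is just a stand-in):

    /* Sketch only: hypothetical caller of the unified API. */
    static int example_track_star(void)
    {
        int rc;

        /* Guest list: stored on VM exit, loaded on VM entry. */
        if ( (rc = vmx_add_msr(MSR_STAR, VMX_GUEST_MSR)) != 0 )
            return rc;

        /* Host list: loaded on every VM exit. */
        return vmx_add_msr(MSR_STAR, VMX_HOST_MSR);
    }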
@@ -1277,6 +1374,185 @@ void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector)
&v->arch.hvm_vmx.eoi_exitmap_changed);
}
+bool_t vmx_vcpu_pml_enabled(const struct vcpu *v)
+{
+ return !!(v->arch.hvm_vmx.secondary_exec_control &
+ SECONDARY_EXEC_ENABLE_PML);
+}
+
+int vmx_vcpu_enable_pml(struct vcpu *v)
+{
+ if ( vmx_vcpu_pml_enabled(v) )
+ return 0;
+
+ v->arch.hvm_vmx.pml_pg = v->domain->arch.paging.alloc_page(v->domain);
+ if ( !v->arch.hvm_vmx.pml_pg )
+ return -ENOMEM;
+
+ vmx_vmcs_enter(v);
+
+ __vmwrite(PML_ADDRESS, page_to_mfn(v->arch.hvm_vmx.pml_pg) << PAGE_SHIFT);
+ __vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
+
+ v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_PML;
+
+ __vmwrite(SECONDARY_VM_EXEC_CONTROL,
+ v->arch.hvm_vmx.secondary_exec_control);
+
+ vmx_vmcs_exit(v);
+
+ return 0;
+}
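The initial GUEST_PML_INDEX of NR_PML_ENTRIES - 1 encodes an empty buffer: per the Intel SDM the CPU logs a GPA at the current index and then decrements it, so the log fills from the top of the page downwards. A sketch of the geometry assumed here (the real NR_PML_ENTRIES comes from vmcs.h; 512 matches the SDM):

    /* Sketch: PML geometry - one 4K page of 64-bit guest-physical
     * addresses, written from the highest entry down. */
    #define PML_PAGE_SIZE   4096
    #define NR_PML_ENTRIES  (PML_PAGE_SIZE / sizeof(uint64_t))  /* 512 */
    /* index 511    -> empty
     * index n      -> entries n+1 .. 511 are valid
     * index 0xffff -> full (all 512 entries valid)             */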
+
+void vmx_vcpu_disable_pml(struct vcpu *v)
+{
+ if ( !vmx_vcpu_pml_enabled(v) )
+ return;
+
+ /* Make sure we don't lose any logged GPAs. */
+ vmx_vcpu_flush_pml_buffer(v);
+
+ vmx_vmcs_enter(v);
+
+ v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+ __vmwrite(SECONDARY_VM_EXEC_CONTROL,
+ v->arch.hvm_vmx.secondary_exec_control);
+
+ vmx_vmcs_exit(v);
+
+ v->domain->arch.paging.free_page(v->domain, v->arch.hvm_vmx.pml_pg);
+ v->arch.hvm_vmx.pml_pg = NULL;
+}
+
+void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
+{
+ uint64_t *pml_buf;
+ unsigned long pml_idx;
+
+ ASSERT((v == current) || (!vcpu_runnable(v) && !v->is_running));
+ ASSERT(vmx_vcpu_pml_enabled(v));
+
+ vmx_vmcs_enter(v);
+
+ __vmread(GUEST_PML_INDEX, &pml_idx);
+
+ /* Do nothing if PML buffer is empty. */
+ if ( pml_idx == (NR_PML_ENTRIES - 1) )
+ goto out;
+
+ pml_buf = __map_domain_page(v->arch.hvm_vmx.pml_pg);
+
+ /*
+ * The PML index is either 2^16-1 (buffer full) or in the range
+ * 0 ~ NR_PML_ENTRIES-1 (buffer not full); in the latter case it
+ * points to the next available entry.
+ */
+ if ( pml_idx >= NR_PML_ENTRIES )
+ pml_idx = 0;
+ else
+ pml_idx++;
+
+ for ( ; pml_idx < NR_PML_ENTRIES; pml_idx++ )
+ {
+ unsigned long gfn = pml_buf[pml_idx] >> PAGE_SHIFT;
+
+ /*
+ * We need to change the type of each logged GFN from log-dirty back
+ * to normal memory; hap_track_dirty_vram depends on this to work.
+ * We also mark every logged GFN dirty, as we cannot be sure it is
+ * safe to ignore GFNs on which p2m_change_type_one returns failure.
+ * Such failures are very rare and the additional cost is negligible,
+ * but a missing mark would be extremely difficult to debug.
+ */
+ p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
+ paging_mark_gfn_dirty(v->domain, gfn);
+ }
+
+ unmap_domain_page(pml_buf);
+
+ /* Reset PML index */
+ __vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
+
+ out:
+ vmx_vmcs_exit(v);
+}
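The index normalisation at the top of the loop can be read as a tiny helper (a sketch restating the logic above, not code from the patch):

    /* Sketch: first valid entry given the raw GUEST_PML_INDEX value. */
    static unsigned long first_logged_entry(unsigned long pml_idx)
    {
        /* 0xffff (full): entries 0..511; otherwise entries idx+1..511. */
        return pml_idx >= NR_PML_ENTRIES ? 0 : pml_idx + 1;
    }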
+
+bool_t vmx_domain_pml_enabled(const struct domain *d)
+{
+ return !!(d->arch.hvm_domain.vmx.status & VMX_DOMAIN_PML_ENABLED);
+}
+
+/*
+ * This function enables PML for a particular domain. It should be called
+ * when the domain is paused.
+ *
+ * PML needs to be enabled globally for all vcpus of the domain, as the PML
+ * buffer and PML index are per-vcpu while the EPT table is shared by all
+ * vcpus, so enabling PML on only some of the vcpus won't work.
+ */
+int vmx_domain_enable_pml(struct domain *d)
+{
+ struct vcpu *v;
+ int rc;
+
+ ASSERT(atomic_read(&d->pause_count));
+
+ if ( vmx_domain_pml_enabled(d) )
+ return 0;
+
+ for_each_vcpu( d, v )
+ if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
+ goto error;
+
+ d->arch.hvm_domain.vmx.status |= VMX_DOMAIN_PML_ENABLED;
+
+ return 0;
+
+ error:
+ for_each_vcpu( d, v )
+ if ( vmx_vcpu_pml_enabled(v) )
+ vmx_vcpu_disable_pml(v);
+ return rc;
+}
+
+/*
+ * Disable PML for a particular domain. Called when the domain is paused.
+ *
+ * As with enabling PML for a domain, disabling it must be done for all
+ * vcpus at once.
+ */
+void vmx_domain_disable_pml(struct domain *d)
+{
+ struct vcpu *v;
+
+ ASSERT(atomic_read(&d->pause_count));
+
+ if ( !vmx_domain_pml_enabled(d) )
+ return;
+
+ for_each_vcpu( d, v )
+ vmx_vcpu_disable_pml(v);
+
+ d->arch.hvm_domain.vmx.status &= ~VMX_DOMAIN_PML_ENABLED;
+}
+
+/*
+ * Flush the PML buffers of all vcpus, and propagate the logged dirty pages
+ * to the log-dirty radix tree. Called when the domain is paused.
+ */
+void vmx_domain_flush_pml_buffers(struct domain *d)
+{
+ struct vcpu *v;
+
+ ASSERT(atomic_read(&d->pause_count));
+
+ if ( !vmx_domain_pml_enabled(d) )
+ return;
+
+ for_each_vcpu( d, v )
+ vmx_vcpu_flush_pml_buffer(v);
+}
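All three domain-level helpers assert that the domain is paused. A sketch of the calling convention, using a hypothetical wrapper (domain_pause/domain_unpause are the existing Xen primitives):

    /* Sketch: hypothetical caller honouring the pause requirement. */
    static int example_enable_logdirty_pml(struct domain *d)
    {
        int rc;

        domain_pause(d);                 /* satisfy the ASSERTs above */
        rc = vmx_domain_enable_pml(d);
        domain_unpause(d);

        return rc;
    }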
+
int vmx_create_vmcs(struct vcpu *v)
{
struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
@@ -1375,8 +1651,8 @@ void vmx_do_resume(struct vcpu *v)
}
debug_state = v->domain->debugger_attached
- || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_INT3]
- || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP];
+ || v->domain->arch.monitor.software_breakpoint_enabled
+ || v->domain->arch.monitor.singlestep_enabled;
if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
{
@@ -1395,6 +1671,16 @@ static inline unsigned long vmr(unsigned long field)
return __vmread_safe(field, &val) ? val : 0;
}
+#define vmr16(fld) ({ \
+ BUILD_BUG_ON((fld) & 0x6001); \
+ (uint16_t)vmr(fld); \
+})
+
+#define vmr32(fld) ({ \
+ BUILD_BUG_ON(((fld) & 0x6001) != 0x4000); \
+ (uint32_t)vmr(fld); \
+})
+
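The magic constant 0x6001 selects exactly the width field (bits 14:13) and the access-type bit (bit 0) of a VMCS field encoding, so the BUILD_BUG_ON()s reject wrongly sized fields at compile time. A sketch of the encoding, per the Intel SDM:

    /* Sketch: VMCS field encoding bits checked by vmr16()/vmr32().
     * width 0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural width. */
    #define VMCS_FIELD_WIDTH(fld)  (((fld) >> 13) & 3)
    #define VMCS_FIELD_HIGH(fld)   ((fld) & 1)
    /* vmr16: (fld & 0x6001) == 0      <=> width 0, full access
     * vmr32: (fld & 0x6001) == 0x4000 <=> width 2, full access  */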
static void vmx_dump_sel(char *name, uint32_t selector)
{
uint32_t sel, attr, limit;
@@ -1403,8 +1689,7 @@ static void vmx_dump_sel(char *name, uint32_t selector)
attr = vmr(selector + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR));
limit = vmr(selector + (GUEST_ES_LIMIT - GUEST_ES_SELECTOR));
base = vmr(selector + (GUEST_ES_BASE - GUEST_ES_SELECTOR));
- printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016"PRIx64"\n",
- name, sel, attr, limit, base);
+ printk("%s: %04x %05x %08x %016"PRIx64"\n", name, sel, attr, limit, base);
}
static void vmx_dump_sel2(char *name, uint32_t lim)
@@ -1413,134 +1698,147 @@ static void vmx_dump_sel2(char *name, uint32_t lim)
uint64_t base;
limit = vmr(lim);
base = vmr(lim + (GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
- printk("%s: limit=0x%08x, base=0x%016"PRIx64"\n",
- name, limit, base);
+ printk("%s: %08x %016"PRIx64"\n", name, limit, base);
}
void vmcs_dump_vcpu(struct vcpu *v)
{
struct cpu_user_regs *regs = &v->arch.user_regs;
- unsigned long long x;
+ uint32_t vmentry_ctl, vmexit_ctl;
+ unsigned long cr4;
+ uint64_t efer;
+ unsigned int i, n;
if ( v == current )
regs = guest_cpu_user_regs();
vmx_vmcs_enter(v);
+ vmentry_ctl = vmr32(VM_ENTRY_CONTROLS),
+ vmexit_ctl = vmr32(VM_EXIT_CONTROLS);
+ cr4 = vmr(GUEST_CR4);
+ efer = vmr(GUEST_EFER);
+
printk("*** Guest State ***\n");
- printk("CR0: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n",
- (unsigned long long)vmr(GUEST_CR0),
- (unsigned long long)vmr(CR0_READ_SHADOW),
- (unsigned long long)vmr(CR0_GUEST_HOST_MASK));
- printk("CR4: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n",
- (unsigned long long)vmr(GUEST_CR4),
- (unsigned long long)vmr(CR4_READ_SHADOW),
- (unsigned long long)vmr(CR4_GUEST_HOST_MASK));
- printk("CR3: actual=0x%016llx, target_count=%d\n",
- (unsigned long long)vmr(GUEST_CR3),
- (int)vmr(CR3_TARGET_COUNT));
- printk(" target0=%016llx, target1=%016llx\n",
- (unsigned long long)vmr(CR3_TARGET_VALUE0),
- (unsigned long long)vmr(CR3_TARGET_VALUE1));
- printk(" target2=%016llx, target3=%016llx\n",
- (unsigned long long)vmr(CR3_TARGET_VALUE2),
- (unsigned long long)vmr(CR3_TARGET_VALUE3));
- printk("RSP = 0x%016llx (0x%016llx) RIP = 0x%016llx (0x%016llx)\n",
- (unsigned long long)vmr(GUEST_RSP),
- (unsigned long long)regs->esp,
- (unsigned long long)vmr(GUEST_RIP),
- (unsigned long long)regs->eip);
- printk("RFLAGS=0x%016llx (0x%016llx) DR7 = 0x%016llx\n",
- (unsigned long long)vmr(GUEST_RFLAGS),
- (unsigned long long)regs->eflags,
- (unsigned long long)vmr(GUEST_DR7));
- printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n",
- (unsigned long long)vmr(GUEST_SYSENTER_ESP),
- (int)vmr(GUEST_SYSENTER_CS),
- (unsigned long long)vmr(GUEST_SYSENTER_EIP));
- vmx_dump_sel("CS", GUEST_CS_SELECTOR);
- vmx_dump_sel("DS", GUEST_DS_SELECTOR);
- vmx_dump_sel("SS", GUEST_SS_SELECTOR);
- vmx_dump_sel("ES", GUEST_ES_SELECTOR);
- vmx_dump_sel("FS", GUEST_FS_SELECTOR);
- vmx_dump_sel("GS", GUEST_GS_SELECTOR);
+ printk("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ vmr(GUEST_CR0), vmr(CR0_READ_SHADOW), vmr(CR0_GUEST_HOST_MASK));
+ printk("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ cr4, vmr(CR4_READ_SHADOW), vmr(CR4_GUEST_HOST_MASK));
+ printk("CR3 = 0x%016lx\n", vmr(GUEST_CR3));
+ if ( (v->arch.hvm_vmx.secondary_exec_control &
+ SECONDARY_EXEC_ENABLE_EPT) &&
+ (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA) )
+ {
+ printk("PDPTE0 = 0x%016lx PDPTE1 = 0x%016lx\n",
+ vmr(GUEST_PDPTE(0)), vmr(GUEST_PDPTE(1)));
+ printk("PDPTE2 = 0x%016lx PDPTE3 = 0x%016lx\n",
+ vmr(GUEST_PDPTE(2)), vmr(GUEST_PDPTE(3)));
+ }
+ printk("RSP = 0x%016lx (0x%016lx) RIP = 0x%016lx (0x%016lx)\n",
+ vmr(GUEST_RSP), regs->esp,
+ vmr(GUEST_RIP), regs->eip);
+ printk("RFLAGS=0x%08lx (0x%08lx) DR7 = 0x%016lx\n",
+ vmr(GUEST_RFLAGS), regs->eflags,
+ vmr(GUEST_DR7));
+ printk("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmr(GUEST_SYSENTER_ESP),
+ vmr32(GUEST_SYSENTER_CS), vmr(GUEST_SYSENTER_EIP));
+ printk(" sel attr limit base\n");
+ vmx_dump_sel(" CS", GUEST_CS_SELECTOR);
+ vmx_dump_sel(" DS", GUEST_DS_SELECTOR);
+ vmx_dump_sel(" SS", GUEST_SS_SELECTOR);
+ vmx_dump_sel(" ES", GUEST_ES_SELECTOR);
+ vmx_dump_sel(" FS", GUEST_FS_SELECTOR);
+ vmx_dump_sel(" GS", GUEST_GS_SELECTOR);
vmx_dump_sel2("GDTR", GUEST_GDTR_LIMIT);
vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR);
vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT);
- vmx_dump_sel("TR", GUEST_TR_SELECTOR);
- printk("Guest PAT = 0x%08x%08x\n",
- (uint32_t)vmr(GUEST_PAT_HIGH), (uint32_t)vmr(GUEST_PAT));
- x = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32;
- x |= (uint32_t)vmr(TSC_OFFSET);
- printk("TSC Offset = %016llx\n", x);
- x = (unsigned long long)vmr(GUEST_IA32_DEBUGCTL_HIGH) << 32;
- x |= (uint32_t)vmr(GUEST_IA32_DEBUGCTL);
- printk("DebugCtl=%016llx DebugExceptions=%016llx\n", x,
- (unsigned long long)vmr(GUEST_PENDING_DBG_EXCEPTIONS));
- printk("Interruptibility=%04x ActivityState=%04x\n",
- (int)vmr(GUEST_INTERRUPTIBILITY_INFO),
- (int)vmr(GUEST_ACTIVITY_STATE));
+ vmx_dump_sel(" TR", GUEST_TR_SELECTOR);
+ if ( (vmexit_ctl & (VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_SAVE_GUEST_EFER)) ||
+ (vmentry_ctl & (VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_GUEST_EFER)) )
+ printk("EFER = 0x%016lx PAT = 0x%016lx\n", efer, vmr(GUEST_PAT));
+ printk("PreemptionTimer = 0x%08x SM Base = 0x%08x\n",
+ vmr32(GUEST_PREEMPTION_TIMER), vmr32(GUEST_SMBASE));
+ printk("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n",
+ vmr(GUEST_IA32_DEBUGCTL), vmr(GUEST_PENDING_DBG_EXCEPTIONS));
+ if ( vmentry_ctl & (VM_ENTRY_LOAD_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_BNDCFGS) )
+ printk("PerfGlobCtl = 0x%016lx BndCfgS = 0x%016lx\n",
+ vmr(GUEST_PERF_GLOBAL_CTRL), vmr(GUEST_BNDCFGS));
+ printk("Interruptibility = %08x ActivityState = %08x\n",
+ vmr32(GUEST_INTERRUPTIBILITY_INFO), vmr32(GUEST_ACTIVITY_STATE));
+ if ( v->arch.hvm_vmx.secondary_exec_control &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY )
+ printk("InterruptStatus = %04x\n", vmr16(GUEST_INTR_STATUS));
printk("*** Host State ***\n");
- printk("RSP = 0x%016llx RIP = 0x%016llx\n",
- (unsigned long long)vmr(HOST_RSP),
- (unsigned long long)vmr(HOST_RIP));
- printk("CS=%04x DS=%04x ES=%04x FS=%04x GS=%04x SS=%04x TR=%04x\n",
- (uint16_t)vmr(HOST_CS_SELECTOR),
- (uint16_t)vmr(HOST_DS_SELECTOR),
- (uint16_t)vmr(HOST_ES_SELECTOR),
- (uint16_t)vmr(HOST_FS_SELECTOR),
- (uint16_t)vmr(HOST_GS_SELECTOR),
- (uint16_t)vmr(HOST_SS_SELECTOR),
- (uint16_t)vmr(HOST_TR_SELECTOR));
- printk("FSBase=%016llx GSBase=%016llx TRBase=%016llx\n",
- (unsigned long long)vmr(HOST_FS_BASE),
- (unsigned long long)vmr(HOST_GS_BASE),
- (unsigned long long)vmr(HOST_TR_BASE));
- printk("GDTBase=%016llx IDTBase=%016llx\n",
- (unsigned long long)vmr(HOST_GDTR_BASE),
- (unsigned long long)vmr(HOST_IDTR_BASE));
- printk("CR0=%016llx CR3=%016llx CR4=%016llx\n",
- (unsigned long long)vmr(HOST_CR0),
- (unsigned long long)vmr(HOST_CR3),
- (unsigned long long)vmr(HOST_CR4));
- printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n",
- (unsigned long long)vmr(HOST_SYSENTER_ESP),
- (int)vmr(HOST_SYSENTER_CS),
- (unsigned long long)vmr(HOST_SYSENTER_EIP));
- printk("Host PAT = 0x%08x%08x\n",
- (uint32_t)vmr(HOST_PAT_HIGH), (uint32_t)vmr(HOST_PAT));
+ printk("RIP = 0x%016lx (%ps) RSP = 0x%016lx\n",
+ vmr(HOST_RIP), (void *)vmr(HOST_RIP), vmr(HOST_RSP));
+ printk("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+ vmr16(HOST_CS_SELECTOR), vmr16(HOST_SS_SELECTOR),
+ vmr16(HOST_DS_SELECTOR), vmr16(HOST_ES_SELECTOR),
+ vmr16(HOST_FS_SELECTOR), vmr16(HOST_GS_SELECTOR),
+ vmr16(HOST_TR_SELECTOR));
+ printk("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+ vmr(HOST_FS_BASE), vmr(HOST_GS_BASE), vmr(HOST_TR_BASE));
+ printk("GDTBase=%016lx IDTBase=%016lx\n",
+ vmr(HOST_GDTR_BASE), vmr(HOST_IDTR_BASE));
+ printk("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+ vmr(HOST_CR0), vmr(HOST_CR3), vmr(HOST_CR4));
+ printk("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmr(HOST_SYSENTER_ESP),
+ vmr32(HOST_SYSENTER_CS), vmr(HOST_SYSENTER_EIP));
+ if ( vmexit_ctl & (VM_EXIT_LOAD_HOST_PAT | VM_EXIT_LOAD_HOST_EFER) )
+ printk("EFER = 0x%016lx PAT = 0x%016lx\n", vmr(HOST_EFER), vmr(HOST_PAT));
+ if ( vmexit_ctl & VM_EXIT_LOAD_PERF_GLOBAL_CTRL )
+ printk("PerfGlobCtl = 0x%016lx\n",
+ vmr(HOST_PERF_GLOBAL_CTRL));
printk("*** Control State ***\n");
printk("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
- (uint32_t)vmr(PIN_BASED_VM_EXEC_CONTROL),
- (uint32_t)vmr(CPU_BASED_VM_EXEC_CONTROL),
- (uint32_t)vmr(SECONDARY_VM_EXEC_CONTROL));
- printk("EntryControls=%08x ExitControls=%08x\n",
- (uint32_t)vmr(VM_ENTRY_CONTROLS),
- (uint32_t)vmr(VM_EXIT_CONTROLS));
- printk("ExceptionBitmap=%08x\n",
- (uint32_t)vmr(EXCEPTION_BITMAP));
+ vmr32(PIN_BASED_VM_EXEC_CONTROL),
+ vmr32(CPU_BASED_VM_EXEC_CONTROL),
+ vmr32(SECONDARY_VM_EXEC_CONTROL));
+ printk("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+ printk("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+ vmr32(EXCEPTION_BITMAP),
+ vmr32(PAGE_FAULT_ERROR_CODE_MASK),
+ vmr32(PAGE_FAULT_ERROR_CODE_MATCH));
printk("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
- (uint32_t)vmr(VM_ENTRY_INTR_INFO),
- (uint32_t)vmr(VM_ENTRY_EXCEPTION_ERROR_CODE),
- (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN));
+ vmr32(VM_ENTRY_INTR_INFO),
+ vmr32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+ vmr32(VM_ENTRY_INSTRUCTION_LEN));
printk("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
- (uint32_t)vmr(VM_EXIT_INTR_INFO),
- (uint32_t)vmr(VM_EXIT_INTR_ERROR_CODE),
- (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN));
- printk(" reason=%08x qualification=%08x\n",
- (uint32_t)vmr(VM_EXIT_REASON),
- (uint32_t)vmr(EXIT_QUALIFICATION));
+ vmr32(VM_EXIT_INTR_INFO),
+ vmr32(VM_EXIT_INTR_ERROR_CODE),
+ vmr32(VM_EXIT_INSTRUCTION_LEN));
+ printk(" reason=%08x qualification=%016lx\n",
+ vmr32(VM_EXIT_REASON), vmr(EXIT_QUALIFICATION));
printk("IDTVectoring: info=%08x errcode=%08x\n",
- (uint32_t)vmr(IDT_VECTORING_INFO),
- (uint32_t)vmr(IDT_VECTORING_ERROR_CODE));
- printk("TPR Threshold = 0x%02x\n",
- (uint32_t)vmr(TPR_THRESHOLD));
- printk("EPT pointer = 0x%08x%08x\n",
- (uint32_t)vmr(EPT_POINTER_HIGH), (uint32_t)vmr(EPT_POINTER));
- printk("Virtual processor ID = 0x%04x\n",
- (uint32_t)vmr(VIRTUAL_PROCESSOR_ID));
+ vmr32(IDT_VECTORING_INFO), vmr32(IDT_VECTORING_ERROR_CODE));
+ printk("TSC Offset = 0x%016lx\n", vmr(TSC_OFFSET));
+ if ( (v->arch.hvm_vmx.exec_control & CPU_BASED_TPR_SHADOW) ||
+ (vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT) )
+ printk("TPR Threshold = 0x%02x PostedIntrVec = 0x%02x\n",
+ vmr32(TPR_THRESHOLD), vmr16(POSTED_INTR_NOTIFICATION_VECTOR));
+ if ( (v->arch.hvm_vmx.secondary_exec_control &
+ SECONDARY_EXEC_ENABLE_EPT) )
+ printk("EPT pointer = 0x%016lx EPTP index = 0x%04x\n",
+ vmr(EPT_POINTER), vmr16(EPTP_INDEX));
+ n = vmr32(CR3_TARGET_COUNT);
+ for ( i = 0; i + 1 < n; i += 2 )
+ printk("CR3 target%u=%016lx target%u=%016lx\n",
+ i, vmr(CR3_TARGET_VALUE(i)),
+ i + 1, vmr(CR3_TARGET_VALUE(i + 1)));
+ if ( i < n )
+ printk("CR3 target%u=%016lx\n", i, vmr(CR3_TARGET_VALUE(i)));
+ if ( v->arch.hvm_vmx.secondary_exec_control &
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING )
+ printk("PLE Gap=%08x Window=%08x\n",
+ vmr32(PLE_GAP), vmr32(PLE_WINDOW));
+ if ( v->arch.hvm_vmx.secondary_exec_control &
+ (SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_VM_FUNCTIONS) )
+ printk("Virtual processor ID = 0x%04x VMfunc controls = %016lx\n",
+ vmr16(VIRTUAL_PROCESSOR_ID), vmr(VM_FUNCTION_CONTROL));
vmx_vmcs_exit(v);
}
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 8584f1f..2582cdd 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -32,10 +31,9 @@
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
-#include <asm/types.h>
+#include <asm/guest_access.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
-#include <asm/spinlock.h>
#include <asm/paging.h>
#include <asm/p2m.h>
#include <asm/mem_sharing.h>
@@ -52,11 +50,14 @@
#include <asm/hvm/vpt.h>
#include <public/hvm/save.h>
#include <asm/hvm/trace.h>
+#include <asm/hvm/event.h>
#include <asm/xenoprof.h>
#include <asm/debugger.h>
#include <asm/apic.h>
#include <asm/hvm/nestedhvm.h>
+#include <asm/altp2m.h>
#include <asm/event.h>
+#include <asm/monitor.h>
#include <public/arch-x86/cpuid.h>
static bool_t __initdata opt_force_ept;
@@ -80,6 +81,7 @@ static void vmx_fpu_dirty_intercept(void);
static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content);
static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content);
static void vmx_invlpg_intercept(unsigned long vaddr);
+static int vmx_vmfunc_intercept(struct cpu_user_regs *regs);
uint8_t __read_mostly posted_intr_vector;
@@ -116,7 +118,32 @@ static int vmx_vcpu_initialise(struct vcpu *v)
return rc;
}
- vpmu_initialise(v);
+ /*
+ * It is rare but still possible that the domain is already in log-dirty
+ * mode when this vcpu is created (as noted by Tim). In that case we must
+ * enable PML for this vcpu if PML has been enabled for the domain, and a
+ * failure to enable PML makes creation of the vcpu fail.
+ *
+ * Note that even when no vcpu has been created for the domain yet,
+ * vmx_domain_enable_pml returns success, in which case
+ * vmx_domain_pml_enabled also returns true. Even if this is the first
+ * vcpu created with vmx_domain_pml_enabled true, a failure to enable PML
+ * still fails vcpu creation, to avoid the complicated logic needed to
+ * revert a PML-style EPT table to a non-PML-style one.
+ */
+ if ( vmx_domain_pml_enabled(v->domain) )
+ {
+ if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
+ {
+ dprintk(XENLOG_ERR, "%pv: Failed to enable PML.\n", v);
+ vmx_destroy_vmcs(v);
+ return rc;
+ }
+ }
+
+ /* PVH's VPMU is initialized via hypercall */
+ if ( is_hvm_vcpu(v) )
+ vpmu_initialise(v);
vmx_install_vlapic_mapping(v);
@@ -129,6 +156,14 @@ static int vmx_vcpu_initialise(struct vcpu *v)
static void vmx_vcpu_destroy(struct vcpu *v)
{
+ /*
+ * The domain may still be in log-dirty mode when it is about to be
+ * destroyed (e.g. the user runs 'xl destroy <dom>'), in which case we
+ * must disable PML manually here. Note that vmx_vcpu_destroy is called
+ * before vmx_domain_destroy, so PML has to be disabled for each vcpu
+ * separately here.
+ */
+ vmx_vcpu_disable_pml(v);
vmx_destroy_vmcs(v);
vpmu_destroy(v);
passive_domain_destroy(v);
@@ -152,12 +187,12 @@ void vmx_save_host_msrs(void)
rdmsrl(msr_index[i], host_msr_state->msrs[i]);
}
-#define WRITE_MSR(address) \
+#define WRITE_MSR(address) do { \
guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \
set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \
wrmsrl(MSR_ ## address, msr_content); \
set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \
- break
+ } while ( 0 )
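Wrapping the macro body in do { ... } while ( 0 ) (and dropping the trailing break) turns WRITE_MSR into a single statement, which is why each case below now carries its own explicit break. A self-contained sketch of why the wrapper matters (stmt_a/stmt_b are placeholders):

    /* Sketch: multi-statement macros under if/else. */
    extern void stmt_a(int), stmt_b(int);

    #define NAKED(x)   stmt_a(x); stmt_b(x)               /* two statements */
    #define WRAPPED(x) do { stmt_a(x); stmt_b(x); } while ( 0 )

    void demo(int cond, int v)
    {
        if ( cond )
            WRAPPED(v);  /* fine: one statement */
        else             /* with NAKED(v); above, this 'else' is orphaned */
            stmt_a(v);
    }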
static enum handler_return
long_mode_do_msr_read(unsigned int msr, uint64_t *msr_content)
@@ -232,11 +267,13 @@ long_mode_do_msr_write(unsigned int msr, uint64_t msr_content)
case MSR_STAR:
WRITE_MSR(STAR);
+ break;
case MSR_LSTAR:
if ( !is_canonical_address(msr_content) )
goto uncanonical_address;
WRITE_MSR(LSTAR);
+ break;
case MSR_CSTAR:
if ( !is_canonical_address(msr_content) )
@@ -246,6 +283,7 @@ long_mode_do_msr_write(unsigned int msr, uint64_t msr_content)
case MSR_SYSCALL_MASK:
WRITE_MSR(SYSCALL_MASK);
+ break;
default:
return HNDL_unhandled;
@@ -1126,7 +1164,7 @@ static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
static void vmx_load_pdptrs(struct vcpu *v)
{
unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3];
- uint64_t *guest_pdptrs;
+ uint64_t *guest_pdptes;
struct page_info *page;
p2m_type_t p2mt;
char *p;
@@ -1152,7 +1190,7 @@ static void vmx_load_pdptrs(struct vcpu *v)
p = __map_domain_page(page);
- guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
+ guest_pdptes = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
/*
* We do not check the PDPTRs for validity. The CPU will do this during
@@ -1162,10 +1200,10 @@ static void vmx_load_pdptrs(struct vcpu *v)
vmx_vmcs_enter(v);
- __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
- __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
- __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
- __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
+ __vmwrite(GUEST_PDPTE(0), guest_pdptes[0]);
+ __vmwrite(GUEST_PDPTE(1), guest_pdptes[1]);
+ __vmwrite(GUEST_PDPTE(2), guest_pdptes[2]);
+ __vmwrite(GUEST_PDPTE(3), guest_pdptes[3]);
vmx_vmcs_exit(v);
@@ -1228,7 +1266,8 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
v->arch.hvm_vmx.exec_control |= cr3_ctls;
/* Trap CR3 updates if CR3 memory events are enabled. */
- if ( v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_CR3] )
+ if ( v->domain->arch.monitor.write_ctrlreg_enabled &
+ monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3) )
v->arch.hvm_vmx.exec_control |= CPU_BASED_CR3_LOAD_EXITING;
vmx_update_cpu_exec_control(v);
@@ -1403,7 +1442,9 @@ static void __vmx_inject_exception(int trap, int type, int error_code)
* VM entry]", PRM Vol. 3, 22.6.1 (Interruptibility State).
*/
- intr_fields = (INTR_INFO_VALID_MASK | (type<<8) | trap);
+ intr_fields = INTR_INFO_VALID_MASK |
+ MASK_INSR(type, INTR_INFO_INTR_TYPE_MASK) |
+ MASK_INSR(trap, INTR_INFO_VECTOR_MASK);
if ( error_code != HVM_DELIVER_NO_ERROR_CODE ) {
__vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
intr_fields |= INTR_INFO_DELIVER_CODE_MASK;
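MASK_INSR()/MASK_EXTR() derive the shift amount from the mask itself, replacing the bare '<< 8' shifts throughout this file. A sketch consistent with how they are used here (the authoritative definitions live in Xen's generic headers):

    /* Sketch: shift-by-mask helpers; (m) & -(m) isolates the mask's
     * lowest set bit, i.e. the implied shift. */
    #define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))
    #define MASK_INSR(v, m) (((v) * ((m) & -(m))) & (m))

    /* e.g. with INTR_INFO_INTR_TYPE_MASK == 0x700 (bits 10:8) and
     * X86_EVENTTYPE_NMI == 2:
     *   MASK_INSR(2, 0x700)     == 2 << 8 == 0x200
     *   MASK_EXTR(0x200, 0x700) == 2                  */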
@@ -1427,7 +1468,9 @@ void vmx_inject_extint(int trap, uint8_t source)
PIN_BASED_VM_EXEC_CONTROL);
if ( pin_based_cntrl & PIN_BASED_EXT_INTR_MASK ) {
nvmx_enqueue_n2_exceptions (v,
- INTR_INFO_VALID_MASK | (X86_EVENTTYPE_EXT_INTR<<8) | trap,
+ INTR_INFO_VALID_MASK |
+ MASK_INSR(X86_EVENTTYPE_EXT_INTR, INTR_INFO_INTR_TYPE_MASK) |
+ MASK_INSR(trap, INTR_INFO_VECTOR_MASK),
HVM_DELIVER_NO_ERROR_CODE, source);
return;
}
@@ -1446,7 +1489,9 @@ void vmx_inject_nmi(void)
PIN_BASED_VM_EXEC_CONTROL);
if ( pin_based_cntrl & PIN_BASED_NMI_EXITING ) {
nvmx_enqueue_n2_exceptions (v,
- INTR_INFO_VALID_MASK | (X86_EVENTTYPE_NMI<<8) | TRAP_nmi,
+ INTR_INFO_VALID_MASK |
+ MASK_INSR(X86_EVENTTYPE_NMI, INTR_INFO_INTR_TYPE_MASK) |
+ MASK_INSR(TRAP_nmi, INTR_INFO_VECTOR_MASK),
HVM_DELIVER_NO_ERROR_CODE, hvm_intsrc_nmi);
return;
}
@@ -1471,7 +1516,7 @@ static void vmx_inject_trap(struct hvm_trap *trap)
if ( (_trap.vector == TRAP_page_fault) &&
(_trap.type == X86_EVENTTYPE_HW_EXCEPTION) )
- current->arch.hvm_vcpu.guest_cr[2] = _trap.cr2;
+ curr->arch.hvm_vcpu.guest_cr[2] = _trap.cr2;
if ( nestedhvm_vcpu_in_guestmode(curr) )
intr_info = vcpu_2_nvmx(curr).intr.intr_info;
@@ -1484,7 +1529,7 @@ static void vmx_inject_trap(struct hvm_trap *trap)
if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
{
__restore_debug_registers(curr);
- write_debugreg(6, read_debugreg(6) | 0x4000);
+ write_debugreg(6, read_debugreg(6) | DR_STEP);
}
if ( cpu_has_monitor_trap_flag )
break;
@@ -1499,7 +1544,8 @@ static void vmx_inject_trap(struct hvm_trap *trap)
}
if ( unlikely(intr_info & INTR_INFO_VALID_MASK) &&
- (((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) )
+ (MASK_EXTR(intr_info, INTR_INFO_INTR_TYPE_MASK) ==
+ X86_EVENTTYPE_HW_EXCEPTION) )
{
_trap.vector = hvm_combine_hw_exceptions(
(uint8_t)intr_info, _trap.vector);
@@ -1514,7 +1560,9 @@ static void vmx_inject_trap(struct hvm_trap *trap)
nvmx_intercepts_exception(curr, _trap.vector, _trap.error_code) )
{
nvmx_enqueue_n2_exceptions (curr,
- INTR_INFO_VALID_MASK | (_trap.type<<8) | _trap.vector,
+ INTR_INFO_VALID_MASK |
+ MASK_INSR(_trap.type, INTR_INFO_INTR_TYPE_MASK) |
+ MASK_INSR(_trap.vector, INTR_INFO_VECTOR_MASK),
_trap.error_code, hvm_intsrc_none);
return;
}
@@ -1524,7 +1572,7 @@ static void vmx_inject_trap(struct hvm_trap *trap)
if ( (_trap.vector == TRAP_page_fault) &&
(_trap.type == X86_EVENTTYPE_HW_EXCEPTION) )
HVMTRACE_LONG_2D(PF_INJECT, _trap.error_code,
- TRC_PAR_LONG(current->arch.hvm_vcpu.guest_cr[2]));
+ TRC_PAR_LONG(curr->arch.hvm_vcpu.guest_cr[2]));
else
HVMTRACE_2D(INJ_EXC, _trap.vector, _trap.error_code);
}
@@ -1718,6 +1766,126 @@ static void vmx_enable_msr_exit_interception(struct domain *d)
MSR_TYPE_W);
}
+static bool_t vmx_is_singlestep_supported(void)
+{
+ return !!cpu_has_monitor_trap_flag;
+}
+
+static void vmx_vcpu_update_eptp(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ struct p2m_domain *p2m = NULL;
+ struct ept_data *ept;
+
+ if ( altp2m_active(d) )
+ p2m = p2m_get_altp2m(v);
+ if ( !p2m )
+ p2m = p2m_get_hostp2m(d);
+
+ ept = &p2m->ept;
+ ept->asr = pagetable_get_pfn(p2m_get_pagetable(p2m));
+
+ vmx_vmcs_enter(v);
+
+ __vmwrite(EPT_POINTER, ept_get_eptp(ept));
+
+ if ( v->arch.hvm_vmx.secondary_exec_control &
+ SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS )
+ __vmwrite(EPTP_INDEX, vcpu_altp2m(v).p2midx);
+
+ vmx_vmcs_exit(v);
+}
+
+static void vmx_vcpu_update_vmfunc_ve(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ u32 mask = SECONDARY_EXEC_ENABLE_VM_FUNCTIONS;
+
+ if ( !cpu_has_vmx_vmfunc )
+ return;
+
+ if ( cpu_has_vmx_virt_exceptions )
+ mask |= SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS;
+
+ vmx_vmcs_enter(v);
+
+ if ( !d->is_dying && altp2m_active(d) )
+ {
+ v->arch.hvm_vmx.secondary_exec_control |= mask;
+ __vmwrite(VM_FUNCTION_CONTROL, VMX_VMFUNC_EPTP_SWITCHING);
+ __vmwrite(EPTP_LIST_ADDR, virt_to_maddr(d->arch.altp2m_eptp));
+
+ if ( cpu_has_vmx_virt_exceptions )
+ {
+ p2m_type_t t;
+ mfn_t mfn;
+
+ mfn = get_gfn_query_unlocked(d, gfn_x(vcpu_altp2m(v).veinfo_gfn), &t);
+
+ if ( mfn_x(mfn) != INVALID_MFN )
+ __vmwrite(VIRT_EXCEPTION_INFO, mfn_x(mfn) << PAGE_SHIFT);
+ else
+ v->arch.hvm_vmx.secondary_exec_control &=
+ ~SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS;
+ }
+ }
+ else
+ v->arch.hvm_vmx.secondary_exec_control &= ~mask;
+
+ __vmwrite(SECONDARY_VM_EXEC_CONTROL,
+ v->arch.hvm_vmx.secondary_exec_control);
+
+ vmx_vmcs_exit(v);
+}
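EPTP_LIST_ADDR points at a 4K page that the CPU consults on VMFUNC leaf 0: ECX indexes into up to 512 64-bit EPTP values. A sketch of the layout the altp2m code above relies on (per the Intel SDM):

    /* Sketch: the EPTP-switching list page. */
    #define EPTP_LIST_ENTRIES 512            /* 4K / sizeof(uint64_t) */
    typedef struct {
        uint64_t eptp[EPTP_LIST_ENTRIES];    /* entry i used when ECX == i */
    } eptp_list_page_t;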
+
+static int vmx_vcpu_emulate_vmfunc(struct cpu_user_regs *regs)
+{
+ int rc = X86EMUL_EXCEPTION;
+ struct vcpu *curr = current;
+
+ if ( !cpu_has_vmx_vmfunc && altp2m_active(curr->domain) &&
+ regs->_eax == 0 &&
+ p2m_switch_vcpu_altp2m_by_id(curr, regs->_ecx) )
+ rc = X86EMUL_OKAY;
+
+ return rc;
+}
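The emulation path mirrors what the instruction does in hardware; from the guest's point of view the invocation looks like this sketch (the wrapper name is hypothetical; 0F 01 D4 is the VMFUNC opcode):

    /* Sketch: guest-side EPTP switch via VMFUNC leaf 0. */
    static inline void vmfunc_switch_view(unsigned int idx)
    {
        asm volatile ( ".byte 0x0f, 0x01, 0xd4"       /* vmfunc */
                       :: "a" (0),  /* leaf 0: EPTP switching */
                          "c" (idx) /* EPTP-list index */
                       : "memory" );
    }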
+
+static bool_t vmx_vcpu_emulate_ve(struct vcpu *v)
+{
+ bool_t rc = 0, writable;
+ unsigned long gfn = gfn_x(vcpu_altp2m(v).veinfo_gfn);
+ ve_info_t *veinfo;
+
+ if ( gfn == INVALID_GFN )
+ return 0;
+
+ veinfo = hvm_map_guest_frame_rw(gfn, 0, &writable);
+ if ( !veinfo )
+ return 0;
+ if ( !writable || veinfo->semaphore != 0 )
+ goto out;
+
+ rc = 1;
+
+ veinfo->exit_reason = EXIT_REASON_EPT_VIOLATION;
+ veinfo->semaphore = ~0;
+ veinfo->eptp_index = vcpu_altp2m(v).p2midx;
+
+ vmx_vmcs_enter(v);
+ __vmread(EXIT_QUALIFICATION, &veinfo->exit_qualification);
+ __vmread(GUEST_LINEAR_ADDRESS, &veinfo->gla);
+ __vmread(GUEST_PHYSICAL_ADDRESS, &veinfo->gpa);
+ vmx_vmcs_exit(v);
+
+ hvm_inject_hw_exception(TRAP_virtualisation,
+ HVM_DELIVER_NO_ERROR_CODE);
+
+ out:
+ hvm_unmap_guest_frame(veinfo, 0);
+ return rc;
+}
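The fields written here target the guest's #VE information area; its layout, as the code above and the Intel SDM suggest (restated as a sketch, since the real ve_info_t is defined elsewhere in the tree):

    /* Sketch of the #VE information area (offsets per Intel SDM). */
    typedef struct {
        uint32_t exit_reason;         /* 0x00: EXIT_REASON_EPT_VIOLATION  */
        uint32_t semaphore;           /* 0x04: nonzero = guest busy, skip */
        uint64_t exit_qualification;  /* 0x08 */
        uint64_t gla;                 /* 0x10: guest linear address       */
        uint64_t gpa;                 /* 0x18: guest physical address     */
        uint16_t eptp_index;          /* 0x20 */
    } ve_info_sketch_t;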
+
static struct hvm_function_table __initdata vmx_function_table = {
.name = "VMX",
.cpu_up_prepare = vmx_cpu_up_prepare,
@@ -1754,15 +1922,14 @@ static struct hvm_function_table __initdata vmx_function_table = {
.msr_read_intercept = vmx_msr_read_intercept,
.msr_write_intercept = vmx_msr_write_intercept,
.invlpg_intercept = vmx_invlpg_intercept,
+ .vmfunc_intercept = vmx_vmfunc_intercept,
.handle_cd = vmx_handle_cd,
.set_info_guest = vmx_set_info_guest,
.set_rdtsc_exiting = vmx_set_rdtsc_exiting,
.nhvm_vcpu_initialise = nvmx_vcpu_initialise,
.nhvm_vcpu_destroy = nvmx_vcpu_destroy,
.nhvm_vcpu_reset = nvmx_vcpu_reset,
- .nhvm_vcpu_guestcr3 = nvmx_vcpu_guestcr3,
.nhvm_vcpu_p2m_base = nvmx_vcpu_eptp_base,
- .nhvm_vcpu_asid = nvmx_vcpu_asid,
.nhvm_vmcx_hap_enabled = nvmx_ept_enabled,
.nhvm_vmcx_guest_intercepts_trap = nvmx_intercepts_exception,
.nhvm_vcpu_vmexit_trap = nvmx_vmexit_trap,
@@ -1777,6 +1944,11 @@ static struct hvm_function_table __initdata vmx_function_table = {
.nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
.hypervisor_cpuid_leaf = vmx_hypervisor_cpuid_leaf,
.enable_msr_exit_interception = vmx_enable_msr_exit_interception,
+ .is_singlestep_supported = vmx_is_singlestep_supported,
+ .altp2m_vcpu_update_p2m = vmx_vcpu_update_eptp,
+ .altp2m_vcpu_update_vmfunc_ve = vmx_vcpu_update_vmfunc_ve,
+ .altp2m_vcpu_emulate_ve = vmx_vcpu_emulate_ve,
+ .altp2m_vcpu_emulate_vmfunc = vmx_vcpu_emulate_vmfunc,
};
const struct hvm_function_table * __init start_vmx(void)
@@ -1796,6 +1968,7 @@ const struct hvm_function_table * __init start_vmx(void)
if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) )
{
vmx_function_table.hap_supported = 1;
+ vmx_function_table.altp2m_supported = 1;
vmx_function_table.hap_capabilities = 0;
@@ -1842,7 +2015,7 @@ static int get_instruction_length(void)
unsigned long len;
__vmread(VM_EXIT_INSTRUCTION_LEN, &len); /* Safe: callers audited */
- BUG_ON((len < 1) || (len > 15));
+ BUG_ON((len < 1) || (len > MAX_INST_LEN));
return len;
}
@@ -1947,6 +2120,19 @@ static void vmx_invlpg_intercept(unsigned long vaddr)
vpid_sync_vcpu_gva(curr, vaddr);
}
+static int vmx_vmfunc_intercept(struct cpu_user_regs *regs)
+{
+ /*
+ * This handler is a placeholder for a future in which Xen may want to
+ * handle VMFUNC exits and resume the domain normally, without injecting
+ * a #UD into the guest - for example, in a nested-VT scenario where Xen
+ * may want to lazily shadow the alternate EPTP list.
+ */
+ gdprintk(XENLOG_ERR, "Failed guest VMFUNC execution\n");
+ return X86EMUL_EXCEPTION;
+}
+
static int vmx_cr_access(unsigned long exit_qualification)
{
struct vcpu *curr = current;
@@ -1965,18 +2151,28 @@ static int vmx_cr_access(unsigned long exit_qualification)
}
case VMX_CONTROL_REG_ACCESS_TYPE_CLTS: {
unsigned long old = curr->arch.hvm_vcpu.guest_cr[0];
- curr->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS;
+ unsigned long value = old & ~X86_CR0_TS;
+
+ /*
+ * Special case unlikely to be interesting to a
+ * VM_EVENT_FLAG_DENY-capable application, so the hvm_event_crX()
+ * return value is ignored for now.
+ */
+ hvm_event_crX(CR0, value, old);
+ curr->arch.hvm_vcpu.guest_cr[0] = value;
vmx_update_guest_cr(curr, 0);
- hvm_memory_event_cr0(curr->arch.hvm_vcpu.guest_cr[0], old);
HVMTRACE_0D(CLTS);
break;
}
case VMX_CONTROL_REG_ACCESS_TYPE_LMSW: {
unsigned long value = curr->arch.hvm_vcpu.guest_cr[0];
- /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
- value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
+
+ /* LMSW can (1) set PE; (2) set or clear MP, EM, and TS. */
+ value = (value & ~(X86_CR0_MP|X86_CR0_EM|X86_CR0_TS)) |
+ (VMX_CONTROL_REG_ACCESS_DATA(exit_qualification) &
+ (X86_CR0_PE|X86_CR0_MP|X86_CR0_EM|X86_CR0_TS));
HVMTRACE_LONG_1D(LMSW, value);
- return hvm_set_cr0(value);
+ return hvm_set_cr0(value, 1);
}
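The rewritten mask spells out the architectural LMSW rule: only the low four CR0 bits can be touched, and PE can be set but never cleared, because it is absent from the bits masked out of the old value. As a sketch restating the computation:

    /* Sketch: LMSW result, mirroring the case above. */
    static unsigned long lmsw_result(unsigned long cr0, unsigned int operand)
    {
        const unsigned long rw = X86_CR0_MP | X86_CR0_EM | X86_CR0_TS;

        /* PE survives from cr0 and can only be OR-ed in, never cleared. */
        return (cr0 & ~rw) | (operand & (X86_CR0_PE | rw));
    }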
default:
BUG();
@@ -2040,14 +2236,20 @@ static const struct lbr_info *last_branch_msr_get(void)
case 58: case 62:
/* Haswell */
case 60: case 63: case 69: case 70:
+ /* Broadwell */
+ case 61: case 79: case 86:
/* future */
- case 61: case 78:
+ case 78:
return nh_lbr;
break;
/* Atom */
case 28: case 38: case 39: case 53: case 54:
/* Silvermont */
case 55: case 74: case 77: case 90: case 93:
+ /* next gen Xeon Phi */
+ case 87:
+ /* Airmont */
+ case 76:
return at_lbr;
break;
}
@@ -2110,12 +2312,17 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
*msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
/* Perhaps vpmu will change some bits. */
+ /* FALLTHROUGH */
+ case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+ case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+ case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+ case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ case MSR_IA32_PEBS_ENABLE:
+ case MSR_IA32_DS_AREA:
if ( vpmu_do_rdmsr(msr, msr_content) )
- goto done;
+ goto gp_fault;
break;
default:
- if ( vpmu_do_rdmsr(msr, msr_content) )
- break;
if ( passive_domain_do_rdmsr(msr, msr_content) )
goto done;
switch ( long_mode_do_msr_read(msr, msr_content) )
@@ -2170,7 +2377,7 @@ static int vmx_alloc_vlapic_mapping(struct domain *d)
share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va);
set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE),
- _mfn(virt_to_mfn(apic_va)));
+ _mfn(virt_to_mfn(apic_va)), p2m_get_hostp2m(d)->default_access);
return 0;
}
@@ -2291,7 +2498,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
if ( msr_content & ~supported )
{
/* Perhaps some other bits are supported in vpmu. */
- if ( !vpmu_do_wrmsr(msr, msr_content, supported) )
+ if ( vpmu_do_wrmsr(msr, msr_content, supported) )
break;
}
if ( msr_content & IA32_DEBUGCTLMSR_LBR )
@@ -2319,9 +2526,16 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
if ( !nvmx_msr_write_intercept(msr, msr_content) )
goto gp_fault;
break;
+ case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+ case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(7):
+ case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+ case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ case MSR_IA32_PEBS_ENABLE:
+ case MSR_IA32_DS_AREA:
+ if ( vpmu_do_wrmsr(msr, msr_content, 0) )
+ goto gp_fault;
+ break;
default:
- if ( vpmu_do_wrmsr(msr, msr_content, 0) )
- return X86EMUL_OKAY;
if ( passive_domain_do_wrmsr(msr, msr_content) )
return X86EMUL_OKAY;
@@ -2458,21 +2672,21 @@ static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
/* Everything else is an error. */
mfn = get_gfn_query_unlocked(d, gfn, &p2mt);
- gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
- "gpa %#"PRIpaddr", mfn %#lx, type %i.\n",
- qualification,
- (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
- (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
- (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
- (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
- (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
- (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
- gpa, mfn_x(mfn), p2mt);
+ gprintk(XENLOG_ERR,
+ "EPT violation %#lx (%c%c%c/%c%c%c) gpa %#"PRIpaddr" mfn %#lx type %i\n",
+ qualification,
+ (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
+ (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
+ (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
+ (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
+ (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
+ (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
+ gpa, mfn_x(mfn), p2mt);
ept_walk_table(d, gfn);
if ( qualification & EPT_GLA_VALID )
- gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);
+ gprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);
domain_crash(d);
}
@@ -2606,8 +2820,9 @@ static void vmx_idtv_reinject(unsigned long idtv_info)
* Clear NMI-blocking interruptibility info if an NMI delivery faulted.
* Re-delivery will re-set it (see SDM 3B 25.7.1.2).
*/
- if ( cpu_has_vmx_vnmi && ((idtv_info & INTR_INFO_INTR_TYPE_MASK) ==
- (X86_EVENTTYPE_NMI<<8)) )
+ if ( cpu_has_vmx_vnmi &&
+ ((idtv_info & INTR_INFO_INTR_TYPE_MASK) ==
+ MASK_INSR(X86_EVENTTYPE_NMI, INTR_INFO_INTR_TYPE_MASK)) )
{
unsigned long intr_info;
@@ -2628,17 +2843,6 @@ static int vmx_handle_apic_write(void)
return vlapic_apicv_write(current, exit_qualification & 0xfff);
}
-/*
- * When "Virtual Interrupt Delivery" is enabled, this function is used
- * to handle EOI-induced VM exit
- */
-void vmx_handle_EOI_induced_exit(struct vlapic *vlapic, int vector)
-{
- ASSERT(cpu_has_vmx_virtual_intr_delivery);
-
- vlapic_handle_EOI_induced_exit(vlapic, vector);
-}
-
void vmx_vmexit_handler(struct cpu_user_regs *regs)
{
unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0;
@@ -2683,9 +2887,9 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
vector = intr_info & INTR_INFO_VECTOR_MASK;
if ( vector == TRAP_machine_check )
do_machine_check(regs);
- if ( vector == TRAP_nmi
- && ((intr_info & INTR_INFO_INTR_TYPE_MASK) ==
- (X86_EVENTTYPE_NMI << 8)) )
+ if ( (vector == TRAP_nmi) &&
+ ((intr_info & INTR_INFO_INTR_TYPE_MASK) ==
+ MASK_INSR(X86_EVENTTYPE_NMI, INTR_INFO_INTR_TYPE_MASK)) )
{
exception_table[TRAP_nmi](regs);
enable_nmis();
@@ -2699,6 +2903,42 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
/* Now enable interrupts so it's safe to take locks. */
local_irq_enable();
+ /*
+ * If the guest has the ability to switch EPTP without an exit,
+ * figure out whether it has done so and update the altp2m data.
+ */
+ if ( altp2m_active(v->domain) &&
+ (v->arch.hvm_vmx.secondary_exec_control &
+ SECONDARY_EXEC_ENABLE_VM_FUNCTIONS) )
+ {
+ unsigned long idx;
+
+ if ( v->arch.hvm_vmx.secondary_exec_control &
+ SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS )
+ __vmread(EPTP_INDEX, &idx);
+ else
+ {
+ unsigned long eptp;
+
+ __vmread(EPT_POINTER, &eptp);
+
+ if ( (idx = p2m_find_altp2m_by_eptp(v->domain, eptp)) ==
+ INVALID_ALTP2M )
+ {
+ gdprintk(XENLOG_ERR, "EPTP not found in alternate p2m list\n");
+ domain_crash(v->domain);
+ }
+ }
+
+ if ( idx != vcpu_altp2m(v).p2midx )
+ {
+ BUG_ON(idx >= MAX_ALTP2M);
+ atomic_dec(&p2m_get_altp2m(v)->active_vcpus);
+ vcpu_altp2m(v).p2midx = idx;
+ atomic_inc(&p2m_get_altp2m(v)->active_vcpus);
+ }
+ }
+
/* XXX: This looks ugly, but we need a mechanism to ensure
* any pending vmresume has really happened
*/
@@ -2800,7 +3040,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
*/
__vmread(EXIT_QUALIFICATION, &exit_qualification);
HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
- write_debugreg(6, exit_qualification | 0xffff0ff0);
+ write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
if ( !v->domain->debugger_attached || cpu_has_monitor_trap_flag )
goto exit_and_crash;
domain_pause_for_debugger();
@@ -2811,12 +3051,12 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
if ( v->domain->debugger_attached )
{
update_guest_eip(); /* Safe: INT3 */
- current->arch.gdbsx_vcpu_event = TRAP_int3;
+ v->arch.gdbsx_vcpu_event = TRAP_int3;
domain_pause_for_debugger();
break;
}
else {
- int handled = hvm_memory_event_int3(regs->eip);
+ int handled = hvm_event_int3(regs->eip);
if ( handled < 0 )
{
@@ -2869,8 +3109,8 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
hvm_inject_page_fault(regs->error_code, exit_qualification);
break;
case TRAP_nmi:
- if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
- (X86_EVENTTYPE_NMI << 8) )
+ if ( MASK_EXTR(intr_info, INTR_INFO_INTR_TYPE_MASK) !=
+ X86_EVENTTYPE_NMI )
goto exit_and_crash;
HVMTRACE_0D(NMI);
/* Already handled above. */
@@ -2921,7 +3161,8 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
* - TSW is a vectored event due to a SW exception or SW interrupt.
*/
inst_len = ((source != 3) || /* CALL, IRET, or JMP? */
- (idtv_info & (1u<<10))) /* IntrType > 3? */
+ (MASK_EXTR(idtv_info, INTR_INFO_INTR_TYPE_MASK)
+ > 3)) /* IntrType > 3? */
? get_instruction_length() /* Safe: SDM 3B 23.2.4 */ : 0;
if ( (source == 3) && (idtv_info & INTR_INFO_DELIVER_CODE_MASK) )
__vmread(IDT_VECTORING_ERROR_CODE, &ecode);
@@ -2990,7 +3231,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
{
uint64_t msr_content;
msr_content = ((uint64_t)regs->edx << 32) | (uint32_t)regs->eax;
- if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY )
+ if ( hvm_msr_write_intercept(regs->ecx, msr_content, 1) == X86EMUL_OKAY )
update_guest_eip(); /* Safe: WRMSR */
break;
}
@@ -3050,6 +3291,13 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
update_guest_eip();
break;
+ case EXIT_REASON_VMFUNC:
+ if ( vmx_vmfunc_intercept(regs) != X86EMUL_OKAY )
+ hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+ else
+ update_guest_eip();
+ break;
+
case EXIT_REASON_MWAIT_INSTRUCTION:
case EXIT_REASON_MONITOR_INSTRUCTION:
case EXIT_REASON_GETSEC:
@@ -3071,15 +3319,12 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
break;
case EXIT_REASON_EOI_INDUCED:
- {
- int vector;
-
__vmread(EXIT_QUALIFICATION, &exit_qualification);
- vector = exit_qualification & 0xff;
- vmx_handle_EOI_induced_exit(vcpu_vlapic(current), vector);
+ ASSERT(cpu_has_vmx_virtual_intr_delivery);
+
+ vlapic_handle_EOI(vcpu_vlapic(v), exit_qualification);
break;
- }
case EXIT_REASON_IO_INSTRUCTION:
__vmread(EXIT_QUALIFICATION, &exit_qualification);
@@ -3133,7 +3378,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
vmx_update_cpu_exec_control(v);
if ( v->arch.hvm_vcpu.single_step ) {
- hvm_memory_event_single_step(regs->eip);
+ hvm_event_single_step(regs->eip);
if ( v->domain->debugger_attached )
domain_pause_for_debugger();
}
@@ -3142,7 +3387,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
case EXIT_REASON_PAUSE_INSTRUCTION:
perfc_incr(pauseloop_exits);
- do_sched_op_compat(SCHEDOP_yield, 0);
+ do_sched_op(SCHEDOP_yield, guest_handle_from_ptr(NULL, void));
break;
case EXIT_REASON_XSETBV:
@@ -3155,6 +3400,10 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
vmx_handle_apic_write();
break;
+ case EXIT_REASON_PML_FULL:
+ vmx_vcpu_flush_pml_buffer(v);
+ break;
+
case EXIT_REASON_ACCESS_GDTR_OR_IDTR:
case EXIT_REASON_ACCESS_LDTR_OR_TR:
case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index 9ccc03f..cb6f9b8 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -15,8 +15,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
@@ -98,9 +97,9 @@ int nvmx_vcpu_initialise(struct vcpu *v)
* Let them vmexit as usual.
*/
set_bit(IO_BITMAP_A, vw);
- set_bit(IO_BITMAP_A_HIGH, vw);
+ set_bit(VMCS_HIGH(IO_BITMAP_A), vw);
set_bit(IO_BITMAP_B, vw);
- set_bit(IO_BITMAP_B_HIGH, vw);
+ set_bit(VMCS_HIGH(IO_BITMAP_B), vw);
unmap_domain_page(vr);
unmap_domain_page(vw);
@@ -174,13 +173,6 @@ int nvmx_vcpu_reset(struct vcpu *v)
return 0;
}
-uint64_t nvmx_vcpu_guestcr3(struct vcpu *v)
-{
- /* TODO */
- ASSERT(0);
- return 0;
-}
-
uint64_t nvmx_vcpu_eptp_base(struct vcpu *v)
{
uint64_t eptp_base;
@@ -190,13 +182,6 @@ uint64_t nvmx_vcpu_eptp_base(struct vcpu *v)
return eptp_base & PAGE_MASK;
}
-uint32_t nvmx_vcpu_asid(struct vcpu *v)
-{
- /* TODO */
- ASSERT(0);
- return 0;
-}
-
bool_t nvmx_ept_enabled(struct vcpu *v)
{
struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
@@ -514,8 +499,8 @@ static void vmreturn(struct cpu_user_regs *regs, enum vmx_ops_result ops_res)
regs->eflags = eflags;
}
-int nvmx_intercepts_exception(struct vcpu *v, unsigned int trap,
- int error_code)
+bool_t nvmx_intercepts_exception(struct vcpu *v, unsigned int trap,
+ int error_code)
{
struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
u32 exception_bitmap, pfec_match=0, pfec_mask=0;
@@ -881,11 +866,11 @@ static const u16 vmcs_gstate_field[] = {
GUEST_SYSENTER_EIP,
};
-static const u16 gpdptr_fields[] = {
- GUEST_PDPTR0,
- GUEST_PDPTR1,
- GUEST_PDPTR2,
- GUEST_PDPTR3,
+static const u16 gpdpte_fields[] = {
+ GUEST_PDPTE(0),
+ GUEST_PDPTE(1),
+ GUEST_PDPTE(2),
+ GUEST_PDPTE(3),
};
/*
@@ -1048,15 +1033,16 @@ static void load_shadow_guest_state(struct vcpu *v)
nvcpu->guest_cr[0] = __get_vvmcs(vvmcs, CR0_READ_SHADOW);
nvcpu->guest_cr[4] = __get_vvmcs(vvmcs, CR4_READ_SHADOW);
- hvm_set_cr0(__get_vvmcs(vvmcs, GUEST_CR0));
- hvm_set_cr4(__get_vvmcs(vvmcs, GUEST_CR4));
- hvm_set_cr3(__get_vvmcs(vvmcs, GUEST_CR3));
+ hvm_set_cr0(__get_vvmcs(vvmcs, GUEST_CR0), 1);
+ hvm_set_cr4(__get_vvmcs(vvmcs, GUEST_CR4), 1);
+ hvm_set_cr3(__get_vvmcs(vvmcs, GUEST_CR3), 1);
control = __get_vvmcs(vvmcs, VM_ENTRY_CONTROLS);
if ( control & VM_ENTRY_LOAD_GUEST_PAT )
hvm_set_guest_pat(v, __get_vvmcs(vvmcs, GUEST_PAT));
if ( control & VM_ENTRY_LOAD_PERF_GLOBAL_CTRL )
- hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL, __get_vvmcs(vvmcs, GUEST_PERF_GLOBAL_CTRL));
+ hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL,
+ __get_vvmcs(vvmcs, GUEST_PERF_GLOBAL_CTRL), 0);
hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, 0);
@@ -1173,7 +1159,7 @@ static void virtual_vmentry(struct cpu_user_regs *regs)
if ( nvmx_ept_enabled(v) && hvm_pae_enabled(v) &&
!(v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
- vvmcs_to_shadow_bulk(v, ARRAY_SIZE(gpdptr_fields), gpdptr_fields);
+ vvmcs_to_shadow_bulk(v, ARRAY_SIZE(gpdpte_fields), gpdpte_fields);
regs->eip = __get_vvmcs(vvmcs, GUEST_RIP);
regs->esp = __get_vvmcs(vvmcs, GUEST_RSP);
@@ -1249,15 +1235,16 @@ static void load_vvmcs_host_state(struct vcpu *v)
__vmwrite(vmcs_h2g_field[i].guest_field, r);
}
- hvm_set_cr0(__get_vvmcs(vvmcs, HOST_CR0));
- hvm_set_cr4(__get_vvmcs(vvmcs, HOST_CR4));
- hvm_set_cr3(__get_vvmcs(vvmcs, HOST_CR3));
+ hvm_set_cr0(__get_vvmcs(vvmcs, HOST_CR0), 1);
+ hvm_set_cr4(__get_vvmcs(vvmcs, HOST_CR4), 1);
+ hvm_set_cr3(__get_vvmcs(vvmcs, HOST_CR3), 1);
control = __get_vvmcs(vvmcs, VM_EXIT_CONTROLS);
if ( control & VM_EXIT_LOAD_HOST_PAT )
hvm_set_guest_pat(v, __get_vvmcs(vvmcs, HOST_PAT));
if ( control & VM_EXIT_LOAD_PERF_GLOBAL_CTRL )
- hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL, __get_vvmcs(vvmcs, HOST_PERF_GLOBAL_CTRL));
+ hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL,
+ __get_vvmcs(vvmcs, HOST_PERF_GLOBAL_CTRL), 1);
hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, 0);
@@ -1272,7 +1259,7 @@ static void sync_exception_state(struct vcpu *v)
if ( !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) )
return;
- switch ( (nvmx->intr.intr_info & INTR_INFO_INTR_TYPE_MASK) >> 8 )
+ switch ( MASK_EXTR(nvmx->intr.intr_info, INTR_INFO_INTR_TYPE_MASK) )
{
case X86_EVENTTYPE_EXT_INTR:
/* rename exit_reason to EXTERNAL_INTERRUPT */
@@ -1327,10 +1314,10 @@ static void nvmx_update_apicv(struct vcpu *v)
ppr = vlapic_set_ppr(vlapic);
WARN_ON((ppr & 0xf0) != (vector & 0xf0));
- status = vector << 8;
+ status = vector << VMX_GUEST_INTR_STATUS_SVI_OFFSET;
rvi = vlapic_has_pending_irq(v);
if ( rvi != -1 )
- status |= rvi & 0xff;
+ status |= rvi & VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK;
__vmwrite(GUEST_INTR_STATUS, status);
}
@@ -1348,7 +1335,7 @@ static void virtual_vmexit(struct cpu_user_regs *regs)
if ( nvmx_ept_enabled(v) && hvm_pae_enabled(v) &&
!(v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
- shadow_to_vvmcs_bulk(v, ARRAY_SIZE(gpdptr_fields), gpdptr_fields);
+ shadow_to_vvmcs_bulk(v, ARRAY_SIZE(gpdpte_fields), gpdpte_fields);
vmx_vmcs_switch(v->arch.hvm_vmx.vmcs, nvcpu->nv_n1vmcx);
@@ -1631,10 +1618,23 @@ int nvmx_handle_vmptrld(struct cpu_user_regs *regs)
if ( nvcpu->nv_vvmcxaddr == VMCX_EADDR )
{
- nvcpu->nv_vvmcx = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 1);
- if ( nvcpu->nv_vvmcx )
- nvcpu->nv_vvmcxaddr = gpa;
- if ( !nvcpu->nv_vvmcx ||
+ bool_t writable;
+ void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 1, &writable);
+
+ if ( vvmcx )
+ {
+ if ( writable )
+ {
+ nvcpu->nv_vvmcx = vvmcx;
+ nvcpu->nv_vvmcxaddr = gpa;
+ }
+ else
+ {
+ hvm_unmap_guest_frame(vvmcx, 1);
+ vvmcx = NULL;
+ }
+ }
+ if ( !vvmcx ||
!map_io_bitmap_all(v) ||
!_map_msr_bitmap(v) )
{
@@ -1688,13 +1688,10 @@ int nvmx_handle_vmclear(struct cpu_user_regs *regs)
if ( rc != X86EMUL_OKAY )
return rc;
+ BUILD_BUG_ON(X86EMUL_OKAY != VMSUCCEED); /* rc = VMSUCCEED; */
if ( gpa & 0xfff )
- {
- vmreturn(regs, VMFAIL_INVALID);
- return X86EMUL_OKAY;
- }
-
- if ( gpa == nvcpu->nv_vvmcxaddr )
+ rc = VMFAIL_INVALID;
+ else if ( gpa == nvcpu->nv_vvmcxaddr )
{
if ( cpu_has_vmx_vmcs_shadowing )
nvmx_clear_vmcs_pointer(v, nvcpu->nv_vvmcx);
@@ -1705,14 +1702,22 @@ int nvmx_handle_vmclear(struct cpu_user_regs *regs)
else
{
/* Even if this VMCS isn't the current one, we must clear it. */
- vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0);
+ bool_t writable;
+
+ vvmcs = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 0, &writable);
if ( vvmcs )
- clear_vvmcs_launched(&nvmx->launched_list,
- domain_page_map_to_mfn(vvmcs));
- hvm_unmap_guest_frame(vvmcs, 0);
+ {
+ if ( writable )
+ clear_vvmcs_launched(&nvmx->launched_list,
+ domain_page_map_to_mfn(vvmcs));
+ else
+ rc = VMFAIL_VALID;
+ hvm_unmap_guest_frame(vvmcs, 0);
+ }
}
- vmreturn(regs, VMSUCCEED);
+ vmreturn(regs, rc);
+
return X86EMUL_OKAY;
}
@@ -1761,15 +1766,15 @@ int nvmx_handle_vmwrite(struct cpu_user_regs *regs)
vmcs_encoding = reg_read(regs, decode.reg2);
__set_vvmcs(nvcpu->nv_vvmcx, vmcs_encoding, operand);
- switch ( vmcs_encoding )
+ switch ( vmcs_encoding & ~VMCS_HIGH(0) )
{
- case IO_BITMAP_A: case IO_BITMAP_A_HIGH:
+ case IO_BITMAP_A:
okay = _map_io_bitmap(v, IO_BITMAP_A);
break;
- case IO_BITMAP_B: case IO_BITMAP_B_HIGH:
+ case IO_BITMAP_B:
okay = _map_io_bitmap(v, IO_BITMAP_B);
break;
- case MSR_BITMAP: case MSR_BITMAP_HIGH:
+ case MSR_BITMAP:
okay = _map_msr_bitmap(v);
break;
}
@@ -2161,7 +2166,8 @@ int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs,
case EXIT_REASON_EXCEPTION_NMI:
{
unsigned long intr_info;
- u32 valid_mask = (X86_EVENTTYPE_HW_EXCEPTION << 8) |
+ u32 valid_mask = MASK_INSR(X86_EVENTTYPE_HW_EXCEPTION,
+ INTR_INFO_INTR_TYPE_MASK) |
INTR_INFO_VALID_MASK;
u64 exec_bitmap;
int vector;
@@ -2350,8 +2356,8 @@ int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs,
u32 mask = 0;
__vmread(EXIT_QUALIFICATION, &exit_qualification);
- cr = exit_qualification & 0xf;
- write = (exit_qualification >> 4) & 3;
+ cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
+ write = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
/* also according to guest exec_control */
ctrl = __n2_exec_control(v);
@@ -2443,8 +2449,9 @@ int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs,
u64 cr0_gh_mask = __get_vvmcs(nvcpu->nv_vvmcx, CR0_GUEST_HOST_MASK);
__vmread(CR0_READ_SHADOW, &old_val);
- old_val &= 0xf;
- val = (exit_qualification >> 16) & 0xf;
+ old_val &= X86_CR0_PE|X86_CR0_MP|X86_CR0_EM|X86_CR0_TS;
+ val = VMX_CONTROL_REG_ACCESS_DATA(exit_qualification) &
+ (X86_CR0_PE|X86_CR0_MP|X86_CR0_EM|X86_CR0_TS);
changed_bits = old_val ^ val;
if ( changed_bits & cr0_gh_mask )
nvcpu->nv_vmexit_pending = 1;
diff --git a/xen/arch/x86/hvm/vpic.c b/xen/arch/x86/hvm/vpic.c
index c2c8fb6..7c2edc8 100644
--- a/xen/arch/x86/hvm/vpic.c
+++ b/xen/arch/x86/hvm/vpic.c
@@ -56,7 +56,7 @@ static int vpic_get_priority(struct hvm_hw_vpic *vpic, uint8_t mask)
return VPIC_PRIO_NONE;
/* prio = ffs(mask ROR vpic->priority_add); */
- asm ( "ror %%cl,%b1 ; bsf %1,%0"
+ asm ( "ror %%cl,%b1 ; rep; bsf %1,%0"
: "=r" (prio) : "q" ((uint32_t)mask), "c" (vpic->priority_add) );
return prio;
}
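"rep; bsf" is the TZCNT encoding: BMI1-capable CPUs execute TZCNT, which writes the operand width for a zero source, while older CPUs ignore the redundant REP prefix and execute plain BSF. Either way the result is the bit index whenever the source is nonzero, which the early-out above guarantees. A sketch (the motivation is an educated guess, not taken from a changelog):

    /* Sketch: count trailing zeros via the dual-encoded instruction. */
    static inline unsigned int tzcnt32(uint32_t x)
    {
        unsigned int r;

        /* TZCNT on BMI1 CPUs (defines x == 0 -> 32); BSF elsewhere
         * (x == 0 leaves the destination undefined). */
        asm ( "rep; bsf %1,%0" : "=r" (r) : "rm" (x) );
        return r;
    }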
@@ -324,7 +324,7 @@ static uint32_t vpic_ioport_read(struct hvm_hw_vpic *vpic, uint32_t addr)
}
static int vpic_intercept_pic_io(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
struct hvm_hw_vpic *vpic;
@@ -346,7 +346,7 @@ static int vpic_intercept_pic_io(
}
static int vpic_intercept_elcr_io(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val)
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val)
{
struct hvm_hw_vpic *vpic;
uint32_t data;
diff --git a/xen/arch/x86/hvm/vpmu.c b/xen/arch/x86/hvm/vpmu.c
deleted file mode 100644
index 654b8b5..0000000
--- a/xen/arch/x86/hvm/vpmu.c
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * vpmu.c: PMU virtualization for HVM domain.
- *
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Author: Haitao Shan <haitao.shan at intel.com>
- */
-#include <xen/config.h>
-#include <xen/sched.h>
-#include <xen/xenoprof.h>
-#include <asm/regs.h>
-#include <asm/types.h>
-#include <asm/msr.h>
-#include <asm/nmi.h>
-#include <asm/hvm/support.h>
-#include <asm/hvm/vmx/vmx.h>
-#include <asm/hvm/vmx/vmcs.h>
-#include <asm/hvm/vpmu.h>
-#include <asm/hvm/svm/svm.h>
-#include <asm/hvm/svm/vmcb.h>
-#include <asm/apic.h>
-
-/*
- * "vpmu" : vpmu generally enabled
- * "vpmu=off" : vpmu generally disabled
- * "vpmu=bts" : vpmu enabled and Intel BTS feature switched on.
- */
-static unsigned int __read_mostly opt_vpmu_enabled;
-static void parse_vpmu_param(char *s);
-custom_param("vpmu", parse_vpmu_param);
-
-static DEFINE_PER_CPU(struct vcpu *, last_vcpu);
-
-static void __init parse_vpmu_param(char *s)
-{
- switch ( parse_bool(s) )
- {
- case 0:
- break;
- default:
- if ( !strcmp(s, "bts") )
- opt_vpmu_enabled |= VPMU_BOOT_BTS;
- else if ( *s )
- {
- printk("VPMU: unknown flag: %s - vpmu disabled!\n", s);
- break;
- }
- /* fall through */
- case 1:
- opt_vpmu_enabled |= VPMU_BOOT_ENABLED;
- break;
- }
-}
-
-int vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, uint64_t supported)
-{
- struct vpmu_struct *vpmu = vcpu_vpmu(current);
-
- if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_wrmsr )
- return vpmu->arch_vpmu_ops->do_wrmsr(msr, msr_content, supported);
- return 0;
-}
-
-int vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
-{
- struct vpmu_struct *vpmu = vcpu_vpmu(current);
-
- if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_rdmsr )
- return vpmu->arch_vpmu_ops->do_rdmsr(msr, msr_content);
- return 0;
-}
-
-void vpmu_do_interrupt(struct cpu_user_regs *regs)
-{
- struct vcpu *v = current;
- struct vpmu_struct *vpmu = vcpu_vpmu(v);
-
- if ( vpmu->arch_vpmu_ops )
- {
- struct vlapic *vlapic = vcpu_vlapic(v);
- u32 vlapic_lvtpc;
-
- if ( !vpmu->arch_vpmu_ops->do_interrupt(regs) ||
- !is_vlapic_lvtpc_enabled(vlapic) )
- return;
-
- vlapic_lvtpc = vlapic_get_reg(vlapic, APIC_LVTPC);
-
- switch ( GET_APIC_DELIVERY_MODE(vlapic_lvtpc) )
- {
- case APIC_MODE_FIXED:
- vlapic_set_irq(vlapic, vlapic_lvtpc & APIC_VECTOR_MASK, 0);
- break;
- case APIC_MODE_NMI:
- v->nmi_pending = 1;
- break;
- }
- }
-}
-
-void vpmu_do_cpuid(unsigned int input,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- struct vpmu_struct *vpmu = vcpu_vpmu(current);
-
- if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_cpuid )
- vpmu->arch_vpmu_ops->do_cpuid(input, eax, ebx, ecx, edx);
-}
-
-static void vpmu_save_force(void *arg)
-{
- struct vcpu *v = (struct vcpu *)arg;
- struct vpmu_struct *vpmu = vcpu_vpmu(v);
-
- if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
- return;
-
- if ( vpmu->arch_vpmu_ops )
- (void)vpmu->arch_vpmu_ops->arch_vpmu_save(v);
-
- vpmu_reset(vpmu, VPMU_CONTEXT_SAVE);
-
- per_cpu(last_vcpu, smp_processor_id()) = NULL;
-}
-
-void vpmu_save(struct vcpu *v)
-{
- struct vpmu_struct *vpmu = vcpu_vpmu(v);
- int pcpu = smp_processor_id();
-
- if ( !(vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) &&
- vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED)) )
- return;
-
- vpmu->last_pcpu = pcpu;
- per_cpu(last_vcpu, pcpu) = v;
-
- if ( vpmu->arch_vpmu_ops )
- if ( vpmu->arch_vpmu_ops->arch_vpmu_save(v) )
- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
-
- apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED);
-}
-
-void vpmu_load(struct vcpu *v)
-{
- struct vpmu_struct *vpmu = vcpu_vpmu(v);
- int pcpu = smp_processor_id();
- struct vcpu *prev = NULL;
-
- if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) )
- return;
-
- /* First time this VCPU is running here */
- if ( vpmu->last_pcpu != pcpu )
- {
- /*
- * Get the context from last pcpu that we ran on. Note that if another
- * VCPU is running there it must have saved this VCPU's context before
- * starting to run (see below).
- * There should be no race since remote pcpu will disable interrupts
- * before saving the context.
- */
- if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
- {
- vpmu_set(vpmu, VPMU_CONTEXT_SAVE);
- on_selected_cpus(cpumask_of(vpmu->last_pcpu),
- vpmu_save_force, (void *)v, 1);
- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
- }
- }
-
- /* Prevent forced context save from remote CPU */
- local_irq_disable();
-
- prev = per_cpu(last_vcpu, pcpu);
-
- if ( prev != v && prev )
- {
- vpmu = vcpu_vpmu(prev);
-
- /* Someone ran here before us */
- vpmu_set(vpmu, VPMU_CONTEXT_SAVE);
- vpmu_save_force(prev);
- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
-
- vpmu = vcpu_vpmu(v);
- }
-
- local_irq_enable();
-
- /* Only when PMU is counting, we load PMU context immediately. */
- if ( !vpmu_is_set(vpmu, VPMU_RUNNING) )
- return;
-
- if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_load )
- {
- apic_write_around(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
- /* Arch code needs to set VPMU_CONTEXT_LOADED */
- vpmu->arch_vpmu_ops->arch_vpmu_load(v);
- }
-}
-
-void vpmu_initialise(struct vcpu *v)
-{
- struct vpmu_struct *vpmu = vcpu_vpmu(v);
- uint8_t vendor = current_cpu_data.x86_vendor;
-
- if ( is_pvh_vcpu(v) )
- return;
-
- if ( vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) )
- vpmu_destroy(v);
- vpmu_clear(vpmu);
- vpmu->context = NULL;
-
- switch ( vendor )
- {
- case X86_VENDOR_AMD:
- if ( svm_vpmu_initialise(v, opt_vpmu_enabled) != 0 )
- opt_vpmu_enabled = 0;
- break;
-
- case X86_VENDOR_INTEL:
- if ( vmx_vpmu_initialise(v, opt_vpmu_enabled) != 0 )
- opt_vpmu_enabled = 0;
- break;
-
- default:
- printk("VPMU: Initialization failed. "
- "Unknown CPU vendor %d\n", vendor);
- opt_vpmu_enabled = 0;
- break;
- }
-}
-
-static void vpmu_clear_last(void *arg)
-{
- if ( this_cpu(last_vcpu) == arg )
- this_cpu(last_vcpu) = NULL;
-}
-
-void vpmu_destroy(struct vcpu *v)
-{
- struct vpmu_struct *vpmu = vcpu_vpmu(v);
-
- if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) )
- return;
-
- /*
- * Need to clear last_vcpu in case it points to v.
- * We can check here non-atomically whether it is 'v' since
- * last_vcpu can never become 'v' again at this point.
- * We will test it again in vpmu_clear_last() with interrupts
- * disabled to make sure we don't clear someone else.
- */
- if ( per_cpu(last_vcpu, vpmu->last_pcpu) == v )
- on_selected_cpus(cpumask_of(vpmu->last_pcpu),
- vpmu_clear_last, v, 1);
-
- if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_destroy )
- vpmu->arch_vpmu_ops->arch_vpmu_destroy(v);
-}
-
-/* Dump some vpmu information on the console. Used in keyhandler dump_domains(). */
-void vpmu_dump(struct vcpu *v)
-{
- struct vpmu_struct *vpmu = vcpu_vpmu(v);
-
- if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_dump )
- vpmu->arch_vpmu_ops->arch_vpmu_dump(v);
-}
-
-static int __init vpmu_init(void)
-{
- /* NMI watchdog uses LVTPC and HW counter */
- if ( opt_watchdog && opt_vpmu_enabled )
- {
- printk(XENLOG_WARNING "NMI watchdog is enabled. Turning VPMU off.\n");
- opt_vpmu_enabled = 0;
- }
-
- return 0;
-}
-__initcall(vpmu_init);
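The VPMU code is reorganized rather than dropped in 4.6; it moves out of hvm/ and reappears under xen/arch/x86/cpu/ elsewhere in this import. The scheme the deleted file implemented is still worth noting: PMU context is saved lazily, with each pcpu recording the last vcpu whose counters it holds (last_vcpu above), and a vcpu arriving on a new pcpu first forcing a remote save via IPI. A compressed sketch of that ownership pattern; the helper names (pmu_owner, save_ctxt, save_ctxt_ipi, load_ctxt) are illustrative, not Xen's:

static DEFINE_PER_CPU(struct vcpu *, pmu_owner);

static void pmu_arrive(struct vcpu *v, unsigned int pcpu)
{
    /* Our context may still be live on the pcpu we last ran on;
     * ask that pcpu to save it (it disables IRQs while saving). */
    if ( vcpu_vpmu(v)->last_pcpu != pcpu )
        on_selected_cpus(cpumask_of(vcpu_vpmu(v)->last_pcpu),
                         save_ctxt_ipi, v, 1);

    /* Evict whatever context this pcpu still holds for someone
     * else; IRQs off so a remote forced save cannot interleave. */
    local_irq_disable();
    if ( per_cpu(pmu_owner, pcpu) && per_cpu(pmu_owner, pcpu) != v )
        save_ctxt(per_cpu(pmu_owner, pcpu));
    per_cpu(pmu_owner, pcpu) = v;
    local_irq_enable();

    load_ctxt(v);
}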
diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
index 7c6549c..0c8b22e 100644
--- a/xen/arch/x86/hvm/vpt.c
+++ b/xen/arch/x86/hvm/vpt.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/time.h>
diff --git a/xen/arch/x86/i387.c b/xen/arch/x86/i387.c
index a372e0b..14f2a79 100644
--- a/xen/arch/x86/i387.c
+++ b/xen/arch/x86/i387.c
@@ -303,12 +303,8 @@ void save_fpu_enable(void)
/* Initialize FPU's context save area */
int vcpu_init_fpu(struct vcpu *v)
{
- int rc = 0;
+ int rc;
- /* Idle domain doesn't have FPU state allocated */
- if ( is_idle_vcpu(v) )
- goto done;
-
if ( (rc = xstate_alloc_save_area(v)) != 0 )
return rc;
@@ -318,13 +314,9 @@ int vcpu_init_fpu(struct vcpu *v)
{
v->arch.fpu_ctxt = _xzalloc(sizeof(v->arch.xsave_area->fpu_sse), 16);
if ( !v->arch.fpu_ctxt )
- {
rc = -ENOMEM;
- goto done;
- }
}
-done:
return rc;
}
diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
index 01f816b..b8e37b5 100644
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -2371,9 +2371,14 @@ int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val)
* pirq and irq mapping. Where the GSI is greater than 256, we assume
* that dom0 pirq == irq.
*/
- pirq = (irq >= 256) ? irq : rte.vector;
- if ( (pirq < 0) || (pirq >= hardware_domain->nr_pirqs) )
- return -EINVAL;
+ if ( !rte.mask )
+ {
+ pirq = (irq >= 256) ? irq : rte.vector;
+ if ( pirq >= hardware_domain->nr_pirqs )
+ return -EINVAL;
+ }
+ else
+ pirq = -1;
if ( desc->action )
{
@@ -2408,12 +2413,15 @@ int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val)
printk(XENLOG_INFO "allocated vector %02x for irq %d\n", ret, irq);
}
- spin_lock(&hardware_domain->event_lock);
- ret = map_domain_pirq(hardware_domain, pirq, irq,
- MAP_PIRQ_TYPE_GSI, NULL);
- spin_unlock(&hardware_domain->event_lock);
- if ( ret < 0 )
- return ret;
+ if ( pirq >= 0 )
+ {
+ spin_lock(&hardware_domain->event_lock);
+ ret = map_domain_pirq(hardware_domain, pirq, irq,
+ MAP_PIRQ_TYPE_GSI, NULL);
+ spin_unlock(&hardware_domain->event_lock);
+ if ( ret < 0 )
+ return ret;
+ }
spin_lock_irqsave(&ioapic_lock, flags);
/* Set the correct irq-handling type. */
@@ -2546,13 +2554,13 @@ void __init init_ioapic_mappings(void)
clear_page(__va(ioapic_phys));
}
set_fixmap_nocache(idx, ioapic_phys);
- apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
+ apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08Lx (%08lx)\n",
__fix_to_virt(idx), ioapic_phys);
idx++;
if ( bad_ioapic_register(i) )
{
- __set_fixmap(idx, 0, 0);
+ clear_fixmap(idx);
continue;
}
@@ -2614,6 +2622,10 @@ unsigned int arch_hwdom_irqs(domid_t domid)
if ( !domid )
n = min(n, dom0_max_vcpus());
n = min(nr_irqs_gsi + n * NR_DYNAMIC_VECTORS, nr_irqs);
+
+ /* Bounded by the domain pirq eoi bitmap gfn. */
+ n = min_t(unsigned int, n, PAGE_SIZE * BITS_PER_BYTE);
+
printk("Dom%d has maximum %u PIRQs\n", domid, n);
return n;
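The new clamp ties the hardware domain's PIRQ count to the capacity of its PIRQ EOI bitmap, which is a single shared page (d->arch.pirq_eoi_map): one bit per PIRQ, so PAGE_SIZE * BITS_PER_BYTE = 4096 * 8 = 32768 PIRQs at most, matching the ASSERTs added to set_pirq_eoi()/clear_pirq_eoi() in irq.c below. The arithmetic as a self-contained sketch:

#include <assert.h>

#define PAGE_SIZE     4096u
#define BITS_PER_BYTE 8u

static unsigned int clamp_hwdom_pirqs(unsigned int n)
{
    /* One bitmap page => at most one EOI bit per PIRQ fits. */
    unsigned int cap = PAGE_SIZE * BITS_PER_BYTE;

    return n < cap ? n : cap;
}

int main(void)
{
    assert(clamp_hwdom_pirqs(40000) == 32768);
    assert(clamp_hwdom_pirqs(1000) == 1000);
    return 0;
}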
diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index 84738e5..bf2e822 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -153,7 +153,7 @@ int __init bind_irq_vector(int irq, int vector, const cpumask_t *cpu_mask)
/*
* Dynamic irq allocate and deallocation for MSI
*/
-int create_irq(int node)
+int create_irq(nodeid_t node)
{
int irq, ret;
struct irq_desc *desc;
@@ -173,7 +173,7 @@ int create_irq(int node)
{
cpumask_t *mask = NULL;
- if (node != NUMA_NO_NODE && node >= 0)
+ if ( node != NUMA_NO_NODE )
{
mask = &node_to_cpumask(node);
if (cpumask_empty(mask))
@@ -217,9 +217,9 @@ void destroy_irq(unsigned int irq)
}
spin_lock_irqsave(&desc->lock, flags);
- desc->status |= IRQ_DISABLED;
desc->status &= ~IRQ_GUEST;
desc->handler->shutdown(desc);
+ desc->status |= IRQ_DISABLED;
action = desc->action;
desc->action = NULL;
desc->msi_desc = NULL;
@@ -995,8 +995,8 @@ void __init release_irq(unsigned int irq, const void *dev_id)
spin_lock_irqsave(&desc->lock,flags);
action = desc->action;
desc->action = NULL;
- desc->status |= IRQ_DISABLED;
desc->handler->shutdown(desc);
+ desc->status |= IRQ_DISABLED;
spin_unlock_irqrestore(&desc->lock,flags);
/* Wait to make sure it's not being used on another CPU */
@@ -1073,13 +1073,19 @@ bool_t cpu_has_pending_apic_eoi(void)
static inline void set_pirq_eoi(struct domain *d, unsigned int irq)
{
if ( d->arch.pirq_eoi_map )
+ {
+ ASSERT(irq < PAGE_SIZE * BITS_PER_BYTE);
set_bit(irq, d->arch.pirq_eoi_map);
+ }
}
static inline void clear_pirq_eoi(struct domain *d, unsigned int irq)
{
if ( d->arch.pirq_eoi_map )
+ {
+ ASSERT(irq < PAGE_SIZE * BITS_PER_BYTE);
clear_bit(irq, d->arch.pirq_eoi_map);
+ }
}
static void set_eoi_ready(void *data);
@@ -1450,7 +1456,7 @@ void desc_guest_eoi(struct irq_desc *desc, struct pirq *pirq)
cpumask_copy(&cpu_eoi_map, action->cpu_eoi_map);
- if ( cpumask_test_and_clear_cpu(smp_processor_id(), &cpu_eoi_map) )
+ if ( __cpumask_test_and_clear_cpu(smp_processor_id(), &cpu_eoi_map) )
{
__set_eoi_ready(desc);
spin_unlock(&desc->lock);
@@ -1608,12 +1614,13 @@ int pirq_guest_bind(struct vcpu *v, struct pirq *pirq, int will_share)
init_timer(&action->eoi_timer, irq_guest_eoi_timer_fn, desc, 0);
desc->status |= IRQ_GUEST;
- desc->status &= ~IRQ_DISABLED;
- desc->handler->startup(desc);
/* Attempt to bind the interrupt target to the correct CPU. */
if ( !opt_noirqbalance && (desc->handler->set_affinity != NULL) )
desc->handler->set_affinity(desc, cpumask_of(v->processor));
+
+ desc->status &= ~IRQ_DISABLED;
+ desc->handler->startup(desc);
}
else if ( !will_share || !action->shareable )
{
@@ -1725,8 +1732,8 @@ static irq_guest_action_t *__pirq_guest_unbind(
BUG_ON(action->in_flight != 0);
/* Disabling IRQ before releasing the desc_lock avoids an IRQ storm. */
- desc->status |= IRQ_DISABLED;
desc->handler->disable(desc);
+ desc->status |= IRQ_DISABLED;
/*
* Mark any remaining pending EOIs as ready to flush.
@@ -1899,7 +1906,7 @@ int map_domain_pirq(
if ( !irq_access_permitted(current->domain, irq))
return -EPERM;
- if ( pirq < 0 || pirq >= d->nr_pirqs || irq < 0 || irq >= nr_irqs )
+ if ( pirq < 0 || pirq >= d->nr_pirqs || irq <= 0 || irq >= nr_irqs )
{
dprintk(XENLOG_G_ERR, "dom%d: invalid pirq %d or irq %d\n",
d->domain_id, pirq, irq);
@@ -1912,8 +1919,9 @@ int map_domain_pirq(
if ( (old_irq > 0 && (old_irq != irq) ) ||
(old_pirq && (old_pirq != pirq)) )
{
- dprintk(XENLOG_G_WARNING, "dom%d: pirq %d or irq %d already mapped\n",
- d->domain_id, pirq, irq);
+ dprintk(XENLOG_G_WARNING,
+ "dom%d: pirq %d or irq %d already mapped (%d,%d)\n",
+ d->domain_id, pirq, irq, old_pirq, old_irq);
return 0;
}
@@ -2495,6 +2503,25 @@ int unmap_domain_pirq_emuirq(struct domain *d, int pirq)
return ret;
}
+void arch_evtchn_bind_pirq(struct domain *d, int pirq)
+{
+ int irq = domain_pirq_to_irq(d, pirq);
+ struct irq_desc *desc;
+ unsigned long flags;
+
+ if ( irq <= 0 )
+ return;
+
+ if ( is_hvm_domain(d) )
+ map_domain_emuirq_pirq(d, pirq, IRQ_PT);
+
+ desc = irq_to_desc(irq);
+ spin_lock_irqsave(&desc->lock, flags);
+ if ( desc->msi_desc )
+ guest_mask_msi_irq(desc, 0);
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
bool_t hvm_domain_use_pirq(const struct domain *d, const struct pirq *pirq)
{
return is_hvm_domain(d) && pirq &&
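The switch to __cpumask_test_and_clear_cpu() above is safe because cpu_eoi_map is a local copy made by cpumask_copy() immediately beforehand: no other CPU can touch it, so the lock-prefixed atomic is pure overhead. The distinction, sketched on a plain bitmap rather than Xen's cpumask_t:

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Non-atomic variant: only valid on data private to this CPU. */
static inline int test_and_clear_bit_local(unsigned int nr,
                                           unsigned long *addr)
{
    unsigned long mask = 1UL << (nr % BITS_PER_LONG);
    unsigned long *w = addr + nr / BITS_PER_LONG;
    int old = (*w & mask) != 0;

    *w &= ~mask; /* plain read-modify-write, no lock prefix */
    return old;
}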
diff --git a/xen/arch/x86/microcode.c b/xen/arch/x86/microcode.c
index 091d5d1..c20bde6 100644
--- a/xen/arch/x86/microcode.c
+++ b/xen/arch/x86/microcode.c
@@ -195,7 +195,7 @@ struct microcode_info {
char buffer[1];
};
-static void __microcode_fini_cpu(int cpu)
+static void __microcode_fini_cpu(unsigned int cpu)
{
struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
@@ -203,14 +203,14 @@ static void __microcode_fini_cpu(int cpu)
memset(uci, 0, sizeof(*uci));
}
-static void microcode_fini_cpu(int cpu)
+static void microcode_fini_cpu(unsigned int cpu)
{
spin_lock(&microcode_mutex);
__microcode_fini_cpu(cpu);
spin_unlock(&microcode_mutex);
}
-int microcode_resume_cpu(int cpu)
+int microcode_resume_cpu(unsigned int cpu)
{
int err;
struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
diff --git a/xen/arch/x86/microcode_amd.c b/xen/arch/x86/microcode_amd.c
index f79b397..a61c926 100644
--- a/xen/arch/x86/microcode_amd.c
+++ b/xen/arch/x86/microcode_amd.c
@@ -79,7 +79,7 @@ struct mpbhdr {
static DEFINE_SPINLOCK(microcode_update_lock);
/* See comment in start_update() for cases when this routine fails */
-static int collect_cpu_info(int cpu, struct cpu_signature *csig)
+static int collect_cpu_info(unsigned int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data[cpu];
@@ -149,7 +149,8 @@ static bool_t find_equiv_cpu_id(const struct equiv_cpu_entry *equiv_cpu_table,
return 0;
}
-static bool_t microcode_fits(const struct microcode_amd *mc_amd, int cpu)
+static bool_t microcode_fits(const struct microcode_amd *mc_amd,
+ unsigned int cpu)
{
struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
const struct microcode_header_amd *mc_header = mc_amd->mpb;
@@ -186,7 +187,7 @@ static bool_t microcode_fits(const struct microcode_amd *mc_amd, int cpu)
return 1;
}
-static int apply_microcode(int cpu)
+static int apply_microcode(unsigned int cpu)
{
unsigned long flags;
struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
@@ -347,7 +348,45 @@ static int container_fast_forward(const void *data, size_t size_left, size_t *of
return 0;
}
-static int cpu_request_microcode(int cpu, const void *buf, size_t bufsize)
+/*
+ * The 'final_levels' of patch ids have been obtained empirically.
+ * Refer to bug https://bugzilla.suse.com/show_bug.cgi?id=913996
+ * for details of the issue. The short version is that people
+ * using certain Fam10h systems noticed system hang issues when
+ * trying to update microcode levels beyond the patch IDs below.
+ * From internal discussions, we gathered that OS/hypervisor
+ * cannot reliably perform microcode updates beyond these levels
+ * due to hardware issues. Therefore, we need to abort the microcode
+ * update process if we hit any of these levels.
+ */
+static const unsigned int final_levels[] = {
+ 0x01000098,
+ 0x0100009f,
+ 0x010000af
+};
+
+static bool_t check_final_patch_levels(unsigned int cpu)
+{
+ /*
+ * Check the current patch levels on the cpu. If they are equal to
+ * any of the 'final_levels', then we should not update the microcode
+ * patch on the cpu as system will hang otherwise.
+ */
+ struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
+ unsigned int i;
+
+ if ( boot_cpu_data.x86 != 0x10 )
+ return 0;
+
+ for ( i = 0; i < ARRAY_SIZE(final_levels); i++ )
+ if ( uci->cpu_sig.rev == final_levels[i] )
+ return 1;
+
+ return 0;
+}
+
+static int cpu_request_microcode(unsigned int cpu, const void *buf,
+ size_t bufsize)
{
struct microcode_amd *mc_amd, *mc_old;
size_t offset = 0;
@@ -369,6 +408,14 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t bufsize)
goto out;
}
+ if ( check_final_patch_levels(cpu) )
+ {
+ printk(XENLOG_INFO
+ "microcode: Cannot update microcode patch on the cpu as we hit a final level\n");
+ error = -EPERM;
+ goto out;
+ }
+
mc_amd = xmalloc(struct microcode_amd);
if ( !mc_amd )
{
@@ -511,7 +558,7 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t bufsize)
return error;
}
-static int microcode_resume_match(int cpu, const void *mc)
+static int microcode_resume_match(unsigned int cpu, const void *mc)
{
struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
struct microcode_amd *mc_amd = uci->mc.mc_amd;
diff --git a/xen/arch/x86/microcode_intel.c b/xen/arch/x86/microcode_intel.c
index b54cd71..0a5f403 100644
--- a/xen/arch/x86/microcode_intel.c
+++ b/xen/arch/x86/microcode_intel.c
@@ -90,7 +90,7 @@ struct extended_sigtable {
/* serialize access to the physical write to MSR 0x79 */
static DEFINE_SPINLOCK(microcode_update_lock);
-static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+static int collect_cpu_info(unsigned int cpu_num, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data[cpu_num];
uint64_t msr_content;
@@ -129,7 +129,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
}
static inline int microcode_update_match(
- int cpu_num, const struct microcode_header_intel *mc_header,
+ unsigned int cpu_num, const struct microcode_header_intel *mc_header,
int sig, int pf)
{
struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu_num);
@@ -232,7 +232,7 @@ static int microcode_sanity_check(void *mc)
* return 1 - found update
* return < 0 - error
*/
-static int get_matching_microcode(const void *mc, int cpu)
+static int get_matching_microcode(const void *mc, unsigned int cpu)
{
struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
const struct microcode_header_intel *mc_header = mc;
@@ -277,12 +277,12 @@ static int get_matching_microcode(const void *mc, int cpu)
return 1;
}
-static int apply_microcode(int cpu)
+static int apply_microcode(unsigned int cpu)
{
unsigned long flags;
uint64_t msr_content;
unsigned int val[2];
- int cpu_num = raw_smp_processor_id();
+ unsigned int cpu_num = raw_smp_processor_id();
struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu_num);
/* We should bind the task to the CPU */
@@ -351,7 +351,8 @@ static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
return offset + total_size;
}
-static int cpu_request_microcode(int cpu, const void *buf, size_t size)
+static int cpu_request_microcode(unsigned int cpu, const void *buf,
+ size_t size)
{
long offset = 0;
int error = 0;
@@ -391,7 +392,7 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t size)
return error;
}
-static int microcode_resume_match(int cpu, const void *mc)
+static int microcode_resume_match(unsigned int cpu, const void *mc)
{
return get_matching_microcode(mc, cpu);
}
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index d4965da..202ff76 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -15,8 +15,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
@@ -126,8 +125,7 @@
#include <asm/pci.h>
/* Mapping of the fixmap space needed early. */
-l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
- l1_fixmap[L1_PAGETABLE_ENTRIES];
+l1_pgentry_t __section(".bss.page_aligned") l1_fixmap[L1_PAGETABLE_ENTRIES];
#define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)
@@ -164,9 +162,8 @@ static uint32_t base_disallow_mask;
#define L1_DISALLOW_MASK ((base_disallow_mask | _PAGE_GNTTAB) & ~_PAGE_GLOBAL)
#define L2_DISALLOW_MASK (base_disallow_mask & ~_PAGE_PSE)
-#define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \
- base_disallow_mask : \
- 0xFFFFF198U)
+#define l3_disallow_mask(d) (!is_pv_32bit_domain(d) ? \
+ base_disallow_mask : 0xFFFFF198U)
#define L4_DISALLOW_MASK (base_disallow_mask)
@@ -275,7 +272,7 @@ void __init arch_init_memory(void)
* Hidden PCI devices will also be associated with this domain
* (but be [partly] controlled by Dom0 nevertheless).
*/
- dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
+ dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0, NULL);
BUG_ON(IS_ERR(dom_xen));
INIT_LIST_HEAD(&dom_xen->arch.pdev_list);
@@ -284,14 +281,14 @@ void __init arch_init_memory(void)
* This domain owns I/O pages that are within the range of the page_info
* array. Mappings occur at the priv of the caller.
*/
- dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
+ dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0, NULL);
BUG_ON(IS_ERR(dom_io));
/*
* Initialise our COW domain.
* This domain owns sharable pages.
*/
- dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0);
+ dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0, NULL);
BUG_ON(IS_ERR(dom_cow));
/* First 1MB of RAM is historically marked as I/O. */
@@ -372,7 +369,7 @@ void __init arch_init_memory(void)
for ( i = 0; i < l3_table_offset(split_va); ++i )
l3tab[i] = l3idle[i];
- for ( ; i <= L3_PAGETABLE_ENTRIES; ++i )
+ for ( ; i < L3_PAGETABLE_ENTRIES; ++i )
l3tab[i] = l3e_empty();
split_l4e = l4e_from_pfn(virt_to_mfn(l3tab),
__PAGE_HYPERVISOR);
@@ -545,21 +542,17 @@ static void invalidate_shadow_ldt(struct vcpu *v, int flush)
static int alloc_segdesc_page(struct page_info *page)
{
- struct desc_struct *descs;
- int i;
-
- descs = __map_domain_page(page);
+ const struct domain *owner = page_get_owner(page);
+ struct desc_struct *descs = __map_domain_page(page);
+ unsigned i;
for ( i = 0; i < 512; i++ )
- if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) )
- goto fail;
+ if ( unlikely(!check_descriptor(owner, &descs[i])) )
+ break;
unmap_domain_page(descs);
- return 0;
- fail:
- unmap_domain_page(descs);
- return -EINVAL;
+ return i == 512 ? 0 : -EINVAL;
}
@@ -740,6 +733,46 @@ static int update_xen_mappings(unsigned long mfn, unsigned long cacheattr)
return err;
}
+#ifndef NDEBUG
+struct mmio_emul_range_ctxt {
+ const struct domain *d;
+ unsigned long mfn;
+};
+
+static int print_mmio_emul_range(unsigned long s, unsigned long e, void *arg)
+{
+ const struct mmio_emul_range_ctxt *ctxt = arg;
+
+ if ( ctxt->mfn > e )
+ return 0;
+
+ if ( ctxt->mfn >= s )
+ {
+ static DEFINE_SPINLOCK(last_lock);
+ static const struct domain *last_d;
+ static unsigned long last_s = ~0UL, last_e;
+ bool_t print = 0;
+
+ spin_lock(&last_lock);
+ if ( last_d != ctxt->d || last_s != s || last_e != e )
+ {
+ last_d = ctxt->d;
+ last_s = s;
+ last_e = e;
+ print = 1;
+ }
+ spin_unlock(&last_lock);
+
+ if ( print )
+ printk(XENLOG_G_INFO
+ "d%d: Forcing write emulation on MFNs %lx-%lx\n",
+ ctxt->d->domain_id, s, e);
+ }
+
+ return 1;
+}
+#endif
+
int
get_page_from_l1e(
l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
@@ -763,6 +796,11 @@ get_page_from_l1e(
if ( !mfn_valid(mfn) ||
(real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
{
+#ifndef NDEBUG
+ const unsigned long *ro_map;
+ unsigned int seg, bdf;
+#endif
+
/* Only needed the reference to confirm dom_io ownership. */
if ( mfn_valid(mfn) )
put_page(page);
@@ -798,9 +836,20 @@ get_page_from_l1e(
if ( !(l1f & _PAGE_RW) ||
!rangeset_contains_singleton(mmio_ro_ranges, mfn) )
return 0;
- dprintk(XENLOG_G_WARNING,
- "d%d: Forcing read-only access to MFN %lx\n",
- l1e_owner->domain_id, mfn);
+#ifndef NDEBUG
+ if ( !pci_mmcfg_decode(mfn, &seg, &bdf) ||
+ ((ro_map = pci_get_ro_map(seg)) != NULL &&
+ test_bit(bdf, ro_map)) )
+ printk(XENLOG_G_WARNING
+ "d%d: Forcing read-only access to MFN %lx\n",
+ l1e_owner->domain_id, mfn);
+ else
+ rangeset_report_ranges(mmio_ro_ranges, 0, ~0UL,
+ print_mmio_emul_range,
+ &(struct mmio_emul_range_ctxt){
+ .d = l1e_owner,
+ .mfn = mfn });
+#endif
return 1;
}
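print_mmio_emul_range() follows the rangeset_report_ranges() callback convention used at its call site above (return 0 to keep walking, nonzero to stop), and that call site passes per-call context through a C99 compound literal instead of a static or heap-allocated struct. The idiom stripped to its essentials, with illustrative names:

#include <stdio.h>

struct walk_ctxt { unsigned long mfn; };

/* Return 0 to continue the range walk, nonzero to stop it. */
static int report_range(unsigned long s, unsigned long e, void *arg)
{
    const struct walk_ctxt *c = arg;

    if ( c->mfn > e )
        return 0;  /* range entirely below the mfn; keep going */
    if ( c->mfn >= s )
        printf("mfn %#lx falls in [%#lx,%#lx]\n", c->mfn, s, e);
    return 1;      /* at or past the mfn; nothing more to do */
}

/* Call site shape, with the context built inline:
 *   walk_ranges(report_range, &(struct walk_ctxt){ .mfn = mfn });
 */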
@@ -990,7 +1039,7 @@ get_page_from_l4e(
#define adjust_guest_l1e(pl1e, d) \
do { \
if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \
- likely(!is_pv_32on64_domain(d)) ) \
+ likely(!is_pv_32bit_domain(d)) ) \
{ \
/* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \
if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \
@@ -1007,14 +1056,14 @@ get_page_from_l4e(
#define adjust_guest_l2e(pl2e, d) \
do { \
if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \
- likely(!is_pv_32on64_domain(d)) ) \
+ likely(!is_pv_32bit_domain(d)) ) \
l2e_add_flags((pl2e), _PAGE_USER); \
} while ( 0 )
#define adjust_guest_l3e(pl3e, d) \
do { \
if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
- l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? \
+ l3e_add_flags((pl3e), likely(!is_pv_32bit_domain(d)) ? \
_PAGE_USER : \
_PAGE_USER|_PAGE_RW); \
} while ( 0 )
@@ -1022,13 +1071,13 @@ get_page_from_l4e(
#define adjust_guest_l4e(pl4e, d) \
do { \
if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
- likely(!is_pv_32on64_domain(d)) ) \
+ likely(!is_pv_32bit_domain(d)) ) \
l4e_add_flags((pl4e), _PAGE_USER); \
} while ( 0 )
#define unadjust_guest_l3e(pl3e, d) \
do { \
- if ( unlikely(is_pv_32on64_domain(d)) && \
+ if ( unlikely(is_pv_32bit_domain(d)) && \
likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \
l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \
} while ( 0 )
@@ -1180,7 +1229,7 @@ static int alloc_l1_table(struct page_info *page)
unsigned int i;
int ret = 0;
- pl1e = map_domain_page(pfn);
+ pl1e = map_domain_page(_mfn(pfn));
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
{
@@ -1261,7 +1310,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type,
unsigned int i;
int rc = 0;
- pl2e = map_domain_page(pfn);
+ pl2e = map_domain_page(_mfn(pfn));
for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
{
@@ -1310,7 +1359,7 @@ static int alloc_l3_table(struct page_info *page)
unsigned int i;
int rc = 0, partial = page->partial_pte;
- pl3e = map_domain_page(pfn);
+ pl3e = map_domain_page(_mfn(pfn));
/*
* PAE guests allocate full pages, but aren't required to initialize
@@ -1319,7 +1368,7 @@ static int alloc_l3_table(struct page_info *page)
* 512 entries must be valid/verified, which is most easily achieved
* by clearing them out.
*/
- if ( is_pv_32on64_domain(d) )
+ if ( is_pv_32bit_domain(d) )
memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
@@ -1380,7 +1429,8 @@ static int alloc_l3_table(struct page_info *page)
return rc > 0 ? 0 : rc;
}
-void init_guest_l4_table(l4_pgentry_t l4tab[], const struct domain *d)
+void init_guest_l4_table(l4_pgentry_t l4tab[], const struct domain *d,
+ bool_t zap_ro_mpt)
{
/* Xen private mappings. */
memcpy(&l4tab[ROOT_PAGETABLE_FIRST_XEN_SLOT],
@@ -1395,13 +1445,32 @@ void init_guest_l4_table(l4_pgentry_t l4tab[], const struct domain *d)
l4e_from_pfn(domain_page_map_to_mfn(l4tab), __PAGE_HYPERVISOR);
l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
l4e_from_page(d->arch.perdomain_l3_pg, __PAGE_HYPERVISOR);
+ if ( zap_ro_mpt || is_pv_32bit_domain(d) || paging_mode_refcounts(d) )
+ l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
+}
+
+void fill_ro_mpt(unsigned long mfn)
+{
+ l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
+
+ l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
+ idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
+ unmap_domain_page(l4tab);
+}
+
+void zap_ro_mpt(unsigned long mfn)
+{
+ l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
+
+ l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
+ unmap_domain_page(l4tab);
}
static int alloc_l4_table(struct page_info *page)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
- l4_pgentry_t *pl4e = map_domain_page(pfn);
+ l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
unsigned int i;
int rc = 0, partial = page->partial_pte;
@@ -1444,7 +1513,7 @@ static int alloc_l4_table(struct page_info *page)
adjust_guest_l4e(pl4e[i], d);
}
- init_guest_l4_table(pl4e, d);
+ init_guest_l4_table(pl4e, d, !VM_ASSIST(d, m2p_strict));
unmap_domain_page(pl4e);
return rc > 0 ? 0 : rc;
@@ -1457,7 +1526,7 @@ static void free_l1_table(struct page_info *page)
l1_pgentry_t *pl1e;
unsigned int i;
- pl1e = map_domain_page(pfn);
+ pl1e = map_domain_page(_mfn(pfn));
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
if ( is_guest_l1_slot(i) )
@@ -1475,7 +1544,7 @@ static int free_l2_table(struct page_info *page, int preemptible)
unsigned int i = page->nr_validated_ptes - 1;
int err = 0;
- pl2e = map_domain_page(pfn);
+ pl2e = map_domain_page(_mfn(pfn));
ASSERT(page->nr_validated_ptes);
do {
@@ -1504,7 +1573,7 @@ static int free_l3_table(struct page_info *page)
int rc = 0, partial = page->partial_pte;
unsigned int i = page->nr_validated_ptes - !partial;
- pl3e = map_domain_page(pfn);
+ pl3e = map_domain_page(_mfn(pfn));
do {
if ( is_guest_l3_slot(i) )
@@ -1539,7 +1608,7 @@ static int free_l4_table(struct page_info *page)
{
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
- l4_pgentry_t *pl4e = map_domain_page(pfn);
+ l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
int rc = 0, partial = page->partial_pte;
unsigned int i = page->nr_validated_ptes - !partial;
@@ -1969,6 +2038,7 @@ void put_page(struct page_info *page)
struct domain *page_get_owner_and_reference(struct page_info *page)
{
unsigned long x, y = page->count_info;
+ struct domain *owner;
do {
x = y;
@@ -1982,7 +2052,10 @@ struct domain *page_get_owner_and_reference(struct page_info *page)
}
while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
- return page_get_owner(page);
+ owner = page_get_owner(page);
+ ASSERT(owner);
+
+ return owner;
}
@@ -1993,15 +2066,16 @@ int get_page(struct page_info *page, struct domain *domain)
if ( likely(owner == domain) )
return 1;
- if ( owner != NULL )
+ if ( !paging_mode_refcounts(domain) && !domain->is_dying )
+ gprintk(XENLOG_INFO,
+ "Error pfn %lx: rd=%d od=%d caf=%08lx taf=%" PRtype_info "\n",
+ page_to_mfn(page), domain->domain_id,
+ owner ? owner->domain_id : DOMID_INVALID,
+ page->count_info - !!owner, page->u.inuse.type_info);
+
+ if ( owner )
put_page(page);
- if ( !paging_mode_refcounts(domain) && !domain->is_dying )
- gdprintk(XENLOG_INFO,
- "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%"
- PRtype_info "\n",
- page_to_mfn(page), domain, owner,
- page->count_info, page->u.inuse.type_info);
return 0;
}
@@ -2118,7 +2192,7 @@ int free_page_type(struct page_info *page, unsigned long type,
ASSERT(VALID_M2P(gmfn));
/* Page sharing not supported for shadowed domains */
if(!SHARED_M2P(gmfn))
- shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
+ shadow_remove_all_shadows(owner, _mfn(gmfn));
}
if ( !(type & PGT_partial) )
@@ -2283,7 +2357,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
&& (page->count_info & PGC_page_table)
&& !((page->shadow_flags & (1u<<29))
&& type == PGT_writable_page) )
- shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
+ shadow_remove_all_shadows(d, _mfn(page_to_mfn(page)));
ASSERT(!(x & PGT_pae_xen_l2));
if ( (x & PGT_type_mask) != type )
@@ -2636,9 +2710,9 @@ int vcpu_destroy_pagetables(struct vcpu *v)
if ( rc )
return rc;
- if ( is_pv_32on64_vcpu(v) )
+ if ( is_pv_32bit_vcpu(v) )
{
- l4tab = map_domain_page(mfn);
+ l4tab = map_domain_page(_mfn(mfn));
mfn = l4e_get_pfn(*l4tab);
}
@@ -2691,10 +2765,10 @@ int new_guest_cr3(unsigned long mfn)
int rc;
unsigned long old_base_mfn;
- if ( is_pv_32on64_domain(d) )
+ if ( is_pv_32bit_domain(d) )
{
unsigned long gt_mfn = pagetable_get_pfn(curr->arch.guest_table);
- l4_pgentry_t *pl4e = map_domain_page(gt_mfn);
+ l4_pgentry_t *pl4e = map_domain_page(_mfn(gt_mfn));
rc = paging_mode_refcounts(d)
? -EINVAL /* Old code was broken, but what should it be? */
@@ -2755,6 +2829,8 @@ int new_guest_cr3(unsigned long mfn)
invalidate_shadow_ldt(curr, 0);
+ if ( !VM_ASSIST(d, m2p_strict) && !paging_mode_refcounts(d) )
+ fill_ro_mpt(mfn);
curr->arch.guest_table = pagetable_from_pfn(mfn);
update_cr3(curr);
@@ -2771,6 +2847,7 @@ int new_guest_cr3(unsigned long mfn)
{
case -EINTR:
rc = -ERESTART;
+ /* fallthrough */
case -ERESTART:
curr->arch.old_guest_table = page;
break;
@@ -2837,7 +2914,7 @@ static inline int vcpumask_to_pcpumask(
unsigned int vcpu_id, vcpu_bias, offs;
unsigned long vmask;
struct vcpu *v;
- bool_t is_native = !is_pv_32on64_domain(d);
+ bool_t is_native = !is_pv_32bit_domain(d);
cpumask_clear(pmask);
for ( vmask = 0, offs = 0; ; ++offs)
@@ -3111,6 +3188,8 @@ long do_mmuext_op(
op.arg1.mfn);
break;
}
+ if ( VM_ASSIST(d, m2p_strict) && !paging_mode_refcounts(d) )
+ zap_ro_mpt(op.arg1.mfn);
}
curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
@@ -3126,6 +3205,7 @@ long do_mmuext_op(
{
case -EINTR:
rc = -ERESTART;
+ /* fallthrough */
case -ERESTART:
curr->arch.old_guest_table = page;
okay = 0;
@@ -3216,7 +3296,7 @@ long do_mmuext_op(
for_each_online_cpu(cpu)
if ( !cpumask_intersects(&mask,
per_cpu(cpu_sibling_mask, cpu)) )
- cpumask_set_cpu(cpu, &mask);
+ __cpumask_set_cpu(cpu, &mask);
flush_mask(&mask, FLUSH_CACHE);
}
else
@@ -3272,7 +3352,7 @@ long do_mmuext_op(
/* A page is dirtied when it's being cleared. */
paging_mark_dirty(pg_owner, page_to_mfn(page));
- clear_domain_page(page_to_mfn(page));
+ clear_domain_page(_mfn(page_to_mfn(page)));
put_page_and_type(page);
break;
@@ -3306,7 +3386,8 @@ long do_mmuext_op(
/* A page is dirtied when it's being copied to. */
paging_mark_dirty(pg_owner, page_to_mfn(dst_page));
- copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page));
+ copy_domain_page(_mfn(page_to_mfn(dst_page)),
+ _mfn(page_to_mfn(src_page)));
put_page_and_type(dst_page);
put_page(src_page);
@@ -3746,7 +3827,7 @@ static int create_grant_pte_mapping(
}
mfn = page_to_mfn(page);
- va = map_domain_page(mfn);
+ va = map_domain_page(_mfn(mfn));
va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
if ( !page_lock(page) )
@@ -3801,7 +3882,7 @@ static int destroy_grant_pte_mapping(
}
mfn = page_to_mfn(page);
- va = map_domain_page(mfn);
+ va = map_domain_page(_mfn(mfn));
va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
if ( !page_lock(page) )
@@ -4040,9 +4121,8 @@ static int replace_grant_p2m_mapping(
if ( !p2m_is_grant(type) || mfn_x(old_mfn) != frame )
{
put_gfn(d, gfn);
- gdprintk(XENLOG_WARNING,
- "replace_grant_p2m_mapping: old mapping invalid (type %d, mfn %lx, frame %lx)\n",
- type, mfn_x(old_mfn), frame);
+ MEM_LOG("replace_grant_p2m_mapping: old mapping invalid (type %d, mfn %lx, frame %lx)",
+ type, mfn_x(old_mfn), frame);
return GNTST_general_error;
}
guest_physmap_remove_page(d, gfn, frame, PAGE_ORDER_4K);
@@ -4131,9 +4211,11 @@ int replace_grant_host_mapping(
int donate_page(
struct domain *d, struct page_info *page, unsigned int memflags)
{
+ const struct domain *owner = dom_xen;
+
spin_lock(&d->page_alloc_lock);
- if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) )
+ if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != NULL) )
goto fail;
if ( d->is_dying )
@@ -4158,9 +4240,10 @@ int donate_page(
fail:
spin_unlock(&d->page_alloc_lock);
- MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
- (void *)page_to_mfn(page), d, d->domain_id,
- page_get_owner(page), page->count_info, page->u.inuse.type_info);
+ MEM_LOG("Bad donate %lx: ed=%d sd=%d caf=%08lx taf=%" PRtype_info,
+ page_to_mfn(page), d->domain_id,
+ owner ? owner->domain_id : DOMID_INVALID,
+ page->count_info, page->u.inuse.type_info);
return -1;
}
@@ -4169,10 +4252,11 @@ int steal_page(
{
unsigned long x, y;
bool_t drop_dom_ref = 0;
+ const struct domain *owner = dom_xen;
spin_lock(&d->page_alloc_lock);
- if ( is_xen_heap_page(page) || (page_get_owner(page) != d) )
+ if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != d) )
goto fail;
/*
@@ -4207,9 +4291,10 @@ int steal_page(
fail:
spin_unlock(&d->page_alloc_lock);
- MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
- (void *)page_to_mfn(page), d, d->domain_id,
- page_get_owner(page), page->count_info, page->u.inuse.type_info);
+ MEM_LOG("Bad page %lx: ed=%d sd=%d caf=%08lx taf=%" PRtype_info,
+ page_to_mfn(page), d->domain_id,
+ owner ? owner->domain_id : DOMID_INVALID,
+ page->count_info, page->u.inuse.type_info);
return -1;
}
@@ -4358,20 +4443,15 @@ long set_gdt(struct vcpu *v,
l1_pgentry_t *pl1e;
/* NB. There are 512 8-byte entries per GDT page. */
int i, nr_pages = (entries + 511) / 512;
- unsigned long mfn, *pfns;
if ( entries > FIRST_RESERVED_GDT_ENTRY )
return -EINVAL;
- pfns = xmalloc_array(unsigned long, nr_pages);
- if ( !pfns )
- return -ENOMEM;
-
/* Check the pages in the new GDT. */
for ( i = 0; i < nr_pages; i++ )
{
struct page_info *page;
- pfns[i] = frames[i];
+
page = get_page_from_gfn(d, frames[i], NULL, P2M_ALLOC);
if ( !page )
goto fail;
@@ -4380,7 +4460,7 @@ long set_gdt(struct vcpu *v,
put_page(page);
goto fail;
}
- mfn = frames[i] = page_to_mfn(page);
+ frames[i] = page_to_mfn(page);
}
/* Tear down the old GDT. */
@@ -4392,10 +4472,9 @@ long set_gdt(struct vcpu *v,
for ( i = 0; i < nr_pages; i++ )
{
v->arch.pv_vcpu.gdt_frames[i] = frames[i];
- l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
+ l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR_RW));
}
- xfree(pfns);
return 0;
fail:
@@ -4403,7 +4482,6 @@ long set_gdt(struct vcpu *v,
{
put_page_and_type(mfn_to_page(frames[i]));
}
- xfree(pfns);
return -EINVAL;
}
@@ -4475,7 +4553,7 @@ long do_update_descriptor(u64 pa, u64 desc)
paging_mark_dirty(dom, mfn);
/* All is good so make the update. */
- gdt_pent = map_domain_page(mfn);
+ gdt_pent = map_domain_page(_mfn(mfn));
write_atomic((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d);
unmap_domain_page(gdt_pent);
@@ -4569,7 +4647,7 @@ int xenmem_add_to_physmap_one(
mfn = virt_to_mfn(d->shared_info);
break;
case XENMAPSPACE_grant_table:
- spin_lock(&d->grant_table->lock);
+ write_lock(&d->grant_table->lock);
if ( d->grant_table->gt_version == 0 )
d->grant_table->gt_version = 1;
@@ -4591,7 +4669,7 @@ int xenmem_add_to_physmap_one(
mfn = virt_to_mfn(d->grant_table->shared_raw[idx]);
}
- spin_unlock(&d->grant_table->lock);
+ write_unlock(&d->grant_table->lock);
break;
case XENMAPSPACE_gmfn_range:
case XENMAPSPACE_gmfn:
@@ -4691,12 +4769,6 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
return rc;
}
- if ( is_hvm_domain(d) )
- {
- rcu_unlock_domain(d);
- return -EPERM;
- }
-
e820 = xmalloc_array(e820entry_t, fmap.map.nr_entries);
if ( e820 == NULL )
{
@@ -5013,7 +5085,7 @@ static int ptwr_emulated_update(
adjust_guest_l1e(nl1e, d);
/* Checked successfully: do the update (write or cmpxchg). */
- pl1e = map_domain_page(mfn);
+ pl1e = map_domain_page(_mfn(mfn));
pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
if ( do_cmpxchg )
{
@@ -5119,6 +5191,7 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
/* We are looking only for read-only mappings of p.t. pages. */
if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
+ rangeset_contains_singleton(mmio_ro_ranges, l1e_get_pfn(pte)) ||
!get_page_from_pagenr(l1e_get_pfn(pte), d) )
goto bail;
@@ -5139,7 +5212,7 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
ptwr_ctxt.ctxt.regs = regs;
ptwr_ctxt.ctxt.force_writeback = 0;
ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
- is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
+ is_pv_32bit_domain(d) ? 32 : BITS_PER_LONG;
ptwr_ctxt.ctxt.swint_emulate = x86_swint_emulate_none;
ptwr_ctxt.cr2 = addr;
ptwr_ctxt.pte = pte;
@@ -5166,6 +5239,7 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
struct mmio_ro_emulate_ctxt {
struct x86_emulate_ctxt ctxt;
unsigned long cr2;
+ unsigned int seg, bdf;
};
static int mmio_ro_emulated_read(
@@ -5205,14 +5279,51 @@ static const struct x86_emulate_ops mmio_ro_emulate_ops = {
.write = mmio_ro_emulated_write,
};
+static int mmio_intercept_write(
+ enum x86_segment seg,
+ unsigned long offset,
+ void *p_data,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct mmio_ro_emulate_ctxt *mmio_ctxt =
+ container_of(ctxt, struct mmio_ro_emulate_ctxt, ctxt);
+
+ /*
+ * Only allow naturally-aligned stores no wider than 4 bytes to the
+ * original %cr2 address.
+ */
+ if ( ((bytes | offset) & (bytes - 1)) || bytes > 4 ||
+ offset != mmio_ctxt->cr2 )
+ {
+ MEM_LOG("mmio_intercept: bad write (cr2=%lx, addr=%lx, bytes=%u)",
+ mmio_ctxt->cr2, offset, bytes);
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ offset &= 0xfff;
+ pci_conf_write_intercept(mmio_ctxt->seg, mmio_ctxt->bdf, offset, bytes,
+ p_data);
+ pci_mmcfg_write(mmio_ctxt->seg, PCI_BUS(mmio_ctxt->bdf),
+ PCI_DEVFN2(mmio_ctxt->bdf), offset, bytes,
+ *(uint32_t *)p_data);
+
+ return X86EMUL_OKAY;
+}
+
+static const struct x86_emulate_ops mmio_intercept_ops = {
+ .read = mmio_ro_emulated_read,
+ .insn_fetch = ptwr_emulated_read,
+ .write = mmio_intercept_write,
+};
+
/* Check if guest is trying to modify a r/o MMIO page. */
int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr,
struct cpu_user_regs *regs)
{
- l1_pgentry_t pte;
- unsigned long mfn;
- unsigned int addr_size = is_pv_32on64_domain(v->domain) ?
- 32 : BITS_PER_LONG;
+ l1_pgentry_t pte;
+ unsigned long mfn;
+ unsigned int addr_size = is_pv_32bit_vcpu(v) ? 32 : BITS_PER_LONG;
struct mmio_ro_emulate_ctxt mmio_ro_ctxt = {
.ctxt.regs = regs,
.ctxt.addr_size = addr_size,
@@ -5220,6 +5331,7 @@ int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr,
.ctxt.swint_emulate = x86_swint_emulate_none,
.cr2 = addr
};
+ const unsigned long *ro_map;
int rc;
/* Attempt to read the PTE that maps the VA being accessed. */
@@ -5244,7 +5356,12 @@ int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr,
if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
return 0;
- rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_ro_emulate_ops);
+ if ( pci_mmcfg_decode(mfn, &mmio_ro_ctxt.seg, &mmio_ro_ctxt.bdf) &&
+ ((ro_map = pci_get_ro_map(mmio_ro_ctxt.seg)) == NULL ||
+ !test_bit(mmio_ro_ctxt.bdf, ro_map)) )
+ rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_intercept_ops);
+ else
+ rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_ro_emulate_ops);
return rc != X86EMUL_UNHANDLEABLE ? EXCRET_fault_fixed : 0;
}
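The write filter in mmio_intercept_write() leans on a standard bit trick: for a power-of-two size, offset & (bytes - 1) is zero exactly when the access is naturally aligned, and bytes & (bytes - 1) is zero exactly when bytes is a power of two, so OR-ing the two operands tests both conditions with a single AND. A self-contained sketch:

#include <assert.h>
#include <stdbool.h>

static bool aligned_small_store(unsigned long offset, unsigned int bytes)
{
    /* Power-of-two size, natural alignment, at most 4 bytes wide. */
    return bytes <= 4 && !((bytes | offset) & (bytes - 1));
}

int main(void)
{
    assert(aligned_small_store(0x10, 4));  /* aligned dword */
    assert(!aligned_small_store(0x11, 2)); /* misaligned word */
    assert(!aligned_small_store(0x10, 3)); /* 3: not a power of two */
    return 0;
}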
@@ -5287,7 +5404,10 @@ static l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
spin_lock(&map_pgdir_lock);
if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
{
- l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
+ l4_pgentry_t l4e = l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR);
+
+ l4e_write(pl4e, l4e);
+ efi_update_l4_pgtable(l4_table_offset(v), l4e);
pl3e = NULL;
}
if ( locking )
@@ -5699,6 +5819,12 @@ int map_pages_to_xen(
return 0;
}
+int populate_pt_range(unsigned long virt, unsigned long mfn,
+ unsigned long nr_mfns)
+{
+ return map_pages_to_xen(virt, mfn, nr_mfns, MAP_SMALL_PAGES);
+}
+
void destroy_xen_mappings(unsigned long s, unsigned long e)
{
bool_t locking = system_state > SYS_STATE_boot;
@@ -5854,10 +5980,10 @@ void *__init arch_vmap_virt_end(void)
void __iomem *ioremap(paddr_t pa, size_t len)
{
- unsigned long pfn = PFN_DOWN(pa);
+ mfn_t mfn = _mfn(PFN_DOWN(pa));
void *va;
- WARN_ON(page_is_ram_type(pfn, RAM_TYPE_CONVENTIONAL));
+ WARN_ON(page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL));
/* The low first Mb is always mapped. */
if ( !((pa + len - 1) >> 20) )
@@ -5867,7 +5993,7 @@ void __iomem *ioremap(paddr_t pa, size_t len)
unsigned int offs = pa & (PAGE_SIZE - 1);
unsigned int nr = PFN_UP(offs + len);
- va = __vmap(&pfn, nr, 1, 1, PAGE_HYPERVISOR_NOCACHE) + offs;
+ va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_NOCACHE) + offs;
}
return (void __force __iomem *)va;
@@ -5881,7 +6007,6 @@ int create_perdomain_mapping(struct domain *d, unsigned long va,
l3_pgentry_t *l3tab;
l2_pgentry_t *l2tab;
l1_pgentry_t *l1tab;
- unsigned int memf = MEMF_node(domain_to_node(d));
int rc = 0;
ASSERT(va >= PERDOMAIN_VIRT_START &&
@@ -5889,7 +6014,7 @@ int create_perdomain_mapping(struct domain *d, unsigned long va,
if ( !d->arch.perdomain_l3_pg )
{
- pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
+ pg = alloc_domheap_page(d, MEMF_no_owner);
if ( !pg )
return -ENOMEM;
l3tab = __map_domain_page(pg);
@@ -5910,7 +6035,7 @@ int create_perdomain_mapping(struct domain *d, unsigned long va,
if ( !(l3e_get_flags(l3tab[l3_table_offset(va)]) & _PAGE_PRESENT) )
{
- pg = alloc_domheap_page(NULL, memf);
+ pg = alloc_domheap_page(d, MEMF_no_owner);
if ( !pg )
{
unmap_domain_page(l3tab);
@@ -5921,7 +6046,7 @@ int create_perdomain_mapping(struct domain *d, unsigned long va,
l3tab[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR);
}
else
- l2tab = map_domain_page(l3e_get_pfn(l3tab[l3_table_offset(va)]));
+ l2tab = map_domain_page(_mfn(l3e_get_pfn(l3tab[l3_table_offset(va)])));
unmap_domain_page(l3tab);
@@ -5939,7 +6064,7 @@ int create_perdomain_mapping(struct domain *d, unsigned long va,
{
if ( pl1tab && !IS_NIL(pl1tab) )
{
- l1tab = alloc_xenheap_pages(0, memf);
+ l1tab = alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
if ( !l1tab )
{
rc = -ENOMEM;
@@ -5951,7 +6076,7 @@ int create_perdomain_mapping(struct domain *d, unsigned long va,
}
else
{
- pg = alloc_domheap_page(NULL, memf);
+ pg = alloc_domheap_page(d, MEMF_no_owner);
if ( !pg )
{
rc = -ENOMEM;
@@ -5963,19 +6088,19 @@ int create_perdomain_mapping(struct domain *d, unsigned long va,
*pl2e = l2e_from_page(pg, __PAGE_HYPERVISOR);
}
else if ( !l1tab )
- l1tab = map_domain_page(l2e_get_pfn(*pl2e));
+ l1tab = map_domain_page(_mfn(l2e_get_pfn(*pl2e)));
if ( ppg &&
!(l1e_get_flags(l1tab[l1_table_offset(va)]) & _PAGE_PRESENT) )
{
- pg = alloc_domheap_page(NULL, memf);
+ pg = alloc_domheap_page(d, MEMF_no_owner);
if ( pg )
{
- clear_domain_page(page_to_mfn(pg));
+ clear_domain_page(_mfn(page_to_mfn(pg)));
if ( !IS_NIL(ppg) )
*ppg++ = pg;
l1tab[l1_table_offset(va)] =
- l1e_from_page(pg, __PAGE_HYPERVISOR | _PAGE_AVAIL0);
+ l1e_from_page(pg, __PAGE_HYPERVISOR_RW | _PAGE_AVAIL0);
l2e_add_flags(*pl2e, _PAGE_AVAIL0);
}
else
@@ -6014,7 +6139,7 @@ void destroy_perdomain_mapping(struct domain *d, unsigned long va,
if ( l3e_get_flags(*pl3e) & _PAGE_PRESENT )
{
- const l2_pgentry_t *l2tab = map_domain_page(l3e_get_pfn(*pl3e));
+ const l2_pgentry_t *l2tab = map_domain_page(_mfn(l3e_get_pfn(*pl3e)));
const l2_pgentry_t *pl2e = l2tab + l2_table_offset(va);
unsigned int i = l1_table_offset(va);
@@ -6022,7 +6147,7 @@ void destroy_perdomain_mapping(struct domain *d, unsigned long va,
{
if ( l2e_get_flags(*pl2e) & _PAGE_PRESENT )
{
- l1_pgentry_t *l1tab = map_domain_page(l2e_get_pfn(*pl2e));
+ l1_pgentry_t *l1tab = map_domain_page(_mfn(l2e_get_pfn(*pl2e)));
for ( ; nr && i < L1_PAGETABLE_ENTRIES; --nr, ++i )
{
@@ -6104,7 +6229,7 @@ void memguard_init(void)
(unsigned long)__va(start),
start >> PAGE_SHIFT,
(__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
- __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
+ __PAGE_HYPERVISOR_RW|MAP_SMALL_PAGES);
BUG_ON(start != xen_phys_start);
map_pages_to_xen(
XEN_VIRT_START,
@@ -6117,7 +6242,7 @@ static void __memguard_change_range(void *p, unsigned long l, int guard)
{
unsigned long _p = (unsigned long)p;
unsigned long _l = (unsigned long)l;
- unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
+ unsigned int flags = __PAGE_HYPERVISOR_RW | MAP_SMALL_PAGES;
/* Ensure we are dealing with a page-aligned whole number of pages. */
ASSERT((_p&~PAGE_MASK) == 0);
diff --git a/xen/arch/x86/mm/Makefile b/xen/arch/x86/mm/Makefile
index ed4b1f8..aeccdfc 100644
--- a/xen/arch/x86/mm/Makefile
+++ b/xen/arch/x86/mm/Makefile
@@ -3,6 +3,7 @@ subdir-y += hap
obj-y += paging.o
obj-y += p2m.o p2m-pt.o p2m-ept.o p2m-pod.o
+obj-y += altp2m.o
obj-y += guest_walk_2.o
obj-y += guest_walk_3.o
obj-$(x86_64) += guest_walk_4.o
diff --git a/xen/arch/x86/mm/altp2m.c b/xen/arch/x86/mm/altp2m.c
new file mode 100644
index 0000000..10605c8
--- /dev/null
+++ b/xen/arch/x86/mm/altp2m.c
@@ -0,0 +1,76 @@
+/*
+ * Alternate p2m HVM
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <asm/hvm/support.h>
+#include <asm/hvm/hvm.h>
+#include <asm/p2m.h>
+#include <asm/altp2m.h>
+
+void
+altp2m_vcpu_reset(struct vcpu *v)
+{
+ struct altp2mvcpu *av = &vcpu_altp2m(v);
+
+ av->p2midx = INVALID_ALTP2M;
+ av->veinfo_gfn = _gfn(INVALID_GFN);
+}
+
+void
+altp2m_vcpu_initialise(struct vcpu *v)
+{
+ if ( v != current )
+ vcpu_pause(v);
+
+ altp2m_vcpu_reset(v);
+ vcpu_altp2m(v).p2midx = 0;
+ atomic_inc(&p2m_get_altp2m(v)->active_vcpus);
+
+ altp2m_vcpu_update_p2m(v);
+
+ if ( v != current )
+ vcpu_unpause(v);
+}
+
+void
+altp2m_vcpu_destroy(struct vcpu *v)
+{
+ struct p2m_domain *p2m;
+
+ if ( v != current )
+ vcpu_pause(v);
+
+ if ( (p2m = p2m_get_altp2m(v)) )
+ atomic_dec(&p2m->active_vcpus);
+
+ altp2m_vcpu_reset(v);
+
+ altp2m_vcpu_update_p2m(v);
+ altp2m_vcpu_update_vmfunc_ve(v);
+
+ if ( v != current )
+ vcpu_unpause(v);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
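All three entry points in this new file wrap their state changes in the same pause-unless-current pattern: a remote vcpu's altp2m state may only be mutated while that vcpu is guaranteed not to be running, whereas the current vcpu is trivially safe to update. The pattern distilled into one helper (the helper itself is illustrative; the file above open-codes it in each function):

static void altp2m_with_paused(struct vcpu *v,
                               void (*mutate)(struct vcpu *))
{
    bool_t remote = (v != current);

    if ( remote )
        vcpu_pause(v);   /* v cannot run while we mutate its state */

    mutate(v);

    if ( remote )
        vcpu_unpause(v);
}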
diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c
index 1b26175..773454d 100644
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -18,8 +18,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/types.h>
@@ -99,7 +98,7 @@ void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
q);
if ( p2m_is_paging(*p2mt) )
{
- ASSERT(!p2m_is_nestedp2m(p2m));
+ ASSERT(p2m_is_hostp2m(p2m));
if ( page )
put_page(page);
p2m_mem_paging_populate(p2m->domain, gfn_x(gfn));
@@ -121,7 +120,7 @@ void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
*mfn = _mfn(page_to_mfn(page));
ASSERT(mfn_valid(mfn_x(*mfn)));
- map = map_domain_page(mfn_x(*mfn));
+ map = map_domain_page(*mfn);
return map;
}
@@ -159,7 +158,7 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
mflags = mandatory_flags(v, pfec);
iflags = (_PAGE_NX_BIT | _PAGE_INVALID_BITS);
- if ( is_hvm_vcpu(v) && !(pfec & PFEC_user_mode) )
+ if ( is_hvm_domain(d) && !(pfec & PFEC_user_mode) )
{
struct segment_register seg;
const struct cpu_user_regs *regs = guest_cpu_user_regs();
diff --git a/xen/arch/x86/mm/hap/guest_walk.c b/xen/arch/x86/mm/hap/guest_walk.c
index 25d9792..66f0010 100644
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -15,8 +15,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
@@ -64,7 +63,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
&p2mt, NULL, P2M_ALLOC | P2M_UNSHARE);
if ( p2m_is_paging(p2mt) )
{
- ASSERT(!p2m_is_nestedp2m(p2m));
+ ASSERT(p2m_is_hostp2m(p2m));
pfec[0] = PFEC_page_paged;
if ( top_page )
put_page(top_page);
@@ -87,7 +86,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
/* Map the top-level table and call the tree-walker */
ASSERT(mfn_valid(mfn_x(top_mfn)));
- top_map = map_domain_page(mfn_x(top_mfn));
+ top_map = map_domain_page(top_mfn);
#if GUEST_PAGING_LEVELS == 3
top_map += (cr3 & ~(PAGE_MASK | 31));
#endif
@@ -106,7 +105,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
put_page(page);
if ( p2m_is_paging(p2mt) )
{
- ASSERT(!p2m_is_nestedp2m(p2m));
+ ASSERT(p2m_is_hostp2m(p2m));
pfec[0] = PFEC_page_paged;
p2m_mem_paging_populate(p2m->domain, gfn_x(gfn));
return INVALID_GFN;
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index abf3d7a..e9c0080 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -87,7 +86,7 @@ int hap_track_dirty_vram(struct domain *d,
}
rc = -ENOMEM;
- dirty_bitmap = xzalloc_bytes(size);
+ dirty_bitmap = vzalloc(size);
if ( !dirty_bitmap )
goto out;
@@ -121,7 +120,10 @@ int hap_track_dirty_vram(struct domain *d,
p2m_change_type_range(d, ostart, oend,
p2m_ram_logdirty, p2m_ram_rw);
- /* set l1e entries of range within P2M table to be read-only. */
+ /*
+ * Switch vram to log dirty mode, either by setting l1e entries of
+ * P2M table to be read-only, or via hardware-assisted log-dirty.
+ */
p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
p2m_ram_rw, p2m_ram_logdirty);
@@ -135,6 +137,9 @@ int hap_track_dirty_vram(struct domain *d,
domain_pause(d);
+ /* Flush dirty GFNs potentially cached by hardware. */
+ p2m_flush_hardware_cached_dirty(d);
+
/* get the bitmap */
paging_log_dirty_range(d, begin_pfn, nr, dirty_bitmap);
@@ -168,8 +173,7 @@ int hap_track_dirty_vram(struct domain *d,
p2m_ram_logdirty, p2m_ram_rw);
}
out:
- if ( dirty_bitmap )
- xfree(dirty_bitmap);
+ vfree(dirty_bitmap);
return rc;
}
@@ -191,9 +195,15 @@ static int hap_enable_log_dirty(struct domain *d, bool_t log_global)
d->arch.paging.mode |= PG_log_dirty;
paging_unlock(d);
+ /* Enable hardware-assisted log-dirty if it is supported. */
+ p2m_enable_hardware_log_dirty(d);
+
if ( log_global )
{
- /* set l1e entries of P2M table to be read-only. */
+ /*
+ * Switch to log dirty mode, either by setting l1e entries of P2M table
+ * to be read-only, or via hardware-assisted log-dirty.
+ */
p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty);
flush_tlb_mask(d->domain_dirty_cpumask);
}
@@ -206,14 +216,23 @@ static int hap_disable_log_dirty(struct domain *d)
d->arch.paging.mode &= ~PG_log_dirty;
paging_unlock(d);
- /* set l1e entries of P2M table with normal mode */
+ /* Disable hardware-assisted log-dirty if it is supported. */
+ p2m_disable_hardware_log_dirty(d);
+
+ /*
+ * Switch to normal mode, either by setting l1e entries of P2M table to
+ * normal mode, or via hardware-assisted log-dirty.
+ */
p2m_change_entry_type_global(d, p2m_ram_logdirty, p2m_ram_rw);
return 0;
}
static void hap_clean_dirty_bitmap(struct domain *d)
{
- /* set l1e entries of P2M table to be read-only. */
+ /*
+ * Switch to log-dirty mode, either by setting l1e entries of P2M table to
+ * be read-only, or via hardware-assisted log-dirty.
+ */
p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty);
flush_tlb_mask(d->domain_dirty_cpumask);
}
@@ -332,7 +351,7 @@ hap_set_allocation(struct domain *d, unsigned int pages, int *preempted)
if ( d->arch.paging.hap.total_pages < pages )
{
/* Need to allocate more memory from domheap */
- pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
+ pg = alloc_domheap_page(d, MEMF_no_owner);
if ( pg == NULL )
{
HAP_PRINTK("failed to allocate hap pages.\n");
@@ -375,7 +394,7 @@ static void hap_install_xen_entries_in_l4(struct vcpu *v, mfn_t l4mfn)
struct domain *d = v->domain;
l4_pgentry_t *l4e;
- l4e = hap_map_domain_page(l4mfn);
+ l4e = map_domain_page(l4mfn);
/* Copy the common Xen mappings from the idle domain */
memcpy(&l4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
@@ -391,7 +410,7 @@ static void hap_install_xen_entries_in_l4(struct vcpu *v, mfn_t l4mfn)
l4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
l4e_from_pfn(mfn_x(l4mfn), __PAGE_HYPERVISOR);
- hap_unmap_domain_page(l4e);
+ unmap_domain_page(l4e);
}
static mfn_t hap_make_monitor_table(struct vcpu *v)
@@ -439,18 +458,11 @@ void hap_domain_init(struct domain *d)
int hap_enable(struct domain *d, u32 mode)
{
unsigned int old_pages;
- uint8_t i;
+ unsigned int i;
int rv = 0;
domain_pause(d);
- /* error check */
- if ( (d == current->domain) )
- {
- rv = -EINVAL;
- goto out;
- }
-
old_pages = d->arch.paging.hap.total_pages;
if ( old_pages == 0 )
{
@@ -485,6 +497,28 @@ int hap_enable(struct domain *d, u32 mode)
goto out;
}
+ if ( hvm_altp2m_supported() )
+ {
+ /* Init alternate p2m data */
+ if ( (d->arch.altp2m_eptp = alloc_xenheap_page()) == NULL )
+ {
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ for ( i = 0; i < MAX_EPTP; i++ )
+ d->arch.altp2m_eptp[i] = INVALID_MFN;
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ {
+ rv = p2m_alloc_table(d->arch.altp2m_p2m[i]);
+ if ( rv != 0 )
+ goto out;
+ }
+
+ d->arch.altp2m_active = 0;
+ }
+
/* Now let other users see the new mode */
d->arch.paging.mode = mode | PG_HAP_enable;
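The altp2m bring-up above pairs each allocation with a release in hap_final_teardown() further down, and fills the EPTP array with a sentinel so unused view slots stay detectable. A toy model of that sentinel pattern (MAX_EPTP and INVALID_MFN echo the Xen names; the helpers are illustrative):

#define MAX_EPTP    512
#define INVALID_MFN (~0UL)

static unsigned long altp2m_eptp[MAX_EPTP];

static void altp2m_init_slots(void)
{
    unsigned int i;

    for ( i = 0; i < MAX_EPTP; i++ )
        altp2m_eptp[i] = INVALID_MFN;      /* mark every view slot unused */
}

static int altp2m_slot_active(unsigned int i)
{
    return altp2m_eptp[i] != INVALID_MFN;  /* sentinel doubles as "empty" */
}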
@@ -495,7 +529,21 @@ int hap_enable(struct domain *d, u32 mode)
void hap_final_teardown(struct domain *d)
{
- uint8_t i;
+ unsigned int i;
+
+ if ( hvm_altp2m_supported() )
+ {
+ d->arch.altp2m_active = 0;
+
+ if ( d->arch.altp2m_eptp )
+ {
+ free_xenheap_page(d->arch.altp2m_eptp);
+ d->arch.altp2m_eptp = NULL;
+ }
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ p2m_teardown(d->arch.altp2m_p2m[i]);
+ }
/* Destroy nestedp2m's first */
for (i = 0; i < MAX_NESTEDP2M; i++) {
@@ -503,7 +551,7 @@ void hap_final_teardown(struct domain *d)
}
if ( d->arch.paging.hap.total_pages != 0 )
- hap_teardown(d);
+ hap_teardown(d, NULL);
p2m_teardown(p2m_get_hostp2m(d));
/* Free any memory that the p2m teardown released */
@@ -513,7 +561,7 @@ void hap_final_teardown(struct domain *d)
paging_unlock(d);
}
-void hap_teardown(struct domain *d)
+void hap_teardown(struct domain *d, int *preempted)
{
struct vcpu *v;
mfn_t mfn;
@@ -541,18 +589,11 @@ void hap_teardown(struct domain *d)
if ( d->arch.paging.hap.total_pages != 0 )
{
- HAP_PRINTK("teardown of domain %u starts."
- " pages total = %u, free = %u, p2m=%u\n",
- d->domain_id,
- d->arch.paging.hap.total_pages,
- d->arch.paging.hap.free_pages,
- d->arch.paging.hap.p2m_pages);
- hap_set_allocation(d, 0, NULL);
- HAP_PRINTK("teardown done."
- " pages total = %u, free = %u, p2m=%u\n",
- d->arch.paging.hap.total_pages,
- d->arch.paging.hap.free_pages,
- d->arch.paging.hap.p2m_pages);
+ hap_set_allocation(d, 0, preempted);
+
+ if ( preempted && *preempted )
+ goto out;
+
ASSERT(d->arch.paging.hap.total_pages == 0);
}
@@ -561,6 +602,7 @@ void hap_teardown(struct domain *d)
xfree(d->arch.hvm_domain.dirty_vram);
d->arch.hvm_domain.dirty_vram = NULL;
+out:
paging_unlock(d);
}
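The hap_teardown() signature change above introduces a preemption contract: restartable callers pass a flag which the allocation loop may set, while final teardown passes NULL to force completion. A minimal sketch of that contract (the names echo the patch; the stub body is illustrative):

static void set_allocation(struct domain *d, unsigned int pages,
                           int *preempted)
{
    /* Frees pages in chunks; if preempted is non-NULL and a softirq is
     * pending, sets *preempted and returns early. */
}

static void teardown(struct domain *d, int *preempted)
{
    set_allocation(d, 0, preempted);
    if ( preempted && *preempted )
        return;        /* caller restarts via hypercall continuation */
    /* ... the rest runs only once the allocation is fully drained ... */
}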
diff --git a/xen/arch/x86/mm/hap/nested_ept.c b/xen/arch/x86/mm/hap/nested_ept.c
index cbbc4e9..4b5576d 100644
--- a/xen/arch/x86/mm/hap/nested_ept.c
+++ b/xen/arch/x86/mm/hap/nested_ept.c
@@ -14,12 +14,11 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
#include <xen/event.h>
-#include <public/mem_event.h>
+#include <public/vm_event.h>
#include <asm/domain.h>
#include <asm/page.h>
#include <asm/paging.h>
diff --git a/xen/arch/x86/mm/hap/nested_hap.c b/xen/arch/x86/mm/hap/nested_hap.c
index 9c1ec11..0dbae13 100644
--- a/xen/arch/x86/mm/hap/nested_hap.c
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -15,13 +15,12 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
#include <xen/event.h>
-#include <public/mem_event.h>
+#include <public/vm_event.h>
#include <asm/domain.h>
#include <asm/page.h>
#include <asm/paging.h>
diff --git a/xen/arch/x86/mm/hap/private.h b/xen/arch/x86/mm/hap/private.h
index b5c0b6a..973fbe8 100644
--- a/xen/arch/x86/mm/hap/private.h
+++ b/xen/arch/x86/mm/hap/private.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
#ifndef __HAP_PRIVATE_H__
diff --git a/xen/arch/x86/mm/mem_paging.c b/xen/arch/x86/mm/mem_paging.c
index 65f6a3d..a049e0d 100644
--- a/xen/arch/x86/mm/mem_paging.c
+++ b/xen/arch/x86/mm/mem_paging.c
@@ -16,47 +16,63 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <asm/p2m.h>
-#include <xen/mem_event.h>
+#include <xen/guest_access.h>
+#include <xsm/xsm.h>
-
-int mem_paging_memop(struct domain *d, xen_mem_event_op_t *mec)
+int mem_paging_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_paging_op_t) arg)
{
- if ( unlikely(!d->mem_event->paging.ring_page) )
- return -ENODEV;
+ int rc;
+ xen_mem_paging_op_t mpo;
+ struct domain *d;
+ bool_t copyback = 0;
+
+ if ( copy_from_guest(&mpo, arg, 1) )
+ return -EFAULT;
+
+ rc = rcu_lock_live_remote_domain_by_id(mpo.domain, &d);
+ if ( rc )
+ return rc;
+
+ rc = xsm_mem_paging(XSM_DM_PRIV, d);
+ if ( rc )
+ goto out;
- switch( mec->op )
+ rc = -ENODEV;
+ if ( unlikely(!d->vm_event->paging.ring_page) )
+ goto out;
+
+ switch( mpo.op )
{
case XENMEM_paging_op_nominate:
- {
- unsigned long gfn = mec->gfn;
- return p2m_mem_paging_nominate(d, gfn);
- }
- break;
+ rc = p2m_mem_paging_nominate(d, mpo.gfn);
+ break;
case XENMEM_paging_op_evict:
- {
- unsigned long gfn = mec->gfn;
- return p2m_mem_paging_evict(d, gfn);
- }
- break;
+ rc = p2m_mem_paging_evict(d, mpo.gfn);
+ break;
case XENMEM_paging_op_prep:
- {
- unsigned long gfn = mec->gfn;
- return p2m_mem_paging_prep(d, gfn, mec->buffer);
- }
- break;
+ rc = p2m_mem_paging_prep(d, mpo.gfn, mpo.buffer);
+ if ( !rc )
+ copyback = 1;
+ break;
default:
- return -ENOSYS;
+ rc = -ENOSYS;
break;
}
+
+ if ( copyback && __copy_to_guest(arg, &mpo, 1) )
+ rc = -EFAULT;
+
+out:
+ rcu_unlock_domain(d);
+ return rc;
}
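The rewritten mem_paging_memop() above is the template this import uses for self-contained memops: copy the argument in, take an RCU reference on the target domain, run the XSM check, dispatch, copy back only when an op produced output, and always drop the reference. Reduced to a skeleton (helper names echo the patch; the types are illustrative):

int memop_skeleton(guest_handle_t arg)
{
    op_args_t args;
    struct domain *d;
    bool_t copyback = 0;
    int rc;

    if ( copy_from_guest(&args, arg, 1) )
        return -EFAULT;

    rc = rcu_lock_live_remote_domain_by_id(args.domain, &d);
    if ( rc )
        return rc;

    rc = xsm_check(XSM_DM_PRIV, d);
    if ( rc )
        goto out;

    rc = dispatch_op(d, &args, &copyback);

    if ( !rc && copyback && __copy_to_guest(arg, &args, 1) )
        rc = -EFAULT;

 out:
    rcu_unlock_domain(d);
    return rc;
}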
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 7c0fc7d..a95e105 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -17,8 +17,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/types.h>
@@ -28,7 +27,8 @@
#include <xen/grant_table.h>
#include <xen/sched.h>
#include <xen/rcupdate.h>
-#include <xen/mem_event.h>
+#include <xen/guest_access.h>
+#include <xen/vm_event.h>
#include <asm/page.h>
#include <asm/string.h>
#include <asm/p2m.h>
@@ -559,22 +559,24 @@ int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn,
{
struct vcpu *v = current;
int rc;
- mem_event_request_t req = { .gfn = gfn };
-
- if ( (rc = __mem_event_claim_slot(d,
- &d->mem_event->share, allow_sleep)) < 0 )
+ vm_event_request_t req = {
+ .reason = VM_EVENT_REASON_MEM_SHARING,
+ .vcpu_id = v->vcpu_id,
+ .u.mem_sharing.gfn = gfn,
+ .u.mem_sharing.p2mt = p2m_ram_shared
+ };
+
+ if ( (rc = __vm_event_claim_slot(d,
+ &d->vm_event->share, allow_sleep)) < 0 )
return rc;
if ( v->domain == d )
{
- req.flags = MEM_EVENT_FLAG_VCPU_PAUSED;
- mem_event_vcpu_pause(v);
+ req.flags = VM_EVENT_FLAG_VCPU_PAUSED;
+ vm_event_vcpu_pause(v);
}
- req.p2mt = p2m_ram_shared;
- req.vcpu_id = v->vcpu_id;
-
- mem_event_put_request(d, &d->mem_event->share, &req);
+ vm_event_put_request(d, &d->vm_event->share, &req);
return 0;
}
@@ -589,32 +591,6 @@ unsigned int mem_sharing_get_nr_shared_mfns(void)
return (unsigned int)atomic_read(&nr_shared_mfns);
}
-int mem_sharing_sharing_resume(struct domain *d)
-{
- mem_event_response_t rsp;
-
- /* Get all requests off the ring */
- while ( mem_event_get_response(d, &d->mem_event->share, &rsp) )
- {
- struct vcpu *v;
-
- if ( rsp.flags & MEM_EVENT_FLAG_DUMMY )
- continue;
-
- /* Validate the vcpu_id in the response. */
- if ( (rsp.vcpu_id >= d->max_vcpus) || !d->vcpu[rsp.vcpu_id] )
- continue;
-
- v = d->vcpu[rsp.vcpu_id];
-
- /* Unpause domain/vcpu */
- if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
- mem_event_vcpu_unpause(v);
- }
-
- return 0;
-}
-
/* Functions that change a page's type and ownership */
static int page_make_sharable(struct domain *d,
struct page_info *page,
@@ -1136,7 +1112,7 @@ err_out:
/* A note on the rationale for unshare error handling:
* 1. Unshare can only fail with ENOMEM. Any other error conditions BUG_ON()'s
- * 2. We notify a potential dom0 helper through a mem_event ring. But we
+ * 2. We notify a potential dom0 helper through a vm_event ring. But we
* allow the notification to not go to sleep. If the event ring is full
* of ENOMEM warnings, then it's on the ball.
* 3. We cannot go to sleep until the unshare is resolved, because we might
@@ -1233,8 +1209,8 @@ int __mem_sharing_unshare_page(struct domain *d,
return -ENOMEM;
}
- s = map_domain_page(__page_to_mfn(old_page));
- t = map_domain_page(__page_to_mfn(page));
+ s = map_domain_page(_mfn(__page_to_mfn(old_page)));
+ t = map_domain_page(_mfn(__page_to_mfn(page)));
memcpy(t, s, PAGE_SIZE);
unmap_domain_page(s);
unmap_domain_page(t);
@@ -1283,7 +1259,7 @@ int relinquish_shared_pages(struct domain *d)
if ( atomic_read(&d->shr_pages) == 0 )
break;
- mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL);
+ mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
if ( mfn_valid(mfn) && (t == p2m_ram_shared) )
{
/* Does not fail with ENOMEM given the DESTROY flag */
@@ -1293,7 +1269,7 @@ int relinquish_shared_pages(struct domain *d)
* unshare. Must succeed: we just read the old entry and
* we hold the p2m lock. */
set_rc = p2m->set_entry(p2m, gfn, _mfn(0), PAGE_ORDER_4K,
- p2m_invalid, p2m_access_rwx);
+ p2m_invalid, p2m_access_rwx, -1);
ASSERT(set_rc == 0);
count += 0x10;
}
@@ -1317,39 +1293,62 @@ int relinquish_shared_pages(struct domain *d)
return rc;
}
-int mem_sharing_memop(struct domain *d, xen_mem_sharing_op_t *mec)
+int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
{
- int rc = 0;
+ int rc;
+ xen_mem_sharing_op_t mso;
+ struct domain *d;
+
+ rc = -EFAULT;
+ if ( copy_from_guest(&mso, arg, 1) )
+ return rc;
+
+ if ( mso.op == XENMEM_sharing_op_audit )
+ return mem_sharing_audit();
+
+ rc = rcu_lock_live_remote_domain_by_id(mso.domain, &d);
+ if ( rc )
+ return rc;
+
+ rc = xsm_mem_sharing(XSM_DM_PRIV, d);
+ if ( rc )
+ goto out;
/* Only HAP is supported */
+ rc = -ENODEV;
if ( !hap_enabled(d) || !d->arch.hvm_domain.mem_sharing_enabled )
- return -ENODEV;
+ goto out;
- switch(mec->op)
+ switch ( mso.op )
{
case XENMEM_sharing_op_nominate_gfn:
{
- unsigned long gfn = mec->u.nominate.u.gfn;
+ unsigned long gfn = mso.u.nominate.u.gfn;
shr_handle_t handle;
+
+ rc = -EINVAL;
if ( !mem_sharing_enabled(d) )
- return -EINVAL;
+ goto out;
+
rc = mem_sharing_nominate_page(d, gfn, 0, &handle);
- mec->u.nominate.handle = handle;
+ mso.u.nominate.handle = handle;
}
break;
case XENMEM_sharing_op_nominate_gref:
{
- grant_ref_t gref = mec->u.nominate.u.grant_ref;
+ grant_ref_t gref = mso.u.nominate.u.grant_ref;
unsigned long gfn;
shr_handle_t handle;
+ rc = -EINVAL;
if ( !mem_sharing_enabled(d) )
- return -EINVAL;
+ goto out;
if ( mem_sharing_gref_to_gfn(d, gref, &gfn) < 0 )
- return -EINVAL;
+ goto out;
+
rc = mem_sharing_nominate_page(d, gfn, 3, &handle);
- mec->u.nominate.handle = handle;
+ mso.u.nominate.handle = handle;
}
break;
@@ -1359,57 +1358,61 @@ int mem_sharing_memop(struct domain *d, xen_mem_sharing_op_t *mec)
struct domain *cd;
shr_handle_t sh, ch;
+ rc = -EINVAL;
if ( !mem_sharing_enabled(d) )
- return -EINVAL;
+ goto out;
- rc = rcu_lock_live_remote_domain_by_id(mec->u.share.client_domain,
+ rc = rcu_lock_live_remote_domain_by_id(mso.u.share.client_domain,
&cd);
if ( rc )
- return rc;
+ goto out;
- rc = xsm_mem_sharing_op(XSM_DM_PRIV, d, cd, mec->op);
+ rc = xsm_mem_sharing_op(XSM_DM_PRIV, d, cd, mso.op);
if ( rc )
{
rcu_unlock_domain(cd);
- return rc;
+ goto out;
}
if ( !mem_sharing_enabled(cd) )
{
rcu_unlock_domain(cd);
- return -EINVAL;
+ rc = -EINVAL;
+ goto out;
}
- if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mec->u.share.source_gfn) )
+ if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mso.u.share.source_gfn) )
{
grant_ref_t gref = (grant_ref_t)
(XENMEM_SHARING_OP_FIELD_GET_GREF(
- mec->u.share.source_gfn));
+ mso.u.share.source_gfn));
if ( mem_sharing_gref_to_gfn(d, gref, &sgfn) < 0 )
{
rcu_unlock_domain(cd);
- return -EINVAL;
+ rc = -EINVAL;
+ goto out;
}
} else {
- sgfn = mec->u.share.source_gfn;
+ sgfn = mso.u.share.source_gfn;
}
- if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mec->u.share.client_gfn) )
+ if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mso.u.share.client_gfn) )
{
grant_ref_t gref = (grant_ref_t)
(XENMEM_SHARING_OP_FIELD_GET_GREF(
- mec->u.share.client_gfn));
+ mso.u.share.client_gfn));
if ( mem_sharing_gref_to_gfn(cd, gref, &cgfn) < 0 )
{
rcu_unlock_domain(cd);
- return -EINVAL;
+ rc = -EINVAL;
+ goto out;
}
} else {
- cgfn = mec->u.share.client_gfn;
+ cgfn = mso.u.share.client_gfn;
}
- sh = mec->u.share.source_handle;
- ch = mec->u.share.client_handle;
+ sh = mso.u.share.source_handle;
+ ch = mso.u.share.client_handle;
rc = mem_sharing_share_pages(d, sgfn, sh, cd, cgfn, ch);
@@ -1423,37 +1426,40 @@ int mem_sharing_memop(struct domain *d, xen_mem_sharing_op_t *mec)
struct domain *cd;
shr_handle_t sh;
+ rc = -EINVAL;
if ( !mem_sharing_enabled(d) )
- return -EINVAL;
+ goto out;
- rc = rcu_lock_live_remote_domain_by_id(mec->u.share.client_domain,
+ rc = rcu_lock_live_remote_domain_by_id(mso.u.share.client_domain,
&cd);
if ( rc )
- return rc;
+ goto out;
- rc = xsm_mem_sharing_op(XSM_DM_PRIV, d, cd, mec->op);
+ rc = xsm_mem_sharing_op(XSM_DM_PRIV, d, cd, mso.op);
if ( rc )
{
rcu_unlock_domain(cd);
- return rc;
+ goto out;
}
if ( !mem_sharing_enabled(cd) )
{
rcu_unlock_domain(cd);
- return -EINVAL;
+ rc = -EINVAL;
+ goto out;
}
- if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mec->u.share.source_gfn) )
+ if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mso.u.share.source_gfn) )
{
/* Cannot add a gref to the physmap */
rcu_unlock_domain(cd);
- return -EINVAL;
+ rc = -EINVAL;
+ goto out;
}
- sgfn = mec->u.share.source_gfn;
- sh = mec->u.share.source_handle;
- cgfn = mec->u.share.client_gfn;
+ sgfn = mso.u.share.source_gfn;
+ sh = mso.u.share.source_handle;
+ cgfn = mso.u.share.client_gfn;
rc = mem_sharing_add_to_physmap(d, sgfn, sh, cd, cgfn);
@@ -1461,24 +1467,16 @@ int mem_sharing_memop(struct domain *d, xen_mem_sharing_op_t *mec)
}
break;
- case XENMEM_sharing_op_resume:
- {
- if ( !mem_sharing_enabled(d) )
- return -EINVAL;
- rc = mem_sharing_sharing_resume(d);
- }
- break;
-
case XENMEM_sharing_op_debug_gfn:
{
- unsigned long gfn = mec->u.debug.u.gfn;
+ unsigned long gfn = mso.u.debug.u.gfn;
rc = mem_sharing_debug_gfn(d, gfn);
}
break;
case XENMEM_sharing_op_debug_gref:
{
- grant_ref_t gref = mec->u.debug.u.gref;
+ grant_ref_t gref = mso.u.debug.u.gref;
rc = mem_sharing_debug_gref(d, gref);
}
break;
@@ -1488,6 +1486,11 @@ int mem_sharing_memop(struct domain *d, xen_mem_sharing_op_t *mec)
break;
}
+ if ( !rc && __copy_to_guest(arg, &mso, 1) )
+ rc = -EFAULT;
+
+out:
+ rcu_unlock_domain(d);
return rc;
}
diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
index 769f7bc..76c7217 100644
--- a/xen/arch/x86/mm/mm-locks.h
+++ b/xen/arch/x86/mm/mm-locks.h
@@ -19,8 +19,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _MM_LOCKS_H
@@ -205,11 +204,19 @@ static inline void mm_enforce_order_unlock(int unlock_level,
* *
************************************************************************/
+/* Nested P2M lock (per-domain)
+ *
+ * A per-domain lock that protects the mapping from nested-CR3 to
+ * nested-p2m. In particular it covers:
+ * - the array of nested-p2m tables, and all LRU activity therein; and
+ * - setting the "cr3" field of any p2m table to a non-P2M_BASE_EAADR value.
+ * (i.e. assigning a p2m table to be the shadow of that cr3 */
+
declare_mm_lock(nestedp2m)
#define nestedp2m_lock(d) mm_lock(nestedp2m, &(d)->arch.nested_p2m_lock)
#define nestedp2m_unlock(d) mm_unlock(&(d)->arch.nested_p2m_lock)
-/* P2M lock (per-p2m-table)
+/* P2M lock (per-non-alt-p2m-table)
*
* This protects all queries and updates to the p2m table.
* Queries may be made under the read lock but all modifications
@@ -217,10 +224,52 @@ declare_mm_lock(nestedp2m)
*
* The write lock is recursive as it is common for a code path to look
* up a gfn and later mutate it.
+ *
+ * Note that this lock shares its implementation with the altp2m
+ * lock (not the altp2m list lock), so the implementation
+ * is found there.
+ *
+ * Changes made to the host p2m when in altp2m mode are propagated to the
+ * altp2ms synchronously in ept_set_entry(). At that point, we will hold
+ * the host p2m lock; propagating this change involves grabbing the
+ * altp2m_list lock, and the locks of the individual alternate p2ms. In
+ * order to allow us to maintain locking order discipline, we split the p2m
+ * lock into p2m (for host p2ms) and altp2m (for alternate p2ms), putting
+ * the altp2mlist lock in the middle.
*/
declare_mm_rwlock(p2m);
-#define p2m_lock(p) mm_write_lock(p2m, &(p)->lock);
+
+/* Alternate P2M list lock (per-domain)
+ *
+ * A per-domain lock that protects the list of alternate p2m's.
+ * Any operation that walks the list needs to acquire this lock.
+ * Additionally, before destroying an alternate p2m all VCPU's
+ * in the target domain must be paused.
+ */
+
+declare_mm_lock(altp2mlist)
+#define altp2m_list_lock(d) mm_lock(altp2mlist, &(d)->arch.altp2m_list_lock)
+#define altp2m_list_unlock(d) mm_unlock(&(d)->arch.altp2m_list_lock)
+
+/* P2M lock (per-altp2m-table)
+ *
+ * This protects all queries and updates to the p2m table.
+ * Queries may be made under the read lock but all modifications
+ * need the main (write) lock.
+ *
+ * The write lock is recursive as it is common for a code path to look
+ * up a gfn and later mutate it.
+ */
+
+declare_mm_rwlock(altp2m);
+#define p2m_lock(p) \
+{ \
+ if ( p2m_is_altp2m(p) ) \
+ mm_write_lock(altp2m, &(p)->lock); \
+ else \
+ mm_write_lock(p2m, &(p)->lock); \
+}
#define p2m_unlock(p) mm_write_unlock(&(p)->lock);
#define gfn_lock(p,g,o) p2m_lock(p)
#define gfn_unlock(p,g,o) p2m_unlock(p)
@@ -244,14 +293,6 @@ declare_mm_order_constraint(per_page_sharing)
mm_enforce_order_lock_post_per_page_sharing((l), (r))
#define page_sharing_mm_unlock(l, r) mm_enforce_order_unlock((l), (r))
-/* Nested P2M lock (per-domain)
- *
- * A per-domain lock that protects the mapping from nested-CR3 to
- * nested-p2m. In particular it covers:
- * - the array of nested-p2m tables, and all LRU activity therein; and
- * - setting the "cr3" field of any p2m table to a non-P2M_BASE_EAADR value.
- * (i.e. assigning a p2m table to be the shadow of that cr3 */
-
/* PoD lock (per-p2m-table)
*
* Protects private PoD data structs: entry and cache
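The comments above pin down a three-level ordering: the host p2m lock first, then the altp2m list lock, then individual altp2m locks. A stand-alone toy of that discipline using plain pthreads (the lock names echo the Xen ones; none of this is the real mm-locks API):

#include <pthread.h>

#define MAX_ALTP2M 10

static pthread_mutex_t host_p2m = PTHREAD_MUTEX_INITIALIZER;    /* outermost */
static pthread_mutex_t altp2m_list = PTHREAD_MUTEX_INITIALIZER; /* middle */
static pthread_mutex_t altp2m[MAX_ALTP2M];                      /* innermost */

static void init_locks(void)
{
    int i;

    for ( i = 0; i < MAX_ALTP2M; i++ )
        pthread_mutex_init(&altp2m[i], NULL);
}

/* Propagating a host-p2m change must take the locks strictly in this
 * order; any other order can deadlock against a vcpu switching views. */
static void propagate_change(void)
{
    int i;

    pthread_mutex_lock(&host_p2m);
    pthread_mutex_lock(&altp2m_list);
    for ( i = 0; i < MAX_ALTP2M; i++ )
    {
        pthread_mutex_lock(&altp2m[i]);
        /* ... mirror the new mapping into view i ... */
        pthread_mutex_unlock(&altp2m[i]);
    }
    pthread_mutex_unlock(&altp2m_list);
    pthread_mutex_unlock(&host_p2m);
}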
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 15c6e83..9860c6c 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -26,6 +25,7 @@
#include <asm/p2m.h>
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
+#include <asm/hvm/nestedhvm.h>
#include <xen/iommu.h>
#include <asm/mtrr.h>
#include <asm/hvm/cacheattr.h>
@@ -41,7 +41,8 @@
#define is_epte_superpage(ept_entry) ((ept_entry)->sp)
static inline bool_t is_epte_valid(ept_entry_t *e)
{
- return (e->epte != 0 && e->sa_p2mt != p2m_invalid);
+ /* suppress_ve alone is not considered valid, so mask it off */
+ return ((e->epte & ~(1ul << 63)) != 0 && e->sa_p2mt != p2m_invalid);
}
/* returns : 0 for success, -errno otherwise */
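Because suppress_ve is now set even in otherwise-clear entries, bit 63 must be masked off before the emptiness test above. The check in isolation (a sketch; the bit position is EPT's suppress-#VE bit, the rest is illustrative):

#include <stdbool.h>
#include <stdint.h>

#define SUPPRESS_VE (1ULL << 63)    /* EPT bit 63: suppress #VE */

static bool epte_valid(uint64_t epte, unsigned int sa_p2mt)
{
    const unsigned int p2m_invalid = 0;  /* stand-in for the enum value */

    /* An entry carrying only the suppress-#VE bit is still "empty". */
    return (epte & ~SUPPRESS_VE) != 0 && sa_p2mt != p2m_invalid;
}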
@@ -102,9 +103,20 @@ static int atomic_write_ept_entry(ept_entry_t *entryptr, ept_entry_t new,
return rc;
}
-static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access)
+static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
+ p2m_type_t type, p2m_access_t access)
{
- /* First apply type permissions */
+ /*
+ * First apply type permissions.
+ *
+ * A/D bits are also set manually to avoid the overhead of the MMU having
+ * to set them later. Both A/D bits are safe to update directly, as they
+ * are ignored by the processor if EPT A/D support is not enabled.
+ *
+ * A bit is set for all present p2m types in middle and leaf EPT entries.
+ * D bit is set for all writable types in EPT leaf entry, except for
+ * log-dirty type with PML.
+ */
switch(type)
{
case p2m_invalid:
@@ -118,26 +130,51 @@ static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_acces
break;
case p2m_ram_rw:
entry->r = entry->w = entry->x = 1;
+ entry->a = entry->d = !!cpu_has_vmx_ept_ad;
break;
case p2m_mmio_direct:
entry->r = entry->x = 1;
entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
entry->mfn);
+ entry->a = !!cpu_has_vmx_ept_ad;
+ entry->d = entry->w && cpu_has_vmx_ept_ad;
break;
case p2m_ram_logdirty:
+ entry->r = entry->x = 1;
+ /*
+ * In the case of PML we don't have to write-protect a 4K page; we
+ * only need to clear its D-bit. We still have to write-protect a
+ * superpage, though, so that the resulting EPT violation splits it
+ * into 4K pages.
+ */
+ if ( vmx_domain_pml_enabled(p2m->domain) &&
+ !is_epte_superpage(entry) )
+ entry->w = 1;
+ else
+ entry->w = 0;
+ entry->a = !!cpu_has_vmx_ept_ad;
+ /* For both PML and non-PML cases we clear the D bit anyway. */
+ entry->d = 0;
+ break;
case p2m_ram_ro:
case p2m_ram_shared:
entry->r = entry->x = 1;
entry->w = 0;
+ entry->a = !!cpu_has_vmx_ept_ad;
+ entry->d = 0;
break;
case p2m_grant_map_rw:
case p2m_map_foreign:
entry->r = entry->w = 1;
entry->x = 0;
+ entry->a = entry->d = !!cpu_has_vmx_ept_ad;
break;
case p2m_grant_map_ro:
+ case p2m_mmio_write_dm:
entry->r = 1;
entry->w = entry->x = 0;
+ entry->a = !!cpu_has_vmx_ept_ad;
+ entry->d = 0;
break;
}
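The switch above encodes a per-entry policy for log-dirty pages: with PML, 4K leaf entries stay writable and dirtiness is tracked via a cleared D bit, while superpages remain write-protected so the resulting EPT violation splits them. That policy in isolation (a toy model; the real code additionally conditions the A bit on EPT A/D support):

#include <stdbool.h>
#include <stdio.h>

static void logdirty_bits(bool pml, bool superpage, bool *w, bool *d)
{
    *w = pml && !superpage; /* writable only with PML on a 4K page */
    *d = false;             /* D bit always starts clear for log-dirty */
}

int main(void)
{
    bool w, d;

    logdirty_bits(true, false, &w, &d);  /* PML, 4K:    w=1 d=0 */
    logdirty_bits(true, true, &w, &d);   /* PML, 2M/1G: w=0 d=0 */
    logdirty_bits(false, false, &w, &d); /* no PML:     w=0 d=0 */
    printf("last case: w=%d d=%d\n", w, d);
    return 0;
}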
@@ -183,6 +220,8 @@ static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_acces
static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry)
{
struct page_info *pg;
+ ept_entry_t *table;
+ unsigned int i;
pg = p2m_alloc_ptp(p2m, 0);
if ( pg == NULL )
@@ -193,6 +232,17 @@ static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry)
ept_entry->access = p2m->default_access;
ept_entry->r = ept_entry->w = ept_entry->x = 1;
+ /* Manually set A bit to avoid overhead of MMU having to write it later. */
+ ept_entry->a = !!cpu_has_vmx_ept_ad;
+
+ ept_entry->suppress_ve = 1;
+
+ table = __map_domain_page(pg);
+
+ for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+ table[i].suppress_ve = 1;
+
+ unmap_domain_page(table);
return 1;
}
@@ -207,7 +257,7 @@ static void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int l
if ( level > 1 )
{
- ept_entry_t *epte = map_domain_page(ept_entry->mfn);
+ ept_entry_t *epte = map_domain_page(_mfn(ept_entry->mfn));
for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
ept_free_entry(p2m, epte + i, level - 1);
unmap_domain_page(epte);
@@ -232,7 +282,7 @@ static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
if ( !ept_set_middle_entry(p2m, &new_ept) )
return 0;
- table = map_domain_page(new_ept.mfn);
+ table = map_domain_page(_mfn(new_ept.mfn));
trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);
for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
@@ -243,10 +293,9 @@ static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
epte->sp = (level > 1);
epte->mfn += i * trunk;
epte->snp = (iommu_enabled && iommu_snoop);
- ASSERT(!epte->rsvd1);
- ASSERT(!epte->avail3);
+ epte->suppress_ve = 1;
- ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
+ ept_p2m_type_to_flags(p2m, epte, epte->sa_p2mt, epte->access);
if ( (level - 1) == target )
continue;
@@ -321,7 +370,7 @@ static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
mfn = e.mfn;
unmap_domain_page(*table);
- *table = map_domain_page(mfn);
+ *table = map_domain_page(_mfn(mfn));
*gfn_remainder &= (1UL << shift) - 1;
return GUEST_TABLE_NORMAL_PAGE;
}
@@ -334,7 +383,7 @@ static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
static bool_t ept_invalidate_emt(mfn_t mfn, bool_t recalc, int level)
{
int rc;
- ept_entry_t *epte = map_domain_page(mfn_x(mfn));
+ ept_entry_t *epte = map_domain_page(mfn);
unsigned int i;
bool_t changed = 0;
@@ -376,7 +425,7 @@ static int ept_invalidate_emt_range(struct p2m_domain *p2m,
unsigned int i, index;
int wrc, rc = 0, ret = GUEST_TABLE_MAP_FAILED;
- table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
+ table = map_domain_page(_mfn(pagetable_get_pfn(p2m_get_pagetable(p2m))));
for ( i = ept_get_wl(&p2m->ept); i > target; --i )
{
ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
@@ -460,7 +509,7 @@ static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
ept_entry_t e;
unsigned int i;
- epte = map_domain_page(mfn);
+ epte = map_domain_page(_mfn(mfn));
i = (gfn >> (level * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1);
e = atomic_read_ept_entry(&epte[i]);
@@ -488,7 +537,7 @@ static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
{
e.sa_p2mt = p2m_is_logdirty_range(p2m, gfn + i, gfn + i)
? p2m_ram_logdirty : p2m_ram_rw;
- ept_p2m_type_to_flags(&e, e.sa_p2mt, e.access);
+ ept_p2m_type_to_flags(p2m, &e, e.sa_p2mt, e.access);
}
e.recalc = 0;
wrc = atomic_write_ept_entry(&epte[i], e, level);
@@ -540,7 +589,7 @@ static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
e.ipat = ipat;
e.recalc = 0;
if ( recalc && p2m_is_changeable(e.sa_p2mt) )
- ept_p2m_type_to_flags(&e, e.sa_p2mt, e.access);
+ ept_p2m_type_to_flags(p2m, &e, e.sa_p2mt, e.access);
wrc = atomic_write_ept_entry(&epte[i], e, level);
ASSERT(wrc == 0);
}
@@ -608,7 +657,8 @@ bool_t ept_handle_misconfig(uint64_t gpa)
*/
static int
ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
- unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma)
+ unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma,
+ int sve)
{
ept_entry_t *table, *ept_entry = NULL;
unsigned long gfn_remainder = gfn;
@@ -618,6 +668,7 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
uint8_t ipat = 0;
int need_modify_vtd_table = 1;
int vtd_pte_present = 0;
+ unsigned int iommu_flags = p2m_get_iommu_flags(p2mt);
enum { sync_off, sync_on, sync_check } needs_sync = sync_check;
ept_entry_t old_entry = { .epte = 0 };
ept_entry_t new_entry = { .epte = 0 };
@@ -651,7 +702,7 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
(target == 0));
ASSERT(!p2m_is_foreign(p2mt) || target == 0);
- table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
+ table = map_domain_page(_mfn(pagetable_get_pfn(p2m_get_pagetable(p2m))));
ret = GUEST_TABLE_MAP_FAILED;
for ( i = ept_get_wl(ept); i > target; i-- )
@@ -748,12 +799,19 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
new_entry.mfn = mfn_x(mfn);
/* Safe to read-then-write because we hold the p2m lock */
- if ( ept_entry->mfn == new_entry.mfn )
- need_modify_vtd_table = 0;
+ if ( ept_entry->mfn == new_entry.mfn &&
+ p2m_get_iommu_flags(ept_entry->sa_p2mt) == iommu_flags )
+ need_modify_vtd_table = 0;
- ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
+ ept_p2m_type_to_flags(p2m, &new_entry, p2mt, p2ma);
}
+ if ( sve != -1 )
+ new_entry.suppress_ve = !!sve;
+ else
+ new_entry.suppress_ve = is_epte_valid(&old_entry) ?
+ old_entry.suppress_ve : 1;
+
rc = atomic_write_ept_entry(ept_entry, new_entry, target);
if ( unlikely(rc) )
old_entry.epte = 0;
@@ -763,30 +821,28 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
out:
- unmap_domain_page(table);
-
if ( needs_sync != sync_off )
ept_sync_domain(p2m);
- /* For non-nested p2m, may need to change VT-d page table.*/
- if ( rc == 0 && !p2m_is_nestedp2m(p2m) && need_iommu(d) &&
+ /* For host p2m, may need to change VT-d page table.*/
+ if ( rc == 0 && p2m_is_hostp2m(p2m) && need_iommu(d) &&
need_modify_vtd_table )
{
if ( iommu_hap_pt_share )
iommu_pte_flush(d, gfn, &ept_entry->epte, order, vtd_pte_present);
else
{
- unsigned int flags = p2m_get_iommu_flags(p2mt);
-
- if ( flags != 0 )
+ if ( iommu_flags )
for ( i = 0; i < (1 << order); i++ )
- iommu_map_page(d, gfn + i, mfn_x(mfn) + i, flags);
+ iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
else
for ( i = 0; i < (1 << order); i++ )
iommu_unmap_page(d, gfn + i);
}
}
+ unmap_domain_page(table);
+
/* Release the old intermediate tables, if any. This has to be the
last thing we do, after the ept_sync_domain() and removal
from the iommu tables, so as to avoid a potential
@@ -794,15 +850,19 @@ out:
if ( is_epte_present(&old_entry) )
ept_free_entry(p2m, &old_entry, target);
+ if ( rc == 0 && p2m_is_hostp2m(p2m) )
+ p2m_altp2m_propagate_change(d, _gfn(gfn), mfn, order, p2mt, p2ma);
+
return rc;
}
/* Read ept p2m entries */
static mfn_t ept_get_entry(struct p2m_domain *p2m,
unsigned long gfn, p2m_type_t *t, p2m_access_t* a,
- p2m_query_t q, unsigned int *page_order)
+ p2m_query_t q, unsigned int *page_order,
+ bool_t *sve)
{
- ept_entry_t *table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
+ ept_entry_t *table = map_domain_page(_mfn(pagetable_get_pfn(p2m_get_pagetable(p2m))));
unsigned long gfn_remainder = gfn;
ept_entry_t *ept_entry;
u32 index;
@@ -814,6 +874,8 @@ static mfn_t ept_get_entry(struct p2m_domain *p2m,
*t = p2m_mmio_dm;
*a = p2m_access_n;
+ if ( sve )
+ *sve = 1;
/* This pfn is higher than the highest the p2m map currently holds */
if ( gfn > p2m->max_mapped_pfn )
@@ -879,6 +941,8 @@ static mfn_t ept_get_entry(struct p2m_domain *p2m,
else
*t = ept_entry->sa_p2mt;
*a = ept_entry->access;
+ if ( sve )
+ *sve = ept_entry->suppress_ve;
mfn = _mfn(ept_entry->mfn);
if ( i )
@@ -906,19 +970,18 @@ void ept_walk_table(struct domain *d, unsigned long gfn)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
struct ept_data *ept = &p2m->ept;
- ept_entry_t *table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
+ ept_entry_t *table = map_domain_page(_mfn(pagetable_get_pfn(p2m_get_pagetable(p2m))));
unsigned long gfn_remainder = gfn;
int i;
- gdprintk(XENLOG_ERR, "Walking EPT tables for domain %d gfn %lx\n",
- d->domain_id, gfn);
+ gprintk(XENLOG_ERR, "Walking EPT tables for GFN %lx:\n", gfn);
/* This pfn is higher than the highest the p2m map currently holds */
if ( gfn > p2m->max_mapped_pfn )
{
- gdprintk(XENLOG_ERR, " gfn exceeds max_mapped_pfn %lx\n",
- p2m->max_mapped_pfn);
+ gprintk(XENLOG_ERR, " gfn exceeds max_mapped_pfn %lx\n",
+ p2m->max_mapped_pfn);
goto out;
}
@@ -931,7 +994,7 @@ void ept_walk_table(struct domain *d, unsigned long gfn)
index = gfn_remainder >> (i*EPT_TABLE_ORDER);
ept_entry = table + index;
- gdprintk(XENLOG_ERR, " epte %"PRIx64"\n", ept_entry->epte);
+ gprintk(XENLOG_ERR, " epte %"PRIx64"\n", ept_entry->epte);
if ( (i == 0) || !is_epte_present(ept_entry) ||
is_epte_superpage(ept_entry) )
@@ -940,7 +1003,7 @@ void ept_walk_table(struct domain *d, unsigned long gfn)
{
gfn_remainder &= (1UL << (i*EPT_TABLE_ORDER)) - 1;
- next = map_domain_page(ept_entry->mfn);
+ next = map_domain_page(_mfn(ept_entry->mfn));
unmap_domain_page(table);
@@ -1040,6 +1103,9 @@ void ept_sync_domain(struct p2m_domain *p2m)
ASSERT(local_irq_is_enabled());
+ if ( nestedhvm_enabled(d) && !p2m_is_nestedp2m(p2m) )
+ p2m_flush_nestedp2m(d);
+
/*
* Flush active cpus synchronously. Flush others the next time this domain
* is scheduled onto them. We accept the race of other CPUs adding to
@@ -1053,6 +1119,26 @@ void ept_sync_domain(struct p2m_domain *p2m)
__ept_sync_domain, p2m, 1);
}
+static void ept_enable_pml(struct p2m_domain *p2m)
+{
+ /*
+ * No need to check if vmx_domain_enable_pml has succeeded or not, as
+ * ept_p2m_type_to_flags will do the check, and write protection will be
+ * used if PML is not enabled.
+ */
+ vmx_domain_enable_pml(p2m->domain);
+}
+
+static void ept_disable_pml(struct p2m_domain *p2m)
+{
+ vmx_domain_disable_pml(p2m->domain);
+}
+
+static void ept_flush_pml_buffers(struct p2m_domain *p2m)
+{
+ vmx_domain_flush_pml_buffers(p2m->domain);
+}
+
int ept_p2m_init(struct p2m_domain *p2m)
{
struct ept_data *ept = &p2m->ept;
@@ -1070,6 +1156,15 @@ int ept_p2m_init(struct p2m_domain *p2m)
/* set EPT page-walk length, now it's actual walk length - 1, i.e. 3 */
ept->ept_wl = 3;
+ if ( cpu_has_vmx_pml )
+ {
+ /* Enable EPT A/D bits if we are going to use PML. */
+ ept->ept_ad = cpu_has_vmx_pml ? 1 : 0;
+ p2m->enable_hardware_log_dirty = ept_enable_pml;
+ p2m->disable_hardware_log_dirty = ept_disable_pml;
+ p2m->flush_hardware_cached_dirty = ept_flush_pml_buffers;
+ }
+
if ( !zalloc_cpumask_var(&ept->synced_mask) )
return -ENOMEM;
@@ -1119,7 +1214,7 @@ static void ept_dump_p2m_table(unsigned char key)
char c = 0;
gfn_remainder = gfn;
- table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
+ table = map_domain_page(_mfn(pagetable_get_pfn(p2m_get_pagetable(p2m))));
for ( i = ept_get_wl(ept); i > 0; i-- )
{
diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
index 43f507c..8156525 100644
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -16,14 +16,13 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/iommu.h>
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
#include <xen/event.h>
-#include <public/mem_event.h>
+#include <public/vm_event.h>
#include <asm/domain.h>
#include <asm/page.h>
#include <asm/paging.h>
@@ -109,7 +108,7 @@ p2m_pod_cache_add(struct p2m_domain *p2m,
*/
for ( i = 0; i < (1 << order); i++ )
{
- char *b = map_domain_page(mfn_x(page_to_mfn(page)) + i);
+ char *b = map_domain_page(_mfn(mfn_x(page_to_mfn(page)) + i));
clear_page(b);
unmap_domain_page(b);
}
@@ -536,7 +535,7 @@ recount:
p2m_access_t a;
p2m_type_t t;
- (void)p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL);
+ (void)p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL, NULL);
if ( t == p2m_populate_on_demand )
pod++;
@@ -587,7 +586,7 @@ recount:
p2m_type_t t;
p2m_access_t a;
- mfn = p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL);
+ mfn = p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL, NULL);
if ( t == p2m_populate_on_demand )
{
p2m_set_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid,
@@ -676,7 +675,7 @@ p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
for ( i=0; i<SUPERPAGE_PAGES; i++ )
{
p2m_access_t a;
- mfn = p2m->get_entry(p2m, gfn + i, &type, &a, 0, NULL);
+ mfn = p2m->get_entry(p2m, gfn + i, &type, &a, 0, NULL, NULL);
if ( i == 0 )
{
@@ -710,7 +709,7 @@ p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
for ( i=0; i<SUPERPAGE_PAGES; i++ )
{
/* Quick zero-check */
- map = map_domain_page(mfn_x(mfn0) + i);
+ map = map_domain_page(_mfn(mfn_x(mfn0) + i));
for ( j=0; j<16; j++ )
if( *(map+j) != 0 )
@@ -743,7 +742,7 @@ p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
/* Finally, do a full zero-check */
for ( i=0; i < SUPERPAGE_PAGES; i++ )
{
- map = map_domain_page(mfn_x(mfn0) + i);
+ map = map_domain_page(_mfn(mfn_x(mfn0) + i));
for ( j=0; j<PAGE_SIZE/sizeof(*map); j++ )
if( *(map+j) != 0 )
@@ -808,14 +807,14 @@ p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
for ( i=0; i<count; i++ )
{
p2m_access_t a;
- mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, &a, 0, NULL);
+ mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, &a, 0, NULL, NULL);
/* If this is ram, and not a pagetable or from the xen heap, and probably not mapped
elsewhere, map it; otherwise, skip. */
if ( p2m_is_ram(types[i])
&& ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 )
&& ( (mfn_to_page(mfns[i])->count_info & (PGC_page_table|PGC_xen_heap)) == 0 )
&& ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) )
- map[i] = map_domain_page(mfn_x(mfns[i]));
+ map[i] = map_domain_page(mfns[i]);
else
map[i] = NULL;
}
@@ -947,7 +946,7 @@ p2m_pod_emergency_sweep(struct p2m_domain *p2m)
for ( i=p2m->pod.reclaim_single; i > 0 ; i-- )
{
p2m_access_t a;
- (void)p2m->get_entry(p2m, i, &t, &a, 0, NULL);
+ (void)p2m->get_entry(p2m, i, &t, &a, 0, NULL, NULL);
if ( p2m_is_ram(t) )
{
gfns[j] = i;
@@ -1135,7 +1134,7 @@ guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
for ( i = 0; i < (1UL << order); i++ )
{
p2m_access_t a;
- omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL);
+ omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL, NULL);
if ( p2m_is_ram(ot) )
{
P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot);
diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c
index e48b63a..709920a 100644
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -21,15 +21,14 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/iommu.h>
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
#include <xen/event.h>
#include <xen/trace.h>
-#include <public/mem_event.h>
+#include <public/vm_event.h>
#include <asm/domain.h>
#include <asm/page.h>
#include <asm/paging.h>
@@ -94,6 +93,7 @@ static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
default:
return flags | _PAGE_NX_BIT;
case p2m_grant_map_ro:
+ case p2m_mmio_write_dm:
return flags | P2M_BASE_FLAGS | _PAGE_NX_BIT;
case p2m_ram_ro:
case p2m_ram_logdirty:
@@ -145,7 +145,7 @@ p2m_free_entry(struct p2m_domain *p2m, l1_pgentry_t *p2m_entry, int page_order)
if ( page_order > PAGE_ORDER_2M )
{
- l1_pgentry_t *l3_table = map_domain_page(l1e_get_pfn(*p2m_entry));
+ l1_pgentry_t *l3_table = map_domain_page(_mfn(l1e_get_pfn(*p2m_entry)));
for ( int i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
p2m_free_entry(p2m, l3_table + i, page_order - 9);
unmap_domain_page(l3_table);
@@ -279,7 +279,7 @@ p2m_next_level(struct p2m_domain *p2m, void **table,
p2m->write_p2m_entry(p2m, gfn, p2m_entry, new_entry, 2);
}
- next = map_domain_page(l1e_get_pfn(*p2m_entry));
+ next = map_domain_page(_mfn(l1e_get_pfn(*p2m_entry)));
if ( unmap )
unmap_domain_page(*table);
*table = next;
@@ -303,7 +303,7 @@ static int p2m_pt_set_recalc_range(struct p2m_domain *p2m,
l1_pgentry_t *pent, *plast;
int err = 0;
- table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+ table = map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
for ( i = 4; i-- > level; )
{
remainder = gfn_remainder;
@@ -365,7 +365,7 @@ static int do_recalc(struct p2m_domain *p2m, unsigned long gfn)
l1_pgentry_t *pent;
int err = 0;
- table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+ table = map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
while ( --level )
{
unsigned long remainder = gfn_remainder;
@@ -481,18 +481,33 @@ int p2m_pt_handle_deferred_changes(uint64_t gpa)
/* Returns: 0 for success, -errno for failure */
static int
p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
- unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
+ unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma,
+ int sve)
{
/* XXX -- this might be able to be faster iff current->domain == d */
void *table;
unsigned long i, gfn_remainder = gfn;
- l1_pgentry_t *p2m_entry;
- l1_pgentry_t entry_content;
+ l1_pgentry_t *p2m_entry, entry_content;
+ /* Intermediate table to free if we're replacing it with a superpage. */
+ l1_pgentry_t intermediate_entry = l1e_empty();
l2_pgentry_t l2e_content;
l3_pgentry_t l3e_content;
int rc;
unsigned int iommu_pte_flags = p2m_get_iommu_flags(p2mt);
- unsigned long old_mfn = 0;
+ /*
+ * old_mfn and iommu_old_flags control possible flush/update needs on the
+ * IOMMU: We need to flush when MFN or flags (i.e. permissions) change.
+ * iommu_old_flags being initialized to zero covers the case of the entry
+ * getting replaced being a non-present (leaf or intermediate) one. For
+ * present leaf entries the real value will get calculated below, while
+ * for present intermediate entries ~0 (guaranteed != iommu_pte_flags)
+ * will be used (to cover all cases of what the leaf entries underneath
+ * the intermediate one might be).
+ */
+ unsigned int flags, iommu_old_flags = 0;
+ unsigned long old_mfn = INVALID_MFN;
+
+ ASSERT(sve != 0);
if ( tb_init_done )
{
@@ -523,7 +538,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
if ( rc < 0 )
return rc;
- table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+ table = map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
rc = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
L4_PAGETABLE_SHIFT - PAGE_SHIFT,
L4_PAGETABLE_ENTRIES, PGT_l3_page_table, 1);
@@ -535,17 +550,24 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
*/
if ( page_order == PAGE_ORDER_1G )
{
- l1_pgentry_t old_entry = l1e_empty();
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
L3_PAGETABLE_SHIFT - PAGE_SHIFT,
L3_PAGETABLE_ENTRIES);
ASSERT(p2m_entry);
- if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
- !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+ flags = l1e_get_flags(*p2m_entry);
+ if ( flags & _PAGE_PRESENT )
{
- /* We're replacing a non-SP page with a superpage. Make sure to
- * handle freeing the table properly. */
- old_entry = *p2m_entry;
+ if ( flags & _PAGE_PSE )
+ {
+ iommu_old_flags =
+ p2m_get_iommu_flags(p2m_flags_to_type(flags));
+ old_mfn = l1e_get_pfn(*p2m_entry);
+ }
+ else
+ {
+ iommu_old_flags = ~0;
+ intermediate_entry = *p2m_entry;
+ }
}
ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
@@ -556,17 +578,10 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
entry_content.l1 = l3e_content.l3;
if ( entry_content.l1 != 0 )
- {
p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
- old_mfn = l1e_get_pfn(*p2m_entry);
- }
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 3);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
-
- /* Free old intermediate tables if necessary */
- if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
- p2m_free_entry(p2m, &old_entry, page_order);
}
else
{
@@ -588,7 +603,10 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
0, L1_PAGETABLE_ENTRIES);
ASSERT(p2m_entry);
-
+ iommu_old_flags =
+ p2m_get_iommu_flags(p2m_flags_to_type(l1e_get_flags(*p2m_entry)));
+ old_mfn = l1e_get_pfn(*p2m_entry);
+
if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct)
|| p2m_is_paging(p2mt) )
entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
@@ -597,29 +615,32 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
entry_content = l1e_empty();
if ( entry_content.l1 != 0 )
- {
p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
- old_mfn = l1e_get_pfn(*p2m_entry);
- }
+
/* level 1 entry */
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 1);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
}
else if ( page_order == PAGE_ORDER_2M )
{
- l1_pgentry_t old_entry = l1e_empty();
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
L2_PAGETABLE_SHIFT - PAGE_SHIFT,
L2_PAGETABLE_ENTRIES);
ASSERT(p2m_entry);
-
- /* FIXME: Deal with 4k replaced by 2meg pages */
- if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
- !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+ flags = l1e_get_flags(*p2m_entry);
+ if ( flags & _PAGE_PRESENT )
{
- /* We're replacing a non-SP page with a superpage. Make sure to
- * handle freeing the table properly. */
- old_entry = *p2m_entry;
+ if ( flags & _PAGE_PSE )
+ {
+ iommu_old_flags =
+ p2m_get_iommu_flags(p2m_flags_to_type(flags));
+ old_mfn = l1e_get_pfn(*p2m_entry);
+ }
+ else
+ {
+ iommu_old_flags = ~0;
+ intermediate_entry = *p2m_entry;
+ }
}
ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
@@ -633,17 +654,10 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
entry_content.l1 = l2e_content.l2;
if ( entry_content.l1 != 0 )
- {
p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
- old_mfn = l1e_get_pfn(*p2m_entry);
- }
p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 2);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
-
- /* Free old intermediate tables if necessary */
- if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
- p2m_free_entry(p2m, &old_entry, page_order);
}
/* Track the highest gfn for which we have ever had a valid mapping */
@@ -651,26 +665,31 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
&& (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
- if ( iommu_enabled && need_iommu(p2m->domain) )
+ if ( iommu_enabled && need_iommu(p2m->domain) &&
+ (iommu_old_flags != iommu_pte_flags || old_mfn != mfn_x(mfn)) )
{
- if ( iommu_hap_pt_share )
+ if ( iommu_use_hap_pt(p2m->domain) )
{
- if ( old_mfn && (old_mfn != mfn_x(mfn)) )
+ if ( iommu_old_flags )
amd_iommu_flush_pages(p2m->domain, gfn, page_order);
}
+ else if ( iommu_pte_flags )
+ for ( i = 0; i < (1UL << page_order); i++ )
+ iommu_map_page(p2m->domain, gfn + i, mfn_x(mfn) + i,
+ iommu_pte_flags);
else
- {
- unsigned int flags = p2m_get_iommu_flags(p2mt);
-
- if ( flags != 0 )
- for ( i = 0; i < (1UL << page_order); i++ )
- iommu_map_page(p2m->domain, gfn+i, mfn_x(mfn)+i, flags);
- else
- for ( int i = 0; i < (1UL << page_order); i++ )
- iommu_unmap_page(p2m->domain, gfn+i);
- }
+ for ( i = 0; i < (1UL << page_order); i++ )
+ iommu_unmap_page(p2m->domain, gfn + i);
}
+ /*
+ * Free old intermediate tables if necessary. This has to be the
+ * last thing we do, after removal from the IOMMU tables, so as to
+ * avoid a potential use-after-free.
+ */
+ if ( l1e_get_flags(intermediate_entry) & _PAGE_PRESENT )
+ p2m_free_entry(p2m, &intermediate_entry, page_order);
+
out:
unmap_domain_page(table);
return rc;
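The restructured IOMMU handling above reduces to a single test: only touch the IOMMU mappings when the permissions or the target MFN actually changed, with a replaced intermediate table represented by an impossible flags value. In isolation (a sketch of the condition, per the iommu_old_flags comment earlier in this hunk):

#include <stdbool.h>

static bool iommu_needs_update(unsigned int old_flags, unsigned int new_flags,
                               unsigned long old_mfn, unsigned long new_mfn)
{
    /* old_flags == ~0u (entry replaced a present intermediate table)
     * never matches real leaf flags, so it always forces an update. */
    return old_flags != new_flags || old_mfn != new_mfn;
}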
@@ -688,7 +707,7 @@ static inline p2m_type_t recalc_type(bool_t recalc, p2m_type_t t,
static mfn_t
p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
- unsigned int *page_order)
+ unsigned int *page_order, bool_t *sve)
{
mfn_t mfn;
paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
@@ -700,6 +719,9 @@ p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
ASSERT(paging_mode_translate(p2m->domain));
+ if ( sve )
+ *sve = 1;
+
/* XXX This is for compatibility with the old model, where anything not
* XXX marked as RAM was considered to be emulated MMIO space.
* XXX Once we start explicitly registering MMIO regions in the p2m
@@ -715,7 +737,7 @@ p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
{
- l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
+ l4_pgentry_t *l4e = map_domain_page(mfn);
l4e += l4_table_offset(addr);
if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
{
@@ -727,7 +749,7 @@ p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
unmap_domain_page(l4e);
}
{
- l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
+ l3_pgentry_t *l3e = map_domain_page(mfn);
l3e += l3_table_offset(addr);
pod_retry_l3:
flags = l3e_get_flags(*l3e);
@@ -768,7 +790,7 @@ pod_retry_l3:
unmap_domain_page(l3e);
}
- l2e = map_domain_page(mfn_x(mfn));
+ l2e = map_domain_page(mfn);
l2e += l2_table_offset(addr);
pod_retry_l2:
@@ -806,7 +828,7 @@ pod_retry_l2:
recalc = 1;
unmap_domain_page(l2e);
- l1e = map_domain_page(mfn_x(mfn));
+ l1e = map_domain_page(mfn);
l1e += l1_table_offset(addr);
pod_retry_l1:
flags = l1e_get_flags(*l1e);
@@ -848,7 +870,7 @@ static void p2m_pt_change_entry_type_global(struct p2m_domain *p2m,
ASSERT(hap_enabled(p2m->domain));
- tab = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+ tab = map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
for ( changed = i = 0; i < (1 << PAGETABLE_ORDER); ++i )
{
l1_pgentry_t e = tab[i];
@@ -928,7 +950,7 @@ long p2m_pt_audit_p2m(struct p2m_domain *p2m)
l4_pgentry_t *l4e;
l3_pgentry_t *l3e;
int i4, i3;
- l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+ l4e = map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
gfn = 0;
for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
@@ -938,7 +960,7 @@ long p2m_pt_audit_p2m(struct p2m_domain *p2m)
gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
continue;
}
- l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
+ l3e = map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
for ( i3 = 0;
i3 < L3_PAGETABLE_ENTRIES;
i3++ )
@@ -973,7 +995,7 @@ long p2m_pt_audit_p2m(struct p2m_domain *p2m)
}
}
- l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
+ l2e = map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
{
if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
@@ -1009,7 +1031,7 @@ long p2m_pt_audit_p2m(struct p2m_domain *p2m)
continue;
}
- l1e = map_domain_page(l2e_get_pfn(l2e[i2]));
+ l1e = map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
{
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index efa49dd..c6b883d 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -20,14 +20,13 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/iommu.h>
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
#include <xen/event.h>
-#include <public/mem_event.h>
+#include <public/vm_event.h>
#include <asm/domain.h>
#include <asm/page.h>
#include <asm/paging.h>
@@ -35,6 +34,7 @@
#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
#include <asm/mem_sharing.h>
#include <asm/hvm/nestedhvm.h>
+#include <asm/altp2m.h>
#include <asm/hvm/svm/amd-iommu-proto.h>
#include <xsm/xsm.h>
@@ -71,6 +71,7 @@ static int p2m_initialise(struct domain *d, struct p2m_domain *p2m)
p2m->domain = d;
p2m->default_access = p2m_access_rwx;
+ p2m->p2m_class = p2m_host;
p2m->np2m_base = P2M_BASE_EADDR;
@@ -158,6 +159,7 @@ static int p2m_init_nestedp2m(struct domain *d)
p2m_teardown_nestedp2m(d);
return -ENOMEM;
}
+ p2m->p2m_class = p2m_nested;
p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
list_add(&p2m->np2m_list, &p2m_get_hostp2m(d)->np2m_list);
}
@@ -181,6 +183,43 @@ static void p2m_teardown_nestedp2m(struct domain *d)
}
}
+static void p2m_teardown_altp2m(struct domain *d)
+{
+ unsigned int i;
+ struct p2m_domain *p2m;
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ {
+ if ( !d->arch.altp2m_p2m[i] )
+ continue;
+ p2m = d->arch.altp2m_p2m[i];
+ d->arch.altp2m_p2m[i] = NULL;
+ p2m_free_one(p2m);
+ }
+}
+
+static int p2m_init_altp2m(struct domain *d)
+{
+ unsigned int i;
+ struct p2m_domain *p2m;
+
+ mm_lock_init(&d->arch.altp2m_list_lock);
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ {
+ d->arch.altp2m_p2m[i] = p2m = p2m_init_one(d);
+ if ( p2m == NULL )
+ {
+ p2m_teardown_altp2m(d);
+ return -ENOMEM;
+ }
+ p2m->p2m_class = p2m_alternate;
+ p2m->access_required = 1;
+ _atomic_set(&p2m->active_vcpus, 0);
+ }
+
+ return 0;
+}
+
int p2m_init(struct domain *d)
{
int rc;
@@ -194,7 +233,17 @@ int p2m_init(struct domain *d)
* (p2m_init runs too early for HVM_PARAM_* options) */
rc = p2m_init_nestedp2m(d);
if ( rc )
+ {
+ p2m_teardown_hostp2m(d);
+ return rc;
+ }
+
+ rc = p2m_init_altp2m(d);
+ if ( rc )
+ {
p2m_teardown_hostp2m(d);
+ p2m_teardown_nestedp2m(d);
+ }
return rc;
}
@@ -202,7 +251,7 @@ int p2m_init(struct domain *d)
int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start,
unsigned long end)
{
- ASSERT(!p2m_is_nestedp2m(p2m));
+ ASSERT(p2m_is_hostp2m(p2m));
if ( p2m->global_logdirty ||
rangeset_contains_range(p2m->logdirty_ranges, start, end) )
return 1;
@@ -237,6 +286,42 @@ void p2m_memory_type_changed(struct domain *d)
}
}
+void p2m_enable_hardware_log_dirty(struct domain *d)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+ if ( p2m->enable_hardware_log_dirty )
+ {
+ p2m_lock(p2m);
+ p2m->enable_hardware_log_dirty(p2m);
+ p2m_unlock(p2m);
+ }
+}
+
+void p2m_disable_hardware_log_dirty(struct domain *d)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+ if ( p2m->disable_hardware_log_dirty )
+ {
+ p2m_lock(p2m);
+ p2m->disable_hardware_log_dirty(p2m);
+ p2m_unlock(p2m);
+ }
+}
+
+void p2m_flush_hardware_cached_dirty(struct domain *d)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+ if ( p2m->flush_hardware_cached_dirty )
+ {
+ p2m_lock(p2m);
+ p2m->flush_hardware_cached_dirty(p2m);
+ p2m_unlock(p2m);
+ }
+}
+
mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
unsigned int *page_order, bool_t locked)
@@ -259,16 +344,16 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
/* Grab the lock here, don't release until put_gfn */
gfn_lock(p2m, gfn, 0);
- mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order);
+ mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
{
- ASSERT(!p2m_is_nestedp2m(p2m));
+ ASSERT(p2m_is_hostp2m(p2m));
/* Try to unshare. If we fail, communicate ENOMEM without
* sleeping. */
if ( mem_sharing_unshare_page(p2m->domain, gfn, 0) < 0 )
(void)mem_sharing_notify_enomem(p2m->domain, gfn, 0);
- mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order);
+ mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
}
if (unlikely((p2m_is_broken(*t))))
@@ -372,7 +457,7 @@ int p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
else
order = 0;
- set_rc = p2m->set_entry(p2m, gfn, mfn, order, p2mt, p2ma);
+ set_rc = p2m->set_entry(p2m, gfn, mfn, order, p2mt, p2ma, -1);
if ( set_rc )
rc = set_rc;
@@ -431,7 +516,7 @@ int p2m_alloc_table(struct p2m_domain *p2m)
p2m_lock(p2m);
- if ( !p2m_is_nestedp2m(p2m)
+ if ( p2m_is_hostp2m(p2m)
&& !page_list_empty(&d->page_list) )
{
P2M_ERROR("dom %d already has memory allocated\n", d->domain_id);
@@ -536,7 +621,7 @@ p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn, unsigned long mfn,
{
for ( i = 0; i < (1UL << page_order); i++ )
{
- mfn_return = p2m->get_entry(p2m, gfn + i, &t, &a, 0, NULL);
+ mfn_return = p2m->get_entry(p2m, gfn + i, &t, &a, 0, NULL, NULL);
if ( !p2m_is_grant(t) && !p2m_is_shared(t) && !p2m_is_foreign(t) )
set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
ASSERT( !p2m_is_valid(t) || mfn + i == mfn_x(mfn_return) );
@@ -546,14 +631,16 @@ p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn, unsigned long mfn,
p2m->default_access);
}
-void
+int
guest_physmap_remove_page(struct domain *d, unsigned long gfn,
unsigned long mfn, unsigned int page_order)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ int rc;
gfn_lock(p2m, gfn, page_order);
- p2m_remove_page(p2m, gfn, mfn, page_order);
+ rc = p2m_remove_page(p2m, gfn, mfn, page_order);
gfn_unlock(p2m, gfn, page_order);
+ return rc;
}
int
@@ -599,7 +686,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
/* First, remove m->p mappings for existing p->m mappings */
for ( i = 0; i < (1UL << page_order); i++ )
{
- omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL);
+ omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL, NULL);
if ( p2m_is_shared(ot) )
{
/* Do an unshare to cleanly take care of all corner
@@ -623,7 +710,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
(void)mem_sharing_notify_enomem(p2m->domain, gfn + i, 0);
return rc;
}
- omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL);
+ omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL, NULL);
ASSERT(!p2m_is_shared(ot));
}
if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
@@ -671,7 +758,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
* address */
P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
mfn + i, ogfn, gfn + i);
- omfn = p2m->get_entry(p2m, ogfn, &ot, &a, 0, NULL);
+ omfn = p2m->get_entry(p2m, ogfn, &ot, &a, 0, NULL, NULL);
if ( p2m_is_ram(ot) && !p2m_is_paged(ot) )
{
ASSERT(mfn_valid(omfn));
@@ -738,7 +825,7 @@ int p2m_change_type_one(struct domain *d, unsigned long gfn,
gfn_lock(p2m, gfn, 0);
- mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, NULL);
+ mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, NULL, NULL);
rc = likely(pt == ot)
? p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, nt,
p2m->default_access)
@@ -810,7 +897,7 @@ void p2m_change_type_range(struct domain *d,
/* Returns: 0 for success, -errno for failure */
static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
- p2m_type_t gfn_p2mt)
+ p2m_type_t gfn_p2mt, p2m_access_t access)
{
int rc = 0;
p2m_access_t a;
@@ -822,10 +909,10 @@ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
return -EIO;
gfn_lock(p2m, gfn, 0);
- omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL);
+ omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
{
- p2m_unlock(p2m);
+ gfn_unlock(p2m, gfn, 0);
domain_crash(d);
return -ENOENT;
}
@@ -837,7 +924,7 @@ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn, mfn_x(mfn));
rc = p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, gfn_p2mt,
- p2m->default_access);
+ access);
gfn_unlock(p2m, gfn, 0);
if ( rc )
gdprintk(XENLOG_ERR,
@@ -850,12 +937,64 @@ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
mfn_t mfn)
{
- return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign);
+ return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign,
+ p2m_get_hostp2m(d)->default_access);
}
-int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+ p2m_access_t access)
{
- return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct);
+ return set_typed_p2m_entry(d, gfn, mfn, p2m_mmio_direct, access);
+}
+
+int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
+ p2m_access_t p2ma, unsigned int flag)
+{
+ p2m_type_t p2mt;
+ p2m_access_t a;
+ mfn_t mfn;
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ int ret;
+
+ if ( !paging_mode_translate(p2m->domain) )
+ {
+ if ( !need_iommu(d) )
+ return 0;
+ return iommu_map_page(d, gfn, gfn, IOMMUF_readable|IOMMUF_writable);
+ }
+
+ gfn_lock(p2m, gfn, 0);
+
+ mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
+
+ if ( p2mt == p2m_invalid || p2mt == p2m_mmio_dm )
+ ret = p2m_set_entry(p2m, gfn, _mfn(gfn), PAGE_ORDER_4K,
+ p2m_mmio_direct, p2ma);
+ else if ( mfn_x(mfn) == gfn && p2mt == p2m_mmio_direct && a == p2ma )
+ {
+ ret = 0;
+ /*
+ * PVH fixme: during Dom0 PVH construction, p2m entries are being set
+ * but iomem regions are not mapped with IOMMU. This makes sure that
+ * RMRRs are correctly mapped with IOMMU.
+ */
+ if ( is_hardware_domain(d) && !iommu_use_hap_pt(d) )
+ ret = iommu_map_page(d, gfn, gfn, IOMMUF_readable|IOMMUF_writable);
+ }
+ else
+ {
+ if ( flag & XEN_DOMCTL_DEV_RDM_RELAXED )
+ ret = 0;
+ else
+ ret = -EBUSY;
+ printk(XENLOG_G_WARNING
+ "Cannot setup identity map d%d:%lx,"
+ " gfn already mapped to %lx.\n",
+ d->domain_id, gfn, mfn_x(mfn));
+ }
+
+ gfn_unlock(p2m, gfn, 0);
+ return ret;
}
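set_identity_p2m_entry() above has three outcomes: an unmapped gfn gets a
fresh 1:1 mapping, an existing identity mapping with matching access
succeeds idempotently, and a gfn already mapped elsewhere fails with -EBUSY
unless the caller passed the relaxed flag. A toy model of that decision,
with a plain array standing in for the p2m (all names illustrative, not
Xen's):

    #include <stdio.h>
    #include <errno.h>

    #define NGFNS       8
    #define UNMAPPED    (~0UL)
    #define RDM_RELAXED 1

    static unsigned long map[NGFNS] = { [0 ... NGFNS - 1] = UNMAPPED };

    static int set_identity(unsigned long gfn, unsigned int flag)
    {
        if ( map[gfn] == UNMAPPED )
        {
            map[gfn] = gfn;          /* create the 1:1 mapping */
            return 0;
        }
        if ( map[gfn] == gfn )
            return 0;                /* already identity: idempotent */
        /* gfn points elsewhere: fail unless the caller asked to relax */
        return (flag & RDM_RELAXED) ? 0 : -EBUSY;
    }

    int main(void)
    {
        map[3] = 5;                                   /* conflicting map */
        printf("%d\n", set_identity(2, 0));           /* 0: created      */
        printf("%d\n", set_identity(2, 0));           /* 0: idempotent   */
        printf("%d\n", set_identity(3, 0));           /* -EBUSY          */
        printf("%d\n", set_identity(3, RDM_RELAXED)); /* 0: relaxed      */
        return 0;
    }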
/* Returns: 0 for success, -errno for failure */
@@ -871,7 +1010,7 @@ int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
return -EIO;
gfn_lock(p2m, gfn, 0);
- actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL);
+ actual_mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL, NULL);
/* Do not use mfn_valid() here as it will usually fail for MMIO pages. */
if ( (INVALID_MFN == mfn_x(actual_mfn)) || (t != p2m_mmio_direct) )
@@ -893,6 +1032,42 @@ int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
return rc;
}
+int clear_identity_p2m_entry(struct domain *d, unsigned long gfn)
+{
+ p2m_type_t p2mt;
+ p2m_access_t a;
+ mfn_t mfn;
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ int ret;
+
+ if ( !paging_mode_translate(d) )
+ {
+ if ( !need_iommu(d) )
+ return 0;
+ return iommu_unmap_page(d, gfn);
+ }
+
+ gfn_lock(p2m, gfn, 0);
+
+ mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
+ if ( p2mt == p2m_mmio_direct && mfn_x(mfn) == gfn )
+ {
+ ret = p2m_set_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K,
+ p2m_invalid, p2m->default_access);
+ gfn_unlock(p2m, gfn, 0);
+ }
+ else
+ {
+ gfn_unlock(p2m, gfn, 0);
+ printk(XENLOG_G_WARNING
+ "non-identity map d%d:%lx not cleared (mapped to %lx)\n",
+ d->domain_id, gfn, mfn_x(mfn));
+ ret = 0;
+ }
+
+ return ret;
+}
+
/* Returns: 0 for success, -errno for failure */
int set_shared_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
{
@@ -907,7 +1082,7 @@ int set_shared_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
return -EIO;
gfn_lock(p2m, gfn, 0);
- omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL);
+ omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL, NULL);
/* At the moment we only allow p2m change if gfn has already been made
* sharable first */
ASSERT(p2m_is_shared(ot));
@@ -959,7 +1134,7 @@ int p2m_mem_paging_nominate(struct domain *d, unsigned long gfn)
gfn_lock(p2m, gfn, 0);
- mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL);
+ mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
/* Check if mfn is valid */
if ( !mfn_valid(mfn) )
@@ -1021,7 +1196,7 @@ int p2m_mem_paging_evict(struct domain *d, unsigned long gfn)
gfn_lock(p2m, gfn, 0);
/* Get mfn */
- mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL);
+ mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
if ( unlikely(!mfn_valid(mfn)) )
goto out;
@@ -1077,27 +1252,30 @@ int p2m_mem_paging_evict(struct domain *d, unsigned long gfn)
void p2m_mem_paging_drop_page(struct domain *d, unsigned long gfn,
p2m_type_t p2mt)
{
- mem_event_request_t req = { .gfn = gfn };
+ vm_event_request_t req = {
+ .reason = VM_EVENT_REASON_MEM_PAGING,
+ .u.mem_paging.gfn = gfn
+ };
/* We allow no ring in this unique case, because it won't affect
* correctness of the guest execution at this point. If this is the only
* page that happens to be paged-out, we'll be okay, but it's likely the
* guest will crash shortly anyway. */
- int rc = mem_event_claim_slot(d, &d->mem_event->paging);
+ int rc = vm_event_claim_slot(d, &d->vm_event->paging);
if ( rc < 0 )
return;
/* Send release notification to pager */
- req.flags = MEM_EVENT_FLAG_DROP_PAGE;
+ req.u.mem_paging.flags = MEM_PAGING_DROP_PAGE;
/* Update stats unless the page hasn't yet been evicted */
if ( p2mt != p2m_ram_paging_out )
atomic_dec(&d->paged_pages);
else
/* Evict will fail now, tag this request for pager */
- req.flags |= MEM_EVENT_FLAG_EVICT_FAIL;
+ req.u.mem_paging.flags |= MEM_PAGING_EVICT_FAIL;
- mem_event_put_request(d, &d->mem_event->paging, &req);
+ vm_event_put_request(d, &d->vm_event->paging, &req);
}
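The conversion above moves the paging payload into a reason-tagged union
(req.u.mem_paging.*) instead of top-level request fields. A compilable
miniature of that layout (constants and field names are stand-ins, not the
real vm_event ABI):

    #include <stdint.h>
    #include <stdio.h>

    #define REASON_MEM_PAGING 1
    #define PAGING_DROP_PAGE  (1 << 0)
    #define PAGING_EVICT_FAIL (1 << 1)

    typedef struct {
        uint32_t reason;             /* selects the union member below */
        union {
            struct { uint64_t gfn; uint32_t flags; } mem_paging;
            struct { uint64_t gfn; uint32_t flags; } mem_access;
        } u;
    } request_t;

    int main(void)
    {
        request_t req = {
            .reason = REASON_MEM_PAGING,
            .u.mem_paging.gfn = 0x1000,
        };

        req.u.mem_paging.flags = PAGING_DROP_PAGE;
        /* eviction can no longer succeed: tag the request for the pager */
        req.u.mem_paging.flags |= PAGING_EVICT_FAIL;

        printf("reason=%u gfn=%#llx flags=%#x\n", req.reason,
               (unsigned long long)req.u.mem_paging.gfn,
               req.u.mem_paging.flags);
        return 0;
    }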
/**
@@ -1124,14 +1302,17 @@ void p2m_mem_paging_drop_page(struct domain *d, unsigned long gfn,
void p2m_mem_paging_populate(struct domain *d, unsigned long gfn)
{
struct vcpu *v = current;
- mem_event_request_t req = { .gfn = gfn };
+ vm_event_request_t req = {
+ .reason = VM_EVENT_REASON_MEM_PAGING,
+ .u.mem_paging.gfn = gfn
+ };
p2m_type_t p2mt;
p2m_access_t a;
mfn_t mfn;
struct p2m_domain *p2m = p2m_get_hostp2m(d);
/* We're paging. There should be a ring */
- int rc = mem_event_claim_slot(d, &d->mem_event->paging);
+ int rc = vm_event_claim_slot(d, &d->vm_event->paging);
if ( rc == -ENOSYS )
{
gdprintk(XENLOG_ERR, "Domain %hu paging gfn %lx yet no ring "
@@ -1147,13 +1328,13 @@ void p2m_mem_paging_populate(struct domain *d, unsigned long gfn)
/* Fix p2m mapping */
gfn_lock(p2m, gfn, 0);
- mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL);
+ mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
/* Allow only nominated or evicted pages to enter page-in path */
if ( p2mt == p2m_ram_paging_out || p2mt == p2m_ram_paged )
{
/* Evict will fail now, tag this request for pager */
if ( p2mt == p2m_ram_paging_out )
- req.flags |= MEM_EVENT_FLAG_EVICT_FAIL;
+ req.u.mem_paging.flags |= MEM_PAGING_EVICT_FAIL;
p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2m_ram_paging_in, a);
}
@@ -1162,22 +1343,22 @@ void p2m_mem_paging_populate(struct domain *d, unsigned long gfn)
/* Pause domain if request came from guest and gfn has paging type */
if ( p2m_is_paging(p2mt) && v->domain == d )
{
- mem_event_vcpu_pause(v);
- req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED;
+ vm_event_vcpu_pause(v);
+ req.flags |= VM_EVENT_FLAG_VCPU_PAUSED;
}
/* No need to inform pager if the gfn is not in the page-out path */
else if ( p2mt != p2m_ram_paging_out && p2mt != p2m_ram_paged )
{
/* gfn is already on its way back and vcpu is not paused */
- mem_event_cancel_slot(d, &d->mem_event->paging);
+ vm_event_cancel_slot(d, &d->vm_event->paging);
return;
}
/* Send request to pager */
- req.p2mt = p2mt;
+ req.u.mem_paging.p2mt = p2mt;
req.vcpu_id = v->vcpu_id;
- mem_event_put_request(d, &d->mem_event->paging, &req);
+ vm_event_put_request(d, &d->vm_event->paging, &req);
}
/**
@@ -1209,7 +1390,7 @@ int p2m_mem_paging_prep(struct domain *d, unsigned long gfn, uint64_t buffer)
gfn_lock(p2m, gfn, 0);
- mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL);
+ mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
ret = -ENOENT;
/* Allow missing pages */
@@ -1239,7 +1420,7 @@ int p2m_mem_paging_prep(struct domain *d, unsigned long gfn, uint64_t buffer)
int rc;
ASSERT( mfn_valid(mfn) );
- guest_map = map_domain_page(mfn_x(mfn));
+ guest_map = map_domain_page(mfn);
rc = copy_from_user(guest_map, user_ptr, PAGE_SIZE);
unmap_domain_page(guest_map);
if ( rc )
@@ -1269,13 +1450,13 @@ int p2m_mem_paging_prep(struct domain *d, unsigned long gfn, uint64_t buffer)
}
/**
- * p2m_mem_paging_resume - Resume guest gfn and vcpus
+ * p2m_mem_paging_resume - Resume guest gfn
* @d: guest domain
- * @gfn: guest page in paging state
+ * @rsp: vm_event response received
+ *
+ * p2m_mem_paging_resume() will forward the p2mt of a gfn to ram_rw. It is
+ * called by the pager.
*
- * p2m_mem_paging_resume() will forward the p2mt of a gfn to ram_rw and all
- * waiting vcpus will be unpaused again. It is called by the pager.
- *
* The gfn was previously either evicted and populated, or nominated and
* populated. If the page was evicted the p2mt will be p2m_ram_paging_in. If
* the page was just nominated the p2mt will be p2m_ram_paging_in_start because
@@ -1283,51 +1464,37 @@ int p2m_mem_paging_prep(struct domain *d, unsigned long gfn, uint64_t buffer)
*
* If the gfn was dropped the vcpu needs to be unpaused.
*/
-void p2m_mem_paging_resume(struct domain *d)
+
+void p2m_mem_paging_resume(struct domain *d, vm_event_response_t *rsp)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
- mem_event_response_t rsp;
p2m_type_t p2mt;
p2m_access_t a;
mfn_t mfn;
- /* Pull all responses off the ring */
- while( mem_event_get_response(d, &d->mem_event->paging, &rsp) )
+ /* Fix p2m entry if the page was not dropped */
+ if ( !(rsp->u.mem_paging.flags & MEM_PAGING_DROP_PAGE) )
{
- struct vcpu *v;
+ unsigned long gfn = rsp->u.mem_access.gfn;
- if ( rsp.flags & MEM_EVENT_FLAG_DUMMY )
- continue;
-
- /* Validate the vcpu_id in the response. */
- if ( (rsp.vcpu_id >= d->max_vcpus) || !d->vcpu[rsp.vcpu_id] )
- continue;
-
- v = d->vcpu[rsp.vcpu_id];
-
- /* Fix p2m entry if the page was not dropped */
- if ( !(rsp.flags & MEM_EVENT_FLAG_DROP_PAGE) )
+ gfn_lock(p2m, gfn, 0);
+ mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL, NULL);
+ /*
+ * Allow only pages which were prepared properly, or pages which
+ * were nominated but not evicted.
+ */
+ if ( mfn_valid(mfn) && (p2mt == p2m_ram_paging_in) )
{
- gfn_lock(p2m, rsp.gfn, 0);
- mfn = p2m->get_entry(p2m, rsp.gfn, &p2mt, &a, 0, NULL);
- /* Allow only pages which were prepared properly, or pages which
- * were nominated but not evicted */
- if ( mfn_valid(mfn) && (p2mt == p2m_ram_paging_in) )
- {
- p2m_set_entry(p2m, rsp.gfn, mfn, PAGE_ORDER_4K,
- paging_mode_log_dirty(d) ? p2m_ram_logdirty :
- p2m_ram_rw, a);
- set_gpfn_from_mfn(mfn_x(mfn), rsp.gfn);
- }
- gfn_unlock(p2m, rsp.gfn, 0);
+ p2m_set_entry(p2m, gfn, mfn, PAGE_ORDER_4K,
+ paging_mode_log_dirty(d) ? p2m_ram_logdirty :
+ p2m_ram_rw, a);
+ set_gpfn_from_mfn(mfn_x(mfn), gfn);
}
- /* Unpause domain */
- if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
- mem_event_vcpu_unpause(v);
+ gfn_unlock(p2m, gfn, 0);
}
}
-static void p2m_mem_event_fill_regs(mem_event_request_t *req)
+static void p2m_vm_event_fill_regs(vm_event_request_t *req)
{
const struct cpu_user_regs *regs = guest_cpu_user_regs();
struct segment_register seg;
@@ -1337,92 +1504,94 @@ static void p2m_mem_event_fill_regs(mem_event_request_t *req)
/* Architecture-specific vmcs/vmcb bits */
hvm_funcs.save_cpu_ctxt(curr, &ctxt);
- req->x86_regs.rax = regs->eax;
- req->x86_regs.rcx = regs->ecx;
- req->x86_regs.rdx = regs->edx;
- req->x86_regs.rbx = regs->ebx;
- req->x86_regs.rsp = regs->esp;
- req->x86_regs.rbp = regs->ebp;
- req->x86_regs.rsi = regs->esi;
- req->x86_regs.rdi = regs->edi;
-
- req->x86_regs.r8 = regs->r8;
- req->x86_regs.r9 = regs->r9;
- req->x86_regs.r10 = regs->r10;
- req->x86_regs.r11 = regs->r11;
- req->x86_regs.r12 = regs->r12;
- req->x86_regs.r13 = regs->r13;
- req->x86_regs.r14 = regs->r14;
- req->x86_regs.r15 = regs->r15;
-
- req->x86_regs.rflags = regs->eflags;
- req->x86_regs.rip = regs->eip;
-
- req->x86_regs.dr7 = curr->arch.debugreg[7];
- req->x86_regs.cr0 = ctxt.cr0;
- req->x86_regs.cr2 = ctxt.cr2;
- req->x86_regs.cr3 = ctxt.cr3;
- req->x86_regs.cr4 = ctxt.cr4;
-
- req->x86_regs.sysenter_cs = ctxt.sysenter_cs;
- req->x86_regs.sysenter_esp = ctxt.sysenter_esp;
- req->x86_regs.sysenter_eip = ctxt.sysenter_eip;
-
- req->x86_regs.msr_efer = ctxt.msr_efer;
- req->x86_regs.msr_star = ctxt.msr_star;
- req->x86_regs.msr_lstar = ctxt.msr_lstar;
+ req->data.regs.x86.rax = regs->eax;
+ req->data.regs.x86.rcx = regs->ecx;
+ req->data.regs.x86.rdx = regs->edx;
+ req->data.regs.x86.rbx = regs->ebx;
+ req->data.regs.x86.rsp = regs->esp;
+ req->data.regs.x86.rbp = regs->ebp;
+ req->data.regs.x86.rsi = regs->esi;
+ req->data.regs.x86.rdi = regs->edi;
+
+ req->data.regs.x86.r8 = regs->r8;
+ req->data.regs.x86.r9 = regs->r9;
+ req->data.regs.x86.r10 = regs->r10;
+ req->data.regs.x86.r11 = regs->r11;
+ req->data.regs.x86.r12 = regs->r12;
+ req->data.regs.x86.r13 = regs->r13;
+ req->data.regs.x86.r14 = regs->r14;
+ req->data.regs.x86.r15 = regs->r15;
+
+ req->data.regs.x86.rflags = regs->eflags;
+ req->data.regs.x86.rip = regs->eip;
+
+ req->data.regs.x86.dr7 = curr->arch.debugreg[7];
+ req->data.regs.x86.cr0 = ctxt.cr0;
+ req->data.regs.x86.cr2 = ctxt.cr2;
+ req->data.regs.x86.cr3 = ctxt.cr3;
+ req->data.regs.x86.cr4 = ctxt.cr4;
+
+ req->data.regs.x86.sysenter_cs = ctxt.sysenter_cs;
+ req->data.regs.x86.sysenter_esp = ctxt.sysenter_esp;
+ req->data.regs.x86.sysenter_eip = ctxt.sysenter_eip;
+
+ req->data.regs.x86.msr_efer = ctxt.msr_efer;
+ req->data.regs.x86.msr_star = ctxt.msr_star;
+ req->data.regs.x86.msr_lstar = ctxt.msr_lstar;
hvm_get_segment_register(curr, x86_seg_fs, &seg);
- req->x86_regs.fs_base = seg.base;
+ req->data.regs.x86.fs_base = seg.base;
hvm_get_segment_register(curr, x86_seg_gs, &seg);
- req->x86_regs.gs_base = seg.base;
+ req->data.regs.x86.gs_base = seg.base;
hvm_get_segment_register(curr, x86_seg_cs, &seg);
- req->x86_regs.cs_arbytes = seg.attr.bytes;
+ req->data.regs.x86.cs_arbytes = seg.attr.bytes;
}
-void p2m_mem_event_emulate_check(struct vcpu *v, const mem_event_response_t *rsp)
+void p2m_mem_access_emulate_check(struct vcpu *v,
+ const vm_event_response_t *rsp)
{
/* Mark vcpu for skipping one instruction upon rescheduling. */
- if ( rsp->flags & MEM_EVENT_FLAG_EMULATE )
+ if ( rsp->flags & VM_EVENT_FLAG_EMULATE )
{
xenmem_access_t access;
bool_t violation = 1;
+ const struct vm_event_mem_access *data = &rsp->u.mem_access;
- if ( p2m_get_mem_access(v->domain, rsp->gfn, &access) == 0 )
+ if ( p2m_get_mem_access(v->domain, _gfn(data->gfn), &access) == 0 )
{
switch ( access )
{
case XENMEM_access_n:
case XENMEM_access_n2rwx:
default:
- violation = rsp->access_r || rsp->access_w || rsp->access_x;
+ violation = data->flags & MEM_ACCESS_RWX;
break;
case XENMEM_access_r:
- violation = rsp->access_w || rsp->access_x;
+ violation = data->flags & MEM_ACCESS_WX;
break;
case XENMEM_access_w:
- violation = rsp->access_r || rsp->access_x;
+ violation = data->flags & MEM_ACCESS_RX;
break;
case XENMEM_access_x:
- violation = rsp->access_r || rsp->access_w;
+ violation = data->flags & MEM_ACCESS_RW;
break;
case XENMEM_access_rx:
case XENMEM_access_rx2rw:
- violation = rsp->access_w;
+ violation = data->flags & MEM_ACCESS_W;
break;
case XENMEM_access_wx:
- violation = rsp->access_r;
+ violation = data->flags & MEM_ACCESS_R;
break;
case XENMEM_access_rw:
- violation = rsp->access_x;
+ violation = data->flags & MEM_ACCESS_X;
break;
case XENMEM_access_rwx:
@@ -1431,43 +1600,49 @@ void p2m_mem_event_emulate_check(struct vcpu *v, const mem_event_response_t *rsp
}
}
- v->arch.mem_event.emulate_flags = violation ? rsp->flags : 0;
+ v->arch.vm_event.emulate_flags = violation ? rsp->flags : 0;
+
+ if ( (rsp->flags & VM_EVENT_FLAG_SET_EMUL_READ_DATA) &&
+ v->arch.vm_event.emul_read_data )
+ *v->arch.vm_event.emul_read_data = rsp->data.emul_read_data;
}
}
-void p2m_setup_introspection(struct domain *d)
+void p2m_altp2m_check(struct vcpu *v, uint16_t idx)
{
- if ( hvm_funcs.enable_msr_exit_interception )
- {
- d->arch.hvm_domain.introspection_enabled = 1;
- hvm_funcs.enable_msr_exit_interception(d);
- }
+ if ( altp2m_active(v->domain) )
+ p2m_switch_vcpu_altp2m_by_id(v, idx);
}
bool_t p2m_mem_access_check(paddr_t gpa, unsigned long gla,
struct npfec npfec,
- mem_event_request_t **req_ptr)
+ vm_event_request_t **req_ptr)
{
struct vcpu *v = current;
unsigned long gfn = gpa >> PAGE_SHIFT;
struct domain *d = v->domain;
- struct p2m_domain* p2m = p2m_get_hostp2m(d);
+ struct p2m_domain *p2m = NULL;
mfn_t mfn;
p2m_type_t p2mt;
p2m_access_t p2ma;
- mem_event_request_t *req;
+ vm_event_request_t *req;
int rc;
unsigned long eip = guest_cpu_user_regs()->eip;
+ if ( altp2m_active(d) )
+ p2m = p2m_get_altp2m(v);
+ if ( !p2m )
+ p2m = p2m_get_hostp2m(d);
+
/* First, handle rx2rw conversion automatically.
* These calls to p2m->set_entry() must succeed: we have the gfn
* locked and just did a successful get_entry(). */
gfn_lock(p2m, gfn, 0);
- mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL);
+ mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
if ( npfec.write_access && p2ma == p2m_access_rx2rw )
{
- rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt, p2m_access_rw);
+ rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt, p2m_access_rw, -1);
ASSERT(rc == 0);
gfn_unlock(p2m, gfn, 0);
return 1;
@@ -1476,19 +1651,19 @@ bool_t p2m_mem_access_check(paddr_t gpa, unsigned long gla,
{
ASSERT(npfec.write_access || npfec.read_access || npfec.insn_fetch);
rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K,
- p2mt, p2m_access_rwx);
+ p2mt, p2m_access_rwx, -1);
ASSERT(rc == 0);
}
gfn_unlock(p2m, gfn, 0);
/* Otherwise, check if there is a memory event listener, and send the message along */
- if ( !mem_event_check_ring(&d->mem_event->access) || !req_ptr )
+ if ( !vm_event_check_ring(&d->vm_event->monitor) || !req_ptr )
{
/* No listener */
if ( p2m->access_required )
{
gdprintk(XENLOG_INFO, "Memory access permissions failure, "
- "no mem_event listener VCPU %d, dom %d\n",
+ "no vm_event listener VCPU %d, dom %d\n",
v->vcpu_id, d->domain_id);
domain_crash(v->domain);
return 0;
@@ -1496,14 +1671,14 @@ bool_t p2m_mem_access_check(paddr_t gpa, unsigned long gla,
else
{
gfn_lock(p2m, gfn, 0);
- mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL);
+ mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
if ( p2ma != p2m_access_n2rwx )
{
/* A listener is not required, so clear the access
* restrictions. This set must succeed: we have the
* gfn locked and just did a successful get_entry(). */
rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K,
- p2mt, p2m_access_rwx);
+ p2mt, p2m_access_rwx, -1);
ASSERT(rc == 0);
}
gfn_unlock(p2m, gfn, 0);
@@ -1511,75 +1686,96 @@ bool_t p2m_mem_access_check(paddr_t gpa, unsigned long gla,
}
}
- /* The previous mem_event reply does not match the current state. */
- if ( v->arch.mem_event.gpa != gpa || v->arch.mem_event.eip != eip )
+ /* The previous vm_event reply does not match the current state. */
+ if ( v->arch.vm_event.gpa != gpa || v->arch.vm_event.eip != eip )
{
- /* Don't emulate the current instruction, send a new mem_event. */
- v->arch.mem_event.emulate_flags = 0;
+ /* Don't emulate the current instruction, send a new vm_event. */
+ v->arch.vm_event.emulate_flags = 0;
/*
* Make sure to mark the current state to match it again against
- * the new mem_event about to be sent.
+ * the new vm_event about to be sent.
*/
- v->arch.mem_event.gpa = gpa;
- v->arch.mem_event.eip = eip;
+ v->arch.vm_event.gpa = gpa;
+ v->arch.vm_event.eip = eip;
}
- if ( v->arch.mem_event.emulate_flags )
+ if ( v->arch.vm_event.emulate_flags )
{
- hvm_mem_event_emulate_one((v->arch.mem_event.emulate_flags &
- MEM_EVENT_FLAG_EMULATE_NOWRITE) != 0,
- TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+ enum emul_kind kind = EMUL_KIND_NORMAL;
- v->arch.mem_event.emulate_flags = 0;
+ if ( v->arch.vm_event.emulate_flags &
+ VM_EVENT_FLAG_SET_EMUL_READ_DATA )
+ kind = EMUL_KIND_SET_CONTEXT;
+ else if ( v->arch.vm_event.emulate_flags &
+ VM_EVENT_FLAG_EMULATE_NOWRITE )
+ kind = EMUL_KIND_NOWRITE;
+
+ hvm_mem_access_emulate_one(kind, TRAP_invalid_op,
+ HVM_DELIVER_NO_ERROR_CODE);
+
+ v->arch.vm_event.emulate_flags = 0;
return 1;
}
*req_ptr = NULL;
- req = xzalloc(mem_event_request_t);
+ req = xzalloc(vm_event_request_t);
if ( req )
{
*req_ptr = req;
- req->reason = MEM_EVENT_REASON_VIOLATION;
+ req->reason = VM_EVENT_REASON_MEM_ACCESS;
/* Pause the current VCPU */
if ( p2ma != p2m_access_n2rwx )
- req->flags |= MEM_EVENT_FLAG_VCPU_PAUSED;
+ req->flags |= VM_EVENT_FLAG_VCPU_PAUSED;
/* Send request to mem event */
- req->gfn = gfn;
- req->offset = gpa & ((1 << PAGE_SHIFT) - 1);
- req->gla_valid = npfec.gla_valid;
- req->gla = gla;
- if ( npfec.kind == npfec_kind_with_gla )
- req->fault_with_gla = 1;
- else if ( npfec.kind == npfec_kind_in_gpt )
- req->fault_in_gpt = 1;
- req->access_r = npfec.read_access;
- req->access_w = npfec.write_access;
- req->access_x = npfec.insn_fetch;
+ req->u.mem_access.gfn = gfn;
+ req->u.mem_access.offset = gpa & ((1 << PAGE_SHIFT) - 1);
+ if ( npfec.gla_valid )
+ {
+ req->u.mem_access.flags |= MEM_ACCESS_GLA_VALID;
+ req->u.mem_access.gla = gla;
+
+ if ( npfec.kind == npfec_kind_with_gla )
+ req->u.mem_access.flags |= MEM_ACCESS_FAULT_WITH_GLA;
+ else if ( npfec.kind == npfec_kind_in_gpt )
+ req->u.mem_access.flags |= MEM_ACCESS_FAULT_IN_GPT;
+ }
+ req->u.mem_access.flags |= npfec.read_access ? MEM_ACCESS_R : 0;
+ req->u.mem_access.flags |= npfec.write_access ? MEM_ACCESS_W : 0;
+ req->u.mem_access.flags |= npfec.insn_fetch ? MEM_ACCESS_X : 0;
req->vcpu_id = v->vcpu_id;
- p2m_mem_event_fill_regs(req);
+ p2m_vm_event_fill_regs(req);
+
+ if ( altp2m_active(v->domain) )
+ {
+ req->flags |= VM_EVENT_FLAG_ALTERNATE_P2M;
+ req->altp2m_idx = vcpu_altp2m(v).p2midx;
+ }
}
/* Pause the current VCPU */
if ( p2ma != p2m_access_n2rwx )
- mem_event_vcpu_pause(v);
+ vm_event_vcpu_pause(v);
/* VCPU may be paused, return whether we promoted automatically */
return (p2ma == p2m_access_n2rwx);
}
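The request-building code above translates the npfec fault descriptor into
MEM_ACCESS_* bits, with the GLA-related bits only meaningful when gla_valid
is set. A standalone sketch of that packing (bit values invented for
illustration):

    #include <stdio.h>

    #define ACCESS_R         (1 << 0)
    #define ACCESS_W         (1 << 1)
    #define ACCESS_X         (1 << 2)
    #define ACCESS_GLA_VALID (1 << 3)

    struct fault_model {
        unsigned int read_access:1, write_access:1,
                     insn_fetch:1, gla_valid:1;
    };

    static unsigned int pack_flags(struct fault_model f)
    {
        unsigned int flags = 0;

        flags |= f.read_access  ? ACCESS_R : 0;
        flags |= f.write_access ? ACCESS_W : 0;
        flags |= f.insn_fetch   ? ACCESS_X : 0;
        if ( f.gla_valid )
            flags |= ACCESS_GLA_VALID;  /* the gla field is valid too */
        return flags;
    }

    int main(void)
    {
        struct fault_model wr = { .write_access = 1, .gla_valid = 1 };

        printf("flags=%#x\n", pack_flags(wr));  /* prints flags=0xa */
        return 0;
    }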
-/* Set access type for a region of pfns.
- * If start_pfn == -1ul, sets the default access type */
-long p2m_set_mem_access(struct domain *d, unsigned long pfn, uint32_t nr,
+/*
+ * Set access type for a region of gfns.
+ * If gfn == INVALID_GFN, sets the default access type.
+ */
+long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
uint32_t start, uint32_t mask, xenmem_access_t access)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
p2m_access_t a, _a;
p2m_type_t t;
mfn_t mfn;
+ unsigned long gfn_l;
long rc = 0;
static const p2m_access_t memaccess[] = {
@@ -1609,18 +1805,18 @@ long p2m_set_mem_access(struct domain *d, unsigned long pfn, uint32_t nr,
return -EINVAL;
}
- /* If request to set default access */
- if ( pfn == ~0ul )
+ /* If request to set default access. */
+ if ( gfn_x(gfn) == INVALID_GFN )
{
p2m->default_access = a;
return 0;
}
p2m_lock(p2m);
- for ( pfn += start; nr > start; ++pfn )
+ for ( gfn_l = gfn_x(gfn) + start; nr > start; ++gfn_l )
{
- mfn = p2m->get_entry(p2m, pfn, &t, &_a, 0, NULL);
- rc = p2m->set_entry(p2m, pfn, mfn, PAGE_ORDER_4K, t, a);
+ mfn = p2m->get_entry(p2m, gfn_l, &t, &_a, 0, NULL, NULL);
+ rc = p2m->set_entry(p2m, gfn_l, mfn, PAGE_ORDER_4K, t, a, -1);
if ( rc )
break;
@@ -1635,10 +1831,11 @@ long p2m_set_mem_access(struct domain *d, unsigned long pfn, uint32_t nr,
return rc;
}
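The memaccess[] table above uses designated initializers so each
XENMEM_access_* index is paired with its p2m_access_* value by name rather
than by position. A compilable miniature with shortened, hypothetical
enums:

    #include <stdio.h>

    enum xa { XA_n, XA_r, XA_w, XA_rw, XA_COUNT };
    enum pa { PA_n, PA_r, PA_w, PA_rw };

    /* The ACCESS() macro keeps the pairing by name, so reordering the
     * enum values cannot silently map XA_r to anything but PA_r. */
    #define ACCESS(ac) [XA_##ac] = PA_##ac
    static const enum pa memaccess[XA_COUNT] = {
        ACCESS(n), ACCESS(r), ACCESS(w), ACCESS(rw),
    };
    #undef ACCESS

    int main(void)
    {
        printf("%d\n", memaccess[XA_rw]);  /* prints 3, i.e. PA_rw */
        return 0;
    }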
-/* Get access type for a pfn
- * If pfn == -1ul, gets the default access type */
-int p2m_get_mem_access(struct domain *d, unsigned long pfn,
- xenmem_access_t *access)
+/*
+ * Get access type for a gfn.
+ * If gfn == INVALID_GFN, gets the default access type.
+ */
+int p2m_get_mem_access(struct domain *d, gfn_t gfn, xenmem_access_t *access)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
p2m_type_t t;
@@ -1660,15 +1857,15 @@ int p2m_get_mem_access(struct domain *d, unsigned long pfn,
#undef ACCESS
};
- /* If request to get default access */
- if ( pfn == ~0ull )
+ /* If request to get default access. */
+ if ( gfn_x(gfn) == INVALID_GFN )
{
*access = memaccess[p2m->default_access];
return 0;
}
gfn_lock(p2m, gfn, 0);
- mfn = p2m->get_entry(p2m, pfn, &t, &a, 0, NULL);
+ mfn = p2m->get_entry(p2m, gfn_x(gfn), &t, &a, 0, NULL, NULL);
gfn_unlock(p2m, gfn, 0);
if ( mfn_x(mfn) == INVALID_MFN )
@@ -1708,11 +1905,17 @@ p2m_flush_table(struct p2m_domain *p2m)
/* "Host" p2m tables can have shared entries &c that need a bit more
* care when discarding them */
- ASSERT(p2m_is_nestedp2m(p2m));
+ ASSERT(!p2m_is_hostp2m(p2m));
/* Nested p2m's do not do PoD, hence the asserts (and no PoD lock) */
ASSERT(page_list_empty(&p2m->pod.super));
ASSERT(page_list_empty(&p2m->pod.single));
+ if ( p2m->np2m_base == P2M_BASE_EADDR )
+ {
+ p2m_unlock(p2m);
+ return;
+ }
+
/* This is no longer a valid nested p2m for any address space */
p2m->np2m_base = P2M_BASE_EADDR;
@@ -1822,9 +2025,7 @@ unsigned long paging_gva_to_gfn(struct vcpu *v,
struct p2m_domain *hostp2m = p2m_get_hostp2m(v->domain);
const struct paging_mode *hostmode = paging_get_hostmode(v);
- if ( is_hvm_domain(v->domain)
- && paging_mode_hap(v->domain)
- && nestedhvm_is_n2(v) )
+ if ( is_hvm_vcpu(v) && paging_mode_hap(v->domain) && nestedhvm_is_n2(v) )
{
unsigned long gfn;
struct p2m_domain *p2m;
@@ -1858,7 +2059,8 @@ int map_mmio_regions(struct domain *d,
for ( i = 0; !ret && i < nr; i++ )
{
- ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i));
+ ret = set_mmio_p2m_entry(d, start_gfn + i, _mfn(mfn + i),
+ p2m_get_hostp2m(d)->default_access);
if ( ret )
{
unmap_mmio_regions(d, start_gfn, i, mfn);
@@ -1890,6 +2092,497 @@ int unmap_mmio_regions(struct domain *d,
return err;
}
+unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
+{
+ struct p2m_domain *p2m;
+ struct ept_data *ept;
+ unsigned int i;
+
+ altp2m_list_lock(d);
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ {
+ if ( d->arch.altp2m_eptp[i] == INVALID_MFN )
+ continue;
+
+ p2m = d->arch.altp2m_p2m[i];
+ ept = &p2m->ept;
+
+ if ( eptp == ept_get_eptp(ept) )
+ goto out;
+ }
+
+ i = INVALID_ALTP2M;
+
+ out:
+ altp2m_list_unlock(d);
+ return i;
+}
+
+bool_t p2m_switch_vcpu_altp2m_by_id(struct vcpu *v, unsigned int idx)
+{
+ struct domain *d = v->domain;
+ bool_t rc = 0;
+
+ if ( idx >= MAX_ALTP2M )
+ return rc;
+
+ altp2m_list_lock(d);
+
+ if ( d->arch.altp2m_eptp[idx] != INVALID_MFN )
+ {
+ if ( idx != vcpu_altp2m(v).p2midx )
+ {
+ atomic_dec(&p2m_get_altp2m(v)->active_vcpus);
+ vcpu_altp2m(v).p2midx = idx;
+ atomic_inc(&p2m_get_altp2m(v)->active_vcpus);
+ altp2m_vcpu_update_p2m(v);
+ }
+ rc = 1;
+ }
+
+ altp2m_list_unlock(d);
+ return rc;
+}
+
+/*
+ * If the fault is for a not-present entry:
+ *     if the entry in the host p2m has a valid mfn, copy it and retry,
+ *     else indicate that the outer handler should handle the fault.
+ *
+ * If the fault is for a present entry:
+ *     indicate that the outer handler should handle the fault.
+ */
+
+bool_t p2m_altp2m_lazy_copy(struct vcpu *v, paddr_t gpa,
+ unsigned long gla, struct npfec npfec,
+ struct p2m_domain **ap2m)
+{
+ struct p2m_domain *hp2m = p2m_get_hostp2m(v->domain);
+ p2m_type_t p2mt;
+ p2m_access_t p2ma;
+ unsigned int page_order;
+ gfn_t gfn = _gfn(paddr_to_pfn(gpa));
+ unsigned long mask;
+ mfn_t mfn;
+ int rv;
+
+ *ap2m = p2m_get_altp2m(v);
+
+ mfn = get_gfn_type_access(*ap2m, gfn_x(gfn), &p2mt, &p2ma,
+ 0, &page_order);
+ __put_gfn(*ap2m, gfn_x(gfn));
+
+ if ( mfn_x(mfn) != INVALID_MFN )
+ return 0;
+
+ mfn = get_gfn_type_access(hp2m, gfn_x(gfn), &p2mt, &p2ma,
+ P2M_ALLOC | P2M_UNSHARE, &page_order);
+ __put_gfn(hp2m, gfn_x(gfn));
+
+ if ( mfn_x(mfn) == INVALID_MFN )
+ return 0;
+
+ p2m_lock(*ap2m);
+
+ /*
+ * If this is a superpage mapping, round down both frame numbers
+ * to the start of the superpage.
+ */
+ mask = ~((1UL << page_order) - 1);
+ mfn = _mfn(mfn_x(mfn) & mask);
+
+ rv = p2m_set_entry(*ap2m, gfn_x(gfn) & mask, mfn, page_order, p2mt, p2ma);
+ p2m_unlock(*ap2m);
+
+ if ( rv )
+ {
+ gdprintk(XENLOG_ERR,
+ "failed to set entry for %#"PRIx64" -> %#"PRIx64" p2m %#"PRIx64"\n",
+ gfn_x(gfn), mfn_x(mfn), (unsigned long)*ap2m);
+ domain_crash(hp2m->domain);
+ }
+
+ return 1;
+}
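The superpage path above rounds both frame numbers down with
mask = ~((1UL << page_order) - 1) so the copied mapping starts on a
boundary aligned to its order. A tiny standalone check of that arithmetic:

    #include <stdio.h>

    /* Round a frame number down to the start of its 2^order-frame page. */
    static unsigned long round_to_order(unsigned long frame,
                                        unsigned int order)
    {
        unsigned long mask = ~((1UL << order) - 1);

        return frame & mask;
    }

    int main(void)
    {
        /* order 9: a 2M superpage made of 512 4K frames */
        printf("%#lx\n", round_to_order(0x12345, 9));  /* 0x12200 */
        return 0;
    }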
+
+void p2m_flush_altp2m(struct domain *d)
+{
+ unsigned int i;
+
+ altp2m_list_lock(d);
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ {
+ p2m_flush_table(d->arch.altp2m_p2m[i]);
+ /* Uninit and reinit ept to force TLB shootdown */
+ ept_p2m_uninit(d->arch.altp2m_p2m[i]);
+ ept_p2m_init(d->arch.altp2m_p2m[i]);
+ d->arch.altp2m_eptp[i] = INVALID_MFN;
+ }
+
+ altp2m_list_unlock(d);
+}
+
+static void p2m_init_altp2m_helper(struct domain *d, unsigned int i)
+{
+ struct p2m_domain *p2m = d->arch.altp2m_p2m[i];
+ struct ept_data *ept;
+
+ p2m->min_remapped_gfn = INVALID_GFN;
+ p2m->max_remapped_gfn = 0;
+ ept = &p2m->ept;
+ ept->asr = pagetable_get_pfn(p2m_get_pagetable(p2m));
+ d->arch.altp2m_eptp[i] = ept_get_eptp(ept);
+}
+
+int p2m_init_altp2m_by_id(struct domain *d, unsigned int idx)
+{
+ int rc = -EINVAL;
+
+ if ( idx >= MAX_ALTP2M )
+ return rc;
+
+ altp2m_list_lock(d);
+
+ if ( d->arch.altp2m_eptp[idx] == INVALID_MFN )
+ {
+ p2m_init_altp2m_helper(d, idx);
+ rc = 0;
+ }
+
+ altp2m_list_unlock(d);
+ return rc;
+}
+
+int p2m_init_next_altp2m(struct domain *d, uint16_t *idx)
+{
+ int rc = -EINVAL;
+ unsigned int i;
+
+ altp2m_list_lock(d);
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ {
+ if ( d->arch.altp2m_eptp[i] != INVALID_MFN )
+ continue;
+
+ p2m_init_altp2m_helper(d, i);
+ *idx = i;
+ rc = 0;
+
+ break;
+ }
+
+ altp2m_list_unlock(d);
+ return rc;
+}
+
+int p2m_destroy_altp2m_by_id(struct domain *d, unsigned int idx)
+{
+ struct p2m_domain *p2m;
+ int rc = -EBUSY;
+
+ if ( !idx || idx >= MAX_ALTP2M )
+ return rc;
+
+ domain_pause_except_self(d);
+
+ altp2m_list_lock(d);
+
+ if ( d->arch.altp2m_eptp[idx] != INVALID_MFN )
+ {
+ p2m = d->arch.altp2m_p2m[idx];
+
+ if ( !_atomic_read(p2m->active_vcpus) )
+ {
+ p2m_flush_table(d->arch.altp2m_p2m[idx]);
+ /* Uninit and reinit ept to force TLB shootdown */
+ ept_p2m_uninit(d->arch.altp2m_p2m[idx]);
+ ept_p2m_init(d->arch.altp2m_p2m[idx]);
+ d->arch.altp2m_eptp[idx] = INVALID_MFN;
+ rc = 0;
+ }
+ }
+
+ altp2m_list_unlock(d);
+
+ domain_unpause_except_self(d);
+
+ return rc;
+}
+
+int p2m_switch_domain_altp2m_by_id(struct domain *d, unsigned int idx)
+{
+ struct vcpu *v;
+ int rc = -EINVAL;
+
+ if ( idx >= MAX_ALTP2M )
+ return rc;
+
+ domain_pause_except_self(d);
+
+ altp2m_list_lock(d);
+
+ if ( d->arch.altp2m_eptp[idx] != INVALID_MFN )
+ {
+ for_each_vcpu( d, v )
+ if ( idx != vcpu_altp2m(v).p2midx )
+ {
+ atomic_dec(&p2m_get_altp2m(v)->active_vcpus);
+ vcpu_altp2m(v).p2midx = idx;
+ atomic_inc(&p2m_get_altp2m(v)->active_vcpus);
+ altp2m_vcpu_update_p2m(v);
+ }
+
+ rc = 0;
+ }
+
+ altp2m_list_unlock(d);
+
+ domain_unpause_except_self(d);
+
+ return rc;
+}
+
+int p2m_set_altp2m_mem_access(struct domain *d, unsigned int idx,
+ gfn_t gfn, xenmem_access_t access)
+{
+ struct p2m_domain *hp2m, *ap2m;
+ p2m_access_t req_a, old_a;
+ p2m_type_t t;
+ mfn_t mfn;
+ unsigned int page_order;
+ int rc = -EINVAL;
+
+ static const p2m_access_t memaccess[] = {
+#define ACCESS(ac) [XENMEM_access_##ac] = p2m_access_##ac
+ ACCESS(n),
+ ACCESS(r),
+ ACCESS(w),
+ ACCESS(rw),
+ ACCESS(x),
+ ACCESS(rx),
+ ACCESS(wx),
+ ACCESS(rwx),
+#undef ACCESS
+ };
+
+ if ( idx >= MAX_ALTP2M || d->arch.altp2m_eptp[idx] == INVALID_MFN )
+ return rc;
+
+ ap2m = d->arch.altp2m_p2m[idx];
+
+ switch ( access )
+ {
+ case 0 ... ARRAY_SIZE(memaccess) - 1:
+ req_a = memaccess[access];
+ break;
+ case XENMEM_access_default:
+ req_a = ap2m->default_access;
+ break;
+ default:
+ return rc;
+ }
+
+ /* If request to set default access */
+ if ( gfn_x(gfn) == INVALID_GFN )
+ {
+ ap2m->default_access = req_a;
+ return 0;
+ }
+
+ hp2m = p2m_get_hostp2m(d);
+
+ p2m_lock(ap2m);
+
+ mfn = ap2m->get_entry(ap2m, gfn_x(gfn), &t, &old_a, 0, NULL, NULL);
+
+ /* Check host p2m if no valid entry in alternate */
+ if ( !mfn_valid(mfn) )
+ {
+ mfn = hp2m->get_entry(hp2m, gfn_x(gfn), &t, &old_a,
+ P2M_ALLOC | P2M_UNSHARE, &page_order, NULL);
+
+ if ( !mfn_valid(mfn) || t != p2m_ram_rw )
+ goto out;
+
+ /* If this is a superpage, copy that first */
+ if ( page_order != PAGE_ORDER_4K )
+ {
+ gfn_t gfn2;
+ unsigned long mask;
+ mfn_t mfn2;
+
+ mask = ~((1UL << page_order) - 1);
+ gfn2 = _gfn(gfn_x(gfn) & mask);
+ mfn2 = _mfn(mfn_x(mfn) & mask);
+
+ if ( ap2m->set_entry(ap2m, gfn_x(gfn2), mfn2, page_order, t, old_a, 1) )
+ goto out;
+ }
+ }
+
+ if ( !ap2m->set_entry(ap2m, gfn_x(gfn), mfn, PAGE_ORDER_4K, t, req_a,
+ (current->domain != d)) )
+ rc = 0;
+
+ out:
+ p2m_unlock(ap2m);
+ return rc;
+}
+
+int p2m_change_altp2m_gfn(struct domain *d, unsigned int idx,
+ gfn_t old_gfn, gfn_t new_gfn)
+{
+ struct p2m_domain *hp2m, *ap2m;
+ p2m_access_t a;
+ p2m_type_t t;
+ mfn_t mfn;
+ unsigned int page_order;
+ int rc = -EINVAL;
+
+ if ( idx >= MAX_ALTP2M || d->arch.altp2m_eptp[idx] == INVALID_MFN )
+ return rc;
+
+ hp2m = p2m_get_hostp2m(d);
+ ap2m = d->arch.altp2m_p2m[idx];
+
+ p2m_lock(ap2m);
+
+ mfn = ap2m->get_entry(ap2m, gfn_x(old_gfn), &t, &a, 0, NULL, NULL);
+
+ if ( gfn_x(new_gfn) == INVALID_GFN )
+ {
+ if ( mfn_valid(mfn) )
+ p2m_remove_page(ap2m, gfn_x(old_gfn), mfn_x(mfn), PAGE_ORDER_4K);
+ rc = 0;
+ goto out;
+ }
+
+ /* Check host p2m if no valid entry in alternate */
+ if ( !mfn_valid(mfn) )
+ {
+ mfn = hp2m->get_entry(hp2m, gfn_x(old_gfn), &t, &a,
+ P2M_ALLOC | P2M_UNSHARE, &page_order, NULL);
+
+ if ( !mfn_valid(mfn) || t != p2m_ram_rw )
+ goto out;
+
+ /* If this is a superpage, copy that first */
+ if ( page_order != PAGE_ORDER_4K )
+ {
+ gfn_t gfn;
+ unsigned long mask;
+
+ mask = ~((1UL << page_order) - 1);
+ gfn = _gfn(gfn_x(old_gfn) & mask);
+ mfn = _mfn(mfn_x(mfn) & mask);
+
+ if ( ap2m->set_entry(ap2m, gfn_x(gfn), mfn, page_order, t, a, 1) )
+ goto out;
+ }
+ }
+
+ mfn = ap2m->get_entry(ap2m, gfn_x(new_gfn), &t, &a, 0, NULL, NULL);
+
+ if ( !mfn_valid(mfn) )
+ mfn = hp2m->get_entry(hp2m, gfn_x(new_gfn), &t, &a, 0, NULL, NULL);
+
+ if ( !mfn_valid(mfn) || (t != p2m_ram_rw) )
+ goto out;
+
+ if ( !ap2m->set_entry(ap2m, gfn_x(old_gfn), mfn, PAGE_ORDER_4K, t, a,
+ (current->domain != d)) )
+ {
+ rc = 0;
+
+ if ( gfn_x(new_gfn) < ap2m->min_remapped_gfn )
+ ap2m->min_remapped_gfn = gfn_x(new_gfn);
+ if ( gfn_x(new_gfn) > ap2m->max_remapped_gfn )
+ ap2m->max_remapped_gfn = gfn_x(new_gfn);
+ }
+
+ out:
+ p2m_unlock(ap2m);
+ return rc;
+}
+
+static void p2m_reset_altp2m(struct p2m_domain *p2m)
+{
+ p2m_flush_table(p2m);
+ /* Uninit and reinit ept to force TLB shootdown */
+ ept_p2m_uninit(p2m);
+ ept_p2m_init(p2m);
+ p2m->min_remapped_gfn = INVALID_GFN;
+ p2m->max_remapped_gfn = 0;
+}
+
+void p2m_altp2m_propagate_change(struct domain *d, gfn_t gfn,
+ mfn_t mfn, unsigned int page_order,
+ p2m_type_t p2mt, p2m_access_t p2ma)
+{
+ struct p2m_domain *p2m;
+ p2m_access_t a;
+ p2m_type_t t;
+ mfn_t m;
+ unsigned int i;
+ unsigned int reset_count = 0;
+ unsigned int last_reset_idx = ~0;
+
+ if ( !altp2m_active(d) )
+ return;
+
+ altp2m_list_lock(d);
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ {
+ if ( d->arch.altp2m_eptp[i] == INVALID_MFN )
+ continue;
+
+ p2m = d->arch.altp2m_p2m[i];
+ m = get_gfn_type_access(p2m, gfn_x(gfn), &t, &a, 0, NULL);
+
+ /* Check for a dropped page that may impact this altp2m */
+ if ( mfn_x(mfn) == INVALID_MFN &&
+ gfn_x(gfn) >= p2m->min_remapped_gfn &&
+ gfn_x(gfn) <= p2m->max_remapped_gfn )
+ {
+ if ( !reset_count++ )
+ {
+ p2m_reset_altp2m(p2m);
+ last_reset_idx = i;
+ }
+ else
+ {
+ /* At least 2 altp2m's impacted, so reset everything */
+ __put_gfn(p2m, gfn_x(gfn));
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ {
+ if ( i == last_reset_idx ||
+ d->arch.altp2m_eptp[i] == INVALID_MFN )
+ continue;
+
+ p2m = d->arch.altp2m_p2m[i];
+ p2m_lock(p2m);
+ p2m_reset_altp2m(p2m);
+ p2m_unlock(p2m);
+ }
+
+ goto out;
+ }
+ }
+ else if ( mfn_x(m) != INVALID_MFN )
+ p2m_set_entry(p2m, gfn_x(gfn), mfn, page_order, p2mt, p2ma);
+
+ __put_gfn(p2m, gfn_x(gfn));
+ }
+
+ out:
+ altp2m_list_unlock(d);
+}
+
/*** Audit ***/
#if P2M_AUDIT
diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index 6b788f7..5becee8 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/init.h>
@@ -81,7 +80,7 @@ static mfn_t paging_new_log_dirty_leaf(struct domain *d)
mfn_t mfn = paging_new_log_dirty_page(d);
if ( mfn_valid(mfn) )
{
- void *leaf = map_domain_page(mfn_x(mfn));
+ void *leaf = map_domain_page(mfn);
clear_page(leaf);
unmap_domain_page(leaf);
}
@@ -95,7 +94,7 @@ static mfn_t paging_new_log_dirty_node(struct domain *d)
if ( mfn_valid(mfn) )
{
int i;
- mfn_t *node = map_domain_page(mfn_x(mfn));
+ mfn_t *node = map_domain_page(mfn);
for ( i = 0; i < LOGDIRTY_NODE_ENTRIES; i++ )
node[i] = _mfn(INVALID_MFN);
unmap_domain_page(node);
@@ -107,7 +106,7 @@ static mfn_t paging_new_log_dirty_node(struct domain *d)
static mfn_t *paging_map_log_dirty_bitmap(struct domain *d)
{
if ( likely(mfn_valid(d->arch.paging.log_dirty.top)) )
- return map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+ return map_domain_page(d->arch.paging.log_dirty.top);
return NULL;
}
@@ -144,7 +143,7 @@ static int paging_free_log_dirty_bitmap(struct domain *d, int rc)
return -EBUSY;
}
- l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+ l4 = map_domain_page(d->arch.paging.log_dirty.top);
i4 = d->arch.paging.preempt.log_dirty.i4;
i3 = d->arch.paging.preempt.log_dirty.i3;
rc = 0;
@@ -154,14 +153,14 @@ static int paging_free_log_dirty_bitmap(struct domain *d, int rc)
if ( !mfn_valid(l4[i4]) )
continue;
- l3 = map_domain_page(mfn_x(l4[i4]));
+ l3 = map_domain_page(l4[i4]);
for ( ; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
{
if ( !mfn_valid(l3[i3]) )
continue;
- l2 = map_domain_page(mfn_x(l3[i3]));
+ l2 = map_domain_page(l3[i3]);
for ( i2 = 0; i2 < LOGDIRTY_NODE_ENTRIES; i2++ )
if ( mfn_valid(l2[i2]) )
@@ -266,24 +265,17 @@ static int paging_log_dirty_disable(struct domain *d, bool_t resuming)
return ret;
}
-/* Mark a page as dirty */
-void paging_mark_dirty(struct domain *d, unsigned long guest_mfn)
+/* Mark a page as dirty, taking a guest pfn as parameter */
+void paging_mark_gfn_dirty(struct domain *d, unsigned long pfn)
{
- unsigned long pfn;
- mfn_t gmfn;
int changed;
mfn_t mfn, *l4, *l3, *l2;
unsigned long *l1;
int i1, i2, i3, i4;
- gmfn = _mfn(guest_mfn);
-
- if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) ||
- page_get_owner(mfn_to_page(gmfn)) != d )
+ if ( !paging_mode_log_dirty(d) )
return;
- /* We /really/ mean PFN here, even for non-translated guests. */
- pfn = get_gpfn_from_mfn(mfn_x(gmfn));
/* Shared MFNs should NEVER be marked dirty */
BUG_ON(SHARED_M2P(pfn));
@@ -318,7 +310,7 @@ void paging_mark_dirty(struct domain *d, unsigned long guest_mfn)
if ( !mfn_valid(mfn) )
goto out;
- l3 = map_domain_page(mfn_x(mfn));
+ l3 = map_domain_page(mfn);
mfn = l3[i3];
if ( !mfn_valid(mfn) )
l3[i3] = mfn = paging_new_log_dirty_node(d);
@@ -326,7 +318,7 @@ void paging_mark_dirty(struct domain *d, unsigned long guest_mfn)
if ( !mfn_valid(mfn) )
goto out;
- l2 = map_domain_page(mfn_x(mfn));
+ l2 = map_domain_page(mfn);
mfn = l2[i2];
if ( !mfn_valid(mfn) )
l2[i2] = mfn = paging_new_log_dirty_leaf(d);
@@ -334,7 +326,7 @@ void paging_mark_dirty(struct domain *d, unsigned long guest_mfn)
if ( !mfn_valid(mfn) )
goto out;
- l1 = map_domain_page(mfn_x(mfn));
+ l1 = map_domain_page(mfn);
changed = !__test_and_set_bit(i1, l1);
unmap_domain_page(l1);
if ( changed )
@@ -351,6 +343,24 @@ out:
return;
}
+/* Mark a page as dirty */
+void paging_mark_dirty(struct domain *d, unsigned long guest_mfn)
+{
+ unsigned long pfn;
+ mfn_t gmfn;
+
+ gmfn = _mfn(guest_mfn);
+
+ if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) ||
+ page_get_owner(mfn_to_page(gmfn)) != d )
+ return;
+
+ /* We /really/ mean PFN here, even for non-translated guests. */
+ pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+ paging_mark_gfn_dirty(d, pfn);
+}
+
/* Is this guest page dirty? */
int paging_mfn_is_dirty(struct domain *d, mfn_t gmfn)
@@ -373,25 +383,25 @@ int paging_mfn_is_dirty(struct domain *d, mfn_t gmfn)
if ( !mfn_valid(mfn) )
return 0;
- l4 = map_domain_page(mfn_x(mfn));
+ l4 = map_domain_page(mfn);
mfn = l4[L4_LOGDIRTY_IDX(pfn)];
unmap_domain_page(l4);
if ( !mfn_valid(mfn) )
return 0;
- l3 = map_domain_page(mfn_x(mfn));
+ l3 = map_domain_page(mfn);
mfn = l3[L3_LOGDIRTY_IDX(pfn)];
unmap_domain_page(l3);
if ( !mfn_valid(mfn) )
return 0;
- l2 = map_domain_page(mfn_x(mfn));
+ l2 = map_domain_page(mfn);
mfn = l2[L2_LOGDIRTY_IDX(pfn)];
unmap_domain_page(l2);
if ( !mfn_valid(mfn) )
return 0;
- l1 = map_domain_page(mfn_x(mfn));
+ l1 = map_domain_page(mfn);
rv = test_bit(L1_LOGDIRTY_IDX(pfn), l1);
unmap_domain_page(l1);
return rv;
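The walk above indexes a four-level trie: one bit per pfn in the leaf
pages, with interior nodes fanning out by LOGDIRTY_NODE_ENTRIES. A
standalone sketch of the index arithmetic (assuming 4K pages and 512-entry
nodes; the real L*_LOGDIRTY_IDX macros live in the headers):

    #include <stdio.h>

    #define LEAF_BITS    (4096UL * 8)  /* bits per leaf page */
    #define NODE_ENTRIES 512UL         /* entries per interior node */

    int main(void)
    {
        unsigned long pfn = 0x123456;
        unsigned long i1 = pfn % LEAF_BITS;
        unsigned long i2 = (pfn / LEAF_BITS) % NODE_ENTRIES;
        unsigned long i3 = (pfn / (LEAF_BITS * NODE_ENTRIES))
                           % NODE_ENTRIES;
        unsigned long i4 = pfn / (LEAF_BITS * NODE_ENTRIES * NODE_ENTRIES);

        /* prints l4=0 l3=0 l2=36 bit=13398 */
        printf("l4=%lu l3=%lu l2=%lu bit=%lu\n", i4, i3, i2, i1);
        return 0;
    }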
@@ -411,7 +421,17 @@ static int paging_log_dirty_op(struct domain *d,
int i4, i3, i2;
if ( !resuming )
+ {
domain_pause(d);
+
+ /*
+ * Flush dirty GFNs potentially cached by hardware. Only need to flush
+ * when not resuming, as domain was paused in resuming case therefore
+ * it's not possible to have any new dirty pages.
+ */
+ p2m_flush_hardware_cached_dirty(d);
+ }
+
paging_lock(d);
if ( !d->arch.paging.preempt.dom )
@@ -455,18 +475,18 @@ static int paging_log_dirty_op(struct domain *d,
for ( ; (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES); i4++, i3 = 0 )
{
- l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+ l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(l4[i4]) : NULL;
for ( ; (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES); i3++ )
{
l2 = ((l3 && mfn_valid(l3[i3])) ?
- map_domain_page(mfn_x(l3[i3])) : NULL);
+ map_domain_page(l3[i3]) : NULL);
for ( i2 = 0;
(pages < sc->pages) && (i2 < LOGDIRTY_NODE_ENTRIES);
i2++ )
{
unsigned int bytes = PAGE_SIZE;
l1 = ((l2 && mfn_valid(l2[i2])) ?
- map_domain_page(mfn_x(l2[i2])) : NULL);
+ map_domain_page(l2[i2]) : NULL);
if ( unlikely(((sc->pages - pages + 7) >> 3) < bytes) )
bytes = (unsigned int)((sc->pages - pages + 7) >> 3);
if ( likely(peek) )
@@ -635,16 +655,16 @@ int paging_domain_init(struct domain *d, unsigned int domcr_flags)
* don't want to leak any active log-dirty bitmaps */
d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
- /* The order of the *_init calls below is important, as the later
- * ones may rewrite some common fields. Shadow pagetables are the
- * default... */
- shadow_domain_init(d, domcr_flags);
-
- /* ... but we will use hardware assistance if it's available. */
+ /*
+ * Shadow pagetables are the default, but we will use
+ * hardware assistance if it's available and enabled.
+ */
if ( hap_enabled(d) )
hap_domain_init(d);
+ else
+ rc = shadow_domain_init(d, domcr_flags);
- return 0;
+ return rc;
}
/* vcpu paging struct initialization goes here */
@@ -745,7 +765,7 @@ long paging_domctl_continuation(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
if ( op.interface_version != XEN_DOMCTL_INTERFACE_VERSION ||
op.cmd != XEN_DOMCTL_shadow_op )
- return -EBADRQC;
+ return -EOPNOTSUPP;
d = rcu_lock_domain_by_id(op.domain);
if ( d == NULL )
@@ -779,12 +799,15 @@ long paging_domctl_continuation(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
/* Call when destroying a domain */
int paging_teardown(struct domain *d)
{
- int rc;
+ int rc, preempted = 0;
if ( hap_enabled(d) )
- hap_teardown(d);
+ hap_teardown(d, &preempted);
else
- shadow_teardown(d);
+ shadow_teardown(d, &preempted);
+
+ if ( preempted )
+ return -ERESTART;
/* clean up log dirty resources. */
rc = paging_free_log_dirty_bitmap(d, 0);
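hap_teardown()/shadow_teardown() now report preemption instead of always
running to completion, and paging_teardown() surfaces that as -ERESTART so
the operation gets re-issued. A toy model of that chunked-work pattern:

    #include <stdio.h>

    /* Tear down state in bounded chunks; set *preempted when work
     * remains, so the caller can return -ERESTART and be re-invoked. */
    static void teardown_chunk(int *remaining, int *preempted)
    {
        int budget = 4;              /* work permitted per invocation */

        while ( *remaining && budget-- )
            --*remaining;
        *preempted = (*remaining != 0);
    }

    int main(void)
    {
        int remaining = 10, preempted, calls = 0;

        do {
            teardown_chunk(&remaining, &preempted);
            calls++;
            if ( preempted )
                printf("call %d preempted, %d left\n", calls, remaining);
        } while ( preempted );

        printf("teardown complete after %d calls\n", calls);
        return 0;
    }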
@@ -822,12 +845,16 @@ int paging_enable(struct domain *d, u32 mode)
* and therefore its pagetables will soon be discarded */
void pagetable_dying(struct domain *d, paddr_t gpa)
{
+#ifdef CONFIG_SHADOW_PAGING
struct vcpu *v;
ASSERT(paging_mode_shadow(d));
v = d->vcpu[0];
v->arch.paging.mode->shadow.pagetable_dying(v, gpa);
+#else
+ BUG();
+#endif
}
/* Print paging-assistance info to the console */
diff --git a/xen/arch/x86/mm/shadow/Makefile b/xen/arch/x86/mm/shadow/Makefile
index b3b0cde..a07bc0c 100644
--- a/xen/arch/x86/mm/shadow/Makefile
+++ b/xen/arch/x86/mm/shadow/Makefile
@@ -1,4 +1,8 @@
-obj-$(x86_64) += common.o guest_2.o guest_3.o guest_4.o
+ifeq ($(shadow-paging),y)
+obj-y += common.o guest_2.o guest_3.o guest_4.o
+else
+obj-y += none.o
+endif
guest_%.o: multi.c Makefile
$(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index a5eed28..0264b91 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -5,7 +5,7 @@
* Parts of this code are Copyright (c) 2006 by XenSource Inc.
* Parts of this code are Copyright (c) 2006 by Michael A Fetterman
* Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
- *
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -17,8 +17,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -41,22 +40,28 @@
DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
+static int sh_enable_log_dirty(struct domain *, bool_t log_global);
+static int sh_disable_log_dirty(struct domain *);
+static void sh_clean_dirty_bitmap(struct domain *);
+
/* Set up the shadow-specific parts of a domain struct at start of day.
* Called for every domain from arch_domain_create() */
-void shadow_domain_init(struct domain *d, unsigned int domcr_flags)
+int shadow_domain_init(struct domain *d, unsigned int domcr_flags)
{
INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.freelist);
INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
/* Use shadow pagetables for log-dirty support */
- paging_log_dirty_init(d, shadow_enable_log_dirty,
- shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
+ paging_log_dirty_init(d, sh_enable_log_dirty,
+ sh_disable_log_dirty, sh_clean_dirty_bitmap);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
d->arch.paging.shadow.oos_active = 0;
d->arch.paging.shadow.oos_off = (domcr_flags & DOMCRF_oos_off) ? 1 : 0;
#endif
d->arch.paging.shadow.pagetable_dying_op = 0;
+
+ return 0;
}
/* Set up the shadow-specific parts of a vcpu struct. Note: The most important
@@ -246,7 +251,7 @@ hvm_emulate_write(enum x86_segment seg,
v, addr, p_data, bytes, sh_ctxt);
}
-static int
+static int
hvm_emulate_cmpxchg(enum x86_segment seg,
unsigned long offset,
void *p_old,
@@ -323,7 +328,7 @@ pv_emulate_write(enum x86_segment seg,
v, offset, p_data, bytes, sh_ctxt);
}
-static int
+static int
pv_emulate_cmpxchg(enum x86_segment seg,
unsigned long offset,
void *p_old,
@@ -403,9 +408,9 @@ const struct x86_emulate_ops *shadow_init_emulation(
return &hvm_shadow_emulator_ops;
}
-/* Update an initialized emulation context to prepare for the next
+/* Update an initialized emulation context to prepare for the next
* instruction */
-void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
+void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
struct cpu_user_regs *regs)
{
struct vcpu *v = current;
@@ -431,17 +436,17 @@ void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
}
}
}
-
+
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/**************************************************************************/
-/* Out-of-sync shadows. */
+/* Out-of-sync shadows. */
-/* From time to time, we let a shadowed pagetable page go out of sync
- * with its shadow: the guest is allowed to write directly to the page,
+/* From time to time, we let a shadowed pagetable page go out of sync
+ * with its shadow: the guest is allowed to write directly to the page,
* and those writes are not synchronously reflected in the shadow.
- * This lets us avoid many emulations if the guest is writing a lot to a
- * pagetable, but it relaxes a pretty important invariant in the shadow
+ * This lets us avoid many emulations if the guest is writing a lot to a
+ * pagetable, but it relaxes a pretty important invariant in the shadow
* pagetable design. Therefore, some rules:
*
* 1. Only L1 pagetables may go out of sync: any page that is shadowed
@@ -449,21 +454,21 @@ void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
* using linear shadow pagetables much less dangerous.
* That means that: (a) unsyncing code needs to check for higher-level
* shadows, and (b) promotion code needs to resync.
- *
+ *
* 2. All shadow operations on a guest page require the page to be brought
* back into sync before proceeding. This must be done under the
* paging lock so that the page is guaranteed to remain synced until
* the operation completes.
*
- * Exceptions to this rule: the pagefault and invlpg handlers may
- * update only one entry on an out-of-sync page without resyncing it.
+ * Exceptions to this rule: the pagefault and invlpg handlers may
+ * update only one entry on an out-of-sync page without resyncing it.
*
* 3. Operations on shadows that do not start from a guest page need to
* be aware that they may be handling an out-of-sync shadow.
*
- * 4. Operations that do not normally take the paging lock (fast-path
- * #PF handler, INVLPG) must fall back to a locking, syncing version
- * if they see an out-of-sync table.
+ * 4. Operations that do not normally take the paging lock (fast-path
+ * #PF handler, INVLPG) must fall back to a locking, syncing version
+ * if they see an out-of-sync table.
*
* 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
* must explicitly resync all relevant pages or update their
@@ -482,26 +487,26 @@ void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
-static void sh_oos_audit(struct domain *d)
+static void sh_oos_audit(struct domain *d)
{
int idx, expected_idx, expected_idx_alt;
struct page_info *pg;
struct vcpu *v;
-
- for_each_vcpu(d, v)
+
+ for_each_vcpu(d, v)
{
for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
{
mfn_t *oos = v->arch.paging.shadow.oos;
if ( !mfn_valid(oos[idx]) )
continue;
-
+
expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
if ( idx != expected_idx && idx != expected_idx_alt )
{
printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
- __func__, idx, mfn_x(oos[idx]),
+ __func__, idx, mfn_x(oos[idx]),
expected_idx, expected_idx_alt);
BUG();
}
@@ -530,21 +535,21 @@ static void sh_oos_audit(struct domain *d)
#endif
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
-void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
{
int idx;
struct vcpu *v;
mfn_t *oos;
ASSERT(mfn_is_out_of_sync(gmfn));
-
- for_each_vcpu(d, v)
+
+ for_each_vcpu(d, v)
{
oos = v->arch.paging.shadow.oos;
idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
idx = (idx + 1) % SHADOW_OOS_PAGES;
-
+
if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
return;
}
@@ -581,13 +586,14 @@ static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
static inline int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn,
struct oos_fixup *fixup)
{
+ struct domain *d = v->domain;
int i;
for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
{
if ( mfn_x(fixup->smfn[i]) != INVALID_MFN )
{
- sh_remove_write_access_from_sl1p(v, gmfn,
- fixup->smfn[i],
+ sh_remove_write_access_from_sl1p(d, gmfn,
+ fixup->smfn[i],
fixup->off[i]);
fixup->smfn[i] = _mfn(INVALID_MFN);
}
@@ -597,17 +603,17 @@ static inline int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn,
return 1;
}
-void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
+void oos_fixup_add(struct domain *d, mfn_t gmfn,
mfn_t smfn, unsigned long off)
{
int idx, next;
mfn_t *oos;
struct oos_fixup *oos_fixup;
- struct domain *d = v->domain;
+ struct vcpu *v;
perfc_incr(shadow_oos_fixup_add);
-
- for_each_vcpu(d, v)
+
+ for_each_vcpu(d, v)
{
oos = v->arch.paging.shadow.oos;
oos_fixup = v->arch.paging.shadow.oos_fixup;
@@ -632,7 +638,7 @@ void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT);
/* Reuse this slot and remove current writable mapping. */
- sh_remove_write_access_from_sl1p(v, gmfn,
+ sh_remove_write_access_from_sl1p(d, gmfn,
oos_fixup[idx].smfn[next],
oos_fixup[idx].off[next]);
perfc_incr(shadow_oos_fixup_evict);
@@ -659,11 +665,12 @@ void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn,
struct oos_fixup *fixup)
{
+ struct domain *d = v->domain;
int ftlb = 0;
ftlb |= oos_fixup_flush_gmfn(v, gmfn, fixup);
- switch ( sh_remove_write_access(v, gmfn, 0, 0) )
+ switch ( sh_remove_write_access(d, gmfn, 0, 0) )
{
default:
case 0:
@@ -675,16 +682,16 @@ static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn,
case -1:
/* An unfindable writeable typecount has appeared, probably via a
- * grant table entry: can't shoot the mapping, so try to unshadow
+ * grant table entry: can't shoot the mapping, so try to unshadow
* the page. If that doesn't work either, the guest is granting
* his pagetables and must be killed after all.
* This will flush the tlb, so we can return with no worries. */
- sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
+ sh_remove_shadows(d, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
return 1;
}
if ( ftlb )
- flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ flush_tlb_mask(d->domain_dirty_cpumask);
return 0;
}
@@ -709,7 +716,7 @@ static void _sh_resync(struct vcpu *v, mfn_t gmfn,
ASSERT(paging_locked_by_me(v->domain));
ASSERT(mfn_is_out_of_sync(gmfn));
/* Guest page must be shadowed *only* as L1 when out of sync. */
- ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
+ ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
& ~SHF_L1_ANY));
ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
@@ -745,14 +752,14 @@ static void oos_hash_add(struct vcpu *v, mfn_t gmfn)
mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
struct oos_fixup fixup = { .next = 0 };
-
+
for (i = 0; i < SHADOW_OOS_FIXUPS; i++ )
fixup.smfn[i] = _mfn(INVALID_MFN);
idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
oidx = idx;
- if ( mfn_valid(oos[idx])
+ if ( mfn_valid(oos[idx])
&& (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
{
/* Punt the current occupant into the next slot */
@@ -773,23 +780,23 @@ static void oos_hash_add(struct vcpu *v, mfn_t gmfn)
if ( swap )
SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
- gptr = sh_map_domain_page(oos[oidx]);
- gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
+ gptr = map_domain_page(oos[oidx]);
+ gsnpptr = map_domain_page(oos_snapshot[oidx]);
memcpy(gsnpptr, gptr, PAGE_SIZE);
- sh_unmap_domain_page(gptr);
- sh_unmap_domain_page(gsnpptr);
+ unmap_domain_page(gptr);
+ unmap_domain_page(gsnpptr);
}
/* Remove an MFN from the list of out-of-sync guest pagetables */
-static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
+static void oos_hash_remove(struct domain *d, mfn_t gmfn)
{
int idx;
mfn_t *oos;
- struct domain *d = v->domain;
+ struct vcpu *v;
- SHADOW_PRINTK("%pv gmfn %lx\n", v, mfn_x(gmfn));
+ SHADOW_PRINTK("d%d gmfn %lx\n", d->domain_id, mfn_x(gmfn));
- for_each_vcpu(d, v)
+ for_each_vcpu(d, v)
{
oos = v->arch.paging.shadow.oos;
idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
@@ -806,14 +813,14 @@ static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
BUG();
}
-mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
+mfn_t oos_snapshot_lookup(struct domain *d, mfn_t gmfn)
{
int idx;
mfn_t *oos;
mfn_t *oos_snapshot;
- struct domain *d = v->domain;
-
- for_each_vcpu(d, v)
+ struct vcpu *v;
+
+ for_each_vcpu(d, v)
{
oos = v->arch.paging.shadow.oos;
oos_snapshot = v->arch.paging.shadow.oos_snapshot;
@@ -832,15 +839,15 @@ mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
}
/* Pull a single guest page back into sync */
-void sh_resync(struct vcpu *v, mfn_t gmfn)
+void sh_resync(struct domain *d, mfn_t gmfn)
{
int idx;
mfn_t *oos;
mfn_t *oos_snapshot;
struct oos_fixup *oos_fixup;
- struct domain *d = v->domain;
+ struct vcpu *v;
- for_each_vcpu(d, v)
+ for_each_vcpu(d, v)
{
oos = v->arch.paging.shadow.oos;
oos_fixup = v->arch.paging.shadow.oos_fixup;
@@ -848,7 +855,7 @@ void sh_resync(struct vcpu *v, mfn_t gmfn)
idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
idx = (idx + 1) % SHADOW_OOS_PAGES;
-
+
if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
{
_sh_resync(v, gmfn, &oos_fixup[idx], oos_snapshot[idx]);
@@ -899,7 +906,7 @@ void sh_resync_all(struct vcpu *v, int skip, int this, int others)
goto resync_others;
/* First: resync all of this vcpu's oos pages */
- for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+ for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
if ( mfn_valid(oos[idx]) )
{
/* Write-protect and sync contents */
@@ -914,14 +921,14 @@ void sh_resync_all(struct vcpu *v, int skip, int this, int others)
/* Second: make all *other* vcpus' oos pages safe. */
for_each_vcpu(v->domain, other)
{
- if ( v == other )
+ if ( v == other )
continue;
oos = other->arch.paging.shadow.oos;
oos_fixup = other->arch.paging.shadow.oos_fixup;
oos_snapshot = other->arch.paging.shadow.oos_snapshot;
- for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+ for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
{
if ( !mfn_valid(oos[idx]) )
continue;
@@ -940,7 +947,7 @@ void sh_resync_all(struct vcpu *v, int skip, int this, int others)
_sh_resync(other, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
oos[idx] = _mfn(INVALID_MFN);
}
- }
+ }
}
}
@@ -949,21 +956,21 @@ void sh_resync_all(struct vcpu *v, int skip, int this, int others)
int sh_unsync(struct vcpu *v, mfn_t gmfn)
{
struct page_info *pg;
-
+
ASSERT(paging_locked_by_me(v->domain));
SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
pg = mfn_to_page(gmfn);
-
+
/* Guest page must be shadowed *only* as L1 and *only* once when out
- * of sync. Also, get out now if it's already out of sync.
+ * of sync. Also, get out now if it's already out of sync.
* Also, can't safely unsync if some vcpus have paging disabled. */
- if ( pg->shadow_flags &
- ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
+ if ( pg->shadow_flags &
+ ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
|| sh_page_has_multiple_shadows(pg)
- || is_pv_domain(v->domain)
+ || is_pv_vcpu(v)
|| !v->domain->arch.paging.shadow.oos_active )
return 0;
@@ -983,22 +990,22 @@ int sh_unsync(struct vcpu *v, mfn_t gmfn)
* involves making sure there are no writable mappings available to the guest
* for this page.
*/
-void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
+void shadow_promote(struct domain *d, mfn_t gmfn, unsigned int type)
{
struct page_info *page = mfn_to_page(gmfn);
ASSERT(mfn_valid(gmfn));
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Is the page already shadowed and out of sync? */
- if ( page_is_out_of_sync(page) )
- sh_resync(v, gmfn);
+ if ( page_is_out_of_sync(page) )
+ sh_resync(d, gmfn);
#endif
/* We should never try to promote a gmfn that has writeable mappings */
ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
|| (page->u.inuse.type_info & PGT_count_mask) == 0
- || v->domain->is_shutting_down);
+ || d->is_shutting_down);
/* Is the page already shadowed? */
if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
@@ -1009,7 +1016,7 @@ void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
}
-void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
+void shadow_demote(struct domain *d, mfn_t gmfn, u32 type)
{
struct page_info *page = mfn_to_page(gmfn);
@@ -1020,13 +1027,13 @@ void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
{
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Was the page out of sync? */
- if ( page_is_out_of_sync(page) )
+ if ( page_is_out_of_sync(page) )
{
- oos_hash_remove(v, gmfn);
+ oos_hash_remove(d, gmfn);
}
-#endif
+#endif
clear_bit(_PGC_page_table, &page->count_info);
}
@@ -1044,11 +1051,11 @@ sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
struct page_info *page = mfn_to_page(gmfn);
paging_mark_dirty(v->domain, mfn_x(gmfn));
-
+
// Determine which types of shadows are affected, and update each.
//
// Always validate L1s before L2s to prevent another cpu with a linear
- // mapping of this gmfn from seeing a walk that results from
+ // mapping of this gmfn from seeing a walk that results from
// using the new L2 value and the old L1 value. (It is OK for such a
// guest to see a walk that uses the old L2 value with the new L1 value,
// as hardware could behave this way if one level of the pagewalk occurs
@@ -1061,40 +1068,40 @@ sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
if ( !(page->count_info & PGC_page_table) )
return 0; /* Not shadowed at all */
- if ( page->shadow_flags & SHF_L1_32 )
+ if ( page->shadow_flags & SHF_L1_32 )
result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2)
(v, gmfn, entry, size);
- if ( page->shadow_flags & SHF_L2_32 )
+ if ( page->shadow_flags & SHF_L2_32 )
result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2)
(v, gmfn, entry, size);
- if ( page->shadow_flags & SHF_L1_PAE )
+ if ( page->shadow_flags & SHF_L1_PAE )
result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3)
(v, gmfn, entry, size);
- if ( page->shadow_flags & SHF_L2_PAE )
+ if ( page->shadow_flags & SHF_L2_PAE )
result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3)
(v, gmfn, entry, size);
- if ( page->shadow_flags & SHF_L2H_PAE )
+ if ( page->shadow_flags & SHF_L2H_PAE )
result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3)
(v, gmfn, entry, size);
- if ( page->shadow_flags & SHF_L1_64 )
+ if ( page->shadow_flags & SHF_L1_64 )
result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4)
(v, gmfn, entry, size);
- if ( page->shadow_flags & SHF_L2_64 )
+ if ( page->shadow_flags & SHF_L2_64 )
result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4)
(v, gmfn, entry, size);
- if ( page->shadow_flags & SHF_L2H_64 )
+ if ( page->shadow_flags & SHF_L2H_64 )
result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4)
(v, gmfn, entry, size);
- if ( page->shadow_flags & SHF_L3_64 )
+ if ( page->shadow_flags & SHF_L3_64 )
result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4)
(v, gmfn, entry, size);
- if ( page->shadow_flags & SHF_L4_64 )
+ if ( page->shadow_flags & SHF_L4_64 )
result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4)
(v, gmfn, entry, size);
- this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED));
+ this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED));
return result;
}
@@ -1115,20 +1122,20 @@ sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
if ( rc & SHADOW_SET_FLUSH )
/* Need to flush TLBs to pick up shadow PT changes */
flush_tlb_mask(d->domain_dirty_cpumask);
- if ( rc & SHADOW_SET_ERROR )
+ if ( rc & SHADOW_SET_ERROR )
{
- /* This page is probably not a pagetable any more: tear it out of the
- * shadows, along with any tables that reference it.
- * Since the validate call above will have made a "safe" (i.e. zero)
- * shadow entry, we can let the domain live even if we can't fully
+ /* This page is probably not a pagetable any more: tear it out of the
+ * shadows, along with any tables that reference it.
+ * Since the validate call above will have made a "safe" (i.e. zero)
+ * shadow entry, we can let the domain live even if we can't fully
* unshadow the page. */
- sh_remove_shadows(v, gmfn, 0, 0);
+ sh_remove_shadows(d, gmfn, 0, 0);
}
}
int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
intpte_t new, mfn_t gmfn)
-/* Write a new value into the guest pagetable, and update the shadows
+/* Write a new value into the guest pagetable, and update the shadows
* appropriately. Returns 0 if we page-faulted, 1 for success. */
{
int failed;
@@ -1142,7 +1149,7 @@ int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
intpte_t *old, intpte_t new, mfn_t gmfn)
-/* Cmpxchg a new value into the guest pagetable, and update the shadows
+/* Cmpxchg a new value into the guest pagetable, and update the shadows
* appropriately. Returns 0 if we page-faulted, 1 if not.
* N.B. caller should check the value of "old" to see if the
* cmpxchg itself was successful. */
@@ -1160,7 +1167,7 @@ int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
/**************************************************************************/
-/* Memory management for shadow pages. */
+/* Memory management for shadow pages. */
/* Allocating shadow pages
* -----------------------
@@ -1174,12 +1181,12 @@ int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
* PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
* not contiguous in memory; functions for handling offsets into them are
* defined in shadow/multi.c (shadow_l1_index() etc.)
- *
+ *
* This table shows the allocation behaviour of the different modes:
*
* Xen paging 64b 64b 64b
* Guest paging 32b pae 64b
- * PV or HVM HVM HVM *
+ * PV or HVM HVM HVM *
* Shadow paging pae pae 64b
*
* sl1 size 8k 4k 4k
@@ -1187,23 +1194,43 @@ int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
* sl3 size - - 4k
* sl4 size - - 4k
*
- * In HVM guests, the p2m table is built out of shadow pages, and we provide
- * a function for the p2m management to steal pages, in max-order chunks, from
+ * In HVM guests, the p2m table is built out of shadow pages, and we provide
+ * a function for the p2m management to steal pages, in max-order chunks, from
* the free pool.
*/
+const u8 sh_type_to_size[] = {
+ 1, /* SH_type_none */
+ 2, /* SH_type_l1_32_shadow */
+ 2, /* SH_type_fl1_32_shadow */
+ 4, /* SH_type_l2_32_shadow */
+ 1, /* SH_type_l1_pae_shadow */
+ 1, /* SH_type_fl1_pae_shadow */
+ 1, /* SH_type_l2_pae_shadow */
+ 1, /* SH_type_l2h_pae_shadow */
+ 1, /* SH_type_l1_64_shadow */
+ 1, /* SH_type_fl1_64_shadow */
+ 1, /* SH_type_l2_64_shadow */
+ 1, /* SH_type_l2h_64_shadow */
+ 1, /* SH_type_l3_64_shadow */
+ 1, /* SH_type_l4_64_shadow */
+ 1, /* SH_type_p2m_table */
+ 1, /* SH_type_monitor_table */
+ 1 /* SH_type_oos_snapshot */
+};
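With the table now a global, the old static shadow_size() below can be deleted; presumably a header-level wrapper indexes sh_type_to_size instead. A sketch of the expected inline (the exact home of the declaration is an assumption):

    /* Assumed replacement for the static lookup removed below: */
    extern const u8 sh_type_to_size[SH_type_unused];

    static inline unsigned int shadow_size(unsigned int shadow_type)
    {
        ASSERT(shadow_type < SH_type_unused);
        return sh_type_to_size[shadow_type];
    }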
+
/* Figure out the least acceptable quantity of shadow memory.
* The minimum memory requirement for always being able to free up a
* chunk of memory is very small -- only three max-order chunks per
- * vcpu to hold the top level shadows and pages with Xen mappings in them.
+ * vcpu to hold the top level shadows and pages with Xen mappings in them.
*
* But for a guest to be guaranteed to successfully execute a single
* instruction, we must be able to map a large number (about thirty) VAs
* at the same time, which means that to guarantee progress, we must
* allow for more than ninety allocated pages per vcpu. We round that
- * up to 128 pages, or half a megabyte per vcpu, and add 1 more vcpu's
+ * up to 128 pages, or half a megabyte per vcpu, and add 1 more vcpu's
* worth to make sure we never return zero. */
-static unsigned int shadow_min_acceptable_pages(struct domain *d)
+static unsigned int shadow_min_acceptable_pages(struct domain *d)
{
u32 vcpu_count = 1;
struct vcpu *v;
@@ -1212,52 +1239,25 @@ static unsigned int shadow_min_acceptable_pages(struct domain *d)
vcpu_count++;
return (vcpu_count * 128);
-}
-
-/* Figure out the size (in pages) of a given shadow type */
-static inline u32
-shadow_size(unsigned int shadow_type)
-{
- static const u32 type_to_size[SH_type_unused] = {
- 1, /* SH_type_none */
- 2, /* SH_type_l1_32_shadow */
- 2, /* SH_type_fl1_32_shadow */
- 4, /* SH_type_l2_32_shadow */
- 1, /* SH_type_l1_pae_shadow */
- 1, /* SH_type_fl1_pae_shadow */
- 1, /* SH_type_l2_pae_shadow */
- 1, /* SH_type_l2h_pae_shadow */
- 1, /* SH_type_l1_64_shadow */
- 1, /* SH_type_fl1_64_shadow */
- 1, /* SH_type_l2_64_shadow */
- 1, /* SH_type_l2h_64_shadow */
- 1, /* SH_type_l3_64_shadow */
- 1, /* SH_type_l4_64_shadow */
- 1, /* SH_type_p2m_table */
- 1, /* SH_type_monitor_table */
- 1 /* SH_type_oos_snapshot */
- };
- ASSERT(shadow_type < SH_type_unused);
- return type_to_size[shadow_type];
}
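Concretely, with vcpu_count starting at 1 and incremented once per vcpu, a domain with N vcpus gets a floor of (N + 1) * 128 pages. A standalone check of the arithmetic (plain C, outside Xen):

    #include <stdio.h>

    /* Same arithmetic as shadow_min_acceptable_pages(): 128 pages
     * (half a megabyte) per vcpu, plus one spare vcpu's worth so the
     * result is never zero. */
    static unsigned int min_shadow_pages(unsigned int nr_vcpus)
    {
        return (nr_vcpus + 1) * 128;
    }

    int main(void)
    {
        /* A 4-vcpu guest: (4 + 1) * 128 = 640 pages, i.e. 2.5MB. */
        printf("%u pages\n", min_shadow_pages(4));
        return 0;
    }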
/* Dispatcher function: call the per-mode function that will unhook the
* non-Xen mappings in this top-level shadow mfn. With user_only == 1,
* unhooks only the user-mode mappings. */
-void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn, int user_only)
+void shadow_unhook_mappings(struct domain *d, mfn_t smfn, int user_only)
{
struct page_info *sp = mfn_to_page(smfn);
switch ( sp->u.sh.type )
{
case SH_type_l2_32_shadow:
- SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v, smfn, user_only);
+ SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(d, smfn, user_only);
break;
case SH_type_l2_pae_shadow:
case SH_type_l2h_pae_shadow:
- SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(v, smfn, user_only);
+ SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(d, smfn, user_only);
break;
case SH_type_l4_64_shadow:
- SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(v, smfn, user_only);
+ SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(d, smfn, user_only);
break;
default:
SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->u.sh.type);
@@ -1279,23 +1279,17 @@ static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
/* Make sure there are at least count order-sized pages
* available in the shadow page pool. */
-static void _shadow_prealloc(
- struct domain *d,
- unsigned int pages)
+static void _shadow_prealloc(struct domain *d, unsigned int pages)
{
- /* Need a vpcu for calling unpins; for now, since we don't have
- * per-vcpu shadows, any will do */
- struct vcpu *v, *v2;
+ struct vcpu *v;
struct page_info *sp, *t;
mfn_t smfn;
int i;
if ( d->arch.paging.shadow.free_pages >= pages ) return;
-
- v = current;
- if ( v->domain != d )
- v = d->vcpu[0];
- ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */
+
+ /* Shouldn't have enabled shadows if we've no vcpus. */
+ ASSERT(d->vcpu && d->vcpu[0]);
/* Stage one: walk the list of pinned pages, unpinning them */
perfc_incr(shadow_prealloc_1);
@@ -1305,7 +1299,7 @@ static void _shadow_prealloc(
/* Unpin this top-level shadow */
trace_shadow_prealloc_unpin(d, smfn);
- sh_unpin(v, smfn);
+ sh_unpin(d, smfn);
/* See if that freed up enough space */
if ( d->arch.paging.shadow.free_pages >= pages ) return;
@@ -1316,14 +1310,14 @@ static void _shadow_prealloc(
* mappings. */
perfc_incr(shadow_prealloc_2);
- for_each_vcpu(d, v2)
+ for_each_vcpu(d, v)
for ( i = 0 ; i < 4 ; i++ )
{
- if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
+ if ( !pagetable_is_null(v->arch.shadow_table[i]) )
{
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK);
- shadow_unhook_mappings(v,
- pagetable_get_mfn(v2->arch.shadow_table[i]), 0);
+ shadow_unhook_mappings(d,
+ pagetable_get_mfn(v->arch.shadow_table[i]), 0);
/* See if that freed up enough space */
if ( d->arch.paging.shadow.free_pages >= pages )
@@ -1333,7 +1327,7 @@ static void _shadow_prealloc(
}
}
}
-
+
/* Nothing more we can do: all remaining shadows are of pages that
* hold Xen mappings for some vcpu. This can never happen. */
SHADOW_ERROR("Can't pre-allocate %u shadow pages!\n"
@@ -1357,27 +1351,28 @@ void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
/* Deliberately free all the memory we can: this will tear down all of
* this domain's shadows */
-static void shadow_blow_tables(struct domain *d)
+static void shadow_blow_tables(struct domain *d)
{
struct page_info *sp, *t;
- struct vcpu *v = d->vcpu[0];
+ struct vcpu *v;
mfn_t smfn;
int i;
- ASSERT(v != NULL);
+ /* Shouldn't have enabled shadows if we've no vcpus. */
+ ASSERT(d->vcpu && d->vcpu[0]);
/* Pass one: unpin all pinned pages */
foreach_pinned_shadow(d, sp, t)
{
smfn = page_to_mfn(sp);
- sh_unpin(v, smfn);
+ sh_unpin(d, smfn);
}
-
+
/* Second pass: unhook entries of in-use shadows */
- for_each_vcpu(d, v)
+ for_each_vcpu(d, v)
for ( i = 0 ; i < 4 ; i++ )
if ( !pagetable_is_null(v->arch.shadow_table[i]) )
- shadow_unhook_mappings(v,
+ shadow_unhook_mappings(d,
pagetable_get_mfn(v->arch.shadow_table[i]), 0);
/* Make sure everyone sees the unshadowings */
@@ -1442,9 +1437,9 @@ set_next_shadow(struct page_info *sp, struct page_info *next)
}
/* Allocate another shadow's worth of (contiguous, aligned) pages,
- * and fill in the type and backpointer fields of their page_infos.
+ * and fill in the type and backpointer fields of their page_infos.
* Never fails to allocate. */
-mfn_t shadow_alloc(struct domain *d,
+mfn_t shadow_alloc(struct domain *d,
u32 shadow_type,
unsigned long backpointer)
{
@@ -1483,16 +1478,13 @@ mfn_t shadow_alloc(struct domain *d,
break;
}
- /* Page lists don't have pointers back to the head structure, so
- * it's safe to use a head structure on the stack to link the pages
- * together. */
INIT_PAGE_LIST_HEAD(&tmp_list);
/* Init page info fields and clear the pages */
- for ( i = 0; i < pages ; i++ )
+ for ( i = 0; i < pages ; i++ )
{
sp = page_list_remove_head(&d->arch.paging.shadow.freelist);
- /* Before we overwrite the old contents of this page,
+ /* Before we overwrite the old contents of this page,
* we need to be sure that no TLB holds a pointer to it. */
cpumask_copy(&mask, d->domain_dirty_cpumask);
tlbflush_filter(mask, sp->tlbflush_timestamp);
@@ -1505,7 +1497,7 @@ mfn_t shadow_alloc(struct domain *d,
p = __map_domain_page(sp);
ASSERT(p != NULL);
clear_page(p);
- sh_unmap_domain_page(p);
+ unmap_domain_page(p);
INIT_PAGE_LIST_ENTRY(&sp->list);
page_list_add(sp, &tmp_list);
sp->u.sh.type = shadow_type;
@@ -1516,9 +1508,12 @@ mfn_t shadow_alloc(struct domain *d,
set_next_shadow(sp, NULL);
perfc_incr(shadow_alloc_count);
}
- if ( shadow_type >= SH_type_min_shadow
+ if ( shadow_type >= SH_type_min_shadow
&& shadow_type <= SH_type_max_shadow )
sp->u.sh.head = 1;
+
+ sh_terminate_list(&tmp_list);
+
return page_to_mfn(sp);
}
@@ -1526,7 +1521,8 @@ mfn_t shadow_alloc(struct domain *d,
/* Return some shadow pages to the pool. */
void shadow_free(struct domain *d, mfn_t smfn)
{
- struct page_info *next = NULL, *sp = mfn_to_page(smfn);
+ struct page_info *next = NULL, *sp = mfn_to_page(smfn);
+ struct page_list_head *pin_list;
unsigned int pages;
u32 shadow_type;
int i;
@@ -1538,17 +1534,18 @@ void shadow_free(struct domain *d, mfn_t smfn)
ASSERT(shadow_type != SH_type_none);
ASSERT(sp->u.sh.head || (shadow_type > SH_type_max_shadow));
pages = shadow_size(shadow_type);
+ pin_list = &d->arch.paging.shadow.pinned_shadows;
- for ( i = 0; i < pages; i++ )
+ for ( i = 0; i < pages; i++ )
{
#if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION)
struct vcpu *v;
- for_each_vcpu(d, v)
+ for_each_vcpu(d, v)
{
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
/* No longer safe to look for a writeable mapping in this shadow */
- if ( v->arch.paging.shadow.last_writeable_pte_smfn
- == mfn_x(page_to_mfn(sp)) )
+ if ( v->arch.paging.shadow.last_writeable_pte_smfn
+ == mfn_x(page_to_mfn(sp)) )
v->arch.paging.shadow.last_writeable_pte_smfn = 0;
#endif
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
@@ -1558,10 +1555,10 @@ void shadow_free(struct domain *d, mfn_t smfn)
#endif
/* Get the next page before we overwrite the list header */
if ( i < pages - 1 )
- next = pdx_to_page(sp->list.next);
+ next = page_list_next(sp, pin_list);
/* Strip out the type: this is now a free shadow page */
sp->u.sh.type = sp->u.sh.head = 0;
- /* Remember the TLB timestamp so we will know whether to flush
+ /* Remember the TLB timestamp so we will know whether to flush
* TLBs when we reuse the page. Because the destructors leave the
* contents of the pages in place, we can delay TLB flushes until
* just before the allocator hands the page out again. */
@@ -1583,11 +1580,11 @@ shadow_alloc_p2m_page(struct domain *d)
{
struct page_info *pg;
- /* This is called both from the p2m code (which never holds the
+ /* This is called both from the p2m code (which never holds the
* paging lock) and the log-dirty code (which always does). */
paging_lock_recursive(d);
- if ( d->arch.paging.shadow.total_pages
+ if ( d->arch.paging.shadow.total_pages
< shadow_min_acceptable_pages(d) + 1 )
{
if ( !d->arch.paging.p2m_alloc_failed )
@@ -1629,9 +1626,9 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg)
}
pg->count_info &= ~PGC_count_mask;
pg->u.sh.type = SH_type_p2m_table; /* p2m code reuses type-info */
- page_set_owner(pg, NULL);
+ page_set_owner(pg, NULL);
- /* This is called both from the p2m code (which never holds the
+ /* This is called both from the p2m code (which never holds the
* paging lock) and the log-dirty code (which always does). */
paging_lock_recursive(d);
@@ -1646,7 +1643,7 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg)
* Input will be rounded up to at least shadow_min_acceptable_pages(),
* plus space for the p2m table.
* Returns 0 for success, non-zero for failure. */
-static unsigned int sh_set_allocation(struct domain *d,
+static unsigned int sh_set_allocation(struct domain *d,
unsigned int pages,
int *preempted)
{
@@ -1662,7 +1659,7 @@ static unsigned int sh_set_allocation(struct domain *d,
pages = 0;
else
pages -= d->arch.paging.shadow.p2m_pages;
-
+
/* Don't allocate less than the minimum acceptable, plus one page per
* megabyte of RAM (for the p2m table) */
lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
@@ -1670,18 +1667,18 @@ static unsigned int sh_set_allocation(struct domain *d,
pages = lower_bound;
}
- SHADOW_PRINTK("current %i target %i\n",
+ SHADOW_PRINTK("current %i target %i\n",
d->arch.paging.shadow.total_pages, pages);
for ( ; ; )
{
- if ( d->arch.paging.shadow.total_pages < pages )
+ if ( d->arch.paging.shadow.total_pages < pages )
{
/* Need to allocate more memory from domheap */
sp = (struct page_info *)
- alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
- if ( sp == NULL )
- {
+ alloc_domheap_page(d, MEMF_no_owner);
+ if ( sp == NULL )
+ {
SHADOW_PRINTK("failed to allocate shadow pages.\n");
return -ENOMEM;
}
@@ -1692,8 +1689,8 @@ static unsigned int sh_set_allocation(struct domain *d,
sp->u.sh.count = 0;
sp->tlbflush_timestamp = 0; /* Not in any TLB */
page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
- }
- else if ( d->arch.paging.shadow.total_pages > pages )
+ }
+ else if ( d->arch.paging.shadow.total_pages > pages )
{
/* Need to return memory to domheap */
_shadow_prealloc(d, 1);
@@ -1733,7 +1730,7 @@ static unsigned int shadow_get_allocation(struct domain *d)
/**************************************************************************/
/* Hash table for storing the guest->shadow mappings.
- * The table itself is an array of pointers to shadows; the shadows are then
+ * The table itself is an array of pointers to shadows; the shadows are then
* threaded on a singly-linked list of shadows with the same hash value */
#define SHADOW_HASH_BUCKETS 251
@@ -1741,7 +1738,7 @@ static unsigned int shadow_get_allocation(struct domain *d)
/* Hash function that takes a gfn or mfn, plus another byte of type info */
typedef u32 key_t;
-static inline key_t sh_hash(unsigned long n, unsigned int t)
+static inline key_t sh_hash(unsigned long n, unsigned int t)
{
unsigned char *p = (unsigned char *)&n;
key_t k = t;
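The hunk cuts the function body off here; in this tree it is an sdbm-style mix over the bytes of n, seeded with the type byte and reduced modulo the 251 buckets. Sketched standalone (the mixing constants are the classic sdbm ones; treat them as illustrative rather than authoritative):

    #include <stdint.h>
    #include <stddef.h>

    #define SHADOW_HASH_BUCKETS 251

    /* Mix each byte of the gfn/mfn into the running key, seeded with
     * the shadow type, then fold into a bucket index. */
    static uint32_t sh_hash_sketch(unsigned long n, unsigned int t)
    {
        const unsigned char *p = (const unsigned char *)&n;
        uint32_t k = t;
        size_t i;

        for ( i = 0; i < sizeof(n); i++ )
            k = (uint32_t)p[i] + (k << 6) + (k << 16) - k;

        return k % SHADOW_HASH_BUCKETS;
    }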
@@ -1800,7 +1797,7 @@ static void sh_hash_audit_bucket(struct domain *d, int bucket)
SHADOW_ERROR("MFN %#"PRI_mfn" shadowed (by %#"PRI_mfn")"
" and not OOS but has typecount %#lx\n",
__backpointer(sp),
- mfn_x(page_to_mfn(sp)),
+ mfn_x(page_to_mfn(sp)),
gpg->u.inuse.type_info);
BUG();
}
@@ -1808,7 +1805,7 @@ static void sh_hash_audit_bucket(struct domain *d, int bucket)
}
else /* Not an l1 */
#endif
- if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
+ if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
&& (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
{
SHADOW_ERROR("MFN %#"PRI_mfn" shadowed (by %#"PRI_mfn")"
@@ -1838,7 +1835,7 @@ static void sh_hash_audit(struct domain *d)
if ( !(SHADOW_AUDIT_ENABLE) )
return;
- for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
+ for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
{
sh_hash_audit_bucket(d, i);
}
@@ -1848,7 +1845,7 @@ static void sh_hash_audit(struct domain *d)
#define sh_hash_audit(_d) do {} while(0)
#endif /* Hashtable bucket audit */
-/* Allocate and initialise the table itself.
+/* Allocate and initialise the table itself.
* Returns 0 for success, 1 for error. */
static int shadow_hash_alloc(struct domain *d)
{
@@ -1875,11 +1872,10 @@ static void shadow_hash_teardown(struct domain *d)
}
-mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
+mfn_t shadow_hash_lookup(struct domain *d, unsigned long n, unsigned int t)
/* Find an entry in the hash table. Returns the MFN of the shadow,
* or INVALID_MFN if it doesn't exist */
{
- struct domain *d = v->domain;
struct page_info *sp, *prev;
key_t key;
@@ -1905,11 +1901,11 @@ mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
/* Can't reorder: someone is walking the hash chains */
return page_to_mfn(sp);
- else
+ else
{
ASSERT(prev);
/* Delete sp from the list */
- prev->next_shadow = sp->next_shadow;
+ prev->next_shadow = sp->next_shadow;
/* Re-insert it at the head of the list */
set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]);
d->arch.paging.shadow.hash_table[key] = sp;
@@ -1929,14 +1925,13 @@ mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
return _mfn(INVALID_MFN);
}
-void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
+void shadow_hash_insert(struct domain *d, unsigned long n, unsigned int t,
mfn_t smfn)
/* Put a mapping (n,t)->smfn into the hash table */
{
- struct domain *d = v->domain;
struct page_info *sp;
key_t key;
-
+
ASSERT(paging_locked_by_me(d));
ASSERT(d->arch.paging.shadow.hash_table);
ASSERT(t);
@@ -1946,20 +1941,19 @@ void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
perfc_incr(shadow_hash_inserts);
key = sh_hash(n, t);
sh_hash_audit_bucket(d, key);
-
+
/* Insert this shadow at the top of the bucket */
sp = mfn_to_page(smfn);
set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]);
d->arch.paging.shadow.hash_table[key] = sp;
-
+
sh_hash_audit_bucket(d, key);
}
-void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
+void shadow_hash_delete(struct domain *d, unsigned long n, unsigned int t,
mfn_t smfn)
/* Excise the mapping (n,t)->smfn from the hash table */
{
- struct domain *d = v->domain;
struct page_info *sp, *x;
key_t key;
@@ -1972,12 +1966,12 @@ void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
perfc_incr(shadow_hash_deletes);
key = sh_hash(n, t);
sh_hash_audit_bucket(d, key);
-
+
sp = mfn_to_page(smfn);
- if ( d->arch.paging.shadow.hash_table[key] == sp )
+ if ( d->arch.paging.shadow.hash_table[key] == sp )
/* Easy case: we're deleting the head item. */
d->arch.paging.shadow.hash_table[key] = next_shadow(sp);
- else
+ else
{
/* Need to search for the one we want */
x = d->arch.paging.shadow.hash_table[key];
@@ -1998,19 +1992,19 @@ void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
sh_hash_audit_bucket(d, key);
}
-typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
+typedef int (*hash_vcpu_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
+typedef int (*hash_domain_callback_t)(struct domain *d, mfn_t smfn, mfn_t other_mfn);
-static void hash_foreach(struct vcpu *v,
- unsigned int callback_mask,
- const hash_callback_t callbacks[],
- mfn_t callback_mfn)
-/* Walk the hash table looking at the types of the entries and
- * calling the appropriate callback function for each entry.
+static void hash_vcpu_foreach(struct vcpu *v, unsigned int callback_mask,
+ const hash_vcpu_callback_t callbacks[],
+ mfn_t callback_mfn)
+/* Walk the hash table looking at the types of the entries and
+ * calling the appropriate callback function for each entry.
* The mask determines which shadow types we call back for, and the array
* of callbacks tells us which function to call.
- * Any callback may return non-zero to let us skip the rest of the scan.
+ * Any callback may return non-zero to let us skip the rest of the scan.
*
- * WARNING: Callbacks MUST NOT add or remove hash entries unless they
+ * WARNING: Callbacks MUST NOT add or remove hash entries unless they
* then return non-zero to terminate the scan. */
{
int i, done = 0;
@@ -2027,7 +2021,7 @@ static void hash_foreach(struct vcpu *v,
ASSERT(d->arch.paging.shadow.hash_walking == 0);
d->arch.paging.shadow.hash_walking = 1;
- for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
+ for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
{
/* WARNING: This is not safe against changes to the hash table.
* The callback *must* return non-zero if it has inserted or
@@ -2043,18 +2037,65 @@ static void hash_foreach(struct vcpu *v,
if ( done ) break;
}
}
- if ( done ) break;
+ if ( done ) break;
}
- d->arch.paging.shadow.hash_walking = 0;
+ d->arch.paging.shadow.hash_walking = 0;
+}
+
+static void hash_domain_foreach(struct domain *d,
+ unsigned int callback_mask,
+ const hash_domain_callback_t callbacks[],
+ mfn_t callback_mfn)
+/* Walk the hash table looking at the types of the entries and
+ * calling the appropriate callback function for each entry.
+ * The mask determines which shadow types we call back for, and the array
+ * of callbacks tells us which function to call.
+ * Any callback may return non-zero to let us skip the rest of the scan.
+ *
+ * WARNING: Callbacks MUST NOT add or remove hash entries unless they
+ * then return non-zero to terminate the scan. */
+{
+ int i, done = 0;
+ struct page_info *x;
+
+ ASSERT(paging_locked_by_me(d));
+
+ /* Can be called via p2m code &c after shadow teardown. */
+ if ( unlikely(!d->arch.paging.shadow.hash_table) )
+ return;
+
+ /* Say we're here, to stop hash-lookups reordering the chains */
+ ASSERT(d->arch.paging.shadow.hash_walking == 0);
+ d->arch.paging.shadow.hash_walking = 1;
+
+ for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
+ {
+ /* WARNING: This is not safe against changes to the hash table.
+ * The callback *must* return non-zero if it has inserted or
+ * deleted anything from the hash (lookups are OK, though). */
+ for ( x = d->arch.paging.shadow.hash_table[i]; x; x = next_shadow(x) )
+ {
+ if ( callback_mask & (1 << x->u.sh.type) )
+ {
+ ASSERT(x->u.sh.type <= 15);
+ ASSERT(callbacks[x->u.sh.type] != NULL);
+ done = callbacks[x->u.sh.type](d, page_to_mfn(x),
+ callback_mfn);
+ if ( done ) break;
+ }
+ }
+ if ( done ) break;
+ }
+ d->arch.paging.shadow.hash_walking = 0;
}
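For a feel of how the new domain-scoped walker is driven, a hypothetical caller (the counting callback and its wiring are invented for illustration; real callers appear further down in sh_remove_write_access() and friends):

    /* Count a domain's L4 shadows: one callback slot per shadow type,
     * a mask saying which types to visit, and a scan under the paging
     * lock.  Returning 0 keeps the walk going (we modify nothing). */
    static unsigned long nr_l4;

    static int count_l4(struct domain *d, mfn_t smfn, mfn_t unused)
    {
        nr_l4++;
        return 0;
    }

    static const hash_domain_callback_t count_cbs[SH_type_unused] = {
        [SH_type_l4_64_shadow] = count_l4,
    };

    /* In the caller, with the paging lock held: */
    hash_domain_foreach(d, SHF_L4_64, count_cbs, _mfn(INVALID_MFN));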
/**************************************************************************/
/* Destroy a shadow page: simple dispatcher to call the per-type destructor
- * which will decrement refcounts appropriately and return memory to the
+ * which will decrement refcounts appropriately and return memory to the
* free pool. */
-void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
+void sh_destroy_shadow(struct domain *d, mfn_t smfn)
{
struct page_info *sp = mfn_to_page(smfn);
unsigned int t = sp->u.sh.type;
@@ -2064,13 +2105,12 @@ void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
/* Double-check, if we can, that the shadowed page belongs to this
* domain (by following the back-pointer). */
- ASSERT(t == SH_type_fl1_32_shadow ||
- t == SH_type_fl1_pae_shadow ||
- t == SH_type_fl1_64_shadow ||
- t == SH_type_monitor_table ||
- (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
- (page_get_owner(mfn_to_page(backpointer(sp)))
- == v->domain));
+ ASSERT(t == SH_type_fl1_32_shadow ||
+ t == SH_type_fl1_pae_shadow ||
+ t == SH_type_fl1_64_shadow ||
+ t == SH_type_monitor_table ||
+ (is_pv_32bit_domain(d) && t == SH_type_l4_64_shadow) ||
+ (page_get_owner(mfn_to_page(backpointer(sp))) == d));
/* The down-shifts here are so that the switch statement is on nice
* small numbers that the compiler will enjoy */
@@ -2078,43 +2118,43 @@ void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
{
case SH_type_l1_32_shadow:
case SH_type_fl1_32_shadow:
- SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(v, smfn);
+ SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(d, smfn);
break;
case SH_type_l2_32_shadow:
- SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(v, smfn);
+ SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(d, smfn);
break;
case SH_type_l1_pae_shadow:
case SH_type_fl1_pae_shadow:
- SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(v, smfn);
+ SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(d, smfn);
break;
case SH_type_l2_pae_shadow:
case SH_type_l2h_pae_shadow:
- SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(v, smfn);
+ SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(d, smfn);
break;
case SH_type_l1_64_shadow:
case SH_type_fl1_64_shadow:
- SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(v, smfn);
+ SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(d, smfn);
break;
case SH_type_l2h_64_shadow:
- ASSERT(is_pv_32on64_vcpu(v));
+ ASSERT(is_pv_32bit_domain(d));
/* Fall through... */
case SH_type_l2_64_shadow:
- SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(v, smfn);
+ SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(d, smfn);
break;
case SH_type_l3_64_shadow:
- SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(v, smfn);
+ SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(d, smfn);
break;
case SH_type_l4_64_shadow:
- SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(v, smfn);
+ SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(d, smfn);
break;
default:
SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n",
(unsigned long)t);
BUG();
- }
+ }
}
static inline void trace_shadow_wrmap_bf(mfn_t gmfn)
@@ -2128,18 +2168,18 @@ static inline void trace_shadow_wrmap_bf(mfn_t gmfn)
}
/**************************************************************************/
-/* Remove all writeable mappings of a guest frame from the shadow tables
- * Returns non-zero if we need to flush TLBs.
+/* Remove all writeable mappings of a guest frame from the shadow tables
+ * Returns non-zero if we need to flush TLBs.
* level and fault_addr describe how we found this to be a pagetable;
* level==0 means we have some other reason for revoking write access.
* If level==0 we are allowed to fail, returning -1. */
-int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
+int sh_remove_write_access(struct domain *d, mfn_t gmfn,
unsigned int level,
unsigned long fault_addr)
{
/* Dispatch table for getting per-type functions */
- static const hash_callback_t callbacks[SH_type_unused] = {
+ static const hash_domain_callback_t callbacks[SH_type_unused] = {
NULL, /* none */
SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* l1_32 */
SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* fl1_32 */
@@ -2158,28 +2198,31 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
NULL /* unused */
};
- static unsigned int callback_mask =
- 1 << SH_type_l1_32_shadow
- | 1 << SH_type_fl1_32_shadow
- | 1 << SH_type_l1_pae_shadow
- | 1 << SH_type_fl1_pae_shadow
- | 1 << SH_type_l1_64_shadow
- | 1 << SH_type_fl1_64_shadow
+ static const unsigned int callback_mask =
+ SHF_L1_32
+ | SHF_FL1_32
+ | SHF_L1_PAE
+ | SHF_FL1_PAE
+ | SHF_L1_64
+ | SHF_FL1_64
;
struct page_info *pg = mfn_to_page(gmfn);
+#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+ struct vcpu *curr = current;
+#endif
- ASSERT(paging_locked_by_me(v->domain));
+ ASSERT(paging_locked_by_me(d));
/* Only remove writable mappings if we are doing shadow refcounts.
* In guest refcounting, we trust Xen to already be restricting
* all the writes to the guest page tables, so we do not need to
* do more. */
- if ( !shadow_mode_refcounts(v->domain) )
+ if ( !shadow_mode_refcounts(d) )
return 0;
/* Early exit if it's already a pagetable, or otherwise not writeable */
if ( (sh_mfn_is_a_page_table(gmfn)
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Unless they've been allowed to go out of sync with their shadows */
&& !mfn_oos_may_write(gmfn)
#endif
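The SHF_* names replacing the open-coded shifts in the mask above are presumably one bit per shadow type, so a callback mask stays a plain OR. Sketch of the assumed definitions (in this tree they would live in shadow/private.h):

    /* Assumed shape of the constants; one bit per SH_type_* value. */
    #define SHF_L1_32   (1u << SH_type_l1_32_shadow)
    #define SHF_FL1_32  (1u << SH_type_fl1_32_shadow)
    #define SHF_L1_PAE  (1u << SH_type_l1_pae_shadow)
    #define SHF_FL1_PAE (1u << SH_type_fl1_pae_shadow)
    #define SHF_L1_64   (1u << SH_type_l1_64_shadow)
    #define SHF_FL1_64  (1u << SH_type_fl1_64_shadow)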
@@ -2191,18 +2234,18 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
perfc_incr(shadow_writeable);
- /* If this isn't a "normal" writeable page, the domain is trying to
+ /* If this isn't a "normal" writeable page, the domain is trying to
* put pagetables in special memory of some kind. We can't allow that. */
if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
{
- SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
+ SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
PRtype_info "\n",
mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
- domain_crash(v->domain);
+ domain_crash(d);
}
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
- if ( v == current )
+ if ( curr->domain == d )
{
unsigned long gfn;
/* Heuristic: there is likely to be only one writeable mapping,
@@ -2210,7 +2253,8 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
* in the guest's linear map (on non-HIGHPTE linux and windows) */
#define GUESS(_a, _h) do { \
- if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
+ if ( curr->arch.paging.mode->shadow.guess_wrmap( \
+ curr, (_a), gmfn) ) \
perfc_incr(shadow_writeable_h_ ## _h); \
if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
{ \
@@ -2218,35 +2262,35 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
return 1; \
} \
} while (0)
-
- if ( v->arch.paging.mode->guest_levels == 2 )
+
+ if ( curr->arch.paging.mode->guest_levels == 2 )
{
if ( level == 1 )
/* 32bit non-PAE w2k3: linear map at 0xC0000000 */
GUESS(0xC0000000UL + (fault_addr >> 10), 1);
/* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
- if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
+ if ((gfn = mfn_to_gfn(d, gmfn)) < 0x38000 )
GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
/* FreeBSD: Linear map at 0xBFC00000 */
if ( level == 1 )
- GUESS(0xBFC00000UL
+ GUESS(0xBFC00000UL
+ ((fault_addr & VADDR_MASK) >> 10), 6);
}
- else if ( v->arch.paging.mode->guest_levels == 3 )
+ else if ( curr->arch.paging.mode->guest_levels == 3 )
{
/* 32bit PAE w2k3: linear map at 0xC0000000 */
- switch ( level )
+ switch ( level )
{
case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
}
/* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
- if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
+ if ((gfn = mfn_to_gfn(d, gmfn)) < 0x38000 )
GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
-
+
/* FreeBSD PAE: Linear map at 0xBF800000 */
switch ( level )
{
@@ -2256,23 +2300,23 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
+ ((fault_addr & VADDR_MASK) >> 18), 6); break;
}
}
- else if ( v->arch.paging.mode->guest_levels == 4 )
+ else if ( curr->arch.paging.mode->guest_levels == 4 )
{
/* 64bit w2k3: linear map at 0xfffff68000000000 */
- switch ( level )
+ switch ( level )
{
- case 1: GUESS(0xfffff68000000000UL
+ case 1: GUESS(0xfffff68000000000UL
+ ((fault_addr & VADDR_MASK) >> 9), 3); break;
case 2: GUESS(0xfffff6fb40000000UL
+ ((fault_addr & VADDR_MASK) >> 18), 3); break;
- case 3: GUESS(0xfffff6fb7da00000UL
+ case 3: GUESS(0xfffff6fb7da00000UL
+ ((fault_addr & VADDR_MASK) >> 27), 3); break;
}
/* 64bit Linux direct map at 0xffff880000000000; older kernels
* had it at 0xffff810000000000, and older kernels yet had it
* at 0x0000010000000000UL */
- gfn = mfn_to_gfn(v->domain, gmfn);
+ gfn = mfn_to_gfn(d, gmfn);
GUESS(0xffff880000000000UL + (gfn << PAGE_SHIFT), 4);
GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
@@ -2282,7 +2326,7 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
* kpm_vbase; 0xfffffe0000000000UL
*/
GUESS(0xfffffe0000000000UL + (gfn << PAGE_SHIFT), 4);
-
+
/* FreeBSD 64bit: linear map 0xffff800000000000 */
switch ( level )
{
@@ -2309,14 +2353,15 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
* the writeable mapping by looking at the same MFN where the last
* brute-force search succeeded. */
- if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
+ if ( (curr->domain == d) &&
+ (curr->arch.paging.shadow.last_writeable_pte_smfn != 0) )
{
unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
- mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
+ mfn_t last_smfn = _mfn(curr->arch.paging.shadow.last_writeable_pte_smfn);
int shtype = mfn_to_page(last_smfn)->u.sh.type;
- if ( callbacks[shtype] )
- callbacks[shtype](v, last_smfn, gmfn);
+ if ( callbacks[shtype] )
+ callbacks[shtype](d, last_smfn, gmfn);
if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
perfc_incr(shadow_writeable_h_5);
@@ -2326,14 +2371,14 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
return 1;
#endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
-
+
/* Brute-force search of all the shadows, by walking the hash */
trace_shadow_wrmap_bf(gmfn);
if ( level == 0 )
perfc_incr(shadow_writeable_bf_1);
else
perfc_incr(shadow_writeable_bf);
- hash_foreach(v, callback_mask, callbacks, gmfn);
+ hash_domain_foreach(d, callback_mask, callbacks, gmfn);
/* If that didn't catch the mapping, then there's some non-pagetable
* mapping -- ioreq page, grant mapping, &c. */
@@ -2345,51 +2390,51 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
"%lu special-use mappings of it\n", mfn_x(gmfn),
(mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
- domain_crash(v->domain);
+ domain_crash(d);
}
-
+
/* We killed at least one writeable mapping, so must flush TLBs. */
return 1;
}
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
-int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+int sh_remove_write_access_from_sl1p(struct domain *d, mfn_t gmfn,
mfn_t smfn, unsigned long off)
{
struct page_info *sp = mfn_to_page(smfn);
-
+
ASSERT(mfn_valid(smfn));
ASSERT(mfn_valid(gmfn));
-
+
if ( sp->u.sh.type == SH_type_l1_32_shadow
|| sp->u.sh.type == SH_type_fl1_32_shadow )
{
return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
- (v, gmfn, smfn, off);
+ (d, gmfn, smfn, off);
}
else if ( sp->u.sh.type == SH_type_l1_pae_shadow
|| sp->u.sh.type == SH_type_fl1_pae_shadow )
return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
- (v, gmfn, smfn, off);
+ (d, gmfn, smfn, off);
else if ( sp->u.sh.type == SH_type_l1_64_shadow
|| sp->u.sh.type == SH_type_fl1_64_shadow )
return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
- (v, gmfn, smfn, off);
+ (d, gmfn, smfn, off);
return 0;
}
-#endif
+#endif
/**************************************************************************/
/* Remove all mappings of a guest frame from the shadow tables.
* Returns non-zero if we need to flush TLBs. */
-int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
+static int sh_remove_all_mappings(struct domain *d, mfn_t gmfn)
{
struct page_info *page = mfn_to_page(gmfn);
/* Dispatch table for getting per-type functions */
- static const hash_callback_t callbacks[SH_type_unused] = {
+ static const hash_domain_callback_t callbacks[SH_type_unused] = {
NULL, /* none */
SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* l1_32 */
SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* fl1_32 */
@@ -2408,13 +2453,13 @@ int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
NULL /* unused */
};
- static unsigned int callback_mask =
- 1 << SH_type_l1_32_shadow
- | 1 << SH_type_fl1_32_shadow
- | 1 << SH_type_l1_pae_shadow
- | 1 << SH_type_fl1_pae_shadow
- | 1 << SH_type_l1_64_shadow
- | 1 << SH_type_fl1_64_shadow
+ static const unsigned int callback_mask =
+ SHF_L1_32
+ | SHF_FL1_32
+ | SHF_L1_PAE
+ | SHF_FL1_PAE
+ | SHF_L1_64
+ | SHF_FL1_64
;
perfc_incr(shadow_mappings);
@@ -2424,35 +2469,35 @@ int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
/* Although this is an externally visible function, we do not know
* whether the paging lock will be held when it is called (since it
* can be called via put_page_type when we clear a shadow l1e).*/
- paging_lock_recursive(v->domain);
+ paging_lock_recursive(d);
- /* XXX TODO:
+ /* XXX TODO:
* Heuristics for finding the (probably) single mapping of this gmfn */
-
+
/* Brute-force search of all the shadows, by walking the hash */
perfc_incr(shadow_mappings_bf);
- hash_foreach(v, callback_mask, callbacks, gmfn);
+ hash_domain_foreach(d, callback_mask, callbacks, gmfn);
/* If that didn't catch the mapping, something is very wrong */
if ( !sh_check_page_has_no_refs(page) )
{
- /* Don't complain if we're in HVM and there are some extra mappings:
- * The qemu helper process has an untyped mapping of this dom's RAM
+ /* Don't complain if we're in HVM and there are some extra mappings:
+ * The qemu helper process has an untyped mapping of this dom's RAM
* and the HVM restore program takes another.
* Also allow one typed refcount for xenheap pages, to match
* share_xen_page_with_guest(). */
- if ( !(shadow_mode_external(v->domain)
+ if ( !(shadow_mode_external(d)
&& (page->count_info & PGC_count_mask) <= 3
&& ((page->u.inuse.type_info & PGT_count_mask)
== !!is_xen_heap_page(page))) )
{
SHADOW_ERROR("can't find all mappings of mfn %lx: "
- "c=%08lx t=%08lx\n", mfn_x(gmfn),
+ "c=%08lx t=%08lx\n", mfn_x(gmfn),
page->count_info, page->u.inuse.type_info);
}
}
- paging_unlock(v->domain);
+ paging_unlock(d);
/* We killed at least one mapping, so must flush TLBs. */
return 1;
@@ -2462,7 +2507,7 @@ int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
/**************************************************************************/
/* Remove all shadows of a guest frame from the shadow tables */
-static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
+static int sh_remove_shadow_via_pointer(struct domain *d, mfn_t smfn)
/* Follow this shadow's up-pointer, if it has one, and remove the reference
* found there. Returns 1 if that was the only reference to this shadow */
{
@@ -2473,16 +2518,16 @@ static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
ASSERT(sp->u.sh.type > 0);
ASSERT(sp->u.sh.type < SH_type_max_shadow);
- ASSERT(sh_type_has_up_pointer(v, sp->u.sh.type));
-
+ ASSERT(sh_type_has_up_pointer(d, sp->u.sh.type));
+
if (sp->up == 0) return 0;
pmfn = _mfn(sp->up >> PAGE_SHIFT);
ASSERT(mfn_valid(pmfn));
- vaddr = sh_map_domain_page(pmfn);
+ vaddr = map_domain_page(pmfn);
ASSERT(vaddr);
vaddr += sp->up & (PAGE_SIZE-1);
ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
-
+
/* Is this the only reference to this shadow? */
rc = (sp->u.sh.count == 1) ? 1 : 0;
@@ -2491,24 +2536,24 @@ static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
{
case SH_type_l1_32_shadow:
case SH_type_l2_32_shadow:
- SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(v, vaddr, pmfn);
+ SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(d, vaddr, pmfn);
break;
case SH_type_l1_pae_shadow:
case SH_type_l2_pae_shadow:
case SH_type_l2h_pae_shadow:
- SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(v, vaddr, pmfn);
+ SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(d, vaddr, pmfn);
break;
case SH_type_l1_64_shadow:
case SH_type_l2_64_shadow:
case SH_type_l2h_64_shadow:
case SH_type_l3_64_shadow:
case SH_type_l4_64_shadow:
- SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(v, vaddr, pmfn);
+ SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(d, vaddr, pmfn);
break;
default: BUG(); /* Some weird unknown shadow type */
}
-
- sh_unmap_domain_page(vaddr);
+
+ unmap_domain_page(vaddr);
if ( rc )
perfc_incr(shadow_up_pointer);
else
@@ -2517,9 +2562,9 @@ static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
return rc;
}
-void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
-/* Remove the shadows of this guest page.
- * If fast != 0, just try the quick heuristic, which will remove
+void sh_remove_shadows(struct domain *d, mfn_t gmfn, int fast, int all)
+/* Remove the shadows of this guest page.
+ * If fast != 0, just try the quick heuristic, which will remove
* at most one reference to each shadow of the page. Otherwise, walk
* all the shadow tables looking for refs to shadows of this gmfn.
* If all != 0, kill the domain if we can't find all the shadows.
@@ -2529,10 +2574,10 @@ void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
struct page_info *pg = mfn_to_page(gmfn);
mfn_t smfn;
unsigned char t;
-
+
/* Dispatch table for getting per-type functions: each level must
* be called with the function to remove a lower-level shadow. */
- static const hash_callback_t callbacks[SH_type_unused] = {
+ static const hash_domain_callback_t callbacks[SH_type_unused] = {
NULL, /* none */
NULL, /* l1_32 */
NULL, /* fl1_32 */
@@ -2552,22 +2597,20 @@ void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
};
/* Another lookup table, for choosing which mask to use */
- static unsigned int masks[SH_type_unused] = {
+ static const unsigned int masks[SH_type_unused] = {
0, /* none */
- 1 << SH_type_l2_32_shadow, /* l1_32 */
+ SHF_L2_32, /* l1_32 */
0, /* fl1_32 */
0, /* l2_32 */
- ((1 << SH_type_l2h_pae_shadow)
- | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
+ SHF_L2H_PAE | SHF_L2_PAE, /* l1_pae */
0, /* fl1_pae */
0, /* l2_pae */
0, /* l2h_pae */
- ((1 << SH_type_l2h_64_shadow)
- | (1 << SH_type_l2_64_shadow)), /* l1_64 */
+ SHF_L2H_64 | SHF_L2_64, /* l1_64 */
0, /* fl1_64 */
- 1 << SH_type_l3_64_shadow, /* l2_64 */
- 1 << SH_type_l3_64_shadow, /* l2h_64 */
- 1 << SH_type_l4_64_shadow, /* l3_64 */
+ SHF_L3_64, /* l2_64 */
+ SHF_L3_64, /* l2h_64 */
+ SHF_L4_64, /* l3_64 */
0, /* l4_64 */
0, /* p2m */
0 /* unused */
@@ -2579,15 +2622,14 @@ void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
/* Although this is an externally visible function, we do not know
* whether the paging lock will be held when it is called (since it
* can be called via put_page_type when we clear a shadow l1e).*/
- paging_lock_recursive(v->domain);
+ paging_lock_recursive(d);
- SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
- v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+ SHADOW_PRINTK("d=%d: gmfn=%lx\n", d->domain_id, mfn_x(gmfn));
/* Bail out now if the page is not shadowed */
if ( (pg->count_info & PGC_page_table) == 0 )
{
- paging_unlock(v->domain);
+ paging_unlock(d);
return;
}
@@ -2595,7 +2637,7 @@ void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
perfc_incr(shadow_unshadow);
/* Lower-level shadows need to be excised from upper-level shadows.
- * This call to hash_foreach() looks dangerous but is in fact OK: each
+ * This call to hash_domain_foreach() looks dangerous but is in fact OK: each
* call will remove at most one shadow, and terminate immediately when
* it does remove it, so we never walk the hash after doing a deletion. */
#define DO_UNSHADOW(_type) do { \
@@ -2603,7 +2645,7 @@ void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
if( !(pg->count_info & PGC_page_table) \
|| !(pg->shadow_flags & (1 << t)) ) \
break; \
- smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
+ smfn = shadow_hash_lookup(d, mfn_x(gmfn), t); \
if ( unlikely(!mfn_valid(smfn)) ) \
{ \
SHADOW_ERROR(": gmfn %#lx has flags %#"PRIx32 \
@@ -2611,14 +2653,14 @@ void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \
break; \
} \
- if ( sh_type_is_pinnable(v, t) ) \
- sh_unpin(v, smfn); \
- else if ( sh_type_has_up_pointer(v, t) ) \
- sh_remove_shadow_via_pointer(v, smfn); \
+ if ( sh_type_is_pinnable(d, t) ) \
+ sh_unpin(d, smfn); \
+ else if ( sh_type_has_up_pointer(d, t) ) \
+ sh_remove_shadow_via_pointer(d, smfn); \
if( !fast \
&& (pg->count_info & PGC_page_table) \
&& (pg->shadow_flags & (1 << t)) ) \
- hash_foreach(v, masks[t], callbacks, smfn); \
+ hash_domain_foreach(d, masks[t], callbacks, smfn); \
} while (0)
DO_UNSHADOW(SH_type_l2_32_shadow);
@@ -2640,35 +2682,35 @@ void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
SHADOW_ERROR("can't find all shadows of mfn %05lx "
"(shadow_flags=%08x)\n",
mfn_x(gmfn), pg->shadow_flags);
- domain_crash(v->domain);
+ domain_crash(d);
}
- /* Need to flush TLBs now, so that linear maps are safe next time we
+ /* Need to flush TLBs now, so that linear maps are safe next time we
* take a fault. */
- flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ flush_tlb_mask(d->domain_dirty_cpumask);
- paging_unlock(v->domain);
+ paging_unlock(d);
}
static void
-sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
+sh_remove_all_shadows_and_parents(struct domain *d, mfn_t gmfn)
/* Even harsher: this is an HVM page that we think is no longer a pagetable.
* Unshadow it, and recursively unshadow pages that reference it. */
{
- sh_remove_shadows(v, gmfn, 0, 1);
+ sh_remove_shadows(d, gmfn, 0, 1);
/* XXX TODO:
- * Rework this hashtable walker to return a linked-list of all
- * the shadows it modified, then do breadth-first recursion
- * to find the way up to higher-level tables and unshadow them too.
+ * Rework this hashtable walker to return a linked-list of all
+ * the shadows it modified, then do breadth-first recursion
+ * to find the way up to higher-level tables and unshadow them too.
*
* The current code (just tearing down each page's shadows as we
- * detect that it is not a pagetable) is correct, but very slow.
+ * detect that it is not a pagetable) is correct, but very slow.
* It means extra emulated writes and slows down removal of mappings. */
}
/**************************************************************************/
-/* Reset the up-pointers of every L3 shadow to 0.
+/* Reset the up-pointers of every L3 shadow to 0.
* This is called when l3 shadows stop being pinnable, to clear out all
* the list-head bits so the up-pointer field is properly initialised. */
static int sh_clear_up_pointer(struct vcpu *v, mfn_t smfn, mfn_t unused)
@@ -2679,7 +2721,7 @@ static int sh_clear_up_pointer(struct vcpu *v, mfn_t smfn, mfn_t unused)
void sh_reset_l3_up_pointers(struct vcpu *v)
{
- static hash_callback_t callbacks[SH_type_unused] = {
+ static const hash_vcpu_callback_t callbacks[SH_type_unused] = {
NULL, /* none */
NULL, /* l1_32 */
NULL, /* fl1_32 */
@@ -2697,9 +2739,9 @@ void sh_reset_l3_up_pointers(struct vcpu *v)
NULL, /* p2m */
NULL /* unused */
};
- static unsigned int callback_mask = 1 << SH_type_l3_64_shadow;
+ static const unsigned int callback_mask = SHF_L3_64;
- hash_foreach(v, callback_mask, callbacks, _mfn(INVALID_MFN));
+ hash_vcpu_foreach(v, callback_mask, callbacks, _mfn(INVALID_MFN));
}
@@ -2712,7 +2754,7 @@ static void sh_update_paging_modes(struct vcpu *v)
ASSERT(paging_locked_by_me(d));
-#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
/* Make sure this vcpu has a virtual TLB array allocated */
if ( unlikely(!v->arch.paging.vtlb) )
{
@@ -2728,7 +2770,7 @@ static void sh_update_paging_modes(struct vcpu *v)
}
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
{
int i;
@@ -2769,7 +2811,7 @@ static void sh_update_paging_modes(struct vcpu *v)
ASSERT(shadow_mode_translate(d));
ASSERT(shadow_mode_external(d));
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Need to resync all our pages now, because if a page goes out
* of sync with paging enabled and is resynced with paging
* disabled, the resync will go wrong. */
@@ -2828,7 +2870,7 @@ static void sh_update_paging_modes(struct vcpu *v)
/* Need to make a new monitor table for the new mode */
mfn_t new_mfn, old_mfn;
- if ( v != current && vcpu_runnable(v) )
+ if ( v != current && vcpu_runnable(v) )
{
SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
"this HVM vcpu's (d=%u v=%u) paging mode "
@@ -2848,7 +2890,7 @@ static void sh_update_paging_modes(struct vcpu *v)
SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
mfn_x(new_mfn));
- /* Don't be running on the old monitor table when we
+ /* Don't be running on the old monitor table when we
* pull it down! Switch CR3, and warn the HVM code that
* its host cr3 has changed. */
make_cr3(v, mfn_x(new_mfn));
@@ -2915,9 +2957,9 @@ static void sh_new_mode(struct domain *d, u32 new_mode)
int shadow_enable(struct domain *d, u32 mode)
/* Turn on "permanent" shadow features: external, translate, refcount.
* Can only be called once on a domain, and these features cannot be
- * disabled.
+ * disabled.
* Returns 0 for success, -errno for failure. */
-{
+{
unsigned int old_pages;
struct page_info *pg = NULL;
uint32_t *e;
@@ -2929,8 +2971,7 @@ int shadow_enable(struct domain *d, u32 mode)
domain_pause(d);
/* Sanity check the arguments */
- if ( (d == current->domain) ||
- shadow_mode_enabled(d) ||
+ if ( shadow_mode_enabled(d) ||
((mode & PG_translate) && !(mode & PG_refcounts)) ||
((mode & PG_external) && !(mode & PG_translate)) )
{
@@ -2943,14 +2984,14 @@ int shadow_enable(struct domain *d, u32 mode)
if ( old_pages == 0 )
{
unsigned int r;
- paging_lock(d);
+ paging_lock(d);
r = sh_set_allocation(d, 1024, NULL); /* Use at least 4MB */
if ( r != 0 )
{
sh_set_allocation(d, 0, NULL);
rv = -ENOMEM;
goto out_locked;
- }
+ }
paging_unlock(d);
}
@@ -2958,7 +2999,7 @@ int shadow_enable(struct domain *d, u32 mode)
d->arch.paging.alloc_page = shadow_alloc_p2m_page;
d->arch.paging.free_page = shadow_free_p2m_page;
- /* Init the P2M table. Must be done before we take the paging lock
+ /* Init the P2M table. Must be done before we take the paging lock
* to avoid possible deadlock. */
if ( mode & PG_translate )
{
@@ -2971,7 +3012,7 @@ int shadow_enable(struct domain *d, u32 mode)
* have paging disabled */
if ( is_hvm_domain(d) )
{
- /* Get a single page from the shadow pool. Take it via the
+ /* Get a single page from the shadow pool. Take it via the
* P2M interface to make freeing it simpler afterwards. */
pg = shadow_alloc_p2m_page(d);
if ( pg == NULL )
@@ -2980,13 +3021,13 @@ int shadow_enable(struct domain *d, u32 mode)
goto out_unlocked;
}
/* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
- * of virtual address space onto the same physical address range */
+ * of virtual address space onto the same physical address range */
e = __map_domain_page(pg);
for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
e[i] = ((0x400000U * i)
- | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
+ | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
| _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
- sh_unmap_domain_page(e);
+ unmap_domain_page(e);
pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
}
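
The loop above builds the single pagetable an HVM guest runs on while its own paging is still disabled: 1024 32-bit PSE entries, each mapping a 4MB superpage onto the identical physical range. The same arithmetic in isolation (flag values are the standard x86 page-table bits; the caller is assumed to supply one zeroed 4KiB page):

    #include <stdint.h>

    #define PAGE_SIZE       4096
    #define _PAGE_PRESENT   0x001u
    #define _PAGE_RW        0x002u
    #define _PAGE_USER      0x004u
    #define _PAGE_ACCESSED  0x020u
    #define _PAGE_DIRTY     0x040u
    #define _PAGE_PSE       0x080u

    /* Fill one 4KiB page with 1024 PSE entries: entry i maps virtual
     * 4MiB chunk i onto physical 4MiB chunk i (an identity map). */
    static void fill_unpaged_l2(uint32_t *e)
    {
        for ( unsigned int i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
            e[i] = (0x400000u * i)
                   | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
                   | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE;
    }
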
@@ -3006,8 +3047,8 @@ int shadow_enable(struct domain *d, u32 mode)
goto out_locked;
}
-#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
- /* We assume we're dealing with an older 64bit linux guest until we
+#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
+ /* We assume we're dealing with an older 64bit linux guest until we
* see the guest use more than one l4 per vcpu. */
d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
#endif
@@ -3030,7 +3071,7 @@ int shadow_enable(struct domain *d, u32 mode)
return rv;
}
-void shadow_teardown(struct domain *d)
+void shadow_teardown(struct domain *d, int *preempted)
/* Destroy the shadow pagetables of this domain and free its shadow memory.
* Should only be called for dying domains. */
{
@@ -3074,7 +3115,7 @@ void shadow_teardown(struct domain *d)
}
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
{
int i;
mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
@@ -3091,37 +3132,28 @@ void shadow_teardown(struct domain *d)
if ( d->arch.paging.shadow.total_pages != 0 )
{
- SHADOW_PRINTK("teardown of domain %u starts."
- " Shadow pages total = %u, free = %u, p2m=%u\n",
- d->domain_id,
- d->arch.paging.shadow.total_pages,
- d->arch.paging.shadow.free_pages,
- d->arch.paging.shadow.p2m_pages);
/* Destroy all the shadows and release memory to domheap */
- sh_set_allocation(d, 0, NULL);
+ sh_set_allocation(d, 0, preempted);
+
+ if ( preempted && *preempted )
+ goto out;
+
/* Release the hash table back to xenheap */
- if (d->arch.paging.shadow.hash_table)
+ if (d->arch.paging.shadow.hash_table)
shadow_hash_teardown(d);
- /* Should not have any more memory held */
- SHADOW_PRINTK("teardown done."
- " Shadow pages total = %u, free = %u, p2m=%u\n",
- d->arch.paging.shadow.total_pages,
- d->arch.paging.shadow.free_pages,
- d->arch.paging.shadow.p2m_pages);
+
ASSERT(d->arch.paging.shadow.total_pages == 0);
}
- /* Free the non-paged-vcpus pagetable; must happen after we've
+ /* Free the non-paged-vcpus pagetable; must happen after we've
* destroyed any shadows of it or sh_destroy_shadow will get confused. */
if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) )
{
+ ASSERT(is_hvm_domain(d));
for_each_vcpu(d, v)
- {
- ASSERT(is_hvm_vcpu(v));
if ( !hvm_paging_enabled(v) )
v->arch.guest_table = pagetable_null();
- }
- unpaged_pagetable =
+ unpaged_pagetable =
pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable);
d->arch.paging.shadow.unpaged_pagetable = pagetable_null();
}
@@ -3138,10 +3170,11 @@ void shadow_teardown(struct domain *d)
d->arch.hvm_domain.dirty_vram = NULL;
}
+out:
paging_unlock(d);
/* Must be called outside the lock */
- if ( unpaged_pagetable )
+ if ( unpaged_pagetable )
shadow_free_p2m_page(d, unpaged_pagetable);
}
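
The new preempted out-parameter is the point of this hunk: shadow_teardown() can now give up the CPU part-way through instead of freeing an arbitrarily large shadow pool under the paging lock. A minimal sketch of the contract between caller and callee (names and the budget check are illustrative):

    #include <stdbool.h>

    /* Stand-ins for the real allocator state and budget check. */
    static unsigned long pages_to_free = 1000000;
    static bool budget_exhausted(unsigned long done) { return done >= 4096; }

    /* Free pages until done or out of budget; on preemption, report it
     * through *preempted so the caller can reschedule and call again. */
    static void set_allocation(unsigned long target, int *preempted)
    {
        unsigned long done = 0;
        while ( pages_to_free > target )
        {
            pages_to_free--;                 /* free one page */
            if ( preempted && budget_exhausted(++done) )
            {
                *preempted = 1;              /* caller must retry later */
                return;
            }
        }
    }

    void teardown(int *preempted)
    {
        set_allocation(0, preempted);
        if ( preempted && *preempted )
            return;                          /* resume on the next call */
        /* ...only now safe to free the hash table etc. ... */
    }
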
@@ -3151,15 +3184,15 @@ void shadow_final_teardown(struct domain *d)
SHADOW_PRINTK("dom %u final teardown starts."
" Shadow pages total = %u, free = %u, p2m=%u\n",
d->domain_id,
- d->arch.paging.shadow.total_pages,
- d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.total_pages,
+ d->arch.paging.shadow.free_pages,
d->arch.paging.shadow.p2m_pages);
- /* Double-check that the domain didn't have any shadow memory.
+ /* Double-check that the domain didn't have any shadow memory.
* It is possible for a domain that never got domain_kill()ed
* to get here with its shadow allocation intact. */
if ( d->arch.paging.shadow.total_pages != 0 )
- shadow_teardown(d);
+ shadow_teardown(d, NULL);
/* It is now safe to pull down the p2m map. */
p2m_teardown(p2m_get_hostp2m(d));
@@ -3169,8 +3202,8 @@ void shadow_final_teardown(struct domain *d)
SHADOW_PRINTK("dom %u final teardown done."
" Shadow pages total = %u, free = %u, p2m=%u\n",
d->domain_id,
- d->arch.paging.shadow.total_pages,
- d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.total_pages,
+ d->arch.paging.shadow.free_pages,
d->arch.paging.shadow.p2m_pages);
paging_unlock(d);
}
@@ -3215,7 +3248,7 @@ static int shadow_one_bit_enable(struct domain *d, u32 mode)
return 0;
}
-static int shadow_one_bit_disable(struct domain *d, u32 mode)
+static int shadow_one_bit_disable(struct domain *d, u32 mode)
/* Turn off a single shadow mode feature */
{
struct vcpu *v;
@@ -3235,8 +3268,8 @@ static int shadow_one_bit_disable(struct domain *d, u32 mode)
SHADOW_PRINTK("un-shadowing of domain %u starts."
" Shadow pages total = %u, free = %u, p2m=%u\n",
d->domain_id,
- d->arch.paging.shadow.total_pages,
- d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.total_pages,
+ d->arch.paging.shadow.free_pages,
d->arch.paging.shadow.p2m_pages);
for_each_vcpu(d, v)
{
@@ -3247,7 +3280,7 @@ static int shadow_one_bit_disable(struct domain *d, u32 mode)
else
make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
{
int i;
mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
@@ -3268,8 +3301,8 @@ static int shadow_one_bit_disable(struct domain *d, u32 mode)
SHADOW_PRINTK("un-shadowing of domain %u done."
" Shadow pages total = %u, free = %u, p2m=%u\n",
d->domain_id,
- d->arch.paging.shadow.total_pages,
- d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.total_pages,
+ d->arch.paging.shadow.free_pages,
d->arch.paging.shadow.p2m_pages);
}
@@ -3307,7 +3340,7 @@ static int shadow_test_disable(struct domain *d)
/* P2M map manipulations */
/* shadow specific code which should be called when P2M table entry is updated
- * with new content. It is responsible for updating the entry, as well as other
+ * with new content. It is responsible for updating the entry, as well as other
* shadow processing jobs.
*/
@@ -3315,11 +3348,6 @@ static void sh_unshadow_for_p2m_change(struct domain *d, unsigned long gfn,
l1_pgentry_t *p, l1_pgentry_t new,
unsigned int level)
{
- struct vcpu *v = current;
-
- if ( v->domain != d )
- v = d->vcpu ? d->vcpu[0] : NULL;
-
/* The following assertion is to make sure we don't step on 1GB host
 * page support of HVM guests. */
ASSERT(!(level > 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) &&
@@ -3330,16 +3358,16 @@ static void sh_unshadow_for_p2m_change(struct domain *d, unsigned long gfn,
{
mfn_t mfn = _mfn(l1e_get_pfn(*p));
p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p));
- if ( (p2m_is_valid(p2mt) || p2m_is_grant(p2mt)) && mfn_valid(mfn) )
+ if ( (p2m_is_valid(p2mt) || p2m_is_grant(p2mt)) && mfn_valid(mfn) )
{
- sh_remove_all_shadows_and_parents(v, mfn);
- if ( sh_remove_all_mappings(v, mfn) )
+ sh_remove_all_shadows_and_parents(d, mfn);
+ if ( sh_remove_all_mappings(d, mfn) )
flush_tlb_mask(d->domain_dirty_cpumask);
}
}
- /* If we're removing a superpage mapping from the p2m, we need to check
- * all the pages covered by it. If they're still there in the new
+ /* If we're removing a superpage mapping from the p2m, we need to check
+ * all the pages covered by it. If they're still there in the new
* scheme, that's OK, but otherwise they must be unshadowed. */
if ( level == 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) &&
(l1e_get_flags(*p) & _PAGE_PSE) )
@@ -3356,26 +3384,26 @@ static void sh_unshadow_for_p2m_change(struct domain *d, unsigned long gfn,
/* If we're replacing a superpage with a normal L1 page, map it */
if ( (l1e_get_flags(new) & _PAGE_PRESENT)
- && !(l1e_get_flags(new) & _PAGE_PSE)
+ && !(l1e_get_flags(new) & _PAGE_PSE)
&& mfn_valid(nmfn) )
- npte = map_domain_page(mfn_x(nmfn));
-
+ npte = map_domain_page(nmfn);
+
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
{
- if ( !npte
+ if ( !npte
|| !p2m_is_ram(p2m_flags_to_type(l1e_get_flags(npte[i])))
|| l1e_get_pfn(npte[i]) != mfn_x(omfn) )
{
/* This GFN->MFN mapping has gone away */
- sh_remove_all_shadows_and_parents(v, omfn);
- if ( sh_remove_all_mappings(v, omfn) )
+ sh_remove_all_shadows_and_parents(d, omfn);
+ if ( sh_remove_all_mappings(d, omfn) )
cpumask_or(&flushmask, &flushmask,
d->domain_dirty_cpumask);
}
omfn = _mfn(mfn_x(omfn) + 1);
}
flush_tlb_mask(&flushmask);
-
+
if ( npte )
unmap_domain_page(npte);
}
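
Dropping the struct vcpu detour lets this path run correctly even when no vcpu of the target domain is current. The superpage check itself walks every frame the old 2MB/4MB mapping covered and unshadows only those whose GFN-to-MFN mapping really went away; reduced to its core, with illustrative helpers standing in for the p2m accessors:

    #include <stdbool.h>
    #include <stdint.h>

    #define L1_ENTRIES 512

    /* Illustrative stand-ins: the real code reads p2m entries and calls
     * sh_remove_all_shadows_and_parents()/sh_remove_all_mappings(). */
    extern bool entry_is_ram(uint64_t e);
    extern uint64_t entry_mfn(uint64_t e);
    extern void unshadow_mfn(uint64_t mfn);

    /* old_mfn0: first frame of the superpage being replaced;
     * new_l1: the replacement L1 table, or NULL if nothing maps here now. */
    static void check_superpage_replacement(uint64_t old_mfn0,
                                            const uint64_t *new_l1)
    {
        for ( unsigned int i = 0; i < L1_ENTRIES; i++ )
        {
            uint64_t omfn = old_mfn0 + i;
            /* Keep the shadow only if the same frame is still mapped
             * at the same slot in the new table. */
            if ( !new_l1 || !entry_is_ram(new_l1[i]) ||
                 entry_mfn(new_l1[i]) != omfn )
                unshadow_mfn(omfn);
        }
    }
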
@@ -3390,7 +3418,7 @@ shadow_write_p2m_entry(struct domain *d, unsigned long gfn,
paging_lock(d);
/* If there are any shadows, update them. But if shadow_teardown()
- * has already been called then it's not safe to try. */
+ * has already been called then it's not safe to try. */
if ( likely(d->arch.paging.shadow.total_pages != 0) )
sh_unshadow_for_p2m_change(d, gfn, p, new, level);
@@ -3420,15 +3448,15 @@ shadow_write_p2m_entry(struct domain *d, unsigned long gfn,
/* Shadow specific code which is called in paging_log_dirty_enable().
* Return 0 if no problem found.
*/
-int shadow_enable_log_dirty(struct domain *d, bool_t log_global)
+static int sh_enable_log_dirty(struct domain *d, bool_t log_global)
{
int ret;
paging_lock(d);
if ( shadow_mode_enabled(d) )
{
- /* This domain already has some shadows: need to clear them out
- * of the way to make sure that all references to guest memory are
+ /* This domain already has some shadows: need to clear them out
+ * of the way to make sure that all references to guest memory are
* properly write-protected */
shadow_blow_tables(d);
}
@@ -3437,10 +3465,10 @@ int shadow_enable_log_dirty(struct domain *d, bool_t log_global)
/* 32bit PV guests on 64bit xen behave like older 64bit linux: they
* change an l4e instead of cr3 to switch tables. Give them the
* same optimization */
- if ( is_pv_32on64_domain(d) )
+ if ( is_pv_32bit_domain(d) )
d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
#endif
-
+
ret = shadow_one_bit_enable(d, PG_log_dirty);
paging_unlock(d);
@@ -3448,21 +3476,21 @@ int shadow_enable_log_dirty(struct domain *d, bool_t log_global)
}
/* shadow specific code which is called in paging_log_dirty_disable() */
-int shadow_disable_log_dirty(struct domain *d)
+static int sh_disable_log_dirty(struct domain *d)
{
int ret;
paging_lock(d);
ret = shadow_one_bit_disable(d, PG_log_dirty);
paging_unlock(d);
-
+
return ret;
}
-/* This function is called when we CLEAN log dirty bitmap. See
- * paging_log_dirty_op() for details.
+/* This function is called when we CLEAN log dirty bitmap. See
+ * paging_log_dirty_op() for details.
*/
-void shadow_clean_dirty_bitmap(struct domain *d)
+static void sh_clean_dirty_bitmap(struct domain *d)
{
paging_lock(d);
/* Need to revoke write access to the domain's pages again.
@@ -3478,9 +3506,9 @@ void shadow_clean_dirty_bitmap(struct domain *d)
int shadow_track_dirty_vram(struct domain *d,
unsigned long begin_pfn,
unsigned long nr,
- XEN_GUEST_HANDLE_64(uint8) dirty_bitmap)
+ XEN_GUEST_HANDLE_64(uint8) guest_dirty_bitmap)
{
- int rc;
+ int rc = 0;
unsigned long end_pfn = begin_pfn + nr;
unsigned long dirty_size = (nr + 7) / 8;
int flush_tlb = 0;
@@ -3488,6 +3516,7 @@ int shadow_track_dirty_vram(struct domain *d,
p2m_type_t t;
struct sh_dirty_vram *dirty_vram;
struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ uint8_t *dirty_bitmap = NULL;
if ( end_pfn < begin_pfn || end_pfn > p2m->max_mapped_pfn + 1 )
return -EINVAL;
@@ -3511,16 +3540,19 @@ int shadow_track_dirty_vram(struct domain *d,
}
if ( !nr )
+ goto out;
+
+ dirty_bitmap = vzalloc(dirty_size);
+ if ( dirty_bitmap == NULL )
{
- rc = 0;
+ rc = -ENOMEM;
goto out;
}
-
/* This should happen seldom (Video mode change),
* no need to be careful. */
if ( !dirty_vram )
{
- /* Throw away all the shadows rather than walking through them
+ /* Throw away all the shadows rather than walking through them
* up to nr times getting rid of mappings of each pfn */
shadow_blow_tables(d);
@@ -3546,12 +3578,8 @@ int shadow_track_dirty_vram(struct domain *d,
rc = -ENODATA;
}
else if (dirty_vram->last_dirty == -1)
- {
/* still completely clean, just copy our empty bitmap */
- rc = -EFAULT;
- if ( copy_to_guest(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size) == 0 )
- rc = 0;
- }
+ memcpy(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size);
else
{
unsigned long map_mfn = INVALID_MFN;
@@ -3584,7 +3612,7 @@ int shadow_track_dirty_vram(struct domain *d,
dirty = 1;
/* TODO: Heuristics for finding the single mapping of
* this gmfn */
- flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn);
+ flush_tlb |= sh_remove_all_mappings(d, mfn);
}
else
{
@@ -3596,8 +3624,8 @@ int shadow_track_dirty_vram(struct domain *d,
if ( sl1mfn != map_mfn )
{
if ( map_sl1p )
- sh_unmap_domain_page(map_sl1p);
- map_sl1p = sh_map_domain_page(_mfn(sl1mfn));
+ unmap_domain_page(map_sl1p);
+ map_sl1p = map_domain_page(_mfn(sl1mfn));
map_mfn = sl1mfn;
}
sl1e = map_sl1p + (sl1ma & ~PAGE_MASK);
@@ -3628,23 +3656,21 @@ int shadow_track_dirty_vram(struct domain *d,
}
if ( map_sl1p )
- sh_unmap_domain_page(map_sl1p);
+ unmap_domain_page(map_sl1p);
- rc = -EFAULT;
- if ( copy_to_guest(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size) == 0 ) {
- memset(dirty_vram->dirty_bitmap, 0, dirty_size);
- if (dirty_vram->last_dirty + SECONDS(2) < NOW())
+ memcpy(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size);
+ memset(dirty_vram->dirty_bitmap, 0, dirty_size);
+ if ( dirty_vram->last_dirty + SECONDS(2) < NOW() )
+ {
+ /* was clean for more than two seconds, try to disable guest
+ * write access */
+ for ( i = begin_pfn; i < end_pfn; i++ )
{
- /* was clean for more than two seconds, try to disable guest
- * write access */
- for ( i = begin_pfn; i < end_pfn; i++ ) {
- mfn_t mfn = get_gfn_query_unlocked(d, i, &t);
- if (mfn_x(mfn) != INVALID_MFN)
- flush_tlb |= sh_remove_write_access(d->vcpu[0], mfn, 1, 0);
- }
- dirty_vram->last_dirty = -1;
+ mfn_t mfn = get_gfn_query_unlocked(d, i, &t);
+ if ( mfn_x(mfn) != INVALID_MFN )
+ flush_tlb |= sh_remove_write_access(d, mfn, 1, 0);
}
- rc = 0;
+ dirty_vram->last_dirty = -1;
}
}
if ( flush_tlb )
@@ -3659,6 +3685,16 @@ out_dirty_vram:
out:
paging_unlock(d);
+ if ( rc == 0 && dirty_bitmap != NULL &&
+ copy_to_guest(guest_dirty_bitmap, dirty_bitmap, dirty_size) )
+ {
+ paging_lock(d);
+ for ( i = 0; i < dirty_size; i++ )
+ dirty_vram->dirty_bitmap[i] |= dirty_bitmap[i];
+ paging_unlock(d);
+ rc = -EFAULT;
+ }
+ vfree(dirty_bitmap);
p2m_unlock(p2m_get_hostp2m(d));
return rc;
}
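
This is the behavioural heart of the hunk: copy_to_guest() may fault and cannot safely run under the paging lock, so the bitmap is snapshotted into a vzalloc'd buffer under the lock, copied out after unlocking, and OR-ed back in on failure so no dirty bits are lost. The shape of that pattern, with a pthread mutex standing in for the paging lock and copy_out() for copy_to_guest():

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned char state_bitmap[64];    /* protected by 'lock' */

    /* Stand-in for copy_to_guest(): returns nonzero on failure. */
    extern int copy_out(void *user_dst, const void *src, size_t n);

    int read_and_clear_bitmap(void *user_dst)
    {
        size_t n = sizeof(state_bitmap);
        unsigned char *snap = calloc(1, n);
        int rc = 0;

        if ( !snap )
            return -1;                        /* -ENOMEM in the original */

        pthread_mutex_lock(&lock);
        memcpy(snap, state_bitmap, n);        /* snapshot under the lock */
        memset(state_bitmap, 0, n);           /* ...and clear atomically */
        pthread_mutex_unlock(&lock);

        if ( copy_out(user_dst, snap, n) )
        {
            /* Copy failed: merge the snapshot back so the dirty bits
             * are reported again on the next call. */
            pthread_mutex_lock(&lock);
            for ( size_t i = 0; i < n; i++ )
                state_bitmap[i] |= snap[i];
            pthread_mutex_unlock(&lock);
            rc = -1;                          /* -EFAULT in the original */
        }

        free(snap);
        return rc;
    }
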
@@ -3666,7 +3702,7 @@ out:
/**************************************************************************/
/* Shadow-control XEN_DOMCTL dispatcher */
-int shadow_domctl(struct domain *d,
+int shadow_domctl(struct domain *d,
xen_domctl_shadow_op_t *sc,
XEN_GUEST_HANDLE_PARAM(void) u_domctl)
{
@@ -3676,7 +3712,7 @@ int shadow_domctl(struct domain *d,
{
case XEN_DOMCTL_SHADOW_OP_OFF:
if ( d->arch.paging.mode == PG_SH_enable )
- if ( (rc = shadow_test_disable(d)) != 0 )
+ if ( (rc = shadow_test_disable(d)) != 0 )
return rc;
return 0;
@@ -3696,7 +3732,7 @@ int shadow_domctl(struct domain *d,
case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
paging_lock(d);
if ( sc->mb == 0 && shadow_mode_enabled(d) )
- {
+ {
/* Can't set the allocation to zero unless the domain stops using
* shadow pagetables first */
SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
@@ -3710,7 +3746,7 @@ int shadow_domctl(struct domain *d,
/* Not finished. Set up to re-run the call. */
rc = hypercall_create_continuation(
__HYPERVISOR_domctl, "h", u_domctl);
- else
+ else
/* Finished. Return the new allocation */
sc->mb = shadow_get_allocation(d);
return rc;
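
The allocation path above shows the standard preemption idiom from the hypercall side of the fence: when the worker reports it ran out of budget, the handler re-queues the same hypercall rather than looping indefinitely in the hypervisor. Schematically (illustrative names; the real continuation mechanism re-encodes the guest's registers):

    /* Sketch of a preemptible operation driven from a hypercall handler. */
    extern int do_chunk_of_work(int *preempted);    /* illustrative */
    extern long queue_continuation(void);           /* re-arm the call */

    long handle_op(void)
    {
        int preempted = 0;
        long rc = do_chunk_of_work(&preempted);

        if ( rc == 0 && preempted )
            /* Not finished: arrange for the guest to re-issue the same
             * hypercall, which will resume where this chunk stopped. */
            rc = queue_continuation();

        return rc;
    }
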
@@ -3727,10 +3763,10 @@ int shadow_domctl(struct domain *d,
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
-void shadow_audit_tables(struct vcpu *v)
+void shadow_audit_tables(struct vcpu *v)
{
/* Dispatch table for getting per-type functions */
- static const hash_callback_t callbacks[SH_type_unused] = {
+ static const hash_vcpu_callback_t callbacks[SH_type_unused] = {
NULL, /* none */
SHADOW_INTERNAL_NAME(sh_audit_l1_table, 2), /* l1_32 */
SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 2), /* fl1_32 */
@@ -3747,7 +3783,7 @@ void shadow_audit_tables(struct vcpu *v)
SHADOW_INTERNAL_NAME(sh_audit_l4_table, 4), /* l4_64 */
NULL /* All the rest */
};
- unsigned int mask;
+ unsigned int mask;
if ( !(SHADOW_AUDIT_ENABLE) )
return;
@@ -3757,7 +3793,7 @@ void shadow_audit_tables(struct vcpu *v)
#endif
if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
- mask = ~1; /* Audit every table in the system */
+ mask = SHF_page_type_mask; /* Audit every table in the system */
else
{
/* Audit only the current mode's tables */
@@ -3766,13 +3802,13 @@ void shadow_audit_tables(struct vcpu *v)
case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
|SHF_L2H_PAE); break;
- case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
+ case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
|SHF_L3_64|SHF_L4_64); break;
default: BUG();
}
}
- hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN));
+ hash_vcpu_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
}
#endif /* Shadow audit */
@@ -3783,5 +3819,5 @@ void shadow_audit_tables(struct vcpu *v)
* c-file-style: "BSD"
* c-basic-offset: 4
* indent-tabs-mode: nil
- * End:
+ * End:
*/
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 225290e..22081a1 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -1,7 +1,7 @@
/******************************************************************************
* arch/x86/mm/shadow/multi.c
*
- * Simple, mostly-synchronous shadow page tables.
+ * Simple, mostly-synchronous shadow page tables.
* Parts of this code are Copyright (c) 2006 by XenSource Inc.
* Parts of this code are Copyright (c) 2006 by Michael A Fetterman
* Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
@@ -17,8 +17,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -43,25 +42,25 @@
#include "types.h"
/* THINGS TO DO LATER:
- *
+ *
* TEARDOWN HEURISTICS
- * Also: have a heuristic for when to destroy a previous paging-mode's
+ * Also: have a heuristic for when to destroy a previous paging-mode's
* shadows. When a guest is done with its start-of-day 32-bit tables
- * and reuses the memory we want to drop those shadows. Start with
- * shadows in a page in two modes as a hint, but beware of clever tricks
+ * and reuses the memory we want to drop those shadows. Start with
+ * shadows in a page in two modes as a hint, but beware of clever tricks
* like reusing a pagetable for both PAE and 64-bit during boot...
*
* PAE LINEAR MAPS
* Rework shadow_get_l*e() to have the option of using map_domain_page()
- * instead of linear maps. Add appropriate unmap_l*e calls in the users.
- * Then we can test the speed difference made by linear maps. If the
- * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
- * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
- * to share l2h pages again.
+ * instead of linear maps. Add appropriate unmap_l*e calls in the users.
+ * Then we can test the speed difference made by linear maps. If the
+ * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
+ * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
+ * to share l2h pages again.
*
* PSE disabled / PSE36
* We don't support any modes other than PSE enabled, PSE36 disabled.
- * Neither of those would be hard to change, but we'd need to be able to
+ * Neither of those would be hard to change, but we'd need to be able to
* deal with shadows made in one mode and used in another.
*/
@@ -90,80 +89,77 @@ static char *fetch_type_names[] = {
* shadow L1 which maps its "splinters".
*/
-static inline mfn_t
-get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
+static inline mfn_t
+get_fl1_shadow_status(struct domain *d, gfn_t gfn)
/* Look for FL1 shadows in the hash table */
{
- mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
+ mfn_t smfn = shadow_hash_lookup(d, gfn_x(gfn), SH_type_fl1_shadow);
ASSERT(!mfn_valid(smfn) || mfn_to_page(smfn)->u.sh.head);
return smfn;
}
-static inline mfn_t
-get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+static inline mfn_t
+get_shadow_status(struct domain *d, mfn_t gmfn, u32 shadow_type)
/* Look for shadows in the hash table */
{
- mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
+ mfn_t smfn = shadow_hash_lookup(d, mfn_x(gmfn), shadow_type);
ASSERT(!mfn_valid(smfn) || mfn_to_page(smfn)->u.sh.head);
perfc_incr(shadow_get_shadow_status);
return smfn;
}
-static inline void
-set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+static inline void
+set_fl1_shadow_status(struct domain *d, gfn_t gfn, mfn_t smfn)
/* Put an FL1 shadow into the hash table */
{
SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
ASSERT(mfn_to_page(smfn)->u.sh.head);
- shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
+ shadow_hash_insert(d, gfn_x(gfn), SH_type_fl1_shadow, smfn);
}
-static inline void
-set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+static inline void
+set_shadow_status(struct domain *d, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
/* Put a shadow into the hash table */
{
- struct domain *d = v->domain;
int res;
- SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
- d->domain_id, v->vcpu_id, mfn_x(gmfn),
- shadow_type, mfn_x(smfn));
+ SHADOW_PRINTK("d=%d: gmfn=%lx, type=%08x, smfn=%lx\n",
+ d->domain_id, mfn_x(gmfn), shadow_type, mfn_x(smfn));
ASSERT(mfn_to_page(smfn)->u.sh.head);
- /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
- if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
+ /* 32-bit PV guests don't own their l4 pages so can't get_page them */
+ if ( !is_pv_32bit_domain(d) || shadow_type != SH_type_l4_64_shadow )
{
res = get_page(mfn_to_page(gmfn), d);
ASSERT(res == 1);
}
- shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
+ shadow_hash_insert(d, mfn_x(gmfn), shadow_type, smfn);
}
-static inline void
-delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+static inline void
+delete_fl1_shadow_status(struct domain *d, gfn_t gfn, mfn_t smfn)
/* Remove a shadow from the hash table */
{
SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
ASSERT(mfn_to_page(smfn)->u.sh.head);
- shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
+ shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn);
}
-static inline void
-delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+static inline void
+delete_shadow_status(struct domain *d, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
/* Remove a shadow from the hash table */
{
- SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
- v->domain->domain_id, v->vcpu_id,
- mfn_x(gmfn), shadow_type, mfn_x(smfn));
+ SHADOW_PRINTK("d=%d: gmfn=%lx, type=%08x, smfn=%lx\n",
+ d->domain_id, mfn_x(gmfn), shadow_type, mfn_x(smfn));
ASSERT(mfn_to_page(smfn)->u.sh.head);
- shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
- /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
- if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
+ shadow_hash_delete(d, mfn_x(gmfn), shadow_type, smfn);
+ /* 32-bit PV guests don't own their l4 pages; see set_shadow_status */
+ if ( !is_pv_32bit_domain(d) || shadow_type != SH_type_l4_64_shadow )
put_page(mfn_to_page(gmfn));
}
@@ -172,10 +168,10 @@ delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
/* Functions for walking the guest page tables */
static inline uint32_t
-sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
+sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
uint32_t pfec)
{
- return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec,
+ return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec,
#if GUEST_PAGING_LEVELS == 3 /* PAE */
_mfn(INVALID_MFN),
v->arch.paging.shadow.gl3e
@@ -224,16 +220,16 @@ shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
- l3p = sh_map_domain_page(gw->l3mfn);
+ l3p = map_domain_page(gw->l3mfn);
mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
- sh_unmap_domain_page(l3p);
+ unmap_domain_page(l3p);
#else
mismatch |= (gw->l3e.l3 !=
v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
#endif
- l2p = sh_map_domain_page(gw->l2mfn);
+ l2p = map_domain_page(gw->l2mfn);
mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
- sh_unmap_domain_page(l2p);
+ unmap_domain_page(l2p);
#else
l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
@@ -241,9 +237,9 @@ shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
if ( !(guest_supports_superpages(v) &&
(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
{
- l1p = sh_map_domain_page(gw->l1mfn);
+ l1p = map_domain_page(gw->l1mfn);
mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
- sh_unmap_domain_page(l1p);
+ unmap_domain_page(l1p);
}
return !mismatch;
@@ -259,7 +255,7 @@ shadow_check_gl1e(struct vcpu *v, walk_t *gw)
return 0;
/* Can't just pull-through because mfn may have changed */
- l1p = map_domain_page(mfn_x(gw->l1mfn));
+ l1p = map_domain_page(gw->l1mfn);
nl1e.l1 = l1p[guest_l1_table_offset(gw->va)].l1;
unmap_domain_page(l1p);
@@ -281,6 +277,7 @@ shadow_check_gl1e(struct vcpu *v, walk_t *gw)
static inline uint32_t
gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
{
+ struct domain *d = v->domain;
uint32_t rc = 0;
#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
@@ -288,24 +285,24 @@ gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
if ( mfn_is_out_of_sync(gw->l3mfn) )
{
- sh_resync(v, gw->l3mfn);
+ sh_resync(d, gw->l3mfn);
rc = GW_RMWR_REWALK;
}
else
#endif /* OOS */
- if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
+ if ( sh_remove_write_access(d, gw->l3mfn, 3, va) )
rc = GW_RMWR_FLUSHTLB;
#endif /* GUEST_PAGING_LEVELS >= 4 */
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
if ( mfn_is_out_of_sync(gw->l2mfn) )
{
- sh_resync(v, gw->l2mfn);
+ sh_resync(d, gw->l2mfn);
rc |= GW_RMWR_REWALK;
}
else
#endif /* OOS */
- if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
+ if ( sh_remove_write_access(d, gw->l2mfn, 2, va) )
rc |= GW_RMWR_FLUSHTLB;
#endif /* GUEST_PAGING_LEVELS >= 3 */
@@ -314,7 +311,7 @@ gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
&& !mfn_is_out_of_sync(gw->l1mfn)
#endif /* OOS */
- && sh_remove_write_access(v, gw->l1mfn, 1, va) )
+ && sh_remove_write_access(d, gw->l1mfn, 1, va) )
rc |= GW_RMWR_FLUSHTLB;
return rc;
@@ -323,8 +320,9 @@ gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
/* Lightweight audit: pass all the shadows associated with this guest walk
* through the audit mechanisms */
-static void sh_audit_gw(struct vcpu *v, walk_t *gw)
+static void sh_audit_gw(struct vcpu *v, walk_t *gw)
{
+ struct domain *d = v->domain;
mfn_t smfn;
if ( !(SHADOW_AUDIT_ENABLE) )
@@ -332,33 +330,33 @@ static void sh_audit_gw(struct vcpu *v, walk_t *gw)
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
if ( mfn_valid(gw->l4mfn)
- && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
+ && mfn_valid((smfn = get_shadow_status(d, gw->l4mfn,
SH_type_l4_shadow))) )
(void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
if ( mfn_valid(gw->l3mfn)
- && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
+ && mfn_valid((smfn = get_shadow_status(d, gw->l3mfn,
SH_type_l3_shadow))) )
(void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
#endif /* PAE or 64... */
if ( mfn_valid(gw->l2mfn) )
{
- if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
+ if ( mfn_valid((smfn = get_shadow_status(d, gw->l2mfn,
SH_type_l2_shadow))) )
(void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
#if GUEST_PAGING_LEVELS == 3
- if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
+ if ( mfn_valid((smfn = get_shadow_status(d, gw->l2mfn,
SH_type_l2h_shadow))) )
(void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
#endif
}
if ( mfn_valid(gw->l1mfn)
- && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
+ && mfn_valid((smfn = get_shadow_status(d, gw->l1mfn,
SH_type_l1_shadow))) )
(void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
&& (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
- && mfn_valid(
- (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
+ && mfn_valid(
+ (smfn = get_fl1_shadow_status(d, guest_l2e_get_gfn(gw->l2e)))) )
(void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
}
@@ -376,16 +374,16 @@ sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
walk_t gw;
ASSERT(shadow_mode_translate(v->domain));
-
+
// XXX -- this is expensive, but it's easy to cobble together...
// FIXME!
- if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0
+ if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0
&& mfn_valid(gw.l1mfn) )
{
if ( gl1mfn )
*gl1mfn = mfn_x(gw.l1mfn);
- pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
+ pl1e = map_domain_page(gw.l1mfn) +
(guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
}
@@ -398,7 +396,7 @@ sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
walk_t gw;
ASSERT(shadow_mode_translate(v->domain));
-
+
// XXX -- this is expensive, but it's easy to cobble together...
// FIXME!
@@ -428,20 +426,20 @@ sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
/* From one page of a multi-page shadow, find the next one */
static inline mfn_t sh_next_page(mfn_t smfn)
{
- mfn_t next;
- struct page_info *pg = mfn_to_page(smfn);
+ struct page_info *pg = mfn_to_page(smfn), *next;
+ struct page_list_head h = PAGE_LIST_HEAD_INIT(h);
ASSERT(pg->u.sh.type == SH_type_l1_32_shadow
|| pg->u.sh.type == SH_type_fl1_32_shadow
|| pg->u.sh.type == SH_type_l2_32_shadow);
ASSERT(pg->u.sh.type == SH_type_l2_32_shadow || pg->u.sh.head);
- ASSERT(pg->list.next != PAGE_LIST_NULL);
- next = _mfn(pdx_to_pfn(pg->list.next));
+ next = page_list_next(pg, &h);
- ASSERT(mfn_to_page(next)->u.sh.type == pg->u.sh.type);
- ASSERT(!mfn_to_page(next)->u.sh.head);
- return next;
+ ASSERT(next);
+ ASSERT(next->u.sh.type == pg->u.sh.type);
+ ASSERT(!next->u.sh.head);
+ return page_to_mfn(next);
}
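
The rewrite retires open-coded pdx arithmetic on pg->list.next in favour of the generic page_list_next() accessor, so multi-page shadows are chained through the same list API as every other page list. The invariants being asserted are simple; an illustrative intrusive-list version:

    #include <assert.h>
    #include <stddef.h>

    /* Illustrative intrusive list: each page carries its own link. */
    struct page {
        struct page *next;
        unsigned int type;
        int head;                 /* 1 only on the first page of a shadow */
    };

    /* From one page of a multi-page shadow, find the next one. */
    static struct page *next_shadow_page(struct page *pg)
    {
        struct page *next = pg->next;

        assert(next != NULL);             /* multi-page shadows are full */
        assert(next->type == pg->type);   /* same shadow type throughout */
        assert(!next->head);              /* only the first page is head */
        return next;
    }
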
static inline u32
@@ -506,12 +504,12 @@ shadow_l4_index(mfn_t *smfn, u32 guest_index)
*/
static always_inline void
-_sh_propagate(struct vcpu *v,
+_sh_propagate(struct vcpu *v,
guest_intpte_t guest_intpte,
- mfn_t target_mfn,
+ mfn_t target_mfn,
void *shadow_entry_ptr,
int level,
- fetch_type_t ft,
+ fetch_type_t ft,
p2m_type_t p2mt)
{
guest_l1e_t guest_entry = { guest_intpte };
@@ -537,11 +535,11 @@ _sh_propagate(struct vcpu *v,
if ( unlikely(!(gflags & _PAGE_PRESENT)) )
{
#if !(SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
- /* If a guest l1 entry is not present, shadow with the magic
+ /* If a guest l1 entry is not present, shadow with the magic
* guest-not-present entry. */
if ( level == 1 )
*sp = sh_l1e_gnp();
- else
+ else
#endif /* !OOS */
*sp = shadow_l1e_empty();
goto done;
@@ -562,7 +560,7 @@ _sh_propagate(struct vcpu *v,
// return early.
//
if ( !mfn_valid(target_mfn)
- && !(level == 1 && (!shadow_mode_refcounts(d)
+ && !(level == 1 && (!shadow_mode_refcounts(d)
|| p2mt == p2m_mmio_direct)) )
{
ASSERT((ft == ft_prefetch));
@@ -595,7 +593,7 @@ _sh_propagate(struct vcpu *v,
ASSERT(!(sflags & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)));
/* compute the PAT index for shadow page entry when VT-d is enabled
- * and device assigned.
+ * and device assigned.
* 1) direct MMIO: compute the PAT index with gMTRR=UC and gPAT.
* 2) if enables snoop control, compute the PAT index as WB.
* 3) if disables snoop control, compute the PAT index with
@@ -613,7 +611,7 @@ _sh_propagate(struct vcpu *v,
gflags,
gfn_to_paddr(target_gfn),
pfn_to_paddr(mfn_x(target_mfn)),
- MTRR_TYPE_UNCACHABLE);
+ MTRR_TYPE_UNCACHABLE);
else if ( iommu_snoop )
sflags |= pat_type_2_pte_flags(PAT_TYPE_WRBACK);
else
@@ -654,12 +652,12 @@ _sh_propagate(struct vcpu *v,
// Only allow the guest write access to a page a) on a demand fault,
// or b) if the page is already marked as dirty.
//
- // (We handle log-dirty entirely inside the shadow code, without using the
+ // (We handle log-dirty entirely inside the shadow code, without using the
// p2m_ram_logdirty p2m type: only HAP uses that.)
if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
{
if ( mfn_valid(target_mfn) ) {
- if ( ft & FETCH_TYPE_WRITE )
+ if ( ft & FETCH_TYPE_WRITE )
paging_mark_dirty(d, mfn_x(target_mfn));
else if ( !paging_mfn_is_dirty(d, target_mfn) )
sflags &= ~_PAGE_RW;
@@ -682,10 +680,10 @@ _sh_propagate(struct vcpu *v,
(p2mt == p2m_mmio_direct &&
rangeset_contains_singleton(mmio_ro_ranges, mfn_x(target_mfn))) )
sflags &= ~_PAGE_RW;
-
+
// protect guest page tables
//
- if ( unlikely((level == 1)
+ if ( unlikely((level == 1)
&& sh_mfn_is_a_page_table(target_mfn)
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
/* Unless the page is out of sync and the guest is
@@ -699,7 +697,7 @@ _sh_propagate(struct vcpu *v,
// PV guests in 64-bit mode use two different page tables for user vs
// supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
// It is always shadowed as present...
- if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
+ if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32bit_domain(d)
&& is_pv_domain(d) )
{
sflags |= _PAGE_USER;
@@ -720,7 +718,7 @@ _sh_propagate(struct vcpu *v,
#if GUEST_PAGING_LEVELS >= 4
static void
-l4e_propagate_from_guest(struct vcpu *v,
+l4e_propagate_from_guest(struct vcpu *v,
guest_l4e_t gl4e,
mfn_t sl3mfn,
shadow_l4e_t *sl4e,
@@ -732,7 +730,7 @@ l4e_propagate_from_guest(struct vcpu *v,
static void
l3e_propagate_from_guest(struct vcpu *v,
guest_l3e_t gl3e,
- mfn_t sl2mfn,
+ mfn_t sl2mfn,
shadow_l3e_t *sl3e,
fetch_type_t ft)
{
@@ -741,7 +739,7 @@ l3e_propagate_from_guest(struct vcpu *v,
#endif // GUEST_PAGING_LEVELS >= 4
static void
-l2e_propagate_from_guest(struct vcpu *v,
+l2e_propagate_from_guest(struct vcpu *v,
guest_l2e_t gl2e,
mfn_t sl1mfn,
shadow_l2e_t *sl2e,
@@ -751,11 +749,11 @@ l2e_propagate_from_guest(struct vcpu *v,
}
static void
-l1e_propagate_from_guest(struct vcpu *v,
+l1e_propagate_from_guest(struct vcpu *v,
guest_l1e_t gl1e,
- mfn_t gmfn,
+ mfn_t gmfn,
shadow_l1e_t *sl1e,
- fetch_type_t ft,
+ fetch_type_t ft,
p2m_type_t p2mt)
{
_sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
@@ -768,10 +766,10 @@ l1e_propagate_from_guest(struct vcpu *v,
* functions which ever write (non-zero) data onto a shadow page.
*/
-static inline void safe_write_entry(void *dst, void *src)
+static inline void safe_write_entry(void *dst, void *src)
/* Copy one PTE safely when processors might be running on the
* destination pagetable. This does *not* give safety against
- * concurrent writes (that's what the paging lock is for), just
+ * concurrent writes (that's what the paging lock is for), just
* stops the hardware picking up partially written entries. */
{
volatile unsigned long *d = dst;
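
safe_write_entry() depends on aligned, word-sized stores being single memory transactions: a CPU walking the destination pagetable sees either the old entry or the new one, never a torn mixture. The core idea in isolation (a sketch; the real function must also order the words of multi-word PAE entries):

    #include <assert.h>

    /* Copy one word-sized entry with a single aligned volatile store, so
     * concurrent readers never see a partially written value. This does
     * not replace locking against concurrent writers. */
    static inline void safe_write_word(void *dst, unsigned long val)
    {
        volatile unsigned long *d = dst;

        assert(((unsigned long)dst % sizeof(unsigned long)) == 0);
        *d = val;   /* one full-width store: old value or new, never torn */
    }
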
@@ -784,7 +782,7 @@ static inline void safe_write_entry(void *dst, void *src)
}
-static inline void
+static inline void
shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
/* This function does the actual writes to shadow pages.
* It must not be called directly, since it doesn't do the bookkeeping
@@ -797,13 +795,13 @@ shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
/* Because we mirror access rights at all levels in the shadow, an
* l2 (or higher) entry with the RW bit cleared will leave us with
- * no write access through the linear map.
- * We detect that by writing to the shadow with copy_to_user() and
+ * no write access through the linear map.
+ * We detect that by writing to the shadow with copy_to_user() and
* using map_domain_page() to get a writeable mapping if we need to. */
- if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
+ if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
{
perfc_incr(shadow_linear_map_failed);
- map = sh_map_domain_page(mfn);
+ map = map_domain_page(mfn);
dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
}
@@ -811,7 +809,7 @@ shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
for ( i = 0; i < entries; i++ )
safe_write_entry(dst++, src++);
- if ( map != NULL ) sh_unmap_domain_page(map);
+ if ( map != NULL ) unmap_domain_page(map);
}
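
The __copy_to_user(d, d, ...) self-copy above is a deliberate probe: it cheaply tests whether the shadow is writeable through the linear map, and only on failure pays for a temporary map_domain_page() mapping. The control flow, with illustrative stand-ins for the probe and the mapping calls:

    #include <stddef.h>

    /* Illustrative stand-ins for the probe and the fallback mapping. */
    extern int probe_writeable(void *p, size_t n);   /* 0 if write works */
    extern void *map_page(unsigned long mfn);
    extern void unmap_page(void *p);

    static void write_entries(void *dst, const unsigned long *src,
                              int entries, unsigned long mfn)
    {
        unsigned long *d = dst;
        void *map = NULL;

        /* Fast path: write through the existing (linear) mapping.
         * Slow path: take a temporary mapping of the target frame. */
        if ( probe_writeable(d, sizeof(*d)) != 0 )
        {
            map = map_page(mfn);
            d = (unsigned long *)((char *)map +
                                  ((unsigned long)dst & 0xfff));
        }

        for ( int i = 0; i < entries; i++ )
            d[i] = src[i];               /* safe_write_entry() here */

        if ( map != NULL )
            unmap_page(map);
    }
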
/* type is only used to distinguish grant map pages from ordinary RAM
@@ -874,7 +872,7 @@ shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d, p2m_type_t type)
static void inline
shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
-{
+{
if ( !shadow_mode_refcounts(d) )
return;
@@ -882,9 +880,9 @@ shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
}
#if GUEST_PAGING_LEVELS >= 4
-static int shadow_set_l4e(struct vcpu *v,
- shadow_l4e_t *sl4e,
- shadow_l4e_t new_sl4e,
+static int shadow_set_l4e(struct domain *d,
+ shadow_l4e_t *sl4e,
+ shadow_l4e_t new_sl4e,
mfn_t sl4mfn)
{
int flags = 0, ok;
@@ -894,21 +892,21 @@ static int shadow_set_l4e(struct vcpu *v,
old_sl4e = *sl4e;
if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
-
- paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
+
+ paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
| (((unsigned long)sl4e) & ~PAGE_MASK));
- if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
+ if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
{
- /* About to install a new reference */
+ /* About to install a new reference */
mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
- ok = sh_get_ref(v, sl3mfn, paddr);
+ ok = sh_get_ref(d, sl3mfn, paddr);
/* Are we pinning l3 shadows to handle weird linux behaviour? */
- if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
- ok |= sh_pin(v, sl3mfn);
+ if ( sh_type_is_pinnable(d, SH_type_l3_64_shadow) )
+ ok |= sh_pin(d, sl3mfn);
if ( !ok )
{
- domain_crash(v->domain);
+ domain_crash(d);
return SHADOW_SET_ERROR;
}
}
@@ -917,24 +915,24 @@ static int shadow_set_l4e(struct vcpu *v,
shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
flags |= SHADOW_SET_CHANGED;
- if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
+ if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
{
/* We lost a reference to an old mfn. */
mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
- || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
+ || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
shadow_l4e_get_flags(new_sl4e)) )
{
flags |= SHADOW_SET_FLUSH;
}
- sh_put_ref(v, osl3mfn, paddr);
+ sh_put_ref(d, osl3mfn, paddr);
}
return flags;
}
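
Every shadow_set_lNe() in this hunk follows the same discipline, now keyed by domain instead of vcpu: take a reference on the new target before publishing the entry, publish with a tearing-safe write, then drop the reference on the old target, requesting a TLB flush unless permissions strictly increased on the same target. A generic sketch of that sequence (entry encoding and helpers are illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative stand-ins for refcounting and the safe store. */
    extern bool get_ref(uint64_t target);
    extern void put_ref(uint64_t target);
    extern void safe_store(uint64_t *slot, uint64_t val);
    extern bool perms_strictly_increased(uint64_t oldf, uint64_t newf);

    #define E_PRESENT    0x1u
    #define E_TARGET(e)  ((e) >> 12)
    #define E_FLAGS(e)   ((e) & 0xfffu)

    /* Returns true if the caller must flush TLBs. */
    static bool set_entry(uint64_t *slot, uint64_t new_e)
    {
        uint64_t old_e = *slot;
        bool flush = false;

        if ( old_e == new_e )
            return false;                     /* nothing to do */

        if ( new_e & E_PRESENT )
        {
            if ( !get_ref(E_TARGET(new_e)) )  /* pin the new target */
                return false;                 /* caller crashes the domain */
        }

        safe_store(slot, new_e);              /* publish atomically */

        if ( old_e & E_PRESENT )
        {
            /* Stale translations may persist unless the change only
             * ever granted more access at the same target. */
            if ( E_TARGET(old_e) != E_TARGET(new_e) ||
                 !perms_strictly_increased(E_FLAGS(old_e), E_FLAGS(new_e)) )
                flush = true;
            put_ref(E_TARGET(old_e));         /* release the old target */
        }
        return flush;
    }
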
-static int shadow_set_l3e(struct vcpu *v,
- shadow_l3e_t *sl3e,
- shadow_l3e_t new_sl3e,
+static int shadow_set_l3e(struct domain *d,
+ shadow_l3e_t *sl3e,
+ shadow_l3e_t new_sl3e,
mfn_t sl3mfn)
{
int flags = 0;
@@ -945,15 +943,15 @@ static int shadow_set_l3e(struct vcpu *v,
if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
- paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+ paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
| (((unsigned long)sl3e) & ~PAGE_MASK));
-
+
if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
{
- /* About to install a new reference */
- if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
+ /* About to install a new reference */
+ if ( !sh_get_ref(d, shadow_l3e_get_mfn(new_sl3e), paddr) )
{
- domain_crash(v->domain);
+ domain_crash(d);
return SHADOW_SET_ERROR;
}
}
@@ -962,25 +960,25 @@ static int shadow_set_l3e(struct vcpu *v,
shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
flags |= SHADOW_SET_CHANGED;
- if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
+ if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
{
/* We lost a reference to an old mfn. */
mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
- !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
- shadow_l3e_get_flags(new_sl3e)) )
+ !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
+ shadow_l3e_get_flags(new_sl3e)) )
{
flags |= SHADOW_SET_FLUSH;
}
- sh_put_ref(v, osl2mfn, paddr);
+ sh_put_ref(d, osl2mfn, paddr);
}
return flags;
}
-#endif /* GUEST_PAGING_LEVELS >= 4 */
+#endif /* GUEST_PAGING_LEVELS >= 4 */
-static int shadow_set_l2e(struct vcpu *v,
- shadow_l2e_t *sl2e,
- shadow_l2e_t new_sl2e,
+static int shadow_set_l2e(struct domain *d,
+ shadow_l2e_t *sl2e,
+ shadow_l2e_t new_sl2e,
mfn_t sl2mfn)
{
int flags = 0;
@@ -990,7 +988,7 @@ static int shadow_set_l2e(struct vcpu *v,
#if GUEST_PAGING_LEVELS == 2
/* In 2-on-3 we work with pairs of l2es pointing at two-page
* shadows. Reference counting and up-pointers track from the first
- * page of the shadow to the first l2e, so make sure that we're
+ * page of the shadow to the first l2e, so make sure that we're
* working with those:
* Start with a pair of identical entries */
shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
@@ -1000,21 +998,21 @@ static int shadow_set_l2e(struct vcpu *v,
ASSERT(sl2e != NULL);
old_sl2e = *sl2e;
-
+
if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
-
+
paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
| (((unsigned long)sl2e) & ~PAGE_MASK));
- if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
+ if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
{
mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
ASSERT(mfn_to_page(sl1mfn)->u.sh.head);
/* About to install a new reference */
- if ( !sh_get_ref(v, sl1mfn, paddr) )
+ if ( !sh_get_ref(d, sl1mfn, paddr) )
{
- domain_crash(v->domain);
+ domain_crash(d);
return SHADOW_SET_ERROR;
}
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
@@ -1028,8 +1026,8 @@ static int shadow_set_l2e(struct vcpu *v,
the GFN instead of the GMFN, and it's definitely not
OOS. */
if ( (sp->u.sh.type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
- && mfn_is_out_of_sync(gl1mfn) )
- sh_resync(v, gl1mfn);
+ && mfn_is_out_of_sync(gl1mfn) )
+ sh_resync(d, gl1mfn);
}
#endif
#if GUEST_PAGING_LEVELS == 2
@@ -1047,17 +1045,17 @@ static int shadow_set_l2e(struct vcpu *v,
#endif
flags |= SHADOW_SET_CHANGED;
- if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
+ if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
{
/* We lost a reference to an old mfn. */
mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
- !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
- shadow_l2e_get_flags(new_sl2e)) )
+ !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
+ shadow_l2e_get_flags(new_sl2e)) )
{
flags |= SHADOW_SET_FLUSH;
}
- sh_put_ref(v, osl1mfn, paddr);
+ sh_put_ref(d, osl1mfn, paddr);
}
return flags;
}
@@ -1066,7 +1064,7 @@ static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
shadow_l1e_t *sl1e,
mfn_t sl1mfn,
struct domain *d)
-{
+{
mfn_t mfn = shadow_l1e_get_mfn(new_sl1e);
int flags = shadow_l1e_get_flags(new_sl1e);
unsigned long gfn;
@@ -1085,7 +1083,7 @@ static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
{
unsigned long i = gfn - dirty_vram->begin_pfn;
struct page_info *page = mfn_to_page(mfn);
-
+
if ( (page->u.inuse.type_info & PGT_count_mask) == 1 )
/* Initial guest reference, record it */
dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
@@ -1159,14 +1157,13 @@ static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
}
}
-static int shadow_set_l1e(struct vcpu *v,
- shadow_l1e_t *sl1e,
+static int shadow_set_l1e(struct domain *d,
+ shadow_l1e_t *sl1e,
shadow_l1e_t new_sl1e,
p2m_type_t new_type,
mfn_t sl1mfn)
{
int flags = 0;
- struct domain *d = v->domain;
shadow_l1e_t old_sl1e;
#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
@@ -1177,17 +1174,17 @@ static int shadow_set_l1e(struct vcpu *v,
if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
&& ((shadow_l1e_get_flags(new_sl1e) & (_PAGE_RW|_PAGE_PRESENT))
== (_PAGE_RW|_PAGE_PRESENT)) )
- oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
+ oos_fixup_add(d, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
#endif
-
+
old_sl1e = *sl1e;
if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
-
+
if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
- && !sh_l1e_is_magic(new_sl1e) )
+ && !sh_l1e_is_magic(new_sl1e) )
{
- /* About to install a new reference */
+ /* About to install a new reference */
if ( shadow_mode_refcounts(d) ) {
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
switch ( shadow_get_page_from_l1e(new_sl1e, d, new_type) )
@@ -1205,45 +1202,45 @@ static int shadow_set_l1e(struct vcpu *v,
break;
}
}
- }
+ }
/* Write the new entry */
shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
flags |= SHADOW_SET_CHANGED;
- if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
+ if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
&& !sh_l1e_is_magic(old_sl1e) )
{
/* We lost a reference to an old mfn. */
- /* N.B. Unlike higher-level sets, never need an extra flush
- * when writing an l1e. Because it points to the same guest frame
+ /* N.B. Unlike higher-level sets, never need an extra flush
+ * when writing an l1e. Because it points to the same guest frame
* as the guest l1e did, it's the guest's responsibility to
* trigger a flush later. */
- if ( shadow_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
{
shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
shadow_put_page_from_l1e(old_sl1e, d);
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF);
- }
+ }
}
return flags;
}
/**************************************************************************/
-/* Macros to walk pagetables. These take the shadow of a pagetable and
- * walk every "interesting" entry. That is, they don't touch Xen mappings,
- * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
+/* Macros to walk pagetables. These take the shadow of a pagetable and
+ * walk every "interesting" entry. That is, they don't touch Xen mappings,
+ * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
* second entry (since pairs of entries are managed together). For multi-page
* shadows they walk all pages.
- *
- * Arguments are an MFN, the variable to point to each entry, a variable
- * to indicate that we are done (we will shortcut to the end of the scan
+ *
+ * Arguments are an MFN, the variable to point to each entry, a variable
+ * to indicate that we are done (we will shortcut to the end of the scan
* when _done != 0), a variable to indicate that we should avoid Xen mappings,
- * and the code.
+ * and the code.
*
- * WARNING: These macros have side-effects. They change the values of both
- * the pointer and the MFN. */
+ * WARNING: These macros have side-effects. They change the values of both
+ * the pointer and the MFN. */
static inline void increment_ptr_to_guest_entry(void *ptr)
{
@@ -1258,7 +1255,7 @@ static inline void increment_ptr_to_guest_entry(void *ptr)
#define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
do { \
int _i; \
- shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \
+ shadow_l1e_t *_sp = map_domain_page((_sl1mfn)); \
ASSERT(mfn_to_page(_sl1mfn)->u.sh.type == SH_type_l1_shadow \
|| mfn_to_page(_sl1mfn)->u.sh.type == SH_type_fl1_shadow);\
for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
@@ -1269,7 +1266,7 @@ do { \
if ( _done ) break; \
increment_ptr_to_guest_entry(_gl1p); \
} \
- sh_unmap_domain_page(_sp); \
+ unmap_domain_page(_sp); \
} while (0)
/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
@@ -1288,7 +1285,7 @@ do { \
#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
_SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
#endif
-
+
#if GUEST_PAGING_LEVELS == 2
@@ -1300,7 +1297,7 @@ do { \
ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_32_shadow); \
for ( _j = 0; _j < 4 && !__done; _j++ ) \
{ \
- shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \
+ shadow_l2e_t *_sp = map_domain_page(_sl2mfn); \
for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
{ \
(_sl2e) = _sp + _i; \
@@ -1309,7 +1306,7 @@ do { \
if ( (__done = (_done)) ) break; \
increment_ptr_to_guest_entry(_gl2p); \
} \
- sh_unmap_domain_page(_sp); \
+ unmap_domain_page(_sp); \
if ( _j < 3 ) _sl2mfn = sh_next_page(_sl2mfn); \
} \
} while (0)
@@ -1320,7 +1317,7 @@ do { \
#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
do { \
int _i; \
- shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
+ shadow_l2e_t *_sp = map_domain_page((_sl2mfn)); \
ASSERT(shadow_mode_external(_dom)); \
ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_pae_shadow \
|| mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_pae_shadow); \
@@ -1332,24 +1329,24 @@ do { \
if ( _done ) break; \
increment_ptr_to_guest_entry(_gl2p); \
} \
- sh_unmap_domain_page(_sp); \
+ unmap_domain_page(_sp); \
} while (0)
-#else
+#else
/* 64-bit l2: touch all entries except for PAE compat guests. */
#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
do { \
int _i; \
int _xen = !shadow_mode_external(_dom); \
- shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
+ shadow_l2e_t *_sp = map_domain_page((_sl2mfn)); \
ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_64_shadow ||\
mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_64_shadow);\
for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
{ \
if ( (!(_xen)) \
- || !is_pv_32on64_domain(_dom) \
- || mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_64_shadow\
+ || !is_pv_32bit_domain(_dom) \
+ || mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_64_shadow \
|| (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
{ \
(_sl2e) = _sp + _i; \
@@ -1359,7 +1356,7 @@ do { \
increment_ptr_to_guest_entry(_gl2p); \
} \
} \
- sh_unmap_domain_page(_sp); \
+ unmap_domain_page(_sp); \
} while (0)
#endif /* different kinds of l2 */
@@ -1370,7 +1367,7 @@ do { \
#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
do { \
int _i; \
- shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \
+ shadow_l3e_t *_sp = map_domain_page((_sl3mfn)); \
ASSERT(mfn_to_page(_sl3mfn)->u.sh.type == SH_type_l3_64_shadow);\
for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
{ \
@@ -1380,13 +1377,13 @@ do { \
if ( _done ) break; \
increment_ptr_to_guest_entry(_gl3p); \
} \
- sh_unmap_domain_page(_sp); \
+ unmap_domain_page(_sp); \
} while (0)
/* 64-bit l4: avoid Xen mappings */
#define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
do { \
- shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \
+ shadow_l4e_t *_sp = map_domain_page((_sl4mfn)); \
int _xen = !shadow_mode_external(_dom); \
int _i; \
ASSERT(mfn_to_page(_sl4mfn)->u.sh.type == SH_type_l4_64_shadow);\
@@ -1401,7 +1398,7 @@ do { \
} \
increment_ptr_to_guest_entry(_gl4p); \
} \
- sh_unmap_domain_page(_sp); \
+ unmap_domain_page(_sp); \
} while (0)
#endif
@@ -1416,15 +1413,14 @@ do { \
// shadow-types.h to shadow-private.h
//
#if GUEST_PAGING_LEVELS == 4
-void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
+void sh_install_xen_entries_in_l4(struct domain *d, mfn_t gl4mfn, mfn_t sl4mfn)
{
- struct domain *d = v->domain;
shadow_l4e_t *sl4e;
unsigned int slots;
- sl4e = sh_map_domain_page(sl4mfn);
+ sl4e = map_domain_page(sl4mfn);
BUILD_BUG_ON(sizeof (l4_pgentry_t) != sizeof (shadow_l4e_t));
-
+
/* Copy the common Xen mappings from the idle domain */
slots = (shadow_mode_external(d)
? ROOT_PAGETABLE_XEN_SLOTS
@@ -1438,6 +1434,13 @@ void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
shadow_l4e_from_mfn(page_to_mfn(d->arch.perdomain_l3_pg),
__PAGE_HYPERVISOR);
+ if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) &&
+ !VM_ASSIST(d, m2p_strict) )
+ {
+ /* open coded zap_ro_mpt(mfn_x(sl4mfn)): */
+ sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] = shadow_l4e_empty();
+ }
+
/* Shadow linear mapping for 4-level shadows. N.B. for 3-level
* shadows on 64-bit xen, this linear mapping is later replaced by the
* monitor pagetable structure, which is built in make_monitor_table
@@ -1446,7 +1449,7 @@ void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
/* Self linear mapping. */
- if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
+ if ( shadow_mode_translate(d) && !shadow_mode_external(d) )
{
// linear tables may not be used with translated PV guests
sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
@@ -1458,7 +1461,7 @@ void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
}
- sh_unmap_domain_page(sl4e);
+ unmap_domain_page(sl4e);
}
#endif
@@ -1467,15 +1470,14 @@ void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
// place, which means that we need to populate the l2h entry in the l3
// table.
-static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
+static void sh_install_xen_entries_in_l2h(struct domain *d, mfn_t sl2hmfn)
{
- struct domain *d = v->domain;
shadow_l2e_t *sl2e;
- if ( !is_pv_32on64_vcpu(v) )
+ if ( !is_pv_32bit_domain(d) )
return;
- sl2e = sh_map_domain_page(sl2hmfn);
+ sl2e = map_domain_page(sl2hmfn);
BUILD_BUG_ON(sizeof (l2_pgentry_t) != sizeof (shadow_l2e_t));
/* Copy the common Xen mappings from the idle domain */
@@ -1484,7 +1486,7 @@ static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
&compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
- sh_unmap_domain_page(sl2e);
+ unmap_domain_page(sl2e);
}
#endif
@@ -1495,21 +1497,22 @@ static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
static mfn_t
sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
{
- mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
+ struct domain *d = v->domain;
+ mfn_t smfn = shadow_alloc(d, shadow_type, mfn_x(gmfn));
SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
mfn_x(gmfn), shadow_type, mfn_x(smfn));
- if ( sh_type_has_up_pointer(v, shadow_type) )
+ if ( sh_type_has_up_pointer(d, shadow_type) )
/* Lower-level shadow, not yet linked from a higher level */
mfn_to_page(smfn)->up = 0;
#if GUEST_PAGING_LEVELS == 4
-#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
if ( shadow_type == SH_type_l4_64_shadow &&
- unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
+ unlikely(d->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
{
/* We're shadowing a new l4, but we've been assuming the guest uses
- * only one l4 per vcpu and context switches using an l4 entry.
+ * only one l4 per vcpu and context switches using an l4 entry.
* Count the number of active l4 shadows. If there are enough
* of them, decide that this isn't an old linux guest, and stop
* pinning l3es. This is not very quick but it doesn't happen
@@ -1517,22 +1520,22 @@ sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
struct page_info *sp, *t;
struct vcpu *v2;
int l4count = 0, vcpus = 0;
- page_list_for_each(sp, &v->domain->arch.paging.shadow.pinned_shadows)
+ page_list_for_each(sp, &d->arch.paging.shadow.pinned_shadows)
{
if ( sp->u.sh.type == SH_type_l4_64_shadow )
l4count++;
}
- for_each_vcpu ( v->domain, v2 )
+ for_each_vcpu ( d, v2 )
vcpus++;
- if ( l4count > 2 * vcpus )
+ if ( l4count > 2 * vcpus )
{
/* Unpin all the pinned l3 tables, and don't pin any more. */
- page_list_for_each_safe(sp, t, &v->domain->arch.paging.shadow.pinned_shadows)
+ page_list_for_each_safe(sp, t, &d->arch.paging.shadow.pinned_shadows)
{
if ( sp->u.sh.type == SH_type_l3_64_shadow )
- sh_unpin(v, page_to_mfn(sp));
+ sh_unpin(d, page_to_mfn(sp));
}
- v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
+ d->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
sh_reset_l3_up_pointers(v);
}
}
@@ -1540,39 +1543,40 @@ sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
#endif
// Create the Xen mappings...
- if ( !shadow_mode_external(v->domain) )
+ if ( !shadow_mode_external(d) )
{
- switch (shadow_type)
+ switch (shadow_type)
{
#if GUEST_PAGING_LEVELS == 4
case SH_type_l4_shadow:
- sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
+ sh_install_xen_entries_in_l4(v->domain, gmfn, smfn);
+ break;
#endif
#if GUEST_PAGING_LEVELS >= 3
case SH_type_l2h_shadow:
- sh_install_xen_entries_in_l2h(v, smfn); break;
+ sh_install_xen_entries_in_l2h(v->domain, smfn);
+ break;
#endif
default: /* Do nothing */ break;
}
}
- shadow_promote(v, gmfn, shadow_type);
- set_shadow_status(v, gmfn, shadow_type, smfn);
+ shadow_promote(d, gmfn, shadow_type);
+ set_shadow_status(d, gmfn, shadow_type, smfn);
return smfn;
}
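/* Worked numbers for the SHOPT_LINUX_L3_TOPLEVEL check above
 * (illustrative only): with vcpus == 4, the guest may keep up to
 * l4count == 8 active l4 shadows.  On the ninth, l4count > 2 * vcpus
 * holds, the guest is judged not to be an old-Linux-style guest, all
 * pinned l3 shadows are unpinned, and the flag is cleared so no
 * further l3es are pinned. */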
/* Make a splintered superpage shadow */
static mfn_t
-make_fl1_shadow(struct vcpu *v, gfn_t gfn)
+make_fl1_shadow(struct domain *d, gfn_t gfn)
{
- mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
- (unsigned long) gfn_x(gfn));
+ mfn_t smfn = shadow_alloc(d, SH_type_fl1_shadow, gfn_x(gfn));
SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
gfn_x(gfn), mfn_x(smfn));
- set_fl1_shadow_status(v, gfn, smfn);
+ set_fl1_shadow_status(d, gfn, smfn);
return smfn;
}
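/* Typical call pattern for the pair above (a sketch mirroring the
 * demand-fault path later in this file): look the fl1 shadow up
 * first and create it only on a miss.
 *
 *     sl1mfn = get_fl1_shadow_status(d, l2gfn);
 *     if ( !mfn_valid(sl1mfn) )
 *         sl1mfn = make_fl1_shadow(d, l2gfn);
 */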
@@ -1584,14 +1588,14 @@ sh_make_monitor_table(struct vcpu *v)
struct domain *d = v->domain;
ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
-
+
/* Guarantee we can get the memory we need */
shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
{
mfn_t m4mfn;
m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
- sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
+ sh_install_xen_entries_in_l4(d, m4mfn, m4mfn);
/* Remember the level of this table */
mfn_to_page(m4mfn)->shadow_flags = 4;
#if SHADOW_PAGING_LEVELS < 4
@@ -1599,10 +1603,10 @@ sh_make_monitor_table(struct vcpu *v)
mfn_t m3mfn, m2mfn;
l4_pgentry_t *l4e;
l3_pgentry_t *l3e;
- /* Install an l3 table and an l2 table that will hold the shadow
- * linear map entries. This overrides the linear map entry that
+ /* Install an l3 table and an l2 table that will hold the shadow
+ * linear map entries. This overrides the linear map entry that
* was installed by sh_install_xen_entries_in_l4. */
- l4e = sh_map_domain_page(m4mfn);
+ l4e = map_domain_page(m4mfn);
m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
mfn_to_page(m3mfn)->shadow_flags = 3;
@@ -1611,27 +1615,27 @@ sh_make_monitor_table(struct vcpu *v)
m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
mfn_to_page(m2mfn)->shadow_flags = 2;
- l3e = sh_map_domain_page(m3mfn);
+ l3e = map_domain_page(m3mfn);
l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
- sh_unmap_domain_page(l3e);
+ unmap_domain_page(l3e);
- if ( is_pv_32on64_vcpu(v) )
+ if ( is_pv_32bit_domain(d) )
{
- /* For 32-on-64 PV guests, we need to map the 32-bit Xen
+ /* For 32-bit PV guests, we need to map the 32-bit Xen
* area into its usual VAs in the monitor tables */
m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
mfn_to_page(m3mfn)->shadow_flags = 3;
l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
-
+
m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
mfn_to_page(m2mfn)->shadow_flags = 2;
- l3e = sh_map_domain_page(m3mfn);
+ l3e = map_domain_page(m3mfn);
l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
- sh_install_xen_entries_in_l2h(v, m2mfn);
- sh_unmap_domain_page(l3e);
+ sh_install_xen_entries_in_l2h(d, m2mfn);
+ unmap_domain_page(l3e);
}
- sh_unmap_domain_page(l4e);
+ unmap_domain_page(l4e);
}
#endif /* SHADOW_PAGING_LEVELS < 4 */
return m4mfn;
@@ -1647,13 +1651,13 @@ sh_make_monitor_table(struct vcpu *v)
* If the necessary tables are not present in the guest, they return NULL. */
/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
- * more levels than the guest, the upper levels are always fixed and do not
- * reflect any information from the guest, so we do not use these functions
+ * more levels than the guest, the upper levels are always fixed and do not
+ * reflect any information from the guest, so we do not use these functions
* to access them. */
#if GUEST_PAGING_LEVELS >= 4
-static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
- walk_t *gw,
+static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
+ walk_t *gw,
mfn_t *sl4mfn)
{
/* There is always a shadow of the top level table. Get it. */
@@ -1662,37 +1666,38 @@ static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
}
-static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
- walk_t *gw,
+static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
+ walk_t *gw,
mfn_t *sl3mfn,
fetch_type_t ft,
int *resync)
{
+ struct domain *d = v->domain;
mfn_t sl4mfn;
shadow_l4e_t *sl4e;
if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
/* Get the l4e */
sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
ASSERT(sl4e != NULL);
- if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
+ if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
{
*sl3mfn = shadow_l4e_get_mfn(*sl4e);
ASSERT(mfn_valid(*sl3mfn));
- }
- else
+ }
+ else
{
int r;
shadow_l4e_t new_sl4e;
/* No l3 shadow installed: find and install it. */
- *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
- if ( !mfn_valid(*sl3mfn) )
+ *sl3mfn = get_shadow_status(d, gw->l3mfn, SH_type_l3_shadow);
+ if ( !mfn_valid(*sl3mfn) )
{
/* No l3 shadow of this page exists at all: make one. */
*sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
}
/* Install the new sl3 table in the sl4e */
l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
- r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
+ r = shadow_set_l4e(d, sl4e, new_sl4e, sl4mfn);
ASSERT((r & SHADOW_SET_FLUSH) == 0);
if ( r & SHADOW_SET_ERROR )
return NULL;
@@ -1708,49 +1713,50 @@ static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
#endif /* GUEST_PAGING_LEVELS >= 4 */
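/* Road map for the get-and-create helpers below (a summary of the
 * code, not new behaviour): each level first asks the level above
 * for its entry, then either follows an existing _PAGE_PRESENT entry
 * to the lower shadow, or looks it up with get_shadow_status(),
 * builds it on a miss via sh_make_shadow(), and installs it with the
 * matching shadow_set_lNe(). */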
-static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
- walk_t *gw,
+static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
+ walk_t *gw,
mfn_t *sl2mfn,
fetch_type_t ft,
int *resync)
{
#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+ struct domain *d = v->domain;
mfn_t sl3mfn = _mfn(INVALID_MFN);
shadow_l3e_t *sl3e;
if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
/* Get the l3e */
sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync);
- if ( sl3e == NULL ) return NULL;
- if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
+ if ( sl3e == NULL ) return NULL;
+ if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
{
*sl2mfn = shadow_l3e_get_mfn(*sl3e);
ASSERT(mfn_valid(*sl2mfn));
- }
- else
+ }
+ else
{
int r;
shadow_l3e_t new_sl3e;
unsigned int t = SH_type_l2_shadow;
/* Tag compat L2 containing hypervisor (m2p) mappings */
- if ( is_pv_32on64_domain(v->domain) &&
+ if ( is_pv_32bit_vcpu(v) &&
guest_l4_table_offset(gw->va) == 0 &&
guest_l3_table_offset(gw->va) == 3 )
t = SH_type_l2h_shadow;
/* No l2 shadow installed: find and install it. */
- *sl2mfn = get_shadow_status(v, gw->l2mfn, t);
- if ( !mfn_valid(*sl2mfn) )
+ *sl2mfn = get_shadow_status(d, gw->l2mfn, t);
+ if ( !mfn_valid(*sl2mfn) )
{
/* No l2 shadow of this page exists at all: make one. */
*sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
}
/* Install the new sl2 table in the sl3e */
l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
- r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
+ r = shadow_set_l3e(d, sl3e, new_sl3e, sl3mfn);
ASSERT((r & SHADOW_SET_FLUSH) == 0);
if ( r & SHADOW_SET_ERROR )
- return NULL;
+ return NULL;
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
*resync |= 1;
@@ -1762,9 +1768,9 @@ static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
#elif GUEST_PAGING_LEVELS == 3 /* PAE... */
/* We never demand-shadow PAE l3es: they are only created in
* sh_update_cr3(). Check if the relevant sl3e is present. */
- shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
+ shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
+ shadow_l3_linear_offset(gw->va);
- if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
+ if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
return NULL;
*sl2mfn = shadow_l3e_get_mfn(*sl3e);
ASSERT(mfn_valid(*sl2mfn));
@@ -1778,15 +1784,16 @@ static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
(void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
/* Reading the top level table is always valid. */
return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
-#endif
+#endif
}
-static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
- walk_t *gw,
+static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
+ walk_t *gw,
mfn_t *sl1mfn,
fetch_type_t ft)
{
+ struct domain *d = v->domain;
mfn_t sl2mfn;
int resync = 0;
shadow_l2e_t *sl2e;
@@ -1797,38 +1804,38 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
/* Install the sl1 in the l2e if it wasn't there or if we need to
* re-do it to fix a PSE dirty bit. */
- if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
+ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
&& likely(ft != ft_demand_write
- || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
+ || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
|| !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
{
*sl1mfn = shadow_l2e_get_mfn(*sl2e);
ASSERT(mfn_valid(*sl1mfn));
- }
- else
+ }
+ else
{
shadow_l2e_t new_sl2e;
int r, flags = guest_l2e_get_flags(gw->l2e);
/* No l1 shadow installed: find and install it. */
if ( !(flags & _PAGE_PRESENT) )
return NULL; /* No guest page. */
- if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
+ if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
{
/* Splintering a superpage */
gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
- *sl1mfn = get_fl1_shadow_status(v, l2gfn);
- if ( !mfn_valid(*sl1mfn) )
+ *sl1mfn = get_fl1_shadow_status(d, l2gfn);
+ if ( !mfn_valid(*sl1mfn) )
{
/* No fl1 shadow of this superpage exists at all: make one. */
- *sl1mfn = make_fl1_shadow(v, l2gfn);
+ *sl1mfn = make_fl1_shadow(d, l2gfn);
}
- }
- else
+ }
+ else
{
/* Shadowing an actual guest l1 table */
if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
- *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
- if ( !mfn_valid(*sl1mfn) )
+ *sl1mfn = get_shadow_status(d, gw->l1mfn, SH_type_l1_shadow);
+ if ( !mfn_valid(*sl1mfn) )
{
/* No l1 shadow of this page exists at all: make one. */
*sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
@@ -1836,8 +1843,8 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
}
/* Install the new sl1 table in the sl2e */
l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
- r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
- ASSERT((r & SHADOW_SET_FLUSH) == 0);
+ r = shadow_set_l2e(d, sl2e, new_sl2e, sl2mfn);
+ ASSERT((r & SHADOW_SET_FLUSH) == 0);
if ( r & SHADOW_SET_ERROR )
return NULL;
@@ -1863,7 +1870,7 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
/**************************************************************************/
-/* Destructors for shadow tables:
+/* Destructors for shadow tables:
* Unregister the shadow, decrement refcounts of any entries present in it,
* and release the memory.
*
@@ -1873,7 +1880,7 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
*/
#if GUEST_PAGING_LEVELS >= 4
-void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
+void sh_destroy_l4_shadow(struct domain *d, mfn_t smfn)
{
shadow_l4e_t *sl4e;
struct page_info *sp = mfn_to_page(smfn);
@@ -1887,24 +1894,24 @@ void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
/* Record that the guest page isn't shadowed any more (in this type) */
gmfn = backpointer(sp);
- delete_shadow_status(v, gmfn, t, smfn);
- shadow_demote(v, gmfn, t);
+ delete_shadow_status(d, gmfn, t, smfn);
+ shadow_demote(d, gmfn, t);
/* Decrement refcounts of all the old entries */
- sl4mfn = smfn;
- SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
- if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
+ sl4mfn = smfn;
+ SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, d, {
+ if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
{
- sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
- (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
+ sh_put_ref(d, shadow_l4e_get_mfn(*sl4e),
+ (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
| ((unsigned long)sl4e & ~PAGE_MASK));
}
});
-
+
/* Put the memory back in the pool */
- shadow_free(v->domain, smfn);
+ shadow_free(d, smfn);
}
-void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
+void sh_destroy_l3_shadow(struct domain *d, mfn_t smfn)
{
shadow_l3e_t *sl3e;
struct page_info *sp = mfn_to_page(smfn);
@@ -1918,25 +1925,25 @@ void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
/* Record that the guest page isn't shadowed any more (in this type) */
gmfn = backpointer(sp);
- delete_shadow_status(v, gmfn, t, smfn);
- shadow_demote(v, gmfn, t);
+ delete_shadow_status(d, gmfn, t, smfn);
+ shadow_demote(d, gmfn, t);
/* Decrement refcounts of all the old entries */
- sl3mfn = smfn;
+ sl3mfn = smfn;
SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
- if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
- sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
- (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+ if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
+ sh_put_ref(d, shadow_l3e_get_mfn(*sl3e),
+ (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
| ((unsigned long)sl3e & ~PAGE_MASK));
});
/* Put the memory back in the pool */
- shadow_free(v->domain, smfn);
+ shadow_free(d, smfn);
}
#endif /* GUEST_PAGING_LEVELS >= 4 */
-void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
+void sh_destroy_l2_shadow(struct domain *d, mfn_t smfn)
{
shadow_l2e_t *sl2e;
struct page_info *sp = mfn_to_page(smfn);
@@ -1955,25 +1962,24 @@ void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
/* Record that the guest page isn't shadowed any more (in this type) */
gmfn = backpointer(sp);
- delete_shadow_status(v, gmfn, t, smfn);
- shadow_demote(v, gmfn, t);
+ delete_shadow_status(d, gmfn, t, smfn);
+ shadow_demote(d, gmfn, t);
/* Decrement refcounts of all the old entries */
sl2mfn = smfn;
- SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
- if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
- sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
- (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, d, {
+ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
+ sh_put_ref(d, shadow_l2e_get_mfn(*sl2e),
+ (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
| ((unsigned long)sl2e & ~PAGE_MASK));
});
/* Put the memory back in the pool */
- shadow_free(v->domain, smfn);
+ shadow_free(d, smfn);
}
-void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
+void sh_destroy_l1_shadow(struct domain *d, mfn_t smfn)
{
- struct domain *d = v->domain;
shadow_l1e_t *sl1e;
struct page_info *sp = mfn_to_page(smfn);
u32 t = sp->u.sh.type;
@@ -1987,19 +1993,19 @@ void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
if ( t == SH_type_fl1_shadow )
{
gfn_t gfn = _gfn(sp->v.sh.back);
- delete_fl1_shadow_status(v, gfn, smfn);
+ delete_fl1_shadow_status(d, gfn, smfn);
}
- else
+ else
{
mfn_t gmfn = backpointer(sp);
- delete_shadow_status(v, gmfn, t, smfn);
- shadow_demote(v, gmfn, t);
+ delete_shadow_status(d, gmfn, t, smfn);
+ shadow_demote(d, gmfn, t);
}
-
+
if ( shadow_mode_refcounts(d) )
{
/* Decrement refcounts of all the old entries */
- mfn_t sl1mfn = smfn;
+ mfn_t sl1mfn = smfn;
SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
&& !sh_l1e_is_magic(*sl1e) ) {
@@ -2008,9 +2014,9 @@ void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
}
});
}
-
+
/* Put the memory back in the pool */
- shadow_free(v->domain, smfn);
+ shadow_free(d, smfn);
}
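/* The paddr_t handed to sh_put_ref() in these destructors encodes
 * where the reference lives: the shadow table's MFN in the high bits
 * and the entry's byte offset within that page in the low bits.  A
 * hypothetical decoder, for illustration only: */
static inline void decode_ref_location(paddr_t where, mfn_t *table_mfn,
                                       unsigned int *byte_offset)
{
    *table_mfn = _mfn(where >> PAGE_SHIFT);            /* which shadow page */
    *byte_offset = (unsigned int)(where & ~PAGE_MASK); /* slot within it */
}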
#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
@@ -2022,33 +2028,33 @@ void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
#if SHADOW_PAGING_LEVELS != 4
{
mfn_t m3mfn;
- l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
+ l4_pgentry_t *l4e = map_domain_page(mmfn);
l3_pgentry_t *l3e;
int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
-
- /* Need to destroy the l3 and l2 monitor pages used
+
+ /* Need to destroy the l3 and l2 monitor pages used
* for the linear map */
ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
- l3e = sh_map_domain_page(m3mfn);
+ l3e = map_domain_page(m3mfn);
ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
- sh_unmap_domain_page(l3e);
+ unmap_domain_page(l3e);
shadow_free(d, m3mfn);
- if ( is_pv_32on64_vcpu(v) )
+ if ( is_pv_32bit_domain(d) )
{
/* Need to destroy the l3 and l2 monitor pages that map the
* Xen VAs at 3GB-4GB */
ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
m3mfn = _mfn(l4e_get_pfn(l4e[0]));
- l3e = sh_map_domain_page(m3mfn);
+ l3e = map_domain_page(m3mfn);
ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
- sh_unmap_domain_page(l3e);
+ unmap_domain_page(l3e);
shadow_free(d, m3mfn);
}
- sh_unmap_domain_page(l4e);
+ unmap_domain_page(l4e);
}
#endif
@@ -2060,45 +2066,45 @@ void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
/**************************************************************************/
/* Functions to destroy non-Xen mappings in a pagetable hierarchy.
* These are called from common code when we are running out of shadow
- * memory, and unpinning all the top-level shadows hasn't worked.
+ * memory, and unpinning all the top-level shadows hasn't worked.
*
* With user_only == 1, we leave guest kernel-mode mappings in place too,
* unhooking only the user-mode mappings
*
- * This implementation is pretty crude and slow, but we hope that it won't
+ * This implementation is pretty crude and slow, but we hope that it won't
* be called very often. */
#if GUEST_PAGING_LEVELS == 2
-void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn, int user_only)
-{
+void sh_unhook_32b_mappings(struct domain *d, mfn_t sl2mfn, int user_only)
+{
shadow_l2e_t *sl2e;
- SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, d, {
if ( !user_only || (sl2e->l2 & _PAGE_USER) )
- (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ (void) shadow_set_l2e(d, sl2e, shadow_l2e_empty(), sl2mfn);
});
}
#elif GUEST_PAGING_LEVELS == 3
-void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn, int user_only)
+void sh_unhook_pae_mappings(struct domain *d, mfn_t sl2mfn, int user_only)
/* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
{
shadow_l2e_t *sl2e;
- SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, d, {
if ( !user_only || (sl2e->l2 & _PAGE_USER) )
- (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ (void) shadow_set_l2e(d, sl2e, shadow_l2e_empty(), sl2mfn);
});
}
#elif GUEST_PAGING_LEVELS == 4
-void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn, int user_only)
+void sh_unhook_64b_mappings(struct domain *d, mfn_t sl4mfn, int user_only)
{
shadow_l4e_t *sl4e;
- SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
+ SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, d, {
if ( !user_only || (sl4e->l4 & _PAGE_USER) )
- (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+ (void) shadow_set_l4e(d, sl4e, shadow_l4e_empty(), sl4mfn);
});
}
@@ -2109,7 +2115,7 @@ void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn, int user_only)
* These functions require a pointer to the shadow entry that will be updated.
*/
-/* These functions take a new guest entry, translate it to shadow and write
+/* These functions take a new guest entry, translate it to shadow and write
* the shadow entry.
*
* They return the same bitmaps as the shadow_set_lXe() functions.
@@ -2133,7 +2139,7 @@ static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
mfn_t gl3mfn = get_gfn_query_unlocked(d, gfn_x(gl3gfn), &p2mt);
if ( p2m_is_ram(p2mt) )
- sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
+ sl3mfn = get_shadow_status(d, gl3mfn, SH_type_l3_shadow);
else if ( p2mt != p2m_populate_on_demand )
result |= SHADOW_SET_ERROR;
@@ -2169,13 +2175,14 @@ static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
}
}
- result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
+ result |= shadow_set_l4e(d, sl4p, new_sl4e, sl4mfn);
return result;
}
static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
{
+ struct domain *d = v->domain;
shadow_l3e_t new_sl3e;
guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
shadow_l3e_t *sl3p = se;
@@ -2188,9 +2195,9 @@ static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
{
gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
- mfn_t gl2mfn = get_gfn_query_unlocked(v->domain, gfn_x(gl2gfn), &p2mt);
+ mfn_t gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt);
if ( p2m_is_ram(p2mt) )
- sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
+ sl2mfn = get_shadow_status(d, gl2mfn, SH_type_l2_shadow);
else if ( p2mt != p2m_populate_on_demand )
result |= SHADOW_SET_ERROR;
@@ -2200,7 +2207,7 @@ static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
#endif
}
l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
- result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
+ result |= shadow_set_l3e(d, sl3p, new_sl3e, sl3mfn);
return result;
}
@@ -2208,6 +2215,7 @@ static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
{
+ struct domain *d = v->domain;
shadow_l2e_t new_sl2e;
guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
shadow_l2e_t *sl2p = se;
@@ -2225,35 +2233,35 @@ static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
{
// superpage -- need to look up the shadow L1 which holds the
// splinters...
- sl1mfn = get_fl1_shadow_status(v, gl1gfn);
+ sl1mfn = get_fl1_shadow_status(d, gl1gfn);
#if 0
// XXX - it's possible that we want to do some kind of prefetch
// for superpage fl1's here, but this is *not* on the demand path,
// so we'll hold off trying that for now...
//
if ( !mfn_valid(sl1mfn) )
- sl1mfn = make_fl1_shadow(v, gl1gfn);
+ sl1mfn = make_fl1_shadow(d, gl1gfn);
#endif
}
else
{
- mfn_t gl1mfn = get_gfn_query_unlocked(v->domain, gfn_x(gl1gfn),
- &p2mt);
+ mfn_t gl1mfn = get_gfn_query_unlocked(d, gfn_x(gl1gfn), &p2mt);
if ( p2m_is_ram(p2mt) )
- sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+ sl1mfn = get_shadow_status(d, gl1mfn, SH_type_l1_shadow);
else if ( p2mt != p2m_populate_on_demand )
result |= SHADOW_SET_ERROR;
}
}
l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
- result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
+ result |= shadow_set_l2e(d, sl2p, new_sl2e, sl2mfn);
return result;
}
static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
{
+ struct domain *d = v->domain;
shadow_l1e_t new_sl1e;
guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
shadow_l1e_t *sl1p = se;
@@ -2268,25 +2276,25 @@ static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
perfc_incr(shadow_validate_gl1e_calls);
gfn = guest_l1e_get_gfn(new_gl1e);
- gmfn = get_gfn_query_unlocked(v->domain, gfn_x(gfn), &p2mt);
+ gmfn = get_gfn_query_unlocked(d, gfn_x(gfn), &p2mt);
l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
- result |= shadow_set_l1e(v, sl1p, new_sl1e, p2mt, sl1mfn);
+ result |= shadow_set_l1e(d, sl1p, new_sl1e, p2mt, sl1mfn);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
gl1mfn = backpointer(mfn_to_page(sl1mfn));
- if ( mfn_valid(gl1mfn)
+ if ( mfn_valid(gl1mfn)
&& mfn_is_out_of_sync(gl1mfn) )
{
/* Update the OOS snapshot. */
- mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
+ mfn_t snpmfn = oos_snapshot_lookup(d, gl1mfn);
guest_l1e_t *snp;
ASSERT(mfn_valid(snpmfn));
- snp = sh_map_domain_page(snpmfn);
+ snp = map_domain_page(snpmfn);
snp[guest_index(new_ge)] = new_gl1e;
- sh_unmap_domain_page(snp);
+ unmap_domain_page(snp);
}
#endif /* OOS */
@@ -2295,13 +2303,14 @@ static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/**************************************************************************/
-/* Special validation function for re-syncing out-of-sync shadows.
+/* Special validation function for re-syncing out-of-sync shadows.
* Walks the *shadow* page, and for every entry that it finds,
* revalidates the guest entry that corresponds to it.
* N.B. This function is called with the vcpu that unsynced the page,
* *not* the one that is causing it to be resynced. */
void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
{
+ struct domain *d = v->domain;
mfn_t sl1mfn;
shadow_l1e_t *sl1p;
guest_l1e_t *gl1p, *gp, *snp;
@@ -2309,11 +2318,11 @@ void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
ASSERT(mfn_valid(snpmfn));
- sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+ sl1mfn = get_shadow_status(d, gl1mfn, SH_type_l1_shadow);
ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
- snp = sh_map_domain_page(snpmfn);
- gp = sh_map_domain_page(gl1mfn);
+ snp = map_domain_page(snpmfn);
+ gp = map_domain_page(gl1mfn);
gl1p = gp;
SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
@@ -2328,35 +2337,36 @@ void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
shadow_l1e_t nsl1e;
gfn = guest_l1e_get_gfn(gl1e);
- gmfn = get_gfn_query_unlocked(v->domain, gfn_x(gfn), &p2mt);
+ gmfn = get_gfn_query_unlocked(d, gfn_x(gfn), &p2mt);
l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
- rc |= shadow_set_l1e(v, sl1p, nsl1e, p2mt, sl1mfn);
+ rc |= shadow_set_l1e(d, sl1p, nsl1e, p2mt, sl1mfn);
*snpl1p = gl1e;
}
});
- sh_unmap_domain_page(gp);
- sh_unmap_domain_page(snp);
+ unmap_domain_page(gp);
+ unmap_domain_page(snp);
/* Setting shadow L1 entries should never need us to flush the TLB */
ASSERT(!(rc & SHADOW_SET_FLUSH));
}
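/* What the loop above buys (summary; the per-entry compare is elided
 * by the hunk): each slot's live guest entry is checked against the
 * snapshot taken when the page went out of sync, and only slots that
 * actually changed are re-propagated into the shadow and copied back
 * into the snapshot -- an unchanged slot costs one compare and no
 * shadow_set_l1e(). */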
-/* Figure out whether it's definitely safe not to sync this l1 table.
- * That is: if we can tell that it's only used once, and that the
- * toplevel shadow responsible is not one of ours.
- * N.B. This function is called with the vcpu that required the resync,
+/* Figure out whether it's definitely safe not to sync this l1 table.
+ * That is: if we can tell that it's only used once, and that the
+ * toplevel shadow responsible is not one of ours.
+ * N.B. This function is called with the vcpu that required the resync,
* *not* the one that originally unsynced the page, but it is
* called in the *mode* of the vcpu that unsynced it. Clear? Good. */
int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
{
+ struct domain *d = v->domain;
struct page_info *sp;
mfn_t smfn;
- if ( !sh_type_has_up_pointer(v, SH_type_l1_shadow) )
+ if ( !sh_type_has_up_pointer(d, SH_type_l1_shadow) )
return 0;
- smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+ smfn = get_shadow_status(d, gl1mfn, SH_type_l1_shadow);
ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
/* Up to l2 */
@@ -2366,10 +2376,10 @@ int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
smfn = _mfn(sp->up >> PAGE_SHIFT);
ASSERT(mfn_valid(smfn));
-#if (SHADOW_PAGING_LEVELS == 4)
+#if (SHADOW_PAGING_LEVELS == 4)
/* up to l3 */
sp = mfn_to_page(smfn);
- ASSERT(sh_type_has_up_pointer(v, SH_type_l2_shadow));
+ ASSERT(sh_type_has_up_pointer(d, SH_type_l2_shadow));
if ( sp->u.sh.count != 1 || !sp->up )
return 0;
smfn = _mfn(sp->up >> PAGE_SHIFT);
@@ -2378,22 +2388,22 @@ int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
/* up to l4 */
sp = mfn_to_page(smfn);
if ( sp->u.sh.count != 1
- || !sh_type_has_up_pointer(v, SH_type_l3_64_shadow) || !sp->up )
+ || !sh_type_has_up_pointer(d, SH_type_l3_64_shadow) || !sp->up )
return 0;
smfn = _mfn(sp->up >> PAGE_SHIFT);
ASSERT(mfn_valid(smfn));
#endif
if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
-#if (SHADOW_PAGING_LEVELS == 3)
+#if (SHADOW_PAGING_LEVELS == 3)
|| pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
|| pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
- || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
+ || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
#endif
)
return 0;
-
- /* Only in use in one toplevel shadow, and it's not the one we're
+
+ /* Only in use in one toplevel shadow, and it's not the one we're
* running on */
return 1;
}
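/* Example of the walk above on 4-level shadows: an l1 shadow with a
 * single reference leads via sp->up to its l2, the l2 to its l3, and
 * the l3 to its l4.  If that l4 matches none of this vcpu's
 * shadow_table[] entries, the l1 cannot be reached from the running
 * toplevel, so skipping the sync is safe. */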
@@ -2401,18 +2411,19 @@ int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
/**************************************************************************/
-/* Functions which translate and install the shadows of arbitrary guest
+/* Functions which translate and install the shadows of arbitrary guest
* entries that we have just seen the guest write. */
-static inline int
+static inline int
sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
- void *new_gp, u32 size, u32 sh_type,
+ void *new_gp, u32 size, u32 sh_type,
u32 (*shadow_index)(mfn_t *smfn, u32 idx),
- int (*validate_ge)(struct vcpu *v, void *ge,
+ int (*validate_ge)(struct vcpu *v, void *ge,
mfn_t smfn, void *se))
/* Generic function for mapping and validating. */
{
+ struct domain *d = v->domain;
mfn_t smfn, smfn2, map_mfn;
shadow_l1e_t *sl1p;
u32 shadow_idx, guest_idx;
@@ -2425,12 +2436,12 @@ sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
/* Map the shadow page */
- smfn = get_shadow_status(v, gmfn, sh_type);
+ smfn = get_shadow_status(d, gmfn, sh_type);
ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
guest_idx = guest_index(new_gp);
map_mfn = smfn;
shadow_idx = shadow_index(&map_mfn, guest_idx);
- sl1p = sh_map_domain_page(map_mfn);
+ sl1p = map_domain_page(map_mfn);
/* Validate one entry at a time */
while ( size )
@@ -2442,8 +2453,8 @@ sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
{
/* We have moved to another page of the shadow */
map_mfn = smfn2;
- sh_unmap_domain_page(sl1p);
- sl1p = sh_map_domain_page(map_mfn);
+ unmap_domain_page(sl1p);
+ sl1p = map_domain_page(map_mfn);
}
result |= validate_ge(v,
new_gp,
@@ -2452,7 +2463,7 @@ sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
size -= sizeof(guest_l1e_t);
new_gp += sizeof(guest_l1e_t);
}
- sh_unmap_domain_page(sl1p);
+ unmap_domain_page(sl1p);
return result;
}
@@ -2462,25 +2473,25 @@ sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
void *new_gl4p, u32 size)
{
#if GUEST_PAGING_LEVELS >= 4
- return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
- SH_type_l4_shadow,
- shadow_l4_index,
+ return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
+ SH_type_l4_shadow,
+ shadow_l4_index,
validate_gl4e);
#else // ! GUEST_PAGING_LEVELS >= 4
SHADOW_ERROR("called in wrong paging mode!\n");
BUG();
return 0;
-#endif
+#endif
}
-
+
int
sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
void *new_gl3p, u32 size)
{
#if GUEST_PAGING_LEVELS >= 4
- return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
- SH_type_l3_shadow,
- shadow_l3_index,
+ return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
+ SH_type_l3_shadow,
+ shadow_l3_index,
validate_gl3e);
#else // ! GUEST_PAGING_LEVELS >= 4
SHADOW_ERROR("called in wrong paging mode!\n");
@@ -2493,9 +2504,9 @@ int
sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
void *new_gl2p, u32 size)
{
- return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
- SH_type_l2_shadow,
- shadow_l2_index,
+ return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
+ SH_type_l2_shadow,
+ shadow_l2_index,
validate_gl2e);
}
@@ -2504,9 +2515,9 @@ sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
void *new_gl2p, u32 size)
{
#if GUEST_PAGING_LEVELS >= 3
- return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
- SH_type_l2h_shadow,
- shadow_l2_index,
+ return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
+ SH_type_l2h_shadow,
+ shadow_l2_index,
validate_gl2e);
#else /* Non-PAE guests don't have different kinds of l2 table */
SHADOW_ERROR("called in wrong paging mode!\n");
@@ -2519,9 +2530,9 @@ int
sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
void *new_gl1p, u32 size)
{
- return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
- SH_type_l1_shadow,
- shadow_l1_index,
+ return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
+ SH_type_l1_shadow,
+ shadow_l1_index,
validate_gl1e);
}
@@ -2539,6 +2550,7 @@ sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
{
#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
+ struct domain *d = v->domain;
/* If the domain has never made a "dying" op, use the two-writes
* heuristic; otherwise, unshadow as soon as we write a zero for a dying
* process.
@@ -2546,15 +2558,15 @@ static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
* Don't bother trying to unshadow if it's not a PT, or if it's > l1.
*/
if ( ( v->arch.paging.shadow.pagetable_dying
- || ( !v->domain->arch.paging.shadow.pagetable_dying_op
+ || ( !d->arch.paging.shadow.pagetable_dying_op
&& v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn) ) )
&& sh_mfn_is_a_page_table(gmfn)
- && (!v->domain->arch.paging.shadow.pagetable_dying_op ||
+ && (!d->arch.paging.shadow.pagetable_dying_op ||
!(mfn_to_page(gmfn)->shadow_flags
& (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64))) )
{
perfc_incr(shadow_early_unshadow);
- sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
+ sh_remove_shadows(d, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
}
v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
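/* The "two writes" heuristic in plain terms: recording the emulated
 * pagetable MFN here means a second consecutive emulated write to
 * the same gmfn (for a domain that has never issued a
 * pagetable-dying op) is taken as a hint that the page is being
 * recycled as ordinary data, and the check above speculatively
 * unshadows it. */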
@@ -2572,7 +2584,7 @@ static inline void reset_early_unshadow(struct vcpu *v)
/**************************************************************************/
-/* Optimization: Prefetch multiple L1 entries. This is called after we have
+/* Optimization: Prefetch multiple L1 entries. This is called after we have
* demand-faulted a shadow l1e in the fault handler, to see if it's
* worth fetching some more.
*/
@@ -2582,9 +2594,10 @@ static inline void reset_early_unshadow(struct vcpu *v)
/* XXX magic number */
#define PREFETCH_DISTANCE 32
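/* Worked example of the distance clamp (the clamp itself sits in the
 * body below, outside this hunk; numbers are illustrative): a fault
 * at l1 index 500 of a 512-entry table leaves only 12 slots before
 * the end of the page, so dist is clamped to 12 rather than
 * PREFETCH_DISTANCE -- prefetch never walks past the l1 page it
 * started in. */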
-static void sh_prefetch(struct vcpu *v, walk_t *gw,
+static void sh_prefetch(struct vcpu *v, walk_t *gw,
shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
{
+ struct domain *d = v->domain;
int i, dist;
gfn_t gfn;
mfn_t gmfn;
@@ -2606,22 +2619,22 @@ static void sh_prefetch(struct vcpu *v, walk_t *gw,
if ( mfn_valid(gw->l1mfn) )
{
/* Normal guest page; grab the next guest entry */
- gl1p = sh_map_domain_page(gw->l1mfn);
+ gl1p = map_domain_page(gw->l1mfn);
gl1p += guest_l1_table_offset(gw->va);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
if ( mfn_is_out_of_sync(gw->l1mfn) )
{
- mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
+ mfn_t snpmfn = oos_snapshot_lookup(d, gw->l1mfn);
ASSERT(mfn_valid(snpmfn));
- snpl1p = sh_map_domain_page(snpmfn);
+ snpl1p = map_domain_page(snpmfn);
snpl1p += guest_l1_table_offset(gw->va);
}
#endif /* OOS */
}
- for ( i = 1; i < dist ; i++ )
+ for ( i = 1; i < dist ; i++ )
{
/* No point in prefetching if there's already a shadow */
if ( ptr_sl1e[i].l1 != 0 )
@@ -2634,28 +2647,28 @@ static void sh_prefetch(struct vcpu *v, walk_t *gw,
/* Not worth continuing if we hit an entry that will need another
* fault for A/D-bit propagation anyway */
gflags = guest_l1e_get_flags(gl1e);
- if ( (gflags & _PAGE_PRESENT)
+ if ( (gflags & _PAGE_PRESENT)
&& (!(gflags & _PAGE_ACCESSED)
|| ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
break;
- }
- else
+ }
+ else
{
/* Fragmented superpage, unless we've been called wrongly */
ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
/* Increment the l1e's GFN by the right number of guest pages */
gl1e = guest_l1e_from_gfn(
- _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
+ _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
guest_l1e_get_flags(gw->l1e));
}
/* Look at the gfn that the l1e is pointing at */
gfn = guest_l1e_get_gfn(gl1e);
- gmfn = get_gfn_query_unlocked(v->domain, gfn_x(gfn), &p2mt);
+ gmfn = get_gfn_query_unlocked(d, gfn_x(gfn), &p2mt);
/* Propagate the entry. */
l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
- (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, p2mt, sl1mfn);
+ (void) shadow_set_l1e(d, ptr_sl1e + i, sl1e, p2mt, sl1mfn);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
if ( snpl1p != NULL )
@@ -2663,10 +2676,10 @@ static void sh_prefetch(struct vcpu *v, walk_t *gw,
#endif /* OOS */
}
if ( gl1p != NULL )
- sh_unmap_domain_page(gl1p);
+ unmap_domain_page(gl1p);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
if ( snpl1p != NULL )
- sh_unmap_domain_page(snpl1p);
+ unmap_domain_page(snpl1p);
#endif /* OOS */
}
@@ -2715,7 +2728,7 @@ static inline void trace_shadow_fixup(guest_l1e_t gl1e,
__trace_var(event, 0/*!tsc*/, sizeof(d), &d);
}
}
-
+
static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
guest_va_t va)
{
@@ -2739,7 +2752,7 @@ static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
__trace_var(event, 0/*!tsc*/, sizeof(d), &d);
}
}
-
+
static inline void trace_shadow_emulate_other(u32 event,
guest_va_t va,
gfn_t gfn)
@@ -2807,8 +2820,8 @@ static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
* shadow code (and the guest should retry) or 0 if it is not (and the
* fault should be handled elsewhere or passed to the guest). */
-static int sh_page_fault(struct vcpu *v,
- unsigned long va,
+static int sh_page_fault(struct vcpu *v,
+ unsigned long va,
struct cpu_user_regs *regs)
{
struct domain *d = v->domain;
@@ -2848,7 +2861,7 @@ static int sh_page_fault(struct vcpu *v,
* Then try to emulate early to avoid lock acquisition.
*/
if ( v->arch.paging.last_write_emul_ok
- && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
+ && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
{
/* check whether the error code is 3, or else fall back to the normal path
* in case some validation is required
@@ -2858,7 +2871,7 @@ static int sh_page_fault(struct vcpu *v,
fast_emul = 1;
gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Fall back to the slow path if we're trying to emulate
writes to an out of sync page. */
if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
@@ -2886,7 +2899,7 @@ static int sh_page_fault(struct vcpu *v,
#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
if ( (regs->error_code & PFEC_reserved_bit) )
{
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* First, need to check that this isn't an out-of-sync
* shadow l1e. If it is, we fall back to the slow path, which
* will sync it up again. */
@@ -2902,7 +2915,7 @@ static int sh_page_fault(struct vcpu *v,
shadow_l2e_get_mfn(sl2e))))
|| unlikely(mfn_is_out_of_sync(gl1mfn)) )
{
- /* Hit the slow path as if there had been no
+ /* Hit the slow path as if there had been no
* shadow entry at all, and let it tidy up */
ASSERT(regs->error_code & PFEC_page_present);
regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
@@ -2910,10 +2923,10 @@ static int sh_page_fault(struct vcpu *v,
}
}
#endif /* SHOPT_OUT_OF_SYNC */
- /* The only reasons for reserved bits to be set in shadow entries
+ /* The only reasons for reserved bits to be set in shadow entries
* are the two "magic" shadow_l1e entries. */
- if ( likely((__copy_from_user(&sl1e,
- (sh_linear_l1_table(v)
+ if ( likely((__copy_from_user(&sl1e,
+ (sh_linear_l1_table(v)
+ shadow_l1_linear_offset(va)),
sizeof(sl1e)) == 0)
&& sh_l1e_is_magic(sl1e)) )
@@ -2935,8 +2948,8 @@ static int sh_page_fault(struct vcpu *v,
{
/* Magic MMIO marker: extract gfn for MMIO address */
ASSERT(sh_l1e_is_mmio(sl1e));
- gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
- << PAGE_SHIFT)
+ gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
+ << PAGE_SHIFT)
| (va & ~PAGE_MASK);
}
perfc_incr(shadow_fault_fast_mmio);
@@ -2949,24 +2962,24 @@ static int sh_page_fault(struct vcpu *v,
else
{
/* This should be exceptionally rare: another vcpu has fixed
- * the tables between the fault and our reading the l1e.
+ * the tables between the fault and our reading the l1e.
* Retry and let the hardware give us the right fault next time. */
perfc_incr(shadow_fault_fast_fail);
- SHADOW_PRINTK("fast path false alarm!\n");
+ SHADOW_PRINTK("fast path false alarm!\n");
trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va);
return EXCRET_fault_fixed;
}
}
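/* The "magic" l1e trick behind this fast path, in outline: for
 * guest-not-present and MMIO translations the shadow plants a
 * not-present l1e with reserved bits set, so the next access faults
 * with PFEC_reserved_bit and the needed information (e.g. the gfn
 * via sh_l1e_mmio_get_gfn()) is recovered straight from the entry,
 * with no guest-table walk.  The exact bit layout lives in the
 * shadow-private encoders and is not shown here. */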
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
page_fault_slow_path:
#endif
#endif /* SHOPT_FAST_FAULT_PATH */
/* Detect if this page fault happened while we were already in Xen
* doing a shadow operation. If that happens, the only thing we can
- * do is let Xen's normal fault handlers try to fix it. In any case,
- * a diagnostic trace of the fault will be more useful than
+ * do is let Xen's normal fault handlers try to fix it. In any case,
+ * a diagnostic trace of the fault will be more useful than
* a BUG() when we try to take the lock again. */
if ( unlikely(paging_locked_by_me(d)) )
{
@@ -2980,7 +2993,7 @@ static int sh_page_fault(struct vcpu *v,
/* The walk is done in a lock-free style, with some sanity check
* postponed after grabbing paging lock later. Those delayed checks
* will make sure no inconsistent mapping being translated into
- * shadow page table. */
+ * shadow page table. */
version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
rmb();
rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
@@ -3001,9 +3014,9 @@ static int sh_page_fault(struct vcpu *v,
goto propagate;
}
- /* It's possible that the guest has put pagetables in memory that it has
+ /* It's possible that the guest has put pagetables in memory that it has
* already used for some special purpose (ioreq pages, or granted pages).
- * If that happens we'll have killed the guest already but it's still not
+ * If that happens we'll have killed the guest already but it's still not
* safe to propagate entries out of the guest PT so get out now. */
if ( unlikely(d->is_shutting_down && d->shutdown_code == SHUTDOWN_crash) )
{
@@ -3019,12 +3032,12 @@ static int sh_page_fault(struct vcpu *v,
gfn = guest_l1e_get_gfn(gw.l1e);
gmfn = get_gfn(d, gfn, &p2mt);
- if ( shadow_mode_refcounts(d) &&
+ if ( shadow_mode_refcounts(d) &&
((!p2m_is_valid(p2mt) && !p2m_is_grant(p2mt)) ||
(!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
{
perfc_incr(shadow_fault_bail_bad_gfn);
- SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
+ SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
gfn_x(gfn), mfn_x(gmfn));
reset_early_unshadow(v);
put_gfn(d, gfn_x(gfn));
@@ -3033,7 +3046,7 @@ static int sh_page_fault(struct vcpu *v,
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
/* Remember this successful VA->GFN translation for later. */
- vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
+ vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
regs->error_code | PFEC_page_present);
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
@@ -3053,7 +3066,7 @@ static int sh_page_fault(struct vcpu *v,
shadow_prealloc(d,
SH_type_l1_shadow,
GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
-
+
rc = gw_remove_write_accesses(v, va, &gw);
/* First bit set: Removed write access to a page. */
@@ -3088,10 +3101,10 @@ static int sh_page_fault(struct vcpu *v,
shadow_audit_tables(v);
sh_audit_gw(v, &gw);
- /* Acquire the shadow. This must happen before we figure out the rights
+ /* Acquire the shadow. This must happen before we figure out the rights
* for the shadow entry, since we might promote a page here. */
ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
- if ( unlikely(ptr_sl1e == NULL) )
+ if ( unlikely(ptr_sl1e == NULL) )
{
/* Couldn't get the sl1e! Since we know the guest entries
* are OK, this can only have been caused by a failed
@@ -3143,21 +3156,21 @@ static int sh_page_fault(struct vcpu *v,
/* Calculate the shadow entry and write it */
l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
- r = shadow_set_l1e(v, ptr_sl1e, sl1e, p2mt, sl1mfn);
+ r = shadow_set_l1e(d, ptr_sl1e, sl1e, p2mt, sl1mfn);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
- if ( mfn_valid(gw.l1mfn)
+ if ( mfn_valid(gw.l1mfn)
&& mfn_is_out_of_sync(gw.l1mfn) )
{
/* Update the OOS snapshot. */
- mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
+ mfn_t snpmfn = oos_snapshot_lookup(d, gw.l1mfn);
guest_l1e_t *snp;
-
+
ASSERT(mfn_valid(snpmfn));
-
- snp = sh_map_domain_page(snpmfn);
+
+ snp = map_domain_page(snpmfn);
snp[guest_l1_table_offset(va)] = gw.l1e;
- sh_unmap_domain_page(snp);
+ unmap_domain_page(snp);
}
#endif /* OOS */
@@ -3168,7 +3181,7 @@ static int sh_page_fault(struct vcpu *v,
/* Need to emulate accesses to page tables */
if ( sh_mfn_is_a_page_table(gmfn)
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Unless they've been allowed to go out of sync with their
shadows and we don't need to unshadow it. */
&& !(mfn_is_out_of_sync(gmfn)
@@ -3181,7 +3194,8 @@ static int sh_page_fault(struct vcpu *v,
}
/* Need to hand off device-model MMIO to the device model */
- if ( p2mt == p2m_mmio_dm )
+ if ( p2mt == p2m_mmio_dm
+ || (p2mt == p2m_mmio_write_dm && ft == ft_demand_write) )
{
gpa = guest_walk_to_gpa(&gw);
goto mmio;
@@ -3201,10 +3215,10 @@ static int sh_page_fault(struct vcpu *v,
/* In HVM guests, we force CR0.WP always to be set, so that the
* pagetables are always write-protected. If the guest thinks
* CR0.WP is clear, we must emulate faulting supervisor writes to
- * allow the guest to write through read-only PTEs. Emulate if the
+ * allow the guest to write through read-only PTEs. Emulate if the
* fault was a non-user write to a present page. */
- if ( is_hvm_domain(d)
- && unlikely(!hvm_wp_enabled(v))
+ if ( is_hvm_domain(d)
+ && unlikely(!hvm_wp_enabled(v))
&& regs->error_code == (PFEC_write_access|PFEC_page_present)
&& mfn_valid(gmfn) )
{
@@ -3236,10 +3250,10 @@ static int sh_page_fault(struct vcpu *v,
*/
if ( (regs->error_code & PFEC_user_mode) )
{
- SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
+ SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
mfn_x(gmfn));
perfc_incr(shadow_fault_emulate_failed);
- sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
+ sh_remove_shadows(d, gmfn, 0 /* thorough */, 1 /* must succeed */);
trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER,
va, gfn);
goto done;
@@ -3281,7 +3295,7 @@ static int sh_page_fault(struct vcpu *v,
}
if ( !used )
- sh_remove_shadows(v, gmfn, 1 /* fast */, 0 /* can fail */);
+ sh_remove_shadows(d, gmfn, 1 /* fast */, 0 /* can fail */);
}
/*
@@ -3317,16 +3331,16 @@ static int sh_page_fault(struct vcpu *v,
}
#endif
gdprintk(XENLOG_DEBUG, "write to pagetable during event "
- "injection: cr2=%#lx, mfn=%#lx\n",
+ "injection: cr2=%#lx, mfn=%#lx\n",
va, mfn_x(gmfn));
- sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
+ sh_remove_shadows(d, gmfn, 0 /* thorough */, 1 /* must succeed */);
trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ,
va, gfn);
return EXCRET_fault_fixed;
}
}
- SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
+ SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
(unsigned long)regs->eip, (unsigned long)regs->esp);
emul_ops = shadow_init_emulation(&emul_ctxt, regs);
@@ -3348,12 +3362,12 @@ static int sh_page_fault(struct vcpu *v,
v->arch.paging.last_write_emul_ok = 0;
}
#endif
- SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
+ SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
mfn_x(gmfn));
- /* If this is actually a page table, then we have a bug, and need
- * to support more operations in the emulator. More likely,
+ /* If this is actually a page table, then we have a bug, and need
+ * to support more operations in the emulator. More likely,
* though, this is a hint that this page should not be shadowed. */
- shadow_remove_all_shadows(v, gmfn);
+ shadow_remove_all_shadows(d, gmfn);
trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
va, gfn);
@@ -3386,7 +3400,7 @@ static int sh_page_fault(struct vcpu *v,
if ( r == X86EMUL_OKAY ) {
int i, emulation_count=0;
this_cpu(trace_emulate_initial_va) = va;
- /* Emulate up to four extra instructions in the hope of catching
+ /* Emulate up to four extra instructions in the hope of catching
* the "second half" of a 64-bit pagetable write. */
for ( i = 0 ; i < 4 ; i++ )
{
@@ -3394,7 +3408,7 @@ static int sh_page_fault(struct vcpu *v,
v->arch.paging.last_write_was_pt = 0;
r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
if ( r == X86EMUL_OKAY )
- {
+ {
emulation_count++;
if ( v->arch.paging.last_write_was_pt )
{
@@ -3402,7 +3416,7 @@ static int sh_page_fault(struct vcpu *v,
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
break; /* Don't emulate past the other half of the write */
}
- else
+ else
perfc_incr(shadow_em_ex_non_pt);
}
else
@@ -3458,7 +3472,7 @@ sh_invlpg(struct vcpu *v, unsigned long va)
{
mfn_t sl1mfn;
shadow_l2e_t sl2e;
-
+
perfc_incr(shadow_invlpg);
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
@@ -3471,7 +3485,7 @@ sh_invlpg(struct vcpu *v, unsigned long va)
#endif
/* First check that we can safely read the shadow l2e. SMP/PAE linux can
- * run as high as 6% of invlpg calls where we haven't shadowed the l2
+ * run as high as 6% of invlpg calls where we haven't shadowed the l2
* yet. */
#if SHADOW_PAGING_LEVELS == 4
{
@@ -3483,7 +3497,7 @@ sh_invlpg(struct vcpu *v, unsigned long va)
/* This must still be a copy-from-user because we don't have the
* paging lock, and the higher-level shadows might disappear
* under our feet. */
- if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
+ if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
+ shadow_l3_linear_offset(va)),
sizeof (sl3e)) != 0 )
{
@@ -3502,7 +3516,7 @@ sh_invlpg(struct vcpu *v, unsigned long va)
/* This must still be a copy-from-user because we don't have the shadow
* lock, and the higher-level shadows might disappear under our feet. */
- if ( __copy_from_user(&sl2e,
+ if ( __copy_from_user(&sl2e,
sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
sizeof (sl2e)) != 0 )
{
@@ -3528,52 +3542,53 @@ sh_invlpg(struct vcpu *v, unsigned long va)
return 0;
}
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Check to see if the SL1 is out of sync. */
{
+ struct domain *d = v->domain;
mfn_t gl1mfn = backpointer(mfn_to_page(sl1mfn));
struct page_info *pg = mfn_to_page(gl1mfn);
- if ( mfn_valid(gl1mfn)
+ if ( mfn_valid(gl1mfn)
&& page_is_out_of_sync(pg) )
{
/* The test above may give false positives, since we don't
* hold the paging lock yet. Check again with the lock held. */
- paging_lock(v->domain);
+ paging_lock(d);
/* This must still be a copy-from-user because we didn't
* have the paging lock last time we checked, and the
* higher-level shadows might have disappeared under our
* feet. */
- if ( __copy_from_user(&sl2e,
+ if ( __copy_from_user(&sl2e,
sh_linear_l2_table(v)
+ shadow_l2_linear_offset(va),
sizeof (sl2e)) != 0 )
{
perfc_incr(shadow_invlpg_fault);
- paging_unlock(v->domain);
+ paging_unlock(d);
return 0;
}
if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
{
- paging_unlock(v->domain);
+ paging_unlock(d);
return 0;
}
sl1mfn = shadow_l2e_get_mfn(sl2e);
gl1mfn = backpointer(mfn_to_page(sl1mfn));
pg = mfn_to_page(gl1mfn);
-
+
if ( likely(sh_mfn_is_a_page_table(gl1mfn)
&& page_is_out_of_sync(pg) ) )
{
shadow_l1e_t *sl1;
sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
/* Remove the shadow entry that maps this VA */
- (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(),
+ (void) shadow_set_l1e(d, sl1, shadow_l1e_empty(),
p2m_invalid, sl1mfn);
}
- paging_unlock(v->domain);
+ paging_unlock(d);
/* Need the invlpg to pick up the disappearance of the sl1e */
return 1;
}
@@ -3597,7 +3612,7 @@ sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
/* Check the vTLB cache first */
unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
- if ( VALID_GFN(vtlb_gfn) )
+ if ( VALID_GFN(vtlb_gfn) )
return vtlb_gfn;
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
@@ -3636,7 +3651,7 @@ sh_update_linear_entries(struct vcpu *v)
* is subtler. Normal linear mappings are made by having an entry
* in the top-level table that points to itself (shadow linear) or
* to the guest top-level table (guest linear). For PAE, to set up
- * a linear map requires us to copy the four top-level entries into
+ * a linear map requires us to copy the four top-level entries into
* level-2 entries. That means that every time we change a PAE l3e,
* we need to reflect the change into the copy.
*
@@ -3646,44 +3661,44 @@ sh_update_linear_entries(struct vcpu *v)
* For HVM guests, the linear pagetables are installed in the monitor
* tables (since we can't put them in the shadow). Shadow linear
* pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
- * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
- * a linear pagetable of the monitor tables themselves. We have
+ * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
+ * a linear pagetable of the monitor tables themselves. We have
* the same issue of having to re-copy PAE l3 entries whenever we use
- * PAE shadows.
+ * PAE shadows.
*
- * Because HVM guests run on the same monitor tables regardless of the
- * shadow tables in use, the linear mapping of the shadow tables has to
- * be updated every time v->arch.shadow_table changes.
+ * Because HVM guests run on the same monitor tables regardless of the
+ * shadow tables in use, the linear mapping of the shadow tables has to
+ * be updated every time v->arch.shadow_table changes.
*/
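(The "points to itself" trick is easier to see in isolation. A sketch using the same helpers as the code below; the function name and the l4mfn parameter are illustrative only:)

    static void install_linear_slot(l4_pgentry_t *l4t, unsigned long l4mfn,
                                    unsigned int slot)
    {
        /* Point the chosen top-level slot back at the table itself, so
         * the VA range under 'slot' walks the pagetables as data. */
        l4t[slot] = l4e_from_pfn(l4mfn, __PAGE_HYPERVISOR);
    }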
/* Don't try to update the monitor table if it doesn't exist */
- if ( shadow_mode_external(d)
- && pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ if ( shadow_mode_external(d)
+ && pagetable_get_pfn(v->arch.monitor_table) == 0 )
return;
#if SHADOW_PAGING_LEVELS == 4
-
+
/* For PV, one l4e points at the guest l4, one points at the shadow
- * l4. No maintenance required.
+ * l4. No maintenance required.
* For HVM, just need to update the l4e that points to the shadow l4. */
if ( shadow_mode_external(d) )
{
/* Use the linear map if we can; otherwise make a new mapping */
- if ( v == current )
+ if ( v == current )
{
- __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
+ __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
__PAGE_HYPERVISOR);
- }
+ }
else
- {
+ {
l4_pgentry_t *ml4e;
- ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
- ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ ml4e = map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
__PAGE_HYPERVISOR);
- sh_unmap_domain_page(ml4e);
+ unmap_domain_page(ml4e);
}
}
@@ -3711,23 +3726,23 @@ sh_update_linear_entries(struct vcpu *v)
if ( v == current )
ml2e = __linear_l2_table
+ l2_linear_offset(SH_LINEAR_PT_VIRT_START);
- else
- {
+ else
+ {
mfn_t l3mfn, l2mfn;
l4_pgentry_t *ml4e;
l3_pgentry_t *ml3e;
int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
- ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ml4e = map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
- ml3e = sh_map_domain_page(l3mfn);
- sh_unmap_domain_page(ml4e);
+ ml3e = map_domain_page(l3mfn);
+ unmap_domain_page(ml4e);
ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
- ml2e = sh_map_domain_page(l2mfn);
- sh_unmap_domain_page(ml3e);
+ ml2e = map_domain_page(l2mfn);
+ unmap_domain_page(ml3e);
}
/* Shadow l3 tables are made up by sh_update_cr3 */
@@ -3735,15 +3750,15 @@ sh_update_linear_entries(struct vcpu *v)
for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
{
- ml2e[i] =
- (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
+ ml2e[i] =
+ (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
- __PAGE_HYPERVISOR)
+ __PAGE_HYPERVISOR)
: l2e_empty();
}
- if ( v != current )
- sh_unmap_domain_page(ml2e);
+ if ( v != current )
+ unmap_domain_page(ml2e);
}
else
domain_crash(d); /* XXX */
@@ -3757,11 +3772,11 @@ sh_update_linear_entries(struct vcpu *v)
/*
* Having modified the linear pagetable mapping, flush local host TLBs.
* This was not needed when vmenter/vmexit always had the side effect
- * of flushing host TLBs but, with ASIDs, it is possible to finish
- * this CR3 update, vmenter the guest, vmexit due to a page fault,
- * without an intervening host TLB flush. Then the page fault code
- * could use the linear pagetable to read a top-level shadow page
- * table entry. But, without this change, it would fetch the wrong
+ * of flushing host TLBs but, with ASIDs, it is possible to finish
+ * this CR3 update, vmenter the guest, vmexit due to a page fault,
+ * without an intervening host TLB flush. Then the page fault code
+ * could use the linear pagetable to read a top-level shadow page
+ * table entry. But, without this change, it would fetch the wrong
* value due to a stale TLB.
*/
flush_tlb_local();
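(The race closed by this flush, spelled out as a sequence -- illustrative only:)

    /* 1. sh_update_linear_entries() rewrites a linear-map l4e (above).
     * 2. vmenter the guest: with ASIDs, no host TLB flush happens.
     * 3. The guest faults; vmexit: again no host TLB flush.
     * 4. The fault handler reads a shadow entry through the linear map,
     *    and a stale host TLB entry hands back the *old* mapping.
     * The flush_tlb_local() after step 1 makes step 4 safe. */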
@@ -3775,6 +3790,7 @@ sh_update_linear_entries(struct vcpu *v)
static void
sh_detach_old_tables(struct vcpu *v)
{
+ struct domain *d = v->domain;
mfn_t smfn;
int i = 0;
@@ -3788,9 +3804,8 @@ sh_detach_old_tables(struct vcpu *v)
#else
if ( v->arch.paging.shadow.guest_vtable )
{
- struct domain *d = v->domain;
if ( shadow_mode_external(d) || shadow_mode_translate(d) )
- sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
+ unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
v->arch.paging.shadow.guest_vtable = NULL;
}
#endif // !NDEBUG
@@ -3807,23 +3822,23 @@ sh_detach_old_tables(struct vcpu *v)
{
smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
if ( mfn_x(smfn) )
- sh_put_ref(v, smfn, 0);
+ sh_put_ref(d, smfn, 0);
v->arch.shadow_table[i] = pagetable_null();
}
}
/* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
static void
-sh_set_toplevel_shadow(struct vcpu *v,
+sh_set_toplevel_shadow(struct vcpu *v,
int slot,
- mfn_t gmfn,
- unsigned int root_type)
+ mfn_t gmfn,
+ unsigned int root_type)
{
mfn_t smfn;
pagetable_t old_entry, new_entry;
struct domain *d = v->domain;
-
+
/* Remember the old contents of this slot */
old_entry = v->arch.shadow_table[slot];
@@ -3835,7 +3850,7 @@ sh_set_toplevel_shadow(struct vcpu *v,
}
/* Guest mfn is valid: shadow it and install the shadow */
- smfn = get_shadow_status(v, gmfn, root_type);
+ smfn = get_shadow_status(d, gmfn, root_type);
if ( !mfn_valid(smfn) )
{
/* Make sure there's enough free shadow memory. */
@@ -3844,20 +3859,20 @@ sh_set_toplevel_shadow(struct vcpu *v,
smfn = sh_make_shadow(v, gmfn, root_type);
}
ASSERT(mfn_valid(smfn));
-
+
/* Pin the shadow and put it (back) on the list of pinned shadows */
- if ( sh_pin(v, smfn) == 0 )
+ if ( sh_pin(d, smfn) == 0 )
{
SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
- domain_crash(v->domain);
+ domain_crash(d);
}
/* Take a ref to this page: it will be released in sh_detach_old_tables()
* or the next call to set_toplevel_shadow() */
- if ( !sh_get_ref(v, smfn, 0) )
+ if ( !sh_get_ref(d, smfn, 0) )
{
SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
- domain_crash(v->domain);
+ domain_crash(d);
}
new_entry = pagetable_from_mfn(smfn);
@@ -3875,12 +3890,12 @@ sh_set_toplevel_shadow(struct vcpu *v,
/* Need to repin the old toplevel shadow if it's been unpinned
* by shadow_prealloc(): in PV mode we're still running on this
* shadow and it's not safe to free it yet. */
- if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(v, old_smfn) )
+ if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(d, old_smfn) )
{
SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
- domain_crash(v->domain);
+ domain_crash(d);
}
- sh_put_ref(v, old_smfn, 0);
+ sh_put_ref(d, old_smfn, 0);
}
}
@@ -3891,10 +3906,10 @@ sh_update_cr3(struct vcpu *v, int do_locking)
* Paravirtual guests should set v->arch.guest_table (and guest_table_user,
* if appropriate).
* HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
- * this function will call hvm_update_guest_cr(v, 3) to tell them where the
+ * this function will call hvm_update_guest_cr(v, 3) to tell them where the
* shadow tables are.
- * If do_locking != 0, assume we are being called from outside the
- * shadow code, and must take and release the paging lock; otherwise
+ * If do_locking != 0, assume we are being called from outside the
+ * shadow code, and must take and release the paging lock; otherwise
* that is the caller's responsibility.
*/
{
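(Call-site sketch of the do_locking contract just described; paging_lock()/paging_unlock() are the real lock helpers, and the surrounding code is elided:)

    /* From outside the shadow code: let sh_update_cr3() take the lock. */
    sh_update_cr3(v, 1);

    /* From inside the shadow code, with the lock already held: */
    paging_lock(d);
    /* ... other shadow work ... */
    sh_update_cr3(v, 0);
    paging_unlock(d);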
@@ -3928,26 +3943,26 @@ sh_update_cr3(struct vcpu *v, int do_locking)
////
//// vcpu->arch.guest_table is already set
////
-
-#ifndef NDEBUG
+
+#ifndef NDEBUG
/* Double-check that the HVM code has sent us a sane guest_table */
if ( is_hvm_domain(d) )
{
ASSERT(shadow_mode_external(d));
if ( hvm_paging_enabled(v) )
ASSERT(pagetable_get_pfn(v->arch.guest_table));
- else
+ else
ASSERT(v->arch.guest_table.pfn
== d->arch.paging.shadow.unpaged_pagetable.pfn);
}
#endif
SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
- d->domain_id, v->vcpu_id,
+ d->domain_id, v->vcpu_id,
(unsigned long)pagetable_get_pfn(v->arch.guest_table));
#if GUEST_PAGING_LEVELS == 4
- if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
+ if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32bit_domain(d) )
gmfn = pagetable_get_mfn(v->arch.guest_table_user);
else
#endif
@@ -3961,8 +3976,8 @@ sh_update_cr3(struct vcpu *v, int do_locking)
if ( shadow_mode_external(d) || shadow_mode_translate(d) )
{
if ( v->arch.paging.shadow.guest_vtable )
- sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
- v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
+ unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
+ v->arch.paging.shadow.guest_vtable = map_domain_page_global(gmfn);
/* PAGING_LEVELS==4 implies 64-bit, which means that
* map_domain_page_global can't fail */
BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
@@ -3974,28 +3989,28 @@ sh_update_cr3(struct vcpu *v, int do_locking)
* table. We cache the current state of that table and shadow that,
* until the next CR3 write makes us refresh our cache. */
ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
-
- if ( shadow_mode_external(d) )
+
+ if ( shadow_mode_external(d) )
/* Find where in the page the l3 table is */
guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
else
- /* PV guest: l3 is at the start of a page */
- guest_idx = 0;
+ /* PV guest: l3 is at the start of a page */
+ guest_idx = 0;
// Ignore the low 2 bits of guest_idx -- they are really just
// cache control.
guest_idx &= ~3;
-
- gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
+
+ gl3e = ((guest_l3e_t *)map_domain_page(gmfn)) + guest_idx;
for ( i = 0; i < 4 ; i++ )
v->arch.paging.shadow.gl3e[i] = gl3e[i];
- sh_unmap_domain_page(gl3e);
+ unmap_domain_page(gl3e);
#elif GUEST_PAGING_LEVELS == 2
if ( shadow_mode_external(d) || shadow_mode_translate(d) )
{
if ( v->arch.paging.shadow.guest_vtable )
- sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
- v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
+ unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
+ v->arch.paging.shadow.guest_vtable = map_domain_page_global(gmfn);
/* Does this really need map_domain_page_global? Handle the
* error properly if so. */
BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
@@ -4012,14 +4027,14 @@ sh_update_cr3(struct vcpu *v, int do_locking)
////
/* We revoke write access to the new guest toplevel page(s) before we
- * replace the old shadow pagetable(s), so that we can safely use the
+ * replace the old shadow pagetable(s), so that we can safely use the
* (old) shadow linear maps in the writeable mapping heuristics. */
#if GUEST_PAGING_LEVELS == 2
- if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
+ if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 )
flush_tlb_mask(d->domain_dirty_cpumask);
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
#elif GUEST_PAGING_LEVELS == 3
- /* PAE guests have four shadow_table entries, based on the
+ /* PAE guests have four shadow_table entries, based on the
* current values of the guest's four l3es. */
{
int flush = 0;
@@ -4035,39 +4050,49 @@ sh_update_cr3(struct vcpu *v, int do_locking)
gl2gfn = guest_l3e_get_gfn(gl3e[i]);
gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt);
if ( p2m_is_ram(p2mt) )
- flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
+ flush |= sh_remove_write_access(d, gl2mfn, 2, 0);
}
}
- if ( flush )
+ if ( flush )
flush_tlb_mask(d->domain_dirty_cpumask);
/* Now install the new shadows. */
- for ( i = 0; i < 4; i++ )
+ for ( i = 0; i < 4; i++ )
{
if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
{
gl2gfn = guest_l3e_get_gfn(gl3e[i]);
gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt);
if ( p2m_is_ram(p2mt) )
- sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
- ? SH_type_l2h_shadow
+ sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
+ ? SH_type_l2h_shadow
: SH_type_l2_shadow);
else
- sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
+ sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
}
else
- sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
+ sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
}
}
#elif GUEST_PAGING_LEVELS == 4
- if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
+ if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 )
flush_tlb_mask(d->domain_dirty_cpumask);
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
+ if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) )
+ {
+ mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table[0]);
+
+ if ( !(v->arch.flags & TF_kernel_mode) && VM_ASSIST(d, m2p_strict) )
+ zap_ro_mpt(mfn_x(smfn));
+ else if ( (v->arch.flags & TF_kernel_mode) &&
+ !VM_ASSIST(d, m2p_strict) )
+ fill_ro_mpt(mfn_x(smfn));
+ }
#else
-#error This should never happen
+#error This should never happen
#endif
- ///
+ ///
/// v->arch.paging.shadow.l3table
///
#if SHADOW_PAGING_LEVELS == 3
@@ -4084,8 +4109,8 @@ sh_update_cr3(struct vcpu *v, int do_locking)
/* 3-on-3: make a PAE l3 that points at the four l2 pages */
smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
#endif
- v->arch.paging.shadow.l3table[i] =
- (mfn_x(smfn) == 0)
+ v->arch.paging.shadow.l3table[i] =
+ (mfn_x(smfn) == 0)
? shadow_l3e_empty()
: shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
}
@@ -4164,9 +4189,10 @@ sh_update_cr3(struct vcpu *v, int do_locking)
/* Functions to revoke guest rights */
#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
-int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
+int sh_rm_write_access_from_sl1p(struct domain *d, mfn_t gmfn,
mfn_t smfn, unsigned long off)
{
+ struct vcpu *curr = current;
int r;
shadow_l1e_t *sl1p, sl1e;
struct page_info *sp;
@@ -4175,9 +4201,9 @@ int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
ASSERT(mfn_valid(smfn));
/* Remember if we've been told that this process is being torn down */
- v->arch.paging.shadow.pagetable_dying
- = !!(mfn_to_page(gmfn)->shadow_flags & SHF_pagetable_dying);
-
+ if ( curr->domain == d )
+ curr->arch.paging.shadow.pagetable_dying
+ = !!(mfn_to_page(gmfn)->shadow_flags & SHF_pagetable_dying);
sp = mfn_to_page(smfn);
@@ -4186,23 +4212,23 @@ int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
&& sp->u.sh.type != SH_type_fl1_shadow) )
goto fail;
- sl1p = sh_map_domain_page(smfn);
+ sl1p = map_domain_page(smfn);
sl1p += off;
sl1e = *sl1p;
if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
!= (_PAGE_PRESENT|_PAGE_RW))
|| (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
{
- sh_unmap_domain_page(sl1p);
+ unmap_domain_page(sl1p);
goto fail;
}
/* Found it! Need to remove its write permissions. */
sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
- r = shadow_set_l1e(v, sl1p, sl1e, p2m_ram_rw, smfn);
+ r = shadow_set_l1e(d, sl1p, sl1e, p2m_ram_rw, smfn);
ASSERT( !(r & SHADOW_SET_ERROR) );
- sh_unmap_domain_page(sl1p);
+ unmap_domain_page(sl1p);
perfc_incr(shadow_writeable_h_7);
return 1;
@@ -4217,6 +4243,7 @@ static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
/* Look up this vaddr in the current shadow and see if it's a writeable
* mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
{
+ struct domain *d = v->domain;
shadow_l1e_t sl1e, *sl1p;
shadow_l2e_t *sl2p;
shadow_l3e_t *sl3p;
@@ -4235,7 +4262,7 @@ static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
return 0;
#else /* SHADOW_PAGING_LEVELS == 3 */
- sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
+ sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
+ shadow_l3_linear_offset(vaddr);
if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
return 0;
@@ -4253,7 +4280,7 @@ static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
/* Found it! Need to remove its write permissions. */
sl1mfn = shadow_l2e_get_mfn(*sl2p);
sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
- r = shadow_set_l1e(v, sl1p, sl1e, p2m_ram_rw, sl1mfn);
+ r = shadow_set_l1e(d, sl1p, sl1e, p2m_ram_rw, sl1mfn);
if ( r & SHADOW_SET_ERROR ) {
/* Can only currently happen if we found a grant-mapped
* page. Just make the guess fail. */
@@ -4264,29 +4291,31 @@ static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
}
#endif
-int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
+int sh_rm_write_access_from_l1(struct domain *d, mfn_t sl1mfn,
mfn_t readonly_mfn)
/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
{
shadow_l1e_t *sl1e;
int done = 0;
int flags;
-#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+ struct vcpu *curr = current;
mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
#endif
-
- SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
+
+ SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
{
flags = shadow_l1e_get_flags(*sl1e);
- if ( (flags & _PAGE_PRESENT)
- && (flags & _PAGE_RW)
+ if ( (flags & _PAGE_PRESENT)
+ && (flags & _PAGE_RW)
&& (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
{
shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
- (void) shadow_set_l1e(v, sl1e, ro_sl1e, p2m_ram_rw, sl1mfn);
-#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+ (void) shadow_set_l1e(d, sl1e, ro_sl1e, p2m_ram_rw, sl1mfn);
+#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
/* Remember the last shadow that we shot a writeable mapping in */
- v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
+ if ( curr->domain == d )
+ curr->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
#endif
if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
& PGT_count_mask) == 0 )
@@ -4298,20 +4327,20 @@ int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
}
-int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
+int sh_rm_mappings_from_l1(struct domain *d, mfn_t sl1mfn, mfn_t target_mfn)
/* Excises all mappings to guest frame from this shadow l1 table */
{
shadow_l1e_t *sl1e;
int done = 0;
int flags;
-
- SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
+
+ SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
{
flags = shadow_l1e_get_flags(*sl1e);
- if ( (flags & _PAGE_PRESENT)
+ if ( (flags & _PAGE_PRESENT)
&& (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
{
- (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(),
+ (void) shadow_set_l1e(d, sl1e, shadow_l1e_empty(),
p2m_invalid, sl1mfn);
if ( sh_check_page_has_no_refs(mfn_to_page(target_mfn)) )
/* This breaks us cleanly out of the FOREACH macro */
@@ -4324,46 +4353,46 @@ int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
/**************************************************************************/
/* Functions to excise all pointers to shadows from higher-level shadows. */
-void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
+void sh_clear_shadow_entry(struct domain *d, void *ep, mfn_t smfn)
/* Blank out a single shadow entry */
{
switch ( mfn_to_page(smfn)->u.sh.type )
{
case SH_type_l1_shadow:
- (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), p2m_invalid, smfn);
+ (void) shadow_set_l1e(d, ep, shadow_l1e_empty(), p2m_invalid, smfn);
break;
case SH_type_l2_shadow:
#if GUEST_PAGING_LEVELS >= 3
case SH_type_l2h_shadow:
#endif
- (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn);
+ (void) shadow_set_l2e(d, ep, shadow_l2e_empty(), smfn);
break;
#if GUEST_PAGING_LEVELS >= 4
case SH_type_l3_shadow:
- (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn);
+ (void) shadow_set_l3e(d, ep, shadow_l3e_empty(), smfn);
break;
case SH_type_l4_shadow:
- (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn);
+ (void) shadow_set_l4e(d, ep, shadow_l4e_empty(), smfn);
break;
#endif
default: BUG(); /* Called with the wrong kind of shadow. */
}
}
-int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
+int sh_remove_l1_shadow(struct domain *d, mfn_t sl2mfn, mfn_t sl1mfn)
/* Remove all mappings of this l1 shadow from this l2 shadow */
{
shadow_l2e_t *sl2e;
int done = 0;
int flags;
-
- SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
+
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, d,
{
flags = shadow_l2e_get_flags(*sl2e);
- if ( (flags & _PAGE_PRESENT)
+ if ( (flags & _PAGE_PRESENT)
&& (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
{
- (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ (void) shadow_set_l2e(d, sl2e, shadow_l2e_empty(), sl2mfn);
if ( mfn_to_page(sl1mfn)->u.sh.type == 0 )
/* This breaks us cleanly out of the FOREACH macro */
done = 1;
@@ -4373,20 +4402,20 @@ int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
}
#if GUEST_PAGING_LEVELS >= 4
-int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
+int sh_remove_l2_shadow(struct domain *d, mfn_t sl3mfn, mfn_t sl2mfn)
/* Remove all mappings of this l2 shadow from this l3 shadow */
{
shadow_l3e_t *sl3e;
int done = 0;
int flags;
-
- SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
+
+ SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
{
flags = shadow_l3e_get_flags(*sl3e);
- if ( (flags & _PAGE_PRESENT)
+ if ( (flags & _PAGE_PRESENT)
&& (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
{
- (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
+ (void) shadow_set_l3e(d, sl3e, shadow_l3e_empty(), sl3mfn);
if ( mfn_to_page(sl2mfn)->u.sh.type == 0 )
/* This breaks us cleanly out of the FOREACH macro */
done = 1;
@@ -4395,20 +4424,20 @@ int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
return done;
}
-int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
+int sh_remove_l3_shadow(struct domain *d, mfn_t sl4mfn, mfn_t sl3mfn)
/* Remove all mappings of this l3 shadow from this l4 shadow */
{
shadow_l4e_t *sl4e;
int done = 0;
int flags;
-
- SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
+
+ SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, d,
{
flags = shadow_l4e_get_flags(*sl4e);
- if ( (flags & _PAGE_PRESENT)
+ if ( (flags & _PAGE_PRESENT)
&& (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
{
- (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+ (void) shadow_set_l4e(d, sl4e, shadow_l4e_empty(), sl4mfn);
if ( mfn_to_page(sl3mfn)->u.sh.type == 0 )
/* This breaks us cleanly out of the FOREACH macro */
done = 1;
@@ -4416,7 +4445,7 @@ int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
});
return done;
}
-#endif /* 64bit guest */
+#endif /* 64bit guest */
/**************************************************************************/
/* Function for the guest to inform us that a process is being torn
@@ -4426,6 +4455,7 @@ int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
#if GUEST_PAGING_LEVELS == 3
static void sh_pagetable_dying(struct vcpu *v, paddr_t gpa)
{
+ struct domain *d = v->domain;
int i = 0;
int flush = 0;
int fast_path = 0;
@@ -4443,7 +4473,7 @@ static void sh_pagetable_dying(struct vcpu *v, paddr_t gpa)
fast_path = 1;
l3gfn = gpa >> PAGE_SHIFT;
- l3mfn = get_gfn_query(v->domain, _gfn(l3gfn), &p2mt);
+ l3mfn = get_gfn_query(d, _gfn(l3gfn), &p2mt);
if ( !mfn_valid(l3mfn) || !p2m_is_ram(p2mt) )
{
printk(XENLOG_DEBUG "sh_pagetable_dying: gpa not valid %"PRIpaddr"\n",
@@ -4451,11 +4481,11 @@ static void sh_pagetable_dying(struct vcpu *v, paddr_t gpa)
goto out_put_gfn;
}
- paging_lock(v->domain);
+ paging_lock(d);
if ( !fast_path )
{
- gl3pa = sh_map_domain_page(l3mfn);
+ gl3pa = map_domain_page(l3mfn);
gl3e = (guest_l3e_t *)(gl3pa + ((unsigned long)gpa & ~PAGE_MASK));
}
for ( i = 0; i < 4; i++ )
@@ -4474,66 +4504,67 @@ static void sh_pagetable_dying(struct vcpu *v, paddr_t gpa)
/* retrieving the l2s */
gl2a = guest_l3e_get_paddr(gl3e[i]);
gfn = gl2a >> PAGE_SHIFT;
- gmfn = get_gfn_query_unlocked(v->domain, gfn, &p2mt);
- smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l2_pae_shadow);
+ gmfn = get_gfn_query_unlocked(d, gfn, &p2mt);
+ smfn = shadow_hash_lookup(d, mfn_x(gmfn), SH_type_l2_pae_shadow);
}
if ( mfn_valid(smfn) )
{
gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying;
- shadow_unhook_mappings(v, smfn, 1/* user pages only */);
+ shadow_unhook_mappings(d, smfn, 1/* user pages only */);
flush = 1;
}
}
if ( flush )
- flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ flush_tlb_mask(d->domain_dirty_cpumask);
/* Remember that we've seen the guest use this interface, so we
* can rely on it using it in future, instead of guessing at
* when processes are being torn down. */
- v->domain->arch.paging.shadow.pagetable_dying_op = 1;
+ d->arch.paging.shadow.pagetable_dying_op = 1;
v->arch.paging.shadow.pagetable_dying = 1;
if ( !fast_path )
unmap_domain_page(gl3pa);
- paging_unlock(v->domain);
+ paging_unlock(d);
out_put_gfn:
- put_gfn(v->domain, l3gfn);
+ put_gfn(d, l3gfn);
}
#else
static void sh_pagetable_dying(struct vcpu *v, paddr_t gpa)
{
+ struct domain *d = v->domain;
mfn_t smfn, gmfn;
p2m_type_t p2mt;
- gmfn = get_gfn_query(v->domain, _gfn(gpa >> PAGE_SHIFT), &p2mt);
- paging_lock(v->domain);
+ gmfn = get_gfn_query(d, _gfn(gpa >> PAGE_SHIFT), &p2mt);
+ paging_lock(d);
#if GUEST_PAGING_LEVELS == 2
- smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l2_32_shadow);
+ smfn = shadow_hash_lookup(d, mfn_x(gmfn), SH_type_l2_32_shadow);
#else
- smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l4_64_shadow);
+ smfn = shadow_hash_lookup(d, mfn_x(gmfn), SH_type_l4_64_shadow);
#endif
-
+
if ( mfn_valid(smfn) )
{
mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying;
- shadow_unhook_mappings(v, smfn, 1/* user pages only */);
+ shadow_unhook_mappings(d, smfn, 1/* user pages only */);
/* Now flush the TLB: we removed toplevel mappings. */
- flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ flush_tlb_mask(d->domain_dirty_cpumask);
}
/* Remember that we've seen the guest use this interface, so we
* can rely on it using it in future, instead of guessing at
* when processes are being torn down. */
- v->domain->arch.paging.shadow.pagetable_dying_op = 1;
+ d->arch.paging.shadow.pagetable_dying_op = 1;
v->arch.paging.shadow.pagetable_dying = 1;
- paging_unlock(v->domain);
- put_gfn(v->domain, gpa >> PAGE_SHIFT);
+ paging_unlock(d);
+ put_gfn(d, gpa >> PAGE_SHIFT);
}
#endif
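(For context: a guest reaches sh_pagetable_dying() via the HVMOP_pagetable_dying hypercall, passing the guest-physical address of the dying toplevel. A guest-side sketch; the wrapper details vary by OS, and cr3_of_dying_process is an illustrative variable:)

    struct xen_hvm_pagetable_dying a = {
        .domid = DOMID_SELF,
        .gpa   = cr3_of_dying_process,
    };
    HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);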
@@ -4556,7 +4587,7 @@ static mfn_t emulate_gva_to_mfn(struct vcpu *v,
/* Translate the VA to a GFN */
gfn = sh_gva_to_gfn(v, NULL, vaddr, &pfec);
- if ( gfn == INVALID_GFN )
+ if ( gfn == INVALID_GFN )
{
if ( is_hvm_vcpu(v) )
hvm_inject_page_fault(pfec, vaddr);
@@ -4575,7 +4606,7 @@ static mfn_t emulate_gva_to_mfn(struct vcpu *v,
{
return _mfn(BAD_GFN_TO_MFN);
}
- if ( p2m_is_readonly(p2mt) )
+ if ( p2m_is_discard_write(p2mt) )
{
put_page(page);
return _mfn(READONLY_GFN);
@@ -4595,7 +4626,7 @@ static mfn_t emulate_gva_to_mfn(struct vcpu *v,
return mfn;
}
-/* Check that the user is allowed to perform this write.
+/* Check that the user is allowed to perform this write.
* Returns a mapped pointer to write to, or NULL for error. */
#define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE)
#define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION)
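(These macros smuggle X86EMUL_* error codes through the returned pointer, so callers must test the result before dereferencing. Roughly as follows -- the threshold check is a sketch, not the exact in-tree helper:)

    void *addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
    /* Error "pointers" are small integers and never valid mappings. */
    if ( (unsigned long)addr < PAGE_SIZE )
        return (int)(unsigned long)addr;    /* an X86EMUL_* code */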
@@ -4606,10 +4637,11 @@ static void *emulate_map_dest(struct vcpu *v,
u32 bytes,
struct sh_emulate_ctxt *sh_ctxt)
{
+ struct domain *d = v->domain;
void *map = NULL;
sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
- if ( !mfn_valid(sh_ctxt->mfn1) )
+ if ( !mfn_valid(sh_ctxt->mfn1) )
return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
MAPPING_EXCEPTION :
(mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ?
@@ -4624,40 +4656,40 @@ static void *emulate_map_dest(struct vcpu *v,
return MAPPING_UNHANDLEABLE;
}
#endif
-
+
/* Unaligned writes mean probably this isn't a pagetable */
if ( vaddr & (bytes - 1) )
- sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
+ sh_remove_shadows(d, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
{
/* Whole write fits on a single page */
sh_ctxt->mfn2 = _mfn(INVALID_MFN);
- map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
+ map = map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
}
- else
+ else
{
- unsigned long mfns[2];
+ mfn_t mfns[2];
- /* Cross-page emulated writes are only supported for HVM guests;
+ /* Cross-page emulated writes are only supported for HVM guests;
* PV guests ought to know better */
- if ( !is_hvm_vcpu(v) )
+ if ( !is_hvm_domain(d) )
return MAPPING_UNHANDLEABLE;
/* This write crosses a page boundary. Translate the second page */
sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
sh_ctxt);
- if ( !mfn_valid(sh_ctxt->mfn2) )
+ if ( !mfn_valid(sh_ctxt->mfn2) )
return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
MAPPING_EXCEPTION :
(mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
/* Cross-page writes mean probably not a pagetable */
- sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
-
- mfns[0] = mfn_x(sh_ctxt->mfn1);
- mfns[1] = mfn_x(sh_ctxt->mfn2);
+ sh_remove_shadows(d, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
+
+ mfns[0] = sh_ctxt->mfn1;
+ mfns[1] = sh_ctxt->mfn2;
map = vmap(mfns, 2);
if ( !map )
return MAPPING_UNHANDLEABLE;
@@ -4689,10 +4721,10 @@ static void emulate_unmap_dest(struct vcpu *v,
{
if ( ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
check_for_early_unshadow(v, sh_ctxt->mfn1);
- /* Don't reset the heuristic if we're writing zeros at non-aligned
+ /* Don't reset the heuristic if we're writing zeros at non-aligned
* addresses, otherwise it doesn't catch REP MOVSD on PAE guests */
}
- else
+ else
reset_early_unshadow(v);
/* We can avoid re-verifying the page contents after the write if:
@@ -4716,11 +4748,11 @@ static void emulate_unmap_dest(struct vcpu *v,
&& bytes <= 4)) )
{
/* Writes with this alignment constraint can't possibly cross pages */
- ASSERT(!mfn_valid(sh_ctxt->mfn2));
+ ASSERT(!mfn_valid(sh_ctxt->mfn2));
}
- else
+ else
#endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
- {
+ {
if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
{
/* Validate as two writes, one to each page */
@@ -4741,8 +4773,8 @@ static void emulate_unmap_dest(struct vcpu *v,
paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
vunmap((void *)((unsigned long)addr & PAGE_MASK));
}
- else
- sh_unmap_domain_page(addr);
+ else
+ unmap_domain_page(addr);
atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
}
@@ -4787,7 +4819,7 @@ sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
}
static int
-sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
+sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
unsigned long old, unsigned long new,
unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
{
@@ -4815,7 +4847,7 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
prev = ~old;
}
- if ( prev != old )
+ if ( prev != old )
rv = X86EMUL_CMPXCHG_FAILED;
SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
@@ -4864,28 +4896,28 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
} while (0)
static char * sh_audit_flags(struct vcpu *v, int level,
- int gflags, int sflags)
+ int gflags, int sflags)
/* Common code for auditing flag bits */
{
if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
return "shadow is present but guest is not present";
- if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
+ if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
return "global bit set in PV shadow";
if ( level == 2 && (sflags & _PAGE_PSE) )
return "PS bit set in shadow";
#if SHADOW_PAGING_LEVELS == 3
if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
#endif
- if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
+ if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
return "accessed bit not propagated";
if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
- && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
+ && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
return "dirty bit not propagated";
- if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
+ if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
return "user/supervisor bit does not match";
- if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
+ if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
return "NX bit does not match";
- if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
+ if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
return "shadow grants write access but guest does not";
return NULL;
}
@@ -4899,7 +4931,7 @@ int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
p2m_type_t p2mt;
char *s;
int done = 0;
-
+
/* Follow the backpointer */
ASSERT(mfn_to_page(sl1mfn)->u.sh.head);
gl1mfn = backpointer(mfn_to_page(sl1mfn));
@@ -4913,35 +4945,35 @@ int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
}
#endif
- gl1e = gp = sh_map_domain_page(gl1mfn);
+ gl1e = gp = map_domain_page(gl1mfn);
SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
- if ( sh_l1e_is_magic(*sl1e) )
+ if ( sh_l1e_is_magic(*sl1e) )
{
#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
if ( sh_l1e_is_gnp(*sl1e) )
{
if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
- }
- else
+ }
+ else
{
ASSERT(sh_l1e_is_mmio(*sl1e));
gfn = sh_l1e_mmio_get_gfn(*sl1e);
if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
- AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
+ AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
" but guest gfn is %" SH_PRI_gfn,
gfn_x(gfn),
gfn_x(guest_l1e_get_gfn(*gl1e)));
}
#endif
}
- else
+ else
{
s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
shadow_l1e_get_flags(*sl1e));
if ( s ) AUDIT_FAIL(1, "%s", s);
-
+
if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
{
gfn = guest_l1e_get_gfn(*gl1e);
@@ -4954,7 +4986,7 @@ int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
}
}
});
- sh_unmap_domain_page(gp);
+ unmap_domain_page(gp);
return done;
}
@@ -4971,12 +5003,12 @@ int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
f = shadow_l1e_get_flags(*sl1e);
f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
- if ( !(f == 0
+ if ( !(f == 0
|| f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
- _PAGE_ACCESSED)
+ _PAGE_ACCESSED)
|| f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED)
|| f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
- _PAGE_ACCESSED|_PAGE_DIRTY)
+ _PAGE_ACCESSED|_PAGE_DIRTY)
|| f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
|| sh_l1e_is_magic(*sl1e)) )
AUDIT_FAIL(1, "fl1e has bad flags");
@@ -4986,6 +5018,7 @@ int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
{
+ struct domain *d = v->domain;
guest_l2e_t *gl2e, *gp;
shadow_l2e_t *sl2e;
mfn_t mfn, gmfn, gl2mfn;
@@ -5004,8 +5037,8 @@ int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
#endif
- gl2e = gp = sh_map_domain_page(gl2mfn);
- SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
+ gl2e = gp = map_domain_page(gl2mfn);
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, d, {
s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
shadow_l2e_get_flags(*sl2e));
@@ -5015,28 +5048,29 @@ int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
{
gfn = guest_l2e_get_gfn(*gl2e);
mfn = shadow_l2e_get_mfn(*sl2e);
- gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
- ? get_fl1_shadow_status(v, gfn)
- : get_shadow_status(v,
- get_gfn_query_unlocked(v->domain, gfn_x(gfn),
+ gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
+ ? get_fl1_shadow_status(d, gfn)
+ : get_shadow_status(d,
+ get_gfn_query_unlocked(d, gfn_x(gfn),
&p2mt), SH_type_l1_shadow);
if ( mfn_x(gmfn) != mfn_x(mfn) )
AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
" (--> %" PRI_mfn ")"
" --> %" PRI_mfn " != mfn %" PRI_mfn,
- gfn_x(gfn),
+ gfn_x(gfn),
(guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
- : mfn_x(get_gfn_query_unlocked(v->domain,
+ : mfn_x(get_gfn_query_unlocked(d,
gfn_x(gfn), &p2mt)), mfn_x(gmfn), mfn_x(mfn));
}
});
- sh_unmap_domain_page(gp);
+ unmap_domain_page(gp);
return 0;
}
#if GUEST_PAGING_LEVELS >= 4
int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
{
+ struct domain *d = v->domain;
guest_l3e_t *gl3e, *gp;
shadow_l3e_t *sl3e;
mfn_t mfn, gmfn, gl3mfn;
@@ -5049,13 +5083,13 @@ int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
ASSERT(mfn_to_page(sl3mfn)->u.sh.head);
gl3mfn = backpointer(mfn_to_page(sl3mfn));
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Only L1's may be out of sync. */
if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
#endif
- gl3e = gp = sh_map_domain_page(gl3mfn);
+ gl3e = gp = map_domain_page(gl3mfn);
SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
@@ -5066,11 +5100,11 @@ int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
{
gfn = guest_l3e_get_gfn(*gl3e);
mfn = shadow_l3e_get_mfn(*sl3e);
- gmfn = get_shadow_status(v, get_gfn_query_unlocked(
- v->domain, gfn_x(gfn), &p2mt),
+ gmfn = get_shadow_status(d, get_gfn_query_unlocked(
+ d, gfn_x(gfn), &p2mt),
((GUEST_PAGING_LEVELS == 3 ||
- is_pv_32on64_vcpu(v))
- && !shadow_mode_external(v->domain)
+ is_pv_32bit_domain(d))
+ && !shadow_mode_external(d)
&& (guest_index(gl3e) % 4) == 3)
? SH_type_l2h_shadow
: SH_type_l2_shadow);
@@ -5080,12 +5114,13 @@ int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
}
});
- sh_unmap_domain_page(gp);
+ unmap_domain_page(gp);
return 0;
}
int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
{
+ struct domain *d = v->domain;
guest_l4e_t *gl4e, *gp;
shadow_l4e_t *sl4e;
mfn_t mfn, gmfn, gl4mfn;
@@ -5098,14 +5133,14 @@ int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
ASSERT(mfn_to_page(sl4mfn)->u.sh.head);
gl4mfn = backpointer(mfn_to_page(sl4mfn));
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Only L1's may be out of sync. */
if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
#endif
- gl4e = gp = sh_map_domain_page(gl4mfn);
- SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
+ gl4e = gp = map_domain_page(gl4mfn);
+ SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, d,
{
s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
shadow_l4e_get_flags(*sl4e));
@@ -5115,8 +5150,8 @@ int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
{
gfn = guest_l4e_get_gfn(*gl4e);
mfn = shadow_l4e_get_mfn(*sl4e);
- gmfn = get_shadow_status(v, get_gfn_query_unlocked(
- v->domain, gfn_x(gfn), &p2mt),
+ gmfn = get_shadow_status(d, get_gfn_query_unlocked(
+ d, gfn_x(gfn), &p2mt),
SH_type_l3_shadow);
if ( mfn_x(gmfn) != mfn_x(mfn) )
AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
@@ -5124,7 +5159,7 @@ int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
}
});
- sh_unmap_domain_page(gp);
+ unmap_domain_page(gp);
return 0;
}
#endif /* GUEST_PAGING_LEVELS >= 4 */
@@ -5138,7 +5173,7 @@ int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
/* Entry points into this mode of the shadow code.
* This will all be mangled by the preprocessor to uniquify everything. */
const struct paging_mode sh_paging_mode = {
- .page_fault = sh_page_fault,
+ .page_fault = sh_page_fault,
.invlpg = sh_invlpg,
.gva_to_gfn = sh_gva_to_gfn,
.update_cr3 = sh_update_cr3,
@@ -5167,5 +5202,5 @@ const struct paging_mode sh_paging_mode = {
* c-file-style: "BSD"
* c-basic-offset: 4
* indent-tabs-mode: nil
- * End:
+ * End:
*/
diff --git a/xen/arch/x86/mm/shadow/multi.h b/xen/arch/x86/mm/shadow/multi.h
index 835121e..b5cc1e9 100644
--- a/xen/arch/x86/mm/shadow/multi.h
+++ b/xen/arch/x86/mm/shadow/multi.h
@@ -17,84 +17,83 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
-extern int
+extern int
SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, GUEST_LEVELS)(
struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size);
-extern int
+extern int
SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, GUEST_LEVELS)(
struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
-extern int
+extern int
SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, GUEST_LEVELS)(
struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
-extern int
+extern int
SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, GUEST_LEVELS)(
struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size);
-extern int
+extern int
SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, GUEST_LEVELS)(
struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size);
-extern void
+extern void
SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, GUEST_LEVELS)(
- struct vcpu *v, mfn_t smfn);
-extern void
+ struct domain *d, mfn_t smfn);
+extern void
SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, GUEST_LEVELS)(
- struct vcpu *v, mfn_t smfn);
-extern void
+ struct domain *d, mfn_t smfn);
+extern void
SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, GUEST_LEVELS)(
- struct vcpu *v, mfn_t smfn);
-extern void
+ struct domain *d, mfn_t smfn);
+extern void
SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, GUEST_LEVELS)(
- struct vcpu *v, mfn_t smfn);
+ struct domain *d, mfn_t smfn);
-extern void
+extern void
SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl2mfn, int user_only);
-extern void
+ (struct domain *d, mfn_t sl2mfn, int user_only);
+extern void
SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl3mfn, int user_only);
-extern void
+ (struct domain *d, mfn_t sl3mfn, int user_only);
+extern void
SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl4mfn, int user_only);
+ (struct domain *d, mfn_t sl4mfn, int user_only);
extern int
SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn);
+ (struct domain *d, mfn_t sl1mfn, mfn_t readonly_mfn);
extern int
SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn);
+ (struct domain *d, mfn_t sl1mfn, mfn_t target_mfn);
extern void
SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, GUEST_LEVELS)
- (struct vcpu *v, void *ep, mfn_t smfn);
+ (struct domain *d, void *ep, mfn_t smfn);
extern int
SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn);
+ (struct domain *d, mfn_t sl2mfn, mfn_t sl1mfn);
extern int
SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn);
+ (struct domain *d, mfn_t sl3mfn, mfn_t sl2mfn);
extern int
SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, GUEST_LEVELS)
- (struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn);
+ (struct domain *d, mfn_t sl4mfn, mfn_t sl3mfn);
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
-int
+int
SHADOW_INTERNAL_NAME(sh_audit_l1_table, GUEST_LEVELS)
(struct vcpu *v, mfn_t sl1mfn, mfn_t x);
-int
+int
SHADOW_INTERNAL_NAME(sh_audit_fl1_table, GUEST_LEVELS)
(struct vcpu *v, mfn_t sl1mfn, mfn_t x);
-int
+int
SHADOW_INTERNAL_NAME(sh_audit_l2_table, GUEST_LEVELS)
(struct vcpu *v, mfn_t sl2mfn, mfn_t x);
-int
+int
SHADOW_INTERNAL_NAME(sh_audit_l3_table, GUEST_LEVELS)
(struct vcpu *v, mfn_t sl3mfn, mfn_t x);
-int
+int
SHADOW_INTERNAL_NAME(sh_audit_l4_table, GUEST_LEVELS)
(struct vcpu *v, mfn_t sl4mfn, mfn_t x);
#endif
@@ -117,7 +116,7 @@ extern const struct paging_mode
SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS);
#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
-extern void
+extern void
SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS)
(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn);
@@ -127,5 +126,5 @@ SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS)
extern int
SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p, GUEST_LEVELS)
- (struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off);
+ (struct domain *d, mfn_t gmfn, mfn_t smfn, unsigned long off);
#endif
diff --git a/xen/arch/x86/mm/shadow/none.c b/xen/arch/x86/mm/shadow/none.c
new file mode 100644
index 0000000..9526443
--- /dev/null
+++ b/xen/arch/x86/mm/shadow/none.c
@@ -0,0 +1,78 @@
+#include <xen/mm.h>
+#include <asm/shadow.h>
+
+static int _enable_log_dirty(struct domain *d, bool_t log_global)
+{
+ ASSERT(is_pv_domain(d));
+ return -EOPNOTSUPP;
+}
+
+static int _disable_log_dirty(struct domain *d)
+{
+ ASSERT(is_pv_domain(d));
+ return -EOPNOTSUPP;
+}
+
+static void _clean_dirty_bitmap(struct domain *d)
+{
+ ASSERT(is_pv_domain(d));
+}
+
+int shadow_domain_init(struct domain *d, unsigned int domcr_flags)
+{
+ paging_log_dirty_init(d, _enable_log_dirty,
+ _disable_log_dirty, _clean_dirty_bitmap);
+ return is_pv_domain(d) ? 0 : -EOPNOTSUPP;
+}
+
+static int _page_fault(struct vcpu *v, unsigned long va,
+ struct cpu_user_regs *regs)
+{
+ ASSERT_UNREACHABLE();
+ return 0;
+}
+
+static int _invlpg(struct vcpu *v, unsigned long va)
+{
+ ASSERT_UNREACHABLE();
+ return -EOPNOTSUPP;
+}
+
+static unsigned long _gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
+ unsigned long va, uint32_t *pfec)
+{
+ ASSERT_UNREACHABLE();
+ return INVALID_GFN;
+}
+
+static void _update_cr3(struct vcpu *v, int do_locking)
+{
+ ASSERT_UNREACHABLE();
+}
+
+static void _update_paging_modes(struct vcpu *v)
+{
+ ASSERT_UNREACHABLE();
+}
+
+static void _write_p2m_entry(struct domain *d, unsigned long gfn,
+ l1_pgentry_t *p, l1_pgentry_t new,
+ unsigned int level)
+{
+ ASSERT_UNREACHABLE();
+}
+
+static const struct paging_mode sh_paging_none = {
+ .page_fault = _page_fault,
+ .invlpg = _invlpg,
+ .gva_to_gfn = _gva_to_gfn,
+ .update_cr3 = _update_cr3,
+ .update_paging_modes = _update_paging_modes,
+ .write_p2m_entry = _write_p2m_entry,
+};
+
+void shadow_vcpu_init(struct vcpu *v)
+{
+ ASSERT(is_pv_vcpu(v));
+ v->arch.paging.mode = &sh_paging_none;
+}
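(Presumably wired in from paging_vcpu_init() in paging.c; this dispatch is an assumption about the caller, not part of the new file:)

    void paging_vcpu_init(struct vcpu *v)
    {
        if ( hap_enabled(v->domain) )
            hap_vcpu_init(v);        /* HVM/HAP domains get a real mode */
        else
            shadow_vcpu_init(v);     /* PV: installs sh_paging_none */
    }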
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index b778fcf..2d146cf 100644
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -17,8 +17,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XEN_SHADOW_PRIVATE_H
@@ -139,7 +138,7 @@ enum {
#endif
/******************************************************************************
- * Auditing routines
+ * Auditing routines
*/
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
@@ -191,27 +190,27 @@ extern void shadow_audit_tables(struct vcpu *v);
#define SH_type_oos_snapshot (16U) /* in use as OOS snapshot */
#define SH_type_unused (17U)
-/*
+/*
* What counts as a pinnable shadow?
*/
-static inline int sh_type_is_pinnable(struct vcpu *v, unsigned int t)
+static inline int sh_type_is_pinnable(struct domain *d, unsigned int t)
{
- /* Top-level shadow types in each mode can be pinned, so that they
+ /* Top-level shadow types in each mode can be pinned, so that they
* persist even when not currently in use in a guest CR3 */
if ( t == SH_type_l2_32_shadow
|| t == SH_type_l2_pae_shadow
- || t == SH_type_l2h_pae_shadow
+ || t == SH_type_l2h_pae_shadow
|| t == SH_type_l4_64_shadow )
return 1;
-#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
/* Early 64-bit linux used three levels of pagetables for the guest
* and context switched by changing one l4 entry in a per-cpu l4
* page. When we're shadowing those kernels, we have to pin l3
* shadows so they don't just evaporate on every context switch.
- * For all other guests, we'd rather use the up-pointer field in l3s. */
- if ( unlikely((v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL)
+ * For all other guests, we'd rather use the up-pointer field in l3s. */
+ if ( unlikely((d->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL)
&& t == SH_type_l3_64_shadow) )
return 1;
#endif
@@ -220,7 +219,7 @@ static inline int sh_type_is_pinnable(struct vcpu *v, unsigned int t)
return 0;
}
-static inline int sh_type_has_up_pointer(struct vcpu *v, unsigned int t)
+static inline int sh_type_has_up_pointer(struct domain *d, unsigned int t)
{
/* Multi-page shadows don't have up-pointers */
if ( t == SH_type_l1_32_shadow
@@ -228,7 +227,18 @@ static inline int sh_type_has_up_pointer(struct vcpu *v, unsigned int t)
|| t == SH_type_l2_32_shadow )
return 0;
/* Pinnable shadows don't have up-pointers either */
- return !sh_type_is_pinnable(v, t);
+ return !sh_type_is_pinnable(d, t);
+}
+
+static inline void sh_terminate_list(struct page_list_head *tmp_list)
+{
+#ifndef PAGE_LIST_NULL
+ /* The temporary list-head is on our stack. Invalidate the
+ * pointers to it in the shadows, just to get a clean failure if
+ * we accidentally follow them. */
+ tmp_list->prev->next = LIST_POISON1;
+ tmp_list->next->prev = LIST_POISON2;
+#endif
}
/*
@@ -260,9 +270,9 @@ static inline int sh_type_has_up_pointer(struct vcpu *v, unsigned int t)
#define SHF_L1_ANY (SHF_L1_32|SHF_L1_PAE|SHF_L1_64)
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Marks a guest L1 page table which is shadowed but not write-protected.
- * If set, then *only* L1 shadows (SHF_L1_*) are allowed.
+ * If set, then *only* L1 shadows (SHF_L1_*) are allowed.
*
* out_of_sync indicates that the shadow tables may not reflect the
* guest tables. If it is clear, then the shadow tables *must* reflect
@@ -270,9 +280,9 @@ static inline int sh_type_has_up_pointer(struct vcpu *v, unsigned int t)
*
* oos_may_write indicates that a page may have writable mappings.
*
- * Most of the time the flags are synonymous. There is a short period of time
- * during resync that oos_may_write is clear but out_of_sync is not. If a
- * codepath is called during that time and is sensitive to oos issues, it may
+ * Most of the time the flags are synonymous. There is a short period of time
+ * during resync that oos_may_write is clear but out_of_sync is not. If a
+ * codepath is called during that time and is sensitive to oos issues, it may
* need to use the second flag.
*/
#define SHF_out_of_sync (1u<<30)
@@ -292,69 +302,78 @@ static inline int sh_page_has_multiple_shadows(struct page_info *pg)
return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 );
}
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* The caller must verify this is reasonable to call; i.e., valid mfn,
* domain is translated, &c */
-static inline int page_is_out_of_sync(struct page_info *p)
+static inline int page_is_out_of_sync(struct page_info *p)
{
return (p->count_info & PGC_page_table)
&& (p->shadow_flags & SHF_out_of_sync);
}
-static inline int mfn_is_out_of_sync(mfn_t gmfn)
+static inline int mfn_is_out_of_sync(mfn_t gmfn)
{
return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn)));
}
-static inline int page_oos_may_write(struct page_info *p)
+static inline int page_oos_may_write(struct page_info *p)
{
return (p->count_info & PGC_page_table)
&& (p->shadow_flags & SHF_oos_may_write);
}
-static inline int mfn_oos_may_write(mfn_t gmfn)
+static inline int mfn_oos_may_write(mfn_t gmfn)
{
return page_oos_may_write(mfn_to_page(mfn_x(gmfn)));
}
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+/* Figure out the size (in pages) of a given shadow type */
+extern const u8 sh_type_to_size[SH_type_unused];
+static inline unsigned int
+shadow_size(unsigned int shadow_type)
+{
+ ASSERT(shadow_type < ARRAY_SIZE(sh_type_to_size));
+ return sh_type_to_size[shadow_type];
+}
+
/******************************************************************************
- * Various function declarations
+ * Various function declarations
*/
/* Hash table functions */
-mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t);
-void shadow_hash_insert(struct vcpu *v,
+mfn_t shadow_hash_lookup(struct domain *d, unsigned long n, unsigned int t);
+void shadow_hash_insert(struct domain *d,
unsigned long n, unsigned int t, mfn_t smfn);
-void shadow_hash_delete(struct vcpu *v,
+void shadow_hash_delete(struct domain *d,
unsigned long n, unsigned int t, mfn_t smfn);
/* shadow promotion */
-void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type);
-void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type);
+void shadow_promote(struct domain *d, mfn_t gmfn, u32 type);
+void shadow_demote(struct domain *d, mfn_t gmfn, u32 type);
/* Shadow page allocation functions */
void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count);
-mfn_t shadow_alloc(struct domain *d,
+mfn_t shadow_alloc(struct domain *d,
u32 shadow_type,
unsigned long backpointer);
void shadow_free(struct domain *d, mfn_t smfn);
/* Install the xen mappings in various flavours of shadow */
-void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn);
+void sh_install_xen_entries_in_l4(struct domain *, mfn_t gl4mfn, mfn_t sl4mfn);
/* Update the shadows in response to a pagetable write from Xen */
int sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size);
/* Update the shadows in response to a pagetable write from a HVM guest */
-void sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+void sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
void *entry, u32 size);
/* Remove all writeable mappings of a guest frame from the shadows.
- * Returns non-zero if we need to flush TLBs.
+ * Returns non-zero if we need to flush TLBs.
* level and fault_addr describe how we found this to be a pagetable;
* level==0 means we have some other reason for revoking write access. */
-extern int sh_remove_write_access(struct vcpu *v, mfn_t readonly_mfn,
+extern int sh_remove_write_access(struct domain *d, mfn_t readonly_mfn,
unsigned int level,
unsigned long fault_addr);
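(Usage mirrors the multi.c hunks above, e.g. revoking write access to a new 4-level toplevel before shadowing it:)

    if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 )
        flush_tlb_mask(d->domain_dirty_cpumask);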
@@ -367,20 +386,25 @@ int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
intpte_t *old, intpte_t new, mfn_t gmfn);
+/* Update all the things that are derived from the guest's CR0/CR3/CR4.
+ * Called to initialize paging structures if the paging mode
+ * has changed, and when bringing up a VCPU for the first time. */
+void shadow_update_paging_modes(struct vcpu *v);
+
/* Unhook the non-Xen mappings in this top-level shadow mfn.
* With user_only == 1, unhooks only the user-mode mappings. */
-void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn, int user_only);
+void shadow_unhook_mappings(struct domain *d, mfn_t smfn, int user_only);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Allow a shadowed page to go out of sync */
int sh_unsync(struct vcpu *v, mfn_t gmfn);
/* Pull an out-of-sync page back into sync. */
-void sh_resync(struct vcpu *v, mfn_t gmfn);
+void sh_resync(struct domain *d, mfn_t gmfn);
-void oos_fixup_add(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off);
+void oos_fixup_add(struct domain *d, mfn_t gmfn, mfn_t smfn, unsigned long off);
-int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
+int sh_remove_write_access_from_sl1p(struct domain *d, mfn_t gmfn,
mfn_t smfn, unsigned long offset);
/* Pull all out-of-sync shadows back into sync. If skip != 0, we try
@@ -407,12 +431,12 @@ shadow_sync_other_vcpus(struct vcpu *v)
}
void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn);
-mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn);
+mfn_t oos_snapshot_lookup(struct domain *d, mfn_t gmfn);
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
-/* Reset the up-pointers of every L3 shadow to 0.
+/* Reset the up-pointers of every L3 shadow to 0.
* This is called when l3 shadows stop being pinnable, to clear out all
* the list-head bits so the up-pointer field is properly initialised. */
void sh_reset_l3_up_pointers(struct vcpu *v);
@@ -430,7 +454,7 @@ void sh_reset_l3_up_pointers(struct vcpu *v);
/******************************************************************************
- * MFN/page-info handling
+ * MFN/page-info handling
*/
/* Override macros from asm/page.h to make them work with mfn_t */
@@ -471,49 +495,24 @@ sh_mfn_is_a_page_table(mfn_t gmfn)
return 0;
owner = page_get_owner(page);
- if ( owner && shadow_mode_refcounts(owner)
+ if ( owner && shadow_mode_refcounts(owner)
&& (page->count_info & PGC_page_table) )
- return 1;
+ return 1;
type_info = page->u.inuse.type_info & PGT_type_mask;
return type_info && (type_info <= PGT_l4_page_table);
}
-// Provide mfn_t-aware versions of common xen functions
-static inline void *
-sh_map_domain_page(mfn_t mfn)
-{
- return map_domain_page(mfn_x(mfn));
-}
-
-static inline void
-sh_unmap_domain_page(void *p)
-{
- unmap_domain_page(p);
-}
-
-static inline void *
-sh_map_domain_page_global(mfn_t mfn)
-{
- return map_domain_page_global(mfn_x(mfn));
-}
-
-static inline void
-sh_unmap_domain_page_global(void *p)
-{
- unmap_domain_page_global(p);
-}
-
/**************************************************************************/
/* Shadow-page refcounting. */
-void sh_destroy_shadow(struct vcpu *v, mfn_t smfn);
+void sh_destroy_shadow(struct domain *d, mfn_t smfn);
-/* Increase the refcount of a shadow page. Arguments are the mfn to refcount,
+/* Increase the refcount of a shadow page. Arguments are the mfn to refcount,
* and the physical address of the shadow entry that holds the ref (or zero
- * if the ref is held by something else).
+ * if the ref is held by something else).
* Returns 0 for failure, 1 for success. */
-static inline int sh_get_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
+static inline int sh_get_ref(struct domain *d, mfn_t smfn, paddr_t entry_pa)
{
u32 x, nx;
struct page_info *sp = mfn_to_page(smfn);
@@ -530,23 +529,23 @@ static inline int sh_get_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
__backpointer(sp), mfn_x(smfn));
return 0;
}
-
+
/* Guarded by the paging lock, so no need for atomic update */
sp->u.sh.count = nx;
/* We remember the first shadow entry that points to each shadow. */
- if ( entry_pa != 0
- && sh_type_has_up_pointer(v, sp->u.sh.type)
- && sp->up == 0 )
+ if ( entry_pa != 0
+ && sh_type_has_up_pointer(d, sp->u.sh.type)
+ && sp->up == 0 )
sp->up = entry_pa;
-
+
return 1;
}
/* Decrease the refcount of a shadow page. As for get_ref, takes the
* physical address of the shadow entry that held this reference. */
-static inline void sh_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
+static inline void sh_put_ref(struct domain *d, mfn_t smfn, paddr_t entry_pa)
{
u32 x, nx;
struct page_info *sp = mfn_to_page(smfn);
@@ -556,15 +555,15 @@ static inline void sh_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
ASSERT(!(sp->count_info & PGC_count_mask));
/* If this is the entry in the up-pointer, remove it */
- if ( entry_pa != 0
- && sh_type_has_up_pointer(v, sp->u.sh.type)
- && sp->up == entry_pa )
+ if ( entry_pa != 0
+ && sh_type_has_up_pointer(d, sp->u.sh.type)
+ && sp->up == entry_pa )
sp->up = 0;
x = sp->u.sh.count;
nx = x - 1;
- if ( unlikely(x == 0) )
+ if ( unlikely(x == 0) )
{
SHADOW_ERROR("shadow ref underflow, smfn=%lx oc=%08x t=%#x\n",
mfn_x(smfn), sp->u.sh.count, sp->u.sh.type);
@@ -574,34 +573,37 @@ static inline void sh_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
/* Guarded by the paging lock, so no need for atomic update */
sp->u.sh.count = nx;
- if ( unlikely(nx == 0) )
- sh_destroy_shadow(v, smfn);
+ if ( unlikely(nx == 0) )
+ sh_destroy_shadow(d, smfn);
}
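/* Editor's sketch (assumption, not from this patch): refs taken with
 * sh_get_ref() are paired with sh_put_ref() using the same entry_pa;
 * entry_pa == 0 marks a ref held by something other than a shadow entry. */
if ( sh_get_ref(d, smfn, 0) )
{
    /* ... use the shadow page under the paging lock ... */
    sh_put_ref(d, smfn, 0);
}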
-/* Walk the list of pinned shadows, from the tail forwards,
+/* Walk the list of pinned shadows, from the tail forwards,
* skipping the non-head-page entries */
static inline struct page_info *
-prev_pinned_shadow(const struct page_info *page,
+prev_pinned_shadow(struct page_info *page,
const struct domain *d)
{
struct page_info *p;
+ const struct page_list_head *pin_list;
+
+ pin_list = &d->arch.paging.shadow.pinned_shadows;
- if ( page == d->arch.paging.shadow.pinned_shadows.next )
+ if ( page_list_empty(pin_list) || page == page_list_first(pin_list) )
return NULL;
-
+
if ( page == NULL ) /* If no current place, start at the tail */
- p = d->arch.paging.shadow.pinned_shadows.tail;
+ p = page_list_last(pin_list);
else
- p = pdx_to_page(page->list.prev);
+ p = page_list_prev(page, pin_list);
/* Skip over the non-tail parts of multi-page shadows */
if ( p && p->u.sh.type == SH_type_l2_32_shadow )
{
- p = pdx_to_page(p->list.prev);
+ p = page_list_prev(p, pin_list);
ASSERT(p && p->u.sh.type == SH_type_l2_32_shadow);
- p = pdx_to_page(p->list.prev);
+ p = page_list_prev(p, pin_list);
ASSERT(p && p->u.sh.type == SH_type_l2_32_shadow);
- p = pdx_to_page(p->list.prev);
+ p = page_list_prev(p, pin_list);
ASSERT(p && p->u.sh.type == SH_type_l2_32_shadow);
}
ASSERT(!p || p->u.sh.head);
@@ -616,99 +618,87 @@ prev_pinned_shadow(const struct page_info *page,
/* Pin a shadow page: take an extra refcount, set the pin bit,
* and put the shadow at the head of the list of pinned shadows.
* Returns 0 for failure, 1 for success. */
-static inline int sh_pin(struct vcpu *v, mfn_t smfn)
+static inline int sh_pin(struct domain *d, mfn_t smfn)
{
- struct page_info *sp;
- struct page_list_head h, *pin_list;
-
+ struct page_info *sp[4];
+ struct page_list_head *pin_list;
+ unsigned int i, pages;
+ bool_t already_pinned;
+
ASSERT(mfn_valid(smfn));
- sp = mfn_to_page(smfn);
- ASSERT(sh_type_is_pinnable(v, sp->u.sh.type));
- ASSERT(sp->u.sh.head);
+ sp[0] = mfn_to_page(smfn);
+ pages = shadow_size(sp[0]->u.sh.type);
+ already_pinned = sp[0]->u.sh.pinned;
+ ASSERT(sh_type_is_pinnable(d, sp[0]->u.sh.type));
+ ASSERT(sp[0]->u.sh.head);
+
+ pin_list = &d->arch.paging.shadow.pinned_shadows;
+ if ( already_pinned && sp[0] == page_list_first(pin_list) )
+ return 1;
/* Treat the up-to-four pages of the shadow as a unit in the list ops */
- h.next = h.tail = sp;
- if ( sp->u.sh.type == SH_type_l2_32_shadow )
+ for ( i = 1; i < pages; i++ )
{
- h.tail = pdx_to_page(h.tail->list.next);
- h.tail = pdx_to_page(h.tail->list.next);
- h.tail = pdx_to_page(h.tail->list.next);
- ASSERT(h.tail->u.sh.type == SH_type_l2_32_shadow);
+ sp[i] = page_list_next(sp[i - 1], pin_list);
+ ASSERT(sp[i]->u.sh.type == sp[0]->u.sh.type);
+ ASSERT(!sp[i]->u.sh.head);
}
- pin_list = &v->domain->arch.paging.shadow.pinned_shadows;
- if ( sp->u.sh.pinned )
+ if ( already_pinned )
{
- /* Already pinned: take it out of the pinned-list so it can go
- * at the front */
- if ( pin_list->next == h.next )
- return 1;
- page_list_prev(h.next, pin_list)->list.next = h.tail->list.next;
- if ( pin_list->tail == h.tail )
- pin_list->tail = page_list_prev(h.next, pin_list);
- else
- page_list_next(h.tail, pin_list)->list.prev = h.next->list.prev;
- h.tail->list.next = h.next->list.prev = PAGE_LIST_NULL;
+ /* Take it out of the pinned-list so it can go at the front */
+ for ( i = 0; i < pages; i++ )
+ page_list_del(sp[i], pin_list);
}
else
{
/* Not pinned: pin it! */
- if ( !sh_get_ref(v, smfn, 0) )
+ if ( !sh_get_ref(d, smfn, 0) )
return 0;
- sp->u.sh.pinned = 1;
- ASSERT(h.next->list.prev == PAGE_LIST_NULL);
- ASSERT(h.tail->list.next == PAGE_LIST_NULL);
+ sp[0]->u.sh.pinned = 1;
}
+
/* Put it at the head of the list of pinned shadows */
- page_list_splice(&h, pin_list);
+ for ( i = pages; i > 0; i-- )
+ page_list_add(sp[i - 1], pin_list);
+
return 1;
}
/* Unpin a shadow page: unset the pin bit, take the shadow off the list
* of pinned shadows, and release the extra ref. */
-static inline void sh_unpin(struct vcpu *v, mfn_t smfn)
+static inline void sh_unpin(struct domain *d, mfn_t smfn)
{
- struct page_list_head h, *pin_list;
- struct page_info *sp;
-
+ struct page_list_head tmp_list, *pin_list;
+ struct page_info *sp, *next;
+ unsigned int i, head_type;
+
ASSERT(mfn_valid(smfn));
sp = mfn_to_page(smfn);
- ASSERT(sh_type_is_pinnable(v, sp->u.sh.type));
+ head_type = sp->u.sh.type;
+ ASSERT(sh_type_is_pinnable(d, sp->u.sh.type));
ASSERT(sp->u.sh.head);
- /* Treat the up-to-four pages of the shadow as a unit in the list ops */
- h.next = h.tail = sp;
- if ( sp->u.sh.type == SH_type_l2_32_shadow )
- {
- h.tail = pdx_to_page(h.tail->list.next);
- h.tail = pdx_to_page(h.tail->list.next);
- h.tail = pdx_to_page(h.tail->list.next);
- ASSERT(h.tail->u.sh.type == SH_type_l2_32_shadow);
- }
- pin_list = &v->domain->arch.paging.shadow.pinned_shadows;
-
if ( !sp->u.sh.pinned )
return;
-
sp->u.sh.pinned = 0;
- /* Cut the sub-list out of the list of pinned shadows */
- if ( pin_list->next == h.next && pin_list->tail == h.tail )
- pin_list->next = pin_list->tail = NULL;
- else
+ /* Cut the sub-list out of the list of pinned shadows,
+ * stitching it back into a list fragment of its own. */
+ pin_list = &d->arch.paging.shadow.pinned_shadows;
+ INIT_PAGE_LIST_HEAD(&tmp_list);
+ for ( i = 0; i < shadow_size(head_type); i++ )
{
- if ( pin_list->next == h.next )
- pin_list->next = page_list_next(h.tail, pin_list);
- else
- page_list_prev(h.next, pin_list)->list.next = h.tail->list.next;
- if ( pin_list->tail == h.tail )
- pin_list->tail = page_list_prev(h.next, pin_list);
- else
- page_list_next(h.tail, pin_list)->list.prev = h.next->list.prev;
+ ASSERT(sp->u.sh.type == head_type);
+ ASSERT(!i || !sp->u.sh.head);
+ next = page_list_next(sp, pin_list);
+ page_list_del(sp, pin_list);
+ page_list_add_tail(sp, &tmp_list);
+ sp = next;
}
- h.tail->list.next = h.next->list.prev = PAGE_LIST_NULL;
-
- sh_put_ref(v, smfn, 0);
+ sh_terminate_list(&tmp_list);
+
+ sh_put_ref(d, smfn, 0);
}
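/* Editor's sketch (assumption): sh_pin()/sh_unpin() bracket the period a
 * top-level shadow must survive independently of guest references; the
 * pin holds one extra ref that sh_unpin() releases. */
if ( sh_pin(d, smfn) )
{
    /* shadow is now at the head of the pinned list and cannot be freed */
    sh_unpin(d, smfn);
}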
@@ -731,7 +721,7 @@ struct sh_emulate_ctxt {
mfn_t mfn1, mfn2;
#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
- /* Special case for avoiding having to verify writes: remember
+ /* Special case for avoiding having to verify writes: remember
* whether the old value had its low bit (_PAGE_PRESENT) clear. */
int low_bit_was_clear:1;
#endif
@@ -746,12 +736,12 @@ struct segment_register *hvm_get_seg_reg(
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
/**************************************************************************/
-/* Virtual TLB entries
+/* Virtual TLB entries
*
- * We keep a cache of virtual-to-physical translations that we have seen
- * since the last TLB flush. This is safe to use for frame translations,
+ * We keep a cache of virtual-to-physical translations that we have seen
+ * since the last TLB flush. This is safe to use for frame translations,
* but callers need to re-check the actual guest tables if the lookup fails.
- *
+ *
* Lookups and updates are protected by a per-vTLB (and hence per-vcpu)
* lock. This lock is held *only* while reading or writing the table,
* so it is safe to take in any non-interrupt context. Most lookups
@@ -769,7 +759,7 @@ struct shadow_vtlb {
};
/* Call whenever the guest flushes its actual TLB */
-static inline void vtlb_flush(struct vcpu *v)
+static inline void vtlb_flush(struct vcpu *v)
{
spin_lock(&v->arch.paging.vtlb_lock);
memset(v->arch.paging.vtlb, 0, VTLB_ENTRIES * sizeof (struct shadow_vtlb));
@@ -785,7 +775,7 @@ static inline int vtlb_hash(unsigned long page_number)
static inline void vtlb_insert(struct vcpu *v, unsigned long page,
unsigned long frame, uint32_t pfec)
{
- struct shadow_vtlb entry =
+ struct shadow_vtlb entry =
{ .page_number = page, .frame_number = frame, .pfec = pfec };
spin_lock(&v->arch.paging.vtlb_lock);
v->arch.paging.vtlb[vtlb_hash(page)] = entry;
@@ -802,7 +792,7 @@ static inline unsigned long vtlb_lookup(struct vcpu *v,
spin_lock(&v->arch.paging.vtlb_lock);
if ( v->arch.paging.vtlb[i].pfec != 0
- && v->arch.paging.vtlb[i].page_number == page_number
+ && v->arch.paging.vtlb[i].page_number == page_number
/* Any successful walk that had at least these pfec bits is OK */
&& (v->arch.paging.vtlb[i].pfec & pfec) == pfec )
{
@@ -817,7 +807,7 @@ static inline int sh_check_page_has_no_refs(struct page_info *page)
{
unsigned long count = read_atomic(&page->count_info);
return ( (count & PGC_count_mask) ==
- ((count & PGC_allocated) ? 1 : 0) );
+ ((count & PGC_allocated) ? 1 : 0) );
}
#endif /* _XEN_SHADOW_PRIVATE_H */
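/* Editor's sketch of the vTLB helpers above (assumption: a miss is
 * reported as INVALID_GFN and simply means "re-walk the guest tables"): */
frame = vtlb_lookup(v, va, pfec);
if ( frame == INVALID_GFN )
{
    frame = walk_guest_tables(v, va, pfec);      /* hypothetical helper */
    if ( frame != INVALID_GFN )
        vtlb_insert(v, va >> PAGE_SHIFT, frame, pfec);
}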
diff --git a/xen/arch/x86/mm/shadow/types.h b/xen/arch/x86/mm/shadow/types.h
index 953d168..9bc369f 100644
--- a/xen/arch/x86/mm/shadow/types.h
+++ b/xen/arch/x86/mm/shadow/types.h
@@ -1,23 +1,22 @@
/******************************************************************************
* arch/x86/mm/shadow/types.h
- *
+ *
* Parts of this code are Copyright (c) 2006 by XenSource Inc.
* Parts of this code are Copyright (c) 2006 by Michael A Fetterman
* Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
- *
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
- *
+ *
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XEN_SHADOW_TYPES_H
@@ -31,7 +30,7 @@
#define SHADOW_PAGING_LEVELS 3
#endif
-/*
+/*
* Define various types for handling pagetables, based on these options:
* SHADOW_PAGING_LEVELS : Number of levels of shadow pagetables
* GUEST_PAGING_LEVELS : Number of levels of guest pagetables
@@ -101,14 +100,14 @@ static inline shadow_l1e_t
shadow_l1e_remove_flags(shadow_l1e_t sl1e, u32 flags)
{ l1e_remove_flags(sl1e, flags); return sl1e; }
-static inline shadow_l1e_t shadow_l1e_empty(void)
+static inline shadow_l1e_t shadow_l1e_empty(void)
{ return l1e_empty(); }
-static inline shadow_l2e_t shadow_l2e_empty(void)
+static inline shadow_l2e_t shadow_l2e_empty(void)
{ return l2e_empty(); }
-static inline shadow_l3e_t shadow_l3e_empty(void)
+static inline shadow_l3e_t shadow_l3e_empty(void)
{ return l3e_empty(); }
#if SHADOW_PAGING_LEVELS >= 4
-static inline shadow_l4e_t shadow_l4e_empty(void)
+static inline shadow_l4e_t shadow_l4e_empty(void)
{ return l4e_empty(); }
#endif
@@ -141,10 +140,10 @@ static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags)
#define shadow_l4_linear_offset(_a) \
(((_a) & VADDR_MASK) >> SHADOW_L4_PAGETABLE_SHIFT)
-/* Where to find each level of the linear mapping. For PV guests, we use
- * the shadow linear-map self-entry as many times as we need. For HVM
- * guests, the shadow doesn't have a linear-map self-entry so we must use
- * the monitor-table's linear-map entry N-1 times and then the shadow-map
+/* Where to find each level of the linear mapping. For PV guests, we use
+ * the shadow linear-map self-entry as many times as we need. For HVM
+ * guests, the shadow doesn't have a linear-map self-entry so we must use
+ * the monitor-table's linear-map entry N-1 times and then the shadow-map
* entry once. */
#define __sh_linear_l1_table ((shadow_l1e_t *)(SH_LINEAR_PT_VIRT_START))
#define __sh_linear_l2_table ((shadow_l2e_t *) \
@@ -304,12 +303,12 @@ static inline int sh_l1e_is_magic(shadow_l1e_t sl1e)
}
/* Guest not present: a single magic value */
-static inline shadow_l1e_t sh_l1e_gnp(void)
+static inline shadow_l1e_t sh_l1e_gnp(void)
{
return (shadow_l1e_t){ -1ULL };
}
-static inline int sh_l1e_is_gnp(shadow_l1e_t sl1e)
+static inline int sh_l1e_is_gnp(shadow_l1e_t sl1e)
{
return (sl1e.l1 == sh_l1e_gnp().l1);
}
@@ -323,24 +322,24 @@ static inline int sh_l1e_is_gnp(shadow_l1e_t sl1e)
#define SH_L1E_MMIO_GFN_MASK 0x00000000fffffff0ULL
#define SH_L1E_MMIO_GFN_SHIFT 4
-static inline shadow_l1e_t sh_l1e_mmio(gfn_t gfn, u32 gflags)
+static inline shadow_l1e_t sh_l1e_mmio(gfn_t gfn, u32 gflags)
{
- return (shadow_l1e_t) { (SH_L1E_MMIO_MAGIC
- | (gfn_x(gfn) << SH_L1E_MMIO_GFN_SHIFT)
+ return (shadow_l1e_t) { (SH_L1E_MMIO_MAGIC
+ | (gfn_x(gfn) << SH_L1E_MMIO_GFN_SHIFT)
| (gflags & (_PAGE_USER|_PAGE_RW))) };
}
-static inline int sh_l1e_is_mmio(shadow_l1e_t sl1e)
+static inline int sh_l1e_is_mmio(shadow_l1e_t sl1e)
{
return ((sl1e.l1 & SH_L1E_MMIO_MAGIC_MASK) == SH_L1E_MMIO_MAGIC);
}
-static inline gfn_t sh_l1e_mmio_get_gfn(shadow_l1e_t sl1e)
+static inline gfn_t sh_l1e_mmio_get_gfn(shadow_l1e_t sl1e)
{
return _gfn((sl1e.l1 & SH_L1E_MMIO_GFN_MASK) >> SH_L1E_MMIO_GFN_SHIFT);
}
-static inline u32 sh_l1e_mmio_get_flags(shadow_l1e_t sl1e)
+static inline u32 sh_l1e_mmio_get_flags(shadow_l1e_t sl1e)
{
return (u32)((sl1e.l1 & (_PAGE_USER|_PAGE_RW)));
}
diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
new file mode 100644
index 0000000..3d52135
--- /dev/null
+++ b/xen/arch/x86/monitor.c
@@ -0,0 +1,217 @@
+/*
+ * arch/x86/monitor.c
+ *
+ * Architecture-specific monitor_op domctl handler.
+ *
+ * Copyright (c) 2015 Tamas K Lengyel (tamas at tklengyel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/config.h>
+#include <xen/sched.h>
+#include <xen/mm.h>
+#include <asm/domain.h>
+#include <asm/monitor.h>
+#include <public/domctl.h>
+#include <xsm/xsm.h>
+
+/*
+ * Sanity check whether option is already enabled/disabled
+ */
+static inline
+int status_check(struct xen_domctl_monitor_op *mop, bool_t status)
+{
+ bool_t requested_status = (mop->op == XEN_DOMCTL_MONITOR_OP_ENABLE);
+
+ if ( status == requested_status )
+ return -EEXIST;
+
+ return 0;
+}
+
+static inline uint32_t get_capabilities(struct domain *d)
+{
+ uint32_t capabilities = 0;
+
+ /*
+ * At the moment only Intel HVM domains are supported. However, event
+ * delivery could be extended to AMD and PV domains.
+ */
+ if ( !is_hvm_domain(d) || !cpu_has_vmx )
+ return capabilities;
+
+ capabilities = (1 << XEN_DOMCTL_MONITOR_EVENT_WRITE_CTRLREG) |
+ (1 << XEN_DOMCTL_MONITOR_EVENT_MOV_TO_MSR) |
+ (1 << XEN_DOMCTL_MONITOR_EVENT_SOFTWARE_BREAKPOINT) |
+ (1 << XEN_DOMCTL_MONITOR_EVENT_GUEST_REQUEST);
+
+ /* Since we know this is on VMX, we can just call the hvm func */
+ if ( hvm_is_singlestep_supported() )
+ capabilities |= (1 << XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP);
+
+ return capabilities;
+}
+
+int monitor_domctl(struct domain *d, struct xen_domctl_monitor_op *mop)
+{
+ int rc;
+ struct arch_domain *ad = &d->arch;
+ uint32_t capabilities = get_capabilities(d);
+
+ rc = xsm_vm_event_control(XSM_PRIV, d, mop->op, mop->event);
+ if ( rc )
+ return rc;
+
+ if ( mop->op == XEN_DOMCTL_MONITOR_OP_GET_CAPABILITIES )
+ {
+ mop->event = capabilities;
+ return 0;
+ }
+
+ /*
+ * Sanity check
+ */
+ if ( mop->op != XEN_DOMCTL_MONITOR_OP_ENABLE &&
+ mop->op != XEN_DOMCTL_MONITOR_OP_DISABLE )
+ return -EOPNOTSUPP;
+
+ /* Check if event type is available. */
+ if ( !(capabilities & (1 << mop->event)) )
+ return -EOPNOTSUPP;
+
+ switch ( mop->event )
+ {
+ case XEN_DOMCTL_MONITOR_EVENT_WRITE_CTRLREG:
+ {
+ unsigned int ctrlreg_bitmask =
+ monitor_ctrlreg_bitmask(mop->u.mov_to_cr.index);
+ bool_t status =
+ !!(ad->monitor.write_ctrlreg_enabled & ctrlreg_bitmask);
+ struct vcpu *v;
+
+ rc = status_check(mop, status);
+ if ( rc )
+ return rc;
+
+ if ( mop->u.mov_to_cr.sync )
+ ad->monitor.write_ctrlreg_sync |= ctrlreg_bitmask;
+ else
+ ad->monitor.write_ctrlreg_sync &= ~ctrlreg_bitmask;
+
+ if ( mop->u.mov_to_cr.onchangeonly )
+ ad->monitor.write_ctrlreg_onchangeonly |= ctrlreg_bitmask;
+ else
+ ad->monitor.write_ctrlreg_onchangeonly &= ~ctrlreg_bitmask;
+
+ domain_pause(d);
+
+ if ( !status )
+ ad->monitor.write_ctrlreg_enabled |= ctrlreg_bitmask;
+ else
+ ad->monitor.write_ctrlreg_enabled &= ~ctrlreg_bitmask;
+
+ domain_unpause(d);
+
+ if ( mop->u.mov_to_cr.index == VM_EVENT_X86_CR3 )
+ /* Latches new CR3 mask through CR0 code */
+ for_each_vcpu ( d, v )
+ hvm_update_guest_cr(v, 0);
+
+ break;
+ }
+
+ case XEN_DOMCTL_MONITOR_EVENT_MOV_TO_MSR:
+ {
+ bool_t status = ad->monitor.mov_to_msr_enabled;
+
+ rc = status_check(mop, status);
+ if ( rc )
+ return rc;
+
+ if ( mop->op == XEN_DOMCTL_MONITOR_OP_ENABLE &&
+ mop->u.mov_to_msr.extended_capture )
+ {
+ if ( hvm_enable_msr_exit_interception(d) )
+ ad->monitor.mov_to_msr_extended = 1;
+ else
+ return -EOPNOTSUPP;
+ } else
+ ad->monitor.mov_to_msr_extended = 0;
+
+ domain_pause(d);
+ ad->monitor.mov_to_msr_enabled = !status;
+ domain_unpause(d);
+ break;
+ }
+
+ case XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP:
+ {
+ bool_t status = ad->monitor.singlestep_enabled;
+
+ rc = status_check(mop, status);
+ if ( rc )
+ return rc;
+
+ domain_pause(d);
+ ad->monitor.singlestep_enabled = !status;
+ domain_unpause(d);
+ break;
+ }
+
+ case XEN_DOMCTL_MONITOR_EVENT_SOFTWARE_BREAKPOINT:
+ {
+ bool_t status = ad->monitor.software_breakpoint_enabled;
+
+ rc = status_check(mop, status);
+ if ( rc )
+ return rc;
+
+ domain_pause(d);
+ ad->monitor.software_breakpoint_enabled = !status;
+ domain_unpause(d);
+ break;
+ }
+
+ case XEN_DOMCTL_MONITOR_EVENT_GUEST_REQUEST:
+ {
+ bool_t status = ad->monitor.guest_request_enabled;
+
+ rc = status_check(mop, status);
+ if ( rc )
+ return rc;
+
+ ad->monitor.guest_request_sync = mop->u.guest_request.sync;
+
+ domain_pause(d);
+ ad->monitor.guest_request_enabled = !status;
+ domain_unpause(d);
+ break;
+ }
+
+ default:
+ return -EOPNOTSUPP;
+
+ };
+
+ return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
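/* Editor's sketch (assumption, not part of the patch): a toolstack reaches
 * monitor_domctl() through the generic domctl hypercall; the sketch below
 * assumes libxc's internal do_domctl() helper and the field names from
 * public/domctl.h. */
struct xen_domctl domctl = {
    .cmd = XEN_DOMCTL_monitor_op,
    .domain = domid,
    .u.monitor_op = {
        .op = XEN_DOMCTL_MONITOR_OP_ENABLE,
        .event = XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP,
    },
};
rc = do_domctl(xch, &domctl);   /* -EEXIST if singlestep was already on */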
diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c
index a38e016..8609f4a 100644
--- a/xen/arch/x86/mpparse.c
+++ b/xen/arch/x86/mpparse.c
@@ -87,6 +87,23 @@ void __init set_nr_cpu_ids(unsigned int max_cpus)
#endif
}
+void __init set_nr_sockets(void)
+{
+ /*
+	 * Count the actual CPUs present in socket 0 and use that to calculate
+	 * nr_sockets, so that the latter is always >= the real number of sockets
+	 * in the system, even when the APIC IDs in the MP table are too sparse.
+ */
+ unsigned int cpus = bitmap_weight(phys_cpu_present_map.mask,
+ boot_cpu_data.x86_max_cores *
+ boot_cpu_data.x86_num_siblings);
+
+ if ( cpus == 0 )
+ cpus = 1;
+
+ nr_sockets = DIV_ROUND_UP(num_processors + disabled_cpus, cpus);
+}
+
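/* Editor's worked example (illustrative numbers only): with 4 cores x 2
 * threads per socket, socket 0 contributes cpus = 8; if num_processors = 14
 * and disabled_cpus = 2, then nr_sockets = DIV_ROUND_UP(14 + 2, 8) = 2,
 * which can overestimate but never undercount the real socket number. */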
/*
* Intel MP BIOS table parsing routines:
*/
@@ -541,7 +558,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
static __init void efi_unmap_mpf(void)
{
if (efi_enabled)
- __set_fixmap(FIX_EFI_MPF, 0, 0);
+ clear_fixmap(FIX_EFI_MPF);
}
static struct intel_mp_floating *__initdata mpf_found;
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 7410d03..3dbb84d 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -35,6 +35,8 @@
static s8 __read_mostly use_msi = -1;
boolean_param("msi", use_msi);
+static void __pci_disable_msix(struct msi_desc *);
+
/* bitmap indicate which fixed map is free */
static DEFINE_SPINLOCK(msix_fixmap_lock);
static DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES);
@@ -112,7 +114,7 @@ static void msix_put_fixmap(struct arch_msix *msix, int idx)
if ( --msix->table_refcnt[i] == 0 )
{
- __set_fixmap(idx, 0, 0);
+ clear_fixmap(idx);
msix_fixmap_free(idx);
msix->table_idx[i] = 0;
}
@@ -121,6 +123,38 @@ static void msix_put_fixmap(struct arch_msix *msix, int idx)
spin_unlock(&msix->table_lock);
}
+static bool_t memory_decoded(const struct pci_dev *dev)
+{
+ u8 bus, slot, func;
+
+ if ( !dev->info.is_virtfn )
+ {
+ bus = dev->bus;
+ slot = PCI_SLOT(dev->devfn);
+ func = PCI_FUNC(dev->devfn);
+ }
+ else
+ {
+ bus = dev->info.physfn.bus;
+ slot = PCI_SLOT(dev->info.physfn.devfn);
+ func = PCI_FUNC(dev->info.physfn.devfn);
+ }
+
+ return !!(pci_conf_read16(dev->seg, bus, slot, func, PCI_COMMAND) &
+ PCI_COMMAND_MEMORY);
+}
+
+static bool_t msix_memory_decoded(const struct pci_dev *dev, unsigned int pos)
+{
+ u16 control = pci_conf_read16(dev->seg, dev->bus, PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn), msix_control_reg(pos));
+
+ if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
+ return 0;
+
+ return memory_decoded(dev);
+}
+
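/* Editor's sketch (assumption): every MSI-X table access in this file is
 * guarded by one of these predicates, since reads and writes to device
 * memory are undefined while the memory-decode bit is clear, e.g.: */
if ( likely(memory_decoded(dev)) )
    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);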
/*
* MSI message composition
*/
@@ -129,12 +163,14 @@ void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg
unsigned dest;
memset(msg, 0, sizeof(*msg));
- if ( !cpumask_intersects(cpu_mask, &cpu_online_map) ) {
+ if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
+ {
dprintk(XENLOG_ERR,"%s, compose msi message error!!\n", __func__);
return;
}
- if ( vector ) {
+ if ( vector )
+ {
cpumask_t *mask = this_cpu(scratch_mask);
cpumask_and(mask, cpu_mask, &cpu_online_map);
@@ -162,7 +198,7 @@ void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg
}
}
-static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+static bool_t read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
switch ( entry->msi_attrib.type )
{
@@ -195,9 +231,11 @@ static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
}
case PCI_CAP_ID_MSIX:
{
- void __iomem *base;
- base = entry->mask_base;
+ void __iomem *base = entry->mask_base;
+ if ( unlikely(!msix_memory_decoded(entry->dev,
+ entry->msi_attrib.pos)) )
+ return 0;
msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
@@ -209,6 +247,8 @@ static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
if ( iommu_intremap )
iommu_read_msi_from_ire(entry, msg);
+
+ return 1;
}
static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -257,9 +297,11 @@ static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
}
case PCI_CAP_ID_MSIX:
{
- void __iomem *base;
- base = entry->mask_base;
+ void __iomem *base = entry->mask_base;
+ if ( unlikely(!msix_memory_decoded(entry->dev,
+ entry->msi_attrib.pos)) )
+ return -ENXIO;
writel(msg->address_lo,
base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
writel(msg->address_hi,
@@ -281,13 +323,14 @@ void set_msi_affinity(struct irq_desc *desc, const cpumask_t *mask)
struct msi_desc *msi_desc = desc->msi_desc;
dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID || !msi_desc)
+ if ( dest == BAD_APICID || !msi_desc )
return;
ASSERT(spin_is_locked(&desc->lock));
memset(&msg, 0, sizeof(msg));
- read_msi_msg(msi_desc, &msg);
+ if ( !read_msi_msg(msi_desc, &msg) )
+ return;
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
@@ -332,11 +375,11 @@ static void msix_set_enable(struct pci_dev *dev, int enable)
pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
if ( pos )
{
- control = pci_conf_read16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS);
+ control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
control &= ~PCI_MSIX_FLAGS_ENABLE;
if ( enable )
control |= PCI_MSIX_FLAGS_ENABLE;
- pci_conf_write16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS, control);
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
}
}
@@ -347,20 +390,27 @@ int msi_maskable_irq(const struct msi_desc *entry)
|| entry->msi_attrib.maskbit;
}
-static void msi_set_mask_bit(struct irq_desc *desc, int flag)
+static bool_t msi_set_mask_bit(struct irq_desc *desc, bool_t host, bool_t guest)
{
struct msi_desc *entry = desc->msi_desc;
+ struct pci_dev *pdev;
+ u16 seg, control;
+ u8 bus, slot, func;
+ bool_t flag = host || guest, maskall;
ASSERT(spin_is_locked(&desc->lock));
BUG_ON(!entry || !entry->dev);
- switch (entry->msi_attrib.type) {
+ pdev = entry->dev;
+ seg = pdev->seg;
+ bus = pdev->bus;
+ slot = PCI_SLOT(pdev->devfn);
+ func = PCI_FUNC(pdev->devfn);
+ switch ( entry->msi_attrib.type )
+ {
case PCI_CAP_ID_MSI:
- if (entry->msi_attrib.maskbit) {
+ if ( entry->msi_attrib.maskbit )
+ {
u32 mask_bits;
- u16 seg = entry->dev->seg;
- u8 bus = entry->dev->bus;
- u8 slot = PCI_SLOT(entry->dev->devfn);
- u8 func = PCI_FUNC(entry->dev->devfn);
mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
@@ -369,24 +419,63 @@ static void msi_set_mask_bit(struct irq_desc *desc, int flag)
}
break;
case PCI_CAP_ID_MSIX:
- {
- int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
- writel(flag, entry->mask_base + offset);
- readl(entry->mask_base + offset);
- break;
- }
+ maskall = pdev->msix->host_maskall;
+ control = pci_conf_read16(seg, bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos));
+ if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
+ {
+ pdev->msix->host_maskall = 1;
+ pci_conf_write16(seg, bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos),
+ control | (PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL));
+ }
+ if ( likely(memory_decoded(pdev)) )
+ {
+ writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+ readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+ if ( likely(control & PCI_MSIX_FLAGS_ENABLE) )
+ break;
+ flag = 1;
+ }
+ else if ( flag && !(control & PCI_MSIX_FLAGS_MASKALL) )
+ {
+ domid_t domid = pdev->domain->domain_id;
+
+ maskall = 1;
+ if ( pdev->msix->warned != domid )
+ {
+ pdev->msix->warned = domid;
+ printk(XENLOG_G_WARNING
+ "cannot mask IRQ %d: masking MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
+ desc->irq, domid, pdev->seg, pdev->bus,
+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+ }
+ }
+ pdev->msix->host_maskall = maskall;
+ if ( maskall || pdev->msix->guest_maskall )
+ control |= PCI_MSIX_FLAGS_MASKALL;
+ pci_conf_write16(seg, bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos), control);
+ return flag;
default:
- BUG();
- break;
+ return 0;
}
- entry->msi_attrib.masked = !!flag;
+ entry->msi_attrib.host_masked = host;
+ entry->msi_attrib.guest_masked = guest;
+
+ return 1;
}
static int msi_get_mask_bit(const struct msi_desc *entry)
{
- switch (entry->msi_attrib.type) {
+ if ( !entry->dev )
+ return -1;
+
+ switch ( entry->msi_attrib.type )
+ {
case PCI_CAP_ID_MSI:
- if (!entry->dev || !entry->msi_attrib.maskbit)
+ if ( !entry->msi_attrib.maskbit )
break;
return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
PCI_SLOT(entry->dev->devfn),
@@ -394,6 +483,9 @@ static int msi_get_mask_bit(const struct msi_desc *entry)
entry->msi.mpos) >>
entry->msi_attrib.entry_nr) & 1;
case PCI_CAP_ID_MSIX:
+ if ( unlikely(!msix_memory_decoded(entry->dev,
+ entry->msi_attrib.pos)) )
+ break;
return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
}
return -1;
@@ -401,20 +493,36 @@ static int msi_get_mask_bit(const struct msi_desc *entry)
void mask_msi_irq(struct irq_desc *desc)
{
- msi_set_mask_bit(desc, 1);
+ if ( unlikely(!msi_set_mask_bit(desc, 1,
+ desc->msi_desc->msi_attrib.guest_masked)) )
+ BUG_ON(!(desc->status & IRQ_DISABLED));
}
void unmask_msi_irq(struct irq_desc *desc)
{
- msi_set_mask_bit(desc, 0);
+ if ( unlikely(!msi_set_mask_bit(desc, 0,
+ desc->msi_desc->msi_attrib.guest_masked)) )
+ WARN();
+}
+
+void guest_mask_msi_irq(struct irq_desc *desc, bool_t mask)
+{
+ msi_set_mask_bit(desc, desc->msi_desc->msi_attrib.host_masked, mask);
}
static unsigned int startup_msi_irq(struct irq_desc *desc)
{
- unmask_msi_irq(desc);
+ if ( unlikely(!msi_set_mask_bit(desc, 0, !!(desc->status & IRQ_GUEST))) )
+ WARN();
return 0;
}
+static void shutdown_msi_irq(struct irq_desc *desc)
+{
+ if ( unlikely(!msi_set_mask_bit(desc, 1, 1)) )
+ BUG_ON(!(desc->status & IRQ_DISABLED));
+}
+
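/* Editor's illustration: host and guest mask requests are independent
 * inputs, and the vector stays masked while either one is set
 * (flag = host || guest in msi_set_mask_bit() above). */
msi_set_mask_bit(desc, 1, 0);   /* host masks            -> masked   */
msi_set_mask_bit(desc, 0, 1);   /* only guest masks      -> masked   */
msi_set_mask_bit(desc, 0, 0);   /* both clear            -> unmasked */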
void ack_nonmaskable_msi_irq(struct irq_desc *desc)
{
irq_complete_move(desc);
@@ -439,7 +547,7 @@ void end_nonmaskable_msi_irq(struct irq_desc *desc, u8 vector)
static hw_irq_controller pci_msi_maskable = {
.typename = "PCI-MSI/-X",
.startup = startup_msi_irq,
- .shutdown = mask_msi_irq,
+ .shutdown = shutdown_msi_irq,
.enable = unmask_msi_irq,
.disable = mask_msi_irq,
.ack = ack_maskable_msi_irq,
@@ -479,9 +587,31 @@ static struct msi_desc *alloc_msi_entry(unsigned int nr)
int setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc)
{
- return __setup_msi_irq(desc, msidesc,
- msi_maskable_irq(msidesc) ? &pci_msi_maskable
- : &pci_msi_nonmaskable);
+ const struct pci_dev *pdev = msidesc->dev;
+ unsigned int cpos = msix_control_reg(msidesc->msi_attrib.pos);
+ u16 control = ~0;
+ int rc;
+
+ if ( msidesc->msi_attrib.type == PCI_CAP_ID_MSIX )
+ {
+ control = pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), cpos);
+ if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
+ pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), cpos,
+ control | (PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL));
+ }
+
+ rc = __setup_msi_irq(desc, msidesc,
+ msi_maskable_irq(msidesc) ? &pci_msi_maskable
+ : &pci_msi_nonmaskable);
+
+ if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
+ pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn), cpos, control);
+
+ return rc;
}
int __setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc,
@@ -566,6 +696,8 @@ static int msi_capability_init(struct pci_dev *dev,
ASSERT(spin_is_locked(&pcidevs_lock));
pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI);
+ if ( !pos )
+ return -ENODEV;
control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
maxvec = multi_msi_capable(control);
if ( nvec > maxvec )
@@ -586,8 +718,9 @@ static int msi_capability_init(struct pci_dev *dev,
entry[i].msi_attrib.type = PCI_CAP_ID_MSI;
entry[i].msi_attrib.is_64 = is_64bit_address(control);
entry[i].msi_attrib.entry_nr = i;
+ entry[i].msi_attrib.host_masked =
entry[i].msi_attrib.maskbit = is_mask_bit_support(control);
- entry[i].msi_attrib.masked = 1;
+ entry[i].msi_attrib.guest_masked = 0;
entry[i].msi_attrib.pos = pos;
if ( entry[i].msi_attrib.maskbit )
entry[i].msi.mpos = mpos;
@@ -701,13 +834,14 @@ static u64 read_pci_mem_bar(u16 seg, u8 bus, u8 slot, u8 func, u8 bir, int vf)
* requested MSI-X entries with allocated irqs or non-zero for otherwise.
**/
static int msix_capability_init(struct pci_dev *dev,
+ unsigned int pos,
struct msi_info *msi,
struct msi_desc **desc,
unsigned int nr_entries)
{
struct arch_msix *msix = dev->msix;
struct msi_desc *entry = NULL;
- int pos, vf;
+ int vf;
u16 control;
u64 table_paddr;
u32 table_offset;
@@ -716,18 +850,38 @@ static int msix_capability_init(struct pci_dev *dev,
u8 bus = dev->bus;
u8 slot = PCI_SLOT(dev->devfn);
u8 func = PCI_FUNC(dev->devfn);
+ bool_t maskall = msix->host_maskall;
ASSERT(spin_is_locked(&pcidevs_lock));
- pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
- msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
+ /*
+ * Ensure MSI-X interrupts are masked during setup. Some devices require
+ * MSI-X to be enabled before we can touch the MSI-X registers. We need
+ * to mask all the vectors to prevent interrupts coming in before they're
+ * fully set up.
+ */
+ msix->host_maskall = 1;
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control | (PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL));
+
+ if ( unlikely(!memory_decoded(dev)) )
+ {
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
+ return -ENXIO;
+ }
if ( desc )
{
entry = alloc_msi_entry(1);
if ( !entry )
+ {
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
return -ENOMEM;
+ }
ASSERT(msi);
}
@@ -758,6 +912,8 @@ static int msix_capability_init(struct pci_dev *dev,
{
if ( !msi || !msi->table_base )
{
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
xfree(entry);
return -ENXIO;
}
@@ -800,6 +956,8 @@ static int msix_capability_init(struct pci_dev *dev,
if ( idx < 0 )
{
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
xfree(entry);
return idx;
}
@@ -813,7 +971,8 @@ static int msix_capability_init(struct pci_dev *dev,
entry->msi_attrib.is_64 = 1;
entry->msi_attrib.entry_nr = msi->entry_nr;
entry->msi_attrib.maskbit = 1;
- entry->msi_attrib.masked = 1;
+ entry->msi_attrib.host_masked = 1;
+ entry->msi_attrib.guest_masked = 1;
entry->msi_attrib.pos = pos;
entry->irq = msi->irq;
entry->dev = dev;
@@ -825,6 +984,12 @@ static int msix_capability_init(struct pci_dev *dev,
if ( !msix->used_entries )
{
+ maskall = 0;
+ if ( !msix->guest_maskall )
+ control &= ~PCI_MSIX_FLAGS_MASKALL;
+ else
+ control |= PCI_MSIX_FLAGS_MASKALL;
+
if ( rangeset_add_range(mmio_ro_ranges, msix->table.first,
msix->table.last) )
WARN();
@@ -855,6 +1020,7 @@ static int msix_capability_init(struct pci_dev *dev,
++msix->used_entries;
/* Restore MSI-X enabled bits */
+ msix->host_maskall = maskall;
pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
return 0;
@@ -884,10 +1050,9 @@ static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSI);
if ( old_desc )
{
- dprintk(XENLOG_WARNING, "irq %d has already mapped to MSI on "
- "device %04x:%02x:%02x.%01x\n",
- msi->irq, msi->seg, msi->bus,
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+ printk(XENLOG_WARNING "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n",
+ msi->irq, msi->seg, msi->bus,
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
*desc = old_desc;
return 0;
}
@@ -895,10 +1060,10 @@ static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX);
if ( old_desc )
{
- dprintk(XENLOG_WARNING, "MSI-X is already in use on "
- "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus,
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
- pci_disable_msi(old_desc);
+ printk(XENLOG_WARNING "MSI-X already in use on %04x:%02x:%02x.%u\n",
+ msi->seg, msi->bus,
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+ __pci_disable_msix(old_desc);
}
return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr);
@@ -912,7 +1077,6 @@ static void __pci_disable_msi(struct msi_desc *entry)
msi_set_enable(dev, 0);
BUG_ON(list_empty(&dev->msi_list));
-
}
/**
@@ -932,7 +1096,7 @@ static void __pci_disable_msi(struct msi_desc *entry)
**/
static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
{
- int status, pos, nr_entries;
+ int pos, nr_entries;
struct pci_dev *pdev;
u16 control;
u8 slot = PCI_SLOT(msi->devfn);
@@ -941,23 +1105,22 @@ static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
ASSERT(spin_is_locked(&pcidevs_lock));
pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
- if ( !pdev )
+ pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
+ if ( !pdev || !pos )
return -ENODEV;
- pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
control = pci_conf_read16(msi->seg, msi->bus, slot, func,
msix_control_reg(pos));
nr_entries = multi_msix_capable(control);
- if (msi->entry_nr >= nr_entries)
+ if ( msi->entry_nr >= nr_entries )
return -EINVAL;
old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
if ( old_desc )
{
- dprintk(XENLOG_WARNING, "irq %d has already mapped to MSIX on "
- "device %04x:%02x:%02x.%01x\n",
- msi->irq, msi->seg, msi->bus,
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+ printk(XENLOG_WARNING "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n",
+ msi->irq, msi->seg, msi->bus,
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
*desc = old_desc;
return 0;
}
@@ -965,15 +1128,13 @@ static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI);
if ( old_desc )
{
- dprintk(XENLOG_WARNING, "MSI is already in use on "
- "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus,
- PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
- pci_disable_msi(old_desc);
-
+ printk(XENLOG_WARNING "MSI already in use on %04x:%02x:%02x.%u\n",
+ msi->seg, msi->bus,
+ PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+ __pci_disable_msi(old_desc);
}
- status = msix_capability_init(pdev, msi, desc, nr_entries);
- return status;
+ return msix_capability_init(pdev, pos, msi, desc, nr_entries);
}
static void _pci_cleanup_msix(struct arch_msix *msix)
@@ -991,25 +1152,40 @@ static void _pci_cleanup_msix(struct arch_msix *msix)
static void __pci_disable_msix(struct msi_desc *entry)
{
- struct pci_dev *dev;
- int pos;
- u16 control, seg;
- u8 bus, slot, func;
-
- dev = entry->dev;
- seg = dev->seg;
- bus = dev->bus;
- slot = PCI_SLOT(dev->devfn);
- func = PCI_FUNC(dev->devfn);
+ struct pci_dev *dev = entry->dev;
+ u16 seg = dev->seg;
+ u8 bus = dev->bus;
+ u8 slot = PCI_SLOT(dev->devfn);
+ u8 func = PCI_FUNC(dev->devfn);
+ unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+ PCI_CAP_ID_MSIX);
+ u16 control = pci_conf_read16(seg, bus, slot, func,
+ msix_control_reg(entry->msi_attrib.pos));
+ bool_t maskall = dev->msix->host_maskall;
- pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
- control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
- msix_set_enable(dev, 0);
+ if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
+ {
+ dev->msix->host_maskall = 1;
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+ control | (PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL));
+ }
BUG_ON(list_empty(&dev->msi_list));
- writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-
+ if ( likely(memory_decoded(dev)) )
+ writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+ else if ( !(control & PCI_MSIX_FLAGS_MASKALL) )
+ {
+ printk(XENLOG_WARNING
+ "cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n",
+ entry->irq, dev->seg, dev->bus,
+ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+ maskall = 1;
+ }
+ dev->msix->host_maskall = maskall;
+ if ( maskall || dev->msix->guest_maskall )
+ control |= PCI_MSIX_FLAGS_MASKALL;
pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
_pci_cleanup_msix(dev->msix);
@@ -1045,7 +1221,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off)
u16 control = pci_conf_read16(seg, bus, slot, func,
msix_control_reg(pos));
- rc = msix_capability_init(pdev, NULL, NULL,
+ rc = msix_capability_init(pdev, pos, NULL, NULL,
multi_msix_capable(control));
}
spin_unlock(&pcidevs_lock);
@@ -1064,8 +1240,8 @@ int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
if ( !use_msi )
return -EPERM;
- return msi->table_base ? __pci_enable_msix(msi, desc) :
- __pci_enable_msi(msi, desc);
+ return msi->table_base ? __pci_enable_msix(msi, desc) :
+ __pci_enable_msi(msi, desc);
}
/*
@@ -1098,6 +1274,71 @@ void pci_cleanup_msi(struct pci_dev *pdev)
msi_free_irqs(pdev);
}
+int pci_msi_conf_write_intercept(struct pci_dev *pdev, unsigned int reg,
+ unsigned int size, uint32_t *data)
+{
+ u16 seg = pdev->seg;
+ u8 bus = pdev->bus;
+ u8 slot = PCI_SLOT(pdev->devfn);
+ u8 func = PCI_FUNC(pdev->devfn);
+ struct msi_desc *entry;
+ unsigned int pos;
+
+ if ( pdev->msix )
+ {
+ entry = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX);
+ pos = entry ? entry->msi_attrib.pos
+ : pci_find_cap_offset(seg, bus, slot, func,
+ PCI_CAP_ID_MSIX);
+ ASSERT(pos);
+
+ if ( reg < pos || reg >= msix_pba_offset_reg(pos) + 4 )
+ return 0;
+
+ if ( reg != msix_control_reg(pos) || size != 2 )
+ return -EACCES;
+
+ pdev->msix->guest_maskall = !!(*data & PCI_MSIX_FLAGS_MASKALL);
+ if ( pdev->msix->host_maskall )
+ *data |= PCI_MSIX_FLAGS_MASKALL;
+
+ return 1;
+ }
+
+ entry = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI);
+ if ( entry && entry->msi_attrib.maskbit )
+ {
+ uint16_t cntl;
+ uint32_t unused;
+
+ pos = entry->msi_attrib.pos;
+ if ( reg < pos || reg >= entry->msi.mpos + 8 )
+ return 0;
+
+ if ( reg == msi_control_reg(pos) )
+ return size == 2 ? 1 : -EACCES;
+ if ( reg < entry->msi.mpos || reg >= entry->msi.mpos + 4 || size != 4 )
+ return -EACCES;
+
+ cntl = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
+ unused = ~(uint32_t)0 >> (32 - multi_msi_capable(cntl));
+ for ( pos = 0; pos < entry->msi.nvec; ++pos, ++entry )
+ {
+ entry->msi_attrib.guest_masked =
+ *data >> entry->msi_attrib.entry_nr;
+ if ( entry->msi_attrib.host_masked )
+ *data |= 1 << pos;
+ unused &= ~(1 << pos);
+ }
+
+ *data |= unused;
+
+ return 1;
+ }
+
+ return 0;
+}
+
int pci_restore_msi_state(struct pci_dev *pdev)
{
unsigned long flags;
@@ -1106,16 +1347,18 @@ int pci_restore_msi_state(struct pci_dev *pdev)
struct msi_desc *entry, *tmp;
struct irq_desc *desc;
struct msi_msg msg;
+ u8 slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+ unsigned int type = 0, pos = 0;
+ u16 control = 0;
ASSERT(spin_is_locked(&pcidevs_lock));
if ( !use_msi )
return -EOPNOTSUPP;
- if ( !pdev )
- return -EINVAL;
-
- ret = xsm_resource_setup_pci(XSM_PRIV, (pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn);
+ ret = xsm_resource_setup_pci(XSM_PRIV,
+ (pdev->seg << 16) | (pdev->bus << 8) |
+ pdev->devfn);
if ( ret )
return ret;
@@ -1138,23 +1381,48 @@ int pci_restore_msi_state(struct pci_dev *pdev)
pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
PCI_FUNC(pdev->devfn), i);
spin_unlock_irqrestore(&desc->lock, flags);
+ if ( type == PCI_CAP_ID_MSIX )
+ pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+ msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
return -EINVAL;
}
+ ASSERT(!type || type == entry->msi_attrib.type);
+ pos = entry->msi_attrib.pos;
if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
{
msi_set_enable(pdev, 0);
nr = entry->msi.nvec;
}
- else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
- msix_set_enable(pdev, 0);
+ else if ( !type && entry->msi_attrib.type == PCI_CAP_ID_MSIX )
+ {
+ control = pci_conf_read16(pdev->seg, pdev->bus, slot, func,
+ msix_control_reg(pos));
+ pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+ msix_control_reg(pos),
+ control | (PCI_MSIX_FLAGS_ENABLE |
+ PCI_MSIX_FLAGS_MASKALL));
+ if ( unlikely(!memory_decoded(pdev)) )
+ {
+ spin_unlock_irqrestore(&desc->lock, flags);
+ pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+ msix_control_reg(pos),
+ control & ~PCI_MSIX_FLAGS_ENABLE);
+ return -ENXIO;
+ }
+ }
+ type = entry->msi_attrib.type;
msg = entry->msg;
write_msi_msg(entry, &msg);
for ( i = 0; ; )
{
- msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
+ if ( unlikely(!msi_set_mask_bit(desc,
+ entry[i].msi_attrib.host_masked,
+ entry[i].msi_attrib.guest_masked)) )
+ BUG();
if ( !--nr )
break;
@@ -1168,24 +1436,25 @@ int pci_restore_msi_state(struct pci_dev *pdev)
spin_unlock_irqrestore(&desc->lock, flags);
- if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
+ if ( type == PCI_CAP_ID_MSI )
{
- unsigned int cpos = msi_control_reg(entry->msi_attrib.pos);
- u16 control = pci_conf_read16(pdev->seg, pdev->bus,
- PCI_SLOT(pdev->devfn),
- PCI_FUNC(pdev->devfn), cpos);
+ unsigned int cpos = msi_control_reg(pos);
- control &= ~PCI_MSI_FLAGS_QSIZE;
+ control = pci_conf_read16(pdev->seg, pdev->bus, slot, func, cpos) &
+ ~PCI_MSI_FLAGS_QSIZE;
multi_msi_enable(control, entry->msi.nvec);
pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
PCI_FUNC(pdev->devfn), cpos, control);
msi_set_enable(pdev, 1);
}
- else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
- msix_set_enable(pdev, 1);
}
+ if ( type == PCI_CAP_ID_MSIX )
+ pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+ msix_control_reg(pos),
+ control | PCI_MSIX_FLAGS_ENABLE);
+
return 0;
}
@@ -1306,7 +1575,7 @@ static void dump_msi(unsigned char key)
else
mask = '?';
printk(" %-6s%4u vec=%02x%7s%6s%3sassert%5s%7s"
- " dest=%08x mask=%d/%d/%c\n",
+ " dest=%08x mask=%d/%c%c/%c\n",
type, irq,
(data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT,
data & MSI_DATA_DELIVERY_LOWPRI ? "lowest" : "fixed",
@@ -1314,7 +1583,10 @@ static void dump_msi(unsigned char key)
data & MSI_DATA_LEVEL_ASSERT ? "" : "de",
addr & MSI_ADDR_DESTMODE_LOGIC ? "log" : "phys",
addr & MSI_ADDR_REDIRECTION_LOWPRI ? "lowest" : "cpu",
- dest32, attr.maskbit, attr.masked, mask);
+ dest32, attr.maskbit,
+ attr.host_masked ? 'H' : ' ',
+ attr.guest_masked ? 'G' : ' ',
+ mask);
}
}
diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c
index 98c1e15..2ab97a0 100644
--- a/xen/arch/x86/nmi.c
+++ b/xen/arch/x86/nmi.c
@@ -148,7 +148,7 @@ int __init check_nmi_watchdog (void)
int cpu;
bool_t ok = 1;
- if ( !nmi_watchdog )
+ if ( nmi_watchdog == NMI_NONE )
return 0;
printk("Testing NMI watchdog on all CPUs:");
@@ -361,7 +361,7 @@ static int __pminit setup_p4_watchdog(void)
void __pminit setup_apic_nmi_watchdog(void)
{
- if (!nmi_watchdog)
+ if ( nmi_watchdog == NMI_NONE )
return;
switch (boot_cpu_data.x86_vendor) {
diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
index 628a40a..132d694 100644
--- a/xen/arch/x86/numa.c
+++ b/xen/arch/x86/numa.c
@@ -16,6 +16,7 @@
#include <xen/pfn.h>
#include <asm/acpi.h>
#include <xen/sched.h>
+#include <xen/softirq.h>
static int numa_setup(char *s);
custom_param("numa", numa_setup);
@@ -35,13 +36,13 @@ static typeof(*memnodemap) _memnodemap[64];
unsigned long memnodemapsize;
u8 *memnodemap;
-unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
+nodeid_t cpu_to_node[NR_CPUS] __read_mostly = {
[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
/*
* Keep BIOS's CPU2node information; it should not be used for memory allocation
*/
-unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+nodeid_t apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
@@ -65,7 +66,7 @@ int srat_disabled(void)
* -1 if node overlap or lost ram (shift too big)
*/
static int __init populate_memnodemap(const struct node *nodes,
- int numnodes, int shift, int *nodeids)
+ int numnodes, int shift, nodeid_t *nodeids)
{
unsigned long spdx, epdx;
int i, res = -1;
@@ -150,7 +151,7 @@ static int __init extract_lsb_from_nodes(const struct node *nodes,
}
int __init compute_hash_shift(struct node *nodes, int numnodes,
- int *nodeids)
+ nodeid_t *nodeids)
{
int shift;
@@ -172,7 +173,7 @@ int __init compute_hash_shift(struct node *nodes, int numnodes,
return shift;
}
/* initialize NODE_DATA given nodeid and start/end */
-void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
+void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
{
unsigned long start_pfn, end_pfn;
@@ -294,7 +295,7 @@ __cpuinit void numa_add_cpu(int cpu)
cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}
-void __cpuinit numa_set_node(int cpu, int node)
+void __cpuinit numa_set_node(int cpu, nodeid_t node)
{
cpu_to_node[cpu] = node;
}
@@ -340,7 +341,8 @@ static __init int numa_setup(char *opt)
*/
void __init init_cpu_to_node(void)
{
- int i, node;
+ unsigned int i;
+ nodeid_t node;
for ( i = 0; i < nr_cpu_ids; i++ )
{
@@ -363,10 +365,12 @@ EXPORT_SYMBOL(node_data);
static void dump_numa(unsigned char key)
{
s_time_t now = NOW();
- int i;
+ unsigned int i, j;
+ int err;
struct domain *d;
struct page_info *page;
unsigned int page_num_node[MAX_NUMNODES];
+ const struct vnuma_info *vnuma;
printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
(u32)(now>>32), (u32)now);
@@ -393,6 +397,8 @@ static void dump_numa(unsigned char key)
printk("Memory location of each domain:\n");
for_each_domain ( d )
{
+ process_pending_softirqs();
+
printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages);
for_each_online_node ( i )
@@ -408,6 +414,70 @@ static void dump_numa(unsigned char key)
for_each_online_node ( i )
printk(" Node %u: %u\n", i, page_num_node[i]);
+
+ if ( !read_trylock(&d->vnuma_rwlock) )
+ continue;
+
+ if ( !d->vnuma )
+ {
+ read_unlock(&d->vnuma_rwlock);
+ continue;
+ }
+
+ vnuma = d->vnuma;
+ printk(" %u vnodes, %u vcpus, guest physical layout:\n",
+ vnuma->nr_vnodes, d->max_vcpus);
+ for ( i = 0; i < vnuma->nr_vnodes; i++ )
+ {
+ unsigned int start_cpu = ~0U;
+
+ err = snprintf(keyhandler_scratch, 12, "%3u",
+ vnuma->vnode_to_pnode[i]);
+ if ( err < 0 || vnuma->vnode_to_pnode[i] == NUMA_NO_NODE )
+ strlcpy(keyhandler_scratch, "???", sizeof(keyhandler_scratch));
+
+ printk(" %3u: pnode %s,", i, keyhandler_scratch);
+
+ printk(" vcpus ");
+
+ for ( j = 0; j < d->max_vcpus; j++ )
+ {
+ if ( !(j & 0x3f) )
+ process_pending_softirqs();
+
+ if ( vnuma->vcpu_to_vnode[j] == i )
+ {
+ if ( start_cpu == ~0U )
+ {
+ printk("%d", j);
+ start_cpu = j;
+ }
+ }
+ else if ( start_cpu != ~0U )
+ {
+ if ( j - 1 != start_cpu )
+ printk("-%d ", j - 1);
+ else
+ printk(" ");
+ start_cpu = ~0U;
+ }
+ }
+
+ if ( start_cpu != ~0U && start_cpu != j - 1 )
+ printk("-%d", j - 1);
+
+ printk("\n");
+
+ for ( j = 0; j < vnuma->nr_vmemranges; j++ )
+ {
+ if ( vnuma->vmemrange[j].nid == i )
+ printk(" %016"PRIx64" - %016"PRIx64"\n",
+ vnuma->vmemrange[j].start,
+ vnuma->vmemrange[j].end);
+ }
+ }
+
+ read_unlock(&d->vnuma_rwlock);
}
rcu_read_unlock(&domlist_read_lock);
diff --git a/xen/arch/x86/oprofile/op_model_ppro.c b/xen/arch/x86/oprofile/op_model_ppro.c
index aa99e4d..89649d0 100644
--- a/xen/arch/x86/oprofile/op_model_ppro.c
+++ b/xen/arch/x86/oprofile/op_model_ppro.c
@@ -19,12 +19,16 @@
#include <asm/processor.h>
#include <asm/regs.h>
#include <asm/current.h>
-#include <asm/hvm/vpmu.h>
-#include <asm/hvm/vmx/vpmu_core2.h>
+#include <asm/vpmu.h>
#include "op_x86_model.h"
#include "op_counter.h"
+struct arch_msr_pair {
+ u64 counter;
+ u64 control;
+};
+
/*
* Intel "Architectural Performance Monitoring" CPUID
* detection/enumeration details:
diff --git a/xen/arch/x86/pci.c b/xen/arch/x86/pci.c
index 88e926d..5bcecbb 100644
--- a/xen/arch/x86/pci.c
+++ b/xen/arch/x86/pci.c
@@ -67,3 +67,28 @@ void pci_conf_write(uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data)
spin_unlock_irqrestore(&pci_config_lock, flags);
}
+
+int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
+ unsigned int reg, unsigned int size,
+ uint32_t *data)
+{
+ struct pci_dev *pdev;
+ int rc = 0;
+
+ /*
+ * Avoid expensive operations when no hook is going to do anything
+ * for the access anyway.
+ */
+ if ( reg < 64 || reg >= 256 )
+ return 0;
+
+ spin_lock(&pcidevs_lock);
+
+ pdev = pci_get_pdev(seg, PCI_BUS(bdf), PCI_DEVFN2(bdf));
+ if ( pdev )
+ rc = pci_msi_conf_write_intercept(pdev, reg, size, data);
+
+ spin_unlock(&pcidevs_lock);
+
+ return rc;
+}
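/* Editor's sketch (assumption): callers interpret the tri-state result
 * before forwarding a config-space write to the hardware. */
rc = pci_conf_write_intercept(seg, bdf, reg, size, &data);
if ( rc < 0 )
    return rc;                 /* e.g. -EACCES: refuse the access */
/* rc == 1: *data was adjusted to preserve host mask bits; rc == 0: no
 * hook applied. Either way the (possibly adjusted) value is written: */
pci_conf_write(cf8, reg & 3, size, data);    /* cf8 as built by the caller */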
diff --git a/xen/arch/x86/physdev.c b/xen/arch/x86/physdev.c
index 6b3201b..57b7800 100644
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -291,7 +291,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
{
int irq;
ret_t ret;
- struct vcpu *v = current;
+ struct domain *currd = current->domain;
switch ( cmd )
{
@@ -303,32 +303,31 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
if ( copy_from_guest(&eoi, arg, 1) != 0 )
break;
ret = -EINVAL;
- if ( eoi.irq >= v->domain->nr_pirqs )
+ if ( eoi.irq >= currd->nr_pirqs )
break;
- spin_lock(&v->domain->event_lock);
- pirq = pirq_info(v->domain, eoi.irq);
+ spin_lock(&currd->event_lock);
+ pirq = pirq_info(currd, eoi.irq);
if ( !pirq ) {
- spin_unlock(&v->domain->event_lock);
+ spin_unlock(&currd->event_lock);
break;
}
- if ( v->domain->arch.auto_unmask )
+ if ( currd->arch.auto_unmask )
evtchn_unmask(pirq->evtchn);
- if ( is_pv_domain(v->domain) ||
- domain_pirq_to_irq(v->domain, eoi.irq) > 0 )
+ if ( is_pv_domain(currd) || domain_pirq_to_irq(currd, eoi.irq) > 0 )
pirq_guest_eoi(pirq);
- if ( is_hvm_domain(v->domain) &&
- domain_pirq_to_emuirq(v->domain, eoi.irq) > 0 )
+ if ( is_hvm_domain(currd) &&
+ domain_pirq_to_emuirq(currd, eoi.irq) > 0 )
{
- struct hvm_irq *hvm_irq = &v->domain->arch.hvm_domain.irq;
- int gsi = domain_pirq_to_emuirq(v->domain, eoi.irq);
+ struct hvm_irq *hvm_irq = &currd->arch.hvm_domain.irq;
+ int gsi = domain_pirq_to_emuirq(currd, eoi.irq);
/* if this is a level irq and count > 0, send another
* notification */
if ( gsi >= NR_ISAIRQS /* ISA irqs are edge triggered */
&& hvm_irq->gsi_assert_count[gsi] )
- send_guest_pirq(v->domain, pirq);
+ send_guest_pirq(currd, pirq);
}
- spin_unlock(&v->domain->event_lock);
+ spin_unlock(&currd->event_lock);
ret = 0;
break;
}
@@ -336,7 +335,6 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
case PHYSDEVOP_pirq_eoi_gmfn_v2:
case PHYSDEVOP_pirq_eoi_gmfn_v1: {
struct physdev_pirq_eoi_gmfn info;
- unsigned long mfn;
struct page_info *page;
ret = -EFAULT;
@@ -352,26 +350,25 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
put_page(page);
break;
}
- mfn = page_to_mfn(page);
- if ( cmpxchg(&v->domain->arch.pirq_eoi_map_mfn,
- 0, mfn) != 0 )
+ if ( cmpxchg(&currd->arch.pirq_eoi_map_mfn,
+ 0, page_to_mfn(page)) != 0 )
{
- put_page_and_type(mfn_to_page(mfn));
+ put_page_and_type(page);
ret = -EBUSY;
break;
}
- v->domain->arch.pirq_eoi_map = map_domain_page_global(mfn);
- if ( v->domain->arch.pirq_eoi_map == NULL )
+ currd->arch.pirq_eoi_map = __map_domain_page_global(page);
+ if ( currd->arch.pirq_eoi_map == NULL )
{
- v->domain->arch.pirq_eoi_map_mfn = 0;
- put_page_and_type(mfn_to_page(mfn));
+ currd->arch.pirq_eoi_map_mfn = 0;
+ put_page_and_type(page);
ret = -ENOSPC;
break;
}
if ( cmd == PHYSDEVOP_pirq_eoi_gmfn_v1 )
- v->domain->arch.auto_unmask = 1;
+ currd->arch.auto_unmask = 1;
ret = 0;
break;
@@ -379,7 +376,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
/* Legacy since 0x00030202. */
case PHYSDEVOP_IRQ_UNMASK_NOTIFY: {
- ret = pirq_guest_unmask(v->domain);
+ ret = pirq_guest_unmask(currd);
break;
}
@@ -390,12 +387,12 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
break;
irq = irq_status_query.irq;
ret = -EINVAL;
- if ( (irq < 0) || (irq >= v->domain->nr_pirqs) )
+ if ( (irq < 0) || (irq >= currd->nr_pirqs) )
break;
irq_status_query.flags = 0;
- if ( is_hvm_domain(v->domain) &&
- domain_pirq_to_irq(v->domain, irq) <= 0 &&
- domain_pirq_to_emuirq(v->domain, irq) == IRQ_UNBOUND )
+ if ( is_hvm_domain(currd) &&
+ domain_pirq_to_irq(currd, irq) <= 0 &&
+ domain_pirq_to_emuirq(currd, irq) == IRQ_UNBOUND )
{
ret = -EINVAL;
break;
@@ -410,7 +407,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
* then dom0 is probably modern anyway.
*/
irq_status_query.flags |= XENIRQSTAT_needs_eoi;
- if ( pirq_shared(v->domain, irq) )
+ if ( pirq_shared(currd, irq) )
irq_status_query.flags |= XENIRQSTAT_shared;
ret = __copy_to_guest(arg, &irq_status_query, 1) ? -EFAULT : 0;
break;
@@ -471,7 +468,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
ret = -EFAULT;
if ( copy_from_guest(&apic, arg, 1) != 0 )
break;
- ret = xsm_apic(XSM_PRIV, v->domain, cmd);
+ ret = xsm_apic(XSM_PRIV, currd, cmd);
if ( ret )
break;
ret = ioapic_guest_read(apic.apic_physbase, apic.reg, &apic.value);
@@ -485,7 +482,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
ret = -EFAULT;
if ( copy_from_guest(&apic, arg, 1) != 0 )
break;
- ret = xsm_apic(XSM_PRIV, v->domain, cmd);
+ ret = xsm_apic(XSM_PRIV, currd, cmd);
if ( ret )
break;
ret = ioapic_guest_write(apic.apic_physbase, apic.reg, apic.value);
@@ -501,7 +498,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
/* Use the APIC check since this dummy hypercall should still only
* be called by the domain with access to program the ioapic */
- ret = xsm_apic(XSM_PRIV, v->domain, cmd);
+ ret = xsm_apic(XSM_PRIV, currd, cmd);
if ( ret )
break;
@@ -518,10 +515,11 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
}
case PHYSDEVOP_set_iopl: {
+ struct vcpu *curr = current;
struct physdev_set_iopl set_iopl;
ret = -ENOSYS;
- if ( is_pvh_vcpu(current) )
+ if ( is_pvh_vcpu(curr) )
break;
ret = -EFAULT;
@@ -531,15 +529,16 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
if ( set_iopl.iopl > 3 )
break;
ret = 0;
- v->arch.pv_vcpu.iopl = set_iopl.iopl;
+ curr->arch.pv_vcpu.iopl = set_iopl.iopl;
break;
}
case PHYSDEVOP_set_iobitmap: {
+ struct vcpu *curr = current;
struct physdev_set_iobitmap set_iobitmap;
ret = -ENOSYS;
- if ( is_pvh_vcpu(current) )
+ if ( is_pvh_vcpu(curr) )
break;
ret = -EFAULT;
@@ -551,11 +550,12 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
break;
ret = 0;
#ifndef COMPAT
- v->arch.pv_vcpu.iobmp = set_iobitmap.bitmap;
+ curr->arch.pv_vcpu.iobmp = set_iobitmap.bitmap;
#else
- guest_from_compat_handle(v->arch.pv_vcpu.iobmp, set_iobitmap.bitmap);
+ guest_from_compat_handle(curr->arch.pv_vcpu.iobmp,
+ set_iobitmap.bitmap);
#endif
- v->arch.pv_vcpu.iobmp_limit = set_iobitmap.nr_ports;
+ curr->arch.pv_vcpu.iobmp_limit = set_iobitmap.nr_ports;
break;
}
@@ -565,7 +565,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
if ( copy_from_guest(&manage_pci, arg, 1) != 0 )
break;
- ret = pci_add_device(0, manage_pci.bus, manage_pci.devfn, NULL);
+ ret = pci_add_device(0, manage_pci.bus, manage_pci.devfn,
+ NULL, NUMA_NO_NODE);
break;
}
@@ -597,13 +598,14 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
pdev_info.physfn.devfn = manage_pci_ext.physfn.devfn;
ret = pci_add_device(0, manage_pci_ext.bus,
manage_pci_ext.devfn,
- &pdev_info);
+ &pdev_info, NUMA_NO_NODE);
break;
}
case PHYSDEVOP_pci_device_add: {
struct physdev_pci_device_add add;
struct pci_dev_info pdev_info;
+ nodeid_t node;
ret = -EFAULT;
if ( copy_from_guest(&add, arg, 1) != 0 )
@@ -618,7 +620,22 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
}
else
pdev_info.is_virtfn = 0;
- ret = pci_add_device(add.seg, add.bus, add.devfn, &pdev_info);
+
+ if ( add.flags & XEN_PCI_DEV_PXM )
+ {
+ uint32_t pxm;
+ size_t optarr_off = offsetof(struct physdev_pci_device_add, optarr) /
+ sizeof(add.optarr[0]);
+
+ if ( copy_from_guest_offset(&pxm, arg, optarr_off, 1) )
+ break;
+
+ node = pxm_to_node(pxm);
+ }
+ else
+ node = NUMA_NO_NODE;
+
+ ret = pci_add_device(add.seg, add.bus, add.devfn, &pdev_info, node);
break;
}
@@ -715,18 +732,17 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
}
case PHYSDEVOP_get_free_pirq: {
struct physdev_get_free_pirq out;
- struct domain *d = v->domain;
ret = -EFAULT;
if ( copy_from_guest(&out, arg, 1) != 0 )
break;
- spin_lock(&d->event_lock);
+ spin_lock(&currd->event_lock);
- ret = get_free_pirq(d, out.type);
+ ret = get_free_pirq(currd, out.type);
if ( ret >= 0 )
{
- struct pirq *info = pirq_get_info(d, ret);
+ struct pirq *info = pirq_get_info(currd, ret);
if ( info )
info->arch.irq = PIRQ_ALLOCATED;
@@ -734,7 +750,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
ret = -ENOMEM;
}
- spin_unlock(&d->event_lock);
+ spin_unlock(&currd->event_lock);
if ( ret >= 0 )
{
@@ -748,7 +764,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
case PHYSDEVOP_dbgp_op: {
struct physdev_dbgp_op op;
- if ( !is_hardware_domain(v->domain) )
+ if ( !is_hardware_domain(currd) )
ret = -EPERM;
else if ( copy_from_guest(&op, arg, 1) )
ret = -EFAULT;
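
[The pirq_eoi_gmfn_v* hunk above claims the per-domain map slot exactly once via cmpxchg() and rolls the page reference back when it loses the race. Below is a minimal user-space sketch of that claim-once-or-back-off pattern, using C11 atomics rather than Xen's cmpxchg(); all names here are illustrative, not Xen's.]

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned long map_slot;      /* 0 == unclaimed */

    /* Try to claim the slot for 'mfn'; 0 on success, -1 if already taken. */
    static int claim_once(unsigned long mfn)
    {
        unsigned long expected = 0;

        if ( !atomic_compare_exchange_strong(&map_slot, &expected, mfn) )
            return -1;  /* lost the race: caller must drop its page ref */
        return 0;
    }

    int main(void)
    {
        printf("first:  %d\n", claim_once(0x1234));  /* 0 */
        printf("second: %d\n", claim_once(0x5678));  /* -1 (-EBUSY in Xen) */
        return 0;
    }
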
diff --git a/xen/arch/x86/platform_hypercall.c b/xen/arch/x86/platform_hypercall.c
index 32f39b2..7626261 100644
--- a/xen/arch/x86/platform_hypercall.c
+++ b/xen/arch/x86/platform_hypercall.c
@@ -23,6 +23,7 @@
#include <xen/cpu.h>
#include <xen/pmstat.h>
#include <xen/irq.h>
+#include <xen/symbols.h>
#include <asm/current.h>
#include <public/platform.h>
#include <acpi/cpufreq/processor_perf.h>
@@ -61,7 +62,7 @@ long cpu_down_helper(void *data);
long core_parking_helper(void *data);
uint32_t get_cur_idle_nums(void);
-#define RESOURCE_ACCESS_MAX_ENTRIES 2
+#define RESOURCE_ACCESS_MAX_ENTRIES 3
struct xen_resource_access {
unsigned int nr_done;
unsigned int nr_entries;
@@ -75,6 +76,7 @@ static bool_t allow_access_msr(unsigned int msr)
/* MSR for CMT, refer to chapter 17.14 of Intel SDM. */
case MSR_IA32_CMT_EVTSEL:
case MSR_IA32_CMT_CTR:
+ case MSR_IA32_TSC:
return 1;
}
@@ -124,6 +126,7 @@ static void resource_access(void *info)
{
struct xen_resource_access *ra = info;
unsigned int i;
+ u64 tsc = 0;
for ( i = 0; i < ra->nr_done; i++ )
{
@@ -133,10 +136,40 @@ static void resource_access(void *info)
switch ( entry->u.cmd )
{
case XEN_RESOURCE_OP_MSR_READ:
- ret = rdmsr_safe(entry->idx, entry->val);
+ if ( unlikely(entry->idx == MSR_IA32_TSC) )
+ {
+ /* Return obfuscated scaled time instead of raw timestamp */
+ entry->val = get_s_time_fixed(tsc)
+ + SECONDS(boot_random) - boot_random;
+ ret = 0;
+ }
+ else
+ {
+ unsigned long flags = 0;
+ /*
+ * If the next entry is an MSR_IA32_TSC read, the actual rdtsc
+ * is performed together with the current entry, with IRQs disabled.
+ */
+ bool_t read_tsc = (i < ra->nr_done - 1 &&
+ unlikely(entry[1].idx == MSR_IA32_TSC));
+
+ if ( unlikely(read_tsc) )
+ local_irq_save(flags);
+
+ ret = rdmsr_safe(entry->idx, entry->val);
+
+ if ( unlikely(read_tsc) )
+ {
+ tsc = rdtsc();
+ local_irq_restore(flags);
+ }
+ }
break;
case XEN_RESOURCE_OP_MSR_WRITE:
- ret = wrmsr_safe(entry->idx, entry->val);
+ if ( unlikely(entry->idx == MSR_IA32_TSC) )
+ ret = -EPERM;
+ else
+ ret = wrmsr_safe(entry->idx, entry->val);
break;
default:
BUG();
@@ -155,7 +188,7 @@ static void resource_access(void *info)
ret_t do_platform_op(XEN_GUEST_HANDLE_PARAM(xen_platform_op_t) u_xenpf_op)
{
- ret_t ret = 0;
+ ret_t ret;
struct xen_platform_op curop, *op = &curop;
if ( copy_from_guest(op, u_xenpf_op, 1) )
@@ -180,14 +213,20 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PARAM(xen_platform_op_t) u_xenpf_op)
switch ( op->cmd )
{
- case XENPF_settime:
- {
- do_settime(op->u.settime.secs,
- op->u.settime.nsecs,
- op->u.settime.system_time);
- ret = 0;
- }
- break;
+ case XENPF_settime32:
+ do_settime(op->u.settime32.secs,
+ op->u.settime32.nsecs,
+ op->u.settime32.system_time);
+ break;
+
+ case XENPF_settime64:
+ if ( likely(!op->u.settime64.mbz) )
+ do_settime(op->u.settime64.secs,
+ op->u.settime64.nsecs,
+ op->u.settime64.system_time);
+ else
+ ret = -EINVAL;
+ break;
case XENPF_add_memtype:
{
@@ -451,7 +490,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PARAM(xen_platform_op_t) u_xenpf_op)
if ( !idletime )
{
- cpumask_clear_cpu(cpu, cpumap);
+ __cpumask_clear_cpu(cpu, cpumap);
continue;
}
@@ -760,6 +799,33 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PARAM(xen_platform_op_t) u_xenpf_op)
}
break;
+ case XENPF_get_symbol:
+ {
+ static char name[KSYM_NAME_LEN + 1]; /* protected by xenpf_lock */
+ XEN_GUEST_HANDLE(char) nameh;
+ uint32_t namelen, copylen;
+
+ guest_from_compat_handle(nameh, op->u.symdata.name);
+
+ ret = xensyms_read(&op->u.symdata.symnum, &op->u.symdata.type,
+ &op->u.symdata.address, name);
+
+ namelen = strlen(name) + 1;
+
+ if ( namelen > op->u.symdata.namelen )
+ copylen = op->u.symdata.namelen;
+ else
+ copylen = namelen;
+
+ op->u.symdata.namelen = namelen;
+
+ if ( !ret && copy_to_guest(nameh, name, copylen) )
+ ret = -EFAULT;
+ if ( !ret && __copy_field_to_guest(u_xenpf_op, op, u.symdata) )
+ ret = -EFAULT;
+ }
+ break;
+
default:
ret = -ENOSYS;
break;
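
[XENPF_get_symbol above copies at most the guest-supplied buffer length but always reports the full name length back, so callers can detect truncation and retry with a bigger buffer. A stand-alone sketch of that convention follows; the function and variable names are invented for illustration.]

    #include <stdio.h>
    #include <string.h>

    /* Copy 'name' (including its NUL) into buf[buflen], truncating if
     * necessary; return the full length the caller would need. */
    static size_t copy_symbol_name(char *buf, size_t buflen, const char *name)
    {
        size_t namelen = strlen(name) + 1;
        size_t copylen = namelen > buflen ? buflen : namelen;

        memcpy(buf, name, copylen);   /* may omit the NUL when truncated */
        return namelen;
    }

    int main(void)
    {
        char small[8];
        size_t need = copy_symbol_name(small, sizeof(small), "do_platform_op");

        if ( need > sizeof(small) )
            printf("truncated: need %zu bytes\n", need);
        return 0;
    }
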
diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c
index 2ef83df..c0daa2e 100644
--- a/xen/arch/x86/psr.c
+++ b/xen/arch/x86/psr.c
@@ -15,22 +15,70 @@
*/
#include <xen/init.h>
#include <xen/cpu.h>
+#include <xen/err.h>
#include <xen/sched.h>
#include <asm/psr.h>
#define PSR_CMT (1<<0)
+#define PSR_CAT (1<<1)
+
+struct psr_cat_cbm {
+ uint64_t cbm;
+ unsigned int ref;
+};
+
+struct psr_cat_socket_info {
+ unsigned int cbm_len;
+ unsigned int cos_max;
+ struct psr_cat_cbm *cos_to_cbm;
+ spinlock_t cbm_lock;
+};
struct psr_assoc {
uint64_t val;
- bool_t initialized;
+ uint64_t cos_mask;
};
struct psr_cmt *__read_mostly psr_cmt;
-static bool_t __initdata opt_psr;
+
+static unsigned long *__read_mostly cat_socket_enable;
+static struct psr_cat_socket_info *__read_mostly cat_socket_info;
+
+static unsigned int __initdata opt_psr;
static unsigned int __initdata opt_rmid_max = 255;
+static unsigned int __read_mostly opt_cos_max = 255;
static uint64_t rmid_mask;
static DEFINE_PER_CPU(struct psr_assoc, psr_assoc);
+static struct psr_cat_cbm *temp_cos_to_cbm;
+
+static unsigned int get_socket_cpu(unsigned int socket)
+{
+ if ( likely(socket < nr_sockets) )
+ return cpumask_any(socket_cpumask[socket]);
+
+ return nr_cpu_ids;
+}
+
+static void __init parse_psr_bool(char *s, char *value, char *feature,
+ unsigned int mask)
+{
+ if ( !strcmp(s, feature) )
+ {
+ if ( !value )
+ opt_psr |= mask;
+ else
+ {
+ int val_int = parse_bool(value);
+
+ if ( val_int == 0 )
+ opt_psr &= ~mask;
+ else if ( val_int == 1 )
+ opt_psr |= mask;
+ }
+ }
+}
+
static void __init parse_psr_param(char *s)
{
char *ss, *val_str;
@@ -44,23 +92,15 @@ static void __init parse_psr_param(char *s)
if ( val_str )
*val_str++ = '\0';
- if ( !strcmp(s, "cmt") )
- {
- if ( !val_str )
- opt_psr |= PSR_CMT;
- else
- {
- int val_int = parse_bool(val_str);
- if ( val_int == 1 )
- opt_psr |= PSR_CMT;
- else if ( val_int != 0 )
- printk("PSR: unknown cmt value: %s - CMT disabled!\n",
- val_str);
- }
- }
- else if ( val_str && !strcmp(s, "rmid_max") )
+ parse_psr_bool(s, val_str, "cmt", PSR_CMT);
+ parse_psr_bool(s, val_str, "cat", PSR_CAT);
+
+ if ( val_str && !strcmp(s, "rmid_max") )
opt_rmid_max = simple_strtoul(val_str, NULL, 0);
+ if ( val_str && !strcmp(s, "cos_max") )
+ opt_cos_max = simple_strtoul(val_str, NULL, 0);
+
s = ss + 1;
} while ( ss );
}
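
[The reworked parser above splits the comma-separated psr= string and hands each "feature[:value]" pair to parse_psr_bool(). A compact sketch of the same splitting, with parse_bool() approximated by string comparisons; the ':' separator matches Xen's documented psr= syntax, everything else here is illustrative.]

    #include <stdio.h>
    #include <string.h>

    #define PSR_CMT (1 << 0)
    #define PSR_CAT (1 << 1)

    static unsigned int opt_psr;

    static void parse_bool_opt(const char *s, const char *val,
                               const char *feat, unsigned int mask)
    {
        if ( strcmp(s, feat) )
            return;
        if ( !val || !strcmp(val, "1") || !strcmp(val, "on") )
            opt_psr |= mask;
        else if ( !strcmp(val, "0") || !strcmp(val, "off") )
            opt_psr &= ~mask;
    }

    static void parse_psr(char *s)
    {
        char *ss;

        do {
            char *val;

            ss = strchr(s, ',');
            if ( ss )
                *ss = '\0';
            val = strchr(s, ':');
            if ( val )
                *val++ = '\0';

            parse_bool_opt(s, val, "cmt", PSR_CMT);
            parse_bool_opt(s, val, "cat", PSR_CAT);

            s = ss + 1;
        } while ( ss );
    }

    int main(void)
    {
        char arg[] = "cmt,cat:0";

        parse_psr(arg);
        printf("opt_psr = %#x\n", opt_psr);  /* 0x1: CMT on, CAT off */
        return 0;
    }
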
@@ -115,14 +155,6 @@ static void __init init_psr_cmt(unsigned int rmid_max)
printk(XENLOG_INFO "Cache Monitoring Technology enabled\n");
}
-static int __init init_psr(void)
-{
- if ( (opt_psr & PSR_CMT) && opt_rmid_max )
- init_psr_cmt(opt_rmid_max);
- return 0;
-}
-__initcall(init_psr);
-
/* Called with domain lock held, no psr specific lock needed */
int psr_alloc_rmid(struct domain *d)
{
@@ -146,7 +178,7 @@ int psr_alloc_rmid(struct domain *d)
if ( rmid > psr_cmt->rmid_max )
{
d->arch.psr_rmid = 0;
- return -EUSERS;
+ return -EOVERFLOW;
}
d->arch.psr_rmid = rmid;
@@ -168,27 +200,417 @@ void psr_free_rmid(struct domain *d)
d->arch.psr_rmid = 0;
}
-void psr_assoc_rmid(unsigned int rmid)
+static inline void psr_assoc_init(void)
{
- uint64_t val;
- uint64_t new_val;
struct psr_assoc *psra = &this_cpu(psr_assoc);
- if ( !psra->initialized )
+ if ( cat_socket_info )
{
+ unsigned int socket = cpu_to_socket(smp_processor_id());
+
+ if ( test_bit(socket, cat_socket_enable) )
+ psra->cos_mask = ((1ull << get_count_order(
+ cat_socket_info[socket].cos_max)) - 1) << 32;
+ }
+
+ if ( psr_cmt_enabled() || psra->cos_mask )
rdmsrl(MSR_IA32_PSR_ASSOC, psra->val);
- psra->initialized = 1;
+}
+
+static inline void psr_assoc_rmid(uint64_t *reg, unsigned int rmid)
+{
+ *reg = (*reg & ~rmid_mask) | (rmid & rmid_mask);
+}
+
+static inline void psr_assoc_cos(uint64_t *reg, unsigned int cos,
+ uint64_t cos_mask)
+{
+ *reg = (*reg & ~cos_mask) | (((uint64_t)cos << 32) & cos_mask);
+}
+
+void psr_ctxt_switch_to(struct domain *d)
+{
+ struct psr_assoc *psra = &this_cpu(psr_assoc);
+ uint64_t reg = psra->val;
+
+ if ( psr_cmt_enabled() )
+ psr_assoc_rmid(&reg, d->arch.psr_rmid);
+
+ if ( psra->cos_mask )
+ psr_assoc_cos(&reg, d->arch.psr_cos_ids ?
+ d->arch.psr_cos_ids[cpu_to_socket(smp_processor_id())] :
+ 0, psra->cos_mask);
+
+ if ( reg != psra->val )
+ {
+ wrmsrl(MSR_IA32_PSR_ASSOC, reg);
+ psra->val = reg;
+ }
+}
+static struct psr_cat_socket_info *get_cat_socket_info(unsigned int socket)
+{
+ if ( !cat_socket_info )
+ return ERR_PTR(-ENODEV);
+
+ if ( socket >= nr_sockets )
+ return ERR_PTR(-ENOTSOCK);
+
+ if ( !test_bit(socket, cat_socket_enable) )
+ return ERR_PTR(-ENOENT);
+
+ return cat_socket_info + socket;
+}
+
+int psr_get_cat_l3_info(unsigned int socket, uint32_t *cbm_len,
+ uint32_t *cos_max)
+{
+ struct psr_cat_socket_info *info = get_cat_socket_info(socket);
+
+ if ( IS_ERR(info) )
+ return PTR_ERR(info);
+
+ *cbm_len = info->cbm_len;
+ *cos_max = info->cos_max;
+
+ return 0;
+}
+
+int psr_get_l3_cbm(struct domain *d, unsigned int socket, uint64_t *cbm)
+{
+ struct psr_cat_socket_info *info = get_cat_socket_info(socket);
+
+ if ( IS_ERR(info) )
+ return PTR_ERR(info);
+
+ *cbm = info->cos_to_cbm[d->arch.psr_cos_ids[socket]].cbm;
+
+ return 0;
+}
+
+static bool_t psr_check_cbm(unsigned int cbm_len, uint64_t cbm)
+{
+ unsigned int first_bit, zero_bit;
+
+ /* Set bits must lie within the range [0, cbm_len). */
+ if ( cbm & (~0ull << cbm_len) )
+ return 0;
+
+ /* At least one bit needs to be set. */
+ if ( cbm == 0 )
+ return 0;
+
+ first_bit = find_first_bit(&cbm, cbm_len);
+ zero_bit = find_next_zero_bit(&cbm, cbm_len, first_bit);
+
+ /* Set bits should be contiguous. */
+ if ( zero_bit < cbm_len &&
+ find_next_bit(&cbm, cbm_len, zero_bit) < cbm_len )
+ return 0;
+
+ return 1;
+}
+
+struct cos_cbm_info
+{
+ unsigned int cos;
+ uint64_t cbm;
+};
+
+static void do_write_l3_cbm(void *data)
+{
+ struct cos_cbm_info *info = data;
+
+ wrmsrl(MSR_IA32_PSR_L3_MASK(info->cos), info->cbm);
+}
+
+static int write_l3_cbm(unsigned int socket, unsigned int cos, uint64_t cbm)
+{
+ struct cos_cbm_info info = { .cos = cos, .cbm = cbm };
+
+ if ( socket == cpu_to_socket(smp_processor_id()) )
+ do_write_l3_cbm(&info);
+ else
+ {
+ unsigned int cpu = get_socket_cpu(socket);
+
+ if ( cpu >= nr_cpu_ids )
+ return -ENOTSOCK;
+ on_selected_cpus(cpumask_of(cpu), do_write_l3_cbm, &info, 1);
+ }
+
+ return 0;
+}
+
+int psr_set_l3_cbm(struct domain *d, unsigned int socket, uint64_t cbm)
+{
+ unsigned int old_cos, cos;
+ struct psr_cat_cbm *map, *found = NULL;
+ struct psr_cat_socket_info *info = get_cat_socket_info(socket);
+
+ if ( IS_ERR(info) )
+ return PTR_ERR(info);
+
+ if ( !psr_check_cbm(info->cbm_len, cbm) )
+ return -EINVAL;
+
+ old_cos = d->arch.psr_cos_ids[socket];
+ map = info->cos_to_cbm;
+
+ spin_lock(&info->cbm_lock);
+
+ for ( cos = 0; cos <= info->cos_max; cos++ )
+ {
+ /* If not found yet, keep the first unused entry. */
+ if ( !found && cos != 0 && map[cos].ref == 0 )
+ found = map + cos;
+ else if ( map[cos].cbm == cbm )
+ {
+ if ( unlikely(cos == old_cos) )
+ {
+ ASSERT(cos == 0 || map[cos].ref != 0);
+ spin_unlock(&info->cbm_lock);
+ return 0;
+ }
+ found = map + cos;
+ break;
+ }
}
- val = psra->val;
- new_val = (val & ~rmid_mask) | (rmid & rmid_mask);
- if ( val != new_val )
+ /* If the old cos is referenced only by this domain, reuse it. */
+ if ( !found && map[old_cos].ref == 1 )
+ found = map + old_cos;
+
+ if ( !found )
+ {
+ spin_unlock(&info->cbm_lock);
+ return -EOVERFLOW;
+ }
+
+ cos = found - map;
+ if ( found->cbm != cbm )
+ {
+ int ret = write_l3_cbm(socket, cos, cbm);
+
+ if ( ret )
+ {
+ spin_unlock(&info->cbm_lock);
+ return ret;
+ }
+ found->cbm = cbm;
+ }
+
+ found->ref++;
+ map[old_cos].ref--;
+ spin_unlock(&info->cbm_lock);
+
+ d->arch.psr_cos_ids[socket] = cos;
+
+ return 0;
+}
+
+/* Called with domain lock held, no extra lock needed for 'psr_cos_ids' */
+static void psr_free_cos(struct domain *d)
+{
+ unsigned int socket;
+ unsigned int cos;
+ struct psr_cat_socket_info *info;
+
+ if( !d->arch.psr_cos_ids )
+ return;
+
+ for_each_set_bit(socket, cat_socket_enable, nr_sockets)
+ {
+ if ( (cos = d->arch.psr_cos_ids[socket]) == 0 )
+ continue;
+
+ info = cat_socket_info + socket;
+ spin_lock(&info->cbm_lock);
+ info->cos_to_cbm[cos].ref--;
+ spin_unlock(&info->cbm_lock);
+ }
+
+ xfree(d->arch.psr_cos_ids);
+ d->arch.psr_cos_ids = NULL;
+}
+
+int psr_domain_init(struct domain *d)
+{
+ if ( cat_socket_info )
+ {
+ d->arch.psr_cos_ids = xzalloc_array(unsigned int, nr_sockets);
+ if ( !d->arch.psr_cos_ids )
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void psr_domain_free(struct domain *d)
+{
+ psr_free_rmid(d);
+ psr_free_cos(d);
+}
+
+static int cat_cpu_prepare(unsigned int cpu)
+{
+ if ( !cat_socket_info )
+ return 0;
+
+ if ( temp_cos_to_cbm == NULL &&
+ (temp_cos_to_cbm = xzalloc_array(struct psr_cat_cbm,
+ opt_cos_max + 1UL)) == NULL )
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void cat_cpu_init(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ struct psr_cat_socket_info *info;
+ unsigned int socket;
+ unsigned int cpu = smp_processor_id();
+ const struct cpuinfo_x86 *c = cpu_data + cpu;
+
+ if ( !cpu_has(c, X86_FEATURE_CAT) || c->cpuid_level < PSR_CPUID_LEVEL_CAT )
+ return;
+
+ socket = cpu_to_socket(cpu);
+ if ( test_bit(socket, cat_socket_enable) )
+ return;
+
+ cpuid_count(PSR_CPUID_LEVEL_CAT, 0, &eax, &ebx, &ecx, &edx);
+ if ( ebx & PSR_RESOURCE_TYPE_L3 )
+ {
+ cpuid_count(PSR_CPUID_LEVEL_CAT, 1, &eax, &ebx, &ecx, &edx);
+ info = cat_socket_info + socket;
+ info->cbm_len = (eax & 0x1f) + 1;
+ info->cos_max = min(opt_cos_max, edx & 0xffff);
+
+ info->cos_to_cbm = temp_cos_to_cbm;
+ temp_cos_to_cbm = NULL;
+ /* cos 0 is reserved for the default cbm (all ones). */
+ info->cos_to_cbm[0].cbm = (1ull << info->cbm_len) - 1;
+
+ spin_lock_init(&info->cbm_lock);
+
+ set_bit(socket, cat_socket_enable);
+ printk(XENLOG_INFO "CAT: enabled on socket %u, cos_max:%u, cbm_len:%u\n",
+ socket, info->cos_max, info->cbm_len);
+ }
+}
+
+static void cat_cpu_fini(unsigned int cpu)
+{
+ unsigned int socket = cpu_to_socket(cpu);
+
+ if ( !socket_cpumask[socket] || cpumask_empty(socket_cpumask[socket]) )
{
- wrmsrl(MSR_IA32_PSR_ASSOC, new_val);
- psra->val = new_val;
+ struct psr_cat_socket_info *info = cat_socket_info + socket;
+
+ if ( info->cos_to_cbm )
+ {
+ xfree(info->cos_to_cbm);
+ info->cos_to_cbm = NULL;
+ }
+ clear_bit(socket, cat_socket_enable);
}
}
+static void __init psr_cat_free(void)
+{
+ xfree(cat_socket_enable);
+ cat_socket_enable = NULL;
+ xfree(cat_socket_info);
+ cat_socket_info = NULL;
+}
+
+static void __init init_psr_cat(void)
+{
+ if ( opt_cos_max < 1 )
+ {
+ printk(XENLOG_INFO "CAT: disabled, cos_max is too small\n");
+ return;
+ }
+
+ cat_socket_enable = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_sockets));
+ cat_socket_info = xzalloc_array(struct psr_cat_socket_info, nr_sockets);
+
+ if ( !cat_socket_enable || !cat_socket_info )
+ psr_cat_free();
+}
+
+static int psr_cpu_prepare(unsigned int cpu)
+{
+ return cat_cpu_prepare(cpu);
+}
+
+static void psr_cpu_init(void)
+{
+ if ( cat_socket_info )
+ cat_cpu_init();
+
+ psr_assoc_init();
+}
+
+static void psr_cpu_fini(unsigned int cpu)
+{
+ if ( cat_socket_info )
+ cat_cpu_fini(cpu);
+}
+
+static int cpu_callback(
+ struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int rc = 0;
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch ( action )
+ {
+ case CPU_UP_PREPARE:
+ rc = psr_cpu_prepare(cpu);
+ break;
+ case CPU_STARTING:
+ psr_cpu_init();
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ psr_cpu_fini(cpu);
+ break;
+ }
+
+ return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
+}
+
+static struct notifier_block cpu_nfb = {
+ .notifier_call = cpu_callback,
+ /*
+ * Ensure socket_cpumask is still valid in the CPU_DEAD notification
+ * (i.e. our CPU_DEAD notification should be called ahead of
+ * cpu_smpboot_free).
+ */
+ .priority = -1
+};
+
+static int __init psr_presmp_init(void)
+{
+ if ( (opt_psr & PSR_CMT) && opt_rmid_max )
+ init_psr_cmt(opt_rmid_max);
+
+ if ( opt_psr & PSR_CAT )
+ init_psr_cat();
+
+ if ( psr_cpu_prepare(0) )
+ psr_cat_free();
+
+ psr_cpu_init();
+ if ( psr_cmt_enabled() || cat_socket_info )
+ register_cpu_notifier(&cpu_nfb);
+
+ return 0;
+}
+presmp_initcall(psr_presmp_init);
+
/*
* Local variables:
* mode: C
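
[psr_check_cbm() above accepts only a non-empty, contiguous run of set bits within cbm_len, using find_first_bit()/find_next_zero_bit(). The same property can be checked with plain bit arithmetic: adding a mask's lowest set bit collapses a single contiguous run into one power of two. A sketch of that equivalent test, not the Xen implementation:]

    #include <stdint.h>
    #include <stdio.h>

    /* Return 1 iff cbm has >= 1 set bit, all within [0, cbm_len), contiguous. */
    static int cbm_is_valid(unsigned int cbm_len, uint64_t cbm)
    {
        uint64_t t;

        if ( cbm == 0 || (cbm_len < 64 && (cbm >> cbm_len)) )
            return 0;
        t = cbm + (cbm & -cbm);  /* one power of two iff the run is contiguous */
        return (t & (t - 1)) == 0;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               cbm_is_valid(20, 0x0f0),  /* 1: contiguous */
               cbm_is_valid(20, 0x0f1),  /* 0: hole in the run */
               cbm_is_valid(8,  0x100)); /* 0: out of range */
        return 0;
    }
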
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index fefa0b7..3946e4c 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -49,6 +49,7 @@
#include <xen/cpu.h>
#include <asm/nmi.h>
#include <asm/alternative.h>
+#include <asm/mc146818rtc.h>
/* opt_nosmp: If true, secondary processors are ignored. */
static bool_t __initdata opt_nosmp;
@@ -100,7 +101,7 @@ unsigned long __read_mostly xen_virt_end;
DEFINE_PER_CPU(struct tss_struct, init_tss);
-char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
+char __section(".bss.stack_aligned") cpu0_stack[STACK_SIZE];
struct cpuinfo_x86 __read_mostly boot_cpu_data = { 0, 0, 0, 0, -1 };
@@ -140,13 +141,21 @@ static void __init parse_acpi_param(char *s)
static const module_t *__initdata initial_images;
static unsigned int __initdata nr_initial_images;
-unsigned long __init initial_images_nrpages(void)
+unsigned long __init initial_images_nrpages(nodeid_t node)
{
+ unsigned long node_start = node_start_pfn(node);
+ unsigned long node_end = node_end_pfn(node);
unsigned long nr;
unsigned int i;
for ( nr = i = 0; i < nr_initial_images; ++i )
- nr += PFN_UP(initial_images[i].mod_end);
+ {
+ unsigned long start = initial_images[i].mod_start;
+ unsigned long end = start + PFN_UP(initial_images[i].mod_end);
+
+ if ( end > node_start && node_end > start )
+ nr += min(node_end, end) - max(node_start, start);
+ }
return nr;
}
@@ -177,7 +186,7 @@ static void free_xen_data(char *s, char *e)
memguard_guard_range(__va(__pa(s)), e-s);
}
-extern char __init_begin[], __init_end[], __bss_start[];
+extern char __init_begin[], __init_end[], __bss_start[], __bss_end[];
static void __init init_idle_domain(void)
{
@@ -188,7 +197,7 @@ static void __init init_idle_domain(void)
void __devinit srat_detect_node(int cpu)
{
- unsigned node;
+ nodeid_t node;
u32 apicid = x86_cpu_to_apicid[cpu];
node = apicid_to_node[apicid];
@@ -386,8 +395,13 @@ static void __init setup_max_pdx(unsigned long top_page)
if ( max_pdx > FRAMETABLE_NR )
max_pdx = FRAMETABLE_NR;
+ if ( max_pdx > MPT_VIRT_SIZE / sizeof(unsigned long) )
+ max_pdx = MPT_VIRT_SIZE / sizeof(unsigned long);
+
+#ifdef PAGE_LIST_NULL
if ( max_pdx >= PAGE_LIST_NULL )
max_pdx = PAGE_LIST_NULL - 1;
+#endif
max_page = pdx_to_pfn(max_pdx - 1) + 1;
}
@@ -493,6 +507,10 @@ static void __init kexec_reserve_area(struct e820map *e820)
static void noinline init_done(void)
{
+ system_state = SYS_STATE_active;
+
+ domain_unpause_by_systemcontroller(hardware_domain);
+
/* Free (or page-protect) the init areas. */
memset(__init_begin, 0xcc, __init_end - __init_begin); /* int3 poison */
free_xen_data(__init_begin, __init_end);
@@ -501,6 +519,23 @@ static void noinline init_done(void)
startup_cpu_idle_loop();
}
+/* Reinitialise all state referring to the old virtual address of the stack. */
+static void __init noreturn reinit_bsp_stack(void)
+{
+ unsigned long *stack = (void*)(get_stack_bottom() & ~(STACK_SIZE - 1));
+
+ /* Update TSS and ISTs */
+ load_system_tables();
+
+ /* Update SYSCALL trampolines */
+ percpu_traps_init();
+
+ stack_base[0] = stack;
+ memguard_guard_stack(stack);
+
+ reset_stack_and_jump(init_done);
+}
+
static bool_t __init loader_is_grub2(const char *loader_name)
{
/* GRUB1="GNU GRUB 0.xx"; GRUB2="GRUB 1.xx" */
@@ -660,9 +695,6 @@ void __init noreturn __start_xen(unsigned long mbi_p)
if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
panic("dom0 kernel not specified. Check bootloader configuration.");
- if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
- panic("Misaligned CPU0 stack.");
-
if ( efi_enabled )
{
set_pdx_range(xen_phys_start >> PAGE_SHIFT,
@@ -887,7 +919,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
/* The only data mappings to be relocated are in the Xen area. */
pl2e = __va(__pa(l2_xenmap));
*pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
- PAGE_HYPERVISOR | _PAGE_PSE);
+ PAGE_HYPERVISOR_RWX | _PAGE_PSE);
for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
{
if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
@@ -970,7 +1002,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
setup_max_pdx(raw_max_page);
if ( highmem_start )
- xenheap_max_mfn(PFN_DOWN(highmem_start));
+ xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
/*
* Walk every RAM region and map it in its entirety (on x86/64, at least)
@@ -1074,7 +1106,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
/* This range must not be passed to the boot allocator and
* must also not be mapped with _PAGE_GLOBAL. */
map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e),
- PFN_DOWN(e - map_e), __PAGE_HYPERVISOR);
+ PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
}
if ( s < map_s )
{
@@ -1151,9 +1183,6 @@ void __init noreturn __start_xen(unsigned long mbi_p)
numa_initmem_init(0, raw_max_page);
- end_boot_allocator();
- system_state = SYS_STATE_boot;
-
if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
{
unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
@@ -1162,6 +1191,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
if ( !highmem_start )
xenheap_max_mfn(limit);
+ end_boot_allocator();
+
/* Pass the remaining memory to the allocator. */
for ( i = 0; i < boot_e820.nr_map; i++ )
{
@@ -1185,6 +1216,10 @@ void __init noreturn __start_xen(unsigned long mbi_p)
opt_tmem = 0;
}
}
+ else
+ end_boot_allocator();
+
+ system_state = SYS_STATE_boot;
vm_init();
console_init_ring();
@@ -1199,9 +1234,6 @@ void __init noreturn __start_xen(unsigned long mbi_p)
tboot_probe();
- /* Unmap the first page of CPU0's stack. */
- memguard_guard_stack(cpu0_stack);
-
open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);
if ( opt_watchdog )
@@ -1255,16 +1287,6 @@ void __init noreturn __start_xen(unsigned long mbi_p)
timer_init();
- init_idle_domain();
-
- trap_init();
-
- rcu_init();
-
- early_time_init();
-
- arch_init_memory();
-
identify_cpu(&boot_cpu_data);
if ( cpu_has_fxsr )
@@ -1285,6 +1307,20 @@ void __init noreturn __start_xen(unsigned long mbi_p)
if ( cpu_has_fsgsbase )
set_in_cr4(X86_CR4_FSGSBASE);
+ init_idle_domain();
+
+ this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
+ &this_cpu(stubs).mfn);
+ BUG_ON(!this_cpu(stubs.addr));
+
+ trap_init();
+
+ rcu_init();
+
+ early_time_init();
+
+ arch_init_memory();
+
alternative_instructions();
local_irq_enable();
@@ -1347,8 +1383,12 @@ void __init noreturn __start_xen(unsigned long mbi_p)
if ( opt_dom0pvh )
domcr_flags |= DOMCRF_pvh | DOMCRF_hap;
- /* Create initial domain 0. */
- dom0 = domain_create(0, domcr_flags, 0);
+ /*
+ * Create initial domain 0.
+ * x86 doesn't support arch-configuration, so it's fine to pass
+ * NULL.
+ */
+ dom0 = domain_create(0, domcr_flags, 0, NULL);
if ( IS_ERR(dom0) || (alloc_dom0_vcpu0(dom0) == NULL) )
panic("Error creating domain 0");
@@ -1402,6 +1442,10 @@ void __init noreturn __start_xen(unsigned long mbi_p)
if ( cpu_has_smap )
write_cr4(read_cr4() & ~X86_CR4_SMAP);
+ printk("%sNX (Execute Disable) protection %sactive\n",
+ cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
+ cpu_has_nx ? "" : "not ");
+
/*
* We're going to setup domain0 using the module(s) that we stashed safely
* above our heap. The second module, if present, is an initrd ramdisk.
@@ -1429,11 +1473,13 @@ void __init noreturn __start_xen(unsigned long mbi_p)
dmi_end_boot();
- system_state = SYS_STATE_active;
-
- domain_unpause_by_systemcontroller(dom0);
+ setup_io_bitmap(dom0);
- reset_stack_and_jump(init_done);
+ /* Jump to the 1:1 virtual mappings of cpu0_stack. */
+ asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
+ [stk] "g" (__va(__pa(get_stack_bottom()))),
+ [fn] "i" (reinit_bsp_stack) : "memory");
+ unreachable();
}
void arch_get_xen_caps(xen_capabilities_info_t *info)
@@ -1480,7 +1526,7 @@ int __hwdom_init xen_in_range(unsigned long mfn)
xen_regions[region_text].e = __pa(&__init_begin);
/* bss */
xen_regions[region_bss].s = __pa(&__bss_start);
- xen_regions[region_bss].e = __pa(&_end);
+ xen_regions[region_bss].e = __pa(&__bss_end);
}
start = (paddr_t)mfn << PAGE_SHIFT;
@@ -1492,6 +1538,42 @@ int __hwdom_init xen_in_range(unsigned long mfn)
return 0;
}
+static int __hwdom_init io_bitmap_cb(unsigned long s, unsigned long e,
+ void *ctx)
+{
+ struct domain *d = ctx;
+ unsigned int i;
+
+ ASSERT(e <= INT_MAX);
+ for ( i = s; i <= e; i++ )
+ __clear_bit(i, d->arch.hvm_domain.io_bitmap);
+
+ return 0;
+}
+
+void __hwdom_init setup_io_bitmap(struct domain *d)
+{
+ int rc;
+
+ if ( has_hvm_container_domain(d) )
+ {
+ bitmap_fill(d->arch.hvm_domain.io_bitmap, 0x10000);
+ rc = rangeset_report_ranges(d->arch.ioport_caps, 0, 0x10000,
+ io_bitmap_cb, d);
+ BUG_ON(rc);
+ /*
+ * NB: we need to trap accesses to 0xcf8 in order to intercept
+ * 4-byte accesses, which need to be handled by Xen in order to
+ * keep consistency.
+ * Accesses to the 1-byte RTC ports also need to be trapped in
+ * order to keep consistency with PV.
+ */
+ __set_bit(0xcf8, d->arch.hvm_domain.io_bitmap);
+ __set_bit(RTC_PORT(0), d->arch.hvm_domain.io_bitmap);
+ __set_bit(RTC_PORT(1), d->arch.hvm_domain.io_bitmap);
+ }
+}
+
/*
* Local variables:
* mode: C
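
[setup_io_bitmap() above is deny-by-default: fill the bitmap with ones (trap everything), clear the port ranges the domain may touch, then force the bits for 0xcf8 and the RTC index/data ports back on. A self-contained sketch of that shape; 0x70/0x71 are the conventional RTC_PORT(0)/RTC_PORT(1) values, and the allowed range is made up:]

    #include <limits.h>
    #include <stdio.h>
    #include <string.h>

    #define NR_PORTS 0x10000
    #define BPL (sizeof(unsigned long) * CHAR_BIT)

    static unsigned long io_bitmap[NR_PORTS / BPL];

    static void set_port(unsigned int p)   { io_bitmap[p / BPL] |=  1UL << (p % BPL); }
    static void clear_port(unsigned int p) { io_bitmap[p / BPL] &= ~(1UL << (p % BPL)); }

    int main(void)
    {
        unsigned int p;

        memset(io_bitmap, 0xff, sizeof(io_bitmap));  /* trap everything */
        for ( p = 0x3f8; p <= 0x3ff; p++ )           /* example allowed range */
            clear_port(p);
        set_port(0xcf8);    /* always intercept the PCI config address port */
        set_port(0x70);     /* RTC index port */
        set_port(0x71);     /* RTC data port */
        printf("port 0x3f8 trapped: %lu\n",
               (io_bitmap[0x3f8 / BPL] >> (0x3f8 % BPL)) & 1);
        return 0;
    }
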
diff --git a/xen/arch/x86/shutdown.c b/xen/arch/x86/shutdown.c
index 9ec8f97..0e1499d 100644
--- a/xen/arch/x86/shutdown.c
+++ b/xen/arch/x86/shutdown.c
@@ -33,6 +33,7 @@ enum reboot_type {
BOOT_KBD = 'k',
BOOT_ACPI = 'a',
BOOT_CF9 = 'p',
+ BOOT_CF9_PWR = 'P',
BOOT_EFI = 'e',
};
@@ -47,6 +48,7 @@ static int reboot_mode;
* kbd Use the keyboard controller. cold reset (default)
* acpi Use the RESET_REG in the FADT
* pci Use the so-called "PCI reset register", CF9
+ * Power Like 'pci' but for a full power-cycle reset
* efi Use the EFI reboot (if running under EFI)
*/
static enum reboot_type reboot_type = BOOT_INVALID;
@@ -68,8 +70,9 @@ static void __init set_reboot_type(char *str)
case 'a':
case 'e':
case 'k':
- case 't':
+ case 'P':
case 'p':
+ case 't':
reboot_type = *str;
break;
}
@@ -571,11 +574,18 @@ void machine_restart(unsigned int delay_millisecs)
reboot_type = BOOT_KBD;
break;
case BOOT_CF9:
+ case BOOT_CF9_PWR:
{
- u8 cf9 = inb(0xcf9) & ~6;
- outb(cf9|2, 0xcf9); /* Request hard reset */
+ u8 cf9 = inb(0xcf9) & ~0x0e;
+
+ /* Request warm, hard, or power-cycle reset. */
+ if ( reboot_type == BOOT_CF9_PWR )
+ cf9 |= 0x0a;
+ else if ( reboot_mode == 0 )
+ cf9 |= 0x02;
+ outb(cf9, 0xcf9);
udelay(50);
- outb(cf9|6, 0xcf9); /* Actually do the reset */
+ outb(cf9 | 0x04, 0xcf9); /* Actually do the reset. */
udelay(50);
}
reboot_type = BOOT_ACPI;
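
[In the CF9 path above, bit 1 of port 0xcf9 selects a hard rather than warm reset, bit 3 requests a full power cycle, and bit 2 triggers the reset; hence the 0x0e mask, the 0x0a/0x02 type selection, and the final OR with 0x04. A sketch of the value computation with port I/O stubbed out, since real port writes need ring-0:]

    #include <stdint.h>
    #include <stdio.h>

    static void outb_stub(uint8_t val, uint16_t port)
    {
        printf("outb(0x%02x, 0x%x)\n", val, port); /* real code does port I/O */
    }

    static void cf9_reset(uint8_t cur, int power_cycle, int warm)
    {
        uint8_t cf9 = cur & ~0x0e;        /* clear reset-type and trigger bits */

        if ( power_cycle )
            cf9 |= 0x0a;                  /* full power cycle, hard reset */
        else if ( !warm )
            cf9 |= 0x02;                  /* hard (cold) reset */
        outb_stub(cf9, 0xcf9);            /* latch the reset type */
        outb_stub(cf9 | 0x04, 0xcf9);     /* actually do the reset */
    }

    int main(void)
    {
        cf9_reset(0x00, 1, 0);
        cf9_reset(0x00, 0, 0);
        return 0;
    }
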
diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
index 06a833e..8caa0bc 100644
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -311,9 +311,9 @@ void smp_send_stop(void)
mdelay(1);
local_irq_disable();
- __stop_this_cpu();
disable_IO_APIC();
hpet_disable();
+ __stop_this_cpu();
local_irq_enable();
}
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index c54be7e..0d55d7f 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -25,6 +24,7 @@
#include <xen/kernel.h>
#include <xen/mm.h>
#include <xen/domain.h>
+#include <xen/domain_page.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/irq.h>
@@ -59,6 +59,10 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);
cpumask_t cpu_online_map __read_mostly;
EXPORT_SYMBOL(cpu_online_map);
+unsigned int __read_mostly nr_sockets;
+cpumask_t **__read_mostly socket_cpumask;
+static cpumask_t *secondary_socket_cpumask;
+
struct cpuinfo_x86 cpu_data[NR_CPUS];
u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
@@ -80,11 +84,21 @@ void *stack_base[NR_CPUS];
static void smp_store_cpu_info(int id)
{
struct cpuinfo_x86 *c = cpu_data + id;
+ unsigned int socket;
*c = boot_cpu_data;
if ( id != 0 )
+ {
identify_cpu(c);
+ socket = cpu_to_socket(id);
+ if ( !socket_cpumask[socket] )
+ {
+ socket_cpumask[socket] = secondary_socket_cpumask;
+ secondary_socket_cpumask = NULL;
+ }
+ }
+
/*
* Certain Athlons might work (for various values of 'work') in SMP
* but they are not certified as MP capable.
@@ -142,7 +156,7 @@ static void synchronize_tsc_master(unsigned int slave)
for ( i = 1; i <= 5; i++ )
{
- rdtscll(tsc_value);
+ tsc_value = rdtsc();
wmb();
atomic_inc(&tsc_count);
while ( atomic_read(&tsc_count) != (i<<1) )
@@ -244,6 +258,8 @@ static void set_cpu_sibling_map(int cpu)
cpumask_set_cpu(cpu, &cpu_sibling_setup_map);
+ cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
+
if ( c[cpu].x86_num_siblings > 1 )
{
for_each_cpu ( i, &cpu_sibling_setup_map )
@@ -603,6 +619,43 @@ static int do_boot_cpu(int apicid, int cpu)
return rc;
}
+#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
+
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
+{
+ unsigned long stub_va;
+ struct page_info *pg;
+
+ BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));
+
+ if ( *mfn )
+ pg = mfn_to_page(*mfn);
+ else
+ {
+ nodeid_t node = cpu_to_node(cpu);
+ unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
+
+ pg = alloc_domheap_page(NULL, memflags);
+ if ( !pg )
+ return 0;
+
+ unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
+ }
+
+ stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
+ if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
+ PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
+ {
+ if ( !*mfn )
+ free_domheap_page(pg);
+ stub_va = 0;
+ }
+ else if ( !*mfn )
+ *mfn = page_to_mfn(pg);
+
+ return stub_va;
+}
+
void cpu_exit_clear(unsigned int cpu)
{
cpu_uninit(cpu);
@@ -611,11 +664,40 @@ void cpu_exit_clear(unsigned int cpu)
static void cpu_smpboot_free(unsigned int cpu)
{
- unsigned int order;
+ unsigned int order, socket = cpu_to_socket(cpu);
+ struct cpuinfo_x86 *c = cpu_data;
+
+ if ( cpumask_empty(socket_cpumask[socket]) )
+ {
+ xfree(socket_cpumask[socket]);
+ socket_cpumask[socket] = NULL;
+ }
+
+ c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
+ c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
+ c[cpu].compute_unit_id = INVALID_CUID;
+ cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
free_cpumask_var(per_cpu(cpu_core_mask, cpu));
+ if ( per_cpu(stubs.addr, cpu) )
+ {
+ unsigned long mfn = per_cpu(stubs.mfn, cpu);
+ unsigned char *stub_page = map_domain_page(_mfn(mfn));
+ unsigned int i;
+
+ memset(stub_page + STUB_BUF_CPU_OFFS(cpu), 0xcc, STUB_BUF_SIZE);
+ for ( i = 0; i < STUBS_PER_PAGE; ++i )
+ if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
+ break;
+ unmap_domain_page(stub_page);
+ destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
+ (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
+ if ( i == STUBS_PER_PAGE )
+ free_domheap_page(mfn_to_page(mfn));
+ }
+
order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
free_xenheap_pages(per_cpu(gdt_table, cpu), order);
@@ -635,36 +717,56 @@ static void cpu_smpboot_free(unsigned int cpu)
static int cpu_smpboot_alloc(unsigned int cpu)
{
- unsigned int order;
+ unsigned int i, order, memflags = 0;
+ nodeid_t node = cpu_to_node(cpu);
struct desc_struct *gdt;
+ unsigned long stub_page;
- stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, 0);
+ if ( node != NUMA_NO_NODE )
+ memflags = MEMF_node(node);
+
+ stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
if ( stack_base[cpu] == NULL )
goto oom;
memguard_guard_stack(stack_base[cpu]);
order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
- per_cpu(gdt_table, cpu) = gdt =
- alloc_xenheap_pages(order, MEMF_node(cpu_to_node(cpu)));
+ per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
if ( gdt == NULL )
goto oom;
memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
BUILD_BUG_ON(NR_CPUS > 0x10000);
gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
- per_cpu(compat_gdt_table, cpu) = gdt =
- alloc_xenheap_pages(order, MEMF_node(cpu_to_node(cpu)));
+ per_cpu(compat_gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
if ( gdt == NULL )
goto oom;
memcpy(gdt, boot_cpu_compat_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
- idt_tables[cpu] = alloc_xenheap_pages(order, MEMF_node(cpu_to_node(cpu)));
+ idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
if ( idt_tables[cpu] == NULL )
goto oom;
memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
+ for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
+ i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
+ if ( cpu_online(i) && cpu_to_node(i) == node )
+ {
+ per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
+ break;
+ }
+ BUG_ON(i == cpu);
+ stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
+ if ( !stub_page )
+ goto oom;
+ per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);
+
+ if ( secondary_socket_cpumask == NULL &&
+ (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
+ goto oom;
+
if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) )
return 0;
@@ -715,6 +817,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
stack_base[0] = stack_start;
+ set_nr_sockets();
+
+ socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
+ if ( socket_cpumask == NULL ||
+ (socket_cpumask[cpu_to_socket(0)] = xzalloc(cpumask_t)) == NULL )
+ panic("No memory for socket CPU siblings map");
+
if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, 0)) ||
!zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) )
panic("No memory for boot CPU sibling/core maps");
@@ -778,24 +887,21 @@ static void
remove_siblinginfo(int cpu)
{
int sibling;
- struct cpuinfo_x86 *c = cpu_data;
+
+ cpumask_clear_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
for_each_cpu ( sibling, per_cpu(cpu_core_mask, cpu) )
{
cpumask_clear_cpu(cpu, per_cpu(cpu_core_mask, sibling));
/* Last thread sibling in this cpu core going down. */
if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
- c[sibling].booted_cores--;
+ cpu_data[sibling].booted_cores--;
}
for_each_cpu(sibling, per_cpu(cpu_sibling_mask, cpu))
cpumask_clear_cpu(cpu, per_cpu(cpu_sibling_mask, sibling));
cpumask_clear(per_cpu(cpu_sibling_mask, cpu));
cpumask_clear(per_cpu(cpu_core_mask, cpu));
- c[cpu].phys_proc_id = BAD_APICID;
- c[cpu].cpu_core_id = BAD_APICID;
- c[cpu].compute_unit_id = BAD_APICID;
- cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
}
void __cpu_disable(void)
@@ -816,7 +922,6 @@ void __cpu_disable(void)
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
- cpumask_clear_cpu(cpu, cpupool0->cpu_valid);
cpumask_clear_cpu(cpu, &cpu_online_map);
fixup_irqs();
@@ -843,7 +948,7 @@ void __cpu_die(unsigned int cpu)
int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm)
{
- int node, cpu = -1;
+ int cpu = -1;
dprintk(XENLOG_DEBUG, "cpu_add apic_id %x acpi_id %x pxm %x\n",
apic_id, acpi_id, pxm);
@@ -877,7 +982,9 @@ int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm)
if ( !srat_disabled() )
{
- if ( (node = setup_node(pxm)) < 0 )
+ nodeid_t node = setup_node(pxm);
+
+ if ( node == NUMA_NO_NODE )
{
dprintk(XENLOG_WARNING,
"Setup node failed for pxm %x\n", pxm);
diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
index 29fc724..4242d10 100644
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -25,40 +25,85 @@ static struct acpi_table_slit *__read_mostly acpi_slit;
static nodemask_t memory_nodes_parsed __initdata;
static nodemask_t processor_nodes_parsed __initdata;
-static nodemask_t nodes_found __initdata;
static struct node nodes[MAX_NUMNODES] __initdata;
-static u8 __read_mostly pxm2node[256] = { [0 ... 255] = NUMA_NO_NODE };
+struct pxm2node {
+ unsigned pxm;
+ nodeid_t node;
+};
+static struct pxm2node __read_mostly pxm2node[MAX_NUMNODES] =
+ { [0 ... MAX_NUMNODES - 1] = {.node = NUMA_NO_NODE} };
+
+static unsigned node_to_pxm(nodeid_t n);
static int num_node_memblks;
static struct node node_memblk_range[NR_NODE_MEMBLKS];
-static int memblk_nodeid[NR_NODE_MEMBLKS];
-
+static nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
+static __initdata DECLARE_BITMAP(memblk_hotplug, NR_NODE_MEMBLKS);
-static int node_to_pxm(int n);
+static inline bool_t node_found(unsigned idx, unsigned pxm)
+{
+ return ((pxm2node[idx].pxm == pxm) &&
+ (pxm2node[idx].node != NUMA_NO_NODE));
+}
-int pxm_to_node(int pxm)
+nodeid_t pxm_to_node(unsigned pxm)
{
- if ((unsigned)pxm >= 256)
- return -1;
- /* Extend 0xff to (int)-1 */
- return (signed char)pxm2node[pxm];
+ unsigned i;
+
+ if ((pxm < ARRAY_SIZE(pxm2node)) && node_found(pxm, pxm))
+ return pxm2node[pxm].node;
+
+ for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
+ if (node_found(i, pxm))
+ return pxm2node[i].node;
+
+ return NUMA_NO_NODE;
}
-__devinit int setup_node(int pxm)
+nodeid_t setup_node(unsigned pxm)
{
- unsigned node = pxm2node[pxm];
- if (node == 0xff) {
- if (nodes_weight(nodes_found) >= MAX_NUMNODES)
- return -1;
- node = first_unset_node(nodes_found);
- node_set(node, nodes_found);
- pxm2node[pxm] = node;
+ nodeid_t node;
+ unsigned idx;
+ static bool_t warned;
+ static unsigned nodes_found;
+
+ BUILD_BUG_ON(MAX_NUMNODES >= NUMA_NO_NODE);
+
+ if (pxm < ARRAY_SIZE(pxm2node)) {
+ if (node_found(pxm, pxm))
+ return pxm2node[pxm].node;
+
+ /* Try to maintain indexing of pxm2node by pxm */
+ if (pxm2node[pxm].node == NUMA_NO_NODE) {
+ idx = pxm;
+ goto finish;
+ }
+ }
+
+ for (idx = 0; idx < ARRAY_SIZE(pxm2node); idx++)
+ if (pxm2node[idx].node == NUMA_NO_NODE)
+ goto finish;
+
+ if (!warned) {
+ printk(KERN_WARNING "SRAT: Too many proximity domains (%#x)\n",
+ pxm);
+ warned = 1;
}
- return pxm2node[pxm];
+
+ return NUMA_NO_NODE;
+
+ finish:
+ node = nodes_found++;
+ if (node >= MAX_NUMNODES)
+ return NUMA_NO_NODE;
+ pxm2node[idx].pxm = pxm;
+ pxm2node[idx].node = node;
+
+ return node;
}
-int valid_numa_range(u64 start, u64 end, int node)
+int valid_numa_range(u64 start, u64 end, nodeid_t node)
{
int i;
@@ -82,9 +127,9 @@ static __init int conflicting_memblks(u64 start, u64 end)
if (nd->start == nd->end)
continue;
if (nd->end > start && nd->start < end)
- return memblk_nodeid[i];
+ return i;
if (nd->end == end && nd->start == start)
- return memblk_nodeid[i];
+ return i;
}
return -1;
}
@@ -112,7 +157,7 @@ static __init void bad_srat(void)
for (i = 0; i < MAX_LOCAL_APIC; i++)
apicid_to_node[i] = NUMA_NO_NODE;
for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
- pxm2node[i] = NUMA_NO_NODE;
+ pxm2node[i].node = NUMA_NO_NODE;
mem_hotplug = 0;
}
@@ -162,8 +207,9 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
void __init
acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
{
- int pxm, node;
- int apic_id;
+ unsigned pxm;
+ nodeid_t node;
+ u32 apic_id;
if (srat_disabled())
return;
@@ -175,8 +221,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
return;
pxm = pa->proximity_domain;
node = setup_node(pxm);
- if (node < 0) {
- printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+ if (node == NUMA_NO_NODE) {
bad_srat();
return;
}
@@ -192,7 +237,9 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
- int pxm, node;
+ unsigned pxm;
+ nodeid_t node;
+
if (srat_disabled())
return;
if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
@@ -208,8 +255,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
pxm |= pa->proximity_domain_hi[2] << 24;
}
node = setup_node(pxm);
- if (node < 0) {
- printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+ if (node == NUMA_NO_NODE) {
bad_srat();
return;
}
@@ -224,9 +270,9 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
- struct node *nd;
u64 start, end;
- int node, pxm;
+ unsigned pxm;
+ nodeid_t node;
int i;
if (srat_disabled())
@@ -252,37 +298,46 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
if (srat_rev < 2)
pxm &= 0xff;
node = setup_node(pxm);
- if (node < 0) {
- printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+ if (node == NUMA_NO_NODE) {
bad_srat();
return;
}
/* It is fine to add this area to the nodes data; it will be used later */
i = conflicting_memblks(start, end);
- if (i == node) {
- printk(KERN_WARNING
- "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%"
- PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end);
- } else if (i >= 0) {
+ if (i < 0)
+ /* everything fine */;
+ else if (memblk_nodeid[i] == node) {
+ bool_t mismatch = !(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) !=
+ !test_bit(i, memblk_hotplug);
+
+ printk("%sSRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with itself (%"PRIx64"-%"PRIx64")\n",
+ mismatch ? KERN_ERR : KERN_WARNING, pxm, start, end,
+ node_memblk_range[i].start, node_memblk_range[i].end);
+ if (mismatch) {
+ bad_srat();
+ return;
+ }
+ } else {
printk(KERN_ERR
- "SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%"
- PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i),
- nodes[i].start, nodes[i].end);
+ "SRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with PXM %u (%"PRIx64"-%"PRIx64")\n",
+ pxm, start, end, node_to_pxm(memblk_nodeid[i]),
+ node_memblk_range[i].start, node_memblk_range[i].end);
bad_srat();
return;
}
- nd = &nodes[node];
- if (!node_test_and_set(node, memory_nodes_parsed)) {
- nd->start = start;
- nd->end = end;
- } else {
- if (start < nd->start)
+ if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
+ struct node *nd = &nodes[node];
+
+ if (!node_test_and_set(node, memory_nodes_parsed)) {
nd->start = start;
- if (nd->end < end)
nd->end = end;
+ } else {
+ if (start < nd->start)
+ nd->start = start;
+ if (nd->end < end)
+ nd->end = end;
+ }
}
- if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && end > mem_hotplug)
- mem_hotplug = end;
printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"%s\n",
node, pxm, start, end,
ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? " (hotplug)" : "");
@@ -290,12 +345,17 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
node_memblk_range[num_node_memblks].start = start;
node_memblk_range[num_node_memblks].end = end;
memblk_nodeid[num_node_memblks] = node;
+ if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+ __set_bit(num_node_memblks, memblk_hotplug);
+ if (end > mem_hotplug)
+ mem_hotplug = end;
+ }
num_node_memblks++;
}
/* Sanity check to catch more bad SRATs (they are amazingly common).
Make sure the PXMs cover all memory. */
-static int nodes_cover_memory(void)
+static int __init nodes_cover_memory(void)
{
int i;
@@ -438,25 +498,33 @@ int __init acpi_scan_nodes(u64 start, u64 end)
return 0;
}
-static int node_to_pxm(int n)
+static unsigned node_to_pxm(nodeid_t n)
{
- int i;
- if (pxm2node[n] == n)
- return n;
- for (i = 0; i < 256; i++)
- if (pxm2node[i] == n)
- return i;
- return 0;
+ unsigned i;
+
+ if ((n < ARRAY_SIZE(pxm2node)) && (pxm2node[n].node == n))
+ return pxm2node[n].pxm;
+ for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
+ if (pxm2node[i].node == n)
+ return pxm2node[i].pxm;
+ return 0;
}
-int __node_distance(int a, int b)
+u8 __node_distance(nodeid_t a, nodeid_t b)
{
- int index;
+ unsigned index;
+ u8 slit_val;
if (!acpi_slit)
return a == b ? 10 : 20;
index = acpi_slit->locality_count * node_to_pxm(a);
- return acpi_slit->entry[index + node_to_pxm(b)];
+ slit_val = acpi_slit->entry[index + node_to_pxm(b)];
+
+ /* ACPI defines 0xff as an unreachable node and 0-9 are undefined */
+ if ((slit_val == 0xff) || (slit_val <= 9))
+ return NUMA_NO_DISTANCE;
+ else
+ return slit_val;
}
EXPORT_SYMBOL(__node_distance);
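
[The reworked pxm2node[] above stores (pxm, node) pairs and prefers keeping an entry at index == pxm, so small PXMs stay a direct lookup, falling back to a linear scan otherwise. A compact sketch of that allocation policy; the array size and sentinel are illustrative, and the GNU range initializer mirrors the one in the hunk:]

    #include <stdio.h>

    #define MAX_NODES 8
    #define NO_NODE   0xffu

    static struct { unsigned int pxm; unsigned char node; }
        pxm2node[MAX_NODES] = { [0 ... MAX_NODES - 1] = { .node = NO_NODE } };
    static unsigned char nodes_found;

    static unsigned char setup_node(unsigned int pxm)
    {
        unsigned int idx;

        if ( pxm < MAX_NODES )
        {
            if ( pxm2node[pxm].node != NO_NODE && pxm2node[pxm].pxm == pxm )
                return pxm2node[pxm].node;       /* already mapped */
            if ( pxm2node[pxm].node == NO_NODE )
            {
                idx = pxm;                       /* keep identity indexing */
                goto claim;
            }
        }
        for ( idx = 0; idx < MAX_NODES; idx++ )  /* already mapped via scan? */
            if ( pxm2node[idx].node != NO_NODE && pxm2node[idx].pxm == pxm )
                return pxm2node[idx].node;
        for ( idx = 0; idx < MAX_NODES; idx++ )  /* first free slot */
            if ( pxm2node[idx].node == NO_NODE )
                goto claim;
        return NO_NODE;                          /* table full */
     claim:
        pxm2node[idx].pxm = pxm;
        pxm2node[idx].node = nodes_found++;
        return pxm2node[idx].node;
    }

    int main(void)
    {
        printf("pxm 2   -> node %u\n", setup_node(2));
        printf("pxm 2   -> node %u\n", setup_node(2));    /* same node */
        printf("pxm 300 -> node %u\n", setup_node(300));  /* scanned slot */
        return 0;
    }
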
diff --git a/xen/arch/x86/string.c b/xen/arch/x86/string.c
index 3af0ea8..043ae66 100644
--- a/xen/arch/x86/string.c
+++ b/xen/arch/x86/string.c
@@ -15,7 +15,7 @@ void *memcpy(void *dest, const void *src, size_t n)
asm volatile (
" rep ; movs"__OS" ; "
- " mov %4,%3 ; "
+ " mov %k4,%k3 ; "
" rep ; movsb "
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n/BYTES_PER_LONG), "r" (n%BYTES_PER_LONG), "1" (dest), "2" (src)
diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
index 57ad992..38b5dcb 100644
--- a/xen/arch/x86/sysctl.c
+++ b/xen/arch/x86/sysctl.c
@@ -75,7 +75,8 @@ long cpu_down_helper(void *data)
void arch_do_physinfo(xen_sysctl_physinfo_t *pi)
{
- memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4);
+ memcpy(pi->hw_cap, boot_cpu_data.x86_capability,
+ min(sizeof(pi->hw_cap), sizeof(boot_cpu_data.x86_capability)));
if ( hvm_enabled )
pi->capabilities |= XEN_SYSCTL_PHYSCAP_hvm;
if ( iommu_enabled )
@@ -157,6 +158,9 @@ long arch_do_sysctl(
sysctl->u.psr_cmt_op.u.data = (ret ? 0 : info.size);
break;
}
+ case XEN_SYSCTL_PSR_CMT_get_l3_event_mask:
+ sysctl->u.psr_cmt_op.u.data = psr_cmt->l3.features;
+ break;
default:
sysctl->u.psr_cmt_op.u.data = 0;
ret = -ENOSYS;
@@ -168,6 +172,24 @@ long arch_do_sysctl(
break;
+ case XEN_SYSCTL_psr_cat_op:
+ switch ( sysctl->u.psr_cat_op.cmd )
+ {
+ case XEN_SYSCTL_PSR_CAT_get_l3_info:
+ ret = psr_get_cat_l3_info(sysctl->u.psr_cat_op.target,
+ &sysctl->u.psr_cat_op.u.l3_info.cbm_len,
+ &sysctl->u.psr_cat_op.u.l3_info.cos_max);
+
+ if ( !ret && __copy_field_to_guest(u_sysctl, sysctl, u.psr_cat_op) )
+ ret = -EFAULT;
+
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ break;
+
default:
ret = -ENOSYS;
break;
diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c
index ca4839e..88142d2 100644
--- a/xen/arch/x86/tboot.c
+++ b/xen/arch/x86/tboot.c
@@ -48,7 +48,7 @@ static uint64_t __initdata sinit_base, __initdata sinit_size;
#define TXTCR_HEAP_BASE 0x0300
#define TXTCR_HEAP_SIZE 0x0308
-extern char __init_begin[], __bss_start[];
+extern char __init_begin[], __bss_start[], __bss_end[];
#define SHA1_SIZE 20
typedef uint8_t sha1_hash_t[SHA1_SIZE];
@@ -138,7 +138,7 @@ void __init tboot_probe(void)
TXT_PUB_CONFIG_REGS_BASE + TXTCR_SINIT_BASE);
tboot_copy_memory((unsigned char *)&sinit_size, sizeof(sinit_size),
TXT_PUB_CONFIG_REGS_BASE + TXTCR_SINIT_SIZE);
- __set_fixmap(FIX_TBOOT_MAP_ADDRESS, 0, 0);
+ clear_fixmap(FIX_TBOOT_MAP_ADDRESS);
}
/* definitions from xen/drivers/passthrough/vtd/iommu.h
@@ -161,7 +161,7 @@ static void update_iommu_mac(vmac_ctx_t *ctx, uint64_t pt_maddr, int level)
if ( pt_maddr == 0 )
return;
- pt_vaddr = (struct dma_pte *)map_domain_page(pt_maddr >> PAGE_SHIFT_4K);
+ pt_vaddr = (struct dma_pte *)map_domain_page(_mfn(paddr_to_pfn(pt_maddr)));
vmac_update((void *)pt_vaddr, PAGE_SIZE, ctx);
for ( i = 0; i < PTE_NUM; i++ )
@@ -194,7 +194,8 @@ static void update_pagetable_mac(vmac_ctx_t *ctx)
{
if ( page->count_info & PGC_page_table )
{
- void *pg = map_domain_page(mfn);
+ void *pg = map_domain_page(_mfn(mfn));
+
vmac_update(pg, PAGE_SIZE, ctx);
unmap_domain_page(pg);
}
@@ -374,7 +375,7 @@ void tboot_shutdown(uint32_t shutdown_type)
__pa(&_stext);
/* bss */
g_tboot_shared->mac_regions[2].start = (uint64_t)__pa(&__bss_start);
- g_tboot_shared->mac_regions[2].size = __pa(&_end) - __pa(&__bss_start);
+ g_tboot_shared->mac_regions[2].size = __pa(&__bss_end) - __pa(&__bss_start);
/*
* MAC domains and other Xen memory
@@ -435,13 +436,12 @@ int __init tboot_protect_mem_regions(void)
int __init tboot_parse_dmar_table(acpi_table_handler dmar_handler)
{
- struct acpi_table_header *dmar_table;
int rc;
uint64_t size;
uint32_t dmar_table_length;
unsigned long pa;
sinit_mle_data_t sinit_mle_data;
- unsigned char *dmar_table_raw;
+ void *dmar_table;
if ( !tboot_in_measured_env() )
return acpi_table_parse(ACPI_SIG_DMAR, dmar_handler);
@@ -474,13 +474,12 @@ int __init tboot_parse_dmar_table(acpi_table_handler dmar_handler)
tboot_copy_memory((unsigned char *)&dmar_table_length,
sizeof(dmar_table_length),
pa + sizeof(char) * ACPI_NAME_SIZE);
- dmar_table_raw = xmalloc_array(unsigned char, dmar_table_length);
- tboot_copy_memory(dmar_table_raw, dmar_table_length, pa);
- dmar_table = (struct acpi_table_header *)dmar_table_raw;
- __set_fixmap(FIX_TBOOT_MAP_ADDRESS, 0, 0);
+ dmar_table = xmalloc_bytes(dmar_table_length);
+ tboot_copy_memory(dmar_table, dmar_table_length, pa);
+ clear_fixmap(FIX_TBOOT_MAP_ADDRESS);
rc = dmar_handler(dmar_table);
- xfree(dmar_table_raw);
+ xfree(dmar_table);
/* acpi_parse_dmar() zaps ACPI DMAR signature in TXT heap table */
/* but dom0 will read real table, so must zap it there too */
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index 74c01e3..bbb7e6c 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -47,7 +47,8 @@ string_param("clocksource", opt_clocksource);
unsigned long __read_mostly cpu_khz; /* CPU clock frequency in kHz. */
DEFINE_SPINLOCK(rtc_lock);
unsigned long pit0_ticks;
-static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */
+static unsigned long wc_sec; /* UTC time at last 'time update'. */
+static unsigned int wc_nsec;
static DEFINE_SPINLOCK(wc_lock);
struct cpu_time {
@@ -178,7 +179,7 @@ static void smp_send_timer_broadcast_ipi(void)
if ( cpumask_test_cpu(cpu, &mask) )
{
- cpumask_clear_cpu(cpu, &mask);
+ __cpumask_clear_cpu(cpu, &mask);
raise_softirq(TIMER_SOFTIRQ);
}
@@ -260,10 +261,10 @@ static u64 init_pit_and_calibrate_tsc(void)
outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
outb(CALIBRATE_LATCH >> 8, PIT_CH2); /* MSB of count */
- rdtscll(start);
+ start = rdtsc();
for ( count = 0; (inb(0x61) & 0x20) == 0; count++ )
continue;
- rdtscll(end);
+ end = rdtsc();
/* Error if the CTC doesn't behave itself. */
if ( count == 0 )
@@ -763,7 +764,7 @@ s_time_t get_s_time_fixed(u64 at_tsc)
if ( at_tsc )
tsc = at_tsc;
else
- rdtscll(tsc);
+ tsc = rdtsc();
delta = tsc - t->local_tsc_stamp;
now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
@@ -902,6 +903,7 @@ void force_update_vcpu_system_time(struct vcpu *v)
void update_domain_wallclock_time(struct domain *d)
{
uint32_t *wc_version;
+ unsigned long sec;
spin_lock(&wc_lock);
@@ -909,8 +911,19 @@ void update_domain_wallclock_time(struct domain *d)
*wc_version = version_update_begin(*wc_version);
wmb();
- shared_info(d, wc_sec) = wc_sec + d->time_offset_seconds;
- shared_info(d, wc_nsec) = wc_nsec;
+ sec = wc_sec + d->time_offset_seconds;
+ if ( likely(!has_32bit_shinfo(d)) )
+ {
+ d->shared_info->native.wc_sec = sec;
+ d->shared_info->native.wc_nsec = wc_nsec;
+ d->shared_info->native.wc_sec_hi = sec >> 32;
+ }
+ else
+ {
+ d->shared_info->compat.wc_sec = sec;
+ d->shared_info->compat.wc_nsec = wc_nsec;
+ d->shared_info->compat.arch.wc_sec_hi = sec >> 32;
+ }
wmb();
*wc_version = version_update_end(*wc_version);
@@ -931,7 +944,7 @@ static void update_domain_rtc(void)
rcu_read_unlock(&domlist_read_lock);
}
-void domain_set_time_offset(struct domain *d, int32_t time_offset_seconds)
+void domain_set_time_offset(struct domain *d, int64_t time_offset_seconds)
{
d->time_offset_seconds = time_offset_seconds;
if ( is_hvm_domain(d) )
@@ -958,7 +971,7 @@ int cpu_frequency_change(u64 freq)
/* TSC-extrapolated time may be bogus after frequency change. */
/*t->stime_local_stamp = get_s_time();*/
t->stime_local_stamp = t->stime_master_stamp;
- rdtscll(curr_tsc);
+ curr_tsc = rdtsc();
t->local_tsc_stamp = curr_tsc;
set_time_scale(&t->tsc_scale, freq);
local_irq_enable();
@@ -976,13 +989,13 @@ int cpu_frequency_change(u64 freq)
}
/* Set clock to <secs,usecs> after 00:00:00 UTC, 1 January, 1970. */
-void do_settime(unsigned long secs, unsigned long nsecs, u64 system_time_base)
+void do_settime(unsigned long secs, unsigned int nsecs, u64 system_time_base)
{
u64 x;
u32 y;
struct domain *d;
- x = SECONDS(secs) + (u64)nsecs - system_time_base;
+ x = SECONDS(secs) + nsecs - system_time_base;
y = do_div(x, 1000000000);
spin_lock(&wc_lock);
@@ -1294,7 +1307,7 @@ static void time_calibration_tsc_rendezvous(void *_r)
if ( r->master_stime == 0 )
{
r->master_stime = read_platform_stime();
- rdtscll(r->master_tsc_stamp);
+ r->master_tsc_stamp = rdtsc();
}
atomic_inc(&r->semaphore);
@@ -1320,7 +1333,7 @@ static void time_calibration_tsc_rendezvous(void *_r)
}
}
- rdtscll(c->local_tsc_stamp);
+ c->local_tsc_stamp = rdtsc();
c->stime_local_stamp = get_s_time();
c->stime_master_stamp = r->master_stime;
@@ -1350,7 +1363,7 @@ static void time_calibration_std_rendezvous(void *_r)
mb(); /* receive signal /then/ read r->master_stime */
}
- rdtscll(c->local_tsc_stamp);
+ c->local_tsc_stamp = rdtsc();
c->stime_local_stamp = get_s_time();
c->stime_master_stamp = r->master_stime;
@@ -1384,7 +1397,7 @@ void init_percpu_time(void)
t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
local_irq_save(flags);
- rdtscll(t->local_tsc_stamp);
+ t->local_tsc_stamp = rdtsc();
now = read_platform_stime();
local_irq_restore(flags);
@@ -1413,13 +1426,13 @@ static void __init tsc_check_writability(void)
if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
return;
- rdtscll(tsc);
+ tsc = rdtsc();
if ( wrmsr_safe(MSR_IA32_TSC, 0) == 0 )
{
- uint64_t tmp, tmp2;
- rdtscll(tmp2);
+ uint64_t tmp, tmp2 = rdtsc();
+
write_tsc(tsc | (1ULL << 32));
- rdtscll(tmp);
+ tmp = rdtsc();
if ( ABS((s64)tmp - (s64)tmp2) < (1LL << 31) )
what = "only partially";
}
@@ -1764,10 +1777,12 @@ void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs, int rdtscp)
spin_lock(&d->arch.vtsc_lock);
+#if !defined(NDEBUG) || defined(PERF_COUNTERS)
if ( guest_kernel_mode(v, regs) )
d->arch.vtsc_kerncount++;
else
d->arch.vtsc_usercount++;
+#endif
if ( (int64_t)(now - d->arch.vtsc_last) > 0 )
d->arch.vtsc_last = now;
@@ -1853,7 +1868,7 @@ void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
*gtsc_khz = d->arch.tsc_khz;
break;
}
- rdtscll(tsc);
+ tsc = rdtsc();
*elapsed_nsec = scale_delta(tsc, &d->arch.vtsc_to_ns);
*gtsc_khz = cpu_khz;
break;
@@ -1865,7 +1880,7 @@ void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
}
else
{
- rdtscll(tsc);
+ tsc = rdtsc();
*elapsed_nsec = scale_delta(tsc, &d->arch.vtsc_to_ns) -
d->arch.vtsc_offset;
*gtsc_khz = 0; /* ignored by tsc_set_info */
@@ -1958,9 +1973,7 @@ void tsc_set_info(struct domain *d,
else {
/* when using native TSC, offset is nsec relative to power-on
* of physical machine */
- uint64_t tsc = 0;
- rdtscll(tsc);
- d->arch.vtsc_offset = scale_delta(tsc,&d->arch.vtsc_to_ns) -
+ d->arch.vtsc_offset = scale_delta(rdtsc(), &d->arch.vtsc_to_ns) -
elapsed_nsec;
}
break;
@@ -1979,7 +1992,7 @@ void tsc_set_info(struct domain *d,
* call set_tsc_offset() later from hvm_vcpu_reset_state() and they
* will sync their TSC to BSP's sync_tsc.
*/
- rdtscll(d->arch.hvm_domain.sync_tsc);
+ d->arch.hvm_domain.sync_tsc = rdtsc();
hvm_funcs.set_tsc_offset(d->vcpu[0],
d->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset,
d->arch.hvm_domain.sync_tsc);
@@ -2020,17 +2033,13 @@ static void dump_softtsc(unsigned char key)
printk(",khz=%"PRIu32, d->arch.tsc_khz);
if ( d->arch.incarnation )
printk(",inc=%"PRIu32, d->arch.incarnation);
+#if !defined(NDEBUG) || defined(PERF_COUNTERS)
if ( !(d->arch.vtsc_kerncount | d->arch.vtsc_usercount) )
- {
printk("\n");
- continue;
- }
- if ( is_hvm_domain(d) )
- printk(",vtsc count: %"PRIu64" total\n",
- d->arch.vtsc_kerncount);
else
printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
+#endif
domcnt++;
}
diff --git a/xen/arch/x86/trace.c b/xen/arch/x86/trace.c
index 64e9ad0..bd8596c 100644
--- a/xen/arch/x86/trace.c
+++ b/xen/arch/x86/trace.c
@@ -11,7 +11,7 @@ void __trace_hypercall_entry(void)
struct cpu_user_regs *regs = guest_cpu_user_regs();
unsigned long args[6];
- if ( is_pv_32on64_vcpu(current) )
+ if ( is_pv_32bit_vcpu(current) )
{
args[0] = regs->ebx;
args[1] = regs->ecx;
@@ -36,7 +36,7 @@ void __trace_hypercall_entry(void)
void __trace_pv_trap(int trapnr, unsigned long eip,
int use_error_code, unsigned error_code)
{
- if ( is_pv_32on64_vcpu(current) )
+ if ( is_pv_32bit_vcpu(current) )
{
struct __packed {
unsigned eip:32,
@@ -77,7 +77,7 @@ void __trace_pv_page_fault(unsigned long addr, unsigned error_code)
{
unsigned long eip = guest_cpu_user_regs()->eip;
- if ( is_pv_32on64_vcpu(current) )
+ if ( is_pv_32bit_vcpu(current) )
{
struct __packed {
u32 eip, addr, error_code;
@@ -108,7 +108,7 @@ void __trace_pv_page_fault(unsigned long addr, unsigned error_code)
void __trace_trap_one_addr(unsigned event, unsigned long va)
{
- if ( is_pv_32on64_vcpu(current) )
+ if ( is_pv_32bit_vcpu(current) )
{
u32 d = va;
__trace_var(event, 1, sizeof(d), &d);
@@ -123,7 +123,7 @@ void __trace_trap_one_addr(unsigned event, unsigned long va)
void __trace_trap_two_addr(unsigned event, unsigned long va1,
unsigned long va2)
{
- if ( is_pv_32on64_vcpu(current) )
+ if ( is_pv_32bit_vcpu(current) )
{
struct __packed {
u32 va1, va2;
@@ -156,7 +156,7 @@ void __trace_ptwr_emulation(unsigned long addr, l1_pgentry_t npte)
* cases, "unsigned long" is the size of a guest virtual address.
*/
- if ( is_pv_32on64_vcpu(current) )
+ if ( is_pv_32bit_vcpu(current) )
{
struct __packed {
l1_pgentry_t pte;
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 61316ba..9f5a6c6 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
@@ -72,6 +71,7 @@
#include <asm/apic.h>
#include <asm/mc146818rtc.h>
#include <asm/hpet.h>
+#include <asm/vpmu.h>
#include <public/arch-x86/cpuid.h>
#include <xsm/xsm.h>
@@ -124,7 +124,7 @@ static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
if ( is_hvm_vcpu(v) )
return;
- if ( is_pv_32on64_vcpu(v) )
+ if ( is_pv_32bit_vcpu(v) )
{
compat_show_guest_stack(v, regs, debug_stack_lines);
return;
@@ -193,6 +193,70 @@ static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
printk("\n");
}
+/*
+ * Notes for get_stack_trace_bottom() and get_stack_dump_bottom()
+ *
+ * Stack pages 0, 1 and 2:
+ * These are all 1-page IST stacks. Each of these stacks has an exception
+ * frame and saved register state at the top. The interesting bound for a
+ * trace is the word adjacent to this, while the bound for a dump is the
+ * very top, including the exception frame.
+ *
+ * Stack pages 3, 4 and 5:
+ * None of these are particularly interesting. With MEMORY_GUARD, page 5 is
+ * explicitly not present, so attempting to dump or trace it is
+ * counterproductive. Without MEMORY_GUARD, it is possible for a call chain
+ * to use the entire primary stack and wander into page 5. In this case,
+ * consider these pages an extension of the primary stack to aid debugging
+ * hopefully rare situations where the primary stack has effective been
+ * overflown.
+ *
+ * Stack pages 6 and 7:
+ * These form the primary stack, and have a cpu_info at the top. For a
+ * trace, the interesting bound is adjacent to the cpu_info, while for a
+ * dump, the entire cpu_info is interesting.
+ *
+ * For the cases where the stack should not be inspected, pretend that the
+ * passed stack pointer is already out of reasonable bounds.
+ */
+unsigned long get_stack_trace_bottom(unsigned long sp)
+{
+ switch ( get_stack_page(sp) )
+ {
+ case 0 ... 2:
+ return ROUNDUP(sp, PAGE_SIZE) -
+ offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
+
+#ifndef MEMORY_GUARD
+ case 3 ... 5:
+#endif
+ case 6 ... 7:
+ return ROUNDUP(sp, STACK_SIZE) -
+ sizeof(struct cpu_info) - sizeof(unsigned long);
+
+ default:
+ return sp - sizeof(unsigned long);
+ }
+}
+
+unsigned long get_stack_dump_bottom(unsigned long sp)
+{
+ switch ( get_stack_page(sp) )
+ {
+ case 0 ... 2:
+ return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
+
+#ifndef MEMORY_GUARD
+ case 3 ... 5:
+#endif
+ case 6 ... 7:
+ return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
+
+ default:
+ return sp - sizeof(unsigned long);
+ }
+}
+
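Both helpers dispatch on get_stack_page(). Assuming Xen's per-CPU stacks are
STACK_SIZE bytes (eight 4k pages) and STACK_SIZE-aligned, the helper
plausibly reduces to the page index within the stack:

    /* Sketch: which of the eight stack pages sp falls on. */
    static inline unsigned int get_stack_page(unsigned long sp)
    {
        return (sp & (STACK_SIZE - 1)) >> PAGE_SHIFT;
    }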
#if !defined(CONFIG_FRAME_POINTER)
/*
@@ -203,7 +267,7 @@ static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
static void _show_trace(unsigned long sp, unsigned long __maybe_unused bp)
{
unsigned long *stack = (unsigned long *)sp, addr;
- unsigned long *bottom = (unsigned long *)get_printable_stack_bottom(sp);
+ unsigned long *bottom = (unsigned long *)get_stack_trace_bottom(sp);
while ( stack <= bottom )
{
@@ -221,7 +285,7 @@ static void _show_trace(unsigned long sp, unsigned long bp)
unsigned long *frame, next, addr;
/* Bounds for range of valid frame pointer. */
- unsigned long low = sp, high = get_printable_stack_bottom(sp);
+ unsigned long low = sp, high = get_stack_trace_bottom(sp);
/* The initial frame pointer. */
next = bp;
@@ -292,7 +356,7 @@ static void show_trace(const struct cpu_user_regs *regs)
void show_stack(const struct cpu_user_regs *regs)
{
- unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr;
+ unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), *stack_bottom, addr;
int i;
if ( guest_mode(regs) )
@@ -300,10 +364,11 @@ void show_stack(const struct cpu_user_regs *regs)
printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
- for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
+ stack_bottom = _p(get_stack_dump_bottom(regs->rsp));
+
+ for ( i = 0; i < (debug_stack_lines*stack_words_per_line) &&
+ (stack <= stack_bottom); i++ )
{
- if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 )
- break;
if ( (i != 0) && ((i % stack_words_per_line) == 0) )
printk("\n ");
addr = *stack++;
@@ -454,9 +519,9 @@ static void do_guest_trap(
tb->flags |= TBF_INTERRUPT;
if ( unlikely(null_trap_bounce(v, tb)) )
- gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] "
- "on VCPU %d [ec=%04x]\n",
- trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code);
+ gprintk(XENLOG_WARNING,
+ "Unhandled %s fault/trap [#%d, ec=%04x]\n",
+ trapstr(trapnr), trapnr, regs->error_code);
}
static void instruction_done(
@@ -466,9 +531,9 @@ static void instruction_done(
regs->eflags &= ~X86_EFLAGS_RF;
if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
{
- current->arch.debugreg[6] |= bpmatch | 0xffff0ff0;
+ current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
if ( regs->eflags & X86_EFLAGS_TF )
- current->arch.debugreg[6] |= 0x4000;
+ current->arch.debugreg[6] |= DR_STEP;
do_guest_trap(TRAP_debug, regs, 0);
}
}
@@ -684,16 +749,16 @@ int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
- struct domain *d = current->domain;
+ struct domain *currd = current->domain;
/* Optionally shift out of the way of Viridian architectural leaves. */
- uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
+ uint32_t base = is_viridian_domain(currd) ? 0x40000100 : 0x40000000;
uint32_t limit, dummy;
idx -= base;
if ( idx > XEN_CPUID_MAX_NUM_LEAVES )
return 0; /* Avoid unnecessary pass through domain_cpuid() */
- domain_cpuid(d, base, 0, &limit, &dummy, &dummy, &dummy);
+ domain_cpuid(currd, base, 0, &limit, &dummy, &dummy, &dummy);
if ( limit == 0 )
/* Default number of leaves */
limit = XEN_CPUID_MAX_NUM_LEAVES;
@@ -729,11 +794,11 @@ int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
case 2:
*eax = 1; /* Number of hypercall-transfer pages */
*ebx = 0x40000000; /* MSR base address */
- if ( is_viridian_domain(d) )
+ if ( is_viridian_domain(currd) )
*ebx = 0x40000200;
*ecx = 0; /* Features 1 */
*edx = 0; /* Features 2 */
- if ( is_pv_vcpu(current) )
+ if ( is_pv_domain(currd) )
*ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
break;
@@ -757,18 +822,19 @@ void pv_cpuid(struct cpu_user_regs *regs)
{
uint32_t a, b, c, d;
struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
a = regs->eax;
b = regs->ebx;
c = regs->ecx;
d = regs->edx;
- if ( !is_control_domain(curr->domain) && !is_hardware_domain(curr->domain) )
+ if ( !is_control_domain(currd) && !is_hardware_domain(currd) )
{
unsigned int cpuid_leaf = a, sub_leaf = c;
if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) )
- domain_cpuid(curr->domain, a, c, &a, &b, &c, &d);
+ domain_cpuid(currd, a, c, &a, &b, &c, &d);
switch ( cpuid_leaf )
{
@@ -784,7 +850,7 @@ void pv_cpuid(struct cpu_user_regs *regs)
{
if ( !(curr->arch.xcr0 & (1ULL << sub_leaf)) )
continue;
- domain_cpuid(curr->domain, cpuid_leaf, sub_leaf,
+ domain_cpuid(currd, cpuid_leaf, sub_leaf,
&_eax, &_ebx, &_ecx, &_edx);
if ( (_eax + _ebx) > b )
b = _eax + _ebx;
@@ -796,10 +862,7 @@ void pv_cpuid(struct cpu_user_regs *regs)
goto out;
}
- asm (
- "cpuid"
- : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
- : "0" (a), "1" (b), "2" (c), "3" (d) );
+ cpuid_count(a, c, &a, &b, &c, &d);
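cpuid_count() replaces the open-coded asm and makes the subleaf input
explicit. A sketch of the presumed helper:

    static inline void cpuid_count(unsigned int leaf, unsigned int subleaf,
                                   unsigned int *eax, unsigned int *ebx,
                                   unsigned int *ecx, unsigned int *edx)
    {
        asm volatile ( "cpuid"
                       : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                       : "0" (leaf), "2" (subleaf) );
    }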
if ( (regs->eax & 0x7fffffff) == 0x00000001 )
{
@@ -807,7 +870,7 @@ void pv_cpuid(struct cpu_user_regs *regs)
if ( !cpu_has_apic )
__clear_bit(X86_FEATURE_APIC, &d);
- if ( !is_pvh_vcpu(curr) )
+ if ( !is_pvh_domain(currd) )
{
__clear_bit(X86_FEATURE_PSE, &d);
__clear_bit(X86_FEATURE_PGE, &d);
@@ -825,7 +888,7 @@ void pv_cpuid(struct cpu_user_regs *regs)
__clear_bit(X86_FEATURE_DS, &d);
__clear_bit(X86_FEATURE_ACC, &d);
__clear_bit(X86_FEATURE_PBE, &d);
- if ( is_pvh_vcpu(curr) )
+ if ( is_pvh_domain(currd) )
__clear_bit(X86_FEATURE_MTRR, &d);
__clear_bit(X86_FEATURE_DTES64 % 32, &c);
@@ -834,7 +897,7 @@ void pv_cpuid(struct cpu_user_regs *regs)
__clear_bit(X86_FEATURE_VMXE % 32, &c);
__clear_bit(X86_FEATURE_SMXE % 32, &c);
__clear_bit(X86_FEATURE_TM2 % 32, &c);
- if ( is_pv_32bit_vcpu(curr) )
+ if ( is_pv_32bit_domain(currd) )
__clear_bit(X86_FEATURE_CX16 % 32, &c);
__clear_bit(X86_FEATURE_XTPR % 32, &c);
__clear_bit(X86_FEATURE_PDCM % 32, &c);
@@ -883,12 +946,12 @@ void pv_cpuid(struct cpu_user_regs *regs)
case 0x80000001:
/* Modify Feature Information. */
- if ( is_pv_32bit_vcpu(curr) )
+ if ( is_pv_32bit_domain(currd) )
{
__clear_bit(X86_FEATURE_LM % 32, &d);
__clear_bit(X86_FEATURE_LAHF_LM % 32, &c);
}
- if ( is_pv_32on64_vcpu(curr) &&
+ if ( is_pv_32bit_domain(currd) &&
boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
__clear_bit(X86_FEATURE_SYSCALL % 32, &d);
__clear_bit(X86_FEATURE_PAGE1GB % 32, &d);
@@ -906,8 +969,10 @@ void pv_cpuid(struct cpu_user_regs *regs)
__clear_bit(X86_FEATURE_TOPOEXT % 32, &c);
break;
+ case 0x0000000a: /* Architectural Performance Monitor Features (Intel) */
+ break;
+
case 0x00000005: /* MONITOR/MWAIT */
- case 0x0000000a: /* Architectural Performance Monitor Features */
case 0x0000000b: /* Extended Topology Enumeration */
case 0x8000000a: /* SVM revision and features */
case 0x8000001b: /* Instruction Based Sampling */
@@ -923,6 +988,9 @@ void pv_cpuid(struct cpu_user_regs *regs)
}
out:
+ /* VPMU may decide to modify some of the leaves */
+ vpmu_do_cpuid(regs->eax, &a, &b, &c, &d);
+
regs->eax = a;
regs->ebx = b;
regs->ecx = c;
@@ -1010,8 +1078,7 @@ void do_invalid_op(struct cpu_user_regs *regs)
return;
}
- if ( (!is_kernel_text(eip) &&
- (system_state > SYS_STATE_boot || !is_kernel_inittext(eip))) ||
+ if ( !is_active_kernel_text(regs->eip) ||
__copy_from_user(bug_insn, eip, sizeof(bug_insn)) ||
memcmp(bug_insn, "\xf\xb", sizeof(bug_insn)) )
goto die;
@@ -1257,7 +1324,7 @@ static enum pf_type __page_fault_type(
mfn = cr3 >> PAGE_SHIFT;
- l4t = map_domain_page(mfn);
+ l4t = map_domain_page(_mfn(mfn));
l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
mfn = l4e_get_pfn(l4e);
unmap_domain_page(l4t);
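map_domain_page() now takes a typesafe mfn_t instead of a bare unsigned
long, hence the _mfn() wrappers recurring through the hunks below. A
plausible shape for the wrapper, assuming debug builds use a one-member
struct so raw longs no longer convert silently (release builds may reduce
it to a plain typedef):

    typedef struct { unsigned long mfn; } mfn_t;

    static inline mfn_t _mfn(unsigned long m) { return (mfn_t){ .mfn = m }; }
    static inline unsigned long mfn_x(mfn_t m) { return m.mfn; }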
@@ -1266,7 +1333,7 @@ static enum pf_type __page_fault_type(
return real_fault;
page_user &= l4e_get_flags(l4e);
- l3t = map_domain_page(mfn);
+ l3t = map_domain_page(_mfn(mfn));
l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
mfn = l3e_get_pfn(l3e);
unmap_domain_page(l3t);
@@ -1277,7 +1344,7 @@ static enum pf_type __page_fault_type(
if ( l3e_get_flags(l3e) & _PAGE_PSE )
goto leaf;
- l2t = map_domain_page(mfn);
+ l2t = map_domain_page(_mfn(mfn));
l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
mfn = l2e_get_pfn(l2e);
unmap_domain_page(l2t);
@@ -1288,7 +1355,7 @@ static enum pf_type __page_fault_type(
if ( l2e_get_flags(l2e) & _PAGE_PSE )
goto leaf;
- l1t = map_domain_page(mfn);
+ l1t = map_domain_page(_mfn(mfn));
l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
mfn = l1e_get_pfn(l1e);
unmap_domain_page(l1t);
@@ -1379,7 +1446,7 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
!(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) &&
(regs->error_code & PFEC_write_access) )
{
- if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
+ if ( VM_ASSIST(d, writable_pagetables) &&
/* Do not check if access-protection fault since the page may
legitimately be not present in shadow page tables */
(paging_mode_enabled(d) ||
@@ -1490,8 +1557,8 @@ void do_page_fault(struct cpu_user_regs *regs)
*/
void __init do_early_page_fault(struct cpu_user_regs *regs)
{
- static int stuck;
- static unsigned long prev_eip, prev_cr2;
+ static unsigned int __initdata stuck;
+ static unsigned long __initdata prev_eip, prev_cr2;
unsigned long cr2 = read_cr2();
BUG_ON(smp_processor_id() != 0);
@@ -1677,7 +1744,9 @@ static int guest_io_okay(
port>>3, 2) )
{
default: x.bytes[0] = ~0;
+ /* fallthrough */
case 1: x.bytes[1] = ~0;
+ /* fallthrough */
case 0: break;
}
TOGGLE_MODE();
@@ -1690,9 +1759,8 @@ static int guest_io_okay(
}
/* Has the administrator granted sufficient permission for this I/O access? */
-static int admin_io_okay(
- unsigned int port, unsigned int bytes,
- struct vcpu *v, struct cpu_user_regs *regs)
+static bool_t admin_io_okay(unsigned int port, unsigned int bytes,
+ const struct domain *d)
{
/*
* Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
@@ -1705,17 +1773,21 @@ static int admin_io_okay(
if ( ((port & ~1) == RTC_PORT(0)) )
return 0;
- return ioports_access_permitted(v->domain, port, port + bytes - 1);
+ return ioports_access_permitted(d, port, port + bytes - 1);
}
-static int pci_cfg_ok(struct domain *d, int write, int size)
+static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
+ unsigned int size, uint32_t *write)
{
uint32_t machine_bdf;
- uint16_t start, end;
- if (!is_hardware_domain(d))
+
+ if ( !is_hardware_domain(currd) )
return 0;
- machine_bdf = (d->arch.pci_cf8 >> 8) & 0xFFFF;
+ if ( !CF8_ENABLED(currd->arch.pci_cf8) )
+ return 1;
+
+ machine_bdf = CF8_BDF(currd->arch.pci_cf8);
if ( write )
{
const unsigned long *ro_map = pci_get_ro_map(0);
@@ -1723,9 +1795,9 @@ static int pci_cfg_ok(struct domain *d, int write, int size)
if ( ro_map && test_bit(machine_bdf, ro_map) )
return 0;
}
- start = d->arch.pci_cf8 & 0xFF;
+ start |= CF8_ADDR_LO(currd->arch.pci_cf8);
/* AMD extended configuration space access? */
- if ( (d->arch.pci_cf8 & 0x0F000000) &&
+ if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
{
@@ -1734,22 +1806,24 @@ static int pci_cfg_ok(struct domain *d, int write, int size)
if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
return 0;
if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
- start |= (d->arch.pci_cf8 >> 16) & 0xF00;
+ start |= CF8_ADDR_HI(currd->arch.pci_cf8);
}
- end = start + size - 1;
- if (xsm_pci_config_permission(XSM_HOOK, d, machine_bdf, start, end, write))
- return 0;
- return 1;
+
+ if ( xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
+ start, start + size - 1, !!write) != 0 )
+ return 0;
+
+ return !write ||
+ pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
}
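pci_cfg_ok() now decodes CONFIG_ADDRESS through named macros rather than
open-coded shifts. Their likely definitions, reconstructed from the 0xcf8
register layout and the bitmasks in the removed lines (an assumption, not
copied from the headers):

    #define CF8_BDF(cf8)     (((cf8) & 0x00ffff00) >> 8)  /* bus/device/function */
    #define CF8_ADDR_LO(cf8)  ((cf8) & 0x000000fc)        /* register, dword-aligned */
    #define CF8_ADDR_HI(cf8) (((cf8) & 0x0f000000) >> 16) /* AMD extended reg bits */
    #define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000))     /* config-space enable */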
-uint32_t guest_io_read(
- unsigned int port, unsigned int bytes,
- struct vcpu *v, struct cpu_user_regs *regs)
+uint32_t guest_io_read(unsigned int port, unsigned int bytes,
+ struct domain *currd)
{
uint32_t data = 0;
unsigned int shift = 0;
- if ( admin_io_okay(port, bytes, v, regs) )
+ if ( admin_io_okay(port, bytes, currd) )
{
switch ( bytes )
{
@@ -1770,31 +1844,30 @@ uint32_t guest_io_read(
}
else if ( (port == RTC_PORT(0)) )
{
- sub_data = v->domain->arch.cmos_idx;
+ sub_data = currd->arch.cmos_idx;
}
else if ( (port == RTC_PORT(1)) &&
- ioports_access_permitted(v->domain, RTC_PORT(0),
- RTC_PORT(1)) )
+ ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
{
unsigned long flags;
spin_lock_irqsave(&rtc_lock, flags);
- outb(v->domain->arch.cmos_idx & 0x7f, RTC_PORT(0));
+ outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
sub_data = inb(RTC_PORT(1));
spin_unlock_irqrestore(&rtc_lock, flags);
}
else if ( (port == 0xcf8) && (bytes == 4) )
{
size = 4;
- sub_data = v->domain->arch.pci_cf8;
+ sub_data = currd->arch.pci_cf8;
}
else if ( (port & 0xfffc) == 0xcfc )
{
size = min(bytes, 4 - (port & 3));
if ( size == 3 )
size = 2;
- if ( pci_cfg_ok(v->domain, 0, size) )
- sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size);
+ if ( pci_cfg_ok(currd, port & 3, size, NULL) )
+ sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
}
if ( size == 4 )
@@ -1809,11 +1882,10 @@ uint32_t guest_io_read(
return data;
}
-void guest_io_write(
- unsigned int port, unsigned int bytes, uint32_t data,
- struct vcpu *v, struct cpu_user_regs *regs)
+void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
+ struct domain *currd)
{
- if ( admin_io_okay(port, bytes, v, regs) )
+ if ( admin_io_okay(port, bytes, currd) )
{
switch ( bytes ) {
case 1:
@@ -1841,33 +1913,32 @@ void guest_io_write(
}
else if ( (port == RTC_PORT(0)) )
{
- v->domain->arch.cmos_idx = data;
+ currd->arch.cmos_idx = data;
}
else if ( (port == RTC_PORT(1)) &&
- ioports_access_permitted(v->domain, RTC_PORT(0),
- RTC_PORT(1)) )
+ ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
{
unsigned long flags;
if ( pv_rtc_handler )
- pv_rtc_handler(v->domain->arch.cmos_idx & 0x7f, data);
+ pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
spin_lock_irqsave(&rtc_lock, flags);
- outb(v->domain->arch.cmos_idx & 0x7f, RTC_PORT(0));
+ outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
outb(data, RTC_PORT(1));
spin_unlock_irqrestore(&rtc_lock, flags);
}
else if ( (port == 0xcf8) && (bytes == 4) )
{
size = 4;
- v->domain->arch.pci_cf8 = data;
+ currd->arch.pci_cf8 = data;
}
else if ( (port & 0xfffc) == 0xcfc )
{
size = min(bytes, 4 - (port & 3));
if ( size == 3 )
size = 2;
- if ( pci_cfg_ok(v->domain, 1, size) )
- pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data);
+ if ( pci_cfg_ok(currd, port & 3, size, &data) )
+ pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
}
if ( size == 4 )
@@ -1925,6 +1996,7 @@ static int is_cpufreq_controller(struct domain *d)
static int emulate_privileged_op(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
+ struct domain *currd = v->domain;
unsigned long *reg, eip = regs->eip;
u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
@@ -1942,9 +2014,10 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
? (*(u32 *)®s->reg = (val)) \
: (*(u16 *)®s->reg = (val)))
unsigned long code_base, code_limit;
- char io_emul_stub[32];
+ char *io_emul_stub = NULL;
void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
- uint64_t val, msr_content;
+ uint64_t val;
+ bool_t vpmu_msr;
if ( !read_descriptor(regs->cs, v, regs,
&code_base, &code_limit, &ar,
@@ -2081,7 +2154,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
(rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
!guest_io_okay(port, op_bytes, v, regs) )
goto fail;
- data = guest_io_read(port, op_bytes, v, regs);
+ data = guest_io_read(port, op_bytes, currd);
if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
&data, op_bytes)) != 0 )
{
@@ -2107,7 +2180,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
+ op_bytes - rc, 0);
return EXCRET_fault_fixed;
}
- guest_io_write(port, op_bytes, data, v, regs);
+ guest_io_write(port, op_bytes, data, currd);
wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
? -op_bytes : op_bytes));
break;
@@ -2127,10 +2200,13 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
/*
* Very likely to be an I/O instruction (IN/OUT).
- * Build an on-stack stub to execute the instruction with full guest
- * GPR context. This is needed for some systems which (ab)use IN/OUT
+ * Build a stub to execute the instruction with full guest GPR
+ * context. This is needed for some systems which (ab)use IN/OUT
* to communicate with BIOS code in system-management mode.
*/
+ io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+ (this_cpu(stubs.addr) & ~PAGE_MASK) +
+ STUB_BUF_SIZE / 2;
/* movq $host_to_guest_gpr_switch,%rcx */
io_emul_stub[0] = 0x48;
io_emul_stub[1] = 0xb9;
@@ -2146,9 +2222,10 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
io_emul_stub[14] = 0x90;
/* ret (jumps to guest_to_host_gpr_switch) */
io_emul_stub[15] = 0xc3;
+ BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
/* Handy function-typed pointer to the stub. */
- io_emul = (void *)io_emul_stub;
+ io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
if ( ioemul_handle_quirk )
ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
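Only the stub's backing store changes here, from an on-stack buffer to a
per-CPU executable stub page, so io_emul must point at the stub's executable
address rather than at the writable mapping. The movabs immediate at bytes
2-9 is presumably still patched along these lines:

    /* Sketch: fill the 64-bit immediate of "movq $imm64,%rcx" (bytes 2-9). */
    *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;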
@@ -2164,7 +2241,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
exec_in:
if ( !guest_io_okay(port, op_bytes, v, regs) )
goto fail;
- if ( admin_io_okay(port, op_bytes, v, regs) )
+ if ( admin_io_okay(port, op_bytes, currd) )
{
mark_regs_dirty(regs);
io_emul(regs);
@@ -2174,8 +2251,8 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
if ( op_bytes == 4 )
regs->eax = 0;
else
- regs->eax &= ~((1u << (op_bytes * 8)) - 1);
- regs->eax |= guest_io_read(port, op_bytes, v, regs);
+ regs->eax &= ~((1 << (op_bytes * 8)) - 1);
+ regs->eax |= guest_io_read(port, op_bytes, currd);
}
bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
goto done;
@@ -2194,7 +2271,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
exec_out:
if ( !guest_io_okay(port, op_bytes, v, regs) )
goto fail;
- if ( admin_io_okay(port, op_bytes, v, regs) )
+ if ( admin_io_okay(port, op_bytes, currd) )
{
mark_regs_dirty(regs);
io_emul(regs);
@@ -2203,7 +2280,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
}
else
{
- guest_io_write(port, op_bytes, regs->eax, v, regs);
+ guest_io_write(port, op_bytes, regs->eax, currd);
}
bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
goto done;
@@ -2285,7 +2362,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
case 0x09: /* WBINVD */
/* Ignore the instruction if unprivileged. */
- if ( !cache_flush_permitted(v->domain) )
+ if ( !cache_flush_permitted(currd) )
/* Non-physdev domain attempted WBINVD; ignore for now since
newer linux uses this in some start-of-day timing loops */
;
@@ -2315,21 +2392,19 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
{
unsigned long mfn;
- if ( !is_pv_32on64_vcpu(v) )
+ if ( !is_pv_32bit_domain(currd) )
{
mfn = pagetable_get_pfn(v->arch.guest_table);
- *reg = xen_pfn_to_cr3(mfn_to_gmfn(
- v->domain, mfn));
+ *reg = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
}
else
{
l4_pgentry_t *pl4e =
- map_domain_page(pagetable_get_pfn(v->arch.guest_table));
+ map_domain_page(_mfn(pagetable_get_pfn(v->arch.guest_table)));
mfn = l4e_get_pfn(*pl4e);
unmap_domain_page(pl4e);
- *reg = compat_pfn_to_cr3(mfn_to_gmfn(
- v->domain, mfn));
+ *reg = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
}
/* PTs should not be shared */
BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
@@ -2387,9 +2462,9 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
unsigned long gfn;
struct page_info *page;
- gfn = !is_pv_32on64_vcpu(v)
+ gfn = !is_pv_32bit_domain(currd)
? xen_cr3_to_pfn(*reg) : compat_cr3_to_pfn(*reg);
- page = get_page_from_gfn(v->domain, gfn, NULL, P2M_ALLOC);
+ page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
if ( page )
{
rc = new_guest_cr3(page_to_mfn(page));
@@ -2434,23 +2509,24 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
case 0x30: /* WRMSR */ {
uint32_t eax = regs->eax;
uint32_t edx = regs->edx;
- msr_content = ((uint64_t)edx << 32) | eax;
- switch ( (u32)regs->ecx )
+ uint64_t msr_content = ((uint64_t)edx << 32) | eax;
+ vpmu_msr = 0;
+ switch ( regs->_ecx )
{
case MSR_FS_BASE:
- if ( is_pv_32on64_vcpu(v) )
+ if ( is_pv_32bit_domain(currd) )
goto fail;
wrfsbase(msr_content);
v->arch.pv_vcpu.fs_base = msr_content;
break;
case MSR_GS_BASE:
- if ( is_pv_32on64_vcpu(v) )
+ if ( is_pv_32bit_domain(currd) )
goto fail;
wrgsbase(msr_content);
v->arch.pv_vcpu.gs_base_kernel = msr_content;
break;
case MSR_SHADOW_GS_BASE:
- if ( is_pv_32on64_vcpu(v) )
+ if ( is_pv_32bit_domain(currd) )
goto fail;
if ( wrmsr_safe(MSR_SHADOW_GS_BASE, msr_content) )
goto fail;
@@ -2472,7 +2548,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
case MSR_K8_HWCR:
if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
goto fail;
- if ( !is_cpufreq_controller(v->domain) )
+ if ( !is_cpufreq_controller(currd) )
break;
if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
goto fail;
@@ -2481,7 +2557,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
goto fail;
- if ( !is_hardware_domain(v->domain) || !is_pinned_vcpu(v) )
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
break;
if ( (rdmsr_safe(MSR_AMD64_NB_CFG, val) != 0) ||
(eax != (uint32_t)val) ||
@@ -2494,7 +2570,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
goto fail;
- if ( !is_hardware_domain(v->domain) || !is_pinned_vcpu(v) )
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
break;
if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) != 0) )
goto fail;
@@ -2514,7 +2590,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
case MSR_IA32_UCODE_REV:
if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
goto fail;
- if ( !is_hardware_domain(v->domain) || !is_pinned_vcpu(v) )
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
break;
if ( rdmsr_safe(regs->ecx, val) )
goto fail;
@@ -2533,7 +2609,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
if (( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) &&
( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) )
goto fail;
- if ( !is_cpufreq_controller(v->domain) )
+ if ( !is_cpufreq_controller(currd) )
break;
if ( wrmsr_safe(regs->ecx, msr_content ) != 0 )
goto fail;
@@ -2541,7 +2617,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
case MSR_IA32_PERF_CTL:
if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
goto fail;
- if ( !is_cpufreq_controller(v->domain) )
+ if ( !is_cpufreq_controller(currd) )
break;
if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
goto fail;
@@ -2550,7 +2626,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
case MSR_IA32_ENERGY_PERF_BIAS:
if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
goto fail;
- if ( !is_hardware_domain(v->domain) || !is_pinned_vcpu(v) )
+ if ( !is_hardware_domain(currd) || !is_pinned_vcpu(v) )
break;
if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
goto fail;
@@ -2571,6 +2647,26 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
wrmsrl(regs->_ecx, msr_content);
break;
+ case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+ case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+ case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+ case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+ {
+ vpmu_msr = 1;
+ case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+ if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+ {
+ if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+ !is_hardware_domain(v->domain) )
+ break;
+
+ if ( vpmu_do_wrmsr(regs->ecx, msr_content, 0) )
+ goto fail;
+ }
+ break;
+ }
+ /*FALLTHROUGH*/
default:
if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) == 1 )
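The vPMU dispatch above leans on a C subtlety: a case label may sit inside
the if-block of an earlier case, so the AMD MSR range jumps straight into
the block while the Intel ranges enter through the vendor check with
vpmu_msr set. A standalone illustration of the control flow (MSR numbers
and names here are invented for the demo):

    #include <stdio.h>

    static void dispatch(unsigned int msr, int intel)
    {
        int vpmu_msr = 0;

        switch ( msr )
        {
        case 1: /* stand-in for the Intel-only ranges */
            if ( intel )
            {
                vpmu_msr = 1;
        case 2: /* stand-in for the AMD range: the switch jumps directly here */
                if ( vpmu_msr || !intel )
                    printf("MSR %u: vPMU path\n", msr);
                break;
            }
            /* Intel-range MSR on a non-Intel CPU: fall through */
        default:
            printf("MSR %u: default path\n", msr);
        }
    }

    int main(void)
    {
        dispatch(1, 1); /* vPMU path    */
        dispatch(2, 0); /* vPMU path    */
        dispatch(1, 0); /* default path */
        return 0;
    }

The same shape recurs in the RDMSR arm of this function further down.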
@@ -2596,32 +2692,35 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
!guest_kernel_mode(v, regs) )
goto fail;
- if ( v->domain->arch.vtsc )
+ if ( currd->arch.vtsc )
pv_soft_rdtsc(v, regs, 0);
else
- rdtsc(regs->eax, regs->edx);
+ {
+ val = rdtsc();
+ goto rdmsr_writeback;
+ }
break;
case 0x32: /* RDMSR */
- switch ( (u32)regs->ecx )
+ vpmu_msr = 0;
+ switch ( regs->_ecx )
{
case MSR_FS_BASE:
- if ( is_pv_32on64_vcpu(v) )
+ if ( is_pv_32bit_domain(currd) )
goto fail;
val = cpu_has_fsgsbase ? __rdfsbase() : v->arch.pv_vcpu.fs_base;
goto rdmsr_writeback;
case MSR_GS_BASE:
- if ( is_pv_32on64_vcpu(v) )
+ if ( is_pv_32bit_domain(currd) )
goto fail;
val = cpu_has_fsgsbase ? __rdgsbase()
: v->arch.pv_vcpu.gs_base_kernel;
goto rdmsr_writeback;
case MSR_SHADOW_GS_BASE:
- if ( is_pv_32on64_vcpu(v) )
+ if ( is_pv_32bit_domain(currd) )
goto fail;
- regs->eax = v->arch.pv_vcpu.gs_base_user & 0xFFFFFFFFUL;
- regs->edx = v->arch.pv_vcpu.gs_base_user >> 32;
- break;
+ val = v->arch.pv_vcpu.gs_base_user;
+ goto rdmsr_writeback;
case MSR_K7_FID_VID_CTL:
case MSR_K7_FID_VID_STATUS:
case MSR_K8_PSTATE_LIMIT:
@@ -2637,7 +2736,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
case MSR_K8_PSTATE7:
if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
goto fail;
- if ( !is_cpufreq_controller(v->domain) )
+ if ( !is_cpufreq_controller(currd) )
{
regs->eax = regs->edx = 0;
break;
@@ -2653,12 +2752,10 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
}
goto rdmsr_normal;
case MSR_IA32_MISC_ENABLE:
- if ( rdmsr_safe(regs->ecx, msr_content) )
+ if ( rdmsr_safe(regs->ecx, val) )
goto fail;
- msr_content = guest_misc_enable(msr_content);
- regs->eax = (uint32_t)msr_content;
- regs->edx = (uint32_t)(msr_content >> 32);
- break;
+ val = guest_misc_enable(val);
+ goto rdmsr_writeback;
case MSR_AMD64_DR0_ADDRESS_MASK:
if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
@@ -2673,15 +2770,42 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
[regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1];
regs->edx = 0;
break;
-
- default:
- if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
+ case MSR_IA32_PERF_CAPABILITIES:
+ /* No extra capabilities are supported */
+ regs->eax = regs->edx = 0;
+ break;
+ case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+ case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+ case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+ case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
{
- rdmsr_writeback:
- regs->eax = (uint32_t)val;
- regs->edx = (uint32_t)(val >> 32);
+ vpmu_msr = 1;
+ case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+ if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+ {
+
+ if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+ !is_hardware_domain(v->domain) )
+ {
+ /* Don't leak PMU MSRs to unprivileged domains */
+ regs->eax = regs->edx = 0;
+ break;
+ }
+
+ if ( vpmu_do_rdmsr(regs->ecx, &val) )
+ goto fail;
+
+ regs->eax = (uint32_t)val;
+ regs->edx = (uint32_t)(val >> 32);
+ }
break;
}
+ /*FALLTHROUGH*/
+
+ default:
+ if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
+ goto rdmsr_writeback;
rc = vmce_rdmsr(regs->ecx, &val);
if ( rc < 0 )
@@ -2694,10 +2818,11 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
/* Everyone can read the MSR space. */
/* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n",
_p(regs->ecx));*/
- if ( rdmsr_safe(regs->ecx, msr_content) )
+ if ( rdmsr_safe(regs->ecx, val) )
goto fail;
- regs->eax = (uint32_t)msr_content;
- regs->edx = (uint32_t)(msr_content >> 32);
+ rdmsr_writeback:
+ regs->eax = (uint32_t)val;
+ regs->edx = (uint32_t)(val >> 32);
break;
}
break;
@@ -2716,9 +2841,13 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
done:
instruction_done(regs, eip, bpmatch);
skip:
+ if ( io_emul_stub )
+ unmap_domain_page(io_emul_stub);
return EXCRET_fault_fixed;
fail:
+ if ( io_emul_stub )
+ unmap_domain_page(io_emul_stub);
return 0;
}
@@ -3135,7 +3264,7 @@ void do_general_protection(struct cpu_user_regs *regs)
return;
}
}
- else if ( is_pv_32on64_vcpu(v) && regs->error_code )
+ else if ( is_pv_32bit_vcpu(v) && regs->error_code )
{
emulate_gate_op(regs);
return;
@@ -3258,6 +3387,7 @@ static void pci_serr_error(const struct cpu_user_regs *regs)
{
case 'd': /* 'dom0' */
nmi_hwdom_report(_XEN_NMIREASON_pci_serr);
+ /* fallthrough */
case 'i': /* 'ignore' */
/* Would like to print a diagnostic here but can't call printk()
from NMI context -- raise a softirq instead. */
@@ -3323,7 +3453,8 @@ void do_nmi(const struct cpu_user_regs *regs)
if ( nmi_callback(regs, cpu) )
return;
- if ( !nmi_watchdog || (!nmi_watchdog_tick(regs) && watchdog_force) )
+ if ( (nmi_watchdog == NMI_NONE) ||
+ (!nmi_watchdog_tick(regs) && watchdog_force) )
handle_unknown = 1;
/* Only the BSP gets external NMIs from the system. */
@@ -3607,7 +3738,7 @@ long register_guest_nmi_callback(unsigned long address)
t->vector = TRAP_nmi;
t->flags = 0;
- t->cs = (is_pv_32on64_domain(d) ?
+ t->cs = (is_pv_32bit_domain(d) ?
FLAT_COMPAT_KERNEL_CS : FLAT_KERNEL_CS);
t->address = address;
TI_SET_IF(t, 1);
@@ -3810,8 +3941,8 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
* DR6: Bits 4-11,16-31 reserved (set to 1).
* Bit 12 reserved (set to 0).
*/
- value &= 0xffffefff; /* reserved bits => 0 */
- value |= 0xffff0ff0; /* reserved bits => 1 */
+ value &= ~DR_STATUS_RESERVED_ZERO; /* reserved bits => 0 */
+ value |= DR_STATUS_RESERVED_ONE; /* reserved bits => 1 */
if ( v == curr )
write_debugreg(6, value);
break;
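The magic DR6 constants gain names; matching the values they replace here
and in instruction_done() earlier, the masks are presumably:

    #define DR_STATUS_RESERVED_ZERO 0x00001000UL /* bit 12: reads as 0 */
    #define DR_STATUS_RESERVED_ONE  0xffff0ff0UL /* bits 4-11,16-31: read as 1 */
    #define DR_STEP                 0x00004000UL /* bit 14: single-step (BS) */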
diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
new file mode 100644
index 0000000..b32a839
--- /dev/null
+++ b/xen/arch/x86/vm_event.c
@@ -0,0 +1,117 @@
+/*
+ * arch/x86/vm_event.c
+ *
+ * Architecture-specific vm_event handling routines
+ *
+ * Copyright (c) 2015 Tamas K Lengyel (tamas at tklengyel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/sched.h>
+#include <asm/hvm/hvm.h>
+#include <asm/vm_event.h>
+
+/* Implicitly serialized by the domctl lock. */
+int vm_event_init_domain(struct domain *d)
+{
+ struct vcpu *v;
+
+ if ( !d->arch.event_write_data )
+ d->arch.event_write_data =
+ vzalloc(sizeof(struct monitor_write_data) * d->max_vcpus);
+
+ if ( !d->arch.event_write_data )
+ return -ENOMEM;
+
+ for_each_vcpu ( d, v )
+ {
+ if ( v->arch.vm_event.emul_read_data )
+ continue;
+
+ v->arch.vm_event.emul_read_data =
+ xzalloc(struct vm_event_emul_read_data);
+
+ if ( !v->arch.vm_event.emul_read_data )
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Implicitly serialized by the domctl lock,
+ * or on domain cleanup paths only.
+ */
+void vm_event_cleanup_domain(struct domain *d)
+{
+ struct vcpu *v;
+
+ vfree(d->arch.event_write_data);
+ d->arch.event_write_data = NULL;
+
+ for_each_vcpu ( d, v )
+ {
+ xfree(v->arch.vm_event.emul_read_data);
+ v->arch.vm_event.emul_read_data = NULL;
+ }
+}
+
+void vm_event_toggle_singlestep(struct domain *d, struct vcpu *v)
+{
+ if ( !is_hvm_domain(d) || !atomic_read(&v->vm_event_pause_count) )
+ return;
+
+ hvm_toggle_singlestep(v);
+}
+
+void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp)
+{
+ if ( rsp->flags & VM_EVENT_FLAG_DENY )
+ {
+ struct monitor_write_data *w =
+ &v->domain->arch.event_write_data[v->vcpu_id];
+
+ ASSERT(v->domain->arch.event_write_data != NULL);
+
+ switch ( rsp->reason )
+ {
+ case VM_EVENT_REASON_MOV_TO_MSR:
+ w->do_write.msr = 0;
+ break;
+ case VM_EVENT_REASON_WRITE_CTRLREG:
+ switch ( rsp->u.write_ctrlreg.index )
+ {
+ case VM_EVENT_X86_CR0:
+ w->do_write.cr0 = 0;
+ break;
+ case VM_EVENT_X86_CR3:
+ w->do_write.cr3 = 0;
+ break;
+ case VM_EVENT_X86_CR4:
+ w->do_write.cr4 = 0;
+ break;
+ }
+ break;
+ }
+ }
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/x86_64/acpi_mmcfg.c b/xen/arch/x86/x86_64/acpi_mmcfg.c
index 3666193..f01ad70 100644
--- a/xen/arch/x86/x86_64/acpi_mmcfg.c
+++ b/xen/arch/x86/x86_64/acpi_mmcfg.c
@@ -17,8 +17,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
index 5b0af61..1521779 100644
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -219,7 +219,20 @@ ENTRY(compat_post_handle_exception)
movb $0,TRAPBOUNCE_flags(%rdx)
jmp compat_test_all_events
-ENTRY(compat_syscall)
+/* See lstar_enter for entry register state. */
+ENTRY(cstar_enter)
+ sti
+ movq 8(%rsp),%rax /* Restore %rax. */
+ movq $FLAT_KERNEL_SS,8(%rsp)
+ pushq %r11
+ pushq $FLAT_USER_CS32
+ pushq %rcx
+ pushq $0
+ SAVE_VOLATILE TRAP_syscall
+ GET_CURRENT(%rbx)
+ movq VCPU_domain(%rbx),%rcx
+ cmpb $0,DOMAIN_is_32bit_pv(%rcx)
+ je switch_to_kernel
cmpb $0,VCPU_syscall32_disables_events(%rbx)
movzwl VCPU_syscall32_sel(%rbx),%esi
movq VCPU_syscall32_addr(%rbx),%rax
@@ -417,6 +430,8 @@ ENTRY(compat_hypercall_table)
.quad do_domctl
.quad compat_kexec_op
.quad do_tmem_op
+ .quad do_ni_hypercall /* reserved for XenClient */
+ .quad do_xenpmu_op /* 40 */
.rept __HYPERVISOR_arch_0-((.-compat_hypercall_table)/8)
.quad compat_ni_hypercall
.endr
@@ -466,6 +481,8 @@ ENTRY(compat_hypercall_args_table)
.byte 1 /* do_domctl */
.byte 2 /* compat_kexec_op */
.byte 1 /* do_tmem_op */
+ .byte 0 /* reserved for XenClient */
+ .byte 2 /* do_xenpmu_op */ /* 40 */
.rept __HYPERVISOR_arch_0-(.-compat_hypercall_args_table)
.byte 0 /* compat_ni_hypercall */
.endr
diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c
index f90f611..d034bd0 100644
--- a/xen/arch/x86/x86_64/compat/mm.c
+++ b/xen/arch/x86/x86_64/compat/mm.c
@@ -1,9 +1,9 @@
#include <xen/event.h>
-#include <xen/mem_event.h>
#include <xen/mem_access.h>
#include <xen/multicall.h>
#include <compat/memory.h>
#include <compat/xen.h>
+#include <asm/mem_paging.h>
#include <asm/mem_sharing.h>
int compat_set_gdt(XEN_GUEST_HANDLE_PARAM(uint) frame_list, unsigned int entries)
@@ -187,28 +187,10 @@ int compat_arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
return mem_sharing_get_nr_shared_mfns();
case XENMEM_paging_op:
- {
- xen_mem_event_op_t meo;
- if ( copy_from_guest(&meo, arg, 1) )
- return -EFAULT;
- rc = do_mem_event_op(cmd, meo.domain, &meo);
- if ( !rc && __copy_to_guest(arg, &meo, 1) )
- return -EFAULT;
- break;
- }
+ return mem_paging_memop(guest_handle_cast(arg, xen_mem_paging_op_t));
case XENMEM_sharing_op:
- {
- xen_mem_sharing_op_t mso;
- if ( copy_from_guest(&mso, arg, 1) )
- return -EFAULT;
- if ( mso.op == XENMEM_sharing_op_audit )
- return mem_sharing_audit();
- rc = do_mem_event_op(cmd, mso.domain, &mso);
- if ( !rc && __copy_to_guest(arg, &mso, 1) )
- return -EFAULT;
- break;
- }
+ return mem_sharing_memop(guest_handle_cast(arg, xen_mem_sharing_op_t));
default:
rc = -ENOSYS;
@@ -292,6 +274,7 @@ int compat_mmuext_op(XEN_GUEST_HANDLE_PARAM(mmuext_op_compat_t) cmp_uops,
break;
case MMUEXT_NEW_USER_BASEPTR:
rc = -EINVAL;
+ /* fallthrough */
case MMUEXT_TLB_FLUSH_LOCAL:
case MMUEXT_TLB_FLUSH_MULTI:
case MMUEXT_TLB_FLUSH_ALL:
diff --git a/xen/arch/x86/x86_64/compat/traps.c b/xen/arch/x86/x86_64/compat/traps.c
index b6c2563..2dae0c7 100644
--- a/xen/arch/x86/x86_64/compat/traps.c
+++ b/xen/arch/x86/x86_64/compat/traps.c
@@ -119,7 +119,7 @@ unsigned int compat_iret(void)
}
else if ( ksp > regs->_esp )
{
- for (i = 9; i > 0; ++i)
+ for ( i = 9; i > 0; --i )
{
rc |= __get_user(x, (u32 *)regs->rsp + i);
rc |= __put_user(x, (u32 *)(unsigned long)ksp + i);
@@ -164,7 +164,7 @@ unsigned int compat_iret(void)
return regs->_eax;
exit_and_crash:
- gdprintk(XENLOG_ERR, "Fatal error\n");
+ gprintk(XENLOG_ERR, "Fatal IRET error\n");
domain_crash(v->domain);
return 0;
}
diff --git a/xen/arch/x86/x86_64/cpu_idle.c b/xen/arch/x86/x86_64/cpu_idle.c
index dfc7e84..0fbd10a 100644
--- a/xen/arch/x86/x86_64/cpu_idle.c
+++ b/xen/arch/x86/x86_64/cpu_idle.c
@@ -16,8 +16,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ * with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/xen/arch/x86/x86_64/cpufreq.c b/xen/arch/x86/x86_64/cpufreq.c
index 1956777..30df44f 100644
--- a/xen/arch/x86/x86_64/cpufreq.c
+++ b/xen/arch/x86/x86_64/cpufreq.c
@@ -16,8 +16,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ * with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 2d25d57..74677a2 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -13,9 +13,8 @@
#include <public/xen.h>
#include <irq_vectors.h>
- ALIGN
/* %rbx: struct vcpu */
-switch_to_kernel:
+ENTRY(switch_to_kernel)
leaq VCPU_trap_bounce(%rbx),%rdx
/* TB_eip = (32-bit syscall && syscall32_addr) ?
* syscall32_addr : syscall_addr */
@@ -113,23 +112,22 @@ restore_all_xen:
* When entering SYSCALL from user mode:
* Vector directly to the registered arch.syscall_addr.
*
- * Initial work is done by per-CPU stack trampolines. At this point %rsp
- * has been initialised to point at the correct Xen stack, and %rsp, %rflags
- * and %cs have been saved. All other registers are still to be saved onto
- * the stack, starting with %rip, and an appropriate %ss must be saved into
- * the space left by the trampoline.
+ * Initial work is done by per-CPU trampolines. At this point %rsp has been
+ * initialised to point at the correct Xen stack, %rsp has been saved, and
+ * %rax needs to be restored from the %ss save slot. All other registers are
+ * still to be saved onto the stack, starting with RFLAGS, and an appropriate
+ * %ss must be saved into the space left by the trampoline.
*/
-ENTRY(syscall_enter)
+ENTRY(lstar_enter)
sti
- movl $FLAT_KERNEL_SS,24(%rsp)
+ movq 8(%rsp),%rax /* Restore %rax. */
+ movq $FLAT_KERNEL_SS,8(%rsp)
+ pushq %r11
+ pushq $FLAT_KERNEL_CS64
pushq %rcx
pushq $0
- movq 24(%rsp),%r11 /* Re-load user RFLAGS into %r11 before saving */
SAVE_VOLATILE TRAP_syscall
GET_CURRENT(%rbx)
- movq VCPU_domain(%rbx),%rcx
- testb $1,DOMAIN_is_32bit_pv(%rcx)
- jnz compat_syscall
testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
jz switch_to_kernel
@@ -626,6 +624,7 @@ ENTRY(double_fault)
.pushsection .init.text, "ax", @progbits
ENTRY(early_page_fault)
+ movl $TRAP_page_fault,4(%rsp)
SAVE_ALL
movq %rsp,%rdi
call do_early_page_fault
@@ -764,6 +763,8 @@ ENTRY(hypercall_table)
.quad do_domctl
.quad do_kexec_op
.quad do_tmem_op
+ .quad do_ni_hypercall /* reserved for XenClient */
+ .quad do_xenpmu_op /* 40 */
.rept __HYPERVISOR_arch_0-((.-hypercall_table)/8)
.quad do_ni_hypercall
.endr
@@ -813,6 +814,8 @@ ENTRY(hypercall_args_table)
.byte 1 /* do_domctl */
.byte 2 /* do_kexec */
.byte 1 /* do_tmem_op */
+ .byte 0 /* reserved for XenClient */
+ .byte 2 /* do_xenpmu_op */ /* 40 */
.rept __HYPERVISOR_arch_0-(.-hypercall_args_table)
.byte 0 /* do_ni_hypercall */
.endr
diff --git a/xen/arch/x86/x86_64/gdbstub.c b/xen/arch/x86/x86_64/gdbstub.c
index 3b5604a..2626519 100644
--- a/xen/arch/x86/x86_64/gdbstub.c
+++ b/xen/arch/x86/x86_64/gdbstub.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <asm/debugger.h>
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index d631aee..d918002 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -26,7 +25,6 @@
#include <xen/nodemask.h>
#include <xen/guest_access.h>
#include <xen/hypercall.h>
-#include <xen/mem_event.h>
#include <xen/mem_access.h>
#include <asm/current.h>
#include <asm/asm_defns.h>
@@ -37,16 +35,15 @@
#include <asm/msr.h>
#include <asm/setup.h>
#include <asm/numa.h>
+#include <asm/mem_paging.h>
#include <asm/mem_sharing.h>
#include <public/memory.h>
unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
/* Enough page directories to map into the bottom 1GB. */
-l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
- l3_bootmap[L3_PAGETABLE_ENTRIES];
-l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned")))
- l2_bootmap[L2_PAGETABLE_ENTRIES];
+l3_pgentry_t __section(".bss.page_aligned") l3_bootmap[L3_PAGETABLE_ENTRIES];
+l2_pgentry_t __section(".bss.page_aligned") l2_bootmap[L2_PAGETABLE_ENTRIES];
l2_pgentry_t *compat_idle_pg_table_l2;
@@ -61,7 +58,7 @@ void *do_page_walk(struct vcpu *v, unsigned long addr)
if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
return NULL;
- l4t = map_domain_page(mfn);
+ l4t = map_domain_page(_mfn(mfn));
l4e = l4t[l4_table_offset(addr)];
unmap_domain_page(l4t);
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
@@ -79,7 +76,7 @@ void *do_page_walk(struct vcpu *v, unsigned long addr)
goto ret;
}
- l2t = map_domain_page(mfn);
+ l2t = map_domain_page(_mfn(mfn));
l2e = l2t[l2_table_offset(addr)];
unmap_domain_page(l2t);
mfn = l2e_get_pfn(l2e);
@@ -91,7 +88,7 @@ void *do_page_walk(struct vcpu *v, unsigned long addr)
goto ret;
}
- l1t = map_domain_page(mfn);
+ l1t = map_domain_page(_mfn(mfn));
l1e = l1t[l1_table_offset(addr)];
unmap_domain_page(l1t);
mfn = l1e_get_pfn(l1e);
@@ -99,7 +96,7 @@ void *do_page_walk(struct vcpu *v, unsigned long addr)
return NULL;
ret:
- return map_domain_page(mfn) + (addr & ~PAGE_MASK);
+ return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
}
/*
@@ -480,7 +477,7 @@ static int setup_m2p_table(struct mem_hotadd_info *info)
l2_ro_mpt += l2_table_offset(va);
}
- /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
+ /* NB. Cannot be GLOBAL: guest user mode should not see it. */
l2e_write(l2_ro_mpt, l2e_from_pfn(mfn,
/*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
}
@@ -583,7 +580,7 @@ void __init paging_init(void)
0x77, 1UL << L3_PAGETABLE_SHIFT);
ASSERT(!l2_table_offset(va));
- /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
+ /* NB. Cannot be GLOBAL: guest user mode should not see it. */
l3e_write(&l3_ro_mpt[l3_table_offset(va)],
l3e_from_page(l1_pg,
/*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
@@ -621,7 +618,7 @@ void __init paging_init(void)
l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
ASSERT(!l2_table_offset(va));
}
- /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
+ /* NB. Cannot be GLOBAL: guest user mode should not see it. */
if ( l1_pg )
l2e_write(l2_ro_mpt, l2e_from_page(
l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
@@ -895,6 +892,33 @@ void __init subarch_init_memory(void)
share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
}
}
+
+ /* Mark low 16Mb of direct map NX if hardware supports it. */
+ if ( !cpu_has_nx )
+ return;
+
+ v = DIRECTMAP_VIRT_START + (1UL << 20);
+ l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[l3_table_offset(v)];
+ ASSERT(l3e_get_flags(l3e) & _PAGE_PRESENT);
+ do {
+ l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
+ ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
+ if ( l2e_get_flags(l2e) & _PAGE_PSE )
+ {
+ l2e_add_flags(l2e, _PAGE_NX_BIT);
+ l3e_to_l2e(l3e)[l2_table_offset(v)] = l2e;
+ v += 1 << L2_PAGETABLE_SHIFT;
+ }
+ else
+ {
+ l1_pgentry_t l1e = l2e_to_l1e(l2e)[l1_table_offset(v)];
+
+ ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
+ l1e_add_flags(l1e, _PAGE_NX_BIT);
+ l2e_to_l1e(l2e)[l1_table_offset(v)] = l1e;
+ v += 1 << L1_PAGETABLE_SHIFT;
+ }
+ } while ( v < DIRECTMAP_VIRT_START + (16UL << 20) );
}
long subarch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
@@ -984,28 +1008,10 @@ long subarch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
return mem_sharing_get_nr_shared_mfns();
case XENMEM_paging_op:
- {
- xen_mem_event_op_t meo;
- if ( copy_from_guest(&meo, arg, 1) )
- return -EFAULT;
- rc = do_mem_event_op(cmd, meo.domain, &meo);
- if ( !rc && __copy_to_guest(arg, &meo, 1) )
- return -EFAULT;
- break;
- }
+ return mem_paging_memop(guest_handle_cast(arg, xen_mem_paging_op_t));
case XENMEM_sharing_op:
- {
- xen_mem_sharing_op_t mso;
- if ( copy_from_guest(&mso, arg, 1) )
- return -EFAULT;
- if ( mso.op == XENMEM_sharing_op_audit )
- return mem_sharing_audit();
- rc = do_mem_event_op(cmd, mso.domain, &mso);
- if ( !rc && __copy_to_guest(arg, &mso, 1) )
- return -EFAULT;
- break;
- }
+ return mem_sharing_memop(guest_handle_cast(arg, xen_mem_sharing_op_t));
default:
rc = -ENOSYS;
@@ -1181,7 +1187,7 @@ int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
unsigned long mfn, idle_index;
int ret = 0;
- if (!is_pv_32on64_domain(d))
+ if (!is_pv_32bit_domain(d))
return 0;
if ( (addr < HYPERVISOR_COMPAT_VIRT_START(d)) ||
@@ -1190,7 +1196,7 @@ int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
mfn = (read_cr3()) >> PAGE_SHIFT;
- pl4e = map_domain_page(mfn);
+ pl4e = map_domain_page(_mfn(mfn));
l4e = pl4e[0];
@@ -1199,7 +1205,7 @@ int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
mfn = l4e_get_pfn(l4e);
/* We don't need get page type here since it is current CR3 */
- pl3e = map_domain_page(mfn);
+ pl3e = map_domain_page(_mfn(mfn));
l3e = pl3e[3];
@@ -1207,7 +1213,7 @@ int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
goto unmap;
mfn = l3e_get_pfn(l3e);
- pl2e = map_domain_page(mfn);
+ pl2e = map_domain_page(_mfn(mfn));
l2e = pl2e[l2_table_offset(addr)];
@@ -1240,7 +1246,7 @@ unmap:
void domain_set_alloc_bitsize(struct domain *d)
{
- if ( !is_pv_32on64_domain(d) ||
+ if ( !is_pv_32bit_domain(d) ||
(MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
d->arch.physaddr_bitsize > 0 )
return;
@@ -1343,7 +1349,8 @@ int mem_hotadd_check(unsigned long spfn, unsigned long epfn)
int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
{
struct mem_hotadd_info info;
- int ret, node;
+ int ret;
+ nodeid_t node;
unsigned long old_max = max_page, old_total = total_pages;
unsigned long old_node_start, old_node_span, orig_online;
unsigned long i;
@@ -1353,7 +1360,7 @@ int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
if ( !mem_hotadd_check(spfn, epfn) )
return -EINVAL;
- if ( (node = setup_node(pxm)) == -1 )
+ if ( (node = setup_node(pxm)) == NUMA_NO_NODE )
return -EINVAL;
if ( !valid_numa_range(spfn << PAGE_SHIFT, epfn << PAGE_SHIFT, node) )
@@ -1369,16 +1376,16 @@ int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), spfn,
min(epfn, i) - spfn, PAGE_HYPERVISOR);
if ( ret )
- return ret;
+ goto destroy_directmap;
}
if ( i < epfn )
{
if ( i < spfn )
i = spfn;
ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), i,
- epfn - i, __PAGE_HYPERVISOR);
+ epfn - i, __PAGE_HYPERVISOR_RW);
if ( ret )
- return ret;
+ goto destroy_directmap;
}
old_node_start = NODE_DATA(node)->node_start_pfn;
@@ -1401,7 +1408,6 @@ int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
NODE_DATA(node)->node_spanned_pages = epfn - node_start_pfn(node);
}
- ret = -EINVAL;
info.spfn = spfn;
info.epfn = epfn;
info.cur = spfn;
@@ -1424,7 +1430,7 @@ int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
if ( ret )
goto destroy_m2p;
- if ( !need_iommu(hardware_domain) )
+ if ( iommu_enabled && !iommu_passthrough && !need_iommu(hardware_domain) )
{
for ( i = spfn; i < epfn; i++ )
if ( iommu_map_page(hardware_domain, i, i, IOMMUF_readable|IOMMUF_writable) )
@@ -1438,9 +1444,8 @@ int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
}
/* We can't revert any more */
- transfer_pages_to_heap(&info);
-
share_hotadd_m2p_table(&info);
+ transfer_pages_to_heap(&info);
return 0;
@@ -1451,13 +1456,13 @@ destroy_m2p:
max_pdx = pfn_to_pdx(max_page - 1) + 1;
destroy_frametable:
cleanup_frame_table(&info);
- destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
- (unsigned long)mfn_to_virt(epfn));
-
if ( !orig_online )
node_set_offline(node);
NODE_DATA(node)->node_start_pfn = old_node_start;
NODE_DATA(node)->node_spanned_pages = old_node_span;
+ destroy_directmap:
+ destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
+ (unsigned long)mfn_to_virt(epfn));
return ret;
}
diff --git a/xen/arch/x86/x86_64/mmconfig.h b/xen/arch/x86/x86_64/mmconfig.h
index c447e5a..7537519 100644
--- a/xen/arch/x86/x86_64/mmconfig.h
+++ b/xen/arch/x86/x86_64/mmconfig.h
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Author: Allen Kay <allen.m.kay at intel.com> - adapted from linux
*/
diff --git a/xen/arch/x86/x86_64/mmconfig_64.c b/xen/arch/x86/x86_64/mmconfig_64.c
index 565e8c7..1f9a996 100644
--- a/xen/arch/x86/x86_64/mmconfig_64.c
+++ b/xen/arch/x86/x86_64/mmconfig_64.c
@@ -134,30 +134,10 @@ static void __iomem *mcfg_ioremap(const struct acpi_mcfg_allocation *cfg,
return (void __iomem *) virt;
}
-void arch_pci_ro_device(int seg, int bdf)
-{
- unsigned int idx, bus = PCI_BUS(bdf);
-
- for (idx = 0; idx < pci_mmcfg_config_num; ++idx) {
- const struct acpi_mcfg_allocation *cfg = pci_mmcfg_virt[idx].cfg;
- unsigned long mfn = (cfg->address >> PAGE_SHIFT) + bdf;
-
- if (!pci_mmcfg_virt[idx].virt || cfg->pci_segment != seg ||
- cfg->start_bus_number > bus || cfg->end_bus_number < bus)
- continue;
-
- if (rangeset_add_singleton(mmio_ro_ranges, mfn))
- printk(XENLOG_ERR
- "%04x:%02x:%02x.%u: could not mark MCFG (mfn %#lx) read-only\n",
- cfg->pci_segment, bus, PCI_SLOT(bdf), PCI_FUNC(bdf),
- mfn);
- }
-}
-
int pci_mmcfg_arch_enable(unsigned int idx)
{
const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
- const unsigned long *ro_map = pci_get_ro_map(cfg->pci_segment);
+ unsigned long start_mfn, end_mfn;
if (pci_mmcfg_virt[idx].virt)
return 0;
@@ -169,16 +149,15 @@ int pci_mmcfg_arch_enable(unsigned int idx)
}
printk(KERN_INFO "PCI: Using MCFG for segment %04x bus %02x-%02x\n",
cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
- if (ro_map) {
- unsigned int bdf = PCI_BDF(cfg->start_bus_number, 0, 0);
- unsigned int end = PCI_BDF(cfg->end_bus_number, -1, -1);
-
- while ((bdf = find_next_bit(ro_map, end + 1, bdf)) <= end) {
- arch_pci_ro_device(cfg->pci_segment, bdf);
- if (bdf++ == end)
- break;
- }
- }
+
+ start_mfn = PFN_DOWN(cfg->address) + PCI_BDF(cfg->start_bus_number, 0, 0);
+ end_mfn = PFN_DOWN(cfg->address) + PCI_BDF(cfg->end_bus_number, ~0, ~0);
+ if ( rangeset_add_range(mmio_ro_ranges, start_mfn, end_mfn) )
+ printk(XENLOG_ERR
+ "%04x:%02x-%02x: could not mark MCFG (mfns %lx-%lx) read-only\n",
+ cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number,
+ start_mfn, end_mfn);
+
return 0;
}
@@ -197,6 +176,28 @@ void pci_mmcfg_arch_disable(unsigned int idx)
cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
}
+bool_t pci_mmcfg_decode(unsigned long mfn, unsigned int *seg,
+ unsigned int *bdf)
+{
+ unsigned int idx;
+
+ for (idx = 0; idx < pci_mmcfg_config_num; ++idx) {
+ const struct acpi_mcfg_allocation *cfg = pci_mmcfg_virt[idx].cfg;
+
+ if (pci_mmcfg_virt[idx].virt &&
+ mfn >= PFN_DOWN(cfg->address) + PCI_BDF(cfg->start_bus_number,
+ 0, 0) &&
+ mfn <= PFN_DOWN(cfg->address) + PCI_BDF(cfg->end_bus_number,
+ ~0, ~0)) {
+ *seg = cfg->pci_segment;
+ *bdf = mfn - PFN_DOWN(cfg->address);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
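(Editorial note: the decode above relies on the MMCFG layout, where each bus/device/function gets exactly one 4KiB configuration page, so an MFN inside the region differs from PFN_DOWN of the base address by exactly the BDF. A worked standalone example of the same arithmetic, using a purely hypothetical base address:)

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PFN_DOWN(a) ((a) >> PAGE_SHIFT)

    int main(void)
    {
        uint64_t base = 0xe0000000;              /* hypothetical MCFG base */
        unsigned int bus = 3, dev = 2, fn = 1;
        unsigned int bdf = (bus << 8) | (dev << 3) | fn;
        uint64_t mfn = PFN_DOWN(base) + bdf;     /* config page for 03:02.1 */

        /* Inverting the mapping, as pci_mmcfg_decode() does: */
        printf("bdf = %#x\n", (unsigned int)(mfn - PFN_DOWN(base)));
        return 0;
    }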
int __init pci_mmcfg_arch_init(void)
{
int i;
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index 0040bef..0846a19 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -53,9 +53,11 @@ static void _show_registers(
printk("\nRFLAGS: %016lx ", regs->rflags);
if ( (context == CTXT_pv_guest) && v && v->vcpu_info )
printk("EM: %d ", !!vcpu_info(v, evtchn_upcall_mask));
- printk("CONTEXT: %s\n", context_names[context]);
+ printk("CONTEXT: %s", context_names[context]);
+ if ( v && !is_idle_vcpu(v) )
+ printk(" (%pv)", v);
- printk("rax: %016lx rbx: %016lx rcx: %016lx\n",
+ printk("\nrax: %016lx rbx: %016lx rcx: %016lx\n",
regs->rax, regs->rbx, regs->rcx);
printk("rdx: %016lx rsi: %016lx rdi: %016lx\n",
regs->rdx, regs->rsi, regs->rdi);
@@ -84,7 +86,7 @@ void show_registers(const struct cpu_user_regs *regs)
struct cpu_user_regs fault_regs = *regs;
unsigned long fault_crs[8];
enum context context;
- struct vcpu *v = current;
+ struct vcpu *v = system_state >= SYS_STATE_smp_boot ? current : NULL;
if ( guest_mode(regs) && has_hvm_container_vcpu(v) )
{
@@ -173,7 +175,7 @@ void show_page_walk(unsigned long addr)
if ( !is_canonical_address(addr) )
return;
- l4t = map_domain_page(mfn);
+ l4t = map_domain_page(_mfn(mfn));
l4e = l4t[l4_table_offset(addr)];
unmap_domain_page(l4t);
mfn = l4e_get_pfn(l4e);
@@ -185,7 +187,7 @@ void show_page_walk(unsigned long addr)
!mfn_valid(mfn) )
return;
- l3t = map_domain_page(mfn);
+ l3t = map_domain_page(_mfn(mfn));
l3e = l3t[l3_table_offset(addr)];
unmap_domain_page(l3t);
mfn = l3e_get_pfn(l3e);
@@ -199,7 +201,7 @@ void show_page_walk(unsigned long addr)
!mfn_valid(mfn) )
return;
- l2t = map_domain_page(mfn);
+ l2t = map_domain_page(_mfn(mfn));
l2e = l2t[l2_table_offset(addr)];
unmap_domain_page(l2t);
mfn = l2e_get_pfn(l2e);
@@ -213,7 +215,7 @@ void show_page_walk(unsigned long addr)
!mfn_valid(mfn) )
return;
- l1t = map_domain_page(mfn);
+ l1t = map_domain_page(_mfn(mfn));
l1e = l1t[l1_table_offset(addr)];
unmap_domain_page(l1t);
mfn = l1e_get_pfn(l1e);
@@ -291,8 +293,8 @@ unsigned long do_iret(void)
if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
sizeof(iret_saved))) )
{
- gdprintk(XENLOG_ERR, "Fault while reading IRET context from "
- "guest stack\n");
+ gprintk(XENLOG_ERR,
+ "Fault while reading IRET context from guest stack\n");
goto exit_and_crash;
}
@@ -301,8 +303,8 @@ unsigned long do_iret(void)
{
if ( unlikely(pagetable_is_null(v->arch.guest_table_user)) )
{
- gdprintk(XENLOG_ERR, "Guest switching to user mode with no "
- "user page tables\n");
+ gprintk(XENLOG_ERR,
+ "Guest switching to user mode with no user page tables\n");
goto exit_and_crash;
}
toggle_guest_mode(v);
@@ -331,75 +333,82 @@ unsigned long do_iret(void)
return iret_saved.rax;
exit_and_crash:
- gdprintk(XENLOG_ERR, "Fatal error\n");
domain_crash(v->domain);
return 0;
}
-static int write_stack_trampoline(
- char *stack, char *stack_bottom, uint16_t cs_seg)
+static unsigned int write_stub_trampoline(
+ unsigned char *stub, unsigned long stub_va,
+ unsigned long stack_bottom, unsigned long target_va)
{
- /* movq %rsp, saversp(%rip) */
- stack[0] = 0x48;
- stack[1] = 0x89;
- stack[2] = 0x25;
- *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;
-
- /* leaq saversp(%rip), %rsp */
- stack[7] = 0x48;
- stack[8] = 0x8d;
- stack[9] = 0x25;
- *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;
-
- /* pushq %r11 */
- stack[14] = 0x41;
- stack[15] = 0x53;
-
- /* pushq $<cs_seg> */
- stack[16] = 0x68;
- *(u32 *)&stack[17] = cs_seg;
-
- /* movq $syscall_enter,%r11 */
- stack[21] = 0x49;
- stack[22] = 0xbb;
- *(void **)&stack[23] = (void *)syscall_enter;
-
- /* jmpq *%r11 */
- stack[31] = 0x41;
- stack[32] = 0xff;
- stack[33] = 0xe3;
-
- return 34;
+ /* movabsq %rax, stack_bottom - 8 */
+ stub[0] = 0x48;
+ stub[1] = 0xa3;
+ *(uint64_t *)&stub[2] = stack_bottom - 8;
+
+ /* movq %rsp, %rax */
+ stub[10] = 0x48;
+ stub[11] = 0x89;
+ stub[12] = 0xe0;
+
+ /* movabsq $stack_bottom - 8, %rsp */
+ stub[13] = 0x48;
+ stub[14] = 0xbc;
+ *(uint64_t *)&stub[15] = stack_bottom - 8;
+
+ /* pushq %rax */
+ stub[23] = 0x50;
+
+ /* jmp target_va */
+ stub[24] = 0xe9;
+ *(int32_t *)&stub[25] = target_va - (stub_va + 29);
+
+ /* Round up to a multiple of 16 bytes. */
+ return 32;
}
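(Editorial note: the rel32 displacement written at stub[25] is relative to the end of the 5-byte JMP, i.e. stub_va + 24 + 5 = stub_va + 29. A minimal standalone sketch of that encoding, assuming only standard C and the x86-64 JMP rel32 format:)

    #include <stdint.h>
    #include <string.h>

    /* Emit "jmp target" at address 'at'; the displacement is relative to
     * the end of the 5-byte instruction (opcode + 4 displacement bytes). */
    unsigned int emit_jmp_rel32(uint8_t *buf, uint64_t at, uint64_t target)
    {
        int32_t disp = (int32_t)(target - (at + 5));

        buf[0] = 0xe9;              /* JMP rel32 */
        memcpy(&buf[1], &disp, 4);  /* little-endian displacement */
        return 5;                   /* bytes emitted */
    }

    int main(void)
    {
        uint8_t buf[5];
        /* e.g. a stub at 0x1000 jumping to 0x2000 => disp = 0x2000 - 0x1005 */
        return emit_jmp_rel32(buf, 0x1000, 0x2000) == 5 ? 0 : 1;
    }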
+DEFINE_PER_CPU(struct stubs, stubs);
+void lstar_enter(void);
+void cstar_enter(void);
+
void __devinit subarch_percpu_traps_init(void)
{
- char *stack_bottom, *stack;
-
- stack_bottom = (char *)get_stack_bottom();
- stack = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
+ unsigned long stack_bottom = get_stack_bottom();
+ unsigned long stub_va = this_cpu(stubs.addr);
+ unsigned char *stub_page;
+ unsigned int offset;
/* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);
- /* Trampoline for SYSCALL entry from long mode. */
- stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
- wrmsrl(MSR_LSTAR, (unsigned long)stack);
- stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);
+ stub_page = map_domain_page(_mfn(this_cpu(stubs.mfn)));
+
+ /* Trampoline for SYSCALL entry from 64-bit mode. */
+ wrmsrl(MSR_LSTAR, stub_va);
+ offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+ stub_va, stack_bottom,
+ (unsigned long)lstar_enter);
+ stub_va += offset;
if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR )
{
/* SYSENTER entry. */
- wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
+ wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
}
/* Trampoline for SYSCALL entry from compatibility mode. */
- stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
- wrmsrl(MSR_CSTAR, (unsigned long)stack);
- stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);
+ wrmsrl(MSR_CSTAR, stub_va);
+ offset += write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+ stub_va, stack_bottom,
+ (unsigned long)cstar_enter);
+
+ /* Don't consume more than half of the stub space here. */
+ ASSERT(offset <= STUB_BUF_SIZE / 2);
+
+ unmap_domain_page(stub_page);
/* Common SYSCALL parameters. */
wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c
index 79b4ab3..28132b5 100644
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -9,6 +9,7 @@
* Keir Fraser <keir at xen.org>
*/
+#include <xen/domain_page.h>
#include <asm/x86_emulate.h>
#include <asm/asm_defns.h> /* mark_regs_dirty() */
#include <asm/processor.h> /* current_cpu_info */
@@ -17,8 +18,23 @@
/* Avoid namespace pollution. */
#undef cmpxchg
#undef cpuid
+#undef wbinvd
#define cpu_has_amd_erratum(nr) \
cpu_has_amd_erratum(¤t_cpu_data, AMD_ERRATUM_##nr)
+#define get_stub(stb) ({ \
+ BUILD_BUG_ON(STUB_BUF_SIZE / 2 < MAX_INST_LEN + 1); \
+ (stb).addr = this_cpu(stubs.addr) + STUB_BUF_SIZE / 2; \
+ ((stb).ptr = map_domain_page(_mfn(this_cpu(stubs.mfn)))) + \
+ ((stb).addr & ~PAGE_MASK); \
+})
+#define put_stub(stb) ({ \
+ if ( (stb).ptr ) \
+ { \
+ unmap_domain_page((stb).ptr); \
+ (stb).ptr = NULL; \
+ } \
+})
+
#include "x86_emulate/x86_emulate.c"
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
index 656a06f..f1454ce 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -17,8 +17,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/* Operand sizes: 8-bit operands or specified/overridden size. */
@@ -313,17 +312,11 @@ struct operand {
enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
unsigned int bytes;
- /* Up to 128-byte operand value, addressable as ulong or uint32_t[]. */
- union {
- unsigned long val;
- uint32_t bigval[4];
- };
+ /* Operand value. */
+ unsigned long val;
- /* Up to 128-byte operand value, addressable as ulong or uint32_t[]. */
- union {
- unsigned long orig_val;
- uint32_t orig_bigval[4];
- };
+ /* Original operand value. */
+ unsigned long orig_val;
/* OP_REG: Pointer to register field. */
unsigned long *reg;
@@ -438,7 +431,7 @@ typedef union {
/* Before executing instruction: restore necessary bits in EFLAGS. */
#define _PRE_EFLAGS(_sav, _msk, _tmp) \
/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
-"movl %"_sav",%"_LO32 _tmp"; " \
+"movl %"_LO32 _sav",%"_LO32 _tmp"; " \
"push %"_tmp"; " \
"push %"_tmp"; " \
"movl %"_msk",%"_LO32 _tmp"; " \
@@ -458,7 +451,7 @@ typedef union {
"pushf; " \
"pop %"_tmp"; " \
"andl %"_msk",%"_LO32 _tmp"; " \
-"orl %"_LO32 _tmp",%"_sav"; "
+"orl %"_LO32 _tmp",%"_LO32 _sav"; "
/* Raw emulation: instruction has two explicit operands. */
#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy)\
@@ -470,18 +463,16 @@ do{ unsigned long _tmp; \
_PRE_EFLAGS("0","4","2") \
_op"w %"_wx"3,%1; " \
_POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : _wy ((_src).val), "i" (EFLAGS_MASK), \
- "m" (_eflags), "m" ((_dst).val) ); \
+ : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp) \
+ : _wy ((_src).val), "i" (EFLAGS_MASK) ); \
break; \
case 4: \
asm volatile ( \
_PRE_EFLAGS("0","4","2") \
_op"l %"_lx"3,%1; " \
_POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : _ly ((_src).val), "i" (EFLAGS_MASK), \
- "m" (_eflags), "m" ((_dst).val) ); \
+ : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp) \
+ : _ly ((_src).val), "i" (EFLAGS_MASK) ); \
break; \
case 8: \
__emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy); \
@@ -497,9 +488,8 @@ do{ unsigned long _tmp; \
_PRE_EFLAGS("0","4","2") \
_op"b %"_bx"3,%1; " \
_POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : _by ((_src).val), "i" (EFLAGS_MASK), \
- "m" (_eflags), "m" ((_dst).val) ); \
+ : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp) \
+ : _by ((_src).val), "i" (EFLAGS_MASK) ); \
break; \
default: \
__emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy);\
@@ -529,24 +519,24 @@ do{ unsigned long _tmp; \
_PRE_EFLAGS("0","3","2") \
_op"b %1; " \
_POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \
+ : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK) ); \
break; \
case 2: \
asm volatile ( \
_PRE_EFLAGS("0","3","2") \
_op"w %1; " \
_POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \
+ : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK) ); \
break; \
case 4: \
asm volatile ( \
_PRE_EFLAGS("0","3","2") \
_op"l %1; " \
_POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \
+ : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK) ); \
break; \
case 8: \
__emulate_1op_8byte(_op, _dst, _eflags); \
@@ -561,17 +551,16 @@ do{ asm volatile ( \
_PRE_EFLAGS("0","4","2") \
_op"q %"_qx"3,%1; " \
_POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : _qy ((_src).val), "i" (EFLAGS_MASK), \
- "m" (_eflags), "m" ((_dst).val) ); \
+ : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp) \
+ : _qy ((_src).val), "i" (EFLAGS_MASK) ); \
} while (0)
#define __emulate_1op_8byte(_op, _dst, _eflags) \
do{ asm volatile ( \
_PRE_EFLAGS("0","3","2") \
_op"q %1; " \
_POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \
+ : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK) ); \
} while (0)
#elif defined(__i386__)
#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
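(Editorial note: the constraint rewrite in the hunks above replaces separate "=m" outputs paired with dummy "m" inputs by single read-write "+g"/"+m" operands. A minimal standalone illustration of the read-write constraint form, x86-64 and GCC extended asm assumed:)

    #include <stdio.h>

    int main(void)
    {
        unsigned long val = 41;

        /* "+r": val is both consumed and produced by the asm. */
        asm volatile ( "incq %0" : "+r" (val) : : "cc" );
        printf("%lu\n", val);   /* 42 */
        return 0;
    }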
@@ -583,7 +572,8 @@ do{ asm volatile ( \
({ unsigned long _x = 0, _eip = _regs.eip; \
if ( !mode_64bit() ) _eip = (uint32_t)_eip; /* ignore upper dword */ \
_regs.eip += (_size); /* real hardware doesn't truncate */ \
- generate_exception_if((uint8_t)(_regs.eip - ctxt->regs->eip) > 15, \
+ generate_exception_if((uint8_t)(_regs.eip - \
+ ctxt->regs->eip) > MAX_INST_LEN, \
EXC_GP, 0); \
rc = ops->insn_fetch(x86_seg_cs, _eip, &_x, (_size), ctxt); \
if ( rc ) goto done; \
@@ -679,10 +669,14 @@ do{ (_fic)->exn_raised = 0; \
rc = ops->get_fpu(fpu_handle_exception, _fic, _type, ctxt); \
if ( rc ) goto done; \
} while (0)
-#define put_fpu(_fic) \
-do{ \
+#define _put_fpu() \
+do { \
if ( ops->put_fpu != NULL ) \
- ops->put_fpu(ctxt); \
+ (ops->put_fpu)(ctxt); \
+} while (0)
+#define put_fpu(_fic) \
+do { \
+ _put_fpu(); \
generate_exception_if((_fic)->exn_raised, EXC_MF, -1); \
} while (0)
@@ -722,11 +716,15 @@ do{ struct fpu_insn_ctxt fic; \
} while (0)
#define emulate_fpu_insn_stub(_bytes...) \
-do{ uint8_t stub[] = { _bytes, 0xc3 }; \
- struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 }; \
+do { \
+ uint8_t *buf = get_stub(stub); \
+ unsigned int _nr = sizeof((uint8_t[]){ _bytes }); \
+ struct fpu_insn_ctxt fic = { .insn_bytes = _nr }; \
+ memcpy(buf, ((uint8_t[]){ _bytes, 0xc3 }), _nr + 1); \
get_fpu(X86EMUL_FPU_fpu, &fic); \
- (*(void(*)(void))stub)(); \
+ stub.func(); \
put_fpu(&fic); \
+ put_stub(stub); \
} while (0)
static unsigned long _get_rep_prefix(
@@ -815,10 +813,9 @@ static int read_ulong(
*/
static bool_t mul_dbl(unsigned long m[2])
{
- bool_t rc;
- asm ( "mul %4; seto %b2"
- : "=a" (m[0]), "=d" (m[1]), "=q" (rc)
- : "0" (m[0]), "1" (m[1]), "2" (0) );
+ bool_t rc = 0;
+ asm ( "mul %1; seto %b2"
+ : "+a" (m[0]), "+d" (m[1]), "+q" (rc) );
return rc;
}
@@ -829,10 +826,9 @@ static bool_t mul_dbl(unsigned long m[2])
*/
static bool_t imul_dbl(unsigned long m[2])
{
- bool_t rc;
- asm ( "imul %4; seto %b2"
- : "=a" (m[0]), "=d" (m[1]), "=q" (rc)
- : "0" (m[0]), "1" (m[1]), "2" (0) );
+ bool_t rc = 0;
+ asm ( "imul %1; seto %b2"
+ : "+a" (m[0]), "+d" (m[1]), "+q" (rc) );
return rc;
}
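(Editorial note: the rewritten asm relies on mul's implicit operands: RDX:RAX = RAX * r/m64, with OF/CF set when the upper half is non-zero, which seto then captures. A standalone version of the same pattern, x86-64 and GCC extended asm assumed:)

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned long m0 = 1UL << 63, m1 = 2;   /* RAX and RDX respectively */
        uint8_t rc = 0;

        asm ( "mul %1; seto %b2"
              : "+a" (m0), "+d" (m1), "+q" (rc) : : "cc" );
        printf("lo=%#lx hi=%#lx overflow=%u\n", m0, m1, rc);  /* 0, 1, 1 */
        return 0;
    }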
@@ -846,9 +842,7 @@ static bool_t div_dbl(unsigned long u[2], unsigned long v)
{
if ( (v == 0) || (u[1] >= v) )
return 1;
- asm ( "div %4"
- : "=a" (u[0]), "=d" (u[1])
- : "0" (u[0]), "1" (u[1]), "r" (v) );
+ asm ( "div %2" : "+a" (u[0]), "+d" (u[1]) : "r" (v) );
return 0;
}
@@ -1467,6 +1461,7 @@ x86_emulate(
struct operand src = { .reg = REG_POISON };
struct operand dst = { .reg = REG_POISON };
enum x86_swint_type swint_type;
+ struct x86_emulate_stub stub = {};
DECLARE_ALIGNED(mmval_t, mmval);
/*
* Data operand effective address (usually computed from ModRM).
@@ -2573,15 +2568,25 @@ x86_emulate(
}
case 0xaa ... 0xab: /* stos */ {
- /* unsigned long max_reps = */get_rep_prefix();
- dst.type = OP_MEM;
+ unsigned long nr_reps = get_rep_prefix();
dst.bytes = (d & ByteOp) ? 1 : op_bytes;
dst.mem.seg = x86_seg_es;
dst.mem.off = truncate_ea(_regs.edi);
- dst.val = _regs.eax;
+ if ( (nr_reps == 1) || !ops->rep_stos ||
+ ((rc = ops->rep_stos(&_regs.eax,
+ dst.mem.seg, dst.mem.off, dst.bytes,
+ &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+ {
+ dst.val = _regs.eax;
+ dst.type = OP_MEM;
+ nr_reps = 1;
+ }
+ else if ( rc != X86EMUL_OKAY )
+ goto done;
register_address_increment(
- _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
- put_rep_prefix(1);
+ _regs.edi,
+ nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes));
+ put_rep_prefix(nr_reps);
break;
}
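(Editorial note: when the rep_stos hook handles all repetitions at once, %rdi must advance by the full nr_reps * dst.bytes, negatively if EFLAGS.DF is set. A standalone sketch of that arithmetic, not Xen code:)

    #include <stdio.h>

    int main(void)
    {
        unsigned long edi = 0x1000, nr_reps = 4;
        unsigned int bytes = 4;                      /* stosl */
        int df = 0;                                  /* EFLAGS.DF clear */

        edi += nr_reps * (df ? -(long)bytes : bytes);
        printf("edi = %#lx\n", edi);                 /* 0x1010 */
        return 0;
    }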
@@ -3780,6 +3785,7 @@ x86_emulate(
break;
}
+ no_writeback:
/* Inject #DB if single-step tracing was enabled at instruction start. */
if ( (ctxt->regs->eflags & EFLG_TF) && (rc == X86EMUL_OKAY) &&
(ops->inject_hw_exception != NULL) )
@@ -3790,6 +3796,8 @@ x86_emulate(
*ctxt->regs = _regs;
done:
+ _put_fpu();
+ put_stub(stub);
return rc;
twobyte_insn:
@@ -3808,19 +3816,17 @@ x86_emulate(
struct segment_register reg;
unsigned long base, limit, cr0, cr0w;
- if ( modrm == 0xdf ) /* invlpga */
+ switch( modrm )
{
+ case 0xdf: /* invlpga */
generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1);
generate_exception_if(!mode_ring0(), EXC_GP, 0);
fail_if(ops->invlpg == NULL);
if ( (rc = ops->invlpg(x86_seg_none, truncate_ea(_regs.eax),
ctxt)) )
goto done;
- break;
- }
-
- if ( modrm == 0xf9 ) /* rdtscp */
- {
+ goto no_writeback;
+ case 0xf9: /* rdtscp */ {
uint64_t tsc_aux;
fail_if(ops->read_msr == NULL);
if ( (rc = ops->read_msr(MSR_TSC_AUX, &tsc_aux, ctxt)) != 0 )
@@ -3828,6 +3834,14 @@ x86_emulate(
_regs.ecx = (uint32_t)tsc_aux;
goto rdtsc;
}
+ case 0xd4: /* vmfunc */
+ generate_exception_if(lock_prefix | rep_prefix() | (vex.pfx == vex_66),
+ EXC_UD, -1);
+ fail_if(ops->vmfunc == NULL);
+ if ( (rc = ops->vmfunc(ctxt)) != X86EMUL_OKAY )
+ goto done;
+ goto no_writeback;
+ }
switch ( modrm_reg & 7 )
{
@@ -4005,9 +4019,15 @@ x86_emulate(
/* {,v}movss xmm,xmm/m32 */
/* {,v}movsd xmm,xmm/m64 */
{
- uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 };
- struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
-
+ uint8_t *buf = get_stub(stub);
+ struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
+
+ buf[0] = 0x3e;
+ buf[1] = 0x3e;
+ buf[2] = 0x0f;
+ buf[3] = b;
+ buf[4] = modrm;
+ buf[5] = 0xc3;
if ( vex.opcx == vex_none )
{
if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
@@ -4015,7 +4035,7 @@ x86_emulate(
else
vcpu_must_have_sse();
ea.bytes = 16;
- SET_SSE_PREFIX(stub[0], vex.pfx);
+ SET_SSE_PREFIX(buf[0], vex.pfx);
get_fpu(X86EMUL_FPU_xmm, &fic);
}
else
@@ -4042,15 +4062,16 @@ x86_emulate(
/* convert memory operand to (%rAX) */
rex_prefix &= ~REX_B;
vex.b = 1;
- stub[4] &= 0x38;
+ buf[4] &= 0x38;
}
if ( !rc )
{
- copy_REX_VEX(stub, rex_prefix, vex);
- asm volatile ( "call *%0" : : "r" (stub), "a" (mmvalp)
+ copy_REX_VEX(buf, rex_prefix, vex);
+ asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
: "memory" );
}
put_fpu(&fic);
+ put_stub(stub);
if ( !rc && (b & 1) && (ea.type == OP_MEM) )
rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
ea.bytes, ctxt);
@@ -4240,9 +4261,15 @@ x86_emulate(
/* {,v}movdq{a,u} xmm,xmm/m128 */
/* vmovdq{a,u} ymm,ymm/m256 */
{
- uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 };
- struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
-
+ uint8_t *buf = get_stub(stub);
+ struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
+
+ buf[0] = 0x3e;
+ buf[1] = 0x3e;
+ buf[2] = 0x0f;
+ buf[3] = b;
+ buf[4] = modrm;
+ buf[5] = 0xc3;
if ( vex.opcx == vex_none )
{
switch ( vex.pfx )
@@ -4250,7 +4277,7 @@ x86_emulate(
case vex_66:
case vex_f3:
vcpu_must_have_sse2();
- stub[0] = 0x66; /* movdqa */
+ buf[0] = 0x66; /* movdqa */
get_fpu(X86EMUL_FPU_xmm, &fic);
ea.bytes = 16;
break;
@@ -4286,15 +4313,16 @@ x86_emulate(
/* convert memory operand to (%rAX) */
rex_prefix &= ~REX_B;
vex.b = 1;
- stub[4] &= 0x38;
+ buf[4] &= 0x38;
}
if ( !rc )
{
- copy_REX_VEX(stub, rex_prefix, vex);
- asm volatile ( "call *%0" : : "r" (stub), "a" (mmvalp)
+ copy_REX_VEX(buf, rex_prefix, vex);
+ asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
: "memory" );
}
put_fpu(&fic);
+ put_stub(stub);
if ( !rc && (b != 0x6f) && (ea.type == OP_MEM) )
rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
ea.bytes, ctxt);
@@ -4346,7 +4374,8 @@ x86_emulate(
/* Save real source value, then compare EAX against destination. */
src.orig_val = src.val;
src.val = _regs.eax;
- emulate_2op_SrcV("cmp", src, dst, _regs.eflags);
+ /* cmp: %%eax - dst ==> dst and src swapped for macro invocation */
+ emulate_2op_SrcV("cmp", dst, src, _regs.eflags);
if ( _regs.eflags & EFLG_ZF )
{
/* Success: write back to memory. */
@@ -4634,5 +4663,7 @@ x86_emulate(
goto writeback;
cannot_emulate:
+ _put_fpu();
+ put_stub(stub);
return X86EMUL_UNHANDLEABLE;
}
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h
index b059341..cfac09b 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -17,13 +17,14 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __X86_EMULATE_H__
#define __X86_EMULATE_H__
+#define MAX_INST_LEN 15
+
struct x86_emulate_ctxt;
/* Comprehensive enumeration of x86 segment registers. */
@@ -241,6 +242,20 @@ struct x86_emulate_ops
struct x86_emulate_ctxt *ctxt);
/*
+ * rep_stos: Emulate STOS: <*p_data> -> <seg:offset>.
+ * @bytes_per_rep: [IN ] Bytes transferred per repetition.
+ * @reps: [IN ] Maximum repetitions to be emulated.
+ * [OUT] Number of repetitions actually emulated.
+ */
+ int (*rep_stos)(
+ void *p_data,
+ enum x86_segment seg,
+ unsigned long offset,
+ unsigned int bytes_per_rep,
+ unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt);
+
+ /*
* read_segment: Emulate a read of full context of a segment register.
* @reg: [OUT] Contents of segment register (visible and hidden state).
*/
@@ -368,7 +383,11 @@ struct x86_emulate_ops
enum x86_emulate_fpu_type type,
struct x86_emulate_ctxt *ctxt);
- /* put_fpu: Relinquish the FPU. Unhook from FPU/SIMD exception handlers. */
+ /*
+ * put_fpu: Relinquish the FPU. Unhook from FPU/SIMD exception handlers.
+ * The handler, if installed, must be prepared to be called without
+ * get_fpu having been called first!
+ */
void (*put_fpu)(
struct x86_emulate_ctxt *ctxt);
@@ -377,6 +396,10 @@ struct x86_emulate_ops
enum x86_segment seg,
unsigned long offset,
struct x86_emulate_ctxt *ctxt);
+
+ /* vmfunc: Emulate VMFUNC for the given EAX/ECX inputs. */
+ int (*vmfunc)(
+ struct x86_emulate_ctxt *ctxt);
};
struct cpu_user_regs;
@@ -409,6 +432,19 @@ struct x86_emulate_ctxt
} retire;
};
+struct x86_emulate_stub {
+ union {
+ void (*func)(void);
+ uintptr_t addr;
+ };
+#ifdef __XEN__
+ void *ptr;
+#else
+ /* Room for one insn and a (single byte) RET. */
+ uint8_t buf[MAX_INST_LEN + 1];
+#endif
+};
+
/*
* x86_emulate: Emulate an instruction.
* Returns -1 on failure, 0 on success.
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index d4b1f1a..6553cff 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -38,7 +38,7 @@ SECTIONS
. = __XEN_VIRT_START;
__image_base__ = .;
#endif
- . = __XEN_VIRT_START + 0x100000;
+ . = __XEN_VIRT_START + MB(1);
_start = .;
.text : {
_stext = .; /* Text and read-only data */
@@ -175,6 +175,7 @@ SECTIONS
*(.bss.percpu.read_mostly)
. = ALIGN(SMP_CACHE_BYTES);
__per_cpu_data_end = .;
+ __bss_end = .;
} :text
_end = . ;
@@ -186,7 +187,7 @@ SECTIONS
/* Trick the linker into setting the image size to exactly 16Mb. */
. = ALIGN(__section_alignment__);
.pad : {
- . = ALIGN(0x1000000);
+ . = ALIGN(MB(16));
} :text
#else
efi = .;
@@ -197,6 +198,8 @@ SECTIONS
*(.exit.text)
*(.exit.data)
*(.exitcall.exit)
+ *(.discard)
+ *(.discard.*)
*(.eh_frame)
#ifdef EFI
*(.comment)
@@ -214,4 +217,8 @@ SECTIONS
.comment 0 : { *(.comment) }
}
+ASSERT(__image_base__ > XEN_VIRT_START ||
+ _end <= XEN_VIRT_END - NR_CPUS * PAGE_SIZE,
+ "Xen image overlaps stubs area")
ASSERT(kexec_reloc_size - kexec_reloc <= PAGE_SIZE, "kexec_reloc is too large")
+ASSERT((cpu0_stack & (STACK_SIZE - 1)) == 0, "cpu0_stack misaligned")
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 8391246..3fdf931 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -9,22 +9,28 @@ obj-y += event_2l.o
obj-y += event_channel.o
obj-y += event_fifo.o
obj-y += grant_table.o
+obj-y += guestcopy.o
obj-y += irq.o
obj-y += kernel.o
obj-y += keyhandler.o
obj-$(HAS_KEXEC) += kexec.o
obj-$(HAS_KEXEC) += kimage.o
obj-y += lib.o
+obj-y += lzo.o
+obj-$(HAS_MEM_ACCESS) += mem_access.o
obj-y += memory.o
obj-y += multicall.o
obj-y += notifier.o
obj-y += page_alloc.o
+obj-$(HAS_PDX) += pdx.o
obj-y += preempt.o
obj-y += random.o
obj-y += rangeset.o
+obj-y += radix-tree.o
+obj-y += rbtree.o
+obj-y += rcupdate.o
obj-y += sched_credit.o
obj-y += sched_credit2.o
-obj-y += sched_sedf.o
obj-y += sched_arinc653.o
obj-y += sched_rt.o
obj-y += schedule.o
@@ -40,21 +46,15 @@ obj-y += sysctl.o
obj-y += tasklet.o
obj-y += time.o
obj-y += timer.o
+obj-y += tmem.o
+obj-y += tmem_xen.o
obj-y += trace.o
obj-y += version.o
+obj-y += vm_event.o
obj-y += vmap.o
obj-y += vsprintf.o
obj-y += wait.o
obj-y += xmalloc_tlsf.o
-obj-y += rcupdate.o
-obj-y += tmem.o
-obj-y += tmem_xen.o
-obj-y += radix-tree.o
-obj-y += rbtree.o
-obj-y += lzo.o
-obj-$(HAS_PDX) += pdx.o
-obj-$(HAS_MEM_ACCESS) += mem_access.o
-obj-$(HAS_MEM_ACCESS) += mem_event.o
obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma unlzo unlz4 earlycpio,$(n).init.o)
@@ -62,8 +62,6 @@ obj-$(perfc) += perfc.o
obj-$(crash_debug) += gdbstub.o
obj-$(xenoprof) += xenoprof.o
-obj-$(CONFIG_XENCOMM) += xencomm.o
-
subdir-$(CONFIG_COMPAT) += compat
subdir-$(x86_64) += hvm
diff --git a/xen/common/compat/domain.c b/xen/common/compat/domain.c
index b4be3b3..3ca4ef7 100644
--- a/xen/common/compat/domain.c
+++ b/xen/common/compat/domain.c
@@ -23,15 +23,12 @@ CHECK_SIZE_(struct, vcpu_info);
CHECK_vcpu_register_vcpu_info;
#undef xen_vcpu_register_vcpu_info
-int compat_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
+int compat_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
{
struct domain *d = current->domain;
struct vcpu *v;
int rc = 0;
- if ( (vcpuid < 0) || (vcpuid >= MAX_VIRT_CPUS) )
- return -EINVAL;
-
if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
return -ENOENT;
@@ -59,7 +56,7 @@ int compat_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
domain_unlock(d);
if ( rc == -ERESTART )
- rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
+ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iuh",
cmd, vcpuid, arg);
xfree(cmp_ctxt);
diff --git a/xen/common/compat/kernel.c b/xen/common/compat/kernel.c
index 6a1e41f..65cc25b 100644
--- a/xen/common/compat/kernel.c
+++ b/xen/common/compat/kernel.c
@@ -41,6 +41,11 @@ CHECK_TYPE(domain_handle);
#define xennmi_callback compat_nmi_callback
#define xennmi_callback_t compat_nmi_callback_t
+#ifdef COMPAT_VM_ASSIST_VALID
+#undef VM_ASSIST_VALID
+#define VM_ASSIST_VALID COMPAT_VM_ASSIST_VALID
+#endif
+
#define DO(fn) int compat_##fn
#define COMPAT
diff --git a/xen/common/compat/memory.c b/xen/common/compat/memory.c
index b258138..002948b 100644
--- a/xen/common/compat/memory.c
+++ b/xen/common/compat/memory.c
@@ -17,6 +17,42 @@ CHECK_TYPE(domid);
CHECK_mem_access_op;
CHECK_vmemrange;
+#ifdef HAS_PASSTHROUGH
+struct get_reserved_device_memory {
+ struct compat_reserved_device_memory_map map;
+ unsigned int used_entries;
+};
+
+static int get_reserved_device_memory(xen_pfn_t start, xen_ulong_t nr,
+ u32 id, void *ctxt)
+{
+ struct get_reserved_device_memory *grdm = ctxt;
+ u32 sbdf = PCI_SBDF3(grdm->map.dev.pci.seg, grdm->map.dev.pci.bus,
+ grdm->map.dev.pci.devfn);
+
+ if ( !(grdm->map.flags & XENMEM_RDM_ALL) && (sbdf != id) )
+ return 0;
+
+ if ( grdm->used_entries < grdm->map.nr_entries )
+ {
+ struct compat_reserved_device_memory rdm = {
+ .start_pfn = start, .nr_pages = nr
+ };
+
+ if ( rdm.start_pfn != start || rdm.nr_pages != nr )
+ return -ERANGE;
+
+ if ( __copy_to_compat_offset(grdm->map.buffer, grdm->used_entries,
+ &rdm, 1) )
+ return -EFAULT;
+ }
+
+ ++grdm->used_entries;
+
+ return 1;
+}
+#endif
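(Editorial note: the rdm.start_pfn != start check above catches silent truncation when a 64-bit value is stored into a narrower compat field. The same idiom in isolation, standalone and with a hypothetical 32-bit compat field:)

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t start = 0x1ffffffffULL;   /* PFN needing more than 32 bits */
        uint32_t compat_start = start;     /* narrower compat layout */

        if (compat_start != start)
            puts("-ERANGE: value does not round-trip through compat field");
        return 0;
    }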
+
int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) compat)
{
int split, op = cmd & MEMOP_CMD_MASK;
@@ -303,6 +339,35 @@ int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) compat)
break;
}
+#ifdef HAS_PASSTHROUGH
+ case XENMEM_reserved_device_memory_map:
+ {
+ struct get_reserved_device_memory grdm;
+
+ if ( unlikely(start_extent) )
+ return -ENOSYS;
+
+ if ( copy_from_guest(&grdm.map, compat, 1) ||
+ !compat_handle_okay(grdm.map.buffer, grdm.map.nr_entries) )
+ return -EFAULT;
+
+ if ( grdm.map.flags & ~XENMEM_RDM_ALL )
+ return -EINVAL;
+
+ grdm.used_entries = 0;
+ rc = iommu_get_reserved_device_memory(get_reserved_device_memory,
+ &grdm);
+
+ if ( !rc && grdm.map.nr_entries < grdm.used_entries )
+ rc = -ENOBUFS;
+ grdm.map.nr_entries = grdm.used_entries;
+ if ( __copy_to_guest(compat, &grdm.map, 1) )
+ rc = -EFAULT;
+
+ return rc;
+ }
+#endif
+
default:
return compat_arch_memory_op(cmd, compat);
}
diff --git a/xen/common/compat/tmem_xen.c b/xen/common/compat/tmem_xen.c
index 97c7ff2..db08005 100644
--- a/xen/common/compat/tmem_xen.c
+++ b/xen/common/compat/tmem_xen.c
@@ -11,9 +11,7 @@
#include <xen/hypercall.h>
#include <compat/tmem.h>
-#define xen_tmem_op tmem_op
-/*CHECK_tmem_op;*/
-#undef xen_tmem_op
+CHECK_tmem_oid;
/*
* Local variables:
diff --git a/xen/common/core_parking.c b/xen/common/core_parking.c
index 3190fb7..de269e0 100644
--- a/xen/common/core_parking.c
+++ b/xen/common/core_parking.c
@@ -75,11 +75,10 @@ static unsigned int core_parking_performance(unsigned int event)
if ( core_weight < core_tmp )
{
core_weight = core_tmp;
- cpumask_clear(&core_candidate_map);
- cpumask_set_cpu(cpu, &core_candidate_map);
+ cpumask_copy(&core_candidate_map, cpumask_of(cpu));
}
else if ( core_weight == core_tmp )
- cpumask_set_cpu(cpu, &core_candidate_map);
+ __cpumask_set_cpu(cpu, &core_candidate_map);
}
for_each_cpu(cpu, &core_candidate_map)
@@ -88,11 +87,10 @@ static unsigned int core_parking_performance(unsigned int event)
if ( sibling_weight < sibling_tmp )
{
sibling_weight = sibling_tmp;
- cpumask_clear(&sibling_candidate_map);
- cpumask_set_cpu(cpu, &sibling_candidate_map);
+ cpumask_copy(&sibling_candidate_map, cpumask_of(cpu));
}
else if ( sibling_weight == sibling_tmp )
- cpumask_set_cpu(cpu, &sibling_candidate_map);
+ __cpumask_set_cpu(cpu, &sibling_candidate_map);
}
cpu = cpumask_first(&sibling_candidate_map);
@@ -135,11 +133,10 @@ static unsigned int core_parking_power(unsigned int event)
if ( core_weight > core_tmp )
{
core_weight = core_tmp;
- cpumask_clear(&core_candidate_map);
- cpumask_set_cpu(cpu, &core_candidate_map);
+ cpumask_copy(&core_candidate_map, cpumask_of(cpu));
}
else if ( core_weight == core_tmp )
- cpumask_set_cpu(cpu, &core_candidate_map);
+ __cpumask_set_cpu(cpu, &core_candidate_map);
}
for_each_cpu(cpu, &core_candidate_map)
@@ -148,11 +145,10 @@ static unsigned int core_parking_power(unsigned int event)
if ( sibling_weight > sibling_tmp )
{
sibling_weight = sibling_tmp;
- cpumask_clear(&sibling_candidate_map);
- cpumask_set_cpu(cpu, &sibling_candidate_map);
+ cpumask_copy(&sibling_candidate_map, cpumask_of(cpu));
}
else if ( sibling_weight == sibling_tmp )
- cpumask_set_cpu(cpu, &sibling_candidate_map);
+ __cpumask_set_cpu(cpu, &sibling_candidate_map);
}
cpu = cpumask_first(&sibling_candidate_map);
diff --git a/xen/common/cpu.c b/xen/common/cpu.c
index 630881e..497e0cb 100644
--- a/xen/common/cpu.c
+++ b/xen/common/cpu.c
@@ -187,12 +187,12 @@ int disable_nonboot_cpus(void)
if ( (error = cpu_down(cpu)) )
{
- BUG_ON(error == -EBUSY);
printk("Error taking CPU%d down: %d\n", cpu, error);
+ BUG_ON(error == -EBUSY);
break;
}
- cpumask_set_cpu(cpu, &frozen_cpus);
+ __cpumask_set_cpu(cpu, &frozen_cpus);
}
BUG_ON(!error && (num_online_cpus() != 1));
@@ -209,8 +209,8 @@ void enable_nonboot_cpus(void)
{
if ( (error = cpu_up(cpu)) )
{
+ printk("Error bringing CPU%d up: %d\n", cpu, error);
BUG_ON(error == -EBUSY);
- printk("Error taking CPU%d up: %d\n", cpu, error);
}
}
diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
index cd6aab9..69b984c 100644
--- a/xen/common/cpupool.c
+++ b/xen/common/cpupool.c
@@ -17,6 +17,7 @@
#include <xen/percpu.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
+#include <xen/keyhandler.h>
#include <xen/cpu.h>
#define for_each_cpupool(ptr) \
@@ -296,12 +297,25 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
static long cpupool_unassign_cpu_helper(void *info)
{
int cpu = cpupool_moving_cpu;
+ struct cpupool *c = info;
+ struct domain *d;
long ret;
cpupool_dprintk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
cpupool_cpu_moving->cpupool_id, cpu);
spin_lock(&cpupool_lock);
+ if ( c != cpupool_cpu_moving )
+ {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * We need this for scanning the domain list, both in
+ * cpu_disable_scheduler(), and at the bottom of this function.
+ */
+ rcu_read_lock(&domlist_read_lock);
ret = cpu_disable_scheduler(cpu);
cpumask_set_cpu(cpu, &cpupool_free_cpus);
if ( !ret )
@@ -318,6 +332,11 @@ static long cpupool_unassign_cpu_helper(void *info)
cpupool_cpu_moving = NULL;
}
+ for_each_domain_in_cpupool(d, c)
+ {
+ domain_update_node_affinity(d);
+ }
+ rcu_read_unlock(&domlist_read_lock);
out:
spin_unlock(&cpupool_lock);
cpupool_dprintk("cpupool_unassign_cpu ret=%ld\n", ret);
@@ -452,13 +471,17 @@ void cpupool_rm_domain(struct domain *d)
}
/*
- * called to add a new cpu to pool admin
- * we add a hotplugged cpu to the cpupool0 to be able to add it to dom0,
- * unless we are resuming from S3, in which case we put the cpu back
- * in the cpupool it was in prior to suspend.
+ * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0,
+ * as they must have been in there when unplugged.
+ *
+ * If, on the other hand, we are adding CPUs because we are resuming (e.g.,
+ * after ACPI S3) we put the cpu back in the pool it was in prior to
+ * the suspend.
*/
-static void cpupool_cpu_add(unsigned int cpu)
+static int cpupool_cpu_add(unsigned int cpu)
{
+ int ret = 0;
+
spin_lock(&cpupool_lock);
cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
cpumask_set_cpu(cpu, &cpupool_free_cpus);
@@ -471,41 +494,87 @@ static void cpupool_cpu_add(unsigned int cpu)
{
if ( cpumask_test_cpu(cpu, (*c)->cpu_suspended ) )
{
- cpupool_assign_cpu_locked(*c, cpu);
+ ret = cpupool_assign_cpu_locked(*c, cpu);
+ if ( ret )
+ goto out;
cpumask_clear_cpu(cpu, (*c)->cpu_suspended);
+ break;
}
}
- }
- if ( cpumask_test_cpu(cpu, &cpupool_free_cpus) )
- cpupool_assign_cpu_locked(cpupool0, cpu);
+ /*
+ * Either cpu has been found as suspended in a pool, and added back
+ * there, or it stayed free (if it did not belong to any pool when
+ * suspending), and we don't want to do anything.
+ */
+ ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus) ||
+ cpumask_test_cpu(cpu, (*c)->cpu_valid));
+ }
+ else
+ {
+ /*
+ * If we are not resuming, we are hot-plugging the cpu, in which case
+ * we add it to pool0, as it certainly was there when hot-unplugged
+ * (or unplugging would have failed) and that is the default behavior
+ * anyway.
+ */
+ ret = cpupool_assign_cpu_locked(cpupool0, cpu);
+ }
+ out:
spin_unlock(&cpupool_lock);
+
+ return ret;
}
/*
- * called to remove a cpu from pool admin
- * the cpu to be removed is locked to avoid removing it from dom0
- * returns failure if not in pool0
+ * Called to remove a CPU from a pool. The CPU is locked, to forbid removing
+ * it from pool0. In fact, if we want to hot-unplug a CPU, it must belong to
+ * pool0, or we fail.
+ *
+ * However, if we are suspending (e.g., to ACPI S3), we mark the CPU in such
+ * a way that it can be put back in its pool when resuming.
*/
static int cpupool_cpu_remove(unsigned int cpu)
{
int ret = -EBUSY;
- struct cpupool **c;
spin_lock(&cpupool_lock);
- if ( cpumask_test_cpu(cpu, cpupool0->cpu_valid) )
- ret = 0;
- else
+ if ( system_state == SYS_STATE_suspend )
{
+ struct cpupool **c;
+
for_each_cpupool(c)
{
- if ( cpumask_test_cpu(cpu, (*c)->cpu_suspended ) )
+ if ( cpumask_test_cpu(cpu, (*c)->cpu_valid ) )
{
- ret = 0;
+ cpumask_set_cpu(cpu, (*c)->cpu_suspended);
+ cpumask_clear_cpu(cpu, (*c)->cpu_valid);
break;
}
}
+
+ /*
+ * Either we found cpu in a pool, or it must be free (if it has been
+ * hot-unplugged, then we must have found it in pool0). It is, of
+ * course, fine to suspend or shutdown with CPUs not assigned to a
+ * pool, and (in case of suspend) they will stay free when resuming.
+ */
+ ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus) ||
+ cpumask_test_cpu(cpu, (*c)->cpu_suspended));
+ ASSERT(cpumask_test_cpu(cpu, &cpu_online_map) ||
+ cpumask_test_cpu(cpu, cpupool0->cpu_suspended));
+ ret = 0;
+ }
+ else if ( cpumask_test_cpu(cpu, cpupool0->cpu_valid) )
+ {
+ /*
+ * If we are not suspending, we are hot-unplugging cpu, and that is
+ * allowed only for CPUs in pool0.
+ */
+ cpumask_clear_cpu(cpu, cpupool0->cpu_valid);
+ ret = 0;
}
+
if ( !ret )
cpumask_set_cpu(cpu, &cpupool_locked_cpus);
spin_unlock(&cpupool_lock);
@@ -658,6 +727,12 @@ int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op)
return ret;
}
+static void print_cpumap(const char *str, const cpumask_t *map)
+{
+ cpulist_scnprintf(keyhandler_scratch, sizeof(keyhandler_scratch), map);
+ printk("%s: %s\n", str, keyhandler_scratch);
+}
+
void dump_runq(unsigned char key)
{
unsigned long flags;
@@ -671,12 +746,17 @@ void dump_runq(unsigned char key)
sched_smt_power_savings? "enabled":"disabled");
printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);
- printk("Idle cpupool:\n");
- schedule_dump(NULL);
+ print_cpumap("Online Cpus", &cpu_online_map);
+ if ( !cpumask_empty(&cpupool_free_cpus) )
+ {
+ print_cpumap("Free Cpus", &cpupool_free_cpus);
+ schedule_dump(NULL);
+ }
for_each_cpupool(c)
{
printk("Cpupool %d:\n", (*c)->cpupool_id);
+ print_cpumap("Cpus", (*c)->cpu_valid);
schedule_dump(*c);
}
@@ -690,20 +770,11 @@ static int cpu_callback(
unsigned int cpu = (unsigned long)hcpu;
int rc = 0;
- if ( system_state == SYS_STATE_suspend )
- {
- struct cpupool **c;
-
- for_each_cpupool(c)
- if ( cpumask_test_cpu(cpu, (*c)->cpu_valid ) )
- cpumask_set_cpu(cpu, (*c)->cpu_suspended);
- }
-
switch ( action )
{
case CPU_DOWN_FAILED:
case CPU_ONLINE:
- cpupool_cpu_add(cpu);
+ rc = cpupool_cpu_add(cpu);
break;
case CPU_DOWN_PREPARE:
rc = cpupool_cpu_remove(cpu);
diff --git a/xen/common/device_tree.c b/xen/common/device_tree.c
index f72b2e9..18cdb6f 100644
--- a/xen/common/device_tree.c
+++ b/xen/common/device_tree.c
@@ -13,6 +13,7 @@
#include <xen/config.h>
#include <xen/types.h>
#include <xen/init.h>
+#include <xen/guest_access.h>
#include <xen/device_tree.h>
#include <xen/kernel.h>
#include <xen/lib.h>
@@ -23,6 +24,7 @@
#include <xen/cpumask.h>
#include <xen/ctype.h>
#include <asm/setup.h>
+#include <xen/err.h>
const void *device_tree_flattened;
dt_irq_xlate_func dt_irq_xlate;
@@ -277,6 +279,22 @@ struct dt_device_node *dt_find_node_by_path(const char *path)
return np;
}
+int dt_find_node_by_gpath(XEN_GUEST_HANDLE(char) u_path, uint32_t u_plen,
+ struct dt_device_node **node)
+{
+ char *path;
+
+ path = safe_copy_string_from_guest(u_path, u_plen, PAGE_SIZE);
+ if ( IS_ERR(path) )
+ return PTR_ERR(path);
+
+ *node = dt_find_node_by_path(path);
+
+ xfree(path);
+
+ return (*node == NULL) ? -ESRCH : 0;
+}
+
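(Editorial note: dt_find_node_by_gpath() above uses the IS_ERR/PTR_ERR convention from <xen/err.h>, inherited from Linux: small negative errno values are encoded directly in the pointer. A standalone sketch of that encoding, assuming the usual MAX_ERRNO of 4095:)

    #include <stdio.h>

    #define MAX_ERRNO 4095
    #define ERR_PTR(e) ((void *)(long)(e))
    #define PTR_ERR(p) ((long)(p))
    #define IS_ERR(p)  ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)

    int main(void)
    {
        void *p = ERR_PTR(-14);                  /* -EFAULT */

        if (IS_ERR(p))
            printf("error %ld\n", PTR_ERR(p));   /* error -14 */
        return 0;
    }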
struct dt_device_node *dt_find_node_by_alias(const char *alias)
{
const struct dt_alias_prop *app;
@@ -290,11 +308,12 @@ struct dt_device_node *dt_find_node_by_alias(const char *alias)
return NULL;
}
-bool_t dt_match_node(const struct dt_device_match *matches,
- const struct dt_device_node *node)
+const struct dt_device_match *
+dt_match_node(const struct dt_device_match *matches,
+ const struct dt_device_node *node)
{
if ( !matches )
- return 0;
+ return NULL;
while ( matches->path || matches->type ||
matches->compatible || matches->not_available )
@@ -314,12 +333,11 @@ bool_t dt_match_node(const struct dt_device_match *matches,
match &= !dt_device_is_available(node);
if ( match )
- return match;
-
+ return matches;
matches++;
}
- return 0;
+ return NULL;
}
const struct dt_device_node *dt_get_parent(const struct dt_device_node *node)
@@ -399,6 +417,26 @@ int dt_n_size_cells(const struct dt_device_node *np)
}
/*
+ * These are defined in Linux where much of this code comes from, but
+ * are currently unused outside this file in the context of Xen.
+ */
+#define IORESOURCE_BITS 0x000000ff /* Bus-specific bits */
+
+#define IORESOURCE_TYPE_BITS 0x00001f00 /* Resource type */
+#define IORESOURCE_IO 0x00000100 /* PCI/ISA I/O ports */
+#define IORESOURCE_MEM 0x00000200
+#define IORESOURCE_REG 0x00000300 /* Register offsets */
+#define IORESOURCE_IRQ 0x00000400
+#define IORESOURCE_DMA 0x00000800
+#define IORESOURCE_BUS 0x00001000
+
+#define IORESOURCE_PREFETCH 0x00002000 /* No side effects */
+#define IORESOURCE_READONLY 0x00004000
+#define IORESOURCE_CACHEABLE 0x00008000
+#define IORESOURCE_RANGELENGTH 0x00010000
+#define IORESOURCE_SHADOWABLE 0x00020000
+
+/*
* Default translator (generic bus)
*/
static bool_t dt_bus_default_match(const struct dt_device_node *node)
@@ -462,9 +500,81 @@ static int dt_bus_default_translate(__be32 *addr, u64 offset, int na)
}
static unsigned int dt_bus_default_get_flags(const __be32 *addr)
{
- /* TODO: Return the type of memory (device, ...) for caching
- * attribute during mapping */
- return 0;
+ return IORESOURCE_MEM;
+}
+
+/*
+ * PCI bus specific translator
+ */
+
+static bool_t dt_bus_pci_match(const struct dt_device_node *np)
+{
+ /*
+ * "pciex" is PCI Express "vci" is for the /chaos bridge on 1st-gen PCI
+ * powermacs "ht" is hypertransport
+ */
+ return !strcmp(np->type, "pci") || !strcmp(np->type, "pciex") ||
+ !strcmp(np->type, "vci") || !strcmp(np->type, "ht");
+}
+
+static void dt_bus_pci_count_cells(const struct dt_device_node *np,
+ int *addrc, int *sizec)
+{
+ if (addrc)
+ *addrc = 3;
+ if (sizec)
+ *sizec = 2;
+}
+
+static unsigned int dt_bus_pci_get_flags(const __be32 *addr)
+{
+ unsigned int flags = 0;
+ u32 w = be32_to_cpup(addr);
+
+ switch((w >> 24) & 0x03) {
+ case 0x01:
+ flags |= IORESOURCE_IO;
+ break;
+ case 0x02: /* 32 bits */
+ case 0x03: /* 64 bits */
+ flags |= IORESOURCE_MEM;
+ break;
+ }
+ if (w & 0x40000000)
+ flags |= IORESOURCE_PREFETCH;
+ return flags;
+}
+
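(Editorial note: the high cell of an OF PCI address encodes the space type in bits 25:24 and prefetchability in bit 30. A worked standalone decode, for example with 0xc2000000, a prefetchable 32-bit memory range:)

    #include <stdio.h>

    int main(void)
    {
        unsigned int w = 0xc2000000;    /* example phys.hi cell */

        printf("space=%u prefetch=%u\n",
               (w >> 24) & 0x03,        /* 1=I/O, 2=32-bit MEM, 3=64-bit MEM */
               !!(w & 0x40000000));     /* prints: space=2 prefetch=1 */
        return 0;
    }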
+static u64 dt_bus_pci_map(__be32 *addr, const __be32 *range, int na, int ns,
+ int pna)
+{
+ u64 cp, s, da;
+ unsigned int af, rf;
+
+ af = dt_bus_pci_get_flags(addr);
+ rf = dt_bus_pci_get_flags(range);
+
+ /* Check address type match */
+ if ((af ^ rf) & (IORESOURCE_MEM | IORESOURCE_IO))
+ return DT_BAD_ADDR;
+
+ /* Read address values, skipping high cell */
+ cp = dt_read_number(range + 1, na - 1);
+ s = dt_read_number(range + na + pna, ns);
+ da = dt_read_number(addr + 1, na - 1);
+
+ dt_dprintk("DT: PCI map, cp=%llx, s=%llx, da=%llx\n",
+ (unsigned long long)cp, (unsigned long long)s,
+ (unsigned long long)da);
+
+ if (da < cp || da >= (cp + s))
+ return DT_BAD_ADDR;
+ return da - cp;
+}
+
+static int dt_bus_pci_translate(__be32 *addr, u64 offset, int na)
+{
+ return dt_bus_default_translate(addr + 1, offset, na - 1);
}
/*
@@ -472,6 +582,16 @@ static unsigned int dt_bus_default_get_flags(const __be32 *addr)
*/
static const struct dt_bus dt_busses[] =
{
+ /* PCI */
+ {
+ .name = "pci",
+ .addresses = "assigned-addresses",
+ .match = dt_bus_pci_match,
+ .count_cells = dt_bus_pci_count_cells,
+ .map = dt_bus_pci_map,
+ .translate = dt_bus_pci_translate,
+ .get_flags = dt_bus_pci_get_flags,
+ },
/* Default */
{
.name = "default",
@@ -496,7 +616,7 @@ static const struct dt_bus *dt_match_bus(const struct dt_device_node *np)
}
static const __be32 *dt_get_address(const struct dt_device_node *dev,
- int index, u64 *size,
+ unsigned int index, u64 *size,
unsigned int *flags)
{
const __be32 *prop;
@@ -683,7 +803,7 @@ bail:
}
/* dt_device_address - Translate device tree address and return it */
-int dt_device_get_address(const struct dt_device_node *dev, int index,
+int dt_device_get_address(const struct dt_device_node *dev, unsigned int index,
u64 *addr, u64 *size)
{
const __be32 *addrp;
@@ -704,6 +824,91 @@ int dt_device_get_address(const struct dt_device_node *dev, int index,
return 0;
}
+
+int dt_for_each_range(const struct dt_device_node *dev,
+ int (*cb)(const struct dt_device_node *,
+ u64 addr, u64 length,
+ void *),
+ void *data)
+{
+ const struct dt_device_node *parent = NULL;
+ const struct dt_bus *bus, *pbus;
+ const __be32 *ranges;
+ __be32 addr[DT_MAX_ADDR_CELLS];
+ unsigned int rlen;
+ int na, ns, pna, pns, rone;
+
+ bus = dt_match_bus(dev);
+ if ( !bus )
+ return 0; /* device is not a bus */
+
+ parent = dt_get_parent(dev);
+ if ( parent == NULL )
+ return -EINVAL;
+
+ ranges = dt_get_property(dev, "ranges", &rlen);
+ if ( ranges == NULL )
+ {
+ printk(XENLOG_ERR "DT: no ranges; cannot enumerate\n");
+ return -EINVAL;
+ }
+ if ( rlen == 0 ) /* Nothing to do */
+ return 0;
+
+ bus->count_cells(dev, &na, &ns);
+ if ( !DT_CHECK_COUNTS(na, ns) )
+ {
+ printk(XENLOG_ERR "dt_parse: Bad cell count for device %s\n",
+ dev->full_name);
+ return -EINVAL;
+ }
+
+ pbus = dt_match_bus(parent);
+ if ( pbus == NULL )
+ {
+ printk("DT: %s is not a valid bus\n", parent->full_name);
+ return -EINVAL;
+ }
+
+ pbus->count_cells(dev, &pna, &pns);
+ if ( !DT_CHECK_COUNTS(pna, pns) )
+ {
+ printk(XENLOG_ERR "dt_parse: Bad cell count for parent %s\n",
+ dev->full_name);
+ return -EINVAL;
+ }
+
+ /* Now walk through the ranges */
+ rlen /= 4;
+ rone = na + pna + ns;
+
+ dt_dprintk("%s: dev=%s, bus=%s, parent=%s, rlen=%d, rone=%d\n",
+ __func__,
+ dt_node_name(dev), bus->name,
+ dt_node_name(parent), rlen, rone);
+
+ for ( ; rlen >= rone; rlen -= rone, ranges += rone )
+ {
+ u64 a, s;
+ int ret;
+
+ memcpy(addr, ranges + na, 4 * pna);
+
+ a = __dt_translate_address(dev, addr, "ranges");
+ s = dt_read_number(ranges + na + pna, ns);
+
+ ret = cb(dev, a, s, data);
+ if ( ret )
+ {
+ dt_dprintk(" -> callback failed=%d\n", ret);
+ return ret;
+ }
+
+ }
+
+ return 0;
+}
+
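(Editorial note: each "ranges" entry in the walk above is rone = na + pna + ns cells: child address, parent address, size. A standalone walk over a flat cell array, with assumed cell counts na = pna = ns = 1:)

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Two entries: (child, parent, size), one cell each. */
        uint32_t ranges[] = { 0x0,    0x40000000, 0x1000,
                              0x1000, 0x50000000, 0x2000 };
        int na = 1, pna = 1, ns = 1, rone = na + pna + ns;
        int rlen = sizeof(ranges) / 4;               /* bytes -> cells */
        const uint32_t *r = ranges;

        for ( ; rlen >= rone; rlen -= rone, r += rone )
            printf("child=%#x parent=%#x size=%#x\n",
                   r[0], r[na], r[na + pna]);
        return 0;
    }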
/**
* dt_find_node_by_phandle - Find a node given a phandle
* @handle: phandle of the node to find
@@ -811,6 +1016,160 @@ unsigned int dt_number_of_address(const struct dt_device_node *dev)
return (psize / onesize);
}
+int dt_for_each_irq_map(const struct dt_device_node *dev,
+ int (*cb)(const struct dt_device_node *,
+ const struct dt_irq *,
+ void *),
+ void *data)
+{
+ const struct dt_device_node *ipar, *tnode, *old = NULL;
+ const __be32 *tmp, *imap;
+ u32 intsize = 1, addrsize, pintsize = 0, paddrsize = 0;
+ u32 imaplen;
+ int i, ret;
+
+ struct dt_raw_irq dt_raw_irq;
+ struct dt_irq dt_irq;
+
+ dt_dprintk("%s: par=%s cb=%p data=%p\n", __func__,
+ dev->full_name, cb, data);
+
+ ipar = dev;
+
+ /* First get the #interrupt-cells property of the current cursor
+ * that tells us how to interpret the passed-in intspec. If there
+ * is none, we are nice and just walk up the tree
+ */
+ do {
+ tmp = dt_get_property(ipar, "#interrupt-cells", NULL);
+ if ( tmp != NULL )
+ {
+ intsize = be32_to_cpu(*tmp);
+ break;
+ }
+ tnode = ipar;
+ ipar = dt_irq_find_parent(ipar);
+ } while ( ipar );
+ if ( ipar == NULL )
+ {
+ dt_dprintk(" -> no parent found !\n");
+ goto fail;
+ }
+
+ dt_dprintk("%s: ipar=%s, size=%d\n", __func__, ipar->full_name, intsize);
+
+ if ( intsize > DT_MAX_IRQ_SPEC )
+ {
+ dt_dprintk(" -> too many irq specifier cells\n");
+ goto fail;
+ }
+
+ /* Look for this #address-cells. We have to implement the old linux
+ * trick of looking for the parent here as some device-trees rely on it
+ */
+ old = ipar;
+ do {
+ tmp = dt_get_property(old, "#address-cells", NULL);
+ tnode = dt_get_parent(old);
+ old = tnode;
+ } while ( old && tmp == NULL );
+
+ old = NULL;
+ addrsize = (tmp == NULL) ? 2 : be32_to_cpu(*tmp);
+
+ dt_dprintk(" -> addrsize=%d\n", addrsize);
+
+ /* Now look for an interrupt-map */
+ imap = dt_get_property(dev, "interrupt-map", &imaplen);
+ /* No interrupt-map found. Ignore */
+ if ( imap == NULL )
+ {
+ dt_dprintk(" -> no map, ignoring\n");
+ return 0;
+ }
+ imaplen /= sizeof(u32);
+
+ /* Parse interrupt-map */
+ while ( imaplen > (addrsize + intsize + 1) )
+ {
+ /* skip child unit address and child interrupt specifier */
+ imap += addrsize + intsize;
+ imaplen -= addrsize + intsize;
+
+ /* Get the interrupt parent */
+ ipar = dt_find_node_by_phandle(be32_to_cpup(imap));
+ imap++;
+ --imaplen;
+
+ /* Check if not found */
+ if ( ipar == NULL )
+ {
+ dt_dprintk(" -> imap parent not found !\n");
+ goto fail;
+ }
+
+ dt_dprintk(" -> ipar %s\n", dt_node_name(ipar));
+
+ /* Get #interrupt-cells and #address-cells of new
+ * parent
+ */
+ tmp = dt_get_property(ipar, "#interrupt-cells", NULL);
+ if ( tmp == NULL )
+ {
+ dt_dprintk(" -> parent lacks #interrupt-cells!\n");
+ goto fail;
+ }
+ pintsize = be32_to_cpu(*tmp);
+ tmp = dt_get_property(ipar, "#address-cells", NULL);
+ paddrsize = (tmp == NULL) ? 0 : be32_to_cpu(*tmp);
+
+ dt_dprintk(" -> pintsize=%d, paddrsize=%d\n",
+ pintsize, paddrsize);
+
+ if ( pintsize > DT_MAX_IRQ_SPEC )
+ {
+ dt_dprintk(" -> too many irq specifier cells in parent\n");
+ goto fail;
+ }
+
+ /* Check for malformed properties */
+ if ( imaplen < (paddrsize + pintsize) )
+ goto fail;
+
+ imap += paddrsize;
+ imaplen -= paddrsize;
+
+ dt_raw_irq.controller = ipar;
+ dt_raw_irq.size = pintsize;
+ for ( i = 0; i < pintsize; i++ )
+ dt_raw_irq.specifier[i] = dt_read_number(imap + i, 1);
+
+ ret = dt_irq_translate(&dt_raw_irq, &dt_irq);
+ if ( ret )
+ {
+ dt_dprintk(" -> failed to translate IRQ: %d\n", ret);
+ return ret;
+ }
+
+ ret = cb(dev, &dt_irq, data);
+ if ( ret )
+ {
+ dt_dprintk(" -> callback failed=%d\n", ret);
+ return ret;
+ }
+
+ imap += pintsize;
+ imaplen -= pintsize;
+
+ dt_dprintk(" -> imaplen=%d\n", imaplen);
+ }
+
+ return 0;
+
+fail:
+ return -EINVAL;
+}
+
/**
* dt_irq_map_raw - Low level interrupt tree parsing
* @parent: the device interrupt parent
@@ -1006,7 +1365,8 @@ fail:
return -EINVAL;
}
-int dt_device_get_raw_irq(const struct dt_device_node *device, int index,
+int dt_device_get_raw_irq(const struct dt_device_node *device,
+ unsigned int index,
struct dt_raw_irq *out_irq)
{
const struct dt_device_node *p;
@@ -1014,7 +1374,7 @@ int dt_device_get_raw_irq(const struct dt_device_node *device, int index,
u32 intsize, intlen;
int res = -EINVAL;
- dt_dprintk("dt_device_get_raw_irq: dev=%s, index=%d\n",
+ dt_dprintk("dt_device_get_raw_irq: dev=%s, index=%u\n",
device->full_name, index);
/* Get the interrupts property */
@@ -1058,14 +1418,20 @@ int dt_irq_translate(const struct dt_raw_irq *raw,
struct dt_irq *out_irq)
{
ASSERT(dt_irq_xlate != NULL);
+ ASSERT(dt_interrupt_controller != NULL);
- /* TODO: Retrieve the right irq_xlate. This is only work for the gic */
+ /*
+ * TODO: Retrieve the right irq_xlate. This only works for the primary
+ * interrupt controller.
+ */
+ if ( raw->controller != dt_interrupt_controller )
+ return -EINVAL;
return dt_irq_xlate(raw->specifier, raw->size,
&out_irq->irq, &out_irq->type);
}
-int dt_device_get_irq(const struct dt_device_node *device, int index,
+int dt_device_get_irq(const struct dt_device_node *device, unsigned int index,
struct dt_irq *out_irq)
{
struct dt_raw_irq raw;
@@ -1097,6 +1463,12 @@ bool_t dt_device_is_available(const struct dt_device_node *device)
return 0;
}
+bool_t dt_device_for_passthrough(const struct dt_device_node *device)
+{
+ return (dt_find_property(device, "xen,passthrough", NULL) != NULL);
+}
+
static int __dt_parse_phandle_with_args(const struct dt_device_node *np,
const char *list_name,
const char *cells_name,
@@ -1454,6 +1826,9 @@ static unsigned long __init unflatten_dt_node(const void *fdt,
((char *)pp->value)[sz - 1] = 0;
dt_dprintk("fixed up name for %s -> %s\n", pathp,
(char *)pp->value);
+ /* Generic device initialization */
+ np->dev.type = DEV_DT;
+ np->dev.of_node = np;
}
}
if ( allnextpp )
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 336e9ea..1b9fcfc 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -15,7 +15,7 @@
#include <xen/domain.h>
#include <xen/mm.h>
#include <xen/event.h>
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
#include <xen/time.h>
#include <xen/console.h>
#include <xen/softirq.h>
@@ -42,6 +42,7 @@
#include <xsm/xsm.h>
#include <xen/trace.h>
#include <xen/tmem.h>
+#include <asm/setup.h>
/* Linux config option: propagated to domain0 */
/* xen_processor_pmbits: xen control Cx, Px, ... */
@@ -69,7 +70,7 @@ integer_param("hardware_dom", hardware_domid);
struct vcpu *idle_vcpu[NR_CPUS] __read_mostly;
-vcpu_info_t dummy_vcpu_info;
+static vcpu_info_t dummy_vcpu_info;
static void __domain_finalise_shutdown(struct domain *d)
{
@@ -126,6 +127,8 @@ struct vcpu *alloc_vcpu(
tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
+ grant_table_init_vcpu(v);
+
if ( !zalloc_cpumask_var(&v->cpu_hard_affinity) ||
!zalloc_cpumask_var(&v->cpu_hard_affinity_tmp) ||
!zalloc_cpumask_var(&v->cpu_hard_affinity_saved) ||
@@ -219,6 +222,8 @@ static int late_hwdom_init(struct domain *d)
rangeset_swap(d->iomem_caps, dom0->iomem_caps);
#ifdef CONFIG_X86
rangeset_swap(d->arch.ioport_caps, dom0->arch.ioport_caps);
+ setup_io_bitmap(d);
+ setup_io_bitmap(dom0);
#endif
rcu_unlock_domain(dom0);
@@ -242,8 +247,9 @@ static void __init parse_extra_guest_irqs(const char *s)
}
custom_param("extra_guest_irqs", parse_extra_guest_irqs);
-struct domain *domain_create(
- domid_t domid, unsigned int domcr_flags, uint32_t ssidref)
+struct domain *domain_create(domid_t domid, unsigned int domcr_flags,
+ uint32_t ssidref,
+ struct xen_arch_domainconfig *config)
{
struct domain *d, **pd, *old_hwdom = NULL;
enum { INIT_xsm = 1u<<0, INIT_watchdog = 1u<<1, INIT_rangeset = 1u<<2,
@@ -344,8 +350,8 @@ struct domain *domain_create(
poolid = 0;
err = -ENOMEM;
- d->mem_event = xzalloc(struct mem_event_per_domain);
- if ( !d->mem_event )
+ d->vm_event = xzalloc(struct vm_event_per_domain);
+ if ( !d->vm_event )
goto fail;
d->pbuf = xzalloc_array(char, DOMAIN_PBUF_SIZE);
@@ -353,7 +359,7 @@ struct domain *domain_create(
goto fail;
}
- if ( (err = arch_domain_create(d, domcr_flags)) != 0 )
+ if ( (err = arch_domain_create(d, domcr_flags, config)) != 0 )
goto fail;
init_status |= INIT_arch;
@@ -387,7 +393,7 @@ struct domain *domain_create(
if ( hardware_domain == d )
hardware_domain = old_hwdom;
atomic_set(&d->refcnt, DOMAIN_DESTROYED);
- xfree(d->mem_event);
+ xfree(d->vm_event);
xfree(d->pbuf);
if ( init_status & INIT_arch )
arch_domain_destroy(d);
@@ -617,19 +623,15 @@ int domain_kill(struct domain *d)
case DOMDYING_dying:
rc = domain_relinquish_resources(d);
if ( rc != 0 )
- {
- if ( rc == -ERESTART )
- rc = -EAGAIN;
break;
- }
if ( cpupool_move_domain(d, cpupool0) )
- return -EAGAIN;
+ return -ERESTART;
for_each_vcpu ( d, v )
unmap_vcpu_info(v);
d->is_dying = DOMDYING_dead;
/* Mem event cleanup has to go here because the rings
* have to be put before we call put_domain. */
- mem_event_cleanup(d);
+ vm_event_cleanup(d);
put_domain(d);
send_global_virq(VIRQ_DOM_EXC);
/* fallthrough */
@@ -808,7 +810,7 @@ static void complete_domain_destroy(struct rcu_head *head)
free_xenoprof_pages(d);
#endif
- xfree(d->mem_event);
+ xfree(d->vm_event);
xfree(d->pbuf);
for ( i = d->max_vcpus - 1; i >= 0; i-- )
@@ -898,7 +900,7 @@ int vcpu_pause_by_systemcontroller(struct vcpu *v)
new = old + 1;
if ( new > 255 )
- return -EUSERS;
+ return -EOVERFLOW;
prev = cmpxchg(&v->controller_pause_count, old, new);
} while ( prev != old );
@@ -978,7 +980,7 @@ int __domain_pause_by_systemcontroller(struct domain *d,
* toolstack overflowing d->pause_count with many repeated hypercalls.
*/
if ( new > 255 )
- return -EUSERS;
+ return -EOVERFLOW;
prev = cmpxchg(&d->controller_pause_count, old, new);
} while ( prev != old );
@@ -1008,6 +1010,34 @@ int domain_unpause_by_systemcontroller(struct domain *d)
return 0;
}
+void domain_pause_except_self(struct domain *d)
+{
+ struct vcpu *v, *curr = current;
+
+ if ( curr->domain == d )
+ {
+ for_each_vcpu( d, v )
+ if ( likely(v != curr) )
+ vcpu_pause(v);
+ }
+ else
+ domain_pause(d);
+}
+
+void domain_unpause_except_self(struct domain *d)
+{
+ struct vcpu *v, *curr = current;
+
+ if ( curr->domain == d )
+ {
+ for_each_vcpu( d, v )
+ if ( likely(v != curr) )
+ vcpu_unpause(v);
+ }
+ else
+ domain_unpause(d);
+}
+
int vcpu_reset(struct vcpu *v)
{
struct domain *d = v->domain;
@@ -1139,16 +1169,13 @@ void unmap_vcpu_info(struct vcpu *v)
put_page_and_type(mfn_to_page(mfn));
}
-long do_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
+long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
{
struct domain *d = current->domain;
struct vcpu *v;
struct vcpu_guest_context *ctxt;
long rc = 0;
- if ( (vcpuid < 0) || (vcpuid >= MAX_VIRT_CPUS) )
- return -EINVAL;
-
if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
return -ENOENT;
@@ -1174,7 +1201,7 @@ long do_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
free_vcpu_guest_context(ctxt);
if ( rc == -ERESTART )
- rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
+ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iuh",
cmd, vcpuid, arg);
break;
@@ -1325,9 +1352,11 @@ long do_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
return rc;
}
-long vm_assist(struct domain *p, unsigned int cmd, unsigned int type)
+#ifdef VM_ASSIST_VALID
+long vm_assist(struct domain *p, unsigned int cmd, unsigned int type,
+ unsigned long valid)
{
- if ( type > MAX_VMASST_TYPE )
+ if ( type >= BITS_PER_LONG || !test_bit(type, &valid) )
return -EINVAL;
switch ( cmd )
@@ -1342,6 +1371,7 @@ long vm_assist(struct domain *p, unsigned int cmd, unsigned int type)
return -ENOSYS;
}
+#endif
struct pirq *pirq_get_info(struct domain *d, int pirq)
{
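The new guard in vm_assist() is a capability bitmap: each architecture advertises its supported assist types through VM_ASSIST_VALID, and any type outside the mask (or beyond BITS_PER_LONG) is rejected before the switch. A minimal standalone sketch of the same test_bit()-style gating; the assist numbers below are made up for illustration and are not Xen's:

    #include <stdio.h>
    #include <limits.h>

    #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

    /* Hypothetical assist types, for illustration only. */
    enum { ASSIST_A = 0, ASSIST_B = 2, ASSIST_C = 5 };

    /* Mask of supported types, analogous to VM_ASSIST_VALID. */
    static const unsigned long valid =
        (1UL << ASSIST_A) | (1UL << ASSIST_B);

    static int assist_enable(unsigned int type)
    {
        /* Reject anything the bitmap does not advertise. */
        if (type >= BITS_PER_LONG || !(valid & (1UL << type)))
            return -1;                /* -EINVAL in the hypervisor */
        return 0;                     /* would set the assist bit here */
    }

    int main(void)
    {
        printf("A=%d B=%d C=%d 99=%d\n", assist_enable(ASSIST_A),
               assist_enable(ASSIST_B), assist_enable(ASSIST_C),
               assist_enable(99));    /* prints A=0 B=0 C=-1 99=-1 */
        return 0;
    }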
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 3641296..7f959f3 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -1,8 +1,8 @@
/******************************************************************************
* domctl.c
- *
+ *
* Domain management operations. For use by node control stack.
- *
+ *
* Copyright (c) 2002-2006, K A Fraser
*/
@@ -24,11 +24,12 @@
#include <xen/bitmap.h>
#include <xen/paging.h>
#include <xen/hypercall.h>
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
#include <asm/current.h>
#include <asm/irq.h>
#include <asm/page.h>
#include <asm/p2m.h>
+#include <asm/monitor.h>
#include <public/domctl.h>
#include <xsm/xsm.h>
@@ -154,13 +155,13 @@ void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info)
u64 cpu_time = 0;
int flags = XEN_DOMINF_blocked;
struct vcpu_runstate_info runstate;
-
+
info->domain = d->domain_id;
info->max_vcpu_id = XEN_INVALID_MAX_VCPU_ID;
info->nr_online_vcpus = 0;
info->ssidref = 0;
-
- /*
+
+ /*
* - domain is marked as blocked only if all its vcpus are blocked
* - domain is marked as running if any of its vcpus is running
*/
@@ -237,7 +238,7 @@ static unsigned int default_vcpu0_location(cpumask_t *online)
}
/*
- * If we're on a HT system, we only auto-allocate to a non-primary HT. We
+ * If we're on a HT system, we only auto-allocate to a non-primary HT. We
* favour high numbered CPUs in the event of a tie.
*/
cpumask_copy(&cpu_exclude_map, per_cpu(cpu_sibling_mask, 0));
@@ -344,7 +345,7 @@ static struct vnuma_info *vnuma_alloc(unsigned int nr_vnodes,
vnuma->vdistance = xmalloc_array(unsigned int, nr_vnodes * nr_vnodes);
vnuma->vcpu_to_vnode = xmalloc_array(unsigned int, nr_vcpus);
- vnuma->vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
+ vnuma->vnode_to_pnode = xmalloc_array(nodeid_t, nr_vnodes);
vnuma->vmemrange = xmalloc_array(xen_vmemrange_t, nr_ranges);
if ( vnuma->vdistance == NULL || vnuma->vmemrange == NULL ||
@@ -382,30 +383,40 @@ static struct vnuma_info *vnuma_init(const struct xen_domctl_vnuma *uinfo,
nr_vnodes * nr_vnodes) )
goto vnuma_fail;
+ if ( copy_from_guest(info->vmemrange, uinfo->vmemrange,
+ uinfo->nr_vmemranges) )
+ goto vnuma_fail;
+
if ( copy_from_guest(info->vcpu_to_vnode, uinfo->vcpu_to_vnode,
d->max_vcpus) )
goto vnuma_fail;
- if ( copy_from_guest(info->vnode_to_pnode, uinfo->vnode_to_pnode,
- nr_vnodes) )
- goto vnuma_fail;
+ ret = -E2BIG;
+ for ( i = 0; i < d->max_vcpus; ++i )
+ if ( info->vcpu_to_vnode[i] >= nr_vnodes )
+ goto vnuma_fail;
- if (copy_from_guest(info->vmemrange, uinfo->vmemrange,
- uinfo->nr_vmemranges))
- goto vnuma_fail;
+ for ( i = 0; i < nr_vnodes; ++i )
+ {
+ unsigned int pnode;
+
+ ret = -EFAULT;
+ if ( copy_from_guest_offset(&pnode, uinfo->vnode_to_pnode, i, 1) )
+ goto vnuma_fail;
+ ret = -E2BIG;
+ if ( pnode >= MAX_NUMNODES )
+ goto vnuma_fail;
+ info->vnode_to_pnode[i] = pnode;
+ }
info->nr_vnodes = nr_vnodes;
info->nr_vmemranges = uinfo->nr_vmemranges;
/* Check that vmemranges flags are zero. */
+ ret = -EINVAL;
for ( i = 0; i < info->nr_vmemranges; i++ )
- {
if ( info->vmemrange[i].flags != 0 )
- {
- ret = -EINVAL;
goto vnuma_fail;
- }
- }
return info;
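The rewritten vnode_to_pnode handling above replaces one bulk copy with a per-element copy through a full-width temporary, so each guest-supplied pnode is range-checked against MAX_NUMNODES before being narrowed into the nodeid_t array. A standalone sketch of that validate-then-narrow pattern (types and the node limit here are illustrative, not Xen's):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_NODES 64             /* stand-in for MAX_NUMNODES */
    typedef uint8_t nodeid_t;        /* narrow destination type */

    /*
     * Copy nr guest-supplied node IDs, rejecting out-of-range values
     * before they are truncated into the narrow nodeid_t array.
     */
    static int copy_node_map(nodeid_t *dst, const unsigned int *src,
                             unsigned int nr)
    {
        for (unsigned int i = 0; i < nr; i++) {
            unsigned int pnode = src[i];  /* full-width read first */
            if (pnode >= MAX_NODES)
                return -1;                /* -E2BIG in the hypervisor */
            dst[i] = pnode;               /* safe to narrow now */
        }
        return 0;
    }

    int main(void)
    {
        unsigned int guest[] = { 0, 3, 300 };  /* 300 is out of range */
        nodeid_t map[3];
        printf("%d\n", copy_node_map(map, guest, 3));  /* prints -1 */
        return 0;
    }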
@@ -432,6 +443,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
case XEN_DOMCTL_createdomain:
case XEN_DOMCTL_getdomaininfo:
case XEN_DOMCTL_test_assign_device:
+ case XEN_DOMCTL_gdbsx_guestmemio:
d = NULL;
break;
default:
@@ -484,7 +496,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
break;
#ifdef CONFIG_COMPAT
- if ( !is_pv_32on64_vcpu(v) )
+ if ( !is_pv_32bit_domain(d) )
ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
else
ret = copy_from_guest(c.cmp,
@@ -507,8 +519,8 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
}
free_vcpu_guest_context(c.nat);
+ break;
}
- break;
case XEN_DOMCTL_pausedomain:
ret = -EINVAL;
@@ -521,13 +533,11 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
break;
case XEN_DOMCTL_resumedomain:
- {
if ( d == current->domain ) /* no domain_pause() */
ret = -EINVAL;
else
domain_resume(d);
- }
- break;
+ break;
case XEN_DOMCTL_createdomain:
{
@@ -536,8 +546,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
unsigned int domcr_flags;
ret = -EINVAL;
- if ( supervisor_mode_kernel ||
- (op->u.createdomain.flags &
+ if ( (op->u.createdomain.flags &
~(XEN_DOMCTL_CDF_hvm_guest
| XEN_DOMCTL_CDF_pvh_guest
| XEN_DOMCTL_CDF_hap
@@ -585,7 +594,8 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_oos_off )
domcr_flags |= DOMCRF_oos_off;
- d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref);
+ d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref,
+ &op->u.createdomain.config);
if ( IS_ERR(d) )
{
ret = PTR_ERR(d);
@@ -601,8 +611,8 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
op->domain = d->domain_id;
copyback = 1;
d = NULL;
+ break;
}
- break;
case XEN_DOMCTL_max_vcpus:
{
@@ -611,8 +621,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
ret = -EINVAL;
if ( (d == current->domain) || /* no domain_pause() */
- (max > MAX_VIRT_CPUS) ||
- (is_hvm_domain(d) && (max > MAX_HVM_VCPUS)) )
+ (max > domain_max_vcpus(d)) )
break;
/* Until Xenoprof can dynamically grow its vcpu-s array... */
@@ -692,14 +701,15 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
maxvcpu_out_novcpulock:
domain_unpause(d);
+ break;
}
- break;
case XEN_DOMCTL_destroydomain:
- {
ret = domain_kill(d);
- }
- break;
+ if ( ret == -ERESTART )
+ ret = hypercall_create_continuation(
+ __HYPERVISOR_domctl, "h", u_domctl);
+ break;
case XEN_DOMCTL_setnodeaffinity:
{
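XEN_DOMCTL_destroydomain now converts -ERESTART into a hypercall continuation, so a long-running teardown is split into preemptible slices that get transparently re-entered rather than monopolising a CPU. Outside the hypervisor the same idea is simply a resumable operation driven by a retry loop; a minimal sketch in plain C (no Xen APIs, the error code is illustrative):

    #include <stdio.h>

    #define ERESTART 1    /* "not finished, call again" (illustrative) */

    /*
     * Do at most 'budget' units of work per call, keeping progress in
     * *cursor so the operation can resume where it left off.
     */
    static int teardown_step(unsigned int *cursor, unsigned int total,
                             unsigned int budget)
    {
        unsigned int done = 0;
        while (*cursor < total && done++ < budget)
            ++*cursor;                /* release one resource */
        return *cursor < total ? -ERESTART : 0;
    }

    int main(void)
    {
        unsigned int cursor = 0;
        int rc;
        /* The "continuation": reissue the call until it completes. */
        while ((rc = teardown_step(&cursor, 10, 3)) == -ERESTART)
            printf("preempted at %u/10\n", cursor);
        printf("done, rc=%d\n", rc);
        return 0;
    }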
@@ -709,14 +719,13 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
&op->u.nodeaffinity.nodemap);
if ( !ret )
ret = domain_set_node_affinity(d, &new_affinity);
+ break;
}
- break;
+
case XEN_DOMCTL_getnodeaffinity:
- {
ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap,
&d->node_affinity);
- }
- break;
+ break;
case XEN_DOMCTL_setvcpuaffinity:
case XEN_DOMCTL_getvcpuaffinity:
@@ -825,18 +834,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft,
v->cpu_soft_affinity);
}
+ break;
}
- break;
case XEN_DOMCTL_scheduler_op:
- {
ret = sched_adjust(d, &op->u.scheduler_op);
copyback = 1;
- }
- break;
+ break;
case XEN_DOMCTL_getdomaininfo:
- {
+ {
domid_t dom = op->domain;
rcu_read_lock(&domlist_read_lock);
@@ -845,12 +852,9 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
if ( d->domain_id >= dom )
break;
+ ret = -ESRCH;
if ( d == NULL )
- {
- rcu_read_unlock(&domlist_read_lock);
- ret = -ESRCH;
- break;
- }
+ goto getdomaininfo_out;
ret = xsm_getdomaininfo(XSM_HOOK, d);
if ( ret )
@@ -864,11 +868,11 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
getdomaininfo_out:
rcu_read_unlock(&domlist_read_lock);
d = NULL;
+ break;
}
- break;
case XEN_DOMCTL_getvcpucontext:
- {
+ {
vcpu_guest_context_u c = { .nat = NULL };
struct vcpu *v;
@@ -898,7 +902,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
vcpu_unpause(v);
#ifdef CONFIG_COMPAT
- if ( !is_pv_32on64_vcpu(v) )
+ if ( !is_pv_32bit_domain(d) )
ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
else
ret = copy_to_guest(guest_handle_cast(op->u.vcpucontext.ctxt,
@@ -913,11 +917,11 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
getvcpucontext_out:
xfree(c.nat);
+ break;
}
- break;
case XEN_DOMCTL_getvcpuinfo:
- {
+ {
struct vcpu *v;
struct vcpu_runstate_info runstate;
@@ -938,15 +942,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
op->u.getvcpuinfo.cpu = v->processor;
ret = 0;
copyback = 1;
+ break;
}
- break;
case XEN_DOMCTL_max_mem:
{
- unsigned long new_max;
-
- ret = -EINVAL;
- new_max = op->u.max_mem.max_memkb >> (PAGE_SHIFT-10);
+ uint64_t new_max = op->u.max_mem.max_memkb >> (PAGE_SHIFT - 10);
spin_lock(&d->page_alloc_lock);
/*
@@ -954,32 +955,26 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
* that the domain will now be allowed to "ratchet" down to new_max. In
* the meantime, while tot > max, all new allocations are disallowed.
*/
- d->max_pages = new_max;
- ret = 0;
+ d->max_pages = min(new_max, (uint64_t)(typeof(d->max_pages))-1);
spin_unlock(&d->page_alloc_lock);
+ break;
}
- break;
case XEN_DOMCTL_setdomainhandle:
- {
memcpy(d->handle, op->u.setdomainhandle.handle,
sizeof(xen_domain_handle_t));
- ret = 0;
- }
- break;
+ break;
case XEN_DOMCTL_setdebugging:
- {
- ret = -EINVAL;
- if ( d == current->domain ) /* no domain_pause() */
- break;
-
- domain_pause(d);
- d->debugger_attached = !!op->u.setdebugging.enable;
- domain_unpause(d); /* causes guest to latch new status */
- ret = 0;
- }
- break;
+ if ( unlikely(d == current->domain) ) /* no domain_pause() */
+ ret = -EINVAL;
+ else
+ {
+ domain_pause(d);
+ d->debugger_attached = !!op->u.setdebugging.enable;
+ domain_unpause(d); /* causes guest to latch new status */
+ }
+ break;
case XEN_DOMCTL_irq_permission:
{
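XEN_DOMCTL_max_mem now computes new_max as a uint64_t and clamps it with min(new_max, (uint64_t)(typeof(d->max_pages))-1): casting -1 to an unsigned field's type yields that field's maximum value, so an oversized request saturates instead of silently truncating on assignment. The idiom in isolation (written without GNU typeof so it compiles anywhere):

    #include <stdint.h>
    #include <stdio.h>

    struct domain_like {
        uint32_t max_pages;          /* narrower than the request */
    };

    int main(void)
    {
        struct domain_like d;
        uint64_t new_max = (uint64_t)1 << 40;  /* does not fit in 32 bits */

        /* (uint32_t)-1 is the field's maximum, like (typeof(field))-1. */
        uint64_t field_max = (uint32_t)-1;

        d.max_pages = new_max < field_max ? new_max : field_max;
        printf("clamped to %u (0x%x)\n", d.max_pages, d.max_pages);
        return 0;
    }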
@@ -998,8 +993,8 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
ret = irq_permit_access(d, irq);
else
ret = irq_deny_access(d, irq);
+ break;
}
- break;
case XEN_DOMCTL_iomem_permission:
{
@@ -1021,8 +1016,8 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
if ( !ret )
memory_type_changed(d);
+ break;
}
- break;
case XEN_DOMCTL_memory_mapping:
{
@@ -1078,15 +1073,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
}
/* Do this unconditionally to cover errors on above failure paths. */
memory_type_changed(d);
+ break;
}
- break;
case XEN_DOMCTL_settimeoffset:
- {
domain_set_time_offset(d, op->u.settimeoffset.time_offset_seconds);
- ret = 0;
- }
- break;
+ break;
case XEN_DOMCTL_set_target:
{
@@ -1112,59 +1104,42 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
/* Hold reference on @e until we destroy @d. */
d->target = e;
-
- ret = 0;
+ break;
}
- break;
case XEN_DOMCTL_subscribe:
- {
d->suspend_evtchn = op->u.subscribe.port;
- }
- break;
+ break;
- case XEN_DOMCTL_mem_event_op:
- ret = mem_event_domctl(d, &op->u.mem_event_op,
- guest_handle_cast(u_domctl, void));
+ case XEN_DOMCTL_vm_event_op:
+ ret = vm_event_domctl(d, &op->u.vm_event_op,
+ guest_handle_cast(u_domctl, void));
copyback = 1;
break;
case XEN_DOMCTL_disable_migrate:
- {
d->disable_migrate = op->u.disable_migrate.disable;
- }
- break;
+ break;
#ifdef HAS_MEM_ACCESS
case XEN_DOMCTL_set_access_required:
- {
- struct p2m_domain* p2m;
-
- ret = -EPERM;
- if ( current->domain == d )
- break;
-
- ret = 0;
- p2m = p2m_get_hostp2m(d);
- p2m->access_required = op->u.access_required.access_required;
- }
- break;
+ if ( unlikely(current->domain == d) )
+ ret = -EPERM;
+ else
+ p2m_get_hostp2m(d)->access_required =
+ op->u.access_required.access_required;
+ break;
#endif
case XEN_DOMCTL_set_virq_handler:
- {
- uint32_t virq = op->u.set_virq_handler.virq;
- ret = set_global_virq_handler(d, virq);
- }
- break;
+ ret = set_global_virq_handler(d, op->u.set_virq_handler.virq);
+ break;
case XEN_DOMCTL_set_max_evtchn:
- {
d->max_evtchn_port = min_t(unsigned int,
op->u.set_max_evtchn.max_port,
INT_MAX);
- }
- break;
+ break;
case XEN_DOMCTL_setvnumainfo:
{
@@ -1183,9 +1158,18 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
d->vnuma = vnuma;
write_unlock(&d->vnuma_rwlock);
- ret = 0;
+ break;
}
- break;
+
+ case XEN_DOMCTL_monitor_op:
+ ret = -EPERM;
+ if ( current->domain == d )
+ break;
+
+ ret = monitor_domctl(d, &op->u.monitor_op);
+ if ( !ret )
+ copyback = 1;
+ break;
default:
ret = arch_do_domctl(op, d, u_domctl);
diff --git a/xen/common/earlycpio.c b/xen/common/earlycpio.c
index 5e54142..f6b1a9e 100644
--- a/xen/common/earlycpio.c
+++ b/xen/common/earlycpio.c
@@ -54,25 +54,26 @@ enum cpio_fields {
/**
* cpio_data find_cpio_data - Search for files in an uncompressed cpio
- * @path: The directory to search for, including a slash at the end
- * @data: Pointer to the the cpio archive or a header inside
- * @len: Remaining length of the cpio based on data pointer
- * @offset: When a matching file is found, this is the offset to the
- * beginning of the cpio. It can be used to iterate through
- * the cpio to find all files inside of a directory path
+ * @path: The directory to search for, including a slash at the end
+ * @data: Pointer to the the cpio archive or a header inside
+ * @len: Remaining length of the cpio based on data pointer
+ * @nextoff: When a matching file is found, this is the offset from the
+ * beginning of the cpio to the beginning of the next file, not the
+ * matching file itself. It can be used to iterate through the cpio
+ * to find all files inside of a directory path.
*
- * @return: struct cpio_data containing the address, length and
- * filename (with the directory path cut off) of the found file.
- * If you search for a filename and not for files in a directory,
- * pass the absolute path of the filename in the cpio and make sure
- * the match returned an empty filename string.
+ * @return: struct cpio_data containing the address, length and
+ * filename (with the directory path cut off) of the found file.
+ * If you search for a filename and not for files in a directory,
+ * pass the absolute path of the filename in the cpio and make sure
+ * the match returned an empty filename string.
*/
struct cpio_data __init find_cpio_data(const char *path, void *data,
- size_t len, long *offset)
+ size_t len, long *nextoff)
{
const size_t cpio_header_len = 8*C_NFIELDS - 2;
- struct cpio_data cd = { NULL, 0 };
+ struct cpio_data cd = { NULL, 0, "" };
const char *p, *dptr, *nptr;
unsigned int ch[C_NFIELDS], *chp, v;
unsigned char c, x;
@@ -129,17 +130,17 @@ struct cpio_data __init find_cpio_data(const char *path, void *data,
if ((ch[C_MODE] & 0170000) == 0100000 &&
ch[C_NAMESIZE] >= mypathsize &&
!memcmp(p, path, mypathsize)) {
- *offset = (long)nptr - (long)data;
+ *nextoff = (long)nptr - (long)data;
if (ch[C_NAMESIZE] - mypathsize >= MAX_CPIO_FILE_NAME) {
printk(
"File %s exceeding MAX_CPIO_FILE_NAME [%d]\n",
p, MAX_CPIO_FILE_NAME);
}
- if (ch[C_NAMESIZE] - 1 /* includes \0 */ == mypathsize) {
- cd.data = (void *)dptr;
- cd.size = ch[C_FILESIZE];
- return cd; /* Found it! */
- }
+ strlcpy(cd.name, p + mypathsize, MAX_CPIO_FILE_NAME);
+
+ cd.data = (void *)dptr;
+ cd.size = ch[C_FILESIZE];
+ return cd; /* Found it! */
}
len -= (nptr - p);
p = nptr;
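With *nextoff documented as the offset of the next record rather than of the match itself, a caller can enumerate every file under a directory by advancing its data pointer by the returned offset between calls. A sketch of that loop, written against the declarations in this file; the path string and the printing are illustrative only:

    /*
     * Walk all files under a directory prefix in an in-memory cpio
     * archive, using find_cpio_data() as declared above.
     */
    static void list_dir(void *archive, size_t archive_len)
    {
        void *p = archive;
        size_t len = archive_len;
        long nextoff;

        for ( ; ; )
        {
            struct cpio_data cd =
                find_cpio_data("firmware/", p, len, &nextoff);

            if ( !cd.data )          /* no further match */
                break;
            /* cd.name is the path with the directory prefix cut off. */
            printk("found %s (%lu bytes)\n", cd.name,
                   (unsigned long)cd.size);
            p = (char *)p + nextoff; /* skip past the matched record */
            len -= nextoff;
        }
    }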
diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
index ac6881e..2bc5b25 100644
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -32,6 +32,10 @@
/* Using SetVirtualAddressMap() is incompatible with kexec: */
#undef USE_SET_VIRTUAL_ADDRESS_MAP
+#define EFI_REVISION(major, minor) (((major) << 16) | (minor))
+
+#define SMBIOS3_TABLE_GUID \
+ { 0xf2fd1544, 0x9794, 0x4a2c, {0x99, 0x2e, 0xe5, 0xbb, 0xcf, 0x20, 0xe3, 0x94} }
#define SHIM_LOCK_PROTOCOL_GUID \
{ 0x605dab50, 0xe046, 0x4300, {0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23} }
@@ -76,12 +80,14 @@ static int set_color(u32 mask, int bpp, u8 *pos, u8 *sz);
static bool_t match_guid(const EFI_GUID *guid1, const EFI_GUID *guid2);
static const EFI_BOOT_SERVICES *__initdata efi_bs;
+static UINT32 __initdata efi_bs_revision;
static EFI_HANDLE __initdata efi_ih;
static SIMPLE_TEXT_OUTPUT_INTERFACE *__initdata StdOut;
static SIMPLE_TEXT_OUTPUT_INTERFACE *__initdata StdErr;
static UINT32 __initdata mdesc_ver;
+static bool_t __initdata map_bs;
static struct file __initdata cfg;
static struct file __initdata kernel;
@@ -214,6 +220,9 @@ static void __init noreturn blexit(const CHAR16 *str)
PrintStr((CHAR16 *)str);
PrintStr(newline);
+ if ( !efi_bs )
+ efi_arch_halt();
+
if ( cfg.addr )
efi_bs->FreePages(cfg.addr, PFN_UP(cfg.size));
if ( kernel.addr )
@@ -519,6 +528,8 @@ static bool_t __init read_file(EFI_FILE_HANDLE dir_handle, CHAR16 *name,
PrintErrMesg(name, ret);
}
+ efi_arch_flush_dcache_area(file->ptr, file->size);
+
return 1;
}
@@ -710,6 +721,7 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
efi_ih = ImageHandle;
efi_bs = SystemTable->BootServices;
+ efi_bs_revision = efi_bs->Hdr.Revision;
efi_rs = SystemTable->RuntimeServices;
efi_ct = SystemTable->ConfigurationTable;
efi_num_ct = SystemTable->NumberOfTableEntries;
@@ -751,6 +763,8 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
{
if ( wstrcmp(ptr + 1, L"basevideo") == 0 )
base_video = 1;
+ else if ( wstrcmp(ptr + 1, L"mapbs") == 0 )
+ map_bs = 1;
else if ( wstrncmp(ptr + 1, L"cfg=", 4) == 0 )
cfg_file_name = ptr + 5;
else if ( i + 1 < argc && wstrcmp(ptr + 1, L"cfg") == 0 )
@@ -760,6 +774,7 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
{
PrintStr(L"Xen EFI Loader options:\r\n");
PrintStr(L"-basevideo retain current video mode\r\n");
+ PrintStr(L"-mapbs map EfiBootServices{Code,Data}\r\n");
PrintStr(L"-cfg=<file> specify configuration file\r\n");
PrintStr(L"-help, -? display this help\r\n");
blexit(NULL);
@@ -993,6 +1008,7 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
static EFI_GUID __initdata acpi_guid = ACPI_TABLE_GUID;
static EFI_GUID __initdata mps_guid = MPS_TABLE_GUID;
static EFI_GUID __initdata smbios_guid = SMBIOS_TABLE_GUID;
+ static EFI_GUID __initdata smbios3_guid = SMBIOS3_TABLE_GUID;
if ( match_guid(&acpi2_guid, &efi_ct[i].VendorGuid) )
efi.acpi20 = (long)efi_ct[i].VendorTable;
@@ -1002,11 +1018,15 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
efi.mps = (long)efi_ct[i].VendorTable;
if ( match_guid(&smbios_guid, &efi_ct[i].VendorGuid) )
efi.smbios = (long)efi_ct[i].VendorTable;
+ if ( match_guid(&smbios3_guid, &efi_ct[i].VendorGuid) )
+ efi.smbios3 = (long)efi_ct[i].VendorTable;
}
#ifndef CONFIG_ARM /* TODO - disabled until implemented on ARM */
- if (efi.smbios != EFI_INVALID_TABLE_ADDR)
- dmi_efi_get_table((void *)(long)efi.smbios);
+ dmi_efi_get_table(efi.smbios != EFI_INVALID_TABLE_ADDR
+ ? (void *)(long)efi.smbios : NULL,
+ efi.smbios3 != EFI_INVALID_TABLE_ADDR
+ ? (void *)(long)efi.smbios3 : NULL);
#endif
/* Collect PCI ROM contents. */
@@ -1046,16 +1066,21 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
efi_arch_video_init(gop, info_size, mode_info);
}
- efi_bs->GetMemoryMap(&efi_memmap_size, NULL, &map_key,
+ info_size = 0;
+ efi_bs->GetMemoryMap(&info_size, NULL, &map_key,
&efi_mdesc_size, &mdesc_ver);
- efi_memmap = efi_arch_allocate_mmap_buffer(&efi_memmap_size);
+ info_size += 8 * efi_mdesc_size;
+ efi_memmap = efi_arch_allocate_mmap_buffer(info_size);
if ( !efi_memmap )
blexit(L"Unable to allocate memory for EFI memory map");
for ( retry = 0; ; retry = 1 )
{
- status = efi_bs->GetMemoryMap(&efi_memmap_size, efi_memmap, &map_key,
- &efi_mdesc_size, &mdesc_ver);
+ efi_memmap_size = info_size;
+ status = SystemTable->BootServices->GetMemoryMap(&efi_memmap_size,
+ efi_memmap, &map_key,
+ &efi_mdesc_size,
+ &mdesc_ver);
if ( EFI_ERROR(status) )
PrintErrMesg(L"Cannot obtain memory map", status);
@@ -1064,7 +1089,9 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
efi_arch_pre_exit_boot();
- status = efi_bs->ExitBootServices(ImageHandle, map_key);
+ status = SystemTable->BootServices->ExitBootServices(ImageHandle,
+ map_key);
+ efi_bs = NULL;
if ( status != EFI_INVALID_PARAMETER || retry )
break;
}
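This hunk is the canonical UEFI exit sequence: size the map with a first GetMemoryMap() call, over-allocate by several descriptors (the allocation itself can grow the map), then loop because ExitBootServices() fails with EFI_INVALID_PARAMETER whenever the map key has gone stale between the two calls. The shape of the loop, reduced to essentials (Boot Services calls per the UEFI spec; the allocator is a placeholder, and Xen caps the retry at a single attempt):

    UINTN size = 0, map_key, desc_size;
    UINT32 desc_ver;
    EFI_MEMORY_DESCRIPTOR *map;
    EFI_STATUS status;

    /* First call fails but reports the required size. */
    bs->GetMemoryMap(&size, NULL, &map_key, &desc_size, &desc_ver);
    size += 8 * desc_size;        /* slack: allocating can grow the map */
    map = allocate_buffer(size);  /* placeholder allocator */

    for ( ; ; )
    {
        UINTN cur = size;

        status = bs->GetMemoryMap(&cur, map, &map_key,
                                  &desc_size, &desc_ver);
        if ( EFI_ERROR(status) )
            break;                /* fatal: cannot obtain the map */
        status = bs->ExitBootServices(image_handle, map_key);
        if ( status != EFI_INVALID_PARAMETER )
            break;                /* success, or an unrelated failure */
        /* Stale map key: fetch a fresh map and try again. */
    }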
@@ -1087,7 +1114,31 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable)
#ifndef CONFIG_ARM /* TODO - runtime service support */
static bool_t __initdata efi_rs_enable = 1;
-boolean_param("efi-rs", efi_rs_enable);
+static bool_t __initdata efi_map_uc;
+
+static void __init parse_efi_param(char *s)
+{
+ char *ss;
+
+ do {
+ bool_t val = !!strncmp(s, "no-", 3);
+
+ if ( !val )
+ s += 3;
+
+ ss = strchr(s, ',');
+ if ( ss )
+ *ss = '\0';
+
+ if ( !strcmp(s, "rs") )
+ efi_rs_enable = val;
+ else if ( !strcmp(s, "attr=uc") )
+ efi_map_uc = val;
+
+ s = ss + 1;
+ } while ( ss );
+}
+custom_param("efi", parse_efi_param);
#ifndef USE_SET_VIRTUAL_ADDRESS_MAP
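parse_efi_param() splits the efi= option on commas and treats a no- prefix as negation, so efi=no-rs disables runtime services while efi=attr=uc,no-rs sets both knobs in one go. The tokenizer on its own as a standalone program (only the two option names this hunk defines):

    #include <stdio.h>
    #include <string.h>

    static int rs_enable = 1, map_uc = 0;

    static void parse(char *s)
    {
        char *ss;

        do {
            int val = strncmp(s, "no-", 3) != 0;  /* "no-" negates */

            if (!val)
                s += 3;
            ss = strchr(s, ',');                  /* cut at next comma */
            if (ss)
                *ss = '\0';

            if (!strcmp(s, "rs"))
                rs_enable = val;
            else if (!strcmp(s, "attr=uc"))
                map_uc = val;

            if (ss)
                s = ss + 1;                       /* continue past comma */
        } while (ss);
    }

    int main(void)
    {
        char arg[] = "attr=uc,no-rs";

        parse(arg);
        printf("rs=%d attr=uc=%d\n", rs_enable, map_uc); /* rs=0 attr=uc=1 */
        return 0;
    }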
static __init void copy_mapping(unsigned long mfn, unsigned long end,
@@ -1149,20 +1200,25 @@ void __init efi_init_memory(void)
} *extra, *extra_head = NULL;
#endif
- printk(XENLOG_INFO "EFI memory map:\n");
+ printk(XENLOG_INFO "EFI memory map:%s\n",
+ map_bs ? " (mapping BootServices)" : "");
for ( i = 0; i < efi_memmap_size; i += efi_mdesc_size )
{
EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i;
u64 len = desc->NumberOfPages << EFI_PAGE_SHIFT;
unsigned long smfn, emfn;
- unsigned int prot = PAGE_HYPERVISOR;
+ unsigned int prot = PAGE_HYPERVISOR_RWX;
printk(XENLOG_INFO " %013" PRIx64 "-%013" PRIx64
" type=%u attr=%016" PRIx64 "\n",
desc->PhysicalStart, desc->PhysicalStart + len - 1,
desc->Type, desc->Attribute);
- if ( !efi_rs_enable || !(desc->Attribute & EFI_MEMORY_RUNTIME) )
+ if ( !efi_rs_enable ||
+ (!(desc->Attribute & EFI_MEMORY_RUNTIME) &&
+ (!map_bs ||
+ (desc->Type != EfiBootServicesCode &&
+ desc->Type != EfiBootServicesData))) )
continue;
desc->VirtualStart = INVALID_VIRTUAL_ADDRESS;
@@ -1178,17 +1234,23 @@ void __init efi_init_memory(void)
prot |= _PAGE_PAT | MAP_SMALL_PAGES;
else if ( desc->Attribute & (EFI_MEMORY_UC | EFI_MEMORY_UCE) )
prot |= _PAGE_PWT | _PAGE_PCD | MAP_SMALL_PAGES;
+ else if ( efi_bs_revision >= EFI_REVISION(2, 5) &&
+ (desc->Attribute & EFI_MEMORY_WP) )
+ prot |= _PAGE_PAT | _PAGE_PWT | MAP_SMALL_PAGES;
else
{
- printk(XENLOG_ERR "Unknown cachability for MFNs %#lx-%#lx\n",
- smfn, emfn - 1);
- continue;
+ printk(XENLOG_ERR "Unknown cachability for MFNs %#lx-%#lx%s\n",
+ smfn, emfn - 1, efi_map_uc ? ", assuming UC" : "");
+ if ( !efi_map_uc )
+ continue;
+ prot |= _PAGE_PWT | _PAGE_PCD | MAP_SMALL_PAGES;
}
- if ( desc->Attribute & EFI_MEMORY_WP )
- prot &= _PAGE_RW;
+ if ( desc->Attribute & (efi_bs_revision < EFI_REVISION(2, 5)
+ ? EFI_MEMORY_WP : EFI_MEMORY_RO) )
+ prot &= ~_PAGE_RW;
if ( desc->Attribute & EFI_MEMORY_XP )
- prot |= _PAGE_NX_BIT;
+ prot |= _PAGE_NX;
if ( pfn_to_pdx(emfn - 1) < (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
!(smfn & pfn_hole_mask) &&
@@ -1248,7 +1310,10 @@ void __init efi_init_memory(void)
{
const EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i;
- if ( (desc->Attribute & EFI_MEMORY_RUNTIME) &&
+ if ( ((desc->Attribute & EFI_MEMORY_RUNTIME) ||
+ (map_bs &&
+ (desc->Type == EfiBootServicesCode ||
+ desc->Type == EfiBootServicesData))) &&
desc->VirtualStart != INVALID_VIRTUAL_ADDRESS &&
desc->VirtualStart != desc->PhysicalStart )
copy_mapping(PFN_DOWN(desc->PhysicalStart),
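EFI_REVISION(major, minor) packs a revision as (major << 16) | minor, the same encoding as EFI_TABLE_HEADER.Revision, so "is the firmware at least 2.5" becomes a plain integer comparison. That matters here because UEFI 2.5 repurposed EFI_MEMORY_WP as a cacheability attribute and introduced EFI_MEMORY_RO for write protection, which is exactly the revision-dependent choice the hunk encodes. The comparison in isolation:

    #include <stdint.h>
    #include <stdio.h>

    /* Same packing as the macro added above. */
    #define EFI_REVISION(major, minor) (((major) << 16) | (minor))

    int main(void)
    {
        uint32_t fw = EFI_REVISION(2, 4);    /* e.g. from Hdr.Revision */

        /* Pre-2.5 firmware: WP means write-protect; 2.5+ uses RO. */
        printf("use %s for write-protect\n",
               fw >= EFI_REVISION(2, 5) ? "EFI_MEMORY_RO"
                                        : "EFI_MEMORY_WP");
        return 0;
    }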
diff --git a/xen/common/efi/runtime.c b/xen/common/efi/runtime.c
index f5df51e..ae87557 100644
--- a/xen/common/efi/runtime.c
+++ b/xen/common/efi/runtime.c
@@ -30,6 +30,7 @@ const CHAR16 *__read_mostly efi_fw_vendor;
const EFI_RUNTIME_SERVICES *__read_mostly efi_rs;
#ifndef CONFIG_ARM /* TODO - disabled until implemented on ARM */
static DEFINE_SPINLOCK(efi_rs_lock);
+static unsigned int efi_rs_on_cpu = NR_CPUS;
#endif
UINTN __read_mostly efi_memmap_size;
@@ -45,6 +46,7 @@ struct efi __read_mostly efi = {
.acpi20 = EFI_INVALID_TABLE_ADDR,
.mps = EFI_INVALID_TABLE_ADDR,
.smbios = EFI_INVALID_TABLE_ADDR,
+ .smbios3 = EFI_INVALID_TABLE_ADDR,
};
const struct efi_pci_rom *__read_mostly efi_pci_roms;
@@ -65,6 +67,8 @@ unsigned long efi_rs_enter(void)
spin_lock(&efi_rs_lock);
+ efi_rs_on_cpu = smp_processor_id();
+
/* prevent fixup_page_fault() from doing anything */
irq_enter();
@@ -99,13 +103,16 @@ void efi_rs_leave(unsigned long cr3)
asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
}
irq_exit();
+ efi_rs_on_cpu = NR_CPUS;
spin_unlock(&efi_rs_lock);
stts();
}
-paddr_t efi_rs_page_table(void)
+bool_t efi_rs_using_pgtables(void)
{
- return efi_l4_pgtable ? virt_to_maddr(efi_l4_pgtable) : 0;
+ return efi_l4_pgtable &&
+ (smp_processor_id() == efi_rs_on_cpu) &&
+ (read_cr3() == virt_to_maddr(efi_l4_pgtable));
}
unsigned long efi_get_time(void)
diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index eece46b..46737cf 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -11,8 +11,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -95,8 +94,6 @@ static uint8_t get_xen_consumer(xen_event_channel_notification_t fn)
/* Get the notification function for a given Xen-bound event channel. */
#define xen_notification_fn(e) (xen_consumers[(e)->xen_consumer-1])
-static void evtchn_set_pending(struct vcpu *v, int port);
-
static int virq_is_global(uint32_t virq)
{
int rc;
@@ -108,6 +105,7 @@ static int virq_is_global(uint32_t virq)
case VIRQ_TIMER:
case VIRQ_DEBUG:
case VIRQ_XENOPROF:
+ case VIRQ_XENPMU:
rc = 0;
break;
case VIRQ_ARCH_0 ... VIRQ_ARCH_7:
@@ -141,6 +139,7 @@ static struct evtchn *alloc_evtchn_bucket(struct domain *d, unsigned int port)
return NULL;
}
chn[i].port = port + i;
+ spin_lock_init(&chn[i].lock);
}
return chn;
}
@@ -191,9 +190,23 @@ static int get_free_port(struct domain *d)
return -ENOMEM;
bucket_from_port(d, port) = chn;
+ write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET);
+
return port;
}
+static void free_evtchn(struct domain *d, struct evtchn *chn)
+{
+ /* Clear pending event to avoid unexpected behavior on re-bind. */
+ evtchn_port_clear_pending(d, chn);
+
+ /* Reset binding to vcpu0 when the channel is freed. */
+ chn->state = ECS_FREE;
+ chn->notify_vcpu_id = 0;
+ chn->xen_consumer = 0;
+
+ xsm_evtchn_close_post(chn);
+}
static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
{
@@ -217,11 +230,15 @@ static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
if ( rc )
goto out;
+ spin_lock(&chn->lock);
+
chn->state = ECS_UNBOUND;
if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF )
chn->u.unbound.remote_domid = current->domain->domain_id;
evtchn_port_init(d, chn);
+ spin_unlock(&chn->lock);
+
alloc->port = port;
out:
@@ -232,6 +249,28 @@ static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
}
+static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn)
+{
+ if ( lchn < rchn )
+ {
+ spin_lock(&lchn->lock);
+ spin_lock(&rchn->lock);
+ }
+ else
+ {
+ if ( lchn != rchn )
+ spin_lock(&rchn->lock);
+ spin_lock(&lchn->lock);
+ }
+}
+
+static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn)
+{
+ spin_unlock(&lchn->lock);
+ if ( lchn != rchn )
+ spin_unlock(&rchn->lock);
+}
+
static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
{
struct evtchn *lchn, *rchn;
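double_evtchn_lock() takes the two per-channel locks in ascending address order, and only one of them when both ends are the same channel: the standard discipline that keeps cross-domain lock pairs deadlock-free, since every path needing both locks agrees on which comes first. The same pattern outside Xen, with POSIX mutexes (comparing unrelated pointers for order is the conventional, if formally unspecified, trick):

    #include <pthread.h>

    /*
     * Lock two objects in a globally consistent (address) order so two
     * threads locking the same pair can never deadlock on each other.
     */
    static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if (a < b) {
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);
        } else {
            if (a != b)
                pthread_mutex_lock(b);
            pthread_mutex_lock(a);
        }
    }

    static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        pthread_mutex_unlock(a);
        if (a != b)
            pthread_mutex_unlock(b);
    }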
@@ -274,6 +313,8 @@ static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
if ( rc )
goto out;
+ double_evtchn_lock(lchn, rchn);
+
lchn->u.interdomain.remote_dom = rd;
lchn->u.interdomain.remote_port = rport;
lchn->state = ECS_INTERDOMAIN;
@@ -287,7 +328,9 @@ static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
* We may have lost notifications on the remote unbound port. Fix that up
* here by conservatively always setting a notification on the local port.
*/
- evtchn_set_pending(ld->vcpu[lchn->notify_vcpu_id], lport);
+ evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn);
+
+ double_evtchn_unlock(lchn, rchn);
bind->local_port = lport;
@@ -329,11 +372,16 @@ static long evtchn_bind_virq(evtchn_bind_virq_t *bind)
ERROR_EXIT(port);
chn = evtchn_from_port(d, port);
+
+ spin_lock(&chn->lock);
+
chn->state = ECS_VIRQ;
chn->notify_vcpu_id = vcpu;
chn->u.virq = virq;
evtchn_port_init(d, chn);
+ spin_unlock(&chn->lock);
+
v->virq_to_evtchn[virq] = bind->port = port;
out:
@@ -360,10 +408,15 @@ static long evtchn_bind_ipi(evtchn_bind_ipi_t *bind)
ERROR_EXIT(port);
chn = evtchn_from_port(d, port);
+
+ spin_lock(&chn->lock);
+
chn->state = ECS_IPI;
chn->notify_vcpu_id = vcpu;
evtchn_port_init(d, chn);
+ spin_unlock(&chn->lock);
+
bind->port = port;
out:
@@ -438,17 +491,18 @@ static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
goto out;
}
+ spin_lock(&chn->lock);
+
chn->state = ECS_PIRQ;
chn->u.pirq.irq = pirq;
link_pirq_port(port, chn, v);
evtchn_port_init(d, chn);
+ spin_unlock(&chn->lock);
+
bind->port = port;
-#ifdef CONFIG_X86
- if ( is_hvm_domain(d) && domain_pirq_to_irq(d, pirq) > 0 )
- map_domain_emuirq_pirq(d, pirq, IRQ_PT);
-#endif
+ arch_evtchn_bind_pirq(d, pirq);
out:
spin_unlock(&d->event_lock);
@@ -457,7 +511,7 @@ static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
}
-static long __evtchn_close(struct domain *d1, int port1)
+static long evtchn_close(struct domain *d1, int port1, bool_t guest)
{
struct domain *d2 = NULL;
struct vcpu *v;
@@ -477,7 +531,7 @@ static long __evtchn_close(struct domain *d1, int port1)
chn1 = evtchn_from_port(d1, port1);
/* Guest cannot close a Xen-attached event channel. */
- if ( unlikely(consumer_is_xen(chn1)) )
+ if ( unlikely(consumer_is_xen(chn1)) && guest )
{
rc = -EINVAL;
goto out;
@@ -563,22 +617,24 @@ static long __evtchn_close(struct domain *d1, int port1)
BUG_ON(chn2->state != ECS_INTERDOMAIN);
BUG_ON(chn2->u.interdomain.remote_dom != d1);
+ double_evtchn_lock(chn1, chn2);
+
+ free_evtchn(d1, chn1);
+
chn2->state = ECS_UNBOUND;
chn2->u.unbound.remote_domid = d1->domain_id;
- break;
+
+ double_evtchn_unlock(chn1, chn2);
+
+ goto out;
default:
BUG();
}
- /* Clear pending event to avoid unexpected behavior on re-bind. */
- evtchn_port_clear_pending(d1, chn1);
-
- /* Reset binding to vcpu0 when the channel is freed. */
- chn1->state = ECS_FREE;
- chn1->notify_vcpu_id = 0;
-
- xsm_evtchn_close_post(chn1);
+ spin_lock(&chn1->lock);
+ free_evtchn(d1, chn1);
+ spin_unlock(&chn1->lock);
out:
if ( d2 != NULL )
@@ -593,34 +649,24 @@ static long __evtchn_close(struct domain *d1, int port1)
return rc;
}
-
-static long evtchn_close(evtchn_close_t *close)
-{
- return __evtchn_close(current->domain, close->port);
-}
-
-int evtchn_send(struct domain *d, unsigned int lport)
+int evtchn_send(struct domain *ld, unsigned int lport)
{
struct evtchn *lchn, *rchn;
- struct domain *ld = d, *rd;
- struct vcpu *rvcpu;
+ struct domain *rd;
int rport, ret = 0;
- spin_lock(&ld->event_lock);
-
- if ( unlikely(!port_is_valid(ld, lport)) )
- {
- spin_unlock(&ld->event_lock);
+ if ( !port_is_valid(ld, lport) )
return -EINVAL;
- }
lchn = evtchn_from_port(ld, lport);
+ spin_lock(&lchn->lock);
+
/* Guest cannot send via a Xen-attached event channel. */
if ( unlikely(consumer_is_xen(lchn)) )
{
- spin_unlock(&ld->event_lock);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
ret = xsm_evtchn_send(XSM_HOOK, ld, lchn);
@@ -633,14 +679,13 @@ int evtchn_send(struct domain *d, unsigned int lport)
rd = lchn->u.interdomain.remote_dom;
rport = lchn->u.interdomain.remote_port;
rchn = evtchn_from_port(rd, rport);
- rvcpu = rd->vcpu[rchn->notify_vcpu_id];
if ( consumer_is_xen(rchn) )
- (*xen_notification_fn(rchn))(rvcpu, rport);
+ xen_notification_fn(rchn)(rd->vcpu[rchn->notify_vcpu_id], rport);
else
- evtchn_set_pending(rvcpu, rport);
+ evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn);
break;
case ECS_IPI:
- evtchn_set_pending(ld->vcpu[lchn->notify_vcpu_id], lport);
+ evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn);
break;
case ECS_UNBOUND:
/* silently drop the notification */
@@ -650,16 +695,11 @@ int evtchn_send(struct domain *d, unsigned int lport)
}
out:
- spin_unlock(&ld->event_lock);
+ spin_unlock(&lchn->lock);
return ret;
}
-static void evtchn_set_pending(struct vcpu *v, int port)
-{
- evtchn_port_set_pending(v, evtchn_from_port(v->domain, port));
-}
-
int guest_enabled_event(struct vcpu *v, uint32_t virq)
{
return ((v != NULL) && (v->virq_to_evtchn[virq] != 0));
@@ -669,6 +709,7 @@ void send_guest_vcpu_virq(struct vcpu *v, uint32_t virq)
{
unsigned long flags;
int port;
+ struct domain *d;
ASSERT(!virq_is_global(virq));
@@ -678,7 +719,8 @@ void send_guest_vcpu_virq(struct vcpu *v, uint32_t virq)
if ( unlikely(port == 0) )
goto out;
- evtchn_set_pending(v, port);
+ d = v->domain;
+ evtchn_port_set_pending(d, v->vcpu_id, evtchn_from_port(d, port));
out:
spin_unlock_irqrestore(&v->virq_lock, flags);
@@ -707,7 +749,7 @@ static void send_guest_global_virq(struct domain *d, uint32_t virq)
goto out;
chn = evtchn_from_port(d, port);
- evtchn_set_pending(d->vcpu[chn->notify_vcpu_id], port);
+ evtchn_port_set_pending(d, chn->notify_vcpu_id, chn);
out:
spin_unlock_irqrestore(&v->virq_lock, flags);
@@ -731,7 +773,7 @@ void send_guest_pirq(struct domain *d, const struct pirq *pirq)
}
chn = evtchn_from_port(d, port);
- evtchn_set_pending(d->vcpu[chn->notify_vcpu_id], port);
+ evtchn_port_set_pending(d, chn->notify_vcpu_id, chn);
}
static struct domain *global_virq_handlers[NR_VIRQS] __read_mostly;
@@ -928,8 +970,6 @@ int evtchn_unmask(unsigned int port)
struct domain *d = current->domain;
struct evtchn *evtchn;
- ASSERT(spin_is_locked(&d->event_lock));
-
if ( unlikely(!port_is_valid(d, port)) )
return -EINVAL;
@@ -955,7 +995,7 @@ static long evtchn_reset(evtchn_reset_t *r)
goto out;
for ( i = 0; port_is_valid(d, i); i++ )
- (void)__evtchn_close(d, i);
+ evtchn_close(d, i, 1);
spin_lock(&d->event_lock);
@@ -1062,7 +1102,7 @@ long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
struct evtchn_close close;
if ( copy_from_guest(&close, arg, 1) != 0 )
return -EFAULT;
- rc = evtchn_close(&close);
+ rc = evtchn_close(current->domain, close.port, 1);
break;
}
@@ -1096,9 +1136,7 @@ long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
struct evtchn_unmask unmask;
if ( copy_from_guest(&unmask, arg, 1) != 0 )
return -EFAULT;
- spin_lock(¤t->domain->event_lock);
rc = evtchn_unmask(unmask.port);
- spin_unlock(¤t->domain->event_lock);
break;
}
@@ -1146,59 +1184,44 @@ long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
int alloc_unbound_xen_event_channel(
- struct vcpu *local_vcpu, domid_t remote_domid,
+ struct domain *ld, unsigned int lvcpu, domid_t remote_domid,
xen_event_channel_notification_t notification_fn)
{
struct evtchn *chn;
- struct domain *d = local_vcpu->domain;
int port, rc;
- spin_lock(&d->event_lock);
+ spin_lock(&ld->event_lock);
- rc = get_free_port(d);
+ rc = get_free_port(ld);
if ( rc < 0 )
goto out;
port = rc;
- chn = evtchn_from_port(d, port);
+ chn = evtchn_from_port(ld, port);
- rc = xsm_evtchn_unbound(XSM_TARGET, d, chn, remote_domid);
+ rc = xsm_evtchn_unbound(XSM_TARGET, ld, chn, remote_domid);
if ( rc )
goto out;
+ spin_lock(&chn->lock);
+
chn->state = ECS_UNBOUND;
chn->xen_consumer = get_xen_consumer(notification_fn);
- chn->notify_vcpu_id = local_vcpu->vcpu_id;
+ chn->notify_vcpu_id = lvcpu;
chn->u.unbound.remote_domid = remote_domid;
+ spin_unlock(&chn->lock);
+
out:
- spin_unlock(&d->event_lock);
+ spin_unlock(&ld->event_lock);
return rc < 0 ? rc : port;
}
-
-void free_xen_event_channel(
- struct vcpu *local_vcpu, int port)
+void free_xen_event_channel(struct domain *d, int port)
{
- struct evtchn *chn;
- struct domain *d = local_vcpu->domain;
-
- spin_lock(&d->event_lock);
-
- if ( unlikely(d->is_dying) )
- {
- spin_unlock(&d->event_lock);
- return;
- }
-
BUG_ON(!port_is_valid(d, port));
- chn = evtchn_from_port(d, port);
- BUG_ON(!consumer_is_xen(chn));
- chn->xen_consumer = 0;
- spin_unlock(&d->event_lock);
-
- (void)__evtchn_close(d, port);
+ evtchn_close(d, port, 0);
}
@@ -1206,29 +1229,21 @@ void notify_via_xen_event_channel(struct domain *ld, int lport)
{
struct evtchn *lchn, *rchn;
struct domain *rd;
- int rport;
-
- spin_lock(&ld->event_lock);
-
- if ( unlikely(ld->is_dying) )
- {
- spin_unlock(&ld->event_lock);
- return;
- }
ASSERT(port_is_valid(ld, lport));
lchn = evtchn_from_port(ld, lport);
- ASSERT(consumer_is_xen(lchn));
+
+ spin_lock(&lchn->lock);
if ( likely(lchn->state == ECS_INTERDOMAIN) )
{
+ ASSERT(consumer_is_xen(lchn));
rd = lchn->u.interdomain.remote_dom;
- rport = lchn->u.interdomain.remote_port;
- rchn = evtchn_from_port(rd, rport);
- evtchn_set_pending(rd->vcpu[rchn->notify_vcpu_id], rport);
+ rchn = evtchn_from_port(rd, lchn->u.interdomain.remote_port);
+ evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn);
}
- spin_unlock(&ld->event_lock);
+ spin_unlock(&lchn->lock);
}
void evtchn_check_pollers(struct domain *d, unsigned int port)
@@ -1263,8 +1278,9 @@ int evtchn_init(struct domain *d)
d->evtchn = alloc_evtchn_bucket(d, 0);
if ( !d->evtchn )
return -ENOMEM;
+ d->valid_evtchns = EVTCHNS_PER_BUCKET;
- spin_lock_init(&d->event_lock);
+ spin_lock_init_prof(d, event_lock);
if ( get_free_port(d) != 0 )
{
free_evtchn_bucket(d, d->evtchn);
@@ -1273,13 +1289,13 @@ int evtchn_init(struct domain *d)
evtchn_from_port(d, 0)->state = ECS_RESERVED;
#if MAX_VIRT_CPUS > BITS_PER_LONG
- d->poll_mask = xmalloc_array(unsigned long, BITS_TO_LONGS(MAX_VIRT_CPUS));
+ d->poll_mask = xzalloc_array(unsigned long,
+ BITS_TO_LONGS(domain_max_vcpus(d)));
if ( !d->poll_mask )
{
free_evtchn_bucket(d, d->evtchn);
return -ENOMEM;
}
- bitmap_zero(d->poll_mask, MAX_VIRT_CPUS);
#endif
return 0;
@@ -1288,7 +1304,7 @@ int evtchn_init(struct domain *d)
void evtchn_destroy(struct domain *d)
{
- unsigned int i, j;
+ unsigned int i;
/* After this barrier no new event-channel allocations can occur. */
BUG_ON(!d->is_dying);
@@ -1296,13 +1312,19 @@ void evtchn_destroy(struct domain *d)
/* Close all existing event channels. */
for ( i = 0; port_is_valid(d, i); i++ )
- {
- evtchn_from_port(d, i)->xen_consumer = 0;
- (void)__evtchn_close(d, i);
- }
+ evtchn_close(d, i, 0);
+
+ clear_global_virq_handlers(d);
+
+ evtchn_fifo_destroy(d);
+}
+
+
+void evtchn_destroy_final(struct domain *d)
+{
+ unsigned int i, j;
/* Free all event-channel buckets. */
- spin_lock(&d->event_lock);
for ( i = 0; i < NR_EVTCHN_GROUPS; i++ )
{
if ( !d->evtchn_group[i] )
@@ -1310,20 +1332,9 @@ void evtchn_destroy(struct domain *d)
for ( j = 0; j < BUCKETS_PER_GROUP; j++ )
free_evtchn_bucket(d, d->evtchn_group[i][j]);
xfree(d->evtchn_group[i]);
- d->evtchn_group[i] = NULL;
}
free_evtchn_bucket(d, d->evtchn);
- d->evtchn = NULL;
- spin_unlock(&d->event_lock);
-
- clear_global_virq_handlers(d);
-
- evtchn_fifo_destroy(d);
-}
-
-void evtchn_destroy_final(struct domain *d)
-{
#if MAX_VIRT_CPUS > BITS_PER_LONG
xfree(d->poll_mask);
d->poll_mask = NULL;
diff --git a/xen/common/event_fifo.c b/xen/common/event_fifo.c
index b81fae4..c9b7884 100644
--- a/xen/common/event_fifo.c
+++ b/xen/common/event_fifo.c
@@ -74,9 +74,9 @@ static struct evtchn_fifo_queue *lock_old_queue(const struct domain *d,
spin_unlock_irqrestore(&old_q->lock, *flags);
}
- gdprintk(XENLOG_WARNING,
- "domain %d, port %d lost event (too many queue changes)\n",
- d->domain_id, evtchn->port);
+ gprintk(XENLOG_WARNING,
+ "dom%d port %d lost event (too many queue changes)\n",
+ d->domain_id, evtchn->port);
return NULL;
}
diff --git a/xen/common/gdbstub.c b/xen/common/gdbstub.c
index 67ff726..19e3b8d 100644
--- a/xen/common/gdbstub.c
+++ b/xen/common/gdbstub.c
@@ -18,8 +18,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/*
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index fe52b63..2b449d5 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -20,8 +20,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/err.h>
@@ -37,6 +36,7 @@
#include <xen/iommu.h>
#include <xen/paging.h>
#include <xen/keyhandler.h>
+#include <xen/vmap.h>
#include <xsm/xsm.h>
#include <asm/flushtlb.h>
@@ -57,7 +57,7 @@ integer_param("gnttab_max_frames", max_grant_frames);
* New options allow setting max_maptrack_frames and
* map_grant_table_frames independently.
*/
-#define DEFAULT_MAX_MAPTRACK_FRAMES 256
+#define DEFAULT_MAX_MAPTRACK_FRAMES 1024
static unsigned int __read_mostly max_maptrack_frames;
integer_param("gnttab_max_maptrack_frames", max_maptrack_frames);
@@ -113,6 +113,18 @@ struct gnttab_unmap_common {
goto _lbl; \
} while ( 0 )
+/*
+ * Tracks a mapping of another domain's grant reference. Each domain has a
+ * table of these, indexes into which are returned as a 'mapping handle'.
+ */
+struct grant_mapping {
+ u32 ref; /* grant ref */
+ u16 flags; /* 0-4: GNTMAP_* ; 5-15: unused */
+ domid_t domid; /* granting domain */
+ u32 vcpu; /* vcpu which created the grant mapping */
+ u32 pad; /* round size to a power of 2 */
+};
+
#define MAPTRACK_PER_PAGE (PAGE_SIZE / sizeof(struct grant_mapping))
#define maptrack_entry(t, e) \
((t)->maptrack[(e)/MAPTRACK_PER_PAGE][(e)%MAPTRACK_PER_PAGE])
@@ -157,10 +169,13 @@ struct active_grant_entry {
in the page. */
unsigned length:16; /* For sub-page grants, the length of the
grant. */
+ spinlock_t lock; /* lock to protect access of this entry.
+ see docs/misc/grant-tables.txt for
+ locking protocol */
};
#define ACGNT_PER_PAGE (PAGE_SIZE / sizeof(struct active_grant_entry))
-#define active_entry(t, e) \
+#define _active_entry(t, e) \
((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
static inline void gnttab_flush_tlb(const struct domain *d)
@@ -188,6 +203,24 @@ nr_active_grant_frames(struct grant_table *gt)
return num_act_frames_from_sha_frames(nr_grant_frames(gt));
}
+static inline struct active_grant_entry *
+active_entry_acquire(struct grant_table *t, grant_ref_t e)
+{
+ struct active_grant_entry *act;
+
+ ASSERT(rw_is_locked(&t->lock));
+
+ act = &_active_entry(t, e);
+ spin_lock(&act->lock);
+
+ return act;
+}
+
+static inline void active_entry_release(struct active_grant_entry *act)
+{
+ spin_unlock(&act->lock);
+}
+
/* Check if the page has been paged out, or needs unsharing.
If rc == GNTST_okay, *page contains the page struct with a ref taken.
Caller must do put_page(*page).
@@ -231,87 +264,197 @@ static int __get_paged_frame(unsigned long gfn, unsigned long *frame, struct pag
static inline void
double_gt_lock(struct grant_table *lgt, struct grant_table *rgt)
{
+ /*
+ * See mapkind() for why the write lock is also required for the
+ * remote domain.
+ */
if ( lgt < rgt )
{
- spin_lock(&lgt->lock);
- spin_lock(&rgt->lock);
+ write_lock(&lgt->lock);
+ write_lock(&rgt->lock);
}
else
{
if ( lgt != rgt )
- spin_lock(&rgt->lock);
- spin_lock(&lgt->lock);
+ write_lock(&rgt->lock);
+ write_lock(&lgt->lock);
}
}
static inline void
double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt)
{
- spin_unlock(&lgt->lock);
+ write_unlock(&lgt->lock);
if ( lgt != rgt )
- spin_unlock(&rgt->lock);
+ write_unlock(&rgt->lock);
}
static inline int
__get_maptrack_handle(
- struct grant_table *t)
+ struct grant_table *t,
+ struct vcpu *v)
{
- unsigned int h;
- if ( unlikely((h = t->maptrack_head) == MAPTRACK_TAIL) )
- return -1;
- t->maptrack_head = maptrack_entry(t, h).ref;
- return h;
+ unsigned int head, next, prev_head;
+
+ do {
+ /* No maptrack pages allocated for this VCPU yet? */
+ head = read_atomic(&v->maptrack_head);
+ if ( unlikely(head == MAPTRACK_TAIL) )
+ return -1;
+
+ /*
+ * Always keep one entry in the free list to make it easier to
+ * add free entries to the tail.
+ */
+ next = read_atomic(&maptrack_entry(t, head).ref);
+ if ( unlikely(next == MAPTRACK_TAIL) )
+ return -1;
+
+ prev_head = head;
+ head = cmpxchg(&v->maptrack_head, prev_head, next);
+ } while ( head != prev_head );
+
+ return head;
+}
+
+/*
+ * Try to "steal" a free maptrack entry from another VCPU.
+ *
+ * A stolen entry is transferred to the thief, so the number of
+ * entries for each VCPU should tend to the usage pattern.
+ *
+ * To avoid having to atomically count the number of free entries on
+ * each VCPU and to avoid two VCPUs repeatedly stealing entries from
+ * each other, the initial victim VCPU is selected randomly.
+ */
+static int steal_maptrack_handle(struct grant_table *t,
+ const struct vcpu *curr)
+{
+ const struct domain *currd = curr->domain;
+ unsigned int first, i;
+
+ /* Find an initial victim. */
+ first = i = get_random() % currd->max_vcpus;
+
+ do {
+ if ( currd->vcpu[i] )
+ {
+ int handle;
+
+ handle = __get_maptrack_handle(t, currd->vcpu[i]);
+ if ( handle != -1 )
+ {
+ maptrack_entry(t, handle).vcpu = curr->vcpu_id;
+ return handle;
+ }
+ }
+
+ i++;
+ if ( i == currd->max_vcpus )
+ i = 0;
+ } while ( i != first );
+
+ /* No free handles on any VCPU. */
+ return -1;
}
static inline void
put_maptrack_handle(
struct grant_table *t, int handle)
{
- spin_lock(&t->lock);
- maptrack_entry(t, handle).ref = t->maptrack_head;
- t->maptrack_head = handle;
- spin_unlock(&t->lock);
+ struct domain *currd = current->domain;
+ struct vcpu *v;
+ unsigned int prev_tail, cur_tail;
+
+ /* 1. Set entry to be a tail. */
+ maptrack_entry(t, handle).ref = MAPTRACK_TAIL;
+
+ /* 2. Add entry to the tail of the list on the original VCPU. */
+ v = currd->vcpu[maptrack_entry(t, handle).vcpu];
+
+ cur_tail = read_atomic(&v->maptrack_tail);
+ do {
+ prev_tail = cur_tail;
+ cur_tail = cmpxchg(&v->maptrack_tail, prev_tail, handle);
+ } while ( cur_tail != prev_tail );
+
+ /* 3. Update the old tail entry to point to the new entry. */
+ write_atomic(&maptrack_entry(t, prev_tail).ref, handle);
}
static inline int
get_maptrack_handle(
struct grant_table *lgt)
{
+ struct vcpu *curr = current;
int i;
grant_handle_t handle;
struct grant_mapping *new_mt;
- unsigned int new_mt_limit, nr_frames;
- spin_lock(&lgt->lock);
+ handle = __get_maptrack_handle(lgt, curr);
+ if ( likely(handle != -1) )
+ return handle;
- while ( unlikely((handle = __get_maptrack_handle(lgt)) == -1) )
+ spin_lock(&lgt->maptrack_lock);
+
+ /*
+ * If we've run out of frames, try stealing an entry from another
+ * VCPU (in case the guest isn't mapping across its VCPUs evenly).
+ */
+ if ( nr_maptrack_frames(lgt) >= max_maptrack_frames )
{
- nr_frames = nr_maptrack_frames(lgt);
- if ( nr_frames >= max_maptrack_frames )
- break;
+ /*
+ * Can drop the lock since no other VCPU can be adding a new
+ * frame once they've run out.
+ */
+ spin_unlock(&lgt->maptrack_lock);
- new_mt = alloc_xenheap_page();
- if ( !new_mt )
- break;
+ /*
+ * Uninitialized free list? Steal an extra entry for the tail
+ * sentinel.
+ */
+ if ( curr->maptrack_tail == MAPTRACK_TAIL )
+ {
+ handle = steal_maptrack_handle(lgt, curr);
+ if ( handle == -1 )
+ return -1;
+ curr->maptrack_tail = handle;
+ write_atomic(&curr->maptrack_head, handle);
+ }
+ return steal_maptrack_handle(lgt, curr);
+ }
- clear_page(new_mt);
+ new_mt = alloc_xenheap_page();
+ if ( !new_mt )
+ {
+ spin_unlock(&lgt->maptrack_lock);
+ return -1;
+ }
+ clear_page(new_mt);
- new_mt_limit = lgt->maptrack_limit + MAPTRACK_PER_PAGE;
+ /*
+ * Use the first new entry and add the remaining entries to the
+ * head of the free list.
+ */
+ handle = lgt->maptrack_limit;
- for ( i = 1; i < MAPTRACK_PER_PAGE; i++ )
- new_mt[i - 1].ref = lgt->maptrack_limit + i;
- new_mt[i - 1].ref = lgt->maptrack_head;
- lgt->maptrack_head = lgt->maptrack_limit;
+ for ( i = 0; i < MAPTRACK_PER_PAGE; i++ )
+ {
+ new_mt[i].ref = handle + i + 1;
+ new_mt[i].vcpu = curr->vcpu_id;
+ }
+ new_mt[i - 1].ref = curr->maptrack_head;
- lgt->maptrack[nr_frames] = new_mt;
- smp_wmb();
- lgt->maptrack_limit = new_mt_limit;
+ /* Set tail directly if this is the first page for this VCPU. */
+ if ( curr->maptrack_tail == MAPTRACK_TAIL )
+ curr->maptrack_tail = handle + MAPTRACK_PER_PAGE - 1;
- gdprintk(XENLOG_INFO, "Increased maptrack size to %u frames\n",
- nr_frames + 1);
- }
+ write_atomic(&curr->maptrack_head, handle + 1);
+
+ lgt->maptrack[nr_maptrack_frames(lgt)] = new_mt;
+ lgt->maptrack_limit += MAPTRACK_PER_PAGE;
- spin_unlock(&lgt->lock);
+ spin_unlock(&lgt->maptrack_lock);
return handle;
}
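The maptrack free list is now per-VCPU and manipulated with cmpxchg() instead of under the grant-table lock: allocation pops from maptrack_head in a compare-and-swap retry loop, always leaving one entry behind as a tail sentinel so that lock-free appends in put_maptrack_handle() have a predecessor to link to. The pop half of that scheme, sketched with C11 atomics over a plain array-indexed list (single-threaded demo; a real concurrent reader would also need the next[] loads to be atomic):

    #include <stdatomic.h>
    #include <stdio.h>

    #define TAIL (~0u)               /* terminator, like MAPTRACK_TAIL */

    static unsigned int next[8];     /* next[i]: entry following i */
    static _Atomic unsigned int head = TAIL;

    /* Pop the head entry, keeping at least one entry (the sentinel). */
    static int pop(void)
    {
        unsigned int h = atomic_load(&head);

        do {
            if (h == TAIL || next[h] == TAIL)
                return -1;           /* empty, or only the sentinel left */
        } while (!atomic_compare_exchange_weak(&head, &h, next[h]));
        return h;
    }

    int main(void)
    {
        /* Build the list 0 -> 1 -> 2 -> TAIL; entry 2 is the sentinel. */
        next[0] = 1; next[1] = 2; next[2] = TAIL;
        atomic_store(&head, 0);
        printf("%d %d %d\n", pop(), pop(), pop());   /* 0 1 -1 */
        return 0;
    }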
@@ -319,11 +462,21 @@ get_maptrack_handle(
/* Number of grant table entries. Caller must hold d's grant table lock. */
static unsigned int nr_grant_entries(struct grant_table *gt)
{
- ASSERT(gt->gt_version != 0);
- if (gt->gt_version == 1)
- return (nr_grant_frames(gt) << PAGE_SHIFT) / sizeof(grant_entry_v1_t);
- else
- return (nr_grant_frames(gt) << PAGE_SHIFT) / sizeof(grant_entry_v2_t);
+ switch ( gt->gt_version )
+ {
+#define f2e(nr, ver) (((nr) << PAGE_SHIFT) / sizeof(grant_entry_v##ver##_t))
+ case 1:
+ BUILD_BUG_ON(f2e(INITIAL_NR_GRANT_FRAMES, 1) <
+ GNTTAB_NR_RESERVED_ENTRIES);
+ return f2e(nr_grant_frames(gt), 1);
+ case 2:
+ BUILD_BUG_ON(f2e(INITIAL_NR_GRANT_FRAMES, 2) <
+ GNTTAB_NR_RESERVED_ENTRIES);
+ return f2e(nr_grant_frames(gt), 2);
+#undef f2e
+ }
+
+ return 0;
}
static int _set_status_v1(domid_t domid,
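The f2e() helper converts grant-table frames to entries as (frames << PAGE_SHIFT) / sizeof(entry): with 4 KiB pages that is 512 v1 entries per frame (8-byte entries) or 256 v2 entries per frame (16-byte entries, per the public grant-table ABI), and the BUILD_BUG_ONs assert at build time that even the initial allocation covers the reserved entries. The arithmetic checked in a few lines:

    #include <stdio.h>

    #define PAGE_SHIFT 12                        /* 4 KiB pages */
    #define f2e(nr, entsz) (((nr) << PAGE_SHIFT) / (entsz))

    int main(void)
    {
        printf("v1: %d entries/frame\n", f2e(1, 8));   /* 512 */
        printf("v2: %d entries/frame\n", f2e(1, 16));  /* 256 */
        return 0;
    }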
@@ -505,27 +658,27 @@ static int grant_map_exists(const struct domain *ld,
unsigned long mfn,
unsigned int *ref_count)
{
- const struct active_grant_entry *act;
unsigned int ref, max_iter;
- ASSERT(spin_is_locked(&rgt->lock));
+ ASSERT(rw_is_locked(&rgt->lock));
max_iter = min(*ref_count + (1 << GNTTABOP_CONTINUATION_ARG_SHIFT),
nr_grant_entries(rgt));
for ( ref = *ref_count; ref < max_iter; ref++ )
{
- act = &active_entry(rgt, ref);
+ struct active_grant_entry *act;
+ bool_t exists;
- if ( !act->pin )
- continue;
+ act = active_entry_acquire(rgt, ref);
- if ( act->domid != ld->domain_id )
- continue;
+ exists = act->pin
+ && act->domid == ld->domain_id
+ && act->frame == mfn;
- if ( act->frame != mfn )
- continue;
+ active_entry_release(act);
- return 0;
+ if ( exists )
+ return 0;
}
if ( ref < nr_grant_entries(rgt) )
@@ -537,24 +690,39 @@ static int grant_map_exists(const struct domain *ld,
return -EINVAL;
}
-static void mapcount(
- struct grant_table *lgt, struct domain *rd, unsigned long mfn,
- unsigned int *wrc, unsigned int *rdc)
+#define MAPKIND_READ 1
+#define MAPKIND_WRITE 2
+static unsigned int mapkind(
+ struct grant_table *lgt, const struct domain *rd, unsigned long mfn)
{
struct grant_mapping *map;
grant_handle_t handle;
+ unsigned int kind = 0;
- *wrc = *rdc = 0;
+ /*
+ * Must have the local domain's grant table write lock when
+ * iterating over its maptrack entries.
+ */
+ ASSERT(rw_is_write_locked(&lgt->lock));
+ /*
+ * Must have the remote domain's grant table write lock while
+ * counting its active entries.
+ */
+ ASSERT(rw_is_write_locked(&rd->grant_table->lock));
- for ( handle = 0; handle < lgt->maptrack_limit; handle++ )
+ for ( handle = 0; !(kind & MAPKIND_WRITE) &&
+ handle < lgt->maptrack_limit; handle++ )
{
map = &maptrack_entry(lgt, handle);
if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) ||
map->domid != rd->domain_id )
continue;
- if ( active_entry(rd->grant_table, map->ref).frame == mfn )
- (map->flags & GNTMAP_readonly) ? (*rdc)++ : (*wrc)++;
+ if ( _active_entry(rd->grant_table, map->ref).frame == mfn )
+ kind |= map->flags & GNTMAP_readonly ?
+ MAPKIND_READ : MAPKIND_WRITE;
}
+
+ return kind;
}
/*
@@ -580,10 +748,9 @@ __gnttab_map_grant_ref(
unsigned int cache_flags;
struct active_grant_entry *act = NULL;
struct grant_mapping *mt;
- grant_entry_v1_t *sha1;
- grant_entry_v2_t *sha2;
grant_entry_header_t *shah;
uint16_t *status;
+ bool_t need_iommu;
led = current;
ld = led->domain;
@@ -629,34 +796,22 @@ __gnttab_map_grant_ref(
}
rgt = rd->grant_table;
- spin_lock(&rgt->lock);
-
- if ( rgt->gt_version == 0 )
- PIN_FAIL(unlock_out, GNTST_general_error,
- "remote grant table not yet set up\n");
+ read_lock(&rgt->lock);
/* Bounds check on the grant ref */
if ( unlikely(op->ref >= nr_grant_entries(rgt)))
PIN_FAIL(unlock_out, GNTST_bad_gntref, "Bad ref (%d).\n", op->ref);
- act = &active_entry(rgt, op->ref);
+ act = active_entry_acquire(rgt, op->ref);
shah = shared_entry_header(rgt, op->ref);
- if (rgt->gt_version == 1) {
- sha1 = &shared_entry_v1(rgt, op->ref);
- sha2 = NULL;
- status = &shah->flags;
- } else {
- sha2 = &shared_entry_v2(rgt, op->ref);
- sha1 = NULL;
- status = &status_entry(rgt, op->ref);
- }
+ status = rgt->gt_version == 1 ? &shah->flags : &status_entry(rgt, op->ref);
/* If already pinned, check the active domid and avoid refcnt overflow. */
if ( act->pin &&
((act->domid != ld->domain_id) ||
(act->pin & 0x80808080U) != 0 ||
(act->is_sub_page)) )
- PIN_FAIL(unlock_out, GNTST_general_error,
+ PIN_FAIL(act_release_out, GNTST_general_error,
"Bad domain (%d != %d), or risk of counter overflow %08x, or subpage %d\n",
act->domid, ld->domain_id, act->pin, act->is_sub_page);
@@ -667,13 +822,15 @@ __gnttab_map_grant_ref(
if ( (rc = _set_status(rgt->gt_version, ld->domain_id,
op->flags & GNTMAP_readonly,
1, shah, act, status) ) != GNTST_okay )
- goto unlock_out;
+ goto act_release_out;
if ( !act->pin )
{
unsigned long frame;
+ unsigned long gfn = rgt->gt_version == 1 ?
+ shared_entry_v1(rgt, op->ref).frame :
+ shared_entry_v2(rgt, op->ref).full_page.frame;
- unsigned long gfn = sha1 ? sha1->frame : sha2->full_page.frame;
rc = __get_paged_frame(gfn, &frame, &pg,
!!(op->flags & GNTMAP_readonly), rd);
if ( rc != GNTST_okay )
@@ -702,7 +859,8 @@ __gnttab_map_grant_ref(
cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
- spin_unlock(&rgt->lock);
+ active_entry_release(act);
+ read_unlock(&rgt->lock);
/* pg may be set, with a refcount included, from __get_paged_frame */
if ( !pg )
@@ -778,25 +936,27 @@ __gnttab_map_grant_ref(
goto undo_out;
}
- double_gt_lock(lgt, rgt);
-
- if ( gnttab_need_iommu_mapping(ld) )
+ need_iommu = gnttab_need_iommu_mapping(ld);
+ if ( need_iommu )
{
- unsigned int wrc, rdc;
+ unsigned int kind;
int err = 0;
+
+ double_gt_lock(lgt, rgt);
+
/* We're not translated, so we know that gmfns and mfns are
the same things, so the IOMMU entry is always 1-to-1. */
- mapcount(lgt, rd, frame, &wrc, &rdc);
+ kind = mapkind(lgt, rd, frame);
if ( (act_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) &&
!(old_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) )
{
- if ( wrc == 0 )
+ if ( !(kind & MAPKIND_WRITE) )
err = iommu_map_page(ld, frame, frame,
IOMMUF_readable|IOMMUF_writable);
}
else if ( act_pin && !old_pin )
{
- if ( (wrc + rdc) == 0 )
+ if ( !kind )
err = iommu_map_page(ld, frame, frame, IOMMUF_readable);
}
if ( err )
@@ -809,12 +969,22 @@ __gnttab_map_grant_ref(
TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom);
+ /*
+ * All maptrack entry users check mt->flags first before using the
+ * other fields so just ensure the flags field is stored last.
+ *
+ * However, if gnttab_need_iommu_mapping() then this would race
+ * with a concurrent mapkind() call (on an unmap, for example)
+ * and a lock is required.
+ */
mt = &maptrack_entry(lgt, handle);
mt->domid = op->dom;
mt->ref = op->ref;
- mt->flags = op->flags;
+ wmb();
+ write_atomic(&mt->flags, op->flags);
- double_gt_unlock(lgt, rgt);
+ if ( need_iommu )
+ double_gt_unlock(lgt, rgt);
op->dev_bus_addr = (u64)frame << PAGE_SHIFT;
op->handle = handle;
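/*
 * Sketch (not part of the import) of the reader side this ordering
 * enables: domid and ref are stored before the wmb() and flags is
 * published last with write_atomic(), so a lock-free reader that sees
 * non-zero flags may safely consume the other fields -- exactly the
 * read_atomic() check __gnttab_unmap_common performs below.
 */
static bool_t maptrack_live_sketch(const struct grant_mapping *mt)
{
    if ( !read_atomic(&mt->flags) )
        return 0;             /* never published, or already torn down */
    /* mt->domid and mt->ref were written before flags: stable here.   */
    return 1;
}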
@@ -837,9 +1007,9 @@ __gnttab_map_grant_ref(
put_page(pg);
}
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
- act = &active_entry(rgt, op->ref);
+ act = active_entry_acquire(rgt, op->ref);
if ( op->flags & GNTMAP_device_map )
act->pin -= (op->flags & GNTMAP_readonly) ?
@@ -856,8 +1026,11 @@ __gnttab_map_grant_ref(
if ( !act->pin )
gnttab_clear_flag(_GTF_reading, status);
+ act_release_out:
+ active_entry_release(act);
+
unlock_out:
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
op->status = rc;
put_maptrack_handle(lgt, handle);
rcu_unlock_domain(rd);
@@ -907,18 +1080,19 @@ __gnttab_unmap_common(
}
op->map = &maptrack_entry(lgt, op->handle);
- spin_lock(&lgt->lock);
- if ( unlikely(!op->map->flags) )
+ read_lock(&lgt->lock);
+
+ if ( unlikely(!read_atomic(&op->map->flags)) )
{
- spin_unlock(&lgt->lock);
+ read_unlock(&lgt->lock);
gdprintk(XENLOG_INFO, "Zero flags for handle (%d).\n", op->handle);
op->status = GNTST_bad_handle;
return;
}
dom = op->map->domid;
- spin_unlock(&lgt->lock);
+ read_unlock(&lgt->lock);
if ( unlikely((rd = rcu_lock_domain_by_id(dom)) == NULL) )
{
@@ -939,9 +1113,10 @@ __gnttab_unmap_common(
TRACE_1D(TRC_MEM_PAGE_GRANT_UNMAP, dom);
rgt = rd->grant_table;
- double_gt_lock(lgt, rgt);
- op->flags = op->map->flags;
+ read_lock(&rgt->lock);
+
+ op->flags = read_atomic(&op->map->flags);
if ( unlikely(!op->flags) || unlikely(op->map->domid != dom) )
{
gdprintk(XENLOG_WARNING, "Unstable handle %u\n", op->handle);
@@ -950,7 +1125,7 @@ __gnttab_unmap_common(
}
op->rd = rd;
- act = &active_entry(rgt, op->map->ref);
+ act = active_entry_acquire(rgt, op->map->ref);
if ( op->frame == 0 )
{
@@ -959,7 +1134,7 @@ __gnttab_unmap_common(
else
{
if ( unlikely(op->frame != act->frame) )
- PIN_FAIL(unmap_out, GNTST_general_error,
+ PIN_FAIL(act_release_out, GNTST_general_error,
"Bad frame number doesn't match gntref. (%lx != %lx)\n",
op->frame, act->frame);
if ( op->flags & GNTMAP_device_map )
@@ -978,7 +1153,7 @@ __gnttab_unmap_common(
if ( (rc = replace_grant_host_mapping(op->host_addr,
op->frame, op->new_addr,
op->flags)) < 0 )
- goto unmap_out;
+ goto act_release_out;
ASSERT(act->pin & (GNTPIN_hstw_mask | GNTPIN_hstr_mask));
op->map->flags &= ~GNTMAP_host_map;
@@ -988,28 +1163,34 @@ __gnttab_unmap_common(
act->pin -= GNTPIN_hstw_inc;
}
- if ( gnttab_need_iommu_mapping(ld) )
+ act_release_out:
+ active_entry_release(act);
+ unmap_out:
+ read_unlock(&rgt->lock);
+
+ if ( rc == GNTST_okay && gnttab_need_iommu_mapping(ld) )
{
- unsigned int wrc, rdc;
+ unsigned int kind;
int err = 0;
- mapcount(lgt, rd, op->frame, &wrc, &rdc);
- if ( (wrc + rdc) == 0 )
+
+ double_gt_lock(lgt, rgt);
+
+ kind = mapkind(lgt, rd, op->frame);
+ if ( !kind )
err = iommu_unmap_page(ld, op->frame);
- else if ( wrc == 0 )
+ else if ( !(kind & MAPKIND_WRITE) )
err = iommu_map_page(ld, op->frame, op->frame, IOMMUF_readable);
+
+ double_gt_unlock(lgt, rgt);
+
if ( err )
- {
rc = GNTST_general_error;
- goto unmap_out;
- }
}
/* If just unmapped a writable mapping, mark as dirtied */
- if ( !(op->flags & GNTMAP_readonly) )
+ if ( rc == GNTST_okay && !(op->flags & GNTMAP_readonly) )
gnttab_mark_dirty(rd, op->frame);
- unmap_out:
- double_gt_unlock(lgt, rgt);
op->status = rc;
rcu_unlock_domain(rd);
}
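/*
 * Sketch (not quoted from the import): double_gt_lock(), used on the
 * IOMMU paths above, takes the two grant table write locks in a fixed
 * address order so concurrent map and unmap operations on the same
 * pair of domains cannot deadlock.
 */
static inline void double_gt_lock_sketch(struct grant_table *lgt,
                                         struct grant_table *rgt)
{
    if ( lgt < rgt )
    {
        write_lock(&lgt->lock);
        write_lock(&rgt->lock);
    }
    else
    {
        if ( lgt != rgt )
            write_lock(&rgt->lock);
        write_lock(&lgt->lock);
    }
}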
@@ -1039,12 +1220,12 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
rcu_lock_domain(rd);
rgt = rd->grant_table;
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
if ( rgt->gt_version == 0 )
- goto unmap_out;
+ goto unlock_out;
- act = &active_entry(rgt, op->map->ref);
+ act = active_entry_acquire(rgt, op->map->ref);
sha = shared_entry_header(rgt, op->map->ref);
if ( rgt->gt_version == 1 )
@@ -1058,7 +1239,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
* Suggests that __gnttab_unmap_common failed early and so
* nothing further to do
*/
- goto unmap_out;
+ goto act_release_out;
}
pg = mfn_to_page(op->frame);
@@ -1082,7 +1263,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
* Suggests that __gnttab_unmap_common failed in
* replace_grant_host_mapping() so nothing further to do
*/
- goto unmap_out;
+ goto act_release_out;
}
if ( !is_iomem_page(op->frame) )
@@ -1103,8 +1284,11 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
if ( act->pin == 0 )
gnttab_clear_flag(_GTF_reading, status);
- unmap_out:
- spin_unlock(&rgt->lock);
+ act_release_out:
+ active_entry_release(act);
+ unlock_out:
+ read_unlock(&rgt->lock);
+
if ( put_handle )
{
op->map->flags = 0;
@@ -1290,13 +1474,15 @@ gnttab_unpopulate_status_frames(struct domain *d, struct grant_table *gt)
gt->nr_status_frames = 0;
}
+/*
+ * Grow the grant table. The caller must hold the grant table's
+ * write lock before calling this function.
+ */
int
gnttab_grow_table(struct domain *d, unsigned int req_nr_frames)
{
- /* d's grant table lock must be held by the caller */
-
struct grant_table *gt = d->grant_table;
- unsigned int i;
+ unsigned int i, j;
ASSERT(req_nr_frames <= max_grant_frames);
@@ -1311,6 +1497,8 @@ gnttab_grow_table(struct domain *d, unsigned int req_nr_frames)
if ( (gt->active[i] = alloc_xenheap_page()) == NULL )
goto active_alloc_failed;
clear_page(gt->active[i]);
+ for ( j = 0; j < ACGNT_PER_PAGE; j++ )
+ spin_lock_init(&gt->active[i][j].lock);
}
/* Shared */
@@ -1398,7 +1586,7 @@ gnttab_setup_table(
}
gt = d->grant_table;
- spin_lock(&gt->lock);
+ write_lock(&gt->lock);
if ( gt->gt_version == 0 )
gt->gt_version = 1;
@@ -1426,7 +1614,7 @@ gnttab_setup_table(
}
out3:
- spin_unlock(&gt->lock);
+ write_unlock(&gt->lock);
out2:
rcu_unlock_domain(d);
out1:
@@ -1468,13 +1656,13 @@ gnttab_query_size(
goto query_out_unlock;
}
- spin_lock(&d->grant_table->lock);
+ read_lock(&d->grant_table->lock);
op.nr_frames = nr_grant_frames(d->grant_table);
op.max_nr_frames = max_grant_frames;
op.status = GNTST_okay;
- spin_unlock(&d->grant_table->lock);
+ read_unlock(&d->grant_table->lock);
query_out_unlock:
@@ -1500,15 +1688,7 @@ gnttab_prepare_for_transfer(
union grant_combo scombo, prev_scombo, new_scombo;
int retries = 0;
- spin_lock(&rgt->lock);
-
- if ( rgt->gt_version == 0 )
- {
- gdprintk(XENLOG_INFO,
- "Grant table not ready for transfer to domain(%d).\n",
- rd->domain_id);
- goto fail;
- }
+ read_lock(&rgt->lock);
if ( unlikely(ref >= nr_grant_entries(rgt)) )
{
@@ -1551,11 +1731,11 @@ gnttab_prepare_for_transfer(
scombo = prev_scombo;
}
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
return 1;
fail:
- spin_unlock(&rgt->lock);
+ read_unlock(&rgt->lock);
return 0;
}
@@ -1570,6 +1750,7 @@ gnttab_transfer(
struct gnttab_transfer gop;
unsigned long mfn;
unsigned int max_bitsize;
+ struct active_grant_entry *act;
for ( i = 0; i < count; i++ )
{
@@ -1651,24 +1832,22 @@ gnttab_transfer(
}
max_bitsize = domain_clamp_alloc_bitsize(
- e, BITS_PER_LONG+PAGE_SHIFT-1);
- if ( (1UL << (max_bitsize - PAGE_SHIFT)) <= mfn )
+ e, e->grant_table->gt_version > 1 || paging_mode_translate(e)
+ ? BITS_PER_LONG + PAGE_SHIFT : 32 + PAGE_SHIFT);
+ if ( max_bitsize < BITS_PER_LONG + PAGE_SHIFT &&
+ (mfn >> (max_bitsize - PAGE_SHIFT)) )
{
struct page_info *new_page;
- void *sp, *dp;
- new_page = alloc_domheap_page(NULL, MEMF_bits(max_bitsize));
+ new_page = alloc_domheap_page(e, MEMF_no_owner |
+ MEMF_bits(max_bitsize));
if ( new_page == NULL )
{
gop.status = GNTST_address_too_big;
goto unlock_and_copyback;
}
- sp = map_domain_page(mfn);
- dp = __map_domain_page(new_page);
- memcpy(dp, sp, PAGE_SIZE);
- unmap_domain_page(dp);
- unmap_domain_page(sp);
+ copy_domain_page(_mfn(page_to_mfn(new_page)), _mfn(mfn));
page->count_info &= ~(PGC_count_mask|PGC_allocated);
free_domheap_page(page);
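/*
 * Worked example (not part of the import), assuming PAGE_SHIFT == 12:
 * for a v1, non-translated destination the clamp yields
 * max_bitsize == 32 + 12 == 44, i.e. the transferred frame number must
 * fit in 32 bits.  An mfn of 0x100000000 gives
 * mfn >> (44 - 12) == 1, so the contents are first copied into a page
 * allocated with MEMF_bits(44) and the original page is freed.
 */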
@@ -1747,25 +1926,31 @@ gnttab_transfer(
TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id);
/* Tell the guest about its new page frame. */
- spin_lock(&e->grant_table->lock);
+ read_lock(&e->grant_table->lock);
+ act = active_entry_acquire(e->grant_table, gop.ref);
if ( e->grant_table->gt_version == 1 )
{
grant_entry_v1_t *sha = &shared_entry_v1(e->grant_table, gop.ref);
+
guest_physmap_add_page(e, sha->frame, mfn, 0);
- sha->frame = mfn;
+ if ( !paging_mode_translate(e) )
+ sha->frame = mfn;
}
else
{
grant_entry_v2_t *sha = &shared_entry_v2(e->grant_table, gop.ref);
+
guest_physmap_add_page(e, sha->full_page.frame, mfn, 0);
- sha->full_page.frame = mfn;
+ if ( !paging_mode_translate(e) )
+ sha->full_page.frame = mfn;
}
smp_wmb();
shared_entry_header(e->grant_table, gop.ref)->flags |=
GTF_transfer_completed;
- spin_unlock(&e->grant_table->lock);
+ active_entry_release(act);
+ read_unlock(&e->grant_table->lock);
rcu_unlock_domain(e);
@@ -1803,9 +1988,9 @@ __release_grant_for_copy(
released_read = 0;
released_write = 0;
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
- act = &active_entry(rgt, gref);
+ act = active_entry_acquire(rgt, gref);
sha = shared_entry_header(rgt, gref);
r_frame = act->frame;
@@ -1844,7 +2029,8 @@ __release_grant_for_copy(
released_read = 1;
}
- spin_unlock(&rgt->lock);
+ active_entry_release(act);
+ read_unlock(&rgt->lock);
if ( td != rd )
{
@@ -1882,10 +2068,9 @@ static int
__acquire_grant_for_copy(
struct domain *rd, unsigned long gref, domid_t ldom, int readonly,
unsigned long *frame, struct page_info **page,
- unsigned *page_off, unsigned *length, unsigned allow_transitive)
+ uint16_t *page_off, uint16_t *length, unsigned allow_transitive)
{
struct grant_table *rgt = rd->grant_table;
- grant_entry_v1_t *sha1;
grant_entry_v2_t *sha2;
grant_entry_header_t *shah;
struct active_grant_entry *act;
@@ -1895,34 +2080,28 @@ __acquire_grant_for_copy(
grant_ref_t trans_gref;
struct domain *td;
unsigned long grant_frame;
- unsigned trans_page_off;
- unsigned trans_length;
+ uint16_t trans_page_off;
+ uint16_t trans_length;
int is_sub_page;
s16 rc = GNTST_okay;
*page = NULL;
- spin_lock(&rgt->lock);
-
- if ( rgt->gt_version == 0 )
- PIN_FAIL(unlock_out, GNTST_general_error,
- "remote grant table not ready\n");
+ read_lock(&rgt->lock);
if ( unlikely(gref >= nr_grant_entries(rgt)) )
- PIN_FAIL(unlock_out, GNTST_bad_gntref,
+ PIN_FAIL(gt_unlock_out, GNTST_bad_gntref,
"Bad grant reference %ld\n", gref);
- act = &active_entry(rgt, gref);
+ act = active_entry_acquire(rgt, gref);
shah = shared_entry_header(rgt, gref);
if ( rgt->gt_version == 1 )
{
- sha1 = &shared_entry_v1(rgt, gref);
sha2 = NULL;
status = &shah->flags;
}
else
{
- sha1 = NULL;
sha2 = &shared_entry_v2(rgt, gref);
status = &status_entry(rgt, gref);
}
@@ -1944,7 +2123,19 @@ __acquire_grant_for_copy(
td = rd;
trans_gref = gref;
- if ( sha2 && (shah->flags & GTF_type_mask) == GTF_transitive )
+ if ( !sha2 )
+ {
+ unsigned long gfn = shared_entry_v1(rgt, gref).frame;
+
+ rc = __get_paged_frame(gfn, &grant_frame, page, readonly, rd);
+ if ( rc != GNTST_okay )
+ goto unlock_out_clear;
+ act->gfn = gfn;
+ is_sub_page = 0;
+ trans_page_off = 0;
+ trans_length = PAGE_SIZE;
+ }
+ else if ( (shah->flags & GTF_type_mask) == GTF_transitive )
{
if ( !allow_transitive )
PIN_FAIL(unlock_out_clear, GNTST_general_error,
@@ -1971,17 +2162,27 @@ __acquire_grant_for_copy(
PIN_FAIL(unlock_out_clear, GNTST_general_error,
"transitive grant referenced bad domain %d\n",
trans_domid);
- spin_unlock(&rgt->lock);
+
+ /*
+ * __acquire_grant_for_copy() could take the lock on the
+ * remote table (if rd == td), so we have to drop the lock
+ * here and reacquire
+ */
+ active_entry_release(act);
+ read_unlock(&rgt->lock);
rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id,
readonly, &grant_frame, page,
&trans_page_off, &trans_length, 0);
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
+ act = active_entry_acquire(rgt, gref);
+
if ( rc != GNTST_okay ) {
__fixup_status_for_copy_pin(act, status);
rcu_unlock_domain(td);
- spin_unlock(&rgt->lock);
+ active_entry_release(act);
+ read_unlock(&rgt->lock);
return rc;
}
@@ -1993,7 +2194,8 @@ __acquire_grant_for_copy(
{
__fixup_status_for_copy_pin(act, status);
rcu_unlock_domain(td);
- spin_unlock(&rgt->lock);
+ active_entry_release(act);
+ read_unlock(&rgt->lock);
put_page(*page);
return __acquire_grant_for_copy(rd, gref, ldom, readonly,
frame, page, page_off, length,
@@ -2006,16 +2208,6 @@ __acquire_grant_for_copy(
is_sub_page = 1;
act->gfn = -1ul;
}
- else if ( sha1 )
- {
- rc = __get_paged_frame(sha1->frame, &grant_frame, page, readonly, rd);
- if ( rc != GNTST_okay )
- goto unlock_out_clear;
- act->gfn = sha1->frame;
- is_sub_page = 0;
- trans_page_off = 0;
- trans_length = PAGE_SIZE;
- }
else if ( !(sha2->hdr.flags & GTF_sub_page) )
{
rc = __get_paged_frame(sha2->full_page.frame, &grant_frame, page, readonly, rd);
@@ -2052,7 +2244,12 @@ __acquire_grant_for_copy(
{
ASSERT(mfn_valid(act->frame));
*page = mfn_to_page(act->frame);
- (void)page_get_owner_and_reference(*page);
+ td = page_get_owner_and_reference(*page);
+ /*
+ * act->pin being non-zero should guarantee the page to have a
+ * non-zero refcount and hence a valid owner.
+ */
+ ASSERT(td);
}
act->pin += readonly ? GNTPIN_hstr_inc : GNTPIN_hstw_inc;
@@ -2061,7 +2258,8 @@ __acquire_grant_for_copy(
*length = act->length;
*frame = act->frame;
- spin_unlock(&rgt->lock);
+ active_entry_release(act);
+ read_unlock(&rgt->lock);
return rc;
unlock_out_clear:
@@ -2073,272 +2271,444 @@ __acquire_grant_for_copy(
gnttab_clear_flag(_GTF_reading, status);
unlock_out:
- spin_unlock(&rgt->lock);
+ active_entry_release(act);
+
+ gt_unlock_out:
+ read_unlock(&rgt->lock);
+
return rc;
}
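/*
 * Sketch (not part of the import) of the transitive-grant locking
 * pattern used above: the recursion may need rgt's own lock when
 * rd == td, so both the active entry and the read lock are dropped
 * first, and all prior state must be revalidated after reacquisition:
 *
 *     active_entry_release(act);
 *     read_unlock(&rgt->lock);
 *     rc = __acquire_grant_for_copy(td, trans_gref, ...);
 *     read_lock(&rgt->lock);
 *     act = active_entry_acquire(rgt, gref);
 *     ... recheck pin/version state before trusting it ...
 */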
-static void
-__gnttab_copy(
- struct gnttab_copy *op)
-{
- struct domain *sd = NULL, *dd = NULL;
- unsigned long s_frame, d_frame;
- struct page_info *s_pg = NULL, *d_pg = NULL;
- char *sp, *dp;
- s16 rc = GNTST_okay;
- int have_d_grant = 0, have_s_grant = 0;
- int src_is_gref, dest_is_gref;
+struct gnttab_copy_buf {
+ /* Guest provided. */
+ struct gnttab_copy_ptr ptr;
+ uint16_t len;
- if ( ((op->source.offset + op->len) > PAGE_SIZE) ||
- ((op->dest.offset + op->len) > PAGE_SIZE) )
- PIN_FAIL(error_out, GNTST_bad_copy_arg, "copy beyond page area.\n");
+ /* Mapped etc. */
+ struct domain *domain;
+ unsigned long frame;
+ struct page_info *page;
+ void *virt;
+ bool_t read_only;
+ bool_t have_grant;
+ bool_t have_type;
+};
- src_is_gref = op->flags & GNTCOPY_source_gref;
- dest_is_gref = op->flags & GNTCOPY_dest_gref;
+static int gnttab_copy_lock_domain(domid_t domid, unsigned int gref_flag,
+ struct gnttab_copy_buf *buf)
+{
+ int rc;
- if ( (op->source.domid != DOMID_SELF && !src_is_gref ) ||
- (op->dest.domid != DOMID_SELF && !dest_is_gref) )
- PIN_FAIL(error_out, GNTST_permission_denied,
+ if ( domid != DOMID_SELF && !gref_flag )
+ PIN_FAIL(out, GNTST_permission_denied,
"only allow copy-by-mfn for DOMID_SELF.\n");
- if ( op->source.domid == DOMID_SELF )
- sd = rcu_lock_current_domain();
- else if ( (sd = rcu_lock_domain_by_id(op->source.domid)) == NULL )
- PIN_FAIL(error_out, GNTST_bad_domain,
- "couldn't find %d\n", op->source.domid);
+ if ( domid == DOMID_SELF )
+ buf->domain = rcu_lock_current_domain();
+ else
+ {
+ buf->domain = rcu_lock_domain_by_id(domid);
+ if ( buf->domain == NULL )
+ PIN_FAIL(out, GNTST_bad_domain, "couldn't find %d\n", domid);
+ }
+
+ buf->ptr.domid = domid;
+ rc = GNTST_okay;
+ out:
+ return rc;
+}
- if ( op->dest.domid == DOMID_SELF )
- dd = rcu_lock_current_domain();
- else if ( (dd = rcu_lock_domain_by_id(op->dest.domid)) == NULL )
- PIN_FAIL(error_out, GNTST_bad_domain,
- "couldn't find %d\n", op->dest.domid);
+static void gnttab_copy_unlock_domains(struct gnttab_copy_buf *src,
+ struct gnttab_copy_buf *dest)
+{
+ if ( src->domain )
+ {
+ rcu_unlock_domain(src->domain);
+ src->domain = NULL;
+ }
+ if ( dest->domain )
+ {
+ rcu_unlock_domain(dest->domain);
+ dest->domain = NULL;
+ }
+}
- rc = xsm_grant_copy(XSM_HOOK, sd, dd);
- if ( rc )
+static int gnttab_copy_lock_domains(const struct gnttab_copy *op,
+ struct gnttab_copy_buf *src,
+ struct gnttab_copy_buf *dest)
+{
+ int rc;
+
+ rc = gnttab_copy_lock_domain(op->source.domid,
+ op->flags & GNTCOPY_source_gref, src);
+ if ( rc < 0 )
+ goto error;
+ rc = gnttab_copy_lock_domain(op->dest.domid,
+ op->flags & GNTCOPY_dest_gref, dest);
+ if ( rc < 0 )
+ goto error;
+
+ rc = xsm_grant_copy(XSM_HOOK, src->domain, dest->domain);
+ if ( rc < 0 )
{
rc = GNTST_permission_denied;
- goto error_out;
+ goto error;
}
+ return 0;
+
+ error:
+ gnttab_copy_unlock_domains(src, dest);
+ return rc;
+}
- if ( src_is_gref )
+static void gnttab_copy_release_buf(struct gnttab_copy_buf *buf)
+{
+ if ( buf->virt )
{
- unsigned source_off, source_len;
- rc = __acquire_grant_for_copy(sd, op->source.u.ref,
- current->domain->domain_id, 1,
- &s_frame, &s_pg,
- &source_off, &source_len, 1);
- if ( rc != GNTST_okay )
- goto error_out;
- have_s_grant = 1;
- if ( op->source.offset < source_off ||
- op->len > source_len )
- PIN_FAIL(error_out, GNTST_general_error,
- "copy source out of bounds: %d < %d || %d > %d\n",
- op->source.offset, source_off,
- op->len, source_len);
+ unmap_domain_page(buf->virt);
+ buf->virt = NULL;
}
- else
+ if ( buf->have_type )
{
- rc = __get_paged_frame(op->source.u.gmfn, &s_frame, &s_pg, 1, sd);
- if ( rc != GNTST_okay )
- PIN_FAIL(error_out, rc,
- "source frame %lx invalid.\n", s_frame);
+ put_page_type(buf->page);
+ buf->have_type = 0;
+ }
+ if ( buf->page )
+ {
+ put_page(buf->page);
+ buf->page = NULL;
+ }
+ if ( buf->have_grant )
+ {
+ __release_grant_for_copy(buf->domain, buf->ptr.u.ref, buf->read_only);
+ buf->have_grant = 0;
}
+}
+
+static int gnttab_copy_claim_buf(const struct gnttab_copy *op,
+ const struct gnttab_copy_ptr *ptr,
+ struct gnttab_copy_buf *buf,
+ unsigned int gref_flag)
+{
+ int rc;
+
+ buf->read_only = gref_flag == GNTCOPY_source_gref;
- if ( dest_is_gref )
+ if ( op->flags & gref_flag )
{
- unsigned dest_off, dest_len;
- rc = __acquire_grant_for_copy(dd, op->dest.u.ref,
- current->domain->domain_id, 0,
- &d_frame, &d_pg, &dest_off, &dest_len, 1);
+ rc = __acquire_grant_for_copy(buf->domain, ptr->u.ref,
+ current->domain->domain_id,
+ buf->read_only,
+ &buf->frame, &buf->page,
+ &buf->ptr.offset, &buf->len, 1);
if ( rc != GNTST_okay )
- goto error_out;
- have_d_grant = 1;
- if ( op->dest.offset < dest_off ||
- op->len > dest_len )
- PIN_FAIL(error_out, GNTST_general_error,
- "copy dest out of bounds: %d < %d || %d > %d\n",
- op->dest.offset, dest_off,
- op->len, dest_len);
+ goto out;
+ buf->ptr.u.ref = ptr->u.ref;
+ buf->have_grant = 1;
}
else
{
- rc = __get_paged_frame(op->dest.u.gmfn, &d_frame, &d_pg, 0, dd);
+ rc = __get_paged_frame(ptr->u.gmfn, &buf->frame, &buf->page,
+ buf->read_only, buf->domain);
if ( rc != GNTST_okay )
- PIN_FAIL(error_out, rc,
- "destination frame %lx invalid.\n", d_frame);
+ PIN_FAIL(out, rc,
+ "source frame %"PRI_xen_pfn" invalid.\n", ptr->u.gmfn);
+
+ buf->ptr.u.gmfn = ptr->u.gmfn;
+ buf->ptr.offset = 0;
+ buf->len = PAGE_SIZE;
}
- if ( !get_page_type(d_pg, PGT_writable_page) )
+ if ( !buf->read_only )
{
- if ( !dd->is_dying )
- gdprintk(XENLOG_WARNING, "Could not get dst frame %lx\n", d_frame);
- rc = GNTST_general_error;
- goto error_out;
+ if ( !get_page_type(buf->page, PGT_writable_page) )
+ {
+ if ( !buf->domain->is_dying )
+ gdprintk(XENLOG_WARNING, "Could not get writable frame %lx\n", buf->frame);
+ rc = GNTST_general_error;
+ goto out;
+ }
+ buf->have_type = 1;
}
- sp = map_domain_page(s_frame);
- dp = map_domain_page(d_frame);
+ buf->virt = map_domain_page(_mfn(buf->frame));
+ rc = GNTST_okay;
- memcpy(dp + op->dest.offset, sp + op->source.offset, op->len);
+ out:
+ return rc;
+}
- unmap_domain_page(dp);
- unmap_domain_page(sp);
+static bool_t gnttab_copy_buf_valid(const struct gnttab_copy_ptr *p,
+ const struct gnttab_copy_buf *b,
+ bool_t has_gref)
+{
+ if ( !b->virt )
+ return 0;
+ if ( has_gref )
+ return b->have_grant && p->u.ref == b->ptr.u.ref;
+ return p->u.gmfn == b->ptr.u.gmfn;
+}
- gnttab_mark_dirty(dd, d_frame);
+static int gnttab_copy_buf(const struct gnttab_copy *op,
+ struct gnttab_copy_buf *dest,
+ const struct gnttab_copy_buf *src)
+{
+ int rc;
- put_page_type(d_pg);
- error_out:
- if ( d_pg )
- put_page(d_pg);
- if ( s_pg )
- put_page(s_pg);
- if ( have_s_grant )
- __release_grant_for_copy(sd, op->source.u.ref, 1);
- if ( have_d_grant )
- __release_grant_for_copy(dd, op->dest.u.ref, 0);
- if ( sd )
- rcu_unlock_domain(sd);
- if ( dd )
- rcu_unlock_domain(dd);
- op->status = rc;
+ if ( ((op->source.offset + op->len) > PAGE_SIZE) ||
+ ((op->dest.offset + op->len) > PAGE_SIZE) )
+ PIN_FAIL(out, GNTST_bad_copy_arg, "copy beyond page area.\n");
+
+ if ( op->source.offset < src->ptr.offset ||
+ op->source.offset + op->len > src->ptr.offset + src->len )
+ PIN_FAIL(out, GNTST_general_error,
+ "copy source out of bounds: %d < %d || %d > %d\n",
+ op->source.offset, src->ptr.offset,
+ op->len, src->len);
+
+ if ( op->dest.offset < dest->ptr.offset ||
+ op->dest.offset + op->len > dest->ptr.offset + dest->len )
+ PIN_FAIL(out, GNTST_general_error,
+ "copy dest out of bounds: %d < %d || %d > %d\n",
+ op->dest.offset, dest->ptr.offset,
+ op->len, dest->len);
+
+ memcpy(dest->virt + op->dest.offset, src->virt + op->source.offset,
+ op->len);
+ gnttab_mark_dirty(dest->domain, dest->frame);
+ rc = GNTST_okay;
+ out:
+ return rc;
}
-static long
-gnttab_copy(
+static int gnttab_copy_one(const struct gnttab_copy *op,
+ struct gnttab_copy_buf *dest,
+ struct gnttab_copy_buf *src)
+{
+ int rc;
+
+ if ( !src->domain || op->source.domid != src->ptr.domid ||
+ !dest->domain || op->dest.domid != dest->ptr.domid )
+ {
+ gnttab_copy_release_buf(src);
+ gnttab_copy_release_buf(dest);
+ gnttab_copy_unlock_domains(src, dest);
+
+ rc = gnttab_copy_lock_domains(op, src, dest);
+ if ( rc < 0 )
+ goto out;
+ }
+
+ /* Different source? */
+ if ( !gnttab_copy_buf_valid(&op->source, src,
+ op->flags & GNTCOPY_source_gref) )
+ {
+ gnttab_copy_release_buf(src);
+ rc = gnttab_copy_claim_buf(op, &op->source, src, GNTCOPY_source_gref);
+ if ( rc < 0 )
+ goto out;
+ }
+
+ /* Different dest? */
+ if ( !gnttab_copy_buf_valid(&op->dest, dest,
+ op->flags & GNTCOPY_dest_gref) )
+ {
+ gnttab_copy_release_buf(dest);
+ rc = gnttab_copy_claim_buf(op, &op->dest, dest, GNTCOPY_dest_gref);
+ if ( rc < 0 )
+ goto out;
+ }
+
+ rc = gnttab_copy_buf(op, dest, src);
+ out:
+ return rc;
+}
+
+static long gnttab_copy(
XEN_GUEST_HANDLE_PARAM(gnttab_copy_t) uop, unsigned int count)
{
- int i;
+ unsigned int i;
struct gnttab_copy op;
+ struct gnttab_copy_buf src = {};
+ struct gnttab_copy_buf dest = {};
+ long rc = 0;
for ( i = 0; i < count; i++ )
{
- if (i && hypercall_preempt_check())
- return i;
+ if ( i && hypercall_preempt_check() )
+ {
+ rc = i;
+ break;
+ }
+
if ( unlikely(__copy_from_guest(&op, uop, 1)) )
- return -EFAULT;
- __gnttab_copy(&op);
+ {
+ rc = -EFAULT;
+ break;
+ }
+
+ op.status = gnttab_copy_one(&op, &dest, &src);
+ if ( op.status != GNTST_okay )
+ {
+ gnttab_copy_release_buf(&src);
+ gnttab_copy_release_buf(&dest);
+ }
+
if ( unlikely(__copy_field_to_guest(uop, &op, status)) )
- return -EFAULT;
+ {
+ rc = -EFAULT;
+ break;
+ }
guest_handle_add_offset(uop, 1);
}
- return 0;
+
+ gnttab_copy_release_buf(&src);
+ gnttab_copy_release_buf(&dest);
+ gnttab_copy_unlock_domains(&src, &dest);
+
+ return rc;
}
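/*
 * Guest-side sketch (hypothetical variable names; not part of the
 * import): the rewritten gnttab_copy() keeps the claimed source and
 * destination buffers across iterations, so a batch of ops against the
 * same granted page pays the grant acquisition once.
 */
struct gnttab_copy op = {
    .source.u.ref  = ref,             /* grant ref offered by the peer */
    .source.domid  = peer_domid,
    .source.offset = 0,
    .dest.u.gmfn   = local_gmfn,      /* copy-by-mfn: DOMID_SELF only  */
    .dest.domid    = DOMID_SELF,
    .dest.offset   = 0,
    .len           = len,
    .flags         = GNTCOPY_source_gref,
};

rc = HYPERVISOR_grant_table_op(GNTTABOP_copy, &op, 1);
/* The per-op result is returned in op.status as a GNTST_* value. */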
static long
gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
{
gnttab_set_version_t op;
- struct domain *d = current->domain;
- struct grant_table *gt = d->grant_table;
- struct active_grant_entry *act;
+ struct domain *currd = current->domain;
+ struct grant_table *gt = currd->grant_table;
grant_entry_v1_t reserved_entries[GNTTAB_NR_RESERVED_ENTRIES];
- long res;
- int i;
+ int res;
+ unsigned int i;
- if (copy_from_guest(&op, uop, 1))
+ if ( copy_from_guest(&op, uop, 1) )
return -EFAULT;
res = -EINVAL;
- if (op.version != 1 && op.version != 2)
+ if ( op.version != 1 && op.version != 2 )
goto out;
res = 0;
if ( gt->gt_version == op.version )
goto out;
- spin_lock(&gt->lock);
- /* Make sure that the grant table isn't currently in use when we
- change the version number, except for the first 8 entries which
- are allowed to be in use (xenstore/xenconsole keeps them mapped).
- (You need to change the version number for e.g. kexec.) */
- if ( gt->gt_version != 0 )
+ write_lock(&gt->lock);
+ /*
+ * Make sure that the grant table isn't currently in use when we
+ * change the version number, except for the first 8 entries which
+ * are allowed to be in use (xenstore/xenconsole keeps them mapped).
+ * (You need to change the version number for e.g. kexec.)
+ */
+ for ( i = GNTTAB_NR_RESERVED_ENTRIES; i < nr_grant_entries(gt); i++ )
+ {
+ if ( read_atomic(&_active_entry(gt, i).pin) != 0 )
+ {
+ gdprintk(XENLOG_WARNING,
+ "tried to change grant table version from %u to %u, but some grant entries still in use\n",
+ gt->gt_version, op.version);
+ res = -EBUSY;
+ goto out_unlock;
+ }
+ }
+
+ switch ( gt->gt_version )
{
- for ( i = GNTTAB_NR_RESERVED_ENTRIES; i < nr_grant_entries(gt); i++ )
+ case 0:
+ if ( op.version == 2 )
+ {
+ case 1:
+ /* XXX: We could maybe shrink the active grant table here. */
+ res = gnttab_populate_status_frames(currd, gt, nr_grant_frames(gt));
+ if ( res < 0)
+ goto out_unlock;
+ }
+ break;
+ case 2:
+ for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES; i++ )
{
- act = &active_entry(gt, i);
- if ( act->pin != 0 )
+ if ( ((shared_entry_v2(gt, i).hdr.flags & GTF_type_mask) ==
+ GTF_permit_access) &&
+ (shared_entry_v2(gt, i).full_page.frame >> 32) )
{
gdprintk(XENLOG_WARNING,
- "tried to change grant table version from %d to %d, but some grant entries still in use\n",
- gt->gt_version,
- op.version);
- res = -EBUSY;
+ "tried to change grant table version to 1 with non-representable entries\n");
+ res = -ERANGE;
goto out_unlock;
}
}
+ break;
}
- /* XXX: If we're going to version 2, we could maybe shrink the
- active grant table here. */
-
- if ( op.version == 2 && gt->gt_version < 2 )
- {
- res = gnttab_populate_status_frames(d, gt, nr_grant_frames(gt));
- if ( res < 0)
- goto out_unlock;
- }
-
- /* Preserve the first 8 entries (toolstack reserved grants) */
- if ( gt->gt_version == 1 )
- {
- memcpy(reserved_entries, &shared_entry_v1(gt, 0), sizeof(reserved_entries));
- }
- else if ( gt->gt_version == 2 )
+ /* Preserve the first 8 entries (toolstack reserved grants). */
+ switch ( gt->gt_version )
{
- for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES && i < nr_grant_entries(gt); i++ )
+ case 1:
+ memcpy(reserved_entries, &shared_entry_v1(gt, 0),
+ sizeof(reserved_entries));
+ break;
+ case 2:
+ for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES; i++ )
{
- int flags = status_entry(gt, i);
- flags |= shared_entry_v2(gt, i).hdr.flags;
- if ((flags & GTF_type_mask) == GTF_permit_access)
+ unsigned int flags = shared_entry_v2(gt, i).hdr.flags;
+
+ switch ( flags & GTF_type_mask )
{
- reserved_entries[i].flags = flags;
+ case GTF_permit_access:
+ reserved_entries[i].flags = flags | status_entry(gt, i);
reserved_entries[i].domid = shared_entry_v2(gt, i).hdr.domid;
reserved_entries[i].frame = shared_entry_v2(gt, i).full_page.frame;
- }
- else
- {
- if ((flags & GTF_type_mask) != GTF_invalid)
- gdprintk(XENLOG_INFO, "d%d: bad flags %x in grant %d when switching grant version\n",
- d->domain_id, flags, i);
+ break;
+ default:
+ gdprintk(XENLOG_INFO,
+ "bad flags %#x in grant %u when switching version\n",
+ flags, i);
+ /* fall through */
+ case GTF_invalid:
memset(&reserved_entries[i], 0, sizeof(reserved_entries[i]));
+ break;
}
}
+ break;
}
if ( op.version < 2 && gt->gt_version == 2 )
- gnttab_unpopulate_status_frames(d, gt);
+ gnttab_unpopulate_status_frames(currd, gt);
- /* Make sure there's no crud left over in the table from the
- old version. */
+ /* Make sure there's no crud left over from the old version. */
for ( i = 0; i < nr_grant_frames(gt); i++ )
- memset(gt->shared_raw[i], 0, PAGE_SIZE);
+ clear_page(gt->shared_raw[i]);
- /* Restore the first 8 entries (toolstack reserved grants) */
- if ( gt->gt_version != 0 && op.version == 1 )
- {
- memcpy(&shared_entry_v1(gt, 0), reserved_entries, sizeof(reserved_entries));
- }
- else if ( gt->gt_version != 0 && op.version == 2 )
+ /* Restore the first 8 entries (toolstack reserved grants). */
+ if ( gt->gt_version )
{
- for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES; i++ )
+ switch ( op.version )
{
- status_entry(gt, i) = reserved_entries[i].flags & (GTF_reading|GTF_writing);
- shared_entry_v2(gt, i).hdr.flags = reserved_entries[i].flags & ~(GTF_reading|GTF_writing);
- shared_entry_v2(gt, i).hdr.domid = reserved_entries[i].domid;
- shared_entry_v2(gt, i).full_page.frame = reserved_entries[i].frame;
+ case 1:
+ memcpy(&shared_entry_v1(gt, 0), reserved_entries, sizeof(reserved_entries));
+ break;
+ case 2:
+ for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES; i++ )
+ {
+ status_entry(gt, i) =
+ reserved_entries[i].flags & (GTF_reading | GTF_writing);
+ shared_entry_v2(gt, i).hdr.flags =
+ reserved_entries[i].flags & ~(GTF_reading | GTF_writing);
+ shared_entry_v2(gt, i).hdr.domid =
+ reserved_entries[i].domid;
+ shared_entry_v2(gt, i).full_page.frame =
+ reserved_entries[i].frame;
+ }
+ break;
}
}
gt->gt_version = op.version;
-out_unlock:
- spin_unlock(&gt->lock);
+ out_unlock:
+ write_unlock(&gt->lock);
-out:
+ out:
op.version = gt->gt_version;
- if (__copy_to_guest(uop, &op, 1))
+ if ( __copy_to_guest(uop, &op, 1) )
res = -EFAULT;
return res;
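/*
 * Worked example (not part of the import): grant_entry_v1_t.frame is a
 * uint32_t, while v2's full_page.frame is 64 bits wide.  A v2 entry
 * with full_page.frame == 0x100000000 (frame 2^32, i.e. machine
 * addresses from 16 TiB up with 4 KiB pages) has (frame >> 32) != 0
 * and so makes the switch back to version 1 fail with -ERANGE above.
 */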
@@ -2389,7 +2759,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
op.status = GNTST_okay;
- spin_lock(&gt->lock);
+ read_lock(&gt->lock);
for ( i = 0; i < op.nr_frames; i++ )
{
@@ -2398,7 +2768,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
op.status = GNTST_bad_virt_addr;
}
- spin_unlock(&gt->lock);
+ read_unlock(&gt->lock);
out2:
rcu_unlock_domain(d);
out1:
@@ -2444,10 +2814,11 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b)
{
struct domain *d = rcu_lock_current_domain();
struct grant_table *gt = d->grant_table;
- struct active_grant_entry *act;
+ struct active_grant_entry *act_a = NULL;
+ struct active_grant_entry *act_b = NULL;
s16 rc = GNTST_okay;
- spin_lock(&gt->lock);
+ write_lock(&gt->lock);
/* Bounds check on the grant refs */
if ( unlikely(ref_a >= nr_grant_entries(d->grant_table)))
@@ -2455,12 +2826,16 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b)
if ( unlikely(ref_b >= nr_grant_entries(d->grant_table)))
PIN_FAIL(out, GNTST_bad_gntref, "Bad ref-b (%d).\n", ref_b);
- act = &active_entry(gt, ref_a);
- if ( act->pin )
+ /* Swapping the same ref is a no-op. */
+ if ( ref_a == ref_b )
+ goto out;
+
+ act_a = active_entry_acquire(gt, ref_a);
+ if ( act_a->pin )
PIN_FAIL(out, GNTST_eagain, "ref a %ld busy\n", (long)ref_a);
- act = &active_entry(gt, ref_b);
- if ( act->pin )
+ act_b = active_entry_acquire(gt, ref_b);
+ if ( act_b->pin )
PIN_FAIL(out, GNTST_eagain, "ref b %ld busy\n", (long)ref_b);
if ( gt->gt_version == 1 )
@@ -2487,7 +2862,11 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b)
}
out:
- spin_unlock(&gt->lock);
+ if ( act_b != NULL )
+ active_entry_release(act_b);
+ if ( act_a != NULL )
+ active_entry_release(act_a);
+ write_unlock(&gt->lock);
rcu_unlock_domain(d);
@@ -2558,19 +2937,19 @@ static int __gnttab_cache_flush(gnttab_cache_flush_t *cflush,
if ( d != owner )
{
- spin_lock(&owner->grant_table->lock);
+ read_lock(&owner->grant_table->lock);
ret = grant_map_exists(d, owner->grant_table, mfn, ref_count);
if ( ret != 0 )
{
- spin_unlock(&owner->grant_table->lock);
+ read_unlock(&owner->grant_table->lock);
rcu_unlock_domain(d);
put_page(page);
return ret;
}
}
- v = map_domain_page(mfn);
+ v = map_domain_page(_mfn(mfn));
v += cflush->offset;
if ( (cflush->op & GNTTAB_CACHE_INVAL) && (cflush->op & GNTTAB_CACHE_CLEAN) )
@@ -2583,7 +2962,7 @@ static int __gnttab_cache_flush(gnttab_cache_flush_t *cflush,
ret = 0;
if ( d != owner )
- spin_unlock(&owner->grant_table->lock);
+ read_unlock(&owner->grant_table->lock);
unmap_domain_page(v);
put_page(page);
@@ -2796,13 +3175,14 @@ grant_table_create(
struct domain *d)
{
struct grant_table *t;
- int i;
+ unsigned int i, j;
if ( (t = xzalloc(struct grant_table)) == NULL )
goto no_mem_0;
/* Simple stuff. */
- spin_lock_init(&t->lock);
+ rwlock_init(&t->lock);
+ spin_lock_init(&t->maptrack_lock);
t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES;
/* Active grant table. */
@@ -2815,19 +3195,14 @@ grant_table_create(
if ( (t->active[i] = alloc_xenheap_page()) == NULL )
goto no_mem_2;
clear_page(t->active[i]);
+ for ( j = 0; j < ACGNT_PER_PAGE; j++ )
+ spin_lock_init(&t->active[i][j].lock);
}
/* Tracking of mapped foreign frames table */
- if ( (t->maptrack = xzalloc_array(struct grant_mapping *,
- max_maptrack_frames)) == NULL )
+ t->maptrack = vzalloc(max_maptrack_frames * sizeof(*t->maptrack));
+ if ( t->maptrack == NULL )
goto no_mem_2;
- if ( (t->maptrack[0] = alloc_xenheap_page()) == NULL )
- goto no_mem_3;
- clear_page(t->maptrack[0]);
- t->maptrack_limit = MAPTRACK_PER_PAGE;
- for ( i = 1; i < MAPTRACK_PER_PAGE; i++ )
- t->maptrack[0][i - 1].ref = i;
- t->maptrack[0][i - 1].ref = MAPTRACK_TAIL;
/* Shared grant table. */
if ( (t->shared_raw = xzalloc_array(void *, max_grant_frames)) == NULL )
@@ -2859,8 +3234,7 @@ grant_table_create(
free_xenheap_page(t->shared_raw[i]);
xfree(t->shared_raw);
no_mem_3:
- free_xenheap_page(t->maptrack[0]);
- xfree(t->maptrack);
+ vfree(t->maptrack);
no_mem_2:
for ( i = 0;
i < num_act_frames_from_sha_frames(INITIAL_NR_GRANT_FRAMES); i++ )
@@ -2909,9 +3283,9 @@ gnttab_release_mappings(
}
rgt = rd->grant_table;
- spin_lock(&rgt->lock);
+ read_lock(&rgt->lock);
- act = &active_entry(rgt, ref);
+ act = active_entry_acquire(rgt, ref);
sha = shared_entry_header(rgt, ref);
if (rgt->gt_version == 1)
status = &sha->flags;
@@ -2969,7 +3343,8 @@ gnttab_release_mappings(
if ( act->pin == 0 )
gnttab_clear_flag(_GTF_reading, status);
- spin_unlock(&rgt->lock);
+ active_entry_release(act);
+ read_unlock(&rgt->lock);
rcu_unlock_domain(rd);
@@ -2994,7 +3369,7 @@ grant_table_destroy(
for ( i = 0; i < nr_maptrack_frames(t); i++ )
free_xenheap_page(t->maptrack[i]);
- xfree(t->maptrack);
+ vfree(t->maptrack);
for ( i = 0; i < nr_active_grant_frames(t); i++ )
free_xenheap_page(t->active[i]);
@@ -3008,6 +3383,12 @@ grant_table_destroy(
d->grant_table = NULL;
}
+void grant_table_init_vcpu(struct vcpu *v)
+{
+ v->maptrack_head = MAPTRACK_TAIL;
+ v->maptrack_tail = MAPTRACK_TAIL;
+}
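/*
 * Simplified sketch (not part of the import) of the per-vCPU maptrack
 * free list seeded above: MAPTRACK_TAIL is the empty-list sentinel and
 * the 'ref' field of a free entry doubles as the next-handle link, as
 * in the removed single-list code in grant_table_create().  The real
 * list handling in this file is more involved.
 */
static void put_handle_sketch(struct vcpu *v, struct grant_mapping *mt,
                              unsigned int handle)
{
    mt[handle].ref = v->maptrack_head;
    v->maptrack_head = handle;
    if ( v->maptrack_tail == MAPTRACK_TAIL )
        v->maptrack_tail = handle;          /* list was empty */
}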
+
static void gnttab_usage_print(struct domain *rd)
{
int first = 1;
@@ -3017,38 +3398,32 @@ static void gnttab_usage_print(struct domain *rd)
printk(" -------- active -------- -------- shared --------\n");
printk("[ref] localdom mfn pin localdom gmfn flags\n");
- spin_lock(&gt->lock);
-
- if ( gt->gt_version == 0 )
- goto out;
+ read_lock(&gt->lock);
for ( ref = 0; ref != nr_grant_entries(gt); ref++ )
{
struct active_grant_entry *act;
struct grant_entry_header *sha;
- grant_entry_v1_t *sha1;
- grant_entry_v2_t *sha2;
uint16_t status;
uint64_t frame;
- act = &active_entry(gt, ref);
+ act = active_entry_acquire(gt, ref);
if ( !act->pin )
+ {
+ active_entry_release(act);
continue;
+ }
sha = shared_entry_header(gt, ref);
if ( gt->gt_version == 1 )
{
- sha1 = &shared_entry_v1(gt, ref);
- sha2 = NULL;
status = sha->flags;
- frame = sha1->frame;
+ frame = shared_entry_v1(gt, ref).frame;
}
else
{
- sha2 = &shared_entry_v2(gt, ref);
- sha1 = NULL;
- frame = sha2->full_page.frame;
+ frame = shared_entry_v2(gt, ref).full_page.frame;
status = status_entry(gt, ref);
}
@@ -3063,10 +3438,10 @@ static void gnttab_usage_print(struct domain *rd)
printk("[%3d] %5d 0x%06lx 0x%08x %5d 0x%06"PRIx64" 0x%02x\n",
ref, act->domid, act->frame, act->pin,
sha->domid, frame, status);
+ active_entry_release(act);
}
- out:
- spin_unlock(&gt->lock);
+ read_unlock(&gt->lock);
if ( first )
printk("grant-table for remote domain:%5d ... "
diff --git a/xen/common/guestcopy.c b/xen/common/guestcopy.c
new file mode 100644
index 0000000..6ae1815
--- /dev/null
+++ b/xen/common/guestcopy.c
@@ -0,0 +1,31 @@
+#include <xen/config.h>
+#include <xen/lib.h>
+#include <xen/guest_access.h>
+#include <xen/err.h>
+
+/*
+ * The function copies a string from the guest and adds a NUL to
+ * make sure the string is correctly terminated.
+ */
+char *safe_copy_string_from_guest(XEN_GUEST_HANDLE(char) u_buf,
+ size_t size, size_t max_size)
+{
+ char *tmp;
+
+ if ( size > max_size )
+ return ERR_PTR(-ENOBUFS);
+
+ /* Allocate one extra byte so a trailing '\0' can be appended. */
+ tmp = xmalloc_array(char, size + 1);
+ if ( !tmp )
+ return ERR_PTR(-ENOMEM);
+
+ if ( copy_from_guest(tmp, u_buf, size) )
+ {
+ xfree(tmp);
+ return ERR_PTR(-EFAULT);
+ }
+ tmp[size] = '\0';
+
+ return tmp;
+}
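/*
 * Hypothetical caller (not part of the import), showing the ERR_PTR
 * convention from xen/err.h that the helper relies on:
 */
char *name = safe_copy_string_from_guest(u_name, namelen, 31);

if ( IS_ERR(name) )
    return PTR_ERR(name);        /* -ENOBUFS, -ENOMEM or -EFAULT */

/* ... use the NUL-terminated 'name' ... */
xfree(name);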
diff --git a/xen/common/hvm/save.c b/xen/common/hvm/save.c
index da6e668..dd2c547 100644
--- a/xen/common/hvm/save.c
+++ b/xen/common/hvm/save.c
@@ -17,8 +17,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -114,7 +113,7 @@ int hvm_save_one(struct domain *d, uint16_t typecode, uint16_t instance,
uint32_t off;
const struct hvm_save_descriptor *desc;
- rv = -EBADSLT;
+ rv = -ENOENT;
for ( off = 0; off < (ctxt.cur - sizeof(*desc)); off += desc->length )
{
desc = (void *)(ctxt.data + off);
diff --git a/xen/common/kernel.c b/xen/common/kernel.c
index bafd44f..6a3196a 100644
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -4,7 +4,6 @@
* Copyright (c) 2002-2005 K A Fraser
*/
-#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/errno.h>
@@ -233,9 +232,7 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
switch ( cmd )
{
case XENVER_version:
- {
return (xen_major_version() << 16) | xen_minor_version();
- }
case XENVER_extraversion:
{
@@ -250,7 +247,7 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
case XENVER_compile_info:
{
- struct xen_compile_info info;
+ xen_compile_info_t info;
memset(&info, 0, sizeof(info));
safe_strcpy(info.compiler, xen_compiler());
@@ -279,6 +276,7 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
xen_platform_parameters_t params = {
.virt_start = HYPERVISOR_VIRT_START
};
+
if ( copy_to_guest(arg, &params, 1) )
return -EFAULT;
return 0;
@@ -307,16 +305,14 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
switch ( fi.submap_idx )
{
case 0:
- fi.submap = 0;
- if ( VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3) )
+ fi.submap = (1U << XENFEAT_memory_op_vnode_supported);
+ if ( VM_ASSIST(d, pae_extended_cr3) )
fi.submap |= (1U << XENFEAT_pae_pgdir_above_4gb);
- if ( paging_mode_translate(current->domain) )
+ if ( paging_mode_translate(d) )
fi.submap |=
(1U << XENFEAT_writable_page_tables) |
(1U << XENFEAT_auto_translated_physmap);
- if ( supervisor_mode_kernel )
- fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
- if ( is_hardware_domain(current->domain) )
+ if ( is_hardware_domain(d) )
fi.submap |= 1U << XENFEAT_dom0;
#ifdef CONFIG_X86
switch ( d->guest_type )
@@ -343,31 +339,25 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
return -EINVAL;
}
- if ( copy_to_guest(arg, &fi, 1) )
+ if ( __copy_to_guest(arg, &fi, 1) )
return -EFAULT;
return 0;
}
case XENVER_pagesize:
- {
return (!guest_handle_is_null(arg) ? -EINVAL : PAGE_SIZE);
- }
case XENVER_guest_handle:
- {
if ( copy_to_guest(arg, current->domain->handle,
ARRAY_SIZE(current->domain->handle)) )
return -EFAULT;
return 0;
- }
case XENVER_commandline:
- {
if ( copy_to_guest(arg, saved_cmdline, ARRAY_SIZE(saved_cmdline)) )
return -EFAULT;
return 0;
}
- }
return -ENOSYS;
}
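/*
 * Guest-side sketch (not part of the import): the feature submap
 * filled in above is retrieved with XENVER_get_features, one 32-bit
 * word per submap_idx.
 */
xen_feature_info_t fi = { .submap_idx = 0 };

if ( HYPERVISOR_xen_version(XENVER_get_features, &fi) == 0 &&
     (fi.submap & (1U << XENFEAT_auto_translated_physmap)) )
    /* running with an auto-translated physmap */;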
@@ -396,10 +386,12 @@ DO(nmi_op)(unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
return rc;
}
+#ifdef VM_ASSIST_VALID
DO(vm_assist)(unsigned int cmd, unsigned int type)
{
- return vm_assist(current->domain, cmd, type);
+ return vm_assist(current->domain, cmd, type, VM_ASSIST_VALID);
}
+#endif
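/*
 * Illustration (not quoted from the import): VM_ASSIST_VALID is an
 * architecture-provided mask of acceptable VMASST_TYPE_* bits that
 * vm_assist() checks requests against; an x86-style definition would
 * look roughly like:
 *
 *   #define VM_ASSIST_VALID ((1UL << VMASST_TYPE_4gb_segments)        | \
 *                            (1UL << VMASST_TYPE_4gb_segments_notify) | \
 *                            (1UL << VMASST_TYPE_writable_pagetables) | \
 *                            (1UL << VMASST_TYPE_pae_extended_cr3))
 */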
DO(ni_hypercall)(void)
{
diff --git a/xen/common/kexec.c b/xen/common/kexec.c
index 2239ee8..7dd2700 100644
--- a/xen/common/kexec.c
+++ b/xen/common/kexec.c
@@ -78,7 +78,7 @@ static paddr_t __initdata crashinfo_maxaddr = 4ULL << 30;
/* = log base 2 of crashinfo_maxaddr after checking for sanity. Default to
* larger than the entire physical address space. */
-paddr_t crashinfo_maxaddr_bits = 64;
+unsigned int __initdata crashinfo_maxaddr_bits = 64;
/* Pointers to keep track of the crash heap region. */
static void *crash_heap_current = NULL, *crash_heap_end = NULL;
@@ -454,8 +454,7 @@ static int kexec_init_cpu_notes(const unsigned long cpu)
spin_unlock(&crash_notes_lock);
/* Always return ok, because whether we successfully allocated or not,
* another CPU has successfully allocated. */
- if ( note )
- xfree(note);
+ xfree(note);
}
else
{
@@ -532,7 +531,7 @@ void __init kexec_early_calculations(void)
low_crashinfo_mode = LOW_CRASHINFO_NONE;
if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
- crashinfo_maxaddr_bits = fls(crashinfo_maxaddr) - 1;
+ crashinfo_maxaddr_bits = fls64(crashinfo_maxaddr) - 1;
}
static int __init kexec_init(void)
@@ -663,8 +662,8 @@ static int kexec_get_range(XEN_GUEST_HANDLE_PARAM(void) uarg)
ret = kexec_get_range_internal(&range);
- if ( ret == 0 && unlikely(copy_to_guest(uarg, &range, 1)) )
- return -EFAULT;
+ if ( ret == 0 && unlikely(__copy_to_guest(uarg, &range, 1)) )
+ ret = -EFAULT;
return ret;
}
@@ -687,10 +686,11 @@ static int kexec_get_range_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
if ( (range.start | range.size) & ~(unsigned long)(~0u) )
return -ERANGE;
- if ( ret == 0 ) {
+ if ( ret == 0 )
+ {
XLAT_kexec_range(&compat_range, &range);
- if ( unlikely(copy_to_guest(uarg, &compat_range, 1)) )
- return -EFAULT;
+ if ( unlikely(__copy_to_guest(uarg, &compat_range, 1)) )
+ ret = -EFAULT;
}
return ret;
@@ -872,7 +872,7 @@ static int kexec_load_slot(struct kexec_image *kimage)
static uint16_t kexec_load_v1_arch(void)
{
#ifdef CONFIG_X86
- return is_pv_32on64_domain(hardware_domain) ? EM_386 : EM_X86_64;
+ return is_pv_32bit_domain(hardware_domain) ? EM_386 : EM_X86_64;
#else
return EM_NONE;
#endif
@@ -912,7 +912,7 @@ static int kexec_segments_from_ind_page(unsigned long mfn,
kimage_entry_t *entry;
int ret = 0;
- page = map_domain_page(mfn);
+ page = map_domain_page(_mfn(mfn));
/*
* Walk the indirection page list, adding destination pages to the
@@ -934,7 +934,7 @@ static int kexec_segments_from_ind_page(unsigned long mfn,
break;
case IND_INDIRECTION:
unmap_domain_page(page);
- entry = page = map_domain_page(mfn);
+ entry = page = map_domain_page(_mfn(mfn));
continue;
case IND_DONE:
goto done;
@@ -1003,6 +1003,24 @@ static int kexec_do_load_v1(xen_kexec_load_v1_t *load, int compat)
if ( ret < 0 )
goto error;
+ if ( arch == EM_386 || arch == EM_X86_64 )
+ {
+ /*
+ * Ensure 0 - 1 MiB is mapped and accessible by the image.
+ *
+ * This allows access to VGA memory and the region purgatory copies
+ * in the crash case.
+ */
+ unsigned long addr;
+
+ for ( addr = 0; addr < MB(1); addr += PAGE_SIZE )
+ {
+ ret = machine_kexec_add_page(kimage, addr, addr);
+ if ( ret < 0 )
+ goto error;
+ }
+ }
+
ret = kexec_load_slot(kimage);
if ( ret < 0 )
goto error;
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index a917726..5d21e48 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -294,15 +294,15 @@ static void dump_domains(unsigned char key)
process_pending_softirqs();
printk(" VCPU%d: CPU%d [has=%c] poll=%d "
- "upcall_pend = %02x, upcall_mask = %02x ",
+ "upcall_pend=%02x upcall_mask=%02x ",
v->vcpu_id, v->processor,
v->is_running ? 'T':'F', v->poll_evtchn,
vcpu_info(v, evtchn_upcall_pending),
!vcpu_event_delivery_is_enabled(v));
cpuset_print(tmpstr, sizeof(tmpstr), v->vcpu_dirty_cpumask);
- printk("dirty_cpus=%s ", tmpstr);
+ printk("dirty_cpus=%s\n", tmpstr);
cpuset_print(tmpstr, sizeof(tmpstr), v->cpu_hard_affinity);
- printk("cpu_affinity=%s\n", tmpstr);
+ printk(" cpu_hard_affinity=%s ", tmpstr);
cpuset_print(tmpstr, sizeof(tmpstr), v->cpu_soft_affinity);
printk("cpu_soft_affinity=%s\n", tmpstr);
printk(" pause_count=%d pause_flags=%lx\n",
diff --git a/xen/common/kimage.c b/xen/common/kimage.c
index 9b79a5e..dcc010e 100644
--- a/xen/common/kimage.c
+++ b/xen/common/kimage.c
@@ -77,7 +77,7 @@ static struct page_info *kimage_alloc_zeroed_page(unsigned memflags)
if ( !page )
return NULL;
- clear_domain_page(page_to_mfn(page));
+ clear_domain_page(_mfn(page_to_mfn(page)));
return page;
}
@@ -409,7 +409,7 @@ static struct page_info *kimage_alloc_crash_control_page(struct kexec_image *ima
if ( page )
{
image->next_crash_page = hole_end;
- clear_domain_page(page_to_mfn(page));
+ clear_domain_page(_mfn(page_to_mfn(page)));
}
return page;
@@ -495,10 +495,10 @@ static void kimage_terminate(struct kexec_image *image)
* Call unmap_domain_page(ptr) after the loop exits.
*/
#define for_each_kimage_entry(image, ptr, entry) \
- for ( ptr = map_domain_page(image->head >> PAGE_SHIFT); \
+ for ( ptr = map_domain_page(_mfn(paddr_to_pfn(image->head))); \
(entry = *ptr) && !(entry & IND_DONE); \
ptr = (entry & IND_INDIRECTION) ? \
- (unmap_domain_page(ptr), map_domain_page(entry >> PAGE_SHIFT)) \
+ (unmap_domain_page(ptr), map_domain_page(_mfn(paddr_to_pfn(entry)))) \
: ptr + 1 )
static void kimage_free_entry(kimage_entry_t entry)
@@ -637,15 +637,15 @@ static struct page_info *kimage_alloc_page(struct kexec_image *image,
if ( old )
{
/* If so move it. */
- unsigned long old_mfn = *old >> PAGE_SHIFT;
- unsigned long mfn = addr >> PAGE_SHIFT;
+ mfn_t old_mfn = _mfn(*old >> PAGE_SHIFT);
+ mfn_t mfn = _mfn(addr >> PAGE_SHIFT);
copy_domain_page(mfn, old_mfn);
clear_domain_page(old_mfn);
*old = (addr & ~PAGE_MASK) | IND_SOURCE;
unmap_domain_page(old);
- page = mfn_to_page(old_mfn);
+ page = mfn_to_page(mfn_x(old_mfn));
break;
}
else
@@ -748,7 +748,7 @@ static int kimage_load_crash_segment(struct kexec_image *image,
dchunk = PAGE_SIZE;
schunk = min(dchunk, sbytes);
- dest_va = map_domain_page(dest_mfn);
+ dest_va = map_domain_page(_mfn(dest_mfn));
if ( !dest_va )
return -EINVAL;
@@ -866,7 +866,7 @@ int kimage_build_ind(struct kexec_image *image, unsigned long ind_mfn,
int ret = 0;
paddr_t dest = KIMAGE_NO_DEST;
- page = map_domain_page(ind_mfn);
+ page = map_domain_page(_mfn(ind_mfn));
if ( !page )
return -ENOMEM;
@@ -892,7 +892,7 @@ int kimage_build_ind(struct kexec_image *image, unsigned long ind_mfn,
break;
case IND_INDIRECTION:
unmap_domain_page(page);
- page = map_domain_page(mfn);
+ page = map_domain_page(_mfn(mfn));
entry = page;
continue;
case IND_DONE:
@@ -917,12 +917,17 @@ int kimage_build_ind(struct kexec_image *image, unsigned long ind_mfn,
goto done;
}
- copy_domain_page(page_to_mfn(xen_page), mfn);
+ copy_domain_page(_mfn(page_to_mfn(xen_page)), _mfn(mfn));
put_page(guest_page);
ret = kimage_add_page(image, page_to_maddr(xen_page));
if ( ret < 0 )
goto done;
+
+ ret = machine_kexec_add_page(image, dest, dest);
+ if ( ret < 0 )
+ goto done;
+
dest += PAGE_SIZE;
break;
}
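/*
 * Note (not part of the import): the _mfn()/mfn_x() churn in this file
 * comes from the typesafe MFN wrappers -- _mfn() boxes a raw frame
 * number into mfn_t and mfn_x() unboxes it, so in debug builds (where
 * mfn_t is a distinct struct) a raw unsigned long can no longer be
 * passed where a typed MFN is expected:
 */
mfn_t m = _mfn(0x1234UL);           /* box a raw frame number         */
unsigned long raw = mfn_x(m);       /* unbox: raw == 0x1234           */
clear_domain_page(m);               /* typed interface, as used above */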
diff --git a/xen/common/lib.c b/xen/common/lib.c
index 89c74ad..ae0bbb3 100644
--- a/xen/common/lib.c
+++ b/xen/common/lib.c
@@ -461,12 +461,16 @@ unsigned long long parse_size_and_unit(const char *s, const char **ps)
{
case 'T': case 't':
ret <<= 10;
+ /* fallthrough */
case 'G': case 'g':
ret <<= 10;
+ /* fallthrough */
case 'M': case 'm':
ret <<= 10;
+ /* fallthrough */
case 'K': case 'k':
ret <<= 10;
+ /* fallthrough */
case 'B': case 'b':
s1++;
break;
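/*
 * Worked example (not part of the import): each matched unit adds one
 * 10-bit shift and falls through to the next case, so "2M" enters at
 * case 'M' and passes through 'K' into 'B':
 */
const char *end;
unsigned long long bytes = parse_size_and_unit("2M", &end);
/* bytes == 2ULL << 20 (2 MiB); "1T" would accumulate four shifts. */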
diff --git a/xen/common/libelf/libelf-dominfo.c b/xen/common/libelf/libelf-dominfo.c
index 6120dd4..f929968 100644
--- a/xen/common/libelf/libelf-dominfo.c
+++ b/xen/common/libelf/libelf-dominfo.c
@@ -12,8 +12,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "libelf-private.h"
@@ -235,6 +234,10 @@ static unsigned elf_xen_parse_notes(struct elf_binary *elf,
ELF_HANDLE_PTRVAL(note) < parms->elf_note_end;
note = elf_note_next(elf, note) )
{
+#ifdef __XEN__
+ process_pending_softirqs();
+#endif
+
if ( *total_note_count >= ELF_MAX_TOTAL_NOTE_COUNT )
{
elf_mark_broken(elf, "too many ELF notes");
@@ -438,7 +441,7 @@ static elf_errorstatus elf_xen_addr_calc_check(struct elf_binary *elf,
if ( parms->bsd_symtab )
{
- elf_parse_bsdsyms(elf, parms->virt_kend);
+ elf_parse_bsdsyms(elf, elf->pend);
if ( elf->bsd_symtab_pend )
parms->virt_kend = elf->bsd_symtab_pend + parms->virt_offset;
}
diff --git a/xen/common/libelf/libelf-loader.c b/xen/common/libelf/libelf-loader.c
index c5e9141..6f42bea 100644
--- a/xen/common/libelf/libelf-loader.c
+++ b/xen/common/libelf/libelf-loader.c
@@ -12,8 +12,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifdef __XEN__
diff --git a/xen/common/libelf/libelf-private.h b/xen/common/libelf/libelf-private.h
index 854a0d7..d5f9d89 100644
--- a/xen/common/libelf/libelf-private.h
+++ b/xen/common/libelf/libelf-private.h
@@ -10,19 +10,16 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __LIBELF_PRIVATE_H__
#define __LIBELF_PRIVATE_H__
#ifdef __XEN__
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/string.h>
#include <xen/lib.h>
#include <xen/libelf.h>
+#include <xen/softirq.h>
#include <asm/byteorder.h>
#include <public/elfnote.h>
diff --git a/xen/common/libelf/libelf-tools.c b/xen/common/libelf/libelf-tools.c
index dae210e..5a4757b 100644
--- a/xen/common/libelf/libelf-tools.c
+++ b/xen/common/libelf/libelf-tools.c
@@ -12,8 +12,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "libelf-private.h"
diff --git a/xen/common/libfdt/fdt.c b/xen/common/libfdt/fdt.c
index 2ce6a44..d02f4bf 100644
--- a/xen/common/libfdt/fdt.c
+++ b/xen/common/libfdt/fdt.c
@@ -16,9 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
- * MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Alternatively,
*
diff --git a/xen/common/libfdt/fdt_empty_tree.c b/xen/common/libfdt/fdt_empty_tree.c
index f72d13b..d505611 100644
--- a/xen/common/libfdt/fdt_empty_tree.c
+++ b/xen/common/libfdt/fdt_empty_tree.c
@@ -16,9 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
- * MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Alternatively,
*
diff --git a/xen/common/libfdt/fdt_ro.c b/xen/common/libfdt/fdt_ro.c
index 50007f6..36f9b48 100644
--- a/xen/common/libfdt/fdt_ro.c
+++ b/xen/common/libfdt/fdt_ro.c
@@ -16,9 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
- * MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Alternatively,
*
diff --git a/xen/common/libfdt/fdt_rw.c b/xen/common/libfdt/fdt_rw.c
index fdba618..ee18bfc 100644
--- a/xen/common/libfdt/fdt_rw.c
+++ b/xen/common/libfdt/fdt_rw.c
@@ -16,9 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
- * MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Alternatively,
*
diff --git a/xen/common/libfdt/fdt_strerror.c b/xen/common/libfdt/fdt_strerror.c
index e6c3cee..8d0289c 100644
--- a/xen/common/libfdt/fdt_strerror.c
+++ b/xen/common/libfdt/fdt_strerror.c
@@ -16,9 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
- * MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Alternatively,
*
diff --git a/xen/common/libfdt/fdt_sw.c b/xen/common/libfdt/fdt_sw.c
index f422754..c7d93d3 100644
--- a/xen/common/libfdt/fdt_sw.c
+++ b/xen/common/libfdt/fdt_sw.c
@@ -16,9 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
- * MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Alternatively,
*
diff --git a/xen/common/libfdt/fdt_wip.c b/xen/common/libfdt/fdt_wip.c
index c5bbb68..2d1cac0 100644
--- a/xen/common/libfdt/fdt_wip.c
+++ b/xen/common/libfdt/fdt_wip.c
@@ -16,9 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
- * MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Alternatively,
*
diff --git a/xen/common/libfdt/libfdt_internal.h b/xen/common/libfdt/libfdt_internal.h
index 381133b..d50c4e1 100644
--- a/xen/common/libfdt/libfdt_internal.h
+++ b/xen/common/libfdt/libfdt_internal.h
@@ -18,9 +18,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
- * MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Alternatively,
*
diff --git a/xen/common/mem_access.c b/xen/common/mem_access.c
index d8aac5f..159c036 100644
--- a/xen/common/mem_access.c
+++ b/xen/common/mem_access.c
@@ -16,45 +16,18 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <xen/hypercall.h>
-#include <xen/mem_event.h>
+#include <xen/vm_event.h>
#include <public/memory.h>
#include <asm/p2m.h>
#include <xsm/xsm.h>
-void mem_access_resume(struct domain *d)
-{
- mem_event_response_t rsp;
-
- /* Pull all responses off the ring. */
- while ( mem_event_get_response(d, &d->mem_event->access, &rsp) )
- {
- struct vcpu *v;
-
- if ( rsp.flags & MEM_EVENT_FLAG_DUMMY )
- continue;
-
- /* Validate the vcpu_id in the response. */
- if ( (rsp.vcpu_id >= d->max_vcpus) || !d->vcpu[rsp.vcpu_id] )
- continue;
-
- v = d->vcpu[rsp.vcpu_id];
-
- p2m_mem_event_emulate_check(v, &rsp);
-
- /* Unpause domain. */
- if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
- mem_event_vcpu_unpause(v);
- }
-}
-
int mem_access_memop(unsigned long cmd,
XEN_GUEST_HANDLE_PARAM(xen_mem_access_op_t) arg)
{
@@ -74,25 +47,16 @@ int mem_access_memop(unsigned long cmd,
if ( !p2m_mem_access_sanity_check(d) )
goto out;
- rc = xsm_mem_event_op(XSM_DM_PRIV, d, XENMEM_access_op);
+ rc = xsm_mem_access(XSM_DM_PRIV, d);
if ( rc )
goto out;
rc = -ENODEV;
- if ( unlikely(!d->mem_event->access.ring_page) )
+ if ( unlikely(!d->vm_event->monitor.ring_page) )
goto out;
switch ( mao.op )
{
- case XENMEM_access_op_resume:
- if ( unlikely(start_iter) )
- rc = -ENOSYS;
- else
- {
- mem_access_resume(d);
- rc = 0;
- }
- break;
case XENMEM_access_op_set_access:
rc = -EINVAL;
@@ -102,7 +66,7 @@ int mem_access_memop(unsigned long cmd,
((mao.pfn + mao.nr - 1) > domain_get_maximum_gpfn(d))) )
break;
- rc = p2m_set_mem_access(d, mao.pfn, mao.nr, start_iter,
+ rc = p2m_set_mem_access(d, _gfn(mao.pfn), mao.nr, start_iter,
MEMOP_CMD_MASK, mao.access);
if ( rc > 0 )
{
@@ -124,7 +88,7 @@ int mem_access_memop(unsigned long cmd,
if ( (mao.pfn > domain_get_maximum_gpfn(d)) && mao.pfn != ~0ull )
break;
- rc = p2m_get_mem_access(d, mao.pfn, &access);
+ rc = p2m_get_mem_access(d, _gfn(mao.pfn), &access);
if ( rc != 0 )
break;
@@ -134,6 +98,14 @@ int mem_access_memop(unsigned long cmd,
break;
}
+ case XENMEM_access_op_enable_emulate:
+ rc = p2m_mem_access_enable_emulate(d);
+ break;
+
+ case XENMEM_access_op_disable_emulate:
+ rc = p2m_mem_access_disable_emulate(d);
+ break;
+
default:
rc = -ENOSYS;
break;
@@ -144,13 +116,13 @@ int mem_access_memop(unsigned long cmd,
return rc;
}
-int mem_access_send_req(struct domain *d, mem_event_request_t *req)
+int mem_access_send_req(struct domain *d, vm_event_request_t *req)
{
- int rc = mem_event_claim_slot(d, &d->mem_event->access);
+ int rc = vm_event_claim_slot(d, &d->vm_event->monitor);
if ( rc < 0 )
return rc;
- mem_event_put_request(d, &d->mem_event->access, req);
+ vm_event_put_request(d, &d->vm_event->monitor, req);
return 0;
}
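
For context, the reworked send path above keeps the two-phase contract: a slot is claimed first (failing with -ENOSYS if no ring is set up, or -EBUSY if a foreign requester finds it full), and only then is the request published. A minimal caller-side sketch, assuming the Xen 4.6 public vm_event layout; the helper name and error handling here are illustrative, not part of this patch:

    /* Illustrative only: raise a mem_access event on the monitor ring.
     * mem_access_send_req() is from the hunk above; the reason constant
     * and request layout follow xen/include/public/vm_event.h. */
    static int report_access_violation(struct domain *d, uint64_t gfn)
    {
        vm_event_request_t req = {
            .reason  = VM_EVENT_REASON_MEM_ACCESS,
            .vcpu_id = current->vcpu_id,
        };

        req.u.mem_access.gfn = gfn;

        /* Claims a slot, then puts the request. */
        return mem_access_send_req(d, &req);
    }
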
diff --git a/xen/common/mem_event.c b/xen/common/mem_event.c
deleted file mode 100644
index 16ebdb5..0000000
--- a/xen/common/mem_event.c
+++ /dev/null
@@ -1,742 +0,0 @@
-/******************************************************************************
- * mem_event.c
- *
- * Memory event support.
- *
- * Copyright (c) 2009 Citrix Systems, Inc. (Patrick Colp)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-
-#include <xen/sched.h>
-#include <xen/event.h>
-#include <xen/wait.h>
-#include <xen/mem_event.h>
-#include <xen/mem_access.h>
-#include <asm/p2m.h>
-
-#ifdef HAS_MEM_PAGING
-#include <asm/mem_paging.h>
-#endif
-
-#ifdef HAS_MEM_SHARING
-#include <asm/mem_sharing.h>
-#endif
-
-#include <xsm/xsm.h>
-
-/* for public/io/ring.h macros */
-#define xen_mb() mb()
-#define xen_rmb() rmb()
-#define xen_wmb() wmb()
-
-#define mem_event_ring_lock_init(_med) spin_lock_init(&(_med)->ring_lock)
-#define mem_event_ring_lock(_med) spin_lock(&(_med)->ring_lock)
-#define mem_event_ring_unlock(_med) spin_unlock(&(_med)->ring_lock)
-
-static int mem_event_enable(
- struct domain *d,
- xen_domctl_mem_event_op_t *mec,
- struct mem_event_domain *med,
- int pause_flag,
- int param,
- xen_event_channel_notification_t notification_fn)
-{
- int rc;
- unsigned long ring_gfn = d->arch.hvm_domain.params[param];
-
- /* Only one helper at a time. If the helper crashed,
- * the ring is in an undefined state and so is the guest.
- */
- if ( med->ring_page )
- return -EBUSY;
-
- /* The parameter defaults to zero, and it should be
- * set to something */
- if ( ring_gfn == 0 )
- return -ENOSYS;
-
- mem_event_ring_lock_init(med);
- mem_event_ring_lock(med);
-
- rc = prepare_ring_for_helper(d, ring_gfn, &med->ring_pg_struct,
- &med->ring_page);
- if ( rc < 0 )
- goto err;
-
- /* Set the number of currently blocked vCPUs to 0. */
- med->blocked = 0;
-
- /* Allocate event channel */
- rc = alloc_unbound_xen_event_channel(d->vcpu[0],
- current->domain->domain_id,
- notification_fn);
- if ( rc < 0 )
- goto err;
-
- med->xen_port = mec->port = rc;
-
- /* Prepare ring buffer */
- FRONT_RING_INIT(&med->front_ring,
- (mem_event_sring_t *)med->ring_page,
- PAGE_SIZE);
-
- /* Save the pause flag for this particular ring. */
- med->pause_flag = pause_flag;
-
- /* Initialize the last-chance wait queue. */
- init_waitqueue_head(&med->wq);
-
- mem_event_ring_unlock(med);
- return 0;
-
- err:
- destroy_ring_for_helper(&med->ring_page,
- med->ring_pg_struct);
- mem_event_ring_unlock(med);
-
- return rc;
-}
-
-static unsigned int mem_event_ring_available(struct mem_event_domain *med)
-{
- int avail_req = RING_FREE_REQUESTS(&med->front_ring);
- avail_req -= med->target_producers;
- avail_req -= med->foreign_producers;
-
- BUG_ON(avail_req < 0);
-
- return avail_req;
-}
-
-/*
- * mem_event_wake_blocked() will wakeup vcpus waiting for room in the
- * ring. These vCPUs were paused on their way out after placing an event,
- * but need to be resumed where the ring is capable of processing at least
- * one event from them.
- */
-static void mem_event_wake_blocked(struct domain *d, struct mem_event_domain *med)
-{
- struct vcpu *v;
- int online = d->max_vcpus;
- unsigned int avail_req = mem_event_ring_available(med);
-
- if ( avail_req == 0 || med->blocked == 0 )
- return;
-
- /*
- * We ensure that we only have vCPUs online if there are enough free slots
- * for their memory events to be processed. This will ensure that no
- * memory events are lost (due to the fact that certain types of events
- * cannot be replayed, we need to ensure that there is space in the ring
- * for when they are hit).
- * See comment below in mem_event_put_request().
- */
- for_each_vcpu ( d, v )
- if ( test_bit(med->pause_flag, &v->pause_flags) )
- online--;
-
- ASSERT(online == (d->max_vcpus - med->blocked));
-
- /* We remember which vcpu last woke up to avoid scanning always linearly
- * from zero and starving higher-numbered vcpus under high load */
- if ( d->vcpu )
- {
- int i, j, k;
-
- for (i = med->last_vcpu_wake_up + 1, j = 0; j < d->max_vcpus; i++, j++)
- {
- k = i % d->max_vcpus;
- v = d->vcpu[k];
- if ( !v )
- continue;
-
- if ( !(med->blocked) || online >= avail_req )
- break;
-
- if ( test_and_clear_bit(med->pause_flag, &v->pause_flags) )
- {
- vcpu_unpause(v);
- online++;
- med->blocked--;
- med->last_vcpu_wake_up = k;
- }
- }
- }
-}
-
-/*
- * In the event that a vCPU attempted to place an event in the ring and
- * was unable to do so, it is queued on a wait queue. These are woken as
- * needed, and take precedence over the blocked vCPUs.
- */
-static void mem_event_wake_queued(struct domain *d, struct mem_event_domain *med)
-{
- unsigned int avail_req = mem_event_ring_available(med);
-
- if ( avail_req > 0 )
- wake_up_nr(&med->wq, avail_req);
-}
-
-/*
- * mem_event_wake() will wakeup all vcpus waiting for the ring to
- * become available. If we have queued vCPUs, they get top priority. We
- * are guaranteed that they will go through code paths that will eventually
- * call mem_event_wake() again, ensuring that any blocked vCPUs will get
- * unpaused once all the queued vCPUs have made it through.
- */
-void mem_event_wake(struct domain *d, struct mem_event_domain *med)
-{
- if (!list_empty(&med->wq.list))
- mem_event_wake_queued(d, med);
- else
- mem_event_wake_blocked(d, med);
-}
-
-static int mem_event_disable(struct domain *d, struct mem_event_domain *med)
-{
- if ( med->ring_page )
- {
- struct vcpu *v;
-
- mem_event_ring_lock(med);
-
- if ( !list_empty(&med->wq.list) )
- {
- mem_event_ring_unlock(med);
- return -EBUSY;
- }
-
- /* Free domU's event channel and leave the other one unbound */
- free_xen_event_channel(d->vcpu[0], med->xen_port);
-
- /* Unblock all vCPUs */
- for_each_vcpu ( d, v )
- {
- if ( test_and_clear_bit(med->pause_flag, &v->pause_flags) )
- {
- vcpu_unpause(v);
- med->blocked--;
- }
- }
-
- destroy_ring_for_helper(&med->ring_page,
- med->ring_pg_struct);
- mem_event_ring_unlock(med);
- }
-
- return 0;
-}
-
-static inline void mem_event_release_slot(struct domain *d,
- struct mem_event_domain *med)
-{
- /* Update the accounting */
- if ( current->domain == d )
- med->target_producers--;
- else
- med->foreign_producers--;
-
- /* Kick any waiters */
- mem_event_wake(d, med);
-}
-
-/*
- * mem_event_mark_and_pause() tags vcpu and put it to sleep.
- * The vcpu will resume execution in mem_event_wake_waiters().
- */
-void mem_event_mark_and_pause(struct vcpu *v, struct mem_event_domain *med)
-{
- if ( !test_and_set_bit(med->pause_flag, &v->pause_flags) )
- {
- vcpu_pause_nosync(v);
- med->blocked++;
- }
-}
-
-/*
- * This must be preceded by a call to claim_slot(), and is guaranteed to
- * succeed. As a side-effect however, the vCPU may be paused if the ring is
- * overly full and its continued execution would cause stalling and excessive
- * waiting. The vCPU will be automatically unpaused when the ring clears.
- */
-void mem_event_put_request(struct domain *d,
- struct mem_event_domain *med,
- mem_event_request_t *req)
-{
- mem_event_front_ring_t *front_ring;
- int free_req;
- unsigned int avail_req;
- RING_IDX req_prod;
-
- if ( current->domain != d )
- {
- req->flags |= MEM_EVENT_FLAG_FOREIGN;
-#ifndef NDEBUG
- if ( !(req->flags & MEM_EVENT_FLAG_VCPU_PAUSED) )
- gdprintk(XENLOG_G_WARNING, "d%dv%d was not paused.\n",
- d->domain_id, req->vcpu_id);
-#endif
- }
-
- mem_event_ring_lock(med);
-
- /* Due to the reservations, this step must succeed. */
- front_ring = &med->front_ring;
- free_req = RING_FREE_REQUESTS(front_ring);
- ASSERT(free_req > 0);
-
- /* Copy request */
- req_prod = front_ring->req_prod_pvt;
- memcpy(RING_GET_REQUEST(front_ring, req_prod), req, sizeof(*req));
- req_prod++;
-
- /* Update ring */
- front_ring->req_prod_pvt = req_prod;
- RING_PUSH_REQUESTS(front_ring);
-
- /* We've actually *used* our reservation, so release the slot. */
- mem_event_release_slot(d, med);
-
- /* Give this vCPU a black eye if necessary, on the way out.
- * See the comments above wake_blocked() for more information
- * on how this mechanism works to avoid waiting. */
- avail_req = mem_event_ring_available(med);
- if( current->domain == d && avail_req < d->max_vcpus )
- mem_event_mark_and_pause(current, med);
-
- mem_event_ring_unlock(med);
-
- notify_via_xen_event_channel(d, med->xen_port);
-}
-
-int mem_event_get_response(struct domain *d, struct mem_event_domain *med, mem_event_response_t *rsp)
-{
- mem_event_front_ring_t *front_ring;
- RING_IDX rsp_cons;
-
- mem_event_ring_lock(med);
-
- front_ring = &med->front_ring;
- rsp_cons = front_ring->rsp_cons;
-
- if ( !RING_HAS_UNCONSUMED_RESPONSES(front_ring) )
- {
- mem_event_ring_unlock(med);
- return 0;
- }
-
- /* Copy response */
- memcpy(rsp, RING_GET_RESPONSE(front_ring, rsp_cons), sizeof(*rsp));
- rsp_cons++;
-
- /* Update ring */
- front_ring->rsp_cons = rsp_cons;
- front_ring->sring->rsp_event = rsp_cons + 1;
-
- /* Kick any waiters -- since we've just consumed an event,
- * there may be additional space available in the ring. */
- mem_event_wake(d, med);
-
- mem_event_ring_unlock(med);
-
- return 1;
-}
-
-void mem_event_cancel_slot(struct domain *d, struct mem_event_domain *med)
-{
- mem_event_ring_lock(med);
- mem_event_release_slot(d, med);
- mem_event_ring_unlock(med);
-}
-
-static int mem_event_grab_slot(struct mem_event_domain *med, int foreign)
-{
- unsigned int avail_req;
-
- if ( !med->ring_page )
- return -ENOSYS;
-
- mem_event_ring_lock(med);
-
- avail_req = mem_event_ring_available(med);
- if ( avail_req == 0 )
- {
- mem_event_ring_unlock(med);
- return -EBUSY;
- }
-
- if ( !foreign )
- med->target_producers++;
- else
- med->foreign_producers++;
-
- mem_event_ring_unlock(med);
-
- return 0;
-}
-
-/* Simple try_grab wrapper for use in the wait_event() macro. */
-static int mem_event_wait_try_grab(struct mem_event_domain *med, int *rc)
-{
- *rc = mem_event_grab_slot(med, 0);
- return *rc;
-}
-
-/* Call mem_event_grab_slot() until the ring doesn't exist, or is available. */
-static int mem_event_wait_slot(struct mem_event_domain *med)
-{
- int rc = -EBUSY;
- wait_event(med->wq, mem_event_wait_try_grab(med, &rc) != -EBUSY);
- return rc;
-}
-
-bool_t mem_event_check_ring(struct mem_event_domain *med)
-{
- return (med->ring_page != NULL);
-}
-
-/*
- * Determines whether or not the current vCPU belongs to the target domain,
- * and calls the appropriate wait function. If it is a guest vCPU, then we
- * use mem_event_wait_slot() to reserve a slot. As long as there is a ring,
- * this function will always return 0 for a guest. For a non-guest, we check
- * for space and return -EBUSY if the ring is not available.
- *
- * Return codes: -ENOSYS: the ring is not yet configured
- * -EBUSY: the ring is busy
- * 0: a spot has been reserved
- *
- */
-int __mem_event_claim_slot(struct domain *d, struct mem_event_domain *med,
- bool_t allow_sleep)
-{
- if ( (current->domain == d) && allow_sleep )
- return mem_event_wait_slot(med);
- else
- return mem_event_grab_slot(med, (current->domain != d));
-}
-
-#ifdef HAS_MEM_PAGING
-/* Registered with Xen-bound event channel for incoming notifications. */
-static void mem_paging_notification(struct vcpu *v, unsigned int port)
-{
- if ( likely(v->domain->mem_event->paging.ring_page != NULL) )
- p2m_mem_paging_resume(v->domain);
-}
-#endif
-
-#ifdef HAS_MEM_ACCESS
-/* Registered with Xen-bound event channel for incoming notifications. */
-static void mem_access_notification(struct vcpu *v, unsigned int port)
-{
- if ( likely(v->domain->mem_event->access.ring_page != NULL) )
- mem_access_resume(v->domain);
-}
-#endif
-
-#ifdef HAS_MEM_SHARING
-/* Registered with Xen-bound event channel for incoming notifications. */
-static void mem_sharing_notification(struct vcpu *v, unsigned int port)
-{
- if ( likely(v->domain->mem_event->share.ring_page != NULL) )
- mem_sharing_sharing_resume(v->domain);
-}
-#endif
-
-int do_mem_event_op(int op, uint32_t domain, void *arg)
-{
- int ret;
- struct domain *d;
-
- ret = rcu_lock_live_remote_domain_by_id(domain, &d);
- if ( ret )
- return ret;
-
- ret = xsm_mem_event_op(XSM_DM_PRIV, d, op);
- if ( ret )
- goto out;
-
- switch (op)
- {
-#ifdef HAS_MEM_PAGING
- case XENMEM_paging_op:
- ret = mem_paging_memop(d, (xen_mem_event_op_t *) arg);
- break;
-#endif
-#ifdef HAS_MEM_SHARING
- case XENMEM_sharing_op:
- ret = mem_sharing_memop(d, (xen_mem_sharing_op_t *) arg);
- break;
-#endif
- default:
- ret = -ENOSYS;
- }
-
- out:
- rcu_unlock_domain(d);
- return ret;
-}
-
-/* Clean up on domain destruction */
-void mem_event_cleanup(struct domain *d)
-{
-#ifdef HAS_MEM_PAGING
- if ( d->mem_event->paging.ring_page ) {
- /* Destroying the wait queue head means waking up all
- * queued vcpus. This will drain the list, allowing
- * the disable routine to complete. It will also drop
- * all domain refs the wait-queued vcpus are holding.
- * Finally, because this code path involves previously
- * pausing the domain (domain_kill), unpausing the
- * vcpus causes no harm. */
- destroy_waitqueue_head(&d->mem_event->paging.wq);
- (void)mem_event_disable(d, &d->mem_event->paging);
- }
-#endif
-#ifdef HAS_MEM_ACCESS
- if ( d->mem_event->access.ring_page ) {
- destroy_waitqueue_head(&d->mem_event->access.wq);
- (void)mem_event_disable(d, &d->mem_event->access);
- }
-#endif
-#ifdef HAS_MEM_SHARING
- if ( d->mem_event->share.ring_page ) {
- destroy_waitqueue_head(&d->mem_event->share.wq);
- (void)mem_event_disable(d, &d->mem_event->share);
- }
-#endif
-}
-
-int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec,
- XEN_GUEST_HANDLE_PARAM(void) u_domctl)
-{
- int rc;
-
- rc = xsm_mem_event_control(XSM_PRIV, d, mec->mode, mec->op);
- if ( rc )
- return rc;
-
- if ( unlikely(d == current->domain) )
- {
- gdprintk(XENLOG_INFO, "Tried to do a memory event op on itself.\n");
- return -EINVAL;
- }
-
- if ( unlikely(d->is_dying) )
- {
- gdprintk(XENLOG_INFO, "Ignoring memory event op on dying domain %u\n",
- d->domain_id);
- return 0;
- }
-
- if ( unlikely(d->vcpu == NULL) || unlikely(d->vcpu[0] == NULL) )
- {
- gdprintk(XENLOG_INFO,
- "Memory event op on a domain (%u) with no vcpus\n",
- d->domain_id);
- return -EINVAL;
- }
-
- rc = -ENOSYS;
-
- switch ( mec->mode )
- {
-#ifdef HAS_MEM_PAGING
- case XEN_DOMCTL_MEM_EVENT_OP_PAGING:
- {
- struct mem_event_domain *med = &d->mem_event->paging;
- rc = -EINVAL;
-
- switch( mec->op )
- {
- case XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE:
- {
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
- rc = -EOPNOTSUPP;
- /* pvh fixme: p2m_is_foreign types need addressing */
- if ( is_pvh_vcpu(current) || is_pvh_domain(hardware_domain) )
- break;
-
- rc = -ENODEV;
- /* Only HAP is supported */
- if ( !hap_enabled(d) )
- break;
-
- /* No paging if iommu is used */
- rc = -EMLINK;
- if ( unlikely(need_iommu(d)) )
- break;
-
- rc = -EXDEV;
- /* Disallow paging in a PoD guest */
- if ( p2m->pod.entry_count )
- break;
-
- rc = mem_event_enable(d, mec, med, _VPF_mem_paging,
- HVM_PARAM_PAGING_RING_PFN,
- mem_paging_notification);
- }
- break;
-
- case XEN_DOMCTL_MEM_EVENT_OP_PAGING_DISABLE:
- {
- if ( med->ring_page )
- rc = mem_event_disable(d, med);
- }
- break;
-
- default:
- rc = -ENOSYS;
- break;
- }
- }
- break;
-#endif
-
-#ifdef HAS_MEM_ACCESS
- case XEN_DOMCTL_MEM_EVENT_OP_ACCESS:
- {
- struct mem_event_domain *med = &d->mem_event->access;
- rc = -EINVAL;
-
- switch( mec->op )
- {
- case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE:
- case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE_INTROSPECTION:
- {
- rc = -ENODEV;
- if ( !p2m_mem_event_sanity_check(d) )
- break;
-
- rc = mem_event_enable(d, mec, med, _VPF_mem_access,
- HVM_PARAM_ACCESS_RING_PFN,
- mem_access_notification);
-
- if ( mec->op == XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE_INTROSPECTION
- && !rc )
- p2m_setup_introspection(d);
-
- }
- break;
-
- case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE:
- {
- if ( med->ring_page )
- {
- rc = mem_event_disable(d, med);
- d->arch.hvm_domain.introspection_enabled = 0;
- }
- }
- break;
-
- default:
- rc = -ENOSYS;
- break;
- }
- }
- break;
-#endif
-
-#ifdef HAS_MEM_SHARING
- case XEN_DOMCTL_MEM_EVENT_OP_SHARING:
- {
- struct mem_event_domain *med = &d->mem_event->share;
- rc = -EINVAL;
-
- switch( mec->op )
- {
- case XEN_DOMCTL_MEM_EVENT_OP_SHARING_ENABLE:
- {
- rc = -EOPNOTSUPP;
- /* pvh fixme: p2m_is_foreign types need addressing */
- if ( is_pvh_vcpu(current) || is_pvh_domain(hardware_domain) )
- break;
-
- rc = -ENODEV;
- /* Only HAP is supported */
- if ( !hap_enabled(d) )
- break;
-
- rc = mem_event_enable(d, mec, med, _VPF_mem_sharing,
- HVM_PARAM_SHARING_RING_PFN,
- mem_sharing_notification);
- }
- break;
-
- case XEN_DOMCTL_MEM_EVENT_OP_SHARING_DISABLE:
- {
- if ( med->ring_page )
- rc = mem_event_disable(d, med);
- }
- break;
-
- default:
- rc = -ENOSYS;
- break;
- }
- }
- break;
-#endif
-
- default:
- rc = -ENOSYS;
- }
-
- return rc;
-}
-
-void mem_event_vcpu_pause(struct vcpu *v)
-{
- ASSERT(v == current);
-
- atomic_inc(&v->mem_event_pause_count);
- vcpu_pause_nosync(v);
-}
-
-void mem_event_vcpu_unpause(struct vcpu *v)
-{
- int old, new, prev = v->mem_event_pause_count.counter;
-
- /* All unpause requests as a result of toolstack responses. Prevent
- * underflow of the vcpu pause count. */
- do
- {
- old = prev;
- new = old - 1;
-
- if ( new < 0 )
- {
- printk(XENLOG_G_WARNING
- "%pv mem_event: Too many unpause attempts\n", v);
- return;
- }
-
- prev = cmpxchg(&v->mem_event_pause_count.counter, old, new);
- } while ( prev != old );
-
- vcpu_unpause(v);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
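
Before moving on: the file removed above carried the generic ring back-pressure scheme (claim a slot, publish, then pause the producing vCPU whenever free slots drop below the number of potential guest producers); in this import it lives on under the vm_event name. A standalone sketch of that idea, stripped of all Xen specifics:

    #include <stdio.h>

    /* Standalone sketch (not Xen code) of the back-pressure rule from the
     * removed mem_event_put_request(): after publishing, pause the guest
     * producer if free slots drop below the number of potential producers. */
    struct ring { unsigned int prod, cons, size; };

    static unsigned int ring_free(const struct ring *r)
    {
        return r->size - (r->prod - r->cons);
    }

    static void pause_producer(void)
    {
        /* mem_event_mark_and_pause() in Xen; just log it here. */
        printf("producer paused until the consumer drains the ring\n");
    }

    static void put_with_backpressure(struct ring *r, unsigned int nr_producers)
    {
        r->prod++;                     /* a slot was claimed beforehand */
        if ( ring_free(r) < nr_producers )
            pause_producer();
    }

    int main(void)
    {
        struct ring r = { .prod = 0, .cons = 0, .size = 4 };

        for ( unsigned int i = 0; i < 3; i++ )
            put_with_backpressure(&r, 4 /* vCPUs */);
        return 0;
    }
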
diff --git a/xen/common/memory.c b/xen/common/memory.c
index e84ace9..b541f4a 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -126,22 +126,28 @@ static void populate_physmap(struct memop_args *a)
if ( is_domain_direct_mapped(d) )
{
mfn = gpfn;
- if ( !mfn_valid(mfn) )
- {
- gdprintk(XENLOG_INFO, "Invalid mfn %#"PRI_xen_pfn"\n",
- mfn);
- goto out;
- }
- page = mfn_to_page(mfn);
- if ( !get_page(page, d) )
+ for ( j = 0; j < (1U << a->extent_order); j++, mfn++ )
{
- gdprintk(XENLOG_INFO,
- "mfn %#"PRI_xen_pfn" doesn't belong to the"
- " domain\n", mfn);
- goto out;
+ if ( !mfn_valid(mfn) )
+ {
+ gdprintk(XENLOG_INFO, "Invalid mfn %#"PRI_xen_pfn"\n",
+ mfn);
+ goto out;
+ }
+
+ page = mfn_to_page(mfn);
+ if ( !get_page(page, d) )
+ {
+ gdprintk(XENLOG_INFO,
+ "mfn %#"PRI_xen_pfn" doesn't belong to the"
+ " domain\n", mfn);
+ goto out;
+ }
+ put_page(page);
}
- put_page(page);
+
+ page = mfn_to_page(gpfn);
}
else
page = alloc_domheap_pages(d, a->extent_order, a->memflags);
@@ -462,7 +468,8 @@ static long memory_exchange(XEN_GUEST_HANDLE_PARAM(xen_memory_exchange_t) arg)
/* Allocate a chunk's worth of anonymous output pages. */
for ( j = 0; j < (1UL << out_chunk_order); j++ )
{
- page = alloc_domheap_pages(NULL, exch.out.extent_order, memflags);
+ page = alloc_domheap_pages(d, exch.out.extent_order,
+ MEMF_no_owner | memflags);
if ( unlikely(page == NULL) )
{
rc = -ENOMEM;
@@ -692,11 +699,98 @@ out:
return rc;
}
+static int construct_memop_from_reservation(
+ const struct xen_memory_reservation *r,
+ struct memop_args *a)
+{
+ unsigned int address_bits;
+
+ a->extent_list = r->extent_start;
+ a->nr_extents = r->nr_extents;
+ a->extent_order = r->extent_order;
+ a->memflags = 0;
+
+ address_bits = XENMEMF_get_address_bits(r->mem_flags);
+ if ( (address_bits != 0) &&
+ (address_bits < (get_order_from_pages(max_page) + PAGE_SHIFT)) )
+ {
+ if ( address_bits <= PAGE_SHIFT )
+ return -EINVAL;
+ a->memflags = MEMF_bits(address_bits);
+ }
+
+ if ( r->mem_flags & XENMEMF_vnode )
+ {
+ nodeid_t vnode, pnode;
+ struct domain *d = a->domain;
+
+ read_lock(&d->vnuma_rwlock);
+ if ( d->vnuma )
+ {
+ vnode = XENMEMF_get_node(r->mem_flags);
+ if ( vnode >= d->vnuma->nr_vnodes )
+ {
+ read_unlock(&d->vnuma_rwlock);
+ return -EINVAL;
+ }
+
+ pnode = d->vnuma->vnode_to_pnode[vnode];
+ if ( pnode != NUMA_NO_NODE )
+ {
+ a->memflags |= MEMF_node(pnode);
+ if ( r->mem_flags & XENMEMF_exact_node_request )
+ a->memflags |= MEMF_exact_node;
+ }
+ }
+ read_unlock(&d->vnuma_rwlock);
+ }
+ else
+ {
+ a->memflags |= MEMF_node(XENMEMF_get_node(r->mem_flags));
+ if ( r->mem_flags & XENMEMF_exact_node_request )
+ a->memflags |= MEMF_exact_node;
+ }
+
+ return 0;
+}
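
construct_memop_from_reservation() above also gains vNUMA awareness: a guest-specified virtual node is translated to a physical node under the vnuma read lock before being encoded into memflags. The translation step in isolation, as a standalone sketch with an illustrative mapping table:

    #include <stdio.h>

    #define NUMA_NO_NODE 0xffU

    /* Illustrative vnode -> pnode table; in Xen this is
     * d->vnuma->vnode_to_pnode, read under d->vnuma_rwlock. */
    static const unsigned int vnode_to_pnode[] = { 0, 1, NUMA_NO_NODE };

    static int vnode_lookup(unsigned int vnode, unsigned int *pnode)
    {
        if ( vnode >= sizeof(vnode_to_pnode) / sizeof(vnode_to_pnode[0]) )
            return -1;                      /* -EINVAL in the hypervisor */
        *pnode = vnode_to_pnode[vnode];
        return (*pnode == NUMA_NO_NODE) ? 1 : 0;  /* 1: no placement hint */
    }

    int main(void)
    {
        unsigned int pnode;

        if ( vnode_lookup(1, &pnode) == 0 )
            printf("allocate on physical node %u\n", pnode);  /* node 1 */
        return 0;
    }
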
+
+#ifdef HAS_PASSTHROUGH
+struct get_reserved_device_memory {
+ struct xen_reserved_device_memory_map map;
+ unsigned int used_entries;
+};
+
+static int get_reserved_device_memory(xen_pfn_t start, xen_ulong_t nr,
+ u32 id, void *ctxt)
+{
+ struct get_reserved_device_memory *grdm = ctxt;
+ u32 sbdf = PCI_SBDF3(grdm->map.dev.pci.seg, grdm->map.dev.pci.bus,
+ grdm->map.dev.pci.devfn);
+
+ if ( !(grdm->map.flags & XENMEM_RDM_ALL) && (sbdf != id) )
+ return 0;
+
+ if ( grdm->used_entries < grdm->map.nr_entries )
+ {
+ struct xen_reserved_device_memory rdm = {
+ .start_pfn = start, .nr_pages = nr
+ };
+
+ if ( __copy_to_guest_offset(grdm->map.buffer, grdm->used_entries,
+ &rdm, 1) )
+ return -EFAULT;
+ }
+
+ ++grdm->used_entries;
+
+ return 1;
+}
+#endif
+
long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
{
struct domain *d;
long rc;
- unsigned int address_bits;
struct xen_memory_reservation reservation;
struct memop_args args;
domid_t domid;
@@ -718,35 +812,24 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
if ( unlikely(start_extent >= reservation.nr_extents) )
return start_extent;
- args.extent_list = reservation.extent_start;
- args.nr_extents = reservation.nr_extents;
- args.extent_order = reservation.extent_order;
- args.nr_done = start_extent;
- args.preempted = 0;
- args.memflags = 0;
+ d = rcu_lock_domain_by_any_id(reservation.domid);
+ if ( d == NULL )
+ return start_extent;
+ args.domain = d;
- address_bits = XENMEMF_get_address_bits(reservation.mem_flags);
- if ( (address_bits != 0) &&
- (address_bits < (get_order_from_pages(max_page) + PAGE_SHIFT)) )
+ if ( construct_memop_from_reservation(&reservation, &args) )
{
- if ( address_bits <= PAGE_SHIFT )
- return start_extent;
- args.memflags = MEMF_bits(address_bits);
+ rcu_unlock_domain(d);
+ return start_extent;
}
- args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));
- if ( reservation.mem_flags & XENMEMF_exact_node_request )
- args.memflags |= MEMF_exact_node;
+ args.nr_done = start_extent;
+ args.preempted = 0;
if ( op == XENMEM_populate_physmap
&& (reservation.mem_flags & XENMEMF_populate_on_demand) )
args.memflags |= MEMF_populate_on_demand;
- d = rcu_lock_domain_by_any_id(reservation.domid);
- if ( d == NULL )
- return start_extent;
- args.domain = d;
-
if ( xsm_memory_adjust_reservation(XSM_TARGET, current->domain, d) )
{
rcu_unlock_domain(d);
@@ -1118,6 +1201,35 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
break;
}
+#ifdef HAS_PASSTHROUGH
+ case XENMEM_reserved_device_memory_map:
+ {
+ struct get_reserved_device_memory grdm;
+
+ if ( unlikely(start_extent) )
+ return -ENOSYS;
+
+ if ( copy_from_guest(&grdm.map, arg, 1) ||
+ !guest_handle_okay(grdm.map.buffer, grdm.map.nr_entries) )
+ return -EFAULT;
+
+ if ( grdm.map.flags & ~XENMEM_RDM_ALL )
+ return -EINVAL;
+
+ grdm.used_entries = 0;
+ rc = iommu_get_reserved_device_memory(get_reserved_device_memory,
+ &grdm);
+
+ if ( !rc && grdm.map.nr_entries < grdm.used_entries )
+ rc = -ENOBUFS;
+ grdm.map.nr_entries = grdm.used_entries;
+ if ( __copy_to_guest(arg, &grdm.map, 1) )
+ rc = -EFAULT;
+
+ break;
+ }
+#endif
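
The new subop implements the usual "tell me how big a buffer I need" protocol: entries beyond the caller's buffer are counted but not copied, nr_entries is rewritten to the real count, and -ENOBUFS signals a retry. A caller-side sketch; the hypercall wrapper name and error handling are hypothetical, only the structure and the -ENOBUFS convention come from the hunk above:

    /* Hypothetical toolstack-side caller: probe with nr_entries == 0,
     * then retry with a right-sized buffer. */
    struct xen_reserved_device_memory *entries = NULL;
    struct xen_reserved_device_memory_map map = {
        .flags = XENMEM_RDM_ALL,
        .nr_entries = 0,
    };
    int rc = xc_memory_op_wrapper(XENMEM_reserved_device_memory_map, &map);

    if ( rc == -ENOBUFS )
    {
        entries = calloc(map.nr_entries, sizeof(*entries));
        set_xen_guest_handle(map.buffer, entries);
        rc = xc_memory_op_wrapper(XENMEM_reserved_device_memory_map, &map);
    }
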
+
default:
rc = arch_memory_op(cmd, arg);
break;
@@ -1126,6 +1238,24 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
return rc;
}
+void clear_domain_page(mfn_t mfn)
+{
+ void *ptr = map_domain_page(mfn);
+
+ clear_page(ptr);
+ unmap_domain_page(ptr);
+}
+
+void copy_domain_page(mfn_t dest, mfn_t source)
+{
+ const void *src = map_domain_page(source);
+ void *dst = map_domain_page(dest);
+
+ copy_page(dst, src);
+ unmap_domain_page(dst);
+ unmap_domain_page(src);
+}
+
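
clear_domain_page() and copy_domain_page() wrap the map/operate/unmap pattern so callers no longer open-code map_domain_page() pairs. A brief usage sketch (the surrounding allocation and the src_mfn name are illustrative):

    /* Illustrative caller: scrub a fresh page, or clone one, via the new
     * helpers rather than explicit map/unmap pairs. */
    struct page_info *pg = alloc_domheap_pages(d, 0, 0);

    if ( pg )
    {
        clear_domain_page(_mfn(page_to_mfn(pg)));             /* zero it */
        /* copy_domain_page(_mfn(page_to_mfn(pg)), src_mfn);  or clone it */
    }
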
void destroy_ring_for_helper(
void **_va, struct page_info *page)
{
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 24a759c..74fc1de 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -17,8 +17,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -278,7 +277,7 @@ unsigned long __init alloc_boot_pages(
#define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT))
#define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
- (fls(page_to_mfn(pg)) ? : 1))
+ (flsl(page_to_mfn(pg)) ? : 1))
typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
@@ -405,13 +404,19 @@ void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages)
spin_unlock(&heap_lock);
}
+static bool_t __read_mostly first_node_initialised;
+#ifndef CONFIG_SEPARATE_XENHEAP
+static unsigned int __read_mostly xenheap_bits;
+#else
+#define xenheap_bits 0
+#endif
+
static unsigned long init_node_heap(int node, unsigned long mfn,
unsigned long nr, bool_t *use_tail)
{
/* First node to be discovered has its heap metadata statically alloced. */
static heap_by_zone_and_order_t _heap_static;
static unsigned long avail_static[NR_ZONES];
- static int first_node_initialised;
unsigned long needed = (sizeof(**_heap) +
sizeof(**avail) * NR_ZONES +
PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -429,14 +434,18 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
}
#ifdef DIRECTMAP_VIRT_END
else if ( *use_tail && nr >= needed &&
- (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) )
+ (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) &&
+ (!xenheap_bits ||
+ !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
{
_heap[node] = mfn_to_virt(mfn + nr - needed);
avail[node] = mfn_to_virt(mfn + nr - 1) +
PAGE_SIZE - sizeof(**avail) * NR_ZONES;
}
else if ( nr >= needed &&
- (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) )
+ (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) &&
+ (!xenheap_bits ||
+ !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
{
_heap[node] = mfn_to_virt(mfn);
avail[node] = mfn_to_virt(mfn + needed - 1) +
@@ -580,17 +589,19 @@ static struct page_info *alloc_heap_pages(
unsigned int order, unsigned int memflags,
struct domain *d)
{
- unsigned int first_node, i, j, zone = 0, nodemask_retry = 0;
- unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
+ unsigned int i, j, zone = 0, nodemask_retry = 0;
+ nodeid_t first_node, node = MEMF_get_node(memflags), req_node = node;
unsigned long request = 1UL << order;
struct page_info *pg;
nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
bool_t need_tlbflush = 0;
uint32_t tlbflush_timestamp = 0;
+ /* Make sure there are enough bits in memflags for nodeID. */
+ BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t)));
+
if ( node == NUMA_NO_NODE )
{
- memflags &= ~MEMF_exact_node;
if ( d != NULL )
{
node = next_node(d->last_alloc_node, nodemask);
@@ -602,7 +613,7 @@ static struct page_info *alloc_heap_pages(
}
first_node = node;
- ASSERT(node >= 0);
+ ASSERT(node < MAX_NUMNODES);
ASSERT(zone_lo <= zone_hi);
ASSERT(zone_hi < NR_ZONES);
@@ -651,7 +662,7 @@ static struct page_info *alloc_heap_pages(
goto found;
} while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
- if ( memflags & MEMF_exact_node )
+ if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE )
goto not_found;
/* Pick next node. */
@@ -668,7 +679,7 @@ static struct page_info *alloc_heap_pages(
if ( node == first_node )
{
/* When we have tried all in nodemask, we fall back to others. */
- if ( nodemask_retry++ )
+ if ( (memflags & MEMF_exact_node) || nodemask_retry++ )
goto not_found;
nodes_andnot(nodemask, node_online_map, nodemask);
first_node = node = first_node(nodemask);
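
The allocator changes above tighten MEMF_exact_node: it now only takes effect when the caller actually named a node (req_node != NUMA_NO_NODE), and an exact request no longer spills into the nodemask-retry fallback. The resulting policy, distilled into a standalone sketch:

    #include <stdbool.h>
    #include <stdio.h>

    #define NUMA_NO_NODE 0xffU

    /* Illustrative per-node free counts; real code walks heap zones. */
    static const unsigned int free_pages[4] = { 0, 8, 0, 2 };

    static int pick_node(unsigned int req_node, bool exact)
    {
        unsigned int node = (req_node == NUMA_NO_NODE) ? 0 : req_node;

        for ( unsigned int tries = 0; tries < 4; tries++, node = (node + 1) % 4 )
        {
            if ( free_pages[node] )
                return node;
            if ( exact && req_node != NUMA_NO_NODE )
                return -1;   /* exact request: no fallback to other nodes */
        }
        return -1;
    }

    int main(void)
    {
        printf("%d\n", pick_node(2, true));           /* -1: node 2 empty */
        printf("%d\n", pick_node(NUMA_NO_NODE, true)); /* 1: exact ignored
                                                          without a node */
        return 0;
    }
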
@@ -1260,7 +1271,7 @@ void __init end_boot_allocator(void)
{
#ifdef CONFIG_X86
dma_bitsize = min_t(unsigned int,
- fls(NODE_DATA(0)->node_spanned_pages) - 1
+ flsl(NODE_DATA(0)->node_spanned_pages) - 1
+ PAGE_SHIFT - 2,
32);
#else
@@ -1279,7 +1290,8 @@ static void __init smp_scrub_heap_pages(void *data)
unsigned long mfn, start, end;
struct page_info *pg;
struct scrub_region *r;
- unsigned int temp_cpu, node, cpu_idx = 0;
+ unsigned int temp_cpu, cpu_idx = 0;
+ nodeid_t node;
unsigned int cpu = smp_processor_id();
if ( data )
@@ -1338,7 +1350,7 @@ static int __init find_non_smt(unsigned int node, cpumask_t *dest)
if ( cpumask_intersects(dest, per_cpu(cpu_sibling_mask, i)) )
continue;
cpu = cpumask_first(per_cpu(cpu_sibling_mask, i));
- cpumask_set_cpu(cpu, dest);
+ __cpumask_set_cpu(cpu, dest);
}
return cpumask_weight(dest);
}
@@ -1431,13 +1443,13 @@ void __init scrub_heap_pages(void)
/* Figure out which NODE CPUs are close. */
for_each_online_node ( j )
{
- int distance;
+ u8 distance;
if ( cpumask_empty(&node_to_cpumask(j)) )
continue;
distance = __node_distance(i, j);
- if ( distance < last_distance )
+ if ( (distance < last_distance) && (distance != NUMA_NO_DISTANCE) )
{
last_distance = distance;
best_node = j;
@@ -1450,7 +1462,7 @@ void __init scrub_heap_pages(void)
cpus = find_non_smt(best_node, &node_cpus);
if ( cpus == 0 )
{
- cpumask_set_cpu(smp_processor_id(), &node_cpus);
+ __cpumask_set_cpu(smp_processor_id(), &node_cpus);
cpus = 1;
}
/* We already have the node information from round #0. */
@@ -1541,11 +1553,13 @@ void free_xenheap_pages(void *v, unsigned int order)
#else
-static unsigned int __read_mostly xenheap_bits;
-
void __init xenheap_max_mfn(unsigned long mfn)
{
- xenheap_bits = fls(mfn) + PAGE_SHIFT;
+ ASSERT(!first_node_initialised);
+ ASSERT(!xenheap_bits);
+ BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG);
+ xenheap_bits = min(flsl(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS);
+ printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits);
}
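
The reworked xenheap_max_mfn() derives an address-width limit from the highest usable MFN as flsl(mfn + 1) - 1 + PAGE_SHIFT, clamped to PADDR_BITS. A standalone check of that arithmetic, assuming a 64-bit long and PAGE_SHIFT of 12 (flsl() rebuilt here on a GCC builtin):

    #include <stdio.h>

    /* flsl(): 1-based index of the most significant set bit (0 for 0),
     * matching Xen's helper. */
    static int flsl(unsigned long x)
    {
        return x ? 64 - __builtin_clzl(x) : 0;
    }

    int main(void)
    {
        unsigned long max_mfn = 0xfffff;     /* highest MFN below 4 GiB */
        int page_shift = 12;

        printf("xenheap_bits = %d\n",
               flsl(max_mfn + 1) - 1 + page_shift);   /* prints 32 */
        return 0;
    }
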
void init_xenheap_pages(paddr_t ps, paddr_t pe)
@@ -1642,9 +1656,9 @@ int assign_pages(
if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
{
if ( !opt_tmem || order != 0 || d->tot_pages != d->max_pages )
- gdprintk(XENLOG_INFO, "Over-allocation for domain %u: "
- "%u > %u\n", d->domain_id,
- d->tot_pages + (1 << order), d->max_pages);
+ gprintk(XENLOG_INFO, "Over-allocation for domain %u: "
+ "%u > %u\n", d->domain_id,
+ d->tot_pages + (1 << order), d->max_pages);
goto fail;
}
@@ -1682,10 +1696,14 @@ struct page_info *alloc_domheap_pages(
ASSERT(!in_irq());
- bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
+ bits = domain_clamp_alloc_bitsize(memflags & MEMF_no_owner ? NULL : d,
+ bits ? : (BITS_PER_LONG+PAGE_SHIFT));
if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
return NULL;
+ if ( memflags & MEMF_no_owner )
+ memflags |= MEMF_no_refcount;
+
if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
@@ -1695,7 +1713,8 @@ struct page_info *alloc_domheap_pages(
memflags, d)) == NULL)) )
return NULL;
- if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
+ if ( d && !(memflags & MEMF_no_owner) &&
+ assign_pages(d, pg, order, memflags) )
{
free_heap_pages(pg, order);
return NULL;
diff --git a/xen/common/pdx.c b/xen/common/pdx.c
index cf8b9b5..90136c0 100644
--- a/xen/common/pdx.c
+++ b/xen/common/pdx.c
@@ -12,8 +12,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/common/perfc.c b/xen/common/perfc.c
index 96a4245..9f078e1 100644
--- a/xen/common/perfc.c
+++ b/xen/common/perfc.c
@@ -57,7 +57,7 @@ void perfc_printall(unsigned char key)
for_each_online_cpu ( cpu )
{
if ( k > 0 && (k % 4) == 0 )
- printk("\n%46s", "");
+ printk("\n%53s", "");
printk(" CPU%02u[%10"PRIperfc"u]", cpu, per_cpu(perfcounters, cpu)[j]);
++k;
}
@@ -103,7 +103,7 @@ void perfc_printall(unsigned char key)
if ( perfc_info[i].type == TYPE_S_ARRAY )
sum = (perfc_t) sum;
if ( k > 0 && (k % 4) == 0 )
- printk("\n%46s", "");
+ printk("\n%53s", "");
printk(" CPU%02u[%10Lu]", cpu, sum);
++k;
}
diff --git a/xen/common/preempt.c b/xen/common/preempt.c
index ec50dae..3b4178f 100644
--- a/xen/common/preempt.c
+++ b/xen/common/preempt.c
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/preempt.h>
diff --git a/xen/common/radix-tree.c b/xen/common/radix-tree.c
index 17361c7..5e33f09 100644
--- a/xen/common/radix-tree.c
+++ b/xen/common/radix-tree.c
@@ -15,8 +15,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/common/random.c b/xen/common/random.c
index 4a28a24..fb805b0 100644
--- a/xen/common/random.c
+++ b/xen/common/random.c
@@ -1,9 +1,12 @@
+#include <xen/cache.h>
+#include <xen/init.h>
#include <xen/percpu.h>
#include <xen/random.h>
#include <xen/time.h>
#include <asm/random.h>
static DEFINE_PER_CPU(unsigned int, seed);
+unsigned int __read_mostly boot_random;
unsigned int get_random(void)
{
@@ -27,3 +30,10 @@ unsigned int get_random(void)
return val;
}
+
+static int __init init_boot_random(void)
+{
+ boot_random = get_random();
+ return 0;
+}
+__initcall(init_boot_random);
diff --git a/xen/common/rangeset.c b/xen/common/rangeset.c
index 116d5dc..6c6293c 100644
--- a/xen/common/rangeset.c
+++ b/xen/common/rangeset.c
@@ -248,11 +248,11 @@ int rangeset_remove_range(
return rc;
}
-int rangeset_contains_range(
+bool_t rangeset_contains_range(
struct rangeset *r, unsigned long s, unsigned long e)
{
struct range *x;
- int contains;
+ bool_t contains;
ASSERT(s <= e);
@@ -264,11 +264,11 @@ int rangeset_contains_range(
return contains;
}
-int rangeset_overlaps_range(
+bool_t rangeset_overlaps_range(
struct rangeset *r, unsigned long s, unsigned long e)
{
struct range *x;
- int overlaps;
+ bool_t overlaps;
ASSERT(s <= e);
@@ -289,7 +289,7 @@ int rangeset_report_ranges(
read_lock(&r->lock);
- for ( x = find_range(r, s); x && (x->s <= e) && !rc; x = next_range(r, x) )
+ for ( x = first_range(r); x && (x->s <= e) && !rc; x = next_range(r, x) )
if ( x->e >= s )
rc = cb(max(x->s, s), min(x->e, e), ctxt);
@@ -310,14 +310,14 @@ int rangeset_remove_singleton(
return rangeset_remove_range(r, s, s);
}
-int rangeset_contains_singleton(
+bool_t rangeset_contains_singleton(
struct rangeset *r, unsigned long s)
{
return rangeset_contains_range(r, s, s);
}
-int rangeset_is_empty(
- struct rangeset *r)
+bool_t rangeset_is_empty(
+ const struct rangeset *r)
{
return ((r == NULL) || list_empty(&r->range_list));
}
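
For reference, the distinction between the two (now bool_t) range predicates, since they are easy to mix up; the callees here are hypothetical, only the predicate semantics come from rangeset.c:

    /* contains_range: every value in [s, e] is in the set;
     * overlaps_range: at least one value in [s, e] is in the set. */
    if ( rangeset_contains_range(r, 0x100, 0x1ff) )
        do_fast_path();              /* hypothetical */
    else if ( rangeset_overlaps_range(r, 0x100, 0x1ff) )
        do_split(r, 0x100, 0x1ff);   /* hypothetical */
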
diff --git a/xen/common/rbtree.c b/xen/common/rbtree.c
index 67564c8..1812c0a 100644
--- a/xen/common/rbtree.c
+++ b/xen/common/rbtree.c
@@ -14,8 +14,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
linux/lib/rbtree.c
*/
diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
index e9979cd..f13b87b 100644
--- a/xen/common/rcupdate.c
+++ b/xen/common/rcupdate.c
@@ -12,8 +12,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) IBM Corporation, 2001
*
diff --git a/xen/common/sched_arinc653.c b/xen/common/sched_arinc653.c
index 5f09ded..cff5da9 100644
--- a/xen/common/sched_arinc653.c
+++ b/xen/common/sched_arinc653.c
@@ -246,9 +246,8 @@ arinc653_sched_set(
for ( i = 0; i < schedule->num_sched_entries; i++ )
{
- /* Check for a valid VCPU ID and run time. */
- if ( (schedule->sched_entries[i].vcpu_id >= MAX_VIRT_CPUS)
- || (schedule->sched_entries[i].runtime <= 0) )
+ /* Check for a valid run time. */
+ if ( schedule->sched_entries[i].runtime <= 0 )
goto fail;
/* Add this entry's run time to total run time. */
@@ -706,6 +705,7 @@ a653sched_adjust_global(const struct scheduler *ops,
rc = arinc653_sched_set(ops, &local_sched);
break;
case XEN_SYSCTL_SCHEDOP_getinfo:
+ memset(&local_sched, -1, sizeof(local_sched));
rc = arinc653_sched_get(ops, &local_sched);
if ( rc )
break;
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 8b02b7b..57967c1 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -26,6 +26,23 @@
/*
+ * Locking:
+ * - Scheduler-lock (a.k.a. runqueue lock):
+ * + is per-runqueue, and there is one runqueue per-cpu;
+ * + serializes all runqueue manipulation operations;
+ * - Private data lock (a.k.a. private scheduler lock):
+ * + serializes accesses to the scheduler global state (weight,
+ * credit, balance_credit, etc);
+ * + serializes updates to the domains' scheduling parameters.
+ *
+ * Ordering is "private lock always comes first":
+ * + if we need both locks, we must acquire the private
+ * scheduler lock first;
+ * + if we already own a runqueue lock, we must never acquire
+ * the private scheduler lock.
+ */
+
+/*
* Basic constants
*/
#define CSCHED_DEFAULT_WEIGHT 256
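
The ordering rule documented above is exactly what the reworked csched_dump_pcpu() later in this patch follows; as code, the discipline looks like this (names from sched_credit.c, flow illustrative):

    /* Correct order: private scheduler lock first, then a runqueue lock;
     * release in reverse.  Taking them the other way round can deadlock
     * against a path holding the private lock while waiting on a runq. */
    struct csched_private *prv = CSCHED_PRIV(ops);
    spinlock_t *lock;
    unsigned long flags;

    spin_lock_irqsave(&prv->lock, flags);   /* 1: private lock */
    lock = pcpu_schedule_lock(cpu);         /* 2: this cpu's runqueue lock */

    /* ... read global weights, walk this cpu's runqueue ... */

    pcpu_schedule_unlock(lock, cpu);        /* release in reverse order */
    spin_unlock_irqrestore(&prv->lock, flags);
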
@@ -154,10 +171,10 @@ struct csched_pcpu {
* Convenience macro for accessing the per-PCPU cpumask we need for
* implementing the two steps (soft and hard affinity) balancing logic.
* It is stored in csched_pcpu so that serialization is not an issue,
- * as there is a csched_pcpu for each PCPU and we always hold the
- * runqueue spin-lock when using this.
+ * as there is a csched_pcpu for each PCPU, and we always hold the
+ * runqueue lock for the proper PCPU when using this.
*/
-#define csched_balance_mask (CSCHED_PCPU(smp_processor_id())->balance_mask)
+#define csched_balance_mask(c) (CSCHED_PCPU(c)->balance_mask)
/*
* Virtual CPU
@@ -279,24 +296,23 @@ __runq_remove(struct csched_vcpu *svc)
/*
* Hard affinity balancing is always necessary and must never be skipped.
- * OTOH, if the vcpu's soft affinity is full (it spans all the possible
- * pcpus) we can safely avoid dealing with it entirely.
+ * But soft affinity need only be considered when it has a functionally
+ * different effect than other constraints (such as hard affinity, cpus
+ * online, or cpupools).
*
- * A vcpu's soft affinity is also deemed meaningless in case it has empty
- * intersection with mask, to cover the cases where using the soft affinity
- * mask seems legit, but would instead led to trying to schedule the vcpu
- * on _no_ pcpu! Typical use cases are for mask to be equal to the vcpu's
- * hard affinity, or to the && of hard affinity and the set of online cpus
- * in the domain's cpupool.
+ * Soft affinity only needs to be considered if:
+ * * The cpus in the cpupool are not a subset of soft affinity
+ * * The hard affinity is not a subset of soft affinity
+ * * There is an overlap between the soft affinity and the mask which is
+ * currently being considered.
*/
static inline int __vcpu_has_soft_affinity(const struct vcpu *vc,
const cpumask_t *mask)
{
- if ( cpumask_full(vc->cpu_soft_affinity)
- || !cpumask_intersects(vc->cpu_soft_affinity, mask) )
- return 0;
-
- return 1;
+ return !cpumask_subset(cpupool_online_cpumask(vc->domain->cpupool),
+ vc->cpu_soft_affinity) &&
+ !cpumask_subset(vc->cpu_hard_affinity, vc->cpu_soft_affinity) &&
+ cpumask_intersects(vc->cpu_soft_affinity, mask);
}
/*
@@ -350,12 +366,17 @@ __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
struct csched_vcpu * const cur = CSCHED_VCPU(curr_on_cpu(cpu));
struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu));
- cpumask_t mask, idle_mask;
+ cpumask_t mask, idle_mask, *online;
int balance_step, idlers_empty;
ASSERT(cur);
cpumask_clear(&mask);
- idlers_empty = cpumask_empty(prv->idlers);
+
+ /* cpu is vc->processor, so it must be in a cpupool. */
+ ASSERT(per_cpu(cpupool, cpu) != NULL);
+ online = cpupool_online_cpumask(per_cpu(cpupool, cpu));
+ cpumask_and(&idle_mask, prv->idlers, online);
+ idlers_empty = cpumask_empty(&idle_mask);
/*
@@ -372,7 +393,7 @@ __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
if ( cur->pri != CSCHED_PRI_IDLE )
SCHED_STAT_CRANK(tickle_idlers_none);
- cpumask_set_cpu(cpu, &mask);
+ __cpumask_set_cpu(cpu, &mask);
}
else if ( !idlers_empty )
{
@@ -391,9 +412,10 @@ __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
/* Are there idlers suitable for new (for this balance step)? */
csched_balance_cpumask(new->vcpu, balance_step,
- csched_balance_mask);
- cpumask_and(&idle_mask, prv->idlers, csched_balance_mask);
- new_idlers_empty = cpumask_empty(&idle_mask);
+ csched_balance_mask(cpu));
+ cpumask_and(csched_balance_mask(cpu),
+ csched_balance_mask(cpu), &idle_mask);
+ new_idlers_empty = cpumask_empty(csched_balance_mask(cpu));
/*
* Let's not be too harsh! If there aren't idlers suitable
@@ -422,7 +444,7 @@ __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
SCHED_VCPU_STAT_CRANK(cur, migrate_r);
SCHED_STAT_CRANK(migrate_kicked_away);
set_bit(_VPF_migrating, &cur->vcpu->pause_flags);
- cpumask_set_cpu(cpu, &mask);
+ __cpumask_set_cpu(cpu, &mask);
}
else if ( !new_idlers_empty )
{
@@ -432,7 +454,7 @@ __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
this_cpu(last_tickle_cpu) =
cpumask_cycle(this_cpu(last_tickle_cpu), &idle_mask);
- cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask);
+ __cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask);
}
else
cpumask_or(&mask, &mask, &idle_mask);
@@ -675,7 +697,7 @@ _csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit)
*/
cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers);
if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) )
- cpumask_set_cpu(cpu, &idlers);
+ __cpumask_set_cpu(cpu, &idlers);
cpumask_and(&cpus, &cpus, &idlers);
/*
@@ -692,7 +714,7 @@ _csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit)
*/
if ( !cpumask_test_cpu(cpu, &cpus) && !cpumask_empty(&cpus) )
cpu = cpumask_cycle(cpu, &cpus);
- cpumask_clear_cpu(cpu, &cpus);
+ __cpumask_clear_cpu(cpu, &cpus);
while ( !cpumask_empty(&cpus) )
{
@@ -1470,8 +1492,9 @@ csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step)
&& !__vcpu_has_soft_affinity(vc, vc->cpu_hard_affinity) )
continue;
- csched_balance_cpumask(vc, balance_step, csched_balance_mask);
- if ( __csched_vcpu_is_migrateable(vc, cpu, csched_balance_mask) )
+ csched_balance_cpumask(vc, balance_step, csched_balance_mask(cpu));
+ if ( __csched_vcpu_is_migrateable(vc, cpu,
+ csched_balance_mask(cpu)) )
{
/* We got a candidate. Grab it! */
TRACE_3D(TRC_CSCHED_STOLEN_VCPU, peer_cpu,
@@ -1494,6 +1517,7 @@ static struct csched_vcpu *
csched_load_balance(struct csched_private *prv, int cpu,
struct csched_vcpu *snext, bool_t *stolen)
{
+ struct cpupool *c = per_cpu(cpupool, cpu);
struct csched_vcpu *speer;
cpumask_t workers;
cpumask_t *online;
@@ -1501,10 +1525,13 @@ csched_load_balance(struct csched_private *prv, int cpu,
int node = cpu_to_node(cpu);
BUG_ON( cpu != snext->vcpu->processor );
- online = cpupool_scheduler_cpumask(per_cpu(cpupool, cpu));
+ online = cpupool_online_cpumask(c);
- /* If this CPU is going offline we shouldn't steal work. */
- if ( unlikely(!cpumask_test_cpu(cpu, online)) )
+ /*
+ * If this CPU is going offline, or is not (yet) part of any cpupool
+ * (as it happens, e.g., during cpu bringup), we shouldn't steal work.
+ */
+ if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) )
goto out;
if ( snext->pri == CSCHED_PRI_IDLE )
@@ -1536,7 +1563,7 @@ csched_load_balance(struct csched_private *prv, int cpu,
/* Find out what the !idle are in this node */
cpumask_andnot(&workers, online, prv->idlers);
cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
- cpumask_clear_cpu(cpu, &workers);
+ __cpumask_clear_cpu(cpu, &workers);
peer_cpu = cpumask_first(&workers);
if ( peer_cpu >= nr_cpu_ids )
@@ -1751,11 +1778,24 @@ static void
csched_dump_pcpu(const struct scheduler *ops, int cpu)
{
struct list_head *runq, *iter;
+ struct csched_private *prv = CSCHED_PRIV(ops);
struct csched_pcpu *spc;
struct csched_vcpu *svc;
+ spinlock_t *lock = lock;
+ unsigned long flags;
int loop;
#define cpustr keyhandler_scratch
+ /*
+ * We need both locks:
+ * - csched_dump_vcpu() wants to access domains' scheduling
+ * parameters, which are protected by the private scheduler lock;
+ * - we scan through the runqueue, so we need the proper runqueue
+ * lock (the one for this cpu's runqueue).
+ */
+ spin_lock_irqsave(&prv->lock, flags);
+ lock = pcpu_schedule_lock(cpu);
+
spc = CSCHED_PCPU(cpu);
runq = &spc->runq;
@@ -1782,6 +1822,9 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu)
csched_dump_vcpu(svc);
}
}
+
+ pcpu_schedule_unlock(lock, cpu);
+ spin_unlock_irqrestore(&prv->lock, flags);
#undef cpustr
}
@@ -1793,7 +1836,7 @@ csched_dump(const struct scheduler *ops)
int loop;
unsigned long flags;
- spin_lock_irqsave(&(prv->lock), flags);
+ spin_lock_irqsave(&prv->lock, flags);
#define idlers_buf keyhandler_scratch
@@ -1836,15 +1879,20 @@ csched_dump(const struct scheduler *ops)
list_for_each( iter_svc, &sdom->active_vcpu )
{
struct csched_vcpu *svc;
+ spinlock_t *lock;
+
svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);
+ lock = vcpu_schedule_lock(svc->vcpu);
printk("\t%3d: ", ++loop);
csched_dump_vcpu(svc);
+
+ vcpu_schedule_unlock(lock, svc->vcpu);
}
}
#undef idlers_buf
- spin_unlock_irqrestore(&(prv->lock), flags);
+ spin_unlock_irqrestore(&prv->lock, flags);
}
static int
diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index 1ca521b..75e0321 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -25,6 +25,7 @@
#include <xen/errno.h>
#include <xen/trace.h>
#include <xen/cpu.h>
+#include <xen/keyhandler.h>
#define d2printk(x...)
//#define d2printk printk
@@ -51,8 +52,6 @@
* credit2 wiki page:
* http://wiki.xen.org/wiki/Credit2_Scheduler_Development
* TODO:
- * + Immediate bug-fixes
- * - Do per-runqueue, grab proper lock for dump debugkey
* + Multiple sockets
* - Detect cpu layout and make runqueue map, one per L2 (make_runq_map())
* - Simple load balancer / runqueue assignment
@@ -493,7 +492,7 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
BUG_ON(new->rqd != rqd);
/* Look at the cpu it's running on first */
- cur = CSCHED2_VCPU(per_cpu(schedule_data, cpu).curr);
+ cur = CSCHED2_VCPU(curr_on_cpu(cpu));
burn_credits(rqd, cur, now);
if ( cur->credit < new->credit )
@@ -526,7 +525,7 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
if ( i == cpu )
continue;
- cur = CSCHED2_VCPU(per_cpu(schedule_data, i).curr);
+ cur = CSCHED2_VCPU(curr_on_cpu(i));
BUG_ON(is_idle_vcpu(cur->vcpu));
@@ -556,7 +555,10 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
/* Only switch to another processor if the credit difference is greater
* than the migrate resistance */
if ( ipid == -1 || lowest + CSCHED2_MIGRATE_RESIST > new->credit )
+ {
+ SCHED_STAT_CRANK(tickle_idlers_none);
goto no_tickle;
+ }
tickle:
BUG_ON(ipid == -1);
@@ -571,6 +573,7 @@ tickle:
(unsigned char *)&d);
}
cpumask_set_cpu(ipid, &rqd->tickled);
+ SCHED_STAT_CRANK(tickle_idlers_some);
cpu_raise_softirq(ipid, SCHEDULE_SOFTIRQ);
no_tickle:
@@ -650,6 +653,8 @@ static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now,
}
}
+ SCHED_STAT_CRANK(credit_reset);
+
/* No need to resort runqueue, as everyone's order should be the same. */
}
@@ -658,7 +663,7 @@ void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_vcpu *svc, s
s_time_t delta;
/* Assert svc is current */
- ASSERT(svc==CSCHED2_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));
+ ASSERT(svc==CSCHED2_VCPU(curr_on_cpu(svc->vcpu->processor)));
if ( is_idle_vcpu(svc->vcpu) )
{
@@ -669,6 +674,7 @@ void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_vcpu *svc, s
delta = now - svc->start_time;
if ( delta > 0 ) {
+ SCHED_STAT_CRANK(burn_credits_t2c);
t2c_update(rqd, delta, svc);
svc->start_time = now;
@@ -709,6 +715,7 @@ static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight,
{
rqd->max_weight = new_weight;
d2printk("%s: Runqueue id %d max weight %d\n", __func__, rqd->id, rqd->max_weight);
+ SCHED_STAT_CRANK(upd_max_weight_quick);
}
else if ( old_weight == rqd->max_weight )
{
@@ -725,6 +732,7 @@ static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight,
rqd->max_weight = max_weight;
d2printk("%s: Runqueue %d max weight %d\n", __func__, rqd->id, rqd->max_weight);
+ SCHED_STAT_CRANK(upd_max_weight_full);
}
}
@@ -746,6 +754,7 @@ __csched2_vcpu_check(struct vcpu *vc)
{
BUG_ON( !is_idle_vcpu(vc) );
}
+ SCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED2_VCPU_CHECK(_vc) (__csched2_vcpu_check(_vc))
#else
@@ -931,8 +940,9 @@ csched2_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
struct csched2_vcpu * const svc = CSCHED2_VCPU(vc);
BUG_ON( is_idle_vcpu(vc) );
+ SCHED_STAT_CRANK(vcpu_sleep);
- if ( per_cpu(schedule_data, vc->processor).curr == vc )
+ if ( curr_on_cpu(vc->processor) == vc )
cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
else if ( __vcpu_on_runq(svc) )
{
@@ -956,19 +966,23 @@ csched2_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
BUG_ON( is_idle_vcpu(vc) );
- /* Make sure svc priority mod happens before runq check */
- if ( unlikely(per_cpu(schedule_data, vc->processor).curr == vc) )
+ if ( unlikely(curr_on_cpu(vc->processor) == vc) )
{
+ SCHED_STAT_CRANK(vcpu_wake_running);
goto out;
}
if ( unlikely(__vcpu_on_runq(svc)) )
{
- /* If we've boosted someone that's already on a runqueue, prioritize
- * it and inform the cpu in question. */
+ SCHED_STAT_CRANK(vcpu_wake_onrunq);
goto out;
}
+ if ( likely(vcpu_runnable(vc)) )
+ SCHED_STAT_CRANK(vcpu_wake_runnable);
+ else
+ SCHED_STAT_CRANK(vcpu_wake_not_runnable);
+
/* If the context hasn't been saved for this vcpu yet, we can't put it on
* another runqueue. Instead, we set a flag so that it will be put on the runqueue
* after the context has been saved. */
@@ -1195,6 +1209,7 @@ static void migrate(const struct scheduler *ops,
svc->migrate_rqd = trqd;
set_bit(_VPF_migrating, &svc->vcpu->pause_flags);
set_bit(__CSFLAG_runq_migrate_request, &svc->flags);
+ SCHED_STAT_CRANK(migrate_requested);
}
else
{
@@ -1215,7 +1230,10 @@ static void migrate(const struct scheduler *ops,
update_load(ops, svc->rqd, svc, 1, now);
runq_insert(ops, svc->vcpu->processor, svc);
runq_tickle(ops, svc->vcpu->processor, svc, now);
+ SCHED_STAT_CRANK(migrate_on_runq);
}
+ else
+ SCHED_STAT_CRANK(migrate_no_runq);
}
}
@@ -1569,7 +1587,10 @@ csched2_runtime(const struct scheduler *ops, int cpu, struct csched2_vcpu *snext
/* The next guy may actually have a higher credit, if we've tried to
* avoid migrating him from a different cpu. DTRT. */
if ( rt_credit <= 0 )
+ {
time = CSCHED2_MIN_TIMER;
+ SCHED_STAT_CRANK(runtime_min_timer);
+ }
else
{
/* FIXME: See if we can eliminate this conversion if we know time
@@ -1580,9 +1601,15 @@ csched2_runtime(const struct scheduler *ops, int cpu, struct csched2_vcpu *snext
/* Check limits */
if ( time < CSCHED2_MIN_TIMER )
+ {
time = CSCHED2_MIN_TIMER;
+ SCHED_STAT_CRANK(runtime_min_timer);
+ }
else if ( time > CSCHED2_MAX_TIMER )
+ {
time = CSCHED2_MAX_TIMER;
+ SCHED_STAT_CRANK(runtime_max_timer);
+ }
}
return time;
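
The csched2_runtime() hunks above clamp the credit-derived timeslice into [CSCHED2_MIN_TIMER, CSCHED2_MAX_TIMER] and bump a counter recording which bound fired. A minimal sketch of that clamp-and-count pattern, with illustrative constants and plain counters standing in for the Xen macros (these are not the real Xen definitions):

    #include <stdint.h>

    typedef int64_t s_time_t;

    /* Illustrative bounds and counters, not the Xen definitions. */
    #define MIN_TIMER   500000      /* 0.5 ms, in ns */
    #define MAX_TIMER 10000000      /* 10 ms, in ns  */
    static unsigned long stat_min_timer, stat_max_timer;

    /* Clamp a proposed timeslice and record which bound, if any, fired;
     * the increments play the role of SCHED_STAT_CRANK() in the patch. */
    static s_time_t clamp_runtime(s_time_t time)
    {
        if ( time < MIN_TIMER )
        {
            time = MIN_TIMER;
            stat_min_timer++;
        }
        else if ( time > MAX_TIMER )
        {
            time = MAX_TIMER;
            stat_max_timer++;
        }
        return time;
    }
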
@@ -1615,7 +1642,10 @@ runq_candidate(struct csched2_runqueue_data *rqd,
* its credit is at least CSCHED2_MIGRATE_RESIST higher. */
if ( svc->vcpu->processor != cpu
&& snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit )
+ {
+ SCHED_STAT_CRANK(migrate_resisted);
continue;
+ }
/* If the next one on the list has more credit than current
* (or idle, if current is not runnable), choose it. */
@@ -1760,6 +1790,7 @@ csched2_schedule(
{
snext->credit += CSCHED2_MIGRATE_COMPENSATION;
snext->vcpu->processor = cpu;
+ SCHED_STAT_CRANK(migrated);
ret.migrated = 1;
}
}
@@ -1800,12 +1831,24 @@ csched2_dump_vcpu(struct csched2_vcpu *svc)
static void
csched2_dump_pcpu(const struct scheduler *ops, int cpu)
{
+ struct csched2_private *prv = CSCHED2_PRIV(ops);
struct list_head *runq, *iter;
struct csched2_vcpu *svc;
+ unsigned long flags;
+ spinlock_t *lock;
int loop;
- char cpustr[100];
+#define cpustr keyhandler_scratch
- /* FIXME: Do locking properly for access to runqueue structures */
+ /*
+ * We need both locks:
+ * - csched2_dump_vcpu() wants to access domains' scheduling
+ * parameters, which are protected by the private scheduler lock;
+ * - we scan through the runqueue, so we need the proper runqueue
+ * lock (the one of the runqueue this cpu is associated with).
+ */
+ spin_lock_irqsave(&prv->lock, flags);
+ lock = per_cpu(schedule_data, cpu).schedule_lock;
+ spin_lock(lock);
runq = &RQD(ops, cpu)->runq;
@@ -1815,7 +1858,7 @@ csched2_dump_pcpu(const struct scheduler *ops, int cpu)
printk("core=%s\n", cpustr);
/* current VCPU */
- svc = CSCHED2_VCPU(per_cpu(schedule_data, cpu).curr);
+ svc = CSCHED2_VCPU(curr_on_cpu(cpu));
if ( svc )
{
printk("\trun: ");
@@ -1832,6 +1875,10 @@ csched2_dump_pcpu(const struct scheduler *ops, int cpu)
csched2_dump_vcpu(svc);
}
}
+
+ spin_unlock(lock);
+ spin_unlock_irqrestore(&prv->lock, flags);
+#undef cpustr
}
static void
@@ -1839,7 +1886,13 @@ csched2_dump(const struct scheduler *ops)
{
struct list_head *iter_sdom, *iter_svc;
struct csched2_private *prv = CSCHED2_PRIV(ops);
+ unsigned long flags;
int i, loop;
+#define cpustr keyhandler_scratch
+
+ /* We need the private lock as we access global scheduler data
+ * and (below) the list of active domains. */
+ spin_lock_irqsave(&prv->lock, flags);
printk("Active queues: %d\n"
"\tdefault-weight = %d\n",
@@ -1851,19 +1904,25 @@ csched2_dump(const struct scheduler *ops)
fraction = prv->rqd[i].avgload * 100 / (1ULL<<prv->load_window_shift);
+ cpulist_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].active);
printk("Runqueue %d:\n"
"\tncpus = %u\n"
+ "\tcpus = %s\n"
"\tmax_weight = %d\n"
"\tinstload = %d\n"
"\taveload = %3"PRI_stime"\n",
i,
cpumask_weight(&prv->rqd[i].active),
+ cpustr,
prv->rqd[i].max_weight,
prv->rqd[i].load,
fraction);
+ cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].idle);
+ printk("\tidlers: %s\n", cpustr);
+ cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->rqd[i].tickled);
+ printk("\ttickled: %s\n", cpustr);
}
- /* FIXME: Locking! */
printk("Domain info:\n");
loop = 0;
@@ -1872,20 +1931,28 @@ csched2_dump(const struct scheduler *ops)
struct csched2_dom *sdom;
sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem);
- printk("\tDomain: %d w %d v %d\n\t",
- sdom->dom->domain_id,
- sdom->weight,
- sdom->nr_vcpus);
+ printk("\tDomain: %d w %d v %d\n\t",
+ sdom->dom->domain_id,
+ sdom->weight,
+ sdom->nr_vcpus);
list_for_each( iter_svc, &sdom->vcpu )
{
struct csched2_vcpu *svc;
+ spinlock_t *lock;
+
svc = list_entry(iter_svc, struct csched2_vcpu, sdom_elem);
+ lock = vcpu_schedule_lock(svc->vcpu);
printk("\t%3d: ", ++loop);
csched2_dump_vcpu(svc);
+
+ vcpu_schedule_unlock(lock, svc->vcpu);
}
}
+
+ spin_unlock_irqrestore(&prv->lock, flags);
+#undef cpustr
}
static void activate_runqueue(struct csched2_private *prv, int rqi)
@@ -1920,7 +1987,7 @@ static void deactivate_runqueue(struct csched2_private *prv, int rqi)
static void init_pcpu(const struct scheduler *ops, int cpu)
{
- int rqi;
+ unsigned rqi;
unsigned long flags;
struct csched2_private *prv = CSCHED2_PRIV(ops);
struct csched2_runqueue_data *rqd;
@@ -1945,7 +2012,7 @@ static void init_pcpu(const struct scheduler *ops, int cpu)
else
rqi = cpu_to_socket(cpu);
- if ( rqi < 0 )
+ if ( rqi == XEN_INVALID_SOCKET_ID )
{
printk("%s: cpu_to_socket(%d) returned %d!\n",
__func__, cpu, rqi);
@@ -1988,7 +2055,7 @@ csched2_alloc_pdata(const struct scheduler *ops, int cpu)
{
/* Check to see if the cpu is online yet */
/* Note: cpu 0 doesn't get a STARTING callback */
- if ( cpu == 0 || cpu_to_socket(cpu) >= 0 )
+ if ( cpu == 0 || cpu_to_socket(cpu) != XEN_INVALID_SOCKET_ID )
init_pcpu(ops, cpu);
else
printk("%s: cpu %d not online yet, deferring initializatgion\n",
@@ -2138,8 +2205,7 @@ csched2_deinit(const struct scheduler *ops)
struct csched2_private *prv;
prv = CSCHED2_PRIV(ops);
- if ( prv != NULL )
- xfree(prv);
+ xfree(prv);
}
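
The dump hunks above replace the old "FIXME: Do locking properly" comments with a fixed discipline: take the scheduler-wide private lock first, then the lock of the runqueue being walked, and release in reverse order. A small sketch of that nesting, with pthread mutexes standing in for Xen's irq-safe spinlocks and illustrative type names:

    #include <pthread.h>

    /* Illustrative types: a scheduler-wide lock protecting domain
     * weights and the domain list, and one lock per runqueue. */
    struct sched_priv { pthread_mutex_t lock; };
    struct runqueue   { pthread_mutex_t lock; /* ... queued vcpus ... */ };

    /* Dump paths hold both locks, always in the same order, so a dump
     * can neither race a scheduling decision nor deadlock against
     * another dump: private lock first, runqueue lock second. */
    static void dump_runqueue(struct sched_priv *prv, struct runqueue *rq)
    {
        pthread_mutex_lock(&prv->lock);
        pthread_mutex_lock(&rq->lock);
        /* ... walk rq and print each vcpu ... */
        pthread_mutex_unlock(&rq->lock);
        pthread_mutex_unlock(&prv->lock);
    }
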
diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index e70d6c7..4372486 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -124,6 +124,24 @@
#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
#define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5)
+ /*
+ * Useful to avoid too many cpumask_var_t on the stack.
+ */
+static cpumask_var_t *_cpumask_scratch;
+#define cpumask_scratch _cpumask_scratch[smp_processor_id()]
+
+/*
+ * We want to only allocate the _cpumask_scratch array the first time an
+ * instance of this scheduler is used, and avoid reallocating and leaking
+ * the old one when more instances are activated inside new cpupools. We
+ * also want to get rid of it when the last instance is de-inited.
+ *
+ * So we (sort of) reference count the number of initialized instances. This
+ * does not need to happen via atomic_t refcounters, as it only happens either
+ * during boot, or under the protection of the cpupool_lock spinlock.
+ */
+static unsigned int nr_rt_ops;
+
/*
* System-wide private data, including the global RunQueue/DepletedQ
* Global lock is referenced by schedule_data.schedule_lock from all
@@ -218,8 +236,7 @@ __q_elem(struct list_head *elem)
static void
rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
{
- char cpustr[1024];
- cpumask_t *cpupool_mask;
+ cpumask_t *cpupool_mask, *mask;
ASSERT(svc != NULL);
/* idle vcpu */
@@ -229,10 +246,22 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
return;
}
- cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);
+ /*
+ * We can't just use 'cpumask_scratch' because the dumping can
+ * happen from a pCPU outside of this scheduler's cpupool, and
+ * hence it's not right to use the pCPU's scratch mask (which
+ * may not even exist!). On the other hand, it is safe to use
+ * svc->vcpu->processor's own scratch space, since we hold the
+ * runqueue lock.
+ */
+ mask = _cpumask_scratch[svc->vcpu->processor];
+
+ cpupool_mask = cpupool_scheduler_cpumask(svc->vcpu->domain->cpupool);
+ cpumask_and(mask, cpupool_mask, svc->vcpu->cpu_hard_affinity);
+ cpulist_scnprintf(keyhandler_scratch, sizeof(keyhandler_scratch), mask);
printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
" cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
- " \t\t onQ=%d runnable=%d cpu_hard_affinity=%s ",
+ " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%s\n",
svc->vcpu->domain->domain_id,
svc->vcpu->vcpu_id,
svc->vcpu->processor,
@@ -243,19 +272,19 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
svc->last_start,
__vcpu_on_q(svc),
vcpu_runnable(svc->vcpu),
- cpustr);
- memset(cpustr, 0, sizeof(cpustr));
- cpupool_mask = cpupool_scheduler_cpumask(svc->vcpu->domain->cpupool);
- cpumask_scnprintf(cpustr, sizeof(cpustr), cpupool_mask);
- printk("cpupool=%s\n", cpustr);
+ svc->flags,
+ keyhandler_scratch);
}
static void
rt_dump_pcpu(const struct scheduler *ops, int cpu)
{
- struct rt_vcpu *svc = rt_vcpu(curr_on_cpu(cpu));
+ struct rt_private *prv = rt_priv(ops);
+ unsigned long flags;
- rt_dump_vcpu(ops, svc);
+ spin_lock_irqsave(&prv->lock, flags);
+ rt_dump_vcpu(ops, rt_vcpu(curr_on_cpu(cpu)));
+ spin_unlock_irqrestore(&prv->lock, flags);
}
static void
@@ -264,18 +293,17 @@ rt_dump(const struct scheduler *ops)
struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *iter;
struct rt_private *prv = rt_priv(ops);
struct rt_vcpu *svc;
- cpumask_t *online;
struct rt_dom *sdom;
unsigned long flags;
- ASSERT(!list_empty(&prv->sdom));
+ spin_lock_irqsave(&prv->lock, flags);
+
+ if ( list_empty(&prv->sdom) )
+ goto out;
- sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
- online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
runq = rt_runq(ops);
depletedq = rt_depletedq(ops);
- spin_lock_irqsave(&prv->lock, flags);
printk("Global RunQueue info:\n");
list_for_each( iter, runq )
{
@@ -303,6 +331,7 @@ rt_dump(const struct scheduler *ops)
}
}
+ out:
spin_unlock_irqrestore(&prv->lock, flags);
}
@@ -406,6 +435,16 @@ rt_init(struct scheduler *ops)
if ( prv == NULL )
return -ENOMEM;
+ ASSERT( _cpumask_scratch == NULL || nr_rt_ops > 0 );
+
+ if ( !_cpumask_scratch )
+ {
+ _cpumask_scratch = xmalloc_array(cpumask_var_t, nr_cpu_ids);
+ if ( !_cpumask_scratch )
+ goto no_mem;
+ }
+ nr_rt_ops++;
+
spin_lock_init(&prv->lock);
INIT_LIST_HEAD(&prv->sdom);
INIT_LIST_HEAD(&prv->runq);
@@ -416,6 +455,10 @@ rt_init(struct scheduler *ops)
ops->sched_data = prv;
return 0;
+
+ no_mem:
+ xfree(prv);
+ return -ENOMEM;
}
static void
@@ -423,6 +466,13 @@ rt_deinit(const struct scheduler *ops)
{
struct rt_private *prv = rt_priv(ops);
+ ASSERT( _cpumask_scratch && nr_rt_ops > 0 );
+
+ if ( (--nr_rt_ops) == 0 )
+ {
+ xfree(_cpumask_scratch);
+ _cpumask_scratch = NULL;
+ }
xfree(prv);
}
@@ -440,10 +490,32 @@ rt_alloc_pdata(const struct scheduler *ops, int cpu)
per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
spin_unlock_irqrestore(&prv->lock, flags);
+ if ( !alloc_cpumask_var(&_cpumask_scratch[cpu]) )
+ return NULL;
+
/* 1 indicates alloc. succeed in schedule.c */
return (void *)1;
}
+static void
+rt_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+ struct rt_private *prv = rt_priv(ops);
+ struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&prv->lock, flags);
+
+ /* Move spinlock back to the default lock */
+ ASSERT(sd->schedule_lock == &prv->lock);
+ ASSERT(!spin_is_locked(&sd->_lock));
+ sd->schedule_lock = &sd->_lock;
+
+ spin_unlock_irqrestore(&prv->lock, flags);
+
+ free_cpumask_var(_cpumask_scratch[cpu]);
+}
+
static void *
rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
{
@@ -525,6 +597,8 @@ rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
if ( !is_idle_vcpu(vc) )
svc->budget = RTDS_DEFAULT_BUDGET;
+ SCHED_STAT_CRANK(vcpu_init);
+
return svc;
}
@@ -574,6 +648,8 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
struct rt_dom * const sdom = svc->sdom;
spinlock_t *lock;
+ SCHED_STAT_CRANK(vcpu_destroy);
+
BUG_ON( sdom == NULL );
lock = vcpu_schedule_lock_irq(vc);
@@ -663,7 +739,7 @@ burn_budget(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now)
* lock is grabbed before calling this function
*/
static struct rt_vcpu *
-__runq_pick(const struct scheduler *ops, cpumask_t *mask)
+__runq_pick(const struct scheduler *ops, const cpumask_t *mask)
{
struct list_head *runq = rt_runq(ops);
struct list_head *iter;
@@ -780,10 +856,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
}
else
{
- cpumask_t cur_cpu;
- cpumask_clear(&cur_cpu);
- cpumask_set_cpu(cpu, &cur_cpu);
- snext = __runq_pick(ops, &cur_cpu);
+ snext = __runq_pick(ops, cpumask_of(cpu));
if ( snext == NULL )
snext = rt_vcpu(idle_vcpu[cpu]);
@@ -850,6 +923,7 @@ rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
struct rt_vcpu * const svc = rt_vcpu(vc);
BUG_ON( is_idle_vcpu(vc) );
+ SCHED_STAT_CRANK(vcpu_sleep);
if ( curr_on_cpu(vc->processor) == vc )
cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
@@ -927,6 +1001,7 @@ runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
}
/* didn't tickle any cpu */
+ SCHED_STAT_CRANK(tickle_idlers_none);
return;
out:
/* TRACE */
@@ -942,6 +1017,7 @@ out:
}
cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
+ SCHED_STAT_CRANK(tickle_idlers_some);
cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
return;
}
@@ -965,11 +1041,22 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
BUG_ON( is_idle_vcpu(vc) );
if ( unlikely(curr_on_cpu(vc->processor) == vc) )
+ {
+ SCHED_STAT_CRANK(vcpu_wake_running);
return;
+ }
/* on RunQ/DepletedQ, just update info is ok */
if ( unlikely(__vcpu_on_q(svc)) )
+ {
+ SCHED_STAT_CRANK(vcpu_wake_onrunq);
return;
+ }
+
+ if ( likely(vcpu_runnable(vc)) )
+ SCHED_STAT_CRANK(vcpu_wake_runnable);
+ else
+ SCHED_STAT_CRANK(vcpu_wake_not_runnable);
/* If context hasn't been saved for this vcpu yet, we can't put it on
* the Runqueue/DepletedQ. Instead, we set a flag so that it will be
@@ -1093,6 +1180,7 @@ const struct scheduler sched_rtds_def = {
.init = rt_init,
.deinit = rt_deinit,
.alloc_pdata = rt_alloc_pdata,
+ .free_pdata = rt_free_pdata,
.alloc_domdata = rt_alloc_domdata,
.free_domdata = rt_free_domdata,
.init_domain = rt_dom_init,
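
The _cpumask_scratch changes above follow an "allocate on first init, free on last deinit" pattern, with a plain counter instead of an atomic refcount because, as the patch comment notes, init/deinit only happen at boot or under the cpupool lock. A self-contained sketch of the same pattern, where the names and NR_CPUS are illustrative, not the Xen ones:

    #include <stdlib.h>

    typedef unsigned long mask_t;   /* stand-in for cpumask_var_t */
    #define NR_CPUS 64              /* illustrative constant      */

    static mask_t *scratch;         /* one scratch mask per CPU   */
    static unsigned int nr_instances;

    /* Called once per scheduler instance; only the first call allocates. */
    static int instance_init(void)
    {
        if ( scratch == NULL )
        {
            scratch = calloc(NR_CPUS, sizeof(*scratch));
            if ( scratch == NULL )
                return -1;          /* -ENOMEM in the real code */
        }
        nr_instances++;
        return 0;
    }

    /* Called once per instance teardown; only the last call frees. */
    static void instance_deinit(void)
    {
        if ( --nr_instances == 0 )
        {
            free(scratch);
            scratch = NULL;
        }
    }
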
diff --git a/xen/common/sched_sedf.c b/xen/common/sched_sedf.c
deleted file mode 100644
index 7c80bad..0000000
--- a/xen/common/sched_sedf.c
+++ /dev/null
@@ -1,1541 +0,0 @@
-/******************************************************************************
- * Simple EDF scheduler for xen
- *
- * by Stephan Diestelhorst (C) 2004 Cambridge University
- * based on code by Mark Williamson (C) 2004 Intel Research Cambridge
- */
-
-#include <xen/lib.h>
-#include <xen/sched.h>
-#include <xen/sched-if.h>
-#include <xen/timer.h>
-#include <xen/softirq.h>
-#include <xen/time.h>
-#include <xen/errno.h>
-
-#ifndef NDEBUG
-#define SEDF_STATS
-#define CHECK(_p) \
- do { \
- if ( !(_p) ) \
- printk("Check '%s' failed, line %d, file %s\n", \
- #_p , __LINE__, __FILE__); \
- } while ( 0 )
-#else
-#define CHECK(_p) ((void)0)
-#endif
-
-#define EXTRA_NONE (0)
-#define EXTRA_AWARE (1)
-#define EXTRA_RUN_PEN (2)
-#define EXTRA_RUN_UTIL (4)
-#define EXTRA_WANT_PEN_Q (8)
-#define EXTRA_PEN_Q (0)
-#define EXTRA_UTIL_Q (1)
-#define SEDF_ASLEEP (16)
-
-#define EXTRA_QUANTUM (MICROSECS(500))
-#define WEIGHT_PERIOD (MILLISECS(100))
-#define WEIGHT_SAFETY (MILLISECS(5))
-
-#define PERIOD_MAX MILLISECS(10000) /* 10s */
-#define PERIOD_MIN (MICROSECS(10)) /* 10us */
-#define SLICE_MIN (MICROSECS(5)) /* 5us */
-
-#define IMPLY(a, b) (!(a) || (b))
-#define EQ(a, b) ((!!(a)) == (!!(b)))
-
-
-struct sedf_dom_info {
- struct domain *domain;
-};
-
-struct sedf_priv_info {
- /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
- spinlock_t lock;
-};
-
-struct sedf_vcpu_info {
- struct vcpu *vcpu;
- struct list_head list;
- struct list_head extralist[2];
-
- /* Parameters for EDF */
- s_time_t period; /* = relative deadline */
- s_time_t slice; /* = worst case execution time */
-
- /* Advanced Parameters */
-
- /* Latency Scaling */
- s_time_t period_orig;
- s_time_t slice_orig;
- s_time_t latency;
-
- /* Status of domain */
- int status;
- /* Weights for "Scheduling for beginners/ lazy/ etc." ;) */
- short weight;
- short extraweight;
- /* Bookkeeping */
- s_time_t deadl_abs;
- s_time_t sched_start_abs;
- s_time_t cputime;
- /* Times the domain un-/blocked */
- s_time_t block_abs;
- s_time_t unblock_abs;
-
- /* Scores for {util, block penalty}-weighted extratime distribution */
- int score[2];
- s_time_t short_block_lost_tot;
-
- /* Statistics */
- s_time_t extra_time_tot;
-
-#ifdef SEDF_STATS
- s_time_t block_time_tot;
- s_time_t penalty_time_tot;
- int block_tot;
- int short_block_tot;
- int long_block_tot;
- int pen_extra_blocks;
- int pen_extra_slices;
-#endif
-};
-
-struct sedf_cpu_info {
- struct list_head runnableq;
- struct list_head waitq;
- struct list_head extraq[2];
- s_time_t current_slice_expires;
-};
-
-#define SEDF_PRIV(_ops) \
- ((struct sedf_priv_info *)((_ops)->sched_data))
-#define EDOM_INFO(d) ((struct sedf_vcpu_info *)((d)->sched_priv))
-#define CPU_INFO(cpu) \
- ((struct sedf_cpu_info *)per_cpu(schedule_data, cpu).sched_priv)
-#define LIST(d) (&EDOM_INFO(d)->list)
-#define EXTRALIST(d,i) (&(EDOM_INFO(d)->extralist[i]))
-#define RUNQ(cpu) (&CPU_INFO(cpu)->runnableq)
-#define WAITQ(cpu) (&CPU_INFO(cpu)->waitq)
-#define EXTRAQ(cpu,i) (&(CPU_INFO(cpu)->extraq[i]))
-#define IDLETASK(cpu) (idle_vcpu[cpu])
-
-#define PERIOD_BEGIN(inf) ((inf)->deadl_abs - (inf)->period)
-
-#define DIV_UP(x,y) (((x) + (y) - 1) / y)
-
-#define extra_runs(inf) ((inf->status) & 6)
-#define extra_get_cur_q(inf) (((inf->status & 6) >> 1)-1)
-#define sedf_runnable(edom) (!(EDOM_INFO(edom)->status & SEDF_ASLEEP))
-
-
-static void sedf_dump_cpu_state(const struct scheduler *ops, int i);
-
-static inline int extraq_on(struct vcpu *d, int i)
-{
- return ((EXTRALIST(d,i)->next != NULL) &&
- (EXTRALIST(d,i)->next != EXTRALIST(d,i)));
-}
-
-static inline void extraq_del(struct vcpu *d, int i)
-{
- struct list_head *list = EXTRALIST(d,i);
- ASSERT(extraq_on(d,i));
- list_del(list);
- list->next = NULL;
- ASSERT(!extraq_on(d, i));
-}
-
-/*
- * Adds a domain to the queue of processes which are aware of extra time. List
- * is sorted by score, where a lower score means higher priority for an extra
- * slice. It also updates the score, by simply subtracting a fixed value from
- * each entry, in order to avoid overflow. The algorithm works by simply
- * charging each domain that received extratime with an inverse of its weight.
- */
-static inline void extraq_add_sort_update(struct vcpu *d, int i, int sub)
-{
- struct list_head *cur;
- struct sedf_vcpu_info *curinf;
-
- ASSERT(!extraq_on(d,i));
-
- /*
- * Iterate through all elements to find our "hole" and on our way
- * update all the other scores.
- */
- list_for_each ( cur, EXTRAQ(d->processor, i) )
- {
- curinf = list_entry(cur,struct sedf_vcpu_info,extralist[i]);
- curinf->score[i] -= sub;
- if ( EDOM_INFO(d)->score[i] < curinf->score[i] )
- break;
- }
-
- /* cur now contains the element, before which we'll enqueue */
- list_add(EXTRALIST(d,i),cur->prev);
-
- /* Continue updating the extraq */
- if ( (cur != EXTRAQ(d->processor,i)) && sub )
- {
- for ( cur = cur->next; cur != EXTRAQ(d->processor,i); cur = cur->next )
- {
- curinf = list_entry(cur,struct sedf_vcpu_info, extralist[i]);
- curinf->score[i] -= sub;
- }
- }
-
- ASSERT(extraq_on(d,i));
-}
-static inline void extraq_check(struct vcpu *d)
-{
- if ( extraq_on(d, EXTRA_UTIL_Q) )
- {
- if ( !(EDOM_INFO(d)->status & EXTRA_AWARE) &&
- !extra_runs(EDOM_INFO(d)) )
- extraq_del(d, EXTRA_UTIL_Q);
- }
- else
- {
- if ( (EDOM_INFO(d)->status & EXTRA_AWARE) && sedf_runnable(d) )
- extraq_add_sort_update(d, EXTRA_UTIL_Q, 0);
- }
-}
-
-static inline void extraq_check_add_unblocked(struct vcpu *d, int priority)
-{
- struct sedf_vcpu_info *inf = EDOM_INFO(d);
-
- if ( inf->status & EXTRA_AWARE )
- /* Put on the weighted extraq without updating any scores */
- extraq_add_sort_update(d, EXTRA_UTIL_Q, 0);
-}
-
-static inline int __task_on_queue(struct vcpu *d)
-{
- return (((LIST(d))->next != NULL) && (LIST(d)->next != LIST(d)));
-}
-
-static inline void __del_from_queue(struct vcpu *d)
-{
- struct list_head *list = LIST(d);
- ASSERT(__task_on_queue(d));
- list_del(list);
- list->next = NULL;
- ASSERT(!__task_on_queue(d));
-}
-
-typedef int(*list_comparer)(struct list_head* el1, struct list_head* el2);
-
-static inline void list_insert_sort(
- struct list_head *list, struct list_head *element, list_comparer comp)
-{
- struct list_head *cur;
-
- /* Iterate through all elements to find our "hole" */
- list_for_each( cur, list )
- if ( comp(element, cur) < 0 )
- break;
-
- /* cur now contains the element, before which we'll enqueue */
- list_add(element, cur->prev);
-}
-
-#define DOMAIN_COMPARER(name, field, comp1, comp2) \
-static int name##_comp(struct list_head* el1, struct list_head* el2) \
-{ \
- struct sedf_vcpu_info *d1, *d2; \
- d1 = list_entry(el1,struct sedf_vcpu_info, field); \
- d2 = list_entry(el2,struct sedf_vcpu_info, field); \
- if ( (comp1) == (comp2) ) \
- return 0; \
- if ( (comp1) < (comp2) ) \
- return -1; \
- else \
- return 1; \
-}
-
-/*
- * Adds a domain to the queue of processes which wait for the beginning of the
- * next period; this list is therefore sorted by this time, which is simply
- * the absolute deadline minus the period.
- */
-DOMAIN_COMPARER(waitq, list, PERIOD_BEGIN(d1), PERIOD_BEGIN(d2));
-static inline void __add_to_waitqueue_sort(struct vcpu *v)
-{
- ASSERT(!__task_on_queue(v));
- list_insert_sort(WAITQ(v->processor), LIST(v), waitq_comp);
- ASSERT(__task_on_queue(v));
-}
-
-/*
- * Adds a domain to the queue of processes which have started their current
- * period and are runnable (i.e. not blocked, dying, ...). The first element
- * on this list is running on the processor; if the list is empty, the idle
- * task will run. As we are implementing EDF, this list is sorted by deadlines.
- */
-DOMAIN_COMPARER(runq, list, d1->deadl_abs, d2->deadl_abs);
-static inline void __add_to_runqueue_sort(struct vcpu *v)
-{
- list_insert_sort(RUNQ(v->processor), LIST(v), runq_comp);
-}
-
-
-static void sedf_insert_vcpu(const struct scheduler *ops, struct vcpu *v)
-{
- if ( !is_idle_vcpu(v) )
- {
- extraq_check(v);
- }
- else
- {
- EDOM_INFO(v)->deadl_abs = 0;
- EDOM_INFO(v)->status &= ~SEDF_ASLEEP;
- }
-}
-
-static void *sedf_alloc_vdata(const struct scheduler *ops, struct vcpu *v, void *dd)
-{
- struct sedf_vcpu_info *inf;
-
- inf = xzalloc(struct sedf_vcpu_info);
- if ( inf == NULL )
- return NULL;
-
- inf->vcpu = v;
-
- /* Every VCPU gets an equal share of extratime by default */
- inf->deadl_abs = 0;
- inf->latency = 0;
- inf->status = EXTRA_AWARE | SEDF_ASLEEP;
- inf->extraweight = 1;
- /* Upon creation all domains are best-effort */
- inf->period = WEIGHT_PERIOD;
- inf->slice = 0;
-
- inf->period_orig = inf->period; inf->slice_orig = inf->slice;
- INIT_LIST_HEAD(&(inf->list));
- INIT_LIST_HEAD(&(inf->extralist[EXTRA_PEN_Q]));
- INIT_LIST_HEAD(&(inf->extralist[EXTRA_UTIL_Q]));
-
- SCHED_STAT_CRANK(vcpu_init);
-
- return inf;
-}
-
-static void *
-sedf_alloc_pdata(const struct scheduler *ops, int cpu)
-{
- struct sedf_cpu_info *spc;
-
- spc = xzalloc(struct sedf_cpu_info);
- BUG_ON(spc == NULL);
- INIT_LIST_HEAD(&spc->waitq);
- INIT_LIST_HEAD(&spc->runnableq);
- INIT_LIST_HEAD(&spc->extraq[EXTRA_PEN_Q]);
- INIT_LIST_HEAD(&spc->extraq[EXTRA_UTIL_Q]);
-
- return (void *)spc;
-}
-
-static void
-sedf_free_pdata(const struct scheduler *ops, void *spc, int cpu)
-{
- if ( spc == NULL )
- return;
-
- xfree(spc);
-}
-
-static void sedf_free_vdata(const struct scheduler *ops, void *priv)
-{
- xfree(priv);
-}
-
-static void *
-sedf_alloc_domdata(const struct scheduler *ops, struct domain *d)
-{
- return xzalloc(struct sedf_dom_info);
-}
-
-static int sedf_init_domain(const struct scheduler *ops, struct domain *d)
-{
- d->sched_priv = sedf_alloc_domdata(ops, d);
- if ( d->sched_priv == NULL )
- return -ENOMEM;
-
- return 0;
-}
-
-static void sedf_free_domdata(const struct scheduler *ops, void *data)
-{
- xfree(data);
-}
-
-static void sedf_destroy_domain(const struct scheduler *ops, struct domain *d)
-{
- sedf_free_domdata(ops, d->sched_priv);
-}
-
-static int sedf_pick_cpu(const struct scheduler *ops, struct vcpu *v)
-{
- cpumask_t online_affinity;
- cpumask_t *online;
-
- online = cpupool_scheduler_cpumask(v->domain->cpupool);
- cpumask_and(&online_affinity, v->cpu_hard_affinity, online);
- return cpumask_cycle(v->vcpu_id % cpumask_weight(&online_affinity) - 1,
- &online_affinity);
-}
-
-/*
- * Handles the rescheduling & bookkeeping of domains running in their
- * guaranteed timeslice.
- */
-static void desched_edf_dom(s_time_t now, struct vcpu* d)
-{
- struct sedf_vcpu_info* inf = EDOM_INFO(d);
-
- /* Current domain is running in real time mode */
- ASSERT(__task_on_queue(d));
-
- /* Update the domain's cputime */
- inf->cputime += now - inf->sched_start_abs;
-
- /* Scheduling decisions which don't remove the running domain from
- * the runq */
- if ( (inf->cputime < inf->slice) && sedf_runnable(d) )
- return;
-
- __del_from_queue(d);
-
- /*
- * Manage bookkeeping (i.e. calculate next deadline, memorise
- * overrun-time of slice) of finished domains.
- */
- if ( inf->cputime >= inf->slice )
- {
- inf->cputime -= inf->slice;
-
- if ( inf->period < inf->period_orig )
- {
- /* This domain runs in latency scaling or burst mode */
- inf->period *= 2;
- inf->slice *= 2;
- if ( (inf->period > inf->period_orig) ||
- (inf->slice > inf->slice_orig) )
- {
- /* Reset slice and period */
- inf->period = inf->period_orig;
- inf->slice = inf->slice_orig;
- }
- }
-
- /* Set next deadline */
- inf->deadl_abs += inf->period;
- }
-
- /* Add a runnable domain to the waitqueue */
- if ( sedf_runnable(d) )
- {
- __add_to_waitqueue_sort(d);
- }
- else
- {
- /* We have a blocked realtime task -> remove it from exqs too */
- if ( extraq_on(d, EXTRA_PEN_Q) )
- extraq_del(d, EXTRA_PEN_Q);
- if ( extraq_on(d, EXTRA_UTIL_Q) )
- extraq_del(d, EXTRA_UTIL_Q);
- }
-
- ASSERT(EQ(sedf_runnable(d), __task_on_queue(d)));
- ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q),
- sedf_runnable(d)));
-}
-
-
-/* Update all elements on the queues */
-static void update_queues(
- s_time_t now, struct list_head *runq, struct list_head *waitq)
-{
- struct list_head *cur, *tmp;
- struct sedf_vcpu_info *curinf;
-
- /*
- * Check for the first elements of the waitqueue, whether their
- * next period has already started.
- */
- list_for_each_safe ( cur, tmp, waitq )
- {
- curinf = list_entry(cur, struct sedf_vcpu_info, list);
- if ( PERIOD_BEGIN(curinf) > now )
- break;
- __del_from_queue(curinf->vcpu);
- __add_to_runqueue_sort(curinf->vcpu);
- }
-
- /* Process the runq, finding domains that are on it but shouldn't be */
- list_for_each_safe ( cur, tmp, runq )
- {
- curinf = list_entry(cur,struct sedf_vcpu_info,list);
-
- if ( unlikely(curinf->slice == 0) )
- {
- /* Ignore domains with empty slice */
- __del_from_queue(curinf->vcpu);
-
- /* Move them to their next period */
- curinf->deadl_abs += curinf->period;
-
- /* Ensure that the start of the next period is in the future */
- if ( unlikely(PERIOD_BEGIN(curinf) < now) )
- curinf->deadl_abs +=
- (DIV_UP(now - PERIOD_BEGIN(curinf),
- curinf->period)) * curinf->period;
-
- /* Put them back into the queue */
- __add_to_waitqueue_sort(curinf->vcpu);
- }
- else if ( unlikely((curinf->deadl_abs < now) ||
- (curinf->cputime > curinf->slice)) )
- {
- /*
- * We missed the deadline or the slice was already finished.
- * Might happen because of dom_adj.
- */
- printk("\tDomain %i.%i exceeded it's deadline/"
- "slice (%"PRIu64" / %"PRIu64") now: %"PRIu64
- " cputime: %"PRIu64"\n",
- curinf->vcpu->domain->domain_id,
- curinf->vcpu->vcpu_id,
- curinf->deadl_abs, curinf->slice, now,
- curinf->cputime);
- __del_from_queue(curinf->vcpu);
-
- /* Common case: we miss one period */
- curinf->deadl_abs += curinf->period;
-
- /*
- * If we are still behind: modulo arithmetic, force deadline
- * to be in future and aligned to period borders.
- */
- if ( unlikely(curinf->deadl_abs < now) )
- curinf->deadl_abs +=
- DIV_UP(now - curinf->deadl_abs,
- curinf->period) * curinf->period;
- ASSERT(curinf->deadl_abs >= now);
-
- /* Give a fresh slice */
- curinf->cputime = 0;
- if ( PERIOD_BEGIN(curinf) > now )
- __add_to_waitqueue_sort(curinf->vcpu);
- else
- __add_to_runqueue_sort(curinf->vcpu);
- }
- else
- break;
- }
-}
-
-
-/*
- * removes a domain from the head of the corresponding extraQ and
- * requeues it at a specified position:
- * round-robin extratime: end of extraQ
- * weighted ext.: insert in sorted list by score
- * if the domain is blocked / has regained its short-block-loss
- * time it is not put on any queue.
- */
-static void desched_extra_dom(s_time_t now, struct vcpu *d)
-{
- struct sedf_vcpu_info *inf = EDOM_INFO(d);
- int i = extra_get_cur_q(inf);
- unsigned long oldscore;
-
- ASSERT(extraq_on(d, i));
-
- /* Unset all running flags */
- inf->status &= ~(EXTRA_RUN_PEN | EXTRA_RUN_UTIL);
- /* Fresh slice for the next run */
- inf->cputime = 0;
- /* Accumulate total extratime */
- inf->extra_time_tot += now - inf->sched_start_abs;
- /* Remove extradomain from head of the queue. */
- extraq_del(d, i);
-
- /* Update the score */
- oldscore = inf->score[i];
- if ( i == EXTRA_PEN_Q )
- {
- /* Domain was running in L0 extraq */
- /* reduce block lost, probably more sophistication here!*/
- /*inf->short_block_lost_tot -= EXTRA_QUANTUM;*/
- inf->short_block_lost_tot -= now - inf->sched_start_abs;
-#if 0
- /* KAF: If we don't exit short-blocking state at this point
- * domain0 can steal all CPU for up to 10 seconds before
- * scheduling settles down (when competing against another
- * CPU-bound domain). Doing this seems to make things behave
- * nicely. No one gets starved by default.
- */
- if ( inf->short_block_lost_tot <= 0 )
-#endif
- {
- /* We have (over-)compensated our block penalty */
- inf->short_block_lost_tot = 0;
- /* We don't want a place on the penalty queue anymore! */
- inf->status &= ~EXTRA_WANT_PEN_Q;
- goto check_extra_queues;
- }
-
- /*
- * We have to go again for another try in the block-extraq,
- * the score is not used incrementally here, as this is
- * already done by recalculating the block_lost
- */
- inf->score[EXTRA_PEN_Q] = (inf->period << 10) /
- inf->short_block_lost_tot;
- oldscore = 0;
- }
- else
- {
- /*
- * Domain was running in L1 extraq => score is inverse of
- * utilization and is used somewhat incremental!
- */
- if ( !inf->extraweight )
- {
- /* NB: use fixed point arithmetic with 10 bits */
- inf->score[EXTRA_UTIL_Q] = (inf->period << 10) /
- inf->slice;
- }
- else
- {
- /*
- * Conversion between realtime utilization and extraweight:
- * full (i.e. 100%) utilization is equivalent to 128 extraweight
- */
- inf->score[EXTRA_UTIL_Q] = (1<<17) / inf->extraweight;
- }
- }
-
- check_extra_queues:
- /* Adding a runnable domain to the right queue and removing blocked ones */
- if ( sedf_runnable(d) )
- {
- /* Add according to score: weighted round robin */
- if (((inf->status & EXTRA_AWARE) && (i == EXTRA_UTIL_Q)) ||
- ((inf->status & EXTRA_WANT_PEN_Q) && (i == EXTRA_PEN_Q)))
- extraq_add_sort_update(d, i, oldscore);
- }
- else
- {
- /* Remove this blocked domain from the waitq! */
- __del_from_queue(d);
- /* Make sure that we remove a blocked domain from the other
- * extraq too. */
- if ( i == EXTRA_PEN_Q )
- {
- if ( extraq_on(d, EXTRA_UTIL_Q) )
- extraq_del(d, EXTRA_UTIL_Q);
- }
- else
- {
- if ( extraq_on(d, EXTRA_PEN_Q) )
- extraq_del(d, EXTRA_PEN_Q);
- }
- }
-
- ASSERT(EQ(sedf_runnable(d), __task_on_queue(d)));
- ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q),
- sedf_runnable(d)));
-}
-
-
-static struct task_slice sedf_do_extra_schedule(
- s_time_t now, s_time_t end_xt, struct list_head *extraq[], int cpu)
-{
- struct task_slice ret = { 0 };
- struct sedf_vcpu_info *runinf;
- ASSERT(end_xt > now);
-
- /* Enough time left to use for extratime? */
- if ( end_xt - now < EXTRA_QUANTUM )
- goto return_idle;
-
- if ( !list_empty(extraq[EXTRA_PEN_Q]) )
- {
- /*
- * We still have elements on the level 0 extraq
- * => let those run first!
- */
- runinf = list_entry(extraq[EXTRA_PEN_Q]->next,
- struct sedf_vcpu_info, extralist[EXTRA_PEN_Q]);
- runinf->status |= EXTRA_RUN_PEN;
- ret.task = runinf->vcpu;
- ret.time = EXTRA_QUANTUM;
-#ifdef SEDF_STATS
- runinf->pen_extra_slices++;
-#endif
- }
- else
- {
- if ( !list_empty(extraq[EXTRA_UTIL_Q]) )
- {
- /* Use elements from the normal extraqueue */
- runinf = list_entry(extraq[EXTRA_UTIL_Q]->next,
- struct sedf_vcpu_info,
- extralist[EXTRA_UTIL_Q]);
- runinf->status |= EXTRA_RUN_UTIL;
- ret.task = runinf->vcpu;
- ret.time = EXTRA_QUANTUM;
- }
- else
- goto return_idle;
- }
-
- ASSERT(ret.time > 0);
- ASSERT(sedf_runnable(ret.task));
- return ret;
-
- return_idle:
- ret.task = IDLETASK(cpu);
- ret.time = end_xt - now;
- ASSERT(ret.time > 0);
- ASSERT(sedf_runnable(ret.task));
- return ret;
-}
-
-
-static int sedf_init(struct scheduler *ops)
-{
- struct sedf_priv_info *prv;
-
- prv = xzalloc(struct sedf_priv_info);
- if ( prv == NULL )
- return -ENOMEM;
-
- ops->sched_data = prv;
- spin_lock_init(&prv->lock);
-
- return 0;
-}
-
-
-static void sedf_deinit(const struct scheduler *ops)
-{
- struct sedf_priv_info *prv;
-
- prv = SEDF_PRIV(ops);
- if ( prv != NULL )
- xfree(prv);
-}
-
-
-/*
- * Main scheduling function
- * Reasons for calling this function are:
- * -timeslice for the current period used up
- * -domain on waitqueue has started its period
- * -and various others ;) in general: determine which domain to run next
- */
-static struct task_slice sedf_do_schedule(
- const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
-{
- int cpu = smp_processor_id();
- struct list_head *runq = RUNQ(cpu);
- struct list_head *waitq = WAITQ(cpu);
- struct sedf_vcpu_info *inf = EDOM_INFO(current);
- struct list_head *extraq[] = {
- EXTRAQ(cpu, EXTRA_PEN_Q), EXTRAQ(cpu, EXTRA_UTIL_Q)};
- struct sedf_vcpu_info *runinf, *waitinf;
- struct task_slice ret;
-
- SCHED_STAT_CRANK(schedule);
-
- /* Idle tasks don't need any of the following stuff */
- if ( is_idle_vcpu(current) )
- goto check_waitq;
-
- /*
- * Create local state of the status of the domain, in order to avoid
- * inconsistent state during scheduling decisions, because data for
- * vcpu_runnable is not protected by the scheduling lock!
- */
- if ( !vcpu_runnable(current) )
- inf->status |= SEDF_ASLEEP;
-
- if ( inf->status & SEDF_ASLEEP )
- inf->block_abs = now;
-
- if ( unlikely(extra_runs(inf)) )
- {
- /* Special treatment of domains running in extra time */
- desched_extra_dom(now, current);
- }
- else
- {
- desched_edf_dom(now, current);
- }
- check_waitq:
- update_queues(now, runq, waitq);
-
- /*
- * Now simply pick the first domain from the runqueue, which has the
- * earliest deadline, because the list is sorted
- *
- * Tasklet work (which runs in idle VCPU context) overrides all else.
- */
- if ( tasklet_work_scheduled ||
- (list_empty(runq) && list_empty(waitq)) ||
- unlikely(!cpumask_test_cpu(cpu,
- cpupool_scheduler_cpumask(per_cpu(cpupool, cpu)))) )
- {
- ret.task = IDLETASK(cpu);
- ret.time = SECONDS(1);
- }
- else if ( !list_empty(runq) )
- {
- runinf = list_entry(runq->next,struct sedf_vcpu_info,list);
- ret.task = runinf->vcpu;
- if ( !list_empty(waitq) )
- {
- waitinf = list_entry(waitq->next,
- struct sedf_vcpu_info,list);
- /*
- * Rerun the scheduler when the scheduled domain reaches the
- * end of its slice or the first domain from the waitqueue
- * gets ready.
- */
- ret.time = MIN(now + runinf->slice - runinf->cputime,
- PERIOD_BEGIN(waitinf)) - now;
- }
- else
- {
- ret.time = runinf->slice - runinf->cputime;
- }
- }
- else
- {
- waitinf = list_entry(waitq->next,struct sedf_vcpu_info, list);
- /*
- * We could not find any suitable domain
- * => look for domains that are aware of extratime
- */
- ret = sedf_do_extra_schedule(now, PERIOD_BEGIN(waitinf),
- extraq, cpu);
- }
-
- /*
- * TODO: Do something USEFUL when this happens and find out why it
- * can still happen!!!
- */
- if ( ret.time < 0)
- {
- printk("Ouch! We are seriously BEHIND schedule! %"PRIi64"\n",
- ret.time);
- ret.time = EXTRA_QUANTUM;
- }
-
- ret.migrated = 0;
-
- EDOM_INFO(ret.task)->sched_start_abs = now;
- CHECK(ret.time > 0);
- ASSERT(sedf_runnable(ret.task));
- CPU_INFO(cpu)->current_slice_expires = now + ret.time;
- return ret;
-}
-
-
-static void sedf_sleep(const struct scheduler *ops, struct vcpu *d)
-{
- if ( is_idle_vcpu(d) )
- return;
-
- EDOM_INFO(d)->status |= SEDF_ASLEEP;
-
- if ( per_cpu(schedule_data, d->processor).curr == d )
- {
- cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ);
- }
- else
- {
- if ( __task_on_queue(d) )
- __del_from_queue(d);
- if ( extraq_on(d, EXTRA_UTIL_Q) )
- extraq_del(d, EXTRA_UTIL_Q);
- if ( extraq_on(d, EXTRA_PEN_Q) )
- extraq_del(d, EXTRA_PEN_Q);
- }
-}
-
-
-/*
- * This function wakes up a domain, i.e. moves it onto the waitqueue.
- * Things to note: admission control currently takes place nowhere, so
- * we can't be sure whether it is safe to wake the domain up at all.
- * Anyway, even if it is safe (total cpu usage <= 100%) there are some
- * considerations on when to allow the domain to wake up and have its
- * first deadline...
- * I identified 3 cases which could describe the possible behaviour of
- * the scheduler, and I'll try to make them clear:
- *
- * 1. Very conservative
- * -when a blocked domain unblocks, it is allowed to start execution at
- * the beginning of the next complete period
- * (D..deadline, R..running, B..blocking/sleeping, U..unblocking/waking up
- *
- * DRRB_____D__U_____DRRRRR___D________ ...
- *
- * -this causes the domain to miss a period (and a deadline)
- * -doesn't disturb the schedule at all
- * -deadlines keep occurring isochronously
- *
- * 2. Conservative Part 1: Short Unblocking
- * -when a domain unblocks in the same period as it was blocked it
- * unblocks and may consume the rest of its original time-slice minus
- * the time it was blocked
- * (assume period=9, slice=5)
- *
- * DRB_UR___DRRRRR___D...
- *
- * -this also doesn't disturb scheduling, but might mean that
- * the domain can't finish its workload in the period
- * -in addition to that the domain can be treated prioritised when
- * extratime is available
- * -addition: experiments have shown that this may have a HUGE impact on
- * the performance of other domains, because it can lead to excessive context
- * switches
- *
- * Part2: Long Unblocking
- * Part 2a
- * -it is obvious that such accounting of block time, applied when
- * unblocking is happening in later periods, works fine as well
- * -the domain is treated as if it would have been running since the start
- * of its new period
- *
- * DRB______D___UR___D...
- *
- * Part 2b
- * -if one needs the full slice in the next period, it is necessary to
- * treat the unblocking time as the start of the new period, i.e. move
- * the deadline further back (later)
- * -this doesn't disturb scheduling as well, because for EDF periods can
- * be treated as minimal inter-release times and scheduling stays
- * correct, when deadlines are kept relative to the time the process
- * unblocks
- *
- * DRB______D___URRRR___D...
- * (D) <- old deadline was here
- * -problem: deadlines don't occur isochronous anymore
- * Part 2c (Improved Atropos design)
- * -when a domain unblocks it is given a very short period (=latency hint)
- * and slice length scaled accordingly
- * -both rise again to the original value (e.g. get doubled every period)
- *
- * 3. Unconservative (i.e. incorrect)
- * -to boost the performance of I/O dependent domains it would be possible
- * to put the domain into the runnable queue immediately, and let it run
- * for the remainder of the slice of the current period
- * (or even worse: allocate a new full slice for the domain)
- * -either behaviour can lead to missed deadlines in other domains as
- * opposed to approaches 1,2a,2b
- */
-static void unblock_short_extra_support(
- struct sedf_vcpu_info* inf, s_time_t now)
-{
- /*
- * This unblocking scheme tries to support the domain, by assigning it
- * a priority in extratime distribution according to the loss of time
- * in this slice due to blocking
- */
- s_time_t pen;
-
- /* No more realtime execution in this period! */
- inf->deadl_abs += inf->period;
- if ( likely(inf->block_abs) )
- {
- /* Treat blocked time as consumed by the domain */
- /*inf->cputime += now - inf->block_abs;*/
- /*
- * Penalty is time the domain would have
- * had if it continued to run.
- */
- pen = (inf->slice - inf->cputime);
- if ( pen < 0 )
- pen = 0;
- /* Accumulate all penalties over the periods */
- /*inf->short_block_lost_tot += pen;*/
- /* Set penalty to the current value */
- inf->short_block_lost_tot = pen;
- /* Not sure which one is better.. but seems to work well... */
-
- if ( inf->short_block_lost_tot )
- {
- inf->score[0] = (inf->period << 10) /
- inf->short_block_lost_tot;
-#ifdef SEDF_STATS
- inf->pen_extra_blocks++;
-#endif
- if ( extraq_on(inf->vcpu, EXTRA_PEN_Q) )
- /* Remove domain for possible resorting! */
- extraq_del(inf->vcpu, EXTRA_PEN_Q);
- else
- /*
- * Remember that we want to be on the penalty q
- * so that we can continue when we (un-)block
- * in penalty-extratime
- */
- inf->status |= EXTRA_WANT_PEN_Q;
-
- /* (re-)add domain to the penalty extraq */
- extraq_add_sort_update(inf->vcpu, EXTRA_PEN_Q, 0);
- }
- }
-
- /* Give it a fresh slice in the next period! */
- inf->cputime = 0;
-}
-
-
-static void unblock_long_cons_b(struct sedf_vcpu_info* inf,s_time_t now)
-{
- /* Conservative 2b */
-
- /* Treat the unblocking time as a start of a new period */
- inf->deadl_abs = now + inf->period;
- inf->cputime = 0;
-}
-
-
-#define DOMAIN_EDF 1
-#define DOMAIN_EXTRA_PEN 2
-#define DOMAIN_EXTRA_UTIL 3
-#define DOMAIN_IDLE 4
-static inline int get_run_type(struct vcpu* d)
-{
- struct sedf_vcpu_info* inf = EDOM_INFO(d);
- if (is_idle_vcpu(d))
- return DOMAIN_IDLE;
- if (inf->status & EXTRA_RUN_PEN)
- return DOMAIN_EXTRA_PEN;
- if (inf->status & EXTRA_RUN_UTIL)
- return DOMAIN_EXTRA_UTIL;
- return DOMAIN_EDF;
-}
-
-
-/*
- * Compares two domains with respect to whether one is allowed to
- * interrupt the other's execution.
- * It returns true (!=0) if a switch to the other domain is good.
- * Current Priority scheme is as follows:
- * EDF > L0 (penalty based) extra-time >
- * L1 (utilization) extra-time > idle-domain
- * In the same class priorities are assigned as following:
- * EDF: early deadline > late deadline
- * L0 extra-time: lower score > higher score
- */
-static inline int should_switch(struct vcpu *cur,
- struct vcpu *other,
- s_time_t now)
-{
- struct sedf_vcpu_info *cur_inf, *other_inf;
- cur_inf = EDOM_INFO(cur);
- other_inf = EDOM_INFO(other);
-
- /* Check whether we need to make an earlier scheduling decision */
- if ( PERIOD_BEGIN(other_inf) <
- CPU_INFO(other->processor)->current_slice_expires )
- return 1;
-
- /* No timing-based switches need to be taken into account here */
- switch ( get_run_type(cur) )
- {
- case DOMAIN_EDF:
- /* Do not interrupt a running EDF domain */
- return 0;
- case DOMAIN_EXTRA_PEN:
- /* Check whether we also want the L0 ex-q with lower score */
- return ((other_inf->status & EXTRA_WANT_PEN_Q) &&
- (other_inf->score[EXTRA_PEN_Q] <
- cur_inf->score[EXTRA_PEN_Q]));
- case DOMAIN_EXTRA_UTIL:
- /* Check whether we want the L0 extraq. Don't
- * switch if both domains want L1 extraq. */
- return !!(other_inf->status & EXTRA_WANT_PEN_Q);
- case DOMAIN_IDLE:
- return 1;
- }
-
- return 1;
-}
-
-static void sedf_wake(const struct scheduler *ops, struct vcpu *d)
-{
- s_time_t now = NOW();
- struct sedf_vcpu_info* inf = EDOM_INFO(d);
-
- if ( unlikely(is_idle_vcpu(d)) )
- return;
-
- if ( unlikely(__task_on_queue(d)) )
- return;
-
- ASSERT(!sedf_runnable(d));
- inf->status &= ~SEDF_ASLEEP;
- ASSERT(!extraq_on(d, EXTRA_UTIL_Q));
- ASSERT(!extraq_on(d, EXTRA_PEN_Q));
-
- if ( unlikely(inf->deadl_abs == 0) )
- {
- /* Initial setup of the deadline */
- inf->deadl_abs = now + inf->slice;
- }
-
-#ifdef SEDF_STATS
- inf->block_tot++;
-#endif
-
- if ( unlikely(now < PERIOD_BEGIN(inf)) )
- {
- /* Unblocking in extra-time! */
- if ( inf->status & EXTRA_WANT_PEN_Q )
- {
- /*
- * We have a domain that wants compensation
- * for block penalty and did just block in
- * its compensation time. Give it another
- * chance!
- */
- extraq_add_sort_update(d, EXTRA_PEN_Q, 0);
- }
- extraq_check_add_unblocked(d, 0);
- }
- else
- {
- if ( now < inf->deadl_abs )
- {
- /* Short blocking */
-#ifdef SEDF_STATS
- inf->short_block_tot++;
-#endif
- unblock_short_extra_support(inf, now);
-
- extraq_check_add_unblocked(d, 1);
- }
- else
- {
- /* Long unblocking */
-#ifdef SEDF_STATS
- inf->long_block_tot++;
-#endif
- unblock_long_cons_b(inf, now);
-
- extraq_check_add_unblocked(d, 1);
- }
- }
-
- if ( PERIOD_BEGIN(inf) > now )
- __add_to_waitqueue_sort(d);
- else
- __add_to_runqueue_sort(d);
-
-#ifdef SEDF_STATS
- /* Do some statistics here... */
- if ( inf->block_abs != 0 )
- {
- inf->block_time_tot += now - inf->block_abs;
- inf->penalty_time_tot +=
- PERIOD_BEGIN(inf) + inf->cputime - inf->block_abs;
- }
-#endif
-
- /* Sanity check: make sure each extra-aware domain IS on the util-q! */
- ASSERT(IMPLY(inf->status & EXTRA_AWARE, extraq_on(d, EXTRA_UTIL_Q)));
- ASSERT(__task_on_queue(d));
- /*
- * Check whether the awakened task needs to invoke the do_schedule
- * routine. Try to avoid unnecessary runs but:
- * Safe approximation: Always switch to the scheduler!
- */
- ASSERT(d->processor >= 0);
- ASSERT(d->processor < nr_cpu_ids);
- ASSERT(per_cpu(schedule_data, d->processor).curr);
-
- if ( should_switch(per_cpu(schedule_data, d->processor).curr, d, now) )
- cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ);
-}
-
-
-/* Print a lot of useful information about a domain in the system */
-static void sedf_dump_domain(struct vcpu *d)
-{
- printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id,
- d->is_running ? 'T':'F');
- printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu"
- " sc=%i xtr(%s)=%"PRIu64" ew=%hu",
- EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs,
- EDOM_INFO(d)->weight,
- EDOM_INFO(d)->score[EXTRA_UTIL_Q],
- (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no",
- EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight);
-
-#ifdef SEDF_STATS
- if ( EDOM_INFO(d)->block_time_tot != 0 )
- printk(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) /
- EDOM_INFO(d)->block_time_tot);
- if ( EDOM_INFO(d)->block_tot != 0 )
- printk("\n blks=%u sh=%u (%u%%) (shex=%i "\
- "shexsl=%i) l=%u (%u%%) avg: b=%"PRIu64" p=%"PRIu64"",
- EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_block_tot,
- (EDOM_INFO(d)->short_block_tot * 100) / EDOM_INFO(d)->block_tot,
- EDOM_INFO(d)->pen_extra_blocks,
- EDOM_INFO(d)->pen_extra_slices,
- EDOM_INFO(d)->long_block_tot,
- (EDOM_INFO(d)->long_block_tot * 100) / EDOM_INFO(d)->block_tot,
- (EDOM_INFO(d)->block_time_tot) / EDOM_INFO(d)->block_tot,
- (EDOM_INFO(d)->penalty_time_tot) / EDOM_INFO(d)->block_tot);
-#endif
- printk("\n");
-}
-
-
-/* Dumps all domains on the specified cpu */
-static void sedf_dump_cpu_state(const struct scheduler *ops, int i)
-{
- struct list_head *list, *queue, *tmp;
- struct sedf_vcpu_info *d_inf;
- struct domain *d;
- struct vcpu *ed;
- int loop = 0;
-
- printk("now=%"PRIu64"\n",NOW());
- queue = RUNQ(i);
- printk("RUNQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue,
- (unsigned long) queue->next, (unsigned long) queue->prev);
- list_for_each_safe ( list, tmp, queue )
- {
- printk("%3d: ",loop++);
- d_inf = list_entry(list, struct sedf_vcpu_info, list);
- sedf_dump_domain(d_inf->vcpu);
- }
-
- queue = WAITQ(i); loop = 0;
- printk("\nWAITQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue,
- (unsigned long) queue->next, (unsigned long) queue->prev);
- list_for_each_safe ( list, tmp, queue )
- {
- printk("%3d: ",loop++);
- d_inf = list_entry(list, struct sedf_vcpu_info, list);
- sedf_dump_domain(d_inf->vcpu);
- }
-
- queue = EXTRAQ(i,EXTRA_PEN_Q); loop = 0;
- printk("\nEXTRAQ (penalty) rq %lx n: %lx, p: %lx\n",
- (unsigned long)queue, (unsigned long) queue->next,
- (unsigned long) queue->prev);
- list_for_each_safe ( list, tmp, queue )
- {
- d_inf = list_entry(list, struct sedf_vcpu_info,
- extralist[EXTRA_PEN_Q]);
- printk("%3d: ",loop++);
- sedf_dump_domain(d_inf->vcpu);
- }
-
- queue = EXTRAQ(i,EXTRA_UTIL_Q); loop = 0;
- printk("\nEXTRAQ (utilization) rq %lx n: %lx, p: %lx\n",
- (unsigned long)queue, (unsigned long) queue->next,
- (unsigned long) queue->prev);
- list_for_each_safe ( list, tmp, queue )
- {
- d_inf = list_entry(list, struct sedf_vcpu_info,
- extralist[EXTRA_UTIL_Q]);
- printk("%3d: ",loop++);
- sedf_dump_domain(d_inf->vcpu);
- }
-
- loop = 0;
- printk("\nnot on Q\n");
-
- rcu_read_lock(&domlist_read_lock);
- for_each_domain ( d )
- {
- if ( (d->cpupool ? d->cpupool->sched : &sched_sedf_def) != ops )
- continue;
- for_each_vcpu(d, ed)
- {
- if ( !__task_on_queue(ed) && (ed->processor == i) )
- {
- printk("%3d: ",loop++);
- sedf_dump_domain(ed);
- }
- }
- }
- rcu_read_unlock(&domlist_read_lock);
-}
-
-
-/* Adjusts periods and slices of the domains according to their weights */
-static int sedf_adjust_weights(struct cpupool *c, int nr_cpus, int *sumw, s_time_t *sumt)
-{
- struct vcpu *p;
- struct domain *d;
- unsigned int cpu;
-
- /*
- * Sum across all weights. Notice that no runq locking is needed
- * here: the caller holds sedf_priv_info.lock and we're not changing
- * anything that is accessed during scheduling.
- */
- rcu_read_lock(&domlist_read_lock);
- for_each_domain_in_cpupool( d, c )
- {
- for_each_vcpu( d, p )
- {
- if ( (cpu = p->processor) >= nr_cpus )
- continue;
-
- if ( EDOM_INFO(p)->weight )
- {
- sumw[cpu] += EDOM_INFO(p)->weight;
- }
- else
- {
- /*
- * Don't modify domains who don't have a weight, but sum
- * up the time they need, projected to a WEIGHT_PERIOD,
- * so that this time is not given to the weight-driven
- * domains
- */
-
- /* Check for overflows */
- ASSERT((WEIGHT_PERIOD < ULONG_MAX)
- && (EDOM_INFO(p)->slice_orig < ULONG_MAX));
- sumt[cpu] +=
- (WEIGHT_PERIOD * EDOM_INFO(p)->slice_orig) /
- EDOM_INFO(p)->period_orig;
- }
- }
- }
- rcu_read_unlock(&domlist_read_lock);
-
- /*
- * Adjust all slices (and periods) to the new weight. Unlike above, we
- * need to take the runq lock for the various VCPUs: we're modifying
- * slice and period which are referenced during scheduling.
- */
- rcu_read_lock(&domlist_read_lock);
- for_each_domain_in_cpupool( d, c )
- {
- for_each_vcpu ( d, p )
- {
- if ( (cpu = p->processor) >= nr_cpus )
- continue;
- if ( EDOM_INFO(p)->weight )
- {
- /* Interrupts already off */
- spinlock_t *lock = vcpu_schedule_lock(p);
-
- EDOM_INFO(p)->period_orig =
- EDOM_INFO(p)->period = WEIGHT_PERIOD;
- EDOM_INFO(p)->slice_orig =
- EDOM_INFO(p)->slice =
- (EDOM_INFO(p)->weight *
- (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[cpu])) / sumw[cpu];
-
- vcpu_schedule_unlock(lock, p);
- }
- }
- }
- rcu_read_unlock(&domlist_read_lock);
-
- return 0;
-}
-
-
-/* Set or fetch domain scheduling parameters */
-static int sedf_adjust(const struct scheduler *ops, struct domain *p, struct xen_domctl_scheduler_op *op)
-{
- struct sedf_priv_info *prv = SEDF_PRIV(ops);
- unsigned long flags;
- unsigned int nr_cpus = cpumask_last(&cpu_online_map) + 1;
- int *sumw = xzalloc_array(int, nr_cpus);
- s_time_t *sumt = xzalloc_array(s_time_t, nr_cpus);
- struct vcpu *v;
- int rc = 0;
-
- /*
- * Serialize against the pluggable scheduler lock to protect from
- * concurrent updates. We need to take the runq lock for the VCPUs
- * as well, since we are touching extraweight, weight, slice and
- * period. As in sched_credit2.c, runq locks nest inside the
- * pluggable scheduler lock.
- */
- spin_lock_irqsave(&prv->lock, flags);
-
- if ( op->cmd == XEN_DOMCTL_SCHEDOP_putinfo )
- {
- /*
- * These are used in sedf_adjust_weights() but have to be allocated in
- * this function, as we need to avoid nesting xmem_pool_alloc's lock
- * within our prv->lock.
- */
- if ( !sumw || !sumt )
- {
- /* Check for errors here, the _getinfo branch doesn't care */
- rc = -ENOMEM;
- goto out;
- }
-
- /* Check for sane parameters */
- if ( !op->u.sedf.period && !op->u.sedf.weight )
- {
- rc = -EINVAL;
- goto out;
- }
-
- if ( op->u.sedf.weight )
- {
- if ( (op->u.sedf.extratime & EXTRA_AWARE) &&
- (!op->u.sedf.period) )
- {
- /* Weight-driven domains with extratime only */
- for_each_vcpu ( p, v )
- {
- /* (Here and everywhere in the following) IRQs are already off,
- * hence vcpu_schedule_lock() (not the _irq variant) is the right call. */
- spinlock_t *lock = vcpu_schedule_lock(v);
-
- EDOM_INFO(v)->extraweight = op->u.sedf.weight;
- EDOM_INFO(v)->weight = 0;
- EDOM_INFO(v)->slice = 0;
- EDOM_INFO(v)->period = WEIGHT_PERIOD;
- vcpu_schedule_unlock(lock, v);
- }
- }
- else
- {
- /* Weight-driven domains with real-time execution */
- for_each_vcpu ( p, v )
- {
- spinlock_t *lock = vcpu_schedule_lock(v);
-
- EDOM_INFO(v)->weight = op->u.sedf.weight;
- vcpu_schedule_unlock(lock, v);
- }
- }
- }
- else
- {
- /*
- * Sanity checking: note that disabling extra weight requires
- * that we set a non-zero slice.
- */
- if ( (op->u.sedf.period > PERIOD_MAX) ||
- (op->u.sedf.period < PERIOD_MIN) ||
- (op->u.sedf.slice > op->u.sedf.period) ||
- (op->u.sedf.slice < SLICE_MIN) )
- {
- rc = -EINVAL;
- goto out;
- }
-
- /* Time-driven domains */
- for_each_vcpu ( p, v )
- {
- spinlock_t *lock = vcpu_schedule_lock(v);
-
- EDOM_INFO(v)->weight = 0;
- EDOM_INFO(v)->extraweight = 0;
- EDOM_INFO(v)->period_orig =
- EDOM_INFO(v)->period = op->u.sedf.period;
- EDOM_INFO(v)->slice_orig =
- EDOM_INFO(v)->slice = op->u.sedf.slice;
- vcpu_schedule_unlock(lock, v);
- }
- }
-
- rc = sedf_adjust_weights(p->cpupool, nr_cpus, sumw, sumt);
- if ( rc )
- goto out;
-
- for_each_vcpu ( p, v )
- {
- spinlock_t *lock = vcpu_schedule_lock(v);
-
- EDOM_INFO(v)->status =
- (EDOM_INFO(v)->status &
- ~EXTRA_AWARE) | (op->u.sedf.extratime & EXTRA_AWARE);
- EDOM_INFO(v)->latency = op->u.sedf.latency;
- extraq_check(v);
- vcpu_schedule_unlock(lock, v);
- }
- }
- else if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
- {
- if ( p->vcpu[0] == NULL )
- {
- rc = -EINVAL;
- goto out;
- }
-
- op->u.sedf.period = EDOM_INFO(p->vcpu[0])->period;
- op->u.sedf.slice = EDOM_INFO(p->vcpu[0])->slice;
- op->u.sedf.extratime = EDOM_INFO(p->vcpu[0])->status & EXTRA_AWARE;
- op->u.sedf.latency = EDOM_INFO(p->vcpu[0])->latency;
- op->u.sedf.weight = EDOM_INFO(p->vcpu[0])->weight;
- }
-
-out:
- spin_unlock_irqrestore(&prv->lock, flags);
-
- xfree(sumt);
- xfree(sumw);
-
- return rc;
-}
-
-static struct sedf_priv_info _sedf_priv;
-
-const struct scheduler sched_sedf_def = {
- .name = "Simple EDF Scheduler",
- .opt_name = "sedf",
- .sched_id = XEN_SCHEDULER_SEDF,
- .sched_data = &_sedf_priv,
-
- .init_domain = sedf_init_domain,
- .destroy_domain = sedf_destroy_domain,
-
- .insert_vcpu = sedf_insert_vcpu,
-
- .alloc_vdata = sedf_alloc_vdata,
- .free_vdata = sedf_free_vdata,
- .alloc_pdata = sedf_alloc_pdata,
- .free_pdata = sedf_free_pdata,
- .alloc_domdata = sedf_alloc_domdata,
- .free_domdata = sedf_free_domdata,
-
- .init = sedf_init,
- .deinit = sedf_deinit,
-
- .do_schedule = sedf_do_schedule,
- .pick_cpu = sedf_pick_cpu,
- .dump_cpu_state = sedf_dump_cpu_state,
- .sleep = sedf_sleep,
- .wake = sedf_wake,
- .adjust = sedf_adjust,
-};
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
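
For reference, the scheduler deleted above implemented plain EDF at its core: keep runnable vcpus sorted by absolute deadline and always run the head of the queue, which is what __add_to_runqueue_sort()/list_insert_sort() achieved. A toy sketch of that selection rule, simplified from (not copied from) the deleted code:

    #include <stddef.h>
    #include <stdint.h>

    typedef int64_t s_time_t;

    struct task {
        s_time_t deadl_abs;   /* absolute deadline               */
        struct task *next;    /* runqueue link, list kept sorted */
    };

    /* Insert so the list stays ordered by deadline; the head is then
     * always the earliest-deadline task. */
    static void runq_insert(struct task **head, struct task *t)
    {
        while ( *head != NULL && (*head)->deadl_abs <= t->deadl_abs )
            head = &(*head)->next;
        t->next = *head;
        *head = t;
    }

    /* EDF pick: just take the head (NULL means "run the idle task"). */
    static struct task *runq_pick(struct task *head)
    {
        return head;
    }
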
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 6285a6e..3eefed7 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -65,7 +65,6 @@ DEFINE_PER_CPU(struct schedule_data, schedule_data);
DEFINE_PER_CPU(struct scheduler *, scheduler);
static const struct scheduler *schedulers[] = {
- &sched_sedf_def,
&sched_credit_def,
&sched_credit2_def,
&sched_arinc653_def,
@@ -185,6 +184,38 @@ uint64_t get_cpu_idle_time(unsigned int cpu)
return state.time[RUNSTATE_running];
}
+/*
+ * If locks are different, take the one with the lower address first.
+ * This avoids dead- or live-locks when this code is running on both
+ * cpus at the same time.
+ */
+static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
+ unsigned long *flags)
+{
+ if ( lock1 == lock2 )
+ {
+ spin_lock_irqsave(lock1, *flags);
+ }
+ else if ( lock1 < lock2 )
+ {
+ spin_lock_irqsave(lock1, *flags);
+ spin_lock(lock2);
+ }
+ else
+ {
+ spin_lock_irqsave(lock2, *flags);
+ spin_lock(lock1);
+ }
+}
+
+static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
+ unsigned long flags)
+{
+ if ( lock1 != lock2 )
+ spin_unlock(lock2);
+ spin_unlock_irqrestore(lock1, flags);
+}
+
int sched_init_vcpu(struct vcpu *v, unsigned int processor)
{
struct domain *d = v->domain;
@@ -260,10 +291,7 @@ int sched_move_domain(struct domain *d, struct cpupool *c)
if ( vcpu_priv[v->vcpu_id] == NULL )
{
for_each_vcpu ( d, v )
- {
- if ( vcpu_priv[v->vcpu_id] != NULL )
- xfree(vcpu_priv[v->vcpu_id]);
- }
+ xfree(vcpu_priv[v->vcpu_id]);
xfree(vcpu_priv);
SCHED_OP(c->sched, free_domdata, domdata);
return -ENOMEM;
@@ -422,6 +450,62 @@ void vcpu_unblock(struct vcpu *v)
vcpu_wake(v);
}
+/*
+ * Do the actual movement of a vcpu from old to new CPU. Locks for *both*
+ * CPUs need to have been taken already when calling this!
+ */
+static void vcpu_move_locked(struct vcpu *v, unsigned int new_cpu)
+{
+ unsigned int old_cpu = v->processor;
+
+ /*
+ * Transfer urgency status to new CPU before switching CPUs, as
+ * once the switch occurs, v->is_urgent is no longer protected by
+ * the per-CPU scheduler lock we are holding.
+ */
+ if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
+ {
+ atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
+ atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
+ }
+
+ /*
+ * Actual CPU switch to new CPU. This is safe because the lock
+ * pointer can't change while the current lock is held.
+ */
+ if ( VCPU2OP(v)->migrate )
+ SCHED_OP(VCPU2OP(v), migrate, v, new_cpu);
+ else
+ v->processor = new_cpu;
+}
+
+/*
+ * Move a vcpu from its current processor to a target new processor,
+ * without asking the scheduler to do any placement. This is intended
+ * for being called from special contexts, where things are quiet
+ * enough that no contention is supposed to happen (i.e., during
+ * shutdown or software suspend, like ACPI S3).
+ */
+static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu)
+{
+ unsigned long flags;
+ spinlock_t *lock, *new_lock;
+
+ ASSERT(system_state == SYS_STATE_suspend);
+ ASSERT(!vcpu_runnable(v) && (atomic_read(&v->pause_count) ||
+ atomic_read(&v->domain->pause_count)));
+
+ lock = per_cpu(schedule_data, v->processor).schedule_lock;
+ new_lock = per_cpu(schedule_data, new_cpu).schedule_lock;
+
+ sched_spin_lock_double(lock, new_lock, &flags);
+ ASSERT(new_cpu != v->processor);
+ vcpu_move_locked(v, new_cpu);
+ sched_spin_unlock_double(lock, new_lock, flags);
+
+ sched_move_irqs(v);
+}
+
static void vcpu_migrate(struct vcpu *v)
{
unsigned long flags;
@@ -433,31 +517,14 @@ static void vcpu_migrate(struct vcpu *v)
for ( ; ; )
{
/*
- * If per-cpu locks for old and new cpu are different, take the one
- * with the lower lock address first. This avoids dead- or live-locks
- * when this code is running on both cpus at the same time.
* We need another iteration if the pre-calculated lock addresses
* are not correct any longer after evaluating old and new cpu holding
* the locks.
*/
-
old_lock = per_cpu(schedule_data, old_cpu).schedule_lock;
new_lock = per_cpu(schedule_data, new_cpu).schedule_lock;
- if ( old_lock == new_lock )
- {
- spin_lock_irqsave(old_lock, flags);
- }
- else if ( old_lock < new_lock )
- {
- spin_lock_irqsave(old_lock, flags);
- spin_lock(new_lock);
- }
- else
- {
- spin_lock_irqsave(new_lock, flags);
- spin_lock(old_lock);
- }
+ sched_spin_lock_double(old_lock, new_lock, &flags);
old_cpu = v->processor;
if ( old_lock == per_cpu(schedule_data, old_cpu).schedule_lock )
@@ -488,9 +555,7 @@ static void vcpu_migrate(struct vcpu *v)
pick_called = 0;
}
- if ( old_lock != new_lock )
- spin_unlock(new_lock);
- spin_unlock_irqrestore(old_lock, flags);
+ sched_spin_unlock_double(old_lock, new_lock, flags);
}
/*
@@ -501,36 +566,13 @@ static void vcpu_migrate(struct vcpu *v)
if ( v->is_running ||
!test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
{
- if ( old_lock != new_lock )
- spin_unlock(new_lock);
- spin_unlock_irqrestore(old_lock, flags);
+ sched_spin_unlock_double(old_lock, new_lock, flags);
return;
}
- /*
- * Transfer urgency status to new CPU before switching CPUs, as once
- * the switch occurs, v->is_urgent is no longer protected by the per-CPU
- * scheduler lock we are holding.
- */
- if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
- {
- atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
- atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
- }
-
- /*
- * Switch to new CPU, then unlock new and old CPU. This is safe because
- * the lock pointer cant' change while the current lock is held.
- */
- if ( VCPU2OP(v)->migrate )
- SCHED_OP(VCPU2OP(v), migrate, v, new_cpu);
- else
- v->processor = new_cpu;
+ vcpu_move_locked(v, new_cpu);
-
- if ( old_lock != new_lock )
- spin_unlock(new_lock);
- spin_unlock_irqrestore(old_lock, flags);
+ sched_spin_unlock_double(old_lock, new_lock, flags);
if ( old_cpu != new_cpu )
sched_move_irqs(v);
@@ -601,12 +643,19 @@ int cpu_disable_scheduler(unsigned int cpu)
struct vcpu *v;
struct cpupool *c;
cpumask_t online_affinity;
- int ret = 0;
+ unsigned int new_cpu;
+ int ret = 0;
c = per_cpu(cpupool, cpu);
if ( c == NULL )
return ret;
+ /*
+ * We'd need the domain RCU lock, but:
+ * - when we are called from cpupool code, it's acquired there already;
+ * - when we are called for CPU teardown, we're in stop-machine context,
+ * so that's not a problem.
+ */
for_each_domain_in_cpupool ( d, c )
{
for_each_vcpu ( d, v )
@@ -630,26 +679,68 @@ int cpu_disable_scheduler(unsigned int cpu)
cpumask_setall(v->cpu_hard_affinity);
}
- if ( v->processor == cpu )
+ if ( v->processor != cpu )
{
- set_bit(_VPF_migrating, &v->pause_flags);
+ /* The vcpu is not on this cpu, so we can move on. */
vcpu_schedule_unlock_irqrestore(lock, flags, v);
- vcpu_sleep_nosync(v);
- vcpu_migrate(v);
+ continue;
+ }
+
+ /* If it is on this cpu, we must send it away. */
+ if ( unlikely(system_state == SYS_STATE_suspend) )
+ {
+ vcpu_schedule_unlock_irqrestore(lock, flags, v);
+
+ /*
+ * If we are doing a shutdown/suspend, it is not necessary to
+ * ask the scheduler to chime in. In fact:
+ * * there is no reason for it: the end result we are after
+ * is just 'all the vcpus on the boot pcpu, and no vcpu
+ * anywhere else', so let's just go for it;
+ * * it's wrong, for cpupools with only non-boot pcpus, as
+ * the scheduler would always fail to send the vcpus away
+ * from the last online (non boot) pcpu!
+ *
+ * Therefore, in the shutdown/suspend case, we just pick up
+ * one (still) online pcpu. Note that, at this stage, all
+ * domains (including dom0) have been paused already, so we
+ * do not expect any vcpu activity at all.
+ */
+ cpumask_andnot(&online_affinity, &cpu_online_map,
+ cpumask_of(cpu));
+ BUG_ON(cpumask_empty(&online_affinity));
+ /*
+ * As the boot cpu is usually pcpu #0, using cpumask_first()
+ * will make us converge more quickly.
+ */
+ new_cpu = cpumask_first(&online_affinity);
+ vcpu_move_nosched(v, new_cpu);
}
else
+ {
+ /*
+ * OTOH, if the system is still live, and we are here because
+ * we are doing some cpupool manipulations:
+ * * we want to call the scheduler, and let it re-evaluate
+ * the placement of the vcpu, taking into account the new
+ * cpupool configuration;
+ * * the scheduler will always find a suitable solution, or
+ * things would have failed before getting in here.
+ */
+ set_bit(_VPF_migrating, &v->pause_flags);
vcpu_schedule_unlock_irqrestore(lock, flags, v);
+ vcpu_sleep_nosync(v);
+ vcpu_migrate(v);
- /*
- * A vcpu active in the hypervisor will not be migratable.
- * The caller should try again after releasing and reaquiring
- * all locks.
- */
- if ( v->processor == cpu )
- ret = -EAGAIN;
+ /*
+ * The only caveat, in this case, is that a vcpu active in
+ * the hypervisor isn't migratable; the caller should then
+ * try again after releasing and reacquiring all locks.
+ */
+ if ( v->processor == cpu )
+ ret = -EAGAIN;
+ }
}
-
- domain_update_node_affinity(d);
}
return ret;
@@ -885,39 +976,6 @@ void watchdog_domain_destroy(struct domain *d)
kill_timer(&d->watchdog_timer[i]);
}
-long do_sched_op_compat(int cmd, unsigned long arg)
-{
- long ret = 0;
-
- switch ( cmd )
- {
- case SCHEDOP_yield:
- {
- ret = vcpu_yield();
- break;
- }
-
- case SCHEDOP_block:
- {
- vcpu_block_enable_events();
- break;
- }
-
- case SCHEDOP_shutdown:
- {
- TRACE_3D(TRC_SCHED_SHUTDOWN,
- current->domain->domain_id, current->vcpu_id, arg);
- domain_shutdown(current->domain, (u8)arg);
- break;
- }
-
- default:
- ret = -ENOSYS;
- }
-
- return ret;
-}
-
typedef long ret_t;
#endif /* !COMPAT */
@@ -1006,16 +1064,10 @@ ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
break;
ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d);
- if ( ret )
- {
- rcu_unlock_domain(d);
- return ret;
- }
-
- domain_shutdown(d, (u8)sched_remote_shutdown.reason);
+ if ( likely(!ret) )
+ domain_shutdown(d, sched_remote_shutdown.reason);
rcu_unlock_domain(d);
- ret = 0;
break;
}
@@ -1066,9 +1118,8 @@ long do_set_timer_op(s_time_t timeout)
* timeout in this case can burn a lot of CPU. We therefore go for a
* reasonable middleground of triggering a timer event in 100ms.
*/
- gdprintk(XENLOG_INFO,
- "Warning: huge timeout set by vcpu %d: %"PRIx64"\n",
- v->vcpu_id, (uint64_t)timeout);
+ gprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n",
+ (uint64_t)timeout);
set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
}
else
@@ -1182,6 +1233,7 @@ static void schedule(void)
{
case TASKLET_enqueued:
set_bit(_TASKLET_scheduled, tasklet_work);
+ /* fallthrough */
case TASKLET_enqueued|TASKLET_scheduled:
tasklet_work_scheduled = 1;
break;
@@ -1339,9 +1391,10 @@ static int cpu_schedule_up(unsigned int cpu)
static void cpu_schedule_down(unsigned int cpu)
{
struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+ struct scheduler *sched = per_cpu(scheduler, cpu);
if ( sd->sched_priv != NULL )
- SCHED_OP(&ops, free_pdata, sd->sched_priv, cpu);
+ SCHED_OP(sched, free_pdata, sd->sched_priv, cpu);
kill_timer(&sd->s_timer);
}
@@ -1421,7 +1474,8 @@ void __init scheduler_init(void)
sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
}
- idle_domain = domain_create(DOMID_IDLE, 0, 0);
+ /* There is no need for arch-specific configuration for an idle domain. */
+ idle_domain = domain_create(DOMID_IDLE, 0, 0, NULL);
BUG_ON(IS_ERR(idle_domain));
idle_domain->vcpu = idle_vcpu;
idle_domain->max_vcpus = nr_cpu_ids;
@@ -1513,22 +1567,29 @@ void scheduler_free(struct scheduler *sched)
void schedule_dump(struct cpupool *c)
{
- int i;
+ unsigned int i;
struct scheduler *sched;
cpumask_t *cpus;
- sched = (c == NULL) ? &ops : c->sched;
- cpus = cpupool_scheduler_cpumask(c);
- printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
- SCHED_OP(sched, dump_settings);
+ /* Locking, if necessary, must be handled within each scheduler. */
- for_each_cpu (i, cpus)
+ if ( c != NULL )
+ {
+ sched = c->sched;
+ cpus = c->cpu_valid;
+ printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
+ SCHED_OP(sched, dump_settings);
+ }
+ else
{
- spinlock_t *lock = pcpu_schedule_lock(i);
+ sched = &ops;
+ cpus = &cpupool_free_cpus;
+ }
+ for_each_cpu (i, cpus)
+ {
printk("CPU[%02d] ", i);
SCHED_OP(sched, dump_cpu_state, i);
- pcpu_schedule_unlock(lock, i);
}
}
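
Both vcpu_migrate() and the new vcpu_move_nosched() above now share sched_spin_lock_double(), which encodes the standard deadlock-avoidance idiom: when two CPUs each need the same pair of locks, both acquire the lower-addressed lock first, giving them a single global ordering. A rough standalone sketch with POSIX mutexes (hypothetical names; ordering unrelated pointers with '<' is common kernel practice, though not strict ISO C):

    #include <pthread.h>

    /* Lock two mutexes in a globally consistent (address) order. */
    static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ( a == b )
            pthread_mutex_lock(a);
        else if ( a < b )
        {
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);
        }
        else
        {
            pthread_mutex_lock(b);
            pthread_mutex_lock(a);
        }
    }

    static void unlock_double(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        /* Unlock order does not matter for deadlock avoidance. */
        if ( a != b )
            pthread_mutex_unlock(b);
        pthread_mutex_unlock(a);
    }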
diff --git a/xen/common/shutdown.c b/xen/common/shutdown.c
index 94d4c53..9cfbf7a 100644
--- a/xen/common/shutdown.c
+++ b/xen/common/shutdown.c
@@ -37,46 +37,40 @@ void hwdom_shutdown(u8 reason)
switch ( reason )
{
case SHUTDOWN_poweroff:
- {
- printk("Domain 0 halted: halting machine.\n");
+ printk("Hardware Dom%u halted: halting machine\n",
+ hardware_domain->domain_id);
machine_halt();
break; /* not reached */
- }
case SHUTDOWN_crash:
- {
debugger_trap_immediate();
- printk("Domain 0 crashed: ");
+ printk("Hardware Dom%u crashed: ", hardware_domain->domain_id);
#ifdef CONFIG_KEXEC
kexec_crash();
#endif
maybe_reboot();
break; /* not reached */
- }
case SHUTDOWN_reboot:
- {
- printk("Domain 0 shutdown: rebooting machine.\n");
+ printk("Hardware Dom%u shutdown: rebooting machine\n",
+ hardware_domain->domain_id);
machine_restart(0);
break; /* not reached */
- }
case SHUTDOWN_watchdog:
- {
- printk("Domain 0 shutdown: watchdog rebooting machine.\n");
+ printk("Hardware Dom%u shutdown: watchdog rebooting machine\n",
+ hardware_domain->domain_id);
#ifdef CONFIG_KEXEC
kexec_crash();
#endif
machine_restart(0);
break; /* not reached */
- }
default:
- {
- printk("Domain 0 shutdown (unknown reason %u): ", reason);
+ printk("Hardware Dom%u shutdown (unknown reason %u): ",
+ hardware_domain->domain_id, reason);
maybe_reboot();
break; /* not reached */
}
- }
}
diff --git a/xen/common/softirq.c b/xen/common/softirq.c
index 33d5d86..11e3345 100644
--- a/xen/common/softirq.c
+++ b/xen/common/softirq.c
@@ -88,7 +88,7 @@ void cpumask_raise_softirq(const cpumask_t *mask, unsigned int nr)
if ( !test_and_set_bit(nr, &softirq_pending(cpu)) &&
cpu != this_cpu &&
!arch_skip_send_event_check(cpu) )
- cpumask_set_cpu(cpu, raise_mask);
+ __cpumask_set_cpu(cpu, raise_mask);
if ( raise_mask == &send_mask )
smp_send_event_check_mask(raise_mask);
@@ -106,7 +106,7 @@ void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
if ( !per_cpu(batching, this_cpu) || in_irq() )
smp_send_event_check_cpu(cpu);
else
- cpumask_set_cpu(cpu, &per_cpu(batch_mask, this_cpu));
+ __cpumask_set_cpu(cpu, &per_cpu(batch_mask, this_cpu));
}
void cpu_raise_softirq_batch_begin(void)
@@ -122,7 +122,7 @@ void cpu_raise_softirq_batch_finish(void)
ASSERT(per_cpu(batching, this_cpu));
for_each_cpu ( cpu, mask )
if ( !softirq_pending(cpu) )
- cpumask_clear_cpu(cpu, mask);
+ __cpumask_clear_cpu(cpu, mask);
smp_send_event_check_mask(mask);
cpumask_clear(mask);
--per_cpu(batching, this_cpu);
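
The three changes above move to the double-underscored cpumask helpers, whose read-modify-write is plain rather than atomic; that is safe here because each mask is either per-CPU or only ever written under the calling CPU's control. A simplified single-word sketch of the distinction (C11 atomics; hypothetical names):

    #include <stdatomic.h>
    #include <stdint.h>

    /* Atomic variant: safe when other CPUs update the same word. */
    static void set_bit_atomic(_Atomic uint64_t *word, unsigned int bit)
    {
        atomic_fetch_or_explicit(word, UINT64_C(1) << bit,
                                 memory_order_relaxed);
    }

    /*
     * Non-atomic variant (the double-underscore flavour): a plain
     * load/store RMW, cheaper but only correct when the caller
     * guarantees no concurrent writers, e.g. a per-CPU mask.
     */
    static void set_bit_nonatomic(uint64_t *word, unsigned int bit)
    {
        *word |= UINT64_C(1) << bit;
    }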
diff --git a/xen/common/spinlock.c b/xen/common/spinlock.c
index f9f19a8..29149d1 100644
--- a/xen/common/spinlock.c
+++ b/xen/common/spinlock.c
@@ -115,125 +115,134 @@ void spin_debug_disable(void)
#endif
+static always_inline spinlock_tickets_t observe_lock(spinlock_tickets_t *t)
+{
+ spinlock_tickets_t v;
+
+ smp_rmb();
+ v.head_tail = read_atomic(&t->head_tail);
+ return v;
+}
+
+static always_inline u16 observe_head(spinlock_tickets_t *t)
+{
+ smp_rmb();
+ return read_atomic(&t->head);
+}
+
void _spin_lock(spinlock_t *lock)
{
+ spinlock_tickets_t tickets = SPINLOCK_TICKET_INC;
LOCK_PROFILE_VAR;
check_lock(&lock->debug);
- while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
+ tickets.head_tail = arch_fetch_and_add(&lock->tickets.head_tail,
+ tickets.head_tail);
+ while ( tickets.tail != observe_head(&lock->tickets) )
{
LOCK_PROFILE_BLOCK;
- while ( likely(_raw_spin_is_locked(&lock->raw)) )
- cpu_relax();
+ cpu_relax();
}
LOCK_PROFILE_GOT;
preempt_disable();
+ arch_lock_acquire_barrier();
}
void _spin_lock_irq(spinlock_t *lock)
{
- LOCK_PROFILE_VAR;
-
ASSERT(local_irq_is_enabled());
local_irq_disable();
- check_lock(&lock->debug);
- while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
- {
- LOCK_PROFILE_BLOCK;
- local_irq_enable();
- while ( likely(_raw_spin_is_locked(&lock->raw)) )
- cpu_relax();
- local_irq_disable();
- }
- LOCK_PROFILE_GOT;
- preempt_disable();
+ _spin_lock(lock);
}
unsigned long _spin_lock_irqsave(spinlock_t *lock)
{
unsigned long flags;
- LOCK_PROFILE_VAR;
local_irq_save(flags);
- check_lock(&lock->debug);
- while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
- {
- LOCK_PROFILE_BLOCK;
- local_irq_restore(flags);
- while ( likely(_raw_spin_is_locked(&lock->raw)) )
- cpu_relax();
- local_irq_save(flags);
- }
- LOCK_PROFILE_GOT;
- preempt_disable();
+ _spin_lock(lock);
return flags;
}
void _spin_unlock(spinlock_t *lock)
{
+ arch_lock_release_barrier();
preempt_enable();
LOCK_PROFILE_REL;
- _raw_spin_unlock(&lock->raw);
+ add_sized(&lock->tickets.head, 1);
}
void _spin_unlock_irq(spinlock_t *lock)
{
- preempt_enable();
- LOCK_PROFILE_REL;
- _raw_spin_unlock(&lock->raw);
+ _spin_unlock(lock);
local_irq_enable();
}
void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
- preempt_enable();
- LOCK_PROFILE_REL;
- _raw_spin_unlock(&lock->raw);
+ _spin_unlock(lock);
local_irq_restore(flags);
}
int _spin_is_locked(spinlock_t *lock)
{
check_lock(&lock->debug);
- return _raw_spin_is_locked(&lock->raw);
+ return lock->tickets.head != lock->tickets.tail;
}
int _spin_trylock(spinlock_t *lock)
{
+ spinlock_tickets_t old, new;
+
check_lock(&lock->debug);
- if ( !_raw_spin_trylock(&lock->raw) )
+ old = observe_lock(&lock->tickets);
+ if ( old.head != old.tail )
+ return 0;
+ new = old;
+ new.tail++;
+ if ( cmpxchg(&lock->tickets.head_tail,
+ old.head_tail, new.head_tail) != old.head_tail )
return 0;
#ifdef LOCK_PROFILE
if (lock->profile)
lock->profile->time_locked = NOW();
#endif
preempt_disable();
+ /*
+ * cmpxchg() is a full barrier so no need for an
+ * arch_lock_acquire_barrier().
+ */
return 1;
}
void _spin_barrier(spinlock_t *lock)
{
+ spinlock_tickets_t sample;
#ifdef LOCK_PROFILE
s_time_t block = NOW();
- u64 loop = 0;
+#endif
check_barrier(&lock->debug);
- do { smp_mb(); loop++;} while ( _raw_spin_is_locked(&lock->raw) );
- if ((loop > 1) && lock->profile)
+ smp_mb();
+ sample = observe_lock(&lock->tickets);
+ if ( sample.head != sample.tail )
{
- lock->profile->time_block += NOW() - block;
- lock->profile->block_cnt++;
- }
-#else
- check_barrier(&lock->debug);
- do { smp_mb(); } while ( _raw_spin_is_locked(&lock->raw) );
+ while ( observe_head(&lock->tickets) == sample.head )
+ cpu_relax();
+#ifdef LOCK_PROFILE
+ if ( lock->profile )
+ {
+ lock->profile->time_block += NOW() - block;
+ lock->profile->block_cnt++;
+ }
#endif
+ }
smp_mb();
}
int _spin_trylock_recursive(spinlock_t *lock)
{
- int cpu = smp_processor_id();
+ unsigned int cpu = smp_processor_id();
/* Don't allow overflow of recurse_cpu field. */
BUILD_BUG_ON(NR_CPUS > 0xfffu);
@@ -256,8 +265,17 @@ int _spin_trylock_recursive(spinlock_t *lock)
void _spin_lock_recursive(spinlock_t *lock)
{
- while ( !spin_trylock_recursive(lock) )
- cpu_relax();
+ unsigned int cpu = smp_processor_id();
+
+ if ( likely(lock->recurse_cpu != cpu) )
+ {
+ _spin_lock(lock);
+ lock->recurse_cpu = cpu;
+ }
+
+ /* We support only fairly shallow recursion, else the counter overflows. */
+ ASSERT(lock->recurse_cnt < 0xfu);
+ lock->recurse_cnt++;
}
void _spin_unlock_recursive(spinlock_t *lock)
@@ -313,7 +331,7 @@ unsigned long _read_lock_irqsave(rwlock_t *lock)
local_irq_restore(flags);
while ( (x = lock->lock) & RW_WRITE_FLAG )
cpu_relax();
- local_irq_save(flags);
+ local_irq_disable();
}
} while ( cmpxchg(&lock->lock, x, x+1) != x );
preempt_disable();
@@ -333,14 +351,18 @@ int _read_trylock(rwlock_t *lock)
return 1;
}
-void _read_unlock(rwlock_t *lock)
-{
- uint32_t x, y;
+#ifndef _raw_read_unlock
+# define _raw_read_unlock(l) do { \
+ uint32_t x = (l)->lock, y; \
+ while ( (y = cmpxchg(&(l)->lock, x, x - 1)) != x ) \
+ x = y; \
+} while (0)
+#endif
+inline void _read_unlock(rwlock_t *lock)
+{
preempt_enable();
- x = lock->lock;
- while ( (y = cmpxchg(&lock->lock, x, x-1)) != x )
- x = y;
+ _raw_read_unlock(lock);
}
void _read_unlock_irq(rwlock_t *lock)
@@ -409,7 +431,7 @@ unsigned long _write_lock_irqsave(rwlock_t *lock)
local_irq_restore(flags);
while ( (x = lock->lock) & RW_WRITE_FLAG )
cpu_relax();
- local_irq_save(flags);
+ local_irq_disable();
}
} while ( cmpxchg(&lock->lock, x, x|RW_WRITE_FLAG) != x );
while ( x != 0 )
@@ -434,10 +456,14 @@ int _write_trylock(rwlock_t *lock)
return 1;
}
-void _write_unlock(rwlock_t *lock)
+#ifndef _raw_write_unlock
+# define _raw_write_unlock(l) xchg(&(l)->lock, 0)
+#endif
+
+inline void _write_unlock(rwlock_t *lock)
{
preempt_enable();
- if ( cmpxchg(&lock->lock, RW_WRITE_FLAG, 0) != RW_WRITE_FLAG )
+ if ( _raw_write_unlock(lock) != RW_WRITE_FLAG )
BUG();
}
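
The rewrite above turns Xen's byte lock into a ticket lock: _spin_lock() atomically fetch-and-adds the combined head_tail word to take a ticket, spins until the head reaches that ticket, and _spin_unlock() increments the head, making lock hand-off FIFO-fair. A minimal standalone sketch of the scheme (C11 atomics, with head and tail kept in separate words for clarity; not the Xen implementation):

    #include <stdatomic.h>
    #include <stdint.h>

    struct ticketlock {
        _Atomic uint16_t head; /* ticket currently being served */
        _Atomic uint16_t tail; /* next ticket to hand out */
    };

    static void ticket_lock(struct ticketlock *l)
    {
        /* Take a ticket; fetch-and-add serialises waiters FIFO. */
        uint16_t me = atomic_fetch_add_explicit(&l->tail, 1,
                                                memory_order_relaxed);

        while ( atomic_load_explicit(&l->head,
                                     memory_order_acquire) != me )
            ; /* spin; a real lock would cpu_relax() here */
    }

    static void ticket_unlock(struct ticketlock *l)
    {
        /* Release the lock to the next ticket holder. */
        atomic_fetch_add_explicit(&l->head, 1, memory_order_release);
    }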
diff --git a/xen/common/stop_machine.c b/xen/common/stop_machine.c
index 932e5a7..cf109ba 100644
--- a/xen/common/stop_machine.c
+++ b/xen/common/stop_machine.c
@@ -16,8 +16,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/common/symbols.c b/xen/common/symbols.c
index bc2fde6..a59c59d 100644
--- a/xen/common/symbols.c
+++ b/xen/common/symbols.c
@@ -17,9 +17,11 @@
#include <xen/lib.h>
#include <xen/string.h>
#include <xen/spinlock.h>
+#include <public/platform.h>
+#include <xen/guest_access.h>
#ifdef SYMBOLS_ORIGIN
-extern const unsigned int symbols_offsets[1];
+extern const unsigned int symbols_offsets[];
#define symbols_address(n) (SYMBOLS_ORIGIN + symbols_offsets[n])
#else
extern const unsigned long symbols_addresses[];
@@ -148,3 +150,55 @@ const char *symbols_lookup(unsigned long addr,
*offset = addr - symbols_address(low);
return namebuf;
}
+
+/*
+ * Get symbol type information. This is encoded as a single char at the
+ * beginning of the symbol name.
+ */
+static char symbols_get_symbol_type(unsigned int off)
+{
+ /*
+ * Get just the first code, look it up in the token table,
+ * and return the first char from this token.
+ */
+ return symbols_token_table[symbols_token_index[symbols_names[off + 1]]];
+}
+
+int xensyms_read(uint32_t *symnum, char *type,
+ uint64_t *address, char *name)
+{
+ /*
+ * Symbols are most likely accessed sequentially so we remember position
+ * from previous read. This can help us avoid the extra call to
+ * get_symbol_offset().
+ */
+ static uint64_t next_symbol, next_offset;
+ static DEFINE_SPINLOCK(symbols_mutex);
+
+ if ( *symnum > symbols_num_syms )
+ return -ERANGE;
+ if ( *symnum == symbols_num_syms )
+ {
+ /* No more symbols */
+ name[0] = '\0';
+ return 0;
+ }
+
+ spin_lock(&symbols_mutex);
+
+ if ( *symnum == 0 )
+ next_offset = next_symbol = 0;
+ if ( next_symbol != *symnum )
+ /* Non-sequential access */
+ next_offset = get_symbol_offset(*symnum);
+
+ *type = symbols_get_symbol_type(next_offset);
+ next_offset = symbols_expand_symbol(next_offset, name);
+ *address = symbols_address(*symnum);
+
+ next_symbol = ++*symnum;
+
+ spin_unlock(&symbols_mutex);
+
+ return 0;
+}
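
xensyms_read() above caches next_symbol/next_offset across calls so that the common sequential access pattern skips the O(n) get_symbol_offset() walk; only a non-sequential request pays for the full lookup. A rough standalone sketch of the same memoised-cursor idiom (plain C; the packed record layout is hypothetical):

    #include <stdint.h>

    /*
     * Records live in a packed table where each record starts with its
     * length, so reaching record N normally costs a walk over N
     * records; caching the last (index, offset) pair makes sequential
     * reads O(1).
     */
    static const uint8_t table[] = {
        3, 'f', 'o', 'o',
        4, 'm', 'a', 'i', 'n',
        3, 'b', 'a', 'r',
    };

    static uint32_t walk_to(uint32_t index) /* O(index) */
    {
        uint32_t off = 0;

        while ( index-- )
            off += 1 + table[off];
        return off;
    }

    static uint32_t cached_index, cached_off; /* the memoised cursor */

    static uint32_t offset_of(uint32_t index) /* index < 3 here */
    {
        uint32_t off;

        if ( index == 0 )
            cached_index = cached_off = 0;
        else if ( cached_index != index ) /* non-sequential access */
            cached_off = walk_to(index);

        off = cached_off;
        /* Advance past this record so a sequential call hits the cache. */
        cached_off = off + 1 + table[off];
        cached_index = index + 1;
        return off;
    }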
diff --git a/xen/common/sysctl.c b/xen/common/sysctl.c
index 70202e8..85e853f 100644
--- a/xen/common/sysctl.c
+++ b/xen/common/sysctl.c
@@ -14,6 +14,7 @@
#include <xen/domain.h>
#include <xen/event.h>
#include <xen/domain_page.h>
+#include <xen/tmem.h>
#include <xen/trace.h>
#include <xen/console.h>
#include <xen/iocap.h>
@@ -68,7 +69,7 @@ long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl)
case XEN_SYSCTL_tbuf_op:
ret = tb_control(&op->u.tbuf_op);
break;
-
+
case XEN_SYSCTL_sched_id:
op->u.sched_id.sched_id = sched_id();
break;
@@ -144,7 +145,7 @@ long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl)
case XEN_SYSCTL_getcpuinfo:
{
uint32_t i, nr_cpus;
- struct xen_sysctl_cpuinfo cpuinfo;
+ struct xen_sysctl_cpuinfo cpuinfo = { 0 };
nr_cpus = min(op->u.getcpuinfo.max_cpus, nr_cpu_ids);
@@ -274,85 +275,123 @@ long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl)
case XEN_SYSCTL_numainfo:
{
- uint32_t i, j, max_node_index, last_online_node;
+ unsigned int i, j, num_nodes;
xen_sysctl_numainfo_t *ni = &op->u.numainfo;
+ bool_t do_meminfo = !guest_handle_is_null(ni->meminfo);
+ bool_t do_distance = !guest_handle_is_null(ni->distance);
- last_online_node = last_node(node_online_map);
- max_node_index = min_t(uint32_t, ni->max_node_index, last_online_node);
- ni->max_node_index = last_online_node;
+ num_nodes = last_node(node_online_map) + 1;
- for ( i = 0; i <= max_node_index; i++ )
+ if ( do_meminfo || do_distance )
{
- if ( !guest_handle_is_null(ni->node_to_memsize) )
- {
- uint64_t memsize = node_online(i) ?
- node_spanned_pages(i) << PAGE_SHIFT : 0ul;
- if ( copy_to_guest_offset(ni->node_to_memsize, i, &memsize, 1) )
- break;
- }
- if ( !guest_handle_is_null(ni->node_to_memfree) )
- {
- uint64_t memfree = node_online(i) ?
- avail_node_heap_pages(i) << PAGE_SHIFT : 0ul;
- if ( copy_to_guest_offset(ni->node_to_memfree, i, &memfree, 1) )
- break;
- }
+ xen_sysctl_meminfo_t meminfo = { 0 };
- if ( !guest_handle_is_null(ni->node_to_node_distance) )
+ if ( num_nodes > ni->num_nodes )
+ num_nodes = ni->num_nodes;
+ for ( i = 0; i < num_nodes; ++i )
{
- for ( j = 0; j <= max_node_index; j++)
+ static uint32_t distance[MAX_NUMNODES];
+
+ if ( do_meminfo )
{
- uint32_t distance = ~0u;
- if ( node_online(i) && node_online(j) )
- distance = __node_distance(i, j);
- if ( copy_to_guest_offset(
- ni->node_to_node_distance,
- i*(max_node_index+1) + j, &distance, 1) )
+ if ( node_online(i) )
+ {
+ meminfo.memsize = node_spanned_pages(i) << PAGE_SHIFT;
+ meminfo.memfree = avail_node_heap_pages(i) << PAGE_SHIFT;
+ }
+ else
+ meminfo.memsize = meminfo.memfree = XEN_INVALID_MEM_SZ;
+
+ if ( copy_to_guest_offset(ni->meminfo, i, &meminfo, 1) )
+ {
+ ret = -EFAULT;
break;
+ }
+ }
+
+ if ( do_distance )
+ {
+ for ( j = 0; j < num_nodes; j++ )
+ {
+ distance[j] = __node_distance(i, j);
+ if ( distance[j] == NUMA_NO_DISTANCE )
+ distance[j] = XEN_INVALID_NODE_DIST;
+ }
+
+ if ( copy_to_guest_offset(ni->distance, i * num_nodes,
+ distance, num_nodes) )
+ {
+ ret = -EFAULT;
+ break;
+ }
}
- if ( j <= max_node_index )
- break;
}
}
+ else
+ i = num_nodes;
- ret = ((i <= max_node_index) || copy_to_guest(u_sysctl, op, 1))
- ? -EFAULT : 0;
+ if ( !ret && (ni->num_nodes != i) )
+ {
+ ni->num_nodes = i;
+ if ( __copy_field_to_guest(u_sysctl, op,
+ u.numainfo.num_nodes) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+ }
}
break;
- case XEN_SYSCTL_topologyinfo:
+ case XEN_SYSCTL_cputopoinfo:
{
- uint32_t i, max_cpu_index, last_online_cpu;
- xen_sysctl_topologyinfo_t *ti = &op->u.topologyinfo;
-
- last_online_cpu = cpumask_last(&cpu_online_map);
- max_cpu_index = min_t(uint32_t, ti->max_cpu_index, last_online_cpu);
- ti->max_cpu_index = last_online_cpu;
+ unsigned int i, num_cpus;
+ xen_sysctl_cputopoinfo_t *ti = &op->u.cputopoinfo;
- for ( i = 0; i <= max_cpu_index; i++ )
+ num_cpus = cpumask_last(&cpu_online_map) + 1;
+ if ( !guest_handle_is_null(ti->cputopo) )
{
- if ( !guest_handle_is_null(ti->cpu_to_core) )
- {
- uint32_t core = cpu_online(i) ? cpu_to_core(i) : ~0u;
- if ( copy_to_guest_offset(ti->cpu_to_core, i, &core, 1) )
- break;
- }
- if ( !guest_handle_is_null(ti->cpu_to_socket) )
+ xen_sysctl_cputopo_t cputopo = { 0 };
+
+ if ( num_cpus > ti->num_cpus )
+ num_cpus = ti->num_cpus;
+ for ( i = 0; i < num_cpus; ++i )
{
- uint32_t socket = cpu_online(i) ? cpu_to_socket(i) : ~0u;
- if ( copy_to_guest_offset(ti->cpu_to_socket, i, &socket, 1) )
+ if ( cpu_present(i) )
+ {
+ cputopo.core = cpu_to_core(i);
+ cputopo.socket = cpu_to_socket(i);
+ cputopo.node = cpu_to_node(i);
+ if ( cputopo.node == NUMA_NO_NODE )
+ cputopo.node = XEN_INVALID_NODE_ID;
+ }
+ else
+ {
+ cputopo.core = XEN_INVALID_CORE_ID;
+ cputopo.socket = XEN_INVALID_SOCKET_ID;
+ cputopo.node = XEN_INVALID_NODE_ID;
+ }
+
+ if ( copy_to_guest_offset(ti->cputopo, i, &cputopo, 1) )
+ {
+ ret = -EFAULT;
break;
+ }
}
- if ( !guest_handle_is_null(ti->cpu_to_node) )
+ }
+ else
+ i = num_cpus;
+
+ if ( !ret && (ti->num_cpus != i) )
+ {
+ ti->num_cpus = i;
+ if ( __copy_field_to_guest(u_sysctl, op,
+ u.cputopoinfo.num_cpus) )
{
- uint32_t node = cpu_online(i) ? cpu_to_node(i) : ~0u;
- if ( copy_to_guest_offset(ti->cpu_to_node, i, &node, 1) )
- break;
+ ret = -EFAULT;
+ break;
}
}
-
- ret = ((i <= max_cpu_index) || copy_to_guest(u_sysctl, op, 1))
- ? -EFAULT : 0;
}
break;
@@ -362,6 +401,65 @@ long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl)
break;
#endif
+#ifdef HAS_PCI
+ case XEN_SYSCTL_pcitopoinfo:
+ {
+ xen_sysctl_pcitopoinfo_t *ti = &op->u.pcitopoinfo;
+ unsigned int i = 0;
+
+ if ( guest_handle_is_null(ti->devs) ||
+ guest_handle_is_null(ti->nodes) )
+ {
+ ret = -EINVAL;
+ break;
+ }
+
+ while ( i < ti->num_devs )
+ {
+ physdev_pci_device_t dev;
+ uint32_t node;
+ const struct pci_dev *pdev;
+
+ if ( copy_from_guest_offset(&dev, ti->devs, i, 1) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ spin_lock(&pcidevs_lock);
+ pdev = pci_get_pdev(dev.seg, dev.bus, dev.devfn);
+ if ( !pdev )
+ node = XEN_INVALID_DEV;
+ else if ( pdev->node == NUMA_NO_NODE )
+ node = XEN_INVALID_NODE_ID;
+ else
+ node = pdev->node;
+ spin_unlock(&pcidevs_lock);
+
+ if ( copy_to_guest_offset(ti->nodes, i, &node, 1) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ if ( (++i > 0x3f) && hypercall_preempt_check() )
+ break;
+ }
+
+ if ( !ret && (ti->num_devs != i) )
+ {
+ ti->num_devs = i;
+ if ( __copy_field_to_guest(u_sysctl, op, u.pcitopoinfo.num_devs) )
+ ret = -EFAULT;
+ }
+ break;
+ }
+#endif
+
+ case XEN_SYSCTL_tmem_op:
+ ret = tmem_control(&op->u.tmem_op);
+ break;
+
default:
ret = arch_do_sysctl(op, u_sysctl);
copyback = 0;
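
The reworked numainfo, cputopoinfo and pcitopoinfo handlers above all follow one calling convention: clamp the hypervisor's element count to the size of the guest's buffer, copy the array out element by element (failing with -EFAULT on a bad copy), and finally write the count actually produced back into the interface structure. A rough standalone sketch of that fill pattern (plain C, with memcpy standing in for copy_to_guest_offset(); all names hypothetical):

    #include <errno.h>
    #include <stdint.h>
    #include <string.h>

    struct topo_entry { uint32_t core, socket, node; }; /* hypothetical */

    /* Stand-in for copy_to_guest_offset(); returns 0 on success. */
    static int copy_entry_out(struct topo_entry *dst, unsigned int idx,
                              const struct topo_entry *src)
    {
        memcpy(&dst[idx], src, sizeof(*src));
        return 0;
    }

    /*
     * The shared fill pattern: clamp to the caller's buffer size, copy
     * element-wise, then report back how many entries were written so
     * the caller can detect truncation.
     */
    static int fill_topo(struct topo_entry *guest_buf,
                         unsigned int *guest_count, unsigned int avail)
    {
        unsigned int n = avail < *guest_count ? avail : *guest_count;
        unsigned int i;

        for ( i = 0; i < n; i++ )
        {
            struct topo_entry e = { .core = i, .socket = i / 4, .node = 0 };

            if ( copy_entry_out(guest_buf, i, &e) )
                return -EFAULT;
        }

        *guest_count = i; /* write the count actually filled back */
        return 0;
    }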
diff --git a/xen/common/time.c b/xen/common/time.c
index c16ff10..29fdf52 100644
--- a/xen/common/time.c
+++ b/xen/common/time.c
@@ -12,8 +12,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/common/tmem.c b/xen/common/tmem.c
index f2dc26e..0436e49 100644
--- a/xen/common/tmem.c
+++ b/xen/common/tmem.c
@@ -15,9 +15,10 @@
*/
#ifdef __XEN__
-#include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */
+#include <xen/tmem_xen.h> /* host-specific (e.g. Xen) code goes here. */
#endif
+#include <public/sysctl.h>
#include <xen/tmem.h>
#include <xen/rbtree.h>
#include <xen/radix-tree.h>
@@ -26,7 +27,7 @@
#define TMEM_SPEC_VERSION 1
-/* global statistics (none need to be locked) */
+/* Global statistics (none need to be locked). */
static unsigned long total_tmem_ops = 0;
static unsigned long errored_tmem_ops = 0;
static unsigned long total_flush_pool = 0;
@@ -68,18 +69,18 @@ struct client {
bool_t compress;
bool_t frozen;
bool_t shared_auth_required;
- /* for save/restore/migration */
+ /* For save/restore/migration. */
bool_t live_migrating;
bool_t was_frozen;
struct list_head persistent_invalidated_list;
struct tmem_page_descriptor *cur_pgp;
- /* statistics collection */
+ /* Statistics collection. */
unsigned long compress_poor, compress_nomem;
unsigned long compressed_pages;
uint64_t compressed_sum_size;
uint64_t total_cycles;
unsigned long succ_pers_puts, succ_eph_gets, succ_pers_gets;
- /* shared pool authentication */
+ /* Shared pool authentication. */
uint64_t shared_auth_uuid[MAX_GLOBAL_SHARED_POOLS][2];
};
@@ -89,7 +90,7 @@ struct share_list {
};
#define POOL_PAGESHIFT (PAGE_SHIFT - 12)
-#define OBJ_HASH_BUCKETS 256 /* must be power of two */
+#define OBJ_HASH_BUCKETS 256 /* Must be power of two. */
#define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
struct tmem_pool {
@@ -97,20 +98,20 @@ struct tmem_pool {
bool_t persistent;
bool_t is_dying;
struct client *client;
- uint64_t uuid[2]; /* 0 for private, non-zero for shared */
+ uint64_t uuid[2]; /* 0 for private, non-zero for shared. */
uint32_t pool_id;
rwlock_t pool_rwlock;
- struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
- struct list_head share_list; /* valid if shared */
- int shared_count; /* valid if shared */
- /* for save/restore/migration */
+ struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* Protected by pool_rwlock. */
+ struct list_head share_list; /* Valid if shared. */
+ int shared_count; /* Valid if shared. */
+ /* For save/restore/migration. */
struct list_head persistent_page_list;
struct tmem_page_descriptor *cur_pgp;
- /* statistics collection */
+ /* Statistics collection. */
atomic_t pgp_count;
int pgp_count_max;
- long obj_count; /* atomicity depends on pool_rwlock held for write */
- long obj_count_max;
+ long obj_count; /* Atomicity depends on pool_rwlock held for write. */
+ long obj_count_max;
unsigned long objnode_count, objnode_count_max;
uint64_t sum_life_cycles;
uint64_t sum_evicted_cycles;
@@ -124,16 +125,12 @@ struct tmem_pool {
#define is_persistent(_p) (_p->persistent)
#define is_shared(_p) (_p->shared)
-struct oid {
- uint64_t oid[3];
-};
-
struct tmem_object_root {
- struct oid oid;
- struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
- unsigned long objnode_count; /* atomicity depends on obj_spinlock */
- long pgp_count; /* atomicity depends on obj_spinlock */
- struct radix_tree_root tree_root; /* tree of pages within object */
+ struct xen_tmem_oid oid;
+ struct rb_node rb_tree_node; /* Protected by pool->pool_rwlock. */
+ unsigned long objnode_count; /* Atomicity depends on obj_spinlock. */
+ long pgp_count; /* Atomicity depends on obj_spinlock. */
+ struct radix_tree_root tree_root; /* Tree of pages within object. */
struct tmem_pool *pool;
domid_t last_client;
spinlock_t obj_spinlock;
@@ -157,23 +154,23 @@ struct tmem_page_descriptor {
};
struct tmem_object_root *obj;
} us;
- struct oid inv_oid; /* used for invalid list only */
+ struct xen_tmem_oid inv_oid; /* Used for invalid list only. */
};
pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
- else compressed data (cdata) */
+ else compressed data (cdata). */
uint32_t index;
- /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */
- uint16_t firstbyte; /* NON_SHAREABLE->pfp otherwise->pcd */
- bool_t eviction_attempted; /* CHANGE TO lifetimes? (settable) */
+ /* Must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings. */
+ uint16_t firstbyte; /* NON_SHAREABLE->pfp otherwise->pcd. */
+ bool_t eviction_attempted; /* CHANGE TO lifetimes? (settable). */
struct list_head pcd_siblings;
union {
- struct page_info *pfp; /* page frame pointer */
- char *cdata; /* compressed data */
- struct tmem_page_content_descriptor *pcd; /* page dedup */
+ struct page_info *pfp; /* Page frame pointer. */
+ char *cdata; /* Compressed data. */
+ struct tmem_page_content_descriptor *pcd; /* Page dedup. */
};
union {
uint64_t timestamp;
- uint32_t pool_id; /* used for invalid list only */
+ uint32_t pool_id; /* Used for invalid list only. */
};
};
@@ -181,21 +178,21 @@ struct tmem_page_descriptor {
struct tmem_page_content_descriptor {
union {
- struct page_info *pfp; /* page frame pointer */
- char *cdata; /* if compression_enabled */
- char *tze; /* if !compression_enabled, trailing zeroes eliminated */
+ struct page_info *pfp; /* Page frame pointer. */
+ char *cdata; /* If compression_enabled. */
+ char *tze; /* If !compression_enabled, trailing zeroes eliminated. */
};
struct list_head pgp_list;
struct rb_node pcd_rb_tree_node;
uint32_t pgp_ref_count;
- pagesize_t size; /* if compression_enabled -> 0<size<PAGE_SIZE (*cdata)
+ pagesize_t size; /* If compression_enabled -> 0<size<PAGE_SIZE (*cdata)
* else if tze, 0<=size<PAGE_SIZE, rounded up to mult of 8
- * else PAGE_SIZE -> *pfp */
+ * else PAGE_SIZE -> *pfp. */
};
-struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */
-rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */
+struct rb_root pcd_tree_roots[256]; /* Choose based on first byte of page. */
+rwlock_t pcd_tree_rwlocks[256]; /* Poor man's concurrency for now. */
-static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
+static LIST_HEAD(global_ephemeral_page_list); /* All pages in ephemeral pools. */
static LIST_HEAD(global_client_list);
@@ -212,14 +209,14 @@ PAGE_LIST_HEAD(tmem_page_list);
unsigned long tmem_page_list_pages = 0;
DEFINE_RWLOCK(tmem_rwlock);
-static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
+static DEFINE_SPINLOCK(eph_lists_spinlock); /* Protects global AND clients. */
static DEFINE_SPINLOCK(pers_lists_spinlock);
#define ASSERT_SPINLOCK(_l) ASSERT(spin_is_locked(_l))
#define ASSERT_WRITELOCK(_l) ASSERT(rw_is_write_locked(_l))
-/* global counters (should use long_atomic_t access) */
-static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
+/* Global counters (should use long_atomic_t access). */
+static long global_eph_count = 0; /* Atomicity depends on eph_lists_spinlock. */
static atomic_t global_obj_count = ATOMIC_INIT(0);
static atomic_t global_pgp_count = ATOMIC_INIT(0);
static atomic_t global_pcd_count = ATOMIC_INIT(0);
@@ -344,7 +341,7 @@ static int __init tmem_mempool_init(void)
return tmem_mempool != NULL;
}
-/* persistent pools are per-domain */
+/* Persistent pools are per-domain. */
static void *tmem_persistent_pool_page_get(unsigned long size)
{
struct page_info *pi;
@@ -368,7 +365,7 @@ static void tmem_persistent_pool_page_put(void *page_va)
}
/*
- * Page content descriptor manipulation routines
+ * Page content descriptor manipulation routines.
*/
#define NOT_SHAREABLE ((uint16_t)-1UL)
@@ -393,8 +390,10 @@ static int pcd_copy_to_client(xen_pfn_t cmfn, struct tmem_page_descriptor *pgp)
return ret;
}
-/* ensure pgp no longer points to pcd, nor vice-versa */
-/* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */
+/*
+ * Ensure pgp no longer points to pcd, nor vice-versa.
+ * Take pcd rwlock unless have_pcd_rwlock is set, always unlock when done.
+ */
static void pcd_disassociate(struct tmem_page_descriptor *pgp, struct tmem_pool *pool, bool_t have_pcd_rwlock)
{
struct tmem_page_content_descriptor *pcd = pgp->pcd;
@@ -424,30 +423,30 @@ static void pcd_disassociate(struct tmem_page_descriptor *pgp, struct tmem_pool
return;
}
- /* no more references to this pcd, recycle it and the physical page */
+ /* No more references to this pcd, recycle it and the physical page. */
ASSERT(list_empty(&pcd->pgp_list));
pcd->pfp = NULL;
- /* remove pcd from rbtree */
+ /* Remove pcd from rbtree. */
rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]);
- /* reinit the struct for safety for now */
+ /* Reinit the struct for safety for now. */
RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);
- /* now free up the pcd memory */
+ /* Now free up the pcd memory. */
tmem_free(pcd, NULL);
atomic_dec_and_assert(global_pcd_count);
if ( pgp_size != 0 && pcd_size < PAGE_SIZE )
{
- /* compressed data */
+ /* Compressed data. */
tmem_free(pcd_cdata, pool);
pcd_tot_csize -= pcd_csize;
}
else if ( pcd_size != PAGE_SIZE )
{
- /* trailing zero data */
+ /* Trailing zero data. */
pcd_tot_tze_size -= pcd_size;
if ( pcd_size )
tmem_free(pcd_tze, pool);
} else {
- /* real physical page */
+ /* Real physical page. */
if ( tmem_tze_enabled() )
pcd_tot_tze_size -= PAGE_SIZE;
if ( tmem_compression_enabled() )
@@ -488,48 +487,50 @@ static int pcd_associate(struct tmem_page_descriptor *pgp, char *cdata, pagesize
}
write_lock(&pcd_tree_rwlocks[firstbyte]);
- /* look for page match */
+ /* Look for page match. */
root = &pcd_tree_roots[firstbyte];
new = &(root->rb_node);
while ( *new )
{
pcd = container_of(*new, struct tmem_page_content_descriptor, pcd_rb_tree_node);
parent = *new;
- /* compare new entry and rbtree entry, set cmp accordingly */
+ /* Compare new entry and rbtree entry, set cmp accordingly. */
if ( cdata != NULL )
{
if ( pcd->size < PAGE_SIZE )
- /* both new entry and rbtree entry are compressed */
+ /* Both new entry and rbtree entry are compressed. */
cmp = tmem_pcd_cmp(cdata,csize,pcd->cdata,pcd->size);
else
- /* new entry is compressed, rbtree entry is not */
+ /* New entry is compressed, rbtree entry is not. */
cmp = -1;
} else if ( pcd->size < PAGE_SIZE )
- /* rbtree entry is compressed, rbtree entry is not */
+ /* Rbtree entry is compressed, new entry is not. */
cmp = 1;
else if ( tmem_tze_enabled() ) {
if ( pcd->size < PAGE_SIZE )
- /* both new entry and rbtree entry are trailing zero */
+ /* Both new entry and rbtree entry are trailing zero. */
cmp = tmem_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->tze,pcd->size);
else
- /* new entry is trailing zero, rbtree entry is not */
+ /* New entry is trailing zero, rbtree entry is not. */
cmp = tmem_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->pfp,PAGE_SIZE);
} else {
- /* both new entry and rbtree entry are full physical pages */
+ /* Both new entry and rbtree entry are full physical pages. */
ASSERT(pgp->pfp != NULL);
ASSERT(pcd->pfp != NULL);
cmp = tmem_page_cmp(pgp->pfp,pcd->pfp);
}
- /* walk tree or match depending on cmp */
+ /* Walk tree or match depending on cmp. */
if ( cmp < 0 )
new = &((*new)->rb_left);
else if ( cmp > 0 )
new = &((*new)->rb_right);
else
{
- /* match! if not compressed, free the no-longer-needed page */
- /* but if compressed, data is assumed static so don't free! */
+ /*
+ * Match! If not compressed, free the no-longer-needed page,
+ * but if compressed, data is assumed static so don't free!
+ */
if ( cdata == NULL )
tmem_free_page(pgp->us.obj->pool,pgp->pfp);
deduped_puts++;
@@ -537,7 +538,7 @@ static int pcd_associate(struct tmem_page_descriptor *pgp, char *cdata, pagesize
}
}
- /* exited while loop with no match, so alloc a pcd and put it in the tree */
+ /* Exited while loop with no match, so alloc a pcd and put it in the tree. */
if ( (pcd = tmem_malloc(sizeof(struct tmem_page_content_descriptor), NULL)) == NULL )
{
ret = -ENOMEM;
@@ -551,8 +552,8 @@ static int pcd_associate(struct tmem_page_descriptor *pgp, char *cdata, pagesize
}
}
atomic_inc_and_max(global_pcd_count);
- RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); /* is this necessary */
- INIT_LIST_HEAD(&pcd->pgp_list); /* is this necessary */
+ RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); /* Is this necessary? */
+ INIT_LIST_HEAD(&pcd->pgp_list); /* Is this necessary? */
pcd->pgp_ref_count = 0;
if ( cdata != NULL )
{
@@ -594,7 +595,7 @@ unlock:
/************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
-/* allocate a struct tmem_page_descriptor and associate it with an object */
+/* Allocate a struct tmem_page_descriptor and associate it with an object. */
static struct tmem_page_descriptor *pgp_alloc(struct tmem_object_root *obj)
{
struct tmem_page_descriptor *pgp;
@@ -638,7 +639,7 @@ static void pgp_free_data(struct tmem_page_descriptor *pgp, struct tmem_pool *po
if ( pgp->pfp == NULL )
return;
if ( tmem_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
- pcd_disassociate(pgp,pool,0); /* pgp->size lost */
+ pcd_disassociate(pgp,pool,0); /* pgp->size lost. */
else if ( pgp_size )
tmem_free(pgp->cdata, pool);
else
@@ -686,7 +687,7 @@ static void pgp_free(struct tmem_page_descriptor *pgp)
__pgp_free(pgp, pool);
}
-/* remove pgp from global/pool/client lists and free it */
+/* Remove pgp from global/pool/client lists and free it. */
static void pgp_delist_free(struct tmem_page_descriptor *pgp)
{
struct client *client;
@@ -698,7 +699,7 @@ static void pgp_delist_free(struct tmem_page_descriptor *pgp)
client = pgp->us.obj->pool->client;
ASSERT(client != NULL);
- /* Delist pgp */
+ /* Delist pgp. */
if ( !is_persistent(pgp->us.obj->pool) )
{
spin_lock(&eph_lists_spinlock);
@@ -733,11 +734,11 @@ static void pgp_delist_free(struct tmem_page_descriptor *pgp)
life = get_cycles() - pgp->timestamp;
pgp->us.obj->pool->sum_life_cycles += life;
- /* free pgp */
+ /* Free pgp. */
pgp_free(pgp);
}
-/* called only indirectly by radix_tree_destroy */
+/* Called only indirectly by radix_tree_destroy. */
static void pgp_destroy(void *v)
{
struct tmem_page_descriptor *pgp = (struct tmem_page_descriptor *)v;
@@ -774,7 +775,7 @@ static struct tmem_page_descriptor *pgp_delete_from_obj(struct tmem_object_root
/************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
-/* called only indirectly from radix_tree_insert */
+/* Called only indirectly from radix_tree_insert. */
static struct radix_tree_node *rtn_alloc(void *arg)
{
struct tmem_object_node *objnode;
@@ -793,7 +794,7 @@ static struct radix_tree_node *rtn_alloc(void *arg)
return &objnode->rtn;
}
-/* called only indirectly from radix_tree_delete/destroy */
+/* Called only indirectly from radix_tree_delete/destroy. */
static void rtn_free(struct radix_tree_node *rtn, void *arg)
{
struct tmem_pool *pool;
@@ -814,7 +815,8 @@ static void rtn_free(struct radix_tree_node *rtn, void *arg)
/************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
-static int oid_compare(struct oid *left, struct oid *right)
+static int oid_compare(struct xen_tmem_oid *left,
+ struct xen_tmem_oid *right)
{
if ( left->oid[2] == right->oid[2] )
{
@@ -838,19 +840,20 @@ static int oid_compare(struct oid *left, struct oid *right)
return 1;
}
-static void oid_set_invalid(struct oid *oidp)
+static void oid_set_invalid(struct xen_tmem_oid *oidp)
{
oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
}
-static unsigned oid_hash(struct oid *oidp)
+static unsigned oid_hash(struct xen_tmem_oid *oidp)
{
return (tmem_hash(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK);
}
-/* searches for object==oid in pool, returns locked object if found */
-static struct tmem_object_root * obj_find(struct tmem_pool *pool, struct oid *oidp)
+/* Searches for object==oid in pool, returns locked object if found. */
+static struct tmem_object_root * obj_find(struct tmem_pool *pool,
+ struct xen_tmem_oid *oidp)
{
struct rb_node *node;
struct tmem_object_root *obj;
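
tmem keys objects by a three-word OID: oid_compare() above orders OIDs word-by-word from the most-significant word down (for the per-bucket rbtrees), and oid_hash() XORs the words together before masking to a power-of-two bucket count. A standalone sketch (plain C; the bit-fold here stands in for tmem_hash(), which is not shown in this hunk):

    #include <stdint.h>

    struct oid3 { uint64_t w[3]; }; /* mirrors xen_tmem_oid's three words */

    /* Order OIDs most-significant word first, as oid_compare() does. */
    static int oid3_cmp(const struct oid3 *l, const struct oid3 *r)
    {
        int i;

        for ( i = 2; i >= 0; i-- )
        {
            if ( l->w[i] < r->w[i] )
                return -1;
            if ( l->w[i] > r->w[i] )
                return 1;
        }
        return 0;
    }

    #define BUCKETS 256 /* must be a power of two, as in tmem */

    /* XOR the words together, fold, and mask to a bucket index. */
    static unsigned int oid3_hash(const struct oid3 *o)
    {
        uint64_t x = o->w[0] ^ o->w[1] ^ o->w[2];

        x ^= x >> 32;
        x ^= x >> 16;
        x ^= x >> 8;
        return (unsigned int)(x & (BUCKETS - 1));
    }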
@@ -863,7 +866,7 @@ restart_find:
obj = container_of(node, struct tmem_object_root, rb_tree_node);
switch ( oid_compare(&obj->oid, oidp) )
{
- case 0: /* equal */
+ case 0: /* Equal. */
if ( !spin_trylock(&obj->obj_spinlock) )
{
read_unlock(&pool->pool_rwlock);
@@ -882,11 +885,11 @@ restart_find:
return NULL;
}
-/* free an object that has no more pgps in it */
+/* Free an object that has no more pgps in it. */
static void obj_free(struct tmem_object_root *obj)
{
struct tmem_pool *pool;
- struct oid old_oid;
+ struct xen_tmem_oid old_oid;
ASSERT_SPINLOCK(&obj->obj_spinlock);
ASSERT(obj != NULL);
@@ -895,7 +898,7 @@ static void obj_free(struct tmem_object_root *obj)
ASSERT(pool != NULL);
ASSERT(pool->client != NULL);
ASSERT_WRITELOCK(&pool->pool_rwlock);
- if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
+ if ( obj->tree_root.rnode != NULL ) /* May be a "stump" with no leaves. */
radix_tree_destroy(&obj->tree_root, pgp_destroy);
ASSERT((long)obj->objnode_count == 0);
ASSERT(obj->tree_root.rnode == NULL);
@@ -916,6 +919,9 @@ static int obj_rb_insert(struct rb_root *root, struct tmem_object_root *obj)
struct rb_node **new, *parent = NULL;
struct tmem_object_root *this;
+ ASSERT(obj->pool);
+ ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
+
new = &(root->rb_node);
while ( *new )
{
@@ -939,10 +945,11 @@ static int obj_rb_insert(struct rb_root *root, struct tmem_object_root *obj)
}
/*
- * allocate, initialize, and insert an tmem_object_root
- * (should be called only if find failed)
+ * Allocate, initialize, and insert a tmem_object_root
+ * (should be called only if find failed).
*/
-static struct tmem_object_root * obj_alloc(struct tmem_pool *pool, struct oid *oidp)
+static struct tmem_object_root * obj_alloc(struct tmem_pool *pool,
+ struct xen_tmem_oid *oidp)
{
struct tmem_object_root *obj;
@@ -964,7 +971,7 @@ static struct tmem_object_root * obj_alloc(struct tmem_pool *pool, struct oid *o
return obj;
}
-/* free an object after destroying any pgps in it */
+/* Free an object after destroying any pgps in it. */
static void obj_destroy(struct tmem_object_root *obj)
{
ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
@@ -972,7 +979,7 @@ static void obj_destroy(struct tmem_object_root *obj)
obj_free(obj);
}
-/* destroys all objs in a pool, or only if obj->last_client matches cli_id */
+/* Destroys all objs in a pool, or only if obj->last_client matches cli_id. */
static void pool_destroy_objs(struct tmem_pool *pool, domid_t cli_id)
{
struct rb_node *node;
@@ -1037,11 +1044,14 @@ static int shared_pool_join(struct tmem_pool *pool, struct client *new_client)
tmem_client_info("adding new %s %d to shared pool owned by %s %d\n",
tmem_client_str, new_client->cli_id, tmem_client_str,
pool->client->cli_id);
+ else if ( pool->shared_count )
+ tmem_client_info("inter-guest sharing of shared pool %s by client %d\n",
+ tmem_client_str, pool->client->cli_id);
++pool->shared_count;
return 0;
}
-/* reassign "ownership" of the pool to another client that shares this pool */
+/* Reassign "ownership" of the pool to another client that shares this pool. */
static void shared_pool_reassign(struct tmem_pool *pool)
{
struct share_list *sl;
@@ -1056,7 +1066,10 @@ static void shared_pool_reassign(struct tmem_pool *pool)
}
old_client->pools[pool->pool_id] = NULL;
sl = list_entry(pool->share_list.next, struct share_list, share_list);
- ASSERT(sl->client != old_client);
+ /*
+ * The sl->client can be old_client if there are multiple shared pools
+ * within a guest.
+ */
pool->client = new_client = sl->client;
for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
if (new_client->pools[poolid] == pool)
@@ -1071,8 +1084,10 @@ static void shared_pool_reassign(struct tmem_pool *pool)
pool->pool_id = poolid;
}
-/* destroy all objects with last_client same as passed cli_id,
- remove pool's cli_id from list of sharers of this pool */
+/*
+ * Destroy all objects with last_client same as passed cli_id,
+ * remove pool's cli_id from list of sharers of this pool.
+ */
static int shared_pool_quit(struct tmem_pool *pool, domid_t cli_id)
{
struct share_list *sl;
@@ -1080,7 +1095,7 @@ static int shared_pool_quit(struct tmem_pool *pool, domid_t cli_id)
ASSERT(is_shared(pool));
ASSERT(pool->client != NULL);
-
+
ASSERT_WRITELOCK(&tmem_rwlock);
pool_destroy_objs(pool, cli_id);
list_for_each_entry(sl,&pool->share_list, share_list)
@@ -1107,7 +1122,7 @@ static int shared_pool_quit(struct tmem_pool *pool, domid_t cli_id)
return -1;
}
-/* flush all data (owned by cli_id) from a pool and, optionally, free it */
+/* Flush all data (owned by cli_id) from a pool and, optionally, free it. */
static void pool_flush(struct tmem_pool *pool, domid_t cli_id)
{
ASSERT(pool != NULL);
@@ -1168,7 +1183,7 @@ static struct client *client_create(domid_t cli_id)
}
if ( !d->is_dying ) {
d->tmem_client = client;
- client->domain = d;
+ client->domain = d;
}
rcu_unlock_domain(d);
@@ -1196,7 +1211,7 @@ static void client_free(struct client *client)
xfree(client);
}
-/* flush all data from a client and, optionally, free it */
+/* Flush all data from a client and, optionally, free it. */
static void client_flush(struct client *client)
{
int i;
@@ -1217,7 +1232,7 @@ static bool_t client_over_quota(struct client *client)
int total = _atomic_read(client_weight_total);
ASSERT(client != NULL);
- if ( (total == 0) || (client->weight == 0) ||
+ if ( (total == 0) || (client->weight == 0) ||
(client->eph_count == 0) )
return 0;
return ( ((global_eph_count*100L) / client->eph_count ) >
@@ -1298,12 +1313,12 @@ static int tmem_evict(void)
goto found;
}
}
- /* global_ephemeral_page_list is empty, so we bail out. */
+ /* The global_ephemeral_page_list is empty, so we bail out. */
spin_unlock(&eph_lists_spinlock);
goto out;
found:
- /* Delist */
+ /* Delist. */
list_del_init(&pgp->us.client_eph_pages);
client->eph_count--;
list_del_init(&pgp->global_eph_pages);
@@ -1327,7 +1342,7 @@ found:
pcd_disassociate(pgp,pool,1);
}
- /* pgp already delist, so call pgp_free directly */
+ /* pgp is already delisted, so call pgp_free directly. */
pgp_free(pgp);
if ( obj->pgp_count == 0 )
{
@@ -1402,7 +1417,7 @@ static int do_tmem_put_compress(struct tmem_page_descriptor *pgp, xen_pfn_t cmfn
void *dst, *p;
size_t size;
int ret = 0;
-
+
ASSERT(pgp != NULL);
ASSERT(pgp->us.obj != NULL);
ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock);
@@ -1455,8 +1470,8 @@ static int do_tmem_dup_put(struct tmem_page_descriptor *pgp, xen_pfn_t cmfn,
ASSERT(pool != NULL);
client = pool->client;
if ( client->live_migrating )
- goto failed_dup; /* no dups allowed when migrating */
- /* can we successfully manipulate pgp to change out the data? */
+ goto failed_dup; /* No dups allowed when migrating. */
+ /* Can we successfully manipulate pgp to change out the data? */
if ( client->compress && pgp->size != 0 )
{
ret = do_tmem_put_compress(pgp, cmfn, clibuf);
@@ -1486,7 +1501,7 @@ copy_uncompressed:
}
done:
- /* successfully replaced data, clean up and return success */
+ /* Successfully replaced data, clean up and return success. */
if ( is_shared(pool) )
obj->last_client = client->cli_id;
spin_unlock(&obj->obj_spinlock);
@@ -1501,8 +1516,10 @@ bad_copy:
goto cleanup;
failed_dup:
- /* couldn't change out the data, flush the old data and return
- * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
+ /*
+ * Couldn't change out the data, flush the old data and return
+ * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put.
+ */
ret = -ENOSPC;
cleanup:
pgpfound = pgp_delete_from_obj(obj, pgp->index);
@@ -1521,8 +1538,8 @@ cleanup:
}
static int do_tmem_put(struct tmem_pool *pool,
- struct oid *oidp, uint32_t index,
- xen_pfn_t cmfn, tmem_cli_va_param_t clibuf)
+ struct xen_tmem_oid *oidp, uint32_t index,
+ xen_pfn_t cmfn, tmem_cli_va_param_t clibuf)
{
struct tmem_object_root *obj = NULL;
struct tmem_page_descriptor *pgp = NULL;
@@ -1536,7 +1553,7 @@ static int do_tmem_put(struct tmem_pool *pool,
pool->puts++;
refind:
- /* does page already exist (dup)? if so, handle specially */
+ /* Does the page already exist (dup)? If so, handle specially. */
if ( (obj = obj_find(pool, oidp)) != NULL )
{
if ((pgp = pgp_lookup_in_obj(obj, index)) != NULL)
@@ -1545,14 +1562,14 @@ refind:
}
else
{
- /* no puts allowed into a frozen pool (except dup puts) */
+ /* No puts allowed into a frozen pool (except dup puts). */
if ( client->frozen )
- goto unlock_obj;
+ goto unlock_obj;
}
}
else
{
- /* no puts allowed into a frozen pool (except dup puts) */
+ /* No puts allowed into a frozen pool (except dup puts). */
if ( client->frozen )
return ret;
if ( (obj = obj_alloc(pool, oidp)) == NULL )
@@ -1560,10 +1577,10 @@ refind:
write_lock(&pool->pool_rwlock);
/*
- * Parallel callers may already allocated obj and inserted to obj_rb_root
- * before us.
- */
- if (!obj_rb_insert(&pool->obj_rb_root[oid_hash(oidp)], obj))
+ * Parallel callers may have already allocated the obj and inserted
+ * it into obj_rb_root before us.
+ */
+ if ( !obj_rb_insert(&pool->obj_rb_root[oid_hash(oidp)], obj) )
{
tmem_free(obj, pool);
write_unlock(&pool->pool_rwlock);
@@ -1575,14 +1592,14 @@ refind:
write_unlock(&pool->pool_rwlock);
}
- /* When arrive here, we have a spinlocked obj for use */
+ /* When we arrive here, we have a spinlocked obj for use. */
ASSERT_SPINLOCK(&obj->obj_spinlock);
if ( (pgp = pgp_alloc(obj)) == NULL )
goto unlock_obj;
ret = pgp_add_to_obj(obj, index, pgp);
if ( ret == -ENOMEM )
- /* warning, may result in partially built radix tree ("stump") */
+ /* Warning: may result in partially built radix tree ("stump"). */
goto free_pgp;
pgp->index = index;
@@ -1642,7 +1659,7 @@ insert_page:
spin_unlock(&eph_lists_spinlock);
}
else
- { /* is_persistent */
+ { /* is_persistent. */
spin_lock(&pers_lists_spinlock);
list_add_tail(&pgp->us.pool_pers_pages,
&pool->persistent_page_list);
@@ -1652,7 +1669,7 @@ insert_page:
if ( is_shared(pool) )
obj->last_client = client->cli_id;
- /* free the obj spinlock */
+ /* Free the obj spinlock. */
spin_unlock(&obj->obj_spinlock);
pool->good_puts++;
@@ -1686,8 +1703,9 @@ unlock_obj:
return ret;
}
-static int do_tmem_get(struct tmem_pool *pool, struct oid *oidp, uint32_t index,
- xen_pfn_t cmfn, tmem_cli_va_param_t clibuf)
+static int do_tmem_get(struct tmem_pool *pool,
+ struct xen_tmem_oid *oidp, uint32_t index,
+ xen_pfn_t cmfn, tmem_cli_va_param_t clibuf)
{
struct tmem_object_root *obj;
struct tmem_page_descriptor *pgp;
@@ -1764,7 +1782,8 @@ bad_copy:
return rc;
}
-static int do_tmem_flush_page(struct tmem_pool *pool, struct oid *oidp, uint32_t index)
+static int do_tmem_flush_page(struct tmem_pool *pool,
+ struct xen_tmem_oid *oidp, uint32_t index)
{
struct tmem_object_root *obj;
struct tmem_page_descriptor *pgp;
@@ -1797,7 +1816,8 @@ out:
return 1;
}
-static int do_tmem_flush_object(struct tmem_pool *pool, struct oid *oidp)
+static int do_tmem_flush_object(struct tmem_pool *pool,
+ struct xen_tmem_oid *oidp)
{
struct tmem_object_root *obj;
@@ -1933,7 +1953,7 @@ static int do_tmem_new_pool(domid_t this_cli_id,
(client->shared_auth_uuid[i][1] == uuid_hi) )
break;
if ( i == MAX_GLOBAL_SHARED_POOLS )
- {
+ {
tmem_client_info("Shared auth failed, create non shared pool instead!\n");
pool->shared = 0;
goto out;
@@ -1942,7 +1962,7 @@ static int do_tmem_new_pool(domid_t this_cli_id,
/*
* Authorize okay, match a global shared pool or use the newly allocated
- * one
+ * one.
*/
first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
@@ -1951,7 +1971,7 @@ static int do_tmem_new_pool(domid_t this_cli_id,
{
if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
{
- /* Succ to match a global shared pool */
+ /* Succeeded in matching a global shared pool. */
tmem_client_info("(matches shared pool uuid=%"PRIx64".%"PRIx64") pool_id=%d\n",
uuid_hi, uuid_lo, d_poolid);
client->pools[d_poolid] = shpool;
@@ -1971,17 +1991,19 @@ static int do_tmem_new_pool(domid_t this_cli_id,
}
}
- /* Failed to find a global shard pool slot */
+ /* Failed to find a global shared pool slot. */
if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
{
tmem_client_warn("tmem: failed... no global shared pool slots available\n");
goto fail;
}
- /* Add pool to global shard pool */
+ /* Add pool to global shared pool. */
else
{
INIT_LIST_HEAD(&pool->share_list);
pool->shared_count = 0;
+ if ( shared_pool_join(pool, client) )
+ goto fail;
global_shared_pools[first_unused_s_poolid] = pool;
}
}
@@ -1997,12 +2019,12 @@ fail:
/************ TMEM CONTROL OPERATIONS ************************************/
-/* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
+/* Freeze/thaw all pools belonging to client cli_id (all domains if -1). */
static int tmemc_freeze_pools(domid_t cli_id, int arg)
{
struct client *client;
- bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
- bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
+ bool_t freeze = (arg == XEN_SYSCTL_TMEM_OP_FREEZE) ? 1 : 0;
+ bool_t destroy = (arg == XEN_SYSCTL_TMEM_OP_DESTROY) ? 1 : 0;
char *s;
s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
@@ -2033,7 +2055,7 @@ static int tmemc_flush_mem(domid_t cli_id, uint32_t kb)
tmem_client_str);
return -1;
}
- /* convert kb to pages, rounding up if necessary */
+ /* Convert kb to pages, rounding up if necessary. */
npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
flushed_pages = tmem_flush_npages(npages);
flushed_kb = flushed_pages << (PAGE_SHIFT-10);
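Aside (illustrative, not part of the patch): the round-up conversion above
adds (pages-per-kb-block - 1) before shifting, so a partial page still
counts as a whole page. A standalone check, assuming PAGE_SHIFT == 12:

    #define PAGE_SHIFT 12

    static unsigned long kb_to_pages(unsigned long kb)
    {
        /* Round up: 1..4 KiB -> 1 page, 5..8 KiB -> 2 pages, ... */
        return (kb + ((1UL << (PAGE_SHIFT - 10)) - 1)) >> (PAGE_SHIFT - 10);
    }

With 4 KiB pages, kb_to_pages(1) == 1 and kb_to_pages(5) == 2.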
@@ -2093,7 +2115,7 @@ static int tmemc_list_client(struct client *c, tmem_cli_va_param_t buf,
p->obj_count, p->obj_count_max,
p->objnode_count, p->objnode_count_max,
p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
- p->no_mem_puts,
+ p->no_mem_puts,
p->found_gets, p->gets,
p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
if ( sum + n >= len )
@@ -2132,7 +2154,7 @@ static int tmemc_list_shared(tmem_cli_va_param_t buf, int off, uint32_t len,
p->obj_count, p->obj_count_max,
p->objnode_count, p->objnode_count_max,
p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
- p->no_mem_puts,
+ p->no_mem_puts,
p->found_gets, p->gets,
p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
if ( sum + n >= len )
@@ -2150,7 +2172,7 @@ static int tmemc_list_global_perf(tmem_cli_va_param_t buf, int off,
int n = 0, sum = 0;
n = scnprintf(info+n,BSIZE-n,"T=");
- n--; /* overwrite trailing comma */
+ n--; /* Overwrite trailing comma. */
n += scnprintf(info+n,BSIZE-n,"\n");
if ( sum + n >= len )
return sum;
@@ -2219,7 +2241,7 @@ static int __tmemc_set_var(struct client *client, uint32_t subop, uint32_t arg1)
switch (subop)
{
- case TMEMC_SET_WEIGHT:
+ case XEN_SYSCTL_TMEM_OP_SET_WEIGHT:
old_weight = client->weight;
client->weight = arg1;
tmem_client_info("tmem: weight set to %d for %s=%d\n",
@@ -2227,12 +2249,12 @@ static int __tmemc_set_var(struct client *client, uint32_t subop, uint32_t arg1)
atomic_sub(old_weight,&client_weight_total);
atomic_add(client->weight,&client_weight_total);
break;
- case TMEMC_SET_CAP:
+ case XEN_SYSCTL_TMEM_OP_SET_CAP:
client->cap = arg1;
tmem_client_info("tmem: cap set to %d for %s=%d\n",
arg1, tmem_cli_id_str, cli_id);
break;
- case TMEMC_SET_COMPRESS:
+ case XEN_SYSCTL_TMEM_OP_SET_COMPRESS:
if ( tmem_dedup_enabled() )
{
tmem_client_warn("tmem: compression %s for all %ss, cannot be changed when tmem_dedup is enabled\n",
@@ -2335,7 +2357,7 @@ static int tmemc_save_subop(int cli_id, uint32_t pool_id,
switch(subop)
{
- case TMEMC_SAVE_BEGIN:
+ case XEN_SYSCTL_TMEM_OP_SAVE_BEGIN:
if ( client == NULL )
return 0;
for (p = 0; p < MAX_POOLS_PER_DOMAIN; p++)
@@ -2352,33 +2374,33 @@ static int tmemc_save_subop(int cli_id, uint32_t pool_id,
client->live_migrating = 1;
rc = 1;
break;
- case TMEMC_RESTORE_BEGIN:
+ case XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN:
if ( client == NULL && (client = client_create(cli_id)) != NULL )
return 1;
break;
- case TMEMC_SAVE_GET_VERSION:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_VERSION:
rc = TMEM_SPEC_VERSION;
break;
- case TMEMC_SAVE_GET_MAXPOOLS:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_MAXPOOLS:
rc = MAX_POOLS_PER_DOMAIN;
break;
- case TMEMC_SAVE_GET_CLIENT_WEIGHT:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_WEIGHT:
if ( client == NULL )
break;
rc = client->weight == -1 ? -2 : client->weight;
break;
- case TMEMC_SAVE_GET_CLIENT_CAP:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_CAP:
if ( client == NULL )
break;
rc = client->cap == -1 ? -2 : client->cap;
break;
- case TMEMC_SAVE_GET_CLIENT_FLAGS:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_FLAGS:
if ( client == NULL )
break;
rc = (client->compress ? TMEM_CLIENT_COMPRESS : 0 ) |
(client->was_frozen ? TMEM_CLIENT_FROZEN : 0 );
break;
- case TMEMC_SAVE_GET_POOL_FLAGS:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_FLAGS:
if ( pool == NULL )
break;
rc = (pool->persistent ? TMEM_POOL_PERSIST : 0) |
@@ -2386,19 +2408,19 @@ static int tmemc_save_subop(int cli_id, uint32_t pool_id,
(POOL_PAGESHIFT << TMEM_POOL_PAGESIZE_SHIFT) |
(TMEM_SPEC_VERSION << TMEM_POOL_VERSION_SHIFT);
break;
- case TMEMC_SAVE_GET_POOL_NPAGES:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_NPAGES:
if ( pool == NULL )
break;
rc = _atomic_read(pool->pgp_count);
break;
- case TMEMC_SAVE_GET_POOL_UUID:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_UUID:
if ( pool == NULL )
break;
rc = 0;
if ( copy_to_guest(guest_handle_cast(buf, void), pool->uuid, 2) )
rc = -EFAULT;
break;
- case TMEMC_SAVE_END:
+ case XEN_SYSCTL_TMEM_OP_SAVE_END:
if ( client == NULL )
break;
client->live_migrating = 0;
@@ -2420,7 +2442,7 @@ static int tmemc_save_get_next_page(int cli_id, uint32_t pool_id,
struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
? NULL : client->pools[pool_id];
struct tmem_page_descriptor *pgp;
- struct oid oid;
+ struct xen_tmem_oid *oid;
int ret = 0;
struct tmem_handle h;
@@ -2436,26 +2458,26 @@ static int tmemc_save_get_next_page(int cli_id, uint32_t pool_id,
ret = -1;
goto out;
}
- /* note: pool->cur_pgp is the pgp last returned by get_next_page */
+ /* Note: pool->cur_pgp is the pgp last returned by get_next_page. */
if ( pool->cur_pgp == NULL )
{
- /* process the first one */
+ /* Process the first one. */
pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next,
struct tmem_page_descriptor,us.pool_pers_pages);
- } else if ( list_is_last(&pool->cur_pgp->us.pool_pers_pages,
+ } else if ( list_is_last(&pool->cur_pgp->us.pool_pers_pages,
&pool->persistent_page_list) )
{
- /* already processed the last one in the list */
+ /* Already processed the last one in the list. */
ret = -1;
goto out;
}
pgp = list_entry((&pool->cur_pgp->us.pool_pers_pages)->next,
struct tmem_page_descriptor,us.pool_pers_pages);
pool->cur_pgp = pgp;
- oid = pgp->us.obj->oid;
+ oid = &pgp->us.obj->oid;
h.pool_id = pool_id;
- BUILD_BUG_ON(sizeof(h.oid) != sizeof(oid));
- memcpy(h.oid, oid.oid, sizeof(h.oid));
+ BUILD_BUG_ON(sizeof(h.oid) != sizeof(*oid));
+ memcpy(&(h.oid), oid, sizeof(h.oid));
h.index = pgp->index;
if ( copy_to_guest(guest_handle_cast(buf, void), &h, 1) )
{
@@ -2463,7 +2485,7 @@ static int tmemc_save_get_next_page(int cli_id, uint32_t pool_id,
goto out;
}
guest_handle_add_offset(buf, sizeof(h));
- ret = do_tmem_get(pool, &oid, pgp->index, 0, buf);
+ ret = do_tmem_get(pool, oid, pgp->index, 0, buf);
out:
spin_unlock(&pers_lists_spinlock);
@@ -2490,7 +2512,7 @@ static int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_param_t buf,
pgp = list_entry((&client->persistent_invalidated_list)->next,
struct tmem_page_descriptor,client_inv_pages);
client->cur_pgp = pgp;
- } else if ( list_is_last(&client->cur_pgp->client_inv_pages,
+ } else if ( list_is_last(&client->cur_pgp->client_inv_pages,
&client->persistent_invalidated_list) )
{
client->cur_pgp = NULL;
@@ -2503,7 +2525,7 @@ static int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_param_t buf,
}
h.pool_id = pgp->pool_id;
BUILD_BUG_ON(sizeof(h.oid) != sizeof(pgp->inv_oid));
- memcpy(h.oid, pgp->inv_oid.oid, sizeof(h.oid));
+ memcpy(&(h.oid), &(pgp->inv_oid), sizeof(h.oid));
h.index = pgp->index;
ret = 1;
if ( copy_to_guest(guest_handle_cast(buf, void), &h, 1) )
@@ -2513,8 +2535,10 @@ out:
return ret;
}
-static int tmemc_restore_put_page(int cli_id, uint32_t pool_id, struct oid *oidp,
- uint32_t index, tmem_cli_va_param_t buf, uint32_t bufsize)
+static int tmemc_restore_put_page(int cli_id, uint32_t pool_id,
+ struct xen_tmem_oid *oidp,
+ uint32_t index, tmem_cli_va_param_t buf,
+ uint32_t bufsize)
{
struct client *client = tmem_client_from_cli_id(cli_id);
struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
@@ -2525,13 +2549,14 @@ static int tmemc_restore_put_page(int cli_id, uint32_t pool_id, struct oid *oidp
if (bufsize != PAGE_SIZE) {
tmem_client_err("tmem: %s: invalid parameter bufsize(%d) != (%ld)\n",
__func__, bufsize, PAGE_SIZE);
- return -EINVAL;
+ return -EINVAL;
}
return do_tmem_put(pool, oidp, index, 0, buf);
}
-static int tmemc_restore_flush_page(int cli_id, uint32_t pool_id, struct oid *oidp,
- uint32_t index)
+static int tmemc_restore_flush_page(int cli_id, uint32_t pool_id,
+ struct xen_tmem_oid *oidp,
+ uint32_t index)
{
struct client *client = tmem_client_from_cli_id(cli_id);
struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
@@ -2542,77 +2567,75 @@ static int tmemc_restore_flush_page(int cli_id, uint32_t pool_id, struct oid *oi
return do_tmem_flush_page(pool,oidp,index);
}
-static int do_tmem_control(struct tmem_op *op)
+int tmem_control(struct xen_sysctl_tmem_op *op)
{
int ret;
uint32_t pool_id = op->pool_id;
- uint32_t subop = op->u.ctrl.subop;
- struct oid *oidp = (struct oid *)(&op->u.ctrl.oid[0]);
+ uint32_t cmd = op->cmd;
+ struct xen_tmem_oid *oidp = &op->oid;
- if ( xsm_tmem_control(XSM_PRIV) )
- return -EPERM;
+ if ( op->pad != 0 )
+ return -EINVAL;
- switch(subop)
+ write_lock(&tmem_rwlock);
+
+ switch (cmd)
{
- case TMEMC_THAW:
- case TMEMC_FREEZE:
- case TMEMC_DESTROY:
- ret = tmemc_freeze_pools(op->u.ctrl.cli_id,subop);
+ case XEN_SYSCTL_TMEM_OP_THAW:
+ case XEN_SYSCTL_TMEM_OP_FREEZE:
+ case XEN_SYSCTL_TMEM_OP_DESTROY:
+ ret = tmemc_freeze_pools(op->cli_id, cmd);
break;
- case TMEMC_FLUSH:
- ret = tmemc_flush_mem(op->u.ctrl.cli_id,op->u.ctrl.arg1);
+ case XEN_SYSCTL_TMEM_OP_FLUSH:
+ ret = tmemc_flush_mem(op->cli_id,op->arg1);
break;
- case TMEMC_LIST:
- ret = tmemc_list(op->u.ctrl.cli_id,
- guest_handle_cast(op->u.ctrl.buf, char),
- op->u.ctrl.arg1,op->u.ctrl.arg2);
+ case XEN_SYSCTL_TMEM_OP_LIST:
+ ret = tmemc_list(op->cli_id,
+ guest_handle_cast(op->buf, char), op->arg1, op->arg2);
break;
- case TMEMC_SET_WEIGHT:
- case TMEMC_SET_CAP:
- case TMEMC_SET_COMPRESS:
- ret = tmemc_set_var(op->u.ctrl.cli_id,subop,op->u.ctrl.arg1);
+ case XEN_SYSCTL_TMEM_OP_SET_WEIGHT:
+ case XEN_SYSCTL_TMEM_OP_SET_CAP:
+ case XEN_SYSCTL_TMEM_OP_SET_COMPRESS:
+ ret = tmemc_set_var(op->cli_id, cmd, op->arg1);
break;
- case TMEMC_QUERY_FREEABLE_MB:
+ case XEN_SYSCTL_TMEM_OP_QUERY_FREEABLE_MB:
ret = tmem_freeable_pages() >> (20 - PAGE_SHIFT);
break;
- case TMEMC_SAVE_BEGIN:
- case TMEMC_RESTORE_BEGIN:
- case TMEMC_SAVE_GET_VERSION:
- case TMEMC_SAVE_GET_MAXPOOLS:
- case TMEMC_SAVE_GET_CLIENT_WEIGHT:
- case TMEMC_SAVE_GET_CLIENT_CAP:
- case TMEMC_SAVE_GET_CLIENT_FLAGS:
- case TMEMC_SAVE_GET_POOL_FLAGS:
- case TMEMC_SAVE_GET_POOL_NPAGES:
- case TMEMC_SAVE_GET_POOL_UUID:
- case TMEMC_SAVE_END:
- ret = tmemc_save_subop(op->u.ctrl.cli_id,pool_id,subop,
- guest_handle_cast(op->u.ctrl.buf, char),
- op->u.ctrl.arg1);
+ case XEN_SYSCTL_TMEM_OP_SAVE_BEGIN:
+ case XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_VERSION:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_MAXPOOLS:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_WEIGHT:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_CAP:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_FLAGS:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_FLAGS:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_NPAGES:
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_UUID:
+ case XEN_SYSCTL_TMEM_OP_SAVE_END:
+ ret = tmemc_save_subop(op->cli_id, pool_id, cmd,
+ guest_handle_cast(op->buf, char), op->arg1);
break;
- case TMEMC_SAVE_GET_NEXT_PAGE:
- ret = tmemc_save_get_next_page(op->u.ctrl.cli_id, pool_id,
- guest_handle_cast(op->u.ctrl.buf, char),
- op->u.ctrl.arg1);
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_PAGE:
+ ret = tmemc_save_get_next_page(op->cli_id, pool_id,
+ guest_handle_cast(op->buf, char), op->arg1);
break;
- case TMEMC_SAVE_GET_NEXT_INV:
- ret = tmemc_save_get_next_inv(op->u.ctrl.cli_id,
- guest_handle_cast(op->u.ctrl.buf, char),
- op->u.ctrl.arg1);
+ case XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_INV:
+ ret = tmemc_save_get_next_inv(op->cli_id,
+ guest_handle_cast(op->buf, char), op->arg1);
break;
- case TMEMC_RESTORE_PUT_PAGE:
- ret = tmemc_restore_put_page(op->u.ctrl.cli_id,pool_id,
- oidp, op->u.ctrl.arg2,
- guest_handle_cast(op->u.ctrl.buf, char),
- op->u.ctrl.arg1);
+ case XEN_SYSCTL_TMEM_OP_RESTORE_PUT_PAGE:
+ ret = tmemc_restore_put_page(op->cli_id, pool_id, oidp, op->arg2,
+ guest_handle_cast(op->buf, char), op->arg1);
break;
- case TMEMC_RESTORE_FLUSH_PAGE:
- ret = tmemc_restore_flush_page(op->u.ctrl.cli_id,pool_id,
- oidp, op->u.ctrl.arg2);
+ case XEN_SYSCTL_TMEM_OP_RESTORE_FLUSH_PAGE:
+ ret = tmemc_restore_flush_page(op->cli_id, pool_id, oidp, op->arg2);
break;
default:
ret = -1;
}
+
+ write_unlock(&tmem_rwlock);
+
return ret;
}
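Aside (illustrative): the new pad check above is standard hypercall-ABI
hygiene; a padding field that is known to be zero can later be reassigned
to a real field without ambiguity about what old callers passed. The
pattern, sketched on a hypothetical structure:

    struct op_with_pad {
        uint32_t cmd;
        uint32_t pad;        /* must be zero today */
    };

    static int check_op(const struct op_with_pad *op)
    {
        return op->pad != 0 ? -EINVAL : 0;
    }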
@@ -2623,7 +2646,7 @@ long do_tmem_op(tmem_cli_op_t uops)
struct tmem_op op;
struct client *client = current->domain->tmem_client;
struct tmem_pool *pool = NULL;
- struct oid *oidp;
+ struct xen_tmem_oid *oidp;
int rc = 0;
bool_t succ_get = 0, succ_put = 0;
bool_t non_succ_get = 0, non_succ_put = 0;
@@ -2650,12 +2673,12 @@ long do_tmem_op(tmem_cli_op_t uops)
return -EFAULT;
}
- /* Acquire wirte lock for all command at first */
+ /* Acquire write lock for all commands at first. */
write_lock(&tmem_rwlock);
if ( op.cmd == TMEM_CONTROL )
{
- rc = do_tmem_control(&op);
+ rc = -EOPNOTSUPP;
}
else if ( op.cmd == TMEM_AUTH )
{
@@ -2700,11 +2723,11 @@ long do_tmem_op(tmem_cli_op_t uops)
rc = -ENODEV;
goto out;
}
- /* Commands only need read lock */
+ /* Commands that only need read lock. */
write_unlock(&tmem_rwlock);
read_lock(&tmem_rwlock);
- oidp = (struct oid *)&op.u.gen.oid[0];
+ oidp = &op.u.gen.oid;
switch ( op.cmd )
{
case TMEM_NEW_POOL:
@@ -2752,7 +2775,7 @@ out:
return rc;
}
-/* this should be called when the host is destroying a client */
+/* This should be called when the host is destroying a client (domain). */
void tmem_destroy(void *v)
{
struct client *client = (struct client *)v;
@@ -2775,7 +2798,7 @@ void tmem_destroy(void *v)
write_unlock(&tmem_rwlock);
}
-#define MAX_EVICTS 10 /* should be variable or set via TMEMC_ ?? */
+#define MAX_EVICTS 10 /* Should be variable or set via XEN_SYSCTL_TMEM_OP_ ?? */
void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
{
struct page_info *pfp;
@@ -2817,7 +2840,7 @@ unsigned long tmem_freeable_pages(void)
return tmem_page_list_pages + _atomic_read(freeable_page_count);
}
-/* called at hypervisor startup */
+/* Called at hypervisor startup. */
static int __init init_tmem(void)
{
int i;
diff --git a/xen/common/tmem_xen.c b/xen/common/tmem_xen.c
index 5ef131b..71cb7d5 100644
--- a/xen/common/tmem_xen.c
+++ b/xen/common/tmem_xen.c
@@ -77,7 +77,7 @@ static inline void *cli_get_page(xen_pfn_t cmfn, unsigned long *pcli_mfn,
*pcli_mfn = page_to_mfn(page);
*pcli_pfp = page;
- return map_domain_page(*pcli_mfn);
+ return map_domain_page(_mfn(*pcli_mfn));
}
static inline void cli_put_page(void *cli_va, struct page_info *cli_pfp,
@@ -104,7 +104,7 @@ int tmem_copy_from_client(struct page_info *pfp,
ASSERT(pfp != NULL);
tmem_mfn = page_to_mfn(pfp);
- tmem_va = map_domain_page(tmem_mfn);
+ tmem_va = map_domain_page(_mfn(tmem_mfn));
if ( guest_handle_is_null(clibuf) )
{
cli_va = cli_get_page(cmfn, &cli_mfn, &cli_pfp, 0);
@@ -174,7 +174,7 @@ int tmem_copy_to_client(xen_pfn_t cmfn, struct page_info *pfp,
return -EFAULT;
}
tmem_mfn = page_to_mfn(pfp);
- tmem_va = map_domain_page(tmem_mfn);
+ tmem_va = map_domain_page(_mfn(tmem_mfn));
if ( cli_va )
{
memcpy(cli_va, tmem_va, PAGE_SIZE);
diff --git a/xen/common/unlzma.c b/xen/common/unlzma.c
index a7da55b..e072e21 100644
--- a/xen/common/unlzma.c
+++ b/xen/common/unlzma.c
@@ -25,8 +25,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#include "decompress.h"
diff --git a/xen/common/unlzo.c b/xen/common/unlzo.c
index fc794b0..5ae6cf9 100644
--- a/xen/common/unlzo.c
+++ b/xen/common/unlzo.c
@@ -22,8 +22,7 @@
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING.
- * If not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * If not, see <http://www.gnu.org/licenses/>.
*
* Markus F.X.J. Oberhumer
* <markus at oberhumer.com>
diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
new file mode 100644
index 0000000..ef84b0f
--- /dev/null
+++ b/xen/common/vm_event.c
@@ -0,0 +1,772 @@
+/******************************************************************************
+ * vm_event.c
+ *
+ * VM event support.
+ *
+ * Copyright (c) 2009 Citrix Systems, Inc. (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include <xen/sched.h>
+#include <xen/event.h>
+#include <xen/wait.h>
+#include <xen/vm_event.h>
+#include <xen/mem_access.h>
+#include <asm/p2m.h>
+#include <asm/vm_event.h>
+#include <xsm/xsm.h>
+
+/* for public/io/ring.h macros */
+#define xen_mb() mb()
+#define xen_rmb() rmb()
+#define xen_wmb() wmb()
+
+#define vm_event_ring_lock_init(_ved) spin_lock_init(&(_ved)->ring_lock)
+#define vm_event_ring_lock(_ved) spin_lock(&(_ved)->ring_lock)
+#define vm_event_ring_unlock(_ved) spin_unlock(&(_ved)->ring_lock)
+
+static int vm_event_enable(
+ struct domain *d,
+ xen_domctl_vm_event_op_t *vec,
+ struct vm_event_domain *ved,
+ int pause_flag,
+ int param,
+ xen_event_channel_notification_t notification_fn)
+{
+ int rc;
+ unsigned long ring_gfn = d->arch.hvm_domain.params[param];
+
+ /* Only one helper at a time. If the helper crashed,
+ * the ring is in an undefined state and so is the guest.
+ */
+ if ( ved->ring_page )
+ return -EBUSY;
+
+ /* The parameter defaults to zero, and it should be
+     * set to something. */
+ if ( ring_gfn == 0 )
+ return -ENOSYS;
+
+ vm_event_ring_lock_init(ved);
+ vm_event_ring_lock(ved);
+
+ rc = vm_event_init_domain(d);
+
+ if ( rc < 0 )
+ goto err;
+
+ rc = prepare_ring_for_helper(d, ring_gfn, &ved->ring_pg_struct,
+ &ved->ring_page);
+ if ( rc < 0 )
+ goto err;
+
+ /* Set the number of currently blocked vCPUs to 0. */
+ ved->blocked = 0;
+
+ /* Allocate event channel */
+ rc = alloc_unbound_xen_event_channel(d, 0, current->domain->domain_id,
+ notification_fn);
+ if ( rc < 0 )
+ goto err;
+
+ ved->xen_port = vec->port = rc;
+
+ /* Prepare ring buffer */
+ FRONT_RING_INIT(&ved->front_ring,
+ (vm_event_sring_t *)ved->ring_page,
+ PAGE_SIZE);
+
+ /* Save the pause flag for this particular ring. */
+ ved->pause_flag = pause_flag;
+
+ /* Initialize the last-chance wait queue. */
+ init_waitqueue_head(&ved->wq);
+
+ vm_event_ring_unlock(ved);
+ return 0;
+
+ err:
+ destroy_ring_for_helper(&ved->ring_page,
+ ved->ring_pg_struct);
+ vm_event_ring_unlock(ved);
+
+ return rc;
+}
+
+static unsigned int vm_event_ring_available(struct vm_event_domain *ved)
+{
+ int avail_req = RING_FREE_REQUESTS(&ved->front_ring);
+ avail_req -= ved->target_producers;
+ avail_req -= ved->foreign_producers;
+
+ BUG_ON(avail_req < 0);
+
+ return avail_req;
+}
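+
+/*
+ * Worked example (illustrative only): with 8 free ring slots, 3
+ * reservations from the target domain's vCPUs and 2 foreign reservations
+ * outstanding, 8 - 3 - 2 = 3 slots remain claimable. The BUG_ON above
+ * catches over-reservation ever driving this negative.
+ */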
+
+/*
+ * vm_event_wake_blocked() will wake up vCPUs waiting for room in the
+ * ring. These vCPUs were paused on their way out after placing an event,
+ * but need to be resumed once the ring is capable of processing at least
+ * one event from them.
+ */
+static void vm_event_wake_blocked(struct domain *d, struct vm_event_domain *ved)
+{
+ struct vcpu *v;
+ int online = d->max_vcpus;
+ unsigned int avail_req = vm_event_ring_available(ved);
+
+ if ( avail_req == 0 || ved->blocked == 0 )
+ return;
+
+ /*
+ * We ensure that we only have vCPUs online if there are enough free slots
+ * for their memory events to be processed. This will ensure that no
+ * memory events are lost (due to the fact that certain types of events
+ * cannot be replayed, we need to ensure that there is space in the ring
+ * for when they are hit).
+ * See comment below in vm_event_put_request().
+ */
+ for_each_vcpu ( d, v )
+ if ( test_bit(ved->pause_flag, &v->pause_flags) )
+ online--;
+
+ ASSERT(online == (d->max_vcpus - ved->blocked));
+
+    /* We remember which vcpu last woke up to avoid always scanning linearly
+     * from zero and starving higher-numbered vcpus under high load. */
+ if ( d->vcpu )
+ {
+ int i, j, k;
+
+ for (i = ved->last_vcpu_wake_up + 1, j = 0; j < d->max_vcpus; i++, j++)
+ {
+ k = i % d->max_vcpus;
+ v = d->vcpu[k];
+ if ( !v )
+ continue;
+
+ if ( !(ved->blocked) || online >= avail_req )
+ break;
+
+ if ( test_and_clear_bit(ved->pause_flag, &v->pause_flags) )
+ {
+ vcpu_unpause(v);
+ online++;
+ ved->blocked--;
+ ved->last_vcpu_wake_up = k;
+ }
+ }
+ }
+}
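+
+/*
+ * Illustrative trace: with max_vcpus == 4 and last_vcpu_wake_up == 2, the
+ * scan above visits vcpus 3, 0, 1, 2 in that order, so vcpu 0 cannot
+ * monopolise freed slots across repeated wake-ups.
+ */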
+
+/*
+ * In the event that a vCPU attempted to place an event in the ring and
+ * was unable to do so, it is queued on a wait queue. These are woken as
+ * needed, and take precedence over the blocked vCPUs.
+ */
+static void vm_event_wake_queued(struct domain *d, struct vm_event_domain *ved)
+{
+ unsigned int avail_req = vm_event_ring_available(ved);
+
+ if ( avail_req > 0 )
+ wake_up_nr(&ved->wq, avail_req);
+}
+
+/*
+ * vm_event_wake() will wake up all vCPUs waiting for the ring to
+ * become available. If we have queued vCPUs, they get top priority. We
+ * are guaranteed that they will go through code paths that will eventually
+ * call vm_event_wake() again, ensuring that any blocked vCPUs will get
+ * unpaused once all the queued vCPUs have made it through.
+ */
+void vm_event_wake(struct domain *d, struct vm_event_domain *ved)
+{
+    if ( !list_empty(&ved->wq.list) )
+ vm_event_wake_queued(d, ved);
+ else
+ vm_event_wake_blocked(d, ved);
+}
+
+static int vm_event_disable(struct domain *d, struct vm_event_domain *ved)
+{
+ if ( ved->ring_page )
+ {
+ struct vcpu *v;
+
+ vm_event_ring_lock(ved);
+
+ if ( !list_empty(&ved->wq.list) )
+ {
+ vm_event_ring_unlock(ved);
+ return -EBUSY;
+ }
+
+ /* Free domU's event channel and leave the other one unbound */
+ free_xen_event_channel(d, ved->xen_port);
+
+ /* Unblock all vCPUs */
+ for_each_vcpu ( d, v )
+ {
+ if ( test_and_clear_bit(ved->pause_flag, &v->pause_flags) )
+ {
+ vcpu_unpause(v);
+ ved->blocked--;
+ }
+ }
+
+ destroy_ring_for_helper(&ved->ring_page,
+ ved->ring_pg_struct);
+
+ vm_event_cleanup_domain(d);
+
+ vm_event_ring_unlock(ved);
+ }
+
+ return 0;
+}
+
+static inline void vm_event_release_slot(struct domain *d,
+ struct vm_event_domain *ved)
+{
+ /* Update the accounting */
+ if ( current->domain == d )
+ ved->target_producers--;
+ else
+ ved->foreign_producers--;
+
+ /* Kick any waiters */
+ vm_event_wake(d, ved);
+}
+
+/*
+ * vm_event_mark_and_pause() tags a vcpu and puts it to sleep.
+ * The vcpu will resume execution in vm_event_wake_blocked().
+ */
+void vm_event_mark_and_pause(struct vcpu *v, struct vm_event_domain *ved)
+{
+ if ( !test_and_set_bit(ved->pause_flag, &v->pause_flags) )
+ {
+ vcpu_pause_nosync(v);
+ ved->blocked++;
+ }
+}
+
+/*
+ * This must be preceded by a call to claim_slot(), and is guaranteed to
+ * succeed. As a side-effect however, the vCPU may be paused if the ring is
+ * overly full and its continued execution would cause stalling and excessive
+ * waiting. The vCPU will be automatically unpaused when the ring clears.
+ */
+void vm_event_put_request(struct domain *d,
+ struct vm_event_domain *ved,
+ vm_event_request_t *req)
+{
+ vm_event_front_ring_t *front_ring;
+ int free_req;
+ unsigned int avail_req;
+ RING_IDX req_prod;
+
+ if ( current->domain != d )
+ {
+ req->flags |= VM_EVENT_FLAG_FOREIGN;
+#ifndef NDEBUG
+ if ( !(req->flags & VM_EVENT_FLAG_VCPU_PAUSED) )
+ gdprintk(XENLOG_G_WARNING, "d%dv%d was not paused.\n",
+ d->domain_id, req->vcpu_id);
+#endif
+ }
+
+ req->version = VM_EVENT_INTERFACE_VERSION;
+
+ vm_event_ring_lock(ved);
+
+ /* Due to the reservations, this step must succeed. */
+ front_ring = &ved->front_ring;
+ free_req = RING_FREE_REQUESTS(front_ring);
+ ASSERT(free_req > 0);
+
+ /* Copy request */
+ req_prod = front_ring->req_prod_pvt;
+ memcpy(RING_GET_REQUEST(front_ring, req_prod), req, sizeof(*req));
+ req_prod++;
+
+ /* Update ring */
+ front_ring->req_prod_pvt = req_prod;
+ RING_PUSH_REQUESTS(front_ring);
+
+ /* We've actually *used* our reservation, so release the slot. */
+ vm_event_release_slot(d, ved);
+
+ /* Give this vCPU a black eye if necessary, on the way out.
+     * See the comments above vm_event_wake_blocked() for more information
+ * on how this mechanism works to avoid waiting. */
+ avail_req = vm_event_ring_available(ved);
+    if ( current->domain == d && avail_req < d->max_vcpus )
+ vm_event_mark_and_pause(current, ved);
+
+ vm_event_ring_unlock(ved);
+
+ notify_via_xen_event_channel(d, ved->xen_port);
+}
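+
+/*
+ * Hedged caller sketch (illustrative, not called anywhere): the intended
+ * protocol is claim, fill, put; or claim then cancel on failure.
+ * build_request() is a hypothetical helper that exists only for this
+ * example, hence the #if 0 guard.
+ */
+#if 0
+static void post_event_example(struct domain *d, struct vm_event_domain *ved)
+{
+    vm_event_request_t req;
+
+    if ( __vm_event_claim_slot(d, ved, 1 /* may sleep in guest context */) )
+        return;                          /* no ring, or ring busy */
+
+    if ( build_request(&req) )           /* hypothetical */
+    {
+        vm_event_cancel_slot(d, ved);    /* hand back the reservation */
+        return;
+    }
+
+    vm_event_put_request(d, ved, &req);  /* consumes the reservation */
+}
+#endif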
+
+int vm_event_get_response(struct domain *d, struct vm_event_domain *ved,
+ vm_event_response_t *rsp)
+{
+ vm_event_front_ring_t *front_ring;
+ RING_IDX rsp_cons;
+
+ vm_event_ring_lock(ved);
+
+ front_ring = &ved->front_ring;
+ rsp_cons = front_ring->rsp_cons;
+
+ if ( !RING_HAS_UNCONSUMED_RESPONSES(front_ring) )
+ {
+ vm_event_ring_unlock(ved);
+ return 0;
+ }
+
+ /* Copy response */
+ memcpy(rsp, RING_GET_RESPONSE(front_ring, rsp_cons), sizeof(*rsp));
+ rsp_cons++;
+
+ /* Update ring */
+ front_ring->rsp_cons = rsp_cons;
+ front_ring->sring->rsp_event = rsp_cons + 1;
+
+ /* Kick any waiters -- since we've just consumed an event,
+ * there may be additional space available in the ring. */
+ vm_event_wake(d, ved);
+
+ vm_event_ring_unlock(ved);
+
+ return 1;
+}
+
+/*
+ * Pull all responses from the given ring and unpause the corresponding vCPU
+ * if required. Based on the response type, here we can also call custom
+ * handlers.
+ *
+ * Note: responses are handled the same way regardless of which ring they
+ * arrive on.
+ */
+void vm_event_resume(struct domain *d, struct vm_event_domain *ved)
+{
+ vm_event_response_t rsp;
+
+ /* Pull all responses off the ring. */
+ while ( vm_event_get_response(d, ved, &rsp) )
+ {
+ struct vcpu *v;
+
+ if ( rsp.version != VM_EVENT_INTERFACE_VERSION )
+ {
+ printk(XENLOG_G_WARNING "vm_event interface version mismatch\n");
+ continue;
+ }
+
+ /* Validate the vcpu_id in the response. */
+ if ( (rsp.vcpu_id >= d->max_vcpus) || !d->vcpu[rsp.vcpu_id] )
+ continue;
+
+ v = d->vcpu[rsp.vcpu_id];
+
+ /*
+ * In some cases the response type needs extra handling, so here
+ * we call the appropriate handlers.
+ */
+ switch ( rsp.reason )
+ {
+ case VM_EVENT_REASON_MOV_TO_MSR:
+ case VM_EVENT_REASON_WRITE_CTRLREG:
+ vm_event_register_write_resume(v, &rsp);
+ break;
+
+#ifdef HAS_MEM_ACCESS
+ case VM_EVENT_REASON_MEM_ACCESS:
+ mem_access_resume(v, &rsp);
+ break;
+#endif
+
+#ifdef HAS_MEM_PAGING
+ case VM_EVENT_REASON_MEM_PAGING:
+ p2m_mem_paging_resume(d, &rsp);
+ break;
+#endif
+
+        }
+
+ /* Check for altp2m switch */
+ if ( rsp.flags & VM_EVENT_FLAG_ALTERNATE_P2M )
+ p2m_altp2m_check(v, rsp.altp2m_idx);
+
+ if ( rsp.flags & VM_EVENT_FLAG_VCPU_PAUSED )
+ {
+ if ( rsp.flags & VM_EVENT_FLAG_TOGGLE_SINGLESTEP )
+ vm_event_toggle_singlestep(d, v);
+
+ vm_event_vcpu_unpause(v);
+ }
+ }
+}
+
+void vm_event_cancel_slot(struct domain *d, struct vm_event_domain *ved)
+{
+ vm_event_ring_lock(ved);
+ vm_event_release_slot(d, ved);
+ vm_event_ring_unlock(ved);
+}
+
+static int vm_event_grab_slot(struct vm_event_domain *ved, int foreign)
+{
+ unsigned int avail_req;
+
+ if ( !ved->ring_page )
+ return -ENOSYS;
+
+ vm_event_ring_lock(ved);
+
+ avail_req = vm_event_ring_available(ved);
+ if ( avail_req == 0 )
+ {
+ vm_event_ring_unlock(ved);
+ return -EBUSY;
+ }
+
+ if ( !foreign )
+ ved->target_producers++;
+ else
+ ved->foreign_producers++;
+
+ vm_event_ring_unlock(ved);
+
+ return 0;
+}
+
+/* Simple try_grab wrapper for use in the wait_event() macro. */
+static int vm_event_wait_try_grab(struct vm_event_domain *ved, int *rc)
+{
+ *rc = vm_event_grab_slot(ved, 0);
+ return *rc;
+}
+
+/* Call vm_event_grab_slot() until the ring doesn't exist, or is available. */
+static int vm_event_wait_slot(struct vm_event_domain *ved)
+{
+ int rc = -EBUSY;
+ wait_event(ved->wq, vm_event_wait_try_grab(ved, &rc) != -EBUSY);
+ return rc;
+}
+
+bool_t vm_event_check_ring(struct vm_event_domain *ved)
+{
+ return (ved->ring_page != NULL);
+}
+
+/*
+ * Determines whether or not the current vCPU belongs to the target domain,
+ * and calls the appropriate wait function. If it is a guest vCPU, then we
+ * use vm_event_wait_slot() to reserve a slot. As long as there is a ring,
+ * this function will always return 0 for a guest. For a non-guest, we check
+ * for space and return -EBUSY if the ring is not available.
+ *
+ * Return codes: -ENOSYS: the ring is not yet configured
+ * -EBUSY: the ring is busy
+ * 0: a spot has been reserved
+ */
+int __vm_event_claim_slot(struct domain *d, struct vm_event_domain *ved,
+ bool_t allow_sleep)
+{
+ if ( (current->domain == d) && allow_sleep )
+ return vm_event_wait_slot(ved);
+ else
+ return vm_event_grab_slot(ved, (current->domain != d));
+}
+
+#ifdef HAS_MEM_PAGING
+/* Registered with Xen-bound event channel for incoming notifications. */
+static void mem_paging_notification(struct vcpu *v, unsigned int port)
+{
+ if ( likely(v->domain->vm_event->paging.ring_page != NULL) )
+ vm_event_resume(v->domain, &v->domain->vm_event->paging);
+}
+#endif
+
+/* Registered with Xen-bound event channel for incoming notifications. */
+static void monitor_notification(struct vcpu *v, unsigned int port)
+{
+ if ( likely(v->domain->vm_event->monitor.ring_page != NULL) )
+ vm_event_resume(v->domain, &v->domain->vm_event->monitor);
+}
+
+#ifdef HAS_MEM_SHARING
+/* Registered with Xen-bound event channel for incoming notifications. */
+static void mem_sharing_notification(struct vcpu *v, unsigned int port)
+{
+ if ( likely(v->domain->vm_event->share.ring_page != NULL) )
+ vm_event_resume(v->domain, &v->domain->vm_event->share);
+}
+#endif
+
+/* Clean up on domain destruction */
+void vm_event_cleanup(struct domain *d)
+{
+#ifdef HAS_MEM_PAGING
+ if ( d->vm_event->paging.ring_page )
+ {
+ /* Destroying the wait queue head means waking up all
+ * queued vcpus. This will drain the list, allowing
+ * the disable routine to complete. It will also drop
+ * all domain refs the wait-queued vcpus are holding.
+ * Finally, because this code path involves previously
+ * pausing the domain (domain_kill), unpausing the
+ * vcpus causes no harm. */
+ destroy_waitqueue_head(&d->vm_event->paging.wq);
+ (void)vm_event_disable(d, &d->vm_event->paging);
+ }
+#endif
+ if ( d->vm_event->monitor.ring_page )
+ {
+ destroy_waitqueue_head(&d->vm_event->monitor.wq);
+ (void)vm_event_disable(d, &d->vm_event->monitor);
+ }
+#ifdef HAS_MEM_SHARING
+ if ( d->vm_event->share.ring_page )
+ {
+ destroy_waitqueue_head(&d->vm_event->share.wq);
+ (void)vm_event_disable(d, &d->vm_event->share);
+ }
+#endif
+}
+
+int vm_event_domctl(struct domain *d, xen_domctl_vm_event_op_t *vec,
+ XEN_GUEST_HANDLE_PARAM(void) u_domctl)
+{
+ int rc;
+
+ rc = xsm_vm_event_control(XSM_PRIV, d, vec->mode, vec->op);
+ if ( rc )
+ return rc;
+
+ if ( unlikely(d == current->domain) )
+ {
+ gdprintk(XENLOG_INFO, "Tried to do a memory event op on itself.\n");
+ return -EINVAL;
+ }
+
+ if ( unlikely(d->is_dying) )
+ {
+ gdprintk(XENLOG_INFO, "Ignoring memory event op on dying domain %u\n",
+ d->domain_id);
+ return 0;
+ }
+
+ if ( unlikely(d->vcpu == NULL) || unlikely(d->vcpu[0] == NULL) )
+ {
+ gdprintk(XENLOG_INFO,
+ "Memory event op on a domain (%u) with no vcpus\n",
+ d->domain_id);
+ return -EINVAL;
+ }
+
+ rc = -ENOSYS;
+
+ switch ( vec->mode )
+ {
+#ifdef HAS_MEM_PAGING
+ case XEN_DOMCTL_VM_EVENT_OP_PAGING:
+ {
+ struct vm_event_domain *ved = &d->vm_event->paging;
+ rc = -EINVAL;
+
+ switch( vec->op )
+ {
+ case XEN_VM_EVENT_ENABLE:
+ {
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+ rc = -EOPNOTSUPP;
+ /* pvh fixme: p2m_is_foreign types need addressing */
+ if ( is_pvh_vcpu(current) || is_pvh_domain(hardware_domain) )
+ break;
+
+ rc = -ENODEV;
+ /* Only HAP is supported */
+ if ( !hap_enabled(d) )
+ break;
+
+ /* No paging if iommu is used */
+ rc = -EMLINK;
+ if ( unlikely(need_iommu(d)) )
+ break;
+
+ rc = -EXDEV;
+ /* Disallow paging in a PoD guest */
+ if ( p2m->pod.entry_count )
+ break;
+
+ rc = vm_event_enable(d, vec, ved, _VPF_mem_paging,
+ HVM_PARAM_PAGING_RING_PFN,
+ mem_paging_notification);
+ }
+ break;
+
+ case XEN_VM_EVENT_DISABLE:
+ if ( ved->ring_page )
+ rc = vm_event_disable(d, ved);
+ break;
+
+ case XEN_VM_EVENT_RESUME:
+ if ( ved->ring_page )
+ vm_event_resume(d, ved);
+ else
+ rc = -ENODEV;
+ break;
+
+ default:
+ rc = -ENOSYS;
+ break;
+ }
+ }
+ break;
+#endif
+
+ case XEN_DOMCTL_VM_EVENT_OP_MONITOR:
+ {
+ struct vm_event_domain *ved = &d->vm_event->monitor;
+ rc = -EINVAL;
+
+ switch( vec->op )
+ {
+ case XEN_VM_EVENT_ENABLE:
+ rc = vm_event_enable(d, vec, ved, _VPF_mem_access,
+ HVM_PARAM_MONITOR_RING_PFN,
+ monitor_notification);
+ break;
+
+ case XEN_VM_EVENT_DISABLE:
+ if ( ved->ring_page )
+ rc = vm_event_disable(d, ved);
+ break;
+
+ case XEN_VM_EVENT_RESUME:
+ if ( ved->ring_page )
+ vm_event_resume(d, ved);
+ else
+ rc = -ENODEV;
+ break;
+
+ default:
+ rc = -ENOSYS;
+ break;
+ }
+ }
+ break;
+
+#ifdef HAS_MEM_SHARING
+ case XEN_DOMCTL_VM_EVENT_OP_SHARING:
+ {
+ struct vm_event_domain *ved = &d->vm_event->share;
+ rc = -EINVAL;
+
+ switch( vec->op )
+ {
+ case XEN_VM_EVENT_ENABLE:
+ rc = -EOPNOTSUPP;
+ /* pvh fixme: p2m_is_foreign types need addressing */
+ if ( is_pvh_vcpu(current) || is_pvh_domain(hardware_domain) )
+ break;
+
+ rc = -ENODEV;
+ /* Only HAP is supported */
+ if ( !hap_enabled(d) )
+ break;
+
+ rc = vm_event_enable(d, vec, ved, _VPF_mem_sharing,
+ HVM_PARAM_SHARING_RING_PFN,
+ mem_sharing_notification);
+ break;
+
+ case XEN_VM_EVENT_DISABLE:
+ if ( ved->ring_page )
+ rc = vm_event_disable(d, ved);
+ break;
+
+ case XEN_VM_EVENT_RESUME:
+ if ( ved->ring_page )
+ vm_event_resume(d, ved);
+ else
+ rc = -ENODEV;
+ break;
+
+ default:
+ rc = -ENOSYS;
+ break;
+ }
+ }
+ break;
+#endif
+
+ default:
+ rc = -ENOSYS;
+ }
+
+ return rc;
+}
+
+void vm_event_vcpu_pause(struct vcpu *v)
+{
+ ASSERT(v == current);
+
+ atomic_inc(&v->vm_event_pause_count);
+ vcpu_pause_nosync(v);
+}
+
+void vm_event_vcpu_unpause(struct vcpu *v)
+{
+ int old, new, prev = v->vm_event_pause_count.counter;
+
+    /* All unpause requests come as a result of toolstack responses.
+     * Prevent underflow of the vcpu pause count. */
+ do
+ {
+ old = prev;
+ new = old - 1;
+
+ if ( new < 0 )
+ {
+ printk(XENLOG_G_WARNING
+ "%pv vm_event: Too many unpause attempts\n", v);
+ return;
+ }
+
+ prev = cmpxchg(&v->vm_event_pause_count.counter, old, new);
+ } while ( prev != old );
+
+ vcpu_unpause(v);
+}
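+
+/*
+ * Illustrative trace: with a pause count of 1, the loop above reads old = 1,
+ * installs 0 and unpauses. A racing second responder then reads prev = 0,
+ * computes new = -1 and returns with the warning instead of underflowing
+ * the counter.
+ */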
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/common/vmap.c b/xen/common/vmap.c
index 783cea3..c57239f 100644
--- a/xen/common/vmap.c
+++ b/xen/common/vmap.c
@@ -40,7 +40,7 @@ void __init vm_init(void)
bitmap_fill(vm_bitmap, vm_low);
/* Populate page tables for the bitmap if necessary. */
- map_pages_to_xen(va, 0, vm_low - nr, MAP_SMALL_PAGES);
+ populate_pt_range(va, 0, vm_low - nr);
}
void *vm_alloc(unsigned int nr, unsigned int align)
@@ -181,7 +181,7 @@ void vm_free(const void *va)
spin_unlock(&vm_lock);
}
-void *__vmap(const unsigned long *mfn, unsigned int granularity,
+void *__vmap(const mfn_t *mfn, unsigned int granularity,
unsigned int nr, unsigned int align, unsigned int flags)
{
void *va = vm_alloc(nr * granularity, align);
@@ -189,7 +189,7 @@ void *__vmap(const unsigned long *mfn, unsigned int granularity,
for ( ; va && nr--; ++mfn, cur += PAGE_SIZE * granularity )
{
- if ( map_pages_to_xen(cur, *mfn, granularity, flags) )
+ if ( map_pages_to_xen(cur, mfn_x(*mfn), granularity, flags) )
{
vunmap(va);
va = NULL;
@@ -199,7 +199,7 @@ void *__vmap(const unsigned long *mfn, unsigned int granularity,
return va;
}
-void *vmap(const unsigned long *mfn, unsigned int nr)
+void *vmap(const mfn_t *mfn, unsigned int nr)
{
return __vmap(mfn, 1, nr, 1, PAGE_HYPERVISOR);
}
@@ -215,4 +215,75 @@ void vunmap(const void *va)
#endif
vm_free(va);
}
+
+void *vmalloc(size_t size)
+{
+ mfn_t *mfn;
+ size_t pages, i;
+ struct page_info *pg;
+ void *va;
+
+ ASSERT(size);
+
+ pages = PFN_UP(size);
+ mfn = xmalloc_array(mfn_t, pages);
+ if ( mfn == NULL )
+ return NULL;
+
+ for ( i = 0; i < pages; i++ )
+ {
+ pg = alloc_domheap_page(NULL, 0);
+ if ( pg == NULL )
+ goto error;
+ mfn[i] = _mfn(page_to_mfn(pg));
+ }
+
+ va = vmap(mfn, pages);
+ if ( va == NULL )
+ goto error;
+
+ xfree(mfn);
+ return va;
+
+ error:
+ while ( i-- )
+ free_domheap_page(mfn_to_page(mfn_x(mfn[i])));
+ xfree(mfn);
+ return NULL;
+}
+
+void *vzalloc(size_t size)
+{
+ void *p = vmalloc(size);
+    size_t i;
+
+ if ( p == NULL )
+ return NULL;
+
+ for ( i = 0; i < size; i += PAGE_SIZE )
+ clear_page(p + i);
+
+ return p;
+}
+
+void vfree(void *va)
+{
+ unsigned int i, pages;
+ struct page_info *pg;
+ PAGE_LIST_HEAD(pg_list);
+
+ if ( !va )
+ return;
+
+ pages = vm_size(va);
+ ASSERT(pages);
+
+ for ( i = 0; i < pages; i++ )
+ page_list_add(vmap_to_page(va + i * PAGE_SIZE), &pg_list);
+
+ vunmap(va);
+
+ while ( (pg = page_list_remove_head(&pg_list)) != NULL )
+ free_domheap_page(pg);
+}
#endif
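Aside (illustrative usage, not part of the patch): vmalloc() returns a
page-granular, virtually contiguous region backed by individually
allocated domheap pages; vfree() both unmaps the region and frees the
pages. Sizes round up to whole pages:

    void *buf = vmalloc(3 * PAGE_SIZE + 1);   /* maps 4 pages */

    if ( buf )
    {
        /* ... use buf ... */
        vfree(buf);
    }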
diff --git a/xen/common/vsprintf.c b/xen/common/vsprintf.c
index 065cc42..51b5e4e 100644
--- a/xen/common/vsprintf.c
+++ b/xen/common/vsprintf.c
@@ -336,9 +336,14 @@ static char *pointer(char *str, char *end, const char **fmt_ptr,
const struct vcpu *v = arg;
++*fmt_ptr;
- if ( str < end )
- *str = 'd';
- str = number(str + 1, end, v->domain->domain_id, 10, -1, -1, 0);
+ if ( unlikely(v->domain->domain_id == DOMID_IDLE) )
+ str = string(str, end, "IDLE", -1, -1, 0);
+ else
+ {
+ if ( str < end )
+ *str = 'd';
+ str = number(str + 1, end, v->domain->domain_id, 10, -1, -1, 0);
+ }
if ( str < end )
*str = 'v';
return number(str + 1, end, v->vcpu_id, 10, -1, -1, 0);
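Aside (illustrative): %pv is the custom vsprintf specifier that prints a
vcpu as d<domid>v<vcpuid>; with the change above, idle vcpus render as
IDLEv<n> rather than exposing the internal idle domain id:

    printk("now running %pv\n", current);   /* e.g. "d7v1", or "IDLEv0" */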
diff --git a/xen/common/wait.c b/xen/common/wait.c
index 1f6b597..4ac98c0 100644
--- a/xen/common/wait.c
+++ b/xen/common/wait.c
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/common/xencomm.c b/xen/common/xencomm.c
deleted file mode 100644
index 2604ac0..0000000
--- a/xen/common/xencomm.c
+++ /dev/null
@@ -1,621 +0,0 @@
-/******************************************************************************
- * xencomm.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- * Copyright (C) IBM Corp. 2006
- *
- * Authors: Hollis Blanchard <hollisb at us.ibm.com>
- * Tristan Gingold <tristan.gingold at bull.net>
- * Isaku Yamahata <yamahata at valinux.co.jp> multiple page support
- */
-
-#include <xen/config.h>
-#include <xen/mm.h>
-#include <xen/sched.h>
-#include <xen/xencomm.h>
-#include <public/xen.h>
-#include <public/xencomm.h>
-
-#undef DEBUG
-#ifdef DEBUG
-#define xc_dprintk(f, a...) printk("[xencomm]" f , ## a)
-#else
-#define xc_dprintk(f, a...) ((void)0)
-#endif
-
-static void *
-xencomm_vaddr(unsigned long paddr, struct page_info *page)
-{
- return (void*)((paddr & ~PAGE_MASK) | (unsigned long)page_to_virt(page));
-}
-
-/* get_page() to prevent another vcpu freeing the page. */
-static int
-xencomm_get_page(unsigned long paddr, struct page_info **page)
-{
- unsigned long maddr = paddr_to_maddr(paddr);
- if ( maddr == 0 )
- return -EFAULT;
-
- *page = maddr_to_page(maddr);
- if ( !get_page(*page, current->domain) )
- {
- /*
- * This page might be a page granted by another domain, or this page
- * is freed with decrease reservation hypercall at the same time.
- */
- gdprintk(XENLOG_WARNING,
- "bad page is passed. paddr %#lx maddr %#lx\n",
- paddr, maddr);
- return -EFAULT;
- }
-
- return 0;
-}
-
-/* check if struct desc doesn't cross page boundry */
-static int
-xencomm_desc_cross_page_boundary(unsigned long paddr)
-{
- unsigned long offset = paddr & ~PAGE_MASK;
- if ( offset > PAGE_SIZE - sizeof(struct xencomm_desc) )
- return 1;
- return 0;
-}
-
-struct xencomm_ctxt {
- struct xencomm_desc __user *desc_in_paddr;
- uint32_t nr_addrs;
-
- struct page_info *page;
- unsigned long *address;
-};
-
-static uint32_t
-xencomm_ctxt_nr_addrs(const struct xencomm_ctxt *ctxt)
-{
- return ctxt->nr_addrs;
-}
-
-static unsigned long*
-xencomm_ctxt_address(struct xencomm_ctxt *ctxt)
-{
- return ctxt->address;
-}
-
-static int
-xencomm_ctxt_init(const void *handle, struct xencomm_ctxt *ctxt)
-{
- struct page_info *page;
- struct xencomm_desc *desc;
- int ret;
-
- /* Avoid unaligned access. */
- if ( ((unsigned long)handle % __alignof__(*desc)) != 0 )
- return -EINVAL;
- if ( xencomm_desc_cross_page_boundary((unsigned long)handle) )
- return -EINVAL;
-
- /* First we need to access the descriptor. */
- ret = xencomm_get_page((unsigned long)handle, &page);
- if ( ret )
- return ret;
-
- desc = xencomm_vaddr((unsigned long)handle, page);
- if ( desc->magic != XENCOMM_MAGIC )
- {
- printk("%s: error: %p magic was %#x\n", __func__, desc, desc->magic);
- put_page(page);
- return -EINVAL;
- }
-
- /* Copy before use: It is possible for a guest to modify concurrently. */
- ctxt->nr_addrs = desc->nr_addrs;
- ctxt->desc_in_paddr = (struct xencomm_desc*)handle;
- ctxt->page = page;
- ctxt->address = &desc->address[0];
- return 0;
-}
-
-/*
- * Calculate the vaddr of &ctxt->desc_in_paddr->address[i] and get_page().
- * And put the results in ctxt->page and ctxt->address.
- * If there is the previous page, put_page().
- *
- * A guest domain passes the array, ctxt->desc_in_paddr->address[].
- * It is gpaddr-contiguous, but not maddr-contiguous so that
- * we can't obtain the vaddr by simple offsetting.
- * We need to convert gpaddr, &ctxt->desc_in_paddr->address[i],
- * into maddr and then convert it to the xen virtual address in order
- * to access there.
- * The conversion can be optimized out by using the last result of
- * ctxt->address because we access the array sequentially.
- * The conversion, gpaddr -> maddr -> vaddr, is necessary only when
- * crossing page boundary.
- */
-static int
-xencomm_ctxt_next(struct xencomm_ctxt *ctxt, int i)
-{
- unsigned long paddr;
- struct page_info *page;
- int ret;
-
- BUG_ON(i >= ctxt->nr_addrs);
-
- /* For i == 0 case we already calculated it in xencomm_ctxt_init(). */
- if ( i != 0 )
- ctxt->address++;
-
- if ( ((unsigned long)ctxt->address & ~PAGE_MASK) != 0 )
- return 0;
-
- /* Crossing page boundary: machine address must be calculated. */
- paddr = (unsigned long)&ctxt->desc_in_paddr->address[i];
- ret = xencomm_get_page(paddr, &page);
- if ( ret )
- return ret;
-
- put_page(ctxt->page);
- ctxt->page = page;
- ctxt->address = xencomm_vaddr(paddr, page);
-
- return 0;
-}
-
-static void
-xencomm_ctxt_done(struct xencomm_ctxt *ctxt)
-{
- put_page(ctxt->page);
-}
-
-static int
-xencomm_copy_chunk_from(
- unsigned long to, unsigned long paddr, unsigned int len)
-{
- struct page_info *page;
- int res;
-
- do {
- res = xencomm_get_page(paddr, &page);
- } while ( res == -EAGAIN );
-
- if ( res )
- return res;
-
- xc_dprintk("%lx[%d] -> %lx\n",
- (unsigned long)xencomm_vaddr(paddr, page), len, to);
-
- memcpy((void *)to, xencomm_vaddr(paddr, page), len);
- put_page(page);
-
- return 0;
-}
-
-static unsigned long
-xencomm_inline_from_guest(
- void *to, const void *from, unsigned int n, unsigned int skip)
-{
- unsigned long src_paddr = xencomm_inline_addr(from) + skip;
-
- while ( n > 0 )
- {
- unsigned int chunksz, bytes;
-
- chunksz = PAGE_SIZE - (src_paddr % PAGE_SIZE);
- bytes = min(chunksz, n);
-
- if ( xencomm_copy_chunk_from((unsigned long)to, src_paddr, bytes) )
- return n;
- src_paddr += bytes;
- to += bytes;
- n -= bytes;
- }
-
- /* Always successful. */
- return 0;
-}
-
-/**
- * xencomm_copy_from_guest: Copy a block of data from domain space.
- * @to: Machine address.
- * @from: Physical address to a xencomm buffer descriptor.
- * @n: Number of bytes to copy.
- * @skip: Number of bytes from the start to skip.
- *
- * Copy data from domain to hypervisor.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-unsigned long
-xencomm_copy_from_guest(
- void *to, const void *from, unsigned int n, unsigned int skip)
-{
- struct xencomm_ctxt ctxt;
- unsigned int from_pos = 0;
- unsigned int to_pos = 0;
- unsigned int i = 0;
-
- if ( xencomm_is_inline(from) )
- return xencomm_inline_from_guest(to, from, n, skip);
-
- if ( xencomm_ctxt_init(from, &ctxt) )
- return n;
-
- /* Iterate through the descriptor, copying up to a page at a time */
- while ( (to_pos < n) && (i < xencomm_ctxt_nr_addrs(&ctxt)) )
- {
- unsigned long src_paddr;
- unsigned int pgoffset, chunksz, chunk_skip;
-
- if ( xencomm_ctxt_next(&ctxt, i) )
- goto out;
- src_paddr = *xencomm_ctxt_address(&ctxt);
- if ( src_paddr == XENCOMM_INVALID )
- {
- i++;
- continue;
- }
-
- pgoffset = src_paddr % PAGE_SIZE;
- chunksz = PAGE_SIZE - pgoffset;
-
- chunk_skip = min(chunksz, skip);
- from_pos += chunk_skip;
- chunksz -= chunk_skip;
- skip -= chunk_skip;
-
- if ( skip == 0 && chunksz > 0 )
- {
- unsigned int bytes = min(chunksz, n - to_pos);
-
- if ( xencomm_copy_chunk_from((unsigned long)to + to_pos,
- src_paddr + chunk_skip, bytes) )
- goto out;
- from_pos += bytes;
- to_pos += bytes;
- }
-
- i++;
- }
-
-out:
- xencomm_ctxt_done(&ctxt);
- return n - to_pos;
-}
-
-static int
-xencomm_copy_chunk_to(
- unsigned long paddr, unsigned long from, unsigned int len)
-{
- struct page_info *page;
- int res;
-
- do {
- res = xencomm_get_page(paddr, &page);
- } while ( res == -EAGAIN );
-
- if ( res )
- return res;
-
- xc_dprintk("%lx[%d] -> %lx\n", from, len,
- (unsigned long)xencomm_vaddr(paddr, page));
-
- memcpy(xencomm_vaddr(paddr, page), (void *)from, len);
- xencomm_mark_dirty((unsigned long)xencomm_vaddr(paddr, page), len);
- put_page(page);
-
- return 0;
-}
-
-static unsigned long
-xencomm_inline_to_guest(
- void *to, const void *from, unsigned int n, unsigned int skip)
-{
- unsigned long dest_paddr = xencomm_inline_addr(to) + skip;
-
- while ( n > 0 )
- {
- unsigned int chunksz, bytes;
-
- chunksz = PAGE_SIZE - (dest_paddr % PAGE_SIZE);
- bytes = min(chunksz, n);
-
- if ( xencomm_copy_chunk_to(dest_paddr, (unsigned long)from, bytes) )
- return n;
- dest_paddr += bytes;
- from += bytes;
- n -= bytes;
- }
-
- /* Always successful. */
- return 0;
-}
-
-/**
- * xencomm_copy_to_guest: Copy a block of data to domain space.
- * @to: Physical address to xencomm buffer descriptor.
- * @from: Machine address.
- * @n: Number of bytes to copy.
- * @skip: Number of bytes from the start to skip.
- *
- * Copy data from hypervisor to domain.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-unsigned long
-xencomm_copy_to_guest(
- void *to, const void *from, unsigned int n, unsigned int skip)
-{
- struct xencomm_ctxt ctxt;
- unsigned int from_pos = 0;
- unsigned int to_pos = 0;
- unsigned int i = 0;
-
- if ( xencomm_is_inline(to) )
- return xencomm_inline_to_guest(to, from, n, skip);
-
- if ( xencomm_ctxt_init(to, &ctxt) )
- return n;
-
- /* Iterate through the descriptor, copying up to a page at a time */
- while ( (from_pos < n) && (i < xencomm_ctxt_nr_addrs(&ctxt)) )
- {
- unsigned long dest_paddr;
- unsigned int pgoffset, chunksz, chunk_skip;
-
- if ( xencomm_ctxt_next(&ctxt, i) )
- goto out;
- dest_paddr = *xencomm_ctxt_address(&ctxt);
- if ( dest_paddr == XENCOMM_INVALID )
- {
- i++;
- continue;
- }
-
- pgoffset = dest_paddr % PAGE_SIZE;
- chunksz = PAGE_SIZE - pgoffset;
-
- chunk_skip = min(chunksz, skip);
- to_pos += chunk_skip;
- chunksz -= chunk_skip;
- skip -= chunk_skip;
-
- if ( skip == 0 && chunksz > 0 )
- {
- unsigned int bytes = min(chunksz, n - from_pos);
-
- if ( xencomm_copy_chunk_to(dest_paddr + chunk_skip,
- (unsigned long)from + from_pos, bytes) )
- goto out;
- from_pos += bytes;
- to_pos += bytes;
- }
-
- i++;
- }
-
-out:
- xencomm_ctxt_done(&ctxt);
- return n - from_pos;
-}
-
-static int
-xencomm_clear_chunk(
- unsigned long paddr, unsigned int len)
-{
- struct page_info *page;
- int res;
-
- do {
- res = xencomm_get_page(paddr, &page);
- } while ( res == -EAGAIN );
-
- if ( res )
- return res;
-
- memset(xencomm_vaddr(paddr, page), 0x00, len);
- xencomm_mark_dirty((unsigned long)xencomm_vaddr(paddr, page), len);
- put_page(page);
-
- return 0;
-}
-
-static unsigned long
-xencomm_inline_clear_guest(
- void *to, unsigned int n, unsigned int skip)
-{
- unsigned long dest_paddr = xencomm_inline_addr(to) + skip;
-
- while ( n > 0 )
- {
- unsigned int chunksz, bytes;
-
- chunksz = PAGE_SIZE - (dest_paddr % PAGE_SIZE);
- bytes = min(chunksz, n);
-
- if ( xencomm_clear_chunk(dest_paddr, bytes) )
- return n;
- dest_paddr += bytes;
- n -= bytes;
- }
-
- /* Always successful. */
- return 0;
-}
-
-/**
- * xencomm_clear_guest: Clear a block of data in domain space.
- * @to: Physical address to xencomm buffer descriptor.
- * @n: Number of bytes to copy.
- * @skip: Number of bytes from the start to skip.
- *
- * Clear domain data
- *
- * Returns number of bytes that could not be cleared
- * On success, this will be zero.
- */
-unsigned long
-xencomm_clear_guest(
- void *to, unsigned int n, unsigned int skip)
-{
- struct xencomm_ctxt ctxt;
- unsigned int from_pos = 0;
- unsigned int to_pos = 0;
- unsigned int i = 0;
-
- if ( xencomm_is_inline(to) )
- return xencomm_inline_clear_guest(to, n, skip);
-
- if ( xencomm_ctxt_init(to, &ctxt) )
- return n;
-
- /* Iterate through the descriptor, copying up to a page at a time */
- while ( (from_pos < n) && (i < xencomm_ctxt_nr_addrs(&ctxt)) )
- {
- unsigned long dest_paddr;
- unsigned int pgoffset, chunksz, chunk_skip;
-
- if ( xencomm_ctxt_next(&ctxt, i) )
- goto out;
- dest_paddr = *xencomm_ctxt_address(&ctxt);
- if ( dest_paddr == XENCOMM_INVALID )
- {
- i++;
- continue;
- }
-
- pgoffset = dest_paddr % PAGE_SIZE;
- chunksz = PAGE_SIZE - pgoffset;
-
- chunk_skip = min(chunksz, skip);
- to_pos += chunk_skip;
- chunksz -= chunk_skip;
- skip -= chunk_skip;
-
- if ( skip == 0 && chunksz > 0 )
- {
- unsigned int bytes = min(chunksz, n - from_pos);
-
- if ( xencomm_clear_chunk(dest_paddr + chunk_skip, bytes) )
- goto out;
- from_pos += bytes;
- to_pos += bytes;
- }
-
- i++;
- }
-
-out:
- xencomm_ctxt_done(&ctxt);
- return n - from_pos;
-}
-
-static int xencomm_inline_add_offset(void **handle, unsigned int bytes)
-{
- *handle += bytes;
- return 0;
-}
-
-/* Offset page addresses in 'handle' to skip 'bytes' bytes. Set completely
- * exhausted pages to XENCOMM_INVALID. */
-int xencomm_add_offset(void **handle, unsigned int bytes)
-{
- struct xencomm_ctxt ctxt;
- int i = 0;
- int res = 0;
-
- if ( xencomm_is_inline(*handle) )
- return xencomm_inline_add_offset(handle, bytes);
-
- res = xencomm_ctxt_init(handle, &ctxt);
- if ( res != 0 )
- return res;
-
- /* Iterate through the descriptor incrementing addresses */
- while ( (bytes > 0) && (i < xencomm_ctxt_nr_addrs(&ctxt)) )
- {
- unsigned long *address;
- unsigned long dest_paddr;
- unsigned int pgoffset, chunksz, chunk_skip;
-
- res = xencomm_ctxt_next(&ctxt, i);
- if ( res )
- goto out;
- address = xencomm_ctxt_address(&ctxt);
- dest_paddr = *address;
- if ( dest_paddr == XENCOMM_INVALID )
- {
- i++;
- continue;
- }
-
- pgoffset = dest_paddr % PAGE_SIZE;
- chunksz = PAGE_SIZE - pgoffset;
-
- chunk_skip = min(chunksz, bytes);
- if ( chunk_skip == chunksz )
- *address = XENCOMM_INVALID; /* exhausted this page */
- else
- *address += chunk_skip;
- bytes -= chunk_skip;
-
- i++;
- }
-
-out:
- xencomm_ctxt_done(&ctxt);
- return res;
-}
-
-int xencomm_handle_is_null(void *handle)
-{
- struct xencomm_ctxt ctxt;
- int i;
- int res = 1;
-
- if ( xencomm_is_inline(handle) )
- return xencomm_inline_addr(handle) == 0;
-
- if ( xencomm_ctxt_init(handle, &ctxt) )
- return 1;
-
- for ( i = 0; i < xencomm_ctxt_nr_addrs(&ctxt); i++ )
- {
- if ( xencomm_ctxt_next(&ctxt, i) )
- goto out;
- if ( *xencomm_ctxt_address(&ctxt) != XENCOMM_INVALID )
- {
- res = 0;
- goto out;
- }
- }
-
-out:
- xencomm_ctxt_done(&ctxt);
- return res;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/common/xenoprof.c b/xen/common/xenoprof.c
index 3de20b8..1061323 100644
--- a/xen/common/xenoprof.c
+++ b/xen/common/xenoprof.c
@@ -219,7 +219,7 @@ static int alloc_xenoprof_struct(
bufsize = sizeof(struct xenoprof_buf);
i = sizeof(struct event_log);
#ifdef CONFIG_COMPAT
- d->xenoprof->is_compat = is_pv_32on64_domain(is_passive ? hardware_domain : d);
+ d->xenoprof->is_compat = is_pv_32bit_domain(is_passive ? hardware_domain : d);
if ( XENOPROF_COMPAT(d->xenoprof) )
{
bufsize = sizeof(struct compat_oprof_buf);
diff --git a/xen/common/xmalloc_tlsf.c b/xen/common/xmalloc_tlsf.c
index a5769c9..b13317e 100644
--- a/xen/common/xmalloc_tlsf.c
+++ b/xen/common/xmalloc_tlsf.c
@@ -138,9 +138,9 @@ static inline void MAPPING_SEARCH(unsigned long *r, int *fl, int *sl)
}
else
{
- t = (1 << (fls(*r) - 1 - MAX_LOG2_SLI)) - 1;
+ t = (1 << (flsl(*r) - 1 - MAX_LOG2_SLI)) - 1;
*r = *r + t;
- *fl = fls(*r) - 1;
+ *fl = flsl(*r) - 1;
*sl = (*r >> (*fl - MAX_LOG2_SLI)) - MAX_SLI;
*fl -= FLI_OFFSET;
/*if ((*fl -= FLI_OFFSET) < 0) // FL will be always >0!
@@ -164,7 +164,7 @@ static inline void MAPPING_INSERT(unsigned long r, int *fl, int *sl)
}
else
{
- *fl = fls(r) - 1;
+ *fl = flsl(r) - 1;
*sl = (r >> (*fl - MAX_LOG2_SLI)) - MAX_SLI;
*fl -= FLI_OFFSET;
}
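Aside (illustrative): fls() takes an int, so on 64-bit any size with bits
set at or above bit 31 would yield a wrong first-level TLSF index; flsl()
covers the full unsigned long. A standalone stand-in, assuming the
GCC/Clang __builtin_clzl builtin:

    static int flsl_demo(unsigned long x)
    {
        /* 1-based index of the highest set bit; 0 for x == 0. */
        return x ? (int)(8 * sizeof(x)) - __builtin_clzl(x) : 0;
    }

    /* flsl_demo(1) == 1, flsl_demo(1UL << 40) == 41 on LP64. */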
diff --git a/xen/common/xz/dec_lzma2.c b/xen/common/xz/dec_lzma2.c
index 779221d..44fe79b 100644
--- a/xen/common/xz/dec_lzma2.c
+++ b/xen/common/xz/dec_lzma2.c
@@ -1043,6 +1043,8 @@ XZ_EXTERN enum xz_ret INIT xz_dec_lzma2_run(struct xz_dec_lzma2 *s,
s->lzma2.sequence = SEQ_LZMA_PREPARE;
+ /* Fall through */
+
case SEQ_LZMA_PREPARE:
if (s->lzma2.compressed < RC_INIT_BYTES)
return XZ_DATA_ERROR;
@@ -1053,6 +1055,8 @@ XZ_EXTERN enum xz_ret INIT xz_dec_lzma2_run(struct xz_dec_lzma2 *s,
s->lzma2.compressed -= RC_INIT_BYTES;
s->lzma2.sequence = SEQ_LZMA_RUN;
+ /* Fall through */
+
case SEQ_LZMA_RUN:
/*
* Set dictionary limit to indicate how much we want
diff --git a/xen/drivers/acpi/apei/apei-base.c b/xen/drivers/acpi/apei/apei-base.c
index 43f8f98..6f81e7f 100644
--- a/xen/drivers/acpi/apei/apei-base.c
+++ b/xen/drivers/acpi/apei/apei-base.c
@@ -28,8 +28,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/kernel.h>
#include <xen/errno.h>
diff --git a/xen/drivers/acpi/apei/apei-io.c b/xen/drivers/acpi/apei/apei-io.c
index f0f4636..8955de9 100644
--- a/xen/drivers/acpi/apei/apei-io.c
+++ b/xen/drivers/acpi/apei/apei-io.c
@@ -15,8 +15,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/kernel.h>
diff --git a/xen/drivers/acpi/apei/erst.c b/xen/drivers/acpi/apei/erst.c
index 21593cf..8d721d0 100644
--- a/xen/drivers/acpi/apei/erst.c
+++ b/xen/drivers/acpi/apei/erst.c
@@ -22,8 +22,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/kernel.h>
diff --git a/xen/drivers/acpi/apei/hest.c b/xen/drivers/acpi/apei/hest.c
index b8790a6..f74e7c2 100644
--- a/xen/drivers/acpi/apei/hest.c
+++ b/xen/drivers/acpi/apei/hest.c
@@ -23,8 +23,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/errno.h>
diff --git a/xen/drivers/acpi/numa.c b/xen/drivers/acpi/numa.c
index 775537b..434194e 100644
--- a/xen/drivers/acpi/numa.c
+++ b/xen/drivers/acpi/numa.c
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
diff --git a/xen/drivers/acpi/osl.c b/xen/drivers/acpi/osl.c
index 93c983c..ce15470 100644
--- a/xen/drivers/acpi/osl.c
+++ b/xen/drivers/acpi/osl.c
@@ -18,8 +18,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
@@ -88,13 +87,14 @@ void __iomem *
acpi_os_map_memory(acpi_physical_address phys, acpi_size size)
{
if (system_state >= SYS_STATE_active) {
- unsigned long pfn = PFN_DOWN(phys);
+ mfn_t mfn = _mfn(PFN_DOWN(phys));
unsigned int offs = phys & (PAGE_SIZE - 1);
/* The low first Mb is always mapped. */
if ( !((phys + size - 1) >> 20) )
return __va(phys);
- return __vmap(&pfn, PFN_UP(offs + size), 1, 1, PAGE_HYPERVISOR_NOCACHE) + offs;
+ return __vmap(&mfn, PFN_UP(offs + size), 1, 1,
+ PAGE_HYPERVISOR_NOCACHE) + offs;
}
return __acpi_map_table(phys, size);
}
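
The hunk above switches __vmap() from a raw unsigned long PFN to the typedef-safe mfn_t, so machine and guest frame numbers can no longer be mixed silently. A minimal sketch of the typed-wrapper pattern, with hypothetical types mirroring Xen's _mfn()/mfn_x() accessors:

    #include <stdio.h>

    /* Wrap a raw frame number in a distinct struct type so the compiler
     * rejects accidental mixing of machine and guest frame numbers. */
    typedef struct { unsigned long m; } mfn_t;
    typedef struct { unsigned long g; } gfn_t;

    static mfn_t _mfn(unsigned long m)  { return (mfn_t){ m }; }
    static unsigned long mfn_x(mfn_t m) { return m.m; }

    static void map_machine_frame(mfn_t mfn)
    {
        printf("mapping machine frame %lu\n", mfn_x(mfn));
    }

    int main(void)
    {
        map_machine_frame(_mfn(42));               /* OK */
        /* map_machine_frame((gfn_t){ 42 });  -- would not compile */
        return 0;
    }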
diff --git a/xen/drivers/acpi/pmstat.c b/xen/drivers/acpi/pmstat.c
index daac2da..892260d 100644
--- a/xen/drivers/acpi/pmstat.c
+++ b/xen/drivers/acpi/pmstat.c
@@ -14,8 +14,7 @@
# more details.
#
# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc., 59
-# Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# this program; If not, see <http://www.gnu.org/licenses/>.
#
# The full GNU General Public License is included in this distribution in the
# file called LICENSE.
diff --git a/xen/drivers/acpi/tables.c b/xen/drivers/acpi/tables.c
index 1beca79..e57cf2a 100644
--- a/xen/drivers/acpi/tables.c
+++ b/xen/drivers/acpi/tables.c
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
diff --git a/xen/drivers/char/Makefile b/xen/drivers/char/Makefile
index 911b788..47fc3f9 100644
--- a/xen/drivers/char/Makefile
+++ b/xen/drivers/char/Makefile
@@ -1,8 +1,10 @@
obj-y += console.o
obj-$(HAS_NS16550) += ns16550.o
+obj-$(HAS_CADENCE_UART) += cadence-uart.o
obj-$(HAS_PL011) += pl011.o
obj-$(HAS_EXYNOS4210) += exynos4210-uart.o
obj-$(HAS_OMAP) += omap-uart.o
+obj-$(HAS_SCIF) += scif-uart.o
obj-$(HAS_EHCI) += ehci-dbgp.o
obj-$(CONFIG_ARM) += dt-uart.o
obj-y += serial.o
diff --git a/xen/drivers/char/cadence-uart.c b/xen/drivers/char/cadence-uart.c
new file mode 100644
index 0000000..933672f
--- /dev/null
+++ b/xen/drivers/char/cadence-uart.c
@@ -0,0 +1,224 @@
+/*
+ * xen/drivers/char/cadence-uart.c
+ *
+ * Driver for Cadence UART in Xilinx ZynqMP.
+ *
+ * Written by Edgar E. Iglesias <edgar.iglesias at gmail.com>
+ * Copyright (c) 2015 Xilinx Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <xen/config.h>
+#include <xen/console.h>
+#include <xen/serial.h>
+#include <xen/init.h>
+#include <xen/irq.h>
+#include <xen/device_tree.h>
+#include <xen/errno.h>
+#include <asm/device.h>
+#include <xen/mm.h>
+#include <xen/vmap.h>
+#include <asm/cadence-uart.h>
+#include <asm/io.h>
+
+static struct cuart {
+ unsigned int irq;
+ void __iomem *regs;
+ /* UART with IRQ line: interrupt-driven I/O. */
+ struct irqaction irqaction;
+ struct vuart_info vuart;
+} cuart_com = {0};
+
+#define cuart_read(uart, off) readl((uart)->regs + (off))
+#define cuart_write(uart, off, val) writel((val), (uart)->regs + (off))
+
+static void cuart_interrupt(int irq, void *data, struct cpu_user_regs *regs)
+{
+ struct serial_port *port = data;
+ struct cuart *uart = port->uart;
+ unsigned int status;
+
+ do {
+ status = cuart_read(uart, R_UART_SR);
+ /* ACK. */
+ if ( status & UART_SR_INTR_RTRIG )
+ {
+ serial_rx_interrupt(port, regs);
+ cuart_write(uart, R_UART_CISR, UART_SR_INTR_RTRIG);
+ }
+ } while ( status & UART_SR_INTR_RTRIG );
+}
+
+static void __init cuart_init_preirq(struct serial_port *port)
+{
+ struct cuart *uart = port->uart;
+
+ cuart_write(uart, R_UART_MR, UART_MR_NO_PARITY);
+ /* Enable and Reset both the RX and TX paths. */
+ cuart_write(uart, R_UART_CR, UART_CR_RX_RST | UART_CR_TX_RST |
+ UART_CR_RX_ENABLE | UART_CR_TX_ENABLE);
+}
+
+static void __init cuart_init_postirq(struct serial_port *port)
+{
+ struct cuart *uart = port->uart;
+ int rc;
+
+ if ( uart->irq > 0 )
+ {
+ uart->irqaction.handler = cuart_interrupt;
+ uart->irqaction.name = "cadence-uart";
+ uart->irqaction.dev_id = port;
+ if ( (rc = setup_irq(uart->irq, 0, &uart->irqaction)) != 0 )
+ printk("ERROR: Failed to allocate cadence-uart IRQ %d\n", uart->irq);
+ }
+
+ /* Clear pending error interrupts */
+ cuart_write(uart, R_UART_RTRIG, 1);
+ cuart_write(uart, R_UART_CISR, ~0);
+
+ /* Unmask interrupts */
+ cuart_write(uart, R_UART_IDR, ~0);
+ cuart_write(uart, R_UART_IER, UART_SR_INTR_RTRIG);
+}
+
+static void cuart_suspend(struct serial_port *port)
+{
+ BUG();
+}
+
+static void cuart_resume(struct serial_port *port)
+{
+ BUG();
+}
+
+static int cuart_tx_ready(struct serial_port *port)
+{
+ struct cuart *uart = port->uart;
+ unsigned int status = cuart_read(uart, R_UART_SR);
+
+ return !( status & UART_SR_INTR_TFUL );
+}
+
+static void cuart_putc(struct serial_port *port, char c)
+{
+ struct cuart *uart = port->uart;
+
+ cuart_write(uart, R_UART_TX, (uint32_t)(unsigned char)c);
+}
+
+static int cuart_getc(struct serial_port *port, char *pc)
+{
+ struct cuart *uart = port->uart;
+
+ if ( cuart_read(uart, R_UART_SR) & UART_SR_INTR_REMPTY )
+ return 0;
+
+ *pc = cuart_read(uart, R_UART_RX) & 0xff;
+ return 1;
+}
+
+static int __init cuart_irq(struct serial_port *port)
+{
+ struct cuart *uart = port->uart;
+
+ return ( (uart->irq > 0) ? uart->irq : -1 );
+}
+
+static const struct vuart_info *cuart_vuart(struct serial_port *port)
+{
+ struct cuart *uart = port->uart;
+
+ return &uart->vuart;
+}
+
+static struct uart_driver __read_mostly cuart_driver = {
+ .init_preirq = cuart_init_preirq,
+ .init_postirq = cuart_init_postirq,
+ .endboot = NULL,
+ .suspend = cuart_suspend,
+ .resume = cuart_resume,
+ .tx_ready = cuart_tx_ready,
+ .putc = cuart_putc,
+ .getc = cuart_getc,
+ .irq = cuart_irq,
+ .vuart_info = cuart_vuart,
+};
+
+static int __init cuart_init(struct dt_device_node *dev, const void *data)
+{
+ const char *config = data;
+ struct cuart *uart;
+ int res;
+ u64 addr, size;
+
+ if ( strcmp(config, "") )
+ printk("WARNING: UART configuration is not supported\n");
+
+ uart = &cuart_com;
+
+ res = dt_device_get_address(dev, 0, &addr, &size);
+ if ( res )
+ {
+ printk("cadence: Unable to retrieve the base"
+ " address of the UART\n");
+ return res;
+ }
+
+ res = platform_get_irq(dev, 0);
+ if ( res < 0 )
+ {
+ printk("cadence: Unable to retrieve the IRQ\n");
+ return -EINVAL;
+ }
+ uart->irq = res;
+
+ uart->regs = ioremap_nocache(addr, size);
+ if ( !uart->regs )
+ {
+ printk("cadence: Unable to map the UART memory\n");
+ return -ENOMEM;
+ }
+
+ uart->vuart.base_addr = addr;
+ uart->vuart.size = size;
+ uart->vuart.data_off = R_UART_RX;
+ uart->vuart.status_off = R_UART_SR;
+ uart->vuart.status = UART_SR_INTR_TEMPTY;
+
+ /* Register with generic serial driver. */
+ serial_register_uart(SERHND_DTUART, &cuart_driver, uart);
+
+ dt_device_set_used_by(dev, DOMID_XEN);
+
+ return 0;
+}
+
+static const struct dt_device_match cuart_dt_match[] __initconst =
+{
+ DT_MATCH_COMPATIBLE("cdns,uart-r1p8"),
+ { /* sentinel */ },
+};
+
+DT_DEVICE_START(cuart, "Cadence UART", DEVICE_SERIAL)
+ .dt_match = cuart_dt_match,
+ .init = cuart_init,
+DT_DEVICE_END
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
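
The new driver follows the usual pattern for this layer: fill a struct uart_driver vtable and hand it to the generic serial core, which uses tx_ready()/putc() for polled output while the IRQ handler drives RX. A minimal standalone sketch of the same vtable pattern, with hypothetical names standing in for Xen's serial core:

    #include <stdio.h>

    /* Generic serial core: drivers plug in via an ops table. */
    struct uart_ops {
        int  (*tx_ready)(void *hw);   /* non-zero when a char can be sent */
        void (*putc)(void *hw, char c);
    };

    static void serial_puts(const struct uart_ops *ops, void *hw, const char *s)
    {
        for (; *s; s++) {
            while (!ops->tx_ready(hw))
                ;                     /* poll until the FIFO has room */
            ops->putc(hw, *s);
        }
    }

    /* A trivial "UART" backed by stdout, standing in for MMIO registers. */
    static int  fake_tx_ready(void *hw) { (void)hw; return 1; }
    static void fake_putc(void *hw, char c) { (void)hw; putchar(c); }

    int main(void)
    {
        const struct uart_ops ops = { fake_tx_ready, fake_putc };

        serial_puts(&ops, NULL, "hello\n");
        return 0;
    }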
diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c
index 0b8d3d4..fce4cc8 100644
--- a/xen/drivers/char/console.c
+++ b/xen/drivers/char/console.c
@@ -1150,21 +1150,6 @@ void panic(const char *fmt, ...)
machine_restart(5000);
}
-void __bug(char *file, int line)
-{
- console_start_sync();
- printk("Xen BUG at %s:%d\n", file, line);
- dump_execution_state();
- panic("Xen BUG at %s:%d", file, line);
-}
-
-void __warn(char *file, int line)
-{
- printk("Xen WARN at %s:%d\n", file, line);
- dump_execution_state();
-}
-
-
/*
* **************************************************************
* ****************** Console suspend/resume ********************
diff --git a/xen/drivers/char/dt-uart.c b/xen/drivers/char/dt-uart.c
index 1197230..d599322 100644
--- a/xen/drivers/char/dt-uart.c
+++ b/xen/drivers/char/dt-uart.c
@@ -22,6 +22,7 @@
#include <xen/console.h>
#include <xen/device_tree.h>
#include <xen/serial.h>
+#include <xen/errno.h>
/*
* Configure UART port with a string:
@@ -41,9 +42,33 @@ void __init dt_uart_init(void)
const char *devpath = opt_dtuart;
char *options;
- if ( !console_has("dtuart") || !strcmp(opt_dtuart, "") )
+ if ( !console_has("dtuart") )
+ return; /* Not for us */
+
+ if ( !strcmp(opt_dtuart, "") )
{
- printk("No console\n");
+ const struct dt_device_node *chosen = dt_find_node_by_path("/chosen");
+
+ if ( chosen )
+ {
+ const char *stdout;
+
+ ret = dt_property_read_string(chosen, "stdout-path", &stdout);
+ if ( ret >= 0 )
+ {
+ printk("Taking dtuart configuration from /chosen/stdout-path\n");
+ if ( strlcpy(opt_dtuart, stdout, sizeof(opt_dtuart))
+ >= sizeof(opt_dtuart) )
+ printk("WARNING: /chosen/stdout-path too long, truncated\n");
+ }
+ else if ( ret != -EINVAL /* Not present */ )
+ printk("Failed to read /chosen/stdout-path (%d)\n", ret);
+ }
+ }
+
+ if ( !strcmp(opt_dtuart, "") )
+ {
+ printk("No dtuart path configured\n");
return;
}
@@ -53,7 +78,7 @@ void __init dt_uart_init(void)
else
options = "";
- printk("Looking for UART console %s\n", devpath);
+ printk("Looking for dtuart at \"%s\", options \"%s\"\n", devpath, options);
if ( *devpath == '/' )
dev = dt_find_node_by_path(devpath);
else
@@ -68,5 +93,15 @@ void __init dt_uart_init(void)
ret = device_init(dev, DEVICE_SERIAL, options);
if ( ret )
- printk("Unable to initialize serial: %d\n", ret);
+ printk("Unable to initialize dtuart: %d\n", ret);
}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
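
The stdout-path fallback above copies a device-tree string into the fixed-size opt_dtuart buffer and warns on truncation; strlcpy's return value (the length of the source string) makes that check a one-liner. A sketch of the idiom, with a local strlcpy since hosted C lacks one:

    #include <stdio.h>
    #include <string.h>

    /* BSD-style strlcpy: copies with truncation, returns strlen(src). */
    static size_t my_strlcpy(char *dst, const char *src, size_t size)
    {
        size_t len = strlen(src);

        if (size) {
            size_t n = len >= size ? size - 1 : len;

            memcpy(dst, src, n);
            dst[n] = '\0';
        }
        return len;
    }

    int main(void)
    {
        char opt[8];

        /* Hypothetical path, longer than the buffer on purpose. */
        if (my_strlcpy(opt, "/soc/serial@ff000000", sizeof(opt)) >= sizeof(opt))
            printf("WARNING: path too long, truncated to \"%s\"\n", opt);
        return 0;
    }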
diff --git a/xen/drivers/char/exynos4210-uart.c b/xen/drivers/char/exynos4210-uart.c
index cba8729..bac1c2b 100644
--- a/xen/drivers/char/exynos4210-uart.c
+++ b/xen/drivers/char/exynos4210-uart.c
@@ -352,14 +352,14 @@ static int __init exynos4210_uart_init(struct dt_device_node *dev,
return 0;
}
-static const char * const exynos4210_dt_compat[] __initconst =
+static const struct dt_device_match exynos4210_dt_match[] __initconst =
{
- "samsung,exynos4210-uart",
- NULL
+ DT_MATCH_COMPATIBLE("samsung,exynos4210-uart"),
+ { /* sentinel */ },
};
DT_DEVICE_START(exynos4210, "Exynos 4210 UART", DEVICE_SERIAL)
- .compatible = exynos4210_dt_compat,
+ .dt_match = exynos4210_dt_match,
.init = exynos4210_uart_init,
DT_DEVICE_END
diff --git a/xen/drivers/char/ns16550.c b/xen/drivers/char/ns16550.c
index 161b251..d443880 100644
--- a/xen/drivers/char/ns16550.c
+++ b/xen/drivers/char/ns16550.c
@@ -1185,16 +1185,16 @@ static int __init ns16550_uart_dt_init(struct dt_device_node *dev,
return 0;
}
-static const char * const ns16550_dt_compat[] __initconst =
+static const struct dt_device_match ns16550_dt_match[] __initconst =
{
- "ns16550",
- "ns16550a",
- "snps,dw-apb-uart",
- NULL
+ DT_MATCH_COMPATIBLE("ns16550"),
+ DT_MATCH_COMPATIBLE("ns16550a"),
+ DT_MATCH_COMPATIBLE("snps,dw-apb-uart"),
+ { /* sentinel */ },
};
DT_DEVICE_START(ns16550, "NS16550 UART", DEVICE_SERIAL)
- .compatible = ns16550_dt_compat,
+ .dt_match = ns16550_dt_match,
.init = ns16550_uart_dt_init,
DT_DEVICE_END
diff --git a/xen/drivers/char/omap-uart.c b/xen/drivers/char/omap-uart.c
index 16d1454..d8f64ea 100644
--- a/xen/drivers/char/omap-uart.c
+++ b/xen/drivers/char/omap-uart.c
@@ -350,14 +350,14 @@ static int __init omap_uart_init(struct dt_device_node *dev,
return 0;
}
-static const char * const omap_uart_dt_compat[] __initconst =
+static const struct dt_device_match omap_uart_dt_match[] __initconst =
{
- "ti,omap4-uart",
- NULL
+ DT_MATCH_COMPATIBLE("ti,omap4-uart"),
+ { /* sentinel */ },
};
DT_DEVICE_START(omap_uart, "OMAP UART", DEVICE_SERIAL)
- .compatible = omap_uart_dt_compat,
+ .dt_match = omap_uart_dt_match,
.init = omap_uart_init,
DT_DEVICE_END
diff --git a/xen/drivers/char/pl011.c b/xen/drivers/char/pl011.c
index 57274d9..67e6df5 100644
--- a/xen/drivers/char/pl011.c
+++ b/xen/drivers/char/pl011.c
@@ -285,14 +285,14 @@ static int __init pl011_uart_init(struct dt_device_node *dev,
return 0;
}
-static const char * const pl011_dt_compat[] __initconst =
+static const struct dt_device_match pl011_dt_match[] __initconst =
{
- "arm,pl011",
- NULL
+ DT_MATCH_COMPATIBLE("arm,pl011"),
+ { /* sentinel */ },
};
DT_DEVICE_START(pl011, "PL011 UART", DEVICE_SERIAL)
- .compatible = pl011_dt_compat,
+ .dt_match = pl011_dt_match,
.init = pl011_uart_init,
DT_DEVICE_END
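
The four driver conversions above all replace a NULL-terminated array of compatible strings with a sentinel-terminated dt_device_match table; matching then walks the table until an all-zero entry. A generic sketch of the sentinel-table pattern (struct and helper names hypothetical):

    #include <stdio.h>
    #include <string.h>

    struct dt_match { const char *compatible; };

    #define DT_MATCH_COMPATIBLE(s) { .compatible = (s) }

    static const struct dt_match uart_matches[] = {
        DT_MATCH_COMPATIBLE("ns16550"),
        DT_MATCH_COMPATIBLE("ns16550a"),
        { /* sentinel */ },
    };

    /* Walk the table until the all-zero sentinel entry. */
    static int dt_matches(const struct dt_match *m, const char *compat)
    {
        for (; m->compatible; m++)
            if (!strcmp(m->compatible, compat))
                return 1;
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n", dt_matches(uart_matches, "ns16550a"),
               dt_matches(uart_matches, "arm,pl011"));   /* prints "1 0" */
        return 0;
    }

Unlike a bare string array, the match struct leaves room for extra fields (type, data pointers) without changing every driver's table.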
diff --git a/xen/drivers/char/scif-uart.c b/xen/drivers/char/scif-uart.c
new file mode 100644
index 0000000..51a2233
--- /dev/null
+++ b/xen/drivers/char/scif-uart.c
@@ -0,0 +1,367 @@
+/*
+ * xen/drivers/char/scif-uart.c
+ *
+ * Driver for SCIF (Serial communication interface with FIFO)
+ * compatible UART.
+ *
+ * Oleksandr Tyshchenko <oleksandr.tyshchenko at globallogic.com>
+ * Copyright (C) 2014, Globallogic.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <xen/config.h>
+#include <xen/console.h>
+#include <xen/errno.h>
+#include <xen/serial.h>
+#include <xen/init.h>
+#include <xen/irq.h>
+#include <xen/mm.h>
+#include <xen/delay.h>
+#include <asm/device.h>
+#include <asm/scif-uart.h>
+#include <asm/io.h>
+
+#define PARITY_NONE 0
+#define PARITY_EVEN 1
+#define PARITY_ODD 2
+
+#define scif_readb(uart, off) readb((uart)->regs + (off))
+#define scif_writeb(uart, off, val) writeb((val), (uart)->regs + (off))
+
+#define scif_readw(uart, off) readw((uart)->regs + (off))
+#define scif_writew(uart, off, val) writew((val), (uart)->regs + (off))
+
+static struct scif_uart {
+ unsigned int baud, clock_hz, data_bits, parity, stop_bits;
+ unsigned int irq;
+ char __iomem *regs;
+ struct irqaction irqaction;
+ struct vuart_info vuart;
+} scif_com = {0};
+
+static void scif_uart_interrupt(int irq, void *data, struct cpu_user_regs *regs)
+{
+ struct serial_port *port = data;
+ struct scif_uart *uart = port->uart;
+ uint16_t status, ctrl;
+
+ ctrl = scif_readw(uart, SCIF_SCSCR);
+ status = scif_readw(uart, SCIF_SCFSR) & ~SCFSR_TEND;
+ /* Ignore next flag if TX Interrupt is disabled */
+ if ( !(ctrl & SCSCR_TIE) )
+ status &= ~SCFSR_TDFE;
+
+ while ( status != 0 )
+ {
+ /* TX Interrupt */
+ if ( status & SCFSR_TDFE )
+ serial_tx_interrupt(port, regs);
+
+ /* RX Interrupt */
+ if ( status & (SCFSR_RDF | SCFSR_DR) )
+ serial_rx_interrupt(port, regs);
+
+ /* Error Interrupt */
+ if ( status & SCIF_ERRORS )
+ scif_writew(uart, SCIF_SCFSR, ~SCIF_ERRORS);
+ if ( scif_readw(uart, SCIF_SCLSR) & SCLSR_ORER )
+ scif_writew(uart, SCIF_SCLSR, 0);
+
+ ctrl = scif_readw(uart, SCIF_SCSCR);
+ status = scif_readw(uart, SCIF_SCFSR) & ~SCFSR_TEND;
+ /* Ignore next flag if TX Interrupt is disabled */
+ if ( !(ctrl & SCSCR_TIE) )
+ status &= ~SCFSR_TDFE;
+ }
+}
+
+static void __init scif_uart_init_preirq(struct serial_port *port)
+{
+ struct scif_uart *uart = port->uart;
+ unsigned int divisor;
+ uint16_t val;
+
+ /*
+ * Wait until the last bit has been transmitted. This is needed for a
+ * smooth transition when we come from early printk.
+ */
+ while ( !(scif_readw(uart, SCIF_SCFSR) & SCFSR_TEND) );
+
+ /* Disable TX/RX parts and all interrupts */
+ scif_writew(uart, SCIF_SCSCR, 0);
+
+ /* Reset TX/RX FIFOs */
+ scif_writew(uart, SCIF_SCFCR, SCFCR_RFRST | SCFCR_TFRST);
+
+ /* Clear all errors and flags */
+ scif_readw(uart, SCIF_SCFSR);
+ scif_writew(uart, SCIF_SCFSR, 0);
+ scif_readw(uart, SCIF_SCLSR);
+ scif_writew(uart, SCIF_SCLSR, 0);
+
+ /* Select Baud rate generator output as a clock source */
+ scif_writew(uart, SCIF_SCSCR, SCSCR_CKE10);
+
+ /* Setup protocol format and Baud rate, select Asynchronous mode */
+ val = 0;
+ ASSERT( uart->data_bits >= 7 && uart->data_bits <= 8 );
+ if ( uart->data_bits == 7 )
+ val |= SCSMR_CHR;
+ else
+ val &= ~SCSMR_CHR;
+
+ ASSERT( uart->stop_bits >= 1 && uart->stop_bits <= 2 );
+ if ( uart->stop_bits == 2 )
+ val |= SCSMR_STOP;
+ else
+ val &= ~SCSMR_STOP;
+
+ ASSERT( uart->parity >= PARITY_NONE && uart->parity <= PARITY_ODD );
+ switch ( uart->parity )
+ {
+ case PARITY_NONE:
+ val &= ~SCSMR_PE;
+ break;
+
+ case PARITY_EVEN:
+ val |= SCSMR_PE;
+ break;
+
+ case PARITY_ODD:
+ val |= SCSMR_PE | SCSMR_ODD;
+ break;
+ }
+ scif_writew(uart, SCIF_SCSMR, val);
+
+ ASSERT( uart->clock_hz > 0 );
+ if ( uart->baud != BAUD_AUTO )
+ {
+ /* Setup desired Baud rate */
+ divisor = uart->clock_hz / (uart->baud << 4);
+ ASSERT( divisor >= 1 && divisor <= (uint16_t)UINT_MAX );
+ scif_writew(uart, SCIF_DL, (uint16_t)divisor);
+ /* Selects the frequency divided clock (SC_CLK external input) */
+ scif_writew(uart, SCIF_CKS, 0);
+ udelay(1000000 / uart->baud + 1);
+ }
+ else
+ {
+ /* Read current Baud rate */
+ divisor = scif_readw(uart, SCIF_DL);
+ ASSERT( divisor >= 1 && divisor <= (uint16_t)UINT_MAX );
+ uart->baud = uart->clock_hz / (divisor << 4);
+ }
+
+ /* Setup trigger level for TX/RX FIFOs */
+ scif_writew(uart, SCIF_SCFCR, SCFCR_RTRG11 | SCFCR_TTRG11);
+
+ /* Enable TX/RX parts */
+ scif_writew(uart, SCIF_SCSCR, scif_readw(uart, SCIF_SCSCR) |
+ SCSCR_TE | SCSCR_RE);
+}
+
+static void __init scif_uart_init_postirq(struct serial_port *port)
+{
+ struct scif_uart *uart = port->uart;
+ int rc;
+
+ uart->irqaction.handler = scif_uart_interrupt;
+ uart->irqaction.name = "scif_uart";
+ uart->irqaction.dev_id = port;
+
+ if ( (rc = setup_irq(uart->irq, 0, &uart->irqaction)) != 0 )
+ dprintk(XENLOG_ERR, "Failed to allocate scif_uart IRQ %d\n",
+ uart->irq);
+
+ /* Clear all errors */
+ if ( scif_readw(uart, SCIF_SCFSR) & SCIF_ERRORS )
+ scif_writew(uart, SCIF_SCFSR, ~SCIF_ERRORS);
+ if ( scif_readw(uart, SCIF_SCLSR) & SCLSR_ORER )
+ scif_writew(uart, SCIF_SCLSR, 0);
+
+ /* Enable TX/RX and Error Interrupts */
+ scif_writew(uart, SCIF_SCSCR, scif_readw(uart, SCIF_SCSCR) |
+ SCSCR_TIE | SCSCR_RIE | SCSCR_REIE);
+}
+
+static void scif_uart_suspend(struct serial_port *port)
+{
+ BUG();
+}
+
+static void scif_uart_resume(struct serial_port *port)
+{
+ BUG();
+}
+
+static int scif_uart_tx_ready(struct serial_port *port)
+{
+ struct scif_uart *uart = port->uart;
+ uint16_t cnt;
+
+ /* Check for empty space in TX FIFO */
+ if ( !(scif_readw(uart, SCIF_SCFSR) & SCFSR_TDFE) )
+ return 0;
+
+ /* Check number of data bytes stored in TX FIFO */
+ cnt = scif_readw(uart, SCIF_SCFDR) >> 8;
+ ASSERT( cnt >= 0 && cnt <= SCIF_FIFO_MAX_SIZE );
+
+ return (SCIF_FIFO_MAX_SIZE - cnt);
+}
+
+static void scif_uart_putc(struct serial_port *port, char c)
+{
+ struct scif_uart *uart = port->uart;
+
+ scif_writeb(uart, SCIF_SCFTDR, c);
+ /* Clear required TX flags */
+ scif_writew(uart, SCIF_SCFSR, scif_readw(uart, SCIF_SCFSR) &
+ ~(SCFSR_TEND | SCFSR_TDFE));
+}
+
+static int scif_uart_getc(struct serial_port *port, char *pc)
+{
+ struct scif_uart *uart = port->uart;
+
+ /* Check for available data bytes in RX FIFO */
+ if ( !(scif_readw(uart, SCIF_SCFSR) & (SCFSR_RDF | SCFSR_DR)) )
+ return 0;
+
+ *pc = scif_readb(uart, SCIF_SCFRDR);
+
+ /* dummy read */
+ scif_readw(uart, SCIF_SCFSR);
+ /* Clear required RX flags */
+ scif_writew(uart, SCIF_SCFSR, ~(SCFSR_RDF | SCFSR_DR));
+
+ return 1;
+}
+
+static int __init scif_uart_irq(struct serial_port *port)
+{
+ struct scif_uart *uart = port->uart;
+
+ return ((uart->irq > 0) ? uart->irq : -1);
+}
+
+static const struct vuart_info *scif_vuart_info(struct serial_port *port)
+{
+ struct scif_uart *uart = port->uart;
+
+ return &uart->vuart;
+}
+
+static void scif_uart_start_tx(struct serial_port *port)
+{
+ struct scif_uart *uart = port->uart;
+
+ scif_writew(uart, SCIF_SCSCR, scif_readw(uart, SCIF_SCSCR) | SCSCR_TIE);
+}
+
+static void scif_uart_stop_tx(struct serial_port *port)
+{
+ struct scif_uart *uart = port->uart;
+
+ scif_writew(uart, SCIF_SCSCR, scif_readw(uart, SCIF_SCSCR) & ~SCSCR_TIE);
+}
+
+static struct uart_driver __read_mostly scif_uart_driver = {
+ .init_preirq = scif_uart_init_preirq,
+ .init_postirq = scif_uart_init_postirq,
+ .endboot = NULL,
+ .suspend = scif_uart_suspend,
+ .resume = scif_uart_resume,
+ .tx_ready = scif_uart_tx_ready,
+ .putc = scif_uart_putc,
+ .getc = scif_uart_getc,
+ .irq = scif_uart_irq,
+ .start_tx = scif_uart_start_tx,
+ .stop_tx = scif_uart_stop_tx,
+ .vuart_info = scif_vuart_info,
+};
+
+static int __init scif_uart_init(struct dt_device_node *dev,
+ const void *data)
+{
+ const char *config = data;
+ struct scif_uart *uart;
+ int res;
+ u64 addr, size;
+
+ if ( strcmp(config, "") )
+ printk("WARNING: UART configuration is not supported\n");
+
+ uart = &scif_com;
+
+ uart->clock_hz = SCIF_CLK_FREQ;
+ uart->baud = BAUD_AUTO;
+ uart->data_bits = 8;
+ uart->parity = PARITY_NONE;
+ uart->stop_bits = 1;
+
+ res = dt_device_get_address(dev, 0, &addr, &size);
+ if ( res )
+ {
+ printk("scif-uart: Unable to retrieve the base"
+ " address of the UART\n");
+ return res;
+ }
+
+ res = platform_get_irq(dev, 0);
+ if ( res < 0 )
+ {
+ printk("scif-uart: Unable to retrieve the IRQ\n");
+ return res;
+ }
+ uart->irq = res;
+
+ uart->regs = ioremap_nocache(addr, size);
+ if ( !uart->regs )
+ {
+ printk("scif-uart: Unable to map the UART memory\n");
+ return -ENOMEM;
+ }
+
+ uart->vuart.base_addr = addr;
+ uart->vuart.size = size;
+ uart->vuart.data_off = SCIF_SCFTDR;
+ uart->vuart.status_off = SCIF_SCFSR;
+ uart->vuart.status = SCFSR_TDFE;
+
+ /* Register with generic serial driver */
+ serial_register_uart(SERHND_DTUART, &scif_uart_driver, uart);
+
+ dt_device_set_used_by(dev, DOMID_XEN);
+
+ return 0;
+}
+
+static const struct dt_device_match scif_uart_dt_match[] __initconst =
+{
+ DT_MATCH_COMPATIBLE("renesas,scif"),
+ { /* sentinel */ },
+};
+
+DT_DEVICE_START(scif_uart, "SCIF UART", DEVICE_SERIAL)
+ .dt_match = scif_uart_dt_match,
+ .init = scif_uart_init,
+DT_DEVICE_END
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
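
The SCIF baud setup above divides the input clock by 16x the baud rate (divisor = clock_hz / (baud << 4)), and the BAUD_AUTO path inverts that formula to recover the rate already programmed by firmware. A worked check of the arithmetic, assuming a 14.7456 MHz input clock (the clock value here is illustrative, not taken from the patch):

    #include <stdio.h>

    int main(void)
    {
        unsigned int clock_hz = 14745600;   /* assumed SCIF input clock */
        unsigned int baud = 115200;

        /* Program: divide the clock by 16 x baud. */
        unsigned int divisor = clock_hz / (baud << 4);   /* = 8 */

        /* BAUD_AUTO path: invert the formula to read the rate back. */
        unsigned int recovered = clock_hz / (divisor << 4);

        printf("divisor=%u recovered=%u baud\n", divisor, recovered);
        return 0;
    }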
diff --git a/xen/drivers/cpufreq/cpufreq.c b/xen/drivers/cpufreq/cpufreq.c
index ab66884..567e9e9 100644
--- a/xen/drivers/cpufreq/cpufreq.c
+++ b/xen/drivers/cpufreq/cpufreq.c
@@ -21,8 +21,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ * with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/xen/drivers/passthrough/amd/iommu_acpi.c b/xen/drivers/passthrough/amd/iommu_acpi.c
index 5634eac..79c1f8c 100644
--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c b/xen/drivers/passthrough/amd/iommu_cmd.c
index 4faa01b..44407f5 100644
--- a/xen/drivers/passthrough/amd/iommu_cmd.c
+++ b/xen/drivers/passthrough/amd/iommu_cmd.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/sched.h>
diff --git a/xen/drivers/passthrough/amd/iommu_detect.c b/xen/drivers/passthrough/amd/iommu_detect.c
index 98e5cc2..c1fa29b 100644
--- a/xen/drivers/passthrough/amd/iommu_detect.c
+++ b/xen/drivers/passthrough/amd/iommu_detect.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
diff --git a/xen/drivers/passthrough/amd/iommu_guest.c b/xen/drivers/passthrough/amd/iommu_guest.c
index 98e7b38..e74f469 100644
--- a/xen/drivers/passthrough/amd/iommu_guest.c
+++ b/xen/drivers/passthrough/amd/iommu_guest.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/sched.h>
@@ -203,7 +202,7 @@ void guest_iommu_add_ppr_log(struct domain *d, u32 entry[])
sizeof(ppr_entry_t), tail);
ASSERT(mfn_valid(mfn));
- log_base = map_domain_page(mfn);
+ log_base = map_domain_page(_mfn(mfn));
log = log_base + tail % (PAGE_SIZE / sizeof(ppr_entry_t));
/* Convert physical device id back into virtual device id */
@@ -252,7 +251,7 @@ void guest_iommu_add_event_log(struct domain *d, u32 entry[])
sizeof(event_entry_t), tail);
ASSERT(mfn_valid(mfn));
- log_base = map_domain_page(mfn);
+ log_base = map_domain_page(_mfn(mfn));
log = log_base + tail % (PAGE_SIZE / sizeof(event_entry_t));
/* re-write physical device id into virtual device id */
@@ -377,7 +376,7 @@ static int do_completion_wait(struct domain *d, cmd_entry_t *cmd)
gaddr_64 = (gaddr_hi << 32) | (gaddr_lo << 3);
gfn = gaddr_64 >> PAGE_SHIFT;
- vaddr = map_domain_page(mfn_x(get_gfn(d, gfn ,&p2mt)));
+ vaddr = map_domain_page(get_gfn(d, gfn ,&p2mt));
put_gfn(d, gfn);
write_u64_atomic((uint64_t *)(vaddr + (gaddr_64 & (PAGE_SIZE-1))),
@@ -425,7 +424,7 @@ static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd)
ASSERT(mfn_valid(dte_mfn));
/* Read guest dte information */
- dte_base = map_domain_page(dte_mfn);
+ dte_base = map_domain_page(_mfn(dte_mfn));
gdte = dte_base + gbdf % (PAGE_SIZE / sizeof(dev_entry_t));
@@ -506,7 +505,7 @@ static void guest_iommu_process_command(unsigned long _d)
sizeof(cmd_entry_t), head);
ASSERT(mfn_valid(cmd_mfn));
- cmd_base = map_domain_page(cmd_mfn);
+ cmd_base = map_domain_page(_mfn(cmd_mfn));
cmd = cmd_base + head % entries_per_page;
opcode = get_field_from_reg_u32(cmd->data[1],
@@ -682,7 +681,7 @@ static uint64_t iommu_mmio_read64(struct guest_iommu *iommu,
}
static int guest_iommu_mmio_read(struct vcpu *v, unsigned long addr,
- unsigned long len, unsigned long *pval)
+ unsigned int len, unsigned long *pval)
{
struct guest_iommu *iommu = vcpu_iommu(v);
unsigned long offset;
@@ -695,7 +694,7 @@ static int guest_iommu_mmio_read(struct vcpu *v, unsigned long addr,
if ( unlikely((offset & (len - 1 )) || (len > 8)) )
{
AMD_IOMMU_DEBUG("iommu mmio read access is not aligned:"
- " offset = %lx, len = %lx\n", offset, len);
+ " offset = %lx, len = %x\n", offset, len);
return X86EMUL_UNHANDLEABLE;
}
@@ -772,7 +771,7 @@ static void guest_iommu_mmio_write64(struct guest_iommu *iommu,
}
static int guest_iommu_mmio_write(struct vcpu *v, unsigned long addr,
- unsigned long len, unsigned long val)
+ unsigned int len, unsigned long val)
{
struct guest_iommu *iommu = vcpu_iommu(v);
unsigned long offset;
@@ -785,7 +784,7 @@ static int guest_iommu_mmio_write(struct vcpu *v, unsigned long addr,
if ( unlikely((offset & (len - 1)) || (len > 8)) )
{
AMD_IOMMU_DEBUG("iommu mmio write access is not aligned:"
- " offset = %lx, len = %lx\n", offset, len);
+ " offset = %lx, len = %x\n", offset, len);
return X86EMUL_UNHANDLEABLE;
}
@@ -868,6 +867,20 @@ static void guest_iommu_reg_init(struct guest_iommu *iommu)
iommu->reg_ext_feature.hi = upper;
}
+static int guest_iommu_mmio_range(struct vcpu *v, unsigned long addr)
+{
+ struct guest_iommu *iommu = vcpu_iommu(v);
+
+ return iommu && addr >= iommu->mmio_base &&
+ addr < iommu->mmio_base + IOMMU_MMIO_SIZE;
+}
+
+static const struct hvm_mmio_ops iommu_mmio_ops = {
+ .check = guest_iommu_mmio_range,
+ .read = guest_iommu_mmio_read,
+ .write = guest_iommu_mmio_write
+};
+
/* Domain specific initialization */
int guest_iommu_init(struct domain* d)
{
@@ -894,6 +907,8 @@ int guest_iommu_init(struct domain* d)
spin_lock_init(&iommu->lock);
+ register_mmio_handler(d, &iommu_mmio_ops);
+
return 0;
}
@@ -910,17 +925,3 @@ void guest_iommu_destroy(struct domain *d)
domain_hvm_iommu(d)->arch.g_iommu = NULL;
}
-
-static int guest_iommu_mmio_range(struct vcpu *v, unsigned long addr)
-{
- struct guest_iommu *iommu = vcpu_iommu(v);
-
- return iommu && addr >= iommu->mmio_base &&
- addr < iommu->mmio_base + IOMMU_MMIO_SIZE;
-}
-
-const struct hvm_mmio_handler iommu_mmio_handler = {
- .check_handler = guest_iommu_mmio_range,
- .read_handler = guest_iommu_mmio_read,
- .write_handler = guest_iommu_mmio_write
-};
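
The hunk above replaces the file-scope hvm_mmio_handler table with an ops struct registered per-domain via register_mmio_handler(): the check callback claims the guest-physical range, and read/write emulate the accesses. A generic sketch of the check/dispatch pattern, with hypothetical types standing in for Xen's hvm_mmio_ops:

    #include <stdio.h>

    struct mmio_ops {
        int (*check)(unsigned long addr);
        int (*read)(unsigned long addr, unsigned int len, unsigned long *val);
    };

    #define MMIO_BASE 0xfed00000UL   /* assumed emulated MMIO window */
    #define MMIO_SIZE 0x1000UL

    static int my_check(unsigned long addr)
    {
        return addr >= MMIO_BASE && addr < MMIO_BASE + MMIO_SIZE;
    }

    static int my_read(unsigned long addr, unsigned int len, unsigned long *val)
    {
        *val = 0;                     /* emulated register content */
        printf("emulated read of %u bytes at %#lx\n", len, addr);
        return 0;
    }

    static const struct mmio_ops ops = { my_check, my_read };

    /* Dispatcher: only forward accesses the handler claims. */
    int main(void)
    {
        unsigned long v;

        if (ops.check(MMIO_BASE + 8))
            ops.read(MMIO_BASE + 8, 4, &v);
        return 0;
    }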
diff --git a/xen/drivers/passthrough/amd/iommu_init.c b/xen/drivers/passthrough/amd/iommu_init.c
index 56bda00..d90a2d2 100644
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -451,7 +450,7 @@ static void iommu_msi_unmask(struct irq_desc *desc)
spin_lock_irqsave(&iommu->lock, flags);
amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
spin_unlock_irqrestore(&iommu->lock, flags);
- iommu->msi.msi_attrib.masked = 0;
+ iommu->msi.msi_attrib.host_masked = 0;
}
static void iommu_msi_mask(struct irq_desc *desc)
@@ -464,7 +463,7 @@ static void iommu_msi_mask(struct irq_desc *desc)
spin_lock_irqsave(&iommu->lock, flags);
amd_iommu_msi_enable(iommu, IOMMU_CONTROL_DISABLED);
spin_unlock_irqrestore(&iommu->lock, flags);
- iommu->msi.msi_attrib.masked = 1;
+ iommu->msi.msi_attrib.host_masked = 1;
}
static unsigned int iommu_msi_startup(struct irq_desc *desc)
diff --git a/xen/drivers/passthrough/amd/iommu_intr.c b/xen/drivers/passthrough/amd/iommu_intr.c
index c1b76fb..62e29e9 100644
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/err.h>
@@ -365,15 +364,17 @@ unsigned int amd_iommu_read_ioapic_from_ire(
unsigned int apic, unsigned int reg)
{
unsigned int val = __io_apic_read(apic, reg);
+ unsigned int pin = (reg - 0x10) / 2;
+ unsigned int offset = ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx[pin];
- if ( !(reg & 1) )
+ if ( !(reg & 1) && offset < INTREMAP_ENTRIES )
{
- unsigned int offset = val & (INTREMAP_ENTRIES - 1);
u16 bdf = ioapic_sbdf[IO_APIC_ID(apic)].bdf;
u16 seg = ioapic_sbdf[IO_APIC_ID(apic)].seg;
u16 req_id = get_intremap_requestor_id(seg, bdf);
const u32 *entry = get_intremap_entry(seg, req_id, offset);
+ ASSERT(offset == (val & (INTREMAP_ENTRIES - 1)));
val &= ~(INTREMAP_ENTRIES - 1);
val |= get_field_from_reg_u32(*entry,
INT_REMAP_ENTRY_INTTYPE_MASK,
@@ -529,10 +530,12 @@ int amd_iommu_msi_msg_update_ire(
} while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
if ( !rc )
+ {
for ( i = 1; i < nr; ++i )
msi_desc[i].remap_index = msi_desc->remap_index + i;
+ msg->data = data;
+ }
- msg->data = data;
return rc;
}
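
The hunk above narrows the scope of the msg->data assignment so the caller-visible message is only updated when remapping succeeded; on error the previous contents stay intact. A small sketch of this "commit outputs only on success" pattern:

    #include <stdio.h>

    struct msg { unsigned int data; };

    /* Only commit the computed value to the caller-visible structure
     * when every step succeeded; on error, leave *m untouched. */
    static int update(struct msg *m, int fail)
    {
        unsigned int data = 0xdead;   /* computed candidate value */
        int rc = fail ? -1 : 0;

        if (!rc)
            m->data = data;
        return rc;
    }

    int main(void)
    {
        struct msg m = { .data = 1 };

        update(&m, 1);
        printf("after failure: %#x\n", m.data);   /* still 0x1 */
        update(&m, 0);
        printf("after success: %#x\n", m.data);   /* 0xdead */
        return 0;
    }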
diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c
index a8c60ec..78862c9 100644
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/config.h>
@@ -42,7 +41,7 @@ void clear_iommu_pte_present(unsigned long l1_mfn, unsigned long gfn)
{
u64 *table, *pte;
- table = map_domain_page(l1_mfn);
+ table = map_domain_page(_mfn(l1_mfn));
pte = table + pfn_to_pde_idx(gfn, IOMMU_PAGING_MODE_LEVEL_1);
*pte = 0;
unmap_domain_page(table);
@@ -115,7 +114,7 @@ static bool_t set_iommu_pte_present(unsigned long pt_mfn, unsigned long gfn,
u32 *pde;
bool_t need_flush = 0;
- table = map_domain_page(pt_mfn);
+ table = map_domain_page(_mfn(pt_mfn));
pde = (u32*)(table + pfn_to_pde_idx(gfn, pde_level));
@@ -349,12 +348,12 @@ static int iommu_update_pde_count(struct domain *d, unsigned long pt_mfn,
next_level = merge_level - 1;
/* get pde at merge level */
- table = map_domain_page(pt_mfn);
+ table = map_domain_page(_mfn(pt_mfn));
pde = table + pfn_to_pde_idx(gfn, merge_level);
/* get page table of next level */
ntable_maddr = amd_iommu_get_next_table_from_pte((u32*)pde);
- ntable = map_domain_page(ntable_maddr >> PAGE_SHIFT);
+ ntable = map_domain_page(_mfn(paddr_to_pfn(ntable_maddr)));
/* get the first mfn of next level */
first_mfn = amd_iommu_get_next_table_from_pte((u32*)ntable) >> PAGE_SHIFT;
@@ -400,7 +399,7 @@ static int iommu_merge_pages(struct domain *d, unsigned long pt_mfn,
ASSERT( spin_is_locked(&hd->arch.mapping_lock) && pt_mfn );
- table = map_domain_page(pt_mfn);
+ table = map_domain_page(_mfn(pt_mfn));
pde = table + pfn_to_pde_idx(gfn, merge_level);
/* get first mfn */
@@ -412,7 +411,7 @@ static int iommu_merge_pages(struct domain *d, unsigned long pt_mfn,
return 1;
}
- ntable = map_domain_page(ntable_mfn);
+ ntable = map_domain_page(_mfn(ntable_mfn));
first_mfn = amd_iommu_get_next_table_from_pte((u32*)ntable) >> PAGE_SHIFT;
if ( first_mfn == 0 )
@@ -467,7 +466,7 @@ static int iommu_pde_from_gfn(struct domain *d, unsigned long pfn,
unsigned int next_level = level - 1;
pt_mfn[level] = next_table_mfn;
- next_table_vaddr = map_domain_page(next_table_mfn);
+ next_table_vaddr = map_domain_page(_mfn(next_table_mfn));
pde = next_table_vaddr + pfn_to_pde_idx(pfn, level);
/* Here might be a super page frame */
@@ -557,6 +556,10 @@ static int update_paging_mode(struct domain *d, unsigned long gfn)
unsigned long old_root_mfn;
struct hvm_iommu *hd = domain_hvm_iommu(d);
+ if ( gfn == INVALID_MFN )
+ return -EADDRNOTAVAIL;
+ ASSERT(!(gfn >> DEFAULT_DOMAIN_ADDRESS_WIDTH));
+
level = hd->arch.paging_mode;
old_root = hd->arch.root_table;
offset = gfn >> (PTE_PER_TABLE_SHIFT * (level - 1));
@@ -729,12 +732,15 @@ int amd_iommu_unmap_page(struct domain *d, unsigned long gfn)
* we might need a deeper page table for larger gfn now */
if ( is_hvm_domain(d) )
{
- if ( update_paging_mode(d, gfn) )
+ int rc = update_paging_mode(d, gfn);
+
+ if ( rc )
{
spin_unlock(&hd->arch.mapping_lock);
AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn);
- domain_crash(d);
- return -EFAULT;
+ if ( rc != -EADDRNOTAVAIL )
+ domain_crash(d);
+ return rc;
}
}
@@ -785,11 +791,6 @@ void amd_iommu_share_p2m(struct domain *d)
struct page_info *p2m_table;
mfn_t pgd_mfn;
- ASSERT( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled );
-
- if ( !iommu_use_hap_pt(d) )
- return;
-
pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d)));
p2m_table = mfn_to_page(mfn_x(pgd_mfn));
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index e83bb35..c1c0b6b 100644
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/sched.h>
@@ -394,7 +393,8 @@ static int reassign_device(struct domain *source, struct domain *target,
}
static int amd_iommu_assign_device(struct domain *d, u8 devfn,
- struct pci_dev *pdev)
+ struct pci_dev *pdev,
+ u32 flag)
{
struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
int bdf = PCI_BDF2(pdev->bus, devfn);
diff --git a/xen/drivers/passthrough/arm/iommu.c b/xen/drivers/passthrough/arm/iommu.c
index 3007b99..95b1abb 100644
--- a/xen/drivers/passthrough/arm/iommu.c
+++ b/xen/drivers/passthrough/arm/iommu.c
@@ -66,5 +66,10 @@ int arch_iommu_domain_init(struct domain *d)
void arch_iommu_domain_destroy(struct domain *d)
{
- iommu_dt_domain_destroy(d);
+}
+
+int arch_iommu_populate_page_table(struct domain *d)
+{
+ /* The IOMMU shares the p2m with the CPU */
+ return -ENOSYS;
}
diff --git a/xen/drivers/passthrough/arm/smmu.c b/xen/drivers/passthrough/arm/smmu.c
index 42bde75..bb08827 100644
--- a/xen/drivers/passthrough/arm/smmu.c
+++ b/xen/drivers/passthrough/arm/smmu.c
@@ -11,31 +11,34 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
- * Based on Linux drivers/iommu/arm-smmu.c (commit 89a23cd)
* Copyright (C) 2013 ARM Limited
*
* Author: Will Deacon <will.deacon at arm.com>
*
+ * Based on Linux drivers/iommu/arm-smmu.c
+ * => commit e6b5be2be4e30037eb551e0ed09dd97bd00d85d3
+ *
* Xen modification:
* Julien Grall <julien.grall at linaro.org>
* Copyright (C) 2014 Linaro Limited.
*
* This driver currently supports:
- * - SMMUv1 and v2 implementations (didn't try v2 SMMU)
- * - Stream-matching and stream-indexing
- * - v7/v8 long-descriptor format
- * - Non-secure access to the SMMU
- * - 4k pages, p2m shared with the processor
- * - Up to 40-bit addressing
- * - Context fault reporting
+ * - SMMUv1 and v2 implementations
+ * - Stream-matching and stream-indexing
+ * - v7/v8 long-descriptor format
+ * - Non-secure access to the SMMU
+ * - 4k and 64k pages, with contiguous pte hints.
+ * - Up to 48-bit addressing (dependent on VA_BITS)
+ * - Context fault reporting
*/
+
#include <xen/config.h>
#include <xen/delay.h>
#include <xen/errno.h>
+#include <xen/err.h>
#include <xen/irq.h>
#include <xen/lib.h>
#include <xen/list.h>
@@ -43,1485 +46,2695 @@
#include <xen/vmap.h>
#include <xen/rbtree.h>
#include <xen/sched.h>
+#include <xen/sizes.h>
#include <asm/atomic.h>
#include <asm/device.h>
#include <asm/io.h>
#include <asm/platform.h>
-/* Driver options */
-#define SMMU_OPT_SECURE_CONFIG_ACCESS (1 << 0)
+/* Xen: The below defines are redefined within the file. Undef them first. */
+#undef SCTLR_AFE
+#undef SCTLR_TRE
+#undef SCTLR_M
+#undef TTBCR_EAE
+
+/* Alias to Xen device tree helpers */
+#define device_node dt_device_node
+#define of_phandle_args dt_phandle_args
+#define of_device_id dt_device_match
+#define of_match_node dt_match_node
+#define of_property_read_u32(np, pname, out) (!dt_property_read_u32(np, pname, out))
+#define of_property_read_bool dt_property_read_bool
+#define of_parse_phandle_with_args dt_parse_phandle_with_args
+
+/* Xen: Helpers to get device MMIO and IRQs */
+struct resource
+{
+ u64 addr;
+ u64 size;
+ unsigned int type;
+};
-/* Maximum number of stream IDs assigned to a single device */
-#define MAX_MASTER_STREAMIDS MAX_PHANDLE_ARGS
+#define resource_size(res) ((res)->size)
+
+#define platform_device dt_device_node
+
+#define IORESOURCE_MEM 0
+#define IORESOURCE_IRQ 1
+
+static struct resource *platform_get_resource(struct platform_device *pdev,
+ unsigned int type,
+ unsigned int num)
+{
+ /*
+ * The resource is only used between two calls of platform_get_resource.
+ * It's quite ugly, but it avoids adding too much code to the part
+ * imported from Linux.
+ */
+ static struct resource res;
+ int ret = 0;
+
+ res.type = type;
+
+ switch (type) {
+ case IORESOURCE_MEM:
+ ret = dt_device_get_address(pdev, num, &res.addr, &res.size);
+
+ return ((ret) ? NULL : &res);
+
+ case IORESOURCE_IRQ:
+ ret = platform_get_irq(pdev, num);
+ if (ret < 0)
+ return NULL;
+
+ res.addr = ret;
+ res.size = 1;
+
+ return &res;
+
+ default:
+ return NULL;
+ }
+}
+
+/* Xen: Helpers for IRQ functions */
+#define request_irq(irq, func, flags, name, dev) request_irq(irq, flags, func, name, dev)
+#define free_irq release_irq
+
+enum irqreturn {
+ IRQ_NONE = (0 << 0),
+ IRQ_HANDLED = (1 << 0),
+};
+
+typedef enum irqreturn irqreturn_t;
+
+/* Device logger functions
+ * TODO: Handle PCI
+ */
+#define dev_print(dev, lvl, fmt, ...) \
+ printk(lvl "smmu: %s: " fmt, dt_node_full_name(dev_to_dt(dev)), ## __VA_ARGS__)
+
+#define dev_dbg(dev, fmt, ...) dev_print(dev, XENLOG_DEBUG, fmt, ## __VA_ARGS__)
+#define dev_notice(dev, fmt, ...) dev_print(dev, XENLOG_INFO, fmt, ## __VA_ARGS__)
+#define dev_warn(dev, fmt, ...) dev_print(dev, XENLOG_WARNING, fmt, ## __VA_ARGS__)
+#define dev_err(dev, fmt, ...) dev_print(dev, XENLOG_ERR, fmt, ## __VA_ARGS__)
+
+#define dev_err_ratelimited(dev, fmt, ...) \
+ dev_print(dev, XENLOG_ERR, fmt, ## __VA_ARGS__)
+
+#define dev_name(dev) dt_node_full_name(dev_to_dt(dev))
+
+/* Alias to Xen allocation helpers */
+#define kfree xfree
+#define kmalloc(size, flags) _xmalloc(size, sizeof(void *))
+#define kzalloc(size, flags) _xzalloc(size, sizeof(void *))
+#define devm_kzalloc(dev, size, flags) _xzalloc(size, sizeof(void *))
+#define kmalloc_array(size, n, flags) _xmalloc_array(size, sizeof(void *), n)
+
+static void __iomem *devm_ioremap_resource(struct device *dev,
+ struct resource *res)
+{
+ void __iomem *ptr;
+
+ if (!res || res->type != IORESOURCE_MEM) {
+ dev_err(dev, "Invalid resource\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ ptr = ioremap_nocache(res->addr, res->size);
+ if (!ptr) {
+ dev_err(dev,
+ "ioremap failed (addr 0x%"PRIx64" size 0x%"PRIx64")\n",
+ res->addr, res->size);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return ptr;
+}
+
+/* Xen doesn't handle IOMMU fault */
+#define report_iommu_fault(...) 1
+
+#define IOMMU_FAULT_READ 0
+#define IOMMU_FAULT_WRITE 1
+
+/*
+ * Xen: PCI functions
+ * TODO: It should be implemented when PCI is supported
+ */
+#define to_pci_dev(dev) (NULL)
+static inline int pci_for_each_dma_alias(struct pci_dev *pdev,
+ int (*fn) (struct pci_dev *pdev,
+ u16 alias, void *data),
+ void *data)
+{
+ BUG();
+ return 0;
+}
+
+/* Xen: misc */
+#define PHYS_MASK_SHIFT PADDR_BITS
+typedef paddr_t phys_addr_t;
+
+#ifdef CONFIG_ARM_64
+# define CONFIG_64BIT
+#endif
+
+#define VA_BITS 0 /* Only used for configuring stage-1 input size */
+
+/* The ACCESS_ONCE macro is being replaced in Linux in favor of
+ * {READ, WRITE}_ONCE. Rather than introducing it in the common code, keep a
+ * version here. We will have to drop it when the SMMU code in Linux
+ * switches to {READ, WRITE}_ONCE.
+ */
+#define __ACCESS_ONCE(x) ({ \
+ __maybe_unused typeof(x) __var = 0; \
+ (volatile typeof(x) *)&(x); })
+#define ACCESS_ONCE(x) (*__ACCESS_ONCE(x))
+
+#define MODULE_DEVICE_TABLE(type, name)
+#define module_param_named(name, value, type, perm)
+#define MODULE_PARM_DESC(_parm, desc)
+
+/* Xen: Dummy iommu_domain */
+struct iommu_domain
+{
+ /* Runtime SMMU configuration for this iommu_domain */
+ struct arm_smmu_domain *priv;
+
+ atomic_t ref;
+ /* Used to link iommu_domain contexts for a same domain.
+ * There is at least one per-SMMU to used by the domain.
+ * */
+ struct list_head list;
+};
+
+/* Xen: Describes the information required for a Xen domain */
+struct arm_smmu_xen_domain {
+ spinlock_t lock;
+ /* List of context (i.e iommu_domain) associated to this domain */
+ struct list_head contexts;
+};
+
+/*
+ * Xen: Information about each device stored in dev->archdata.iommu
+ *
+ * Initially dev->archdata.iommu only stores the iommu_domain (runtime
+ * configuration of the SMMU) but, on Xen, we also have to store the
+ * iommu_group (the list of stream IDs associated with the device).
+ *
+ * This is because Linux keeps the iommu_group in a field of struct device;
+ * doing the same on Xen would require moving some hackery (the dummy
+ * iommu_group) to a more generic place.
+ */
+struct arm_smmu_xen_device {
+ struct iommu_domain *domain;
+ struct iommu_group *group;
+};
+
+#define dev_archdata(dev) ((struct arm_smmu_xen_device *)dev->archdata.iommu)
+#define dev_iommu_domain(dev) (dev_archdata(dev)->domain)
+#define dev_iommu_group(dev) (dev_archdata(dev)->group)
+
+/* Xen: Dummy iommu_group */
+struct iommu_group
+{
+ /* Streamids of the device */
+ struct arm_smmu_master_cfg *cfg;
+
+ atomic_t ref;
+};
+
+static struct iommu_group *iommu_group_alloc(void)
+{
+ struct iommu_group *group = xzalloc(struct iommu_group);
+
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+
+ atomic_set(&group->ref, 1);
+
+ return group;
+}
+
+static void iommu_group_put(struct iommu_group *group)
+{
+ if (atomic_dec_and_test(&group->ref))
+ xfree(group);
+}
+
+static void iommu_group_set_iommudata(struct iommu_group *group,
+ struct arm_smmu_master_cfg *cfg,
+ void (*releasefn)(void *))
+{
+ /* TODO: Store the releasefn for the PCI */
+ ASSERT(releasefn == NULL);
+
+ group->cfg = cfg;
+}
+
+static int iommu_group_add_device(struct iommu_group *group,
+ struct device *dev)
+{
+ dev_iommu_group(dev) = group;
-/* Maximum stream ID */
-#define SMMU_MAX_STREAMIDS (PAGE_SIZE_64K - 1)
+ atomic_inc(&group->ref);
+
+ return 0;
+}
+
+static struct iommu_group *iommu_group_get(struct device *dev)
+{
+ struct iommu_group *group = dev_iommu_group(dev);
+
+ if (group)
+ atomic_inc(&group->ref);
+
+ return group;
+}
+
+#define iommu_group_get_iommudata(group) (group)->cfg
+
+/***** Start of Linux SMMU code *****/
+
+/* Maximum number of stream IDs assigned to a single device */
+#define MAX_MASTER_STREAMIDS MAX_PHANDLE_ARGS
/* Maximum number of context banks per SMMU */
-#define SMMU_MAX_CBS 128
+#define ARM_SMMU_MAX_CBS 128
/* Maximum number of mapping groups per SMMU */
-#define SMMU_MAX_SMRS 128
+#define ARM_SMMU_MAX_SMRS 128
/* SMMU global address space */
-#define SMMU_GR0(smmu) ((smmu)->base)
-#define SMMU_GR1(smmu) ((smmu)->base + (smmu)->pagesize)
+#define ARM_SMMU_GR0(smmu) ((smmu)->base)
+#define ARM_SMMU_GR1(smmu) ((smmu)->base + (1 << (smmu)->pgshift))
/*
- * SMMU global address space with conditional offset to access secure aliases of
- * non-secure registers (e.g. nsCR0: 0x400, nsGFSR: 0x448, nsGFSYNR0: 0x450)
+ * SMMU global address space with conditional offset to access secure
+ * aliases of non-secure registers (e.g. nsCR0: 0x400, nsGFSR: 0x448,
+ * nsGFSYNR0: 0x450)
*/
-#define SMMU_GR0_NS(smmu) \
- ((smmu)->base + \
- ((smmu->options & SMMU_OPT_SECURE_CONFIG_ACCESS) \
- ? 0x400 : 0))
+#define ARM_SMMU_GR0_NS(smmu) \
+ ((smmu)->base + \
+ ((smmu->options & ARM_SMMU_OPT_SECURE_CFG_ACCESS) \
+ ? 0x400 : 0))
/* Page table bits */
-#define SMMU_PTE_PAGE (((pteval_t)3) << 0)
-#define SMMU_PTE_CONT (((pteval_t)1) << 52)
-#define SMMU_PTE_AF (((pteval_t)1) << 10)
-#define SMMU_PTE_SH_NS (((pteval_t)0) << 8)
-#define SMMU_PTE_SH_OS (((pteval_t)2) << 8)
-#define SMMU_PTE_SH_IS (((pteval_t)3) << 8)
-
-#if PAGE_SIZE == PAGE_SIZE_4K
-#define SMMU_PTE_CONT_ENTRIES 16
-#elif PAGE_SIZE == PAGE_SIZE_64K
-#define SMMU_PTE_CONT_ENTRIES 32
+#define ARM_SMMU_PTE_XN (((pteval_t)3) << 53)
+#define ARM_SMMU_PTE_CONT (((pteval_t)1) << 52)
+#define ARM_SMMU_PTE_AF (((pteval_t)1) << 10)
+#define ARM_SMMU_PTE_SH_NS (((pteval_t)0) << 8)
+#define ARM_SMMU_PTE_SH_OS (((pteval_t)2) << 8)
+#define ARM_SMMU_PTE_SH_IS (((pteval_t)3) << 8)
+#define ARM_SMMU_PTE_PAGE (((pteval_t)3) << 0)
+
+#if PAGE_SIZE == SZ_4K
+#define ARM_SMMU_PTE_CONT_ENTRIES 16
+#elif PAGE_SIZE == SZ_64K
+#define ARM_SMMU_PTE_CONT_ENTRIES 32
#else
-#define SMMU_PTE_CONT_ENTRIES 1
+#define ARM_SMMU_PTE_CONT_ENTRIES 1
#endif
-#define SMMU_PTE_CONT_SIZE (PAGE_SIZE * SMMU_PTE_CONT_ENTRIES)
-#define SMMU_PTE_CONT_MASK (~(SMMU_PTE_CONT_SIZE - 1))
-#define SMMU_PTE_HWTABLE_SIZE (PTRS_PER_PTE * sizeof(pte_t))
+#define ARM_SMMU_PTE_CONT_SIZE (PAGE_SIZE * ARM_SMMU_PTE_CONT_ENTRIES)
+#define ARM_SMMU_PTE_CONT_MASK (~(ARM_SMMU_PTE_CONT_SIZE - 1))
/* Stage-1 PTE */
-#define SMMU_PTE_AP_UNPRIV (((pteval_t)1) << 6)
-#define SMMU_PTE_AP_RDONLY (((pteval_t)2) << 6)
-#define SMMU_PTE_ATTRINDX_SHIFT 2
-#define SMMU_PTE_nG (((pteval_t)1) << 11)
+#define ARM_SMMU_PTE_AP_UNPRIV (((pteval_t)1) << 6)
+#define ARM_SMMU_PTE_AP_RDONLY (((pteval_t)2) << 6)
+#define ARM_SMMU_PTE_ATTRINDX_SHIFT 2
+#define ARM_SMMU_PTE_nG (((pteval_t)1) << 11)
/* Stage-2 PTE */
-#define SMMU_PTE_HAP_FAULT (((pteval_t)0) << 6)
-#define SMMU_PTE_HAP_READ (((pteval_t)1) << 6)
-#define SMMU_PTE_HAP_WRITE (((pteval_t)2) << 6)
-#define SMMU_PTE_MEMATTR_OIWB (((pteval_t)0xf) << 2)
-#define SMMU_PTE_MEMATTR_NC (((pteval_t)0x5) << 2)
-#define SMMU_PTE_MEMATTR_DEV (((pteval_t)0x1) << 2)
+#define ARM_SMMU_PTE_HAP_FAULT (((pteval_t)0) << 6)
+#define ARM_SMMU_PTE_HAP_READ (((pteval_t)1) << 6)
+#define ARM_SMMU_PTE_HAP_WRITE (((pteval_t)2) << 6)
+#define ARM_SMMU_PTE_MEMATTR_OIWB (((pteval_t)0xf) << 2)
+#define ARM_SMMU_PTE_MEMATTR_NC (((pteval_t)0x5) << 2)
+#define ARM_SMMU_PTE_MEMATTR_DEV (((pteval_t)0x1) << 2)
/* Configuration registers */
-#define SMMU_GR0_sCR0 0x0
-#define SMMU_sCR0_CLIENTPD (1 << 0)
-#define SMMU_sCR0_GFRE (1 << 1)
-#define SMMU_sCR0_GFIE (1 << 2)
-#define SMMU_sCR0_GCFGFRE (1 << 4)
-#define SMMU_sCR0_GCFGFIE (1 << 5)
-#define SMMU_sCR0_USFCFG (1 << 10)
-#define SMMU_sCR0_VMIDPNE (1 << 11)
-#define SMMU_sCR0_PTM (1 << 12)
-#define SMMU_sCR0_FB (1 << 13)
-#define SMMU_sCR0_BSU_SHIFT 14
-#define SMMU_sCR0_BSU_MASK 0x3
+#define ARM_SMMU_GR0_sCR0 0x0
+#define sCR0_CLIENTPD (1 << 0)
+#define sCR0_GFRE (1 << 1)
+#define sCR0_GFIE (1 << 2)
+#define sCR0_GCFGFRE (1 << 4)
+#define sCR0_GCFGFIE (1 << 5)
+#define sCR0_USFCFG (1 << 10)
+#define sCR0_VMIDPNE (1 << 11)
+#define sCR0_PTM (1 << 12)
+#define sCR0_FB (1 << 13)
+#define sCR0_BSU_SHIFT 14
+#define sCR0_BSU_MASK 0x3
/* Identification registers */
-#define SMMU_GR0_ID0 0x20
-#define SMMU_GR0_ID1 0x24
-#define SMMU_GR0_ID2 0x28
-#define SMMU_GR0_ID3 0x2c
-#define SMMU_GR0_ID4 0x30
-#define SMMU_GR0_ID5 0x34
-#define SMMU_GR0_ID6 0x38
-#define SMMU_GR0_ID7 0x3c
-#define SMMU_GR0_sGFSR 0x48
-#define SMMU_GR0_sGFSYNR0 0x50
-#define SMMU_GR0_sGFSYNR1 0x54
-#define SMMU_GR0_sGFSYNR2 0x58
-#define SMMU_GR0_PIDR0 0xfe0
-#define SMMU_GR0_PIDR1 0xfe4
-#define SMMU_GR0_PIDR2 0xfe8
-
-#define SMMU_ID0_S1TS (1 << 30)
-#define SMMU_ID0_S2TS (1 << 29)
-#define SMMU_ID0_NTS (1 << 28)
-#define SMMU_ID0_SMS (1 << 27)
-#define SMMU_ID0_PTFS_SHIFT 24
-#define SMMU_ID0_PTFS_MASK 0x2
-#define SMMU_ID0_PTFS_V8_ONLY 0x2
-#define SMMU_ID0_CTTW (1 << 14)
-#define SMMU_ID0_NUMIRPT_SHIFT 16
-#define SMMU_ID0_NUMIRPT_MASK 0xff
-#define SMMU_ID0_NUMSMRG_SHIFT 0
-#define SMMU_ID0_NUMSMRG_MASK 0xff
-
-#define SMMU_ID1_PAGESIZE (1 << 31)
-#define SMMU_ID1_NUMPAGENDXB_SHIFT 28
-#define SMMU_ID1_NUMPAGENDXB_MASK 7
-#define SMMU_ID1_NUMS2CB_SHIFT 16
-#define SMMU_ID1_NUMS2CB_MASK 0xff
-#define SMMU_ID1_NUMCB_SHIFT 0
-#define SMMU_ID1_NUMCB_MASK 0xff
-
-#define SMMU_ID2_OAS_SHIFT 4
-#define SMMU_ID2_OAS_MASK 0xf
-#define SMMU_ID2_IAS_SHIFT 0
-#define SMMU_ID2_IAS_MASK 0xf
-#define SMMU_ID2_UBS_SHIFT 8
-#define SMMU_ID2_UBS_MASK 0xf
-#define SMMU_ID2_PTFS_4K (1 << 12)
-#define SMMU_ID2_PTFS_16K (1 << 13)
-#define SMMU_ID2_PTFS_64K (1 << 14)
-
-#define SMMU_PIDR2_ARCH_SHIFT 4
-#define SMMU_PIDR2_ARCH_MASK 0xf
+#define ARM_SMMU_GR0_ID0 0x20
+#define ARM_SMMU_GR0_ID1 0x24
+#define ARM_SMMU_GR0_ID2 0x28
+#define ARM_SMMU_GR0_ID3 0x2c
+#define ARM_SMMU_GR0_ID4 0x30
+#define ARM_SMMU_GR0_ID5 0x34
+#define ARM_SMMU_GR0_ID6 0x38
+#define ARM_SMMU_GR0_ID7 0x3c
+#define ARM_SMMU_GR0_sGFSR 0x48
+#define ARM_SMMU_GR0_sGFSYNR0 0x50
+#define ARM_SMMU_GR0_sGFSYNR1 0x54
+#define ARM_SMMU_GR0_sGFSYNR2 0x58
+#define ARM_SMMU_GR0_PIDR0 0xfe0
+#define ARM_SMMU_GR0_PIDR1 0xfe4
+#define ARM_SMMU_GR0_PIDR2 0xfe8
+
+#define ID0_S1TS (1 << 30)
+#define ID0_S2TS (1 << 29)
+#define ID0_NTS (1 << 28)
+#define ID0_SMS (1 << 27)
+#define ID0_PTFS_SHIFT 24
+#define ID0_PTFS_MASK 0x2
+#define ID0_PTFS_V8_ONLY 0x2
+#define ID0_CTTW (1 << 14)
+#define ID0_NUMIRPT_SHIFT 16
+#define ID0_NUMIRPT_MASK 0xff
+#define ID0_NUMSIDB_SHIFT 9
+#define ID0_NUMSIDB_MASK 0xf
+#define ID0_NUMSMRG_SHIFT 0
+#define ID0_NUMSMRG_MASK 0xff
+
+#define ID1_PAGESIZE (1 << 31)
+#define ID1_NUMPAGENDXB_SHIFT 28
+#define ID1_NUMPAGENDXB_MASK 7
+#define ID1_NUMS2CB_SHIFT 16
+#define ID1_NUMS2CB_MASK 0xff
+#define ID1_NUMCB_SHIFT 0
+#define ID1_NUMCB_MASK 0xff
+
+#define ID2_OAS_SHIFT 4
+#define ID2_OAS_MASK 0xf
+#define ID2_IAS_SHIFT 0
+#define ID2_IAS_MASK 0xf
+#define ID2_UBS_SHIFT 8
+#define ID2_UBS_MASK 0xf
+#define ID2_PTFS_4K (1 << 12)
+#define ID2_PTFS_16K (1 << 13)
+#define ID2_PTFS_64K (1 << 14)
+
+#define PIDR2_ARCH_SHIFT 4
+#define PIDR2_ARCH_MASK 0xf
/* Global TLB invalidation */
-#define SMMU_GR0_STLBIALL 0x60
-#define SMMU_GR0_TLBIVMID 0x64
-#define SMMU_GR0_TLBIALLNSNH 0x68
-#define SMMU_GR0_TLBIALLH 0x6c
-#define SMMU_GR0_sTLBGSYNC 0x70
-#define SMMU_GR0_sTLBGSTATUS 0x74
-#define SMMU_sTLBGSTATUS_GSACTIVE (1 << 0)
-#define SMMU_TLB_LOOP_TIMEOUT 1000000 /* 1s! */
+#define ARM_SMMU_GR0_STLBIALL 0x60
+#define ARM_SMMU_GR0_TLBIVMID 0x64
+#define ARM_SMMU_GR0_TLBIALLNSNH 0x68
+#define ARM_SMMU_GR0_TLBIALLH 0x6c
+#define ARM_SMMU_GR0_sTLBGSYNC 0x70
+#define ARM_SMMU_GR0_sTLBGSTATUS 0x74
+#define sTLBGSTATUS_GSACTIVE (1 << 0)
+#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */
/* Stream mapping registers */
-#define SMMU_GR0_SMR(n) (0x800 + ((n) << 2))
-#define SMMU_SMR_VALID (1 << 31)
-#define SMMU_SMR_MASK_SHIFT 16
-#define SMMU_SMR_MASK_MASK 0x7fff
-#define SMMU_SMR_ID_SHIFT 0
-#define SMMU_SMR_ID_MASK 0x7fff
-
-#define SMMU_GR0_S2CR(n) (0xc00 + ((n) << 2))
-#define SMMU_S2CR_CBNDX_SHIFT 0
-#define SMMU_S2CR_CBNDX_MASK 0xff
-#define SMMU_S2CR_TYPE_SHIFT 16
-#define SMMU_S2CR_TYPE_MASK 0x3
-#define SMMU_S2CR_TYPE_TRANS (0 << SMMU_S2CR_TYPE_SHIFT)
-#define SMMU_S2CR_TYPE_BYPASS (1 << SMMU_S2CR_TYPE_SHIFT)
-#define SMMU_S2CR_TYPE_FAULT (2 << SMMU_S2CR_TYPE_SHIFT)
+#define ARM_SMMU_GR0_SMR(n) (0x800 + ((n) << 2))
+#define SMR_VALID (1 << 31)
+#define SMR_MASK_SHIFT 16
+#define SMR_MASK_MASK 0x7fff
+#define SMR_ID_SHIFT 0
+#define SMR_ID_MASK 0x7fff
+
+#define ARM_SMMU_GR0_S2CR(n) (0xc00 + ((n) << 2))
+#define S2CR_CBNDX_SHIFT 0
+#define S2CR_CBNDX_MASK 0xff
+#define S2CR_TYPE_SHIFT 16
+#define S2CR_TYPE_MASK 0x3
+#define S2CR_TYPE_TRANS (0 << S2CR_TYPE_SHIFT)
+#define S2CR_TYPE_BYPASS (1 << S2CR_TYPE_SHIFT)
+#define S2CR_TYPE_FAULT (2 << S2CR_TYPE_SHIFT)
/* Context bank attribute registers */
-#define SMMU_GR1_CBAR(n) (0x0 + ((n) << 2))
-#define SMMU_CBAR_VMID_SHIFT 0
-#define SMMU_CBAR_VMID_MASK 0xff
-#define SMMU_CBAR_S1_MEMATTR_SHIFT 12
-#define SMMU_CBAR_S1_MEMATTR_MASK 0xf
-#define SMMU_CBAR_S1_MEMATTR_WB 0xf
-#define SMMU_CBAR_TYPE_SHIFT 16
-#define SMMU_CBAR_TYPE_MASK 0x3
-#define SMMU_CBAR_TYPE_S2_TRANS (0 << SMMU_CBAR_TYPE_SHIFT)
-#define SMMU_CBAR_TYPE_S1_TRANS_S2_BYPASS (1 << SMMU_CBAR_TYPE_SHIFT)
-#define SMMU_CBAR_TYPE_S1_TRANS_S2_FAULT (2 << SMMU_CBAR_TYPE_SHIFT)
-#define SMMU_CBAR_TYPE_S1_TRANS_S2_TRANS (3 << SMMU_CBAR_TYPE_SHIFT)
-#define SMMU_CBAR_IRPTNDX_SHIFT 24
-#define SMMU_CBAR_IRPTNDX_MASK 0xff
-
-#define SMMU_GR1_CBA2R(n) (0x800 + ((n) << 2))
-#define SMMU_CBA2R_RW64_32BIT (0 << 0)
-#define SMMU_CBA2R_RW64_64BIT (1 << 0)
+#define ARM_SMMU_GR1_CBAR(n) (0x0 + ((n) << 2))
+#define CBAR_VMID_SHIFT 0
+#define CBAR_VMID_MASK 0xff
+#define CBAR_S1_BPSHCFG_SHIFT 8
+#define CBAR_S1_BPSHCFG_MASK 3
+#define CBAR_S1_BPSHCFG_NSH 3
+#define CBAR_S1_MEMATTR_SHIFT 12
+#define CBAR_S1_MEMATTR_MASK 0xf
+#define CBAR_S1_MEMATTR_WB 0xf
+#define CBAR_TYPE_SHIFT 16
+#define CBAR_TYPE_MASK 0x3
+#define CBAR_TYPE_S2_TRANS (0 << CBAR_TYPE_SHIFT)
+#define CBAR_TYPE_S1_TRANS_S2_BYPASS (1 << CBAR_TYPE_SHIFT)
+#define CBAR_TYPE_S1_TRANS_S2_FAULT (2 << CBAR_TYPE_SHIFT)
+#define CBAR_TYPE_S1_TRANS_S2_TRANS (3 << CBAR_TYPE_SHIFT)
+#define CBAR_IRPTNDX_SHIFT 24
+#define CBAR_IRPTNDX_MASK 0xff
+
+#define ARM_SMMU_GR1_CBA2R(n) (0x800 + ((n) << 2))
+#define CBA2R_RW64_32BIT (0 << 0)
+#define CBA2R_RW64_64BIT (1 << 0)
/* Translation context bank */
-#define SMMU_CB_BASE(smmu) ((smmu)->base + ((smmu)->size >> 1))
-#define SMMU_CB(smmu, n) ((n) * (smmu)->pagesize)
-
-#define SMMU_CB_SCTLR 0x0
-#define SMMU_CB_RESUME 0x8
-#define SMMU_CB_TCR2 0x10
-#define SMMU_CB_TTBR0_LO 0x20
-#define SMMU_CB_TTBR0_HI 0x24
-#define SMMU_CB_TCR 0x30
-#define SMMU_CB_S1_MAIR0 0x38
-#define SMMU_CB_FSR 0x58
-#define SMMU_CB_FAR_LO 0x60
-#define SMMU_CB_FAR_HI 0x64
-#define SMMU_CB_FSYNR0 0x68
-#define SMMU_CB_S1_TLBIASID 0x610
-
-#define SMMU_SCTLR_S1_ASIDPNE (1 << 12)
-#define SMMU_SCTLR_CFCFG (1 << 7)
-#define SMMU_SCTLR_CFIE (1 << 6)
-#define SMMU_SCTLR_CFRE (1 << 5)
-#define SMMU_SCTLR_E (1 << 4)
-#define SMMU_SCTLR_AFE (1 << 2)
-#define SMMU_SCTLR_TRE (1 << 1)
-#define SMMU_SCTLR_M (1 << 0)
-#define SMMU_SCTLR_EAE_SBOP (SMMU_SCTLR_AFE | SMMU_SCTLR_TRE)
-
-#define SMMU_RESUME_RETRY (0 << 0)
-#define SMMU_RESUME_TERMINATE (1 << 0)
-
-#define SMMU_TCR_EAE (1 << 31)
-
-#define SMMU_TCR_PASIZE_SHIFT 16
-#define SMMU_TCR_PASIZE_MASK 0x7
-
-#define SMMU_TCR_TG0_4K (0 << 14)
-#define SMMU_TCR_TG0_64K (1 << 14)
-
-#define SMMU_TCR_SH0_SHIFT 12
-#define SMMU_TCR_SH0_MASK 0x3
-#define SMMU_TCR_SH_NS 0
-#define SMMU_TCR_SH_OS 2
-#define SMMU_TCR_SH_IS 3
-
-#define SMMU_TCR_ORGN0_SHIFT 10
-#define SMMU_TCR_IRGN0_SHIFT 8
-#define SMMU_TCR_RGN_MASK 0x3
-#define SMMU_TCR_RGN_NC 0
-#define SMMU_TCR_RGN_WBWA 1
-#define SMMU_TCR_RGN_WT 2
-#define SMMU_TCR_RGN_WB 3
-
-#define SMMU_TCR_SL0_SHIFT 6
-#define SMMU_TCR_SL0_MASK 0x3
-#define SMMU_TCR_SL0_LVL_2 0
-#define SMMU_TCR_SL0_LVL_1 1
-
-#define SMMU_TCR_T1SZ_SHIFT 16
-#define SMMU_TCR_T0SZ_SHIFT 0
-#define SMMU_TCR_SZ_MASK 0xf
-
-#define SMMU_TCR2_SEP_SHIFT 15
-#define SMMU_TCR2_SEP_MASK 0x7
-
-#define SMMU_TCR2_PASIZE_SHIFT 0
-#define SMMU_TCR2_PASIZE_MASK 0x7
+#define ARM_SMMU_CB_BASE(smmu) ((smmu)->base + ((smmu)->size >> 1))
+#define ARM_SMMU_CB(smmu, n) ((n) * (1 << (smmu)->pgshift))
+
+#define ARM_SMMU_CB_SCTLR 0x0
+#define ARM_SMMU_CB_RESUME 0x8
+#define ARM_SMMU_CB_TTBCR2 0x10
+#define ARM_SMMU_CB_TTBR0_LO 0x20
+#define ARM_SMMU_CB_TTBR0_HI 0x24
+#define ARM_SMMU_CB_TTBCR 0x30
+#define ARM_SMMU_CB_S1_MAIR0 0x38
+#define ARM_SMMU_CB_FSR 0x58
+#define ARM_SMMU_CB_FAR_LO 0x60
+#define ARM_SMMU_CB_FAR_HI 0x64
+#define ARM_SMMU_CB_FSYNR0 0x68
+#define ARM_SMMU_CB_S1_TLBIASID 0x610
+
+#define SCTLR_S1_ASIDPNE (1 << 12)
+#define SCTLR_CFCFG (1 << 7)
+#define SCTLR_CFIE (1 << 6)
+#define SCTLR_CFRE (1 << 5)
+#define SCTLR_E (1 << 4)
+#define SCTLR_AFE (1 << 2)
+#define SCTLR_TRE (1 << 1)
+#define SCTLR_M (1 << 0)
+#define SCTLR_EAE_SBOP (SCTLR_AFE | SCTLR_TRE)
+
+#define RESUME_RETRY (0 << 0)
+#define RESUME_TERMINATE (1 << 0)
+
+#define TTBCR_EAE (1 << 31)
+
+#define TTBCR_PASIZE_SHIFT 16
+#define TTBCR_PASIZE_MASK 0x7
+
+#define TTBCR_TG0_4K (0 << 14)
+#define TTBCR_TG0_64K (1 << 14)
+
+#define TTBCR_SH0_SHIFT 12
+#define TTBCR_SH0_MASK 0x3
+#define TTBCR_SH_NS 0
+#define TTBCR_SH_OS 2
+#define TTBCR_SH_IS 3
+
+#define TTBCR_ORGN0_SHIFT 10
+#define TTBCR_IRGN0_SHIFT 8
+#define TTBCR_RGN_MASK 0x3
+#define TTBCR_RGN_NC 0
+#define TTBCR_RGN_WBWA 1
+#define TTBCR_RGN_WT 2
+#define TTBCR_RGN_WB 3
+
+#define TTBCR_SL0_SHIFT 6
+#define TTBCR_SL0_MASK 0x3
+#define TTBCR_SL0_LVL_2 0
+#define TTBCR_SL0_LVL_1 1
+
+#define TTBCR_T1SZ_SHIFT 16
+#define TTBCR_T0SZ_SHIFT 0
+#define TTBCR_SZ_MASK 0xf
+
+#define TTBCR2_SEP_SHIFT 15
+#define TTBCR2_SEP_MASK 0x7
+
+#define TTBCR2_PASIZE_SHIFT 0
+#define TTBCR2_PASIZE_MASK 0x7
/* Common definitions for PASize and SEP fields */
-#define SMMU_TCR2_ADDR_32 0
-#define SMMU_TCR2_ADDR_36 1
-#define SMMU_TCR2_ADDR_40 2
-#define SMMU_TCR2_ADDR_42 3
-#define SMMU_TCR2_ADDR_44 4
-#define SMMU_TCR2_ADDR_48 5
-
-#define SMMU_TTBRn_HI_ASID_SHIFT 16
-
-#define SMMU_MAIR_ATTR_SHIFT(n) ((n) << 3)
-#define SMMU_MAIR_ATTR_MASK 0xff
-#define SMMU_MAIR_ATTR_DEVICE 0x04
-#define SMMU_MAIR_ATTR_NC 0x44
-#define SMMU_MAIR_ATTR_WBRWA 0xff
-#define SMMU_MAIR_ATTR_IDX_NC 0
-#define SMMU_MAIR_ATTR_IDX_CACHE 1
-#define SMMU_MAIR_ATTR_IDX_DEV 2
-
-#define SMMU_FSR_MULTI (1 << 31)
-#define SMMU_FSR_SS (1 << 30)
-#define SMMU_FSR_UUT (1 << 8)
-#define SMMU_FSR_ASF (1 << 7)
-#define SMMU_FSR_TLBLKF (1 << 6)
-#define SMMU_FSR_TLBMCF (1 << 5)
-#define SMMU_FSR_EF (1 << 4)
-#define SMMU_FSR_PF (1 << 3)
-#define SMMU_FSR_AFF (1 << 2)
-#define SMMU_FSR_TF (1 << 1)
-
-#define SMMU_FSR_IGN (SMMU_FSR_AFF | SMMU_FSR_ASF | \
- SMMU_FSR_TLBMCF | SMMU_FSR_TLBLKF)
-#define SMMU_FSR_FAULT (SMMU_FSR_MULTI | SMMU_FSR_SS | \
- SMMU_FSR_UUT | SMMU_FSR_EF | \
- SMMU_FSR_PF | SMMU_FSR_TF | \
- SMMU_FSR_IGN)
-
-#define SMMU_FSYNR0_WNR (1 << 4)
-
-#define smmu_print(dev, lvl, fmt, ...) \
- printk(lvl "smmu: %s: " fmt, dt_node_full_name(dev->node), ## __VA_ARGS__)
-
-#define smmu_err(dev, fmt, ...) smmu_print(dev, XENLOG_ERR, fmt, ## __VA_ARGS__)
-
-#define smmu_dbg(dev, fmt, ...) \
- smmu_print(dev, XENLOG_DEBUG, fmt, ## __VA_ARGS__)
-
-#define smmu_info(dev, fmt, ...) \
- smmu_print(dev, XENLOG_INFO, fmt, ## __VA_ARGS__)
-
-#define smmu_warn(dev, fmt, ...) \
- smmu_print(dev, XENLOG_WARNING, fmt, ## __VA_ARGS__)
-
-struct arm_smmu_device {
- const struct dt_device_node *node;
-
- void __iomem *base;
- unsigned long size;
- unsigned long pagesize;
-
-#define SMMU_FEAT_COHERENT_WALK (1 << 0)
-#define SMMU_FEAT_STREAM_MATCH (1 << 1)
-#define SMMU_FEAT_TRANS_S1 (1 << 2)
-#define SMMU_FEAT_TRANS_S2 (1 << 3)
-#define SMMU_FEAT_TRANS_NESTED (1 << 4)
- u32 features;
- u32 options;
- int version;
-
- u32 num_context_banks;
- u32 num_s2_context_banks;
- DECLARE_BITMAP(context_map, SMMU_MAX_CBS);
- atomic_t irptndx;
-
- u32 num_mapping_groups;
- DECLARE_BITMAP(smr_map, SMMU_MAX_SMRS);
-
- unsigned long input_size;
- unsigned long s1_output_size;
- unsigned long s2_output_size;
-
- u32 num_global_irqs;
- u32 num_context_irqs;
- unsigned int *irqs;
+#define TTBCR2_ADDR_32 0
+#define TTBCR2_ADDR_36 1
+#define TTBCR2_ADDR_40 2
+#define TTBCR2_ADDR_42 3
+#define TTBCR2_ADDR_44 4
+#define TTBCR2_ADDR_48 5
+
+#define TTBRn_HI_ASID_SHIFT 16
+
+#define MAIR_ATTR_SHIFT(n) ((n) << 3)
+#define MAIR_ATTR_MASK 0xff
+#define MAIR_ATTR_DEVICE 0x04
+#define MAIR_ATTR_NC 0x44
+#define MAIR_ATTR_WBRWA 0xff
+#define MAIR_ATTR_IDX_NC 0
+#define MAIR_ATTR_IDX_CACHE 1
+#define MAIR_ATTR_IDX_DEV 2
+
+#define FSR_MULTI (1 << 31)
+#define FSR_SS (1 << 30)
+#define FSR_UUT (1 << 8)
+#define FSR_ASF (1 << 7)
+#define FSR_TLBLKF (1 << 6)
+#define FSR_TLBMCF (1 << 5)
+#define FSR_EF (1 << 4)
+#define FSR_PF (1 << 3)
+#define FSR_AFF (1 << 2)
+#define FSR_TF (1 << 1)
+
+#define FSR_IGN (FSR_AFF | FSR_ASF | \
+ FSR_TLBMCF | FSR_TLBLKF)
+#define FSR_FAULT (FSR_MULTI | FSR_SS | FSR_UUT | \
+ FSR_EF | FSR_PF | FSR_TF | FSR_IGN)
+
+#define FSYNR0_WNR (1 << 4)
+
+static int force_stage;
+module_param_named(force_stage, force_stage, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(force_stage,
+ "Force SMMU mappings to be installed at a particular stage of translation. A value of '1' or '2' forces the corresponding stage. All other values are ignored (i.e. no stage is forced). Note that selecting a specific stage will disable support for nested translation.");
+
+enum arm_smmu_arch_version {
+ ARM_SMMU_V1 = 1,
+ ARM_SMMU_V2,
+};
- u32 smr_mask_mask;
- u32 smr_id_mask;
+struct arm_smmu_smr {
+ u8 idx;
+ u16 mask;
+ u16 id;
+};
- unsigned long *sids;
+struct arm_smmu_master_cfg {
+ int num_streamids;
+ u16 streamids[MAX_MASTER_STREAMIDS];
+ struct arm_smmu_smr *smrs;
+};
- struct list_head list;
- struct rb_root masters;
+struct arm_smmu_master {
+ struct device_node *of_node;
+ struct rb_node node;
+ struct arm_smmu_master_cfg cfg;
};
-struct arm_smmu_smr {
- u8 idx;
- u16 mask;
- u16 id;
+struct arm_smmu_device {
+ struct device *dev;
+
+ void __iomem *base;
+ unsigned long size;
+ unsigned long pgshift;
+
+#define ARM_SMMU_FEAT_COHERENT_WALK (1 << 0)
+#define ARM_SMMU_FEAT_STREAM_MATCH (1 << 1)
+#define ARM_SMMU_FEAT_TRANS_S1 (1 << 2)
+#define ARM_SMMU_FEAT_TRANS_S2 (1 << 3)
+#define ARM_SMMU_FEAT_TRANS_NESTED (1 << 4)
+ u32 features;
+
+#define ARM_SMMU_OPT_SECURE_CFG_ACCESS (1 << 0)
+ u32 options;
+ enum arm_smmu_arch_version version;
+
+ u32 num_context_banks;
+ u32 num_s2_context_banks;
+ DECLARE_BITMAP(context_map, ARM_SMMU_MAX_CBS);
+ atomic_t irptndx;
+
+ u32 num_mapping_groups;
+ DECLARE_BITMAP(smr_map, ARM_SMMU_MAX_SMRS);
+
+ unsigned long s1_input_size;
+ unsigned long s1_output_size;
+ unsigned long s2_input_size;
+ unsigned long s2_output_size;
+
+ u32 num_global_irqs;
+ u32 num_context_irqs;
+ unsigned int *irqs;
+
+ struct list_head list;
+ struct rb_root masters;
};
-#define INVALID_IRPTNDX 0xff
+struct arm_smmu_cfg {
+ u8 cbndx;
+ u8 irptndx;
+ u32 cbar;
-#define SMMU_CB_ASID(cfg) ((cfg)->cbndx)
-#define SMMU_CB_VMID(cfg) ((cfg)->cbndx + 1)
+ /* Xen: Domain associated with this configuration */
+ struct domain *domain;
+};
+#define INVALID_IRPTNDX 0xff
-struct arm_smmu_domain_cfg {
- struct arm_smmu_device *smmu;
- u8 cbndx;
- u8 irptndx;
- u32 cbar;
- /* Domain associated to this device */
- struct domain *domain;
- /* List of master which use this structure */
- struct list_head masters;
+#define ARM_SMMU_CB_ASID(cfg) ((cfg)->cbndx)
+#define ARM_SMMU_CB_VMID(cfg) ((cfg)->cbndx + 1)
- /* Used to link domain context for a same domain */
- struct list_head list;
+enum arm_smmu_domain_stage {
+ ARM_SMMU_DOMAIN_S1 = 0,
+ ARM_SMMU_DOMAIN_S2,
+ ARM_SMMU_DOMAIN_NESTED,
};
-struct arm_smmu_master {
- const struct dt_device_node *dt_node;
-
- /*
- * The following is specific to the master's position in the
- * SMMU chain.
- */
- struct rb_node node;
- u32 num_streamids;
- u16 streamids[MAX_MASTER_STREAMIDS];
- int num_s2crs;
-
- struct arm_smmu_smr *smrs;
- struct arm_smmu_domain_cfg *cfg;
-
- /* Used to link masters in a same domain context */
- struct list_head list;
+struct arm_smmu_domain {
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_cfg cfg;
+ enum arm_smmu_domain_stage stage;
+ spinlock_t lock;
};
+static DEFINE_SPINLOCK(arm_smmu_devices_lock);
static LIST_HEAD(arm_smmu_devices);
-struct arm_smmu_domain {
- spinlock_t lock;
- struct list_head contexts;
+struct arm_smmu_option_prop {
+ u32 opt;
+ const char *prop;
};
-struct arm_smmu_option_prop {
- u32 opt;
- const char *prop;
+static struct arm_smmu_option_prop arm_smmu_options[] = {
+ { ARM_SMMU_OPT_SECURE_CFG_ACCESS, "calxeda,smmu-secure-config-access" },
+ { 0, NULL},
};
-static const struct arm_smmu_option_prop arm_smmu_options [] __initconst =
+static void parse_driver_options(struct arm_smmu_device *smmu)
{
- { SMMU_OPT_SECURE_CONFIG_ACCESS, "calxeda,smmu-secure-config-access" },
- { 0, NULL},
-};
+ int i = 0;
+
+ do {
+ if (of_property_read_bool(smmu->dev->of_node,
+ arm_smmu_options[i].prop)) {
+ smmu->options |= arm_smmu_options[i].opt;
+ dev_notice(smmu->dev, "option %s\n",
+ arm_smmu_options[i].prop);
+ }
+ } while (arm_smmu_options[++i].opt);
+}
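
For context, an option bit recorded here is tested again wherever the driver touches
global register space; the ARM_SMMU_GR0_NS() accessor used by the fault handlers and
reset code below applies exactly this test. A minimal sketch of the pattern (the 0x400
secure-alias offset is an assumption of this sketch, not taken from the diff):

/*
 * Illustrative only: pick the register page for global space, skewing
 * to the non-secure alias when the integration restricts configuration
 * accesses to the secure side. The 0x400 offset is an assumption.
 */
static void __iomem *example_gr0_ns(struct arm_smmu_device *smmu)
{
	if (smmu->options & ARM_SMMU_OPT_SECURE_CFG_ACCESS)
		return smmu->base + 0x400;
	return smmu->base;
}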
-static void __init check_driver_options(struct arm_smmu_device *smmu)
+static struct device_node *dev_get_dev_node(struct device *dev)
{
- int i = 0;
+#if 0 /* Xen: TODO: Add support for PCI */
+ if (dev_is_pci(dev)) {
+ struct pci_bus *bus = to_pci_dev(dev)->bus;
+
+ while (!pci_is_root_bus(bus))
+ bus = bus->parent;
+ return bus->bridge->parent->of_node;
+ }
+#endif
- do {
- if ( dt_property_read_bool(smmu->node, arm_smmu_options[i].prop) )
- {
- smmu->options |= arm_smmu_options[i].opt;
- smmu_dbg(smmu, "option %s\n", arm_smmu_options[i].prop);
- }
- } while ( arm_smmu_options[++i].opt );
+ return dev->of_node;
}
-static void arm_smmu_context_fault(int irq, void *data,
- struct cpu_user_regs *regs)
+static struct arm_smmu_master *find_smmu_master(struct arm_smmu_device *smmu,
+ struct device_node *dev_node)
{
- u32 fsr, far, fsynr;
- uint64_t iova;
- struct arm_smmu_domain_cfg *cfg = data;
- struct arm_smmu_device *smmu = cfg->smmu;
- void __iomem *cb_base;
+ struct rb_node *node = smmu->masters.rb_node;
+
+ while (node) {
+ struct arm_smmu_master *master;
+
+ master = container_of(node, struct arm_smmu_master, node);
- cb_base = SMMU_CB_BASE(smmu) + SMMU_CB(smmu, cfg->cbndx);
- fsr = readl_relaxed(cb_base + SMMU_CB_FSR);
+ if (dev_node < master->of_node)
+ node = node->rb_left;
+ else if (dev_node > master->of_node)
+ node = node->rb_right;
+ else
+ return master;
+ }
- if ( !(fsr & SMMU_FSR_FAULT) )
- return;
+ return NULL;
+}
- if ( fsr & SMMU_FSR_IGN )
- smmu_err(smmu, "Unexpected context fault (fsr 0x%u)\n", fsr);
+static struct arm_smmu_master_cfg *
+find_smmu_master_cfg(struct device *dev)
+{
+ struct arm_smmu_master_cfg *cfg = NULL;
+ struct iommu_group *group = iommu_group_get(dev);
- fsynr = readl_relaxed(cb_base + SMMU_CB_FSYNR0);
- far = readl_relaxed(cb_base + SMMU_CB_FAR_LO);
- iova = far;
- far = readl_relaxed(cb_base + SMMU_CB_FAR_HI);
- iova |= ((uint64_t)far << 32);
+ if (group) {
+ cfg = iommu_group_get_iommudata(group);
+ iommu_group_put(group);
+ }
- smmu_err(smmu, "Unhandled context fault for domain %u\n",
- cfg->domain->domain_id);
- smmu_err(smmu, "\tFSR 0x%x, IOVA 0x%"PRIx64", FSYNR 0x%x, CB %d\n",
- fsr, iova, fsynr, cfg->cbndx);
+ return cfg;
+}
- /* Clear the faulting FSR */
- writel(fsr, cb_base + SMMU_CB_FSR);
+static int insert_smmu_master(struct arm_smmu_device *smmu,
+ struct arm_smmu_master *master)
+{
+ struct rb_node **new, *parent;
+
+ new = &smmu->masters.rb_node;
+ parent = NULL;
+ while (*new) {
+ struct arm_smmu_master *this
+ = container_of(*new, struct arm_smmu_master, node);
+
+ parent = *new;
+ if (master->of_node < this->of_node)
+ new = &((*new)->rb_left);
+ else if (master->of_node > this->of_node)
+ new = &((*new)->rb_right);
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&master->node, parent, new);
+ rb_insert_color(&master->node, &smmu->masters);
+ return 0;
+}
- /* Terminate any stalled transactions */
- if ( fsr & SMMU_FSR_SS )
- writel_relaxed(SMMU_RESUME_TERMINATE, cb_base + SMMU_CB_RESUME);
+static int register_smmu_master(struct arm_smmu_device *smmu,
+ struct device *dev,
+ struct of_phandle_args *masterspec)
+{
+ int i;
+ struct arm_smmu_master *master;
+
+ master = find_smmu_master(smmu, masterspec->np);
+ if (master) {
+ dev_err(dev,
+ "rejecting multiple registrations for master device %s\n",
+ masterspec->np->name);
+ return -EBUSY;
+ }
+
+ if (masterspec->args_count > MAX_MASTER_STREAMIDS) {
+ dev_err(dev,
+ "reached maximum number (%d) of stream IDs for master device %s\n",
+ MAX_MASTER_STREAMIDS, masterspec->np->name);
+ return -ENOSPC;
+ }
+
+ master = devm_kzalloc(dev, sizeof(*master), GFP_KERNEL);
+ if (!master)
+ return -ENOMEM;
+
+ master->of_node = masterspec->np;
+ master->cfg.num_streamids = masterspec->args_count;
+
+ /* Xen: Let Xen know that the device is protected by an SMMU */
+ dt_device_set_protected(masterspec->np);
+
+ for (i = 0; i < master->cfg.num_streamids; ++i) {
+ u16 streamid = masterspec->args[i];
+
+ if (!(smmu->features & ARM_SMMU_FEAT_STREAM_MATCH) &&
+ (streamid >= smmu->num_mapping_groups)) {
+ dev_err(dev,
+ "stream ID for master device %s greater than maximum allowed (%d)\n",
+ masterspec->np->name, smmu->num_mapping_groups);
+ return -ERANGE;
+ }
+ master->cfg.streamids[i] = streamid;
+ }
+ return insert_smmu_master(smmu, master);
}
-static void arm_smmu_global_fault(int irq, void *data,
- struct cpu_user_regs *regs)
+static struct arm_smmu_device *find_smmu_for_device(struct device *dev)
{
- u32 gfsr, gfsynr0, gfsynr1, gfsynr2;
- struct arm_smmu_device *smmu = data;
- void __iomem *gr0_base = SMMU_GR0_NS(smmu);
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_master *master = NULL;
+ struct device_node *dev_node = dev_get_dev_node(dev);
+
+ spin_lock(&arm_smmu_devices_lock);
+ list_for_each_entry(smmu, &arm_smmu_devices, list) {
+ master = find_smmu_master(smmu, dev_node);
+ if (master)
+ break;
+ }
+ spin_unlock(&arm_smmu_devices_lock);
+
+ return master ? smmu : NULL;
+}
- gfsr = readl_relaxed(gr0_base + SMMU_GR0_sGFSR);
- gfsynr0 = readl_relaxed(gr0_base + SMMU_GR0_sGFSYNR0);
- gfsynr1 = readl_relaxed(gr0_base + SMMU_GR0_sGFSYNR1);
- gfsynr2 = readl_relaxed(gr0_base + SMMU_GR0_sGFSYNR2);
+static int __arm_smmu_alloc_bitmap(unsigned long *map, int start, int end)
+{
+ int idx;
- if ( !gfsr )
- return;
+ do {
+ idx = find_next_zero_bit(map, end, start);
+ if (idx == end)
+ return -ENOSPC;
+ } while (test_and_set_bit(idx, map));
- smmu_err(smmu, "Unexpected global fault, this could be serious\n");
- smmu_err(smmu,
- "\tGFSR 0x%08x, GFSYNR0 0x%08x, GFSYNR1 0x%08x, GFSYNR2 0x%08x\n",
- gfsr, gfsynr0, gfsynr1, gfsynr2);
- writel(gfsr, gr0_base + SMMU_GR0_sGFSR);
+ return idx;
}
-static struct arm_smmu_master *
-find_smmu_master(struct arm_smmu_device *smmu,
- const struct dt_device_node *dev_node)
+static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
{
- struct rb_node *node = smmu->masters.rb_node;
+ clear_bit(idx, map);
+}
- while ( node )
- {
- struct arm_smmu_master *master;
+/* Wait for any pending TLB invalidations to complete */
+static void arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
+{
+ int count = 0;
+ void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
+
+ writel_relaxed(0, gr0_base + ARM_SMMU_GR0_sTLBGSYNC);
+ while (readl_relaxed(gr0_base + ARM_SMMU_GR0_sTLBGSTATUS)
+ & sTLBGSTATUS_GSACTIVE) {
+ cpu_relax();
+ if (++count == TLB_LOOP_TIMEOUT) {
+ dev_err_ratelimited(smmu->dev,
+ "TLB sync timed out -- SMMU may be deadlocked\n");
+ return;
+ }
+ udelay(1);
+ }
+}
- master = container_of(node, struct arm_smmu_master, node);
+static void arm_smmu_tlb_inv_context(struct arm_smmu_domain *smmu_domain)
+{
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ void __iomem *base = ARM_SMMU_GR0(smmu);
+ bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
+
+ if (stage1) {
+ base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+ writel_relaxed(ARM_SMMU_CB_ASID(cfg),
+ base + ARM_SMMU_CB_S1_TLBIASID);
+ } else {
+ base = ARM_SMMU_GR0(smmu);
+ writel_relaxed(ARM_SMMU_CB_VMID(cfg),
+ base + ARM_SMMU_GR0_TLBIVMID);
+ }
+
+ arm_smmu_tlb_sync(smmu);
+}
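
The invalidation key chosen above follows directly from the ARM_SMMU_CB_ASID and
ARM_SMMU_CB_VMID macros: a context bank's ASID is its own index, while its VMID is the
index plus one. A worked example (reading the +1 as reserving VMID 0 is an assumption):

/*
 * Worked example for context bank index 2:
 *   ARM_SMMU_CB_ASID(cfg) == 2   -> key for the stage-1 TLBIASID write
 *   ARM_SMMU_CB_VMID(cfg) == 3   -> key for the stage-2 TLBIVMID write
 * The +1 keeps VMID 0 out of use by translation contexts.
 */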
- if ( dev_node < master->dt_node )
- node = node->rb_left;
- else if ( dev_node > master->dt_node )
- node = node->rb_right;
- else
- return master;
- }
+static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
+{
+ int flags, ret;
+ u32 fsr, far, fsynr, resume;
+ unsigned long iova;
+ struct iommu_domain *domain = dev;
+ struct arm_smmu_domain *smmu_domain = domain->priv;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ void __iomem *cb_base;
+
+ cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+ fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
+
+ if (!(fsr & FSR_FAULT))
+ return IRQ_NONE;
+
+ if (fsr & FSR_IGN)
+ dev_err_ratelimited(smmu->dev,
+ "Unexpected context fault (fsr 0x%x)\n",
+ fsr);
+
+ fsynr = readl_relaxed(cb_base + ARM_SMMU_CB_FSYNR0);
+ flags = fsynr & FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ;
+
+ far = readl_relaxed(cb_base + ARM_SMMU_CB_FAR_LO);
+ iova = far;
+#ifdef CONFIG_64BIT
+ far = readl_relaxed(cb_base + ARM_SMMU_CB_FAR_HI);
+ iova |= ((unsigned long)far << 32);
+#endif
- return NULL;
+ if (!report_iommu_fault(domain, smmu->dev, iova, flags)) {
+ ret = IRQ_HANDLED;
+ resume = RESUME_RETRY;
+ } else {
+ dev_err_ratelimited(smmu->dev,
+ "Unhandled context fault: iova=0x%08lx, fsynr=0x%x, cb=%d\n",
+ iova, fsynr, cfg->cbndx);
+ ret = IRQ_NONE;
+ resume = RESUME_TERMINATE;
+ }
+
+ /* Clear the faulting FSR */
+ writel(fsr, cb_base + ARM_SMMU_CB_FSR);
+
+ /* Retry or terminate any stalled transactions */
+ if (fsr & FSR_SS)
+ writel_relaxed(resume, cb_base + ARM_SMMU_CB_RESUME);
+
+ return ret;
}
-static __init int insert_smmu_master(struct arm_smmu_device *smmu,
- struct arm_smmu_master *master)
+static irqreturn_t arm_smmu_global_fault(int irq, void *dev)
{
- struct rb_node **new, *parent;
+ u32 gfsr, gfsynr0, gfsynr1, gfsynr2;
+ struct arm_smmu_device *smmu = dev;
+ void __iomem *gr0_base = ARM_SMMU_GR0_NS(smmu);
+
+ gfsr = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSR);
+ gfsynr0 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR0);
+ gfsynr1 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR1);
+ gfsynr2 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR2);
+
+ if (!gfsr)
+ return IRQ_NONE;
+
+ dev_err_ratelimited(smmu->dev,
+ "Unexpected global fault, this could be serious\n");
+ dev_err_ratelimited(smmu->dev,
+ "\tGFSR 0x%08x, GFSYNR0 0x%08x, GFSYNR1 0x%08x, GFSYNR2 0x%08x\n",
+ gfsr, gfsynr0, gfsynr1, gfsynr2);
+
+ writel(gfsr, gr0_base + ARM_SMMU_GR0_sGFSR);
+ return IRQ_HANDLED;
+}
- new = &smmu->masters.rb_node;
- parent = NULL;
- while ( *new )
- {
- struct arm_smmu_master *this;
+/* Xen: Interrupt handlers wrapper */
+static void arm_smmu_context_fault_xen(int irq, void *dev,
+ struct cpu_user_regs *regs)
+{
+ arm_smmu_context_fault(irq, dev);
+}
+
+#define arm_smmu_context_fault arm_smmu_context_fault_xen
- this = container_of(*new, struct arm_smmu_master, node);
+static void arm_smmu_global_fault_xen(int irq, void *dev,
+ struct cpu_user_regs *regs)
+{
+ arm_smmu_global_fault(irq, dev);
+}
- parent = *new;
- if ( master->dt_node < this->dt_node )
- new = &((*new)->rb_left);
- else if (master->dt_node > this->dt_node)
- new = &((*new)->rb_right);
- else
- return -EEXIST;
- }
+#define arm_smmu_global_fault arm_smmu_global_fault_xen
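
These two #defines splice the Xen interrupt-handler signature onto the otherwise
unmodified Linux handlers: every later reference to arm_smmu_context_fault or
arm_smmu_global_fault (notably the request_irq() calls further down) now resolves to
the Xen wrapper. A minimal sketch of the trick, with hypothetical names:

/* Hypothetical illustration of the wrap-and-redefine pattern used above. */
static int ported_handler(int irq, void *dev)
{
	return 0;	/* stands in for the unmodified Linux handler */
}

static void ported_handler_xen(int irq, void *dev,
			       struct cpu_user_regs *regs)
{
	ported_handler(irq, dev);	/* drop the unused regs argument */
}

/* Every reference after this line names the Xen-signature wrapper. */
#define ported_handler ported_handler_xen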
- rb_link_node(&master->node, parent, new);
- rb_insert_color(&master->node, &smmu->masters);
- return 0;
+#if 0 /* Xen: Page tables are shared with the processor */
+static void arm_smmu_flush_pgtable(struct arm_smmu_device *smmu, void *addr,
+ size_t size)
+{
+ unsigned long offset = (unsigned long)addr & ~PAGE_MASK;
+
+ /* Ensure new page tables are visible to the hardware walker */
+ if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) {
+ dsb(ishst);
+ } else {
+ /*
+ * If the SMMU can't walk tables in the CPU caches, treat them
+ * like non-coherent DMA since we need to flush the new entries
+ * all the way out to memory. There's no possibility of
+ * recursion here as the SMMU table walker will not be wired
+ * through another SMMU.
+ */
+ dma_map_page(smmu->dev, virt_to_page(addr), offset, size,
+ DMA_TO_DEVICE);
+ }
}
+#endif
-static __init int register_smmu_master(struct arm_smmu_device *smmu,
- struct dt_phandle_args *masterspec)
+static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain)
{
- int i, sid;
- struct arm_smmu_master *master;
- int rc = 0;
+ u32 reg;
+ bool stage1;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ void __iomem *cb_base, *gr0_base, *gr1_base;
+ paddr_t p2maddr;
+
+ gr0_base = ARM_SMMU_GR0(smmu);
+ gr1_base = ARM_SMMU_GR1(smmu);
+ stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
+ cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+
+ /* CBAR */
+ reg = cfg->cbar;
+ if (smmu->version == ARM_SMMU_V1)
+ reg |= cfg->irptndx << CBAR_IRPTNDX_SHIFT;
+
+ /*
+ * Use the weakest shareability/memory types, so they are
+ * overridden by the ttbcr/pte.
+ */
+ if (stage1) {
+ reg |= (CBAR_S1_BPSHCFG_NSH << CBAR_S1_BPSHCFG_SHIFT) |
+ (CBAR_S1_MEMATTR_WB << CBAR_S1_MEMATTR_SHIFT);
+ } else {
+ reg |= ARM_SMMU_CB_VMID(cfg) << CBAR_VMID_SHIFT;
+ }
+ writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBAR(cfg->cbndx));
+
+ if (smmu->version > ARM_SMMU_V1) {
+ /* CBA2R */
+#ifdef CONFIG_64BIT
+ reg = CBA2R_RW64_64BIT;
+#else
+ reg = CBA2R_RW64_32BIT;
+#endif
+ writel_relaxed(reg,
+ gr1_base + ARM_SMMU_GR1_CBA2R(cfg->cbndx));
+
+ /* TTBCR2 */
+ switch (smmu->s1_input_size) {
+ case 32:
+ reg = (TTBCR2_ADDR_32 << TTBCR2_SEP_SHIFT);
+ break;
+ case 36:
+ reg = (TTBCR2_ADDR_36 << TTBCR2_SEP_SHIFT);
+ break;
+ case 39:
+ case 40:
+ reg = (TTBCR2_ADDR_40 << TTBCR2_SEP_SHIFT);
+ break;
+ case 42:
+ reg = (TTBCR2_ADDR_42 << TTBCR2_SEP_SHIFT);
+ break;
+ case 44:
+ reg = (TTBCR2_ADDR_44 << TTBCR2_SEP_SHIFT);
+ break;
+ case 48:
+ reg = (TTBCR2_ADDR_48 << TTBCR2_SEP_SHIFT);
+ break;
+ }
+
+ switch (smmu->s1_output_size) {
+ case 32:
+ reg |= (TTBCR2_ADDR_32 << TTBCR2_PASIZE_SHIFT);
+ break;
+ case 36:
+ reg |= (TTBCR2_ADDR_36 << TTBCR2_PASIZE_SHIFT);
+ break;
+ case 39:
+ case 40:
+ reg |= (TTBCR2_ADDR_40 << TTBCR2_PASIZE_SHIFT);
+ break;
+ case 42:
+ reg |= (TTBCR2_ADDR_42 << TTBCR2_PASIZE_SHIFT);
+ break;
+ case 44:
+ reg |= (TTBCR2_ADDR_44 << TTBCR2_PASIZE_SHIFT);
+ break;
+ case 48:
+ reg |= (TTBCR2_ADDR_48 << TTBCR2_PASIZE_SHIFT);
+ break;
+ }
+
+ if (stage1)
+ writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBCR2);
+ }
+
+ /* TTBR0 */
+ /* Xen: The page table is shared with the P2M code */
+ ASSERT(smmu_domain->cfg.domain != NULL);
+ p2maddr = page_to_maddr(smmu_domain->cfg.domain->arch.p2m.root);
+
+ dev_notice(smmu->dev, "d%u: p2maddr 0x%"PRIpaddr"\n",
+ smmu_domain->cfg.domain->domain_id, p2maddr);
+
+ reg = (p2maddr & ((1ULL << 32) - 1));
+ writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0_LO);
+ reg = (p2maddr >> 32);
+ if (stage1)
+ reg |= ARM_SMMU_CB_ASID(cfg) << TTBRn_HI_ASID_SHIFT;
+ writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0_HI);
+
+ /*
+ * TTBCR
+ * We use long descriptor, with inner-shareable WBWA tables in TTBR0.
+ */
+ if (smmu->version > ARM_SMMU_V1) {
+ if (PAGE_SIZE == SZ_4K)
+ reg = TTBCR_TG0_4K;
+ else
+ reg = TTBCR_TG0_64K;
+
+ if (!stage1) {
+ reg |= (64 - smmu->s2_input_size) << TTBCR_T0SZ_SHIFT;
+
+ switch (smmu->s2_output_size) {
+ case 32:
+ reg |= (TTBCR2_ADDR_32 << TTBCR_PASIZE_SHIFT);
+ break;
+ case 36:
+ reg |= (TTBCR2_ADDR_36 << TTBCR_PASIZE_SHIFT);
+ break;
+ case 40:
+ reg |= (TTBCR2_ADDR_40 << TTBCR_PASIZE_SHIFT);
+ break;
+ case 42:
+ reg |= (TTBCR2_ADDR_42 << TTBCR_PASIZE_SHIFT);
+ break;
+ case 44:
+ reg |= (TTBCR2_ADDR_44 << TTBCR_PASIZE_SHIFT);
+ break;
+ case 48:
+ reg |= (TTBCR2_ADDR_48 << TTBCR_PASIZE_SHIFT);
+ break;
+ }
+ } else {
+ reg |= (64 - smmu->s1_input_size) << TTBCR_T0SZ_SHIFT;
+ }
+ } else {
+ reg = 0;
+ }
+
+ /* Xen: The attributes to walk the page table should be the same as
+ * VTCR_EL2. Currently they do not differ from the Linux ones.
+ */
+ reg |= TTBCR_EAE |
+ (TTBCR_SH_IS << TTBCR_SH0_SHIFT) |
+ (TTBCR_RGN_WBWA << TTBCR_ORGN0_SHIFT) |
+ (TTBCR_RGN_WBWA << TTBCR_IRGN0_SHIFT);
+
+ if (!stage1)
+ reg |= (TTBCR_SL0_LVL_1 << TTBCR_SL0_SHIFT);
+
+ writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBCR);
+
+ /* MAIR0 (stage-1 only) */
+ if (stage1) {
+ reg = (MAIR_ATTR_NC << MAIR_ATTR_SHIFT(MAIR_ATTR_IDX_NC)) |
+ (MAIR_ATTR_WBRWA << MAIR_ATTR_SHIFT(MAIR_ATTR_IDX_CACHE)) |
+ (MAIR_ATTR_DEVICE << MAIR_ATTR_SHIFT(MAIR_ATTR_IDX_DEV));
+ writel_relaxed(reg, cb_base + ARM_SMMU_CB_S1_MAIR0);
+ }
+
+ /* SCTLR */
+ reg = SCTLR_CFCFG | SCTLR_CFIE | SCTLR_CFRE | SCTLR_M | SCTLR_EAE_SBOP;
+ if (stage1)
+ reg |= SCTLR_S1_ASIDPNE;
+#ifdef __BIG_ENDIAN
+ reg |= SCTLR_E;
+#endif
+ writel_relaxed(reg, cb_base + ARM_SMMU_CB_SCTLR);
+}
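
To make the TTBCR2 encoding above concrete, here is a worked example with hypothetical
address sizes; both switch statements select TTBCR2_ADDR_40 (value 2) for 40 bits:

/*
 * Worked example (hypothetical): s1_input_size = s1_output_size = 40.
 *   reg  = TTBCR2_ADDR_40 << TTBCR2_SEP_SHIFT;      2 << 15 = 0x10000
 *   reg |= TTBCR2_ADDR_40 << TTBCR2_PASIZE_SHIFT;   2 << 0  = 0x00002
 * so 0x10002 is written to ARM_SMMU_CB_TTBCR2 for a stage-1 bank.
 */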
- smmu_dbg(smmu, "Try to add master %s\n", masterspec->np->name);
+static int arm_smmu_init_domain_context(struct iommu_domain *domain,
+ struct arm_smmu_device *smmu)
+{
+ int irq, start, ret = 0;
+ unsigned long flags;
+ struct arm_smmu_domain *smmu_domain = domain->priv;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+
+ spin_lock_irqsave(&smmu_domain->lock, flags);
+ if (smmu_domain->smmu)
+ goto out_unlock;
+
+ /*
+ * Mapping the requested stage onto what we support is surprisingly
+ * complicated, mainly because the spec allows S1+S2 SMMUs without
+ * support for nested translation. That means we end up with the
+ * following table:
+ *
+ * Requested Supported Actual
+ * S1 N S1
+ * S1 S1+S2 S1
+ * S1 S2 S2
+ * S1 S1 S1
+ * N N N
+ * N S1+S2 S2
+ * N S2 S2
+ * N S1 S1
+ *
+ * Note that you can't actually request stage-2 mappings.
+ */
+ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1))
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S2))
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+
+ switch (smmu_domain->stage) {
+ case ARM_SMMU_DOMAIN_S1:
+ cfg->cbar = CBAR_TYPE_S1_TRANS_S2_BYPASS;
+ start = smmu->num_s2_context_banks;
+ break;
+ case ARM_SMMU_DOMAIN_NESTED:
+ /*
+ * We will likely want to change this if/when KVM gets
+ * involved.
+ */
+ case ARM_SMMU_DOMAIN_S2:
+ cfg->cbar = CBAR_TYPE_S2_TRANS;
+ start = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = __arm_smmu_alloc_bitmap(smmu->context_map, start,
+ smmu->num_context_banks);
+ if (IS_ERR_VALUE(ret))
+ goto out_unlock;
+
+ cfg->cbndx = ret;
+ if (smmu->version == ARM_SMMU_V1) {
+ cfg->irptndx = atomic_inc_return(&smmu->irptndx);
+ cfg->irptndx %= smmu->num_context_irqs;
+ } else {
+ cfg->irptndx = cfg->cbndx;
+ }
+
+ ACCESS_ONCE(smmu_domain->smmu) = smmu;
+ arm_smmu_init_context_bank(smmu_domain);
+ spin_unlock_irqrestore(&smmu_domain->lock, flags);
+
+ irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx];
+ ret = request_irq(irq, arm_smmu_context_fault, IRQF_SHARED,
+ "arm-smmu-context-fault", domain);
+ if (IS_ERR_VALUE(ret)) {
+ dev_err(smmu->dev, "failed to request context IRQ %d (%u)\n",
+ cfg->irptndx, irq);
+ cfg->irptndx = INVALID_IRPTNDX;
+ }
+
+ return 0;
+
+out_unlock:
+ spin_unlock_irqrestore(&smmu_domain->lock, flags);
+ return ret;
+}
- master = find_smmu_master(smmu, masterspec->np);
- if ( master )
- {
- smmu_err(smmu,
- "rejecting multiple registrations for master device %s\n",
- masterspec->np->name);
- return -EBUSY;
- }
+static void arm_smmu_destroy_domain_context(struct iommu_domain *domain)
+{
+ struct arm_smmu_domain *smmu_domain = domain->priv;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ void __iomem *cb_base;
+ int irq;
+
+ if (!smmu)
+ return;
+
+ /* Disable the context bank and nuke the TLB before freeing it. */
+ cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+ writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
+ arm_smmu_tlb_inv_context(smmu_domain);
+
+ if (cfg->irptndx != INVALID_IRPTNDX) {
+ irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx];
+ free_irq(irq, domain);
+ }
+
+ __arm_smmu_free_bitmap(smmu->context_map, cfg->cbndx);
+}
- if ( masterspec->args_count > MAX_MASTER_STREAMIDS )
- {
- smmu_err(smmu,
- "reached maximum number (%d) of stream IDs for master device %s\n",
- MAX_MASTER_STREAMIDS, masterspec->np->name);
- return -ENOSPC;
- }
+static int arm_smmu_domain_init(struct iommu_domain *domain)
+{
+ struct arm_smmu_domain *smmu_domain;
+
+ /*
+ * Allocate the domain and initialise some of its data structures.
+ * We can't really do anything meaningful until we've added a
+ * master.
+ */
+ smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL);
+ if (!smmu_domain)
+ return -ENOMEM;
+
+ spin_lock_init(&smmu_domain->lock);
+ domain->priv = smmu_domain;
+ return 0;
+}
- master = xzalloc(struct arm_smmu_master);
- if ( !master )
- return -ENOMEM;
+#if 0 /* Xen: Page tables are shared with the processor */
+static void arm_smmu_free_ptes(pmd_t *pmd)
+{
+ pgtable_t table = pmd_pgtable(*pmd);
- INIT_LIST_HEAD(&master->list);
- master->dt_node = masterspec->np;
- master->num_streamids = masterspec->args_count;
+ __free_page(table);
+}
- dt_device_set_protected(masterspec->np);
+static void arm_smmu_free_pmds(pud_t *pud)
+{
+ int i;
+ pmd_t *pmd, *pmd_base = pmd_offset(pud, 0);
- for ( i = 0; i < master->num_streamids; ++i )
- {
- sid = masterspec->args[i];
- if ( test_and_set_bit(sid, smmu->sids) )
- {
- smmu_err(smmu, "duplicate stream ID (%d)\n", sid);
- xfree(master);
- return -EEXIST;
- }
- master->streamids[i] = masterspec->args[i];
- }
+ pmd = pmd_base;
+ for (i = 0; i < PTRS_PER_PMD; ++i) {
+ if (pmd_none(*pmd))
+ continue;
- rc = insert_smmu_master(smmu, master);
- /* Insertion should never fail */
- ASSERT(rc == 0);
+ arm_smmu_free_ptes(pmd);
+ pmd++;
+ }
- return 0;
+ pmd_free(NULL, pmd_base);
}
-static int __arm_smmu_alloc_bitmap(unsigned long *map, int start, int end)
+static void arm_smmu_free_puds(pgd_t *pgd)
{
- int idx;
+ int i;
+ pud_t *pud, *pud_base = pud_offset(pgd, 0);
- do
- {
- idx = find_next_zero_bit(map, end, start);
- if ( idx == end )
- return -ENOSPC;
- } while ( test_and_set_bit(idx, map) );
+ pud = pud_base;
+ for (i = 0; i < PTRS_PER_PUD; ++i) {
+ if (pud_none(*pud))
+ continue;
- return idx;
+ arm_smmu_free_pmds(pud);
+ pud++;
+ }
+
+ pud_free(NULL, pud_base);
}
-static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
+static void arm_smmu_free_pgtables(struct arm_smmu_domain *smmu_domain)
{
- clear_bit(idx, map);
+ int i;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ pgd_t *pgd, *pgd_base = cfg->pgd;
+
+ /*
+ * Recursively free the page tables for this domain. We don't
+ * care about speculative TLB filling because the tables should
+ * not be active in any context bank at this point (SCTLR.M is 0).
+ */
+ pgd = pgd_base;
+ for (i = 0; i < PTRS_PER_PGD; ++i) {
+ if (pgd_none(*pgd))
+ continue;
+ arm_smmu_free_puds(pgd);
+ pgd++;
+ }
+
+ kfree(pgd_base);
}
+#endif
-static void arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
+static void arm_smmu_domain_destroy(struct iommu_domain *domain)
{
- int count = 0;
- void __iomem *gr0_base = SMMU_GR0(smmu);
+ struct arm_smmu_domain *smmu_domain = domain->priv;
+
+ /*
+ * Free the domain resources. We assume that all devices have
+ * already been detached.
+ */
+ arm_smmu_destroy_domain_context(domain);
+ kfree(smmu_domain);
+}
- writel_relaxed(0, gr0_base + SMMU_GR0_sTLBGSYNC);
- while ( readl_relaxed(gr0_base + SMMU_GR0_sTLBGSTATUS) &
- SMMU_sTLBGSTATUS_GSACTIVE )
- {
- cpu_relax();
- if ( ++count == SMMU_TLB_LOOP_TIMEOUT )
- {
- smmu_err(smmu, "TLB sync timed out -- SMMU may be deadlocked\n");
- return;
- }
- udelay(1);
- }
+static int arm_smmu_master_configure_smrs(struct arm_smmu_device *smmu,
+ struct arm_smmu_master_cfg *cfg)
+{
+ int i;
+ struct arm_smmu_smr *smrs;
+ void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
+
+ if (!(smmu->features & ARM_SMMU_FEAT_STREAM_MATCH))
+ return 0;
+
+ if (cfg->smrs)
+ return -EEXIST;
+
+ smrs = kmalloc_array(cfg->num_streamids, sizeof(*smrs), GFP_KERNEL);
+ if (!smrs) {
+ dev_err(smmu->dev, "failed to allocate %d SMRs\n",
+ cfg->num_streamids);
+ return -ENOMEM;
+ }
+
+ /* Allocate the SMRs on the SMMU */
+ for (i = 0; i < cfg->num_streamids; ++i) {
+ int idx = __arm_smmu_alloc_bitmap(smmu->smr_map, 0,
+ smmu->num_mapping_groups);
+ if (IS_ERR_VALUE(idx)) {
+ dev_err(smmu->dev, "failed to allocate free SMR\n");
+ goto err_free_smrs;
+ }
+
+ smrs[i] = (struct arm_smmu_smr) {
+ .idx = idx,
+ .mask = 0, /* We don't currently share SMRs */
+ .id = cfg->streamids[i],
+ };
+ }
+
+ /* It worked! Now, poke the actual hardware */
+ for (i = 0; i < cfg->num_streamids; ++i) {
+ u32 reg = SMR_VALID | smrs[i].id << SMR_ID_SHIFT |
+ smrs[i].mask << SMR_MASK_SHIFT;
+ writel_relaxed(reg, gr0_base + ARM_SMMU_GR0_SMR(smrs[i].idx));
+ }
+
+ cfg->smrs = smrs;
+ return 0;
+
+err_free_smrs:
+ while (--i >= 0)
+ __arm_smmu_free_bitmap(smmu->smr_map, smrs[i].idx);
+ kfree(smrs);
+ return -ENOSPC;
}
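
The SMR write above packs the three fields defined at the top of the file; a worked
example with a hypothetical stream ID:

/*
 * Worked example (hypothetical): id = 0x42, mask = 0 (no SMR sharing).
 *   reg = SMR_VALID | (0 << SMR_MASK_SHIFT) | (0x42 << SMR_ID_SHIFT)
 *       = 0x80000000 | 0x00000000 | 0x00000042 = 0x80000042
 * A 1 bit in the mask field would make the matching ID bit "don't care".
 */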
-static void arm_smmu_tlb_inv_context(struct arm_smmu_domain_cfg *cfg)
+static void arm_smmu_master_free_smrs(struct arm_smmu_device *smmu,
+ struct arm_smmu_master_cfg *cfg)
{
- struct arm_smmu_device *smmu = cfg->smmu;
- void __iomem *base = SMMU_GR0(smmu);
+ int i;
+ void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
+ struct arm_smmu_smr *smrs = cfg->smrs;
+
+ if (!smrs)
+ return;
- writel_relaxed(SMMU_CB_VMID(cfg),
- base + SMMU_GR0_TLBIVMID);
+ /* Invalidate the SMRs before freeing back to the allocator */
+ for (i = 0; i < cfg->num_streamids; ++i) {
+ u8 idx = smrs[i].idx;
- arm_smmu_tlb_sync(smmu);
+ writel_relaxed(~SMR_VALID, gr0_base + ARM_SMMU_GR0_SMR(idx));
+ __arm_smmu_free_bitmap(smmu->smr_map, idx);
+ }
+
+ cfg->smrs = NULL;
+ kfree(smrs);
}
-static void arm_smmu_iotlb_flush_all(struct domain *d)
+static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain,
+ struct arm_smmu_master_cfg *cfg)
{
- struct arm_smmu_domain *smmu_domain = domain_hvm_iommu(d)->arch.priv;
- struct arm_smmu_domain_cfg *cfg;
+ int i, ret;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
- spin_lock(&smmu_domain->lock);
- list_for_each_entry(cfg, &smmu_domain->contexts, list)
- arm_smmu_tlb_inv_context(cfg);
- spin_unlock(&smmu_domain->lock);
+ /* Devices in an IOMMU group may already be configured */
+ ret = arm_smmu_master_configure_smrs(smmu, cfg);
+ if (ret)
+ return ret == -EEXIST ? 0 : ret;
+
+ for (i = 0; i < cfg->num_streamids; ++i) {
+ u32 idx, s2cr;
+
+ idx = cfg->smrs ? cfg->smrs[i].idx : cfg->streamids[i];
+ s2cr = S2CR_TYPE_TRANS |
+ (smmu_domain->cfg.cbndx << S2CR_CBNDX_SHIFT);
+ writel_relaxed(s2cr, gr0_base + ARM_SMMU_GR0_S2CR(idx));
+ }
+
+ return 0;
}
-static void arm_smmu_iotlb_flush(struct domain *d, unsigned long gfn,
- unsigned int page_count)
+static void arm_smmu_domain_remove_master(struct arm_smmu_domain *smmu_domain,
+ struct arm_smmu_master_cfg *cfg)
{
- /* ARM SMMU v1 doesn't have flush by VMA and VMID */
- arm_smmu_iotlb_flush_all(d);
+ int i;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
+
+ /* An IOMMU group is torn down by the first device to be removed */
+ if ((smmu->features & ARM_SMMU_FEAT_STREAM_MATCH) && !cfg->smrs)
+ return;
+
+ /*
+ * We *must* clear the S2CR first, because freeing the SMR means
+ * that it can be re-allocated immediately.
+ * Xen: Unlike Linux, any access to a non-configured stream will fault.
+ */
+ for (i = 0; i < cfg->num_streamids; ++i) {
+ u32 idx = cfg->smrs ? cfg->smrs[i].idx : cfg->streamids[i];
+
+ writel_relaxed(S2CR_TYPE_FAULT,
+ gr0_base + ARM_SMMU_GR0_S2CR(idx));
+ }
+
+ arm_smmu_master_free_smrs(smmu, cfg);
+}
+
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ int ret;
+ struct arm_smmu_domain *smmu_domain = domain->priv;
+ struct arm_smmu_device *smmu, *dom_smmu;
+ struct arm_smmu_master_cfg *cfg;
+
+ smmu = find_smmu_for_device(dev);
+ if (!smmu) {
+ dev_err(dev, "cannot attach to SMMU, is it on the same bus?\n");
+ return -ENXIO;
+ }
+
+ if (dev_iommu_domain(dev)) {
+ dev_err(dev, "already attached to IOMMU domain\n");
+ return -EEXIST;
+ }
+
+ /*
+ * Sanity check the domain. We don't support domains across
+ * different SMMUs.
+ */
+ dom_smmu = ACCESS_ONCE(smmu_domain->smmu);
+ if (!dom_smmu) {
+ /* Now that we have a master, we can finalise the domain */
+ ret = arm_smmu_init_domain_context(domain, smmu);
+ if (IS_ERR_VALUE(ret))
+ return ret;
+
+ dom_smmu = smmu_domain->smmu;
+ }
+
+ if (dom_smmu != smmu) {
+ dev_err(dev,
+ "cannot attach to SMMU %s whilst already attached to domain on SMMU %s\n",
+ dev_name(smmu_domain->smmu->dev), dev_name(smmu->dev));
+ return -EINVAL;
+ }
+
+ /* Looks ok, so add the device to the domain */
+ cfg = find_smmu_master_cfg(dev);
+ if (!cfg)
+ return -ENODEV;
+
+ ret = arm_smmu_domain_add_master(smmu_domain, cfg);
+
+ if (!ret)
+ dev_iommu_domain(dev) = domain;
+ return ret;
}
-static int determine_smr_mask(struct arm_smmu_device *smmu,
- struct arm_smmu_master *master,
- struct arm_smmu_smr *smr, int start, int order)
-{
- u16 i, zero_bits_mask, one_bits_mask, const_mask;
- int nr;
-
- nr = 1 << order;
-
- if ( nr == 1 )
- {
- /* no mask, use streamid to match and be done with it */
- smr->mask = 0;
- smr->id = master->streamids[start];
- return 0;
- }
-
- zero_bits_mask = 0;
- one_bits_mask = 0xffff;
- for ( i = start; i < start + nr; i++)
- {
- zero_bits_mask |= master->streamids[i]; /* const 0 bits */
- one_bits_mask &= master->streamids[i]; /* const 1 bits */
- }
- zero_bits_mask = ~zero_bits_mask;
-
- /* bits having constant values (either 0 or 1) */
- const_mask = zero_bits_mask | one_bits_mask;
-
- i = hweight16(~const_mask);
- if ( (1 << i) == nr )
- {
- smr->mask = ~const_mask;
- smr->id = one_bits_mask;
- }
- else
- /* no usable mask for this set of streamids */
- return 1;
-
- if ( ((smr->mask & smmu->smr_mask_mask) != smr->mask) ||
- ((smr->id & smmu->smr_id_mask) != smr->id) )
- /* insufficient number of mask/id bits */
- return 1;
-
- return 0;
-}
-
-static int determine_smr_mapping(struct arm_smmu_device *smmu,
- struct arm_smmu_master *master,
- struct arm_smmu_smr *smrs, int max_smrs)
-{
- int nr_sid, nr, i, bit, start;
-
- /*
- * This function is called only once -- when a master is added
- * to a domain. If master->num_s2crs != 0 then this master
- * was already added to a domain.
- */
- BUG_ON(master->num_s2crs);
-
- start = nr = 0;
- nr_sid = master->num_streamids;
- do
- {
- /*
- * largest power-of-2 number of streamids for which to
- * determine a usable mask/id pair for stream matching
- */
- bit = fls(nr_sid);
- if (!bit)
- return 0;
-
- /*
- * iterate over power-of-2 numbers to determine
- * largest possible mask/id pair for stream matching
- * of next 2**i streamids
- */
- for ( i = bit - 1; i >= 0; i-- )
- {
- if( !determine_smr_mask(smmu, master,
- &smrs[master->num_s2crs],
- start, i))
- break;
- }
-
- if ( i < 0 )
- goto out;
-
- nr = 1 << i;
- nr_sid -= nr;
- start += nr;
- master->num_s2crs++;
- } while ( master->num_s2crs <= max_smrs );
+static void arm_smmu_detach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ struct arm_smmu_domain *smmu_domain = domain->priv;
+ struct arm_smmu_master_cfg *cfg;
-out:
- if ( nr_sid )
- {
- /* not enough mapping groups available */
- master->num_s2crs = 0;
- return -ENOSPC;
- }
+ cfg = find_smmu_master_cfg(dev);
+ if (!cfg)
+ return;
- return 0;
+ dev_iommu_domain(dev) = NULL;
+ arm_smmu_domain_remove_master(smmu_domain, cfg);
}
-static int arm_smmu_master_configure_smrs(struct arm_smmu_device *smmu,
- struct arm_smmu_master *master)
-{
- int i, max_smrs, ret;
- struct arm_smmu_smr *smrs;
- void __iomem *gr0_base = SMMU_GR0(smmu);
-
- if ( !(smmu->features & SMMU_FEAT_STREAM_MATCH) )
- return 0;
-
- if ( master->smrs )
- return -EEXIST;
-
- max_smrs = min(smmu->num_mapping_groups, master->num_streamids);
- smrs = xmalloc_array(struct arm_smmu_smr, max_smrs);
- if ( !smrs )
- {
- smmu_err(smmu, "failed to allocated %d SMRs for master %s\n",
- max_smrs, dt_node_name(master->dt_node));
- return -ENOMEM;
- }
-
- ret = determine_smr_mapping(smmu, master, smrs, max_smrs);
- if ( ret )
- goto err_free_smrs;
-
- /* Allocate the SMRs on the root SMMU */
- for ( i = 0; i < master->num_s2crs; ++i )
- {
- int idx = __arm_smmu_alloc_bitmap(smmu->smr_map, 0,
- smmu->num_mapping_groups);
- if ( idx < 0 )
- {
- smmu_err(smmu, "failed to allocate free SMR\n");
- goto err_free_bitmap;
- }
- smrs[i].idx = idx;
- }
-
- /* It worked! Now, poke the actual hardware */
- for ( i = 0; i < master->num_s2crs; ++i )
- {
- u32 reg = SMMU_SMR_VALID | smrs[i].id << SMMU_SMR_ID_SHIFT |
- smrs[i].mask << SMMU_SMR_MASK_SHIFT;
- smmu_dbg(smmu, "SMR%d: 0x%x\n", smrs[i].idx, reg);
- writel_relaxed(reg, gr0_base + SMMU_GR0_SMR(smrs[i].idx));
- }
-
- master->smrs = smrs;
- return 0;
-
-err_free_bitmap:
- while (--i >= 0)
- __arm_smmu_free_bitmap(smmu->smr_map, smrs[i].idx);
- master->num_s2crs = 0;
-err_free_smrs:
- xfree(smrs);
- return -ENOSPC;
+#if 0 /*
+ * Xen: The page table is shared with the processor, therefore
+ * separate helpers are not necessary.
+ */
+static bool arm_smmu_pte_is_contiguous_range(unsigned long addr,
+ unsigned long end)
+{
+ return !(addr & ~ARM_SMMU_PTE_CONT_MASK) &&
+ (addr + ARM_SMMU_PTE_CONT_SIZE <= end);
}
-/* Forward declaration */
-static void arm_smmu_destroy_domain_context(struct arm_smmu_domain_cfg *cfg);
+static int arm_smmu_alloc_init_pte(struct arm_smmu_device *smmu, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, int prot, int stage)
+{
+ pte_t *pte, *start;
+ pteval_t pteval = ARM_SMMU_PTE_PAGE | ARM_SMMU_PTE_AF;
+
+ if (pmd_none(*pmd)) {
+ /* Allocate a new set of tables */
+ pgtable_t table = alloc_page(GFP_ATOMIC|__GFP_ZERO);
+
+ if (!table)
+ return -ENOMEM;
+
+ arm_smmu_flush_pgtable(smmu, page_address(table), PAGE_SIZE);
+ pmd_populate(NULL, pmd, table);
+ arm_smmu_flush_pgtable(smmu, pmd, sizeof(*pmd));
+ }
+
+ if (stage == 1) {
+ pteval |= ARM_SMMU_PTE_AP_UNPRIV | ARM_SMMU_PTE_nG;
+ if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
+ pteval |= ARM_SMMU_PTE_AP_RDONLY;
+
+ if (prot & IOMMU_CACHE)
+ pteval |= (MAIR_ATTR_IDX_CACHE <<
+ ARM_SMMU_PTE_ATTRINDX_SHIFT);
+ } else {
+ pteval |= ARM_SMMU_PTE_HAP_FAULT;
+ if (prot & IOMMU_READ)
+ pteval |= ARM_SMMU_PTE_HAP_READ;
+ if (prot & IOMMU_WRITE)
+ pteval |= ARM_SMMU_PTE_HAP_WRITE;
+ if (prot & IOMMU_CACHE)
+ pteval |= ARM_SMMU_PTE_MEMATTR_OIWB;
+ else
+ pteval |= ARM_SMMU_PTE_MEMATTR_NC;
+ }
+
+ if (prot & IOMMU_NOEXEC)
+ pteval |= ARM_SMMU_PTE_XN;
+
+ /* If no access, create a faulting entry to avoid TLB fills */
+ if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
+ pteval &= ~ARM_SMMU_PTE_PAGE;
+
+ pteval |= ARM_SMMU_PTE_SH_IS;
+ start = pmd_page_vaddr(*pmd) + pte_index(addr);
+ pte = start;
+
+ /*
+ * Install the page table entries. This is fairly complicated
+ * since we attempt to make use of the contiguous hint in the
+ * ptes where possible. The contiguous hint indicates a series
+ * of ARM_SMMU_PTE_CONT_ENTRIES ptes mapping a physically
+ * contiguous region with the following constraints:
+ *
+ * - The region start is aligned to ARM_SMMU_PTE_CONT_SIZE
+ * - Each pte in the region has the contiguous hint bit set
+ *
+ * This complicates unmapping (also handled by this code, when
+ * neither IOMMU_READ nor IOMMU_WRITE is set) because it is
+ * possible, yet highly unlikely, that a client may unmap only
+ * part of a contiguous range. This requires clearing of the
+ * contiguous hint bits in the range before installing the new
+ * faulting entries.
+ *
+ * Note that re-mapping an address range without first unmapping
+ * it is not supported, so TLB invalidation is not required here
+ * and is instead performed at unmap and domain-init time.
+ */
+ do {
+ int i = 1;
+
+ pteval &= ~ARM_SMMU_PTE_CONT;
+
+ if (arm_smmu_pte_is_contiguous_range(addr, end)) {
+ i = ARM_SMMU_PTE_CONT_ENTRIES;
+ pteval |= ARM_SMMU_PTE_CONT;
+ } else if (pte_val(*pte) &
+ (ARM_SMMU_PTE_CONT | ARM_SMMU_PTE_PAGE)) {
+ int j;
+ pte_t *cont_start;
+ unsigned long idx = pte_index(addr);
+
+ idx &= ~(ARM_SMMU_PTE_CONT_ENTRIES - 1);
+ cont_start = pmd_page_vaddr(*pmd) + idx;
+ for (j = 0; j < ARM_SMMU_PTE_CONT_ENTRIES; ++j)
+ pte_val(*(cont_start + j)) &=
+ ~ARM_SMMU_PTE_CONT;
+
+ arm_smmu_flush_pgtable(smmu, cont_start,
+ sizeof(*pte) *
+ ARM_SMMU_PTE_CONT_ENTRIES);
+ }
+
+ do {
+ *pte = pfn_pte(pfn, __pgprot(pteval));
+ } while (pte++, pfn++, addr += PAGE_SIZE, --i);
+ } while (addr != end);
+
+ arm_smmu_flush_pgtable(smmu, start, sizeof(*pte) * (pte - start));
+ return 0;
+}
-static int arm_smmu_domain_add_master(struct domain *d,
- struct arm_smmu_domain_cfg *cfg,
- struct arm_smmu_master *master)
+static int arm_smmu_alloc_init_pmd(struct arm_smmu_device *smmu, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ phys_addr_t phys, int prot, int stage)
{
- int i, ret;
- struct arm_smmu_device *smmu = cfg->smmu;
- void __iomem *gr0_base = SMMU_GR0(smmu);
- struct arm_smmu_smr *smrs = master->smrs;
+ int ret;
+ pmd_t *pmd;
+ unsigned long next, pfn = __phys_to_pfn(phys);
+
+#ifndef __PAGETABLE_PMD_FOLDED
+ if (pud_none(*pud)) {
+ pmd = (pmd_t *)get_zeroed_page(GFP_ATOMIC);
+ if (!pmd)
+ return -ENOMEM;
+
+ arm_smmu_flush_pgtable(smmu, pmd, PAGE_SIZE);
+ pud_populate(NULL, pud, pmd);
+ arm_smmu_flush_pgtable(smmu, pud, sizeof(*pud));
+
+ pmd += pmd_index(addr);
+ } else
+#endif
+ pmd = pmd_offset(pud, addr);
- if ( master->cfg )
- return -EBUSY;
+ do {
+ next = pmd_addr_end(addr, end);
+ ret = arm_smmu_alloc_init_pte(smmu, pmd, addr, next, pfn,
+ prot, stage);
+ phys += next - addr;
+ pfn = __phys_to_pfn(phys);
+ } while (pmd++, addr = next, addr < end);
- ret = arm_smmu_master_configure_smrs(smmu, master);
- if ( ret )
- return ret;
+ return ret;
+}
- /* Now we're at the root, time to point at our context bank */
- if ( !master->num_s2crs )
- master->num_s2crs = master->num_streamids;
+static int arm_smmu_alloc_init_pud(struct arm_smmu_device *smmu, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ phys_addr_t phys, int prot, int stage)
+{
+ int ret = 0;
+ pud_t *pud;
+ unsigned long next;
+
+#ifndef __PAGETABLE_PUD_FOLDED
+ if (pgd_none(*pgd)) {
+ pud = (pud_t *)get_zeroed_page(GFP_ATOMIC);
+ if (!pud)
+ return -ENOMEM;
+
+ arm_smmu_flush_pgtable(smmu, pud, PAGE_SIZE);
+ pgd_populate(NULL, pgd, pud);
+ arm_smmu_flush_pgtable(smmu, pgd, sizeof(*pgd));
+
+ pud += pud_index(addr);
+ } else
+#endif
+ pud = pud_offset(pgd, addr);
- for ( i = 0; i < master->num_s2crs; ++i )
- {
- u32 idx, s2cr;
+ do {
+ next = pud_addr_end(addr, end);
+ ret = arm_smmu_alloc_init_pmd(smmu, pud, addr, next, phys,
+ prot, stage);
+ phys += next - addr;
+ } while (pud++, addr = next, addr < end);
- idx = smrs ? smrs[i].idx : master->streamids[i];
- s2cr = (SMMU_S2CR_TYPE_TRANS << SMMU_S2CR_TYPE_SHIFT) |
- (cfg->cbndx << SMMU_S2CR_CBNDX_SHIFT);
- smmu_dbg(smmu, "S2CR%d: 0x%x\n", idx, s2cr);
- writel_relaxed(s2cr, gr0_base + SMMU_GR0_S2CR(idx));
- }
+ return ret;
+}
- master->cfg = cfg;
- list_add(&master->list, &cfg->masters);
+static int arm_smmu_handle_mapping(struct arm_smmu_domain *smmu_domain,
+ unsigned long iova, phys_addr_t paddr,
+ size_t size, int prot)
+{
+ int ret, stage;
+ unsigned long end;
+ phys_addr_t input_mask, output_mask;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+ pgd_t *pgd = cfg->pgd;
+ unsigned long flags;
+
+ if (cfg->cbar == CBAR_TYPE_S2_TRANS) {
+ stage = 2;
+ input_mask = (1ULL << smmu->s2_input_size) - 1;
+ output_mask = (1ULL << smmu->s2_output_size) - 1;
+ } else {
+ stage = 1;
+ input_mask = (1ULL << smmu->s1_input_size) - 1;
+ output_mask = (1ULL << smmu->s1_output_size) - 1;
+ }
+
+ if (!pgd)
+ return -EINVAL;
+
+ if (size & ~PAGE_MASK)
+ return -EINVAL;
+
+ if ((phys_addr_t)iova & ~input_mask)
+ return -ERANGE;
+
+ if (paddr & ~output_mask)
+ return -ERANGE;
+
+ spin_lock_irqsave(&smmu_domain->lock, flags);
+ pgd += pgd_index(iova);
+ end = iova + size;
+ do {
+ unsigned long next = pgd_addr_end(iova, end);
+
+ ret = arm_smmu_alloc_init_pud(smmu, pgd, iova, next, paddr,
+ prot, stage);
+ if (ret)
+ goto out_unlock;
+
+ paddr += next - iova;
+ iova = next;
+ } while (pgd++, iova != end);
+
+out_unlock:
+ spin_unlock_irqrestore(&smmu_domain->lock, flags);
+
+ return ret;
+}
+
+static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot)
+{
+ struct arm_smmu_domain *smmu_domain = domain->priv;
- return 0;
+ if (!smmu_domain)
+ return -ENODEV;
+
+ return arm_smmu_handle_mapping(smmu_domain, iova, paddr, size, prot);
}
-static void arm_smmu_domain_remove_master(struct arm_smmu_master *master)
+static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
+ size_t size)
{
- int i;
- struct arm_smmu_domain_cfg *cfg = master->cfg;
- struct arm_smmu_device *smmu = cfg->smmu;
- void __iomem *gr0_base = SMMU_GR0(smmu);
- struct arm_smmu_smr *smrs = master->smrs;
+ int ret;
+ struct arm_smmu_domain *smmu_domain = domain->priv;
- /*
- * We *must* clear the S2CR first, because freeing the SMR means
- * that it can be reallocated immediately
- */
- for ( i = 0; i < master->num_streamids; ++i )
- {
- u16 sid = master->streamids[i];
- writel_relaxed(SMMU_S2CR_TYPE_FAULT,
- gr0_base + SMMU_GR0_S2CR(sid));
- }
+ ret = arm_smmu_handle_mapping(smmu_domain, iova, 0, size, 0);
+ arm_smmu_tlb_inv_context(smmu_domain);
+ return ret ? 0 : size;
+}
- /* Invalidate the SMRs before freeing back to the allocator */
- for (i = 0; i < master->num_s2crs; ++i) {
- u8 idx = smrs[i].idx;
- writel_relaxed(~SMMU_SMR_VALID, gr0_base + SMMU_GR0_SMR(idx));
- __arm_smmu_free_bitmap(smmu->smr_map, idx);
- }
+static phys_addr_t arm_smmu_iova_to_phys(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ pgd_t *pgdp, pgd;
+ pud_t pud;
+ pmd_t pmd;
+ pte_t pte;
+ struct arm_smmu_domain *smmu_domain = domain->priv;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+
+ pgdp = cfg->pgd;
+ if (!pgdp)
+ return 0;
+
+ pgd = *(pgdp + pgd_index(iova));
+ if (pgd_none(pgd))
+ return 0;
+
+ pud = *pud_offset(&pgd, iova);
+ if (pud_none(pud))
+ return 0;
+
+ pmd = *pmd_offset(&pud, iova);
+ if (pmd_none(pmd))
+ return 0;
+
+ pte = *(pmd_page_vaddr(pmd) + pte_index(iova));
+ if (pte_none(pte))
+ return 0;
+
+ return __pfn_to_phys(pte_pfn(pte)) | (iova & ~PAGE_MASK);
+}
+#endif
- master->smrs = NULL;
- master->num_s2crs = 0;
- xfree(smrs);
+#if 0 /* Xen: arm_smmu_capable is not used at the moment */
+static bool arm_smmu_capable(enum iommu_cap cap)
+{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ /*
+ * Return true here as the SMMU can always send out coherent
+ * requests.
+ */
+ return true;
+ case IOMMU_CAP_INTR_REMAP:
+ return true; /* MSIs are just memory writes */
+ case IOMMU_CAP_NOEXEC:
+ return true;
+ default:
+ return false;
+ }
+}
+#endif
- master->cfg = NULL;
- list_del(&master->list);
- INIT_LIST_HEAD(&master->list);
+static int __arm_smmu_get_pci_sid(struct pci_dev *pdev, u16 alias, void *data)
+{
+ *((u16 *)data) = alias;
+ return 0; /* Continue walking */
}
-static void arm_smmu_init_context_bank(struct arm_smmu_domain_cfg *cfg)
+static void __arm_smmu_release_pci_iommudata(void *data)
{
- u32 reg;
- struct arm_smmu_device *smmu = cfg->smmu;
- void __iomem *cb_base, *gr1_base;
- paddr_t p2maddr;
+ kfree(data);
+}
- ASSERT(cfg->domain != NULL);
- p2maddr = page_to_maddr(cfg->domain->arch.p2m.root);
+static int arm_smmu_add_device(struct device *dev)
+{
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_master_cfg *cfg;
+ struct iommu_group *group;
+ void (*releasefn)(void *) = NULL;
+ int ret;
+
+ smmu = find_smmu_for_device(dev);
+ if (!smmu)
+ return -ENODEV;
+
+ group = iommu_group_alloc();
+ if (IS_ERR(group)) {
+ dev_err(dev, "Failed to allocate IOMMU group\n");
+ return PTR_ERR(group);
+ }
+
+ if (dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
+ if (!cfg) {
+ ret = -ENOMEM;
+ goto out_put_group;
+ }
+
+ cfg->num_streamids = 1;
+ /*
+ * Assume Stream ID == Requester ID for now.
+ * We need a way to describe the ID mappings in FDT.
+ */
+ pci_for_each_dma_alias(pdev, __arm_smmu_get_pci_sid,
+ &cfg->streamids[0]);
+ releasefn = __arm_smmu_release_pci_iommudata;
+ } else {
+ struct arm_smmu_master *master;
+
+ master = find_smmu_master(smmu, dev->of_node);
+ if (!master) {
+ ret = -ENODEV;
+ goto out_put_group;
+ }
+
+ cfg = &master->cfg;
+ }
+
+ iommu_group_set_iommudata(group, cfg, releasefn);
+ ret = iommu_group_add_device(group, dev);
+
+out_put_group:
+ iommu_group_put(group);
+ return ret;
+}
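
The PCI branch above assumes StreamID == Requester ID, walking the DMA
aliases with pci_for_each_dma_alias(). A minimal sketch of the RID packing
this shortcut relies on (bus number in the upper byte, device/function in
the lower; standard C, helper name hypothetical):

    #include <stdint.h>

    /* PCI Requester ID layout assumed by the StreamID == RID shortcut:
     * bus[15:8] | dev/fn[7:0].  Sketch only, not part of the patch. */
    static inline uint16_t pci_requester_id(uint8_t bus, uint8_t devfn)
    {
        return ((uint16_t)bus << 8) | devfn;
    }
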
- gr1_base = SMMU_GR1(smmu);
- cb_base = SMMU_CB_BASE(smmu) + SMMU_CB(smmu, cfg->cbndx);
+#if 0 /* Xen: We don't support remove device for now. Will be useful for PCI */
+static void arm_smmu_remove_device(struct device *dev)
+{
+ iommu_group_remove_device(dev);
+}
- /* CBAR */
- reg = cfg->cbar;
- if ( smmu->version == 1 )
- reg |= cfg->irptndx << SMMU_CBAR_IRPTNDX_SHIFT;
+static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ struct arm_smmu_domain *smmu_domain = domain->priv;
+
+ switch (attr) {
+ case DOMAIN_ATTR_NESTING:
+ *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
+ return 0;
+ default:
+ return -ENODEV;
+ }
+}
- reg |= SMMU_CB_VMID(cfg) << SMMU_CBAR_VMID_SHIFT;
- writel_relaxed(reg, gr1_base + SMMU_GR1_CBAR(cfg->cbndx));
+static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
+ enum iommu_attr attr, void *data)
+{
+ struct arm_smmu_domain *smmu_domain = domain->priv;
+
+ switch (attr) {
+ case DOMAIN_ATTR_NESTING:
+ if (smmu_domain->smmu)
+ return -EPERM;
+ if (*(int *)data)
+ smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
+ else
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+
+ return 0;
+ default:
+ return -ENODEV;
+ }
+}
- if ( smmu->version > 1 )
- {
- /* CBA2R */
-#ifdef CONFIG_ARM_64
- reg = SMMU_CBA2R_RW64_64BIT;
+static const struct iommu_ops arm_smmu_ops = {
+ .capable = arm_smmu_capable,
+ .domain_init = arm_smmu_domain_init,
+ .domain_destroy = arm_smmu_domain_destroy,
+ .attach_dev = arm_smmu_attach_dev,
+ .detach_dev = arm_smmu_detach_dev,
+ .map = arm_smmu_map,
+ .unmap = arm_smmu_unmap,
+ .map_sg = default_iommu_map_sg,
+ .iova_to_phys = arm_smmu_iova_to_phys,
+ .add_device = arm_smmu_add_device,
+ .remove_device = arm_smmu_remove_device,
+ .domain_get_attr = arm_smmu_domain_get_attr,
+ .domain_set_attr = arm_smmu_domain_set_attr,
+ .pgsize_bitmap = (SECTION_SIZE |
+ ARM_SMMU_PTE_CONT_SIZE |
+ PAGE_SIZE),
+};
+#endif
+
+static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
+{
+ void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
+ void __iomem *cb_base;
+ int i = 0;
+ u32 reg;
+
+ /* clear global FSR */
+ reg = readl_relaxed(ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sGFSR);
+ writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sGFSR);
+
+ /* Mark all SMRn as invalid and all S2CRn as bypass */
+ for (i = 0; i < smmu->num_mapping_groups; ++i) {
+ writel_relaxed(0, gr0_base + ARM_SMMU_GR0_SMR(i));
+ /*
+		 * Xen: Unlike Linux, any access to a non-configured stream
+ * will fault by default.
+ */
+ writel_relaxed(S2CR_TYPE_FAULT,
+ gr0_base + ARM_SMMU_GR0_S2CR(i));
+ }
+
+ /* Make sure all context banks are disabled and clear CB_FSR */
+ for (i = 0; i < smmu->num_context_banks; ++i) {
+ cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, i);
+ writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
+ writel_relaxed(FSR_FAULT, cb_base + ARM_SMMU_CB_FSR);
+ }
+
+ /* Invalidate the TLB, just in case */
+ writel_relaxed(0, gr0_base + ARM_SMMU_GR0_STLBIALL);
+ writel_relaxed(0, gr0_base + ARM_SMMU_GR0_TLBIALLH);
+ writel_relaxed(0, gr0_base + ARM_SMMU_GR0_TLBIALLNSNH);
+
+ reg = readl_relaxed(ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
+
+ /* Enable fault reporting */
+ reg |= (sCR0_GFRE | sCR0_GFIE | sCR0_GCFGFRE | sCR0_GCFGFIE);
+
+ /* Disable TLB broadcasting. */
+ reg |= (sCR0_VMIDPNE | sCR0_PTM);
+
+ /* Enable client access, but bypass when no mapping is found */
+ reg &= ~(sCR0_CLIENTPD | sCR0_USFCFG);
+ /* Xen: Unlike Linux, generate a fault when no mapping is found */
+ reg |= sCR0_USFCFG;
+
+ /* Disable forced broadcasting */
+ reg &= ~sCR0_FB;
+
+ /* Don't upgrade barriers */
+ reg &= ~(sCR0_BSU_MASK << sCR0_BSU_SHIFT);
+
+ /* Push the button */
+ arm_smmu_tlb_sync(smmu);
+ writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
+}
+
+static int arm_smmu_id_size_to_bits(int size)
+{
+ switch (size) {
+ case 0:
+ return 32;
+ case 1:
+ return 36;
+ case 2:
+ return 40;
+ case 3:
+ return 42;
+ case 4:
+ return 44;
+ case 5:
+ default:
+ return 48;
+ }
+}
+
+static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
+{
+ unsigned long size;
+ void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
+ u32 id;
+
+ dev_notice(smmu->dev, "probing hardware configuration...\n");
+ dev_notice(smmu->dev, "SMMUv%d with:\n", smmu->version);
+
+ /* ID0 */
+ id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID0);
+#ifndef CONFIG_64BIT
+ if (((id >> ID0_PTFS_SHIFT) & ID0_PTFS_MASK) == ID0_PTFS_V8_ONLY) {
+ dev_err(smmu->dev, "\tno v7 descriptor support!\n");
+ return -ENODEV;
+ }
+#endif
+
+ /* Restrict available stages based on module parameter */
+ if (force_stage == 1)
+ id &= ~(ID0_S2TS | ID0_NTS);
+ else if (force_stage == 2)
+ id &= ~(ID0_S1TS | ID0_NTS);
+
+ if (id & ID0_S1TS) {
+ smmu->features |= ARM_SMMU_FEAT_TRANS_S1;
+ dev_notice(smmu->dev, "\tstage 1 translation\n");
+ }
+
+ if (id & ID0_S2TS) {
+ smmu->features |= ARM_SMMU_FEAT_TRANS_S2;
+ dev_notice(smmu->dev, "\tstage 2 translation\n");
+ }
+
+ if (id & ID0_NTS) {
+ smmu->features |= ARM_SMMU_FEAT_TRANS_NESTED;
+ dev_notice(smmu->dev, "\tnested translation\n");
+ }
+
+ if (!(smmu->features &
+ (ARM_SMMU_FEAT_TRANS_S1 | ARM_SMMU_FEAT_TRANS_S2))) {
+ dev_err(smmu->dev, "\tno translation support!\n");
+ return -ENODEV;
+ }
+
+ if (id & ID0_CTTW) {
+ smmu->features |= ARM_SMMU_FEAT_COHERENT_WALK;
+ dev_notice(smmu->dev, "\tcoherent table walk\n");
+ }
+
+ if (id & ID0_SMS) {
+ u32 smr, sid, mask;
+
+ smmu->features |= ARM_SMMU_FEAT_STREAM_MATCH;
+ smmu->num_mapping_groups = (id >> ID0_NUMSMRG_SHIFT) &
+ ID0_NUMSMRG_MASK;
+ if (smmu->num_mapping_groups == 0) {
+ dev_err(smmu->dev,
+ "stream-matching supported, but no SMRs present!\n");
+ return -ENODEV;
+ }
+
+ smr = SMR_MASK_MASK << SMR_MASK_SHIFT;
+ smr |= (SMR_ID_MASK << SMR_ID_SHIFT);
+ writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0));
+ smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0));
+
+ mask = (smr >> SMR_MASK_SHIFT) & SMR_MASK_MASK;
+ sid = (smr >> SMR_ID_SHIFT) & SMR_ID_MASK;
+ if ((mask & sid) != sid) {
+ dev_err(smmu->dev,
+ "SMR mask bits (0x%x) insufficient for ID field (0x%x)\n",
+ mask, sid);
+ return -ENODEV;
+ }
+
+ dev_notice(smmu->dev,
+ "\tstream matching with %u register groups, mask 0x%x\n",
+ smmu->num_mapping_groups, mask);
+ } else {
+ smmu->num_mapping_groups = (id >> ID0_NUMSIDB_SHIFT) &
+ ID0_NUMSIDB_MASK;
+ }
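
The SMR sanity check above uses a classic probing idiom: write all ones to
the mask and ID fields, read the register back, and treat any bit that did
not stick as unimplemented. The idiom in isolation, with
probe_implemented_bits() as a hypothetical helper:

    #include <stdint.h>

    /* Write the maximal field value, then read back: bits the hardware
     * does not implement (RAZ/WI) come back as zero. */
    static uint32_t probe_implemented_bits(volatile uint32_t *reg,
                                           uint32_t field_mask)
    {
        *reg = field_mask;
        return *reg & field_mask;
    }
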
+
+ /* ID1 */
+ id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID1);
+ smmu->pgshift = (id & ID1_PAGESIZE) ? 16 : 12;
+
+ /* Check for size mismatch of SMMU address space from mapped region */
+ size = 1 <<
+ (((id >> ID1_NUMPAGENDXB_SHIFT) & ID1_NUMPAGENDXB_MASK) + 1);
+ size *= 2 << smmu->pgshift;
+ if (smmu->size != size)
+ dev_warn(smmu->dev,
+ "SMMU address space size (0x%lx) differs from mapped region size (0x%lx)!\n",
+ size, smmu->size);
+
+ smmu->num_s2_context_banks = (id >> ID1_NUMS2CB_SHIFT) &
+ ID1_NUMS2CB_MASK;
+ smmu->num_context_banks = (id >> ID1_NUMCB_SHIFT) & ID1_NUMCB_MASK;
+ if (smmu->num_s2_context_banks > smmu->num_context_banks) {
+ dev_err(smmu->dev, "impossible number of S2 context banks!\n");
+ return -ENODEV;
+ }
+ dev_notice(smmu->dev, "\t%u context banks (%u stage-2 only)\n",
+ smmu->num_context_banks, smmu->num_s2_context_banks);
+
+ /* ID2 */
+ id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID2);
+ size = arm_smmu_id_size_to_bits((id >> ID2_IAS_SHIFT) & ID2_IAS_MASK);
+ smmu->s1_output_size = min_t(unsigned long, PHYS_MASK_SHIFT, size);
+
+ /* Xen: Stage-2 input size has to match p2m_ipa_bits. */
+ if (size < p2m_ipa_bits) {
+ dev_err(smmu->dev,
+ "P2M IPA size not supported (P2M=%u SMMU=%lu)!\n",
+ p2m_ipa_bits, size);
+ return -ENODEV;
+ }
+ smmu->s2_input_size = p2m_ipa_bits;
+#if 0
+ /* Stage-2 input size limited due to pgd allocation (PTRS_PER_PGD) */
+#ifdef CONFIG_64BIT
+ smmu->s2_input_size = min_t(unsigned long, VA_BITS, size);
#else
- reg = SMMU_CBA2R_RW64_32BIT;
+ smmu->s2_input_size = min(32UL, size);
+#endif
#endif
- writel_relaxed(reg, gr1_base + SMMU_GR1_CBA2R(cfg->cbndx));
- }
-
- /* TTBR0 */
- reg = (p2maddr & ((1ULL << 32) - 1));
- writel_relaxed(reg, cb_base + SMMU_CB_TTBR0_LO);
- reg = (p2maddr >> 32);
- writel_relaxed(reg, cb_base + SMMU_CB_TTBR0_HI);
-
- /*
- * TCR
- * We use long descriptor, with inner-shareable WBWA tables in TTBR0.
- */
- if ( smmu->version > 1 )
- {
- /* 4K Page Table */
- if ( PAGE_SIZE == PAGE_SIZE_4K )
- reg = SMMU_TCR_TG0_4K;
- else
- reg = SMMU_TCR_TG0_64K;
-
- switch ( smmu->s2_output_size ) {
- case 32:
- reg |= (SMMU_TCR2_ADDR_32 << SMMU_TCR_PASIZE_SHIFT);
- break;
- case 36:
- reg |= (SMMU_TCR2_ADDR_36 << SMMU_TCR_PASIZE_SHIFT);
- break;
- case 40:
- reg |= (SMMU_TCR2_ADDR_40 << SMMU_TCR_PASIZE_SHIFT);
- break;
- case 42:
- reg |= (SMMU_TCR2_ADDR_42 << SMMU_TCR_PASIZE_SHIFT);
- break;
- case 44:
- reg |= (SMMU_TCR2_ADDR_44 << SMMU_TCR_PASIZE_SHIFT);
- break;
- case 48:
- reg |= (SMMU_TCR2_ADDR_48 << SMMU_TCR_PASIZE_SHIFT);
- break;
- }
- }
- else
- reg = 0;
-
- /* The attribute to walk the page table should be the same as VTCR_EL2 */
- reg |= SMMU_TCR_EAE |
- (SMMU_TCR_SH_IS << SMMU_TCR_SH0_SHIFT) |
- (SMMU_TCR_RGN_WBWA << SMMU_TCR_ORGN0_SHIFT) |
- (SMMU_TCR_RGN_WBWA << SMMU_TCR_IRGN0_SHIFT) |
- (SMMU_TCR_SL0_LVL_1 << SMMU_TCR_SL0_SHIFT) |
- /* T0SZ=(1)100 = -8 ( 32 -(-8) = 40 bit physical addresses ) */
- (0x18 << SMMU_TCR_T0SZ_SHIFT);
- writel_relaxed(reg, cb_base + SMMU_CB_TCR);
-
- /* SCTLR */
- reg = SMMU_SCTLR_CFCFG |
- SMMU_SCTLR_CFIE |
- SMMU_SCTLR_CFRE |
- SMMU_SCTLR_M |
- SMMU_SCTLR_EAE_SBOP;
-
- writel_relaxed(reg, cb_base + SMMU_CB_SCTLR);
-}
-
-static struct arm_smmu_domain_cfg *
-arm_smmu_alloc_domain_context(struct domain *d,
- struct arm_smmu_device *smmu)
-{
- unsigned int irq;
- int ret, start;
- struct arm_smmu_domain_cfg *cfg;
- struct arm_smmu_domain *smmu_domain = domain_hvm_iommu(d)->arch.priv;
-
- ASSERT(spin_is_locked(&smmu_domain->lock));
-
- cfg = xzalloc(struct arm_smmu_domain_cfg);
- if ( !cfg )
- return NULL;
-
- /* Master already initialized to another domain ... */
- if ( cfg->domain != NULL )
- goto out_free_mem;
-
- cfg->cbar = SMMU_CBAR_TYPE_S2_TRANS;
- start = 0;
-
- ret = __arm_smmu_alloc_bitmap(smmu->context_map, start,
- smmu->num_context_banks);
- if ( ret < 0 )
- goto out_free_mem;
-
- cfg->cbndx = ret;
- if ( smmu->version == 1 )
- {
- cfg->irptndx = atomic_inc_return(&smmu->irptndx);
- cfg->irptndx %= smmu->num_context_irqs;
- }
- else
- cfg->irptndx = cfg->cbndx;
-
- irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx];
- ret = request_irq(irq, IRQF_SHARED, arm_smmu_context_fault,
- "arm-smmu-context-fault", cfg);
- if ( ret )
- {
- smmu_err(smmu, "failed to request context IRQ %d (%u)\n",
- cfg->irptndx, irq);
- cfg->irptndx = INVALID_IRPTNDX;
- goto out_free_context;
- }
-
- cfg->domain = d;
- cfg->smmu = smmu;
- if ( smmu->features & SMMU_FEAT_COHERENT_WALK )
- iommu_set_feature(d, IOMMU_FEAT_COHERENT_WALK);
-
- arm_smmu_init_context_bank(cfg);
- list_add(&cfg->list, &smmu_domain->contexts);
- INIT_LIST_HEAD(&cfg->masters);
-
- return cfg;
-
-out_free_context:
- __arm_smmu_free_bitmap(smmu->context_map, cfg->cbndx);
-out_free_mem:
- xfree(cfg);
-
- return NULL;
-}
-
-static void arm_smmu_destroy_domain_context(struct arm_smmu_domain_cfg *cfg)
-{
- struct domain *d = cfg->domain;
- struct arm_smmu_domain *smmu_domain = domain_hvm_iommu(d)->arch.priv;
- struct arm_smmu_device *smmu = cfg->smmu;
- void __iomem *cb_base;
- unsigned int irq;
-
- ASSERT(spin_is_locked(&smmu_domain->lock));
- BUG_ON(!list_empty(&cfg->masters));
-
- /* Disable the context bank and nuke the TLB before freeing it */
- cb_base = SMMU_CB_BASE(smmu) + SMMU_CB(smmu, cfg->cbndx);
- writel_relaxed(0, cb_base + SMMU_CB_SCTLR);
- arm_smmu_tlb_inv_context(cfg);
-
- if ( cfg->irptndx != INVALID_IRPTNDX )
- {
- irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx];
- release_irq(irq, cfg);
- }
-
- __arm_smmu_free_bitmap(smmu->context_map, cfg->cbndx);
- list_del(&cfg->list);
- xfree(cfg);
-}
-
-static struct arm_smmu_device *
-arm_smmu_find_smmu_by_dev(const struct dt_device_node *dev)
-{
- struct arm_smmu_device *smmu;
- struct arm_smmu_master *master = NULL;
-
- list_for_each_entry( smmu, &arm_smmu_devices, list )
- {
- master = find_smmu_master(smmu, dev);
- if ( master )
- break;
- }
-
- if ( !master )
- return NULL;
-
- return smmu;
-}
-
-static int arm_smmu_attach_dev(struct domain *d,
- const struct dt_device_node *dev)
-{
- struct arm_smmu_device *smmu = arm_smmu_find_smmu_by_dev(dev);
- struct arm_smmu_master *master;
- struct arm_smmu_domain *smmu_domain = domain_hvm_iommu(d)->arch.priv;
- struct arm_smmu_domain_cfg *cfg = NULL;
- struct arm_smmu_domain_cfg *curr;
- int ret;
-
- printk(XENLOG_DEBUG "arm-smmu: attach %s to domain %d\n",
- dt_node_full_name(dev), d->domain_id);
-
- if ( !smmu )
- {
- printk(XENLOG_ERR "%s: cannot attach to SMMU, is it on the same bus?\n",
- dt_node_full_name(dev));
- return -ENODEV;
- }
-
- master = find_smmu_master(smmu, dev);
- BUG_ON(master == NULL);
-
- /* Check if the device is already assigned to someone */
- if ( master->cfg )
- return -EBUSY;
- spin_lock(&smmu_domain->lock);
- list_for_each_entry( curr, &smmu_domain->contexts, list )
- {
- if ( curr->smmu == smmu )
- {
- cfg = curr;
- break;
- }
- }
+ /* The stage-2 output mask is also applied for bypass */
+ size = arm_smmu_id_size_to_bits((id >> ID2_OAS_SHIFT) & ID2_OAS_MASK);
+ smmu->s2_output_size = min_t(unsigned long, PHYS_MASK_SHIFT, size);
- if ( !cfg )
- {
- cfg = arm_smmu_alloc_domain_context(d, smmu);
- if ( !cfg )
- {
- smmu_err(smmu, "unable to allocate context for domain %u\n",
- d->domain_id);
- spin_unlock(&smmu_domain->lock);
- return -ENOMEM;
- }
- }
- spin_unlock(&smmu_domain->lock);
+ if (smmu->version == ARM_SMMU_V1) {
+ smmu->s1_input_size = 32;
+ } else {
+#ifdef CONFIG_64BIT
+ size = (id >> ID2_UBS_SHIFT) & ID2_UBS_MASK;
+ size = min(VA_BITS, arm_smmu_id_size_to_bits(size));
+#else
+ size = 32;
+#endif
+ smmu->s1_input_size = size;
+
+ if ((PAGE_SIZE == SZ_4K && !(id & ID2_PTFS_4K)) ||
+ (PAGE_SIZE == SZ_64K && !(id & ID2_PTFS_64K)) ||
+ (PAGE_SIZE != SZ_4K && PAGE_SIZE != SZ_64K)) {
+ dev_err(smmu->dev, "CPU page size 0x%lx unsupported\n",
+ PAGE_SIZE);
+ return -ENODEV;
+ }
+ }
+
+ if (smmu->features & ARM_SMMU_FEAT_TRANS_S1)
+ dev_notice(smmu->dev, "\tStage-1: %lu-bit VA -> %lu-bit IPA\n",
+ smmu->s1_input_size, smmu->s1_output_size);
+
+ if (smmu->features & ARM_SMMU_FEAT_TRANS_S2)
+ dev_notice(smmu->dev, "\tStage-2: %lu-bit IPA -> %lu-bit PA\n",
+ smmu->s2_input_size, smmu->s2_output_size);
+
+ return 0;
+}
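
Every ID-register access in this probe is the same shift-and-mask
extraction. A one-line helper (hypothetical; the driver open-codes it)
makes the idiom explicit:

    #include <stdint.h>

    /* e.g. num_context_banks == field(id, ID1_NUMCB_SHIFT, ID1_NUMCB_MASK) */
    static inline uint32_t field(uint32_t reg, unsigned int shift,
                                 uint32_t mask)
    {
        return (reg >> shift) & mask;
    }
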
+
+static const struct of_device_id arm_smmu_of_match[] = {
+ { .compatible = "arm,smmu-v1", .data = (void *)ARM_SMMU_V1 },
+ { .compatible = "arm,smmu-v2", .data = (void *)ARM_SMMU_V2 },
+ { .compatible = "arm,mmu-400", .data = (void *)ARM_SMMU_V1 },
+ { .compatible = "arm,mmu-401", .data = (void *)ARM_SMMU_V1 },
+ { .compatible = "arm,mmu-500", .data = (void *)ARM_SMMU_V2 },
+ { },
+};
+MODULE_DEVICE_TABLE(of, arm_smmu_of_match);
- ret = arm_smmu_domain_add_master(d, cfg, master);
- if ( ret )
- {
- spin_lock(&smmu_domain->lock);
- if ( list_empty(&cfg->masters) )
- arm_smmu_destroy_domain_context(cfg);
- spin_unlock(&smmu_domain->lock);
- }
-
- return ret;
-}
-
-static int arm_smmu_detach_dev(struct domain *d,
- const struct dt_device_node *dev)
-{
- struct arm_smmu_domain *smmu_domain = domain_hvm_iommu(d)->arch.priv;
- struct arm_smmu_master *master;
- struct arm_smmu_device *smmu = arm_smmu_find_smmu_by_dev(dev);
- struct arm_smmu_domain_cfg *cfg;
-
- printk(XENLOG_DEBUG "arm-smmu: detach %s to domain %d\n",
- dt_node_full_name(dev), d->domain_id);
-
- if ( !smmu )
- {
- printk(XENLOG_ERR "%s: cannot find the SMMU, is it on the same bus?\n",
- dt_node_full_name(dev));
- return -ENODEV;
- }
-
- master = find_smmu_master(smmu, dev);
- BUG_ON(master == NULL);
-
- cfg = master->cfg;
-
- /* Sanity check to avoid removing a device that doesn't belong to
- * the domain
- */
- if ( !cfg || cfg->domain != d )
- {
- printk(XENLOG_ERR "%s: was not attach to domain %d\n",
- dt_node_full_name(dev), d->domain_id);
- return -ESRCH;
- }
-
- arm_smmu_domain_remove_master(master);
-
- spin_lock(&smmu_domain->lock);
- if ( list_empty(&cfg->masters) )
- arm_smmu_destroy_domain_context(cfg);
- spin_unlock(&smmu_domain->lock);
-
- return 0;
-}
-
-static int arm_smmu_reassign_dt_dev(struct domain *s, struct domain *t,
- const struct dt_device_node *dev)
-{
- int ret = 0;
-
- /* Don't allow remapping on other domain than hwdom */
- if ( t != hardware_domain )
- return -EPERM;
-
- if ( t == s )
- return 0;
-
- ret = arm_smmu_detach_dev(s, dev);
- if ( ret )
- return ret;
-
- ret = arm_smmu_attach_dev(t, dev);
-
- return ret;
-}
-
-static __init int arm_smmu_id_size_to_bits(int size)
-{
- switch ( size )
- {
- case 0:
- return 32;
- case 1:
- return 36;
- case 2:
- return 40;
- case 3:
- return 42;
- case 4:
- return 44;
- case 5:
- default:
- return 48;
- }
-}
-
-static __init int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
+/*
+ * Xen: we have no refcounting for allocated memory, so it must be freed
+ * manually when an error occurs.
+ */
+static int arm_smmu_device_dt_probe(struct platform_device *pdev)
{
- unsigned long size;
- void __iomem *gr0_base = SMMU_GR0(smmu);
- u32 id;
+ const struct of_device_id *of_id;
+ struct resource *res;
+ struct arm_smmu_device *smmu;
+ struct device *dev = &pdev->dev;
+ struct rb_node *node;
+ struct of_phandle_args masterspec;
+ int num_irqs, i, err;
+
+ smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL);
+ if (!smmu) {
+ dev_err(dev, "failed to allocate arm_smmu_device\n");
+ return -ENOMEM;
+ }
+ smmu->dev = dev;
+
+ of_id = of_match_node(arm_smmu_of_match, dev->of_node);
+ smmu->version = (enum arm_smmu_arch_version)of_id->data;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ smmu->base = devm_ioremap_resource(dev, res);
+ if (IS_ERR(smmu->base)) {
+ err = PTR_ERR(smmu->base);
+ goto out_free;
+ }
+ smmu->size = resource_size(res);
+
+ if (of_property_read_u32(dev->of_node, "#global-interrupts",
+ &smmu->num_global_irqs)) {
+ dev_err(dev, "missing #global-interrupts property\n");
+ err = -ENODEV;
+ goto out_free;
+ }
+
+ num_irqs = 0;
+ while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) {
+ num_irqs++;
+ if (num_irqs > smmu->num_global_irqs)
+ smmu->num_context_irqs++;
+ }
+
+ if (!smmu->num_context_irqs) {
+ dev_err(dev, "found %d interrupts but expected at least %d\n",
+ num_irqs, smmu->num_global_irqs + 1);
+ err = -ENODEV;
+ goto out_free;
+ }
+
+ smmu->irqs = devm_kzalloc(dev, sizeof(*smmu->irqs) * num_irqs,
+ GFP_KERNEL);
+ if (!smmu->irqs) {
+ dev_err(dev, "failed to allocate %d irqs\n", num_irqs);
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ for (i = 0; i < num_irqs; ++i) {
+ int irq = platform_get_irq(pdev, i);
+
+ if (irq < 0) {
+ dev_err(dev, "failed to get irq index %d\n", i);
+ err = -ENODEV;
+ goto out_free;
+ }
+ smmu->irqs[i] = irq;
+ }
+
+ err = arm_smmu_device_cfg_probe(smmu);
+ if (err)
+ return err;
+
+ i = 0;
+ smmu->masters = RB_ROOT;
+ while (!of_parse_phandle_with_args(dev->of_node, "mmu-masters",
+ "#stream-id-cells", i,
+ &masterspec)) {
+ err = register_smmu_master(smmu, dev, &masterspec);
+ if (err) {
+ dev_err(dev, "failed to add master %s\n",
+ masterspec.np->name);
+ goto out_put_masters;
+ }
+
+ i++;
+ }
+ dev_notice(dev, "registered %d master devices\n", i);
+
+ parse_driver_options(smmu);
+
+ if (smmu->version > ARM_SMMU_V1 &&
+ smmu->num_context_banks != smmu->num_context_irqs) {
+ dev_err(dev,
+ "found only %d context interrupt(s) but %d required\n",
+ smmu->num_context_irqs, smmu->num_context_banks);
+ err = -ENODEV;
+ goto out_put_masters;
+ }
+
+ for (i = 0; i < smmu->num_global_irqs; ++i) {
+ err = request_irq(smmu->irqs[i],
+ arm_smmu_global_fault,
+ IRQF_SHARED,
+ "arm-smmu global fault",
+ smmu);
+ if (err) {
+ dev_err(dev, "failed to request global IRQ %d (%u)\n",
+ i, smmu->irqs[i]);
+ goto out_free_irqs;
+ }
+ }
+
+ INIT_LIST_HEAD(&smmu->list);
+ spin_lock(&arm_smmu_devices_lock);
+ list_add(&smmu->list, &arm_smmu_devices);
+ spin_unlock(&arm_smmu_devices_lock);
+
+ arm_smmu_device_reset(smmu);
+ return 0;
- smmu_info(smmu, "probing hardware configuration...\n");
+out_free_irqs:
+ while (i--)
+ free_irq(smmu->irqs[i], smmu);
+
+out_put_masters:
+ for (node = rb_first(&smmu->masters); node; node = rb_next(node)) {
+ struct arm_smmu_master *master
+ = container_of(node, struct arm_smmu_master, node);
+ kfree(master);
+ }
+
+out_free:
+ kfree(smmu->irqs);
+ if (!IS_ERR(smmu->base))
+ iounmap(smmu->base);
+ kfree(smmu);
+
+ return err;
+}
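
As the comment before the function says, there is no refcounting on these
allocations, so every failure path unwinds by hand. The goto-based cleanup
pattern in miniature (hypothetical names, standard C):

    #include <errno.h>
    #include <stdlib.h>

    /* Each failure jumps to the label that frees everything allocated
     * so far, in reverse order of allocation. */
    static int example_probe(void)
    {
        char *a, *b;

        a = malloc(8);
        if (!a)
            return -ENOMEM;

        b = malloc(8);
        if (!b)
            goto out_free_a;

        /* ... use a and b; ownership moves elsewhere on success ... */
        return 0;

    out_free_a:
        free(a);
        return -ENOMEM;
    }
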
- /*
- * Primecell ID
- */
- id = readl_relaxed(gr0_base + SMMU_GR0_PIDR2);
- smmu->version = ((id >> SMMU_PIDR2_ARCH_SHIFT) & SMMU_PIDR2_ARCH_MASK) + 1;
- smmu_info(smmu, "SMMUv%d with:\n", smmu->version);
+#if 0 /* Xen: We never remove SMMU */
+static int arm_smmu_device_remove(struct platform_device *pdev)
+{
+ int i;
+ struct device *dev = &pdev->dev;
+ struct arm_smmu_device *curr, *smmu = NULL;
+ struct rb_node *node;
+
+ spin_lock(&arm_smmu_devices_lock);
+ list_for_each_entry(curr, &arm_smmu_devices, list) {
+ if (curr->dev == dev) {
+ smmu = curr;
+ list_del(&smmu->list);
+ break;
+ }
+ }
+ spin_unlock(&arm_smmu_devices_lock);
+
+ if (!smmu)
+ return -ENODEV;
+
+ for (node = rb_first(&smmu->masters); node; node = rb_next(node)) {
+ struct arm_smmu_master *master
+ = container_of(node, struct arm_smmu_master, node);
+ of_node_put(master->of_node);
+ }
+
+ if (!bitmap_empty(smmu->context_map, ARM_SMMU_MAX_CBS))
+ dev_err(dev, "removing device with active domains!\n");
+
+ for (i = 0; i < smmu->num_global_irqs; ++i)
+ free_irq(smmu->irqs[i], smmu);
+
+ /* Turn the thing off */
+ writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
+ return 0;
+}
- /* ID0 */
- id = readl_relaxed(gr0_base + SMMU_GR0_ID0);
-#ifndef CONFIG_ARM_64
- if ( ((id >> SMMU_ID0_PTFS_SHIFT) & SMMU_ID0_PTFS_MASK) ==
- SMMU_ID0_PTFS_V8_ONLY )
- {
- smmu_err(smmu, "\tno v7 descriptor support!\n");
- return -ENODEV;
- }
+static struct platform_driver arm_smmu_driver = {
+ .driver = {
+ .name = "arm-smmu",
+ .of_match_table = of_match_ptr(arm_smmu_of_match),
+ },
+ .probe = arm_smmu_device_dt_probe,
+ .remove = arm_smmu_device_remove,
+};
+
+static int __init arm_smmu_init(void)
+{
+ struct device_node *np;
+ int ret;
+
+ /*
+ * Play nice with systems that don't have an ARM SMMU by checking that
+ * an ARM SMMU exists in the system before proceeding with the driver
+ * and IOMMU bus operation registration.
+ */
+ np = of_find_matching_node(NULL, arm_smmu_of_match);
+ if (!np)
+ return 0;
+
+ of_node_put(np);
+
+ ret = platform_driver_register(&arm_smmu_driver);
+ if (ret)
+ return ret;
+
+ /* Oh, for a proper bus abstraction */
+ if (!iommu_present(&platform_bus_type))
+ bus_set_iommu(&platform_bus_type, &arm_smmu_ops);
+
+#ifdef CONFIG_ARM_AMBA
+ if (!iommu_present(&amba_bustype))
+ bus_set_iommu(&amba_bustype, &arm_smmu_ops);
#endif
- if ( id & SMMU_ID0_S1TS )
- {
- smmu->features |= SMMU_FEAT_TRANS_S1;
- smmu_info(smmu, "\tstage 1 translation\n");
- }
-
- if ( id & SMMU_ID0_S2TS )
- {
- smmu->features |= SMMU_FEAT_TRANS_S2;
- smmu_info(smmu, "\tstage 2 translation\n");
- }
-
- if ( id & SMMU_ID0_NTS )
- {
- smmu->features |= SMMU_FEAT_TRANS_NESTED;
- smmu_info(smmu, "\tnested translation\n");
- }
-
- if ( !(smmu->features &
- (SMMU_FEAT_TRANS_S1 | SMMU_FEAT_TRANS_S2 |
- SMMU_FEAT_TRANS_NESTED)) )
- {
- smmu_err(smmu, "\tno translation support!\n");
- return -ENODEV;
- }
-
- /* We need at least support for Stage 2 */
- if ( !(smmu->features & SMMU_FEAT_TRANS_S2) )
- {
- smmu_err(smmu, "\tno stage 2 translation!\n");
- return -ENODEV;
- }
-
- if ( id & SMMU_ID0_CTTW )
- {
- smmu->features |= SMMU_FEAT_COHERENT_WALK;
- smmu_info(smmu, "\tcoherent table walk\n");
- }
-
- if ( id & SMMU_ID0_SMS )
- {
- u32 smr, sid, mask;
-
- smmu->features |= SMMU_FEAT_STREAM_MATCH;
- smmu->num_mapping_groups = (id >> SMMU_ID0_NUMSMRG_SHIFT) &
- SMMU_ID0_NUMSMRG_MASK;
- if ( smmu->num_mapping_groups == 0 )
- {
- smmu_err(smmu,
- "stream-matching supported, but no SMRs present!\n");
- return -ENODEV;
- }
-
- smr = SMMU_SMR_MASK_MASK << SMMU_SMR_MASK_SHIFT;
- smr |= (SMMU_SMR_ID_MASK << SMMU_SMR_ID_SHIFT);
- writel_relaxed(smr, gr0_base + SMMU_GR0_SMR(0));
- smr = readl_relaxed(gr0_base + SMMU_GR0_SMR(0));
-
- mask = (smr >> SMMU_SMR_MASK_SHIFT) & SMMU_SMR_MASK_MASK;
- sid = (smr >> SMMU_SMR_ID_SHIFT) & SMMU_SMR_ID_MASK;
- if ( (mask & sid) != sid )
- {
- smmu_err(smmu,
- "SMR mask bits (0x%x) insufficient for ID field (0x%x)\n",
- mask, sid);
- return -ENODEV;
- }
- smmu->smr_mask_mask = mask;
- smmu->smr_id_mask = sid;
-
- smmu_info(smmu,
- "\tstream matching with %u register groups, mask 0x%x\n",
- smmu->num_mapping_groups, mask);
- }
-
- /* ID1 */
- id = readl_relaxed(gr0_base + SMMU_GR0_ID1);
- smmu->pagesize = (id & SMMU_ID1_PAGESIZE) ? PAGE_SIZE_64K : PAGE_SIZE_4K;
-
- /* Check for size mismatch of SMMU address space from mapped region */
- size = 1 << (((id >> SMMU_ID1_NUMPAGENDXB_SHIFT) &
- SMMU_ID1_NUMPAGENDXB_MASK) + 1);
- size *= (smmu->pagesize << 1);
- if ( smmu->size != size )
- smmu_warn(smmu, "SMMU address space size (0x%lx) differs "
- "from mapped region size (0x%lx)!\n", size, smmu->size);
-
- smmu->num_s2_context_banks = (id >> SMMU_ID1_NUMS2CB_SHIFT) &
- SMMU_ID1_NUMS2CB_MASK;
- smmu->num_context_banks = (id >> SMMU_ID1_NUMCB_SHIFT) &
- SMMU_ID1_NUMCB_MASK;
- if ( smmu->num_s2_context_banks > smmu->num_context_banks )
- {
- smmu_err(smmu, "impossible number of S2 context banks!\n");
- return -ENODEV;
- }
- smmu_info(smmu, "\t%u context banks (%u stage-2 only)\n",
- smmu->num_context_banks, smmu->num_s2_context_banks);
-
- /* ID2 */
- id = readl_relaxed(gr0_base + SMMU_GR0_ID2);
- size = arm_smmu_id_size_to_bits((id >> SMMU_ID2_IAS_SHIFT) &
- SMMU_ID2_IAS_MASK);
-
- /*
- * Stage-1 output limited by stage-2 input size due to VTCR_EL2
- * setup (see setup_virt_paging)
- */
- /* Current maximum output size of 40 bits */
- smmu->s1_output_size = min(40UL, size);
-
- /* The stage-2 output mask is also applied for bypass */
- size = arm_smmu_id_size_to_bits((id >> SMMU_ID2_OAS_SHIFT) &
- SMMU_ID2_OAS_MASK);
- smmu->s2_output_size = min((unsigned long)PADDR_BITS, size);
-
- if ( smmu->version == 1 )
- smmu->input_size = 32;
- else
- {
-#ifdef CONFIG_ARM_64
- size = (id >> SMMU_ID2_UBS_SHIFT) & SMMU_ID2_UBS_MASK;
- size = min(39, arm_smmu_id_size_to_bits(size));
-#else
- size = 32;
+
+#ifdef CONFIG_PCI
+ if (!iommu_present(&pci_bus_type))
+ bus_set_iommu(&pci_bus_type, &arm_smmu_ops);
#endif
- smmu->input_size = size;
- if ( (PAGE_SIZE == PAGE_SIZE_4K && !(id & SMMU_ID2_PTFS_4K) ) ||
- (PAGE_SIZE == PAGE_SIZE_64K && !(id & SMMU_ID2_PTFS_64K)) ||
- (PAGE_SIZE != PAGE_SIZE_4K && PAGE_SIZE != PAGE_SIZE_64K) )
- {
- smmu_err(smmu, "CPU page size 0x%lx unsupported\n",
- PAGE_SIZE);
- return -ENODEV;
- }
- }
+ return 0;
+}
+
+static void __exit arm_smmu_exit(void)
+{
+ return platform_driver_unregister(&arm_smmu_driver);
+}
+
+subsys_initcall(arm_smmu_init);
+module_exit(arm_smmu_exit);
+
+MODULE_DESCRIPTION("IOMMU API for ARM architected SMMU implementations");
+MODULE_AUTHOR("Will Deacon <will.deacon at arm.com>");
+MODULE_LICENSE("GPL v2");
+#endif
+
+/***** Start of Xen specific code *****/
+
+/* Xen only supports stage-2 translation, so force the value to 2. */
+static int force_stage = 2;
+
+/*
+ * Platform features: the set of features supported by every SMMU on the
+ * platform. Currently only the coherent-table-walk capability matters.
+ */
+static u32 platform_features = ARM_SMMU_FEAT_COHERENT_WALK;
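
platform_features starts as the full mask and is narrowed with '&=' as each
SMMU is probed (see arm_smmu_dt_init at the end of the file), so only
capabilities common to every unit survive. The intersection in miniature
(a sketch, standard C):

    #include <stdint.h>

    static uint32_t common_features(const uint32_t *feats, unsigned int n)
    {
        uint32_t common = ~0u;      /* start from "everything"       */

        while (n--)
            common &= feats[n];     /* drop whatever one unit lacks  */
        return common;
    }
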
+
+static void arm_smmu_iotlb_flush_all(struct domain *d)
+{
+ struct arm_smmu_xen_domain *smmu_domain = domain_hvm_iommu(d)->arch.priv;
+ struct iommu_domain *cfg;
+
+ spin_lock(&smmu_domain->lock);
+ list_for_each_entry(cfg, &smmu_domain->contexts, list) {
+ /*
+		 * Only invalidate the context when an SMMU is present,
+		 * because context initialization is deferred until a
+		 * master has been added.
+ */
+ if (unlikely(!ACCESS_ONCE(cfg->priv->smmu)))
+ continue;
+ arm_smmu_tlb_inv_context(cfg->priv);
+ }
+ spin_unlock(&smmu_domain->lock);
+}
+
+static void arm_smmu_iotlb_flush(struct domain *d, unsigned long gfn,
+ unsigned int page_count)
+{
+ /* ARM SMMU v1 doesn't have flush by VMA and VMID */
+ arm_smmu_iotlb_flush_all(d);
+}
+
+static struct iommu_domain *arm_smmu_get_domain(struct domain *d,
+ struct device *dev)
+{
+ struct iommu_domain *domain;
+ struct arm_smmu_xen_domain *xen_domain;
+ struct arm_smmu_device *smmu;
+
+ xen_domain = domain_hvm_iommu(d)->arch.priv;
+
+ smmu = find_smmu_for_device(dev);
+ if (!smmu)
+ return NULL;
+
+ /*
+	 * Loop through &xen_domain->contexts to locate a context
+	 * assigned to this SMMU.
+ */
+ list_for_each_entry(domain, &xen_domain->contexts, list) {
+ if (domain->priv->smmu == smmu)
+ return domain;
+ }
+
+ return NULL;
- smmu_info(smmu, "\t%lu-bit VA, %lu-bit IPA, %lu-bit PA\n",
- smmu->input_size, smmu->s1_output_size, smmu->s2_output_size);
- return 0;
}
-static __init void arm_smmu_device_reset(struct arm_smmu_device *smmu)
+static void arm_smmu_destroy_iommu_domain(struct iommu_domain *domain)
{
- void __iomem *gr0_base = SMMU_GR0(smmu);
- void __iomem *cb_base;
- int i = 0;
- u32 reg;
+ list_del(&domain->list);
+ arm_smmu_domain_destroy(domain);
+ xfree(domain);
+}
- smmu_dbg(smmu, "device reset\n");
+static int arm_smmu_assign_dev(struct domain *d, u8 devfn,
+ struct device *dev, u32 flag)
+{
+ struct iommu_domain *domain;
+ struct arm_smmu_xen_domain *xen_domain;
+ int ret = 0;
+
+ xen_domain = domain_hvm_iommu(d)->arch.priv;
+
+ if (!dev->archdata.iommu) {
+ dev->archdata.iommu = xzalloc(struct arm_smmu_xen_device);
+ if (!dev->archdata.iommu)
+ return -ENOMEM;
+ }
+
+ if (!dev_iommu_group(dev)) {
+ ret = arm_smmu_add_device(dev);
+ if (ret)
+ return ret;
+ }
+
+ spin_lock(&xen_domain->lock);
+
+ /*
+ * Check to see if a context bank (iommu_domain) already exists for
+ * this xen domain under the same SMMU
+ */
+ domain = arm_smmu_get_domain(d, dev);
+ if (!domain) {
+
+ domain = xzalloc(struct iommu_domain);
+ if (!domain) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = arm_smmu_domain_init(domain);
+ if (ret) {
+ xfree(domain);
+ goto out;
+ }
+
+ domain->priv->cfg.domain = d;
+
+ /* Chain the new context to the domain */
+ list_add(&domain->list, &xen_domain->contexts);
+
+ }
+
+ ret = arm_smmu_attach_dev(domain, dev);
+ if (ret) {
+ if (domain->ref.counter == 0)
+ arm_smmu_destroy_iommu_domain(domain);
+ } else {
+ atomic_inc(&domain->ref);
+ }
- /* Clear Global FSR */
- reg = readl_relaxed(SMMU_GR0_NS(smmu) + SMMU_GR0_sGFSR);
- writel(reg, SMMU_GR0_NS(smmu) + SMMU_GR0_sGFSR);
+out:
+ spin_unlock(&xen_domain->lock);
- /* Mark all SMRn as invalid and all S2CRn as fault */
- for ( i = 0; i < smmu->num_mapping_groups; ++i )
- {
- writel_relaxed(~SMMU_SMR_VALID, gr0_base + SMMU_GR0_SMR(i));
- writel_relaxed(SMMU_S2CR_TYPE_FAULT, gr0_base + SMMU_GR0_S2CR(i));
- }
+ return ret;
+}
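
arm_smmu_assign_dev is a get-or-create pattern: one iommu_domain (context
bank) per Xen domain per SMMU, refcounted by the number of attached
devices; arm_smmu_deassign_dev below is its mirror image, destroying the
context when the count reaches zero. A self-contained miniature
(hypothetical types, standard C):

    #include <stdlib.h>

    struct ctx {
        int key;                /* stands in for the SMMU identity   */
        int ref;                /* one reference per attached device */
        struct ctx *next;
    };

    static struct ctx *ctx_get(struct ctx **head, int key)
    {
        struct ctx *c;

        for (c = *head; c; c = c->next)
            if (c->key == key)
                break;

        if (!c) {
            c = calloc(1, sizeof(*c));
            if (!c)
                return NULL;
            c->key = key;
            c->next = *head;
            *head = c;
        }

        c->ref++;
        return c;
    }
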
+
+static int arm_smmu_deassign_dev(struct domain *d, struct device *dev)
+{
+ struct iommu_domain *domain = dev_iommu_domain(dev);
+ struct arm_smmu_xen_domain *xen_domain;
+
+ xen_domain = domain_hvm_iommu(d)->arch.priv;
+
+ if (!domain || domain->priv->cfg.domain != d) {
+		dev_err(dev, "not attached to domain %d\n", d->domain_id);
+ return -ESRCH;
+ }
- /* Make sure all context banks are disabled and clear CB_FSR */
- for ( i = 0; i < smmu->num_context_banks; ++i )
- {
- cb_base = SMMU_CB_BASE(smmu) + SMMU_CB(smmu, i);
- writel_relaxed(0, cb_base + SMMU_CB_SCTLR);
- writel_relaxed(SMMU_FSR_FAULT, cb_base + SMMU_CB_FSR);
- }
+ spin_lock(&xen_domain->lock);
- /* Invalidate the TLB, just in case */
- writel_relaxed(0, gr0_base + SMMU_GR0_STLBIALL);
- writel_relaxed(0, gr0_base + SMMU_GR0_TLBIALLH);
- writel_relaxed(0, gr0_base + SMMU_GR0_TLBIALLNSNH);
+ arm_smmu_detach_dev(domain, dev);
+ atomic_dec(&domain->ref);
- reg = readl_relaxed(SMMU_GR0_NS(smmu) + SMMU_GR0_sCR0);
+ if (domain->ref.counter == 0)
+ arm_smmu_destroy_iommu_domain(domain);
- /* Enable fault reporting */
- reg |= (SMMU_sCR0_GFRE | SMMU_sCR0_GFIE |
- SMMU_sCR0_GCFGFRE | SMMU_sCR0_GCFGFIE);
+ spin_unlock(&xen_domain->lock);
+
+ return 0;
+}
+
+static int arm_smmu_reassign_dev(struct domain *s, struct domain *t,
+ u8 devfn, struct device *dev)
+{
+ int ret = 0;
- /* Disable TLB broadcasting. */
- reg |= (SMMU_sCR0_VMIDPNE | SMMU_sCR0_PTM);
+	/* Don't allow remapping to any domain other than the hwdom */
+ if (t && t != hardware_domain)
+ return -EPERM;
- /* Enable client access, generate a fault if no mapping is found */
- reg &= ~(SMMU_sCR0_CLIENTPD);
- reg |= SMMU_sCR0_USFCFG;
+ if (t == s)
+ return 0;
- /* Disable forced broadcasting */
- reg &= ~SMMU_sCR0_FB;
+ ret = arm_smmu_deassign_dev(s, dev);
+ if (ret)
+ return ret;
- /* Don't upgrade barriers when client devices are not mapped to
- * a translation context banks (just here for clarity as Xen policy
- * is to deny invalid transaction). */
- reg &= ~(SMMU_sCR0_BSU_MASK << SMMU_sCR0_BSU_SHIFT);
+ if (t) {
+ /* No flags are defined for ARM. */
+ ret = arm_smmu_assign_dev(t, devfn, dev, 0);
+ if (ret)
+ return ret;
+ }
- /* Push the button */
- arm_smmu_tlb_sync(smmu);
- writel_relaxed(reg, SMMU_GR0_NS(smmu) + SMMU_GR0_sCR0);
+ return 0;
}
static int arm_smmu_iommu_domain_init(struct domain *d)
{
- struct arm_smmu_domain *smmu_domain;
+ struct arm_smmu_xen_domain *xen_domain;
- smmu_domain = xzalloc(struct arm_smmu_domain);
- if ( !smmu_domain )
- return -ENOMEM;
+ xen_domain = xzalloc(struct arm_smmu_xen_domain);
+ if ( !xen_domain )
+ return -ENOMEM;
- spin_lock_init(&smmu_domain->lock);
- INIT_LIST_HEAD(&smmu_domain->contexts);
+ spin_lock_init(&xen_domain->lock);
+ INIT_LIST_HEAD(&xen_domain->contexts);
- domain_hvm_iommu(d)->arch.priv = smmu_domain;
+ domain_hvm_iommu(d)->arch.priv = xen_domain;
- return 0;
+ /* Coherent walk can be enabled only when all SMMUs support it. */
+ if (platform_features & ARM_SMMU_FEAT_COHERENT_WALK)
+ iommu_set_feature(d, IOMMU_FEAT_COHERENT_WALK);
+
+ return 0;
}
static void __hwdom_init arm_smmu_iommu_hwdom_init(struct domain *d)
@@ -1530,50 +2743,53 @@ static void __hwdom_init arm_smmu_iommu_hwdom_init(struct domain *d)
static void arm_smmu_iommu_domain_teardown(struct domain *d)
{
- struct arm_smmu_domain *smmu_domain = domain_hvm_iommu(d)->arch.priv;
+ struct arm_smmu_xen_domain *xen_domain = domain_hvm_iommu(d)->arch.priv;
- ASSERT(list_empty(&smmu_domain->contexts));
- xfree(smmu_domain);
+ ASSERT(list_empty(&xen_domain->contexts));
+ xfree(xen_domain);
}
static int arm_smmu_map_page(struct domain *d, unsigned long gfn,
- unsigned long mfn, unsigned int flags)
+ unsigned long mfn, unsigned int flags)
{
- p2m_type_t t;
-
- /* Grant mappings can be used for DMA requests. The dev_bus_addr returned by
- * the hypercall is the MFN (not the IPA). For device protected by
- * an IOMMU, Xen needs to add a 1:1 mapping in the domain p2m to
- * allow DMA request to work.
- * This is only valid when the domain is directed mapped. Hence this
- * function should only be used by gnttab code with gfn == mfn.
- */
- BUG_ON(!is_domain_direct_mapped(d));
- BUG_ON(mfn != gfn);
-
- /* We only support readable and writable flags */
- if ( !(flags & (IOMMUF_readable | IOMMUF_writable)) )
- return -EINVAL;
-
- t = (flags & IOMMUF_writable) ? p2m_iommu_map_rw : p2m_iommu_map_ro;
-
- /* The function guest_physmap_add_entry replaces the current mapping
- * if there is already one...
- */
- return guest_physmap_add_entry(d, gfn, mfn, 0, t);
+ p2m_type_t t;
+
+ /*
+	 * Grant mappings can be used for DMA requests. The dev_bus_addr
+	 * returned by the hypercall is the MFN (not the IPA). For devices
+	 * protected by an IOMMU, Xen needs to add a 1:1 mapping in the domain
+	 * p2m to allow DMA requests to work.
+	 * This is only valid when the domain is direct mapped. Hence this
+	 * function should only be used by gnttab code with gfn == mfn.
+ */
+ BUG_ON(!is_domain_direct_mapped(d));
+ BUG_ON(mfn != gfn);
+
+ /* We only support readable and writable flags */
+ if (!(flags & (IOMMUF_readable | IOMMUF_writable)))
+ return -EINVAL;
+
+ t = (flags & IOMMUF_writable) ? p2m_iommu_map_rw : p2m_iommu_map_ro;
+
+ /*
+ * The function guest_physmap_add_entry replaces the current mapping
+ * if there is already one...
+ */
+ return guest_physmap_add_entry(d, gfn, mfn, 0, t);
}
static int arm_smmu_unmap_page(struct domain *d, unsigned long gfn)
{
- /* This function should only be used by gnttab code when the domain
- * is direct mapped
- */
- if ( !is_domain_direct_mapped(d) )
- return -EINVAL;
+ /*
+ * This function should only be used by gnttab code when the domain
+ * is direct mapped
+ */
+ if ( !is_domain_direct_mapped(d) )
+ return -EINVAL;
- guest_physmap_remove_page(d, gfn, gfn, 0);
+ guest_physmap_remove_page(d, gfn, gfn, 0);
- return 0;
+ return 0;
}
static const struct iommu_ops arm_smmu_iommu_ops = {
@@ -1582,203 +2798,57 @@ static const struct iommu_ops arm_smmu_iommu_ops = {
.teardown = arm_smmu_iommu_domain_teardown,
.iotlb_flush = arm_smmu_iotlb_flush,
.iotlb_flush_all = arm_smmu_iotlb_flush_all,
- .assign_dt_device = arm_smmu_attach_dev,
- .reassign_dt_device = arm_smmu_reassign_dt_dev,
+ .assign_device = arm_smmu_assign_dev,
+ .reassign_device = arm_smmu_reassign_dev,
.map_page = arm_smmu_map_page,
.unmap_page = arm_smmu_unmap_page,
};
-static int __init smmu_init(struct dt_device_node *dev,
- const void *data)
-{
- struct arm_smmu_device *smmu;
- int res;
- u64 addr, size;
- unsigned int num_irqs, i;
- struct dt_phandle_args masterspec;
- struct rb_node *node;
-
- /* Even if the device can't be initialized, we don't want to give
- * the smmu device to dom0.
- */
- dt_device_set_used_by(dev, DOMID_XEN);
-
- smmu = xzalloc(struct arm_smmu_device);
- if ( !smmu )
- {
- printk(XENLOG_ERR "%s: failed to allocate arm_smmu_device\n",
- dt_node_full_name(dev));
- return -ENOMEM;
- }
-
- smmu->node = dev;
- check_driver_options(smmu);
-
- res = dt_device_get_address(smmu->node, 0, &addr, &size);
- if ( res )
- {
- smmu_err(smmu, "unable to retrieve the base address of the SMMU\n");
- goto out_err;
- }
-
- smmu->base = ioremap_nocache(addr, size);
- if ( !smmu->base )
- {
- smmu_err(smmu, "unable to map the SMMU memory\n");
- goto out_err;
- }
-
- smmu->size = size;
-
- if ( !dt_property_read_u32(smmu->node, "#global-interrupts",
- &smmu->num_global_irqs) )
- {
- smmu_err(smmu, "missing #global-interrupts\n");
- goto out_unmap;
- }
-
- num_irqs = dt_number_of_irq(smmu->node);
- if ( num_irqs > smmu->num_global_irqs )
- smmu->num_context_irqs = num_irqs - smmu->num_global_irqs;
-
- if ( !smmu->num_context_irqs )
- {
- smmu_err(smmu, "found %d interrupts but expected at least %d\n",
- num_irqs, smmu->num_global_irqs + 1);
- goto out_unmap;
- }
-
- smmu->irqs = xzalloc_array(unsigned int, num_irqs);
- if ( !smmu->irqs )
- {
- smmu_err(smmu, "failed to allocated %d irqs\n", num_irqs);
- goto out_unmap;
- }
-
- for ( i = 0; i < num_irqs; i++ )
- {
- res = platform_get_irq(smmu->node, i);
- if ( res < 0 )
- {
- smmu_err(smmu, "failed to get irq index %d\n", i);
- goto out_free_irqs;
- }
- smmu->irqs[i] = res;
- }
-
- smmu->sids = xzalloc_array(unsigned long,
- BITS_TO_LONGS(SMMU_MAX_STREAMIDS));
- if ( !smmu->sids )
- {
- smmu_err(smmu, "failed to allocated bitmap for stream ID tracking\n");
- goto out_free_masters;
- }
-
-
- i = 0;
- smmu->masters = RB_ROOT;
- while ( !dt_parse_phandle_with_args(smmu->node, "mmu-masters",
- "#stream-id-cells", i, &masterspec) )
- {
- res = register_smmu_master(smmu, &masterspec);
- if ( res )
- {
- smmu_err(smmu, "failed to add master %s\n",
- masterspec.np->name);
- goto out_free_masters;
- }
- i++;
- }
-
- smmu_info(smmu, "registered %d master devices\n", i);
-
- res = arm_smmu_device_cfg_probe(smmu);
- if ( res )
- {
- smmu_err(smmu, "failed to probe the SMMU\n");
- goto out_free_masters;
- }
-
- if ( smmu->version > 1 &&
- smmu->num_context_banks != smmu->num_context_irqs )
- {
- smmu_err(smmu,
- "found only %d context interrupt(s) but %d required\n",
- smmu->num_context_irqs, smmu->num_context_banks);
- goto out_free_masters;
- }
-
- smmu_dbg(smmu, "register global IRQs handler\n");
-
- for ( i = 0; i < smmu->num_global_irqs; ++i )
- {
- smmu_dbg(smmu, "\t- global IRQ %u\n", smmu->irqs[i]);
- res = request_irq(smmu->irqs[i], IRQF_SHARED, arm_smmu_global_fault,
- "arm-smmu global fault", smmu);
- if ( res )
- {
- smmu_err(smmu, "failed to request global IRQ %d (%u)\n",
- i, smmu->irqs[i]);
- goto out_release_irqs;
- }
- }
-
- INIT_LIST_HEAD(&smmu->list);
- list_add(&smmu->list, &arm_smmu_devices);
-
- arm_smmu_device_reset(smmu);
-
- iommu_set_ops(&arm_smmu_iommu_ops);
-
- /* sids field can be freed... */
- xfree(smmu->sids);
- smmu->sids = NULL;
-
- return 0;
-
-out_release_irqs:
- while (i--)
- release_irq(smmu->irqs[i], smmu);
-
-out_free_masters:
- for ( node = rb_first(&smmu->masters); node; node = rb_next(node) )
- {
- struct arm_smmu_master *master;
-
- master = container_of(node, struct arm_smmu_master, node);
- xfree(master);
- }
-
- xfree(smmu->sids);
+static __init const struct arm_smmu_device *find_smmu(const struct device *dev)
+{
+ struct arm_smmu_device *smmu;
+ bool found = false;
+
+ spin_lock(&arm_smmu_devices_lock);
+ list_for_each_entry(smmu, &arm_smmu_devices, list) {
+ if (smmu->dev == dev) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&arm_smmu_devices_lock);
+
+ return (found) ? smmu : NULL;
+}
-out_free_irqs:
- xfree(smmu->irqs);
+static __init int arm_smmu_dt_init(struct dt_device_node *dev,
+ const void *data)
+{
+ int rc;
+ const struct arm_smmu_device *smmu;
-out_unmap:
- iounmap(smmu->base);
+ /*
+ * Even if the device can't be initialized, we don't want to
+ * give the SMMU device to dom0.
+ */
+ dt_device_set_used_by(dev, DOMID_XEN);
-out_err:
- xfree(smmu);
+ rc = arm_smmu_device_dt_probe(dev);
+ if (rc)
+ return rc;
- return -ENODEV;
-}
+ iommu_set_ops(&arm_smmu_iommu_ops);
-static const char * const smmu_dt_compat[] __initconst =
-{
- "arm,mmu-400",
- NULL
-};
+ /* Find the last SMMU added and retrieve its features. */
+ smmu = find_smmu(dt_to_dev(dev));
+ BUG_ON(smmu == NULL);
+
+ platform_features &= smmu->features;
+
+ return 0;
+}
DT_DEVICE_START(smmu, "ARM SMMU", DEVICE_IOMMU)
- .compatible = smmu_dt_compat,
- .init = smmu_init,
+ .dt_match = arm_smmu_of_match,
+ .init = arm_smmu_dt_init,
DT_DEVICE_END
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/drivers/passthrough/ats.h b/xen/drivers/passthrough/ats.h
index 000e76d..5c91572 100644
--- a/xen/drivers/passthrough/ats.h
+++ b/xen/drivers/passthrough/ats.h
@@ -9,8 +9,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _ATS_H_
diff --git a/xen/drivers/passthrough/device_tree.c b/xen/drivers/passthrough/device_tree.c
index 3e47df5..7ff79f8 100644
--- a/xen/drivers/passthrough/device_tree.c
+++ b/xen/drivers/passthrough/device_tree.c
@@ -1,9 +1,6 @@
/*
* Code to passthrough a device tree node to a guest
*
- * TODO: This contains only the necessary code to protected device passed to
- * dom0. It will need some updates when device passthrough will is added.
- *
* Julien Grall <julien.grall at linaro.org>
* Copyright (c) 2014 Linaro Limited.
*
@@ -20,8 +17,10 @@
#include <xen/lib.h>
#include <xen/sched.h>
+#include <xen/guest_access.h>
#include <xen/iommu.h>
#include <xen/device_tree.h>
+#include <xsm/xsm.h>
static spinlock_t dtdevs_lock = SPIN_LOCK_UNLOCKED;
@@ -41,7 +40,20 @@ int iommu_assign_dt_device(struct domain *d, struct dt_device_node *dev)
if ( !list_empty(&dev->domain_list) )
goto fail;
- rc = hd->platform_ops->assign_dt_device(d, dev);
+ if ( need_iommu(d) <= 0 )
+ {
+ /*
+         * The hwdom is forced to use the IOMMU for protecting assigned
+         * devices, so its IOMMU data is already set up.
+ */
+ ASSERT(!is_hardware_domain(d));
+ rc = iommu_construct(d);
+ if ( rc )
+ goto fail;
+ }
+
+    /* The flag field doesn't matter for DT devices. */
+ rc = hd->platform_ops->assign_device(d, 0, dt_to_dev(dev), 0);
if ( rc )
goto fail;
@@ -68,14 +80,12 @@ int iommu_deassign_dt_device(struct domain *d, struct dt_device_node *dev)
spin_lock(&dtdevs_lock);
- rc = hd->platform_ops->reassign_dt_device(d, hardware_domain, dev);
+ rc = hd->platform_ops->reassign_device(d, NULL, 0, dt_to_dev(dev));
if ( rc )
goto fail;
- list_del(&dev->domain_list);
-
- dt_device_set_used_by(dev, hardware_domain->domain_id);
- list_add(&dev->domain_list, &domain_hvm_iommu(hardware_domain)->dt_devices);
+ list_del_init(&dev->domain_list);
+ dt_device_set_used_by(dev, DOMID_IO);
fail:
spin_unlock(&dtdevs_lock);
@@ -83,6 +93,20 @@ fail:
return rc;
}
+static bool_t iommu_dt_device_is_assigned(const struct dt_device_node *dev)
+{
+ bool_t assigned = 0;
+
+ if ( !dt_device_is_protected(dev) )
+ return 0;
+
+ spin_lock(&dtdevs_lock);
+ assigned = !list_empty(&dev->domain_list);
+ spin_unlock(&dtdevs_lock);
+
+ return assigned;
+}
+
int iommu_dt_domain_init(struct domain *d)
{
struct hvm_iommu *hd = domain_hvm_iommu(d);
@@ -92,7 +116,7 @@ int iommu_dt_domain_init(struct domain *d)
return 0;
}
-void iommu_dt_domain_destroy(struct domain *d)
+int iommu_release_dt_devices(struct domain *d)
{
struct hvm_iommu *hd = domain_hvm_iommu(d);
struct dt_device_node *dev, *_dev;
@@ -102,7 +126,101 @@ void iommu_dt_domain_destroy(struct domain *d)
{
rc = iommu_deassign_dt_device(d, dev);
if ( rc )
+ {
dprintk(XENLOG_ERR, "Failed to deassign %s in domain %u\n",
dt_node_full_name(dev), d->domain_id);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+int iommu_do_dt_domctl(struct xen_domctl *domctl, struct domain *d,
+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+{
+ int ret;
+ struct dt_device_node *dev;
+
+ switch ( domctl->cmd )
+ {
+ case XEN_DOMCTL_assign_device:
+ ret = -ENODEV;
+ if ( domctl->u.assign_device.dev != XEN_DOMCTL_DEV_DT )
+ break;
+
+ if ( unlikely(d->is_dying) )
+ {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = dt_find_node_by_gpath(domctl->u.assign_device.u.dt.path,
+ domctl->u.assign_device.u.dt.size,
+ &dev);
+ if ( ret )
+ break;
+
+ ret = xsm_assign_dtdevice(XSM_HOOK, d, dt_node_full_name(dev));
+ if ( ret )
+ break;
+
+ ret = iommu_assign_dt_device(d, dev);
+
+ if ( ret )
+ printk(XENLOG_G_ERR "XEN_DOMCTL_assign_dt_device: assign \"%s\""
+ " to dom%u failed (%d)\n",
+ dt_node_full_name(dev), d->domain_id, ret);
+ break;
+
+ case XEN_DOMCTL_deassign_device:
+ ret = -ENODEV;
+ if ( domctl->u.assign_device.dev != XEN_DOMCTL_DEV_DT )
+ break;
+
+ ret = dt_find_node_by_gpath(domctl->u.assign_device.u.dt.path,
+ domctl->u.assign_device.u.dt.size,
+ &dev);
+ if ( ret )
+ break;
+
+        ret = xsm_deassign_dtdevice(XSM_HOOK, d, dt_node_full_name(dev));
+        if ( ret )
+            break;
+
+ ret = iommu_deassign_dt_device(d, dev);
+
+ if ( ret )
+            printk(XENLOG_G_ERR "XEN_DOMCTL_deassign_dt_device: deassign \"%s\""
+                   " from dom%u failed (%d)\n",
+ dt_node_full_name(dev), d->domain_id, ret);
+ break;
+
+ case XEN_DOMCTL_test_assign_device:
+ ret = -ENODEV;
+ if ( domctl->u.assign_device.dev != XEN_DOMCTL_DEV_DT )
+ break;
+
+ ret = dt_find_node_by_gpath(domctl->u.assign_device.u.dt.path,
+ domctl->u.assign_device.u.dt.size,
+ &dev);
+ if ( ret )
+ break;
+
+ ret = xsm_test_assign_dtdevice(XSM_HOOK, dt_node_full_name(dev));
+ if ( ret )
+ break;
+
+ if ( iommu_dt_device_is_assigned(dev) )
+ {
+ printk(XENLOG_G_ERR "%s already assigned.\n",
+ dt_node_full_name(dev));
+ ret = -EINVAL;
+ }
+ break;
+
+ default:
+ ret = -ENOSYS;
+ break;
}
+
+ return ret;
}
diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index 4cd32b5..bda9374 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Allen Kay <allen.m.kay at intel.com>
* Copyright (C) Xiaohui Xin <xiaohui.xin at intel.com>
@@ -20,14 +19,123 @@
#include <xen/event.h>
#include <xen/iommu.h>
+#include <xen/cpu.h>
#include <xen/irq.h>
#include <asm/hvm/irq.h>
#include <asm/hvm/iommu.h>
#include <asm/hvm/support.h>
#include <xen/hvm/irq.h>
-#include <xen/tasklet.h>
-static void hvm_dirq_assist(unsigned long _d);
+static DEFINE_PER_CPU(struct list_head, dpci_list);
+
+/*
+ * These two bit states help to safely schedule, deschedule, and wait until
+ * the softirq has finished.
+ *
+ * The semantics behind these two bits is as follow:
+ * - STATE_SCHED - whoever modifies it has to ref-count the domain (->dom).
+ * - STATE_RUN - only softirq is allowed to set and clear it. If it has
+ * been set hvm_dirq_assist will RUN with a saved value of the
+ * 'struct domain' copied from 'pirq_dpci->dom' before STATE_RUN was set.
+ *
+ * The usual states are: STATE_SCHED(set) -> STATE_RUN(set) ->
+ * STATE_SCHED(unset) -> STATE_RUN(unset).
+ *
+ * However the states can also diverge such as: STATE_SCHED(set) ->
+ * STATE_SCHED(unset) -> STATE_RUN(set) -> STATE_RUN(unset). That means
+ * the 'hvm_dirq_assist' never ran and the softirq did not do any
+ * ref-counting.
+ */
+
+enum {
+ STATE_SCHED,
+ STATE_RUN
+};
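
The two bits form a small lock-free handshake: the producer wins the right
to enqueue by setting STATE_SCHED, the consumer wins the right to run by
setting STATE_RUN. A user-space model of the two test-and-set operations,
assuming one atomic state word per pirq (C11 atomics; a sketch, not the
hypervisor primitives):

    #include <stdatomic.h>
    #include <stdbool.h>

    enum { STATE_SCHED, STATE_RUN };

    /* Producer: true means we won the right to enqueue, and must take
     * the domain reference, as raise_softirq_for() does below. */
    static bool try_schedule(atomic_ulong *state)
    {
        return !(atomic_fetch_or(state, 1UL << STATE_SCHED)
                 & (1UL << STATE_SCHED));
    }

    /* Consumer: true means we won the right to run; on false the entry
     * is requeued and retried, as dpci_softirq() does. */
    static bool try_run(atomic_ulong *state)
    {
        return !(atomic_fetch_or(state, 1UL << STATE_RUN)
                 & (1UL << STATE_RUN));
    }
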
+
+/*
+ * This can be called multiple times, but the softirq is only raised once.
+ * That is until the STATE_SCHED state has been cleared. The state can be
+ * cleared by: the 'dpci_softirq' (when it has executed 'hvm_dirq_assist'),
+ * or by 'pt_pirq_softirq_reset' (which will try to clear the state before
+ * the softirq has had a chance to run).
+ */
+static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
+{
+ unsigned long flags;
+
+ if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
+ return;
+
+ get_knownalive_domain(pirq_dpci->dom);
+
+ local_irq_save(flags);
+ list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
+ local_irq_restore(flags);
+
+ raise_softirq(HVM_DPCI_SOFTIRQ);
+}
+
+/*
+ * If we are racing with softirq_dpci (STATE_SCHED) we return
+ * true. Otherwise we return false.
+ *
+ * If it is false, it is the caller's responsibility to make sure
+ * that the softirq (with the event_lock dropped) has run.
+ */
+bool_t pt_pirq_softirq_active(struct hvm_pirq_dpci *pirq_dpci)
+{
+ if ( pirq_dpci->state & ((1 << STATE_RUN) | (1 << STATE_SCHED)) )
+ return 1;
+
+ /*
+     * If in the future we call 'raise_softirq_for' right after
+     * 'pt_pirq_softirq_active', we MUST reset the list (otherwise it
+     * might contain stale data).
+ */
+ return 0;
+}
+
+/*
+ * Reset the pirq_dpci->dom parameter to NULL.
+ *
+ * This function checks the different states to make sure it can do so
+ * at the right time. If it deschedules 'hvm_dirq_assist' from running,
+ * it also does the ref-counting the softirq would otherwise have done.
+ */
+static void pt_pirq_softirq_reset(struct hvm_pirq_dpci *pirq_dpci)
+{
+ struct domain *d = pirq_dpci->dom;
+
+ ASSERT(spin_is_locked(&d->event_lock));
+
+ switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 0) )
+ {
+ case (1 << STATE_SCHED):
+ /*
+         * We are going to try to de-schedule the softirq before it goes into
+ * STATE_RUN. Whoever clears STATE_SCHED MUST refcount the 'dom'.
+ */
+ put_domain(d);
+ /* fallthrough. */
+ case (1 << STATE_RUN):
+ case (1 << STATE_RUN) | (1 << STATE_SCHED):
+ /*
+         * The reason it is OK to reset 'dom' when the STATE_RUN bit is set is
+         * a shortcut that 'dpci_softirq' implements: it stashes 'dom' in a
+         * local variable before it sets STATE_RUN - and therefore will not
+         * dereference '->dom', which would crash.
+ */
+ pirq_dpci->dom = NULL;
+ break;
+ }
+ /*
+ * Inhibit 'hvm_dirq_assist' from doing anything useful and at worst
+ * calling 'set_timer' which will blow up (as we have called kill_timer
+ * or never initialized it). Note that we hold the lock that
+ * 'hvm_dirq_assist' could be spinning on.
+ */
+ pirq_dpci->masked = 0;
+}
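
The switch above hinges on a single cmpxchg: only a transition from exactly
"scheduled but not yet running" back to idle de-schedules the handler, and
whoever wins that transition inherits the duty to drop the domain
reference. The same operation in C11 atomics (a sketch):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Succeeds only if the state was exactly STATE_SCHED; the caller
     * must then drop the reference the softirq would have dropped. */
    static bool try_deschedule(atomic_ulong *state)
    {
        unsigned long expected = 1UL << 0;  /* STATE_SCHED only */

        return atomic_compare_exchange_strong(state, &expected, 0UL);
    }
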
bool_t pt_irq_need_timer(uint32_t flags)
{
@@ -101,6 +209,7 @@ int pt_irq_create_bind(
if ( pirq < 0 || pirq >= d->nr_pirqs )
return -EINVAL;
+ restart:
spin_lock(&d->event_lock);
hvm_irq_dpci = domain_get_irq_dpci(d);
@@ -114,9 +223,6 @@ int pt_irq_create_bind(
spin_unlock(&d->event_lock);
return -ENOMEM;
}
- softirq_tasklet_init(
- &hvm_irq_dpci->dirq_tasklet,
- hvm_dirq_assist, (unsigned long)d);
for ( i = 0; i < NR_HVM_IRQS; i++ )
INIT_LIST_HEAD(&hvm_irq_dpci->girq[i]);
@@ -131,6 +237,21 @@ int pt_irq_create_bind(
}
pirq_dpci = pirq_dpci(info);
+ /*
+     * A crude 'while' loop in which we drop the spinlock and give
+     * the softirq a chance to run.
+     * We MUST check for this condition, as the softirq could have been
+     * scheduled but not run yet. Note that this code replaced tasklet_kill,
+     * which would have spun forever while doing the same thing (waiting to
+     * flush out outstanding hvm_dirq_assist calls).
+ */
+ if ( pt_pirq_softirq_active(pirq_dpci) )
+ {
+ spin_unlock(&d->event_lock);
+ cpu_relax();
+ goto restart;
+ }
+
switch ( pt_irq_bind->irq_type )
{
case PT_IRQ_TYPE_MSI:
@@ -144,18 +265,40 @@ int pt_irq_create_bind(
HVM_IRQ_DPCI_GUEST_MSI;
pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
pirq_dpci->gmsi.gflags = pt_irq_bind->u.msi.gflags;
+ /*
+ * 'pt_irq_create_bind' can be called after 'pt_irq_destroy_bind'.
+ * The 'pirq_cleanup_check' which would free the structure is only
+         * OS-es that use event channels usually bind PIRQs to event
+         * channels and unbind them before calling 'pt_irq_destroy_bind' -
+         * with the result that we re-use the 'dpci' structure. This can
+         * be reproduced by unloading and reloading the driver for a device.
+ * reproduced with unloading and loading the driver for a device.
+ *
+ * As such on every 'pt_irq_create_bind' call we MUST set it.
+ */
+ pirq_dpci->dom = d;
/* bind after hvm_irq_dpci is setup to avoid race with irq handler*/
rc = pirq_guest_bind(d->vcpu[0], info, 0);
if ( rc == 0 && pt_irq_bind->u.msi.gtable )
{
rc = msixtbl_pt_register(d, info, pt_irq_bind->u.msi.gtable);
if ( unlikely(rc) )
+ {
pirq_guest_unbind(d, info);
+ /*
+                 * Between 'pirq_guest_bind' and 'pirq_guest_unbind'
+ * an interrupt can be scheduled. No more of them are going
+ * to be scheduled but we must deal with the one that may be
+ * in the queue.
+ */
+ pt_pirq_softirq_reset(pirq_dpci);
+ }
}
if ( unlikely(rc) )
{
pirq_dpci->gmsi.gflags = 0;
pirq_dpci->gmsi.gvec = 0;
+ pirq_dpci->dom = NULL;
pirq_dpci->flags = 0;
pirq_cleanup_check(info, d);
spin_unlock(&d->event_lock);
@@ -232,6 +375,7 @@ int pt_irq_create_bind(
{
unsigned int share;
+ /* MUST be set, as the pirq_dpci can be re-used. */
pirq_dpci->dom = d;
if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI_TRANSLATE )
{
@@ -258,6 +402,10 @@ int pt_irq_create_bind(
{
if ( pt_irq_need_timer(pirq_dpci->flags) )
kill_timer(&pirq_dpci->timer);
+ /*
+                 * There is no path for __do_IRQ to schedule the softirq as
+ * IRQ_GUEST is not set. As such we can reset 'dom' directly.
+ */
pirq_dpci->dom = NULL;
list_del(&girq->list);
list_del(&digl->list);
@@ -391,8 +539,13 @@ int pt_irq_destroy_bind(
msixtbl_pt_unregister(d, pirq);
if ( pt_irq_need_timer(pirq_dpci->flags) )
kill_timer(&pirq_dpci->timer);
- pirq_dpci->dom = NULL;
pirq_dpci->flags = 0;
+ /*
+ * See comment in pt_irq_create_bind's PT_IRQ_TYPE_MSI before the
+ * call to pt_pirq_softirq_reset.
+ */
+ pt_pirq_softirq_reset(pirq_dpci);
+
pirq_cleanup_check(pirq, d);
}
@@ -419,7 +572,12 @@ void pt_pirq_init(struct domain *d, struct hvm_pirq_dpci *dpci)
bool_t pt_pirq_cleanup_check(struct hvm_pirq_dpci *dpci)
{
- return !dpci->flags;
+ if ( !dpci->flags && !pt_pirq_softirq_active(dpci) )
+ {
+ dpci->dom = NULL;
+ return 1;
+ }
+ return 0;
}
int pt_pirq_iterate(struct domain *d,
@@ -459,7 +617,7 @@ int hvm_do_IRQ_dpci(struct domain *d, struct pirq *pirq)
return 0;
pirq_dpci->masked = 1;
- tasklet_schedule(&dpci->dirq_tasklet);
+ raise_softirq_for(pirq_dpci);
return 1;
}
@@ -513,9 +671,11 @@ void hvm_dpci_msi_eoi(struct domain *d, int vector)
spin_unlock(&d->event_lock);
}
-static int _hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
- void *arg)
+static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
{
+ ASSERT(d->arch.hvm_domain.irq.dpci);
+
+ spin_lock(&d->event_lock);
if ( test_and_clear_bool(pirq_dpci->masked) )
{
struct pirq *pirq = dpci_pirq(pirq_dpci);
@@ -526,13 +686,17 @@ static int _hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
send_guest_pirq(d, pirq);
if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
- return 0;
+ {
+ spin_unlock(&d->event_lock);
+ return;
+ }
}
if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
{
vmsi_deliver_pirq(d, pirq_dpci);
- return 0;
+ spin_unlock(&d->event_lock);
+ return;
}
list_for_each_entry ( digl, &pirq_dpci->digl_list, list )
@@ -545,7 +709,8 @@ static int _hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
{
/* for translated MSI to INTx interrupt, eoi as early as possible */
__msi_pirq_eoi(pirq_dpci);
- return 0;
+ spin_unlock(&d->event_lock);
+ return;
}
/*
@@ -558,18 +723,6 @@ static int _hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
ASSERT(pt_irq_need_timer(pirq_dpci->flags));
set_timer(&pirq_dpci->timer, NOW() + PT_IRQ_TIME_OUT);
}
-
- return 0;
-}
-
-static void hvm_dirq_assist(unsigned long _d)
-{
- struct domain *d = (struct domain *)_d;
-
- ASSERT(d->arch.hvm_domain.irq.dpci);
-
- spin_lock(&d->event_lock);
- pt_pirq_iterate(d, _hvm_dirq_assist, NULL);
spin_unlock(&d->event_lock);
}
@@ -625,3 +778,93 @@ void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi,
unlock:
spin_unlock(&d->event_lock);
}
+
+/*
+ * Note: 'pt_pirq_softirq_reset' can clear STATE_SCHED before we get to
+ * doing so ourselves; in that case 'pt_pirq_softirq_reset' does the
+ * ref-counting.
+ */
+static void dpci_softirq(void)
+{
+ unsigned int cpu = smp_processor_id();
+ LIST_HEAD(our_list);
+
+ local_irq_disable();
+ list_splice_init(&per_cpu(dpci_list, cpu), &our_list);
+ local_irq_enable();
+
+ while ( !list_empty(&our_list) )
+ {
+ struct hvm_pirq_dpci *pirq_dpci;
+ struct domain *d;
+
+ pirq_dpci = list_entry(our_list.next, struct hvm_pirq_dpci, softirq_list);
+ list_del(&pirq_dpci->softirq_list);
+
+ d = pirq_dpci->dom;
+ smp_mb(); /* 'd' MUST be saved before we set/clear the bits. */
+ if ( test_and_set_bit(STATE_RUN, &pirq_dpci->state) )
+ {
+ unsigned long flags;
+
+ /* Put back on the list and retry. */
+ local_irq_save(flags);
+ list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
+ local_irq_restore(flags);
+
+ raise_softirq(HVM_DPCI_SOFTIRQ);
+ continue;
+ }
+ /*
+ * Whoever clears STATE_SCHED MUST drop the domain reference
+ * taken when the bit was set.
+ */
+ if ( test_and_clear_bit(STATE_SCHED, &pirq_dpci->state) )
+ {
+ hvm_dirq_assist(d, pirq_dpci);
+ put_domain(d);
+ }
+ clear_bit(STATE_RUN, &pirq_dpci->state);
+ }
+}
+
+static int cpu_callback(
+ struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch ( action )
+ {
+ case CPU_UP_PREPARE:
+ INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ /*
+ * On CPU_DYING this callback is called (on the CPU that is dying)
+ * with a possible HVM_DPCI_SOFTIRQ pending - at which point we can
+ * clear out any outstanding domains (by virtue of the idle loop
+ * calling the softirq later). In the CPU_DEAD case the CPU is dead
+ * and there are no pending softirqs left for us to handle.
+ */
+ ASSERT(list_empty(&per_cpu(dpci_list, cpu)));
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block cpu_nfb = {
+ .notifier_call = cpu_callback,
+};
+
+static int __init setup_dpci_softirq(void)
+{
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));
+
+ open_softirq(HVM_DPCI_SOFTIRQ, dpci_softirq);
+ register_cpu_notifier(&cpu_nfb);
+ return 0;
+}
+__initcall(setup_dpci_softirq);
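
For reference, dpci_softirq() above is the consumer half of a two-bit
handoff: STATE_SCHED means "queued and holding a domain reference",
STATE_RUN means "currently executing". The producer half,
raise_softirq_for() (added by an earlier hunk of this patch), takes
roughly this shape - a sketch only, assuming the usual Xen per-cpu
list and refcount helpers:

    static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
    {
        unsigned long flags;

        /* Whoever sets STATE_SCHED owns a domain reference until the
         * bit is cleared - by dpci_softirq() or pt_pirq_softirq_reset(). */
        if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
            return;                     /* Already queued; nothing to do. */

        get_knownalive_domain(pirq_dpci->dom);

        local_irq_save(flags);
        list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
        local_irq_restore(flags);

        raise_softirq(HVM_DPCI_SOFTIRQ);
    }
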
diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
index cc12735..d513773 100644
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -9,8 +9,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/sched.h>
@@ -47,6 +46,7 @@ bool_t __read_mostly force_iommu;
bool_t __hwdom_initdata iommu_dom0_strict;
bool_t __read_mostly iommu_verbose;
bool_t __read_mostly iommu_workaround_bios_bug;
+bool_t __read_mostly iommu_igfx = 1;
bool_t __read_mostly iommu_passthrough;
bool_t __read_mostly iommu_snoop = 1;
bool_t __read_mostly iommu_qinval = 1;
@@ -87,6 +87,8 @@ static void __init parse_iommu_param(char *s)
force_iommu = val;
else if ( !strcmp(s, "workaround_bios_bug") )
iommu_workaround_bios_bug = val;
+ else if ( !strcmp(s, "igfx") )
+ iommu_igfx = val;
else if ( !strcmp(s, "verbose") )
iommu_verbose = val;
else if ( !strcmp(s, "snoop") )
@@ -187,6 +189,32 @@ void iommu_teardown(struct domain *d)
tasklet_schedule(&iommu_pt_cleanup_tasklet);
}
+int iommu_construct(struct domain *d)
+{
+ if ( need_iommu(d) > 0 )
+ return 0;
+
+ if ( !iommu_use_hap_pt(d) )
+ {
+ int rc;
+
+ rc = arch_iommu_populate_page_table(d);
+ if ( rc )
+ return rc;
+ }
+
+ d->need_iommu = 1;
+ /*
+ * There may be dirty cache lines when a device is assigned
+ * before need_iommu(d) becomes true; this would cause
+ * memory_type_changed() to have no effect if the memory type
+ * changes. Call memory_type_changed() here to compensate.
+ */
+ memory_type_changed(d);
+
+ return 0;
+}
+
void iommu_domain_destroy(struct domain *d)
{
struct hvm_iommu *hd = domain_hvm_iommu(d);
@@ -309,7 +337,7 @@ int iommu_do_domctl(
struct xen_domctl *domctl, struct domain *d,
XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
{
- int ret = -ENOSYS;
+ int ret = -ENODEV;
if ( !iommu_enabled )
return -ENOSYS;
@@ -318,6 +346,11 @@ int iommu_do_domctl(
ret = iommu_do_pci_domctl(domctl, d, u_domctl);
#endif
+#ifdef HAS_DEVICE_TREE
+ if ( ret == -ENODEV )
+ ret = iommu_do_dt_domctl(domctl, d, u_domctl);
+#endif
+
return ret;
}
@@ -332,7 +365,7 @@ void iommu_share_p2m_table(struct domain* d)
{
const struct iommu_ops *ops = iommu_get_ops();
- if ( iommu_enabled && is_hvm_domain(d) )
+ if ( iommu_enabled && iommu_use_hap_pt(d) )
ops->share_p2m(d);
}
@@ -344,6 +377,16 @@ void iommu_crash_shutdown(void)
iommu_enabled = iommu_intremap = 0;
}
+int iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt)
+{
+ const struct iommu_ops *ops = iommu_get_ops();
+
+ if ( !iommu_enabled || !ops->get_reserved_device_memory )
+ return 0;
+
+ return ops->get_reserved_device_memory(func, ctxt);
+}
+
bool_t iommu_has_feature(struct domain *d, enum iommu_feature feature)
{
const struct hvm_iommu *hd = domain_hvm_iommu(d);
@@ -368,7 +411,7 @@ static void iommu_dump_p2m_table(unsigned char key)
ops = iommu_get_ops();
for_each_domain(d)
{
- if ( is_hardware_domain(d) )
+ if ( is_hardware_domain(d) || need_iommu(d) <= 0 )
continue;
if ( iommu_use_hap_pt(d) )
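
The new "igfx" token slots into parse_iommu_param()'s usual scheme: the
"iommu=" string is split on commas, and each token may carry a "no-"
prefix that clears the flag. A minimal sketch of that scheme
(parse_bool_opts is a hypothetical stand-in for the real parser):

    static void parse_bool_opts(char *s)
    {
        char *ss;

        do {
            int val = !!strncmp(s, "no-", 3);

            if ( !val )
                s += 3;                   /* Skip the "no-" prefix. */

            ss = strchr(s, ',');
            if ( ss )
                *ss = '\0';               /* Terminate this token. */

            if ( !strcmp(s, "igfx") )
                iommu_igfx = val;         /* "iommu=no-igfx" clears it. */

            s = ss + 1;
        } while ( ss );
    }

So "iommu=no-igfx" disables VT-d for the integrated graphics device,
which is what is_igd_vt_enabled_quirk() (further down) now checks.
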
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 1eba833..27b3ca7 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/sched.h>
@@ -447,7 +446,6 @@ int __init pci_ro_device(int seg, int bus, int devfn)
}
__set_bit(PCI_BDF2(bus, devfn), pseg->ro_map);
- arch_pci_ro_device(seg, PCI_BDF2(bus, devfn));
_pci_hide_device(pdev);
return 0;
@@ -568,7 +566,8 @@ static void pci_enable_acs(struct pci_dev *pdev)
pci_conf_write16(seg, bus, dev, func, pos + PCI_ACS_CTRL, ctrl);
}
-int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *info)
+int pci_add_device(u16 seg, u8 bus, u8 devfn,
+ const struct pci_dev_info *info, nodeid_t node)
{
struct pci_seg *pseg;
struct pci_dev *pdev;
@@ -586,7 +585,8 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *info)
pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
spin_unlock(&pcidevs_lock);
if ( !pdev )
- pci_add_device(seg, info->physfn.bus, info->physfn.devfn, NULL);
+ pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
+ NULL, node);
pdev_type = "virtual function";
}
else
@@ -609,6 +609,8 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *info)
if ( !pdev )
goto out;
+ pdev->node = node;
+
if ( info )
pdev->info = *info;
else if ( !pdev->vf_rlen[0] )
@@ -767,40 +769,51 @@ static int pci_clean_dpci_irq(struct domain *d,
xfree(digl);
}
- return 0;
+ return pt_pirq_softirq_active(pirq_dpci) ? -ERESTART : 0;
}
-static void pci_clean_dpci_irqs(struct domain *d)
+static int pci_clean_dpci_irqs(struct domain *d)
{
struct hvm_irq_dpci *hvm_irq_dpci = NULL;
if ( !iommu_enabled )
- return;
+ return 0;
if ( !is_hvm_domain(d) )
- return;
+ return 0;
spin_lock(&d->event_lock);
hvm_irq_dpci = domain_get_irq_dpci(d);
if ( hvm_irq_dpci != NULL )
{
- tasklet_kill(&hvm_irq_dpci->dirq_tasklet);
+ int ret = pt_pirq_iterate(d, pci_clean_dpci_irq, NULL);
- pt_pirq_iterate(d, pci_clean_dpci_irq, NULL);
+ if ( ret )
+ {
+ spin_unlock(&d->event_lock);
+ return ret;
+ }
d->arch.hvm_domain.irq.dpci = NULL;
free_hvm_irq_dpci(hvm_irq_dpci);
}
spin_unlock(&d->event_lock);
+ return 0;
}
-void pci_release_devices(struct domain *d)
+int pci_release_devices(struct domain *d)
{
struct pci_dev *pdev;
u8 bus, devfn;
+ int ret;
spin_lock(&pcidevs_lock);
- pci_clean_dpci_irqs(d);
+ ret = pci_clean_dpci_irqs(d);
+ if ( ret )
+ {
+ spin_unlock(&pcidevs_lock);
+ return ret;
+ }
while ( (pdev = pci_get_pdev_by_domain(d, -1, -1, -1)) )
{
bus = pdev->bus;
@@ -811,6 +824,8 @@ void pci_release_devices(struct domain *d)
PCI_SLOT(devfn), PCI_FUNC(devfn));
}
spin_unlock(&pcidevs_lock);
+
+ return 0;
}
#define PCI_CLASS_BRIDGE_HOST 0x0600
@@ -887,10 +902,7 @@ out:
return ret;
}
-/*
- * detect pci device, return 0 if it exists, or return 0
- */
-int __init pci_device_detect(u16 seg, u8 bus, u8 dev, u8 func)
+bool_t __init pci_device_detect(u16 seg, u8 bus, u8 dev, u8 func)
{
u32 vendor;
@@ -948,7 +960,7 @@ static int __init _scan_pci_devices(struct pci_seg *pseg, void *arg)
{
for ( func = 0; func < 8; func++ )
{
- if ( pci_device_detect(pseg->nr, bus, dev, func) == 0 )
+ if ( !pci_device_detect(pseg->nr, bus, dev, func) )
{
if ( !func )
break;
@@ -1178,10 +1190,11 @@ static int _dump_pci_devices(struct pci_seg *pseg, void *arg)
list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
{
- printk("%04x:%02x:%02x.%u - dom %-3d - MSIs < ",
+ printk("%04x:%02x:%02x.%u - dom %-3d - node %-3d - MSIs < ",
pseg->nr, pdev->bus,
PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn),
- pdev->domain ? pdev->domain->domain_id : -1);
+ pdev->domain ? pdev->domain->domain_id : -1,
+ (pdev->node != NUMA_NO_NODE) ? pdev->node : -1);
list_for_each_entry ( msi, &pdev->msi_list, list )
printk("%d ", msi->irq);
printk(">\n");
@@ -1241,7 +1254,7 @@ int iommu_add_device(struct pci_dev *pdev)
if ( !iommu_enabled || !hd->platform_ops )
return 0;
- rc = hd->platform_ops->add_device(pdev->devfn, pdev);
+ rc = hd->platform_ops->add_device(pdev->devfn, pci_to_dev(pdev));
if ( rc || !pdev->phantom_stride )
return rc;
@@ -1250,7 +1263,7 @@ int iommu_add_device(struct pci_dev *pdev)
devfn += pdev->phantom_stride;
if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
return 0;
- rc = hd->platform_ops->add_device(devfn, pdev);
+ rc = hd->platform_ops->add_device(devfn, pci_to_dev(pdev));
if ( rc )
printk(XENLOG_WARNING "IOMMU: add %04x:%02x:%02x.%u failed (%d)\n",
pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), rc);
@@ -1271,7 +1284,7 @@ int iommu_enable_device(struct pci_dev *pdev)
!hd->platform_ops->enable_device )
return 0;
- return hd->platform_ops->enable_device(pdev);
+ return hd->platform_ops->enable_device(pci_to_dev(pdev));
}
int iommu_remove_device(struct pci_dev *pdev)
@@ -1293,7 +1306,7 @@ int iommu_remove_device(struct pci_dev *pdev)
devfn += pdev->phantom_stride;
if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
break;
- rc = hd->platform_ops->remove_device(devfn, pdev);
+ rc = hd->platform_ops->remove_device(devfn, pci_to_dev(pdev));
if ( !rc )
continue;
@@ -1302,7 +1315,7 @@ int iommu_remove_device(struct pci_dev *pdev)
return rc;
}
- return hd->platform_ops->remove_device(pdev->devfn, pdev);
+ return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev));
}
/*
@@ -1320,7 +1333,7 @@ static int device_assigned(u16 seg, u8 bus, u8 devfn)
return pdev ? 0 : -EBUSY;
}
-static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
{
struct hvm_iommu *hd = domain_hvm_iommu(d);
struct pci_dev *pdev;
@@ -1333,25 +1346,18 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
* enabled for this domain */
if ( unlikely(!need_iommu(d) &&
(d->arch.hvm_domain.mem_sharing_enabled ||
- d->mem_event->paging.ring_page ||
+ d->vm_event->paging.ring_page ||
p2m_get_hostp2m(d)->global_logdirty)) )
return -EXDEV;
if ( !spin_trylock(&pcidevs_lock) )
return -ERESTART;
- if ( need_iommu(d) <= 0 )
+ rc = iommu_construct(d);
+ if ( rc )
{
- if ( !iommu_use_hap_pt(d) )
- {
- rc = arch_iommu_populate_page_table(d);
- if ( rc )
- {
- spin_unlock(&pcidevs_lock);
- return rc;
- }
- }
- d->need_iommu = 1;
+ spin_unlock(&pcidevs_lock);
+ return rc;
}
pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
@@ -1363,7 +1369,7 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
pdev->fault.count = 0;
- if ( (rc = hd->platform_ops->assign_device(d, devfn, pdev)) )
+ if ( (rc = hd->platform_ops->assign_device(d, devfn, pci_to_dev(pdev), flag)) )
goto done;
for ( ; pdev->phantom_stride; rc = 0 )
@@ -1371,7 +1377,7 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
devfn += pdev->phantom_stride;
if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
break;
- rc = hd->platform_ops->assign_device(d, devfn, pdev);
+ rc = hd->platform_ops->assign_device(d, devfn, pci_to_dev(pdev), flag);
if ( rc )
printk(XENLOG_G_WARNING "d%d: assign %04x:%02x:%02x.%u failed (%d)\n",
d->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
@@ -1406,7 +1412,8 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
devfn += pdev->phantom_stride;
if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
break;
- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, pdev);
+ ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
+ pci_to_dev(pdev));
if ( !ret )
continue;
@@ -1416,7 +1423,8 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
}
devfn = pdev->devfn;
- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, pdev);
+ ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
+ pci_to_dev(pdev));
if ( ret )
{
dprintk(XENLOG_G_ERR,
@@ -1486,7 +1494,9 @@ int iommu_do_pci_domctl(
{
u16 seg;
u8 bus, devfn;
+ u32 flag;
int ret = 0;
+ uint32_t machine_sbdf;
switch ( domctl->cmd )
{
@@ -1500,8 +1510,8 @@ int iommu_do_pci_domctl(
break;
seg = domctl->u.get_device_group.machine_sbdf >> 16;
- bus = (domctl->u.get_device_group.machine_sbdf >> 8) & 0xff;
- devfn = domctl->u.get_device_group.machine_sbdf & 0xff;
+ bus = PCI_BUS(domctl->u.get_device_group.machine_sbdf);
+ devfn = PCI_DEVFN2(domctl->u.get_device_group.machine_sbdf);
max_sdevs = domctl->u.get_device_group.max_sdevs;
sdevs = domctl->u.get_device_group.sdev_array;
@@ -1523,13 +1533,19 @@ int iommu_do_pci_domctl(
break;
case XEN_DOMCTL_test_assign_device:
- ret = xsm_test_assign_device(XSM_HOOK, domctl->u.assign_device.machine_sbdf);
+ ret = -ENODEV;
+ if ( domctl->u.assign_device.dev != XEN_DOMCTL_DEV_PCI )
+ break;
+
+ machine_sbdf = domctl->u.assign_device.u.pci.machine_sbdf;
+
+ ret = xsm_test_assign_device(XSM_HOOK, machine_sbdf);
if ( ret )
break;
- seg = domctl->u.assign_device.machine_sbdf >> 16;
- bus = (domctl->u.assign_device.machine_sbdf >> 8) & 0xff;
- devfn = domctl->u.assign_device.machine_sbdf & 0xff;
+ seg = machine_sbdf >> 16;
+ bus = PCI_BUS(machine_sbdf);
+ devfn = PCI_DEVFN2(machine_sbdf);
if ( device_assigned(seg, bus, devfn) )
{
@@ -1541,22 +1557,34 @@ int iommu_do_pci_domctl(
break;
case XEN_DOMCTL_assign_device:
+ ret = -ENODEV;
+ if ( domctl->u.assign_device.dev != XEN_DOMCTL_DEV_PCI )
+ break;
+
if ( unlikely(d->is_dying) )
{
ret = -EINVAL;
break;
}
- ret = xsm_assign_device(XSM_HOOK, d, domctl->u.assign_device.machine_sbdf);
+ machine_sbdf = domctl->u.assign_device.u.pci.machine_sbdf;
+
+ ret = xsm_assign_device(XSM_HOOK, d, machine_sbdf);
if ( ret )
break;
- seg = domctl->u.assign_device.machine_sbdf >> 16;
- bus = (domctl->u.assign_device.machine_sbdf >> 8) & 0xff;
- devfn = domctl->u.assign_device.machine_sbdf & 0xff;
+ seg = machine_sbdf >> 16;
+ bus = PCI_BUS(machine_sbdf);
+ devfn = PCI_DEVFN2(machine_sbdf);
+ flag = domctl->u.assign_device.flag;
+ if ( flag & ~XEN_DOMCTL_DEV_RDM_RELAXED )
+ {
+ ret = -EINVAL;
+ break;
+ }
ret = device_assigned(seg, bus, devfn) ?:
- assign_device(d, seg, bus, devfn);
+ assign_device(d, seg, bus, devfn, flag);
if ( ret == -ERESTART )
ret = hypercall_create_continuation(__HYPERVISOR_domctl,
"h", u_domctl);
@@ -1569,13 +1597,19 @@ int iommu_do_pci_domctl(
break;
case XEN_DOMCTL_deassign_device:
- ret = xsm_deassign_device(XSM_HOOK, d, domctl->u.assign_device.machine_sbdf);
+ ret = -ENODEV;
+ if ( domctl->u.assign_device.dev != XEN_DOMCTL_DEV_PCI )
+ break;
+
+ machine_sbdf = domctl->u.assign_device.u.pci.machine_sbdf;
+
+ ret = xsm_deassign_device(XSM_HOOK, d, machine_sbdf);
if ( ret )
break;
- seg = domctl->u.assign_device.machine_sbdf >> 16;
- bus = (domctl->u.assign_device.machine_sbdf >> 8) & 0xff;
- devfn = domctl->u.assign_device.machine_sbdf & 0xff;
+ seg = machine_sbdf >> 16;
+ bus = PCI_BUS(machine_sbdf);
+ devfn = PCI_DEVFN2(machine_sbdf);
spin_lock(&pcidevs_lock);
ret = deassign_device(d, seg, bus, devfn);
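
Throughout these domctl hunks the open-coded shifts are replaced by the
PCI_BUS()/PCI_DEVFN2() accessors. The machine_sbdf layout itself is
unchanged - segment in bits 31:16, bus in 15:8, devfn in 7:0 - as this
small standalone sketch of the decoding shows:

    #include <stdint.h>
    #include <stdio.h>

    #define PCI_BUS(sbdf)    (((sbdf) >> 8) & 0xff)
    #define PCI_DEVFN2(sbdf) ((sbdf) & 0xff)
    #define PCI_SLOT(devfn)  (((devfn) >> 3) & 0x1f)
    #define PCI_FUNC(devfn)  ((devfn) & 0x07)

    int main(void)
    {
        uint32_t machine_sbdf = 0x00010318;   /* 0001:03:03.0 */

        printf("%04x:%02x:%02x.%u\n",
               machine_sbdf >> 16,
               PCI_BUS(machine_sbdf),
               PCI_SLOT(PCI_DEVFN2(machine_sbdf)),
               PCI_FUNC(PCI_DEVFN2(machine_sbdf)));
        return 0;
    }
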
diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c
index 1152c3a..34ec4c7 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Ashok Raj <ashok.raj at intel.com>
* Copyright (C) Shaohua Li <shaohua.li at intel.com>
@@ -28,6 +27,7 @@
#include <xen/xmalloc.h>
#include <xen/pci.h>
#include <xen/pci_regs.h>
+#include <asm/atomic.h>
#include <asm/string.h>
#include "dmar.h"
#include "iommu.h"
@@ -80,6 +80,16 @@ static int __init acpi_register_rmrr_unit(struct acpi_rmrr_unit *rmrr)
return 0;
}
+static void scope_devices_free(struct dmar_scope *scope)
+{
+ if ( !scope )
+ return;
+
+ scope->devices_cnt = 0;
+ xfree(scope->devices);
+ scope->devices = NULL;
+}
+
static void __init disable_all_dmar_units(void)
{
struct acpi_drhd_unit *drhd, *_drhd;
@@ -89,16 +99,19 @@ static void __init disable_all_dmar_units(void)
list_for_each_entry_safe ( drhd, _drhd, &acpi_drhd_units, list )
{
list_del(&drhd->list);
+ scope_devices_free(&drhd->scope);
xfree(drhd);
}
list_for_each_entry_safe ( rmrr, _rmrr, &acpi_rmrr_units, list )
{
list_del(&rmrr->list);
+ scope_devices_free(&rmrr->scope);
xfree(rmrr);
}
list_for_each_entry_safe ( atsr, _atsr, &acpi_atsr_units, list )
{
list_del(&atsr->list);
+ scope_devices_free(&atsr->scope);
xfree(atsr);
}
}
@@ -317,13 +330,13 @@ static int __init acpi_parse_dev_scope(
if ( (cnt = scope_device_count(start, end)) < 0 )
return cnt;
- scope->devices_cnt = cnt;
if ( cnt > 0 )
{
scope->devices = xzalloc_array(u16, cnt);
if ( !scope->devices )
return -ENOMEM;
}
+ scope->devices_cnt = cnt;
while ( start < end )
{
@@ -426,7 +439,7 @@ static int __init acpi_parse_dev_scope(
out:
if ( ret )
- xfree(scope->devices);
+ scope_devices_free(scope);
return ret;
}
@@ -523,7 +536,7 @@ acpi_parse_one_drhd(struct acpi_dmar_header *header)
d = PCI_SLOT(dmaru->scope.devices[i]);
f = PCI_FUNC(dmaru->scope.devices[i]);
- if ( pci_device_detect(drhd->segment, b, d, f) == 0 )
+ if ( !pci_device_detect(drhd->segment, b, d, f) )
{
dprintk(XENLOG_WARNING VTDPREFIX,
" Non-existent device (%04x:%02x:%02x.%u) is reported"
@@ -541,6 +554,7 @@ acpi_parse_one_drhd(struct acpi_dmar_header *header)
" Workaround BIOS bug: ignore the DRHD due to all "
"devices under its scope are not PCI discoverable!\n");
+ scope_devices_free(&dmaru->scope);
iommu_free(dmaru);
xfree(dmaru);
}
@@ -561,9 +575,11 @@ acpi_parse_one_drhd(struct acpi_dmar_header *header)
out:
if ( ret )
{
+ scope_devices_free(&dmaru->scope);
iommu_free(dmaru);
xfree(dmaru);
}
+
return ret;
}
@@ -635,7 +651,7 @@ acpi_parse_one_rmrr(struct acpi_dmar_header *header)
d = PCI_SLOT(rmrru->scope.devices[i]);
f = PCI_FUNC(rmrru->scope.devices[i]);
- if ( pci_device_detect(rmrr->segment, b, d, f) == 0 )
+ if ( !pci_device_detect(rmrr->segment, b, d, f) )
{
dprintk(XENLOG_WARNING VTDPREFIX,
" Non-existent device (%04x:%02x:%02x.%u) is reported"
@@ -657,6 +673,7 @@ acpi_parse_one_rmrr(struct acpi_dmar_header *header)
" Ignore the RMRR (%"PRIx64", %"PRIx64") due to "
"devices under its scope are not PCI discoverable!\n",
rmrru->base_address, rmrru->end_address);
+ scope_devices_free(&rmrru->scope);
xfree(rmrru);
}
else if ( base_addr > end_addr )
@@ -664,6 +681,7 @@ acpi_parse_one_rmrr(struct acpi_dmar_header *header)
dprintk(XENLOG_WARNING VTDPREFIX,
" The RMRR (%"PRIx64", %"PRIx64") is incorrect!\n",
rmrru->base_address, rmrru->end_address);
+ scope_devices_free(&rmrru->scope);
xfree(rmrru);
ret = -EFAULT;
}
@@ -726,7 +744,10 @@ acpi_parse_one_atsr(struct acpi_dmar_header *header)
}
if ( ret )
+ {
+ scope_devices_free(&atsru->scope);
xfree(atsru);
+ }
else
acpi_register_atsr_unit(atsru);
return ret;
@@ -838,8 +859,7 @@ static int __init acpi_parse_dmar(struct acpi_table_header *table)
out:
/* Zap ACPI DMAR signature to prevent dom0 using vt-d HW. */
- dmar->header.signature[0] = 'X';
- dmar->header.checksum -= 'X'-'D';
+ acpi_dmar_zap();
return ret;
}
@@ -867,18 +887,18 @@ int __init acpi_dmar_init(void)
void acpi_dmar_reinstate(void)
{
- if ( dmar_table == NULL )
- return;
- dmar_table->signature[0] = 'D';
- dmar_table->checksum += 'X'-'D';
+ uint32_t sig = 0x52414d44; /* "DMAR" */
+
+ if ( dmar_table )
+ write_atomic((uint32_t*)&dmar_table->signature[0], sig);
}
void acpi_dmar_zap(void)
{
- if ( dmar_table == NULL )
- return;
- dmar_table->signature[0] = 'X';
- dmar_table->checksum -= 'X'-'D';
+ uint32_t sig = 0x44414d52; /* "RMAD" - doesn't alter table checksum */
+
+ if ( dmar_table )
+ write_atomic((uint32_t*)&dmar_table->signature[0], sig);
}
int platform_supports_intremap(void)
@@ -893,3 +913,30 @@ int platform_supports_x2apic(void)
unsigned int mask = ACPI_DMAR_INTR_REMAP | ACPI_DMAR_X2APIC_OPT_OUT;
return cpu_has_x2apic && ((dmar_flags & mask) == ACPI_DMAR_INTR_REMAP);
}
+
+int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt)
+{
+ struct acpi_rmrr_unit *rmrr, *rmrr_cur = NULL;
+ unsigned int i;
+ u16 bdf;
+
+ for_each_rmrr_device ( rmrr, bdf, i )
+ {
+ int rc;
+
+ if ( rmrr == rmrr_cur )
+ continue;
+
+ rc = func(PFN_DOWN(rmrr->base_address),
+ PFN_UP(rmrr->end_address) - PFN_DOWN(rmrr->base_address),
+ PCI_SBDF2(rmrr->segment, bdf), ctxt);
+
+ if ( unlikely(rc < 0) )
+ return rc;
+
+ if ( rc )
+ rmrr_cur = rmrr;
+ }
+
+ return 0;
+}
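
The zap/reinstate rework above relies on a checksum identity: the ACPI
table checksum is a plain byte sum, and "RMAD" is a permutation of
"DMAR", so overwriting the signature in one atomic 32-bit store leaves
the checksum valid with no separate fix-up. A standalone check:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    static uint8_t byte_sum(const uint8_t *p, size_t n)
    {
        uint8_t sum = 0;

        while ( n-- )
            sum += *p++;          /* ACPI checksums are mod-256 byte sums. */
        return sum;
    }

    int main(void)
    {
        /* Same four bytes, different order => same checksum. */
        assert(byte_sum((const uint8_t *)"DMAR", 4) ==
               byte_sum((const uint8_t *)"RMAD", 4));
        return 0;
    }
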
diff --git a/xen/drivers/passthrough/vtd/dmar.h b/xen/drivers/passthrough/vtd/dmar.h
index af1feef..729b603 100644
--- a/xen/drivers/passthrough/vtd/dmar.h
+++ b/xen/drivers/passthrough/vtd/dmar.h
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Ashok Raj <ashok.raj at intel.com>
* Copyright (C) Shaohua Li <shaohua.li at intel.com>
@@ -129,7 +128,6 @@ do { \
int vtd_hw_check(void);
void disable_pmr(struct iommu *iommu);
-int is_usb_device(u16 seg, u8 bus, u8 devfn);
int is_igd_drhd(struct acpi_drhd_unit *drhd);
#endif /* _DMAR_H_ */
diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index 5524dba..8acf889 100644
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Allen Kay <allen.m.kay at intel.com>
* Copyright (C) Weidong Han <weidong.han at intel.com>
@@ -75,6 +74,7 @@ int domain_context_mapping_one(struct domain *domain, struct iommu *iommu,
u8 bus, u8 devfn, const struct pci_dev *);
int domain_context_unmap_one(struct domain *domain, struct iommu *iommu,
u8 bus, u8 devfn);
+int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg);
void io_apic_write_remap_rte(unsigned int apic,
diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c
index 0333686..987bbe9 100644
--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Allen Kay <allen.m.kay at intel.com>
* Copyright (C) Xiaohui Xin <xiaohui.xin at intel.com>
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 5a946d4..dd13865 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Ashok Raj <ashok.raj at intel.com>
* Copyright (C) Shaohua Li <shaohua.li at intel.com>
@@ -190,14 +189,15 @@ u64 alloc_pgtable_maddr(struct acpi_drhd_unit *drhd, unsigned long npages)
struct acpi_rhsa_unit *rhsa;
struct page_info *pg, *cur_pg;
u64 *vaddr;
- int node = -1, i;
+ nodeid_t node = NUMA_NO_NODE;
+ unsigned int i;
rhsa = drhd_to_rhsa(drhd);
if ( rhsa )
node = pxm_to_node(rhsa->proximity_domain);
pg = alloc_domheap_pages(NULL, get_order_from_pages(npages),
- (node == -1 ) ? 0 : MEMF_node(node));
+ (node == NUMA_NO_NODE) ? 0 : MEMF_node(node));
if ( !pg )
return 0;
@@ -991,24 +991,30 @@ static void dma_msi_unmask(struct irq_desc *desc)
{
struct iommu *iommu = desc->action->dev_id;
unsigned long flags;
+ u32 sts;
/* unmask it */
spin_lock_irqsave(&iommu->register_lock, flags);
- dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
+ sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
+ sts &= ~DMA_FECTL_IM;
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
spin_unlock_irqrestore(&iommu->register_lock, flags);
- iommu->msi.msi_attrib.masked = 0;
+ iommu->msi.msi_attrib.host_masked = 0;
}
static void dma_msi_mask(struct irq_desc *desc)
{
unsigned long flags;
struct iommu *iommu = desc->action->dev_id;
+ u32 sts;
/* mask it */
spin_lock_irqsave(&iommu->register_lock, flags);
- dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM);
+ sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
+ sts |= DMA_FECTL_IM;
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
spin_unlock_irqrestore(&iommu->register_lock, flags);
- iommu->msi.msi_attrib.masked = 1;
+ iommu->msi.msi_attrib.host_masked = 1;
}
static unsigned int dma_msi_startup(struct irq_desc *desc)
@@ -1053,8 +1059,7 @@ static void dma_msi_set_affinity(struct irq_desc *desc, const cpumask_t *mask)
spin_lock_irqsave(&iommu->register_lock, flags);
dmar_writel(iommu->reg, DMAR_FEDATA_REG, msg.data);
- dmar_writel(iommu->reg, DMAR_FEADDR_REG, msg.address_lo);
- dmar_writel(iommu->reg, DMAR_FEUADDR_REG, msg.address_hi);
+ dmar_writeq(iommu->reg, DMAR_FEADDR_REG, msg.address);
spin_unlock_irqrestore(&iommu->register_lock, flags);
}
@@ -1802,17 +1807,13 @@ static void iommu_set_pgd(struct domain *d)
struct hvm_iommu *hd = domain_hvm_iommu(d);
mfn_t pgd_mfn;
- ASSERT( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled );
-
- if ( !iommu_use_hap_pt(d) )
- return;
-
pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d)));
hd->arch.pgd_maddr = pagetable_get_paddr(pagetable_from_mfn(pgd_mfn));
}
static int rmrr_identity_mapping(struct domain *d, bool_t map,
- const struct acpi_rmrr_unit *rmrr)
+ const struct acpi_rmrr_unit *rmrr,
+ u32 flag)
{
unsigned long base_pfn = rmrr->base_address >> PAGE_SHIFT_4K;
unsigned long end_pfn = PAGE_ALIGN_4K(rmrr->end_address) >> PAGE_SHIFT_4K;
@@ -1844,7 +1845,7 @@ static int rmrr_identity_mapping(struct domain *d, bool_t map,
while ( base_pfn < end_pfn )
{
- if ( intel_iommu_unmap_page(d, base_pfn) )
+ if ( clear_identity_p2m_entry(d, base_pfn) )
ret = -ENXIO;
base_pfn++;
}
@@ -1860,8 +1861,7 @@ static int rmrr_identity_mapping(struct domain *d, bool_t map,
while ( base_pfn < end_pfn )
{
- int err = intel_iommu_map_page(d, base_pfn, base_pfn,
- IOMMUF_readable|IOMMUF_writable);
+ int err = set_identity_p2m_entry(d, base_pfn, p2m_access_rw, flag);
if ( err )
return err;
@@ -1904,7 +1904,13 @@ static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev)
PCI_BUS(bdf) == pdev->bus &&
PCI_DEVFN2(bdf) == devfn )
{
- ret = rmrr_identity_mapping(pdev->domain, 1, rmrr);
+ /*
+ * iommu_add_device() is only called for the hardware
+ * domain (see xen/drivers/passthrough/pci.c:pci_add_device()).
+ * Since RMRRs are always reserved in the e820 map for the hardware
+ * domain, there shouldn't be a conflict.
+ */
+ ret = rmrr_identity_mapping(pdev->domain, 1, rmrr, 0);
if ( ret )
dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n",
pdev->domain->domain_id);
@@ -1945,7 +1951,11 @@ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
PCI_DEVFN2(bdf) != devfn )
continue;
- rmrr_identity_mapping(pdev->domain, 0, rmrr);
+ /*
+ * The flag does not matter when clearing these mappings, so
+ * it is always safe (and strict) to pass 0 here.
+ */
+ rmrr_identity_mapping(pdev->domain, 0, rmrr, 0);
}
return domain_context_unmap(pdev->domain, devfn, pdev);
@@ -2002,6 +2012,7 @@ static int init_vtd_hw(void)
struct iommu_flush *flush = NULL;
int ret;
unsigned long flags;
+ u32 sts;
/*
* Basic VT-d HW init: set VT-d interrupt, clear VT-d faults.
@@ -2015,7 +2026,9 @@ static int init_vtd_hw(void)
clear_fault_bits(iommu);
spin_lock_irqsave(&iommu->register_lock, flags);
- dmar_writel(iommu->reg, DMAR_FECTL_REG, 0);
+ sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
+ sts &= ~DMA_FECTL_IM;
+ dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
spin_unlock_irqrestore(&iommu->register_lock, flags);
}
@@ -2103,7 +2116,13 @@ static void __hwdom_init setup_hwdom_rmrr(struct domain *d)
spin_lock(&pcidevs_lock);
for_each_rmrr_device ( rmrr, bdf, i )
{
- ret = rmrr_identity_mapping(d, 1, rmrr);
+ /*
+ * Here we are adding a device to the hardware domain.
+ * Since RMRRs are always reserved in the e820 map for the hardware
+ * domain, there shouldn't be a conflict, so it is always safe
+ * (and strict) to pass 0.
+ */
+ ret = rmrr_identity_mapping(d, 1, rmrr, 0);
if ( ret )
dprintk(XENLOG_ERR VTDPREFIX,
"IOMMU: mapping reserved region failed\n");
@@ -2231,11 +2250,9 @@ static int reassign_device_ownership(
/*
* If the device belongs to the hardware domain, and it has RMRR, don't
* remove it from the hardware domain, because BIOS may use RMRR at
- * booting time. Also account for the special casing of USB below (in
- * intel_iommu_assign_device()).
+ * booting time.
*/
- if ( !is_hardware_domain(source) &&
- !is_usb_device(pdev->seg, pdev->bus, pdev->devfn) )
+ if ( !is_hardware_domain(source) )
{
const struct acpi_rmrr_unit *rmrr;
u16 bdf;
@@ -2246,7 +2263,11 @@ static int reassign_device_ownership(
PCI_BUS(bdf) == pdev->bus &&
PCI_DEVFN2(bdf) == devfn )
{
- ret = rmrr_identity_mapping(source, 0, rmrr);
+ /*
+ * The RMRR flag is ignored when removing a device, but it is
+ * always safe (and strict) to pass 0.
+ */
+ ret = rmrr_identity_mapping(source, 0, rmrr, 0);
if ( ret != -ENOENT )
return ret;
}
@@ -2270,7 +2291,7 @@ static int reassign_device_ownership(
}
static int intel_iommu_assign_device(
- struct domain *d, u8 devfn, struct pci_dev *pdev)
+ struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
{
struct acpi_rmrr_unit *rmrr;
int ret = 0, i;
@@ -2280,18 +2301,44 @@ static int intel_iommu_assign_device(
if ( list_empty(&acpi_drhd_units) )
return -ENODEV;
+ seg = pdev->seg;
+ bus = pdev->bus;
+ /*
+ * In rare cases a given RMRR is shared by multiple devices, which
+ * would put the security of the system at risk, so by default we
+ * prevent this sort of device assignment. It can be permitted if
+ * the user sets
+ * "pci = [ 'sbdf, rdm_policy=relaxed' ]"
+ *
+ * TODO: in the future we could introduce a group device assignment
+ * interface to make sure devices sharing an RMRR are assigned to
+ * the same domain together.
+ */
+ for_each_rmrr_device( rmrr, bdf, i )
+ {
+ if ( rmrr->segment == seg &&
+ PCI_BUS(bdf) == bus &&
+ PCI_DEVFN2(bdf) == devfn &&
+ rmrr->scope.devices_cnt > 1 )
+ {
+ bool_t relaxed = !!(flag & XEN_DOMCTL_DEV_RDM_RELAXED);
+
+ printk(XENLOG_GUEST "%s" VTDPREFIX
+ " It's %s to assign %04x:%02x:%02x.%u"
+ " with shared RMRR at %"PRIx64" for Dom%d.\n",
+ relaxed ? XENLOG_WARNING : XENLOG_ERR,
+ relaxed ? "risky" : "disallowed",
+ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ rmrr->base_address, d->domain_id);
+ if ( !relaxed )
+ return -EPERM;
+ }
+ }
+
ret = reassign_device_ownership(hardware_domain, d, devfn, pdev);
if ( ret )
return ret;
- /* FIXME: Because USB RMRR conflicts with guest bios region,
- * ignore USB RMRR temporarily.
- */
- seg = pdev->seg;
- bus = pdev->bus;
- if ( is_usb_device(seg, bus, pdev->devfn) )
- return 0;
-
/* Setup rmrr identity mapping */
for_each_rmrr_device( rmrr, bdf, i )
{
@@ -2299,7 +2346,7 @@ static int intel_iommu_assign_device(
PCI_BUS(bdf) == bus &&
PCI_DEVFN2(bdf) == devfn )
{
- ret = rmrr_identity_mapping(d, 1, rmrr);
+ ret = rmrr_identity_mapping(d, 1, rmrr, flag);
if ( ret )
{
reassign_device_ownership(d, hardware_domain, devfn, pdev);
@@ -2495,6 +2542,7 @@ const struct iommu_ops intel_iommu_ops = {
.crash_shutdown = vtd_crash_shutdown,
.iotlb_flush = intel_iommu_iotlb_flush,
.iotlb_flush_all = intel_iommu_iotlb_flush_all,
+ .get_reserved_device_memory = intel_iommu_get_reserved_device_memory,
.dump_p2m_table = vtd_dump_p2m_table,
};
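
The dma_msi_mask()/dma_msi_unmask() hunks above switch from writing the
whole fault-event control register to a read-modify-write of just
DMA_FECTL_IM, so any other control bits survive. Both paths reduce to
one helper - a sketch only, using the dmar_readl()/dmar_writel()
accessors from this patch (fectl_set_mask is illustrative, not a
function in the source):

    static void fectl_set_mask(struct iommu *iommu, bool_t masked)
    {
        unsigned long flags;
        u32 sts;

        spin_lock_irqsave(&iommu->register_lock, flags);
        sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
        if ( masked )
            sts |= DMA_FECTL_IM;    /* Mask fault-event interrupts. */
        else
            sts &= ~DMA_FECTL_IM;   /* Unmask, leaving other bits alone. */
        dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
        spin_unlock_irqrestore(&iommu->register_lock, flags);
    }
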
diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
index d6e6520..ac71ed1 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Ashok Raj <ashok.raj at intel.com>
*/
@@ -51,17 +50,10 @@
#define DMAR_IRTA_REG 0xB8 /* intr remap */
#define OFFSET_STRIDE (9)
-#define dmar_readl(dmar, reg) readl(dmar + reg)
-#define dmar_writel(dmar, reg, val) writel(val, dmar + reg)
-#define dmar_readq(dmar, reg) ({ \
- u32 lo, hi; \
- lo = dmar_readl(dmar, reg); \
- hi = dmar_readl(dmar, reg + 4); \
- (((u64) hi) << 32) + lo; })
-#define dmar_writeq(dmar, reg, val) do {\
- dmar_writel(dmar, reg, (u32)val); \
- dmar_writel(dmar, reg + 4, (u32)((u64) val >> 32)); \
- } while (0)
+#define dmar_readl(dmar, reg) readl((dmar) + (reg))
+#define dmar_readq(dmar, reg) readq((dmar) + (reg))
+#define dmar_writel(dmar, reg, val) writel(val, (dmar) + (reg))
+#define dmar_writeq(dmar, reg, val) writeq(val, (dmar) + (reg))
#define VER_MAJOR(v) (((v) & 0xf0) >> 4)
#define VER_MINOR(v) ((v) & 0x0f)
@@ -482,7 +474,6 @@ struct qinval_entry {
#define VTD_PAGE_TABLE_LEVEL_3 3
#define VTD_PAGE_TABLE_LEVEL_4 4
-#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
#define MAX_IOMMU_REGS 0xc0
extern struct list_head acpi_drhd_units;
diff --git a/xen/drivers/passthrough/vtd/qinval.c b/xen/drivers/passthrough/vtd/qinval.c
index 4603020..b81b0bd 100644
--- a/xen/drivers/passthrough/vtd/qinval.c
+++ b/xen/drivers/passthrough/vtd/qinval.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Allen Kay <allen.m.kay at intel.com>
* Copyright (C) Xiaohui Xin <xiaohui.xin at intel.com>
diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
index b24fb12..1888843 100644
--- a/xen/drivers/passthrough/vtd/quirks.c
+++ b/xen/drivers/passthrough/vtd/quirks.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Author: Allen Kay <allen.m.kay at intel.com>
*/
@@ -72,6 +71,9 @@ int is_igd_vt_enabled_quirk(void)
{
u16 ggc;
+ if ( !iommu_igfx )
+ return 0;
+
if ( !IS_ILK(ioh_id) )
return 1;
@@ -503,9 +505,13 @@ void pci_vtd_quirk(const struct pci_dev *pdev)
case 0x0040: case 0x0044: case 0x0048: /* Nehalem/Westmere */
case 0x0100: case 0x0104: case 0x0108: /* Sandybridge */
case 0x0150: case 0x0154: case 0x0158: /* Ivybridge */
- case 0x0a04: /* Haswell ULT */
- case 0x0c00: case 0x0c04: case 0x0c08: /* Haswell */
- case 0x1600: case 0x1604: case 0x1608: /* Broadwell */
+ case 0x0a00: case 0x0a04: case 0x0a08: case 0x0a0f: /* Haswell ULT */
+ case 0x0c00: case 0x0c04: case 0x0c08: case 0x0c0f: /* Haswell */
+ case 0x0d00: case 0x0d04: case 0x0d08: case 0x0d0f: /* Haswell */
+ case 0x1600: case 0x1604: case 0x1608: case 0x160f: /* Broadwell */
+ case 0x1610: case 0x1614: case 0x1618: /* Broadwell */
+ case 0x1900: case 0x1904: case 0x1908: case 0x190c: case 0x190f: /* Skylake */
+ case 0x1910: case 0x1918: case 0x191f: /* Skylake */
bar = pci_conf_read32(seg, bus, dev, func, 0x6c);
bar = (bar << 32) | pci_conf_read32(seg, bus, dev, func, 0x68);
pa = bar & 0x7ffffff000UL; /* bits 12...38 */
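
For the device IDs added above, the quirk locates the IGD's register
window by composing a 64-bit base from two 32-bit config-space reads
(upper dword at 0x6c, lower at 0x68) and masking to bits 38:12.
Isolated as a sketch (read_igd_bar is illustrative, not from the
source):

    static u64 read_igd_bar(u16 seg, u8 bus, u8 dev, u8 func)
    {
        u64 bar = pci_conf_read32(seg, bus, dev, func, 0x6c);

        bar = (bar << 32) | pci_conf_read32(seg, bus, dev, func, 0x68);
        return bar & 0x7ffffff000UL;   /* Page-aligned base, bits 12...38. */
    }
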
diff --git a/xen/drivers/passthrough/vtd/utils.c b/xen/drivers/passthrough/vtd/utils.c
index bd14c02..44c4ef5 100644
--- a/xen/drivers/passthrough/vtd/utils.c
+++ b/xen/drivers/passthrough/vtd/utils.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Allen Kay <allen.m.kay at intel.com>
*/
@@ -29,13 +28,6 @@
#include "extern.h"
#include <asm/io_apic.h>
-int is_usb_device(u16 seg, u8 bus, u8 devfn)
-{
- u16 class = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
- PCI_CLASS_DEVICE);
- return (class == 0xc03);
-}
-
/* Disable vt-d protected memory registers. */
void disable_pmr(struct iommu *iommu)
{
diff --git a/xen/drivers/passthrough/vtd/vtd.h b/xen/drivers/passthrough/vtd/vtd.h
index 02e9d78..bb8889f 100644
--- a/xen/drivers/passthrough/vtd/vtd.h
+++ b/xen/drivers/passthrough/vtd/vtd.h
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Allen Kay <allen.m.kay at intel.com>
* Copyright (C) Weidong Han <weidong.han at intel.com>
diff --git a/xen/drivers/passthrough/vtd/x86/ats.c b/xen/drivers/passthrough/vtd/x86/ats.c
index 6b0632b..7c797f6 100644
--- a/xen/drivers/passthrough/vtd/x86/ats.c
+++ b/xen/drivers/passthrough/vtd/x86/ats.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Author: Allen Kay <allen.m.kay at intel.com>
*/
diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
index 109234e..c0d6aab 100644
--- a/xen/drivers/passthrough/vtd/x86/vtd.c
+++ b/xen/drivers/passthrough/vtd/x86/vtd.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Allen Kay <allen.m.kay at intel.com>
* Copyright (C) Weidong Han <weidong.han at intel.com>
@@ -41,7 +40,7 @@ boolean_param("iommu_inclusive_mapping", iommu_inclusive_mapping);
void *map_vtd_domain_page(u64 maddr)
{
- return map_domain_page(maddr >> PAGE_SHIFT_4K);
+ return map_domain_page(_mfn(paddr_to_pfn(maddr)));
}
void unmap_vtd_domain_page(void *va)
diff --git a/xen/drivers/passthrough/x86/ats.c b/xen/drivers/passthrough/x86/ats.c
index 436eada..40c9f40 100644
--- a/xen/drivers/passthrough/x86/ats.c
+++ b/xen/drivers/passthrough/x86/ats.c
@@ -9,8 +9,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/sched.h>
diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c
index ce0ca5a..8cbb655 100644
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -9,8 +9,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/sched.h>
@@ -56,13 +55,20 @@ int arch_iommu_populate_page_table(struct domain *d)
while ( !rc && (page = page_list_remove_head(&d->page_list)) )
{
- if ( is_hvm_domain(d) ||
+ if ( has_hvm_container_domain(d) ||
(page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page )
{
- BUG_ON(SHARED_M2P(mfn_to_gmfn(d, page_to_mfn(page))));
- rc = hd->platform_ops->map_page(
- d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
- IOMMUF_readable|IOMMUF_writable);
+ unsigned long mfn = page_to_mfn(page);
+ unsigned long gfn = mfn_to_gmfn(d, mfn);
+
+ if ( gfn != INVALID_MFN )
+ {
+ ASSERT(!(gfn >> DEFAULT_DOMAIN_ADDRESS_WIDTH));
+ BUG_ON(SHARED_M2P(gfn));
+ rc = hd->platform_ops->map_page(d, gfn, mfn,
+ IOMMUF_readable |
+ IOMMUF_writable);
+ }
if ( rc )
{
page_list_add(page, &d->page_list);
@@ -85,8 +91,9 @@ int arch_iommu_populate_page_table(struct domain *d)
* first few entries.
*/
page_list_move(&d->page_list, &d->arch.relmem_list);
- while ( (page = page_list_first(&d->page_list)) != NULL &&
- (page->count_info & (PGC_state|PGC_broken)) )
+ while ( !page_list_empty(&d->page_list) &&
+ (page = page_list_first(&d->page_list),
+ (page->count_info & (PGC_state|PGC_broken))) )
{
page_list_del(page, &d->page_list);
page_list_add_tail(page, &d->arch.relmem_list);
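
The reworked loop guard above is worth isolating: emptiness is tested
before the head is peeked at, so the head pointer can never be
dereferenced on an empty list, and the comma operator sequences the
peek before the flag test. The same shape with hypothetical
singly-linked helpers:

    struct node { struct node *next; unsigned long flags; };
    #define NODE_BROKEN 0x1

    /* Move leading "broken" nodes from src to dst, never touching the
     * head of an empty list. */
    static void sweep(struct node **src, struct node **dst)
    {
        struct node *n;

        while ( *src != NULL &&
                (n = *src, n->flags & NODE_BROKEN) )
        {
            *src = n->next;    /* Unlink from the source list... */
            n->next = *dst;    /* ...and push onto the destination. */
            *dst = n;
        }
    }
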
diff --git a/xen/include/Makefile b/xen/include/Makefile
index f7ccbc9..6664107 100644
--- a/xen/include/Makefile
+++ b/xen/include/Makefile
@@ -21,12 +21,12 @@ headers-y := \
compat/vcpu.h \
compat/version.h \
compat/xen.h \
- compat/xencomm.h \
compat/xenoprof.h
+headers-$(CONFIG_X86) += compat/arch-x86/pmu.h
headers-$(CONFIG_X86) += compat/arch-x86/xen-mca.h
headers-$(CONFIG_X86) += compat/arch-x86/xen.h
headers-$(CONFIG_X86) += compat/arch-x86/xen-$(compat-arch-y).h
-headers-y += compat/arch-$(compat-arch-y).h compat/xlat.h
+headers-y += compat/arch-$(compat-arch-y).h compat/pmu.h compat/xlat.h
headers-$(FLASK_ENABLE) += compat/xsm/flask_op.h
cppflags-y := -include public/xen-compat.h
@@ -88,13 +88,33 @@ compat/xlat.h: $(addprefix compat/.xlat/,$(xlat-y)) Makefile
ifeq ($(XEN_TARGET_ARCH),$(XEN_COMPILE_ARCH))
-all: headers.chk
+all: headers.chk headers++.chk
-headers.chk: $(filter-out public/arch-% public/%ctl.h public/xsm/% public/%hvm/save.h, $(wildcard public/*.h public/*/*.h) $(public-y)) Makefile
- for i in $(filter %.h,$^); do $(CC) -ansi -include stdint.h -Wall -W -Werror -S -o /dev/null -x c $$i || exit 1; echo $$i; done >$@.new
+PUBLIC_HEADERS := $(filter-out public/arch-% public/dom0_ops.h, $(wildcard public/*.h public/*/*.h) $(public-y))
+
+PUBLIC_ANSI_HEADERS := $(filter-out public/%ctl.h public/xsm/% public/%hvm/save.h, $(PUBLIC_HEADERS))
+
+headers.chk: $(PUBLIC_ANSI_HEADERS) Makefile
+ for i in $(filter %.h,$^); do \
+ $(CC) -x c -ansi -Wall -Werror -include stdint.h \
+ -S -o /dev/null $$i || exit 1; \
+ echo $$i; \
+ done >$@.new
+ mv $@.new $@
+
+headers++.chk: $(PUBLIC_HEADERS) Makefile
+ if $(CXX) -v >/dev/null 2>&1; then \
+ for i in $(filter %.h,$^); do \
+ echo '#include "'$$i'"' \
+ | $(CXX) -x c++ -std=gnu++98 -Wall -Werror -D__XEN_TOOLS__ \
+ -include stdint.h -include public/xen.h -S -o /dev/null - \
+ || exit 1; \
+ echo $$i; \
+ done ; \
+ fi >$@.new
mv $@.new $@
endif
clean::
- rm -rf compat headers.chk
+ rm -rf compat headers.chk headers++.chk
diff --git a/xen/include/asm-arm/arm32/bitops.h b/xen/include/asm-arm/arm32/bitops.h
index 0d05258..8be3564 100644
--- a/xen/include/asm-arm/arm32/bitops.h
+++ b/xen/include/asm-arm/arm32/bitops.h
@@ -15,6 +15,8 @@ extern int _test_and_change_bit(int nr, volatile void * p);
#define test_and_clear_bit(n,p) _test_and_clear_bit(n,p)
#define test_and_change_bit(n,p) _test_and_change_bit(n,p)
+#define flsl fls
+
/*
* Little endian assembly bitops. nr = 0 -> byte 0 bit 0.
*/
diff --git a/xen/include/asm-arm/arm32/page.h b/xen/include/asm-arm/arm32/page.h
index a07e217..bccdbfc 100644
--- a/xen/include/asm-arm/arm32/page.h
+++ b/xen/include/asm-arm/arm32/page.h
@@ -103,11 +103,14 @@ static inline uint64_t gva_to_ma_par(vaddr_t va, unsigned int flags)
WRITE_CP64(tmp, PAR);
return par;
}
-static inline uint64_t gva_to_ipa_par(vaddr_t va)
+static inline uint64_t gva_to_ipa_par(vaddr_t va, unsigned int flags)
{
uint64_t par, tmp;
tmp = READ_CP64(PAR);
- WRITE_CP32(va, ATS1CPR);
+ if ( (flags & GV2M_WRITE) == GV2M_WRITE )
+ WRITE_CP32(va, ATS1CPW);
+ else
+ WRITE_CP32(va, ATS1CPR);
isb(); /* Ensure result is available. */
par = READ_CP64(PAR);
WRITE_CP64(tmp, PAR);
diff --git a/xen/include/asm-arm/arm32/spinlock.h b/xen/include/asm-arm/arm32/spinlock.h
deleted file mode 100644
index bc0343c..0000000
--- a/xen/include/asm-arm/arm32/spinlock.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef __ASM_ARM32_SPINLOCK_H
-#define __ASM_ARM32_SPINLOCK_H
-
-static inline void dsb_sev(void)
-{
- __asm__ __volatile__ (
- "dsb\n"
- "sev\n"
- );
-}
-
-typedef struct {
- volatile unsigned int lock;
-} raw_spinlock_t;
-
-#define _RAW_SPIN_LOCK_UNLOCKED { 0 }
-
-#define _raw_spin_is_locked(x) ((x)->lock != 0)
-
-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
-{
- ASSERT(_raw_spin_is_locked(lock));
-
- smp_mb();
-
- __asm__ __volatile__(
-" str %1, [%0]\n"
- :
- : "r" (&lock->lock), "r" (0)
- : "cc");
-
- dsb_sev();
-}
-
-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
-{
- unsigned long contended, res;
-
- do {
- __asm__ __volatile__(
- " ldrex %0, [%2]\n"
- " teq %0, #0\n"
- " strexeq %1, %3, [%2]\n"
- " movne %1, #0\n"
- : "=&r" (contended), "=r" (res)
- : "r" (&lock->lock), "r" (1)
- : "cc");
- } while (res);
-
- if (!contended) {
- smp_mb();
- return 1;
- } else {
- return 0;
- }
-}
-
-#endif /* __ASM_SPINLOCK_H */
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/include/asm-arm/arm64/bitops.h b/xen/include/asm-arm/arm64/bitops.h
index b43931d..6bf1922 100644
--- a/xen/include/asm-arm/arm64/bitops.h
+++ b/xen/include/asm-arm/arm64/bitops.h
@@ -32,6 +32,17 @@ static /*__*/always_inline unsigned long __ffs(unsigned long word)
*/
#define ffz(x) __ffs(~(x))
+static inline int flsl(unsigned long x)
+{
+ int ret;
+
+ if (__builtin_constant_p(x))
+ return generic_flsl(x);
+
+ asm("clz\t%0, %1" : "=r" (ret) : "r" (x));
+ return BITS_PER_LONG - ret;
+}
+
/* Based on linux/include/asm-generic/bitops/find.h */
#ifndef find_next_bit
diff --git a/xen/include/asm-arm/arm64/page.h b/xen/include/asm-arm/arm64/page.h
index e7a761d..29a32cf 100644
--- a/xen/include/asm-arm/arm64/page.h
+++ b/xen/include/asm-arm/arm64/page.h
@@ -98,11 +98,14 @@ static inline uint64_t gva_to_ma_par(vaddr_t va, unsigned int flags)
return par;
}
-static inline uint64_t gva_to_ipa_par(vaddr_t va)
+static inline uint64_t gva_to_ipa_par(vaddr_t va, unsigned int flags)
{
uint64_t par, tmp = READ_SYSREG64(PAR_EL1);
- asm volatile ("at s1e1r, %0;" : : "r" (va));
+ if ( (flags & GV2M_WRITE) == GV2M_WRITE )
+ asm volatile ("at s1e1w, %0;" : : "r" (va));
+ else
+ asm volatile ("at s1e1r, %0;" : : "r" (va));
isb();
par = READ_SYSREG64(PAR_EL1);
WRITE_SYSREG64(tmp, PAR_EL1);
diff --git a/xen/include/asm-arm/arm64/spinlock.h b/xen/include/asm-arm/arm64/spinlock.h
deleted file mode 100644
index 5ae034d..0000000
--- a/xen/include/asm-arm/arm64/spinlock.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Derived from Linux arch64 spinlock.h which is:
- * Copyright (C) 2012 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ASM_ARM64_SPINLOCK_H
-#define __ASM_ARM64_SPINLOCK_H
-
-typedef struct {
- volatile unsigned int lock;
-} raw_spinlock_t;
-
-#define _RAW_SPIN_LOCK_UNLOCKED { 0 }
-
-#define _raw_spin_is_locked(x) ((x)->lock != 0)
-
-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
-{
- ASSERT(_raw_spin_is_locked(lock));
-
- asm volatile(
- " stlr %w1, %0\n"
- : "=Q" (lock->lock) : "r" (0) : "memory");
-}
-
-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
-{
- unsigned int tmp;
-
- asm volatile(
- "2: ldaxr %w0, %1\n"
- " cbnz %w0, 1f\n"
- " stxr %w0, %w2, %1\n"
- " cbnz %w0, 2b\n"
- "1:\n"
- : "=&r" (tmp), "+Q" (lock->lock)
- : "r" (1)
- : "cc", "memory");
-
- return !tmp;
-}
-
-#endif /* __ASM_SPINLOCK_H */
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/include/asm-arm/atomic.h b/xen/include/asm-arm/atomic.h
index 7d15fb0..5a38c67 100644
--- a/xen/include/asm-arm/atomic.h
+++ b/xen/include/asm-arm/atomic.h
@@ -23,6 +23,17 @@ static inline void name(volatile type *addr, type val) \
: reg (val)); \
}
+#define build_add_sized(name, size, width, type, reg) \
+static inline void name(volatile type *addr, type val) \
+{ \
+ type t; \
+ asm volatile("ldr" size " %"width"1,%0\n" \
+ "add %"width"1,%"width"1,%"width"2\n" \
+ "str" size " %"width"1,%0" \
+ : "=m" (*(volatile type *)addr), "=r" (t) \
+ : reg (val)); \
+}
+
#if defined (CONFIG_ARM_32)
#define BYTE ""
#define WORD ""
@@ -46,6 +57,10 @@ build_atomic_read(read_u64_atomic, "x", uint64_t, "=r")
build_atomic_write(write_u64_atomic, "x", uint64_t, "r")
#endif
+build_add_sized(add_u8_sized, "b", BYTE, uint8_t, "ri")
+build_add_sized(add_u16_sized, "h", WORD, uint16_t, "ri")
+build_add_sized(add_u32_sized, "", WORD, uint32_t, "ri")
+
void __bad_atomic_size(void);
#define read_atomic(p) ({ \
@@ -70,6 +85,17 @@ void __bad_atomic_size(void);
__x; \
})
+#define add_sized(p, x) ({ \
+ typeof(*(p)) __x = (x); \
+ switch ( sizeof(*(p)) ) \
+ { \
+ case 1: add_u8_sized((uint8_t *)(p), __x); break; \
+ case 2: add_u16_sized((uint16_t *)(p), __x); break; \
+ case 4: add_u32_sized((uint32_t *)(p), __x); break; \
+ default: __bad_atomic_size(); break; \
+ } \
+})
+
/*
* NB. I've pushed the volatile qualifier into the operations. This allows
* fast accessors such as _atomic_read() and _atomic_set() which don't give
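
Usage-wise, the new add_sized() picks the right single-location
load/add/store sequence for the field's width at compile time; it makes
no atomicity or ordering promise beyond the individual accesses. A
small usage sketch (struct stats is hypothetical):

    struct stats {
        uint16_t events;
    };

    static void bump(struct stats *s)
    {
        /* sizeof(s->events) == 2 selects add_u16_sized(), i.e. a
         * ldrh/add/strh sequence on just this field. */
        add_sized(&s->events, 1);
    }
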
diff --git a/xen/include/asm-arm/bitops.h b/xen/include/asm-arm/bitops.h
index d69a7c3..bda8898 100644
--- a/xen/include/asm-arm/bitops.h
+++ b/xen/include/asm-arm/bitops.h
@@ -101,53 +101,25 @@ static inline int test_bit(int nr, const volatile void *addr)
return 1UL & (p[BIT_WORD(nr)] >> (nr & (BITS_PER_WORD-1)));
}
-static inline int constant_fls(int x)
-{
- int r = 32;
-
- if (!x)
- return 0;
- if (!(x & 0xffff0000u)) {
- x <<= 16;
- r -= 16;
- }
- if (!(x & 0xff000000u)) {
- x <<= 8;
- r -= 8;
- }
- if (!(x & 0xf0000000u)) {
- x <<= 4;
- r -= 4;
- }
- if (!(x & 0xc0000000u)) {
- x <<= 2;
- r -= 2;
- }
- if (!(x & 0x80000000u)) {
- x <<= 1;
- r -= 1;
- }
- return r;
-}
-
/*
* On ARMv5 and above those functions can be implemented around
* the clz instruction for much better code efficiency.
*/
-static inline int fls(int x)
+static inline int fls(unsigned int x)
{
int ret;
if (__builtin_constant_p(x))
- return constant_fls(x);
+ return generic_fls(x);
asm("clz\t%"__OP32"0, %"__OP32"1" : "=r" (ret) : "r" (x));
return 32 - ret;
}
-#define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); })
+#define ffs(x) ({ unsigned int __t = (x); fls(__t & -__t); })
+#define ffsl(x) ({ unsigned long __t = (x); flsl(__t & -__t); })
/**
* find_first_set_bit - find the first set bit in @word
@@ -158,7 +130,7 @@ static inline int fls(int x)
*/
static inline unsigned int find_first_set_bit(unsigned long word)
{
- return ffs(word) - 1;
+ return ffsl(word) - 1;
}
/**
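
The new ffs()/ffsl() definitions lean on a classic identity: x & -x
isolates the lowest set bit, so fls()/flsl() of that value is exactly
the 1-based index of the first set bit. A standalone check with a
portable fls stand-in:

    #include <assert.h>

    static int portable_fls(unsigned int x)  /* Stand-in for clz-based fls(). */
    {
        int r = 0;

        while ( x )
        {
            x >>= 1;
            r++;
        }
        return r;                            /* 0 when x == 0. */
    }

    int main(void)
    {
        unsigned int x = 0x48;               /* Lowest set bit: bit 3. */

        assert((x & -x) == 0x8);
        assert(portable_fls(x & -x) == 4);   /* ffs() is 1-based. */
        return 0;
    }
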
diff --git a/xen/include/asm-arm/cadence-uart.h b/xen/include/asm-arm/cadence-uart.h
new file mode 100644
index 0000000..48680ee
--- /dev/null
+++ b/xen/include/asm-arm/cadence-uart.h
@@ -0,0 +1,55 @@
+/*
+ * xen/include/asm-arm/cadence-uart.h
+ *
+ * Written by Edgar E. Iglesias <edgar.iglesias at xilinx.com>
+ * Copyright (C) 2015 Xilinx Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __ASM_ARM_CADENCE_UART_H__
+#define __ASM_ARM_CADENCE_UART_H__
+
+#define R_UART_CR 0x00
+#define UART_CR_RX_RST 0x01
+#define UART_CR_TX_RST 0x02
+#define UART_CR_RX_ENABLE 0x04
+#define UART_CR_RX_DISABLE 0x08
+#define UART_CR_TX_ENABLE 0x10
+#define UART_CR_TX_DISABLE 0x20
+
+#define R_UART_MR 0x04
+#define UART_MR_NO_PARITY 0x20
+
+#define R_UART_IER 0x08
+#define R_UART_IDR 0x0C
+#define R_UART_IMR 0x10
+#define R_UART_CISR 0x14
+#define R_UART_RTRIG 0x20
+#define R_UART_SR 0x2C
+#define UART_SR_INTR_RTRIG 0x01
+#define UART_SR_INTR_REMPTY 0x02
+#define UART_SR_INTR_TEMPTY 0x08
+#define UART_SR_INTR_TFUL 0x10
+
+#define R_UART_TX 0x30
+#define R_UART_RX 0x30
+
+#endif /* __ASM_ARM_CADENCE_UART_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-arm/config.h b/xen/include/asm-arm/config.h
index 264e2c1..817c216 100644
--- a/xen/include/asm-arm/config.h
+++ b/xen/include/asm-arm/config.h
@@ -47,8 +47,11 @@
#define NR_CPUS 128
#endif
+#ifdef CONFIG_ARM_64
+#define MAX_VIRT_CPUS 128
+#else
#define MAX_VIRT_CPUS 8
-#define MAX_HVM_VCPUS MAX_VIRT_CPUS
+#endif
#define asmlinkage /* Nothing needed */
@@ -187,6 +190,8 @@
#define PAGE_MASK (~(PAGE_SIZE-1))
#define PAGE_FLAG_MASK (~0)
+#define NR_hypercalls 64
+
#define STACK_ORDER 3
#define STACK_SIZE (PAGE_SIZE << STACK_ORDER)
@@ -196,8 +201,6 @@ extern unsigned long xenheap_phys_end;
extern unsigned long frametable_virt_end;
#endif
-#define supervisor_mode_kernel (0)
-
#define watchdog_disable() ((void)0)
#define watchdog_enable() ((void)0)
diff --git a/xen/include/asm-arm/cpregs.h b/xen/include/asm-arm/cpregs.h
index f1100c8..e5cb00c 100644
--- a/xen/include/asm-arm/cpregs.h
+++ b/xen/include/asm-arm/cpregs.h
@@ -83,16 +83,22 @@
#define DBGBVR1 p14,0,c0,c1,4 /* Breakpoint Value 1 */
#define DBGBCR1 p14,0,c0,c1,5 /* Breakpoint Control 1 */
#define DBGOSLAR p14,0,c1,c0,4 /* OS Lock Access */
+#define DBGOSLSR p14,0,c1,c1,4 /* OS Lock Status Register */
#define DBGOSDLR p14,0,c1,c3,4 /* OS Double Lock */
+#define DBGPRCR p14,0,c1,c4,4 /* Debug Power Control Register */
/* CP14 CR0: */
#define TEECR p14,6,c0,c0,0 /* ThumbEE Configuration Register */
/* CP14 CR1: */
+#define DBGDRAR64 p14,0,c1 /* Debug ROM Address Register (64-bit access) */
+#define DBGDRAR p14,0,c1,c0,0 /* Debug ROM Address Register (32-bit access) */
#define TEEHBR p14,6,c1,c0,0 /* ThumbEE Handler Base Register */
#define JOSCR p14,7,c1,c0,0 /* Jazelle OS Control Register */
/* CP14 CR2: */
+#define DBGDSAR64 p14,0,c2 /* Debug Self Address Offset Register (64-bit access) */
+#define DBGDSAR p14,0,c2,c0,0 /* Debug Self Address Offset Register (32-bit access) */
#define JMCR p14,7,c2,c0,0 /* Jazelle Main Configuration Register */
@@ -222,8 +228,8 @@
#define PMCEID0 p15,0,c9,c12,6 /* Perf. Mon. Common Event Identification register 0 */
#define PMCEID1 p15,0,c9,c12,7 /* Perf. Mon. Common Event Identification register 1 */
#define PMCCNTR p15,0,c9,c13,0 /* Perf. Mon. Cycle Count Register */
-#define PMXEVCNTR p15,0,c9,c13,1 /* Perf. Mon. Event Type Select Register */
-#define PMXEVCNR p15,0,c9,c13,2 /* Perf. Mon. Event Count Register */
+#define PMXEVTYPER p15,0,c9,c13,1 /* Perf. Mon. Event Type Select Register */
+#define PMXEVCNTR p15,0,c9,c13,2 /* Perf. Mon. Event Count Register */
#define PMUSERENR p15,0,c9,c14,0 /* Perf. Mon. User Enable Register */
#define PMINTENSET p15,0,c9,c14,1 /* Perf. Mon. Interrupt Enable Set Register */
#define PMINTENCLR p15,0,c9,c14,2 /* Perf. Mon. Interrupt Enable Clear Register */
diff --git a/xen/include/asm-arm/device.h b/xen/include/asm-arm/device.h
index 74a80c6..5d0a4cd 100644
--- a/xen/include/asm-arm/device.h
+++ b/xen/include/asm-arm/device.h
@@ -2,10 +2,36 @@
#define __ASM_ARM_DEVICE_H
#include <xen/init.h>
-#include <xen/device_tree.h>
enum device_type
{
+ DEV_DT,
+};
+
+struct dev_archdata {
+ void *iommu; /* IOMMU private data */
+};
+
+/* struct device - The basic device structure */
+struct device
+{
+ enum device_type type;
+#ifdef HAS_DEVICE_TREE
+ struct dt_device_node *of_node; /* Used by drivers imported from Linux */
+#endif
+ struct dev_archdata archdata;
+};
+
+typedef struct device device_t;
+
+#include <xen/device_tree.h>
+
+/* TODO: Correctly implement dev_is_pci when PCI is supported on ARM */
+#define dev_is_pci(dev) ((void)(dev), 0)
+#define dev_is_dt(dev) ((dev)->type == DEV_DT)
+
+enum device_class
+{
DEVICE_SERIAL,
DEVICE_IOMMU,
DEVICE_GIC,
@@ -16,10 +42,10 @@ enum device_type
struct device_desc {
/* Device name */
const char *name;
- /* Device type */
- enum device_type type;
- /* Array of device tree 'compatible' strings */
- const char *const *compatible;
+ /* Device class */
+ enum device_class class;
+ /* List of devices supported by this driver */
+ const struct dt_device_match *dt_match;
/* Device initialization */
int (*init)(struct dt_device_node *dev, const void *data);
};
@@ -27,12 +53,12 @@ struct device_desc {
/**
* device_init - Initialize a device
* @dev: device to initialize
- * @type: type of the device (serial, network...)
+ * @class: class of the device (serial, network...)
* @data: specific data for initializing the device
*
* Return 0 on success.
*/
-int __init device_init(struct dt_device_node *dev, enum device_type type,
+int __init device_init(struct dt_device_node *dev, enum device_class class,
const void *data);
/**
@@ -41,13 +67,13 @@ int __init device_init(struct dt_device_node *dev, enum device_type type,
*
* Return the device class on success or DEVICE_ANY on failure
*/
-enum device_type device_get_type(const struct dt_device_node *dev);
+enum device_class device_get_class(const struct dt_device_node *dev);
-#define DT_DEVICE_START(_name, _namestr, _type) \
+#define DT_DEVICE_START(_name, _namestr, _class) \
static const struct device_desc __dev_desc_##_name __used \
-__attribute__((__section__(".dev.info"))) = { \
+__section(".dev.info") = { \
.name = _namestr, \
- .type = _type, \
+ .class = _class, \
#define DT_DEVICE_END \
};
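To show how the reworked descriptor is meant to be filled in, a hedged sketch
of a driver registration follows; the compatible string, match table and init
function are illustrative only, not a real driver:

    /* Hypothetical serial driver registration -- a sketch only. */
    static const struct dt_device_match example_uart_dt_match[] =
    {
        DT_MATCH_COMPATIBLE("vendor,example-uart"),
        { /* sentinel */ },
    };

    static int __init example_uart_init(struct dt_device_node *dev,
                                        const void *data)
    {
        /* Probe the device described by @dev here. */
        return 0;
    }

    DT_DEVICE_START(example_uart, "EXAMPLE UART", DEVICE_SERIAL)
        .dt_match = example_uart_dt_match,
        .init = example_uart_init,
    DT_DEVICE_END
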
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 8b7dd85..56aa208 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -17,6 +17,7 @@ struct hvm_domain
{
uint64_t params[HVM_NR_PARAMS];
struct hvm_iommu iommu;
+ bool_t introspection_enabled;
} __cacheline_aligned;
#ifdef CONFIG_ARM_64
@@ -76,6 +77,8 @@ struct arch_domain
} virt_timer_base;
struct {
+ /* Version of the vGIC */
+ enum gic_version version;
/* GIC HW version specific vGIC driver handler */
const struct vgic_ops *handler;
/*
@@ -90,6 +93,7 @@ struct arch_domain
spinlock_t lock;
int ctlr;
int nr_spis; /* Number of SPIs */
+ unsigned long *allocated_irqs; /* bitmap of IRQs allocated */
struct vgic_irq_rank *shared_irqs;
/*
* SPIs are domain global, SGIs and PPIs are per-VCPU and stored in
@@ -99,13 +103,16 @@ struct arch_domain
/* Base address for guest GIC */
paddr_t dbase; /* Distributor base address */
paddr_t cbase; /* CPU base address */
-#ifdef CONFIG_ARM_64
+#ifdef HAS_GICV3
/* GIC V3 addressing */
- paddr_t dbase_size; /* Distributor base size */
- paddr_t rbase[MAX_RDIST_COUNT]; /* Re-Distributor base address */
- paddr_t rbase_size[MAX_RDIST_COUNT]; /* Re-Distributor size */
- uint32_t rdist_stride; /* Re-Distributor stride */
- int rdist_count; /* No. of Re-Distributors */
+ /* List of contiguous regions occupied by the redistributors */
+ struct vgic_rdist_region {
+ paddr_t base; /* Base address */
+ paddr_t size; /* Size */
+ unsigned int first_cpu; /* First CPU handled */
+ } rdist_regions[MAX_RDIST_COUNT];
+ int nr_regions; /* Number of rdist regions */
+ uint32_t rdist_stride; /* Re-Distributor stride */
#endif
} vgic;
@@ -236,6 +243,11 @@ struct arch_vcpu
* lr_pending is a subset of vgic.inflight_irqs. */
struct list_head lr_pending;
spinlock_t lock;
+
+ /* GICv3: redistributor base and flags for this vCPU */
+ paddr_t rdist_base;
+#define VGIC_V3_RDIST_LAST (1 << 0) /* last vCPU of the rdist */
+ uint8_t flags;
} vgic;
/* Timer registers */
@@ -249,6 +261,46 @@ struct arch_vcpu
void vcpu_show_execution_state(struct vcpu *);
void vcpu_show_registers(const struct vcpu *);
+unsigned int domain_max_vcpus(const struct domain *);
+
+/*
+ * Due to a GICv3 restriction, the number of vCPUs within AFF0 is
+ * limited to 16, so only the first 4 bits of AFF0 are legal. We use
+ * the first 2 affinity levels here, expanding the number of vCPUs up
+ * to 4096 (== 16*256), which is more than the number of PEs that
+ * GIC-500 supports.
+ *
+ * Since we don't store the vCPU topology (affinity) in the vMPIDR
+ * at the moment, we map vcpuid to vMPIDR linearly.
+ */
+static inline unsigned int vaffinity_to_vcpuid(register_t vaff)
+{
+ unsigned int vcpuid;
+
+ vaff &= MPIDR_HWID_MASK;
+
+ vcpuid = MPIDR_AFFINITY_LEVEL(vaff, 0);
+ vcpuid |= MPIDR_AFFINITY_LEVEL(vaff, 1) << 4;
+
+ return vcpuid;
+}
+
+static inline register_t vcpuid_to_vaffinity(unsigned int vcpuid)
+{
+ register_t vaff;
+
+ /*
+ * Right now only AFF0 and AFF1 are supported in virtual affinity.
+ * Since only the first 4 bits in AFF0 are used in GICv3, the
+ * available bits are 12 (4+8).
+ */
+ BUILD_BUG_ON(!(MAX_VIRT_CPUS < (1 << 12)));
+
+ vaff = (vcpuid & 0x0f) << MPIDR_LEVEL_SHIFT(0);
+ vaff |= ((vcpuid >> 4) & MPIDR_LEVEL_MASK) << MPIDR_LEVEL_SHIFT(1);
+
+ return vaff;
+}
+
#endif /* __ASM_DOMAIN_H__ */
/*
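A quick worked example of the linear mapping above, as a sketch using only the
two helpers just defined: vCPU id 0x25 encodes AFF0 = 5 and AFF1 = 2, and the
conversion round-trips.

    /* Sketch: check the vcpuid <-> virtual-affinity round-trip. */
    static void check_vaffinity_roundtrip_sketch(void)
    {
        unsigned int vcpuid = 0x25;   /* AFF0 = 5, AFF1 = 2 */
        register_t vaff = vcpuid_to_vaffinity(vcpuid);

        ASSERT(vaffinity_to_vcpuid(vaff) == vcpuid);
    }
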
diff --git a/xen/include/asm-arm/gic.h b/xen/include/asm-arm/gic.h
index 0396a8e..d343abf 100644
--- a/xen/include/asm-arm/gic.h
+++ b/xen/include/asm-arm/gic.h
@@ -153,18 +153,16 @@
#include <xen/irq.h>
#include <asm-arm/vgic.h>
-#define DT_COMPAT_GIC_400 "arm,gic-400"
-#define DT_COMPAT_GIC_CORTEX_A15 "arm,cortex-a15-gic"
-#define DT_COMPAT_GIC_CORTEX_A7 "arm,cortex-a7-gic"
+#define DT_COMPAT_GIC_CORTEX_A15 "arm,cortex-a15-gic"
-#define DT_MATCH_GIC_V2 DT_MATCH_COMPATIBLE(DT_COMPAT_GIC_CORTEX_A15), \
- DT_MATCH_COMPATIBLE(DT_COMPAT_GIC_CORTEX_A7), \
- DT_MATCH_COMPATIBLE(DT_COMPAT_GIC_400)
+#define DT_MATCH_GIC_V2 \
+ DT_MATCH_COMPATIBLE(DT_COMPAT_GIC_CORTEX_A15), \
+ DT_MATCH_COMPATIBLE("arm,cortex-a7-gic"), \
+ DT_MATCH_COMPATIBLE("arm,gic-400")
-#define DT_COMPAT_GIC_V3 "arm,gic-v3"
-
-#define DT_MATCH_GIC_V3 DT_MATCH_COMPATIBLE(DT_COMPAT_GIC_V3)
+#define DT_MATCH_GIC_V3 DT_MATCH_COMPATIBLE("arm,gic-v3")
+#ifdef HAS_GICV3
/*
* GICv3 registers that need to be saved/restored
*/
@@ -174,6 +172,7 @@ struct gic_v3 {
uint32_t apr1[4];
uint64_t lr[16];
};
+#endif
/*
* GICv2 register that needs to be saved/restored
@@ -191,7 +190,9 @@ struct gic_v2 {
*/
union gic_state_data {
struct gic_v2 v2;
+#ifdef HAS_GICV3
struct gic_v3 v3;
+#endif
};
/*
@@ -219,9 +220,13 @@ extern enum gic_version gic_hw_version(void);
/* Program the GIC to route an interrupt */
extern void gic_route_irq_to_xen(struct irq_desc *desc, const cpumask_t *cpu_mask,
unsigned int priority);
-extern void gic_route_irq_to_guest(struct domain *, struct irq_desc *desc,
- const cpumask_t *cpu_mask,
- unsigned int priority);
+extern int gic_route_irq_to_guest(struct domain *, unsigned int virq,
+ struct irq_desc *desc,
+ unsigned int priority);
+
+/* Remove an IRQ passthrough to a guest */
+int gic_remove_irq_from_guest(struct domain *d, unsigned int virq,
+ struct irq_desc *desc);
extern void gic_inject(void);
extern void gic_clear_pending_irqs(struct vcpu *v);
@@ -235,6 +240,10 @@ extern void gic_remove_from_queues(struct vcpu *v, unsigned int virtual_irq);
/* Accept an interrupt from the GIC and dispatch its handler */
extern void gic_interrupt(struct cpu_user_regs *regs, int is_fiq);
+/* Find the interrupt controller and set up the callback to translate
+ * device tree IRQs.
+ */
+extern void gic_preinit(void);
/* Bring up the interrupt controller, and report # cpus attached */
extern void gic_init(void);
/* Bring up a secondary CPU's per-CPU GIC interface */
@@ -287,19 +296,21 @@ struct gic_info {
uint8_t nr_lrs;
/* Maintenance irq number */
unsigned int maintenance_irq;
+ /* Pointer to the device tree node representing the interrupt controller */
+ const struct dt_device_node *node;
};
struct gic_hw_operations {
/* Hold GIC HW information */
const struct gic_info *info;
+ /* Initialize the GIC and the boot CPU */
+ int (*init)(void);
/* Save GIC registers */
void (*save_state)(struct vcpu *);
/* Restore GIC registers */
void (*restore_state)(const struct vcpu *);
/* Dump GIC LR register information */
void (*dump_state)(const struct vcpu *);
- /* Map MMIO region of GIC */
- int (*gicv_setup)(struct domain *);
/* hw_irq_controller to enable/disable/eoi host irq */
hw_irq_controller *gic_host_irq_type;
@@ -339,13 +350,14 @@ struct gic_hw_operations {
unsigned int (*read_apr)(int apr_reg);
/* Secondary CPU init */
int (*secondary_init)(void);
- int (*make_dt_node)(const struct domain *d,
- const struct dt_device_node *node, void *fdt);
+ int (*make_hwdom_dt_node)(const struct domain *d,
+ const struct dt_device_node *node, void *fdt);
};
void register_gic_ops(const struct gic_hw_operations *ops);
-int gic_make_node(const struct domain *d,const struct dt_device_node *node,
- void *fdt);
+int gic_make_hwdom_dt_node(const struct domain *d,
+ const struct dt_device_node *node,
+ void *fdt);
#endif /* __ASSEMBLY__ */
#endif
diff --git a/xen/include/asm-arm/gic_v3_defs.h b/xen/include/asm-arm/gic_v3_defs.h
index b8a1c2e..bf7b239 100644
--- a/xen/include/asm-arm/gic_v3_defs.h
+++ b/xen/include/asm-arm/gic_v3_defs.h
@@ -147,11 +147,20 @@
#define ICH_SGI_IRQMODE_SHIFT 40
#define ICH_SGI_IRQMODE_MASK 0x1
-#define ICH_SGI_TARGET_OTHERS 1
+#define ICH_SGI_TARGET_OTHERS 1UL
#define ICH_SGI_TARGET_LIST 0
#define ICH_SGI_IRQ_SHIFT 24
#define ICH_SGI_IRQ_MASK 0xf
#define ICH_SGI_TARGETLIST_MASK 0xffff
+#define ICH_SGI_AFFx_MASK 0xff
+#define ICH_SGI_AFFINITY_LEVEL(x) (16 * (x))
+
+struct rdist_region {
+ paddr_t base;
+ paddr_t size;
+ void __iomem *map_base;
+};
+
#endif /* __ASM_ARM_GIC_V3_DEFS_H__ */
/*
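The switch from 1 to 1UL above matters because the target mode is shifted into
bit 40 when composing the 64-bit SGI register value, and shifting a plain int
by 40 is undefined behaviour. A sketch, assuming an LP64 target such as arm64:

    #include <stdint.h>

    /* Sketch: compose a "broadcast to others" SGI value. With 1UL the
     * shift is done in 64 bits (unsigned long on LP64); with a plain
     * int 1, a shift by 40 would be undefined. */
    static uint64_t sgi_broadcast_value_sketch(void)
    {
        return ICH_SGI_TARGET_OTHERS << ICH_SGI_IRQMODE_SHIFT;
    }
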
diff --git a/xen/include/asm-arm/grant_table.h b/xen/include/asm-arm/grant_table.h
index 0edad67..5e076cc 100644
--- a/xen/include/asm-arm/grant_table.h
+++ b/xen/include/asm-arm/grant_table.h
@@ -3,8 +3,7 @@
#include <xen/grant_table.h>
-#define INVALID_GFN (-1UL)
-#define INITIAL_NR_GRANT_FRAMES 1
+#define INITIAL_NR_GRANT_FRAMES 4
void gnttab_clear_flag(unsigned long nr, uint16_t *addr);
int create_grant_host_mapping(unsigned long gpaddr,
diff --git a/xen/include/asm-arm/hypercall.h b/xen/include/asm-arm/hypercall.h
index 94a92d4..a0c5a31 100644
--- a/xen/include/asm-arm/hypercall.h
+++ b/xen/include/asm-arm/hypercall.h
@@ -4,7 +4,7 @@
#include <public/domctl.h> /* for arch_do_domctl */
int do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg);
-long do_arm_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg);
+long do_arm_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg);
long subarch_do_domctl(struct xen_domctl *domctl, struct domain *d,
XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl);
diff --git a/xen/include/asm-arm/iommu.h b/xen/include/asm-arm/iommu.h
index 9322f08..9b0e34f 100644
--- a/xen/include/asm-arm/iommu.h
+++ b/xen/include/asm-arm/iommu.h
@@ -9,8 +9,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ARCH_ARM_IOMMU_H__
#define __ARCH_ARM_IOMMU_H__
diff --git a/xen/include/asm-arm/irq.h b/xen/include/asm-arm/irq.h
index 435dfcd..f33c331 100644
--- a/xen/include/asm-arm/irq.h
+++ b/xen/include/asm-arm/irq.h
@@ -15,7 +15,6 @@ struct arch_pirq
};
struct arch_irq_desc {
- int eoi_cpu;
unsigned int type;
};
@@ -37,13 +36,19 @@ void do_IRQ(struct cpu_user_regs *regs, unsigned int irq, int is_fiq);
#define domain_pirq_to_irq(d, pirq) (pirq)
+bool_t is_assignable_irq(unsigned int irq);
+
void init_IRQ(void);
void init_secondary_IRQ(void);
-int route_irq_to_guest(struct domain *d, unsigned int irq,
- const char *devname);
+int route_irq_to_guest(struct domain *d, unsigned int virq,
+ unsigned int irq, const char *devname);
+int release_guest_irq(struct domain *d, unsigned int irq);
+
void arch_move_irqs(struct vcpu *v);
+#define arch_evtchn_bind_pirq(d, pirq) ((void)((d) + (pirq)))
+
/* Set IRQ type for an SPI */
int irq_set_spi_type(unsigned int spi, unsigned int type);
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index d25e485..a95082e 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -5,7 +5,6 @@
#include <xen/kernel.h>
#include <asm/page.h>
#include <public/xen.h>
-#include <xen/domain_page.h>
#include <xen/pdx.h>
/* Align Xen to a 2 MiB boundary. */
@@ -208,6 +207,8 @@ static inline void __iomem *ioremap_wc(paddr_t start, size_t len)
#define pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT)
#define paddr_to_pfn(pa) ((unsigned long)((pa) >> PAGE_SHIFT))
#define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa))
+#define vmap_to_mfn(va) paddr_to_pfn(virt_to_maddr((vaddr_t)va))
+#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))
/* Page-align address and convert to frame number format */
#define paddr_to_pfn_aligned(paddr) paddr_to_pfn(PAGE_ALIGN(paddr))
@@ -274,10 +275,6 @@ static inline void *page_to_virt(const struct page_info *pg)
return mfn_to_virt(page_to_mfn(pg));
}
-struct domain *page_get_owner_and_reference(struct page_info *page);
-void put_page(struct page_info *page);
-int get_page(struct page_info *page, struct domain *domain);
-
struct page_info *get_page_from_gva(struct domain *d, vaddr_t va,
unsigned long flags);
@@ -308,8 +305,6 @@ static inline int relinquish_shared_pages(struct domain *d)
return 0;
}
-#define INVALID_MFN (~0UL)
-
/* Xen always owns P2M on ARM */
#define set_gpfn_from_mfn(mfn, pfn) do { (void) (mfn), (void)(pfn); } while (0)
#define mfn_to_gmfn(_d, mfn) (mfn)
diff --git a/xen/include/asm-arm/monitor.h b/xen/include/asm-arm/monitor.h
new file mode 100644
index 0000000..a3a9703
--- /dev/null
+++ b/xen/include/asm-arm/monitor.h
@@ -0,0 +1,33 @@
+/*
+ * include/asm-arm/monitor.h
+ *
+ * Architecture-specific monitor_op domctl handler.
+ *
+ * Copyright (c) 2015 Tamas K Lengyel (tamas at tklengyel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ASM_ARM_MONITOR_H__
+#define __ASM_ARM_MONITOR_H__
+
+#include <xen/sched.h>
+#include <public/domctl.h>
+
+static inline
+int monitor_domctl(struct domain *d, struct xen_domctl_monitor_op *op)
+{
+ return -ENOSYS;
+}
+
+#endif /* __ASM_ARM_MONITOR_H__ */
diff --git a/xen/include/asm-arm/numa.h b/xen/include/asm-arm/numa.h
index 06a9d5a..a00cb7c 100644
--- a/xen/include/asm-arm/numa.h
+++ b/xen/include/asm-arm/numa.h
@@ -1,11 +1,13 @@
#ifndef __ARCH_ARM_NUMA_H
#define __ARCH_ARM_NUMA_H
+typedef u8 nodeid_t;
+
/* Fake one node for now. See also node_online_map. */
#define cpu_to_node(cpu) 0
#define node_to_cpumask(node) (cpu_online_map)
-static inline __attribute__((pure)) int phys_to_nid(paddr_t addr)
+static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr)
{
return 0;
}
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index da36504..08bdce3 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -2,11 +2,17 @@
#define _XEN_P2M_H
#include <xen/mm.h>
-
+#include <xen/radix-tree.h>
+#include <public/vm_event.h> /* for vm_event_response_t */
+#include <public/memory.h>
#include <xen/p2m-common.h>
+#include <public/memory.h>
#define paddr_bits PADDR_BITS
+/* Holds the bit size of IPAs in p2m tables. */
+extern unsigned int p2m_ipa_bits;
+
struct domain;
extern void memory_type_changed(struct domain *);
@@ -45,9 +51,21 @@ struct p2m_domain {
unsigned long shattered[4];
} stats;
- /* If true, and an access fault comes in and there is no mem_event listener,
+ /* If true, and an access fault comes in and there is no vm_event listener,
* pause domain. Otherwise, remove access restrictions. */
bool_t access_required;
+
+ /* Defines if mem_access is in use for the domain. */
+ bool_t mem_access_enabled;
+
+ /* Default P2M access type for each page in the domain: new pages,
+ * swapped in pages, cleared pages, and pages that are ambiguously
+ * retyped get this access type. See definition of p2m_access_t. */
+ p2m_access_t default_access;
+
+ /* Radix tree to store the p2m_access_t settings, as the PTEs don't
+ * have enough available bits to store this information. */
+ struct radix_tree_root mem_access_settings;
};
/* List of possible type for each page in the p2m entry.
@@ -71,16 +89,30 @@ typedef enum {
} p2m_type_t;
static inline
-void p2m_mem_event_emulate_check(struct vcpu *v,
- const mem_event_response_t *rsp)
+int p2m_mem_access_enable_emulate(struct domain *d)
+{
+ /* Not supported on ARM */
+ return -ENOSYS;
+}
+
+static inline
+int p2m_mem_access_disable_emulate(struct domain *d)
+{
+ /* Not supported on ARM */
+ return -ENOSYS;
+}
+
+static inline
+void p2m_mem_access_emulate_check(struct vcpu *v,
+ const vm_event_response_t *rsp)
{
/* Not supported on ARM. */
-};
+}
static inline
-void p2m_setup_introspection(struct domain *d)
+void p2m_altp2m_check(struct vcpu *v, uint16_t idx)
{
- /* No special setup on ARM. */
+ /* Not supported on ARM. */
}
#define p2m_is_foreign(_t) ((_t) == p2m_map_foreign)
@@ -217,6 +249,21 @@ static inline int get_page_and_type(struct page_info *page,
/* get host p2m table */
#define p2m_get_hostp2m(d) (&(d)->arch.p2m)
+/* vm_event and mem_access are supported on any ARM guest */
+static inline bool_t p2m_mem_access_sanity_check(struct domain *d)
+{
+ return 1;
+}
+
+static inline bool_t p2m_vm_event_sanity_check(struct domain *d)
+{
+ return 1;
+}
+
+/* Send a mem event based on the access. The boolean return value indicates
+ * whether a trap needs to be injected into the guest. */
+bool_t p2m_mem_access_check(paddr_t gpa, vaddr_t gla, const struct npfec npfec);
+
#endif /* _XEN_P2M_H */
/*
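To make the radix-tree comment above concrete, here is a hedged sketch of
storing and fetching a per-gfn access type; it assumes Xen's
radix_tree_insert/radix_tree_lookup and the int-to-pointer helpers behave as
their names suggest:

    /* Sketch: keep a p2m_access_t per gfn in mem_access_settings. */
    static int set_access_sketch(struct p2m_domain *p2m, unsigned long gfn,
                                 p2m_access_t a)
    {
        return radix_tree_insert(&p2m->mem_access_settings, gfn,
                                 radix_tree_int_to_ptr(a));
    }

    static p2m_access_t get_access_sketch(struct p2m_domain *p2m,
                                          unsigned long gfn)
    {
        void *v = radix_tree_lookup(&p2m->mem_access_settings, gfn);

        /* Fall back to the domain-wide default when no entry exists. */
        return v ? radix_tree_ptr_to_int(v) : p2m->default_access;
    }
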
diff --git a/xen/include/asm-arm/page.h b/xen/include/asm-arm/page.h
index 53d4b63..5ecfd07 100644
--- a/xen/include/asm-arm/page.h
+++ b/xen/include/asm-arm/page.h
@@ -2,7 +2,6 @@
#define __ARM_PAGE_H__
#include <xen/config.h>
-#include <xen/errno.h>
#include <public/xen.h>
#include <asm/processor.h>
@@ -65,7 +64,6 @@
#define PAGE_HYPERVISOR (WRITEALLOC)
#define PAGE_HYPERVISOR_NOCACHE (DEV_SHARED)
#define PAGE_HYPERVISOR_WC (DEV_WC)
-#define MAP_SMALL_PAGES PAGE_HYPERVISOR
/*
* Stage 2 Memory Type.
@@ -83,6 +81,7 @@
#ifndef __ASSEMBLY__
+#include <xen/errno.h>
#include <xen/types.h>
#include <xen/lib.h>
@@ -265,6 +264,8 @@ static inline lpae_t mfn_to_xen_entry(unsigned long mfn, unsigned attr)
/* Actual cacheline size on the boot CPU. */
extern size_t cacheline_bytes;
+#define copy_page(dp, sp) memcpy(dp, sp, PAGE_SIZE)
+
/* Functions for flushing medium-sized areas.
* if 'range' is large enough we might want to use model-specific
* full-cache flushes. */
@@ -423,9 +424,9 @@ static inline uint64_t va_to_par(vaddr_t va)
return par;
}
-static inline int gva_to_ipa(vaddr_t va, paddr_t *paddr)
+static inline int gva_to_ipa(vaddr_t va, paddr_t *paddr, unsigned int flags)
{
- uint64_t par = gva_to_ipa_par(va);
+ uint64_t par = gva_to_ipa_par(va, flags);
if ( par & PAR_F )
return -EFAULT;
*paddr = (par & PADDR_MASK & PAGE_MASK) | ((unsigned long) va & ~PAGE_MASK);
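A short usage sketch for the new gva_to_ipa signature; the flags argument
(new in this patch) selects how the stage-1 walk is checked, and the exact
values a caller passes are outside this header:

    /* Sketch: translate a guest VA to an IPA, propagating stage-1 faults. */
    static int lookup_ipa_sketch(vaddr_t gva, unsigned int flags, paddr_t *ipa)
    {
        int rc = gva_to_ipa(gva, ipa, flags);

        if ( rc )      /* -EFAULT: the walk faulted (PAR_F was set) */
            return rc;

        /* *ipa now holds the page address plus the offset within it. */
        return 0;
    }
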
diff --git a/xen/include/asm-arm/perfc.h b/xen/include/asm-arm/perfc.h
new file mode 100644
index 0000000..95c4b2b
--- /dev/null
+++ b/xen/include/asm-arm/perfc.h
@@ -0,0 +1,21 @@
+#ifndef __ASM_PERFC_H__
+#define __ASM_PERFC_H__
+
+static inline void arch_perfc_reset(void)
+{
+}
+
+static inline void arch_perfc_gather(void)
+{
+}
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-arm/perfc_defn.h b/xen/include/asm-arm/perfc_defn.h
new file mode 100644
index 0000000..69fabe7
--- /dev/null
+++ b/xen/include/asm-arm/perfc_defn.h
@@ -0,0 +1,83 @@
+/* This file is legitimately included multiple times. */
+/*#ifndef __XEN_PERFC_DEFN_H__*/
+/*#define __XEN_PERFC_DEFN_H__*/
+
+PERFCOUNTER(invalid_hypercalls, "invalid hypercalls")
+
+PERFCOUNTER(trap_wfi, "trap: wfi")
+PERFCOUNTER(trap_wfe, "trap: wfe")
+PERFCOUNTER(trap_cp15_32, "trap: cp15 32-bit access")
+PERFCOUNTER(trap_cp15_64, "trap: cp15 64-bit access")
+PERFCOUNTER(trap_cp14_32, "trap: cp14 32-bit access")
+PERFCOUNTER(trap_cp14_64, "trap: cp14 64-bit access")
+PERFCOUNTER(trap_cp14_dbg, "trap: cp14 dbg access")
+PERFCOUNTER(trap_cp, "trap: cp access")
+PERFCOUNTER(trap_smc32, "trap: 32-bit smc")
+PERFCOUNTER(trap_hvc32, "trap: 32-bit hvc")
+#ifdef CONFIG_ARM_64
+PERFCOUNTER(trap_smc64, "trap: 64-bit smc")
+PERFCOUNTER(trap_hvc64, "trap: 64-bit hvc")
+PERFCOUNTER(trap_sysreg, "trap: sysreg access")
+#endif
+PERFCOUNTER(trap_iabt, "trap: guest instr abort")
+PERFCOUNTER(trap_dabt, "trap: guest data abort")
+PERFCOUNTER(trap_uncond, "trap: condition failed")
+
+PERFCOUNTER(vpsci_cpu_on, "vpsci: cpu_on")
+PERFCOUNTER(vpsci_cpu_off, "vpsci: cpu_off")
+PERFCOUNTER(vpsci_version, "vpsci: version")
+PERFCOUNTER(vpsci_migrate_info_type, "vpsci: migrate_info_type")
+PERFCOUNTER(vpsci_migrate_info_up_cpu, "vpsci: migrate_info_up_cpu")
+PERFCOUNTER(vpsci_system_off, "vpsci: system_off")
+PERFCOUNTER(vpsci_system_reset, "vpsci: system_reset")
+PERFCOUNTER(vpsci_cpu_suspend, "vpsci: cpu_suspend")
+PERFCOUNTER(vpsci_cpu_affinity_info, "vpsci: cpu_affinity_info")
+PERFCOUNTER(vpsci_cpu_migrate, "vpsci: cpu_migrate")
+
+PERFCOUNTER(vgicd_reads, "vgicd: read")
+PERFCOUNTER(vgicd_writes, "vgicd: write")
+PERFCOUNTER(vgicr_reads, "vgicr: read")
+PERFCOUNTER(vgicr_writes, "vgicr: write")
+PERFCOUNTER(vgic_sysreg_reads, "vgic: sysreg read")
+PERFCOUNTER(vgic_sysreg_writes, "vgic: sysreg write")
+PERFCOUNTER(vgic_sgi_list, "vgic: SGI send to list")
+PERFCOUNTER(vgic_sgi_others, "vgic: SGI send to others")
+PERFCOUNTER(vgic_sgi_self, "vgic: SGI send to self")
+PERFCOUNTER(vgic_cross_cpu_intr_inject, "vgic: cross-CPU irq inject")
+PERFCOUNTER(vgic_irq_migrates, "vgic: irq migration")
+
+PERFCOUNTER(vuart_reads, "vuart: read")
+PERFCOUNTER(vuart_writes, "vuart: write")
+
+PERFCOUNTER(vtimer_cp32_reads, "vtimer: cp32 read")
+PERFCOUNTER(vtimer_cp32_writes, "vtimer: cp32 write")
+
+PERFCOUNTER(vtimer_cp64_reads, "vtimer: cp64 read")
+PERFCOUNTER(vtimer_cp64_writes, "vtimer: cp64 write")
+
+PERFCOUNTER(vtimer_sysreg_reads, "vtimer: sysreg read")
+PERFCOUNTER(vtimer_sysreg_writes, "vtimer: sysreg write")
+
+PERFCOUNTER(vtimer_phys_inject, "vtimer: phys expired, injected")
+PERFCOUNTER(vtimer_phys_masked, "vtimer: phys expired, masked")
+PERFCOUNTER(vtimer_virt_inject, "vtimer: virt expired, injected")
+
+PERFCOUNTER(ppis, "#PPIs")
+PERFCOUNTER(spis, "#SPIs")
+PERFCOUNTER(guest_irqs, "#GUEST-IRQS")
+
+PERFCOUNTER(hyp_timer_irqs, "Hypervisor timer interrupts")
+PERFCOUNTER(phys_timer_irqs, "Physical timer interrupts")
+PERFCOUNTER(virt_timer_irqs, "Virtual timer interrupts")
+PERFCOUNTER(maintenance_irqs, "Maintenance interrupts")
+
+/*#endif*/ /* __XEN_PERFC_DEFN_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-arm/platform.h b/xen/include/asm-arm/platform.h
index eefaca6..b8fc5ac 100644
--- a/xen/include/asm-arm/platform.h
+++ b/xen/include/asm-arm/platform.h
@@ -37,16 +37,6 @@ struct platform_desc {
* List of devices which must not pass-through to a guest
*/
const struct dt_device_match *blacklist_dev;
- /*
- * The IRQ (PPI) to use to inject event channels to dom0.
- */
- unsigned int dom0_evtchn_ppi;
- /*
- * The location of a region of physical address space which dom0
- * can use for grant table mappings. If size is zero defaults to
- * 0xb0000000-0xb0020000.
- */
- paddr_t dom0_gnttab_start, dom0_gnttab_size;
};
/*
@@ -55,11 +45,6 @@ struct platform_desc {
*/
#define PLATFORM_QUIRK_GIC_64K_STRIDE (1 << 0)
-/*
- * Quirk for platforms where GICH_LR_HW does not work as expected.
- */
-#define PLATFORM_QUIRK_GUEST_PIRQ_NEED_EOI (1 << 1)
-
void __init platform_init(void);
int __init platform_init_time(void);
int __init platform_specific_mapping(struct domain *d);
@@ -72,11 +57,10 @@ void platform_poweroff(void);
bool_t platform_has_quirk(uint32_t quirk);
bool_t platform_device_is_blacklisted(const struct dt_device_node *node);
unsigned int platform_dom0_evtchn_ppi(void);
-void platform_dom0_gnttab(paddr_t *start, paddr_t *size);
#define PLATFORM_START(_name, _namestr) \
static const struct platform_desc __plat_desc_##_name __used \
-__attribute__((__section__(".arch.info"))) = { \
+__section(".arch.info") = { \
.name = _namestr,
#define PLATFORM_END \
diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
index fcd26fb..7e6eb66 100644
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -438,10 +438,21 @@ union hsr {
} sysreg; /* HSR_EC_SYSREG */
#endif
+ struct hsr_iabt {
+ unsigned long ifsc:6; /* Instruction fault status code */
+ unsigned long res0:1;
+ unsigned long s1ptw:1; /* Stage 2 fault during stage 1 translation */
+ unsigned long res1:1;
+ unsigned long eat:1; /* External abort type */
+ unsigned long res2:15;
+ unsigned long len:1; /* Instruction length */
+ unsigned long ec:6; /* Exception Class */
+ } iabt; /* HSR_EC_INSTR_ABORT_* */
+
struct hsr_dabt {
unsigned long dfsc:6; /* Data Fault Status Code */
unsigned long write:1; /* Write / not Read */
- unsigned long s1ptw:1; /* */
+ unsigned long s1ptw:1; /* Stage 2 fault during stage 1 translation */
unsigned long cache:1; /* Cache Maintenance */
unsigned long eat:1; /* External Abort Type */
#ifdef CONFIG_ARM_32
@@ -555,8 +566,14 @@ union hsr {
#define FSC_LL_MASK (_AC(0x03,U)<<0)
/* Time counter hypervisor control register */
-#define CNTHCTL_PA (1u<<0) /* Kernel/user access to physical counter */
-#define CNTHCTL_TA (1u<<1) /* Kernel/user access to CNTP timer */
+#define CNTHCTL_EL2_EL1PCTEN (1u<<0) /* Kernel/user access to physical counter */
+#define CNTHCTL_EL2_EL1PCEN (1u<<1) /* Kernel/user access to CNTP timer regs */
+
+/* Time counter kernel control register */
+#define CNTKCTL_EL1_EL0PCTEN (1u<<0) /* Expose phys counters to EL0 */
+#define CNTKCTL_EL1_EL0VCTEN (1u<<1) /* Expose virt counters to EL0 */
+#define CNTKCTL_EL1_EL0VTEN (1u<<8) /* Expose virt timer registers to EL0 */
+#define CNTKCTL_EL1_EL0PTEN (1u<<9) /* Expose phys timer registers to EL0 */
/* Timer control registers */
#define CNTx_CTL_ENABLE (1u<<0) /* Enable timer */
diff --git a/xen/include/asm-arm/scif-uart.h b/xen/include/asm-arm/scif-uart.h
new file mode 100644
index 0000000..7a9f639
--- /dev/null
+++ b/xen/include/asm-arm/scif-uart.h
@@ -0,0 +1,107 @@
+/*
+ * xen/include/asm-arm/scif-uart.h
+ *
+ * Common constant definition between early printk and the UART driver
+ * for the SCIF compatible UART.
+ *
+ * Oleksandr Tyshchenko <oleksandr.tyshchenko at globallogic.com>
+ * Copyright (C) 2014, Globallogic.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __ASM_ARM_SCIF_UART_H
+#define __ASM_ARM_SCIF_UART_H
+
+#define SCIF_FIFO_MAX_SIZE 16
+#define SCIF_CLK_FREQ 14745600
+
+/* Register offsets */
+#define SCIF_SCSMR (0x00) /* Serial mode register */
+#define SCIF_SCBRR (0x04) /* Bit rate register */
+#define SCIF_SCSCR (0x08) /* Serial control register */
+#define SCIF_SCFTDR (0x0C) /* Transmit FIFO data register */
+#define SCIF_SCFSR (0x10) /* Serial status register */
+#define SCIF_SCFRDR (0x14) /* Receive FIFO data register */
+#define SCIF_SCFCR (0x18) /* FIFO control register */
+#define SCIF_SCFDR (0x1C) /* FIFO data count register */
+#define SCIF_SCSPTR (0x20) /* Serial port register */
+#define SCIF_SCLSR (0x24) /* Line status register */
+#define SCIF_DL (0x30) /* Frequency division register */
+#define SCIF_CKS (0x34) /* Clock Select register */
+
+/* Serial Control Register (SCSCR) */
+#define SCSCR_TIE (1 << 7) /* Transmit Interrupt Enable */
+#define SCSCR_RIE (1 << 6) /* Receive Interrupt Enable */
+#define SCSCR_TE (1 << 5) /* Transmit Enable */
+#define SCSCR_RE (1 << 4) /* Receive Enable */
+#define SCSCR_REIE (1 << 3) /* Receive Error Interrupt Enable */
+#define SCSCR_TOIE (1 << 2) /* Timeout Interrupt Enable */
+#define SCSCR_CKE1 (1 << 1) /* Clock Enable 1 */
+#define SCSCR_CKE0 (1 << 0) /* Clock Enable 0 */
+
+#define SCSCR_CKE00 (0)
+#define SCSCR_CKE01 (SCSCR_CKE0)
+#define SCSCR_CKE10 (SCSCR_CKE1)
+#define SCSCR_CKE11 (SCSCR_CKE1 | SCSCR_CKE0)
+
+/* Serial Mode Register (SCSMR) */
+#define SCSMR_CHR (1 << 6) /* 7-bit Character Length */
+#define SCSMR_PE (1 << 5) /* Parity Enable */
+#define SCSMR_ODD (1 << 4) /* Odd Parity */
+#define SCSMR_STOP (1 << 3) /* Stop Bit Length */
+
+/* Serial Status Register (SCFSR) */
+#define SCFSR_ER (1 << 7) /* Receive Error */
+#define SCFSR_TEND (1 << 6) /* Transmission End */
+#define SCFSR_TDFE (1 << 5) /* Transmit FIFO Data Empty */
+#define SCFSR_BRK (1 << 4) /* Break Detect */
+#define SCFSR_FER (1 << 3) /* Framing Error */
+#define SCFSR_PER (1 << 2) /* Parity Error */
+#define SCFSR_RDF (1 << 1) /* Receive FIFO Data Full */
+#define SCFSR_DR (1 << 0) /* Receive Data Ready */
+
+#define SCIF_ERRORS (SCFSR_PER | SCFSR_FER | SCFSR_ER | SCFSR_BRK)
+
+/* Line Status Register (SCLSR) */
+#define SCLSR_TO (1 << 2) /* Timeout */
+#define SCLSR_ORER (1 << 0) /* Overrun Error */
+
+/* FIFO Control Register (SCFCR) */
+#define SCFCR_RTRG1 (1 << 7) /* Receive FIFO Data Count Trigger 1 */
+#define SCFCR_RTRG0 (1 << 6) /* Receive FIFO Data Count Trigger 0 */
+#define SCFCR_TTRG1 (1 << 5) /* Transmit FIFO Data Count Trigger 1 */
+#define SCFCR_TTRG0 (1 << 4) /* Transmit FIFO Data Count Trigger 0 */
+#define SCFCR_MCE (1 << 3) /* Modem Control Enable */
+#define SCFCR_TFRST (1 << 2) /* Transmit FIFO Data Register Reset */
+#define SCFCR_RFRST (1 << 1) /* Receive FIFO Data Register Reset */
+#define SCFCR_LOOP (1 << 0) /* Loopback Test */
+
+#define SCFCR_RTRG00 (0)
+#define SCFCR_RTRG01 (SCFCR_RTRG0)
+#define SCFCR_RTRG10 (SCFCR_RTRG1)
+#define SCFCR_RTRG11 (SCFCR_RTRG1 | SCFCR_RTRG0)
+
+#define SCFCR_TTRG00 (0)
+#define SCFCR_TTRG01 (SCFCR_TTRG0)
+#define SCFCR_TTRG10 (SCFCR_TTRG1)
+#define SCFCR_TTRG11 (SCFCR_TTRG1 | SCFCR_TTRG0)
+
+#endif /* __ASM_ARM_SCIF_UART_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
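As with the Cadence header above, a hedged sketch of a polled SCIF transmit
path; readw/writew/writeb and scif_base are assumed helpers for the sketch,
not part of this header:

    #include <stdint.h>

    /* Assumptions for this sketch: MMIO helpers and a mapped SCIF base. */
    extern uint16_t readw(volatile void *addr);
    extern void writew(uint16_t val, volatile void *addr);
    extern void writeb(uint8_t val, volatile void *addr);
    extern volatile uint8_t *scif_base;

    static void scif_putc_sketch(char c)
    {
        /* Wait for room in the TX FIFO. */
        while ( !(readw(scif_base + SCIF_SCFSR) & SCFSR_TDFE) )
            ;
        writeb(c, scif_base + SCIF_SCFTDR);
        /* Clear TDFE and TEND so the next wait observes fresh status. */
        writew(readw(scif_base + SCIF_SCFSR) & ~(SCFSR_TEND | SCFSR_TDFE),
               scif_base + SCIF_SCFSR);
    }
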
diff --git a/xen/include/asm-arm/setup.h b/xen/include/asm-arm/setup.h
index ba5a67d..81bb3da 100644
--- a/xen/include/asm-arm/setup.h
+++ b/xen/include/asm-arm/setup.h
@@ -51,8 +51,6 @@ void arch_init_memory(void);
void copy_from_paddr(void *dst, paddr_t paddr, unsigned long len);
-void arch_get_xen_caps(xen_capabilities_info_t *info);
-
int construct_dom0(struct domain *d);
void discard_initial_modules(void);
diff --git a/xen/include/asm-arm/spinlock.h b/xen/include/asm-arm/spinlock.h
index a064f73..81955d1 100644
--- a/xen/include/asm-arm/spinlock.h
+++ b/xen/include/asm-arm/spinlock.h
@@ -1,23 +1,6 @@
#ifndef __ASM_SPINLOCK_H
#define __ASM_SPINLOCK_H
-#include <xen/config.h>
-#include <xen/lib.h>
-
-#if defined(CONFIG_ARM_32)
-# include <asm/arm32/spinlock.h>
-#elif defined(CONFIG_ARM_64)
-# include <asm/arm64/spinlock.h>
-#else
-# error "unknown ARM variant"
-#endif
+/* Nothing ARM specific. */
#endif /* __ASM_SPINLOCK_H */
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/include/asm-arm/sysregs.h b/xen/include/asm-arm/sysregs.h
index 169b7ac..570f43e 100644
--- a/xen/include/asm-arm/sysregs.h
+++ b/xen/include/asm-arm/sysregs.h
@@ -40,9 +40,17 @@
((__HSR_SYSREG_##crm) << HSR_SYSREG_CRM_SHIFT) | \
((__HSR_SYSREG_##op2) << HSR_SYSREG_OP2_SHIFT)
+#define HSR_SYSREG_DCISW HSR_SYSREG(1,0,c7,c6,2)
+#define HSR_SYSREG_DCCSW HSR_SYSREG(1,0,c7,c10,2)
+#define HSR_SYSREG_DCCISW HSR_SYSREG(1,0,c7,c14,2)
+
#define HSR_SYSREG_MDSCR_EL1 HSR_SYSREG(2,0,c0,c2,2)
+#define HSR_SYSREG_MDRAR_EL1 HSR_SYSREG(2,0,c1,c0,0)
#define HSR_SYSREG_OSLAR_EL1 HSR_SYSREG(2,0,c1,c0,4)
+#define HSR_SYSREG_OSLSR_EL1 HSR_SYSREG(2,0,c1,c1,4)
#define HSR_SYSREG_OSDLR_EL1 HSR_SYSREG(2,0,c1,c3,4)
+#define HSR_SYSREG_DBGPRCR_EL1 HSR_SYSREG(2,0,c1,c4,4)
+#define HSR_SYSREG_MDCCSR_EL0 HSR_SYSREG(2,3,c0,c1,0)
#define HSR_SYSREG_DBGBVRn_EL1(n) HSR_SYSREG(2,0,c0,c##n,4)
#define HSR_SYSREG_DBGBCRn_EL1(n) HSR_SYSREG(2,0,c0,c##n,5)
@@ -67,6 +75,7 @@
case HSR_SYSREG_##REG##n_EL1(15)
#define HSR_SYSREG_SCTLR_EL1 HSR_SYSREG(3,0,c1, c0,0)
+#define HSR_SYSREG_ACTLR_EL1 HSR_SYSREG(3,0,c1, c0,1)
#define HSR_SYSREG_TTBR0_EL1 HSR_SYSREG(3,0,c2, c0,0)
#define HSR_SYSREG_TTBR1_EL1 HSR_SYSREG(3,0,c2, c0,1)
#define HSR_SYSREG_TCR_EL1 HSR_SYSREG(3,0,c2, c0,2)
@@ -100,8 +109,9 @@
#define HSR_SYSREG_PMOVSSET_EL0 HSR_SYSREG(3,3,c9,c14,3)
#define HSR_SYSREG_CNTPCT_EL0 HSR_SYSREG(3,3,c14,c0,0)
-#define HSR_SYSREG_CNTP_CTL_EL0 HSR_SYSREG(3,3,c14,c2,1)
#define HSR_SYSREG_CNTP_TVAL_EL0 HSR_SYSREG(3,3,c14,c2,0)
+#define HSR_SYSREG_CNTP_CTL_EL0 HSR_SYSREG(3,3,c14,c2,1)
+#define HSR_SYSREG_CNTP_CVAL_EL0 HSR_SYSREG(3,3,c14,c2,2)
/*
* GIC System register assembly aliases picked from kernel
diff --git a/xen/include/asm-arm/system.h b/xen/include/asm-arm/system.h
index ce3d38a..f0e222f 100644
--- a/xen/include/asm-arm/system.h
+++ b/xen/include/asm-arm/system.h
@@ -51,6 +51,11 @@
# error "unknown ARM variant"
#endif
+#define arch_fetch_and_add(x, v) __sync_fetch_and_add(x, v)
+
+#define arch_lock_acquire_barrier() smp_mb()
+#define arch_lock_release_barrier() smp_mb()
+
extern struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next);
#endif
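arch_fetch_and_add above maps onto the GCC __sync_fetch_and_add builtin, which
returns the value the location held before the addition; a tiny sketch:

    /* Sketch: atomically bump a counter, observing its pre-increment value. */
    static unsigned int bump_counter_sketch(unsigned int *counter)
    {
        return arch_fetch_and_add(counter, 1);  /* returns the old value */
    }
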
diff --git a/xen/include/asm-arm/time.h b/xen/include/asm-arm/time.h
index d544b5b..d755f36 100644
--- a/xen/include/asm-arm/time.h
+++ b/xen/include/asm-arm/time.h
@@ -22,6 +22,12 @@ enum timer_ppi
MAX_TIMER_PPI = 4,
};
+/*
+ * Value of "clock-frequency" in the DT timer node if present.
+ * 0 means the property doesn't exist.
+ */
+extern uint32_t timer_dt_clock_frequency;
+
/* Get one of the timer IRQ numbers */
unsigned int timer_get_irq(enum timer_ppi ppi);
@@ -31,6 +37,8 @@ extern void __cpuinit init_timer_interrupt(void);
/* Counter value at boot time */
extern uint64_t boot_count;
+void preinit_xen_time(void);
+
#endif /* __ARM_TIME_H__ */
/*
* Local variables:
diff --git a/xen/include/asm-arm/vgic.h b/xen/include/asm-arm/vgic.h
index 74d5a4e..96839f0 100644
--- a/xen/include/asm-arm/vgic.h
+++ b/xen/include/asm-arm/vgic.h
@@ -67,7 +67,7 @@ struct pending_irq
#define GIC_IRQ_GUEST_MIGRATING 4
unsigned long status;
struct irq_desc *desc; /* only set if the irq corresponds to a physical irq */
- int irq;
+ unsigned int irq;
#define GIC_INVALID_LR ~(uint8_t)0
uint8_t lr;
uint8_t priority;
@@ -85,7 +85,7 @@ struct pending_irq
/* Represents state corresponding to a block of 32 interrupts */
struct vgic_irq_rank {
spinlock_t lock; /* Covers access to all other members of this struct */
- uint32_t ienable, iactive, ipend, pendsgi;
+ uint32_t ienable;
uint32_t icfg[2];
uint32_t ipriority[8];
union {
@@ -98,6 +98,17 @@ struct vgic_irq_rank {
};
};
+struct sgi_target {
+ uint8_t aff1;
+ uint16_t list;
+};
+
+static inline void sgi_target_init(struct sgi_target *sgi_target)
+{
+ sgi_target->aff1 = 0;
+ sgi_target->list = 0;
+}
+
struct vgic_ops {
/* Initialize vGIC */
int (*vcpu_init)(struct vcpu *v);
@@ -110,6 +121,8 @@ struct vgic_ops {
struct vcpu *(*get_target_vcpu)(struct vcpu *v, unsigned int irq);
/* vGIC sysreg emulation */
int (*emulate_sysreg)(struct cpu_user_regs *regs, union hsr hsr);
+ /* Maximum number of vCPUs supported */
+ const unsigned int max_vcpus;
};
/* Number of ranks of interrupt registers for a domain */
@@ -161,10 +174,10 @@ static inline void vgic_byte_write(uint32_t *reg, uint32_t var, int offset)
{
int byte = offset & 0x3;
- var &= (0xff << (8*byte));
+ var &= 0xff;
*reg &= ~(0xff << (8*byte));
- *reg |= var;
+ *reg |= (var << (8*byte));
}
enum gic_sgi_mode;
@@ -177,14 +190,15 @@ enum gic_sgi_mode;
#define vgic_num_irqs(d) ((d)->arch.vgic.nr_spis + 32)
-extern int domain_vgic_init(struct domain *d);
+extern int domain_vgic_init(struct domain *d, unsigned int nr_spis);
extern void domain_vgic_free(struct domain *d);
extern int vcpu_vgic_init(struct vcpu *v);
extern struct vcpu *vgic_get_target_vcpu(struct vcpu *v, unsigned int irq);
-extern void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq);
-extern void vgic_vcpu_inject_spi(struct domain *d, unsigned int irq);
+extern void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int virq);
+extern void vgic_vcpu_inject_spi(struct domain *d, unsigned int virq);
extern void vgic_clear_pending_irqs(struct vcpu *v);
extern struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq);
+extern struct pending_irq *spi_to_pending(struct domain *d, unsigned int irq);
extern struct vgic_irq_rank *vgic_rank_offset(struct vcpu *v, int b, int n, int s);
extern struct vgic_irq_rank *vgic_rank_irq(struct vcpu *v, unsigned int irq);
extern int vgic_emulate(struct cpu_user_regs *regs, union hsr hsr);
@@ -197,8 +211,41 @@ int vgic_v3_init(struct domain *d);
extern int vcpu_vgic_free(struct vcpu *v);
extern int vgic_to_sgi(struct vcpu *v, register_t sgir,
enum gic_sgi_mode irqmode, int virq,
- unsigned long vcpu_mask);
+ const struct sgi_target *target);
extern void vgic_migrate_irq(struct vcpu *old, struct vcpu *new, unsigned int irq);
+
+/* Reserve a specific guest vIRQ */
+extern bool_t vgic_reserve_virq(struct domain *d, unsigned int virq);
+
+/*
+ * Allocate a guest VIRQ
+ * - spi == 0 => allocate a PPI. It will be the same on every vCPU
+ * - spi == 1 => allocate an SPI
+ */
+extern int vgic_allocate_virq(struct domain *d, bool_t spi);
+
+static inline int vgic_allocate_ppi(struct domain *d)
+{
+ return vgic_allocate_virq(d, 0 /* ppi */);
+}
+
+static inline int vgic_allocate_spi(struct domain *d)
+{
+ return vgic_allocate_virq(d, 1 /* spi */);
+}
+
+extern void vgic_free_virq(struct domain *d, unsigned int virq);
+
+void vgic_v2_setup_hw(paddr_t dbase, paddr_t cbase, paddr_t vbase);
+
+#ifdef HAS_GICV3
+struct rdist_region;
+void vgic_v3_setup_hw(paddr_t dbase,
+ unsigned int nr_rdist_regions,
+ const struct rdist_region *regions,
+ uint32_t rdist_stride);
+#endif
+
#endif /* __ASM_ARM_VGIC_H__ */
/*
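The vgic_byte_write change above is a behavioural fix: the old code masked var
against a shifted mask but never shifted the value into the byte lane, so
writes to bytes 1-3 lost the data. A standalone C sketch of the corrected
semantics:

    #include <assert.h>
    #include <stdint.h>

    /* Sketch of the fixed byte-lane update: take the low byte of @var
     * and place it at byte @offset of the 32-bit register. */
    static void byte_write_sketch(uint32_t *reg, uint32_t var, int offset)
    {
        int byte = offset & 0x3;

        var &= 0xff;
        *reg &= ~(0xffu << (8 * byte));
        *reg |= (var << (8 * byte));
    }

    int main(void)
    {
        uint32_t reg = 0x11223344;

        byte_write_sketch(&reg, 0xaa, 2);   /* replace byte 2 (0x22) */
        assert(reg == 0x11aa3344);
        return 0;
    }
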
diff --git a/xen/include/asm-arm/vm_event.h b/xen/include/asm-arm/vm_event.h
new file mode 100644
index 0000000..976fdf1
--- /dev/null
+++ b/xen/include/asm-arm/vm_event.h
@@ -0,0 +1,50 @@
+/*
+ * vm_event.h: architecture specific vm_event handling routines
+ *
+ * Copyright (c) 2015 Tamas K Lengyel (tamas at tklengyel.com)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ASM_ARM_VM_EVENT_H__
+#define __ASM_ARM_VM_EVENT_H__
+
+#include <xen/sched.h>
+#include <xen/vm_event.h>
+
+static inline
+int vm_event_init_domain(struct domain *d)
+{
+ /* Not supported on ARM. */
+ return 0;
+}
+
+static inline
+void vm_event_cleanup_domain(struct domain *d)
+{
+ /* Not supported on ARM. */
+}
+
+static inline
+void vm_event_toggle_singlestep(struct domain *d, struct vcpu *v)
+{
+ /* Not supported on ARM. */
+}
+
+static inline
+void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp)
+{
+ /* Not supported on ARM. */
+}
+
+#endif /* __ASM_ARM_VM_EVENT_H__ */
diff --git a/xen/include/asm-x86/acpi.h b/xen/include/asm-x86/acpi.h
index 5e85b38..cf479a9 100644
--- a/xen/include/asm-x86/acpi.h
+++ b/xen/include/asm-x86/acpi.h
@@ -18,8 +18,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/xen/include/asm-x86/alternative.h b/xen/include/asm-x86/alternative.h
index c746047..23c9b9f 100644
--- a/xen/include/asm-x86/alternative.h
+++ b/xen/include/asm-x86/alternative.h
@@ -12,6 +12,7 @@
.byte \alt_len
.endm
#else
+#include <xen/stringify.h>
#include <xen/types.h>
struct alt_instr {
@@ -73,6 +74,26 @@ extern void alternative_instructions(void);
#define alternative(oldinstr, newinstr, feature) \
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+/*
+ * Alternative inline assembly with input.
+ *
+ * Peculiarities:
+ * No memory clobber here.
+ * Argument numbers start with 1.
+ * Prefer fixed-size constraints (like "r" for (%1)).
+ * If you use variable-sized constraints like "m" or "g" in the
+ * replacement, make sure to pad to the worst-case length.
+ * An unused argument 0 is left in to keep API compatibility.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input...) \
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : : "i" (0), ## input)
+
+/* Like alternative_input, but with a single output argument */
+#define alternative_io(oldinstr, newinstr, feature, output, input...) \
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : output : "i" (0), ## input)
+
#endif /* __ASSEMBLY__ */
#endif /* __X86_ALTERNATIVE_H__ */
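For a concrete use of alternative_io, the x86 bitops change later in this
patch swaps bsf/cmovz for rep;bsf (which decodes as TZCNT on BMI1 hardware) at
patch time; a reduced sketch of that pattern, valid only when max ==
BITS_PER_LONG as the original guards:

    /* Sketch: trailing-zero count that self-patches to TZCNT on BMI1 CPUs.
     * On pre-BMI1 CPUs "rep; bsf" executes as plain BSF, so only the old
     * variant needs the cmovz to handle a zero input. */
    static inline unsigned long ctz_sketch(unsigned long val, unsigned int max)
    {
        alternative_io("bsf %[in],%[out]; cmovz %[max],%k[out]",
                       "rep; bsf %[in],%[out]",
                       X86_FEATURE_BMI1,
                       [out] "=&r" (val),
                       [in] "r" (val), [max] "r" (max));
        return val;
    }
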
diff --git a/xen/include/asm-x86/altp2m.h b/xen/include/asm-x86/altp2m.h
new file mode 100644
index 0000000..fc82c1b
--- /dev/null
+++ b/xen/include/asm-x86/altp2m.h
@@ -0,0 +1,37 @@
+/*
+ * Alternate p2m HVM
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _X86_ALTP2M_H
+#define _X86_ALTP2M_H
+
+#include <xen/types.h>
+#include <xen/sched.h> /* for struct vcpu, struct domain */
+#include <asm/hvm/vcpu.h> /* for vcpu_altp2m */
+
+/* Alternate p2m HVM on/off per domain */
+static inline bool_t altp2m_active(const struct domain *d)
+{
+ return d->arch.altp2m_active;
+}
+
+/* Alternate p2m VCPU */
+void altp2m_vcpu_initialise(struct vcpu *v);
+void altp2m_vcpu_destroy(struct vcpu *v);
+void altp2m_vcpu_reset(struct vcpu *v);
+
+#endif /* _X86_ALTP2M_H */
+
diff --git a/xen/include/asm-x86/amd-iommu.h b/xen/include/asm-x86/amd-iommu.h
index b45708b..e9fa9c2 100644
--- a/xen/include/asm-x86/amd-iommu.h
+++ b/xen/include/asm-x86/amd-iommu.h
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _ASM_X86_64_AMD_IOMMU_H
#define _ASM_X86_64_AMD_IOMMU_H
diff --git a/xen/include/asm-x86/apic.h b/xen/include/asm-x86/apic.h
index 6697245..be9a535 100644
--- a/xen/include/asm-x86/apic.h
+++ b/xen/include/asm-x86/apic.h
@@ -221,7 +221,6 @@ extern unsigned int nmi_watchdog;
#define NMI_NONE 0
#define NMI_IO_APIC 1
#define NMI_LOCAL_APIC 2
-#define NMI_INVALID 3
#else /* !CONFIG_X86_LOCAL_APIC */
static inline int lapic_suspend(void) {return 0;}
diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h
index 1674c7c..7c8c2c0 100644
--- a/xen/include/asm-x86/asm_defns.h
+++ b/xen/include/asm-x86/asm_defns.h
@@ -6,6 +6,7 @@
/* NB. Auto-generated from arch/.../asm-offsets.c */
#include <asm/asm-offsets.h>
#endif
+#include <asm/bug.h>
#include <asm/processor.h>
#include <asm/percpu.h>
#include <xen/stringify.h>
@@ -26,18 +27,20 @@ void ret_from_intr(void);
#endif
#ifndef NDEBUG
-#define ASSERT_INTERRUPT_STATUS(x) \
+#define ASSERT_INTERRUPT_STATUS(x, msg) \
pushf; \
testb $X86_EFLAGS_IF>>8,1(%rsp); \
j##x 1f; \
- ud2a; \
+ ASSERT_FAILED(msg); \
1: addq $8,%rsp;
#else
-#define ASSERT_INTERRUPT_STATUS(x)
+#define ASSERT_INTERRUPT_STATUS(x, msg)
#endif
-#define ASSERT_INTERRUPTS_ENABLED ASSERT_INTERRUPT_STATUS(nz)
-#define ASSERT_INTERRUPTS_DISABLED ASSERT_INTERRUPT_STATUS(z)
+#define ASSERT_INTERRUPTS_ENABLED \
+ ASSERT_INTERRUPT_STATUS(nz, "INTERRUPTS ENABLED")
+#define ASSERT_INTERRUPTS_DISABLED \
+ ASSERT_INTERRUPT_STATUS(z, "INTERRUPTS DISABLED")
/*
* This flag is set in an exception frame when registers R12-R15 did not get
diff --git a/xen/include/asm-x86/atomic.h b/xen/include/asm-x86/atomic.h
index 8972463..2b8c877 100644
--- a/xen/include/asm-x86/atomic.h
+++ b/xen/include/asm-x86/atomic.h
@@ -14,44 +14,69 @@ static inline void name(volatile type *addr, type val) \
{ asm volatile("mov" size " %1,%0": "=m" (*(volatile type *)addr) \
:reg (val) barrier); }
+#define build_add_sized(name, size, type, reg) \
+ static inline void name(volatile type *addr, type val) \
+ { \
+ asm volatile("add" size " %1,%0" \
+ : "=m" (*addr) \
+ : reg (val)); \
+ }
+
build_read_atomic(read_u8_atomic, "b", uint8_t, "=q", )
build_read_atomic(read_u16_atomic, "w", uint16_t, "=r", )
build_read_atomic(read_u32_atomic, "l", uint32_t, "=r", )
+build_read_atomic(read_u64_atomic, "q", uint64_t, "=r", )
build_write_atomic(write_u8_atomic, "b", uint8_t, "q", )
build_write_atomic(write_u16_atomic, "w", uint16_t, "r", )
build_write_atomic(write_u32_atomic, "l", uint32_t, "r", )
-
-build_read_atomic(read_u64_atomic, "q", uint64_t, "=r", )
build_write_atomic(write_u64_atomic, "q", uint64_t, "r", )
+build_add_sized(add_u8_sized, "b", uint8_t, "qi")
+build_add_sized(add_u16_sized, "w", uint16_t, "ri")
+build_add_sized(add_u32_sized, "l", uint32_t, "ri")
+build_add_sized(add_u64_sized, "q", uint64_t, "ri")
+
#undef build_read_atomic
#undef build_write_atomic
+#undef build_add_sized
void __bad_atomic_size(void);
-#define read_atomic(p) ({ \
- typeof(*p) __x; \
- switch ( sizeof(*p) ) { \
- case 1: __x = (typeof(*p))read_u8_atomic((uint8_t *)p); break; \
- case 2: __x = (typeof(*p))read_u16_atomic((uint16_t *)p); break; \
- case 4: __x = (typeof(*p))read_u32_atomic((uint32_t *)p); break; \
- case 8: __x = (typeof(*p))read_u64_atomic((uint64_t *)p); break; \
- default: __x = 0; __bad_atomic_size(); break; \
- } \
- __x; \
+#define read_atomic(p) ({ \
+ unsigned long x_; \
+ switch ( sizeof(*(p)) ) { \
+ case 1: x_ = read_u8_atomic((uint8_t *)(p)); break; \
+ case 2: x_ = read_u16_atomic((uint16_t *)(p)); break; \
+ case 4: x_ = read_u32_atomic((uint32_t *)(p)); break; \
+ case 8: x_ = read_u64_atomic((uint64_t *)(p)); break; \
+ default: x_ = 0; __bad_atomic_size(); break; \
+ } \
+ (typeof(*(p)))x_; \
+})
+
+#define write_atomic(p, x) ({ \
+ typeof(*(p)) __x = (x); \
+ unsigned long x_ = (unsigned long)__x; \
+ switch ( sizeof(*(p)) ) { \
+ case 1: write_u8_atomic((uint8_t *)(p), x_); break; \
+ case 2: write_u16_atomic((uint16_t *)(p), x_); break; \
+ case 4: write_u32_atomic((uint32_t *)(p), x_); break; \
+ case 8: write_u64_atomic((uint64_t *)(p), x_); break; \
+ default: __bad_atomic_size(); break; \
+ } \
})
-#define write_atomic(p, x) ({ \
- typeof(*p) __x = (x); \
- switch ( sizeof(*p) ) { \
- case 1: write_u8_atomic((uint8_t *)p, (uint8_t)__x); break; \
- case 2: write_u16_atomic((uint16_t *)p, (uint16_t)__x); break; \
- case 4: write_u32_atomic((uint32_t *)p, (uint32_t)__x); break; \
- case 8: write_u64_atomic((uint64_t *)p, (uint64_t)__x); break; \
- default: __bad_atomic_size(); break; \
- } \
- __x; \
+#define add_sized(p, x) ({ \
+ typeof(*(p)) x_ = (x); \
+ switch ( sizeof(*(p)) ) \
+ { \
+ case 1: add_u8_sized((uint8_t *)(p), x_); break; \
+ case 2: add_u16_sized((uint16_t *)(p), x_); break; \
+ case 4: add_u32_sized((uint32_t *)(p), x_); break; \
+ case 8: add_u64_sized((uint64_t *)(p), x_); break; \
+ default: __bad_atomic_size(); break; \
+ } \
})
/*
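A usage sketch for the reworked accessors; note these compile to single
instructions, so add_sized is a plain addl to memory -- atomic against
interrupts on the local CPU but, unlike a lock-prefixed op, not against other
CPUs:

    #include <stdint.h>

    /* Sketch: single-instruction accessors on a 32-bit counter. */
    static uint32_t sample_and_bump_sketch(uint32_t *counter)
    {
        uint32_t snapshot = read_atomic(counter); /* one 32-bit load */

        add_sized(counter, 1);                    /* one "addl $1,(mem)" */
        return snapshot;    /* value read before the add, not a fetch-add */
    }
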
diff --git a/xen/include/asm-x86/bitops.h b/xen/include/asm-x86/bitops.h
index 82a08ee..ff43a98 100644
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -5,19 +5,20 @@
* Copyright 1992, Linus Torvalds.
*/
-#include <xen/config.h>
+#include <asm/alternative.h>
+#define X86_FEATURES_ONLY
+#include <asm/cpufeature.h>
/*
* We specify the memory operand as both input and output because the memory
* operand is both read from and written to. Since the operand is in fact a
* word array, we also specify "memory" in the clobbers list to indicate that
* words other than the one directly addressed by the memory operand may be
- * modified. We don't use "+m" because the gcc manual says that it should be
- * used only when the constraint allows the operand to reside in a register.
+ * modified.
*/
-#define ADDR (*(volatile long *) addr)
-#define CONST_ADDR (*(const volatile long *) addr)
+#define ADDR (*(volatile int *) addr)
+#define CONST_ADDR (*(const volatile int *) addr)
extern void __bitop_bad_size(void);
#define bitop_bad_size(addr) (sizeof(*(addr)) < 4)
@@ -34,10 +35,8 @@ extern void __bitop_bad_size(void);
*/
static inline void set_bit(int nr, volatile void *addr)
{
- asm volatile (
- "lock; btsl %1,%0"
- : "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ asm volatile ( "lock; btsl %1,%0"
+ : "+m" (ADDR) : "Ir" (nr) : "memory");
}
#define set_bit(nr, addr) ({ \
if ( bitop_bad_size(addr) ) __bitop_bad_size(); \
@@ -53,12 +52,9 @@ static inline void set_bit(int nr, volatile void *addr)
* If it's called on the same region of memory simultaneously, the effect
* may be that only one operation succeeds.
*/
-static inline void __set_bit(int nr, volatile void *addr)
+static inline void __set_bit(int nr, void *addr)
{
- asm volatile (
- "btsl %1,%0"
- : "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ asm volatile ( "btsl %1,%0" : "+m" (*(int *)addr) : "Ir" (nr) : "memory" );
}
#define __set_bit(nr, addr) ({ \
if ( bitop_bad_size(addr) ) __bitop_bad_size(); \
@@ -74,10 +70,8 @@ static inline void __set_bit(int nr, volatile void *addr)
*/
static inline void clear_bit(int nr, volatile void *addr)
{
- asm volatile (
- "lock; btrl %1,%0"
- : "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ asm volatile ( "lock; btrl %1,%0"
+ : "+m" (ADDR) : "Ir" (nr) : "memory");
}
#define clear_bit(nr, addr) ({ \
if ( bitop_bad_size(addr) ) __bitop_bad_size(); \
@@ -93,12 +87,9 @@ static inline void clear_bit(int nr, volatile void *addr)
* If it's called on the same region of memory simultaneously, the effect
* may be that only one operation succeeds.
*/
-static inline void __clear_bit(int nr, volatile void *addr)
+static inline void __clear_bit(int nr, void *addr)
{
- asm volatile (
- "btrl %1,%0"
- : "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ asm volatile ( "btrl %1,%0" : "+m" (*(int *)addr) : "Ir" (nr) : "memory" );
}
#define __clear_bit(nr, addr) ({ \
if ( bitop_bad_size(addr) ) __bitop_bad_size(); \
@@ -114,12 +105,9 @@ static inline void __clear_bit(int nr, volatile void *addr)
* If it's called on the same region of memory simultaneously, the effect
* may be that only one operation succeeds.
*/
-static inline void __change_bit(int nr, volatile void *addr)
+static inline void __change_bit(int nr, void *addr)
{
- asm volatile (
- "btcl %1,%0"
- : "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ asm volatile ( "btcl %1,%0" : "+m" (*(int *)addr) : "Ir" (nr) : "memory" );
}
#define __change_bit(nr, addr) ({ \
if ( bitop_bad_size(addr) ) __bitop_bad_size(); \
@@ -137,10 +125,8 @@ static inline void __change_bit(int nr, volatile void *addr)
*/
static inline void change_bit(int nr, volatile void *addr)
{
- asm volatile (
- "lock; btcl %1,%0"
- : "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ asm volatile ( "lock; btcl %1,%0"
+ : "+m" (ADDR) : "Ir" (nr) : "memory");
}
#define change_bit(nr, addr) ({ \
if ( bitop_bad_size(addr) ) __bitop_bad_size(); \
@@ -159,10 +145,8 @@ static inline int test_and_set_bit(int nr, volatile void *addr)
{
int oldbit;
- asm volatile (
- "lock; btsl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ asm volatile ( "lock; btsl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
return oldbit;
}
#define test_and_set_bit(nr, addr) ({ \
@@ -179,14 +163,14 @@ static inline int test_and_set_bit(int nr, volatile void *addr)
* If two examples of this operation race, one can appear to succeed
* but actually fail. You must protect multiple accesses with a lock.
*/
-static inline int __test_and_set_bit(int nr, volatile void *addr)
+static inline int __test_and_set_bit(int nr, void *addr)
{
int oldbit;
asm volatile (
"btsl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ : "=r" (oldbit), "+m" (*(int *)addr)
+ : "Ir" (nr) : "memory" );
return oldbit;
}
#define __test_and_set_bit(nr, addr) ({ \
@@ -206,10 +190,8 @@ static inline int test_and_clear_bit(int nr, volatile void *addr)
{
int oldbit;
- asm volatile (
- "lock; btrl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ asm volatile ( "lock; btrl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
return oldbit;
}
#define test_and_clear_bit(nr, addr) ({ \
@@ -226,14 +208,14 @@ static inline int test_and_clear_bit(int nr, volatile void *addr)
* If two examples of this operation race, one can appear to succeed
* but actually fail. You must protect multiple accesses with a lock.
*/
-static inline int __test_and_clear_bit(int nr, volatile void *addr)
+static inline int __test_and_clear_bit(int nr, void *addr)
{
int oldbit;
asm volatile (
"btrl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ : "=r" (oldbit), "+m" (*(int *)addr)
+ : "Ir" (nr) : "memory" );
return oldbit;
}
#define __test_and_clear_bit(nr, addr) ({ \
@@ -242,14 +224,14 @@ static inline int __test_and_clear_bit(int nr, volatile void *addr)
})
/* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr, volatile void *addr)
+static inline int __test_and_change_bit(int nr, void *addr)
{
int oldbit;
asm volatile (
"btcl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ : "=r" (oldbit), "+m" (*(int *)addr)
+ : "Ir" (nr) : "memory" );
return oldbit;
}
#define __test_and_change_bit(nr, addr) ({ \
@@ -269,10 +251,8 @@ static inline int test_and_change_bit(int nr, volatile void *addr)
{
int oldbit;
- asm volatile (
- "lock; btcl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "=m" (ADDR)
- : "Ir" (nr), "m" (ADDR) : "memory");
+ asm volatile ( "lock; btcl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
return oldbit;
}
#define test_and_change_bit(nr, addr) ({ \
@@ -313,9 +293,17 @@ extern unsigned int __find_first_zero_bit(
extern unsigned int __find_next_zero_bit(
const unsigned long *addr, unsigned int size, unsigned int offset);
-static inline unsigned int __scanbit(unsigned long val, unsigned long max)
+static inline unsigned int __scanbit(unsigned long val, unsigned int max)
{
- asm ( "bsf %1,%0 ; cmovz %2,%0" : "=&r" (val) : "r" (val), "r" (max) );
+ if ( __builtin_constant_p(max) && max == BITS_PER_LONG )
+ alternative_io("bsf %[in],%[out]; cmovz %[max],%k[out]",
+ "rep; bsf %[in],%[out]",
+ X86_FEATURE_BMI1,
+ [out] "=&r" (val),
+ [in] "r" (val), [max] "r" (max));
+ else
+ asm ( "bsf %1,%0 ; cmovz %2,%k0"
+ : "=&r" (val) : "r" (val), "r" (max) );
return (unsigned int)val;
}
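As context for the hunk above: on BMI1-capable CPUs "rep; bsf" decodes as
TZCNT, which, unlike plain BSF, is architecturally defined to return the
operand width on a zero input; that is exactly the max == BITS_PER_LONG
fallback the cmovz otherwise provides. A portable sketch of the intended
semantics, assuming a 64-bit unsigned long (illustrative only, not the Xen
code):

    #include <assert.h>

    static unsigned int scanbit_ref(unsigned long val, unsigned int max)
    {
        unsigned int i;

        for ( i = 0; i < 64; i++ )
            if ( val & (1ul << i) )
                return i;
        return max;   /* BSF leaves its destination undefined on zero input */
    }

    int main(void)
    {
        assert(scanbit_ref(0x90, 64) == 4);   /* lowest set bit of 10010000b */
        assert(scanbit_ref(0, 64) == 64);     /* TZCNT(0) == operand width */
        return 0;
    }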
@@ -391,7 +379,7 @@ static inline unsigned int __scanbit(unsigned long val, unsigned long max)
*/
static inline unsigned int find_first_set_bit(unsigned long word)
{
- asm ( "bsf %1,%0" : "=r" (word) : "r" (word) );
+ asm ( "rep; bsf %1,%0" : "=r" (word) : "rm" (word) );
return (unsigned int)word;
}
@@ -401,7 +389,7 @@ static inline unsigned int find_first_set_bit(unsigned long word)
*
* This is defined the same way as the libc and compiler builtin ffs routines.
*/
-static inline int ffs(unsigned long x)
+static inline int ffsl(unsigned long x)
{
long r;
@@ -412,13 +400,24 @@ static inline int ffs(unsigned long x)
return (int)r+1;
}
+static inline int ffs(unsigned int x)
+{
+ int r;
+
+ asm ( "bsf %1,%0\n\t"
+ "jnz 1f\n\t"
+ "mov $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+ return r + 1;
+}
+
/**
* fls - find last bit set
* @x: the word to search
*
* This is defined the same way as ffs.
*/
-static inline int fls(unsigned long x)
+static inline int flsl(unsigned long x)
{
long r;
@@ -429,6 +428,17 @@ static inline int fls(unsigned long x)
return (int)r+1;
}
+static inline int fls(unsigned int x)
+{
+ int r;
+
+ asm ( "bsr %1,%0\n\t"
+ "jnz 1f\n\t"
+ "mov $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+ return r + 1;
+}
+
/**
* hweightN - returns the hamming weight of a N-bit word
* @x: the word to weigh
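For readers skimming the renames above: ffs()/fls() now take unsigned int
while ffsl()/flsl() keep the unsigned long behaviour; all four return a
1-based bit index, or 0 when no bit is set. A reference model of those
semantics in portable C (a sketch inferred from the asm, not code from the
tree):

    #include <assert.h>

    static int ffs_ref(unsigned int x)
    {
        int i;

        for ( i = 0; i < 32; i++ )
            if ( x & (1u << i) )
                return i + 1;   /* "bsf; jnz 1f; mov $-1" then +1 */
        return 0;
    }

    static int fls_ref(unsigned int x)
    {
        int i;

        for ( i = 31; i >= 0; i-- )
            if ( x & (1u << i) )
                return i + 1;   /* "bsr; jnz 1f; mov $-1" then +1 */
        return 0;
    }

    int main(void)
    {
        assert(ffs_ref(0) == 0 && fls_ref(0) == 0);
        assert(ffs_ref(0x18) == 4 && fls_ref(0x18) == 5);
        return 0;
    }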
diff --git a/xen/include/asm-x86/bug.h b/xen/include/asm-x86/bug.h
index cd862e3..cec6bce 100644
--- a/xen/include/asm-x86/bug.h
+++ b/xen/include/asm-x86/bug.h
@@ -5,6 +5,13 @@
#define BUG_LINE_LO_WIDTH (31 - BUG_DISP_WIDTH)
#define BUG_LINE_HI_WIDTH (31 - BUG_DISP_WIDTH)
+#define BUGFRAME_run_fn 0
+#define BUGFRAME_warn 1
+#define BUGFRAME_bug 2
+#define BUGFRAME_assert 3
+
+#ifndef __ASSEMBLY__
+
struct bug_frame {
signed int loc_disp:BUG_DISP_WIDTH;
unsigned int line_hi:BUG_LINE_HI_WIDTH;
@@ -22,11 +29,6 @@ struct bug_frame {
((1 << BUG_LINE_LO_WIDTH) - 1)))
#define bug_msg(b) ((const char *)(b) + (b)->msg_disp[1])
-#define BUGFRAME_run_fn 0
-#define BUGFRAME_warn 1
-#define BUGFRAME_bug 2
-#define BUGFRAME_assert 3
-
#define BUG_FRAME(type, line, ptr, second_frame, msg) do { \
BUILD_BUG_ON((line) >> (BUG_LINE_LO_WIDTH + BUG_LINE_HI_WIDTH)); \
asm volatile ( ".Lbug%=: ud2\n" \
@@ -66,4 +68,42 @@ extern const struct bug_frame __start_bug_frames[],
__stop_bug_frames_2[],
__stop_bug_frames_3[];
+#else /* !__ASSEMBLY__ */
+
+/*
+ * Construct a bug frame, suitable for use in assembly code. It should
+ * always match the C version above. One complication is having to stash
+ * the strings in .rodata.
+ */
+ .macro BUG_FRAME type, line, file_str, second_frame, msg
+ .L\@ud: ud2a
+
+ .pushsection .rodata.str1, "aMS", @progbits, 1
+ .L\@s1: .asciz "\file_str"
+ .popsection
+
+ .pushsection .bug_frames.\type, "a", @progbits
+ .L\@bf:
+ .long (.L\@ud - .L\@bf) + \
+ ((\line >> BUG_LINE_LO_WIDTH) << BUG_DISP_WIDTH)
+ .long (.L\@s1 - .L\@bf) + \
+ ((\line & ((1 << BUG_LINE_LO_WIDTH) - 1)) << BUG_DISP_WIDTH)
+
+ .if \second_frame
+ .pushsection .rodata.str1, "aMS", @progbits, 1
+ .L\@s2: .asciz "\msg"
+ .popsection
+ .long 0, (.L\@s2 - .L\@bf)
+ .endif
+ .popsection
+ .endm
+
+#define WARN BUG_FRAME BUGFRAME_warn, __LINE__, __FILE__, 0, 0
+#define BUG BUG_FRAME BUGFRAME_bug, __LINE__, __FILE__, 0, 0
+
+#define ASSERT_FAILED(msg) \
+ BUG_FRAME BUGFRAME_assert, __LINE__, __FILE__, 1, msg
+
+#endif /* !__ASSEMBLY__ */
+
#endif /* __X86_BUG_H__ */
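The assembly BUG_FRAME macro above packs the same two .long words as the C
struct bug_frame: one carrying the ud2 displacement plus the high line bits,
one carrying the string displacement plus the low line bits. A small demo of
how a line number splits and reassembles, assuming BUG_DISP_WIDTH == 24 (an
assumption here; the real value is defined earlier in the header):

    #include <stdio.h>

    #define BUG_DISP_WIDTH    24
    #define BUG_LINE_LO_WIDTH (31 - BUG_DISP_WIDTH)

    int main(void)
    {
        unsigned int line = 1234;
        unsigned int hi = line >> BUG_LINE_LO_WIDTH;
        unsigned int lo = line & ((1u << BUG_LINE_LO_WIDTH) - 1);

        /* Reassemble the same way bug_line() does. */
        printf("line %u -> hi %u, lo %u, rebuilt %u\n",
               line, hi, lo, (hi << BUG_LINE_LO_WIDTH) | lo);
        return 0;
    }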
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 3802721..3e9be83 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -67,15 +67,6 @@
#define NR_CPUS 256
#endif
-/* Maximum we can support with current vLAPIC ID mapping. */
-#define MAX_HVM_VCPUS 128
-
-#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
-# define supervisor_mode_kernel (1)
-#else
-# define supervisor_mode_kernel (0)
-#endif
-
/* Linkage for x86 */
#define __ALIGN .align 16,0x90
#define __ALIGN_STR ".align 16,0x90"
@@ -103,6 +94,10 @@
/* Primary stack is restricted to 8kB by guard pages. */
#define PRIMARY_STACK_SIZE 8192
+/* Total size of syscall and emulation stubs. */
+#define STUB_BUF_SHIFT (L1_CACHE_SHIFT > 7 ? L1_CACHE_SHIFT : 7)
+#define STUB_BUF_SIZE (1 << STUB_BUF_SHIFT)
+
/* Return value for zero-size _xmalloc(), distinguished from NULL. */
#define ZERO_BLOCK_PTR ((void *)0xBAD0BAD0BAD0BAD0UL)
@@ -167,6 +162,7 @@ extern unsigned char boot_edid_info[128];
* High read-only compatibility machine-to-phys translation table.
* 0xffff82d080000000 - 0xffff82d0bfffffff [1GB, 2^30 bytes, PML4:261]
* Xen text, static data, bss.
+#ifndef CONFIG_BIGMEM
* 0xffff82d0c0000000 - 0xffff82dffbffffff [61GB - 64MB, PML4:261]
* Reserved for future use.
* 0xffff82dffc000000 - 0xffff82dfffffffff [64MB, 2^26 bytes, PML4:261]
@@ -175,6 +171,16 @@ extern unsigned char boot_edid_info[128];
* Page-frame information array.
* 0xffff830000000000 - 0xffff87ffffffffff [5TB, 5*2^40 bytes, PML4:262-271]
* 1:1 direct mapping of all physical memory.
+#else
+ * 0xffff82d0c0000000 - 0xffff82ffdfffffff [188.5GB, PML4:261]
+ * Reserved for future use.
+ * 0xffff82ffe0000000 - 0xffff82ffffffffff [512MB, 2^29 bytes, PML4:261]
+ * Super-page information array.
+ * 0xffff830000000000 - 0xffff847fffffffff [1.5TB, 3*2^39 bytes, PML4:262-264]
+ * Page-frame information array.
+ * 0xffff848000000000 - 0xffff87ffffffffff [3.5TB, 7*2^39 bytes, PML4:265-271]
+ * 1:1 direct mapping of all physical memory.
+#endif
* 0xffff880000000000 - 0xffffffffffffffff [120TB, PML4:272-511]
* PV: Guest-defined use.
* 0xffff880000000000 - 0xffffff7fffffffff [119.5TB, PML4:272-510]
@@ -243,21 +249,35 @@ extern unsigned char boot_edid_info[128];
/* Slot 261: xen text, static data and bss (1GB). */
#define XEN_VIRT_START (HIRO_COMPAT_MPT_VIRT_END)
#define XEN_VIRT_END (XEN_VIRT_START + GB(1))
-/* Slot 261: superpage information array (64MB). */
+
+/* Slot 261: superpage information array (64MB or 512MB). */
#define SPAGETABLE_VIRT_END FRAMETABLE_VIRT_START
#define SPAGETABLE_NR (((FRAMETABLE_NR - 1) >> (SUPERPAGE_SHIFT - \
PAGE_SHIFT)) + 1)
#define SPAGETABLE_SIZE (SPAGETABLE_NR * sizeof(struct spage_info))
#define SPAGETABLE_VIRT_START ((SPAGETABLE_VIRT_END - SPAGETABLE_SIZE) & \
(_AC(-1,UL) << SUPERPAGE_SHIFT))
+
+#ifndef CONFIG_BIGMEM
/* Slot 261: page-frame information array (128GB). */
-#define FRAMETABLE_VIRT_END DIRECTMAP_VIRT_START
#define FRAMETABLE_SIZE GB(128)
+#else
+/* Slot 262-264: page-frame information array (1.5TB). */
+#define FRAMETABLE_SIZE GB(1536)
+#endif
+#define FRAMETABLE_VIRT_END DIRECTMAP_VIRT_START
#define FRAMETABLE_NR (FRAMETABLE_SIZE / sizeof(*frame_table))
#define FRAMETABLE_VIRT_START (FRAMETABLE_VIRT_END - FRAMETABLE_SIZE)
+
+#ifndef CONFIG_BIGMEM
/* Slot 262-271/510: A direct 1:1 mapping of all of physical memory. */
#define DIRECTMAP_VIRT_START (PML4_ADDR(262))
#define DIRECTMAP_SIZE (PML4_ENTRY_BYTES * (511 - 262))
+#else
+/* Slot 265-271/510: A direct 1:1 mapping of all of physical memory. */
+#define DIRECTMAP_VIRT_START (PML4_ADDR(265))
+#define DIRECTMAP_SIZE (PML4_ENTRY_BYTES * (511 - 265))
+#endif
#define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + DIRECTMAP_SIZE)
#ifndef __ASSEMBLY__
@@ -327,6 +347,15 @@ extern unsigned long xen_phys_start;
#define ARG_XLAT_START(v) \
(ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
+#define NATIVE_VM_ASSIST_VALID ((1UL << VMASST_TYPE_4gb_segments) | \
+ (1UL << VMASST_TYPE_4gb_segments_notify) | \
+ (1UL << VMASST_TYPE_writable_pagetables) | \
+ (1UL << VMASST_TYPE_pae_extended_cr3) | \
+ (1UL << VMASST_TYPE_m2p_strict))
+#define VM_ASSIST_VALID NATIVE_VM_ASSIST_VALID
+#define COMPAT_VM_ASSIST_VALID (NATIVE_VM_ASSIST_VALID & \
+ ((1UL << COMPAT_BITS_PER_LONG) - 1))
+
#define ELFSIZE 64
#define ARCH_CRASH_SAVE_VMCOREINFO
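A quick sanity check on the CONFIG_BIGMEM frame-table sizing above, assuming
sizeof(struct page_info) == 32 and 4KiB pages (both assumptions, not values
from this header). On those numbers the frame table alone bounds describable
RAM at roughly 16TB by default and 192TB with CONFIG_BIGMEM; other limits,
such as the direct map, still apply first:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long gb = 1ULL << 30;
        unsigned long long sizes[] = { 128 * gb, 1536 * gb };
        const char *names[] = { "default", "CONFIG_BIGMEM" };
        int i;

        for ( i = 0; i < 2; i++ )
        {
            unsigned long long nr = sizes[i] / 32;   /* one page_info each */
            unsigned long long phys = nr * 4096;     /* one 4KiB frame each */

            printf("%s: %lluGB frame table covers %lluTB of RAM\n",
                   names[i], sizes[i] / gb, phys >> 40);
        }
        return 0;
    }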
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index d3bd14d..9a01563 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -5,10 +5,8 @@
*/
#ifndef __ASM_I386_CPUFEATURE_H
+#ifndef X86_FEATURES_ONLY
#define __ASM_I386_CPUFEATURE_H
-
-#ifndef __ASSEMBLY__
-#include <xen/bitops.h>
#endif
#define NCAPINTS 8 /* N 32-bit words worth of info */
@@ -151,11 +149,14 @@
#define X86_FEATURE_CMT (7*32+12) /* Cache Monitoring Technology */
#define X86_FEATURE_NO_FPU_SEL (7*32+13) /* FPU CS/DS stored as zero */
#define X86_FEATURE_MPX (7*32+14) /* Memory Protection Extensions */
+#define X86_FEATURE_CAT (7*32+15) /* Cache Allocation Technology */
#define X86_FEATURE_RDSEED (7*32+18) /* RDSEED instruction */
#define X86_FEATURE_ADX (7*32+19) /* ADCX, ADOX instructions */
#define X86_FEATURE_SMAP (7*32+20) /* Supervisor Mode Access Prevention */
-#ifndef __ASSEMBLY__
+#if !defined(__ASSEMBLY__) && !defined(X86_FEATURES_ONLY)
+#include <xen/bitops.h>
+
#define cpu_has(c, bit) test_bit(bit, (c)->x86_capability)
#define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability)
#define cpufeat_mask(idx) (1u << ((idx) & 31))
@@ -216,6 +217,8 @@
#define cpu_has_cpuid_faulting boot_cpu_has(X86_FEATURE_CPUID_FAULTING)
+#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
+
enum _cache_type {
CACHE_TYPE_NULL = 0,
CACHE_TYPE_DATA = 1,
@@ -262,6 +265,8 @@ struct cpuid4_info {
int cpuid4_cache_lookup(int index, struct cpuid4_info *this_leaf);
#endif
+#undef X86_FEATURES_ONLY
+
#endif /* __ASM_I386_CPUFEATURE_H */
/*
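The feature values above encode word*32 + bit, so a definition such as
X86_FEATURE_CAT (7*32+15) lands in capability word 7, bit 15. A small
standalone check (cpufeat_word() is assumed here as the natural companion to
the header's cpufeat_mask()):

    #include <assert.h>

    #define cpufeat_word(idx) ((idx) / 32)
    #define cpufeat_mask(idx) (1u << ((idx) & 31))
    #define X86_FEATURE_CAT   (7 * 32 + 15)

    int main(void)
    {
        assert(cpufeat_word(X86_FEATURE_CAT) == 7);
        assert(cpufeat_mask(X86_FEATURE_CAT) == (1u << 15));
        return 0;
    }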
diff --git a/xen/include/asm-x86/cpuidle.h b/xen/include/asm-x86/cpuidle.h
index 4d70677..46e614b 100644
--- a/xen/include/asm-x86/cpuidle.h
+++ b/xen/include/asm-x86/cpuidle.h
@@ -23,6 +23,8 @@ void acpi_dead_idle(void);
void trace_exit_reason(u32 *irq_traced);
void update_idle_stats(struct acpi_processor_power *,
struct acpi_processor_cx *, uint64_t, uint64_t);
+void update_last_cx_stat(struct acpi_processor_power *,
+ struct acpi_processor_cx *, uint64_t);
/*
* vcpu is urgent if vcpu is polling event channel
diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
index b95fd79..f011d2d 100644
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -12,6 +12,28 @@
#include <public/xen.h>
#include <asm/page.h>
+/*
+ * Xen's cpu stacks are 8 pages (8-page aligned), arranged as:
+ *
+ * 7 - Primary stack (with a struct cpu_info at the top)
+ * 6 - Primary stack
+ * 5 - Optionally not present (MEMORY_GUARD)
+ * 4 - unused
+ * 3 - Syscall trampolines
+ * 2 - MCE IST stack
+ * 1 - NMI IST stack
+ * 0 - Double Fault IST stack
+ */
+
+/*
+ * Identify which stack page the stack pointer is on. Returns an index
+ * as per the comment above.
+ */
+static inline unsigned int get_stack_page(unsigned long sp)
+{
+ return (sp & (STACK_SIZE-1)) >> PAGE_SHIFT;
+}
+
struct vcpu;
struct cpu_info {
@@ -51,13 +73,12 @@ static inline struct cpu_info *get_cpu_info(void)
((unsigned long)&get_cpu_info()->guest_cpu_user_regs.es)
/*
- * Get the bottom-of-stack, as useful for printing stack traces. This is the
- * highest word on the stack which might be part of a stack trace, and is the
- * adjacent word to a struct cpu_info on the stack.
+ * Get the reasonable stack bounds for stack traces and stack dumps. Stack
+ * dumps have a slightly larger range to include exception frames in the
+ * printed information. The returned word is inside the interesting range.
*/
-#define get_printable_stack_bottom(sp) \
- ((sp & (~(STACK_SIZE-1))) + \
- (STACK_SIZE - sizeof(struct cpu_info) - sizeof(unsigned long)))
+unsigned long get_stack_trace_bottom(unsigned long sp);
+unsigned long get_stack_dump_bottom (unsigned long sp);
#define reset_stack_and_jump(__fn) \
({ \
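The get_stack_page() helper above is pure mask-and-shift arithmetic over the
layout in the comment. A standalone illustration, assuming 4KiB pages and
the documented 8-page (32KiB) stack:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define STACK_SIZE (8ULL << PAGE_SHIFT)   /* 8 x 4KiB pages */

    static unsigned int get_stack_page(uint64_t sp)
    {
        return (sp & (STACK_SIZE - 1)) >> PAGE_SHIFT;
    }

    int main(void)
    {
        uint64_t base = 0xffff830000100000ULL;   /* a 32KiB-aligned base */

        assert(get_stack_page(base) == 0);                  /* #DF IST */
        assert(get_stack_page(base + STACK_SIZE - 1) == 7); /* primary */
        return 0;
    }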
diff --git a/xen/include/asm-x86/debugger.h b/xen/include/asm-x86/debugger.h
index 0408bec..33f4700 100644
--- a/xen/include/asm-x86/debugger.h
+++ b/xen/include/asm-x86/debugger.h
@@ -82,9 +82,8 @@ static inline int debugger_trap_entry(
return 0;
}
-typedef unsigned long dbgva_t;
-typedef unsigned char dbgbyte_t;
-extern int dbg_rw_mem(dbgva_t addr, dbgbyte_t *buf, int len,
- domid_t domid, int toaddr, uint64_t pgd3);
+unsigned int dbg_rw_mem(void * __user addr, void * __user buf,
+ unsigned int len, domid_t domid, bool_t toaddr,
+ uint64_t pgd3);
#endif /* __X86_DEBUGGER_H__ */
diff --git a/xen/include/asm-x86/debugreg.h b/xen/include/asm-x86/debugreg.h
index a5b2838..c57914e 100644
--- a/xen/include/asm-x86/debugreg.h
+++ b/xen/include/asm-x86/debugreg.h
@@ -21,6 +21,8 @@
#define DR_STEP (0x4000) /* single-step */
#define DR_SWITCH (0x8000) /* task switch */
#define DR_NOT_RTM (0x10000) /* clear: #BP inside RTM region */
+#define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */
+#define DR_STATUS_RESERVED_ONE 0xffff0ff0ul /* Reserved, read as one */
/* Now define a bunch of things for manipulating the control register.
The top two bytes of the control register consist of 4 fields of 4
diff --git a/xen/include/asm-x86/desc.h b/xen/include/asm-x86/desc.h
index 225913a..0e2d97f 100644
--- a/xen/include/asm-x86/desc.h
+++ b/xen/include/asm-x86/desc.h
@@ -47,17 +47,7 @@
(sel) = (((sel) & 3) >= _rpl) ? (sel) : (((sel) & ~3) | _rpl); \
})
-/* Stack selectors don't need fixing up if the kernel runs in ring 0. */
-#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
-#define fixup_guest_stack_selector(d, ss) ((void)0)
-#else
#define fixup_guest_stack_selector(d, ss) __fixup_guest_selector(d, ss)
-#endif
-
-/*
- * Code selectors are always fixed up. It allows the Xen exit stub to detect
- * return to guest context, even when the guest kernel runs in ring 0.
- */
#define fixup_guest_code_selector(d, cs) __fixup_guest_selector(d, cs)
/*
@@ -75,7 +65,7 @@
*/
#define guest_gate_selector_okay(d, sel) \
((((sel)>>3) < FIRST_RESERVED_GDT_ENTRY) || /* Guest seg? */ \
- ((sel) == (!is_pv_32on64_domain(d) ? \
+ ((sel) == (!is_pv_32bit_domain(d) ? \
FLAT_KERNEL_CS : /* Xen default seg? */ \
FLAT_COMPAT_KERNEL_CS)) || \
((sel) & 4)) /* LDT seg? */
diff --git a/xen/include/asm-x86/device.h b/xen/include/asm-x86/device.h
new file mode 100644
index 0000000..f2acc7e
--- /dev/null
+++ b/xen/include/asm-x86/device.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_X86_DEVICE_H
+#define __ASM_X86_DEVICE_H
+
+#include <xen/pci.h>
+
+/*
+ * x86 only supports PCI. Therefore it's possible to directly use
+ * pci_dev to avoid adding a new field.
+ */
+
+typedef struct pci_dev device_t;
+
+#define dev_is_pci(dev) ((void)(dev), 1)
+#define pci_to_dev(pci) (pci)
+
+#endif /* __ASM_X86_DEVICE_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
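The point of the new header is that arch-neutral callers can take a device_t
without caring that on x86 it is always a PCI device. A hypothetical caller,
with stub types standing in for xen/pci.h and an invented assign_device()
purely for illustration:

    #include <stdio.h>

    struct pci_dev { unsigned int seg, bus, devfn; };   /* stand-in type */
    typedef struct pci_dev device_t;
    #define dev_is_pci(dev) ((void)(dev), 1)
    #define pci_to_dev(pci) (pci)

    static int assign_device(device_t *dev)   /* illustrative, not Xen API */
    {
        if ( !dev_is_pci(dev) )
            return -1;                        /* never taken on x86 */
        printf("assigning %04x:%02x:%02x\n",
               dev->seg, dev->bus, dev->devfn >> 3);
        return 0;
    }

    int main(void)
    {
        struct pci_dev pdev = { 0, 3, 0x10 };

        return assign_device(pci_to_dev(&pdev));
    }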
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 6a77a93..0fce09e 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -9,12 +9,11 @@
#include <asm/e820.h>
#include <asm/mce.h>
#include <public/vcpu.h>
+#include <public/hvm/hvm_info_table.h>
#define has_32bit_shinfo(d) ((d)->arch.has_32bit_shinfo)
#define is_pv_32bit_domain(d) ((d)->arch.is_32bit_pv)
#define is_pv_32bit_vcpu(v) (is_pv_32bit_domain((v)->domain))
-#define is_pv_32on64_domain(d) (is_pv_32bit_domain(d))
-#define is_pv_32on64_vcpu(v) (is_pv_32on64_domain((v)->domain))
#define is_hvm_pv_evtchn_domain(d) (has_hvm_container_domain(d) && \
d->arch.hvm_domain.irq.callback_via_type == HVMIRQ_callback_vector)
@@ -87,6 +86,7 @@ void hypercall_page_initialise(struct domain *d, void *);
/* shadow paging extension */
/************************************************/
struct shadow_domain {
+#ifdef CONFIG_SHADOW_PAGING
unsigned int opt_flags; /* runtime tunable optimizations on/off */
struct page_list_head pinned_shadows;
@@ -116,9 +116,11 @@ struct shadow_domain {
/* Has this domain ever used HVMOP_pagetable_dying? */
bool_t pagetable_dying_op;
+#endif
};
struct shadow_vcpu {
+#ifdef CONFIG_SHADOW_PAGING
/* PAE guests: per-vcpu shadow top-level table */
l3_pgentry_t l3table[4] __attribute__((__aligned__(32)));
/* PAE guests: per-vcpu cache of the top-level *guest* entries */
@@ -144,6 +146,7 @@ struct shadow_vcpu {
} oos_fixup[SHADOW_OOS_PAGES];
bool_t pagetable_dying;
+#endif
};
/************************************************/
@@ -181,6 +184,8 @@ struct paging_domain {
/* flags to control paging operation */
u32 mode;
+ /* Has that pool ever run out of memory? */
+ bool_t p2m_alloc_failed;
/* extension for shadow paging support */
struct shadow_domain shadow;
/* extension for hardware-assisted paging */
@@ -205,8 +210,6 @@ struct paging_domain {
* (used by p2m and log-dirty code for their tries) */
struct page_info * (*alloc_page)(struct domain *d);
void (*free_page)(struct domain *d, struct page_info *pg);
- /* Has that pool ever run out of memory? */
- bool_t p2m_alloc_failed;
};
struct paging_vcpu {
@@ -230,6 +233,10 @@ struct paging_vcpu {
typedef xen_domctl_cpuid_t cpuid_input_t;
#define MAX_NESTEDP2M 10
+
+#define MAX_ALTP2M 10 /* arbitrary */
+#define INVALID_ALTP2M 0xffff
+#define MAX_EPTP (PAGE_SIZE / sizeof(uint64_t))
struct p2m_domain;
struct time_scale {
int shift;
@@ -244,19 +251,37 @@ struct pv_domain
struct mapcache_domain mapcache;
};
+struct monitor_write_data {
+ struct {
+ unsigned int msr : 1;
+ unsigned int cr0 : 1;
+ unsigned int cr3 : 1;
+ unsigned int cr4 : 1;
+ } do_write;
+
+ uint32_t msr;
+ uint64_t value;
+ uint64_t cr0;
+ uint64_t cr3;
+ uint64_t cr4;
+};
+
struct arch_domain
{
struct page_info *perdomain_l3_pg;
unsigned int hv_compat_vstart;
- bool_t s3_integrity;
+ /* Maximum physical-address bitwidth supported by this guest. */
+ unsigned int physaddr_bitsize;
/* I/O-port admin-specified access capabilities. */
struct rangeset *ioport_caps;
uint32_t pci_cf8;
uint8_t cmos_idx;
+ bool_t s3_integrity;
+
struct list_head pdev_list;
union {
@@ -270,34 +295,46 @@ struct arch_domain
* page_alloc lock */
int page_alloc_unlock_level;
+ /* Continuable domain_relinquish_resources(). */
+ enum {
+ RELMEM_not_started,
+ RELMEM_shared,
+ RELMEM_xen,
+ RELMEM_l4,
+ RELMEM_l3,
+ RELMEM_l2,
+ RELMEM_done,
+ } relmem;
+ struct page_list_head relmem_list;
+
/* nestedhvm: translate l2 guest physical to host physical */
struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
mm_lock_t nested_p2m_lock;
+ /* altp2m: allow multiple copies of host p2m */
+ bool_t altp2m_active;
+ struct p2m_domain *altp2m_p2m[MAX_ALTP2M];
+ mm_lock_t altp2m_list_lock;
+ uint64_t *altp2m_eptp;
+
/* NB. protected by d->event_lock and by irq_desc[irq].lock */
struct radix_tree_root irq_pirq;
- /* Maximum physical-address bitwidth supported by this guest. */
- unsigned int physaddr_bitsize;
-
/* Is a 32-bit PV (non-HVM) guest? */
bool_t is_32bit_pv;
/* Is shared-info page in 32-bit format? */
bool_t has_32bit_shinfo;
+
/* Domain cannot handle spurious page faults? */
bool_t suppress_spurious_page_faults;
- /* Continuable domain_relinquish_resources(). */
- enum {
- RELMEM_not_started,
- RELMEM_shared,
- RELMEM_xen,
- RELMEM_l4,
- RELMEM_l3,
- RELMEM_l2,
- RELMEM_done,
- } relmem;
- struct page_list_head relmem_list;
+ /* Is PHYSDEVOP_eoi to automatically unmask the event channel? */
+ bool_t auto_unmask;
+
+ /* Values snooped from updates to cpuids[] (below). */
+ u8 x86; /* CPU family */
+ u8 x86_vendor; /* CPU vendor */
+ u8 x86_model; /* CPU model */
cpuid_input_t *cpuids;
@@ -314,22 +351,42 @@ struct arch_domain
struct time_scale ns_to_vtsc; /* scaling for certain emulated cases */
uint32_t incarnation; /* incremented every restore or live migrate
(possibly other cases in the future) */
- uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */
- uint64_t vtsc_usercount; /* not used for hvm */
+#if !defined(NDEBUG) || defined(PERF_COUNTERS)
+ uint64_t vtsc_kerncount;
+ uint64_t vtsc_usercount;
+#endif
/* Pseudophysical e820 map (XENMEM_memory_map). */
spinlock_t e820_lock;
struct e820entry *e820;
unsigned int nr_e820;
- /* set auto_unmask to 1 if you want PHYSDEVOP_eoi to automatically
- * unmask the event channel */
- bool_t auto_unmask;
+ /* RMID assigned to the domain for CMT */
+ unsigned int psr_rmid;
+ /* COS assigned to the domain for each socket */
+ unsigned int *psr_cos_ids;
+
/* Shared page for notifying that explicit PIRQ EOI is required. */
unsigned long *pirq_eoi_map;
unsigned long pirq_eoi_map_mfn;
- unsigned int psr_rmid; /* RMID assigned to the domain for CMT */
+ /* Monitor options */
+ struct {
+ unsigned int write_ctrlreg_enabled : 4;
+ unsigned int write_ctrlreg_sync : 4;
+ unsigned int write_ctrlreg_onchangeonly : 4;
+ unsigned int mov_to_msr_enabled : 1;
+ unsigned int mov_to_msr_extended : 1;
+ unsigned int singlestep_enabled : 1;
+ unsigned int software_breakpoint_enabled : 1;
+ unsigned int guest_request_enabled : 1;
+ unsigned int guest_request_sync : 1;
+ } monitor;
+
+ /* Mem_access emulation control */
+ bool_t mem_access_emulate_enabled;
+
+ struct monitor_write_data *event_write_data;
} __cacheline_aligned;
#define has_arch_pdevs(d) (!list_empty(&(d)->arch.pdev_list))
@@ -364,8 +421,6 @@ struct pv_vcpu
};
};
- unsigned long vm_assist;
-
unsigned long syscall32_callback_eip;
unsigned long sysenter_callback_eip;
unsigned short syscall32_callback_cs;
@@ -426,6 +481,8 @@ struct arch_vcpu
void (*ctxt_switch_from) (struct vcpu *);
void (*ctxt_switch_to) (struct vcpu *);
+ struct vpmu_struct vpmu;
+
/* Virtual Machine Extensions */
union {
struct pv_vcpu pv_vcpu;
@@ -478,15 +535,15 @@ struct arch_vcpu
/*
* Should we emulate the next matching instruction on VCPU resume
- * after a mem_event?
+ * after a vm_event?
*/
struct {
uint32_t emulate_flags;
unsigned long gpa;
unsigned long eip;
- } mem_event;
-
-} __cacheline_aligned;
+ struct vm_event_emul_read_data *emul_read_data;
+ } vm_event;
+};
smap_check_policy_t smap_policy_change(struct vcpu *v,
smap_check_policy_t new_policy);
@@ -527,6 +584,8 @@ void domain_cpuid(struct domain *d,
unsigned int *ecx,
unsigned int *edx);
+#define domain_max_vcpus(d) (is_hvm_domain(d) ? HVM_MAX_VCPUS : MAX_VIRT_CPUS)
+
#endif /* __ASM_DOMAIN_H__ */
/*
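The 4-bit monitor fields above act as per-control-register bit masks, one
bit per CR index. A minimal sketch of subscribing to CR3 writes (the index
value is an assumption standing in for the public VM_EVENT_X86_CR3
constant):

    #include <assert.h>

    struct monitor {
        unsigned int write_ctrlreg_enabled : 4;
        unsigned int write_ctrlreg_sync : 4;
        unsigned int write_ctrlreg_onchangeonly : 4;
    };

    int main(void)
    {
        struct monitor m = { 0 };
        unsigned int cr3 = 1;   /* assumed VM_EVENT_X86_CR3 */

        m.write_ctrlreg_enabled |= 1u << cr3;     /* subscribe */
        assert(m.write_ctrlreg_enabled & (1u << cr3));
        assert(!(m.write_ctrlreg_sync & (1u << cr3)));
        return 0;
    }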
diff --git a/xen/include/asm-x86/fixmap.h b/xen/include/asm-x86/fixmap.h
index ebb3207..70eadff 100644
--- a/xen/include/asm-x86/fixmap.h
+++ b/xen/include/asm-x86/fixmap.h
@@ -78,6 +78,8 @@ extern void __set_fixmap(
#define set_fixmap_nocache(idx, phys) \
__set_fixmap(idx, (phys)>>PAGE_SHIFT, PAGE_HYPERVISOR_NOCACHE)
+#define clear_fixmap(idx) __set_fixmap(idx, 0, 0)
+
#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h
index d2a8250..f8a0d76 100644
--- a/xen/include/asm-x86/guest_pt.h
+++ b/xen/include/asm-x86/guest_pt.h
@@ -22,8 +22,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XEN_ASM_GUEST_PT_H
@@ -33,15 +32,6 @@
#error GUEST_PAGING_LEVELS not defined
#endif
-/* Type of the guest's frame numbers */
-TYPE_SAFE(unsigned long,gfn)
-#define PRI_gfn "05lx"
-
-#ifndef gfn_t
-#define gfn_t /* Grep fodder: gfn_t, _gfn() and gfn_x() are defined above */
-#undef gfn_t
-#endif
-
#define VALID_GFN(m) (m != INVALID_GFN)
static inline int
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index 7876527..c613836 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -19,8 +19,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XEN_HAP_H
@@ -32,21 +31,6 @@
printk("hap error: %s(): " _f, __func__, ##_a)
/************************************************/
-/* hap domain page mapping */
-/************************************************/
-static inline void *
-hap_map_domain_page(mfn_t mfn)
-{
- return map_domain_page(mfn_x(mfn));
-}
-
-static inline void
-hap_unmap_domain_page(void *p)
-{
- unmap_domain_page(p);
-}
-
-/************************************************/
/* hap domain level functions */
/************************************************/
void hap_domain_init(struct domain *d);
@@ -54,7 +38,7 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
XEN_GUEST_HANDLE_PARAM(void) u_domctl);
int hap_enable(struct domain *d, u32 mode);
void hap_final_teardown(struct domain *d);
-void hap_teardown(struct domain *d);
+void hap_teardown(struct domain *d, int *preempted);
void hap_vcpu_init(struct vcpu *v);
int hap_track_dirty_vram(struct domain *d,
unsigned long begin_pfn,
diff --git a/xen/include/asm-x86/hpet.h b/xen/include/asm-x86/hpet.h
index 875f1de..10c4a56 100644
--- a/xen/include/asm-x86/hpet.h
+++ b/xen/include/asm-x86/hpet.h
@@ -52,6 +52,7 @@
extern unsigned long hpet_address;
extern u8 hpet_blockid;
+extern u8 hpet_flags;
/*
* Detect and initialise HPET hardware: return counter update frequency.
diff --git a/xen/include/asm-x86/hvm/asid.h b/xen/include/asm-x86/hvm/asid.h
index 795b0e1..00e37c4 100644
--- a/xen/include/asm-x86/hvm/asid.h
+++ b/xen/include/asm-x86/hvm/asid.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_ASID_H__
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 0702bf5..992d5d1 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_DOMAIN_H__
@@ -46,6 +45,7 @@ struct hvm_ioreq_vcpu {
struct list_head list_entry;
struct vcpu *vcpu;
evtchn_port_t ioreq_evtchn;
+ bool_t pending;
};
#define NR_IO_RANGE_TYPES (HVMOP_IO_RANGE_PCI + 1)
@@ -70,6 +70,7 @@ struct hvm_ioreq_server {
evtchn_port_t bufioreq_evtchn;
struct rangeset *range[NR_IO_RANGE_TYPES];
bool_t enabled;
+ bool_t bufioreq_atomic;
};
struct hvm_domain {
@@ -83,7 +84,6 @@ struct hvm_domain {
struct {
spinlock_t lock;
ioservid_t id;
- bool_t waiting;
struct list_head list;
} ioreq_server;
struct hvm_ioreq_server *default_ioreq_server;
@@ -94,6 +94,7 @@ struct hvm_domain {
struct pl_time pl_time;
struct hvm_io_handler *io_handler;
+ unsigned int io_handler_count;
/* Lock protects access to irq, vpic and vioapic. */
spinlock_t irq_lock;
@@ -135,7 +136,6 @@ struct hvm_domain {
bool_t mem_sharing_enabled;
bool_t qemu_mapcache_invalidate;
bool_t is_s3_suspended;
- bool_t introspection_enabled;
/*
* TSC value that VCPUs use to calculate their tsc_offset value.
@@ -143,6 +143,8 @@ struct hvm_domain {
*/
uint64_t sync_tsc;
+ unsigned long *io_bitmap;
+
union {
struct vmx_domain vmx;
struct svm_domain svm;
diff --git a/xen/include/asm-x86/hvm/emulate.h b/xen/include/asm-x86/hvm/emulate.h
index 5411302..49134b5 100644
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -32,13 +32,21 @@ struct hvm_emulate_ctxt {
struct hvm_trap trap;
uint32_t intr_shadow;
+
+ bool_t set_context;
+};
+
+enum emul_kind {
+ EMUL_KIND_NORMAL,
+ EMUL_KIND_NOWRITE,
+ EMUL_KIND_SET_CONTEXT
};
int hvm_emulate_one(
struct hvm_emulate_ctxt *hvmemul_ctxt);
int hvm_emulate_one_no_write(
struct hvm_emulate_ctxt *hvmemul_ctxt);
-void hvm_mem_event_emulate_one(bool_t nowrite,
+void hvm_mem_access_emulate_one(enum emul_kind kind,
unsigned int trapnr,
unsigned int errcode);
void hvm_emulate_prepare(
@@ -50,11 +58,22 @@ struct segment_register *hvmemul_get_seg_reg(
enum x86_segment seg,
struct hvm_emulate_ctxt *hvmemul_ctxt);
-int hvmemul_do_pio(
- unsigned long port, unsigned long *reps, int size,
- paddr_t ram_gpa, int dir, int df, void *p_data);
+int hvmemul_do_pio_buffer(uint16_t port,
+ unsigned int size,
+ uint8_t dir,
+ void *buffer);
void hvm_dump_emulation_state(const char *prefix,
struct hvm_emulate_ctxt *hvmemul_ctxt);
#endif /* __ASM_X86_HVM_EMULATE_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/hvm/event.h b/xen/include/asm-x86/hvm/event.h
new file mode 100644
index 0000000..e07f329
--- /dev/null
+++ b/xen/include/asm-x86/hvm/event.h
@@ -0,0 +1,45 @@
+/*
+ * event.h: Hardware virtual machine assist events.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ASM_X86_HVM_EVENT_H__
+#define __ASM_X86_HVM_EVENT_H__
+
+/*
+ * Called for current VCPU on crX/MSR changes by guest.
+ * The event might not fire if the client has subscribed to it in onchangeonly
+ * mode, hence the bool_t return type for control register write events.
+ */
+bool_t hvm_event_cr(unsigned int index, unsigned long value,
+ unsigned long old);
+#define hvm_event_crX(what, new, old) \
+ hvm_event_cr(VM_EVENT_X86_##what, new, old)
+void hvm_event_msr(unsigned int msr, uint64_t value);
+/* Called for current VCPU: returns -1 if no listener */
+int hvm_event_int3(unsigned long gla);
+int hvm_event_single_step(unsigned long gla);
+void hvm_event_guest_request(void);
+
+#endif /* __ASM_X86_HVM_EVENT_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
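The hvm_event_crX() wrapper above is plain token pasting:
hvm_event_crX(CR3, new, old) expands to hvm_event_cr(VM_EVENT_X86_CR3, new,
old). A compile-and-run demo with a stub hvm_event_cr() and placeholder
VM_EVENT_X86_* values (the real ones live in the public vm_event header):

    #include <stdio.h>

    typedef int bool_t;
    enum { VM_EVENT_X86_CR0, VM_EVENT_X86_CR3, VM_EVENT_X86_CR4 };

    static bool_t hvm_event_cr(unsigned int index, unsigned long value,
                               unsigned long old)
    {
        printf("cr index %u: %#lx -> %#lx\n", index, old, value);
        return 1;   /* stub: pretend a listener consumed the event */
    }

    #define hvm_event_crX(what, new, old) \
        hvm_event_cr(VM_EVENT_X86_##what, new, old)

    int main(void)
    {
        hvm_event_crX(CR3, 0x1000, 0x2000);
        return 0;
    }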
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index e3d2d9a..68b216c 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_HVM_H__
@@ -94,6 +93,9 @@ struct hvm_function_table {
/* Necessary hardware support for PVH mode? */
int pvh_supported;
+ /* Necessary hardware support for alternate p2m's? */
+ bool_t altp2m_supported;
+
/* Indicate HAP capabilities. */
int hap_capabilities;
@@ -164,6 +166,7 @@ struct hvm_function_table {
int (*msr_read_intercept)(unsigned int msr, uint64_t *msr_content);
int (*msr_write_intercept)(unsigned int msr, uint64_t msr_content);
void (*invlpg_intercept)(unsigned long vaddr);
+ int (*vmfunc_intercept)(struct cpu_user_regs *regs);
void (*handle_cd)(struct vcpu *v, unsigned long value);
void (*set_info_guest)(struct vcpu *v);
void (*set_rdtsc_exiting)(struct vcpu *v, bool_t);
@@ -172,16 +175,11 @@ struct hvm_function_table {
int (*nhvm_vcpu_initialise)(struct vcpu *v);
void (*nhvm_vcpu_destroy)(struct vcpu *v);
int (*nhvm_vcpu_reset)(struct vcpu *v);
- int (*nhvm_vcpu_hostrestore)(struct vcpu *v,
- struct cpu_user_regs *regs);
- int (*nhvm_vcpu_vmexit)(struct vcpu *v, struct cpu_user_regs *regs,
- uint64_t exitcode);
int (*nhvm_vcpu_vmexit_trap)(struct vcpu *v, struct hvm_trap *trap);
- uint64_t (*nhvm_vcpu_guestcr3)(struct vcpu *v);
uint64_t (*nhvm_vcpu_p2m_base)(struct vcpu *v);
- uint32_t (*nhvm_vcpu_asid)(struct vcpu *v);
- int (*nhvm_vmcx_guest_intercepts_trap)(struct vcpu *v,
- unsigned int trapnr, int errcode);
+ bool_t (*nhvm_vmcx_guest_intercepts_trap)(struct vcpu *v,
+ unsigned int trapnr,
+ int errcode);
bool_t (*nhvm_vmcx_hap_enabled)(struct vcpu *v);
@@ -207,6 +205,13 @@ struct hvm_function_table {
uint32_t *ecx, uint32_t *edx);
void (*enable_msr_exit_interception)(struct domain *d);
+ bool_t (*is_singlestep_supported)(void);
+
+ /* Alternate p2m */
+ void (*altp2m_vcpu_update_p2m)(struct vcpu *v);
+ void (*altp2m_vcpu_update_vmfunc_ve)(struct vcpu *v);
+ bool_t (*altp2m_vcpu_emulate_ve)(struct vcpu *v);
+ int (*altp2m_vcpu_emulate_vmfunc)(struct cpu_user_regs *regs);
};
extern struct hvm_function_table hvm_funcs;
@@ -228,8 +233,10 @@ int hvm_vcpu_cacheattr_init(struct vcpu *v);
void hvm_vcpu_cacheattr_destroy(struct vcpu *v);
void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip);
-bool_t hvm_send_assist_req(ioreq_t *p);
-void hvm_broadcast_assist_req(ioreq_t *p);
+struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
+ ioreq_t *p);
+int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *p, bool_t buffered);
+unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool_t buffered);
void hvm_get_guest_pat(struct vcpu *v, u64 *guest_pat);
int hvm_set_guest_pat(struct vcpu *v, u64 guest_pat);
@@ -359,7 +366,6 @@ void hvm_hypervisor_cpuid_leaf(uint32_t sub_idx,
void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx);
void hvm_migrate_timers(struct vcpu *v);
-bool_t hvm_has_dm(struct domain *d);
bool_t hvm_io_pending(struct vcpu *v);
void hvm_do_resume(struct vcpu *v);
void hvm_migrate_pirqs(struct vcpu *v);
@@ -434,7 +440,8 @@ int hvm_virtual_to_linear_addr(
unsigned int addr_size,
unsigned long *linear_addr);
-void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent);
+void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent,
+ bool_t *writable);
void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent);
void hvm_unmap_guest_frame(void *p, bool_t permanent);
@@ -446,6 +453,9 @@ static inline void hvm_set_info_guest(struct vcpu *v)
int hvm_debug_op(struct vcpu *v, int32_t op);
+/* Caller should pause vcpu before calling this function */
+void hvm_toggle_singlestep(struct vcpu *v);
+
static inline void hvm_invalidate_regs_fields(struct cpu_user_regs *regs)
{
#ifndef NDEBUG
@@ -473,50 +483,68 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
int hvm_x2apic_msr_read(struct vcpu *v, unsigned int msr, uint64_t *msr_content);
int hvm_x2apic_msr_write(struct vcpu *v, unsigned int msr, uint64_t msr_content);
-/* Called for current VCPU on crX changes by guest */
-void hvm_memory_event_cr0(unsigned long value, unsigned long old);
-void hvm_memory_event_cr3(unsigned long value, unsigned long old);
-void hvm_memory_event_cr4(unsigned long value, unsigned long old);
-void hvm_memory_event_msr(unsigned long msr, unsigned long value);
-/* Called for current VCPU on int3: returns -1 if no listener */
-int hvm_memory_event_int3(unsigned long gla);
-
-/* Called for current VCPU on single step: returns -1 if no listener */
-int hvm_memory_event_single_step(unsigned long gla);
-
/*
* Nested HVM
*/
-/* Restores l1 guest state */
-int nhvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs);
-/* Fill l1 guest's VMCB/VMCS with data provided by generic exit codes
- * (do conversion as needed), other misc SVM/VMX specific tweaks to make
- * it work */
-int nhvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs,
- uint64_t exitcode);
/* inject vmexit into l1 guest. l1 guest will see a VMEXIT due to
* 'trapnr' exception.
*/
-int nhvm_vcpu_vmexit_trap(struct vcpu *v, struct hvm_trap *trap);
+static inline int nhvm_vcpu_vmexit_trap(struct vcpu *v, struct hvm_trap *trap)
+{
+ return hvm_funcs.nhvm_vcpu_vmexit_trap(v, trap);
+}
-/* returns l2 guest cr3 in l2 guest physical address space. */
-uint64_t nhvm_vcpu_guestcr3(struct vcpu *v);
/* returns l1 guest's cr3 that points to the page table used to
* translate l2 guest physical address to l1 guest physical address.
*/
-uint64_t nhvm_vcpu_p2m_base(struct vcpu *v);
-/* returns the asid number l1 guest wants to use to run the l2 guest */
-uint32_t nhvm_vcpu_asid(struct vcpu *v);
+static inline uint64_t nhvm_vcpu_p2m_base(struct vcpu *v)
+{
+ return hvm_funcs.nhvm_vcpu_p2m_base(v);
+}
/* returns true, when l1 guest intercepts the specified trap */
-int nhvm_vmcx_guest_intercepts_trap(struct vcpu *v,
- unsigned int trapnr, int errcode);
+static inline bool_t nhvm_vmcx_guest_intercepts_trap(struct vcpu *v,
+ unsigned int trap,
+ int errcode)
+{
+ return hvm_funcs.nhvm_vmcx_guest_intercepts_trap(v, trap, errcode);
+}
/* returns true when l1 guest wants to use hap to run l2 guest */
-bool_t nhvm_vmcx_hap_enabled(struct vcpu *v);
+static inline bool_t nhvm_vmcx_hap_enabled(struct vcpu *v)
+{
+ return hvm_funcs.nhvm_vmcx_hap_enabled(v);
+}
+
/* interrupt */
-enum hvm_intblk nhvm_interrupt_blocked(struct vcpu *v);
+static inline enum hvm_intblk nhvm_interrupt_blocked(struct vcpu *v)
+{
+ return hvm_funcs.nhvm_intr_blocked(v);
+}
+
+static inline bool_t hvm_enable_msr_exit_interception(struct domain *d)
+{
+ if ( hvm_funcs.enable_msr_exit_interception )
+ {
+ hvm_funcs.enable_msr_exit_interception(d);
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline bool_t hvm_is_singlestep_supported(void)
+{
+ return (hvm_funcs.is_singlestep_supported &&
+ hvm_funcs.is_singlestep_supported());
+}
+
+/* returns true if hardware supports alternate p2m's */
+static inline bool_t hvm_altp2m_supported(void)
+{
+ return hvm_funcs.altp2m_supported;
+}
#ifndef NDEBUG
/* Permit use of the Forced Emulation Prefix in HVM guests */
@@ -525,6 +553,15 @@ extern bool_t opt_hvm_fep;
#define opt_hvm_fep 0
#endif
+/* updates the current hardware p2m */
+void altp2m_vcpu_update_p2m(struct vcpu *v);
+
+/* updates VMCS fields related to VMFUNC and #VE */
+void altp2m_vcpu_update_vmfunc_ve(struct vcpu *v);
+
+/* emulates #VE */
+bool_t altp2m_vcpu_emulate_ve(struct vcpu *v);
+
#endif /* __ASM_X86_HVM_HVM_H__ */
/*
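Several of the hunks above replace out-of-line nhvm_* functions with inline
wrappers around hvm_funcs, the same optional-hook pattern used by
hvm_is_singlestep_supported(): test the function pointer, then call it. A
self-contained sketch of that pattern (names invented for illustration):

    #include <stdio.h>

    typedef int bool_t;

    struct hooks {
        bool_t (*is_singlestep_supported)(void);   /* may be NULL */
    };

    static bool_t vmx_singlestep(void) { return 1; }

    static struct hooks hvm_funcs = {
        .is_singlestep_supported = vmx_singlestep,
    };

    static bool_t hvm_is_singlestep_supported(void)
    {
        return hvm_funcs.is_singlestep_supported &&
               hvm_funcs.is_singlestep_supported();
    }

    int main(void)
    {
        printf("singlestep supported: %d\n", hvm_is_singlestep_supported());
        return 0;
    }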
diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h
index 886a9d6..8585a1f 100644
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_IO_H__
@@ -25,96 +24,97 @@
#include <public/hvm/ioreq.h>
#include <public/event_channel.h>
-#define MAX_IO_HANDLER 16
-
-#define HVM_PORTIO 0
-#define HVM_BUFFERED_IO 2
+#define NR_IO_HANDLERS 32
typedef int (*hvm_mmio_read_t)(struct vcpu *v,
unsigned long addr,
- unsigned long length,
+ unsigned int length,
unsigned long *val);
typedef int (*hvm_mmio_write_t)(struct vcpu *v,
unsigned long addr,
- unsigned long length,
+ unsigned int length,
unsigned long val);
typedef int (*hvm_mmio_check_t)(struct vcpu *v, unsigned long addr);
-typedef int (*portio_action_t)(
- int dir, uint32_t port, uint32_t bytes, uint32_t *val);
-typedef int (*mmio_action_t)(ioreq_t *);
-struct io_handler {
- int type;
- unsigned long addr;
- unsigned long size;
- union {
- portio_action_t portio;
- mmio_action_t mmio;
- void *ptr;
- } action;
+struct hvm_mmio_ops {
+ hvm_mmio_check_t check;
+ hvm_mmio_read_t read;
+ hvm_mmio_write_t write;
};
+static inline paddr_t hvm_mmio_first_byte(const ioreq_t *p)
+{
+ return p->df ?
+ p->addr - (p->count - 1ul) * p->size :
+ p->addr;
+}
+
+static inline paddr_t hvm_mmio_last_byte(const ioreq_t *p)
+{
+ unsigned long count = p->count;
+
+ return p->df ?
+ p->addr + p->size - 1:
+ p->addr + (count * p->size) - 1;
+}
+
+typedef int (*portio_action_t)(
+ int dir, unsigned int port, unsigned int bytes, uint32_t *val);
+
struct hvm_io_handler {
- int num_slot;
- struct io_handler hdl_list[MAX_IO_HANDLER];
+ union {
+ struct {
+ const struct hvm_mmio_ops *ops;
+ } mmio;
+ struct {
+ unsigned int port, size;
+ portio_action_t action;
+ } portio;
+ };
+ const struct hvm_io_ops *ops;
+ uint8_t type;
};
-struct hvm_mmio_handler {
- hvm_mmio_check_t check_handler;
- hvm_mmio_read_t read_handler;
- hvm_mmio_write_t write_handler;
+typedef int (*hvm_io_read_t)(const struct hvm_io_handler *,
+ uint64_t addr,
+ uint32_t size,
+ uint64_t *data);
+typedef int (*hvm_io_write_t)(const struct hvm_io_handler *,
+ uint64_t addr,
+ uint32_t size,
+ uint64_t data);
+typedef bool_t (*hvm_io_accept_t)(const struct hvm_io_handler *,
+ const ioreq_t *p);
+typedef void (*hvm_io_complete_t)(const struct hvm_io_handler *);
+
+struct hvm_io_ops {
+ hvm_io_accept_t accept;
+ hvm_io_read_t read;
+ hvm_io_write_t write;
+ hvm_io_complete_t complete;
};
-extern const struct hvm_mmio_handler hpet_mmio_handler;
-extern const struct hvm_mmio_handler vlapic_mmio_handler;
-extern const struct hvm_mmio_handler vioapic_mmio_handler;
-extern const struct hvm_mmio_handler msixtbl_mmio_handler;
-extern const struct hvm_mmio_handler iommu_mmio_handler;
+int hvm_process_io_intercept(const struct hvm_io_handler *handler,
+ ioreq_t *p);
-#define HVM_MMIO_HANDLER_NR 5
+const struct hvm_io_handler *hvm_find_io_handler(ioreq_t *p);
-int hvm_io_intercept(ioreq_t *p, int type);
-void register_io_handler(
- struct domain *d, unsigned long addr, unsigned long size,
- void *action, int type);
-void relocate_io_handler(
- struct domain *d, unsigned long old_addr, unsigned long new_addr,
- unsigned long size, int type);
+int hvm_io_intercept(ioreq_t *p);
-static inline int hvm_portio_intercept(ioreq_t *p)
-{
- return hvm_io_intercept(p, HVM_PORTIO);
-}
-
-static inline int hvm_buffered_io_intercept(ioreq_t *p)
-{
- return hvm_io_intercept(p, HVM_BUFFERED_IO);
-}
+struct hvm_io_handler *hvm_next_io_handler(struct domain *d);
bool_t hvm_mmio_internal(paddr_t gpa);
-int hvm_mmio_intercept(ioreq_t *p);
-int hvm_buffered_io_send(ioreq_t *p);
-static inline void register_portio_handler(
- struct domain *d, unsigned long addr,
- unsigned long size, portio_action_t action)
-{
- register_io_handler(d, addr, size, action, HVM_PORTIO);
-}
+void register_mmio_handler(struct domain *d,
+ const struct hvm_mmio_ops *ops);
-static inline void relocate_portio_handler(
- struct domain *d, unsigned long old_addr, unsigned long new_addr,
- unsigned long size)
-{
- relocate_io_handler(d, old_addr, new_addr, size, HVM_PORTIO);
-}
+void register_portio_handler(
+ struct domain *d, unsigned int port, unsigned int size,
+ portio_action_t action);
-static inline void register_buffered_io_handler(
- struct domain *d, unsigned long addr,
- unsigned long size, mmio_action_t action)
-{
- register_io_handler(d, addr, size, action, HVM_BUFFERED_IO);
-}
+void relocate_portio_handler(
+ struct domain *d, unsigned int old_port, unsigned int new_port,
+ unsigned int size);
void send_timeoffset_req(unsigned long timeoff);
void send_invalidate_req(void);
@@ -123,10 +123,10 @@ int handle_mmio_with_translation(unsigned long gva, unsigned long gpfn,
struct npfec);
int handle_pio(uint16_t port, unsigned int size, int dir);
void hvm_interrupt_post(struct vcpu *v, int vector, int type);
-void hvm_io_assist(ioreq_t *p);
void hvm_dpci_eoi(struct domain *d, unsigned int guest_irq,
const union vioapic_redir_entry *ent);
void msix_write_completion(struct vcpu *);
+void msixtbl_init(struct domain *d);
struct hvm_hw_stdvga {
uint8_t sr_index;
@@ -144,5 +144,18 @@ void stdvga_init(struct domain *d);
void stdvga_deinit(struct domain *d);
extern void hvm_dpci_msi_eoi(struct domain *d, int vector);
+
+void register_dpci_portio_handler(struct domain *d);
+
#endif /* __ASM_X86_HVM_IO_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
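A worked example of the new hvm_mmio_first_byte()/hvm_mmio_last_byte()
helpers above: for a rep MMIO access, addr names the starting element, so
with the direction flag (df) set the accessed range extends downwards from
it. The ioreq_t stub below carries only the fields the helpers read:

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t paddr_t;
    typedef struct { paddr_t addr; uint32_t size, count; uint8_t df; } ioreq_t;

    static paddr_t first_byte(const ioreq_t *p)
    {
        return p->df ? p->addr - (p->count - 1ull) * p->size : p->addr;
    }

    static paddr_t last_byte(const ioreq_t *p)
    {
        unsigned long count = p->count;

        return p->df ? p->addr + p->size - 1
                     : p->addr + count * p->size - 1;
    }

    int main(void)
    {
        /* Four 2-byte accesses from 0x1000, ascending. */
        ioreq_t up = { .addr = 0x1000, .size = 2, .count = 4, .df = 0 };
        assert(first_byte(&up) == 0x1000 && last_byte(&up) == 0x1007);

        /* The same accesses descending: addr is the highest element. */
        ioreq_t down = { .addr = 0x1000, .size = 2, .count = 4, .df = 1 };
        assert(first_byte(&down) == 0xffa && last_byte(&down) == 0x1001);
        return 0;
    }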
diff --git a/xen/include/asm-x86/hvm/iommu.h b/xen/include/asm-x86/hvm/iommu.h
index 46cb126..3a4c68a 100644
--- a/xen/include/asm-x86/hvm/iommu.h
+++ b/xen/include/asm-x86/hvm/iommu.h
@@ -46,6 +46,8 @@ struct g2m_ioport {
unsigned int np;
};
+#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
+
struct arch_hvm_iommu
{
u64 pgd_maddr; /* io page directory machine address */
diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h
index 9ec5afa..73b8fb0 100644
--- a/xen/include/asm-x86/hvm/irq.h
+++ b/xen/include/asm-x86/hvm/irq.h
@@ -15,8 +15,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_IRQ_H__
diff --git a/xen/include/asm-x86/hvm/nestedhvm.h b/xen/include/asm-x86/hvm/nestedhvm.h
index cca41b3..cf1a8f4 100644
--- a/xen/include/asm-x86/hvm/nestedhvm.h
+++ b/xen/include/asm-x86/hvm/nestedhvm.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _HVM_NESTEDHVM_H
diff --git a/xen/include/asm-x86/hvm/support.h b/xen/include/asm-x86/hvm/support.h
index 05ef5c5..4a1f0ad 100644
--- a/xen/include/asm-x86/hvm/support.h
+++ b/xen/include/asm-x86/hvm/support.h
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_SUPPORT_H__
@@ -124,11 +123,12 @@ void hvm_shadow_handle_cd(struct vcpu *v, unsigned long value);
/* These functions all return X86EMUL return codes. */
int hvm_set_efer(uint64_t value);
-int hvm_set_cr0(unsigned long value);
-int hvm_set_cr3(unsigned long value);
-int hvm_set_cr4(unsigned long value);
+int hvm_set_cr0(unsigned long value, bool_t may_defer);
+int hvm_set_cr3(unsigned long value, bool_t may_defer);
+int hvm_set_cr4(unsigned long value, bool_t may_defer);
int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content);
-int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content);
+int hvm_msr_write_intercept(
+ unsigned int msr, uint64_t msr_content, bool_t may_defer);
int hvm_mov_to_cr(unsigned int cr, unsigned int gpr);
int hvm_mov_from_cr(unsigned int cr, unsigned int gpr);
diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
index 2ac6f75..c479f0b 100644
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _ASM_X86_64_AMD_IOMMU_DEFS_H
@@ -464,8 +463,6 @@
#define IOMMU_CONTROL_DISABLED 0
#define IOMMU_CONTROL_ENABLED 1
-#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
-
/* interrupt remapping table */
#define INT_REMAP_ENTRY_REMAPEN_MASK 0x00000001
#define INT_REMAP_ENTRY_REMAPEN_SHIFT 0
diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
index cf43e29..9c51172 100644
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _ASM_X86_64_AMD_IOMMU_PROTO_H
diff --git a/xen/include/asm-x86/hvm/svm/asid.h b/xen/include/asm-x86/hvm/svm/asid.h
index a8d6853..182866c 100644
--- a/xen/include/asm-x86/hvm/svm/asid.h
+++ b/xen/include/asm-x86/hvm/svm/asid.h
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_SVM_ASID_H__
diff --git a/xen/include/asm-x86/hvm/svm/emulate.h b/xen/include/asm-x86/hvm/svm/emulate.h
index ccc2d3c..7c1dcd1 100644
--- a/xen/include/asm-x86/hvm/svm/emulate.h
+++ b/xen/include/asm-x86/hvm/svm/emulate.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_SVM_EMULATE_H__
diff --git a/xen/include/asm-x86/hvm/svm/intr.h b/xen/include/asm-x86/hvm/svm/intr.h
index 4f74151..ae52d9f 100644
--- a/xen/include/asm-x86/hvm/svm/intr.h
+++ b/xen/include/asm-x86/hvm/svm/intr.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/xen/include/asm-x86/hvm/svm/nestedsvm.h b/xen/include/asm-x86/hvm/svm/nestedsvm.h
index f88b1bd..974a7d4 100644
--- a/xen/include/asm-x86/hvm/svm/nestedsvm.h
+++ b/xen/include/asm-x86/hvm/svm/nestedsvm.h
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
#ifndef __ASM_X86_HVM_SVM_NESTEDSVM_H__
@@ -110,18 +109,11 @@ nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs,
void nsvm_vcpu_destroy(struct vcpu *v);
int nsvm_vcpu_initialise(struct vcpu *v);
int nsvm_vcpu_reset(struct vcpu *v);
-int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs);
int nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs);
-int nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs,
- uint64_t exitcode);
int nsvm_vcpu_vmexit_trap(struct vcpu *v, struct hvm_trap *trap);
-uint64_t nsvm_vcpu_guestcr3(struct vcpu *v);
uint64_t nsvm_vcpu_hostcr3(struct vcpu *v);
-uint32_t nsvm_vcpu_asid(struct vcpu *v);
-int nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v,
- struct cpu_user_regs *regs, uint64_t exitcode);
-int nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr,
- int errcode);
+bool_t nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr,
+ int errcode);
bool_t nsvm_vmcb_hap_enabled(struct vcpu *v);
enum hvm_intblk nsvm_intr_blocked(struct vcpu *v);
diff --git a/xen/include/asm-x86/hvm/svm/svm.h b/xen/include/asm-x86/hvm/svm/svm.h
index cb2db3f..d60ec23 100644
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/xen/include/asm-x86/hvm/svm/svmdebug.h b/xen/include/asm-x86/hvm/svm/svmdebug.h
index 7df17fe..c282a06 100644
--- a/xen/include/asm-x86/hvm/svm/svmdebug.h
+++ b/xen/include/asm-x86/hvm/svm/svmdebug.h
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
diff --git a/xen/include/asm-x86/hvm/svm/vmcb.h b/xen/include/asm-x86/hvm/svm/vmcb.h
index fd0b0a4..ec47e2d 100644
--- a/xen/include/asm-x86/hvm/svm/vmcb.h
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
#ifndef __ASM_X86_HVM_SVM_VMCB_H__
diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h
index 01e0665..f553814 100644
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_VCPU_H__
@@ -30,13 +29,11 @@
#include <asm/hvm/svm/nestedsvm.h>
#include <asm/mtrr.h>
-enum hvm_io_state {
- HVMIO_none = 0,
- HVMIO_dispatched,
- HVMIO_awaiting_completion,
- HVMIO_handle_mmio_awaiting_completion,
- HVMIO_handle_pio_awaiting_completion,
- HVMIO_completed
+enum hvm_io_completion {
+ HVMIO_no_completion,
+ HVMIO_mmio_completion,
+ HVMIO_pio_completion,
+ HVMIO_realmode_completion
};
struct hvm_vcpu_asid {
@@ -44,11 +41,22 @@ struct hvm_vcpu_asid {
uint32_t asid;
};
+/*
+ * We may read or write up to an m256 operand (32 bytes) as a series
+ * of device-model transactions.
+ */
+struct hvm_mmio_cache {
+ unsigned long gla;
+ unsigned int size;
+ uint8_t dir;
+ uint8_t pad[3]; /* make buffer[] long-aligned */
+ uint8_t buffer[32];
+};
+
struct hvm_vcpu_io {
/* I/O request in flight to device model. */
- enum hvm_io_state io_state;
- unsigned long io_data;
- int io_size;
+ enum hvm_io_completion io_completion;
+ ioreq_t io_req;
/*
* HVM emulation:
@@ -60,13 +68,13 @@ struct hvm_vcpu_io {
unsigned long mmio_gva;
unsigned long mmio_gpfn;
- /* We may read up to m256 as a number of device-model transactions. */
- paddr_t mmio_large_read_pa;
- uint8_t mmio_large_read[32];
- unsigned int mmio_large_read_bytes;
- /* We may write up to m256 as a number of device-model transactions. */
- unsigned int mmio_large_write_bytes;
- paddr_t mmio_large_write_pa;
+ /*
+ * We may need to handle up to 3 distinct memory accesses per
+ * instruction.
+ */
+ struct hvm_mmio_cache mmio_cache[3];
+ unsigned int mmio_cache_count;
+
/* For retries we shouldn't re-fetch the instruction. */
unsigned int mmio_insn_bytes;
unsigned char mmio_insn[16];
@@ -74,11 +82,19 @@ struct hvm_vcpu_io {
* For string instruction emulation we need to be able to signal a
* necessary retry through other than function return codes.
*/
- bool_t mmio_retry, mmio_retrying;
+ bool_t mmio_retry;
unsigned long msix_unmask_address;
+
+ const struct g2m_ioport *g2m_ioport;
};
+static inline bool_t hvm_vcpu_io_need_completion(const struct hvm_vcpu_io *vio)
+{
+ return (vio->io_req.state == STATE_IOREQ_READY) &&
+ !vio->io_req.data_is_ptr;
+}
+
#define VMCX_EADDR (~0ULL)
struct nestedvcpu {
@@ -118,6 +134,13 @@ struct nestedvcpu {
#define vcpu_nestedhvm(v) ((v)->arch.hvm_vcpu.nvcpu)
+struct altp2mvcpu {
+ uint16_t p2midx; /* alternate p2m index */
+ gfn_t veinfo_gfn; /* #VE information page gfn */
+};
+
+#define vcpu_altp2m(v) ((v)->arch.hvm_vcpu.avcpu)
+
struct hvm_vcpu {
/* Guest control-register and EFER values, just as the guest sees them. */
unsigned long guest_cr[5];
@@ -151,9 +174,6 @@ struct hvm_vcpu {
u32 msr_tsc_aux;
u64 msr_tsc_adjust;
- /* VPMU */
- struct vpmu_struct vpmu;
-
union {
struct arch_vmx_struct vmx;
struct arch_svm_struct svm;
@@ -163,12 +183,16 @@ struct hvm_vcpu {
struct nestedvcpu nvcpu;
+ struct altp2mvcpu avcpu;
+
struct mtrr_state mtrr;
u64 pat_cr;
/* In mode delay_for_missed_ticks, VCPUs have differing guest times. */
int64_t stime_offset;
+ u8 evtchn_upcall_vector;
+
/* Which cache mode is this VCPU in (CR0:CD/NW)? */
u8 cache_mode;
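
The reworked struct hvm_vcpu_io above replaces the old pair of single-buffer
read/write caches with an array of up to three struct hvm_mmio_cache entries,
one per distinct memory access an instruction may perform, plus a single
ioreq_t tracking the request in flight. As a rough sketch of how an emulator
might probe that cache on a repeated access (the helper name and the
lookup-by-gla-and-direction policy are illustrative assumptions, not part of
this patch):

    /* Illustrative only: find the cache slot covering a given access. */
    static struct hvm_mmio_cache *
    demo_find_mmio_cache(struct hvm_vcpu_io *vio, unsigned long gla,
                         uint8_t dir)
    {
        unsigned int i;

        for ( i = 0; i < vio->mmio_cache_count; i++ )
        {
            struct hvm_mmio_cache *cache = &vio->mmio_cache[i];

            if ( cache->gla == gla && cache->dir == dir )
                return cache;
        }

        return NULL; /* caller would claim a free slot, bounded at 3 */
    }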
diff --git a/xen/include/asm-x86/hvm/vioapic.h b/xen/include/asm-x86/hvm/vioapic.h
index ab4e07e..67fdd67 100644
--- a/xen/include/asm-x86/hvm/vioapic.h
+++ b/xen/include/asm-x86/hvm/vioapic.h
@@ -18,8 +18,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_VIOAPIC_H__
@@ -47,6 +46,7 @@
#define VIOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
#define VIOAPIC_REG_VERSION 0x01
#define VIOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+#define VIOAPIC_REG_RTE0 0x10
struct hvm_vioapic {
struct hvm_hw_vioapic hvm_hw_vioapic;
@@ -61,6 +61,6 @@ int vioapic_init(struct domain *d);
void vioapic_deinit(struct domain *d);
void vioapic_reset(struct domain *d);
void vioapic_irq_positive_edge(struct domain *d, unsigned int irq);
-void vioapic_update_EOI(struct domain *d, int vector);
+void vioapic_update_EOI(struct domain *d, u8 vector);
#endif /* __ASM_X86_HVM_VIOAPIC_H__ */
diff --git a/xen/include/asm-x86/hvm/viridian.h b/xen/include/asm-x86/hvm/viridian.h
index 4cab2e8..c4319d7 100644
--- a/xen/include/asm-x86/hvm/viridian.h
+++ b/xen/include/asm-x86/hvm/viridian.h
@@ -61,11 +61,36 @@ struct viridian_time_ref_count
int64_t off;
};
+union viridian_reference_tsc
+{
+ uint64_t raw;
+ struct
+ {
+ uint64_t enabled:1;
+ uint64_t reserved_preserved:11;
+ uint64_t pfn:48;
+ } fields;
+};
+
+/*
+ * Type definition as in Microsoft Hypervisor Top-Level Functional
+ * Specification v4.0a, section 15.4.2.
+ */
+typedef struct _HV_REFERENCE_TSC_PAGE
+{
+ uint32_t TscSequence;
+ uint32_t Reserved1;
+ uint64_t TscScale;
+ int64_t TscOffset;
+ uint64_t Reserved2[509];
+} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
+
struct viridian_domain
{
union viridian_guest_os_id guest_os_id;
union viridian_hypercall_gpa hypercall_gpa;
struct viridian_time_ref_count time_ref_count;
+ union viridian_reference_tsc reference_tsc;
};
int
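
Windows consumes the reference TSC page with the formula from the same TLFS
section: ReferenceTime = ((VirtualTsc * TscScale) >> 64) + TscOffset,
retrying while TscSequence changes (a value of 0 marks the page invalid).
A standalone sketch of the guest-side read, using GCC's unsigned __int128
for the 64x64->128 multiply; the struct mirrors the layout above and the
caller supplies the TSC value, so this is illustrative rather than actual
consumer code:

    #include <stdint.h>

    typedef struct {
        volatile uint32_t TscSequence;
        uint32_t Reserved1;
        uint64_t TscScale;    /* 64.64 fixed-point multiplier */
        int64_t TscOffset;
    } demo_tsc_page;

    static uint64_t demo_reference_time(const demo_tsc_page *p, uint64_t tsc)
    {
        uint32_t seq;
        uint64_t time;

        do {
            seq = p->TscSequence; /* 0 => page invalid, fall back to MSR */
            time = (uint64_t)(((unsigned __int128)tsc * p->TscScale) >> 64)
                   + p->TscOffset;
        } while ( p->TscSequence != seq );

        return time;
    }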
diff --git a/xen/include/asm-x86/hvm/vlapic.h b/xen/include/asm-x86/hvm/vlapic.h
index cfe9bdb..4656293 100644
--- a/xen/include/asm-x86/hvm/vlapic.h
+++ b/xen/include/asm-x86/hvm/vlapic.h
@@ -14,8 +14,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_VLAPIC_H__
@@ -127,7 +126,7 @@ uint32_t vlapic_set_ppr(struct vlapic *vlapic);
void vlapic_adjust_i8259_target(struct domain *d);
void vlapic_EOI_set(struct vlapic *vlapic);
-void vlapic_handle_EOI_induced_exit(struct vlapic *vlapic, int vector);
+void vlapic_handle_EOI(struct vlapic *vlapic, u8 vector);
void vlapic_ipi(struct vlapic *vlapic, uint32_t icr_low, uint32_t icr_high);
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 6a99dca..f1126d4 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -12,15 +12,14 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
#ifndef __ASM_X86_HVM_VMX_VMCS_H__
#define __ASM_X86_HVM_VMX_VMCS_H__
+#include <asm/vpmu.h>
#include <asm/hvm/io.h>
-#include <asm/hvm/vpmu.h>
#include <irq_vectors.h>
extern void vmcs_dump_vcpu(struct vcpu *v);
@@ -62,7 +61,8 @@ struct ept_data {
struct {
u64 ept_mt :3,
ept_wl :3,
- rsvd :6,
+ ept_ad :1, /* bit 6 - enable EPT A/D bits */
+ rsvd :5,
asr :52;
};
u64 eptp;
@@ -70,8 +70,12 @@ struct ept_data {
cpumask_var_t synced_mask;
};
+#define _VMX_DOMAIN_PML_ENABLED 0
+#define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED)
struct vmx_domain {
unsigned long apic_access_mfn;
+ /* VMX_DOMAIN_* */
+ unsigned int status;
};
struct pi_desc {
@@ -85,6 +89,8 @@ struct pi_desc {
#define ept_get_eptp(ept) ((ept)->eptp)
#define ept_get_synced_mask(ept) ((ept)->synced_mask)
+#define NR_PML_ENTRIES 512
+
struct arch_vmx_struct {
/* Virtual address of VMCS. */
struct vmcs_struct *vmcs;
@@ -142,6 +148,8 @@ struct arch_vmx_struct {
/* Bitmap to control vmexit policy for Non-root VMREAD/VMWRITE */
struct page_info *vmread_bitmap;
struct page_info *vmwrite_bitmap;
+
+ struct page_info *pml_pg;
};
int vmx_create_vmcs(struct vcpu *v);
@@ -213,7 +221,10 @@ extern u32 vmx_vmentry_control;
#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
+#define SECONDARY_EXEC_ENABLE_VM_FUNCTIONS 0x00002000
#define SECONDARY_EXEC_ENABLE_VMCS_SHADOWING 0x00004000
+#define SECONDARY_EXEC_ENABLE_PML 0x00020000
+#define SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS 0x00040000
extern u32 vmx_secondary_exec_control;
#define VMX_EPT_EXEC_ONLY_SUPPORTED 0x00000001
@@ -225,6 +236,7 @@ extern u32 vmx_secondary_exec_control;
#define VMX_EPT_INVEPT_INSTRUCTION 0x00100000
#define VMX_EPT_INVEPT_SINGLE_CONTEXT 0x02000000
#define VMX_EPT_INVEPT_ALL_CONTEXT 0x04000000
+#define VMX_EPT_AD_BIT 0x00200000
#define VMX_MISC_VMWRITE_ALL 0x20000000
@@ -273,6 +285,12 @@ extern u32 vmx_secondary_exec_control;
(vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT)
#define cpu_has_vmx_vmcs_shadowing \
(vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VMCS_SHADOWING)
+#define cpu_has_vmx_vmfunc \
+ (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VM_FUNCTIONS)
+#define cpu_has_vmx_virt_exceptions \
+ (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS)
+#define cpu_has_vmx_pml \
+ (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML)
#define VMCS_RID_TYPE_MASK 0x80000000
@@ -302,10 +320,15 @@ extern u64 vmx_basic_msr;
#define VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK 0x0FF
#define VMX_GUEST_INTR_STATUS_SVI_OFFSET 8
+/* VMFUNC leaf definitions */
+#define VMX_VMFUNC_EPTP_SWITCHING (1ULL << 0)
+
/* VMCS field encodings. */
+#define VMCS_HIGH(x) ((x) | 1)
enum vmcs_field {
VIRTUAL_PROCESSOR_ID = 0x00000000,
POSTED_INTR_NOTIFICATION_VECTOR = 0x00000002,
+ EPTP_INDEX = 0x00000004,
GUEST_ES_SELECTOR = 0x00000800,
GUEST_CS_SELECTOR = 0x00000802,
GUEST_SS_SELECTOR = 0x00000804,
@@ -315,6 +338,7 @@ enum vmcs_field {
GUEST_LDTR_SELECTOR = 0x0000080c,
GUEST_TR_SELECTOR = 0x0000080e,
GUEST_INTR_STATUS = 0x00000810,
+ GUEST_PML_INDEX = 0x00000812,
HOST_ES_SELECTOR = 0x00000c00,
HOST_CS_SELECTOR = 0x00000c02,
HOST_SS_SELECTOR = 0x00000c04,
@@ -323,61 +347,36 @@ enum vmcs_field {
HOST_GS_SELECTOR = 0x00000c0a,
HOST_TR_SELECTOR = 0x00000c0c,
IO_BITMAP_A = 0x00002000,
- IO_BITMAP_A_HIGH = 0x00002001,
IO_BITMAP_B = 0x00002002,
- IO_BITMAP_B_HIGH = 0x00002003,
MSR_BITMAP = 0x00002004,
- MSR_BITMAP_HIGH = 0x00002005,
VM_EXIT_MSR_STORE_ADDR = 0x00002006,
- VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
- VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
- VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
+ PML_ADDRESS = 0x0000200e,
TSC_OFFSET = 0x00002010,
- TSC_OFFSET_HIGH = 0x00002011,
VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
- VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
APIC_ACCESS_ADDR = 0x00002014,
- APIC_ACCESS_ADDR_HIGH = 0x00002015,
PI_DESC_ADDR = 0x00002016,
- PI_DESC_ADDR_HIGH = 0x00002017,
+ VM_FUNCTION_CONTROL = 0x00002018,
EPT_POINTER = 0x0000201a,
- EPT_POINTER_HIGH = 0x0000201b,
EOI_EXIT_BITMAP0 = 0x0000201c,
#define EOI_EXIT_BITMAP(n) (EOI_EXIT_BITMAP0 + (n) * 2) /* n = 0...3 */
+ EPTP_LIST_ADDR = 0x00002024,
VMREAD_BITMAP = 0x00002026,
- VMREAD_BITMAP_HIGH = 0x00002027,
VMWRITE_BITMAP = 0x00002028,
- VMWRITE_BITMAP_HIGH = 0x00002029,
+ VIRT_EXCEPTION_INFO = 0x0000202a,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
- GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
VMCS_LINK_POINTER = 0x00002800,
- VMCS_LINK_POINTER_HIGH = 0x00002801,
GUEST_IA32_DEBUGCTL = 0x00002802,
- GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
GUEST_PAT = 0x00002804,
- GUEST_PAT_HIGH = 0x00002805,
GUEST_EFER = 0x00002806,
- GUEST_EFER_HIGH = 0x00002807,
GUEST_PERF_GLOBAL_CTRL = 0x00002808,
- GUEST_PERF_GLOBAL_CTRL_HIGH = 0x00002809,
- GUEST_PDPTR0 = 0x0000280a,
- GUEST_PDPTR0_HIGH = 0x0000280b,
- GUEST_PDPTR1 = 0x0000280c,
- GUEST_PDPTR1_HIGH = 0x0000280d,
- GUEST_PDPTR2 = 0x0000280e,
- GUEST_PDPTR2_HIGH = 0x0000280f,
- GUEST_PDPTR3 = 0x00002810,
- GUEST_PDPTR3_HIGH = 0x00002811,
+ GUEST_PDPTE0 = 0x0000280a,
+#define GUEST_PDPTE(n) (GUEST_PDPTE0 + (n) * 2) /* n = 0...3 */
GUEST_BNDCFGS = 0x00002812,
- GUEST_BNDCFGS_HIGH = 0x00002813,
HOST_PAT = 0x00002c00,
- HOST_PAT_HIGH = 0x00002c01,
HOST_EFER = 0x00002c02,
- HOST_EFER_HIGH = 0x00002c03,
HOST_PERF_GLOBAL_CTRL = 0x00002c04,
- HOST_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
EXCEPTION_BITMAP = 0x00004004,
@@ -424,7 +423,8 @@ enum vmcs_field {
GUEST_TR_AR_BYTES = 0x00004822,
GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
GUEST_ACTIVITY_STATE = 0x00004826,
- GUEST_SYSENTER_CS = 0x0000482A,
+ GUEST_SMBASE = 0x00004828,
+ GUEST_SYSENTER_CS = 0x0000482a,
GUEST_PREEMPTION_TIMER = 0x0000482e,
HOST_SYSENTER_CS = 0x00004c00,
CR0_GUEST_HOST_MASK = 0x00006000,
@@ -432,9 +432,7 @@ enum vmcs_field {
CR0_READ_SHADOW = 0x00006004,
CR4_READ_SHADOW = 0x00006006,
CR3_TARGET_VALUE0 = 0x00006008,
- CR3_TARGET_VALUE1 = 0x0000600a,
- CR3_TARGET_VALUE2 = 0x0000600c,
- CR3_TARGET_VALUE3 = 0x0000600e,
+#define CR3_TARGET_VALUE(n) (CR3_TARGET_VALUE0 + (n) * 2) /* n < CR3_TARGET_COUNT */
EXIT_QUALIFICATION = 0x00006400,
GUEST_LINEAR_ADDRESS = 0x0000640a,
GUEST_CR0 = 0x00006800,
@@ -482,12 +480,15 @@ extern const unsigned int vmx_introspection_force_enabled_msrs_size;
#define MSR_TYPE_R 1
#define MSR_TYPE_W 2
+
+#define VMX_GUEST_MSR 0
+#define VMX_HOST_MSR 1
+
void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type);
void vmx_enable_intercept_for_msr(struct vcpu *v, u32 msr, int type);
int vmx_read_guest_msr(u32 msr, u64 *val);
int vmx_write_guest_msr(u32 msr, u64 val);
-int vmx_add_guest_msr(u32 msr);
-int vmx_add_host_load_msr(u32 msr);
+int vmx_add_msr(u32 msr, int type);
void vmx_vmcs_switch(struct vmcs_struct *from, struct vmcs_struct *to);
void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector);
void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector);
@@ -497,8 +498,26 @@ void virtual_vmcs_exit(void *vvmcs);
u64 virtual_vmcs_vmread(void *vvmcs, u32 vmcs_encoding);
void virtual_vmcs_vmwrite(void *vvmcs, u32 vmcs_encoding, u64 val);
+static inline int vmx_add_guest_msr(u32 msr)
+{
+ return vmx_add_msr(msr, VMX_GUEST_MSR);
+}
+static inline int vmx_add_host_load_msr(u32 msr)
+{
+ return vmx_add_msr(msr, VMX_HOST_MSR);
+}
+
DECLARE_PER_CPU(bool_t, vmxon);
+bool_t vmx_vcpu_pml_enabled(const struct vcpu *v);
+int vmx_vcpu_enable_pml(struct vcpu *v);
+void vmx_vcpu_disable_pml(struct vcpu *v);
+void vmx_vcpu_flush_pml_buffer(struct vcpu *v);
+bool_t vmx_domain_pml_enabled(const struct domain *d);
+int vmx_domain_enable_pml(struct domain *d);
+void vmx_domain_disable_pml(struct domain *d);
+void vmx_domain_flush_pml_buffers(struct domain *d);
+
#endif /* ASM_X86_HVM_VMX_VMCS_H__ */
/*
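
Two things stand out in the vmcs.h changes above. First, page-modification
logging (PML): a domain status flag, a 512-entry per-vcpu log page, and the
enable/disable/flush API at the bottom. Second, the explicit *_HIGH
enumerators could be dropped because the high half of any 64-bit VMCS field
is always encoded as the field encoding with bit 0 set, which is exactly
what the new VMCS_HIGH() macro expresses. A self-contained check, with
encodings copied from the enum:

    #include <assert.h>

    #define VMCS_HIGH(x) ((x) | 1)

    int main(void)
    {
        assert(VMCS_HIGH(0x2000) == 0x2001); /* was IO_BITMAP_A_HIGH */
        assert(VMCS_HIGH(0x2800) == 0x2801); /* was VMCS_LINK_POINTER_HIGH */
        return 0;
    }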
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index c8bb548..2ed62f9 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -12,8 +12,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
#ifndef __ASM_X86_HVM_VMX_VMX_H__
@@ -37,7 +36,8 @@ typedef union {
emt : 3, /* bits 5:3 - EPT Memory type */
ipat : 1, /* bit 6 - Ignore PAT memory type */
sp : 1, /* bit 7 - Is this a superpage? */
- rsvd1 : 2, /* bits 9:8 - Reserved for future use */
+ a : 1, /* bit 8 - Access bit */
+ d : 1, /* bit 9 - Dirty bit */
recalc : 1, /* bit 10 - Software available 1 */
snp : 1, /* bit 11 - VT-d snoop control in shared
EPT/VT-d usage */
@@ -46,7 +46,7 @@ typedef union {
access : 4, /* bits 61:58 - p2m_access_t */
tm : 1, /* bit 62 - VT-d transient-mapping hint in
shared EPT/VT-d usage */
- avail3 : 1; /* bit 63 - Software available 3 */
+ suppress_ve : 1; /* bit 63 - suppress #VE */
};
u64 epte;
} ept_entry_t;
@@ -93,6 +93,7 @@ void vmx_asm_do_vmentry(void);
void vmx_intr_assist(void);
void noreturn vmx_do_resume(struct vcpu *);
void vmx_vlapic_msr_changed(struct vcpu *v);
+void vmx_realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt);
void vmx_realmode(struct cpu_user_regs *regs);
void vmx_update_debug_state(struct vcpu *v);
void vmx_update_exception_bitmap(struct vcpu *v);
@@ -185,6 +186,8 @@ static inline unsigned long pi_get_pir(struct pi_desc *pi_desc, int group)
#define EXIT_REASON_XSETBV 55
#define EXIT_REASON_APIC_WRITE 56
#define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_VMFUNC 59
+#define EXIT_REASON_PML_FULL 62
/*
* Interruption-information format
@@ -207,8 +210,10 @@ static inline unsigned long pi_get_pir(struct pi_desc *pi_desc, int group)
# define VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR 1
# define VMX_CONTROL_REG_ACCESS_TYPE_CLTS 2
# define VMX_CONTROL_REG_ACCESS_TYPE_LMSW 3
- /* 10:8 - general purpose register operand */
+ /* 11:8 - general purpose register operand */
#define VMX_CONTROL_REG_ACCESS_GPR(eq) (((eq) >> 8) & 0xf)
+ /* 31:16 - LMSW source data */
+#define VMX_CONTROL_REG_ACCESS_DATA(eq) ((uint32_t)(eq) >> 16)
/*
* Access Rights
@@ -257,6 +262,7 @@ extern uint8_t posted_intr_vector;
(vmx_ept_vpid_cap & VMX_EPT_SUPERPAGE_1GB)
#define cpu_has_vmx_ept_2mb \
(vmx_ept_vpid_cap & VMX_EPT_SUPERPAGE_2MB)
+#define cpu_has_vmx_ept_ad (vmx_ept_vpid_cap & VMX_EPT_AD_BIT)
#define cpu_has_vmx_ept_invept_single_context \
(vmx_ept_vpid_cap & VMX_EPT_INVEPT_SINGLE_CONTEXT)
@@ -550,4 +556,14 @@ void p2m_init_hap_data(struct p2m_domain *p2m);
#define EPT_L4_PAGETABLE_SHIFT 39
#define EPT_PAGETABLE_ENTRIES 512
+/* #VE information page */
+typedef struct {
+ u32 exit_reason;
+ u32 semaphore;
+ u64 exit_qualification;
+ u64 gla;
+ u64 gpa;
+ u16 eptp_index;
+} ve_info_t;
+
#endif /* __ASM_X86_HVM_VMX_VMX_H__ */
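
The widened GPR field comment (bits 11:8, not 10:8) and the new
VMX_CONTROL_REG_ACCESS_DATA() accessor for the LMSW source operand can be
checked against a synthetic exit qualification; this small standalone
program exercises only the bit arithmetic:

    #include <assert.h>
    #include <stdint.h>

    #define VMX_CONTROL_REG_ACCESS_GPR(eq)  (((eq) >> 8) & 0xf)
    #define VMX_CONTROL_REG_ACCESS_DATA(eq) ((uint32_t)(eq) >> 16)

    int main(void)
    {
        /* GPR operand 3 in bits 11:8, LMSW data 0x1234 in bits 31:16. */
        uint64_t eq = ((uint64_t)0x1234 << 16) | (3 << 8);

        assert(VMX_CONTROL_REG_ACCESS_GPR(eq) == 3);
        assert(VMX_CONTROL_REG_ACCESS_DATA(eq) == 0x1234);
        return 0;
    }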
diff --git a/xen/include/asm-x86/hvm/vmx/vpmu_core2.h b/xen/include/asm-x86/hvm/vmx/vpmu_core2.h
deleted file mode 100644
index 60b05fd..0000000
--- a/xen/include/asm-x86/hvm/vmx/vpmu_core2.h
+++ /dev/null
@@ -1,51 +0,0 @@
-
-/*
- * vpmu_core2.h: CORE 2 specific PMU virtualization for HVM domain.
- *
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Author: Haitao Shan <haitao.shan at intel.com>
- */
-
-#ifndef __ASM_X86_HVM_VPMU_CORE_H_
-#define __ASM_X86_HVM_VPMU_CORE_H_
-
-/* Currently only 3 fixed counters are supported. */
-#define VPMU_CORE2_NUM_FIXED 3
-/* Currently only 3 Non-architectual Performance Control MSRs */
-#define VPMU_CORE2_NUM_CTRLS 3
-
-struct arch_msr_pair {
- u64 counter;
- u64 control;
-};
-
-struct core2_pmu_enable {
- char ds_area_enable;
- char fixed_ctr_enable[VPMU_CORE2_NUM_FIXED];
- char arch_pmc_enable[1];
-};
-
-struct core2_vpmu_context {
- struct core2_pmu_enable *pmu_enable;
- u64 fix_counters[VPMU_CORE2_NUM_FIXED];
- u64 ctrls[VPMU_CORE2_NUM_CTRLS];
- u64 global_ovf_status;
- struct arch_msr_pair arch_msr_pair[1];
-};
-
-#endif /* __ASM_X86_HVM_VPMU_CORE_H_ */
-
diff --git a/xen/include/asm-x86/hvm/vmx/vvmx.h b/xen/include/asm-x86/hvm/vmx/vvmx.h
index c17a440..60fdc21 100644
--- a/xen/include/asm-x86/hvm/vmx/vvmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vvmx.h
@@ -16,8 +16,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
#ifndef __ASM_X86_HVM_VVMX_H__
@@ -111,12 +110,10 @@ union vmx_inst_info {
int nvmx_vcpu_initialise(struct vcpu *v);
void nvmx_vcpu_destroy(struct vcpu *v);
int nvmx_vcpu_reset(struct vcpu *v);
-uint64_t nvmx_vcpu_guestcr3(struct vcpu *v);
uint64_t nvmx_vcpu_eptp_base(struct vcpu *v);
-uint32_t nvmx_vcpu_asid(struct vcpu *v);
enum hvm_intblk nvmx_intr_blocked(struct vcpu *v);
-int nvmx_intercepts_exception(struct vcpu *v,
- unsigned int trap, int error_code);
+bool_t nvmx_intercepts_exception(struct vcpu *v, unsigned int trap,
+ int error_code);
void nvmx_domain_relinquish_resources(struct domain *d);
bool_t nvmx_ept_enabled(struct vcpu *v);
diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h
index 8b8b65a..495d669 100644
--- a/xen/include/asm-x86/hvm/vpt.h
+++ b/xen/include/asm-x86/hvm/vpt.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_HVM_VPT_H__
diff --git a/xen/include/asm-x86/iommu.h b/xen/include/asm-x86/iommu.h
index e7a65da..29203d7 100644
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -9,8 +9,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ARCH_X86_IOMMU_H__
#define __ARCH_X86_IOMMU_H__
diff --git a/xen/include/asm-x86/irq.h b/xen/include/asm-x86/irq.h
index d3c55f3..a44305e 100644
--- a/xen/include/asm-x86/irq.h
+++ b/xen/include/asm-x86/irq.h
@@ -5,6 +5,7 @@
#include <xen/config.h>
#include <asm/atomic.h>
+#include <asm/numa.h>
#include <xen/cpumask.h>
#include <xen/smp.h>
#include <xen/hvm/irq.h>
@@ -155,7 +156,7 @@ int init_irq_data(void);
void clear_irq_vector(int irq);
int irq_to_vector(int irq);
-int create_irq(int node);
+int create_irq(nodeid_t node);
void destroy_irq(unsigned int irq);
int assign_irq_vector(int irq, const cpumask_t *);
diff --git a/xen/include/asm-x86/ldt.h b/xen/include/asm-x86/ldt.h
index aa77368..289ae19 100644
--- a/xen/include/asm-x86/ldt.h
+++ b/xen/include/asm-x86/ldt.h
@@ -15,7 +15,7 @@ static inline void load_LDT(struct vcpu *v)
}
else
{
- desc = (!is_pv_32on64_vcpu(v)
+ desc = (!is_pv_32bit_vcpu(v)
? this_cpu(gdt_table) : this_cpu(compat_gdt_table))
+ LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY;
_set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, SYS_DESC_ldt);
diff --git a/xen/include/asm-x86/mem_paging.h b/xen/include/asm-x86/mem_paging.h
index 6b7a1fe..176acaf 100644
--- a/xen/include/asm-x86/mem_paging.h
+++ b/xen/include/asm-x86/mem_paging.h
@@ -16,13 +16,15 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
+#ifndef __ASM_X86_MEM_PAGING_H__
+#define __ASM_X86_MEM_PAGING_H__
-int mem_paging_memop(struct domain *d, xen_mem_event_op_t *meo);
+int mem_paging_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_paging_op_t) arg);
+#endif /* __ASM_X86_MEM_PAGING_H__ */
/*
* Local variables:
diff --git a/xen/include/asm-x86/mem_sharing.h b/xen/include/asm-x86/mem_sharing.h
index 2f1f3d2..3840a14 100644
--- a/xen/include/asm-x86/mem_sharing.h
+++ b/xen/include/asm-x86/mem_sharing.h
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __MEM_SHARING_H__
#define __MEM_SHARING_H__
@@ -90,9 +89,7 @@ static inline int mem_sharing_unshare_page(struct domain *d,
*/
int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn,
bool_t allow_sleep);
-int mem_sharing_sharing_resume(struct domain *d);
-int mem_sharing_memop(struct domain *d,
- xen_mem_sharing_op_t *mec);
+int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg);
int mem_sharing_domctl(struct domain *d,
xen_domctl_mem_sharing_op_t *mec);
int mem_sharing_audit(void);
diff --git a/xen/include/asm-x86/microcode.h b/xen/include/asm-x86/microcode.h
index 00a672a..23ea954 100644
--- a/xen/include/asm-x86/microcode.h
+++ b/xen/include/asm-x86/microcode.h
@@ -7,10 +7,11 @@ struct cpu_signature;
struct ucode_cpu_info;
struct microcode_ops {
- int (*microcode_resume_match)(int cpu, const void *mc);
- int (*cpu_request_microcode)(int cpu, const void *buf, size_t size);
- int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
- int (*apply_microcode)(int cpu);
+ int (*microcode_resume_match)(unsigned int cpu, const void *mc);
+ int (*cpu_request_microcode)(unsigned int cpu, const void *buf,
+ size_t size);
+ int (*collect_cpu_info)(unsigned int cpu, struct cpu_signature *csig);
+ int (*apply_microcode)(unsigned int cpu);
int (*start_update)(void);
};
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index dca298f..67b34c6 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -17,6 +17,7 @@
*/
#define PFN_ORDER(_pfn) ((_pfn)->v.free.order)
+#ifndef CONFIG_BIGMEM
/*
* This definition is solely for the use in struct page_info (and
* struct page_list_head), intended to allow easy adjustment once x86-64
@@ -30,6 +31,9 @@ struct page_list_entry
{
__pdx_t next, prev;
};
+#else
+#define __pdx_t unsigned long
+#endif
struct page_sharing_info;
@@ -314,7 +318,10 @@ static inline void *__page_to_virt(const struct page_info *pg)
int free_page_type(struct page_info *page, unsigned long type,
int preemptible);
-void init_guest_l4_table(l4_pgentry_t[], const struct domain *);
+void init_guest_l4_table(l4_pgentry_t[], const struct domain *,
+ bool_t zap_ro_mpt);
+void fill_ro_mpt(unsigned long mfn);
+void zap_ro_mpt(unsigned long mfn);
int is_iomem_page(unsigned long mfn);
@@ -345,9 +352,6 @@ const unsigned long *get_platform_badpages(unsigned int *array_size);
int page_lock(struct page_info *page);
void page_unlock(struct page_info *page);
-struct domain *page_get_owner_and_reference(struct page_info *page);
-void put_page(struct page_info *page);
-int get_page(struct page_info *page, struct domain *domain);
void put_page_type(struct page_info *page);
int get_page_type(struct page_info *page, unsigned long type);
int put_page_type_preemptible(struct page_info *page);
@@ -426,41 +430,6 @@ extern paddr_t mem_hotplug;
* guest L2 page), etc...
*/
-/* With this defined, we do some ugly things to force the compiler to
- * give us type safety between mfns and gfns and other integers.
- * TYPE_SAFE(int foo) defines a foo_t, and _foo() and foo_x() functions
- * that translate beween int and foo_t.
- *
- * It does have some performance cost because the types now have
- * a different storage attribute, so may not want it on all the time. */
-
-#ifndef NDEBUG
-#define TYPE_SAFETY 1
-#endif
-
-#ifdef TYPE_SAFETY
-#define TYPE_SAFE(_type,_name) \
-typedef struct { _type _name; } _name##_t; \
-static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \
-static inline _type _name##_x(_name##_t n) { return n._name; }
-#else
-#define TYPE_SAFE(_type,_name) \
-typedef _type _name##_t; \
-static inline _name##_t _##_name(_type n) { return n; } \
-static inline _type _name##_x(_name##_t n) { return n; }
-#endif
-
-TYPE_SAFE(unsigned long,mfn);
-
-#ifndef mfn_t
-#define mfn_t /* Grep fodder: mfn_t, _mfn() and mfn_x() are defined above */
-#undef mfn_t
-#endif
-
-/* Macro for printk formats: use as printk("%"PRI_mfn"\n", mfn_x(foo)); */
-#define PRI_mfn "05lx"
-
-
/*
* The MPT (machine->physical mapping table) is an array of word-sized
* values, indexed on machine frame number. It is expected that guest OSes
@@ -503,8 +472,6 @@ extern struct rangeset *mmio_ro_ranges;
? get_gpfn_from_mfn(mfn) \
: (mfn) )
-#define INVALID_MFN (~0UL)
-
#define compat_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define compat_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
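
The TYPE_SAFE machinery, mfn_t and INVALID_MFN removed above are presumably
relocated to a common header rather than dropped (gfn_t is already used by
the altp2m code elsewhere in this import). Since the pattern recurs across
the tree, a worked expansion is useful: in debug builds the one-member
struct makes mixing raw integers and mfn_t values a compile-time error, at
no runtime cost beyond the storage attribute the removed comment mentions.

    /* What TYPE_SAFE(unsigned long, mfn) expands to when TYPE_SAFETY is set: */
    typedef struct { unsigned long mfn; } mfn_t;
    static inline mfn_t _mfn(unsigned long n) { return (mfn_t) { n }; }
    static inline unsigned long mfn_x(mfn_t n) { return n.mfn; }

    void demo(void)
    {
        mfn_t m = _mfn(0x1234);
        unsigned long raw = mfn_x(m); /* raw == 0x1234 */

        (void)raw;
        /* m = raw; -- would no longer compile; that is the point */
    }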
diff --git a/xen/include/asm-x86/monitor.h b/xen/include/asm-x86/monitor.h
new file mode 100644
index 0000000..7c8280b
--- /dev/null
+++ b/xen/include/asm-x86/monitor.h
@@ -0,0 +1,31 @@
+/*
+ * include/asm-x86/monitor.h
+ *
+ * Architecture-specific monitor_op domctl handler.
+ *
+ * Copyright (c) 2015 Tamas K Lengyel (tamas at tklengyel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ASM_X86_MONITOR_H__
+#define __ASM_X86_MONITOR_H__
+
+struct domain;
+struct xen_domctl_monitor_op;
+
+#define monitor_ctrlreg_bitmask(ctrlreg_index) (1U << (ctrlreg_index))
+
+int monitor_domctl(struct domain *d, struct xen_domctl_monitor_op *op);
+
+#endif /* __ASM_X86_MONITOR_H__ */
diff --git a/xen/include/asm-x86/msi.h b/xen/include/asm-x86/msi.h
index 4c62a3a..f83e748 100644
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -65,8 +65,13 @@ struct msi_info {
};
struct msi_msg {
- u32 address_lo; /* low 32 bits of msi message address */
- u32 address_hi; /* high 32 bits of msi message address */
+ union {
+ u64 address; /* message address */
+ struct {
+ u32 address_lo; /* message address low 32 bits */
+ u32 address_hi; /* message address high 32 bits */
+ };
+ };
u32 data; /* 16 bits of msi message data */
u32 dest32; /* used when Interrupt Remapping with EIM is enabled */
};
@@ -90,12 +95,13 @@ extern unsigned int pci_msix_get_table_len(struct pci_dev *pdev);
struct msi_desc {
struct msi_attrib {
- __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */
- __u8 maskbit : 1; /* mask-pending bit supported ? */
- __u8 masked : 1;
+ __u8 type; /* {0: unused, 5h:MSI, 11h:MSI-X} */
+ __u8 pos; /* Location of the MSI capability */
+ __u8 maskbit : 1; /* mask/pending bit supported? */
__u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */
- __u8 pos; /* Location of the msi capability */
- __u16 entry_nr; /* specific enabled entry */
+ __u8 host_masked : 1;
+ __u8 guest_masked : 1;
+ __u16 entry_nr; /* specific enabled entry */
} msi_attrib;
struct list_head list;
@@ -227,6 +233,7 @@ struct arch_msix {
int table_refcnt[MAX_MSIX_TABLE_PAGES];
int table_idx[MAX_MSIX_TABLE_PAGES];
spinlock_t table_lock;
+ bool_t host_maskall, guest_maskall;
domid_t warned;
};
@@ -236,6 +243,7 @@ void msi_compose_msg(unsigned vector, const cpumask_t *mask,
void __msi_set_enable(u16 seg, u8 bus, u8 slot, u8 func, int pos, int enable);
void mask_msi_irq(struct irq_desc *);
void unmask_msi_irq(struct irq_desc *);
+void guest_mask_msi_irq(struct irq_desc *, bool_t mask);
void ack_nonmaskable_msi_irq(struct irq_desc *);
void end_nonmaskable_msi_irq(struct irq_desc *, u8 vector);
void set_msi_affinity(struct irq_desc *, const cpumask_t *);
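
The anonymous union added to struct msi_msg lets interrupt-remapping code
treat the message address as a single 64-bit value while the lo/hi views
remain for the 32-bit config-space accessors. A standalone illustration
(little-endian layout, as on x86, is assumed):

    #include <assert.h>
    #include <stdint.h>

    struct demo_msi_msg {
        union {
            uint64_t address;
            struct {
                uint32_t address_lo;
                uint32_t address_hi;
            };
        };
        uint32_t data;
    };

    int main(void)
    {
        struct demo_msi_msg msg = { .address = 0x00000000fee00000ULL };

        assert(msg.address_lo == 0xfee00000u); /* x86 LAPIC MSI window */
        assert(msg.address_hi == 0);
        return 0;
    }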
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index 83f2f70..e9c4723 100644
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -130,6 +130,7 @@
#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x48e
#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x48f
#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x490
+#define MSR_IA32_VMX_VMFUNC 0x491
#define IA32_FEATURE_CONTROL_MSR 0x3a
#define IA32_FEATURE_CONTROL_MSR_LOCK 0x0001
#define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX 0x0002
@@ -327,6 +328,7 @@
#define MSR_IA32_CMT_EVTSEL 0x00000c8d
#define MSR_IA32_CMT_CTR 0x00000c8e
#define MSR_IA32_PSR_ASSOC 0x00000c8f
+#define MSR_IA32_PSR_L3_MASK(n) (0x00000c90 + (n))
/* Intel Model 6 */
#define MSR_P6_PERFCTR(n) (0x000000c1 + (n))
diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
index 52cae4b..4f233d5 100644
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -71,17 +71,14 @@ static inline int wrmsr_safe(unsigned int msr, uint64_t val)
return _rc;
}
-#define rdtsc(low,high) \
- __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
+static inline uint64_t rdtsc(void)
+{
+ uint32_t low, high;
-#define rdtscl(low) \
- __asm__ __volatile__("rdtsc" : "=a" (low) : : "edx")
+ __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high));
-#define rdtscll(val) do { \
- unsigned int _eax, _edx; \
- asm volatile("rdtsc" : "=a" (_eax), "=d" (_edx)); \
- (val) = ((unsigned long)_eax) | (((unsigned long)_edx)<<32); \
-} while(0)
+ return ((uint64_t)high << 32) | low;
+}
#define __write_tsc(val) wrmsrl(MSR_IA32_TSC, val)
#define write_tsc(val) ({ \
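
Turning rdtsc from a two-output macro into a value-returning inline lets
callers drop their low/high temporaries and retires rdtscl()/rdtscll()
outright. A minimal standalone usage sketch (x86-64, GCC/Clang; the inline
body is the same as the one added above):

    #include <stdint.h>
    #include <stdio.h>

    static inline uint64_t rdtsc(void)
    {
        uint32_t low, high;

        __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high));
        return ((uint64_t)high << 32) | low;
    }

    int main(void)
    {
        uint64_t t0 = rdtsc();
        /* ... measured work would go here ... */
        printf("elapsed cycles: %llu\n", (unsigned long long)(rdtsc() - t0));
        return 0;
    }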
diff --git a/xen/include/asm-x86/mtrr.h b/xen/include/asm-x86/mtrr.h
index 328ba04..0569db6 100644
--- a/xen/include/asm-x86/mtrr.h
+++ b/xen/include/asm-x86/mtrr.h
@@ -1,8 +1,7 @@
#ifndef __ASM_X86_MTRR_H__
#define __ASM_X86_MTRR_H__
-#include <xen/config.h>
-#include <asm/mm.h>
+#include <xen/mm.h>
/* These are the region types. They match the architectural specification. */
#define MTRR_TYPE_UNCACHABLE 0
diff --git a/xen/include/asm-x86/multicall.h b/xen/include/asm-x86/multicall.h
index a09ac5a..fcd0ea5 100644
--- a/xen/include/asm-x86/multicall.h
+++ b/xen/include/asm-x86/multicall.h
@@ -24,7 +24,7 @@
" callq *%%rax; " \
"1: movq %%rax,%c4(%0)\n" \
".section .fixup,\"ax\"\n" \
- "2: movq $-"STR(ENOSYS)",%%rax\n" \
+ "2: movq %5,%%rax\n" \
" jmp 1b\n" \
".previous\n" \
: \
@@ -32,7 +32,8 @@
"i" (offsetof(__typeof__(*_call), op)), \
"i" (offsetof(__typeof__(*_call), args)), \
"i" (sizeof(*(_call)->args)), \
- "i" (offsetof(__typeof__(*_call), result)) \
+ "i" (offsetof(__typeof__(*_call), result)), \
+ "i" (-ENOSYS) \
/* all the caller-saves registers */ \
: "rax", "rcx", "rdx", "rsi", "rdi", \
"r8", "r9", "r10", "r11" ); \
@@ -54,7 +55,7 @@
" callq *%%rax; " \
"1: movl %%eax,%c4(%0)\n" \
".section .fixup,\"ax\"\n" \
- "2: movl $-"STR(ENOSYS)",%%eax\n" \
+ "2: movl %5,%%eax\n" \
" jmp 1b\n" \
".previous\n" \
: \
@@ -62,7 +63,8 @@
"i" (offsetof(__typeof__(*_call), op)), \
"i" (offsetof(__typeof__(*_call), args)), \
"i" (sizeof(*(_call)->args)), \
- "i" (offsetof(__typeof__(*_call), result)) \
+ "i" (offsetof(__typeof__(*_call), result)), \
+ "i" (-ENOSYS) \
/* all the caller-saves registers */ \
: "rax", "rcx", "rdx", "rsi", "rdi", \
"r8", "r9", "r10", "r11" ) \
diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h
index 5959860..0c5e5b4 100644
--- a/xen/include/asm-x86/numa.h
+++ b/xen/include/asm-x86/numa.h
@@ -5,9 +5,11 @@
#define NODES_SHIFT 6
+typedef u8 nodeid_t;
+
extern int srat_rev;
-extern unsigned char cpu_to_node[];
+extern nodeid_t cpu_to_node[NR_CPUS];
extern cpumask_t node_to_cpumask[];
#define cpu_to_node(cpu) (cpu_to_node[cpu])
@@ -20,8 +22,8 @@ struct node {
};
extern int compute_hash_shift(struct node *nodes, int numnodes,
- int *nodeids);
-extern int pxm_to_node(int nid);
+ nodeid_t *nodeids);
+extern nodeid_t pxm_to_node(unsigned int pxm);
#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
#define VIRTUAL_BUG_ON(x)
@@ -32,12 +34,12 @@ extern int numa_off;
extern int srat_disabled(void);
-extern void numa_set_node(int cpu, int node);
-extern int setup_node(int pxm);
+extern void numa_set_node(int cpu, nodeid_t node);
+extern nodeid_t setup_node(unsigned int pxm);
extern void srat_detect_node(int cpu);
-extern void setup_node_bootmem(int nodeid, u64 start, u64 end);
-extern unsigned char apicid_to_node[];
+extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end);
+extern nodeid_t apicid_to_node[];
#ifdef CONFIG_NUMA
extern void init_cpu_to_node(void);
@@ -54,14 +56,14 @@ extern u8 *memnodemap;
struct node_data {
unsigned long node_start_pfn;
unsigned long node_spanned_pages;
- unsigned int node_id;
+ nodeid_t node_id;
};
extern struct node_data node_data[];
-static inline __attribute__((pure)) int phys_to_nid(paddr_t addr)
+static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr)
{
- unsigned nid;
+ nodeid_t nid;
VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize);
nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];
VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
@@ -75,7 +77,7 @@ static inline __attribute__((pure)) int phys_to_nid(paddr_t addr)
#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
NODE_DATA(nid)->node_spanned_pages)
-extern int valid_numa_range(u64 start, u64 end, int node);
+extern int valid_numa_range(u64 start, u64 end, nodeid_t node);
#else
#define init_cpu_to_node() do {} while (0)
#define clear_node_cpumask(cpu) do {} while (0)
@@ -83,6 +85,6 @@ extern int valid_numa_range(u64 start, u64 end, int node);
#endif
void srat_parse_regions(u64 addr);
-extern int __node_distance(int a, int b);
+extern u8 __node_distance(nodeid_t a, nodeid_t b);
#endif
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 5f7fe71..5e99ac6 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -20,8 +20,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XEN_P2M_H
@@ -72,6 +71,7 @@ typedef enum {
p2m_ram_shared = 12, /* Shared or sharable memory */
p2m_ram_broken = 13, /* Broken page, access cause domain crash */
p2m_map_foreign = 14, /* ram pages from foreign domain */
+ p2m_mmio_write_dm = 15, /* Read-only; writes go to the device model */
} p2m_type_t;
/* Modifiers to the query */
@@ -111,7 +111,12 @@ typedef unsigned int p2m_query_t;
#define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty) \
| p2m_to_mask(p2m_ram_ro) \
| p2m_to_mask(p2m_grant_map_ro) \
- | p2m_to_mask(p2m_ram_shared) )
+ | p2m_to_mask(p2m_ram_shared) \
+ | p2m_to_mask(p2m_mmio_write_dm))
+
+/* Write-discard types: write operations to these are discarded */
+#define P2M_DISCARD_WRITE_TYPES (p2m_to_mask(p2m_ram_ro) \
+ | p2m_to_mask(p2m_grant_map_ro))
/* Types that can be subject to bulk transitions. */
#define P2M_CHANGEABLE_TYPES (p2m_to_mask(p2m_ram_rw) \
@@ -145,6 +150,7 @@ typedef unsigned int p2m_query_t;
#define p2m_is_hole(_t) (p2m_to_mask(_t) & P2M_HOLE_TYPES)
#define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
#define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
+#define p2m_is_discard_write(_t) (p2m_to_mask(_t) & P2M_DISCARD_WRITE_TYPES)
#define p2m_is_changeable(_t) (p2m_to_mask(_t) & P2M_CHANGEABLE_TYPES)
#define p2m_is_pod(_t) (p2m_to_mask(_t) & P2M_POD_TYPES)
#define p2m_is_grant(_t) (p2m_to_mask(_t) & P2M_GRANT_TYPES)
@@ -165,6 +171,12 @@ typedef unsigned int p2m_query_t;
(P2M_RAM_TYPES | P2M_GRANT_TYPES | \
p2m_to_mask(p2m_map_foreign)))
+typedef enum {
+ p2m_host,
+ p2m_nested,
+ p2m_alternate,
+} p2m_class_t;
+
/* Per-p2m-table state */
struct p2m_domain {
/* Lock that protects updates to the p2m */
@@ -181,6 +193,8 @@ struct p2m_domain {
struct domain *domain; /* back pointer to domain */
+ p2m_class_t p2m_class; /* host/nested/alternate */
+
/* Nested p2ms only: nested p2m base value that this p2m shadows.
* This can be cleared to P2M_BASE_EADDR under the per-p2m lock but
* needs both the per-p2m lock and the per-domain nestedp2m lock
@@ -205,20 +219,28 @@ struct p2m_domain {
* host p2m's lock. */
int defer_nested_flush;
+ /* Alternate p2m: count of vcpus currently using this p2m. */
+ atomic_t active_vcpus;
+
/* Pages used to construct the p2m */
struct page_list_head pages;
- int (*set_entry )(struct p2m_domain *p2m,
- unsigned long gfn,
- mfn_t mfn, unsigned int page_order,
- p2m_type_t p2mt,
- p2m_access_t p2ma);
- mfn_t (*get_entry )(struct p2m_domain *p2m,
- unsigned long gfn,
- p2m_type_t *p2mt,
- p2m_access_t *p2ma,
- p2m_query_t q,
- unsigned int *page_order);
+ int (*set_entry)(struct p2m_domain *p2m,
+ unsigned long gfn,
+ mfn_t mfn, unsigned int page_order,
+ p2m_type_t p2mt,
+ p2m_access_t p2ma,
+ int sve);
+ mfn_t (*get_entry)(struct p2m_domain *p2m,
+ unsigned long gfn,
+ p2m_type_t *p2mt,
+ p2m_access_t *p2ma,
+ p2m_query_t q,
+ unsigned int *page_order,
+ bool_t *sve);
+ void (*enable_hardware_log_dirty)(struct p2m_domain *p2m);
+ void (*disable_hardware_log_dirty)(struct p2m_domain *p2m);
+ void (*flush_hardware_cached_dirty)(struct p2m_domain *p2m);
void (*change_entry_type_global)(struct p2m_domain *p2m,
p2m_type_t ot,
p2m_type_t nt);
@@ -238,13 +260,20 @@ struct p2m_domain {
* retyped get this access type. See definition of p2m_access_t. */
p2m_access_t default_access;
- /* If true, and an access fault comes in and there is no mem_event listener,
+ /* If true, and an access fault comes in and there is no vm_event listener,
* pause domain. Otherwise, remove access restrictions. */
bool_t access_required;
/* Highest guest frame that's ever been mapped in the p2m */
unsigned long max_mapped_pfn;
+ /*
+ * Alternate p2ms only: range of gfns for which the underlying
+ * mfn may have duplicate mappings
+ */
+ unsigned long min_remapped_gfn;
+ unsigned long max_remapped_gfn;
+
/* When releasing shared gfn's in a preemptible manner, recall where
* to resume the search */
unsigned long next_shared_gfn_to_relinquish;
@@ -290,7 +319,20 @@ struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t np2m_base);
*/
struct p2m_domain *p2m_get_p2m(struct vcpu *v);
-#define p2m_is_nestedp2m(p2m) ((p2m) != p2m_get_hostp2m((p2m->domain)))
+static inline bool_t p2m_is_hostp2m(const struct p2m_domain *p2m)
+{
+ return p2m->p2m_class == p2m_host;
+}
+
+static inline bool_t p2m_is_nestedp2m(const struct p2m_domain *p2m)
+{
+ return p2m->p2m_class == p2m_nested;
+}
+
+static inline bool_t p2m_is_altp2m(const struct p2m_domain *p2m)
+{
+ return p2m->p2m_class == p2m_alternate;
+}
#define p2m_get_pagetable(p2m) ((p2m)->phys_table)
@@ -478,13 +520,21 @@ static inline int guest_physmap_add_page(struct domain *d,
}
/* Remove a page from a domain's p2m table */
-void guest_physmap_remove_page(struct domain *d,
- unsigned long gfn,
- unsigned long mfn, unsigned int page_order);
+int guest_physmap_remove_page(struct domain *d,
+ unsigned long gfn,
+ unsigned long mfn, unsigned int page_order);
/* Set a p2m range as populate-on-demand */
int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
unsigned int order);
+/* Enable hardware-assisted log-dirty. */
+void p2m_enable_hardware_log_dirty(struct domain *d);
+
+/* Disable hardware-assisted log-dirty */
+void p2m_disable_hardware_log_dirty(struct domain *d);
+
+/* Flush hardware cached dirty GFNs */
+void p2m_flush_hardware_cached_dirty(struct domain *d);
/* Change types across all p2m entries in a domain */
void p2m_change_entry_type_global(struct domain *d,
@@ -506,9 +556,15 @@ int p2m_is_logdirty_range(struct p2m_domain *, unsigned long start,
unsigned long end);
/* Set mmio addresses in the p2m table (for pass-through) */
-int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
+ p2m_access_t access);
int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
+/* Set identity addresses in the p2m table (for pass-through) */
+int set_identity_p2m_entry(struct domain *d, unsigned long gfn,
+ p2m_access_t p2ma, unsigned int flag);
+int clear_identity_p2m_entry(struct domain *d, unsigned long gfn);
+
/* Add foreign mapping to the guest's p2m table. */
int p2m_add_foreign(struct domain *tdom, unsigned long fgfn,
unsigned long gpfn, domid_t foreign_domid);
@@ -563,7 +619,7 @@ void p2m_mem_paging_populate(struct domain *d, unsigned long gfn);
/* Prepare the p2m for paging a frame in */
int p2m_mem_paging_prep(struct domain *d, unsigned long gfn, uint64_t buffer);
/* Resume normal operation (in case a domain was paused) */
-void p2m_mem_paging_resume(struct domain *d);
+void p2m_mem_paging_resume(struct domain *d, vm_event_response_t *rsp);
/* Send mem event based on the access (gla is -1ull if not available). Handles
* the rw2rx conversion. Boolean return value indicates if access rights have
@@ -572,36 +628,41 @@ void p2m_mem_paging_resume(struct domain *d);
* locks -- caller must also xfree the request. */
bool_t p2m_mem_access_check(paddr_t gpa, unsigned long gla,
struct npfec npfec,
- mem_event_request_t **req_ptr);
-
-/* Set access type for a region of pfns.
- * If start_pfn == -1ul, sets the default access type */
-long p2m_set_mem_access(struct domain *d, unsigned long start_pfn, uint32_t nr,
- uint32_t start, uint32_t mask, xenmem_access_t access);
-
-/* Get access type for a pfn
- * If pfn == -1ul, gets the default access type */
-int p2m_get_mem_access(struct domain *d, unsigned long pfn,
- xenmem_access_t *access);
+ vm_event_request_t **req_ptr);
-/* Check for emulation and mark vcpu for skipping one instruction
- * upon rescheduling if required. */
-void p2m_mem_event_emulate_check(struct vcpu *v,
- const mem_event_response_t *rsp);
+/*
+ * Emulating a memory access requires custom handling. These non-atomic
+ * functions should be called under domctl lock.
+ */
+static inline
+int p2m_mem_access_enable_emulate(struct domain *d)
+{
+ if ( d->arch.mem_access_emulate_enabled )
+ return -EEXIST;
-/* Enable arch specific introspection options (such as MSR interception). */
-void p2m_setup_introspection(struct domain *d);
+ d->arch.mem_access_emulate_enabled = 1;
+ return 0;
+}
-/* Sanity check for mem_event hardware support */
-static inline bool_t p2m_mem_event_sanity_check(struct domain *d)
+static inline
+int p2m_mem_access_disable_emulate(struct domain *d)
{
- return hap_enabled(d) && cpu_has_vmx;
+ if ( !d->arch.mem_access_emulate_enabled )
+ return -EEXIST;
+
+ d->arch.mem_access_emulate_enabled = 0;
+ return 0;
}
+/* Check for emulation and mark vcpu for skipping one instruction
+ * upon rescheduling if required. */
+void p2m_mem_access_emulate_check(struct vcpu *v,
+ const vm_event_response_t *rsp);
+
/* Sanity check for mem_access hardware support */
static inline bool_t p2m_mem_access_sanity_check(struct domain *d)
{
- return is_hvm_domain(d);
+ return is_hvm_domain(d) && cpu_has_vmx && hap_enabled(d);
}
/*
@@ -683,6 +744,64 @@ void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
l1_pgentry_t *p, l1_pgentry_t new, unsigned int level);
/*
+ * Alternate p2m: shadow p2m tables used for alternate memory views
+ */
+
+/* Get the current alternate p2m table */
+static inline struct p2m_domain *p2m_get_altp2m(struct vcpu *v)
+{
+ unsigned int index = vcpu_altp2m(v).p2midx;
+
+ if ( index == INVALID_ALTP2M )
+ return NULL;
+
+ BUG_ON(index >= MAX_ALTP2M);
+
+ return v->domain->arch.altp2m_p2m[index];
+}
+
+/* Locate an alternate p2m by its EPTP */
+unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp);
+
+/* Switch alternate p2m for a single vcpu */
+bool_t p2m_switch_vcpu_altp2m_by_id(struct vcpu *v, unsigned int idx);
+
+/* Check to see if vcpu should be switched to a different p2m. */
+void p2m_altp2m_check(struct vcpu *v, uint16_t idx);
+
+/* Flush all the alternate p2ms for a domain */
+void p2m_flush_altp2m(struct domain *d);
+
+/* Alternate p2m paging */
+bool_t p2m_altp2m_lazy_copy(struct vcpu *v, paddr_t gpa,
+ unsigned long gla, struct npfec npfec, struct p2m_domain **ap2m);
+
+/* Make a specific alternate p2m valid */
+int p2m_init_altp2m_by_id(struct domain *d, unsigned int idx);
+
+/* Find an available alternate p2m and make it valid */
+int p2m_init_next_altp2m(struct domain *d, uint16_t *idx);
+
+/* Make a specific alternate p2m invalid */
+int p2m_destroy_altp2m_by_id(struct domain *d, unsigned int idx);
+
+/* Switch alternate p2m for entire domain */
+int p2m_switch_domain_altp2m_by_id(struct domain *d, unsigned int idx);
+
+/* Set access type for a gfn */
+int p2m_set_altp2m_mem_access(struct domain *d, unsigned int idx,
+ gfn_t gfn, xenmem_access_t access);
+
+/* Change a gfn->mfn mapping */
+int p2m_change_altp2m_gfn(struct domain *d, unsigned int idx,
+ gfn_t old_gfn, gfn_t new_gfn);
+
+/* Propagate a host p2m change to all alternate p2ms */
+void p2m_altp2m_propagate_change(struct domain *d, gfn_t gfn,
+ mfn_t mfn, unsigned int page_order,
+ p2m_type_t p2mt, p2m_access_t p2ma);
+
+/*
* p2m type to IOMMU flags
*/
static inline unsigned int p2m_get_iommu_flags(p2m_type_t p2mt)
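
p2m_get_altp2m() above returns NULL when the vcpu's index is
INVALID_ALTP2M, i.e. when it is tracking the host p2m, so consumers are
expected to fall back explicitly. A sketch of the assumed caller pattern
(p2m_get_hostp2m() is the existing accessor defined elsewhere in this
header):

    /* Illustrative: resolve the p2m a fault should be handled against. */
    static struct p2m_domain *demo_resolve_p2m(struct vcpu *v)
    {
        struct p2m_domain *p2m = p2m_get_altp2m(v);

        if ( !p2m ) /* vcpu is on the host p2m */
            p2m = p2m_get_hostp2m(v->domain);

        return p2m;
    }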
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index a8bc999..87b3341 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -172,9 +172,9 @@ static inline l4_pgentry_t l4e_from_paddr(paddr_t pa, unsigned int flags)
#define l3e_to_l2e(x) ((l2_pgentry_t *)__va(l3e_get_paddr(x)))
#define l4e_to_l3e(x) ((l3_pgentry_t *)__va(l4e_get_paddr(x)))
-#define map_l1t_from_l2e(x) ((l1_pgentry_t *)map_domain_page(l2e_get_pfn(x)))
-#define map_l2t_from_l3e(x) ((l2_pgentry_t *)map_domain_page(l3e_get_pfn(x)))
-#define map_l3t_from_l4e(x) ((l3_pgentry_t *)map_domain_page(l4e_get_pfn(x)))
+#define map_l1t_from_l2e(x) ((l1_pgentry_t *)map_domain_page(_mfn(l2e_get_pfn(x))))
+#define map_l2t_from_l3e(x) ((l2_pgentry_t *)map_domain_page(_mfn(l3e_get_pfn(x))))
+#define map_l3t_from_l4e(x) ((l3_pgentry_t *)map_domain_page(_mfn(l4e_get_pfn(x))))
/* Given a virtual address, get an entry offset into a page table. */
#define l1_table_offset(a) \
@@ -234,6 +234,7 @@ void copy_page_sse2(void *, const void *);
#define __pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT)
#define __paddr_to_pfn(pa) ((unsigned long)((pa) >> PAGE_SHIFT))
+
/* Convert between machine frame numbers and spage-info structures. */
#define __mfn_to_spage(mfn) (spage_table + pfn_to_sdx(mfn))
#define __spage_to_mfn(pg) sdx_to_pfn((unsigned long)((pg) - spage_table))
@@ -262,6 +263,8 @@ void copy_page_sse2(void *, const void *);
#define pfn_to_paddr(pfn) __pfn_to_paddr(pfn)
#define paddr_to_pfn(pa) __paddr_to_pfn(pa)
#define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa))
+#define vmap_to_mfn(va) l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va)))
+#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))
#endif /* !defined(__ASSEMBLY__) */
@@ -286,6 +289,7 @@ extern l2_pgentry_t l2_identmap[4*L2_PAGETABLE_ENTRIES];
extern l1_pgentry_t l1_identmap[L1_PAGETABLE_ENTRIES],
l1_fixmap[L1_PAGETABLE_ENTRIES];
void paging_init(void);
+void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
#endif /* !defined(__ASSEMBLY__) */
#define _PAGE_NONE _AC(0x000,U)
@@ -303,7 +307,8 @@ void paging_init(void);
#define _PAGE_AVAIL1 _AC(0x400,U)
#define _PAGE_AVAIL2 _AC(0x800,U)
#define _PAGE_AVAIL _AC(0xE00,U)
-#define _PAGE_PSE_PAT _AC(0x1000,U)
+#define _PAGE_PSE_PAT _AC(0x1000,U)
+#define _PAGE_NX (cpu_has_nx ? _PAGE_NX_BIT : 0)
/* non-architectural flags */
#define _PAGE_PAGED 0x2000U
#define _PAGE_SHARED 0x4000U
@@ -320,10 +325,13 @@ void paging_init(void);
#define _PAGE_GNTTAB 0
#endif
-#define __PAGE_HYPERVISOR \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define __PAGE_HYPERVISOR_NOCACHE \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR_RO (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_NX)
+#define __PAGE_HYPERVISOR_RW (__PAGE_HYPERVISOR_RO | \
+ _PAGE_DIRTY | _PAGE_RW)
+#define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \
+ _PAGE_DIRTY | _PAGE_RW)
+#define __PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR | _PAGE_PCD)
#define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
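
The __PAGE_HYPERVISOR rework above derives everything from two bases:
_RO (present, accessed, NX) and _RX (present, accessed), layering RW and
DIRTY on for the writable variants. Using the architectural PTE bit
positions for illustration (Xen's internal flag encoding remaps NX, but the
composition is the same), the relationships can be checked standalone:

    #include <assert.h>
    #include <stdint.h>

    #define _PAGE_PRESENT  0x001ULL
    #define _PAGE_RW       0x002ULL
    #define _PAGE_PCD      0x010ULL
    #define _PAGE_ACCESSED 0x020ULL
    #define _PAGE_DIRTY    0x040ULL
    #define _PAGE_NX       (1ULL << 63) /* assuming cpu_has_nx */

    #define __PAGE_HYPERVISOR_RO (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_NX)
    #define __PAGE_HYPERVISOR_RW (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY | _PAGE_RW)
    #define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED)
    #define __PAGE_HYPERVISOR    (__PAGE_HYPERVISOR_RX | _PAGE_DIRTY | _PAGE_RW)
    #define __PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR | _PAGE_PCD)

    int main(void)
    {
        assert(__PAGE_HYPERVISOR_RW == (__PAGE_HYPERVISOR | _PAGE_NX));
        assert(!(__PAGE_HYPERVISOR_RX & _PAGE_NX)); /* executable => NX clear */
        assert(__PAGE_HYPERVISOR_NOCACHE & _PAGE_PCD);
        return 0;
    }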
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index 5f810f6..483b2d7 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -18,8 +18,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XEN_PAGING_H
@@ -39,7 +38,11 @@
#define PG_SH_shift 20
#define PG_HAP_shift 21
/* We're in one of the shadow modes */
+#ifdef CONFIG_SHADOW_PAGING
#define PG_SH_enable (1U << PG_SH_shift)
+#else
+#define PG_SH_enable 0
+#endif
#define PG_HAP_enable (1U << PG_HAP_shift)
/* common paging mode bits */
@@ -74,6 +77,7 @@
struct sh_emulate_ctxt;
struct shadow_paging_mode {
+#ifdef CONFIG_SHADOW_PAGING
void (*detach_old_tables )(struct vcpu *v);
int (*x86_emulate_write )(struct vcpu *v, unsigned long va,
void *src, u32 bytes,
@@ -88,6 +92,7 @@ struct shadow_paging_mode {
int (*guess_wrmap )(struct vcpu *v,
unsigned long vaddr, mfn_t gmfn);
void (*pagetable_dying )(struct vcpu *v, paddr_t gpa);
+#endif
/* For outsiders to tell what mode we're in */
unsigned int shadow_levels;
};
@@ -150,6 +155,8 @@ void paging_log_dirty_init(struct domain *d,
/* mark a page as dirty */
void paging_mark_dirty(struct domain *d, unsigned long guest_mfn);
+/* mark a page as dirty, taking the guest pfn as parameter */
+void paging_mark_gfn_dirty(struct domain *d, unsigned long pfn);
/* is this guest page dirty?
* This is called from inside paging code, with the paging lock held. */
@@ -247,7 +254,6 @@ static inline int paging_invlpg(struct vcpu *v, unsigned long va)
* pfec[0] is used to determine which kind of access this is when
* walking the tables. The caller should set the PFEC_page_present bit
* in pfec[0]; in the failure case, that bit will be cleared if appropriate. */
-#define INVALID_GFN (-1UL)
unsigned long paging_gva_to_gfn(struct vcpu *v,
unsigned long va,
uint32_t *pfec);
@@ -369,7 +375,7 @@ guest_map_l1e(struct vcpu *v, unsigned long addr, unsigned long *gl1mfn)
!= _PAGE_PRESENT )
return NULL;
*gl1mfn = l2e_get_pfn(l2e);
- return (l1_pgentry_t *)map_domain_page(*gl1mfn) + l1_table_offset(addr);
+ return (l1_pgentry_t *)map_domain_page(_mfn(*gl1mfn)) + l1_table_offset(addr);
}
/* Pull down the mapping we got from guest_map_l1e() */
diff --git a/xen/include/asm-x86/pci.h b/xen/include/asm-x86/pci.h
index e0598fd..38ace79 100644
--- a/xen/include/asm-x86/pci.h
+++ b/xen/include/asm-x86/pci.h
@@ -1,6 +1,11 @@
#ifndef __X86_PCI_H__
#define __X86_PCI_H__
+#define CF8_BDF(cf8) ( ((cf8) & 0x00ffff00) >> 8)
+#define CF8_ADDR_LO(cf8) ( (cf8) & 0x000000fc)
+#define CF8_ADDR_HI(cf8) ( ((cf8) & 0x0f000000) >> 16)
+#define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000))
+
#define IS_SNB_GFX(id) (id == 0x01068086 || id == 0x01168086 \
|| id == 0x01268086 || id == 0x01028086 \
|| id == 0x01128086 || id == 0x01228086 \
@@ -10,4 +15,12 @@ struct arch_pci_dev {
vmask_t used_vectors;
};
+int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
+ unsigned int reg, unsigned int size,
+ uint32_t *data);
+int pci_msi_conf_write_intercept(struct pci_dev *, unsigned int reg,
+ unsigned int size, uint32_t *data);
+bool_t pci_mmcfg_decode(unsigned long mfn, unsigned int *seg,
+ unsigned int *bdf);
+
#endif /* __X86_PCI_H__ */
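
The new CF8_* helpers decode the port-0xCF8 config-address latch: bit 31 is
the enable bit, bits 23:8 carry the bus/device/function, bits 7:2 the
register offset, and bits 27:24 are the extended-register bits, shifted so
they land at bits 11:8 of the offset. A standalone check against a
synthetic value (bus 0, device 2, function 0, register 0x08):

    #include <assert.h>
    #include <stdint.h>

    #define CF8_BDF(cf8)     (((cf8) & 0x00ffff00) >> 8)
    #define CF8_ADDR_LO(cf8) ( (cf8) & 0x000000fc)
    #define CF8_ADDR_HI(cf8) (((cf8) & 0x0f000000) >> 16)
    #define CF8_ENABLED(cf8) (!!((cf8) & 0x80000000))

    int main(void)
    {
        uint32_t cf8 = 0x80001008; /* enable | dev 2 << 11 | reg 0x08 */

        assert(CF8_ENABLED(cf8));
        assert(CF8_BDF(cf8) == 0x0010);  /* BDF 00:02.0 */
        assert(CF8_ADDR_LO(cf8) == 0x08);
        assert(CF8_ADDR_HI(cf8) == 0);   /* bits 27:24 reach regs 0x100+ */
        return 0;
    }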
diff --git a/xen/include/asm-x86/perfc_defn.h b/xen/include/asm-x86/perfc_defn.h
index 170da00..9ef092e 100644
--- a/xen/include/asm-x86/perfc_defn.h
+++ b/xen/include/asm-x86/perfc_defn.h
@@ -125,6 +125,7 @@ PERFCOUNTER(mshv_rdmsr_icr, "MS Hv rdmsr icr")
PERFCOUNTER(mshv_rdmsr_tpr, "MS Hv rdmsr tpr")
PERFCOUNTER(mshv_rdmsr_apic_assist, "MS Hv rdmsr APIC assist")
PERFCOUNTER(mshv_rdmsr_apic_msr, "MS Hv rdmsr APIC msr")
+PERFCOUNTER(mshv_rdmsr_tsc_msr, "MS Hv rdmsr TSC msr")
PERFCOUNTER(mshv_wrmsr_osid, "MS Hv wrmsr Guest OS ID")
PERFCOUNTER(mshv_wrmsr_hc_page, "MS Hv wrmsr hypercall page")
PERFCOUNTER(mshv_wrmsr_vp_index, "MS Hv wrmsr vp index")
@@ -133,6 +134,7 @@ PERFCOUNTER(mshv_wrmsr_tpr, "MS Hv wrmsr tpr")
PERFCOUNTER(mshv_wrmsr_eoi, "MS Hv wrmsr eoi")
PERFCOUNTER(mshv_wrmsr_apic_assist, "MS Hv wrmsr APIC assist")
PERFCOUNTER(mshv_wrmsr_apic_msr, "MS Hv wrmsr APIC msr")
+PERFCOUNTER(mshv_wrmsr_tsc_msr, "MS Hv wrmsr TSC msr")
PERFCOUNTER(realmode_emulations, "realmode instructions emulated")
PERFCOUNTER(realmode_exits, "vmexits from realmode")
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index b4e4731..f507f5e 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -71,24 +71,25 @@
/*
* Intel CPU features in CR4
*/
-#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
-#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
-#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
-#define X86_CR4_DE 0x0008 /* enable debugging extensions */
-#define X86_CR4_PSE 0x0010 /* enable page size extensions */
-#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
-#define X86_CR4_MCE 0x0040 /* Machine check enable */
-#define X86_CR4_PGE 0x0080 /* enable global pages */
-#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
-#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
-#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
-#define X86_CR4_VMXE 0x2000 /* enable VMX */
-#define X86_CR4_SMXE 0x4000 /* enable SMX */
-#define X86_CR4_FSGSBASE 0x10000 /* enable {rd,wr}{fs,gs}base */
-#define X86_CR4_PCIDE 0x20000 /* enable PCID */
-#define X86_CR4_OSXSAVE 0x40000 /* enable XSAVE/XRSTOR */
-#define X86_CR4_SMEP 0x100000/* enable SMEP */
-#define X86_CR4_SMAP 0x200000/* enable SMAP */
+#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */
+#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */
+#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */
+#define X86_CR4_DE 0x00000008 /* enable debugging extensions */
+#define X86_CR4_PSE 0x00000010 /* enable page size extensions */
+#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */
+#define X86_CR4_MCE 0x00000040 /* Machine check enable */
+#define X86_CR4_PGE 0x00000080 /* enable global pages */
+#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */
+#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
+#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
+#define X86_CR4_VMXE 0x00002000 /* enable VMX */
+#define X86_CR4_SMXE 0x00004000 /* enable SMX */
+#define X86_CR4_FSGSBASE 0x00010000 /* enable {rd,wr}{fs,gs}base */
+#define X86_CR4_PCIDE 0x00020000 /* enable PCID */
+#define X86_CR4_OSXSAVE 0x00040000 /* enable XSAVE/XRSTOR */
+#define X86_CR4_SMEP 0x00100000 /* enable SMEP */
+#define X86_CR4_SMAP 0x00200000 /* enable SMAP */
+#define X86_CR4_PKE 0x00400000 /* enable PKE */
/*
* Trap/fault mnemonics.
@@ -142,7 +143,7 @@
#define PFEC_page_paged (1U<<5)
#define PFEC_page_shared (1U<<6)
-#define XEN_MINIMAL_CR4 (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)
+#define XEN_MINIMAL_CR4 (X86_CR4_PGE | X86_CR4_PAE)
#define XEN_SYSCALL_MASK (X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF| \
X86_EFLAGS_NT|X86_EFLAGS_DF|X86_EFLAGS_IF| \
@@ -163,6 +164,14 @@ struct vcpu;
pc; \
})
+struct x86_cpu_id {
+ uint16_t vendor;
+ uint16_t family;
+ uint16_t model;
+ uint16_t feature; /* bit index */
+ const void *driver_data;
+};
+
struct cpuinfo_x86 {
__u8 x86; /* CPU family */
__u8 x86_vendor; /* CPU vendor */
@@ -180,9 +189,9 @@ struct cpuinfo_x86 {
__u32 booted_cores; /* number of cores as seen by OS */
__u32 x86_num_siblings; /* cpuid logical cpus per chip value */
__u32 apicid;
- int phys_proc_id; /* package ID of each logical CPU */
- int cpu_core_id; /* core ID of each logical CPU*/
- int compute_unit_id; /* AMD compute unit ID of each logical CPU */
+ __u32 phys_proc_id; /* package ID of each logical CPU */
+ __u32 cpu_core_id; /* core ID of each logical CPU*/
+ __u32 compute_unit_id; /* AMD compute unit ID of each logical CPU */
unsigned short x86_clflush_size;
} __cacheline_aligned;
@@ -204,6 +213,8 @@ extern u32 cpuid_ext_features;
/* Maximum width of physical addresses supported by the hardware */
extern unsigned int paddr_bits;
+extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id table[]);
+
extern void identify_cpu(struct cpuinfo_x86 *);
extern void setup_clear_cpu_cap(unsigned int);
extern void print_cpu_info(unsigned int cpu);
@@ -234,8 +245,8 @@ unsigned int apicid_to_socket(unsigned int);
/* Some CPUID calls want 'count' to be placed in ecx */
static inline void cpuid_count(
- int op,
- int count,
+ unsigned int op,
+ unsigned int count,
unsigned int *eax,
unsigned int *ebx,
unsigned int *ecx,
@@ -444,9 +455,12 @@ struct __packed __cacheline_aligned tss_struct {
* descriptor table entry. */
static always_inline void set_ist(idt_entry_t *idt, unsigned long ist)
{
+ idt_entry_t new = *idt;
+
/* IST is a 3 bit field, 32 bits into the IDT entry. */
ASSERT(ist <= IST_MAX);
- idt->a = (idt->a & ~(7UL << 32)) | (ist << 32);
+ new.a = (idt->a & ~(7UL << 32)) | (ist << 32);
+ _write_gate_lower(idt, &new);
}
#define IDT_ENTRIES 256
@@ -529,12 +543,24 @@ void trap_nop(void);
void enable_nmis(void);
void do_reserved_trap(struct cpu_user_regs *regs);
-void syscall_enter(void);
void sysenter_entry(void);
void sysenter_eflags_saved(void);
void compat_hypercall(void);
void int80_direct_trap(void);
+#define STUBS_PER_PAGE (PAGE_SIZE / STUB_BUF_SIZE)
+
+struct stubs {
+ union {
+ void(*func)(void);
+ unsigned long addr;
+ };
+ unsigned long mfn;
+};
+
+DECLARE_PER_CPU(struct stubs, stubs);
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn);
+
extern int hypercall(void);
int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
@@ -544,8 +570,15 @@ int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val);
void microcode_set_module(unsigned int);
int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void), unsigned long len);
-int microcode_resume_cpu(int cpu);
+int microcode_resume_cpu(unsigned int cpu);
+
+enum get_cpu_vendor {
+ gcv_host_early,
+ gcv_host_late,
+ gcv_guest
+};
+int get_cpu_vendor(const char vendor_id[], enum get_cpu_vendor);
void pv_cpuid(struct cpu_user_regs *regs);
#endif /* !__ASSEMBLY__ */
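Illustration (not part of the patch): the reworked set_ist() above now performs its read-modify-write on a local copy before committing it with _write_gate_lower(). The bit manipulation itself, in isolation, is just:

#include <stdint.h>

/* Replace the 3-bit IST field at bit 32 of the low gate word, leaving
 * every other bit of the IDT entry intact. */
static uint64_t with_ist(uint64_t gate_lo, unsigned int ist)
{
    return (gate_lo & ~(7ULL << 32)) | ((uint64_t)ist << 32);
}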
diff --git a/xen/include/asm-x86/psr.h b/xen/include/asm-x86/psr.h
index c6076e9..081750f 100644
--- a/xen/include/asm-x86/psr.h
+++ b/xen/include/asm-x86/psr.h
@@ -18,6 +18,9 @@
#include <xen/types.h>
+/* CAT cpuid level */
+#define PSR_CPUID_LEVEL_CAT 0x10
+
/* Resource Type Enumeration */
#define PSR_RESOURCE_TYPE_L3 0x2
@@ -46,7 +49,15 @@ static inline bool_t psr_cmt_enabled(void)
int psr_alloc_rmid(struct domain *d);
void psr_free_rmid(struct domain *d);
-void psr_assoc_rmid(unsigned int rmid);
+void psr_ctxt_switch_to(struct domain *d);
+
+int psr_get_cat_l3_info(unsigned int socket, uint32_t *cbm_len,
+ uint32_t *cos_max);
+int psr_get_l3_cbm(struct domain *d, unsigned int socket, uint64_t *cbm);
+int psr_set_l3_cbm(struct domain *d, unsigned int socket, uint64_t cbm);
+
+int psr_domain_init(struct domain *d);
+void psr_domain_free(struct domain *d);
#endif /* __ASM_PSR_H__ */
diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
index 762eb02..381d9f8 100644
--- a/xen/include/asm-x86/setup.h
+++ b/xen/include/asm-x86/setup.h
@@ -2,6 +2,7 @@
#define __X86_SETUP_H_
#include <xen/multiboot.h>
+#include <asm/numa.h>
extern unsigned long xenheap_initial_phys_start;
@@ -31,8 +32,9 @@ int construct_dom0(
module_t *initrd,
void *(*bootstrap_map)(const module_t *),
char *cmdline);
+void setup_io_bitmap(struct domain *d);
-unsigned long initial_images_nrpages(void);
+unsigned long initial_images_nrpages(nodeid_t node);
void discard_initial_images(void);
unsigned int dom0_max_vcpus(void);
diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
index f40cab4..6d0aefb 100644
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XEN_SHADOW_H
@@ -49,12 +48,14 @@
/* Set up the shadow-specific parts of a domain struct at start of day.
* Called from paging_domain_init(). */
-void shadow_domain_init(struct domain *d, unsigned int domcr_flags);
+int shadow_domain_init(struct domain *d, unsigned int domcr_flags);
/* Setup the shadow-specific parts of a vcpu struct. It is called by
* paging_vcpu_init() in paging.c */
void shadow_vcpu_init(struct vcpu *v);
+#ifdef CONFIG_SHADOW_PAGING
+
/* Enable an arbitrary shadow mode. Call once at domain creation. */
int shadow_enable(struct domain *d, u32 mode);
@@ -72,48 +73,45 @@ int shadow_domctl(struct domain *d,
XEN_GUEST_HANDLE_PARAM(void) u_domctl);
/* Call when destroying a domain */
-void shadow_teardown(struct domain *d);
+void shadow_teardown(struct domain *d, int *preempted);
/* Call once all of the references to the domain have gone away */
void shadow_final_teardown(struct domain *d);
-/* shadow code to call when log dirty is enabled */
-int shadow_enable_log_dirty(struct domain *d, bool_t log_global);
+void sh_remove_shadows(struct domain *d, mfn_t gmfn, int fast, int all);
+
+/* Discard _all_ mappings from the domain's shadows. */
+void shadow_blow_tables_per_domain(struct domain *d);
-/* shadow code to call when log dirty is disabled */
-int shadow_disable_log_dirty(struct domain *d);
+#else /* !CONFIG_SHADOW_PAGING */
-/* shadow code to call when bitmap is being cleaned */
-void shadow_clean_dirty_bitmap(struct domain *d);
+#define shadow_teardown(d, p) ASSERT(is_pv_domain(d))
+#define shadow_final_teardown(d) ASSERT(is_pv_domain(d))
+#define shadow_enable(d, mode) \
+ ({ ASSERT(is_pv_domain(d)); -EOPNOTSUPP; })
+#define shadow_track_dirty_vram(d, begin_pfn, nr, bitmap) \
+ ({ ASSERT_UNREACHABLE(); -EOPNOTSUPP; })
-/* Update all the things that are derived from the guest's CR0/CR3/CR4.
- * Called to initialize paging structures if the paging mode
- * has changed, and when bringing up a VCPU for the first time. */
-void shadow_update_paging_modes(struct vcpu *v);
+static inline void sh_remove_shadows(struct domain *d, mfn_t gmfn,
+ bool_t fast, bool_t all) {}
+static inline void shadow_blow_tables_per_domain(struct domain *d) {}
-/* Remove all mappings of the guest page from the shadows.
- * This is called from common code. It does not flush TLBs. */
-int sh_remove_all_mappings(struct vcpu *v, mfn_t target_mfn);
-static inline void
-shadow_drop_references(struct domain *d, struct page_info *p)
+static inline int shadow_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
+ XEN_GUEST_HANDLE_PARAM(void) u_domctl)
{
- if ( unlikely(shadow_mode_enabled(d)) )
- /* See the comment about locking in sh_remove_all_mappings */
- sh_remove_all_mappings(d->vcpu[0], _mfn(page_to_mfn(p)));
+ return -EINVAL;
}
+#endif /* CONFIG_SHADOW_PAGING */
+
/* Remove all shadows of the guest mfn. */
-void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all);
-static inline void shadow_remove_all_shadows(struct vcpu *v, mfn_t gmfn)
+static inline void shadow_remove_all_shadows(struct domain *d, mfn_t gmfn)
{
/* See the comment about locking in sh_remove_shadows */
- sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
+ sh_remove_shadows(d, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
}
-/* Discard _all_ mappings from the domain's shadows. */
-void shadow_blow_tables_per_domain(struct domain *d);
-
#endif /* _XEN_SHADOW_H */
/*
diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h
index 81f8610..ea07888 100644
--- a/xen/include/asm-x86/smp.h
+++ b/xen/include/asm-x86/smp.h
@@ -16,7 +16,8 @@
#include <asm/mpspec.h>
#endif
-#define BAD_APICID -1U
+#define BAD_APICID (-1U)
+#define INVALID_CUID (~0U) /* AMD Compute Unit ID */
#ifndef __ASSEMBLY__
/*
@@ -57,6 +58,17 @@ int hard_smp_processor_id(void);
void __stop_this_cpu(void);
+/*
+ * The value may be greater than the actual number of sockets in the system
+ * and is required not to change after initial startup.
+ */
+extern unsigned int nr_sockets;
+
+void set_nr_sockets(void);
+
+/* Representing HT and core siblings in each socket. */
+extern cpumask_t **socket_cpumask;
+
#endif /* !__ASSEMBLY__ */
#endif
diff --git a/xen/include/asm-x86/softirq.h b/xen/include/asm-x86/softirq.h
index 7225dea..ec787d6 100644
--- a/xen/include/asm-x86/softirq.h
+++ b/xen/include/asm-x86/softirq.h
@@ -7,7 +7,8 @@
#define MACHINE_CHECK_SOFTIRQ (NR_COMMON_SOFTIRQS + 3)
#define PCI_SERR_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
-#define NR_ARCH_SOFTIRQS 5
+#define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 5)
+#define NR_ARCH_SOFTIRQS 6
bool_t arch_skip_send_event_check(unsigned int cpu);
diff --git a/xen/include/asm-x86/spinlock.h b/xen/include/asm-x86/spinlock.h
index 06d9b04..7d69e75 100644
--- a/xen/include/asm-x86/spinlock.h
+++ b/xen/include/asm-x86/spinlock.h
@@ -1,34 +1,7 @@
#ifndef __ASM_SPINLOCK_H
#define __ASM_SPINLOCK_H
-#include <xen/config.h>
-#include <xen/lib.h>
-#include <asm/atomic.h>
-
-typedef struct {
- volatile s16 lock;
-} raw_spinlock_t;
-
-#define _RAW_SPIN_LOCK_UNLOCKED /*(raw_spinlock_t)*/ { 1 }
-
-#define _raw_spin_is_locked(x) ((x)->lock <= 0)
-
-static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
-{
- ASSERT(_raw_spin_is_locked(lock));
- asm volatile (
- "movw $1,%0"
- : "=m" (lock->lock) : : "memory" );
-}
-
-static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
-{
- s16 oldval;
- asm volatile (
- "xchgw %w0,%1"
- :"=r" (oldval), "=m" (lock->lock)
- :"0" ((s16)0) : "memory" );
- return (oldval > 0);
-}
+#define _raw_read_unlock(l) \
+ asm volatile ( "lock; dec%z0 %0" : "+m" ((l)->lock) :: "memory" )
#endif /* __ASM_SPINLOCK_H */
diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h
index 7111329..25a6a2a 100644
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -41,25 +41,25 @@ static always_inline unsigned long __xchg(
case 1:
asm volatile ( "xchgb %b0,%1"
: "=q" (x)
- : "m" (*__xg((volatile void *)ptr)), "0" (x)
+ : "m" (*__xg(ptr)), "0" (x)
: "memory" );
break;
case 2:
asm volatile ( "xchgw %w0,%1"
: "=r" (x)
- : "m" (*__xg((volatile void *)ptr)), "0" (x)
+ : "m" (*__xg(ptr)), "0" (x)
: "memory" );
break;
case 4:
asm volatile ( "xchgl %k0,%1"
: "=r" (x)
- : "m" (*__xg((volatile void *)ptr)), "0" (x)
+ : "m" (*__xg(ptr)), "0" (x)
: "memory" );
break;
case 8:
asm volatile ( "xchgq %0,%1"
: "=r" (x)
- : "m" (*__xg((volatile void *)ptr)), "0" (x)
+ : "m" (*__xg(ptr)), "0" (x)
: "memory" );
break;
}
@@ -81,28 +81,28 @@ static always_inline unsigned long __cmpxchg(
case 1:
asm volatile ( "lock; cmpxchgb %b1,%2"
: "=a" (prev)
- : "q" (new), "m" (*__xg((volatile void *)ptr)),
+ : "q" (new), "m" (*__xg(ptr)),
"0" (old)
: "memory" );
return prev;
case 2:
asm volatile ( "lock; cmpxchgw %w1,%2"
: "=a" (prev)
- : "r" (new), "m" (*__xg((volatile void *)ptr)),
+ : "r" (new), "m" (*__xg(ptr)),
"0" (old)
: "memory" );
return prev;
case 4:
asm volatile ( "lock; cmpxchgl %k1,%2"
: "=a" (prev)
- : "r" (new), "m" (*__xg((volatile void *)ptr)),
+ : "r" (new), "m" (*__xg(ptr)),
"0" (old)
: "memory" );
return prev;
case 8:
asm volatile ( "lock; cmpxchgq %1,%2"
: "=a" (prev)
- : "r" (new), "m" (*__xg((volatile void *)ptr)),
+ : "r" (new), "m" (*__xg(ptr)),
"0" (old)
: "memory" );
return prev;
@@ -118,6 +118,52 @@ static always_inline unsigned long __cmpxchg(
})
/*
+ * Undefined symbol to cause link failure if a wrong size is used with
+ * arch_fetch_and_add().
+ */
+extern unsigned long __bad_fetch_and_add_size(void);
+
+static always_inline unsigned long __xadd(
+ volatile void *ptr, unsigned long v, int size)
+{
+ switch ( size )
+ {
+ case 1:
+ asm volatile ( "lock; xaddb %b0,%1"
+ : "+r" (v), "+m" (*__xg(ptr))
+ :: "memory");
+ return v;
+ case 2:
+ asm volatile ( "lock; xaddw %w0,%1"
+ : "+r" (v), "+m" (*__xg(ptr))
+ :: "memory");
+ return v;
+ case 4:
+ asm volatile ( "lock; xaddl %k0,%1"
+ : "+r" (v), "+m" (*__xg(ptr))
+ :: "memory");
+ return v;
+ case 8:
+ asm volatile ( "lock; xaddq %q0,%1"
+ : "+r" (v), "+m" (*__xg(ptr))
+ :: "memory");
+
+ return v;
+ default:
+ return __bad_fetch_and_add_size();
+ }
+}
+
+/*
+ * Atomically add @v to the 1, 2, 4, or 8 byte value at @ptr. Returns
+ * the previous value.
+ *
+ * This is a full memory barrier.
+ */
+#define arch_fetch_and_add(ptr, v) \
+ ((typeof(*(ptr)))__xadd(ptr, (typeof(*(ptr)))(v), sizeof(*(ptr))))
+
+/*
* Both Intel and AMD agree that, from a programmer's viewpoint:
* Loads cannot be reordered relative to other loads.
* Stores cannot be reordered relative to other stores.
@@ -139,6 +185,17 @@ static always_inline unsigned long __cmpxchg(
#define set_mb(var, value) do { xchg(&var, value); } while (0)
#define set_wmb(var, value) do { var = value; wmb(); } while (0)
+/*
+ * On x86 the only reordering is of reads with older writes. In the
+ * lock case, the read in observe_head() can only be reordered with
+ * writes that precede it, and moving a write _into_ a locked section
+ * is OK. In the release case, the write in add_sized() can only be
+ * reordered with reads that follow it, and hoisting a read _into_ a
+ * locked region is OK.
+ */
+#define arch_lock_acquire_barrier() barrier()
+#define arch_lock_release_barrier() barrier()
+
#define local_irq_disable() asm volatile ( "cli" : : : "memory" )
#define local_irq_enable() asm volatile ( "sti" : : : "memory" )
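Illustration (not part of the patch): the arch_fetch_and_add() contract above (atomic add with a full barrier, returning the previous value) sketched with a portable compiler builtin:

#include <stdint.h>
#include <stdio.h>

static uint32_t ticket_head;

/* Each caller atomically takes the current value and bumps the counter;
 * these are the semantics arch_fetch_and_add(&ticket_head, 1) provides. */
static uint32_t take_ticket(void)
{
    return __sync_fetch_and_add(&ticket_head, 1);
}

int main(void)
{
    printf("%u\n", take_ticket()); /* 0 */
    printf("%u\n", take_ticket()); /* 1 */
    return 0;
}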
diff --git a/xen/include/asm-x86/time.h b/xen/include/asm-x86/time.h
index c4d82f6..39d6bf3 100644
--- a/xen/include/asm-x86/time.h
+++ b/xen/include/asm-x86/time.h
@@ -28,9 +28,7 @@ extern bool_t disable_tsc_sync;
static inline cycles_t get_cycles(void)
{
- cycles_t c;
- rdtscll(c);
- return c;
+ return rdtsc();
}
unsigned long
diff --git a/xen/include/asm-x86/traps.h b/xen/include/asm-x86/traps.h
index ebb6378..e3884d8 100644
--- a/xen/include/asm-x86/traps.h
+++ b/xen/include/asm-x86/traps.h
@@ -13,8 +13,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef ASM_TRAP_H
@@ -48,8 +47,8 @@ extern int send_guest_trap(struct domain *d, uint16_t vcpuid,
unsigned int trap_nr);
uint32_t guest_io_read(unsigned int port, unsigned int bytes,
- struct vcpu *, struct cpu_user_regs *);
+ struct domain *);
void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
- struct vcpu *, struct cpu_user_regs *);
+ struct domain *);
#endif /* ASM_TRAP_H */
diff --git a/xen/include/asm-x86/vm_event.h b/xen/include/asm-x86/vm_event.h
new file mode 100644
index 0000000..0ae5952
--- /dev/null
+++ b/xen/include/asm-x86/vm_event.h
@@ -0,0 +1,33 @@
+/*
+ * vm_event.h: architecture specific vm_event handling routines
+ *
+ * Copyright (c) 2015 Tamas K Lengyel (tamas at tklengyel.com)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ASM_X86_VM_EVENT_H__
+#define __ASM_X86_VM_EVENT_H__
+
+#include <xen/sched.h>
+#include <xen/vm_event.h>
+
+int vm_event_init_domain(struct domain *d);
+
+void vm_event_cleanup_domain(struct domain *d);
+
+void vm_event_toggle_singlestep(struct domain *d, struct vcpu *v);
+
+void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp);
+
+#endif /* __ASM_X86_VM_EVENT_H__ */
diff --git a/xen/include/asm-x86/hvm/vpmu.h b/xen/include/asm-x86/vpmu.h
similarity index 51%
rename from xen/include/asm-x86/hvm/vpmu.h
rename to xen/include/asm-x86/vpmu.h
index 1f28bd8..67e73dc 100644
--- a/xen/include/asm-x86/hvm/vpmu.h
+++ b/xen/include/asm-x86/vpmu.h
@@ -13,8 +13,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Author: Haitao Shan <haitao.shan at intel.com>
*/
@@ -22,19 +21,10 @@
#ifndef __ASM_X86_HVM_VPMU_H_
#define __ASM_X86_HVM_VPMU_H_
-/*
- * Flag bits given as a string on the hypervisor boot parameter 'vpmu'.
- * See arch/x86/hvm/vpmu.c.
- */
-#define VPMU_BOOT_ENABLED 0x1 /* vpmu generally enabled. */
-#define VPMU_BOOT_BTS 0x2 /* Intel BTS feature wanted. */
-
+#include <public/pmu.h>
-#define msraddr_to_bitpos(x) (((x)&0xffff) + ((x)>>31)*0x2000)
-#define vcpu_vpmu(vcpu) (&((vcpu)->arch.hvm_vcpu.vpmu))
-#define vpmu_vcpu(vpmu) (container_of((vpmu), struct vcpu, \
- arch.hvm_vcpu.vpmu))
-#define vpmu_domain(vpmu) (vpmu_vcpu(vpmu)->domain)
+#define vcpu_vpmu(vcpu) (&(vcpu)->arch.vpmu)
+#define vpmu_vcpu(vpmu) container_of((vpmu), struct vcpu, arch.vpmu)
#define MSR_TYPE_COUNTER 0
#define MSR_TYPE_CTRL 1
@@ -42,6 +32,9 @@
#define MSR_TYPE_ARCH_COUNTER 3
#define MSR_TYPE_ARCH_CTRL 4
+/* Start of PMU register bank */
+#define vpmu_reg_pointer(ctxt, offset) ((void *)((uintptr_t)ctxt + \
+ (uintptr_t)ctxt->offset))
/* Arch specific operations shared by all vpmus */
struct arch_vpmu_ops {
@@ -53,20 +46,25 @@ struct arch_vpmu_ops {
unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx);
void (*arch_vpmu_destroy)(struct vcpu *v);
- int (*arch_vpmu_save)(struct vcpu *v);
- void (*arch_vpmu_load)(struct vcpu *v);
+ int (*arch_vpmu_save)(struct vcpu *v, bool_t to_guest);
+ int (*arch_vpmu_load)(struct vcpu *v, bool_t from_guest);
void (*arch_vpmu_dump)(const struct vcpu *);
};
-int vmx_vpmu_initialise(struct vcpu *, unsigned int flags);
-int svm_vpmu_initialise(struct vcpu *, unsigned int flags);
+int core2_vpmu_init(void);
+int vmx_vpmu_initialise(struct vcpu *);
+int amd_vpmu_init(void);
+int svm_vpmu_initialise(struct vcpu *);
struct vpmu_struct {
u32 flags;
u32 last_pcpu;
u32 hw_lapic_lvtpc;
- void *context;
+ void *context; /* May be shared with PV guest */
+ void *priv_context; /* hypervisor-only */
struct arch_vpmu_ops *arch_vpmu_ops;
+ struct xen_pmu_data *xenpmu_data;
+ spinlock_t vpmu_lock;
};
/* VPMU states */
@@ -76,30 +74,71 @@ struct vpmu_struct {
#define VPMU_CONTEXT_SAVE 0x8 /* Force context save */
#define VPMU_FROZEN 0x10 /* Stop counters while VCPU is not running */
#define VPMU_PASSIVE_DOMAIN_ALLOCATED 0x20
-
-/* VPMU features */
-#define VPMU_CPU_HAS_DS 0x100 /* Has Debug Store */
-#define VPMU_CPU_HAS_BTS 0x200 /* Has Branch Trace Store */
-
-
-#define vpmu_set(_vpmu, _x) ((_vpmu)->flags |= (_x))
-#define vpmu_reset(_vpmu, _x) ((_vpmu)->flags &= ~(_x))
-#define vpmu_is_set(_vpmu, _x) ((_vpmu)->flags & (_x))
-#define vpmu_clear(_vpmu) ((_vpmu)->flags = 0)
-
-int vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, uint64_t supported);
-int vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content);
+/* PV(H) guests: VPMU registers are accessed by guest from shared page */
+#define VPMU_CACHED 0x40
+
+static inline void vpmu_set(struct vpmu_struct *vpmu, const u32 mask)
+{
+ vpmu->flags |= mask;
+}
+static inline void vpmu_reset(struct vpmu_struct *vpmu, const u32 mask)
+{
+ vpmu->flags &= ~mask;
+}
+static inline void vpmu_clear(struct vpmu_struct *vpmu)
+{
+ vpmu->flags = 0;
+}
+static inline bool_t vpmu_is_set(const struct vpmu_struct *vpmu, const u32 mask)
+{
+ return !!(vpmu->flags & mask);
+}
+static inline bool_t vpmu_are_all_set(const struct vpmu_struct *vpmu,
+ const u32 mask)
+{
+ return !!((vpmu->flags & mask) == mask);
+}
+
+void vpmu_lvtpc_update(uint32_t val);
+int vpmu_do_msr(unsigned int msr, uint64_t *msr_content,
+ uint64_t supported, bool_t is_write);
void vpmu_do_interrupt(struct cpu_user_regs *regs);
void vpmu_do_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx);
void vpmu_initialise(struct vcpu *v);
void vpmu_destroy(struct vcpu *v);
void vpmu_save(struct vcpu *v);
-void vpmu_load(struct vcpu *v);
+int vpmu_load(struct vcpu *v, bool_t from_guest);
void vpmu_dump(struct vcpu *v);
+static inline int vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content,
+ uint64_t supported)
+{
+ return vpmu_do_msr(msr, &msr_content, supported, 1);
+}
+static inline int vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
+{
+ return vpmu_do_msr(msr, msr_content, 0, 0);
+}
+
extern int acquire_pmu_ownership(int pmu_ownership);
extern void release_pmu_ownership(int pmu_ownership);
+extern unsigned int vpmu_mode;
+extern unsigned int vpmu_features;
+
+/* Context switch */
+static inline void vpmu_switch_from(struct vcpu *prev)
+{
+ if ( vpmu_mode & (XENPMU_MODE_SELF | XENPMU_MODE_HV) )
+ vpmu_save(prev);
+}
+
+static inline void vpmu_switch_to(struct vcpu *next)
+{
+ if ( vpmu_mode & (XENPMU_MODE_SELF | XENPMU_MODE_HV) )
+ vpmu_load(next, 0);
+}
+
#endif /* __ASM_X86_HVM_VPMU_H_*/
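Usage sketch (fragment, not part of the patch) of the new inline flag helpers: vpmu_is_set() is an any-bit test, while vpmu_are_all_set() requires the full mask.

struct vpmu_struct vpmu = { .flags = 0 };

vpmu_set(&vpmu, VPMU_CONTEXT_SAVE);
vpmu_is_set(&vpmu, VPMU_CONTEXT_SAVE | VPMU_FROZEN);      /* 1: any bit set */
vpmu_are_all_set(&vpmu, VPMU_CONTEXT_SAVE | VPMU_FROZEN); /* 0: FROZEN clear */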
diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
index 1d54587..19ab4d0 100644
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -147,8 +147,20 @@ typedef l4_pgentry_t root_pgentry_t;
*/
#define _PAGE_GUEST_KERNEL (1U<<12)
-#define PAGE_HYPERVISOR (__PAGE_HYPERVISOR | _PAGE_GLOBAL)
-#define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RO (__PAGE_HYPERVISOR_RO | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RW (__PAGE_HYPERVISOR_RW | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RX (__PAGE_HYPERVISOR_RX | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RWX (__PAGE_HYPERVISOR | _PAGE_GLOBAL)
+
+#ifdef __ASSEMBLY__
+/* Dependency on NX being available can't be expressed. */
+# define PAGE_HYPERVISOR PAGE_HYPERVISOR_RWX
+# define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
+#else
+# define PAGE_HYPERVISOR PAGE_HYPERVISOR_RW
+# define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | \
+ _PAGE_GLOBAL | _PAGE_NX)
+#endif
#endif /* __X86_64_PAGE_H__ */
diff --git a/xen/include/asm-x86/xenoprof.h b/xen/include/asm-x86/xenoprof.h
index 93f161b..b006ddc 100644
--- a/xen/include/asm-x86/xenoprof.h
+++ b/xen/include/asm-x86/xenoprof.h
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __ASM_X86_XENOPROF_H__
diff --git a/xen/include/efi/efidef.h b/xen/include/efi/efidef.h
index 07fdf0d..86a7e11 100644
--- a/xen/include/efi/efidef.h
+++ b/xen/include/efi/efidef.h
@@ -156,11 +156,15 @@ typedef enum {
#define EFI_MEMORY_WT 0x0000000000000004
#define EFI_MEMORY_WB 0x0000000000000008
#define EFI_MEMORY_UCE 0x0000000000000010
+#define EFI_MEMORY_WP 0x0000000000001000
// physical memory protection on range
-#define EFI_MEMORY_WP 0x0000000000001000
#define EFI_MEMORY_RP 0x0000000000002000
#define EFI_MEMORY_XP 0x0000000000004000
+#define EFI_MEMORY_RO 0x0000000000020000
+
+#define EFI_MEMORY_NV 0x0000000000008000
+#define EFI_MEMORY_MORE_RELIABLE 0x0000000000010000
// range requires a runtime mapping
#define EFI_MEMORY_RUNTIME 0x8000000000000000
diff --git a/xen/include/public/arch-arm.h b/xen/include/public/arch-arm.h
index e711606..9a96401 100644
--- a/xen/include/public/arch-arm.h
+++ b/xen/include/public/arch-arm.h
@@ -87,15 +87,10 @@
* unavailable/unsupported.
*
* HYPERVISOR_memory_op
- * All generic sub-operations.
- *
- * In addition the following arch specific sub-ops:
- * * XENMEM_add_to_physmap
- * * XENMEM_add_to_physmap_batch
+ * All generic sub-operations
*
* HYPERVISOR_domctl
* All generic sub-operations, with the exception of:
- * * XEN_DOMCTL_iomem_permission (not yet implemented)
* * XEN_DOMCTL_irq_permission (not yet implemented)
*
* HYPERVISOR_sched_op
@@ -170,6 +165,7 @@
#define XEN_HYPERCALL_TAG 0XEA1
+#define int64_aligned_t int64_t __attribute__((aligned(8)))
#define uint64_aligned_t uint64_t __attribute__((aligned(8)))
#ifndef __ASSEMBLY__
@@ -183,8 +179,8 @@
* XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
 * in a struct in memory. On ARM it is always 8 bytes in size and 8 bytes
* aligned.
- * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an
- * hypercall argument. It is 4 bytes on aarch and 8 bytes on aarch64.
+ * XEN_GUEST_HANDLE_PARAM represents a guest pointer, when passed as an
+ * hypercall argument. It is 4 bytes on aarch32 and 8 bytes on aarch64.
*/
#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
___DEFINE_XEN_GUEST_HANDLE(name, type); \
@@ -192,7 +188,6 @@
#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
#define __XEN_GUEST_HANDLE(name) __guest_handle_64_ ## name
#define XEN_GUEST_HANDLE(name) __XEN_GUEST_HANDLE(name)
-/* this is going to be changed on 64 bit */
#define XEN_GUEST_HANDLE_PARAM(name) __guest_handle_ ## name
#define set_xen_guest_handle_raw(hnd, val) \
do { \
@@ -303,7 +298,35 @@ struct vcpu_guest_context {
};
typedef struct vcpu_guest_context vcpu_guest_context_t;
DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
-#endif
+
+/*
+ * struct xen_arch_domainconfig's ABI is covered by
+ * XEN_DOMCTL_INTERFACE_VERSION.
+ */
+#define XEN_DOMCTL_CONFIG_GIC_NATIVE 0
+#define XEN_DOMCTL_CONFIG_GIC_V2 1
+#define XEN_DOMCTL_CONFIG_GIC_V3 2
+struct xen_arch_domainconfig {
+ /* IN/OUT */
+ uint8_t gic_version;
+ /* IN */
+ uint32_t nr_spis;
+ /*
+ * OUT
+ * Based on the property clock-frequency in the DT timer node.
+ * The property may be present when the bootloader/firmware doesn't
+ * correctly set CNTFRQ, which holds the timer frequency.
+ *
+ * As it's not possible to trap this register, we have to replicate
+ * the value in the guest DT.
+ *
+ * = 0 => property not present
+ * > 0 => Value of the property
+ *
+ */
+ uint32_t clock_frequency;
+};
+#endif /* __XEN__ || __XEN_TOOLS__ */
struct arch_vcpu_info {
};
@@ -318,7 +341,7 @@ typedef uint64_t xen_callback_t;
#if defined(__XEN__) || defined(__XEN_TOOLS__)
-/* PSR bits (CPSR, SPSR)*/
+/* PSR bits (CPSR, SPSR) */
#define PSR_THUMB (1<<5) /* Thumb Mode enable */
#define PSR_FIQ_MASK (1<<6) /* Fast Interrupt mask */
@@ -365,7 +388,8 @@ typedef uint64_t xen_callback_t;
/* Physical Address Space */
-/* vGIC mappings: Only one set of mapping is used by the guest.
+/*
+ * vGIC mappings: Only one set of mappings is used by the guest.
* Therefore they can overlap.
*/
@@ -382,10 +406,11 @@ typedef uint64_t xen_callback_t;
#define GUEST_GICV3_RDIST_STRIDE 0x20000ULL
#define GUEST_GICV3_RDIST_REGIONS 1
-#define GUEST_GICV3_GICR0_BASE 0x03020000ULL /* vCPU0 - vCPU7 */
-#define GUEST_GICV3_GICR0_SIZE 0x00100000ULL
+#define GUEST_GICV3_GICR0_BASE 0x03020000ULL /* vCPU0 - vCPU127 */
+#define GUEST_GICV3_GICR0_SIZE 0x01000000ULL
-/* 16MB == 4096 pages reserved for guest to use as a region to map its
+/*
+ * 16MB == 4096 pages reserved for guest to use as a region to map its
* grant table in.
*/
#define GUEST_GNTTAB_BASE 0x38000000ULL
@@ -423,6 +448,11 @@ typedef uint64_t xen_callback_t;
#endif
+#ifndef __ASSEMBLY__
+/* Stub definition of PMU structure */
+typedef struct xen_pmu_arch { uint8_t dummy; } xen_pmu_arch_t;
+#endif
+
#endif /* __XEN_PUBLIC_ARCH_ARM_H__ */
/*
diff --git a/xen/include/public/arch-x86/cpuid.h b/xen/include/public/arch-x86/cpuid.h
index 6005dfe..d709340 100644
--- a/xen/include/public/arch-x86/cpuid.h
+++ b/xen/include/public/arch-x86/cpuid.h
@@ -76,13 +76,14 @@
/*
* Leaf 5 (0x40000x04)
* HVM-specific features
+ * EAX: Features
+ * EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
*/
-
-/* EAX Features */
#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */
#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */
/* Memory mapped from other domains has valid IOMMU entries */
#define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2)
+#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */
#define XEN_CPUID_MAX_NUM_LEAVES 4
diff --git a/xen/include/public/arch-x86/hvm/save.h b/xen/include/public/arch-x86/hvm/save.h
index 88aab7e..efb0b62 100644
--- a/xen/include/public/arch-x86/hvm/save.h
+++ b/xen/include/public/arch-x86/hvm/save.h
@@ -569,6 +569,7 @@ struct hvm_viridian_domain_context {
uint64_t hypercall_gpa;
uint64_t guest_os_id;
uint64_t time_ref_count;
+ uint64_t reference_tsc;
};
DECLARE_HVM_SAVE_TYPE(VIRIDIAN_DOMAIN, 15, struct hvm_viridian_domain_context);
@@ -617,3 +618,13 @@ struct hvm_msr {
#define HVM_SAVE_CODE_MAX 20
#endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/public/arch-x86/pmu.h b/xen/include/public/arch-x86/pmu.h
new file mode 100644
index 0000000..68ebf12
--- /dev/null
+++ b/xen/include/public/arch-x86/pmu.h
@@ -0,0 +1,167 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2015 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_PMU_H__
+#define __XEN_PUBLIC_ARCH_X86_PMU_H__
+
+/* x86-specific PMU definitions */
+
+/* AMD PMU registers and structures */
+struct xen_pmu_amd_ctxt {
+ /*
+ * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t counters;
+ uint32_t ctrls;
+
+ /* Counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+typedef struct xen_pmu_amd_ctxt xen_pmu_amd_ctxt_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_amd_ctxt_t);
+
+/* Intel PMU registers and structures */
+struct xen_pmu_cntr_pair {
+ uint64_t counter;
+ uint64_t control;
+};
+typedef struct xen_pmu_cntr_pair xen_pmu_cntr_pair_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_cntr_pair_t);
+
+struct xen_pmu_intel_ctxt {
+ /*
+ * Offsets to fixed and architectural counter MSRs (relative to
+ * xen_pmu_arch.c.intel).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t fixed_counters;
+ uint32_t arch_counters;
+
+ /* PMU registers */
+ uint64_t global_ctrl;
+ uint64_t global_ovf_ctrl;
+ uint64_t global_status;
+ uint64_t fixed_ctrl;
+ uint64_t ds_area;
+ uint64_t pebs_enable;
+ uint64_t debugctl;
+
+ /* Fixed and architectural counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+typedef struct xen_pmu_intel_ctxt xen_pmu_intel_ctxt_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_intel_ctxt_t);
+
+/* Sampled domain's registers */
+struct xen_pmu_regs {
+ uint64_t ip;
+ uint64_t sp;
+ uint64_t flags;
+ uint16_t cs;
+ uint16_t ss;
+ uint8_t cpl;
+ uint8_t pad[3];
+};
+typedef struct xen_pmu_regs xen_pmu_regs_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_regs_t);
+
+/* PMU flags */
+#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */
+
+/*
+ * Architecture-specific information describing the state of the processor at
+ * the time of a PMU interrupt.
+ * Fields of this structure marked as RW for the guest should only be written
+ * by the guest while the PMU_CACHED bit in pmu_flags is set (which is done by
+ * the hypervisor during a PMU interrupt). The hypervisor will read the updated
+ * data during the XENPMU_flush hypercall and then clear the PMU_CACHED bit.
+ */
+struct xen_pmu_arch {
+ union {
+ /*
+ * Processor's registers at the time of interrupt.
+ * WO for hypervisor, RO for guests.
+ */
+ struct xen_pmu_regs regs;
+ /* Padding for adding new registers to xen_pmu_regs in the future */
+#define XENPMU_REGS_PAD_SZ 64
+ uint8_t pad[XENPMU_REGS_PAD_SZ];
+ } r;
+
+ /* WO for hypervisor, RO for guest */
+ uint64_t pmu_flags;
+
+ /*
+ * APIC LVTPC register.
+ * RW for both hypervisor and guest.
+ * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
+ * during XENPMU_flush or XENPMU_lvtpc_set.
+ */
+ union {
+ uint32_t lapic_lvtpc;
+ uint64_t pad;
+ } l;
+
+ /*
+ * Vendor-specific PMU registers.
+ * RW for both hypervisor and guest (see exceptions above).
+ * Guest's updates to this field are verified and then loaded by the
+ * hypervisor into hardware during XENPMU_flush
+ */
+ union {
+ struct xen_pmu_amd_ctxt amd;
+ struct xen_pmu_intel_ctxt intel;
+
+ /*
+ * Padding for contexts (fixed parts only, does not include MSR banks
+ * that are specified by offsets)
+ */
+#define XENPMU_CTXT_PAD_SZ 128
+ uint8_t pad[XENPMU_CTXT_PAD_SZ];
+ } c;
+};
+typedef struct xen_pmu_arch xen_pmu_arch_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_arch_t);
+
+#endif /* __XEN_PUBLIC_ARCH_X86_PMU_H__ */
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
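Illustration (not part of the patch, all values invented): the AMD and Intel contexts above locate their MSR banks via byte offsets into the flexible regs[] array; a vpmu_reg_pointer()-style computation looks like:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_amd_ctxt {
    uint32_t counters; /* byte offset of counter bank from context start */
    uint32_t ctrls;    /* byte offset of control bank from context start */
    uint64_t regs[];   /* counter bank followed by control bank */
};

int main(void)
{
    unsigned int npmc = 4; /* invented number of counters */
    struct demo_amd_ctxt *c =
        calloc(1, sizeof(*c) + 2 * npmc * sizeof(uint64_t));

    c->counters = sizeof(*c);
    c->ctrls = sizeof(*c) + npmc * sizeof(uint64_t);

    uint64_t *counter = (uint64_t *)((uintptr_t)c + c->counters);
    counter[0] = 0x1234;
    printf("PMC0 = %#llx\n", (unsigned long long)counter[0]);
    free(c);
    return 0;
}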
diff --git a/xen/include/public/arch-x86/xen-x86_32.h b/xen/include/public/arch-x86/xen-x86_32.h
index 1504191..be9a6c6 100644
--- a/xen/include/public/arch-x86/xen-x86_32.h
+++ b/xen/include/public/arch-x86/xen-x86_32.h
@@ -104,6 +104,7 @@
do { if ( sizeof(hnd) == 8 ) *(uint64_t *)&(hnd) = 0; \
(hnd).p = val; \
} while ( 0 )
+#define int64_aligned_t int64_t __attribute__((aligned(8)))
#define uint64_aligned_t uint64_t __attribute__((aligned(8)))
#define __XEN_GUEST_HANDLE_64(name) __guest_handle_64_ ## name
#define XEN_GUEST_HANDLE_64(name) __XEN_GUEST_HANDLE_64(name)
diff --git a/xen/include/public/arch-x86/xen.h b/xen/include/public/arch-x86/xen.h
index f35804b..2ecc9c9 100644
--- a/xen/include/public/arch-x86/xen.h
+++ b/xen/include/public/arch-x86/xen.h
@@ -220,14 +220,58 @@ typedef struct vcpu_guest_context vcpu_guest_context_t;
DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
struct arch_shared_info {
- unsigned long max_pfn; /* max pfn that appears in table */
- /* Frame containing list of mfns containing list of mfns containing p2m. */
+ /*
+ * Number of valid entries in the p2m table(s) anchored at
+ * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
+ */
+ unsigned long max_pfn;
+ /*
+ * Frame containing list of mfns containing list of mfns containing p2m.
+ * A value of 0 indicates it has not yet been set up, ~0 indicates it has
+ * been set to invalid e.g. due to the p2m being too large for the 3-level
+ * p2m tree. In this case the linear mapped p2m list anchored at p2m_vaddr
+ * is to be used.
+ */
xen_pfn_t pfn_to_mfn_frame_list_list;
unsigned long nmi_reason;
- uint64_t pad[32];
+ /*
+ * Following three fields are valid if p2m_cr3 contains a value different
+ * from 0.
+ * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
+ * p2m_cr3 is in the same format as a cr3 value in the vcpu register state
+ * and holds the folded machine frame number (via xen_pfn_to_cr3) of a
+ * L3 or L4 page table.
+ * p2m_vaddr holds the virtual address of the linear p2m list. All entries
+ * in the range [0...max_pfn[ are accessible via this pointer.
+ * p2m_generation will be incremented by the guest before and after each
+ * change of the mappings of the p2m list. p2m_generation starts at 0 and
+ * a value with the least significant bit set indicates that a mapping
+ * update is in progress. This allows software outside the guest (e.g. in
+ * Dom0) to verify that the mappings it has read are consistent and to detect
+ * whether they have changed since the last check.
+ * Modifying a p2m element in the linear p2m list is allowed via an atomic
+ * write only.
+ */
+ unsigned long p2m_cr3; /* cr3 value of the p2m address space */
+ unsigned long p2m_vaddr; /* virtual address of the p2m list */
+ unsigned long p2m_generation; /* generation count of p2m mapping */
+#ifdef __i386__
+ /* There's no room for this field in the generic structure. */
+ uint32_t wc_sec_hi;
+#endif
};
typedef struct arch_shared_info arch_shared_info_t;
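The p2m_generation protocol described above is a seqlock-style scheme. A consumer sketch (all names illustrative, not part of the patch):

#include <stdint.h>

struct p2m_view {
    volatile unsigned long generation;     /* mirrors p2m_generation */
    const volatile unsigned long *entries; /* mapped linear p2m list */
};

/* Retry until the read was not concurrent with a mapping update: the
 * generation must be even (no update in flight) and unchanged across
 * the access. */
static unsigned long read_p2m(const struct p2m_view *v, unsigned long pfn)
{
    unsigned long gen, mfn;

    do {
        gen = v->generation;
        __atomic_thread_fence(__ATOMIC_ACQUIRE);
        mfn = v->entries[pfn];
        __atomic_thread_fence(__ATOMIC_ACQUIRE);
    } while ( (gen & 1) || gen != v->generation );

    return mfn;
}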
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+/*
+ * struct xen_arch_domainconfig's ABI is covered by
+ * XEN_DOMCTL_INTERFACE_VERSION.
+ */
+struct xen_arch_domainconfig {
+ char dummy;
+};
+#endif
+
#endif /* !__ASSEMBLY__ */
/*
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 9e1bc63..675f021 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -37,7 +37,7 @@
#include "hvm/save.h"
#include "memory.h"
-#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000a
+#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000b
/*
* NB. xen_domctl.domain is an IN/OUT parameter for this operation.
@@ -64,23 +64,11 @@ struct xen_domctl_createdomain {
#define _XEN_DOMCTL_CDF_pvh_guest 4
#define XEN_DOMCTL_CDF_pvh_guest (1U<<_XEN_DOMCTL_CDF_pvh_guest)
uint32_t flags;
+ struct xen_arch_domainconfig config;
};
typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
-#if defined(__arm__) || defined(__aarch64__)
-#define XEN_DOMCTL_CONFIG_GIC_DEFAULT 0
-#define XEN_DOMCTL_CONFIG_GIC_V2 1
-#define XEN_DOMCTL_CONFIG_GIC_V3 2
-/* XEN_DOMCTL_configure_domain */
-struct xen_domctl_arm_configuredomain {
- /* IN/OUT parameters */
- uint8_t gic_version;
-};
-typedef struct xen_domctl_arm_configuredomain xen_domctl_arm_configuredomain_t;
-DEFINE_XEN_GUEST_HANDLE(xen_domctl_arm_configuredomain_t);
-#endif
-
/* XEN_DOMCTL_getdomaininfo */
struct xen_domctl_getdomaininfo {
/* OUT variables. */
@@ -161,27 +149,6 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_getmemlist_t);
#define XEN_DOMCTL_PFINFO_BROKEN (0xdU<<28) /* broken page */
#define XEN_DOMCTL_PFINFO_LTAB_MASK (0xfU<<28)
-struct xen_domctl_getpageframeinfo {
- /* IN variables. */
- uint64_aligned_t gmfn; /* GMFN to query */
- /* OUT variables. */
- /* Is the page PINNED to a type? */
- uint32_t type; /* see above type defs */
-};
-typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t;
-DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t);
-
-
-/* XEN_DOMCTL_getpageframeinfo2 */
-struct xen_domctl_getpageframeinfo2 {
- /* IN variables. */
- uint64_aligned_t num;
- /* IN/OUT variables. */
- XEN_GUEST_HANDLE_64(uint32) array;
-};
-typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t;
-DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t);
-
/* XEN_DOMCTL_getpageframeinfo3 */
struct xen_domctl_getpageframeinfo3 {
/* IN variables. */
@@ -357,7 +324,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
/* XEN_DOMCTL_scheduler_op */
/* Scheduler types. */
-#define XEN_SCHEDULER_SEDF 4
+/* #define XEN_SCHEDULER_SEDF 4 (Removed) */
#define XEN_SCHEDULER_CREDIT 5
#define XEN_SCHEDULER_CREDIT2 6
#define XEN_SCHEDULER_ARINC653 7
@@ -370,13 +337,6 @@ struct xen_domctl_scheduler_op {
uint32_t sched_id; /* XEN_SCHEDULER_* */
uint32_t cmd; /* XEN_DOMCTL_SCHEDOP_* */
union {
- struct xen_domctl_sched_sedf {
- uint64_aligned_t period;
- uint64_aligned_t slice;
- uint64_aligned_t latency;
- uint32_t extratime;
- uint32_t weight;
- } sedf;
struct xen_domctl_sched_credit {
uint16_t weight;
uint16_t cap;
@@ -449,7 +409,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t);
/* XEN_DOMCTL_settimeoffset */
struct xen_domctl_settimeoffset {
- int32_t time_offset_seconds; /* applied to domain wallclock time */
+ int64_aligned_t time_offset_seconds; /* applied to domain wallclock time */
};
typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
@@ -487,12 +447,33 @@ typedef struct xen_domctl_sendtrigger xen_domctl_sendtrigger_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_sendtrigger_t);
-/* Assign PCI device to HVM guest. Sets up IOMMU structures. */
+/* Assign a device to a guest. Sets up IOMMU structures. */
/* XEN_DOMCTL_assign_device */
/* XEN_DOMCTL_test_assign_device */
-/* XEN_DOMCTL_deassign_device */
+/*
+ * XEN_DOMCTL_deassign_device: The behavior of this DOMCTL differs
+ * between the different types of device:
+ *  - a PCI device (XEN_DOMCTL_DEV_PCI) will be reassigned to DOM0;
+ *  - a DT device (XEN_DOMCTL_DEV_DT) will be left unassigned. DOM0
+ *    will have to call XEN_DOMCTL_assign_device in order to use the
+ *    device again.
+ */
+#define XEN_DOMCTL_DEV_PCI 0
+#define XEN_DOMCTL_DEV_DT 1
struct xen_domctl_assign_device {
- uint32_t machine_sbdf; /* machine PCI ID of assigned device */
+ uint32_t dev; /* XEN_DOMCTL_DEV_* */
+ union {
+ struct {
+ uint32_t machine_sbdf; /* machine PCI ID of assigned device */
+ } pci;
+ struct {
+ uint32_t size; /* Length of the path */
+ XEN_GUEST_HANDLE_64(char) path; /* path to the device tree node */
+ } dt;
+ } u;
+ /* IN */
+#define XEN_DOMCTL_DEV_RDM_RELAXED 1
+ uint32_t flag; /* flag of assigned device */
};
typedef struct xen_domctl_assign_device xen_domctl_assign_device_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_assign_device_t);
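For example, a toolstack selecting the PCI arm of the new union might fill it as follows (fragment; the seg<<16 | bus<<8 | devfn SBDF packing is assumed here, and the hypercall plumbing is omitted):

struct xen_domctl_assign_device dev = {
    .dev = XEN_DOMCTL_DEV_PCI,
    .u.pci.machine_sbdf = (0 << 16) | (0x03 << 8) | (0x00 << 3) | 0x0,
    .flag = 0, /* or XEN_DOMCTL_DEV_RDM_RELAXED */
};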
@@ -516,6 +497,7 @@ typedef enum pt_irq_type_e {
PT_IRQ_TYPE_ISA,
PT_IRQ_TYPE_MSI,
PT_IRQ_TYPE_MSI_TRANSLATE,
+ PT_IRQ_TYPE_SPI, /* ARM: valid range 32-1019 */
} pt_irq_type_t;
struct xen_domctl_bind_pt_irq {
uint32_t machine_irq;
@@ -536,6 +518,9 @@ struct xen_domctl_bind_pt_irq {
uint32_t gflags;
uint64_aligned_t gtable;
} msi;
+ struct {
+ uint16_t spi;
+ } spi;
} u;
};
typedef struct xen_domctl_bind_pt_irq xen_domctl_bind_pt_irq_t;
@@ -710,18 +695,13 @@ typedef struct xen_domctl_disable_migrate {
/* XEN_DOMCTL_gettscinfo */
/* XEN_DOMCTL_settscinfo */
-struct xen_guest_tsc_info {
+typedef struct xen_domctl_tsc_info {
+ /* IN/OUT */
uint32_t tsc_mode;
uint32_t gtsc_khz;
uint32_t incarnation;
uint32_t pad;
uint64_aligned_t elapsed_nsec;
-};
-typedef struct xen_guest_tsc_info xen_guest_tsc_info_t;
-DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t);
-typedef struct xen_domctl_tsc_info {
- XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */
- xen_guest_tsc_info_t info; /* IN */
} xen_domctl_tsc_info_t;
/* XEN_DOMCTL_gdbsx_guestmemio guest mem io */
@@ -751,10 +731,21 @@ struct xen_domctl_gdbsx_domstatus {
};
/*
- * Memory event operations
+ * VM event operations
*/
-/* XEN_DOMCTL_mem_event_op */
+/* XEN_DOMCTL_vm_event_op */
+
+/*
+ * There are currently three rings available for VM events:
+ * sharing, monitor and paging. This hypercall allows one to
+ * control these rings (enable/disable), as well as to signal
+ * to the hypervisor to pull responses (resume) from the given
+ * ring.
+ */
+#define XEN_VM_EVENT_ENABLE 0
+#define XEN_VM_EVENT_DISABLE 1
+#define XEN_VM_EVENT_RESUME 2
/*
* Domain memory paging
@@ -763,42 +754,38 @@ struct xen_domctl_gdbsx_domstatus {
* pager<->hypervisor interface. Use XENMEM_paging_op*
* to perform per-page operations.
*
- * The XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE domctl returns several
+ * The XEN_VM_EVENT_PAGING_ENABLE domctl returns several
* non-standard error codes to indicate why paging could not be enabled:
* ENODEV - host lacks HAP support (EPT/NPT) or HAP is disabled in guest
* EMLINK - guest has iommu passthrough enabled
* EXDEV - guest has PoD enabled
* EBUSY - guest has or had paging enabled, ring buffer still active
*/
-#define XEN_DOMCTL_MEM_EVENT_OP_PAGING 1
-
-#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE 0
-#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_DISABLE 1
+#define XEN_DOMCTL_VM_EVENT_OP_PAGING 1
/*
- * Access permissions.
+ * Monitor helper.
*
* As with paging, use the domctl for teardown/setup of the
* helper<->hypervisor interface.
*
- * There are HVM hypercalls to set the per-page access permissions of every
- * page in a domain. When one of these permissions--independent, read,
- * write, and execute--is violated, the VCPU is paused and a memory event
- * is sent with what happened. (See public/mem_event.h) .
+ * The monitor interface can be used to register for various VM events. For
+ * example, there are HVM hypercalls to set the per-page access permissions
+ * of every page in a domain. When one of these permissions--independent,
+ * read, write, and execute--is violated, the VCPU is paused and a memory event
+ * is sent with what happened. The memory event handler can then resume the
+ * VCPU and redo the access with a XEN_VM_EVENT_RESUME option.
*
- * The memory event handler can then resume the VCPU and redo the access
- * with a XENMEM_access_op_resume hypercall.
+ * See public/vm_event.h for the list of available events that can be
+ * subscribed to via the monitor interface.
*
- * The XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE domctl returns several
+ * The XEN_VM_EVENT_MONITOR_* domctls return
* non-standard error codes to indicate why access could not be enabled:
* ENODEV - host lacks HAP support (EPT/NPT) or HAP is disabled in guest
* EBUSY - guest has or had access enabled, ring buffer still active
+ *
*/
-#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS 2
-
-#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE 0
-#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE 1
-#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE_INTROSPECTION 2
+#define XEN_DOMCTL_VM_EVENT_OP_MONITOR 2
/*
* Sharing ENOMEM helper.
@@ -813,21 +800,18 @@ struct xen_domctl_gdbsx_domstatus {
 * Note that sharing can be turned on (as per the domctl below)
* *without* this ring being setup.
*/
-#define XEN_DOMCTL_MEM_EVENT_OP_SHARING 3
-
-#define XEN_DOMCTL_MEM_EVENT_OP_SHARING_ENABLE 0
-#define XEN_DOMCTL_MEM_EVENT_OP_SHARING_DISABLE 1
+#define XEN_DOMCTL_VM_EVENT_OP_SHARING 3
/* Use for teardown/setup of helper<->hypervisor interface for paging,
 * access and sharing. */
-struct xen_domctl_mem_event_op {
- uint32_t op; /* XEN_DOMCTL_MEM_EVENT_OP_*_* */
- uint32_t mode; /* XEN_DOMCTL_MEM_EVENT_OP_* */
+struct xen_domctl_vm_event_op {
+ uint32_t op; /* XEN_VM_EVENT_* */
+ uint32_t mode; /* XEN_DOMCTL_VM_EVENT_OP_* */
uint32_t port; /* OUT: event channel for ring */
};
-typedef struct xen_domctl_mem_event_op xen_domctl_mem_event_op_t;
-DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_event_op_t);
+typedef struct xen_domctl_vm_event_op xen_domctl_vm_event_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vm_event_op_t);
/*
* Memory sharing operations
@@ -959,27 +943,37 @@ typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
#endif
-/*
- * Use in XEN_DOMCTL_setvnumainfo to set
- * vNUMA domain topology.
- */
+/* XEN_DOMCTL_setvnumainfo: specifies a virtual NUMA topology for the guest */
struct xen_domctl_vnuma {
+ /* IN: number of vNUMA nodes to setup. Shall be greater than 0 */
uint32_t nr_vnodes;
+ /* IN: number of memory ranges to setup */
uint32_t nr_vmemranges;
+ /*
+ * IN: number of vCPUs of the domain (used as size of the vcpu_to_vnode
+ * array declared below). Shall be equal to the domain's max_vcpus.
+ */
uint32_t nr_vcpus;
- uint32_t pad;
+ uint32_t pad; /* must be zero */
+
+ /*
+ * IN: array for specifying the distances of the vNUMA nodes
+ * between each other. Shall have nr_vnodes*nr_vnodes elements.
+ */
XEN_GUEST_HANDLE_64(uint) vdistance;
+ /*
+ * IN: array for specifying to what vNUMA node each vCPU belongs.
+ * Shall have nr_vcpus elements.
+ */
XEN_GUEST_HANDLE_64(uint) vcpu_to_vnode;
-
/*
- * vnodes to physical NUMA nodes mask.
- * This kept on per-domain basis for
- * interested consumers, such as numa aware ballooning.
+ * IN: array for specifying on what physical NUMA node each vNUMA
+ * node is placed. Shall have nr_vnodes elements.
*/
XEN_GUEST_HANDLE_64(uint) vnode_to_pnode;
-
/*
- * memory rages for each vNUMA node
+ * IN: array for specifying the memory ranges. Shall have
+ * nr_vmemranges elements.
*/
XEN_GUEST_HANDLE_64(xen_vmemrange_t) vmemrange;
};
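The array shapes spelled out above can be made concrete with a toolstack-side sketch (2 vnodes, 4 vCPUs, invented distances; guest-handle plumbing omitted):

uint32_t nr_vnodes = 2, nr_vmemranges = 2, nr_vcpus = 4;

/* nr_vnodes * nr_vnodes distance matrix, row-major */
unsigned int vdistance[4] = { 10, 20,
                              20, 10 };
/* one vnode per vCPU: nr_vcpus elements */
unsigned int vcpu_to_vnode[4] = { 0, 0, 1, 1 };
/* physical placement of each vnode: nr_vnodes elements */
unsigned int vnode_to_pnode[2] = { 0, 1 };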
@@ -996,6 +990,79 @@ struct xen_domctl_psr_cmt_op {
typedef struct xen_domctl_psr_cmt_op xen_domctl_psr_cmt_op_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_psr_cmt_op_t);
+/* XEN_DOMCTL_MONITOR_*
+ *
+ * Enable/disable monitoring various VM events.
+ * This domctl configures what events will be reported to helper apps
+ * via the ring buffer "MONITOR". The ring has to be first enabled
+ * with the domctl XEN_DOMCTL_VM_EVENT_OP_MONITOR.
+ *
+ * GET_CAPABILITIES can be used to determine which of these features is
+ * available on a given platform.
+ *
+ * NOTICE: mem_access events are also delivered via the "MONITOR" ring buffer;
+ * however, enabling/disabling those events is performed with the use of
+ * memory_op hypercalls!
+ */
+#define XEN_DOMCTL_MONITOR_OP_ENABLE 0
+#define XEN_DOMCTL_MONITOR_OP_DISABLE 1
+#define XEN_DOMCTL_MONITOR_OP_GET_CAPABILITIES 2
+
+#define XEN_DOMCTL_MONITOR_EVENT_WRITE_CTRLREG 0
+#define XEN_DOMCTL_MONITOR_EVENT_MOV_TO_MSR 1
+#define XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP 2
+#define XEN_DOMCTL_MONITOR_EVENT_SOFTWARE_BREAKPOINT 3
+#define XEN_DOMCTL_MONITOR_EVENT_GUEST_REQUEST 4
+
+struct xen_domctl_monitor_op {
+ uint32_t op; /* XEN_DOMCTL_MONITOR_OP_* */
+
+ /*
+ * When used with ENABLE/DISABLE this has to be set to
+ * the requested XEN_DOMCTL_MONITOR_EVENT_* value.
+ * With GET_CAPABILITIES this field returns a bitmap of
+ * events supported by the platform, in the format
+ * (1 << XEN_DOMCTL_MONITOR_EVENT_*).
+ */
+ uint32_t event;
+
+ /*
+ * Further options when issuing XEN_DOMCTL_MONITOR_OP_ENABLE.
+ */
+ union {
+ struct {
+ /* Which control register */
+ uint8_t index;
+ /* Pause vCPU until response */
+ uint8_t sync;
+ /* Send event only on a change of value */
+ uint8_t onchangeonly;
+ } mov_to_cr;
+
+ struct {
+ /* Enable the capture of an extended set of MSRs */
+ uint8_t extended_capture;
+ } mov_to_msr;
+
+ struct {
+ /* Pause vCPU until response */
+ uint8_t sync;
+ } guest_request;
+ } u;
+};
+typedef struct xen_domctl_monitor_op xen_domctl_monitor_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_monitor_op_t);
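Given the GET_CAPABILITIES format described above, a helper app tests for a feature like this (fragment; `op` is assumed to hold the structure returned by the domctl, and the called helper is hypothetical):

uint32_t caps = op.event; /* bitmap after XEN_DOMCTL_MONITOR_OP_GET_CAPABILITIES */

if ( caps & (1U << XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP) )
    enable_singlestep_monitoring(); /* hypothetical helper */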
+
+struct xen_domctl_psr_cat_op {
+#define XEN_DOMCTL_PSR_CAT_OP_SET_L3_CBM 0
+#define XEN_DOMCTL_PSR_CAT_OP_GET_L3_CBM 1
+ uint32_t cmd; /* IN: XEN_DOMCTL_PSR_CAT_OP_* */
+ uint32_t target; /* IN */
+ uint64_t data; /* IN/OUT */
+};
+typedef struct xen_domctl_psr_cat_op xen_domctl_psr_cat_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_psr_cat_op_t);
+
struct xen_domctl {
uint32_t cmd;
#define XEN_DOMCTL_createdomain 1
@@ -1004,8 +1071,8 @@ struct xen_domctl {
#define XEN_DOMCTL_unpausedomain 4
#define XEN_DOMCTL_getdomaininfo 5
#define XEN_DOMCTL_getmemlist 6
-#define XEN_DOMCTL_getpageframeinfo 7
-#define XEN_DOMCTL_getpageframeinfo2 8
+/* #define XEN_DOMCTL_getpageframeinfo 7 Obsolete - use getpageframeinfo3 */
+/* #define XEN_DOMCTL_getpageframeinfo2 8 Obsolete - use getpageframeinfo3 */
#define XEN_DOMCTL_setvcpuaffinity 9
#define XEN_DOMCTL_shadow_op 10
#define XEN_DOMCTL_max_mem 11
@@ -1050,7 +1117,7 @@ struct xen_domctl {
#define XEN_DOMCTL_suppress_spurious_page_faults 53
#define XEN_DOMCTL_debug_op 54
#define XEN_DOMCTL_gethvmcontext_partial 55
-#define XEN_DOMCTL_mem_event_op 56
+#define XEN_DOMCTL_vm_event_op 56
#define XEN_DOMCTL_mem_sharing_op 57
#define XEN_DOMCTL_disable_migrate 58
#define XEN_DOMCTL_gettscinfo 59
@@ -1070,7 +1137,8 @@ struct xen_domctl {
#define XEN_DOMCTL_set_vcpu_msrs 73
#define XEN_DOMCTL_setvnumainfo 74
#define XEN_DOMCTL_psr_cmt_op 75
-#define XEN_DOMCTL_arm_configure_domain 76
+#define XEN_DOMCTL_monitor_op 77
+#define XEN_DOMCTL_psr_cat_op 78
#define XEN_DOMCTL_gdbsx_guestmemio 1000
#define XEN_DOMCTL_gdbsx_pausevcpu 1001
#define XEN_DOMCTL_gdbsx_unpausevcpu 1002
@@ -1079,13 +1147,8 @@ struct xen_domctl {
domid_t domain;
union {
struct xen_domctl_createdomain createdomain;
-#if defined(__arm__) || defined(__aarch64__)
- struct xen_domctl_arm_configuredomain configuredomain;
-#endif
struct xen_domctl_getdomaininfo getdomaininfo;
struct xen_domctl_getmemlist getmemlist;
- struct xen_domctl_getpageframeinfo getpageframeinfo;
- struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
struct xen_domctl_getpageframeinfo3 getpageframeinfo3;
struct xen_domctl_nodeaffinity nodeaffinity;
struct xen_domctl_vcpuaffinity vcpuaffinity;
@@ -1118,7 +1181,7 @@ struct xen_domctl {
struct xen_domctl_set_target set_target;
struct xen_domctl_subscribe subscribe;
struct xen_domctl_debug_op debug_op;
- struct xen_domctl_mem_event_op mem_event_op;
+ struct xen_domctl_vm_event_op vm_event_op;
struct xen_domctl_mem_sharing_op mem_sharing_op;
#if defined(__i386__) || defined(__x86_64__)
struct xen_domctl_cpuid cpuid;
@@ -1136,6 +1199,8 @@ struct xen_domctl {
struct xen_domctl_gdbsx_domstatus gdbsx_domstatus;
struct xen_domctl_vnuma vnuma;
struct xen_domctl_psr_cmt_op psr_cmt_op;
+ struct xen_domctl_monitor_op monitor_op;
+ struct xen_domctl_psr_cat_op psr_cat_op;
uint8_t pad[128];
} u;
};
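As a minimal sketch (illustrative, not part of the imported tree): a toolstack would drive the new monitor interface by filling the domctl as below. The do_domctl() wrapper and the XEN_DOMCTL_MONITOR_OP_ENABLE / XEN_DOMCTL_MONITOR_EVENT_WRITE_CTRLREG constants (defined earlier in this header, outside the quoted hunk) are assumptions of the sketch; real callers go through libxc helpers.

    /* Minimal sketch, assuming a do_domctl() hypercall wrapper exists. */
    #include <string.h>

    extern int do_domctl(struct xen_domctl *domctl);   /* hypothetical */

    static int monitor_cr_writes(domid_t domid, uint8_t cr_index)
    {
        struct xen_domctl dc;

        memset(&dc, 0, sizeof(dc));
        dc.cmd = XEN_DOMCTL_monitor_op;
        dc.interface_version = XEN_DOMCTL_INTERFACE_VERSION;   /* assumed */
        dc.domain = domid;
        dc.u.monitor_op.op = XEN_DOMCTL_MONITOR_OP_ENABLE;     /* assumed */
        dc.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_WRITE_CTRLREG; /* assumed */
        dc.u.monitor_op.u.mov_to_cr.index = cr_index;   /* which control register */
        dc.u.monitor_op.u.mov_to_cr.sync = 1;           /* pause vCPU until response */
        dc.u.monitor_op.u.mov_to_cr.onchangeonly = 1;   /* only on a change of value */

        return do_domctl(&dc);
    }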
diff --git a/xen/include/public/errno.h b/xen/include/public/errno.h
new file mode 100644
index 0000000..8c88bb1
--- /dev/null
+++ b/xen/include/public/errno.h
@@ -0,0 +1,95 @@
+#ifndef __XEN_PUBLIC_ERRNO_H__
+
+#ifndef __ASSEMBLY__
+
+#define XEN_ERRNO(name, value) XEN_##name = value,
+enum xen_errno {
+
+#else /* !__ASSEMBLY__ */
+
+#define XEN_ERRNO(name, value) .equ XEN_##name, value
+
+#endif /* __ASSEMBLY__ */
+
+/* ` enum neg_errnoval { [ -Efoo for each Efoo in the list below ] } */
+/* ` enum errnoval { */
+
+#endif /* __XEN_PUBLIC_ERRNO_H__ */
+
+#ifdef XEN_ERRNO
+
+/*
+ * Values originating from x86 Linux. Please consider using respective
+ * values when adding new definitions here.
+ *
+ * The set of identifiers to be added here shouldn't extend beyond what
+ * POSIX mandates (see e.g.
+ * http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html)
+ * with the exception that we support some optional (XSR) values
+ * specified there (but no new ones should be added).
+ */
+
+XEN_ERRNO(EPERM, 1) /* Operation not permitted */
+XEN_ERRNO(ENOENT, 2) /* No such file or directory */
+XEN_ERRNO(ESRCH, 3) /* No such process */
+#ifdef __XEN__ /* Internal only, should never be exposed to the guest. */
+XEN_ERRNO(EINTR, 4) /* Interrupted system call */
+#endif
+XEN_ERRNO(EIO, 5) /* I/O error */
+XEN_ERRNO(ENXIO, 6) /* No such device or address */
+XEN_ERRNO(E2BIG, 7) /* Arg list too long */
+XEN_ERRNO(ENOEXEC, 8) /* Exec format error */
+XEN_ERRNO(EBADF, 9) /* Bad file number */
+XEN_ERRNO(ECHILD, 10) /* No child processes */
+XEN_ERRNO(EAGAIN, 11) /* Try again */
+XEN_ERRNO(ENOMEM, 12) /* Out of memory */
+XEN_ERRNO(EACCES, 13) /* Permission denied */
+XEN_ERRNO(EFAULT, 14) /* Bad address */
+XEN_ERRNO(EBUSY, 16) /* Device or resource busy */
+XEN_ERRNO(EEXIST, 17) /* File exists */
+XEN_ERRNO(EXDEV, 18) /* Cross-device link */
+XEN_ERRNO(ENODEV, 19) /* No such device */
+XEN_ERRNO(EINVAL, 22) /* Invalid argument */
+XEN_ERRNO(ENFILE, 23) /* File table overflow */
+XEN_ERRNO(EMFILE, 24) /* Too many open files */
+XEN_ERRNO(ENOSPC, 28) /* No space left on device */
+XEN_ERRNO(EMLINK, 31) /* Too many links */
+XEN_ERRNO(EDOM, 33) /* Math argument out of domain of func */
+XEN_ERRNO(ERANGE, 34) /* Math result not representable */
+XEN_ERRNO(EDEADLK, 35) /* Resource deadlock would occur */
+XEN_ERRNO(ENAMETOOLONG, 36) /* File name too long */
+XEN_ERRNO(ENOLCK, 37) /* No record locks available */
+XEN_ERRNO(ENOSYS, 38) /* Function not implemented */
+XEN_ERRNO(ENODATA, 61) /* No data available */
+XEN_ERRNO(ETIME, 62) /* Timer expired */
+XEN_ERRNO(EBADMSG, 74) /* Not a data message */
+XEN_ERRNO(EOVERFLOW, 75) /* Value too large for defined data type */
+XEN_ERRNO(EILSEQ, 84) /* Illegal byte sequence */
+#ifdef __XEN__ /* Internal only, should never be exposed to the guest. */
+XEN_ERRNO(ERESTART, 85) /* Interrupted system call should be restarted */
+#endif
+XEN_ERRNO(ENOTSOCK, 88) /* Socket operation on non-socket */
+XEN_ERRNO(EOPNOTSUPP, 95) /* Operation not supported on transport endpoint */
+XEN_ERRNO(EADDRINUSE, 98) /* Address already in use */
+XEN_ERRNO(EADDRNOTAVAIL, 99) /* Cannot assign requested address */
+XEN_ERRNO(ENOBUFS, 105) /* No buffer space available */
+XEN_ERRNO(EISCONN, 106) /* Transport endpoint is already connected */
+XEN_ERRNO(ENOTCONN, 107) /* Transport endpoint is not connected */
+XEN_ERRNO(ETIMEDOUT, 110) /* Connection timed out */
+
+#undef XEN_ERRNO
+#endif /* XEN_ERRNO */
+
+#ifndef __XEN_PUBLIC_ERRNO_H__
+#define __XEN_PUBLIC_ERRNO_H__
+
+/* ` } */
+
+#ifndef __ASSEMBLY__
+};
+#endif
+
+#define XEN_EWOULDBLOCK XEN_EAGAIN /* Operation would block */
+#define XEN_EDEADLOCK XEN_EDEADLK /* Resource deadlock would occur */
+
+#endif /* __XEN_PUBLIC_ERRNO_H__ */
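The double-pass structure of errno.h is easy to miss: the include guard is only defined at the bottom, so on first inclusion the whole file runs, opening and closing the enum (or emitting .equ directives under __ASSEMBLY__) around the XEN_ERRNO() list; later inclusions with a caller-supplied XEN_ERRNO() re-expand just the list. In C, the net effect is equivalent to this sketch:

    /* Illustrative expansion of what a C translation unit sees. */
    enum xen_errno {
        XEN_EPERM     = 1,    /* from XEN_ERRNO(EPERM, 1) */
        XEN_ENOENT    = 2,    /* from XEN_ERRNO(ENOENT, 2) */
        /* ... one enumerator per XEN_ERRNO() entry above ... */
        XEN_ETIMEDOUT = 110,  /* from XEN_ERRNO(ETIMEDOUT, 110) */
    };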
diff --git a/xen/include/public/features.h b/xen/include/public/features.h
index 16d92aa..2110b04 100644
--- a/xen/include/public/features.h
+++ b/xen/include/public/features.h
@@ -99,6 +99,9 @@
#define XENFEAT_grant_map_identity 12
*/
+/* Guest can use XENMEMF_vnode to specify virtual node for memory op. */
+#define XENFEAT_memory_op_vnode_supported 13
+
#define XENFEAT_NR_SUBMAPS 1
#endif /* __XEN_PUBLIC_FEATURES_H__ */
diff --git a/xen/include/public/grant_table.h b/xen/include/public/grant_table.h
index 20d4e77..e9393fd 100644
--- a/xen/include/public/grant_table.h
+++ b/xen/include/public/grant_table.h
@@ -134,8 +134,10 @@ struct grant_entry_v1 {
/* The domain being granted foreign privileges. [GST] */
domid_t domid;
/*
- * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
- * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
+ * GTF_permit_access: GFN that @domid is allowed to map and access. [GST]
+ * GTF_accept_transfer: GFN that @domid is allowed to transfer into. [GST]
+ * GTF_transfer_completed: MFN whose ownership transferred by @domid
+ * (non-translated guests only). [XEN]
*/
uint32_t frame;
};
@@ -321,7 +323,7 @@ typedef uint32_t grant_handle_t;
/*
* GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
* by devices and/or host CPUs. If successful, <handle> is a tracking number
- * that must be presented later to destroy the mapping(s). On error, <handle>
+ * that must be presented later to destroy the mapping(s). On error, <status>
* is a negative status code.
* NOTES:
* 1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
@@ -453,7 +455,7 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
struct gnttab_copy {
/* IN parameters. */
- struct {
+ struct gnttab_copy_ptr {
union {
grant_ref_t ref;
xen_pfn_t gmfn;
diff --git a/xen/include/public/hvm/e820.h b/xen/include/public/hvm/e820.h
index 5bdc227..6c58a37 100644
--- a/xen/include/public/hvm/e820.h
+++ b/xen/include/public/hvm/e820.h
@@ -1,4 +1,3 @@
-
/*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
@@ -17,6 +16,8 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2006, Keir Fraser
*/
#ifndef __XEN_PUBLIC_HVM_E820_H__
diff --git a/xen/include/public/hvm/hvm_info_table.h b/xen/include/public/hvm/hvm_info_table.h
index 36085fa..9e3f807 100644
--- a/xen/include/public/hvm/hvm_info_table.h
+++ b/xen/include/public/hvm/hvm_info_table.h
@@ -20,6 +20,8 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2006, Keir Fraser
*/
#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
diff --git a/xen/include/public/hvm/hvm_op.h b/xen/include/public/hvm/hvm_op.h
index eeb0a60..1606185 100644
--- a/xen/include/public/hvm/hvm_op.h
+++ b/xen/include/public/hvm/hvm_op.h
@@ -16,6 +16,8 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007, Keir Fraser
*/
#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
@@ -81,6 +83,7 @@ typedef enum {
HVMMEM_ram_rw, /* Normal read/write guest RAM */
HVMMEM_ram_ro, /* Read-only; writes are discarded */
HVMMEM_mmio_dm, /* Reads and writes go to the device model */
+ HVMMEM_mmio_write_dm /* Read-only; writes go to the device model */
} hvmmem_type_t;
/* Following tools-only interfaces may change in future. */
@@ -265,6 +268,13 @@ typedef uint16_t ioservid_t;
#define HVMOP_create_ioreq_server 17
struct xen_hvm_create_ioreq_server {
domid_t domid; /* IN - domain to be serviced */
+#define HVM_IOREQSRV_BUFIOREQ_OFF 0
+#define HVM_IOREQSRV_BUFIOREQ_LEGACY 1
+/*
+ * Use this when read_pointer gets updated atomically and
+ * the pointer pair gets read atomically:
+ */
+#define HVM_IOREQSRV_BUFIOREQ_ATOMIC 2
uint8_t handle_bufioreq; /* IN - should server handle buffered ioreqs */
ioservid_t id; /* OUT - server id */
};
@@ -369,6 +379,116 @@ DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_ioreq_server_state_t);
#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
+#if defined(__i386__) || defined(__x86_64__)
+
+/*
+ * HVMOP_set_evtchn_upcall_vector: Set a <vector> that should be used for event
+ * channel upcalls on the specified <vcpu>. If set, this vector will be used
+ * in preference to the domain-global callback (see HVM_PARAM_CALLBACK_IRQ).
+ */
+#define HVMOP_set_evtchn_upcall_vector 23
+struct xen_hvm_evtchn_upcall_vector {
+ uint32_t vcpu;
+ uint8_t vector;
+};
+typedef struct xen_hvm_evtchn_upcall_vector xen_hvm_evtchn_upcall_vector_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_evtchn_upcall_vector_t);
+
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
+#define HVMOP_guest_request_vm_event 24
+
+/* HVMOP_altp2m: perform altp2m state operations */
+#define HVMOP_altp2m 25
+
+#define HVMOP_ALTP2M_INTERFACE_VERSION 0x00000001
+
+struct xen_hvm_altp2m_domain_state {
+ /* IN or OUT variable on/off */
+ uint8_t state;
+};
+typedef struct xen_hvm_altp2m_domain_state xen_hvm_altp2m_domain_state_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_domain_state_t);
+
+struct xen_hvm_altp2m_vcpu_enable_notify {
+ uint32_t vcpu_id;
+ uint32_t pad;
+ /* #VE info area gfn */
+ uint64_t gfn;
+};
+typedef struct xen_hvm_altp2m_vcpu_enable_notify xen_hvm_altp2m_vcpu_enable_notify_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_vcpu_enable_notify_t);
+
+struct xen_hvm_altp2m_view {
+ /* IN/OUT variable */
+ uint16_t view;
+ /* Create view only: default access type
+ * NOTE: currently ignored */
+ uint16_t hvmmem_default_access; /* xenmem_access_t */
+};
+typedef struct xen_hvm_altp2m_view xen_hvm_altp2m_view_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_view_t);
+
+struct xen_hvm_altp2m_set_mem_access {
+ /* view */
+ uint16_t view;
+ /* Memory type */
+ uint16_t hvmmem_access; /* xenmem_access_t */
+ uint32_t pad;
+ /* gfn */
+ uint64_t gfn;
+};
+typedef struct xen_hvm_altp2m_set_mem_access xen_hvm_altp2m_set_mem_access_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_set_mem_access_t);
+
+struct xen_hvm_altp2m_change_gfn {
+ /* view */
+ uint16_t view;
+ uint16_t pad1;
+ uint32_t pad2;
+ /* old gfn */
+ uint64_t old_gfn;
+ /* new gfn, INVALID_GFN (~0UL) means revert */
+ uint64_t new_gfn;
+};
+typedef struct xen_hvm_altp2m_change_gfn xen_hvm_altp2m_change_gfn_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_change_gfn_t);
+
+struct xen_hvm_altp2m_op {
+ uint32_t version; /* HVMOP_ALTP2M_INTERFACE_VERSION */
+ uint32_t cmd;
+/* Get/set the altp2m state for a domain */
+#define HVMOP_altp2m_get_domain_state 1
+#define HVMOP_altp2m_set_domain_state 2
+/* Set the current VCPU to receive altp2m event notifications */
+#define HVMOP_altp2m_vcpu_enable_notify 3
+/* Create a new view */
+#define HVMOP_altp2m_create_p2m 4
+/* Destroy a view */
+#define HVMOP_altp2m_destroy_p2m 5
+/* Switch view for an entire domain */
+#define HVMOP_altp2m_switch_p2m 6
+/* Notify that a page of memory is to have specific access types */
+#define HVMOP_altp2m_set_mem_access 7
+/* Change a p2m entry to have a different gfn->mfn mapping */
+#define HVMOP_altp2m_change_gfn 8
+ domid_t domain;
+ uint16_t pad1;
+ uint32_t pad2;
+ union {
+ struct xen_hvm_altp2m_domain_state domain_state;
+ struct xen_hvm_altp2m_vcpu_enable_notify enable_notify;
+ struct xen_hvm_altp2m_view view;
+ struct xen_hvm_altp2m_set_mem_access set_mem_access;
+ struct xen_hvm_altp2m_change_gfn change_gfn;
+ uint8_t pad[64];
+ } u;
+};
+typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_op_t);
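A minimal sketch of driving the new interface (not from the tree; the hvm_op() wrapper is hypothetical and toolstacks would use a libxc helper): switching the whole domain to an alternate view only needs the cmd, domain, and u.view fields.

    /* Minimal sketch, assuming an hvm_op() hypercall wrapper. */
    #include <string.h>

    extern int hvm_op(unsigned int op, void *arg);   /* hypothetical */

    static int altp2m_switch_view(domid_t domid, uint16_t view)
    {
        struct xen_hvm_altp2m_op a;

        memset(&a, 0, sizeof(a));
        a.version = HVMOP_ALTP2M_INTERFACE_VERSION;
        a.cmd = HVMOP_altp2m_switch_p2m;
        a.domain = domid;
        a.u.view.view = view;

        return hvm_op(HVMOP_altp2m, &a);
    }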
+
#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
/*
diff --git a/xen/include/public/hvm/hvm_xs_strings.h b/xen/include/public/hvm/hvm_xs_strings.h
index 8aec935..146b0b0 100644
--- a/xen/include/public/hvm/hvm_xs_strings.h
+++ b/xen/include/public/hvm/hvm_xs_strings.h
@@ -20,6 +20,8 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2013, Citrix Systems
*/
#ifndef __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__
diff --git a/xen/include/public/hvm/ioreq.h b/xen/include/public/hvm/ioreq.h
index 5b5fedf..2e5809b 100644
--- a/xen/include/public/hvm/ioreq.h
+++ b/xen/include/public/hvm/ioreq.h
@@ -83,8 +83,17 @@ typedef struct buf_ioreq buf_ioreq_t;
#define IOREQ_BUFFER_SLOT_NUM 511 /* 8 bytes each, plus 2 4-byte indexes */
struct buffered_iopage {
- unsigned int read_pointer;
- unsigned int write_pointer;
+#ifdef __XEN__
+ union bufioreq_pointers {
+ struct {
+#endif
+ uint32_t read_pointer;
+ uint32_t write_pointer;
+#ifdef __XEN__
+ };
+ uint64_t full;
+ } ptrs;
+#endif
buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM];
}; /* NB. Size of this structure must be no greater than one page. */
typedef struct buffered_iopage buffered_iopage_t;
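The Xen-only union above exists so that, with HVM_IOREQSRV_BUFIOREQ_ATOMIC, the hypervisor can sample both ring pointers in one 64-bit load rather than two racy 32-bit ones. A hedged sketch of the producer-side fullness check (read_atomic() stands in for whatever single-load primitive the code base provides):

    /* Illustrative, __XEN__ side only; read_atomic() is assumed to do a
     * single 64-bit load of the pointer pair. */
    static int bufioreq_ring_full(const struct buffered_iopage *pg)
    {
        union bufioreq_pointers p;

        p.full = read_atomic(&pg->ptrs.full);
        return p.write_pointer - p.read_pointer >= IOREQ_BUFFER_SLOT_NUM;
    }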
diff --git a/xen/include/public/hvm/params.h b/xen/include/public/hvm/params.h
index 3c51072..356dfd3 100644
--- a/xen/include/public/hvm/params.h
+++ b/xen/include/public/hvm/params.h
@@ -16,6 +16,8 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007, Keir Fraser
*/
#ifndef __XEN_PUBLIC_HVM_PARAMS_H__
@@ -92,10 +94,15 @@
#define _HVMPV_time_ref_count 2
#define HVMPV_time_ref_count (1 << _HVMPV_time_ref_count)
+/* Enable Reference TSC Page (HV_X64_MSR_REFERENCE_TSC) */
+#define _HVMPV_reference_tsc 3
+#define HVMPV_reference_tsc (1 << _HVMPV_reference_tsc)
+
#define HVMPV_feature_mask \
(HVMPV_base_freq | \
HVMPV_no_freq | \
- HVMPV_time_ref_count)
+ HVMPV_time_ref_count | \
+ HVMPV_reference_tsc)
#endif
@@ -157,8 +164,7 @@
*/
#define HVM_PARAM_ACPI_IOPORTS_LOCATION 19
-/* Enable blocking memory events, async or sync (pause vcpu until response)
- * onchangeonly indicates messages only on a change of value */
+/* Deprecated */
#define HVM_PARAM_MEMORY_EVENT_CR0 20
#define HVM_PARAM_MEMORY_EVENT_CR3 21
#define HVM_PARAM_MEMORY_EVENT_CR4 22
@@ -166,18 +172,12 @@
#define HVM_PARAM_MEMORY_EVENT_SINGLE_STEP 25
#define HVM_PARAM_MEMORY_EVENT_MSR 30
-#define HVMPME_MODE_MASK (3 << 0)
-#define HVMPME_mode_disabled 0
-#define HVMPME_mode_async 1
-#define HVMPME_mode_sync 2
-#define HVMPME_onchangeonly (1 << 2)
-
/* Boolean: Enable nestedhvm (hvm only) */
#define HVM_PARAM_NESTEDHVM 24
/* Params for the mem event rings */
#define HVM_PARAM_PAGING_RING_PFN 27
-#define HVM_PARAM_ACCESS_RING_PFN 28
+#define HVM_PARAM_MONITOR_RING_PFN 28
#define HVM_PARAM_SHARING_RING_PFN 29
/* SHUTDOWN_* action in case of a triple fault */
@@ -189,6 +189,9 @@
/* Location of the VM Generation ID in guest physical address space. */
#define HVM_PARAM_VM_GENERATION_ID_ADDR 34
-#define HVM_NR_PARAMS 35
+/* Boolean: Enable altp2m */
+#define HVM_PARAM_ALTP2M 35
+
+#define HVM_NR_PARAMS 36
#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
diff --git a/xen/include/public/io/blkif.h b/xen/include/public/io/blkif.h
index 6baf7fb..8f0f9a6 100644
--- a/xen/include/public/io/blkif.h
+++ b/xen/include/public/io/blkif.h
@@ -92,6 +92,12 @@
* backend driver to open the backing device. (e.g. the path to the
* file or block device representing the backing store.)
*
+ * physical-device
+ * Values: "MAJOR:MINOR"
+ *
+ * MAJOR and MINOR are the major number and minor number of the
+ * backing device respectively.
+ *
* type
* Values: "file", "phy", "tap"
*
diff --git a/xen/include/public/io/libxenvchan.h b/xen/include/public/io/libxenvchan.h
index 5c3d3d4..a62869a 100644
--- a/xen/include/public/io/libxenvchan.h
+++ b/xen/include/public/io/libxenvchan.h
@@ -21,8 +21,7 @@
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* @section DESCRIPTION
*
diff --git a/xen/include/public/io/netif.h b/xen/include/public/io/netif.h
index 61e9aea..5c31ae3 100644
--- a/xen/include/public/io/netif.h
+++ b/xen/include/public/io/netif.h
@@ -136,14 +136,145 @@
*/
/*
+ * "feature-multicast-control" advertises the capability to filter ethernet
+ * multicast packets in the backend. To enable use of this capability the
+ * frontend must set "request-multicast-control" before moving into the
+ * connected state.
+ *
+ * If "request-multicast-control" is set then the backend transmit side should
+ * no longer flood multicast packets to the frontend; it should instead drop
+ * any multicast packet that does not match an entry in its filter list. The
+ * list is amended by the frontend by sending dummy transmit requests containing
+ * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL} extra-info fragments as specified below.
+ * Once enabled by the frontend, the feature cannot be disabled except by
+ * closing and re-connecting to the backend.
+ */
+
+/*
* This is the 'wire' format for packets:
- * Request 1: netif_tx_request -- NETTXF_* (any flags)
- * [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info)
- * [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_MORE)
- * Request 4: netif_tx_request -- NETTXF_more_data
- * Request 5: netif_tx_request -- NETTXF_more_data
+ * Request 1: netif_tx_request_t -- NETTXF_* (any flags)
+ * [Request 2: netif_extra_info_t] (only if request 1 has NETTXF_extra_info)
+ * [Request 3: netif_extra_info_t] (only if request 2 has XEN_NETIF_EXTRA_MORE)
+ * Request 4: netif_tx_request_t -- NETTXF_more_data
+ * Request 5: netif_tx_request_t -- NETTXF_more_data
* ...
- * Request N: netif_tx_request -- 0
+ * Request N: netif_tx_request_t -- 0
+ */
+
+/*
+ * Guest transmit
+ * ==============
+ *
+ * Ring slot size is 12 octets; however, not all request/response
+ * structs use the full size.
+ *
+ * tx request data (netif_tx_request_t)
+ * ------------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | grant ref             | offset    | flags     |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | size      |
+ * +-----+-----+-----+-----+
+ *
+ * grant ref: Reference to buffer page.
+ * offset: Offset within buffer page.
+ * flags: NETTXF_*.
+ * id: request identifier, echoed in response.
+ * size: packet size in bytes.
+ *
+ * tx response (netif_tx_response_t)
+ * ---------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | status    | unused                |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | unused                |
+ * +-----+-----+-----+-----+
+ *
+ * id: reflects id in transmit request
+ * status: NETIF_RSP_*
+ *
+ * Guest receive
+ * =============
+ *
+ * Ring slot size is 8 octets.
+ *
+ * rx request (netif_rx_request_t)
+ * -------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | pad       | gref                  |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * id: request identifier, echoed in response.
+ * gref: reference to incoming granted frame.
+ *
+ * rx response (netif_rx_response_t)
+ * ---------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | offset    | flags     | status    |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * id: reflects id in receive request
+ * offset: offset in page of start of received packet
+ * flags: NETRXF_*
+ * status: -ve: NETIF_RSP_*; +ve: Rx'ed pkt size.
+ *
+ * Extra Info
+ * ==========
+ *
+ * Can be present if the initial request has NET{T,R}XF_extra_info, or if a
+ * previous extra-info request has XEN_NETIF_EXTRA_MORE.
+ *
+ * The struct therefore needs to fit into either a tx or rx slot, and so
+ * is limited to 8 octets.
+ *
+ * extra info (netif_extra_info_t)
+ * -------------------------------
+ *
+ * General format:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |type |flags| type specific data                |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | padding for tx        |
+ * +-----+-----+-----+-----+
+ *
+ * type: XEN_NETIF_EXTRA_TYPE_*
+ * flags: XEN_NETIF_EXTRA_FLAG_*
+ * padding for tx: present only in the tx case, since the struct is limited
+ * to 8 octets by the rx slot size. Not shown in the type-specific
+ * entries below.
+ *
+ * XEN_NETIF_EXTRA_TYPE_GSO:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |type |flags| size      |type | pad | features  |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * type: Must be XEN_NETIF_EXTRA_TYPE_GSO
+ * flags: XEN_NETIF_EXTRA_FLAG_*
+ * size: Maximum payload size of each segment.
+ * type: XEN_NETIF_GSO_TYPE_*
+ * features: XEN_NETIF_GSO_FEAT_*
+ *
+ * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |type |flags| addr                              |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * type: Must be XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}
+ * flags: XEN_NETIF_EXTRA_FLAG_*
+ * addr: address to add/remove
*/
/* Protocol checksum field is blank in the packet (hardware offload)? */
@@ -179,7 +310,7 @@ typedef struct netif_tx_request netif_tx_request_t;
#define XEN_NETIF_EXTRA_TYPE_MCAST_DEL (3) /* u.mcast */
#define XEN_NETIF_EXTRA_TYPE_MAX (4)
-/* netif_extra_info flags. */
+/* netif_extra_info_t flags. */
#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
#define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
@@ -189,8 +320,8 @@ typedef struct netif_tx_request netif_tx_request_t;
#define XEN_NETIF_GSO_TYPE_TCPV6 (2)
/*
- * This structure needs to fit within both netif_tx_request and
- * netif_rx_response for compatibility.
+ * This structure needs to fit within both netif_tx_request_t and
+ * netif_rx_response_t for compatibility.
*/
struct netif_extra_info {
uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */
@@ -225,14 +356,6 @@ struct netif_extra_info {
/*
* XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}:
- * Backend advertises availability via 'feature-multicast-control'
- * xenbus node containing value '1'.
- * Frontend requests this feature by advertising
- * 'request-multicast-control' xenbus node containing value '1'.
- * If multicast control is requested then multicast flooding is
- * disabled and the frontend must explicitly register its interest
- * in multicast groups using dummy transmit requests containing
- * MCAST_{ADD,DEL} extra-info fragments.
*/
struct {
uint8_t addr[6]; /* Address to add/remove. */
@@ -251,6 +374,7 @@ typedef struct netif_tx_response netif_tx_response_t;
struct netif_rx_request {
uint16_t id; /* Echoed in response message. */
+ uint16_t pad;
grant_ref_t gref; /* Reference to incoming granted frame */
};
typedef struct netif_rx_request netif_rx_request_t;
@@ -289,7 +413,7 @@ DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response);
#define NETIF_RSP_DROPPED -2
#define NETIF_RSP_ERROR -1
#define NETIF_RSP_OKAY 0
-/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
+/* No response: used for auxiliary requests (e.g., netif_extra_info_t). */
#define NETIF_RSP_NULL 1
#endif
diff --git a/xen/include/public/io/protocols.h b/xen/include/public/io/protocols.h
index 80b196b..40a9b30 100644
--- a/xen/include/public/io/protocols.h
+++ b/xen/include/public/io/protocols.h
@@ -18,6 +18,8 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2008, Keir Fraser
*/
#ifndef __XEN_PROTOCOLS_H__
diff --git a/xen/include/public/io/ring.h b/xen/include/public/io/ring.h
index 73e13d7..ba9401b 100644
--- a/xen/include/public/io/ring.h
+++ b/xen/include/public/io/ring.h
@@ -111,7 +111,7 @@ struct __name##_sring { \
uint8_t msg; \
} tapif_user; \
uint8_t pvt_pad[4]; \
- } private; \
+ } pvt; \
uint8_t __pad[44]; \
union __name##_sring_entry ring[1]; /* variable-length */ \
}; \
@@ -156,7 +156,7 @@ typedef struct __name##_back_ring __name##_back_ring_t
#define SHARED_RING_INIT(_s) do { \
(_s)->req_prod = (_s)->rsp_prod = 0; \
(_s)->req_event = (_s)->rsp_event = 1; \
- (void)memset((_s)->private.pvt_pad, 0, sizeof((_s)->private.pvt_pad)); \
+ (void)memset((_s)->pvt.pvt_pad, 0, sizeof((_s)->pvt.pvt_pad)); \
(void)memset((_s)->__pad, 0, sizeof((_s)->__pad)); \
} while(0)
diff --git a/xen/include/public/io/usbif.h b/xen/include/public/io/usbif.h
index 0af2a38..9ef0cdc 100644
--- a/xen/include/public/io/usbif.h
+++ b/xen/include/public/io/usbif.h
@@ -31,6 +31,76 @@
#include "ring.h"
#include "../grant_table.h"
+/*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen pvUSB driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters. This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * XenStore nodes in sections marked "PRIVATE" are solely for use by the
+ * driver side whose XenBus tree contains them.
+ *
+ *****************************************************************************
+ * Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *------------------ Backend Device Identification (PRIVATE) ------------------
+ *
+ * num-ports
+ * Values: unsigned [1...31]
+ *
+ * Number of ports for this (virtual) USB host connector.
+ *
+ * usb-ver
+ * Values: unsigned [1...2]
+ *
+ * USB version of this host connector: 1 = USB 1.1, 2 = USB 2.0.
+ *
+ * port/[1...31]
+ * Values: string
+ *
+ * Physical USB device connected to the given port, e.g. "3-1.5".
+ *
+ *****************************************************************************
+ * Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ * Values: unsigned
+ *
+ * The identifier of the Xen event channel used to signal activity
+ * in the ring buffer.
+ *
+ * urb-ring-ref
+ * Values: unsigned
+ *
+ * The Xen grant reference granting permission for the backend to map
+ * the sole page in a single page sized ring buffer. This is the ring
+ * buffer for urb requests.
+ *
+ * conn-ring-ref
+ * Values: unsigned
+ *
+ * The Xen grant reference granting permission for the backend to map
+ * the sole page in a single page sized ring buffer. This is the ring
+ * buffer for connection/disconnection requests.
+ *
+ * protocol
+ * Values: string (XEN_IO_PROTO_ABI_*)
+ * Default Value: XEN_IO_PROTO_ABI_NATIVE
+ *
+ * The machine ABI rules governing the format of all ring request and
+ * response structures.
+ *
+ */
+
enum usb_spec_version {
USB_VER_UNKNOWN = 0,
USB_VER_USB11,
@@ -41,37 +111,65 @@ enum usb_spec_version {
/*
* USB pipe in usbif_request
*
- * bits 0-5 are specific bits for virtual USB driver.
- * bits 7-31 are standard urb pipe.
- *
- * - port number(NEW): bits 0-4
- * (USB_MAXCHILDREN is 31)
+ * - port number: bits 0-4
+ * (USB_MAXCHILDREN is 31)
*
- * - operation flag(NEW): bit 5
- * (0 = submit urb,
- * 1 = unlink urb)
+ * - operation flag: bit 5
+ * (0 = submit urb,
+ * 1 = unlink urb)
*
* - direction: bit 7
- * (0 = Host-to-Device [Out]
- * 1 = Device-to-Host [In])
+ * (0 = Host-to-Device [Out]
+ * 1 = Device-to-Host [In])
*
* - device address: bits 8-14
*
* - endpoint: bits 15-18
*
- * - pipe type: bits 30-31
- * (00 = isochronous, 01 = interrupt,
- * 10 = control, 11 = bulk)
+ * - pipe type: bits 30-31
+ * (00 = isochronous, 01 = interrupt,
+ * 10 = control, 11 = bulk)
*/
-#define usbif_pipeportnum(pipe) ((pipe) & 0x1f)
-#define usbif_setportnum_pipe(pipe, portnum) \
- ((pipe)|(portnum))
-#define usbif_pipeunlink(pipe) ((pipe) & 0x20)
-#define usbif_pipesubmit(pipe) (!usbif_pipeunlink(pipe))
-#define usbif_setunlink_pipe(pipe) ((pipe)|(0x20))
+#define USBIF_PIPE_PORT_MASK 0x0000001f
+#define USBIF_PIPE_UNLINK 0x00000020
+#define USBIF_PIPE_DIR 0x00000080
+#define USBIF_PIPE_DEV_MASK 0x0000007f
+#define USBIF_PIPE_DEV_SHIFT 8
+#define USBIF_PIPE_EP_MASK 0x0000000f
+#define USBIF_PIPE_EP_SHIFT 15
+#define USBIF_PIPE_TYPE_MASK 0x00000003
+#define USBIF_PIPE_TYPE_SHIFT 30
+#define USBIF_PIPE_TYPE_ISOC 0
+#define USBIF_PIPE_TYPE_INT 1
+#define USBIF_PIPE_TYPE_CTRL 2
+#define USBIF_PIPE_TYPE_BULK 3
+
+#define usbif_pipeportnum(pipe) ((pipe) & USBIF_PIPE_PORT_MASK)
+#define usbif_setportnum_pipe(pipe, portnum) ((pipe) | (portnum))
+
+#define usbif_pipeunlink(pipe) ((pipe) & USBIF_PIPE_UNLINK)
+#define usbif_pipesubmit(pipe) (!usbif_pipeunlink(pipe))
+#define usbif_setunlink_pipe(pipe) ((pipe) | USBIF_PIPE_UNLINK)
+
+#define usbif_pipein(pipe) ((pipe) & USBIF_PIPE_DIR)
+#define usbif_pipeout(pipe) (!usbif_pipein(pipe))
+
+#define usbif_pipedevice(pipe) \
+ (((pipe) >> USBIF_PIPE_DEV_SHIFT) & USBIF_PIPE_DEV_MASK)
+
+#define usbif_pipeendpoint(pipe) \
+ (((pipe) >> USBIF_PIPE_EP_SHIFT) & USBIF_PIPE_EP_MASK)
+
+#define usbif_pipetype(pipe) \
+ (((pipe) >> USBIF_PIPE_TYPE_SHIFT) & USBIF_PIPE_TYPE_MASK)
+#define usbif_pipeisoc(pipe) (usbif_pipetype(pipe) == USBIF_PIPE_TYPE_ISOC)
+#define usbif_pipeint(pipe) (usbif_pipetype(pipe) == USBIF_PIPE_TYPE_INT)
+#define usbif_pipectrl(pipe) (usbif_pipetype(pipe) == USBIF_PIPE_TYPE_CTRL)
+#define usbif_pipebulk(pipe) (usbif_pipetype(pipe) == USBIF_PIPE_TYPE_BULK)
#define USBIF_MAX_SEGMENTS_PER_REQUEST (16)
+#define USBIF_MAX_PORTNR 31
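A quick illustration of the new constants (not from the tree): building a bulk IN pipe for device 3, endpoint 2 on port 1, then decoding it with the accessor macros.

    /* Illustrative only. */
    #include <assert.h>
    #include <stdint.h>

    static void pipe_demo(void)
    {
        uint32_t pipe =
            ((uint32_t)USBIF_PIPE_TYPE_BULK << USBIF_PIPE_TYPE_SHIFT) |
            (2u << USBIF_PIPE_EP_SHIFT) |      /* endpoint 2 */
            (3u << USBIF_PIPE_DEV_SHIFT) |     /* device address 3 */
            USBIF_PIPE_DIR |                   /* Device-to-Host (In) */
            1u;                                /* port number 1 */

        assert(usbif_pipeportnum(pipe) == 1);
        assert(usbif_pipedevice(pipe) == 3);
        assert(usbif_pipeendpoint(pipe) == 2);
        assert(usbif_pipebulk(pipe) && usbif_pipein(pipe));
    }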
/*
* RING for transferring urbs.
@@ -141,6 +239,10 @@ struct usbif_conn_response {
uint16_t id; /* request id */
uint8_t portnum; /* port number */
uint8_t speed; /* usb_device_speed */
+#define USBIF_SPEED_NONE 0
+#define USBIF_SPEED_LOW 1
+#define USBIF_SPEED_FULL 2
+#define USBIF_SPEED_HIGH 3
};
typedef struct usbif_conn_response usbif_conn_response_t;
diff --git a/xen/include/public/mem_event.h b/xen/include/public/mem_event.h
deleted file mode 100644
index 599f9e8..0000000
--- a/xen/include/public/mem_event.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/******************************************************************************
- * mem_event.h
- *
- * Memory event common structures.
- *
- * Copyright (c) 2009 by Citrix Systems, Inc. (Patrick Colp)
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef _XEN_PUBLIC_MEM_EVENT_H
-#define _XEN_PUBLIC_MEM_EVENT_H
-
-#include "xen.h"
-#include "io/ring.h"
-
-/* Memory event flags */
-#define MEM_EVENT_FLAG_VCPU_PAUSED (1 << 0)
-#define MEM_EVENT_FLAG_DROP_PAGE (1 << 1)
-#define MEM_EVENT_FLAG_EVICT_FAIL (1 << 2)
-#define MEM_EVENT_FLAG_FOREIGN (1 << 3)
-#define MEM_EVENT_FLAG_DUMMY (1 << 4)
-/*
- * Emulate the fault-causing instruction (if set in the event response flags).
- * This will allow the guest to continue execution without lifting the page
- * access restrictions.
- */
-#define MEM_EVENT_FLAG_EMULATE (1 << 5)
-/*
- * Same as MEM_EVENT_FLAG_EMULATE, but with write operations or operations
- * potentially having side effects (like memory mapped or port I/O) disabled.
- */
-#define MEM_EVENT_FLAG_EMULATE_NOWRITE (1 << 6)
-
-/* Reasons for the memory event request */
-#define MEM_EVENT_REASON_UNKNOWN 0 /* typical reason */
-#define MEM_EVENT_REASON_VIOLATION 1 /* access violation, GFN is address */
-#define MEM_EVENT_REASON_CR0 2 /* CR0 was hit: gfn is new CR0 value, gla is previous */
-#define MEM_EVENT_REASON_CR3 3 /* CR3 was hit: gfn is new CR3 value, gla is previous */
-#define MEM_EVENT_REASON_CR4 4 /* CR4 was hit: gfn is new CR4 value, gla is previous */
-#define MEM_EVENT_REASON_INT3 5 /* int3 was hit: gla/gfn are RIP */
-#define MEM_EVENT_REASON_SINGLESTEP 6 /* single step was invoked: gla/gfn are RIP */
-#define MEM_EVENT_REASON_MSR 7 /* MSR was hit: gfn is MSR value, gla is MSR address;
- does NOT honour HVMPME_onchangeonly */
-
-/* Using a custom struct (not hvm_hw_cpu) so as to not fill
- * the mem_event ring buffer too quickly. */
-struct mem_event_regs_x86 {
- uint64_t rax;
- uint64_t rcx;
- uint64_t rdx;
- uint64_t rbx;
- uint64_t rsp;
- uint64_t rbp;
- uint64_t rsi;
- uint64_t rdi;
- uint64_t r8;
- uint64_t r9;
- uint64_t r10;
- uint64_t r11;
- uint64_t r12;
- uint64_t r13;
- uint64_t r14;
- uint64_t r15;
- uint64_t rflags;
- uint64_t dr7;
- uint64_t rip;
- uint64_t cr0;
- uint64_t cr2;
- uint64_t cr3;
- uint64_t cr4;
- uint64_t sysenter_cs;
- uint64_t sysenter_esp;
- uint64_t sysenter_eip;
- uint64_t msr_efer;
- uint64_t msr_star;
- uint64_t msr_lstar;
- uint64_t fs_base;
- uint64_t gs_base;
- uint32_t cs_arbytes;
- uint32_t _pad;
-};
-
-typedef struct mem_event_st {
- uint32_t flags;
- uint32_t vcpu_id;
-
- uint64_t gfn;
- uint64_t offset;
- uint64_t gla; /* if gla_valid */
-
- uint32_t p2mt;
-
- uint16_t access_r:1;
- uint16_t access_w:1;
- uint16_t access_x:1;
- uint16_t gla_valid:1;
- uint16_t fault_with_gla:1;
- uint16_t fault_in_gpt:1;
- uint16_t available:10;
-
- uint16_t reason;
- struct mem_event_regs_x86 x86_regs;
-} mem_event_request_t, mem_event_response_t;
-
-DEFINE_RING_TYPES(mem_event, mem_event_request_t, mem_event_response_t);
-
-#endif
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 595f953..320de91 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -28,6 +28,7 @@
#define __XEN_PUBLIC_MEMORY_H__
#include "xen.h"
+#include "physdev.h"
/*
* Increase or decrease the specified domain's memory reservation. Returns the
@@ -55,6 +56,8 @@
/* Flag to request allocation only from the node specified */
#define XENMEMF_exact_node_request (1<<17)
#define XENMEMF_exact_node(n) (XENMEMF_node(n) | XENMEMF_exact_node_request)
+/* Flag to indicate the node specified is virtual node */
+#define XENMEMF_vnode (1<<18)
#endif
struct xen_memory_reservation {
@@ -372,23 +375,23 @@ typedef struct xen_pod_target xen_pod_target_t;
#define XENMEM_paging_op_evict 1
#define XENMEM_paging_op_prep 2
-struct xen_mem_event_op {
- uint8_t op; /* XENMEM_*_op_* */
+struct xen_mem_paging_op {
+ uint8_t op; /* XENMEM_paging_op_* */
domid_t domain;
-
/* PAGING_PREP IN: buffer to immediately fill page in */
uint64_aligned_t buffer;
/* Other OPs */
uint64_aligned_t gfn; /* IN: gfn of page being operated on */
};
-typedef struct xen_mem_event_op xen_mem_event_op_t;
-DEFINE_XEN_GUEST_HANDLE(xen_mem_event_op_t);
+typedef struct xen_mem_paging_op xen_mem_paging_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mem_paging_op_t);
#define XENMEM_access_op 21
-#define XENMEM_access_op_resume 0
-#define XENMEM_access_op_set_access 1
-#define XENMEM_access_op_get_access 2
+#define XENMEM_access_op_set_access 0
+#define XENMEM_access_op_get_access 1
+#define XENMEM_access_op_enable_emulate 2
+#define XENMEM_access_op_disable_emulate 3
typedef enum {
XENMEM_access_n,
@@ -439,12 +442,11 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t);
#define XENMEM_sharing_op_nominate_gfn 0
#define XENMEM_sharing_op_nominate_gref 1
#define XENMEM_sharing_op_share 2
-#define XENMEM_sharing_op_resume 3
-#define XENMEM_sharing_op_debug_gfn 4
-#define XENMEM_sharing_op_debug_mfn 5
-#define XENMEM_sharing_op_debug_gref 6
-#define XENMEM_sharing_op_add_physmap 7
-#define XENMEM_sharing_op_audit 8
+#define XENMEM_sharing_op_debug_gfn 3
+#define XENMEM_sharing_op_debug_mfn 4
+#define XENMEM_sharing_op_debug_gref 5
+#define XENMEM_sharing_op_add_physmap 6
+#define XENMEM_sharing_op_audit 7
#define XENMEM_SHARING_OP_S_HANDLE_INVALID (-10)
#define XENMEM_SHARING_OP_C_HANDLE_INVALID (-9)
@@ -521,6 +523,40 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
 * The zero value is appropriate.
*/
+/*
+ * With some legacy devices, certain guest-physical addresses cannot safely
+ * be used for other purposes, e.g. to map guest RAM. This hypercall
+ * enumerates those regions so the toolstack can avoid using them.
+ */
+#define XENMEM_reserved_device_memory_map 27
+struct xen_reserved_device_memory {
+ xen_pfn_t start_pfn;
+ xen_ulong_t nr_pages;
+};
+typedef struct xen_reserved_device_memory xen_reserved_device_memory_t;
+DEFINE_XEN_GUEST_HANDLE(xen_reserved_device_memory_t);
+
+struct xen_reserved_device_memory_map {
+#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
+ /* IN */
+ uint32_t flags;
+ /*
+ * IN/OUT
+ *
+ * Gets set to the required number of entries when too low,
+ * signaled by error code -ERANGE.
+ */
+ unsigned int nr_entries;
+ /* OUT */
+ XEN_GUEST_HANDLE(xen_reserved_device_memory_t) buffer;
+ /* IN */
+ union {
+ struct physdev_pci_device pci;
+ } dev;
+};
+typedef struct xen_reserved_device_memory_map xen_reserved_device_memory_map_t;
+DEFINE_XEN_GUEST_HANDLE(xen_reserved_device_memory_map_t);
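The intended calling pattern mirrors other enumeration hypercalls: call once with too small a buffer, get -ERANGE plus the required count back in nr_entries, then retry. A hedged sketch (memory_op() and set_xen_guest_handle() are stand-ins for toolstack plumbing):

    /* Illustrative only; the wrappers below are hypothetical. */
    #include <errno.h>
    #include <stdlib.h>

    extern int memory_op(unsigned int cmd, void *arg);

    static int get_rdm_regions(struct xen_reserved_device_memory **out)
    {
        struct xen_reserved_device_memory_map map = {
            .flags = XENMEM_RDM_ALL,  /* all regions; the dev union is ignored */
            .nr_entries = 0,
        };
        int rc = memory_op(XENMEM_reserved_device_memory_map, &map);

        if ( rc == -ERANGE )          /* map.nr_entries now holds the size */
        {
            *out = calloc(map.nr_entries, sizeof(**out));
            set_xen_guest_handle(map.buffer, *out);   /* assumed helper */
            rc = memory_op(XENMEM_reserved_device_memory_map, &map);
        }
        return rc;
    }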
+
#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
/*
@@ -572,7 +608,7 @@ struct xen_vnuma_topology_info {
typedef struct xen_vnuma_topology_info xen_vnuma_topology_info_t;
DEFINE_XEN_GUEST_HANDLE(xen_vnuma_topology_info_t);
-/* Next available subop number is 27 */
+/* Next available subop number is 28 */
#endif /* __XEN_PUBLIC_MEMORY_H__ */
diff --git a/xen/include/public/physdev.h b/xen/include/public/physdev.h
index d547928..0e54635 100644
--- a/xen/include/public/physdev.h
+++ b/xen/include/public/physdev.h
@@ -16,6 +16,8 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2006, Keir Fraser
*/
#ifndef __XEN_PUBLIC_PHYSDEV_H__
@@ -293,6 +295,11 @@ struct physdev_pci_device_add {
uint8_t bus;
uint8_t devfn;
} physfn;
+ /*
+ * Optional parameters array.
+ * First element ([0]) is PXM domain associated with the device (if
+ * XEN_PCI_DEV_PXM is set)
+ */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
uint32_t optarr[];
#elif defined(__GNUC__)
@@ -344,9 +351,11 @@ DEFINE_XEN_GUEST_HANDLE(physdev_dbgp_op_t);
*/
#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4
+#if __XEN_INTERFACE_VERSION__ < 0x00040600
/*
 * These all-capitals physdev operation names are superseded by the new names
- * (defined above) since interface version 0x00030202.
+ * (defined above) since interface version 0x00030202. The guard above was
+ * added post-4.5 only though and hence shouldn't check for 0x00030202.
*/
#define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query
#define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl
@@ -357,6 +366,7 @@ DEFINE_XEN_GUEST_HANDLE(physdev_dbgp_op_t);
#define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector
#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
#define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared
+#endif
#if __XEN_INTERFACE_VERSION__ < 0x00040200
#define PHYSDEVOP_pirq_eoi_gmfn PHYSDEVOP_pirq_eoi_gmfn_v1
diff --git a/xen/include/public/platform.h b/xen/include/public/platform.h
index 5c57615..1e6a6ce 100644
--- a/xen/include/public/platform.h
+++ b/xen/include/public/platform.h
@@ -35,13 +35,28 @@
* Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
* 1 January, 1970 if the current system time was <system_time>.
*/
-#define XENPF_settime 17
-struct xenpf_settime {
+#define XENPF_settime32 17
+struct xenpf_settime32 {
/* IN variables. */
uint32_t secs;
uint32_t nsecs;
uint64_t system_time;
};
+#define XENPF_settime64 62
+struct xenpf_settime64 {
+ /* IN variables. */
+ uint64_t secs;
+ uint32_t nsecs;
+ uint32_t mbz;
+ uint64_t system_time;
+};
+#if __XEN_INTERFACE_VERSION__ < 0x00040600
+#define XENPF_settime XENPF_settime32
+#define xenpf_settime xenpf_settime32
+#else
+#define XENPF_settime XENPF_settime64
+#define xenpf_settime xenpf_settime64
+#endif
typedef struct xenpf_settime xenpf_settime_t;
DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t);
@@ -126,6 +141,26 @@ DEFINE_XEN_GUEST_HANDLE(xenpf_platform_quirk_t);
#define XEN_EFI_query_variable_info 9
#define XEN_EFI_query_capsule_capabilities 10
#define XEN_EFI_update_capsule 11
+
+struct xenpf_efi_time {
+ uint16_t year;
+ uint8_t month;
+ uint8_t day;
+ uint8_t hour;
+ uint8_t min;
+ uint8_t sec;
+ uint32_t ns;
+ int16_t tz;
+ uint8_t daylight;
+};
+
+struct xenpf_efi_guid {
+ uint32_t data1;
+ uint16_t data2;
+ uint16_t data3;
+ uint8_t data4[8];
+};
+
struct xenpf_efi_runtime_call {
uint32_t function;
/*
@@ -138,17 +173,7 @@ struct xenpf_efi_runtime_call {
union {
#define XEN_EFI_GET_TIME_SET_CLEARS_NS 0x00000001
struct {
- struct xenpf_efi_time {
- uint16_t year;
- uint8_t month;
- uint8_t day;
- uint8_t hour;
- uint8_t min;
- uint8_t sec;
- uint32_t ns;
- int16_t tz;
- uint8_t daylight;
- } time;
+ struct xenpf_efi_time time;
uint32_t resolution;
uint32_t accuracy;
} get_time;
@@ -170,12 +195,7 @@ struct xenpf_efi_runtime_call {
XEN_GUEST_HANDLE(void) name; /* UCS-2/UTF-16 string */
xen_ulong_t size;
XEN_GUEST_HANDLE(void) data;
- struct xenpf_efi_guid {
- uint32_t data1;
- uint16_t data2;
- uint16_t data3;
- uint8_t data4[8];
- } vendor_guid;
+ struct xenpf_efi_guid vendor_guid;
} get_variable, set_variable;
struct {
@@ -540,6 +560,16 @@ DEFINE_XEN_GUEST_HANDLE(xenpf_core_parking_t);
#define XEN_RESOURCE_OP_MSR_READ 0
#define XEN_RESOURCE_OP_MSR_WRITE 1
+/*
+ * Specially handled MSRs:
+ * - MSR_IA32_TSC
+ * READ: Returns the scaled system time (ns) instead of the raw timestamp. In
+ * the multiple-entry case, if another MSR read is followed by an
+ * MSR_IA32_TSC read, both reads are guaranteed to be performed atomically
+ * (with IRQs disabled). The returned time indicates the point at which
+ * that MSR was read.
+ * WRITE: Not supported.
+ */
+
struct xenpf_resource_entry {
union {
uint32_t cmd; /* IN: XEN_RESOURCE_OP_* */
@@ -560,6 +590,24 @@ struct xenpf_resource_op {
typedef struct xenpf_resource_op xenpf_resource_op_t;
DEFINE_XEN_GUEST_HANDLE(xenpf_resource_op_t);
+#define XENPF_get_symbol 63
+struct xenpf_symdata {
+ /* IN/OUT variables */
+ uint32_t namelen; /* IN: size of name buffer */
+ /* OUT: strlen(name) of hypervisor symbol (may be */
+ /* larger than what's been copied to guest) */
+ uint32_t symnum; /* IN: Symbol to read */
+ /* OUT: Next available symbol. If same as IN then */
+ /* we reached the end */
+
+ /* OUT variables */
+ XEN_GUEST_HANDLE(char) name;
+ uint64_t address;
+ char type;
+};
+typedef struct xenpf_symdata xenpf_symdata_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_symdata_t);
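XENPF_get_symbol is meant to be iterated: feed the OUT symnum back in and stop when it no longer advances. A hedged sketch (platform_op() and set_xen_guest_handle() are assumed wrappers; note the OUT namelen may exceed what fits in the buffer, hence the bounded print):

    /* Illustrative only; platform_op() and set_xen_guest_handle() assumed. */
    #include <inttypes.h>
    #include <stdio.h>

    extern int platform_op(struct xen_platform_op *op);

    static void dump_symbols(void)
    {
        char name[128];
        struct xen_platform_op op = {
            .cmd = XENPF_get_symbol,
            .interface_version = XENPF_INTERFACE_VERSION,  /* assumed macro */
        };

        op.u.symdata.symnum = 0;
        for ( ;; )
        {
            uint32_t prev = op.u.symdata.symnum;

            op.u.symdata.namelen = sizeof(name);
            set_xen_guest_handle(op.u.symdata.name, name);
            if ( platform_op(&op) < 0 )
                break;
            printf("%c %016" PRIx64 " %.*s\n", op.u.symdata.type,
                   op.u.symdata.address, (int)sizeof(name), name);
            if ( op.u.symdata.symnum == prev )   /* end of symbol table */
                break;
        }
    }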
+
/*
* ` enum neg_errnoval
* ` HYPERVISOR_platform_op(const struct xen_platform_op*);
@@ -569,6 +617,8 @@ struct xen_platform_op {
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
union {
struct xenpf_settime settime;
+ struct xenpf_settime32 settime32;
+ struct xenpf_settime64 settime64;
struct xenpf_add_memtype add_memtype;
struct xenpf_del_memtype del_memtype;
struct xenpf_read_memtype read_memtype;
@@ -587,6 +637,7 @@ struct xen_platform_op {
struct xenpf_mem_hotadd mem_add;
struct xenpf_core_parking core_parking;
struct xenpf_resource_op resource_op;
+ struct xenpf_symdata symdata;
uint8_t pad[128];
} u;
};
diff --git a/xen/include/public/pmu.h b/xen/include/public/pmu.h
new file mode 100644
index 0000000..7753df0
--- /dev/null
+++ b/xen/include/public/pmu.h
@@ -0,0 +1,133 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2015 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef __XEN_PUBLIC_PMU_H__
+#define __XEN_PUBLIC_PMU_H__
+
+#include "xen.h"
+#if defined(__i386__) || defined(__x86_64__)
+#include "arch-x86/pmu.h"
+#elif defined (__arm__) || defined (__aarch64__)
+#include "arch-arm.h"
+#else
+#error "Unsupported architecture"
+#endif
+
+#define XENPMU_VER_MAJ 0
+#define XENPMU_VER_MIN 1
+
+/*
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_xenpmu_op(enum xenpmu_op cmd, struct xenpmu_params *args);
+ *
+ * @cmd == XENPMU_* (PMU operation)
+ * @args == struct xenpmu_params
+ */
+/* ` enum xenpmu_op { */
+#define XENPMU_mode_get 0 /* Also used for getting PMU version */
+#define XENPMU_mode_set 1
+#define XENPMU_feature_get 2
+#define XENPMU_feature_set 3
+#define XENPMU_init 4
+#define XENPMU_finish 5
+#define XENPMU_lvtpc_set 6
+#define XENPMU_flush 7 /* Write cached MSR values to HW */
+/* ` } */
+
+/* Parameters structure for HYPERVISOR_xenpmu_op call */
+struct xen_pmu_params {
+ /* IN/OUT parameters */
+ struct {
+ uint32_t maj;
+ uint32_t min;
+ } version;
+ uint64_t val;
+
+ /* IN parameters */
+ uint32_t vcpu;
+ uint32_t pad;
+};
+typedef struct xen_pmu_params xen_pmu_params_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_params_t);
+
+/* PMU modes:
+ * - XENPMU_MODE_OFF: No PMU virtualization
+ * - XENPMU_MODE_SELF: Guests can profile themselves
+ * - XENPMU_MODE_HV: Guests can profile themselves, dom0 profiles
+ * itself and Xen
+ * - XENPMU_MODE_ALL: Only dom0 has access to VPMU and it profiles
+ * everyone: itself, the hypervisor and the guests.
+ */
+#define XENPMU_MODE_OFF 0
+#define XENPMU_MODE_SELF (1<<0)
+#define XENPMU_MODE_HV (1<<1)
+#define XENPMU_MODE_ALL (1<<2)
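A hedged sketch of the call convention (the xenpmu_op() wrapper is hypothetical): selecting self-profiling mode goes through the generic params structure, with 'val' carrying the mode.

    /* Illustrative only; xenpmu_op() is a hypothetical hypercall wrapper. */
    #include <string.h>

    extern int xenpmu_op(unsigned int cmd, struct xen_pmu_params *p);

    static int pmu_enable_self(void)
    {
        struct xen_pmu_params p;

        memset(&p, 0, sizeof(p));
        p.version.maj = XENPMU_VER_MAJ;
        p.version.min = XENPMU_VER_MIN;
        p.val = XENPMU_MODE_SELF;     /* guests may profile themselves */

        return xenpmu_op(XENPMU_mode_set, &p);
    }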
+
+/*
+ * PMU features:
+ * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD)
+ */
+#define XENPMU_FEATURE_INTEL_BTS 1
+
+/*
+ * Shared PMU data between hypervisor and PV(H) domains.
+ *
+ * The hypervisor fills out this structure during PMU interrupt and sends an
+ * interrupt to appropriate VCPU.
+ * Architecture-independent fields of xen_pmu_data are WO for the hypervisor
+ * and RO for the guest but some fields in xen_pmu_arch can be writable
+ * by both the hypervisor and the guest (see arch-$arch/pmu.h).
+ */
+struct xen_pmu_data {
+ /* Interrupted VCPU */
+ uint32_t vcpu_id;
+
+ /*
+ * Physical processor on which the interrupt occurred. On non-privileged
+ * guests this is set to vcpu_id.
+ */
+ uint32_t pcpu_id;
+
+ /*
+ * Domain that was interrupted. On non-privileged guests set to DOMID_SELF.
+ * On privileged guests it can be DOMID_SELF, DOMID_XEN, or, when in
+ * XENPMU_MODE_ALL mode, the domain ID of another domain.
+ */
+ domid_t domain_id;
+
+ uint8_t pad[6];
+
+ /* Architecture-specific information */
+ struct xen_pmu_arch pmu;
+};
+
+#endif /* __XEN_PUBLIC_PMU_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
index b3713b3..0cacacc 100644
--- a/xen/include/public/sysctl.h
+++ b/xen/include/public/sysctl.h
@@ -33,8 +33,10 @@
#include "xen.h"
#include "domctl.h"
+#include "physdev.h"
+#include "tmem.h"
-#define XEN_SYSCTL_INTERFACE_VERSION 0x0000000B
+#define XEN_SYSCTL_INTERFACE_VERSION 0x0000000C
/*
* Read console content from Xen buffer ring.
@@ -462,61 +464,76 @@ struct xen_sysctl_lockprof_op {
typedef struct xen_sysctl_lockprof_op xen_sysctl_lockprof_op_t;
DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_op_t);
-/* XEN_SYSCTL_topologyinfo */
-#define INVALID_TOPOLOGY_ID (~0U)
-struct xen_sysctl_topologyinfo {
- /*
- * IN: maximum addressable entry in the caller-provided arrays.
- * OUT: largest cpu identifier in the system.
- * If OUT is greater than IN then the arrays are truncated!
- * If OUT is leass than IN then the array tails are not written by sysctl.
- */
- uint32_t max_cpu_index;
+/* XEN_SYSCTL_cputopoinfo */
+#define XEN_INVALID_CORE_ID (~0U)
+#define XEN_INVALID_SOCKET_ID (~0U)
+#define XEN_INVALID_NODE_ID (~0U)
- /*
- * If not NULL, these arrays are filled with core/socket/node identifier
- * for each cpu.
- * If a cpu has no core/socket/node information (e.g., cpu not present)
- * then the sentinel value ~0u is written to each array.
- * The number of array elements written by the sysctl is:
- * min(@max_cpu_index_IN, at max_cpu_index_OUT)+1
- */
- XEN_GUEST_HANDLE_64(uint32) cpu_to_core;
- XEN_GUEST_HANDLE_64(uint32) cpu_to_socket;
- XEN_GUEST_HANDLE_64(uint32) cpu_to_node;
+struct xen_sysctl_cputopo {
+ uint32_t core;
+ uint32_t socket;
+ uint32_t node;
+};
+typedef struct xen_sysctl_cputopo xen_sysctl_cputopo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cputopo_t);
+
+/*
+ * IN:
+ * - a NULL 'cputopo' handle is a request for the maximum 'num_cpus'.
+ * - otherwise it's the number of entries in 'cputopo'
+ *
+ * OUT:
+ * - If 'num_cpus' is less than the number Xen wants to write but the
+ * handle is not a NULL one, partial data gets returned and 'num_cpus'
+ * gets updated to reflect the intended number.
+ * - Otherwise, 'num_cpus' shall indicate the number of entries written, which
+ * may be less than the input value.
+ */
+struct xen_sysctl_cputopoinfo {
+ uint32_t num_cpus;
+ XEN_GUEST_HANDLE_64(xen_sysctl_cputopo_t) cputopo;
};
-typedef struct xen_sysctl_topologyinfo xen_sysctl_topologyinfo_t;
-DEFINE_XEN_GUEST_HANDLE(xen_sysctl_topologyinfo_t);
+typedef struct xen_sysctl_cputopoinfo xen_sysctl_cputopoinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cputopoinfo_t);
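Concretely, the size negotiation described above is a two-call sequence; a minimal sketch (do_sysctl() and set_xen_guest_handle() are assumed toolstack plumbing):

    /* Illustrative only; do_sysctl() and set_xen_guest_handle() assumed. */
    #include <stdlib.h>

    extern int do_sysctl(struct xen_sysctl *sc);

    static struct xen_sysctl_cputopo *fetch_cputopo(uint32_t *n)
    {
        struct xen_sysctl sc = {
            .cmd = XEN_SYSCTL_cputopoinfo,
            .interface_version = XEN_SYSCTL_INTERFACE_VERSION,
        };
        struct xen_sysctl_cputopo *topo;

        set_xen_guest_handle(sc.u.cputopoinfo.cputopo, NULL); /* 1st call: size */
        if ( do_sysctl(&sc) )
            return NULL;

        topo = calloc(sc.u.cputopoinfo.num_cpus, sizeof(*topo));
        set_xen_guest_handle(sc.u.cputopoinfo.cputopo, topo); /* 2nd call: data */
        if ( do_sysctl(&sc) )
        {
            free(topo);
            return NULL;
        }
        *n = sc.u.cputopoinfo.num_cpus;
        return topo;
    }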
/* XEN_SYSCTL_numainfo */
-#define INVALID_NUMAINFO_ID (~0U)
+#define XEN_INVALID_MEM_SZ (~0U)
+#define XEN_INVALID_NODE_DIST (~0U)
+
+struct xen_sysctl_meminfo {
+ uint64_t memsize;
+ uint64_t memfree;
+};
+typedef struct xen_sysctl_meminfo xen_sysctl_meminfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_meminfo_t);
+
+/*
+ * IN:
+ * - Both 'meminfo' and 'distance' handles being null is a request
+ * for the maximum value of 'num_nodes'.
+ * - Otherwise it's the number of entries in 'meminfo' and the square
+ * root of the number of entries in 'distance' (when the corresponding
+ * handle is non-null).
+ *
+ * OUT:
+ * - If 'num_nodes' is less than the number Xen wants to write but either
+ * handle is not a NULL one, partial data gets returned and 'num_nodes'
+ * gets updated to reflect the intended number.
+ * - Otherwise, 'num_nodes' shall indicate the number of entries written, which
+ * may be less than the input value.
+ */
+
struct xen_sysctl_numainfo {
- /*
- * IN: maximum addressable entry in the caller-provided arrays.
- * OUT: largest node identifier in the system.
- * If OUT is greater than IN then the arrays are truncated!
- */
- uint32_t max_node_index;
+ uint32_t num_nodes;
- /* NB. Entries are 0 if node is not present. */
- XEN_GUEST_HANDLE_64(uint64) node_to_memsize;
- XEN_GUEST_HANDLE_64(uint64) node_to_memfree;
+ XEN_GUEST_HANDLE_64(xen_sysctl_meminfo_t) meminfo;
/*
- * Array, of size (max_node_index+1)^2, listing memory access distances
- * between nodes. If an entry has no node distance information (e.g., node
- * not present) then the value ~0u is written.
- *
- * Note that the array rows must be indexed by multiplying by the minimum
- * of the caller-provided max_node_index and the returned value of
- * max_node_index. That is, if the largest node index in the system is
- * smaller than the caller can handle, a smaller 2-d array is constructed
- * within the space provided by the caller. When this occurs, trailing
- * space provided by the caller is not modified. If the largest node index
- * in the system is larger than the caller can handle, then a 2-d array of
- * the maximum size handleable by the caller is constructed.
+ * Distance between nodes 'i' and 'j' is stored in index 'i*N + j',
+ * where N is the number of nodes that will be returned in 'num_nodes'
+ * (i.e. not 'num_nodes' provided by the caller)
*/
- XEN_GUEST_HANDLE_64(uint32) node_to_node_distance;
+ XEN_GUEST_HANDLE_64(uint32) distance;
};
typedef struct xen_sysctl_numainfo xen_sysctl_numainfo_t;
DEFINE_XEN_GUEST_HANDLE(xen_sysctl_numainfo_t);
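Given the row-major layout just described, a distance lookup after a successful call reduces to one line; 'dist' below is the toolstack array backing the 'distance' handle and 'n' the returned num_nodes (illustrative only):

    /* Illustrative only: d(i,j) lookup after XEN_SYSCTL_numainfo. */
    static uint32_t node_distance(const uint32_t *dist, uint32_t n,
                                  uint32_t i, uint32_t j)
    {
        return (i < n && j < n) ? dist[i * n + j] : XEN_INVALID_NODE_DIST;
    }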
@@ -641,6 +658,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_sysctl_coverage_op_t);
/* The L3 cache size is returned in KB unit */
#define XEN_SYSCTL_PSR_CMT_get_l3_cache_size 2
#define XEN_SYSCTL_PSR_CMT_enabled 3
+#define XEN_SYSCTL_PSR_CMT_get_l3_event_mask 4
struct xen_sysctl_psr_cmt_op {
uint32_t cmd; /* IN: XEN_SYSCTL_PSR_CMT_* */
uint32_t flags; /* padding variable, may be extended for future use */
@@ -655,6 +673,97 @@ struct xen_sysctl_psr_cmt_op {
typedef struct xen_sysctl_psr_cmt_op xen_sysctl_psr_cmt_op_t;
DEFINE_XEN_GUEST_HANDLE(xen_sysctl_psr_cmt_op_t);
+/* XEN_SYSCTL_pcitopoinfo */
+#define XEN_INVALID_DEV (XEN_INVALID_NODE_ID - 1)
+struct xen_sysctl_pcitopoinfo {
+ /*
+ * IN: Number of elements in 'devs' and 'nodes' arrays.
+ * OUT: Number of processed elements of those arrays.
+ */
+ uint32_t num_devs;
+
+ /* IN: list of devices for which node IDs are requested. */
+ XEN_GUEST_HANDLE_64(physdev_pci_device_t) devs;
+
+ /*
+ * OUT: node identifier for each device.
+ * If information for a particular device is not available then
+ * corresponding entry will be set to XEN_INVALID_NODE_ID. If
+ * device is not known to the hypervisor then XEN_INVALID_DEV
+ * will be provided.
+ */
+ XEN_GUEST_HANDLE_64(uint32) nodes;
+};
+typedef struct xen_sysctl_pcitopoinfo xen_sysctl_pcitopoinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_pcitopoinfo_t);
+
+#define XEN_SYSCTL_PSR_CAT_get_l3_info 0
+struct xen_sysctl_psr_cat_op {
+ uint32_t cmd; /* IN: XEN_SYSCTL_PSR_CAT_* */
+ uint32_t target; /* IN */
+ union {
+ struct {
+ uint32_t cbm_len; /* OUT: CBM length */
+ uint32_t cos_max; /* OUT: Maximum COS */
+ } l3_info;
+ } u;
+};
+typedef struct xen_sysctl_psr_cat_op xen_sysctl_psr_cat_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_psr_cat_op_t);
+
+#define XEN_SYSCTL_TMEM_OP_ALL_CLIENTS 0xFFFFU
+
+#define XEN_SYSCTL_TMEM_OP_THAW 0
+#define XEN_SYSCTL_TMEM_OP_FREEZE 1
+#define XEN_SYSCTL_TMEM_OP_FLUSH 2
+#define XEN_SYSCTL_TMEM_OP_DESTROY 3
+#define XEN_SYSCTL_TMEM_OP_LIST 4
+#define XEN_SYSCTL_TMEM_OP_SET_WEIGHT 5
+#define XEN_SYSCTL_TMEM_OP_SET_CAP 6
+#define XEN_SYSCTL_TMEM_OP_SET_COMPRESS 7
+#define XEN_SYSCTL_TMEM_OP_QUERY_FREEABLE_MB 8
+#define XEN_SYSCTL_TMEM_OP_SAVE_BEGIN 10
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_VERSION 11
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_MAXPOOLS 12
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_WEIGHT 13
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_CAP 14
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_FLAGS 15
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_FLAGS 16
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_NPAGES 17
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_UUID 18
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_PAGE 19
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_INV 20
+#define XEN_SYSCTL_TMEM_OP_SAVE_END 21
+#define XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN 30
+#define XEN_SYSCTL_TMEM_OP_RESTORE_PUT_PAGE 32
+#define XEN_SYSCTL_TMEM_OP_RESTORE_FLUSH_PAGE 33
+
+/*
+ * XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_[PAGE|INV] override the 'buf' in
+ * xen_sysctl_tmem_op with this structure - sometimes with an extra
+ * page tacked on.
+ */
+struct tmem_handle {
+ uint32_t pool_id;
+ uint32_t index;
+ xen_tmem_oid_t oid;
+};
+
+struct xen_sysctl_tmem_op {
+ uint32_t cmd; /* IN: XEN_SYSCTL_TMEM_OP_*. */
+ int32_t pool_id; /* IN: 0 by default unless _SAVE_* or _RESTORE_*. */
+ uint32_t cli_id; /* IN: client id; 0 for XEN_SYSCTL_TMEM_OP_QUERY_FREEABLE_MB;
+ for all other commands, a domain id or
+ XEN_SYSCTL_TMEM_OP_ALL_CLIENTS for all clients. */
+ uint32_t arg1; /* IN: If not applicable to command use 0. */
+ uint32_t arg2; /* IN: If not applicable to command use 0. */
+ uint32_t pad; /* Padding so the structure layout is identical for 32- and 64-bit callers. */
+ xen_tmem_oid_t oid; /* IN: If not applicable to command use 0s. */
+ XEN_GUEST_HANDLE_64(char) buf; /* IN/OUT: Buffer to save and restore ops. */
+};
+typedef struct xen_sysctl_tmem_op xen_sysctl_tmem_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tmem_op_t);
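
For instance, freezing every tmem client could look roughly like this
(sketch only; fields not relevant to FREEZE stay zeroed, as the comments
above require):

    struct xen_sysctl sysctl = {
        .cmd = XEN_SYSCTL_tmem_op,
        .interface_version = XEN_SYSCTL_INTERFACE_VERSION,
    };

    sysctl.u.tmem_op.cmd = XEN_SYSCTL_TMEM_OP_FREEZE;
    sysctl.u.tmem_op.cli_id = XEN_SYSCTL_TMEM_OP_ALL_CLIENTS;
    /* pool_id, arg1, arg2, oid and buf are unused by FREEZE. */
    /* ... issue the sysctl ... */
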
+
struct xen_sysctl {
uint32_t cmd;
#define XEN_SYSCTL_readconsole 1
@@ -671,18 +780,22 @@ struct xen_sysctl {
#define XEN_SYSCTL_pm_op 12
#define XEN_SYSCTL_page_offline_op 14
#define XEN_SYSCTL_lockprof_op 15
-#define XEN_SYSCTL_topologyinfo 16
+#define XEN_SYSCTL_cputopoinfo 16
#define XEN_SYSCTL_numainfo 17
#define XEN_SYSCTL_cpupool_op 18
#define XEN_SYSCTL_scheduler_op 19
#define XEN_SYSCTL_coverage_op 20
#define XEN_SYSCTL_psr_cmt_op 21
+#define XEN_SYSCTL_pcitopoinfo 22
+#define XEN_SYSCTL_psr_cat_op 23
+#define XEN_SYSCTL_tmem_op 24
uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
union {
struct xen_sysctl_readconsole readconsole;
struct xen_sysctl_tbuf_op tbuf_op;
struct xen_sysctl_physinfo physinfo;
- struct xen_sysctl_topologyinfo topologyinfo;
+ struct xen_sysctl_cputopoinfo cputopoinfo;
+ struct xen_sysctl_pcitopoinfo pcitopoinfo;
struct xen_sysctl_numainfo numainfo;
struct xen_sysctl_sched_id sched_id;
struct xen_sysctl_perfc_op perfc_op;
@@ -699,6 +812,8 @@ struct xen_sysctl {
struct xen_sysctl_scheduler_op scheduler_op;
struct xen_sysctl_coverage_op coverage_op;
struct xen_sysctl_psr_cmt_op psr_cmt_op;
+ struct xen_sysctl_psr_cat_op psr_cat_op;
+ struct xen_sysctl_tmem_op tmem_op;
uint8_t pad[128];
} u;
};
diff --git a/xen/include/public/tmem.h b/xen/include/public/tmem.h
index 4fd2fc6..2d805fb 100644
--- a/xen/include/public/tmem.h
+++ b/xen/include/public/tmem.h
@@ -33,7 +33,11 @@
#define TMEM_SPEC_VERSION 1
/* Commands to HYPERVISOR_tmem_op() */
-#define TMEM_CONTROL 0
+#ifdef __XEN__
+#define TMEM_CONTROL 0 /* Now called XEN_SYSCTL_tmem_op */
+#else
+#undef TMEM_CONTROL
+#endif
#define TMEM_NEW_POOL 1
#define TMEM_DESTROY_POOL 2
#define TMEM_PUT_PAGE 4
@@ -48,35 +52,9 @@
#endif
/* Privileged commands to HYPERVISOR_tmem_op() */
-#define TMEM_AUTH 101
+#define TMEM_AUTH 101
#define TMEM_RESTORE_NEW 102
-/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */
-#define TMEMC_THAW 0
-#define TMEMC_FREEZE 1
-#define TMEMC_FLUSH 2
-#define TMEMC_DESTROY 3
-#define TMEMC_LIST 4
-#define TMEMC_SET_WEIGHT 5
-#define TMEMC_SET_CAP 6
-#define TMEMC_SET_COMPRESS 7
-#define TMEMC_QUERY_FREEABLE_MB 8
-#define TMEMC_SAVE_BEGIN 10
-#define TMEMC_SAVE_GET_VERSION 11
-#define TMEMC_SAVE_GET_MAXPOOLS 12
-#define TMEMC_SAVE_GET_CLIENT_WEIGHT 13
-#define TMEMC_SAVE_GET_CLIENT_CAP 14
-#define TMEMC_SAVE_GET_CLIENT_FLAGS 15
-#define TMEMC_SAVE_GET_POOL_FLAGS 16
-#define TMEMC_SAVE_GET_POOL_NPAGES 17
-#define TMEMC_SAVE_GET_POOL_UUID 18
-#define TMEMC_SAVE_GET_NEXT_PAGE 19
-#define TMEMC_SAVE_GET_NEXT_INV 20
-#define TMEMC_SAVE_END 21
-#define TMEMC_RESTORE_BEGIN 30
-#define TMEMC_RESTORE_PUT_PAGE 32
-#define TMEMC_RESTORE_FLUSH_PAGE 33
-
/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
#define TMEM_POOL_PERSIST 1
#define TMEM_POOL_SHARED 2
@@ -95,6 +73,11 @@
#define EFROZEN 1000
#define EEMPTY 1001
+struct xen_tmem_oid {
+ uint64_t oid[3];
+};
+typedef struct xen_tmem_oid xen_tmem_oid_t;
+DEFINE_XEN_GUEST_HANDLE(xen_tmem_oid_t);
#ifndef __ASSEMBLY__
#if __XEN_INTERFACE_VERSION__ < 0x00040400
@@ -110,17 +93,12 @@ struct tmem_op {
uint32_t flags;
uint32_t arg1;
} creat; /* for cmd == TMEM_NEW_POOL, TMEM_AUTH, TMEM_RESTORE_NEW */
- struct {
- uint32_t subop;
- uint32_t cli_id;
- uint32_t arg1;
- uint32_t arg2;
- uint64_t oid[3];
- tmem_cli_va_t buf;
- } ctrl; /* for cmd == TMEM_CONTROL */
struct {
-
+#if __XEN_INTERFACE_VERSION__ < 0x00040600
uint64_t oid[3];
+#else
+ xen_tmem_oid_t oid;
+#endif
uint32_t index;
uint32_t tmem_offset;
uint32_t pfn_offset;
@@ -131,12 +109,6 @@ struct tmem_op {
};
typedef struct tmem_op tmem_op_t;
DEFINE_XEN_GUEST_HANDLE(tmem_op_t);
-
-struct tmem_handle {
- uint32_t pool_id;
- uint32_t index;
- uint64_t oid[3];
-};
#endif
#endif /* __XEN_PUBLIC_TMEM_H__ */
diff --git a/xen/include/public/trace.h b/xen/include/public/trace.h
index 5211ae7..274f8f6 100644
--- a/xen/include/public/trace.h
+++ b/xen/include/public/trace.h
@@ -75,7 +75,7 @@
/* Per-scheduler IDs, to identify scheduler specific events */
#define TRC_SCHED_CSCHED 0
#define TRC_SCHED_CSCHED2 1
-#define TRC_SCHED_SEDF 2
+/* #define TRC_SCHED_SEDF 2 (Removed) */
#define TRC_SCHED_ARINC653 3
#define TRC_SCHED_RTDS 4
diff --git a/xen/include/public/vcpu.h b/xen/include/public/vcpu.h
index e888daf..898b89f 100644
--- a/xen/include/public/vcpu.h
+++ b/xen/include/public/vcpu.h
@@ -31,7 +31,7 @@
/*
* Prototype for this hypercall is:
- * int vcpu_op(int cmd, int vcpuid, void *extra_args)
+ * long vcpu_op(int cmd, unsigned int vcpuid, void *extra_args)
* @cmd == VCPUOP_??? (VCPU operation).
* @vcpuid == VCPU to operate on.
* @extra_args == Operation-specific extra arguments (NULL if none).
diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
new file mode 100644
index 0000000..ff2f217
--- /dev/null
+++ b/xen/include/public/vm_event.h
@@ -0,0 +1,269 @@
+/******************************************************************************
+ * vm_event.h
+ *
+ * Memory event common structures.
+ *
+ * Copyright (c) 2009 by Citrix Systems, Inc. (Patrick Colp)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _XEN_PUBLIC_VM_EVENT_H
+#define _XEN_PUBLIC_VM_EVENT_H
+
+#include "xen.h"
+
+#define VM_EVENT_INTERFACE_VERSION 0x00000001
+
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+
+#include "io/ring.h"
+
+/*
+ * Memory event flags
+ */
+
+/*
+ * VCPU_PAUSED in a request signals that the vCPU triggering the event has been
+ * paused
+ * VCPU_PAUSED in a response signals to unpause the vCPU
+ */
+#define VM_EVENT_FLAG_VCPU_PAUSED (1 << 0)
+/* Flags to aid debugging vm_event */
+#define VM_EVENT_FLAG_FOREIGN (1 << 1)
+/*
+ * The following flags can be set in response to a mem_access event.
+ *
+ * Emulate the fault-causing instruction (if set in the event response flags).
+ * This will allow the guest to continue execution without lifting the page
+ * access restrictions.
+ */
+#define VM_EVENT_FLAG_EMULATE (1 << 2)
+/*
+ * Same as VM_EVENT_FLAG_EMULATE, but with write operations or operations
+ * potentially having side effects (like memory mapped or port I/O) disabled.
+ */
+#define VM_EVENT_FLAG_EMULATE_NOWRITE (1 << 3)
+/*
+ * Toggle singlestepping on vm_event response.
+ * Requires the vCPU to be paused already (synchronous events only).
+ */
+#define VM_EVENT_FLAG_TOGGLE_SINGLESTEP (1 << 4)
+/*
+ * Data is being sent back to the hypervisor in the event response, to be
+ * returned by the read function when emulating an instruction.
+ * This flag is only useful when combined with VM_EVENT_FLAG_EMULATE
+ * and takes precedence if combined with VM_EVENT_FLAG_EMULATE_NOWRITE
+ * (i.e. if both VM_EVENT_FLAG_EMULATE_NOWRITE and
+ * VM_EVENT_FLAG_SET_EMUL_READ_DATA are set, only the latter will be honored).
+ */
+#define VM_EVENT_FLAG_SET_EMUL_READ_DATA (1 << 5)
+/*
+ * Deny completion of the operation that triggered the event.
+ * Currently only useful for MSR, CR0, CR3 and CR4 write events.
+ */
+#define VM_EVENT_FLAG_DENY (1 << 6)
+/*
+ * This flag can be set in a request or a response
+ *
+ * On a request, indicates that the event occurred in the alternate p2m specified by
+ * the altp2m_idx request field.
+ *
+ * On a response, indicates that the VCPU should resume in the alternate p2m specified
+ * by the altp2m_idx response field if possible.
+ */
+#define VM_EVENT_FLAG_ALTERNATE_P2M (1 << 7)
+
+/*
+ * Reasons for the vm event request
+ */
+
+/* Default case */
+#define VM_EVENT_REASON_UNKNOWN 0
+/* Memory access violation */
+#define VM_EVENT_REASON_MEM_ACCESS 1
+/* Memory sharing event */
+#define VM_EVENT_REASON_MEM_SHARING 2
+/* Memory paging event */
+#define VM_EVENT_REASON_MEM_PAGING 3
+/* A control register was updated */
+#define VM_EVENT_REASON_WRITE_CTRLREG 4
+/* An MSR was updated. */
+#define VM_EVENT_REASON_MOV_TO_MSR 5
+/* Debug operation executed (e.g. int3) */
+#define VM_EVENT_REASON_SOFTWARE_BREAKPOINT 6
+/* Single-step (e.g. MTF) */
+#define VM_EVENT_REASON_SINGLESTEP 7
+/* An event has been requested via HVMOP_guest_request_vm_event. */
+#define VM_EVENT_REASON_GUEST_REQUEST 8
+
+/* Supported values for the vm_event_write_ctrlreg index. */
+#define VM_EVENT_X86_CR0 0
+#define VM_EVENT_X86_CR3 1
+#define VM_EVENT_X86_CR4 2
+#define VM_EVENT_X86_XCR0 3
+
+/*
+ * Using a custom struct (not hvm_hw_cpu) so as to not fill
+ * the vm_event ring buffer too quickly.
+ */
+struct vm_event_regs_x86 {
+ uint64_t rax;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbx;
+ uint64_t rsp;
+ uint64_t rbp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+ uint64_t rflags;
+ uint64_t dr7;
+ uint64_t rip;
+ uint64_t cr0;
+ uint64_t cr2;
+ uint64_t cr3;
+ uint64_t cr4;
+ uint64_t sysenter_cs;
+ uint64_t sysenter_esp;
+ uint64_t sysenter_eip;
+ uint64_t msr_efer;
+ uint64_t msr_star;
+ uint64_t msr_lstar;
+ uint64_t fs_base;
+ uint64_t gs_base;
+ uint32_t cs_arbytes;
+ uint32_t _pad;
+};
+
+/*
+ * mem_access flag definitions
+ *
+ * These flags are set only as part of a vm_event request.
+ *
+ * R/W/X: Defines the type of violation that has triggered the event
+ * Multiple types can be set in a single violation!
+ * GLA_VALID: If the gla field holds a guest VA associated with the event
+ * FAULT_WITH_GLA: If the violation was triggered by accessing gla
+ * FAULT_IN_GPT: If the violation was triggered during translating gla
+ */
+#define MEM_ACCESS_R (1 << 0)
+#define MEM_ACCESS_W (1 << 1)
+#define MEM_ACCESS_X (1 << 2)
+#define MEM_ACCESS_RWX (MEM_ACCESS_R | MEM_ACCESS_W | MEM_ACCESS_X)
+#define MEM_ACCESS_RW (MEM_ACCESS_R | MEM_ACCESS_W)
+#define MEM_ACCESS_RX (MEM_ACCESS_R | MEM_ACCESS_X)
+#define MEM_ACCESS_WX (MEM_ACCESS_W | MEM_ACCESS_X)
+#define MEM_ACCESS_GLA_VALID (1 << 3)
+#define MEM_ACCESS_FAULT_WITH_GLA (1 << 4)
+#define MEM_ACCESS_FAULT_IN_GPT (1 << 5)
+
+struct vm_event_mem_access {
+ uint64_t gfn;
+ uint64_t offset;
+ uint64_t gla; /* if flags has MEM_ACCESS_GLA_VALID set */
+ uint32_t flags; /* MEM_ACCESS_* */
+ uint32_t _pad;
+};
+
+struct vm_event_write_ctrlreg {
+ uint32_t index;
+ uint32_t _pad;
+ uint64_t new_value;
+ uint64_t old_value;
+};
+
+struct vm_event_debug {
+ uint64_t gfn;
+};
+
+struct vm_event_mov_to_msr {
+ uint64_t msr;
+ uint64_t value;
+};
+
+#define MEM_PAGING_DROP_PAGE (1 << 0)
+#define MEM_PAGING_EVICT_FAIL (1 << 1)
+
+struct vm_event_paging {
+ uint64_t gfn;
+ uint32_t p2mt;
+ uint32_t flags;
+};
+
+struct vm_event_sharing {
+ uint64_t gfn;
+ uint32_t p2mt;
+ uint32_t _pad;
+};
+
+struct vm_event_emul_read_data {
+ uint32_t size;
+ /* The struct is used in a union with vm_event_regs_x86. */
+ uint8_t data[sizeof(struct vm_event_regs_x86) - sizeof(uint32_t)];
+};
+
+typedef struct vm_event_st {
+ uint32_t version; /* VM_EVENT_INTERFACE_VERSION */
+ uint32_t flags; /* VM_EVENT_FLAG_* */
+ uint32_t reason; /* VM_EVENT_REASON_* */
+ uint32_t vcpu_id;
+ uint16_t altp2m_idx; /* may be used during request and response */
+ uint16_t _pad[3];
+
+ union {
+ struct vm_event_paging mem_paging;
+ struct vm_event_sharing mem_sharing;
+ struct vm_event_mem_access mem_access;
+ struct vm_event_write_ctrlreg write_ctrlreg;
+ struct vm_event_mov_to_msr mov_to_msr;
+ struct vm_event_debug software_breakpoint;
+ struct vm_event_debug singlestep;
+ } u;
+
+ union {
+ union {
+ struct vm_event_regs_x86 x86;
+ } regs;
+
+ struct vm_event_emul_read_data emul_read_data;
+ } data;
+} vm_event_request_t, vm_event_response_t;
+
+DEFINE_RING_TYPES(vm_event, vm_event_request_t, vm_event_response_t);
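
A consumer drains this ring with the generic io/ring.h back-ring macros.
The sketch below loosely follows the pattern of the xen-access test tool;
'ring_page' (the shared ring page) and event-channel signalling are assumed
to be set up elsewhere:

    vm_event_back_ring_t br;
    vm_event_request_t req;
    vm_event_response_t rsp;

    SHARED_RING_INIT((vm_event_sring_t *)ring_page);
    BACK_RING_INIT(&br, (vm_event_sring_t *)ring_page, 4096);

    while ( RING_HAS_UNCONSUMED_REQUESTS(&br) )
    {
        memcpy(&req, RING_GET_REQUEST(&br, br.req_cons), sizeof(req));
        br.req_cons++;

        memset(&rsp, 0, sizeof(rsp));
        rsp.version = VM_EVENT_INTERFACE_VERSION;
        rsp.vcpu_id = req.vcpu_id;
        /* Echo VCPU_PAUSED back so the hypervisor unpauses the vCPU. */
        rsp.flags = req.flags & VM_EVENT_FLAG_VCPU_PAUSED;

        memcpy(RING_GET_RESPONSE(&br, br.rsp_prod_pvt), &rsp, sizeof(rsp));
        br.rsp_prod_pvt++;
        RING_PUSH_RESPONSES(&br);
    }
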
+
+#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
+#endif /* _XEN_PUBLIC_VM_EVENT_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/public/xen-compat.h b/xen/include/public/xen-compat.h
index c1d660d..1e62dc1 100644
--- a/xen/include/public/xen-compat.h
+++ b/xen/include/public/xen-compat.h
@@ -27,7 +27,7 @@
#ifndef __XEN_PUBLIC_XEN_COMPAT_H__
#define __XEN_PUBLIC_XEN_COMPAT_H__
-#define __XEN_LATEST_INTERFACE_VERSION__ 0x00040500
+#define __XEN_LATEST_INTERFACE_VERSION__ 0x00040600
#if defined(__XEN__) || defined(__XEN_TOOLS__)
/* Xen is built with matching headers and implements the latest interface. */
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
index a6a2092..ff5547e 100644
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -101,6 +101,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_ulong_t);
#define __HYPERVISOR_kexec_op 37
#define __HYPERVISOR_tmem_op 38
#define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */
+#define __HYPERVISOR_xenpmu_op 40
/* Architecture-specific hypercall definitions. */
#define __HYPERVISOR_arch_0 48
@@ -160,6 +161,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_ulong_t);
#define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occurred */
#define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */
#define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */
+#define VIRQ_XENPMU 13 /* V. PMC interrupt */
/* Architecture-specific VIRQ definitions. */
#define VIRQ_ARCH_0 16
@@ -486,7 +488,21 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
/* x86/PAE guests: support PDPTs above 4GB. */
#define VMASST_TYPE_pae_extended_cr3 3
+/*
+ * x86/64 guests: strictly hide M2P from user mode.
+ * This allows the guest to control respective hypervisor behavior:
+ * - when not set, L4 tables get created with the respective slot blank,
+ * and whenever the L4 table gets used as a kernel one the missing
+ * mapping gets inserted,
+ * - when set, L4 tables get created with the respective slot initialized
+ * as before, and whenever the L4 table gets used as a user one the
+ * mapping gets zapped.
+ */
+#define VMASST_TYPE_m2p_strict 32
+
+#if __XEN_INTERFACE_VERSION__ < 0x00040600
#define MAX_VMASST_TYPE 3
+#endif
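
A 64-bit PV guest would opt in via the usual vm_assist mechanism; minimal
sketch, assuming the guest-side HYPERVISOR_vm_assist() hypercall wrapper:

    int rc = HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_m2p_strict);

    if ( rc )
        /* Older hypervisor (interface < 0x00040600): fall back to the
         * default M2P handling. */;
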
#ifndef __ASSEMBLY__
@@ -682,6 +698,12 @@ struct shared_info {
uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */
uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */
uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */
+#if !defined(__i386__)
+ uint32_t wc_sec_hi;
+# define xen_wc_sec_hi wc_sec_hi
+#elif !defined(__XEN__) && !defined(__XEN_TOOLS__)
+# define xen_wc_sec_hi arch.wc_sec_hi
+#endif
struct arch_shared_info arch;
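
Guests reconstruct the full 64-bit wallclock seconds under the wc_version
counter, as sketched below ('s' points at the shared_info; rmb() stands in
for whatever read barrier the guest environment provides; an odd version
means an update is in flight):

    uint32_t ver;
    uint64_t sec;

    do {
        ver = s->wc_version;
        rmb();                     /* version before payload */
        sec = ((uint64_t)s->xen_wc_sec_hi << 32) | s->wc_sec;
        rmb();                     /* payload before re-check */
    } while ( (ver & 1) || (ver != s->wc_version) );
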
@@ -698,24 +720,27 @@ typedef struct shared_info shared_info_t;
* 3. This the order of bootstrap elements in the initial virtual region:
* a. relocated kernel image
* b. initial ram disk [mod_start, mod_len]
+ * (may be omitted)
* c. list of allocated page frames [mfn_list, nr_pages]
* (unless relocated due to XEN_ELFNOTE_INIT_P2M)
* d. start_info_t structure [register ESI (x86)]
- * e. bootstrap page tables [pt_base and CR3 (x86)]
- * f. bootstrap stack [register ESP (x86)]
+ * in the case of dom0 this page contains the console info, too
+ * e. unless dom0: xenstore ring page
+ * f. unless dom0: console ring page
+ * g. bootstrap page tables [pt_base and CR3 (x86)]
+ * h. bootstrap stack [register ESP (x86)]
* 4. Bootstrap elements are packed together, but each is 4kB-aligned.
- * 5. The initial ram disk may be omitted.
- * 6. The list of page frames forms a contiguous 'pseudo-physical' memory
+ * 5. The list of page frames forms a contiguous 'pseudo-physical' memory
* layout for the domain. In particular, the bootstrap virtual-memory
* region is a 1:1 mapping to the first section of the pseudo-physical map.
- * 7. All bootstrap elements are mapped read-writable for the guest OS. The
+ * 6. All bootstrap elements are mapped read-writable for the guest OS. The
* only exception is the bootstrap page table, which is mapped read-only.
- * 8. There is guaranteed to be at least 512kB padding after the final
+ * 7. There is guaranteed to be at least 512kB padding after the final
* bootstrap element. If necessary, the bootstrap virtual region is
* extended by an extra 4MB to ensure this.
*
* Note: Prior to 25833:bb85bbccb1c9 ("x86/32-on-64 adjust Dom0 initial page
- * table layout") a bug caused the pt_base (3.e above) and cr3 to not point
+ * table layout") a bug caused the pt_base (3.g above) and cr3 to not point
* to the start of the guest page tables (it was offset by two pages).
* This only manifested itself on 32-on-64 dom0 kernels and not 32-on-64 domU
* or 64-bit kernels of any colour. The page tables for a 32-on-64 dom0 got
@@ -771,6 +796,8 @@ typedef struct start_info start_info_t;
#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */
#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */
+#define SIF_VIRT_P2M_4TOOLS (1<<4) /* Do Xen tools understand a virt. mapped */
+ /* P->M making the 3 level tree obsolete? */
#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
/*
@@ -870,6 +897,9 @@ __DEFINE_XEN_GUEST_HANDLE(uint64, uint64_t);
/* Default definitions for macros used by domctl/sysctl. */
#if defined(__XEN__) || defined(__XEN_TOOLS__)
+#ifndef int64_aligned_t
+#define int64_aligned_t int64_t
+#endif
#ifndef uint64_aligned_t
#define uint64_aligned_t uint64_t
#endif
diff --git a/xen/include/public/xsm/flask_op.h b/xen/include/public/xsm/flask_op.h
index 233de81..c76359c 100644
--- a/xen/include/public/xsm/flask_op.h
+++ b/xen/include/public/xsm/flask_op.h
@@ -25,6 +25,8 @@
#ifndef __FLASK_OP_H__
#define __FLASK_OP_H__
+#include "../event_channel.h"
+
#define XEN_FLASK_INTERFACE_VERSION 1
struct xen_flask_load {
@@ -148,6 +150,13 @@ struct xen_flask_relabel {
uint32_t sid;
};
+struct xen_flask_devicetree_label {
+ /* IN */
+ uint32_t sid;
+ uint32_t length;
+ XEN_GUEST_HANDLE(char) path;
+};
+
struct xen_flask_op {
uint32_t cmd;
#define FLASK_LOAD 1
@@ -174,6 +183,7 @@ struct xen_flask_op {
#define FLASK_DEL_OCONTEXT 22
#define FLASK_GET_PEER_SID 23
#define FLASK_RELABEL_DOMAIN 24
+#define FLASK_DEVICETREE_LABEL 25
uint32_t interface_version; /* XEN_FLASK_INTERFACE_VERSION */
union {
struct xen_flask_load load;
@@ -193,6 +203,7 @@ struct xen_flask_op {
struct xen_flask_ocontext ocontext;
struct xen_flask_peersid peersid;
struct xen_flask_relabel relabel;
+ struct xen_flask_devicetree_label devicetree_label;
} u;
};
typedef struct xen_flask_op xen_flask_op_t;
diff --git a/xen/include/xen/acpi.h b/xen/include/xen/acpi.h
index 3aeba4a..f26658f 100644
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -16,8 +16,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
diff --git a/xen/include/xen/bitops.h b/xen/include/xen/bitops.h
index 6054155..cb56f24 100644
--- a/xen/include/xen/bitops.h
+++ b/xen/include/xen/bitops.h
@@ -70,20 +70,52 @@ static __inline__ int generic_fls(int x)
return r;
}
+#if BITS_PER_LONG == 64
+
+static inline int generic_ffsl(unsigned long x)
+{
+ return !x || (u32)x ? generic_ffs(x) : generic_ffs(x >> 32) + 32;
+}
+
+static inline int generic_flsl(unsigned long x)
+{
+ u32 h = x >> 32;
+
+ return h ? generic_fls(h) + 32 : generic_fls(x);
+}
+
+#else
+# define generic_ffsl generic_ffs
+# define generic_flsl generic_fls
+#endif
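
For reference, the intended semantics on a BITS_PER_LONG == 64 build (bit
positions are 1-based; 0 means no bit set):

    generic_ffsl(0)              == 0
    generic_ffsl(1UL << 63)      == 64
    generic_flsl(1)              == 1
    generic_flsl(0x100000000UL)  == 33   /* fls of the high half, plus 32 */
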
+
/*
* Include this here because some architectures need generic_ffs/fls in
* scope
*/
#include <asm/bitops.h>
-
+#if BITS_PER_LONG == 64
+# define fls64 flsl
+# define ffs64 ffsl
+#else
+# ifndef ffs64
+static inline int generic_ffs64(__u64 x)
+{
+ return !x || (__u32)x ? ffs(x) : ffs(x >> 32) + 32;
+}
+# define ffs64 generic_ffs64
+# endif
+# ifndef fls64
static inline int generic_fls64(__u64 x)
{
__u32 h = x >> 32;
- if (h)
- return fls(x) + 32;
- return fls(x);
+
+ return h ? fls(h) + 32 : fls(x);
}
+# define fls64 generic_fls64
+# endif
+#endif
static __inline__ int get_bitmask_order(unsigned int count)
{
diff --git a/xen/include/xen/config.h b/xen/include/xen/config.h
index 7bef8a6..f7258c7 100644
--- a/xen/include/xen/config.h
+++ b/xen/include/xen/config.h
@@ -69,18 +69,8 @@
#define __force
#define __bitwise
-#define MB(_mb) (_AC(_mb, UL) << 20)
-#define GB(_gb) (_AC(_gb, UL) << 30)
-
-#ifndef __ASSEMBLY__
-
-#define dprintk(_l, _f, _a...) \
- printk(_l "%s:%d: " _f, __FILE__ , __LINE__ , ## _a )
-#define gdprintk(_l, _f, _a...) \
- printk(XENLOG_GUEST _l "%s:%d:%pv " _f, __FILE__, \
- __LINE__, current, ## _a )
-
-#endif /* !__ASSEMBLY__ */
+#define MB(_mb) (_AC(_mb, ULL) << 20)
+#define GB(_gb) (_AC(_gb, ULL) << 30)
#define __STR(...) #__VA_ARGS__
#define STR(...) __STR(__VA_ARGS__)
@@ -100,10 +90,6 @@
#ifdef FLASK_ENABLE
#define XSM_MAGIC 0xf97cff8c
-/* Enable permissive mode (xl setenforce or flask_enforcing parameter) */
-#define FLASK_DEVELOP 1
-/* Allow runtime disabling of FLASK via the flask_enable parameter */
-#define FLASK_BOOTPARAM 1
/* Maintain statistics on the access vector cache */
#define FLASK_AVC_STATS 1
#endif
diff --git a/xen/include/xen/cper.h b/xen/include/xen/cper.h
index e6e06dd..8788cef 100644
--- a/xen/include/xen/cper.h
+++ b/xen/include/xen/cper.h
@@ -15,8 +15,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef LINUX_CPER_H
diff --git a/xen/include/xen/cpuidle.h b/xen/include/xen/cpuidle.h
index b7b9e8c..dffbcbb 100644
--- a/xen/include/xen/cpuidle.h
+++ b/xen/include/xen/cpuidle.h
@@ -19,8 +19,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ * with this program; If not, see <http://www.gnu.org/licenses/>.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
@@ -66,6 +65,7 @@ struct acpi_processor_power
struct acpi_processor_cx *last_state;
struct acpi_processor_cx *safe_state;
void *gdata; /* governor specific data */
+ u64 last_state_update_tick;
u32 last_residency;
u32 count;
spinlock_t stat_lock;
diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h
index 850b4a2..0e7108c 100644
--- a/xen/include/xen/cpumask.h
+++ b/xen/include/xen/cpumask.h
@@ -103,11 +103,21 @@ static inline void cpumask_set_cpu(int cpu, volatile cpumask_t *dstp)
set_bit(cpumask_check(cpu), dstp->bits);
}
+static inline void __cpumask_set_cpu(int cpu, cpumask_t *dstp)
+{
+ __set_bit(cpumask_check(cpu), dstp->bits);
+}
+
static inline void cpumask_clear_cpu(int cpu, volatile cpumask_t *dstp)
{
clear_bit(cpumask_check(cpu), dstp->bits);
}
+static inline void __cpumask_clear_cpu(int cpu, cpumask_t *dstp)
+{
+ __clear_bit(cpumask_check(cpu), dstp->bits);
+}
+
static inline void cpumask_setall(cpumask_t *dstp)
{
bitmap_fill(dstp->bits, nr_cpumask_bits);
@@ -122,16 +132,26 @@ static inline void cpumask_clear(cpumask_t *dstp)
#define cpumask_test_cpu(cpu, cpumask) \
test_bit(cpumask_check(cpu), (cpumask)->bits)
-static inline int cpumask_test_and_set_cpu(int cpu, cpumask_t *addr)
+static inline int cpumask_test_and_set_cpu(int cpu, volatile cpumask_t *addr)
{
return test_and_set_bit(cpumask_check(cpu), addr->bits);
}
-static inline int cpumask_test_and_clear_cpu(int cpu, cpumask_t *addr)
+static inline int __cpumask_test_and_set_cpu(int cpu, cpumask_t *addr)
+{
+ return __test_and_set_bit(cpumask_check(cpu), addr->bits);
+}
+
+static inline int cpumask_test_and_clear_cpu(int cpu, volatile cpumask_t *addr)
{
return test_and_clear_bit(cpumask_check(cpu), addr->bits);
}
+static inline int __cpumask_test_and_clear_cpu(int cpu, cpumask_t *addr)
+{
+ return __test_and_clear_bit(cpumask_check(cpu), addr->bits);
+}
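
The new double-underscore variants map onto the non-atomic bit operations,
so they are only suitable for masks that cannot be updated concurrently; a
minimal sketch with a privately owned mask:

    cpumask_t scratch;   /* assumed private to this context */

    cpumask_clear(&scratch);
    __cpumask_set_cpu(3, &scratch);        /* no locked RMW needed */
    if ( __cpumask_test_and_clear_cpu(3, &scratch) )
        /* bit 3 was set, and is now clear */;
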
+
static inline void cpumask_and(cpumask_t *dstp, const cpumask_t *src1p,
const cpumask_t *src2p)
{
diff --git a/xen/include/xen/device_tree.h b/xen/include/xen/device_tree.h
index 08db8bc..46c5ba8 100644
--- a/xen/include/xen/device_tree.h
+++ b/xen/include/xen/device_tree.h
@@ -11,7 +11,9 @@
#define __XEN_DEVICE_TREE_H__
#include <asm/byteorder.h>
+#include <asm/device.h>
#include <public/xen.h>
+#include <xen/kernel.h>
#include <xen/init.h>
#include <xen/string.h>
#include <xen/types.h>
@@ -28,6 +30,7 @@ struct dt_device_match {
const char *type;
const char *compatible;
const bool_t not_available;
+ const void *data;
};
#define DT_MATCH_PATH(p) { .path = p }
@@ -79,8 +82,19 @@ struct dt_device_node {
/* IOMMU specific fields */
bool is_protected;
struct list_head domain_list;
+
+ struct device dev;
};
+#define dt_to_dev(dt_node) (&(dt_node)->dev)
+
+static inline struct dt_device_node *dev_to_dt(struct device *dev)
+{
+ ASSERT(dev->type == DEV_DT);
+
+ return container_of(dev, struct dt_device_node, dev);
+}
+
#define MAX_PHANDLE_ARGS 16
struct dt_phandle_args {
struct dt_device_node *np;
@@ -442,6 +456,20 @@ struct dt_device_node *dt_find_node_by_alias(const char *alias);
*/
struct dt_device_node *dt_find_node_by_path(const char *path);
+
+/**
+ * dt_find_node_by_gpath - Same as dt_find_node_by_path but retrieving the
+ * path from the guest
+ *
+ * @u_path: Xen Guest handle to the buffer containing the path
+ * @u_plen: Length of the buffer
+ * @node: TODO
+ *
+ * Return 0 on success, otherwise -errno
+ */
+int dt_find_node_by_gpath(XEN_GUEST_HANDLE(char) u_path, uint32_t u_plen,
+ struct dt_device_node **node);
+
/**
* dt_get_parent - Get a node's parent if any
* @node: Node to get parent
@@ -460,7 +488,7 @@ const struct dt_device_node *dt_get_parent(const struct dt_device_node *node);
* This function resolves an address, walking the tree, for a given
* device-tree node. It returns 0 on success.
*/
-int dt_device_get_address(const struct dt_device_node *dev, int index,
+int dt_device_get_address(const struct dt_device_node *dev, unsigned int index,
u64 *addr, u64 *size);
/**
@@ -490,7 +518,7 @@ unsigned int dt_number_of_address(const struct dt_device_node *device);
* This function resolves an interrupt, walking the tree, for a given
* device-tree node. It's the high level pendant to dt_device_get_raw_irq().
*/
-int dt_device_get_irq(const struct dt_device_node *device, int index,
+int dt_device_get_irq(const struct dt_device_node *device, unsigned int index,
struct dt_irq *irq);
/**
@@ -502,7 +530,8 @@ int dt_device_get_irq(const struct dt_device_node *device, int index,
* This function resolves an interrupt for a device, no translation is
* made. dt_irq_translate can be called after.
*/
-int dt_device_get_raw_irq(const struct dt_device_node *device, int index,
+int dt_device_get_raw_irq(const struct dt_device_node *device,
+ unsigned int index,
struct dt_raw_irq *irq);
/**
@@ -513,6 +542,30 @@ int dt_device_get_raw_irq(const struct dt_device_node *device, int index,
int dt_irq_translate(const struct dt_raw_irq *raw, struct dt_irq *out_irq);
/**
+ * dt_for_each_irq_map - Iterate over a node's interrupt-map property
+ * @dev: The node whose interrupt-map property should be iterated over
+ * @cb: Call back to call for each entry
+ * @data: Caller data passed to callback
+ */
+int dt_for_each_irq_map(const struct dt_device_node *dev,
+ int (*cb)(const struct dt_device_node *,
+ const struct dt_irq *,
+ void *),
+ void *data);
+
+/**
+ * dt_for_each_range - Iterate over a node's ranges property
+ * @dev: The node whose ranges property should be iterated over
+ * @cb: Call back to call for each entry
+ * @data: Caller data passed to callback
+ */
+int dt_for_each_range(const struct dt_device_node *dev,
+ int (*cb)(const struct dt_device_node *,
+ u64 addr, u64 length,
+ void *),
+ void *data);
+
+/**
* dt_n_size_cells - Helper to retrieve the number of cell for the size
* @np: node to get the value
*
@@ -541,14 +594,26 @@ int dt_n_addr_cells(const struct dt_device_node *np);
bool_t dt_device_is_available(const struct dt_device_node *device);
/**
+ * dt_device_for_passthrough - Check if a device will be used for
+ * passthrough later
+ *
+ * @device: Node to check
+ *
+ * Return true if the property "xen,passthrough" is present in the node,
+ * false otherwise.
+ */
+bool_t dt_device_for_passthrough(const struct dt_device_node *device);
+
+/**
* dt_match_node - Find the dt_device_match entry matching a device_node
* @matches: array of dt_device_match structures to search in
* @node: the dt_device_node structure to match against
*
* Returns the first matching dt_device_match entry, or NULL if none matches.
*/
-bool_t dt_match_node(const struct dt_device_match *matches,
- const struct dt_device_node *node);
+const struct dt_device_match *
+dt_match_node(const struct dt_device_match *matches,
+ const struct dt_device_node *node);
/**
* dt_find_matching_node - Find a node based on an dt_device_match match table
diff --git a/xen/include/xen/dmi.h b/xen/include/xen/dmi.h
index 8205893..df26d4b 100644
--- a/xen/include/xen/dmi.h
+++ b/xen/include/xen/dmi.h
@@ -34,8 +34,8 @@ struct dmi_system_id {
extern int dmi_check_system(struct dmi_system_id *list);
extern void dmi_scan_machine(void);
-extern int dmi_get_table(u32 *base, u32 *len);
-extern void dmi_efi_get_table(void *);
+extern const char *dmi_get_table(paddr_t *base, u32 *len);
+extern void dmi_efi_get_table(const void *smbios, const void *smbios3);
bool_t dmi_get_date(int field, int *yearp, int *monthp, int *dayp);
extern void dmi_end_boot(void);
diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
index 72667da..848db8a 100644
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -4,6 +4,7 @@
#include <public/xen.h>
#include <asm/domain.h>
+#include <asm/numa.h>
typedef union {
struct vcpu_guest_context *nat;
@@ -55,7 +56,8 @@ void vcpu_destroy(struct vcpu *v);
int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset);
void unmap_vcpu_info(struct vcpu *v);
-int arch_domain_create(struct domain *d, unsigned int domcr_flags);
+int arch_domain_create(struct domain *d, unsigned int domcr_flags,
+ struct xen_arch_domainconfig *config);
void arch_domain_destroy(struct domain *d);
@@ -99,7 +101,7 @@ struct vnuma_info {
unsigned int nr_vmemranges;
unsigned int *vdistance;
unsigned int *vcpu_to_vnode;
- unsigned int *vnode_to_pnode;
+ nodeid_t *vnode_to_pnode;
struct xen_vmemrange *vmemrange;
};
diff --git a/xen/include/xen/domain_page.h b/xen/include/xen/domain_page.h
index b7a710b..c1d630c 100644
--- a/xen/include/xen/domain_page.h
+++ b/xen/include/xen/domain_page.h
@@ -11,13 +11,19 @@
#include <xen/mm.h>
+/*
+ * Clear a given page frame, or copy between two of them.
+ */
+void clear_domain_page(mfn_t mfn);
+void copy_domain_page(mfn_t dst, const mfn_t src);
+
#ifdef CONFIG_DOMAIN_PAGE
/*
* Map a given page frame, returning the mapped virtual address. The page is
* then accessible within the current VCPU until a corresponding unmap call.
*/
-void *map_domain_page(unsigned long mfn);
+void *map_domain_page(mfn_t mfn);
/*
* Pass a VA within a page previously mapped in the context of the
@@ -25,12 +31,6 @@ void *map_domain_page(unsigned long mfn);
*/
void unmap_domain_page(const void *va);
-/*
- * Clear a given page frame, or copy between two of them.
- */
-void clear_domain_page(unsigned long mfn);
-void copy_domain_page(unsigned long dmfn, unsigned long smfn);
-
/*
* Given a VA from map_domain_page(), return its underlying MFN.
*/
@@ -41,11 +41,15 @@ unsigned long domain_page_map_to_mfn(const void *va);
* address spaces (not just within the VCPU that created the mapping). Global
* mappings can also be unmapped from any context.
*/
-void *map_domain_page_global(unsigned long mfn);
+void *map_domain_page_global(mfn_t mfn);
void unmap_domain_page_global(const void *va);
-#define __map_domain_page(pg) map_domain_page(__page_to_mfn(pg))
-#define __map_domain_page_global(pg) map_domain_page_global(__page_to_mfn(pg))
+#define __map_domain_page(pg) map_domain_page(_mfn(__page_to_mfn(pg)))
+
+static inline void *__map_domain_page_global(const struct page_info *pg)
+{
+ return map_domain_page_global(_mfn(__page_to_mfn(pg)));
+}
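
Typical usage of the now-typesafe interface, sketched (assuming an mfn_t
'mfn' and a struct page_info * 'pg' already in hand):

    void *va = map_domain_page(mfn);
    clear_page(va);                  /* ... use the mapping ... */
    unmap_domain_page(va);

    va = __map_domain_page(pg);      /* directly from a page_info */
    unmap_domain_page(va);
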
#define DMCACHE_ENTRY_VALID 1U
#define DMCACHE_ENTRY_HELD 2U
@@ -80,7 +84,7 @@ map_domain_page_with_cache(unsigned long mfn, struct domain_mmap_cache *cache)
}
cache->mfn = mfn;
- cache->va = map_domain_page(mfn);
+ cache->va = map_domain_page(_mfn(mfn));
cache->flags = DMCACHE_ENTRY_HELD | DMCACHE_ENTRY_VALID;
done:
@@ -109,17 +113,22 @@ domain_mmap_cache_destroy(struct domain_mmap_cache *cache)
#else /* !CONFIG_DOMAIN_PAGE */
-#define map_domain_page(mfn) mfn_to_virt(mfn)
+#define map_domain_page(mfn) mfn_to_virt(mfn_x(mfn))
#define __map_domain_page(pg) page_to_virt(pg)
#define unmap_domain_page(va) ((void)(va))
-#define clear_domain_page(mfn) clear_page(mfn_to_virt(mfn))
-#define copy_domain_page(dmfn, smfn) copy_page(mfn_to_virt(dmfn), \
- mfn_to_virt(smfn))
#define domain_page_map_to_mfn(va) virt_to_mfn((unsigned long)(va))
-#define map_domain_page_global(mfn) mfn_to_virt(mfn)
-#define __map_domain_page_global(pg) page_to_virt(pg)
-#define unmap_domain_page_global(va) ((void)(va))
+static inline void *map_domain_page_global(mfn_t mfn)
+{
+ return mfn_to_virt(mfn_x(mfn));
+}
+
+static inline void *__map_domain_page_global(const struct page_info *pg)
+{
+ return page_to_virt(pg);
+}
+
+static inline void unmap_domain_page_global(const void *va) {}
struct domain_mmap_cache {
};
diff --git a/xen/include/xen/earlycpio.h b/xen/include/xen/earlycpio.h
index 85d144a..16d9404 100644
--- a/xen/include/xen/earlycpio.h
+++ b/xen/include/xen/earlycpio.h
@@ -6,6 +6,7 @@
struct cpio_data {
void *data;
size_t size;
+ char name[MAX_CPIO_FILE_NAME];
};
struct cpio_data find_cpio_data(const char *path, void *data, size_t len,
diff --git a/xen/include/xen/efi.h b/xen/include/xen/efi.h
index 5e02724..e74dad1 100644
--- a/xen/include/xen/efi.h
+++ b/xen/include/xen/efi.h
@@ -15,6 +15,7 @@ struct efi {
unsigned long acpi; /* ACPI table (IA64 ext 0.71) */
unsigned long acpi20; /* ACPI table (ACPI 2.0) */
unsigned long smbios; /* SM BIOS table */
+ unsigned long smbios3; /* SMBIOS v3 table */
};
extern struct efi efi;
@@ -28,7 +29,7 @@ struct xenpf_efi_runtime_call;
struct compat_pf_efi_runtime_call;
void efi_init_memory(void);
-paddr_t efi_rs_page_table(void);
+bool_t efi_rs_using_pgtables(void);
unsigned long efi_get_time(void);
void efi_halt_system(void);
void efi_reset_system(bool_t warm);
diff --git a/xen/include/xen/errno.h b/xen/include/xen/errno.h
index 39147be..3178466 100644
--- a/xen/include/xen/errno.h
+++ b/xen/include/xen/errno.h
@@ -1,137 +1,20 @@
-#ifndef _I386_ERRNO_H
-#define _I386_ERRNO_H
+#ifndef __XEN_ERRNO_H__
+#define __XEN_ERRNO_H__
-/* ` enum neg_errnoval { [ -Efoo for each Efoo in the list below ] } */
-/* ` enum errnoval { */
+#include <public/errno.h>
-#define EPERM 1 /* Operation not permitted */
-#define ENOENT 2 /* No such file or directory */
-#define ESRCH 3 /* No such process */
-#define EINTR 4 /* Interrupted system call */
-#define EIO 5 /* I/O error */
-#define ENXIO 6 /* No such device or address */
-#define E2BIG 7 /* Arg list too long */
-#define ENOEXEC 8 /* Exec format error */
-#define EBADF 9 /* Bad file number */
-#define ECHILD 10 /* No child processes */
-#define EAGAIN 11 /* Try again */
-#define ENOMEM 12 /* Out of memory */
-#define EACCES 13 /* Permission denied */
-#define EFAULT 14 /* Bad address */
-#define ENOTBLK 15 /* Block device required */
-#define EBUSY 16 /* Device or resource busy */
-#define EEXIST 17 /* File exists */
-#define EXDEV 18 /* Cross-device link */
-#define ENODEV 19 /* No such device */
-#define ENOTDIR 20 /* Not a directory */
-#define EISDIR 21 /* Is a directory */
-#define EINVAL 22 /* Invalid argument */
-#define ENFILE 23 /* File table overflow */
-#define EMFILE 24 /* Too many open files */
-#define ENOTTY 25 /* Not a typewriter */
-#define ETXTBSY 26 /* Text file busy */
-#define EFBIG 27 /* File too large */
-#define ENOSPC 28 /* No space left on device */
-#define ESPIPE 29 /* Illegal seek */
-#define EROFS 30 /* Read-only file system */
-#define EMLINK 31 /* Too many links */
-#define EPIPE 32 /* Broken pipe */
-#define EDOM 33 /* Math argument out of domain of func */
-#define ERANGE 34 /* Math result not representable */
-#define EDEADLK 35 /* Resource deadlock would occur */
-#define ENAMETOOLONG 36 /* File name too long */
-#define ENOLCK 37 /* No record locks available */
-#define ENOSYS 38 /* Function not implemented */
-#define ENOTEMPTY 39 /* Directory not empty */
-#define ELOOP 40 /* Too many symbolic links encountered */
-#define EWOULDBLOCK EAGAIN /* Operation would block */
-#define ENOMSG 42 /* No message of desired type */
-#define EIDRM 43 /* Identifier removed */
-#define ECHRNG 44 /* Channel number out of range */
-#define EL2NSYNC 45 /* Level 2 not synchronized */
-#define EL3HLT 46 /* Level 3 halted */
-#define EL3RST 47 /* Level 3 reset */
-#define ELNRNG 48 /* Link number out of range */
-#define EUNATCH 49 /* Protocol driver not attached */
-#define ENOCSI 50 /* No CSI structure available */
-#define EL2HLT 51 /* Level 2 halted */
-#define EBADE 52 /* Invalid exchange */
-#define EBADR 53 /* Invalid request descriptor */
-#define EXFULL 54 /* Exchange full */
-#define ENOANO 55 /* No anode */
-#define EBADRQC 56 /* Invalid request code */
-#define EBADSLT 57 /* Invalid slot */
+#ifndef __ASSEMBLY__
-#define EDEADLOCK EDEADLK
+#define XEN_ERRNO(name, value) name = XEN_##name,
+enum {
+#include <public/errno.h>
+};
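
public/errno.h is an x-macro list, so each of its entries re-expands under
whatever XEN_ERRNO definition is in scope (the bare #include further up
supplies the XEN_* constants themselves); for example:

    /* Entry in public/errno.h:    XEN_ERRNO(EPERM, 1)    */
    /* Expanded inside this enum:  EPERM = XEN_EPERM,     */
    /* Expanded for assembly:      .equ EPERM, XEN_EPERM  */
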
-#define EBFONT 59 /* Bad font file format */
-#define ENOSTR 60 /* Device not a stream */
-#define ENODATA 61 /* No data available */
-#define ETIME 62 /* Timer expired */
-#define ENOSR 63 /* Out of streams resources */
-#define ENONET 64 /* Machine is not on the network */
-#define ENOPKG 65 /* Package not installed */
-#define EREMOTE 66 /* Object is remote */
-#define ENOLINK 67 /* Link has been severed */
-#define EADV 68 /* Advertise error */
-#define ESRMNT 69 /* Srmount error */
-#define ECOMM 70 /* Communication error on send */
-#define EPROTO 71 /* Protocol error */
-#define EMULTIHOP 72 /* Multihop attempted */
-#define EDOTDOT 73 /* RFS specific error */
-#define EBADMSG 74 /* Not a data message */
-#define EOVERFLOW 75 /* Value too large for defined data type */
-#define ENOTUNIQ 76 /* Name not unique on network */
-#define EBADFD 77 /* File descriptor in bad state */
-#define EREMCHG 78 /* Remote address changed */
-#define ELIBACC 79 /* Can not access a needed shared library */
-#define ELIBBAD 80 /* Accessing a corrupted shared library */
-#define ELIBSCN 81 /* .lib section in a.out corrupted */
-#define ELIBMAX 82 /* Attempting to link in too many shared libraries */
-#define ELIBEXEC 83 /* Cannot exec a shared library directly */
-#define EILSEQ 84 /* Illegal byte sequence */
-#define ERESTART 85 /* Interrupted system call should be restarted */
-#define ESTRPIPE 86 /* Streams pipe error */
-#define EUSERS 87 /* Too many users */
-#define ENOTSOCK 88 /* Socket operation on non-socket */
-#define EDESTADDRREQ 89 /* Destination address required */
-#define EMSGSIZE 90 /* Message too long */
-#define EPROTOTYPE 91 /* Protocol wrong type for socket */
-#define ENOPROTOOPT 92 /* Protocol not available */
-#define EPROTONOSUPPORT 93 /* Protocol not supported */
-#define ESOCKTNOSUPPORT 94 /* Socket type not supported */
-#define EOPNOTSUPP 95 /* Operation not supported on transport endpoint */
-#define EPFNOSUPPORT 96 /* Protocol family not supported */
-#define EAFNOSUPPORT 97 /* Address family not supported by protocol */
-#define EADDRINUSE 98 /* Address already in use */
-#define EADDRNOTAVAIL 99 /* Cannot assign requested address */
-#define ENETDOWN 100 /* Network is down */
-#define ENETUNREACH 101 /* Network is unreachable */
-#define ENETRESET 102 /* Network dropped connection because of reset */
-#define ECONNABORTED 103 /* Software caused connection abort */
-#define ECONNRESET 104 /* Connection reset by peer */
-#define ENOBUFS 105 /* No buffer space available */
-#define EISCONN 106 /* Transport endpoint is already connected */
-#define ENOTCONN 107 /* Transport endpoint is not connected */
-#define ESHUTDOWN 108 /* Cannot send after transport endpoint shutdown */
-#define ETOOMANYREFS 109 /* Too many references: cannot splice */
-#define ETIMEDOUT 110 /* Connection timed out */
-#define ECONNREFUSED 111 /* Connection refused */
-#define EHOSTDOWN 112 /* Host is down */
-#define EHOSTUNREACH 113 /* No route to host */
-#define EALREADY 114 /* Operation already in progress */
-#define EINPROGRESS 115 /* Operation now in progress */
-#define ESTALE 116 /* Stale NFS file handle */
-#define EUCLEAN 117 /* Structure needs cleaning */
-#define ENOTNAM 118 /* Not a XENIX named type file */
-#define ENAVAIL 119 /* No XENIX semaphores available */
-#define EISNAM 120 /* Is a named type file */
-#define EREMOTEIO 121 /* Remote I/O error */
-#define EDQUOT 122 /* Quota exceeded */
+#else /* !__ASSEMBLY__ */
-#define ENOMEDIUM 123 /* No medium found */
-#define EMEDIUMTYPE 124 /* Wrong medium type */
+#define XEN_ERRNO(name, value) .equ name, XEN_##name
+#include <public/errno.h>
-/* ` } */
+#endif /* __ASSEMBLY__ */
-#endif
+#endif /* __XEN_ERRNO_H__ */
diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
index 88526f8..af923d1 100644
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -58,10 +58,9 @@ void evtchn_move_pirqs(struct vcpu *v);
typedef void (*xen_event_channel_notification_t)(
struct vcpu *v, unsigned int port);
int alloc_unbound_xen_event_channel(
- struct vcpu *local_vcpu, domid_t remote_domid,
+ struct domain *ld, unsigned int lvcpu, domid_t remote_domid,
xen_event_channel_notification_t notification_fn);
-void free_xen_event_channel(
- struct vcpu *local_vcpu, int port);
+void free_xen_event_channel(struct domain *d, int port);
/* Query if event channel is in use by the guest */
int guest_enabled_event(struct vcpu *v, uint32_t virq);
@@ -90,11 +89,7 @@ static inline bool_t port_is_valid(struct domain *d, unsigned int p)
{
if ( p >= d->max_evtchns )
return 0;
- if ( !d->evtchn )
- return 0;
- if ( p < EVTCHNS_PER_BUCKET )
- return 1;
- return group_from_port(d, p) != NULL && bucket_from_port(d, p) != NULL;
+ return p < read_atomic(&d->valid_evtchns);
}
static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p)
@@ -152,10 +147,11 @@ static inline void evtchn_port_init(struct domain *d, struct evtchn *evtchn)
d->evtchn_port_ops->init(d, evtchn);
}
-static inline void evtchn_port_set_pending(struct vcpu *v,
+static inline void evtchn_port_set_pending(struct domain *d,
+ unsigned int vcpu_id,
struct evtchn *evtchn)
{
- v->domain->evtchn_port_ops->set_pending(v, evtchn);
+ d->evtchn_port_ops->set_pending(d->vcpu[vcpu_id], evtchn);
}
static inline void evtchn_port_clear_pending(struct domain *d,
diff --git a/xen/include/xen/gdbstub.h b/xen/include/xen/gdbstub.h
index 67d7410..ab710da 100644
--- a/xen/include/xen/gdbstub.h
+++ b/xen/include/xen/gdbstub.h
@@ -14,8 +14,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __XEN_GDBSTUB_H__
diff --git a/xen/include/xen/grant_table.h b/xen/include/xen/grant_table.h
index 32f5786..5263fd6 100644
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -17,8 +17,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __XEN_GRANT_TABLE_H__
@@ -52,18 +51,13 @@
/* The maximum size of a grant table. */
extern unsigned int max_grant_frames;
-/*
- * Tracks a mapping of another domain's grant reference. Each domain has a
- * table of these, indexes into which are returned as a 'mapping handle'.
- */
-struct grant_mapping {
- u32 ref; /* grant ref */
- u16 flags; /* 0-4: GNTMAP_* ; 5-15: unused */
- domid_t domid; /* granting domain */
-};
-
/* Per-domain grant information. */
struct grant_table {
+ /*
+ * Lock protecting updates to grant table state (version, active
+ * entry list, etc.)
+ */
+ rwlock_t lock;
/* Table size. Number of frames shared with guest */
unsigned int nr_grant_frames;
/* Shared grant table (see include/public/grant_table.h). */
@@ -78,12 +72,11 @@ struct grant_table {
grant_status_t **status;
/* Active grant table. */
struct active_grant_entry **active;
- /* Mapping tracking table. */
+ /* Mapping tracking table per vcpu. */
struct grant_mapping **maptrack;
- unsigned int maptrack_head;
unsigned int maptrack_limit;
- /* Lock protecting updates to active and shared grant tables. */
- spinlock_t lock;
+ /* Lock protecting the maptrack page list, head, and limit */
+ spinlock_t maptrack_lock;
/* The defined versions are 1 and 2. Set to 0 if we don't know
what version to use yet. */
unsigned gt_version;
@@ -94,6 +87,7 @@ int grant_table_create(
struct domain *d);
void grant_table_destroy(
struct domain *d);
+void grant_table_init_vcpu(struct vcpu *v);
/* Domain death release of granted mappings of other domains' memory. */
void
@@ -101,7 +95,7 @@ gnttab_release_mappings(
struct domain *d);
/* Increase the size of a domain's grant table.
- * Caller must hold d's grant table lock.
+ * Caller must hold d's grant table write lock.
*/
int
gnttab_grow_table(struct domain *d, unsigned int req_nr_frames);
diff --git a/xen/include/xen/guest_access.h b/xen/include/xen/guest_access.h
index 373454e..09989df 100644
--- a/xen/include/xen/guest_access.h
+++ b/xen/include/xen/guest_access.h
@@ -8,6 +8,8 @@
#define __XEN_GUEST_ACCESS_H__
#include <asm/guest_access.h>
+#include <xen/types.h>
+#include <public/xen.h>
#define copy_to_guest(hnd, ptr, nr) \
copy_to_guest_offset(hnd, 0, ptr, nr)
@@ -27,4 +29,7 @@
#define __clear_guest(hnd, nr) \
__clear_guest_offset(hnd, 0, nr)
+char *safe_copy_string_from_guest(XEN_GUEST_HANDLE(char) u_buf,
+ size_t size, size_t max_size);
+
#endif /* __XEN_GUEST_ACCESS_H__ */
diff --git a/xen/include/xen/hvm/iommu.h b/xen/include/xen/hvm/iommu.h
index 693346c..106e08f 100644
--- a/xen/include/xen/hvm/iommu.h
+++ b/xen/include/xen/hvm/iommu.h
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Allen Kay <allen.m.kay at intel.com>
*/
diff --git a/xen/include/xen/hvm/irq.h b/xen/include/xen/hvm/irq.h
index c89f4b1..4c9cb20 100644
--- a/xen/include/xen/hvm/irq.h
+++ b/xen/include/xen/hvm/irq.h
@@ -15,8 +15,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __XEN_HVM_IRQ_H__
@@ -88,18 +87,19 @@ struct hvm_irq_dpci {
DECLARE_BITMAP(isairq_map, NR_ISAIRQS);
/* Record of mapped Links */
uint8_t link_cnt[NR_LINK];
- struct tasklet dirq_tasklet;
};
/* Machine IRQ to guest device/intx mapping. */
struct hvm_pirq_dpci {
uint32_t flags;
+ unsigned int state;
bool_t masked;
uint16_t pending;
struct list_head digl_list;
struct domain *dom;
struct hvm_gmsi_info gmsi;
struct timer timer;
+ struct list_head softirq_list;
};
void pt_pirq_init(struct domain *, struct hvm_pirq_dpci *);
@@ -109,6 +109,7 @@ int pt_pirq_iterate(struct domain *d,
struct hvm_pirq_dpci *, void *arg),
void *arg);
+bool_t pt_pirq_softirq_active(struct hvm_pirq_dpci *);
/* Modify state of a PCI INTx wire. */
void hvm_pci_intx_assert(
struct domain *d, unsigned int device, unsigned int intx);
diff --git a/xen/include/xen/hvm/save.h b/xen/include/xen/hvm/save.h
index ae6f0bb..aa27a50 100644
--- a/xen/include/xen/hvm/save.h
+++ b/xen/include/xen/hvm/save.h
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __XEN_HVM_SAVE_H__
diff --git a/xen/include/xen/hypercall.h b/xen/include/xen/hypercall.h
index 8c55779..26cb615 100644
--- a/xen/include/xen/hypercall.h
+++ b/xen/include/xen/hypercall.h
@@ -14,6 +14,7 @@
#include <public/event_channel.h>
#include <public/tmem.h>
#include <public/version.h>
+#include <public/pmu.h>
#include <asm/hypercall.h>
#include <xsm/xsm.h>
@@ -22,11 +23,6 @@ do_ni_hypercall(
void);
extern long
-do_sched_op_compat(
- int cmd,
- unsigned long arg);
-
-extern long
do_sched_op(
int cmd,
XEN_GUEST_HANDLE_PARAM(void) arg);
@@ -109,7 +105,7 @@ do_vm_assist(
extern long
do_vcpu_op(
int cmd,
- int vcpuid,
+ unsigned int vcpuid,
XEN_GUEST_HANDLE_PARAM(void) arg);
struct vcpu;
@@ -144,6 +140,9 @@ do_tmem_op(
extern long
do_xenoprof_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg);
+extern long
+do_xenpmu_op(unsigned int op, XEN_GUEST_HANDLE_PARAM(xen_pmu_params_t) arg);
+
#ifdef CONFIG_COMPAT
extern int
@@ -160,7 +159,7 @@ compat_grant_table_op(
extern int
compat_vcpu_op(
int cmd,
- int vcpuid,
+ unsigned int vcpuid,
XEN_GUEST_HANDLE_PARAM(void) arg);
extern int
diff --git a/xen/include/xen/inttypes.h b/xen/include/xen/inttypes.h
index e90c55f..28c0053 100644
--- a/xen/include/xen/inttypes.h
+++ b/xen/include/xen/inttypes.h
@@ -12,9 +12,7 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
+ License along with the GNU C Library; If not, see <http://www.gnu.org/licenses/>. */
/*
* ISO C99: 7.8 Format conversion of integer types <inttypes.h>
diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h
index 8eb764a..8f3a20e 100644
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) Allen Kay <allen.m.kay at intel.com>
*/
@@ -25,11 +24,12 @@
#include <xen/pci.h>
#include <public/hvm/ioreq.h>
#include <public/domctl.h>
+#include <asm/device.h>
#include <asm/iommu.h>
extern bool_t iommu_enable, iommu_enabled;
extern bool_t force_iommu, iommu_verbose;
-extern bool_t iommu_workaround_bios_bug, iommu_passthrough;
+extern bool_t iommu_workaround_bios_bug, iommu_igfx, iommu_passthrough;
extern bool_t iommu_snoop, iommu_qinval, iommu_intremap;
extern bool_t iommu_hap_pt_share;
extern bool_t iommu_debug;
@@ -64,6 +64,8 @@ int arch_iommu_domain_init(struct domain *d);
int arch_iommu_populate_page_table(struct domain *d);
void arch_iommu_check_autotranslated_hwdom(struct domain *d);
+int iommu_construct(struct domain *d);
+
/* Function used internally, use iommu_domain_destroy */
void iommu_teardown(struct domain *d);
@@ -90,7 +92,6 @@ void pt_pci_init(void);
struct pirq;
int hvm_do_IRQ_dpci(struct domain *, struct pirq *);
-int dpci_ioport_intercept(ioreq_t *p);
int pt_irq_create_bind(struct domain *, xen_domctl_bind_pt_irq_t *);
int pt_irq_destroy_bind(struct domain *, xen_domctl_bind_pt_irq_t *);
@@ -114,31 +115,37 @@ void iommu_read_msi_from_ire(struct msi_desc *msi_desc, struct msi_msg *msg);
int iommu_assign_dt_device(struct domain *d, struct dt_device_node *dev);
int iommu_deassign_dt_device(struct domain *d, struct dt_device_node *dev);
int iommu_dt_domain_init(struct domain *d);
-void iommu_dt_domain_destroy(struct domain *d);
+int iommu_release_dt_devices(struct domain *d);
+
+int iommu_do_dt_domctl(struct xen_domctl *, struct domain *,
+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t));
#endif /* HAS_DEVICE_TREE */
struct page_info;
+/*
+ * Any non-zero value returned from callbacks of this type will cause the
+ * function the callback was handed to terminate its iteration. Assigning
+ * meaning of these non-zero values is left to the top level caller /
+ * callback pair.
+ */
+typedef int iommu_grdm_t(xen_pfn_t start, xen_ulong_t nr, u32 id, void *ctxt);
+
struct iommu_ops {
int (*init)(struct domain *d);
void (*hwdom_init)(struct domain *d);
-#ifdef HAS_PCI
- int (*add_device)(u8 devfn, struct pci_dev *);
- int (*enable_device)(struct pci_dev *pdev);
- int (*remove_device)(u8 devfn, struct pci_dev *);
- int (*assign_device)(struct domain *, u8 devfn, struct pci_dev *);
+ int (*add_device)(u8 devfn, device_t *dev);
+ int (*enable_device)(device_t *dev);
+ int (*remove_device)(u8 devfn, device_t *dev);
+ int (*assign_device)(struct domain *, u8 devfn, device_t *dev, u32 flag);
int (*reassign_device)(struct domain *s, struct domain *t,
- u8 devfn, struct pci_dev *);
+ u8 devfn, device_t *dev);
+#ifdef HAS_PCI
int (*get_device_group_id)(u16 seg, u8 bus, u8 devfn);
int (*update_ire_from_msi)(struct msi_desc *msi_desc, struct msi_msg *msg);
void (*read_msi_from_ire)(struct msi_desc *msi_desc, struct msi_msg *msg);
#endif /* HAS_PCI */
-#ifdef HAS_DEVICE_TREE
- int (*assign_dt_device)(struct domain *d, const struct dt_device_node *dev);
- int (*reassign_dt_device)(struct domain *s, struct domain *t,
- const struct dt_device_node *dev);
-#endif
void (*teardown)(struct domain *d);
int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn,
@@ -156,12 +163,14 @@ struct iommu_ops {
void (*crash_shutdown)(void);
void (*iotlb_flush)(struct domain *d, unsigned long gfn, unsigned int page_count);
void (*iotlb_flush_all)(struct domain *d);
+ int (*get_reserved_device_memory)(iommu_grdm_t *, void *);
void (*dump_p2m_table)(struct domain *d);
};
void iommu_suspend(void);
void iommu_resume(void);
void iommu_crash_shutdown(void);
+int iommu_get_reserved_device_memory(iommu_grdm_t *, void *);
void iommu_share_p2m_table(struct domain *d);
diff --git a/xen/include/xen/irq.h b/xen/include/xen/irq.h
index 9e0155c..0aa817e 100644
--- a/xen/include/xen/irq.h
+++ b/xen/include/xen/irq.h
@@ -172,4 +172,8 @@ unsigned int set_desc_affinity(struct irq_desc *, const cpumask_t *);
unsigned int arch_hwdom_irqs(domid_t);
#endif
+#ifndef arch_evtchn_bind_pirq
+void arch_evtchn_bind_pirq(struct domain *, int pirq);
+#endif
+
#endif /* __XEN_IRQ_H__ */
diff --git a/xen/include/xen/kexec.h b/xen/include/xen/kexec.h
index bd17747..b7d121d 100644
--- a/xen/include/xen/kexec.h
+++ b/xen/include/xen/kexec.h
@@ -10,7 +10,7 @@
typedef struct xen_kexec_reserve {
unsigned long size;
- unsigned long start;
+ paddr_t start;
} xen_kexec_reserve_t;
extern xen_kexec_reserve_t kexec_crash_area;
@@ -38,7 +38,7 @@ enum low_crashinfo {
/* Low crashinfo mode. Start as INVALID so several codepaths can set up
* defaults without needing to know the state of the others. */
extern enum low_crashinfo low_crashinfo_mode;
-extern paddr_t crashinfo_maxaddr_bits;
+extern unsigned int crashinfo_maxaddr_bits;
void kexec_early_calculations(void);
int machine_kexec_add_page(struct kexec_image *image, unsigned long vaddr,
diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
index 0bb05e5..4258912 100644
--- a/xen/include/xen/lib.h
+++ b/xen/include/xen/lib.h
@@ -8,9 +8,6 @@
#include <xen/string.h>
#include <asm/bug.h>
-void noreturn __bug(char *file, int line);
-void __warn(char *file, int line);
-
#define BUG_ON(p) do { if (unlikely(p)) BUG(); } while (0)
#define WARN_ON(p) do { if (unlikely(p)) WARN(); } while (0)
@@ -29,15 +26,6 @@ void __warn(char *file, int line);
#define BUILD_BUG_ON(cond) ((void)BUILD_BUG_ON_ZERO(cond))
#endif
-#ifndef assert_failed
-#define assert_failed(p) \
-do { \
- printk("Assertion '%s' failed, line %d, file %s\n", p , \
- __LINE__, __FILE__); \
- BUG(); \
-} while (0)
-#endif
-
#ifndef NDEBUG
#define ASSERT(p) \
do { if ( unlikely(!(p)) ) assert_failed(#p); } while (0)
@@ -92,10 +80,34 @@ extern void guest_printk(const struct domain *d, const char *format, ...)
__attribute__ ((format (printf, 2, 3)));
extern void noreturn panic(const char *format, ...)
__attribute__ ((format (printf, 1, 2)));
-extern long vm_assist(struct domain *, unsigned int, unsigned int);
+extern long vm_assist(struct domain *, unsigned int cmd, unsigned int type,
+ unsigned long valid);
extern int __printk_ratelimit(int ratelimit_ms, int ratelimit_burst);
extern int printk_ratelimit(void);
+#define gprintk(lvl, fmt, args...) \
+ printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args)
+
+#ifdef NDEBUG
+
+static inline void
+__attribute__ ((__format__ (__printf__, 2, 3)))
+dprintk(const char *lvl, const char *fmt, ...) {}
+
+static inline void
+__attribute__ ((__format__ (__printf__, 2, 3)))
+gdprintk(const char *lvl, const char *fmt, ...) {}
+
+#else
+
+#define dprintk(lvl, fmt, args...) \
+ printk(lvl "%s:%d: " fmt, __FILE__, __LINE__, ## args)
+#define gdprintk(lvl, fmt, args...) \
+ printk(XENLOG_GUEST lvl "%s:%d:%pv " fmt, \
+ __FILE__, __LINE__, current, ## args)
+
+#endif
+
/* vsprintf.c */
#define sprintf __xen_has_no_sprintf__
#define vsprintf __xen_has_no_vsprintf__
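
Under NDEBUG the new dprintk()/gdprintk() become empty inline functions that
still carry the printf format attribute, so debug-only call sites keep their
type checking in release builds. A usage sketch (do_something(), rc and gfn
are assumed locals, not taken from this import):

    int rc = do_something();

    if ( rc )
        /* Debug build: "file.c:123: op failed: rc -22"; release: no output. */
        dprintk(XENLOG_WARNING, "op failed: rc %d\n", rc);

    /* gdprintk() additionally tags the message with the current vcpu. */
    gdprintk(XENLOG_INFO, "gfn %lx not present\n", gfn);
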
diff --git a/xen/include/xen/libfdt/fdt.h b/xen/include/xen/libfdt/fdt.h
index 526aedb..f460b0d 100644
--- a/xen/include/xen/libfdt/fdt.h
+++ b/xen/include/xen/libfdt/fdt.h
@@ -19,9 +19,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
- * MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Alternatively,
*
diff --git a/xen/include/xen/libfdt/libfdt.h b/xen/include/xen/libfdt/libfdt.h
index 37349f1..d6b94a1 100644
--- a/xen/include/xen/libfdt/libfdt.h
+++ b/xen/include/xen/libfdt/libfdt.h
@@ -18,9 +18,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
- * MA 02110-1301 USA
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
* Alternatively,
*
diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h
index 59cf571..fa07d72 100644
--- a/xen/include/xen/list.h
+++ b/xen/include/xen/list.h
@@ -385,6 +385,66 @@ static inline void list_splice_init(struct list_head *list,
container_of(ptr, type, member)
/**
+ * list_first_entry - get the first element from a list
+ * @ptr: the list head to take the element from.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Note that the list is expected to be non-empty.
+ */
+#define list_first_entry(ptr, type, member) \
+ list_entry((ptr)->next, type, member)
+
+/**
+ * list_last_entry - get the last element from a list
+ * @ptr: the list head to take the element from.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Note that the list is expected to be non-empty.
+ */
+#define list_last_entry(ptr, type, member) \
+ list_entry((ptr)->prev, type, member)
+
+/**
+ * list_first_entry_or_null - get the first element from a list
+ * @ptr: the list head to take the element from.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Note that if the list is empty, it returns NULL.
+ */
+#define list_first_entry_or_null(ptr, type, member) \
+ (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
+
+/**
+ * list_last_entry_or_null - get the last element from a list
+ * @ptr: the list head to take the element from.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Note that if the list is empty, it returns NULL.
+ */
+#define list_last_entry_or_null(ptr, type, member) \
+ (!list_empty(ptr) ? list_last_entry(ptr, type, member) : NULL)
+
+/**
+ * list_next_entry - get the next element in list
+ * @pos: the type * to use as a cursor
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_next_entry(pos, member) \
+ list_entry((pos)->member.next, typeof(*(pos)), member)
+
+/**
+ * list_prev_entry - get the prev element in list
+ * @pos: the type * to use as a cursor
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_prev_entry(pos, member) \
+ list_entry((pos)->member.prev, typeof(*(pos)), member)
+
+/**
* list_for_each - iterate over a list
* @pos: the &struct list_head to use as a loop cursor.
* @head: the head for your list.
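
The list_*_entry additions mirror their Linux counterparts; a short sketch
contrasting the plain and _or_null variants (struct item is hypothetical):

    struct item {
        int val;
        struct list_head list;
    };

    static int head_val(struct list_head *head)
    {
        /* Safe on an empty list, unlike list_first_entry(): */
        struct item *it = list_first_entry_or_null(head, struct item, list);

        return it ? it->val : -1;
    }

    /* list_next_entry(pos, list) / list_prev_entry(pos, list) step a
     * cursor of type struct item * without restating the type. */
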
diff --git a/xen/include/xen/mem_access.h b/xen/include/xen/mem_access.h
index 6ceb2a4..111f1fe 100644
--- a/xen/include/xen/mem_access.h
+++ b/xen/include/xen/mem_access.h
@@ -16,23 +16,26 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XEN_ASM_MEM_ACCESS_H
#define _XEN_ASM_MEM_ACCESS_H
#include <public/memory.h>
+#include <asm/p2m.h>
#ifdef HAS_MEM_ACCESS
int mem_access_memop(unsigned long cmd,
XEN_GUEST_HANDLE_PARAM(xen_mem_access_op_t) arg);
-int mem_access_send_req(struct domain *d, mem_event_request_t *req);
+int mem_access_send_req(struct domain *d, vm_event_request_t *req);
-/* Resumes the running of the VCPU, restarting the last instruction */
-void mem_access_resume(struct domain *d);
+static inline
+void mem_access_resume(struct vcpu *v, vm_event_response_t *rsp)
+{
+ p2m_mem_access_emulate_check(v, rsp);
+}
#else
@@ -44,12 +47,16 @@ int mem_access_memop(unsigned long cmd,
}
static inline
-int mem_access_send_req(struct domain *d, mem_event_request_t *req)
+int mem_access_send_req(struct domain *d, vm_event_request_t *req)
{
return -ENOSYS;
}
-static inline void mem_access_resume(struct domain *d) {}
+static inline
+void mem_access_resume(struct vcpu *vcpu, vm_event_response_t *rsp)
+{
+ /* Nothing to do. */
+}
#endif /* HAS_MEM_ACCESS */
diff --git a/xen/include/xen/mem_event.h b/xen/include/xen/mem_event.h
deleted file mode 100644
index 4f3ad8e..0000000
--- a/xen/include/xen/mem_event.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/******************************************************************************
- * mem_event.h
- *
- * Common interface for memory event support.
- *
- * Copyright (c) 2009 Citrix Systems, Inc. (Patrick Colp)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-
-#ifndef __MEM_EVENT_H__
-#define __MEM_EVENT_H__
-
-#include <xen/sched.h>
-
-#ifdef HAS_MEM_ACCESS
-
-/* Clean up on domain destruction */
-void mem_event_cleanup(struct domain *d);
-
-/* Returns whether a ring has been set up */
-bool_t mem_event_check_ring(struct mem_event_domain *med);
-
-/* Returns 0 on success, -ENOSYS if there is no ring, -EBUSY if there is no
- * available space and the caller is a foreign domain. If the guest itself
- * is the caller, -EBUSY is avoided by sleeping on a wait queue to ensure
- * that the ring does not lose future events.
- *
- * However, the allow_sleep flag can be set to false in cases in which it is ok
- * to lose future events, and thus -EBUSY can be returned to guest vcpus
- * (handle with care!).
- *
- * In general, you must follow a claim_slot() call with either put_request() or
- * cancel_slot(), both of which are guaranteed to
- * succeed.
- */
-int __mem_event_claim_slot(struct domain *d, struct mem_event_domain *med,
- bool_t allow_sleep);
-static inline int mem_event_claim_slot(struct domain *d,
- struct mem_event_domain *med)
-{
- return __mem_event_claim_slot(d, med, 1);
-}
-
-static inline int mem_event_claim_slot_nosleep(struct domain *d,
- struct mem_event_domain *med)
-{
- return __mem_event_claim_slot(d, med, 0);
-}
-
-void mem_event_cancel_slot(struct domain *d, struct mem_event_domain *med);
-
-void mem_event_put_request(struct domain *d, struct mem_event_domain *med,
- mem_event_request_t *req);
-
-int mem_event_get_response(struct domain *d, struct mem_event_domain *med,
- mem_event_response_t *rsp);
-
-int do_mem_event_op(int op, uint32_t domain, void *arg);
-int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec,
- XEN_GUEST_HANDLE_PARAM(void) u_domctl);
-
-void mem_event_vcpu_pause(struct vcpu *v);
-void mem_event_vcpu_unpause(struct vcpu *v);
-
-#else
-
-static inline void mem_event_cleanup(struct domain *d) {}
-
-static inline bool_t mem_event_check_ring(struct mem_event_domain *med)
-{
- return 0;
-}
-
-static inline int mem_event_claim_slot(struct domain *d,
- struct mem_event_domain *med)
-{
- return -ENOSYS;
-}
-
-static inline int mem_event_claim_slot_nosleep(struct domain *d,
- struct mem_event_domain *med)
-{
- return -ENOSYS;
-}
-
-static inline
-void mem_event_cancel_slot(struct domain *d, struct mem_event_domain *med)
-{}
-
-static inline
-void mem_event_put_request(struct domain *d, struct mem_event_domain *med,
- mem_event_request_t *req)
-{}
-
-static inline
-int mem_event_get_response(struct domain *d, struct mem_event_domain *med,
- mem_event_response_t *rsp)
-{
- return -ENOSYS;
-}
-
-static inline int do_mem_event_op(int op, uint32_t domain, void *arg)
-{
- return -ENOSYS;
-}
-
-static inline
-int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec,
- XEN_GUEST_HANDLE_PARAM(void) u_domctl)
-{
- return -ENOSYS;
-}
-
-static inline void mem_event_vcpu_pause(struct vcpu *v) {}
-static inline void mem_event_vcpu_unpause(struct vcpu *v) {}
-
-#endif /* HAS_MEM_ACCESS */
-
-#endif /* __MEM_EVENT_H__ */
-
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 74a65a6..5d4b64b 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -1,40 +1,89 @@
/******************************************************************************
* include/xen/mm.h
- *
+ *
* Definitions for memory pages, frame numbers, addresses, allocations, etc.
- *
- * Note that Xen must handle several different physical 'address spaces' and
- * there is a consistent terminology for these:
- *
- * 1. gpfn/gpaddr: A guest-specific pseudo-physical frame number or address.
- * 2. gmfn/gmaddr: A machine address from the p.o.v. of a particular guest.
- * 3. mfn/maddr: A real machine frame number or address.
- * 4. pfn/paddr: Used in 'polymorphic' functions that work across all
- * address spaces, depending on context. See the pagetable
- * conversion macros in asm-x86/page.h for examples.
- * Also 'paddr_t' is big enough to store any physical address.
- *
- * This scheme provides consistent function and variable names even when
- * different guests are running in different memory-management modes.
- * 1. A guest running in auto-translated mode (e.g., shadow_mode_translate())
- * will have gpfn == gmfn and gmfn != mfn.
- * 2. A paravirtualised x86 guest will have gpfn != gmfn and gmfn == mfn.
- * 3. A paravirtualised guest with no pseudophysical overlay will have
- * gpfn == gpmfn == mfn.
- *
+ *
* Copyright (c) 2002-2006, K A Fraser <keir at xensource.com>
+ *
+ * +---------------------+
+ * Xen Memory Management
+ * +---------------------+
+ *
+ * Xen has to handle many different address spaces. It is important not to
+ * get these spaces mixed up. The following is a consistent terminology which
+ * should be adhered to.
+ *
+ * mfn: Machine Frame Number
+ * The values Xen puts into its own pagetables. This is the host physical
+ * memory address space with RAM, MMIO etc.
+ *
+ * gfn: Guest Frame Number
+ * The values a guest puts in its own pagetables. For an auto-translated
+ * guest (hardware assisted with 2nd stage translation, or shadowed), gfn !=
+ * mfn. For a non-translated guest which is aware of Xen, gfn == mfn.
+ *
+ * pfn: Pseudophysical Frame Number
+ * A linear idea of a guest physical address space. For an auto-translated
+ * guest, pfn == gfn while for a non-translated guest, pfn != gfn.
+ *
+ * WARNING: Some of these terms have changed over time while others have been
+ * used inconsistently, meaning that a lot of existing code does not match the
+ * definitions above. New code should use these terms as described here, and
+ * over time older code should be corrected to be consistent.
+ *
+ * An incomplete list of larger work areas:
+ * - Phase out the use of 'pfn' from the x86 pagetable code. Callers should
+ * know explicitly whether they are talking about mfns or gfns.
+ * - Phase out the use of 'pfn' from the ARM mm code. A cursory glance
+ * suggests that 'mfn' and 'pfn' are currently used interchangeably, where
+ * 'mfn' is the appropriate term to use.
+ * - Phase out the use of gpfn/gmfn where pfn/mfn are meant. This excludes
+ * the x86 shadow code, which uses gmfn/smfn pairs with different,
+ * documented, meanings.
*/
#ifndef __XEN_MM_H__
#define __XEN_MM_H__
+#include <xen/compiler.h>
#include <xen/types.h>
#include <xen/list.h>
#include <xen/spinlock.h>
+#include <xen/typesafe.h>
+
+TYPE_SAFE(unsigned long, mfn);
+#define PRI_mfn "05lx"
+#define INVALID_MFN (~0UL)
+
+#ifndef mfn_t
+#define mfn_t /* Grep fodder: mfn_t, _mfn() and mfn_x() are defined above */
+#undef mfn_t
+#endif
+
+TYPE_SAFE(unsigned long, gfn);
+#define PRI_gfn "05lx"
+#define INVALID_GFN (~0UL)
+
+#ifndef gfn_t
+#define gfn_t /* Grep fodder: gfn_t, _gfn() and gfn_x() are defined above */
+#undef gfn_t
+#endif
+
+TYPE_SAFE(unsigned long, pfn);
+#define PRI_pfn "05lx"
+#define INVALID_PFN (~0UL)
+
+#ifndef pfn_t
+#define pfn_t /* Grep fodder: pfn_t, _pfn() and pfn_x() are defined above */
+#undef pfn_t
+#endif
-struct domain;
struct page_info;
+void put_page(struct page_info *);
+int get_page(struct page_info *, struct domain *);
+struct domain *__must_check page_get_owner_and_reference(struct page_info *);
+
/* Boot-time allocator. Turns into generic allocator after bootstrap. */
void init_boot_pages(paddr_t ps, paddr_t pe);
unsigned long alloc_boot_pages(
@@ -55,7 +104,12 @@ int map_pages_to_xen(
unsigned long nr_mfns,
unsigned int flags);
void destroy_xen_mappings(unsigned long v, unsigned long e);
-
+/*
+ * Create only non-leaf page table entries for the
+ * page range in Xen virtual address space.
+ */
+int populate_pt_range(unsigned long virt, unsigned long mfn,
+ unsigned long nr_mfns);
/* Claim handling */
unsigned long domain_adjust_tot_pages(struct domain *d, long pages);
int domain_set_outstanding_pages(struct domain *d, unsigned long pages);
@@ -120,8 +174,12 @@ struct npfec {
#define MEMF_no_dma (1U<<_MEMF_no_dma)
#define _MEMF_exact_node 4
#define MEMF_exact_node (1U<<_MEMF_exact_node)
+#define _MEMF_no_owner 5
+#define MEMF_no_owner (1U<<_MEMF_no_owner)
#define _MEMF_node 8
-#define MEMF_node(n) ((((n)+1)&0xff)<<_MEMF_node)
+#define MEMF_node_mask ((1U << (8 * sizeof(nodeid_t))) - 1)
+#define MEMF_node(n) ((((n) + 1) & MEMF_node_mask) << _MEMF_node)
+#define MEMF_get_node(f) ((((f) >> _MEMF_node) - 1) & MEMF_node_mask)
#define _MEMF_bits 24
#define MEMF_bits(n) ((n)<<_MEMF_bits)
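
The reworked MEMF_node() encoding stores the node id plus one, masked to the
width of nodeid_t, so an absent node decodes to NUMA_NO_NODE rather than to
node 0. A worked round trip, assuming the usual one-byte nodeid_t:

    unsigned int memflags = MEMF_no_owner | MEMF_node(3);

    /* MEMF_node(3) == ((3 + 1) & 0xff) << 8 == 0x400 */
    ASSERT(MEMF_get_node(memflags) == 3);

    /* Nothing encoded: ((0 >> 8) - 1) & 0xff == 0xff == NUMA_NO_NODE */
    ASSERT(MEMF_get_node(0) == NUMA_NO_NODE);
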
@@ -172,6 +230,11 @@ page_list_first(const struct page_list_head *head)
return head->next;
}
static inline struct page_info *
+page_list_last(const struct page_list_head *head)
+{
+ return head->tail;
+}
+static inline struct page_info *
page_list_next(const struct page_info *page,
const struct page_list_head *head)
{
@@ -329,10 +392,12 @@ page_list_splice(struct page_list_head *list, struct page_list_head *head)
# define INIT_PAGE_LIST_HEAD INIT_LIST_HEAD
# define INIT_PAGE_LIST_ENTRY INIT_LIST_HEAD
# define page_list_empty list_empty
-# define page_list_first(hd) list_entry((hd)->next, \
- struct page_info, list)
-# define page_list_next(pg, hd) list_entry((pg)->list.next, \
- struct page_info, list)
+# define page_list_first(hd) \
+ list_first_entry(hd, struct page_info, list)
+# define page_list_last(hd) \
+ list_last_entry(hd, struct page_info, list)
+# define page_list_next(pg, hd) list_next_entry(pg, list)
+# define page_list_prev(pg, hd) list_prev_entry(pg, list)
# define page_list_add(pg, hd) list_add(&(pg)->list, hd)
# define page_list_add_tail(pg, hd) list_add_tail(&(pg)->list, hd)
# define page_list_del(pg, hd) list_del(&(pg)->list)
diff --git a/xen/include/xen/multiboot.h b/xen/include/xen/multiboot.h
index 67483ed..d1b43e1 100644
--- a/xen/include/xen/multiboot.h
+++ b/xen/include/xen/multiboot.h
@@ -12,8 +12,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+ along with this program; If not, see <http://www.gnu.org/licenses/>. */
#ifndef __MULTIBOOT_H__
#define __MULTIBOOT_H__
diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
index ac4b391..7aef1a8 100644
--- a/xen/include/xen/numa.h
+++ b/xen/include/xen/numa.h
@@ -7,7 +7,8 @@
#define NODES_SHIFT 0
#endif
-#define NUMA_NO_NODE 0xFF
+#define NUMA_NO_NODE 0xFF
+#define NUMA_NO_DISTANCE 0xFF
#define MAX_NUMNODES (1 << NODES_SHIFT)
diff --git a/xen/include/xen/p2m-common.h b/xen/include/xen/p2m-common.h
index 29f3628..47c40c7 100644
--- a/xen/include/xen/p2m-common.h
+++ b/xen/include/xen/p2m-common.h
@@ -1,12 +1,12 @@
#ifndef _XEN_P2M_COMMON_H
#define _XEN_P2M_COMMON_H
-#include <public/mem_event.h>
+#include <public/vm_event.h>
/*
* Additional access types, which are used to further restrict
 * the permissions given by the p2m_type_t memory type. Violations
- * caused by p2m_access_t restrictions are sent to the mem_event
+ * caused by p2m_access_t restrictions are sent to the vm_event
* interface.
*
* The access permissions are soft state: when any ambiguous change of page
@@ -44,4 +44,17 @@ int unmap_mmio_regions(struct domain *d,
unsigned long nr,
unsigned long mfn);
+/*
+ * Set access type for a region of gfns.
+ * If gfn == INVALID_GFN, sets the default access type.
+ */
+long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
+ uint32_t start, uint32_t mask, xenmem_access_t access);
+
+/*
+ * Get access type for a gfn.
+ * If gfn == INVALID_GFN, gets the default access type.
+ */
+int p2m_get_mem_access(struct domain *d, gfn_t gfn, xenmem_access_t *access);
+
#endif /* _XEN_P2M_COMMON_H */
diff --git a/xen/include/xen/paging.h b/xen/include/xen/paging.h
index 123cc58..214bde5 100644
--- a/xen/include/xen/paging.h
+++ b/xen/include/xen/paging.h
@@ -7,7 +7,7 @@
#include <asm/paging.h>
#include <asm/p2m.h>
-#elif defined CONFIG_SHADOW
+#elif defined CONFIG_SHADOW_PAGING
#include <asm/shadow.h>
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 91520bc..a5aef55 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -13,6 +13,8 @@
#include <xen/irq.h>
#include <xen/pci_regs.h>
#include <xen/pfn.h>
+#include <asm/device.h>
+#include <asm/numa.h>
#include <asm/pci.h>
/*
@@ -24,6 +26,7 @@
* 7:3 = slot
* 2:0 = function
*/
+#define PCI_SEG(sbdf) (((sbdf) >> 16) & 0xffff)
#define PCI_BUS(bdf) (((bdf) >> 8) & 0xff)
#define PCI_SLOT(bdf) (((bdf) >> 3) & 0x1f)
#define PCI_FUNC(bdf) ((bdf) & 0x07)
@@ -31,6 +34,9 @@
#define PCI_DEVFN2(bdf) ((bdf) & 0xff)
#define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f))
#define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff))
+#define PCI_SBDF(s,b,d,f) ((((s) & 0xffff) << 16) | PCI_BDF(b,d,f))
+#define PCI_SBDF2(s,bdf) ((((s) & 0xffff) << 16) | ((bdf) & 0xffff))
+#define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
struct pci_dev_info {
bool_t is_extfn;
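
The new PCI_SBDF*() macros put a 16-bit segment in bits 31:16 above the
existing bus/device/function layout. A worked round trip (values arbitrary):

    uint32_t sbdf = PCI_SBDF(1, 0x02, 0x03, 5); /* == 0x0001021d */

    ASSERT(PCI_SEG(sbdf)  == 1);    /* bits 31:16 */
    ASSERT(PCI_BUS(sbdf)  == 0x02); /* bits 15:8  */
    ASSERT(PCI_SLOT(sbdf) == 0x03); /* bits  7:3  */
    ASSERT(PCI_FUNC(sbdf) == 5);    /* bits  2:0  */
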
@@ -56,6 +62,8 @@ struct pci_dev {
u8 phantom_stride;
+ nodeid_t node; /* NUMA node */
+
enum pdev_type {
DEV_TYPE_PCI_UNKNOWN,
DEV_TYPE_PCIe_ENDPOINT,
@@ -89,7 +97,7 @@ struct pci_dev {
extern spinlock_t pcidevs_lock;
bool_t pci_known_segment(u16 seg);
-int pci_device_detect(u16 seg, u8 bus, u8 dev, u8 func);
+bool_t pci_device_detect(u16 seg, u8 bus, u8 dev, u8 func);
int scan_pci_devices(void);
enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn);
int find_upstream_bridge(u16 seg, u8 *bus, u8 *devfn, u8 *secbus);
@@ -99,13 +107,13 @@ struct pci_dev *pci_lock_domain_pdev(
void setup_hwdom_pci_devices(struct domain *,
int (*)(u8 devfn, struct pci_dev *));
-void pci_release_devices(struct domain *d);
+int pci_release_devices(struct domain *d);
int pci_add_segment(u16 seg);
const unsigned long *pci_get_ro_map(u16 seg);
-int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *);
+int pci_add_device(u16 seg, u8 bus, u8 devfn,
+ const struct pci_dev_info *, nodeid_t node);
int pci_remove_device(u16 seg, u8 bus, u8 devfn);
int pci_ro_device(int seg, int bus, int devfn);
-void arch_pci_ro_device(int seg, int bdf);
int pci_hide_device(int bus, int devfn);
struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn);
diff --git a/xen/include/xen/perfc_defn.h b/xen/include/xen/perfc_defn.h
index 3ac7b45..526002d 100644
--- a/xen/include/xen/perfc_defn.h
+++ b/xen/include/xen/perfc_defn.h
@@ -21,10 +21,17 @@ PERFCOUNTER(dom_init, "sched: dom_init")
PERFCOUNTER(dom_destroy, "sched: dom_destroy")
PERFCOUNTER(vcpu_init, "sched: vcpu_init")
PERFCOUNTER(vcpu_destroy, "sched: vcpu_destroy")
+PERFCOUNTER(vcpu_sleep, "sched: vcpu_sleep")
+PERFCOUNTER(vcpu_wake_running, "sched: vcpu_wake_running")
+PERFCOUNTER(vcpu_wake_onrunq, "sched: vcpu_wake_onrunq")
+PERFCOUNTER(vcpu_wake_runnable, "sched: vcpu_wake_runnable")
+PERFCOUNTER(vcpu_wake_not_runnable, "sched: vcpu_wake_not_runnable")
+PERFCOUNTER(tickle_idlers_none, "sched: tickle_idlers_none")
+PERFCOUNTER(tickle_idlers_some, "sched: tickle_idlers_some")
+PERFCOUNTER(vcpu_check, "sched: vcpu_check")
/* credit specific counters */
PERFCOUNTER(delay_ms, "csched: delay")
-PERFCOUNTER(vcpu_check, "csched: vcpu_check")
PERFCOUNTER(acct_run, "csched: acct_run")
PERFCOUNTER(acct_no_work, "csched: acct_no_work")
PERFCOUNTER(acct_balance, "csched: acct_balance")
@@ -32,15 +39,8 @@ PERFCOUNTER(acct_reorder, "csched: acct_reorder")
PERFCOUNTER(acct_min_credit, "csched: acct_min_credit")
PERFCOUNTER(acct_vcpu_active, "csched: acct_vcpu_active")
PERFCOUNTER(acct_vcpu_idle, "csched: acct_vcpu_idle")
-PERFCOUNTER(vcpu_sleep, "csched: vcpu_sleep")
-PERFCOUNTER(vcpu_wake_running, "csched: vcpu_wake_running")
-PERFCOUNTER(vcpu_wake_onrunq, "csched: vcpu_wake_onrunq")
-PERFCOUNTER(vcpu_wake_runnable, "csched: vcpu_wake_runnable")
-PERFCOUNTER(vcpu_wake_not_runnable, "csched: vcpu_wake_not_runnable")
PERFCOUNTER(vcpu_park, "csched: vcpu_park")
PERFCOUNTER(vcpu_unpark, "csched: vcpu_unpark")
-PERFCOUNTER(tickle_idlers_none, "csched: tickle_idlers_none")
-PERFCOUNTER(tickle_idlers_some, "csched: tickle_idlers_some")
PERFCOUNTER(load_balance_idle, "csched: load_balance_idle")
PERFCOUNTER(load_balance_over, "csched: load_balance_over")
PERFCOUNTER(load_balance_other, "csched: load_balance_other")
@@ -51,6 +51,19 @@ PERFCOUNTER(migrate_running, "csched: migrate_running")
PERFCOUNTER(migrate_kicked_away, "csched: migrate_kicked_away")
PERFCOUNTER(vcpu_hot, "csched: vcpu_hot")
+/* credit2 specific counters */
+PERFCOUNTER(burn_credits_t2c, "csched2: burn_credits_t2c")
+PERFCOUNTER(upd_max_weight_quick, "csched2: update_max_weight_quick")
+PERFCOUNTER(upd_max_weight_full, "csched2: update_max_weight_full")
+PERFCOUNTER(migrate_requested, "csched2: migrate_requested")
+PERFCOUNTER(migrate_on_runq, "csched2: migrate_on_runq")
+PERFCOUNTER(migrate_no_runq, "csched2: migrate_no_runq")
+PERFCOUNTER(runtime_min_timer, "csched2: runtime_min_timer")
+PERFCOUNTER(runtime_max_timer, "csched2: runtime_max_timer")
+PERFCOUNTER(migrated, "csched2: migrated")
+PERFCOUNTER(migrate_resisted, "csched2: migrate_resisted")
+PERFCOUNTER(credit_reset, "csched2: credit_reset")
+
PERFCOUNTER(need_flush_tlb_flush, "PG_need_flush tlb flushes")
/*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff --git a/xen/include/xen/radix-tree.h b/xen/include/xen/radix-tree.h
index faf2545..ec40cf1 100644
--- a/xen/include/xen/radix-tree.h
+++ b/xen/include/xen/radix-tree.h
@@ -14,8 +14,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _XEN_RADIX_TREE_H
#define _XEN_RADIX_TREE_H
diff --git a/xen/include/xen/random.h b/xen/include/xen/random.h
index 7c43d87..b950f03 100644
--- a/xen/include/xen/random.h
+++ b/xen/include/xen/random.h
@@ -3,4 +3,7 @@
unsigned int get_random(void);
+/* The value remains unchanged once initialized at each boot. */
+extern unsigned int boot_random;
+
#endif /* __XEN_RANDOM_H__ */
diff --git a/xen/include/xen/rangeset.h b/xen/include/xen/rangeset.h
index 5ed6817..aa64082 100644
--- a/xen/include/xen/rangeset.h
+++ b/xen/include/xen/rangeset.h
@@ -10,6 +10,8 @@
#ifndef __XEN_RANGESET_H__
#define __XEN_RANGESET_H__
+#include <xen/types.h>
+
struct domain;
struct rangeset;
@@ -50,17 +52,17 @@ void rangeset_limit(
#define _RANGESETF_prettyprint_hex 0
#define RANGESETF_prettyprint_hex (1U << _RANGESETF_prettyprint_hex)
-int __must_check rangeset_is_empty(
- struct rangeset *r);
+bool_t __must_check rangeset_is_empty(
+ const struct rangeset *r);
/* Add/remove/query a numeric range. */
int __must_check rangeset_add_range(
struct rangeset *r, unsigned long s, unsigned long e);
int __must_check rangeset_remove_range(
struct rangeset *r, unsigned long s, unsigned long e);
-int __must_check rangeset_contains_range(
+bool_t __must_check rangeset_contains_range(
struct rangeset *r, unsigned long s, unsigned long e);
-int __must_check rangeset_overlaps_range(
+bool_t __must_check rangeset_overlaps_range(
struct rangeset *r, unsigned long s, unsigned long e);
int rangeset_report_ranges(
struct rangeset *r, unsigned long s, unsigned long e,
@@ -71,7 +73,7 @@ int __must_check rangeset_add_singleton(
struct rangeset *r, unsigned long s);
int __must_check rangeset_remove_singleton(
struct rangeset *r, unsigned long s);
-int __must_check rangeset_contains_singleton(
+bool_t __must_check rangeset_contains_singleton(
struct rangeset *r, unsigned long s);
/* swap contents */
diff --git a/xen/include/xen/rbtree.h b/xen/include/xen/rbtree.h
index b16dc50..f93c4d5 100644
--- a/xen/include/xen/rbtree.h
+++ b/xen/include/xen/rbtree.h
@@ -13,8 +13,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __RBTREE_H__
diff --git a/xen/include/xen/rcupdate.h b/xen/include/xen/rcupdate.h
index 91ded98..557a7b1 100644
--- a/xen/include/xen/rcupdate.h
+++ b/xen/include/xen/rcupdate.h
@@ -12,8 +12,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) IBM Corporation, 2001
*
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
index 7cc25c6..dbe7cab 100644
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -165,7 +165,6 @@ struct scheduler {
void (*tick_resume) (const struct scheduler *, unsigned int);
};
-extern const struct scheduler sched_sedf_def;
extern const struct scheduler sched_credit_def;
extern const struct scheduler sched_credit2_def;
extern const struct scheduler sched_arinc653_def;
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 46fc6e3..73d3bc8 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -23,7 +23,7 @@
#include <public/domctl.h>
#include <public/sysctl.h>
#include <public/vcpu.h>
-#include <public/mem_event.h>
+#include <public/vm_event.h>
#include <public/event_channel.h>
#ifdef CONFIG_COMPAT
@@ -79,6 +79,7 @@ extern domid_t hardware_domid;
struct evtchn
{
+ spinlock_t lock;
#define ECS_FREE 0 /* Channel is available for use. */
#define ECS_RESERVED 1 /* Channel is reserved. */
#define ECS_UNBOUND 2 /* Channel is waiting to bind to a remote domain. */
@@ -128,7 +129,7 @@ struct evtchn
#endif
} ssid;
#endif
-};
+} __attribute__((aligned(64)));
int evtchn_init(struct domain *d); /* from domain_create */
void evtchn_destroy(struct domain *d); /* from domain_kill */
@@ -214,11 +215,15 @@ struct vcpu
unsigned long pause_flags;
atomic_t pause_count;
- /* VCPU paused for mem_event replies. */
- atomic_t mem_event_pause_count;
+ /* VCPU paused for vm_event replies. */
+ atomic_t vm_event_pause_count;
/* VCPU paused by system controller. */
int controller_pause_count;
+ /* Maptrack */
+ unsigned int maptrack_head;
+ unsigned int maptrack_tail;
+
/* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */
evtchn_port_t virq_to_evtchn[NR_VIRQS];
spinlock_t virq_lock;
@@ -257,8 +262,8 @@ struct vcpu
#define domain_unlock(d) spin_unlock_recursive(&(d)->domain_lock)
#define domain_is_locked(d) spin_is_locked(&(d)->domain_lock)
-/* Memory event */
-struct mem_event_domain
+/* VM event */
+struct vm_event_domain
{
/* ring lock */
spinlock_t ring_lock;
@@ -269,10 +274,10 @@ struct mem_event_domain
void *ring_page;
struct page_info *ring_pg_struct;
/* front-end ring */
- mem_event_front_ring_t front_ring;
+ vm_event_front_ring_t front_ring;
/* event channel port (vcpu0 only) */
int xen_port;
- /* mem_event bit for vcpu->pause_flags */
+ /* vm_event bit for vcpu->pause_flags */
int pause_flag;
/* list of vcpus waiting for room in the ring */
struct waitqueue_head wq;
@@ -282,14 +287,14 @@ struct mem_event_domain
unsigned int last_vcpu_wake_up;
};
-struct mem_event_per_domain
+struct vm_event_per_domain
{
/* Memory sharing support */
- struct mem_event_domain share;
+ struct vm_event_domain share;
/* Memory paging support */
- struct mem_event_domain paging;
- /* Memory access support */
- struct mem_event_domain access;
+ struct vm_event_domain paging;
+ /* VM event monitor support */
+ struct vm_event_domain monitor;
};
struct evtchn_port_ops;
@@ -306,6 +311,9 @@ struct domain
{
domid_t domain_id;
+ unsigned int max_vcpus;
+ struct vcpu **vcpu;
+
shared_info_t *shared_info; /* shared data area */
spinlock_t domain_lock;
@@ -314,13 +322,11 @@ struct domain
struct page_list_head page_list; /* linked list */
struct page_list_head xenpage_list; /* linked list (size xenheap_pages) */
unsigned int tot_pages; /* number of pages currently possessed */
+ unsigned int xenheap_pages; /* # pages allocated from Xen heap */
unsigned int outstanding_pages; /* pages claimed but not possessed */
unsigned int max_pages; /* maximum value for tot_pages */
atomic_t shr_pages; /* number of shared pages */
atomic_t paged_pages; /* number of paged-out pages */
- unsigned int xenheap_pages; /* # pages allocated from Xen heap */
-
- unsigned int max_vcpus;
/* Scheduling. */
void *sched_priv; /* scheduler-specific data */
@@ -335,8 +341,9 @@ struct domain
/* Event channel information. */
struct evtchn *evtchn; /* first bucket only */
struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */
- unsigned int max_evtchns;
- unsigned int max_evtchn_port;
+ unsigned int max_evtchns; /* number supported by ABI */
+ unsigned int max_evtchn_port; /* max permitted port number */
+ unsigned int valid_evtchns; /* number of allocated event channels */
spinlock_t event_lock;
const struct evtchn_port_ops *evtchn_port_ops;
struct evtchn_fifo_domain *evtchn_fifo;
@@ -347,15 +354,19 @@ struct domain
* Interrupt to event-channel mappings and other per-guest-pirq data.
* Protected by the domain's event-channel spinlock.
*/
- unsigned int nr_pirqs;
struct radix_tree_root pirq_tree;
-
- /* I/O capabilities (access to IRQs and memory-mapped I/O). */
- struct rangeset *iomem_caps;
- struct rangeset *irq_caps;
+ unsigned int nr_pirqs;
enum guest_type guest_type;
+ /* Is this guest dying (i.e., a zombie)? */
+ enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;
+
+ /* Domain is paused by controller software? */
+ int controller_pause_count;
+
+ int64_t time_offset_seconds;
+
#ifdef HAS_PASSTHROUGH
/* Does this guest need iommu mappings (-1 meaning "being set up")? */
s8 need_iommu;
@@ -364,16 +375,14 @@ struct domain
bool_t auto_node_affinity;
/* Is this guest fully privileged (aka dom0)? */
bool_t is_privileged;
- /* Which guest this guest has privileges on */
- struct domain *target;
- /* Is this guest being debugged by dom0? */
- bool_t debugger_attached;
- /* Is this guest dying (i.e., a zombie)? */
- enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;
- /* Domain is paused by controller software? */
- int controller_pause_count;
/* Domain's VCPUs are pinned 1:1 to physical CPUs? */
bool_t is_pinned;
+ /* Non-migratable and non-restorable? */
+ bool_t disable_migrate;
+ /* Is this guest being debugged by dom0? */
+ bool_t debugger_attached;
+ /* Which guest this guest has privileges on */
+ struct domain *target;
/* Are any VCPUs polling event channels (SCHEDOP_poll)? */
#if MAX_VIRT_CPUS <= BITS_PER_LONG
@@ -382,6 +391,10 @@ struct domain
unsigned long *poll_mask;
#endif
+ /* I/O capabilities (access to IRQs and memory-mapped I/O). */
+ struct rangeset *iomem_caps;
+ struct rangeset *irq_caps;
+
/* Guest has shut down (inc. reason code)? */
spinlock_t shutdown_lock;
bool_t is_shutting_down; /* in process of shutting down? */
@@ -390,15 +403,12 @@ struct domain
/* If this is not 0, send suspend notification here instead of
* raising DOM_EXC */
- int suspend_evtchn;
+ evtchn_port_t suspend_evtchn;
atomic_t pause_count;
-
- unsigned long vm_assist;
-
atomic_t refcnt;
- struct vcpu **vcpu;
+ unsigned long vm_assist;
/* Bitmask of CPUs which are holding onto this domain's state. */
cpumask_var_t domain_dirty_cpumask;
@@ -418,7 +428,6 @@ struct domain
/* OProfile support. */
struct xenoprof *xenoprof;
- int32_t time_offset_seconds;
/* Domain watchdog. */
#define NR_DOMAIN_WATCHDOG_TIMERS 2
@@ -439,11 +448,8 @@ struct domain
struct lock_profile_qhead profile_head;
- /* Non-migratable and non-restoreable? */
- bool_t disable_migrate;
-
- /* Various mem_events */
- struct mem_event_per_domain *mem_event;
+ /* Various vm_events */
+ struct vm_event_per_domain *vm_event;
/*
* Can be specified by the user. If that is not the case, it is
@@ -525,8 +531,13 @@ static inline void get_knownalive_domain(struct domain *d)
int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity);
void domain_update_node_affinity(struct domain *d);
-struct domain *domain_create(
- domid_t domid, unsigned int domcr_flags, uint32_t ssidref);
+/*
+ * Create a domain: the configuration is only necessary for a real domain
+ * (i.e. !DOMCRF_dummy, which excludes the idle domain).
+ */
+struct domain *domain_create(domid_t domid, unsigned int domcr_flags,
+ uint32_t ssidref,
+ struct xen_arch_domainconfig *config);
/* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
#define _DOMCRF_hvm 0
#define DOMCRF_hvm (1U<<_DOMCRF_hvm)
@@ -793,6 +804,11 @@ static inline int domain_pause_by_systemcontroller_nosync(struct domain *d)
{
return __domain_pause_by_systemcontroller(d, domain_pause_nosync);
}
+
+/* domain_pause() but safe against trying to pause current. */
+void domain_pause_except_self(struct domain *d);
+void domain_unpause_except_self(struct domain *d);
+
void cpu_init(void);
struct scheduler;
@@ -833,7 +849,7 @@ void watchdog_domain_destroy(struct domain *d);
/* This check is for functionality specific to a control domain */
#define is_control_domain(_d) ((_d)->is_privileged)
-#define VM_ASSIST(_d,_t) (test_bit((_t), &(_d)->vm_assist))
+#define VM_ASSIST(d, t) (test_bit(VMASST_TYPE_ ## t, &(d)->vm_assist))
#define is_pv_domain(d) ((d)->guest_type == guest_type_pv)
#define is_pv_vcpu(v) (is_pv_domain((v)->domain))
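
VM_ASSIST() now token-pastes its second argument onto VMASST_TYPE_, so call
sites name the assist directly instead of passing a raw constant. For
example, with the existing VMASST_TYPE_writable_pagetables from public/xen.h:

    if ( VM_ASSIST(d, writable_pagetables) )
    {
        /* Expands to:
         *   test_bit(VMASST_TYPE_writable_pagetables, &(d)->vm_assist)
         */
    }
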
diff --git a/xen/include/xen/shared.h b/xen/include/xen/shared.h
index 19c79fd..3f5b283 100644
--- a/xen/include/xen/shared.h
+++ b/xen/include/xen/shared.h
@@ -18,7 +18,7 @@ typedef union {
#define __shared_info(d, s, field) \
(*(!has_32bit_shinfo(d) ? \
(typeof(&(s)->compat.field))&(s)->native.field : \
- (typeof(&(s)->compat.field))&(s)->compat.field))
+ &(s)->compat.field))
typedef union {
struct vcpu_info native;
@@ -29,7 +29,7 @@ typedef union {
#define __vcpu_info(v, i, field) \
(*(!has_32bit_shinfo((v)->domain) ? \
(typeof(&(i)->compat.field))&(i)->native.field : \
- (typeof(&(i)->compat.field))&(i)->compat.field))
+ &(i)->compat.field))
#else
@@ -41,8 +41,6 @@ typedef struct vcpu_info vcpu_info_t;
#endif
-extern vcpu_info_t dummy_vcpu_info;
-
#define shared_info(d, field) __shared_info(d, (d)->shared_info, field)
#define vcpu_info(v, field) __vcpu_info(v, (v)->vcpu_info, field)
diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h
index eda9b2e..fb0438e 100644
--- a/xen/include/xen/spinlock.h
+++ b/xen/include/xen/spinlock.h
@@ -80,8 +80,7 @@ struct lock_profile_qhead {
static struct lock_profile *__lock_profile_##name \
__used_section(".lockprofile.data") = \
&__lock_profile_data_##name
-#define _SPIN_LOCK_UNLOCKED(x) { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, \
- _LOCK_DEBUG, x }
+#define _SPIN_LOCK_UNLOCKED(x) { { 0 }, 0xfffu, 0, _LOCK_DEBUG, x }
#define SPIN_LOCK_UNLOCKED _SPIN_LOCK_UNLOCKED(NULL)
#define DEFINE_SPINLOCK(l) \
spinlock_t l = _SPIN_LOCK_UNLOCKED(NULL); \
@@ -117,8 +116,7 @@ extern void spinlock_profile_reset(unsigned char key);
struct lock_profile_qhead { };
-#define SPIN_LOCK_UNLOCKED \
- { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG }
+#define SPIN_LOCK_UNLOCKED { { 0 }, 0xfffu, 0, _LOCK_DEBUG }
#define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED
#define spin_lock_init_prof(s, l) spin_lock_init(&((s)->l))
@@ -127,8 +125,18 @@ struct lock_profile_qhead { };
#endif
+typedef union {
+ u32 head_tail;
+ struct {
+ u16 head;
+ u16 tail;
+ };
+} spinlock_tickets_t;
+
+#define SPINLOCK_TICKET_INC { .head_tail = 0x10000, }
+
typedef struct spinlock {
- raw_spinlock_t raw;
+ spinlock_tickets_t tickets;
u16 recurse_cpu:12;
u16 recurse_cnt:4;
struct lock_debug debug;
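
The spinlock now embeds a ticket pair: a locker takes the current tail as its
ticket and waits for head to catch up, giving FIFO fairness, and
SPINLOCK_TICKET_INC (0x10000) bumps the tail half of head_tail in a single
32-bit add on the little-endian layouts Xen targets. A non-atomic sketch of
the discipline (the real code uses one atomic fetch-and-add; these helper
names are illustrative only):

    static void sketch_lock(spinlock_t *lock)
    {
        /* Really a single atomic fetch-and-add of SPINLOCK_TICKET_INC: */
        u16 ticket = lock->tickets.tail++;

        while ( lock->tickets.head != ticket )
            cpu_relax();        /* spin until our turn comes up */
    }

    static void sketch_unlock(spinlock_t *lock)
    {
        lock->tickets.head++;   /* hand the lock to the next ticket */
    }
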
diff --git a/xen/include/xen/symbols.h b/xen/include/xen/symbols.h
index 87cd77d..1fa0537 100644
--- a/xen/include/xen/symbols.h
+++ b/xen/include/xen/symbols.h
@@ -11,4 +11,7 @@ const char *symbols_lookup(unsigned long addr,
unsigned long *offset,
char *namebuf);
+int xensyms_read(uint32_t *symnum, char *type,
+ uint64_t *address, char *name);
+
#endif /*_XEN_SYMBOLS_H*/
diff --git a/xen/include/xen/time.h b/xen/include/xen/time.h
index 709501f..da4e8d7 100644
--- a/xen/include/xen/time.h
+++ b/xen/include/xen/time.h
@@ -64,11 +64,11 @@ extern void update_vcpu_system_time(struct vcpu *v);
extern void update_domain_wallclock_time(struct domain *d);
extern void do_settime(
- unsigned long secs, unsigned long nsecs, u64 system_time_base);
+ unsigned long secs, unsigned int nsecs, u64 system_time_base);
extern void send_timer_event(struct vcpu *v);
-void domain_set_time_offset(struct domain *d, int32_t time_offset_seconds);
+void domain_set_time_offset(struct domain *d, int64_t time_offset_seconds);
#include <asm/time.h>
diff --git a/xen/include/xen/tmem.h b/xen/include/xen/tmem.h
index 5dbf9d5..32a542a 100644
--- a/xen/include/xen/tmem.h
+++ b/xen/include/xen/tmem.h
@@ -9,6 +9,9 @@
#ifndef __XEN_TMEM_H__
#define __XEN_TMEM_H__
+struct xen_sysctl_tmem_op;
+
+extern int tmem_control(struct xen_sysctl_tmem_op *op);
extern void tmem_destroy(void *);
extern void *tmem_relinquish_pages(unsigned int, unsigned int);
extern unsigned long tmem_freeable_pages(void);
diff --git a/xen/include/xen/tmem_xen.h b/xen/include/xen/tmem_xen.h
index 885ee21..0fdbf68 100644
--- a/xen/include/xen/tmem_xen.h
+++ b/xen/include/xen/tmem_xen.h
@@ -285,7 +285,7 @@ static inline int tmem_get_tmemop_from_client(tmem_op_t *op, tmem_cli_op_t uops)
#ifdef CONFIG_COMPAT
if ( has_hvm_container_vcpu(current) ?
hvm_guest_x86_mode(current) != 8 :
- is_pv_32on64_vcpu(current) )
+ is_pv_32bit_vcpu(current) )
{
int rc;
enum XLAT_tmem_op_u u;
@@ -297,15 +297,11 @@ static inline int tmem_get_tmemop_from_client(tmem_op_t *op, tmem_cli_op_t uops)
switch ( cop.cmd )
{
case TMEM_NEW_POOL: u = XLAT_tmem_op_u_creat; break;
- case TMEM_CONTROL: u = XLAT_tmem_op_u_ctrl; break;
case TMEM_AUTH: u = XLAT_tmem_op_u_creat; break;
case TMEM_RESTORE_NEW:u = XLAT_tmem_op_u_creat; break;
default: u = XLAT_tmem_op_u_gen ; break;
}
-#define XLAT_tmem_op_HNDL_u_ctrl_buf(_d_, _s_) \
- guest_from_compat_handle((_d_)->u.ctrl.buf, (_s_)->u.ctrl.buf)
XLAT_tmem_op(op, &cop);
-#undef XLAT_tmem_op_HNDL_u_ctrl_buf
return 0;
}
#endif
diff --git a/xen/include/xen/typesafe.h b/xen/include/xen/typesafe.h
new file mode 100644
index 0000000..7ecd3b4
--- /dev/null
+++ b/xen/include/xen/typesafe.h
@@ -0,0 +1,46 @@
+#ifndef __XEN_TYPESAFE_H__
+#define __XEN_TYPESAFE_H__
+
+/*
+ * Compiler games to gain type safety between different logical integers.
+ *
+ * TYPE_SAFE($TYPE, $FOO) declares:
+ * * $FOO_t which encapsulates $TYPE
+ * * _$FOO() which boxes a $TYPE as a $FOO_t
+ * * $FOO_x() which unboxes a $FOO_t to $TYPE
+ *
+ * This makes a $FOO_t and a $BAR_t incompatible even when they box the same
+ * $TYPE.
+ *
+ * It does have some performance cost because the types now have a different
+ * storage attribute, so type safety is only enforced in a debug build.
+ * Non-debug builds degrade to a simple typedef and no-ops for the functions.
+ */
+
+#ifndef NDEBUG
+
+#define TYPE_SAFE(_type, _name) \
+ typedef struct { _type _name; } _name##_t; \
+ static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \
+ static inline _type _name##_x(_name##_t n) { return n._name; }
+
+#else
+
+#define TYPE_SAFE(_type, _name) \
+ typedef _type _name##_t; \
+ static inline _name##_t _##_name(_type n) { return n; } \
+ static inline _type _name##_x(_name##_t n) { return n; }
+
+#endif
+
+#endif /* __XEN_TYPESAFE_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
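
As a usage sketch, the mfn_t/gfn_t wrappers that xen/mm.h declares via
TYPE_SAFE boil down to the following (values arbitrary):

    mfn_t mfn = _mfn(0x1000);       /* box a raw unsigned long */
    unsigned long raw = mfn_x(mfn); /* unbox: raw == 0x1000    */

    printk("mfn %"PRI_mfn"\n", raw);

    /* In a debug build the next line fails to compile, which is exactly
     * the point of the boxing:
     * gfn_t gfn = mfn;
     */
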
diff --git a/xen/include/xen/vm_event.h b/xen/include/xen/vm_event.h
new file mode 100644
index 0000000..92c75ff
--- /dev/null
+++ b/xen/include/xen/vm_event.h
@@ -0,0 +1,87 @@
+/******************************************************************************
+ * vm_event.h
+ *
+ * Common interface for memory event support.
+ *
+ * Copyright (c) 2009 Citrix Systems, Inc. (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#ifndef __VM_EVENT_H__
+#define __VM_EVENT_H__
+
+#include <xen/sched.h>
+
+/* Clean up on domain destruction */
+void vm_event_cleanup(struct domain *d);
+
+/* Returns whether a ring has been set up */
+bool_t vm_event_check_ring(struct vm_event_domain *ved);
+
+/* Returns 0 on success, -ENOSYS if there is no ring, -EBUSY if there is no
+ * available space and the caller is a foreign domain. If the guest itself
+ * is the caller, -EBUSY is avoided by sleeping on a wait queue to ensure
+ * that the ring does not lose future events.
+ *
+ * However, the allow_sleep flag can be set to false in cases in which it is ok
+ * to lose future events, and thus -EBUSY can be returned to guest vcpus
+ * (handle with care!).
+ *
+ * In general, you must follow a claim_slot() call with either put_request()
+ * or cancel_slot(), both of which are guaranteed to succeed.
+ */
+int __vm_event_claim_slot(struct domain *d, struct vm_event_domain *ved,
+ bool_t allow_sleep);
+static inline int vm_event_claim_slot(struct domain *d,
+ struct vm_event_domain *ved)
+{
+ return __vm_event_claim_slot(d, ved, 1);
+}
+
+static inline int vm_event_claim_slot_nosleep(struct domain *d,
+ struct vm_event_domain *ved)
+{
+ return __vm_event_claim_slot(d, ved, 0);
+}
+
+void vm_event_cancel_slot(struct domain *d, struct vm_event_domain *ved);
+
+void vm_event_put_request(struct domain *d, struct vm_event_domain *ved,
+ vm_event_request_t *req);
+
+int vm_event_get_response(struct domain *d, struct vm_event_domain *ved,
+ vm_event_response_t *rsp);
+
+void vm_event_resume(struct domain *d, struct vm_event_domain *ved);
+
+int vm_event_domctl(struct domain *d, xen_domctl_vm_event_op_t *vec,
+ XEN_GUEST_HANDLE_PARAM(void) u_domctl);
+
+void vm_event_vcpu_pause(struct vcpu *v);
+void vm_event_vcpu_unpause(struct vcpu *v);
+
+#endif /* __VM_EVENT_H__ */
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
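
A sketch of a sender following the claim/cancel/put protocol documented
above (still_wanted() is a hypothetical predicate standing in for
caller-specific logic):

    static int send_vm_event(struct domain *d, struct vm_event_domain *ved,
                             vm_event_request_t *req)
    {
        int rc = vm_event_claim_slot(d, ved); /* may sleep for guest vcpus */

        if ( rc )
            return rc; /* -ENOSYS: no ring; -EBUSY: full, foreign caller */

        if ( !still_wanted(req) )
        {
            vm_event_cancel_slot(d, ved);     /* guaranteed to succeed */
            return 0;
        }

        vm_event_put_request(d, ved, req);    /* guaranteed to succeed */
        return 0;
    }
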
diff --git a/xen/include/xen/vmap.h b/xen/include/xen/vmap.h
index b1923dd..5671ac8 100644
--- a/xen/include/xen/vmap.h
+++ b/xen/include/xen/vmap.h
@@ -1,16 +1,19 @@
#if !defined(__XEN_VMAP_H__) && defined(VMAP_VIRT_START)
#define __XEN_VMAP_H__
-#include <xen/types.h>
+#include <xen/mm.h>
#include <asm/page.h>
void *vm_alloc(unsigned int nr, unsigned int align);
void vm_free(const void *);
-void *__vmap(const unsigned long *mfn, unsigned int granularity,
+void *__vmap(const mfn_t *mfn, unsigned int granularity,
unsigned int nr, unsigned int align, unsigned int flags);
-void *vmap(const unsigned long *mfn, unsigned int nr);
+void *vmap(const mfn_t *mfn, unsigned int nr);
void vunmap(const void *);
+void *vmalloc(size_t size);
+void *vzalloc(size_t size);
+void vfree(void *va);
void __iomem *ioremap(paddr_t, size_t);
diff --git a/xen/include/xen/xencomm.h b/xen/include/xen/xencomm.h
deleted file mode 100644
index 3426b8a..0000000
--- a/xen/include/xen/xencomm.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- * Copyright (C) IBM Corp. 2006
- *
- * Authors: Hollis Blanchard <hollisb at us.ibm.com>
- */
-
-#ifndef __XENCOMM_H__
-#define __XENCOMM_H__
-
-#include <public/xen.h>
-
-unsigned long xencomm_copy_to_guest(
- void *to, const void *from, unsigned int len, unsigned int skip);
-unsigned long xencomm_copy_from_guest(
- void *to, const void *from, unsigned int len, unsigned int skip);
-unsigned long xencomm_clear_guest(
- void *to, unsigned int n, unsigned int skip);
-int xencomm_add_offset(void **handle, unsigned int bytes);
-int xencomm_handle_is_null(void *ptr);
-
-static inline int xencomm_is_inline(const void *handle)
-{
- unsigned long addr = (unsigned long)handle;
- return (addr & XENCOMM_INLINE_FLAG) == XENCOMM_INLINE_FLAG;
-}
-
-static inline unsigned long xencomm_inline_addr(const void *handle)
-{
- return (unsigned long)handle & ~XENCOMM_INLINE_FLAG;
-}
-
-#define raw_copy_to_guest(dst, src, len) \
- xencomm_copy_to_guest(dst, src, len, 0)
-#define raw_copy_from_guest(dst, src, len) \
- xencomm_copy_from_guest(dst, src, nr, 0)
-#define raw_clear_guest(dst, len) \
- xencomm_clear_guest(dst, len, 0)
-#define __raw_copy_to_guest raw_copy_to_guest
-#define __raw_copy_from_guest raw_copy_from_guest
-#define __raw_clear_guest raw_clear_guest
-
-/* Is the guest handle a NULL reference? */
-#define guest_handle_is_null(hnd) \
- ((hnd).p == NULL || xencomm_handle_is_null((hnd).p))
-
-/* Offset the given guest handle into the array it refers to. */
-#define guest_handle_add_offset(hnd, nr) ({ \
- const typeof((hnd).p) _ptr; \
- xencomm_add_offset((void **)&((hnd).p), nr * sizeof(*_ptr)); \
-})
-
-/* Cast a guest handle to the specified type of handle. */
-#define guest_handle_cast(hnd, type) ({ \
- type *_x = (hnd).p; \
- XEN_GUEST_HANDLE_PARAM(type) _y; \
- set_xen_guest_handle(_y, _x); \
- _y; \
-})
-
-/* Cast a XEN_GUEST_HANDLE to XEN_GUEST_HANDLE_PARAM */
-#define guest_handle_to_param(hnd, type) ({ \
- /* type checking: make sure that the pointers inside \
- * XEN_GUEST_HANDLE and XEN_GUEST_HANDLE_PARAM are of \
- * the same type, then return hnd */ \
- (void)((typeof(&(hnd).p)) 0 == \
- (typeof(&((XEN_GUEST_HANDLE_PARAM(type)) {}).p)) 0); \
- (hnd); \
-})
-
-/* Cast a XEN_GUEST_HANDLE_PARAM to XEN_GUEST_HANDLE */
-#define guest_handle_from_param(hnd, type) ({ \
- /* type checking: make sure that the pointers inside \
- * XEN_GUEST_HANDLE and XEN_GUEST_HANDLE_PARAM are of \
- * the same type, then return hnd */ \
- (void)((typeof(&(hnd).p)) 0 == \
- (typeof(&((XEN_GUEST_HANDLE_PARAM(type)) {}).p)) 0); \
- (hnd); \
-})
-
-/* Since we run in real mode, we can safely access all addresses. That also
- * means our __routines are identical to our "normal" routines. */
-#define guest_handle_okay(hnd, nr) 1
-#define guest_handle_subrange_okay(hnd, first, last) 1
-
-/*
- * Copy an array of objects to guest context via a guest handle.
- * Optionally specify an offset into the guest array.
- */
-#define copy_to_guest_offset(hnd, idx, ptr, nr) \
- __copy_to_guest_offset(hnd, idx, ptr, nr)
-
-/* Copy sub-field of a structure to guest context via a guest handle. */
-#define copy_field_to_guest(hnd, ptr, field) \
- __copy_field_to_guest(hnd, ptr, field)
-
-/*
- * Copy an array of objects from guest context via a guest handle.
- * Optionally specify an offset into the guest array.
- */
-#define copy_from_guest_offset(ptr, hnd, idx, nr) \
- __copy_from_guest_offset(ptr, hnd, idx, nr)
-
-/*
- * Clear an array of objects in guest context via a guest handle.
- * Optionally specify an offset into the guest array.
- */
-#define clear_guest_offset(hnd, idx, nr) \
- __clear_guest_offset(hnd, idx, nr)
-
-/* Copy sub-field of a structure from guest context via a guest handle. */
-#define copy_field_from_guest(ptr, hnd, field) \
- __copy_field_from_guest(ptr, hnd, field)
-
-#define __copy_to_guest_offset(hnd, idx, ptr, nr) ({ \
- const typeof(*(ptr)) *_s = (ptr); \
- void *_d = (hnd).p; \
- ((void)((hnd).p == (ptr))); \
- xencomm_copy_to_guest(_d, _s, sizeof(*_s)*(nr), sizeof(*_s)*(idx)); \
-})
-
-#define __copy_field_to_guest(hnd, ptr, field) ({ \
- unsigned int _off = offsetof(typeof(*(hnd).p), field); \
- const typeof(&(ptr)->field) _s = &(ptr)->field; \
- void *_d = (hnd).p; \
- ((void)(&(hnd).p->field == &(ptr)->field)); \
- xencomm_copy_to_guest(_d, _s, sizeof(*_s), _off); \
-})
-
-#define __copy_from_guest_offset(ptr, hnd, idx, nr) ({ \
- const typeof(*(ptr)) *_s = (hnd).p; \
- typeof(*(ptr)) *_d = (ptr); \
- xencomm_copy_from_guest(_d, _s, sizeof(*_d)*(nr), sizeof(*_d)*(idx)); \
-})
-
-#define __copy_field_from_guest(ptr, hnd, field) ({ \
- unsigned int _off = offsetof(typeof(*(hnd).p), field); \
- const void *_s = (hnd).p; \
- typeof(&(ptr)->field) _d = &(ptr)->field; \
- ((void)(&(hnd).p->field == &(ptr)->field)); \
- xencomm_copy_from_guest(_d, _s, sizeof(*_d), _off); \
-})
-
-#define __clear_guest_offset(hnd, idx, nr) ({ \
- void *_d = (hnd).p; \
- xencomm_clear_guest(_d, nr, idx); \
-})
-
-#ifdef CONFIG_XENCOMM_MARK_DIRTY
-extern void xencomm_mark_dirty(unsigned long addr, unsigned int len);
-#else
-static inline void xencomm_mark_dirty(unsigned long addr, unsigned int len)
-{
-}
-#endif
-
-#endif /* __XENCOMM_H__ */
diff --git a/xen/include/xlat.lst b/xen/include/xlat.lst
index 9c9fd9a..3795059 100644
--- a/xen/include/xlat.lst
+++ b/xen/include/xlat.lst
@@ -6,8 +6,12 @@
? mmu_update xen.h
! mmuext_op xen.h
! start_info xen.h
-? vcpu_info xen.h
? vcpu_time_info xen.h
+? pmu_amd_ctxt arch-x86/pmu.h
+? pmu_arch arch-x86/pmu.h
+? pmu_cntr_pair arch-x86/pmu.h
+? pmu_intel_ctxt arch-x86/pmu.h
+? pmu_regs arch-x86/pmu.h
! cpu_user_regs arch-x86/xen- at arch@.h
! trap_info arch-x86/xen.h
? cpu_offline_action arch-x86/xen-mca.h
@@ -61,9 +65,10 @@
! memory_exchange memory.h
! memory_map memory.h
! memory_reservation memory.h
-? mem_access_op memory.h
+? mem_access_op memory.h
! pod_target memory.h
! remove_from_physmap memory.h
+! reserved_device_memory_map memory.h
? vmemrange memory.h
! vnuma_topology_info memory.h
? physdev_eoi physdev.h
@@ -89,12 +94,16 @@
? processor_px platform.h
! psd_package platform.h
? xenpf_enter_acpi_sleep platform.h
+! xenpf_symdata platform.h
? xenpf_pcpuinfo platform.h
? xenpf_pcpu_version platform.h
? xenpf_resource_entry platform.h
+? pmu_data pmu.h
+? pmu_params pmu.h
! sched_poll sched.h
? sched_remote_shutdown sched.h
? sched_shutdown sched.h
+? tmem_oid tmem.h
! tmem_op tmem.h
? t_buf trace.h
? vcpu_get_physid vcpu.h
diff --git a/xen/include/xsm/dummy.h b/xen/include/xsm/dummy.h
index f20e89c..9fe372c 100644
--- a/xen/include/xsm/dummy.h
+++ b/xen/include/xsm/dummy.h
@@ -350,6 +350,29 @@ static XSM_INLINE int xsm_deassign_device(XSM_DEFAULT_ARG struct domain *d, uint
#endif /* HAS_PASSTHROUGH && HAS_PCI */
+#if defined(HAS_PASSTHROUGH) && defined(HAS_DEVICE_TREE)
+static XSM_INLINE int xsm_test_assign_dtdevice(XSM_DEFAULT_ARG const char *dtpath)
+{
+ XSM_ASSERT_ACTION(XSM_HOOK);
+ return xsm_default_action(action, current->domain, NULL);
+}
+
+static XSM_INLINE int xsm_assign_dtdevice(XSM_DEFAULT_ARG struct domain *d,
+ const char *dtpath)
+{
+ XSM_ASSERT_ACTION(XSM_HOOK);
+ return xsm_default_action(action, current->domain, d);
+}
+
+static XSM_INLINE int xsm_deassign_dtdevice(XSM_DEFAULT_ARG struct domain *d,
+ const char *dtpath)
+{
+ XSM_ASSERT_ACTION(XSM_HOOK);
+ return xsm_default_action(action, current->domain, d);
+}
+
+#endif /* HAS_PASSTHROUGH && HAS_DEVICE_TREE */
+
static XSM_INLINE int xsm_resource_plug_core(XSM_DEFAULT_VOID)
{
XSM_ASSERT_ACTION(XSM_HOOK);
@@ -404,12 +427,6 @@ static XSM_INLINE int xsm_tmem_op(XSM_DEFAULT_VOID)
return xsm_default_action(action, current->domain, NULL);
}
-static XSM_INLINE int xsm_tmem_control(XSM_DEFAULT_VOID)
-{
- XSM_ASSERT_ACTION(XSM_PRIV);
- return xsm_default_action(action, current->domain, NULL);
-}
-
static XSM_INLINE long xsm_do_xsm_op(XEN_GUEST_HANDLE_PARAM(xsm_op_t) op)
{
return -ENOSYS;
@@ -445,6 +462,18 @@ static XSM_INLINE int xsm_unmap_domain_pirq(XSM_DEFAULT_ARG struct domain *d)
return xsm_default_action(action, current->domain, d);
}
+static XSM_INLINE int xsm_bind_pt_irq(XSM_DEFAULT_ARG struct domain *d, struct xen_domctl_bind_pt_irq *bind)
+{
+ XSM_ASSERT_ACTION(XSM_HOOK);
+ return xsm_default_action(action, current->domain, d);
+}
+
+static XSM_INLINE int xsm_unbind_pt_irq(XSM_DEFAULT_ARG struct domain *d, struct xen_domctl_bind_pt_irq *bind)
+{
+ XSM_ASSERT_ACTION(XSM_HOOK);
+ return xsm_default_action(action, current->domain, d);
+}
+
static XSM_INLINE int xsm_unmap_domain_irq(XSM_DEFAULT_ARG struct domain *d, int irq, void *data)
{
XSM_ASSERT_ACTION(XSM_HOOK);
@@ -513,14 +542,42 @@ static XSM_INLINE int xsm_hvm_param_nested(XSM_DEFAULT_ARG struct domain *d)
return xsm_default_action(action, current->domain, d);
}
-#ifdef HAS_MEM_ACCESS
-static XSM_INLINE int xsm_mem_event_control(XSM_DEFAULT_ARG struct domain *d, int mode, int op)
+static XSM_INLINE int xsm_hvm_param_altp2mhvm(XSM_DEFAULT_ARG struct domain *d)
{
XSM_ASSERT_ACTION(XSM_PRIV);
return xsm_default_action(action, current->domain, d);
}
-static XSM_INLINE int xsm_mem_event_op(XSM_DEFAULT_ARG struct domain *d, int op)
+static XSM_INLINE int xsm_hvm_altp2mhvm_op(XSM_DEFAULT_ARG struct domain *d)
+{
+ XSM_ASSERT_ACTION(XSM_TARGET);
+ return xsm_default_action(action, current->domain, d);
+}
+
+static XSM_INLINE int xsm_vm_event_control(XSM_DEFAULT_ARG struct domain *d, int mode, int op)
+{
+ XSM_ASSERT_ACTION(XSM_PRIV);
+ return xsm_default_action(action, current->domain, d);
+}
+
+#ifdef HAS_MEM_ACCESS
+static XSM_INLINE int xsm_mem_access(XSM_DEFAULT_ARG struct domain *d)
+{
+ XSM_ASSERT_ACTION(XSM_DM_PRIV);
+ return xsm_default_action(action, current->domain, d);
+}
+#endif
+
+#ifdef HAS_MEM_PAGING
+static XSM_INLINE int xsm_mem_paging(XSM_DEFAULT_ARG struct domain *d)
+{
+ XSM_ASSERT_ACTION(XSM_DM_PRIV);
+ return xsm_default_action(action, current->domain, d);
+}
+#endif
+
+#ifdef HAS_MEM_SHARING
+static XSM_INLINE int xsm_mem_sharing(XSM_DEFAULT_ARG struct domain *d)
{
XSM_ASSERT_ACTION(XSM_DM_PRIV);
return xsm_default_action(action, current->domain, d);
@@ -631,28 +688,36 @@ static XSM_INLINE int xsm_priv_mapping(XSM_DEFAULT_ARG struct domain *d, struct
return xsm_default_action(action, d, t);
}
-static XSM_INLINE int xsm_bind_pt_irq(XSM_DEFAULT_ARG struct domain *d, struct xen_domctl_bind_pt_irq *bind)
-{
- XSM_ASSERT_ACTION(XSM_HOOK);
- return xsm_default_action(action, current->domain, d);
-}
-
-static XSM_INLINE int xsm_unbind_pt_irq(XSM_DEFAULT_ARG struct domain *d, struct xen_domctl_bind_pt_irq *bind)
+static XSM_INLINE int xsm_ioport_permission(XSM_DEFAULT_ARG struct domain *d, uint32_t s, uint32_t e, uint8_t allow)
{
XSM_ASSERT_ACTION(XSM_HOOK);
return xsm_default_action(action, current->domain, d);
}
-static XSM_INLINE int xsm_ioport_permission(XSM_DEFAULT_ARG struct domain *d, uint32_t s, uint32_t e, uint8_t allow)
+static XSM_INLINE int xsm_ioport_mapping(XSM_DEFAULT_ARG struct domain *d, uint32_t s, uint32_t e, uint8_t allow)
{
XSM_ASSERT_ACTION(XSM_HOOK);
return xsm_default_action(action, current->domain, d);
}
-static XSM_INLINE int xsm_ioport_mapping(XSM_DEFAULT_ARG struct domain *d, uint32_t s, uint32_t e, uint8_t allow)
+static XSM_INLINE int xsm_pmu_op (XSM_DEFAULT_ARG struct domain *d, unsigned int op)
{
- XSM_ASSERT_ACTION(XSM_HOOK);
- return xsm_default_action(action, current->domain, d);
+ XSM_ASSERT_ACTION(XSM_OTHER);
+ switch ( op )
+ {
+ case XENPMU_mode_set:
+ case XENPMU_mode_get:
+ case XENPMU_feature_set:
+ case XENPMU_feature_get:
+ return xsm_default_action(XSM_PRIV, d, current->domain);
+ case XENPMU_init:
+ case XENPMU_finish:
+ case XENPMU_lvtpc_set:
+ case XENPMU_flush:
+ return xsm_default_action(XSM_HOOK, d, current->domain);
+ default:
+ return -EPERM;
+ }
}
#endif /* CONFIG_X86 */
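
Each dummy hook above declares its default policy with XSM_ASSERT_ACTION() and then defers to xsm_default_action(), which turns the XSM_* level into a check on the calling and target domains. A condensed sketch of that helper (defined earlier in this header; the real version also covers XSM_DM_PRIV's device-model relationship and a few more details):

    static always_inline int xsm_default_action(
        xsm_default_t action, struct domain *src, struct domain *target)
    {
        switch ( action )
        {
        case XSM_HOOK:      /* allowed for any domain */
            return 0;
        case XSM_TARGET:    /* allowed for the domain acting on itself... */
            if ( src == target )
                return 0;
            /* fall through */
        case XSM_PRIV:      /* ...otherwise only for a privileged domain */
            return src->is_privileged ? 0 : -EPERM;
        default:
            return -EPERM;
        }
    }

This is why xsm_pmu_op above can fold its per-subcommand policy into two calls: the control operations pass XSM_PRIV, per-vcpu usage passes XSM_HOOK.
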
diff --git a/xen/include/xsm/xsm.h b/xen/include/xsm/xsm.h
index 4ce089f..ba3caed 100644
--- a/xen/include/xsm/xsm.h
+++ b/xen/include/xsm/xsm.h
@@ -42,7 +42,7 @@ typedef enum xsm_default xsm_default_t;
extern char *policy_buffer;
extern u32 policy_size;
-typedef int (*xsm_initcall_t)(void);
+typedef void (*xsm_initcall_t)(void);
extern xsm_initcall_t __xsm_initcall_start[], __xsm_initcall_end[];
@@ -107,6 +107,8 @@ struct xsm_operations {
int (*map_domain_irq) (struct domain *d, int irq, void *data);
int (*unmap_domain_pirq) (struct domain *d);
int (*unmap_domain_irq) (struct domain *d, int irq, void *data);
+ int (*bind_pt_irq) (struct domain *d, struct xen_domctl_bind_pt_irq *bind);
+ int (*unbind_pt_irq) (struct domain *d, struct xen_domctl_bind_pt_irq *bind);
int (*irq_permission) (struct domain *d, int pirq, uint8_t allow);
int (*iomem_permission) (struct domain *d, uint64_t s, uint64_t e, uint8_t allow);
int (*iomem_mapping) (struct domain *d, uint64_t s, uint64_t e, uint8_t allow);
@@ -119,6 +121,12 @@ struct xsm_operations {
int (*deassign_device) (struct domain *d, uint32_t machine_bdf);
#endif
+#if defined(HAS_PASSTHROUGH) && defined(HAS_DEVICE_TREE)
+ int (*test_assign_dtdevice) (const char *dtpath);
+ int (*assign_dtdevice) (struct domain *d, const char *dtpath);
+ int (*deassign_dtdevice) (struct domain *d, const char *dtpath);
+#endif
+
int (*resource_plug_core) (void);
int (*resource_unplug_core) (void);
int (*resource_plug_pci) (uint32_t machine_bdf);
@@ -129,7 +137,6 @@ struct xsm_operations {
int (*page_offline)(uint32_t cmd);
int (*tmem_op)(void);
- int (*tmem_control)(void);
long (*do_xsm_op) (XEN_GUEST_HANDLE_PARAM(xsm_op_t) op);
#ifdef CONFIG_COMPAT
@@ -139,11 +146,22 @@ struct xsm_operations {
int (*hvm_param) (struct domain *d, unsigned long op);
int (*hvm_control) (struct domain *d, unsigned long op);
int (*hvm_param_nested) (struct domain *d);
+ int (*hvm_param_altp2mhvm) (struct domain *d);
+ int (*hvm_altp2mhvm_op) (struct domain *d);
int (*get_vnumainfo) (struct domain *d);
+ int (*vm_event_control) (struct domain *d, int mode, int op);
+
#ifdef HAS_MEM_ACCESS
- int (*mem_event_control) (struct domain *d, int mode, int op);
- int (*mem_event_op) (struct domain *d, int op);
+ int (*mem_access) (struct domain *d);
+#endif
+
+#ifdef HAS_MEM_PAGING
+ int (*mem_paging) (struct domain *d);
+#endif
+
+#ifdef HAS_MEM_SHARING
+ int (*mem_sharing) (struct domain *d);
#endif
#ifdef CONFIG_X86
@@ -169,10 +187,9 @@ struct xsm_operations {
int (*mmuext_op) (struct domain *d, struct domain *f);
int (*update_va_mapping) (struct domain *d, struct domain *f, l1_pgentry_t pte);
int (*priv_mapping) (struct domain *d, struct domain *t);
- int (*bind_pt_irq) (struct domain *d, struct xen_domctl_bind_pt_irq *bind);
- int (*unbind_pt_irq) (struct domain *d, struct xen_domctl_bind_pt_irq *bind);
int (*ioport_permission) (struct domain *d, uint32_t s, uint32_t e, uint8_t allow);
int (*ioport_mapping) (struct domain *d, uint32_t s, uint32_t e, uint8_t allow);
+ int (*pmu_op) (struct domain *d, unsigned int op);
#endif
};
@@ -419,6 +436,18 @@ static inline int xsm_unmap_domain_irq (xsm_default_t def, struct domain *d, int
return xsm_ops->unmap_domain_irq(d, irq, data);
}
+static inline int xsm_bind_pt_irq(xsm_default_t def, struct domain *d,
+ struct xen_domctl_bind_pt_irq *bind)
+{
+ return xsm_ops->bind_pt_irq(d, bind);
+}
+
+static inline int xsm_unbind_pt_irq(xsm_default_t def, struct domain *d,
+ struct xen_domctl_bind_pt_irq *bind)
+{
+ return xsm_ops->unbind_pt_irq(d, bind);
+}
+
static inline int xsm_irq_permission (xsm_default_t def, struct domain *d, int pirq, uint8_t allow)
{
return xsm_ops->irq_permission(d, pirq, allow);
@@ -461,6 +490,27 @@ static inline int xsm_deassign_device(xsm_default_t def, struct domain *d, uint3
}
#endif /* HAS_PASSTHROUGH && HAS_PCI */
+#if defined(HAS_PASSTHROUGH) && defined(HAS_DEVICE_TREE)
+static inline int xsm_assign_dtdevice(xsm_default_t def, struct domain *d,
+ const char *dtpath)
+{
+ return xsm_ops->assign_dtdevice(d, dtpath);
+}
+
+static inline int xsm_test_assign_dtdevice(xsm_default_t def,
+ const char *dtpath)
+{
+ return xsm_ops->test_assign_dtdevice(dtpath);
+}
+
+static inline int xsm_deassign_dtdevice(xsm_default_t def, struct domain *d,
+ const char *dtpath)
+{
+ return xsm_ops->deassign_dtdevice(d, dtpath);
+}
+
+#endif /* HAS_PASSTHROUGH && HAS_DEVICE_TREE */
+
static inline int xsm_resource_plug_pci (xsm_default_t def, uint32_t machine_bdf)
{
return xsm_ops->resource_plug_pci(machine_bdf);
@@ -506,11 +556,6 @@ static inline int xsm_tmem_op(xsm_default_t def)
return xsm_ops->tmem_op();
}
-static inline int xsm_tmem_control(xsm_default_t def)
-{
- return xsm_ops->tmem_control();
-}
-
static inline long xsm_do_xsm_op (XEN_GUEST_HANDLE_PARAM(xsm_op_t) op)
{
return xsm_ops->do_xsm_op(op);
@@ -538,20 +583,44 @@ static inline int xsm_hvm_param_nested (xsm_default_t def, struct domain *d)
return xsm_ops->hvm_param_nested(d);
}
+static inline int xsm_hvm_param_altp2mhvm (xsm_default_t def, struct domain *d)
+{
+ return xsm_ops->hvm_param_altp2mhvm(d);
+}
+
+static inline int xsm_hvm_altp2mhvm_op (xsm_default_t def, struct domain *d)
+{
+ return xsm_ops->hvm_altp2mhvm_op(d);
+}
+
static inline int xsm_get_vnumainfo (xsm_default_t def, struct domain *d)
{
return xsm_ops->get_vnumainfo(d);
}
+static inline int xsm_vm_event_control (xsm_default_t def, struct domain *d, int mode, int op)
+{
+ return xsm_ops->vm_event_control(d, mode, op);
+}
+
#ifdef HAS_MEM_ACCESS
-static inline int xsm_mem_event_control (xsm_default_t def, struct domain *d, int mode, int op)
+static inline int xsm_mem_access (xsm_default_t def, struct domain *d)
+{
+ return xsm_ops->mem_access(d);
+}
+#endif
+
+#ifdef HAS_MEM_PAGING
+static inline int xsm_mem_paging (xsm_default_t def, struct domain *d)
{
- return xsm_ops->mem_event_control(d, mode, op);
+ return xsm_ops->mem_paging(d);
}
+#endif
-static inline int xsm_mem_event_op (xsm_default_t def, struct domain *d, int op)
+#ifdef HAS_MEM_SHARING
+static inline int xsm_mem_sharing (xsm_default_t def, struct domain *d)
{
- return xsm_ops->mem_event_op(d, op);
+ return xsm_ops->mem_sharing(d);
}
#endif
@@ -643,18 +712,6 @@ static inline int xsm_priv_mapping(xsm_default_t def, struct domain *d, struct d
return xsm_ops->priv_mapping(d, t);
}
-static inline int xsm_bind_pt_irq(xsm_default_t def, struct domain *d,
- struct xen_domctl_bind_pt_irq *bind)
-{
- return xsm_ops->bind_pt_irq(d, bind);
-}
-
-static inline int xsm_unbind_pt_irq(xsm_default_t def, struct domain *d,
- struct xen_domctl_bind_pt_irq *bind)
-{
- return xsm_ops->unbind_pt_irq(d, bind);
-}
-
static inline int xsm_ioport_permission (xsm_default_t def, struct domain *d, uint32_t s, uint32_t e, uint8_t allow)
{
return xsm_ops->ioport_permission(d, s, e, allow);
@@ -665,6 +722,11 @@ static inline int xsm_ioport_mapping (xsm_default_t def, struct domain *d, uint3
return xsm_ops->ioport_mapping(d, s, e, allow);
}
+static inline int xsm_pmu_op (xsm_default_t def, struct domain *d, unsigned int op)
+{
+ return xsm_ops->pmu_op(d, op);
+}
+
#endif /* CONFIG_X86 */
#endif /* XSM_NO_WRAPPERS */
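
A consequence of the xsm_initcall_t change at the top of this file: module initialisation can no longer report failure through a return value, so flask_init() (below, in hooks.c) now panics on fatal errors instead. The registration walk itself presumably remains the standard linker-section iteration over the extern symbols declared here; a minimal sketch, assuming that convention:

    /* sketch: run every initcall placed between the linker symbols */
    static void __init do_xsm_initcalls(void)
    {
        xsm_initcall_t *call;

        for ( call = __xsm_initcall_start; call < __xsm_initcall_end; call++ )
            (*call)();
    }
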
diff --git a/xen/xsm/dummy.c b/xen/xsm/dummy.c
index 8eb3050..72eba40 100644
--- a/xen/xsm/dummy.c
+++ b/xen/xsm/dummy.c
@@ -81,6 +81,8 @@ void xsm_fixup_ops (struct xsm_operations *ops)
set_to_dummy_if_null(ops, map_domain_irq);
set_to_dummy_if_null(ops, unmap_domain_pirq);
set_to_dummy_if_null(ops, unmap_domain_irq);
+ set_to_dummy_if_null(ops, bind_pt_irq);
+ set_to_dummy_if_null(ops, unbind_pt_irq);
set_to_dummy_if_null(ops, irq_permission);
set_to_dummy_if_null(ops, iomem_permission);
set_to_dummy_if_null(ops, iomem_mapping);
@@ -94,6 +96,12 @@ void xsm_fixup_ops (struct xsm_operations *ops)
set_to_dummy_if_null(ops, deassign_device);
#endif
+#if defined(HAS_PASSTHROUGH) && defined(HAS_DEVICE_TREE)
+ set_to_dummy_if_null(ops, test_assign_dtdevice);
+ set_to_dummy_if_null(ops, assign_dtdevice);
+ set_to_dummy_if_null(ops, deassign_dtdevice);
+#endif
+
set_to_dummy_if_null(ops, resource_plug_core);
set_to_dummy_if_null(ops, resource_unplug_core);
set_to_dummy_if_null(ops, resource_plug_pci);
@@ -104,10 +112,11 @@ void xsm_fixup_ops (struct xsm_operations *ops)
set_to_dummy_if_null(ops, page_offline);
set_to_dummy_if_null(ops, tmem_op);
- set_to_dummy_if_null(ops, tmem_control);
set_to_dummy_if_null(ops, hvm_param);
set_to_dummy_if_null(ops, hvm_control);
set_to_dummy_if_null(ops, hvm_param_nested);
+ set_to_dummy_if_null(ops, hvm_param_altp2mhvm);
+ set_to_dummy_if_null(ops, hvm_altp2mhvm_op);
set_to_dummy_if_null(ops, do_xsm_op);
#ifdef CONFIG_COMPAT
@@ -118,9 +127,18 @@ void xsm_fixup_ops (struct xsm_operations *ops)
set_to_dummy_if_null(ops, remove_from_physmap);
set_to_dummy_if_null(ops, map_gmfn_foreign);
+ set_to_dummy_if_null(ops, vm_event_control);
+
#ifdef HAS_MEM_ACCESS
- set_to_dummy_if_null(ops, mem_event_control);
- set_to_dummy_if_null(ops, mem_event_op);
+ set_to_dummy_if_null(ops, mem_access);
+#endif
+
+#ifdef HAS_MEM_PAGING
+ set_to_dummy_if_null(ops, mem_paging);
+#endif
+
+#ifdef HAS_MEM_SHARING
+ set_to_dummy_if_null(ops, mem_sharing);
#endif
#ifdef CONFIG_X86
@@ -140,9 +158,8 @@ void xsm_fixup_ops (struct xsm_operations *ops)
set_to_dummy_if_null(ops, mmuext_op);
set_to_dummy_if_null(ops, update_va_mapping);
set_to_dummy_if_null(ops, priv_mapping);
- set_to_dummy_if_null(ops, bind_pt_irq);
- set_to_dummy_if_null(ops, unbind_pt_irq);
set_to_dummy_if_null(ops, ioport_permission);
set_to_dummy_if_null(ops, ioport_mapping);
+ set_to_dummy_if_null(ops, pmu_op);
#endif
}
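
All of these registrations go through one helper that points any hook a module left NULL at the permissive default of the same name from dummy.h. A minimal sketch of that macro (its actual definition sits near the top of dummy.c):

    /* sketch: fall back to the xsm_<name> default when a hook is unset */
    #define set_to_dummy_if_null(ops, function)       \
        do {                                          \
            if ( !(ops)->function )                   \
                (ops)->function = xsm_##function;     \
        } while ( 0 )
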
diff --git a/xen/xsm/flask/Makefile b/xen/xsm/flask/Makefile
index 1256512..12fc3a9 100644
--- a/xen/xsm/flask/Makefile
+++ b/xen/xsm/flask/Makefile
@@ -22,7 +22,7 @@ ALL_H_FILES = $(FLASK_H_FILES) $(AV_H_FILES)
$(obj-y) ss/built_in.o: $(ALL_H_FILES)
$(FLASK_H_FILES): $(FLASK_H_DEPEND)
- $(CONFIG_SHELL) policy/mkflask.sh $(AWK) $(FLASK_H_DEPEND)
+ $(CONFIG_SHELL) policy/mkflask.sh $(AWK) include $(FLASK_H_DEPEND)
$(AV_H_FILES): $(AV_H_DEPEND)
$(CONFIG_SHELL) policy/mkaccess_vector.sh $(AWK) $(AV_H_DEPEND)
diff --git a/xen/xsm/flask/avc.c b/xen/xsm/flask/avc.c
index fc6580e..31bc702 100644
--- a/xen/xsm/flask/avc.c
+++ b/xen/xsm/flask/avc.c
@@ -251,8 +251,6 @@ void __init avc_init(void)
}
atomic_set(&avc_cache.active_nodes, 0);
atomic_set(&avc_cache.lru_hint, 0);
-
- printk("AVC INITIALIZED\n");
}
int avc_get_hash_stats(struct xen_flask_hash_stats *arg)
@@ -602,6 +600,9 @@ void avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested,
case AVC_AUDIT_DATA_MEMORY:
avc_printk(&buf, "pte=%#lx mfn=%#lx ", a->memory.pte, a->memory.mfn);
break;
+ case AVC_AUDIT_DATA_DTDEV:
+ avc_printk(&buf, "dtdevice=%s ", a->dtdev);
+ break;
}
avc_dump_query(&buf, ssid, tsid, tclass);
diff --git a/xen/xsm/flask/flask_op.c b/xen/xsm/flask/flask_op.c
index 7743aac..f4f5dd1 100644
--- a/xen/xsm/flask/flask_op.c
+++ b/xen/xsm/flask/flask_op.c
@@ -12,6 +12,7 @@
#include <xen/event.h>
#include <xsm/xsm.h>
#include <xen/guest_access.h>
+#include <xen/err.h>
#include <public/xsm/flask_op.h>
@@ -24,15 +25,12 @@
#define _copy_to_guest copy_to_guest
#define _copy_from_guest copy_from_guest
-#ifdef FLASK_DEVELOP
-int flask_enforcing = 0;
-integer_param("flask_enforcing", flask_enforcing);
-#endif
+enum flask_bootparam_t __read_mostly flask_bootparam = FLASK_BOOTPARAM_PERMISSIVE;
+static void parse_flask_param(char *s);
+custom_param("flask", parse_flask_param);
-#ifdef FLASK_BOOTPARAM
-int flask_enabled = 1;
-integer_param("flask_enabled", flask_enabled);
-#endif
+bool_t __read_mostly flask_enforcing = 0;
+boolean_param("flask_enforcing", flask_enforcing);
#define MAX_POLICY_SIZE 0x4000000
@@ -57,13 +55,32 @@ static DEFINE_SPINLOCK(sel_sem);
/* global data for booleans */
static int bool_num = 0;
static int *bool_pending_values = NULL;
-static size_t bool_maxstr;
static int flask_security_make_bools(void);
extern int ss_initialized;
extern struct xsm_operations *original_ops;
+static void __init parse_flask_param(char *s)
+{
+ if ( !strcmp(s, "enforcing") )
+ {
+ flask_enforcing = 1;
+ flask_bootparam = FLASK_BOOTPARAM_ENFORCING;
+ }
+ else if ( !strcmp(s, "late") )
+ {
+ flask_enforcing = 1;
+ flask_bootparam = FLASK_BOOTPARAM_LATELOAD;
+ }
+ else if ( !strcmp(s, "disabled") )
+ flask_bootparam = FLASK_BOOTPARAM_DISABLED;
+ else if ( !strcmp(s, "permissive") )
+ flask_bootparam = FLASK_BOOTPARAM_PERMISSIVE;
+ else
+ flask_bootparam = FLASK_BOOTPARAM_INVALID;
+}
+
static int domain_has_security(struct domain *d, u32 perms)
{
struct domain_security_struct *dsec;
@@ -76,29 +93,6 @@ static int domain_has_security(struct domain *d, u32 perms)
perms, NULL);
}
-static int flask_copyin_string(XEN_GUEST_HANDLE(char) u_buf, char **buf,
- size_t size, size_t max_size)
-{
- char *tmp;
-
- if ( size > max_size )
- return -ENOENT;
-
- tmp = xmalloc_array(char, size + 1);
- if ( !tmp )
- return -ENOMEM;
-
- if ( copy_from_guest(tmp, u_buf, size) )
- {
- xfree(tmp);
- return -EFAULT;
- }
- tmp[size] = 0;
-
- *buf = tmp;
- return 0;
-}
-
#endif /* COMPAT */
static int flask_security_user(struct xen_flask_userlist *arg)
@@ -112,9 +106,9 @@ static int flask_security_user(struct xen_flask_userlist *arg)
if ( rv )
return rv;
- rv = flask_copyin_string(arg->u.user, &user, arg->size, PAGE_SIZE);
- if ( rv )
- return rv;
+ user = safe_copy_string_from_guest(arg->u.user, arg->size, PAGE_SIZE);
+ if ( IS_ERR(user) )
+ return PTR_ERR(user);
rv = security_get_user_sids(arg->start_sid, user, &sids, &nsids);
if ( rv < 0 )
@@ -227,9 +221,9 @@ static int flask_security_context(struct xen_flask_sid_context *arg)
if ( rv )
return rv;
- rv = flask_copyin_string(arg->context, &buf, arg->size, PAGE_SIZE);
- if ( rv )
- return rv;
+ buf = safe_copy_string_from_guest(arg->context, arg->size, PAGE_SIZE);
+ if ( IS_ERR(buf) )
+ return PTR_ERR(buf);
rv = security_context_to_sid(buf, arg->size, &arg->sid);
if ( rv < 0 )
@@ -319,14 +313,13 @@ static int flask_security_setavc_threshold(struct xen_flask_setavc_threshold *ar
static int flask_security_resolve_bool(struct xen_flask_boolean *arg)
{
char *name;
- int rv;
if ( arg->bool_id != -1 )
return 0;
- rv = flask_copyin_string(arg->name, &name, arg->size, bool_maxstr);
- if ( rv )
- return rv;
+ name = safe_copy_string_from_guest(arg->name, arg->size, PAGE_SIZE);
+ if ( IS_ERR(name) )
+ return PTR_ERR(name);
arg->bool_id = security_find_bool(name);
arg->size = 0;
@@ -465,7 +458,7 @@ static int flask_security_make_bools(void)
xfree(bool_pending_values);
- ret = security_get_bools(&num, NULL, &values, &bool_maxstr);
+ ret = security_get_bools(&num, NULL, &values, NULL);
if ( ret != 0 )
goto out;
@@ -506,6 +499,7 @@ static int flask_security_load(struct xen_flask_load *load)
{
int ret;
void *buf = NULL;
+ bool_t is_reload = ss_initialized;
ret = domain_has_security(current->domain, SECURITY__LOAD_POLICY);
if ( ret )
@@ -530,6 +524,10 @@ static int flask_security_load(struct xen_flask_load *load)
if ( ret )
goto out;
+ if ( !is_reload )
+ printk(XENLOG_INFO "Flask: Policy loaded, continuing in %s mode.\n",
+ flask_enforcing ? "enforcing" : "permissive");
+
xfree(bool_pending_values);
bool_pending_values = NULL;
ret = 0;
@@ -541,6 +539,27 @@ static int flask_security_load(struct xen_flask_load *load)
return ret;
}
+static int flask_devicetree_label(struct xen_flask_devicetree_label *arg)
+{
+ int rv;
+ char *buf;
+ u32 sid = arg->sid;
+ u32 perm = sid ? SECURITY__ADD_OCONTEXT : SECURITY__DEL_OCONTEXT;
+
+ rv = domain_has_security(current->domain, perm);
+ if ( rv )
+ return rv;
+
+ buf = safe_copy_string_from_guest(arg->path, arg->length, PAGE_SIZE);
+ if ( IS_ERR(buf) )
+ return PTR_ERR(buf);
+
+ /* buf is consumed or freed by this function */
+ rv = security_devicetree_setlabel(buf, sid);
+
+ return rv;
+}
+
#ifndef COMPAT
static int flask_ocontext_del(struct xen_flask_ocontext *arg)
@@ -768,6 +787,10 @@ ret_t do_flask_op(XEN_GUEST_HANDLE_PARAM(xsm_op_t) u_flask_op)
rv = flask_relabel_domain(&op.u.relabel);
break;
+ case FLASK_DEVICETREE_LABEL:
+ rv = flask_devicetree_label(&op.u.devicetree_label);
+ break;
+
default:
rv = -ENOSYS;
}
@@ -805,10 +828,10 @@ CHECK_flask_setenforce;
CHECK_flask_transition;
#define COMPAT
-#define flask_copyin_string(ch, pb, sz, mx) ({ \
- XEN_GUEST_HANDLE_PARAM(char) gh; \
- guest_from_compat_handle(gh, ch); \
- flask_copyin_string(gh, pb, sz, mx); \
+#define safe_copy_string_from_guest(ch, sz, mx) ({ \
+ XEN_GUEST_HANDLE_PARAM(char) gh; \
+ guest_from_compat_handle(gh, ch); \
+ safe_copy_string_from_guest(gh, sz, mx); \
})
#define xen_flask_load compat_flask_load
@@ -826,6 +849,9 @@ CHECK_flask_transition;
#define flask_security_get_bool compat_security_get_bool
#define flask_security_set_bool compat_security_set_bool
+#define xen_flask_devicetree_label compat_flask_devicetree_label
+#define flask_devicetree_label compat_devicetree_label
+
#define xen_flask_op_t compat_flask_op_t
#undef ret_t
#define ret_t int
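
The new safe_copy_string_from_guest() folds the allocate/copy/NUL-terminate sequence of the deleted flask_copyin_string() into a shared helper that reports failure via ERR_PTR() — hence the new <xen/err.h> include and the IS_ERR()/PTR_ERR() checks at each call site. A sketch of the expected behaviour, modelled on the removed code (the exact error codes are an assumption carried over from it):

    /* sketch: copy at most max_size bytes from the guest and
     * NUL-terminate; returns the buffer or an ERR_PTR() code */
    char *safe_copy_string_from_guest(XEN_GUEST_HANDLE(char) u_buf,
                                      size_t size, size_t max_size)
    {
        char *tmp;

        if ( size > max_size )
            return ERR_PTR(-ENOENT);   /* assumed, as in the old helper */

        tmp = xmalloc_array(char, size + 1);
        if ( !tmp )
            return ERR_PTR(-ENOMEM);

        if ( copy_from_guest(tmp, u_buf, size) )
        {
            xfree(tmp);
            return ERR_PTR(-EFAULT);
        }
        tmp[size] = '\0';

        return tmp;
    }
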
diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
index d48463f..fafb1a4 100644
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -577,17 +577,22 @@ static int flask_domctl(struct domain *d, int cmd)
case XEN_DOMCTL_iomem_permission:
case XEN_DOMCTL_memory_mapping:
case XEN_DOMCTL_set_target:
-#ifdef HAS_MEM_ACCESS
- case XEN_DOMCTL_mem_event_op:
-#endif
+ case XEN_DOMCTL_vm_event_op:
+
+ /* These have individual XSM hooks (arch/../domctl.c) */
+ case XEN_DOMCTL_bind_pt_irq:
+ case XEN_DOMCTL_unbind_pt_irq:
#ifdef CONFIG_X86
/* These have individual XSM hooks (arch/x86/domctl.c) */
case XEN_DOMCTL_shadow_op:
case XEN_DOMCTL_ioport_permission:
- case XEN_DOMCTL_bind_pt_irq:
- case XEN_DOMCTL_unbind_pt_irq:
case XEN_DOMCTL_ioport_mapping:
- /* These have individual XSM hooks (drivers/passthrough/iommu.c) */
+#endif
+#ifdef HAS_PASSTHROUGH
+ /*
+ * These have individual XSM hooks
+ * (drivers/passthrough/{pci,device_tree}.c)
+ */
case XEN_DOMCTL_get_device_group:
case XEN_DOMCTL_test_assign_device:
case XEN_DOMCTL_assign_device:
@@ -639,8 +644,6 @@ static int flask_domctl(struct domain *d, int cmd)
case XEN_DOMCTL_setdebugging:
return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETDEBUGGING);
- case XEN_DOMCTL_getpageframeinfo:
- case XEN_DOMCTL_getpageframeinfo2:
case XEN_DOMCTL_getpageframeinfo3:
return current_has_perm(d, SECCLASS_MMU, MMU__PAGEINFO);
@@ -689,7 +692,10 @@ static int flask_domctl(struct domain *d, int cmd)
return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__TRIGGER);
case XEN_DOMCTL_set_access_required:
- return current_has_perm(d, SECCLASS_HVM, HVM__MEM_EVENT);
+ return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__VM_EVENT);
+
+ case XEN_DOMCTL_monitor_op:
+ return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__VM_EVENT);
case XEN_DOMCTL_debug_op:
case XEN_DOMCTL_gdbsx_guestmemio:
@@ -729,8 +735,8 @@ static int flask_domctl(struct domain *d, int cmd)
case XEN_DOMCTL_psr_cmt_op:
return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__PSR_CMT_OP);
- case XEN_DOMCTL_arm_configure_domain:
- return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__CONFIGURE_DOMAIN);
+ case XEN_DOMCTL_psr_cat_op:
+ return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__PSR_CAT_OP);
default:
printk("flask_domctl: Unknown op %d\n", cmd);
@@ -783,13 +789,20 @@ static int flask_sysctl(int cmd)
return domain_has_xen(current->domain, XEN__CPUPOOL_OP);
case XEN_SYSCTL_physinfo:
- case XEN_SYSCTL_topologyinfo:
+ case XEN_SYSCTL_cputopoinfo:
case XEN_SYSCTL_numainfo:
+ case XEN_SYSCTL_pcitopoinfo:
return domain_has_xen(current->domain, XEN__PHYSINFO);
case XEN_SYSCTL_psr_cmt_op:
return avc_current_has_perm(SECINITSID_XEN, SECCLASS_XEN2,
XEN2__PSR_CMT_OP, NULL);
+ case XEN_SYSCTL_psr_cat_op:
+ return avc_current_has_perm(SECINITSID_XEN, SECCLASS_XEN2,
+ XEN2__PSR_CAT_OP, NULL);
+
+ case XEN_SYSCTL_tmem_op:
+ return domain_has_xen(current->domain, XEN__TMEM_CONTROL);
default:
printk("flask_sysctl: Unknown op %d\n", cmd);
@@ -914,6 +927,36 @@ static int flask_unmap_domain_irq (struct domain *d, int irq, void *data)
return rc;
}
+static int flask_bind_pt_irq (struct domain *d, struct xen_domctl_bind_pt_irq *bind)
+{
+ u32 dsid, rsid;
+ int rc = -EPERM;
+ int irq;
+ struct avc_audit_data ad;
+
+ rc = current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__ADD);
+ if ( rc )
+ return rc;
+
+ irq = domain_pirq_to_irq(d, bind->machine_irq);
+
+ rc = get_irq_sid(irq, &rsid, &ad);
+ if ( rc )
+ return rc;
+
+ rc = avc_current_has_perm(rsid, SECCLASS_HVM, HVM__BIND_IRQ, &ad);
+ if ( rc )
+ return rc;
+
+ dsid = domain_sid(d);
+ return avc_has_perm(dsid, rsid, SECCLASS_RESOURCE, RESOURCE__USE, &ad);
+}
+
+static int flask_unbind_pt_irq (struct domain *d, struct xen_domctl_bind_pt_irq *bind)
+{
+ return current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__REMOVE);
+}
+
static int flask_irq_permission (struct domain *d, int pirq, uint8_t access)
{
/* the PIRQ number is not useful; real IRQ is checked during mapping */
@@ -1089,11 +1132,6 @@ static inline int flask_tmem_op(void)
return domain_has_xen(current->domain, XEN__TMEM_OP);
}
-static inline int flask_tmem_control(void)
-{
- return domain_has_xen(current->domain, XEN__TMEM_CONTROL);
-}
-
static int flask_add_to_physmap(struct domain *d1, struct domain *d2)
{
return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__PHYSMAP);
@@ -1136,6 +1174,42 @@ static int flask_hvm_param_nested(struct domain *d)
return current_has_perm(d, SECCLASS_HVM, HVM__NESTED);
}
+static int flask_hvm_param_altp2mhvm(struct domain *d)
+{
+ return current_has_perm(d, SECCLASS_HVM, HVM__ALTP2MHVM);
+}
+
+static int flask_hvm_altp2mhvm_op(struct domain *d)
+{
+ return current_has_perm(d, SECCLASS_HVM, HVM__ALTP2MHVM_OP);
+}
+
+static int flask_vm_event_control(struct domain *d, int mode, int op)
+{
+ return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__VM_EVENT);
+}
+
+#ifdef HAS_MEM_ACCESS
+static int flask_mem_access(struct domain *d)
+{
+ return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__MEM_ACCESS);
+}
+#endif
+
+#ifdef HAS_MEM_PAGING
+static int flask_mem_paging(struct domain *d)
+{
+ return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__MEM_PAGING);
+}
+#endif
+
+#ifdef HAS_MEM_SHARING
+static int flask_mem_sharing(struct domain *d)
+{
+ return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__MEM_SHARING);
+}
+#endif
+
#if defined(HAS_PASSTHROUGH) && defined(HAS_PCI)
static int flask_get_device_group(uint32_t machine_bdf)
{
@@ -1202,17 +1276,61 @@ static int flask_deassign_device(struct domain *d, uint32_t machine_bdf)
}
#endif /* HAS_PASSTHROUGH && HAS_PCI */
-#ifdef HAS_MEM_ACCESS
-static int flask_mem_event_control(struct domain *d, int mode, int op)
+#if defined(HAS_PASSTHROUGH) && defined(HAS_DEVICE_TREE)
+static int flask_test_assign_dtdevice(const char *dtpath)
{
- return current_has_perm(d, SECCLASS_HVM, HVM__MEM_EVENT);
+ u32 rsid;
+ int rc = -EPERM;
+
+ rc = security_devicetree_sid(dtpath, &rsid);
+ if ( rc )
+ return rc;
+
+ return avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__STAT_DEVICE,
+ NULL);
}
-static int flask_mem_event_op(struct domain *d, int op)
+static int flask_assign_dtdevice(struct domain *d, const char *dtpath)
{
- return current_has_perm(d, SECCLASS_HVM, HVM__MEM_EVENT);
+ u32 dsid, rsid;
+ int rc = -EPERM;
+ struct avc_audit_data ad;
+
+ rc = current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__ADD);
+ if ( rc )
+ return rc;
+
+ rc = security_devicetree_sid(dtpath, &rsid);
+ if ( rc )
+ return rc;
+
+ AVC_AUDIT_DATA_INIT(&ad, DTDEV);
+ ad.dtdev = dtpath;
+ rc = avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__ADD_DEVICE, &ad);
+ if ( rc )
+ return rc;
+
+ dsid = domain_sid(d);
+ return avc_has_perm(dsid, rsid, SECCLASS_RESOURCE, RESOURCE__USE, &ad);
+}
+
+static int flask_deassign_dtdevice(struct domain *d, const char *dtpath)
+{
+ u32 rsid;
+ int rc = -EPERM;
+
+ rc = current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__REMOVE);
+ if ( rc )
+ return rc;
+
+ rc = security_devicetree_sid(dtpath, &rsid);
+ if ( rc )
+ return rc;
+
+ return avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__REMOVE_DEVICE,
+ NULL);
}
-#endif /* HAS_MEM_ACCESS */
+#endif /* HAS_PASSTHROUGH && HAS_DEVICE_TREE */
#ifdef CONFIG_X86
static int flask_do_mca(void)
@@ -1365,7 +1483,8 @@ static int flask_platform_op(uint32_t op)
return 0;
#endif
- case XENPF_settime:
+ case XENPF_settime32:
+ case XENPF_settime64:
return domain_has_xen(current->domain, XEN__SETTIME);
case XENPF_add_memtype:
@@ -1410,6 +1529,10 @@ static int flask_platform_op(uint32_t op)
return avc_current_has_perm(SECINITSID_XEN, SECCLASS_XEN2,
XEN2__RESOURCE_OP, NULL);
+ case XENPF_get_symbol:
+ return avc_has_perm(domain_sid(current->domain), SECINITSID_XEN,
+ SECCLASS_XEN2, XEN2__GET_SYMBOL, NULL);
+
default:
printk("flask_platform_op: Unknown op %d\n", op);
return -EPERM;
@@ -1471,34 +1594,27 @@ static int flask_priv_mapping(struct domain *d, struct domain *t)
return domain_has_perm(d, t, SECCLASS_MMU, MMU__TARGET_HACK);
}
-static int flask_bind_pt_irq (struct domain *d, struct xen_domctl_bind_pt_irq *bind)
+static int flask_pmu_op (struct domain *d, unsigned int op)
{
- u32 dsid, rsid;
- int rc = -EPERM;
- int irq;
- struct avc_audit_data ad;
-
- rc = current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__ADD);
- if ( rc )
- return rc;
-
- irq = domain_pirq_to_irq(d, bind->machine_irq);
-
- rc = get_irq_sid(irq, &rsid, &ad);
- if ( rc )
- return rc;
-
- rc = avc_current_has_perm(rsid, SECCLASS_HVM, HVM__BIND_IRQ, &ad);
- if ( rc )
- return rc;
-
- dsid = domain_sid(d);
- return avc_has_perm(dsid, rsid, SECCLASS_RESOURCE, RESOURCE__USE, &ad);
-}
+ u32 dsid = domain_sid(d);
-static int flask_unbind_pt_irq (struct domain *d, struct xen_domctl_bind_pt_irq *bind)
-{
- return current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__REMOVE);
+ switch ( op )
+ {
+ case XENPMU_mode_set:
+ case XENPMU_mode_get:
+ case XENPMU_feature_set:
+ case XENPMU_feature_get:
+ return avc_has_perm(dsid, SECINITSID_XEN, SECCLASS_XEN2,
+ XEN2__PMU_CTRL, NULL);
+ case XENPMU_init:
+ case XENPMU_finish:
+ case XENPMU_lvtpc_set:
+ case XENPMU_flush:
+ return avc_has_perm(dsid, SECINITSID_XEN, SECCLASS_XEN2,
+ XEN2__PMU_USE, NULL);
+ default:
+ return -EPERM;
+ }
}
#endif /* CONFIG_X86 */
@@ -1558,6 +1674,8 @@ static struct xsm_operations flask_ops = {
.map_domain_irq = flask_map_domain_irq,
.unmap_domain_pirq = flask_unmap_domain_pirq,
.unmap_domain_irq = flask_unmap_domain_irq,
+ .bind_pt_irq = flask_bind_pt_irq,
+ .unbind_pt_irq = flask_unbind_pt_irq,
.irq_permission = flask_irq_permission,
.iomem_permission = flask_iomem_permission,
.iomem_mapping = flask_iomem_mapping,
@@ -1573,14 +1691,29 @@ static struct xsm_operations flask_ops = {
.page_offline = flask_page_offline,
.tmem_op = flask_tmem_op,
- .tmem_control = flask_tmem_control,
.hvm_param = flask_hvm_param,
.hvm_control = flask_hvm_param,
.hvm_param_nested = flask_hvm_param_nested,
+ .hvm_param_altp2mhvm = flask_hvm_param_altp2mhvm,
+ .hvm_altp2mhvm_op = flask_hvm_altp2mhvm_op,
.do_xsm_op = do_flask_op,
.get_vnumainfo = flask_get_vnumainfo,
+ .vm_event_control = flask_vm_event_control,
+
+#ifdef HAS_MEM_ACCESS
+ .mem_access = flask_mem_access,
+#endif
+
+#ifdef HAS_MEM_PAGING
+ .mem_paging = flask_mem_paging,
+#endif
+
+#ifdef HAS_MEM_SHARING
+ .mem_sharing = flask_mem_sharing,
+#endif
+
#ifdef CONFIG_COMPAT
.do_compat_op = compat_flask_op,
#endif
@@ -1596,9 +1729,10 @@ static struct xsm_operations flask_ops = {
.deassign_device = flask_deassign_device,
#endif
-#ifdef HAS_MEM_ACCESS
- .mem_event_control = flask_mem_event_control,
- .mem_event_op = flask_mem_event_op,
+#if defined(HAS_PASSTHROUGH) && defined(HAS_DEVICE_TREE)
+ .test_assign_dtdevice = flask_test_assign_dtdevice,
+ .assign_dtdevice = flask_assign_dtdevice,
+ .deassign_dtdevice = flask_deassign_dtdevice,
#endif
#ifdef CONFIG_X86
@@ -1618,24 +1752,31 @@ static struct xsm_operations flask_ops = {
.mmuext_op = flask_mmuext_op,
.update_va_mapping = flask_update_va_mapping,
.priv_mapping = flask_priv_mapping,
- .bind_pt_irq = flask_bind_pt_irq,
- .unbind_pt_irq = flask_unbind_pt_irq,
.ioport_permission = flask_ioport_permission,
.ioport_mapping = flask_ioport_mapping,
+ .pmu_op = flask_pmu_op,
#endif
};
-static __init int flask_init(void)
+static __init void flask_init(void)
{
- int ret = 0;
+ int ret = -ENOENT;
- if ( !flask_enabled )
+ switch ( flask_bootparam )
{
- printk("Flask: Disabled at boot.\n");
- return 0;
- }
+ case FLASK_BOOTPARAM_DISABLED:
+ printk(XENLOG_INFO "Flask: Disabled at boot.\n");
+ return;
- printk("Flask: Initializing.\n");
+ case FLASK_BOOTPARAM_PERMISSIVE:
+ case FLASK_BOOTPARAM_ENFORCING:
+ case FLASK_BOOTPARAM_LATELOAD:
+ break;
+
+ case FLASK_BOOTPARAM_INVALID:
+ default:
+ panic("Flask: Invalid value for flask= boot parameter.\n");
+ }
avc_init();
@@ -1643,14 +1784,18 @@ static __init int flask_init(void)
if ( register_xsm(&flask_ops) )
panic("Flask: Unable to register with XSM");
- ret = security_load_policy(policy_buffer, policy_size);
+ if ( policy_size && flask_bootparam != FLASK_BOOTPARAM_LATELOAD )
+ ret = security_load_policy(policy_buffer, policy_size);
- if ( flask_enforcing )
- printk("Flask: Starting in enforcing mode.\n");
- else
- printk("Flask: Starting in permissive mode.\n");
+ if ( ret && flask_bootparam == FLASK_BOOTPARAM_ENFORCING )
+ panic("Unable to load FLASK policy");
- return ret;
+ if ( ret )
+ printk(XENLOG_INFO "Flask: Access controls disabled until policy is loaded.\n");
+ else if ( flask_enforcing )
+ printk(XENLOG_INFO "Flask: Starting in enforcing mode.\n");
+ else
+ printk(XENLOG_INFO "Flask: Starting in permissive mode.\n");
}
xsm_initcall(flask_init);
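
Taken together, parse_flask_param() and the new flask_init() replace the old flask_enabled/flask_enforcing integer parameters with a single flask= option (flask_enforcing remains available as a boolean alias). Based on the code above, the four accepted values behave as follows:

    flask=permissive   load any boot-time policy, start permissive (default)
    flask=enforcing    load the policy and enforce it; panic if none loads
    flask=late         register FLASK but defer policy loading until one is
                       supplied at runtime (enforcing once loaded)
    flask=disabled     do not initialise FLASK at all

Any other value leaves flask_bootparam at FLASK_BOOTPARAM_INVALID, which flask_init() turns into a panic.
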
diff --git a/xen/xsm/flask/include/avc.h b/xen/xsm/flask/include/avc.h
index 42a5e4b..4283562 100644
--- a/xen/xsm/flask/include/avc.h
+++ b/xen/xsm/flask/include/avc.h
@@ -17,11 +17,7 @@
#include "av_permissions.h"
#include "security.h"
-#ifdef FLASK_DEVELOP
-extern int flask_enforcing;
-#else
-#define flask_enforcing 1
-#endif
+extern bool_t flask_enforcing;
/*
* An entry in the AVC.
@@ -43,6 +39,7 @@ struct avc_audit_data {
#define AVC_AUDIT_DATA_IRQ 2
#define AVC_AUDIT_DATA_RANGE 3
#define AVC_AUDIT_DATA_MEMORY 4
+#define AVC_AUDIT_DATA_DTDEV 5
struct domain *sdom;
struct domain *tdom;
union {
@@ -56,6 +53,7 @@ struct avc_audit_data {
unsigned long pte;
unsigned long mfn;
} memory;
+ const char *dtdev;
};
};
diff --git a/xen/xsm/flask/include/security.h b/xen/xsm/flask/include/security.h
index 348f018..34bbe62 100644
--- a/xen/xsm/flask/include/security.h
+++ b/xen/xsm/flask/include/security.h
@@ -30,17 +30,26 @@
#define POLICYDB_VERSION_POLCAP 22
#define POLICYDB_VERSION_PERMISSIVE 23
#define POLICYDB_VERSION_BOUNDARY 24
+#define POLICYDB_VERSION_FILENAME_TRANS 25
+#define POLICYDB_VERSION_ROLETRANS 26
+#define POLICYDB_VERSION_NEW_OBJECT_DEFAULTS 27
+#define POLICYDB_VERSION_DEFAULT_TYPE 28
+#define POLICYDB_VERSION_CONSTRAINT_NAMES 29
+#define POLICYDB_VERSION_XEN_DEVICETREE 30
/* Range of policy versions we understand */
#define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE
-#define POLICYDB_VERSION_MAX POLICYDB_VERSION_BOUNDARY
-
-#ifdef FLASK_BOOTPARAM
-extern int flask_enabled;
-#else
-#define flask_enabled 1
-#endif
+#define POLICYDB_VERSION_MAX POLICYDB_VERSION_XEN_DEVICETREE
+
+enum flask_bootparam_t {
+ FLASK_BOOTPARAM_PERMISSIVE,
+ FLASK_BOOTPARAM_ENFORCING,
+ FLASK_BOOTPARAM_LATELOAD,
+ FLASK_BOOTPARAM_DISABLED,
+ FLASK_BOOTPARAM_INVALID,
+};
+extern enum flask_bootparam_t flask_bootparam;
extern int flask_mls_enabled;
int security_load_policy(void * data, size_t len);
@@ -79,6 +88,8 @@ int security_ioport_sid(u32 ioport, u32 *out_sid);
int security_device_sid(u32 device, u32 *out_sid);
+int security_devicetree_sid(const char *path, u32 *out_sid);
+
int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid,
u16 tclass);
@@ -93,5 +104,7 @@ int security_iterate_ioport_sids(u32 start, u32 end,
int security_ocontext_add(u32 ocontext, unsigned long low,
unsigned long high, u32 sid);
-int security_ocontext_del(u32 ocontext, unsigned int low, unsigned int high);
+int security_ocontext_del(u32 ocontext, unsigned long low, unsigned long high);
+
+int security_devicetree_setlabel(char *path, u32 sid);
#endif /* _FLASK_SECURITY_H_ */
diff --git a/xen/xsm/flask/policy/access_vectors b/xen/xsm/flask/policy/access_vectors
index 1da9f63..0aa68f8 100644
--- a/xen/xsm/flask/policy/access_vectors
+++ b/xen/xsm/flask/policy/access_vectors
@@ -8,7 +8,8 @@
# executing the hypercall, and the target is the xen initial sid (type xen_t).
class xen
{
-# XENPF_settime
+# XENPF_settime32
+# XENPF_settime64
settime
# XEN_SYSCTL_tbuf_op
tbufcontrol
@@ -26,7 +27,8 @@ class xen
mtrr_read
# XENPF_microcode_update
microcode
-# XEN_SYSCTL_physinfo, XEN_SYSCTL_topologyinfo, XEN_SYSCTL_numainfo
+# XEN_SYSCTL_physinfo, XEN_SYSCTL_cputopoinfo, XEN_SYSCTL_numainfo
+# XEN_SYSCTL_pcitopoinfo
physinfo
# XENPF_platform_quirk
quirk
@@ -67,7 +69,7 @@ class xen
cpupool_op
# tmem hypercall (any access)
tmem_op
-# TMEM_CONTROL command of tmem hypercall
+# XEN_SYSCTL_tmem_op command of tmem (part of sysctl)
tmem_control
# XEN_SYSCTL_scheduler_op with XEN_DOMCTL_SCHEDOP_getinfo, XEN_SYSCTL_sched_id
getscheduler
@@ -83,6 +85,14 @@ class xen2
resource_op
# XEN_SYSCTL_psr_cmt_op
psr_cmt_op
+# XEN_SYSCTL_psr_cat_op
+ psr_cat_op
+# XENPF_get_symbol
+ get_symbol
+# PMU control
+ pmu_ctrl
+# PMU use (domains, including unprivileged ones, will be using this operation)
+ pmu_use
}
# Classes domain and domain2 consist of operations that a domain performs on
@@ -218,8 +228,18 @@ class domain2
get_vnumainfo
# XEN_DOMCTL_psr_cmt_op
psr_cmt_op
-# XEN_DOMCTL_configure_domain
- configure_domain
+# XEN_DOMCTL_set_access_required
+# XEN_DOMCTL_monitor_op
+# XEN_DOMCTL_vm_event_op
+ vm_event
+# XENMEM_access_op
+ mem_access
+# XENMEM_paging_op
+ mem_paging
+# XENMEM_sharing_op
+ mem_sharing
+# XEN_DOMCTL_psr_cat_op
+ psr_cat_op
}
# Similar to class domain, but primarily contains domctls related to HVM domains
@@ -248,8 +268,6 @@ class hvm
# HVMOP_set_mem_access, HVMOP_get_mem_access, HVMOP_pagetable_dying,
# HVMOP_inject_trap
hvmctl
-# XEN_DOMCTL_set_access_required
- mem_event
# XEN_DOMCTL_mem_sharing_op and XENMEM_sharing_op_{share,add_physmap} with:
# source = the domain making the hypercall
# target = domain whose memory is being shared
@@ -264,6 +282,13 @@ class hvm
share_mem
# HVMOP_set_param setting HVM_PARAM_NESTEDHVM
nested
+# HVMOP_set_param setting HVM_PARAM_ALTP2MHVM
+ altp2mhvm
+# HVMOP_altp2m_set_domain_state HVMOP_altp2m_get_domain_state
+# HVMOP_altp2m_vcpu_enable_notify HVMOP_altp2m_create_p2m
+# HVMOP_altp2m_destroy_p2m HVMOP_altp2m_switch_p2m
+# HVMOP_altp2m_set_mem_access HVMOP_altp2m_change_gfn
+ altp2mhvm_op
}
# Class event describes event channels. Interdomain event channels have their
@@ -331,7 +356,7 @@ class mmu
# source = domain making the hypercall
# target = domain whose pages are being mapped
map_write
-# XEN_DOMCTL_getpageframeinfo*
+# XEN_DOMCTL_getpageframeinfo3
pageinfo
# XEN_DOMCTL_getmemlist
pagelist
@@ -417,7 +442,7 @@ class resource
remove_iomem
# XEN_DOMCTL_get_device_group, XEN_DOMCTL_test_assign_device:
# source = domain making the hypercall
-# target = PCI device being queried
+# target = device being queried
stat_device
# XEN_DOMCTL_assign_device
add_device
diff --git a/xen/xsm/flask/policy/initial_sids b/xen/xsm/flask/policy/initial_sids
index e508bde..7eca70d 100644
--- a/xen/xsm/flask/policy/initial_sids
+++ b/xen/xsm/flask/policy/initial_sids
@@ -13,4 +13,6 @@ sid ioport
sid iomem
sid irq
sid device
+sid domU
+sid domDM
# FLASK
diff --git a/xen/xsm/flask/policy/mkaccess_vector.sh b/xen/xsm/flask/policy/mkaccess_vector.sh
index 8ec87f7..7fa4aaf 100644
--- a/xen/xsm/flask/policy/mkaccess_vector.sh
+++ b/xen/xsm/flask/policy/mkaccess_vector.sh
@@ -42,7 +42,7 @@ $1 == "class" {
}
av_defined[tclass] = 1;
- permission = 1;
+ permission = 0;
nextstate = "INHERITS_OR_CLASS-OPENBRACKET";
next;
@@ -108,8 +108,8 @@ $1 == "{" {
for (i = 0; i < spaces; i++)
printf(" ") > outfile;
- printf("0x%08xUL\n", permission) > outfile;
- permission = permission * 2;
+ printf("(1UL << %u)\n", permission) > outfile;
+ permission = permission + 1;
}
$1 == "}" {
if (nextstate != "CLASS-CLOSEBRACKET" &&
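
Two coordinated changes in this generator: the permission counter now starts at 0 and increments by one, and each define is emitted as a shift rather than a precomputed constant doubled on every line. The generated av_permissions.h values are unchanged; only their spelling differs. Illustratively, for the first permission of class xen:

    /* before */ #define XEN__SETTIME    0x00000001UL
    /* after  */ #define XEN__SETTIME    (1UL << 0)

The shift form makes the bit position explicit and avoids relying on awk's floating-point doubling for high bit positions.
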
diff --git a/xen/xsm/flask/policy/mkflask.sh b/xen/xsm/flask/policy/mkflask.sh
index e8d8fb5..989a323 100644
--- a/xen/xsm/flask/policy/mkflask.sh
+++ b/xen/xsm/flask/policy/mkflask.sh
@@ -8,10 +8,13 @@ set -e
awk=$1
shift 1
+output_dir=$1
+shift 1
+
# output file
-output_file="include/flask.h"
-debug_file="include/class_to_string.h"
-debug_file2="include/initial_sid_to_string.h"
+output_file="$output_dir/flask.h"
+debug_file="$output_dir/class_to_string.h"
+debug_file2="$output_dir/initial_sid_to_string.h"
cat $* | $awk "
BEGIN {
@@ -25,6 +28,7 @@ BEGIN {
printf("#ifndef _SELINUX_FLASK_H_\n") > outfile;
printf("#define _SELINUX_FLASK_H_\n") > outfile;
+ printf("\n#if defined(__XEN__) || defined(__XEN_TOOLS__)\n") > outfile;
printf("\n/*\n * Security object class definitions\n */\n") > outfile;
printf("/* This file is automatically generated. Do not edit. */\n") > debugfile;
printf("/*\n * Security object class definitions\n */\n") > debugfile;
@@ -88,6 +92,7 @@ END {
for (i = 0; i < 34; i++)
printf(" ") > outfile;
printf("%d\n", sid_value) > outfile;
+ printf("\n#endif /* __XEN__ || __XEN_TOOLS__ */\n") > outfile;
printf("\n#endif\n") > outfile;
printf("};\n\n") > debugfile2;
}'
diff --git a/xen/xsm/flask/ss/policydb.c b/xen/xsm/flask/ss/policydb.c
index 50b2c78..a1060b1 100644
--- a/xen/xsm/flask/ss/policydb.c
+++ b/xen/xsm/flask/ss/policydb.c
@@ -74,55 +74,55 @@ static struct policydb_compat_info policydb_compat[] = {
{
.version = POLICYDB_VERSION_BASE,
.sym_num = SYM_NUM - 3,
- .ocon_num = OCON_NUM - 1,
+ .ocon_num = 4,
.target_type = TARGET_XEN_OLD,
},
{
.version = POLICYDB_VERSION_BOOL,
.sym_num = SYM_NUM - 2,
- .ocon_num = OCON_NUM - 1,
+ .ocon_num = 4,
.target_type = TARGET_XEN_OLD,
},
{
.version = POLICYDB_VERSION_IPV6,
.sym_num = SYM_NUM - 2,
- .ocon_num = OCON_NUM,
+ .ocon_num = 5,
.target_type = TARGET_XEN_OLD,
},
{
.version = POLICYDB_VERSION_NLCLASS,
.sym_num = SYM_NUM - 2,
- .ocon_num = OCON_NUM,
+ .ocon_num = 5,
.target_type = TARGET_XEN_OLD,
},
{
.version = POLICYDB_VERSION_MLS,
.sym_num = SYM_NUM,
- .ocon_num = OCON_NUM,
+ .ocon_num = 5,
.target_type = TARGET_XEN_OLD,
},
{
.version = POLICYDB_VERSION_AVTAB,
.sym_num = SYM_NUM,
- .ocon_num = OCON_NUM,
+ .ocon_num = 5,
.target_type = TARGET_XEN_OLD,
},
{
.version = POLICYDB_VERSION_RANGETRANS,
.sym_num = SYM_NUM,
- .ocon_num = OCON_NUM,
+ .ocon_num = 5,
.target_type = TARGET_XEN_OLD,
},
{
.version = POLICYDB_VERSION_POLCAP,
.sym_num = SYM_NUM,
- .ocon_num = OCON_NUM,
+ .ocon_num = 5,
.target_type = TARGET_XEN_OLD,
},
{
.version = POLICYDB_VERSION_PERMISSIVE,
.sym_num = SYM_NUM,
- .ocon_num = OCON_NUM,
+ .ocon_num = 5,
.target_type = TARGET_XEN_OLD,
},
{
@@ -134,7 +134,13 @@ static struct policydb_compat_info policydb_compat[] = {
{
.version = POLICYDB_VERSION_BOUNDARY,
.sym_num = SYM_NUM,
- .ocon_num = OCON_NUM,
+ .ocon_num = OCON_DEVICE + 1,
+ .target_type = TARGET_XEN,
+ },
+ {
+ .version = POLICYDB_VERSION_XEN_DEVICETREE,
+ .sym_num = SYM_NUM,
+ .ocon_num = OCON_DTREE + 1,
.target_type = TARGET_XEN,
},
};
@@ -634,7 +640,7 @@ static void ocontext_destroy(struct ocontext *c, int i)
{
context_destroy(&c->context[0]);
context_destroy(&c->context[1]);
- if ( i == OCON_ISID )
+ if ( i == OCON_ISID || i == OCON_DTREE )
xfree(c->u.name);
xfree(c);
}
@@ -682,17 +688,17 @@ void policydb_destroy(struct policydb *p)
for ( tr = p->role_tr; tr; tr = tr->next )
{
- if ( ltr ) xfree(ltr);
+ xfree(ltr);
ltr = tr;
}
- if ( ltr ) xfree(ltr);
+ xfree(ltr);
for ( ra = p->role_allow; ra; ra = ra -> next )
{
- if ( lra ) xfree(lra);
+ xfree(lra);
lra = ra;
}
- if ( lra ) xfree(lra);
+ xfree(lra);
for ( rt = p->range_tr; rt; rt = rt -> next )
{
@@ -1040,8 +1046,8 @@ bad:
goto out;
}
-static int read_cons_helper(struct constraint_node **nodep, int ncons,
- int allowxtarget, void *fp)
+static int read_cons_helper(struct policydb *p, struct constraint_node **nodep,
+ int ncons, int allowxtarget, void *fp)
{
struct constraint_node *c, *lc;
struct constraint_expr *e, *le;
@@ -1115,6 +1121,23 @@ static int read_cons_helper(struct constraint_node **nodep, int ncons,
depth++;
if ( ebitmap_read(&e->names, fp) )
return -EINVAL;
+ if ( p->policyvers >= POLICYDB_VERSION_CONSTRAINT_NAMES )
+ {
+ struct ebitmap dummy;
+ ebitmap_init(&dummy);
+ if ( ebitmap_read(&dummy, fp) )
+ return -EINVAL;
+ ebitmap_destroy(&dummy);
+
+ ebitmap_init(&dummy);
+ if ( ebitmap_read(&dummy, fp) )
+ return -EINVAL;
+ ebitmap_destroy(&dummy);
+
+ rc = next_entry(buf, fp, sizeof(u32));
+ if ( rc < 0 )
+ return rc;
+ }
break;
default:
return -EINVAL;
@@ -1184,7 +1207,7 @@ static int class_read(struct policydb *p, struct hashtab *h, void *fp)
goto bad;
}
- rc = read_cons_helper(&cladatum->constraints, ncons, 0, fp);
+ rc = read_cons_helper(p, &cladatum->constraints, ncons, 0, fp);
if ( rc )
goto bad;
@@ -1195,11 +1218,27 @@ static int class_read(struct policydb *p, struct hashtab *h, void *fp)
if ( rc < 0 )
goto bad;
ncons = le32_to_cpu(buf[0]);
- rc = read_cons_helper(&cladatum->validatetrans, ncons, 1, fp);
+ rc = read_cons_helper(p, &cladatum->validatetrans, ncons, 1, fp);
if ( rc )
goto bad;
}
+ if ( p->policyvers >= POLICYDB_VERSION_NEW_OBJECT_DEFAULTS )
+ {
+ rc = next_entry(buf, fp, sizeof(u32) * 3);
+ if ( rc )
+ goto bad;
+ /* these values are ignored by Xen */
+ }
+
+ if ( p->policyvers >= POLICYDB_VERSION_DEFAULT_TYPE )
+ {
+ rc = next_entry(buf, fp, sizeof(u32) * 1);
+ if ( rc )
+ goto bad;
+ /* ignored by Xen */
+ }
+
rc = hashtab_insert(h, key, cladatum);
if ( rc )
goto bad;
@@ -1874,7 +1913,10 @@ int policydb_read(struct policydb *p, void *fp)
ltr->next = tr;
else
p->role_tr = tr;
- rc = next_entry(buf, fp, sizeof(u32)*3);
+ if ( p->policyvers >= POLICYDB_VERSION_ROLETRANS )
+ rc = next_entry(buf, fp, sizeof(u32)*4);
+ else
+ rc = next_entry(buf, fp, sizeof(u32)*3);
if ( rc < 0 )
goto bad;
tr->role = le32_to_cpu(buf[0]);
@@ -1921,6 +1963,20 @@ int policydb_read(struct policydb *p, void *fp)
lra = ra;
}
+ if ( p->policyvers >= POLICYDB_VERSION_FILENAME_TRANS )
+ {
+ rc = next_entry(buf, fp, sizeof(u32));
+ if ( rc )
+ goto bad;
+ nel = le32_to_cpu(buf[0]);
+ if ( nel )
+ {
+ printk(KERN_ERR "Flask: unsupported genfs config data\n");
+ rc = -EINVAL;
+ goto bad;
+ }
+ }
+
rc = policydb_index_classes(p);
if ( rc )
goto bad;
@@ -1999,11 +2055,23 @@ int policydb_read(struct policydb *p, void *fp)
"Old xen policy does not support iomemcon");
goto bad;
}
- rc = next_entry(buf, fp, sizeof(u32) *2);
- if ( rc < 0 )
- goto bad;
- c->u.iomem.low_iomem = le32_to_cpu(buf[0]);
- c->u.iomem.high_iomem = le32_to_cpu(buf[1]);
+ if ( p->policyvers >= POLICYDB_VERSION_XEN_DEVICETREE )
+ {
+ u64 b64[2];
+ rc = next_entry(b64, fp, sizeof(u64) *2);
+ if ( rc < 0 )
+ goto bad;
+ c->u.iomem.low_iomem = le64_to_cpu(b64[0]);
+ c->u.iomem.high_iomem = le64_to_cpu(b64[1]);
+ }
+ else
+ {
+ rc = next_entry(buf, fp, sizeof(u32) *2);
+ if ( rc < 0 )
+ goto bad;
+ c->u.iomem.low_iomem = le32_to_cpu(buf[0]);
+ c->u.iomem.high_iomem = le32_to_cpu(buf[1]);
+ }
rc = context_read_and_validate(&c->context[0], p, fp);
if ( rc )
goto bad;
@@ -2023,6 +2091,29 @@ int policydb_read(struct policydb *p, void *fp)
if ( rc )
goto bad;
break;
+ case OCON_DTREE:
+ if ( p->target_type != TARGET_XEN )
+ {
+ printk(KERN_ERR
+ "Old xen policy does not support devicetreecon");
+ goto bad;
+ }
+ rc = next_entry(buf, fp, sizeof(u32));
+ if ( rc < 0 )
+ goto bad;
+ len = le32_to_cpu(buf[0]);
+ rc = -ENOMEM;
+ c->u.name = xmalloc_array(char, len + 1);
+ if (!c->u.name)
+ goto bad;
+ rc = next_entry(c->u.name, fp, len);
+ if ( rc < 0 )
+ goto bad;
+ c->u.name[len] = 0;
+ rc = context_read_and_validate(&c->context[0], p, fp);
+ if ( rc )
+ goto bad;
+ break;
default:
printk(KERN_ERR
"Flask: unsupported object context config data\n");
diff --git a/xen/xsm/flask/ss/policydb.h b/xen/xsm/flask/ss/policydb.h
index b176300..30be71a 100644
--- a/xen/xsm/flask/ss/policydb.h
+++ b/xen/xsm/flask/ss/policydb.h
@@ -154,8 +154,8 @@ struct ocontext {
u32 high_ioport;
} ioport;
struct {
- u32 low_iomem;
- u32 high_iomem;
+ u64 low_iomem;
+ u64 high_iomem;
} iomem;
} u;
struct context context[2]; /* security context(s) */
@@ -180,7 +180,8 @@ struct ocontext {
#define OCON_IOPORT 2 /* io ports */
#define OCON_IOMEM 3 /* io memory */
#define OCON_DEVICE 4 /* pci devices */
-#define OCON_NUM 5
+#define OCON_DTREE 5 /* device tree nodes */
+#define OCON_NUM 6
#define OCON_NUM_OLD 7
/* The policy database */
diff --git a/xen/xsm/flask/ss/services.c b/xen/xsm/flask/ss/services.c
index f0e459a..f31d7d7 100644
--- a/xen/xsm/flask/ss/services.c
+++ b/xen/xsm/flask/ss/services.c
@@ -1831,6 +1831,41 @@ out:
return rc;
}
+int security_devicetree_sid(const char *path, u32 *out_sid)
+{
+ struct ocontext *c;
+ int rc = 0;
+
+ POLICY_RDLOCK;
+
+ c = policydb.ocontexts[OCON_DTREE];
+ while ( c )
+ {
+ if ( strcmp(c->u.name, path) == 0 )
+ break;
+ c = c->next;
+ }
+
+ if ( c )
+ {
+ if ( !c->sid[0] )
+ {
+ rc = sidtab_context_to_sid(&sidtab, &c->context[0], &c->sid[0]);
+ if ( rc )
+ goto out;
+ }
+ *out_sid = c->sid[0];
+ }
+ else
+ {
+ *out_sid = SECINITSID_DEVICE;
+ }
+
+out:
+ POLICY_RDUNLOCK;
+ return rc;
+}
+
int security_find_bool(const char *name)
{
int i, rv = -ENOENT;
@@ -2131,9 +2166,8 @@ int security_ocontext_add( u32 ocon, unsigned long low, unsigned long high
c->u.iomem.high_iomem == high && c->sid[0] == sid)
break;
- printk("%s: IO Memory overlap with entry %#x - %#x\n",
- __FUNCTION__, c->u.iomem.low_iomem,
- c->u.iomem.high_iomem);
+ printk("%s: IO Memory overlap with entry %#"PRIx64" - %#"PRIx64"\n",
+ __FUNCTION__, c->u.iomem.low_iomem, c->u.iomem.high_iomem);
ret = -EEXIST;
break;
}
@@ -2188,7 +2222,7 @@ int security_ocontext_add( u32 ocon, unsigned long low, unsigned long high
return ret;
}
-int security_ocontext_del( u32 ocon, unsigned int low, unsigned int high )
+int security_ocontext_del( u32 ocon, unsigned long low, unsigned long high )
{
int ret = 0;
struct ocontext *c, *before_c;
@@ -2217,7 +2251,7 @@ int security_ocontext_del( u32 ocon, unsigned int low, unsigned int high )
}
}
- printk("%s: ocontext not found: pirq %d\n", __FUNCTION__, low);
+ printk("%s: ocontext not found: pirq %ld\n", __FUNCTION__, low);
ret = -ENOENT;
break;
@@ -2243,8 +2277,8 @@ int security_ocontext_del( u32 ocon, unsigned int low, unsigned int high )
}
}
- printk("%s: ocontext not found: ioport %#x - %#x\n", __FUNCTION__,
- low, high);
+ printk("%s: ocontext not found: ioport %#lx - %#lx\n",
+ __FUNCTION__, low, high);
ret = -ENOENT;
break;
@@ -2270,8 +2304,8 @@ int security_ocontext_del( u32 ocon, unsigned int low, unsigned int high )
}
}
- printk("%s: ocontext not found: iomem %#x - %#x\n", __FUNCTION__,
- low, high);
+ printk("%s: ocontext not found: iomem %#lx - %#lx\n",
+ __FUNCTION__, low, high);
ret = -ENOENT;
break;
@@ -2296,7 +2330,7 @@ int security_ocontext_del( u32 ocon, unsigned int low, unsigned int high )
}
}
- printk("%s: ocontext not found: pcidevice %#x\n", __FUNCTION__, low);
+ printk("%s: ocontext not found: pcidevice %#lx\n", __FUNCTION__, low);
ret = -ENOENT;
break;
@@ -2308,3 +2342,67 @@ int security_ocontext_del( u32 ocon, unsigned int low, unsigned int high )
POLICY_WRUNLOCK;
return ret;
}
+
+int security_devicetree_setlabel(char *path, u32 sid)
+{
+ int ret = 0;
+ struct ocontext *c;
+ struct ocontext **pcurr;
+ struct ocontext *add = NULL;
+
+ if ( sid )
+ {
+ add = xzalloc(struct ocontext);
+ if ( add == NULL )
+ {
+ xfree(path);
+ return -ENOMEM;
+ }
+ add->sid[0] = sid;
+ add->u.name = path;
+ }
+ else
+ {
+ ret = -ENOENT;
+ }
+
+ POLICY_WRLOCK;
+
+ pcurr = &policydb.ocontexts[OCON_DTREE];
+ c = *pcurr;
+ while ( c )
+ {
+ if ( strcmp(c->u.name, path) == 0 )
+ {
+ if ( sid )
+ {
+ ret = -EEXIST;
+ break;
+ }
+ else
+ {
+ *pcurr = c->next;
+ xfree(c->u.name);
+ xfree(c);
+ ret = 0;
+ break;
+ }
+ }
+ pcurr = &c->next;
+ c = *pcurr;
+ }
+
+ if ( add && ret == 0 )
+ {
+ add->next = policydb.ocontexts[OCON_DTREE];
+ policydb.ocontexts[OCON_DTREE] = add;
+ add = NULL;
+ path = NULL;
+ }
+
+ POLICY_WRUNLOCK;
+
+ xfree(add);
+ xfree(path);
+ return ret;
+}
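
Note the ownership contract of security_devicetree_setlabel(): it takes over 'path' on every route — on success the buffer is linked into the OCON_DTREE list, and on any failure it is xfree()d — which is what the "buf is consumed or freed" comment at its only caller, flask_devicetree_label() above, records. A caller therefore must not touch the string afterwards; a minimal sketch mirroring that caller:

    /* sketch: 'path' must be heap-allocated and is surrendered here */
    char *path = safe_copy_string_from_guest(arg->path, arg->length,
                                             PAGE_SIZE);

    if ( !IS_ERR(path) )
        rv = security_devicetree_setlabel(path, sid); /* no xfree(path) */
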
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-xen/xen.git