[kernel] r15297 - in dists/sid/linux-2.6/debian: . config patches/features/all/xen patches/series
Bastian Blank
waldi at alioth.debian.org
Sun Feb 28 19:10:20 UTC 2010
Author: waldi
Date: Sun Feb 28 19:10:15 2010
New Revision: 15297
Log:
Add Xen dom0 support.
* debian/changelog: Update.
* debian/config/defines: Enable xen featureset.
* debian/patches/features/all/xen/pvops.patch,
debian/patches/features/all/xen/pvops-updates.patch: Add.
* debian/patches/series/10-extra: Add new patches.
Added:
dists/sid/linux-2.6/debian/patches/features/all/xen/pvops-updates.patch
dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch
dists/sid/linux-2.6/debian/patches/series/10-extra
Modified:
dists/sid/linux-2.6/debian/changelog
dists/sid/linux-2.6/debian/config/defines
Modified: dists/sid/linux-2.6/debian/changelog
==============================================================================
--- dists/sid/linux-2.6/debian/changelog Sun Feb 28 19:00:52 2010 (r15296)
+++ dists/sid/linux-2.6/debian/changelog Sun Feb 28 19:10:15 2010 (r15297)
@@ -13,6 +13,9 @@
* agpgart: Reprobe VGA devices when a new GART device is added
(Closes: #570229)
+ [ Bastian Blank ]
+ * Add support for Xen dom0 into its featureset.
+
-- maximilian attems <maks at debian.org> Thu, 25 Feb 2010 13:07:47 +0100
linux-2.6 (2.6.32-9) unstable; urgency=high
Modified: dists/sid/linux-2.6/debian/config/defines
==============================================================================
--- dists/sid/linux-2.6/debian/config/defines Sun Feb 28 19:00:52 2010 (r15296)
+++ dists/sid/linux-2.6/debian/config/defines Sun Feb 28 19:10:15 2010 (r15297)
@@ -26,7 +26,7 @@
enabled: true
[featureset-xen_base]
-enabled: false
+enabled: true
[description]
part-long-xen: This kernel also runs on a Xen hypervisor.
Added: dists/sid/linux-2.6/debian/patches/features/all/xen/pvops-updates.patch
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ dists/sid/linux-2.6/debian/patches/features/all/xen/pvops-updates.patch Sun Feb 28 19:10:15 2010 (r15297)
@@ -0,0 +1,270 @@
+diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
+index 4953f9b..863e1c2 100644
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -397,6 +397,9 @@ static inline unsigned long pages_to_mb(unsigned long npg)
+ #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ remap_pfn_range(vma, vaddr, pfn, size, prot)
+
++#define arch_vm_get_page_prot arch_vm_get_page_prot
++extern pgprot_t arch_vm_get_page_prot(unsigned vm_flags);
++
+ #if PAGETABLE_LEVELS > 2
+ static inline int pud_none(pud_t pud)
+ {
+diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
+index c57a301..4e46931 100644
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -160,7 +160,7 @@ extern void cleanup_highmap(void);
+ #define pgtable_cache_init() do { } while (0)
+ #define check_pgt_cache() do { } while (0)
+
+-#define PAGE_AGP PAGE_KERNEL_NOCACHE
++#define PAGE_AGP PAGE_KERNEL_IO_NOCACHE
+ #define HAVE_PAGE_AGP 1
+
+ /* fs/proc/kcore.c */
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index 25fc1df..103e324 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -17,6 +17,16 @@
+
+ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
++pgprot_t arch_vm_get_page_prot(unsigned vm_flags)
++{
++ pgprot_t ret = __pgprot(0);
++
++ if (vm_flags & VM_IO)
++ ret = __pgprot(_PAGE_IOMAP);
++
++ return ret;
++}
++
+ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+ {
+ return (pte_t *)__get_free_page(PGALLOC_GFP);
+diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
+index 9bca04e..399a017 100644
+--- a/drivers/char/agp/intel-agp.c
++++ b/drivers/char/agp/intel-agp.c
+@@ -398,15 +398,19 @@ static void intel_i810_agp_enable(struct agp_bridge_data *bridge, u32 mode)
+ /* Exists to support ARGB cursors */
+ static struct page *i8xx_alloc_pages(void)
+ {
++ void *addr;
++ dma_addr_t _d;
+ struct page *page;
+
+- page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2);
+- if (page == NULL)
++ addr = dma_alloc_coherent(NULL, 4 * PAGE_SIZE, &_d, GFP_KERNEL);
++ if (addr == NULL)
+ return NULL;
+
++ page = virt_to_page(addr);
++
+ if (set_pages_uc(page, 4) < 0) {
+ set_pages_wb(page, 4);
+- __free_pages(page, 2);
++ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, _d);
+ return NULL;
+ }
+ get_page(page);
+@@ -416,12 +420,17 @@ static struct page *i8xx_alloc_pages(void)
+
+ static void i8xx_destroy_pages(struct page *page)
+ {
++ void *addr;
++
+ if (page == NULL)
+ return;
+
+ set_pages_wb(page, 4);
+ put_page(page);
+- __free_pages(page, 2);
++
++ addr = page_address(page);
++
++ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, virt_to_bus(addr));
+ atomic_dec(&agp_bridge->current_memory_agp);
+ }
+
+diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
+index a75ca63..bdc26b9 100644
+--- a/drivers/gpu/drm/drm_drv.c
++++ b/drivers/gpu/drm/drm_drv.c
+@@ -201,7 +201,7 @@ int drm_lastclose(struct drm_device * dev)
+ }
+ if (drm_core_check_feature(dev, DRIVER_SG) && dev->sg &&
+ !drm_core_check_feature(dev, DRIVER_MODESET)) {
+- drm_sg_cleanup(dev->sg);
++ drm_sg_cleanup(dev, dev->sg);
+ dev->sg = NULL;
+ }
+
+diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
+index 8bf3770..dde5f66 100644
+--- a/drivers/gpu/drm/drm_gem.c
++++ b/drivers/gpu/drm/drm_gem.c
+@@ -539,7 +539,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND;
+ vma->vm_ops = obj->dev->driver->gem_vm_ops;
+ vma->vm_private_data = map->handle;
+- vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
++ vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+
+ /* Take a ref for this mapping of the object, so that the fault
+ * handler can dereference the mmap offset's pointer to the object.
+diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c
+index c7823c8..95ffb8a 100644
+--- a/drivers/gpu/drm/drm_scatter.c
++++ b/drivers/gpu/drm/drm_scatter.c
+@@ -32,20 +32,73 @@
+ */
+
+ #include <linux/vmalloc.h>
++#include <linux/mm.h>
+ #include "drmP.h"
+
+ #define DEBUG_SCATTER 0
+
+-static inline void *drm_vmalloc_dma(unsigned long size)
++static void *drm_vmalloc_dma(struct drm_device *drmdev, unsigned long size)
+ {
+ #if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
+ return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL | _PAGE_NO_CACHE);
+ #else
+- return vmalloc_32(size);
++ struct device *dev = &drmdev->pdev->dev;
++ struct page **pages;
++ void *addr;
++ const int npages = PFN_UP(size);
++ int i;
++
++ pages = kmalloc(npages * sizeof(*pages), GFP_KERNEL);
++ if (!pages)
++ goto fail;
++
++ for (i = 0; i < npages; i++) {
++ dma_addr_t phys;
++ void *addr;
++ addr = dma_alloc_coherent(dev, PAGE_SIZE, &phys, GFP_KERNEL);
++ if (addr == NULL)
++ goto out_free_pages;
++
++ pages[i] = virt_to_page(addr);
++ }
++
++ addr = vmap(pages, npages, VM_MAP | VM_IOREMAP, PAGE_KERNEL);
++
++ kfree(pages);
++
++ return addr;
++
++out_free_pages:
++ while (i > 0) {
++ void *addr = page_address(pages[--i]);
++ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
++ }
++
++ kfree(pages);
++
++fail:
++ return NULL;
++#endif
++}
++
++static void drm_vfree_dma(struct drm_device *drmdev, void *addr, int npages,
++ struct page **pages)
++{
++#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
++ vfree(addr);
++#else
++ struct device *dev = &drmdev->pdev->dev;
++ int i;
++
++ for (i = 0; i < npages; i++) {
++ void *addr = page_address(pages[i]);
++ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
++ }
++ vunmap(addr);
+ #endif
+ }
+
+-void drm_sg_cleanup(struct drm_sg_mem * entry)
++void drm_sg_cleanup(struct drm_device *drmdev, struct drm_sg_mem * entry)
+ {
+ struct page *page;
+ int i;
+@@ -56,7 +109,7 @@ void drm_sg_cleanup(struct drm_sg_mem * entry)
+ ClearPageReserved(page);
+ }
+
+- vfree(entry->virtual);
++ drm_vfree_dma(drmdev, entry->virtual, entry->pages, entry->pagelist);
+
+ kfree(entry->busaddr);
+ kfree(entry->pagelist);
+@@ -107,7 +160,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request)
+ }
+ memset((void *)entry->busaddr, 0, pages * sizeof(*entry->busaddr));
+
+- entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT);
++ entry->virtual = drm_vmalloc_dma(dev, pages << PAGE_SHIFT);
+ if (!entry->virtual) {
+ kfree(entry->busaddr);
+ kfree(entry->pagelist);
+@@ -180,7 +233,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request)
+ return 0;
+
+ failed:
+- drm_sg_cleanup(entry);
++ drm_sg_cleanup(dev, entry);
+ return -ENOMEM;
+ }
+ EXPORT_SYMBOL(drm_sg_alloc);
+@@ -212,7 +265,7 @@ int drm_sg_free(struct drm_device *dev, void *data,
+
+ DRM_DEBUG("virtual = %p\n", entry->virtual);
+
+- drm_sg_cleanup(entry);
++ drm_sg_cleanup(dev, entry);
+
+ return 0;
+ }
+diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
+index 1c040d0..3dc8d6b 100644
+--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
++++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
+@@ -272,6 +272,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
+
+ vma->vm_private_data = bo;
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+ return 0;
+ out_unref:
+ ttm_bo_unref(&bo);
+@@ -287,6 +288,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
+ vma->vm_ops = &ttm_bo_vm_ops;
+ vma->vm_private_data = ttm_bo_reference(bo);
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+ return 0;
+ }
+ EXPORT_SYMBOL(ttm_fbdev_mmap);
+diff --git a/include/drm/drmP.h b/include/drm/drmP.h
+index 7ad3faa..cf9ddce 100644
+--- a/include/drm/drmP.h
++++ b/include/drm/drmP.h
+@@ -1388,7 +1388,7 @@ extern int drm_vma_info(struct seq_file *m, void *data);
+ #endif
+
+ /* Scatter Gather Support (drm_scatter.h) */
+-extern void drm_sg_cleanup(struct drm_sg_mem * entry);
++extern void drm_sg_cleanup(struct drm_device *dev, struct drm_sg_mem * entry);
+ extern int drm_sg_alloc_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+ extern int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request);
Added: dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch Sun Feb 28 19:10:15 2010 (r15297)
@@ -0,0 +1,20510 @@
+diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
+index 5bc4eaa..345c399 100644
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -2668,6 +2668,13 @@ and is between 256 and 4096 characters. It is defined in the file
+ medium is write-protected).
+ Example: quirks=0419:aaf5:rl,0421:0433:rc
+
++ userpte=
++ [X86] Flags controlling user PTE allocations.
++
++ nohigh = do not allocate PTE pages in
++ HIGHMEM regardless of setting
++ of CONFIG_HIGHPTE.
++
+ vdso= [X86,SH]
+ vdso=2: enable compat VDSO (default with COMPAT_VDSO)
+ vdso=1: enable VDSO (default)
+diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
+index 29a6ff8..81f9b94 100644
+--- a/Documentation/x86/x86_64/boot-options.txt
++++ b/Documentation/x86/x86_64/boot-options.txt
+@@ -267,10 +267,14 @@ IOMMU (input/output memory management unit)
+
+ iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
+ implementation:
+- swiotlb=<pages>[,force]
++ swiotlb=[npages=<pages>]
++ swiotlb=[force]
++ swiotlb=[overflow=<size>]
++
+ <pages> Prereserve that many 128K pages for the software IO
+ bounce buffering.
+ force Force all IO through the software TLB.
++ <size> Size in bytes of the overflow buffer.
+
+ Settings for the IBM Calgary hardware IOMMU currently found in IBM
+ pSeries and xSeries machines:
+diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
+index 8d3c79c..7d09a09 100644
+--- a/arch/ia64/include/asm/dma-mapping.h
++++ b/arch/ia64/include/asm/dma-mapping.h
+@@ -73,7 +73,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+ if (!dev->dma_mask)
+ return 0;
+
+- return addr + size <= *dev->dma_mask;
++ return addr + size - 1 <= *dev->dma_mask;
+ }
+
+ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h
+index dcbaea7..f0acde6 100644
+--- a/arch/ia64/include/asm/swiotlb.h
++++ b/arch/ia64/include/asm/swiotlb.h
+@@ -4,8 +4,6 @@
+ #include <linux/dma-mapping.h>
+ #include <linux/swiotlb.h>
+
+-extern int swiotlb_force;
+-
+ #ifdef CONFIG_SWIOTLB
+ extern int swiotlb;
+ extern void pci_swiotlb_init(void);
+diff --git a/arch/ia64/include/asm/xen/events.h b/arch/ia64/include/asm/xen/events.h
+index b8370c8..baa74c8 100644
+--- a/arch/ia64/include/asm/xen/events.h
++++ b/arch/ia64/include/asm/xen/events.h
+@@ -36,10 +36,6 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
+ return !(ia64_psr(regs)->i);
+ }
+
+-static inline void handle_irq(int irq, struct pt_regs *regs)
+-{
+- __do_IRQ(irq);
+-}
+ #define irq_ctx_init(cpu) do { } while (0)
+
+ #endif /* _ASM_IA64_XEN_EVENTS_H */
+diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
+index 285aae8..53292ab 100644
+--- a/arch/ia64/kernel/pci-swiotlb.c
++++ b/arch/ia64/kernel/pci-swiotlb.c
+@@ -41,7 +41,7 @@ struct dma_map_ops swiotlb_dma_ops = {
+ void __init swiotlb_dma_init(void)
+ {
+ dma_ops = &swiotlb_dma_ops;
+- swiotlb_init();
++ swiotlb_init(1);
+ }
+
+ void __init pci_swiotlb_init(void)
+@@ -51,7 +51,7 @@ void __init pci_swiotlb_init(void)
+ swiotlb = 1;
+ printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
+ machvec_init("dig");
+- swiotlb_init();
++ swiotlb_init(1);
+ dma_ops = &swiotlb_dma_ops;
+ #else
+ panic("Unable to find Intel IOMMU");
+diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
+index e281dae..80a973b 100644
+--- a/arch/powerpc/include/asm/dma-mapping.h
++++ b/arch/powerpc/include/asm/dma-mapping.h
+@@ -197,7 +197,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+ if (!dev->dma_mask)
+ return 0;
+
+- return addr + size <= *dev->dma_mask;
++ return addr + size - 1 <= *dev->dma_mask;
+ }
+
+ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
+index 53bcf3d..b152de3 100644
+--- a/arch/powerpc/kernel/setup_32.c
++++ b/arch/powerpc/kernel/setup_32.c
+@@ -345,7 +345,7 @@ void __init setup_arch(char **cmdline_p)
+
+ #ifdef CONFIG_SWIOTLB
+ if (ppc_swiotlb_enable)
+- swiotlb_init();
++ swiotlb_init(1);
+ #endif
+
+ paging_init();
+diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
+index 04f638d..df2c9e9 100644
+--- a/arch/powerpc/kernel/setup_64.c
++++ b/arch/powerpc/kernel/setup_64.c
+@@ -550,7 +550,7 @@ void __init setup_arch(char **cmdline_p)
+
+ #ifdef CONFIG_SWIOTLB
+ if (ppc_swiotlb_enable)
+- swiotlb_init();
++ swiotlb_init(1);
+ #endif
+
+ paging_init();
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
+index 4fdb669..fd612c0 100644
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -1875,6 +1875,10 @@ config PCI_OLPC
+ def_bool y
+ depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
+
++config PCI_XEN
++ bool
++ select SWIOTLB
++
+ config PCI_DOMAINS
+ def_bool y
+ depends on PCI
+diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
+index 18aa3f8..4413ba4 100644
+--- a/arch/x86/include/asm/amd_iommu.h
++++ b/arch/x86/include/asm/amd_iommu.h
+@@ -23,20 +23,16 @@
+ #include <linux/irqreturn.h>
+
+ #ifdef CONFIG_AMD_IOMMU
+-extern int amd_iommu_init(void);
+ extern int amd_iommu_init_dma_ops(void);
+ extern int amd_iommu_init_passthrough(void);
+ extern void amd_iommu_detect(void);
+ extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+ extern void amd_iommu_flush_all_domains(void);
+ extern void amd_iommu_flush_all_devices(void);
+-extern void amd_iommu_shutdown(void);
+ extern void amd_iommu_apply_erratum_63(u16 devid);
+ extern void amd_iommu_init_api(void);
+ #else
+-static inline int amd_iommu_init(void) { return -ENODEV; }
+ static inline void amd_iommu_detect(void) { }
+-static inline void amd_iommu_shutdown(void) { }
+ #endif
+
+ #endif /* _ASM_X86_AMD_IOMMU_H */
+diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
+index b03bedb..0918654 100644
+--- a/arch/x86/include/asm/calgary.h
++++ b/arch/x86/include/asm/calgary.h
+@@ -62,10 +62,8 @@ struct cal_chipset_ops {
+ extern int use_calgary;
+
+ #ifdef CONFIG_CALGARY_IOMMU
+-extern int calgary_iommu_init(void);
+ extern void detect_calgary(void);
+ #else
+-static inline int calgary_iommu_init(void) { return 1; }
+ static inline void detect_calgary(void) { return; }
+ #endif
+
+diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
+index 6a25d5d..ac91eed 100644
+--- a/arch/x86/include/asm/dma-mapping.h
++++ b/arch/x86/include/asm/dma-mapping.h
+@@ -20,7 +20,8 @@
+ # define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
+ #endif
+
+-extern dma_addr_t bad_dma_address;
++#define DMA_ERROR_CODE 0
++
+ extern int iommu_merge;
+ extern struct device x86_dma_fallback_dev;
+ extern int panic_on_overflow;
+@@ -48,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+ if (ops->mapping_error)
+ return ops->mapping_error(dev, dma_addr);
+
+- return (dma_addr == bad_dma_address);
++ return (dma_addr == DMA_ERROR_CODE);
+ }
+
+ #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+@@ -66,7 +67,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+ if (!dev->dma_mask)
+ return 0;
+
+- return addr + size <= *dev->dma_mask;
++ return addr + size - 1 <= *dev->dma_mask;
+ }
+
+ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
+index 6cfdafa..4ac5b0f 100644
+--- a/arch/x86/include/asm/gart.h
++++ b/arch/x86/include/asm/gart.h
+@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed;
+ extern int gart_iommu_aperture_disabled;
+
+ extern void early_gart_iommu_check(void);
+-extern void gart_iommu_init(void);
+-extern void gart_iommu_shutdown(void);
++extern int gart_iommu_init(void);
+ extern void __init gart_parse_options(char *);
+ extern void gart_iommu_hole_init(void);
+
+@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void);
+ static inline void early_gart_iommu_check(void)
+ {
+ }
+-static inline void gart_iommu_init(void)
+-{
+-}
+-static inline void gart_iommu_shutdown(void)
+-{
+-}
+ static inline void gart_parse_options(char *options)
+ {
+ }
+diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
+index 3251e23..fa152cb 100644
+--- a/arch/x86/include/asm/hpet.h
++++ b/arch/x86/include/asm/hpet.h
+@@ -68,6 +68,7 @@ extern unsigned long force_hpet_address;
+ extern int hpet_force_user;
+ extern u8 hpet_msi_disable;
+ extern int is_hpet_enabled(void);
++extern int disable_hpet(char *);
+ extern int hpet_enable(void);
+ extern void hpet_disable(void);
+ extern unsigned long hpet_readl(unsigned long a);
+@@ -108,6 +109,7 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
+ #else /* CONFIG_HPET_TIMER */
+
+ static inline int hpet_enable(void) { return 0; }
++static inline int disable_hpet(char *s) { return 0; }
+ static inline int is_hpet_enabled(void) { return 0; }
+ #define hpet_readl(a) 0
+
+diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
+index 7373932..49ee1a9 100644
+--- a/arch/x86/include/asm/io.h
++++ b/arch/x86/include/asm/io.h
+@@ -7,6 +7,10 @@
+ #include <asm-generic/int-ll64.h>
+ #include <asm/page.h>
+
++#include <xen/xen.h>
++
++extern int isapnp_disable;
++
+ #define build_mmio_read(name, size, type, reg, barrier) \
+ static inline type name(const volatile void __iomem *addr) \
+ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \
+@@ -199,6 +203,17 @@ extern void __iomem *early_memremap(resource_size_t phys_addr,
+ unsigned long size);
+ extern void early_iounmap(void __iomem *addr, unsigned long size);
+
++#ifdef CONFIG_XEN
++struct bio_vec;
++
++extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
++ const struct bio_vec *vec2);
++
++#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
++ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
++ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
++#endif /* CONFIG_XEN */
++
+ #define IO_SPACE_LIMIT 0xffff
+
+ #endif /* _ASM_X86_IO_H */
+diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
+index 7c7c16c..2fc09d3 100644
+--- a/arch/x86/include/asm/io_apic.h
++++ b/arch/x86/include/asm/io_apic.h
+@@ -171,6 +171,7 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
+ extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
+
+ extern void probe_nr_irqs_gsi(void);
++extern int get_nr_irqs_gsi(void);
+
+ extern int setup_ioapic_entry(int apic, int irq,
+ struct IO_APIC_route_entry *entry,
+@@ -200,4 +201,6 @@ static inline void probe_nr_irqs_gsi(void) { }
+
+ #endif
+
++void xen_io_apic_init(void);
++
+ #endif /* _ASM_X86_IO_APIC_H */
+diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
+index fd6d21b..345c99c 100644
+--- a/arch/x86/include/asm/iommu.h
++++ b/arch/x86/include/asm/iommu.h
+@@ -1,8 +1,6 @@
+ #ifndef _ASM_X86_IOMMU_H
+ #define _ASM_X86_IOMMU_H
+
+-extern void pci_iommu_shutdown(void);
+-extern void no_iommu_init(void);
+ extern struct dma_map_ops nommu_dma_ops;
+ extern int force_iommu, no_iommu;
+ extern int iommu_detected;
+diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
+index ef51b50..e15fca1 100644
+--- a/arch/x86/include/asm/microcode.h
++++ b/arch/x86/include/asm/microcode.h
+@@ -55,4 +55,13 @@ static inline struct microcode_ops * __init init_amd_microcode(void)
+ }
+ #endif
+
++#ifdef CONFIG_MICROCODE_XEN
++extern struct microcode_ops * __init init_xen_microcode(void);
++#else
++static inline struct microcode_ops * __init init_xen_microcode(void)
++{
++ return NULL;
++}
++#endif
++
+ #endif /* _ASM_X86_MICROCODE_H */
+diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
+index 80a1dee..67eaa91 100644
+--- a/arch/x86/include/asm/mmu.h
++++ b/arch/x86/include/asm/mmu.h
+@@ -13,6 +13,9 @@ typedef struct {
+ int size;
+ struct mutex lock;
+ void *vdso;
++#ifdef CONFIG_XEN
++ int has_foreign_mappings;
++#endif
+ } mm_context_t;
+
+ #ifdef CONFIG_SMP
+diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
+index efb3899..63a55bc 100644
+--- a/arch/x86/include/asm/paravirt.h
++++ b/arch/x86/include/asm/paravirt.h
+@@ -330,11 +330,18 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
+ {
+ PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
+ }
++
+ static inline void set_iopl_mask(unsigned mask)
+ {
+ PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
+ }
+
++static inline void set_io_bitmap(struct thread_struct *thread,
++ unsigned long bytes_updated)
++{
++ PVOP_VCALL2(pv_cpu_ops.set_io_bitmap, thread, bytes_updated);
++}
++
+ /* The paravirtualized I/O functions */
+ static inline void slow_down_io(void)
+ {
+diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
+index 9357473..3202dcc 100644
+--- a/arch/x86/include/asm/paravirt_types.h
++++ b/arch/x86/include/asm/paravirt_types.h
+@@ -135,6 +135,8 @@ struct pv_cpu_ops {
+ void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+
+ void (*set_iopl_mask)(unsigned mask);
++ void (*set_io_bitmap)(struct thread_struct *thread,
++ unsigned long bytes_updated);
+
+ void (*wbinvd)(void);
+ void (*io_delay)(void);
+diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
+index ada8c20..2a34c12 100644
+--- a/arch/x86/include/asm/pci.h
++++ b/arch/x86/include/asm/pci.h
+@@ -21,6 +21,7 @@ struct pci_sysdata {
+ extern int pci_routeirq;
+ extern int noioapicquirk;
+ extern int noioapicreroute;
++extern int pci_scan_all_fns;
+
+ /* scan a bus after allocating a pci_sysdata for it */
+ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
+diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
+index b399988..30cbf49 100644
+--- a/arch/x86/include/asm/pci_x86.h
++++ b/arch/x86/include/asm/pci_x86.h
+@@ -45,6 +45,7 @@ enum pci_bf_sort_state {
+ extern unsigned int pcibios_max_latency;
+
+ void pcibios_resource_survey(void);
++void pcibios_set_cache_line_size(void);
+
+ /* pci-pc.c */
+
+@@ -106,6 +107,7 @@ extern int pci_direct_probe(void);
+ extern void pci_direct_init(int type);
+ extern void pci_pcbios_init(void);
+ extern int pci_olpc_init(void);
++extern int pci_xen_init(void);
+ extern void __init dmi_check_pciprobe(void);
+ extern void __init dmi_check_skip_isa_align(void);
+
+diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
+index 0e8c2a0..271de94 100644
+--- a/arch/x86/include/asm/pgalloc.h
++++ b/arch/x86/include/asm/pgalloc.h
+@@ -23,6 +23,11 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
+ #endif
+
+ /*
++ * Flags to use when allocating a user page table page.
++ */
++extern gfp_t __userpte_alloc_gfp;
++
++/*
+ * Allocate and free page tables.
+ */
+ extern pgd_t *pgd_alloc(struct mm_struct *);
+diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
+index af6fd36..4953f9b 100644
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -616,6 +616,9 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ memcpy(dst, src, count * sizeof(pgd_t));
+ }
+
++int create_lookup_pte_addr(struct mm_struct *mm,
++ unsigned long address,
++ uint64_t *ptep);
+
+ #include <asm-generic/pgtable.h>
+ #endif /* __ASSEMBLY__ */
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
+index 13b1885..0aac25a 100644
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -551,6 +551,9 @@ static inline void native_set_iopl_mask(unsigned mask)
+ #endif
+ }
+
++extern void native_set_io_bitmap(struct thread_struct *thread,
++ unsigned long updated_bytes);
++
+ static inline void
+ native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+ {
+@@ -592,6 +595,7 @@ static inline void load_sp0(struct tss_struct *tss,
+ }
+
+ #define set_iopl_mask native_set_iopl_mask
++#define set_io_bitmap native_set_io_bitmap
+ #endif /* CONFIG_PARAVIRT */
+
+ /*
+diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
+index b9e4e20..8085277 100644
+--- a/arch/x86/include/asm/swiotlb.h
++++ b/arch/x86/include/asm/swiotlb.h
+@@ -3,15 +3,16 @@
+
+ #include <linux/swiotlb.h>
+
+-/* SWIOTLB interface */
+-
+-extern int swiotlb_force;
+-
+ #ifdef CONFIG_SWIOTLB
+ extern int swiotlb;
+-extern void pci_swiotlb_init(void);
++extern int __init pci_swiotlb_detect(void);
++extern void __init pci_swiotlb_init(void);
+ #else
+ #define swiotlb 0
++static inline int pci_swiotlb_detect(void)
++{
++ return 0;
++}
+ static inline void pci_swiotlb_init(void)
+ {
+ }
+diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
+index 1bb6e39..ef0fa4d 100644
+--- a/arch/x86/include/asm/syscalls.h
++++ b/arch/x86/include/asm/syscalls.h
+@@ -33,11 +33,11 @@ long sys_rt_sigreturn(struct pt_regs *);
+ asmlinkage int sys_set_thread_area(struct user_desc __user *);
+ asmlinkage int sys_get_thread_area(struct user_desc __user *);
+
+-/* X86_32 only */
+-#ifdef CONFIG_X86_32
+ /* kernel/ioport.c */
+-long sys_iopl(struct pt_regs *);
++asmlinkage long sys_iopl(unsigned int);
+
++/* X86_32 only */
++#ifdef CONFIG_X86_32
+ /* kernel/process_32.c */
+ int sys_clone(struct pt_regs *);
+ int sys_execve(struct pt_regs *);
+@@ -68,8 +68,6 @@ int sys_vm86(struct pt_regs *);
+ #else /* CONFIG_X86_32 */
+
+ /* X86_64 only */
+-/* kernel/ioport.c */
+-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
+
+ /* kernel/process_64.c */
+ asmlinkage long sys_clone(unsigned long, unsigned long,
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index 7f3eba0..e4fc8ea 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -89,6 +89,10 @@ static inline void __flush_tlb_one(unsigned long addr)
+
+ #ifndef CONFIG_SMP
+
++static inline void __init init_smp_flush(void)
++{
++}
++
+ #define flush_tlb() __flush_tlb()
+ #define flush_tlb_all() __flush_tlb_all()
+ #define local_flush_tlb() __flush_tlb()
+@@ -129,6 +133,8 @@ static inline void reset_lazy_tlbstate(void)
+
+ #define local_flush_tlb() __flush_tlb()
+
++extern void init_smp_flush(void);
++
+ extern void flush_tlb_all(void);
+ extern void flush_tlb_current_task(void);
+ extern void flush_tlb_mm(struct mm_struct *);
+diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
+index 2c756fd..d8e7145 100644
+--- a/arch/x86/include/asm/x86_init.h
++++ b/arch/x86/include/asm/x86_init.h
+@@ -91,6 +91,14 @@ struct x86_init_timers {
+ };
+
+ /**
++ * struct x86_init_iommu - platform specific iommu setup
++ * @iommu_init: platform specific iommu setup
++ */
++struct x86_init_iommu {
++ int (*iommu_init)(void);
++};
++
++/**
+ * struct x86_init_ops - functions for platform specific setup
+ *
+ */
+@@ -101,6 +109,7 @@ struct x86_init_ops {
+ struct x86_init_oem oem;
+ struct x86_init_paging paging;
+ struct x86_init_timers timers;
++ struct x86_init_iommu iommu;
+ };
+
+ /**
+@@ -121,6 +130,7 @@ struct x86_platform_ops {
+ unsigned long (*calibrate_tsc)(void);
+ unsigned long (*get_wallclock)(void);
+ int (*set_wallclock)(unsigned long nowtime);
++ void (*iommu_shutdown)(void);
+ };
+
+ extern struct x86_init_ops x86_init;
+diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
+index 9c371e4..3da450b 100644
+--- a/arch/x86/include/asm/xen/hypercall.h
++++ b/arch/x86/include/asm/xen/hypercall.h
+@@ -45,6 +45,7 @@
+ #include <xen/interface/xen.h>
+ #include <xen/interface/sched.h>
+ #include <xen/interface/physdev.h>
++#include <xen/interface/platform.h>
+
+ /*
+ * The hypercall asms have to meet several constraints:
+@@ -200,6 +201,23 @@ extern struct { char _entry[32]; } hypercall_page[];
+ (type)__res; \
+ })
+
++static inline long
++privcmd_call(unsigned call,
++ unsigned long a1, unsigned long a2,
++ unsigned long a3, unsigned long a4,
++ unsigned long a5)
++{
++ __HYPERCALL_DECLS;
++ __HYPERCALL_5ARG(a1, a2, a3, a4, a5);
++
++ asm volatile("call *%[call]"
++ : __HYPERCALL_5PARAM
++ : [call] "a" (&hypercall_page[call])
++ : __HYPERCALL_CLOBBER5);
++
++ return (long)__res;
++}
++
+ static inline int
+ HYPERVISOR_set_trap_table(struct trap_info *table)
+ {
+@@ -282,6 +300,13 @@ HYPERVISOR_set_timer_op(u64 timeout)
+ }
+
+ static inline int
++HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
++{
++ platform_op->interface_version = XENPF_INTERFACE_VERSION;
++ return _hypercall1(int, dom0_op, platform_op);
++}
++
++static inline int
+ HYPERVISOR_set_debugreg(int reg, unsigned long value)
+ {
+ return _hypercall2(int, set_debugreg, reg, value);
+@@ -424,6 +449,14 @@ MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
+ mcl->args[0] = set;
+ }
+
++#if defined(CONFIG_X86_64)
++#define MULTI_UVMFLAGS_INDEX 2
++#define MULTI_UVMDOMID_INDEX 3
++#else
++#define MULTI_UVMFLAGS_INDEX 3
++#define MULTI_UVMDOMID_INDEX 4
++#endif
++
+ static inline void
+ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags)
+@@ -432,12 +465,11 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
+ mcl->args[0] = va;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+- mcl->args[2] = flags;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+- mcl->args[3] = flags;
+ }
++ mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
+ }
+
+ static inline void
+diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
+index d5b7e90..396ff4c 100644
+--- a/arch/x86/include/asm/xen/hypervisor.h
++++ b/arch/x86/include/asm/xen/hypervisor.h
+@@ -37,31 +37,4 @@
+ extern struct shared_info *HYPERVISOR_shared_info;
+ extern struct start_info *xen_start_info;
+
+-enum xen_domain_type {
+- XEN_NATIVE, /* running on bare hardware */
+- XEN_PV_DOMAIN, /* running in a PV domain */
+- XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
+-};
+-
+-#ifdef CONFIG_XEN
+-extern enum xen_domain_type xen_domain_type;
+-#else
+-#define xen_domain_type XEN_NATIVE
+-#endif
+-
+-#define xen_domain() (xen_domain_type != XEN_NATIVE)
+-#define xen_pv_domain() (xen_domain() && \
+- xen_domain_type == XEN_PV_DOMAIN)
+-#define xen_hvm_domain() (xen_domain() && \
+- xen_domain_type == XEN_HVM_DOMAIN)
+-
+-#ifdef CONFIG_XEN_DOM0
+-#include <xen/interface/xen.h>
+-
+-#define xen_initial_domain() (xen_pv_domain() && \
+- xen_start_info->flags & SIF_INITDOMAIN)
+-#else /* !CONFIG_XEN_DOM0 */
+-#define xen_initial_domain() (0)
+-#endif /* CONFIG_XEN_DOM0 */
+-
+ #endif /* _ASM_X86_XEN_HYPERVISOR_H */
+diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
+index e8506c1..9539998 100644
+--- a/arch/x86/include/asm/xen/interface.h
++++ b/arch/x86/include/asm/xen/interface.h
+@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void);
+ #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+ #endif
+
+-#ifndef machine_to_phys_mapping
+-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+-#endif
++#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
++#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
++#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
+
+ /* Maximum number of virtual CPUs in multi-processor guests. */
+ #define MAX_VIRT_CPUS 32
+@@ -97,6 +97,8 @@ DEFINE_GUEST_HANDLE(void);
+ #define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
+
+ #ifndef __ASSEMBLY__
++#include <linux/types.h>
++
+ struct trap_info {
+ uint8_t vector; /* exception vector */
+ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
+diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
+index 42a7e00..8413688 100644
+--- a/arch/x86/include/asm/xen/interface_32.h
++++ b/arch/x86/include/asm/xen/interface_32.h
+@@ -32,6 +32,11 @@
+ /* And the trap vector is... */
+ #define TRAP_INSTR "int $0x82"
+
++#define __MACH2PHYS_VIRT_START 0xF5800000
++#define __MACH2PHYS_VIRT_END 0xF6800000
++
++#define __MACH2PHYS_SHIFT 2
++
+ /*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The
+ * machine->physical mapping table starts at this address, read-only.
+diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
+index 100d266..839a481 100644
+--- a/arch/x86/include/asm/xen/interface_64.h
++++ b/arch/x86/include/asm/xen/interface_64.h
+@@ -39,18 +39,7 @@
+ #define __HYPERVISOR_VIRT_END 0xFFFF880000000000
+ #define __MACH2PHYS_VIRT_START 0xFFFF800000000000
+ #define __MACH2PHYS_VIRT_END 0xFFFF804000000000
+-
+-#ifndef HYPERVISOR_VIRT_START
+-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+-#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
+-#endif
+-
+-#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+-#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+-#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
+-#ifndef machine_to_phys_mapping
+-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+-#endif
++#define __MACH2PHYS_SHIFT 3
+
+ /*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+diff --git a/arch/x86/include/asm/xen/iommu.h b/arch/x86/include/asm/xen/iommu.h
+new file mode 100644
+index 0000000..75df312
+--- /dev/null
++++ b/arch/x86/include/asm/xen/iommu.h
+@@ -0,0 +1,17 @@
++#ifndef ASM_X86__XEN_IOMMU_H
++#define ASM_X86__XEN_IOMMU_H
++
++/*
++ * xen_iommu_init() is an external function when CONFIG_PCI_XEN is set;
++ * otherwise it is an empty inline stub so callers need no #ifdef.
++ */
++#ifdef CONFIG_PCI_XEN
++extern void xen_iommu_init(void);
++#else
++static inline void xen_iommu_init(void)
++{
++}
++#endif
++
++#endif /* ASM_X86__XEN_IOMMU_H */
++
+diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
+index 018a0a4..f334014 100644
+--- a/arch/x86/include/asm/xen/page.h
++++ b/arch/x86/include/asm/xen/page.h
+@@ -5,6 +5,7 @@
+ #include <linux/types.h>
+ #include <linux/spinlock.h>
+ #include <linux/pfn.h>
++#include <linux/mm.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/page.h>
+@@ -35,6 +36,8 @@ typedef struct xpaddr {
+ #define MAX_DOMAIN_PAGES \
+ ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+
++extern unsigned long *machine_to_phys_mapping;
++extern unsigned int machine_to_phys_order;
+
+ extern unsigned long get_phys_to_machine(unsigned long pfn);
+ extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+@@ -62,10 +65,8 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return mfn;
+
+-#if 0
+ if (unlikely((mfn >> machine_to_phys_order) != 0))
+- return max_mapnr;
+-#endif
++ return ~0;
+
+ pfn = 0;
+ /*
+@@ -112,13 +113,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
+ */
+ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+ {
+- extern unsigned long max_mapnr;
+ unsigned long pfn = mfn_to_pfn(mfn);
+- if ((pfn < max_mapnr)
+- && !xen_feature(XENFEAT_auto_translated_physmap)
+- && (get_phys_to_machine(pfn) != mfn))
+- return max_mapnr; /* force !pfn_valid() */
+- /* XXX fixme; not true with sparsemem */
++ if (get_phys_to_machine(pfn) != mfn)
++ return -1; /* force !pfn_valid() */
+ return pfn;
+ }
+
+@@ -163,6 +160,7 @@ static inline pte_t __pte_ma(pteval_t x)
+
+ #define pgd_val_ma(x) ((x).pgd)
+
++void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
+
+ xmaddr_t arbitrary_virt_to_machine(void *address);
+ unsigned long arbitrary_virt_to_mfn(void *vaddr);
+diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
+new file mode 100644
+index 0000000..cb84abe
+--- /dev/null
++++ b/arch/x86/include/asm/xen/pci.h
+@@ -0,0 +1,49 @@
++#ifndef _ASM_X86_XEN_PCI_H
++#define _ASM_X86_XEN_PCI_H
++
++#include <linux/types.h>
++
++/* Only pointers to these are used below, so forward declarations suffice. */
++struct pci_dev;
++struct msi_desc;
++
++/*
++ * GSI/MSI interrupt hooks.  They are real functions only when
++ * CONFIG_XEN_DOM0_PCI is set; otherwise every hook is an inline stub
++ * that returns -1.
++ */
++#ifdef CONFIG_XEN_DOM0_PCI
++int xen_register_gsi(u32 gsi, int triggering, int polarity);
++int xen_create_msi_irq(struct pci_dev *dev,
++		       struct msi_desc *msidesc,
++		       int type);
++int xen_destroy_irq(int irq);
++#else
++static inline int xen_register_gsi(u32 gsi, int triggering, int polarity)
++{
++	return -1;
++}
++
++static inline int xen_create_msi_irq(struct pci_dev *dev,
++				     struct msi_desc *msidesc,
++				     int type)
++{
++	return -1;
++}
++
++static inline int xen_destroy_irq(int irq)
++{
++	return -1;
++}
++#endif
++
++/* The MSI setup hook additionally requires CONFIG_PCI_MSI. */
++#if defined(CONFIG_PCI_MSI) && defined(CONFIG_XEN_DOM0_PCI)
++int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
++#else
++static inline int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++	return -1;
++}
++#endif
++
++#endif /* _ASM_X86_XEN_PCI_H */
+diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
+new file mode 100644
+index 0000000..e4fe299
+--- /dev/null
++++ b/arch/x86/include/asm/xen/swiotlb-xen.h
+@@ -0,0 +1,20 @@
++#ifndef _ASM_X86_SWIOTLB_XEN_H
++#define _ASM_X86_SWIOTLB_XEN_H
++
++#include <linux/init.h>	/* __init */
++
++/*
++ * With CONFIG_PCI_XEN unset, xen_swiotlb is the constant 0 and the
++ * detect/init hooks are inline stubs (detect returns 0, init is empty).
++ */
++#ifdef CONFIG_PCI_XEN
++extern int xen_swiotlb;
++extern int __init pci_xen_swiotlb_detect(void);
++extern void __init pci_xen_swiotlb_init(void);
++#else
++#define xen_swiotlb 0
++static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
++static inline void __init pci_xen_swiotlb_init(void) { }
++#endif
++
++#endif /* _ASM_X86_SWIOTLB_XEN_H */
+diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
+index d8e5d0c..6e80af9 100644
+--- a/arch/x86/kernel/Makefile
++++ b/arch/x86/kernel/Makefile
+@@ -111,6 +111,7 @@ obj-$(CONFIG_X86_MRST) += mrst.o
+ microcode-y := microcode_core.o
+ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
+ microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
++microcode-$(CONFIG_MICROCODE_XEN) += microcode_xen.o
+ obj-$(CONFIG_MICROCODE) += microcode.o
+
+ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
+index 67e929b..21fc029 100644
+--- a/arch/x86/kernel/acpi/boot.c
++++ b/arch/x86/kernel/acpi/boot.c
+@@ -42,6 +42,10 @@
+ #include <asm/mpspec.h>
+ #include <asm/smp.h>
+
++#include <asm/xen/pci.h>
++
++#include <asm/xen/hypervisor.h>
++
+ static int __initdata acpi_force = 0;
+ u32 acpi_rsdt_forced;
+ int acpi_disabled;
+@@ -149,6 +153,10 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
+ {
+ unsigned int ver = 0;
+
++ /* We don't want to register lapics when in Xen dom0 */
++ if (xen_initial_domain())
++ return;
++
+ if (!enabled) {
+ ++disabled_cpus;
+ return;
+@@ -455,9 +463,13 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+ */
+ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+ {
+- unsigned int irq;
++ int irq;
+ unsigned int plat_gsi = gsi;
+
++ irq = xen_register_gsi(gsi, trigger, polarity);
++ if (irq >= 0)
++ return irq;
++
+ #ifdef CONFIG_PCI
+ /*
+ * Make sure all (legacy) PCI IRQs are set as level-triggered.
+@@ -733,6 +745,10 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
+
+ static void __init acpi_register_lapic_address(unsigned long address)
+ {
++ /* Xen dom0 doesn't have usable lapics */
++ if (xen_initial_domain())
++ return;
++
+ mp_lapic_addr = address;
+
+ set_fixmap_nocache(FIX_APIC_BASE, address);
+@@ -853,6 +869,9 @@ int __init acpi_probe_gsi(void)
+ max_gsi = gsi;
+ }
+
++ if (xen_initial_domain())
++ max_gsi += 255; /* Plus maximum entries of an ioapic. */
++
+ return max_gsi + 1;
+ }
+
+diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
+index 23fc9fe..40497d3 100644
+--- a/arch/x86/kernel/amd_iommu.c
++++ b/arch/x86/kernel/amd_iommu.c
+@@ -928,7 +928,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
+ }
+
+ if (unlikely(address == -1))
+- address = bad_dma_address;
++ address = DMA_ERROR_CODE;
+
+ WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
+
+@@ -1545,7 +1545,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
+
+ pte = dma_ops_get_pte(dom, address);
+ if (!pte)
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+
+ __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
+
+@@ -1626,7 +1626,7 @@ static dma_addr_t __map_single(struct device *dev,
+ retry:
+ address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
+ dma_mask);
+- if (unlikely(address == bad_dma_address)) {
++ if (unlikely(address == DMA_ERROR_CODE)) {
+ /*
+ * setting next_address here will let the address
+ * allocator only scan the new allocated range in the
+@@ -1647,7 +1647,7 @@ retry:
+ start = address;
+ for (i = 0; i < pages; ++i) {
+ ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+- if (ret == bad_dma_address)
++ if (ret == DMA_ERROR_CODE)
+ goto out_unmap;
+
+ paddr += PAGE_SIZE;
+@@ -1675,7 +1675,7 @@ out_unmap:
+
+ dma_ops_free_addresses(dma_dom, address, pages);
+
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+ }
+
+ /*
+@@ -1691,7 +1691,7 @@ static void __unmap_single(struct amd_iommu *iommu,
+ dma_addr_t i, start;
+ unsigned int pages;
+
+- if ((dma_addr == bad_dma_address) ||
++ if ((dma_addr == DMA_ERROR_CODE) ||
+ (dma_addr + size > dma_dom->aperture_size))
+ return;
+
+@@ -1733,7 +1733,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
+ INC_STATS_COUNTER(cnt_map_single);
+
+ if (!check_device(dev))
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+
+ dma_mask = *dev->dma_mask;
+
+@@ -1744,12 +1744,12 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
+ return (dma_addr_t)paddr;
+
+ if (!dma_ops_domain(domain))
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
+ dma_mask);
+- if (addr == bad_dma_address)
++ if (addr == DMA_ERROR_CODE)
+ goto out;
+
+ iommu_completion_wait(iommu);
+@@ -1958,7 +1958,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
+ *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
+ size, DMA_BIDIRECTIONAL, true, dma_mask);
+
+- if (*dma_addr == bad_dma_address) {
++ if (*dma_addr == DMA_ERROR_CODE) {
+ spin_unlock_irqrestore(&domain->lock, flags);
+ goto out_free;
+ }
+@@ -2120,8 +2120,7 @@ int __init amd_iommu_init_dma_ops(void)
+ prealloc_protection_domains();
+
+ iommu_detected = 1;
+- force_iommu = 1;
+- bad_dma_address = 0;
++ swiotlb = 0;
+ #ifdef CONFIG_GART_IOMMU
+ gart_iommu_aperture_disabled = 1;
+ gart_iommu_aperture = 0;
+diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
+index 362ab88..2ff5b5d 100644
+--- a/arch/x86/kernel/amd_iommu_init.c
++++ b/arch/x86/kernel/amd_iommu_init.c
+@@ -29,6 +29,7 @@
+ #include <asm/amd_iommu.h>
+ #include <asm/iommu.h>
+ #include <asm/gart.h>
++#include <asm/x86_init.h>
+
+ /*
+ * definitions for the ACPI scanning code
+@@ -1183,19 +1184,10 @@ static struct sys_device device_amd_iommu = {
+ * functions. Finally it prints some information about AMD IOMMUs and
+ * the driver state and enables the hardware.
+ */
+-int __init amd_iommu_init(void)
++static int __init amd_iommu_init(void)
+ {
+ int i, ret = 0;
+
+-
+- if (no_iommu) {
+- printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
+- return 0;
+- }
+-
+- if (!amd_iommu_detected)
+- return -ENODEV;
+-
+ /*
+ * First parse ACPI tables to find the largest Bus/Dev/Func
+ * we need to handle. Upon this information the shared data
+@@ -1310,6 +1302,7 @@ int __init amd_iommu_init(void)
+ else
+ printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
+
++ x86_platform.iommu_shutdown = disable_iommus;
+ out:
+ return ret;
+
+@@ -1336,11 +1329,6 @@ free:
+ goto out;
+ }
+
+-void amd_iommu_shutdown(void)
+-{
+- disable_iommus();
+-}
+-
+ /****************************************************************************
+ *
+ * Early detect code. This code runs at IOMMU detection time in the DMA
+@@ -1355,16 +1343,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
+
+ void __init amd_iommu_detect(void)
+ {
+- if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
++ if (no_iommu || (iommu_detected && !gart_iommu_aperture))
+ return;
+
+ if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
+ iommu_detected = 1;
+ amd_iommu_detected = 1;
+-#ifdef CONFIG_GART_IOMMU
+- gart_iommu_aperture_disabled = 1;
+- gart_iommu_aperture = 0;
+-#endif
++ x86_init.iommu.iommu_init = amd_iommu_init;
+ }
+ }
+
+diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
+index 128111d..e0dfb68 100644
+--- a/arch/x86/kernel/aperture_64.c
++++ b/arch/x86/kernel/aperture_64.c
+@@ -28,6 +28,7 @@
+ #include <asm/pci-direct.h>
+ #include <asm/dma.h>
+ #include <asm/k8.h>
++#include <asm/x86_init.h>
+
+ int gart_iommu_aperture;
+ int gart_iommu_aperture_disabled __initdata;
+@@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void)
+
+ iommu_detected = 1;
+ gart_iommu_aperture = 1;
++ x86_init.iommu.iommu_init = gart_iommu_init;
+
+ aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
+ aper_size = (32 * 1024 * 1024) << aper_order;
+@@ -456,7 +458,7 @@ out:
+
+ if (aper_alloc) {
+ /* Got the aperture from the AGP bridge */
+- } else if (swiotlb && !valid_agp) {
++ } else if (!valid_agp) {
+ /* Do nothing */
+ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
+ force_iommu ||
+diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
+index c107e83..db1af79 100644
+--- a/arch/x86/kernel/apic/io_apic.c
++++ b/arch/x86/kernel/apic/io_apic.c
+@@ -63,8 +63,11 @@
+ #include <asm/uv/uv_hub.h>
+ #include <asm/uv/uv_irq.h>
+
++#include <asm/xen/hypervisor.h>
+ #include <asm/apic.h>
+
++#include <asm/xen/pci.h>
++
+ #define __apicdebuginit(type) static type __init
+ #define for_each_irq_pin(entry, head) \
+ for (entry = head; entry; entry = entry->next)
+@@ -390,14 +393,18 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+
+ static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+ {
+- struct io_apic __iomem *io_apic = io_apic_base(apic);
++ struct io_apic __iomem *io_apic;
++
++ io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ return readl(&io_apic->data);
+ }
+
+ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+ {
+- struct io_apic __iomem *io_apic = io_apic_base(apic);
++ struct io_apic __iomem *io_apic;
++
++ io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+ }
+@@ -410,7 +417,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
+ */
+ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+ {
+- struct io_apic __iomem *io_apic = io_apic_base(apic);
++ struct io_apic __iomem *io_apic;
++
++ io_apic = io_apic_base(apic);
+
+ if (sis_apic_bug)
+ writel(reg, &io_apic->index);
+@@ -3447,6 +3456,9 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
+
++ if (xen_domain())
++ return xen_setup_msi_irqs(dev, nvec, type);
++
+ node = dev_to_node(&dev->dev);
+ irq_want = nr_irqs_gsi;
+ sub_handle = 0;
+@@ -3496,7 +3508,10 @@ error:
+
+ void arch_teardown_msi_irq(unsigned int irq)
+ {
+- destroy_irq(irq);
++ if (xen_domain())
++ xen_destroy_irq(irq);
++ else
++ destroy_irq(irq);
+ }
+
+ #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
+@@ -3812,6 +3827,11 @@ void __init probe_nr_irqs_gsi(void)
+ printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
+ }
+
++int get_nr_irqs_gsi(void)
++{
++ return nr_irqs_gsi;
++}
++
+ #ifdef CONFIG_SPARSE_IRQ
+ int __init arch_probe_nr_irqs(void)
+ {
+diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
+index f4361b5..404e458 100644
+--- a/arch/x86/kernel/cpu/mtrr/Makefile
++++ b/arch/x86/kernel/cpu/mtrr/Makefile
+@@ -1,3 +1,4 @@
+ obj-y := main.o if.o generic.o state.o cleanup.o
+ obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
++obj-$(CONFIG_XEN_DOM0) += xen.o
+
+diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
+index 33af141..378f8dc 100644
+--- a/arch/x86/kernel/cpu/mtrr/amd.c
++++ b/arch/x86/kernel/cpu/mtrr/amd.c
+@@ -108,6 +108,11 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
+ return 0;
+ }
+
++static int amd_num_var_ranges(void)
++{
++ return 2;
++}
++
+ static struct mtrr_ops amd_mtrr_ops = {
+ .vendor = X86_VENDOR_AMD,
+ .set = amd_set_mtrr,
+@@ -115,6 +120,7 @@ static struct mtrr_ops amd_mtrr_ops = {
+ .get_free_region = generic_get_free_region,
+ .validate_add_page = amd_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
++ .num_var_ranges = amd_num_var_ranges,
+ };
+
+ int __init amd_init_mtrr(void)
+diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
+index de89f14..7c686a0 100644
+--- a/arch/x86/kernel/cpu/mtrr/centaur.c
++++ b/arch/x86/kernel/cpu/mtrr/centaur.c
+@@ -110,6 +110,11 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
+ return 0;
+ }
+
++static int centaur_num_var_ranges(void)
++{
++ return 8;
++}
++
+ static struct mtrr_ops centaur_mtrr_ops = {
+ .vendor = X86_VENDOR_CENTAUR,
+ .set = centaur_set_mcr,
+@@ -117,6 +122,7 @@ static struct mtrr_ops centaur_mtrr_ops = {
+ .get_free_region = centaur_get_free_region,
+ .validate_add_page = centaur_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
++ .num_var_ranges = centaur_num_var_ranges,
+ };
+
+ int __init centaur_init_mtrr(void)
+diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
+index 228d982..fd6edcc 100644
+--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
++++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
+@@ -265,6 +265,11 @@ static void cyrix_set_all(void)
+ post_set();
+ }
+
++static int cyrix_num_var_ranges(void)
++{
++ return 8;
++}
++
+ static struct mtrr_ops cyrix_mtrr_ops = {
+ .vendor = X86_VENDOR_CYRIX,
+ .set_all = cyrix_set_all,
+@@ -273,6 +278,7 @@ static struct mtrr_ops cyrix_mtrr_ops = {
+ .get_free_region = cyrix_get_free_region,
+ .validate_add_page = generic_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
++ .num_var_ranges = cyrix_num_var_ranges,
+ };
+
+ int __init cyrix_init_mtrr(void)
+diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
+index 55da0c5..42f30cd 100644
+--- a/arch/x86/kernel/cpu/mtrr/generic.c
++++ b/arch/x86/kernel/cpu/mtrr/generic.c
+@@ -749,8 +749,16 @@ int positive_have_wrcomb(void)
+ return 1;
+ }
+
+-/*
+- * Generic structure...
++static int generic_num_var_ranges(void)
++{
++ unsigned long config = 0, dummy;
++
++ rdmsr(MSR_MTRRcap, config, dummy);
++
++ return config & 0xff;
++}
++
++/* generic structure...
+ */
+ struct mtrr_ops generic_mtrr_ops = {
+ .use_intel_if = 1,
+@@ -760,4 +768,5 @@ struct mtrr_ops generic_mtrr_ops = {
+ .set = generic_set_mtrr,
+ .validate_add_page = generic_validate_add_page,
+ .have_wrcomb = generic_have_wrcomb,
++ .num_var_ranges = generic_num_var_ranges,
+ };
+diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
+index 84e83de..c8cb9ed 100644
+--- a/arch/x86/kernel/cpu/mtrr/main.c
++++ b/arch/x86/kernel/cpu/mtrr/main.c
+@@ -110,21 +110,6 @@ static int have_wrcomb(void)
+ return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
+ }
+
+-/* This function returns the number of variable MTRRs */
+-static void __init set_num_var_ranges(void)
+-{
+- unsigned long config = 0, dummy;
+-
+- if (use_intel())
+- rdmsr(MSR_MTRRcap, config, dummy);
+- else if (is_cpu(AMD))
+- config = 2;
+- else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
+- config = 8;
+-
+- num_var_ranges = config & 0xff;
+-}
+-
+ static void __init init_table(void)
+ {
+ int i, max;
+@@ -711,8 +696,11 @@ void __init mtrr_bp_init(void)
+ }
+ }
+
++ /* Let Xen code override the above if it wants */
++ xen_init_mtrr();
++
+ if (mtrr_if) {
+- set_num_var_ranges();
++ num_var_ranges = mtrr_if->num_var_ranges();
+ init_table();
+ if (use_intel()) {
+ get_mtrr_state();
+diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
+index a501dee..98569c3 100644
+--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
++++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
+@@ -5,6 +5,8 @@
+ #include <linux/types.h>
+ #include <linux/stddef.h>
+
++#include <asm/mtrr.h>
++
+ #define MTRR_CHANGE_MASK_FIXED 0x01
+ #define MTRR_CHANGE_MASK_VARIABLE 0x02
+ #define MTRR_CHANGE_MASK_DEFTYPE 0x04
+@@ -25,6 +27,8 @@ struct mtrr_ops {
+ int (*validate_add_page)(unsigned long base, unsigned long size,
+ unsigned int type);
+ int (*have_wrcomb)(void);
++
++ int (*num_var_ranges)(void);
+ };
+
+ extern int generic_get_free_region(unsigned long base, unsigned long size,
+@@ -73,6 +77,13 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
+ int amd_init_mtrr(void);
+ int cyrix_init_mtrr(void);
+ int centaur_init_mtrr(void);
++#ifdef CONFIG_XEN_DOM0
++void xen_init_mtrr(void);
++#else
++static inline void xen_init_mtrr(void)
++{
++}
++#endif
+
+ extern int changed_by_mtrr_cleanup;
+ extern int mtrr_cleanup(unsigned address_bits);
+diff --git a/arch/x86/kernel/cpu/mtrr/xen.c b/arch/x86/kernel/cpu/mtrr/xen.c
+new file mode 100644
+index 0000000..54ced4b
+--- /dev/null
++++ b/arch/x86/kernel/cpu/mtrr/xen.c
+@@ -0,0 +1,124 @@
++#include <linux/init.h>
++#include <linux/mm.h>
++
++#include "mtrr.h"
++
++#include <xen/xen.h>
++#include <xen/interface/platform.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++
++/*
++ * Program one MTRR through the hypervisor.
++ *
++ * A @size of 0 issues XENPF_del_memtype for @reg; otherwise
++ * XENPF_add_memtype is issued with @base as the first mfn, @size as the
++ * mfn count and @type as the memory type.  A failed hypercall is a BUG.
++ */
++static void xen_set_mtrr(unsigned int reg, unsigned long base,
++			 unsigned long size, mtrr_type type)
++{
++	struct xen_platform_op op;
++	int error;
++
++	/* mtrr_ops->set() is called once per CPU,
++	 * but Xen's ops apply to all CPUs.
++	 */
++	if (smp_processor_id())
++		return;
++
++	if (size == 0) {
++		op.cmd = XENPF_del_memtype;
++		op.u.del_memtype.handle = 0;
++		op.u.del_memtype.reg = reg;
++	} else {
++		op.cmd = XENPF_add_memtype;
++		op.u.add_memtype.mfn = base;
++		op.u.add_memtype.nr_mfns = size;
++		op.u.add_memtype.type = type;
++	}
++
++	error = HYPERVISOR_dom0_op(&op);
++	BUG_ON(error != 0);
++}
++
++/*
++ * Read MTRR @reg back from the hypervisor via XENPF_read_memtype.
++ * If the hypercall fails, @base, @size and @type are all set to 0.
++ */
++static void xen_get_mtrr(unsigned int reg, unsigned long *base,
++			 unsigned long *size, mtrr_type *type)
++{
++	struct xen_platform_op op;
++
++	op.cmd = XENPF_read_memtype;
++	op.u.read_memtype.reg = reg;
++	if (HYPERVISOR_dom0_op(&op) != 0) {
++		*base = 0;
++		*size = 0;
++		*type = 0;
++		return;
++	}
++
++	*size = op.u.read_memtype.nr_mfns;
++	*base = op.u.read_memtype.mfn;
++	*type = op.u.read_memtype.type;
++}
++
++/*
++ * Count the variable ranges by reading registers 0, 1, 2, ... until
++ * XENPF_read_memtype first fails; that register index is the count.
++ */
++static int __init xen_num_var_ranges(void)
++{
++	int ranges;
++	struct xen_platform_op op;
++
++	op.cmd = XENPF_read_memtype;
++
++	for (ranges = 0; ; ranges++) {
++		op.u.read_memtype.reg = ranges;
++		if (HYPERVISOR_dom0_op(&op) != 0)
++			break;
++	}
++	return ranges;
++}
++
++/*
++ * DOM0 TODO: Need to fill in the remaining mtrr methods to have full
++ * working userland mtrr support.
++ */
++static struct mtrr_ops xen_mtrr_ops = {
++	.vendor = X86_VENDOR_UNKNOWN,
++	.get_free_region = generic_get_free_region,
++	.set = xen_set_mtrr,
++	.get = xen_get_mtrr,
++	.have_wrcomb = positive_have_wrcomb,
++	.validate_add_page = generic_validate_add_page,
++	.use_intel_if = 0,
++	.num_var_ranges = xen_num_var_ranges,
++};
++
++/*
++ * Install xen_mtrr_ops as mtrr_if when running as the initial domain on
++ * a CPU that reports one of the MTRR implementations checked below.
++ */
++void __init xen_init_mtrr(void)
++{
++	/*
++	 * Check that we're running under Xen, and privileged enough
++	 * to play with MTRRs.
++	 */
++	if (!xen_initial_domain())
++		return;
++
++	/*
++	 * Check that the CPU has an MTRR implementation we can
++	 * support.
++	 */
++	if (cpu_has_mtrr ||
++	    cpu_has_k6_mtrr ||
++	    cpu_has_cyrix_arr ||
++	    cpu_has_centaur_mcr)
++		mtrr_if = &xen_mtrr_ops;
++}
+diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
+index 5e409dc..a4849c1 100644
+--- a/arch/x86/kernel/crash.c
++++ b/arch/x86/kernel/crash.c
+@@ -27,8 +27,7 @@
+ #include <asm/cpu.h>
+ #include <asm/reboot.h>
+ #include <asm/virtext.h>
+-#include <asm/iommu.h>
+-
++#include <asm/x86_init.h>
+
+ #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
+
+@@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
+ #endif
+
+ #ifdef CONFIG_X86_64
+- pci_iommu_shutdown();
++ x86_platform.iommu_shutdown();
+ #endif
+
+ crash_save_cpu(regs, safe_smp_processor_id());
+diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
+index 5877873..1aab4be 100644
+--- a/arch/x86/kernel/hpet.c
++++ b/arch/x86/kernel/hpet.c
+@@ -98,7 +98,7 @@ static int __init hpet_setup(char *str)
+ }
+ __setup("hpet=", hpet_setup);
+
+-static int __init disable_hpet(char *str)
++int __init disable_hpet(char *str)
+ {
+ boot_hpet_disable = 1;
+ return 1;
+diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
+index 99c4d30..919c1a8 100644
+--- a/arch/x86/kernel/ioport.c
++++ b/arch/x86/kernel/ioport.c
+@@ -30,13 +30,29 @@ static void set_bitmap(unsigned long *bitmap, unsigned int base,
+ }
+ }
+
++void native_set_io_bitmap(struct thread_struct *t,
++ unsigned long bytes_updated)
++{
++ struct tss_struct *tss;
++
++ if (!bytes_updated)
++ return;
++
++ tss = &__get_cpu_var(init_tss);
++
++ /* Update the TSS: */
++ if (t->io_bitmap_ptr)
++ memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
++ else
++ memset(tss->io_bitmap, 0xff, bytes_updated);
++}
++
+ /*
+ * this changes the io permissions bitmap in the current task.
+ */
+ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+ {
+ struct thread_struct *t = ¤t->thread;
+- struct tss_struct *tss;
+ unsigned int i, max_long, bytes, bytes_updated;
+
+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+@@ -61,13 +77,13 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+ }
+
+ /*
+- * do it in the per-thread copy and in the TSS ...
++ * do it in the per-thread copy
+ *
+- * Disable preemption via get_cpu() - we must not switch away
++ * Disable preemption - we must not switch away
+ * because the ->io_bitmap_max value must match the bitmap
+ * contents:
+ */
+- tss = &per_cpu(init_tss, get_cpu());
++ preempt_disable();
+
+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+@@ -85,10 +101,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+
+ t->io_bitmap_max = bytes;
+
+- /* Update the TSS: */
+- memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
++ set_io_bitmap(t, bytes_updated);
+
+- put_cpu();
++ preempt_enable();
+
+ return 0;
+ }
+@@ -119,11 +134,10 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
+ return 0;
+ }
+
+-#ifdef CONFIG_X86_32
+-long sys_iopl(struct pt_regs *regs)
++asmlinkage long sys_iopl(unsigned int level)
+ {
+- unsigned int level = regs->bx;
+ struct thread_struct *t = ¤t->thread;
++ struct pt_regs *regs = task_pt_regs(current);
+ int rc;
+
+ rc = do_iopl(level, regs);
+@@ -135,9 +149,3 @@ long sys_iopl(struct pt_regs *regs)
+ out:
+ return rc;
+ }
+-#else
+-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
+-{
+- return do_iopl(level, regs);
+-}
+-#endif
+diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
+index ec6ef60..fa5b061 100644
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -109,6 +109,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+
+ mutex_init(&mm->context.lock);
+ mm->context.size = 0;
++#ifdef CONFIG_XEN
++ mm->context.has_foreign_mappings = 0;
++#endif
+ old_mm = current->mm;
+ if (old_mm && old_mm->context.size > 0) {
+ mutex_lock(&old_mm->context.lock);
+diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
+index 378e9a8..86ca771 100644
+--- a/arch/x86/kernel/microcode_core.c
++++ b/arch/x86/kernel/microcode_core.c
+@@ -81,6 +81,8 @@
+ #include <linux/fs.h>
+ #include <linux/mm.h>
+
++#include <xen/xen.h>
++#include <asm/xen/hypervisor.h>
+ #include <asm/microcode.h>
+ #include <asm/processor.h>
+
+@@ -503,7 +505,9 @@ static int __init microcode_init(void)
+ struct cpuinfo_x86 *c = &cpu_data(0);
+ int error;
+
+- if (c->x86_vendor == X86_VENDOR_INTEL)
++ if (xen_pv_domain())
++ microcode_ops = init_xen_microcode();
++ else if (c->x86_vendor == X86_VENDOR_INTEL)
+ microcode_ops = init_intel_microcode();
+ else if (c->x86_vendor == X86_VENDOR_AMD)
+ microcode_ops = init_amd_microcode();
+diff --git a/arch/x86/kernel/microcode_xen.c b/arch/x86/kernel/microcode_xen.c
+new file mode 100644
+index 0000000..16c742e
+--- /dev/null
++++ b/arch/x86/kernel/microcode_xen.c
+@@ -0,0 +1,201 @@
++/*
++ * Xen microcode update driver
++ *
++ * Xen does most of the work here. We just pass the whole blob into
++ * Xen, and it will apply it to all CPUs as appropriate. Xen will
++ * worry about how different CPU models are actually updated.
++ */
++#include <linux/sched.h>
++#include <linux/module.h>
++#include <linux/firmware.h>
++#include <linux/vmalloc.h>
++#include <linux/uaccess.h>
++
++#include <asm/microcode.h>
++
++#include <xen/xen.h>
++#include <xen/interface/platform.h>
++#include <xen/interface/xen.h>
++
++#include <asm/xen/hypercall.h>
++#include <asm/xen/hypervisor.h>
++
++MODULE_DESCRIPTION("Xen microcode update driver");
++MODULE_LICENSE("GPL");
++
++struct xen_microcode {
++ size_t len;
++ char data[0];
++};
++
++static int xen_microcode_update(int cpu)
++{
++ int err;
++ struct xen_platform_op op;
++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
++ struct xen_microcode *uc = uci->mc;
++
++ if (uc == NULL || uc->len == 0) {
++ /*
++ * We do all cpus at once, so we don't need to do
++ * other cpus explicitly (besides, these vcpu numbers
++ * have no relationship to underlying physical cpus).
++ */
++ return 0;
++ }
++
++ op.cmd = XENPF_microcode_update;
++ set_xen_guest_handle(op.u.microcode.data, uc->data);
++ op.u.microcode.length = uc->len;
++
++ err = HYPERVISOR_dom0_op(&op);
++
++ if (err != 0)
++ printk(KERN_WARNING "microcode_xen: microcode update failed: %d\n", err);
++
++ return err;
++}
++
++static enum ucode_state xen_request_microcode_fw(int cpu, struct device *device)
++{
++ char name[30];
++ struct cpuinfo_x86 *c = &cpu_data(cpu);
++ const struct firmware *firmware;
++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
++ enum ucode_state ret;
++ struct xen_microcode *uc;
++ size_t size;
++ int err;
++
++ switch (c->x86_vendor) {
++ case X86_VENDOR_INTEL:
++ snprintf(name, sizeof(name), "intel-ucode/%02x-%02x-%02x",
++ c->x86, c->x86_model, c->x86_mask);
++ break;
++
++ case X86_VENDOR_AMD:
++ snprintf(name, sizeof(name), "amd-ucode/microcode_amd.bin");
++ break;
++
++ default:
++ return UCODE_NFOUND;
++ }
++
++ err = request_firmware(&firmware, name, device);
++ if (err) {
++ pr_debug("microcode: data file %s load failed\n", name);
++ return UCODE_NFOUND;
++ }
++
++ /*
++ * Only bother getting real firmware for cpu 0; the others get
++ * dummy placeholders.
++ */
++ if (cpu == 0)
++ size = firmware->size;
++ else
++ size = 0;
++
++ if (uci->mc != NULL) {
++ vfree(uci->mc);
++ uci->mc = NULL;
++ }
++
++ ret = UCODE_ERROR;
++ uc = vmalloc(sizeof(*uc) + size);
++ if (uc == NULL)
++ goto out;
++
++ ret = UCODE_OK;
++ uc->len = size;
++ memcpy(uc->data, firmware->data, uc->len);
++
++ uci->mc = uc;
++
++out:
++ release_firmware(firmware);
++
++ return ret;
++}
++
++static enum ucode_state xen_request_microcode_user(int cpu,
++ const void __user *buf, size_t size)
++{
++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
++ struct xen_microcode *uc;
++ enum ucode_state ret;
++ size_t unread;
++
++ if (cpu != 0) {
++ /* No real firmware for non-zero cpus; just store a
++ placeholder */
++ size = 0;
++ }
++
++ if (uci->mc != NULL) {
++ vfree(uci->mc);
++ uci->mc = NULL;
++ }
++
++ ret = UCODE_ERROR;
++ uc = vmalloc(sizeof(*uc) + size);
++ if (uc == NULL)
++ goto out;
++
++ uc->len = size;
++
++ ret = UCODE_NFOUND;
++
++ /* XXX This sporadically returns uncopied bytes, so we return
++ EFAULT. As far as I can see, the usermode code
++ (microcode_ctl) isn't doing anything wrong... */
++ unread = copy_from_user(uc->data, buf, size);
++
++ if (unread != 0) {
++ printk(KERN_WARNING "failed to read %zd of %zd bytes at %p -> %p\n",
++ unread, size, buf, uc->data);
++ goto out;
++ }
++
++ ret = UCODE_OK;
++
++out:
++ if (ret == UCODE_OK) /* keep the buffer only on success; compare the enum, not a literal 0 */
++ uci->mc = uc;
++ else
++ vfree(uc);
++
++ return ret;
++}
++
++static void xen_microcode_fini_cpu(int cpu)
++{
++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
++
++ vfree(uci->mc);
++ uci->mc = NULL;
++}
++
++static int xen_collect_cpu_info(int cpu, struct cpu_signature *sig)
++{
++ sig->sig = 0;
++ sig->pf = 0;
++ sig->rev = 0;
++
++ return 0;
++}
++
++static struct microcode_ops microcode_xen_ops = {
++ .request_microcode_user = xen_request_microcode_user,
++ .request_microcode_fw = xen_request_microcode_fw,
++ .collect_cpu_info = xen_collect_cpu_info,
++ .apply_microcode = xen_microcode_update,
++ .microcode_fini_cpu = xen_microcode_fini_cpu,
++};
++
++struct microcode_ops * __init init_xen_microcode(void)
++{
++ if (!xen_initial_domain())
++ return NULL;
++ return &microcode_xen_ops;
++}
+diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
+index 1b1739d..f7e115c 100644
+--- a/arch/x86/kernel/paravirt.c
++++ b/arch/x86/kernel/paravirt.c
+@@ -376,6 +376,7 @@ struct pv_cpu_ops pv_cpu_ops = {
+ .swapgs = native_swapgs,
+
+ .set_iopl_mask = native_set_iopl_mask,
++ .set_io_bitmap = native_set_io_bitmap,
+ .io_delay = native_io_delay,
+
+ .start_context_switch = paravirt_nop,
+diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
+index e6ec8a2..c7ae5ca 100644
+--- a/arch/x86/kernel/pci-calgary_64.c
++++ b/arch/x86/kernel/pci-calgary_64.c
+@@ -46,6 +46,7 @@
+ #include <asm/dma.h>
+ #include <asm/rio.h>
+ #include <asm/bios_ebda.h>
++#include <asm/x86_init.h>
+
+ #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
+ int use_calgary __read_mostly = 1;
+@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
+ if (panic_on_overflow)
+ panic("Calgary: fix the allocator.\n");
+ else
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+ }
+ }
+
+@@ -260,11 +261,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+ void *vaddr, unsigned int npages, int direction)
+ {
+ unsigned long entry;
+- dma_addr_t ret = bad_dma_address;
++ dma_addr_t ret = DMA_ERROR_CODE;
+
+ entry = iommu_range_alloc(dev, tbl, npages);
+
+- if (unlikely(entry == bad_dma_address))
++ if (unlikely(entry == DMA_ERROR_CODE))
+ goto error;
+
+ /* set the return dma address */
+@@ -279,7 +280,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+ error:
+ printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
+ "iommu %p\n", npages, tbl);
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+ }
+
+ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+@@ -290,8 +291,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+ unsigned long flags;
+
+ /* were we called with bad_dma_address? */
+- badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
+- if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
++ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
++ if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
+ WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
+ "address 0x%Lx\n", dma_addr);
+ return;
+@@ -375,7 +376,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+ npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
+
+ entry = iommu_range_alloc(dev, tbl, npages);
+- if (entry == bad_dma_address) {
++ if (entry == DMA_ERROR_CODE) {
+ /* makes sure unmap knows to stop */
+ s->dma_length = 0;
+ goto error;
+@@ -393,7 +394,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+ error:
+ calgary_unmap_sg(dev, sg, nelems, dir, NULL);
+ for_each_sg(sg, s, nelems, i) {
+- sg->dma_address = bad_dma_address;
++ sg->dma_address = DMA_ERROR_CODE;
+ sg->dma_length = 0;
+ }
+ return 0;
+@@ -448,7 +449,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
+
+ /* set up tces to cover the allocated range */
+ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
+- if (mapping == bad_dma_address)
++ if (mapping == DMA_ERROR_CODE)
+ goto free;
+ *dma_handle = mapping;
+ return ret;
+@@ -729,7 +730,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
+ struct iommu_table *tbl = pci_iommu(dev->bus);
+
+ /* reserve EMERGENCY_PAGES from bad_dma_address and up */
+- iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
++ iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
+
+ /* avoid the BIOS/VGA first 640KB-1MB region */
+ /* for CalIOC2 - avoid the entire first MB */
+@@ -1346,6 +1347,23 @@ static void __init get_tce_space_from_tar(void)
+ return;
+ }
+
++static int __init calgary_iommu_init(void)
++{
++ int ret;
++
++ /* ok, we're trying to use Calgary - let's roll */
++ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
++
++ ret = calgary_init();
++ if (ret) {
++ printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
++ "falling back to no_iommu\n", ret);
++ return ret;
++ }
++
++ return 0;
++}
++
+ void __init detect_calgary(void)
+ {
+ int bus;
+@@ -1359,7 +1377,7 @@ void __init detect_calgary(void)
+ * if the user specified iommu=off or iommu=soft or we found
+ * another HW IOMMU already, bail out.
+ */
+- if (swiotlb || no_iommu || iommu_detected)
++ if (no_iommu || iommu_detected)
+ return;
+
+ if (!use_calgary)
+@@ -1444,9 +1462,7 @@ void __init detect_calgary(void)
+ printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+ specified_table_size);
+
+- /* swiotlb for devices that aren't behind the Calgary. */
+- if (max_pfn > MAX_DMA32_PFN)
+- swiotlb = 1;
++ x86_init.iommu.iommu_init = calgary_iommu_init;
+ }
+ return;
+
+@@ -1459,35 +1475,6 @@ cleanup:
+ }
+ }
+
+-int __init calgary_iommu_init(void)
+-{
+- int ret;
+-
+- if (no_iommu || (swiotlb && !calgary_detected))
+- return -ENODEV;
+-
+- if (!calgary_detected)
+- return -ENODEV;
+-
+- /* ok, we're trying to use Calgary - let's roll */
+- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+-
+- ret = calgary_init();
+- if (ret) {
+- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+- "falling back to no_iommu\n", ret);
+- return ret;
+- }
+-
+- force_iommu = 1;
+- bad_dma_address = 0x0;
+- /* dma_ops is set to swiotlb or nommu */
+- if (!dma_ops)
+- dma_ops = &nommu_dma_ops;
+-
+- return 0;
+-}
+-
+ static int __init calgary_parse_options(char *p)
+ {
+ unsigned int bridge;
+diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
+index 6ac3931..3e57c58 100644
+--- a/arch/x86/kernel/pci-dma.c
++++ b/arch/x86/kernel/pci-dma.c
+@@ -11,10 +11,12 @@
+ #include <asm/gart.h>
+ #include <asm/calgary.h>
+ #include <asm/amd_iommu.h>
++#include <asm/x86_init.h>
++#include <asm/xen/swiotlb-xen.h>
+
+ static int forbid_dac __read_mostly;
+
+-struct dma_map_ops *dma_ops;
++struct dma_map_ops *dma_ops = &nommu_dma_ops;
+ EXPORT_SYMBOL(dma_ops);
+
+ static int iommu_sac_force __read_mostly;
+@@ -42,9 +44,6 @@ int iommu_detected __read_mostly = 0;
+ */
+ int iommu_pass_through __read_mostly;
+
+-dma_addr_t bad_dma_address __read_mostly = 0;
+-EXPORT_SYMBOL(bad_dma_address);
+-
+ /* Dummy device used for NULL arguments (normally ISA). */
+ struct device x86_dma_fallback_dev = {
+ .init_name = "fallback device",
+@@ -126,18 +125,19 @@ void __init pci_iommu_alloc(void)
+ /* free the range so iommu could get some range less than 4G */
+ dma32_free_bootmem();
+ #endif
++ if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
++ goto out;
+
+- /*
+- * The order of these functions is important for
+- * fall-back/fail-over reasons
+- */
+ gart_iommu_hole_init();
+
+ detect_calgary();
+
+ detect_intel_iommu();
+
++ /* needs to be called after gart_iommu_hole_init */
+ amd_iommu_detect();
++out:
++ pci_xen_swiotlb_init();
+
+ pci_swiotlb_init();
+ }
+@@ -289,25 +289,17 @@ static int __init pci_iommu_init(void)
+ #ifdef CONFIG_PCI
+ dma_debug_add_bus(&pci_bus_type);
+ #endif
++ x86_init.iommu.iommu_init();
+
+- calgary_iommu_init();
+-
+- intel_iommu_init();
+-
+- amd_iommu_init();
++ if (swiotlb || xen_swiotlb) {
++ printk(KERN_INFO "PCI-DMA: "
++ "Using software bounce buffering for IO (SWIOTLB)\n");
++ swiotlb_print_info();
++ } else
++ swiotlb_free();
+
+- gart_iommu_init();
+-
+- no_iommu_init();
+ return 0;
+ }
+-
+-void pci_iommu_shutdown(void)
+-{
+- gart_iommu_shutdown();
+-
+- amd_iommu_shutdown();
+-}
+ /* Must execute after PCI subsystem */
+ rootfs_initcall(pci_iommu_init);
+
+diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
+index fcc0b5c..61c4d1e 100644
+--- a/arch/x86/kernel/pci-gart_64.c
++++ b/arch/x86/kernel/pci-gart_64.c
+@@ -39,6 +39,7 @@
+ #include <asm/swiotlb.h>
+ #include <asm/dma.h>
+ #include <asm/k8.h>
++#include <asm/x86_init.h>
+
+ static unsigned long iommu_bus_base; /* GART remapping area (physical) */
+ static unsigned long iommu_size; /* size of remapping area bytes */
+@@ -46,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */
+
+ static u32 *iommu_gatt_base; /* Remapping table */
+
++static dma_addr_t bad_dma_addr;
++
+ /*
+ * If this is disabled the IOMMU will use an optimized flushing strategy
+ * of only flushing when an mapping is reused. With it true the GART is
+@@ -216,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
+ if (panic_on_overflow)
+ panic("dma_map_area overflow %lu bytes\n", size);
+ iommu_full(dev, size, dir);
+- return bad_dma_address;
++ return bad_dma_addr;
+ }
+
+ for (i = 0; i < npages; i++) {
+@@ -302,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+
+ if (nonforced_iommu(dev, addr, s->length)) {
+ addr = dma_map_area(dev, addr, s->length, dir, 0);
+- if (addr == bad_dma_address) {
++ if (addr == bad_dma_addr) {
+ if (i > 0)
+ gart_unmap_sg(dev, sg, i, dir, NULL);
+ nents = 0;
+@@ -455,7 +458,7 @@ error:
+
+ iommu_full(dev, pages << PAGE_SHIFT, dir);
+ for_each_sg(sg, s, nents, i)
+- s->dma_address = bad_dma_address;
++ s->dma_address = bad_dma_addr;
+ return 0;
+ }
+
+@@ -479,7 +482,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
+ DMA_BIDIRECTIONAL, align_mask);
+
+ flush_gart();
+- if (paddr != bad_dma_address) {
++ if (paddr != bad_dma_addr) {
+ *dma_addr = paddr;
+ return page_address(page);
+ }
+@@ -499,6 +502,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
+ free_pages((unsigned long)vaddr, get_order(size));
+ }
+
++static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
++{
++ return (dma_addr == bad_dma_addr);
++}
++
+ static int no_agp;
+
+ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
+@@ -686,14 +694,15 @@ static struct dma_map_ops gart_dma_ops = {
+ .unmap_page = gart_unmap_page,
+ .alloc_coherent = gart_alloc_coherent,
+ .free_coherent = gart_free_coherent,
++ .mapping_error = gart_mapping_error,
+ };
+
+-void gart_iommu_shutdown(void)
++static void gart_iommu_shutdown(void)
+ {
+ struct pci_dev *dev;
+ int i;
+
+- if (no_agp && (dma_ops != &gart_dma_ops))
++ if (no_agp)
+ return;
+
+ for (i = 0; i < num_k8_northbridges; i++) {
+@@ -708,7 +717,7 @@ void gart_iommu_shutdown(void)
+ }
+ }
+
+-void __init gart_iommu_init(void)
++int __init gart_iommu_init(void)
+ {
+ struct agp_kern_info info;
+ unsigned long iommu_start;
+@@ -718,7 +727,7 @@ void __init gart_iommu_init(void)
+ long i;
+
+ if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
+- return;
++ return 0;
+
+ #ifndef CONFIG_AGP_AMD64
+ no_agp = 1;
+@@ -730,13 +739,6 @@ void __init gart_iommu_init(void)
+ (agp_copy_info(agp_bridge, &info) < 0);
+ #endif
+
+- if (swiotlb)
+- return;
+-
+- /* Did we detect a different HW IOMMU? */
+- if (iommu_detected && !gart_iommu_aperture)
+- return;
+-
+ if (no_iommu ||
+ (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
+ !gart_iommu_aperture ||
+@@ -746,7 +748,7 @@ void __init gart_iommu_init(void)
+ "but GART IOMMU not available.\n");
+ printk(KERN_WARNING "falling back to iommu=soft.\n");
+ }
+- return;
++ return 0;
+ }
+
+ /* need to map that range */
+@@ -791,7 +793,7 @@ void __init gart_iommu_init(void)
+
+ iommu_start = aper_size - iommu_size;
+ iommu_bus_base = info.aper_base + iommu_start;
+- bad_dma_address = iommu_bus_base;
++ bad_dma_addr = iommu_bus_base;
+ iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+ /*
+@@ -838,6 +840,10 @@ void __init gart_iommu_init(void)
+
+ flush_gart();
+ dma_ops = &gart_dma_ops;
++ x86_platform.iommu_shutdown = gart_iommu_shutdown;
++ swiotlb = 0;
++
++ return 0;
+ }
+
+ void __init gart_parse_options(char *p)
+diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
+index a3933d4..22be12b 100644
+--- a/arch/x86/kernel/pci-nommu.c
++++ b/arch/x86/kernel/pci-nommu.c
+@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
+ dma_addr_t bus = page_to_phys(page) + offset;
+ WARN_ON(size == 0);
+ if (!check_addr("map_single", dev, bus, size))
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+ flush_write_buffers();
+ return bus;
+ }
+@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
+ .sync_sg_for_device = nommu_sync_sg_for_device,
+ .is_phys = 1,
+ };
+-
+-void __init no_iommu_init(void)
+-{
+- if (dma_ops)
+- return;
+-
+- force_iommu = 0; /* no HW IOMMU */
+- dma_ops = &nommu_dma_ops;
+-}
+diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
+index aaa6b78..7d2829d 100644
+--- a/arch/x86/kernel/pci-swiotlb.c
++++ b/arch/x86/kernel/pci-swiotlb.c
+@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = {
+ .dma_supported = NULL,
+ };
+
+-void __init pci_swiotlb_init(void)
++/*
++ * pci_swiotlb_detect - set swiotlb to 1 if necessary
++ *
++ * This returns non-zero if we are forced to use swiotlb (by the boot
++ * option).
++ */
++int __init pci_swiotlb_detect(void)
+ {
++ int use_swiotlb = swiotlb | swiotlb_force;
++
+ /* don't initialize swiotlb if iommu=off (no_iommu=1) */
+ #ifdef CONFIG_X86_64
+- if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
++ if (!no_iommu && max_pfn > MAX_DMA32_PFN)
+ swiotlb = 1;
+ #endif
+ if (swiotlb_force)
+ swiotlb = 1;
++
++ return use_swiotlb;
++}
++
++void __init pci_swiotlb_init(void)
++{
+ if (swiotlb) {
+- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+- swiotlb_init();
++ swiotlb_init(0);
+ dma_ops = &swiotlb_dma_ops;
+ }
+ }
+diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
+index f010ab4..6b39f09 100644
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -73,16 +73,12 @@ void exit_thread(void)
+ unsigned long *bp = t->io_bitmap_ptr;
+
+ if (bp) {
+- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+-
++ preempt_disable();
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
+- /*
+- * Careful, clear this in the TSS too:
+- */
+- memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
++ set_io_bitmap(t, t->io_bitmap_max);
+ t->io_bitmap_max = 0;
+- put_cpu();
++ preempt_enable();
+ kfree(bp);
+ }
+ }
+@@ -199,19 +195,10 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+ hard_enable_TSC();
+ }
+
+- if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+- /*
+- * Copy the relevant range of the IO bitmap.
+- * Normally this is 128 bytes or less:
+- */
+- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+- max(prev->io_bitmap_max, next->io_bitmap_max));
+- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+- /*
+- * Clear any possible leftover bits:
+- */
+- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+- }
++ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP) ||
++ test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
++ set_io_bitmap(next,
++ max(prev->io_bitmap_max, next->io_bitmap_max));
+ }
+
+ int sys_fork(struct pt_regs *regs)
+diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
+index bff34d6..704bddc 100644
+--- a/arch/x86/kernel/reboot.c
++++ b/arch/x86/kernel/reboot.c
+@@ -23,7 +23,7 @@
+ # include <linux/ctype.h>
+ # include <linux/mc146818rtc.h>
+ #else
+-# include <asm/iommu.h>
++# include <asm/x86_init.h>
+ #endif
+
+ /*
+@@ -639,7 +639,7 @@ void native_machine_shutdown(void)
+ #endif
+
+ #ifdef CONFIG_X86_64
+- pci_iommu_shutdown();
++ x86_platform.iommu_shutdown();
+ #endif
+ }
+
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index 8425f7e..9f1d581 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -89,6 +89,7 @@
+ #include <asm/cacheflush.h>
+ #include <asm/processor.h>
+ #include <asm/bugs.h>
++#include <asm/tlbflush.h>
+
+ #include <asm/system.h>
+ #include <asm/vsyscall.h>
+@@ -955,6 +956,9 @@ void __init setup_arch(char **cmdline_p)
+
+ initmem_init(0, max_pfn);
+
++ /* Initialize cross-cpu tlb flushes */
++ init_smp_flush();
++
+ #ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
+index 4449a4a..d11c5ff 100644
+--- a/arch/x86/kernel/x86_init.c
++++ b/arch/x86/kernel/x86_init.c
+@@ -14,10 +14,13 @@
+ #include <asm/time.h>
+ #include <asm/irq.h>
+ #include <asm/tsc.h>
++#include <asm/iommu.h>
+
+ void __cpuinit x86_init_noop(void) { }
+ void __init x86_init_uint_noop(unsigned int unused) { }
+ void __init x86_init_pgd_noop(pgd_t *unused) { }
++int __init iommu_init_noop(void) { return 0; }
++void iommu_shutdown_noop(void) { }
+
+ /*
+ * The platform setup functions are preset with the default functions
+@@ -62,6 +65,10 @@ struct x86_init_ops x86_init __initdata = {
+ .tsc_pre_init = x86_init_noop,
+ .timer_init = hpet_time_init,
+ },
++
++ .iommu = {
++ .iommu_init = iommu_init_noop,
++ },
+ };
+
+ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+@@ -72,4 +79,5 @@ struct x86_platform_ops x86_platform = {
+ .calibrate_tsc = native_calibrate_tsc,
+ .get_wallclock = mach_get_cmos_time,
+ .set_wallclock = mach_set_rtc_mmss,
++ .iommu_shutdown = iommu_shutdown_noop,
+ };
+diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
+index 06630d2..ad895ae 100644
+--- a/arch/x86/mm/Makefile
++++ b/arch/x86/mm/Makefile
+@@ -6,6 +6,11 @@ nostackp := $(call cc-option, -fno-stack-protector)
+ CFLAGS_physaddr.o := $(nostackp)
+ CFLAGS_setup_nx.o := $(nostackp)
+
++# Make sure __phys_addr has no stackprotector
++nostackp := $(call cc-option, -fno-stack-protector)
++CFLAGS_ioremap.o := $(nostackp)
++CFLAGS_init.o := $(nostackp)
++
+ obj-$(CONFIG_SMP) += tlb.o
+
+ obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
+diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
+index 71da1bc..892b8eb 100644
+--- a/arch/x86/mm/gup.c
++++ b/arch/x86/mm/gup.c
+@@ -313,6 +313,11 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ goto slow_irqon;
+ #endif
+
++#ifdef CONFIG_XEN
++ if (unlikely(mm->context.has_foreign_mappings))
++ goto slow_irqon;
++#endif
++
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
+index e78cd0e..fb91994 100644
+--- a/arch/x86/mm/pat.c
++++ b/arch/x86/mm/pat.c
+@@ -666,7 +666,7 @@ void io_free_memtype(resource_size_t start, resource_size_t end)
+ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot)
+ {
+- return vma_prot;
++ return __pgprot(pgprot_val(vma_prot) | _PAGE_IOMAP);
+ }
+
+ #ifdef CONFIG_STRICT_DEVMEM
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index ed34f5e..25fc1df 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -4,8 +4,19 @@
+ #include <asm/tlb.h>
+ #include <asm/fixmap.h>
+
++#include <xen/xen.h>
++#include <asm/xen/hypervisor.h>
++
+ #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
++#ifdef CONFIG_HIGHPTE
++#define PGALLOC_USER_GFP __GFP_HIGHMEM
++#else
++#define PGALLOC_USER_GFP 0
++#endif
++
++gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
++
+ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+ {
+ return (pte_t *)__get_free_page(PGALLOC_GFP);
+@@ -15,16 +26,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+ {
+ struct page *pte;
+
+-#ifdef CONFIG_HIGHPTE
+- pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
+-#else
+- pte = alloc_pages(PGALLOC_GFP, 0);
+-#endif
++ pte = alloc_pages(__userpte_alloc_gfp, 0);
+ if (pte)
+ pgtable_page_ctor(pte);
+ return pte;
+ }
+
++static int __init setup_userpte(char *arg)
++{
++ if (!arg)
++ return -EINVAL;
++
++ /*
++ * "userpte=nohigh" disables allocation of user pagetables in
++ * high memory.
++ */
++ if (strcmp(arg, "nohigh") == 0)
++ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
++ else
++ return -EINVAL;
++ return 0;
++}
++early_param("userpte", setup_userpte);
++
+ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+ {
+ pgtable_page_dtor(pte);
+@@ -267,6 +291,12 @@ out:
+
+ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+ {
++#ifdef CONFIG_XEN
++ /* EEW */
++ extern void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd);
++ if (xen_pv_domain())
++ xen_late_unpin_pgd(mm, pgd);
++#endif
+ pgd_mop_up_pmds(mm, pgd);
+ pgd_dtor(pgd);
+ paravirt_pgd_free(mm, pgd);
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index 36fe08e..7317947 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -148,13 +148,25 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
+ * BUG();
+ */
+
+- if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
++ if (f->flush_mm == NULL ||
++ f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
++ int tlbstate = percpu_read(cpu_tlbstate.state);
++
++ /*
++ * flush_mm == NULL means flush everything, including
++ * global tlbs, which will only happen when flushing
++ * kernel mappings.
++ */
++ if (f->flush_mm == NULL)
++ __flush_tlb_all();
++ else if (tlbstate == TLBSTATE_OK) {
+ if (f->flush_va == TLB_FLUSH_ALL)
+ local_flush_tlb();
+ else
+ __flush_tlb_one(f->flush_va);
+- } else
++ }
++
++ if (tlbstate == TLBSTATE_LAZY)
+ leave_mm(cpu);
+ }
+ out:
+@@ -217,16 +229,13 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
+ flush_tlb_others_ipi(cpumask, mm, va);
+ }
+
+-static int __cpuinit init_smp_flush(void)
++void __init init_smp_flush(void)
+ {
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(flush_state); i++)
+ spin_lock_init(&flush_state[i].tlbstate_lock);
+-
+- return 0;
+ }
+-core_initcall(init_smp_flush);
+
+ void flush_tlb_current_task(void)
+ {
+@@ -274,17 +283,19 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+
+ preempt_enable();
+ }
++EXPORT_SYMBOL_GPL(flush_tlb_page);
+
+-static void do_flush_tlb_all(void *info)
++void flush_tlb_all(void)
+ {
+- unsigned long cpu = smp_processor_id();
++ /* flush_tlb_others expects preempt to be disabled */
++ int cpu = get_cpu();
++
++ flush_tlb_others(cpu_online_mask, NULL, TLB_FLUSH_ALL);
+
+ __flush_tlb_all();
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+ leave_mm(cpu);
+-}
+
+-void flush_tlb_all(void)
+-{
+- on_each_cpu(do_flush_tlb_all, NULL, 1);
++ put_cpu();
+ }
++EXPORT_SYMBOL_GPL(flush_tlb_all);
+diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
+index d49202e..64182c5 100644
+--- a/arch/x86/pci/Makefile
++++ b/arch/x86/pci/Makefile
+@@ -4,6 +4,7 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o
+ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
+ obj-$(CONFIG_PCI_DIRECT) += direct.o
+ obj-$(CONFIG_PCI_OLPC) += olpc.o
++obj-$(CONFIG_PCI_XEN) += xen.o
+
+ obj-y += fixup.o
+ obj-$(CONFIG_ACPI) += acpi.o
+diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
+index 1331fcf..30a9808 100644
+--- a/arch/x86/pci/common.c
++++ b/arch/x86/pci/common.c
+@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
+ unsigned int pci_early_dump_regs;
+ static int pci_bf_sort;
+ int pci_routeirq;
++int pci_scan_all_fns;
+ int noioapicquirk;
+ #ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+ int noioapicreroute = 0;
+@@ -412,26 +413,31 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
+
+ extern u8 pci_cache_line_size;
+
+-int __init pcibios_init(void)
++void __init pcibios_set_cache_line_size(void)
+ {
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+- if (!raw_pci_ops) {
+- printk(KERN_WARNING "PCI: System does not support PCI\n");
+- return 0;
+- }
+-
+ /*
+ * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8
+ * and P4. It's also good for 386/486s (which actually have 16)
+ * as quite a few PCI devices do not support smaller values.
+ */
++
+ pci_cache_line_size = 32 >> 2;
+ if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
+ pci_cache_line_size = 64 >> 2; /* K7 & K8 */
+ else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
+ pci_cache_line_size = 128 >> 2; /* P4 */
++}
++
++int __init pcibios_init(void)
++{
++ if (!raw_pci_ops) {
++ printk(KERN_WARNING "PCI: System does not support PCI\n");
++ return 0;
++ }
+
++ pcibios_set_cache_line_size();
+ pcibios_resource_survey();
+
+ if (pci_bf_sort >= pci_force_bf)
+diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
+index a672f12..91d040e 100644
+--- a/arch/x86/pci/i386.c
++++ b/arch/x86/pci/i386.c
+@@ -283,6 +283,8 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+
+ prot = pgprot_val(vma->vm_page_prot);
+
++ prot |= _PAGE_IOMAP; /* creating a mapping for IO */
++
+ /*
+ * Return error if pat is not enabled and write_combine is requested.
+ * Caller can followup with UC MINUS request and add a WC mtrr if there
+diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
+index 25a1f8e..4e2f90a 100644
+--- a/arch/x86/pci/init.c
++++ b/arch/x86/pci/init.c
+@@ -15,10 +15,16 @@ static __init int pci_arch_init(void)
+ if (!(pci_probe & PCI_PROBE_NOEARLY))
+ pci_mmcfg_early_init();
+
++#ifdef CONFIG_PCI_XEN
++ if (!pci_xen_init())
++ return 0;
++#endif
++
+ #ifdef CONFIG_PCI_OLPC
+ if (!pci_olpc_init())
+ return 0; /* skip additional checks if it's an XO */
+ #endif
++
+ #ifdef CONFIG_PCI_BIOS
+ pci_pcbios_init();
+ #endif
+diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
+new file mode 100644
+index 0000000..1b922aa
+--- /dev/null
++++ b/arch/x86/pci/xen.c
+@@ -0,0 +1,51 @@
++/*
++ * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
++ * x86 PCI core to support the Xen PCI Frontend
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/pci.h>
++#include <linux/acpi.h>
++
++#include <asm/io.h>
++#include <asm/pci_x86.h>
++
++#include <asm/xen/hypervisor.h>
++
++static int xen_pcifront_enable_irq(struct pci_dev *dev)
++{
++ return 0;
++}
++
++int __init pci_xen_init(void)
++{
++ if (!xen_pv_domain() || xen_initial_domain())
++ return -ENODEV;
++
++ printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
++
++ pcibios_set_cache_line_size();
++
++ pcibios_enable_irq = xen_pcifront_enable_irq;
++ pcibios_disable_irq = NULL;
++
++#ifdef CONFIG_ACPI
++ /* Keep ACPI out of the picture */
++ acpi_noirq = 1;
++#endif
++
++#ifdef CONFIG_ISAPNP
++ /* Stop isapnp from probing */
++ isapnp_disable = 1;
++#endif
++
++ /* Ensure a device still gets scanned even if its fn number
++ * is non-zero.
++ */
++ pci_scan_all_fns = 1;
++
++ return 0;
++}
++
+diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
+index b83e119..3da23c7 100644
+--- a/arch/x86/xen/Kconfig
++++ b/arch/x86/xen/Kconfig
+@@ -36,3 +36,39 @@ config XEN_DEBUG_FS
+ help
+ Enable statistics output and various tuning options in debugfs.
+ Enabling this option may incur a significant performance overhead.
++
++config SWIOTLB_XEN
++ def_bool y
++ depends on XEN && SWIOTLB
++
++config MICROCODE_XEN
++ def_bool y
++ depends on XEN_DOM0 && MICROCODE
++
++config XEN_DOM0
++ bool "Enable Xen privileged domain support"
++ depends on XEN && X86_IO_APIC && ACPI
++ help
++ The Xen hypervisor requires a privileged domain ("dom0") to
++ actually manage the machine, provide device drivers, etc.
++ This option enables dom0 support. A dom0 kernel can also
++ run as an unprivileged domU kernel, or a kernel running
++ native on bare hardware.
++
++# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
++# name in tools.
++config XEN_PRIVILEGED_GUEST
++ def_bool XEN_DOM0
++
++config XEN_DOM0_PCI
++ def_bool y
++ depends on XEN_DOM0 && PCI
++ select PCI_XEN
++
++config XEN_PCI_PASSTHROUGH
++ bool #"Enable support for Xen PCI passthrough devices"
++ depends on XEN && PCI
++ select PCI_XEN
++ help
++ Enable support for passing PCI devices through to
++ unprivileged domains. (COMPLETELY UNTESTED)
+\ No newline at end of file
+diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
+index 3bb4fc2..08ac224 100644
+--- a/arch/x86/xen/Makefile
++++ b/arch/x86/xen/Makefile
+@@ -17,4 +17,7 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
+ obj-$(CONFIG_SMP) += smp.o
+ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
+ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
+-
++obj-$(CONFIG_XEN_DOM0) += vga.o
++obj-$(CONFIG_XEN_DOM0) += apic.o
++obj-$(CONFIG_SWIOTLB) += pci-swiotlb-xen.o
++obj-$(CONFIG_XEN_DOM0_PCI) += pci.o
+\ No newline at end of file
+diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
+new file mode 100644
+index 0000000..21a3089
+--- /dev/null
++++ b/arch/x86/xen/apic.c
+@@ -0,0 +1,33 @@
++#include <linux/kernel.h>
++#include <linux/threads.h>
++#include <linux/bitmap.h>
++
++#include <asm/io_apic.h>
++#include <asm/acpi.h>
++#include <asm/hw_irq.h>
++
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++
++#include <xen/xen.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/physdev.h>
++
++void __init xen_io_apic_init(void)
++{
++ enable_IO_APIC();
++}
++
++void xen_init_apic(void)
++{
++ if (!xen_initial_domain())
++ return; /* nothing to do unless we are the initial domain (dom0) */
++
++#ifdef CONFIG_ACPI
++ /*
++ * Pretend ACPI found our lapic even though we've disabled it,
++ * to prevent MP tables from setting up lapics.
++ */
++ acpi_lapic = 1;
++#endif
++}
+diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
+index 79f9738..765f714 100644
+--- a/arch/x86/xen/enlighten.c
++++ b/arch/x86/xen/enlighten.c
+@@ -28,6 +28,7 @@
+ #include <linux/highmem.h>
+ #include <linux/console.h>
+
++#include <xen/xen.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/version.h>
+ #include <xen/interface/physdev.h>
+@@ -48,6 +49,7 @@
+ #include <asm/traps.h>
+ #include <asm/setup.h>
+ #include <asm/desc.h>
++#include <asm/pgalloc.h>
+ #include <asm/pgtable.h>
+ #include <asm/tlbflush.h>
+ #include <asm/reboot.h>
+@@ -65,6 +67,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+ enum xen_domain_type xen_domain_type = XEN_NATIVE;
+ EXPORT_SYMBOL_GPL(xen_domain_type);
+
++unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
++EXPORT_SYMBOL(machine_to_phys_mapping);
++unsigned int machine_to_phys_order;
++EXPORT_SYMBOL(machine_to_phys_order);
++
+ struct start_info *xen_start_info;
+ EXPORT_SYMBOL_GPL(xen_start_info);
+
+@@ -166,13 +173,16 @@ static void __init xen_banner(void)
+
+ printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+ pv_info.name);
+- printk(KERN_INFO "Xen version: %d.%d%s%s\n",
++ printk(KERN_INFO "Xen version: %d.%d%s%s%s\n",
+ version >> 16, version & 0xffff, extra.extraversion,
+- xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
++ xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ?
++ " (preserve-AD)" : "",
++ xen_initial_domain() ? " (dom0)" : "");
+ }
+
+ static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
+ static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
++static __read_mostly unsigned int cpuid_leaf81_edx_mask = ~0;
+
+ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
+ unsigned int *cx, unsigned int *dx)
+@@ -186,7 +196,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
+ * unsupported kernel subsystems as possible.
+ */
+ switch (*ax) {
+- case 1:
++ case 0x1:
+ maskecx = cpuid_leaf1_ecx_mask;
+ maskedx = cpuid_leaf1_edx_mask;
+ break;
+@@ -195,6 +205,10 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
+ /* Suppress extended topology stuff */
+ maskebx = 0;
+ break;
++
++ case 0x80000001:
++ maskedx = cpuid_leaf81_edx_mask;
++ break;
+ }
+
+ asm(XEN_EMULATE_PREFIX "cpuid"
+@@ -216,8 +230,11 @@ static __init void xen_init_cpuid_mask(void)
+ cpuid_leaf1_edx_mask =
+ ~((1 << X86_FEATURE_MCE) | /* disable MCE */
+ (1 << X86_FEATURE_MCA) | /* disable MCA */
++ (1 << X86_FEATURE_PAT) | /* disable PAT */
+ (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+
++ cpuid_leaf81_edx_mask = ~(1 << (X86_FEATURE_GBPAGES % 32));
++
+ if (!xen_initial_domain())
+ cpuid_leaf1_edx_mask &=
+ ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
+@@ -405,7 +422,7 @@ static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_RO);
+
+- if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
++ if (HYPERVISOR_update_va_mapping(va, pte, 0))
+ BUG();
+
+ frames[f] = mfn;
+@@ -518,11 +535,10 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
+ } else if (addr == (unsigned long)machine_check) {
+ return 0;
+ #endif
+- } else {
+- /* Some other trap using IST? */
+- if (WARN_ON(val->ist != 0))
+- return 0;
+- }
++ } else if (WARN(val->ist != 0,
++ "Unknown IST-using trap: vector %d, %pF, val->ist=%d\n",
++ vector, (void *)addr, val->ist))
++ return 0;
+ #endif /* CONFIG_X86_64 */
+ info->address = addr;
+
+@@ -678,6 +694,18 @@ static void xen_set_iopl_mask(unsigned mask)
+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ }
+
++static void xen_set_io_bitmap(struct thread_struct *thread,
++ unsigned long bytes_updated) /* bytes_updated is not used here */
++{
++ struct physdev_set_iobitmap set_iobitmap;
++
++ set_xen_guest_handle(set_iobitmap.bitmap,
++ (char *)thread->io_bitmap_ptr);
++ set_iobitmap.nr_ports = thread->io_bitmap_ptr ? IO_BITMAP_BITS : 0; /* no bitmap => 0 ports */
++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
++ &set_iobitmap)); /* a failing hypercall only warns; execution continues */
++}
++
+ static void xen_io_delay(void)
+ {
+ }
+@@ -715,7 +743,7 @@ static u32 xen_safe_apic_wait_icr_idle(void)
+ return 0;
+ }
+
+-static void set_xen_basic_apic_ops(void)
++static __init void set_xen_basic_apic_ops(void)
+ {
+ apic->read = xen_apic_read;
+ apic->write = xen_apic_write;
+@@ -977,6 +1005,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+ .load_sp0 = xen_load_sp0,
+
+ .set_iopl_mask = xen_set_iopl_mask,
++ .set_io_bitmap = xen_set_io_bitmap,
+ .io_delay = xen_io_delay,
+
+ /* Xen takes care of %gs when switching to usermode for us */
+@@ -1019,6 +1048,14 @@ static void xen_machine_halt(void)
+ xen_reboot(SHUTDOWN_poweroff);
+ }
+
++static void xen_machine_power_off(void)
++{
++ if (pm_power_off)
++ pm_power_off(); /* a registered pm_power_off handler takes precedence */
++ else
++ xen_reboot(SHUTDOWN_poweroff); /* otherwise same as xen_machine_halt() */
++}
++
+ static void xen_crash_shutdown(struct pt_regs *regs)
+ {
+ xen_reboot(SHUTDOWN_crash);
+@@ -1027,7 +1064,7 @@ static void xen_crash_shutdown(struct pt_regs *regs)
+ static const struct machine_ops __initdata xen_machine_ops = {
+ .restart = xen_restart,
+ .halt = xen_machine_halt,
+- .power_off = xen_machine_halt,
++ .power_off = xen_machine_power_off,
+ .shutdown = xen_machine_halt,
+ .crash_shutdown = xen_crash_shutdown,
+ .emergency_restart = xen_emergency_restart,
+@@ -1060,6 +1097,8 @@ asmlinkage void __init xen_start_kernel(void)
+
+ xen_domain_type = XEN_PV_DOMAIN;
+
++ xen_setup_machphys_mapping();
++
+ /* Install Xen paravirt ops */
+ pv_info = xen_info;
+ pv_init_ops = xen_init_ops;
+@@ -1085,6 +1124,12 @@ asmlinkage void __init xen_start_kernel(void)
+
+ xen_init_mmu_ops();
+
++ /*
++ * Prevent page tables from being allocated in highmem, even
++ * if CONFIG_HIGHPTE is enabled.
++ */
++ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
++
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ if (!xen_initial_domain())
+@@ -1137,6 +1182,8 @@ asmlinkage void __init xen_start_kernel(void)
+
+ pgd = (pgd_t *)xen_start_info->pt_base;
+
++ __supported_pte_mask |= _PAGE_IOMAP;
++
+ /* Don't do the full vcpu_info placement stuff until we have a
+ possible map and a non-dummy shared_info. */
+ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+@@ -1146,6 +1193,7 @@ asmlinkage void __init xen_start_kernel(void)
+
+ xen_raw_console_write("mapping kernel into physical memory\n");
+ pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
++ xen_ident_map_ISA();
+
+ init_mm.pgd = pgd;
+
+@@ -1155,6 +1203,14 @@ asmlinkage void __init xen_start_kernel(void)
+ if (xen_feature(XENFEAT_supervisor_mode_kernel))
+ pv_info.kernel_rpl = 0;
+
++ if (xen_initial_domain()) {
++ struct physdev_set_iopl set_iopl;
++ set_iopl.iopl = 1;
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl) == -1)
++ BUG();
++ xen_init_apic();
++ }
++
+ /* set the limit of our address space */
+ xen_reserve_top();
+
+@@ -1177,6 +1233,16 @@ asmlinkage void __init xen_start_kernel(void)
+ add_preferred_console("xenboot", 0, NULL);
+ add_preferred_console("tty", 0, NULL);
+ add_preferred_console("hvc", 0, NULL);
++
++ boot_params.screen_info.orig_video_isVGA = 0;
++ } else {
++ const struct dom0_vga_console_info *info =
++ (void *)((char *)xen_start_info +
++ xen_start_info->console.dom0.info_off);
++
++ xen_init_vga(info, xen_start_info->console.dom0.info_size);
++ xen_start_info->console.domU.mfn = 0;
++ xen_start_info->console.domU.evtchn = 0;
+ }
+
+ xen_raw_console_write("about to get started...\n");
+diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
+index bf4cd6b..3e6b558 100644
+--- a/arch/x86/xen/mmu.c
++++ b/arch/x86/xen/mmu.c
+@@ -50,7 +50,9 @@
+ #include <asm/mmu_context.h>
+ #include <asm/setup.h>
+ #include <asm/paravirt.h>
++#include <asm/e820.h>
+ #include <asm/linkage.h>
++#include <asm/page.h>
+
+ #include <asm/xen/hypercall.h>
+ #include <asm/xen/hypervisor.h>
+@@ -58,6 +60,7 @@
+ #include <xen/page.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/version.h>
++#include <xen/interface/memory.h>
+ #include <xen/hvc-console.h>
+
+ #include "multicalls.h"
+@@ -66,6 +69,13 @@
+
+ #define MMU_UPDATE_HISTO 30
+
++/*
++ * Protects atomic reservation decrease/increase against concurrent increases.
++ * Also protects non-atomic updates of current_pages and driver_pages, and
++ * balloon lists.
++ */
++DEFINE_SPINLOCK(xen_reservation_lock);
++
+ #ifdef CONFIG_XEN_DEBUG_FS
+
+ static struct {
+@@ -184,6 +194,26 @@ static inline unsigned p2m_index(unsigned long pfn)
+ return pfn % P2M_ENTRIES_PER_PAGE;
+ }
+
++static int lookup_pte_fn(
++ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
++{
++ uint64_t *ptep = (uint64_t *)data;
++ if (ptep) /* a NULL data pointer means the caller does not want the result */
++ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
++ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); /* MFN of the page-table page + offset of the pte within it */
++ return 0;
++}
++
++int create_lookup_pte_addr(struct mm_struct *mm,
++ unsigned long address,
++ uint64_t *ptep)
++{
++ return apply_to_page_range(mm, address, PAGE_SIZE, /* exactly one page at address */
++ lookup_pte_fn, ptep); /* lookup_pte_fn stores its result through ptep (if non-NULL) */
++}
++
++EXPORT_SYMBOL(create_lookup_pte_addr);
++
+ /* Build the parallel p2m_top_mfn structures */
+ void xen_build_mfn_list_list(void)
+ {
+@@ -315,6 +345,7 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr)
+
+ return PFN_DOWN(maddr.maddr);
+ }
++EXPORT_SYMBOL_GPL(set_phys_to_machine);
+
+ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
+ {
+@@ -376,6 +407,34 @@ static bool xen_page_pinned(void *ptr)
+ return PagePinned(page);
+ }
+
++static bool xen_iomap_pte(pte_t pte)
++{
++ return pte_flags(pte) & _PAGE_IOMAP;
++}
++
++void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
++{
++ struct multicall_space mcs;
++ struct mmu_update *u;
++
++ mcs = xen_mc_entry(sizeof(*u)); /* multicall slot with room for one mmu_update */
++ u = mcs.args;
++
++ /* ptep might be kmapped when using 32-bit HIGHPTE */
++ u->ptr = arbitrary_virt_to_machine(ptep).maddr;
++ u->val = pte_val_ma(pteval);
++
++ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); /* one update; domid is passed straight through */
++
++ xen_mc_issue(PARAVIRT_LAZY_MMU);
++}
++EXPORT_SYMBOL_GPL(xen_set_domain_pte);
++
++static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
++{
++ xen_set_domain_pte(ptep, pteval, DOMID_IO);
++}
++
+ static void xen_extend_mmu_update(const struct mmu_update *update)
+ {
+ struct multicall_space mcs;
+@@ -452,6 +511,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+ {
++ if (xen_iomap_pte(pteval)) {
++ xen_set_iomap_pte(ptep, pteval);
++ goto out;
++ }
++
+ ADD_STATS(set_pte_at, 1);
+ // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
+ ADD_STATS(set_pte_at_current, mm == current->mm);
+@@ -522,8 +586,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
+ return val;
+ }
+
++static pteval_t iomap_pte(pteval_t val)
++{
++ if (val & _PAGE_PRESENT) {
++ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
++ pteval_t flags = val & PTE_FLAGS_MASK;
++
++ /* We assume the pte frame number is a MFN, so
++ just use it as-is. */
++ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
++ }
++
++ return val;
++}
++
+ pteval_t xen_pte_val(pte_t pte)
+ {
++ if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
++ return pte.pte;
++
+ return pte_mfn_to_pfn(pte.pte);
+ }
+ PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
+@@ -536,7 +617,22 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
+
+ pte_t xen_make_pte(pteval_t pte)
+ {
+- pte = pte_pfn_to_mfn(pte);
++ phys_addr_t addr = (pte & PTE_PFN_MASK);
++
++ /*
++ * Unprivileged domains are allowed to do IOMAPpings for
++ * PCI passthrough, but not map ISA space. The ISA
++ * mappings are just dummy local mappings to keep other
++ * parts of the kernel happy.
++ */
++ if (unlikely(pte & _PAGE_IOMAP) &&
++ (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
++ pte = iomap_pte(pte);
++ } else {
++ pte &= ~_PAGE_IOMAP;
++ pte = pte_pfn_to_mfn(pte);
++ }
++
+ return native_make_pte(pte);
+ }
+ PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
+@@ -592,6 +688,11 @@ void xen_set_pud(pud_t *ptr, pud_t val)
+
+ void xen_set_pte(pte_t *ptep, pte_t pte)
+ {
++ if (xen_iomap_pte(pte)) {
++ xen_set_iomap_pte(ptep, pte);
++ return;
++ }
++
+ ADD_STATS(pte_update, 1);
+ // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
+ ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
+@@ -608,6 +709,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte)
+ #ifdef CONFIG_X86_PAE
+ void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+ {
++ if (xen_iomap_pte(pte)) {
++ xen_set_iomap_pte(ptep, pte);
++ return;
++ }
++
+ set_64bit((u64 *)ptep, native_pte_val(pte));
+ }
+
+@@ -1219,7 +1325,7 @@ void xen_exit_mmap(struct mm_struct *mm)
+ spin_lock(&mm->page_table_lock);
+
+ /* pgd may not be pinned in the error exit path of execve */
+- if (xen_page_pinned(mm->pgd))
++ if (xen_page_pinned(mm->pgd) && !mm->context.has_foreign_mappings)
+ xen_pgd_unpin(mm);
+
+ spin_unlock(&mm->page_table_lock);
+@@ -1288,12 +1394,19 @@ static void xen_flush_tlb_single(unsigned long addr)
+ preempt_enable();
+ }
+
++/*
++ * Flush tlb on other cpus. Xen can do this via a single hypercall
++ * rather than explicit IPIs, which has the nice property of avoiding
++ * any cpus which don't actually have dirty tlbs. Unfortunately it
++ * doesn't give us an opportunity to kick out cpus which are in lazy
++ * tlb state, so we may end up reflushing some cpus unnecessarily.
++ */
+ static void xen_flush_tlb_others(const struct cpumask *cpus,
+ struct mm_struct *mm, unsigned long va)
+ {
+ struct {
+ struct mmuext_op op;
+- DECLARE_BITMAP(mask, NR_CPUS);
++ DECLARE_BITMAP(mask, num_processors);
+ } *args;
+ struct multicall_space mcs;
+
+@@ -1417,6 +1530,13 @@ static int xen_pgd_alloc(struct mm_struct *mm)
+ return ret;
+ }
+
++void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd)
++{
++ if (xen_page_pinned(pgd))
++ __xen_pgd_unpin(mm, pgd);
++
++}
++
+ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+ {
+ #ifdef CONFIG_X86_64
+@@ -1432,14 +1552,15 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+ {
+ pgprot_t prot = PAGE_KERNEL;
+
++ /*
++ * We disable highmem allocations for page tables so we should never
++ * see any calls to kmap_atomic_pte on a highmem page.
++ */
++ BUG_ON(PageHighMem(page));
++
+ if (PagePinned(page))
+ prot = PAGE_KERNEL_RO;
+
+- if (0 && PageHighMem(page))
+- printk("mapping highpte %lx type %d prot %s\n",
+- page_to_pfn(page), type,
+- (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+-
+ return kmap_atomic_prot(page, type, prot);
+ }
+ #endif
+@@ -1447,10 +1568,17 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+ #ifdef CONFIG_X86_32
+ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+ {
+- /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+- if (pte_val_ma(*ptep) & _PAGE_PRESENT)
++ pte_t oldpte = *ptep;
++
++ if (pte_flags(oldpte) & _PAGE_PRESENT) {
++ /* Don't allow existing IO mappings to be overridden */
++ if (pte_flags(oldpte) & _PAGE_IOMAP)
++ pte = oldpte;
++
++ /* Don't allow _PAGE_RW to be set on existing pte */
+ pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+ pte_val_ma(pte));
++ }
+
+ return pte;
+ }
+@@ -1619,6 +1747,7 @@ static void *m2v(phys_addr_t maddr)
+ return __ka(m2p(maddr));
+ }
+
++/* Set the page permissions on an identity-mapped page */
+ static void set_page_prot(void *addr, pgprot_t prot)
+ {
+ unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+@@ -1674,6 +1803,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+ set_page_prot(pmd, PAGE_KERNEL_RO);
+ }
+
++void __init xen_setup_machphys_mapping(void)
++{
++ struct xen_machphys_mapping mapping;
++ unsigned long machine_to_phys_nr_ents;
++
++ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
++ machine_to_phys_mapping = (unsigned long *)mapping.v_start; /* use the table location reported by Xen */
++ machine_to_phys_nr_ents = mapping.max_mfn + 1; /* presumably max_mfn is the last valid index - confirm */
++ } else {
++ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; /* hypercall failed: table pointer is left untouched */
++ }
++ machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); /* fls() of the highest entry index */
++}
++
+ #ifdef CONFIG_X86_64
+ static void convert_pfn_mfn(void *v)
+ {
+@@ -1765,6 +1908,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ unsigned long max_pfn)
+ {
+ pmd_t *kernel_pmd;
++ int i;
+
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+ xen_start_info->nr_pt_frames * PAGE_SIZE +
+@@ -1776,6 +1920,20 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+ memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
++
++ /*
++ * When running a 32 bit domain 0 on a 64 bit hypervisor a
++ * pinned L3 (such as the initial pgd here) contains bits
++ * which are reserved in the PAE layout but not in the 64 bit
++ * layout. Unfortunately some versions of the hypervisor
++ * (incorrectly) validate compat mode guests against the PAE
++ * layout and hence will not allow such a pagetable to be
++ * pinned by the guest. Therefore we mask off everything but the
++ * PFN and Present bits of the supplied L3.
++ */
++ for (i = 0; i < PTRS_PER_PGD; i++)
++ swapper_pg_dir[i].pgd &= (PTE_PFN_MASK | _PAGE_PRESENT);
++
+ set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+ __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+@@ -1798,6 +1956,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ }
+ #endif /* CONFIG_X86_64 */
+
++static unsigned char dummy_ioapic_mapping[PAGE_SIZE] __page_aligned_bss;
++
+ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+ {
+ pte_t pte;
+@@ -1827,9 +1987,26 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+ pte = pfn_pte(phys, prot);
+ break;
+
+- default:
++#ifdef CONFIG_X86_IO_APIC
++ case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
++ /*
++ * We just don't map the IO APIC - all access is via
++ * hypercalls. Point the fixmap at a dummy page instead.
++ */
++ pte = __pte(__pa(dummy_ioapic_mapping) | __PAGE_KERNEL);
++ break;
++#endif
++
++ case FIX_PARAVIRT_BOOTMAP:
++ /* This is an MFN, but it isn't an IO mapping from the
++ IO domain */
+ pte = mfn_pte(phys, prot);
+ break;
++
++ default:
++ /* By default, set_fixmap is used for hardware mappings */
++ pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
++ break;
+ }
+
+ __native_set_fixmap(idx, pte);
+@@ -1844,6 +2021,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+ #endif
+ }
+
++__init void xen_ident_map_ISA(void)
++{
++ unsigned long pa;
++
++ /*
++ * If we're dom0, then linear map the ISA machine addresses into
++ * the kernel's address space.
++ */
++ if (!xen_initial_domain())
++ return;
++
++ xen_raw_printk("Xen: setup ISA identity maps\n");
++
++ for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
++ pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
++
++ if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
++ BUG();
++ }
++
++ xen_flush_tlb();
++}
++
+ static __init void xen_post_allocator_init(void)
+ {
+ pv_mmu_ops.set_pte = xen_set_pte;
+@@ -1961,6 +2161,271 @@ void __init xen_init_mmu_ops(void)
+ pv_mmu_ops = xen_mmu_ops;
+ }
+
++/* Protected by xen_reservation_lock. */
++#define MAX_CONTIG_ORDER 9 /* 2MB */
++static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
++
++#define VOID_PTE (mfn_pte(0, __pgprot(0)))
++static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
++ unsigned long *in_frames, /* optional: receives the MFN of each page before it is zapped */
++ unsigned long *out_frames) /* optional: receives the PFN of each page */
++{
++ int i;
++ struct multicall_space mcs;
++
++ xen_mc_batch();
++ for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { /* 2^order pages starting at vaddr */
++ mcs = __xen_mc_entry(0);
++
++ if (in_frames)
++ in_frames[i] = virt_to_mfn(vaddr); /* must be read before the p2m entry is invalidated below */
++
++ MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); /* VOID_PTE is mfn 0 with empty protection bits */
++ set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
++
++ if (out_frames)
++ out_frames[i] = virt_to_pfn(vaddr);
++ }
++ xen_mc_issue(0);
++}
++
++/*
++ * Update the pfn-to-mfn mappings for 2^order pages starting at vaddr,
++ * either to point to an array of mfns (when mfns is non-NULL), or
++ * contiguously from first_mfn. Only the last update carries flush flags.
++ */
++static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
++ unsigned long *mfns,
++ unsigned long first_mfn)
++{
++ unsigned i, limit;
++ unsigned long mfn;
++
++ xen_mc_batch();
++
++ limit = 1u << order;
++ for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
++ struct multicall_space mcs;
++ unsigned flags;
++
++ mcs = __xen_mc_entry(0);
++ if (mfns)
++ mfn = mfns[i];
++ else
++ mfn = first_mfn + i; /* contiguous run: first_mfn, first_mfn + 1, ... */
++
++ if (i < (limit - 1))
++ flags = 0; /* no flush flags until the final page */
++ else {
++ if (order == 0)
++ flags = UVMF_INVLPG | UVMF_ALL; /* single page */
++ else
++ flags = UVMF_TLB_FLUSH | UVMF_ALL; /* multi-page range */
++ }
++
++ MULTI_update_va_mapping(mcs.mc, vaddr,
++ mfn_pte(mfn, PAGE_KERNEL), flags);
++
++ set_phys_to_machine(virt_to_pfn(vaddr), mfn); /* keep the p2m entry in step with the new mapping */
++ }
++
++ xen_mc_issue(0);
++}
++
++/*
++ * Perform the hypercall to exchange a region of our pfns to point to
++ * memory with the required contiguous alignment. Takes the pfns as
++ * input, and populates mfns as output.
++ *
++ * Returns 1 if the hypervisor exchanged all extents_in extents and 0
++ * if it exchanged none; a partial or inconsistent result is a BUG().
++ */
++static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
++ unsigned long *pfns_in,
++ unsigned long extents_out, unsigned int order_out,
++ unsigned long *mfns_out,
++ unsigned int address_bits)
++{
++ long rc;
++ int success;
++
++ struct xen_memory_exchange exchange = {
++ .in = {
++ .nr_extents = extents_in,
++ .extent_order = order_in,
++ .extent_start = pfns_in,
++ .domid = DOMID_SELF
++ },
++ .out = {
++ .nr_extents = extents_out,
++ .extent_order = order_out,
++ .extent_start = mfns_out,
++ .address_bits = address_bits,
++ .domid = DOMID_SELF
++ }
++ };
++
++ BUG_ON(extents_in << order_in != extents_out << order_out); /* both sides must cover the same number of pages */
++
++ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
++ success = (exchange.nr_exchanged == extents_in);
++
++ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); /* failure must be total and must report an error */
++ BUG_ON(success && (rc != 0)); /* success must come with rc == 0 */
++
++ return success;
++}
++
++int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
++ unsigned int address_bits)
++{
++ unsigned long *in_frames = discontig_frames, out_frame;
++ unsigned long flags;
++ int success;
++
++ /*
++ * Currently an auto-translated guest will not perform I/O, nor will
++ * it require PAE page directories below 4GB. Therefore any calls to
++ * this function are redundant and can be ignored.
++ */
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return 0;
++
++ if (unlikely(order > MAX_CONTIG_ORDER))
++ return -ENOMEM; /* discontig_frames only holds 1 << MAX_CONTIG_ORDER entries */
++
++ memset((void *) vstart, 0, PAGE_SIZE << order); /* the region's previous contents are zeroed, not preserved */
++
++ vm_unmap_aliases();
++
++ spin_lock_irqsave(&xen_reservation_lock, flags); /* also guards the shared discontig_frames buffer */
++
++ /* 1. Zap current PTEs, remembering MFNs. */
++ xen_zap_pfn_range(vstart, order, in_frames, NULL);
++
++ /* 2. Get a new contiguous memory extent. */
++ out_frame = virt_to_pfn(vstart);
++ success = xen_exchange_memory(1UL << order, 0, in_frames,
++ 1, order, &out_frame,
++ address_bits);
++
++ /* 3. Map the new extent in place of old pages. */
++ if (success)
++ xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
++ else
++ xen_remap_exchanged_ptes(vstart, order, in_frames, 0); /* exchange failed: map the original MFNs back */
++
++ spin_unlock_irqrestore(&xen_reservation_lock, flags);
++
++ return success ? 0 : -ENOMEM;
++}
++EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
++
++void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
++{
++ unsigned long *out_frames = discontig_frames, in_frame;
++ unsigned long flags;
++ int success;
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return;
++
++ if (unlikely(order > MAX_CONTIG_ORDER))
++ return; /* xen_create_contiguous_region() refuses such orders, so nothing to undo */
++
++ memset((void *) vstart, 0, PAGE_SIZE << order); /* the region's contents are zeroed before the exchange */
++
++ vm_unmap_aliases();
++
++ spin_lock_irqsave(&xen_reservation_lock, flags); /* also guards the shared discontig_frames buffer */
++
++ /* 1. Find start MFN of contiguous extent. */
++ in_frame = virt_to_mfn(vstart);
++
++ /* 2. Zap current PTEs. */
++ xen_zap_pfn_range(vstart, order, NULL, out_frames);
++
++ /* 3. Do the exchange for non-contiguous MFNs. */
++ success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
++ 0, out_frames, 0);
++
++ /* 4. Map new pages in place of old pages. */
++ if (success)
++ xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
++ else
++ xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); /* exchange failed: map the contiguous extent back */
++
++ spin_unlock_irqrestore(&xen_reservation_lock, flags);
++}
++EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
++
++#define REMAP_BATCH_SIZE 16
++
++struct remap_data {
++ unsigned long mfn;
++ pgprot_t prot;
++ struct mmu_update *mmu_update;
++};
++
++static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
++ unsigned long addr, void *data) /* data is the caller's struct remap_data */
++{
++ struct remap_data *rmd = data;
++ pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); /* consumes one frame number per pte visited */
++
++ rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
++ rmd->mmu_update->val = pte_val_ma(pte);
++ rmd->mmu_update++; /* advance to the next slot of the caller's batch array */
++
++ return 0;
++}
++
++int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
++ unsigned long addr,
++ unsigned long mfn, int nr,
++ pgprot_t prot, unsigned domid)
++{
++ struct remap_data rmd;
++ struct mmu_update mmu_update[REMAP_BATCH_SIZE]; /* on-stack buffer, refilled for every batch */
++ int batch;
++ unsigned long range;
++ int err = 0;
++
++ prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); /* every pte written here is flagged _PAGE_IOMAP */
++
++ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
++
++ rmd.mfn = mfn;
++ rmd.prot = prot;
++
++ while (nr) {
++ batch = min(REMAP_BATCH_SIZE, nr); /* at most REMAP_BATCH_SIZE pages per hypercall */
++ range = (unsigned long)batch << PAGE_SHIFT;
++
++ rmd.mmu_update = mmu_update; /* restart filling from slot 0 */
++ err = apply_to_page_range(vma->vm_mm, addr, range,
++ remap_area_mfn_pte_fn, &rmd);
++ if (err)
++ goto out;
++
++ err = -EFAULT; /* returned if the hypercall below fails */
++ if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
++ goto out;
++
++ nr -= batch;
++ addr += range;
++ }
++
++ err = 0;
++out:
++
++ flush_tlb_all(); /* reached on both the success and the error paths */
++
++ return err;
++}
++EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
++
+ #ifdef CONFIG_XEN_DEBUG_FS
+
+ static struct dentry *d_mmu_debug;
+diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
+new file mode 100644
+index 0000000..4d55524
+--- /dev/null
++++ b/arch/x86/xen/pci-swiotlb-xen.c
+@@ -0,0 +1,52 @@
++/* Glue code to lib/swiotlb-xen.c */
++
++#include <linux/dma-mapping.h>
++#include <linux/swiotlb.h>
++
++#include <asm/xen/hypervisor.h>
++
++int xen_swiotlb __read_mostly;
++
++static struct dma_map_ops xen_swiotlb_dma_ops = {
++ .mapping_error = xen_swiotlb_dma_mapping_error,
++ .alloc_coherent = xen_swiotlb_alloc_coherent,
++ .free_coherent = xen_swiotlb_free_coherent,
++ .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
++ .sync_single_for_device = xen_swiotlb_sync_single_for_device,
++ .sync_single_range_for_cpu = xen_swiotlb_sync_single_range_for_cpu,
++ .sync_single_range_for_device = xen_swiotlb_sync_single_range_for_device,
++ .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
++ .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
++ .map_sg = xen_swiotlb_map_sg_attrs,
++ .unmap_sg = xen_swiotlb_unmap_sg_attrs,
++ .map_page = xen_swiotlb_map_page,
++ .unmap_page = xen_swiotlb_unmap_page,
++ .dma_supported = xen_swiotlb_dma_supported,
++};
++
++/*
++ * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
++ *
++ * This returns non-zero if the Xen swiotlb is to be used: we are a PV
++ * domain and are either the initial domain or had swiotlb already set.
++ */
++int __init pci_xen_swiotlb_detect(void)
++{
++
++ if (xen_pv_domain() && (xen_initial_domain() || swiotlb))
++ xen_swiotlb = 1;
++
++ /* If we are running under Xen, we MUST disable the native SWIOTLB */
++ if (xen_pv_domain())
++ swiotlb = 0;
++
++ return xen_swiotlb;
++}
++
++void __init pci_xen_swiotlb_init(void)
++{
++ if (xen_swiotlb) {
++ xen_swiotlb_init(1);
++ dma_ops = &xen_swiotlb_dma_ops;
++ }
++}
+diff --git a/arch/x86/xen/pci.c b/arch/x86/xen/pci.c
+new file mode 100644
+index 0000000..f999ad8
+--- /dev/null
++++ b/arch/x86/xen/pci.c
+@@ -0,0 +1,117 @@
++#include <linux/kernel.h>
++#include <linux/acpi.h>
++#include <linux/pci.h>
++#include <linux/msi.h>
++
++#include <asm/mpspec.h>
++#include <asm/io_apic.h>
++#include <asm/pci_x86.h>
++
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/pci.h>
++
++#include <xen/interface/xen.h>
++#include <xen/events.h>
++
++#include "xen-ops.h"
++
++int xen_register_gsi(u32 gsi, int triggering, int polarity)
++{
++ int rc, irq;
++ struct physdev_setup_gsi setup_gsi;
++ struct physdev_map_pirq map_irq;
++ int shareable = 0;
++ char *name;
++
++ if (!xen_domain())
++ return -1; /* not running under Xen: nothing to register */
++
++ printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
++ gsi, triggering, polarity);
++
++ if (triggering == ACPI_EDGE_SENSITIVE) {
++ shareable = 0; /* edge-triggered: allocated as non-shareable */
++ name = "ioapic-edge";
++ } else {
++ shareable = 1; /* anything else is treated as level-triggered and shareable */
++ name = "ioapic-level";
++ }
++
++ irq = xen_allocate_pirq(gsi, shareable, name);
++
++ printk(KERN_DEBUG "xen: --> irq=%d\n", irq);
++
++ if (irq >= 0) { /* a negative irq from the allocator is returned unchanged */
++ setup_gsi.gsi = gsi;
++ setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ?
++ 0 : 1); /* 0 = edge, 1 = level */
++ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1); /* 0 = active high, 1 = otherwise */
++
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
++ if (rc == -EEXIST) /* already configured: informational only, carry on */
++ printk(KERN_INFO "Already setup the GSI :%u\n", gsi);
++ else if (rc) {
++ printk(KERN_ERR "Failed to setup GSI :%u, err_code:%d\n",
++ gsi, rc); /* gsi is u32, hence %u as in the debug message above */
++ BUG();
++ }
++
++ map_irq.domid = DOMID_SELF;
++ map_irq.type = MAP_PIRQ_TYPE_GSI;
++ map_irq.index = gsi;
++ map_irq.pirq = irq;
++
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
++ if (rc) {
++ printk(KERN_WARNING "xen map irq failed %d\n", rc);
++ irq = -1; /* mapping failed: report failure to the caller */
++ }
++ }
++ return irq;
++}
++
++void __init xen_setup_pirqs(void)
++{
++ int irq;
++
++ if (0 == nr_ioapics) { /* no IO APICs: just allocate the legacy irqs as non-shareable "xt-pic" */
++ for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
++ xen_allocate_pirq(irq, 0, "xt-pic");
++ return;
++ }
++
++ /* Pre-allocate legacy irqs */
++ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
++ int trigger, polarity;
++
++ if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
++ continue; /* lookup failed: this irq is skipped, not registered */
++
++ xen_register_gsi(irq,
++ trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE, /* non-zero trigger = level */
++ polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH); /* non-zero polarity = active low */
++ }
++}
++
++#ifdef CONFIG_PCI_MSI
++int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) /* nvec is not used in this body */
++{
++ int irq, ret;
++ struct msi_desc *msidesc;
++
++ list_for_each_entry(msidesc, &dev->msi_list, list) { /* one irq per descriptor on the device's list */
++ irq = xen_create_msi_irq(dev, msidesc, type);
++ if (irq < 0)
++ return -1; /* NOTE(review): irqs from earlier iterations are not torn down here - confirm the caller does */
++
++ ret = set_irq_msi(irq, msidesc);
++ if (ret)
++ goto error;
++ }
++ return 0;
++
++error:
++ xen_destroy_irq(irq); /* destroys only the irq from the failing iteration */
++ return ret;
++}
++#endif
+diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
+index ad0047f..266c86a 100644
+--- a/arch/x86/xen/setup.c
++++ b/arch/x86/xen/setup.c
+@@ -10,6 +10,7 @@
+ #include <linux/pm.h>
+
+ #include <asm/elf.h>
++#include <asm/hpet.h>
+ #include <asm/vdso.h>
+ #include <asm/e820.h>
+ #include <asm/setup.h>
+@@ -19,6 +20,7 @@
+
+ #include <xen/page.h>
+ #include <xen/interface/callback.h>
++#include <xen/interface/memory.h>
+ #include <xen/interface/physdev.h>
+ #include <xen/features.h>
+
+@@ -36,21 +38,60 @@ extern void xen_syscall32_target(void);
+ /**
+ * machine_specific_memory_setup - Hook for machine specific memory setup.
+ **/
+-
+ char * __init xen_memory_setup(void)
+ {
++ static __initdata struct e820entry map[E820MAX];
++
+ unsigned long max_pfn = xen_start_info->nr_pages;
++ struct xen_memory_map memmap;
++ unsigned long long mem_end;
++ int op;
++ int rc;
++ int i;
+
+ max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
++ mem_end = PFN_PHYS((u64)max_pfn);
++
++ memmap.nr_entries = E820MAX;
++ set_xen_guest_handle(memmap.buffer, map);
++
++ op = xen_initial_domain() ?
++ XENMEM_machine_memory_map :
++ XENMEM_memory_map;
++ rc = HYPERVISOR_memory_op(op, &memmap);
++ if (rc == -ENOSYS) {
++ memmap.nr_entries = 1;
++ map[0].addr = 0ULL;
++ map[0].size = mem_end;
++ /* 8MB slack (to balance backend allocations). */
++ map[0].size += 8ULL << 20;
++ map[0].type = E820_RAM;
++ rc = 0;
++ }
++ BUG_ON(rc);
+
+ e820.nr_map = 0;
+-
+- e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
++ for (i = 0; i < memmap.nr_entries; i++) {
++ unsigned long long end = map[i].addr + map[i].size;
++ if (map[i].type == E820_RAM) {
++ if (map[i].addr > mem_end)
++ continue;
++ if (end > mem_end) {
++ /* Truncate region to mem_end. */
++ map[i].size -= end - mem_end;
++ }
++ }
++ if (map[i].size > 0)
++ e820_add_region(map[i].addr, map[i].size, map[i].type);
++ }
+
+ /*
+ * Even though this is normal, usable memory under Xen, reserve
+ * ISA memory anyway because too many things think they can poke
+ * about in there.
++ *
++ * In a dom0 kernel, this region is identity mapped with the
++ * hardware ISA area, so it really is out of bounds.
+ */
+ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
+ E820_RESERVED);
+@@ -182,13 +223,17 @@ void __init xen_arch_setup(void)
+ }
+ #endif
+
++ /*
++ * The Xen hypervisor uses the HPET to wake up CPUs from deep C-states,
++ * so the HPET usage in dom0 must be forbidden.
++ */
++ disable_hpet(NULL);
++
+ memcpy(boot_command_line, xen_start_info->cmd_line,
+ MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+ COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+ pm_idle = xen_idle;
+
+- paravirt_disable_iospace();
+-
+ fiddle_vdso();
+ }
+diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
+index 360f8d8..632ea35 100644
+--- a/arch/x86/xen/smp.c
++++ b/arch/x86/xen/smp.c
+@@ -178,11 +178,18 @@ static void __init xen_smp_prepare_boot_cpu(void)
+ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+ {
+ unsigned cpu;
++ unsigned int i;
+
+ xen_init_lock_cpu(0);
+
+ smp_store_cpu_info(0);
+ cpu_data(0).x86_max_cores = 1;
++
++ for_each_possible_cpu(i) {
++ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
++ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
++ zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
++ }
+ set_cpu_sibling_map(0);
+
+ if (xen_smp_intr_init(0))
+@@ -299,6 +306,8 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
+ xen_setup_timer(cpu);
+ xen_init_lock_cpu(cpu);
+
++ cpumask_set_cpu(cpu, cpu_callout_mask);
++
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+
+ /* make sure interrupts start blocked */
+diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
+new file mode 100644
+index 0000000..1cd7f4d
+--- /dev/null
++++ b/arch/x86/xen/vga.c
+@@ -0,0 +1,67 @@
++#include <linux/screen_info.h>
++#include <linux/init.h>
++
++#include <asm/bootparam.h>
++#include <asm/setup.h>
++
++#include <xen/interface/xen.h>
++
++#include "xen-ops.h"
++
++void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
++{
++ struct screen_info *screen_info = &boot_params.screen_info;
++ /* Fill boot_params.screen_info from info; union fields not covered by size are not read. */
++ /* Defaults, kept unless overridden below. They are drawn from a dump
++ * from vgacon:startup in standard Linux. */
++ screen_info->orig_video_mode = 3;
++ screen_info->orig_video_isVGA = 1;
++ screen_info->orig_video_lines = 25;
++ screen_info->orig_video_cols = 80;
++ screen_info->orig_video_ega_bx = 3;
++ screen_info->orig_video_points = 16;
++ screen_info->orig_y = screen_info->orig_video_lines - 1;
++
++ switch (info->video_type) {
++ case XEN_VGATYPE_TEXT_MODE_3:
++ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
++ + sizeof(info->u.text_mode_3))
++ break; /* info too short for text_mode_3: keep the defaults */
++ screen_info->orig_video_lines = info->u.text_mode_3.rows;
++ screen_info->orig_video_cols = info->u.text_mode_3.columns;
++ screen_info->orig_x = info->u.text_mode_3.cursor_x;
++ screen_info->orig_y = info->u.text_mode_3.cursor_y;
++ screen_info->orig_video_points =
++ info->u.text_mode_3.font_height;
++ break;
++
++ case XEN_VGATYPE_VESA_LFB:
++ if (size < offsetof(struct dom0_vga_console_info,
++ u.vesa_lfb.gbl_caps))
++ break; /* fields before gbl_caps are mandatory: keep the defaults */
++ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
++ screen_info->lfb_width = info->u.vesa_lfb.width;
++ screen_info->lfb_height = info->u.vesa_lfb.height;
++ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
++ screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
++ screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
++ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
++ screen_info->red_size = info->u.vesa_lfb.red_size;
++ screen_info->red_pos = info->u.vesa_lfb.red_pos;
++ screen_info->green_size = info->u.vesa_lfb.green_size;
++ screen_info->green_pos = info->u.vesa_lfb.green_pos;
++ screen_info->blue_size = info->u.vesa_lfb.blue_size;
++ screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
++ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
++ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
++ if (size >= offsetof(struct dom0_vga_console_info,
++ u.vesa_lfb.gbl_caps)
++ + sizeof(info->u.vesa_lfb.gbl_caps))
++ screen_info->capabilities = info->u.vesa_lfb.gbl_caps; /* optional field */
++ if (size >= offsetof(struct dom0_vga_console_info,
++ u.vesa_lfb.mode_attrs)
++ + sizeof(info->u.vesa_lfb.mode_attrs))
++ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs; /* optional field */
++ break;
++ }
++}
+diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
+index f9153a3..5afc1fe 100644
+--- a/arch/x86/xen/xen-ops.h
++++ b/arch/x86/xen/xen-ops.h
+@@ -30,6 +30,7 @@ void xen_setup_machphys_mapping(void);
+ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+ void xen_ident_map_ISA(void);
+ void xen_reserve_top(void);
++void xen_ident_map_ISA(void);
+
+ char * __init xen_memory_setup(void);
+ void __init xen_arch_setup(void);
+@@ -82,6 +83,23 @@ static inline void xen_uninit_lock_cpu(int cpu)
+ }
+ #endif
+
++struct dom0_vga_console_info;
++
++#ifdef CONFIG_XEN_DOM0
++void xen_init_vga(const struct dom0_vga_console_info *, size_t size);
++#else
++static inline void xen_init_vga(const struct dom0_vga_console_info *info,
++ size_t size)
++{
++}
++#endif
++
++#ifdef CONFIG_XEN_DOM0
++void xen_init_apic(void);
++#else
++static inline void xen_init_apic(void) {}
++#endif
++
+ /* Declare an asm function, along with symbols needed to make it
+ inlineable */
+ #define DECL_ASM(ret, name, ...) \
+diff --git a/block/blk-core.c b/block/blk-core.c
+index 71da511..32d305c 100644
+--- a/block/blk-core.c
++++ b/block/blk-core.c
+@@ -439,6 +439,7 @@ void blk_put_queue(struct request_queue *q)
+ {
+ kobject_put(&q->kobj);
+ }
++EXPORT_SYMBOL_GPL(blk_put_queue);
+
+ void blk_cleanup_queue(struct request_queue *q)
+ {
+@@ -612,6 +613,7 @@ int blk_get_queue(struct request_queue *q)
+
+ return 1;
+ }
++EXPORT_SYMBOL_GPL(blk_get_queue);
+
+ static inline void blk_free_request(struct request_queue *q, struct request *rq)
+ {
+diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
+index 1d886e0..f4a2b10 100644
+--- a/drivers/block/Kconfig
++++ b/drivers/block/Kconfig
+@@ -462,6 +462,7 @@ config XEN_BLKDEV_FRONTEND
+ tristate "Xen virtual block device support"
+ depends on XEN
+ default y
++ select XEN_XENBUS_FRONTEND
+ help
+ This driver implements the front-end of the Xen virtual
+ block device driver. It communicates with a back-end driver
+diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
+index b8578bb..feec425 100644
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
+@@ -42,6 +42,7 @@
+ #include <linux/module.h>
+ #include <linux/scatterlist.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/grant_table.h>
+ #include <xen/events.h>
+@@ -102,6 +103,10 @@ struct blkfront_info
+
+ static DEFINE_SPINLOCK(blkif_io_lock);
+
++static unsigned int nr_minors;
++static unsigned long *minors;
++static DEFINE_SPINLOCK(minor_lock);
++
+ #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
+ (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
+ #define GRANT_INVALID_REF 0
+@@ -136,6 +141,55 @@ static void add_id_to_freelist(struct blkfront_info *info,
+ info->shadow_free = id;
+ }
+
++static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
++{
++ unsigned int end = minor + nr;
++ int rc;
++ /* Reserve minors [minor, minor + nr) in the minors bitmap; 0, -ENOMEM or -EBUSY. */
++ if (end > nr_minors) {
++ unsigned long *bitmap, *old;
++ /* Bitmap too small: allocate a larger one before taking minor_lock. */
++ bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
++ GFP_KERNEL);
++ if (bitmap == NULL)
++ return -ENOMEM;
++ /* Re-check under the lock; if no longer needed, the new bitmap is the one freed. */
++ spin_lock(&minor_lock);
++ if (end > nr_minors) {
++ old = minors;
++ memcpy(bitmap, minors,
++ BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
++ minors = bitmap;
++ nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
++ } else
++ old = bitmap;
++ spin_unlock(&minor_lock);
++ kfree(old);
++ }
++ /* Claim the range only if no bit in it is already set. */
++ spin_lock(&minor_lock);
++ if (find_next_bit(minors, end, minor) >= end) {
++ for (; minor < end; ++minor)
++ __set_bit(minor, minors);
++ rc = 0;
++ } else
++ rc = -EBUSY;
++ spin_unlock(&minor_lock);
++
++ return rc;
++}
++
++static void xlbd_release_minors(unsigned int minor, unsigned int nr)
++{
++ unsigned int end = minor + nr;
++ /* Clear bits [minor, minor + nr) in the minors bitmap; BUG()s if the range exceeds it. */
++ BUG_ON(end > nr_minors);
++ spin_lock(&minor_lock);
++ for (; minor < end; ++minor)
++ __clear_bit(minor, minors);
++ spin_unlock(&minor_lock);
++}
++
+ static void blkif_restart_queue_callback(void *arg)
+ {
+ struct blkfront_info *info = (struct blkfront_info *)arg;
+@@ -416,9 +470,14 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+ if ((minor % nr_parts) == 0)
+ nr_minors = nr_parts;
+
++ err = xlbd_reserve_minors(minor, nr_minors);
++ if (err)
++ goto out;
++ err = -ENODEV;
++
+ gd = alloc_disk(nr_minors);
+ if (gd == NULL)
+- goto out;
++ goto release;
+
+ offset = minor / nr_parts;
+
+@@ -449,7 +508,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+
+ if (xlvbd_init_blk_queue(gd, sector_size)) {
+ del_gendisk(gd);
+- goto out;
++ goto release;
+ }
+
+ info->rq = gd->queue;
+@@ -469,6 +528,8 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+
+ return 0;
+
++ release:
++ xlbd_release_minors(minor, nr_minors);
+ out:
+ return err;
+ }
+@@ -650,7 +711,7 @@ fail:
+
+
+ /* Common code used when first setting up, and when resuming. */
+-static int talk_to_backend(struct xenbus_device *dev,
++static int talk_to_blkback(struct xenbus_device *dev,
+ struct blkfront_info *info)
+ {
+ const char *message = NULL;
+@@ -755,7 +816,7 @@ static int blkfront_probe(struct xenbus_device *dev,
+ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
+ dev_set_drvdata(&dev->dev, info);
+
+- err = talk_to_backend(dev, info);
++ err = talk_to_blkback(dev, info);
+ if (err) {
+ kfree(info);
+ dev_set_drvdata(&dev->dev, NULL);
+@@ -850,7 +911,7 @@ static int blkfront_resume(struct xenbus_device *dev)
+
+ blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
+
+- err = talk_to_backend(dev, info);
++ err = talk_to_blkback(dev, info);
+ if (info->connected == BLKIF_STATE_SUSPENDED && !err)
+ err = blkif_recover(info);
+
+@@ -923,6 +984,7 @@ static void blkfront_connect(struct blkfront_info *info)
+ static void blkfront_closing(struct xenbus_device *dev)
+ {
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
++ unsigned int minor, nr_minors;
+ unsigned long flags;
+
+ dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
+@@ -945,7 +1007,10 @@ static void blkfront_closing(struct xenbus_device *dev)
+ blk_cleanup_queue(info->rq);
+ info->rq = NULL;
+
++ minor = info->gd->first_minor;
++ nr_minors = info->gd->minors;
+ del_gendisk(info->gd);
++ xlbd_release_minors(minor, nr_minors);
+
+ out:
+ xenbus_frontend_closed(dev);
+@@ -954,13 +1019,13 @@ static void blkfront_closing(struct xenbus_device *dev)
+ /**
+ * Callback received when the backend's state changes.
+ */
+-static void backend_changed(struct xenbus_device *dev,
++static void blkback_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+ {
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+ struct block_device *bd;
+
+- dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
++ dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
+
+ switch (backend_state) {
+ case XenbusStateInitialising:
+@@ -1003,7 +1068,10 @@ static int blkfront_remove(struct xenbus_device *dev)
+
+ blkif_free(info, 0);
+
+- kfree(info);
++ if(info->users == 0)
++ kfree(info);
++ else
++ info->is_ready = -1;
+
+ return 0;
+ }
+@@ -1012,12 +1080,15 @@ static int blkfront_is_ready(struct xenbus_device *dev)
+ {
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+
+- return info->is_ready;
++ return info->is_ready > 0;
+ }
+
+ static int blkif_open(struct block_device *bdev, fmode_t mode)
+ {
+ struct blkfront_info *info = bdev->bd_disk->private_data;
++
++ if(info->is_ready < 0)
++ return -ENODEV;
+ info->users++;
+ return 0;
+ }
+@@ -1033,7 +1104,10 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
+ struct xenbus_device *dev = info->xbdev;
+ enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
+
+- if (state == XenbusStateClosing && info->is_ready)
++ if(info->is_ready < 0) {
++ blkfront_closing(dev);
++ kfree(info);
++ } else if (state == XenbusStateClosing && info->is_ready)
+ blkfront_closing(dev);
+ }
+ return 0;
+@@ -1061,7 +1135,7 @@ static struct xenbus_driver blkfront = {
+ .probe = blkfront_probe,
+ .remove = blkfront_remove,
+ .resume = blkfront_resume,
+- .otherend_changed = backend_changed,
++ .otherend_changed = blkback_changed,
+ .is_ready = blkfront_is_ready,
+ };
+
+diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
+index 4dcfef0..9bca04e 100644
+--- a/drivers/char/agp/intel-agp.c
++++ b/drivers/char/agp/intel-agp.c
+@@ -15,8 +15,12 @@
+ * an Intel IOMMU. So make the correct use of the PCI DMA API contingent
+ * on the Intel IOMMU support (CONFIG_DMAR).
+ * Only newer chipsets need to bother with this, of course.
++ *
++ * Xen guests accessing graphics hardware also need proper translation
++ * between pseudo-physical addresses and real machine addresses, which
++ * is also achieved by using the DMA API.
+ */
+-#ifdef CONFIG_DMAR
++#if defined(CONFIG_DMAR) || defined(CONFIG_XEN)
+ #define USE_PCI_DMA_API 1
+ #endif
+
+diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
+index a6ee32b..5be0dd3 100644
+--- a/drivers/char/hvc_xen.c
++++ b/drivers/char/hvc_xen.c
+@@ -25,6 +25,8 @@
+ #include <linux/types.h>
+
+ #include <asm/xen/hypervisor.h>
++
++#include <xen/xen.h>
+ #include <xen/page.h>
+ #include <xen/events.h>
+ #include <xen/interface/io/console.h>
+@@ -76,7 +78,7 @@ static int __write_console(const char *data, int len)
+ return sent;
+ }
+
+-static int write_console(uint32_t vtermno, const char *data, int len)
++static int domU_write_console(uint32_t vtermno, const char *data, int len)
+ {
+ int ret = len;
+
+@@ -99,7 +101,7 @@ static int write_console(uint32_t vtermno, const char *data, int len)
+ return ret;
+ }
+
+-static int read_console(uint32_t vtermno, char *buf, int len)
++static int domU_read_console(uint32_t vtermno, char *buf, int len)
+ {
+ struct xencons_interface *intf = xencons_interface();
+ XENCONS_RING_IDX cons, prod;
+@@ -120,28 +122,63 @@ static int read_console(uint32_t vtermno, char *buf, int len)
+ return recv;
+ }
+
+-static struct hv_ops hvc_ops = {
+- .get_chars = read_console,
+- .put_chars = write_console,
++static struct hv_ops domU_hvc_ops = {
++ .get_chars = domU_read_console,
++ .put_chars = domU_write_console,
++ .notifier_add = notifier_add_irq,
++ .notifier_del = notifier_del_irq,
++ .notifier_hangup = notifier_hangup_irq,
++};
++
++static int dom0_read_console(uint32_t vtermno, char *buf, int len)
++{ /* vtermno is unused; the CONSOLEIO_read hypercall result is returned unchanged. */
++ return HYPERVISOR_console_io(CONSOLEIO_read, len, buf);
++}
++
++/*
++ * Write via the CONSOLEIO_write hypercall: either for a dom0 to write to the
++ * system console, or a domU with a debug version of Xen. Returns len, or 0
++ * if the hypercall returned an error. */
++static int dom0_write_console(uint32_t vtermno, const char *str, int len)
++{
++ int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
++ if (rc < 0)
++ return 0;
++
++ return len;
++}
++
++static struct hv_ops dom0_hvc_ops = {
++ .get_chars = dom0_read_console,
++ .put_chars = dom0_write_console,
+ .notifier_add = notifier_add_irq,
+ .notifier_del = notifier_del_irq,
+ .notifier_hangup = notifier_hangup_irq,
+ };
+
+-static int __init xen_init(void)
++static int __init xen_hvc_init(void)
+ {
+ struct hvc_struct *hp;
++ struct hv_ops *ops;
+
+- if (!xen_pv_domain() ||
+- xen_initial_domain() ||
+- !xen_start_info->console.domU.evtchn)
++ if (!xen_pv_domain())
+ return -ENODEV;
+
+- xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
++ if (xen_initial_domain()) {
++ ops = &dom0_hvc_ops;
++ xencons_irq = bind_virq_to_irq(VIRQ_CONSOLE, 0);
++ } else {
++ if (!xen_start_info->console.domU.evtchn)
++ return -ENODEV;
++
++ ops = &domU_hvc_ops;
++ xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
++ }
++
+ if (xencons_irq < 0)
+ xencons_irq = 0; /* NO_IRQ */
+
+- hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
++ hp = hvc_alloc(HVC_COOKIE, xencons_irq, ops, 256);
+ if (IS_ERR(hp))
+ return PTR_ERR(hp);
+
+@@ -158,7 +195,7 @@ void xen_console_resume(void)
+ rebind_evtchn_irq(xen_start_info->console.domU.evtchn, xencons_irq);
+ }
+
+-static void __exit xen_fini(void)
++static void __exit xen_hvc_fini(void)
+ {
+ if (hvc)
+ hvc_remove(hvc);
+@@ -166,29 +203,24 @@ static void __exit xen_fini(void)
+
+ static int xen_cons_init(void)
+ {
++ struct hv_ops *ops;
++
+ if (!xen_pv_domain())
+ return 0;
+
+- hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
++ ops = &domU_hvc_ops;
++ if (xen_initial_domain())
++ ops = &dom0_hvc_ops;
++
++ hvc_instantiate(HVC_COOKIE, 0, ops);
++
+ return 0;
+ }
+
+-module_init(xen_init);
+-module_exit(xen_fini);
++module_init(xen_hvc_init);
++module_exit(xen_hvc_fini);
+ console_initcall(xen_cons_init);
+
+-static void raw_console_write(const char *str, int len)
+-{
+- while(len > 0) {
+- int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
+- if (rc <= 0)
+- break;
+-
+- str += rc;
+- len -= rc;
+- }
+-}
+-
+ #ifdef CONFIG_EARLY_PRINTK
+ static void xenboot_write_console(struct console *console, const char *string,
+ unsigned len)
+@@ -196,19 +228,22 @@ static void xenboot_write_console(struct console *console, const char *string,
+ unsigned int linelen, off = 0;
+ const char *pos;
+
+- raw_console_write(string, len);
++ dom0_write_console(0, string, len);
++
++ if (xen_initial_domain())
++ return;
+
+- write_console(0, "(early) ", 8);
++ domU_write_console(0, "(early) ", 8);
+ while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
+ linelen = pos-string+off;
+ if (off + linelen > len)
+ break;
+- write_console(0, string+off, linelen);
+- write_console(0, "\r\n", 2);
++ domU_write_console(0, string+off, linelen);
++ domU_write_console(0, "\r\n", 2);
+ off += linelen + 1;
+ }
+ if (off < len)
+- write_console(0, string+off, len-off);
++ domU_write_console(0, string+off, len-off);
+ }
+
+ struct console xenboot_console = {
+@@ -220,7 +255,7 @@ struct console xenboot_console = {
+
+ void xen_raw_console_write(const char *str)
+ {
+- raw_console_write(str, strlen(str));
++ dom0_write_console(0, str, strlen(str));
+ }
+
+ void xen_raw_printk(const char *fmt, ...)
+diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c
+index b115726..c721c0a 100644
+--- a/drivers/input/xen-kbdfront.c
++++ b/drivers/input/xen-kbdfront.c
+@@ -21,7 +21,10 @@
+ #include <linux/errno.h>
+ #include <linux/module.h>
+ #include <linux/input.h>
++
+ #include <asm/xen/hypervisor.h>
++
++#include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+ #include <xen/interface/io/fbif.h>
+diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
+index b2f71f7..b7feb84 100644
+--- a/drivers/net/Kconfig
++++ b/drivers/net/Kconfig
+@@ -2787,6 +2787,7 @@ source "drivers/s390/net/Kconfig"
+ config XEN_NETDEV_FRONTEND
+ tristate "Xen network device frontend driver"
+ depends on XEN
++ select XEN_XENBUS_FRONTEND
+ default y
+ help
+ The network device frontend driver allows the kernel to
+diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
+index baa051d..87d7121 100644
+--- a/drivers/net/xen-netfront.c
++++ b/drivers/net/xen-netfront.c
+@@ -42,6 +42,7 @@
+ #include <linux/mm.h>
+ #include <net/ip.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+@@ -1393,7 +1394,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
+ }
+
+ /* Common code used when first setting up, and when resuming. */
+-static int talk_to_backend(struct xenbus_device *dev,
++static int talk_to_netback(struct xenbus_device *dev,
+ struct netfront_info *info)
+ {
+ const char *message;
+@@ -1543,7 +1544,7 @@ static int xennet_connect(struct net_device *dev)
+ return -ENODEV;
+ }
+
+- err = talk_to_backend(np->xbdev, np);
++ err = talk_to_netback(np->xbdev, np);
+ if (err)
+ return err;
+
+@@ -1597,7 +1598,7 @@ static int xennet_connect(struct net_device *dev)
+ /**
+ * Callback received when the backend's state changes.
+ */
+-static void backend_changed(struct xenbus_device *dev,
++static void netback_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+ {
+ struct netfront_info *np = dev_get_drvdata(&dev->dev);
+@@ -1798,7 +1799,7 @@ static struct xenbus_driver netfront_driver = {
+ .probe = netfront_probe,
+ .remove = __devexit_p(xennet_remove),
+ .resume = netfront_resume,
+- .otherend_changed = backend_changed,
++ .otherend_changed = netback_changed,
+ };
+
+ static int __init netif_init(void)
+diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
+index 4a7f11d..ae3e98f 100644
+--- a/drivers/pci/Makefile
++++ b/drivers/pci/Makefile
+@@ -31,6 +31,8 @@ obj-$(CONFIG_HT_IRQ) += htirq.o
+ # Build Intel IOMMU support
+ obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
+
++# Build Xen IOMMU support
++obj-$(CONFIG_PCI_XEN) += xen-iommu.o
+ obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
+
+ obj-$(CONFIG_PCI_IOV) += iov.o
+diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
+index 5753036..8e6e6d1 100644
+--- a/drivers/pci/dmar.c
++++ b/drivers/pci/dmar.c
+@@ -673,10 +673,13 @@ void __init detect_intel_iommu(void)
+ "x2apic and Intr-remapping.\n");
+ #endif
+ #ifdef CONFIG_DMAR
+- if (ret && !no_iommu && !iommu_detected && !swiotlb &&
+- !dmar_disabled)
++ if (ret && !no_iommu && !iommu_detected && !dmar_disabled)
+ iommu_detected = 1;
+ #endif
++#ifdef CONFIG_X86
++ if (ret)
++ x86_init.iommu.iommu_init = intel_iommu_init;
++#endif
+ }
+ early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
+ dmar_tbl = NULL;
+diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
+index 2498602..fd89530 100644
+--- a/drivers/pci/intel-iommu.c
++++ b/drivers/pci/intel-iommu.c
+@@ -3282,7 +3282,7 @@ int __init intel_iommu_init(void)
+ * Check the need for DMA-remapping initialization now.
+ * Above initialization will also be used by Interrupt-remapping.
+ */
+- if (no_iommu || swiotlb || dmar_disabled)
++ if (no_iommu || dmar_disabled)
+ return -ENODEV;
+
+ iommu_init_mempool();
+@@ -3303,7 +3303,9 @@ int __init intel_iommu_init(void)
+ "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
+
+ init_timer(&unmap_timer);
+- force_iommu = 1;
++#ifdef CONFIG_SWIOTLB
++ swiotlb = 0;
++#endif
+ dma_ops = &intel_dma_ops;
+
+ init_iommu_sysfs();
+diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
+index f9cf317..80b9756 100644
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -19,6 +19,8 @@
+ #include <linux/errno.h>
+ #include <linux/io.h>
+
++#include <asm/xen/hypervisor.h>
++
+ #include "pci.h"
+ #include "msi.h"
+
+@@ -268,7 +270,8 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
+ {
+ struct irq_desc *desc = irq_to_desc(irq);
+
+- write_msi_msg_desc(desc, msg);
++ if (!xen_initial_domain())
++ write_msi_msg_desc(desc, msg);
+ }
+
+ static void free_msi_irqs(struct pci_dev *dev)
+diff --git a/drivers/pci/xen-iommu.c b/drivers/pci/xen-iommu.c
+new file mode 100644
+index 0000000..ac6bcdb
+--- /dev/null
++++ b/drivers/pci/xen-iommu.c
+@@ -0,0 +1,271 @@
++#include <linux/types.h>
++#include <linux/mm.h>
++#include <linux/string.h>
++#include <linux/pci.h>
++#include <linux/module.h>
++#include <linux/version.h>
++#include <linux/scatterlist.h>
++#include <linux/io.h>
++#include <linux/bug.h>
++
++#include <xen/interface/xen.h>
++#include <xen/grant_table.h>
++#include <xen/page.h>
++#include <xen/xen-ops.h>
++
++#include <asm/iommu.h>
++#include <asm/swiotlb.h>
++#include <asm/tlbflush.h>
++
++#define IOMMU_BUG_ON(test) \
++do { /* if test is true: log a hint to boot with swiotlb=force, then BUG() */ \
++ if (unlikely(test)) { \
++ printk(KERN_ALERT "Fatal DMA error! " \
++ "Please use 'swiotlb=force'\n"); \
++ BUG(); \
++ } \
++} while (0)
++
++/* Print msg (a string literal) followed by the range [addr - addr + size] */
++#define PAR(msg, addr, size) \
++do { \
++ printk(msg "[%#llx - %#llx]\n", \
++ (unsigned long long)(addr), \
++ (unsigned long long)(addr) + (size)); \
++} while (0)
++
++/* Return nonzero (and log it) if addr has bits outside the device's DMA mask */
++static inline int address_needs_mapping(struct device *hwdev, dma_addr_t addr)
++{
++ dma_addr_t mask = DMA_BIT_MASK(32);
++ int ret;
++
++ /* Use the device's mask if it has one; otherwise default to 32 bits */
++ if (hwdev && hwdev->dma_mask)
++ mask = *hwdev->dma_mask;
++
++ ret = (addr & ~mask) != 0;
++ if (ret) {
++ printk(KERN_ERR "dma address needs mapping\n");
++ printk(KERN_ERR "mask: %#llx\n address: [%#llx]\n",
++ (unsigned long long)mask, (unsigned long long)addr);
++ }
++ return ret;
++}
++
++static int check_pages_physically_contiguous(unsigned long pfn,
++ unsigned int offset,
++ size_t length)
++{
++ unsigned long next_mfn;
++ int i;
++ int nr_pages;
++ /* Return 1 if the pages covering offset + length bytes from pfn have consecutive mfns. */
++ next_mfn = pfn_to_mfn(pfn);
++ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
++ /* Each following pfn must translate to the previous mfn + 1. */
++ for (i = 1; i < nr_pages; i++) {
++ if (pfn_to_mfn(++pfn) != ++next_mfn)
++ return 0;
++ }
++ return 1;
++}
++
++static int range_straddles_page_boundary(phys_addr_t p, size_t size)
++{
++ unsigned long pfn = PFN_DOWN(p);
++ unsigned int offset = p & ~PAGE_MASK;
++ /* Return 1 only if [p, p + size) leaves its first page and the pages are not mfn-contiguous. */
++ if (offset + size <= PAGE_SIZE)
++ return 0;
++ if (check_pages_physically_contiguous(pfn, offset, size))
++ return 0;
++ return 1;
++}
++
++static inline void xen_dma_unmap_page(struct page *page)
++{
++ /* Currently a no-op. Xen TODO: 2.6.18 xen calls
++ * __gnttab_dma_unmap_page here to deal with foreign pages.
++ * We'll need similar logic here at some point.
++ */
++}
++
++/* Return the DMA address of a page: its mfn shifted left by PAGE_SHIFT */
++static inline dma_addr_t xen_dma_map_page(struct page *page)
++{
++ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_map_page here to deal
++ * with foreign pages. We'll need similar logic here at some
++ * point.
++ */
++ return ((dma_addr_t)pfn_to_mfn(page_to_pfn(page))) << PAGE_SHIFT;
++}
++
++/* Fill in dma_address/dma_length of each sg entry; always returns nents. */
++static int xen_map_sg(struct device *hwdev, struct scatterlist *sg,
++ int nents,
++ enum dma_data_direction direction,
++ struct dma_attrs *attrs)
++{
++ struct scatterlist *s;
++ struct page *page;
++ int i;
++
++ BUG_ON(direction == DMA_NONE);
++ WARN_ON(nents == 0 || sg[0].length == 0);
++
++ for_each_sg(sg, s, nents, i) {
++ BUG_ON(!sg_page(s));
++ page = sg_page(s);
++ s->dma_address = xen_dma_map_page(page) + s->offset;
++ s->dma_length = s->length;
++ /* The data starts s->offset bytes into the page: check from there. */
++ IOMMU_BUG_ON(range_straddles_page_boundary(
++ page_to_phys(page) + s->offset, s->length));
++ }
++
++ flush_write_buffers();
++ return nents;
++}
++
++static void xen_unmap_sg(struct device *hwdev, struct scatterlist *sg,
++ int nents,
++ enum dma_data_direction direction,
++ struct dma_attrs *attrs)
++{
++ struct scatterlist *s;
++ struct page *page;
++ int i;
++ /* Translate each entry's dma_address back to its page and call xen_dma_unmap_page(). */
++ for_each_sg(sg, s, nents, i) {
++ page = pfn_to_page(mfn_to_pfn(PFN_DOWN(s->dma_address)));
++ xen_dma_unmap_page(page);
++ }
++}
++
++static void *xen_alloc_coherent(struct device *dev, size_t size,
++ dma_addr_t *dma_handle, gfp_t gfp)
++{
++ void *ret;
++ unsigned int order = get_order(size);
++ unsigned long vstart;
++ u64 mask;
++ /* Allocate a zeroed buffer and store its machine address in *dma_handle; NULL on failure. */
++ /* ignore region specifiers */
++ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
++ /* A nonzero return means dma_alloc_from_coherent() has already set ret. */
++ if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
++ return ret;
++
++ if (dev == NULL || (dev->coherent_dma_mask < DMA_BIT_MASK(32)))
++ gfp |= GFP_DMA;
++
++ vstart = __get_free_pages(gfp, order);
++ ret = (void *)vstart;
++ /* fls64(mask) below is the address width passed to xen_create_contiguous_region(). */
++ if (dev != NULL && dev->coherent_dma_mask)
++ mask = dev->coherent_dma_mask;
++ else
++ mask = DMA_BIT_MASK(32);
++
++ if (ret != NULL) {
++ if (xen_create_contiguous_region(vstart, order,
++ fls64(mask)) != 0) {
++ free_pages(vstart, order);
++ return NULL;
++ }
++ memset(ret, 0, size);
++ *dma_handle = virt_to_machine(ret).maddr;
++ }
++ return ret;
++}
++
++static void xen_free_coherent(struct device *dev, size_t size,
++ void *vaddr, dma_addr_t dma_addr)
++{
++ int order = get_order(size);
++ /* Free a coherent buffer; nothing more to do if dma_release_from_coherent() took it. */
++ if (dma_release_from_coherent(dev, order, vaddr))
++ return;
++ /* Destroy the contiguous region before handing the pages back. */
++ xen_destroy_contiguous_region((unsigned long)vaddr, order);
++ free_pages((unsigned long)vaddr, order);
++}
++
++static dma_addr_t xen_map_page(struct device *dev, struct page *page,
++ unsigned long offset, size_t size,
++ enum dma_data_direction direction,
++ struct dma_attrs *attrs)
++{
++ dma_addr_t dma;
++ /* Return the DMA address of page + offset; BUG()s if it is outside the device's mask. */
++ BUG_ON(direction == DMA_NONE);
++
++ WARN_ON(size == 0);
++
++ dma = xen_dma_map_page(page) + offset;
++ /* No bounce path here: an address the device cannot reach is fatal. */
++ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
++ flush_write_buffers();
++ return dma;
++}
++
++static void xen_unmap_page(struct device *dev, dma_addr_t dma_addr,
++ size_t size,
++ enum dma_data_direction direction,
++ struct dma_attrs *attrs)
++{ /* Translate dma_addr back (mfn -> pfn -> page) and call xen_dma_unmap_page() on it. */
++ BUG_ON(direction == DMA_NONE);
++ xen_dma_unmap_page(pfn_to_page(mfn_to_pfn(PFN_DOWN(dma_addr))));
++}
++
++static struct dma_map_ops xen_dma_ops = {
++ .dma_supported = NULL,
++ /* Every allocation and mapping hook below is a xen_* function; no swiotlb. */
++ .alloc_coherent = xen_alloc_coherent,
++ .free_coherent = xen_free_coherent,
++
++ .map_page = xen_map_page,
++ .unmap_page = xen_unmap_page,
++
++ .map_sg = xen_map_sg,
++ .unmap_sg = xen_unmap_sg,
++
++ .mapping_error = NULL,
++
++ .is_phys = 0,
++};
++
++static struct dma_map_ops xen_swiotlb_dma_ops = {
++ .dma_supported = swiotlb_dma_supported,
++ /* Coherent allocations use the xen_* functions; all mapping hooks are swiotlb's. */
++ .alloc_coherent = xen_alloc_coherent,
++ .free_coherent = xen_free_coherent,
++
++ .map_page = swiotlb_map_page,
++ .unmap_page = swiotlb_unmap_page,
++
++ .map_sg = swiotlb_map_sg_attrs,
++ .unmap_sg = swiotlb_unmap_sg_attrs,
++
++ .mapping_error = swiotlb_dma_mapping_error,
++
++ .is_phys = 0,
++};
++
++void __init xen_iommu_init(void)
++{
++ if (!xen_pv_domain())
++ return;
++ /* PV domain only: clear force_iommu and install the Xen dma_ops. */
++ printk(KERN_INFO "Xen: Initializing Xen DMA ops\n");
++
++ force_iommu = 0;
++ dma_ops = &xen_dma_ops;
++ /* When swiotlb is set, use the swiotlb-backed ops instead. */
++ if (swiotlb) {
++ printk(KERN_INFO "Xen: Enabling DMA fallback to swiotlb\n");
++ dma_ops = &xen_swiotlb_dma_ops;
++ }
++}
++
+diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
+index 188e1ba..efac9e3 100644
+--- a/drivers/video/Kconfig
++++ b/drivers/video/Kconfig
+@@ -2063,6 +2063,7 @@ config XEN_FBDEV_FRONTEND
+ select FB_SYS_IMAGEBLIT
+ select FB_SYS_FOPS
+ select FB_DEFERRED_IO
++ select XEN_XENBUS_FRONTEND
+ default y
+ help
+ This driver implements the front-end of the Xen virtual
+diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c
+index 54cd916..966b226 100644
+--- a/drivers/video/xen-fbfront.c
++++ b/drivers/video/xen-fbfront.c
+@@ -25,7 +25,10 @@
+ #include <linux/module.h>
+ #include <linux/vmalloc.h>
+ #include <linux/mm.h>
++
+ #include <asm/xen/hypervisor.h>
++
++#include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+ #include <xen/interface/io/fbif.h>
+diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
+index cab100a..edeb9b2 100644
+--- a/drivers/xen/Kconfig
++++ b/drivers/xen/Kconfig
+@@ -28,6 +28,46 @@ config XEN_DEV_EVTCHN
+ firing.
+ If in doubt, say yes.
+
++config XEN_BACKEND
++ bool "Backend driver support"
++ depends on XEN_DOM0
++ default y
++ help
++ Support for backend device drivers that provide I/O services
++ to other virtual machines.
++
++config XEN_NETDEV_BACKEND
++ tristate "Xen backend network device"
++ depends on XEN_BACKEND && NET
++ help
++ Implement the network backend driver, which passes packets
++ from the guest domain's frontend drivers to the network.
++
++config XEN_BLKDEV_BACKEND
++ tristate "Block-device backend driver"
++ depends on XEN_BACKEND && BLOCK
++ help
++ The block-device backend driver allows the kernel to export its
++ block devices to other guests via a high-performance shared-memory
++ interface.
++
++
++config XEN_BLKDEV_TAP
++ tristate "Block-device tap backend driver"
++ depends on XEN_BACKEND && BLOCK
++ help
++ The block tap driver is an alternative to the block back driver
++ and allows VM block requests to be redirected to userspace through
++ a device interface. The tap allows user-space development of
++ high-performance block backends, where disk images may be implemented
++ as files, in memory, or on other hosts across the network. This
++ driver can safely coexist with the existing blockback driver.
++
++config XEN_BLKBACK_PAGEMAP
++ tristate
++ depends on XEN_BLKDEV_BACKEND != n && XEN_BLKDEV_TAP != n
++ default XEN_BLKDEV_BACKEND || XEN_BLKDEV_TAP
++
+ config XENFS
+ tristate "Xen filesystem"
+ depends on XEN
+@@ -60,4 +100,14 @@ config XEN_SYS_HYPERVISOR
+ Create entries under /sys/hypervisor describing the Xen
+ hypervisor environment. When running native or in another
+ virtual environment, /sys/hypervisor will still be present,
+- but will have no xen contents.
+\ No newline at end of file
++ but will have no xen contents.
++
++config XEN_XENBUS_FRONTEND
++ tristate
++
++config XEN_GNTDEV
++ tristate "userspace grant access device driver"
++ depends on XEN
++ select MMU_NOTIFIER
++ help
++ Allows userspace processes to use grants.
+diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
+index 7c28434..ab2e672 100644
+--- a/drivers/xen/Makefile
++++ b/drivers/xen/Makefile
+@@ -1,12 +1,20 @@
+-obj-y += grant-table.o features.o events.o manage.o
++obj-y += grant-table.o features.o events.o manage.o biomerge.o
+ obj-y += xenbus/
+
+ nostackp := $(call cc-option, -fno-stack-protector)
+ CFLAGS_features.o := $(nostackp)
+
+-obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
+-obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
+-obj-$(CONFIG_XEN_BALLOON) += balloon.o
+-obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
+-obj-$(CONFIG_XENFS) += xenfs/
+-obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
+\ No newline at end of file
++obj-$(CONFIG_PCI) += pci.o
++obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
++obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
++obj-$(CONFIG_XEN_BALLOON) += balloon.o
++obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
++obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
++obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
++obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
++obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
++obj-$(CONFIG_XENFS) += xenfs/
++obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
++
++xen-evtchn-y := evtchn.o
++xen-gntdev-y := gntdev.o
+diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
+index 4204336..d7c0eae 100644
+--- a/drivers/xen/balloon.c
++++ b/drivers/xen/balloon.c
+@@ -43,6 +43,7 @@
+ #include <linux/mutex.h>
+ #include <linux/list.h>
+ #include <linux/sysdev.h>
++#include <linux/swap.h>
+
+ #include <asm/page.h>
+ #include <asm/pgalloc.h>
+@@ -52,13 +53,15 @@
+
+ #include <asm/xen/hypervisor.h>
+ #include <asm/xen/hypercall.h>
++
++#include <xen/xen.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/memory.h>
+ #include <xen/xenbus.h>
+ #include <xen/features.h>
+ #include <xen/page.h>
+
+-#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
++#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT+balloon_order-10))
+
+ #define BALLOON_CLASS_NAME "xen_memory"
+
+@@ -82,14 +85,15 @@ static struct sys_device balloon_sysdev;
+
+ static int register_balloon(struct sys_device *sysdev);
+
++static struct balloon_stats balloon_stats;
++
+ /*
+- * Protects atomic reservation decrease/increase against concurrent increases.
+- * Also protects non-atomic updates of current_pages and driver_pages, and
+- * balloon lists.
++ * Work in pages of this order. Can be either 0 for normal pages
++ * or 9 for hugepages.
+ */
+-static DEFINE_SPINLOCK(balloon_lock);
+-
+-static struct balloon_stats balloon_stats;
++static int balloon_order;
++static unsigned long balloon_npages;
++static unsigned long discontig_frame_list[PAGE_SIZE / sizeof(unsigned long)];
+
+ /* We increase/decrease in batches which fit in a page */
+ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
+@@ -118,10 +122,41 @@ static struct timer_list balloon_timer;
+ static void scrub_page(struct page *page)
+ {
+ #ifdef CONFIG_XEN_SCRUB_PAGES
+- clear_highpage(page);
++ int i;
++
++ for (i = 0; i < balloon_npages; i++)
++ clear_highpage(page++);
+ #endif
+ }
+
++/*
++ * Hand the balloon_npages frames listed in discontig_frame_list[] back to
++ * Xen as individual order-0 extents.  Releasing any other number of
++ * extents than requested is treated as fatal (BUG).
++ */
++static void free_discontig_frame(void)
++{
++ int rc;
++ struct xen_memory_reservation reservation = {
++ .address_bits = 0,
++ .domid = DOMID_SELF,
++ .nr_extents = balloon_npages,
++ .extent_order = 0
++ };
++
++ set_xen_guest_handle(reservation.extent_start, discontig_frame_list);
++ rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
++ BUG_ON(rc != balloon_npages);
++}
++
++/*
++ * Compact frame_list[] after decrease_reservation() has zeroed the
++ * entries whose extents were already released frame-by-frame.
++ *
++ * The surviving (non-zero) entries among the first @nr_pages slots are
++ * moved to the front of the array, preserving their order.
++ *
++ * Returns the number of surviving entries, i.e. how many extents are
++ * still left to hand back in one batch.
++ */
++static unsigned long shrink_frame(unsigned long nr_pages)
++{
++	unsigned long src, dst = 0;
++
++	for (src = 0; src < nr_pages; src++) {
++		/* Never reads beyond nr_pages and copes with runs of zeros. */
++		if (frame_list[src] != 0)
++			frame_list[dst++] = frame_list[src];
++	}
++
++	return dst;
++}
++
+ /* balloon_append: add the given page to the balloon. */
+ static void balloon_append(struct page *page)
+ {
+@@ -195,19 +230,18 @@ static unsigned long current_target(void)
+
+ static int increase_reservation(unsigned long nr_pages)
+ {
+- unsigned long pfn, i, flags;
++ unsigned long pfn, mfn, i, j, flags;
+ struct page *page;
+ long rc;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+- .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
+
+- spin_lock_irqsave(&balloon_lock, flags);
++ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ page = balloon_first_page();
+ for (i = 0; i < nr_pages; i++) {
+@@ -218,6 +252,8 @@ static int increase_reservation(unsigned long nr_pages)
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
++ reservation.extent_order = balloon_order;
++
+ rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+ if (rc < 0)
+ goto out;
+@@ -227,19 +263,22 @@ static int increase_reservation(unsigned long nr_pages)
+ BUG_ON(page == NULL);
+
+ pfn = page_to_pfn(page);
++ mfn = frame_list[i];
+ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
+ phys_to_machine_mapping_valid(pfn));
+
+- set_phys_to_machine(pfn, frame_list[i]);
+-
+- /* Link back into the page tables if not highmem. */
+- if (pfn < max_low_pfn) {
+- int ret;
+- ret = HYPERVISOR_update_va_mapping(
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- mfn_pte(frame_list[i], PAGE_KERNEL),
+- 0);
+- BUG_ON(ret);
++ for (j = 0; j < balloon_npages; j++, pfn++, mfn++) {
++ set_phys_to_machine(pfn, mfn);
++
++ /* Link back into the page tables if not highmem. */
++ if (pfn < max_low_pfn) {
++ int ret;
++ ret = HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(pfn << PAGE_SHIFT),
++ mfn_pte(mfn, PAGE_KERNEL),
++ 0);
++ BUG_ON(ret);
++ }
+ }
+
+ /* Relinquish the page back to the allocator. */
+@@ -251,20 +290,20 @@ static int increase_reservation(unsigned long nr_pages)
+ balloon_stats.current_pages += rc;
+
+ out:
+- spin_unlock_irqrestore(&balloon_lock, flags);
++ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ return rc < 0 ? rc : rc != nr_pages;
+ }
+
+ static int decrease_reservation(unsigned long nr_pages)
+ {
+- unsigned long pfn, i, flags;
++ unsigned long pfn, lpfn, mfn, i, j, flags;
+ struct page *page;
+ int need_sleep = 0;
+- int ret;
++ int discontig = 0, discontig_free; /* discontig: set once any extent was freed frame-by-frame */
++ int ret;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+- .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+@@ -272,7 +311,7 @@ static int decrease_reservation(unsigned long nr_pages)
+ nr_pages = ARRAY_SIZE(frame_list);
+
+ for (i = 0; i < nr_pages; i++) {
+- if ((page = alloc_page(GFP_BALLOON)) == NULL) {
++ if ((page = alloc_pages(GFP_BALLOON, balloon_order)) == NULL) {
+ nr_pages = i;
+ need_sleep = 1;
+ break;
+@@ -282,37 +321,50 @@ static int decrease_reservation(unsigned long nr_pages)
+ frame_list[i] = pfn_to_mfn(pfn);
+
+ scrub_page(page);
+-
+- if (!PageHighMem(page)) {
+- ret = HYPERVISOR_update_va_mapping(
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- __pte_ma(0), 0);
+- BUG_ON(ret);
+- }
+-
+ }
+
+ /* Ensure that ballooned highmem pages don't have kmaps. */
+ kmap_flush_unused();
+ flush_tlb_all();
+
+- spin_lock_irqsave(&balloon_lock, flags);
++ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* No more mappings: invalidate P2M and add to balloon. */
+ for (i = 0; i < nr_pages; i++) {
+- pfn = mfn_to_pfn(frame_list[i]);
+- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++ mfn = frame_list[i];
++ lpfn = pfn = mfn_to_pfn(mfn);
+ balloon_append(pfn_to_page(pfn));
++ discontig_free = 0;
++ for (j = 0; j < balloon_npages; j++, lpfn++, mfn++) {
++ if ((discontig_frame_list[j] = pfn_to_mfn(lpfn)) != mfn)
++ discontig_free = 1;
++
++ set_phys_to_machine(lpfn, INVALID_P2M_ENTRY);
++ if (!PageHighMem(pfn_to_page(lpfn))) { /* 'page' is stale (possibly NULL) in this loop */
++ ret = HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(lpfn << PAGE_SHIFT),
++ __pte_ma(0), 0);
++ BUG_ON(ret);
++ }
++ }
++ if (discontig_free) {
++ free_discontig_frame();
++ frame_list[i] = 0;
++ discontig = 1;
++ }
+ }
++ balloon_stats.current_pages -= nr_pages;
++
++ if (discontig)
++ nr_pages = shrink_frame(nr_pages);
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
++ reservation.extent_order = balloon_order;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+ BUG_ON(ret != nr_pages);
+
+- balloon_stats.current_pages -= nr_pages;
+-
+- spin_unlock_irqrestore(&balloon_lock, flags);
++ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ return need_sleep;
+ }
+@@ -379,7 +431,7 @@ static void watch_target(struct xenbus_watch *watch,
+ /* The given memory/target value is in KiB, so it needs converting to
+ * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
+ */
+- balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
++ balloon_set_new_target(new_target >> ((PAGE_SHIFT - 10) + balloon_order));
+ }
+
+ static int balloon_init_watcher(struct notifier_block *notifier,
+@@ -405,9 +457,12 @@ static int __init balloon_init(void)
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+- pr_info("xen_balloon: Initialising balloon driver.\n");
++ pr_info("xen_balloon: Initialising balloon driver with page order %d.\n",
++ balloon_order);
++
++ balloon_npages = 1 << balloon_order;
+
+- balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
++ balloon_stats.current_pages = (min(xen_start_info->nr_pages, max_pfn)) >> balloon_order;
+ balloon_stats.target_pages = balloon_stats.current_pages;
+ balloon_stats.balloon_low = 0;
+ balloon_stats.balloon_high = 0;
+@@ -420,7 +475,7 @@ static int __init balloon_init(void)
+ register_balloon(&balloon_sysdev);
+
+ /* Initialise the balloon with excess memory space. */
+- for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
++ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn += balloon_npages) {
+ page = pfn_to_page(pfn);
+ if (!PageReserved(page))
+ balloon_append(page);
+@@ -444,6 +499,121 @@ static void balloon_exit(void)
+
+ module_exit(balloon_exit);
+
++/*
++ * Handler for the "balloon_hugepages" kernel command-line option: makes
++ * the balloon work in order-9 units.  The option's argument string is
++ * ignored.  Always returns 1.
++ */
++static int __init balloon_parse_huge(char *s)
++{
++ balloon_order = 9;
++ return 1;
++}
++
++__setup("balloon_hugepages", balloon_parse_huge);
++
++/*
++ * Per-PTE callback handed to apply_to_page_range() by
++ * alloc_empty_pages_and_pagevec().
++ *
++ * Clears the PTE at @addr, invalidates the P2M entry for the
++ * corresponding pfn and returns the machine frame the PTE pointed at to
++ * Xen.  BUGs unless exactly one extent was released.  @pmd_page and
++ * @data are unused.  Always returns 0.
++ */
++static int dealloc_pte_fn(pte_t *pte, struct page *pmd_page,
++ unsigned long addr, void *data)
++{
++ unsigned long mfn = pte_mfn(*pte);
++ int ret;
++ struct xen_memory_reservation reservation = {
++ .nr_extents = 1,
++ .extent_order = 0,
++ .domid = DOMID_SELF
++ };
++
++ /* mfn was read before the PTE is cleared below. */
++ set_xen_guest_handle(reservation.extent_start, &mfn);
++ set_pte_at(&init_mm, addr, pte, __pte_ma(0));
++ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
++
++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
++ BUG_ON(ret != 1);
++
++ return 0;
++}
++
++/*
++ * Allocate pages from the kernel, give their backing machine frames back
++ * to Xen, and return the now-empty struct pages in a kmalloc()ed vector.
++ *
++ * @nr_pages: number of order-balloon_order allocations to perform; the
++ *            returned vector holds nr_pages << balloon_order entries.
++ *
++ * Returns the page vector, or NULL on failure.  On failure the
++ * allocations that had already been emptied are put on the balloon
++ * list.  The balloon worker is scheduled in both cases.
++ */
++struct page **alloc_empty_pages_and_pagevec(int nr_pages)
++{
++	struct page *page, **pagevec;
++	int i, j, ret;
++
++	/*
++	 * NOTE(review): nr_pages is consumed as a count of balloon_order
++	 * sized allocations, not of single pages.  A rounded-up page count
++	 * used to be computed here but was never used, so it was dropped;
++	 * confirm the unit callers expect when balloon_order != 0.
++	 */
++	pagevec = kmalloc(sizeof(page) * nr_pages << balloon_order, GFP_KERNEL);
++	if (pagevec == NULL)
++		return NULL;
++
++	for (i = 0; i < nr_pages; i++) {
++		void *v;
++
++		page = alloc_pages(GFP_KERNEL|__GFP_COLD, balloon_order);
++		if (page == NULL)
++			goto err;
++
++		scrub_page(page);
++
++		mutex_lock(&balloon_mutex);
++
++		v = page_address(page);
++
++		/* Unmap every page of the allocation and return its frame. */
++		ret = apply_to_page_range(&init_mm, (unsigned long)v,
++					  PAGE_SIZE << balloon_order,
++					  dealloc_pte_fn, NULL);
++
++		if (ret != 0) {
++			mutex_unlock(&balloon_mutex);
++			/*
++			 * Free with the order used for the allocation;
++			 * __free_page() would release only the first page.
++			 */
++			__free_pages(page, balloon_order);
++			goto err;
++		}
++		for (j = 0; j < balloon_npages; j++)
++			pagevec[(i<<balloon_order)+j] = page++;
++
++		totalram_pages = balloon_stats.current_pages -= balloon_npages;
++
++		mutex_unlock(&balloon_mutex);
++	}
++
++ out:
++	schedule_work(&balloon_worker);
++	flush_tlb_all();
++	return pagevec;
++
++ err:
++	/* Only the head page of each completed allocation is appended. */
++	mutex_lock(&balloon_mutex);
++	while (--i >= 0)
++		balloon_append(pagevec[i << balloon_order]);
++	mutex_unlock(&balloon_mutex);
++	kfree(pagevec);
++	pagevec = NULL;
++	goto out;
++}
++EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
++
++/*
++ * Give back a vector obtained from alloc_empty_pages_and_pagevec().
++ *
++ * @pagevec:  vector to release; NULL is accepted and ignored.
++ * @nr_pages: the count that was passed when the vector was allocated.
++ *
++ * The head page of each of the @nr_pages allocations is put on the
++ * balloon list (BUG if its reference count is not 1), the vector itself
++ * is freed, and the balloon worker is scheduled.
++ */
++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
++{
++	struct page *page;
++	int i;
++
++	if (pagevec == NULL)
++		return;
++
++	mutex_lock(&balloon_mutex);
++	for (i = 0; i < nr_pages; i++) {
++		/* One vector slot group per balloon_order allocation. */
++		page = pagevec[i << balloon_order];
++		BUG_ON(page_count(page) != 1);
++		balloon_append(page);
++	}
++	mutex_unlock(&balloon_mutex);
++
++	kfree(pagevec);
++
++	schedule_work(&balloon_worker);
++}
++EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
++
+ #define BALLOON_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct sys_device *dev, \
+ struct sysdev_attribute *attr, \
+@@ -477,7 +647,7 @@ static ssize_t store_target_kb(struct sys_device *dev,
+
+ target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;
+
+- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
++ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
+
+ return count;
+ }
+@@ -491,7 +661,7 @@ static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr
+ {
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)balloon_stats.target_pages
+- << PAGE_SHIFT);
++ << (PAGE_SHIFT + balloon_order));
+ }
+
+ static ssize_t store_target(struct sys_device *dev,
+@@ -507,7 +677,7 @@ static ssize_t store_target(struct sys_device *dev,
+
+ target_bytes = memparse(buf, &endchar);
+
+- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
++ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
+
+ return count;
+ }
+diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
+new file mode 100644
+index 0000000..d40f534
+--- /dev/null
++++ b/drivers/xen/biomerge.c
+@@ -0,0 +1,14 @@
++#include <linux/bio.h>
++#include <asm/io.h>
++#include <xen/page.h>
++
++/*
++ * Decide whether two bio vectors may be merged under Xen.
++ *
++ * Returns true only when the generic __BIOVEC_PHYS_MERGEABLE() test
++ * passes and the pages' machine frames are identical or consecutive
++ * (the frame of @vec2 directly follows the frame of @vec1).
++ */
++bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
++ const struct bio_vec *vec2)
++{
++ unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
++ unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
++
++ return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
++ ((mfn1 == mfn2) || ((mfn1+1) == mfn2));
++}
++
+diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile
+new file mode 100644
+index 0000000..dee55ba
+--- /dev/null
++++ b/drivers/xen/blkback/Makefile
+@@ -0,0 +1,4 @@
++obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
++obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o
++
++xen-blkback-y := blkback.o xenbus.o interface.o vbd.o
+diff --git a/drivers/xen/blkback/blkback-pagemap.c b/drivers/xen/blkback/blkback-pagemap.c
+new file mode 100644
+index 0000000..45f6eb2
+--- /dev/null
++++ b/drivers/xen/blkback/blkback-pagemap.c
+@@ -0,0 +1,109 @@
++#include <linux/module.h>
++#include "blkback-pagemap.h"
++
++static int blkback_pagemap_size;
++static struct blkback_pagemap *blkback_pagemap;
++
++/*
++ * Return non-zero when @map is unused, i.e. every byte of the entry is
++ * zero (compared against a static, zero-initialised entry).
++ */
++static inline int
++blkback_pagemap_entry_clear(struct blkback_pagemap *map)
++{
++ static struct blkback_pagemap zero;
++ return !memcmp(map, &zero, sizeof(zero));
++}
++
++/*
++ * Allocate the zero-filled page map with room for @pages entries.
++ *
++ * Returns 0 on success or -ENOMEM if the table cannot be allocated.
++ */
++int
++blkback_pagemap_init(int pages)
++{
++	/* kcalloc() checks the count * size multiplication for overflow. */
++	blkback_pagemap = kcalloc(pages, sizeof(*blkback_pagemap),
++				  GFP_KERNEL);
++	if (!blkback_pagemap)
++		return -ENOMEM;
++
++	blkback_pagemap_size = pages;
++	return 0;
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_init);
++
++/*
++ * Fill slot @idx of the page map with @page, @domid, @busid and @gref.
++ * The slot index is also stored in the page's private field so the
++ * entry can later be located from the page alone.
++ *
++ * BUGs if the map has not been initialised, if @idx lies outside
++ * [0, blkback_pagemap_size), or if the slot is already in use.
++ */
++void
++blkback_pagemap_set(int idx, struct page *page,
++		    domid_t domid, busid_t busid, grant_ref_t gref)
++{
++	struct blkback_pagemap *entry;
++
++	BUG_ON(!blkback_pagemap);
++	/* A negative index would slip past a bare upper-bound test. */
++	BUG_ON(idx < 0 || idx >= blkback_pagemap_size);
++
++	set_page_private(page, idx);
++
++	entry = blkback_pagemap + idx;
++	if (!blkback_pagemap_entry_clear(entry)) {
++		printk("overwriting pagemap %d: d %u b %u g %u\n",
++		       idx, entry->domid, entry->busid, entry->gref);
++		BUG();
++	}
++
++	entry->page  = page;
++	entry->domid = domid;
++	entry->busid = busid;
++	entry->gref  = gref;
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_set);
++
++/*
++ * Zero the page map entry that @page's private field points at.
++ *
++ * BUGs if the map has not been initialised, if the stored index lies
++ * outside [0, blkback_pagemap_size), or if the entry is already empty.
++ */
++void
++blkback_pagemap_clear(struct page *page)
++{
++	int idx;
++	struct blkback_pagemap *entry;
++
++	idx = (int)page_private(page);
++
++	BUG_ON(!blkback_pagemap);
++	/* The cast above can yield a negative index; reject that too. */
++	BUG_ON(idx < 0 || idx >= blkback_pagemap_size);
++
++	entry = blkback_pagemap + idx;
++	if (blkback_pagemap_entry_clear(entry)) {
++		printk("clearing empty pagemap %d\n", idx);
++		BUG();
++	}
++
++	memset(entry, 0, sizeof(*entry));
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_clear);
++
++/*
++ * Return a copy of the page map entry that @page's private field
++ * points at.
++ *
++ * BUGs if the map has not been initialised, if the stored index lies
++ * outside [0, blkback_pagemap_size), or if the entry is empty.
++ */
++struct blkback_pagemap
++blkback_pagemap_read(struct page *page)
++{
++	int idx;
++	struct blkback_pagemap *entry;
++
++	idx = (int)page_private(page);
++
++	BUG_ON(!blkback_pagemap);
++	/* The cast above can yield a negative index; reject that too. */
++	BUG_ON(idx < 0 || idx >= blkback_pagemap_size);
++
++	entry = blkback_pagemap + idx;
++	if (blkback_pagemap_entry_clear(entry)) {
++		printk("reading empty pagemap %d\n", idx);
++		BUG();
++	}
++
++	return *entry;
++}
++EXPORT_SYMBOL(blkback_pagemap_read);
++
++MODULE_LICENSE("Dual BSD/GPL");
++
++/*
++ * Report whether @page is currently recorded in the page map.
++ *
++ * Returns non-zero only when the index stored in the page's private
++ * field is within the map and that entry refers back to @page;
++ * out-of-range indices yield 0 rather than a BUG.
++ */
++int
++blkback_pagemap_contains_page(struct page *page)
++{
++ struct blkback_pagemap *entry;
++ int idx = (int)page_private(page);
++
++ if (idx < 0 || idx >= blkback_pagemap_size)
++ return 0;
++
++ entry = blkback_pagemap + idx;
++
++ return (entry->page == page);
++}
++EXPORT_SYMBOL(blkback_pagemap_contains_page);
+diff --git a/drivers/xen/blkback/blkback-pagemap.h b/drivers/xen/blkback/blkback-pagemap.h
+new file mode 100644
+index 0000000..7f97d15
+--- /dev/null
++++ b/drivers/xen/blkback/blkback-pagemap.h
+@@ -0,0 +1,36 @@
++#ifndef _BLKBACK_PAGEMAP_H_
++#define _BLKBACK_PAGEMAP_H_
++
++#include <linux/mm.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/grant_table.h>
++
++typedef unsigned int busid_t;
++
++/*
++ * One page map slot, filled in by blkback_pagemap_set().  A slot whose
++ * bytes are all zero is treated as unused.
++ */
++struct blkback_pagemap {
++ struct page *page; /* page recorded in this slot */
++ domid_t domid; /* domain id passed to blkback_pagemap_set() */
++ busid_t busid; /* bus id passed to blkback_pagemap_set() */
++ grant_ref_t gref; /* grant reference passed to blkback_pagemap_set() */
++};
++
++#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE)
++
++int blkback_pagemap_init(int);
++void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t);
++void blkback_pagemap_clear(struct page *);
++struct blkback_pagemap blkback_pagemap_read(struct page *);
++int blkback_pagemap_contains_page(struct page *page);
++
++#else /* CONFIG_XEN_BLKBACK_PAGEMAP */
++
++static inline int blkback_pagemap_init(int pages) { return 0; }
++static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom,
++ busid_t bus, grant_ref_t gnt) {}
++static inline void blkback_pagemap_clear(struct page *page) {}
++#define blkback_pagemap_read(_page) ({ BUG(); (struct blkback_pagemap){0}; })
++static inline int blkback_pagemap_contains_page(struct page *page) { return 0; }
++
++#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */
++
++#endif
+diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c
+new file mode 100644
+index 0000000..e644dd5
+--- /dev/null
++++ b/drivers/xen/blkback/blkback.c
+@@ -0,0 +1,672 @@
++/******************************************************************************
++ * arch/xen/drivers/blkif/backend/main.c
++ *
++ * Back-end of the driver for virtual block devices. This portion of the
++ * driver exports a 'unified' block-device interface that can be accessed
++ * by any operating system that implements a compatible front end. A
++ * reference front-end implementation can be found in:
++ * arch/xen/drivers/blkif/frontend
++ *
++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
++ * Copyright (c) 2005, Christopher Clark
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/kthread.h>
++#include <linux/list.h>
++#include <linux/delay.h>
++#include <linux/freezer.h>
++
++#include <xen/balloon.h>
++#include <xen/events.h>
++#include <xen/page.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include "common.h"
++
++/*
++ * These are rather arbitrary. They are fairly large because adjacent requests
++ * pulled from a communication ring are quite likely to end up being part of
++ * the same scatter/gather request at the disc.
++ *
++ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
++ *
++ * This will increase the chances of being able to write whole tracks.
++ * 64 should be enough to keep us competitive with Linux.
++ */
++static int blkif_reqs = 64;
++module_param_named(reqs, blkif_reqs, int, 0);
++MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
++
++/* Run-time switchable: /sys/module/xen_blkback/parameters/ */
++static unsigned int log_stats = 0;
++static unsigned int debug_lvl = 0;
++module_param(log_stats, int, 0644);
++module_param(debug_lvl, int, 0644);
++
++/*
++ * Each outstanding request that we've passed to the lower device layers has a
++ * 'pending_req' allocated to it. Each buffer_head that completes decrements
++ * the pendcnt towards zero. When it hits zero, the specified domain has a
++ * response queued for it, with the saved 'id' passed back.
++ */
++typedef struct {
++ blkif_t *blkif; /* interface the request arrived on */
++ u64 id; /* request id, passed back in the response */
++ int nr_pages; /* number of segments in the request */
++ atomic_t pendcnt; /* decremented per completion; response sent at zero */
++ unsigned short operation; /* BLKIF_OP_* code copied from the request */
++ int status; /* BLKIF_RSP_* value reported in the response */
++ struct list_head free_list; /* link in the pending_free list */
++} pending_req_t;
++
++static pending_req_t *pending_reqs;
++static struct list_head pending_free;
++static DEFINE_SPINLOCK(pending_free_lock);
++static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
++
++#define BLKBACK_INVALID_HANDLE (~0)
++
++static struct page **pending_pages;
++static grant_handle_t *pending_grant_handles;
++
++/*
++ * Index into the pending_pages[] / pending_grant_handles[] arrays for
++ * segment @seg of @req: each request owns a run of
++ * BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive slots.
++ */
++static inline int vaddr_pagenr(pending_req_t *req, int seg)
++{
++ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
++}
++
++/* The struct page reserved for segment @seg of request @req. */
++#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
++
++/* Kernel virtual address of the page reserved for segment @seg of @req. */
++static inline unsigned long vaddr(pending_req_t *req, int seg)
++{
++ unsigned long pfn = page_to_pfn(pending_page(req, seg));
++ return (unsigned long)pfn_to_kaddr(pfn);
++}
++
++/* The grant handle slot for segment @_seg of request @_req (an lvalue). */
++#define pending_handle(_req, _seg) \
++ (pending_grant_handles[vaddr_pagenr(_req, _seg)])
++
++
++static int do_block_io_op(blkif_t *blkif);
++static void dispatch_rw_block_io(blkif_t *blkif,
++ struct blkif_request *req,
++ pending_req_t *pending_req);
++static void make_response(blkif_t *blkif, u64 id,
++ unsigned short op, int st);
++
++/******************************************************************
++ * misc small helpers
++ */
++/*
++ * Take one request descriptor off the pending_free list.
++ * Returns NULL when the list is empty.
++ */
++static pending_req_t* alloc_req(void)
++{
++	unsigned long irq_flags;
++	pending_req_t *found;
++
++	spin_lock_irqsave(&pending_free_lock, irq_flags);
++	if (list_empty(&pending_free)) {
++		found = NULL;
++	} else {
++		found = list_entry(pending_free.next, pending_req_t, free_list);
++		list_del(&found->free_list);
++	}
++	spin_unlock_irqrestore(&pending_free_lock, irq_flags);
++
++	return found;
++}
++
++/*
++ * Put @req back on the pending_free list.  If the list had been empty,
++ * wake anyone sleeping on pending_free_wq once the lock is dropped.
++ */
++static void free_req(pending_req_t *req)
++{
++	unsigned long irq_flags;
++	int list_was_empty;
++
++	spin_lock_irqsave(&pending_free_lock, irq_flags);
++	list_was_empty = list_empty(&pending_free);
++	list_add(&req->free_list, &pending_free);
++	spin_unlock_irqrestore(&pending_free_lock, irq_flags);
++
++	if (!list_was_empty)
++		return;
++
++	wake_up(&pending_free_wq);
++}
++
++/*
++ * Unplug the request queue remembered in blkif->plug, if any: call its
++ * unplug_fn when one is set, drop the queue reference and forget it.
++ */
++static void unplug_queue(blkif_t *blkif)
++{
++	struct request_queue *queue = blkif->plug;
++
++	if (!queue)
++		return;
++
++	if (queue->unplug_fn)
++		queue->unplug_fn(queue);
++
++	blk_put_queue(queue);
++	blkif->plug = NULL;
++}
++
++/*
++ * Remember the request queue of @bdev in blkif->plug.  If a different
++ * queue was remembered, it is unplugged first; a reference is taken on
++ * the new queue.  Nothing happens when the queue is unchanged.
++ */
++static void plug_queue(blkif_t *blkif, struct block_device *bdev)
++{
++ struct request_queue *q = bdev_get_queue(bdev);
++
++ if (q == blkif->plug)
++ return;
++ unplug_queue(blkif);
++ blk_get_queue(q);
++ blkif->plug = q;
++}
++
++/*
++ * Unmap all granted pages still mapped for @req.
++ *
++ * For every segment with a valid grant handle, the page map entry is
++ * cleared, an unmap operation is queued and the handle is marked
++ * invalid.  The queued operations are then issued in a single
++ * GNTTABOP_unmap_grant_ref hypercall; a failing hypercall is a BUG.
++ */
++static void fast_flush_area(pending_req_t *req)
++{
++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ unsigned int i, invcount = 0;
++ grant_handle_t handle;
++ int ret;
++
++ for (i = 0; i < req->nr_pages; i++) {
++ handle = pending_handle(req, i);
++ /* Segments that never got mapped have nothing to undo. */
++ if (handle == BLKBACK_INVALID_HANDLE)
++ continue;
++ blkback_pagemap_clear(pending_page(req, i));
++ gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
++ GNTMAP_host_map, handle);
++ pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
++ invcount++;
++ }
++
++ ret = HYPERVISOR_grant_table_op(
++ GNTTABOP_unmap_grant_ref, unmap, invcount);
++ BUG_ON(ret);
++}
++
++/******************************************************************
++ * SCHEDULER FUNCTIONS
++ */
++
++/*
++ * Log the request counters of @blkif at debug level, set the next print
++ * time to 10 seconds from now and reset the read, write and
++ * out-of-requests counters.  The barrier counter is printed but not
++ * reset here.
++ */
++static void print_stats(blkif_t *blkif)
++{
++ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n",
++ current->comm, blkif->st_oo_req,
++ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
++ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
++ blkif->st_rd_req = 0;
++ blkif->st_wr_req = 0;
++ blkif->st_oo_req = 0;
++}
++
++/*
++ * Main loop of the per-interface kernel thread.
++ *
++ * @arg: the blkif_t this thread serves.
++ *
++ * Holds a reference on the interface for the lifetime of the loop.
++ * Each iteration sleeps until the interface has requests waiting and a
++ * free pending request exists (or the thread is asked to stop), then
++ * processes the ring, unplugs the queue and optionally prints
++ * statistics.  Clears blkif->xenblkd on exit.  Always returns 0.
++ */
++int blkif_schedule(void *arg)
++{
++ blkif_t *blkif = arg;
++
++ blkif_get(blkif);
++
++ if (debug_lvl)
++ printk(KERN_DEBUG "%s: started\n", current->comm);
++
++ while (!kthread_should_stop()) {
++ if (try_to_freeze())
++ continue;
++
++ wait_event_interruptible(
++ blkif->wq,
++ blkif->waiting_reqs || kthread_should_stop());
++ wait_event_interruptible(
++ pending_free_wq,
++ !list_empty(&pending_free) || kthread_should_stop());
++
++ blkif->waiting_reqs = 0;
++ smp_mb(); /* clear flag *before* checking for work */
++
++ /* Work was left on the ring: make sure we come round again. */
++ if (do_block_io_op(blkif))
++ blkif->waiting_reqs = 1;
++ unplug_queue(blkif);
++
++ if (log_stats && time_after(jiffies, blkif->st_print))
++ print_stats(blkif);
++ }
++
++ if (log_stats)
++ print_stats(blkif);
++ if (debug_lvl)
++ printk(KERN_DEBUG "%s: exiting\n", current->comm);
++
++ blkif->xenblkd = NULL;
++ blkif_put(blkif);
++
++ return 0;
++}
++
++/******************************************************************
++ * COMPLETION CALLBACK -- Called as bh->b_end_io()
++ */
++
++/*
++ * Account for one completed piece of I/O belonging to @pending_req.
++ *
++ * A non-zero @error is recorded in the request's status: -EOPNOTSUPP on
++ * a write barrier becomes BLKIF_RSP_EOPNOTSUPP (and blkback_barrier()
++ * is called with 0), any other error becomes BLKIF_RSP_ERROR.  When the
++ * last outstanding piece completes, the granted pages are unmapped, the
++ * response is sent, the interface reference is dropped and the request
++ * descriptor is returned to the free list.
++ */
++static void __end_block_io_op(pending_req_t *pending_req, int error)
++{
++ /* An error fails the entire request. */
++ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
++ (error == -EOPNOTSUPP)) {
++ DPRINTK("blkback: write barrier op failed, not supported\n");
++ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
++ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
++ } else if (error) {
++ DPRINTK("Buffer not up-to-date at end of operation, "
++ "error=%d\n", error);
++ pending_req->status = BLKIF_RSP_ERROR;
++ }
++
++ if (atomic_dec_and_test(&pending_req->pendcnt)) {
++ fast_flush_area(pending_req);
++ make_response(pending_req->blkif, pending_req->id,
++ pending_req->operation, pending_req->status);
++ blkif_put(pending_req->blkif);
++ free_req(pending_req);
++ }
++}
++
++/*
++ * bio completion handler: forwards the result to __end_block_io_op()
++ * for the request stored in bio->bi_private, then drops the bio.
++ */
++static void end_block_io_op(struct bio *bio, int error)
++{
++ __end_block_io_op(bio->bi_private, error);
++ bio_put(bio);
++}
++
++
++/******************************************************************************
++ * NOTIFICATION FROM GUEST OS.
++ */
++
++/* Flag that @blkif has requests waiting and wake sleepers on its wait queue. */
++static void blkif_notify_work(blkif_t *blkif)
++{
++ blkif->waiting_reqs = 1;
++ wake_up(&blkif->wq);
++}
++
++/*
++ * Interrupt handler: @dev_id is the blkif_t to notify.  Always returns
++ * IRQ_HANDLED.
++ */
++irqreturn_t blkif_be_int(int irq, void *dev_id)
++{
++ blkif_notify_work(dev_id);
++ return IRQ_HANDLED;
++}
++
++
++
++/******************************************************************
++ * DOWNWARD CALLS -- These interface with the block-device layer proper.
++ */
++
++/*
++ * Consume requests from the shared ring of @blkif.
++ *
++ * Requests between the private consumer index and the producer index
++ * sampled on entry are copied into a local structure (converted
++ * according to the ring protocol) and dispatched.  Unknown operations
++ * are answered with BLKIF_RSP_ERROR after a short sleep.
++ *
++ * Returns 1 when the loop stopped early because the thread is being
++ * stopped or no free pending request was available, otherwise 0.
++ */
++static int do_block_io_op(blkif_t *blkif)
++{
++ union blkif_back_rings *blk_rings = &blkif->blk_rings;
++ struct blkif_request req;
++ pending_req_t *pending_req;
++ RING_IDX rc, rp;
++ int more_to_do = 0;
++
++ rc = blk_rings->common.req_cons;
++ rp = blk_rings->common.sring->req_prod;
++ rmb(); /* Ensure we see queued requests up to 'rp'. */
++
++ while (rc != rp) {
++
++ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
++ break;
++
++ if (kthread_should_stop()) {
++ more_to_do = 1;
++ break;
++ }
++
++ pending_req = alloc_req();
++ if (NULL == pending_req) {
++ blkif->st_oo_req++;
++ more_to_do = 1;
++ break;
++ }
++
++ /* Take a private copy of the request in the native layout. */
++ switch (blkif->blk_protocol) {
++ case BLKIF_PROTOCOL_NATIVE:
++ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
++ break;
++ case BLKIF_PROTOCOL_X86_32:
++ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
++ break;
++ case BLKIF_PROTOCOL_X86_64:
++ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
++ break;
++ default:
++ BUG();
++ }
++ blk_rings->common.req_cons = ++rc; /* before make_response() */
++
++ /* Apply all sanity checks to /private copy/ of request. */
++ barrier();
++
++ switch (req.operation) {
++ case BLKIF_OP_READ:
++ blkif->st_rd_req++;
++ dispatch_rw_block_io(blkif, &req, pending_req);
++ break;
++ case BLKIF_OP_WRITE_BARRIER:
++ blkif->st_br_req++;
++ /* fall through */
++ case BLKIF_OP_WRITE:
++ blkif->st_wr_req++;
++ dispatch_rw_block_io(blkif, &req, pending_req);
++ break;
++ default:
++ /* A good sign something is wrong: sleep for a while to
++ * avoid excessive CPU consumption by a bad guest. */
++ msleep(1);
++ DPRINTK("error: unknown block io operation [%d]\n",
++ req.operation);
++ make_response(blkif, req.id, req.operation,
++ BLKIF_RSP_ERROR);
++ free_req(pending_req);
++ break;
++ }
++
++ /* Yield point for this unbounded loop. */
++ cond_resched();
++ }
++
++ return more_to_do;
++}
++
++static void dispatch_rw_block_io(blkif_t *blkif,
++ struct blkif_request *req,
++ pending_req_t *pending_req)
++{
++ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ struct phys_req preq;
++ struct {
++ unsigned long buf; unsigned int nsec;
++ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ unsigned int nseg;
++ struct bio *bio = NULL;
++ int ret, i;
++ int operation;
++ /* Translate the blkif operation into READ / WRITE / WRITE_BARRIER. */
++ switch (req->operation) {
++ case BLKIF_OP_READ:
++ operation = READ;
++ break;
++ case BLKIF_OP_WRITE:
++ operation = WRITE;
++ break;
++ case BLKIF_OP_WRITE_BARRIER:
++ operation = WRITE_BARRIER;
++ break;
++ default:
++ operation = 0; /* make gcc happy */
++ BUG();
++ }
++
++ /* Check that number of segments is sane. */
++ nseg = req->nr_segments;
++ if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
++ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
++ DPRINTK("Bad number of segments in request (%d)\n", nseg);
++ goto fail_response;
++ }
++
++ preq.dev = req->handle;
++ preq.sector_number = req->sector_number;
++ preq.nr_sects = 0;
++ /* Record what make_response() will need once the I/O completes. */
++ pending_req->blkif = blkif;
++ pending_req->id = req->id;
++ pending_req->operation = req->operation;
++ pending_req->status = BLKIF_RSP_OKAY;
++ pending_req->nr_pages = nseg;
++ /* Validate each segment's sector range and prepare its grant-map op. */
++ for (i = 0; i < nseg; i++) {
++ uint32_t flags;
++
++ seg[i].nsec = req->seg[i].last_sect -
++ req->seg[i].first_sect + 1;
++
++ if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
++ (req->seg[i].last_sect < req->seg[i].first_sect))
++ goto fail_response;
++ preq.nr_sects += seg[i].nsec;
++ /* Anything but a READ maps the grant read-only. */
++ flags = GNTMAP_host_map;
++ if (operation != READ)
++ flags |= GNTMAP_readonly;
++ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
++ req->seg[i].gref, blkif->domid);
++ }
++ /* Map all segments with a single grant-table hypercall. */
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
++ BUG_ON(ret);
++ /* Flag failed mappings in ret; record frame and handle for the rest. */
++ for (i = 0; i < nseg; i++) {
++ if (unlikely(map[i].status != 0)) {
++ DPRINTK("invalid buffer -- could not remap it\n");
++ map[i].handle = BLKBACK_INVALID_HANDLE;
++ ret |= 1;
++ continue;
++ }
++
++ set_phys_to_machine(
++ page_to_pfn(pending_page(pending_req, i)),
++ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
++ seg[i].buf = map[i].dev_bus_addr |
++ (req->seg[i].first_sect << 9);
++ blkback_pagemap_set(vaddr_pagenr(pending_req, i),
++ pending_page(pending_req, i),
++ blkif->domid, req->handle,
++ req->seg[i].gref);
++ pending_handle(pending_req, i) = map[i].handle;
++ }
++
++ if (ret)
++ goto fail_flush;
++
++ if (vbd_translate(&preq, blkif, operation) != 0) {
++ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
++ operation == READ ? "read" : "write",
++ preq.sector_number,
++ preq.sector_number + preq.nr_sects, preq.dev);
++ goto fail_flush;
++ }
++ /* pendcnt starts at 1 and is bumped for each extra bio submitted below. */
++ plug_queue(blkif, preq.bdev);
++ atomic_set(&pending_req->pendcnt, 1);
++ blkif_get(blkif);
++
++ for (i = 0; i < nseg; i++) {
++ if (((int)preq.sector_number|(int)seg[i].nsec) &
++ ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
++ DPRINTK("Misaligned I/O request from domain %d",
++ blkif->domid);
++ goto fail_put_bio;
++ }
++ /* On first use, or when bio_add_page() fails, start a fresh bio. */
++ while ((bio == NULL) ||
++ (bio_add_page(bio,
++ pending_page(pending_req, i),
++ seg[i].nsec << 9,
++ seg[i].buf & ~PAGE_MASK) == 0)) {
++ if (bio) {
++ atomic_inc(&pending_req->pendcnt);
++ submit_bio(operation, bio);
++ }
++
++ bio = bio_alloc(GFP_KERNEL, nseg-i);
++ if (unlikely(bio == NULL))
++ goto fail_put_bio;
++
++ bio->bi_bdev = preq.bdev;
++ bio->bi_private = pending_req;
++ bio->bi_end_io = end_block_io_op;
++ bio->bi_sector = preq.sector_number;
++ }
++
++ preq.sector_number += seg[i].nsec;
++ }
++ /* Only a zero-segment WRITE_BARRIER can get here without a bio. */
++ if (!bio) {
++ BUG_ON(operation != WRITE_BARRIER);
++ bio = bio_alloc(GFP_KERNEL, 0);
++ if (unlikely(bio == NULL))
++ goto fail_put_bio;
++
++ bio->bi_bdev = preq.bdev;
++ bio->bi_private = pending_req;
++ bio->bi_end_io = end_block_io_op;
++ bio->bi_sector = -1;
++ }
++
++ submit_bio(operation, bio);
++
++ if (operation == READ)
++ blkif->st_rd_sect += preq.nr_sects;
++ else if (operation == WRITE || operation == WRITE_BARRIER)
++ blkif->st_wr_sect += preq.nr_sects;
++
++ return;
++ /* Error exits: undo the mappings if any, then answer BLKIF_RSP_ERROR. */
++ fail_flush:
++ fast_flush_area(pending_req);
++ fail_response:
++ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
++ free_req(pending_req);
++ msleep(1); /* back off a bit */
++ return;
++
++ fail_put_bio:
++ __end_block_io_op(pending_req, -EINVAL);
++ if (bio)
++ bio_put(bio);
++ unplug_queue(blkif);
++ msleep(1); /* back off a bit */
++ return;
++}
++
++
++
++/******************************************************************
++ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
++ */
++
++
++static void make_response(blkif_t *blkif, u64 id,
++ unsigned short op, int st)
++{
++ struct blkif_response resp;
++ unsigned long flags;
++ union blkif_back_rings *blk_rings = &blkif->blk_rings;
++ int more_to_do = 0;
++ int notify;
++ memset(&resp, 0, sizeof(resp)); /* resp is copied whole to the guest: no stale stack bytes */
++ resp.id = id;
++ resp.operation = op;
++ resp.status = st;
++ /* Queue the response on the ring under blk_ring_lock, then notify. */
++ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
++ /* Place on the response ring for the relevant domain. */
++ switch (blkif->blk_protocol) {
++ case BLKIF_PROTOCOL_NATIVE:
++ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
++ &resp, sizeof(resp));
++ break;
++ case BLKIF_PROTOCOL_X86_32:
++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
++ &resp, sizeof(resp));
++ break;
++ case BLKIF_PROTOCOL_X86_64:
++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
++ &resp, sizeof(resp));
++ break;
++ default:
++ BUG();
++ }
++ blk_rings->common.rsp_prod_pvt++;
++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
++ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
++ /*
++ * Tail check for pending requests. Allows frontend to avoid
++ * notifications if requests are already in flight (lower
++ * overheads and promotes batching).
++ */
++ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
++
++ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
++ more_to_do = 1;
++ }
++
++ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
++ /* Outside the lock: kick our worker and/or the frontend as needed. */
++ if (more_to_do)
++ blkif_notify_work(blkif);
++ if (notify)
++ notify_remote_via_irq(blkif->irq);
++}
++
++static int __init blkif_init(void)
++{
++ int i, mmap_pages;
++ int rc = 0;
++ /* Module init: allocate the request pool, then set up interface and xenbus. */
++ if (!xen_pv_domain())
++ return -ENODEV;
++
++ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
++
++ pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
++ blkif_reqs, GFP_KERNEL);
++ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
++ mmap_pages, GFP_KERNEL);
++ pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
++
++ /* Failures from here to the pagemap setup must not return 0. */
++ rc = -ENOMEM;
++ if (!pending_reqs || !pending_grant_handles || !pending_pages)
++ goto out_of_memory;
++
++ if (blkback_pagemap_init(mmap_pages))
++ goto out_of_memory;
++
++ for (i = 0; i < mmap_pages; i++)
++ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
++
++ rc = blkif_interface_init();
++ if (rc)
++ goto failed_init;
++ /* Clear the whole array; sizeof(pending_reqs) is only a pointer's size. */
++ memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
++ INIT_LIST_HEAD(&pending_free);
++
++ for (i = 0; i < blkif_reqs; i++)
++ list_add_tail(&pending_reqs[i].free_list, &pending_free);
++
++ rc = blkif_xenbus_init();
++ if (rc)
++ goto failed_init;
++
++ return 0;
++
++ out_of_memory:
++ printk(KERN_ERR "%s: out of memory\n", __func__);
++ failed_init:
++ kfree(pending_reqs);
++ kfree(pending_grant_handles);
++ free_empty_pages_and_pagevec(pending_pages, mmap_pages);
++ return rc;
++}
++
++module_init(blkif_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h
+new file mode 100644
+index 0000000..af43d63
+--- /dev/null
++++ b/drivers/xen/blkback/common.h
+@@ -0,0 +1,139 @@
++/*
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __BLKIF__BACKEND__COMMON_H__
++#define __BLKIF__BACKEND__COMMON_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/blkdev.h>
++#include <linux/vmalloc.h>
++#include <linux/wait.h>
++#include <asm/io.h>
++#include <asm/setup.h>
++#include <asm/pgalloc.h>
++#include <asm/hypervisor.h>
++#include <xen/blkif.h>
++#include <xen/grant_table.h>
++#include <xen/xenbus.h>
++#include "blkback-pagemap.h"
++
++/* Debug print via pr_debug(), prefixed with the source file and line. */
++#define DPRINTK(_f, _a...) \
++ pr_debug("(file=%s, line=%d) " _f, \
++ __FILE__ , __LINE__ , ## _a )
++
++struct vbd {
++ blkif_vdev_t handle; /* what the domain refers to this vbd as */
++ unsigned char readonly; /* Non-zero -> read-only */
++ unsigned char type; /* VDISK_xxx */
++ u32 pdevice; /* phys device that this vbd maps to */
++ struct block_device *bdev; /* set by vbd_create(), released by vbd_free() */
++};
++
++struct backend_info;
++
++typedef struct blkif_st {
++ /* Unique identifier for this interface. */
++ domid_t domid;
++ unsigned int handle;
++ /* Physical parameters of the comms window. */
++ unsigned int irq;
++ /* Comms information. */
++ enum blkif_protocol blk_protocol;
++ union blkif_back_rings blk_rings;
++ struct vm_struct *blk_ring_area;
++ /* The VBD attached to this interface. */
++ struct vbd vbd;
++ /* Back pointer to the backend_info. */
++ struct backend_info *be;
++ /* Private fields. */
++ spinlock_t blk_ring_lock;
++ atomic_t refcnt; /* see blkif_get() / blkif_put() */
++
++ wait_queue_head_t wq;
++ struct task_struct *xenblkd; /* kthread running blkif_schedule() */
++ unsigned int waiting_reqs;
++ struct request_queue *plug;
++
++ /* statistics */
++ unsigned long st_print;
++ int st_rd_req;
++ int st_wr_req;
++ int st_oo_req; /* times alloc_req() found no free pending_req */
++ int st_br_req;
++ int st_rd_sect;
++ int st_wr_sect;
++
++ wait_queue_head_t waiting_to_free; /* woken when refcnt drops to zero */
++
++ grant_handle_t shmem_handle; /* handle from mapping the ring page */
++ grant_ref_t shmem_ref; /* grant reference of the ring page */
++} blkif_t;
++
++blkif_t *blkif_alloc(domid_t domid);
++void blkif_disconnect(blkif_t *blkif);
++void blkif_free(blkif_t *blkif);
++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
++/* Reference counting; the final put wakes anyone on waiting_to_free. */
++#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
++#define blkif_put(_b) \
++ do { \
++ if (atomic_dec_and_test(&(_b)->refcnt)) \
++ wake_up(&(_b)->waiting_to_free);\
++ } while (0)
++
++/* Create a vbd. */
++int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
++ unsigned minor, int readonly, int cdrom);
++void vbd_free(struct vbd *vbd);
++
++unsigned long long vbd_size(struct vbd *vbd);
++unsigned int vbd_info(struct vbd *vbd);
++unsigned long vbd_secsize(struct vbd *vbd);
++
++struct phys_req {
++ unsigned short dev; /* vbd handle on input; vbd_translate() stores pdevice */
++ unsigned short nr_sects; /* total sectors over all segments */
++ struct block_device *bdev; /* filled in by vbd_translate() */
++ blkif_sector_t sector_number; /* first sector of the request */
++};
++
++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
++
++int blkif_interface_init(void);
++
++int blkif_xenbus_init(void);
++
++irqreturn_t blkif_be_int(int irq, void *dev_id);
++int blkif_schedule(void *arg);
++
++int blkback_barrier(struct xenbus_transaction xbt,
++ struct backend_info *be, int state);
++
++#endif /* __BLKIF__BACKEND__COMMON_H__ */
+diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c
+new file mode 100644
+index 0000000..e397a41
+--- /dev/null
++++ b/drivers/xen/blkback/interface.c
+@@ -0,0 +1,186 @@
++/******************************************************************************
++ * arch/xen/drivers/blkif/backend/interface.c
++ *
++ * Block-device interface management.
++ *
++ * Copyright (c) 2004, Keir Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++#include <xen/events.h>
++#include <xen/grant_table.h>
++#include <linux/kthread.h>
++
++static struct kmem_cache *blkif_cachep;
++
++blkif_t *blkif_alloc(domid_t domid)
++{
++ blkif_t *blkif;
++ /* Returns a zeroed blkif with refcnt 1, or ERR_PTR(-ENOMEM). */
++ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
++ if (!blkif)
++ return ERR_PTR(-ENOMEM);
++
++ memset(blkif, 0, sizeof(*blkif));
++ blkif->domid = domid;
++ spin_lock_init(&blkif->blk_ring_lock);
++ atomic_set(&blkif->refcnt, 1);
++ init_waitqueue_head(&blkif->wq);
++ blkif->st_print = jiffies;
++ init_waitqueue_head(&blkif->waiting_to_free);
++
++ return blkif;
++}
++
++static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
++{
++ struct gnttab_map_grant_ref op;
++ /* Map grant 'shared_page' from blkif->domid at blk_ring_area->addr. */
++ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
++ GNTMAP_host_map, shared_page, blkif->domid);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
++ BUG();
++ /* Returns the raw grant status on failure, 0 on success. */
++ if (op.status) {
++ DPRINTK(" Grant table operation failure !\n");
++ return op.status;
++ }
++ /* Remember ref and handle; unmap_frontend_page() uses the handle. */
++ blkif->shmem_ref = shared_page;
++ blkif->shmem_handle = op.handle;
++
++ return 0;
++}
++
++static void unmap_frontend_page(blkif_t *blkif)
++{
++ struct gnttab_unmap_grant_ref op;
++ /* Undo map_frontend_page() using the saved shmem_handle. */
++ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
++ GNTMAP_host_map, blkif->shmem_handle);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
++ BUG();
++}
++
++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
++{
++ int err;
++ /* Map the ring page, init the back ring, bind evtchn; 0 on success. */
++ /* Already connected through? */
++ if (blkif->irq)
++ return 0;
++
++ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
++ return -ENOMEM;
++ /* NOTE(review): err here is op.status (a grant status), not an errno. */
++ err = map_frontend_page(blkif, shared_page);
++ if (err) {
++ free_vm_area(blkif->blk_ring_area);
++ return err;
++ }
++ /* Initialise the back ring that matches blkif->blk_protocol. */
++ switch (blkif->blk_protocol) {
++ case BLKIF_PROTOCOL_NATIVE:
++ {
++ struct blkif_sring *sring;
++ sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
++ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
++ break;
++ }
++ case BLKIF_PROTOCOL_X86_32:
++ {
++ struct blkif_x86_32_sring *sring_x86_32;
++ sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
++ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
++ break;
++ }
++ case BLKIF_PROTOCOL_X86_64:
++ {
++ struct blkif_x86_64_sring *sring_x86_64;
++ sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
++ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
++ break;
++ }
++ default:
++ BUG();
++ }
++ /* A non-negative return is the irq; on failure undo the mapping. */
++ err = bind_interdomain_evtchn_to_irqhandler(
++ blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
++ if (err < 0)
++ {
++ unmap_frontend_page(blkif);
++ free_vm_area(blkif->blk_ring_area);
++ blkif->blk_rings.common.sring = NULL;
++ return err;
++ }
++ blkif->irq = err;
++
++ return 0;
++}
++
++void blkif_disconnect(blkif_t *blkif)
++{
++ if (blkif->xenblkd) {
++ kthread_stop(blkif->xenblkd);
++ blkif->xenblkd = NULL;
++ }
++ /* Drop our reference, wait until no others remain, then take it back. */
++ atomic_dec(&blkif->refcnt);
++ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
++ atomic_inc(&blkif->refcnt);
++
++ if (blkif->irq) {
++ unbind_from_irqhandler(blkif->irq, blkif);
++ blkif->irq = 0;
++ }
++ /* A non-NULL sring means the ring page is still mapped. */
++ if (blkif->blk_rings.common.sring) {
++ unmap_frontend_page(blkif);
++ free_vm_area(blkif->blk_ring_area);
++ blkif->blk_rings.common.sring = NULL;
++ }
++}
++
++void blkif_free(blkif_t *blkif)
++{
++ if (!atomic_dec_and_test(&blkif->refcnt)) /* caller must hold the last reference */
++ BUG();
++ kmem_cache_free(blkif_cachep, blkif);
++}
++
++int __init blkif_interface_init(void)
++{
++ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
++ 0, 0, NULL); /* cache that blkif_alloc() allocates from */
++ if (!blkif_cachep)
++ return -ENOMEM;
++
++ return 0;
++}
+diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c
+new file mode 100644
+index 0000000..410c2ea
+--- /dev/null
++++ b/drivers/xen/blkback/vbd.c
+@@ -0,0 +1,118 @@
++/******************************************************************************
++ * blkback/vbd.c
++ *
++ * Routines for managing virtual block devices (VBDs).
++ *
++ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++/* Sector count: the partition's nr_sects if bd_part is set, else the disk capacity. */
++#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
++ (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
++
++unsigned long long vbd_size(struct vbd *vbd)
++{
++ return vbd_sz(vbd); /* size in sectors; vbd->bdev must be set */
++}
++
++unsigned int vbd_info(struct vbd *vbd)
++{
++ return vbd->type | (vbd->readonly?VDISK_READONLY:0); /* VDISK_* flag word */
++}
++
++unsigned long vbd_secsize(struct vbd *vbd)
++{
++ return bdev_logical_block_size(vbd->bdev); /* logical block size of the backing device */
++}
++
++int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
++ unsigned minor, int readonly, int cdrom)
++{
++ struct vbd *vbd;
++ struct block_device *bdev;
++ /* Open major:minor and fill in blkif->vbd; returns 0 or -ENOENT. */
++ vbd = &blkif->vbd;
++ vbd->handle = handle;
++ vbd->readonly = readonly;
++ vbd->type = 0;
++
++ vbd->pdevice = MKDEV(major, minor);
++ /* The open mode must match the one vbd_free() passes to blkdev_put(). */
++ bdev = open_by_devnum(vbd->pdevice,
++ vbd->readonly ? FMODE_READ : FMODE_WRITE);
++
++ if (IS_ERR(bdev)) {
++ DPRINTK("vbd_creat: device %08x could not be opened.\n",
++ vbd->pdevice);
++ return -ENOENT;
++ }
++
++ vbd->bdev = bdev;
++
++ if (vbd->bdev->bd_disk == NULL) {
++ DPRINTK("vbd_creat: device %08x doesn't exist.\n",
++ vbd->pdevice);
++ vbd_free(vbd);
++ return -ENOENT;
++ }
++ /* Derive the VDISK_* type bits from the gendisk flags and 'cdrom'. */
++ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
++ vbd->type |= VDISK_CDROM;
++ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
++ vbd->type |= VDISK_REMOVABLE;
++
++ DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
++ handle, blkif->domid);
++ return 0;
++}
++
++void vbd_free(struct vbd *vbd)
++{
++ if (vbd->bdev) /* safe to call when nothing was opened */
++ blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
++ vbd->bdev = NULL;
++}
++
++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
++{
++ struct vbd *vbd = &blkif->vbd;
++ int rc = -EACCES;
++ /* Returns 0 and fills req->dev / req->bdev, or -EACCES if refused. */
++ if ((operation != READ) && vbd->readonly)
++ goto out;
++ /* Bounds check; the first test rejects an end sector that wrapped. */
++ if (unlikely(req->sector_number + req->nr_sects < req->sector_number ||
++ (req->sector_number + req->nr_sects) > vbd_sz(vbd)))
++ goto out;
++
++ req->dev = vbd->pdevice;
++ req->bdev = vbd->bdev;
++ rc = 0;
++
++ out:
++ return rc;
++}
+diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c
+new file mode 100644
+index 0000000..34f8e40
+--- /dev/null
++++ b/drivers/xen/blkback/xenbus.c
+@@ -0,0 +1,541 @@
++/* Xenbus code for blkif backend
++ Copyright (C) 2005 Rusty Russell <rusty at rustcorp.com.au>
++ Copyright (C) 2005 XenSource Ltd
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2 of the License, or
++ (at your option) any later version.
++
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++*/
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include "common.h"
++/* Replace common.h's DPRINTK: prefix with function name and line, append ".\n". */
++#undef DPRINTK
++#define DPRINTK(fmt, args...) \
++ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \
++ __FUNCTION__, __LINE__, ##args)
++
++struct backend_info
++{
++ struct xenbus_device *dev;
++ blkif_t *blkif;
++ struct xenbus_watch backend_watch; /* watch on <nodename>/physical-device */
++ unsigned major; /* both zero until a vbd has been created */
++ unsigned minor;
++ char *mode; /* "mode" node contents as returned by xenbus_read() */
++};
++
++static void connect(struct backend_info *);
++static int connect_ring(struct backend_info *);
++static void backend_changed(struct xenbus_watch *, const char **,
++ unsigned int);
++
++static int blkback_name(blkif_t *blkif, char *buf)
++{
++ char *devpath, *devname;
++ struct xenbus_device *dev = blkif->be->dev;
++ /* Write "blkback.<domid>.<dev>" into buf (at most TASK_COMM_LEN bytes). */
++ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
++ if (IS_ERR(devpath))
++ return PTR_ERR(devpath);
++ /* Use what follows "/dev/" when present, else the whole string. */
++ if ((devname = strstr(devpath, "/dev/")) != NULL)
++ devname += strlen("/dev/");
++ else
++ devname = devpath;
++
++ snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
++ kfree(devpath);
++
++ return 0;
++}
++
++static void update_blkif_status(blkif_t *blkif)
++{
++ int err;
++ char name[TASK_COMM_LEN];
++ /* Once irq and bdev both exist, go Connected and start the xenblkd thread. */
++ /* Not ready to connect? */
++ if (!blkif->irq || !blkif->vbd.bdev)
++ return;
++
++ /* Already connected? */
++ if (blkif->be->dev->state == XenbusStateConnected)
++ return;
++
++ /* Attempt to connect: exit if we fail to. */
++ connect(blkif->be);
++ if (blkif->be->dev->state != XenbusStateConnected)
++ return;
++ /* The thread is named by blkback_name(). */
++ err = blkback_name(blkif, name);
++ if (err) {
++ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
++ return;
++ }
++
++ blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
++ if (IS_ERR(blkif->xenblkd)) {
++ err = PTR_ERR(blkif->xenblkd);
++ blkif->xenblkd = NULL;
++ xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
++ }
++}
++
++
++/****************************************************************
++ * sysfs interface for VBD I/O requests
++ */
++/* Define show_<name>(), which sprintf()s 'format' with 'args', plus its S_IRUGO DEVICE_ATTR. */
++#define VBD_SHOW(name, format, args...) \
++ static ssize_t show_##name(struct device *_dev, \
++ struct device_attribute *attr, \
++ char *buf) \
++ { \
++ struct xenbus_device *dev = to_xenbus_device(_dev); \
++ struct backend_info *be = dev_get_drvdata(&dev->dev); \
++ \
++ return sprintf(buf, format, ##args); \
++ } \
++ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
++
++VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
++VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
++VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
++VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req);
++VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
++VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
++
++static struct attribute *vbdstat_attrs[] = {
++ &dev_attr_oo_req.attr,
++ &dev_attr_rd_req.attr,
++ &dev_attr_wr_req.attr,
++ &dev_attr_br_req.attr,
++ &dev_attr_rd_sect.attr,
++ &dev_attr_wr_sect.attr,
++ NULL
++};
++
++static struct attribute_group vbdstat_group = {
++ .name = "statistics",
++ .attrs = vbdstat_attrs,
++};
++
++VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
++VBD_SHOW(mode, "%s\n", be->mode);
++
++int xenvbd_sysfs_addif(struct xenbus_device *dev)
++{
++ int error;
++ /* Create the per-device sysfs entries; on failure undo only what exists. */
++ error = device_create_file(&dev->dev, &dev_attr_physical_device);
++ if (error)
++ goto fail1;
++
++ error = device_create_file(&dev->dev, &dev_attr_mode);
++ if (error)
++ goto fail2;
++
++ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
++ if (error)
++ goto fail3;
++
++ return 0;
++
++fail3: device_remove_file(&dev->dev, &dev_attr_mode);
++fail2: device_remove_file(&dev->dev, &dev_attr_physical_device);
++fail1: /* nothing was created, nothing to undo */
++ return error;
++}
++
++void xenvbd_sysfs_delif(struct xenbus_device *dev)
++{ /* Removes everything xenvbd_sysfs_addif() creates, in reverse order. */
++ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
++ device_remove_file(&dev->dev, &dev_attr_mode);
++ device_remove_file(&dev->dev, &dev_attr_physical_device);
++}
++
++static int blkback_remove(struct xenbus_device *dev)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++ /* Tear down whatever probe/backend_changed managed to set up. */
++ DPRINTK("");
++
++ if (be->major || be->minor)
++ xenvbd_sysfs_delif(dev);
++
++ if (be->backend_watch.node) {
++ unregister_xenbus_watch(&be->backend_watch);
++ kfree(be->backend_watch.node);
++ be->backend_watch.node = NULL;
++ }
++
++ if (be->blkif) {
++ blkif_disconnect(be->blkif);
++ vbd_free(&be->blkif->vbd);
++ blkif_free(be->blkif);
++ be->blkif = NULL;
++ }
++ kfree(be->mode); /* xenbus_read() buffer from backend_changed(); may be NULL */
++ kfree(be);
++ dev_set_drvdata(&dev->dev, NULL);
++ return 0;
++}
++
++int blkback_barrier(struct xenbus_transaction xbt,
++ struct backend_info *be, int state)
++{
++ struct xenbus_device *dev = be->dev;
++ int err;
++ /* Write feature-barrier=<state> under our node; report fatal on error. */
++ err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
++ "%d", state);
++ if (err)
++ xenbus_dev_fatal(dev, err, "writing feature-barrier");
++
++ return err;
++}
++
++/**
++ * Entry point to this code when a new device is created. Allocate the basic
++ * structures, and watch the store waiting for the hotplug scripts to tell us
++ * the device's physical major and minor numbers. Switch to InitWait.
++ */
++static int blkback_probe(struct xenbus_device *dev,
++ const struct xenbus_device_id *id)
++{
++ int err;
++ struct backend_info *be = kzalloc(sizeof(struct backend_info),
++ GFP_KERNEL);
++ if (!be) {
++ xenbus_dev_fatal(dev, -ENOMEM,
++ "allocating backend structure");
++ return -ENOMEM;
++ }
++ be->dev = dev;
++ dev_set_drvdata(&dev->dev, be);
++
++ be->blkif = blkif_alloc(dev->otherend_id);
++ if (IS_ERR(be->blkif)) {
++ err = PTR_ERR(be->blkif);
++ be->blkif = NULL;
++ xenbus_dev_fatal(dev, err, "creating block interface");
++ goto fail;
++ }
++
++ /* setup back pointer */
++ be->blkif->be = be;
++ /* Register backend_changed() as the watch on <nodename>/physical-device. */
++ err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
++ "%s/%s", dev->nodename, "physical-device");
++ if (err)
++ goto fail;
++
++ err = xenbus_switch_state(dev, XenbusStateInitWait);
++ if (err)
++ goto fail;
++
++ return 0;
++ /* blkback_remove() checks each piece, so it copes with partial setup. */
++fail:
++ DPRINTK("failed");
++ blkback_remove(dev);
++ return err;
++}
++
++
++/**
++ * Callback received when the hotplug scripts have placed the physical-device
++ * node. Read it and the mode node, and create a vbd. If the frontend is
++ * ready, connect.
++ */
++static void backend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ int err;
++ unsigned major;
++ unsigned minor;
++ struct backend_info *be
++ = container_of(watch, struct backend_info, backend_watch);
++ struct xenbus_device *dev = be->dev;
++ int cdrom = 0;
++ char *device_type;
++
++ DPRINTK("");
++
++ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
++ &major, &minor);
++ if (XENBUS_EXIST_ERR(err)) {
++ /* Since this watch will fire once immediately after it is
++ registered, we expect this. Ignore it, and wait for the
++ hotplug scripts. */
++ return;
++ }
++ if (err != 2) {
++ xenbus_dev_fatal(dev, err, "reading physical-device");
++ return;
++ }
++ /* Once recorded, the physical device may not change. */
++ if ((be->major || be->minor) &&
++ ((be->major != major) || (be->minor != minor))) {
++ printk(KERN_WARNING
++ "blkback: changing physical device (from %x:%x to "
++ "%x:%x) not supported.\n", be->major, be->minor,
++ major, minor);
++ return;
++ }
++ /* NOTE(review): any previous be->mode string is overwritten without kfree(). */
++ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
++ if (IS_ERR(be->mode)) {
++ err = PTR_ERR(be->mode);
++ be->mode = NULL;
++ xenbus_dev_fatal(dev, err, "reading mode");
++ return;
++ }
++ /* A failed device-type read is ignored; only "cdrom" sets the flag. */
++ device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
++ if (!IS_ERR(device_type)) {
++ cdrom = strcmp(device_type, "cdrom") == 0;
++ kfree(device_type);
++ }
++
++ if (be->major == 0 && be->minor == 0) {
++ /* Front end dir is a number, which is used as the handle. */
++
++ char *p = strrchr(dev->otherend, '/') + 1;
++ long handle = simple_strtoul(p, NULL, 0);
++
++ be->major = major;
++ be->minor = minor;
++ /* The vbd is read-only unless the mode string contains 'w'. */
++ err = vbd_create(be->blkif, handle, major, minor,
++ (NULL == strchr(be->mode, 'w')), cdrom);
++ if (err) {
++ be->major = be->minor = 0;
++ xenbus_dev_fatal(dev, err, "creating vbd structure");
++ return;
++ }
++
++ err = xenvbd_sysfs_addif(dev);
++ if (err) {
++ vbd_free(&be->blkif->vbd);
++ be->major = be->minor = 0;
++ xenbus_dev_fatal(dev, err, "creating sysfs entries");
++ return;
++ }
++
++ /* We're potentially connected now */
++ update_blkif_status(be->blkif);
++ }
++}
++
++
++/**
++ * Callback received when the frontend's state changes.
++ */
++static void frontend_changed(struct xenbus_device *dev,
++ enum xenbus_state frontend_state)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++ int err;
++
++ DPRINTK("%s", xenbus_strstate(frontend_state));
++
++ switch (frontend_state) {
++ case XenbusStateInitialising:
++ if (dev->state == XenbusStateClosed) {
++ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++ __FUNCTION__, dev->nodename);
++ xenbus_switch_state(dev, XenbusStateInitWait);
++ }
++ break;
++
++ case XenbusStateInitialised:
++ case XenbusStateConnected:
++ /* Ensure we connect even when two watches fire in
++ close succession and we miss the intermediate value
++ of frontend_state. */
++ if (dev->state == XenbusStateConnected)
++ break;
++
++ err = connect_ring(be);
++ if (err)
++ break;
++ update_blkif_status(be->blkif);
++ break;
++ /* Frontend is closing: disconnect the interface and follow its state. */
++ case XenbusStateClosing:
++ blkif_disconnect(be->blkif);
++ xenbus_switch_state(dev, XenbusStateClosing);
++ break;
++
++ case XenbusStateClosed:
++ xenbus_switch_state(dev, XenbusStateClosed);
++ if (xenbus_dev_is_online(dev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ device_unregister(&dev->dev);
++ break;
++
++ default:
++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++ frontend_state);
++ break;
++ }
++}
++
++
++/* ** Connection ** */
++
++
++/**
++ * Write the physical details regarding the block device to the store, and
++ * switch to Connected state.
++ *
++ * After blkback_barrier() has been called, the "sectors", "info" and
++ * "sector-size" nodes are written under dev->nodename inside one xenbus
++ * transaction.  The whole transaction is retried from the start when ending
++ * it returns -EAGAIN; any write failure aborts it.  All failures are reported
++ * through xenbus_dev_fatal(); nothing is returned to the caller.
++ */
++static void connect(struct backend_info *be)
++{
++	struct xenbus_transaction xbt;
++	int err;
++	struct xenbus_device *dev = be->dev;
++
++	DPRINTK("%s", dev->otherend);
++
++	/* Supply the information about the device the frontend needs */
++again:
++	err = xenbus_transaction_start(&xbt);
++	if (err) {
++		xenbus_dev_fatal(dev, err, "starting transaction");
++		return;
++	}
++
++	err = blkback_barrier(xbt, be, 1);
++	if (err)
++		goto abort;
++
++	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
++			    vbd_size(&be->blkif->vbd));
++	if (err) {
++		xenbus_dev_fatal(dev, err, "writing %s/sectors",
++				 dev->nodename);
++		goto abort;
++	}
++
++	/* FIXME: use a typename instead */
++	err = xenbus_printf(xbt, dev->nodename, "info", "%u",
++			    vbd_info(&be->blkif->vbd));
++	if (err) {
++		xenbus_dev_fatal(dev, err, "writing %s/info",
++				 dev->nodename);
++		goto abort;
++	}
++	err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
++			    vbd_secsize(&be->blkif->vbd));
++	if (err) {
++		xenbus_dev_fatal(dev, err, "writing %s/sector-size",
++				 dev->nodename);
++		goto abort;
++	}
++
++	err = xenbus_transaction_end(xbt, 0);
++	if (err == -EAGAIN)
++		goto again;
++	/* A non-retryable failure is reported, but the switch is still tried. */
++	if (err)
++		xenbus_dev_fatal(dev, err, "ending transaction");
++
++	err = xenbus_switch_state(dev, XenbusStateConnected);
++	if (err)
++		/* The format now consumes the nodename argument it is given. */
++		xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
++				 dev->nodename);
++
++	return;
++abort:
++	xenbus_transaction_end(xbt, 1);
++}
++
++
++/*
++ * Read the frontend's ring reference, event channel and (optional) ABI
++ * protocol from the store and map the shared ring.
++ *
++ * A missing "protocol" node selects BLKIF_PROTOCOL_NATIVE.  Returns 0 on
++ * success or a negative value on failure; every failure is also reported
++ * through xenbus_dev_fatal().
++ */
++static int connect_ring(struct backend_info *be)
++{
++	struct xenbus_device *dev = be->dev;
++	unsigned long ring_ref;
++	unsigned int evtchn;
++	char protocol[64] = "";
++	int err;
++
++	DPRINTK("%s", dev->otherend);
++
++	err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
++			    "event-channel", "%u", &evtchn, NULL);
++	if (err) {
++		xenbus_dev_fatal(dev, err,
++				 "reading %s/ring-ref and event-channel",
++				 dev->otherend);
++		return err;
++	}
++
++	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
++			    "%63s", protocol, NULL);
++	if (err)
++		strcpy(protocol, "unspecified, assuming native");
++	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
++		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
++		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
++	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
++		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
++	else {
++		/*
++		 * err is 0 on this path (the read succeeded), so report and
++		 * return a real errno instead of "error 0" and a bare -1.
++		 */
++		xenbus_dev_fatal(dev, -ENOSYS, "unknown fe protocol %s",
++				 protocol);
++		return -ENOSYS;
++	}
++	/* ring_ref and evtchn are unsigned, hence %lu / %u. */
++	printk(KERN_INFO
++	       "blkback: ring-ref %lu, event-channel %u, protocol %d (%s)\n",
++	       ring_ref, evtchn, be->blkif->blk_protocol, protocol);
++
++	/* Map the shared frame, irq etc. */
++	err = blkif_map(be->blkif, ring_ref, evtchn);
++	if (err) {
++		xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
++				 ring_ref, evtchn);
++		return err;
++	}
++
++	return 0;
++}
++
++
++/* ** Driver Registration ** */
++
++
++/*
++ * Device types handled by this driver ("vbd"); the trailing empty string
++ * presumably terminates the table -- confirm against the xenbus core.
++ */
++static const struct xenbus_device_id blkback_ids[] = {
++ { "vbd" },
++ { "" }
++};
++
++
++/* Driver descriptor registered by blkif_xenbus_init(). */
++static struct xenbus_driver blkback = {
++ .name = "vbd",
++ .owner = THIS_MODULE,
++ .ids = blkback_ids,
++ .probe = blkback_probe,
++ .remove = blkback_remove,
++ .otherend_changed = frontend_changed
++};
++
++
++/*
++ * Register the blkback driver as a xenbus backend.
++ * Returns the result of xenbus_register_backend() unchanged.
++ */
++int blkif_xenbus_init(void)
++{
++ return xenbus_register_backend(&blkback);
++}
+diff --git a/drivers/xen/blktap/Makefile b/drivers/xen/blktap/Makefile
+new file mode 100644
+index 0000000..99ff53c
+--- /dev/null
++++ b/drivers/xen/blktap/Makefile
+@@ -0,0 +1,3 @@
++obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
++
++blktap-objs := control.o ring.o wait_queue.o device.o request.o sysfs.o
+diff --git a/drivers/xen/blktap/blktap.h b/drivers/xen/blktap/blktap.h
+new file mode 100644
+index 0000000..db4cf02
+--- /dev/null
++++ b/drivers/xen/blktap/blktap.h
+@@ -0,0 +1,253 @@
++#ifndef _BLKTAP_H_
++#define _BLKTAP_H_
++
++#include <linux/mm.h>
++#include <linux/fs.h>
++#include <linux/cdev.h>
++#include <linux/init.h>
++#include <linux/scatterlist.h>
++#include <xen/blkif.h>
++#include <xen/grant_table.h>
++
++//#define ENABLE_PASSTHROUGH
++
++extern int blktap_debug_level;
++
++/*
++ * Logging helpers.  BTPRINTK prints only when blktap_debug_level is above
++ * @level, and then only if @force is set or printk_ratelimit() allows it.
++ * The calling function's name is prefixed to every message.
++ */
++#define BTPRINTK(level, tag, force, _f, _a...) \
++ do { \
++ if (blktap_debug_level > level && \
++ (force || printk_ratelimit())) \
++ printk(tag "%s: " _f, __func__, ##_a); \
++ } while (0)
++
++/* BTDBG needs debug level > 8 and is never rate limited; the rest need > 0. */
++#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
++#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
++#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
++#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
++
++/* Number of slots in the blktaps[] table. */
++#define MAX_BLKTAP_DEVICE 256
++
++/*
++ * Bit numbers within blktap.dev_inuse.  The driver uses set_bit()/test_bit()
++ * with BLKTAP_CONTROL, BLKTAP_RING_VMA, BLKTAP_DEVICE and
++ * BLKTAP_SHUTDOWN_REQUESTED; the others are presumably used the same way.
++ */
++#define BLKTAP_CONTROL 1
++#define BLKTAP_RING_FD 2
++#define BLKTAP_RING_VMA 3
++#define BLKTAP_DEVICE 4
++#define BLKTAP_PAUSE_REQUESTED 6
++#define BLKTAP_PAUSED 7
++#define BLKTAP_SHUTDOWN_REQUESTED 8
++#define BLKTAP_PASSTHROUGH 9
++#define BLKTAP_DEFERRED 10
++
++/* blktap IOCTLs: */
++#define BLKTAP2_IOCTL_KICK_FE 1
++#define BLKTAP2_IOCTL_ALLOC_TAP 200
++#define BLKTAP2_IOCTL_FREE_TAP 201
++#define BLKTAP2_IOCTL_CREATE_DEVICE 202
++#define BLKTAP2_IOCTL_SET_PARAMS 203
++#define BLKTAP2_IOCTL_PAUSE 204
++#define BLKTAP2_IOCTL_REOPEN 205
++#define BLKTAP2_IOCTL_RESUME 206
++
++/* Size of the name buffer in struct blktap_params. */
++#define BLKTAP2_MAX_MESSAGE_LEN 256
++
++#define BLKTAP2_RING_MESSAGE_PAUSE 1
++#define BLKTAP2_RING_MESSAGE_RESUME 2
++#define BLKTAP2_RING_MESSAGE_CLOSE 3
++
++/* Values of blktap_request.status. */
++#define BLKTAP_REQUEST_FREE 0
++#define BLKTAP_REQUEST_PENDING 1
++
++/*
++ * The maximum number of requests that can be outstanding at any time
++ * is determined by
++ *
++ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
++ *
++ * where mmap_alloc < MAX_DYNAMIC_MEM.
++ *
++ * TODO:
++ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
++ * sysfs.
++ *
++ * MMAP_VADDR(_start, _req, _seg) yields the address of segment _seg of
++ * request slot _req inside a mapping that begins at _start; each slot
++ * spans BLKIF_MAX_SEGMENTS_PER_REQUEST pages.
++ */
++#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
++#define MAX_DYNAMIC_MEM BLK_RING_SIZE
++#define MAX_PENDING_REQS BLK_RING_SIZE
++#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
++/* Every argument is parenthesised so expression arguments expand safely. */
++#define MMAP_VADDR(_start, _req, _seg) \
++	((_start) + \
++	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
++	 ((_seg) * PAGE_SIZE))
++
++/*
++ * Reference counting on blktap.refcnt.  Dropping the last reference wakes
++ * any sleeper on the tap's wait queue.
++ */
++#define blktap_get(_b) (atomic_inc(&(_b)->refcnt))
++#define blktap_put(_b) \
++ do { \
++ if (atomic_dec_and_test(&(_b)->refcnt)) \
++ wake_up(&(_b)->wq); \
++ } while (0)
++
++struct blktap;
++
++/*
++ * Grant handles of one request segment: one for the kernel-address mapping
++ * and one for the user-space (pte) mapping.  INVALID_GRANT_HANDLE marks a
++ * handle that does not refer to a grant mapping.
++ */
++struct grant_handle_pair {
++ grant_handle_t kernel;
++ grant_handle_t user;
++};
++#define INVALID_GRANT_HANDLE 0xFFFF
++
++/*
++ * Result of BLKTAP2_IOCTL_ALLOC_TAP, copied to user space: the ring and
++ * device major numbers plus the minor of the newly allocated tap.
++ */
++struct blktap_handle {
++ unsigned int ring;
++ unsigned int device;
++ unsigned int minor;
++};
++
++/*
++ * Device parameters, checked by blktap_validate_params().
++ * NOTE(review): capacity is presumably in sectors and sector_size in bytes --
++ * confirm against the code that consumes them.
++ */
++struct blktap_params {
++ char name[BLKTAP2_MAX_MESSAGE_LEN];
++ unsigned long long capacity;
++ unsigned long sector_size;
++};
++
++/* Block-device side of a tap (embedded in struct blktap as .device). */
++struct blktap_device {
++ int users; /* open count: ++ in open(), -- in release() */
++ spinlock_t lock;
++ struct gendisk *gd;
++
++#ifdef ENABLE_PASSTHROUGH
++ struct block_device *bdev;
++#endif
++};
++
++/* Ring side of a tap (embedded in struct blktap as .ring). */
++struct blktap_ring {
++ struct vm_area_struct *vma; /* device.c skips unmapping when this is NULL */
++ struct blkif_front_ring ring; /* requests are produced onto this ring */
++ struct vm_foreign_map foreign_map; /* .map[] indexed by page offset from vma->vm_start */
++ unsigned long ring_vstart;
++ unsigned long user_vstart; /* base address handed to MMAP_VADDR() */
++
++ int response;
++
++ wait_queue_head_t poll_wait;
++
++ dev_t devno;
++ struct device *dev;
++ atomic_t sysfs_refcnt;
++ struct mutex sysfs_mutex;
++};
++
++/*
++ * Per-tap statistics (struct blktap.stats).
++ * NOTE(review): no visible code updates these; the rd/wr request, sector and
++ * latency meanings are inferred from the field names only -- confirm.
++ */
++struct blktap_statistics {
++ unsigned long st_print;
++ int st_rd_req;
++ int st_wr_req;
++ int st_oo_req;
++ int st_rd_sect;
++ int st_wr_sect;
++ s64 st_rd_cnt;
++ s64 st_rd_sum_usecs;
++ s64 st_rd_max_usecs;
++ s64 st_wr_cnt;
++ s64 st_wr_sum_usecs;
++ s64 st_wr_max_usecs;
++};
++
++/* State of one in-flight request between the block layer and the ring. */
++struct blktap_request {
++ uint64_t id; /* the block layer's struct request pointer, cast to integer */
++ uint16_t usr_idx; /* slot index: used as ring request id and for MMAP_VADDR() */
++
++ uint8_t status; /* BLKTAP_REQUEST_FREE or BLKTAP_REQUEST_PENDING */
++ atomic_t pendcnt;
++ uint8_t nr_pages; /* number of segments mapped for this request */
++ unsigned short operation; /* BLKIF_OP_READ or BLKIF_OP_WRITE */
++
++ struct timeval time; /* stamped with do_gettimeofday() when processed */
++ struct grant_handle_pair handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ struct list_head free_list;
++};
++
++/* One tap instance; instances live in the blktaps[] table. */
++struct blktap {
++ int minor; /* index of this tap in blktaps[] */
++ pid_t pid;
++ atomic_t refcnt; /* see blktap_get()/blktap_put() */
++ unsigned long dev_inuse; /* bitmask of BLKTAP_* bit numbers */
++
++ struct blktap_params params;
++
++ struct rw_semaphore tap_sem;
++
++ struct blktap_ring ring;
++ struct blktap_device device;
++
++ int pending_cnt;
++ struct blktap_request *pending_requests[MAX_PENDING_REQS]; /* indexed by usr_idx */
++ struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; /* filled by blk_rq_map_sg() */
++
++ wait_queue_head_t wq; /* woken by blktap_put(); waited on during destroy */
++ struct list_head deferred_queue;
++
++ struct blktap_statistics stats;
++};
++
++extern struct blktap *blktaps[MAX_BLKTAP_DEVICE];
++
++/* Non-zero when the BLKTAP_RING_VMA bit is set in tap->dev_inuse. */
++static inline int
++blktap_active(struct blktap *tap)
++{
++ return test_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
++}
++
++/*
++ * Force NUL termination of params->name and log the parameters.
++ * No real validation is done yet (see TODO); always returns 0.
++ * @tap is currently unused.
++ */
++static inline int
++blktap_validate_params(struct blktap *tap, struct blktap_params *params)
++{
++ /* TODO: sanity check */
++ params->name[sizeof(params->name) - 1] = '\0';
++ BTINFO("%s: capacity: %llu, sector-size: %lu\n",
++ params->name, params->capacity, params->sector_size);
++ return 0;
++}
++
++int blktap_control_destroy_device(struct blktap *);
++
++int blktap_ring_init(int *);
++int blktap_ring_free(void);
++int blktap_ring_create(struct blktap *);
++int blktap_ring_destroy(struct blktap *);
++int blktap_ring_pause(struct blktap *);
++int blktap_ring_resume(struct blktap *);
++void blktap_ring_kick_user(struct blktap *);
++
++int blktap_sysfs_init(void);
++void blktap_sysfs_free(void);
++int blktap_sysfs_create(struct blktap *);
++int blktap_sysfs_destroy(struct blktap *);
++
++int blktap_device_init(int *);
++void blktap_device_free(void);
++int blktap_device_create(struct blktap *);
++int blktap_device_destroy(struct blktap *);
++int blktap_device_pause(struct blktap *);
++int blktap_device_resume(struct blktap *);
++void blktap_device_restart(struct blktap *);
++void blktap_device_finish_request(struct blktap *,
++ struct blkif_response *,
++ struct blktap_request *);
++void blktap_device_fail_pending_requests(struct blktap *);
++#ifdef ENABLE_PASSTHROUGH
++int blktap_device_enable_passthrough(struct blktap *,
++ unsigned, unsigned);
++#endif
++
++void blktap_defer(struct blktap *);
++void blktap_run_deferred(void);
++
++int blktap_request_pool_init(void);
++void blktap_request_pool_free(void);
++int blktap_request_pool_grow(void);
++int blktap_request_pool_shrink(void);
++struct blktap_request *blktap_request_allocate(struct blktap *);
++void blktap_request_free(struct blktap *, struct blktap_request *);
++struct page *request_to_page(struct blktap_request *, int);
++
++/* Kernel virtual address of the page backing segment @seg of @req. */
++static inline unsigned long
++request_to_kaddr(struct blktap_request *req, int seg)
++{
++	struct page *seg_page = request_to_page(req, seg);
++
++	return (unsigned long)pfn_to_kaddr(page_to_pfn(seg_page));
++}
++
++#endif
+diff --git a/drivers/xen/blktap/control.c b/drivers/xen/blktap/control.c
+new file mode 100644
+index 0000000..a4852f7
+--- /dev/null
++++ b/drivers/xen/blktap/control.c
+@@ -0,0 +1,284 @@
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/miscdevice.h>
++
++#include <asm/uaccess.h>
++
++#include "blktap.h"
++
++/* Held while the allocation paths scan or update blktaps[]. */
++static DEFINE_SPINLOCK(blktap_control_lock);
++/* Table of taps, indexed by minor number. */
++struct blktap *blktaps[MAX_BLKTAP_DEVICE];
++
++/* Filled in at module init; reported to user space by BLKTAP2_IOCTL_ALLOC_TAP. */
++static int ring_major;
++static int device_major;
++/* Set once misc_register() succeeded, so teardown knows to deregister. */
++static int blktap_control_registered;
++
++/*
++ * Reset @tap to a pristine state, keeping only its minor number, and mark
++ * it as claimed by the control interface (BLKTAP_CONTROL bit).
++ */
++static void
++blktap_control_initialize_tap(struct blktap *tap)
++{
++	const int saved_minor = tap->minor;
++
++	/* Wipe the whole structure, then rebuild the embedded primitives. */
++	memset(tap, 0, sizeof(*tap));
++	set_bit(BLKTAP_CONTROL, &tap->dev_inuse);
++
++	init_rwsem(&tap->tap_sem);
++	init_waitqueue_head(&tap->wq);
++	atomic_set(&tap->refcnt, 0);
++	sg_init_table(tap->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++	tap->minor = saved_minor;
++}
++
++/*
++ * Allocate a new tap, initialise it and publish it in the first free
++ * blktaps[] slot.
++ *
++ * Returns the new tap, or NULL when memory allocation fails or every slot
++ * is already occupied (the allocation is freed again in that case).
++ */
++static struct blktap *
++blktap_control_create_tap(void)
++{
++	int minor;
++	struct blktap *tap;
++
++	/*
++	 * Zeroed allocation: blktap_control_initialize_tap() reads
++	 * tap->minor before wiping the structure, so the memory must not
++	 * be uninitialised.
++	 */
++	tap = kzalloc(sizeof(*tap), GFP_KERNEL);
++	if (unlikely(!tap))
++		return NULL;
++
++	blktap_control_initialize_tap(tap);
++
++	spin_lock_irq(&blktap_control_lock);
++	for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++)
++		if (!blktaps[minor])
++			break;
++
++	if (minor == MAX_BLKTAP_DEVICE) {
++		/* Table full. */
++		kfree(tap);
++		tap = NULL;
++		goto out;
++	}
++
++	tap->minor = minor;
++	blktaps[minor] = tap;
++
++out:
++	spin_unlock_irq(&blktap_control_lock);
++	return tap;
++}
++
++/*
++ * Obtain a tap for a new user: reuse an existing table entry whose
++ * dev_inuse is zero, otherwise create a fresh one, then create its ring.
++ *
++ * Returns the tap, or NULL if no tap could be created or ring creation
++ * failed (in which case the BLKTAP_CONTROL bit is cleared again).
++ */
++static struct blktap *
++blktap_control_allocate_tap(void)
++{
++ int err, minor;
++ struct blktap *tap;
++
++ /*
++ * This is called only from the ioctl, which
++ * means we should always have interrupts enabled.
++ */
++ BUG_ON(irqs_disabled());
++
++ spin_lock_irq(&blktap_control_lock);
++
++ for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) {
++ tap = blktaps[minor];
++ /* Empty slot: leave with tap == NULL so a new tap gets created below. */
++ if (!tap)
++ goto found;
++
++ /* Idle tap: re-initialise it (this sets BLKTAP_CONTROL) and reuse it. */
++ if (!tap->dev_inuse) {
++ blktap_control_initialize_tap(tap);
++ goto found;
++ }
++ }
++
++ tap = NULL;
++
++found:
++ spin_unlock_irq(&blktap_control_lock);
++
++ if (!tap) {
++ tap = blktap_control_create_tap();
++ if (!tap)
++ return NULL;
++ }
++
++ err = blktap_ring_create(tap);
++ if (err) {
++ BTERR("ring creation failed: %d\n", err);
++ /* The tap stays in blktaps[]; clearing the bit makes it reusable. */
++ clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
++ return NULL;
++ }
++
++ BTINFO("allocated tap %p\n", tap);
++ return tap;
++}
++
++/*
++ * ioctl handler of the blktap control device.
++ *
++ * BLKTAP2_IOCTL_ALLOC_TAP: allocate a tap and copy a struct blktap_handle
++ *   (ring major, device major, minor) to the user pointer in @arg.
++ *   Returns -ENOMEM if no tap could be allocated, -EFAULT if the copy
++ *   fails (the tap is destroyed again).
++ * BLKTAP2_IOCTL_FREE_TAP: destroy the tap whose minor is @arg; -EINVAL
++ *   if the minor is out of range or unused.
++ * Anything else: -ENOIOCTLCMD.
++ */
++static int
++blktap_control_ioctl(struct inode *inode, struct file *filp,
++		     unsigned int cmd, unsigned long arg)
++{
++	unsigned long dev;
++	struct blktap *tap;
++
++	switch (cmd) {
++	case BLKTAP2_IOCTL_ALLOC_TAP: {
++		struct blktap_handle h;
++
++		tap = blktap_control_allocate_tap();
++		if (!tap) {
++			BTERR("error allocating device\n");
++			return -ENOMEM;
++		}
++
++		h.ring = ring_major;
++		h.device = device_major;
++		h.minor = tap->minor;
++
++		if (copy_to_user((struct blktap_handle __user *)arg,
++				 &h, sizeof(h))) {
++			blktap_control_destroy_device(tap);
++			return -EFAULT;
++		}
++
++		return 0;
++	}
++
++	case BLKTAP2_IOCTL_FREE_TAP:
++		dev = arg;
++
++		/*
++		 * Valid minors are 0 .. MAX_BLKTAP_DEVICE - 1; ">=" keeps
++		 * dev == MAX_BLKTAP_DEVICE from indexing past blktaps[].
++		 */
++		if (dev >= MAX_BLKTAP_DEVICE || !blktaps[dev])
++			return -EINVAL;
++
++		blktap_control_destroy_device(blktaps[dev]);
++		return 0;
++	}
++
++	return -ENOIOCTLCMD;
++}
++
++/* File operations of the control device: only ioctl is provided. */
++static struct file_operations blktap_control_file_operations = {
++ .owner = THIS_MODULE,
++ .ioctl = blktap_control_ioctl,
++};
++
++/* Misc device "blktap-control" with a dynamically assigned minor. */
++static struct miscdevice blktap_misc = {
++ .minor = MISC_DYNAMIC_MINOR,
++ .name = "blktap-control",
++ .fops = &blktap_control_file_operations,
++};
++
++/*
++ * Tear down the device, ring and sysfs parts of @tap, in that order.
++ *
++ * BLKTAP_SHUTDOWN_REQUESTED is set for the duration.  Whenever one of the
++ * destroy steps fails, the function sleeps until dev_inuse changes and then
++ * starts over; a signal ends the wait and the loop.  A NULL @tap is
++ * accepted and yields 0.
++ *
++ * Returns 0 if only the BLKTAP_CONTROL bit was left at the end (which is
++ * then cleared as well), otherwise the result of the last destroy step.
++ */
++int
++blktap_control_destroy_device(struct blktap *tap)
++{
++ int err;
++ unsigned long inuse;
++
++ if (!tap)
++ return 0;
++
++ set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
++
++ for (;;) {
++ /* Snapshot dev_inuse before each step so a later change can be detected. */
++ inuse = tap->dev_inuse;
++ err = blktap_device_destroy(tap);
++ if (err)
++ goto wait;
++
++ inuse = tap->dev_inuse;
++ err = blktap_ring_destroy(tap);
++ if (err)
++ goto wait;
++
++ inuse = tap->dev_inuse;
++ err = blktap_sysfs_destroy(tap);
++ if (err)
++ goto wait;
++
++ break;
++
++ wait:
++ BTDBG("inuse: 0x%lx, dev_inuse: 0x%lx\n",
++ inuse, tap->dev_inuse);
++ /* Non-zero return means a signal arrived: give up retrying. */
++ if (wait_event_interruptible(tap->wq, tap->dev_inuse != inuse))
++ break;
++ }
++
++ clear_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
++
++ if (tap->dev_inuse == (1UL << BLKTAP_CONTROL)) {
++ err = 0;
++ clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
++ }
++
++ return err;
++}
++
++/*
++ * Register the control misc device.
++ * Returns 0 on success or the misc_register() error; on success
++ * blktap_control_registered is set so teardown deregisters it.
++ */
++static int __init
++blktap_control_init(void)
++{
++	int err;
++
++	err = misc_register(&blktap_misc);
++	if (err) {
++		/* Newline-terminated like the other BTERR messages here. */
++		BTERR("misc_register failed for control device\n");
++		return err;
++	}
++
++	blktap_control_registered = 1;
++	return 0;
++}
++
++/*
++ * Destroy every tap in blktaps[] (empty slots are skipped by
++ * blktap_control_destroy_device()) and deregister the control device if
++ * it had been registered.
++ */
++static void
++blktap_control_free(void)
++{
++	int i;
++
++	for (i = 0; i < MAX_BLKTAP_DEVICE; i++)
++		blktap_control_destroy_device(blktaps[i]);
++
++	if (blktap_control_registered)
++		if (misc_deregister(&blktap_misc) < 0)
++			/* Newline-terminated like the other BTERR messages here. */
++			BTERR("misc_deregister failed for control device\n");
++}
++
++/*
++ * Module teardown: control interface first, then ring, sysfs, device and
++ * finally the request pool.  Also used as the failure path of blktap_init().
++ */
++static void
++blktap_exit(void)
++{
++ blktap_control_free();
++ blktap_ring_free();
++ blktap_sysfs_free();
++ blktap_device_free();
++ blktap_request_pool_free();
++}
++
++/*
++ * Module entry point.  Refuses to load outside a Xen domain (-ENODEV),
++ * then brings up the request pool, device, ring, sysfs and control parts.
++ *
++ * Returns 0 on success, otherwise the error of the step that failed.
++ */
++static int __init
++blktap_init(void)
++{
++ int err;
++
++ if (!xen_domain())
++ return -ENODEV;
++
++ err = blktap_request_pool_init();
++ if (err)
++ return err;
++
++ err = blktap_device_init(&device_major);
++ if (err)
++ goto fail;
++
++ err = blktap_ring_init(&ring_major);
++ if (err)
++ goto fail;
++
++ err = blktap_sysfs_init();
++ if (err)
++ goto fail;
++
++ err = blktap_control_init();
++ if (err)
++ goto fail;
++
++ return 0;
++
++fail:
++ /*
++ * NOTE(review): blktap_exit() runs every *_free() routine, including
++ * those of parts not initialised yet -- they must tolerate that.
++ */
++ blktap_exit();
++ return err;
++}
++
++module_init(blktap_init);
++module_exit(blktap_exit);
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/blktap/device.c b/drivers/xen/blktap/device.c
+new file mode 100644
+index 0000000..a50b622
+--- /dev/null
++++ b/drivers/xen/blktap/device.c
+@@ -0,0 +1,1138 @@
++#include <linux/version.h> /* XXX Remove uses of VERSION instead. */
++#include <linux/fs.h>
++#include <linux/blkdev.h>
++#include <linux/cdrom.h>
++#include <linux/hdreg.h>
++#include <linux/module.h>
++#include <asm/tlbflush.h>
++
++#include <scsi/scsi.h>
++#include <scsi/scsi_ioctl.h>
++
++#include <xen/xenbus.h>
++#include <xen/interface/io/blkif.h>
++
++#include <asm/xen/page.h>
++#include <asm/xen/hypercall.h>
++
++#include "blktap.h"
++
++#include "../blkback/blkback-pagemap.h"
++
++#if 0
++#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
++#else
++#define DPRINTK_IOCTL(_f, _a...) ((void)0)
++#endif
++
++/*
++ * Batch of grant-map operations collected for one request: up to two
++ * (kernel + user mapping) per segment.  cnt is the number of entries used.
++ */
++struct blktap_grant_table {
++ int cnt;
++ struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
++};
++
++static int blktap_device_major;
++
++/* Map an embedded struct blktap_device back to its enclosing struct blktap. */
++static inline struct blktap *
++dev_to_blktap(struct blktap_device *dev)
++{
++ return container_of(dev, struct blktap, device);
++}
++
++/*
++ * Block-device open: refused with -ENOENT when the disk has no private
++ * data, the tap is not active, or a shutdown has been requested.
++ * Otherwise the open count is incremented and 0 is returned.
++ */
++static int
++blktap_device_open(struct block_device * bd, fmode_t mode)
++{
++	struct blktap_device *tapdev = bd->bd_disk->private_data;
++	struct blktap *tap;
++
++	if (tapdev == NULL)
++		return -ENOENT;
++
++	tap = dev_to_blktap(tapdev);
++
++	if (!blktap_active(tap))
++		return -ENOENT;
++	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++		return -ENOENT;
++
++	tapdev->users++;
++	return 0;
++}
++
++/*
++ * Block-device release: drop the open count and, if a shutdown was
++ * requested meanwhile, retry destroying the device.  Always returns 0.
++ */
++static int
++blktap_device_release(struct gendisk *gd, fmode_t mode)
++{
++	struct blktap_device *tapdev = gd->private_data;
++	struct blktap *tap = dev_to_blktap(tapdev);
++
++	tapdev->users--;
++
++	if (!test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++		return 0;
++
++	blktap_device_destroy(tap);
++	return 0;
++}
++
++/*
++ * Report a synthetic geometry: 255 heads, 63 sectors per track and a
++ * cylinder count derived from the disk capacity.  Always returns 0.
++ */
++static int
++blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
++{
++ /* We don't have real geometry info, but let's at least return
++ values consistent with the size of the device */
++ sector_t nsect = get_capacity(bd->bd_disk);
++ sector_t cylinders = nsect;
++
++ hg->heads = 0xff;
++ hg->sectors = 0x3f;
++ /* sector_div() divides in place, leaving the quotient in cylinders. */
++ sector_div(cylinders, hg->heads * hg->sectors);
++ hg->cylinders = cylinders;
++ /* If the stored cylinder count cannot cover the disk, report the maximum. */
++ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
++ hg->cylinders = 0xffff;
++ return 0;
++}
++
++/*
++ * ioctl handler of the tap block device.
++ *
++ * HDIO_GETGEO (kernels before 2.6.16 only): copy the synthetic geometry to
++ *   user space; -EINVAL for a NULL argument, -EFAULT if the copy fails.
++ * CDROMMULTISESSION: zero-fill the user's struct cdrom_multisession.
++ * SCSI_IOCTL_GET_IDLUN: report dev_id and host_unique_id as 0.
++ * Anything else: -EINVAL.
++ */
++static int
++blktap_device_ioctl(struct block_device *bd, fmode_t mode,
++		    unsigned command, unsigned long argument)
++{
++	/* Uses bd->bd_dev: there is no inode parameter in this signature. */
++	DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
++		      command, (long)argument, bd->bd_dev);
++
++	switch (command) {
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
++	case HDIO_GETGEO: {
++		struct hd_geometry geo;
++		int ret;
++
++		if (!argument)
++			return -EINVAL;
++
++		geo.start = get_start_sect(bd);
++		ret = blktap_device_getgeo(bd, &geo);
++		if (ret)
++			return ret;
++
++		if (copy_to_user((struct hd_geometry __user *)argument, &geo,
++				 sizeof(geo)))
++			return -EFAULT;
++
++		return 0;
++	}
++#endif
++	case CDROMMULTISESSION:
++		BTDBG("FIXME: support multisession CDs later\n");
++		/* clear_user() returns the number of bytes left unzeroed. */
++		if (clear_user((void __user *)argument,
++			       sizeof(struct cdrom_multisession)))
++			return -EFAULT;
++		return 0;
++
++	case SCSI_IOCTL_GET_IDLUN:
++		if (!access_ok(VERIFY_WRITE, argument,
++			       sizeof(struct scsi_idlun)))
++			return -EFAULT;
++
++		/* return 0 for now. */
++		__put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
++		__put_user(0,
++			   &((struct scsi_idlun __user *)argument)->host_unique_id);
++		return 0;
++
++	default:
++		/*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
++		  command);*/
++		return -EINVAL; /* same return as native Linux */
++	}
++
++	return 0;
++}
++
++/* Block-device operations; .getgeo is only wired up on kernels >= 2.6.16. */
++static struct block_device_operations blktap_device_file_operations = {
++ .owner = THIS_MODULE,
++ .open = blktap_device_open,
++ .release = blktap_device_release,
++ .ioctl = blktap_device_ioctl,
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
++ .getgeo = blktap_device_getgeo
++#endif
++};
++
++/*
++ * apply_to_page_range() callback: install the pte value passed through
++ * @data at @ptep.  Always returns 0.
++ */
++static int
++blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
++		    unsigned long addr, void *data)
++{
++	pte_t *new_pte = data;
++
++	BTDBG("ptep %p -> %012llx\n", ptep,
++	      (unsigned long long)pte_val(*new_pte));
++	set_pte(ptep, *new_pte);
++
++	return 0;
++}
++
++/*
++ * Install @pte for the single page at @address in @mm.
++ * Returns the result of apply_to_page_range().
++ */
++static int
++blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte)
++{
++ return apply_to_page_range(mm, address,
++ PAGE_SIZE, blktap_map_uaddr_fn, &pte);
++}
++
++/*
++ * apply_to_page_range() callback: clear the pte at @ptep; @data carries
++ * the owning mm.  Always returns 0.
++ */
++static int
++blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
++		     unsigned long addr, void *data)
++{
++	struct mm_struct *owner_mm = data;
++
++	BTDBG("ptep %p\n", ptep);
++	pte_clear(owner_mm, addr, ptep);
++
++	return 0;
++}
++
++/*
++ * Clear the pte of the single page at @address in @mm.
++ * Returns the result of apply_to_page_range().
++ */
++static int
++blktap_umap_uaddr(struct mm_struct *mm, unsigned long address)
++{
++ return apply_to_page_range(mm, address,
++ PAGE_SIZE, blktap_umap_uaddr_fn, mm);
++}
++
++/* Flush the kernel TLB for the one page starting at @kvaddr. */
++static inline void
++flush_tlb_kernel_page(unsigned long kvaddr)
++{
++ flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE);
++}
++
++/*
++ * Complete all bytes of @req with status @error while holding the request
++ * queue's lock; it is a BUG if the block layer reports the request as not
++ * fully completed.
++ */
++static void
++blktap_device_end_dequeued_request(struct blktap_device *dev,
++				   struct request *req, int error)
++{
++	spinlock_t *qlock = dev->gd->queue->queue_lock;
++	unsigned long irqflags;
++	int unfinished;
++
++	spin_lock_irqsave(qlock, irqflags);
++	unfinished = __blk_end_request(req, error, blk_rq_bytes(req));
++	spin_unlock_irqrestore(qlock, irqflags);
++
++	BUG_ON(unfinished);
++}
++
++/*
++ * tap->tap_sem held on entry
++ *
++ * Undo the mappings of every segment of @request: queue a grant-unmap
++ * operation for each valid kernel and user grant handle, drop the pages
++ * from the ring's foreign map, invalidate the handles, issue the batched
++ * unmap hypercall and zap the request's user address range.
++ * Does nothing when the ring has no vma.
++ */
++static void
++blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
++{
++ uint64_t ptep;
++ int ret, usr_idx;
++ unsigned int i, cnt;
++ struct page **map, *page;
++ struct blktap_ring *ring;
++ struct grant_handle_pair *khandle;
++ unsigned long kvaddr, uvaddr, offset;
++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
++
++ cnt = 0;
++ ring = &tap->ring;
++ usr_idx = request->usr_idx;
++ map = ring->foreign_map.map;
++
++ if (!ring->vma)
++ return;
++
++ /* Auto-translated guests zap the user range up front ... */
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ zap_page_range(ring->vma,
++ MMAP_VADDR(ring->user_vstart, usr_idx, 0),
++ request->nr_pages << PAGE_SHIFT, NULL);
++
++ for (i = 0; i < request->nr_pages; i++) {
++ kvaddr = request_to_kaddr(request, i);
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++
++ khandle = request->handles + i;
++
++ if (khandle->kernel != INVALID_GRANT_HANDLE) {
++ gnttab_set_unmap_op(&unmap[cnt], kvaddr,
++ GNTMAP_host_map, khandle->kernel);
++ cnt++;
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++ INVALID_P2M_ENTRY);
++ }
++
++ if (khandle->user != INVALID_GRANT_HANDLE) {
++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++ if (create_lookup_pte_addr(ring->vma->vm_mm,
++ uvaddr, &ptep) != 0) {
++ BTERR("Couldn't get a pte addr!\n");
++ /* NOTE(review): returning here skips the hypercall for ops already queued. */
++ return;
++ }
++
++ gnttab_set_unmap_op(&unmap[cnt], ptep,
++ GNTMAP_host_map
++ | GNTMAP_application_map
++ | GNTMAP_contains_pte,
++ khandle->user);
++ cnt++;
++ }
++
++ /* Index of this page in the foreign map, relative to the vma start. */
++ offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
++
++ BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
++ "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
++ "0x%08lx, handle: %u\n", offset, map[offset], request,
++ usr_idx, i, kvaddr, khandle->kernel, uvaddr,
++ khandle->user);
++
++ page = map[offset];
++ if (page) {
++ ClearPageReserved(map[offset]);
++ if (blkback_pagemap_contains_page(page))
++ set_page_private(page, 0);
++ }
++ map[offset] = NULL;
++
++ khandle->kernel = INVALID_GRANT_HANDLE;
++ khandle->user = INVALID_GRANT_HANDLE;
++ }
++
++ if (cnt) {
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++ unmap, cnt);
++ BUG_ON(ret);
++ }
++
++ /* ... all others zap it only after the grants have been unmapped. */
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ zap_page_range(ring->vma,
++ MMAP_VADDR(ring->user_vstart, usr_idx, 0),
++ request->nr_pages << PAGE_SHIFT, NULL);
++}
++
++/*
++ * tap->tap_sem held on entry
++ *
++ * Release all mappings of @request.  Takes the mmap_sem of the ring vma's
++ * mm for writing, clears the kernel-address pte of every segment that has
++ * no kernel grant handle (the state blktap_map() leaves behind), then lets
++ * blktap_device_fast_flush() undo the rest.
++ */
++static void
++blktap_unmap(struct blktap *tap, struct blktap_request *request)
++{
++ int i, usr_idx;
++ unsigned long kvaddr;
++
++ usr_idx = request->usr_idx;
++ down_write(&tap->ring.vma->vm_mm->mmap_sem);
++
++ for (i = 0; i < request->nr_pages; i++) {
++ kvaddr = request_to_kaddr(request, i);
++ BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
++ "uvaddr: 0x%08lx, uhandle: %u\n", request, i,
++ kvaddr, request->handles[i].kernel,
++ MMAP_VADDR(tap->ring.user_vstart, usr_idx, i),
++ request->handles[i].user);
++
++ /* Not grant-mapped: drop the pte, flush the TLB and reset the p2m entry. */
++ if (request->handles[i].kernel == INVALID_GRANT_HANDLE) {
++ blktap_umap_uaddr(tap->ring.vma->vm_mm, kvaddr);
++ flush_tlb_kernel_page(kvaddr);
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++ INVALID_P2M_ENTRY);
++ }
++ }
++
++ blktap_device_fast_flush(tap, request);
++ up_write(&tap->ring.vma->vm_mm->mmap_sem);
++}
++
++/*
++ * called if the tapdisk process dies unexpectedly.
++ * fail and release any pending requests and disable queue.
++ *
++ * Does nothing unless BLKTAP_DEVICE is set in tap->dev_inuse.  Every
++ * request still marked BLKTAP_REQUEST_PENDING is unmapped, completed with
++ * -EIO and freed, all under tap->tap_sem held for writing.
++ */
++void
++blktap_device_fail_pending_requests(struct blktap *tap)
++{
++ int usr_idx;
++ struct request *req;
++ struct blktap_device *dev;
++ struct blktap_request *request;
++
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ return;
++
++ down_write(&tap->tap_sem);
++
++ dev = &tap->device;
++ for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
++ request = tap->pending_requests[usr_idx];
++ if (!request || request->status != BLKTAP_REQUEST_PENDING)
++ continue;
++
++ BTERR("%u:%u: failing pending %s of %d pages\n",
++ blktap_device_major, tap->minor,
++ (request->operation == BLKIF_OP_READ ?
++ "read" : "write"), request->nr_pages);
++
++ blktap_unmap(tap, request);
++ /* request->id carries the block layer's struct request pointer. */
++ req = (struct request *)(unsigned long)request->id;
++ blktap_device_end_dequeued_request(dev, req, -EIO);
++ blktap_request_free(tap, request);
++ }
++
++ up_write(&tap->tap_sem);
++
++ spin_lock_irq(&dev->lock);
++
++ /* fail any future requests */
++ dev->gd->queue->queuedata = NULL;
++ blk_start_queue(dev->gd->queue);
++
++ spin_unlock_irq(&dev->lock);
++}
++
++/*
++ * tap->tap_sem held on entry
++ *
++ * Complete @request according to the ring response @res: unmap its pages,
++ * end the block-layer request with 0 (BLKIF_RSP_OKAY) or -EIO, and free
++ * the blktap request.  Any operation other than read or write is a BUG.
++ */
++void
++blktap_device_finish_request(struct blktap *tap,
++			     struct blkif_response *res,
++			     struct blktap_request *request)
++{
++	struct blktap_device *tapdev = &tap->device;
++	struct request *rq;
++	int error;
++
++	blktap_unmap(tap, request);
++
++	/* request->id carries the block layer's struct request pointer. */
++	rq = (struct request *)(unsigned long)request->id;
++
++	if (res->status == BLKIF_RSP_OKAY)
++		error = 0;
++	else
++		error = -EIO;
++
++	BTDBG("req %p res status %d operation %d/%d id %lld\n", rq,
++	      res->status, res->operation, request->operation,
++	      (unsigned long long)res->id);
++
++	if (request->operation != BLKIF_OP_READ &&
++	    request->operation != BLKIF_OP_WRITE)
++		BUG();
++
++	if (unlikely(res->status != BLKIF_RSP_OKAY))
++		BTERR("Bad return from device data "
++		      "request: %x\n", res->status);
++	blktap_device_end_dequeued_request(tapdev, rq, error);
++
++	blktap_request_free(tap, request);
++}
++
++/*
++ * Queue the grant-map operations for one foreign segment.
++ *
++ * Looks up the grant reference and domain of @page in the blkback pagemap,
++ * stores the gref in @blkif_req->seg[@seg] and appends to @table a kernel
++ * mapping and, unless the guest is auto-translated, a user (pte) mapping.
++ * Writes map the grant read-only.
++ *
++ * Returns 0 on success, -1 if the user pte address cannot be looked up.
++ */
++static int
++blktap_prep_foreign(struct blktap *tap,
++ struct blktap_request *request,
++ struct blkif_request *blkif_req,
++ unsigned int seg, struct page *page,
++ struct blktap_grant_table *table)
++{
++ uint64_t ptep;
++ uint32_t flags;
++#ifdef BLKTAP_CHAINED_BLKTAP
++ struct page *tap_page;
++#endif
++ struct blktap_ring *ring;
++ struct blkback_pagemap map;
++ unsigned long uvaddr, kvaddr;
++
++ ring = &tap->ring;
++ map = blkback_pagemap_read(page);
++ blkif_req->seg[seg].gref = map.gref;
++
++ uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
++ kvaddr = request_to_kaddr(request, seg);
++ flags = GNTMAP_host_map |
++ (request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);
++
++ /* First entry: map the grant at the request's kernel address. */
++ gnttab_set_map_op(&table->grants[table->cnt],
++ kvaddr, flags, map.gref, map.domid);
++ table->cnt++;
++
++
++#ifdef BLKTAP_CHAINED_BLKTAP
++ /* enable chained tap devices */
++ tap_page = request_to_page(request, seg);
++ set_page_private(tap_page, page_private(page));
++ SetPageBlkback(tap_page);
++#endif
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return 0;
++
++ if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
++ BTERR("couldn't get a pte addr!\n");
++ return -1;
++ }
++
++ /* Second entry: map the same grant through the user pte. */
++ flags |= GNTMAP_application_map | GNTMAP_contains_pte;
++ gnttab_set_map_op(&table->grants[table->cnt],
++ ptep, flags, map.gref, map.domid);
++ table->cnt++;
++
++ return 0;
++}
++
++/*
++ * Issue the grant-map operations queued in @table and record the resulting
++ * handles in @request.
++ *
++ * Entries are consumed in the order blktap_prep_foreign() queued them: per
++ * foreign segment one kernel mapping, then (unless auto-translated) one
++ * user mapping.  Segments with a zero gref were not queued and are skipped.
++ *
++ * Returns 0 on success (or when nothing was queued), non-zero if any
++ * mapping failed.
++ */
++static int
++blktap_map_foreign(struct blktap *tap,
++ struct blktap_request *request,
++ struct blkif_request *blkif_req,
++ struct blktap_grant_table *table)
++{
++ struct page *page;
++ int i, grant, err, usr_idx;
++ struct blktap_ring *ring;
++ unsigned long uvaddr, foreign_mfn;
++
++ if (!table->cnt)
++ return 0;
++
++ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ table->grants, table->cnt);
++ BUG_ON(err);
++
++ grant = 0;
++ usr_idx = request->usr_idx;
++ ring = &tap->ring;
++
++ for (i = 0; i < request->nr_pages; i++) {
++ if (!blkif_req->seg[i].gref)
++ continue;
++
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++
++ if (unlikely(table->grants[grant].status)) {
++ BTERR("invalid kernel buffer: could not remap it\n");
++ err |= 1;
++ table->grants[grant].handle = INVALID_GRANT_HANDLE;
++ }
++
++ request->handles[i].kernel = table->grants[grant].handle;
++ foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
++ grant++;
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ goto done;
++
++ if (unlikely(table->grants[grant].status)) {
++ BTERR("invalid user buffer: could not remap it\n");
++ err |= 1;
++ table->grants[grant].handle = INVALID_GRANT_HANDLE;
++ }
++
++ request->handles[i].user = table->grants[grant].handle;
++ grant++;
++
++ done:
++ /*
++ * err is sticky: after the first failure the handles of later
++ * segments are still recorded, but their pages are not set up.
++ */
++ if (err)
++ continue;
++
++ page = request_to_page(request, i);
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ set_phys_to_machine(page_to_pfn(page),
++ FOREIGN_FRAME(foreign_mfn));
++ else if (vm_insert_page(ring->vma, uvaddr, page))
++ err |= 1;
++
++ BTDBG("pending_req: %p, seg: %d, page: %p, "
++ "kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, "
++ "uhandle: %u\n", request, i, page,
++ pfn_to_kaddr(page_to_pfn(page)),
++ request->handles[i].kernel,
++ uvaddr, request->handles[i].user);
++ }
++
++ return err;
++}
++
++/*
++ * Map a local (non-foreign) @page for segment @seg of @request without
++ * using grants: install it writable at the segment's user address and with
++ * PAGE_KERNEL at the request's kernel address, update the p2m entry, and
++ * mark both grant handles invalid so the unmap path treats the segment as
++ * not grant-mapped.
++ */
++static void
++blktap_map(struct blktap *tap,
++ struct blktap_request *request,
++ unsigned int seg, struct page *page)
++{
++ pte_t pte;
++ int usr_idx;
++ struct blktap_ring *ring;
++ unsigned long uvaddr, kvaddr;
++
++ ring = &tap->ring;
++ usr_idx = request->usr_idx;
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
++ kvaddr = request_to_kaddr(request, seg);
++
++ pte = mk_pte(page, ring->vma->vm_page_prot);
++ blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte));
++ flush_tlb_page(ring->vma, uvaddr);
++ /* NOTE(review): the kernel address is mapped through the vma's mm as well. */
++ blktap_map_uaddr(ring->vma->vm_mm, kvaddr, mk_pte(page, PAGE_KERNEL));
++ flush_tlb_kernel_page(kvaddr);
++
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
++ request->handles[seg].kernel = INVALID_GRANT_HANDLE;
++ request->handles[seg].user = INVALID_GRANT_HANDLE;
++
++ BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
++ "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
++ uvaddr);
++}
++
++/*
++ * Turn block-layer request @req into a blkif request and publish it on the
++ * tap's user ring, using @request as the per-request bookkeeping slot.
++ *
++ * Each scatterlist entry becomes one blkif segment. Pages known to the
++ * blkback pagemap are prepared through blktap_prep_foreign() and mapped in
++ * one batch by blktap_map_foreign(); all other pages go through
++ * blktap_map().
++ *
++ * Returns 0 on success. On any failure returns -1 after calling
++ * blktap_device_fast_flush() on @request.
++ */
++static int
++blktap_device_process_request(struct blktap *tap,
++ struct blktap_request *request,
++ struct request *req)
++{
++ struct page *page;
++ int i, usr_idx, err;
++ struct blktap_ring *ring;
++ struct scatterlist *sg;
++ struct blktap_grant_table table;
++ unsigned int fsect, lsect, nr_sects;
++ unsigned long offset, uvaddr;
++ struct blkif_request blkif_req, *target;
++
++ err = -1;
++ memset(&table, 0, sizeof(table));
++
++ if (!blktap_active(tap))
++ goto out;
++
++ ring = &tap->ring;
++ usr_idx = request->usr_idx;
++ blkif_req.id = usr_idx;
++ blkif_req.sector_number = (blkif_sector_t)blk_rq_pos(req);
++ blkif_req.handle = 0;
++ blkif_req.operation = rq_data_dir(req) ?
++ BLKIF_OP_WRITE : BLKIF_OP_READ;
++
++ /* Remember the originating request so completion can find it again. */
++ request->id = (unsigned long)req;
++ request->operation = blkif_req.operation;
++ request->status = BLKTAP_REQUEST_PENDING;
++ do_gettimeofday(&request->time);
++
++ nr_sects = 0;
++ request->nr_pages = 0;
++ blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg);
++ BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
++ for (i = 0; i < blkif_req.nr_segments; ++i) {
++ sg = tap->sg + i;
++ /* First/last 512-byte sector covered by this segment. */
++ fsect = sg->offset >> 9;
++ lsect = fsect + (sg->length >> 9) - 1;
++ nr_sects += sg->length >> 9;
++
++ blkif_req.seg[i] =
++ (struct blkif_request_segment) {
++ .gref = 0,
++ .first_sect = fsect,
++ .last_sect = lsect };
++
++ if (blkback_pagemap_contains_page(sg_page(sg))) {
++ /* foreign page -- use xen */
++ if (blktap_prep_foreign(tap,
++ request,
++ &blkif_req,
++ i,
++ sg_page(sg),
++ &table))
++ goto out;
++ } else {
++ /* do it the old fashioned way */
++ blktap_map(tap,
++ request,
++ i,
++ sg_page(sg));
++ }
++
++ /* Record the backing page at its page offset within the ring VMA. */
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++ offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
++ page = request_to_page(request, i);
++ ring->foreign_map.map[offset] = page;
++ SetPageReserved(page);
++
++ BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
++ uvaddr, page, page_to_pfn(page));
++ BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
++ "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n",
++ offset, request, i,
++ page, pfn_to_kaddr(page_to_pfn(page)), uvaddr);
++
++ request->nr_pages++;
++ }
++
++ /* Issue the grant-map operations collected in table. */
++ if (blktap_map_foreign(tap, request, &blkif_req, &table))
++ goto out;
++
++ /* Finally, write the request message to the user ring. */
++ target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
++ memcpy(target, &blkif_req, sizeof(blkif_req));
++ target->id = request->usr_idx;
++ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
++ ring->ring.req_prod_pvt++;
++
++ /* Per-direction statistics. */
++ if (rq_data_dir(req)) {
++ tap->stats.st_wr_sect += nr_sects;
++ tap->stats.st_wr_req++;
++ } else {
++ tap->stats.st_rd_sect += nr_sects;
++ tap->stats.st_rd_req++;
++ }
++
++ err = 0;
++
++out:
++ /* Undo any mappings that were set up before the failure. */
++ if (err)
++ blktap_device_fast_flush(tap, request);
++ return err;
++}
++
++#ifdef ENABLE_PASSTHROUGH
++/*
++ * Walk the bio chain of request _req. The successor of the current bio is
++ * saved in _tmp before the loop body runs, so the body may hand the
++ * current bio off (and let its bi_next change) without breaking the walk.
++ *
++ * NOTE(review): expands to an unbraced "if (...) for (...)"; an "else"
++ * written after the loop body would bind to this macro's "if".
++ */
++#define rq_for_each_bio_safe(_bio, _tmp, _req) \
++ if ((_req)->bio) \
++ for (_bio = (_req)->bio; \
++ _bio && ((_tmp = _bio->bi_next) || 1); \
++ _bio = _tmp)
++
++/*
++ * Passthrough mode: retarget every bio of @req at the tap's backing block
++ * device and resubmit it with its original bi_rw flags.
++ */
++static void
++blktap_device_forward_request(struct blktap *tap, struct request *req)
++{
++	struct blktap_device *dev = &tap->device;
++	struct bio *cur, *next;
++
++	rq_for_each_bio_safe(cur, next, req) {
++		cur->bi_bdev = dev->bdev;
++		submit_bio(cur->bi_rw, cur);
++	}
++}
++
++/*
++ * Leave passthrough mode: release the backing block device, if one is
++ * attached, forget it and clear the BLKTAP_PASSTHROUGH state bit.
++ */
++static void
++blktap_device_close_bdev(struct blktap *tap)
++{
++	struct block_device *bdev = tap->device.bdev;
++
++	if (bdev)
++		blkdev_put(bdev);
++
++	tap->device.bdev = NULL;
++	clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++}
++
++/*
++ * Open block device @pdev for writing and switch @tap to passthrough mode.
++ *
++ * Returns 0 on success, the PTR_ERR() value of open_by_devnum() if the
++ * open fails, or -ENOENT if the opened device has no gendisk attached.
++ * On failure no reference to the device is kept and the tap is unchanged.
++ */
++static int
++blktap_device_open_bdev(struct blktap *tap, u32 pdev)
++{
++	struct block_device *bdev;
++	struct blktap_device *dev;
++
++	dev = &tap->device;
++
++	bdev = open_by_devnum(pdev, FMODE_WRITE);
++	if (IS_ERR(bdev)) {
++		BTERR("opening device %x:%x failed: %ld\n",
++		      MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev));
++		return PTR_ERR(bdev);
++	}
++
++	if (!bdev->bd_disk) {
++		BTERR("device %x:%x doesn't exist\n",
++		      MAJOR(pdev), MINOR(pdev));
++		/*
++		 * Drop the reference taken above. dev->bdev has not been
++		 * assigned yet, so it must not be the pointer released here.
++		 *
++		 * NOTE(review): blkdev_put() is called with one argument, as
++		 * in blktap_device_close_bdev(); confirm against the target
++		 * kernel's prototype before enabling ENABLE_PASSTHROUGH.
++		 */
++		blkdev_put(bdev);
++		return -ENOENT;
++	}
++
++	dev->bdev = bdev;
++	set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++
++	/* TODO: readjust queue parameters */
++
++	BTINFO("set device %d to passthrough on %x:%x\n",
++	       tap->minor, MAJOR(pdev), MINOR(pdev));
++
++	return 0;
++}
++
++/*
++ * Toggle passthrough mode on a paused tap.
++ *
++ * With no backing device attached, opens @major:@minor as the backing
++ * device. With one attached, only 0:0 is accepted and closes it.
++ * Returns -EINVAL if the tap is not paused or a device is already
++ * attached and a non-zero device number was given.
++ */
++int
++blktap_device_enable_passthrough(struct blktap *tap,
++				 unsigned major, unsigned minor)
++{
++	struct blktap_device *dev = &tap->device;
++	u32 pdev = MKDEV(major, minor);
++
++	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++		return -EINVAL;
++
++	if (!dev->bdev)
++		return blktap_device_open_bdev(tap, pdev);
++
++	if (pdev)
++		return -EINVAL;
++
++	blktap_device_close_bdev(tap);
++	return 0;
++}
++#endif
++
++/*
++ * Move requests from the block-layer queue of @tap onto the user ring.
++ *
++ * Non-filesystem and barrier requests are not forwarded; they are dequeued
++ * and completed with an error. When the ring is full or no blktap_request
++ * can be allocated, the queue is stopped and the tap is deferred. The
++ * queue lock is dropped around blktap_device_process_request(), which runs
++ * under tap->tap_sem held for reading. If at least one request was
++ * queued, the ring's poller is woken.
++ *
++ * dev->lock held on entry
++ */
++static void
++blktap_device_run_queue(struct blktap *tap)
++{
++	int queued, err;
++	struct request_queue *rq;
++	struct request *req;
++	struct blktap_ring *ring;
++	struct blktap_device *dev;
++	struct blktap_request *request;
++
++	queued = 0;
++	ring   = &tap->ring;
++	dev    = &tap->device;
++	rq     = dev->gd->queue;
++
++	BTDBG("running queue for %d\n", tap->minor);
++
++	while ((req = blk_peek_request(rq)) != NULL) {
++		/*
++		 * A request that was only peeked is still on the queue and
++		 * must be started (dequeued) before it may be completed.
++		 * Completing with status 0 would also report success for I/O
++		 * that was never performed, so fail it explicitly.
++		 */
++		if (!blk_fs_request(req)) {
++			blk_start_request(req);
++			__blk_end_request_all(req, -EIO);
++			continue;
++		}
++
++		if (blk_barrier_rq(req)) {
++			blk_start_request(req);
++			__blk_end_request_all(req, -EOPNOTSUPP);
++			continue;
++		}
++
++#ifdef ENABLE_PASSTHROUGH
++		if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
++			blkdev_dequeue_request(req);
++			blktap_device_forward_request(tap, req);
++			continue;
++		}
++#endif
++
++		if (RING_FULL(&ring->ring)) {
++		wait:
++			/* Avoid pointless unplugs. */
++			blk_stop_queue(rq);
++			blktap_defer(tap);
++			break;
++		}
++
++		request = blktap_request_allocate(tap);
++		if (!request) {
++			tap->stats.st_oo_req++;
++			goto wait;
++		}
++
++		BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) "
++		      "buffer:%p [%s], pending: %p\n", req, tap->minor,
++		      req->cmd, (unsigned long long)blk_rq_pos(req),
++		      blk_rq_cur_sectors(req),
++		      blk_rq_sectors(req), req->buffer,
++		      rq_data_dir(req) ? "write" : "read", request);
++
++		blk_start_request(req);
++
++		/* Mapping the request may sleep; drop the queue lock. */
++		spin_unlock_irq(&dev->lock);
++		down_read(&tap->tap_sem);
++
++		err = blktap_device_process_request(tap, request, req);
++		if (!err)
++			queued++;
++		else {
++			blktap_device_end_dequeued_request(dev, req, -EIO);
++			blktap_request_free(tap, request);
++		}
++
++		up_read(&tap->tap_sem);
++		spin_lock_irq(&dev->lock);
++	}
++
++	if (queued)
++		blktap_ring_kick_user(tap);
++}
++
++/*
++ * Request function of the tap's block queue.
++ *
++ * Runs the queue when the tap is active and not paused. While a pause is
++ * requested or in effect the tap is deferred and requests stay queued.
++ * If the queue has no device attached or the tap is no longer active,
++ * every queued request is dequeued and failed with -EIO.
++ *
++ * dev->lock held on entry
++ */
++static void
++blktap_device_do_request(struct request_queue *rq)
++{
++	struct request *req;
++	struct blktap *tap;
++	struct blktap_device *dev;
++
++	dev = rq->queuedata;
++	if (!dev)
++		goto fail;
++
++	tap = dev_to_blktap(dev);
++	if (!blktap_active(tap))
++		goto fail;
++
++	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
++	    test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++		blktap_defer(tap);
++		return;
++	}
++
++	blktap_device_run_queue(tap);
++	return;
++
++fail:
++	/*
++	 * blk_fetch_request() dequeues the request, which is required before
++	 * it can be completed. The whole request is failed with -EIO:
++	 * completing with status 0 would tell the submitter the I/O
++	 * succeeded.
++	 */
++	while ((req = blk_fetch_request(rq))) {
++		BTERR("device closed: failing secs %llu - %llu\n",
++		      (unsigned long long)blk_rq_pos(req),
++		      (unsigned long long)blk_rq_pos(req) + blk_rq_sectors(req));
++		__blk_end_request_all(req, -EIO);
++	}
++}
++
++/*
++ * Restart request processing on @tap.
++ *
++ * If the ring of an active tap is full, or a pause is requested or in
++ * effect, the tap is deferred instead. Otherwise, when a gendisk is still
++ * attached, a stopped queue is started again and the request function is
++ * invoked directly under dev->lock.
++ */
++void
++blktap_device_restart(struct blktap *tap)
++{
++	struct blktap_device *dev = &tap->device;
++	struct request_queue *rq;
++
++	if ((blktap_active(tap) && RING_FULL(&tap->ring.ring)) ||
++	    test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
++	    test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++		blktap_defer(tap);
++		return;
++	}
++
++	spin_lock_irq(&dev->lock);
++
++	if (dev->gd) {
++		rq = dev->gd->queue;
++
++		/* Re-enable calldowns. */
++		if (blk_queue_stopped(rq))
++			blk_start_queue(rq);
++
++		/* Kick things off immediately. */
++		blktap_device_do_request(rq);
++	}
++
++	spin_unlock_irq(&dev->lock);
++}
++
++/*
++ * Apply tap->params and the blkif segment limits to the tap's gendisk and
++ * request queue. Does nothing unless the BLKTAP_DEVICE bit is set and a
++ * gendisk is attached. All settings are applied under dev->lock.
++ */
++static void
++blktap_device_configure(struct blktap *tap)
++{
++	struct blktap_device *dev = &tap->device;
++	struct request_queue *rq;
++
++	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++		return;
++	if (!dev->gd)
++		return;
++
++	rq = dev->gd->queue;
++
++	spin_lock_irq(&dev->lock);
++
++	set_capacity(dev->gd, tap->params.capacity);
++
++	/* Hard sector size and max sectors impersonate the equiv. hardware. */
++	blk_queue_logical_block_size(rq, tap->params.sector_size);
++	blk_queue_max_sectors(rq, 512);
++
++	/* Each segment in a request is up to an aligned page in size. */
++	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
++	blk_queue_max_segment_size(rq, PAGE_SIZE);
++
++	/* Ensure a merged request will fit in a single I/O ring slot. */
++	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++	/* Make sure buffer addresses are sector-aligned. */
++	blk_queue_dma_alignment(rq, 511);
++
++	spin_unlock_irq(&dev->lock);
++}
++
++/*
++ * Resume a paused tap device.
++ *
++ * Returns -ENODEV if the tap has no device or is not active, 0 if it is
++ * not paused, otherwise the result of blktap_ring_resume(). After a
++ * successful ring resume the queue is reconfigured and restarted.
++ */
++int
++blktap_device_resume(struct blktap *tap)
++{
++	int err;
++
++	if (!(test_bit(BLKTAP_DEVICE, &tap->dev_inuse) && blktap_active(tap)))
++		return -ENODEV;
++
++	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++		return 0;
++
++	err = blktap_ring_resume(tap);
++	if (!err) {
++		/* device size may have changed */
++		blktap_device_configure(tap);
++
++		BTDBG("restarting device\n");
++		blktap_device_restart(tap);
++	}
++
++	return err;
++}
++
++/*
++ * Start pausing a tap device: stop its queue, flag the pause request and
++ * hand over to blktap_ring_pause(), whose result is returned.
++ *
++ * Returns -ENODEV if the tap has no device or is not active, and 0
++ * straight away if it is already paused.
++ */
++int
++blktap_device_pause(struct blktap *tap)
++{
++	struct blktap_device *dev = &tap->device;
++	unsigned long irqflags;
++
++	if (!(test_bit(BLKTAP_DEVICE, &tap->dev_inuse) && blktap_active(tap)))
++		return -ENODEV;
++
++	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++		return 0;
++
++	spin_lock_irqsave(&dev->lock, irqflags);
++	blk_stop_queue(dev->gd->queue);
++	set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++	spin_unlock_irqrestore(&dev->lock, irqflags);
++
++	return blktap_ring_pause(tap);
++}
++
++/*
++ * Tear down the block device of @tap.
++ *
++ * Returns 0 if no device exists or after a successful teardown, and
++ * -EBUSY while dev->users is non-zero. The queue is stopped and the
++ * device detached from the tap under dev->lock before the gendisk and
++ * queue are released; waiters on tap->wq are woken at the end.
++ */
++int
++blktap_device_destroy(struct blktap *tap)
++{
++ struct blktap_device *dev = &tap->device;
++ struct gendisk *gd = dev->gd;
++
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ return 0;
++
++ BTINFO("destroy device %d users %d\n", tap->minor, dev->users);
++
++ if (dev->users)
++ return -EBUSY;
++
++ spin_lock_irq(&dev->lock);
++ /* No more blktap_device_do_request(). */
++ blk_stop_queue(gd->queue);
++ clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++ dev->gd = NULL;
++ spin_unlock_irq(&dev->lock);
++
++#ifdef ENABLE_PASSTHROUGH
++ if (dev->bdev)
++ blktap_device_close_bdev(tap);
++#endif
++
++ /* gd was saved above because dev->gd has already been cleared. */
++ del_gendisk(gd);
++ blk_cleanup_queue(gd->queue);
++ put_disk(gd);
++
++ wake_up(&tap->wq);
++
++ return 0;
++}
++
++/*
++ * Create and register the block device ("tapdev<x>") for @tap.
++ *
++ * Returns 0 on success, -EEXIST if the tap already has a device, -EINVAL
++ * if tap->params fail validation, and -ENODEV if the gendisk or request
++ * queue cannot be allocated.
++ */
++int
++blktap_device_create(struct blktap *tap)
++{
++	int minor, err;
++	struct gendisk *gd;
++	struct request_queue *rq;
++	struct blktap_device *dev;
++
++	gd    = NULL;
++	rq    = NULL;
++	dev   = &tap->device;
++	minor = tap->minor;
++
++	if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++		return -EEXIST;
++
++	if (blktap_validate_params(tap, &tap->params))
++		return -EINVAL;
++
++	BTINFO("minor %d sectors %Lu sector-size %lu\n",
++	       minor, tap->params.capacity, tap->params.sector_size);
++
++	err = -ENODEV;
++
++	gd = alloc_disk(1);
++	if (!gd)
++		goto error;
++
++	/* tapdeva..tapdevz, then two-letter suffixes from tapdevaa on. */
++	if (minor < 26)
++		sprintf(gd->disk_name, "tapdev%c", 'a' + minor);
++	else
++		sprintf(gd->disk_name, "tapdev%c%c",
++			'a' + ((minor / 26) - 1), 'a' + (minor % 26));
++
++	gd->major = blktap_device_major;
++	gd->first_minor = minor;
++	gd->fops = &blktap_device_file_operations;
++	gd->private_data = dev;
++
++	spin_lock_init(&dev->lock);
++	rq = blk_init_queue(blktap_device_do_request, &dev->lock);
++	if (!rq)
++		goto error;
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
++	elevator_init(rq, "noop");
++#else
++	elevator_init(rq, &elevator_noop);
++#endif
++
++	gd->queue     = rq;
++	rq->queuedata = dev;
++	dev->gd       = gd;
++
++	set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++	blktap_device_configure(tap);
++
++	add_disk(gd);
++
++	err = 0;
++	goto out;
++
++ error:
++	if (rq)
++		blk_cleanup_queue(rq);
++	/*
++	 * The disk was allocated but never passed to add_disk(), so there
++	 * is nothing for del_gendisk() to undo; just drop the reference
++	 * obtained from alloc_disk().
++	 */
++	if (gd)
++		put_disk(gd);
++
++ out:
++	BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
++	return err;
++}
++
++/*
++ * Register the "tapdev" block major (dynamically allocated) and report it
++ * through @maj. Returns 0 on success, -ENOMEM if registration fails.
++ */
++int __init
++blktap_device_init(int *maj)
++{
++	/* Dynamically allocate a major for this device */
++	int major = register_blkdev(0, "tapdev");
++
++	if (major < 0) {
++		BTERR("Couldn't register blktap device\n");
++		return -ENOMEM;
++	}
++
++	*maj = major;
++	blktap_device_major = major;
++	BTINFO("blktap device major %d\n", major);
++
++	return 0;
++}
++
++/* Unregister the "tapdev" block major, if one was registered. */
++void
++blktap_device_free(void)
++{
++	if (!blktap_device_major)
++		return;
++
++	unregister_blkdev(blktap_device_major, "tapdev");
++}
+diff --git a/drivers/xen/blktap/request.c b/drivers/xen/blktap/request.c
+new file mode 100644
+index 0000000..770736a
+--- /dev/null
++++ b/drivers/xen/blktap/request.c
+@@ -0,0 +1,297 @@
++#include <linux/spinlock.h>
++#include <xen/balloon.h>
++#include <linux/sched.h>
++
++#include "blktap.h"
++
++/* Upper bound on the number of buckets the pool may hold. */
++#define MAX_BUCKETS 8
++/* Number of request handles carried by one bucket. */
++#define BUCKET_SIZE MAX_PENDING_REQS
++
++/* pool.status value set by blktap_request_pool_free(); blocks allocation. */
++#define BLKTAP_POOL_CLOSING 1
++
++struct blktap_request_bucket;
++
++/*
++ * Pool-side wrapper around one blktap_request: records which bucket the
++ * request lives in and its slot number there, from which request_to_page()
++ * derives the request's pages.
++ */
++struct blktap_request_handle {
++ int slot;
++ uint8_t inuse;
++ struct blktap_request request;
++ struct blktap_request_bucket *bucket;
++};
++
++/*
++ * A batch of BUCKET_SIZE request handles together with the page vector
++ * (MMAP_PAGES entries) backing their segments. reqs_in_use counts the
++ * handles currently handed out from this bucket.
++ */
++struct blktap_request_bucket {
++ atomic_t reqs_in_use;
++ struct blktap_request_handle handles[BUCKET_SIZE];
++ struct page **foreign_pages;
++};
++
++/*
++ * The global request pool. lock protects free_list and buckets[];
++ * reqs_in_use counts handed-out requests across all buckets, and
++ * wait_queue is woken when that count drops to zero.
++ */
++struct blktap_request_pool {
++ spinlock_t lock;
++ uint8_t status;
++ struct list_head free_list;
++ atomic_t reqs_in_use;
++ wait_queue_head_t wait_queue;
++ struct blktap_request_bucket *buckets[MAX_BUCKETS];
++};
++
++static struct blktap_request_pool pool;
++
++/* Return the pool handle that embeds @req. */
++static inline struct blktap_request_handle *
++blktap_request_to_handle(struct blktap_request *req)
++{
++	struct blktap_request_handle *handle;
++
++	handle = container_of(req, struct blktap_request_handle, request);
++	return handle;
++}
++
++/*
++ * Reset @request to its unused state: no user index, no pages, status
++ * FREE, an empty free-list node and every grant handle invalid.
++ */
++static void
++blktap_request_pool_init_request(struct blktap_request *request)
++{
++	int seg = ARRAY_SIZE(request->handles);
++
++	request->usr_idx  = -1;
++	request->nr_pages = 0;
++	request->status   = BLKTAP_REQUEST_FREE;
++	INIT_LIST_HEAD(&request->free_list);
++
++	while (seg--) {
++		request->handles[seg].user   = INVALID_GRANT_HANDLE;
++		request->handles[seg].kernel = INVALID_GRANT_HANDLE;
++	}
++}
++
++/*
++ * Allocate one bucket, install it in the first empty pool.buckets[] slot
++ * and append all of its requests to pool.free_list.
++ *
++ * Returns 0 on success and -ENOMEM if the bucket or its page vector cannot
++ * be allocated or if all MAX_BUCKETS slots are already taken.
++ */
++static int
++blktap_request_pool_allocate_bucket(void)
++{
++ int i, idx;
++ unsigned long flags;
++ struct blktap_request *request;
++ struct blktap_request_handle *handle;
++ struct blktap_request_bucket *bucket;
++
++ /* Both allocations happen before pool.lock is taken. */
++ bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL);
++ if (!bucket)
++ goto fail;
++
++ bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES);
++ if (!bucket->foreign_pages)
++ goto fail;
++
++ spin_lock_irqsave(&pool.lock, flags);
++
++ /* Claim the first unused bucket slot. */
++ idx = -1;
++ for (i = 0; i < MAX_BUCKETS; i++) {
++ if (!pool.buckets[i]) {
++ idx = i;
++ pool.buckets[idx] = bucket;
++ break;
++ }
++ }
++
++ if (idx == -1) {
++ spin_unlock_irqrestore(&pool.lock, flags);
++ goto fail;
++ }
++
++ /* Initialise every handle and publish its request as free. */
++ for (i = 0; i < BUCKET_SIZE; i++) {
++ handle = bucket->handles + i;
++ request = &handle->request;
++
++ handle->slot = i;
++ handle->inuse = 0;
++ handle->bucket = bucket;
++
++ blktap_request_pool_init_request(request);
++ list_add_tail(&request->free_list, &pool.free_list);
++ }
++
++ spin_unlock_irqrestore(&pool.lock, flags);
++
++ return 0;
++
++fail:
++ /* bucket may be NULL here; kfree(NULL) is a no-op. */
++ if (bucket && bucket->foreign_pages)
++ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
++ kfree(bucket);
++ return -ENOMEM;
++}
++
++/* Release @bucket and its page vector; a NULL @bucket is ignored. */
++static void
++blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket)
++{
++	if (bucket) {
++		BTDBG("freeing bucket %p\n", bucket);
++
++		free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
++		kfree(bucket);
++	}
++}
++
++/*
++ * Return the page backing segment @seg of @req. Each handle owns
++ * BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive entries of its bucket's page
++ * vector, starting at slot * BLKIF_MAX_SEGMENTS_PER_REQUEST.
++ */
++struct page *
++request_to_page(struct blktap_request *req, int seg)
++{
++	struct blktap_request_handle *handle;
++	struct page **pages;
++
++	handle = blktap_request_to_handle(req);
++	pages  = handle->bucket->foreign_pages;
++
++	return pages[handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg];
++}
++
++/*
++ * Release one idle bucket (never bucket 0) back to the system.
++ *
++ * Returns 0 if a bucket was released and -EAGAIN if every candidate
++ * bucket is missing or still has requests in use.
++ */
++int
++blktap_request_pool_shrink(void)
++{
++	int i, j, err;
++	unsigned long flags;
++	struct blktap_request_bucket *bucket, *victim;
++
++	err    = -EAGAIN;
++	victim = NULL;
++
++	spin_lock_irqsave(&pool.lock, flags);
++
++	/* always keep at least one bucket */
++	for (i = 1; i < MAX_BUCKETS; i++) {
++		bucket = pool.buckets[i];
++		if (!bucket)
++			continue;
++
++		if (atomic_read(&bucket->reqs_in_use))
++			continue;
++
++		/*
++		 * With no request in use, every request of this bucket sits
++		 * on pool.free_list. Unlink them all before the bucket's
++		 * memory goes away, or a later allocation would hand out a
++		 * request inside freed memory.
++		 */
++		for (j = 0; j < BUCKET_SIZE; j++)
++			list_del(&bucket->handles[j].request.free_list);
++
++		pool.buckets[i] = NULL;
++		victim = bucket;
++		err = 0;
++		break;
++	}
++
++	spin_unlock_irqrestore(&pool.lock, flags);
++
++	/*
++	 * The bucket is unreachable now, so it can be freed without holding
++	 * the lock. blktap_request_pool_free_bucket() ignores NULL.
++	 */
++	blktap_request_pool_free_bucket(victim);
++
++	return err;
++}
++
++/* Add one bucket to the pool; returns 0 or -ENOMEM. */
++int
++blktap_request_pool_grow(void)
++{
++	int err = blktap_request_pool_allocate_bucket();
++
++	return err;
++}
++
++/*
++ * Take a request from the pool and bind it to the first unused slot of
++ * tap->pending_requests[].
++ *
++ * Returns NULL if the pool is closing, the tap has no unused slot, or the
++ * free list is empty. Runs entirely under pool.lock.
++ */
++struct blktap_request *
++blktap_request_allocate(struct blktap *tap)
++{
++	int i;
++	uint16_t usr_idx = -1;
++	unsigned long flags;
++	struct blktap_request *request = NULL;
++	struct blktap_request_handle *handle;
++
++	spin_lock_irqsave(&pool.lock, flags);
++
++	if (pool.status == BLKTAP_POOL_CLOSING)
++		goto unlock;
++
++	/* Lowest unused index in the tap's pending table. */
++	for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++) {
++		if (tap->pending_requests[i])
++			continue;
++		usr_idx = i;
++		break;
++	}
++
++	if (usr_idx == (uint16_t)-1)
++		goto unlock;
++
++	if (list_empty(&pool.free_list))
++		goto unlock;
++
++	request = list_entry(pool.free_list.next,
++			     struct blktap_request, free_list);
++	list_del(&request->free_list);
++
++	atomic_inc(&pool.reqs_in_use);
++
++	handle = blktap_request_to_handle(request);
++	atomic_inc(&handle->bucket->reqs_in_use);
++	handle->inuse = 1;
++
++	request->usr_idx = usr_idx;
++
++	tap->pending_requests[usr_idx] = request;
++	tap->pending_cnt++;
++
++unlock:
++	spin_unlock_irqrestore(&pool.lock, flags);
++	return request;
++}
++
++/*
++ * Return @request to the pool and release its slot in
++ * tap->pending_requests[].
++ *
++ * Wakes tap->wq when the tap's last pending request is released, and
++ * pool.wait_queue when the pool as a whole becomes idle.
++ */
++void
++blktap_request_free(struct blktap *tap, struct blktap_request *request)
++{
++	int free, drained;
++	unsigned long flags;
++	struct blktap_request_handle *handle;
++
++	BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests));
++	handle = blktap_request_to_handle(request);
++
++	spin_lock_irqsave(&pool.lock, flags);
++
++	handle->inuse = 0;
++	tap->pending_requests[request->usr_idx] = NULL;
++	blktap_request_pool_init_request(request);
++	list_add(&request->free_list, &pool.free_list);
++	atomic_dec(&handle->bucket->reqs_in_use);
++	free = atomic_dec_and_test(&pool.reqs_in_use);
++
++	/*
++	 * pending_cnt is incremented under pool.lock in
++	 * blktap_request_allocate(); decrement it under the same lock so
++	 * the two updates cannot race.
++	 */
++	drained = (--tap->pending_cnt == 0);
++
++	spin_unlock_irqrestore(&pool.lock, flags);
++
++	if (drained)
++		wake_up_interruptible(&tap->wq);
++
++	if (free)
++		wake_up(&pool.wait_queue);
++}
++
++/*
++ * Shut the pool down: mark it closing so that blktap_request_allocate()
++ * hands out nothing further, wait until no request is in use, then free
++ * every bucket.
++ */
++void
++blktap_request_pool_free(void)
++{
++ int i;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pool.lock, flags);
++
++ pool.status = BLKTAP_POOL_CLOSING;
++ /* The lock is dropped while sleeping and the count re-checked after. */
++ while (atomic_read(&pool.reqs_in_use)) {
++ spin_unlock_irqrestore(&pool.lock, flags);
++ wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use));
++ spin_lock_irqsave(&pool.lock, flags);
++ }
++
++ /* Empty slots are fine: blktap_request_pool_free_bucket() ignores NULL. */
++ for (i = 0; i < MAX_BUCKETS; i++) {
++ blktap_request_pool_free_bucket(pool.buckets[i]);
++ pool.buckets[i] = NULL;
++ }
++
++ spin_unlock_irqrestore(&pool.lock, flags);
++}
++
++/*
++ * Initialise the global request pool and pre-allocate two buckets.
++ * If a bucket allocation fails the pool is torn down again and the error
++ * is returned; otherwise returns 0.
++ */
++int __init
++blktap_request_pool_init(void)
++{
++	int nr, err;
++
++	memset(&pool, 0, sizeof(pool));
++
++	spin_lock_init(&pool.lock);
++	INIT_LIST_HEAD(&pool.free_list);
++	atomic_set(&pool.reqs_in_use, 0);
++	init_waitqueue_head(&pool.wait_queue);
++
++	for (nr = 0; nr < 2; nr++) {
++		err = blktap_request_pool_allocate_bucket();
++		if (err) {
++			blktap_request_pool_free();
++			return err;
++		}
++	}
++
++	return 0;
++}
+diff --git a/drivers/xen/blktap/ring.c b/drivers/xen/blktap/ring.c
+new file mode 100644
+index 0000000..74a7aa7
+--- /dev/null
++++ b/drivers/xen/blktap/ring.c
+@@ -0,0 +1,615 @@
++#include <linux/module.h>
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/poll.h>
++
++#include <asm/xen/page.h>
++#include <asm/xen/hypercall.h>
++
++#include "blktap.h"
++
++#ifdef CONFIG_XEN_BLKDEV_BACKEND
++#include "../blkback/blkback-pagemap.h"
++#else
++#define blkback_pagemap_contains_page(page) 0
++#endif
++
++static int blktap_ring_major;
++
++/*
++ * Recover the blktap that owns @vma: vm_private_data points at the
++ * foreign_map embedded in the tap's ring, which is embedded in the tap.
++ */
++static inline struct blktap *
++vma_to_blktap(struct vm_area_struct *vma)
++{
++	struct vm_foreign_map *fmap = vma->vm_private_data;
++	struct blktap_ring *tring;
++
++	tring = container_of(fmap, struct blktap_ring, foreign_map);
++	return container_of(tring, struct blktap, ring);
++}
++
++ /*
++ * BLKTAP - the first RING_PAGES page(s) of the mmap()ed area hold the
++ * shared memory ring; the request data pages follow immediately after.
++ */
++#define RING_PAGES 1
++
++/*
++ * Consume the responses that user space has placed on the ring and
++ * complete the matching pending requests, then run deferred taps.
++ *
++ * Responses whose id does not name a pending request are logged and
++ * skipped. Always returns 0.
++ */
++static int
++blktap_read_ring(struct blktap *tap)
++{
++	/* This is called to read responses from the ring. */
++	int usr_idx;
++	RING_IDX rc, rp;
++	struct blkif_response res;
++	struct blktap_ring *ring;
++	struct blktap_request *request;
++
++	down_read(&tap->tap_sem);
++
++	ring = &tap->ring;
++	if (!ring->vma) {
++		up_read(&tap->tap_sem);
++		return 0;
++	}
++
++	/* for each outstanding message on the ring */
++	rp = ring->ring.sring->rsp_prod;
++	rmb();
++
++	for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
++		/* Copy out first: the shared page stays writable by user space. */
++		memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
++		mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
++		++ring->ring.rsp_cons;
++
++		/*
++		 * res.id comes straight from the user-writable ring. After
++		 * the cast it can be negative, so both bounds must be checked
++		 * before it is used as an index into pending_requests[].
++		 */
++		usr_idx = (int)res.id;
++		if (usr_idx < 0 ||
++		    usr_idx >= MAX_PENDING_REQS ||
++		    !tap->pending_requests[usr_idx]) {
++			BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n",
++			       rc, rp, usr_idx, tap->pid, ring->vma);
++			continue;
++		}
++
++		request = tap->pending_requests[usr_idx];
++		BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
++		blktap_device_finish_request(tap, &res, request);
++	}
++
++	up_read(&tap->tap_sem);
++
++	blktap_run_deferred();
++
++	return 0;
++}
++
++/* Page faults on the ring mapping are never serviced: always SIGBUS. */
++static int
++blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++	return VM_FAULT_SIGBUS;
++}
++
++/*
++ * zap_pte handler of the ring VMA: tear down the mapping at @uvaddr.
++ *
++ * Addresses below ring->user_vstart (the ring page itself) are simply
++ * cleared. For request data pages the foreign-map entry is dropped and
++ * whichever grant mappings the segment holds (kernel and/or user) are
++ * unmapped through one grant-table hypercall; a user mapping without a
++ * grant handle is cleared directly. Returns the PTE value that was
++ * mapped.
++ *
++ * NOTE(review): tap->pending_requests[usr_idx] is used without a NULL
++ * check - presumably a present PTE implies a pending request; confirm.
++ */
++static pte_t
++blktap_ring_clear_pte(struct vm_area_struct *vma,
++ unsigned long uvaddr,
++ pte_t *ptep, int is_fullmm)
++{
++ pte_t copy;
++ struct blktap *tap;
++ unsigned long kvaddr;
++ struct page **map, *page;
++ struct blktap_ring *ring;
++ struct blktap_request *request;
++ struct grant_handle_pair *khandle;
++ struct gnttab_unmap_grant_ref unmap[2];
++ int offset, seg, usr_idx, count = 0;
++
++ tap = vma_to_blktap(vma);
++ ring = &tap->ring;
++ map = ring->foreign_map.map;
++ BUG_ON(!map); /* TODO Should this be changed to if statement? */
++
++ /*
++ * Zap entry if the address is before the start of the grant
++ * mapped region.
++ */
++ if (uvaddr < ring->user_vstart)
++ return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
++ ptep, is_fullmm);
++
++ /* Page index within the data area -> (request slot, segment). */
++ offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
++ usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
++ seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
++
++ /* The foreign map is indexed from the start of the VMA instead. */
++ offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
++ page = map[offset];
++ if (page) {
++ ClearPageReserved(page);
++ if (blkback_pagemap_contains_page(page))
++ set_page_private(page, 0);
++ }
++ map[offset] = NULL;
++
++ request = tap->pending_requests[usr_idx];
++ kvaddr = request_to_kaddr(request, seg);
++ khandle = request->handles + seg;
++
++ /* Queue an unmap for the kernel-side grant mapping, if any. */
++ if (khandle->kernel != INVALID_GRANT_HANDLE) {
++ gnttab_set_unmap_op(&unmap[count], kvaddr,
++ GNTMAP_host_map, khandle->kernel);
++ count++;
++
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++ INVALID_P2M_ENTRY);
++ }
++
++
++ /* Same for the user-side mapping; otherwise clear the PTE directly. */
++ if (khandle->user != INVALID_GRANT_HANDLE) {
++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++
++ copy = *ptep;
++ gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep).maddr,
++ GNTMAP_host_map
++ | GNTMAP_application_map
++ | GNTMAP_contains_pte,
++ khandle->user);
++ count++;
++ } else
++ copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
++ is_fullmm);
++
++ if (count)
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++ unmap, count))
++ BUG();
++
++ khandle->kernel = INVALID_GRANT_HANDLE;
++ khandle->user = INVALID_GRANT_HANDLE;
++
++ return copy;
++}
++
++/*
++ * unmap handler of the ring VMA: under tap_sem held for writing, drop the
++ * RING_VMA, PAUSED and PAUSE_REQUESTED state bits of the owning tap.
++ */
++static void
++blktap_ring_vm_unmap(struct vm_area_struct *vma)
++{
++	struct blktap *tap;
++
++	tap = vma_to_blktap(vma);
++
++	down_write(&tap->tap_sem);
++
++	clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
++	clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++	clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++
++	up_write(&tap->tap_sem);
++}
++
++/*
++ * close handler of the ring VMA: fail outstanding and deferred I/O, then,
++ * under tap_sem held for writing, zap the whole mapping, free the foreign
++ * map and the shared ring page and detach the VMA from the ring. Waiters
++ * on tap->wq are woken afterwards.
++ */
++static void
++blktap_ring_vm_close(struct vm_area_struct *vma)
++{
++ struct blktap *tap = vma_to_blktap(vma);
++ struct blktap_ring *ring = &tap->ring;
++
++ blktap_ring_vm_unmap(vma); /* fail future requests */
++ blktap_device_fail_pending_requests(tap); /* fail pending requests */
++ blktap_device_restart(tap); /* fail deferred requests */
++
++ down_write(&tap->tap_sem);
++
++ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
++
++ kfree(ring->foreign_map.map);
++ ring->foreign_map.map = NULL;
++
++ /* Free the ring page. */
++ ClearPageReserved(virt_to_page(ring->ring.sring));
++ free_page((unsigned long)ring->ring.sring);
++
++ BTINFO("unmapping ring %d\n", tap->minor);
++ /* Other paths test these two pointers to see whether a ring is mapped. */
++ ring->ring.sring = NULL;
++ ring->vma = NULL;
++
++ up_write(&tap->tap_sem);
++
++ wake_up(&tap->wq);
++}
++
++/*
++ * VMA callbacks installed on the ring mapping by blktap_ring_mmap().
++ * NOTE(review): .unmap and .zap_pte are presumably hooks added by the Xen
++ * patch set rather than mainline vm_operations_struct members - confirm.
++ */
++static struct vm_operations_struct blktap_ring_vm_operations = {
++ .close = blktap_ring_vm_close,
++ .unmap = blktap_ring_vm_unmap,
++ .fault = blktap_ring_fault,
++ .zap_pte = blktap_ring_clear_pte,
++};
++
++/*
++ * open() handler of the ring character device.
++ *
++ * Returns -ENODEV if the minor does not name an existing tap or the tap
++ * has no control handle, and -EBUSY if the ring is already open. On
++ * success the tap is stored in filp->private_data.
++ */
++static int
++blktap_ring_open(struct inode *inode, struct file *filp)
++{
++	int idx;
++	struct blktap *tap;
++
++	idx = iminor(inode);
++	/*
++	 * Valid indices are 0 .. MAX_BLKTAP_DEVICE - 1, so reject
++	 * idx == MAX_BLKTAP_DEVICE as well.
++	 * NOTE(review): assumes blktaps[] is declared with
++	 * MAX_BLKTAP_DEVICE entries - confirm against blktap.h.
++	 */
++	if (idx < 0 || idx >= MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) {
++		BTERR("unable to open device blktap%d\n", idx);
++		return -ENODEV;
++	}
++
++	tap = blktaps[idx];
++
++	BTINFO("opening device blktap%d\n", idx);
++
++	if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
++		return -ENODEV;
++
++	/* Only one process can access ring at a time */
++	if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
++		return -EBUSY;
++
++	filp->private_data = tap;
++	BTINFO("opened device %d\n", tap->minor);
++
++	return 0;
++}
++
++/*
++ * release() handler of the ring device: give up exclusive ownership of
++ * the ring (BLKTAP_RING_FD) and wake anyone waiting on tap->wq.
++ */
++static int
++blktap_ring_release(struct inode *inode, struct file *filp)
++{
++	struct blktap *tap;
++
++	tap = filp->private_data;
++	BTINFO("freeing device %d\n", tap->minor);
++
++	clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
++	filp->private_data = NULL;
++
++	wake_up(&tap->wq);
++
++	return 0;
++}
++
++/* Note on mmap:
++ * We need to map pages to user space in a way that will allow the block
++ * subsystem set up direct IO to them. This couldn't be done before, because
++ * there isn't really a sane way to translate a user virtual address down to a
++ * physical address when the page belongs to another domain.
++ *
++ * My first approach was to map the page in to kernel memory, add an entry
++ * for it in the physical frame list (using alloc_lomem_region as in blkback)
++ * and then attempt to map that page up to user space. This is disallowed
++ * by xen though, which realizes that we don't really own the machine frame
++ * underlying the physical page.
++ *
++ * The new approach is to provide explicit support for this in xen linux.
++ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
++ * mapped from other vms. vma->vm_private_data is set up as a mapping
++ * from pages to actual page structs. There is a new clause in get_user_pages
++ * that does the right thing for this sort of mapping.
++ */
++/*
++ * mmap() handler of the ring device.
++ *
++ * The mapping must span exactly MMAP_PAGES + RING_PAGES pages. The first
++ * page is backed by a freshly allocated shared ring; the rest is left
++ * unpopulated and tracked through the VM_FOREIGN map.
++ *
++ * Returns 0 on success, -EAGAIN for a wrongly sized mapping, and -ENOMEM
++ * if the ring is already mapped, an allocation fails, or the ring page
++ * cannot be mapped. On every failure after the BLKTAP_RING_VMA bit has
++ * been claimed the bit is released again, so a later mmap() can succeed.
++ */
++static int
++blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
++{
++	int size, err;
++	struct page **map;
++	struct blktap *tap;
++	struct blkif_sring *sring;
++	struct blktap_ring *ring;
++
++	tap   = filp->private_data;
++	map   = NULL;
++	sring = NULL;
++
++	if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
++		return -ENOMEM;
++
++	ring = &tap->ring;
++
++	size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++	if (size != (MMAP_PAGES + RING_PAGES)) {
++		BTERR("you _must_ map exactly %lu pages!\n",
++		      MMAP_PAGES + RING_PAGES);
++		err = -EAGAIN;
++		goto fail_busy;
++	}
++
++	/* Allocate the fe ring. */
++	sring = (struct blkif_sring *)get_zeroed_page(GFP_KERNEL);
++	if (!sring) {
++		BTERR("Couldn't alloc sring.\n");
++		goto fail_mem;
++	}
++
++	map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
++	if (!map) {
++		BTERR("Couldn't alloc VM_FOREIGN map.\n");
++		goto fail_mem;
++	}
++
++	SetPageReserved(virt_to_page(sring));
++
++	SHARED_RING_INIT(sring);
++	FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
++
++	ring->ring_vstart = vma->vm_start;
++	ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);
++
++	/* Map the ring pages to the start of the region and reserve it. */
++	if (xen_feature(XENFEAT_auto_translated_physmap))
++		err = vm_insert_page(vma, vma->vm_start,
++				     virt_to_page(ring->ring.sring));
++	else
++		err = remap_pfn_range(vma, vma->vm_start,
++				      __pa(ring->ring.sring) >> PAGE_SHIFT,
++				      PAGE_SIZE, vma->vm_page_prot);
++	if (err) {
++		BTERR("Mapping user ring failed: %d\n", err);
++		goto fail;
++	}
++
++	/* Mark this VM as containing foreign pages, and set up mappings. */
++	ring->foreign_map.map = map;
++	vma->vm_private_data = &ring->foreign_map;
++	vma->vm_flags |= VM_FOREIGN;
++	vma->vm_flags |= VM_DONTCOPY;
++	vma->vm_flags |= VM_RESERVED;
++	vma->vm_ops = &blktap_ring_vm_operations;
++
++#ifdef CONFIG_X86
++	vma->vm_mm->context.has_foreign_mappings = 1;
++#endif
++
++	tap->pid = current->pid;
++	BTINFO("blktap: mapping pid is %d\n", tap->pid);
++
++	ring->vma = vma;
++	return 0;
++
++ fail:
++	/* Clear any active mappings. */
++	zap_page_range(vma, vma->vm_start,
++		       vma->vm_end - vma->vm_start, NULL);
++	ClearPageReserved(virt_to_page(sring));
++	/*
++	 * FRONT_RING_INIT() stored sring in the ring; do not leave that
++	 * pointer behind once the page is freed, since other code tests
++	 * ring->ring.sring before writing through it.
++	 */
++	ring->ring.sring = NULL;
++ fail_mem:
++	/* sring may be 0 and map NULL here; both frees tolerate that. */
++	free_page((unsigned long)sring);
++	kfree(map);
++	err = -ENOMEM;
++ fail_busy:
++	/* Give the VMA slot back so a later mmap() is not refused. */
++	clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
++	return err;
++}
++
++/*
++ * Post @msg to user space through pad[0] of the shared ring page.
++ * Silently does nothing when no ring page is currently mapped.
++ */
++static inline void
++blktap_ring_set_message(struct blktap *tap, int msg)
++{
++	struct blkif_sring *sring;
++
++	down_read(&tap->tap_sem);
++
++	sring = tap->ring.ring.sring;
++	if (sring)
++		sring->pad[0] = msg;
++
++	up_read(&tap->tap_sem);
++}
++
++/*
++ * ioctl() handler of the ring device, used by the user-space tapdisk.
++ *
++ *  KICK_FE        process responses placed on the ring
++ *  CREATE_DEVICE  copy in blktap_params and create the block device
++ *  SET_PARAMS     replace the params of a paused tap
++ *  PAUSE          acknowledge a requested pause
++ *  REOPEN         copy the tap's name out to user space (paused only)
++ *  RESUME         report the resume result in @arg; 0 un-pauses the tap
++ *
++ * Returns the command's result, -EINVAL for a missing argument or a
++ * command issued in the wrong state, -EFAULT for a failed user copy and
++ * -ENOIOCTLCMD for an unknown command.
++ */
++static int
++blktap_ring_ioctl(struct inode *inode, struct file *filp,
++		  unsigned int cmd, unsigned long arg)
++{
++	struct blktap_params params;
++	struct blktap *tap = filp->private_data;
++
++	BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
++
++	switch (cmd) {
++	case BLKTAP2_IOCTL_KICK_FE:
++		/* There are fe messages to process. */
++		return blktap_read_ring(tap);
++
++	case BLKTAP2_IOCTL_CREATE_DEVICE:
++		if (!arg)
++			return -EINVAL;
++
++		if (copy_from_user(&params, (struct blktap_params __user *)arg,
++				   sizeof(params))) {
++			BTERR("failed to get params\n");
++			return -EFAULT;
++		}
++
++		if (blktap_validate_params(tap, &params)) {
++			BTERR("invalid params\n");
++			return -EINVAL;
++		}
++
++		tap->params = params;
++		return blktap_device_create(tap);
++
++	case BLKTAP2_IOCTL_SET_PARAMS:
++		if (!arg)
++			return -EINVAL;
++
++		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++			return -EINVAL;
++
++		if (copy_from_user(&params, (struct blktap_params __user *)arg,
++				   sizeof(params))) {
++			BTERR("failed to get params\n");
++			return -EFAULT;
++		}
++
++		if (blktap_validate_params(tap, &params)) {
++			BTERR("invalid params\n");
++			return -EINVAL;
++		}
++
++		tap->params = params;
++		return 0;
++
++	case BLKTAP2_IOCTL_PAUSE:
++		if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
++			return -EINVAL;
++
++		set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++		clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++
++		/* Clear the pending message and wake the waiting pauser. */
++		blktap_ring_set_message(tap, 0);
++		wake_up_interruptible(&tap->wq);
++
++		return 0;
++
++
++	case BLKTAP2_IOCTL_REOPEN:
++		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++			return -EINVAL;
++
++		if (!arg)
++			return -EINVAL;
++
++		/* NOTE(review): relies on params.name being NUL-terminated. */
++		if (copy_to_user((char __user *)arg,
++				 tap->params.name,
++				 strlen(tap->params.name) + 1))
++			return -EFAULT;
++
++		blktap_ring_set_message(tap, 0);
++		wake_up_interruptible(&tap->wq);
++
++		return 0;
++
++	case BLKTAP2_IOCTL_RESUME:
++		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++			return -EINVAL;
++
++		/* A non-zero response leaves the tap paused. */
++		tap->ring.response = (int)arg;
++		if (!tap->ring.response)
++			clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++
++		blktap_ring_set_message(tap, 0);
++		wake_up_interruptible(&tap->wq);
++
++		return 0;
++	}
++
++	return -ENOIOCTLCMD;
++}
++
++/*
++ * poll() handler of the ring device.
++ *
++ * Reports the ring readable when a message is pending in pad[0] or when
++ * requests have been produced privately but not yet published; in that
++ * case the requests are pushed to the shared ring first. Returns 0 when
++ * nothing is pending or when no ring page is mapped yet.
++ */
++static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
++{
++	struct blktap *tap = filp->private_data;
++	struct blktap_ring *ring = &tap->ring;
++	unsigned int mask = 0;
++
++	poll_wait(filp, &ring->poll_wait, wait);
++
++	/*
++	 * The shared ring page exists only while the ring is mmap()ed. Hold
++	 * tap_sem for reading, as blktap_ring_set_message() does, so the
++	 * page cannot be torn down while it is inspected, and skip the
++	 * checks entirely when no page is mapped.
++	 */
++	down_read(&tap->tap_sem);
++	if (ring->ring.sring &&
++	    (ring->ring.sring->pad[0] != 0 ||
++	     ring->ring.req_prod_pvt != ring->ring.sring->req_prod)) {
++		RING_PUSH_REQUESTS(&ring->ring);
++		mask = POLLIN | POLLRDNORM;
++	}
++	up_read(&tap->tap_sem);
++
++	return mask;
++}
++
++/* File operations of the per-tap ring character device. */
++static struct file_operations blktap_ring_file_operations = {
++ .owner = THIS_MODULE,
++ .open = blktap_ring_open,
++ .release = blktap_ring_release,
++ .ioctl = blktap_ring_ioctl,
++ .mmap = blktap_ring_mmap,
++ .poll = blktap_ring_poll,
++};
++/* Wake whoever sleeps on the ring's poll_wait queue (the queue blktap_ring_poll() registers on). */
++void
++blktap_ring_kick_user(struct blktap *tap)
++{
++ wake_up_interruptible(&tap->ring.poll_wait);
++}
++/* Post a RESUME message to user space and wait for the answer; 0 on success, non-zero error otherwise. */
++int
++blktap_ring_resume(struct blktap *tap)
++{
++ int err;
++ struct blktap_ring *ring = &tap->ring;
++
++ if (!blktap_active(tap))
++ return -ENODEV;
++
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
++
++ /* set shared flag for resume */
++ ring->response = 0;
++
++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
++ blktap_ring_kick_user(tap);
++ /* BLKTAP2_IOCTL_RESUME stores ring.response and clears BLKTAP_PAUSED when it is 0 */
++ wait_event_interruptible(tap->wq, ring->response ||
++ !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
++
++ err = ring->response;
++ ring->response = 0;
++
++ BTDBG("err: %d\n", err);
++
++ if (err)
++ return err;
++ /* the wait's own return value is not checked; still being paused here means "try again" */
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EAGAIN;
++
++ return 0;
++}
++/* Drain pending requests, post a PAUSE message and wait until BLKTAP_PAUSED is set; 0 or negative errno. */
++int
++blktap_ring_pause(struct blktap *tap)
++{
++ if (!blktap_active(tap))
++ return -ENODEV;
++
++ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
++ return -EINVAL;
++ /* an interrupted wait leaves pending_cnt non-zero and is reported as -EAGAIN */
++ BTDBG("draining queue\n");
++ wait_event_interruptible(tap->wq, !tap->pending_cnt);
++ if (tap->pending_cnt)
++ return -EAGAIN;
++
++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
++ blktap_ring_kick_user(tap);
++ /* BLKTAP2_IOCTL_PAUSE sets BLKTAP_PAUSED and wakes tap->wq */
++ BTDBG("waiting for tapdisk response\n");
++ wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EAGAIN;
++
++ return 0;
++}
++/* Returns 0 once neither BLKTAP_RING_FD nor BLKTAP_RING_VMA is set; otherwise posts CLOSE and returns -EAGAIN. */
++int
++blktap_ring_destroy(struct blktap *tap)
++{
++ if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) &&
++ !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
++ return 0;
++ /* ring still in use: ask user space to close it (presumably the caller retries later -- confirm) */
++ BTDBG("sending tapdisk close message\n");
++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
++ blktap_ring_kick_user(tap);
++
++ return -EAGAIN;
++}
++/* Zero the ring state, set up its poll wait queue and derive its dev_t from the ring major and minor. */
++static void
++blktap_ring_initialize(struct blktap_ring *ring, int minor)
++{
++ memset(ring, 0, sizeof(*ring));
++ init_waitqueue_head(&ring->poll_wait);
++ ring->devno = MKDEV(blktap_ring_major, minor);
++}
++/* Reset tap's ring state, then create its sysfs device; returns blktap_sysfs_create()'s result. */
++int
++blktap_ring_create(struct blktap *tap)
++{
++ blktap_ring_initialize(&tap->ring, tap->minor);
++
++ return blktap_sysfs_create(tap);
++}
++/* Register the "blktap2" ring char device with a dynamic major and report that major to the caller. */
++int __init
++blktap_ring_init(int *major)
++{
++ int ret = register_chrdev(0, "blktap2", &blktap_ring_file_operations);
++
++ if (ret < 0) {
++ BTERR("error registering blktap ring device: %d\n", ret);
++ return ret;
++ }
++ /* a non-negative result is stored as the ring major */
++ *major = ret;
++ blktap_ring_major = ret;
++ BTINFO("blktap ring major: %d\n", blktap_ring_major);
++ return 0;
++}
++/* Unregister the ring char device if blktap_ring_init() obtained a major; always returns 0. */
++int
++blktap_ring_free(void)
++{
++ if (!blktap_ring_major)
++ return 0;
++ unregister_chrdev(blktap_ring_major, "blktap2");
++ return 0;
++}
+diff --git a/drivers/xen/blktap/sysfs.c b/drivers/xen/blktap/sysfs.c
+new file mode 100644
+index 0000000..23a3a51
+--- /dev/null
++++ b/drivers/xen/blktap/sysfs.c
+@@ -0,0 +1,451 @@
++#include <linux/types.h>
++#include <linux/device.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++
++#include "blktap.h"
++
++int blktap_debug_level = 1;
++
++static struct class *class;
++static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq);
++/* Take a sysfs reference; blktap_sysfs_destroy() waits for the count to reach zero. */
++static inline void
++blktap_sysfs_get(struct blktap *tap)
++{
++ atomic_inc(&tap->ring.sysfs_refcnt);
++}
++/* Drop a sysfs reference and wake sysfs_wq waiters when the last one goes away. */
++static inline void
++blktap_sysfs_put(struct blktap *tap)
++{
++ if (atomic_dec_and_test(&tap->ring.sysfs_refcnt))
++ wake_up(&sysfs_wq);
++}
++/* Common prologue of the attribute handlers: take a reference, then the per-tap sysfs mutex. */
++static inline void
++blktap_sysfs_enter(struct blktap *tap)
++{
++ blktap_sysfs_get(tap); /* pin sysfs device */
++ mutex_lock(&tap->ring.sysfs_mutex); /* serialize sysfs operations */
++}
++/* Counterpart of blktap_sysfs_enter(): release the mutex first, then the reference. */
++static inline void
++blktap_sysfs_exit(struct blktap *tap)
++{
++ mutex_unlock(&tap->ring.sysfs_mutex);
++ blktap_sysfs_put(tap);
++}
++
++#define CLASS_DEVICE_ATTR(a,b,c,d) DEVICE_ATTR(a,b,c,d)
++
++static ssize_t blktap_sysfs_pause_device(struct device *, struct device_attribute *, const char *, size_t);
++CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
++static ssize_t blktap_sysfs_resume_device(struct device *, struct device_attribute *, const char *, size_t);
++CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
++/* "name" store handler: copy buf into tap->params.name; returns size on success or a negative errno. */
++static ssize_t
++blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size)
++{
++ int err;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++
++ blktap_sysfs_enter(tap);
++
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
++ }
++ /* the name may only be changed while the tap is paused */
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = -EPERM;
++ goto out;
++ }
++
++ if (size > BLKTAP2_MAX_MESSAGE_LEN) {
++ err = -ENAMETOOLONG;
++ goto out;
++ }
++ /* reject input with no NUL terminator within the first BLKTAP2_MAX_MESSAGE_LEN bytes */
++ if (strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >= BLKTAP2_MAX_MESSAGE_LEN) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ snprintf(tap->params.name, sizeof(tap->params.name) - 1, "%s", buf);
++ err = size;
++
++out:
++ blktap_sysfs_exit(tap);
++ return err;
++}
++/* "name" show handler: print the tap's name, or its minor number while no name is set. */
++static ssize_t
++blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf)
++{
++ ssize_t size;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++
++ blktap_sysfs_enter(tap);
++ /* ring.dev is cleared by blktap_sysfs_destroy(), which turns this into -ENODEV */
++ if (!tap->ring.dev)
++ size = -ENODEV;
++ else if (tap->params.name[0])
++ size = sprintf(buf, "%s\n", tap->params.name);
++ else
++ size = sprintf(buf, "%d\n", tap->minor);
++
++ blktap_sysfs_exit(tap);
++
++ return size;
++}
++CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
++ blktap_sysfs_get_name, blktap_sysfs_set_name);
++/* "remove" store handler: flag the tap for shutdown and start destroying it; size, -EBUSY or the destroy error. */
++static ssize_t
++blktap_sysfs_remove_device(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
++{
++ int err;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++
++ if (!tap->ring.dev)
++ return size;
++ /* only the first writer starts the shutdown; later ones get -EBUSY */
++ if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++ return -EBUSY;
++
++ err = blktap_control_destroy_device(tap);
++
++ return (err ? : size);
++}
++CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
++/* "pause" store handler: pause the tap via blktap_device_pause(); returns size on success or an error code. */
++static ssize_t
++blktap_sysfs_pause_device(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
++{
++ int err;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++
++ blktap_sysfs_enter(tap);
++
++ BTDBG("pausing %u:%u: dev_inuse: %lu\n",
++ MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse);
++
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
++ }
++
++ if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++ err = -EBUSY;
++ goto out;
++ }
++ /* already paused: report success without doing anything */
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = 0;
++ goto out;
++ }
++ /* on success the "pause" attribute is swapped for "resume" */
++ err = blktap_device_pause(tap);
++ if (!err) {
++ device_remove_file(dev, &dev_attr_pause);
++ err = device_create_file(dev, &dev_attr_resume);
++ }
++
++out:
++ blktap_sysfs_exit(tap);
++
++ return (err ? err : size);
++}
++/* "resume" store handler: resume a paused tap via blktap_device_resume(); size on success or an error code. */
++static ssize_t
++blktap_sysfs_resume_device(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
++{
++ int err;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++
++ blktap_sysfs_enter(tap);
++
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
++ }
++
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = -EINVAL;
++ goto out;
++ }
++ /* on success the "resume" attribute is swapped back for "pause" */
++ err = blktap_device_resume(tap);
++ if (!err) {
++ device_remove_file(dev, &dev_attr_resume);
++ err = device_create_file(dev, &dev_attr_pause);
++ }
++
++out:
++ blktap_sysfs_exit(tap);
++
++ BTDBG("returning %zd\n", (err ? err : size));
++ return (err ? err : size);
++}
++/* Passthrough store handler (built only with ENABLE_PASSTHROUGH): parse "major:minor" and enable passthrough. */
++#ifdef ENABLE_PASSTHROUGH
++static ssize_t
++blktap_sysfs_enable_passthrough(struct device *dev,
++ const char *buf, size_t size)
++{
++ int err;
++ unsigned major, minor;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++
++ BTINFO("passthrough request enabled\n");
++
++ blktap_sysfs_enter(tap);
++
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
++ }
++
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
++ err = -EINVAL;
++ goto out;
++ }
++ /* both numbers are parsed as hexadecimal */
++ err = sscanf(buf, "%x:%x", &major, &minor);
++ if (err != 2) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ err = blktap_device_enable_passthrough(tap, major, minor);
++
++out:
++ blktap_sysfs_exit(tap);
++ BTDBG("returning %zd\n", (err ? err : size)); /* operand is size_t-wide, so %zd as in blktap_sysfs_resume_device() */
++ return (err ? err : size);
++}
++#endif
++/* "debug" show handler: dump tap state and every pending request into the one-page buf; returns bytes written. */
++static ssize_t
++blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf)
++{
++ char *tmp = buf;
++ char *end = buf + PAGE_SIZE; /* sysfs hands show() a single page */
++ int i, ret;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++
++ blktap_sysfs_get(tap);
++
++ if (!tap->ring.dev) {
++ ret = scnprintf(tmp, end - tmp, "no device\n");
++ goto out;
++ }
++ /* scnprintf() returns what was actually stored, so tmp can never run past end */
++ tmp += scnprintf(tmp, end - tmp, "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n",
++ tap->params.name, MAJOR(tap->ring.devno),
++ MINOR(tap->ring.devno), atomic_read(&tap->refcnt),
++ tap->dev_inuse);
++ tmp += scnprintf(tmp, end - tmp, "capacity: 0x%llx, sector size: 0x%lx, "
++ "device users: %d\n", tap->params.capacity,
++ tap->params.sector_size, tap->device.users);
++
++ down_read(&tap->tap_sem);
++
++ tmp += scnprintf(tmp, end - tmp, "pending requests: %d\n", tap->pending_cnt);
++ for (i = 0; i < MAX_PENDING_REQS; i++) {
++ struct blktap_request *req = tap->pending_requests[i];
++ if (!req)
++ continue;
++
++ tmp += scnprintf(tmp, end - tmp, "req %d: id: %llu, usr_idx: %d, "
++ "status: 0x%02x, pendcnt: %d, "
++ "nr_pages: %u, op: %d, time: %lu:%lu\n",
++ i, (unsigned long long)req->id, req->usr_idx,
++ req->status, atomic_read(&req->pendcnt),
++ req->nr_pages, req->operation, req->time.tv_sec,
++ req->time.tv_usec);
++ }
++
++ up_read(&tap->tap_sem);
++ ret = tmp - buf; /* bytes produced; the terminating NUL is not part of the attribute */
++
++out:
++ blktap_sysfs_put(tap);
++ BTDBG("%s\n", buf);
++
++ return ret;
++}
++CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
++/* Create the "blktap<minor>" class device plus its name/remove/pause/debug attributes; 0 or negative errno. */
++int
++blktap_sysfs_create(struct blktap *tap)
++{
++ struct blktap_ring *ring;
++ struct device *dev;
++ int err;
++
++ if (!class)
++ return -ENODEV;
++
++ ring = &tap->ring;
++ dev = device_create(class, NULL, ring->devno,
++ tap, "blktap%d", tap->minor);
++ if (IS_ERR(dev))
++ return PTR_ERR(dev);
++
++ ring->dev = dev;
++
++ mutex_init(&ring->sysfs_mutex);
++ atomic_set(&ring->sysfs_refcnt, 0);
++
++ printk(KERN_DEBUG "%s: adding attributes for dev %p\n", __func__, dev);
++ err = device_create_file(dev, &dev_attr_name);
++ if (err)
++ goto out;
++ err = device_create_file(dev, &dev_attr_remove);
++ if (err)
++ goto out_unregister_name;
++ err = device_create_file(dev, &dev_attr_pause);
++ if (err)
++ goto out_unregister_remove;
++ err = device_create_file(dev, &dev_attr_debug);
++ if (err)
++ goto out_unregister_pause;
++
++ return 0;
++
++out_unregister_pause:
++ device_remove_file(dev, &dev_attr_pause);
++out_unregister_remove:
++ device_remove_file(dev, &dev_attr_remove);
++out_unregister_name:
++ device_remove_file(dev, &dev_attr_name);
++out:
++ ring->dev = NULL; /* don't leave a half-built device for blktap_sysfs_destroy() to find */
++ device_unregister(dev);
++ return err;
++}
++/* Hide the sysfs device, wait for in-flight handlers, then schedule its unregistration; 0 or -EAGAIN. */
++int
++blktap_sysfs_destroy(struct blktap *tap)
++{
++ struct blktap_ring *ring;
++ struct device *dev;
++
++ printk(KERN_DEBUG "%s\n", __func__);
++ ring = &tap->ring;
++ dev = ring->dev;
++ if (!class || !dev)
++ return 0;
++
++ ring->dev = NULL; /* attribute handlers now report "no device" */
++ if (wait_event_interruptible(sysfs_wq,
++ !atomic_read(&tap->ring.sysfs_refcnt))) {
++ ring->dev = dev; /* interrupted: keep the device reachable so a retry can still unregister it */
++ return -EAGAIN;
++ }
++
++ device_schedule_callback(dev, device_unregister);
++ return 0;
++}
++/* Class attribute "verbosity" show handler: print the current blktap_debug_level. */
++static ssize_t
++blktap_sysfs_show_verbosity(struct class *class, char *buf)
++{
++ return sprintf(buf, "%d\n", blktap_debug_level);
++}
++/* Class attribute "verbosity" store handler: set blktap_debug_level; returns size or -EINVAL. */
++static ssize_t
++blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size)
++{
++ int parsed;
++
++ /* input that does not start with a decimal integer is rejected */
++ if (sscanf(buf, "%d", &parsed) != 1)
++ return -EINVAL;
++
++ blktap_debug_level = parsed;
++ return size;
++}
++CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR,
++ blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
++/* Class attribute "devices": one "<minor> <name>" line per tap that has BLKTAP_DEVICE set. */
++static ssize_t
++blktap_sysfs_show_devices(struct class *class, char *buf)
++{
++ int i, ret;
++ struct blktap *tap;
++
++ ret = 0;
++ for (i = 0; i < MAX_BLKTAP_DEVICE; i++) {
++ tap = blktaps[i];
++ if (!tap)
++ continue;
++
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ continue;
++
++ /* the name is user-supplied: print it as data, never as the format, and stay inside the page */
++ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%d ", tap->minor);
++ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s\n",
++ tap->params.name);
++ }
++
++ return ret;
++}
++CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL);
++/* Remove the class attributes and destroy the blktap2 class, if it was ever created. */
++void
++blktap_sysfs_free(void)
++{
++ /* nothing to undo when blktap_sysfs_init() never published the class */
++ if (class) {
++ class_remove_file(class, &class_attr_verbosity);
++ class_remove_file(class, &class_attr_devices);
++
++ class_destroy(class);
++ }
++}
++/* Create the "blktap2" class with its verbosity and devices attributes; publish it only on full success. */
++int __init
++blktap_sysfs_init(void)
++{
++ struct class *cls;
++ int err;
++
++ if (class)
++ return -EEXIST;
++
++ cls = class_create(THIS_MODULE, "blktap2");
++ if (IS_ERR(cls))
++ return PTR_ERR(cls);
++
++ /* the second attribute is only attempted when the first one succeeded */
++ err = class_create_file(cls, &class_attr_verbosity);
++ if (!err)
++ err = class_create_file(cls, &class_attr_devices);
++ if (err) {
++ /* the global class pointer was not set yet, so destroying cls is all the cleanup */
++ class_destroy(cls);
++ return err;
++ }
++
++ class = cls;
++ return 0;
++}
+diff --git a/drivers/xen/blktap/wait_queue.c b/drivers/xen/blktap/wait_queue.c
+new file mode 100644
+index 0000000..f8995aa
+--- /dev/null
++++ b/drivers/xen/blktap/wait_queue.c
+@@ -0,0 +1,40 @@
++#include <linux/list.h>
++#include <linux/spinlock.h>
++
++#include "blktap.h"
++
++static LIST_HEAD(deferred_work_queue);
++static DEFINE_SPINLOCK(deferred_work_lock);
++/* Restart every tap queued by blktap_defer(). */
++void
++blktap_run_deferred(void)
++{
++ LIST_HEAD(queue);
++ struct blktap *tap;
++ unsigned long flags;
++ /* under the lock: move the whole pending list to a private one and clear the queued marks */
++ spin_lock_irqsave(&deferred_work_lock, flags);
++ list_splice_init(&deferred_work_queue, &queue);
++ list_for_each_entry(tap, &queue, deferred_queue)
++ clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
++ spin_unlock_irqrestore(&deferred_work_lock, flags);
++ /* the restarts themselves run without deferred_work_lock held */
++ while (!list_empty(&queue)) {
++ tap = list_entry(queue.next, struct blktap, deferred_queue);
++ list_del_init(&tap->deferred_queue);
++ blktap_device_restart(tap);
++ }
++}
++/* Queue tap for blktap_run_deferred(); the BLKTAP_DEFERRED bit keeps it from being queued twice. */
++void
++blktap_defer(struct blktap *tap)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&deferred_work_lock, flags);
++ if (!test_bit(BLKTAP_DEFERRED, &tap->dev_inuse)) {
++ set_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
++ list_add_tail(&tap->deferred_queue, &deferred_work_queue);
++ }
++ spin_unlock_irqrestore(&deferred_work_lock, flags);
++}
+diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
+index bdfd584..6625ffe 100644
+--- a/drivers/xen/cpu_hotplug.c
++++ b/drivers/xen/cpu_hotplug.c
+@@ -1,5 +1,6 @@
+ #include <linux/notifier.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+
+ #include <asm/xen/hypervisor.h>
+diff --git a/drivers/xen/events.c b/drivers/xen/events.c
+index ce602dd..925e7a1 100644
+--- a/drivers/xen/events.c
++++ b/drivers/xen/events.c
+@@ -16,7 +16,7 @@
+ * (typically dom0).
+ * 2. VIRQs, typically used for timers. These are per-cpu events.
+ * 3. IPIs.
+- * 4. Hardware interrupts. Not supported at present.
++ * 4. PIRQs - Hardware interrupts.
+ *
+ * Jeremy Fitzhardinge <jeremy at xensource.com>, XenSource Inc, 2007
+ */
+@@ -27,10 +27,15 @@
+ #include <linux/module.h>
+ #include <linux/string.h>
+ #include <linux/bootmem.h>
++#include <linux/irqnr.h>
++#include <linux/pci_regs.h>
++#include <linux/pci.h>
++#include <linux/msi.h>
+
+ #include <asm/ptrace.h>
+ #include <asm/irq.h>
+ #include <asm/idle.h>
++#include <asm/io_apic.h>
+ #include <asm/sync_bitops.h>
+ #include <asm/xen/hypercall.h>
+ #include <asm/xen/hypervisor.h>
+@@ -40,6 +45,8 @@
+ #include <xen/interface/xen.h>
+ #include <xen/interface/event_channel.h>
+
++#include "../pci/msi.h"
++
+ /*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+@@ -67,7 +74,7 @@ enum xen_irq_type {
+ * event channel - irq->event channel mapping
+ * cpu - cpu this event channel is bound to
+ * index - type-specific information:
+- * PIRQ - vector, with MSB being "needs EIO"
++ * PIRQ - pirq number, plus flags (PIRQ_NEEDS_EOI, PIRQ_SHAREABLE)
+ * VIRQ - virq number
+ * IPI - IPI vector
+ * EVTCHN -
+@@ -82,21 +89,26 @@ struct irq_info
+ unsigned short virq;
+ enum ipi_vector ipi;
+ struct {
+- unsigned short gsi;
+- unsigned short vector;
++ unsigned short nr;
++ unsigned char flags;
+ } pirq;
+ } u;
+ };
++#define PIRQ_NEEDS_EOI (1 << 0)
++#define PIRQ_SHAREABLE (1 << 1)
+
+-static struct irq_info irq_info[NR_IRQS];
++static struct irq_info *irq_info;
+
+-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+- [0 ... NR_EVENT_CHANNELS-1] = -1
+-};
++static int *evtchn_to_irq;
+ struct cpu_evtchn_s {
+ unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
+ };
+-static struct cpu_evtchn_s *cpu_evtchn_mask_p;
++
++static __initdata struct cpu_evtchn_s init_evtchn_mask = {
++ .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
++};
++static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
++
+ static inline unsigned long *cpu_evtchn_mask(int cpu)
+ {
+ return cpu_evtchn_mask_p[cpu].bits;
+@@ -106,6 +118,7 @@ static inline unsigned long *cpu_evtchn_mask(int cpu)
+ #define VALID_EVTCHN(chn) ((chn) != 0)
+
+ static struct irq_chip xen_dynamic_chip;
++static struct irq_chip xen_pirq_chip;
+
+ /* Constructor for packed IRQ information. */
+ static struct irq_info mk_unbound_info(void)
+@@ -132,10 +145,10 @@ static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq)
+ }
+
+ static struct irq_info mk_pirq_info(unsigned short evtchn,
+- unsigned short gsi, unsigned short vector)
++ unsigned short pirq)
+ {
+ return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
+- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
++ .cpu = 0, .u.pirq = { .nr = pirq } };
+ }
+
+ /*
+@@ -184,17 +197,7 @@ static unsigned gsi_from_irq(unsigned irq)
+ BUG_ON(info == NULL);
+ BUG_ON(info->type != IRQT_PIRQ);
+
+- return info->u.pirq.gsi;
+-}
+-
+-static unsigned vector_from_irq(unsigned irq)
+-{
+- struct irq_info *info = info_for_irq(irq);
+-
+- BUG_ON(info == NULL);
+- BUG_ON(info->type != IRQT_PIRQ);
+-
+- return info->u.pirq.vector;
++ return info->u.pirq.nr;
+ }
+
+ static enum xen_irq_type type_from_irq(unsigned irq)
+@@ -218,6 +221,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn)
+ return ret;
+ }
+
++static bool pirq_needs_eoi(unsigned irq)
++{
++ struct irq_info *info = info_for_irq(irq);
++ /* Report whether this pirq carries PIRQ_NEEDS_EOI (maintained by pirq_query_unmask()). */
++ BUG_ON(info->type != IRQT_PIRQ);
++
++ return info->u.pirq.flags & PIRQ_NEEDS_EOI;
++}
++
+ static inline unsigned long active_evtchns(unsigned int cpu,
+ struct shared_info *sh,
+ unsigned int idx)
+@@ -329,12 +341,24 @@ static void unmask_evtchn(int port)
+ put_cpu();
+ }
+
++static int get_nr_hw_irqs(void)
++{
++ /* Count of irq numbers treated as hardware irqs: the GSI count */
++ /* when the IO-APIC is configured, otherwise just one. */
++#ifdef CONFIG_X86_IO_APIC
++ return get_nr_irqs_gsi();
++#else
++ return 1;
++#endif
++}
++
+ static int find_unbound_irq(void)
+ {
+ int irq;
+ struct irq_desc *desc;
++ int start = get_nr_hw_irqs();
+
+- for (irq = 0; irq < nr_irqs; irq++)
++ for (irq = start; irq < nr_irqs; irq++)
+ if (irq_info[irq].type == IRQT_UNBOUND)
+ break;
+
+@@ -350,6 +374,290 @@ static int find_unbound_irq(void)
+ return irq;
+ }
+
++static bool identity_mapped_irq(unsigned irq)
++{
++ /* identity map all the hardware irqs: below get_nr_hw_irqs() xen_allocate_pirq() uses irq == gsi */
++ return irq < get_nr_hw_irqs();
++}
++/* Issue PHYSDEVOP_eoi for this pirq, but only when it is flagged as needing an EOI. */
++static void pirq_unmask_notify(int irq)
++{
++ struct irq_info *info = info_for_irq(irq);
++ struct physdev_eoi eoi = { .irq = info->u.pirq.nr };
++
++ if (unlikely(pirq_needs_eoi(irq))) {
++ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
++ WARN_ON(rc);
++ }
++}
++/* Query the pirq's status from the hypervisor and cache "needs EOI" in the irq's flags. */
++static void pirq_query_unmask(int irq)
++{
++ struct physdev_irq_status_query irq_status;
++ struct irq_info *info = info_for_irq(irq);
++
++ BUG_ON(info->type != IRQT_PIRQ);
++ /* a failed query is treated as "no status flags set" */
++ irq_status.irq = info->u.pirq.nr;
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
++ irq_status.flags = 0;
++
++ info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
++ if (irq_status.flags & XENIRQSTAT_needs_eoi)
++ info->u.pirq.flags |= PIRQ_NEEDS_EOI;
++}
++/* True when the irq has a descriptor but no action installed; startup_pirq() then stays quiet on bind failure. */
++static bool probing_irq(int irq)
++{
++ struct irq_desc *desc = irq_to_desc(irq);
++
++ return desc && desc->action == NULL;
++}
++/* irq_chip startup hook: bind the pirq to an event channel on first use, then unmask it; always returns 0. */
++static unsigned int startup_pirq(unsigned int irq)
++{
++ struct evtchn_bind_pirq bind_pirq;
++ struct irq_info *info = info_for_irq(irq);
++ int evtchn = evtchn_from_irq(irq);
++ int rc;
++
++ BUG_ON(info->type != IRQT_PIRQ);
++ /* already bound to an event channel: only the unmask below is needed */
++ if (VALID_EVTCHN(evtchn))
++ goto out;
++
++ bind_pirq.pirq = info->u.pirq.nr;
++ /* NB. We are happy to share unless we are probing. */
++ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
++ BIND_PIRQ__WILL_SHARE : 0;
++ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
++ if (rc != 0) {
++ if (!probing_irq(irq))
++ printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
++ irq);
++ return 0;
++ }
++ evtchn = bind_pirq.port;
++
++ pirq_query_unmask(irq);
++ /* record the irq <-> event channel binding and bind the channel to cpu 0 */
++ evtchn_to_irq[evtchn] = irq;
++ bind_evtchn_to_cpu(evtchn, 0);
++ info->evtchn = evtchn;
++
++ out:
++ unmask_evtchn(evtchn);
++ pirq_unmask_notify(irq);
++
++ return 0;
++}
++/* irq_chip shutdown hook: mask and close the pirq's event channel and forget the binding. */
++static void shutdown_pirq(unsigned int irq)
++{
++ struct evtchn_close close;
++ struct irq_info *info = info_for_irq(irq);
++ int evtchn = evtchn_from_irq(irq);
++
++ BUG_ON(info->type != IRQT_PIRQ);
++
++ if (!VALID_EVTCHN(evtchn))
++ return;
++
++ mask_evtchn(evtchn);
++
++ close.port = evtchn;
++ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
++ BUG();
++ /* undo the bookkeeping done in startup_pirq() */
++ bind_evtchn_to_cpu(evtchn, 0);
++ evtchn_to_irq[evtchn] = -1;
++ info->evtchn = 0;
++}
++/* Enable/unmask hook of xen_pirq_chip: simply runs startup_pirq() again. */
++static void enable_pirq(unsigned int irq)
++{
++ startup_pirq(irq);
++}
++/* Disable/mask hook of xen_pirq_chip: does nothing (ack_pirq() is what masks the event channel). */
++static void disable_pirq(unsigned int irq)
++{
++}
++/* irq_chip ack hook: mask the pirq's event channel and clear its pending state. */
++static void ack_pirq(unsigned int irq)
++{
++ int evtchn = evtchn_from_irq(irq);
++
++ move_native_irq(irq);
++
++ if (VALID_EVTCHN(evtchn)) {
++ mask_evtchn(evtchn);
++ clear_evtchn(evtchn);
++ }
++}
++/* irq_chip end hook: either tear the binding down or unmask the channel again. */
++static void end_pirq(unsigned int irq)
++{
++ int evtchn = evtchn_from_irq(irq);
++ struct irq_desc *desc = irq_to_desc(irq);
++
++ if (WARN_ON(!desc))
++ return;
++ /* disabled while still pending: shut the pirq down; otherwise unmask and EOI if needed */
++ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
++ (IRQ_DISABLED|IRQ_PENDING)) {
++ shutdown_pirq(irq);
++ } else if (VALID_EVTCHN(evtchn)) {
++ unmask_evtchn(evtchn);
++ pirq_unmask_notify(irq);
++ }
++}
++/* Linear search for an IRQT_PIRQ irq whose stored pirq number equals gsi; returns -1 if there is none. */
++static int find_irq_by_gsi(unsigned gsi)
++{
++ int irq;
++
++ for (irq = 0; irq < nr_irqs; irq++) {
++ struct irq_info *info = info_for_irq(irq);
++
++ if (info == NULL || info->type != IRQT_PIRQ)
++ continue;
++
++ if (gsi_from_irq(irq) == gsi)
++ return irq;
++ }
++
++ return -1;
++}
++
++/*
++ * Allocate a physical irq. We don't assign an event channel
++ * until the irq actually started up. Return an existing irq if
++ * we've already got one for the gsi, or -1 if none can be allocated.
++ */
++int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
++{
++ int irq;
++
++ spin_lock(&irq_mapping_update_lock);
++ irq = find_irq_by_gsi(gsi);
++ if (irq != -1) {
++ printk(KERN_INFO "xen_allocate_pirq: returning irq %d for gsi %u\n",
++ irq, gsi);
++ goto out; /* XXX need refcount? */
++ }
++
++ if (identity_mapped_irq(gsi)) {
++ irq = gsi;
++ irq_to_desc_alloc_node(irq, 0);
++ dynamic_irq_init(irq);
++ } else {
++ irq = find_unbound_irq();
++ if (irq == -1) /* same failure check xen_create_msi_irq() makes before indexing irq_info */
++ goto out;
++ }
++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
++ handle_level_irq, name);
++ irq_info[irq] = mk_pirq_info(0, gsi);
++ irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
++out:
++ spin_unlock(&irq_mapping_update_lock);
++ return irq;
++}
++
++#ifdef CONFIG_PCI_MSI
++int xen_destroy_irq(int irq)
++{
++ struct irq_desc *desc;
++ struct physdev_unmap_pirq unmap_irq;
++ struct irq_info *info = info_for_irq(irq);
++ int rc = -ENOENT;
++ /* Unmap the irq's pirq in the hypervisor and mark the irq unbound; -ENOENT if it has no descriptor. */
++ spin_lock(&irq_mapping_update_lock);
++
++ desc = irq_to_desc(irq);
++ if (!desc)
++ goto out;
++
++ unmap_irq.pirq = info->u.pirq.nr;
++ unmap_irq.domid = DOMID_SELF;
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
++ if (rc) {
++ printk(KERN_WARNING "unmap irq failed %d\n", rc);
++ goto out;
++ }
++
++ irq_info[irq] = mk_unbound_info();
++
++ dynamic_irq_cleanup(irq);
++
++out:
++ spin_unlock(&irq_mapping_update_lock);
++ return rc;
++}
++/* Map an MSI or MSI-X interrupt of dev to a pirq and attach it to a free irq; returns the irq or -1. */
++int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
++{
++ int irq = 0;
++ struct physdev_map_pirq map_irq;
++ int rc;
++ domid_t domid = DOMID_SELF;
++ int pos;
++ u32 table_offset, bir;
++
++ memset(&map_irq, 0, sizeof(map_irq));
++ map_irq.domid = domid;
++ map_irq.type = MAP_PIRQ_TYPE_MSI;
++ map_irq.index = -1;
++ map_irq.pirq = -1;
++ map_irq.bus = dev->bus->number;
++ map_irq.devfn = dev->devfn;
++ /* MSI-X additionally passes the base of the BAR holding the table and the entry number */
++ if (type == PCI_CAP_ID_MSIX) {
++ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
++
++ pci_read_config_dword(dev, msix_table_offset_reg(pos),
++ &table_offset);
++ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
++
++ map_irq.table_base = pci_resource_start(dev, bir);
++ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
++ }
++
++ spin_lock(&irq_mapping_update_lock);
++
++ irq = find_unbound_irq();
++
++ if (irq == -1)
++ goto out;
++ /* map_irq.pirq goes in as -1 and is read back below -- presumably filled in by the hypervisor */
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
++ if (rc) {
++
++ printk(KERN_WARNING "xen map irq failed %d\n", rc);
++
++ dynamic_irq_cleanup(irq);
++
++ irq = -1;
++ goto out;
++ }
++ irq_info[irq] = mk_pirq_info(0, map_irq.pirq);
++
++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
++ handle_level_irq,
++ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
++
++out:
++ spin_unlock(&irq_mapping_update_lock);
++ return irq;
++}
++#endif
++/* Exported wrapper around the file-local gsi_from_irq(). */
++int xen_gsi_from_irq(unsigned irq)
++{
++ return gsi_from_irq(irq);
++}
++EXPORT_SYMBOL_GPL(xen_gsi_from_irq);
++
+ int bind_evtchn_to_irq(unsigned int evtchn)
+ {
+ int irq;
+@@ -409,8 +717,23 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+ return irq;
+ }
+
++static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
++ unsigned int remote_port)
++{
++ struct evtchn_bind_interdomain bind_interdomain;
++ int err;
++ /* Bind a local event channel to remote_domain:remote_port and wrap it in an irq. */
++ bind_interdomain.remote_dom = remote_domain;
++ bind_interdomain.remote_port = remote_port;
++
++ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
++ &bind_interdomain);
++ /* a hypercall error is returned as is; otherwise the new local port becomes an irq */
++ return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
++}
++
+
+-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
++int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+ {
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq;
+@@ -504,6 +827,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ }
+ EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
++int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
++ unsigned int remote_port,
++ irq_handler_t handler,
++ unsigned long irqflags,
++ const char *devname,
++ void *dev_id)
++{
++ int irq, retval;
++ /* Bind an interdomain event channel to an irq and install handler; returns the irq or a negative error. */
++ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
++ if (irq < 0)
++ return irq;
++ /* if the handler cannot be installed, the freshly bound irq is released again */
++ retval = request_irq(irq, handler, irqflags, devname, dev_id);
++ if (retval != 0) {
++ unbind_from_irq(irq);
++ return retval;
++ }
++
++ return irq;
++}
++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
++
+ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname, void *dev_id)
+@@ -649,9 +995,13 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + bit_idx;
+ int irq = evtchn_to_irq[port];
++ struct irq_desc *desc;
+
+- if (irq != -1)
+- handle_irq(irq, regs);
++ if (irq != -1) {
++ desc = irq_to_desc(irq);
++ if (desc)
++ generic_handle_irq_desc(irq, desc);
++ }
+ }
+ }
+
+@@ -928,13 +1278,37 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
+ .retrigger = retrigger_dynirq,
+ };
+
++static struct irq_chip xen_pirq_chip __read_mostly = {
++ .name = "xen-pirq",
++ /* irq_chip installed for pirqs by xen_allocate_pirq() and xen_create_msi_irq() */
++ .startup = startup_pirq,
++ .shutdown = shutdown_pirq,
++ /* enable/unmask share enable_pirq(); disable/mask share the empty disable_pirq() */
++ .enable = enable_pirq,
++ .unmask = enable_pirq,
++
++ .disable = disable_pirq,
++ .mask = disable_pirq,
++
++ .ack = ack_pirq,
++ .end = end_pirq,
++
++ .set_affinity = set_affinity_irq,
++ /* same retrigger handler that xen_dynamic_chip uses */
++ .retrigger = retrigger_dynirq,
++};
++
+ void __init xen_init_IRQ(void)
+ {
+ int i;
+
+ cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
+ GFP_KERNEL);
+- BUG_ON(cpu_evtchn_mask_p == NULL);
++ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
++
++ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), GFP_KERNEL);
++ for(i = 0; i < NR_EVENT_CHANNELS; i++)
++ evtchn_to_irq[i] = -1;
+
+ init_evtchn_cpu_bindings();
+
+@@ -943,4 +1317,6 @@ void __init xen_init_IRQ(void)
+ mask_evtchn(i);
+
+ irq_ctx_init(smp_processor_id());
++
++ xen_setup_pirqs();
+ }
+diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
+index 79bedba..f70a4f4 100644
+--- a/drivers/xen/evtchn.c
++++ b/drivers/xen/evtchn.c
+@@ -48,6 +48,8 @@
+ #include <linux/gfp.h>
+ #include <linux/mutex.h>
+ #include <linux/cpu.h>
++
++#include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/evtchn.h>
+ #include <asm/xen/hypervisor.h>
+diff --git a/drivers/xen/features.c b/drivers/xen/features.c
+index 99eda16..9e2b64f 100644
+--- a/drivers/xen/features.c
++++ b/drivers/xen/features.c
+@@ -18,7 +18,7 @@
+ u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+ EXPORT_SYMBOL_GPL(xen_features);
+
+-void xen_setup_features(void)
++void __init xen_setup_features(void)
+ {
+ struct xen_feature_info fi;
+ int i, j;
+diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
+new file mode 100644
+index 0000000..ddc59cc
+--- /dev/null
++++ b/drivers/xen/gntdev.c
+@@ -0,0 +1,626 @@
++/******************************************************************************
++ * gntdev.c
++ *
++ * Device for accessing (in user-space) pages that have been granted by other
++ * domains.
++ *
++ * Copyright (c) 2006-2007, D G Murray.
++ * (c) 2009 Gerd Hoffmann <kraxel at redhat.com>
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/miscdevice.h>
++#include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/mmu_notifier.h>
++#include <linux/types.h>
++#include <linux/uaccess.h>
++#include <linux/sched.h>
++#include <linux/rwsem.h>
++
++#include <xen/xen.h>
++#include <xen/grant_table.h>
++#include <xen/gntdev.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include <asm/xen/page.h>
++
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("Derek G. Murray <Derek.Murray at cl.cam.ac.uk>, "
++ "Gerd Hoffmann <kraxel at redhat.com>");
++MODULE_DESCRIPTION("User-space granted page access driver");
++
++static int debug = 0;
++module_param(debug, int, 0644);
++static int limit = 1024;
++module_param(limit, int, 0644);
++
++struct gntdev_priv {
++ struct list_head maps; /* grant_map entries, inserted in ascending index order by gntdev_add_map() */
++ uint32_t used; /* grows by count for every map gntdev_add_map() adds */
++ uint32_t limit; /* gntdev_add_map() refuses a map that would push used above this */
++ struct rw_semaphore sem;
++ struct mm_struct *mm;
++ struct mmu_notifier mn;
++};
++
++struct grant_map {
++ struct list_head next; /* link in gntdev_priv.maps */
++ struct gntdev_priv *priv; /* the gntdev_priv this map was added to */
++ struct vm_area_struct *vma;
++ int index; /* first slot of this map; it occupies [index, index + count) */
++ int count; /* number of elements allocated for each of the three arrays below */
++ int flags;
++ int is_mapped;
++ struct ioctl_gntdev_grant_ref *grants;
++ struct gnttab_map_grant_ref *map_ops;
++ struct gnttab_unmap_grant_ref *unmap_ops;
++};
++
++/* ------------------------------------------------------------------ */
++/* Debug helper: print every map on priv's list; text is appended to the entry whose index is text_index. */
++static void gntdev_print_maps(struct gntdev_priv *priv,
++ char *text, int text_index)
++{
++ struct grant_map *map;
++
++ printk("%s: maps list (priv %p, usage %d/%d)\n",
++ __FUNCTION__, priv, priv->used, priv->limit);
++ list_for_each_entry(map, &priv->maps, next)
++ printk(" index %2d, count %2d %s\n",
++ map->index, map->count,
++ map->index == text_index && text ? text : "");
++}
++
++/*
++ * Create a map for @count grants and insert it into priv->maps at the
++ * first index gap large enough to hold it (the list is kept ordered by
++ * index).  priv->used is increased by @count on success.
++ *
++ * Returns the new map, or NULL when the handle's limit would be exceeded
++ * or memory allocation fails.
++ */
++static struct grant_map *gntdev_add_map(struct gntdev_priv *priv, int count)
++{
++	struct grant_map *map, *add;
++
++	/* Refuse over-limit requests before allocating anything. */
++	if (count + priv->used > priv->limit)
++		return NULL;
++
++	add = kzalloc(sizeof(*add), GFP_KERNEL);
++	if (NULL == add)
++		return NULL;
++
++	/* kcalloc() returns NULL if count * element size would overflow. */
++	add->grants = kcalloc(count, sizeof(add->grants[0]), GFP_KERNEL);
++	add->map_ops = kcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL);
++	add->unmap_ops = kcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL);
++	if (NULL == add->grants ||
++	    NULL == add->map_ops ||
++	    NULL == add->unmap_ops)
++		goto err;
++
++	add->index = 0;
++	add->count = count;
++	add->priv = priv;
++
++	/* Walk the ordered list looking for a hole in front of an entry. */
++	list_for_each_entry(map, &priv->maps, next) {
++		if (add->index + add->count < map->index) {
++			list_add_tail(&add->next, &map->next);
++			goto done;
++		}
++		add->index = map->index + map->count;
++	}
++	/* No hole found: append behind the last entry. */
++	list_add_tail(&add->next, &priv->maps);
++
++done:
++	priv->used += add->count;
++	if (debug)
++		gntdev_print_maps(priv, "[new]", add->index);
++	return add;
++
++err:
++	/* kfree(NULL) is a no-op, so partially allocated state is fine here. */
++	kfree(add->grants);
++	kfree(add->map_ops);
++	kfree(add->unmap_ops);
++	kfree(add);
++	return NULL;
++}
++
++/*
++ * Look up the map on priv->maps whose index and count both match exactly.
++ * Returns NULL when there is no such map.
++ */
++static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, int index,
++					       int count)
++{
++	struct grant_map *cur;
++
++	list_for_each_entry(cur, &priv->maps, next) {
++		if (cur->index == index && cur->count == count)
++			return cur;
++	}
++	return NULL;
++}
++
++/*
++ * Find the map whose VMA covers vaddr, i.e. vm_start <= vaddr < vm_end.
++ * Maps without a VMA are ignored.  Returns NULL when nothing matches.
++ */
++static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
++					       unsigned long vaddr)
++{
++	struct grant_map *cur;
++
++	list_for_each_entry(cur, &priv->maps, next) {
++		if (cur->vma &&
++		    vaddr >= cur->vma->vm_start &&
++		    vaddr < cur->vma->vm_end)
++			return cur;
++	}
++	return NULL;
++}
++
++/*
++ * Unlink and free a map.  Fails with -EBUSY, leaving the map untouched,
++ * while it still has a VMA or any of its unmap_ops carries a non-zero
++ * handle.  Returns 0 after the map has been freed.
++ */
++static int gntdev_del_map(struct grant_map *map)
++{
++	int slot;
++
++	if (map->vma)
++		return -EBUSY;
++
++	for (slot = 0; slot < map->count; slot++) {
++		if (map->unmap_ops[slot].handle)
++			return -EBUSY;
++	}
++
++	list_del(&map->next);
++	map->priv->used -= map->count;
++
++	kfree(map->unmap_ops);
++	kfree(map->map_ops);
++	kfree(map->grants);
++	kfree(map);
++	return 0;
++}
++
++/* ------------------------------------------------------------------ */
++
++/*
++ * apply_to_page_range() callback used by gntdev_mmap().  For the page at
++ * addr it computes the machine address of the PTE and fills in the
++ * matching map and unmap operations of the grant_map passed in data.
++ * Always returns 0.
++ */
++static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void *data)
++{
++	struct grant_map *map = data;
++	unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
++	u64 table_maddr, pte_maddr;
++
++	BUG_ON(pgnr >= map->count);
++
++	/* machine address of the page-table page, plus the PTE's offset in it */
++	table_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT;
++	pte_maddr = table_maddr + ((unsigned long)pte & ~PAGE_MASK);
++
++	gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags,
++			  map->grants[pgnr].ref,
++			  map->grants[pgnr].domid);
++	gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags,
++			    0 /* handle */);
++	return 0;
++}
++
++/*
++ * Issue the prepared map_ops of a map in one GNTTABOP_map_grant_ref call
++ * and copy the returned handles into the unmap_ops.  Returns the hypercall
++ * error if the call itself fails, -EINVAL if any single operation reports
++ * a non-zero status, 0 otherwise.
++ */
++static int map_grant_pages(struct grant_map *map)
++{
++	int ret;
++	int slot;
++
++	if (debug)
++		printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count);
++
++	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++					map->map_ops, map->count);
++	if (WARN_ON(ret))
++		return ret;
++
++	for (slot = 0; slot < map->count; slot++) {
++		/* handles are recorded even for slots that report an error */
++		map->unmap_ops[slot].handle = map->map_ops[slot].handle;
++		if (map->map_ops[slot].status)
++			ret = -EINVAL;
++	}
++	return ret;
++}
++
++/*
++ * Unmap pages entries of a map, starting at slot offset, with a single
++ * GNTTABOP_unmap_grant_ref call, then clear their handles.  Returns the
++ * hypercall error if the call fails, -EINVAL if any operation reports a
++ * non-zero status, 0 otherwise.
++ */
++static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
++{
++	struct gnttab_unmap_grant_ref *ops = map->unmap_ops + offset;
++	int ret;
++	int slot;
++
++	if (debug)
++		printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__,
++		       map->index, map->count, offset, pages);
++
++	ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, pages);
++	if (WARN_ON(ret))
++		return ret;
++
++	for (slot = 0; slot < pages; slot++) {
++		if (ops[slot].status)
++			ret = -EINVAL;
++		ops[slot].handle = 0;
++	}
++	return ret;
++}
++
++/* ------------------------------------------------------------------ */
++
++/*
++ * vm_operations .close hook: break the link between the VMA and its
++ * grant_map in both directions and mark the map as no longer mapped.
++ */
++static void gntdev_vma_close(struct vm_area_struct *vma)
++{
++	struct grant_map *map = vma->vm_private_data;
++
++	if (debug)
++		printk("%s\n", __FUNCTION__);
++
++	vma->vm_private_data = NULL;
++	map->vma = NULL;
++	map->is_mapped = 0;
++}
++
++/*
++ * vm_operations .fault hook.  All pages of a gntdev mapping are installed
++ * at mmap time, so a fault here is unexpected; fail it with SIGBUS.
++ *
++ * The result must be reported through the return value: vmf->flags is an
++ * input (FAULT_FLAG_*), and returning 0 without supplying vmf->page tells
++ * the fault path that a page was provided.
++ */
++static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++	if (debug)
++		printk("%s: vaddr %p, pgoff %ld (shouldn't happen)\n",
++		       __FUNCTION__, vmf->virtual_address, vmf->pgoff);
++	return VM_FAULT_SIGBUS;
++}
++
++/* VMA callbacks installed on every mapping by gntdev_mmap(). */
++static struct vm_operations_struct gntdev_vmops = {
++ .close = gntdev_vma_close,
++ .fault = gntdev_vma_fault,
++};
++
++/* ------------------------------------------------------------------ */
++
++/*
++ * mmu notifier .invalidate_range_start hook.  For every mapped grant_map
++ * whose VMA intersects [start, end), unmap the grants backing the
++ * intersecting pages.
++ */
++static void mn_invl_range_start(struct mmu_notifier *mn,
++				struct mm_struct *mm,
++				unsigned long start, unsigned long end)
++{
++	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
++	struct grant_map *map;
++	unsigned long lo, hi;
++	int err;
++
++	down_read(&priv->sem);
++	list_for_each_entry(map, &priv->maps, next) {
++		if (!map->vma || !map->is_mapped)
++			continue;
++		/* skip VMAs lying entirely outside the invalidated range */
++		if (map->vma->vm_start >= end || map->vma->vm_end <= start)
++			continue;
++
++		/* clamp the range to this VMA */
++		lo = max(start, map->vma->vm_start);
++		hi = min(end, map->vma->vm_end);
++		if (debug)
++			printk("%s: map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
++			       __FUNCTION__, map->index, map->count,
++			       map->vma->vm_start, map->vma->vm_end,
++			       start, end, lo, hi);
++		err = unmap_grant_pages(map,
++					(lo - map->vma->vm_start) >> PAGE_SHIFT,
++					(hi - lo) >> PAGE_SHIFT);
++		WARN_ON(err);
++	}
++	up_read(&priv->sem);
++}
++
++/*
++ * mmu notifier .invalidate_page hook: handled as a range invalidation
++ * covering exactly the one page at address.
++ */
++static void mn_invl_page(struct mmu_notifier *mn,
++			 struct mm_struct *mm,
++			 unsigned long address)
++{
++	unsigned long page_end = address + PAGE_SIZE;
++
++	mn_invl_range_start(mn, mm, address, page_end);
++}
++
++/*
++ * mmu notifier .release hook: unmap all grants of every map on this
++ * handle that currently has a VMA.
++ */
++static void mn_release(struct mmu_notifier *mn,
++		       struct mm_struct *mm)
++{
++	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
++	struct grant_map *cur;
++	int err;
++
++	down_read(&priv->sem);
++	list_for_each_entry(cur, &priv->maps, next) {
++		if (cur->vma) {
++			if (debug)
++				printk("%s: map %d+%d (%lx %lx)\n",
++				       __FUNCTION__, cur->index, cur->count,
++				       cur->vma->vm_start, cur->vma->vm_end);
++			err = unmap_grant_pages(cur, 0, cur->count);
++			WARN_ON(err);
++		}
++	}
++	up_read(&priv->sem);
++}
++
++/*
++ * mmu notifier callbacks attached to each handle's mm in gntdev_open().
++ * NOTE(review): this table has external linkage; nothing visible in this
++ * file needs it outside the driver - confirm whether it can be static.
++ */
++struct mmu_notifier_ops gntdev_mmu_ops = {
++ .release = mn_release,
++ .invalidate_page = mn_invl_page,
++ .invalidate_range_start = mn_invl_range_start,
++};
++
++/* ------------------------------------------------------------------ */
++
++/*
++ * open() handler: allocate the per-handle state, bind it to the opening
++ * task's mm and register the mmu notifier on that mm.
++ *
++ * Returns 0 on success, -ENOMEM if the state cannot be allocated or the
++ * task has no mm, or the error reported by mmu_notifier_register().
++ */
++static int gntdev_open(struct inode *inode, struct file *flip)
++{
++	struct gntdev_priv *priv;
++	int err;
++
++	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
++	if (!priv)
++		return -ENOMEM;
++
++	INIT_LIST_HEAD(&priv->maps);
++	init_rwsem(&priv->sem);
++	priv->limit = limit;
++
++	priv->mm = get_task_mm(current);
++	if (!priv->mm) {
++		kfree(priv);
++		return -ENOMEM;
++	}
++	priv->mn.ops = &gntdev_mmu_ops;
++	err = mmu_notifier_register(&priv->mn, priv->mm);
++	mmput(priv->mm);
++	if (err) {
++		/*
++		 * Without a registered notifier gntdev_release() would later
++		 * unregister something that was never registered.
++		 */
++		kfree(priv);
++		return err;
++	}
++
++	flip->private_data = priv;
++	if (debug)
++		printk("%s: priv %p\n", __FUNCTION__, priv);
++
++	return 0;
++}
++
++/*
++ * release() handler: drop every map of the handle, unregister the mmu
++ * notifier and free the per-handle state.  Always returns 0.
++ */
++static int gntdev_release(struct inode *inode, struct file *flip)
++{
++	struct gntdev_priv *priv = flip->private_data;
++	struct grant_map *map;
++	int err;
++
++	if (debug)
++		printk("%s: priv %p\n", __FUNCTION__, priv);
++
++	down_write(&priv->sem);
++	while (!list_empty(&priv->maps)) {
++		map = list_entry(priv->maps.next, struct grant_map, next);
++		err = gntdev_del_map(map);
++		if (WARN_ON(err)) {
++			/*
++			 * gntdev_del_map() left the entry on the list (still
++			 * marked busy).  Drop it by hand, otherwise this loop
++			 * would retry the same entry forever.
++			 */
++			list_del(&map->next);
++			kfree(map->grants);
++			kfree(map->map_ops);
++			kfree(map->unmap_ops);
++			kfree(map);
++		}
++	}
++	up_write(&priv->sem);
++	mmu_notifier_unregister(&priv->mn, priv->mm);
++	kfree(priv);
++	return 0;
++}
++
++/*
++ * IOCTL_GNTDEV_MAP_GRANT_REF: reserve a range for op.count grants, copy
++ * the grant references in from userspace and report the range's offset
++ * (index << PAGE_SHIFT) back in op.index.
++ *
++ * Returns 0 on success, -EFAULT if user memory cannot be read or written,
++ * -EINVAL for a zero or over-limit count, -ENOMEM if no map could be
++ * created.
++ */
++static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
++				       struct ioctl_gntdev_map_grant_ref __user *u)
++{
++	struct ioctl_gntdev_map_grant_ref op;
++	struct grant_map *map;
++	int err;
++
++	if (copy_from_user(&op, u, sizeof(op)) != 0)
++		return -EFAULT;
++	if (debug)
++		printk("%s: priv %p, add %d\n", __FUNCTION__, priv,
++		       op.count);
++	if (unlikely(op.count <= 0))
++		return -EINVAL;
++	if (unlikely(op.count > priv->limit))
++		return -EINVAL;
++
++	down_write(&priv->sem);
++	err = -ENOMEM;
++	map = gntdev_add_map(priv, op.count);
++	if (!map)
++		goto err_unlock;
++
++	/* From here on the only failures are bad user pointers. */
++	err = -EFAULT;
++	if (copy_from_user(map->grants, &u->refs,
++			   sizeof(map->grants[0]) * op.count) != 0)
++		goto err_free;
++	op.index = map->index << PAGE_SHIFT;
++	if (copy_to_user(u, &op, sizeof(op)) != 0)
++		goto err_free;
++	up_write(&priv->sem);
++	return 0;
++
++err_free:
++	/* The map was never mmapped, so this removal cannot report -EBUSY. */
++	gntdev_del_map(map);
++err_unlock:
++	up_write(&priv->sem);
++	return err;
++}
++
++/*
++ * IOCTL_GNTDEV_UNMAP_GRANT_REF: delete the map identified by op.index
++ * (a byte offset) and op.count.  Returns -EFAULT if the request cannot be
++ * read, -EINVAL if no such map exists, otherwise the result of
++ * gntdev_del_map().
++ */
++static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
++					 struct ioctl_gntdev_unmap_grant_ref __user *u)
++{
++	struct ioctl_gntdev_unmap_grant_ref op;
++	struct grant_map *map;
++	int err;
++
++	if (copy_from_user(&op, u, sizeof(op)) != 0)
++		return -EFAULT;
++	if (debug)
++		printk("%s: priv %p, del %d+%d\n", __FUNCTION__, priv,
++		       (int)op.index, (int)op.count);
++
++	down_write(&priv->sem);
++	map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
++	if (!map)
++		err = -EINVAL;
++	else
++		err = gntdev_del_map(map);
++	up_write(&priv->sem);
++
++	return err;
++}
++
++/*
++ * IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: given the start address of a gntdev
++ * mapping, return the offset and page count of the map behind it.
++ * Returns -EINVAL unless op.vaddr is exactly the vm_start of a mapped
++ * range, -EFAULT on user-copy failure.
++ */
++static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
++					      struct ioctl_gntdev_get_offset_for_vaddr __user *u)
++{
++	struct ioctl_gntdev_get_offset_for_vaddr op;
++	struct grant_map *map;
++	int found = 0;
++
++	if (copy_from_user(&op, u, sizeof(op)) != 0)
++		return -EFAULT;
++	if (debug)
++		printk("%s: priv %p, offset for vaddr %lx\n", __FUNCTION__, priv,
++		       (unsigned long)op.vaddr);
++
++	down_read(&priv->sem);
++	map = gntdev_find_map_vaddr(priv, op.vaddr);
++	if (map != NULL && map->vma->vm_start == op.vaddr) {
++		op.offset = map->index << PAGE_SHIFT;
++		op.count = map->count;
++		found = 1;
++	}
++	up_read(&priv->sem);
++
++	if (!found)
++		return -EINVAL;
++	if (copy_to_user(u, &op, sizeof(op)) != 0)
++		return -EFAULT;
++	return 0;
++}
++
++/*
++ * IOCTL_GNTDEV_SET_MAX_GRANTS: replace this handle's grant limit.  Values
++ * above the module-wide limit parameter are rejected with -EINVAL.
++ */
++static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
++					struct ioctl_gntdev_set_max_grants __user *u)
++{
++	struct ioctl_gntdev_set_max_grants req;
++
++	if (copy_from_user(&req, u, sizeof(req)) != 0)
++		return -EFAULT;
++
++	if (debug)
++		printk("%s: priv %p, limit %d\n", __FUNCTION__, priv, req.count);
++
++	if (req.count > limit)
++		return -EINVAL;
++
++	down_write(&priv->sem);
++	priv->limit = req.count;
++	up_write(&priv->sem);
++
++	return 0;
++}
++
++/*
++ * unlocked_ioctl entry point: hand the request to the matching
++ * gntdev_ioctl_*() helper.  Unknown commands yield -ENOIOCTLCMD.
++ */
++static long gntdev_ioctl(struct file *flip,
++			 unsigned int cmd, unsigned long arg)
++{
++	struct gntdev_priv *priv = flip->private_data;
++	void __user *ptr = (void __user *)arg;
++	long ret;
++
++	switch (cmd) {
++	case IOCTL_GNTDEV_MAP_GRANT_REF:
++		ret = gntdev_ioctl_map_grant_ref(priv, ptr);
++		break;
++	case IOCTL_GNTDEV_UNMAP_GRANT_REF:
++		ret = gntdev_ioctl_unmap_grant_ref(priv, ptr);
++		break;
++	case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
++		ret = gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
++		break;
++	case IOCTL_GNTDEV_SET_MAX_GRANTS:
++		ret = gntdev_ioctl_set_max_grants(priv, ptr);
++		break;
++	default:
++		if (debug)
++			printk("%s: priv %p, unknown cmd %x\n",
++			       __FUNCTION__, priv, cmd);
++		ret = -ENOIOCTLCMD;
++		break;
++	}
++
++	return ret;
++}
++
++/*
++ * mmap() handler.  vm_pgoff selects a map previously created with
++ * IOCTL_GNTDEV_MAP_GRANT_REF; the VMA must span exactly map->count pages
++ * and belong to the mm that opened the device.  The grants are mapped by
++ * letting the hypervisor write the PTEs located by find_grant_ptes().
++ *
++ * Returns 0 on success; -EINVAL for a private writable mapping, an
++ * unknown or already-mapped range, or a foreign mm; otherwise the error
++ * from apply_to_page_range() or map_grant_pages().
++ */
++static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
++{
++	struct gntdev_priv *priv = flip->private_data;
++	int index = vma->vm_pgoff;
++	int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++	struct grant_map *map;
++	int err = -EINVAL;
++
++	/* Writable mappings must be shared. */
++	if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
++		return -EINVAL;
++
++	if (debug)
++		printk("%s: map %d+%d at %lx (pgoff %lx)\n", __FUNCTION__,
++		       index, count, vma->vm_start, vma->vm_pgoff);
++
++	down_read(&priv->sem);
++	map = gntdev_find_map_index(priv, index, count);
++	if (!map)
++		goto unlock_out;
++	if (map->vma)
++		goto unlock_out;
++	if (priv->mm != vma->vm_mm) {
++		printk("%s: Huh? Other mm?\n", __FUNCTION__);
++		goto unlock_out;
++	}
++
++	vma->vm_ops = &gntdev_vmops;
++
++	vma->vm_flags |= VM_RESERVED;
++	vma->vm_flags |= VM_DONTCOPY;
++	vma->vm_flags |= VM_DONTEXPAND;
++
++	vma->vm_private_data = map;
++	map->vma = vma;
++
++	map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte;
++	if (!(vma->vm_flags & VM_WRITE))
++		map->flags |= GNTMAP_readonly;
++
++	err = apply_to_page_range(vma->vm_mm, vma->vm_start,
++				  vma->vm_end - vma->vm_start,
++				  find_grant_ptes, map);
++	if (err) {
++		if (debug)
++			printk("%s: find_grant_ptes() failure.\n", __FUNCTION__);
++		goto detach_out;
++	}
++
++	err = map_grant_pages(map);
++	if (err) {
++		if (debug)
++			printk("%s: map_grant_pages() failure.\n", __FUNCTION__);
++		goto detach_out;
++	}
++	map->is_mapped = 1;
++	up_read(&priv->sem);
++	return 0;
++
++detach_out:
++	/*
++	 * The VMA is thrown away by the caller after a failed ->mmap(), so
++	 * gntdev_vma_close() will not run to undo the link made above.  Undo
++	 * it here; a stale map->vma would make gntdev_del_map() report
++	 * -EBUSY for this map from now on.
++	 */
++	map->vma = NULL;
++	vma->vm_private_data = NULL;
++unlock_out:
++	up_read(&priv->sem);
++	return err;
++}
++
++/* File operations of the gntdev misc device. */
++static const struct file_operations gntdev_fops = {
++ .owner = THIS_MODULE,
++ .open = gntdev_open,
++ .release = gntdev_release,
++ .mmap = gntdev_mmap,
++ .unlocked_ioctl = gntdev_ioctl
++};
++
++/* Misc device named "gntdev" with a dynamically assigned minor number. */
++static struct miscdevice gntdev_miscdev = {
++ .minor = MISC_DYNAMIC_MINOR,
++ .name = "gntdev",
++ .fops = &gntdev_fops,
++};
++
++/* ------------------------------------------------------------------ */
++
++/*
++ * Module init: register the misc device.  Returns -ENODEV when not running
++ * as a Xen domain, otherwise the result of misc_register().
++ */
++static int __init gntdev_init(void)
++{
++	int ret;
++
++	if (!xen_domain())
++		return -ENODEV;
++
++	ret = misc_register(&gntdev_miscdev);
++	if (ret != 0)
++		printk(KERN_ERR "Could not register gntdev device\n");
++
++	return ret;
++}
++
++/* Module exit: remove the misc device registered by gntdev_init(). */
++static void __exit gntdev_exit(void)
++{
++	struct miscdevice *dev = &gntdev_miscdev;
++
++	misc_deregister(dev);
++}
++
++module_init(gntdev_init);
++module_exit(gntdev_exit);
++
++/* ------------------------------------------------------------------ */
+diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
+index 7d8f531..76fe621 100644
+--- a/drivers/xen/grant-table.c
++++ b/drivers/xen/grant-table.c
+@@ -37,6 +37,7 @@
+ #include <linux/vmalloc.h>
+ #include <linux/uaccess.h>
+
++#include <xen/xen.h>
+ #include <xen/interface/xen.h>
+ #include <xen/page.h>
+ #include <xen/grant_table.h>
+@@ -472,6 +473,111 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+ return 0;
+ }
+
++/*
++ * Foreign-page destructor installed on the old page by
++ * gnttab_copy_grant_page().  Only order-0 pages are supported.  The page
++ * loses its foreign marking, gets its counts reset and is then released.
++ */
++static void gnttab_page_free(struct page *page, unsigned int order)
++{
++	BUG_ON(order != 0);
++
++	ClearPageForeign(page);
++	gnttab_reset_grant_page(page);
++	put_page(page);
++}
++
++/*
++ * Must not be called with IRQs off. This should only be used on the
++ * slow path.
++ *
++ * Copy a foreign granted page to local memory.
++ *
++ * @ref:   passed to the hypervisor as the handle of the unmap-and-replace
++ *         operation.
++ * @pagep: in: the granted page; out (on success only): the freshly
++ *         allocated local page that now holds the data.
++ *
++ * Returns 0 on success, -ENOENT if the page's refcount is already zero,
++ * -ENOMEM if no replacement page can be allocated, -EBUSY if the page is
++ * currently mapped.  Hypercall failures are fatal (BUG_ON).
++ */
++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
++{
++ struct gnttab_unmap_and_replace unmap;
++ struct mmu_update mmu;
++ struct page *page;
++ struct page *new_page;
++ void *new_addr;
++ void *addr;
++ unsigned long pfn;
++ unsigned long mfn;
++ unsigned long new_mfn;
++ int err;
++
++ page = *pagep;
++ /* Hold a reference for the duration of the copy; dropped at "out". */
++ if (!get_page_unless_zero(page))
++ return -ENOENT;
++
++ err = -ENOMEM;
++ new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
++ if (!new_page)
++ goto out;
++
++ /* Duplicate the contents before switching the mappings over. */
++ new_addr = page_address(new_page);
++ addr = page_address(page);
++ memcpy(new_addr, addr, PAGE_SIZE);
++
++ pfn = page_to_pfn(page);
++ mfn = pfn_to_mfn(pfn);
++ new_mfn = virt_to_mfn(new_addr);
++
++// write_seqlock(&gnttab_dma_lock); /* protects __gnttab_dma_map_page on 2.6.18 */
++
++ /* Make seq visible before checking page_mapped. */
++ smp_mb();
++
++ /* Has the page been DMA-mapped? */
++ if (unlikely(page_mapped(page))) {
++ //write_sequnlock(&gnttab_dma_lock);
++ put_page(new_page);
++ err = -EBUSY;
++ goto out;
++ }
++
++ /* Point the old pfn at the machine frame of the new page. */
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ set_phys_to_machine(pfn, new_mfn);
++
++ //gnttab_set_replace_op(&unmap, (unsigned long)addr,
++ // (unsigned long)new_addr, ref);
++ unmap.host_addr = (unsigned long)addr;
++ unmap.new_addr = (unsigned long)new_addr;
++ unmap.handle = ref;
++
++ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
++ &unmap, 1);
++ BUG_ON(err);
++ BUG_ON(unmap.status);
++
++// write_sequnlock(&gnttab_dma_lock);
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++ /* The new page's own pfn no longer has a machine frame. */
++ set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
++
++ /* Machine-to-physical update: new_mfn now belongs to the old pfn. */
++ mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
++ mmu.val = pfn;
++ err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
++ BUG_ON(err);
++ }
++
++ /* Transfer mapping, foreign destructor and reserved flag to the copy. */
++ new_page->mapping = page->mapping;
++ SetPageForeign(new_page, _PageForeignDestructor(page));
++ if (PageReserved(page))
++ SetPageReserved(new_page);
++ *pagep = new_page;
++
++ /* The old page is now released through gnttab_page_free(). */
++ SetPageForeign(page, gnttab_page_free);
++ ClearPageReserved(page);
++ page->mapping = NULL;
++
++out:
++ put_page(page);
++ return err;
++}
++EXPORT_SYMBOL_GPL(gnttab_copy_grant_page);
++
++/*
++ * Re-initialise the reference count and the map count of a grant page via
++ * init_page_count() and reset_page_mapcount().
++ */
++void gnttab_reset_grant_page(struct page *page)
++{
++	init_page_count(page);
++	reset_page_mapcount(page);
++}
++EXPORT_SYMBOL_GPL(gnttab_reset_grant_page);
++
+ int gnttab_resume(void)
+ {
+ if (max_nr_grant_frames() < nr_grant_frames)
+diff --git a/drivers/xen/netback/Makefile b/drivers/xen/netback/Makefile
+new file mode 100644
+index 0000000..e346e81
+--- /dev/null
++++ b/drivers/xen/netback/Makefile
+@@ -0,0 +1,3 @@
++obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
++
++xen-netback-y := netback.o xenbus.o interface.o
+diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
+new file mode 100644
+index 0000000..51f97c0
+--- /dev/null
++++ b/drivers/xen/netback/common.h
+@@ -0,0 +1,227 @@
++/******************************************************************************
++ * arch/xen/drivers/netif/backend/common.h
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __NETIF__BACKEND__COMMON_H__
++#define __NETIF__BACKEND__COMMON_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/ip.h>
++#include <linux/in.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
++
++#include <xen/interface/io/netif.h>
++#include <asm/io.h>
++#include <asm/pgalloc.h>
++#include <xen/interface/grant_table.h>
++#include <xen/grant_table.h>
++#include <xen/xenbus.h>
++
++/* Debug message via pr_debug(), prefixed with the source file and line. */
++#define DPRINTK(_f, _a...) \
++ pr_debug("(file=%s, line=%d) " _f, \
++ __FILE__ , __LINE__ , ## _a )
++/* KERN_INFO message prefixed with "xen_net: ". */
++#define IPRINTK(fmt, args...) \
++ printk(KERN_INFO "xen_net: " fmt, ##args)
++/* KERN_WARNING message prefixed with "xen_net: ". */
++#define WPRINTK(fmt, args...) \
++ printk(KERN_WARNING "xen_net: " fmt, ##args)
++
++/*
++ * State of one backend network interface.  It lives in the private area
++ * of its net_device: netif_alloc() passes sizeof(struct xen_netif) to
++ * alloc_netdev() and obtains the structure through netdev_priv().
++ */
++struct xen_netif {
++ /* Unique identifier for this interface. */
++ domid_t domid;
++ unsigned int handle;
++
++ u8 fe_dev_addr[6];
++
++ /* Physical parameters of the comms window. */
++ grant_handle_t tx_shmem_handle;
++ grant_ref_t tx_shmem_ref;
++ grant_handle_t rx_shmem_handle;
++ grant_ref_t rx_shmem_ref;
++ unsigned int irq;
++
++ /* The shared rings and indexes. */
++ struct xen_netif_tx_back_ring tx;
++ struct xen_netif_rx_back_ring rx;
++ struct vm_struct *tx_comms_area;
++ struct vm_struct *rx_comms_area;
++
++ /* Set of features that can be turned on in dev->features. */
++ int features;
++
++ int smart_poll;
++
++ /* Internal feature information. */
++ u8 can_queue:1; /* can queue packets for receiver? */
++
++ /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
++ RING_IDX rx_req_cons_peek;
++
++ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
++ unsigned long credit_bytes;
++ unsigned long credit_usec;
++ unsigned long remaining_credit;
++ struct timer_list credit_timeout;
++
++ /* Enforce draining of the transmit queue. */
++ struct timer_list tx_queue_timeout;
++
++ /* Statistics */
++ int nr_copied_skbs;
++
++ /* Miscellaneous private stuff. */
++ struct list_head list; /* scheduling list */
++ /* starts at 1 in netif_alloc(); see netif_get()/netif_put() */
++ atomic_t refcnt;
++ struct net_device *dev;
++ struct net_device_stats stats;
++
++ /* driver-private carrier flag, see the netback_carrier_*() macros */
++ unsigned int carrier;
++
++ /* woken by netif_put() when refcnt drops to zero */
++ wait_queue_head_t waiting_to_free;
++};
++
++/*
++ * Implement our own carrier flag: the network stack's version causes delays
++ * when the carrier is re-enabled (in particular, dev_activate() may not
++ * immediately be called, which can cause packet loss; also the etherbridge
++ * can be rather lazy in activating its port).
++ */
++#define netback_carrier_on(netif) ((netif)->carrier = 1)
++#define netback_carrier_off(netif) ((netif)->carrier = 0)
++#define netback_carrier_ok(netif) ((netif)->carrier)
++
++/*
++ * NOTE(review): presumably the permitted values of netbk_copy_skb_mode
++ * (declared below); the variable is defined outside this header - confirm
++ * against its definition.
++ */
++enum {
++ NETBK_DONT_COPY_SKB,
++ NETBK_DELAYED_COPY_SKB,
++ NETBK_ALWAYS_COPY_SKB,
++};
++
++extern int netbk_copy_skb_mode;
++
++/* Function pointers into netback accelerator plugin modules */
++struct netback_accel_hooks {
++ struct module *owner;
++ int (*probe)(struct xenbus_device *dev);
++ int (*remove)(struct xenbus_device *dev);
++};
++
++/* Structure to track the state of a netback accelerator plugin */
++struct netback_accelerator {
++ struct list_head link;
++ int id;
++ char *eth_name;
++ atomic_t use_count;
++ struct netback_accel_hooks *hooks;
++};
++
++/* Per-xenbus-device backend state tying a xenbus device to its netif. */
++struct backend_info {
++	struct xenbus_device *dev;
++	struct xen_netif *netif;
++	enum xenbus_state frontend_state;
++	struct xenbus_watch hotplug_status_watch;
++	/*
++	 * Flag bit.  It must be unsigned: a signed one-bit field can only
++	 * represent 0 and -1, so storing 1 in it would read back as -1.
++	 */
++	unsigned int have_hotplug_status_watch:1;
++
++	/* State relating to the netback accelerator */
++	void *netback_accel_priv;
++	/* The accelerator that this backend is currently using */
++	struct netback_accelerator *accelerator;
++};
++
++#define NETBACK_ACCEL_VERSION 0x00010001
++
++/*
++ * Connect an accelerator plugin module to netback. Returns zero on
++ * success, < 0 on error, > 0 (with highest version number supported)
++ * if version mismatch.
++ */
++extern int netback_connect_accelerator(unsigned version,
++ int id, const char *eth_name,
++ struct netback_accel_hooks *hooks);
++/* Disconnect a previously connected accelerator plugin module */
++extern void netback_disconnect_accelerator(int id, const char *eth_name);
++
++
++extern
++void netback_probe_accelerators(struct backend_info *be,
++ struct xenbus_device *dev);
++extern
++void netback_remove_accelerators(struct backend_info *be,
++ struct xenbus_device *dev);
++extern
++void netif_accel_init(void);
++
++
++#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
++#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
++
++void netif_disconnect(struct xen_netif *netif);
++
++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle);
++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
++ unsigned long rx_ring_ref, unsigned int evtchn);
++
++/* Take one reference on the interface. */
++static inline void netif_get(struct xen_netif *netif)
++{
++	atomic_t *refs = &netif->refcnt;
++
++	atomic_inc(refs);
++}
++
++/*
++ * Drop one reference on the interface.  When the count reaches zero, wake
++ * whoever sleeps on waiting_to_free.
++ */
++static inline void netif_put(struct xen_netif *netif)
++{
++	if (!atomic_dec_and_test(&netif->refcnt))
++		return;
++
++	wake_up(&netif->waiting_to_free);
++}
++
++int netif_xenbus_init(void);
++
++#define netif_schedulable(netif) \
++ (netif_running((netif)->dev) && netback_carrier_ok(netif))
++
++void netif_schedule_work(struct xen_netif *netif);
++void netif_deschedule_work(struct xen_netif *netif);
++
++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
++struct net_device_stats *netif_be_get_stats(struct net_device *dev);
++irqreturn_t netif_be_int(int irq, void *dev_id);
++
++/* Return the can_queue flag of the interface behind dev. */
++static inline int netbk_can_queue(struct net_device *dev)
++{
++	const struct xen_netif *vif = netdev_priv(dev);
++
++	return vif->can_queue;
++}
++
++/*
++ * Non-zero when NETIF_F_SG is set in the interface's feature mask.  The
++ * value returned is the masked bit itself, not a normalised 0/1.
++ */
++static inline int netbk_can_sg(struct net_device *dev)
++{
++	const struct xen_netif *vif = netdev_priv(dev);
++
++	return vif->features & NETIF_F_SG;
++}
++
++#endif /* __NETIF__BACKEND__COMMON_H__ */
+diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
+new file mode 100644
+index 0000000..b23b14d
+--- /dev/null
++++ b/drivers/xen/netback/interface.c
+@@ -0,0 +1,405 @@
++/******************************************************************************
++ * arch/xen/drivers/netif/backend/interface.c
++ *
++ * Network-device interface management.
++ *
++ * Copyright (c) 2004-2005, Keir Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++#include <linux/ethtool.h>
++#include <linux/rtnetlink.h>
++
++#include <xen/events.h>
++#include <asm/xen/hypercall.h>
++
++/*
++ * Module parameter 'queue_length':
++ *
++ * Enables queuing in the network stack when a client has run out of receive
++ * descriptors. Although this feature can improve receive bandwidth by avoiding
++ * packet loss, it can also result in packets sitting in the 'tx_queue' for
++ * unbounded time. This is bad if those packets hold onto foreign resources.
++ * For example, consider a packet that holds onto resources belonging to the
++ * guest for which it is queued (e.g., packet received on vif1.0, destined for
++ * vif1.1 which is not activated in the guest): in this situation the guest
++ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
++ * run a timer (tx_queue_timeout) to drain the queue when the interface is
++ * blocked.
++ */
++static unsigned long netbk_queue_length = 32;
++module_param_named(queue_length, netbk_queue_length, ulong, 0644);
++
++/* Bring the interface into service: enable its IRQ, then schedule work. */
++static void __netif_up(struct xen_netif *netif)
++{
++	unsigned int irq = netif->irq;
++
++	enable_irq(irq);
++	netif_schedule_work(netif);
++}
++
++/* Take the interface out of service: disable its IRQ, then deschedule work. */
++static void __netif_down(struct xen_netif *netif)
++{
++	unsigned int irq = netif->irq;
++
++	disable_irq(irq);
++	netif_deschedule_work(netif);
++}
++
++/*
++ * ndo_open: if the driver-private carrier flag is set, bring the
++ * interface up and start the transmit queue.  Always returns 0.
++ */
++static int net_open(struct net_device *dev)
++{
++	struct xen_netif *vif = netdev_priv(dev);
++
++	if (!netback_carrier_ok(vif))
++		return 0;
++
++	__netif_up(vif);
++	netif_start_queue(dev);
++	return 0;
++}
++
++/*
++ * ndo_stop: take the interface down if the carrier flag is set, and stop
++ * the transmit queue in any case.  Always returns 0.
++ */
++static int net_close(struct net_device *dev)
++{
++	struct xen_netif *vif = netdev_priv(dev);
++
++	if (netback_carrier_ok(vif)) {
++		__netif_down(vif);
++	}
++
++	netif_stop_queue(dev);
++	return 0;
++}
++
++/*
++ * ndo_change_mtu: accept any MTU up to 65535 - ETH_HLEN when scatter-gather
++ * is available, otherwise up to ETH_DATA_LEN.  Larger values yield -EINVAL.
++ */
++static int netbk_change_mtu(struct net_device *dev, int mtu)
++{
++	int limit;
++
++	if (netbk_can_sg(dev))
++		limit = 65535 - ETH_HLEN;
++	else
++		limit = ETH_DATA_LEN;
++
++	if (mtu > limit)
++		return -EINVAL;
++
++	dev->mtu = mtu;
++	return 0;
++}
++
++/*
++ * ethtool set_sg: enabling scatter-gather is refused with -ENOSYS unless
++ * the interface advertises NETIF_F_SG.  The MTU is clamped to ETH_DATA_LEN
++ * before the generic handler runs.
++ * NOTE(review): the clamp is applied when enabling as well as when
++ * disabling - confirm that this is intended.
++ */
++static int netbk_set_sg(struct net_device *dev, u32 data)
++{
++	struct xen_netif *vif = netdev_priv(dev);
++
++	if (data && !(vif->features & NETIF_F_SG))
++		return -ENOSYS;
++
++	if (dev->mtu > ETH_DATA_LEN) {
++		dev->mtu = ETH_DATA_LEN;
++	}
++
++	return ethtool_op_set_sg(dev, data);
++}
++
++/*
++ * ethtool set_tso: enabling TSO is refused with -ENOSYS unless the
++ * interface advertises NETIF_F_TSO; otherwise defer to the generic handler.
++ */
++static int netbk_set_tso(struct net_device *dev, u32 data)
++{
++	struct xen_netif *vif = netdev_priv(dev);
++
++	if (data && !(vif->features & NETIF_F_TSO))
++		return -ENOSYS;
++
++	return ethtool_op_set_tso(dev, data);
++}
++
++/*
++ * ethtool get_drvinfo: report the driver name "netbk" and the name of the
++ * parent device as bus info.  Both destinations are fixed-size arrays, so
++ * the copies are bounded by the destination size and always
++ * NUL-terminated.
++ */
++static void netbk_get_drvinfo(struct net_device *dev,
++			      struct ethtool_drvinfo *info)
++{
++	strlcpy(info->driver, "netbk", sizeof(info->driver));
++	strlcpy(info->bus_info, dev_name(dev->dev.parent),
++		sizeof(info->bus_info));
++}
++
++/*
++ * Table of driver-private ethtool statistics.  Each entry names an int
++ * counter inside struct xen_netif and records its byte offset there.
++ */
++static const struct netif_stat {
++ char name[ETH_GSTRING_LEN];
++ u16 offset;
++} netbk_stats[] = {
++ { "copied_skbs", offsetof(struct xen_netif, nr_copied_skbs) },
++};
++
++/* ethtool get_stats_count: number of entries in netbk_stats[]. */
++static int netbk_get_stats_count(struct net_device *dev)
++{
++	int nr_stats = ARRAY_SIZE(netbk_stats);
++
++	return nr_stats;
++}
++
++/*
++ * ethtool get_ethtool_stats: copy each counter listed in netbk_stats[]
++ * into data[], in table order.  The counters are read as int at their
++ * recorded offset inside the interface structure.
++ */
++static void netbk_get_ethtool_stats(struct net_device *dev,
++				    struct ethtool_stats *stats, u64 * data)
++{
++	void *base = netdev_priv(dev);
++	int idx;
++
++	for (idx = 0; idx < ARRAY_SIZE(netbk_stats); idx++) {
++		int *counter = base + netbk_stats[idx].offset;
++
++		data[idx] = *counter;
++	}
++}
++
++/*
++ * ethtool get_strings: for ETH_SS_STATS, copy the ETH_GSTRING_LEN-sized
++ * names from netbk_stats[] into data.  Other string sets are ignored.
++ */
++static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data)
++{
++	int idx;
++
++	if (stringset != ETH_SS_STATS)
++		return;
++
++	for (idx = 0; idx < ARRAY_SIZE(netbk_stats); idx++) {
++		u8 *dst = data + idx * ETH_GSTRING_LEN;
++
++		memcpy(dst, netbk_stats[idx].name, ETH_GSTRING_LEN);
++	}
++}
++
++/*
++ * ethtool operations installed by netif_alloc().  SG and TSO changes go
++ * through the netbk_set_*() wrappers above; the remaining feature hooks
++ * use the generic ethtool_op_*() helpers.
++ */
++static struct ethtool_ops network_ethtool_ops =
++{
++ .get_drvinfo = netbk_get_drvinfo,
++
++ .get_tx_csum = ethtool_op_get_tx_csum,
++ .set_tx_csum = ethtool_op_set_tx_csum,
++ .get_sg = ethtool_op_get_sg,
++ .set_sg = netbk_set_sg,
++ .get_tso = ethtool_op_get_tso,
++ .set_tso = netbk_set_tso,
++ .get_link = ethtool_op_get_link,
++
++ .get_stats_count = netbk_get_stats_count,
++ .get_ethtool_stats = netbk_get_ethtool_stats,
++ .get_strings = netbk_get_strings,
++};
++
++/* net_device operations installed by netif_alloc(). */
++static struct net_device_ops netback_ops =
++{
++ .ndo_start_xmit = netif_be_start_xmit,
++ .ndo_get_stats = netif_be_get_stats,
++ .ndo_open = net_open,
++ .ndo_stop = net_close,
++ .ndo_change_mtu = netbk_change_mtu,
++};
++
++/*
++ * Allocate and register the net_device "vif<domid>.<handle>" together
++ * with its embedded xen_netif state.
++ *
++ * The interface starts with a reference count of 1, the driver-private
++ * carrier flag off, unlimited transmit credit and the dummy MAC address
++ * FE:FF:FF:FF:FF:FF.
++ *
++ * Returns the new xen_netif, ERR_PTR(-ENOMEM) if the net_device cannot be
++ * allocated, or ERR_PTR(err) if register_netdevice() fails.
++ */
++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
++{
++	char name[IFNAMSIZ] = {};
++	struct net_device *dev;
++	struct xen_netif *netif;
++	int err = 0;
++
++	snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
++	dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
++	if (!dev) {
++		DPRINTK("Could not create netif: out of memory\n");
++		return ERR_PTR(-ENOMEM);
++	}
++
++	SET_NETDEV_DEV(dev, parent);
++
++	/* Private state: start from all-zero, then fill in identity. */
++	netif = netdev_priv(dev);
++	memset(netif, 0, sizeof(*netif));
++	netif->domid = domid;
++	netif->handle = handle;
++	netif->features = NETIF_F_SG;
++	atomic_set(&netif->refcnt, 1);
++	init_waitqueue_head(&netif->waiting_to_free);
++	netif->dev = dev;
++	INIT_LIST_HEAD(&netif->list);
++
++	netback_carrier_off(netif);
++
++	/* No rate limit by default. */
++	netif->credit_bytes = ~0UL;
++	netif->remaining_credit = ~0UL;
++	netif->credit_usec = 0UL;
++	init_timer(&netif->credit_timeout);
++	/* Initialize 'expires' now: it's used to track the credit window. */
++	netif->credit_timeout.expires = jiffies;
++
++	init_timer(&netif->tx_queue_timeout);
++
++	dev->netdev_ops = &netback_ops;
++	dev->features = NETIF_F_IP_CSUM|NETIF_F_SG;
++
++	SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
++
++	dev->tx_queue_len = netbk_queue_length;
++
++	/*
++	 * Initialise a dummy MAC address. We choose the numerically
++	 * largest non-broadcast address to prevent the address getting
++	 * stolen by an Ethernet bridge for STP purposes.
++	 * (FE:FF:FF:FF:FF:FF)
++	 */
++	memset(dev->dev_addr, 0xFF, ETH_ALEN);
++	dev->dev_addr[0] &= ~0x01;
++
++	rtnl_lock();
++	err = register_netdevice(dev);
++	rtnl_unlock();
++	if (err != 0) {
++		DPRINTK("Could not register new net device %s: err=%d\n",
++			dev->name, err);
++		free_netdev(dev);
++		return ERR_PTR(err);
++	}
++
++	DPRINTK("Successfully created netif\n");
++	return netif;
++}
++
++/*
++ * Map the frontend's tx and rx ring grants into the VM areas already
++ * allocated in @netif, recording the grant refs and handles on success.
++ *
++ * Returns 0 on success, or the grant-table status of the failing map
++ * operation.  If the rx mapping fails, the tx mapping is undone first.
++ * A failure of the hypercall itself is fatal (BUG).
++ */
++static int map_frontend_pages(
++	struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
++{
++	struct gnttab_map_grant_ref map;
++	struct gnttab_unmap_grant_ref undo;
++
++	/* Transmit ring. */
++	gnttab_set_map_op(&map, (unsigned long)netif->tx_comms_area->addr,
++			  GNTMAP_host_map, tx_ring_ref, netif->domid);
++	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1))
++		BUG();
++	if (map.status) {
++		DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
++		return map.status;
++	}
++	netif->tx_shmem_ref = tx_ring_ref;
++	netif->tx_shmem_handle = map.handle;
++
++	/* Receive ring. */
++	gnttab_set_map_op(&map, (unsigned long)netif->rx_comms_area->addr,
++			  GNTMAP_host_map, rx_ring_ref, netif->domid);
++	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1))
++		BUG();
++	if (!map.status) {
++		netif->rx_shmem_ref = rx_ring_ref;
++		netif->rx_shmem_handle = map.handle;
++		return 0;
++	}
++
++	/* rx failed: drop the tx mapping again (result deliberately ignored). */
++	gnttab_set_unmap_op(&undo, (unsigned long)netif->tx_comms_area->addr,
++			    GNTMAP_host_map, netif->tx_shmem_handle);
++	HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &undo, 1);
++	DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
++	return map.status;
++}
++
++/*
++ * Undo map_frontend_pages(): unmap the tx ring grant, then the rx ring
++ * grant.  A failing unmap hypercall is treated as fatal (BUG).
++ */
++static void unmap_frontend_pages(struct xen_netif *netif)
++{
++	struct gnttab_unmap_grant_ref unmap;
++	unsigned long ring_va;
++
++	ring_va = (unsigned long)netif->tx_comms_area->addr;
++	gnttab_set_unmap_op(&unmap, ring_va, GNTMAP_host_map,
++			    netif->tx_shmem_handle);
++	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1))
++		BUG();
++
++	ring_va = (unsigned long)netif->rx_comms_area->addr;
++	gnttab_set_unmap_op(&unmap, ring_va, GNTMAP_host_map,
++			    netif->rx_shmem_handle);
++	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1))
++		BUG();
++}
++
++/*
++ * Connect @netif to its frontend: allocate one page of VM area per ring,
++ * map the frontend's ring grants into them, bind the inter-domain event
++ * channel (left disabled), initialise the back rings and switch the
++ * carrier on.
++ *
++ * Returns 0 on success or when an irq is already bound; otherwise a
++ * non-zero error value, with every step taken so far undone.
++ */
++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
++	      unsigned long rx_ring_ref, unsigned int evtchn)
++{
++	struct xen_netif_tx_sring *tx_shared;
++	struct xen_netif_rx_sring *rx_shared;
++	int rc;
++
++	/* Already connected through? */
++	if (netif->irq)
++		return 0;
++
++	netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
++	if (!netif->tx_comms_area)
++		return -ENOMEM;
++
++	netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
++	if (!netif->rx_comms_area) {
++		rc = -ENOMEM;
++		goto free_tx_area;
++	}
++
++	rc = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
++	if (rc)
++		goto free_rx_area;
++
++	rc = bind_interdomain_evtchn_to_irqhandler(
++		netif->domid, evtchn, netif_be_int, 0,
++		netif->dev->name, netif);
++	if (rc < 0)
++		goto unmap_rings;
++	netif->irq = rc;
++	disable_irq(netif->irq);
++
++	tx_shared = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
++	BACK_RING_INIT(&netif->tx, tx_shared, PAGE_SIZE);
++
++	rx_shared = (struct xen_netif_rx_sring *)
++		((char *)netif->rx_comms_area->addr);
++	BACK_RING_INIT(&netif->rx, rx_shared, PAGE_SIZE);
++
++	netif->rx_req_cons_peek = 0;
++
++	/* Reference held while connected; dropped in netif_disconnect(). */
++	netif_get(netif);
++
++	rtnl_lock();
++	netback_carrier_on(netif);
++	if (netif_running(netif->dev))
++		__netif_up(netif);
++	rtnl_unlock();
++
++	return 0;
++
++unmap_rings:
++	unmap_frontend_pages(netif);
++free_rx_area:
++	free_vm_area(netif->rx_comms_area);
++free_tx_area:
++	free_vm_area(netif->tx_comms_area);
++	return rc;
++}
++
++/*
++ * Tear down @netif completely: take the carrier down, drop the initial
++ * reference and sleep until all other references are gone, then stop the
++ * timers, unbind the irq, unregister the net_device, release the ring
++ * mappings and free the device.  @netif must not be used afterwards.
++ */
++void netif_disconnect(struct xen_netif *netif)
++{
++	struct net_device *ndev = netif->dev;
++
++	if (netback_carrier_ok(netif)) {
++		rtnl_lock();
++		netback_carrier_off(netif);
++		netif_carrier_off(ndev); /* discard queued packets */
++		if (netif_running(ndev))
++			__netif_down(netif);
++		rtnl_unlock();
++		/* Pairs with the netif_get() taken when the rings were mapped. */
++		netif_put(netif);
++	}
++
++	/* Drop the allocation reference and wait for everyone else. */
++	atomic_dec(&netif->refcnt);
++	wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
++
++	del_timer_sync(&netif->credit_timeout);
++	del_timer_sync(&netif->tx_queue_timeout);
++
++	if (netif->irq)
++		unbind_from_irqhandler(netif->irq, netif);
++
++	unregister_netdev(ndev);
++
++	/* Rings are only mapped if the tx back ring was initialised. */
++	if (netif->tx.sring) {
++		unmap_frontend_pages(netif);
++		free_vm_area(netif->tx_comms_area);
++		free_vm_area(netif->rx_comms_area);
++	}
++
++	free_netdev(ndev);
++}
+diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
+new file mode 100644
+index 0000000..0bc6398
+--- /dev/null
++++ b/drivers/xen/netback/netback.c
+@@ -0,0 +1,1613 @@
++/******************************************************************************
++ * drivers/xen/netback/netback.c
++ *
++ * Back-end of the driver for virtual network devices. This portion of the
++ * driver exports a 'unified' network-device interface that can be accessed
++ * by any operating system that implements a compatible front end. A
++ * reference front-end implementation can be found in:
++ * drivers/xen/netfront/netfront.c
++ *
++ * Copyright (c) 2002-2005, K A Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++
++#include <linux/tcp.h>
++#include <linux/udp.h>
++
++#include <xen/balloon.h>
++#include <xen/events.h>
++#include <xen/interface/memory.h>
++
++#include <asm/xen/hypercall.h>
++#include <asm/xen/page.h>
++
++/*define NETBE_DEBUG_INTERRUPT*/
++
++struct netbk_rx_meta { /* per-slot record written by netbk_gop_skb() and read back by net_rx_action() */
++ skb_frag_t frag; /* copy of the skb fragment; in the head slot, size holds gso_size and page_offset holds gso_type */
++ int id; /* id of the rx request used for this slot, as returned by netbk_gop_frag() */
++};
++
++struct netbk_tx_pending_inuse { /* one entry per pending index, see the pending_inuse[] array */
++ struct list_head list; /* linked on pending_inuse_head by netbk_fill_frags() */
++ unsigned long alloc_time; /* jiffies at the time netbk_fill_frags() put the entry in use */
++};
++
++
++static void netif_idx_release(u16 pending_idx);
++static void make_tx_response(struct xen_netif *netif,
++ struct xen_netif_tx_request *txp,
++ s8 st);
++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
++ u16 id,
++ s8 st,
++ u16 offset,
++ u16 size,
++ u16 flags);
++
++static void net_tx_action(unsigned long unused);
++static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); /* scheduled by maybe_schedule_tx_action() and netbk_tx_pending_timeout() */
++
++static void net_rx_action(unsigned long unused);
++static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); /* scheduled by netif_be_start_xmit(), net_alarm() and net_rx_action() itself */
++
++static struct timer_list net_timer; /* while pending, net_rx_action() does not reschedule itself */
++static struct timer_list netbk_tx_pending_timer; /* NOTE(review): presumably fires netbk_tx_pending_timeout(); its setup is not in this part of the file */
++
++#define MAX_PENDING_REQS 256 /* must be a power of two: pending_index() masks with MAX_PENDING_REQS-1 */
++
++static struct sk_buff_head rx_queue; /* skbs queued by netif_be_start_xmit() and drained by net_rx_action() */
++
++/* One page per pending-request slot, indexed by pending index. */
++static struct page **mmap_pages;
++
++/* Page frame number of the page backing pending slot @idx. */
++static inline unsigned long idx_to_pfn(unsigned int idx)
++{
++	struct page *pg = mmap_pages[idx];
++
++	return page_to_pfn(pg);
++}
++
++/* Kernel virtual address of the page backing pending slot @idx. */
++static inline unsigned long idx_to_kaddr(unsigned int idx)
++{
++	unsigned long pfn = idx_to_pfn(idx);
++
++	return (unsigned long)pfn_to_kaddr(pfn);
++}
++
++/*
++ * Remember which pending slot a page belongs to by storing @index + 1 in
++ * the otherwise unused page->mapping field (so a stored 0 never denotes
++ * slot 0).  netif_page_index() reverses this.
++ */
++static inline void netif_set_page_index(struct page *pg, unsigned int index)
++{
++	unsigned long *cookie = (unsigned long *)&pg->mapping;
++
++	*cookie = index + 1;
++}
++
++/*
++ * Return the pending slot index recorded in @pg by netif_set_page_index(),
++ * or -1 if @pg is not a foreign page, the stored index is out of range, or
++ * mmap_pages[] does not point back at @pg for that index.
++ */
++static inline int netif_page_index(struct page *pg)
++{
++	unsigned long slot;
++
++	if (!PageForeign(pg))
++		return -1;
++
++	slot = (unsigned long)pg->mapping - 1;
++	if (slot >= MAX_PENDING_REQS)
++		return -1;
++	if (mmap_pages[slot] != pg)
++		return -1;
++
++	return slot;
++}
++
++/*
++ * This is the amount of packet we copy rather than map, so that the
++ * guest can't fiddle with the contents of the headers while we do
++ * packet processing on them (netfilter, routing, etc). 72 is enough
++ * to cover TCP+IP headers including options.
++ */
++#define PKT_PROT_LEN 72
++
++static struct pending_tx_info { /* per pending index: the tx request being processed and the interface it came from */
++ struct xen_netif_tx_request req; /* private copy of the frontend's request (see netbk_get_requests()) */
++ struct xen_netif *netif; /* owning interface; a reference is taken when the slot is filled */
++} pending_tx_info[MAX_PENDING_REQS];
++static u16 pending_ring[MAX_PENDING_REQS]; /* ring of free pending indices: taken at pending_cons, returned at pending_prod */
++typedef unsigned int pending_ring_idx_t; /* free-running ring position; reduce with pending_index() before indexing */
++
++/* Reduce a free-running ring position to an index into a MAX_PENDING_REQS-sized ring. */
++static inline pending_ring_idx_t pending_index(unsigned i)
++{
++	const unsigned mask = MAX_PENDING_REQS-1;
++
++	return i & mask;
++}
++
++/* Free-running producer/consumer positions of pending_ring[]. */
++static pending_ring_idx_t pending_prod, pending_cons;
++
++/*
++ * Number of pending slots currently in use, i.e. MAX_PENDING_REQS minus
++ * the number of free indices sitting in pending_ring[].
++ */
++static inline pending_ring_idx_t nr_pending_reqs(void)
++{
++	pending_ring_idx_t free_slots = pending_prod - pending_cons;
++
++	return MAX_PENDING_REQS - free_slots;
++}
++
++/* Freed TX SKBs get batched on this ring before return to pending_ring. */
++static u16 dealloc_ring[MAX_PENDING_REQS];
++static pending_ring_idx_t dealloc_prod, dealloc_cons; /* free-running positions; net_tx_action_dealloc() consumes up to dealloc_prod */
++
++/* Doubly-linked list of in-use pending entries. */
++static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
++static LIST_HEAD(pending_inuse_head); /* entries are appended by netbk_fill_frags(), i.e. ordered by alloc_time */
++
++static struct sk_buff_head tx_queue; /* NOTE(review): not used in this part of the file; presumably skbs being built by net_tx_action() */
++
++static grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; /* grant handle of the mapping for each pending index */
++static struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS]; /* batch buffer filled and submitted by net_tx_action_dealloc() */
++static struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS]; /* NOTE(review): presumably the map-op batch filled via netbk_get_requests(); confirm in net_tx_action() */
++
++static LIST_HEAD(net_schedule_list); /* interfaces with tx work outstanding */
++static DEFINE_SPINLOCK(net_schedule_list_lock); /* protects net_schedule_list */
++
++#define MAX_MFN_ALLOC 64
++static unsigned long mfn_list[MAX_MFN_ALLOC]; /* stack of MFNs handed out by alloc_mfn() */
++static unsigned int alloc_index = 0; /* number of valid entries in mfn_list[] */
++
++/* Setting this allows the safe use of this driver without netloop. */
++static int MODPARM_copy_skb = 1;
++module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
++MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
++
++int netbk_copy_skb_mode; /* compared against NETBK_DELAYED_COPY_SKB in net_tx_action_dealloc() */
++
++/* Pop the most recently stored MFN off mfn_list[]; the list must not be empty. */
++static inline unsigned long alloc_mfn(void)
++{
++	BUG_ON(alloc_index == 0);
++
++	alloc_index--;
++	return mfn_list[alloc_index];
++}
++
++/*
++ * Kick the tx tasklet if fewer than half of the pending slots are in use
++ * and at least one interface is waiting on net_schedule_list.
++ */
++static inline void maybe_schedule_tx_action(void)
++{
++	smp_mb();
++
++	if (nr_pending_reqs() >= (MAX_PENDING_REQS/2))
++		return;
++	if (list_empty(&net_schedule_list))
++		return;
++
++	tasklet_schedule(&net_tx_tasklet);
++}
++
++/*
++ * Build a private copy of @skb: as much of the linear head as fits into a
++ * freshly allocated skb head, and the rest of the packet in newly
++ * allocated pages attached as fragments (one page per fragment).  GSO
++ * size/type and the header offsets are carried over.
++ *
++ * @skb must not have a frag_list.  Returns the copy, or NULL when memory
++ * allocation fails or more than MAX_SKB_FRAGS fragments would be needed.
++ * The original skb is left untouched in either case.
++ */
++static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
++{
++	struct skb_shared_info *dst_info;
++	struct sk_buff *copy_skb;
++	unsigned long src_off;
++	unsigned long delta;
++	int head_bytes;
++	int remaining;
++	int rc;
++
++	BUG_ON(skb_shinfo(skb)->frag_list != NULL);
++
++	copy_skb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
++	if (unlikely(!copy_skb))
++		return NULL;
++
++	/* Linear part: copy as much of the source head as fits. */
++	skb_reserve(copy_skb, NET_SKB_PAD + NET_IP_ALIGN);
++	head_bytes = skb_end_pointer(copy_skb) - copy_skb->data;
++	if (head_bytes > skb_headlen(skb))
++		head_bytes = skb_headlen(skb);
++	rc = skb_copy_bits(skb, 0, __skb_put(copy_skb, head_bytes), head_bytes);
++	BUG_ON(rc);
++
++	dst_info = skb_shinfo(copy_skb);
++	dst_info->gso_size = skb_shinfo(skb)->gso_size;
++	dst_info->gso_type = skb_shinfo(skb)->gso_type;
++
++	src_off = head_bytes;
++	remaining = skb->len - head_bytes;
++
++	copy_skb->len = skb->len;
++	copy_skb->data_len = remaining;
++	copy_skb->truesize += remaining;
++
++	/* Everything else goes into page-sized fragments. */
++	while (remaining) {
++		skb_frag_t *frag;
++		struct page *pg;
++		int chunk = remaining;
++		int gfp_extra = __GFP_ZERO;
++
++		if (unlikely(dst_info->nr_frags >= MAX_SKB_FRAGS)) {
++			dump_stack();
++			goto err_free;
++		}
++
++		/* Only a partially filled (last) page needs zeroing. */
++		if (remaining >= PAGE_SIZE) {
++			chunk = PAGE_SIZE;
++			gfp_extra = 0;
++		}
++
++		pg = alloc_page(GFP_ATOMIC | __GFP_NOWARN | gfp_extra);
++		if (unlikely(!pg))
++			goto err_free;
++
++		rc = skb_copy_bits(skb, src_off, page_address(pg), chunk);
++		BUG_ON(rc);
++
++		frag = &dst_info->frags[dst_info->nr_frags];
++		frag->page = pg;
++		frag->page_offset = 0;
++		frag->size = chunk;
++		dst_info->nr_frags++;
++
++		src_off += chunk;
++		remaining -= chunk;
++	}
++
++	/* Shift the header markers by the distance between the two heads. */
++	delta = copy_skb->data - skb->data;
++
++	copy_skb->transport_header = skb->transport_header + delta;
++	copy_skb->network_header = skb->network_header + delta;
++	copy_skb->mac_header = skb->mac_header + delta;
++
++	return copy_skb;
++
++ err_free:
++	kfree_skb(copy_skb);
++	return NULL;
++}
++
++/*
++ * Worst-case number of rx ring slots a single packet can need on @netif:
++ * one when neither SG nor TSO is enabled, otherwise a header slot, an
++ * extra-info slot and MAX_SKB_FRAGS fragment slots.
++ */
++static inline int netbk_max_required_rx_slots(struct xen_netif *netif)
++{
++	if (!(netif->features & (NETIF_F_SG|NETIF_F_TSO)))
++		return 1; /* all in one */
++
++	return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
++}
++
++/*
++ * Non-zero if the rx ring of @netif cannot be guaranteed to take another
++ * worst-case packet: either too few requests are posted beyond the peek
++ * position, or too little response space is left.
++ */
++static inline int netbk_queue_full(struct xen_netif *netif)
++{
++	RING_IDX peek = netif->rx_req_cons_peek;
++	RING_IDX needed = netbk_max_required_rx_slots(netif);
++
++	if ((netif->rx.sring->req_prod - peek) < needed)
++		return 1;
++
++	return (netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed;
++}
++
++/*
++ * Timer callback armed by netif_be_start_xmit() when it stops the queue:
++ * wake the device queue again if the interface is still schedulable.
++ * @data is the xen_netif pointer.
++ */
++static void tx_queue_callback(unsigned long data)
++{
++	struct xen_netif *netif = (struct xen_netif *)data;
++
++	if (!netif_schedulable(netif))
++		return;
++
++	netif_wake_queue(netif->dev);
++}
++
++/*
++ * If the device can queue and the rx ring is full, ask the frontend for a
++ * notification once enough requests have been posted; if the ring is
++ * still full after that, stop the device queue and arm a restart timer.
++ */
++static void netbk_stop_queue_if_full(struct xen_netif *netif,
++				     struct net_device *dev)
++{
++	if (!netbk_can_queue(dev) || !netbk_queue_full(netif))
++		return;
++
++	netif->rx.sring->req_event = netif->rx_req_cons_peek +
++		netbk_max_required_rx_slots(netif);
++	mb(); /* request notification /then/ check & stop the queue */
++	if (!netbk_queue_full(netif))
++		return;
++
++	netif_stop_queue(dev);
++	/*
++	 * Schedule 500ms timeout to restart the queue, thus
++	 * ensuring that an inactive queue will be drained.
++	 * Packets will immediately be dropped until more
++	 * receive buffers become available (see the
++	 * netbk_queue_full() check in netif_be_start_xmit()).
++	 */
++	netif->tx_queue_timeout.data = (unsigned long)netif;
++	netif->tx_queue_timeout.function = tx_queue_callback;
++	mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
++}
++
++/*
++ * ndo_start_xmit hook: hand a packet destined for the guest to the rx
++ * tasklet.
++ *
++ * The packet is dropped (and counted in tx_dropped) when the interface is
++ * not schedulable, the rx ring is full, or a required private copy cannot
++ * be made.  Otherwise rx ring slots are reserved via rx_req_cons_peek, a
++ * reference on the interface is taken and the skb is queued on rx_queue.
++ * Always returns 0.
++ */
++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++	struct xen_netif *netif = netdev_priv(dev);
++
++	BUG_ON(skb->dev != dev);
++
++	/* Drop the packet if the target domain has no receive buffers. */
++	if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
++		goto drop;
++
++	/*
++	 * XXX For now we also copy skbuffs whose head crosses a page
++	 * boundary, because netbk_gop_skb can't handle them.
++	 */
++	if ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE) {
++		struct sk_buff *copy = netbk_copy_skb(skb);
++
++		if (unlikely(copy == NULL))
++			goto drop;
++		/* Copy only the header fields we use in this driver. */
++		copy->dev = skb->dev;
++		copy->ip_summed = skb->ip_summed;
++		dev_kfree_skb(skb);
++		skb = copy;
++	}
++
++	/* Reserve one slot for the head, one per frag, one for GSO info. */
++	netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
++				   !!skb_shinfo(skb)->gso_size;
++	netif_get(netif);
++
++	netbk_stop_queue_if_full(netif, dev);
++
++	skb_queue_tail(&rx_queue, skb);
++	tasklet_schedule(&net_rx_tasklet);
++
++	return 0;
++
++ drop:
++	netif->stats.tx_dropped++;
++	dev_kfree_skb(skb);
++	return 0;
++}
++
++struct netrx_pending_operations { /* batch state shared by net_rx_action() and the netbk_gop_*()/netbk_check_gop() helpers */
++ unsigned trans_prod, trans_cons; /* trans_prod: number of entries queued in trans[] */
++ unsigned mmu_prod, mmu_mcl; /* mmu_prod: entries queued in mmu[]; mmu_mcl: multicall index recorded by net_rx_action() */
++ unsigned mcl_prod, mcl_cons; /* mcl_prod: number of multicall entries queued in mcl[] */
++ unsigned copy_prod, copy_cons; /* copy ops queued by netbk_gop_frag() / checked by netbk_check_gop() */
++ unsigned meta_prod, meta_cons; /* meta slots filled by netbk_gop_skb() / consumed by net_rx_action() */
++ struct mmu_update *mmu;
++ struct gnttab_transfer *trans;
++ struct gnttab_copy *copy;
++ struct multicall_entry *mcl;
++ struct netbk_rx_meta *meta;
++};
++
++/*
++ * Queue one grant-copy operation in @npo that copies @size bytes starting
++ * at @offset within @page into the guest buffer named by rx request
++ * number (req_cons + @i) of @netif.
++ *
++ * If @page is one of our foreign tx pages, the copy is sourced straight
++ * from the originating guest's grant reference; otherwise it is sourced
++ * from our own frame.  Returns the id of the rx request that was used.
++ */
++static u16 netbk_gop_frag(struct xen_netif *netif, struct netbk_rx_meta *meta,
++			  int i, struct netrx_pending_operations *npo,
++			  struct page *page, unsigned long size,
++			  unsigned long offset)
++{
++	struct xen_netif_rx_request *rx_req;
++	struct gnttab_copy *gop;
++	unsigned long local_mfn;
++	int pending = netif_page_index(page);
++
++	local_mfn = virt_to_mfn(page_address(page));
++
++	rx_req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
++
++	gop = npo->copy + npo->copy_prod++;
++	gop->flags = GNTCOPY_dest_gref;
++
++	/* Source side. */
++	if (pending > -1) {
++		struct pending_tx_info *origin = &pending_tx_info[pending];
++
++		gop->source.domid = origin->netif->domid;
++		gop->source.u.ref = origin->req.gref;
++		gop->flags |= GNTCOPY_source_gref;
++	} else {
++		gop->source.domid = DOMID_SELF;
++		gop->source.u.gmfn = local_mfn;
++	}
++	gop->source.offset = offset;
++
++	/* Destination side: the buffer granted by the receiving guest. */
++	gop->dest.domid = netif->domid;
++	gop->dest.offset = 0;
++	gop->dest.u.ref = rx_req->gref;
++
++	gop->len = size;
++
++	return rx_req->id;
++}
++
++/*
++ * Queue the grant-copy operations for a whole skb in @npo and consume the
++ * matching rx requests of the destination interface.
++ *
++ * One meta slot is used for the head and one per fragment.  The head slot
++ * additionally records gso_type (in frag.page_offset) and gso_size (in
++ * frag.size); a non-zero gso_size reserves one extra ring slot right
++ * after the head for the GSO extra-info response.
++ */
++static void netbk_gop_skb(struct sk_buff *skb,
++			  struct netrx_pending_operations *npo)
++{
++	struct xen_netif *netif = netdev_priv(skb->dev);
++	struct skb_shared_info *shinfo = skb_shinfo(skb);
++	struct netbk_rx_meta *head, *slot;
++	int frag_count = shinfo->nr_frags;
++	int first_frag_slot;
++	int f;
++
++	head = npo->meta + npo->meta_prod++;
++	head->frag.page_offset = shinfo->gso_type;
++	head->frag.size = shinfo->gso_size;
++	first_frag_slot = !!head->frag.size + 1;
++
++	for (f = 0; f < frag_count; f++) {
++		slot = npo->meta + npo->meta_prod++;
++		slot->frag = shinfo->frags[f];
++		slot->id = netbk_gop_frag(netif, slot, f + first_frag_slot, npo,
++					  slot->frag.page,
++					  slot->frag.size,
++					  slot->frag.page_offset);
++	}
++
++	/*
++	 * This must occur at the end to ensure that we don't trash skb_shinfo
++	 * until we're done. We know that the head doesn't cross a page
++	 * boundary because such packets get copied in netif_be_start_xmit.
++	 */
++	head->id = netbk_gop_frag(netif, head, 0, npo,
++				  virt_to_page(skb->data),
++				  skb_headlen(skb),
++				  offset_in_page(skb->data));
++
++	netif->rx.req_cons += frag_count + first_frag_slot;
++}
++
++/* Drop one page reference for each of the first @nr_frags entries of @meta. */
++static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
++{
++	struct netbk_rx_meta *slot = meta;
++	int left = nr_frags;
++
++	while (left-- > 0) {
++		put_page(slot->frag.page);
++		slot++;
++	}
++}
++
++/*
++ * Counterpart of netbk_gop_skb(): after the batched operations have been
++ * executed, walk over the (@nr_frags + 1) copy operations belonging to one
++ * skb, advancing npo->copy_cons past them.
++ *
++ * Returns NETIF_RSP_OKAY if every copy succeeded, NETIF_RSP_ERROR if any
++ * of them reported a status other than GNTST_okay.
++ */
++static int netbk_check_gop(int nr_frags, domid_t domid,
++			   struct netrx_pending_operations *npo)
++{
++	struct gnttab_copy *op;
++	int result = NETIF_RSP_OKAY;
++	int remaining = nr_frags + 1;	/* fragments plus the head */
++
++	while (remaining-- > 0) {
++		op = npo->copy + npo->copy_cons++;
++		if (op->status == GNTST_okay)
++			continue;
++
++		DPRINTK("Bad status %d from copy to DOM%d.\n",
++			op->status, domid);
++		result = NETIF_RSP_ERROR;
++	}
++
++	return result;
++}
++
++/*
++ * Emit one rx response per fragment described by @meta[0..@nr_frags-1],
++ * all carrying @status and offset 0.  Every response except the last is
++ * flagged NETRXF_more_data.
++ */
++static void netbk_add_frag_responses(struct xen_netif *netif, int status,
++				     struct netbk_rx_meta *meta, int nr_frags)
++{
++	int last = nr_frags - 1;
++	int n;
++
++	for (n = 0; n < nr_frags; n++) {
++		struct netbk_rx_meta *slot = &meta[n];
++		int more = (n == last) ? 0 : NETRXF_more_data;
++
++		make_rx_response(netif, slot->id, status, 0,
++				 slot->frag.size, more);
++	}
++}
++
++/*
++ * rx tasklet: deliver the skbs queued on rx_queue to their guests.
++ *
++ * Phase 1 dequeues skbs and lets netbk_gop_skb() queue the grant-copy
++ * operations for each, until the queue is empty or the batch is full.
++ * Phase 2 submits everything in a single multicall.
++ * Phase 3 walks the same skbs again, checks the copy results, writes the
++ * rx responses (plus a GSO extra-info slot where needed), wakes stopped
++ * device queues that have room again, drops the per-skb interface
++ * reference and frees the skb.
++ * Finally each frontend that needs it is notified once, and the tasklet
++ * reschedules itself if rx_queue is not empty and net_timer is not pending.
++ */
++static void net_rx_action(unsigned long unused)
++{
++	struct xen_netif *netif = NULL;
++	s8 status;
++	u16 id, irq, flags;
++	struct xen_netif_rx_response *resp;
++	struct multicall_entry *mcl;
++	struct sk_buff_head rxq;
++	struct sk_buff *skb;
++	int notify_nr = 0;
++	int ret;
++	int nr_frags;
++	int count;
++	unsigned long offset;
++
++	/*
++	 * Putting hundreds of bytes on the stack is considered rude.
++	 * Static works because a tasklet can only be on one CPU at any time.
++	 */
++	static struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
++	static struct mmu_update rx_mmu[NET_RX_RING_SIZE];
++	static struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
++	static struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
++	static unsigned char rx_notify[NR_IRQS];
++	static u16 notify_list[NET_RX_RING_SIZE];
++	static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
++
++	/* All producer/consumer counters start at zero. */
++	struct netrx_pending_operations npo = {
++		.mmu = rx_mmu,
++		.trans = grant_trans_op,
++		.copy = grant_copy_op,
++		.mcl = rx_mcl,
++		.meta = meta,
++	};
++
++	skb_queue_head_init(&rxq);
++
++	count = 0;
++
++	/* Phase 1: queue the grant operations for a batch of skbs. */
++	while ((skb = skb_dequeue(&rx_queue)) != NULL) {
++		nr_frags = skb_shinfo(skb)->nr_frags;
++		/* Remember the fragment count for phase 3. */
++		*(int *)skb->cb = nr_frags;
++
++		netbk_gop_skb(skb, &npo);
++
++		count += nr_frags + 1;
++
++		__skb_queue_tail(&rxq, skb);
++
++		/* Filled the batch queue? */
++		if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
++			break;
++	}
++
++	BUG_ON(npo.meta_prod > ARRAY_SIZE(meta));
++
++	/* Phase 2: assemble and issue the multicall. */
++	npo.mmu_mcl = npo.mcl_prod;
++	if (npo.mcl_prod) {
++		BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++		BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu));
++		mcl = npo.mcl + npo.mcl_prod++;
++
++		BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
++		mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
++
++		mcl->op = __HYPERVISOR_mmu_update;
++		mcl->args[0] = (unsigned long)rx_mmu;
++		mcl->args[1] = npo.mmu_prod;
++		mcl->args[2] = 0;
++		mcl->args[3] = DOMID_SELF;
++	}
++
++	if (npo.trans_prod) {
++		BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op));
++		mcl = npo.mcl + npo.mcl_prod++;
++		mcl->op = __HYPERVISOR_grant_table_op;
++		mcl->args[0] = GNTTABOP_transfer;
++		mcl->args[1] = (unsigned long)grant_trans_op;
++		mcl->args[2] = npo.trans_prod;
++	}
++
++	if (npo.copy_prod) {
++		BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op));
++		mcl = npo.mcl + npo.mcl_prod++;
++		mcl->op = __HYPERVISOR_grant_table_op;
++		mcl->args[0] = GNTTABOP_copy;
++		mcl->args[1] = (unsigned long)grant_copy_op;
++		mcl->args[2] = npo.copy_prod;
++	}
++
++	/* Nothing to do? */
++	if (!npo.mcl_prod)
++		return;
++
++	BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl));
++
++	ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
++	BUG_ON(ret != 0);
++	/* The mmu_machphys_update() must not fail. */
++	BUG_ON(npo.mmu_mcl && npo.mcl[npo.mmu_mcl].result != 0);
++
++	/* Phase 3: report the results to the frontends. */
++	while ((skb = __skb_dequeue(&rxq)) != NULL) {
++		nr_frags = *(int *)skb->cb;
++
++		netif = netdev_priv(skb->dev);
++
++		netif->stats.tx_bytes += skb->len;
++		netif->stats.tx_packets++;
++
++		status = netbk_check_gop(nr_frags, netif->domid, &npo);
++
++		id = meta[npo.meta_cons].id;
++		flags = nr_frags ? NETRXF_more_data : 0;
++
++		if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
++			flags |= NETRXF_csum_blank | NETRXF_data_validated;
++		else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
++			/* remote but checksummed. */
++			flags |= NETRXF_data_validated;
++
++		offset = 0;
++		resp = make_rx_response(netif, id, status, offset,
++					skb_headlen(skb), flags);
++
++		/* Head meta frag.size holds gso_size (see netbk_gop_skb()). */
++		if (meta[npo.meta_cons].frag.size) {
++			struct xen_netif_extra_info *gso =
++				(struct xen_netif_extra_info *)
++				RING_GET_RESPONSE(&netif->rx,
++						  netif->rx.rsp_prod_pvt++);
++
++			resp->flags |= NETRXF_extra_info;
++
++			gso->u.gso.size = meta[npo.meta_cons].frag.size;
++			gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
++			gso->u.gso.pad = 0;
++			gso->u.gso.features = 0;
++
++			gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
++			gso->flags = 0;
++		}
++
++		netbk_add_frag_responses(netif, status,
++					 meta + npo.meta_cons + 1,
++					 nr_frags);
++
++		RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
++		irq = netif->irq;
++		/* Collect each irq at most once per batch. */
++		if (ret && !rx_notify[irq] &&
++		    (netif->smart_poll != 1)) {
++			rx_notify[irq] = 1;
++			notify_list[notify_nr++] = irq;
++		}
++
++		if (netif_queue_stopped(netif->dev) &&
++		    netif_schedulable(netif) &&
++		    !netbk_queue_full(netif))
++			netif_wake_queue(netif->dev);
++
++		/*
++		 * netfront_smartpoll_active indicates whether
++		 * netfront timer is active.
++		 */
++		if (netif->smart_poll == 1) {
++			if (!(netif->rx.sring->netfront_smartpoll_active)) {
++				notify_remote_via_irq(irq);
++				netif->rx.sring->netfront_smartpoll_active = 1;
++			}
++		}
++
++		/* Pairs with the netif_get() in netif_be_start_xmit(). */
++		netif_put(netif);
++		dev_kfree_skb(skb);
++		npo.meta_cons += nr_frags + 1;
++	}
++
++	while (notify_nr != 0) {
++		irq = notify_list[--notify_nr];
++		rx_notify[irq] = 0;
++		notify_remote_via_irq(irq);
++	}
++
++	/* More work to do? */
++	if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
++		tasklet_schedule(&net_rx_tasklet);
++}
++
++static void net_alarm(unsigned long unused) /* timer-style callback (argument ignored): just re-run the rx tasklet */
++{
++ tasklet_schedule(&net_rx_tasklet);
++}
++
++static void netbk_tx_pending_timeout(unsigned long unused) /* timer-style callback (argument ignored): just re-run the tx tasklet */
++{
++ tasklet_schedule(&net_tx_tasklet);
++}
++
++/* ndo_get_stats hook: the counters live in the xen_netif private area. */
++struct net_device_stats *netif_be_get_stats(struct net_device *dev)
++{
++	struct xen_netif *vif;
++
++	vif = netdev_priv(dev);
++
++	return &vif->stats;
++}
++
++/* 1 if @netif is currently linked on a list through netif->list, else 0. */
++static int __on_net_schedule_list(struct xen_netif *netif)
++{
++	if (list_empty(&netif->list))
++		return 0;
++
++	return 1;
++}
++
++/*
++ * Take @netif off net_schedule_list (if it is on it) and drop the
++ * reference the list was holding.
++ */
++static void remove_from_net_schedule_list(struct xen_netif *netif)
++{
++	int was_queued;
++
++	spin_lock_irq(&net_schedule_list_lock);
++	was_queued = __on_net_schedule_list(netif);
++	if (likely(was_queued)) {
++		list_del_init(&netif->list);
++		netif_put(netif);
++	}
++	spin_unlock_irq(&net_schedule_list_lock);
++}
++
++/*
++ * Append @netif to net_schedule_list, taking a reference for the list,
++ * unless it is already queued or is not schedulable.
++ */
++static void add_to_net_schedule_list_tail(struct xen_netif *netif)
++{
++	/* Cheap unlocked check first; repeated under the lock below. */
++	if (__on_net_schedule_list(netif))
++		return;
++
++	spin_lock_irq(&net_schedule_list_lock);
++
++	if (__on_net_schedule_list(netif))
++		goto unlock;
++	if (!likely(netif_schedulable(netif)))
++		goto unlock;
++
++	list_add_tail(&netif->list, &net_schedule_list);
++	netif_get(netif);
++
++unlock:
++	spin_unlock_irq(&net_schedule_list_lock);
++}
++
++/*
++ * If the tx ring of @netif still has unconsumed requests, queue the
++ * interface for tx processing and kick the tx tasklet if appropriate.
++ */
++void netif_schedule_work(struct xen_netif *netif)
++{
++	int pending;
++
++	RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, pending);
++	if (!pending)
++		return;
++
++	add_to_net_schedule_list_tail(netif);
++	maybe_schedule_tx_action();
++}
++
++void netif_deschedule_work(struct xen_netif *netif) /* public wrapper: take @netif off the tx schedule list */
++{
++ remove_from_net_schedule_list(netif);
++}
++
++
++/*
++ * Replenish the transmit credit of @netif by credit_bytes, capped at a
++ * burst limit.  The limit is the size of the next tx request (at most
++ * 128kB), but never less than credit_bytes.
++ */
++static void tx_add_credit(struct xen_netif *netif)
++{
++	unsigned long burst_cap, credit;
++
++	/*
++	 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
++	 * Otherwise the interface can seize up due to insufficient credit.
++	 */
++	burst_cap = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
++	if (burst_cap > 131072UL)
++		burst_cap = 131072UL;
++	if (burst_cap < netif->credit_bytes)
++		burst_cap = netif->credit_bytes;
++
++	/* Take care that adding a new chunk of credit doesn't wrap to zero. */
++	credit = netif->remaining_credit + netif->credit_bytes;
++	if (credit < netif->remaining_credit)
++		credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
++
++	netif->remaining_credit = (credit < burst_cap) ? credit : burst_cap;
++}
++
++/*
++ * Credit timer callback: top up the credit of the interface passed in
++ * @data and let it be scheduled for tx processing again.
++ */
++static void tx_credit_callback(unsigned long data)
++{
++	struct xen_netif *vif = (struct xen_netif *)data;
++
++	tx_add_credit(vif);
++	netif_schedule_work(vif);
++}
++
++/*
++ * Ask the grant-table code to replace the granted page of pending slot
++ * @pending_idx with a local copy.  Returns gnttab_copy_grant_page()'s
++ * result unchanged.
++ */
++static inline int copy_pending_req(pending_ring_idx_t pending_idx)
++{
++	struct page **slot_page = &mmap_pages[pending_idx];
++	grant_handle_t handle = grant_tx_handle[pending_idx];
++
++	return gnttab_copy_grant_page(handle, slot_page);
++}
++
++/*
++ * Release pending tx slots whose pages are no longer needed.
++ *
++ * Indices published on dealloc_ring are moved to a local list and an unmap
++ * operation is queued for each one that is still mapped.  In delayed-copy
++ * mode, in-use entries older than HZ/2 are additionally copied and, when
++ * the copy succeeds, treated as released too.  All unmaps are then issued
++ * in one hypercall, after which every released slot gets an OKAY tx
++ * response, its page is reset, its index goes back on pending_ring and the
++ * interface reference held for it is dropped.
++ */
++static inline void net_tx_action_dealloc(void)
++{
++	struct netbk_tx_pending_inuse *entry, *next;
++	struct gnttab_unmap_grant_ref *unmap = tx_unmap_ops;
++	pending_ring_idx_t cons = dealloc_cons;
++	pending_ring_idx_t prod;
++	struct xen_netif *netif;
++	u16 pending_idx;
++	int ret;
++	LIST_HEAD(done);
++
++	/*
++	 * Free up any grants we have finished using
++	 */
++	do {
++		prod = dealloc_prod;
++
++		/* Ensure we see all indices enqueued by netif_idx_release(). */
++		smp_rmb();
++
++		while (cons != prod) {
++			unsigned long pfn;
++
++			pending_idx = dealloc_ring[pending_index(cons++)];
++			list_move_tail(&pending_inuse[pending_idx].list, &done);
++
++			/* Already unmapped? */
++			pfn = idx_to_pfn(pending_idx);
++			if (!phys_to_machine_mapping_valid(pfn))
++				continue;
++
++			gnttab_set_unmap_op(unmap, idx_to_kaddr(pending_idx),
++					    GNTMAP_host_map,
++					    grant_tx_handle[pending_idx]);
++			unmap++;
++		}
++
++		if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB ||
++		    list_empty(&pending_inuse_head))
++			break;
++
++		/* Copy any entries that have been pending for too long. */
++		list_for_each_entry_safe(entry, next, &pending_inuse_head, list) {
++			int rc;
++
++			/* The list is in allocation order: stop at the first young one. */
++			if (time_after(entry->alloc_time + HZ / 2, jiffies))
++				break;
++
++			pending_idx = entry - pending_inuse;
++
++			pending_tx_info[pending_idx].netif->nr_copied_skbs++;
++
++			rc = copy_pending_req(pending_idx);
++			if (rc == 0) {
++				list_move_tail(&entry->list, &done);
++				continue;
++			}
++			if (rc == -EBUSY) {
++				list_del_init(&entry->list);
++				continue;
++			}
++			if (rc == -ENOENT)
++				continue;
++
++			/* Any other result: give up on copying for now. */
++			break;
++		}
++	} while (prod != dealloc_prod);
++
++	dealloc_cons = cons;
++
++	ret = HYPERVISOR_grant_table_op(
++		GNTTABOP_unmap_grant_ref, tx_unmap_ops, unmap - tx_unmap_ops);
++	BUG_ON(ret);
++
++	list_for_each_entry_safe(entry, next, &done, list) {
++		pending_idx = entry - pending_inuse;
++
++		netif = pending_tx_info[pending_idx].netif;
++
++		make_tx_response(netif, &pending_tx_info[pending_idx].req,
++				 NETIF_RSP_OKAY);
++
++		/* Ready for next use. */
++		gnttab_reset_grant_page(mmap_pages[pending_idx]);
++
++		pending_ring[pending_index(pending_prod++)] = pending_idx;
++
++		netif_put(netif);
++
++		list_del_init(&entry->list);
++	}
++}
++
++/*
++ * Fail a tx packet: send an error response for @txp and for every further
++ * request from the current tx.req_cons up to (but not including) @end,
++ * then advance tx.req_cons, reschedule the interface and drop one
++ * reference on @netif.
++ *
++ * Ring indices are free-running counters that wrap around, so "have we
++ * reached @end" is decided on the signed distance between the two indices.
++ * A plain 'cons >= end' stops too early once @end has wrapped past zero
++ * while cons has not, leaving requests without a response.
++ */
++static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *txp, RING_IDX end)
++{
++	RING_IDX cons = netif->tx.req_cons;
++
++	do {
++		make_tx_response(netif, txp, NETIF_RSP_ERROR);
++		/* Wrap-safe equivalent of 'cons >= end'. */
++		if ((int)(cons - end) >= 0)
++			break;
++		txp = RING_GET_REQUEST(&netif->tx, cons++);
++	} while (1);
++	netif->tx.req_cons = cons;
++	netif_schedule_work(netif);
++	netif_put(netif);
++}
++
++/*
++ * Copy the follow-on (fragment) requests of a multi-request tx packet
++ * from the ring into @txp[] and validate them.
++ *
++ * @first is the already-copied first request; each fragment's size is
++ * subtracted from first->size.  @work_to_do limits how many requests may
++ * be consumed.
++ *
++ * Returns the number of fragment requests (0 if @first does not have
++ * NETTXF_more_data set).  On a malformed chain, returns minus the number
++ * of fragment requests examined so far: too few requests available, more
++ * than MAX_SKB_FRAGS fragments, a fragment larger than what is left of
++ * first->size, or a fragment crossing a page boundary.
++ */
++static int netbk_count_requests(struct xen_netif *netif,
++				struct xen_netif_tx_request *first,
++				struct xen_netif_tx_request *txp, int work_to_do)
++{
++	RING_IDX base = netif->tx.req_cons;
++	int nfrags = 0;
++
++	if (!(first->flags & NETTXF_more_data))
++		return 0;
++
++	for (;;) {
++		if (nfrags >= work_to_do) {
++			DPRINTK("Need more frags\n");
++			return -nfrags;
++		}
++
++		if (unlikely(nfrags >= MAX_SKB_FRAGS)) {
++			DPRINTK("Too many frags\n");
++			return -nfrags;
++		}
++
++		/* Work on a private copy of the request from here on. */
++		memcpy(txp, RING_GET_REQUEST(&netif->tx, base + nfrags),
++		       sizeof(*txp));
++		if (txp->size > first->size) {
++			DPRINTK("Frags galore\n");
++			return -nfrags;
++		}
++
++		first->size -= txp->size;
++		nfrags++;
++
++		if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
++			DPRINTK("txp->offset: %x, size: %u\n",
++				txp->offset, txp->size);
++			return -nfrags;
++		}
++
++		if (!(txp->flags & NETTXF_more_data))
++			break;
++		txp++;
++	}
++
++	return nfrags;
++}
++
++/*
++ * For every fragment of @skb that still needs its own page, take a free
++ * pending slot, queue a read-only grant map operation for the matching
++ * request in @txp[], record the request and owning interface in
++ * pending_tx_info[] (taking an interface reference) and stash the pending
++ * index in the fragment's page pointer until netbk_fill_frags() runs.
++ *
++ * The pending index of the header is read from the start of skb->data.
++ * Returns the position in the map-op array after the last entry queued.
++ */
++static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
++						       struct sk_buff *skb,
++						       struct xen_netif_tx_request *txp,
++						       struct gnttab_map_grant_ref *mop)
++{
++	struct skb_shared_info *shinfo = skb_shinfo(skb);
++	skb_frag_t *frags = shinfo->frags;
++	unsigned long pending_idx = *((u16 *)skb->data);
++	int first, n;
++
++	/* Skip first skb fragment if it is on same page as header fragment. */
++	first = ((unsigned long)shinfo->frags[0].page == pending_idx);
++
++	for (n = first; n < shinfo->nr_frags; n++, txp++) {
++		struct pending_tx_info *info;
++
++		pending_idx = pending_ring[pending_index(pending_cons++)];
++		info = &pending_tx_info[pending_idx];
++
++		gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
++				  GNTMAP_host_map | GNTMAP_readonly,
++				  txp->gref, netif->domid);
++
++		memcpy(&info->req, txp, sizeof(*txp));
++		netif_get(netif);
++		info->netif = netif;
++		frags[n].page = (void *)pending_idx;
++	}
++
++	return mop;
++}
++
++/*
++ * Check the results of the grant map operations issued for @skb: one for
++ * the header, followed by one per fragment that got its own pending slot.
++ *
++ * For each successful map, the p2m entry of the slot's page is pointed at
++ * the foreign frame and the grant handle is remembered.  For each failed
++ * map, an error response is sent, the pending index is returned to
++ * pending_ring and the interface reference is dropped.  As soon as any map
++ * has failed, every successfully mapped slot of this skb (header,
++ * preceding and following fragments) is released via netif_idx_release().
++ *
++ * *@mopp is advanced past the operations consumed.  Returns 0 if all maps
++ * succeeded, otherwise the status of the first failing operation.
++ */
++static int netbk_tx_check_mop(struct sk_buff *skb,
++			      struct gnttab_map_grant_ref **mopp)
++{
++	struct gnttab_map_grant_ref *mop = *mopp;
++	int pending_idx = *((u16 *)skb->data);
++	struct xen_netif *netif = pending_tx_info[pending_idx].netif;
++	struct xen_netif_tx_request *txp;
++	struct skb_shared_info *shinfo = skb_shinfo(skb);
++	int nr_frags = shinfo->nr_frags;
++	int i, err, start;
++
++	/* Check status of header. */
++	err = mop->status;
++	if (unlikely(err)) {
++		txp = &pending_tx_info[pending_idx].req;
++		make_tx_response(netif, txp, NETIF_RSP_ERROR);
++		pending_ring[pending_index(pending_prod++)] = pending_idx;
++		netif_put(netif);
++	} else {
++		set_phys_to_machine(
++			__pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
++			FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
++		grant_tx_handle[pending_idx] = mop->handle;
++	}
++
++	/* Skip first skb fragment if it is on same page as header fragment. */
++	start = ((unsigned long)shinfo->frags[0].page == pending_idx);
++
++	for (i = start; i < nr_frags; i++) {
++		int j, newerr;
++
++		pending_idx = (unsigned long)shinfo->frags[i].page;
++
++		/* Check error status: if okay then remember grant handle. */
++		newerr = (++mop)->status;
++		if (likely(!newerr)) {
++			set_phys_to_machine(
++				__pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
++				FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
++			grant_tx_handle[pending_idx] = mop->handle;
++			/* Had a previous error? Invalidate this fragment. */
++			if (unlikely(err))
++				netif_idx_release(pending_idx);
++			continue;
++		}
++
++		/* Error on this fragment: respond to client with an error. */
++		txp = &pending_tx_info[pending_idx].req;
++		make_tx_response(netif, txp, NETIF_RSP_ERROR);
++		pending_ring[pending_index(pending_prod++)] = pending_idx;
++		netif_put(netif);
++
++		/* Not the first error? Preceding frags already invalidated. */
++		if (err)
++			continue;
++
++		/* First error: invalidate header and preceding fragments. */
++		pending_idx = *((u16 *)skb->data);
++		netif_idx_release(pending_idx);
++		for (j = start; j < i; j++) {
++			/*
++			 * Index with j, not i: frags[i] is the fragment that
++			 * just failed and was already returned to pending_ring
++			 * above; the ones to release are those before it.
++			 */
++			pending_idx = (unsigned long)shinfo->frags[j].page;
++			netif_idx_release(pending_idx);
++		}
++
++		/* Remember the error: invalidate all subsequent fragments. */
++		err = newerr;
++	}
++
++	*mopp = mop + 1;
++	return err;
++}
++
++static void netbk_fill_frags(struct sk_buff *skb)
++{ /* Replace the slot indices parked in frags[].page with real pages and account their sizes. */
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ int nr_frags = shinfo->nr_frags;
++ int i;
++
++ for (i = 0; i < nr_frags; i++) {
++ skb_frag_t *frag = shinfo->frags + i;
++ struct xen_netif_tx_request *txp;
++ unsigned long pending_idx;
++
++ pending_idx = (unsigned long)frag->page; /* slot index, not yet a struct page pointer */
++
++ pending_inuse[pending_idx].alloc_time = jiffies; /* timestamp read back in net_tx_submit() */
++ list_add_tail(&pending_inuse[pending_idx].list,
++ &pending_inuse_head);
++
++ txp = &pending_tx_info[pending_idx].req;
++ frag->page = virt_to_page(idx_to_kaddr(pending_idx));
++ frag->size = txp->size;
++ frag->page_offset = txp->offset;
++
++ skb->len += txp->size;
++ skb->data_len += txp->size;
++ skb->truesize += txp->size;
++ }
++}
++
++int netbk_get_extras(struct xen_netif *netif, struct xen_netif_extra_info *extras,
++ int work_to_do)
++{ /* Copy chained extra-info slots into extras[type - 1]; returns the remaining work_to_do or -errno. */
++ struct xen_netif_extra_info extra;
++ RING_IDX cons = netif->tx.req_cons;
++
++ do {
++ if (unlikely(work_to_do-- <= 0)) {
++ DPRINTK("Missing extra info\n");
++ return -EBADR;
++ }
++
++ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
++ sizeof(extra)); /* work on a private copy of the ring slot */
++ if (unlikely(!extra.type ||
++ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
++ netif->tx.req_cons = ++cons; /* consume the bad slot before failing */
++ DPRINTK("Invalid extra type: %d\n", extra.type);
++ return -EINVAL;
++ }
++
++ memcpy(&extras[extra.type - 1], &extra, sizeof(extra)); /* type was range-checked above */
++ netif->tx.req_cons = ++cons;
++ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
++
++ return work_to_do;
++}
++
++static int netbk_set_skb_gso(struct sk_buff *skb, struct xen_netif_extra_info *gso)
++{
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++
++ if (gso->u.gso.size == 0) {
++ DPRINTK("GSO size must not be zero.\n");
++ return -EINVAL;
++ }
++
++ /* TCPv4 is the only segmentation offload type accepted here. */
++ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
++ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
++ return -EINVAL;
++ }
++
++ /* DODGY with gso_segs == 0: header still to be checked, segments still to be counted. */
++ shinfo->gso_size = gso->u.gso.size;
++ shinfo->gso_type = SKB_GSO_TCPV4 | SKB_GSO_DODGY;
++ shinfo->gso_segs = 0;
++
++ return 0;
++}
++
++static int skb_checksum_setup(struct sk_buff *skb)
++{ /* Point csum_start/csum_offset at the TCP or UDP checksum field; returns 0 or -EPROTO. */
++ struct iphdr *iph;
++ unsigned char *th;
++ int err = -EPROTO;
++
++ if (skb->protocol != htons(ETH_P_IP))
++ goto out;
++
++ iph = (void *)skb->data;
++ th = skb->data + 4 * iph->ihl; /* ihl counts 32-bit words */
++ if (th >= skb_tail_pointer(skb))
++ goto out;
++
++ skb->csum_start = th - skb->head;
++ switch (iph->protocol) {
++ case IPPROTO_TCP:
++ skb->csum_offset = offsetof(struct tcphdr, check);
++ break;
++ case IPPROTO_UDP:
++ skb->csum_offset = offsetof(struct udphdr, check);
++ break;
++ default:
++ if (net_ratelimit())
++ printk(KERN_ERR "Attempting to checksum a non-"
++ "TCP/UDP packet, dropping a protocol"
++ " %d packet\n", iph->protocol); /* newline ends the log record */
++ goto out;
++ }
++
++ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb)) /* 2-byte checksum field must lie in the linear area */
++ goto out;
++
++ err = 0;
++
++out:
++ return err;
++}
++
++static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size)
++{ /* Returns true when a request of `size` bytes has to wait for more credit. */
++ unsigned long now = jiffies;
++ unsigned long next_credit =
++ netif->credit_timeout.expires +
++ msecs_to_jiffies(netif->credit_usec / 1000); /* credit period: usec -> msec -> jiffies */
++
++ /* Timer could already be pending in rare cases. */
++ if (timer_pending(&netif->credit_timeout))
++ return true;
++
++ /* Passed the point where we can replenish credit? */
++ if (time_after_eq(now, next_credit)) {
++ netif->credit_timeout.expires = now;
++ tx_add_credit(netif);
++ }
++
++ /* Still too big to send right now? Set a callback. */
++ if (size > netif->remaining_credit) {
++ netif->credit_timeout.data =
++ (unsigned long)netif;
++ netif->credit_timeout.function =
++ tx_credit_callback;
++ mod_timer(&netif->credit_timeout,
++ next_credit);
++
++ return true;
++ }
++
++ return false;
++}
++
++static unsigned net_tx_build_mops(void)
++{ /* Turn pending tx requests into queued skbs plus grant-map ops; returns the number of ops built. */
++ struct gnttab_map_grant_ref *mop;
++ struct sk_buff *skb;
++ int ret;
++
++ mop = tx_map_ops;
++ while (((nr_pending_reqs() + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
++ !list_empty(&net_schedule_list)) {
++ struct xen_netif *netif;
++ struct xen_netif_tx_request txreq;
++ struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
++ struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
++ u16 pending_idx;
++ RING_IDX idx;
++ int work_to_do;
++ unsigned int data_len;
++
++ /* Get a netif from the list with work to do. */
++ netif = list_first_entry(&net_schedule_list, struct xen_netif, list);
++ netif_get(netif);
++ remove_from_net_schedule_list(netif);
++
++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
++ if (!work_to_do) {
++ netif_put(netif);
++ continue;
++ }
++
++ idx = netif->tx.req_cons;
++ rmb(); /* Ensure that we see the request before we copy it. */
++ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, idx), sizeof(txreq));
++
++ /* Credit-based scheduling. */
++ if (txreq.size > netif->remaining_credit &&
++ tx_credit_exceeded(netif, txreq.size)) {
++ netif_put(netif);
++ continue;
++ }
++
++ netif->remaining_credit -= txreq.size;
++
++ work_to_do--;
++ netif->tx.req_cons = ++idx;
++
++ memset(extras, 0, sizeof(extras)); /* type == 0 marks an absent extra */
++ if (txreq.flags & NETTXF_extra_info) {
++ work_to_do = netbk_get_extras(netif, extras,
++ work_to_do);
++ idx = netif->tx.req_cons; /* netbk_get_extras() advanced the consumer */
++ if (unlikely(work_to_do < 0)) {
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
++ }
++
++ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
++ if (unlikely(ret < 0)) {
++ netbk_tx_err(netif, &txreq, idx - ret); /* ret < 0: presumably -ret requests to skip -- confirm in netbk_count_requests() */
++ continue;
++ }
++ idx += ret;
++
++ if (unlikely(txreq.size < ETH_HLEN)) {
++ DPRINTK("Bad packet size: %d\n", txreq.size);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
++
++ /* No crossing a page as the payload mustn't fragment. */
++ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
++ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
++ txreq.offset, txreq.size,
++ (txreq.offset &~PAGE_MASK) + txreq.size);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
++
++ pending_idx = pending_ring[pending_index(pending_cons)]; /* peek only; pending_cons advances once the skb is queued */
++
++ data_len = (txreq.size > PKT_PROT_LEN &&
++ ret < MAX_SKB_FRAGS) ?
++ PKT_PROT_LEN : txreq.size; /* linear part; the rest becomes frag 0 if a frag slot is free */
++
++ skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
++ GFP_ATOMIC | __GFP_NOWARN);
++ if (unlikely(skb == NULL)) {
++ DPRINTK("Can't allocate a skb in start_xmit.\n");
++ netbk_tx_err(netif, &txreq, idx);
++ break;
++ }
++
++ /* Packets passed to netif_rx() must have some headroom. */
++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
++
++ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
++ struct xen_netif_extra_info *gso;
++ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
++
++ if (netbk_set_skb_gso(skb, gso)) {
++ kfree_skb(skb);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
++ }
++
++ gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
++ GNTMAP_host_map | GNTMAP_readonly,
++ txreq.gref, netif->domid);
++ mop++;
++
++ memcpy(&pending_tx_info[pending_idx].req,
++ &txreq, sizeof(txreq));
++ pending_tx_info[pending_idx].netif = netif;
++ *((u16 *)skb->data) = pending_idx; /* park the slot index in the still-empty linear area */
++
++ __skb_put(skb, data_len);
++
++ skb_shinfo(skb)->nr_frags = ret;
++ if (data_len < txreq.size) {
++ skb_shinfo(skb)->nr_frags++;
++ skb_shinfo(skb)->frags[0].page =
++ (void *)(unsigned long)pending_idx; /* frag 0 shares the header's page */
++ } else {
++ /* Discriminate from any valid pending_idx value. */
++ skb_shinfo(skb)->frags[0].page = (void *)~0UL;
++ }
++
++ __skb_queue_tail(&tx_queue, skb);
++
++ pending_cons++;
++
++ mop = netbk_get_requests(netif, skb, txfrags, mop);
++
++ netif->tx.req_cons = idx;
++ netif_schedule_work(netif);
++
++ if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops)) /* map-op array is full */
++ break;
++ }
++
++ return mop - tx_map_ops;
++}
++
++static void net_tx_submit(void)
++{ /* After the map hypercall: finish each queued skb and hand it to netif_rx(), or drop it. */
++ struct gnttab_map_grant_ref *mop;
++ struct sk_buff *skb;
++
++ mop = tx_map_ops;
++ while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
++ struct xen_netif_tx_request *txp;
++ struct xen_netif *netif;
++ u16 pending_idx;
++ unsigned data_len;
++
++ pending_idx = *((u16 *)skb->data); /* slot index parked by net_tx_build_mops() */
++ netif = pending_tx_info[pending_idx].netif;
++ txp = &pending_tx_info[pending_idx].req;
++
++ /* Check the remap error code. */
++ if (unlikely(netbk_tx_check_mop(skb, &mop))) {
++ DPRINTK("netback grant failed.\n");
++ skb_shinfo(skb)->nr_frags = 0; /* frags[].page still hold slot indices, not pages */
++ kfree_skb(skb);
++ continue;
++ }
++
++ data_len = skb->len;
++ memcpy(skb->data,
++ (void *)(idx_to_kaddr(pending_idx)|txp->offset),
++ data_len); /* real header bytes overwrite the parked slot index */
++ if (data_len < txp->size) {
++ /* Append the packet payload as a fragment. */
++ txp->offset += data_len;
++ txp->size -= data_len;
++ } else {
++ /* Schedule a response immediately. */
++ netif_idx_release(pending_idx);
++ }
++
++ /*
++ * Old frontends do not assert data_validated but we
++ * can infer it from csum_blank so test both flags.
++ */
++ if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank))
++ skb->ip_summed = CHECKSUM_PARTIAL;
++ else
++ skb->ip_summed = CHECKSUM_NONE;
++
++ netbk_fill_frags(skb);
++
++ /*
++ * If the initial fragment was < PKT_PROT_LEN then
++ * pull through some bytes from the other fragments to
++ * increase the linear region to PKT_PROT_LEN bytes.
++ */
++ if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
++ int target = min_t(int, skb->len, PKT_PROT_LEN);
++ __pskb_pull_tail(skb, target - skb_headlen(skb));
++ }
++
++ skb->dev = netif->dev;
++ skb->protocol = eth_type_trans(skb, skb->dev);
++
++ netif->stats.rx_bytes += skb->len; /* counted before the drop checks below */
++ netif->stats.rx_packets++;
++
++ if (skb->ip_summed == CHECKSUM_PARTIAL) {
++ if (skb_checksum_setup(skb)) {
++ DPRINTK("Can't setup checksum in net_tx_action\n");
++ kfree_skb(skb);
++ continue;
++ }
++ }
++
++ if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
++ unlikely(skb_linearize(skb))) {
++ DPRINTK("Can't linearize skb in net_tx_action.\n");
++ kfree_skb(skb);
++ continue;
++ }
++
++ netif_rx(skb);
++ netif->dev->last_rx = jiffies;
++ }
++
++ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
++ !list_empty(&pending_inuse_head)) {
++ struct netbk_tx_pending_inuse *oldest;
++
++ oldest = list_entry(pending_inuse_head.next,
++ struct netbk_tx_pending_inuse, list);
++ mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ); /* fire one second after the oldest in-use slot was filled */
++ }
++}
++
++/* Called after netfront has transmitted: reap released slots, map the new grants, submit the skbs. */
++static void net_tx_action(unsigned long unused)
++{
++ unsigned nr_mops;
++ int ret;
++
++ if (dealloc_cons != dealloc_prod) /* slots were released since the last pass */
++ net_tx_action_dealloc();
++
++ nr_mops = net_tx_build_mops();
++
++ if (nr_mops == 0)
++ return;
++
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ tx_map_ops, nr_mops);
++ BUG_ON(ret); /* per-op failures are reported in each op's status and handled in net_tx_submit() */
++
++ net_tx_submit();
++}
++
++static void netif_idx_release(u16 pending_idx)
++{ /* Queue pending_idx on the dealloc ring and kick the tx tasklet. */
++ static DEFINE_SPINLOCK(_lock); /* serialises concurrent producers of dealloc_ring */
++ unsigned long flags;
++
++ spin_lock_irqsave(&_lock, flags);
++ dealloc_ring[pending_index(dealloc_prod)] = pending_idx;
++ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
++ smp_wmb();
++ dealloc_prod++;
++ spin_unlock_irqrestore(&_lock, flags);
++
++ tasklet_schedule(&net_tx_tasklet);
++}
++
++static void netif_page_release(struct page *page, unsigned int order)
++{ /* Release callback installed on each mmap page by netback_init() via SetPageForeign(). */
++ int idx = netif_page_index(page);
++ BUG_ON(order); /* only single (order-0) pages are expected */
++ BUG_ON(idx < 0); /* the page must carry a valid netback index */
++ netif_idx_release(idx);
++}
++
++irqreturn_t netif_be_int(int irq, void *dev_id)
++{ /* Interrupt handler for a netif: schedule tx processing and restart the queue if possible. */
++ struct xen_netif *netif = dev_id;
++
++ add_to_net_schedule_list_tail(netif);
++ maybe_schedule_tx_action();
++
++ if (netif_schedulable(netif) && !netbk_queue_full(netif))
++ netif_wake_queue(netif->dev);
++
++ return IRQ_HANDLED;
++}
++
++static void make_tx_response(struct xen_netif *netif,
++ struct xen_netif_tx_request *txp,
++ s8 st)
++{ /* Post a tx response with status st for request txp and notify the frontend if required. */
++ RING_IDX i = netif->tx.rsp_prod_pvt;
++ struct xen_netif_tx_response *resp;
++ int notify;
++
++ resp = RING_GET_RESPONSE(&netif->tx, i);
++ resp->id = txp->id;
++ resp->status = st;
++
++ if (txp->flags & NETTXF_extra_info) /* the extra-info slot gets a NULL response of its own */
++ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
++
++ netif->tx.rsp_prod_pvt = ++i;
++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
++
++ /*
++ * netfront_smartpoll_active indicates whether netfront timer
++ * is active.
++ */
++ if ((netif->smart_poll == 1)) {
++ if (!(netif->rx.sring->netfront_smartpoll_active)) {
++ notify_remote_via_irq(netif->irq);
++ netif->rx.sring->netfront_smartpoll_active = 1;
++ }
++ } else if (notify) /* without smart poll, follow the ring's own notify decision */
++ notify_remote_via_irq(netif->irq);
++}
++
++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
++ u16 id,
++ s8 st,
++ u16 offset,
++ u16 size,
++ u16 flags)
++{ /* Fill the next private rx response slot and return it; the caller pushes it to the ring. */
++ RING_IDX i = netif->rx.rsp_prod_pvt;
++ struct xen_netif_rx_response *resp;
++
++ resp = RING_GET_RESPONSE(&netif->rx, i);
++ resp->offset = offset;
++ resp->flags = flags;
++ resp->id = id;
++ resp->status = (s16)size; /* status carries the size unless st reports an error */
++ if (st < 0)
++ resp->status = (s16)st;
++
++ netif->rx.rsp_prod_pvt = ++i;
++
++ return resp;
++}
++
++#ifdef NETBE_DEBUG_INTERRUPT
++static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
++{ /* Debug handler: dump ring indices of every scheduled netif. NOTE(review): three-argument handler signature -- confirm it still matches irq_handler_t. */
++ struct list_head *ent;
++ struct xen_netif *netif;
++ int i = 0;
++
++ printk(KERN_ALERT "netif_schedule_list:\n");
++ spin_lock_irq(&net_schedule_list_lock);
++
++ list_for_each (ent, &net_schedule_list) {
++ netif = list_entry(ent, struct xen_netif, list);
++ printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
++ "rx_resp_prod=%08x\n",
++ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
++ printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
++ netif->tx.req_cons, netif->tx.rsp_prod_pvt);
++ printk(KERN_ALERT " shared(rx_req_prod=%08x "
++ "rx_resp_prod=%08x\n",
++ netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
++ printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
++ netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
++ printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
++ netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
++ i++;
++ }
++
++ spin_unlock_irq(&net_schedule_list_lock);
++ printk(KERN_ALERT " ** End of netif_schedule_list **\n");
++
++ return IRQ_HANDLED;
++}
++#endif
++
++static int __init netback_init(void)
++{ /* Module init: set up queues, timers, the foreign page pool and the pending ring, then register with xenbus. */
++ int i;
++ struct page *page;
++ int rc = 0;
++
++ if (!xen_domain())
++ return -ENODEV;
++
++ /* We can increase reservation by this much in net_rx_action(). */
++// balloon_update_driver_allowance(NET_RX_RING_SIZE);
++
++ skb_queue_head_init(&rx_queue);
++ skb_queue_head_init(&tx_queue);
++
++ init_timer(&net_timer);
++ net_timer.data = 0;
++ net_timer.function = net_alarm;
++
++ init_timer(&netbk_tx_pending_timer);
++ netbk_tx_pending_timer.data = 0;
++ netbk_tx_pending_timer.function = netbk_tx_pending_timeout;
++
++ mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
++ if (mmap_pages == NULL) {
++ printk(KERN_ERR "%s: out of memory\n", __func__); /* explicit level; __func__ is the standard spelling */
++ return -ENOMEM;
++ }
++
++ for (i = 0; i < MAX_PENDING_REQS; i++) {
++ page = mmap_pages[i];
++ SetPageForeign(page, netif_page_release);
++ netif_set_page_index(page, i);
++ INIT_LIST_HEAD(&pending_inuse[i].list);
++ }
++
++ pending_cons = 0;
++ pending_prod = MAX_PENDING_REQS; /* every slot starts out free */
++ for (i = 0; i < MAX_PENDING_REQS; i++)
++ pending_ring[i] = i;
++
++ netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
++ if (MODPARM_copy_skb) {
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
++ NULL, 0)) /* zero-length probe: non-zero means the op is unavailable */
++ netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB;
++ else
++ netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB;
++ }
++
++ //netif_accel_init();
++
++ rc = netif_xenbus_init();
++ if (rc)
++ goto failed_init;
++
++#ifdef NETBE_DEBUG_INTERRUPT
++ (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
++ 0,
++ netif_be_dbg,
++ SA_SHIRQ,
++ "net-be-dbg",
++ &netif_be_dbg);
++#endif
++
++ return 0;
++
++failed_init:
++ free_empty_pages_and_pagevec(mmap_pages, MAX_PENDING_REQS);
++ del_timer(&netbk_tx_pending_timer);
++ del_timer(&net_timer);
++ return rc;
++
++}
++
++module_init(netback_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/netback/xenbus.c b/drivers/xen/netback/xenbus.c
+new file mode 100644
+index 0000000..70636d0
+--- /dev/null
++++ b/drivers/xen/netback/xenbus.c
+@@ -0,0 +1,523 @@
++/* Xenbus code for netif backend
++ Copyright (C) 2005 Rusty Russell <rusty at rustcorp.com.au>
++ Copyright (C) 2005 XenSource Ltd
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2 of the License, or
++ (at your option) any later version.
++
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++*/
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <xen/xenbus.h>
++#include "common.h"
++
++#if 0
++#undef DPRINTK
++#define DPRINTK(fmt, args...) \
++ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
++#endif
++
++
++static int connect_rings(struct backend_info *);
++static void connect(struct backend_info *);
++static void backend_create_netif(struct backend_info *be);
++static void unregister_hotplug_status_watch(struct backend_info *be);
++
++static int netback_remove(struct xenbus_device *dev)
++{ /* Tear down the backend for dev; also used by netback_probe() as its failure path. */
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++ //netback_remove_accelerators(be, dev);
++
++ unregister_hotplug_status_watch(be);
++ if (be->netif) {
++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
++ netif_disconnect(be->netif);
++ be->netif = NULL;
++ }
++ kfree(be);
++ dev_set_drvdata(&dev->dev, NULL); /* drvdata is NULL from here on */
++ return 0;
++}
++
++
++/**
++ * Entry point to this code when a new device is created. Allocate the basic
++ * structures, advertise the backend features in xenstore and switch to InitWait.
++ */
++static int netback_probe(struct xenbus_device *dev,
++ const struct xenbus_device_id *id)
++{
++ const char *message;
++ struct xenbus_transaction xbt;
++ int err;
++ int sg;
++ struct backend_info *be = kzalloc(sizeof(struct backend_info),
++ GFP_KERNEL);
++ if (!be) {
++ xenbus_dev_fatal(dev, -ENOMEM,
++ "allocating backend structure");
++ return -ENOMEM;
++ }
++
++ be->dev = dev;
++ dev_set_drvdata(&dev->dev, be);
++
++ sg = 1;
++ if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) /* net_tx_submit() linearises every skb in this mode */
++ sg = 0;
++
++ do {
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "starting transaction");
++ goto fail;
++ }
++
++ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
++ if (err) {
++ message = "writing feature-sg";
++ goto abort_transaction;
++ }
++
++ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
++ "%d", sg); /* GSO is advertised with the same value as SG */
++ if (err) {
++ message = "writing feature-gso-tcpv4";
++ goto abort_transaction;
++ }
++
++ /* We support rx-copy path. */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-rx-copy", "%d", 1);
++ if (err) {
++ message = "writing feature-rx-copy";
++ goto abort_transaction;
++ }
++
++ /*
++ * We don't support rx-flip path (except old guests who don't
++ * grok this feature flag).
++ */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-rx-flip", "%d", 0);
++ if (err) {
++ message = "writing feature-rx-flip";
++ goto abort_transaction;
++ }
++
++ /* We support data smart poll mechanism */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-smart-poll", "%d", 1);
++ if (err) {
++ message = "writing feature-smart-poll";
++ goto abort_transaction;
++ }
++
++ err = xenbus_transaction_end(xbt, 0);
++ } while (err == -EAGAIN); /* -EAGAIN: redo the whole transaction */
++
++ if (err) {
++ xenbus_dev_fatal(dev, err, "completing transaction");
++ goto fail;
++ }
++
++ //netback_probe_accelerators(be, dev);
++
++ err = xenbus_switch_state(dev, XenbusStateInitWait);
++ if (err)
++ goto fail;
++
++ /* This kicks hotplug scripts, so do it immediately. */
++ backend_create_netif(be);
++
++ return 0;
++
++abort_transaction:
++ xenbus_transaction_end(xbt, 1);
++ xenbus_dev_fatal(dev, err, "%s", message);
++fail:
++ DPRINTK("failed");
++ netback_remove(dev); /* frees be and clears drvdata */
++ return err;
++}
++
++
++/**
++ * Handle the creation of the hotplug script environment. We add the script
++ * and vif variables to the environment, for the benefit of the vif-* hotplug
++ * scripts. The vif variable is left out while no interface is attached.
++ */
++static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env)
++{
++ struct backend_info *be = dev_get_drvdata(&xdev->dev);
++ char *val;
++ int err;
++
++ DPRINTK("netback_uevent");
++
++ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
++ if (IS_ERR(val)) {
++ err = PTR_ERR(val);
++ xenbus_dev_fatal(xdev, err, "reading script");
++ return err;
++ }
++
++ err = add_uevent_var(env, "script=%s", val);
++ kfree(val); /* val is not needed once the variable has been added */
++ if (err)
++ return -ENOMEM;
++
++ /* drvdata and be->netif can be NULL (netback_remove(), disconnect_backend()). */
++ if (be && be->netif &&
++ add_uevent_var(env, "vif=%s", be->netif->dev->name))
++ return -ENOMEM;
++
++ return 0;
++}
++
++
++static void backend_create_netif(struct backend_info *be)
++{ /* Create be->netif from the xenstore "handle" node if it does not exist yet, then emit KOBJ_ONLINE. */
++ int err;
++ long handle;
++ struct xenbus_device *dev = be->dev;
++
++ if (be->netif != NULL) /* already created */
++ return;
++
++ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
++ if (err != 1) { /* exactly one value must have been parsed */
++ xenbus_dev_fatal(dev, err, "reading handle");
++ return;
++ }
++
++ be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
++ if (IS_ERR(be->netif)) {
++ err = PTR_ERR(be->netif);
++ be->netif = NULL; /* never leave an ERR_PTR where callers test for NULL */
++ xenbus_dev_fatal(dev, err, "creating interface");
++ return;
++ }
++
++ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
++}
++
++
++static void disconnect_backend(struct xenbus_device *dev)
++{ /* Drop the interface, if any, and remove its hotplug-status node; be itself stays allocated. */
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++ if (be->netif) {
++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
++ netif_disconnect(be->netif);
++ be->netif = NULL;
++ }
++}
++
++/**
++ * Callback received when the frontend's state changes; drives the backend's own state from it.
++ */
++static void frontend_changed(struct xenbus_device *dev,
++ enum xenbus_state frontend_state)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++ DPRINTK("%s", xenbus_strstate(frontend_state));
++
++ be->frontend_state = frontend_state;
++
++ switch (frontend_state) {
++ case XenbusStateInitialising:
++ if (dev->state == XenbusStateClosed) {
++ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++ __FUNCTION__, dev->nodename);
++ xenbus_switch_state(dev, XenbusStateInitWait);
++ }
++ break;
++
++ case XenbusStateInitialised:
++ break;
++
++ case XenbusStateConnected:
++ if (dev->state == XenbusStateConnected) /* already connected: nothing to do */
++ break;
++ backend_create_netif(be);
++ if (be->netif)
++ connect(be);
++ break;
++
++ case XenbusStateClosing:
++ if (be->netif)
++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
++ disconnect_backend(dev);
++ xenbus_switch_state(dev, XenbusStateClosing);
++ break;
++
++ case XenbusStateClosed:
++ xenbus_switch_state(dev, XenbusStateClosed);
++ if (xenbus_dev_is_online(dev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ device_unregister(&dev->dev);
++ break;
++
++ default:
++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++ frontend_state);
++ break;
++ }
++}
++
++
++static void xen_net_read_rate(struct xenbus_device *dev,
++ unsigned long *bytes, unsigned long *usec)
++{ /* Parse the optional "rate" node, formatted "<bytes>,<usec>", into *bytes and *usec. */
++ char *s, *e;
++ unsigned long b, u;
++ char *ratestr;
++
++ /* Default to unlimited bandwidth. */
++ *bytes = ~0UL;
++ *usec = 0;
++
++ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
++ if (IS_ERR(ratestr)) /* unreadable node: keep the defaults */
++ return;
++
++ s = ratestr;
++ b = simple_strtoul(s, &e, 10);
++ if ((s == e) || (*e != ',')) /* need digits followed by a comma */
++ goto fail;
++
++ s = e + 1;
++ u = simple_strtoul(s, &e, 10);
++ if ((s == e) || (*e != '\0')) /* need digits up to the end of the string */
++ goto fail;
++
++ *bytes = b;
++ *usec = u;
++
++ kfree(ratestr);
++ return;
++
++ fail:
++ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
++ kfree(ratestr);
++}
++
++static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
++{ /* Parse the "mac" node (colon-separated hex octets) into mac[ETH_ALEN]; returns 0 or -errno. */
++ char *s, *e, *macstr;
++ int i;
++
++ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
++ if (IS_ERR(macstr))
++ return PTR_ERR(macstr);
++
++ for (i = 0; i < ETH_ALEN; i++) {
++ mac[i] = simple_strtoul(s, &e, 16);
++ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) { /* ':' between octets, NUL after the last */
++ kfree(macstr);
++ return -ENOENT;
++ }
++ s = e+1;
++ }
++
++ kfree(macstr);
++ return 0;
++}
++
++static void unregister_hotplug_status_watch(struct backend_info *be)
++{ /* Remove the hotplug-status watch if one is registered; safe to call repeatedly. */
++ if (be->have_hotplug_status_watch) {
++ unregister_xenbus_watch(&be->hotplug_status_watch);
++ kfree(be->hotplug_status_watch.node);
++ }
++ be->have_hotplug_status_watch = 0;
++}
++
++static void hotplug_status_changed(struct xenbus_watch *watch,
++ const char **vec,
++ unsigned int vec_size)
++{ /* Watch callback: switch to Connected once hotplug-status reads exactly "connected". */
++ struct backend_info *be = container_of(watch,
++ struct backend_info,
++ hotplug_status_watch);
++ char *str;
++ unsigned int len;
++
++ str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
++ if (IS_ERR(str))
++ return;
++ if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) { /* exact match, length included */
++ xenbus_switch_state(be->dev, XenbusStateConnected);
++ /* Not interested in this watch anymore. */
++ unregister_hotplug_status_watch(be);
++ }
++ kfree(str);
++}
++
++static void connect(struct backend_info *be)
++{ /* Map the rings, read MAC and rate limit, then wait for hotplug-status before going Connected. */
++ int err;
++ struct xenbus_device *dev = be->dev;
++
++ err = connect_rings(be);
++ if (err)
++ return;
++
++ err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
++ return;
++ }
++
++ xen_net_read_rate(dev, &be->netif->credit_bytes,
++ &be->netif->credit_usec);
++ be->netif->remaining_credit = be->netif->credit_bytes; /* start with a full credit allowance */
++
++ unregister_hotplug_status_watch(be); /* drop any earlier watch before registering a new one */
++ err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
++ hotplug_status_changed,
++ "%s/%s", dev->nodename, "hotplug-status");
++ if (err) {
++ /* Switch now, since we can't do a watch. */
++ xenbus_switch_state(dev, XenbusStateConnected);
++ } else {
++ be->have_hotplug_status_watch = 1;
++ }
++
++ netif_wake_queue(be->netif->dev);
++}
++
++
++static int connect_rings(struct backend_info *be)
++{ /* Read the frontend's ring references and feature flags, then map the rings; returns 0 or -errno. */
++ struct xenbus_device *dev = be->dev;
++ unsigned long tx_ring_ref, rx_ring_ref;
++ unsigned int evtchn, rx_copy;
++ int err;
++ int val;
++
++ DPRINTK("");
++
++ err = xenbus_gather(XBT_NIL, dev->otherend,
++ "tx-ring-ref", "%lu", &tx_ring_ref,
++ "rx-ring-ref", "%lu", &rx_ring_ref,
++ "event-channel", "%u", &evtchn, NULL);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "reading %s/ring-ref and event-channel",
++ dev->otherend);
++ return err;
++ }
++
++ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
++ &rx_copy);
++ if (err == -ENOENT) { /* a missing node means the frontend did not ask for rx-copy */
++ err = 0;
++ rx_copy = 0;
++ }
++ if (err < 0) {
++ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
++ dev->otherend);
++ return err;
++ }
++ if (!rx_copy) /* rx-copy is the only receive path supported (see netback_probe()) */
++ return -EOPNOTSUPP;
++
++ if (be->netif->dev->tx_queue_len != 0) {
++ if (xenbus_scanf(XBT_NIL, dev->otherend,
++ "feature-rx-notify", "%d", &val) < 0)
++ val = 0;
++ if (val)
++ be->netif->can_queue = 1;
++ else
++ /* Must be non-zero for pfifo_fast to work. */
++ be->netif->dev->tx_queue_len = 1;
++ }
++
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
++ val = 0; /* unreadable feature nodes count as "not supported" */
++ if (!val) {
++ be->netif->features &= ~NETIF_F_SG;
++ be->netif->dev->features &= ~NETIF_F_SG;
++ if (be->netif->dev->mtu > ETH_DATA_LEN) /* without SG, cap the MTU at the standard Ethernet payload */
++ be->netif->dev->mtu = ETH_DATA_LEN;
++ }
++
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
++ &val) < 0)
++ val = 0;
++ if (val) {
++ be->netif->features |= NETIF_F_TSO;
++ be->netif->dev->features |= NETIF_F_TSO;
++ }
++
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
++ "%d", &val) < 0)
++ val = 0;
++ if (val) {
++ be->netif->features &= ~NETIF_F_IP_CSUM;
++ be->netif->dev->features &= ~NETIF_F_IP_CSUM;
++ }
++
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
++ "%d", &val) < 0)
++ val = 0;
++ if (val)
++ be->netif->smart_poll = 1;
++ else
++ be->netif->smart_poll = 0;
++
++ /* Map the shared frame, irq etc. */
++ err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "mapping shared-frames %lu/%lu port %u",
++ tx_ring_ref, rx_ring_ref, evtchn);
++ return err;
++ }
++ return 0;
++}
++
++
++/* ** Driver Registration ** */
++
++
++static const struct xenbus_device_id netback_ids[] = {
++ { "vif" },
++ { "" } /* empty string terminates the table */
++};
++
++
++static struct xenbus_driver netback = { /* callbacks are the functions defined above in this file */
++ .name = "vif",
++ .owner = THIS_MODULE,
++ .ids = netback_ids,
++ .probe = netback_probe,
++ .remove = netback_remove,
++ .uevent = netback_uevent,
++ .otherend_changed = frontend_changed,
++};
++
++
++int netif_xenbus_init(void)
++{ /* Register the "vif" backend driver with xenbus; returns the registration result. */
++ printk(KERN_INFO "registering netback\n"); /* routine start-up notice, not a critical condition */
++ return xenbus_register_backend(&netback);
++}
+diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
+new file mode 100644
+index 0000000..ae693e7
+--- /dev/null
++++ b/drivers/xen/pci.c
+@@ -0,0 +1,124 @@
++/*
++ * Copyright (c) 2009, Intel Corporation.
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms and conditions of the GNU General Public License,
++ * version 2, as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope it will be useful, but WITHOUT
++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ * more details.
++ *
++ * You should have received a copy of the GNU General Public License along with
++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++ * Place - Suite 330, Boston, MA 02111-1307 USA.
++ *
++ * Author: Weidong Han <weidong.han at intel.com>
++ */
++
++#include <linux/pci.h>
++
++#include <xen/interface/xen.h>
++#include <xen/interface/physdev.h>
++
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++
++#include "../pci/pci.h"
++
++
++#ifdef CONFIG_PCI_IOV
++#define HANDLE_PCI_IOV 1
++#else
++#define HANDLE_PCI_IOV 0
++#endif
++
++static int xen_add_device(struct device *dev)
++{ /* Report a newly added PCI device to the hypervisor; returns the hypercall result. */
++ int r;
++ struct pci_dev *pci_dev = to_pci_dev(dev);
++
++ if (HANDLE_PCI_IOV && pci_dev->is_virtfn) { /* virtual function: also report its physical function */
++ struct physdev_manage_pci_ext manage_pci_ext = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ .is_virtfn = 1,
++#ifdef CONFIG_PCI_IOV
++ .physfn.bus = pci_dev->physfn->bus->number,
++ .physfn.devfn = pci_dev->physfn->devfn,
++#endif
++ };
++
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
++ &manage_pci_ext);
++ } else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { /* ARI bus with non-zero slot bits: flag as extended function */
++ struct physdev_manage_pci_ext manage_pci_ext = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ .is_extfn = 1,
++ };
++
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
++ &manage_pci_ext);
++ } else { /* ordinary device: plain add request */
++ struct physdev_manage_pci manage_pci = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ };
++
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
++ &manage_pci);
++ }
++
++ return r;
++}
++
++/*
++ * Issue PHYSDEVOP_manage_pci_remove for the PCI device behind @dev and
++ * pass the hypercall's return value straight back to the caller.
++ */
++static int xen_remove_device(struct device *dev)
++{
++	struct pci_dev *pdev = to_pci_dev(dev);
++	struct physdev_manage_pci op = {
++		.bus = pdev->bus->number,
++		.devfn = pdev->devfn,
++	};
++
++	return HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, &op);
++}
++
++/* Bus notifier: forward PCI add/remove to Xen; only returns NOTIFY_* codes. */
++static int xen_pci_notifier(struct notifier_block *nb,
++			    unsigned long action, void *data)
++{
++	struct device *dev = data;
++	int r;
++
++	if (action == BUS_NOTIFY_ADD_DEVICE)
++		r = xen_add_device(dev);
++	else if (action == BUS_NOTIFY_DEL_DEVICE)
++		r = xen_remove_device(dev);
++	else
++		return NOTIFY_DONE;
++
++	/* A raw -errno has NOTIFY_STOP_MASK set and would end the chain early. */
++	if (r)
++		dev_err(dev, "Xen PCI device update failed: %d\n", r);
++	return NOTIFY_OK;
++}
++
++static struct notifier_block device_nb = {	/* file-local; registered by register_xen_pci_notifier() */
++	.notifier_call = xen_pci_notifier,
++};
++
++/* Attach device_nb to the PCI bus; does nothing unless xen_pv_domain(). */
++static int __init register_xen_pci_notifier(void)
++{
++	if (xen_pv_domain())
++		return bus_register_notifier(&pci_bus_type, &device_nb);
++	return 0;
++}
++
++arch_initcall(register_xen_pci_notifier);
+diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
+index 88a60e0..ae5cb05 100644
+--- a/drivers/xen/sys-hypervisor.c
++++ b/drivers/xen/sys-hypervisor.c
+@@ -14,6 +14,7 @@
+ #include <asm/xen/hypervisor.h>
+ #include <asm/xen/hypercall.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/version.h>
+diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
+index 5571f5b..8dca685 100644
+--- a/drivers/xen/xenbus/Makefile
++++ b/drivers/xen/xenbus/Makefile
+@@ -5,3 +5,8 @@ xenbus-objs += xenbus_client.o
+ xenbus-objs += xenbus_comms.o
+ xenbus-objs += xenbus_xs.o
+ xenbus-objs += xenbus_probe.o
++
++xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
++xenbus-objs += $(xenbus-be-objs-y)
++
++obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
+diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
+index 090c61e..700dc77 100644
+--- a/drivers/xen/xenbus/xenbus_comms.c
++++ b/drivers/xen/xenbus/xenbus_comms.c
+@@ -49,6 +49,7 @@ static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
+ static irqreturn_t wake_waiting(int irq, void *unused)
+ {
+ if (unlikely(xenstored_ready == 0)) {
++ printk(KERN_CRIT "xenbus_probe wake_waiting\n");
+ xenstored_ready = 1;
+ schedule_work(&probe_work);
+ }
+diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
+index 649fcdf..a90e0bf 100644
+--- a/drivers/xen/xenbus/xenbus_probe.c
++++ b/drivers/xen/xenbus/xenbus_probe.c
+@@ -49,6 +49,8 @@
+ #include <asm/page.h>
+ #include <asm/pgtable.h>
+ #include <asm/xen/hypervisor.h>
++
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+@@ -58,22 +60,15 @@
+
+
+ int xen_store_evtchn;
+-EXPORT_SYMBOL(xen_store_evtchn);
++EXPORT_SYMBOL_GPL(xen_store_evtchn);
+
+ struct xenstore_domain_interface *xen_store_interface;
++EXPORT_SYMBOL_GPL(xen_store_interface);
++
+ static unsigned long xen_store_mfn;
+
+ static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
+
+-static void wait_for_devices(struct xenbus_driver *xendrv);
+-
+-static int xenbus_probe_frontend(const char *type, const char *name);
+-
+-static void xenbus_dev_shutdown(struct device *_dev);
+-
+-static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
+-static int xenbus_dev_resume(struct device *dev);
+-
+ /* If something in array of ids matches this device, return it. */
+ static const struct xenbus_device_id *
+ match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
+@@ -94,34 +89,7 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv)
+
+ return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
+ }
+-
+-static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env)
+-{
+- struct xenbus_device *dev = to_xenbus_device(_dev);
+-
+- if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
+- return -ENOMEM;
+-
+- return 0;
+-}
+-
+-/* device/<type>/<id> => <type>-<id> */
+-static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+-{
+- nodename = strchr(nodename, '/');
+- if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
+- printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
+- return -EINVAL;
+- }
+-
+- strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
+- if (!strchr(bus_id, '/')) {
+- printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
+- return -EINVAL;
+- }
+- *strchr(bus_id, '/') = '-';
+- return 0;
+-}
++EXPORT_SYMBOL_GPL(xenbus_match);
+
+
+ static void free_otherend_details(struct xenbus_device *dev)
+@@ -141,7 +109,28 @@ static void free_otherend_watch(struct xenbus_device *dev)
+ }
+
+
+-int read_otherend_details(struct xenbus_device *xendev,
++static int talk_to_otherend(struct xenbus_device *dev)
++{
++ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
++
++ free_otherend_watch(dev);
++ free_otherend_details(dev);
++
++ return drv->read_otherend_details(dev);
++}
++
++
++
++static int watch_otherend(struct xenbus_device *dev)
++{
++ struct xen_bus_type *bus = container_of(dev->dev.bus, struct xen_bus_type, bus);
++
++ return xenbus_watch_pathfmt(dev, &dev->otherend_watch, bus->otherend_changed,
++ "%s/%s", dev->otherend, "state");
++}
++
++
++int xenbus_read_otherend_details(struct xenbus_device *xendev,
+ char *id_node, char *path_node)
+ {
+ int err = xenbus_gather(XBT_NIL, xendev->nodename,
+@@ -166,39 +155,11 @@ int read_otherend_details(struct xenbus_device *xendev,
+
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
+
+-
+-static int read_backend_details(struct xenbus_device *xendev)
+-{
+- return read_otherend_details(xendev, "backend-id", "backend");
+-}
+-
+-static struct device_attribute xenbus_dev_attrs[] = {
+- __ATTR_NULL
+-};
+-
+-/* Bus type for frontend drivers. */
+-static struct xen_bus_type xenbus_frontend = {
+- .root = "device",
+- .levels = 2, /* device/type/<id> */
+- .get_bus_id = frontend_bus_id,
+- .probe = xenbus_probe_frontend,
+- .bus = {
+- .name = "xen",
+- .match = xenbus_match,
+- .uevent = xenbus_uevent,
+- .probe = xenbus_dev_probe,
+- .remove = xenbus_dev_remove,
+- .shutdown = xenbus_dev_shutdown,
+- .dev_attrs = xenbus_dev_attrs,
+-
+- .suspend = xenbus_dev_suspend,
+- .resume = xenbus_dev_resume,
+- },
+-};
+-
+-static void otherend_changed(struct xenbus_watch *watch,
+- const char **vec, unsigned int len)
++void xenbus_otherend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len,
++ int ignore_on_shutdown)
+ {
+ struct xenbus_device *dev =
+ container_of(watch, struct xenbus_device, otherend_watch);
+@@ -226,11 +187,7 @@ static void otherend_changed(struct xenbus_watch *watch,
+ * work that can fail e.g., when the rootfs is gone.
+ */
+ if (system_state > SYSTEM_RUNNING) {
+- struct xen_bus_type *bus = bus;
+- bus = container_of(dev->dev.bus, struct xen_bus_type, bus);
+- /* If we're frontend, drive the state machine to Closed. */
+- /* This should cause the backend to release our resources. */
+- if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
++ if (ignore_on_shutdown && (state == XenbusStateClosing))
+ xenbus_frontend_closed(dev);
+ return;
+ }
+@@ -238,25 +195,7 @@ static void otherend_changed(struct xenbus_watch *watch,
+ if (drv->otherend_changed)
+ drv->otherend_changed(dev, state);
+ }
+-
+-
+-static int talk_to_otherend(struct xenbus_device *dev)
+-{
+- struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+-
+- free_otherend_watch(dev);
+- free_otherend_details(dev);
+-
+- return drv->read_otherend_details(dev);
+-}
+-
+-
+-static int watch_otherend(struct xenbus_device *dev)
+-{
+- return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
+- "%s/%s", dev->otherend, "state");
+-}
+-
++EXPORT_SYMBOL_GPL(xenbus_otherend_changed);
+
+ int xenbus_dev_probe(struct device *_dev)
+ {
+@@ -300,8 +239,9 @@ int xenbus_dev_probe(struct device *_dev)
+ fail:
+ xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
+ xenbus_switch_state(dev, XenbusStateClosed);
+- return -ENODEV;
++ return err;
+ }
++EXPORT_SYMBOL_GPL(xenbus_dev_probe);
+
+ int xenbus_dev_remove(struct device *_dev)
+ {
+@@ -319,8 +259,9 @@ int xenbus_dev_remove(struct device *_dev)
+ xenbus_switch_state(dev, XenbusStateClosed);
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(xenbus_dev_remove);
+
+-static void xenbus_dev_shutdown(struct device *_dev)
++void xenbus_dev_shutdown(struct device *_dev)
+ {
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+ unsigned long timeout = 5*HZ;
+@@ -341,6 +282,7 @@ static void xenbus_dev_shutdown(struct device *_dev)
+ out:
+ put_device(&dev->dev);
+ }
++EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
+
+ int xenbus_register_driver_common(struct xenbus_driver *drv,
+ struct xen_bus_type *bus,
+@@ -354,25 +296,7 @@ int xenbus_register_driver_common(struct xenbus_driver *drv,
+
+ return driver_register(&drv->driver);
+ }
+-
+-int __xenbus_register_frontend(struct xenbus_driver *drv,
+- struct module *owner, const char *mod_name)
+-{
+- int ret;
+-
+- drv->read_otherend_details = read_backend_details;
+-
+- ret = xenbus_register_driver_common(drv, &xenbus_frontend,
+- owner, mod_name);
+- if (ret)
+- return ret;
+-
+- /* If this driver is loaded as a module wait for devices to attach. */
+- wait_for_devices(drv);
+-
+- return 0;
+-}
+-EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
++EXPORT_SYMBOL_GPL(xenbus_register_driver_common);
+
+ void xenbus_unregister_driver(struct xenbus_driver *drv)
+ {
+@@ -543,24 +467,7 @@ fail:
+ kfree(xendev);
+ return err;
+ }
+-
+-/* device/<typename>/<name> */
+-static int xenbus_probe_frontend(const char *type, const char *name)
+-{
+- char *nodename;
+- int err;
+-
+- nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
+- xenbus_frontend.root, type, name);
+- if (!nodename)
+- return -ENOMEM;
+-
+- DPRINTK("%s", nodename);
+-
+- err = xenbus_probe_node(&xenbus_frontend, type, nodename);
+- kfree(nodename);
+- return err;
+-}
++EXPORT_SYMBOL_GPL(xenbus_probe_node);
+
+ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
+ {
+@@ -569,15 +476,23 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
+ unsigned int dir_n = 0;
+ int i;
+
++ printk(KERN_CRIT "%s type %s\n", __func__, type);
++
+ dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
+- if (IS_ERR(dir))
++ if (IS_ERR(dir)) {
++ printk(KERN_CRIT "%s failed xenbus_directory\n", __func__);
+ return PTR_ERR(dir);
++ }
+
+ for (i = 0; i < dir_n; i++) {
+- err = bus->probe(type, dir[i]);
+- if (err)
++ printk(KERN_CRIT "%s %d/%d %s\n", __func__, i+1,dir_n, dir[i]);
++ err = bus->probe(bus, type, dir[i]);
++ if (err) {
++ printk(KERN_CRIT "%s failed\n", __func__);
+ break;
++ }
+ }
++ printk("%s done\n", __func__);
+ kfree(dir);
+ return err;
+ }
+@@ -588,18 +503,27 @@ int xenbus_probe_devices(struct xen_bus_type *bus)
+ char **dir;
+ unsigned int i, dir_n;
+
++ printk(KERN_CRIT "%s %s\n", __func__, bus->root);
++
+ dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
+- if (IS_ERR(dir))
++ if (IS_ERR(dir)) {
++ printk(KERN_CRIT "%s failed xenbus_directory\n", __func__);
+ return PTR_ERR(dir);
++ }
+
+ for (i = 0; i < dir_n; i++) {
++ printk(KERN_CRIT "%s %d/%d %s\n", __func__, i+1,dir_n, dir[i]);
+ err = xenbus_probe_device_type(bus, dir[i]);
+- if (err)
++ if (err) {
++ printk(KERN_CRIT "%s failed\n", __func__);
+ break;
++ }
+ }
++ printk("%s done\n", __func__);
+ kfree(dir);
+ return err;
+ }
++EXPORT_SYMBOL_GPL(xenbus_probe_devices);
+
+ static unsigned int char_count(const char *str, char c)
+ {
+@@ -662,32 +586,17 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
+ }
+ EXPORT_SYMBOL_GPL(xenbus_dev_changed);
+
+-static void frontend_changed(struct xenbus_watch *watch,
+- const char **vec, unsigned int len)
+-{
+- DPRINTK("");
+-
+- xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+-}
+-
+-/* We watch for devices appearing and vanishing. */
+-static struct xenbus_watch fe_watch = {
+- .node = "device",
+- .callback = frontend_changed,
+-};
+-
+-static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
++int xenbus_dev_suspend(struct device *dev, pm_message_t state)
+ {
+ int err = 0;
+ struct xenbus_driver *drv;
+- struct xenbus_device *xdev;
++ struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev);
+
+- DPRINTK("");
++ DPRINTK("%s", xdev->nodename);
+
+ if (dev->driver == NULL)
+ return 0;
+ drv = to_xenbus_driver(dev->driver);
+- xdev = container_of(dev, struct xenbus_device, dev);
+ if (drv->suspend)
+ err = drv->suspend(xdev, state);
+ if (err)
+@@ -695,21 +604,19 @@ static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
+ "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(xenbus_dev_suspend);
+
+-static int xenbus_dev_resume(struct device *dev)
++int xenbus_dev_resume(struct device *dev)
+ {
+ int err;
+ struct xenbus_driver *drv;
+- struct xenbus_device *xdev;
++ struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev);
+
+- DPRINTK("");
++ DPRINTK("%s", xdev->nodename);
+
+ if (dev->driver == NULL)
+ return 0;
+-
+ drv = to_xenbus_driver(dev->driver);
+- xdev = container_of(dev, struct xenbus_device, dev);
+-
+ err = talk_to_otherend(xdev);
+ if (err) {
+ printk(KERN_WARNING
+@@ -740,6 +647,7 @@ static int xenbus_dev_resume(struct device *dev)
+
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(xenbus_dev_resume);
+
+ /* A flag to determine if xenstored is 'ready' (i.e. has started) */
+ int xenstored_ready = 0;
+@@ -768,10 +676,7 @@ void xenbus_probe(struct work_struct *unused)
+ {
+ BUG_ON((xenstored_ready <= 0));
+
+- /* Enumerate devices in xenstore and watch for changes. */
+- xenbus_probe_devices(&xenbus_frontend);
+- register_xenbus_watch(&fe_watch);
+- xenbus_backend_probe_and_watch();
++ printk(KERN_CRIT "xenbus_probe wake_waiting\n");
+
+ /* Notify others that xenstore is up */
+ blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
+@@ -780,27 +685,43 @@ void xenbus_probe(struct work_struct *unused)
+ static int __init xenbus_probe_init(void)
+ {
+ int err = 0;
++ unsigned long page = 0;
+
+ DPRINTK("");
+
+ err = -ENODEV;
+ if (!xen_domain())
+- goto out_error;
+-
+- /* Register ourselves with the kernel bus subsystem */
+- err = bus_register(&xenbus_frontend.bus);
+- if (err)
+- goto out_error;
+-
+- err = xenbus_backend_bus_register();
+- if (err)
+- goto out_unreg_front;
++ return err;
+
+ /*
+ * Domain0 doesn't have a store_evtchn or store_mfn yet.
+ */
+ if (xen_initial_domain()) {
+- /* dom0 not yet supported */
++ struct evtchn_alloc_unbound alloc_unbound;
++
++ /* Allocate Xenstore page */
++ page = get_zeroed_page(GFP_KERNEL);
++ if (!page)
++ goto out_error;
++
++ xen_store_mfn = xen_start_info->store_mfn =
++ pfn_to_mfn(virt_to_phys((void *)page) >>
++ PAGE_SHIFT);
++
++ /* Next allocate a local port which xenstored can bind to */
++ alloc_unbound.dom = DOMID_SELF;
++ alloc_unbound.remote_dom = 0;
++
++ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
++ &alloc_unbound);
++ if (err == -ENOSYS)
++ goto out_error;
++
++ BUG_ON(err);
++ xen_store_evtchn = xen_start_info->store_evtchn =
++ alloc_unbound.port;
++
++ xen_store_interface = mfn_to_virt(xen_store_mfn);
+ } else {
+ xenstored_ready = 1;
+ xen_store_evtchn = xen_start_info->store_evtchn;
+@@ -813,7 +734,7 @@ static int __init xenbus_probe_init(void)
+ if (err) {
+ printk(KERN_WARNING
+ "XENBUS: Error initializing xenstore comms: %i\n", err);
+- goto out_unreg_back;
++ goto out_error;
+ }
+
+ if (!xen_initial_domain())
+@@ -827,130 +748,17 @@ static int __init xenbus_probe_init(void)
+ proc_mkdir("xen", NULL);
+ #endif
+
++ printk(KERN_CRIT "%s ok\n", __func__);
+ return 0;
+
+- out_unreg_back:
+- xenbus_backend_bus_unregister();
+-
+- out_unreg_front:
+- bus_unregister(&xenbus_frontend.bus);
+-
+ out_error:
++ if (page != 0)
++ free_page(page);
++
++ printk(KERN_CRIT "err %d in %s\n", err, __func__);
+ return err;
+ }
+
+ postcore_initcall(xenbus_probe_init);
+
+ MODULE_LICENSE("GPL");
+-
+-static int is_device_connecting(struct device *dev, void *data)
+-{
+- struct xenbus_device *xendev = to_xenbus_device(dev);
+- struct device_driver *drv = data;
+- struct xenbus_driver *xendrv;
+-
+- /*
+- * A device with no driver will never connect. We care only about
+- * devices which should currently be in the process of connecting.
+- */
+- if (!dev->driver)
+- return 0;
+-
+- /* Is this search limited to a particular driver? */
+- if (drv && (dev->driver != drv))
+- return 0;
+-
+- xendrv = to_xenbus_driver(dev->driver);
+- return (xendev->state < XenbusStateConnected ||
+- (xendev->state == XenbusStateConnected &&
+- xendrv->is_ready && !xendrv->is_ready(xendev)));
+-}
+-
+-static int exists_connecting_device(struct device_driver *drv)
+-{
+- return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+- is_device_connecting);
+-}
+-
+-static int print_device_status(struct device *dev, void *data)
+-{
+- struct xenbus_device *xendev = to_xenbus_device(dev);
+- struct device_driver *drv = data;
+-
+- /* Is this operation limited to a particular driver? */
+- if (drv && (dev->driver != drv))
+- return 0;
+-
+- if (!dev->driver) {
+- /* Information only: is this too noisy? */
+- printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
+- xendev->nodename);
+- } else if (xendev->state < XenbusStateConnected) {
+- enum xenbus_state rstate = XenbusStateUnknown;
+- if (xendev->otherend)
+- rstate = xenbus_read_driver_state(xendev->otherend);
+- printk(KERN_WARNING "XENBUS: Timeout connecting "
+- "to device: %s (local state %d, remote state %d)\n",
+- xendev->nodename, xendev->state, rstate);
+- }
+-
+- return 0;
+-}
+-
+-/* We only wait for device setup after most initcalls have run. */
+-static int ready_to_wait_for_devices;
+-
+-/*
+- * On a 5-minute timeout, wait for all devices currently configured. We need
+- * to do this to guarantee that the filesystems and / or network devices
+- * needed for boot are available, before we can allow the boot to proceed.
+- *
+- * This needs to be on a late_initcall, to happen after the frontend device
+- * drivers have been initialised, but before the root fs is mounted.
+- *
+- * A possible improvement here would be to have the tools add a per-device
+- * flag to the store entry, indicating whether it is needed at boot time.
+- * This would allow people who knew what they were doing to accelerate their
+- * boot slightly, but of course needs tools or manual intervention to set up
+- * those flags correctly.
+- */
+-static void wait_for_devices(struct xenbus_driver *xendrv)
+-{
+- unsigned long start = jiffies;
+- struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+- unsigned int seconds_waited = 0;
+-
+- if (!ready_to_wait_for_devices || !xen_domain())
+- return;
+-
+- while (exists_connecting_device(drv)) {
+- if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
+- if (!seconds_waited)
+- printk(KERN_WARNING "XENBUS: Waiting for "
+- "devices to initialise: ");
+- seconds_waited += 5;
+- printk("%us...", 300 - seconds_waited);
+- if (seconds_waited == 300)
+- break;
+- }
+-
+- schedule_timeout_interruptible(HZ/10);
+- }
+-
+- if (seconds_waited)
+- printk("\n");
+-
+- bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+- print_device_status);
+-}
+-
+-#ifndef MODULE
+-static int __init boot_wait_for_devices(void)
+-{
+- ready_to_wait_for_devices = 1;
+- wait_for_devices(NULL);
+- return 0;
+-}
+-
+-late_initcall(boot_wait_for_devices);
+-#endif
+diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h
+index 6c5e318..0e5fc4c 100644
+--- a/drivers/xen/xenbus/xenbus_probe.h
++++ b/drivers/xen/xenbus/xenbus_probe.h
+@@ -36,26 +36,13 @@
+
+ #define XEN_BUS_ID_SIZE 20
+
+-#ifdef CONFIG_XEN_BACKEND
+-extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
+-extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
+-extern void xenbus_backend_probe_and_watch(void);
+-extern int xenbus_backend_bus_register(void);
+-extern void xenbus_backend_bus_unregister(void);
+-#else
+-static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
+-static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
+-static inline void xenbus_backend_probe_and_watch(void) {}
+-static inline int xenbus_backend_bus_register(void) { return 0; }
+-static inline void xenbus_backend_bus_unregister(void) {}
+-#endif
+-
+ struct xen_bus_type
+ {
+ char *root;
+ unsigned int levels;
+ int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
+- int (*probe)(const char *type, const char *dir);
++ int (*probe)(struct xen_bus_type *bus, const char *type, const char *dir);
++ void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, unsigned int len);
+ struct bus_type bus;
+ };
+
+@@ -73,4 +60,16 @@ extern int xenbus_probe_devices(struct xen_bus_type *bus);
+
+ extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
+
++extern void xenbus_dev_shutdown(struct device *_dev);
++
++extern int xenbus_dev_suspend(struct device *dev, pm_message_t state);
++extern int xenbus_dev_resume(struct device *dev);
++
++extern void xenbus_otherend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len,
++ int ignore_on_shutdown);
++
++extern int xenbus_read_otherend_details(struct xenbus_device *xendev,
++ char *id_node, char *path_node);
++
+ #endif
+diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
+new file mode 100644
+index 0000000..a3cc535
+--- /dev/null
++++ b/drivers/xen/xenbus/xenbus_probe_backend.c
+@@ -0,0 +1,298 @@
++/******************************************************************************
++ * Talks to Xen Store to figure out what devices we have (backend half).
++ *
++ * Copyright (C) 2005 Rusty Russell, IBM Corporation
++ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
++ * Copyright (C) 2005, 2006 XenSource Ltd
++ * Copyright (C) 2007 Solarflare Communications, Inc.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#define DPRINTK(fmt, args...) \
++ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
++ __func__, __LINE__, ##args)
++
++#include <linux/kernel.h>
++#include <linux/err.h>
++#include <linux/string.h>
++#include <linux/ctype.h>
++#include <linux/fcntl.h>
++#include <linux/mm.h>
++#include <linux/notifier.h>
++
++#include <asm/page.h>
++#include <asm/pgtable.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/hypervisor.h>
++#include <xen/xenbus.h>
++#include <xen/features.h>
++
++#include "xenbus_comms.h"
++#include "xenbus_probe.h"
++
++/* Build bus_id: backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
++static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
++{
++ int domid, err;
++ const char *devid, *type, *frontend;
++ unsigned int typelen;
++
++ type = strchr(nodename, '/'); /* skip the first path component */
++ if (!type)
++ return -EINVAL;
++ type++;
++ typelen = strcspn(type, "/"); /* <type> must be non-empty and followed by '/' */
++ if (!typelen || type[typelen] != '/')
++ return -EINVAL;
++
++ devid = strrchr(nodename, '/') + 1; /* <id> is whatever follows the last '/' */
++
++ err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
++ "frontend", NULL, &frontend,
++ NULL);
++ if (err)
++ return err;
++ if (strlen(frontend) == 0) /* empty "frontend" value */
++ err = -ERANGE;
++ if (!err && !xenbus_exists(XBT_NIL, frontend, "")) /* path named by "frontend" must exist */
++ err = -ENOENT;
++ kfree(frontend); /* frees the string xenbus_gather() stored in frontend */
++
++ if (err)
++ return err;
++
++ if (snprintf(bus_id, XEN_BUS_ID_SIZE,
++ "%.*s-%i-%s", typelen, type, domid, devid) >= XEN_BUS_ID_SIZE)
++ return -ENOSPC; /* formatted id does not fit in bus_id */
++ return 0;
++}
++
++/*
++ * uevent hook of the xen-backend bus: exports XENBUS_TYPE, XENBUS_PATH and
++ * XENBUS_BASE_PATH, then lets the bound driver's uevent() add its own.
++ */
++static int xenbus_uevent_backend(struct device *dev,
++				 struct kobj_uevent_env *env)
++{
++	struct xenbus_device *xdev;
++	struct xenbus_driver *drv;
++	struct xen_bus_type *bus;
++
++	DPRINTK("");
++
++	if (dev == NULL)
++		return -ENODEV;
++
++	/* Check xdev before it is dereferenced to find the bus. */
++	xdev = to_xenbus_device(dev);
++	if (xdev == NULL)
++		return -ENODEV;
++	bus = container_of(xdev->dev.bus, struct xen_bus_type, bus);
++
++	/* stuff we want to pass to /sbin/hotplug */
++	if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype) ||
++	    add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename) ||
++	    add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root))
++		return -ENOMEM;
++
++	if (dev->driver) {
++		drv = to_xenbus_driver(dev->driver);
++		if (drv && drv->uevent)
++			return drv->uevent(xdev, env);
++	}
++	return 0;
++}
++
++/* backend/<typename>/<frontend-uuid>/<name>: probe @dir/@name as one node. */
++static int xenbus_probe_backend_unit(struct xen_bus_type *bus,
++				     const char *dir, const char *type,
++				     const char *name)
++{
++	char *nodename;
++	int err;
++
++	nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
++	if (!nodename)
++		return -ENOMEM;
++
++	/* DPRINTK() appends ".\n" itself, so the format carries no newline. */
++	DPRINTK("%s", nodename);
++
++	err = xenbus_probe_node(bus, type, nodename);
++	kfree(nodename);
++	return err;
++}
++
++/* backend/<typename>/<frontend-domid>: probe each entry found below it. */
++static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, const char *domid)
++{
++	unsigned int idx, count = 0;
++	char **entries;
++	char *path;
++	int err = 0;
++
++	DPRINTK("");
++
++	path = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid);
++	if (!path)
++		return -ENOMEM;
++
++	entries = xenbus_directory(XBT_NIL, path, "", &count);
++	if (IS_ERR(entries)) {
++		err = PTR_ERR(entries);
++		goto free_path;
++	}
++
++	/* Stop at the first entry that fails to probe. */
++	for (idx = 0; idx < count && !err; idx++)
++		err = xenbus_probe_backend_unit(bus, path, type, entries[idx]);
++
++	kfree(entries);
++free_path:
++	kfree(path);
++	return err;
++}
++
++static void frontend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{ /* otherend_changed hook of the backend bus; passes ignore_on_shutdown = 0 */
++ xenbus_otherend_changed(watch, vec, len, 0);
++}
++
++static struct device_attribute xenbus_backend_dev_attrs[] = {
++ __ATTR_NULL /* sole entry: no attributes are defined here */
++};
++
++static struct xen_bus_type xenbus_backend = { /* bus that __xenbus_register_backend() registers drivers on */
++ .root = "backend",
++ .levels = 3, /* backend/type/<frontend>/<id> */
++ .get_bus_id = backend_bus_id,
++ .probe = xenbus_probe_backend,
++ .otherend_changed = frontend_changed, /* callback for the other end's "state" watch */
++ .bus = {
++ .name = "xen-backend",
++ .match = xenbus_match,
++ .uevent = xenbus_uevent_backend,
++ .probe = xenbus_dev_probe,
++ .remove = xenbus_dev_remove,
++ .shutdown = xenbus_dev_shutdown,
++ .dev_attrs = xenbus_backend_dev_attrs,
++ }, /* unlike xenbus_frontend, no .suspend/.resume hooks are set */
++};
++
++static void backend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ DPRINTK("");
++ /* be_watch callback: hand the watched path that fired to the common handler. */
++ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
++}
++
++static struct xenbus_watch be_watch = { /* registered in backend_probe_and_watch() */
++ .node = "backend", /* same string as xenbus_backend.root */
++ .callback = backend_changed,
++};
++
++static int read_frontend_details(struct xenbus_device *xendev)
++{ /* installed as drv->read_otherend_details by __xenbus_register_backend() */
++ return xenbus_read_otherend_details(xendev, "frontend-id", "frontend");
++}
++
++//void xenbus_backend_suspend(int (*fn)(struct device *, void *))
++//{
++// DPRINTK("");
++// bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
++//}
++
++//void xenbus_backend_resume(int (*fn)(struct device *, void *))
++//{
++// DPRINTK("");
++// bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
++//}
++
++//int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *))
++//{
++// return bus_for_each_dev(&xenbus_backend.bus, NULL, arg, fn);
++//}
++//EXPORT_SYMBOL_GPL(xenbus_for_each_backend);
++
++/* Value of the device's "online" node; 0 unless exactly one value is read. */
++int xenbus_dev_is_online(struct xenbus_device *dev)
++{
++	int online;
++
++	if (xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &online) != 1)
++		return 0;	/* no online node present */
++
++	return online;
++}
++EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
++
++int __xenbus_register_backend(struct xenbus_driver *drv,
++ struct module *owner, const char *mod_name)
++{ /* register drv on the xen-backend bus */
++ drv->read_otherend_details = read_frontend_details; /* a backend's other end is the frontend */
++
++ return xenbus_register_driver_common(drv, &xenbus_backend,
++ owner, mod_name);
++}
++EXPORT_SYMBOL_GPL(__xenbus_register_backend);
++
++/* xenstore notifier: enumerate the backend devices, then add be_watch. */
++static int backend_probe_and_watch(struct notifier_block *notifier,
++				   unsigned long event, void *data)
++{
++	int probe_err, watch_err;
++
++	probe_err = xenbus_probe_devices(&xenbus_backend);
++	watch_err = register_xenbus_watch(&be_watch);
++	/* Trace the real results at debug level, not "ok" at KERN_CRIT. */
++	DPRINTK("probe %d, watch %d", probe_err, watch_err);
++	return NOTIFY_DONE;
++}
++
++/* Register the xen-backend bus, then hook backend_probe_and_watch() in. */
++static int __init xenbus_probe_backend_init(void)
++{
++	static struct notifier_block xenstore_notifier = {
++		.notifier_call = backend_probe_and_watch
++	};
++	int err;
++
++	DPRINTK("");
++
++	/* Register ourselves with the kernel bus subsystem */
++	err = bus_register(&xenbus_backend.bus);
++	if (err) {
++		printk(KERN_CRIT "%s didn't register bus!\n", __func__);
++		return err;
++	}
++	/* Success is only worth a debug-level trace, not KERN_CRIT. */
++	DPRINTK("bus registered ok");
++	register_xenstore_notifier(&xenstore_notifier);
++	return 0;
++}
++subsys_initcall(xenbus_probe_backend_init);
+diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
+new file mode 100644
+index 0000000..47be902
+--- /dev/null
++++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
+@@ -0,0 +1,292 @@
++#define DPRINTK(fmt, args...) \
++ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
++ __func__, __LINE__, ##args)
++
++#include <linux/kernel.h>
++#include <linux/err.h>
++#include <linux/string.h>
++#include <linux/ctype.h>
++#include <linux/fcntl.h>
++#include <linux/mm.h>
++#include <linux/proc_fs.h>
++#include <linux/notifier.h>
++#include <linux/kthread.h>
++#include <linux/mutex.h>
++#include <linux/io.h>
++
++#include <asm/page.h>
++#include <asm/pgtable.h>
++#include <asm/xen/hypervisor.h>
++#include <xen/xenbus.h>
++#include <xen/events.h>
++#include <xen/page.h>
++
++#include "xenbus_comms.h"
++#include "xenbus_probe.h"
++
++/* device/<type>/<id> => <type>-<id>; warnings name the whole node. */
++static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
++{
++	const char *rest = strchr(nodename, '/');
++
++	if (!rest || strlen(rest + 1) >= XEN_BUS_ID_SIZE) {
++		printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
++		return -EINVAL;
++	}
++	strlcpy(bus_id, rest + 1, XEN_BUS_ID_SIZE);
++	if (!strchr(bus_id, '/')) {
++		printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
++		return -EINVAL;
++	}
++	*strchr(bus_id, '/') = '-';
++	return 0;
++}
++
++/*
++ * device/<typename>/<name>: build the node name under bus->root and pass
++ * it to xenbus_probe_node().  -ENOMEM if the name cannot be allocated.
++ */
++static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, const char *name)
++{
++	int err = -ENOMEM;
++	char *path = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name);
++
++	if (path) {
++		DPRINTK("%s", path);
++		err = xenbus_probe_node(bus, type, path);
++		kfree(path);
++	}
++	return err;
++}
++
++/*
++ * uevent hook of the frontend bus: adds "MODALIAS=xen:<devicetype>" to the
++ * event environment.  Returns 0, or -ENOMEM if the variable could not be
++ * added.
++ */
++static int xenbus_uevent_frontend(struct device *_dev, struct kobj_uevent_env *env)
++{
++	struct xenbus_device *xdev = to_xenbus_device(_dev);
++	int failed = add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype);
++
++	return failed ? -ENOMEM : 0;
++}
++
++
++/*
++ * otherend_changed callback of the frontend bus: hands the watch event
++ * straight to xenbus_otherend_changed(), passing 1 as its last argument.
++ */
++static void backend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ xenbus_otherend_changed(watch, vec, len, 1);
++}
++
++static struct device_attribute xenbus_frontend_dev_attrs[] = {
++ __ATTR_NULL
++};
++
++
++/*
++ * Description of the frontend bus: devices live below "device" in
++ * xenstore, two levels deep, and appear on the kernel bus named "xen".
++ * The frontend-specific helpers defined in this file are wired in next to
++ * the shared xenbus_dev_* / xenbus_match handlers.
++ */
++static struct xen_bus_type xenbus_frontend = {
++ .root = "device",
++ .levels = 2, /* device/type/<id> */
++ .get_bus_id = frontend_bus_id,
++ .probe = xenbus_probe_frontend,
++ .otherend_changed = backend_changed,
++ .bus = {
++ .name = "xen",
++ .match = xenbus_match,
++ .uevent = xenbus_uevent_frontend,
++ .probe = xenbus_dev_probe,
++ .remove = xenbus_dev_remove,
++ .shutdown = xenbus_dev_shutdown,
++ .dev_attrs= xenbus_frontend_dev_attrs,
++
++ .suspend = xenbus_dev_suspend,
++ .resume = xenbus_dev_resume,
++ },
++};
++
++/*
++ * Callback of fe_watch: forwards the path that changed
++ * (vec[XS_WATCH_PATH]) to xenbus_dev_changed() for the frontend bus.
++ */
++static void frontend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ DPRINTK("");
++
++ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
++}
++
++
++/* We watch for devices appearing and vanishing. */
++static struct xenbus_watch fe_watch = {
++ .node = "device",
++ .callback = frontend_changed,
++};
++
++/*
++ * read_otherend_details hook installed on every frontend driver: reads the
++ * other end's details using the "backend-id" and "backend" keys.
++ */
++static int read_backend_details(struct xenbus_device *xendev)
++{
++ return xenbus_read_otherend_details(xendev, "backend-id", "backend");
++}
++
++/*
++ * bus_for_each_dev() callback: returns non-zero if @dev is still in the
++ * process of connecting.  @data is either NULL or the only device_driver
++ * the caller is interested in.
++ */
++static int is_device_connecting(struct device *dev, void *data)
++{
++	struct device_driver *wanted = data;
++	struct xenbus_device *xendev;
++	struct xenbus_driver *xendrv;
++
++	/*
++	 * Only devices that are bound to a driver can be connecting; a
++	 * device without one will never connect.
++	 */
++	if (!dev->driver)
++		return 0;
++
++	/* Skip devices of other drivers when the search is restricted. */
++	if (wanted && dev->driver != wanted)
++		return 0;
++
++	xendev = to_xenbus_device(dev);
++	xendrv = to_xenbus_driver(dev->driver);
++
++	/* Not yet connected: definitely still connecting. */
++	if (xendev->state < XenbusStateConnected)
++		return 1;
++
++	/* Past the connected state: no longer connecting. */
++	if (xendev->state != XenbusStateConnected)
++		return 0;
++
++	/* Connected, but the driver may still report "not ready". */
++	return xendrv->is_ready && !xendrv->is_ready(xendev);
++}
++
++/*
++ * Runs is_device_connecting() over the devices on the frontend bus and
++ * returns the result of bus_for_each_dev().  @drv is passed through as the
++ * callback's data argument (NULL means no driver filter).
++ */
++static int exists_connecting_device(struct device_driver *drv)
++{
++ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
++ is_device_connecting);
++}
++
++/*
++ * bus_for_each_dev() callback: reports devices that have no driver and
++ * devices that have not reached XenbusStateConnected.  @data is either
++ * NULL or the only device_driver to report on.  Always returns 0 so the
++ * iteration visits every device.
++ */
++static int print_device_status(struct device *dev, void *data)
++{
++	struct device_driver *only = data;
++	struct xenbus_device *xendev = to_xenbus_device(dev);
++	enum xenbus_state rstate;
++
++	/* Restricted to one driver and this device is not bound to it. */
++	if (only && dev->driver != only)
++		return 0;
++
++	if (!dev->driver) {
++		/* Information only: is this too noisy? */
++		printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
++		       xendev->nodename);
++		return 0;
++	}
++
++	if (xendev->state < XenbusStateConnected) {
++		/* The remote state is only known if there is an other end. */
++		rstate = XenbusStateUnknown;
++		if (xendev->otherend)
++			rstate = xenbus_read_driver_state(xendev->otherend);
++
++		printk(KERN_WARNING "XENBUS: Timeout connecting "
++		       "to device: %s (local state %d, remote state %d)\n",
++		       xendev->nodename, xendev->state, rstate);
++	}
++
++	return 0;
++}
++
++/* We only wait for device setup after most initcalls have run. */
++static int ready_to_wait_for_devices;
++
++/*
++ * On a 5-minute timeout, wait for all devices currently configured. We need
++ * to do this to guarantee that the filesystems and / or network devices
++ * needed for boot are available, before we can allow the boot to proceed.
++ *
++ * This needs to be on a late_initcall, to happen after the frontend device
++ * drivers have been initialised, but before the root fs is mounted.
++ *
++ * A possible improvement here would be to have the tools add a per-device
++ * flag to the store entry, indicating whether it is needed at boot time.
++ * This would allow people who knew what they were doing to accelerate their
++ * boot slightly, but of course needs tools or manual intervention to set up
++ * those flags correctly.
++ */
++/*
++ * Poll until no device on the frontend bus is still connecting, or until
++ * the 300 second budget is used up, then report the devices that are not
++ * connected.  @xendrv restricts both the wait and the report to one
++ * driver; NULL means all drivers.
++ */
++static void wait_for_devices(struct xenbus_driver *xendrv)
++{
++ unsigned long start = jiffies;
++ struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
++ unsigned int seconds_waited = 0;
++
++ /* Nothing to do before boot_wait_for_devices() ran or outside Xen. */
++ if (!ready_to_wait_for_devices || !xen_domain())
++ return;
++
++ while (exists_connecting_device(drv)) {
++ /* Every further 5 seconds of waiting: print the time left. */
++ if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
++ if (!seconds_waited)
++ printk(KERN_WARNING "XENBUS: Waiting for "
++ "devices to initialise: ");
++ seconds_waited += 5;
++ printk("%us...", 300 - seconds_waited);
++ /* Give up once the full 300 seconds have elapsed. */
++ if (seconds_waited == 300)
++ break;
++ }
++
++ /* Sleep for a tenth of a second between polls. */
++ schedule_timeout_interruptible(HZ/10);
++ }
++
++ /* Terminate the countdown line if one was started. */
++ if (seconds_waited)
++ printk("\n");
++
++ bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
++ print_device_status);
++}
++
++/*
++ * Register a frontend driver on the frontend bus.
++ *
++ * Installs read_backend_details() as the driver's read_otherend_details
++ * hook, registers the driver through xenbus_register_driver_common() and,
++ * on success, waits for the driver's devices via wait_for_devices().
++ *
++ * Returns 0 on success or the registration error.
++ */
++int __xenbus_register_frontend(struct xenbus_driver *drv,
++			       struct module *owner, const char *mod_name)
++{
++	int ret;
++
++	drv->read_otherend_details = read_backend_details;
++
++	ret = xenbus_register_driver_common(drv, &xenbus_frontend,
++					    owner, mod_name);
++
++	/* If this driver is loaded as a module wait for devices to attach. */
++	if (!ret)
++		wait_for_devices(drv);
++
++	return ret;
++}
++EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
++
++/*
++ * Xenstore notifier callback: enumerates the frontend devices present in
++ * xenstore and registers fe_watch so later changes are noticed.
++ *
++ * Progress is traced at KERN_DEBUG; a failure to register the watch is
++ * reported at KERN_ERR.  Always returns NOTIFY_DONE; @notifier, @event and
++ * @data are unused.
++ */
++static int frontend_probe_and_watch(struct notifier_block *notifier,
++				   unsigned long event,
++				   void *data)
++{
++	int err;
++
++	/* Enumerate devices in xenstore and watch for changes. */
++	xenbus_probe_devices(&xenbus_frontend);
++	printk(KERN_DEBUG "%s devices probed ok\n", __func__);
++
++	err = register_xenbus_watch(&fe_watch);
++	if (err)
++		printk(KERN_ERR "%s failed to register watch (%d)\n",
++		       __func__, err);
++	else
++		printk(KERN_DEBUG "%s watch add ok\n", __func__);
++
++	return NOTIFY_DONE;
++}
++
++
++/*
++ * subsys_initcall: registers the frontend bus with the driver core and
++ * hooks frontend_probe_and_watch() into the xenstore notifier chain.
++ *
++ * Returns 0 on success or the bus_register() error.  Only the failure is
++ * logged at KERN_CRIT; the success trace is a KERN_DEBUG message.
++ */
++static int __init xenbus_probe_frontend_init(void)
++{
++	static struct notifier_block xenstore_notifier = {
++		.notifier_call = frontend_probe_and_watch
++	};
++	int err;
++
++	DPRINTK("");
++
++	/* Register ourselves with the kernel bus subsystem */
++	err = bus_register(&xenbus_frontend.bus);
++	if (err) {
++		printk(KERN_CRIT "%s didn't register bus!\n", __func__);
++		return err;
++	}
++	printk(KERN_DEBUG "%s bus registered ok\n", __func__);
++
++	register_xenstore_notifier(&xenstore_notifier);
++
++	return 0;
++}
++subsys_initcall(xenbus_probe_frontend_init);
++
++#ifndef MODULE
++/*
++ * late_initcall (built-in only): sets ready_to_wait_for_devices, which
++ * wait_for_devices() checks before doing anything, and then waits once for
++ * the devices of all drivers.
++ */
++static int __init boot_wait_for_devices(void)
++{
++ ready_to_wait_for_devices = 1;
++ wait_for_devices(NULL);
++ return 0;
++}
++
++late_initcall(boot_wait_for_devices);
++#endif
++
++MODULE_LICENSE("GPL");
+diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
+index eab33f1..6f91e8c 100644
+--- a/drivers/xen/xenbus/xenbus_xs.c
++++ b/drivers/xen/xenbus/xenbus_xs.c
+@@ -76,6 +76,14 @@ struct xs_handle {
+ /*
+ * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex.
+ * response_mutex is never taken simultaneously with the other three.
++ *
++ * transaction_mutex must be held before incrementing
++ * transaction_count. The mutex is held when a suspend is in
++ * progress to prevent new transactions starting.
++ *
++ * When decrementing transaction_count to zero the wait queue
++ * should be woken up, the suspend code waits for count to
++ * reach zero.
+ */
+
+ /* One request at a time. */
+@@ -85,7 +93,9 @@ struct xs_handle {
+ struct mutex response_mutex;
+
+ /* Protect transactions against save/restore. */
+- struct rw_semaphore transaction_mutex;
++ struct mutex transaction_mutex;
++ atomic_t transaction_count;
++ wait_queue_head_t transaction_wq;
+
+ /* Protect watch (de)register against save/restore. */
+ struct rw_semaphore watch_mutex;
+@@ -157,6 +167,31 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
+ return body;
+ }
+
++/*
++ * Account for a new transaction.  The counter is incremented under
++ * transaction_mutex, so this blocks while transaction_suspend() holds the
++ * mutex.
++ */
++static void transaction_start(void)
++{
++ mutex_lock(&xs_state.transaction_mutex);
++ atomic_inc(&xs_state.transaction_count);
++ mutex_unlock(&xs_state.transaction_mutex);
++}
++
++/*
++ * Account for a finished transaction; when the count drops to zero, wake
++ * up anyone sleeping on transaction_wq.
++ */
++static void transaction_end(void)
++{
++ if (atomic_dec_and_test(&xs_state.transaction_count))
++ wake_up(&xs_state.transaction_wq);
++}
++
++/*
++ * Take transaction_mutex and wait until transaction_count reaches zero.
++ * The mutex is deliberately left locked on return; transaction_resume()
++ * releases it.
++ */
++static void transaction_suspend(void)
++{
++ mutex_lock(&xs_state.transaction_mutex);
++ wait_event(xs_state.transaction_wq,
++ atomic_read(&xs_state.transaction_count) == 0);
++}
++
++/* Release the transaction_mutex taken by transaction_suspend(). */
++static void transaction_resume(void)
++{
++ mutex_unlock(&xs_state.transaction_mutex);
++}
++
+ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
+ {
+ void *ret;
+@@ -164,7 +199,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
+ int err;
+
+ if (req_msg.type == XS_TRANSACTION_START)
+- down_read(&xs_state.transaction_mutex);
++ transaction_start();
+
+ mutex_lock(&xs_state.request_mutex);
+
+@@ -180,7 +215,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
+ if ((msg->type == XS_TRANSACTION_END) ||
+ ((req_msg.type == XS_TRANSACTION_START) &&
+ (msg->type == XS_ERROR)))
+- up_read(&xs_state.transaction_mutex);
++ transaction_end();
+
+ return ret;
+ }
+@@ -432,11 +467,11 @@ int xenbus_transaction_start(struct xenbus_transaction *t)
+ {
+ char *id_str;
+
+- down_read(&xs_state.transaction_mutex);
++ transaction_start();
+
+ id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
+ if (IS_ERR(id_str)) {
+- up_read(&xs_state.transaction_mutex);
++ transaction_end();
+ return PTR_ERR(id_str);
+ }
+
+@@ -461,7 +496,7 @@ int xenbus_transaction_end(struct xenbus_transaction t, int abort)
+
+ err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
+
+- up_read(&xs_state.transaction_mutex);
++ transaction_end();
+
+ return err;
+ }
+@@ -662,7 +697,7 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
+
+ void xs_suspend(void)
+ {
+- down_write(&xs_state.transaction_mutex);
++ transaction_suspend();
+ down_write(&xs_state.watch_mutex);
+ mutex_lock(&xs_state.request_mutex);
+ mutex_lock(&xs_state.response_mutex);
+@@ -677,7 +712,7 @@ void xs_resume(void)
+
+ mutex_unlock(&xs_state.response_mutex);
+ mutex_unlock(&xs_state.request_mutex);
+- up_write(&xs_state.transaction_mutex);
++ transaction_resume();
+
+ /* No need for watches_lock: the watch_mutex is sufficient. */
+ list_for_each_entry(watch, &watches, list) {
+@@ -693,7 +728,7 @@ void xs_suspend_cancel(void)
+ mutex_unlock(&xs_state.response_mutex);
+ mutex_unlock(&xs_state.request_mutex);
+ up_write(&xs_state.watch_mutex);
+- up_write(&xs_state.transaction_mutex);
++ mutex_unlock(&xs_state.transaction_mutex);
+ }
+
+ static int xenwatch_thread(void *unused)
+@@ -843,8 +878,10 @@ int xs_init(void)
+
+ mutex_init(&xs_state.request_mutex);
+ mutex_init(&xs_state.response_mutex);
+- init_rwsem(&xs_state.transaction_mutex);
++ mutex_init(&xs_state.transaction_mutex);
+ init_rwsem(&xs_state.watch_mutex);
++ atomic_set(&xs_state.transaction_count, 0);
++ init_waitqueue_head(&xs_state.transaction_wq);
+
+ /* Initialize the shared memory rings to talk to xenstored */
+ err = xb_init_comms();
+diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
+index 25275c3..4a0be9a 100644
+--- a/drivers/xen/xenfs/Makefile
++++ b/drivers/xen/xenfs/Makefile
+@@ -1,3 +1,4 @@
+ obj-$(CONFIG_XENFS) += xenfs.o
+
+-xenfs-objs = super.o xenbus.o
+\ No newline at end of file
++xenfs-y = super.o xenbus.o
++xenfs-$(CONFIG_XEN_DOM0) += xenstored.o privcmd.o
+diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
+new file mode 100644
+index 0000000..f80be7f
+--- /dev/null
++++ b/drivers/xen/xenfs/privcmd.c
+@@ -0,0 +1,404 @@
++/******************************************************************************
++ * privcmd.c
++ *
++ * Interface to privileged domain-0 commands.
++ *
++ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
++ */
++
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/string.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/uaccess.h>
++#include <linux/swap.h>
++#include <linux/smp_lock.h>
++#include <linux/highmem.h>
++#include <linux/pagemap.h>
++#include <linux/seq_file.h>
++
++#include <asm/pgalloc.h>
++#include <asm/pgtable.h>
++#include <asm/tlb.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++
++#include <xen/xen.h>
++#include <xen/privcmd.h>
++#include <xen/interface/xen.h>
++#include <xen/features.h>
++#include <xen/page.h>
++#include <xen/xen-ops.h>
++
++#ifndef HAVE_ARCH_PRIVCMD_MMAP
++static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
++#endif
++
++/*
++ * IOCTL_PRIVCMD_HYPERCALL: copy a struct privcmd_hypercall from user space
++ * and issue it through privcmd_call() with its op and five arguments.
++ *
++ * Returns -EFAULT if the request cannot be read, otherwise whatever
++ * privcmd_call() returns.
++ */
++static long privcmd_ioctl_hypercall(void __user *udata)
++{
++	struct privcmd_hypercall hc;
++
++	if (copy_from_user(&hc, udata, sizeof(hc)))
++		return -EFAULT;
++
++	return privcmd_call(hc.op,
++			    hc.arg[0], hc.arg[1],
++			    hc.arg[2], hc.arg[3],
++			    hc.arg[4]);
++}
++
++/*
++ * Free every page linked into @pages through its lru field and leave
++ * @pages as an empty, re-initialised list head.
++ */
++static void free_page_list(struct list_head *pages)
++{
++ struct page *p, *n;
++
++ list_for_each_entry_safe(p, n, pages, lru)
++ __free_page(p);
++
++ INIT_LIST_HEAD(pages);
++}
++
++/*
++ * Copy an array of @nelem items of @size bytes each from user space
++ * (@data) into freshly allocated pages appended to @pagelist.  An item
++ * never straddles two pages: a new page is started whenever the next item
++ * would not fit into the current one.
++ *
++ * Returns 0 on success (also, without copying anything, when @size is
++ * larger than a page), -ENOMEM if a page cannot be allocated and -EFAULT
++ * if user memory cannot be read.  On failure it's up to the caller to
++ * dispose of the partial list.
++ */
++static int gather_array(struct list_head *pagelist,
++			unsigned nelem, size_t size,
++			void __user *data)
++{
++	unsigned offset = PAGE_SIZE;	/* forces a page for the first item */
++	void *buf = NULL;		/* quiet, gcc */
++
++	if (size > PAGE_SIZE)
++		return 0;
++
++	for (; nelem; nelem--) {
++		if (offset > PAGE_SIZE-size) {
++			struct page *page = alloc_page(GFP_KERNEL);
++
++			if (page == NULL)
++				return -ENOMEM;
++
++			buf = page_address(page);
++
++			list_add_tail(&page->lru, pagelist);
++			offset = 0;
++		}
++
++		if (copy_from_user(buf + offset, data, size))
++			return -EFAULT;
++
++		data += size;
++		offset += size;
++	}
++
++	return 0;
++}
++
++/*
++ * Invoke @fn(item, @state) on each of the @nelem items of @size bytes that
++ * are spread over the list of pages starting after @pos, using the same
++ * items-per-page layout as gather_array().
++ *
++ * Stops at the first non-zero result of @fn and returns it; returns 0 if
++ * every call succeeded.
++ */
++static int traverse_pages(unsigned nelem, size_t size,
++			  struct list_head *pos,
++			  int (*fn)(void *data, void *state),
++			  void *state)
++{
++	unsigned offset = PAGE_SIZE;	/* forces a page step for item 0 */
++	void *buf = NULL;		/* hush, gcc */
++	int rc;
++
++	BUG_ON(size > PAGE_SIZE);
++
++	for (; nelem; nelem--) {
++		if (offset > PAGE_SIZE-size) {
++			/* Current page exhausted: move on to the next one. */
++			pos = pos->next;
++			buf = page_address(list_entry(pos, struct page, lru));
++			offset = 0;
++		}
++
++		rc = (*fn)(buf + offset, state);
++		if (rc)
++			return rc;
++
++		offset += size;
++	}
++
++	return 0;
++}
++
++/* State carried across mmap_mfn_range() calls by privcmd_ioctl_mmap(). */
++struct mmap_mfn_state {
++ unsigned long va; /* address the next chunk must start at */
++ struct vm_area_struct *vma; /* VMA being populated */
++ domid_t domain; /* domain passed to the remap call */
++};
++
++/*
++ * traverse_pages() callback for IOCTL_PRIVCMD_MMAP: map one
++ * struct privcmd_mmap_entry (@data) into the VMA held in the
++ * struct mmap_mfn_state (@state).
++ *
++ * Returns 0 on success, -EINVAL if the entry would wrap the address
++ * space, does not start at the expected address or extends past the end
++ * of the VMA, or the negative result of xen_remap_domain_mfn_range().
++ */
++static int mmap_mfn_range(void *data, void *state)
++{
++ struct privcmd_mmap_entry *msg = data;
++ struct mmap_mfn_state *st = state;
++ struct vm_area_struct *vma = st->vma;
++ int rc;
++
++ /* Do not allow range to wrap the address space. */
++ if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
++ ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
++ return -EINVAL;
++
++ /* Range chunks must be contiguous in va space. */
++ if ((msg->va != st->va) ||
++ ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
++ return -EINVAL;
++
++ rc = xen_remap_domain_mfn_range(vma,
++ msg->va & PAGE_MASK,
++ msg->mfn, msg->npages,
++ vma->vm_page_prot,
++ st->domain);
++ if (rc < 0)
++ return rc;
++
++ /* The next entry has to continue right after this one. */
++ st->va += msg->npages << PAGE_SHIFT;
++
++ return 0;
++}
++
++/*
++ * IOCTL_PRIVCMD_MMAP: map the ranges described by a user-supplied array
++ * of struct privcmd_mmap_entry into one VMA of the calling process.
++ *
++ * Returns -EPERM outside the initial domain, -EFAULT if the request
++ * cannot be read, the gather_array() error if the entries cannot be
++ * copied, -EINVAL if the first entry does not start exactly at the start
++ * of a VMA or that VMA was already claimed, otherwise the result of
++ * mapping the entries (0 on success, also when no entries were gathered).
++ */
++static long privcmd_ioctl_mmap(void __user *udata)
++{
++ struct privcmd_mmap mmapcmd;
++ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
++ int rc;
++ LIST_HEAD(pagelist);
++ struct mmap_mfn_state state;
++
++ if (!xen_initial_domain())
++ return -EPERM;
++
++ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
++ return -EFAULT;
++
++ /* Pull the whole entry array into kernel pages first. */
++ rc = gather_array(&pagelist,
++ mmapcmd.num, sizeof(struct privcmd_mmap_entry),
++ mmapcmd.entry);
++
++ if (rc || list_empty(&pagelist))
++ goto out;
++
++ down_write(&mm->mmap_sem);
++
++ {
++ /* The first entry selects the VMA that will be populated. */
++ struct page *page = list_first_entry(&pagelist,
++ struct page, lru);
++ struct privcmd_mmap_entry *msg = page_address(page);
++
++ vma = find_vma(mm, msg->va);
++ rc = -EINVAL;
++
++ if (!vma || (msg->va != vma->vm_start) ||
++ !privcmd_enforce_singleshot_mapping(vma))
++ goto out_up;
++ }
++
++ state.va = vma->vm_start;
++ state.vma = vma;
++ state.domain = mmapcmd.dom;
++
++ rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
++ &pagelist,
++ mmap_mfn_range, &state);
++
++
++out_up:
++ up_write(&mm->mmap_sem);
++
++out:
++ /* The gathered copy is released on every path, including errors. */
++ free_page_list(&pagelist);
++
++ return rc;
++}
++
++/* State shared by the traverse_pages() callbacks of the MMAPBATCH ioctl. */
++struct mmap_batch_state {
++ domid_t domain; /* domain passed to the remap call */
++ unsigned long va; /* address the next frame is mapped at */
++ struct vm_area_struct *vma; /* VMA being populated */
++ int err; /* number of frames that failed to map */
++
++ xen_pfn_t __user *user; /* next user-space slot to write back to */
++};
++
++/*
++ * traverse_pages() callback for IOCTL_PRIVCMD_MMAPBATCH: map the single
++ * frame *@data at the current address of the struct mmap_batch_state
++ * (@state) and advance that address by one page.
++ *
++ * A frame that fails to map is tagged by OR-ing 0xf0000000 into it and
++ * counted in st->err.  Always returns 0, so the traversal carries on with
++ * the remaining frames.
++ */
++static int mmap_batch_fn(void *data, void *state)
++{
++	struct mmap_batch_state *st = state;
++	xen_pfn_t *mfnp = data;
++	int rc;
++
++	rc = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
++					st->vma->vm_page_prot, st->domain);
++	if (rc < 0) {
++		*mfnp |= 0xf0000000U;
++		st->err++;
++	}
++
++	st->va += PAGE_SIZE;
++
++	return 0;
++}
++
++/*
++ * traverse_pages() callback: write the (possibly error-tagged) frame
++ * number *@data to the next slot of the caller's user-space array.
++ *
++ * Returns the put_user() result, so the traversal stops at the first slot
++ * that cannot be written and the failure reaches the caller instead of
++ * being dropped.
++ */
++static int mmap_return_errors(void *data, void *state)
++{
++	xen_pfn_t *mfnp = data;
++	struct mmap_batch_state *st = state;
++
++	return put_user(*mfnp, st->user++);
++}
++
++static struct vm_operations_struct privcmd_vm_ops;
++
++/*
++ * IOCTL_PRIVCMD_MMAPBATCH: map an array of m.num frames of domain m.dom,
++ * one page each, into the privcmd VMA that starts at m.addr.
++ *
++ * The VMA must have been created by mmap() on the privcmd file, must span
++ * exactly the requested range and must not have been claimed before.
++ * Frames that fail to map are tagged by mmap_batch_fn() and the whole
++ * array is then written back to m.arr so user space can see which ones
++ * failed.
++ *
++ * Returns -EPERM outside the initial domain, -EFAULT if the request cannot
++ * be read or the array cannot be written back, -EINVAL for a bad count or
++ * unsuitable VMA, the gather_array() error if the frames cannot be
++ * copied, and 0 otherwise.
++ */
++static long privcmd_ioctl_mmap_batch(void __user *udata)
++{
++	int ret;
++	struct privcmd_mmapbatch m;
++	struct mm_struct *mm = current->mm;
++	struct vm_area_struct *vma;
++	unsigned long nr_pages;
++	LIST_HEAD(pagelist);
++	struct mmap_batch_state state;
++
++	if (!xen_initial_domain())
++		return -EPERM;
++
++	if (copy_from_user(&m, udata, sizeof(m)))
++		return -EFAULT;
++
++	nr_pages = m.num;
++	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
++		return -EINVAL;
++
++	/* Pull the whole frame array into kernel pages first. */
++	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
++			   m.arr);
++
++	if (ret || list_empty(&pagelist))
++		goto out;
++
++	down_write(&mm->mmap_sem);
++
++	vma = find_vma(mm, m.addr);
++	ret = -EINVAL;
++	if (!vma ||
++	    vma->vm_ops != &privcmd_vm_ops ||
++	    (m.addr != vma->vm_start) ||
++	    ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
++	    !privcmd_enforce_singleshot_mapping(vma)) {
++		up_write(&mm->mmap_sem);
++		goto out;
++	}
++
++	state.domain = m.dom;
++	state.vma = vma;
++	state.va = m.addr;
++	state.err = 0;
++
++	ret = traverse_pages(m.num, sizeof(xen_pfn_t),
++			     &pagelist, mmap_batch_fn, &state);
++
++	up_write(&mm->mmap_sem);
++
++	if (state.err > 0) {
++		/*
++		 * Report the tagged frames to user space; a fault while
++		 * doing so must not be reported as success.
++		 */
++		state.user = m.arr;
++		ret = traverse_pages(m.num, sizeof(xen_pfn_t),
++				     &pagelist,
++				     mmap_return_errors, &state);
++	}
++
++out:
++	free_page_list(&pagelist);
++
++	return ret;
++}
++
++/*
++ * unlocked_ioctl entry point of the privcmd file: dispatch @cmd to the
++ * matching helper, treating @data as a user-space pointer.
++ *
++ * Returns the helper's result unmodified, or -EINVAL for an unknown
++ * command.  The result is kept in a long, matching the helpers' return
++ * type, so a hypercall return value is not truncated to int.
++ */
++static long privcmd_ioctl(struct file *file,
++			  unsigned int cmd, unsigned long data)
++{
++	long ret;
++	void __user *udata = (void __user *) data;
++
++	switch (cmd) {
++	case IOCTL_PRIVCMD_HYPERCALL:
++		ret = privcmd_ioctl_hypercall(udata);
++		break;
++
++	case IOCTL_PRIVCMD_MMAP:
++		ret = privcmd_ioctl_mmap(udata);
++		break;
++
++	case IOCTL_PRIVCMD_MMAPBATCH:
++		ret = privcmd_ioctl_mmap_batch(udata);
++		break;
++
++	default:
++		ret = -EINVAL;
++		break;
++	}
++
++	return ret;
++}
++
++#ifndef HAVE_ARCH_PRIVCMD_MMAP
++/*
++ * Fault handler of privcmd VMAs: logs the faulting VMA and address at
++ * KERN_DEBUG and always answers VM_FAULT_SIGBUS.
++ */
++static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++ printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
++ vma, vma->vm_start, vma->vm_end,
++ vmf->pgoff, vmf->virtual_address);
++
++ return VM_FAULT_SIGBUS;
++}
++
++static struct vm_operations_struct privcmd_vm_ops = {
++ .fault = privcmd_fault
++};
++
++/*
++ * mmap handler of the privcmd file: only prepares the VMA.  It marks the
++ * VMA VM_RESERVED | VM_IO | VM_DONTCOPY, installs privcmd_vm_ops and
++ * clears vm_private_data, which privcmd_enforce_singleshot_mapping()
++ * later uses as its "already mapped" marker.
++ *
++ * Returns 0, or -ENOSYS when XENFEAT_auto_translated_physmap is set.
++ */
++static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
++{
++ /* Unsupported for auto-translate guests. */
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return -ENOSYS;
++
++ /* DONTCOPY is essential for Xen as copy_page_range is broken. */
++ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
++ vma->vm_ops = &privcmd_vm_ops;
++ vma->vm_private_data = NULL;
++
++ return 0;
++}
++
++/*
++ * Claim @vma for a mapping request by swapping (void *)1 into
++ * vm_private_data.  Returns non-zero only if the previous value was NULL,
++ * i.e. on the first claim of this VMA.
++ */
++static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
++{
++ return (xchg(&vma->vm_private_data, (void *)1) == NULL);
++}
++#endif
++
++/* File operations of the privcmd file: only ioctl and mmap are provided. */
++const struct file_operations privcmd_file_ops = {
++ .unlocked_ioctl = privcmd_ioctl,
++ .mmap = privcmd_mmap,
++};
+diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
+index 6559e0c..229c831 100644
+--- a/drivers/xen/xenfs/super.c
++++ b/drivers/xen/xenfs/super.c
+@@ -12,6 +12,10 @@
+ #include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/magic.h>
++#include <linux/mm.h>
++#include <linux/backing-dev.h>
++
++#include <xen/xen.h>
+
+ #include "xenfs.h"
+
+@@ -20,6 +24,62 @@
+ MODULE_DESCRIPTION("Xen filesystem");
+ MODULE_LICENSE("GPL");
+
++/*
++ * set_page_dirty operation for xenfs inodes: sets the page's dirty flag
++ * and returns non-zero only if the flag was not set before.
++ */
++static int xenfs_set_page_dirty(struct page *page)
++{
++ return !TestSetPageDirty(page);
++}
++
++static const struct address_space_operations xenfs_aops = {
++ .set_page_dirty = xenfs_set_page_dirty,
++};
++
++static struct backing_dev_info xenfs_backing_dev_info = {
++ .ra_pages = 0, /* No readahead */
++ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
++};
++
++/*
++ * Allocate an inode on @sb and initialise it for xenfs: given @mode,
++ * xenfs address-space operations and backing device, uid/gid 0, no blocks
++ * and all three timestamps set to the current time.
++ *
++ * Returns the inode, or NULL if new_inode() fails.
++ */
++static struct inode *xenfs_make_inode(struct super_block *sb, int mode)
++{
++	struct inode *inode = new_inode(sb);
++
++	if (!inode)
++		return NULL;
++
++	inode->i_mode = mode;
++	inode->i_mapping->a_ops = &xenfs_aops;
++	inode->i_mapping->backing_dev_info = &xenfs_backing_dev_info;
++	inode->i_uid = inode->i_gid = 0;
++	inode->i_blocks = 0;
++	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
++
++	return inode;
++}
++
++/*
++ * Create the regular file @name below @parent on @sb, with permission
++ * bits @mode, file operations @fops and @data stored in i_private.
++ *
++ * Returns the new dentry, or NULL if either the dentry or the inode could
++ * not be allocated.
++ */
++static struct dentry *xenfs_create_file(struct super_block *sb,
++					struct dentry *parent,
++					const char *name,
++					const struct file_operations *fops,
++					void *data,
++					int mode)
++{
++	struct dentry *dentry = d_alloc_name(parent, name);
++	struct inode *inode;
++
++	if (!dentry)
++		return NULL;
++
++	inode = xenfs_make_inode(sb, S_IFREG | mode);
++	if (inode) {
++		inode->i_fop = fops;
++		inode->i_private = data;
++
++		d_add(dentry, inode);
++		return dentry;
++	}
++
++	/* No inode: drop the dentry reference taken above. */
++	dput(dentry);
++	return NULL;
++}
++
+ static ssize_t capabilities_read(struct file *file, char __user *buf,
+ size_t size, loff_t *off)
+ {
+@@ -43,8 +103,22 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
+ { "capabilities", &capabilities_file_ops, S_IRUGO },
+ {""},
+ };
+-
+- return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
++ int rc;
++
++ rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
++ if (rc < 0)
++ return rc;
++
++ if (xen_initial_domain()) {
++ xenfs_create_file(sb, sb->s_root, "xsd_kva",
++ &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR);
++ xenfs_create_file(sb, sb->s_root, "xsd_port",
++ &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR);
++ xenfs_create_file(sb, sb->s_root, "privcmd",
++ &privcmd_file_ops, NULL, S_IRUSR|S_IWUSR);
++ }
++
++ return rc;
+ }
+
+ static int xenfs_get_sb(struct file_system_type *fs_type,
+@@ -63,11 +137,25 @@ static struct file_system_type xenfs_type = {
+
+ static int __init xenfs_init(void)
+ {
+- if (xen_pv_domain())
+- return register_filesystem(&xenfs_type);
++ int err;
++ if (!xen_pv_domain()) {
++ printk(KERN_INFO "xenfs: not registering filesystem on non-xen platform\n");
++ return 0;
++ }
++
++ err = register_filesystem(&xenfs_type);
++ if (err) {
++ printk(KERN_ERR "xenfs: Unable to register filesystem!\n");
++ goto out;
++ }
++
++ err = bdi_init(&xenfs_backing_dev_info);
++ if (err)
++ unregister_filesystem(&xenfs_type);
++
++ out:
+
+- printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n");
+- return 0;
++ return err;
+ }
+
+ static void __exit xenfs_exit(void)
+diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
+index 51f08b2..b68aa62 100644
+--- a/drivers/xen/xenfs/xenfs.h
++++ b/drivers/xen/xenfs/xenfs.h
+@@ -2,5 +2,8 @@
+ #define _XENFS_XENBUS_H
+
+ extern const struct file_operations xenbus_file_ops;
++extern const struct file_operations privcmd_file_ops;
++extern const struct file_operations xsd_kva_file_ops;
++extern const struct file_operations xsd_port_file_ops;
+
+ #endif /* _XENFS_XENBUS_H */
+diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c
+new file mode 100644
+index 0000000..af10804
+--- /dev/null
++++ b/drivers/xen/xenfs/xenstored.c
+@@ -0,0 +1,67 @@
++#include <linux/types.h>
++#include <linux/mm.h>
++#include <linux/fs.h>
++
++#include <xen/page.h>
++
++#include "xenfs.h"
++#include "../xenbus/xenbus_comms.h"
++
++/*
++ * read handler shared by the xsd_* files: serves the NUL-terminated string
++ * that the open handler stored in file->private_data.
++ */
++static ssize_t xsd_read(struct file *file, char __user *buf,
++			size_t size, loff_t *off)
++{
++	const char *text = (const char *)file->private_data;
++	size_t len = strlen(text);
++
++	return simple_read_from_buffer(buf, size, off, text, len);
++}
++
++/*
++ * release handler shared by the xsd_* files: frees the string the open
++ * handler stored in file->private_data.
++ */
++static int xsd_release(struct inode *inode, struct file *file)
++{
++ kfree(file->private_data);
++ return 0;
++}
++
++/*
++ * open handler of xsd_kva: formats the address held in
++ * xen_store_interface as "0x%p" and keeps the string in
++ * file->private_data for xsd_read().  Returns 0, or -ENOMEM if the string
++ * cannot be allocated.
++ */
++static int xsd_kva_open(struct inode *inode, struct file *file)
++{
++	char *text = kasprintf(GFP_KERNEL, "0x%p", xen_store_interface);
++
++	file->private_data = (void *)text;
++
++	return text ? 0 : -ENOMEM;
++}
++
++/*
++ * mmap handler of xsd_kva: maps the page behind xen_store_interface into
++ * the caller's VMA.
++ *
++ * Returns -EINVAL if more than one page or a non-zero page offset is
++ * requested, -EAGAIN if remap_pfn_range() fails, and 0 on success.
++ */
++static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
++{
++	size_t len = vma->vm_end - vma->vm_start;
++
++	if (len > PAGE_SIZE || vma->vm_pgoff != 0)
++		return -EINVAL;
++
++	return remap_pfn_range(vma, vma->vm_start,
++			       virt_to_pfn(xen_store_interface),
++			       len, vma->vm_page_prot) ? -EAGAIN : 0;
++}
++
++/* File operations of xsd_kva: readable as text and mappable. */
++const struct file_operations xsd_kva_file_ops = {
++ .open = xsd_kva_open,
++ .mmap = xsd_kva_mmap,
++ .read = xsd_read,
++ .release = xsd_release,
++};
++
++/*
++ * open handler of xsd_port: formats xen_store_evtchn as a decimal string
++ * and keeps it in file->private_data for xsd_read().  Returns 0, or
++ * -ENOMEM if the string cannot be allocated.
++ */
++static int xsd_port_open(struct inode *inode, struct file *file)
++{
++	char *text = kasprintf(GFP_KERNEL, "%d", xen_store_evtchn);
++
++	file->private_data = (void *)text;
++
++	return text ? 0 : -ENOMEM;
++}
++
++/* File operations of xsd_port: readable as text only. */
++const struct file_operations xsd_port_file_ops = {
++ .open = xsd_port_open,
++ .read = xsd_read,
++ .release = xsd_release,
++};
+diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h
+index 26373cf..9fb4270 100644
+--- a/include/asm-generic/pci.h
++++ b/include/asm-generic/pci.h
+@@ -43,6 +43,8 @@ pcibios_select_root(struct pci_dev *pdev, struct resource *res)
+ return root;
+ }
+
++#ifndef HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS
++#endif
+ #ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ
+ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
+ {
+diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
+index dd97fb8..b10ec49 100644
+--- a/include/linux/bootmem.h
++++ b/include/linux/bootmem.h
+@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
+ unsigned long addr,
+ unsigned long size);
+ extern void free_bootmem(unsigned long addr, unsigned long size);
++extern void free_bootmem_late(unsigned long addr, unsigned long size);
+
+ /*
+ * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
+diff --git a/include/linux/dmar.h b/include/linux/dmar.h
+index 4a2b162..5de4c9e 100644
+--- a/include/linux/dmar.h
++++ b/include/linux/dmar.h
+@@ -208,16 +208,9 @@ struct dmar_atsr_unit {
+ u8 include_all:1; /* include all ports */
+ };
+
+-/* Intel DMAR initialization functions */
+ extern int intel_iommu_init(void);
+-#else
+-static inline int intel_iommu_init(void)
+-{
+-#ifdef CONFIG_INTR_REMAP
+- return dmar_dev_scope_init();
+-#else
+- return -ENODEV;
+-#endif
+-}
+-#endif /* !CONFIG_DMAR */
++#else /* !CONFIG_DMAR: */
++static inline int intel_iommu_init(void) { return -ENODEV; }
++#endif /* CONFIG_DMAR */
++
+ #endif /* __DMAR_H__ */
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index 24c3956..3d74515 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -105,6 +105,12 @@ extern unsigned int kobjsize(const void *objp);
+ #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
+ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
+ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
++#ifdef CONFIG_XEN
++#define VM_FOREIGN 0x20000000 /* Has pages belonging to another VM */
++struct vm_foreign_map {
++ struct page **map;
++};
++#endif
+
+ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
+ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
+@@ -195,6 +201,15 @@ struct vm_operations_struct {
+ */
+ int (*access)(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write);
++
++ /* Area-specific function for clearing the PTE at @ptep. Returns the
++ * original value of @ptep. */
++ pte_t (*zap_pte)(struct vm_area_struct *vma,
++ unsigned long addr, pte_t *ptep, int is_fullmm);
++
++ /* called before close() to indicate no more pages should be mapped */
++ void (*unmap)(struct vm_area_struct *area);
++
+ #ifdef CONFIG_NUMA
+ /*
+ * set_policy() op must add a reference to any non-NULL @new mempolicy
+diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
+index 6b202b1..b03950e 100644
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -105,6 +105,9 @@ enum pageflags {
+ #ifdef CONFIG_ARCH_USES_PG_UNCACHED
+ PG_uncached, /* Page has been mapped as uncached */
+ #endif
++#ifdef CONFIG_XEN
++ PG_foreign,
++#endif
+ #ifdef CONFIG_MEMORY_FAILURE
+ PG_hwpoison, /* hardware poisoned page. Don't touch */
+ #endif
+@@ -275,6 +278,23 @@ PAGEFLAG(Uncached, uncached)
+ PAGEFLAG_FALSE(Uncached)
+ #endif
+
++#ifdef CONFIG_XEN
++TESTPAGEFLAG(Foreign, foreign)
++__SETPAGEFLAG(Foreign, foreign)
++CLEARPAGEFLAG(Foreign, foreign)
++#define SetPageForeign(_page, dtor) do { \
++ __SetPageForeign(_page); \
++ BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \
++ (_page)->index = (long)(dtor); \
++} while (0)
++#define _PageForeignDestructor(_page) \
++ ((void (*)(struct page *, unsigned int))(_page)->index)
++#define PageForeignDestructor(_page, order) \
++ _PageForeignDestructor(_page)(_page, order)
++#else
++PAGEFLAG_FALSE(Foreign)
++#endif
++
+ #ifdef CONFIG_MEMORY_FAILURE
+ PAGEFLAG(HWPoison, hwpoison)
+ TESTSETFLAG(HWPoison, hwpoison)
+diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
+index 73b1f1c..113585a 100644
+--- a/include/linux/swiotlb.h
++++ b/include/linux/swiotlb.h
+@@ -7,6 +7,8 @@ struct device;
+ struct dma_attrs;
+ struct scatterlist;
+
++extern int swiotlb_force;
++
+ /*
+ * Maximum allowable number of contiguous slabs to map,
+ * must be a power of 2. What is the appropriate value ?
+@@ -20,9 +22,46 @@ struct scatterlist;
+ */
+ #define IO_TLB_SHIFT 11
+
+-extern void
+-swiotlb_init(void);
+-
++/* swiotlb-core.c */
++extern void swiotlb_init(int verbose);
++#ifdef CONFIG_SWIOTLB
++extern void __init swiotlb_free(void);
++#else
++static inline void swiotlb_free(void) { }
++#endif
++extern void swiotlb_print_info(void);
++
++/* swiotlb-core.c: Internal book-keeping functions.
++ * Must be linked against the library to take advantage of them.*/
++#ifdef CONFIG_SWIOTLB
++/*
++ * Enumeration for sync targets
++ */
++enum dma_sync_target {
++ SYNC_FOR_CPU = 0,
++ SYNC_FOR_DEVICE = 1,
++};
++extern char *io_tlb_start;
++extern char *io_tlb_end;
++extern unsigned long io_tlb_nslabs;
++extern void *io_tlb_overflow_buffer;
++extern unsigned long io_tlb_overflow;
++extern int is_swiotlb_buffer(phys_addr_t paddr);
++extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
++ enum dma_data_direction dir);
++extern void *do_map_single(struct device *hwdev, phys_addr_t phys,
++ unsigned long start_dma_addr, size_t size, int dir);
++
++extern void do_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
++ int dir);
++
++extern void do_sync_single(struct device *hwdev, char *dma_addr, size_t size,
++ int dir, int target);
++extern void swiotlb_full(struct device *dev, size_t size, int dir, int do_panic);
++extern void __init swiotlb_init_early(size_t default_size, int verbose);
++#endif
++
++/* swiotlb.c: dma_ops functions. */
+ extern void
+ *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags);
+@@ -88,4 +127,74 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
+ extern int
+ swiotlb_dma_supported(struct device *hwdev, u64 mask);
+
++/* swiotlb-xen.c: dma_ops functions. */
++extern void xen_swiotlb_init(int verbose);
++extern void
++*xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
++ dma_addr_t *dma_handle, gfp_t flags);
++
++extern void
++xen_swiotlb_free_coherent(struct device *hwdev, size_t size,
++ void *vaddr, dma_addr_t dma_handle);
++
++extern dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir,
++ struct dma_attrs *attrs);
++extern void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir,
++ struct dma_attrs *attrs);
++
++extern int
++xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
++ int direction);
++
++extern void
++xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
++ int direction);
++
++extern int
++xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, enum dma_data_direction dir,
++ struct dma_attrs *attrs);
++
++extern void
++xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, enum dma_data_direction dir,
++ struct dma_attrs *attrs);
++
++extern void
++xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir);
++
++extern void
++xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
++ int nelems, enum dma_data_direction dir);
++
++extern void
++xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir);
++
++extern void
++xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
++ int nelems, enum dma_data_direction dir);
++
++extern void
++xen_swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir);
++
++extern void
++xen_swiotlb_sync_single_range_for_device(struct device *hwdev,
++ dma_addr_t dev_addr,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir);
++
++extern int
++xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
++
++extern int
++xen_swiotlb_dma_supported(struct device *hwdev, u64 mask);
++
++
+ #endif /* __LINUX_SWIOTLB_H */
+diff --git a/include/xen/Kbuild b/include/xen/Kbuild
+index 4e65c16..84ad8f0 100644
+--- a/include/xen/Kbuild
++++ b/include/xen/Kbuild
+@@ -1 +1,2 @@
+ header-y += evtchn.h
++header-y += privcmd.h
+diff --git a/include/xen/balloon.h b/include/xen/balloon.h
+new file mode 100644
+index 0000000..e751514
+--- /dev/null
++++ b/include/xen/balloon.h
+@@ -0,0 +1,8 @@
++#ifndef _XEN_BALLOON_H
++#define _XEN_BALLOON_H
++
++/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */
++struct page **alloc_empty_pages_and_pagevec(int nr_pages);
++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages);
++
++#endif
+diff --git a/include/xen/blkif.h b/include/xen/blkif.h
+new file mode 100644
+index 0000000..7172081
+--- /dev/null
++++ b/include/xen/blkif.h
+@@ -0,0 +1,123 @@
++/*
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_BLKIF_H__
++#define __XEN_BLKIF_H__
++
++#include <xen/interface/xen.h>
++#include <xen/interface/io/ring.h>
++#include <xen/interface/io/blkif.h>
++#include <xen/interface/io/protocols.h>
++
++/* Not a real protocol. Used to generate ring structs which contain
++ * the elements common to all protocols only. This way we get a
++ * compiler-checkable way to use common struct elements, so we can
++ * avoid using switch(protocol) in a number of places. */
++struct blkif_common_request {
++ char dummy;
++};
++struct blkif_common_response {
++ char dummy;
++};
++
++/* i386 protocol version */
++#pragma pack(push, 4)
++struct blkif_x86_32_request {
++ uint8_t operation; /* BLKIF_OP_??? */
++ uint8_t nr_segments; /* number of segments */
++ blkif_vdev_t handle; /* only for read/write requests */
++ uint64_t id; /* private guest value, echoed in resp */
++ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
++ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++};
++struct blkif_x86_32_response {
++ uint64_t id; /* copied from request */
++ uint8_t operation; /* copied from request */
++ int16_t status; /* BLKIF_RSP_??? */
++};
++typedef struct blkif_x86_32_request blkif_x86_32_request_t;
++typedef struct blkif_x86_32_response blkif_x86_32_response_t;
++#pragma pack(pop)
++
++/* x86_64 protocol version */
++struct blkif_x86_64_request {
++ uint8_t operation; /* BLKIF_OP_??? */
++ uint8_t nr_segments; /* number of segments */
++ blkif_vdev_t handle; /* only for read/write requests */
++ uint64_t __attribute__((__aligned__(8))) id;
++ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
++ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++};
++struct blkif_x86_64_response {
++ uint64_t __attribute__((__aligned__(8))) id;
++ uint8_t operation; /* copied from request */
++ int16_t status; /* BLKIF_RSP_??? */
++};
++typedef struct blkif_x86_64_request blkif_x86_64_request_t;
++typedef struct blkif_x86_64_response blkif_x86_64_response_t;
++
++DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
++DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
++DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
++
++union blkif_back_rings {
++ struct blkif_back_ring native;
++ struct blkif_common_back_ring common;
++ struct blkif_x86_32_back_ring x86_32;
++ struct blkif_x86_64_back_ring x86_64;
++};
++
++enum blkif_protocol {
++ BLKIF_PROTOCOL_NATIVE = 1,
++ BLKIF_PROTOCOL_X86_32 = 2,
++ BLKIF_PROTOCOL_X86_64 = 3,
++};
++
++static inline void blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src)
++{ /* Copy a request in the i386 protocol layout (src) into a struct blkif_request (dst). */
++ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
++ dst->operation = src->operation;
++ dst->nr_segments = src->nr_segments;
++ dst->handle = src->handle;
++ dst->id = src->id;
++ dst->sector_number = src->sector_number;
++ barrier(); /* NOTE(review): presumably keeps the compiler from re-reading nr_segments from src below - confirm */
++ if (n > dst->nr_segments) /* never copy more than BLKIF_MAX_SEGMENTS_PER_REQUEST segments */
++ n = dst->nr_segments;
++ for (i = 0; i < n; i++)
++ dst->seg[i] = src->seg[i];
++}
++
++static inline void blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src)
++{ /* Copy a request in the x86_64 protocol layout (src) into a struct blkif_request (dst). */
++ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
++ dst->operation = src->operation;
++ dst->nr_segments = src->nr_segments;
++ dst->handle = src->handle;
++ dst->id = src->id;
++ dst->sector_number = src->sector_number;
++ barrier(); /* NOTE(review): presumably keeps the compiler from re-reading nr_segments from src below - confirm */
++ if (n > dst->nr_segments) /* never copy more than BLKIF_MAX_SEGMENTS_PER_REQUEST segments */
++ n = dst->nr_segments;
++ for (i = 0; i < n; i++)
++ dst->seg[i] = src->seg[i];
++}
++
++#endif /* __XEN_BLKIF_H__ */
+diff --git a/include/xen/events.h b/include/xen/events.h
+index e68d59a..4a934a7 100644
+--- a/include/xen/events.h
++++ b/include/xen/events.h
+@@ -12,6 +12,8 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname,
+ void *dev_id);
++int bind_virq_to_irq(unsigned int virq, unsigned int cpu);
++
+ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname,
+@@ -22,6 +24,12 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id);
++int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
++ unsigned int remote_port,
++ irq_handler_t handler,
++ unsigned long irqflags,
++ const char *devname,
++ void *dev_id);
+
+ /*
+ * Common unbind function for all event sources. Takes IRQ to unbind from.
+@@ -56,4 +64,23 @@ void xen_poll_irq(int irq);
+ /* Determine the IRQ which is bound to an event channel */
+ unsigned irq_from_evtchn(unsigned int evtchn);
+
++/* Allocate an irq for a physical interrupt, given a gsi. "Legacy"
++ GSIs are identity mapped; others are dynamically allocated as
++ usual. */
++int xen_allocate_pirq(unsigned gsi, int shareable, char *name);
++
++/* Return vector allocated to pirq */
++int xen_vector_from_irq(unsigned pirq);
++
++/* Return gsi allocated to pirq */
++int xen_gsi_from_irq(unsigned pirq);
++
++#ifdef CONFIG_XEN_DOM0_PCI
++void xen_setup_pirqs(void);
++#else
++static inline void xen_setup_pirqs(void)
++{
++}
++#endif
++
+ #endif /* _XEN_EVENTS_H */
+diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h
+new file mode 100644
+index 0000000..8bd1467
+--- /dev/null
++++ b/include/xen/gntdev.h
+@@ -0,0 +1,119 @@
++/******************************************************************************
++ * gntdev.h
++ *
++ * Interface to /dev/xen/gntdev.
++ *
++ * Copyright (c) 2007, D G Murray
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __LINUX_PUBLIC_GNTDEV_H__
++#define __LINUX_PUBLIC_GNTDEV_H__
++
++struct ioctl_gntdev_grant_ref {
++ /* The domain ID of the grant to be mapped. */
++ uint32_t domid;
++ /* The grant reference of the grant to be mapped. */
++ uint32_t ref;
++};
++
++/*
++ * Inserts the grant references into the mapping table of an instance
++ * of gntdev. N.B. This does not perform the mapping, which is deferred
++ * until mmap() is called with @index as the offset.
++ */
++#define IOCTL_GNTDEV_MAP_GRANT_REF \
++_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
++struct ioctl_gntdev_map_grant_ref {
++ /* IN parameters */
++ /* The number of grants to be mapped. */
++ uint32_t count;
++ uint32_t pad;
++ /* OUT parameters */
++ /* The offset to be used on a subsequent call to mmap(). */
++ uint64_t index;
++ /* Variable IN parameter. */
++ /* Array of grant references, of size @count. */
++ struct ioctl_gntdev_grant_ref refs[1];
++};
++
++/*
++ * Removes the grant references from the mapping table of an instance
++ * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
++ * before this ioctl is called, or an error will result.
++ */
++#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
++_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
++struct ioctl_gntdev_unmap_grant_ref {
++ /* IN parameters */
++ /* The offset was returned by the corresponding map operation. */
++ uint64_t index;
++ /* The number of pages to be unmapped. */
++ uint32_t count;
++ uint32_t pad;
++};
++
++/*
++ * Returns the offset in the driver's address space that corresponds
++ * to @vaddr. This can be used to perform a munmap(), followed by an
++ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
++ * the caller. The number of pages that were allocated at the same time as
++ * @vaddr is returned in @count.
++ *
++ * N.B. Where more than one page has been mapped into a contiguous range, the
++ * supplied @vaddr must correspond to the start of the range; otherwise
++ * an error will result. It is only possible to munmap() the entire
++ * contiguously-allocated range at once, and not any subrange thereof.
++ */
++#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
++_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
++struct ioctl_gntdev_get_offset_for_vaddr {
++ /* IN parameters */
++ /* The virtual address of the first mapped page in a range. */
++ uint64_t vaddr;
++ /* OUT parameters */
++ /* The offset that was used in the initial mmap() operation. */
++ uint64_t offset;
++ /* The number of pages mapped in the VM area that begins at @vaddr. */
++ uint32_t count;
++ uint32_t pad;
++};
++
++/*
++ * Sets the maximum number of grants that may be mapped at once by this gntdev
++ * instance.
++ *
++ * N.B. This must be called before any other ioctl is performed on the device.
++ */
++#define IOCTL_GNTDEV_SET_MAX_GRANTS \
++_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
++struct ioctl_gntdev_set_max_grants {
++ /* IN parameter */
++ /* The maximum number of grants that may be mapped at once. */
++ uint32_t count;
++};
++
++#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
+diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
+index a40f1cd..9e54167 100644
+--- a/include/xen/grant_table.h
++++ b/include/xen/grant_table.h
+@@ -37,10 +37,16 @@
+ #ifndef __ASM_GNTTAB_H__
+ #define __ASM_GNTTAB_H__
+
+-#include <asm/xen/hypervisor.h>
++#include <asm/page.h>
++
++#include <xen/interface/xen.h>
+ #include <xen/interface/grant_table.h>
++
++#include <asm/xen/hypervisor.h>
+ #include <asm/xen/grant_table.h>
+
++#include <xen/features.h>
++
+ /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
+ #define NR_GRANT_FRAMES 4
+
+@@ -51,6 +57,8 @@ struct gnttab_free_callback {
+ u16 count;
+ };
+
++void gnttab_reset_grant_page(struct page *page);
++
+ int gnttab_suspend(void);
+ int gnttab_resume(void);
+
+@@ -80,6 +88,8 @@ unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
+
+ int gnttab_query_foreign_access(grant_ref_t ref);
+
++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep);
++
+ /*
+ * operations on reserved batches of grant references
+ */
+@@ -106,6 +116,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
+ void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
+ unsigned long pfn);
+
++static inline void
++gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr,
++ uint32_t flags, grant_ref_t ref, domid_t domid)
++{
++ if (flags & GNTMAP_contains_pte)
++ map->host_addr = addr;
++ else if (xen_feature(XENFEAT_auto_translated_physmap))
++ map->host_addr = __pa(addr);
++ else
++ map->host_addr = addr;
++
++ map->flags = flags;
++ map->ref = ref;
++ map->dom = domid;
++}
++
++static inline void
++gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
++ uint32_t flags, grant_handle_t handle)
++{
++ if (flags & GNTMAP_contains_pte)
++ unmap->host_addr = addr;
++ else if (xen_feature(XENFEAT_auto_translated_physmap))
++ unmap->host_addr = __pa(addr);
++ else
++ unmap->host_addr = addr;
++
++ unmap->handle = handle;
++ unmap->dev_bus_addr = 0;
++}
++
+ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ struct grant_entry **__shared);
+diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h
+index 39da93c..8211af8 100644
+--- a/include/xen/interface/grant_table.h
++++ b/include/xen/interface/grant_table.h
+@@ -321,6 +321,28 @@ struct gnttab_query_size {
+ DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
+
+ /*
++ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings
++ * tracked by <handle> but atomically replace the page table entry with one
++ * pointing to the machine address under <new_addr>. <new_addr> will be
++ * redirected to the null entry.
++ * NOTES:
++ * 1. The call may fail in an undefined manner if either mapping is not
++ * tracked by <handle>.
++ * 2. After executing a batch of unmaps, it is guaranteed that no stale
++ * mappings will remain in the device or host TLBs.
++ */
++#define GNTTABOP_unmap_and_replace 7
++struct gnttab_unmap_and_replace {
++ /* IN parameters. */
++ uint64_t host_addr;
++ uint64_t new_addr;
++ grant_handle_t handle;
++ /* OUT parameters. */
++ int16_t status; /* GNTST_* */
++};
++DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace);
++
++/*
+ * Bitfield values for update_pin_status.flags.
+ */
+ /* Map the grant entry for access by I/O devices. */
+diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
+index e8cbf43..865dcf0 100644
+--- a/include/xen/interface/io/ring.h
++++ b/include/xen/interface/io/ring.h
+@@ -73,7 +73,8 @@ union __name##_sring_entry { \
+ struct __name##_sring { \
+ RING_IDX req_prod, req_event; \
+ RING_IDX rsp_prod, rsp_event; \
+- uint8_t pad[48]; \
++ uint8_t netfront_smartpoll_active; \
++ uint8_t pad[47]; \
+ union __name##_sring_entry ring[1]; /* variable-length */ \
+ }; \
+ \
+diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
+index af36ead..eac3ce1 100644
+--- a/include/xen/interface/memory.h
++++ b/include/xen/interface/memory.h
+@@ -9,6 +9,8 @@
+ #ifndef __XEN_PUBLIC_MEMORY_H__
+ #define __XEN_PUBLIC_MEMORY_H__
+
++#include <linux/spinlock.h>
++
+ /*
+ * Increase or decrease the specified domain's memory reservation. Returns a
+ * -ve errcode on failure, or the # extents successfully allocated or freed.
+@@ -53,6 +55,48 @@ struct xen_memory_reservation {
+ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
+
+ /*
++ * An atomic exchange of memory pages. If return code is zero then
++ * @out.extent_list provides GMFNs of the newly-allocated memory.
++ * Returns zero on complete success, otherwise a negative error code.
++ * On complete success then always @nr_exchanged == @in.nr_extents.
++ * On partial success @nr_exchanged indicates how much work was done.
++ */
++#define XENMEM_exchange 11
++struct xen_memory_exchange {
++ /*
++ * [IN] Details of memory extents to be exchanged (GMFN bases).
++ * Note that @in.address_bits is ignored and unused.
++ */
++ struct xen_memory_reservation in;
++
++ /*
++ * [IN/OUT] Details of new memory extents.
++ * We require that:
++ * 1. @in.domid == @out.domid
++ * 2. @in.nr_extents << @in.extent_order ==
++ * @out.nr_extents << @out.extent_order
++ * 3. @in.extent_start and @out.extent_start lists must not overlap
++ * 4. @out.extent_start lists GPFN bases to be populated
++ * 5. @out.extent_start is overwritten with allocated GMFN bases
++ */
++ struct xen_memory_reservation out;
++
++ /*
++ * [OUT] Number of input extents that were successfully exchanged:
++ * 1. The first @nr_exchanged input extents were successfully
++ * deallocated.
++ * 2. The corresponding first entries in the output extent list correctly
++ * indicate the GMFNs that were successfully exchanged.
++ * 3. All other input and output extents are untouched.
++ * 4. If not all input extents are exchanged then the return code of this
++ * command will be non-zero.
++ * 5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!
++ */
++ unsigned long nr_exchanged;
++};
++
++DEFINE_GUEST_HANDLE_STRUCT(xen_memory_exchange);
++/*
+ * Returns the maximum machine frame number of mapped RAM in this system.
+ * This command always succeeds (it never returns an error code).
+ * arg == NULL.
+@@ -97,6 +141,19 @@ struct xen_machphys_mfn_list {
+ DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
+
+ /*
++ * Returns the location in virtual address space of the machine_to_phys
++ * mapping table. Architectures which do not have a m2p table, or which do not
++ * map it by default into guest address space, do not implement this command.
++ * arg == addr of xen_machphys_mapping_t.
++ */
++#define XENMEM_machphys_mapping 12
++struct xen_machphys_mapping {
++ unsigned long v_start, v_end; /* Start and end virtual addresses. */
++ unsigned long max_mfn; /* Maximum MFN that can be looked up. */
++};
++DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping_t);
++
++/*
+ * Sets the GPFN at which a particular page appears in the specified guest's
+ * pseudophysical address space.
+ * arg == addr of xen_add_to_physmap_t.
+@@ -142,4 +199,39 @@ struct xen_translate_gpfn_list {
+ };
+ DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
+
++/*
++ * Returns the pseudo-physical memory map as it was when the domain
++ * was started (specified by XENMEM_set_memory_map).
++ * arg == addr of struct xen_memory_map.
++ */
++#define XENMEM_memory_map 9
++struct xen_memory_map {
++ /*
++ * On call the number of entries which can be stored in buffer. On
++ * return the number of entries which have been stored in
++ * buffer.
++ */
++ unsigned int nr_entries;
++
++ /*
++ * Entries in the buffer are in the same format as returned by the
++ * BIOS INT 0x15 EAX=0xE820 call.
++ */
++ GUEST_HANDLE(void) buffer;
++};
++DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
++
++/*
++ * Returns the real physical memory map. Passes the same structure as
++ * XENMEM_memory_map.
++ * arg == addr of struct xen_memory_map.
++ */
++#define XENMEM_machine_memory_map 10
++
++
++/*
++ * Prevent the balloon driver from changing the memory reservation
++ * during a driver critical region.
++ */
++extern spinlock_t xen_reservation_lock;
+ #endif /* __XEN_PUBLIC_MEMORY_H__ */
+diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
+index cd69391..39c2b51 100644
+--- a/include/xen/interface/physdev.h
++++ b/include/xen/interface/physdev.h
+@@ -106,6 +106,57 @@ struct physdev_irq {
+ uint32_t vector;
+ };
+
++#define MAP_PIRQ_TYPE_MSI 0x0
++#define MAP_PIRQ_TYPE_GSI 0x1
++#define MAP_PIRQ_TYPE_UNKNOWN 0x2
++
++#define PHYSDEVOP_map_pirq 13
++struct physdev_map_pirq {
++ domid_t domid;
++ /* IN */
++ int type;
++ /* IN */
++ int index;
++ /* IN or OUT */
++ int pirq;
++ /* IN */
++ int bus;
++ /* IN */
++ int devfn;
++ /* IN */
++ int entry_nr;
++ /* IN */
++ uint64_t table_base;
++};
++
++#define PHYSDEVOP_unmap_pirq 14
++struct physdev_unmap_pirq {
++ domid_t domid;
++ /* IN */
++ int pirq;
++};
++
++#define PHYSDEVOP_manage_pci_add 15
++#define PHYSDEVOP_manage_pci_remove 16
++struct physdev_manage_pci {
++ /* IN */
++ uint8_t bus;
++ uint8_t devfn;
++};
++
++#define PHYSDEVOP_manage_pci_add_ext 20
++struct physdev_manage_pci_ext {
++ /* IN */
++ uint8_t bus;
++ uint8_t devfn;
++ unsigned is_extfn;
++ unsigned is_virtfn;
++ struct {
++ uint8_t bus;
++ uint8_t devfn;
++ } physfn;
++};
++
+ /*
+ * Argument to physdev_op_compat() hypercall. Superceded by new physdev_op()
+ * hypercall since 0x00030202.
+@@ -121,6 +172,16 @@ struct physdev_op {
+ } u;
+ };
+
++#define PHYSDEVOP_setup_gsi 21
++struct physdev_setup_gsi {
++ int gsi;
++ /* IN */
++ uint8_t triggering;
++ /* IN */
++ uint8_t polarity;
++ /* IN */
++};
++
+ /*
+ * Notify that some PIRQ-bound event channels have been unmasked.
+ * ** This command is obsolete since interface version 0x00030202 and is **
+diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h
+new file mode 100644
+index 0000000..83e4714
+--- /dev/null
++++ b/include/xen/interface/platform.h
+@@ -0,0 +1,222 @@
++/******************************************************************************
++ * platform.h
++ *
++ * Hardware platform operations. Intended for use by domain-0 kernel.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2002-2006, K Fraser
++ */
++
++#ifndef __XEN_PUBLIC_PLATFORM_H__
++#define __XEN_PUBLIC_PLATFORM_H__
++
++#include "xen.h"
++
++#define XENPF_INTERFACE_VERSION 0x03000001
++
++/*
++ * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
++ * 1 January, 1970 if the current system time was <system_time>.
++ */
++#define XENPF_settime 17
++struct xenpf_settime {
++ /* IN variables. */
++ uint32_t secs;
++ uint32_t nsecs;
++ uint64_t system_time;
++};
++typedef struct xenpf_settime xenpf_settime_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime_t);
++
++/*
++ * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
++ * On x86, @type is an architecture-defined MTRR memory type.
++ * On success, returns the MTRR that was used (@reg) and a handle that can
++ * be passed to XENPF_del_memtype to accurately tear down the new setting.
++ * (x86-specific).
++ */
++#define XENPF_add_memtype 31
++struct xenpf_add_memtype {
++ /* IN variables. */
++ unsigned long mfn;
++ uint64_t nr_mfns;
++ uint32_t type;
++ /* OUT variables. */
++ uint32_t handle;
++ uint32_t reg;
++};
++typedef struct xenpf_add_memtype xenpf_add_memtype_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_add_memtype_t);
++
++/*
++ * Tear down an existing memory-range type. If @handle is remembered then it
++ * should be passed in to accurately tear down the correct setting (in case
++ * of overlapping memory regions with differing types). If it is not known
++ * then @handle should be set to zero. In all cases @reg must be set.
++ * (x86-specific).
++ */
++#define XENPF_del_memtype 32
++struct xenpf_del_memtype {
++ /* IN variables. */
++ uint32_t handle;
++ uint32_t reg;
++};
++typedef struct xenpf_del_memtype xenpf_del_memtype_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_del_memtype_t);
++
++/* Read current type of an MTRR (x86-specific). */
++#define XENPF_read_memtype 33
++struct xenpf_read_memtype {
++ /* IN variables. */
++ uint32_t reg;
++ /* OUT variables. */
++ unsigned long mfn;
++ uint64_t nr_mfns;
++ uint32_t type;
++};
++typedef struct xenpf_read_memtype xenpf_read_memtype_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_read_memtype_t);
++
++#define XENPF_microcode_update 35
++struct xenpf_microcode_update {
++ /* IN variables. */
++ GUEST_HANDLE(void) data; /* Pointer to microcode data */
++ uint32_t length; /* Length of microcode data. */
++};
++typedef struct xenpf_microcode_update xenpf_microcode_update_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_microcode_update_t);
++
++#define XENPF_platform_quirk 39
++#define QUIRK_NOIRQBALANCING 1 /* Do not restrict IO-APIC RTE targets */
++#define QUIRK_IOAPIC_BAD_REGSEL 2 /* IO-APIC REGSEL forgets its value */
++#define QUIRK_IOAPIC_GOOD_REGSEL 3 /* IO-APIC REGSEL behaves properly */
++struct xenpf_platform_quirk {
++ /* IN variables. */
++ uint32_t quirk_id;
++};
++typedef struct xenpf_platform_quirk xenpf_platform_quirk_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_platform_quirk_t);
++
++#define XENPF_firmware_info 50
++#define XEN_FW_DISK_INFO 1 /* from int 13 AH=08/41/48 */
++#define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */
++#define XEN_FW_VBEDDC_INFO 3 /* from int 10 AX=4f15 */
++struct xenpf_firmware_info {
++ /* IN variables. */
++ uint32_t type;
++ uint32_t index;
++ /* OUT variables. */
++ union {
++ struct {
++ /* Int13, Fn48: Check Extensions Present. */
++ uint8_t device; /* %dl: bios device number */
++ uint8_t version; /* %ah: major version */
++ uint16_t interface_support; /* %cx: support bitmap */
++ /* Int13, Fn08: Legacy Get Device Parameters. */
++ uint16_t legacy_max_cylinder; /* %cl[7:6]:%ch: max cyl # */
++ uint8_t legacy_max_head; /* %dh: max head # */
++ uint8_t legacy_sectors_per_track; /* %cl[5:0]: max sector # */
++ /* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). */
++ /* NB. First uint16_t of buffer must be set to buffer size. */
++ GUEST_HANDLE(void) edd_params;
++ } disk_info; /* XEN_FW_DISK_INFO */
++ struct {
++ uint8_t device; /* bios device number */
++ uint32_t mbr_signature; /* offset 0x1b8 in mbr */
++ } disk_mbr_signature; /* XEN_FW_DISK_MBR_SIGNATURE */
++ struct {
++ /* Int10, AX=4F15: Get EDID info. */
++ uint8_t capabilities;
++ uint8_t edid_transfer_time;
++ /* must refer to 128-byte buffer */
++ GUEST_HANDLE(uchar) edid;
++ } vbeddc_info; /* XEN_FW_VBEDDC_INFO */
++ } u;
++};
++typedef struct xenpf_firmware_info xenpf_firmware_info_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_firmware_info_t);
++
++#define XENPF_enter_acpi_sleep 51
++struct xenpf_enter_acpi_sleep {
++ /* IN variables */
++ uint16_t pm1a_cnt_val; /* PM1a control value. */
++ uint16_t pm1b_cnt_val; /* PM1b control value. */
++ uint32_t sleep_state; /* Which state to enter (Sn). */
++ uint32_t flags; /* Must be zero. */
++};
++typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_enter_acpi_sleep_t);
++
++#define XENPF_change_freq 52
++struct xenpf_change_freq {
++ /* IN variables */
++ uint32_t flags; /* Must be zero. */
++ uint32_t cpu; /* Physical cpu. */
++ uint64_t freq; /* New frequency (Hz). */
++};
++typedef struct xenpf_change_freq xenpf_change_freq_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_change_freq_t);
++
++/*
++ * Get idle times (nanoseconds since boot) for physical CPUs specified in the
++ * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is
++ * indexed by CPU number; only entries with the corresponding @cpumap_bitmap
++ * bit set are written to. On return, @cpumap_bitmap is modified so that any
++ * non-existent CPUs are cleared. Such CPUs have their @idletime array entry
++ * cleared.
++ */
++#define XENPF_getidletime 53
++struct xenpf_getidletime {
++ /* IN/OUT variables */
++ /* IN: CPUs to interrogate; OUT: subset of IN which are present */
++ GUEST_HANDLE(uchar) cpumap_bitmap;
++ /* IN variables */
++ /* Size of cpumap bitmap. */
++ uint32_t cpumap_nr_cpus;
++ /* Must be indexable for every cpu in cpumap_bitmap. */
++ GUEST_HANDLE(uint64_t) idletime;
++ /* OUT variables */
++ /* System time when the idletime snapshots were taken. */
++ uint64_t now;
++};
++typedef struct xenpf_getidletime xenpf_getidletime_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_getidletime_t);
++
++/*
++ * Request block for a platform-op hypercall. @cmd carries one of the
++ * XENPF_* command numbers and @u holds that command's argument structure
++ * (presumably the member is selected by @cmd - confirm against the
++ * hypervisor ABI). The pad[] member keeps the union at least 128 bytes
++ * in size regardless of which argument structures are present.
++ */
++struct xen_platform_op {
++ uint32_t cmd;
++ uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
++ union {
++ struct xenpf_settime settime;
++ struct xenpf_add_memtype add_memtype;
++ struct xenpf_del_memtype del_memtype;
++ struct xenpf_read_memtype read_memtype;
++ struct xenpf_microcode_update microcode;
++ struct xenpf_platform_quirk platform_quirk;
++ struct xenpf_firmware_info firmware_info;
++ struct xenpf_enter_acpi_sleep enter_acpi_sleep;
++ struct xenpf_change_freq change_freq;
++ struct xenpf_getidletime getidletime;
++ uint8_t pad[128];
++ } u;
++};
++typedef struct xen_platform_op xen_platform_op_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_platform_op_t);
++
++#endif /* __XEN_PUBLIC_PLATFORM_H__ */
+diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
+index 2befa3e..327db61 100644
+--- a/include/xen/interface/xen.h
++++ b/include/xen/interface/xen.h
+@@ -184,6 +184,8 @@
+ #define MMUEXT_NEW_USER_BASEPTR 15
+
+ #ifndef __ASSEMBLY__
++#include <linux/types.h>
++
+ struct mmuext_op {
+ unsigned int cmd;
+ union {
+@@ -449,6 +451,45 @@ struct start_info {
+ int8_t cmd_line[MAX_GUEST_CMDLINE];
+ };
+
++/*
++ * Description of the VGA console. @video_type holds one of the
++ * XEN_VGATYPE_* values defined below; presumably it tells which member of
++ * @u is meaningful (text_mode_3 vs. vesa_lfb) - confirm against the
++ * hypervisor interface.
++ */
++struct dom0_vga_console_info {
++ uint8_t video_type; /* XEN_VGATYPE_??? */
++#define XEN_VGATYPE_TEXT_MODE_3 0x03
++#define XEN_VGATYPE_VESA_LFB 0x23
++
++ union {
++ struct {
++ /* Font height, in pixels. */
++ uint16_t font_height;
++ /* Cursor location (column, row). */
++ uint16_t cursor_x, cursor_y;
++ /* Number of rows and columns (dimensions in characters). */
++ uint16_t rows, columns;
++ } text_mode_3;
++
++ struct {
++ /* Width and height, in pixels. */
++ uint16_t width, height;
++ /* Bytes per scan line. */
++ uint16_t bytes_per_line;
++ /* Bits per pixel. */
++ uint16_t bits_per_pixel;
++ /* LFB physical address, and size (in units of 64kB). */
++ uint32_t lfb_base;
++ uint32_t lfb_size;
++ /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
++ uint8_t red_pos, red_size;
++ uint8_t green_pos, green_size;
++ uint8_t blue_pos, blue_size;
++ uint8_t rsvd_pos, rsvd_size;
++
++ /* VESA capabilities (offset 0xa, VESA command 0x4f00). */
++ uint32_t gbl_caps;
++ /* Mode attributes (offset 0x0, VESA command 0x4f01). */
++ uint16_t mode_attrs;
++ } vesa_lfb;
++ } u;
++};
++
+ /* These flags are passed in the 'flags' field of start_info_t. */
+ #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
+ #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
+@@ -461,6 +502,8 @@ typedef uint8_t xen_domain_handle_t[16];
+ #define __mk_unsigned_long(x) x ## UL
+ #define mk_unsigned_long(x) __mk_unsigned_long(x)
+
++DEFINE_GUEST_HANDLE(uint64_t);
++
+ #else /* __ASSEMBLY__ */
+
+ /* In assembly code we cannot use C numeric constant suffixes. */
+diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h
+new file mode 100644
+index 0000000..b42cdfd
+--- /dev/null
++++ b/include/xen/privcmd.h
+@@ -0,0 +1,80 @@
++/******************************************************************************
++ * privcmd.h
++ *
++ * Interface to /proc/xen/privcmd.
++ *
++ * Copyright (c) 2003-2005, K A Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __LINUX_PUBLIC_PRIVCMD_H__
++#define __LINUX_PUBLIC_PRIVCMD_H__
++
++#include <linux/types.h>
++
++typedef unsigned long xen_pfn_t;
++
++#ifndef __user
++#define __user
++#endif
++
++/*
++ * Argument of IOCTL_PRIVCMD_HYPERCALL. Presumably @op is the hypercall
++ * number and @arg its (up to five) arguments - confirm against the
++ * privcmd driver.
++ */
++struct privcmd_hypercall {
++ __u64 op;
++ __u64 arg[5];
++};
++
++/*
++ * Element type of the array that struct privcmd_mmap points to through
++ * its @entry member. Presumably @va is a virtual address, @mfn a machine
++ * frame number and @npages a page count - confirm against the driver.
++ */
++struct privcmd_mmap_entry {
++ __u64 va;
++ __u64 mfn;
++ __u64 npages;
++};
++
++/*
++ * Argument of IOCTL_PRIVCMD_MMAP. @entry is a user-space pointer;
++ * presumably @num is the number of elements it points to - confirm
++ * against the driver.
++ */
++struct privcmd_mmap {
++ int num;
++ domid_t dom; /* target domain */
++ struct privcmd_mmap_entry __user *entry;
++};
++
++/*
++ * Argument of IOCTL_PRIVCMD_MMAPBATCH. @arr is a user-space pointer to
++ * an array of xen_pfn_t values.
++ */
++struct privcmd_mmapbatch {
++ int num; /* number of pages to populate */
++ domid_t dom; /* target domain */
++ __u64 addr; /* virtual address */
++ xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
++};
++
++/*
++ * @cmd: IOCTL_PRIVCMD_HYPERCALL
++ * @arg: pointer to a struct privcmd_hypercall
++ * Return: Value returned from execution of the specified hypercall.
++ *
++ * Each ioctl number below encodes the size of its argument structure;
++ * the command numbers used are 0, 2 and 3 (1 is not defined here).
++ */
++#define IOCTL_PRIVCMD_HYPERCALL \
++ _IOC(_IOC_NONE, 'P', 0, sizeof(struct privcmd_hypercall))
++#define IOCTL_PRIVCMD_MMAP \
++ _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap))
++#define IOCTL_PRIVCMD_MMAPBATCH \
++ _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch))
++
++#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
+diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
+index 883a21b..c7b3ce9 100644
+--- a/include/xen/xen-ops.h
++++ b/include/xen/xen-ops.h
+@@ -14,4 +14,15 @@ void xen_mm_unpin_all(void);
+ void xen_timer_resume(void);
+ void xen_arch_resume(void);
+
++int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
++ unsigned long addr,
++ unsigned long mfn, int nr,
++ pgprot_t prot, unsigned domid);
++
++extern unsigned long *xen_contiguous_bitmap;
++int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
++ unsigned int address_bits);
++
++void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order);
++
+ #endif /* INCLUDE_XEN_OPS_H */
+diff --git a/include/xen/xen.h b/include/xen/xen.h
+new file mode 100644
+index 0000000..a164024
+--- /dev/null
++++ b/include/xen/xen.h
+@@ -0,0 +1,32 @@
++#ifndef _XEN_XEN_H
++#define _XEN_XEN_H
++
++/* The kind of environment the kernel is running in. */
++enum xen_domain_type {
++ XEN_NATIVE, /* running on bare hardware */
++ XEN_PV_DOMAIN, /* running in a PV domain */
++ XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
++};
++
++/*
++ * With CONFIG_XEN the domain type is a real variable defined elsewhere;
++ * without it the name collapses to the constant XEN_NATIVE, so every test
++ * against it becomes a compile-time constant.
++ */
++#ifdef CONFIG_XEN
++extern enum xen_domain_type xen_domain_type;
++#else
++#define xen_domain_type XEN_NATIVE
++#endif
++
++/* Non-zero when running under Xen at all, i.e. not on bare hardware. */
++#define xen_domain()		(xen_domain_type != XEN_NATIVE)
++/* Non-zero only in a PV domain (which implies xen_domain()). */
++#define xen_pv_domain()		(xen_domain_type == XEN_PV_DOMAIN)
++/* Non-zero only in an HVM domain (which implies xen_domain()). */
++#define xen_hvm_domain()	(xen_domain_type == XEN_HVM_DOMAIN)
++
++#ifdef CONFIG_XEN_DOM0
++#include <xen/interface/xen.h>
++#include <asm/xen/hypervisor.h>
++
++/*
++ * Non-zero when this is a PV domain whose start_info flags carry
++ * SIF_INITDOMAIN. Without CONFIG_XEN_DOM0 it is the constant 0.
++ */
++#define xen_initial_domain()					\
++	(xen_pv_domain() &&					\
++	 (xen_start_info->flags & SIF_INITDOMAIN))
++#else /* !CONFIG_XEN_DOM0 */
++#define xen_initial_domain()	(0)
++#endif /* CONFIG_XEN_DOM0 */
++
++#endif /* _XEN_XEN_H */
+diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
+index b9763ba..542ca7c 100644
+--- a/include/xen/xenbus.h
++++ b/include/xen/xenbus.h
+@@ -93,7 +93,7 @@ struct xenbus_driver {
+ int (*remove)(struct xenbus_device *dev);
+ int (*suspend)(struct xenbus_device *dev, pm_message_t state);
+ int (*resume)(struct xenbus_device *dev);
+- int (*uevent)(struct xenbus_device *, char **, int, char *, int);
++ int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *);
+ struct device_driver driver;
+ int (*read_otherend_details)(struct xenbus_device *dev);
+ int (*is_ready)(struct xenbus_device *dev);
+diff --git a/lib/Makefile b/lib/Makefile
+index 2e78277..7c31e3d 100644
+--- a/lib/Makefile
++++ b/lib/Makefile
+@@ -77,7 +77,8 @@ obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
+ obj-$(CONFIG_SMP) += percpu_counter.o
+ obj-$(CONFIG_AUDIT_GENERIC) += audit.o
+
+-obj-$(CONFIG_SWIOTLB) += swiotlb.o
++obj-$(CONFIG_SWIOTLB) += swiotlb-core.o swiotlb.o
++obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
+ obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
+ obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
+
+diff --git a/lib/swiotlb-core.c b/lib/swiotlb-core.c
+new file mode 100644
+index 0000000..a17c89e
+--- /dev/null
++++ b/lib/swiotlb-core.c
+@@ -0,0 +1,572 @@
++/*
++ * Dynamic DMA mapping support.
++ *
++ * This implementation is a fallback for platforms that do not support
++ * I/O TLBs (aka DMA address translation hardware).
++ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick at intel.com>
++ * Copyright (C) 2000 Goutham Rao <goutham.rao at intel.com>
++ * Copyright (C) 2000, 2003 Hewlett-Packard Co
++ * David Mosberger-Tang <davidm at hpl.hp.com>
++ *
++ * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API.
++ * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid
++ * unnecessary i-cache flushing.
++ * 04/07/.. ak Better overflow handling. Assorted fixes.
++ * 05/09/10 linville Add support for syncing ranges, support syncing for
++ * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
++ * 08/12/11 beckyb Add highmem support
++ */
++
++#include <linux/cache.h>
++#include <linux/dma-mapping.h>
++#include <linux/mm.h>
++#include <linux/module.h>
++#include <linux/spinlock.h>
++#include <linux/string.h>
++#include <linux/swiotlb.h>
++#include <linux/pfn.h>
++#include <linux/types.h>
++#include <linux/ctype.h>
++#include <linux/highmem.h>
++
++#include <linux/io.h>
++#include <asm/dma.h>
++#include <linux/scatterlist.h>
++
++#include <linux/init.h>
++#include <linux/bootmem.h>
++#include <linux/iommu-helper.h>
++
++/* Low bits of @val below @align; a remainder only if @align is a power of two. */
++#define OFFSET(val, align) ((unsigned long) ((val) & ((align) - 1)))
++
++/* Number of IO TLB slabs (each 1 << IO_TLB_SHIFT bytes) in one page. */
++#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
++
++/*
++ * Minimum IO TLB size to bother booting with. Systems with mainly
++ * 64bit capable cards will only lightly use the swiotlb. If we can't
++ * allocate a contiguous 1MB, we're probably in trouble anyway.
++ */
++#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
++
++/*
++ * Set to 1 by the "force" token of the swiotlb= option (see
++ * setup_io_tlb_npages).
++ */
++int swiotlb_force;
++
++/*
++ * Bounds of the bounce pool. is_swiotlb_buffer() range-checks physical
++ * addresses against them to see if the memory was in fact allocated by
++ * this API.
++ */
++char *io_tlb_start, *io_tlb_end;
++
++/*
++ * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
++ * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
++ */
++unsigned long io_tlb_nslabs;
++
++/*
++ * When the IOMMU overflows we return a fallback buffer. This sets the size.
++ */
++unsigned long io_tlb_overflow = 32*1024;
++
++/* The fallback buffer itself, io_tlb_overflow bytes long. */
++void *io_tlb_overflow_buffer;
++
++/*
++ * This is a free list describing the number of free entries available from
++ * each index
++ */
++static unsigned int *io_tlb_list;
++static unsigned int io_tlb_index;
++
++/*
++ * We need to save away the original address corresponding to a mapped entry
++ * for the sync operations.
++ */
++static phys_addr_t *io_tlb_orig_addr;
++
++/*
++ * Protect the above data structures in the map and unmap calls
++ */
++static DEFINE_SPINLOCK(io_tlb_lock);
++
++/*
++ * Set to 1 by swiotlb_init_late(); swiotlb_free() uses it to choose
++ * between free_pages() and free_bootmem_late().
++ */
++static int late_alloc;
++
++/*
++ * Parse one "<token>[=]<number>" item of the swiotlb= option.
++ *
++ * @token: keyword expected at the start of @str
++ * @str:   current position in the option string
++ * @endp:  receives the position where parsing stopped; it is left at
++ *         @str when @token does not match
++ *
++ * Returns the parsed number, or 0 when @token does not match or nothing
++ * follows it. This is a file-scope helper rather than a GCC nested
++ * function, and the value is kept as unsigned long so that it is not
++ * narrowed to int on the way out.
++ */
++static unsigned long __init
++swiotlb_get_value(const char *token, char *str, char **endp)
++{
++	size_t len = strlen(token);
++
++	*endp = str;
++	if (strncmp(str, token, len))
++		return 0;
++
++	str += len;
++	if (*str == '=')
++		++str;
++	*endp = str;
++	if (*str == '\0')
++		return 0;
++
++	return simple_strtoul(str, endp, 0);
++}
++
++/*
++ * Handler for the "swiotlb=" kernel command line option.
++ *
++ * Accepts a comma separated list in which each item is a bare number
++ * (old syntax for the slab count), "nslabs=<n>", "overflow=<bytes>" or
++ * "force". Slab counts are rounded up to a multiple of IO_TLB_SEGSIZE.
++ * Unrecognised items are skipped. Always returns 1.
++ */
++static int __init
++setup_io_tlb_npages(char *str)
++{
++	unsigned long val;
++
++	while (*str) {
++		/* The old syntax */
++		if (isdigit(*str)) {
++			io_tlb_nslabs = simple_strtoul(str, &str, 0);
++			/* avoid tail segment of size < IO_TLB_SEGSIZE */
++			io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
++		}
++		if (!strncmp(str, "force", 5))
++			swiotlb_force = 1;
++
++		/* The new syntax: swiotlb=nslabs=16384,overflow=32768,force */
++		val = swiotlb_get_value("nslabs", str, &str);
++		if (val)
++			io_tlb_nslabs = ALIGN(val, IO_TLB_SEGSIZE);
++
++		val = swiotlb_get_value("overflow", str, &str);
++		if (val)
++			io_tlb_overflow = val;
++
++		/* Move on to the item after the next comma, if any. */
++		str = strpbrk(str, ",");
++		if (!str)
++			break;
++		str++; /* skip ',' */
++	}
++	return 1;
++}
++__setup("swiotlb=", setup_io_tlb_npages);
++
++/*
++ * Log the size of the software IO TLB together with its virtual and
++ * physical address range.
++ */
++void swiotlb_print_info(void)
++{
++	phys_addr_t phys_lo = virt_to_phys(io_tlb_start);
++	phys_addr_t phys_hi = virt_to_phys(io_tlb_end);
++	unsigned long pool_bytes = io_tlb_nslabs << IO_TLB_SHIFT;
++
++	printk(KERN_INFO "DMA: Placing %luMB software IO TLB between %p - %p\n",
++	       pool_bytes >> 20, io_tlb_start, io_tlb_end);
++	printk(KERN_INFO "DMA: software IO TLB at phys %#llx - %#llx\n",
++	       (unsigned long long)phys_lo,
++	       (unsigned long long)phys_hi);
++}
++
++/*
++ * Boot-time set-up of the software IO TLB: take the bounce pool, its
++ * bookkeeping arrays and the overflow buffer from bootmem.
++ *
++ * @default_size: pool size in bytes, used only when io_tlb_nslabs has not
++ *                been set already
++ * @verbose:      non-zero to log the resulting layout
++ */
++void __init
++swiotlb_init_early(size_t default_size, int verbose)
++{
++	unsigned long slot, pool_bytes;
++
++	if (!io_tlb_nslabs) {
++		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
++		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
++	}
++	pool_bytes = io_tlb_nslabs << IO_TLB_SHIFT;
++
++	/* The bounce pool comes from the low pages. */
++	io_tlb_start = alloc_bootmem_low_pages(pool_bytes);
++	if (!io_tlb_start)
++		panic("DMA: Cannot allocate SWIOTLB buffer");
++	io_tlb_end = io_tlb_start + pool_bytes;
++
++	/*
++	 * Free list: entry N holds how many free slots follow N inside its
++	 * IO_TLB_SEGSIZE segment, so a fresh pool counts down within each
++	 * segment.
++	 */
++	io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
++	slot = 0;
++	while (slot < io_tlb_nslabs) {
++		io_tlb_list[slot] = IO_TLB_SEGSIZE -
++				    OFFSET(slot, IO_TLB_SEGSIZE);
++		slot++;
++	}
++	io_tlb_index = 0;
++	io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t));
++
++	/* Emergency buffer handed out when the pool is exhausted. */
++	io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
++	if (!io_tlb_overflow_buffer)
++		panic("DMA: Cannot allocate SWIOTLB overflow buffer!\n");
++
++	if (verbose)
++		swiotlb_print_info();
++}
++
++/* Boot-time initialisation using the default pool size of 64MB. */
++void __init
++swiotlb_init(int verbose)
++{
++	const size_t default_pool_bytes = 64 * (1<<20);
++
++	swiotlb_init_early(default_pool_bytes, verbose);
++}
++
++/*
++ * Systems with larger DMA zones (those that don't support ISA) can
++ * initialize the swiotlb later using the slab allocator if needed.
++ * This should be just like above, but with some error catching.
++ *
++ * @default_size: pool size in bytes, used only when io_tlb_nslabs is 0.
++ *
++ * Returns 0 on success. On failure returns -ENOMEM after releasing
++ * whatever was allocated and restoring io_tlb_nslabs to the value it had
++ * on entry.
++ */
++int
++swiotlb_init_late(size_t default_size)
++{
++ unsigned long i, bytes, req_nslabs = io_tlb_nslabs;
++ unsigned int order;
++
++ if (!io_tlb_nslabs) {
++ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
++ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
++ }
++
++ /*
++ * Get IO TLB memory from the low pages
++ */
++ order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
++ io_tlb_nslabs = SLABS_PER_PAGE << order;
++ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
++
++ /* Retry with ever smaller orders until an allocation succeeds. */
++ while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
++ io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
++ order);
++ if (io_tlb_start)
++ break;
++ order--;
++ }
++
++ if (!io_tlb_start)
++ goto cleanup1;
++
++ /* 'bytes' still reflects the requested size; shrink it if we had to. */
++ if (order != get_order(bytes)) {
++ /* NOTE(review): %ld is given an unsigned long expression here. */
++ printk(KERN_WARNING "DMA: Warning: only able to allocate %ld MB"
++ " for software IO TLB\n", (PAGE_SIZE << order) >> 20);
++ io_tlb_nslabs = SLABS_PER_PAGE << order;
++ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
++ }
++ io_tlb_end = io_tlb_start + bytes;
++ memset(io_tlb_start, 0, bytes);
++
++ /*
++ * Allocate and initialize the free list array. This array is used
++ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
++ * between io_tlb_start and io_tlb_end.
++ */
++ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
++ get_order(io_tlb_nslabs * sizeof(int)));
++ if (!io_tlb_list)
++ goto cleanup2;
++
++ for (i = 0; i < io_tlb_nslabs; i++)
++ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
++ io_tlb_index = 0;
++
++ io_tlb_orig_addr = (phys_addr_t *) __get_free_pages(GFP_KERNEL,
++ get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
++ if (!io_tlb_orig_addr)
++ goto cleanup3;
++
++ memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t));
++
++ /*
++ * Get the overflow emergency buffer
++ */
++ io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
++ get_order(io_tlb_overflow));
++ if (!io_tlb_overflow_buffer)
++ goto cleanup4;
++
++ swiotlb_print_info();
++
++ /* Tell swiotlb_free() that the page allocator owns the buffers. */
++ late_alloc = 1;
++
++ return 0;
++
++ /* Unwind in reverse order of allocation. */
++cleanup4:
++ free_pages((unsigned long)io_tlb_orig_addr,
++ get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
++ io_tlb_orig_addr = NULL;
++cleanup3:
++ free_pages((unsigned long)io_tlb_list,
++ get_order(io_tlb_nslabs * sizeof(int)));
++ io_tlb_list = NULL;
++cleanup2:
++ io_tlb_end = NULL;
++ free_pages((unsigned long)io_tlb_start, order);
++ io_tlb_start = NULL;
++cleanup1:
++ io_tlb_nslabs = req_nslabs;
++ return -ENOMEM;
++}
++
++/*
++ * Give back everything the swiotlb allocated: the overflow buffer, the
++ * original-address array, the free list and the pool itself. Does nothing
++ * when no overflow buffer exists. Buffers obtained by swiotlb_init_late()
++ * go back through free_pages(), boot-time ones through
++ * free_bootmem_late().
++ */
++void __init swiotlb_free(void)
++{
++	if (!io_tlb_overflow_buffer)
++		return;
++
++	if (!late_alloc) {
++		free_bootmem_late(__pa(io_tlb_overflow_buffer),
++				  io_tlb_overflow);
++		free_bootmem_late(__pa(io_tlb_orig_addr),
++				  io_tlb_nslabs * sizeof(phys_addr_t));
++		free_bootmem_late(__pa(io_tlb_list),
++				  io_tlb_nslabs * sizeof(int));
++		free_bootmem_late(__pa(io_tlb_start),
++				  io_tlb_nslabs << IO_TLB_SHIFT);
++		return;
++	}
++
++	free_pages((unsigned long)io_tlb_overflow_buffer,
++		   get_order(io_tlb_overflow));
++	free_pages((unsigned long)io_tlb_orig_addr,
++		   get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
++	free_pages((unsigned long)io_tlb_list,
++		   get_order(io_tlb_nslabs * sizeof(int)));
++	free_pages((unsigned long)io_tlb_start,
++		   get_order(io_tlb_nslabs << IO_TLB_SHIFT));
++}
++
++/*
++ * Returns 1 when @paddr lies inside the bounce pool, i.e. in the
++ * half-open physical range [io_tlb_start, io_tlb_end), otherwise 0.
++ */
++int is_swiotlb_buffer(phys_addr_t paddr)
++{
++	if (paddr < virt_to_phys(io_tlb_start))
++		return 0;
++	if (paddr >= virt_to_phys(io_tlb_end))
++		return 0;
++	return 1;
++}
++
++/*
++ * Bounce: copy @size bytes between the original buffer at physical
++ * address @phys and the swiotlb bounce buffer at @dma_addr.
++ *
++ * With @dir == DMA_TO_DEVICE the data goes from the original buffer into
++ * the bounce buffer; with any other @dir it goes from the bounce buffer
++ * back to the original location.
++ */
++void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
++ enum dma_data_direction dir)
++{
++ unsigned long pfn = PFN_DOWN(phys);
++
++ if (PageHighMem(pfn_to_page(pfn))) {
++ /* The buffer does not have a mapping. Map it in and copy */
++ unsigned int offset = phys & ~PAGE_MASK;
++ char *buffer;
++ unsigned int sz = 0;
++ unsigned long flags;
++
++ /* Copy page by page; only the first page can start mid-page. */
++ while (size) {
++ sz = min_t(size_t, PAGE_SIZE - offset, size);
++
++ /* Interrupts stay off while the atomic kmap slot is in use. */
++ local_irq_save(flags);
++ buffer = kmap_atomic(pfn_to_page(pfn),
++ KM_BOUNCE_READ);
++ if (dir == DMA_TO_DEVICE)
++ memcpy(dma_addr, buffer + offset, sz);
++ else
++ memcpy(buffer + offset, dma_addr, sz);
++ kunmap_atomic(buffer, KM_BOUNCE_READ);
++ local_irq_restore(flags);
++
++ size -= sz;
++ pfn++;
++ dma_addr += sz;
++ offset = 0;
++ }
++ } else {
++ /* Lowmem: the original buffer is directly addressable. */
++ if (dir == DMA_TO_DEVICE)
++ memcpy(dma_addr, phys_to_virt(phys), size);
++ else
++ memcpy(phys_to_virt(phys), dma_addr, size);
++ }
++}
++
++/*
++ * Allocates bounce buffer and returns its kernel virtual address.
++ *
++ * @hwdev:          device the mapping is for; supplies the segment
++ *                  boundary mask
++ * @phys:           physical address of the original buffer
++ * @start_dma_addr: address of the start of the pool as the device sees it
++ *                  (callers in this patch pass the bus address of
++ *                  io_tlb_start)
++ * @size:           length of the mapping in bytes; must be non-zero
++ * @dir:            DMA direction; for DMA_TO_DEVICE and DMA_BIDIRECTIONAL
++ *                  the original data is copied into the bounce buffer
++ *
++ * Returns NULL when no sufficiently large run of free slots exists.
++ */
++void *
++do_map_single(struct device *hwdev, phys_addr_t phys,
++ unsigned long start_dma_addr, size_t size, int dir)
++{
++ unsigned long flags;
++ char *dma_addr;
++ unsigned int nslots, stride, index, wrap;
++ int i;
++ unsigned long mask;
++ unsigned long offset_slots;
++ unsigned long max_slots;
++
++ mask = dma_get_seg_boundary(hwdev);
++ start_dma_addr = start_dma_addr & mask;
++ offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
++
++ /*
++ * Carefully handle integer overflow which can occur when mask == ~0UL.
++ */
++ max_slots = mask + 1
++ ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
++ : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
++
++ /*
++ * For mappings greater than a page, we limit the stride (and
++ * hence alignment) to a page size.
++ */
++ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
++ if (size > PAGE_SIZE)
++ stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
++ else
++ stride = 1;
++
++ BUG_ON(!nslots);
++
++ /*
++ * Find suitable number of IO TLB entries size that will fit this
++ * request and allocate a buffer from that IO TLB pool.
++ */
++ spin_lock_irqsave(&io_tlb_lock, flags);
++ /* Resume the search where the previous allocation left off. */
++ index = ALIGN(io_tlb_index, stride);
++ if (index >= io_tlb_nslabs)
++ index = 0;
++ wrap = index;
++
++ do {
++ /* Skip candidates that would cross the device's segment boundary. */
++ while (iommu_is_span_boundary(index, nslots, offset_slots,
++ max_slots)) {
++ index += stride;
++ if (index >= io_tlb_nslabs)
++ index = 0;
++ if (index == wrap)
++ goto not_found;
++ }
++
++ /*
++ * If we find a slot that indicates we have 'nslots' number of
++ * contiguous buffers, we allocate the buffers from that slot
++ * and mark the entries as '0' indicating unavailable.
++ */
++ if (io_tlb_list[index] >= nslots) {
++ int count = 0;
++
++ for (i = index; i < (int) (index + nslots); i++)
++ io_tlb_list[i] = 0;
++ /*
++ * Renumber the free slots that precede the allocation in the
++ * same segment so they count up to the new used region.
++ */
++ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE)
++ != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
++ io_tlb_list[i] = ++count;
++ dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
++
++ /*
++ * Update the indices to avoid searching in the next
++ * round.
++ */
++ io_tlb_index = ((index + nslots) < io_tlb_nslabs
++ ? (index + nslots) : 0);
++
++ goto found;
++ }
++ index += stride;
++ if (index >= io_tlb_nslabs)
++ index = 0;
++ } while (index != wrap);
++
++not_found:
++ spin_unlock_irqrestore(&io_tlb_lock, flags);
++ return NULL;
++found:
++ spin_unlock_irqrestore(&io_tlb_lock, flags);
++
++ /*
++ * Save away the mapping from the original address to the DMA address.
++ * This is needed when we sync the memory. Then we sync the buffer if
++ * needed.
++ *
++ * Note that this runs after io_tlb_lock has been dropped.
++ */
++ for (i = 0; i < nslots; i++)
++ io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
++ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
++ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
++
++ return dma_addr;
++}
++
++/*
++ * dma_addr is the kernel virtual address of the bounce buffer to unmap.
++ *
++ * For DMA_FROM_DEVICE and DMA_BIDIRECTIONAL mappings the bounce buffer is
++ * first copied back to the original location recorded for its first slot
++ * (skipped when that recorded address is 0). The slots are then returned
++ * to the free list under io_tlb_lock.
++ */
++void
++do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
++{
++ unsigned long flags;
++ int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
++ int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
++ phys_addr_t phys = io_tlb_orig_addr[index];
++
++ /*
++ * First, sync the memory before unmapping the entry
++ */
++ if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
++ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
++
++ /*
++ * Return the buffer to the free list by setting the corresponding
++ * entries to indicate the number of contiguous entries available.
++ * While returning the entries to the free list, we merge the entries
++ * with slots below and above the pool being returned.
++ */
++ spin_lock_irqsave(&io_tlb_lock, flags);
++ {
++ /*
++ * Free count of the slot right after this buffer, or 0 when the
++ * buffer ends at its segment boundary.
++ */
++ count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
++ io_tlb_list[index + nslots] : 0);
++ /*
++ * Step 1: return the slots to the free list, merging the
++ * slots with succeeding slots
++ */
++ for (i = index + nslots - 1; i >= index; i--)
++ io_tlb_list[i] = ++count;
++ /*
++ * Step 2: merge the returned slots with the preceding slots,
++ * if available (non zero)
++ */
++ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) !=
++ IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
++ io_tlb_list[i] = ++count;
++ }
++ spin_unlock_irqrestore(&io_tlb_lock, flags);
++}
++
++/*
++ * Synchronise part of a bounce buffer with the original buffer it shadows.
++ *
++ * @dma_addr: kernel virtual address inside the bounce pool
++ * @size:     number of bytes to copy
++ * @dir:      direction the mapping was created with
++ * @target:   SYNC_FOR_CPU copies bounce -> original, SYNC_FOR_DEVICE
++ *            copies original -> bounce; anything else is a BUG
++ *
++ * No copy is made when @dir shows the data never flows that way; any
++ * other unexpected @dir value trips a BUG_ON.
++ */
++void
++do_sync_single(struct device *hwdev, char *dma_addr, size_t size,
++	       int dir, int target)
++{
++	int slot = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
++	phys_addr_t orig = io_tlb_orig_addr[slot];
++
++	/* Add the offset of dma_addr inside its slot. */
++	orig += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
++
++	if (target == SYNC_FOR_CPU) {
++		if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
++			swiotlb_bounce(orig, dma_addr, size, DMA_FROM_DEVICE);
++		else
++			BUG_ON(dir != DMA_TO_DEVICE);
++	} else if (target == SYNC_FOR_DEVICE) {
++		if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
++			swiotlb_bounce(orig, dma_addr, size, DMA_TO_DEVICE);
++		else
++			BUG_ON(dir != DMA_FROM_DEVICE);
++	} else {
++		BUG();
++	}
++}
++/*
++ * Report that the bounce pool could not satisfy a mapping.
++ *
++ * @dev:      device the mapping was for (used for the log message)
++ * @size:     size of the failed request in bytes
++ * @dir:      DMA direction of the failed request
++ * @do_panic: non-zero to panic when the request is larger than the
++ *            overflow buffer
++ *
++ * Always logs an error. Returns normally when the request fits into the
++ * overflow buffer (size <= io_tlb_overflow) or @do_panic is 0; otherwise
++ * panics with a message that depends on @dir.
++ */
++void
++swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
++{
++	/*
++	 * Ran out of IOMMU space for this operation. This is very bad.
++	 * Unfortunately the drivers cannot handle this operation properly.
++	 * unless they check for dma_mapping_error (most don't)
++	 * When the mapping is small enough return a static buffer to limit
++	 * the damage, or panic when the transfer is too big.
++	 */
++	/* Terminate the line so the next log message starts on its own line. */
++	dev_err(dev, "DMA: Out of SW-IOMMU space for %zu bytes.\n", size);
++
++	if (size <= io_tlb_overflow || !do_panic)
++		return;
++
++	if (dir == DMA_BIDIRECTIONAL)
++		panic("DMA: Random memory could be DMA accessed\n");
++	if (dir == DMA_FROM_DEVICE)
++		panic("DMA: Random memory could be DMA written\n");
++	if (dir == DMA_TO_DEVICE)
++		panic("DMA: Random memory could be DMA read\n");
++}
+diff --git a/lib/swiotlb-xen.c b/lib/swiotlb-xen.c
+new file mode 100644
+index 0000000..bee577f
+--- /dev/null
++++ b/lib/swiotlb-xen.c
+@@ -0,0 +1,504 @@
++/* A software-based IOMMU that utilizes the swiotlb-core functionality.
++ * It can function on Xen when there are PCI devices present. */
++
++
++#include <linux/dma-mapping.h>
++#include <linux/io.h>
++#include <asm/dma.h>
++#include <linux/scatterlist.h>
++#include <xen/interface/xen.h>
++#include <xen/grant_table.h>
++
++#include <asm/xen/page.h>
++#include <xen/page.h>
++#include <xen/xen-ops.h>
++
++/*
++ * Convert a (pseudo-)physical address into the machine address that is
++ * used as the bus/DMA address.
++ */
++static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
++{
++	return phys_to_machine(XPADDR(paddr)).maddr;
++}
++
++/* Convert a bus (machine) address back into a (pseudo-)physical one. */
++static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
++{
++	xpaddr_t phys = machine_to_phys(XMADDR(baddr));
++
++	return phys.paddr;
++}
++
++/* Bus address of a kernel virtual address: virt -> phys -> bus. */
++static dma_addr_t xen_virt_to_bus(void *address)
++{
++	phys_addr_t phys = virt_to_phys(address);
++
++	return xen_phys_to_bus(phys);
++}
++
++/*
++ * Returns 1 when the pages covering @length bytes starting @offset bytes
++ * into page frame @pfn map to consecutive machine frames, 0 otherwise.
++ */
++static int check_pages_physically_contiguous(unsigned long pfn,
++					     unsigned int offset,
++					     size_t length)
++{
++	unsigned long expected_mfn = pfn_to_mfn(pfn);
++	int pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
++	int n = 1;
++
++	/* Walk the remaining pages; each must follow its predecessor's mfn. */
++	while (n < pages) {
++		pfn++;
++		expected_mfn++;
++		if (pfn_to_mfn(pfn) != expected_mfn)
++			return 0;
++		n++;
++	}
++	return 1;
++}
++
++/*
++ * Returns 1 when the buffer [p, p + size) spans more than one page and
++ * those pages are not contiguous in machine memory; 0 otherwise.
++ */
++static int range_straddles_page_boundary(phys_addr_t p, size_t size)
++{
++	unsigned int offset = p & ~PAGE_MASK;
++	unsigned long pfn = PFN_DOWN(p);
++
++	/* Entirely inside one page: nothing to straddle. */
++	if (offset + size <= PAGE_SIZE)
++		return 0;
++
++	return !check_pages_physically_contiguous(pfn, offset, size);
++}
++
++
++/*
++ * True when the device can DMA directly to the buffer: dma_capable()
++ * accepts @dev_addr/@size and the buffer at @phys does not straddle
++ * machine-discontiguous pages.
++ */
++bool xen_dma_capable(struct device *dev, dma_addr_t dev_addr,
++		     phys_addr_t phys, size_t size)
++{
++	return dma_capable(dev, dev_addr, size) &&
++	       !range_straddles_page_boundary(phys, size);
++}
++
++/*
++ * Returns non-zero when the bus address @dma_addr falls inside the
++ * swiotlb bounce pool.
++ */
++static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
++{
++	unsigned long mfn = PFN_DOWN(dma_addr);
++	unsigned long pfn = mfn_to_local_pfn(mfn);
++
++	/* If the address is outside our domain, it CAN have the same virtual
++	 * address as another address in our domain. Hence only check address
++	 * within our domain. */
++	if (!pfn_valid(pfn))
++		return 0;
++
++	return is_swiotlb_buffer(PFN_PHYS(pfn));
++}
++/*
++ * Allocate a coherent DMA buffer of @size bytes.
++ *
++ * The per-device coherent pool is tried first. Otherwise pages are taken
++ * from the page allocator and handed to xen_create_contiguous_region()
++ * with an address-bit limit derived from the device's coherent DMA mask
++ * (32 bits when @hwdev is NULL or has no coherent mask). The buffer is
++ * zeroed and its machine address is stored in @dma_handle.
++ *
++ * Returns the kernel virtual address, or NULL when the page allocation or
++ * the contiguous-region call fails (in which case @dma_handle is not
++ * written by this function).
++ */
++void *
++xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
++ dma_addr_t *dma_handle, gfp_t flags)
++{
++ void *ret;
++ int order = get_order(size);
++ u64 dma_mask = DMA_BIT_MASK(32);
++ unsigned long vstart;
++
++ /*
++ * Ignore region specifiers - the kernel's ideas of
++ * pseudo-phys memory layout has nothing to do with the
++ * machine physical layout. We can't allocate highmem
++ * because we can't return a pointer to it.
++ */
++ flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
++
++ if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
++ return ret;
++
++ vstart = __get_free_pages(flags, order);
++ ret = (void *)vstart;
++
++ if (hwdev && hwdev->coherent_dma_mask)
++ dma_mask = dma_alloc_coherent_mask(hwdev, flags);
++
++ if (ret) {
++ /* fls64(mask) is the number of address bits the device can drive. */
++ if (xen_create_contiguous_region(vstart, order,
++ fls64(dma_mask)) != 0) {
++ free_pages(vstart, order);
++ return NULL;
++ }
++ memset(ret, 0, size);
++ *dma_handle = virt_to_machine(ret).maddr;
++ }
++ return ret;
++}
++EXPORT_SYMBOL(xen_swiotlb_alloc_coherent);
++
++/*
++ * Release a buffer obtained from xen_swiotlb_alloc_coherent(). Buffers
++ * that belong to the per-device coherent pool are handed back there;
++ * anything else has its contiguous region destroyed and its pages freed.
++ */
++void
++xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
++			  dma_addr_t dev_addr)
++{
++	unsigned long vstart = (unsigned long)vaddr;
++	int order = get_order(size);
++
++	if (dma_release_from_coherent(hwdev, order, vaddr))
++		return;
++
++	xen_destroy_contiguous_region(vstart, order);
++	free_pages(vstart, order);
++}
++EXPORT_SYMBOL(xen_swiotlb_free_coherent);
++
++
++/* Upper limit for the address-bit count tried by xen_swiotlb_fixup(). */
++static int max_dma_bits = 32;
++
++/*
++ * Pass the buffer at @buf to xen_create_contiguous_region() one
++ * IO_TLB_SEGSIZE-slab segment at a time.
++ *
++ * @buf:    start of the buffer (kernel virtual address)
++ * @size:   buffer size in bytes; only used for the log message
++ * @nslabs: number of slabs of 1 << IO_TLB_SHIFT bytes in the buffer
++ *
++ * Each segment is retried with a growing address-bit limit until the call
++ * succeeds or the limit has passed max_dma_bits. Returns 0 on success or
++ * the error of the first segment that could not be fixed up.
++ *
++ * NOTE(review): dma_bits is not reset between segments, so once it has
++ * been raised all later segments start from the raised value.
++ * NOTE(review): the outer do/while runs once even for nslabs == 0, which
++ * calls get_order(0) - confirm callers never pass fewer than one slab.
++ */
++static int
++xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
++{
++ int i, rc;
++ int dma_bits;
++
++ printk(KERN_INFO "xen_swiotlb_fixup: buf=%p size=%zu\n",
++ buf, size);
++
++ /* Start with the bit count of a single segment's size. */
++ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
++
++ i = 0;
++ do {
++ /* The last segment may be shorter than IO_TLB_SEGSIZE. */
++ int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);
++
++ do {
++ rc = xen_create_contiguous_region(
++ (unsigned long)buf + (i << IO_TLB_SHIFT),
++ get_order(slabs << IO_TLB_SHIFT),
++ dma_bits);
++ } while (rc && dma_bits++ < max_dma_bits);
++ if (rc)
++ return rc;
++
++ i += slabs;
++ } while(i < nslabs);
++ return 0;
++}
++
++/*
++ * Boot-time initialisation of the Xen flavour of the swiotlb.
++ *
++ * Sets up the generic bounce pool with a 64MB default and then runs
++ * xen_swiotlb_fixup() over both the pool and the overflow buffer.
++ * Panics if either fixup fails.
++ *
++ * @verbose: passed through to swiotlb_init_early()
++ */
++void __init xen_swiotlb_init(int verbose)
++{
++	int rc;
++
++	swiotlb_init_early(64 * (1<<20), verbose);
++
++	rc = xen_swiotlb_fixup(io_tlb_start,
++			       io_tlb_nslabs << IO_TLB_SHIFT,
++			       io_tlb_nslabs);
++	if (rc)
++		goto error;
++
++	rc = xen_swiotlb_fixup(io_tlb_overflow_buffer,
++			       io_tlb_overflow,
++			       io_tlb_overflow >> IO_TLB_SHIFT);
++	if (rc)
++		goto error;
++
++	return;
++error:
++	/* Each fragment ends in a space so the joined text reads properly. */
++	panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "
++	      "We either don't have the permission or you do not have enough "
++	      "free memory under 4GB!\n", rc);
++}
++
++/*
++ * Map @size bytes at @offset inside @page for streaming DMA and return
++ * the bus address the device has to use.
++ *
++ * The buffer is used in place when the device can reach it, it does not
++ * straddle machine-discontiguous pages and bouncing is not forced.
++ * Otherwise a bounce buffer is taken from the pool; if the pool is
++ * exhausted, swiotlb_full() is called and the overflow buffer is used.
++ *
++ * Once the device is given the dma address, the device owns this memory
++ * until either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is
++ * performed.
++ */
++dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
++				unsigned long offset, size_t size,
++				enum dma_data_direction dir,
++				struct dma_attrs *attrs)
++{
++	phys_addr_t phys = page_to_phys(page) + offset;
++	dma_addr_t dev_addr = xen_phys_to_bus(phys);
++	unsigned long pool_bus_addr;
++	void *bounce;
++
++	BUG_ON(dir == DMA_NONE);
++
++	/* Fast path: the device can use the original buffer directly. */
++	if (dma_capable(dev, dev_addr, size) &&
++	    !range_straddles_page_boundary(phys, size) && !swiotlb_force)
++		return dev_addr;
++
++	/* Slow path: go through a bounce buffer. */
++	pool_bus_addr = xen_virt_to_bus(io_tlb_start);
++	bounce = do_map_single(dev, phys, pool_bus_addr, size, dir);
++	if (!bounce) {
++		swiotlb_full(dev, size, dir, 1);
++		bounce = io_tlb_overflow_buffer;
++	}
++
++	dev_addr = xen_virt_to_bus(bounce);
++
++	/* The bounce buffer itself must be reachable by the device. */
++	if (!dma_capable(dev, dev_addr, size))
++		panic("DMA: xen_swiotlb_map_single: bounce buffer is not "
++		      "DMA'ble\n");
++	return dev_addr;
++}
++EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
++
++/*
++ * Unmap a single streaming mode DMA translation. The dma_addr and size must
++ * match what was provided for in a previous xen_swiotlb_map_page call. All
++ * other usages are undefined.
++ *
++ * After this call, reads by the cpu to the buffer are guaranteed to see
++ * whatever the device wrote there.
++ */
++static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, int dir)
++{
++ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
++
++ BUG_ON(dir == DMA_NONE);
++
++ /* NOTE: We use dev_addr here, not paddr! */
++ if (is_xen_swiotlb_buffer(dev_addr)) {
++ do_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
++ return;
++ }
++
++ if (dir != DMA_FROM_DEVICE)
++ return;
++
++ /*
++ * phys_to_virt doesn't work with a highmem page but we could
++ * call dma_mark_clean() with a highmem page here. However, we
++ * are fine since dma_mark_clean() is null on POWERPC. We can
++ * make dma_mark_clean() take a physical address if necessary.
++ */
++ dma_mark_clean(phys_to_virt(paddr), size);
++}
++
++void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir,
++ struct dma_attrs *attrs)
++{
++ unmap_single(hwdev, dev_addr, size, dir);
++}
++EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
++
++/*
++ * Make physical memory consistent for a single streaming mode DMA translation
++ * after a transfer.
++ *
++ * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer
++ * using the cpu, yet do not wish to teardown the dma mapping, you must
++ * call xen_swiotlb_sync_single_for_cpu() before doing so. Before you give
++ * the dma address back to the card, you must first perform a
++ * xen_swiotlb_sync_single_for_device(); the device then owns the buffer again.
++ */
++static void
++xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, int dir, int target)
++{
++ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
++
++ BUG_ON(dir == DMA_NONE);
++
++ if (is_xen_swiotlb_buffer(dev_addr)) {
++ do_sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
++ return;
++ }
++
++ if (dir != DMA_FROM_DEVICE)
++ return;
++
++ dma_mark_clean(phys_to_virt(paddr), size);
++}
++
++void
++xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
++}
++EXPORT_SYMBOL(xen_swiotlb_sync_single_for_cpu);
++
++void
++xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
++}
++EXPORT_SYMBOL(xen_swiotlb_sync_single_for_device);
++
++/*
++ * Same as above, but for a sub-range of the mapping.
++ */
++static void
++xen_swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
++ unsigned long offset, size_t size,
++ int dir, int target)
++{
++ xen_swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target);
++}
++
++void
++xen_swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
++ SYNC_FOR_CPU);
++}
++EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_range_for_cpu);
++
++void
++xen_swiotlb_sync_single_range_for_device(struct device *hwdev,
++ dma_addr_t dev_addr,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
++ SYNC_FOR_DEVICE);
++}
++EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_range_for_device);
++
++/*
++ * Map a set of buffers described by scatterlist in streaming mode for DMA.
++ * This is the scatter-gather version of the above xen_swiotlb_map_page
++ * interface. Here the scatter gather list elements are each tagged with the
++ * appropriate dma address and length. They are obtained via
++ * sg_dma_{address,length}(SG).
++ *
++ * NOTE: An implementation may be able to use a smaller number of
++ * DMA address/length pairs than there are SG table elements.
++ * (for example via virtual mapping capabilities)
++ * The routine returns the number of addr/length pairs actually
++ * used, at most nents.
++ *
++ * Device ownership issues as mentioned above for xen_swiotlb_map_page are the
++ * same here.
++ */
++int
++xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, enum dma_data_direction dir,
++ struct dma_attrs *attrs)
++{
++ unsigned long start_dma_addr;
++ struct scatterlist *sg;
++ int i;
++ BUG_ON(dir == DMA_NONE);
++
++ start_dma_addr = xen_virt_to_bus(io_tlb_start);
++ for_each_sg(sgl, sg, nelems, i) {
++ phys_addr_t paddr = sg_phys(sg);
++ dma_addr_t dev_addr = xen_phys_to_bus(paddr);
++
++ if (swiotlb_force ||
++ !dma_capable(hwdev, dev_addr, sg->length) ||
++ range_straddles_page_boundary(paddr, sg->length)) {
++ void *map = do_map_single(hwdev, sg_phys(sg),
++ start_dma_addr,
++ sg->length, dir);
++ if (!map) {
++ /* Don't panic here, we expect map_sg users
++ to do proper error handling. */
++ swiotlb_full(hwdev, sg->length, dir, 0);
++ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
++ attrs);
++ sgl[0].dma_length = 0;
++ return 0;
++ }
++ sg->dma_address = xen_virt_to_bus(map);
++ } else
++ sg->dma_address = dev_addr;
++ sg->dma_length = sg->length;
++ }
++ return nelems;
++}
++EXPORT_SYMBOL(xen_swiotlb_map_sg_attrs);
++
++int
++xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
++ int dir)
++{
++ return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
++}
++EXPORT_SYMBOL(xen_swiotlb_map_sg);
++
++/*
++ * Unmap a set of streaming mode DMA translations. Again, cpu read rules
++ * concerning calls here are the same as for xen_swiotlb_unmap_page() above.
++ */
++void
++xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, enum dma_data_direction dir,
++ struct dma_attrs *attrs)
++{
++ struct scatterlist *sg;
++ int i;
++
++ BUG_ON(dir == DMA_NONE);
++
++ for_each_sg(sgl, sg, nelems, i)
++ unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
++
++}
++EXPORT_SYMBOL(xen_swiotlb_unmap_sg_attrs);
++
++/* Attribute-less wrapper: xen_swiotlb_unmap_sg_attrs() with attrs == NULL. */
++void xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, int dir)
++{
++ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
++}
++EXPORT_SYMBOL(xen_swiotlb_unmap_sg);
++
++/*
++ * Make physical memory consistent for a set of streaming mode DMA translations
++ * after a transfer.
++ *
++ * The same as xen_swiotlb_sync_single_* but for a scatter-gather list,
++ * same rules and usage.
++ */
++static void
++xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, int dir, int target)
++{
++ struct scatterlist *sg;
++ int i;
++
++ for_each_sg(sgl, sg, nelems, i)
++ xen_swiotlb_sync_single(hwdev, sg->dma_address,
++ sg->dma_length, dir, target);
++}
++
++void
++xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
++ int nelems, enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
++}
++EXPORT_SYMBOL(xen_swiotlb_sync_sg_for_cpu);
++
++void
++xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
++ int nelems, enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
++}
++EXPORT_SYMBOL(xen_swiotlb_sync_sg_for_device);
++
++int
++xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
++{
++ return (dma_addr == xen_virt_to_bus(io_tlb_overflow_buffer));
++}
++EXPORT_SYMBOL(xen_swiotlb_dma_mapping_error);
++
++/*
++ * Return whether the given device DMA address mask can be supported
++ * properly. For example, if your device can only drive the low 24-bits
++ * during bus mastering, then you would pass 0x00ffffff as the mask to
++ * this function.
++ */
++int
++xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
++{
++ return xen_virt_to_bus(io_tlb_end - 1) <= mask;
++}
++EXPORT_SYMBOL(xen_swiotlb_dma_supported);
+diff --git a/lib/swiotlb.c b/lib/swiotlb.c
+index ac25cd2..f6bbcd1 100644
+--- a/lib/swiotlb.c
++++ b/lib/swiotlb.c
+@@ -1,118 +1,11 @@
+-/*
+- * Dynamic DMA mapping support.
+- *
+- * This implementation is a fallback for platforms that do not support
+- * I/O TLBs (aka DMA address translation hardware).
+- * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick at intel.com>
+- * Copyright (C) 2000 Goutham Rao <goutham.rao at intel.com>
+- * Copyright (C) 2000, 2003 Hewlett-Packard Co
+- * David Mosberger-Tang <davidm at hpl.hp.com>
+- *
+- * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API.
+- * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid
+- * unnecessary i-cache flushing.
+- * 04/07/.. ak Better overflow handling. Assorted fixes.
+- * 05/09/10 linville Add support for syncing ranges, support syncing for
+- * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
+- * 08/12/11 beckyb Add highmem support
+- */
+
+-#include <linux/cache.h>
+ #include <linux/dma-mapping.h>
+-#include <linux/mm.h>
+ #include <linux/module.h>
+-#include <linux/spinlock.h>
+-#include <linux/string.h>
+ #include <linux/swiotlb.h>
+-#include <linux/pfn.h>
+-#include <linux/types.h>
+-#include <linux/ctype.h>
+-#include <linux/highmem.h>
+
+-#include <asm/io.h>
+-#include <asm/dma.h>
+ #include <asm/scatterlist.h>
+-
+-#include <linux/init.h>
+-#include <linux/bootmem.h>
+ #include <linux/iommu-helper.h>
+
+-#define OFFSET(val,align) ((unsigned long) \
+- ( (val) & ( (align) - 1)))
+-
+-#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
+-
+-/*
+- * Minimum IO TLB size to bother booting with. Systems with mainly
+- * 64bit capable cards will only lightly use the swiotlb. If we can't
+- * allocate a contiguous 1MB, we're probably in trouble anyway.
+- */
+-#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
+-
+-/*
+- * Enumeration for sync targets
+- */
+-enum dma_sync_target {
+- SYNC_FOR_CPU = 0,
+- SYNC_FOR_DEVICE = 1,
+-};
+-
+-int swiotlb_force;
+-
+-/*
+- * Used to do a quick range check in unmap_single and
+- * sync_single_*, to see if the memory was in fact allocated by this
+- * API.
+- */
+-static char *io_tlb_start, *io_tlb_end;
+-
+-/*
+- * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and
+- * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
+- */
+-static unsigned long io_tlb_nslabs;
+-
+-/*
+- * When the IOMMU overflows we return a fallback buffer. This sets the size.
+- */
+-static unsigned long io_tlb_overflow = 32*1024;
+-
+-void *io_tlb_overflow_buffer;
+-
+-/*
+- * This is a free list describing the number of free entries available from
+- * each index
+- */
+-static unsigned int *io_tlb_list;
+-static unsigned int io_tlb_index;
+-
+-/*
+- * We need to save away the original address corresponding to a mapped entry
+- * for the sync operations.
+- */
+-static phys_addr_t *io_tlb_orig_addr;
+-
+-/*
+- * Protect the above data structures in the map and unmap calls
+- */
+-static DEFINE_SPINLOCK(io_tlb_lock);
+-
+-static int __init
+-setup_io_tlb_npages(char *str)
+-{
+- if (isdigit(*str)) {
+- io_tlb_nslabs = simple_strtoul(str, &str, 0);
+- /* avoid tail segment of size < IO_TLB_SEGSIZE */
+- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+- }
+- if (*str == ',')
+- ++str;
+- if (!strcmp(str, "force"))
+- swiotlb_force = 1;
+- return 1;
+-}
+-__setup("swiotlb=", setup_io_tlb_npages);
+-/* make io_tlb_overflow tunable too? */
+
+ /* Note that this doesn't work with highmem page */
+ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
+@@ -120,390 +13,6 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
+ {
+ return phys_to_dma(hwdev, virt_to_phys(address));
+ }
+-
+-static void swiotlb_print_info(unsigned long bytes)
+-{
+- phys_addr_t pstart, pend;
+-
+- pstart = virt_to_phys(io_tlb_start);
+- pend = virt_to_phys(io_tlb_end);
+-
+- printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n",
+- bytes >> 20, io_tlb_start, io_tlb_end);
+- printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n",
+- (unsigned long long)pstart,
+- (unsigned long long)pend);
+-}
+-
+-/*
+- * Statically reserve bounce buffer space and initialize bounce buffer data
+- * structures for the software IO TLB used to implement the DMA API.
+- */
+-void __init
+-swiotlb_init_with_default_size(size_t default_size)
+-{
+- unsigned long i, bytes;
+-
+- if (!io_tlb_nslabs) {
+- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+- }
+-
+- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+-
+- /*
+- * Get IO TLB memory from the low pages
+- */
+- io_tlb_start = alloc_bootmem_low_pages(bytes);
+- if (!io_tlb_start)
+- panic("Cannot allocate SWIOTLB buffer");
+- io_tlb_end = io_tlb_start + bytes;
+-
+- /*
+- * Allocate and initialize the free list array. This array is used
+- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+- * between io_tlb_start and io_tlb_end.
+- */
+- io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
+- for (i = 0; i < io_tlb_nslabs; i++)
+- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+- io_tlb_index = 0;
+- io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t));
+-
+- /*
+- * Get the overflow emergency buffer
+- */
+- io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+- if (!io_tlb_overflow_buffer)
+- panic("Cannot allocate SWIOTLB overflow buffer!\n");
+-
+- swiotlb_print_info(bytes);
+-}
+-
+-void __init
+-swiotlb_init(void)
+-{
+- swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */
+-}
+-
+-/*
+- * Systems with larger DMA zones (those that don't support ISA) can
+- * initialize the swiotlb later using the slab allocator if needed.
+- * This should be just like above, but with some error catching.
+- */
+-int
+-swiotlb_late_init_with_default_size(size_t default_size)
+-{
+- unsigned long i, bytes, req_nslabs = io_tlb_nslabs;
+- unsigned int order;
+-
+- if (!io_tlb_nslabs) {
+- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+- }
+-
+- /*
+- * Get IO TLB memory from the low pages
+- */
+- order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
+- io_tlb_nslabs = SLABS_PER_PAGE << order;
+- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+-
+- while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+- io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+- order);
+- if (io_tlb_start)
+- break;
+- order--;
+- }
+-
+- if (!io_tlb_start)
+- goto cleanup1;
+-
+- if (order != get_order(bytes)) {
+- printk(KERN_WARNING "Warning: only able to allocate %ld MB "
+- "for software IO TLB\n", (PAGE_SIZE << order) >> 20);
+- io_tlb_nslabs = SLABS_PER_PAGE << order;
+- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+- }
+- io_tlb_end = io_tlb_start + bytes;
+- memset(io_tlb_start, 0, bytes);
+-
+- /*
+- * Allocate and initialize the free list array. This array is used
+- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+- * between io_tlb_start and io_tlb_end.
+- */
+- io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
+- get_order(io_tlb_nslabs * sizeof(int)));
+- if (!io_tlb_list)
+- goto cleanup2;
+-
+- for (i = 0; i < io_tlb_nslabs; i++)
+- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+- io_tlb_index = 0;
+-
+- io_tlb_orig_addr = (phys_addr_t *)
+- __get_free_pages(GFP_KERNEL,
+- get_order(io_tlb_nslabs *
+- sizeof(phys_addr_t)));
+- if (!io_tlb_orig_addr)
+- goto cleanup3;
+-
+- memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t));
+-
+- /*
+- * Get the overflow emergency buffer
+- */
+- io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
+- get_order(io_tlb_overflow));
+- if (!io_tlb_overflow_buffer)
+- goto cleanup4;
+-
+- swiotlb_print_info(bytes);
+-
+- return 0;
+-
+-cleanup4:
+- free_pages((unsigned long)io_tlb_orig_addr,
+- get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
+- io_tlb_orig_addr = NULL;
+-cleanup3:
+- free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
+- sizeof(int)));
+- io_tlb_list = NULL;
+-cleanup2:
+- io_tlb_end = NULL;
+- free_pages((unsigned long)io_tlb_start, order);
+- io_tlb_start = NULL;
+-cleanup1:
+- io_tlb_nslabs = req_nslabs;
+- return -ENOMEM;
+-}
+-
+-static int is_swiotlb_buffer(phys_addr_t paddr)
+-{
+- return paddr >= virt_to_phys(io_tlb_start) &&
+- paddr < virt_to_phys(io_tlb_end);
+-}
+-
+-/*
+- * Bounce: copy the swiotlb buffer back to the original dma location
+- */
+-static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
+- enum dma_data_direction dir)
+-{
+- unsigned long pfn = PFN_DOWN(phys);
+-
+- if (PageHighMem(pfn_to_page(pfn))) {
+- /* The buffer does not have a mapping. Map it in and copy */
+- unsigned int offset = phys & ~PAGE_MASK;
+- char *buffer;
+- unsigned int sz = 0;
+- unsigned long flags;
+-
+- while (size) {
+- sz = min_t(size_t, PAGE_SIZE - offset, size);
+-
+- local_irq_save(flags);
+- buffer = kmap_atomic(pfn_to_page(pfn),
+- KM_BOUNCE_READ);
+- if (dir == DMA_TO_DEVICE)
+- memcpy(dma_addr, buffer + offset, sz);
+- else
+- memcpy(buffer + offset, dma_addr, sz);
+- kunmap_atomic(buffer, KM_BOUNCE_READ);
+- local_irq_restore(flags);
+-
+- size -= sz;
+- pfn++;
+- dma_addr += sz;
+- offset = 0;
+- }
+- } else {
+- if (dir == DMA_TO_DEVICE)
+- memcpy(dma_addr, phys_to_virt(phys), size);
+- else
+- memcpy(phys_to_virt(phys), dma_addr, size);
+- }
+-}
+-
+-/*
+- * Allocates bounce buffer and returns its kernel virtual address.
+- */
+-static void *
+-map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
+-{
+- unsigned long flags;
+- char *dma_addr;
+- unsigned int nslots, stride, index, wrap;
+- int i;
+- unsigned long start_dma_addr;
+- unsigned long mask;
+- unsigned long offset_slots;
+- unsigned long max_slots;
+-
+- mask = dma_get_seg_boundary(hwdev);
+- start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start) & mask;
+-
+- offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+-
+- /*
+- * Carefully handle integer overflow which can occur when mask == ~0UL.
+- */
+- max_slots = mask + 1
+- ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
+- : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+-
+- /*
+- * For mappings greater than a page, we limit the stride (and
+- * hence alignment) to a page size.
+- */
+- nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+- if (size > PAGE_SIZE)
+- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
+- else
+- stride = 1;
+-
+- BUG_ON(!nslots);
+-
+- /*
+- * Find suitable number of IO TLB entries size that will fit this
+- * request and allocate a buffer from that IO TLB pool.
+- */
+- spin_lock_irqsave(&io_tlb_lock, flags);
+- index = ALIGN(io_tlb_index, stride);
+- if (index >= io_tlb_nslabs)
+- index = 0;
+- wrap = index;
+-
+- do {
+- while (iommu_is_span_boundary(index, nslots, offset_slots,
+- max_slots)) {
+- index += stride;
+- if (index >= io_tlb_nslabs)
+- index = 0;
+- if (index == wrap)
+- goto not_found;
+- }
+-
+- /*
+- * If we find a slot that indicates we have 'nslots' number of
+- * contiguous buffers, we allocate the buffers from that slot
+- * and mark the entries as '0' indicating unavailable.
+- */
+- if (io_tlb_list[index] >= nslots) {
+- int count = 0;
+-
+- for (i = index; i < (int) (index + nslots); i++)
+- io_tlb_list[i] = 0;
+- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
+- io_tlb_list[i] = ++count;
+- dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
+-
+- /*
+- * Update the indices to avoid searching in the next
+- * round.
+- */
+- io_tlb_index = ((index + nslots) < io_tlb_nslabs
+- ? (index + nslots) : 0);
+-
+- goto found;
+- }
+- index += stride;
+- if (index >= io_tlb_nslabs)
+- index = 0;
+- } while (index != wrap);
+-
+-not_found:
+- spin_unlock_irqrestore(&io_tlb_lock, flags);
+- return NULL;
+-found:
+- spin_unlock_irqrestore(&io_tlb_lock, flags);
+-
+- /*
+- * Save away the mapping from the original address to the DMA address.
+- * This is needed when we sync the memory. Then we sync the buffer if
+- * needed.
+- */
+- for (i = 0; i < nslots; i++)
+- io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
+- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
+-
+- return dma_addr;
+-}
+-
+-/*
+- * dma_addr is the kernel virtual address of the bounce buffer to unmap.
+- */
+-static void
+-do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+-{
+- unsigned long flags;
+- int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+- phys_addr_t phys = io_tlb_orig_addr[index];
+-
+- /*
+- * First, sync the memory before unmapping the entry
+- */
+- if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
+- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
+-
+- /*
+- * Return the buffer to the free list by setting the corresponding
+- * entries to indicate the number of contigous entries available.
+- * While returning the entries to the free list, we merge the entries
+- * with slots below and above the pool being returned.
+- */
+- spin_lock_irqsave(&io_tlb_lock, flags);
+- {
+- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
+- io_tlb_list[index + nslots] : 0);
+- /*
+- * Step 1: return the slots to the free list, merging the
+- * slots with superceeding slots
+- */
+- for (i = index + nslots - 1; i >= index; i--)
+- io_tlb_list[i] = ++count;
+- /*
+- * Step 2: merge the returned slots with the preceding slots,
+- * if available (non zero)
+- */
+- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
+- io_tlb_list[i] = ++count;
+- }
+- spin_unlock_irqrestore(&io_tlb_lock, flags);
+-}
+-
+-static void
+-sync_single(struct device *hwdev, char *dma_addr, size_t size,
+- int dir, int target)
+-{
+- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+- phys_addr_t phys = io_tlb_orig_addr[index];
+-
+- phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
+-
+- switch (target) {
+- case SYNC_FOR_CPU:
+- if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
+- else
+- BUG_ON(dir != DMA_TO_DEVICE);
+- break;
+- case SYNC_FOR_DEVICE:
+- if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
+- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
+- else
+- BUG_ON(dir != DMA_FROM_DEVICE);
+- break;
+- default:
+- BUG();
+- }
+-}
+-
+ void *
+ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags)
+@@ -512,12 +21,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ void *ret;
+ int order = get_order(size);
+ u64 dma_mask = DMA_BIT_MASK(32);
++ unsigned long start_dma_addr;
+
+ if (hwdev && hwdev->coherent_dma_mask)
+ dma_mask = hwdev->coherent_dma_mask;
+
+ ret = (void *)__get_free_pages(flags, order);
+- if (ret && swiotlb_virt_to_bus(hwdev, ret) + size > dma_mask) {
++ if (ret && swiotlb_virt_to_bus(hwdev, ret) + size - 1 > dma_mask) {
+ /*
+ * The allocated memory isn't reachable by the device.
+ */
+@@ -527,10 +37,12 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ if (!ret) {
+ /*
+ * We are either out of memory or the device can't DMA
+- * to GFP_DMA memory; fall back on map_single(), which
++ * to GFP_DMA memory; fall back on do_map_single(), which
+ * will grab memory from the lowest available address range.
+ */
+- ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
++ start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start);
++ ret = do_map_single(hwdev, 0, start_dma_addr, size,
++ DMA_FROM_DEVICE);
+ if (!ret)
+ return NULL;
+ }
+@@ -539,12 +51,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dev_addr = swiotlb_virt_to_bus(hwdev, ret);
+
+ /* Confirm address can be DMA'd by device */
+- if (dev_addr + size > dma_mask) {
+- printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
++ if (dev_addr + size - 1 > dma_mask) {
++ dev_err(hwdev, "DMA: hwdev DMA mask = 0x%016Lx, " \
++ "dev_addr = 0x%016Lx\n",
+ (unsigned long long)dma_mask,
+ (unsigned long long)dev_addr);
+
+- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
++ /* DMA_TO_DEVICE to avoid memcpy in do_unmap_single */
+ do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
+ return NULL;
+ }
+@@ -563,35 +76,11 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+ if (!is_swiotlb_buffer(paddr))
+ free_pages((unsigned long)vaddr, get_order(size));
+ else
+- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
++ /* DMA_TO_DEVICE to avoid memcpy in do_unmap_single */
+ do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
+ }
+ EXPORT_SYMBOL(swiotlb_free_coherent);
+
+-static void
+-swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
+-{
+- /*
+- * Ran out of IOMMU space for this operation. This is very bad.
+- * Unfortunately the drivers cannot handle this operation properly.
+- * unless they check for dma_mapping_error (most don't)
+- * When the mapping is small enough return a static buffer to limit
+- * the damage, or panic when the transfer is too big.
+- */
+- printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at "
+- "device %s\n", size, dev ? dev_name(dev) : "?");
+-
+- if (size <= io_tlb_overflow || !do_panic)
+- return;
+-
+- if (dir == DMA_BIDIRECTIONAL)
+- panic("DMA: Random memory could be DMA accessed\n");
+- if (dir == DMA_FROM_DEVICE)
+- panic("DMA: Random memory could be DMA written\n");
+- if (dir == DMA_TO_DEVICE)
+- panic("DMA: Random memory could be DMA read\n");
+-}
+-
+ /*
+ * Map a single buffer of the indicated size for DMA in streaming mode. The
+ * physical address to use is returned.
+@@ -604,6 +93,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+ {
++ unsigned long start_dma_addr;
+ phys_addr_t phys = page_to_phys(page) + offset;
+ dma_addr_t dev_addr = phys_to_dma(dev, phys);
+ void *map;
+@@ -620,7 +110,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
+ /*
+ * Oh well, have to allocate and map a bounce buffer.
+ */
+- map = map_single(dev, phys, size, dir);
++ start_dma_addr = swiotlb_virt_to_bus(dev, io_tlb_start);
++ map = do_map_single(dev, phys, start_dma_addr, size, dir);
+ if (!map) {
+ swiotlb_full(dev, size, dir, 1);
+ map = io_tlb_overflow_buffer;
+@@ -632,7 +123,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
+ * Ensure that the address returned is DMA'ble
+ */
+ if (!dma_capable(dev, dev_addr, size))
+- panic("map_single: bounce buffer is not DMA'ble");
++ panic("DMA: swiotlb_map_single: bounce buffer is not DMA'ble");
+
+ return dev_addr;
+ }
+@@ -697,7 +188,7 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+ BUG_ON(dir == DMA_NONE);
+
+ if (is_swiotlb_buffer(paddr)) {
+- sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
++ do_sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
+ return;
+ }
+
+@@ -774,19 +265,22 @@ int
+ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+ {
++ unsigned long start_dma_addr;
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
++ start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start);
+ for_each_sg(sgl, sg, nelems, i) {
+ phys_addr_t paddr = sg_phys(sg);
+ dma_addr_t dev_addr = phys_to_dma(hwdev, paddr);
+
+ if (swiotlb_force ||
+ !dma_capable(hwdev, dev_addr, sg->length)) {
+- void *map = map_single(hwdev, sg_phys(sg),
+- sg->length, dir);
++ void *map = do_map_single(hwdev, sg_phys(sg),
++ start_dma_addr,
++ sg->length, dir);
+ if (!map) {
+ /* Don't panic here, we expect map_sg users
+ to do proper error handling. */
+@@ -819,7 +313,8 @@ EXPORT_SYMBOL(swiotlb_map_sg);
+ */
+ void
+ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+- int nelems, enum dma_data_direction dir, struct dma_attrs *attrs)
++ int nelems, enum dma_data_direction dir,
++ struct dma_attrs *attrs)
+ {
+ struct scatterlist *sg;
+ int i;
+diff --git a/mm/bootmem.c b/mm/bootmem.c
+index 555d5d2..d1dc23c 100644
+--- a/mm/bootmem.c
++++ b/mm/bootmem.c
+@@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
+ return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
+ }
+
++/**
++ * free_bootmem_late - free bootmem pages directly to page allocator
++ * @addr: starting address of the range
++ * @size: size of the range in bytes
++ *
++ * This is only useful when the bootmem allocator has already been torn
++ * down, but we are still initializing the system. Pages are given directly
++ * to the page allocator, no bootmem metadata is updated because it is gone.
++ */
++void __init free_bootmem_late(unsigned long addr, unsigned long size)
++{
++ unsigned long cursor, end;
++
++ kmemleak_free_part(__va(addr), size);
++
++ cursor = PFN_UP(addr); /* first page fully inside the range */
++ end = PFN_DOWN(addr + size); /* one past the last full page */
++
++ for (; cursor < end; cursor++) {
++ __free_pages_bootmem(pfn_to_page(cursor), 0);
++ totalram_pages++;
++ }
++}
++
+ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
+ {
+ int aligned;
+diff --git a/mm/memory.c b/mm/memory.c
+index 4e59455..b2de7c9 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -553,6 +553,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ if (is_zero_pfn(pfn))
+ return NULL;
+ check_pfn:
++
++#if defined(CONFIG_XEN) && defined(CONFIG_X86)
++ /* XEN: Covers user-space grant mappings (even of local pages). */
++ if (unlikely(vma->vm_flags & VM_FOREIGN))
++ return NULL;
++#endif
++
+ if (unlikely(pfn > highest_memmap_pfn)) {
+ print_bad_pte(vma, addr, pte, NULL);
+ return NULL;
+@@ -839,8 +846,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ page->index > details->last_index))
+ continue;
+ }
+- ptent = ptep_get_and_clear_full(mm, addr, pte,
+- tlb->fullmm);
++ if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte))
++ ptent = vma->vm_ops->zap_pte(vma, addr, pte,
++ tlb->fullmm);
++ else
++ ptent = ptep_get_and_clear_full(mm, addr, pte,
++ tlb->fullmm);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ if (unlikely(!page))
+ continue;
+@@ -1100,6 +1111,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
+ tlb_finish_mmu(tlb, address, end);
+ return end;
+ }
++EXPORT_SYMBOL_GPL(zap_page_range);
+
+ /**
+ * zap_vma_ptes - remove ptes mapping the vma
+@@ -1296,6 +1308,29 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ continue;
+ }
+
++#ifdef CONFIG_XEN
++ if (vma && (vma->vm_flags & VM_FOREIGN)) {
++ struct vm_foreign_map *foreign_map =
++ vma->vm_private_data;
++ struct page **map = foreign_map->map;
++ int offset = (start - vma->vm_start) >> PAGE_SHIFT;
++ if (map[offset] != NULL) {
++ if (pages) {
++ struct page *page = map[offset];
++
++ pages[i] = page;
++ get_page(page);
++ }
++ if (vmas)
++ vmas[i] = vma;
++ i++;
++ start += PAGE_SIZE;
++ nr_pages--;
++ continue;
++ }
++ }
++#endif
++
+ if (!vma ||
+ (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+ !(vm_flags & vma->vm_flags))
+@@ -1771,6 +1806,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
++#ifdef CONFIG_XEN
++ vma->vm_mm->context.has_foreign_mappings = 1;
++#endif
++
+ err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
+ if (err) {
+ /*
+diff --git a/mm/mmap.c b/mm/mmap.c
+index ae19746..9c39fc2 100644
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1785,6 +1785,12 @@ static void unmap_region(struct mm_struct *mm,
+ tlb_finish_mmu(tlb, start, end);
+ }
+
++static inline void unmap_vma(struct vm_area_struct *vma)
++{
++ if (unlikely(vma->vm_ops && vma->vm_ops->unmap))
++ vma->vm_ops->unmap(vma);
++}
++
+ /*
+ * Create a list of vma's touched by the unmap, removing them from the mm's
+ * vma list as we go..
+@@ -1800,6 +1806,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
+ insertion_point = (prev ? &prev->vm_next : &mm->mmap);
+ do {
+ rb_erase(&vma->vm_rb, &mm->mm_rb);
++ unmap_vma(vma);
+ mm->map_count--;
+ tail_vma = vma;
+ vma = vma->vm_next;
+@@ -2076,7 +2083,7 @@ EXPORT_SYMBOL(do_brk);
+ void exit_mmap(struct mm_struct *mm)
+ {
+ struct mmu_gather *tlb;
+- struct vm_area_struct *vma;
++ struct vm_area_struct *vma, *vma_tmp;
+ unsigned long nr_accounted = 0;
+ unsigned long end;
+
+@@ -2098,6 +2105,9 @@ void exit_mmap(struct mm_struct *mm)
+ if (!vma) /* Can happen if dup_mmap() received an OOM */
+ return;
+
++ for (vma_tmp = mm->mmap; vma_tmp; vma_tmp = vma_tmp->vm_next)
++ unmap_vma(vma_tmp);
++
+ lru_add_drain();
+ flush_cache_mm(mm);
+ tlb = tlb_gather_mmu(mm, 1);
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 36992b6..bc1b6e9 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -593,6 +593,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
+ if (bad)
+ return;
+
++#ifdef CONFIG_XEN
++ if (PageForeign(page)) {
++ PageForeignDestructor(page, order);
++ return;
++ }
++#endif
++
+ if (!PageHighMem(page)) {
+ debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
+ debug_check_no_obj_freed(page_address(page),
+@@ -1087,6 +1094,13 @@ static void free_hot_cold_page(struct page *page, int cold)
+
+ kmemcheck_free_shadow(page, 0);
+
++#ifdef CONFIG_XEN
++ if (PageForeign(page)) {
++ PageForeignDestructor(page, 0);
++ return;
++ }
++#endif
++
+ if (PageAnon(page))
+ page->mapping = NULL;
+ if (free_pages_check(page))
Added: dists/sid/linux-2.6/debian/patches/series/10-extra
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ dists/sid/linux-2.6/debian/patches/series/10-extra Sun Feb 28 19:10:15 2010 (r15297)
@@ -0,0 +1,2 @@
++ features/all/xen/pvops.patch featureset=xen
++ features/all/xen/pvops-updates.patch featureset=xen
More information about the Kernel-svn-changes
mailing list