[kernel] r15515 - in dists/sid/linux-2.6/debian: . patches/features/all/xen
Bastian Blank
waldi@alioth.debian.org
Fri Apr 16 16:01:00 UTC 2010
Author: waldi
Date: Fri Apr 16 16:00:25 2010
New Revision: 15515
Log:
* debian/changelog: Update.
* debian/patches/features/all/xen/pvops.patch: Update to 27f948a3bf36.
Modified:
dists/sid/linux-2.6/debian/changelog
dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch
Modified: dists/sid/linux-2.6/debian/changelog
==============================================================================
--- dists/sid/linux-2.6/debian/changelog Fri Apr 16 14:48:56 2010 (r15514)
+++ dists/sid/linux-2.6/debian/changelog Fri Apr 16 16:00:25 2010 (r15515)
@@ -31,6 +31,9 @@
[ Martin Michlmayr ]
* dns323-setup.c: fix WARN() when booting (Arnaud Patard).
+ [ Bastian Blank ]
+ * Update Xen patch.
+
-- Ben Hutchings <ben@decadent.org.uk> Tue, 06 Apr 2010 02:26:51 +0100
linux-2.6 (2.6.32-11) unstable; urgency=low
Modified: dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch
==============================================================================
--- dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch Fri Apr 16 14:48:56 2010 (r15514)
+++ dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch Fri Apr 16 16:00:25 2010 (r15515)
@@ -1,4 +1,4 @@
-Patch based on commit f64df18aae5ab07b44bdcc2334cf0044ef46320c of
+Patch based on commit 27f948a3bf365a5bc3d56119637a177d41147815 of
git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen.git.
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
@@ -1057,7 +1057,7 @@
obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
-index 195e4b7..6458fe8 100644
+index 23c2da8..a2a5125 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -42,6 +42,10 @@
@@ -1118,6 +1118,51 @@
return max_gsi + 1;
}
+diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
+index d85d1b2..8c9526d 100644
+--- a/arch/x86/kernel/acpi/processor.c
++++ b/arch/x86/kernel/acpi/processor.c
+@@ -11,6 +11,7 @@
+
+ #include <acpi/processor.h>
+ #include <asm/acpi.h>
++#include <asm/xen/hypervisor.h>
+
+ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
+ {
+@@ -88,6 +89,19 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
+
+ EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
+
++/* Initialize _PDC data based on the CPU vendor */
++void xen_arch_acpi_processor_init_pdc(struct acpi_processor *pr)
++{
++ struct cpuinfo_x86 *c = &cpu_data(0);
++
++ pr->pdc = NULL;
++ if (c->x86_vendor == X86_VENDOR_INTEL)
++ init_intel_pdc(pr, c);
++
++ return;
++}
++EXPORT_SYMBOL(xen_arch_acpi_processor_init_pdc);
++
+ void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
+ {
+ if (pr->pdc) {
+diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
+index ca93638..9eff23c 100644
+--- a/arch/x86/kernel/acpi/sleep.c
++++ b/arch/x86/kernel/acpi/sleep.c
+@@ -12,6 +12,8 @@
+ #include <asm/segment.h>
+ #include <asm/desc.h>
+
++#include <xen/acpi.h>
++
+ #include "realmode/wakeup.h"
+ #include "sleep.h"
+
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 23fc9fe..40497d3 100644
--- a/arch/x86/kernel/amd_iommu.c
@@ -1621,13 +1666,15 @@
extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/mtrr/xen.c b/arch/x86/kernel/cpu/mtrr/xen.c
new file mode 100644
-index 0000000..54ced4b
+index 0000000..852018b
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/xen.c
-@@ -0,0 +1,105 @@
+@@ -0,0 +1,109 @@
+#include <linux/init.h>
+#include <linux/mm.h>
-+#
++
++#include <asm/pat.h>
++
+#include "mtrr.h"
+
+#include <xen/xen.h>
@@ -1727,8 +1774,10 @@
+ if (cpu_has_mtrr ||
+ cpu_has_k6_mtrr ||
+ cpu_has_cyrix_arr ||
-+ cpu_has_centaur_mcr)
++ cpu_has_centaur_mcr) {
+ mtrr_if = &xen_mtrr_ops;
++ pat_init();
++ }
+}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc..a4849c1 100644
@@ -2583,7 +2632,7 @@
}
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
-index f010ab4..6b39f09 100644
+index d0ba107..0b4f9d1 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -73,16 +73,12 @@ void exit_thread(void)
@@ -3230,7 +3279,7 @@
+#endif
+}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
-index 3578688..565ab25 100644
+index 3578688..cc0c7ed 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -28,6 +28,7 @@
@@ -3292,11 +3341,8 @@
}
asm(XEN_EMULATE_PREFIX "cpuid"
-@@ -217,8 +230,11 @@ static __init void xen_init_cpuid_mask(void)
- cpuid_leaf1_edx_mask =
- ~((1 << X86_FEATURE_MCE) | /* disable MCE */
+@@ -219,6 +232,8 @@ static __init void xen_init_cpuid_mask(void)
(1 << X86_FEATURE_MCA) | /* disable MCA */
-+ (1 << X86_FEATURE_PAT) | /* disable PAT */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
+ cpuid_leaf81_edx_mask = ~(1 << (X86_FEATURE_GBPAGES % 32));
@@ -3304,7 +3350,7 @@
if (!xen_initial_domain())
cpuid_leaf1_edx_mask &=
~((1 << X86_FEATURE_APIC) | /* disable local APIC */
-@@ -406,7 +422,7 @@ static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+@@ -406,7 +421,7 @@ static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
pte = pfn_pte(pfn, PAGE_KERNEL_RO);
@@ -3313,7 +3359,7 @@
BUG();
frames[f] = mfn;
-@@ -519,11 +535,10 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
+@@ -519,11 +534,10 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
} else if (addr == (unsigned long)machine_check) {
return 0;
#endif
@@ -3329,7 +3375,7 @@
#endif /* CONFIG_X86_64 */
info->address = addr;
-@@ -679,6 +694,18 @@ static void xen_set_iopl_mask(unsigned mask)
+@@ -679,6 +693,18 @@ static void xen_set_iopl_mask(unsigned mask)
HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}
@@ -3348,7 +3394,7 @@
static void xen_io_delay(void)
{
}
-@@ -716,7 +743,7 @@ static u32 xen_safe_apic_wait_icr_idle(void)
+@@ -716,7 +742,7 @@ static u32 xen_safe_apic_wait_icr_idle(void)
return 0;
}
@@ -3357,7 +3403,7 @@
{
apic->read = xen_apic_read;
apic->write = xen_apic_write;
-@@ -728,7 +755,6 @@ static void set_xen_basic_apic_ops(void)
+@@ -728,7 +754,6 @@ static void set_xen_basic_apic_ops(void)
#endif
@@ -3365,7 +3411,19 @@
static void xen_clts(void)
{
struct multicall_space mcs;
-@@ -978,6 +1004,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+@@ -811,6 +836,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+ Xen console noise. */
+ break;
+
++ case MSR_IA32_CR_PAT:
++ if (smp_processor_id() == 0)
++ xen_set_pat(((u64)high << 32) | low);
++ break;
++
+ default:
+ ret = native_write_msr_safe(msr, low, high);
+ }
+@@ -978,6 +1008,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.load_sp0 = xen_load_sp0,
.set_iopl_mask = xen_set_iopl_mask,
@@ -3373,7 +3431,7 @@
.io_delay = xen_io_delay,
/* Xen takes care of %gs when switching to usermode for us */
-@@ -1020,6 +1047,14 @@ static void xen_machine_halt(void)
+@@ -1020,6 +1051,14 @@ static void xen_machine_halt(void)
xen_reboot(SHUTDOWN_poweroff);
}
@@ -3388,7 +3446,7 @@
static void xen_crash_shutdown(struct pt_regs *regs)
{
xen_reboot(SHUTDOWN_crash);
-@@ -1028,7 +1063,7 @@ static void xen_crash_shutdown(struct pt_regs *regs)
+@@ -1028,7 +1067,7 @@ static void xen_crash_shutdown(struct pt_regs *regs)
static const struct machine_ops __initdata xen_machine_ops = {
.restart = xen_restart,
.halt = xen_machine_halt,
@@ -3397,7 +3455,7 @@
.shutdown = xen_machine_halt,
.crash_shutdown = xen_crash_shutdown,
.emergency_restart = xen_emergency_restart,
-@@ -1061,6 +1096,8 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1061,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void)
xen_domain_type = XEN_PV_DOMAIN;
@@ -3406,7 +3464,7 @@
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
-@@ -1086,6 +1123,12 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1086,6 +1127,12 @@ asmlinkage void __init xen_start_kernel(void)
xen_init_mmu_ops();
@@ -3419,7 +3477,7 @@
/* Prevent unwanted bits from being set in PTEs. */
__supported_pte_mask &= ~_PAGE_GLOBAL;
if (!xen_initial_domain())
-@@ -1116,6 +1159,10 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1116,6 +1163,10 @@ asmlinkage void __init xen_start_kernel(void)
*/
xen_setup_stackprotector();
@@ -3430,7 +3488,7 @@
xen_init_irq_ops();
xen_init_cpuid_mask();
-@@ -1144,6 +1191,8 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1144,6 +1195,8 @@ asmlinkage void __init xen_start_kernel(void)
pgd = (pgd_t *)xen_start_info->pt_base;
@@ -3439,7 +3497,7 @@
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-@@ -1153,6 +1202,7 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1153,6 +1206,7 @@ asmlinkage void __init xen_start_kernel(void)
xen_raw_console_write("mapping kernel into physical memory\n");
pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
@@ -3447,7 +3505,7 @@
init_mm.pgd = pgd;
-@@ -1162,6 +1212,14 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1162,6 +1216,14 @@ asmlinkage void __init xen_start_kernel(void)
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
@@ -3462,7 +3520,7 @@
/* set the limit of our address space */
xen_reserve_top();
-@@ -1184,6 +1242,16 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1184,6 +1246,16 @@ asmlinkage void __init xen_start_kernel(void)
add_preferred_console("xenboot", 0, NULL);
add_preferred_console("tty", 0, NULL);
add_preferred_console("hvc", 0, NULL);
@@ -3480,20 +3538,29 @@
xen_raw_console_write("about to get started...\n");
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
-index 350a3de..44a4cd1 100644
+index 350a3de..8c6a858 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
-@@ -50,7 +50,9 @@
+@@ -42,6 +42,7 @@
+ #include <linux/highmem.h>
+ #include <linux/debugfs.h>
+ #include <linux/bug.h>
++#include <linux/vmalloc.h>
+ #include <linux/module.h>
+
+ #include <asm/pgtable.h>
+@@ -50,7 +51,10 @@
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
+#include <asm/e820.h>
#include <asm/linkage.h>
++#include <asm/pat.h>
+#include <asm/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
-@@ -58,6 +60,7 @@
+@@ -58,6 +62,7 @@
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
@@ -3501,7 +3568,7 @@
#include <xen/hvc-console.h>
#include "multicalls.h"
-@@ -66,6 +69,13 @@
+@@ -66,6 +71,13 @@
#define MMU_UPDATE_HISTO 30
@@ -3515,7 +3582,7 @@
#ifdef CONFIG_XEN_DEBUG_FS
static struct {
-@@ -184,6 +194,26 @@ static inline unsigned p2m_index(unsigned long pfn)
+@@ -184,6 +196,26 @@ static inline unsigned p2m_index(unsigned long pfn)
return pfn % P2M_ENTRIES_PER_PAGE;
}
@@ -3542,7 +3609,7 @@
/* Build the parallel p2m_top_mfn structures */
void xen_build_mfn_list_list(void)
{
-@@ -315,6 +345,7 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr)
+@@ -315,6 +347,7 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr)
return PFN_DOWN(maddr.maddr);
}
@@ -3550,7 +3617,7 @@
xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
-@@ -376,6 +407,34 @@ static bool xen_page_pinned(void *ptr)
+@@ -376,6 +409,34 @@ static bool xen_page_pinned(void *ptr)
return PagePinned(page);
}
@@ -3585,7 +3652,7 @@
static void xen_extend_mmu_update(const struct mmu_update *update)
{
struct multicall_space mcs;
-@@ -452,6 +511,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+@@ -452,6 +513,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
@@ -3597,7 +3664,7 @@
ADD_STATS(set_pte_at, 1);
// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
ADD_STATS(set_pte_at_current, mm == current->mm);
-@@ -522,8 +586,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
+@@ -522,9 +588,34 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
return val;
}
@@ -3617,19 +3684,69 @@
+
pteval_t xen_pte_val(pte_t pte)
{
-+ if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
-+ return pte.pte;
+- return pte_mfn_to_pfn(pte.pte);
++ pteval_t pteval = pte.pte;
++
++ /* If this is a WC pte, convert back from Xen WC to Linux WC */
++ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
++ WARN_ON(!pat_enabled);
++ pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
++ }
++
++ if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
++ return pteval;
+
- return pte_mfn_to_pfn(pte.pte);
++ return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
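
The xen_pte_val() change above is one half of a write-combining fixup:
Linux encodes WC as PAT index 1 (PWT alone), while Xen's fixed PAT table
provides WC at index 4 (the PAT bit alone), so the flag bits have to be
translated on the way into and out of real PTEs. A minimal user-space
sketch of the two conversions (the PTE bit positions are architectural;
the helper names are local to this sketch):

    #include <stdio.h>
    #include <stdint.h>

    #define _PAGE_PWT 0x08ULL /* bit 3 */
    #define _PAGE_PCD 0x10ULL /* bit 4 */
    #define _PAGE_PAT 0x80ULL /* bit 7 */

    static uint64_t wc_linux_to_xen(uint64_t pte) /* cf. xen_make_pte() */
    {
        /* Linux WC is PWT alone; move it to Xen's WC slot (PAT alone) */
        if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT && !(pte & _PAGE_PAT))
            pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
        return pte;
    }

    static uint64_t wc_xen_to_linux(uint64_t pte) /* cf. xen_pte_val() */
    {
        /* Xen WC (PAT alone) converts back to Linux WC (PWT alone) */
        if ((pte & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT)
            pte = (pte & ~_PAGE_PAT) | _PAGE_PWT;
        return pte;
    }

    int main(void)
    {
        uint64_t pte = 0x1000 | _PAGE_PWT; /* a Linux WC mapping */
        uint64_t xen = wc_linux_to_xen(pte);
        printf("%#llx -> %#llx -> %#llx\n", (unsigned long long)pte,
               (unsigned long long)xen,
               (unsigned long long)wc_xen_to_linux(xen));
        return 0;
    }

The round trip prints 0x1008 -> 0x1080 -> 0x1008, matching the
_PAGE_PWT <-> _PAGE_PAT swap performed by the two hooks.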
-@@ -536,7 +617,22 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
+@@ -534,9 +625,62 @@ pgdval_t xen_pgd_val(pgd_t pgd)
+ }
+ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
+
++/*
++ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
++ * are reserved for now, to correspond to the Intel-reserved PAT
++ * types.
++ *
++ * We expect Linux's PAT set as follows:
++ *
++ * Idx PTE flags Linux Xen Default
++ * 0 WB WB WB
++ * 1 PWT WC WT WT
++ * 2 PCD UC- UC- UC-
++ * 3 PCD PWT UC UC UC
++ * 4 PAT WB WC WB
++ * 5 PAT PWT WC WP WT
++ * 6 PAT PCD UC- UC UC-
++ * 7 PAT PCD PWT UC UC UC
++ */
++
++void xen_set_pat(u64 pat)
++{
++ /* We expect Linux to use a PAT setting of
++ * UC UC- WC WB (ignoring the PAT flag) */
++ WARN_ON(pat != 0x0007010600070106ull);
++}
++
pte_t xen_make_pte(pteval_t pte)
{
- pte = pte_pfn_to_mfn(pte);
+ phys_addr_t addr = (pte & PTE_PFN_MASK);
+
++ /* If Linux is trying to set a WC pte, then map to the Xen WC.
++ * If _PAGE_PAT is set, then it probably means it is really
++ * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
++ * things work out OK...
++ *
++ * (We should never see kernel mappings with _PAGE_PSE set,
++ * but we could see hugetlbfs mappings, I think.).
++ */
++ if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
++ if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
++ pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
++ }
++
+ /*
+ * Unprivileged domains are allowed to do IOMAPpings for
+ * PCI passthrough, but not map ISA space. The ISA
@@ -3647,7 +3764,7 @@
return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
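
For reference, the value xen_set_pat() insists on, 0x0007010600070106,
is the IA32_CR_PAT layout Linux programs: one entry per byte, entry 0 in
the least significant byte (the same 64-bit value xen_write_msr_safe()
reassembles from the low/high halves before calling xen_set_pat()). A
standalone sketch decoding it, using the architectural PAT type codes:

    #include <stdio.h>
    #include <stdint.h>

    static const char *pat_type(uint8_t t)
    {
        switch (t) {
        case 0x00: return "UC";
        case 0x01: return "WC";
        case 0x04: return "WT";
        case 0x05: return "WP";
        case 0x06: return "WB";
        case 0x07: return "UC-";
        default: return "??";
        }
    }

    int main(void)
    {
        uint64_t pat = 0x0007010600070106ULL;
        int i;

        for (i = 0; i < 8; i++)
            printf("PAT%d = %s\n", i, pat_type((pat >> (8 * i)) & 0xff));
        return 0;
    }

This prints WB WC UC- UC WB WC UC- UC, i.e. the "Linux" column of the
table in the comment above, which is why any other value triggers the
WARN_ON.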
-@@ -592,6 +688,11 @@ void xen_set_pud(pud_t *ptr, pud_t val)
+@@ -592,6 +736,11 @@ void xen_set_pud(pud_t *ptr, pud_t val)
void xen_set_pte(pte_t *ptep, pte_t pte)
{
@@ -3659,7 +3776,7 @@
ADD_STATS(pte_update, 1);
// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-@@ -608,6 +709,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte)
+@@ -608,6 +757,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte)
#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
@@ -3671,7 +3788,16 @@
set_64bit((u64 *)ptep, native_pte_val(pte));
}
-@@ -1219,7 +1325,7 @@ void xen_exit_mmap(struct mm_struct *mm)
+@@ -934,8 +1088,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
+ read-only, and can be pinned. */
+ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
+ {
+- vm_unmap_aliases();
+-
+ xen_mc_batch();
+
+ if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
+@@ -1219,7 +1371,7 @@ void xen_exit_mmap(struct mm_struct *mm)
spin_lock(&mm->page_table_lock);
/* pgd may not be pinned in the error exit path of execve */
@@ -3680,7 +3806,7 @@
xen_pgd_unpin(mm);
spin_unlock(&mm->page_table_lock);
-@@ -1288,12 +1394,19 @@ static void xen_flush_tlb_single(unsigned long addr)
+@@ -1288,12 +1440,19 @@ static void xen_flush_tlb_single(unsigned long addr)
preempt_enable();
}
@@ -3701,7 +3827,7 @@
} *args;
struct multicall_space mcs;
-@@ -1417,6 +1530,13 @@ static int xen_pgd_alloc(struct mm_struct *mm)
+@@ -1417,6 +1576,13 @@ static int xen_pgd_alloc(struct mm_struct *mm)
return ret;
}
@@ -3715,7 +3841,7 @@
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
-@@ -1448,10 +1568,17 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+@@ -1448,10 +1614,17 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
@@ -3735,7 +3861,15 @@
return pte;
}
-@@ -1620,6 +1747,7 @@ static void *m2v(phys_addr_t maddr)
+@@ -1517,7 +1690,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
+ if (PagePinned(virt_to_page(mm->pgd))) {
+ SetPagePinned(page);
+
+- vm_unmap_aliases();
+ if (!PageHighMem(page)) {
+ make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+ if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+@@ -1620,6 +1792,7 @@ static void *m2v(phys_addr_t maddr)
return __ka(m2p(maddr));
}
@@ -3743,7 +3877,7 @@
static void set_page_prot(void *addr, pgprot_t prot)
{
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
-@@ -1675,6 +1803,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+@@ -1675,6 +1848,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
set_page_prot(pmd, PAGE_KERNEL_RO);
}
@@ -3764,7 +3898,7 @@
#ifdef CONFIG_X86_64
static void convert_pfn_mfn(void *v)
{
-@@ -1766,6 +1908,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+@@ -1766,6 +1953,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
@@ -3772,7 +3906,7 @@
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
xen_start_info->nr_pt_frames * PAGE_SIZE +
-@@ -1777,6 +1920,20 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+@@ -1777,6 +1965,20 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
xen_map_identity_early(level2_kernel_pgt, max_pfn);
memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
@@ -3793,7 +3927,7 @@
set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
-@@ -1799,6 +1956,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+@@ -1799,6 +2001,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
}
#endif /* CONFIG_X86_64 */
@@ -3802,7 +3936,7 @@
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
pte_t pte;
-@@ -1828,9 +1987,26 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+@@ -1828,9 +2032,26 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
pte = pfn_pte(phys, prot);
break;
@@ -3830,7 +3964,7 @@
}
__native_set_fixmap(idx, pte);
-@@ -1845,6 +2021,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+@@ -1845,6 +2066,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
@@ -3860,10 +3994,14 @@
static __init void xen_post_allocator_init(void)
{
pv_mmu_ops.set_pte = xen_set_pte;
-@@ -1962,6 +2161,271 @@ void __init xen_init_mmu_ops(void)
+@@ -1960,7 +2204,270 @@ void __init xen_init_mmu_ops(void)
+ x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
+ x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
pv_mmu_ops = xen_mmu_ops;
- }
-
++
++ vmap_lazy_unmap = false;
++}
++
+/* Protected by xen_reservation_lock. */
+#define MAX_CONTIG_ORDER 9 /* 2MB */
+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
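
The 2MB in the comment follows directly from the order, assuming 4 KiB
pages; a quick check:

    #include <stdio.h>

    int main(void)
    {
        unsigned int order = 9;              /* MAX_CONTIG_ORDER */
        unsigned long frames = 1UL << order; /* 512 discontig_frames slots */
        printf("%lu frames = %lu KiB\n", frames, frames * 4096UL / 1024);
        return 0; /* prints: 512 frames = 2048 KiB */
    }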
@@ -4000,8 +4138,6 @@
+
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
-+ vm_unmap_aliases();
-+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Zap current PTEs, remembering MFNs. */
@@ -4039,8 +4175,6 @@
+
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
-+ vm_unmap_aliases();
-+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Find start MFN of contiguous extent. */
@@ -4126,12 +4260,11 @@
+ flush_tlb_all();
+
+ return err;
-+}
+ }
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
-+
+
#ifdef CONFIG_XEN_DEBUG_FS
- static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
new file mode 100644
index 0000000..4d55524
@@ -4192,10 +4325,10 @@
+}
diff --git a/arch/x86/xen/pci.c b/arch/x86/xen/pci.c
new file mode 100644
-index 0000000..956e0d0
+index 0000000..3def132
--- /dev/null
+++ b/arch/x86/xen/pci.c
-@@ -0,0 +1,299 @@
+@@ -0,0 +1,296 @@
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/pci.h>
@@ -4258,9 +4391,6 @@
+{
+ int rc, irq;
+ struct physdev_setup_gsi setup_gsi;
-+ struct physdev_map_pirq map_irq;
-+ int shareable = 0;
-+ char *name;
+
+ if (!xen_domain())
+ return -1;
@@ -4707,6 +4837,34 @@
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* make sure interrupts start blocked */
+diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
+index 9d1f853..af5463a 100644
+--- a/arch/x86/xen/time.c
++++ b/arch/x86/xen/time.c
+@@ -239,8 +239,22 @@ unsigned long xen_get_wallclock(void)
+
+ int xen_set_wallclock(unsigned long now)
+ {
++ struct xen_platform_op op;
++ int rc;
++
+ /* do nothing for domU */
+- return -1;
++ if (!xen_initial_domain())
++ return -1;
++
++ op.cmd = XENPF_settime;
++ op.u.settime.secs = now;
++ op.u.settime.nsecs = 0;
++ op.u.settime.system_time = xen_clocksource_read();
++
++ rc = HYPERVISOR_dom0_op(&op);
++ WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now);
++
++ return rc;
+ }
+
+ static struct clocksource xen_clocksource __read_mostly = {
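
Some context on the xen_set_wallclock() change above: XENPF_settime
passes the wallclock together with the current clocksource reading so
the hypervisor can anchor the wallclock to its own boot time. A
user-space sketch of that derivation (the anchoring rule is an
assumption about the platform op, not something shown in this patch):

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
        /* arguments as dom0 would fill them in */
        uint64_t secs = 1271433625;               /* wallclock "now" */
        uint64_t nsecs = 0;
        uint64_t system_time = 42 * NSEC_PER_SEC; /* ns since Xen boot */

        /* subtracting system_time yields the wallclock at boot,
         * which is the offset the hypervisor actually keeps */
        uint64_t boot_ns = secs * NSEC_PER_SEC + nsecs - system_time;
        printf("wallclock at boot: %llu.%09llu\n",
               (unsigned long long)(boot_ns / NSEC_PER_SEC),
               (unsigned long long)(boot_ns % NSEC_PER_SEC));
        return 0;
    }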
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 0000000..1cd7f4d
@@ -4781,18 +4939,20 @@
+ }
+}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
-index f9153a3..5afc1fe 100644
+index f9153a3..1c1eff4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
-@@ -30,6 +30,7 @@ void xen_setup_machphys_mapping(void);
+@@ -30,6 +30,9 @@ void xen_setup_machphys_mapping(void);
pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
void xen_reserve_top(void);
+void xen_ident_map_ISA(void);
++
++void xen_set_pat(u64);
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
-@@ -82,6 +83,23 @@ static inline void xen_uninit_lock_cpu(int cpu)
+@@ -82,6 +85,23 @@ static inline void xen_uninit_lock_cpu(int cpu)
}
#endif
@@ -4836,1041 +4996,1971 @@
static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
-diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
-index 1d886e0..f4a2b10 100644
---- a/drivers/block/Kconfig
-+++ b/drivers/block/Kconfig
-@@ -462,6 +462,7 @@ config XEN_BLKDEV_FRONTEND
- tristate "Xen virtual block device support"
- depends on XEN
- default y
-+ select XEN_XENBUS_FRONTEND
- help
- This driver implements the front-end of the Xen virtual
- block device driver. It communicates with a back-end driver
-diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
-index b8578bb..a8d30d7 100644
---- a/drivers/block/xen-blkfront.c
-+++ b/drivers/block/xen-blkfront.c
-@@ -42,6 +42,7 @@
- #include <linux/module.h>
- #include <linux/scatterlist.h>
+diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
+index 7702118..1be123c 100644
+--- a/drivers/acpi/Makefile
++++ b/drivers/acpi/Makefile
+@@ -61,6 +61,7 @@ obj-$(CONFIG_ACPI_POWER_METER) += power_meter.o
+ # processor has its own "processor." module_param namespace
+ processor-y := processor_core.o processor_throttling.o
+ processor-y += processor_idle.o processor_thermal.o
++processor-y += processor_xen.o
+ processor-$(CONFIG_CPU_FREQ) += processor_perflib.o
+
+ obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
+diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
+index 28ccdbc..b0f9ed6 100644
+--- a/drivers/acpi/acpi_memhotplug.c
++++ b/drivers/acpi/acpi_memhotplug.c
+@@ -31,6 +31,7 @@
+ #include <linux/types.h>
+ #include <linux/memory_hotplug.h>
+ #include <acpi/acpi_drivers.h>
++#include <xen/acpi.h>
+
+ #define ACPI_MEMORY_DEVICE_CLASS "memory"
+ #define ACPI_MEMORY_DEVICE_HID "PNP0C80"
+@@ -70,21 +71,6 @@ static struct acpi_driver acpi_memory_device_driver = {
+ },
+ };
-+#include <xen/xen.h>
- #include <xen/xenbus.h>
- #include <xen/grant_table.h>
- #include <xen/events.h>
-@@ -102,6 +103,10 @@ struct blkfront_info
+-struct acpi_memory_info {
+- struct list_head list;
+- u64 start_addr; /* Memory Range start physical addr */
+- u64 length; /* Memory Range length */
+- unsigned short caching; /* memory cache attribute */
+- unsigned short write_protect; /* memory read/write attribute */
+- unsigned int enabled:1;
+-};
+-
+-struct acpi_memory_device {
+- struct acpi_device * device;
+- unsigned int state; /* State of the memory device */
+- struct list_head res_list;
+-};
+-
+ static int acpi_hotmem_initialized;
- static DEFINE_SPINLOCK(blkif_io_lock);
+ static acpi_status
+@@ -228,6 +214,9 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
+ return result;
+ }
-+static unsigned int nr_minors;
-+static unsigned long *minors;
-+static DEFINE_SPINLOCK(minor_lock);
++ if (xen_initial_domain())
++ return xen_hotadd_memory(mem_device);
+
- #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
- (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
- #define GRANT_INVALID_REF 0
-@@ -136,6 +141,55 @@ static void add_id_to_freelist(struct blkfront_info *info,
- info->shadow_free = id;
- }
+ node = acpi_get_node(mem_device->device->handle);
+ /*
+ * Tell the VM there is more memory here...
+diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
+index cc22f9a..747d96f 100644
+--- a/drivers/acpi/acpica/hwsleep.c
++++ b/drivers/acpi/acpica/hwsleep.c
+@@ -47,6 +47,9 @@
+ #include "actables.h"
+ #include <linux/tboot.h>
-+static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
-+{
-+ unsigned int end = minor + nr;
-+ int rc;
++#include <xen/acpi.h>
++#include <asm/xen/hypervisor.h>
+
-+ if (end > nr_minors) {
-+ unsigned long *bitmap, *old;
+ #define _COMPONENT ACPI_HARDWARE
+ ACPI_MODULE_NAME("hwsleep")
+
+@@ -346,6 +349,19 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state)
+ tboot_sleep(sleep_state, pm1a_control, pm1b_control);
+
+ /* Write #2: Write both SLP_TYP + SLP_EN */
++ if (xen_pv_acpi()) {
++ int err;
+
-+ bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
-+ GFP_KERNEL);
-+ if (bitmap == NULL)
-+ return -ENOMEM;
++ err = acpi_notify_hypervisor_state(sleep_state,
++ pm1a_control, pm1b_control);
++ if (err) {
++ ACPI_DEBUG_PRINT((ACPI_DB_INIT,
++ "Hypervisor failure [%d]\n", err));
++ return_ACPI_STATUS(AE_ERROR);
++ }
+
-+ spin_lock(&minor_lock);
-+ if (end > nr_minors) {
-+ old = minors;
-+ memcpy(bitmap, minors,
-+ BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
-+ minors = bitmap;
-+ nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
-+ } else
-+ old = bitmap;
-+ spin_unlock(&minor_lock);
-+ kfree(old);
++ return_ACPI_STATUS(AE_OK);
+ }
-+
-+ spin_lock(&minor_lock);
-+ if (find_next_bit(minors, end, minor) >= end) {
-+ for (; minor < end; ++minor)
-+ __set_bit(minor, minors);
-+ rc = 0;
-+ } else
-+ rc = -EBUSY;
-+ spin_unlock(&minor_lock);
-+
-+ return rc;
-+}
-+
-+static void xlbd_release_minors(unsigned int minor, unsigned int nr)
-+{
-+ unsigned int end = minor + nr;
-+
-+ BUG_ON(end > nr_minors);
-+ spin_lock(&minor_lock);
-+ for (; minor < end; ++minor)
-+ __clear_bit(minor, minors);
-+ spin_unlock(&minor_lock);
-+}
-+
- static void blkif_restart_queue_callback(void *arg)
- {
- struct blkfront_info *info = (struct blkfront_info *)arg;
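
The xlbd_reserve_minors()/xlbd_release_minors() pair above gives
blkfront an all-or-nothing claim on a range of device minors, growing
the table on demand. A compressed user-space model of the same scheme (a
sketch only: one byte per minor instead of a bitmap, no locking, names
local to this example):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static unsigned char *minors;   /* one byte per minor, for clarity */
    static unsigned int nr_minors;

    static int reserve_minors(unsigned int minor, unsigned int nr)
    {
        unsigned int end = minor + nr, i;

        if (end > nr_minors) {              /* grow the table on demand */
            unsigned char *bigger = calloc(end, 1);
            if (bigger == NULL)
                return -1;
            if (minors != NULL)
                memcpy(bigger, minors, nr_minors);
            free(minors);
            minors = bigger;
            nr_minors = end;
        }
        for (i = minor; i < end; i++)       /* all-or-nothing reservation */
            if (minors[i])
                return -1;
        for (i = minor; i < end; i++)
            minors[i] = 1;
        return 0;
    }

    static void release_minors(unsigned int minor, unsigned int nr)
    {
        memset(minors + minor, 0, nr);
    }

    int main(void)
    {
        printf("%d\n", reserve_minors(0, 16)); /* 0: xvda takes minors 0-15 */
        printf("%d\n", reserve_minors(8, 8));  /* -1: overlap is refused */
        release_minors(0, 16);
        printf("%d\n", reserve_minors(8, 8));  /* 0: range is free again */
        return 0;
    }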
-@@ -416,9 +470,14 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
- if ((minor % nr_parts) == 0)
- nr_minors = nr_parts;
-
-+ err = xlbd_reserve_minors(minor, nr_minors);
-+ if (err)
-+ goto out;
-+ err = -ENODEV;
-+
- gd = alloc_disk(nr_minors);
- if (gd == NULL)
-- goto out;
-+ goto release;
- offset = minor / nr_parts;
+ status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control);
+ if (ACPI_FAILURE(status)) {
+diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
+index ec742a4..4ccecf6 100644
+--- a/drivers/acpi/processor_core.c
++++ b/drivers/acpi/processor_core.c
+@@ -58,6 +58,7 @@
+ #include <acpi/acpi_bus.h>
+ #include <acpi/acpi_drivers.h>
+ #include <acpi/processor.h>
++#include <xen/acpi.h>
-@@ -449,7 +508,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+ #define PREFIX "ACPI: "
- if (xlvbd_init_blk_queue(gd, sector_size)) {
- del_gendisk(gd);
-- goto out;
-+ goto release;
- }
+@@ -81,11 +82,9 @@ MODULE_DESCRIPTION("ACPI Processor Driver");
+ MODULE_LICENSE("GPL");
- info->rq = gd->queue;
-@@ -469,6 +528,8 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+ static int acpi_processor_add(struct acpi_device *device);
+-static int acpi_processor_remove(struct acpi_device *device, int type);
+ #ifdef CONFIG_ACPI_PROCFS
+ static int acpi_processor_info_open_fs(struct inode *inode, struct file *file);
+ #endif
+-static void acpi_processor_notify(struct acpi_device *device, u32 event);
+ static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu);
+ static int acpi_processor_handle_eject(struct acpi_processor *pr);
+@@ -253,7 +252,7 @@ static int acpi_processor_errata_piix4(struct pci_dev *dev)
return 0;
+ }
-+ release:
-+ xlbd_release_minors(minor, nr_minors);
- out:
- return err;
+-static int acpi_processor_errata(struct acpi_processor *pr)
++int acpi_processor_errata(struct acpi_processor *pr)
+ {
+ int result = 0;
+ struct pci_dev *dev = NULL;
+@@ -284,7 +283,7 @@ static int acpi_processor_errata(struct acpi_processor *pr)
+ * _PDC is required for a BIOS-OS handshake for most of the newer
+ * ACPI processor features.
+ */
+-static int acpi_processor_set_pdc(struct acpi_processor *pr)
++int acpi_processor_set_pdc(struct acpi_processor *pr)
+ {
+ struct acpi_object_list *pdc_in = pr->pdc;
+ acpi_status status = AE_OK;
+@@ -353,7 +352,7 @@ static int acpi_processor_info_open_fs(struct inode *inode, struct file *file)
+ PDE(inode)->data);
}
-@@ -650,7 +711,7 @@ fail:
+-static int acpi_processor_add_fs(struct acpi_device *device)
++int acpi_processor_add_fs(struct acpi_device *device)
+ {
+ struct proc_dir_entry *entry = NULL;
- /* Common code used when first setting up, and when resuming. */
--static int talk_to_backend(struct xenbus_device *dev,
-+static int talk_to_blkback(struct xenbus_device *dev,
- struct blkfront_info *info)
+@@ -392,7 +391,7 @@ static int acpi_processor_add_fs(struct acpi_device *device)
+ return -EIO;
+ return 0;
+ }
+-static int acpi_processor_remove_fs(struct acpi_device *device)
++int acpi_processor_remove_fs(struct acpi_device *device)
{
- const char *message = NULL;
-@@ -755,7 +816,7 @@ static int blkfront_probe(struct xenbus_device *dev,
- info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
- dev_set_drvdata(&dev->dev, info);
-- err = talk_to_backend(dev, info);
-+ err = talk_to_blkback(dev, info);
- if (err) {
- kfree(info);
- dev_set_drvdata(&dev->dev, NULL);
-@@ -850,7 +911,7 @@ static int blkfront_resume(struct xenbus_device *dev)
+ if (acpi_device_dir(device)) {
+@@ -711,7 +710,7 @@ static int acpi_processor_get_info(struct acpi_device *device)
- blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
+ static DEFINE_PER_CPU(void *, processor_device_array);
-- err = talk_to_backend(dev, info);
-+ err = talk_to_blkback(dev, info);
- if (info->connected == BLKIF_STATE_SUSPENDED && !err)
- err = blkif_recover(info);
+-static void acpi_processor_notify(struct acpi_device *device, u32 event)
++void acpi_processor_notify(struct acpi_device *device, u32 event)
+ {
+ struct acpi_processor *pr = acpi_driver_data(device);
+ int saved;
+@@ -879,7 +878,7 @@ err_free_cpumask:
+ return result;
+ }
-@@ -869,10 +930,29 @@ static void blkfront_connect(struct blkfront_info *info)
- unsigned int binfo;
- int err;
+-static int acpi_processor_remove(struct acpi_device *device, int type)
++int acpi_processor_remove(struct acpi_device *device, int type)
+ {
+ struct acpi_processor *pr = NULL;
-- if ((info->connected == BLKIF_STATE_CONNECTED) ||
-- (info->connected == BLKIF_STATE_SUSPENDED) )
-+ switch (info->connected) {
-+ case BLKIF_STATE_CONNECTED:
-+ /*
-+ * Potentially, the back-end may be signalling
-+ * a capacity change; update the capacity.
-+ */
-+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
-+ "sectors", "%Lu", §ors);
-+ if (XENBUS_EXIST_ERR(err))
-+ return;
-+ printk(KERN_INFO "Setting capacity to %Lu\n",
-+ sectors);
-+ set_capacity(info->gd, sectors);
-+ revalidate_disk(info->gd);
-+
-+ /* fall through */
-+ case BLKIF_STATE_SUSPENDED:
- return;
+@@ -1154,7 +1153,11 @@ static int __init acpi_processor_init(void)
+ if (result < 0)
+ goto out_proc;
-+ default:
-+ break;
-+ }
+- result = acpi_bus_register_driver(&acpi_processor_driver);
++ if (xen_initial_domain())
++ result = xen_acpi_processor_init();
++ else
++ result = acpi_bus_register_driver(&acpi_processor_driver);
+
- dev_dbg(&info->xbdev->dev, "%s:%s.\n",
- __func__, info->xbdev->otherend);
-
-@@ -920,12 +1000,11 @@ static void blkfront_connect(struct blkfront_info *info)
- * the backend. Once is this done, we can switch to Closed in
- * acknowledgement.
- */
--static void blkfront_closing(struct xenbus_device *dev)
-+static void blkfront_closing(struct blkfront_info *info)
- {
-- struct blkfront_info *info = dev_get_drvdata(&dev->dev);
-+ unsigned int minor, nr_minors;
- unsigned long flags;
-
-- dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
-
- if (info->rq == NULL)
- goto out;
-@@ -945,27 +1024,33 @@ static void blkfront_closing(struct xenbus_device *dev)
- blk_cleanup_queue(info->rq);
- info->rq = NULL;
-
-+ minor = info->gd->first_minor;
-+ nr_minors = info->gd->minors;
- del_gendisk(info->gd);
-+ xlbd_release_minors(minor, nr_minors);
-
- out:
-- xenbus_frontend_closed(dev);
-+ if (info->xbdev)
-+ xenbus_frontend_closed(info->xbdev);
- }
+ if (result < 0)
+ goto out_cpuidle;
- /**
- * Callback received when the backend's state changes.
- */
--static void backend_changed(struct xenbus_device *dev,
-+static void blkback_changed(struct xenbus_device *dev,
- enum xenbus_state backend_state)
- {
- struct blkfront_info *info = dev_get_drvdata(&dev->dev);
- struct block_device *bd;
+@@ -1190,7 +1193,10 @@ static void __exit acpi_processor_exit(void)
-- dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
-+ dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
+ acpi_processor_uninstall_hotplug_notify();
- switch (backend_state) {
- case XenbusStateInitialising:
- case XenbusStateInitWait:
- case XenbusStateInitialised:
-+ case XenbusStateReconfiguring:
-+ case XenbusStateReconfigured:
- case XenbusStateUnknown:
- case XenbusStateClosed:
- break;
-@@ -988,7 +1073,7 @@ static void backend_changed(struct xenbus_device *dev,
- xenbus_dev_error(dev, -EBUSY,
- "Device in use; refusing to close");
- else
-- blkfront_closing(dev);
-+ blkfront_closing(info);
- mutex_unlock(&bd->bd_mutex);
- bdput(bd);
- break;
-@@ -1003,7 +1088,10 @@ static int blkfront_remove(struct xenbus_device *dev)
+- acpi_bus_unregister_driver(&acpi_processor_driver);
++ if (xen_initial_domain())
++ xen_acpi_processor_exit();
++ else
++ acpi_bus_unregister_driver(&acpi_processor_driver);
- blkif_free(info, 0);
+ cpuidle_unregister_driver(&acpi_idle_driver);
-- kfree(info);
-+ if(info->users == 0)
-+ kfree(info);
-+ else
-+ info->xbdev = NULL;
+diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
+index c8f0797..bdee59e 100644
+--- a/drivers/acpi/processor_idle.c
++++ b/drivers/acpi/processor_idle.c
+@@ -58,6 +58,7 @@
+
+ #include <acpi/acpi_bus.h>
+ #include <acpi/processor.h>
++#include <xen/acpi.h>
+ #include <asm/processor.h>
- return 0;
- }
-@@ -1012,12 +1100,15 @@ static int blkfront_is_ready(struct xenbus_device *dev)
- {
- struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+ #define PREFIX "ACPI: "
+@@ -477,6 +478,9 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
-- return info->is_ready;
-+ return info->is_ready && info->xbdev;
- }
+ cx.power = obj->integer.value;
- static int blkif_open(struct block_device *bdev, fmode_t mode)
- {
- struct blkfront_info *info = bdev->bd_disk->private_data;
++ /* cache control methods to notify xen*/
++ processor_cntl_xen_power_cache(pr->acpi_id, i, reg);
+
-+ if (!info->xbdev)
-+ return -ENODEV;
- info->users++;
- return 0;
- }
-@@ -1031,10 +1122,13 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
- have ignored this request initially, as the device was
- still mounted. */
- struct xenbus_device *dev = info->xbdev;
-- enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
+ current_count++;
+ memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
-- if (state == XenbusStateClosing && info->is_ready)
-- blkfront_closing(dev);
-+ if (!dev) {
-+ blkfront_closing(info);
-+ kfree(info);
-+ } else if (xenbus_read_driver_state(dev->otherend)
-+ == XenbusStateClosing && info->is_ready)
-+ blkfront_closing(info);
- }
- return 0;
+@@ -653,7 +657,7 @@ static int acpi_processor_power_verify(struct acpi_processor *pr)
+ return (working);
}
-@@ -1061,7 +1155,7 @@ static struct xenbus_driver blkfront = {
- .probe = blkfront_probe,
- .remove = blkfront_remove,
- .resume = blkfront_resume,
-- .otherend_changed = backend_changed,
-+ .otherend_changed = blkback_changed,
- .is_ready = blkfront_is_ready,
- };
-
-diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
-index 4dcfef0..399a017 100644
---- a/drivers/char/agp/intel-agp.c
-+++ b/drivers/char/agp/intel-agp.c
-@@ -15,8 +15,12 @@
- * an Intel IOMMU. So make the correct use of the PCI DMA API contingent
- * on the Intel IOMMU support (CONFIG_DMAR).
- * Only newer chipsets need to bother with this, of course.
-+ *
-+ * Xen guests accessing graphics hardware also need proper translation
-+ * between pseudo-physical addresses and real machine addresses, which
-+ * is also achieved by using the DMA API.
- */
--#ifdef CONFIG_DMAR
-+#if defined(CONFIG_DMAR) || defined(CONFIG_XEN)
- #define USE_PCI_DMA_API 1
- #endif
-@@ -394,15 +398,19 @@ static void intel_i810_agp_enable(struct agp_bridge_data *bridge, u32 mode)
- /* Exists to support ARGB cursors */
- static struct page *i8xx_alloc_pages(void)
+-static int acpi_processor_get_power_info(struct acpi_processor *pr)
++int acpi_processor_get_power_info(struct acpi_processor *pr)
{
-+ void *addr;
-+ dma_addr_t _d;
- struct page *page;
-
-- page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2);
-- if (page == NULL)
-+ addr = dma_alloc_coherent(NULL, 4 * PAGE_SIZE, &_d, GFP_KERNEL);
-+ if (addr == NULL)
- return NULL;
-
-+ page = virt_to_page(addr);
-+
- if (set_pages_uc(page, 4) < 0) {
- set_pages_wb(page, 4);
-- __free_pages(page, 2);
-+ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, _d);
- return NULL;
+ unsigned int i;
+ int result;
+@@ -1223,9 +1227,14 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
+ * platforms that only support C1.
+ */
+ if (pr->flags.power) {
+- acpi_processor_setup_cpuidle(pr);
+- if (cpuidle_register_device(&pr->power.dev))
+- return -EIO;
++ if (xen_initial_domain()) {
++ processor_cntl_xen_notify(pr,
++ PROCESSOR_PM_INIT, PM_TYPE_IDLE);
++ } else {
++ acpi_processor_setup_cpuidle(pr);
++ if (cpuidle_register_device(&pr->power.dev))
++ return -EIO;
++ }
}
- get_page(page);
-@@ -412,12 +420,17 @@ static struct page *i8xx_alloc_pages(void)
-
- static void i8xx_destroy_pages(struct page *page)
- {
-+ void *addr;
-+
- if (page == NULL)
- return;
-
- set_pages_wb(page, 4);
- put_page(page);
-- __free_pages(page, 2);
+ #ifdef CONFIG_ACPI_PROCFS
+ /* 'power' [R] */
+diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
+index 8ba0ed0..86b8102 100644
+--- a/drivers/acpi/processor_perflib.c
++++ b/drivers/acpi/processor_perflib.c
+@@ -332,7 +332,7 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr)
+ return result;
+ }
+
+-static int acpi_processor_get_performance_info(struct acpi_processor *pr)
++int acpi_processor_get_performance_info(struct acpi_processor *pr)
+ {
+ int result = 0;
+ acpi_status status = AE_OK;
+@@ -434,7 +434,7 @@ int acpi_processor_notify_smm(struct module *calling_module)
+
+ EXPORT_SYMBOL(acpi_processor_notify_smm);
+
+-static int acpi_processor_get_psd(struct acpi_processor *pr)
++int acpi_processor_get_psd(struct acpi_processor *pr)
+ {
+ int result = 0;
+ acpi_status status = AE_OK;
+diff --git a/drivers/acpi/processor_xen.c b/drivers/acpi/processor_xen.c
+new file mode 100644
+index 0000000..2f37c9c
+--- /dev/null
++++ b/drivers/acpi/processor_xen.c
+@@ -0,0 +1,616 @@
++/*
++ * processor_xen.c - ACPI Processor Driver for xen
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or (at
++ * your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with this program; if not, write to the Free Software Foundation, Inc.,
++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ */
+
-+ addr = page_address(page);
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/pci.h>
++#include <linux/pm.h>
++#include <linux/cpufreq.h>
++#include <linux/cpu.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <linux/dmi.h>
++#include <linux/moduleparam.h>
++#include <linux/cpuidle.h>
++#include <linux/acpi.h>
+
-+ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, virt_to_bus(addr));
- atomic_dec(&agp_bridge->current_memory_agp);
- }
-
-diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
-index a6ee32b..5be0dd3 100644
---- a/drivers/char/hvc_xen.c
-+++ b/drivers/char/hvc_xen.c
-@@ -25,6 +25,8 @@
- #include <linux/types.h>
-
- #include <asm/xen/hypervisor.h>
++#include <acpi/acpi_bus.h>
++#include <acpi/acpi_drivers.h>
++#include <acpi/processor.h>
++#include <xen/acpi.h>
++
++#define PREFIX "ACPI: "
++
++#define ACPI_PROCESSOR_CLASS "processor"
++#define ACPI_PROCESSOR_DEVICE_NAME "Processor"
++#define ACPI_PROCESSOR_FILE_INFO "info"
++#define ACPI_PROCESSOR_FILE_THROTTLING "throttling"
++#define ACPI_PROCESSOR_FILE_LIMIT "limit"
++#define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80
++#define ACPI_PROCESSOR_NOTIFY_POWER 0x81
++#define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82
++
++#define _COMPONENT ACPI_PROCESSOR_COMPONENT
++ACPI_MODULE_NAME("processor_xen");
++
++static const struct acpi_device_id processor_device_ids[] = {
++ {ACPI_PROCESSOR_OBJECT_HID, 0},
++ {"ACPI0007", 0},
++ {"", 0},
++};
+
-+#include <xen/xen.h>
- #include <xen/page.h>
- #include <xen/events.h>
- #include <xen/interface/io/console.h>
-@@ -76,7 +78,7 @@ static int __write_console(const char *data, int len)
- return sent;
- }
-
--static int write_console(uint32_t vtermno, const char *data, int len)
-+static int domU_write_console(uint32_t vtermno, const char *data, int len)
- {
- int ret = len;
-
-@@ -99,7 +101,7 @@ static int write_console(uint32_t vtermno, const char *data, int len)
- return ret;
- }
-
--static int read_console(uint32_t vtermno, char *buf, int len)
-+static int domU_read_console(uint32_t vtermno, char *buf, int len)
- {
- struct xencons_interface *intf = xencons_interface();
- XENCONS_RING_IDX cons, prod;
-@@ -120,28 +122,63 @@ static int read_console(uint32_t vtermno, char *buf, int len)
- return recv;
- }
-
--static struct hv_ops hvc_ops = {
-- .get_chars = read_console,
-- .put_chars = write_console,
-+static struct hv_ops domU_hvc_ops = {
-+ .get_chars = domU_read_console,
-+ .put_chars = domU_write_console,
-+ .notifier_add = notifier_add_irq,
-+ .notifier_del = notifier_del_irq,
-+ .notifier_hangup = notifier_hangup_irq,
++/*
++ * Xen ACPI processor driver
++ */
++
++/* from processor_core.c */
++
++static int xen_acpi_processor_add(struct acpi_device *device);
++static void xen_acpi_processor_notify(struct acpi_device *device, u32 event);
++
++struct acpi_driver xen_acpi_processor_driver = {
++ .name = "processor",
++ .class = ACPI_PROCESSOR_CLASS,
++ .ids = processor_device_ids,
++ .ops = {
++ .add = xen_acpi_processor_add,
++ .remove = acpi_processor_remove,
++ .suspend = acpi_processor_suspend,
++ .resume = acpi_processor_resume,
++ .notify = xen_acpi_processor_notify,
++ },
+};
+
-+static int dom0_read_console(uint32_t vtermno, char *buf, int len)
++static int xen_acpi_processor_get_info(struct acpi_device *device)
+{
-+ return HYPERVISOR_console_io(CONSOLEIO_read, len, buf);
-+}
++ acpi_status status = 0;
++ union acpi_object object = { 0 };
++ struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
++ struct acpi_processor *pr;
++ int cpu_index, device_declaration = 0;
++ static int cpu0_initialized;
+
-+/*
-+ * Either for a dom0 to write to the system console, or a domU with a
-+ * debug version of Xen
-+ */
-+static int dom0_write_console(uint32_t vtermno, const char *str, int len)
-+{
-+ int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
-+ if (rc < 0)
-+ return 0;
++ pr = acpi_driver_data(device);
++ if (!pr)
++ return -EINVAL;
+
-+ return len;
-+}
++ if (num_online_cpus() > 1)
++ errata.smp = TRUE;
+
-+static struct hv_ops dom0_hvc_ops = {
-+ .get_chars = dom0_read_console,
-+ .put_chars = dom0_write_console,
- .notifier_add = notifier_add_irq,
- .notifier_del = notifier_del_irq,
- .notifier_hangup = notifier_hangup_irq,
- };
-
--static int __init xen_init(void)
-+static int __init xen_hvc_init(void)
- {
- struct hvc_struct *hp;
-+ struct hv_ops *ops;
-
-- if (!xen_pv_domain() ||
-- xen_initial_domain() ||
-- !xen_start_info->console.domU.evtchn)
-+ if (!xen_pv_domain())
- return -ENODEV;
-
-- xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
-+ if (xen_initial_domain()) {
-+ ops = &dom0_hvc_ops;
-+ xencons_irq = bind_virq_to_irq(VIRQ_CONSOLE, 0);
-+ } else {
-+ if (!xen_start_info->console.domU.evtchn)
++ acpi_processor_errata(pr);
++
++ /*
++ * Check to see if we have bus mastering arbitration control. This
++ * is required for proper C3 usage (to maintain cache coherency).
++ */
++ if (acpi_gbl_FADT.pm2_control_block &&
++ acpi_gbl_FADT.pm2_control_length) {
++ pr->flags.bm_control = 1;
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
++ "Bus mastering arbitration control present\n"
++ ));
++ } else
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
++ "No bus mastering arbitration control\n"));
++
++ if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) {
++ /* Declared with "Processor" statement; match ProcessorID */
++ status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer);
++ if (ACPI_FAILURE(status)) {
++ printk(KERN_ERR PREFIX "Evaluating processor object\n");
+ return -ENODEV;
++ }
+
-+ ops = &domU_hvc_ops;
-+ xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
++ /*
++ * TBD: Synch processor ID (via LAPIC/LSAPIC structures) on SMP.
++ * >>> 'acpi_get_processor_id(acpi_id, &id)' in
++ * arch/xxx/acpi.c
++ */
++ pr->acpi_id = object.processor.proc_id;
++ } else {
++ /*
++ * Declared with "Device" statement; match _UID.
++ * Note that we don't handle string _UIDs yet.
++ */
++ unsigned long long value;
++ status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID,
++ NULL, &value);
++ if (ACPI_FAILURE(status)) {
++ printk(KERN_ERR PREFIX
++ "Evaluating processor _UID [%#x]\n", status);
++ return -ENODEV;
++ }
++ device_declaration = 1;
++ pr->acpi_id = value;
+ }
+
- if (xencons_irq < 0)
- xencons_irq = 0; /* NO_IRQ */
-
-- hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
-+ hp = hvc_alloc(HVC_COOKIE, xencons_irq, ops, 256);
- if (IS_ERR(hp))
- return PTR_ERR(hp);
-
-@@ -158,7 +195,7 @@ void xen_console_resume(void)
- rebind_evtchn_irq(xen_start_info->console.domU.evtchn, xencons_irq);
- }
-
--static void __exit xen_fini(void)
-+static void __exit xen_hvc_fini(void)
- {
- if (hvc)
- hvc_remove(hvc);
-@@ -166,29 +203,24 @@ static void __exit xen_fini(void)
-
- static int xen_cons_init(void)
- {
-+ struct hv_ops *ops;
++ /* TBD: add Xen specific code to query cpu_index */
++ cpu_index = -1;
+
- if (!xen_pv_domain())
- return 0;
-
-- hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
-+ ops = &domU_hvc_ops;
-+ if (xen_initial_domain())
-+ ops = &dom0_hvc_ops;
++ /* Handle UP system running SMP kernel, with no LAPIC in MADT */
++ if (!cpu0_initialized && (cpu_index == -1) &&
++ (num_online_cpus() == 1)) {
++ cpu_index = 0;
++ }
+
-+ hvc_instantiate(HVC_COOKIE, 0, ops);
++ cpu0_initialized = 1;
+
- return 0;
- }
-
--module_init(xen_init);
--module_exit(xen_fini);
-+module_init(xen_hvc_init);
-+module_exit(xen_hvc_fini);
- console_initcall(xen_cons_init);
-
--static void raw_console_write(const char *str, int len)
--{
-- while(len > 0) {
-- int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
-- if (rc <= 0)
-- break;
--
-- str += rc;
-- len -= rc;
-- }
--}
--
- #ifdef CONFIG_EARLY_PRINTK
- static void xenboot_write_console(struct console *console, const char *string,
- unsigned len)
-@@ -196,19 +228,22 @@ static void xenboot_write_console(struct console *console, const char *string,
- unsigned int linelen, off = 0;
- const char *pos;
-
-- raw_console_write(string, len);
-+ dom0_write_console(0, string, len);
++ pr->id = cpu_index;
+
-+ if (xen_initial_domain())
-+ return;
-
-- write_console(0, "(early) ", 8);
-+ domU_write_console(0, "(early) ", 8);
- while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
- linelen = pos-string+off;
- if (off + linelen > len)
- break;
-- write_console(0, string+off, linelen);
-- write_console(0, "\r\n", 2);
-+ domU_write_console(0, string+off, linelen);
-+ domU_write_console(0, "\r\n", 2);
- off += linelen + 1;
- }
- if (off < len)
-- write_console(0, string+off, len-off);
-+ domU_write_console(0, string+off, len-off);
- }
-
- struct console xenboot_console = {
-@@ -220,7 +255,7 @@ struct console xenboot_console = {
-
- void xen_raw_console_write(const char *str)
- {
-- raw_console_write(str, strlen(str));
-+ dom0_write_console(0, str, strlen(str));
- }
-
- void xen_raw_printk(const char *fmt, ...)
-diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
-index a75ca63..bdc26b9 100644
---- a/drivers/gpu/drm/drm_drv.c
-+++ b/drivers/gpu/drm/drm_drv.c
-@@ -201,7 +201,7 @@ int drm_lastclose(struct drm_device * dev)
- }
- if (drm_core_check_feature(dev, DRIVER_SG) && dev->sg &&
- !drm_core_check_feature(dev, DRIVER_MODESET)) {
-- drm_sg_cleanup(dev->sg);
-+ drm_sg_cleanup(dev, dev->sg);
- dev->sg = NULL;
- }
-
-diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
-index 8bf3770..dde5f66 100644
---- a/drivers/gpu/drm/drm_gem.c
-+++ b/drivers/gpu/drm/drm_gem.c
-@@ -539,7 +539,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
- vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND;
- vma->vm_ops = obj->dev->driver->gem_vm_ops;
- vma->vm_private_data = map->handle;
-- vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
-+ vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
-
- /* Take a ref for this mapping of the object, so that the fault
- * handler can dereference the mmap offset's pointer to the object.
-diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c
-index c7823c8..95ffb8a 100644
---- a/drivers/gpu/drm/drm_scatter.c
-+++ b/drivers/gpu/drm/drm_scatter.c
-@@ -32,20 +32,73 @@
- */
-
- #include <linux/vmalloc.h>
-+#include <linux/mm.h>
- #include "drmP.h"
-
- #define DEBUG_SCATTER 0
-
--static inline void *drm_vmalloc_dma(unsigned long size)
-+static void *drm_vmalloc_dma(struct drm_device *drmdev, unsigned long size)
- {
- #if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
- return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL | _PAGE_NO_CACHE);
- #else
-- return vmalloc_32(size);
-+ struct device *dev = &drmdev->pdev->dev;
-+ struct page **pages;
-+ void *addr;
-+ const int npages = PFN_UP(size);
-+ int i;
++ /*
++ * Extra Processor objects may be enumerated on MP systems with
++ * less than the max # of CPUs, or Xen vCPU < pCPU.
++ * They should be ignored _iff they are physically not present.
++ *
++ */
++#if 0
++ if (pr->id == -1) {
++ if (ACPI_FAILURE
++ (acpi_processor_hotadd_init(pr->handle, &pr->id))) {
++ return -ENODEV;
++ }
++ }
++#endif
+
-+ pages = kmalloc(npages * sizeof(*pages), GFP_KERNEL);
-+ if (!pages)
-+ goto fail;
++ /*
++ * On some boxes several processors use the same processor bus id.
++ * But they are located in different scope. For example:
++ * \_SB.SCK0.CPU0
++ * \_SB.SCK1.CPU0
++ * Rename the processor device bus id. And the new bus id will be
++ * generated as the following format:
++ * CPU+CPU ID.
++ */
++ sprintf(acpi_device_bid(device), "CPU%X", pr->id);
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Processor [%d:%d]\n", pr->id,
++ pr->acpi_id));
++
++ if (!object.processor.pblk_address)
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No PBLK (NULL address)\n"));
++ else if (object.processor.pblk_length != 6)
++ printk(KERN_ERR PREFIX "Invalid PBLK length [%d]\n",
++ object.processor.pblk_length);
++ else {
++ pr->throttling.address = object.processor.pblk_address;
++ pr->throttling.duty_offset = acpi_gbl_FADT.duty_offset;
++ pr->throttling.duty_width = acpi_gbl_FADT.duty_width;
+
-+ for (i = 0; i < npages; i++) {
-+ dma_addr_t phys;
-+ void *addr;
-+ addr = dma_alloc_coherent(dev, PAGE_SIZE, &phys, GFP_KERNEL);
-+ if (addr == NULL)
-+ goto out_free_pages;
++ pr->pblk = object.processor.pblk_address;
+
-+ pages[i] = virt_to_page(addr);
++ /*
++ * We don't care about error returns - we just try to mark
++ * these reserved so that nobody else is confused into thinking
++ * that this region might be unused..
++ *
++ * (In particular, allocating the IO range for Cardbus)
++ */
++ request_region(pr->throttling.address, 6, "ACPI CPU throttle");
+ }
+
-+ addr = vmap(pages, npages, VM_MAP | VM_IOREMAP, PAGE_KERNEL);
++ /*
++ * If ACPI describes a slot number for this CPU, we can use it
++ * ensure we get the right value in the "physical id" field
++ * of /proc/cpuinfo
++ */
++ status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
++ if (ACPI_SUCCESS(status))
++ arch_fix_phys_package_id(pr->id, object.integer.value);
+
-+ kfree(pages);
++ return 0;
++}
+
-+ return addr;
++static struct acpi_device *processor_device_array[XEN_MAX_ACPI_ID + 1];
+
-+out_free_pages:
-+ while (i > 0) {
-+ void *addr = page_address(pages[--i]);
-+ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
++static int __cpuinit xen_acpi_processor_add(struct acpi_device *device)
++{
++ struct acpi_processor *pr = NULL;
++ int result = 0;
++ struct sys_device *sysdev;
++
++ pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
++ if (!pr)
++ return -ENOMEM;
++
++ if (!zalloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) {
++ kfree(pr);
++ return -ENOMEM;
+ }
+
-+ kfree(pages);
++ pr->handle = device->handle;
++ strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME);
++ strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS);
++ device->driver_data = pr;
++
++ result = xen_acpi_processor_get_info(device);
++ if (result) {
++ /* Processor is physically not present */
++ return 0;
++ }
+
-+fail:
-+ return NULL;
++ /*
++ * Buggy BIOS check
++ * ACPI id of processors can be reported wrongly by the BIOS.
++ * Don't trust it blindly
++ */
++ if (pr->acpi_id > XEN_MAX_ACPI_ID ||
++ (processor_device_array[pr->acpi_id] != NULL &&
++ processor_device_array[pr->acpi_id] != device)) {
++ printk(KERN_WARNING "BIOS reported wrong ACPI id "
++ "for the processor\n");
++ result = -ENODEV;
++ goto err_free_cpumask;
++ }
++
++ processor_device_array[pr->acpi_id] = device;
++
++ if (pr->id != -1) {
++ per_cpu(processors, pr->id) = pr;
++
++ result = acpi_processor_add_fs(device);
++ if (result)
++ goto err_free_cpumask;
++
++ sysdev = get_cpu_sysdev(pr->id);
++ if (sysdev != NULL && sysfs_create_link(&device->dev.kobj,
++ &sysdev->kobj, "sysdev")) {
++ result = -EFAULT;
++ goto err_remove_fs;
++ }
++ }
++
++ /* _PDC call should be done before doing anything else (if reqd.). */
++ xen_arch_acpi_processor_init_pdc(pr);
++ acpi_processor_set_pdc(pr);
++ arch_acpi_processor_cleanup_pdc(pr);
++
++#ifdef CONFIG_CPU_FREQ
++ xen_acpi_processor_ppc_has_changed(pr);
++ result = xen_acpi_processor_get_performance(pr);
++ if (result)
++ goto err_remove_fs;
+#endif
++
++ if (pr->id != -1) {
++ acpi_processor_get_throttling_info(pr);
++ acpi_processor_get_limit_info(pr);
++ }
++
++ xen_acpi_processor_power_init(pr, device);
++
++ if (pr->id != -1) {
++ pr->cdev = thermal_cooling_device_register("Processor", device,
++ &processor_cooling_ops);
++ if (IS_ERR(pr->cdev)) {
++ result = PTR_ERR(pr->cdev);
++ goto err_power_exit;
++ }
++
++ dev_info(&device->dev, "registered as cooling_device%d\n",
++ pr->cdev->id);
++
++ result = sysfs_create_link(&device->dev.kobj,
++ &pr->cdev->device.kobj,
++ "thermal_cooling");
++ if (result) {
++ printk(KERN_ERR PREFIX "Create sysfs link\n");
++ goto err_thermal_unregister;
++ }
++ result = sysfs_create_link(&pr->cdev->device.kobj,
++ &device->dev.kobj,
++ "device");
++ if (result) {
++ printk(KERN_ERR PREFIX "Create sysfs link\n");
++ goto err_remove_sysfs;
++ }
++ }
++
++ return 0;
++
++err_remove_sysfs:
++ sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
++err_thermal_unregister:
++ thermal_cooling_device_unregister(pr->cdev);
++err_power_exit:
++ acpi_processor_power_exit(pr, device);
++err_remove_fs:
++ acpi_processor_remove_fs(device);
++err_free_cpumask:
++ free_cpumask_var(pr->throttling.shared_cpu_map);
++
++ return result;
+}
+
-+static void drm_vfree_dma(struct drm_device *drmdev, void *addr, int npages,
-+ struct page **pages)
++static void xen_acpi_processor_notify(struct acpi_device *device, u32 event)
+{
-+#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
-+ vfree(addr);
-+#else
-+ struct device *dev = &drmdev->pdev->dev;
-+ int i;
++ struct acpi_processor *pr = acpi_driver_data(device);
++ int saved;
+
-+ for (i = 0; i < npages; i++) {
-+ void *addr = page_address(pages[i]);
-+ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
-+ }
-+ vunmap(addr);
- #endif
- }
-
--void drm_sg_cleanup(struct drm_sg_mem * entry)
-+void drm_sg_cleanup(struct drm_device *drmdev, struct drm_sg_mem * entry)
- {
- struct page *page;
- int i;
-@@ -56,7 +109,7 @@ void drm_sg_cleanup(struct drm_sg_mem * entry)
- ClearPageReserved(page);
- }
-
-- vfree(entry->virtual);
-+ drm_vfree_dma(drmdev, entry->virtual, entry->pages, entry->pagelist);
-
- kfree(entry->busaddr);
- kfree(entry->pagelist);
-@@ -107,7 +160,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request)
- }
- memset((void *)entry->busaddr, 0, pages * sizeof(*entry->busaddr));
-
-- entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT);
-+ entry->virtual = drm_vmalloc_dma(dev, pages << PAGE_SHIFT);
- if (!entry->virtual) {
- kfree(entry->busaddr);
- kfree(entry->pagelist);
-@@ -180,7 +233,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request)
- return 0;
-
- failed:
-- drm_sg_cleanup(entry);
-+ drm_sg_cleanup(dev, entry);
- return -ENOMEM;
- }
- EXPORT_SYMBOL(drm_sg_alloc);
-@@ -212,7 +265,7 @@ int drm_sg_free(struct drm_device *dev, void *data,
++ if (!pr)
++ return;
++
++ switch (event) {
++ case ACPI_PROCESSOR_NOTIFY_PERFORMANCE:
++ saved = pr->performance_platform_limit;
++ xen_acpi_processor_ppc_has_changed(pr);
++ if (saved == pr->performance_platform_limit)
++ break;
++ acpi_bus_generate_proc_event(device, event,
++ pr->performance_platform_limit);
++ acpi_bus_generate_netlink_event(device->pnp.device_class,
++ dev_name(&device->dev), event,
++ pr->performance_platform_limit);
++ break;
++ case ACPI_PROCESSOR_NOTIFY_POWER:
++ xen_acpi_processor_cst_has_changed(pr);
++ acpi_bus_generate_proc_event(device, event, 0);
++ acpi_bus_generate_netlink_event(device->pnp.device_class,
++ dev_name(&device->dev), event, 0);
++ break;
++ case ACPI_PROCESSOR_NOTIFY_THROTTLING:
++ acpi_processor_tstate_has_changed(pr);
++ acpi_bus_generate_proc_event(device, event, 0);
++ acpi_bus_generate_netlink_event(device->pnp.device_class,
++ dev_name(&device->dev), event, 0);
++ break;
++ default:
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
++ "Unsupported event [0x%x]\n", event));
++ break;
++ }
++
++ return;
++}
++
++/* from processor_idle.c */
++
++static int xen_acpi_processor_get_power_info(struct acpi_processor *pr)
++{
++ int ret;
++ int invalid_pr_id = 0;
++
++ /*
++ * acpi_processor_get_power_info() needs a valid pr->id,
++ * so set pr->id = 0 temporarily
++ */
++ if (pr->id == -1) {
++ invalid_pr_id = 1;
++ pr->id = 0;
++ }
++
++ ret = acpi_processor_get_power_info(pr);
++
++ if (invalid_pr_id)
++ pr->id = -1;
++
++ return ret;
++}
++
++int xen_acpi_processor_cst_has_changed(struct acpi_processor *pr)
++{
++ if (!pr)
++ return -EINVAL;
++
++ if (!pr->flags.power_setup_done)
++ return -ENODEV;
++
++ xen_acpi_processor_get_power_info(pr);
++
++ processor_cntl_xen_notify(pr,
++ PROCESSOR_PM_CHANGE, PM_TYPE_IDLE);
++
++ return 0;
++}
++
++
++int __cpuinit xen_acpi_processor_power_init(struct acpi_processor *pr,
++ struct acpi_device *device)
++{
++ acpi_status status = 0;
++ unsigned int i;
++
++ if (!pr)
++ return -EINVAL;
++
++ if (acpi_gbl_FADT.cst_control) {
++ status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
++ acpi_gbl_FADT.cst_control, 8);
++ if (ACPI_FAILURE(status)) {
++ ACPI_EXCEPTION((AE_INFO, status,
++ "Notifying BIOS of _CST ability failed"));
++ }
++ }
++
++ xen_acpi_processor_get_power_info(pr);
++
++ pr->flags.power_setup_done = 1;
++
++ if (pr->flags.power) {
++ processor_cntl_xen_notify(pr,
++ PROCESSOR_PM_INIT, PM_TYPE_IDLE);
++
++ printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
++ for (i = 1; i <= pr->power.count; i++)
++ if (pr->power.states[i].valid)
++ printk(" C%d[C%d]", i,
++ pr->power.states[i].type);
++ printk(")\n");
++ }
++
++ return 0;
++}
++
++/* from processor_perflib.c */
++
++#ifdef CONFIG_CPU_FREQ
++static int xen_processor_notify_smm(void)
++{
++ acpi_status status;
++ static int is_done;
++
++ /* We only need to notify the BIOS once; a double notification
++ * may lead to unexpected results. */
++ if (is_done)
++ return 0;
++
++ /* Can't write pstate_cnt to smi_cmd if either value is zero */
++ if ((!acpi_gbl_FADT.smi_command) || (!acpi_gbl_FADT.pstate_control)) {
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_cnt\n"));
++ return 0;
++ }
++
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
++ "Writing pstate_cnt [0x%x] to smi_cmd [0x%x]\n",
++ acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
++
++ status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
++ (u32) acpi_gbl_FADT.pstate_control, 8);
++ if (ACPI_FAILURE(status))
++ return status;
++
++ is_done = 1;
++
++ return 0;
++}
++
++static int xen_acpi_processor_get_platform_limit(struct acpi_processor *pr)
++{
++ acpi_status status = 0;
++ unsigned long long ppc = 0;
++
++ if (!pr)
++ return -EINVAL;
++
++ /*
++ * _PPC indicates the maximum state currently supported by the platform
++ * (e.g. 0 = states 0..n; 1 = states 1..n; etc.)
++ */
++ status = acpi_evaluate_integer(pr->handle, "_PPC", NULL, &ppc);
++
++ if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) {
++ ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PPC"));
++ return -ENODEV;
++ }
++
++ pr->performance_platform_limit = (int)ppc;
++
++ return 0;
++}
++
++int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr)
++{
++ int ret;
++
++ ret = xen_acpi_processor_get_platform_limit(pr);
++
++ if (ret < 0)
++ return ret;
++ else
++ return processor_cntl_xen_notify(pr,
++ PROCESSOR_PM_CHANGE, PM_TYPE_PERF);
++}
++
++/*
++ * The existing ACPI code parses performance states when the
++ * acpi-cpufreq driver is loaded, which is something we'd like to
++ * disable to avoid conflicts with the Xen PM logic. So we have to
++ * collect the raw performance information here, when the ACPI
++ * processor object is found and started.
++ */
++int xen_acpi_processor_get_performance(struct acpi_processor *pr)
++{
++ int ret;
++ struct acpi_processor_performance *perf;
++ struct acpi_psd_package *pdomain;
++
++ if (pr->performance)
++ return -EBUSY;
++
++ perf = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL);
++ if (!perf)
++ return -ENOMEM;
++
++ pr->performance = perf;
++ /* Get basic performance state information */
++ ret = acpi_processor_get_performance_info(pr);
++ if (ret < 0)
++ goto err_out;
++
++ /*
++ * Here we need to retrieve the performance dependency information
++ * from the _PSD object. The existing interface is not used because
++ * it sticks to the Linux cpu id to construct a bitmap, whereas we
++ * want to decouple ACPI processor objects from the Linux cpu id
++ * logic. For example, even when Linux is configured as UP, we still
++ * want to report all ACPI processor objects to Xen; in that case
++ * it is preferable to use the ACPI id instead.
++ */
++ pdomain = &pr->performance->domain_info;
++ pdomain->num_processors = 0;
++ ret = acpi_processor_get_psd(pr);
++ if (ret < 0) {
++ /*
++ * _PSD is optional - assume no coordination if absent (or
++ * broken), matching native kernels' behavior.
++ */
++ pdomain->num_entries = ACPI_PSD_REV0_ENTRIES;
++ pdomain->revision = ACPI_PSD_REV0_REVISION;
++ pdomain->domain = pr->acpi_id;
++ pdomain->coord_type = DOMAIN_COORD_TYPE_SW_ALL;
++ pdomain->num_processors = 1;
++ }
++
++ /* Some sanity checks */
++ if ((pdomain->revision != ACPI_PSD_REV0_REVISION) ||
++ (pdomain->num_entries != ACPI_PSD_REV0_ENTRIES) ||
++ ((pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ALL) &&
++ (pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ANY) &&
++ (pdomain->coord_type != DOMAIN_COORD_TYPE_HW_ALL))) {
++ ret = -EINVAL;
++ goto err_out;
++ }
++
++ /* Last step is to notify BIOS that xen exists */
++ xen_processor_notify_smm();
++
++ processor_cntl_xen_notify(pr, PROCESSOR_PM_INIT, PM_TYPE_PERF);
++
++ return 0;
++err_out:
++ pr->performance = NULL;
++ kfree(perf);
++ return ret;
++}
++#endif /* CONFIG_CPU_FREQ */
++
++/* init and exit */
++
++int xen_acpi_processor_init(void)
++{
++ return acpi_bus_register_driver(&xen_acpi_processor_driver);
++}
++
++void xen_acpi_processor_exit(void)
++{
++ acpi_bus_unregister_driver(&xen_acpi_processor_driver);
++}
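
A note on the _PPC handling above: _PPC returns the index of the highest
P-state the platform currently permits, so the number of usable states can
be derived directly from it. A minimal sketch (the helper name and bounds
handling are ours, not part of the patch):

	static int usable_pstates(int ppc, int state_count)
	{
		/* _PPC == 0 => states 0..n usable; _PPC == 1 => 1..n; etc. */
		if (ppc < 0 || ppc >= state_count)
			return 0;	/* out-of-range _PPC: nothing usable */
		return state_count - ppc;
	}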
+diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
+index 5f2c379..930b870 100644
+--- a/drivers/acpi/sleep.c
++++ b/drivers/acpi/sleep.c
+@@ -19,6 +19,8 @@
- DRM_DEBUG("virtual = %p\n", entry->virtual);
+ #include <asm/io.h>
-- drm_sg_cleanup(entry);
-+ drm_sg_cleanup(dev, entry);
++#include <xen/acpi.h>
++
+ #include <acpi/acpi_bus.h>
+ #include <acpi/acpi_drivers.h>
- return 0;
+@@ -211,6 +213,21 @@ static int acpi_suspend_begin(suspend_state_t pm_state)
+ return error;
}
-diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
-index 1c040d0..3dc8d6b 100644
---- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
-+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
-@@ -272,6 +272,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
- vma->vm_private_data = bo;
- vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
-+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
- return 0;
- out_unref:
- ttm_bo_unref(&bo);
-@@ -287,6 +288,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
- vma->vm_ops = &ttm_bo_vm_ops;
- vma->vm_private_data = ttm_bo_reference(bo);
- vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
-+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
- return 0;
- }
- EXPORT_SYMBOL(ttm_fbdev_mmap);
-diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c
-index b115726..0859bb0 100644
---- a/drivers/input/xen-kbdfront.c
-+++ b/drivers/input/xen-kbdfront.c
-@@ -21,7 +21,10 @@
- #include <linux/errno.h>
- #include <linux/module.h>
- #include <linux/input.h>
++static void do_suspend(void)
++{
++ if (!xen_pv_acpi()) {
++ do_suspend_lowlevel();
++ return;
++ }
+
- #include <asm/xen/hypervisor.h>
++ /*
++ * Xen will save and restore CPU context, so
++ * we can skip that and just go straight to
++ * the suspend.
++ */
++ acpi_enter_sleep_state(ACPI_STATE_S3);
++}
+
-+#include <xen/xen.h>
- #include <xen/events.h>
- #include <xen/page.h>
- #include <xen/interface/io/fbif.h>
-@@ -272,6 +275,8 @@ static void xenkbd_backend_changed(struct xenbus_device *dev,
- switch (backend_state) {
- case XenbusStateInitialising:
- case XenbusStateInitialised:
-+ case XenbusStateReconfiguring:
-+ case XenbusStateReconfigured:
- case XenbusStateUnknown:
- case XenbusStateClosed:
+ /**
+ * acpi_suspend_enter - Actually enter a sleep state.
+ * @pm_state: ignored
+@@ -244,7 +261,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state)
break;
-diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
-index b2f71f7..b7feb84 100644
---- a/drivers/net/Kconfig
-+++ b/drivers/net/Kconfig
-@@ -2787,6 +2787,7 @@ source "drivers/s390/net/Kconfig"
- config XEN_NETDEV_FRONTEND
- tristate "Xen network device frontend driver"
+
+ case ACPI_STATE_S3:
+- do_suspend_lowlevel();
++ do_suspend();
+ break;
+ }
+
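
The do_suspend() split above hinges on xen_pv_acpi(): when it returns true,
Xen saves and restores the CPU context itself, so the kernel can call
acpi_enter_sleep_state() directly instead of do_suspend_lowlevel(). The
helper is defined elsewhere in this series; it presumably reduces to a
PV-domain check, roughly:

	static inline int xen_pv_acpi(void)
	{
		return xen_pv_domain();	/* dom0 drives ACPI sleep via Xen */
	}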
+diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
+index 1d886e0..f4a2b10 100644
+--- a/drivers/block/Kconfig
++++ b/drivers/block/Kconfig
+@@ -462,6 +462,7 @@ config XEN_BLKDEV_FRONTEND
+ tristate "Xen virtual block device support"
depends on XEN
-+ select XEN_XENBUS_FRONTEND
default y
++ select XEN_XENBUS_FRONTEND
help
- The network device frontend driver allows the kernel to
-diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
-index baa051d..ee7465a 100644
---- a/drivers/net/xen-netfront.c
-+++ b/drivers/net/xen-netfront.c
+ This driver implements the front-end of the Xen virtual
+ block device driver. It communicates with a back-end driver
+diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
+index b8578bb..a8d30d7 100644
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
@@ -42,6 +42,7 @@
- #include <linux/mm.h>
- #include <net/ip.h>
+ #include <linux/module.h>
+ #include <linux/scatterlist.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
+ #include <xen/grant_table.h>
#include <xen/events.h>
- #include <xen/page.h>
-@@ -1393,7 +1394,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
- }
+@@ -102,6 +103,10 @@ struct blkfront_info
- /* Common code used when first setting up, and when resuming. */
--static int talk_to_backend(struct xenbus_device *dev,
-+static int talk_to_netback(struct xenbus_device *dev,
- struct netfront_info *info)
- {
- const char *message;
-@@ -1543,7 +1544,7 @@ static int xennet_connect(struct net_device *dev)
- return -ENODEV;
+ static DEFINE_SPINLOCK(blkif_io_lock);
+
++static unsigned int nr_minors;
++static unsigned long *minors;
++static DEFINE_SPINLOCK(minor_lock);
++
+ #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
+ (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
+ #define GRANT_INVALID_REF 0
+@@ -136,6 +141,55 @@ static void add_id_to_freelist(struct blkfront_info *info,
+ info->shadow_free = id;
+ }
+
++static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
++{
++ unsigned int end = minor + nr;
++ int rc;
++
++ if (end > nr_minors) {
++ unsigned long *bitmap, *old;
++
++ bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
++ GFP_KERNEL);
++ if (bitmap == NULL)
++ return -ENOMEM;
++
++ spin_lock(&minor_lock);
++ if (end > nr_minors) {
++ old = minors;
++ memcpy(bitmap, minors,
++ BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
++ minors = bitmap;
++ nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
++ } else
++ old = bitmap;
++ spin_unlock(&minor_lock);
++ kfree(old);
++ }
++
++ spin_lock(&minor_lock);
++ if (find_next_bit(minors, end, minor) >= end) {
++ for (; minor < end; ++minor)
++ __set_bit(minor, minors);
++ rc = 0;
++ } else
++ rc = -EBUSY;
++ spin_unlock(&minor_lock);
++
++ return rc;
++}
++
++static void xlbd_release_minors(unsigned int minor, unsigned int nr)
++{
++ unsigned int end = minor + nr;
++
++ BUG_ON(end > nr_minors);
++ spin_lock(&minor_lock);
++ for (; minor < end; ++minor)
++ __clear_bit(minor, minors);
++ spin_unlock(&minor_lock);
++}
++
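
xlbd_reserve_minors() above grows its bitmap with the usual
allocate-outside-the-lock, recheck-inside idiom; whichever side loses the
race frees its buffer. A stripped-down sketch of the same pattern (names
hypothetical, not part of the patch):

	static DEFINE_SPINLOCK(table_lock);
	static unsigned long *table;
	static unsigned int table_size;

	static int grow_table(unsigned int want)
	{
		unsigned long *new, *old;

		/* allocate without holding the lock... */
		new = kzalloc(BITS_TO_LONGS(want) * sizeof(*new), GFP_KERNEL);
		if (!new)
			return -ENOMEM;

		/* ...then recheck under the lock before installing it */
		spin_lock(&table_lock);
		if (want > table_size) {
			old = table;
			memcpy(new, table,
			       BITS_TO_LONGS(table_size) * sizeof(*new));
			table = new;
			table_size = BITS_TO_LONGS(want) * BITS_PER_LONG;
		} else {
			old = new;	/* someone else grew it first */
		}
		spin_unlock(&table_lock);
		kfree(old);
		return 0;
	}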
+ static void blkif_restart_queue_callback(void *arg)
+ {
+ struct blkfront_info *info = (struct blkfront_info *)arg;
+@@ -416,9 +470,14 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+ if ((minor % nr_parts) == 0)
+ nr_minors = nr_parts;
+
++ err = xlbd_reserve_minors(minor, nr_minors);
++ if (err)
++ goto out;
++ err = -ENODEV;
++
+ gd = alloc_disk(nr_minors);
+ if (gd == NULL)
+- goto out;
++ goto release;
+
+ offset = minor / nr_parts;
+
+@@ -449,7 +508,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+
+ if (xlvbd_init_blk_queue(gd, sector_size)) {
+ del_gendisk(gd);
+- goto out;
++ goto release;
}
-- err = talk_to_backend(np->xbdev, np);
-+ err = talk_to_netback(np->xbdev, np);
- if (err)
- return err;
+ info->rq = gd->queue;
+@@ -469,6 +528,8 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+
+ return 0;
+
++ release:
++ xlbd_release_minors(minor, nr_minors);
+ out:
+ return err;
+ }
+@@ -650,7 +711,7 @@ fail:
+
+
+ /* Common code used when first setting up, and when resuming. */
+-static int talk_to_backend(struct xenbus_device *dev,
++static int talk_to_blkback(struct xenbus_device *dev,
+ struct blkfront_info *info)
+ {
+ const char *message = NULL;
+@@ -755,7 +816,7 @@ static int blkfront_probe(struct xenbus_device *dev,
+ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
+ dev_set_drvdata(&dev->dev, info);
+
+- err = talk_to_backend(dev, info);
++ err = talk_to_blkback(dev, info);
+ if (err) {
+ kfree(info);
+ dev_set_drvdata(&dev->dev, NULL);
+@@ -850,7 +911,7 @@ static int blkfront_resume(struct xenbus_device *dev)
+
+ blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
+
+- err = talk_to_backend(dev, info);
++ err = talk_to_blkback(dev, info);
+ if (info->connected == BLKIF_STATE_SUSPENDED && !err)
+ err = blkif_recover(info);
+
+@@ -869,10 +930,29 @@ static void blkfront_connect(struct blkfront_info *info)
+ unsigned int binfo;
+ int err;
+
+- if ((info->connected == BLKIF_STATE_CONNECTED) ||
+- (info->connected == BLKIF_STATE_SUSPENDED) )
++ switch (info->connected) {
++ case BLKIF_STATE_CONNECTED:
++ /*
++ * Potentially, the back-end may be signalling
++ * a capacity change; update the capacity.
++ */
++ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
++ "sectors", "%Lu", §ors);
++ if (XENBUS_EXIST_ERR(err))
++ return;
++ printk(KERN_INFO "Setting capacity to %Lu\n",
++ sectors);
++ set_capacity(info->gd, sectors);
++ revalidate_disk(info->gd);
++
++ /* fall through */
++ case BLKIF_STATE_SUSPENDED:
+ return;
+
++ default:
++ break;
++ }
++
+ dev_dbg(&info->xbdev->dev, "%s:%s.\n",
+ __func__, info->xbdev->otherend);
+
+@@ -920,12 +1000,11 @@ static void blkfront_connect(struct blkfront_info *info)
+ * the backend. Once is this done, we can switch to Closed in
+ * acknowledgement.
+ */
+-static void blkfront_closing(struct xenbus_device *dev)
++static void blkfront_closing(struct blkfront_info *info)
+ {
+- struct blkfront_info *info = dev_get_drvdata(&dev->dev);
++ unsigned int minor, nr_minors;
+ unsigned long flags;
+
+- dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
+
+ if (info->rq == NULL)
+ goto out;
+@@ -945,27 +1024,33 @@ static void blkfront_closing(struct xenbus_device *dev)
+ blk_cleanup_queue(info->rq);
+ info->rq = NULL;
+
++ minor = info->gd->first_minor;
++ nr_minors = info->gd->minors;
+ del_gendisk(info->gd);
++ xlbd_release_minors(minor, nr_minors);
+
+ out:
+- xenbus_frontend_closed(dev);
++ if (info->xbdev)
++ xenbus_frontend_closed(info->xbdev);
+ }
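
The signature change above (blkfront_closing() now takes the info rather
than the xenbus device) is what lets removal be deferred while the disk is
still open: the remove and release paths coordinate through info->users and
a NULLed info->xbdev. Condensed from the hunks that follow:

	/* blkfront_remove(): device going away, possibly still open */
	if (info->users == 0)
		kfree(info);
	else
		info->xbdev = NULL;	/* orphaned; last release cleans up */

	/* blkif_release(): last opener gone, finish deferred teardown */
	if (info->xbdev == NULL) {
		blkfront_closing(info);
		kfree(info);
	}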
-@@ -1597,7 +1598,7 @@ static int xennet_connect(struct net_device *dev)
/**
* Callback received when the backend's state changes.
*/
-static void backend_changed(struct xenbus_device *dev,
-+static void netback_changed(struct xenbus_device *dev,
++static void blkback_changed(struct xenbus_device *dev,
enum xenbus_state backend_state)
{
- struct netfront_info *np = dev_get_drvdata(&dev->dev);
-@@ -1608,6 +1609,8 @@ static void backend_changed(struct xenbus_device *dev,
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+ struct block_device *bd;
+
+- dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
++ dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
+
switch (backend_state) {
case XenbusStateInitialising:
+ case XenbusStateInitWait:
case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
- case XenbusStateConnected:
case XenbusStateUnknown:
case XenbusStateClosed:
-@@ -1798,7 +1801,7 @@ static struct xenbus_driver netfront_driver = {
- .probe = netfront_probe,
- .remove = __devexit_p(xennet_remove),
- .resume = netfront_resume,
-- .otherend_changed = backend_changed,
-+ .otherend_changed = netback_changed,
- };
+ break;
+@@ -988,7 +1073,7 @@ static void backend_changed(struct xenbus_device *dev,
+ xenbus_dev_error(dev, -EBUSY,
+ "Device in use; refusing to close");
+ else
+- blkfront_closing(dev);
++ blkfront_closing(info);
+ mutex_unlock(&bd->bd_mutex);
+ bdput(bd);
+ break;
+@@ -1003,7 +1088,10 @@ static int blkfront_remove(struct xenbus_device *dev)
- static int __init netif_init(void)
-diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
-index fdc864f..7802fcd 100644
---- a/drivers/pci/Kconfig
-+++ b/drivers/pci/Kconfig
-@@ -51,6 +51,16 @@ config PCI_STUB
+ blkif_free(info, 0);
- When in doubt, say N.
+- kfree(info);
++ if (info->users == 0)
++ kfree(info);
++ else
++ info->xbdev = NULL;
-+config XEN_PCIDEV_FRONTEND
-+ tristate "Xen PCI Frontend"
-+ depends on XEN && PCI && X86
-+ select HOTPLUG
-+ select XEN_XENBUS_FRONTEND
-+ default y
-+ help
-+ The PCI device frontend driver allows the kernel to import arbitrary
-+ PCI devices from a PCI backend to support PCI driver domains.
+ return 0;
+ }
+@@ -1012,12 +1100,15 @@ static int blkfront_is_ready(struct xenbus_device *dev)
+ {
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+
+- return info->is_ready;
++ return info->is_ready && info->xbdev;
+ }
+
+ static int blkif_open(struct block_device *bdev, fmode_t mode)
+ {
+ struct blkfront_info *info = bdev->bd_disk->private_data;
+
- config HT_IRQ
- bool "Interrupts on hypertransport devices"
- default y
-diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
-index 4a7f11d..b70aa4d 100644
---- a/drivers/pci/Makefile
-+++ b/drivers/pci/Makefile
-@@ -31,6 +31,8 @@ obj-$(CONFIG_HT_IRQ) += htirq.o
- # Build Intel IOMMU support
- obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
++ if (!info->xbdev)
++ return -ENODEV;
+ info->users++;
+ return 0;
+ }
+@@ -1031,10 +1122,13 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
+ have ignored this request initially, as the device was
+ still mounted. */
+ struct xenbus_device *dev = info->xbdev;
+- enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
-+# Build Xen IOMMU support
-+obj-$(CONFIG_PCI_XEN) += xen-iommu.o
- obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
+- if (state == XenbusStateClosing && info->is_ready)
+- blkfront_closing(dev);
++ if (!dev) {
++ blkfront_closing(info);
++ kfree(info);
++ } else if (xenbus_read_driver_state(dev->otherend)
++ == XenbusStateClosing && info->is_ready)
++ blkfront_closing(info);
+ }
+ return 0;
+ }
+@@ -1061,7 +1155,7 @@ static struct xenbus_driver blkfront = {
+ .probe = blkfront_probe,
+ .remove = blkfront_remove,
+ .resume = blkfront_resume,
+- .otherend_changed = backend_changed,
++ .otherend_changed = blkback_changed,
+ .is_ready = blkfront_is_ready,
+ };
- obj-$(CONFIG_PCI_IOV) += iov.o
-@@ -60,6 +62,8 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o
+diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
+index 4dcfef0..399a017 100644
+--- a/drivers/char/agp/intel-agp.c
++++ b/drivers/char/agp/intel-agp.c
+@@ -15,8 +15,12 @@
+ * an Intel IOMMU. So make the correct use of the PCI DMA API contingent
+ * on the Intel IOMMU support (CONFIG_DMAR).
+ * Only newer chipsets need to bother with this, of course.
++ *
++ * Xen guests accessing graphics hardware also need proper translation
++ * between pseudo-physical addresses and real machine addresses, which
++ * is also achieved by using the DMA API.
+ */
+-#ifdef CONFIG_DMAR
++#if defined(CONFIG_DMAR) || defined(CONFIG_XEN)
+ #define USE_PCI_DMA_API 1
+ #endif
- obj-$(CONFIG_PCI_STUB) += pci-stub.o
+@@ -394,15 +398,19 @@ static void intel_i810_agp_enable(struct agp_bridge_data *bridge, u32 mode)
+ /* Exists to support ARGB cursors */
+ static struct page *i8xx_alloc_pages(void)
+ {
++ void *addr;
++ dma_addr_t _d;
+ struct page *page;
-+obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
-+
- ifeq ($(CONFIG_PCI_DEBUG),y)
- EXTRA_CFLAGS += -DDEBUG
- endif
-diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
-index cef28a7..1940183 100644
---- a/drivers/pci/bus.c
-+++ b/drivers/pci/bus.c
-@@ -249,6 +249,7 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
- up_read(&pci_bus_sem);
- }
+- page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2);
+- if (page == NULL)
++ addr = dma_alloc_coherent(NULL, 4 * PAGE_SIZE, &_d, GFP_KERNEL);
++ if (addr == NULL)
+ return NULL;
-+EXPORT_SYMBOL_GPL(pci_walk_bus);
- EXPORT_SYMBOL(pci_bus_alloc_resource);
- EXPORT_SYMBOL_GPL(pci_bus_add_device);
- EXPORT_SYMBOL(pci_bus_add_devices);
-diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
-index 5753036..8e6e6d1 100644
---- a/drivers/pci/dmar.c
-+++ b/drivers/pci/dmar.c
-@@ -673,10 +673,13 @@ void __init detect_intel_iommu(void)
- "x2apic and Intr-remapping.\n");
- #endif
- #ifdef CONFIG_DMAR
-- if (ret && !no_iommu && !iommu_detected && !swiotlb &&
-- !dmar_disabled)
-+ if (ret && !no_iommu && !iommu_detected && !dmar_disabled)
- iommu_detected = 1;
- #endif
-+#ifdef CONFIG_X86
-+ if (ret)
-+ x86_init.iommu.iommu_init = intel_iommu_init;
-+#endif
++ page = virt_to_page(addr);
++
+ if (set_pages_uc(page, 4) < 0) {
+ set_pages_wb(page, 4);
+- __free_pages(page, 2);
++ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, _d);
+ return NULL;
}
- early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
- dmar_tbl = NULL;
-diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
-index 2498602..fd89530 100644
---- a/drivers/pci/intel-iommu.c
-+++ b/drivers/pci/intel-iommu.c
-@@ -3282,7 +3282,7 @@ int __init intel_iommu_init(void)
- * Check the need for DMA-remapping initialization now.
- * Above initialization will also be used by Interrupt-remapping.
- */
-- if (no_iommu || swiotlb || dmar_disabled)
-+ if (no_iommu || dmar_disabled)
- return -ENODEV;
+ get_page(page);
+@@ -412,12 +420,17 @@ static struct page *i8xx_alloc_pages(void)
- iommu_init_mempool();
-@@ -3303,7 +3303,9 @@ int __init intel_iommu_init(void)
- "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
+ static void i8xx_destroy_pages(struct page *page)
+ {
++ void *addr;
++
+ if (page == NULL)
+ return;
- init_timer(&unmap_timer);
-- force_iommu = 1;
-+#ifdef CONFIG_SWIOTLB
-+ swiotlb = 0;
-+#endif
- dma_ops = &intel_dma_ops;
+ set_pages_wb(page, 4);
+ put_page(page);
+- __free_pages(page, 2);
++
++ addr = page_address(page);
++
++ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, virt_to_bus(addr));
+ atomic_dec(&agp_bridge->current_memory_agp);
+ }
- init_iommu_sysfs();
-diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
-index f9cf317..a77a46f 100644
---- a/drivers/pci/msi.c
-+++ b/drivers/pci/msi.c
-@@ -19,6 +19,9 @@
- #include <linux/errno.h>
- #include <linux/io.h>
+diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
+index a6ee32b..5be0dd3 100644
+--- a/drivers/char/hvc_xen.c
++++ b/drivers/char/hvc_xen.c
+@@ -25,6 +25,8 @@
+ #include <linux/types.h>
-+#include <asm/xen/hypercall.h>
-+#include <asm/xen/hypervisor.h>
+ #include <asm/xen/hypervisor.h>
+
- #include "pci.h"
- #include "msi.h"
++#include <xen/xen.h>
+ #include <xen/page.h>
+ #include <xen/events.h>
+ #include <xen/interface/io/console.h>
+@@ -76,7 +78,7 @@ static int __write_console(const char *data, int len)
+ return sent;
+ }
-@@ -366,6 +369,20 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
+-static int write_console(uint32_t vtermno, const char *data, int len)
++static int domU_write_console(uint32_t vtermno, const char *data, int len)
+ {
+ int ret = len;
- void pci_restore_msi_state(struct pci_dev *dev)
+@@ -99,7 +101,7 @@ static int write_console(uint32_t vtermno, const char *data, int len)
+ return ret;
+ }
+
+-static int read_console(uint32_t vtermno, char *buf, int len)
++static int domU_read_console(uint32_t vtermno, char *buf, int len)
{
-+ if (xen_initial_domain()) {
-+ struct physdev_restore_msi physdev;
-+
-+ if (!dev->msi_enabled && !dev->msix_enabled)
-+ return;
-+
-+ pci_intx_for_msi(dev, 0);
+ struct xencons_interface *intf = xencons_interface();
+ XENCONS_RING_IDX cons, prod;
+@@ -120,28 +122,63 @@ static int read_console(uint32_t vtermno, char *buf, int len)
+ return recv;
+ }
+
+-static struct hv_ops hvc_ops = {
+- .get_chars = read_console,
+- .put_chars = write_console,
++static struct hv_ops domU_hvc_ops = {
++ .get_chars = domU_read_console,
++ .put_chars = domU_write_console,
++ .notifier_add = notifier_add_irq,
++ .notifier_del = notifier_del_irq,
++ .notifier_hangup = notifier_hangup_irq,
++};
+
-+ physdev.bus = dev->bus->number;
-+ physdev.devfn = dev->devfn;
-+ HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &physdev);
++static int dom0_read_console(uint32_t vtermno, char *buf, int len)
++{
++ return HYPERVISOR_console_io(CONSOLEIO_read, len, buf);
++}
+
-+ return;
-+ }
- __pci_restore_msi_state(dev);
- __pci_restore_msix_state(dev);
- }
-diff --git a/drivers/pci/xen-iommu.c b/drivers/pci/xen-iommu.c
-new file mode 100644
-index 0000000..ac6bcdb
---- /dev/null
-+++ b/drivers/pci/xen-iommu.c
-@@ -0,0 +1,271 @@
-+#include <linux/types.h>
-+#include <linux/mm.h>
-+#include <linux/string.h>
-+#include <linux/pci.h>
-+#include <linux/module.h>
-+#include <linux/version.h>
-+#include <linux/scatterlist.h>
-+#include <linux/io.h>
-+#include <linux/bug.h>
++/*
++ * Used either by dom0 to write to the system console, or by a domU
++ * running on a debug build of Xen
++ */
++static int dom0_write_console(uint32_t vtermno, const char *str, int len)
++{
++ int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
++ if (rc < 0)
++ return 0;
+
-+#include <xen/interface/xen.h>
-+#include <xen/grant_table.h>
-+#include <xen/page.h>
-+#include <xen/xen-ops.h>
++ return len;
++}
+
-+#include <asm/iommu.h>
-+#include <asm/swiotlb.h>
-+#include <asm/tlbflush.h>
++static struct hv_ops dom0_hvc_ops = {
++ .get_chars = dom0_read_console,
++ .put_chars = dom0_write_console,
+ .notifier_add = notifier_add_irq,
+ .notifier_del = notifier_del_irq,
+ .notifier_hangup = notifier_hangup_irq,
+ };
+
+-static int __init xen_init(void)
++static int __init xen_hvc_init(void)
+ {
+ struct hvc_struct *hp;
++ struct hv_ops *ops;
+
+- if (!xen_pv_domain() ||
+- xen_initial_domain() ||
+- !xen_start_info->console.domU.evtchn)
++ if (!xen_pv_domain())
+ return -ENODEV;
+
+- xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
++ if (xen_initial_domain()) {
++ ops = &dom0_hvc_ops;
++ xencons_irq = bind_virq_to_irq(VIRQ_CONSOLE, 0);
++ } else {
++ if (!xen_start_info->console.domU.evtchn)
++ return -ENODEV;
+
-+#define IOMMU_BUG_ON(test) \
-+do { \
-+ if (unlikely(test)) { \
-+ printk(KERN_ALERT "Fatal DMA error! " \
-+ "Please use 'swiotlb=force'\n"); \
-+ BUG(); \
-+ } \
-+} while (0)
++ ops = &domU_hvc_ops;
++ xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
++ }
+
-+/* Print address range with message */
-+#define PAR(msg, addr, size) \
-+do { \
-+ printk(msg "[%#llx - %#llx]\n", \
-+ (unsigned long long)addr, \
-+ (unsigned long long)addr + size); \
-+} while (0)
+ if (xencons_irq < 0)
+ xencons_irq = 0; /* NO_IRQ */
+
+- hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
++ hp = hvc_alloc(HVC_COOKIE, xencons_irq, ops, 256);
+ if (IS_ERR(hp))
+ return PTR_ERR(hp);
+
+@@ -158,7 +195,7 @@ void xen_console_resume(void)
+ rebind_evtchn_irq(xen_start_info->console.domU.evtchn, xencons_irq);
+ }
+
+-static void __exit xen_fini(void)
++static void __exit xen_hvc_fini(void)
+ {
+ if (hvc)
+ hvc_remove(hvc);
+@@ -166,29 +203,24 @@ static void __exit xen_fini(void)
+
+ static int xen_cons_init(void)
+ {
++ struct hv_ops *ops;
+
-+static inline int address_needs_mapping(struct device *hwdev,
-+ dma_addr_t addr)
-+{
-+ dma_addr_t mask = DMA_BIT_MASK(32);
-+ int ret;
+ if (!xen_pv_domain())
+ return 0;
+
+- hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
++ ops = &domU_hvc_ops;
++ if (xen_initial_domain())
++ ops = &dom0_hvc_ops;
+
-+ /* If the device has a mask, use it, otherwise default to 32 bits */
-+ if (hwdev)
-+ mask = *hwdev->dma_mask;
++ hvc_instantiate(HVC_COOKIE, 0, ops);
+
-+ ret = (addr & ~mask) != 0;
+ return 0;
+ }
+
+-module_init(xen_init);
+-module_exit(xen_fini);
++module_init(xen_hvc_init);
++module_exit(xen_hvc_fini);
+ console_initcall(xen_cons_init);
+
+-static void raw_console_write(const char *str, int len)
+-{
+- while(len > 0) {
+- int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
+- if (rc <= 0)
+- break;
+-
+- str += rc;
+- len -= rc;
+- }
+-}
+-
+ #ifdef CONFIG_EARLY_PRINTK
+ static void xenboot_write_console(struct console *console, const char *string,
+ unsigned len)
+@@ -196,19 +228,22 @@ static void xenboot_write_console(struct console *console, const char *string,
+ unsigned int linelen, off = 0;
+ const char *pos;
+
+- raw_console_write(string, len);
++ dom0_write_console(0, string, len);
+
-+ if (ret) {
-+ printk(KERN_ERR "dma address needs mapping\n");
-+ printk(KERN_ERR "mask: %#llx\n address: [%#llx]\n", mask, addr);
++ if (xen_initial_domain())
++ return;
+
+- write_console(0, "(early) ", 8);
++ domU_write_console(0, "(early) ", 8);
+ while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
+ linelen = pos-string+off;
+ if (off + linelen > len)
+ break;
+- write_console(0, string+off, linelen);
+- write_console(0, "\r\n", 2);
++ domU_write_console(0, string+off, linelen);
++ domU_write_console(0, "\r\n", 2);
+ off += linelen + 1;
+ }
+ if (off < len)
+- write_console(0, string+off, len-off);
++ domU_write_console(0, string+off, len-off);
+ }
+
+ struct console xenboot_console = {
+@@ -220,7 +255,7 @@ struct console xenboot_console = {
+
+ void xen_raw_console_write(const char *str)
+ {
+- raw_console_write(str, strlen(str));
++ dom0_write_console(0, str, strlen(str));
+ }
+
+ void xen_raw_printk(const char *fmt, ...)
+diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
+index a75ca63..bdc26b9 100644
+--- a/drivers/gpu/drm/drm_drv.c
++++ b/drivers/gpu/drm/drm_drv.c
+@@ -201,7 +201,7 @@ int drm_lastclose(struct drm_device * dev)
+ }
+ if (drm_core_check_feature(dev, DRIVER_SG) && dev->sg &&
+ !drm_core_check_feature(dev, DRIVER_MODESET)) {
+- drm_sg_cleanup(dev->sg);
++ drm_sg_cleanup(dev, dev->sg);
+ dev->sg = NULL;
+ }
+
+diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
+index 8bf3770..dde5f66 100644
+--- a/drivers/gpu/drm/drm_gem.c
++++ b/drivers/gpu/drm/drm_gem.c
+@@ -539,7 +539,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND;
+ vma->vm_ops = obj->dev->driver->gem_vm_ops;
+ vma->vm_private_data = map->handle;
+- vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
++ vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+
+ /* Take a ref for this mapping of the object, so that the fault
+ * handler can dereference the mmap offset's pointer to the object.
+diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c
+index c7823c8..95ffb8a 100644
+--- a/drivers/gpu/drm/drm_scatter.c
++++ b/drivers/gpu/drm/drm_scatter.c
+@@ -32,20 +32,73 @@
+ */
+
+ #include <linux/vmalloc.h>
++#include <linux/mm.h>
+ #include "drmP.h"
+
+ #define DEBUG_SCATTER 0
+
+-static inline void *drm_vmalloc_dma(unsigned long size)
++static void *drm_vmalloc_dma(struct drm_device *drmdev, unsigned long size)
+ {
+ #if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
+ return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL | _PAGE_NO_CACHE);
+ #else
+- return vmalloc_32(size);
++ struct device *dev = &drmdev->pdev->dev;
++ struct page **pages;
++ void *addr;
++ const int npages = PFN_UP(size);
++ int i;
++
++ pages = kmalloc(npages * sizeof(*pages), GFP_KERNEL);
++ if (!pages)
++ goto fail;
++
++ for (i = 0; i < npages; i++) {
++ dma_addr_t phys;
++ void *addr;
++ addr = dma_alloc_coherent(dev, PAGE_SIZE, &phys, GFP_KERNEL);
++ if (addr == NULL)
++ goto out_free_pages;
++
++ pages[i] = virt_to_page(addr);
+ }
-+ return ret;
-+}
+
-+static int check_pages_physically_contiguous(unsigned long pfn,
-+ unsigned int offset,
-+ size_t length)
-+{
-+ unsigned long next_mfn;
-+ int i;
-+ int nr_pages;
++ addr = vmap(pages, npages, VM_MAP | VM_IOREMAP, PAGE_KERNEL);
+
-+ next_mfn = pfn_to_mfn(pfn);
-+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
++ kfree(pages);
+
-+ for (i = 1; i < nr_pages; i++) {
-+ if (pfn_to_mfn(++pfn) != ++next_mfn)
-+ return 0;
++ return addr;
++
++out_free_pages:
++ while (i > 0) {
++ void *addr = page_address(pages[--i]);
++ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
+ }
-+ return 1;
-+}
+
-+static int range_straddles_page_boundary(phys_addr_t p, size_t size)
-+{
-+ unsigned long pfn = PFN_DOWN(p);
-+ unsigned int offset = p & ~PAGE_MASK;
++ kfree(pages);
+
-+ if (offset + size <= PAGE_SIZE)
-+ return 0;
-+ if (check_pages_physically_contiguous(pfn, offset, size))
-+ return 0;
-+ return 1;
++fail:
++ return NULL;
++#endif
+}
+
-+static inline void xen_dma_unmap_page(struct page *page)
++static void drm_vfree_dma(struct drm_device *drmdev, void *addr, int npages,
++ struct page **pages)
+{
-+ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_unmap_page here
-+ * to deal with foreign pages. We'll need similar logic here at
++#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
++ vfree(addr);
++#else
++ struct device *dev = &drmdev->pdev->dev;
++ int i;
++
++ for (i = 0; i < npages; i++) {
++ void *addr = page_address(pages[i]);
++ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
++ }
++ vunmap(addr);
+ #endif
+ }
+
+-void drm_sg_cleanup(struct drm_sg_mem * entry)
++void drm_sg_cleanup(struct drm_device *drmdev, struct drm_sg_mem * entry)
+ {
+ struct page *page;
+ int i;
+@@ -56,7 +109,7 @@ void drm_sg_cleanup(struct drm_sg_mem * entry)
+ ClearPageReserved(page);
+ }
+
+- vfree(entry->virtual);
++ drm_vfree_dma(drmdev, entry->virtual, entry->pages, entry->pagelist);
+
+ kfree(entry->busaddr);
+ kfree(entry->pagelist);
+@@ -107,7 +160,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request)
+ }
+ memset((void *)entry->busaddr, 0, pages * sizeof(*entry->busaddr));
+
+- entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT);
++ entry->virtual = drm_vmalloc_dma(dev, pages << PAGE_SHIFT);
+ if (!entry->virtual) {
+ kfree(entry->busaddr);
+ kfree(entry->pagelist);
+@@ -180,7 +233,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request)
+ return 0;
+
+ failed:
+- drm_sg_cleanup(entry);
++ drm_sg_cleanup(dev, entry);
+ return -ENOMEM;
+ }
+ EXPORT_SYMBOL(drm_sg_alloc);
+@@ -212,7 +265,7 @@ int drm_sg_free(struct drm_device *dev, void *data,
+
+ DRM_DEBUG("virtual = %p\n", entry->virtual);
+
+- drm_sg_cleanup(entry);
++ drm_sg_cleanup(dev, entry);
+
+ return 0;
+ }
+diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
+index 1c040d0..3dc8d6b 100644
+--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
++++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
+@@ -272,6 +272,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
+
+ vma->vm_private_data = bo;
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+ return 0;
+ out_unref:
+ ttm_bo_unref(&bo);
+@@ -287,6 +288,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
+ vma->vm_ops = &ttm_bo_vm_ops;
+ vma->vm_private_data = ttm_bo_reference(bo);
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+ return 0;
+ }
+ EXPORT_SYMBOL(ttm_fbdev_mmap);
+diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c
+index b115726..0859bb0 100644
+--- a/drivers/input/xen-kbdfront.c
++++ b/drivers/input/xen-kbdfront.c
+@@ -21,7 +21,10 @@
+ #include <linux/errno.h>
+ #include <linux/module.h>
+ #include <linux/input.h>
++
+ #include <asm/xen/hypervisor.h>
++
++#include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+ #include <xen/interface/io/fbif.h>
+@@ -272,6 +275,8 @@ static void xenkbd_backend_changed(struct xenbus_device *dev,
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
++ case XenbusStateReconfiguring:
++ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
+diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
+index b2f71f7..b7feb84 100644
+--- a/drivers/net/Kconfig
++++ b/drivers/net/Kconfig
+@@ -2787,6 +2787,7 @@ source "drivers/s390/net/Kconfig"
+ config XEN_NETDEV_FRONTEND
+ tristate "Xen network device frontend driver"
+ depends on XEN
++ select XEN_XENBUS_FRONTEND
+ default y
+ help
+ The network device frontend driver allows the kernel to
+diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
+index baa051d..ee7465a 100644
+--- a/drivers/net/xen-netfront.c
++++ b/drivers/net/xen-netfront.c
+@@ -42,6 +42,7 @@
+ #include <linux/mm.h>
+ #include <net/ip.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+@@ -1393,7 +1394,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
+ }
+
+ /* Common code used when first setting up, and when resuming. */
+-static int talk_to_backend(struct xenbus_device *dev,
++static int talk_to_netback(struct xenbus_device *dev,
+ struct netfront_info *info)
+ {
+ const char *message;
+@@ -1543,7 +1544,7 @@ static int xennet_connect(struct net_device *dev)
+ return -ENODEV;
+ }
+
+- err = talk_to_backend(np->xbdev, np);
++ err = talk_to_netback(np->xbdev, np);
+ if (err)
+ return err;
+
+@@ -1597,7 +1598,7 @@ static int xennet_connect(struct net_device *dev)
+ /**
+ * Callback received when the backend's state changes.
+ */
+-static void backend_changed(struct xenbus_device *dev,
++static void netback_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+ {
+ struct netfront_info *np = dev_get_drvdata(&dev->dev);
+@@ -1608,6 +1609,8 @@ static void backend_changed(struct xenbus_device *dev,
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
++ case XenbusStateReconfiguring:
++ case XenbusStateReconfigured:
+ case XenbusStateConnected:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+@@ -1798,7 +1801,7 @@ static struct xenbus_driver netfront_driver = {
+ .probe = netfront_probe,
+ .remove = __devexit_p(xennet_remove),
+ .resume = netfront_resume,
+- .otherend_changed = backend_changed,
++ .otherend_changed = netback_changed,
+ };
+
+ static int __init netif_init(void)
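
netback_changed() above, like the blkback and xenkbd handlers earlier,
treats the new Reconfiguring/Reconfigured backend states as no-ops: a
frontend only acts once the backend reaches Connected or Closing. A
condensed sketch of that convention (hypothetical helper, not in the
patch):

	static bool frontend_should_react(enum xenbus_state backend)
	{
		/* transitional and idle states are deliberately ignored */
		return backend == XenbusStateConnected ||
		       backend == XenbusStateClosing;
	}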
+diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
+index fdc864f..7802fcd 100644
+--- a/drivers/pci/Kconfig
++++ b/drivers/pci/Kconfig
+@@ -51,6 +51,16 @@ config PCI_STUB
+
+ When in doubt, say N.
+
++config XEN_PCIDEV_FRONTEND
++ tristate "Xen PCI Frontend"
++ depends on XEN && PCI && X86
++ select HOTPLUG
++ select XEN_XENBUS_FRONTEND
++ default y
++ help
++ The PCI device frontend driver allows the kernel to import arbitrary
++ PCI devices from a PCI backend to support PCI driver domains.
++
+ config HT_IRQ
+ bool "Interrupts on hypertransport devices"
+ default y
+diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
+index 4a7f11d..b70aa4d 100644
+--- a/drivers/pci/Makefile
++++ b/drivers/pci/Makefile
+@@ -31,6 +31,8 @@ obj-$(CONFIG_HT_IRQ) += htirq.o
+ # Build Intel IOMMU support
+ obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
+
++# Build Xen IOMMU support
++obj-$(CONFIG_PCI_XEN) += xen-iommu.o
+ obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
+
+ obj-$(CONFIG_PCI_IOV) += iov.o
+@@ -60,6 +62,8 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o
+
+ obj-$(CONFIG_PCI_STUB) += pci-stub.o
+
++obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
++
+ ifeq ($(CONFIG_PCI_DEBUG),y)
+ EXTRA_CFLAGS += -DDEBUG
+ endif
+diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
+index cef28a7..1940183 100644
+--- a/drivers/pci/bus.c
++++ b/drivers/pci/bus.c
+@@ -249,6 +249,7 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
+ up_read(&pci_bus_sem);
+ }
+
++EXPORT_SYMBOL_GPL(pci_walk_bus);
+ EXPORT_SYMBOL(pci_bus_alloc_resource);
+ EXPORT_SYMBOL_GPL(pci_bus_add_device);
+ EXPORT_SYMBOL(pci_bus_add_devices);
+diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
+index 5753036..8e6e6d1 100644
+--- a/drivers/pci/dmar.c
++++ b/drivers/pci/dmar.c
+@@ -673,10 +673,13 @@ void __init detect_intel_iommu(void)
+ "x2apic and Intr-remapping.\n");
+ #endif
+ #ifdef CONFIG_DMAR
+- if (ret && !no_iommu && !iommu_detected && !swiotlb &&
+- !dmar_disabled)
++ if (ret && !no_iommu && !iommu_detected && !dmar_disabled)
+ iommu_detected = 1;
+ #endif
++#ifdef CONFIG_X86
++ if (ret)
++ x86_init.iommu.iommu_init = intel_iommu_init;
++#endif
+ }
+ early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
+ dmar_tbl = NULL;
+diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
+index 2498602..fd89530 100644
+--- a/drivers/pci/intel-iommu.c
++++ b/drivers/pci/intel-iommu.c
+@@ -3282,7 +3282,7 @@ int __init intel_iommu_init(void)
+ * Check the need for DMA-remapping initialization now.
+ * Above initialization will also be used by Interrupt-remapping.
+ */
+- if (no_iommu || swiotlb || dmar_disabled)
++ if (no_iommu || dmar_disabled)
+ return -ENODEV;
+
+ iommu_init_mempool();
+@@ -3303,7 +3303,9 @@ int __init intel_iommu_init(void)
+ "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
+
+ init_timer(&unmap_timer);
+- force_iommu = 1;
++#ifdef CONFIG_SWIOTLB
++ swiotlb = 0;
++#endif
+ dma_ops = &intel_dma_ops;
+
+ init_iommu_sysfs();
+diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
+index f9cf317..a77a46f 100644
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -19,6 +19,9 @@
+ #include <linux/errno.h>
+ #include <linux/io.h>
+
++#include <asm/xen/hypercall.h>
++#include <asm/xen/hypervisor.h>
++
+ #include "pci.h"
+ #include "msi.h"
+
+@@ -366,6 +369,20 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
+
+ void pci_restore_msi_state(struct pci_dev *dev)
+ {
++ if (xen_initial_domain()) {
++ struct physdev_restore_msi physdev;
++
++ if (!dev->msi_enabled && !dev->msix_enabled)
++ return;
++
++ pci_intx_for_msi(dev, 0);
++
++ physdev.bus = dev->bus->number;
++ physdev.devfn = dev->devfn;
++ HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &physdev);
++
++ return;
++ }
+ __pci_restore_msi_state(dev);
+ __pci_restore_msix_state(dev);
+ }
+diff --git a/drivers/pci/xen-iommu.c b/drivers/pci/xen-iommu.c
+new file mode 100644
+index 0000000..ac6bcdb
+--- /dev/null
++++ b/drivers/pci/xen-iommu.c
+@@ -0,0 +1,271 @@
++#include <linux/types.h>
++#include <linux/mm.h>
++#include <linux/string.h>
++#include <linux/pci.h>
++#include <linux/module.h>
++#include <linux/version.h>
++#include <linux/scatterlist.h>
++#include <linux/io.h>
++#include <linux/bug.h>
++
++#include <xen/interface/xen.h>
++#include <xen/grant_table.h>
++#include <xen/page.h>
++#include <xen/xen-ops.h>
++
++#include <asm/iommu.h>
++#include <asm/swiotlb.h>
++#include <asm/tlbflush.h>
++
++#define IOMMU_BUG_ON(test) \
++do { \
++ if (unlikely(test)) { \
++ printk(KERN_ALERT "Fatal DMA error! " \
++ "Please use 'swiotlb=force'\n"); \
++ BUG(); \
++ } \
++} while (0)
++
++/* Print address range with message */
++#define PAR(msg, addr, size) \
++do { \
++ printk(msg "[%#llx - %#llx]\n", \
++ (unsigned long long)addr, \
++ (unsigned long long)addr + size); \
++} while (0)
++
++static inline int address_needs_mapping(struct device *hwdev,
++ dma_addr_t addr)
++{
++ dma_addr_t mask = DMA_BIT_MASK(32);
++ int ret;
++
++ /* If the device has a mask, use it, otherwise default to 32 bits */
++ if (hwdev)
++ mask = *hwdev->dma_mask;
++
++ ret = (addr & ~mask) != 0;
++
++ if (ret) {
++ printk(KERN_ERR "dma address needs mapping\n");
++ printk(KERN_ERR "mask: %#llx\n address: [%#llx]\n", mask, addr);
++ }
++ return ret;
++}
++
++static int check_pages_physically_contiguous(unsigned long pfn,
++ unsigned int offset,
++ size_t length)
++{
++ unsigned long next_mfn;
++ int i;
++ int nr_pages;
++
++ next_mfn = pfn_to_mfn(pfn);
++ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
++
++ for (i = 1; i < nr_pages; i++) {
++ if (pfn_to_mfn(++pfn) != ++next_mfn)
++ return 0;
++ }
++ return 1;
++}
++
++static int range_straddles_page_boundary(phys_addr_t p, size_t size)
++{
++ unsigned long pfn = PFN_DOWN(p);
++ unsigned int offset = p & ~PAGE_MASK;
++
++ if (offset + size <= PAGE_SIZE)
++ return 0;
++ if (check_pages_physically_contiguous(pfn, offset, size))
++ return 0;
++ return 1;
++}
++
++static inline void xen_dma_unmap_page(struct page *page)
++{
++ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_unmap_page here
++ * to deal with foreign pages. We'll need similar logic here at
+ * some point.
+ */
+}
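
range_straddles_page_boundary() above decides whether a multi-page buffer
is safe for DMA without bounce-buffering: its pfns must map to consecutive
mfns. A worked example with made-up numbers:

	/* phys = 0x10800, size = 0x1000 -> pfn 0x10, offset 0x800, 2 pages.
	 * If pfn_to_mfn(0x10) == 0x2a0 and pfn_to_mfn(0x11) == 0x2a1, the
	 * buffer is machine-contiguous: no straddle, returns 0.
	 * If pfn_to_mfn(0x11) == 0x5f3 instead, the pages are scattered in
	 * machine memory and the function returns 1, so the caller must
	 * map or bounce it (hence the 'swiotlb=force' hint in IOMMU_BUG_ON).
	 */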
@@ -7315,7 +8405,7 @@
case XenbusStateClosed:
break;
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
-index cab100a..c4f36b7 100644
+index cab100a..d6d6f3e 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -28,6 +28,110 @@ config XEN_DEV_EVTCHN
@@ -7422,64 +8512,528 @@
+
+endchoice
+
-+config XEN_PCIDEV_BE_DEBUG
-+ bool "PCI Backend Debugging"
-+ depends on XEN_PCIDEV_BACKEND
++config XEN_PCIDEV_BE_DEBUG
++ bool "PCI Backend Debugging"
++ depends on XEN_PCIDEV_BACKEND
++
+ config XENFS
+ tristate "Xen filesystem"
+ depends on XEN
+@@ -60,4 +164,23 @@ config XEN_SYS_HYPERVISOR
+ Create entries under /sys/hypervisor describing the Xen
+ hypervisor environment. When running native or in another
+ virtual environment, /sys/hypervisor will still be present,
+- but will have no xen contents.
+\ No newline at end of file
++ but will have no xen contents.
++
++config XEN_XENBUS_FRONTEND
++ tristate
++
++config XEN_GNTDEV
++ tristate "userspace grant access device driver"
++ depends on XEN
++ select MMU_NOTIFIER
++ help
++ Allows userspace processes to use grants.
++
++config XEN_S3
++ def_bool y
++ depends on XEN_DOM0 && ACPI
++
++config ACPI_PROCESSOR_XEN
++ tristate
++ depends on XEN_DOM0 && ACPI_PROCESSOR && CPU_FREQ
++ default y
+diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
+index 7c28434..23bc06e 100644
+--- a/drivers/xen/Makefile
++++ b/drivers/xen/Makefile
+@@ -1,12 +1,24 @@
+-obj-y += grant-table.o features.o events.o manage.o
++obj-y += grant-table.o features.o events.o manage.o biomerge.o pcpu.o
+ obj-y += xenbus/
+
+ nostackp := $(call cc-option, -fno-stack-protector)
+ CFLAGS_features.o := $(nostackp)
+
+-obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
+-obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
+-obj-$(CONFIG_XEN_BALLOON) += balloon.o
+-obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
+-obj-$(CONFIG_XENFS) += xenfs/
+-obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
+\ No newline at end of file
++obj-$(CONFIG_PCI) += pci.o
++obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
++obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
++obj-$(CONFIG_XEN_BALLOON) += balloon.o
++obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
++obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/
++obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
++obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
++obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
++obj-$(CONFIG_XENFS) += xenfs/
++obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
++obj-$(CONFIG_XEN_S3) += acpi.o
++obj-$(CONFIG_ACPI_PROCESSOR_XEN) += acpi_processor.o
++obj-$(CONFIG_ACPI_HOTPLUG_MEMORY) += xen_acpi_memhotplug.o
++
++xen-evtchn-y := evtchn.o
++xen-gntdev-y := gntdev.o
+diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c
+new file mode 100644
+index 0000000..e6d3d0e
+--- /dev/null
++++ b/drivers/xen/acpi.c
+@@ -0,0 +1,23 @@
++#include <xen/acpi.h>
++
++#include <xen/interface/platform.h>
++#include <asm/xen/hypercall.h>
++#include <asm/xen/hypervisor.h>
++
++int acpi_notify_hypervisor_state(u8 sleep_state,
++ u32 pm1a_cnt, u32 pm1b_cnt)
++{
++ struct xen_platform_op op = {
++ .cmd = XENPF_enter_acpi_sleep,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ .u = {
++ .enter_acpi_sleep = {
++ .pm1a_cnt_val = (u16)pm1a_cnt,
++ .pm1b_cnt_val = (u16)pm1b_cnt,
++ .sleep_state = sleep_state,
++ },
++ },
++ };
++
++ return HYPERVISOR_dom0_op(&op);
++}
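
acpi_notify_hypervisor_state() above forwards the PM1a/PM1b control values
to Xen via XENPF_enter_acpi_sleep instead of letting ACPICA write the ports
itself; under dom0 only the hypervisor can safely put the machine into S3.
A hedged sketch of the kind of call site the rest of this series wires into
the ACPICA sleep path (illustrative, not the exact hunk):

	/* instead of writing the PM1A/PM1B control registers directly: */
	if (xen_pv_acpi())
		return acpi_notify_hypervisor_state(sleep_state,
						    pm1a_control,
						    pm1b_control);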
+diff --git a/drivers/xen/acpi_processor.c b/drivers/xen/acpi_processor.c
+new file mode 100644
+index 0000000..77be04b
+--- /dev/null
++++ b/drivers/xen/acpi_processor.c
+@@ -0,0 +1,417 @@
++/*
++ * acpi_processor.c - interface for notifying Xen about ACPI processor
++ * object info parsing
++ *
++ * Copyright (C) 2008, Intel corporation
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or (at
++ * your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with this program; if not, write to the Free Software Foundation, Inc.,
++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
++ *
++ */
++
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/acpi.h>
++#include <linux/pm.h>
++#include <linux/cpu.h>
++
++#include <linux/cpufreq.h>
++#include <acpi/processor.h>
++#include <xen/acpi.h>
++#include <xen/pcpu.h>
++
++#include <asm/xen/hypercall.h>
++#include <asm/xen/hypervisor.h>
++
++static int xen_hotplug_notifier(struct acpi_processor *pr, int event);
++
++static struct processor_cntl_xen_ops xen_ops = {
++ .hotplug = xen_hotplug_notifier,
++};
++
++static struct acpi_power_register *power_registers[XEN_MAX_ACPI_ID + 1];
++
++int processor_cntl_xen_power_cache(int cpu, int cx,
++ struct acpi_power_register *reg)
++{
++ struct acpi_power_register *buf;
++
++ if (cpu < 0 || cpu > XEN_MAX_ACPI_ID ||
++ cx < 1 || cx > ACPI_PROCESSOR_MAX_POWER) {
++ return -EINVAL;
++ }
++
++ if (power_registers[cpu] == NULL) {
++ buf = kzalloc(ACPI_PROCESSOR_MAX_POWER *
++ sizeof(struct xen_processor_cx), GFP_KERNEL);
++ if (buf == NULL)
++ return -ENOMEM;
++
++ power_registers[cpu] = buf;
++ }
++
++ memcpy(power_registers[cpu]+cx-1, reg, sizeof(*reg));
++
++ return 0;
++}
++EXPORT_SYMBOL(processor_cntl_xen_power_cache);
++
++#ifdef CONFIG_ACPI_HOTPLUG_CPU
++static int xen_get_apic_id(acpi_handle handle)
++{
++ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
++ union acpi_object *obj;
++ struct acpi_madt_local_apic *lapic;
++ u8 physid;
++
++ if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
++ return -EINVAL;
++
++ if (!buffer.length || !buffer.pointer)
++ return -EINVAL;
++
++ obj = buffer.pointer;
++ if (obj->type != ACPI_TYPE_BUFFER ||
++ obj->buffer.length < sizeof(*lapic)) {
++ kfree(buffer.pointer);
++ return -EINVAL;
++ }
++
++ lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
++
++ if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
++ !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
++ kfree(buffer.pointer);
++ return -EINVAL;
++ }
++
++ physid = lapic->id;
++ kfree(buffer.pointer);
++ buffer.length = ACPI_ALLOCATE_BUFFER;
++ buffer.pointer = NULL;
++
++ return physid;
++}
++#else
++static int xen_get_apic_id(acpi_handle handle)
++{
++ return -1;
++}
++#endif
++
++int processor_cntl_xen_notify(struct acpi_processor *pr, int event, int type)
++{
++ int ret = -EINVAL;
++
++ switch (event) {
++ case PROCESSOR_PM_INIT:
++ case PROCESSOR_PM_CHANGE:
++ if ((type >= PM_TYPE_MAX) ||
++ !xen_ops.pm_ops[type])
++ break;
++
++ ret = xen_ops.pm_ops[type](pr, event);
++ break;
++ case PROCESSOR_HOTPLUG:
++ {
++ int apic_id;
++
++ apic_id = xen_get_apic_id(pr->handle);
++ if (apic_id < 0)
++ break;
++ if (xen_ops.hotplug)
++ ret = xen_ops.hotplug(pr, type);
++ xen_pcpu_hotplug(type, apic_id);
++ break;
++ }
++ default:
++ printk(KERN_ERR "Unsupport processor events %d.\n", event);
++ break;
++ }
++
++ return ret;
++}
++EXPORT_SYMBOL(processor_cntl_xen_notify);
++
++static inline void xen_convert_pct_reg(struct xen_pct_register *xpct,
++ struct acpi_pct_register *apct)
++{
++ xpct->descriptor = apct->descriptor;
++ xpct->length = apct->length;
++ xpct->space_id = apct->space_id;
++ xpct->bit_width = apct->bit_width;
++ xpct->bit_offset = apct->bit_offset;
++ xpct->reserved = apct->reserved;
++ xpct->address = apct->address;
++}
++
++static inline void xen_convert_pss_states(struct xen_processor_px *xpss,
++ struct acpi_processor_px *apss, int state_count)
++{
++ int i;
++ for (i = 0; i < state_count; i++) {
++ xpss->core_frequency = apss->core_frequency;
++ xpss->power = apss->power;
++ xpss->transition_latency = apss->transition_latency;
++ xpss->bus_master_latency = apss->bus_master_latency;
++ xpss->control = apss->control;
++ xpss->status = apss->status;
++ xpss++;
++ apss++;
++ }
++}
++
++static inline void xen_convert_psd_pack(struct xen_psd_package *xpsd,
++ struct acpi_psd_package *apsd)
++{
++ xpsd->num_entries = apsd->num_entries;
++ xpsd->revision = apsd->revision;
++ xpsd->domain = apsd->domain;
++ xpsd->coord_type = apsd->coord_type;
++ xpsd->num_processors = apsd->num_processors;
++}
++
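++/*
++ * Pass the processor's C-state table to Xen: every valid state is
++ * converted to a struct xen_processor_cx and the whole set is sent
++ * down in a single XENPF_set_processor_pminfo hypercall.
++ */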
++static int xen_cx_notifier(struct acpi_processor *pr, int action)
++{
++ int ret, count = 0, i;
++ xen_platform_op_t op = {
++ .cmd = XENPF_set_processor_pminfo,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ .u.set_pminfo.id = pr->acpi_id,
++ .u.set_pminfo.type = XEN_PM_CX,
++ };
++ struct xen_processor_cx *data, *buf;
++ struct acpi_processor_cx *cx;
++ struct acpi_power_register *reg;
++
++ if (action == PROCESSOR_PM_CHANGE)
++ return -EINVAL;
++
++ if (pr->acpi_id > XEN_MAX_ACPI_ID ||
++ power_registers[pr->acpi_id] == NULL) {
++ printk(KERN_WARNING "No C state info for acpi processor %d\n",
++ pr->acpi_id);
++ return -EINVAL;
++ }
++
++ /* Convert to the Xen-defined structure and pass it via hypercall */
++ buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx),
++ GFP_KERNEL);
++ if (!buf)
++ return -ENOMEM;
++
++ data = buf;
++ for (i = 1; i <= pr->power.count; i++) {
++ cx = &pr->power.states[i];
++ reg = power_registers[pr->acpi_id]+i-1;
++ /* Skip invalid C-state entries */
++ if (!cx->valid)
++ continue;
++
++ data->type = cx->type;
++ data->latency = cx->latency;
++ data->power = cx->power;
++ data->reg.space_id = reg->space_id;
++ data->reg.bit_width = reg->bit_width;
++ data->reg.bit_offset = reg->bit_offset;
++ data->reg.access_size = reg->reserved;
++ data->reg.address = reg->address;
++
++ /* Get dependency relationships, _CSD is not supported yet */
++ data->dpcnt = 0;
++ set_xen_guest_handle(data->dp, NULL);
++
++ data++;
++ count++;
++ }
++
++ if (!count) {
++ printk(KERN_ERR "No available Cx info for cpu %d\n",
++ pr->acpi_id);
++ kfree(buf);
++ return -EINVAL;
++ }
++
++ op.u.set_pminfo.power.count = count;
++ op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control;
++ op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check;
++ op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst;
++ op.u.set_pminfo.power.flags.power_setup_done =
++ pr->flags.power_setup_done;
++
++ set_xen_guest_handle(op.u.set_pminfo.power.states, buf);
++ ret = HYPERVISOR_dom0_op(&op);
++ kfree(buf);
++ return ret;
++}
++
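++/*
++ * Pass P-state data to Xen. PROCESSOR_PM_CHANGE sends only the new
++ * _PPC platform limit; PROCESSOR_PM_INIT sends the full
++ * _PCT/_PSS/_PSD set.
++ */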
++static int xen_px_notifier(struct acpi_processor *pr, int action)
++{
++ int ret = -EINVAL;
++ xen_platform_op_t op = {
++ .cmd = XENPF_set_processor_pminfo,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ .u.set_pminfo.id = pr->acpi_id,
++ .u.set_pminfo.type = XEN_PM_PX,
++ };
++ struct xen_processor_performance *perf;
++ struct xen_processor_px *states = NULL;
++ struct acpi_processor_performance *px;
++ struct acpi_psd_package *pdomain;
++
++ if (!pr)
++ return -EINVAL;
++
++ perf = &op.u.set_pminfo.perf;
++ px = pr->performance;
++
++ switch (action) {
++ case PROCESSOR_PM_CHANGE:
++ /* handle a dynamic _PPC (platform limit) change */
++ perf->flags = XEN_PX_PPC;
++ perf->platform_limit = pr->performance_platform_limit;
++
++ ret = HYPERVISOR_dom0_op(&op);
++ break;
++
++ case PROCESSOR_PM_INIT:
++ /* px normal init */
++ perf->flags = XEN_PX_PPC |
++ XEN_PX_PCT |
++ XEN_PX_PSS |
++ XEN_PX_PSD;
++
++ /* ppc */
++ perf->platform_limit = pr->performance_platform_limit;
++
++ /* pct */
++ xen_convert_pct_reg(&perf->control_register,
++ &px->control_register);
++ xen_convert_pct_reg(&perf->status_register,
++ &px->status_register);
++
++ /* pss */
++ perf->state_count = px->state_count;
++ states = kzalloc(px->state_count*sizeof(xen_processor_px_t),
++ GFP_KERNEL);
++ if (!states)
++ return -ENOMEM;
++ xen_convert_pss_states(states, px->states, px->state_count);
++ set_xen_guest_handle(perf->states, states);
++
++ /* psd */
++ pdomain = &px->domain_info;
++ xen_convert_psd_pack(&perf->domain_info, pdomain);
++ if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
++ perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
++ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
++ perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
++ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
++ perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
++ else {
++ ret = -ENODEV;
++ kfree(states);
++ break;
++ }
++
++ ret = HYPERVISOR_dom0_op(&op);
++ kfree(states);
++ break;
++
++ default:
++ break;
++ }
++
++ return ret;
++}
++
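++/* T-state (throttling) control is not forwarded to Xen yet. */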
++static int xen_tx_notifier(struct acpi_processor *pr, int action)
++{
++ return -EINVAL;
++}
++
++#ifdef CONFIG_ACPI_HOTPLUG_CPU
++static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
++{
++ int ret = -EINVAL;
++ int apic_id; /* signed: xen_get_apic_id() returns a negative errno */
++ unsigned long long pxm;
++ acpi_status status = 0;
++
++ xen_platform_op_t op = {
++ .interface_version = XENPF_INTERFACE_VERSION,
++ };
++
++ apic_id = xen_get_apic_id(pr->handle);
++ if (apic_id < 0) {
++ printk(KERN_WARNING "Can't get apic_id for acpi_id %x\n",
++ pr->acpi_id);
++ return -EINVAL;
++ }
++
++ status = acpi_evaluate_integer(pr->handle, "_PXM",
++ NULL, &pxm);
++ if (ACPI_FAILURE(status)) {
++ printk(KERN_WARNING "can't get pxm for acpi_id %x\n",
++ pr->acpi_id);
++ return -1;
++ }
++
++ switch (event) {
++ case HOTPLUG_TYPE_ADD:
++ op.cmd = XENPF_cpu_hotadd;
++ op.u.cpu_add.apic_id = apic_id;
++ op.u.cpu_add.acpi_id = pr->acpi_id;
++ op.u.cpu_add.pxm = pxm;
++ ret = HYPERVISOR_dom0_op(&op);
++ break;
++ case HOTPLUG_TYPE_REMOVE:
++ printk(KERN_WARNING "Xen not support CPU hotremove\n");
++ ret = -ENOSYS;
++ break;
++ }
++
++ return ret;
++}
++#else
++static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
++{
++ return -ENOSYS;
++}
++#endif
++
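++/*
++ * Register the notifiers above according to the PM bits the
++ * hypervisor exposed in the start-of-day info.
++ */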
++static int __init xen_acpi_processor_extcntl_init(void)
++{
++ unsigned int pmbits;
++
++ /* Only xen dom0 is allowed to handle ACPI processor info */
++ if (!xen_initial_domain())
++ return 0;
+
- config XENFS
- tristate "Xen filesystem"
- depends on XEN
-@@ -60,4 +164,14 @@ config XEN_SYS_HYPERVISOR
- Create entries under /sys/hypervisor describing the Xen
- hypervisor environment. When running native or in another
- virtual environment, /sys/hypervisor will still be present,
-- but will have no xen contents.
-\ No newline at end of file
-+ but will have no xen contents.
++ pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8;
+
-+config XEN_XENBUS_FRONTEND
-+ tristate
++ if (pmbits & XEN_PROCESSOR_PM_CX)
++ xen_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
++ if (pmbits & XEN_PROCESSOR_PM_PX)
++ xen_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
++ if (pmbits & XEN_PROCESSOR_PM_TX)
++ xen_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
+
-+config XEN_GNTDEV
-+ tristate "userspace grant access device driver"
-+ depends on XEN
-+ select MMU_NOTIFIER
-+ help
-+ Allows userspace processes use grants.
-diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
-index 7c28434..c5f71db 100644
---- a/drivers/xen/Makefile
-+++ b/drivers/xen/Makefile
-@@ -1,12 +1,21 @@
--obj-y += grant-table.o features.o events.o manage.o
-+obj-y += grant-table.o features.o events.o manage.o biomerge.o
- obj-y += xenbus/
-
- nostackp := $(call cc-option, -fno-stack-protector)
- CFLAGS_features.o := $(nostackp)
-
--obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
--obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
--obj-$(CONFIG_XEN_BALLOON) += balloon.o
--obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
--obj-$(CONFIG_XENFS) += xenfs/
--obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
-\ No newline at end of file
-+obj-$(CONFIG_PCI) += pci.o
-+obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
-+obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
-+obj-$(CONFIG_XEN_BALLOON) += balloon.o
-+obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
-+obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
-+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/
-+obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
-+obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
-+obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
-+obj-$(CONFIG_XENFS) += xenfs/
-+obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
++ return 0;
++}
+
-+xen-evtchn-y := evtchn.o
-+xen-gntdev-y := gntdev.o
++subsys_initcall(xen_acpi_processor_extcntl_init);
++MODULE_LICENSE("GPL");
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 4204336..d7c0eae 100644
--- a/drivers/xen/balloon.c
@@ -11065,3771 +12619,4199 @@
+ offset, request, i,
+ page, pfn_to_kaddr(page_to_pfn(page)), uvaddr);
+
-+ request->nr_pages++;
-+ }
++ request->nr_pages++;
++ }
++
++ if (blktap_map_foreign(tap, request, &blkif_req, &table))
++ goto out;
++
++ /* Finally, write the request message to the user ring. */
++ target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
++ memcpy(target, &blkif_req, sizeof(blkif_req));
++ target->id = request->usr_idx;
++ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
++ ring->ring.req_prod_pvt++;
++
++ if (rq_data_dir(req)) {
++ tap->stats.st_wr_sect += nr_sects;
++ tap->stats.st_wr_req++;
++ } else {
++ tap->stats.st_rd_sect += nr_sects;
++ tap->stats.st_rd_req++;
++ }
++
++ err = 0;
++
++out:
++ if (err)
++ blktap_device_fast_flush(tap, request);
++ return err;
++}
++
++#ifdef ENABLE_PASSTHROUGH
++#define rq_for_each_bio_safe(_bio, _tmp, _req) \
++ if ((_req)->bio) \
++ for (_bio = (_req)->bio; \
++ _bio && ((_tmp = _bio->bi_next) || 1); \
++ _bio = _tmp)
++
++static void
++blktap_device_forward_request(struct blktap *tap, struct request *req)
++{
++ struct bio *bio, *tmp;
++ struct blktap_device *dev;
++
++ dev = &tap->device;
++
++ rq_for_each_bio_safe(bio, tmp, req) {
++ bio->bi_bdev = dev->bdev;
++ submit_bio(bio->bi_rw, bio);
++ }
++}
++
++static void
++blktap_device_close_bdev(struct blktap *tap)
++{
++ struct blktap_device *dev;
++
++ dev = &tap->device;
++
++ if (dev->bdev)
++ blkdev_put(dev->bdev);
++
++ dev->bdev = NULL;
++ clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++}
++
++static int
++blktap_device_open_bdev(struct blktap *tap, u32 pdev)
++{
++ struct block_device *bdev;
++ struct blktap_device *dev;
++
++ dev = &tap->device;
++
++ bdev = open_by_devnum(pdev, FMODE_WRITE);
++ if (IS_ERR(bdev)) {
++ BTERR("opening device %x:%x failed: %ld\n",
++ MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev));
++ return PTR_ERR(bdev);
++ }
++
++ if (!bdev->bd_disk) {
++ BTERR("device %x:%x doesn't exist\n",
++ MAJOR(pdev), MINOR(pdev));
++ blkdev_put(bdev); /* dev->bdev is not set yet */
++ return -ENOENT;
++ }
++
++ dev->bdev = bdev;
++ set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++
++ /* TODO: readjust queue parameters */
++
++ BTINFO("set device %d to passthrough on %x:%x\n",
++ tap->minor, MAJOR(pdev), MINOR(pdev));
++
++ return 0;
++}
++
++int
++blktap_device_enable_passthrough(struct blktap *tap,
++ unsigned major, unsigned minor)
++{
++ u32 pdev;
++ struct blktap_device *dev;
++
++ dev = &tap->device;
++ pdev = MKDEV(major, minor);
++
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
++
++ if (dev->bdev) {
++ if (pdev)
++ return -EINVAL;
++ blktap_device_close_bdev(tap);
++ return 0;
++ }
++
++ return blktap_device_open_bdev(tap, pdev);
++}
++#endif
++
++/*
++ * dev->lock held on entry
++ */
++static void
++blktap_device_run_queue(struct blktap *tap)
++{
++ int queued, err;
++ struct request_queue *rq;
++ struct request *req;
++ struct blktap_ring *ring;
++ struct blktap_device *dev;
++ struct blktap_request *request;
++
++ queued = 0;
++ ring = &tap->ring;
++ dev = &tap->device;
++ rq = dev->gd->queue;
++
++ BTDBG("running queue for %d\n", tap->minor);
++
++ while ((req = blk_peek_request(rq)) != NULL) {
++ if (!blk_fs_request(req)) {
++ __blk_end_request_cur(req, -EIO);
++ continue;
++ }
++
++ if (blk_barrier_rq(req)) {
++ __blk_end_request_cur(req, -EOPNOTSUPP);
++ continue;
++ }
++
++#ifdef ENABLE_PASSTHROUGH
++ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
++ blkdev_dequeue_request(req);
++ blktap_device_forward_request(tap, req);
++ continue;
++ }
++#endif
++
++ if (RING_FULL(&ring->ring)) {
++ wait:
++ /* Avoid pointless unplugs. */
++ blk_stop_queue(rq);
++ blktap_defer(tap);
++ break;
++ }
++
++ request = blktap_request_allocate(tap);
++ if (!request) {
++ tap->stats.st_oo_req++;
++ goto wait;
++ }
++
++ BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) "
++ "buffer:%p [%s], pending: %p\n", req, tap->minor,
++ req->cmd, (unsigned long long)blk_rq_pos(req),
++ blk_rq_cur_sectors(req),
++ blk_rq_sectors(req), req->buffer,
++ rq_data_dir(req) ? "write" : "read", request);
++
++ blk_start_request(req);
++
++ spin_unlock_irq(&dev->lock);
++ down_read(&tap->tap_sem);
++
++ err = blktap_device_process_request(tap, request, req);
++ if (!err)
++ queued++;
++ else {
++ blktap_device_end_dequeued_request(dev, req, -EIO);
++ blktap_request_free(tap, request);
++ }
++
++ up_read(&tap->tap_sem);
++ spin_lock_irq(&dev->lock);
++ }
++
++ if (queued)
++ blktap_ring_kick_user(tap);
++}
++
++/*
++ * dev->lock held on entry
++ */
++static void
++blktap_device_do_request(struct request_queue *rq)
++{
++ struct request *req;
++ struct blktap *tap;
++ struct blktap_device *dev;
+
-+ if (blktap_map_foreign(tap, request, &blkif_req, &table))
-+ goto out;
++ dev = rq->queuedata;
++ if (!dev)
++ goto fail;
+
-+ /* Finally, write the request message to the user ring. */
-+ target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
-+ memcpy(target, &blkif_req, sizeof(blkif_req));
-+ target->id = request->usr_idx;
-+ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
-+ ring->ring.req_prod_pvt++;
++ tap = dev_to_blktap(dev);
++ if (!blktap_active(tap))
++ goto fail;
+
-+ if (rq_data_dir(req)) {
-+ tap->stats.st_wr_sect += nr_sects;
-+ tap->stats.st_wr_req++;
-+ } else {
-+ tap->stats.st_rd_sect += nr_sects;
-+ tap->stats.st_rd_req++;
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
++ test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++ blktap_defer(tap);
++ return;
+ }
+
-+ err = 0;
++ blktap_device_run_queue(tap);
++ return;
+
-+out:
-+ if (err)
-+ blktap_device_fast_flush(tap, request);
-+ return err;
++fail:
++ while ((req = blk_peek_request(rq))) {
++ BTERR("device closed: failing secs %llu - %llu\n",
++ (unsigned long long)blk_rq_pos(req),
++ (unsigned long long)blk_rq_pos(req) + blk_rq_sectors(req));
++ __blk_end_request_cur(req, -EIO);
++ }
+}
+
-+#ifdef ENABLE_PASSTHROUGH
-+#define rq_for_each_bio_safe(_bio, _tmp, _req) \
-+ if ((_req)->bio) \
-+ for (_bio = (_req)->bio; \
-+ _bio && ((_tmp = _bio->bi_next) || 1); \
-+ _bio = _tmp)
-+
-+static void
-+blktap_device_forward_request(struct blktap *tap, struct request *req)
++void
++blktap_device_restart(struct blktap *tap)
+{
-+ struct bio *bio, *tmp;
+ struct blktap_device *dev;
+
+ dev = &tap->device;
+
-+ rq_for_each_bio_safe(bio, tmp, req) {
-+ bio->bi_bdev = dev->bdev;
-+ submit_bio(bio->bi_rw, bio);
++ if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) {
++ blktap_defer(tap);
++ return;
++ }
++
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
++ test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++ blktap_defer(tap);
++ return;
++ }
++
++ spin_lock_irq(&dev->lock);
++
++ /* Re-enable calldowns. */
++ if (dev->gd) {
++ struct request_queue *rq = dev->gd->queue;
++
++ if (blk_queue_stopped(rq))
++ blk_start_queue(rq);
++
++ /* Kick things off immediately. */
++ blktap_device_do_request(rq);
+ }
++
++ spin_unlock_irq(&dev->lock);
+}
+
+static void
-+blktap_device_close_bdev(struct blktap *tap)
++blktap_device_configure(struct blktap *tap)
+{
-+ struct blktap_device *dev;
++ struct request_queue *rq;
++ struct blktap_device *dev = &tap->device;
++
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd)
++ return;
+
+ dev = &tap->device;
++ rq = dev->gd->queue;
+
-+ if (dev->bdev)
-+ blkdev_put(dev->bdev);
++ spin_lock_irq(&dev->lock);
+
-+ dev->bdev = NULL;
-+ clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++ set_capacity(dev->gd, tap->params.capacity);
++
++ /* Hard sector size and max sectors impersonate the equiv. hardware. */
++ blk_queue_logical_block_size(rq, tap->params.sector_size);
++ blk_queue_max_sectors(rq, 512);
++
++ /* Each segment in a request is up to an aligned page in size. */
++ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
++ blk_queue_max_segment_size(rq, PAGE_SIZE);
++
++ /* Ensure a merged request will fit in a single I/O ring slot. */
++ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++ /* Make sure buffer addresses are sector-aligned. */
++ blk_queue_dma_alignment(rq, 511);
++
++ spin_unlock_irq(&dev->lock);
+}
+
-+static int
-+blktap_device_open_bdev(struct blktap *tap, u32 pdev)
++int
++blktap_device_resume(struct blktap *tap)
+{
-+ struct block_device *bdev;
-+ struct blktap_device *dev;
-+
-+ dev = &tap->device;
++ int err;
+
-+ bdev = open_by_devnum(pdev, FMODE_WRITE);
-+ if (IS_ERR(bdev)) {
-+ BTERR("opening device %x:%x failed: %ld\n",
-+ MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev));
-+ return PTR_ERR(bdev);
-+ }
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
++ return -ENODEV;
+
-+ if (!bdev->bd_disk) {
-+ BTERR("device %x:%x doesn't exist\n",
-+ MAJOR(pdev), MINOR(pdev));
-+ blkdev_put(dev->bdev);
-+ return -ENOENT;
-+ }
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return 0;
+
-+ dev->bdev = bdev;
-+ set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++ err = blktap_ring_resume(tap);
++ if (err)
++ return err;
+
-+ /* TODO: readjust queue parameters */
++ /* device size may have changed */
++ blktap_device_configure(tap);
+
-+ BTINFO("set device %d to passthrough on %x:%x\n",
-+ tap->minor, MAJOR(pdev), MINOR(pdev));
++ BTDBG("restarting device\n");
++ blktap_device_restart(tap);
+
+ return 0;
+}
+
+int
-+blktap_device_enable_passthrough(struct blktap *tap,
-+ unsigned major, unsigned minor)
++blktap_device_pause(struct blktap *tap)
+{
-+ u32 pdev;
-+ struct blktap_device *dev;
-+
-+ dev = &tap->device;
-+ pdev = MKDEV(major, minor);
++ unsigned long flags;
++ struct blktap_device *dev = &tap->device;
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EINVAL;
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
++ return -ENODEV;
+
-+ if (dev->bdev) {
-+ if (pdev)
-+ return -EINVAL;
-+ blktap_device_close_bdev(tap);
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
+ return 0;
-+ }
+
-+ return blktap_device_open_bdev(tap, pdev);
++ spin_lock_irqsave(&dev->lock, flags);
++
++ blk_stop_queue(dev->gd->queue);
++ set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++
++ spin_unlock_irqrestore(&dev->lock, flags);
++
++ return blktap_ring_pause(tap);
+}
++
++int
++blktap_device_destroy(struct blktap *tap)
++{
++ struct blktap_device *dev = &tap->device;
++ struct gendisk *gd = dev->gd;
++
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ return 0;
++
++ BTINFO("destroy device %d users %d\n", tap->minor, dev->users);
++
++ if (dev->users)
++ return -EBUSY;
++
++ spin_lock_irq(&dev->lock);
++ /* No more blktap_device_do_request(). */
++ blk_stop_queue(gd->queue);
++ clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++ dev->gd = NULL;
++ spin_unlock_irq(&dev->lock);
++
++#ifdef ENABLE_PASSTHROUGH
++ if (dev->bdev)
++ blktap_device_close_bdev(tap);
+#endif
+
-+/*
-+ * dev->lock held on entry
-+ */
-+static void
-+blktap_device_run_queue(struct blktap *tap)
++ del_gendisk(gd);
++ blk_cleanup_queue(gd->queue);
++ put_disk(gd);
++
++ wake_up(&tap->wq);
++
++ return 0;
++}
++
++int
++blktap_device_create(struct blktap *tap)
+{
-+ int queued, err;
++ int minor, err;
++ struct gendisk *gd;
+ struct request_queue *rq;
-+ struct request *req;
-+ struct blktap_ring *ring;
+ struct blktap_device *dev;
-+ struct blktap_request *request;
+
-+ queued = 0;
-+ ring = &tap->ring;
-+ dev = &tap->device;
-+ rq = dev->gd->queue;
++ gd = NULL;
++ rq = NULL;
++ dev = &tap->device;
++ minor = tap->minor;
+
-+ BTDBG("running queue for %d\n", tap->minor);
++ if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ return -EEXIST;
+
-+ while ((req = blk_peek_request(rq)) != NULL) {
-+ if (!blk_fs_request(req)) {
-+ __blk_end_request_cur(req, 0);
-+ continue;
-+ }
++ if (blktap_validate_params(tap, &tap->params))
++ return -EINVAL;
++
++ BTINFO("minor %d sectors %Lu sector-size %lu\n",
++ minor, tap->params.capacity, tap->params.sector_size);
++
++ err = -ENODEV;
+
-+ if (blk_barrier_rq(req)) {
-+ __blk_end_request_cur(req, 0);
-+ continue;
-+ }
++ gd = alloc_disk(1);
++ if (!gd)
++ goto error;
+
-+#ifdef ENABLE_PASSTHROUGH
-+ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
-+ blkdev_dequeue_request(req);
-+ blktap_device_forward_request(tap, req);
-+ continue;
-+ }
-+#endif
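++ /* Name disks tapdeva..tapdevz, then tapdevaa.., like the sd driver. */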
++ if (minor < 26)
++ sprintf(gd->disk_name, "tapdev%c", 'a' + minor);
++ else
++ sprintf(gd->disk_name, "tapdev%c%c",
++ 'a' + ((minor / 26) - 1), 'a' + (minor % 26));
+
-+ if (RING_FULL(&ring->ring)) {
-+ wait:
-+ /* Avoid pointless unplugs. */
-+ blk_stop_queue(rq);
-+ blktap_defer(tap);
-+ break;
-+ }
++ gd->major = blktap_device_major;
++ gd->first_minor = minor;
++ gd->fops = &blktap_device_file_operations;
++ gd->private_data = dev;
+
-+ request = blktap_request_allocate(tap);
-+ if (!request) {
-+ tap->stats.st_oo_req++;
-+ goto wait;
-+ }
++ spin_lock_init(&dev->lock);
++ rq = blk_init_queue(blktap_device_do_request, &dev->lock);
++ if (!rq)
++ goto error;
+
-+ BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) "
-+ "buffer:%p [%s], pending: %p\n", req, tap->minor,
-+ req->cmd, (unsigned long long)blk_rq_pos(req),
-+ blk_rq_cur_sectors(req),
-+ blk_rq_sectors(req), req->buffer,
-+ rq_data_dir(req) ? "write" : "read", request);
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
++ elevator_init(rq, "noop");
++#else
++ elevator_init(rq, &elevator_noop);
++#endif
+
-+ blk_start_request(req);
++ gd->queue = rq;
++ rq->queuedata = dev;
++ dev->gd = gd;
+
-+ spin_unlock_irq(&dev->lock);
-+ down_read(&tap->tap_sem);
++ set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++ blktap_device_configure(tap);
+
-+ err = blktap_device_process_request(tap, request, req);
-+ if (!err)
-+ queued++;
-+ else {
-+ blktap_device_end_dequeued_request(dev, req, -EIO);
-+ blktap_request_free(tap, request);
-+ }
++ add_disk(gd);
+
-+ up_read(&tap->tap_sem);
-+ spin_lock_irq(&dev->lock);
-+ }
++ err = 0;
++ goto out;
+
-+ if (queued)
-+ blktap_ring_kick_user(tap);
++ error:
++ if (gd)
++ put_disk(gd); /* never added via add_disk() on this path */
++ if (rq)
++ blk_cleanup_queue(rq);
++
++ out:
++ BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
++ return err;
+}
+
-+/*
-+ * dev->lock held on entry
-+ */
-+static void
-+blktap_device_do_request(struct request_queue *rq)
++int __init
++blktap_device_init(int *maj)
+{
-+ struct request *req;
-+ struct blktap *tap;
-+ struct blktap_device *dev;
-+
-+ dev = rq->queuedata;
-+ if (!dev)
-+ goto fail;
-+
-+ tap = dev_to_blktap(dev);
-+ if (!blktap_active(tap))
-+ goto fail;
++ int major;
+
-+ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
-+ test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
-+ blktap_defer(tap);
-+ return;
-+ }
++ /* Dynamically allocate a major for this device */
++ major = register_blkdev(0, "tapdev");
++ if (major < 0) {
++ BTERR("Couldn't register blktap device\n");
++ return -ENOMEM;
++ }
+
-+ blktap_device_run_queue(tap);
-+ return;
++ blktap_device_major = *maj = major;
++ BTINFO("blktap device major %d\n", major);
+
-+fail:
-+ while ((req = blk_peek_request(rq))) {
-+ BTERR("device closed: failing secs %llu - %llu\n",
-+ (unsigned long long)blk_rq_pos(req),
-+ (unsigned long long)blk_rq_pos(req) + blk_rq_sectors(req));
-+ __blk_end_request_cur(req, 0);
-+ }
++ return 0;
+}
+
+void
-+blktap_device_restart(struct blktap *tap)
++blktap_device_free(void)
+{
-+ struct blktap_device *dev;
++ if (blktap_device_major)
++ unregister_blkdev(blktap_device_major, "tapdev");
++}
+diff --git a/drivers/xen/blktap/request.c b/drivers/xen/blktap/request.c
+new file mode 100644
+index 0000000..770736a
+--- /dev/null
++++ b/drivers/xen/blktap/request.c
+@@ -0,0 +1,297 @@
++#include <linux/spinlock.h>
++#include <xen/balloon.h>
++#include <linux/sched.h>
+
-+ dev = &tap->device;
++#include "blktap.h"
+
-+ if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) {
-+ blktap_defer(tap);
-+ return;
-+ }
++#define MAX_BUCKETS 8
++#define BUCKET_SIZE MAX_PENDING_REQS
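++
++/*
++ * The request pool grows and shrinks in buckets of BUCKET_SIZE slots;
++ * each slot owns a fixed run of preallocated pages used for grant
++ * mappings of request segments.
++ */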
+
-+ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
-+ test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
-+ blktap_defer(tap);
-+ return;
-+ }
++#define BLKTAP_POOL_CLOSING 1
+
-+ spin_lock_irq(&dev->lock);
++struct blktap_request_bucket;
+
-+ /* Re-enable calldowns. */
-+ if (dev->gd) {
-+ struct request_queue *rq = dev->gd->queue;
++struct blktap_request_handle {
++ int slot;
++ uint8_t inuse;
++ struct blktap_request request;
++ struct blktap_request_bucket *bucket;
++};
+
-+ if (blk_queue_stopped(rq))
-+ blk_start_queue(rq);
++struct blktap_request_bucket {
++ atomic_t reqs_in_use;
++ struct blktap_request_handle handles[BUCKET_SIZE];
++ struct page **foreign_pages;
++};
+
-+ /* Kick things off immediately. */
-+ blktap_device_do_request(rq);
-+ }
++struct blktap_request_pool {
++ spinlock_t lock;
++ uint8_t status;
++ struct list_head free_list;
++ atomic_t reqs_in_use;
++ wait_queue_head_t wait_queue;
++ struct blktap_request_bucket *buckets[MAX_BUCKETS];
++};
+
-+ spin_unlock_irq(&dev->lock);
++static struct blktap_request_pool pool;
++
++static inline struct blktap_request_handle *
++blktap_request_to_handle(struct blktap_request *req)
++{
++ return container_of(req, struct blktap_request_handle, request);
+}
+
+static void
-+blktap_device_configure(struct blktap *tap)
++blktap_request_pool_init_request(struct blktap_request *request)
+{
-+ struct request_queue *rq;
-+ struct blktap_device *dev = &tap->device;
++ int i;
+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd)
-+ return;
++ request->usr_idx = -1;
++ request->nr_pages = 0;
++ request->status = BLKTAP_REQUEST_FREE;
++ INIT_LIST_HEAD(&request->free_list);
++ for (i = 0; i < ARRAY_SIZE(request->handles); i++) {
++ request->handles[i].user = INVALID_GRANT_HANDLE;
++ request->handles[i].kernel = INVALID_GRANT_HANDLE;
++ }
++}
+
-+ dev = &tap->device;
-+ rq = dev->gd->queue;
++static int
++blktap_request_pool_allocate_bucket(void)
++{
++ int i, idx;
++ unsigned long flags;
++ struct blktap_request *request;
++ struct blktap_request_handle *handle;
++ struct blktap_request_bucket *bucket;
+
-+ spin_lock_irq(&dev->lock);
++ bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL);
++ if (!bucket)
++ goto fail;
+
-+ set_capacity(dev->gd, tap->params.capacity);
++ bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES);
++ if (!bucket->foreign_pages)
++ goto fail;
+
-+ /* Hard sector size and max sectors impersonate the equiv. hardware. */
-+ blk_queue_logical_block_size(rq, tap->params.sector_size);
-+ blk_queue_max_sectors(rq, 512);
++ spin_lock_irqsave(&pool.lock, flags);
+
-+ /* Each segment in a request is up to an aligned page in size. */
-+ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
-+ blk_queue_max_segment_size(rq, PAGE_SIZE);
++ idx = -1;
++ for (i = 0; i < MAX_BUCKETS; i++) {
++ if (!pool.buckets[i]) {
++ idx = i;
++ pool.buckets[idx] = bucket;
++ break;
++ }
++ }
++
++ if (idx == -1) {
++ spin_unlock_irqrestore(&pool.lock, flags);
++ goto fail;
++ }
++
++ for (i = 0; i < BUCKET_SIZE; i++) {
++ handle = bucket->handles + i;
++ request = &handle->request;
++
++ handle->slot = i;
++ handle->inuse = 0;
++ handle->bucket = bucket;
++
++ blktap_request_pool_init_request(request);
++ list_add_tail(&request->free_list, &pool.free_list);
++ }
+
-+ /* Ensure a merged request will fit in a single I/O ring slot. */
-+ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-+ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++ spin_unlock_irqrestore(&pool.lock, flags);
+
-+ /* Make sure buffer addresses are sector-aligned. */
-+ blk_queue_dma_alignment(rq, 511);
++ return 0;
+
-+ spin_unlock_irq(&dev->lock);
++fail:
++ if (bucket && bucket->foreign_pages)
++ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
++ kfree(bucket);
++ return -ENOMEM;
+}
+
-+int
-+blktap_device_resume(struct blktap *tap)
++static void
++blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket)
+{
-+ int err;
-+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
-+ return -ENODEV;
-+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return 0;
-+
-+ err = blktap_ring_resume(tap);
-+ if (err)
-+ return err;
++ if (!bucket)
++ return;
+
-+ /* device size may have changed */
-+ blktap_device_configure(tap);
++ BTDBG("freeing bucket %p\n", bucket);
+
-+ BTDBG("restarting device\n");
-+ blktap_device_restart(tap);
++ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
++ kfree(bucket);
++}
+
-+ return 0;
++struct page *
++request_to_page(struct blktap_request *req, int seg)
++{
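++ /* Each slot owns BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive pages. */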
++ struct blktap_request_handle *handle = blktap_request_to_handle(req);
++ int idx = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
++ return handle->bucket->foreign_pages[idx];
+}
+
+int
-+blktap_device_pause(struct blktap *tap)
++blktap_request_pool_shrink(void)
+{
++ int i, err;
+ unsigned long flags;
-+ struct blktap_device *dev = &tap->device;
++ struct blktap_request_bucket *bucket;
+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
-+ return -ENODEV;
++ err = -EAGAIN;
+
-+ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return 0;
++ spin_lock_irqsave(&pool.lock, flags);
+
-+ spin_lock_irqsave(&dev->lock, flags);
++ /* always keep at least one bucket */
++ for (i = 1; i < MAX_BUCKETS; i++) {
++ bucket = pool.buckets[i];
++ if (!bucket)
++ continue;
+
-+ blk_stop_queue(dev->gd->queue);
-+ set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++ if (atomic_read(&bucket->reqs_in_use))
++ continue;
+
-+ spin_unlock_irqrestore(&dev->lock, flags);
++ blktap_request_pool_free_bucket(bucket);
++ pool.buckets[i] = NULL;
++ err = 0;
++ break;
++ }
+
-+ return blktap_ring_pause(tap);
++ spin_unlock_irqrestore(&pool.lock, flags);
++
++ return err;
+}
+
+int
-+blktap_device_destroy(struct blktap *tap)
++blktap_request_pool_grow(void)
+{
-+ struct blktap_device *dev = &tap->device;
-+ struct gendisk *gd = dev->gd;
++ return blktap_request_pool_allocate_bucket();
++}
+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
-+ return 0;
++struct blktap_request *
++blktap_request_allocate(struct blktap *tap)
++{
++ int i;
++ uint16_t usr_idx;
++ unsigned long flags;
++ struct blktap_request *request;
+
-+ BTINFO("destroy device %d users %d\n", tap->minor, dev->users);
++ usr_idx = -1;
++ request = NULL;
+
-+ if (dev->users)
-+ return -EBUSY;
++ spin_lock_irqsave(&pool.lock, flags);
+
-+ spin_lock_irq(&dev->lock);
-+ /* No more blktap_device_do_request(). */
-+ blk_stop_queue(gd->queue);
-+ clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
-+ dev->gd = NULL;
-+ spin_unlock_irq(&dev->lock);
++ if (pool.status == BLKTAP_POOL_CLOSING)
++ goto out;
+
-+#ifdef ENABLE_PASSTHROUGH
-+ if (dev->bdev)
-+ blktap_device_close_bdev(tap);
-+#endif
++ for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++)
++ if (!tap->pending_requests[i]) {
++ usr_idx = i;
++ break;
++ }
+
-+ del_gendisk(gd);
-+ blk_cleanup_queue(gd->queue);
-+ put_disk(gd);
++ if (usr_idx == (uint16_t)-1)
++ goto out;
+
-+ wake_up(&tap->wq);
++ if (!list_empty(&pool.free_list)) {
++ request = list_entry(pool.free_list.next,
++ struct blktap_request, free_list);
++ list_del(&request->free_list);
++ }
+
-+ return 0;
-+}
++ if (request) {
++ struct blktap_request_handle *handle;
+
-+int
-+blktap_device_create(struct blktap *tap)
-+{
-+ int minor, err;
-+ struct gendisk *gd;
-+ struct request_queue *rq;
-+ struct blktap_device *dev;
++ atomic_inc(&pool.reqs_in_use);
+
-+ gd = NULL;
-+ rq = NULL;
-+ dev = &tap->device;
-+ minor = tap->minor;
++ handle = blktap_request_to_handle(request);
++ atomic_inc(&handle->bucket->reqs_in_use);
++ handle->inuse = 1;
+
-+ if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
-+ return -EEXIST;
++ request->usr_idx = usr_idx;
+
-+ if (blktap_validate_params(tap, &tap->params))
-+ return -EINVAL;
++ tap->pending_requests[usr_idx] = request;
++ tap->pending_cnt++;
++ }
+
-+ BTINFO("minor %d sectors %Lu sector-size %lu\n",
-+ minor, tap->params.capacity, tap->params.sector_size);
++out:
++ spin_unlock_irqrestore(&pool.lock, flags);
++ return request;
++}
+
-+ err = -ENODEV;
++void
++blktap_request_free(struct blktap *tap, struct blktap_request *request)
++{
++ int free;
++ unsigned long flags;
++ struct blktap_request_handle *handle;
+
-+ gd = alloc_disk(1);
-+ if (!gd)
-+ goto error;
++ BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests));
++ handle = blktap_request_to_handle(request);
+
-+ if (minor < 26)
-+ sprintf(gd->disk_name, "tapdev%c", 'a' + minor);
-+ else
-+ sprintf(gd->disk_name, "tapdev%c%c",
-+ 'a' + ((minor / 26) - 1), 'a' + (minor % 26));
++ spin_lock_irqsave(&pool.lock, flags);
+
-+ gd->major = blktap_device_major;
-+ gd->first_minor = minor;
-+ gd->fops = &blktap_device_file_operations;
-+ gd->private_data = dev;
++ handle->inuse = 0;
++ tap->pending_requests[request->usr_idx] = NULL;
++ blktap_request_pool_init_request(request);
++ list_add(&request->free_list, &pool.free_list);
++ atomic_dec(&handle->bucket->reqs_in_use);
++ free = atomic_dec_and_test(&pool.reqs_in_use);
+
-+ spin_lock_init(&dev->lock);
-+ rq = blk_init_queue(blktap_device_do_request, &dev->lock);
-+ if (!rq)
-+ goto error;
++ spin_unlock_irqrestore(&pool.lock, flags);
+
-+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
-+ elevator_init(rq, "noop");
-+#else
-+ elevator_init(rq, &elevator_noop);
-+#endif
++ if (--tap->pending_cnt == 0)
++ wake_up_interruptible(&tap->wq);
+
-+ gd->queue = rq;
-+ rq->queuedata = dev;
-+ dev->gd = gd;
++ if (free)
++ wake_up(&pool.wait_queue);
++}
+
-+ set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
-+ blktap_device_configure(tap);
++void
++blktap_request_pool_free(void)
++{
++ int i;
++ unsigned long flags;
+
-+ add_disk(gd);
++ spin_lock_irqsave(&pool.lock, flags);
+
-+ err = 0;
-+ goto out;
++ pool.status = BLKTAP_POOL_CLOSING;
++ while (atomic_read(&pool.reqs_in_use)) {
++ spin_unlock_irqrestore(&pool.lock, flags);
++ wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use));
++ spin_lock_irqsave(&pool.lock, flags);
++ }
+
-+ error:
-+ if (gd)
-+ del_gendisk(gd);
-+ if (rq)
-+ blk_cleanup_queue(rq);
++ for (i = 0; i < MAX_BUCKETS; i++) {
++ blktap_request_pool_free_bucket(pool.buckets[i]);
++ pool.buckets[i] = NULL;
++ }
+
-+ out:
-+ BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
-+ return err;
++ spin_unlock_irqrestore(&pool.lock, flags);
+}
+
+int __init
-+blktap_device_init(int *maj)
++blktap_request_pool_init(void)
+{
-+ int major;
++ int i, err;
+
-+ /* Dynamically allocate a major for this device */
-+ major = register_blkdev(0, "tapdev");
-+ if (major < 0) {
-+ BTERR("Couldn't register blktap device\n");
-+ return -ENOMEM;
-+ }
++ memset(&pool, 0, sizeof(pool));
+
-+ blktap_device_major = *maj = major;
-+ BTINFO("blktap device major %d\n", major);
++ spin_lock_init(&pool.lock);
++ INIT_LIST_HEAD(&pool.free_list);
++ atomic_set(&pool.reqs_in_use, 0);
++ init_waitqueue_head(&pool.wait_queue);
++
++ for (i = 0; i < 2; i++) {
++ err = blktap_request_pool_allocate_bucket();
++ if (err)
++ goto fail;
++ }
+
+ return 0;
-+}
+
-+void
-+blktap_device_free(void)
-+{
-+ if (blktap_device_major)
-+ unregister_blkdev(blktap_device_major, "tapdev");
++fail:
++ blktap_request_pool_free();
++ return err;
+}
-diff --git a/drivers/xen/blktap/request.c b/drivers/xen/blktap/request.c
+diff --git a/drivers/xen/blktap/ring.c b/drivers/xen/blktap/ring.c
new file mode 100644
-index 0000000..770736a
+index 0000000..74a7aa7
--- /dev/null
-+++ b/drivers/xen/blktap/request.c
-@@ -0,0 +1,297 @@
-+#include <linux/spinlock.h>
-+#include <xen/balloon.h>
++++ b/drivers/xen/blktap/ring.c
+@@ -0,0 +1,615 @@
++#include <linux/module.h>
++#include <linux/signal.h>
+#include <linux/sched.h>
++#include <linux/poll.h>
++
++#include <asm/xen/page.h>
++#include <asm/xen/hypercall.h>
+
+#include "blktap.h"
+
-+#define MAX_BUCKETS 8
-+#define BUCKET_SIZE MAX_PENDING_REQS
++#ifdef CONFIG_XEN_BLKDEV_BACKEND
++#include "../blkback/blkback-pagemap.h"
++#else
++#define blkback_pagemap_contains_page(page) 0
++#endif
+
-+#define BLKTAP_POOL_CLOSING 1
++static int blktap_ring_major;
+
-+struct blktap_request_bucket;
++static inline struct blktap *
++vma_to_blktap(struct vm_area_struct *vma)
++{
++ struct vm_foreign_map *m = vma->vm_private_data;
++ struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map);
++ return container_of(r, struct blktap, ring);
++}
+
-+struct blktap_request_handle {
-+ int slot;
-+ uint8_t inuse;
-+ struct blktap_request request;
-+ struct blktap_request_bucket *bucket;
-+};
++ /*
++ * BLKTAP - immediately before the mmap area,
++ * we have a bunch of pages reserved for shared memory rings.
++ */
++#define RING_PAGES 1
+
-+struct blktap_request_bucket {
-+ atomic_t reqs_in_use;
-+ struct blktap_request_handle handles[BUCKET_SIZE];
-+ struct page **foreign_pages;
-+};
++static int
++blktap_read_ring(struct blktap *tap)
++{
++ /* This is called to read responses from the ring. */
++ int usr_idx;
++ RING_IDX rc, rp;
++ struct blkif_response res;
++ struct blktap_ring *ring;
++ struct blktap_request *request;
+
-+struct blktap_request_pool {
-+ spinlock_t lock;
-+ uint8_t status;
-+ struct list_head free_list;
-+ atomic_t reqs_in_use;
-+ wait_queue_head_t wait_queue;
-+ struct blktap_request_bucket *buckets[MAX_BUCKETS];
-+};
++ down_read(&tap->tap_sem);
+
-+static struct blktap_request_pool pool;
++ ring = &tap->ring;
++ if (!ring->vma) {
++ up_read(&tap->tap_sem);
++ return 0;
++ }
+
-+static inline struct blktap_request_handle *
-+blktap_request_to_handle(struct blktap_request *req)
-+{
-+ return container_of(req, struct blktap_request_handle, request);
-+}
++ /* for each outstanding message on the ring */
++ rp = ring->ring.sring->rsp_prod;
++ rmb();
+
-+static void
-+blktap_request_pool_init_request(struct blktap_request *request)
-+{
-+ int i;
++ for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
++ memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
++ mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
++ ++ring->ring.rsp_cons;
+
-+ request->usr_idx = -1;
-+ request->nr_pages = 0;
-+ request->status = BLKTAP_REQUEST_FREE;
-+ INIT_LIST_HEAD(&request->free_list);
-+ for (i = 0; i < ARRAY_SIZE(request->handles); i++) {
-+ request->handles[i].user = INVALID_GRANT_HANDLE;
-+ request->handles[i].kernel = INVALID_GRANT_HANDLE;
++ usr_idx = (int)res.id;
++ if (usr_idx >= MAX_PENDING_REQS ||
++ !tap->pending_requests[usr_idx]) {
++ BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n",
++ rc, rp, usr_idx, tap->pid, ring->vma);
++ continue;
++ }
++
++ request = tap->pending_requests[usr_idx];
++ BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
++ blktap_device_finish_request(tap, &res, request);
+ }
++
++ up_read(&tap->tap_sem);
++
++ blktap_run_deferred();
++
++ return 0;
+}
+
-+static int
-+blktap_request_pool_allocate_bucket(void)
++static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
-+ int i, idx;
-+ unsigned long flags;
++ return VM_FAULT_SIGBUS;
++}
++
++static pte_t
++blktap_ring_clear_pte(struct vm_area_struct *vma,
++ unsigned long uvaddr,
++ pte_t *ptep, int is_fullmm)
++{
++ pte_t copy;
++ struct blktap *tap;
++ unsigned long kvaddr;
++ struct page **map, *page;
++ struct blktap_ring *ring;
+ struct blktap_request *request;
-+ struct blktap_request_handle *handle;
-+ struct blktap_request_bucket *bucket;
++ struct grant_handle_pair *khandle;
++ struct gnttab_unmap_grant_ref unmap[2];
++ int offset, seg, usr_idx, count = 0;
+
-+ bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL);
-+ if (!bucket)
-+ goto fail;
++ tap = vma_to_blktap(vma);
++ ring = &tap->ring;
++ map = ring->foreign_map.map;
++ BUG_ON(!map); /* TODO: should this fail gracefully instead of BUG? */
+
-+ bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES);
-+ if (!bucket->foreign_pages)
-+ goto fail;
++ /*
++ * Zap entry if the address is before the start of the grant
++ * mapped region.
++ */
++ if (uvaddr < ring->user_vstart)
++ return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
++ ptep, is_fullmm);
+
-+ spin_lock_irqsave(&pool.lock, flags);
++ offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
++ usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
++ seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
-+ idx = -1;
-+ for (i = 0; i < MAX_BUCKETS; i++) {
-+ if (!pool.buckets[i]) {
-+ idx = i;
-+ pool.buckets[idx] = bucket;
-+ break;
-+ }
++ offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
++ page = map[offset];
++ if (page) {
++ ClearPageReserved(page);
++ if (blkback_pagemap_contains_page(page))
++ set_page_private(page, 0);
+ }
++ map[offset] = NULL;
+
-+ if (idx == -1) {
-+ spin_unlock_irqrestore(&pool.lock, flags);
-+ goto fail;
++ request = tap->pending_requests[usr_idx];
++ kvaddr = request_to_kaddr(request, seg);
++ khandle = request->handles + seg;
++
++ if (khandle->kernel != INVALID_GRANT_HANDLE) {
++ gnttab_set_unmap_op(&unmap[count], kvaddr,
++ GNTMAP_host_map, khandle->kernel);
++ count++;
++
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++ INVALID_P2M_ENTRY);
+ }
+
-+ for (i = 0; i < BUCKET_SIZE; i++) {
-+ handle = bucket->handles + i;
-+ request = &handle->request;
+
-+ handle->slot = i;
-+ handle->inuse = 0;
-+ handle->bucket = bucket;
++ if (khandle->user != INVALID_GRANT_HANDLE) {
++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+
-+ blktap_request_pool_init_request(request);
-+ list_add_tail(&request->free_list, &pool.free_list);
-+ }
++ copy = *ptep;
++ gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep).maddr,
++ GNTMAP_host_map
++ | GNTMAP_application_map
++ | GNTMAP_contains_pte,
++ khandle->user);
++ count++;
++ } else
++ copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
++ is_fullmm);
+
-+ spin_unlock_irqrestore(&pool.lock, flags);
++ if (count)
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++ unmap, count))
++ BUG();
+
-+ return 0;
++ khandle->kernel = INVALID_GRANT_HANDLE;
++ khandle->user = INVALID_GRANT_HANDLE;
+
-+fail:
-+ if (bucket && bucket->foreign_pages)
-+ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
-+ kfree(bucket);
-+ return -ENOMEM;
++ return copy;
++}
++
++static void
++blktap_ring_vm_unmap(struct vm_area_struct *vma)
++{
++ struct blktap *tap = vma_to_blktap(vma);
++
++ down_write(&tap->tap_sem);
++ clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
++ clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++ clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++ up_write(&tap->tap_sem);
+}
+
-+static void
-+blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket)
-+{
-+ if (!bucket)
-+ return;
++static void
++blktap_ring_vm_close(struct vm_area_struct *vma)
++{
++ struct blktap *tap = vma_to_blktap(vma);
++ struct blktap_ring *ring = &tap->ring;
++
++ blktap_ring_vm_unmap(vma); /* fail future requests */
++ blktap_device_fail_pending_requests(tap); /* fail pending requests */
++ blktap_device_restart(tap); /* fail deferred requests */
++
++ down_write(&tap->tap_sem);
++
++ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
++
++ kfree(ring->foreign_map.map);
++ ring->foreign_map.map = NULL;
+
-+ BTDBG("freeing bucket %p\n", bucket);
++ /* Free the ring page. */
++ ClearPageReserved(virt_to_page(ring->ring.sring));
++ free_page((unsigned long)ring->ring.sring);
+
-+ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
-+ kfree(bucket);
-+}
++ BTINFO("unmapping ring %d\n", tap->minor);
++ ring->ring.sring = NULL;
++ ring->vma = NULL;
+
-+struct page *
-+request_to_page(struct blktap_request *req, int seg)
-+{
-+ struct blktap_request_handle *handle = blktap_request_to_handle(req);
-+ int idx = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
-+ return handle->bucket->foreign_pages[idx];
++ up_write(&tap->tap_sem);
++
++ wake_up(&tap->wq);
+}
+
-+int
-+blktap_request_pool_shrink(void)
++static struct vm_operations_struct blktap_ring_vm_operations = {
++ .close = blktap_ring_vm_close,
++ .unmap = blktap_ring_vm_unmap,
++ .fault = blktap_ring_fault,
++ .zap_pte = blktap_ring_clear_pte,
++};
++
++static int
++blktap_ring_open(struct inode *inode, struct file *filp)
+{
-+ int i, err;
-+ unsigned long flags;
-+ struct blktap_request_bucket *bucket;
++ int idx;
++ struct blktap *tap;
+
-+ err = -EAGAIN;
++ idx = iminor(inode);
++ if (idx < 0 || idx >= MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) {
++ BTERR("unable to open device blktap%d\n", idx);
++ return -ENODEV;
++ }
+
-+ spin_lock_irqsave(&pool.lock, flags);
++ tap = blktaps[idx];
+
-+ /* always keep at least one bucket */
-+ for (i = 1; i < MAX_BUCKETS; i++) {
-+ bucket = pool.buckets[i];
-+ if (!bucket)
-+ continue;
++ BTINFO("opening device blktap%d\n", idx);
+
-+ if (atomic_read(&bucket->reqs_in_use))
-+ continue;
++ if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
++ return -ENODEV;
+
-+ blktap_request_pool_free_bucket(bucket);
-+ pool.buckets[i] = NULL;
-+ err = 0;
-+ break;
-+ }
++ /* Only one process can access ring at a time */
++ if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
++ return -EBUSY;
+
-+ spin_unlock_irqrestore(&pool.lock, flags);
++ filp->private_data = tap;
++ BTINFO("opened device %d\n", tap->minor);
+
-+ return err;
++ return 0;
+}
+
-+int
-+blktap_request_pool_grow(void)
++static int
++blktap_ring_release(struct inode *inode, struct file *filp)
+{
-+ return blktap_request_pool_allocate_bucket();
++ struct blktap *tap = filp->private_data;
++
++ BTINFO("freeing device %d\n", tap->minor);
++ clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
++ filp->private_data = NULL;
++ wake_up(&tap->wq);
++ return 0;
+}
+
-+struct blktap_request *
-+blktap_request_allocate(struct blktap *tap)
++/* Note on mmap:
++ * We need to map pages to user space in a way that will allow the block
++ * subsystem to set up direct IO to them. This couldn't be done before, because
++ * there isn't really a sane way to translate a user virtual address down to a
++ * physical address when the page belongs to another domain.
++ *
++ * My first approach was to map the page in to kernel memory, add an entry
++ * for it in the physical frame list (using alloc_lomem_region as in blkback)
++ * and then attempt to map that page up to user space. This is disallowed
++ * by xen though, which realizes that we don't really own the machine frame
++ * underlying the physical page.
++ *
++ * The new approach is to provide explicit support for this in xen linux.
++ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
++ * mapped from other vms. vma->vm_private_data is set up as a mapping
++ * from pages to actual page structs. There is a new clause in get_user_pages
++ * that does the right thing for this sort of mapping.
++ */
++static int
++blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
+{
-+ int i;
-+ uint16_t usr_idx;
-+ unsigned long flags;
-+ struct blktap_request *request;
-+
-+ usr_idx = -1;
-+ request = NULL;
-+
-+ spin_lock_irqsave(&pool.lock, flags);
-+
-+ if (pool.status == BLKTAP_POOL_CLOSING)
-+ goto out;
++ int size, err;
++ struct page **map;
++ struct blktap *tap;
++ struct blkif_sring *sring;
++ struct blktap_ring *ring;
+
-+ for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++)
-+ if (!tap->pending_requests[i]) {
-+ usr_idx = i;
-+ break;
-+ }
++ tap = filp->private_data;
++ if (!tap)
++ return -ENODEV;
++
++ ring = &tap->ring;
++ map = NULL;
++ sring = NULL;
+
-+ if (usr_idx == (uint16_t)-1)
-+ goto out;
++ if (test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
++ return -EBUSY;
+
-+ if (!list_empty(&pool.free_list)) {
-+ request = list_entry(pool.free_list.next,
-+ struct blktap_request, free_list);
-+ list_del(&request->free_list);
++ size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++ if (size != (MMAP_PAGES + RING_PAGES)) {
++ BTERR("you _must_ map exactly %lu pages!\n",
++ MMAP_PAGES + RING_PAGES);
++ return -EAGAIN;
+ }
+
-+ if (request) {
-+ struct blktap_request_handle *handle;
++ /* Allocate the fe ring. */
++ sring = (struct blkif_sring *)get_zeroed_page(GFP_KERNEL);
++ if (!sring) {
++ BTERR("Couldn't alloc sring.\n");
++ goto fail_mem;
++ }
+
-+ atomic_inc(&pool.reqs_in_use);
++ map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
++ if (!map) {
++ BTERR("Couldn't alloc VM_FOREIGN map.\n");
++ goto fail_mem;
++ }
+
-+ handle = blktap_request_to_handle(request);
-+ atomic_inc(&handle->bucket->reqs_in_use);
-+ handle->inuse = 1;
++ SetPageReserved(virt_to_page(sring));
++
++ SHARED_RING_INIT(sring);
++ FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
+
-+ request->usr_idx = usr_idx;
++ ring->ring_vstart = vma->vm_start;
++ ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);
+
-+ tap->pending_requests[usr_idx] = request;
-+ tap->pending_cnt++;
++ /* Map the ring pages to the start of the region and reserve it. */
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ err = vm_insert_page(vma, vma->vm_start,
++ virt_to_page(ring->ring.sring));
++ else
++ err = remap_pfn_range(vma, vma->vm_start,
++ __pa(ring->ring.sring) >> PAGE_SHIFT,
++ PAGE_SIZE, vma->vm_page_prot);
++ if (err) {
++ BTERR("Mapping user ring failed: %d\n", err);
++ goto fail;
+ }
+
-+out:
-+ spin_unlock_irqrestore(&pool.lock, flags);
-+ return request;
-+}
++ /* Mark this VM as containing foreign pages, and set up mappings. */
++ ring->foreign_map.map = map;
++ vma->vm_private_data = &ring->foreign_map;
++ vma->vm_flags |= VM_FOREIGN;
++ vma->vm_flags |= VM_DONTCOPY;
++ vma->vm_flags |= VM_RESERVED;
++ vma->vm_ops = &blktap_ring_vm_operations;
+
-+void
-+blktap_request_free(struct blktap *tap, struct blktap_request *request)
-+{
-+ int free;
-+ unsigned long flags;
-+ struct blktap_request_handle *handle;
++#ifdef CONFIG_X86
++ vma->vm_mm->context.has_foreign_mappings = 1;
++#endif
+
-+ BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests));
-+ handle = blktap_request_to_handle(request);
++ tap->pid = current->pid;
++ BTINFO("blktap: mapping pid is %d\n", tap->pid);
+
-+ spin_lock_irqsave(&pool.lock, flags);
++ ring->vma = vma;
++ return 0;
+
-+ handle->inuse = 0;
-+ tap->pending_requests[request->usr_idx] = NULL;
-+ blktap_request_pool_init_request(request);
-+ list_add(&request->free_list, &pool.free_list);
-+ atomic_dec(&handle->bucket->reqs_in_use);
-+ free = atomic_dec_and_test(&pool.reqs_in_use);
++ fail:
++ /* Clear any active mappings. */
++ zap_page_range(vma, vma->vm_start,
++ vma->vm_end - vma->vm_start, NULL);
++ ClearPageReserved(virt_to_page(sring));
++ fail_mem:
++ free_page((unsigned long)sring);
++ kfree(map);
+
-+ spin_unlock_irqrestore(&pool.lock, flags);
++ return -ENOMEM;
++}
+
-+ if (--tap->pending_cnt == 0)
-+ wake_up_interruptible(&tap->wq);
++static inline void
++blktap_ring_set_message(struct blktap *tap, int msg)
++{
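++ /* The first sring pad byte doubles as a message box for userspace. */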
++ struct blktap_ring *ring = &tap->ring;
+
-+ if (free)
-+ wake_up(&pool.wait_queue);
++ down_read(&tap->tap_sem);
++ if (ring->ring.sring)
++ ring->ring.sring->pad[0] = msg;
++ up_read(&tap->tap_sem);
+}
+
-+void
-+blktap_request_pool_free(void)
++static int
++blktap_ring_ioctl(struct inode *inode, struct file *filp,
++ unsigned int cmd, unsigned long arg)
+{
-+ int i;
-+ unsigned long flags;
++ struct blktap_params params;
++ struct blktap *tap = filp->private_data;
+
-+ spin_lock_irqsave(&pool.lock, flags);
++ BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
++
++ switch (cmd) {
++ case BLKTAP2_IOCTL_KICK_FE:
++ /* There are fe messages to process. */
++ return blktap_read_ring(tap);
++
++ case BLKTAP2_IOCTL_CREATE_DEVICE:
++ if (!arg)
++ return -EINVAL;
+
-+ pool.status = BLKTAP_POOL_CLOSING;
-+ while (atomic_read(&pool.reqs_in_use)) {
-+ spin_unlock_irqrestore(&pool.lock, flags);
-+ wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use));
-+ spin_lock_irqsave(&pool.lock, flags);
-+ }
++ if (copy_from_user(¶ms, (struct blktap_params __user *)arg,
++ sizeof(params))) {
++ BTERR("failed to get params\n");
++ return -EFAULT;
++ }
+
-+ for (i = 0; i < MAX_BUCKETS; i++) {
-+ blktap_request_pool_free_bucket(pool.buckets[i]);
-+ pool.buckets[i] = NULL;
-+ }
++ if (blktap_validate_params(tap, ¶ms)) {
++ BTERR("invalid params\n");
++ return -EINVAL;
++ }
+
-+ spin_unlock_irqrestore(&pool.lock, flags);
-+}
++ tap->params = params;
++ return blktap_device_create(tap);
+
-+int __init
-+blktap_request_pool_init(void)
-+{
-+ int i, err;
++ case BLKTAP2_IOCTL_SET_PARAMS:
++ if (!arg)
++ return -EINVAL;
+
-+ memset(&pool, 0, sizeof(pool));
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
+
-+ spin_lock_init(&pool.lock);
-+ INIT_LIST_HEAD(&pool.free_list);
-+ atomic_set(&pool.reqs_in_use, 0);
-+ init_waitqueue_head(&pool.wait_queue);
++ if (copy_from_user(¶ms, (struct blktap_params __user *)arg,
++ sizeof(params))) {
++ BTERR("failed to get params\n");
++ return -EFAULT;
++ }
+
-+ for (i = 0; i < 2; i++) {
-+ err = blktap_request_pool_allocate_bucket();
-+ if (err)
-+ goto fail;
-+ }
++ if (blktap_validate_params(tap, ¶ms)) {
++ BTERR("invalid params\n");
++ return -EINVAL;
++ }
+
-+ return 0;
++ tap->params = params;
++ return 0;
+
-+fail:
-+ blktap_request_pool_free();
-+ return err;
-+}
-diff --git a/drivers/xen/blktap/ring.c b/drivers/xen/blktap/ring.c
-new file mode 100644
-index 0000000..74a7aa7
---- /dev/null
-+++ b/drivers/xen/blktap/ring.c
-@@ -0,0 +1,615 @@
-+#include <linux/module.h>
-+#include <linux/signal.h>
-+#include <linux/sched.h>
-+#include <linux/poll.h>
++ case BLKTAP2_IOCTL_PAUSE:
++ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
++ return -EINVAL;
+
-+#include <asm/xen/page.h>
-+#include <asm/xen/hypercall.h>
++ set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++ clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
+
-+#include "blktap.h"
++ blktap_ring_set_message(tap, 0);
++ wake_up_interruptible(&tap->wq);
+
-+#ifdef CONFIG_XEN_BLKDEV_BACKEND
-+#include "../blkback/blkback-pagemap.h"
-+#else
-+#define blkback_pagemap_contains_page(page) 0
-+#endif
++ return 0;
+
-+static int blktap_ring_major;
+
-+static inline struct blktap *
-+vma_to_blktap(struct vm_area_struct *vma)
-+{
-+ struct vm_foreign_map *m = vma->vm_private_data;
-+ struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map);
-+ return container_of(r, struct blktap, ring);
-+}
++ case BLKTAP2_IOCTL_REOPEN:
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
+
-+ /*
-+ * BLKTAP - immediately before the mmap area,
-+ * we have a bunch of pages reserved for shared memory rings.
-+ */
-+#define RING_PAGES 1
++ if (!arg)
++ return -EINVAL;
+
-+static int
-+blktap_read_ring(struct blktap *tap)
-+{
-+ /* This is called to read responses from the ring. */
-+ int usr_idx;
-+ RING_IDX rc, rp;
-+ struct blkif_response res;
-+ struct blktap_ring *ring;
-+ struct blktap_request *request;
++ if (copy_to_user((char __user *)arg,
++ tap->params.name,
++ strlen(tap->params.name) + 1))
++ return -EFAULT;
+
-+ down_read(&tap->tap_sem);
++ blktap_ring_set_message(tap, 0);
++ wake_up_interruptible(&tap->wq);
+
-+ ring = &tap->ring;
-+ if (!ring->vma) {
-+ up_read(&tap->tap_sem);
+ return 0;
-+ }
+
-+ /* for each outstanding message on the ring */
-+ rp = ring->ring.sring->rsp_prod;
-+ rmb();
++ case BLKTAP2_IOCTL_RESUME:
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
+
-+ for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
-+ memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
-+ mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
-+ ++ring->ring.rsp_cons;
++ tap->ring.response = (int)arg;
++ if (!tap->ring.response)
++ clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
+
-+ usr_idx = (int)res.id;
-+ if (usr_idx >= MAX_PENDING_REQS ||
-+ !tap->pending_requests[usr_idx]) {
-+ BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n",
-+ rc, rp, usr_idx, tap->pid, ring->vma);
-+ continue;
-+ }
++ blktap_ring_set_message(tap, 0);
++ wake_up_interruptible(&tap->wq);
+
-+ request = tap->pending_requests[usr_idx];
-+ BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
-+ blktap_device_finish_request(tap, &res, request);
++ return 0;
+ }
+
-+ up_read(&tap->tap_sem);
++ return -ENOIOCTLCMD;
++}
+
-+ blktap_run_deferred();
++static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
++{
++ struct blktap *tap = filp->private_data;
++ struct blktap_ring *ring = &tap->ring;
++
++ poll_wait(filp, &ring->poll_wait, wait);
++ if (ring->ring.sring->pad[0] != 0 ||
++ ring->ring.req_prod_pvt != ring->ring.sring->req_prod) {
++ RING_PUSH_REQUESTS(&ring->ring);
++ return POLLIN | POLLRDNORM;
++ }
+
+ return 0;
+}
+
-+static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++static struct file_operations blktap_ring_file_operations = {
++ .owner = THIS_MODULE,
++ .open = blktap_ring_open,
++ .release = blktap_ring_release,
++ .ioctl = blktap_ring_ioctl,
++ .mmap = blktap_ring_mmap,
++ .poll = blktap_ring_poll,
++};
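+
For illustration only, a minimal tapdisk-style consumer of the character
device registered above might look roughly like the sketch below; the device
path, the page count and the ioctl header are assumptions, not part of this
patch:

	/* sketch: drive one blktap ring from userspace (hypothetical) */
	#include <stdio.h>
	#include <fcntl.h>
	#include <poll.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>

	static int demo_ring_loop(int minor, size_t nr_pages)
	{
		char path[32];
		struct pollfd pfd;
		void *shm;
		int fd;

		snprintf(path, sizeof(path), "/dev/blktap%d", minor);
		fd = open(path, O_RDWR);	/* blktap_ring_open() */
		if (fd < 0)
			return -1;

		/* blktap_ring_mmap() insists on MMAP_PAGES + RING_PAGES pages */
		shm = mmap(NULL, nr_pages * getpagesize(),
			   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (shm == MAP_FAILED)
			return -1;

		pfd.fd = fd;
		pfd.events = POLLIN;
		while (poll(&pfd, 1, -1) > 0) {
			/* consume requests from the shared ring here ... */
			ioctl(fd, BLKTAP2_IOCTL_KICK_FE, 0); /* hand back responses */
		}
		return 0;
	}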
++
++void
++blktap_ring_kick_user(struct blktap *tap)
+{
-+ return VM_FAULT_SIGBUS;
++ wake_up_interruptible(&tap->ring.poll_wait);
+}
+
-+static pte_t
-+blktap_ring_clear_pte(struct vm_area_struct *vma,
-+ unsigned long uvaddr,
-+ pte_t *ptep, int is_fullmm)
++int
++blktap_ring_resume(struct blktap *tap)
+{
-+ pte_t copy;
-+ struct blktap *tap;
-+ unsigned long kvaddr;
-+ struct page **map, *page;
-+ struct blktap_ring *ring;
-+ struct blktap_request *request;
-+ struct grant_handle_pair *khandle;
-+ struct gnttab_unmap_grant_ref unmap[2];
-+ int offset, seg, usr_idx, count = 0;
-+
-+ tap = vma_to_blktap(vma);
-+ ring = &tap->ring;
-+ map = ring->foreign_map.map;
-+ BUG_ON(!map); /* TODO Should this be changed to if statement? */
-+
-+ /*
-+ * Zap entry if the address is before the start of the grant
-+ * mapped region.
-+ */
-+ if (uvaddr < ring->user_vstart)
-+ return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
-+ ptep, is_fullmm);
-+
-+ offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
-+ usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
-+ seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
-+
-+ offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
-+ page = map[offset];
-+ if (page) {
-+ ClearPageReserved(page);
-+ if (blkback_pagemap_contains_page(page))
-+ set_page_private(page, 0);
-+ }
-+ map[offset] = NULL;
-+
-+ request = tap->pending_requests[usr_idx];
-+ kvaddr = request_to_kaddr(request, seg);
-+ khandle = request->handles + seg;
++ int err;
++ struct blktap_ring *ring = &tap->ring;
+
-+ if (khandle->kernel != INVALID_GRANT_HANDLE) {
-+ gnttab_set_unmap_op(&unmap[count], kvaddr,
-+ GNTMAP_host_map, khandle->kernel);
-+ count++;
++ if (!blktap_active(tap))
++ return -ENODEV;
+
-+ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
-+ INVALID_P2M_ENTRY);
-+ }
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
+
++ /* set shared flag for resume */
++ ring->response = 0;
+
-+ if (khandle->user != INVALID_GRANT_HANDLE) {
-+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
++ blktap_ring_kick_user(tap);
+
-+ copy = *ptep;
-+ gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep).maddr,
-+ GNTMAP_host_map
-+ | GNTMAP_application_map
-+ | GNTMAP_contains_pte,
-+ khandle->user);
-+ count++;
-+ } else
-+ copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
-+ is_fullmm);
++ wait_event_interruptible(tap->wq, ring->response ||
++ !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
+
-+ if (count)
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
-+ unmap, count))
-+ BUG();
++ err = ring->response;
++ ring->response = 0;
+
-+ khandle->kernel = INVALID_GRANT_HANDLE;
-+ khandle->user = INVALID_GRANT_HANDLE;
++ BTDBG("err: %d\n", err);
+
-+ return copy;
-+}
++ if (err)
++ return err;
+
-+static void
-+blktap_ring_vm_unmap(struct vm_area_struct *vma)
-+{
-+ struct blktap *tap = vma_to_blktap(vma);
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EAGAIN;
+
-+ down_write(&tap->tap_sem);
-+ clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
-+ clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
-+ clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
-+ up_write(&tap->tap_sem);
++ return 0;
+}
+
-+static void
-+blktap_ring_vm_close(struct vm_area_struct *vma)
++int
++blktap_ring_pause(struct blktap *tap)
+{
-+ struct blktap *tap = vma_to_blktap(vma);
-+ struct blktap_ring *ring = &tap->ring;
++ if (!blktap_active(tap))
++ return -ENODEV;
+
-+ blktap_ring_vm_unmap(vma); /* fail future requests */
-+ blktap_device_fail_pending_requests(tap); /* fail pending requests */
-+ blktap_device_restart(tap); /* fail deferred requests */
++ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
++ return -EINVAL;
+
-+ down_write(&tap->tap_sem);
++ BTDBG("draining queue\n");
++ wait_event_interruptible(tap->wq, !tap->pending_cnt);
++ if (tap->pending_cnt)
++ return -EAGAIN;
+
-+ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
++ blktap_ring_kick_user(tap);
+
-+ kfree(ring->foreign_map.map);
-+ ring->foreign_map.map = NULL;
++ BTDBG("waiting for tapdisk response\n");
++ wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EAGAIN;
+
-+ /* Free the ring page. */
-+ ClearPageReserved(virt_to_page(ring->ring.sring));
-+ free_page((unsigned long)ring->ring.sring);
++ return 0;
++}
+
-+ BTINFO("unmapping ring %d\n", tap->minor);
-+ ring->ring.sring = NULL;
-+ ring->vma = NULL;
++int
++blktap_ring_destroy(struct blktap *tap)
++{
++ if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) &&
++ !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
++ return 0;
+
-+ up_write(&tap->tap_sem);
++ BTDBG("sending tapdisk close message\n");
++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
++ blktap_ring_kick_user(tap);
+
-+ wake_up(&tap->wq);
++ return -EAGAIN;
+}
+
-+static struct vm_operations_struct blktap_ring_vm_operations = {
-+ .close = blktap_ring_vm_close,
-+ .unmap = blktap_ring_vm_unmap,
-+ .fault = blktap_ring_fault,
-+ .zap_pte = blktap_ring_clear_pte,
-+};
++static void
++blktap_ring_initialize(struct blktap_ring *ring, int minor)
++{
++ memset(ring, 0, sizeof(*ring));
++ init_waitqueue_head(&ring->poll_wait);
++ ring->devno = MKDEV(blktap_ring_major, minor);
++}
+
-+static int
-+blktap_ring_open(struct inode *inode, struct file *filp)
++int
++blktap_ring_create(struct blktap *tap)
+{
-+ int idx;
-+ struct blktap *tap;
++ struct blktap_ring *ring = &tap->ring;
++ blktap_ring_initialize(ring, tap->minor);
++ return blktap_sysfs_create(tap);
++}
+
-+ idx = iminor(inode);
-+ if (idx < 0 || idx > MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) {
-+ BTERR("unable to open device blktap%d\n", idx);
-+ return -ENODEV;
++int __init
++blktap_ring_init(int *major)
++{
++ int err;
++
++ err = register_chrdev(0, "blktap2", &blktap_ring_file_operations);
++ if (err < 0) {
++ BTERR("error registering blktap ring device: %d\n", err);
++ return err;
+ }
+
-+ tap = blktaps[idx];
++ blktap_ring_major = *major = err;
++ BTINFO("blktap ring major: %d\n", blktap_ring_major);
++ return 0;
++}
+
-+ BTINFO("opening device blktap%d\n", idx);
++int
++blktap_ring_free(void)
++{
++ if (blktap_ring_major)
++ unregister_chrdev(blktap_ring_major, "blktap2");
+
-+ if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
-+ return -ENODEV;
++ return 0;
++}
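+
The pause/resume handshake above runs over sring->pad[0]:
blktap_ring_set_message() posts a message byte, blktap_ring_poll() reports
POLLIN while it is non-zero, and tapdisk acknowledges through the ioctls. The
userspace half could plausibly look like this (constants and header
placement assumed, purely for illustration):

	/* sketch: userspace side of the pad[0] message protocol */
	static void demo_handle_message(int fd, struct blkif_sring *sring)
	{
		switch (sring->pad[0]) {
		case BLKTAP2_RING_MESSAGE_PAUSE:
			/* quiesce the disk image, then acknowledge */
			ioctl(fd, BLKTAP2_IOCTL_PAUSE, 0);
			break;
		case BLKTAP2_RING_MESSAGE_RESUME:
			/* reopen the image; arg carries 0 or an errno */
			ioctl(fd, BLKTAP2_IOCTL_RESUME, 0);
			break;
		case BLKTAP2_RING_MESSAGE_CLOSE:
			/* drain outstanding requests and exit */
			break;
		}
	}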
+diff --git a/drivers/xen/blktap/sysfs.c b/drivers/xen/blktap/sysfs.c
+new file mode 100644
+index 0000000..23a3a51
+--- /dev/null
++++ b/drivers/xen/blktap/sysfs.c
+@@ -0,0 +1,451 @@
++#include <linux/types.h>
++#include <linux/device.h>
++#include <linux/module.h>
++#include <linux/sched.h>
+
-+ /* Only one process can access ring at a time */
-+ if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
-+ return -EBUSY;
++#include "blktap.h"
+
-+ filp->private_data = tap;
-+ BTINFO("opened device %d\n", tap->minor);
++int blktap_debug_level = 1;
+
-+ return 0;
++static struct class *class;
++static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq);
++
++static inline void
++blktap_sysfs_get(struct blktap *tap)
++{
++ atomic_inc(&tap->ring.sysfs_refcnt);
+}
+
-+static int
-+blktap_ring_release(struct inode *inode, struct file *filp)
++static inline void
++blktap_sysfs_put(struct blktap *tap)
+{
-+ struct blktap *tap = filp->private_data;
++ if (atomic_dec_and_test(&tap->ring.sysfs_refcnt))
++ wake_up(&sysfs_wq);
++}
+
-+ BTINFO("freeing device %d\n", tap->minor);
-+ clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
-+ filp->private_data = NULL;
-+ wake_up(&tap->wq);
-+ return 0;
++static inline void
++blktap_sysfs_enter(struct blktap *tap)
++{
++ blktap_sysfs_get(tap); /* pin sysfs device */
++ mutex_lock(&tap->ring.sysfs_mutex); /* serialize sysfs operations */
+}
+
-+/* Note on mmap:
-+ * We need to map pages to user space in a way that will allow the block
-+ * subsystem set up direct IO to them. This couldn't be done before, because
-+ * there isn't really a sane way to translate a user virtual address down to a
-+ * physical address when the page belongs to another domain.
-+ *
-+ * My first approach was to map the page in to kernel memory, add an entry
-+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
-+ * and then attempt to map that page up to user space. This is disallowed
-+ * by xen though, which realizes that we don't really own the machine frame
-+ * underlying the physical page.
-+ *
-+ * The new approach is to provide explicit support for this in xen linux.
-+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
-+ * mapped from other vms. vma->vm_private_data is set up as a mapping
-+ * from pages to actual page structs. There is a new clause in get_user_pages
-+ * that does the right thing for this sort of mapping.
-+ */
-+static int
-+blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
++static inline void
++blktap_sysfs_exit(struct blktap *tap)
+{
-+ int size, err;
-+ struct page **map;
-+ struct blktap *tap;
-+ struct blkif_sring *sring;
-+ struct blktap_ring *ring;
++ mutex_unlock(&tap->ring.sysfs_mutex);
++ blktap_sysfs_put(tap);
++}
+
-+ tap = filp->private_data;
-+ ring = &tap->ring;
-+ map = NULL;
-+ sring = NULL;
++#define CLASS_DEVICE_ATTR(a,b,c,d) DEVICE_ATTR(a,b,c,d)
+
-+ if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
-+ return -ENOMEM;
++static ssize_t blktap_sysfs_pause_device(struct device *, struct device_attribute *, const char *, size_t);
++CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
++static ssize_t blktap_sysfs_resume_device(struct device *, struct device_attribute *, const char *, size_t);
++CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
+
-+ size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-+ if (size != (MMAP_PAGES + RING_PAGES)) {
-+ BTERR("you _must_ map exactly %lu pages!\n",
-+ MMAP_PAGES + RING_PAGES);
-+ return -EAGAIN;
-+ }
++static ssize_t
++blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size)
++{
++ int err;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ /* Allocate the fe ring. */
-+ sring = (struct blkif_sring *)get_zeroed_page(GFP_KERNEL);
-+ if (!sring) {
-+ BTERR("Couldn't alloc sring.\n");
-+ goto fail_mem;
-+ }
++ blktap_sysfs_enter(tap);
+
-+ map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
-+ if (!map) {
-+ BTERR("Couldn't alloc VM_FOREIGN map.\n");
-+ goto fail_mem;
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
+ }
+
-+ SetPageReserved(virt_to_page(sring));
-+
-+ SHARED_RING_INIT(sring);
-+ FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
-+
-+ ring->ring_vstart = vma->vm_start;
-+ ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = -EPERM;
++ goto out;
++ }
+
-+ /* Map the ring pages to the start of the region and reserve it. */
-+ if (xen_feature(XENFEAT_auto_translated_physmap))
-+ err = vm_insert_page(vma, vma->vm_start,
-+ virt_to_page(ring->ring.sring));
-+ else
-+ err = remap_pfn_range(vma, vma->vm_start,
-+ __pa(ring->ring.sring) >> PAGE_SHIFT,
-+ PAGE_SIZE, vma->vm_page_prot);
-+ if (err) {
-+ BTERR("Mapping user ring failed: %d\n", err);
-+ goto fail;
++ if (size > BLKTAP2_MAX_MESSAGE_LEN) {
++ err = -ENAMETOOLONG;
++ goto out;
+ }
+
-+ /* Mark this VM as containing foreign pages, and set up mappings. */
-+ ring->foreign_map.map = map;
-+ vma->vm_private_data = &ring->foreign_map;
-+ vma->vm_flags |= VM_FOREIGN;
-+ vma->vm_flags |= VM_DONTCOPY;
-+ vma->vm_flags |= VM_RESERVED;
-+ vma->vm_ops = &blktap_ring_vm_operations;
++ if (strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >= BLKTAP2_MAX_MESSAGE_LEN) {
++ err = -EINVAL;
++ goto out;
++ }
+
-+#ifdef CONFIG_X86
-+ vma->vm_mm->context.has_foreign_mappings = 1;
-+#endif
++ snprintf(tap->params.name, sizeof(tap->params.name) - 1, "%s", buf);
++ err = size;
+
-+ tap->pid = current->pid;
-+ BTINFO("blktap: mapping pid is %d\n", tap->pid);
++out:
++ blktap_sysfs_exit(tap);
++ return err;
++}
+
-+ ring->vma = vma;
-+ return 0;
++static ssize_t
++blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf)
++{
++ ssize_t size;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ fail:
-+ /* Clear any active mappings. */
-+ zap_page_range(vma, vma->vm_start,
-+ vma->vm_end - vma->vm_start, NULL);
-+ ClearPageReserved(virt_to_page(sring));
-+ fail_mem:
-+ free_page((unsigned long)sring);
-+ kfree(map);
++ blktap_sysfs_enter(tap);
+
-+ return -ENOMEM;
++ if (!tap->ring.dev)
++ size = -ENODEV;
++ else if (tap->params.name[0])
++ size = sprintf(buf, "%s\n", tap->params.name);
++ else
++ size = sprintf(buf, "%d\n", tap->minor);
++
++ blktap_sysfs_exit(tap);
++
++ return size;
+}
++CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
++ blktap_sysfs_get_name, blktap_sysfs_set_name);
+
-+static inline void
-+blktap_ring_set_message(struct blktap *tap, int msg)
++static ssize_t
++blktap_sysfs_remove_device(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
+{
-+ struct blktap_ring *ring = &tap->ring;
++ int err;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ down_read(&tap->tap_sem);
-+ if (ring->ring.sring)
-+ ring->ring.sring->pad[0] = msg;
-+ up_read(&tap->tap_sem);
++ if (!tap->ring.dev)
++ return size;
++
++ if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++ return -EBUSY;
++
++ err = blktap_control_destroy_device(tap);
++
++ return (err ? : size);
+}
++CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+
-+static int
-+blktap_ring_ioctl(struct inode *inode, struct file *filp,
-+ unsigned int cmd, unsigned long arg)
++static ssize_t
++blktap_sysfs_pause_device(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
+{
-+ struct blktap_params params;
-+ struct blktap *tap = filp->private_data;
++ int err;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
++ blktap_sysfs_enter(tap);
+
-+ switch(cmd) {
-+ case BLKTAP2_IOCTL_KICK_FE:
-+ /* There are fe messages to process. */
-+ return blktap_read_ring(tap);
++ BTDBG("pausing %u:%u: dev_inuse: %lu\n",
++ MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse);
+
-+ case BLKTAP2_IOCTL_CREATE_DEVICE:
-+ if (!arg)
-+ return -EINVAL;
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
++ }
+
-+ if (copy_from_user(&params, (struct blktap_params __user *)arg,
-+ sizeof(params))) {
-+ BTERR("failed to get params\n");
-+ return -EFAULT;
-+ }
++ if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++ err = -EBUSY;
++ goto out;
++ }
+
-+ if (blktap_validate_params(tap, &params)) {
-+ BTERR("invalid params\n");
-+ return -EINVAL;
-+ }
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = 0;
++ goto out;
++ }
+
-+ tap->params = params;
-+ return blktap_device_create(tap);
++ err = blktap_device_pause(tap);
++ if (!err) {
++ device_remove_file(dev, &dev_attr_pause);
++ err = device_create_file(dev, &dev_attr_resume);
++ }
+
-+ case BLKTAP2_IOCTL_SET_PARAMS:
-+ if (!arg)
-+ return -EINVAL;
++out:
++ blktap_sysfs_exit(tap);
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EINVAL;
++ return (err ? err : size);
++}
+
-+ if (copy_from_user(&params, (struct blktap_params __user *)arg,
-+ sizeof(params))) {
-+ BTERR("failed to get params\n");
-+ return -EFAULT;
-+ }
++static ssize_t
++blktap_sysfs_resume_device(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
++{
++ int err;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ if (blktap_validate_params(tap, &params)) {
-+ BTERR("invalid params\n");
-+ return -EINVAL;
-+ }
++ blktap_sysfs_enter(tap);
+
-+ tap->params = params;
-+ return 0;
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
++ }
+
-+ case BLKTAP2_IOCTL_PAUSE:
-+ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
-+ return -EINVAL;
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = -EINVAL;
++ goto out;
++ }
+
-+ set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
-+ clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++ err = blktap_device_resume(tap);
++ if (!err) {
++ device_remove_file(dev, &dev_attr_resume);
++ err = device_create_file(dev, &dev_attr_pause);
++ }
+
-+ blktap_ring_set_message(tap, 0);
-+ wake_up_interruptible(&tap->wq);
++out:
++ blktap_sysfs_exit(tap);
+
-+ return 0;
++ BTDBG("returning %zd\n", (err ? err : size));
++ return (err ? err : size);
++}
+
++#ifdef ENABLE_PASSTHROUGH
++static ssize_t
++blktap_sysfs_enable_passthrough(struct device *dev,
++ const char *buf, size_t size)
++{
++ int err;
++ unsigned major, minor;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ case BLKTAP2_IOCTL_REOPEN:
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EINVAL;
++ BTINFO("passthrough request enabled\n");
+
-+ if (!arg)
-+ return -EINVAL;
++ blktap_sysfs_enter(tap);
+
-+ if (copy_to_user((char __user *)arg,
-+ tap->params.name,
-+ strlen(tap->params.name) + 1))
-+ return -EFAULT;
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
++ }
+
-+ blktap_ring_set_message(tap, 0);
-+ wake_up_interruptible(&tap->wq);
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = -EINVAL;
++ goto out;
++ }
+
-+ return 0;
++ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
++ err = -EINVAL;
++ goto out;
++ }
+
-+ case BLKTAP2_IOCTL_RESUME:
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EINVAL;
++ err = sscanf(buf, "%x:%x", &major, &minor);
++ if (err != 2) {
++ err = -EINVAL;
++ goto out;
++ }
+
-+ tap->ring.response = (int)arg;
-+ if (!tap->ring.response)
-+ clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++ err = blktap_device_enable_passthrough(tap, major, minor);
+
-+ blktap_ring_set_message(tap, 0);
-+ wake_up_interruptible(&tap->wq);
++out:
++ blktap_sysfs_exit(tap);
++ BTDBG("returning %d\n", (err ? err : size));
++ return (err ? err : size);
++}
++#endif
+
-+ return 0;
++static ssize_t
++blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf)
++{
++ char *tmp;
++ int i, ret;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++
++ tmp = buf;
++ blktap_sysfs_get(tap);
++
++ if (!tap->ring.dev) {
++ ret = sprintf(tmp, "no device\n");
++ goto out;
+ }
+
-+ return -ENOIOCTLCMD;
-+}
++ tmp += sprintf(tmp, "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n",
++ tap->params.name, MAJOR(tap->ring.devno),
++ MINOR(tap->ring.devno), atomic_read(&tap->refcnt),
++ tap->dev_inuse);
++ tmp += sprintf(tmp, "capacity: 0x%llx, sector size: 0x%lx, "
++ "device users: %d\n", tap->params.capacity,
++ tap->params.sector_size, tap->device.users);
++
++ down_read(&tap->tap_sem);
+
-+static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
-+{
-+ struct blktap *tap = filp->private_data;
-+ struct blktap_ring *ring = &tap->ring;
++ tmp += sprintf(tmp, "pending requests: %d\n", tap->pending_cnt);
++ for (i = 0; i < MAX_PENDING_REQS; i++) {
++ struct blktap_request *req = tap->pending_requests[i];
++ if (!req)
++ continue;
+
-+ poll_wait(filp, &ring->poll_wait, wait);
-+ if (ring->ring.sring->pad[0] != 0 ||
-+ ring->ring.req_prod_pvt != ring->ring.sring->req_prod) {
-+ RING_PUSH_REQUESTS(&ring->ring);
-+ return POLLIN | POLLRDNORM;
++ tmp += sprintf(tmp, "req %d: id: %llu, usr_idx: %d, "
++ "status: 0x%02x, pendcnt: %d, "
++ "nr_pages: %u, op: %d, time: %lu:%lu\n",
++ i, (unsigned long long)req->id, req->usr_idx,
++ req->status, atomic_read(&req->pendcnt),
++ req->nr_pages, req->operation, req->time.tv_sec,
++ req->time.tv_usec);
+ }
+
-+ return 0;
-+}
++ up_read(&tap->tap_sem);
++ ret = (tmp - buf) + 1;
+
-+static struct file_operations blktap_ring_file_operations = {
-+ .owner = THIS_MODULE,
-+ .open = blktap_ring_open,
-+ .release = blktap_ring_release,
-+ .ioctl = blktap_ring_ioctl,
-+ .mmap = blktap_ring_mmap,
-+ .poll = blktap_ring_poll,
-+};
++out:
++ blktap_sysfs_put(tap);
++ BTDBG("%s\n", buf);
+
-+void
-+blktap_ring_kick_user(struct blktap *tap)
-+{
-+ wake_up_interruptible(&tap->ring.poll_wait);
++ return ret;
+}
++CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
+
+int
-+blktap_ring_resume(struct blktap *tap)
++blktap_sysfs_create(struct blktap *tap)
+{
++ struct blktap_ring *ring;
++ struct device *dev;
+ int err;
-+ struct blktap_ring *ring = &tap->ring;
+
-+ if (!blktap_active(tap))
++ if (!class)
+ return -ENODEV;
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EINVAL;
-+
-+ /* set shared flag for resume */
-+ ring->response = 0;
++ ring = &tap->ring;
+
-+ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
-+ blktap_ring_kick_user(tap);
++ dev = device_create(class, NULL, ring->devno,
++ tap, "blktap%d", tap->minor);
++ if (IS_ERR(dev))
++ return PTR_ERR(dev);
+
-+ wait_event_interruptible(tap->wq, ring->response ||
-+ !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
++ ring->dev = dev;
+
-+ err = ring->response;
-+ ring->response = 0;
++ mutex_init(&ring->sysfs_mutex);
++ atomic_set(&ring->sysfs_refcnt, 0);
+
-+ BTDBG("err: %d\n", err);
+
++ printk(KERN_CRIT "%s: adding attributes for dev %p\n", __func__, dev);
++ err = device_create_file(dev, &dev_attr_name);
+ if (err)
-+ return err;
-+
-+ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EAGAIN;
++ goto out;
++ err = device_create_file(dev, &dev_attr_remove);
++ if (err)
++ goto out_unregister_name;
++ err = device_create_file(dev, &dev_attr_pause);
++ if (err)
++ goto out_unregister_remove;
++ err = device_create_file(dev, &dev_attr_debug);
++ if (err)
++ goto out_unregister_pause;
+
+ return 0;
++
++out_unregister_pause:
++ device_remove_file(dev, &dev_attr_pause);
++out_unregister_remove:
++ device_remove_file(dev, &dev_attr_remove);
++out_unregister_name:
++ device_remove_file(dev, &dev_attr_name);
++out:
++ return err;
+}
+
+int
-+blktap_ring_pause(struct blktap *tap)
++blktap_sysfs_destroy(struct blktap *tap)
+{
-+ if (!blktap_active(tap))
-+ return -ENODEV;
-+
-+ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
-+ return -EINVAL;
++ struct blktap_ring *ring;
++ struct device *dev;
+
-+ BTDBG("draining queue\n");
-+ wait_event_interruptible(tap->wq, !tap->pending_cnt);
-+ if (tap->pending_cnt)
-+ return -EAGAIN;
++ printk(KERN_CRIT "%s\n", __func__);
+
-+ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
-+ blktap_ring_kick_user(tap);
++ ring = &tap->ring;
++ dev = ring->dev;
++ if (!class || !dev)
++ return 0;
+
-+ BTDBG("waiting for tapdisk response\n");
-+ wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ ring->dev = NULL;
++ if (wait_event_interruptible(sysfs_wq,
++ !atomic_read(&tap->ring.sysfs_refcnt)))
+ return -EAGAIN;
+
++ device_schedule_callback(dev, device_unregister);
++
+ return 0;
+}
+
-+int
-+blktap_ring_destroy(struct blktap *tap)
++static ssize_t
++blktap_sysfs_show_verbosity(struct class *class, char *buf)
+{
-+ if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) &&
-+ !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
-+ return 0;
++ return sprintf(buf, "%d\n", blktap_debug_level);
++}
+
-+ BTDBG("sending tapdisk close message\n");
-+ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
-+ blktap_ring_kick_user(tap);
++static ssize_t
++blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size)
++{
++ int level;
+
-+ return -EAGAIN;
++ if (sscanf(buf, "%d", &level) == 1) {
++ blktap_debug_level = level;
++ return size;
++ }
++
++ return -EINVAL;
+}
++CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR,
++ blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
+
-+static void
-+blktap_ring_initialize(struct blktap_ring *ring, int minor)
++static ssize_t
++blktap_sysfs_show_devices(struct class *class, char *buf)
+{
-+ memset(ring, 0, sizeof(*ring));
-+ init_waitqueue_head(&ring->poll_wait);
-+ ring->devno = MKDEV(blktap_ring_major, minor);
++ int i, ret;
++ struct blktap *tap;
++
++ ret = 0;
++ for (i = 0; i < MAX_BLKTAP_DEVICE; i++) {
++ tap = blktaps[i];
++ if (!tap)
++ continue;
++
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ continue;
++
++ ret += sprintf(buf + ret, "%d ", tap->minor);
++ ret += snprintf(buf + ret, sizeof(tap->params.name) - 1,
++ tap->params.name);
++ ret += sprintf(buf + ret, "\n");
++ }
++
++ return ret;
+}
++CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL);
+
-+int
-+blktap_ring_create(struct blktap *tap)
++void
++blktap_sysfs_free(void)
+{
-+ struct blktap_ring *ring = &tap->ring;
-+ blktap_ring_initialize(ring, tap->minor);
-+ return blktap_sysfs_create(tap);
++ if (!class)
++ return;
++
++ class_remove_file(class, &class_attr_verbosity);
++ class_remove_file(class, &class_attr_devices);
++
++ class_destroy(class);
+}
+
+int __init
-+blktap_ring_init(int *major)
++blktap_sysfs_init(void)
+{
++ struct class *cls;
+ int err;
+
-+ err = register_chrdev(0, "blktap2", &blktap_ring_file_operations);
-+ if (err < 0) {
-+ BTERR("error registering blktap ring device: %d\n", err);
-+ return err;
-+ }
++ if (class)
++ return -EEXIST;
+
-+ blktap_ring_major = *major = err;
-+ BTINFO("blktap ring major: %d\n", blktap_ring_major);
-+ return 0;
-+}
++ cls = class_create(THIS_MODULE, "blktap2");
++ if (IS_ERR(cls))
++ return PTR_ERR(cls);
+
-+int
-+blktap_ring_free(void)
-+{
-+ if (blktap_ring_major)
-+ unregister_chrdev(blktap_ring_major, "blktap2");
++ err = class_create_file(cls, &class_attr_verbosity);
++ if (err)
++ goto out_unregister;
++ err = class_create_file(cls, &class_attr_devices);
++ if (err)
++ goto out_unregister;
+
++ class = cls;
+ return 0;
++out_unregister:
++ class_destroy(cls);
++ return err;
+}
-diff --git a/drivers/xen/blktap/sysfs.c b/drivers/xen/blktap/sysfs.c
+diff --git a/drivers/xen/blktap/wait_queue.c b/drivers/xen/blktap/wait_queue.c
new file mode 100644
-index 0000000..23a3a51
+index 0000000..f8995aa
--- /dev/null
-+++ b/drivers/xen/blktap/sysfs.c
-@@ -0,0 +1,451 @@
-+#include <linux/types.h>
-+#include <linux/device.h>
-+#include <linux/module.h>
-+#include <linux/sched.h>
++++ b/drivers/xen/blktap/wait_queue.c
+@@ -0,0 +1,40 @@
++#include <linux/list.h>
++#include <linux/spinlock.h>
+
+#include "blktap.h"
+
-+int blktap_debug_level = 1;
-+
-+static struct class *class;
-+static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq);
++static LIST_HEAD(deferred_work_queue);
++static DEFINE_SPINLOCK(deferred_work_lock);
+
-+static inline void
-+blktap_sysfs_get(struct blktap *tap)
++void
++blktap_run_deferred(void)
+{
-+ atomic_inc(&tap->ring.sysfs_refcnt);
-+}
++ LIST_HEAD(queue);
++ struct blktap *tap;
++ unsigned long flags;
++
++ spin_lock_irqsave(&deferred_work_lock, flags);
++ list_splice_init(&deferred_work_queue, &queue);
++ list_for_each_entry(tap, &queue, deferred_queue)
++ clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
++ spin_unlock_irqrestore(&deferred_work_lock, flags);
+
-+static inline void
-+blktap_sysfs_put(struct blktap *tap)
-+{
-+ if (atomic_dec_and_test(&tap->ring.sysfs_refcnt))
-+ wake_up(&sysfs_wq);
++ while (!list_empty(&queue)) {
++ tap = list_entry(queue.next, struct blktap, deferred_queue);
++ list_del_init(&tap->deferred_queue);
++ blktap_device_restart(tap);
++ }
+}
+
-+static inline void
-+blktap_sysfs_enter(struct blktap *tap)
++void
++blktap_defer(struct blktap *tap)
+{
-+ blktap_sysfs_get(tap); /* pin sysfs device */
-+ mutex_lock(&tap->ring.sysfs_mutex); /* serialize sysfs operations */
-+}
++ unsigned long flags;
+
-+static inline void
-+blktap_sysfs_exit(struct blktap *tap)
-+{
-+ mutex_unlock(&tap->ring.sysfs_mutex);
-+ blktap_sysfs_put(tap);
++ spin_lock_irqsave(&deferred_work_lock, flags);
++ if (!test_bit(BLKTAP_DEFERRED, &tap->dev_inuse)) {
++ set_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
++ list_add_tail(&tap->deferred_queue, &deferred_work_queue);
++ }
++ spin_unlock_irqrestore(&deferred_work_lock, flags);
+}
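+
wait_queue.c is a small splice-then-process queue: blktap_defer() marks a
device and enqueues it under the spinlock, while blktap_run_deferred() steals
the whole list first and only then calls blktap_device_restart(), so the lock
is never held across the restart and a restarted device may safely re-defer
itself. The same idiom in its generic form, as a sketch with invented names:

	/* sketch: splice-then-process under a spinlock (illustrative) */
	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct demo_item {
		struct list_head link;
	};
	static LIST_HEAD(demo_pending);
	static DEFINE_SPINLOCK(demo_lock);

	static void demo_run_pending(void)
	{
		LIST_HEAD(local);
		struct demo_item *it, *next;
		unsigned long flags;

		spin_lock_irqsave(&demo_lock, flags);
		list_splice_init(&demo_pending, &local);
		spin_unlock_irqrestore(&demo_lock, flags);

		/* safe iteration: demo_process() may requeue the item */
		list_for_each_entry_safe(it, next, &local, link) {
			list_del_init(&it->link);
			demo_process(it);	/* no lock held here */
		}
	}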
+diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
+index bdfd584..6625ffe 100644
+--- a/drivers/xen/cpu_hotplug.c
++++ b/drivers/xen/cpu_hotplug.c
+@@ -1,5 +1,6 @@
+ #include <linux/notifier.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+
+ #include <asm/xen/hypervisor.h>
+diff --git a/drivers/xen/events.c b/drivers/xen/events.c
+index ce602dd..660774a 100644
+--- a/drivers/xen/events.c
++++ b/drivers/xen/events.c
+@@ -16,7 +16,7 @@
+ * (typically dom0).
+ * 2. VIRQs, typically used for timers. These are per-cpu events.
+ * 3. IPIs.
+- * 4. Hardware interrupts. Not supported at present.
++ * 4. PIRQs - Hardware interrupts.
+ *
+ * Jeremy Fitzhardinge <jeremy at xensource.com>, XenSource Inc, 2007
+ */
+@@ -27,19 +27,27 @@
+ #include <linux/module.h>
+ #include <linux/string.h>
+ #include <linux/bootmem.h>
++#include <linux/irqnr.h>
++#include <linux/pci_regs.h>
++#include <linux/pci.h>
++#include <linux/msi.h>
+
+ #include <asm/ptrace.h>
+ #include <asm/irq.h>
+ #include <asm/idle.h>
++#include <asm/io_apic.h>
+ #include <asm/sync_bitops.h>
+ #include <asm/xen/hypercall.h>
+ #include <asm/xen/hypervisor.h>
++#include <asm/xen/pci.h>
+
+ #include <xen/xen-ops.h>
+ #include <xen/events.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/event_channel.h>
+
++#include "../pci/msi.h"
+
-+#define CLASS_DEVICE_ATTR(a,b,c,d) DEVICE_ATTR(a,b,c,d)
+ /*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+@@ -67,7 +75,7 @@ enum xen_irq_type {
+ * event channel - irq->event channel mapping
+ * cpu - cpu this event channel is bound to
+ * index - type-specific information:
+- * PIRQ - vector, with MSB being "needs EIO"
++ * PIRQ - with MSB being "needs EIO"
+ * VIRQ - virq number
+ * IPI - IPI vector
+ * EVTCHN -
+@@ -83,20 +91,27 @@ struct irq_info
+ enum ipi_vector ipi;
+ struct {
+ unsigned short gsi;
+- unsigned short vector;
++ unsigned char vector;
++ unsigned char flags;
++ uint16_t domid;
+ } pirq;
+ } u;
+ };
++#define PIRQ_NEEDS_EOI (1 << 0)
++#define PIRQ_SHAREABLE (1 << 1)
+
+-static struct irq_info irq_info[NR_IRQS];
++static struct irq_info *irq_info;
+
+-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+- [0 ... NR_EVENT_CHANNELS-1] = -1
+-};
++static int *evtchn_to_irq;
+ struct cpu_evtchn_s {
+ unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
+ };
+-static struct cpu_evtchn_s *cpu_evtchn_mask_p;
+
-+static ssize_t blktap_sysfs_pause_device(struct device *, struct device_attribute *, const char *, size_t);
-+CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
-+static ssize_t blktap_sysfs_resume_device(struct device *, struct device_attribute *, const char *, size_t);
-+CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
++static __initdata struct cpu_evtchn_s init_evtchn_mask = {
++ .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
++};
++static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+
-+static ssize_t
-+blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size)
+ static inline unsigned long *cpu_evtchn_mask(int cpu)
+ {
+ return cpu_evtchn_mask_p[cpu].bits;
+@@ -106,6 +121,7 @@ static inline unsigned long *cpu_evtchn_mask(int cpu)
+ #define VALID_EVTCHN(chn) ((chn) != 0)
+
+ static struct irq_chip xen_dynamic_chip;
++static struct irq_chip xen_pirq_chip;
+
+ /* Constructor for packed IRQ information. */
+ static struct irq_info mk_unbound_info(void)
+@@ -135,7 +151,8 @@ static struct irq_info mk_pirq_info(unsigned short evtchn,
+ unsigned short gsi, unsigned short vector)
+ {
+ return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
+- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
++ .cpu = 0, .u.pirq =
++ { .gsi = gsi, .vector = vector, .domid = DOMID_SELF } };
+ }
+
+ /*
+@@ -218,6 +235,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn)
+ return ret;
+ }
+
++static bool pirq_needs_eoi(unsigned irq)
+{
-+ int err;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++ struct irq_info *info = info_for_irq(irq);
+
-+ blktap_sysfs_enter(tap);
++ BUG_ON(info->type != IRQT_PIRQ);
+
-+ if (!tap->ring.dev ||
-+ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
-+ err = -ENODEV;
-+ goto out;
-+ }
++ return info->u.pirq.flags & PIRQ_NEEDS_EOI;
++}
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
-+ err = -EPERM;
-+ goto out;
-+ }
+ static inline unsigned long active_evtchns(unsigned int cpu,
+ struct shared_info *sh,
+ unsigned int idx)
+@@ -329,17 +355,33 @@ static void unmask_evtchn(int port)
+ put_cpu();
+ }
+
++static int get_nr_hw_irqs(void)
++{
++ int ret = 1;
+
-+ if (size > BLKTAP2_MAX_MESSAGE_LEN) {
-+ err = -ENAMETOOLONG;
-+ goto out;
-+ }
++#ifdef CONFIG_X86_IO_APIC
++ ret = get_nr_irqs_gsi();
++#endif
+
-+ if (strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >= BLKTAP2_MAX_MESSAGE_LEN) {
-+ err = -EINVAL;
-+ goto out;
-+ }
++ return ret;
++}
+
-+ snprintf(tap->params.name, sizeof(tap->params.name) - 1, "%s", buf);
-+ err = size;
+ static int find_unbound_irq(void)
+ {
+ int irq;
+ struct irq_desc *desc;
++ int start = get_nr_hw_irqs();
+
+- for (irq = 0; irq < nr_irqs; irq++)
++ if (start == nr_irqs)
++ goto no_irqs;
+
-+out:
-+ blktap_sysfs_exit(tap);
-+ return err;
++ /* nr_irqs is a magic value. Must not use it.*/
++ for (irq = nr_irqs-1; irq > start; irq--)
+ if (irq_info[irq].type == IRQT_UNBOUND)
+ break;
+
+- if (irq == nr_irqs)
+- panic("No available IRQ to bind to: increase nr_irqs!\n");
++ if (irq == start)
++ goto no_irqs;
+
+ desc = irq_to_desc_alloc_node(irq, 0);
+ if (WARN_ON(desc == NULL))
+@@ -348,8 +390,324 @@ static int find_unbound_irq(void)
+ dynamic_irq_init(irq);
+
+ return irq;
++
++no_irqs:
++ panic("No available IRQ to bind to: increase nr_irqs!\n");
+}
+
-+static ssize_t
-+blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf)
++static bool identity_mapped_irq(unsigned irq)
+{
-+ ssize_t size;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
-+
-+ blktap_sysfs_enter(tap);
-+
-+ if (!tap->ring.dev)
-+ size = -ENODEV;
-+ else if (tap->params.name[0])
-+ size = sprintf(buf, "%s\n", tap->params.name);
-+ else
-+ size = sprintf(buf, "%d\n", tap->minor);
++ /* identity map all the hardware irqs */
++ return irq < get_nr_hw_irqs();
++}
+
-+ blktap_sysfs_exit(tap);
++static void pirq_unmask_notify(int irq)
++{
++ struct irq_info *info = info_for_irq(irq);
++ struct physdev_eoi eoi = { .irq = info->u.pirq.gsi };
+
-+ return size;
++ if (unlikely(pirq_needs_eoi(irq))) {
++ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
++ WARN_ON(rc);
++ }
+}
-+CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
-+ blktap_sysfs_get_name, blktap_sysfs_set_name);
+
-+static ssize_t
-+blktap_sysfs_remove_device(struct device *dev,
-+ struct device_attribute *attr,
-+ const char *buf, size_t size)
++static void pirq_query_unmask(int irq)
+{
-+ int err;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++ struct physdev_irq_status_query irq_status;
++ struct irq_info *info = info_for_irq(irq);
+
-+ if (!tap->ring.dev)
-+ return size;
++ BUG_ON(info->type != IRQT_PIRQ);
+
-+ if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
-+ return -EBUSY;
++ irq_status.irq = info->u.pirq.gsi;
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
++ irq_status.flags = 0;
+
-+ err = blktap_control_destroy_device(tap);
++ info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
++ if (irq_status.flags & XENIRQSTAT_needs_eoi)
++ info->u.pirq.flags |= PIRQ_NEEDS_EOI;
+ }
+
++static bool probing_irq(int irq)
++{
++ struct irq_desc *desc = irq_to_desc(irq);
+
-+ return (err ? : size);
++ return desc && desc->action == NULL;
+}
-+CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+
-+static ssize_t
-+blktap_sysfs_pause_device(struct device *dev,
-+ struct device_attribute *attr,
-+ const char *buf, size_t size)
++static unsigned int startup_pirq(unsigned int irq)
+{
-+ int err;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
-+
-+ blktap_sysfs_enter(tap);
++ struct evtchn_bind_pirq bind_pirq;
++ struct irq_info *info = info_for_irq(irq);
++ int evtchn = evtchn_from_irq(irq);
++ int rc;
+
-+ BTDBG("pausing %u:%u: dev_inuse: %lu\n",
-+ MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse);
++ BUG_ON(info->type != IRQT_PIRQ);
+
-+ if (!tap->ring.dev ||
-+ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
-+ err = -ENODEV;
++ if (VALID_EVTCHN(evtchn))
+ goto out;
-+ }
+
-+ if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
-+ err = -EBUSY;
-+ goto out;
++ bind_pirq.pirq = info->u.pirq.gsi;
++ /* NB. We are happy to share unless we are probing. */
++ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
++ BIND_PIRQ__WILL_SHARE : 0;
++ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
++ if (rc != 0) {
++ if (!probing_irq(irq))
++ printk(KERN_INFO "Failed to obtain physical IRQ %d" \
++ " (GSI:%d)\n", irq, info->u.pirq.gsi);
++ return 0;
+ }
++ evtchn = bind_pirq.port;
+
-+ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
-+ err = 0;
-+ goto out;
-+ }
++ pirq_query_unmask(irq);
+
-+ err = blktap_device_pause(tap);
-+ if (!err) {
-+ device_remove_file(dev, &dev_attr_pause);
-+ err = device_create_file(dev, &dev_attr_resume);
-+ }
++ evtchn_to_irq[evtchn] = irq;
++ bind_evtchn_to_cpu(evtchn, 0);
++ info->evtchn = evtchn;
+
-+out:
-+ blktap_sysfs_exit(tap);
++ out:
++ unmask_evtchn(evtchn);
++ pirq_unmask_notify(irq);
+
-+ return (err ? err : size);
++ return 0;
+}
+
-+static ssize_t
-+blktap_sysfs_resume_device(struct device *dev,
-+ struct device_attribute *attr,
-+ const char *buf, size_t size)
++static void shutdown_pirq(unsigned int irq)
+{
-+ int err;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
-+
-+ blktap_sysfs_enter(tap);
++ struct evtchn_close close;
++ struct irq_info *info = info_for_irq(irq);
++ int evtchn = evtchn_from_irq(irq);
+
-+ if (!tap->ring.dev ||
-+ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
-+ err = -ENODEV;
-+ goto out;
-+ }
++ BUG_ON(info->type != IRQT_PIRQ);
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
-+ err = -EINVAL;
-+ goto out;
-+ }
++ if (!VALID_EVTCHN(evtchn))
++ return;
+
-+ err = blktap_device_resume(tap);
-+ if (!err) {
-+ device_remove_file(dev, &dev_attr_resume);
-+ err = device_create_file(dev, &dev_attr_pause);
-+ }
++ mask_evtchn(evtchn);
+
-+out:
-+ blktap_sysfs_exit(tap);
++ close.port = evtchn;
++ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
++ BUG();
+
-+ BTDBG("returning %zd\n", (err ? err : size));
-+ return (err ? err : size);
++ bind_evtchn_to_cpu(evtchn, 0);
++ evtchn_to_irq[evtchn] = -1;
++ info->evtchn = 0;
+}
+
-+#ifdef ENABLE_PASSTHROUGH
-+static ssize_t
-+blktap_sysfs_enable_passthrough(struct device *dev,
-+ const char *buf, size_t size)
++static void enable_pirq(unsigned int irq)
+{
-+ int err;
-+ unsigned major, minor;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
-+
-+ BTINFO("passthrough request enabled\n");
-+
-+ blktap_sysfs_enter(tap);
++ startup_pirq(irq);
++}
+
-+ if (!tap->ring.dev ||
-+ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
-+ err = -ENODEV;
-+ goto out;
-+ }
++static void disable_pirq(unsigned int irq)
++{
++}
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
-+ err = -EINVAL;
-+ goto out;
-+ }
++static void ack_pirq(unsigned int irq)
++{
++ int evtchn = evtchn_from_irq(irq);
+
-+ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
-+ err = -EINVAL;
-+ goto out;
-+ }
++ move_native_irq(irq);
+
-+ err = sscanf(buf, "%x:%x", &major, &minor);
-+ if (err != 2) {
-+ err = -EINVAL;
-+ goto out;
++ if (VALID_EVTCHN(evtchn)) {
++ mask_evtchn(evtchn);
++ clear_evtchn(evtchn);
+ }
-+
-+ err = blktap_device_enable_passthrough(tap, major, minor);
-+
-+out:
-+ blktap_sysfs_exit(tap);
-+ BTDBG("returning %d\n", (err ? err : size));
-+ return (err ? err : size);
+}
-+#endif
+
-+static ssize_t
-+blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf)
++static void end_pirq(unsigned int irq)
+{
-+ char *tmp;
-+ int i, ret;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++ int evtchn = evtchn_from_irq(irq);
++ struct irq_desc *desc = irq_to_desc(irq);
+
-+ tmp = buf;
-+ blktap_sysfs_get(tap);
++ if (WARN_ON(!desc))
++ return;
+
-+ if (!tap->ring.dev) {
-+ ret = sprintf(tmp, "no device\n");
-+ goto out;
++ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
++ (IRQ_DISABLED|IRQ_PENDING)) {
++ shutdown_pirq(irq);
++ } else if (VALID_EVTCHN(evtchn)) {
++ unmask_evtchn(evtchn);
++ pirq_unmask_notify(irq);
+ }
++}
+
-+ tmp += sprintf(tmp, "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n",
-+ tap->params.name, MAJOR(tap->ring.devno),
-+ MINOR(tap->ring.devno), atomic_read(&tap->refcnt),
-+ tap->dev_inuse);
-+ tmp += sprintf(tmp, "capacity: 0x%llx, sector size: 0x%lx, "
-+ "device users: %d\n", tap->params.capacity,
-+ tap->params.sector_size, tap->device.users);
++static int find_irq_by_gsi(unsigned gsi)
++{
++ int irq;
+
-+ down_read(&tap->tap_sem);
++ for (irq = 0; irq < nr_irqs; irq++) {
++ struct irq_info *info = info_for_irq(irq);
+
-+ tmp += sprintf(tmp, "pending requests: %d\n", tap->pending_cnt);
-+ for (i = 0; i < MAX_PENDING_REQS; i++) {
-+ struct blktap_request *req = tap->pending_requests[i];
-+ if (!req)
++ if (info == NULL || info->type != IRQT_PIRQ)
+ continue;
+
-+ tmp += sprintf(tmp, "req %d: id: %llu, usr_idx: %d, "
-+ "status: 0x%02x, pendcnt: %d, "
-+ "nr_pages: %u, op: %d, time: %lu:%lu\n",
-+ i, (unsigned long long)req->id, req->usr_idx,
-+ req->status, atomic_read(&req->pendcnt),
-+ req->nr_pages, req->operation, req->time.tv_sec,
-+ req->time.tv_usec);
++ if (gsi_from_irq(irq) == gsi)
++ return irq;
+ }
+
-+ up_read(&tap->tap_sem);
-+ ret = (tmp - buf) + 1;
-+
-+out:
-+ blktap_sysfs_put(tap);
-+ BTDBG("%s\n", buf);
-+
-+ return ret;
++ return -1;
+}
-+CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
+
-+int
-+blktap_sysfs_create(struct blktap *tap)
++/*
++ * Allocate a physical irq, along with a vector. We don't assign an
++ * event channel until the irq actually started up. Return an
++ * existing irq if we've already got one for the gsi.
++ */
++int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
+{
-+ struct blktap_ring *ring;
-+ struct device *dev;
-+ int err;
-+
-+ if (!class)
-+ return -ENODEV;
++ int irq;
++ struct physdev_irq irq_op;
+
-+ ring = &tap->ring;
++ spin_lock(&irq_mapping_update_lock);
+
-+ dev = device_create(class, NULL, ring->devno,
-+ tap, "blktap%d", tap->minor);
-+ if (IS_ERR(dev))
-+ return PTR_ERR(dev);
++ irq = find_irq_by_gsi(gsi);
++ if (irq != -1) {
++ printk(KERN_INFO "xen_allocate_pirq: returning irq %d for gsi %u\n",
++ irq, gsi);
++ goto out; /* XXX need refcount? */
++ }
+
-+ ring->dev = dev;
++ /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
++ * we are using the !xen_initial_domain() to drop in the function.*/
++ if (identity_mapped_irq(gsi) || !xen_initial_domain()) {
++ irq = gsi;
++ irq_to_desc_alloc_node(irq, 0);
++ dynamic_irq_init(irq);
++ } else
++ irq = find_unbound_irq();
+
-+ mutex_init(&ring->sysfs_mutex);
-+ atomic_set(&ring->sysfs_refcnt, 0);
++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
++ handle_level_irq, name);
+
++ irq_op.irq = gsi;
++ irq_op.vector = 0;
+
-+ printk(KERN_CRIT "%s: adding attributes for dev %p\n", __func__, dev);
-+ err = device_create_file(dev, &dev_attr_name);
-+ if (err)
++ /* Only the privileged domain can do this. For non-priv, the pcifront
++ * driver provides a PCI bus that does the call to do exactly
++ * this in the priv domain. */
++ if (xen_initial_domain() &&
++ HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
++ dynamic_irq_cleanup(irq);
++ irq = -ENOSPC;
+ goto out;
-+ err = device_create_file(dev, &dev_attr_remove);
-+ if (err)
-+ goto out_unregister_name;
-+ err = device_create_file(dev, &dev_attr_pause);
-+ if (err)
-+ goto out_unregister_remove;
-+ err = device_create_file(dev, &dev_attr_debug);
-+ if (err)
-+ goto out_unregister_pause;
++ }
+
-+ return 0;
++ irq_info[irq] = mk_pirq_info(0, gsi, irq_op.vector);
++ irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
+
-+out_unregister_pause:
-+ device_remove_file(dev, &dev_attr_pause);
-+out_unregister_remove:
-+ device_remove_file(dev, &dev_attr_remove);
-+out_unregister_name:
-+ device_remove_file(dev, &dev_attr_name);
+out:
-+ return err;
++ spin_unlock(&irq_mapping_update_lock);
++
++ return irq;
+}
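+
A dom0 caller would typically route a GSI through xen_allocate_pirq() and
then bind a handler as usual; it is startup_pirq(), run when the line is
first enabled, that performs the EVTCHNOP_bind_pirq. A hedged sketch, with
handler and names hypothetical:

	/* sketch: route a GSI through the new pirq chip (illustrative) */
	#include <linux/interrupt.h>

	static irqreturn_t demo_isr(int irq, void *dev_id)
	{
		return IRQ_HANDLED;
	}

	static int demo_bind_gsi(unsigned gsi)
	{
		int irq = xen_allocate_pirq(gsi, 1 /* shareable */, "demo");

		if (irq < 0)
			return irq;
		/* request_irq() -> startup_pirq() -> EVTCHNOP_bind_pirq */
		return request_irq(irq, demo_isr, IRQF_SHARED, "demo", NULL);
	}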
+
-+int
-+blktap_sysfs_destroy(struct blktap *tap)
++#ifdef CONFIG_PCI_MSI
++int xen_destroy_irq(int irq)
+{
-+ struct blktap_ring *ring;
-+ struct device *dev;
++ struct irq_desc *desc;
++ struct physdev_unmap_pirq unmap_irq;
++ struct irq_info *info = info_for_irq(irq);
++ int rc = -ENOENT;
+
-+ printk(KERN_CRIT "%s\n", __func__);
++ spin_lock(&irq_mapping_update_lock);
+
-+ ring = &tap->ring;
-+ dev = ring->dev;
-+ if (!class || !dev)
-+ return 0;
++ desc = irq_to_desc(irq);
++ if (!desc)
++ goto out;
+
-+ ring->dev = NULL;
-+ if (wait_event_interruptible(sysfs_wq,
-+ !atomic_read(&tap->ring.sysfs_refcnt)))
-+ return -EAGAIN;
++ if (xen_initial_domain()) {
++ unmap_irq.pirq = info->u.pirq.gsi;
++ unmap_irq.domid = info->u.pirq.domid;
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
++ if (rc) {
++ printk(KERN_WARNING "unmap irq failed %d\n", rc);
++ goto out;
++ }
++ }
++ irq_info[irq] = mk_unbound_info();
+
-+ device_schedule_callback(dev, device_unregister);
++ dynamic_irq_cleanup(irq);
+
-+ return 0;
++out:
++ spin_unlock(&irq_mapping_update_lock);
++ return rc;
+}
+
-+static ssize_t
-+blktap_sysfs_show_verbosity(struct class *class, char *buf)
++int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
+{
-+ return sprintf(buf, "%d\n", blktap_debug_level);
-+}
++ int irq = 0;
++ struct physdev_map_pirq map_irq;
++ int rc;
++ domid_t domid;
++ int pos;
++ u32 table_offset, bir;
+
-+static ssize_t
-+blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size)
-+{
-+ int level;
++ domid = rc = xen_find_device_domain_owner(dev);
++ if (rc < 0)
++ domid = DOMID_SELF;
++
++ memset(&map_irq, 0, sizeof(map_irq));
++ map_irq.domid = domid;
++ map_irq.type = MAP_PIRQ_TYPE_MSI;
++ map_irq.index = -1;
++ map_irq.pirq = -1;
++ map_irq.bus = dev->bus->number;
++ map_irq.devfn = dev->devfn;
+
-+ if (sscanf(buf, "%d", &level) == 1) {
-+ blktap_debug_level = level;
-+ return size;
++ if (type == PCI_CAP_ID_MSIX) {
++ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
++
++ pci_read_config_dword(dev, msix_table_offset_reg(pos),
++ &table_offset);
++ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
++
++ map_irq.table_base = pci_resource_start(dev, bir);
++ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+ }
+
-+ return -EINVAL;
-+}
-+CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR,
-+ blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
++ spin_lock(&irq_mapping_update_lock);
+
-+static ssize_t
-+blktap_sysfs_show_devices(struct class *class, char *buf)
-+{
-+ int i, ret;
-+ struct blktap *tap;
++ irq = find_unbound_irq();
+
-+ ret = 0;
-+ for (i = 0; i < MAX_BLKTAP_DEVICE; i++) {
-+ tap = blktaps[i];
-+ if (!tap)
-+ continue;
++ if (irq == -1)
++ goto out;
+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
-+ continue;
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
++ if (rc) {
++ printk(KERN_WARNING "xen map irq failed %d\n", rc);
+
-+ ret += sprintf(buf + ret, "%d ", tap->minor);
-+ ret += snprintf(buf + ret, sizeof(tap->params.name) - 1,
-+ tap->params.name);
-+ ret += sprintf(buf + ret, "\n");
++ dynamic_irq_cleanup(irq);
++
++ irq = -1;
++ goto out;
+ }
++ irq_info[irq] = mk_pirq_info(0, map_irq.pirq, map_irq.index);
++ if (domid)
++ irq_info[irq].u.pirq.domid = domid;
+
-+ return ret;
++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
++ handle_level_irq,
++ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
++
++out:
++ spin_unlock(&irq_mapping_update_lock);
++ return irq;
+}
-+CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL);
++#endif
+
-+void
-+blktap_sysfs_free(void)
++int xen_vector_from_irq(unsigned irq)
+{
-+ if (!class)
-+ return;
-+
-+ class_remove_file(class, &class_attr_verbosity);
-+ class_remove_file(class, &class_attr_devices);
-+
-+ class_destroy(class);
++ return vector_from_irq(irq);
+}
+
-+int __init
-+blktap_sysfs_init(void)
++int xen_gsi_from_irq(unsigned irq)
+{
-+ struct class *cls;
-+ int err;
-+
-+ if (class)
-+ return -EEXIST;
-+
-+ cls = class_create(THIS_MODULE, "blktap2");
-+ if (IS_ERR(cls))
-+ return PTR_ERR(cls);
-+
-+ err = class_create_file(cls, &class_attr_verbosity);
-+ if (err)
-+ goto out_unregister;
-+ err = class_create_file(cls, &class_attr_devices);
-+ if (err)
-+ goto out_unregister;
-+
-+ class = cls;
-+ return 0;
-+out_unregister:
-+ class_destroy(cls);
-+ return err;
++ return gsi_from_irq(irq);
+}
-diff --git a/drivers/xen/blktap/wait_queue.c b/drivers/xen/blktap/wait_queue.c
-new file mode 100644
-index 0000000..f8995aa
---- /dev/null
-+++ b/drivers/xen/blktap/wait_queue.c
-@@ -0,0 +1,40 @@
-+#include <linux/list.h>
-+#include <linux/spinlock.h>
-+
-+#include "blktap.h"
-+
-+static LIST_HEAD(deferred_work_queue);
-+static DEFINE_SPINLOCK(deferred_work_lock);
++EXPORT_SYMBOL_GPL(xen_gsi_from_irq);
+
-+void
-+blktap_run_deferred(void)
+ int bind_evtchn_to_irq(unsigned int evtchn)
+ {
+ int irq;
+@@ -409,8 +767,23 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+ return irq;
+ }
+
++static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
++ unsigned int remote_port)
+{
-+ LIST_HEAD(queue);
-+ struct blktap *tap;
-+ unsigned long flags;
++ struct evtchn_bind_interdomain bind_interdomain;
++ int err;
+
-+ spin_lock_irqsave(&deferred_work_lock, flags);
-+ list_splice_init(&deferred_work_queue, &queue);
-+ list_for_each_entry(tap, &queue, deferred_queue)
-+ clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
-+ spin_unlock_irqrestore(&deferred_work_lock, flags);
++ bind_interdomain.remote_dom = remote_domain;
++ bind_interdomain.remote_port = remote_port;
+
+-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
++ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
++ &bind_interdomain);
+
-+ while (!list_empty(&queue)) {
-+ tap = list_entry(queue.next, struct blktap, deferred_queue);
-+ list_del_init(&tap->deferred_queue);
-+ blktap_device_restart(tap);
-+ }
++ return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
+}
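+
This helper, together with the bind_interdomain_evtchn_to_irqhandler()
wrapper added below, is what backend drivers use to attach to an event
channel a frontend has offered over xenstore. Roughly, and purely as a
sketch with invented names:

	/* sketch: backend attaching to a frontend's event channel;
	 * remote domid/port would come from xenstore (illustrative) */
	static irqreturn_t demo_be_interrupt(int irq, void *dev_id)
	{
		/* kick the backend to process the shared ring */
		return IRQ_HANDLED;
	}

	static int demo_connect(unsigned int remote_dom,
				unsigned int remote_port, void *be)
	{
		int irq = bind_interdomain_evtchn_to_irqhandler(remote_dom,
						remote_port, demo_be_interrupt,
						0, "demo-backend", be);
		return irq < 0 ? irq : 0;
	}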
+
-+void
-+blktap_defer(struct blktap *tap)
++
++int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+ {
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq;
+@@ -504,6 +877,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ }
+ EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
++int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
++ unsigned int remote_port,
++ irq_handler_t handler,
++ unsigned long irqflags,
++ const char *devname,
++ void *dev_id)
+{
-+ unsigned long flags;
++ int irq, retval;
+
-+ spin_lock_irqsave(&deferred_work_lock, flags);
-+ if (!test_bit(BLKTAP_DEFERRED, &tap->dev_inuse)) {
-+ set_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
-+ list_add_tail(&tap->deferred_queue, &deferred_work_queue);
-+ }
-+ spin_unlock_irqrestore(&deferred_work_lock, flags);
++ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
++ if (irq < 0)
++ return irq;
++
++ retval = request_irq(irq, handler, irqflags, devname, dev_id);
++ if (retval != 0) {
++ unbind_from_irq(irq);
++ return retval;
++ }
++
++ return irq;
+}
-diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
-index bdfd584..6625ffe 100644
---- a/drivers/xen/cpu_hotplug.c
-+++ b/drivers/xen/cpu_hotplug.c
-@@ -1,5 +1,6 @@
- #include <linux/notifier.h>
++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
++
+ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname, void *dev_id)
+@@ -535,6 +931,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+ if (irq < 0)
+ return irq;
+
++ irqflags |= IRQF_NO_SUSPEND;
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+@@ -649,9 +1046,13 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + bit_idx;
+ int irq = evtchn_to_irq[port];
++ struct irq_desc *desc;
-+#include <xen/xen.h>
- #include <xen/xenbus.h>
+- if (irq != -1)
+- handle_irq(irq, regs);
++ if (irq != -1) {
++ desc = irq_to_desc(irq);
++ if (desc)
++ generic_handle_irq_desc(irq, desc);
++ }
+ }
+ }
- #include <asm/xen/hypervisor.h>
-diff --git a/drivers/xen/events.c b/drivers/xen/events.c
-index ce602dd..60b31e6 100644
---- a/drivers/xen/events.c
-+++ b/drivers/xen/events.c
-@@ -16,7 +16,7 @@
- * (typically dom0).
- * 2. VIRQs, typically used for timers. These are per-cpu events.
- * 3. IPIs.
-- * 4. Hardware interrupts. Not supported at present.
-+ * 4. PIRQs - Hardware interrupts.
- *
- * Jeremy Fitzhardinge <jeremy at xensource.com>, XenSource Inc, 2007
- */
-@@ -27,19 +27,27 @@
- #include <linux/module.h>
- #include <linux/string.h>
- #include <linux/bootmem.h>
-+#include <linux/irqnr.h>
-+#include <linux/pci_regs.h>
-+#include <linux/pci.h>
-+#include <linux/msi.h>
+@@ -855,7 +1256,7 @@ void xen_clear_irq_pending(int irq)
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+ }
+-
++EXPORT_SYMBOL(xen_clear_irq_pending);
+ void xen_set_irq_pending(int irq)
+ {
+ int evtchn = evtchn_from_irq(irq);
+@@ -875,9 +1276,9 @@ bool xen_test_irq_pending(int irq)
+ return ret;
+ }
- #include <asm/ptrace.h>
- #include <asm/irq.h>
- #include <asm/idle.h>
-+#include <asm/io_apic.h>
- #include <asm/sync_bitops.h>
- #include <asm/xen/hypercall.h>
- #include <asm/xen/hypervisor.h>
-+#include <asm/xen/pci.h>
+-/* Poll waiting for an irq to become pending. In the usual case, the
++/* Poll waiting for an irq to become pending with timeout. In the usual case, the
+ irq will be disabled so it won't deliver an interrupt. */
+-void xen_poll_irq(int irq)
++void xen_poll_irq_timeout(int irq, u64 timeout)
+ {
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
- #include <xen/xen-ops.h>
- #include <xen/events.h>
- #include <xen/interface/xen.h>
- #include <xen/interface/event_channel.h>
+@@ -885,13 +1286,20 @@ void xen_poll_irq(int irq)
+ struct sched_poll poll;
-+#include "../pci/msi.h"
-+
- /*
- * This lock protects updates to the following mapping and reference-count
- * arrays. The lock does not need to be acquired to read the mapping tables.
-@@ -67,7 +75,7 @@ enum xen_irq_type {
- * event channel - irq->event channel mapping
- * cpu - cpu this event channel is bound to
- * index - type-specific information:
-- * PIRQ - vector, with MSB being "needs EIO"
-+ * PIRQ - with MSB being "needs EIO"
- * VIRQ - virq number
- * IPI - IPI vector
- * EVTCHN -
-@@ -83,20 +91,27 @@ struct irq_info
- enum ipi_vector ipi;
- struct {
- unsigned short gsi;
-- unsigned short vector;
-+ unsigned char vector;
-+ unsigned char flags;
-+ uint16_t domid;
- } pirq;
- } u;
- };
-+#define PIRQ_NEEDS_EOI (1 << 0)
-+#define PIRQ_SHAREABLE (1 << 1)
+ poll.nr_ports = 1;
+- poll.timeout = 0;
++ poll.timeout = timeout;
+ set_xen_guest_handle(poll.ports, &evtchn);
--static struct irq_info irq_info[NR_IRQS];
-+static struct irq_info *irq_info;
+ if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
+ BUG();
+ }
+ }
++EXPORT_SYMBOL(xen_poll_irq_timeout);
++/* Poll waiting for an irq to become pending. In the usual case, the
++ irq will be disabled so it won't deliver an interrupt. */
++void xen_poll_irq(int irq)
++{
++ xen_poll_irq_timeout(irq, 0 /* no timeout */);
++}
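+
xen_poll_irq() now simply forwards a zero timeout to the new variant. A
caller wanting a bounded wait, for instance a pv-spinlock style slow path,
could park a vcpu along these lines (sketch; names and the deadline value
are illustrative, not taken from this patch):

	/* sketch: bounded wait on a per-vcpu poll irq (illustrative) */
	static void demo_spin_block(int poll_irq, u64 deadline)
	{
		xen_clear_irq_pending(poll_irq);
		/* SCHEDOP_poll returns when the port becomes pending
		 * or the deadline passes */
		xen_poll_irq_timeout(poll_irq, deadline);
	}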
--static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
-- [0 ... NR_EVENT_CHANNELS-1] = -1
--};
-+static int *evtchn_to_irq;
- struct cpu_evtchn_s {
- unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
+ void xen_irq_resume(void)
+ {
+@@ -928,13 +1336,38 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
+ .retrigger = retrigger_dynirq,
+ };
--static struct cpu_evtchn_s *cpu_evtchn_mask_p;
+
++static struct irq_chip xen_pirq_chip __read_mostly = {
++ .name = "xen-pirq",
+
-+static __initdata struct cpu_evtchn_s init_evtchn_mask = {
-+ .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
++ .startup = startup_pirq,
++ .shutdown = shutdown_pirq,
++
++ .enable = enable_pirq,
++ .unmask = enable_pirq,
++
++ .disable = disable_pirq,
++ .mask = disable_pirq,
++
++ .ack = ack_pirq,
++ .end = end_pirq,
++
++ .set_affinity = set_affinity_irq,
++
++ .retrigger = retrigger_dynirq,
++};
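
For readers unfamiliar with the irq_chip being added here: it is a table of function
pointers that the IRQ core calls at each stage of an interrupt's life cycle, so the new
xen-pirq chip simply routes those stages to the PIRQ helpers. The same dispatch pattern
in a standalone C sketch (all names made up for illustration):

    #include <stdio.h>

    struct chip_ops {
            const char *name;
            unsigned int (*startup)(unsigned int irq);
            void (*shutdown)(unsigned int irq);
            void (*ack)(unsigned int irq);
    };

    static unsigned int demo_startup(unsigned int irq)
    {
            printf("irq %u: startup\n", irq);
            return 0;
    }

    static void demo_shutdown(unsigned int irq) { printf("irq %u: shutdown\n", irq); }
    static void demo_ack(unsigned int irq) { printf("irq %u: ack\n", irq); }

    static const struct chip_ops demo_chip = {
            .name = "demo-pirq",
            .startup = demo_startup,
            .shutdown = demo_shutdown,
            .ack = demo_ack,
    };

    int main(void)
    {
            demo_chip.startup(9);
            demo_chip.ack(9);
            demo_chip.shutdown(9);
            return 0;
    }
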
-+static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+
- static inline unsigned long *cpu_evtchn_mask(int cpu)
+ void __init xen_init_IRQ(void)
+ {
- return cpu_evtchn_mask_p[cpu].bits;
-@@ -106,6 +121,7 @@ static inline unsigned long *cpu_evtchn_mask(int cpu)
- #define VALID_EVTCHN(chn) ((chn) != 0)
+ int i;
- static struct irq_chip xen_dynamic_chip;
-+static struct irq_chip xen_pirq_chip;
+ cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
+ GFP_KERNEL);
+- BUG_ON(cpu_evtchn_mask_p == NULL);
++ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
++
++ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
++ GFP_KERNEL);
++ for (i = 0; i < NR_EVENT_CHANNELS; i++)
++ evtchn_to_irq[i] = -1;
- /* Constructor for packed IRQ information. */
- static struct irq_info mk_unbound_info(void)
-@@ -135,7 +151,8 @@ static struct irq_info mk_pirq_info(unsigned short evtchn,
- unsigned short gsi, unsigned short vector)
- {
- return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
-- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
-+ .cpu = 0, .u.pirq =
-+ { .gsi = gsi, .vector = vector, .domid = DOMID_SELF } };
- }
+ init_evtchn_cpu_bindings();
- /*
-@@ -218,6 +235,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn)
- return ret;
- }
+@@ -943,4 +1376,6 @@ void __init xen_init_IRQ(void)
+ mask_evtchn(i);
-+static bool pirq_needs_eoi(unsigned irq)
-+{
-+ struct irq_info *info = info_for_irq(irq);
+ irq_ctx_init(smp_processor_id());
+
-+ BUG_ON(info->type != IRQT_PIRQ);
++ xen_setup_pirqs();
+ }
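
The xen_init_IRQ() change above converts fixed-size tables (sized by NR_IRQS at compile
time, with a [-1 ...] designated initialiser) into kcalloc'd arrays filled in at init
time, since nr_irqs is now only known at runtime. A minimal userspace model of the same
conversion, using calloc in place of kcalloc:

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_CHANNELS 16   /* stand-in for NR_EVENT_CHANNELS */

    int main(void)
    {
            int *chan_to_irq = calloc(NR_CHANNELS, sizeof(*chan_to_irq));
            int i;

            if (!chan_to_irq)
                    return 1;
            for (i = 0; i < NR_CHANNELS; i++)
                    chan_to_irq[i] = -1;         /* "unbound" marker */

            chan_to_irq[3] = 42;                 /* bind channel 3 to irq 42 */
            printf("channel 3 -> irq %d, channel 4 -> irq %d\n",
                   chan_to_irq[3], chan_to_irq[4]);
            free(chan_to_irq);
            return 0;
    }
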
+diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
+index 79bedba..6a1c4a5 100644
+--- a/drivers/xen/evtchn.c
++++ b/drivers/xen/evtchn.c
+@@ -48,6 +48,8 @@
+ #include <linux/gfp.h>
+ #include <linux/mutex.h>
+ #include <linux/cpu.h>
+
-+ return info->u.pirq.flags & PIRQ_NEEDS_EOI;
++#include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/evtchn.h>
+ #include <asm/xen/hypervisor.h>
+@@ -68,10 +70,36 @@ struct per_user_data {
+ const char *name;
+ };
+
+-/* Who's bound to each port? */
+-static struct per_user_data *port_user[NR_EVENT_CHANNELS];
++/*
++ * Who's bound to each port? This is logically an array of struct
++ * per_user_data *, but we encode the current enabled-state in bit 0.
++ */
++static unsigned long *port_user;
+ static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
+
++static inline struct per_user_data *get_port_user(unsigned port)
++{
++ return (struct per_user_data *)(port_user[port] & ~1);
++}
+
- static inline unsigned long active_evtchns(unsigned int cpu,
- struct shared_info *sh,
- unsigned int idx)
-@@ -329,17 +355,33 @@ static void unmask_evtchn(int port)
- put_cpu();
- }
-
-+static int get_nr_hw_irqs(void)
++static inline void set_port_user(unsigned port, struct per_user_data *u)
+{
-+ int ret = 1;
++ port_user[port] = (unsigned long)u;
++}
+
-+#ifdef CONFIG_X86_IO_APIC
-+ ret = get_nr_irqs_gsi();
-+#endif
++static inline bool get_port_enabled(unsigned port)
++{
++ return port_user[port] & 1;
++}
+
-+ return ret;
++static inline void set_port_enabled(unsigned port, bool enabled)
++{
++ if (enabled)
++ port_user[port] |= 1;
++ else
++ port_user[port] &= ~1;
++}
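
The helpers above implement a classic tagged-pointer trick: because a struct
per_user_data is always at least 2-byte aligned, bit 0 of its address is guaranteed to
be zero and can carry the per-port enabled flag. A self-contained sketch of the same
encoding (names are illustrative; note that storing a new pointer clears the flag, just
as set_port_user() does):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct user { int id; };

    static uintptr_t slot;   /* plays the role of one port_user[] entry */

    static struct user *get_user(void)
    {
            return (struct user *)(slot & ~(uintptr_t)1);
    }
    static void set_user(struct user *u)  { slot = (uintptr_t)u; }
    static bool get_enabled(void)         { return slot & 1; }
    static void set_enabled(bool on)
    {
            if (on)
                    slot |= 1;
            else
                    slot &= ~(uintptr_t)1;
    }

    int main(void)
    {
            static struct user u = { .id = 42 };

            set_user(&u);
            set_enabled(true);
            assert(get_user() == &u && get_enabled());
            set_enabled(false);
            printf("user %d, enabled=%d\n", get_user()->id, (int)get_enabled());
            return 0;
    }
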
+
- static int find_unbound_irq(void)
+ irqreturn_t evtchn_interrupt(int irq, void *data)
+ {
- int irq;
- struct irq_desc *desc;
-+ int start = get_nr_hw_irqs();
+ unsigned int port = (unsigned long)data;
+@@ -79,9 +107,14 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
-- for (irq = 0; irq < nr_irqs; irq++)
-+ if (start == nr_irqs)
-+ goto no_irqs;
+ spin_lock(&port_user_lock);
+
+- u = port_user[port];
++ u = get_port_user(port);
+
-+ /* nr_irqs is a magic value. Must not use it.*/
-+ for (irq = nr_irqs-1; irq > start; irq--)
- if (irq_info[irq].type == IRQT_UNBOUND)
- break;
++ WARN(!get_port_enabled(port),
++ "Interrupt for port %d, but apparently not enabled; per-user %p\n",
++ port, u);
-- if (irq == nr_irqs)
-- panic("No available IRQ to bind to: increase nr_irqs!\n");
-+ if (irq == start)
-+ goto no_irqs;
+ disable_irq_nosync(irq);
++ set_port_enabled(port, false);
- desc = irq_to_desc_alloc_node(irq, 0);
- if (WARN_ON(desc == NULL))
-@@ -348,8 +390,324 @@ static int find_unbound_irq(void)
- dynamic_irq_init(irq);
+ if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
+ u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
+@@ -91,9 +124,8 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
+ kill_fasync(&u->evtchn_async_queue,
+ SIGIO, POLL_IN);
+ }
+- } else {
++ } else
+ u->ring_overflow = 1;
+- }
- return irq;
-+
-+no_irqs:
-+ panic("No available IRQ to bind to: increase nr_irqs!\n");
-+}
-+
-+static bool identity_mapped_irq(unsigned irq)
-+{
-+ /* identity map all the hardware irqs */
-+ return irq < get_nr_hw_irqs();
-+}
+ spin_unlock(&port_user_lock);
+
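
The u->ring logic visible in this handler is a free-running producer/consumer ring: the
prod and cons counters only ever increase, masking selects the slot, and prod - cons is
the fill level, which requires the ring size to be a power of two. A standalone sketch
of that scheme (hypothetical names, including the overflow case that sets
ring_overflow in the driver):

    #include <stdio.h>

    #define RING_SIZE 8                      /* must be a power of two */
    #define RING_MASK(i) ((i) & (RING_SIZE - 1))

    static unsigned ring[RING_SIZE];
    static unsigned prod, cons;              /* free-running counters */

    static int ring_put(unsigned port)
    {
            if (prod - cons >= RING_SIZE)
                    return -1;               /* overflow */
            ring[RING_MASK(prod++)] = port;
            return 0;
    }

    static int ring_get(unsigned *port)
    {
            if (prod == cons)
                    return -1;               /* empty */
            *port = ring[RING_MASK(cons++)];
            return 0;
    }

    int main(void)
    {
            unsigned p;

            ring_put(3);
            ring_put(7);
            while (ring_get(&p) == 0)
                    printf("port %u\n", p);
            return 0;
    }
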
+@@ -197,9 +229,18 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
+ goto out;
+
+ spin_lock_irq(&port_user_lock);
+- for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
+- if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
+- enable_irq(irq_from_evtchn(kbuf[i]));
+
-+static void pirq_unmask_notify(int irq)
-+{
-+ struct irq_info *info = info_for_irq(irq);
-+ struct physdev_eoi eoi = { .irq = info->u.pirq.gsi };
++ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
++ unsigned port = kbuf[i];
+
-+ if (unlikely(pirq_needs_eoi(irq))) {
-+ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
-+ WARN_ON(rc);
++ if (port < NR_EVENT_CHANNELS &&
++ get_port_user(port) == u &&
++ !get_port_enabled(port)) {
++ set_port_enabled(port, true);
++ enable_irq(irq_from_evtchn(port));
++ }
+ }
-+}
-+
-+static void pirq_query_unmask(int irq)
-+{
-+ struct physdev_irq_status_query irq_status;
-+ struct irq_info *info = info_for_irq(irq);
+
-+ BUG_ON(info->type != IRQT_PIRQ);
+ spin_unlock_irq(&port_user_lock);
+
+ rc = count;
+@@ -221,8 +262,9 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
+ * interrupt handler yet, and our caller has already
+ * serialized bind operations.)
+ */
+- BUG_ON(port_user[port] != NULL);
+- port_user[port] = u;
++ BUG_ON(get_port_user(port) != NULL);
++ set_port_user(port, u);
++ set_port_enabled(port, true); /* start enabled */
+
+ rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+ u->name, (void *)(unsigned long)port);
+@@ -238,10 +280,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
+
+ unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+
+- /* make sure we unbind the irq handler before clearing the port */
+- barrier();
+-
+- port_user[port] = NULL;
++ set_port_user(port, NULL);
+ }
+
+ static long evtchn_ioctl(struct file *file,
+@@ -332,7 +371,7 @@ static long evtchn_ioctl(struct file *file,
+ spin_lock_irq(&port_user_lock);
+
+ rc = -ENOTCONN;
+- if (port_user[unbind.port] != u) {
++ if (get_port_user(unbind.port) != u) {
+ spin_unlock_irq(&port_user_lock);
+ break;
+ }
+@@ -354,7 +393,7 @@ static long evtchn_ioctl(struct file *file,
+
+ if (notify.port >= NR_EVENT_CHANNELS) {
+ rc = -EINVAL;
+- } else if (port_user[notify.port] != u) {
++ } else if (get_port_user(notify.port) != u) {
+ rc = -ENOTCONN;
+ } else {
+ notify_remote_via_evtchn(notify.port);
+@@ -443,10 +482,10 @@ static int evtchn_release(struct inode *inode, struct file *filp)
+ free_page((unsigned long)u->ring);
+
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+- if (port_user[i] != u)
++ if (get_port_user(i) != u)
+ continue;
+
+- evtchn_unbind_from_user(port_user[i], i);
++ evtchn_unbind_from_user(get_port_user(i), i);
+ }
+
+ spin_unlock_irq(&port_user_lock);
+@@ -480,8 +519,11 @@ static int __init evtchn_init(void)
+ if (!xen_domain())
+ return -ENODEV;
+
++ port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
++ if (port_user == NULL)
++ return -ENOMEM;
+
-+ irq_status.irq = info->u.pirq.gsi;
-+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
-+ irq_status.flags = 0;
+ spin_lock_init(&port_user_lock);
+- memset(port_user, 0, sizeof(port_user));
+
+ /* Create '/dev/misc/evtchn'. */
+ err = misc_register(&evtchn_miscdev);
+@@ -497,6 +539,9 @@ static int __init evtchn_init(void)
+
+ static void __exit evtchn_cleanup(void)
+ {
++ kfree(port_user);
++ port_user = NULL;
+
-+ info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
-+ if (irq_status.flags & XENIRQSTAT_needs_eoi)
-+ info->u.pirq.flags |= PIRQ_NEEDS_EOI;
+ misc_deregister(&evtchn_miscdev);
+ }
-+static bool probing_irq(int irq)
-+{
-+ struct irq_desc *desc = irq_to_desc(irq);
+diff --git a/drivers/xen/features.c b/drivers/xen/features.c
+index 99eda16..9e2b64f 100644
+--- a/drivers/xen/features.c
++++ b/drivers/xen/features.c
+@@ -18,7 +18,7 @@
+ u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+ EXPORT_SYMBOL_GPL(xen_features);
+
+-void xen_setup_features(void)
++void __init xen_setup_features(void)
+ {
+ struct xen_feature_info fi;
+ int i, j;
+diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
+new file mode 100644
+index 0000000..ddc59cc
+--- /dev/null
++++ b/drivers/xen/gntdev.c
+@@ -0,0 +1,626 @@
++/******************************************************************************
++ * gntdev.c
++ *
++ * Device for accessing (in user-space) pages that have been granted by other
++ * domains.
++ *
++ * Copyright (c) 2006-2007, D G Murray.
++ * (c) 2009 Gerd Hoffmann <kraxel at redhat.com>
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
+
-+ return desc && desc->action == NULL;
-+}
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/miscdevice.h>
++#include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/mmu_notifier.h>
++#include <linux/types.h>
++#include <linux/uaccess.h>
++#include <linux/sched.h>
++#include <linux/rwsem.h>
+
-+static unsigned int startup_pirq(unsigned int irq)
-+{
-+ struct evtchn_bind_pirq bind_pirq;
-+ struct irq_info *info = info_for_irq(irq);
-+ int evtchn = evtchn_from_irq(irq);
-+ int rc;
++#include <xen/xen.h>
++#include <xen/grant_table.h>
++#include <xen/gntdev.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include <asm/xen/page.h>
+
-+ BUG_ON(info->type != IRQT_PIRQ);
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("Derek G. Murray <Derek.Murray at cl.cam.ac.uk>, "
++ "Gerd Hoffmann <kraxel at redhat.com>");
++MODULE_DESCRIPTION("User-space granted page access driver");
+
-+ if (VALID_EVTCHN(evtchn))
-+ goto out;
++static int debug = 0;
++module_param(debug, int, 0644);
++static int limit = 1024;
++module_param(limit, int, 0644);
+
-+ bind_pirq.pirq = info->u.pirq.gsi;
-+ /* NB. We are happy to share unless we are probing. */
-+ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
-+ BIND_PIRQ__WILL_SHARE : 0;
-+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
-+ if (rc != 0) {
-+ if (!probing_irq(irq))
-+ printk(KERN_INFO "Failed to obtain physical IRQ %d" \
-+ " (GSI:%d)\n", irq, info->u.pirq.gsi);
-+ return 0;
-+ }
-+ evtchn = bind_pirq.port;
++struct gntdev_priv {
++ struct list_head maps;
++ uint32_t used;
++ uint32_t limit;
++ struct rw_semaphore sem;
++ struct mm_struct *mm;
++ struct mmu_notifier mn;
++};
+
-+ pirq_query_unmask(irq);
++struct grant_map {
++ struct list_head next;
++ struct gntdev_priv *priv;
++ struct vm_area_struct *vma;
++ int index;
++ int count;
++ int flags;
++ int is_mapped;
++ struct ioctl_gntdev_grant_ref *grants;
++ struct gnttab_map_grant_ref *map_ops;
++ struct gnttab_unmap_grant_ref *unmap_ops;
++};
+
-+ evtchn_to_irq[evtchn] = irq;
-+ bind_evtchn_to_cpu(evtchn, 0);
-+ info->evtchn = evtchn;
++/* ------------------------------------------------------------------ */
+
-+ out:
-+ unmask_evtchn(evtchn);
-+ pirq_unmask_notify(irq);
++static void gntdev_print_maps(struct gntdev_priv *priv,
++ char *text, int text_index)
++{
++ struct grant_map *map;
+
-+ return 0;
++ printk("%s: maps list (priv %p, usage %d/%d)\n",
++ __FUNCTION__, priv, priv->used, priv->limit);
++ list_for_each_entry(map, &priv->maps, next)
++ printk(" index %2d, count %2d %s\n",
++ map->index, map->count,
++ map->index == text_index && text ? text : "");
+}
+
-+static void shutdown_pirq(unsigned int irq)
++static struct grant_map *gntdev_add_map(struct gntdev_priv *priv, int count)
+{
-+ struct evtchn_close close;
-+ struct irq_info *info = info_for_irq(irq);
-+ int evtchn = evtchn_from_irq(irq);
++ struct grant_map *map, *add;
+
-+ BUG_ON(info->type != IRQT_PIRQ);
++ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
++ if (NULL == add)
++ return NULL;
+
-+ if (!VALID_EVTCHN(evtchn))
-+ return;
++ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
++ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
++ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
++ if (NULL == add->grants ||
++ NULL == add->map_ops ||
++ NULL == add->unmap_ops)
++ goto err;
+
-+ mask_evtchn(evtchn);
++ add->index = 0;
++ add->count = count;
++ add->priv = priv;
+
-+ close.port = evtchn;
-+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
-+ BUG();
++ if (add->count + priv->used > priv->limit)
++ goto err;
+
-+ bind_evtchn_to_cpu(evtchn, 0);
-+ evtchn_to_irq[evtchn] = -1;
-+ info->evtchn = 0;
-+}
++ list_for_each_entry(map, &priv->maps, next) {
++ if (add->index + add->count < map->index) {
++ list_add_tail(&add->next, &map->next);
++ goto done;
++ }
++ add->index = map->index + map->count;
++ }
++ list_add_tail(&add->next, &priv->maps);
+
-+static void enable_pirq(unsigned int irq)
-+{
-+ startup_pirq(irq);
-+}
++done:
++ priv->used += add->count;
++ if (debug)
++ gntdev_print_maps(priv, "[new]", add->index);
++ return add;
+
-+static void disable_pirq(unsigned int irq)
-+{
++err:
++ kfree(add->grants);
++ kfree(add->map_ops);
++ kfree(add->unmap_ops);
++ kfree(add);
++ return NULL;
+}
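
gntdev_add_map() above is essentially a first-fit interval allocator: the maps list is
kept sorted by index, and a new [index, index+count) range is slotted into the first
gap large enough, or appended at the end (the driver additionally enforces the
priv->limit quota). The scan in a runnable userspace form, over an array instead of a
linked list, with illustrative names:

    #include <stdio.h>

    struct range { int index, count; };

    /* existing allocations, sorted by index */
    static struct range used[] = { {0, 2}, {2, 3}, {8, 1} };

    static int alloc_index(int count)
    {
            int i, index = 0, n = (int)(sizeof(used) / sizeof(used[0]));

            for (i = 0; i < n; i++) {
                    if (index + count <= used[i].index)
                            return index;        /* fits in this gap */
                    index = used[i].index + used[i].count;
            }
            return index;                        /* append at the end */
    }

    int main(void)
    {
            printf("2 pages -> index %d\n", alloc_index(2));  /* 5 */
            printf("4 pages -> index %d\n", alloc_index(4));  /* 9 */
            return 0;
    }
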
+
-+static void ack_pirq(unsigned int irq)
++static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, int index,
++ int count)
+{
-+ int evtchn = evtchn_from_irq(irq);
-+
-+ move_native_irq(irq);
++ struct grant_map *map;
+
-+ if (VALID_EVTCHN(evtchn)) {
-+ mask_evtchn(evtchn);
-+ clear_evtchn(evtchn);
++ list_for_each_entry(map, &priv->maps, next) {
++ if (map->index != index)
++ continue;
++ if (map->count != count)
++ continue;
++ return map;
+ }
++ return NULL;
+}
+
-+static void end_pirq(unsigned int irq)
++static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
++ unsigned long vaddr)
+{
-+ int evtchn = evtchn_from_irq(irq);
-+ struct irq_desc *desc = irq_to_desc(irq);
-+
-+ if (WARN_ON(!desc))
-+ return;
++ struct grant_map *map;
+
-+ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
-+ (IRQ_DISABLED|IRQ_PENDING)) {
-+ shutdown_pirq(irq);
-+ } else if (VALID_EVTCHN(evtchn)) {
-+ unmask_evtchn(evtchn);
-+ pirq_unmask_notify(irq);
++ list_for_each_entry(map, &priv->maps, next) {
++ if (!map->vma)
++ continue;
++ if (vaddr < map->vma->vm_start)
++ continue;
++ if (vaddr >= map->vma->vm_end)
++ continue;
++ return map;
+ }
++ return NULL;
+}
+
-+static int find_irq_by_gsi(unsigned gsi)
++static int gntdev_del_map(struct grant_map *map)
+{
-+ int irq;
-+
-+ for (irq = 0; irq < nr_irqs; irq++) {
-+ struct irq_info *info = info_for_irq(irq);
-+
-+ if (info == NULL || info->type != IRQT_PIRQ)
-+ continue;
++ int i;
+
-+ if (gsi_from_irq(irq) == gsi)
-+ return irq;
-+ }
++ if (map->vma)
++ return -EBUSY;
++ for (i = 0; i < map->count; i++)
++ if (map->unmap_ops[i].handle)
++ return -EBUSY;
+
-+ return -1;
++ map->priv->used -= map->count;
++ list_del(&map->next);
++ kfree(map->grants);
++ kfree(map->map_ops);
++ kfree(map->unmap_ops);
++ kfree(map);
++ return 0;
+}
+
-+/*
-+ * Allocate a physical irq, along with a vector. We don't assign an
-+ * event channel until the irq actually started up. Return an
-+ * existing irq if we've already got one for the gsi.
-+ */
-+int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
-+{
-+ int irq;
-+ struct physdev_irq irq_op;
-+
-+ spin_lock(&irq_mapping_update_lock);
++/* ------------------------------------------------------------------ */
+
-+ irq = find_irq_by_gsi(gsi);
-+ if (irq != -1) {
-+ printk(KERN_INFO "xen_allocate_pirq: returning irq %d for gsi %u\n",
-+ irq, gsi);
-+ goto out; /* XXX need refcount? */
-+ }
++static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void *data)
++{
++ struct grant_map *map = data;
++ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
++ u64 pte_maddr;
+
-+ /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
-+ * we are using the !xen_initial_domain() to drop in the function.*/
-+ if (identity_mapped_irq(gsi) || !xen_initial_domain()) {
-+ irq = gsi;
-+ irq_to_desc_alloc_node(irq, 0);
-+ dynamic_irq_init(irq);
-+ } else
-+ irq = find_unbound_irq();
++ BUG_ON(pgnr >= map->count);
++ pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT;
++ pte_maddr += (unsigned long)pte & ~PAGE_MASK;
++ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags,
++ map->grants[pgnr].ref,
++ map->grants[pgnr].domid);
++ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags,
++ 0 /* handle */);
++ return 0;
++}
+
-+ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
-+ handle_level_irq, name);
++static int map_grant_pages(struct grant_map *map)
++{
++ int i, err = 0;
+
-+ irq_op.irq = gsi;
-+ irq_op.vector = 0;
++ if (debug)
++ printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count);
++ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ map->map_ops, map->count);
++ if (WARN_ON(err))
++ return err;
+
-+ /* Only the privileged domain can do this. For non-priv, the pcifront
-+ * driver provides a PCI bus that does the call to do exactly
-+ * this in the priv domain. */
-+ if (xen_initial_domain() &&
-+ HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
-+ dynamic_irq_cleanup(irq);
-+ irq = -ENOSPC;
-+ goto out;
++ for (i = 0; i < map->count; i++) {
++ if (map->map_ops[i].status)
++ err = -EINVAL;
++ map->unmap_ops[i].handle = map->map_ops[i].handle;
+ }
++ return err;
++}
+
-+ irq_info[irq] = mk_pirq_info(0, gsi, irq_op.vector);
-+ irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
++static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
++{
++ int i, err = 0;
+
-+out:
-+ spin_unlock(&irq_mapping_update_lock);
++ if (debug)
++ printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__,
++ map->index, map->count, offset, pages);
++ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++ map->unmap_ops + offset, pages);
++ if (WARN_ON(err))
++ return err;
+
-+ return irq;
++ for (i = 0; i < pages; i++) {
++ if (map->unmap_ops[offset+i].status)
++ err = -EINVAL;
++ map->unmap_ops[offset+i].handle = 0;
++ }
++ return err;
+}
+
-+#ifdef CONFIG_PCI_MSI
-+int xen_destroy_irq(int irq)
-+{
-+ struct irq_desc *desc;
-+ struct physdev_unmap_pirq unmap_irq;
-+ struct irq_info *info = info_for_irq(irq);
-+ int rc = -ENOENT;
++/* ------------------------------------------------------------------ */
+
-+ spin_lock(&irq_mapping_update_lock);
++static void gntdev_vma_close(struct vm_area_struct *vma)
++{
++ struct grant_map *map = vma->vm_private_data;
+
-+ desc = irq_to_desc(irq);
-+ if (!desc)
-+ goto out;
++ if (debug)
++ printk("%s\n", __FUNCTION__);
++ map->is_mapped = 0;
++ map->vma = NULL;
++ vma->vm_private_data = NULL;
++}
+
-+ if (xen_initial_domain()) {
-+ unmap_irq.pirq = info->u.pirq.gsi;
-+ unmap_irq.domid = info->u.pirq.domid;
-+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
-+ if (rc) {
-+ printk(KERN_WARNING "unmap irq failed %d\n", rc);
-+ goto out;
-+ }
-+ }
-+ irq_info[irq] = mk_unbound_info();
++static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++ if (debug)
++ printk("%s: vaddr %p, pgoff %ld (shouldn't happen)\n",
++ __FUNCTION__, vmf->virtual_address, vmf->pgoff);
++ vmf->flags = VM_FAULT_ERROR;
++ return 0;
++}
+
-+ dynamic_irq_cleanup(irq);
++static struct vm_operations_struct gntdev_vmops = {
++ .close = gntdev_vma_close,
++ .fault = gntdev_vma_fault,
++};
+
-+out:
-+ spin_unlock(&irq_mapping_update_lock);
-+ return rc;
-+}
++/* ------------------------------------------------------------------ */
+
-+int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
++static void mn_invl_range_start(struct mmu_notifier *mn,
++ struct mm_struct *mm,
++ unsigned long start, unsigned long end)
+{
-+ int irq = 0;
-+ struct physdev_map_pirq map_irq;
-+ int rc;
-+ domid_t domid;
-+ int pos;
-+ u32 table_offset, bir;
++ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
++ struct grant_map *map;
++ unsigned long mstart, mend;
++ int err;
+
-+ domid = rc = xen_find_device_domain_owner(dev);
-+ if (rc < 0)
-+ domid = DOMID_SELF;
-+
-+ memset(&map_irq, 0, sizeof(map_irq));
-+ map_irq.domid = domid;
-+ map_irq.type = MAP_PIRQ_TYPE_MSI;
-+ map_irq.index = -1;
-+ map_irq.pirq = -1;
-+ map_irq.bus = dev->bus->number;
-+ map_irq.devfn = dev->devfn;
++ down_read(&priv->sem);
++ list_for_each_entry(map, &priv->maps, next) {
++ if (!map->vma)
++ continue;
++ if (!map->is_mapped)
++ continue;
++ if (map->vma->vm_start >= end)
++ continue;
++ if (map->vma->vm_end <= start)
++ continue;
++ mstart = max(start, map->vma->vm_start);
++ mend = min(end, map->vma->vm_end);
++ if (debug)
++ printk("%s: map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
++ __FUNCTION__, map->index, map->count,
++ map->vma->vm_start, map->vma->vm_end,
++ start, end, mstart, mend);
++ err = unmap_grant_pages(map,
++ (mstart - map->vma->vm_start) >> PAGE_SHIFT,
++ (mend - mstart) >> PAGE_SHIFT);
++ WARN_ON(err);
++ }
++ up_read(&priv->sem);
++}
+
-+ if (type == PCI_CAP_ID_MSIX) {
-+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
++static void mn_invl_page(struct mmu_notifier *mn,
++ struct mm_struct *mm,
++ unsigned long address)
++{
++ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
++}
+
-+ pci_read_config_dword(dev, msix_table_offset_reg(pos),
-+ &table_offset);
-+ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
++static void mn_release(struct mmu_notifier *mn,
++ struct mm_struct *mm)
++{
++ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
++ struct grant_map *map;
++ int err;
+
-+ map_irq.table_base = pci_resource_start(dev, bir);
-+ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
++ down_read(&priv->sem);
++ list_for_each_entry(map, &priv->maps, next) {
++ if (!map->vma)
++ continue;
++ if (debug)
++ printk("%s: map %d+%d (%lx %lx)\n",
++ __FUNCTION__, map->index, map->count,
++ map->vma->vm_start, map->vma->vm_end);
++ err = unmap_grant_pages(map, 0, map->count);
++ WARN_ON(err);
+ }
++ up_read(&priv->sem);
++}
+
-+ spin_lock(&irq_mapping_update_lock);
++struct mmu_notifier_ops gntdev_mmu_ops = {
++ .release = mn_release,
++ .invalidate_page = mn_invl_page,
++ .invalidate_range_start = mn_invl_range_start,
++};
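
The invalidate callbacks registered above have one core computation:
mn_invl_range_start() intersects the invalidated [start, end) range with each mapped
VMA and unmaps only the overlapping pages. The clamping step as a tiny standalone
sketch (addresses are arbitrary example values):

    #include <stdio.h>

    static unsigned long maxul(unsigned long a, unsigned long b) { return a > b ? a : b; }
    static unsigned long minul(unsigned long a, unsigned long b) { return a < b ? a : b; }

    int main(void)
    {
            unsigned long start = 0x3000, end = 0x6000;        /* invalidated */
            unsigned long vm_start = 0x4000, vm_end = 0x9000;  /* one mapping */

            if (vm_start < end && vm_end > start) {
                    unsigned long mstart = maxul(start, vm_start);
                    unsigned long mend = minul(end, vm_end);

                    printf("unmap pages in [%#lx, %#lx)\n", mstart, mend);
            }
            return 0;
    }
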
+
-+ irq = find_unbound_irq();
++/* ------------------------------------------------------------------ */
+
-+ if (irq == -1)
-+ goto out;
++static int gntdev_open(struct inode *inode, struct file *flip)
++{
++ struct gntdev_priv *priv;
+
-+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
-+ if (rc) {
-+ printk(KERN_WARNING "xen map irq failed %d\n", rc);
++ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
++ if (!priv)
++ return -ENOMEM;
+
-+ dynamic_irq_cleanup(irq);
++ INIT_LIST_HEAD(&priv->maps);
++ init_rwsem(&priv->sem);
++ priv->limit = limit;
+
-+ irq = -1;
-+ goto out;
++ priv->mm = get_task_mm(current);
++ if (!priv->mm) {
++ kfree(priv);
++ return -ENOMEM;
+ }
-+ irq_info[irq] = mk_pirq_info(0, map_irq.pirq, map_irq.index);
-+ if (domid)
-+ irq_info[irq].u.pirq.domid = domid;
++ priv->mn.ops = &gntdev_mmu_ops;
++ mmu_notifier_register(&priv->mn, priv->mm);
++ mmput(priv->mm);
+
-+ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
-+ handle_level_irq,
-+ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
++ flip->private_data = priv;
++ if (debug)
++ printk("%s: priv %p\n", __FUNCTION__, priv);
+
-+out:
-+ spin_unlock(&irq_mapping_update_lock);
-+ return irq;
++ return 0;
+}
-+#endif
+
-+int xen_vector_from_irq(unsigned irq)
++static int gntdev_release(struct inode *inode, struct file *flip)
+{
-+ return vector_from_irq(irq);
-+}
++ struct gntdev_priv *priv = flip->private_data;
++ struct grant_map *map;
++ int err;
+
-+int xen_gsi_from_irq(unsigned irq)
-+{
-+ return gsi_from_irq(irq);
++ if (debug)
++ printk("%s: priv %p\n", __FUNCTION__, priv);
++
++ down_write(&priv->sem);
++ while (!list_empty(&priv->maps)) {
++ map = list_entry(priv->maps.next, struct grant_map, next);
++ err = gntdev_del_map(map);
++ WARN_ON(err);
++ }
++ up_write(&priv->sem);
++ mmu_notifier_unregister(&priv->mn, priv->mm);
++ kfree(priv);
++ return 0;
+}
-+EXPORT_SYMBOL_GPL(xen_gsi_from_irq);
+
- int bind_evtchn_to_irq(unsigned int evtchn)
- {
- int irq;
-@@ -409,8 +767,23 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
- return irq;
- }
-
-+static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
-+ unsigned int remote_port)
++static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
++ struct ioctl_gntdev_map_grant_ref __user *u)
+{
-+ struct evtchn_bind_interdomain bind_interdomain;
-+ int err;
++ struct ioctl_gntdev_map_grant_ref op;
++ struct grant_map *map;
++ int err;
+
-+ bind_interdomain.remote_dom = remote_domain;
-+ bind_interdomain.remote_port = remote_port;
-
--static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
-+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
-+ &bind_interdomain);
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, add %d\n", __FUNCTION__, priv,
++ op.count);
++ if (unlikely(op.count <= 0))
++ return -EINVAL;
++ if (unlikely(op.count > priv->limit))
++ return -EINVAL;
++
++ down_write(&priv->sem);
++ err = -ENOMEM;
++ map = gntdev_add_map(priv, op.count);
++ if (!map)
++ goto err_unlock;
+
-+ return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
-+}
++ err = -ENOMEM;
++ if (copy_from_user(map->grants, &u->refs,
++ sizeof(map->grants[0]) * op.count) != 0)
++ goto err_free;
++ op.index = map->index << PAGE_SHIFT;
++ if (copy_to_user(u, &op, sizeof(op)) != 0)
++ goto err_free;
++ up_write(&priv->sem);
++ return 0;
+
++err_free:
++ gntdev_del_map(map);
++err_unlock:
++ up_write(&priv->sem);
++ return err;
++}
+
-+int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
- {
- struct evtchn_bind_virq bind_virq;
- int evtchn, irq;
-@@ -504,6 +877,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
- }
- EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
-
-+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
-+ unsigned int remote_port,
-+ irq_handler_t handler,
-+ unsigned long irqflags,
-+ const char *devname,
-+ void *dev_id)
++static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
++ struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
-+ int irq, retval;
-+
-+ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
-+ if (irq < 0)
-+ return irq;
++ struct ioctl_gntdev_unmap_grant_ref op;
++ struct grant_map *map;
++ int err = -EINVAL;
+
-+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
-+ if (retval != 0) {
-+ unbind_from_irq(irq);
-+ return retval;
-+ }
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, del %d+%d\n", __FUNCTION__, priv,
++ (int)op.index, (int)op.count);
+
-+ return irq;
++ down_write(&priv->sem);
++ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
++ if (map)
++ err = gntdev_del_map(map);
++ up_write(&priv->sem);
++ return err;
+}
-+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+
- int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
- irq_handler_t handler,
- unsigned long irqflags, const char *devname, void *dev_id)
-@@ -649,9 +1045,13 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
- int bit_idx = __ffs(pending_bits);
- int port = (word_idx * BITS_PER_LONG) + bit_idx;
- int irq = evtchn_to_irq[port];
-+ struct irq_desc *desc;
-
-- if (irq != -1)
-- handle_irq(irq, regs);
-+ if (irq != -1) {
-+ desc = irq_to_desc(irq);
-+ if (desc)
-+ generic_handle_irq_desc(irq, desc);
-+ }
- }
- }
-
-@@ -855,7 +1255,7 @@ void xen_clear_irq_pending(int irq)
- if (VALID_EVTCHN(evtchn))
- clear_evtchn(evtchn);
- }
--
-+EXPORT_SYMBOL(xen_clear_irq_pending);
- void xen_set_irq_pending(int irq)
- {
- int evtchn = evtchn_from_irq(irq);
-@@ -875,9 +1275,9 @@ bool xen_test_irq_pending(int irq)
- return ret;
- }
-
--/* Poll waiting for an irq to become pending. In the usual case, the
-+/* Poll waiting for an irq to become pending with timeout. In the usual case, the
- irq will be disabled so it won't deliver an interrupt. */
--void xen_poll_irq(int irq)
-+void xen_poll_irq_timeout(int irq, u64 timeout)
- {
- evtchn_port_t evtchn = evtchn_from_irq(irq);
-
-@@ -885,13 +1285,20 @@ void xen_poll_irq(int irq)
- struct sched_poll poll;
-
- poll.nr_ports = 1;
-- poll.timeout = 0;
-+ poll.timeout = timeout;
- set_xen_guest_handle(poll.ports, &evtchn);
-
- if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
- BUG();
- }
- }
-+EXPORT_SYMBOL(xen_poll_irq_timeout);
-+/* Poll waiting for an irq to become pending. In the usual case, the
-+ irq will be disabled so it won't deliver an interrupt. */
-+void xen_poll_irq(int irq)
++static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
++ struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
-+ xen_poll_irq_timeout(irq, 0 /* no timeout */);
-+}
-
- void xen_irq_resume(void)
- {
-@@ -928,13 +1335,38 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
- .retrigger = retrigger_dynirq,
- };
-
-+static struct irq_chip xen_pirq_chip __read_mostly = {
-+ .name = "xen-pirq",
++ struct ioctl_gntdev_get_offset_for_vaddr op;
++ struct grant_map *map;
+
-+ .startup = startup_pirq,
-+ .shutdown = shutdown_pirq,
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, offset for vaddr %lx\n", __FUNCTION__, priv,
++ (unsigned long)op.vaddr);
+
-+ .enable = enable_pirq,
-+ .unmask = enable_pirq,
++ down_read(&priv->sem);
++ map = gntdev_find_map_vaddr(priv, op.vaddr);
++ if (map == NULL ||
++ map->vma->vm_start != op.vaddr) {
++ up_read(&priv->sem);
++ return -EINVAL;
++ }
++ op.offset = map->index << PAGE_SHIFT;
++ op.count = map->count;
++ up_read(&priv->sem);
+
-+ .disable = disable_pirq,
-+ .mask = disable_pirq,
++ if (copy_to_user(u, &op, sizeof(op)) != 0)
++ return -EFAULT;
++ return 0;
++}
+
-+ .ack = ack_pirq,
-+ .end = end_pirq,
++static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
++ struct ioctl_gntdev_set_max_grants __user *u)
++{
++ struct ioctl_gntdev_set_max_grants op;
+
-+ .set_affinity = set_affinity_irq,
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, limit %d\n", __FUNCTION__, priv, op.count);
++ if (op.count > limit)
++ return -EINVAL;
+
-+ .retrigger = retrigger_dynirq,
-+};
++ down_write(&priv->sem);
++ priv->limit = op.count;
++ up_write(&priv->sem);
++ return 0;
++}
+
- void __init xen_init_IRQ(void)
- {
- int i;
-
- cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
- GFP_KERNEL);
-- BUG_ON(cpu_evtchn_mask_p == NULL);
-+ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
++static long gntdev_ioctl(struct file *flip,
++ unsigned int cmd, unsigned long arg)
++{
++ struct gntdev_priv *priv = flip->private_data;
++ void __user *ptr = (void __user *)arg;
+
-+ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
-+ GFP_KERNEL);
-+ for(i = 0; i < NR_EVENT_CHANNELS; i++)
-+ evtchn_to_irq[i] = -1;
-
- init_evtchn_cpu_bindings();
-
-@@ -943,4 +1375,6 @@ void __init xen_init_IRQ(void)
- mask_evtchn(i);
-
- irq_ctx_init(smp_processor_id());
++ switch (cmd) {
++ case IOCTL_GNTDEV_MAP_GRANT_REF:
++ return gntdev_ioctl_map_grant_ref(priv, ptr);
+
-+ xen_setup_pirqs();
- }
-diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
-index 79bedba..6a1c4a5 100644
---- a/drivers/xen/evtchn.c
-+++ b/drivers/xen/evtchn.c
-@@ -48,6 +48,8 @@
- #include <linux/gfp.h>
- #include <linux/mutex.h>
- #include <linux/cpu.h>
++ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
++ return gntdev_ioctl_unmap_grant_ref(priv, ptr);
+
-+#include <xen/xen.h>
- #include <xen/events.h>
- #include <xen/evtchn.h>
- #include <asm/xen/hypervisor.h>
-@@ -68,10 +70,36 @@ struct per_user_data {
- const char *name;
- };
-
--/* Who's bound to each port? */
--static struct per_user_data *port_user[NR_EVENT_CHANNELS];
-+/*
-+ * Who's bound to each port? This is logically an array of struct
-+ * per_user_data *, but we encode the current enabled-state in bit 0.
-+ */
-+static unsigned long *port_user;
- static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
-
-+static inline struct per_user_data *get_port_user(unsigned port)
-+{
-+ return (struct per_user_data *)(port_user[port] & ~1);
-+}
++ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
++ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
+
-+static inline void set_port_user(unsigned port, struct per_user_data *u)
-+{
-+ port_user[port] = (unsigned long)u;
-+}
++ case IOCTL_GNTDEV_SET_MAX_GRANTS:
++ return gntdev_ioctl_set_max_grants(priv, ptr);
+
-+static inline bool get_port_enabled(unsigned port)
-+{
-+ return port_user[port] & 1;
-+}
++ default:
++ if (debug)
++ printk("%s: priv %p, unknown cmd %x\n",
++ __FUNCTION__, priv, cmd);
++ return -ENOIOCTLCMD;
++ }
+
-+static inline void set_port_enabled(unsigned port, bool enabled)
-+{
-+ if (enabled)
-+ port_user[port] |= 1;
-+ else
-+ port_user[port] &= ~1;
++ return 0;
+}
+
- irqreturn_t evtchn_interrupt(int irq, void *data)
- {
- unsigned int port = (unsigned long)data;
-@@ -79,9 +107,14 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
-
- spin_lock(&port_user_lock);
-
-- u = port_user[port];
-+ u = get_port_user(port);
++static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
++{
++ struct gntdev_priv *priv = flip->private_data;
++ int index = vma->vm_pgoff;
++ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++ struct grant_map *map;
++ int err = -EINVAL;
+
-+ WARN(!get_port_enabled(port),
-+ "Interrupt for port %d, but apparently not enabled; per-user %p\n",
-+ port, u);
-
- disable_irq_nosync(irq);
-+ set_port_enabled(port, false);
-
- if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
- u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
-@@ -91,9 +124,8 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
- kill_fasync(&u->evtchn_async_queue,
- SIGIO, POLL_IN);
- }
-- } else {
-+ } else
- u->ring_overflow = 1;
-- }
-
- spin_unlock(&port_user_lock);
-
-@@ -197,9 +229,18 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
- goto out;
-
- spin_lock_irq(&port_user_lock);
-- for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
-- if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
-- enable_irq(irq_from_evtchn(kbuf[i]));
++ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
++ return -EINVAL;
+
-+ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
-+ unsigned port = kbuf[i];
++ if (debug)
++ printk("%s: map %d+%d at %lx (pgoff %lx)\n", __FUNCTION__,
++ index, count, vma->vm_start, vma->vm_pgoff);
+
-+ if (port < NR_EVENT_CHANNELS &&
-+ get_port_user(port) == u &&
-+ !get_port_enabled(port)) {
-+ set_port_enabled(port, true);
-+ enable_irq(irq_from_evtchn(port));
-+ }
++ down_read(&priv->sem);
++ map = gntdev_find_map_index(priv, index, count);
++ if (!map)
++ goto unlock_out;
++ if (map->vma)
++ goto unlock_out;
++ if (priv->mm != vma->vm_mm) {
++ printk("%s: Huh? Other mm?\n", __FUNCTION__);
++ goto unlock_out;
+ }
+
- spin_unlock_irq(&port_user_lock);
-
- rc = count;
-@@ -221,8 +262,9 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
- * interrupt handler yet, and our caller has already
- * serialized bind operations.)
- */
-- BUG_ON(port_user[port] != NULL);
-- port_user[port] = u;
-+ BUG_ON(get_port_user(port) != NULL);
-+ set_port_user(port, u);
-+ set_port_enabled(port, true); /* start enabled */
-
- rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
- u->name, (void *)(unsigned long)port);
-@@ -238,10 +280,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
-
- unbind_from_irqhandler(irq, (void *)(unsigned long)port);
-
-- /* make sure we unbind the irq handler before clearing the port */
-- barrier();
--
-- port_user[port] = NULL;
-+ set_port_user(port, NULL);
- }
-
- static long evtchn_ioctl(struct file *file,
-@@ -332,7 +371,7 @@ static long evtchn_ioctl(struct file *file,
- spin_lock_irq(&port_user_lock);
-
- rc = -ENOTCONN;
-- if (port_user[unbind.port] != u) {
-+ if (get_port_user(unbind.port) != u) {
- spin_unlock_irq(&port_user_lock);
- break;
- }
-@@ -354,7 +393,7 @@ static long evtchn_ioctl(struct file *file,
-
- if (notify.port >= NR_EVENT_CHANNELS) {
- rc = -EINVAL;
-- } else if (port_user[notify.port] != u) {
-+ } else if (get_port_user(notify.port) != u) {
- rc = -ENOTCONN;
- } else {
- notify_remote_via_evtchn(notify.port);
-@@ -443,10 +482,10 @@ static int evtchn_release(struct inode *inode, struct file *filp)
- free_page((unsigned long)u->ring);
-
- for (i = 0; i < NR_EVENT_CHANNELS; i++) {
-- if (port_user[i] != u)
-+ if (get_port_user(i) != u)
- continue;
-
-- evtchn_unbind_from_user(port_user[i], i);
-+ evtchn_unbind_from_user(get_port_user(i), i);
- }
-
- spin_unlock_irq(&port_user_lock);
-@@ -480,8 +519,11 @@ static int __init evtchn_init(void)
- if (!xen_domain())
- return -ENODEV;
-
-+ port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
-+ if (port_user == NULL)
-+ return -ENOMEM;
-+
- spin_lock_init(&port_user_lock);
-- memset(port_user, 0, sizeof(port_user));
-
- /* Create '/dev/misc/evtchn'. */
- err = misc_register(&evtchn_miscdev);
-@@ -497,6 +539,9 @@ static int __init evtchn_init(void)
-
- static void __exit evtchn_cleanup(void)
- {
-+ kfree(port_user);
-+ port_user = NULL;
-+
- misc_deregister(&evtchn_miscdev);
- }
-
-diff --git a/drivers/xen/features.c b/drivers/xen/features.c
-index 99eda16..9e2b64f 100644
---- a/drivers/xen/features.c
-+++ b/drivers/xen/features.c
-@@ -18,7 +18,7 @@
- u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
- EXPORT_SYMBOL_GPL(xen_features);
-
--void xen_setup_features(void)
-+void __init xen_setup_features(void)
- {
- struct xen_feature_info fi;
- int i, j;
-diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
-new file mode 100644
-index 0000000..ddc59cc
---- /dev/null
-+++ b/drivers/xen/gntdev.c
-@@ -0,0 +1,626 @@
-+/******************************************************************************
-+ * gntdev.c
-+ *
-+ * Device for accessing (in user-space) pages that have been granted by other
-+ * domains.
-+ *
-+ * Copyright (c) 2006-2007, D G Murray.
-+ * (c) 2009 Gerd Hoffmann <kraxel at redhat.com>
-+ *
-+ * This program is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-+ */
++ vma->vm_ops = &gntdev_vmops;
+
-+#include <linux/module.h>
-+#include <linux/kernel.h>
-+#include <linux/init.h>
-+#include <linux/miscdevice.h>
-+#include <linux/fs.h>
-+#include <linux/mm.h>
-+#include <linux/mman.h>
-+#include <linux/mmu_notifier.h>
-+#include <linux/types.h>
-+#include <linux/uaccess.h>
-+#include <linux/sched.h>
-+#include <linux/rwsem.h>
++ vma->vm_flags |= VM_RESERVED;
++ vma->vm_flags |= VM_DONTCOPY;
++ vma->vm_flags |= VM_DONTEXPAND;
+
-+#include <xen/xen.h>
-+#include <xen/grant_table.h>
-+#include <xen/gntdev.h>
-+#include <asm/xen/hypervisor.h>
-+#include <asm/xen/hypercall.h>
-+#include <asm/xen/page.h>
++ vma->vm_private_data = map;
++ map->vma = vma;
+
-+MODULE_LICENSE("GPL");
-+MODULE_AUTHOR("Derek G. Murray <Derek.Murray at cl.cam.ac.uk>, "
-+ "Gerd Hoffmann <kraxel at redhat.com>");
-+MODULE_DESCRIPTION("User-space granted page access driver");
++ map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte;
++ if (!(vma->vm_flags & VM_WRITE))
++ map->flags |= GNTMAP_readonly;
+
-+static int debug = 0;
-+module_param(debug, int, 0644);
-+static int limit = 1024;
-+module_param(limit, int, 0644);
++ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
++ vma->vm_end - vma->vm_start,
++ find_grant_ptes, map);
++ if (err) {
++ if (debug)
++ printk("%s: find_grant_ptes() failure.\n", __FUNCTION__);
++ goto unlock_out;
++ }
+
-+struct gntdev_priv {
-+ struct list_head maps;
-+ uint32_t used;
-+ uint32_t limit;
-+ struct rw_semaphore sem;
-+ struct mm_struct *mm;
-+ struct mmu_notifier mn;
-+};
++ err = map_grant_pages(map);
++ if (err) {
++ if (debug)
++ printk("%s: map_grant_pages() failure.\n", __FUNCTION__);
++ goto unlock_out;
++ }
++ map->is_mapped = 1;
+
-+struct grant_map {
-+ struct list_head next;
-+ struct gntdev_priv *priv;
-+ struct vm_area_struct *vma;
-+ int index;
-+ int count;
-+ int flags;
-+ int is_mapped;
-+ struct ioctl_gntdev_grant_ref *grants;
-+ struct gnttab_map_grant_ref *map_ops;
-+ struct gnttab_unmap_grant_ref *unmap_ops;
++unlock_out:
++ up_read(&priv->sem);
++ return err;
++}
++
++static const struct file_operations gntdev_fops = {
++ .owner = THIS_MODULE,
++ .open = gntdev_open,
++ .release = gntdev_release,
++ .mmap = gntdev_mmap,
++ .unlocked_ioctl = gntdev_ioctl
++};
++
++static struct miscdevice gntdev_miscdev = {
++ .minor = MISC_DYNAMIC_MINOR,
++ .name = "gntdev",
++ .fops = &gntdev_fops,
+};
+
+/* ------------------------------------------------------------------ */
+
-+static void gntdev_print_maps(struct gntdev_priv *priv,
-+ char *text, int text_index)
++static int __init gntdev_init(void)
+{
-+ struct grant_map *map;
++ int err;
+
-+ printk("%s: maps list (priv %p, usage %d/%d)\n",
-+ __FUNCTION__, priv, priv->used, priv->limit);
-+ list_for_each_entry(map, &priv->maps, next)
-+ printk(" index %2d, count %2d %s\n",
-+ map->index, map->count,
-+ map->index == text_index && text ? text : "");
++ if (!xen_domain())
++ return -ENODEV;
++
++ err = misc_register(&gntdev_miscdev);
++ if (err != 0) {
++ printk(KERN_ERR "Could not register gntdev device\n");
++ return err;
++ }
++ return 0;
+}
+
-+static struct grant_map *gntdev_add_map(struct gntdev_priv *priv, int count)
++static void __exit gntdev_exit(void)
+{
-+ struct grant_map *map, *add;
++ misc_deregister(&gntdev_miscdev);
++}
+
-+ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
-+ if (NULL == add)
-+ return NULL;
++module_init(gntdev_init);
++module_exit(gntdev_exit);
+
-+ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
-+ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
-+ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
-+ if (NULL == add->grants ||
-+ NULL == add->map_ops ||
-+ NULL == add->unmap_ops)
-+ goto err;
++/* ------------------------------------------------------------------ */
+diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
+index 7d8f531..76fe621 100644
+--- a/drivers/xen/grant-table.c
++++ b/drivers/xen/grant-table.c
+@@ -37,6 +37,7 @@
+ #include <linux/vmalloc.h>
+ #include <linux/uaccess.h>
+
++#include <xen/xen.h>
+ #include <xen/interface/xen.h>
+ #include <xen/page.h>
+ #include <xen/grant_table.h>
+@@ -472,6 +473,111 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+ return 0;
+ }
+
++static void gnttab_page_free(struct page *page, unsigned int order)
++{
++ BUG_ON(order);
++ ClearPageForeign(page);
++ gnttab_reset_grant_page(page);
++ put_page(page);
++}
+
-+ add->index = 0;
-+ add->count = count;
-+ add->priv = priv;
++/*
++ * Must not be called with IRQs off. This should only be used on the
++ * slow path.
++ *
++ * Copy a foreign granted page to local memory.
++ */
++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
++{
++ struct gnttab_unmap_and_replace unmap;
++ struct mmu_update mmu;
++ struct page *page;
++ struct page *new_page;
++ void *new_addr;
++ void *addr;
++ unsigned long pfn;
++ unsigned long mfn;
++ unsigned long new_mfn;
++ int err;
+
-+ if (add->count + priv->used > priv->limit)
-+ goto err;
++ page = *pagep;
++ if (!get_page_unless_zero(page))
++ return -ENOENT;
+
-+ list_for_each_entry(map, &priv->maps, next) {
-+ if (add->index + add->count < map->index) {
-+ list_add_tail(&add->next, &map->next);
-+ goto done;
-+ }
-+ add->index = map->index + map->count;
++ err = -ENOMEM;
++ new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
++ if (!new_page)
++ goto out;
++
++ new_addr = page_address(new_page);
++ addr = page_address(page);
++ memcpy(new_addr, addr, PAGE_SIZE);
++
++ pfn = page_to_pfn(page);
++ mfn = pfn_to_mfn(pfn);
++ new_mfn = virt_to_mfn(new_addr);
++
++// write_seqlock(&gnttab_dma_lock); /* protects __gnttab_dma_map_page on 2.6.18 */
++
++ /* Make seq visible before checking page_mapped. */
++ smp_mb();
++
++ /* Has the page been DMA-mapped? */
++ if (unlikely(page_mapped(page))) {
++ //write_sequnlock(&gnttab_dma_lock);
++ put_page(new_page);
++ err = -EBUSY;
++ goto out;
+ }
-+ list_add_tail(&add->next, &priv->maps);
+
-+done:
-+ priv->used += add->count;
-+ if (debug)
-+ gntdev_print_maps(priv, "[new]", add->index);
-+ return add;
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ set_phys_to_machine(pfn, new_mfn);
+
-+err:
-+ kfree(add->grants);
-+ kfree(add->map_ops);
-+ kfree(add->unmap_ops);
-+ kfree(add);
-+ return NULL;
-+}
++ //gnttab_set_replace_op(&unmap, (unsigned long)addr,
++ // (unsigned long)new_addr, ref);
++ unmap.host_addr = (unsigned long)addr;
++ unmap.new_addr = (unsigned long)new_addr;
++ unmap.handle = ref;
+
-+static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, int index,
-+ int count)
-+{
-+ struct grant_map *map;
++ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
++ &unmap, 1);
++ BUG_ON(err);
++ BUG_ON(unmap.status);
+
-+ list_for_each_entry(map, &priv->maps, next) {
-+ if (map->index != index)
-+ continue;
-+ if (map->count != count)
-+ continue;
-+ return map;
++// write_sequnlock(&gnttab_dma_lock);
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++ set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
++
++ mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
++ mmu.val = pfn;
++ err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
++ BUG_ON(err);
+ }
-+ return NULL;
++
++ new_page->mapping = page->mapping;
++ SetPageForeign(new_page, _PageForeignDestructor(page));
++ if (PageReserved(page))
++ SetPageReserved(new_page);
++ *pagep = new_page;
++
++ SetPageForeign(page, gnttab_page_free);
++ ClearPageReserved(page);
++ page->mapping = NULL;
++
++out:
++ put_page(page);
++ return err;
+}
++EXPORT_SYMBOL_GPL(gnttab_copy_grant_page);
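
The function exported above follows a copy-and-replace scheme: allocate a fresh local
page, memcpy the foreign contents into it, swap the machine-frame bookkeeping, and hand
the caller the local copy so the granted page can be returned to its owner. The pointer
handover, stripped of all the p2m/hypercall details, in a standalone sketch (uses POSIX
strdup; names are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int copy_replace(char **bufp, size_t len)
    {
            char *new_buf = malloc(len);

            if (!new_buf)
                    return -1;
            memcpy(new_buf, *bufp, len);    /* snapshot the foreign data */
            *bufp = new_buf;                /* caller now owns a local copy */
            return 0;
    }

    int main(void)
    {
            char *buf = strdup("granted data");
            char *orig = buf;

            if (copy_replace(&buf, strlen(buf) + 1) == 0)
                    printf("local copy: %s (moved: %s)\n", buf,
                           buf != orig ? "yes" : "no");
            free(orig);
            free(buf);
            return 0;
    }
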
+
-+static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
-+ unsigned long vaddr)
++void gnttab_reset_grant_page(struct page *page)
+{
-+ struct grant_map *map;
-+
-+ list_for_each_entry(map, &priv->maps, next) {
-+ if (!map->vma)
-+ continue;
-+ if (vaddr < map->vma->vm_start)
-+ continue;
-+ if (vaddr >= map->vma->vm_end)
-+ continue;
-+ return map;
-+ }
-+ return NULL;
++ init_page_count(page);
++ reset_page_mapcount(page);
+}
++EXPORT_SYMBOL_GPL(gnttab_reset_grant_page);
++
+ int gnttab_resume(void)
+ {
+ if (max_nr_grant_frames() < nr_grant_frames)
+diff --git a/drivers/xen/netback/Makefile b/drivers/xen/netback/Makefile
+new file mode 100644
+index 0000000..e346e81
+--- /dev/null
++++ b/drivers/xen/netback/Makefile
+@@ -0,0 +1,3 @@
++obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
++
++xen-netback-y := netback.o xenbus.o interface.o
+diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
+new file mode 100644
+index 0000000..51f97c0
+--- /dev/null
++++ b/drivers/xen/netback/common.h
+@@ -0,0 +1,227 @@
++/******************************************************************************
++ * arch/xen/drivers/netif/backend/common.h
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __NETIF__BACKEND__COMMON_H__
++#define __NETIF__BACKEND__COMMON_H__
+
-+static int gntdev_del_map(struct grant_map *map)
-+{
-+ int i;
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/ip.h>
++#include <linux/in.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
+
-+ if (map->vma)
-+ return -EBUSY;
-+ for (i = 0; i < map->count; i++)
-+ if (map->unmap_ops[i].handle)
-+ return -EBUSY;
++#include <xen/interface/io/netif.h>
++#include <asm/io.h>
++#include <asm/pgalloc.h>
++#include <xen/interface/grant_table.h>
++#include <xen/grant_table.h>
++#include <xen/xenbus.h>
+
-+ map->priv->used -= map->count;
-+ list_del(&map->next);
-+ kfree(map->grants);
-+ kfree(map->map_ops);
-+ kfree(map->unmap_ops);
-+ kfree(map);
-+ return 0;
-+}
++#define DPRINTK(_f, _a...) \
++ pr_debug("(file=%s, line=%d) " _f, \
++ __FILE__ , __LINE__ , ## _a )
++#define IPRINTK(fmt, args...) \
++ printk(KERN_INFO "xen_net: " fmt, ##args)
++#define WPRINTK(fmt, args...) \
++ printk(KERN_WARNING "xen_net: " fmt, ##args)
+
-+/* ------------------------------------------------------------------ */
++struct xen_netif {
++ /* Unique identifier for this interface. */
++ domid_t domid;
++ unsigned int handle;
+
-+static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void *data)
-+{
-+ struct grant_map *map = data;
-+ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
-+ u64 pte_maddr;
++ u8 fe_dev_addr[6];
+
-+ BUG_ON(pgnr >= map->count);
-+ pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT;
-+ pte_maddr += (unsigned long)pte & ~PAGE_MASK;
-+ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags,
-+ map->grants[pgnr].ref,
-+ map->grants[pgnr].domid);
-+ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags,
-+ 0 /* handle */);
-+ return 0;
-+}
++ /* Physical parameters of the comms window. */
++ grant_handle_t tx_shmem_handle;
++ grant_ref_t tx_shmem_ref;
++ grant_handle_t rx_shmem_handle;
++ grant_ref_t rx_shmem_ref;
++ unsigned int irq;
+
-+static int map_grant_pages(struct grant_map *map)
-+{
-+ int i, err = 0;
++ /* The shared rings and indexes. */
++ struct xen_netif_tx_back_ring tx;
++ struct xen_netif_rx_back_ring rx;
++ struct vm_struct *tx_comms_area;
++ struct vm_struct *rx_comms_area;
+
-+ if (debug)
-+ printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count);
-+ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
-+ map->map_ops, map->count);
-+ if (WARN_ON(err))
-+ return err;
++ /* Set of features that can be turned on in dev->features. */
++ int features;
+
-+ for (i = 0; i < map->count; i++) {
-+ if (map->map_ops[i].status)
-+ err = -EINVAL;
-+ map->unmap_ops[i].handle = map->map_ops[i].handle;
-+ }
-+ return err;
-+}
++ int smart_poll;
+
-+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
-+{
-+ int i, err = 0;
++ /* Internal feature information. */
++ u8 can_queue:1; /* can queue packets for receiver? */
+
-+ if (debug)
-+ printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__,
-+ map->index, map->count, offset, pages);
-+ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
-+ map->unmap_ops + offset, pages);
-+ if (WARN_ON(err))
-+ return err;
++ /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
++ RING_IDX rx_req_cons_peek;
+
-+ for (i = 0; i < pages; i++) {
-+ if (map->unmap_ops[offset+i].status)
-+ err = -EINVAL;
-+ map->unmap_ops[offset+i].handle = 0;
-+ }
-+ return err;
-+}
++ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
++ unsigned long credit_bytes;
++ unsigned long credit_usec;
++ unsigned long remaining_credit;
++ struct timer_list credit_timeout;
+
-+/* ------------------------------------------------------------------ */
++ /* Enforce draining of the transmit queue. */
++ struct timer_list tx_queue_timeout;
+
-+static void gntdev_vma_close(struct vm_area_struct *vma)
-+{
-+ struct grant_map *map = vma->vm_private_data;
++ /* Statistics */
++ int nr_copied_skbs;
+
-+ if (debug)
-+ printk("%s\n", __FUNCTION__);
-+ map->is_mapped = 0;
-+ map->vma = NULL;
-+ vma->vm_private_data = NULL;
-+}
++ /* Miscellaneous private stuff. */
++ struct list_head list; /* scheduling list */
++ atomic_t refcnt;
++ struct net_device *dev;
++ struct net_device_stats stats;
+
-+static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-+{
-+ if (debug)
-+ printk("%s: vaddr %p, pgoff %ld (shouldn't happen)\n",
-+ __FUNCTION__, vmf->virtual_address, vmf->pgoff);
-+ vmf->flags = VM_FAULT_ERROR;
-+ return 0;
-+}
++ unsigned int carrier;
+
-+static struct vm_operations_struct gntdev_vmops = {
-+ .close = gntdev_vma_close,
-+ .fault = gntdev_vma_fault,
++ wait_queue_head_t waiting_to_free;
+};
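
The credit_bytes/credit_usec/remaining_credit fields above support netback's transmit
shaping: a budget of credit_bytes is replenished every credit_usec, and packets are only
forwarded while credit remains. The idea in a standalone sketch (a timer callback is
modelled by an explicit replenish() call; all names are invented):

    #include <stdio.h>

    static unsigned long credit_bytes = 1500 * 4;   /* budget per period */
    static unsigned long remaining_credit;

    static void replenish(void)                     /* timer callback analogue */
    {
            remaining_credit = credit_bytes;
    }

    static int may_transmit(unsigned long skb_len)
    {
            if (skb_len > remaining_credit)
                    return 0;                       /* throttle until next period */
            remaining_credit -= skb_len;
            return 1;
    }

    int main(void)
    {
            unsigned i;

            replenish();
            for (i = 0; i < 5; i++)
                    printf("packet %u: %s\n", i,
                           may_transmit(1500) ? "sent" : "throttled");
            return 0;
    }
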
+
-+/* ------------------------------------------------------------------ */
++/*
++ * Implement our own carrier flag: the network stack's version causes delays
++ * when the carrier is re-enabled (in particular, dev_activate() may not
++ * immediately be called, which can cause packet loss; also the etherbridge
++ * can be rather lazy in activating its port).
++ */
++#define netback_carrier_on(netif) ((netif)->carrier = 1)
++#define netback_carrier_off(netif) ((netif)->carrier = 0)
++#define netback_carrier_ok(netif) ((netif)->carrier)
+
-+static void mn_invl_range_start(struct mmu_notifier *mn,
-+ struct mm_struct *mm,
-+ unsigned long start, unsigned long end)
-+{
-+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
-+ struct grant_map *map;
-+ unsigned long mstart, mend;
-+ int err;
++enum {
++ NETBK_DONT_COPY_SKB,
++ NETBK_DELAYED_COPY_SKB,
++ NETBK_ALWAYS_COPY_SKB,
++};
+
-+ down_read(&priv->sem);
-+ list_for_each_entry(map, &priv->maps, next) {
-+ if (!map->vma)
-+ continue;
-+ if (!map->is_mapped)
-+ continue;
-+ if (map->vma->vm_start >= end)
-+ continue;
-+ if (map->vma->vm_end <= start)
-+ continue;
-+ mstart = max(start, map->vma->vm_start);
-+ mend = min(end, map->vma->vm_end);
-+ if (debug)
-+ printk("%s: map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
-+ __FUNCTION__, map->index, map->count,
-+ map->vma->vm_start, map->vma->vm_end,
-+ start, end, mstart, mend);
-+ err = unmap_grant_pages(map,
-+ (mstart - map->vma->vm_start) >> PAGE_SHIFT,
-+ (mend - mstart) >> PAGE_SHIFT);
-+ WARN_ON(err);
-+ }
-+ up_read(&priv->sem);
-+}
++extern int netbk_copy_skb_mode;
+
-+static void mn_invl_page(struct mmu_notifier *mn,
-+ struct mm_struct *mm,
-+ unsigned long address)
-+{
-+ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
-+}
++/* Function pointers into netback accelerator plugin modules */
++struct netback_accel_hooks {
++ struct module *owner;
++ int (*probe)(struct xenbus_device *dev);
++ int (*remove)(struct xenbus_device *dev);
++};
+
-+static void mn_release(struct mmu_notifier *mn,
-+ struct mm_struct *mm)
-+{
-+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
-+ struct grant_map *map;
-+ int err;
++/* Structure to track the state of a netback accelerator plugin */
++struct netback_accelerator {
++ struct list_head link;
++ int id;
++ char *eth_name;
++ atomic_t use_count;
++ struct netback_accel_hooks *hooks;
++};
+
-+ down_read(&priv->sem);
-+ list_for_each_entry(map, &priv->maps, next) {
-+ if (!map->vma)
-+ continue;
-+ if (debug)
-+ printk("%s: map %d+%d (%lx %lx)\n",
-+ __FUNCTION__, map->index, map->count,
-+ map->vma->vm_start, map->vma->vm_end);
-+ err = unmap_grant_pages(map, 0, map->count);
-+ WARN_ON(err);
-+ }
-+ up_read(&priv->sem);
-+}
++struct backend_info {
++ struct xenbus_device *dev;
++ struct xen_netif *netif;
++ enum xenbus_state frontend_state;
++ struct xenbus_watch hotplug_status_watch;
++ unsigned int have_hotplug_status_watch:1;
+
-+struct mmu_notifier_ops gntdev_mmu_ops = {
-+ .release = mn_release,
-+ .invalidate_page = mn_invl_page,
-+ .invalidate_range_start = mn_invl_range_start,
++ /* State relating to the netback accelerator */
++ void *netback_accel_priv;
++ /* The accelerator that this backend is currently using */
++ struct netback_accelerator *accelerator;
+};
+
-+/* ------------------------------------------------------------------ */
++#define NETBACK_ACCEL_VERSION 0x00010001
++
++/*
++ * Connect an accelerator plugin module to netback. Returns zero on
++ * success, < 0 on error, or > 0 (the highest version number that
++ * netback supports) on a version mismatch.
++ */
++extern int netback_connect_accelerator(unsigned version,
++ int id, const char *eth_name,
++ struct netback_accel_hooks *hooks);
++/* Disconnect a previously connected accelerator plugin module */
++extern void netback_disconnect_accelerator(int id, const char *eth_name);
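
A minimal, hypothetical sketch of a plugin using this API, showing the version-negotiation convention described above (all my_accel_* names and the "eth0" choice are illustrative, not part of this patch):

    static int my_accel_probe(struct xenbus_device *dev)
    {
            /* set up per-device acceleration state */
            return 0;
    }

    static int my_accel_remove(struct xenbus_device *dev)
    {
            /* tear down per-device acceleration state */
            return 0;
    }

    static struct netback_accel_hooks my_accel_hooks = {
            .owner  = THIS_MODULE,
            .probe  = my_accel_probe,
            .remove = my_accel_remove,
    };

    static int __init my_accel_init(void)
    {
            int ret = netback_connect_accelerator(NETBACK_ACCEL_VERSION, 0,
                                                  "eth0", &my_accel_hooks);
            if (ret > 0)            /* mismatch: ret is netback's version */
                    return -EPROTO;
            return ret;             /* 0 on success, -errno on error */
    }
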
++
+
-+static int gntdev_open(struct inode *inode, struct file *flip)
-+{
-+ struct gntdev_priv *priv;
++extern
++void netback_probe_accelerators(struct backend_info *be,
++ struct xenbus_device *dev);
++extern
++void netback_remove_accelerators(struct backend_info *be,
++ struct xenbus_device *dev);
++extern
++void netif_accel_init(void);
+
-+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-+ if (!priv)
-+ return -ENOMEM;
+
-+ INIT_LIST_HEAD(&priv->maps);
-+ init_rwsem(&priv->sem);
-+ priv->limit = limit;
++#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
++#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
+
-+ priv->mm = get_task_mm(current);
-+ if (!priv->mm) {
-+ kfree(priv);
-+ return -ENOMEM;
-+ }
-+ priv->mn.ops = &gntdev_mmu_ops;
-+ mmu_notifier_register(&priv->mn, priv->mm);
-+ mmput(priv->mm);
++void netif_disconnect(struct xen_netif *netif);
+
-+ flip->private_data = priv;
-+ if (debug)
-+ printk("%s: priv %p\n", __FUNCTION__, priv);
++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle);
++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
++ unsigned long rx_ring_ref, unsigned int evtchn);
+
-+ return 0;
++static inline void netif_get(struct xen_netif *netif)
++{
++ atomic_inc(&netif->refcnt);
+}
+
-+static int gntdev_release(struct inode *inode, struct file *flip)
++static inline void netif_put(struct xen_netif *netif)
+{
-+ struct gntdev_priv *priv = flip->private_data;
-+ struct grant_map *map;
-+ int err;
-+
-+ if (debug)
-+ printk("%s: priv %p\n", __FUNCTION__, priv);
-+
-+ down_write(&priv->sem);
-+ while (!list_empty(&priv->maps)) {
-+ map = list_entry(priv->maps.next, struct grant_map, next);
-+ err = gntdev_del_map(map);
-+ WARN_ON(err);
-+ }
-+ up_write(&priv->sem);
-+ mmu_notifier_unregister(&priv->mn, priv->mm);
-+ kfree(priv);
-+ return 0;
++ if (atomic_dec_and_test(&netif->refcnt))
++ wake_up(&netif->waiting_to_free);
+}
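
These two inlines form the interface's teardown handshake: every in-flight user holds a reference, and the final netif_put() wakes the thread blocked in netif_disconnect() (interface.c, later in this patch). A minimal sketch of a hypothetical asynchronous user, names illustrative:

    static void example_async_start(struct xen_netif *netif)
    {
            netif_get(netif);       /* pin: teardown now waits for us */
            /* hand 'netif' to deferred machinery here */
    }

    static void example_async_done(struct xen_netif *netif)
    {
            netif_put(netif);       /* final put wakes waiting_to_free */
    }
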
+
-+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
-+ struct ioctl_gntdev_map_grant_ref __user *u)
-+{
-+ struct ioctl_gntdev_map_grant_ref op;
-+ struct grant_map *map;
-+ int err;
++int netif_xenbus_init(void);
+
-+ if (copy_from_user(&op, u, sizeof(op)) != 0)
-+ return -EFAULT;
-+ if (debug)
-+ printk("%s: priv %p, add %d\n", __FUNCTION__, priv,
-+ op.count);
-+ if (unlikely(op.count <= 0))
-+ return -EINVAL;
-+ if (unlikely(op.count > priv->limit))
-+ return -EINVAL;
++#define netif_schedulable(netif) \
++ (netif_running((netif)->dev) && netback_carrier_ok(netif))
+
-+ down_write(&priv->sem);
-+ err = -ENOMEM;
-+ map = gntdev_add_map(priv, op.count);
-+ if (!map)
-+ goto err_unlock;
++void netif_schedule_work(struct xen_netif *netif);
++void netif_deschedule_work(struct xen_netif *netif);
+
-+ err = -ENOMEM;
-+ if (copy_from_user(map->grants, &u->refs,
-+ sizeof(map->grants[0]) * op.count) != 0)
-+ goto err_free;
-+ op.index = map->index << PAGE_SHIFT;
-+ if (copy_to_user(u, &op, sizeof(op)) != 0)
-+ goto err_free;
-+ up_write(&priv->sem);
-+ return 0;
++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
++struct net_device_stats *netif_be_get_stats(struct net_device *dev);
++irqreturn_t netif_be_int(int irq, void *dev_id);
+
-+err_free:
-+ gntdev_del_map(map);
-+err_unlock:
-+ up_write(&priv->sem);
-+ return err;
++static inline int netbk_can_queue(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ return netif->can_queue;
+}
+
-+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
-+ struct ioctl_gntdev_unmap_grant_ref __user *u)
++static inline int netbk_can_sg(struct net_device *dev)
+{
-+ struct ioctl_gntdev_unmap_grant_ref op;
-+ struct grant_map *map;
-+ int err = -EINVAL;
-+
-+ if (copy_from_user(&op, u, sizeof(op)) != 0)
-+ return -EFAULT;
-+ if (debug)
-+ printk("%s: priv %p, del %d+%d\n", __FUNCTION__, priv,
-+ (int)op.index, (int)op.count);
-+
-+ down_write(&priv->sem);
-+ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
-+ if (map)
-+ err = gntdev_del_map(map);
-+ up_write(&priv->sem);
-+ return err;
++ struct xen_netif *netif = netdev_priv(dev);
++ return netif->features & NETIF_F_SG;
+}
+
-+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
-+ struct ioctl_gntdev_get_offset_for_vaddr __user *u)
-+{
-+ struct ioctl_gntdev_get_offset_for_vaddr op;
-+ struct grant_map *map;
++#endif /* __NETIF__BACKEND__COMMON_H__ */
+diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
+new file mode 100644
+index 0000000..086d939
+--- /dev/null
++++ b/drivers/xen/netback/interface.c
+@@ -0,0 +1,410 @@
++/******************************************************************************
++ * arch/xen/drivers/netif/backend/interface.c
++ *
++ * Network-device interface management.
++ *
++ * Copyright (c) 2004-2005, Keir Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
+
-+ if (copy_from_user(&op, u, sizeof(op)) != 0)
-+ return -EFAULT;
-+ if (debug)
-+ printk("%s: priv %p, offset for vaddr %lx\n", __FUNCTION__, priv,
-+ (unsigned long)op.vaddr);
++#include "common.h"
++#include <linux/ethtool.h>
++#include <linux/rtnetlink.h>
+
-+ down_read(&priv->sem);
-+ map = gntdev_find_map_vaddr(priv, op.vaddr);
-+ if (map == NULL ||
-+ map->vma->vm_start != op.vaddr) {
-+ up_read(&priv->sem);
-+ return -EINVAL;
-+ }
-+ op.offset = map->index << PAGE_SHIFT;
-+ op.count = map->count;
-+ up_read(&priv->sem);
++#include <xen/events.h>
++#include <asm/xen/hypercall.h>
+
-+ if (copy_to_user(u, &op, sizeof(op)) != 0)
-+ return -EFAULT;
-+ return 0;
-+}
++/*
++ * Module parameter 'queue_length':
++ *
++ * Enables queuing in the network stack when a client has run out of receive
++ * descriptors. Although this feature can improve receive bandwidth by avoiding
++ * packet loss, it can also result in packets sitting in the 'tx_queue' for
++ * unbounded time. This is bad if those packets hold onto foreign resources.
++ * For example, consider a packet that holds onto resources belonging to the
++ * guest for which it is queued (e.g., packet received on vif1.0, destined for
++ * vif1.1 which is not activated in the guest): in this situation the guest
++ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
++ * run a timer (tx_queue_timeout) to drain the queue when the interface is
++ * blocked.
++ */
++static unsigned long netbk_queue_length = 32;
++module_param_named(queue_length, netbk_queue_length, ulong, 0644);
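
Concretely, the parameter becomes the device's qdisc length (dev->tx_queue_len, set in netif_alloc() below), and the drain timer is armed whenever the shared ring runs out of space; condensed from netif_be_start_xmit() in netback.c later in this patch:

    if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
            netif_stop_queue(dev);  /* start queuing in the qdisc */
            netif->tx_queue_timeout.data = (unsigned long)netif;
            netif->tx_queue_timeout.function = tx_queue_callback;
            mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
    }

With permissions 0644 the limit should also be tunable at runtime, presumably via /sys/module/xen_netback/parameters/queue_length given the module name in the netback Makefile.
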
+
-+static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
-+ struct ioctl_gntdev_set_max_grants __user *u)
++static void __netif_up(struct xen_netif *netif)
+{
-+ struct ioctl_gntdev_set_max_grants op;
-+
-+ if (copy_from_user(&op, u, sizeof(op)) != 0)
-+ return -EFAULT;
-+ if (debug)
-+ printk("%s: priv %p, limit %d\n", __FUNCTION__, priv, op.count);
-+ if (op.count > limit)
-+ return -EINVAL;
-+
-+ down_write(&priv->sem);
-+ priv->limit = op.count;
-+ up_write(&priv->sem);
-+ return 0;
++ enable_irq(netif->irq);
++ netif_schedule_work(netif);
+}
+
-+static long gntdev_ioctl(struct file *flip,
-+ unsigned int cmd, unsigned long arg)
++static void __netif_down(struct xen_netif *netif)
+{
-+ struct gntdev_priv *priv = flip->private_data;
-+ void __user *ptr = (void __user *)arg;
-+
-+ switch (cmd) {
-+ case IOCTL_GNTDEV_MAP_GRANT_REF:
-+ return gntdev_ioctl_map_grant_ref(priv, ptr);
-+
-+ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
-+ return gntdev_ioctl_unmap_grant_ref(priv, ptr);
-+
-+ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
-+ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
-+
-+ case IOCTL_GNTDEV_SET_MAX_GRANTS:
-+ return gntdev_ioctl_set_max_grants(priv, ptr);
++ disable_irq(netif->irq);
++ netif_deschedule_work(netif);
++}
+
-+ default:
-+ if (debug)
-+ printk("%s: priv %p, unknown cmd %x\n",
-+ __FUNCTION__, priv, cmd);
-+ return -ENOIOCTLCMD;
++static int net_open(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ if (netback_carrier_ok(netif)) {
++ __netif_up(netif);
++ netif_start_queue(dev);
+ }
++ return 0;
++}
+
++static int net_close(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ if (netback_carrier_ok(netif))
++ __netif_down(netif);
++ netif_stop_queue(dev);
+ return 0;
+}
+
-+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
++static int netbk_change_mtu(struct net_device *dev, int mtu)
+{
-+ struct gntdev_priv *priv = flip->private_data;
-+ int index = vma->vm_pgoff;
-+ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-+ struct grant_map *map;
-+ int err = -EINVAL;
++ int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
-+ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
++ if (mtu > max)
+ return -EINVAL;
++ dev->mtu = mtu;
++ return 0;
++}
+
-+ if (debug)
-+ printk("%s: map %d+%d at %lx (pgoff %lx)\n", __FUNCTION__,
-+ index, count, vma->vm_start, vma->vm_pgoff);
++static int netbk_set_sg(struct net_device *dev, u32 data)
++{
++ if (data) {
++ struct xen_netif *netif = netdev_priv(dev);
+
-+ down_read(&priv->sem);
-+ map = gntdev_find_map_index(priv, index, count);
-+ if (!map)
-+ goto unlock_out;
-+ if (map->vma)
-+ goto unlock_out;
-+ if (priv->mm != vma->vm_mm) {
-+ printk("%s: Huh? Other mm?\n", __FUNCTION__);
-+ goto unlock_out;
++ if (!(netif->features & NETIF_F_SG))
++ return -ENOSYS;
+ }
+
-+ vma->vm_ops = &gntdev_vmops;
-+
-+ vma->vm_flags |= VM_RESERVED;
-+ vma->vm_flags |= VM_DONTCOPY;
-+ vma->vm_flags |= VM_DONTEXPAND;
-+
-+ vma->vm_private_data = map;
-+ map->vma = vma;
++ if (dev->mtu > ETH_DATA_LEN)
++ dev->mtu = ETH_DATA_LEN;
+
-+ map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte;
-+ if (!(vma->vm_flags & VM_WRITE))
-+ map->flags |= GNTMAP_readonly;
++ return ethtool_op_set_sg(dev, data);
++}
+
-+ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
-+ vma->vm_end - vma->vm_start,
-+ find_grant_ptes, map);
-+ if (err) {
-+ goto unlock_out;
-+ if (debug)
-+ printk("%s: find_grant_ptes() failure.\n", __FUNCTION__);
-+ }
++static int netbk_set_tso(struct net_device *dev, u32 data)
++{
++ if (data) {
++ struct xen_netif *netif = netdev_priv(dev);
+
-+ err = map_grant_pages(map);
-+ if (err) {
-+ goto unlock_out;
-+ if (debug)
-+ printk("%s: map_grant_pages() failure.\n", __FUNCTION__);
++ if (!(netif->features & NETIF_F_TSO))
++ return -ENOSYS;
+ }
-+ map->is_mapped = 1;
+
-+unlock_out:
-+ up_read(&priv->sem);
-+ return err;
++ return ethtool_op_set_tso(dev, data);
+}
+
-+static const struct file_operations gntdev_fops = {
-+ .owner = THIS_MODULE,
-+ .open = gntdev_open,
-+ .release = gntdev_release,
-+ .mmap = gntdev_mmap,
-+ .unlocked_ioctl = gntdev_ioctl
-+};
++static void netbk_get_drvinfo(struct net_device *dev,
++ struct ethtool_drvinfo *info)
++{
++ strcpy(info->driver, "netbk");
++ strcpy(info->bus_info, dev_name(dev->dev.parent));
++}
+
-+static struct miscdevice gntdev_miscdev = {
-+ .minor = MISC_DYNAMIC_MINOR,
-+ .name = "gntdev",
-+ .fops = &gntdev_fops,
++static const struct netif_stat {
++ char name[ETH_GSTRING_LEN];
++ u16 offset;
++} netbk_stats[] = {
++ { "copied_skbs", offsetof(struct xen_netif, nr_copied_skbs) },
+};
+
-+/* ------------------------------------------------------------------ */
-+
-+static int __init gntdev_init(void)
++static int netbk_get_sset_count(struct net_device *dev, int string_set)
+{
-+ int err;
-+
-+ if (!xen_domain())
-+ return -ENODEV;
-+
-+ err = misc_register(&gntdev_miscdev);
-+ if (err != 0) {
-+ printk(KERN_ERR "Could not register gntdev device\n");
-+ return err;
++ switch (string_set) {
++ case ETH_SS_STATS:
++ return ARRAY_SIZE(netbk_stats);
++ default:
++ return -EINVAL;
+ }
-+ return 0;
+}
+
-+static void __exit gntdev_exit(void)
++static void netbk_get_ethtool_stats(struct net_device *dev,
++ struct ethtool_stats *stats, u64 * data)
+{
-+ misc_deregister(&gntdev_miscdev);
-+}
-+
-+module_init(gntdev_init);
-+module_exit(gntdev_exit);
++ void *netif = netdev_priv(dev);
++ int i;
+
-+/* ------------------------------------------------------------------ */
-diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
-index 7d8f531..76fe621 100644
---- a/drivers/xen/grant-table.c
-+++ b/drivers/xen/grant-table.c
-@@ -37,6 +37,7 @@
- #include <linux/vmalloc.h>
- #include <linux/uaccess.h>
-
-+#include <xen/xen.h>
- #include <xen/interface/xen.h>
- #include <xen/page.h>
- #include <xen/grant_table.h>
-@@ -472,6 +473,111 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
- return 0;
- }
-
-+static void gnttab_page_free(struct page *page, unsigned int order)
-+{
-+ BUG_ON(order);
-+ ClearPageForeign(page);
-+ gnttab_reset_grant_page(page);
-+ put_page(page);
++ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
++ data[i] = *(int *)(netif + netbk_stats[i].offset);
+}
+
-+/*
-+ * Must not be called with IRQs off. This should only be used on the
-+ * slow path.
-+ *
-+ * Copy a foreign granted page to local memory.
-+ */
-+int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
++static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data)
+{
-+ struct gnttab_unmap_and_replace unmap;
-+ struct mmu_update mmu;
-+ struct page *page;
-+ struct page *new_page;
-+ void *new_addr;
-+ void *addr;
-+ unsigned long pfn;
-+ unsigned long mfn;
-+ unsigned long new_mfn;
-+ int err;
-+
-+ page = *pagep;
-+ if (!get_page_unless_zero(page))
-+ return -ENOENT;
-+
-+ err = -ENOMEM;
-+ new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
-+ if (!new_page)
-+ goto out;
-+
-+ new_addr = page_address(new_page);
-+ addr = page_address(page);
-+ memcpy(new_addr, addr, PAGE_SIZE);
-+
-+ pfn = page_to_pfn(page);
-+ mfn = pfn_to_mfn(pfn);
-+ new_mfn = virt_to_mfn(new_addr);
-+
-+// write_seqlock(&gnttab_dma_lock); /* protects __gnttab_dma_map_page on 2.6.18 */
-+
-+ /* Make seq visible before checking page_mapped. */
-+ smp_mb();
++ int i;
+
-+ /* Has the page been DMA-mapped? */
-+ if (unlikely(page_mapped(page))) {
-+ //write_sequnlock(&gnttab_dma_lock);
-+ put_page(new_page);
-+ err = -EBUSY;
-+ goto out;
++ switch (stringset) {
++ case ETH_SS_STATS:
++ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
++ memcpy(data + i * ETH_GSTRING_LEN,
++ netbk_stats[i].name, ETH_GSTRING_LEN);
++ break;
+ }
++}
+
-+ if (!xen_feature(XENFEAT_auto_translated_physmap))
-+ set_phys_to_machine(pfn, new_mfn);
-+
-+ //gnttab_set_replace_op(&unmap, (unsigned long)addr,
-+ // (unsigned long)new_addr, ref);
-+ unmap.host_addr = (unsigned long)addr;
-+ unmap.new_addr = (unsigned long)new_addr;
-+ unmap.handle = ref;
-+
-+ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
-+ &unmap, 1);
-+ BUG_ON(err);
-+ BUG_ON(unmap.status);
++static struct ethtool_ops network_ethtool_ops =
++{
++ .get_drvinfo = netbk_get_drvinfo,
+
-+// write_sequnlock(&gnttab_dma_lock);
++ .get_tx_csum = ethtool_op_get_tx_csum,
++ .set_tx_csum = ethtool_op_set_tx_csum,
++ .get_sg = ethtool_op_get_sg,
++ .set_sg = netbk_set_sg,
++ .get_tso = ethtool_op_get_tso,
++ .set_tso = netbk_set_tso,
++ .get_link = ethtool_op_get_link,
+
-+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-+ set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
++ .get_sset_count = netbk_get_sset_count,
++ .get_ethtool_stats = netbk_get_ethtool_stats,
++ .get_strings = netbk_get_strings,
++};
+
-+ mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
-+ mmu.val = pfn;
-+ err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
-+ BUG_ON(err);
-+ }
++static struct net_device_ops netback_ops =
++{
++ .ndo_start_xmit = netif_be_start_xmit,
++ .ndo_get_stats = netif_be_get_stats,
++ .ndo_open = net_open,
++ .ndo_stop = net_close,
++ .ndo_change_mtu = netbk_change_mtu,
++};
+
-+ new_page->mapping = page->mapping;
-+ SetPageForeign(new_page, _PageForeignDestructor(page));
-+ if (PageReserved(page))
-+ SetPageReserved(new_page);
-+ *pagep = new_page;
++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
++{
++ int err = 0;
++ struct net_device *dev;
++ struct xen_netif *netif;
++ char name[IFNAMSIZ] = {};
+
-+ SetPageForeign(page, gnttab_page_free);
-+ ClearPageReserved(page);
-+ page->mapping = NULL;
++ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
++ dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
++ if (dev == NULL) {
++ DPRINTK("Could not create netif: out of memory\n");
++ return ERR_PTR(-ENOMEM);
++ }
+
-+out:
-+ put_page(page);
-+ return err;
-+}
-+EXPORT_SYMBOL_GPL(gnttab_copy_grant_page);
++ SET_NETDEV_DEV(dev, parent);
+
-+void gnttab_reset_grant_page(struct page *page)
-+{
-+ init_page_count(page);
-+ reset_page_mapcount(page);
-+}
-+EXPORT_SYMBOL_GPL(gnttab_reset_grant_page);
++ netif = netdev_priv(dev);
++ memset(netif, 0, sizeof(*netif));
++ netif->domid = domid;
++ netif->handle = handle;
++ netif->features = NETIF_F_SG;
++ atomic_set(&netif->refcnt, 1);
++ init_waitqueue_head(&netif->waiting_to_free);
++ netif->dev = dev;
++ INIT_LIST_HEAD(&netif->list);
+
- int gnttab_resume(void)
- {
- if (max_nr_grant_frames() < nr_grant_frames)
-diff --git a/drivers/xen/netback/Makefile b/drivers/xen/netback/Makefile
-new file mode 100644
-index 0000000..e346e81
---- /dev/null
-+++ b/drivers/xen/netback/Makefile
-@@ -0,0 +1,3 @@
-+obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
++ netback_carrier_off(netif);
+
-+xen-netback-y := netback.o xenbus.o interface.o
-diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
-new file mode 100644
-index 0000000..51f97c0
---- /dev/null
-+++ b/drivers/xen/netback/common.h
-@@ -0,0 +1,227 @@
-+/******************************************************************************
-+ * arch/xen/drivers/netif/backend/common.h
-+ *
-+ * This program is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU General Public License version 2
-+ * as published by the Free Software Foundation; or, when distributed
-+ * separately from the Linux kernel or incorporated into other
-+ * software packages, subject to the following license:
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-+ * of this source file (the "Software"), to deal in the Software without
-+ * restriction, including without limitation the rights to use, copy, modify,
-+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-+ * and to permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+ * IN THE SOFTWARE.
-+ */
++ netif->credit_bytes = netif->remaining_credit = ~0UL;
++ netif->credit_usec = 0UL;
++ init_timer(&netif->credit_timeout);
++ /* Initialize 'expires' now: it's used to track the credit window. */
++ netif->credit_timeout.expires = jiffies;
+
-+#ifndef __NETIF__BACKEND__COMMON_H__
-+#define __NETIF__BACKEND__COMMON_H__
++ init_timer(&netif->tx_queue_timeout);
+
-+#include <linux/version.h>
-+#include <linux/module.h>
-+#include <linux/interrupt.h>
-+#include <linux/slab.h>
-+#include <linux/ip.h>
-+#include <linux/in.h>
-+#include <linux/netdevice.h>
-+#include <linux/etherdevice.h>
-+#include <linux/wait.h>
-+#include <linux/sched.h>
++ dev->netdev_ops = &netback_ops;
++ dev->features = NETIF_F_IP_CSUM|NETIF_F_SG;
+
-+#include <xen/interface/io/netif.h>
-+#include <asm/io.h>
-+#include <asm/pgalloc.h>
-+#include <xen/interface/grant_table.h>
-+#include <xen/grant_table.h>
-+#include <xen/xenbus.h>
++ SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
+
-+#define DPRINTK(_f, _a...) \
-+ pr_debug("(file=%s, line=%d) " _f, \
-+ __FILE__ , __LINE__ , ## _a )
-+#define IPRINTK(fmt, args...) \
-+ printk(KERN_INFO "xen_net: " fmt, ##args)
-+#define WPRINTK(fmt, args...) \
-+ printk(KERN_WARNING "xen_net: " fmt, ##args)
++ dev->tx_queue_len = netbk_queue_length;
+
-+struct xen_netif {
-+ /* Unique identifier for this interface. */
-+ domid_t domid;
-+ unsigned int handle;
++ /*
++ * Initialise a dummy MAC address. We choose the numerically
++ * largest non-broadcast address to prevent the address getting
++ * stolen by an Ethernet bridge for STP purposes.
++ * (FE:FF:FF:FF:FF:FF)
++ */
++ memset(dev->dev_addr, 0xFF, ETH_ALEN);
++ dev->dev_addr[0] &= ~0x01;
+
-+ u8 fe_dev_addr[6];
++ rtnl_lock();
++ err = register_netdevice(dev);
++ rtnl_unlock();
++ if (err) {
++ DPRINTK("Could not register new net device %s: err=%d\n",
++ dev->name, err);
++ free_netdev(dev);
++ return ERR_PTR(err);
++ }
+
-+ /* Physical parameters of the comms window. */
-+ grant_handle_t tx_shmem_handle;
-+ grant_ref_t tx_shmem_ref;
-+ grant_handle_t rx_shmem_handle;
-+ grant_ref_t rx_shmem_ref;
-+ unsigned int irq;
++ DPRINTK("Successfully created netif\n");
++ return netif;
++}
+
-+ /* The shared rings and indexes. */
-+ struct xen_netif_tx_back_ring tx;
-+ struct xen_netif_rx_back_ring rx;
-+ struct vm_struct *tx_comms_area;
-+ struct vm_struct *rx_comms_area;
++static int map_frontend_pages(
++ struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
++{
++ struct gnttab_map_grant_ref op;
+
-+ /* Set of features that can be turned on in dev->features. */
-+ int features;
++ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
++ GNTMAP_host_map, tx_ring_ref, netif->domid);
+
-+ int smart_poll;
++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
++ BUG();
+
-+ /* Internal feature information. */
-+ u8 can_queue:1; /* can queue packets for receiver? */
++ if (op.status) {
++ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
++ return op.status;
++ }
+
-+ /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
-+ RING_IDX rx_req_cons_peek;
++ netif->tx_shmem_ref = tx_ring_ref;
++ netif->tx_shmem_handle = op.handle;
+
-+ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
-+ unsigned long credit_bytes;
-+ unsigned long credit_usec;
-+ unsigned long remaining_credit;
-+ struct timer_list credit_timeout;
++ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
++ GNTMAP_host_map, rx_ring_ref, netif->domid);
+
-+ /* Enforce draining of the transmit queue. */
-+ struct timer_list tx_queue_timeout;
++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
++ BUG();
+
-+ /* Statistics */
-+ int nr_copied_skbs;
++ if (op.status) {
++ struct gnttab_unmap_grant_ref unop;
+
-+ /* Miscellaneous private stuff. */
-+ struct list_head list; /* scheduling list */
-+ atomic_t refcnt;
-+ struct net_device *dev;
-+ struct net_device_stats stats;
++ gnttab_set_unmap_op(&unop,
++ (unsigned long)netif->tx_comms_area->addr,
++ GNTMAP_host_map, netif->tx_shmem_handle);
++ HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1);
++ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
++ return op.status;
++ }
+
-+ unsigned int carrier;
++ netif->rx_shmem_ref = rx_ring_ref;
++ netif->rx_shmem_handle = op.handle;
+
-+ wait_queue_head_t waiting_to_free;
-+};
++ return 0;
++}
+
-+/*
-+ * Implement our own carrier flag: the network stack's version causes delays
-+ * when the carrier is re-enabled (in particular, dev_activate() may not
-+ * immediately be called, which can cause packet loss; also the etherbridge
-+ * can be rather lazy in activating its port).
-+ */
-+#define netback_carrier_on(netif) ((netif)->carrier = 1)
-+#define netback_carrier_off(netif) ((netif)->carrier = 0)
-+#define netback_carrier_ok(netif) ((netif)->carrier)
++static void unmap_frontend_pages(struct xen_netif *netif)
++{
++ struct gnttab_unmap_grant_ref op;
+
-+enum {
-+ NETBK_DONT_COPY_SKB,
-+ NETBK_DELAYED_COPY_SKB,
-+ NETBK_ALWAYS_COPY_SKB,
-+};
++ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
++ GNTMAP_host_map, netif->tx_shmem_handle);
+
-+extern int netbk_copy_skb_mode;
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
++ BUG();
+
-+/* Function pointers into netback accelerator plugin modules */
-+struct netback_accel_hooks {
-+ struct module *owner;
-+ int (*probe)(struct xenbus_device *dev);
-+ int (*remove)(struct xenbus_device *dev);
-+};
++ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
++ GNTMAP_host_map, netif->rx_shmem_handle);
+
-+/* Structure to track the state of a netback accelerator plugin */
-+struct netback_accelerator {
-+ struct list_head link;
-+ int id;
-+ char *eth_name;
-+ atomic_t use_count;
-+ struct netback_accel_hooks *hooks;
-+};
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
++ BUG();
++}
+
-+struct backend_info {
-+ struct xenbus_device *dev;
-+ struct xen_netif *netif;
-+ enum xenbus_state frontend_state;
-+ struct xenbus_watch hotplug_status_watch;
-+ int have_hotplug_status_watch:1;
++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
++ unsigned long rx_ring_ref, unsigned int evtchn)
++{
++ int err = -ENOMEM;
++ struct xen_netif_tx_sring *txs;
++ struct xen_netif_rx_sring *rxs;
+
-+ /* State relating to the netback accelerator */
-+ void *netback_accel_priv;
-+ /* The accelerator that this backend is currently using */
-+ struct netback_accelerator *accelerator;
-+};
++ /* Already connected through? */
++ if (netif->irq)
++ return 0;
+
-+#define NETBACK_ACCEL_VERSION 0x00010001
++ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
++ if (netif->tx_comms_area == NULL)
++ return -ENOMEM;
++ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
++ if (netif->rx_comms_area == NULL)
++ goto err_rx;
+
-+/*
-+ * Connect an accelerator plugin module to netback. Returns zero on
-+ * success, < 0 on error, > 0 (with highest version number supported)
-+ * if version mismatch.
-+ */
-+extern int netback_connect_accelerator(unsigned version,
-+ int id, const char *eth_name,
-+ struct netback_accel_hooks *hooks);
-+/* Disconnect a previously connected accelerator plugin module */
-+extern void netback_disconnect_accelerator(int id, const char *eth_name);
++ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
++ if (err)
++ goto err_map;
+
++ err = bind_interdomain_evtchn_to_irqhandler(
++ netif->domid, evtchn, netif_be_int, 0,
++ netif->dev->name, netif);
++ if (err < 0)
++ goto err_hypervisor;
++ netif->irq = err;
++ disable_irq(netif->irq);
+
-+extern
-+void netback_probe_accelerators(struct backend_info *be,
-+ struct xenbus_device *dev);
-+extern
-+void netback_remove_accelerators(struct backend_info *be,
-+ struct xenbus_device *dev);
-+extern
-+void netif_accel_init(void);
++ txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
++ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
+
++ rxs = (struct xen_netif_rx_sring *)
++ ((char *)netif->rx_comms_area->addr);
++ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
+
-+#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
-+#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
++ netif->rx_req_cons_peek = 0;
+
-+void netif_disconnect(struct xen_netif *netif);
++ netif_get(netif);
+
-+struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle);
-+int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
-+ unsigned long rx_ring_ref, unsigned int evtchn);
++ rtnl_lock();
++ netback_carrier_on(netif);
++ if (netif_running(netif->dev))
++ __netif_up(netif);
++ rtnl_unlock();
+
-+static inline void netif_get(struct xen_netif *netif)
-+{
-+ atomic_inc(&netif->refcnt);
++ return 0;
++err_hypervisor:
++ unmap_frontend_pages(netif);
++err_map:
++ free_vm_area(netif->rx_comms_area);
++err_rx:
++ free_vm_area(netif->tx_comms_area);
++ return err;
+}
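
The ring references and event channel passed to netif_map() are published by the frontend in xenstore. A hedged sketch of how the xenbus glue (outside this hunk) might read them, assuming the standard netif protocol keys:

    unsigned long tx_ring_ref, rx_ring_ref;
    unsigned int evtchn;
    int err;

    err = xenbus_gather(XBT_NIL, dev->otherend,
                        "tx-ring-ref", "%lu", &tx_ring_ref,
                        "rx-ring-ref", "%lu", &rx_ring_ref,
                        "event-channel", "%u", &evtchn, NULL);
    if (err)
            return err;

    err = netif_map(netif, tx_ring_ref, rx_ring_ref, evtchn);
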
+
-+static inline void netif_put(struct xen_netif *netif)
++void netif_disconnect(struct xen_netif *netif)
+{
-+ if (atomic_dec_and_test(&netif->refcnt))
-+ wake_up(&netif->waiting_to_free);
-+}
++ if (netback_carrier_ok(netif)) {
++ rtnl_lock();
++ netback_carrier_off(netif);
++ netif_carrier_off(netif->dev); /* discard queued packets */
++ if (netif_running(netif->dev))
++ __netif_down(netif);
++ rtnl_unlock();
++ netif_put(netif);
++ }
+
-+int netif_xenbus_init(void);
++ atomic_dec(&netif->refcnt);
++ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
+
-+#define netif_schedulable(netif) \
-+ (netif_running((netif)->dev) && netback_carrier_ok(netif))
++ del_timer_sync(&netif->credit_timeout);
++ del_timer_sync(&netif->tx_queue_timeout);
+
-+void netif_schedule_work(struct xen_netif *netif);
-+void netif_deschedule_work(struct xen_netif *netif);
++ if (netif->irq)
++ unbind_from_irqhandler(netif->irq, netif);
+
-+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
-+struct net_device_stats *netif_be_get_stats(struct net_device *dev);
-+irqreturn_t netif_be_int(int irq, void *dev_id);
++ unregister_netdev(netif->dev);
+
-+static inline int netbk_can_queue(struct net_device *dev)
-+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+ return netif->can_queue;
-+}
++ if (netif->tx.sring) {
++ unmap_frontend_pages(netif);
++ free_vm_area(netif->tx_comms_area);
++ free_vm_area(netif->rx_comms_area);
++ }
+
-+static inline int netbk_can_sg(struct net_device *dev)
-+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+ return netif->features & NETIF_F_SG;
++ free_netdev(netif->dev);
+}
-+
-+#endif /* __NETIF__BACKEND__COMMON_H__ */
-diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
+diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
new file mode 100644
-index 0000000..086d939
+index 0000000..0bc6398
--- /dev/null
-+++ b/drivers/xen/netback/interface.c
-@@ -0,0 +1,410 @@
++++ b/drivers/xen/netback/netback.c
+@@ -0,0 +1,1613 @@
+/******************************************************************************
-+ * arch/xen/drivers/netif/backend/interface.c
++ * drivers/xen/netback/netback.c
+ *
-+ * Network-device interface management.
++ * Back-end of the driver for virtual network devices. This portion of the
++ * driver exports a 'unified' network-device interface that can be accessed
++ * by any operating system that implements a compatible front end. A
++ * reference front-end implementation can be found in:
++ * drivers/xen/netfront/netfront.c
+ *
-+ * Copyright (c) 2004-2005, Keir Fraser
++ * Copyright (c) 2002-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
@@ -14857,2697 +16839,2846 @@
+ */
+
+#include "common.h"
-+#include <linux/ethtool.h>
-+#include <linux/rtnetlink.h>
-+
-+#include <xen/events.h>
-+#include <asm/xen/hypercall.h>
-+
-+/*
-+ * Module parameter 'queue_length':
-+ *
-+ * Enables queuing in the network stack when a client has run out of receive
-+ * descriptors. Although this feature can improve receive bandwidth by avoiding
-+ * packet loss, it can also result in packets sitting in the 'tx_queue' for
-+ * unbounded time. This is bad if those packets hold onto foreign resources.
-+ * For example, consider a packet that holds onto resources belonging to the
-+ * guest for which it is queued (e.g., packet received on vif1.0, destined for
-+ * vif1.1 which is not activated in the guest): in this situation the guest
-+ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
-+ * run a timer (tx_queue_timeout) to drain the queue when the interface is
-+ * blocked.
-+ */
-+static unsigned long netbk_queue_length = 32;
-+module_param_named(queue_length, netbk_queue_length, ulong, 0644);
-+
-+static void __netif_up(struct xen_netif *netif)
-+{
-+ enable_irq(netif->irq);
-+ netif_schedule_work(netif);
-+}
-+
-+static void __netif_down(struct xen_netif *netif)
-+{
-+ disable_irq(netif->irq);
-+ netif_deschedule_work(netif);
-+}
+
-+static int net_open(struct net_device *dev)
-+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+ if (netback_carrier_ok(netif)) {
-+ __netif_up(netif);
-+ netif_start_queue(dev);
-+ }
-+ return 0;
-+}
++#include <linux/tcp.h>
++#include <linux/udp.h>
+
-+static int net_close(struct net_device *dev)
-+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+ if (netback_carrier_ok(netif))
-+ __netif_down(netif);
-+ netif_stop_queue(dev);
-+ return 0;
-+}
++#include <xen/balloon.h>
++#include <xen/events.h>
++#include <xen/interface/memory.h>
+
-+static int netbk_change_mtu(struct net_device *dev, int mtu)
-+{
-+ int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
++#include <asm/xen/hypercall.h>
++#include <asm/xen/page.h>
+
-+ if (mtu > max)
-+ return -EINVAL;
-+ dev->mtu = mtu;
-+ return 0;
-+}
++/*define NETBE_DEBUG_INTERRUPT*/
+
-+static int netbk_set_sg(struct net_device *dev, u32 data)
-+{
-+ if (data) {
-+ struct xen_netif *netif = netdev_priv(dev);
++struct netbk_rx_meta {
++ skb_frag_t frag;
++ int id;
++};
+
-+ if (!(netif->features & NETIF_F_SG))
-+ return -ENOSYS;
-+ }
++struct netbk_tx_pending_inuse {
++ struct list_head list;
++ unsigned long alloc_time;
++};
+
-+ if (dev->mtu > ETH_DATA_LEN)
-+ dev->mtu = ETH_DATA_LEN;
+
-+ return ethtool_op_set_sg(dev, data);
-+}
++static void netif_idx_release(u16 pending_idx);
++static void make_tx_response(struct xen_netif *netif,
++ struct xen_netif_tx_request *txp,
++ s8 st);
++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
++ u16 id,
++ s8 st,
++ u16 offset,
++ u16 size,
++ u16 flags);
+
-+static int netbk_set_tso(struct net_device *dev, u32 data)
-+{
-+ if (data) {
-+ struct xen_netif *netif = netdev_priv(dev);
++static void net_tx_action(unsigned long unused);
++static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
+
-+ if (!(netif->features & NETIF_F_TSO))
-+ return -ENOSYS;
-+ }
++static void net_rx_action(unsigned long unused);
++static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
+
-+ return ethtool_op_set_tso(dev, data);
-+}
++static struct timer_list net_timer;
++static struct timer_list netbk_tx_pending_timer;
+
-+static void netbk_get_drvinfo(struct net_device *dev,
-+ struct ethtool_drvinfo *info)
-+{
-+ strcpy(info->driver, "netbk");
-+ strcpy(info->bus_info, dev_name(dev->dev.parent));
-+}
++#define MAX_PENDING_REQS 256
+
-+static const struct netif_stat {
-+ char name[ETH_GSTRING_LEN];
-+ u16 offset;
-+} netbk_stats[] = {
-+ { "copied_skbs", offsetof(struct xen_netif, nr_copied_skbs) },
-+};
++static struct sk_buff_head rx_queue;
+
-+static int netbk_get_sset_count(struct net_device *dev, int string_set)
++static struct page **mmap_pages;
++static inline unsigned long idx_to_pfn(unsigned int idx)
+{
-+ switch (string_set) {
-+ case ETH_SS_STATS:
-+ return ARRAY_SIZE(netbk_stats);
-+ default:
-+ return -EINVAL;
-+ }
++ return page_to_pfn(mmap_pages[idx]);
+}
+
-+static void netbk_get_ethtool_stats(struct net_device *dev,
-+ struct ethtool_stats *stats, u64 * data)
++static inline unsigned long idx_to_kaddr(unsigned int idx)
+{
-+ void *netif = netdev_priv(dev);
-+ int i;
-+
-+ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
-+ data[i] = *(int *)(netif + netbk_stats[i].offset);
++ return (unsigned long)pfn_to_kaddr(idx_to_pfn(idx));
+}
+
-+static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data)
++/* extra field used in struct page */
++static inline void netif_set_page_index(struct page *pg, unsigned int index)
+{
-+ int i;
-+
-+ switch (stringset) {
-+ case ETH_SS_STATS:
-+ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
-+ memcpy(data + i * ETH_GSTRING_LEN,
-+ netbk_stats[i].name, ETH_GSTRING_LEN);
-+ break;
-+ }
++ *(unsigned long *)&pg->mapping = index + 1;
+}
+
-+static struct ethtool_ops network_ethtool_ops =
++static inline int netif_page_index(struct page *pg)
+{
-+ .get_drvinfo = netbk_get_drvinfo,
++ unsigned long idx = (unsigned long)pg->mapping - 1;
+
-+ .get_tx_csum = ethtool_op_get_tx_csum,
-+ .set_tx_csum = ethtool_op_set_tx_csum,
-+ .get_sg = ethtool_op_get_sg,
-+ .set_sg = netbk_set_sg,
-+ .get_tso = ethtool_op_get_tso,
-+ .set_tso = netbk_set_tso,
-+ .get_link = ethtool_op_get_link,
++ if (!PageForeign(pg))
++ return -1;
+
-+ .get_sset_count = netbk_get_sset_count,
-+ .get_ethtool_stats = netbk_get_ethtool_stats,
-+ .get_strings = netbk_get_strings,
-+};
++ if ((idx >= MAX_PENDING_REQS) || (mmap_pages[idx] != pg))
++ return -1;
+
-+static struct net_device_ops netback_ops =
-+{
-+ .ndo_start_xmit = netif_be_start_xmit,
-+ .ndo_get_stats = netif_be_get_stats,
-+ .ndo_open = net_open,
-+ .ndo_stop = net_close,
-+ .ndo_change_mtu = netbk_change_mtu,
-+};
++ return idx;
++}
+
-+struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
-+{
-+ int err = 0;
-+ struct net_device *dev;
++/*
++ * This is the amount of packet we copy rather than map, so that the
++ * guest can't fiddle with the contents of the headers while we do
++ * packet processing on them (netfilter, routing, etc). 72 is enough
++ * to cover TCP+IP headers including options.
++ */
++#define PKT_PROT_LEN 72
++
++static struct pending_tx_info {
++ struct xen_netif_tx_request req;
+ struct xen_netif *netif;
-+ char name[IFNAMSIZ] = {};
++} pending_tx_info[MAX_PENDING_REQS];
++static u16 pending_ring[MAX_PENDING_REQS];
++typedef unsigned int pending_ring_idx_t;
+
-+ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
-+ dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
-+ if (dev == NULL) {
-+ DPRINTK("Could not create netif: out of memory\n");
-+ return ERR_PTR(-ENOMEM);
-+ }
++static inline pending_ring_idx_t pending_index(unsigned i)
++{
++ return i & (MAX_PENDING_REQS-1);
++}
+
-+ SET_NETDEV_DEV(dev, parent);
++static pending_ring_idx_t pending_prod, pending_cons;
+
-+ netif = netdev_priv(dev);
-+ memset(netif, 0, sizeof(*netif));
-+ netif->domid = domid;
-+ netif->handle = handle;
-+ netif->features = NETIF_F_SG;
-+ atomic_set(&netif->refcnt, 1);
-+ init_waitqueue_head(&netif->waiting_to_free);
-+ netif->dev = dev;
-+ INIT_LIST_HEAD(&netif->list);
++static inline pending_ring_idx_t nr_pending_reqs(void)
++{
++ return MAX_PENDING_REQS - pending_prod + pending_cons;
++}
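
Both helpers rely on MAX_PENDING_REQS being a power of two: pending_index() masks free-running unsigned indices instead of taking a modulo, and the subtraction in nr_pending_reqs() is wrap-safe. A small worked sketch (the helper name is hypothetical):

    /* hypothetical helper making the free/in-flight split explicit */
    static inline pending_ring_idx_t nr_free_pending_slots(void)
    {
            return pending_prod - pending_cons;     /* wrap-safe */
    }

    /*
     * Example: the ring starts fully stocked (prod == cons + 256).
     * After 254 requests are taken and none released:
     *   free      = pending_prod - pending_cons = 2
     *   in-flight = MAX_PENDING_REQS - free     = 254 (= nr_pending_reqs())
     * and pending_index(i) == i & 255 stays correct even when the
     * raw indices overflow the unsigned range.
     */
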
+
-+ netback_carrier_off(netif);
++/* Freed TX SKBs get batched on this ring before return to pending_ring. */
++static u16 dealloc_ring[MAX_PENDING_REQS];
++static pending_ring_idx_t dealloc_prod, dealloc_cons;
+
-+ netif->credit_bytes = netif->remaining_credit = ~0UL;
-+ netif->credit_usec = 0UL;
-+ init_timer(&netif->credit_timeout);
-+ /* Initialize 'expires' now: it's used to track the credit window. */
-+ netif->credit_timeout.expires = jiffies;
++/* Doubly-linked list of in-use pending entries. */
++static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
++static LIST_HEAD(pending_inuse_head);
+
-+ init_timer(&netif->tx_queue_timeout);
++static struct sk_buff_head tx_queue;
+
-+ dev->netdev_ops = &netback_ops;
-+ dev->features = NETIF_F_IP_CSUM|NETIF_F_SG;
++static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
++static struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
++static struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+
-+ SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
++static LIST_HEAD(net_schedule_list);
++static DEFINE_SPINLOCK(net_schedule_list_lock);
+
-+ dev->tx_queue_len = netbk_queue_length;
++#define MAX_MFN_ALLOC 64
++static unsigned long mfn_list[MAX_MFN_ALLOC];
++static unsigned int alloc_index = 0;
+
-+ /*
-+ * Initialise a dummy MAC address. We choose the numerically
-+ * largest non-broadcast address to prevent the address getting
-+ * stolen by an Ethernet bridge for STP purposes.
-+ * (FE:FF:FF:FF:FF:FF)
-+ */
-+ memset(dev->dev_addr, 0xFF, ETH_ALEN);
-+ dev->dev_addr[0] &= ~0x01;
++/* Setting this allows the safe use of this driver without netloop. */
++static int MODPARM_copy_skb = 1;
++module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
++MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
+
-+ rtnl_lock();
-+ err = register_netdevice(dev);
-+ rtnl_unlock();
-+ if (err) {
-+ DPRINTK("Could not register new net device %s: err=%d\n",
-+ dev->name, err);
-+ free_netdev(dev);
-+ return ERR_PTR(err);
-+ }
++int netbk_copy_skb_mode;
+
-+ DPRINTK("Successfully created netif\n");
-+ return netif;
++static inline unsigned long alloc_mfn(void)
++{
++ BUG_ON(alloc_index == 0);
++ return mfn_list[--alloc_index];
+}
+
-+static int map_frontend_pages(
-+ struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
++static inline void maybe_schedule_tx_action(void)
+{
-+ struct gnttab_map_grant_ref op;
-+
-+ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
-+ GNTMAP_host_map, tx_ring_ref, netif->domid);
-+
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
-+ BUG();
-+
-+ if (op.status) {
-+ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
-+ return op.status;
-+ }
-+
-+ netif->tx_shmem_ref = tx_ring_ref;
-+ netif->tx_shmem_handle = op.handle;
-+
-+ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
-+ GNTMAP_host_map, rx_ring_ref, netif->domid);
++ smp_mb();
++ if ((nr_pending_reqs() < (MAX_PENDING_REQS/2)) &&
++ !list_empty(&net_schedule_list))
++ tasklet_schedule(&net_tx_tasklet);
++}
+
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
-+ BUG();
++static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
++{
++ struct skb_shared_info *ninfo;
++ struct sk_buff *nskb;
++ unsigned long offset;
++ int ret;
++ int len;
++ int headlen;
+
-+ if (op.status) {
-+ struct gnttab_unmap_grant_ref unop;
++ BUG_ON(skb_shinfo(skb)->frag_list != NULL);
+
-+ gnttab_set_unmap_op(&unop,
-+ (unsigned long)netif->tx_comms_area->addr,
-+ GNTMAP_host_map, netif->tx_shmem_handle);
-+ HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1);
-+ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
-+ return op.status;
-+ }
++ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
++ if (unlikely(!nskb))
++ goto err;
+
-+ netif->rx_shmem_ref = rx_ring_ref;
-+ netif->rx_shmem_handle = op.handle;
++ skb_reserve(nskb, NET_SKB_PAD + NET_IP_ALIGN);
++ headlen = skb_end_pointer(nskb) - nskb->data;
++ if (headlen > skb_headlen(skb))
++ headlen = skb_headlen(skb);
++ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
++ BUG_ON(ret);
+
-+ return 0;
-+}
++ ninfo = skb_shinfo(nskb);
++ ninfo->gso_size = skb_shinfo(skb)->gso_size;
++ ninfo->gso_type = skb_shinfo(skb)->gso_type;
+
-+static void unmap_frontend_pages(struct xen_netif *netif)
-+{
-+ struct gnttab_unmap_grant_ref op;
++ offset = headlen;
++ len = skb->len - headlen;
+
-+ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
-+ GNTMAP_host_map, netif->tx_shmem_handle);
++ nskb->len = skb->len;
++ nskb->data_len = len;
++ nskb->truesize += len;
+
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
-+ BUG();
++ while (len) {
++ struct page *page;
++ int copy;
++ int zero;
+
-+ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
-+ GNTMAP_host_map, netif->rx_shmem_handle);
++ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
++ dump_stack();
++ goto err_free;
++ }
+
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
-+ BUG();
-+}
++ copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
++ zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
+
-+int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
-+ unsigned long rx_ring_ref, unsigned int evtchn)
-+{
-+ int err = -ENOMEM;
-+ struct xen_netif_tx_sring *txs;
-+ struct xen_netif_rx_sring *rxs;
++ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
++ if (unlikely(!page))
++ goto err_free;
+
-+ /* Already connected through? */
-+ if (netif->irq)
-+ return 0;
++ ret = skb_copy_bits(skb, offset, page_address(page), copy);
++ BUG_ON(ret);
+
-+ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
-+ if (netif->tx_comms_area == NULL)
-+ return -ENOMEM;
-+ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
-+ if (netif->rx_comms_area == NULL)
-+ goto err_rx;
++ ninfo->frags[ninfo->nr_frags].page = page;
++ ninfo->frags[ninfo->nr_frags].page_offset = 0;
++ ninfo->frags[ninfo->nr_frags].size = copy;
++ ninfo->nr_frags++;
+
-+ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
-+ if (err)
-+ goto err_map;
++ offset += copy;
++ len -= copy;
++ }
+
-+ err = bind_interdomain_evtchn_to_irqhandler(
-+ netif->domid, evtchn, netif_be_int, 0,
-+ netif->dev->name, netif);
-+ if (err < 0)
-+ goto err_hypervisor;
-+ netif->irq = err;
-+ disable_irq(netif->irq);
++ offset = nskb->data - skb->data;
+
-+ txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
-+ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
++ nskb->transport_header = skb->transport_header + offset;
++ nskb->network_header = skb->network_header + offset;
++ nskb->mac_header = skb->mac_header + offset;
+
-+ rxs = (struct xen_netif_rx_sring *)
-+ ((char *)netif->rx_comms_area->addr);
-+ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
++ return nskb;
+
-+ netif->rx_req_cons_peek = 0;
++ err_free:
++ kfree_skb(nskb);
++ err:
++ return NULL;
++}
+
-+ netif_get(netif);
++static inline int netbk_max_required_rx_slots(struct xen_netif *netif)
++{
++ if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
++ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
++ return 1; /* all in one */
++}
+
-+ rtnl_lock();
-+ netback_carrier_on(netif);
-+ if (netif_running(netif->dev))
-+ __netif_up(netif);
-+ rtnl_unlock();
++static inline int netbk_queue_full(struct xen_netif *netif)
++{
++ RING_IDX peek = netif->rx_req_cons_peek;
++ RING_IDX needed = netbk_max_required_rx_slots(netif);
+
-+ return 0;
-+err_hypervisor:
-+ unmap_frontend_pages(netif);
-+err_map:
-+ free_vm_area(netif->rx_comms_area);
-+err_rx:
-+ free_vm_area(netif->tx_comms_area);
-+ return err;
++ return ((netif->rx.sring->req_prod - peek) < needed) ||
++ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
+}
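
A worked example for the two helpers above, assuming 4 KiB pages (so MAX_SKB_FRAGS is 18 and an SG/TSO interface reserves 18 + 2 = 20 slots per packet). The estimate is deliberately worst-case, so the queue can stall even when a small packet would still fit:

    /* illustrative values only */
    RING_IDX req_prod = netif->rx.sring->req_prod;   /* e.g. 1030 */
    RING_IDX peek     = netif->rx_req_cons_peek;     /* e.g. 1024 */
    int needed = netbk_max_required_rx_slots(netif); /* 20 with SG|TSO */

    /*
     * req_prod - peek = 6 < 20, so netbk_queue_full() reports full
     * even though a linear skb needing one slot would fit; backing
     * out of a partially consumed packet would be far more costly.
     */
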
+
-+void netif_disconnect(struct xen_netif *netif)
++static void tx_queue_callback(unsigned long data)
+{
-+ if (netback_carrier_ok(netif)) {
-+ rtnl_lock();
-+ netback_carrier_off(netif);
-+ netif_carrier_off(netif->dev); /* discard queued packets */
-+ if (netif_running(netif->dev))
-+ __netif_down(netif);
-+ rtnl_unlock();
-+ netif_put(netif);
-+ }
++ struct xen_netif *netif = (struct xen_netif *)data;
++ if (netif_schedulable(netif))
++ netif_wake_queue(netif->dev);
++}
+
-+ atomic_dec(&netif->refcnt);
-+ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
+
-+ del_timer_sync(&netif->credit_timeout);
-+ del_timer_sync(&netif->tx_queue_timeout);
++ BUG_ON(skb->dev != dev);
+
-+ if (netif->irq)
-+ unbind_from_irqhandler(netif->irq, netif);
++ /* Drop the packet if the target domain has no receive buffers. */
++ if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
++ goto drop;
+
-+ unregister_netdev(netif->dev);
++ /*
++ * XXX For now we also copy skbuffs whose head crosses a page
++ * boundary, because netbk_gop_skb can't handle them.
++ */
++ if ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE) {
++ struct sk_buff *nskb = netbk_copy_skb(skb);
++ if (unlikely(nskb == NULL))
++ goto drop;
++ /* Copy only the header fields we use in this driver. */
++ nskb->dev = skb->dev;
++ nskb->ip_summed = skb->ip_summed;
++ dev_kfree_skb(skb);
++ skb = nskb;
++ }
+
-+ if (netif->tx.sring) {
-+ unmap_frontend_pages(netif);
-+ free_vm_area(netif->tx_comms_area);
-+ free_vm_area(netif->rx_comms_area);
++ netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
++ !!skb_shinfo(skb)->gso_size;
++ netif_get(netif);
++
++ if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
++ netif->rx.sring->req_event = netif->rx_req_cons_peek +
++ netbk_max_required_rx_slots(netif);
++ mb(); /* request notification /then/ check & stop the queue */
++ if (netbk_queue_full(netif)) {
++ netif_stop_queue(dev);
++ /*
++ * Schedule 500ms timeout to restart the queue, thus
++ * ensuring that an inactive queue will be drained.
++ * Packets will immediately be dropped until more
++ * receive buffers become available (see
++ * netbk_queue_full() check above).
++ */
++ netif->tx_queue_timeout.data = (unsigned long)netif;
++ netif->tx_queue_timeout.function = tx_queue_callback;
++ mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
++ }
+ }
+
-+ free_netdev(netif->dev);
++ skb_queue_tail(&rx_queue, skb);
++ tasklet_schedule(&net_rx_tasklet);
++
++ return 0;
++
++ drop:
++ netif->stats.tx_dropped++;
++ dev_kfree_skb(skb);
++ return 0;
+}
-diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
-new file mode 100644
-index 0000000..0bc6398
---- /dev/null
-+++ b/drivers/xen/netback/netback.c
-@@ -0,0 +1,1613 @@
-+/******************************************************************************
-+ * drivers/xen/netback/netback.c
-+ *
-+ * Back-end of the driver for virtual network devices. This portion of the
-+ * driver exports a 'unified' network-device interface that can be accessed
-+ * by any operating system that implements a compatible front end. A
-+ * reference front-end implementation can be found in:
-+ * drivers/xen/netfront/netfront.c
-+ *
-+ * Copyright (c) 2002-2005, K A Fraser
-+ *
-+ * This program is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU General Public License version 2
-+ * as published by the Free Software Foundation; or, when distributed
-+ * separately from the Linux kernel or incorporated into other
-+ * software packages, subject to the following license:
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-+ * of this source file (the "Software"), to deal in the Software without
-+ * restriction, including without limitation the rights to use, copy, modify,
-+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-+ * and to permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+ * IN THE SOFTWARE.
-+ */
+
-+#include "common.h"
++struct netrx_pending_operations {
++ unsigned trans_prod, trans_cons;
++ unsigned mmu_prod, mmu_mcl;
++ unsigned mcl_prod, mcl_cons;
++ unsigned copy_prod, copy_cons;
++ unsigned meta_prod, meta_cons;
++ struct mmu_update *mmu;
++ struct gnttab_transfer *trans;
++ struct gnttab_copy *copy;
++ struct multicall_entry *mcl;
++ struct netbk_rx_meta *meta;
++};
+
-+#include <linux/tcp.h>
-+#include <linux/udp.h>
++/* Set up the grant copy operation for this fragment: data is always
++   copied (never flipped) into the guest's posted receive buffer. */
++static u16 netbk_gop_frag(struct xen_netif *netif, struct netbk_rx_meta *meta,
++ int i, struct netrx_pending_operations *npo,
++ struct page *page, unsigned long size,
++ unsigned long offset)
++{
++ struct gnttab_copy *copy_gop;
++ struct xen_netif_rx_request *req;
++ unsigned long old_mfn;
++ int idx = netif_page_index(page);
+
-+#include <xen/balloon.h>
-+#include <xen/events.h>
-+#include <xen/interface/memory.h>
++ old_mfn = virt_to_mfn(page_address(page));
+
-+#include <asm/xen/hypercall.h>
-+#include <asm/xen/page.h>
++ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
++
++ copy_gop = npo->copy + npo->copy_prod++;
++ copy_gop->flags = GNTCOPY_dest_gref;
++ if (idx > -1) {
++ struct pending_tx_info *src_pend = &pending_tx_info[idx];
++ copy_gop->source.domid = src_pend->netif->domid;
++ copy_gop->source.u.ref = src_pend->req.gref;
++ copy_gop->flags |= GNTCOPY_source_gref;
++ } else {
++ copy_gop->source.domid = DOMID_SELF;
++ copy_gop->source.u.gmfn = old_mfn;
++ }
++ copy_gop->source.offset = offset;
++ copy_gop->dest.domid = netif->domid;
++ copy_gop->dest.offset = 0;
++ copy_gop->dest.u.ref = req->gref;
++ copy_gop->len = size;
+
-+/*define NETBE_DEBUG_INTERRUPT*/
++ return req->id;
++}
+
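+
For readers tracking the restructured netbk_gop_frag above: all it really does is fill in one grant-copy descriptor per fragment, choosing a grant reference as the source when the page came from another guest's transmit grant and a local machine frame otherwise. Below is a minimal userspace sketch of that fill logic; the struct layout and the GNTCOPY_*/DOMID_SELF values are illustrative stand-ins, not the real ABI headers.

#include <stdint.h>
#include <stdio.h>

/* Stand-in constants and types modelled on xen/interface/grant_table.h;
 * the values here are illustrative assumptions, not the real ABI. */
#define GNTCOPY_dest_gref   (1 << 1)
#define GNTCOPY_source_gref (1 << 0)
#define DOMID_SELF          0x7ff0

struct copy_model {
    struct { uint16_t domid; uint16_t offset;
             union { uint32_t gmfn; uint32_t ref; } u; } source, dest;
    uint16_t len;
    uint16_t flags;
};

/* Fill one copy descriptor the way netbk_gop_frag does: if the page is
 * a foreign (guest-granted) tx page we copy grant-to-grant, otherwise
 * we copy from one of our own machine frames into the rx grant. */
static void fill_copy(struct copy_model *op, int foreign_idx,
                      uint32_t src_gref_or_mfn, uint16_t src_domid,
                      uint32_t dst_gref, uint16_t dst_domid,
                      uint16_t offset, uint16_t len)
{
    op->flags = GNTCOPY_dest_gref;
    if (foreign_idx >= 0) {              /* source is a granted tx page */
        op->source.domid = src_domid;
        op->source.u.ref = src_gref_or_mfn;
        op->flags |= GNTCOPY_source_gref;
    } else {                             /* source is one of our frames */
        op->source.domid = DOMID_SELF;
        op->source.u.gmfn = src_gref_or_mfn;
    }
    op->source.offset = offset;
    op->dest.domid = dst_domid;
    op->dest.offset = 0;
    op->dest.u.ref = dst_gref;
    op->len = len;
}

int main(void)
{
    struct copy_model op;
    fill_copy(&op, -1, 0x1234, 0, 42, 7, 64, 1000);
    printf("flags=%#x len=%u dest.ref=%u\n",
           (unsigned)op.flags, (unsigned)op.len, (unsigned)op.dest.u.ref);
    return 0;
}
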
-+struct netbk_rx_meta {
-+ skb_frag_t frag;
-+ int id;
-+};
++static void netbk_gop_skb(struct sk_buff *skb,
++ struct netrx_pending_operations *npo)
++{
++ struct xen_netif *netif = netdev_priv(skb->dev);
++ int nr_frags = skb_shinfo(skb)->nr_frags;
++ int i;
++ int extra;
++ struct netbk_rx_meta *head_meta, *meta;
+
-+struct netbk_tx_pending_inuse {
-+ struct list_head list;
-+ unsigned long alloc_time;
-+};
++ head_meta = npo->meta + npo->meta_prod++;
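++ /* Stash GSO type/size in the head meta's otherwise unused frag fields. */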
++ head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
++ head_meta->frag.size = skb_shinfo(skb)->gso_size;
++ extra = !!head_meta->frag.size + 1;
+
++ for (i = 0; i < nr_frags; i++) {
++ meta = npo->meta + npo->meta_prod++;
++ meta->frag = skb_shinfo(skb)->frags[i];
++ meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
++ meta->frag.page,
++ meta->frag.size,
++ meta->frag.page_offset);
++ }
+
-+static void netif_idx_release(u16 pending_idx);
-+static void make_tx_response(struct xen_netif *netif,
-+ struct xen_netif_tx_request *txp,
-+ s8 st);
-+static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
-+ u16 id,
-+ s8 st,
-+ u16 offset,
-+ u16 size,
-+ u16 flags);
++ /*
++ * This must occur at the end to ensure that we don't trash skb_shinfo
++ * until we're done. We know that the head doesn't cross a page
++ * boundary because such packets get copied in netif_be_start_xmit.
++ */
++ head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
++ virt_to_page(skb->data),
++ skb_headlen(skb),
++ offset_in_page(skb->data));
+
-+static void net_tx_action(unsigned long unused);
-+static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
++ netif->rx.req_cons += nr_frags + extra;
++}
+
-+static void net_rx_action(unsigned long unused);
-+static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
++static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
++{
++ int i;
+
-+static struct timer_list net_timer;
-+static struct timer_list netbk_tx_pending_timer;
++ for (i = 0; i < nr_frags; i++)
++ put_page(meta[i].frag.page);
++}
+
-+#define MAX_PENDING_REQS 256
++/*
++ * This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
++ * used to set up the operations on the top of
++ * netrx_pending_operations, which have since been done. Check that
++ * they didn't give any errors and advance over them.
++ */
++static int netbk_check_gop(int nr_frags, domid_t domid,
++ struct netrx_pending_operations *npo)
++{
++ struct gnttab_copy *copy_op;
++ int status = NETIF_RSP_OKAY;
++ int i;
+
-+static struct sk_buff_head rx_queue;
++ for (i = 0; i <= nr_frags; i++) {
++ copy_op = npo->copy + npo->copy_cons++;
++ if (copy_op->status != GNTST_okay) {
++ DPRINTK("Bad status %d from copy to DOM%d.\n",
++ copy_op->status, domid);
++ status = NETIF_RSP_ERROR;
++ }
++ }
+
-+static struct page **mmap_pages;
-+static inline unsigned long idx_to_pfn(unsigned int idx)
-+{
-+ return page_to_pfn(mmap_pages[idx]);
++ return status;
+}
+
-+static inline unsigned long idx_to_kaddr(unsigned int idx)
++static void netbk_add_frag_responses(struct xen_netif *netif, int status,
++ struct netbk_rx_meta *meta, int nr_frags)
+{
-+ return (unsigned long)pfn_to_kaddr(idx_to_pfn(idx));
-+}
++ int i;
++ unsigned long offset;
+
-+/* extra field used in struct page */
-+static inline void netif_set_page_index(struct page *pg, unsigned int index)
-+{
-+ *(unsigned long *)&pg->mapping = index + 1;
++ for (i = 0; i < nr_frags; i++) {
++ int id = meta[i].id;
++ int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
++
++ offset = 0;
++ make_rx_response(netif, id, status, offset,
++ meta[i].frag.size, flags);
++ }
+}
+
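+
The only subtle point in netbk_add_frag_responses is the flags computation: every fragment response except the last carries NETRXF_more_data, which is how the frontend knows the packet continues. A standalone sketch of that loop shape, with an assumed flag value rather than the real one:

#include <stdio.h>

#define NETRXF_more_data (1 << 0)  /* illustrative value only */

int main(void)
{
    int nr_frags = 3;

    /* Mirror netbk_add_frag_responses: chain all but the final frag. */
    for (int i = 0; i < nr_frags; i++) {
        int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
        printf("frag %d: flags=%#x\n", i, flags);
    }
    return 0;
}
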
-+static inline int netif_page_index(struct page *pg)
++static void net_rx_action(unsigned long unused)
+{
-+ unsigned long idx = (unsigned long)pg->mapping - 1;
++ struct xen_netif *netif = NULL;
++ s8 status;
++ u16 id, irq, flags;
++ struct xen_netif_rx_response *resp;
++ struct multicall_entry *mcl;
++ struct sk_buff_head rxq;
++ struct sk_buff *skb;
++ int notify_nr = 0;
++ int ret;
++ int nr_frags;
++ int count;
++ unsigned long offset;
+
-+ if (!PageForeign(pg))
-+ return -1;
++ /*
++ * Putting hundreds of bytes on the stack is considered rude.
++ * Static works because a tasklet can only be on one CPU at any time.
++ */
++ static struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
++ static struct mmu_update rx_mmu[NET_RX_RING_SIZE];
++ static struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
++ static struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
++ static unsigned char rx_notify[NR_IRQS];
++ static u16 notify_list[NET_RX_RING_SIZE];
++ static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
+
-+ if ((idx >= MAX_PENDING_REQS) || (mmap_pages[idx] != pg))
-+ return -1;
++ struct netrx_pending_operations npo = {
++ .mmu = rx_mmu,
++ .trans = grant_trans_op,
++ .copy = grant_copy_op,
++ .mcl = rx_mcl,
++ .meta = meta,
++ };
+
-+ return idx;
-+}
++ skb_queue_head_init(&rxq);
+
-+/*
-+ * This is the amount of packet we copy rather than map, so that the
-+ * guest can't fiddle with the contents of the headers while we do
-+ * packet processing on them (netfilter, routing, etc). 72 is enough
-+ * to cover TCP+IP headers including options.
-+ */
-+#define PKT_PROT_LEN 72
++ count = 0;
+
-+static struct pending_tx_info {
-+ struct xen_netif_tx_request req;
-+ struct xen_netif *netif;
-+} pending_tx_info[MAX_PENDING_REQS];
-+static u16 pending_ring[MAX_PENDING_REQS];
-+typedef unsigned int pending_ring_idx_t;
++ while ((skb = skb_dequeue(&rx_queue)) != NULL) {
++ nr_frags = skb_shinfo(skb)->nr_frags;
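++ /* Stash the frag count in skb->cb so the response pass can reuse it. */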
++ *(int *)skb->cb = nr_frags;
+
-+static inline pending_ring_idx_t pending_index(unsigned i)
-+{
-+ return i & (MAX_PENDING_REQS-1);
-+}
++ netbk_gop_skb(skb, &npo);
+
-+static pending_ring_idx_t pending_prod, pending_cons;
++ count += nr_frags + 1;
+
-+static inline pending_ring_idx_t nr_pending_reqs(void)
-+{
-+ return MAX_PENDING_REQS - pending_prod + pending_cons;
-+}
++ __skb_queue_tail(&rxq, skb);
+
-+/* Freed TX SKBs get batched on this ring before return to pending_ring. */
-+static u16 dealloc_ring[MAX_PENDING_REQS];
-+static pending_ring_idx_t dealloc_prod, dealloc_cons;
++ /* Filled the batch queue? */
++ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
++ break;
++ }
+
-+/* Doubly-linked list of in-use pending entries. */
-+static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
-+static LIST_HEAD(pending_inuse_head);
++ BUG_ON(npo.meta_prod > ARRAY_SIZE(meta));
+
-+static struct sk_buff_head tx_queue;
++ npo.mmu_mcl = npo.mcl_prod;
++ if (npo.mcl_prod) {
++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++ BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu));
++ mcl = npo.mcl + npo.mcl_prod++;
+
-+static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
-+static struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
-+static struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
++ BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
++ mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
+
-+static LIST_HEAD(net_schedule_list);
-+static DEFINE_SPINLOCK(net_schedule_list_lock);
++ mcl->op = __HYPERVISOR_mmu_update;
++ mcl->args[0] = (unsigned long)rx_mmu;
++ mcl->args[1] = npo.mmu_prod;
++ mcl->args[2] = 0;
++ mcl->args[3] = DOMID_SELF;
++ }
+
-+#define MAX_MFN_ALLOC 64
-+static unsigned long mfn_list[MAX_MFN_ALLOC];
-+static unsigned int alloc_index = 0;
++ if (npo.trans_prod) {
++ BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op));
++ mcl = npo.mcl + npo.mcl_prod++;
++ mcl->op = __HYPERVISOR_grant_table_op;
++ mcl->args[0] = GNTTABOP_transfer;
++ mcl->args[1] = (unsigned long)grant_trans_op;
++ mcl->args[2] = npo.trans_prod;
++ }
+
-+/* Setting this allows the safe use of this driver without netloop. */
-+static int MODPARM_copy_skb = 1;
-+module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
-+MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
++ if (npo.copy_prod) {
++ BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op));
++ mcl = npo.mcl + npo.mcl_prod++;
++ mcl->op = __HYPERVISOR_grant_table_op;
++ mcl->args[0] = GNTTABOP_copy;
++ mcl->args[1] = (unsigned long)grant_copy_op;
++ mcl->args[2] = npo.copy_prod;
++ }
+
-+int netbk_copy_skb_mode;
++ /* Nothing to do? */
++ if (!npo.mcl_prod)
++ return;
+
-+static inline unsigned long alloc_mfn(void)
-+{
-+ BUG_ON(alloc_index == 0);
-+ return mfn_list[--alloc_index];
-+}
++ BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl));
+
-+static inline void maybe_schedule_tx_action(void)
-+{
-+ smp_mb();
-+ if ((nr_pending_reqs() < (MAX_PENDING_REQS/2)) &&
-+ !list_empty(&net_schedule_list))
-+ tasklet_schedule(&net_tx_tasklet);
-+}
++ ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
++ BUG_ON(ret != 0);
++ /* The mmu_update multicall must not fail. */
++ BUG_ON(npo.mmu_mcl && npo.mcl[npo.mmu_mcl].result != 0);
++
++ while ((skb = __skb_dequeue(&rxq)) != NULL) {
++ nr_frags = *(int *)skb->cb;
+
-+static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
-+{
-+ struct skb_shared_info *ninfo;
-+ struct sk_buff *nskb;
-+ unsigned long offset;
-+ int ret;
-+ int len;
-+ int headlen;
++ netif = netdev_priv(skb->dev);
+
-+ BUG_ON(skb_shinfo(skb)->frag_list != NULL);
++ netif->stats.tx_bytes += skb->len;
++ netif->stats.tx_packets++;
+
-+ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
-+ if (unlikely(!nskb))
-+ goto err;
++ status = netbk_check_gop(nr_frags, netif->domid, &npo);
+
-+ skb_reserve(nskb, NET_SKB_PAD + NET_IP_ALIGN);
-+ headlen = skb_end_pointer(nskb) - nskb->data;
-+ if (headlen > skb_headlen(skb))
-+ headlen = skb_headlen(skb);
-+ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
-+ BUG_ON(ret);
++ id = meta[npo.meta_cons].id;
++ flags = nr_frags ? NETRXF_more_data : 0;
+
-+ ninfo = skb_shinfo(nskb);
-+ ninfo->gso_size = skb_shinfo(skb)->gso_size;
-+ ninfo->gso_type = skb_shinfo(skb)->gso_type;
++ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
++ flags |= NETRXF_csum_blank | NETRXF_data_validated;
++ else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
++ /* remote but checksummed. */
++ flags |= NETRXF_data_validated;
+
-+ offset = headlen;
-+ len = skb->len - headlen;
++ offset = 0;
++ resp = make_rx_response(netif, id, status, offset,
++ skb_headlen(skb), flags);
+
-+ nskb->len = skb->len;
-+ nskb->data_len = len;
-+ nskb->truesize += len;
++ if (meta[npo.meta_cons].frag.size) {
++ struct xen_netif_extra_info *gso =
++ (struct xen_netif_extra_info *)
++ RING_GET_RESPONSE(&netif->rx,
++ netif->rx.rsp_prod_pvt++);
+
-+ while (len) {
-+ struct page *page;
-+ int copy;
-+ int zero;
++ resp->flags |= NETRXF_extra_info;
+
-+ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
-+ dump_stack();
-+ goto err_free;
++ gso->u.gso.size = meta[npo.meta_cons].frag.size;
++ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
++ gso->u.gso.pad = 0;
++ gso->u.gso.features = 0;
++
++ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
++ gso->flags = 0;
+ }
+
-+ copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
-+ zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
++ netbk_add_frag_responses(netif, status,
++ meta + npo.meta_cons + 1,
++ nr_frags);
+
-+ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
-+ if (unlikely(!page))
-+ goto err_free;
++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
++ irq = netif->irq;
++ if (ret && !rx_notify[irq] &&
++ (netif->smart_poll != 1)) {
++ rx_notify[irq] = 1;
++ notify_list[notify_nr++] = irq;
++ }
+
-+ ret = skb_copy_bits(skb, offset, page_address(page), copy);
-+ BUG_ON(ret);
++ if (netif_queue_stopped(netif->dev) &&
++ netif_schedulable(netif) &&
++ !netbk_queue_full(netif))
++ netif_wake_queue(netif->dev);
+
-+ ninfo->frags[ninfo->nr_frags].page = page;
-+ ninfo->frags[ninfo->nr_frags].page_offset = 0;
-+ ninfo->frags[ninfo->nr_frags].size = copy;
-+ ninfo->nr_frags++;
++ /*
++ * netfront_smartpoll_active indicates whether
++ * netfront timer is active.
++ */
++ if (netif->smart_poll == 1) {
++ if (!(netif->rx.sring->netfront_smartpoll_active)) {
++ notify_remote_via_irq(irq);
++ netif->rx.sring->netfront_smartpoll_active = 1;
++ }
++ }
+
-+ offset += copy;
-+ len -= copy;
++ netif_put(netif);
++ dev_kfree_skb(skb);
++ npo.meta_cons += nr_frags + 1;
+ }
+
-+ offset = nskb->data - skb->data;
-+
-+ nskb->transport_header = skb->transport_header + offset;
-+ nskb->network_header = skb->network_header + offset;
-+ nskb->mac_header = skb->mac_header + offset;
-+
-+ return nskb;
++ while (notify_nr != 0) {
++ irq = notify_list[--notify_nr];
++ rx_notify[irq] = 0;
++ notify_remote_via_irq(irq);
++ }
+
-+ err_free:
-+ kfree_skb(nskb);
-+ err:
-+ return NULL;
++ /* More work to do? */
++ if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
++ tasklet_schedule(&net_rx_tasklet);
+}
+
-+static inline int netbk_max_required_rx_slots(struct xen_netif *netif)
++static void net_alarm(unsigned long unused)
+{
-+ if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
-+ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
-+ return 1; /* all in one */
++ tasklet_schedule(&net_rx_tasklet);
+}
+
-+static inline int netbk_queue_full(struct xen_netif *netif)
++static void netbk_tx_pending_timeout(unsigned long unused)
+{
-+ RING_IDX peek = netif->rx_req_cons_peek;
-+ RING_IDX needed = netbk_max_required_rx_slots(netif);
-+
-+ return ((netif->rx.sring->req_prod - peek) < needed) ||
-+ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
++ tasklet_schedule(&net_tx_tasklet);
+}
+
-+static void tx_queue_callback(unsigned long data)
++struct net_device_stats *netif_be_get_stats(struct net_device *dev)
+{
-+ struct xen_netif *netif = (struct xen_netif *)data;
-+ if (netif_schedulable(netif))
-+ netif_wake_queue(netif->dev);
++ struct xen_netif *netif = netdev_priv(dev);
++ return &netif->stats;
+}
+
-+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
++static int __on_net_schedule_list(struct xen_netif *netif)
+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+
-+ BUG_ON(skb->dev != dev);
-+
-+ /* Drop the packet if the target domain has no receive buffers. */
-+ if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
-+ goto drop;
-+
-+ /*
-+ * XXX For now we also copy skbuffs whose head crosses a page
-+ * boundary, because netbk_gop_skb can't handle them.
-+ */
-+ if ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE) {
-+ struct sk_buff *nskb = netbk_copy_skb(skb);
-+ if ( unlikely(nskb == NULL) )
-+ goto drop;
-+ /* Copy only the header fields we use in this driver. */
-+ nskb->dev = skb->dev;
-+ nskb->ip_summed = skb->ip_summed;
-+ dev_kfree_skb(skb);
-+ skb = nskb;
-+ }
-+
-+ netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
-+ !!skb_shinfo(skb)->gso_size;
-+ netif_get(netif);
++ return !list_empty(&netif->list);
++}
+
-+ if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
-+ netif->rx.sring->req_event = netif->rx_req_cons_peek +
-+ netbk_max_required_rx_slots(netif);
-+ mb(); /* request notification /then/ check & stop the queue */
-+ if (netbk_queue_full(netif)) {
-+ netif_stop_queue(dev);
-+ /*
-+ * Schedule 500ms timeout to restart the queue, thus
-+ * ensuring that an inactive queue will be drained.
-+ * Packets will be immediately be dropped until more
-+ * receive buffers become available (see
-+ * netbk_queue_full() check above).
-+ */
-+ netif->tx_queue_timeout.data = (unsigned long)netif;
-+ netif->tx_queue_timeout.function = tx_queue_callback;
-+ mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
-+ }
++static void remove_from_net_schedule_list(struct xen_netif *netif)
++{
++ spin_lock_irq(&net_schedule_list_lock);
++ if (likely(__on_net_schedule_list(netif))) {
++ list_del_init(&netif->list);
++ netif_put(netif);
+ }
-+
-+ skb_queue_tail(&rx_queue, skb);
-+ tasklet_schedule(&net_rx_tasklet);
-+
-+ return 0;
-+
-+ drop:
-+ netif->stats.tx_dropped++;
-+ dev_kfree_skb(skb);
-+ return 0;
++ spin_unlock_irq(&net_schedule_list_lock);
+}
+
-+struct netrx_pending_operations {
-+ unsigned trans_prod, trans_cons;
-+ unsigned mmu_prod, mmu_mcl;
-+ unsigned mcl_prod, mcl_cons;
-+ unsigned copy_prod, copy_cons;
-+ unsigned meta_prod, meta_cons;
-+ struct mmu_update *mmu;
-+ struct gnttab_transfer *trans;
-+ struct gnttab_copy *copy;
-+ struct multicall_entry *mcl;
-+ struct netbk_rx_meta *meta;
-+};
-+
-+/* Set up the grant operations for this fragment. If it's a flipping
-+ interface, we also set up the unmap request from here. */
-+static u16 netbk_gop_frag(struct xen_netif *netif, struct netbk_rx_meta *meta,
-+ int i, struct netrx_pending_operations *npo,
-+ struct page *page, unsigned long size,
-+ unsigned long offset)
++static void add_to_net_schedule_list_tail(struct xen_netif *netif)
+{
-+ struct gnttab_copy *copy_gop;
-+ struct xen_netif_rx_request *req;
-+ unsigned long old_mfn;
-+ int idx = netif_page_index(page);
-+
-+ old_mfn = virt_to_mfn(page_address(page));
-+
-+ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
-+
-+ copy_gop = npo->copy + npo->copy_prod++;
-+ copy_gop->flags = GNTCOPY_dest_gref;
-+ if (idx > -1) {
-+ struct pending_tx_info *src_pend = &pending_tx_info[idx];
-+ copy_gop->source.domid = src_pend->netif->domid;
-+ copy_gop->source.u.ref = src_pend->req.gref;
-+ copy_gop->flags |= GNTCOPY_source_gref;
-+ } else {
-+ copy_gop->source.domid = DOMID_SELF;
-+ copy_gop->source.u.gmfn = old_mfn;
-+ }
-+ copy_gop->source.offset = offset;
-+ copy_gop->dest.domid = netif->domid;
-+ copy_gop->dest.offset = 0;
-+ copy_gop->dest.u.ref = req->gref;
-+ copy_gop->len = size;
++ if (__on_net_schedule_list(netif))
++ return;
+
-+ return req->id;
++ spin_lock_irq(&net_schedule_list_lock);
++ if (!__on_net_schedule_list(netif) &&
++ likely(netif_schedulable(netif))) {
++ list_add_tail(&netif->list, &net_schedule_list);
++ netif_get(netif);
++ }
++ spin_unlock_irq(&net_schedule_list_lock);
+}
+
-+static void netbk_gop_skb(struct sk_buff *skb,
-+ struct netrx_pending_operations *npo)
++void netif_schedule_work(struct xen_netif *netif)
+{
-+ struct xen_netif *netif = netdev_priv(skb->dev);
-+ int nr_frags = skb_shinfo(skb)->nr_frags;
-+ int i;
-+ int extra;
-+ struct netbk_rx_meta *head_meta, *meta;
++ int more_to_do;
+
-+ head_meta = npo->meta + npo->meta_prod++;
-+ head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
-+ head_meta->frag.size = skb_shinfo(skb)->gso_size;
-+ extra = !!head_meta->frag.size + 1;
++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
+
-+ for (i = 0; i < nr_frags; i++) {
-+ meta = npo->meta + npo->meta_prod++;
-+ meta->frag = skb_shinfo(skb)->frags[i];
-+ meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
-+ meta->frag.page,
-+ meta->frag.size,
-+ meta->frag.page_offset);
++ if (more_to_do) {
++ add_to_net_schedule_list_tail(netif);
++ maybe_schedule_tx_action();
+ }
++}
++
++void netif_deschedule_work(struct xen_netif *netif)
++{
++ remove_from_net_schedule_list(netif);
++}
++
++
++static void tx_add_credit(struct xen_netif *netif)
++{
++ unsigned long max_burst, max_credit;
+
+ /*
-+ * This must occur at the end to ensure that we don't trash skb_shinfo
-+ * until we're done. We know that the head doesn't cross a page
-+ * boundary because such packets get copied in netif_be_start_xmit.
++ * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
++ * Otherwise the interface can seize up due to insufficient credit.
+ */
-+ head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
-+ virt_to_page(skb->data),
-+ skb_headlen(skb),
-+ offset_in_page(skb->data));
++ max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
++ max_burst = min(max_burst, 131072UL);
++ max_burst = max(max_burst, netif->credit_bytes);
+
-+ netif->rx.req_cons += nr_frags + extra;
++ /* Take care that adding a new chunk of credit doesn't wrap to zero. */
++ max_credit = netif->remaining_credit + netif->credit_bytes;
++ if (max_credit < netif->remaining_credit)
++ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
++
++ netif->remaining_credit = min(max_credit, max_burst);
+}
+
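+
The wrap check in tx_add_credit above is easy to misread: remaining_credit + credit_bytes can overflow an unsigned long, in which case the sum compares less than either addend. Here is a userspace sketch of just that saturating add (the derivation of max_burst from the head-of-ring request size is omitted), with values chosen to force the wrap:

#include <limits.h>
#include <stdio.h>

/* Saturating credit top-up, as in tx_add_credit: if the addition wraps,
 * clamp to ULONG_MAX, then cap the result at the allowed burst. */
static unsigned long add_credit(unsigned long remaining,
                                unsigned long credit_bytes,
                                unsigned long max_burst)
{
    unsigned long max_credit = remaining + credit_bytes;

    if (max_credit < remaining)      /* wrapped around zero */
        max_credit = ULONG_MAX;

    return max_credit < max_burst ? max_credit : max_burst;
}

int main(void)
{
    printf("%lu\n", add_credit(1000, 500, 2000));           /* 1500 */
    printf("%lu\n", add_credit(ULONG_MAX - 10, 500, 2000)); /* 2000, via clamp */
    return 0;
}
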
-+static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
++static void tx_credit_callback(unsigned long data)
+{
-+ int i;
++ struct xen_netif *netif = (struct xen_netif *)data;
++ tx_add_credit(netif);
++ netif_schedule_work(netif);
++}
+
-+ for (i = 0; i < nr_frags; i++)
-+ put_page(meta[i].frag.page);
++static inline int copy_pending_req(pending_ring_idx_t pending_idx)
++{
++ return gnttab_copy_grant_page(grant_tx_handle[pending_idx],
++ &mmap_pages[pending_idx]);
+}
+
-+/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
-+ used to set up the operations on the top of
-+ netrx_pending_operations, which have since been done. Check that
-+ they didn't give any errors and advance over them. */
-+static int netbk_check_gop(int nr_frags, domid_t domid,
-+ struct netrx_pending_operations *npo)
++static inline void net_tx_action_dealloc(void)
+{
-+ struct gnttab_copy *copy_op;
-+ int status = NETIF_RSP_OKAY;
-+ int i;
++ struct netbk_tx_pending_inuse *inuse, *n;
++ struct gnttab_unmap_grant_ref *gop;
++ u16 pending_idx;
++ pending_ring_idx_t dc, dp;
++ struct xen_netif *netif;
++ int ret;
++ LIST_HEAD(list);
+
-+ for (i = 0; i <= nr_frags; i++) {
-+ copy_op = npo->copy + npo->copy_cons++;
-+ if (copy_op->status != GNTST_okay) {
-+ DPRINTK("Bad status %d from copy to DOM%d.\n",
-+ copy_op->status, domid);
-+ status = NETIF_RSP_ERROR;
++ dc = dealloc_cons;
++ gop = tx_unmap_ops;
++
++ /*
++ * Free up any grants we have finished using
++ */
++ do {
++ dp = dealloc_prod;
++
++ /* Ensure we see all indices enqueued by netif_idx_release(). */
++ smp_rmb();
++
++ while (dc != dp) {
++ unsigned long pfn;
++
++ pending_idx = dealloc_ring[pending_index(dc++)];
++ list_move_tail(&pending_inuse[pending_idx].list, &list);
++
++ pfn = idx_to_pfn(pending_idx);
++ /* Already unmapped? */
++ if (!phys_to_machine_mapping_valid(pfn))
++ continue;
++
++ gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
++ GNTMAP_host_map,
++ grant_tx_handle[pending_idx]);
++ gop++;
++ }
++
++ if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB ||
++ list_empty(&pending_inuse_head))
++ break;
++
++ /* Copy any entries that have been pending for too long. */
++ list_for_each_entry_safe(inuse, n, &pending_inuse_head, list) {
++ if (time_after(inuse->alloc_time + HZ / 2, jiffies))
++ break;
++
++ pending_idx = inuse - pending_inuse;
++
++ pending_tx_info[pending_idx].netif->nr_copied_skbs++;
++
++ switch (copy_pending_req(pending_idx)) {
++ case 0:
++ list_move_tail(&inuse->list, &list);
++ continue;
++ case -EBUSY:
++ list_del_init(&inuse->list);
++ continue;
++ case -ENOENT:
++ continue;
+ }
-+ }
+
-+ return status;
-+}
++ break;
++ }
++ } while (dp != dealloc_prod);
+
-+static void netbk_add_frag_responses(struct xen_netif *netif, int status,
-+ struct netbk_rx_meta *meta, int nr_frags)
-+{
-+ int i;
-+ unsigned long offset;
++ dealloc_cons = dc;
+
-+ for (i = 0; i < nr_frags; i++) {
-+ int id = meta[i].id;
-+ int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
-+
-+ offset = 0;
-+ make_rx_response(netif, id, status, offset,
-+ meta[i].frag.size, flags);
++ ret = HYPERVISOR_grant_table_op(
++ GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
++ BUG_ON(ret);
++
++ list_for_each_entry_safe(inuse, n, &list, list) {
++ pending_idx = inuse - pending_inuse;
++
++ netif = pending_tx_info[pending_idx].netif;
++
++ make_tx_response(netif, &pending_tx_info[pending_idx].req,
++ NETIF_RSP_OKAY);
++
++ /* Ready for next use. */
++ gnttab_reset_grant_page(mmap_pages[pending_idx]);
++
++ pending_ring[pending_index(pending_prod++)] = pending_idx;
++
++ netif_put(netif);
++
++ list_del_init(&inuse->list);
+ }
+}
+
-+static void net_rx_action(unsigned long unused)
++static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *txp, RING_IDX end)
+{
-+ struct xen_netif *netif = NULL;
-+ s8 status;
-+ u16 id, irq, flags;
-+ struct xen_netif_rx_response *resp;
-+ struct multicall_entry *mcl;
-+ struct sk_buff_head rxq;
-+ struct sk_buff *skb;
-+ int notify_nr = 0;
-+ int ret;
-+ int nr_frags;
-+ int count;
-+ unsigned long offset;
-+
-+ /*
-+ * Putting hundreds of bytes on the stack is considered rude.
-+ * Static works because a tasklet can only be on one CPU at any time.
-+ */
-+ static struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
-+ static struct mmu_update rx_mmu[NET_RX_RING_SIZE];
-+ static struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
-+ static struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
-+ static unsigned char rx_notify[NR_IRQS];
-+ static u16 notify_list[NET_RX_RING_SIZE];
-+ static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
++ RING_IDX cons = netif->tx.req_cons;
+
-+ struct netrx_pending_operations npo = {
-+ mmu: rx_mmu,
-+ trans: grant_trans_op,
-+ copy: grant_copy_op,
-+ mcl: rx_mcl,
-+ meta: meta};
++ do {
++ make_tx_response(netif, txp, NETIF_RSP_ERROR);
++ if (cons >= end)
++ break;
++ txp = RING_GET_REQUEST(&netif->tx, cons++);
++ } while (1);
++ netif->tx.req_cons = cons;
++ netif_schedule_work(netif);
++ netif_put(netif);
++}
+
-+ skb_queue_head_init(&rxq);
++static int netbk_count_requests(struct xen_netif *netif,
++ struct xen_netif_tx_request *first,
++ struct xen_netif_tx_request *txp, int work_to_do)
++{
++ RING_IDX cons = netif->tx.req_cons;
++ int frags = 0;
+
-+ count = 0;
++ if (!(first->flags & NETTXF_more_data))
++ return 0;
+
-+ while ((skb = skb_dequeue(&rx_queue)) != NULL) {
-+ nr_frags = skb_shinfo(skb)->nr_frags;
-+ *(int *)skb->cb = nr_frags;
++ do {
++ if (frags >= work_to_do) {
++ DPRINTK("Need more frags\n");
++ return -frags;
++ }
+
-+ netbk_gop_skb(skb, &npo);
++ if (unlikely(frags >= MAX_SKB_FRAGS)) {
++ DPRINTK("Too many frags\n");
++ return -frags;
++ }
+
-+ count += nr_frags + 1;
++ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
++ sizeof(*txp));
++ if (txp->size > first->size) {
++ DPRINTK("Frags galore\n");
++ return -frags;
++ }
+
-+ __skb_queue_tail(&rxq, skb);
++ first->size -= txp->size;
++ frags++;
+
-+ /* Filled the batch queue? */
-+ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
-+ break;
-+ }
++ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
++ DPRINTK("txp->offset: %x, size: %u\n",
++ txp->offset, txp->size);
++ return -frags;
++ }
++ } while ((txp++)->flags & NETTXF_more_data);
+
-+ BUG_ON(npo.meta_prod > ARRAY_SIZE(meta));
++ return frags;
++}
+
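+
netbk_count_requests walks a chain of tx requests linked by NETTXF_more_data and enforces three invariants per link: the chain fits the work budget and MAX_SKB_FRAGS, each fragment is no larger than the bytes still unaccounted for in the first request, and offset+size stays inside one page. A simplified model of that walk follows; the flag value, limits, and plain-array ring are stand-ins for the real ring accessors:

#include <stdio.h>

#define NETTXF_more_data (1 << 2)   /* illustrative value only */
#define MODEL_MAX_FRAGS  4
#define MODEL_PAGE_SIZE  4096

struct tx_req { unsigned offset, size, flags; };

/* Return the frag count, or a negative count on a malformed chain,
 * mirroring the shape of netbk_count_requests. */
static int count_requests(struct tx_req *first, const struct tx_req *ring,
                          int work_to_do)
{
    int frags = 0;

    if (!(first->flags & NETTXF_more_data))
        return 0;

    do {
        const struct tx_req *txp = &ring[frags];

        if (frags >= work_to_do || frags >= MODEL_MAX_FRAGS)
            return -frags;                     /* chain too long */
        if (txp->size > first->size)
            return -frags;                     /* frag bigger than frame */
        first->size -= txp->size;
        frags++;
        if (txp->offset + txp->size > MODEL_PAGE_SIZE)
            return -frags;                     /* crosses a page */
        if (!(txp->flags & NETTXF_more_data))
            break;
    } while (1);

    return frags;
}

int main(void)
{
    struct tx_req first = { 0, 3000, NETTXF_more_data };
    struct tx_req ring[] = { { 0, 1000, NETTXF_more_data }, { 0, 1000, 0 } };
    printf("frags=%d\n", count_requests(&first, ring, 8)); /* 2 */
    return 0;
}
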
-+ npo.mmu_mcl = npo.mcl_prod;
-+ if (npo.mcl_prod) {
-+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-+ BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu));
-+ mcl = npo.mcl + npo.mcl_prod++;
++static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
++ struct sk_buff *skb,
++ struct xen_netif_tx_request *txp,
++ struct gnttab_map_grant_ref *mop)
++{
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ skb_frag_t *frags = shinfo->frags;
++ unsigned long pending_idx = *((u16 *)skb->data);
++ int i, start;
+
-+ BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
-+ mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
++ /* Skip first skb fragment if it is on same page as header fragment. */
++ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
-+ mcl->op = __HYPERVISOR_mmu_update;
-+ mcl->args[0] = (unsigned long)rx_mmu;
-+ mcl->args[1] = npo.mmu_prod;
-+ mcl->args[2] = 0;
-+ mcl->args[3] = DOMID_SELF;
-+ }
++ for (i = start; i < shinfo->nr_frags; i++, txp++) {
++ pending_idx = pending_ring[pending_index(pending_cons++)];
+
-+ if (npo.trans_prod) {
-+ BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op));
-+ mcl = npo.mcl + npo.mcl_prod++;
-+ mcl->op = __HYPERVISOR_grant_table_op;
-+ mcl->args[0] = GNTTABOP_transfer;
-+ mcl->args[1] = (unsigned long)grant_trans_op;
-+ mcl->args[2] = npo.trans_prod;
-+ }
++ gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
++ GNTMAP_host_map | GNTMAP_readonly,
++ txp->gref, netif->domid);
+
-+ if (npo.copy_prod) {
-+ BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op));
-+ mcl = npo.mcl + npo.mcl_prod++;
-+ mcl->op = __HYPERVISOR_grant_table_op;
-+ mcl->args[0] = GNTTABOP_copy;
-+ mcl->args[1] = (unsigned long)grant_copy_op;
-+ mcl->args[2] = npo.copy_prod;
++ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
++ netif_get(netif);
++ pending_tx_info[pending_idx].netif = netif;
++ frags[i].page = (void *)pending_idx;
+ }
+
-+ /* Nothing to do? */
-+ if (!npo.mcl_prod)
-+ return;
-+
-+ BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl));
-+
-+ ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
-+ BUG_ON(ret != 0);
-+ /* The mmu_machphys_update() must not fail. */
-+ BUG_ON(npo.mmu_mcl && npo.mcl[npo.mmu_mcl].result != 0);
++ return mop;
++}
+
-+ while ((skb = __skb_dequeue(&rxq)) != NULL) {
-+ nr_frags = *(int *)skb->cb;
++static int netbk_tx_check_mop(struct sk_buff *skb,
++ struct gnttab_map_grant_ref **mopp)
++{
++ struct gnttab_map_grant_ref *mop = *mopp;
++ int pending_idx = *((u16 *)skb->data);
++ struct xen_netif *netif = pending_tx_info[pending_idx].netif;
++ struct xen_netif_tx_request *txp;
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ int nr_frags = shinfo->nr_frags;
++ int i, err, start;
+
-+ netif = netdev_priv(skb->dev);
++ /* Check status of header. */
++ err = mop->status;
++ if (unlikely(err)) {
++ txp = &pending_tx_info[pending_idx].req;
++ make_tx_response(netif, txp, NETIF_RSP_ERROR);
++ pending_ring[pending_index(pending_prod++)] = pending_idx;
++ netif_put(netif);
++ } else {
++ set_phys_to_machine(
++ __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
++ FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
++ grant_tx_handle[pending_idx] = mop->handle;
++ }
+
-+ netif->stats.tx_bytes += skb->len;
-+ netif->stats.tx_packets++;
++ /* Skip first skb fragment if it is on same page as header fragment. */
++ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
-+ status = netbk_check_gop(nr_frags, netif->domid, &npo);
++ for (i = start; i < nr_frags; i++) {
++ int j, newerr;
+
-+ id = meta[npo.meta_cons].id;
-+ flags = nr_frags ? NETRXF_more_data : 0;
++ pending_idx = (unsigned long)shinfo->frags[i].page;
+
-+ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
-+ flags |= NETRXF_csum_blank | NETRXF_data_validated;
-+ else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
-+ /* remote but checksummed. */
-+ flags |= NETRXF_data_validated;
++ /* Check error status: if okay then remember grant handle. */
++ newerr = (++mop)->status;
++ if (likely(!newerr)) {
++ set_phys_to_machine(
++ __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
++ FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
++ grant_tx_handle[pending_idx] = mop->handle;
++ /* Had a previous error? Invalidate this fragment. */
++ if (unlikely(err))
++ netif_idx_release(pending_idx);
++ continue;
++ }
+
-+ offset = 0;
-+ resp = make_rx_response(netif, id, status, offset,
-+ skb_headlen(skb), flags);
++ /* Error on this fragment: respond to client with an error. */
++ txp = &pending_tx_info[pending_idx].req;
++ make_tx_response(netif, txp, NETIF_RSP_ERROR);
++ pending_ring[pending_index(pending_prod++)] = pending_idx;
++ netif_put(netif);
+
-+ if (meta[npo.meta_cons].frag.size) {
-+ struct xen_netif_extra_info *gso =
-+ (struct xen_netif_extra_info *)
-+ RING_GET_RESPONSE(&netif->rx,
-+ netif->rx.rsp_prod_pvt++);
++ /* Not the first error? Preceding frags already invalidated. */
++ if (err)
++ continue;
+
-+ resp->flags |= NETRXF_extra_info;
++ /* First error: invalidate header and preceding fragments. */
++ pending_idx = *((u16 *)skb->data);
++ netif_idx_release(pending_idx);
++ for (j = start; j < i; j++) {
++ pending_idx = (unsigned long)shinfo->frags[j].page;
++ netif_idx_release(pending_idx);
++ }
+
-+ gso->u.gso.size = meta[npo.meta_cons].frag.size;
-+ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
-+ gso->u.gso.pad = 0;
-+ gso->u.gso.features = 0;
++ /* Remember the error: invalidate all subsequent fragments. */
++ err = newerr;
++ }
+
-+ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
-+ gso->flags = 0;
-+ }
++ *mopp = mop + 1;
++ return err;
++}
+
-+ netbk_add_frag_responses(netif, status,
-+ meta + npo.meta_cons + 1,
-+ nr_frags);
++static void netbk_fill_frags(struct sk_buff *skb)
++{
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ int nr_frags = shinfo->nr_frags;
++ int i;
+
-+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
-+ irq = netif->irq;
-+ if (ret && !rx_notify[irq] &&
-+ (netif->smart_poll != 1)) {
-+ rx_notify[irq] = 1;
-+ notify_list[notify_nr++] = irq;
-+ }
++ for (i = 0; i < nr_frags; i++) {
++ skb_frag_t *frag = shinfo->frags + i;
++ struct xen_netif_tx_request *txp;
++ unsigned long pending_idx;
+
-+ if (netif_queue_stopped(netif->dev) &&
-+ netif_schedulable(netif) &&
-+ !netbk_queue_full(netif))
-+ netif_wake_queue(netif->dev);
++ pending_idx = (unsigned long)frag->page;
+
-+ /*
-+ * netfront_smartpoll_active indicates whether
-+ * netfront timer is active.
-+ */
-+ if ((netif->smart_poll == 1)) {
-+ if (!(netif->rx.sring->netfront_smartpoll_active)) {
-+ notify_remote_via_irq(irq);
-+ netif->rx.sring->netfront_smartpoll_active = 1;
-+ }
-+ }
++ pending_inuse[pending_idx].alloc_time = jiffies;
++ list_add_tail(&pending_inuse[pending_idx].list,
++ &pending_inuse_head);
+
-+ netif_put(netif);
-+ dev_kfree_skb(skb);
-+ npo.meta_cons += nr_frags + 1;
-+ }
++ txp = &pending_tx_info[pending_idx].req;
++ frag->page = virt_to_page(idx_to_kaddr(pending_idx));
++ frag->size = txp->size;
++ frag->page_offset = txp->offset;
+
-+ while (notify_nr != 0) {
-+ irq = notify_list[--notify_nr];
-+ rx_notify[irq] = 0;
-+ notify_remote_via_irq(irq);
++ skb->len += txp->size;
++ skb->data_len += txp->size;
++ skb->truesize += txp->size;
+ }
-+
-+ /* More work to do? */
-+ if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
-+ tasklet_schedule(&net_rx_tasklet);
+}
+
-+static void net_alarm(unsigned long unused)
++int netbk_get_extras(struct xen_netif *netif, struct xen_netif_extra_info *extras,
++ int work_to_do)
+{
-+ tasklet_schedule(&net_rx_tasklet);
-+}
++ struct xen_netif_extra_info extra;
++ RING_IDX cons = netif->tx.req_cons;
+
-+static void netbk_tx_pending_timeout(unsigned long unused)
-+{
-+ tasklet_schedule(&net_tx_tasklet);
-+}
++ do {
++ if (unlikely(work_to_do-- <= 0)) {
++ DPRINTK("Missing extra info\n");
++ return -EBADR;
++ }
+
-+struct net_device_stats *netif_be_get_stats(struct net_device *dev)
-+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+ return &netif->stats;
-+}
++ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
++ sizeof(extra));
++ if (unlikely(!extra.type ||
++ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
++ netif->tx.req_cons = ++cons;
++ DPRINTK("Invalid extra type: %d\n", extra.type);
++ return -EINVAL;
++ }
+
-+static int __on_net_schedule_list(struct xen_netif *netif)
-+{
-+ return !list_empty(&netif->list);
-+}
++ memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
++ netif->tx.req_cons = ++cons;
++ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
-+static void remove_from_net_schedule_list(struct xen_netif *netif)
-+{
-+ spin_lock_irq(&net_schedule_list_lock);
-+ if (likely(__on_net_schedule_list(netif))) {
-+ list_del_init(&netif->list);
-+ netif_put(netif);
-+ }
-+ spin_unlock_irq(&net_schedule_list_lock);
++ return work_to_do;
+}
+
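+
netbk_get_extras drains the run of extra-info slots that follows a request flagged NETTXF_extra_info; the run ends at the first slot without XEN_NETIF_EXTRA_FLAG_MORE set. A compact model of that consume loop, with assumed constants in place of the real interface headers:

#include <stdio.h>
#include <string.h>

#define EXTRA_FLAG_MORE (1 << 0)   /* illustrative value only */
#define EXTRA_TYPE_MAX  4

struct extra { unsigned char type, flags; };

/* Consume a chain of extra-info slots from 'ring', storing each by type,
 * as netbk_get_extras does. Returns the remaining work budget, or -1 on
 * a bad type or an exhausted budget. */
static int get_extras(const struct extra *ring, struct extra *extras,
                      int work_to_do)
{
    const struct extra *slot = ring;

    do {
        if (work_to_do-- <= 0)
            return -1;                          /* missing extra info */
        if (!slot->type || slot->type >= EXTRA_TYPE_MAX)
            return -1;                          /* invalid extra type */
        extras[slot->type - 1] = *slot;
    } while ((slot++)->flags & EXTRA_FLAG_MORE);

    return work_to_do;
}

int main(void)
{
    struct extra ring[] = { { 1, EXTRA_FLAG_MORE }, { 2, 0 } };
    struct extra extras[EXTRA_TYPE_MAX - 1];
    memset(extras, 0, sizeof(extras));
    printf("budget left: %d\n", get_extras(ring, extras, 8)); /* 6 */
    return 0;
}
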
-+static void add_to_net_schedule_list_tail(struct xen_netif *netif)
++static int netbk_set_skb_gso(struct sk_buff *skb, struct xen_netif_extra_info *gso)
+{
-+ if (__on_net_schedule_list(netif))
-+ return;
++ if (!gso->u.gso.size) {
++ DPRINTK("GSO size must not be zero.\n");
++ return -EINVAL;
++ }
+
-+ spin_lock_irq(&net_schedule_list_lock);
-+ if (!__on_net_schedule_list(netif) &&
-+ likely(netif_schedulable(netif))) {
-+ list_add_tail(&netif->list, &net_schedule_list);
-+ netif_get(netif);
++ /* Currently only TCPv4 S.O. is supported. */
++ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
++ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
++ return -EINVAL;
+ }
-+ spin_unlock_irq(&net_schedule_list_lock);
-+}
+
-+void netif_schedule_work(struct xen_netif *netif)
-+{
-+ int more_to_do;
++ skb_shinfo(skb)->gso_size = gso->u.gso.size;
++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
-+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
++ /* Header must be checked, and gso_segs computed. */
++ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
++ skb_shinfo(skb)->gso_segs = 0;
+
-+ if (more_to_do) {
-+ add_to_net_schedule_list_tail(netif);
-+ maybe_schedule_tx_action();
-+ }
++ return 0;
+}
+
-+void netif_deschedule_work(struct xen_netif *netif)
++static int skb_checksum_setup(struct sk_buff *skb)
+{
-+ remove_from_net_schedule_list(netif);
-+}
++ struct iphdr *iph;
++ unsigned char *th;
++ int err = -EPROTO;
+
++ if (skb->protocol != htons(ETH_P_IP))
++ goto out;
+
-+static void tx_add_credit(struct xen_netif *netif)
-+{
-+ unsigned long max_burst, max_credit;
++ iph = (void *)skb->data;
++ th = skb->data + 4 * iph->ihl;
++ if (th >= skb_tail_pointer(skb))
++ goto out;
+
-+ /*
-+ * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
-+ * Otherwise the interface can seize up due to insufficient credit.
-+ */
-+ max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
-+ max_burst = min(max_burst, 131072UL);
-+ max_burst = max(max_burst, netif->credit_bytes);
++ skb->csum_start = th - skb->head;
++ switch (iph->protocol) {
++ case IPPROTO_TCP:
++ skb->csum_offset = offsetof(struct tcphdr, check);
++ break;
++ case IPPROTO_UDP:
++ skb->csum_offset = offsetof(struct udphdr, check);
++ break;
++ default:
++ if (net_ratelimit())
++ printk(KERN_ERR "Attempting to checksum a non-"
++ "TCP/UDP packet, dropping a protocol"
++ " %d packet\n", iph->protocol);
++ goto out;
++ }
+
-+ /* Take care that adding a new chunk of credit doesn't wrap to zero. */
-+ max_credit = netif->remaining_credit + netif->credit_bytes;
-+ if (max_credit < netif->remaining_credit)
-+ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
++ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
++ goto out;
+
-+ netif->remaining_credit = min(max_credit, max_burst);
-+}
++ err = 0;
+
-+static void tx_credit_callback(unsigned long data)
-+{
-+ struct xen_netif *netif = (struct xen_netif *)data;
-+ tx_add_credit(netif);
-+ netif_schedule_work(netif);
++out:
++ return err;
+}
+
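+
Note that skb_checksum_setup only records where the checksum will eventually be written: csum_start points at the transport header and csum_offset at the check field inside it; the sum itself is filled in later. The offsetof arithmetic in isolation, using minimal stand-in header structs (which happen to place check at the same offsets as the real tcphdr/udphdr):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-ins for the check fields of struct tcphdr / udphdr. */
struct tcp_model { uint16_t source, dest; uint32_t seq, ack_seq;
                   uint16_t flags, window, check, urg_ptr; };
struct udp_model { uint16_t source, dest, len, check; };

int main(void)
{
    /* For a partial checksum, the stack later writes the sum at
     * csum_start + csum_offset; only the offsets are set up front. */
    printf("TCP check field at offset %zu\n",
           offsetof(struct tcp_model, check));
    printf("UDP check field at offset %zu\n",
           offsetof(struct udp_model, check));
    return 0;
}
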
-+static inline int copy_pending_req(pending_ring_idx_t pending_idx)
++static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size)
+{
-+ return gnttab_copy_grant_page(grant_tx_handle[pending_idx],
-+ &mmap_pages[pending_idx]);
-+}
++ unsigned long now = jiffies;
++ unsigned long next_credit =
++ netif->credit_timeout.expires +
++ msecs_to_jiffies(netif->credit_usec / 1000);
+
-+inline static void net_tx_action_dealloc(void)
-+{
-+ struct netbk_tx_pending_inuse *inuse, *n;
-+ struct gnttab_unmap_grant_ref *gop;
-+ u16 pending_idx;
-+ pending_ring_idx_t dc, dp;
-+ struct xen_netif *netif;
-+ int ret;
-+ LIST_HEAD(list);
++ /* Timer could already be pending in rare cases. */
++ if (timer_pending(&netif->credit_timeout))
++ return true;
+
-+ dc = dealloc_cons;
-+ gop = tx_unmap_ops;
++ /* Passed the point where we can replenish credit? */
++ if (time_after_eq(now, next_credit)) {
++ netif->credit_timeout.expires = now;
++ tx_add_credit(netif);
++ }
+
-+ /*
-+ * Free up any grants we have finished using
-+ */
-+ do {
-+ dp = dealloc_prod;
++ /* Still too big to send right now? Set a callback. */
++ if (size > netif->remaining_credit) {
++ netif->credit_timeout.data =
++ (unsigned long)netif;
++ netif->credit_timeout.function =
++ tx_credit_callback;
++ mod_timer(&netif->credit_timeout,
++ next_credit);
+
-+ /* Ensure we see all indices enqueued by netif_idx_release(). */
-+ smp_rmb();
++ return true;
++ }
+
-+ while (dc != dp) {
-+ unsigned long pfn;
++ return false;
++}
+
-+ pending_idx = dealloc_ring[pending_index(dc++)];
-+ list_move_tail(&pending_inuse[pending_idx].list, &list);
++static unsigned net_tx_build_mops(void)
++{
++ struct gnttab_map_grant_ref *mop;
++ struct sk_buff *skb;
++ int ret;
+
-+ pfn = idx_to_pfn(pending_idx);
-+ /* Already unmapped? */
-+ if (!phys_to_machine_mapping_valid(pfn))
-+ continue;
++ mop = tx_map_ops;
++ while (((nr_pending_reqs() + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
++ !list_empty(&net_schedule_list)) {
++ struct xen_netif *netif;
++ struct xen_netif_tx_request txreq;
++ struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
++ struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
++ u16 pending_idx;
++ RING_IDX idx;
++ int work_to_do;
++ unsigned int data_len;
++
++ /* Get a netif from the list with work to do. */
++ netif = list_first_entry(&net_schedule_list, struct xen_netif, list);
++ netif_get(netif);
++ remove_from_net_schedule_list(netif);
+
-+ gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
-+ GNTMAP_host_map,
-+ grant_tx_handle[pending_idx]);
-+ gop++;
++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
++ if (!work_to_do) {
++ netif_put(netif);
++ continue;
+ }
+
-+ if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB ||
-+ list_empty(&pending_inuse_head))
-+ break;
++ idx = netif->tx.req_cons;
++ rmb(); /* Ensure that we see the request before we copy it. */
++ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, idx), sizeof(txreq));
+
-+ /* Copy any entries that have been pending for too long. */
-+ list_for_each_entry_safe(inuse, n, &pending_inuse_head, list) {
-+ if (time_after(inuse->alloc_time + HZ / 2, jiffies))
-+ break;
++ /* Credit-based scheduling. */
++ if (txreq.size > netif->remaining_credit &&
++ tx_credit_exceeded(netif, txreq.size)) {
++ netif_put(netif);
++ continue;
++ }
+
-+ pending_idx = inuse - pending_inuse;
++ netif->remaining_credit -= txreq.size;
+
-+ pending_tx_info[pending_idx].netif->nr_copied_skbs++;
++ work_to_do--;
++ netif->tx.req_cons = ++idx;
+
-+ switch (copy_pending_req(pending_idx)) {
-+ case 0:
-+ list_move_tail(&inuse->list, &list);
-+ continue;
-+ case -EBUSY:
-+ list_del_init(&inuse->list);
-+ continue;
-+ case -ENOENT:
++ memset(extras, 0, sizeof(extras));
++ if (txreq.flags & NETTXF_extra_info) {
++ work_to_do = netbk_get_extras(netif, extras,
++ work_to_do);
++ idx = netif->tx.req_cons;
++ if (unlikely(work_to_do < 0)) {
++ netbk_tx_err(netif, &txreq, idx);
+ continue;
+ }
-+
-+ break;
+ }
-+ } while (dp != dealloc_prod);
-+
-+ dealloc_cons = dc;
-+
-+ ret = HYPERVISOR_grant_table_op(
-+ GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
-+ BUG_ON(ret);
-+
-+ list_for_each_entry_safe(inuse, n, &list, list) {
-+ pending_idx = inuse - pending_inuse;
-+
-+ netif = pending_tx_info[pending_idx].netif;
+
-+ make_tx_response(netif, &pending_tx_info[pending_idx].req,
-+ NETIF_RSP_OKAY);
-+
-+ /* Ready for next use. */
-+ gnttab_reset_grant_page(mmap_pages[pending_idx]);
++ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
++ if (unlikely(ret < 0)) {
++ netbk_tx_err(netif, &txreq, idx - ret);
++ continue;
++ }
++ idx += ret;
+
-+ pending_ring[pending_index(pending_prod++)] = pending_idx;
++ if (unlikely(txreq.size < ETH_HLEN)) {
++ DPRINTK("Bad packet size: %d\n", txreq.size);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
+
-+ netif_put(netif);
++ /* The payload must not cross a page boundary, since a request cannot be split. */
++ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
++ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
++ txreq.offset, txreq.size,
++ (txreq.offset & ~PAGE_MASK) + txreq.size);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
+
-+ list_del_init(&inuse->list);
-+ }
-+}
++ pending_idx = pending_ring[pending_index(pending_cons)];
+
-+static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *txp, RING_IDX end)
-+{
-+ RING_IDX cons = netif->tx.req_cons;
++ data_len = (txreq.size > PKT_PROT_LEN &&
++ ret < MAX_SKB_FRAGS) ?
++ PKT_PROT_LEN : txreq.size;
+
-+ do {
-+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
-+ if (cons >= end)
++ skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
++ GFP_ATOMIC | __GFP_NOWARN);
++ if (unlikely(skb == NULL)) {
++ DPRINTK("Can't allocate a skb in start_xmit.\n");
++ netbk_tx_err(netif, &txreq, idx);
+ break;
-+ txp = RING_GET_REQUEST(&netif->tx, cons++);
-+ } while (1);
-+ netif->tx.req_cons = cons;
-+ netif_schedule_work(netif);
-+ netif_put(netif);
-+}
++ }
+
-+static int netbk_count_requests(struct xen_netif *netif,
-+ struct xen_netif_tx_request *first,
-+ struct xen_netif_tx_request *txp, int work_to_do)
-+{
-+ RING_IDX cons = netif->tx.req_cons;
-+ int frags = 0;
++ /* Packets passed to netif_rx() must have some headroom. */
++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
-+ if (!(first->flags & NETTXF_more_data))
-+ return 0;
++ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
++ struct xen_netif_extra_info *gso;
++ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
-+ do {
-+ if (frags >= work_to_do) {
-+ DPRINTK("Need more frags\n");
-+ return -frags;
++ if (netbk_set_skb_gso(skb, gso)) {
++ kfree_skb(skb);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
+ }
+
-+ if (unlikely(frags >= MAX_SKB_FRAGS)) {
-+ DPRINTK("Too many frags\n");
-+ return -frags;
-+ }
++ gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
++ GNTMAP_host_map | GNTMAP_readonly,
++ txreq.gref, netif->domid);
++ mop++;
+
-+ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
-+ sizeof(*txp));
-+ if (txp->size > first->size) {
-+ DPRINTK("Frags galore\n");
-+ return -frags;
-+ }
++ memcpy(&pending_tx_info[pending_idx].req,
++ &txreq, sizeof(txreq));
++ pending_tx_info[pending_idx].netif = netif;
++ *((u16 *)skb->data) = pending_idx;
+
-+ first->size -= txp->size;
-+ frags++;
++ __skb_put(skb, data_len);
+
-+ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
-+ DPRINTK("txp->offset: %x, size: %u\n",
-+ txp->offset, txp->size);
-+ return -frags;
++ skb_shinfo(skb)->nr_frags = ret;
++ if (data_len < txreq.size) {
++ skb_shinfo(skb)->nr_frags++;
++ skb_shinfo(skb)->frags[0].page =
++ (void *)(unsigned long)pending_idx;
++ } else {
++ /* Discriminate from any valid pending_idx value. */
++ skb_shinfo(skb)->frags[0].page = (void *)~0UL;
+ }
-+ } while ((txp++)->flags & NETTXF_more_data);
-+
-+ return frags;
-+}
+
-+static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
-+ struct sk_buff *skb,
-+ struct xen_netif_tx_request *txp,
-+ struct gnttab_map_grant_ref *mop)
-+{
-+ struct skb_shared_info *shinfo = skb_shinfo(skb);
-+ skb_frag_t *frags = shinfo->frags;
-+ unsigned long pending_idx = *((u16 *)skb->data);
-+ int i, start;
++ __skb_queue_tail(&tx_queue, skb);
+
-+ /* Skip first skb fragment if it is on same page as header fragment. */
-+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
++ pending_cons++;
+
-+ for (i = start; i < shinfo->nr_frags; i++, txp++) {
-+ pending_idx = pending_ring[pending_index(pending_cons++)];
++ mop = netbk_get_requests(netif, skb, txfrags, mop);
+
-+ gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
-+ GNTMAP_host_map | GNTMAP_readonly,
-+ txp->gref, netif->domid);
++ netif->tx.req_cons = idx;
++ netif_schedule_work(netif);
+
-+ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
-+ netif_get(netif);
-+ pending_tx_info[pending_idx].netif = netif;
-+ frags[i].page = (void *)pending_idx;
++ if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
++ break;
+ }
+
-+ return mop;
++ return mop - tx_map_ops;
+}
+
-+static int netbk_tx_check_mop(struct sk_buff *skb,
-+ struct gnttab_map_grant_ref **mopp)
++static void net_tx_submit(void)
+{
-+ struct gnttab_map_grant_ref *mop = *mopp;
-+ int pending_idx = *((u16 *)skb->data);
-+ struct xen_netif *netif = pending_tx_info[pending_idx].netif;
-+ struct xen_netif_tx_request *txp;
-+ struct skb_shared_info *shinfo = skb_shinfo(skb);
-+ int nr_frags = shinfo->nr_frags;
-+ int i, err, start;
++ struct gnttab_map_grant_ref *mop;
++ struct sk_buff *skb;
+
-+ /* Check status of header. */
-+ err = mop->status;
-+ if (unlikely(err)) {
-+ txp = &pending_tx_info[pending_idx].req;
-+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
-+ pending_ring[pending_index(pending_prod++)] = pending_idx;
-+ netif_put(netif);
-+ } else {
-+ set_phys_to_machine(
-+ __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
-+ FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
-+ grant_tx_handle[pending_idx] = mop->handle;
-+ }
++ mop = tx_map_ops;
++ while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
++ struct xen_netif_tx_request *txp;
++ struct xen_netif *netif;
++ u16 pending_idx;
++ unsigned data_len;
+
-+ /* Skip first skb fragment if it is on same page as header fragment. */
-+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
++ pending_idx = *((u16 *)skb->data);
++ netif = pending_tx_info[pending_idx].netif;
++ txp = &pending_tx_info[pending_idx].req;
+
-+ for (i = start; i < nr_frags; i++) {
-+ int j, newerr;
++ /* Check the remap error code. */
++ if (unlikely(netbk_tx_check_mop(skb, &mop))) {
++ DPRINTK("netback grant failed.\n");
++ skb_shinfo(skb)->nr_frags = 0;
++ kfree_skb(skb);
++ continue;
++ }
+
-+ pending_idx = (unsigned long)shinfo->frags[i].page;
++ data_len = skb->len;
++ memcpy(skb->data,
++ (void *)(idx_to_kaddr(pending_idx)|txp->offset),
++ data_len);
++ if (data_len < txp->size) {
++ /* Append the packet payload as a fragment. */
++ txp->offset += data_len;
++ txp->size -= data_len;
++ } else {
++ /* Schedule a response immediately. */
++ netif_idx_release(pending_idx);
++ }
+
-+ /* Check error status: if okay then remember grant handle. */
-+ newerr = (++mop)->status;
-+ if (likely(!newerr)) {
-+ set_phys_to_machine(
-+ __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
-+ FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
-+ grant_tx_handle[pending_idx] = mop->handle;
-+ /* Had a previous error? Invalidate this fragment. */
-+ if (unlikely(err))
-+ netif_idx_release(pending_idx);
-+ continue;
++ /*
++ * Old frontends do not assert data_validated but we
++ * can infer it from csum_blank so test both flags.
++ */
++ if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank))
++ skb->ip_summed = CHECKSUM_PARTIAL;
++ else
++ skb->ip_summed = CHECKSUM_NONE;
++
++ netbk_fill_frags(skb);
++
++ /*
++ * If the initial fragment was < PKT_PROT_LEN then
++ * pull through some bytes from the other fragments to
++ * increase the linear region to PKT_PROT_LEN bytes.
++ */
++ if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
++ int target = min_t(int, skb->len, PKT_PROT_LEN);
++ __pskb_pull_tail(skb, target - skb_headlen(skb));
+ }
+
-+ /* Error on this fragment: respond to client with an error. */
-+ txp = &pending_tx_info[pending_idx].req;
-+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
-+ pending_ring[pending_index(pending_prod++)] = pending_idx;
-+ netif_put(netif);
++ skb->dev = netif->dev;
++ skb->protocol = eth_type_trans(skb, skb->dev);
+
-+ /* Not the first error? Preceding frags already invalidated. */
-+ if (err)
-+ continue;
++ netif->stats.rx_bytes += skb->len;
++ netif->stats.rx_packets++;
+
-+ /* First error: invalidate header and preceding fragments. */
-+ pending_idx = *((u16 *)skb->data);
-+ netif_idx_release(pending_idx);
-+ for (j = start; j < i; j++) {
-+ pending_idx = (unsigned long)shinfo->frags[i].page;
-+ netif_idx_release(pending_idx);
++ if (skb->ip_summed == CHECKSUM_PARTIAL) {
++ if (skb_checksum_setup(skb)) {
++ DPRINTK("Can't setup checksum in net_tx_action\n");
++ kfree_skb(skb);
++ continue;
++ }
+ }
+
-+ /* Remember the error: invalidate all subsequent fragments. */
-+ err = newerr;
++ if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
++ unlikely(skb_linearize(skb))) {
++ DPRINTK("Can't linearize skb in net_tx_action.\n");
++ kfree_skb(skb);
++ continue;
++ }
++
++ netif_rx(skb);
++ netif->dev->last_rx = jiffies;
+ }
+
-+ *mopp = mop + 1;
-+ return err;
++ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
++ !list_empty(&pending_inuse_head)) {
++ struct netbk_tx_pending_inuse *oldest;
++
++ oldest = list_entry(pending_inuse_head.next,
++ struct netbk_tx_pending_inuse, list);
++ mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ);
++ }
+}
+
-+static void netbk_fill_frags(struct sk_buff *skb)
++/* Called after netfront has transmitted */
++static void net_tx_action(unsigned long unused)
+{
-+ struct skb_shared_info *shinfo = skb_shinfo(skb);
-+ int nr_frags = shinfo->nr_frags;
-+ int i;
++ unsigned nr_mops;
++ int ret;
+
-+ for (i = 0; i < nr_frags; i++) {
-+ skb_frag_t *frag = shinfo->frags + i;
-+ struct xen_netif_tx_request *txp;
-+ unsigned long pending_idx;
++ if (dealloc_cons != dealloc_prod)
++ net_tx_action_dealloc();
+
-+ pending_idx = (unsigned long)frag->page;
++ nr_mops = net_tx_build_mops();
+
-+ pending_inuse[pending_idx].alloc_time = jiffies;
-+ list_add_tail(&pending_inuse[pending_idx].list,
-+ &pending_inuse_head);
++ if (nr_mops == 0)
++ return;
+
-+ txp = &pending_tx_info[pending_idx].req;
-+ frag->page = virt_to_page(idx_to_kaddr(pending_idx));
-+ frag->size = txp->size;
-+ frag->page_offset = txp->offset;
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ tx_map_ops, nr_mops);
++ BUG_ON(ret);
+
-+ skb->len += txp->size;
-+ skb->data_len += txp->size;
-+ skb->truesize += txp->size;
-+ }
++ net_tx_submit();
+}
+
-+int netbk_get_extras(struct xen_netif *netif, struct xen_netif_extra_info *extras,
-+ int work_to_do)
++static void netif_idx_release(u16 pending_idx)
+{
-+ struct xen_netif_extra_info extra;
-+ RING_IDX cons = netif->tx.req_cons;
-+
-+ do {
-+ if (unlikely(work_to_do-- <= 0)) {
-+ DPRINTK("Missing extra info\n");
-+ return -EBADR;
-+ }
-+
-+ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
-+ sizeof(extra));
-+ if (unlikely(!extra.type ||
-+ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
-+ netif->tx.req_cons = ++cons;
-+ DPRINTK("Invalid extra type: %d\n", extra.type);
-+ return -EINVAL;
-+ }
++ static DEFINE_SPINLOCK(_lock);
++ unsigned long flags;
+
-+ memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
-+ netif->tx.req_cons = ++cons;
-+ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
++ spin_lock_irqsave(&_lock, flags);
++ dealloc_ring[pending_index(dealloc_prod)] = pending_idx;
++ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
++ smp_wmb();
++ dealloc_prod++;
++ spin_unlock_irqrestore(&_lock, flags);
+
-+ return work_to_do;
++ tasklet_schedule(&net_tx_tasklet);
+}
+
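+
netif_idx_release and net_tx_action_dealloc together form a single-producer/single-consumer ring: the producer stores the index and only then bumps dealloc_prod behind a write barrier, while the consumer reads dealloc_prod and issues a read barrier before touching the slots. A sketch of that pairing using C11 release/acquire atomics in place of the kernel's smp_wmb()/smp_rmb():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 256   /* power of two, like MAX_PENDING_REQS */

static uint16_t ring[RING_SIZE];
static atomic_uint prod, cons;

/* Producer side (cf. netif_idx_release): write the slot, then publish
 * the new producer index with release semantics -- the kernel's
 * "insert idx /then/ incr producer" smp_wmb() pairing. */
static void produce(uint16_t idx)
{
    unsigned p = atomic_load_explicit(&prod, memory_order_relaxed);
    ring[p & (RING_SIZE - 1)] = idx;
    atomic_store_explicit(&prod, p + 1, memory_order_release);
}

/* Consumer side (cf. net_tx_action_dealloc): acquire the producer
 * index first; after that it is safe to read every slot up to it. */
static void consume(void)
{
    unsigned p = atomic_load_explicit(&prod, memory_order_acquire);
    unsigned c = atomic_load_explicit(&cons, memory_order_relaxed);
    while (c != p)
        printf("dealloc idx %u\n", (unsigned)ring[c++ & (RING_SIZE - 1)]);
    atomic_store_explicit(&cons, c, memory_order_relaxed);
}

int main(void)
{
    produce(3);
    produce(7);
    consume();
    return 0;
}
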
-+static int netbk_set_skb_gso(struct sk_buff *skb, struct xen_netif_extra_info *gso)
++static void netif_page_release(struct page *page, unsigned int order)
+{
-+ if (!gso->u.gso.size) {
-+ DPRINTK("GSO size must not be zero.\n");
-+ return -EINVAL;
-+ }
++ int idx = netif_page_index(page);
++ BUG_ON(order);
++ BUG_ON(idx < 0);
++ netif_idx_release(idx);
++}
+
-+ /* Currently only TCPv4 S.O. is supported. */
-+ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
-+ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
-+ return -EINVAL;
-+ }
++irqreturn_t netif_be_int(int irq, void *dev_id)
++{
++ struct xen_netif *netif = dev_id;
+
-+ skb_shinfo(skb)->gso_size = gso->u.gso.size;
-+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
++ add_to_net_schedule_list_tail(netif);
++ maybe_schedule_tx_action();
+
-+ /* Header must be checked, and gso_segs computed. */
-+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-+ skb_shinfo(skb)->gso_segs = 0;
++ if (netif_schedulable(netif) && !netbk_queue_full(netif))
++ netif_wake_queue(netif->dev);
+
-+ return 0;
++ return IRQ_HANDLED;
+}
+
-+static int skb_checksum_setup(struct sk_buff *skb)
++static void make_tx_response(struct xen_netif *netif,
++ struct xen_netif_tx_request *txp,
++ s8 st)
+{
-+ struct iphdr *iph;
-+ unsigned char *th;
-+ int err = -EPROTO;
++ RING_IDX i = netif->tx.rsp_prod_pvt;
++ struct xen_netif_tx_response *resp;
++ int notify;
+
-+ if (skb->protocol != htons(ETH_P_IP))
-+ goto out;
++ resp = RING_GET_RESPONSE(&netif->tx, i);
++ resp->id = txp->id;
++ resp->status = st;
+
-+ iph = (void *)skb->data;
-+ th = skb->data + 4 * iph->ihl;
-+ if (th >= skb_tail_pointer(skb))
-+ goto out;
++ if (txp->flags & NETTXF_extra_info)
++ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
+
-+ skb->csum_start = th - skb->head;
-+ switch (iph->protocol) {
-+ case IPPROTO_TCP:
-+ skb->csum_offset = offsetof(struct tcphdr, check);
-+ break;
-+ case IPPROTO_UDP:
-+ skb->csum_offset = offsetof(struct udphdr, check);
-+ break;
-+ default:
-+ if (net_ratelimit())
-+ printk(KERN_ERR "Attempting to checksum a non-"
-+ "TCP/UDP packet, dropping a protocol"
-+ " %d packet", iph->protocol);
-+ goto out;
-+ }
++ netif->tx.rsp_prod_pvt = ++i;
++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
+
-+ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
-+ goto out;
++ /*
++ * netfront_smartpoll_active indicates whether netfront timer
++ * is active.
++ */
++ if (netif->smart_poll == 1) {
++ if (!(netif->rx.sring->netfront_smartpoll_active)) {
++ notify_remote_via_irq(netif->irq);
++ netif->rx.sring->netfront_smartpoll_active = 1;
++ }
++ } else if (notify)
++ notify_remote_via_irq(netif->irq);
++}
+
-+ err = 0;
++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
++ u16 id,
++ s8 st,
++ u16 offset,
++ u16 size,
++ u16 flags)
++{
++ RING_IDX i = netif->rx.rsp_prod_pvt;
++ struct xen_netif_rx_response *resp;
+
-+out:
-+ return err;
++ resp = RING_GET_RESPONSE(&netif->rx, i);
++ resp->offset = offset;
++ resp->flags = flags;
++ resp->id = id;
++ resp->status = (s16)size;
++ if (st < 0)
++ resp->status = (s16)st;
++
++ netif->rx.rsp_prod_pvt = ++i;
++
++ return resp;
+}
+
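Note the overloading in make_rx_response(): a non-negative status carries the byte count, a negative one a NETIF_RSP_* error. A frontend would decode it along these lines (sketch only; the frontend is not part of this hunk):

	/* Illustrative decode of the overloaded status field. */
	static bool rx_response_ok(const struct xen_netif_rx_response *resp,
				   unsigned int *len)
	{
		if (resp->status < 0)
			return false;	/* a NETIF_RSP_* error code */
		*len = resp->status;	/* bytes placed at resp->offset */
		return true;
	}
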
-+static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size)
++#ifdef NETBE_DEBUG_INTERRUPT
++static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
+{
-+ unsigned long now = jiffies;
-+ unsigned long next_credit =
-+ netif->credit_timeout.expires +
-+ msecs_to_jiffies(netif->credit_usec / 1000);
++ struct list_head *ent;
++ struct xen_netif *netif;
++ int i = 0;
+
-+ /* Timer could already be pending in rare cases. */
-+ if (timer_pending(&netif->credit_timeout))
-+ return true;
++ printk(KERN_ALERT "netif_schedule_list:\n");
++ spin_lock_irq(&net_schedule_list_lock);
+
-+ /* Passed the point where we can replenish credit? */
-+ if (time_after_eq(now, next_credit)) {
-+ netif->credit_timeout.expires = now;
-+ tx_add_credit(netif);
++ list_for_each (ent, &net_schedule_list) {
++ netif = list_entry(ent, struct xen_netif, list);
++ printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
++ "rx_resp_prod=%08x\n",
++ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
++ printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
++ netif->tx.req_cons, netif->tx.rsp_prod_pvt);
++ printk(KERN_ALERT " shared(rx_req_prod=%08x "
++ "rx_resp_prod=%08x\n",
++ netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
++ printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
++ netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
++ printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
++ netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
++ i++;
+ }
+
-+ /* Still too big to send right now? Set a callback. */
-+ if (size > netif->remaining_credit) {
-+ netif->credit_timeout.data =
-+ (unsigned long)netif;
-+ netif->credit_timeout.function =
-+ tx_credit_callback;
-+ mod_timer(&netif->credit_timeout,
-+ next_credit);
-+
-+ return true;
-+ }
++ spin_unlock_irq(&net_schedule_list_lock);
++ printk(KERN_ALERT " ** End of netif_schedule_list **\n");
+
-+ return false;
++ return IRQ_HANDLED;
+}
++#endif
+
-+static unsigned net_tx_build_mops(void)
++static int __init netback_init(void)
+{
-+ struct gnttab_map_grant_ref *mop;
-+ struct sk_buff *skb;
-+ int ret;
++ int i;
++ struct page *page;
++ int rc = 0;
+
-+ mop = tx_map_ops;
-+ while (((nr_pending_reqs() + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
-+ !list_empty(&net_schedule_list)) {
-+ struct xen_netif *netif;
-+ struct xen_netif_tx_request txreq;
-+ struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
-+ struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
-+ u16 pending_idx;
-+ RING_IDX idx;
-+ int work_to_do;
-+ unsigned int data_len;
-+
-+ /* Get a netif from the list with work to do. */
-+ netif = list_first_entry(&net_schedule_list, struct xen_netif, list);
-+ netif_get(netif);
-+ remove_from_net_schedule_list(netif);
++ if (!xen_domain())
++ return -ENODEV;
+
-+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
-+ if (!work_to_do) {
-+ netif_put(netif);
-+ continue;
-+ }
++ /* We can increase reservation by this much in net_rx_action(). */
++// balloon_update_driver_allowance(NET_RX_RING_SIZE);
+
-+ idx = netif->tx.req_cons;
-+ rmb(); /* Ensure that we see the request before we copy it. */
-+ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, idx), sizeof(txreq));
++ skb_queue_head_init(&rx_queue);
++ skb_queue_head_init(&tx_queue);
++
++ init_timer(&net_timer);
++ net_timer.data = 0;
++ net_timer.function = net_alarm;
++
++ init_timer(&netbk_tx_pending_timer);
++ netbk_tx_pending_timer.data = 0;
++ netbk_tx_pending_timer.function = netbk_tx_pending_timeout;
+
-+ /* Credit-based scheduling. */
-+ if (txreq.size > netif->remaining_credit &&
-+ tx_credit_exceeded(netif, txreq.size)) {
-+ netif_put(netif);
-+ continue;
-+ }
++ mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
++ if (mmap_pages == NULL) {
++ printk(KERN_ERR "%s: out of memory\n", __FUNCTION__);
++ return -ENOMEM;
++ }
+
-+ netif->remaining_credit -= txreq.size;
++ for (i = 0; i < MAX_PENDING_REQS; i++) {
++ page = mmap_pages[i];
++ SetPageForeign(page, netif_page_release);
++ netif_set_page_index(page, i);
++ INIT_LIST_HEAD(&pending_inuse[i].list);
++ }
+
-+ work_to_do--;
-+ netif->tx.req_cons = ++idx;
++ pending_cons = 0;
++ pending_prod = MAX_PENDING_REQS;
++ for (i = 0; i < MAX_PENDING_REQS; i++)
++ pending_ring[i] = i;
+
-+ memset(extras, 0, sizeof(extras));
-+ if (txreq.flags & NETTXF_extra_info) {
-+ work_to_do = netbk_get_extras(netif, extras,
-+ work_to_do);
-+ idx = netif->tx.req_cons;
-+ if (unlikely(work_to_do < 0)) {
-+ netbk_tx_err(netif, &txreq, idx);
-+ continue;
-+ }
-+ }
++ netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
++ if (MODPARM_copy_skb) {
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
++ NULL, 0))
++ netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB;
++ else
++ netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB;
++ }
+
-+ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
-+ if (unlikely(ret < 0)) {
-+ netbk_tx_err(netif, &txreq, idx - ret);
-+ continue;
-+ }
-+ idx += ret;
++ //netif_accel_init();
+
-+ if (unlikely(txreq.size < ETH_HLEN)) {
-+ DPRINTK("Bad packet size: %d\n", txreq.size);
-+ netbk_tx_err(netif, &txreq, idx);
-+ continue;
-+ }
++ rc = netif_xenbus_init();
++ if (rc)
++ goto failed_init;
+
-+ /* No crossing a page as the payload mustn't fragment. */
-+ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
-+ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
-+ txreq.offset, txreq.size,
-+ (txreq.offset &~PAGE_MASK) + txreq.size);
-+ netbk_tx_err(netif, &txreq, idx);
-+ continue;
-+ }
++#ifdef NETBE_DEBUG_INTERRUPT
++ (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
++ 0,
++ netif_be_dbg,
++ SA_SHIRQ,
++ "net-be-dbg",
++ &netif_be_dbg);
++#endif
+
-+ pending_idx = pending_ring[pending_index(pending_cons)];
++ return 0;
+
-+ data_len = (txreq.size > PKT_PROT_LEN &&
-+ ret < MAX_SKB_FRAGS) ?
-+ PKT_PROT_LEN : txreq.size;
++failed_init:
++ free_empty_pages_and_pagevec(mmap_pages, MAX_PENDING_REQS);
++ del_timer(&netbk_tx_pending_timer);
++ del_timer(&net_timer);
++ return rc;
+
-+ skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
-+ GFP_ATOMIC | __GFP_NOWARN);
-+ if (unlikely(skb == NULL)) {
-+ DPRINTK("Can't allocate a skb in start_xmit.\n");
-+ netbk_tx_err(netif, &txreq, idx);
-+ break;
-+ }
++}
+
-+ /* Packets passed to netif_rx() must have some headroom. */
-+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
++module_init(netback_init);
+
-+ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
-+ struct xen_netif_extra_info *gso;
-+ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/netback/xenbus.c b/drivers/xen/netback/xenbus.c
+new file mode 100644
+index 0000000..70636d0
+--- /dev/null
++++ b/drivers/xen/netback/xenbus.c
+@@ -0,0 +1,523 @@
++/* Xenbus code for netif backend
++ Copyright (C) 2005 Rusty Russell <rusty at rustcorp.com.au>
++ Copyright (C) 2005 XenSource Ltd
+
-+ if (netbk_set_skb_gso(skb, gso)) {
-+ kfree_skb(skb);
-+ netbk_tx_err(netif, &txreq, idx);
-+ continue;
-+ }
-+ }
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2 of the License, or
++ (at your option) any later version.
+
-+ gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
-+ GNTMAP_host_map | GNTMAP_readonly,
-+ txreq.gref, netif->domid);
-+ mop++;
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
+
-+ memcpy(&pending_tx_info[pending_idx].req,
-+ &txreq, sizeof(txreq));
-+ pending_tx_info[pending_idx].netif = netif;
-+ *((u16 *)skb->data) = pending_idx;
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++*/
+
-+ __skb_put(skb, data_len);
++#include <stdarg.h>
++#include <linux/module.h>
++#include <xen/xenbus.h>
++#include "common.h"
+
-+ skb_shinfo(skb)->nr_frags = ret;
-+ if (data_len < txreq.size) {
-+ skb_shinfo(skb)->nr_frags++;
-+ skb_shinfo(skb)->frags[0].page =
-+ (void *)(unsigned long)pending_idx;
-+ } else {
-+ /* Discriminate from any valid pending_idx value. */
-+ skb_shinfo(skb)->frags[0].page = (void *)~0UL;
-+ }
++#if 0
++#undef DPRINTK
++#define DPRINTK(fmt, args...) \
++ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
++#endif
+
-+ __skb_queue_tail(&tx_queue, skb);
+
-+ pending_cons++;
++static int connect_rings(struct backend_info *);
++static void connect(struct backend_info *);
++static void backend_create_netif(struct backend_info *be);
++static void unregister_hotplug_status_watch(struct backend_info *be);
+
-+ mop = netbk_get_requests(netif, skb, txfrags, mop);
++static int netback_remove(struct xenbus_device *dev)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
-+ netif->tx.req_cons = idx;
-+ netif_schedule_work(netif);
++ //netback_remove_accelerators(be, dev);
+
-+ if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
-+ break;
++ unregister_hotplug_status_watch(be);
++ if (be->netif) {
++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
++ netif_disconnect(be->netif);
++ be->netif = NULL;
+ }
-+
-+ return mop - tx_map_ops;
++ kfree(be);
++ dev_set_drvdata(&dev->dev, NULL);
++ return 0;
+}
+
-+static void net_tx_submit(void)
++
++/**
++ * Entry point to this code when a new device is created. Allocate the basic
++ * structures and switch to InitWait.
++ */
++static int netback_probe(struct xenbus_device *dev,
++ const struct xenbus_device_id *id)
+{
-+ struct gnttab_map_grant_ref *mop;
-+ struct sk_buff *skb;
++ const char *message;
++ struct xenbus_transaction xbt;
++ int err;
++ int sg;
++ struct backend_info *be = kzalloc(sizeof(struct backend_info),
++ GFP_KERNEL);
++ if (!be) {
++ xenbus_dev_fatal(dev, -ENOMEM,
++ "allocating backend structure");
++ return -ENOMEM;
++ }
+
-+ mop = tx_map_ops;
-+ while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
-+ struct xen_netif_tx_request *txp;
-+ struct xen_netif *netif;
-+ u16 pending_idx;
-+ unsigned data_len;
++ be->dev = dev;
++ dev_set_drvdata(&dev->dev, be);
+
-+ pending_idx = *((u16 *)skb->data);
-+ netif = pending_tx_info[pending_idx].netif;
-+ txp = &pending_tx_info[pending_idx].req;
++ sg = 1;
++ if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
++ sg = 0;
+
-+ /* Check the remap error code. */
-+ if (unlikely(netbk_tx_check_mop(skb, &mop))) {
-+ DPRINTK("netback grant failed.\n");
-+ skb_shinfo(skb)->nr_frags = 0;
-+ kfree_skb(skb);
-+ continue;
++ do {
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "starting transaction");
++ goto fail;
+ }
+
-+ data_len = skb->len;
-+ memcpy(skb->data,
-+ (void *)(idx_to_kaddr(pending_idx)|txp->offset),
-+ data_len);
-+ if (data_len < txp->size) {
-+ /* Append the packet payload as a fragment. */
-+ txp->offset += data_len;
-+ txp->size -= data_len;
-+ } else {
-+ /* Schedule a response immediately. */
-+ netif_idx_release(pending_idx);
++ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
++ if (err) {
++ message = "writing feature-sg";
++ goto abort_transaction;
++ }
++
++ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
++ "%d", sg);
++ if (err) {
++ message = "writing feature-gso-tcpv4";
++ goto abort_transaction;
++ }
++
++ /* We support rx-copy path. */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-rx-copy", "%d", 1);
++ if (err) {
++ message = "writing feature-rx-copy";
++ goto abort_transaction;
+ }
+
+ /*
-+ * Old frontends do not assert data_validated but we
-+ * can infer it from csum_blank so test both flags.
++ * We don't support rx-flip path (except old guests who don't
++ * grok this feature flag).
+ */
-+ if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank))
-+ skb->ip_summed = CHECKSUM_PARTIAL;
-+ else
-+ skb->ip_summed = CHECKSUM_NONE;
-+
-+ netbk_fill_frags(skb);
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-rx-flip", "%d", 0);
++ if (err) {
++ message = "writing feature-rx-flip";
++ goto abort_transaction;
++ }
+
-+ /*
-+ * If the initial fragment was < PKT_PROT_LEN then
-+ * pull through some bytes from the other fragments to
-+ * increase the linear region to PKT_PROT_LEN bytes.
-+ */
-+ if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
-+ int target = min_t(int, skb->len, PKT_PROT_LEN);
-+ __pskb_pull_tail(skb, target - skb_headlen(skb));
++ /* We support data smart poll mechanism */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-smart-poll", "%d", 1);
++ if (err) {
++ message = "writing feature-smart-poll";
++ goto abort_transaction;
+ }
+
-+ skb->dev = netif->dev;
-+ skb->protocol = eth_type_trans(skb, skb->dev);
++ err = xenbus_transaction_end(xbt, 0);
++ } while (err == -EAGAIN);
+
-+ netif->stats.rx_bytes += skb->len;
-+ netif->stats.rx_packets++;
++ if (err) {
++ xenbus_dev_fatal(dev, err, "completing transaction");
++ goto fail;
++ }
+
-+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
-+ if (skb_checksum_setup(skb)) {
-+ DPRINTK("Can't setup checksum in net_tx_action\n");
-+ kfree_skb(skb);
-+ continue;
-+ }
-+ }
++ //netback_probe_accelerators(be, dev);
+
-+ if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
-+ unlikely(skb_linearize(skb))) {
-+ DPRINTK("Can't linearize skb in net_tx_action.\n");
-+ kfree_skb(skb);
-+ continue;
-+ }
++ err = xenbus_switch_state(dev, XenbusStateInitWait);
++ if (err)
++ goto fail;
+
-+ netif_rx(skb);
-+ netif->dev->last_rx = jiffies;
-+ }
++ /* This kicks hotplug scripts, so do it immediately. */
++ backend_create_netif(be);
+
-+ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
-+ !list_empty(&pending_inuse_head)) {
-+ struct netbk_tx_pending_inuse *oldest;
++ return 0;
+
-+ oldest = list_entry(pending_inuse_head.next,
-+ struct netbk_tx_pending_inuse, list);
-+ mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ);
-+ }
++abort_transaction:
++ xenbus_transaction_end(xbt, 1);
++ xenbus_dev_fatal(dev, err, "%s", message);
++fail:
++ DPRINTK("failed");
++ netback_remove(dev);
++ return err;
+}
+
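The do/while (err == -EAGAIN) loop in netback_probe() is the standard xenbus transaction idiom: if another writer races with the transaction, xenbus_transaction_end() fails with -EAGAIN and the whole batch of writes is replayed. Reduced to its core (the key name below is illustrative, not from this driver):

	struct xenbus_transaction xbt;
	int err;

	do {
		err = xenbus_transaction_start(&xbt);
		if (err)
			break;
		err = xenbus_printf(xbt, dev->nodename, "example-key", "%d", 1);
		if (err) {
			xenbus_transaction_end(xbt, 1);	/* abort */
			break;
		}
		err = xenbus_transaction_end(xbt, 0);	/* commit */
	} while (err == -EAGAIN);	/* store changed under us: replay */
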
-+/* Called after netfront has transmitted */
-+static void net_tx_action(unsigned long unused)
-+{
-+ unsigned nr_mops;
-+ int ret;
+
-+ if (dealloc_cons != dealloc_prod)
-+ net_tx_action_dealloc();
++/**
++ * Handle the creation of the hotplug script environment. We add the script
++ * and vif variables to the environment, for the benefit of the vif-* hotplug
++ * scripts.
++ */
++static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env)
++{
++ struct backend_info *be = dev_get_drvdata(&xdev->dev);
++ struct xen_netif *netif = be->netif;
++ char *val;
+
-+ nr_mops = net_tx_build_mops();
++ DPRINTK("netback_uevent");
+
-+ if (nr_mops == 0)
-+ return;
++ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
++ if (IS_ERR(val)) {
++ int err = PTR_ERR(val);
++ xenbus_dev_fatal(xdev, err, "reading script");
++ return err;
++ } else {
++ if (add_uevent_var(env, "script=%s", val)) {
++ kfree(val);
++ return -ENOMEM;
++ }
++ kfree(val);
++ }
+
-+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
-+ tx_map_ops, nr_mops);
-+ BUG_ON(ret);
++ if (add_uevent_var(env, "vif=%s", netif->dev->name))
++ return -ENOMEM;
+
-+ net_tx_submit();
++ return 0;
+}
+
-+static void netif_idx_release(u16 pending_idx)
++
++static void backend_create_netif(struct backend_info *be)
+{
-+ static DEFINE_SPINLOCK(_lock);
-+ unsigned long flags;
++ int err;
++ long handle;
++ struct xenbus_device *dev = be->dev;
+
-+ spin_lock_irqsave(&_lock, flags);
-+ dealloc_ring[pending_index(dealloc_prod)] = pending_idx;
-+ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
-+ smp_wmb();
-+ dealloc_prod++;
-+ spin_unlock_irqrestore(&_lock, flags);
++ if (be->netif != NULL)
++ return;
+
-+ tasklet_schedule(&net_tx_tasklet);
++ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
++ if (err != 1) {
++ xenbus_dev_fatal(dev, err, "reading handle");
++ return;
++ }
++
++ be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
++ if (IS_ERR(be->netif)) {
++ err = PTR_ERR(be->netif);
++ be->netif = NULL;
++ xenbus_dev_fatal(dev, err, "creating interface");
++ return;
++ }
++
++ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
+}
+
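Why err != 1 rather than err < 0 in backend_create_netif() above: xenbus_scanf() follows the sscanf() convention and returns the number of items converted (or a negative errno), so one successfully parsed field yields exactly 1:

	long handle;
	int n;

	n = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
	if (n != 1)
		return;	/* 0 items matched or -errno: node absent/malformed */
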
-+static void netif_page_release(struct page *page, unsigned int order)
++
++static void disconnect_backend(struct xenbus_device *dev)
+{
-+ int idx = netif_page_index(page);
-+ BUG_ON(order);
-+ BUG_ON(idx < 0);
-+ netif_idx_release(idx);
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++ if (be->netif) {
++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
++ netif_disconnect(be->netif);
++ be->netif = NULL;
++ }
+}
+
-+irqreturn_t netif_be_int(int irq, void *dev_id)
++/**
++ * Callback received when the frontend's state changes.
++ */
++static void frontend_changed(struct xenbus_device *dev,
++ enum xenbus_state frontend_state)
+{
-+ struct xen_netif *netif = dev_id;
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
-+ add_to_net_schedule_list_tail(netif);
-+ maybe_schedule_tx_action();
++ DPRINTK("%s", xenbus_strstate(frontend_state));
+
-+ if (netif_schedulable(netif) && !netbk_queue_full(netif))
-+ netif_wake_queue(netif->dev);
++ be->frontend_state = frontend_state;
+
-+ return IRQ_HANDLED;
++ switch (frontend_state) {
++ case XenbusStateInitialising:
++ if (dev->state == XenbusStateClosed) {
++ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++ __FUNCTION__, dev->nodename);
++ xenbus_switch_state(dev, XenbusStateInitWait);
++ }
++ break;
++
++ case XenbusStateInitialised:
++ break;
++
++ case XenbusStateConnected:
++ if (dev->state == XenbusStateConnected)
++ break;
++ backend_create_netif(be);
++ if (be->netif)
++ connect(be);
++ break;
++
++ case XenbusStateClosing:
++ if (be->netif)
++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
++ disconnect_backend(dev);
++ xenbus_switch_state(dev, XenbusStateClosing);
++ break;
++
++ case XenbusStateClosed:
++ xenbus_switch_state(dev, XenbusStateClosed);
++ if (xenbus_dev_is_online(dev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ device_unregister(&dev->dev);
++ break;
++
++ default:
++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++ frontend_state);
++ break;
++ }
+}
+
-+static void make_tx_response(struct xen_netif *netif,
-+ struct xen_netif_tx_request *txp,
-+ s8 st)
++
++static void xen_net_read_rate(struct xenbus_device *dev,
++ unsigned long *bytes, unsigned long *usec)
+{
-+ RING_IDX i = netif->tx.rsp_prod_pvt;
-+ struct xen_netif_tx_response *resp;
-+ int notify;
++ char *s, *e;
++ unsigned long b, u;
++ char *ratestr;
+
-+ resp = RING_GET_RESPONSE(&netif->tx, i);
-+ resp->id = txp->id;
-+ resp->status = st;
++ /* Default to unlimited bandwidth. */
++ *bytes = ~0UL;
++ *usec = 0;
+
-+ if (txp->flags & NETTXF_extra_info)
-+ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
++ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
++ if (IS_ERR(ratestr))
++ return;
++
++ s = ratestr;
++ b = simple_strtoul(s, &e, 10);
++ if ((s == e) || (*e != ','))
++ goto fail;
++
++ s = e + 1;
++ u = simple_strtoul(s, &e, 10);
++ if ((s == e) || (*e != '\0'))
++ goto fail;
++
++ *bytes = b;
++ *usec = u;
+
-+ netif->tx.rsp_prod_pvt = ++i;
-+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
++ kfree(ratestr);
++ return;
+
-+ /*
-+ * netfront_smartpoll_active indicates whether netfront timer
-+ * is active.
-+ */
-+ if ((netif->smart_poll == 1)) {
-+ if (!(netif->rx.sring->netfront_smartpoll_active)) {
-+ notify_remote_via_irq(netif->irq);
-+ netif->rx.sring->netfront_smartpoll_active = 1;
-+ }
-+ } else if (notify)
-+ notify_remote_via_irq(netif->irq);
++ fail:
++ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
++ kfree(ratestr);
+}
+
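The "rate" node parsed by xen_net_read_rate() is a "<bytes>,<usec>" pair: replenish that many bytes of transmit credit every that many microseconds. A worked example (values illustrative):

	unsigned long bytes, usec;

	/* With .../rate = "1000000,20000" this yields bytes = 1000000 and
	 * usec = 20000, i.e. roughly 50 MB/s of credit. A missing or
	 * malformed node keeps the defaults: bytes = ~0UL (unlimited),
	 * usec = 0. */
	xen_net_read_rate(dev, &bytes, &usec);
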
-+static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
-+ u16 id,
-+ s8 st,
-+ u16 offset,
-+ u16 size,
-+ u16 flags)
++static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
-+ RING_IDX i = netif->rx.rsp_prod_pvt;
-+ struct xen_netif_rx_response *resp;
++ char *s, *e, *macstr;
++ int i;
+
-+ resp = RING_GET_RESPONSE(&netif->rx, i);
-+ resp->offset = offset;
-+ resp->flags = flags;
-+ resp->id = id;
-+ resp->status = (s16)size;
-+ if (st < 0)
-+ resp->status = (s16)st;
++ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
++ if (IS_ERR(macstr))
++ return PTR_ERR(macstr);
+
-+ netif->rx.rsp_prod_pvt = ++i;
++ for (i = 0; i < ETH_ALEN; i++) {
++ mac[i] = simple_strtoul(s, &e, 16);
++ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
++ kfree(macstr);
++ return -ENOENT;
++ }
++ s = e+1;
++ }
+
-+ return resp;
++ kfree(macstr);
++ return 0;
+}
+
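xen_net_read_mac() expects the usual six colon-separated hex octets. For example (an illustrative address in the Xen OUI range):

	u8 mac[ETH_ALEN];
	int err;

	/* A "mac" node of "00:16:3e:2a:b3:c4" fills mac[] with
	 * {0x00, 0x16, 0x3e, 0x2a, 0xb3, 0xc4}; any other shape returns
	 * -ENOENT, with mac[] possibly partially written. */
	err = xen_net_read_mac(dev, mac);
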
-+#ifdef NETBE_DEBUG_INTERRUPT
-+static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
++static void unregister_hotplug_status_watch(struct backend_info *be)
+{
-+ struct list_head *ent;
-+ struct xen_netif *netif;
-+ int i = 0;
-+
-+ printk(KERN_ALERT "netif_schedule_list:\n");
-+ spin_lock_irq(&net_schedule_list_lock);
-+
-+ list_for_each (ent, &net_schedule_list) {
-+ netif = list_entry(ent, struct xen_netif, list);
-+ printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
-+ "rx_resp_prod=%08x\n",
-+ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
-+ printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
-+ netif->tx.req_cons, netif->tx.rsp_prod_pvt);
-+ printk(KERN_ALERT " shared(rx_req_prod=%08x "
-+ "rx_resp_prod=%08x\n",
-+ netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
-+ printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
-+ netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
-+ printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
-+ netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
-+ i++;
++ if (be->have_hotplug_status_watch) {
++ unregister_xenbus_watch(&be->hotplug_status_watch);
++ kfree(be->hotplug_status_watch.node);
+ }
++ be->have_hotplug_status_watch = 0;
++}
+
-+ spin_unlock_irq(&net_schedule_list_lock);
-+ printk(KERN_ALERT " ** End of netif_schedule_list **\n");
++static void hotplug_status_changed(struct xenbus_watch *watch,
++ const char **vec,
++ unsigned int vec_size)
++{
++ struct backend_info *be = container_of(watch,
++ struct backend_info,
++ hotplug_status_watch);
++ char *str;
++ unsigned int len;
+
-+ return IRQ_HANDLED;
++ str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
++ if (IS_ERR(str))
++ return;
++ if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
++ xenbus_switch_state(be->dev, XenbusStateConnected);
++ /* Not interested in this watch anymore. */
++ unregister_hotplug_status_watch(be);
++ }
++ kfree(str);
+}
-+#endif
+
-+static int __init netback_init(void)
++static void connect(struct backend_info *be)
+{
-+ int i;
-+ struct page *page;
-+ int rc = 0;
++ int err;
++ struct xenbus_device *dev = be->dev;
+
-+ if (!xen_domain())
-+ return -ENODEV;
++ err = connect_rings(be);
++ if (err)
++ return;
+
-+ /* We can increase reservation by this much in net_rx_action(). */
-+// balloon_update_driver_allowance(NET_RX_RING_SIZE);
++ err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
++ return;
++ }
+
-+ skb_queue_head_init(&rx_queue);
-+ skb_queue_head_init(&tx_queue);
++ xen_net_read_rate(dev, &be->netif->credit_bytes,
++ &be->netif->credit_usec);
++ be->netif->remaining_credit = be->netif->credit_bytes;
+
-+ init_timer(&net_timer);
-+ net_timer.data = 0;
-+ net_timer.function = net_alarm;
++ unregister_hotplug_status_watch(be);
++ err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
++ hotplug_status_changed,
++ "%s/%s", dev->nodename, "hotplug-status");
++ if (err) {
++ /* Switch now, since we can't do a watch. */
++ xenbus_switch_state(dev, XenbusStateConnected);
++ } else {
++ be->have_hotplug_status_watch = 1;
++ }
+
-+ init_timer(&netbk_tx_pending_timer);
-+ netbk_tx_pending_timer.data = 0;
-+ netbk_tx_pending_timer.function = netbk_tx_pending_timeout;
++ netif_wake_queue(be->netif->dev);
++}
+
-+ mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
-+ if (mmap_pages == NULL) {
-+ printk("%s: out of memory\n", __FUNCTION__);
-+ return -ENOMEM;
-+ }
+
-+ for (i = 0; i < MAX_PENDING_REQS; i++) {
-+ page = mmap_pages[i];
-+ SetPageForeign(page, netif_page_release);
-+ netif_set_page_index(page, i);
-+ INIT_LIST_HEAD(&pending_inuse[i].list);
++static int connect_rings(struct backend_info *be)
++{
++ struct xenbus_device *dev = be->dev;
++ unsigned long tx_ring_ref, rx_ring_ref;
++ unsigned int evtchn, rx_copy;
++ int err;
++ int val;
++
++ DPRINTK("");
++
++ err = xenbus_gather(XBT_NIL, dev->otherend,
++ "tx-ring-ref", "%lu", &tx_ring_ref,
++ "rx-ring-ref", "%lu", &rx_ring_ref,
++ "event-channel", "%u", &evtchn, NULL);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "reading %s/ring-ref and event-channel",
++ dev->otherend);
++ return err;
+ }
+
-+ pending_cons = 0;
-+ pending_prod = MAX_PENDING_REQS;
-+ for (i = 0; i < MAX_PENDING_REQS; i++)
-+ pending_ring[i] = i;
++ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
++ &rx_copy);
++ if (err == -ENOENT) {
++ err = 0;
++ rx_copy = 0;
++ }
++ if (err < 0) {
++ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
++ dev->otherend);
++ return err;
++ }
++ if (!rx_copy)
++ return -EOPNOTSUPP;
+
-+ netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
-+ if (MODPARM_copy_skb) {
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
-+ NULL, 0))
-+ netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB;
++ if (be->netif->dev->tx_queue_len != 0) {
++ if (xenbus_scanf(XBT_NIL, dev->otherend,
++ "feature-rx-notify", "%d", &val) < 0)
++ val = 0;
++ if (val)
++ be->netif->can_queue = 1;
+ else
-+ netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB;
++ /* Must be non-zero for pfifo_fast to work. */
++ be->netif->dev->tx_queue_len = 1;
+ }
+
-+ //netif_accel_init();
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
++ val = 0;
++ if (!val) {
++ be->netif->features &= ~NETIF_F_SG;
++ be->netif->dev->features &= ~NETIF_F_SG;
++ if (be->netif->dev->mtu > ETH_DATA_LEN)
++ be->netif->dev->mtu = ETH_DATA_LEN;
++ }
+
-+ rc = netif_xenbus_init();
-+ if (rc)
-+ goto failed_init;
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
++ &val) < 0)
++ val = 0;
++ if (val) {
++ be->netif->features |= NETIF_F_TSO;
++ be->netif->dev->features |= NETIF_F_TSO;
++ }
+
-+#ifdef NETBE_DEBUG_INTERRUPT
-+ (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
-+ 0,
-+ netif_be_dbg,
-+ SA_SHIRQ,
-+ "net-be-dbg",
-+ &netif_be_dbg);
-+#endif
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
++ "%d", &val) < 0)
++ val = 0;
++ if (val) {
++ be->netif->features &= ~NETIF_F_IP_CSUM;
++ be->netif->dev->features &= ~NETIF_F_IP_CSUM;
++ }
+
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
++ "%d", &val) < 0)
++ val = 0;
++ if (val)
++ be->netif->smart_poll = 1;
++ else
++ be->netif->smart_poll = 0;
++
++ /* Map the shared frame, irq etc. */
++ err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "mapping shared-frames %lu/%lu port %u",
++ tx_ring_ref, rx_ring_ref, evtchn);
++ return err;
++ }
+ return 0;
++}
+
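connect_rings() also shows the convention for optional frontend feature flags: a failing xenbus_scanf() just means the frontend never wrote the node, so the flag silently defaults to off rather than failing the connect. The recurring shape (flag name illustrative):

	int val;

	/* Absence of an optional flag is not an error; treat it as 0. */
	if (xenbus_scanf(XBT_NIL, dev->otherend,
			 "feature-example", "%d", &val) < 0)
		val = 0;
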
-+failed_init:
-+ free_empty_pages_and_pagevec(mmap_pages, MAX_PENDING_REQS);
-+ del_timer(&netbk_tx_pending_timer);
-+ del_timer(&net_timer);
-+ return rc;
+
-+}
++/* ** Driver Registration ** */
+
-+module_init(netback_init);
+
-+MODULE_LICENSE("Dual BSD/GPL");
-diff --git a/drivers/xen/netback/xenbus.c b/drivers/xen/netback/xenbus.c
++static const struct xenbus_device_id netback_ids[] = {
++ { "vif" },
++ { "" }
++};
++
++
++static struct xenbus_driver netback = {
++ .name = "vif",
++ .owner = THIS_MODULE,
++ .ids = netback_ids,
++ .probe = netback_probe,
++ .remove = netback_remove,
++ .uevent = netback_uevent,
++ .otherend_changed = frontend_changed,
++};
++
++
++int netif_xenbus_init(void)
++{
++ printk(KERN_INFO "registering netback\n");
++ return xenbus_register_backend(&netback);
++}
+diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
new file mode 100644
-index 0000000..70636d0
+index 0000000..ae693e7
--- /dev/null
-+++ b/drivers/xen/netback/xenbus.c
-@@ -0,0 +1,523 @@
-+/* Xenbus code for netif backend
-+ Copyright (C) 2005 Rusty Russell <rusty at rustcorp.com.au>
-+ Copyright (C) 2005 XenSource Ltd
++++ b/drivers/xen/pci.c
+@@ -0,0 +1,124 @@
++/*
++ * Copyright (c) 2009, Intel Corporation.
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms and conditions of the GNU General Public License,
++ * version 2, as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope it will be useful, but WITHOUT
++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ * more details.
++ *
++ * You should have received a copy of the GNU General Public License along with
++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++ * Place - Suite 330, Boston, MA 02111-1307 USA.
++ *
++ * Author: Weidong Han <weidong.han at intel.com>
++ */
+
-+ This program is free software; you can redistribute it and/or modify
-+ it under the terms of the GNU General Public License as published by
-+ the Free Software Foundation; either version 2 of the License, or
-+ (at your option) any later version.
++#include <linux/pci.h>
+
-+ This program is distributed in the hope that it will be useful,
-+ but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ GNU General Public License for more details.
++#include <xen/interface/xen.h>
++#include <xen/interface/physdev.h>
+
-+ You should have received a copy of the GNU General Public License
-+ along with this program; if not, write to the Free Software
-+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-+*/
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
+
-+#include <stdarg.h>
-+#include <linux/module.h>
-+#include <xen/xenbus.h>
-+#include "common.h"
++#include "../pci/pci.h"
+
-+#if 0
-+#undef DPRINTK
-+#define DPRINTK(fmt, args...) \
-+ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
++
++#ifdef CONFIG_PCI_IOV
++#define HANDLE_PCI_IOV 1
++#else
++#define HANDLE_PCI_IOV 0
+#endif
+
++static int xen_add_device(struct device *dev)
++{
++ int r;
++ struct pci_dev *pci_dev = to_pci_dev(dev);
+
-+static int connect_rings(struct backend_info *);
-+static void connect(struct backend_info *);
-+static void backend_create_netif(struct backend_info *be);
-+static void unregister_hotplug_status_watch(struct backend_info *be);
++ if (HANDLE_PCI_IOV && pci_dev->is_virtfn) {
++ struct physdev_manage_pci_ext manage_pci_ext = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ .is_virtfn = 1,
++#ifdef CONFIG_PCI_IOV
++ .physfn.bus = pci_dev->physfn->bus->number,
++ .physfn.devfn = pci_dev->physfn->devfn,
++#endif
++ };
+
-+static int netback_remove(struct xenbus_device *dev)
-+{
-+ struct backend_info *be = dev_get_drvdata(&dev->dev);
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
++ &manage_pci_ext);
++ } else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
++ struct physdev_manage_pci_ext manage_pci_ext = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ .is_extfn = 1,
++ };
+
-+ //netback_remove_accelerators(be, dev);
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
++ &manage_pci_ext);
++ } else {
++ struct physdev_manage_pci manage_pci = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ };
+
-+ unregister_hotplug_status_watch(be);
-+ if (be->netif) {
-+ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
-+ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
-+ netif_disconnect(be->netif);
-+ be->netif = NULL;
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
++ &manage_pci);
+ }
-+ kfree(be);
-+ dev_set_drvdata(&dev->dev, NULL);
-+ return 0;
-+}
+
++ return r;
++}
+
-+/**
-+ * Entry point to this code when a new device is created. Allocate the basic
-+ * structures and switch to InitWait.
-+ */
-+static int netback_probe(struct xenbus_device *dev,
-+ const struct xenbus_device_id *id)
++static int xen_remove_device(struct device *dev)
+{
-+ const char *message;
-+ struct xenbus_transaction xbt;
-+ int err;
-+ int sg;
-+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
-+ GFP_KERNEL);
-+ if (!be) {
-+ xenbus_dev_fatal(dev, -ENOMEM,
-+ "allocating backend structure");
-+ return -ENOMEM;
-+ }
++ int r;
++ struct pci_dev *pci_dev = to_pci_dev(dev);
++ struct physdev_manage_pci manage_pci;
+
-+ be->dev = dev;
-+ dev_set_drvdata(&dev->dev, be);
++ manage_pci.bus = pci_dev->bus->number;
++ manage_pci.devfn = pci_dev->devfn;
+
-+ sg = 1;
-+ if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
-+ sg = 0;
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
++ &manage_pci);
+
-+ do {
-+ err = xenbus_transaction_start(&xbt);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "starting transaction");
-+ goto fail;
-+ }
++ return r;
++}
+
-+ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
-+ if (err) {
-+ message = "writing feature-sg";
-+ goto abort_transaction;
-+ }
++static int xen_pci_notifier(struct notifier_block *nb,
++ unsigned long action, void *data)
++{
++ struct device *dev = data;
++ int r = 0;
+
-+ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
-+ "%d", sg);
-+ if (err) {
-+ message = "writing feature-gso-tcpv4";
-+ goto abort_transaction;
-+ }
++ switch (action) {
++ case BUS_NOTIFY_ADD_DEVICE:
++ r = xen_add_device(dev);
++ break;
++ case BUS_NOTIFY_DEL_DEVICE:
++ r = xen_remove_device(dev);
++ break;
++ default:
++ break;
++ }
+
-+ /* We support rx-copy path. */
-+ err = xenbus_printf(xbt, dev->nodename,
-+ "feature-rx-copy", "%d", 1);
-+ if (err) {
-+ message = "writing feature-rx-copy";
-+ goto abort_transaction;
-+ }
++ return r;
++}
+
-+ /*
-+ * We don't support rx-flip path (except old guests who don't
-+ * grok this feature flag).
-+ */
-+ err = xenbus_printf(xbt, dev->nodename,
-+ "feature-rx-flip", "%d", 0);
-+ if (err) {
-+ message = "writing feature-rx-flip";
-+ goto abort_transaction;
-+ }
++static struct notifier_block device_nb = {
++ .notifier_call = xen_pci_notifier,
++};
+
-+ /* We support data smart poll mechanism */
-+ err = xenbus_printf(xbt, dev->nodename,
-+ "feature-smart-poll", "%d", 1);
-+ if (err) {
-+ message = "writing feature-smart-poll";
-+ goto abort_transaction;
-+ }
++static int __init register_xen_pci_notifier(void)
++{
++ if (!xen_pv_domain())
++ return 0;
+
-+ err = xenbus_transaction_end(xbt, 0);
-+ } while (err == -EAGAIN);
++ return bus_register_notifier(&pci_bus_type, &device_nb);
++}
+
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "completing transaction");
-+ goto fail;
-+ }
++arch_initcall(register_xen_pci_notifier);
+diff --git a/drivers/xen/pciback/Makefile b/drivers/xen/pciback/Makefile
+new file mode 100644
+index 0000000..38bc123
+--- /dev/null
++++ b/drivers/xen/pciback/Makefile
+@@ -0,0 +1,17 @@
++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
+
-+ //netback_probe_accelerators(be, dev);
++xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
++xen-pciback-y += conf_space.o conf_space_header.o \
++ conf_space_capability.o \
++ conf_space_capability_vpd.o \
++ conf_space_capability_pm.o \
++ conf_space_quirks.o
++xen-pciback-$(CONFIG_PCI_MSI) += conf_space_capability_msi.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o
+
-+ err = xenbus_switch_state(dev, XenbusStateInitWait);
-+ if (err)
-+ goto fail;
++ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
++EXTRA_CFLAGS += -DDEBUG
++endif
+diff --git a/drivers/xen/pciback/conf_space.c b/drivers/xen/pciback/conf_space.c
+new file mode 100644
+index 0000000..370c18e
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space.c
+@@ -0,0 +1,435 @@
++/*
++ * PCI Backend - Functions for creating a virtual configuration space for
++ * exported PCI Devices.
++ * It's dangerous to allow PCI Driver Domains to change their
++ * device's resources (memory, i/o ports, interrupts). We need to
++ * restrict changes to certain PCI Configuration registers:
++ * BARs, INTERRUPT_PIN, most registers in the header...
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
+
-+ /* This kicks hotplug scripts, so do it immediately. */
-+ backend_create_netif(be);
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
+
-+ return 0;
++static int permissive;
++module_param(permissive, bool, 0644);
+
-+abort_transaction:
-+ xenbus_transaction_end(xbt, 1);
-+ xenbus_dev_fatal(dev, err, "%s", message);
-+fail:
-+ DPRINTK("failed");
-+ netback_remove(dev);
-+ return err;
++#define DEFINE_PCI_CONFIG(op, size, type) \
++int pciback_##op##_config_##size \
++(struct pci_dev *dev, int offset, type value, void *data) \
++{ \
++ return pci_##op##_config_##size(dev, offset, value); \
+}
+
++DEFINE_PCI_CONFIG(read, byte, u8 *)
++DEFINE_PCI_CONFIG(read, word, u16 *)
++DEFINE_PCI_CONFIG(read, dword, u32 *)
+
-+/**
-+ * Handle the creation of the hotplug script environment. We add the script
-+ * and vif variables to the environment, for the benefit of the vif-* hotplug
-+ * scripts.
-+ */
-+static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env)
++DEFINE_PCI_CONFIG(write, byte, u8)
++DEFINE_PCI_CONFIG(write, word, u16)
++DEFINE_PCI_CONFIG(write, dword, u32)
++
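For readers untangling the token pasting: each DEFINE_PCI_CONFIG() instance is a thin pass-through wrapper over the generic kernel accessor. For instance, DEFINE_PCI_CONFIG(read, byte, u8 *) expands to:

	int pciback_read_config_byte
	(struct pci_dev *dev, int offset, u8 *value, void *data)
	{
		return pci_read_config_byte(dev, offset, value);
	}
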
++static int conf_space_read(struct pci_dev *dev,
++ const struct config_field_entry *entry,
++ int offset, u32 *value)
+{
-+ struct backend_info *be = dev_get_drvdata(&xdev->dev);
-+ struct xen_netif *netif = be->netif;
-+ char *val;
++ int ret = 0;
++ const struct config_field *field = entry->field;
+
-+ DPRINTK("netback_uevent");
++ *value = 0;
+
-+ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
-+ if (IS_ERR(val)) {
-+ int err = PTR_ERR(val);
-+ xenbus_dev_fatal(xdev, err, "reading script");
-+ return err;
-+ }
-+ else {
-+ if (add_uevent_var(env, "script=%s", val)) {
-+ kfree(val);
-+ return -ENOMEM;
-+ }
-+ kfree(val);
++ switch (field->size) {
++ case 1:
++ if (field->u.b.read)
++ ret = field->u.b.read(dev, offset, (u8 *) value,
++ entry->data);
++ break;
++ case 2:
++ if (field->u.w.read)
++ ret = field->u.w.read(dev, offset, (u16 *) value,
++ entry->data);
++ break;
++ case 4:
++ if (field->u.dw.read)
++ ret = field->u.dw.read(dev, offset, value, entry->data);
++ break;
+ }
++ return ret;
++}
+
-+ if (add_uevent_var(env, "vif=%s", netif->dev->name))
-+ return -ENOMEM;
++static int conf_space_write(struct pci_dev *dev,
++ const struct config_field_entry *entry,
++ int offset, u32 value)
++{
++ int ret = 0;
++ const struct config_field *field = entry->field;
+
-+ return 0;
++ switch (field->size) {
++ case 1:
++ if (field->u.b.write)
++ ret = field->u.b.write(dev, offset, (u8) value,
++ entry->data);
++ break;
++ case 2:
++ if (field->u.w.write)
++ ret = field->u.w.write(dev, offset, (u16) value,
++ entry->data);
++ break;
++ case 4:
++ if (field->u.dw.write)
++ ret = field->u.dw.write(dev, offset, value,
++ entry->data);
++ break;
++ }
++ return ret;
+}
+
-+
-+static void backend_create_netif(struct backend_info *be)
++static inline u32 get_mask(int size)
+{
-+ int err;
-+ long handle;
-+ struct xenbus_device *dev = be->dev;
-+
-+ if (be->netif != NULL)
-+ return;
++ if (size == 1)
++ return 0xff;
++ else if (size == 2)
++ return 0xffff;
++ else
++ return 0xffffffff;
++}
+
-+ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
-+ if (err != 1) {
-+ xenbus_dev_fatal(dev, err, "reading handle");
-+ return;
-+ }
++static inline int valid_request(int offset, int size)
++{
++ /* Validate request (no un-aligned requests) */
++ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
++ return 1;
++ return 0;
++}
+
-+ be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
-+ if (IS_ERR(be->netif)) {
-+ err = PTR_ERR(be->netif);
-+ be->netif = NULL;
-+ xenbus_dev_fatal(dev, err, "creating interface");
-+ return;
++static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
++ int offset)
++{
++ if (offset >= 0) {
++ new_val_mask <<= (offset * 8);
++ new_val <<= (offset * 8);
++ } else {
++ new_val_mask >>= (offset * -8);
++ new_val >>= (offset * -8);
+ }
++ val = (val & ~new_val_mask) | (new_val & new_val_mask);
+
-+ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
++ return val;
+}
+
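merge_value() splices a sized access into a 32-bit window; offset is the byte distance between field and request and may be negative when the request starts inside the field. A worked example (values illustrative):

	/* Writing the 16-bit value 0xbeef at byte offset 2 of an existing
	 * dword 0x11223344:
	 *
	 *   merge_value(0x11223344, 0xbeef, get_mask(2), 2)
	 *     mask    = 0x0000ffff << 16 = 0xffff0000
	 *     new_val = 0x0000beef << 16 = 0xbeef0000
	 *     result  = (0x11223344 & ~0xffff0000) | 0xbeef0000 = 0xbeef3344
	 */
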
-+
-+static void disconnect_backend(struct xenbus_device *dev)
++static int pcibios_err_to_errno(int err)
+{
-+ struct backend_info *be = dev_get_drvdata(&dev->dev);
-+
-+ if (be->netif) {
-+ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
-+ netif_disconnect(be->netif);
-+ be->netif = NULL;
++ switch (err) {
++ case PCIBIOS_SUCCESSFUL:
++ return XEN_PCI_ERR_success;
++ case PCIBIOS_DEVICE_NOT_FOUND:
++ return XEN_PCI_ERR_dev_not_found;
++ case PCIBIOS_BAD_REGISTER_NUMBER:
++ return XEN_PCI_ERR_invalid_offset;
++ case PCIBIOS_FUNC_NOT_SUPPORTED:
++ return XEN_PCI_ERR_not_implemented;
++ case PCIBIOS_SET_FAILED:
++ return XEN_PCI_ERR_access_denied;
+ }
++ return err;
+}
+
-+/**
-+ * Callback received when the frontend's state changes.
-+ */
-+static void frontend_changed(struct xenbus_device *dev,
-+ enum xenbus_state frontend_state)
++int pciback_config_read(struct pci_dev *dev, int offset, int size,
++ u32 *ret_val)
+{
-+ struct backend_info *be = dev_get_drvdata(&dev->dev);
-+
-+ DPRINTK("%s", xenbus_strstate(frontend_state));
-+
-+ be->frontend_state = frontend_state;
++ int err = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ const struct config_field_entry *cfg_entry;
++ const struct config_field *field;
++ int req_start, req_end, field_start, field_end;
++ /* if read fails for any reason, return 0
++ * (as if device didn't respond) */
++ u32 value = 0, tmp_val;
+
-+ switch (frontend_state) {
-+ case XenbusStateInitialising:
-+ if (dev->state == XenbusStateClosed) {
-+ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
-+ __FUNCTION__, dev->nodename);
-+ xenbus_switch_state(dev, XenbusStateInitWait);
-+ }
-+ break;
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
++ pci_name(dev), size, offset);
+
-+ case XenbusStateInitialised:
-+ break;
++ if (!valid_request(offset, size)) {
++ err = XEN_PCI_ERR_invalid_offset;
++ goto out;
++ }
+
-+ case XenbusStateConnected:
-+ if (dev->state == XenbusStateConnected)
-+ break;
-+ backend_create_netif(be);
-+ if (be->netif)
-+ connect(be);
++ /* Get the real value first, then modify as appropriate */
++ switch (size) {
++ case 1:
++ err = pci_read_config_byte(dev, offset, (u8 *) &value);
+ break;
-+
-+ case XenbusStateClosing:
-+ if (be->netif)
-+ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
-+ disconnect_backend(dev);
-+ xenbus_switch_state(dev, XenbusStateClosing);
++ case 2:
++ err = pci_read_config_word(dev, offset, (u16 *) &value);
+ break;
-+
-+ case XenbusStateClosed:
-+ xenbus_switch_state(dev, XenbusStateClosed);
-+ if (xenbus_dev_is_online(dev))
-+ break;
-+ /* fall through if not online */
-+ case XenbusStateUnknown:
-+ device_unregister(&dev->dev);
++ case 4:
++ err = pci_read_config_dword(dev, offset, &value);
+ break;
++ }
+
-+ default:
-+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
-+ frontend_state);
-+ break;
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ req_start = offset;
++ req_end = offset + size;
++ field_start = OFFSET(cfg_entry);
++ field_end = OFFSET(cfg_entry) + field->size;
++
++ if ((req_start >= field_start && req_start < field_end)
++ || (req_end > field_start && req_end <= field_end)) {
++ err = conf_space_read(dev, cfg_entry, field_start,
++ &tmp_val);
++ if (err)
++ goto out;
++
++ value = merge_value(value, tmp_val,
++ get_mask(field->size),
++ field_start - req_start);
++ }
+ }
-+}
+
++out:
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
++ pci_name(dev), size, offset, value);
+
-+static void xen_net_read_rate(struct xenbus_device *dev,
-+ unsigned long *bytes, unsigned long *usec)
-+{
-+ char *s, *e;
-+ unsigned long b, u;
-+ char *ratestr;
++ *ret_val = value;
++ return pcibios_err_to_errno(err);
++}
+
-+ /* Default to unlimited bandwidth. */
-+ *bytes = ~0UL;
-+ *usec = 0;
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
++{
++ int err = 0, handled = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ const struct config_field_entry *cfg_entry;
++ const struct config_field *field;
++ u32 tmp_val;
++ int req_start, req_end, field_start, field_end;
+
-+ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
-+ if (IS_ERR(ratestr))
-+ return;
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG
++ "pciback: %s: write request %d bytes at 0x%x = %x\n",
++ pci_name(dev), size, offset, value);
+
-+ s = ratestr;
-+ b = simple_strtoul(s, &e, 10);
-+ if ((s == e) || (*e != ','))
-+ goto fail;
++ if (!valid_request(offset, size))
++ return XEN_PCI_ERR_invalid_offset;
+
-+ s = e + 1;
-+ u = simple_strtoul(s, &e, 10);
-+ if ((s == e) || (*e != '\0'))
-+ goto fail;
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
+
-+ *bytes = b;
-+ *usec = u;
++ req_start = offset;
++ req_end = offset + size;
++ field_start = OFFSET(cfg_entry);
++ field_end = OFFSET(cfg_entry) + field->size;
+
-+ kfree(ratestr);
-+ return;
++ if ((req_start >= field_start && req_start < field_end)
++ || (req_end > field_start && req_end <= field_end)) {
++ tmp_val = 0;
+
-+ fail:
-+ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
-+ kfree(ratestr);
-+}
++ err = pciback_config_read(dev, field_start,
++ field->size, &tmp_val);
++ if (err)
++ break;
+
-+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
-+{
-+ char *s, *e, *macstr;
-+ int i;
++ tmp_val = merge_value(tmp_val, value, get_mask(size),
++ req_start - field_start);
+
-+ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
-+ if (IS_ERR(macstr))
-+ return PTR_ERR(macstr);
++ err = conf_space_write(dev, cfg_entry, field_start,
++ tmp_val);
+
-+ for (i = 0; i < ETH_ALEN; i++) {
-+ mac[i] = simple_strtoul(s, &e, 16);
-+ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
-+ kfree(macstr);
-+ return -ENOENT;
++ /* handled is set true here, but not every byte
++ * may have been written! Properly detecting if
++ * every byte is handled is unnecessary as the
++ * flag is used to detect devices that need
++ * special helpers to work correctly.
++ */
++ handled = 1;
+ }
-+ s = e+1;
+ }
+
-+ kfree(macstr);
-+ return 0;
-+}
-+
-+static void unregister_hotplug_status_watch(struct backend_info *be)
-+{
-+ if (be->have_hotplug_status_watch) {
-+ unregister_xenbus_watch(&be->hotplug_status_watch);
-+ kfree(be->hotplug_status_watch.node);
++ if (!handled && !err) {
++ /* By default, anything not specifically handled above is
++ * read-only. The permissive flag changes this behavior so
++ * that anything not specifically handled above is writable.
++ * This means that some fields may still be read-only because
++ * they have entries in the config_field list that intercept
++ * the write and do nothing. */
++ if (dev_data->permissive || permissive) {
++ switch (size) {
++ case 1:
++ err = pci_write_config_byte(dev, offset,
++ (u8) value);
++ break;
++ case 2:
++ err = pci_write_config_word(dev, offset,
++ (u16) value);
++ break;
++ case 4:
++ err = pci_write_config_dword(dev, offset,
++ (u32) value);
++ break;
++ }
++ } else if (!dev_data->warned_on_write) {
++ dev_data->warned_on_write = 1;
++ dev_warn(&dev->dev, "Driver tried to write to a "
++ "read-only configuration space field at offset"
++ " 0x%x, size %d. This may be harmless, but if "
++ "you have problems with your device:\n"
++ "1) see permissive attribute in sysfs\n"
++ "2) report problems to the xen-devel "
++ "mailing list along with details of your "
++ "device obtained from lspci.\n", offset, size);
++ }
+ }
-+ be->have_hotplug_status_watch = 0;
++
++ return pcibios_err_to_errno(err);
+}
+
-+static void hotplug_status_changed(struct xenbus_watch *watch,
-+ const char **vec,
-+ unsigned int vec_size)
++void pciback_config_free_dyn_fields(struct pci_dev *dev)
+{
-+ struct backend_info *be = container_of(watch,
-+ struct backend_info,
-+ hotplug_status_watch);
-+ char *str;
-+ unsigned int len;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry, *t;
++ const struct config_field *field;
+
-+ str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
-+ if (IS_ERR(str))
++ dev_dbg(&dev->dev, "free-ing dynamically allocated virtual "
++ "configuration space fields\n");
++ if (!dev_data)
+ return;
-+ if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
-+ xenbus_switch_state(be->dev, XenbusStateConnected);
-+ /* Not interested in this watch anymore. */
-+ unregister_hotplug_status_watch(be);
++
++ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ if (field->clean) {
++ field->clean((struct config_field *)field);
++
++ kfree(cfg_entry->data);
++
++ list_del(&cfg_entry->list);
++ kfree(cfg_entry);
++ }
++
+ }
-+ kfree(str);
+}
+
-+static void connect(struct backend_info *be)
++void pciback_config_reset_dev(struct pci_dev *dev)
+{
-+ int err;
-+ struct xenbus_device *dev = be->dev;
-+
-+ err = connect_rings(be);
-+ if (err)
-+ return;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ const struct config_field_entry *cfg_entry;
++ const struct config_field *field;
+
-+ err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
++ dev_dbg(&dev->dev, "resetting virtual configuration space\n");
++ if (!dev_data)
+ return;
-+ }
+
-+ xen_net_read_rate(dev, &be->netif->credit_bytes,
-+ &be->netif->credit_usec);
-+ be->netif->remaining_credit = be->netif->credit_bytes;
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
+
-+ unregister_hotplug_status_watch(be);
-+ err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
-+ hotplug_status_changed,
-+ "%s/%s", dev->nodename, "hotplug-status");
-+ if (err) {
-+ /* Switch now, since we can't do a watch. */
-+ xenbus_switch_state(dev, XenbusStateConnected);
-+ } else {
-+ be->have_hotplug_status_watch = 1;
++ if (field->reset)
++ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
+ }
-+
-+ netif_wake_queue(be->netif->dev);
+}
+
-+
-+static int connect_rings(struct backend_info *be)
++void pciback_config_free_dev(struct pci_dev *dev)
+{
-+ struct xenbus_device *dev = be->dev;
-+ unsigned long tx_ring_ref, rx_ring_ref;
-+ unsigned int evtchn, rx_copy;
-+ int err;
-+ int val;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry, *t;
++ const struct config_field *field;
++
++ dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
++ if (!dev_data)
++ return;
++
++ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
++ list_del(&cfg_entry->list);
++
++ field = cfg_entry->field;
+
-+ DPRINTK("");
++ if (field->release)
++ field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
+
-+ err = xenbus_gather(XBT_NIL, dev->otherend,
-+ "tx-ring-ref", "%lu", &tx_ring_ref,
-+ "rx-ring-ref", "%lu", &rx_ring_ref,
-+ "event-channel", "%u", &evtchn, NULL);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err,
-+ "reading %s/ring-ref and event-channel",
-+ dev->otherend);
-+ return err;
++ kfree(cfg_entry);
+ }
++}
+
-+ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
-+ &rx_copy);
-+ if (err == -ENOENT) {
-+ err = 0;
-+ rx_copy = 0;
-+ }
-+ if (err < 0) {
-+ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
-+ dev->otherend);
-+ return err;
-+ }
-+ if (!rx_copy)
-+ return -EOPNOTSUPP;
++int pciback_config_add_field_offset(struct pci_dev *dev,
++ const struct config_field *field,
++ unsigned int base_offset)
++{
++ int err = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry;
++ void *tmp;
+
-+ if (be->netif->dev->tx_queue_len != 0) {
-+ if (xenbus_scanf(XBT_NIL, dev->otherend,
-+ "feature-rx-notify", "%d", &val) < 0)
-+ val = 0;
-+ if (val)
-+ be->netif->can_queue = 1;
-+ else
-+ /* Must be non-zero for pfifo_fast to work. */
-+ be->netif->dev->tx_queue_len = 1;
++ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
++ if (!cfg_entry) {
++ err = -ENOMEM;
++ goto out;
+ }
+
-+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
-+ val = 0;
-+ if (!val) {
-+ be->netif->features &= ~NETIF_F_SG;
-+ be->netif->dev->features &= ~NETIF_F_SG;
-+ if (be->netif->dev->mtu > ETH_DATA_LEN)
-+ be->netif->dev->mtu = ETH_DATA_LEN;
-+ }
++ cfg_entry->data = NULL;
++ cfg_entry->field = field;
++ cfg_entry->base_offset = base_offset;
+
-+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
-+ &val) < 0)
-+ val = 0;
-+ if (val) {
-+ be->netif->features |= NETIF_F_TSO;
-+ be->netif->dev->features |= NETIF_F_TSO;
-+ }
++ /* silently ignore duplicate fields */
++ err = pciback_field_is_dup(dev, OFFSET(cfg_entry));
++ if (err)
++ goto out;
+
-+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
-+ "%d", &val) < 0)
-+ val = 0;
-+ if (val) {
-+ be->netif->features &= ~NETIF_F_IP_CSUM;
-+ be->netif->dev->features &= ~NETIF_F_IP_CSUM;
-+ }
++ if (field->init) {
++ tmp = field->init(dev, OFFSET(cfg_entry));
+
-+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
-+ "%d", &val) < 0)
-+ val = 0;
-+ if (val)
-+ be->netif->smart_poll = 1;
-+ else
-+ be->netif->smart_poll = 0;
++ if (IS_ERR(tmp)) {
++ err = PTR_ERR(tmp);
++ goto out;
++ }
+
-+ /* Map the shared frame, irq etc. */
-+ err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err,
-+ "mapping shared-frames %lu/%lu port %u",
-+ tx_ring_ref, rx_ring_ref, evtchn);
-+ return err;
++ cfg_entry->data = tmp;
+ }
-+ return 0;
++
++ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
++ OFFSET(cfg_entry));
++ list_add_tail(&cfg_entry->list, &dev_data->config_fields);
++
++out:
++ if (err)
++ kfree(cfg_entry);
++
++ return err;
+}
+
++/* This sets up the device's virtual configuration space to keep track of
++ * certain registers (like the base address registers (BARs) so that we can
++ * keep the client from manipulating them directly.
++ */
++int pciback_config_init_dev(struct pci_dev *dev)
++{
++ int err = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+
-+/* ** Driver Registration ** */
++ dev_dbg(&dev->dev, "initializing virtual configuration space\n");
+
++ INIT_LIST_HEAD(&dev_data->config_fields);
+
-+static const struct xenbus_device_id netback_ids[] = {
-+ { "vif" },
-+ { "" }
-+};
++ err = pciback_config_header_add_fields(dev);
++ if (err)
++ goto out;
+
++ err = pciback_config_capability_add_fields(dev);
++ if (err)
++ goto out;
+
-+static struct xenbus_driver netback = {
-+ .name = "vif",
-+ .owner = THIS_MODULE,
-+ .ids = netback_ids,
-+ .probe = netback_probe,
-+ .remove = netback_remove,
-+ .uevent = netback_uevent,
-+ .otherend_changed = frontend_changed,
-+};
++ err = pciback_config_quirks_init(dev);
+
++out:
++ return err;
++}
+
-+int netif_xenbus_init(void)
++int pciback_config_init(void)
+{
-+ printk(KERN_CRIT "registering netback\n");
-+ return xenbus_register_backend(&netback);
++ return pciback_config_capability_init();
+}
-diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
+diff --git a/drivers/xen/pciback/conf_space.h b/drivers/xen/pciback/conf_space.h
new file mode 100644
-index 0000000..ae693e7
+index 0000000..50ebef2
--- /dev/null
-+++ b/drivers/xen/pci.c
-@@ -0,0 +1,124 @@
++++ b/drivers/xen/pciback/conf_space.h
+@@ -0,0 +1,126 @@
+/*
-+ * Copyright (c) 2009, Intel Corporation.
-+ *
-+ * This program is free software; you can redistribute it and/or modify it
-+ * under the terms and conditions of the GNU General Public License,
-+ * version 2, as published by the Free Software Foundation.
-+ *
-+ * This program is distributed in the hope it will be useful, but WITHOUT
-+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
-+ * more details.
-+ *
-+ * You should have received a copy of the GNU General Public License along with
-+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-+ * Place - Suite 330, Boston, MA 02111-1307 USA.
++ * PCI Backend - Common data structures for overriding the configuration space
+ *
-+ * Author: Weidong Han <weidong.han at intel.com>
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
+ */
+
-+#include <linux/pci.h>
-+
-+#include <xen/interface/xen.h>
-+#include <xen/interface/physdev.h>
-+
-+#include <asm/xen/hypervisor.h>
-+#include <asm/xen/hypercall.h>
-+
-+#include "../pci/pci.h"
-+
++#ifndef __XEN_PCIBACK_CONF_SPACE_H__
++#define __XEN_PCIBACK_CONF_SPACE_H__
+
-+#ifdef CONFIG_PCI_IOV
-+#define HANDLE_PCI_IOV 1
-+#else
-+#define HANDLE_PCI_IOV 0
-+#endif
++#include <linux/list.h>
++#include <linux/err.h>
+
-+static int xen_add_device(struct device *dev)
-+{
-+ int r;
-+ struct pci_dev *pci_dev = to_pci_dev(dev);
++/* conf_field_init can return an errno in a ptr with ERR_PTR() */
++typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
++typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
++typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
+
-+ if (HANDLE_PCI_IOV && pci_dev->is_virtfn) {
-+ struct physdev_manage_pci_ext manage_pci_ext = {
-+ .bus = pci_dev->bus->number,
-+ .devfn = pci_dev->devfn,
-+ .is_virtfn = 1,
-+#ifdef CONFIG_PCI_IOV
-+ .physfn.bus = pci_dev->physfn->bus->number,
-+ .physfn.devfn = pci_dev->physfn->devfn,
-+#endif
-+ };
++typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
++ void *data);
++typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
++ void *data);
++typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
++ void *data);
++typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
++ void *data);
++typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
++ void *data);
++typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
++ void *data);
+
-+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
-+ &manage_pci_ext);
-+ } else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
-+ struct physdev_manage_pci_ext manage_pci_ext = {
-+ .bus = pci_dev->bus->number,
-+ .devfn = pci_dev->devfn,
-+ .is_extfn = 1,
-+ };
++/* These describe the fields within the configuration space whose
++ * reads and writes we intercept and whose exposed values we may
++ * change.
++ */
++struct config_field {
++ unsigned int offset;
++ unsigned int size;
++ unsigned int mask;
++ conf_field_init init;
++ conf_field_reset reset;
++ conf_field_free release;
++ void (*clean) (struct config_field *field);
++ union {
++ struct {
++ conf_dword_write write;
++ conf_dword_read read;
++ } dw;
++ struct {
++ conf_word_write write;
++ conf_word_read read;
++ } w;
++ struct {
++ conf_byte_write write;
++ conf_byte_read read;
++ } b;
++ } u;
++ struct list_head list;
++};
+
-+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
-+ &manage_pci_ext);
-+ } else {
-+ struct physdev_manage_pci manage_pci = {
-+ .bus = pci_dev->bus->number,
-+ .devfn = pci_dev->devfn,
-+ };
++struct config_field_entry {
++ struct list_head list;
++ const struct config_field *field;
++ unsigned int base_offset;
++ void *data;
++};
+
-+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
-+ &manage_pci);
-+ }
++#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
+
-+ return r;
-+}
++/* Add fields to a device - the add_fields helpers expect a pointer to
++ * the first entry in an array whose end is marked by an entry with size==0
++ */
++int pciback_config_add_field_offset(struct pci_dev *dev,
++ const struct config_field *field,
++ unsigned int offset);
+
-+static int xen_remove_device(struct device *dev)
++static inline int pciback_config_add_field(struct pci_dev *dev,
++ const struct config_field *field)
+{
-+ int r;
-+ struct pci_dev *pci_dev = to_pci_dev(dev);
-+ struct physdev_manage_pci manage_pci;
-+
-+ manage_pci.bus = pci_dev->bus->number;
-+ manage_pci.devfn = pci_dev->devfn;
-+
-+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
-+ &manage_pci);
-+
-+ return r;
++ return pciback_config_add_field_offset(dev, field, 0);
+}
+
-+static int xen_pci_notifier(struct notifier_block *nb,
-+ unsigned long action, void *data)
++static inline int pciback_config_add_fields(struct pci_dev *dev,
++ const struct config_field *field)
+{
-+ struct device *dev = data;
-+ int r = 0;
-+
-+ switch (action) {
-+ case BUS_NOTIFY_ADD_DEVICE:
-+ r = xen_add_device(dev);
-+ break;
-+ case BUS_NOTIFY_DEL_DEVICE:
-+ r = xen_remove_device(dev);
-+ break;
-+ default:
-+ break;
++ int i, err = 0;
++ for (i = 0; field[i].size != 0; i++) {
++ err = pciback_config_add_field(dev, &field[i]);
++ if (err)
++ break;
+ }
-+
-+ return r;
++ return err;
+}
+
-+struct notifier_block device_nb = {
-+ .notifier_call = xen_pci_notifier,
-+};
-+
-+static int __init register_xen_pci_notifier(void)
++static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
++ const struct config_field *field,
++ unsigned int offset)
+{
-+ if (!xen_pv_domain())
-+ return 0;
-+
-+ return bus_register_notifier(&pci_bus_type, &device_nb);
++ int i, err = 0;
++ for (i = 0; field[i].size != 0; i++) {
++ err = pciback_config_add_field_offset(dev, &field[i], offset);
++ if (err)
++ break;
++ }
++ return err;
+}
+
-+arch_initcall(register_xen_pci_notifier);
-diff --git a/drivers/xen/pciback/Makefile b/drivers/xen/pciback/Makefile
-new file mode 100644
-index 0000000..38bc123
---- /dev/null
-+++ b/drivers/xen/pciback/Makefile
-@@ -0,0 +1,17 @@
-+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
++/* Read/Write the real configuration space */
++int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 *value,
++ void *data);
++int pciback_read_config_word(struct pci_dev *dev, int offset, u16 *value,
++ void *data);
++int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 *value,
++ void *data);
++int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
++ void *data);
++int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
++ void *data);
++int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
++ void *data);
+
-+xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
-+xen-pciback-y += conf_space.o conf_space_header.o \
-+ conf_space_capability.o \
-+ conf_space_capability_vpd.o \
-+ conf_space_capability_pm.o \
-+ conf_space_quirks.o
-+xen-pciback-$(CONFIG_PCI_MSI) += conf_space_capability_msi.o
-+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
-+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
-+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
-+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o
++int pciback_config_capability_init(void);
+
-+ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
-+EXTRA_CFLAGS += -DDEBUG
-+endif
-diff --git a/drivers/xen/pciback/conf_space.c b/drivers/xen/pciback/conf_space.c
++int pciback_config_header_add_fields(struct pci_dev *dev);
++int pciback_config_capability_add_fields(struct pci_dev *dev);
++
++#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */
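[Editor's note: for orientation, a minimal sketch of how a field table built
from the types above is used; the offset 0x41 and the table name are invented
for illustration and are not part of the patch. Real tables appear in
conf_space_header.c and the conf_space_capability_* files below.]

	static const struct config_field example_fields[] = {
		{
			.offset = 0x41,	/* hypothetical byte-wide register */
			.size = 1,
			.u.b.read = pciback_read_config_byte,
			.u.b.write = NULL,	/* NULL handler: guest writes are dropped */
		},
		{}	/* terminator: size == 0 marks the end of the array */
	};

	/* attached with pciback_config_add_fields(dev, example_fields);
	 * the register actually intercepted is OFFSET(cfg_entry), i.e.
	 * base_offset + field->offset (base_offset is 0 here). */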
+diff --git a/drivers/xen/pciback/conf_space_capability.c b/drivers/xen/pciback/conf_space_capability.c
new file mode 100644
-index 0000000..370c18e
+index 0000000..0ea84d6
--- /dev/null
-+++ b/drivers/xen/pciback/conf_space.c
-@@ -0,0 +1,435 @@
++++ b/drivers/xen/pciback/conf_space_capability.c
+@@ -0,0 +1,66 @@
+/*
-+ * PCI Backend - Functions for creating a virtual configuration space for
-+ * exported PCI Devices.
-+ * It's dangerous to allow PCI Driver Domains to change their
-+ * device's resources (memory, i/o ports, interrupts). We need to
-+ * restrict changes to certain PCI Configuration registers:
-+ * BARs, INTERRUPT_PIN, most registers in the header...
++ * PCI Backend - Handles the virtual fields found on the capability lists
++ * in the configuration space.
+ *
+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
+ */
@@ -17556,1711 +19687,1470 @@
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
-+#include "conf_space_quirks.h"
-+
-+static int permissive;
-+module_param(permissive, bool, 0644);
-+
-+#define DEFINE_PCI_CONFIG(op, size, type) \
-+int pciback_##op##_config_##size \
-+(struct pci_dev *dev, int offset, type value, void *data) \
-+{ \
-+ return pci_##op##_config_##size(dev, offset, value); \
-+}
-+
-+DEFINE_PCI_CONFIG(read, byte, u8 *)
-+DEFINE_PCI_CONFIG(read, word, u16 *)
-+DEFINE_PCI_CONFIG(read, dword, u32 *)
-+
-+DEFINE_PCI_CONFIG(write, byte, u8)
-+DEFINE_PCI_CONFIG(write, word, u16)
-+DEFINE_PCI_CONFIG(write, dword, u32)
-+
-+static int conf_space_read(struct pci_dev *dev,
-+ const struct config_field_entry *entry,
-+ int offset, u32 *value)
-+{
-+ int ret = 0;
-+ const struct config_field *field = entry->field;
-+
-+ *value = 0;
-+
-+ switch (field->size) {
-+ case 1:
-+ if (field->u.b.read)
-+ ret = field->u.b.read(dev, offset, (u8 *) value,
-+ entry->data);
-+ break;
-+ case 2:
-+ if (field->u.w.read)
-+ ret = field->u.w.read(dev, offset, (u16 *) value,
-+ entry->data);
-+ break;
-+ case 4:
-+ if (field->u.dw.read)
-+ ret = field->u.dw.read(dev, offset, value, entry->data);
-+ break;
-+ }
-+ return ret;
-+}
-+
-+static int conf_space_write(struct pci_dev *dev,
-+ const struct config_field_entry *entry,
-+ int offset, u32 value)
-+{
-+ int ret = 0;
-+ const struct config_field *field = entry->field;
-+
-+ switch (field->size) {
-+ case 1:
-+ if (field->u.b.write)
-+ ret = field->u.b.write(dev, offset, (u8) value,
-+ entry->data);
-+ break;
-+ case 2:
-+ if (field->u.w.write)
-+ ret = field->u.w.write(dev, offset, (u16) value,
-+ entry->data);
-+ break;
-+ case 4:
-+ if (field->u.dw.write)
-+ ret = field->u.dw.write(dev, offset, value,
-+ entry->data);
-+ break;
-+ }
-+ return ret;
-+}
-+
-+static inline u32 get_mask(int size)
-+{
-+ if (size == 1)
-+ return 0xff;
-+ else if (size == 2)
-+ return 0xffff;
-+ else
-+ return 0xffffffff;
-+}
-+
-+static inline int valid_request(int offset, int size)
-+{
-+ /* Validate request (no un-aligned requests) */
-+ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
-+ return 1;
-+ return 0;
-+}
++#include "conf_space_capability.h"
+
-+static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
-+ int offset)
-+{
-+ if (offset >= 0) {
-+ new_val_mask <<= (offset * 8);
-+ new_val <<= (offset * 8);
-+ } else {
-+ new_val_mask >>= (offset * -8);
-+ new_val >>= (offset * -8);
-+ }
-+ val = (val & ~new_val_mask) | (new_val & new_val_mask);
++static LIST_HEAD(capabilities);
+
-+ return val;
-+}
++static const struct config_field caplist_header[] = {
++ {
++ .offset = PCI_CAP_LIST_ID,
++ .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
++ .u.w.read = pciback_read_config_word,
++ .u.w.write = NULL,
++ },
++ {}
++};
+
-+static int pcibios_err_to_errno(int err)
++static inline void register_capability(struct pciback_config_capability *cap)
+{
-+ switch (err) {
-+ case PCIBIOS_SUCCESSFUL:
-+ return XEN_PCI_ERR_success;
-+ case PCIBIOS_DEVICE_NOT_FOUND:
-+ return XEN_PCI_ERR_dev_not_found;
-+ case PCIBIOS_BAD_REGISTER_NUMBER:
-+ return XEN_PCI_ERR_invalid_offset;
-+ case PCIBIOS_FUNC_NOT_SUPPORTED:
-+ return XEN_PCI_ERR_not_implemented;
-+ case PCIBIOS_SET_FAILED:
-+ return XEN_PCI_ERR_access_denied;
-+ }
-+ return err;
++ list_add_tail(&cap->cap_list, &capabilities);
+}
+
-+int pciback_config_read(struct pci_dev *dev, int offset, int size,
-+ u32 *ret_val)
++int pciback_config_capability_add_fields(struct pci_dev *dev)
+{
+ int err = 0;
-+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
-+ const struct config_field_entry *cfg_entry;
-+ const struct config_field *field;
-+ int req_start, req_end, field_start, field_end;
-+ /* if read fails for any reason, return 0
-+ * (as if device didn't respond) */
-+ u32 value = 0, tmp_val;
-+
-+ if (unlikely(verbose_request))
-+ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
-+ pci_name(dev), size, offset);
-+
-+ if (!valid_request(offset, size)) {
-+ err = XEN_PCI_ERR_invalid_offset;
-+ goto out;
-+ }
-+
-+ /* Get the real value first, then modify as appropriate */
-+ switch (size) {
-+ case 1:
-+ err = pci_read_config_byte(dev, offset, (u8 *) &value);
-+ break;
-+ case 2:
-+ err = pci_read_config_word(dev, offset, (u16 *) &value);
-+ break;
-+ case 4:
-+ err = pci_read_config_dword(dev, offset, &value);
-+ break;
-+ }
-+
-+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
-+ field = cfg_entry->field;
++ struct pciback_config_capability *cap;
++ int cap_offset;
+
-+ req_start = offset;
-+ req_end = offset + size;
-+ field_start = OFFSET(cfg_entry);
-+ field_end = OFFSET(cfg_entry) + field->size;
++ list_for_each_entry(cap, &capabilities, cap_list) {
++ cap_offset = pci_find_capability(dev, cap->capability);
++ if (cap_offset) {
++ dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
++ cap->capability, cap_offset);
+
-+ if ((req_start >= field_start && req_start < field_end)
-+ || (req_end > field_start && req_end <= field_end)) {
-+ err = conf_space_read(dev, cfg_entry, field_start,
-+ &tmp_val);
++ err = pciback_config_add_fields_offset(dev,
++ caplist_header,
++ cap_offset);
++ if (err)
++ goto out;
++ err = pciback_config_add_fields_offset(dev,
++ cap->fields,
++ cap_offset);
+ if (err)
+ goto out;
-+
-+ value = merge_value(value, tmp_val,
-+ get_mask(field->size),
-+ field_start - req_start);
+ }
+ }
+
+out:
-+ if (unlikely(verbose_request))
-+ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
-+ pci_name(dev), size, offset, value);
-+
-+ *ret_val = value;
-+ return pcibios_err_to_errno(err);
++ return err;
+}
+
-+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
++int pciback_config_capability_init(void)
+{
-+ int err = 0, handled = 0;
-+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
-+ const struct config_field_entry *cfg_entry;
-+ const struct config_field *field;
-+ u32 tmp_val;
-+ int req_start, req_end, field_start, field_end;
++ register_capability(&pciback_config_capability_vpd);
++ register_capability(&pciback_config_capability_pm);
+
-+ if (unlikely(verbose_request))
-+ printk(KERN_DEBUG
-+ "pciback: %s: write request %d bytes at 0x%x = %x\n",
-+ pci_name(dev), size, offset, value);
++ return 0;
++}
+diff --git a/drivers/xen/pciback/conf_space_capability.h b/drivers/xen/pciback/conf_space_capability.h
+new file mode 100644
+index 0000000..8da3ac4
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability.h
+@@ -0,0 +1,26 @@
++/*
++ * PCI Backend - Data structures for special overlays for structures on
++ * the capability list.
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
+
-+ if (!valid_request(offset, size))
-+ return XEN_PCI_ERR_invalid_offset;
++#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
++#define __PCIBACK_CONFIG_CAPABILITY_H__
+
-+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
-+ field = cfg_entry->field;
++#include <linux/pci.h>
++#include <linux/list.h>
+
-+ req_start = offset;
-+ req_end = offset + size;
-+ field_start = OFFSET(cfg_entry);
-+ field_end = OFFSET(cfg_entry) + field->size;
++struct pciback_config_capability {
++ struct list_head cap_list;
+
-+ if ((req_start >= field_start && req_start < field_end)
-+ || (req_end > field_start && req_end <= field_end)) {
-+ tmp_val = 0;
++ int capability;
+
-+ err = pciback_config_read(dev, field_start,
-+ field->size, &tmp_val);
-+ if (err)
-+ break;
++ /* If the device has the capability found above, add these fields */
++ const struct config_field *fields;
++};
+
-+ tmp_val = merge_value(tmp_val, value, get_mask(size),
-+ req_start - field_start);
++extern struct pciback_config_capability pciback_config_capability_vpd;
++extern struct pciback_config_capability pciback_config_capability_pm;
+
-+ err = conf_space_write(dev, cfg_entry, field_start,
-+ tmp_val);
++#endif
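[Editor's note: a hedged sketch of how a further capability overlay would be
declared with this structure; the AGP capability and its single field are
illustrative only, not part of the patch.]

	static const struct config_field caplist_example[] = {
		{
			.offset = 4,	/* relative to the capability, not the header */
			.size = 2,
			.u.w.read = pciback_read_config_word,	/* read-only overlay */
		},
		{}
	};

	struct pciback_config_capability pciback_config_capability_example = {
		.capability = PCI_CAP_ID_AGP,
		.fields = caplist_example,
	};

	/* It becomes active once register_capability(&pciback_config_capability_example)
	 * is added to pciback_config_capability_init() in conf_space_capability.c. */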
+diff --git a/drivers/xen/pciback/conf_space_capability_msi.c b/drivers/xen/pciback/conf_space_capability_msi.c
+new file mode 100644
+index 0000000..b70ea8b
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability_msi.c
+@@ -0,0 +1,84 @@
++/*
++ * PCI Backend - Configuration overlay for MSI capability
++ */
++#include <linux/pci.h>
++#include <linux/slab.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
++#include <xen/interface/io/pciif.h>
++#include <xen/events.h>
++#include "pciback.h"
+
-+ /* handled is set true here, but not every byte
-+ * may have been written! Properly detecting if
-+ * every byte is handled is unnecessary as the
-+ * flag is used to detect devices that need
-+ * special helpers to work correctly.
-+ */
-+ handled = 1;
-+ }
-+ }
++int pciback_enable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
++{
++ int otherend = pdev->xdev->otherend_id;
++ int status;
+
-+ if (!handled && !err) {
-+ /* By default, anything not specificially handled above is
-+ * read-only. The permissive flag changes this behavior so
-+ * that anything not specifically handled above is writable.
-+ * This means that some fields may still be read-only because
-+ * they have entries in the config_field list that intercept
-+ * the write and do nothing. */
-+ if (dev_data->permissive || permissive) {
-+ switch (size) {
-+ case 1:
-+ err = pci_write_config_byte(dev, offset,
-+ (u8) value);
-+ break;
-+ case 2:
-+ err = pci_write_config_word(dev, offset,
-+ (u16) value);
-+ break;
-+ case 4:
-+ err = pci_write_config_dword(dev, offset,
-+ (u32) value);
-+ break;
-+ }
-+ } else if (!dev_data->warned_on_write) {
-+ dev_data->warned_on_write = 1;
-+ dev_warn(&dev->dev, "Driver tried to write to a "
-+ "read-only configuration space field at offset"
-+ " 0x%x, size %d. This may be harmless, but if "
-+ "you have problems with your device:\n"
-+ "1) see permissive attribute in sysfs\n"
-+ "2) report problems to the xen-devel "
-+ "mailing list along with details of your "
-+ "device obtained from lspci.\n", offset, size);
-+ }
++ status = pci_enable_msi(dev);
++
++ if (status) {
++ printk(KERN_ERR "error enable msi for guest %x status %x\n",
++ otherend, status);
++ op->value = 0;
++ return XEN_PCI_ERR_op_failed;
+ }
+
-+ return pcibios_err_to_errno(err);
++ /* The value the guest needs is actually the IDT vector, not
++ * the local domain's IRQ number. */
++ op->value = xen_gsi_from_irq(dev->irq);
++ return 0;
+}
+
-+void pciback_config_free_dyn_fields(struct pci_dev *dev)
++int pciback_disable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
+{
-+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
-+ struct config_field_entry *cfg_entry, *t;
-+ const struct config_field *field;
++ pci_disable_msi(dev);
+
-+ dev_dbg(&dev->dev, "free-ing dynamically allocated virtual "
-+ "configuration space fields\n");
-+ if (!dev_data)
-+ return;
++ op->value = xen_gsi_from_irq(dev->irq);
++ return 0;
++}
+
-+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
-+ field = cfg_entry->field;
++int pciback_enable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
++{
++ int i, result;
++ struct msix_entry *entries;
++
++ if (op->value > SH_INFO_MAX_VEC)
++ return -EINVAL;
++
++ entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
++ if (entries == NULL)
++ return -ENOMEM;
++
++ for (i = 0; i < op->value; i++) {
++ entries[i].entry = op->msix_entries[i].entry;
++ entries[i].vector = op->msix_entries[i].vector;
++ }
++
++ result = pci_enable_msix(dev, entries, op->value);
+
-+ if (field->clean) {
-+ field->clean((struct config_field *)field);
++ for (i = 0; i < op->value; i++) {
++ op->msix_entries[i].entry = entries[i].entry;
++ op->msix_entries[i].vector =
++ xen_gsi_from_irq(entries[i].vector);
++ }
+
-+ kfree(cfg_entry->data);
++ kfree(entries);
+
-+ list_del(&cfg_entry->list);
-+ kfree(cfg_entry);
-+ }
++ op->value = result;
+
-+ }
++ return result;
+}
+
-+void pciback_config_reset_dev(struct pci_dev *dev)
++int pciback_disable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
+{
-+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
-+ const struct config_field_entry *cfg_entry;
-+ const struct config_field *field;
-+
-+ dev_dbg(&dev->dev, "resetting virtual configuration space\n");
-+ if (!dev_data)
-+ return;
+
-+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
-+ field = cfg_entry->field;
++ pci_disable_msix(dev);
+
-+ if (field->reset)
-+ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
-+ }
++ op->value = xen_gsi_from_irq(dev->irq);
++ return 0;
+}
+
-+void pciback_config_free_dev(struct pci_dev *dev)
-+{
-+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
-+ struct config_field_entry *cfg_entry, *t;
-+ const struct config_field *field;
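[Editor's note: a worked example of the MSI-X round trip above, assuming the
pci_enable_msix() semantics of this kernel generation: a frontend request with
op->value = 2 and entry indices {0, 1} is bounded by SH_INFO_MAX_VEC, copied
into a temporary msix_entry array, and handed to pci_enable_msix(); on return
each op->msix_entries[i].vector carries a Xen GSI from xen_gsi_from_irq()
rather than a dom0 IRQ number, and op->value holds pci_enable_msix()'s result,
i.e. 0 on success or a positive count of vectors actually available when fewer
than requested could be allocated.]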
+diff --git a/drivers/xen/pciback/conf_space_capability_pm.c b/drivers/xen/pciback/conf_space_capability_pm.c
+new file mode 100644
+index 0000000..0442616
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability_pm.c
+@@ -0,0 +1,113 @@
++/*
++ * PCI Backend - Configuration space overlay for power management
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
+
-+ dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
-+ if (!dev_data)
-+ return;
++#include <linux/pci.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
+
-+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
-+ list_del(&cfg_entry->list);
++static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
++ void *data)
++{
++ int err;
++ u16 real_value;
+
-+ field = cfg_entry->field;
++ err = pci_read_config_word(dev, offset, &real_value);
++ if (err)
++ goto out;
+
-+ if (field->release)
-+ field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
++ *value = real_value & ~PCI_PM_CAP_PME_MASK;
+
-+ kfree(cfg_entry);
-+ }
++out:
++ return err;
+}
+
-+int pciback_config_add_field_offset(struct pci_dev *dev,
-+ const struct config_field *field,
-+ unsigned int base_offset)
-+{
-+ int err = 0;
-+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
-+ struct config_field_entry *cfg_entry;
-+ void *tmp;
-+
-+ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
-+ if (!cfg_entry) {
-+ err = -ENOMEM;
-+ goto out;
-+ }
++/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
++ * We can't allow the driver domain to enable PMEs - they're shared. */
++#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
+
-+ cfg_entry->data = NULL;
-+ cfg_entry->field = field;
-+ cfg_entry->base_offset = base_offset;
++static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
++ void *data)
++{
++ int err;
++ u16 old_value;
++ pci_power_t new_state, old_state;
+
-+ /* silently ignore duplicate fields */
-+ err = pciback_field_is_dup(dev, OFFSET(cfg_entry));
++ err = pci_read_config_word(dev, offset, &old_value);
+ if (err)
+ goto out;
+
-+ if (field->init) {
-+ tmp = field->init(dev, OFFSET(cfg_entry));
++ old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
++ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
+
-+ if (IS_ERR(tmp)) {
-+ err = PTR_ERR(tmp);
++ new_value &= PM_OK_BITS;
++ if ((old_value & PM_OK_BITS) != new_value) {
++ new_value = (old_value & ~PM_OK_BITS) | new_value;
++ err = pci_write_config_word(dev, offset, new_value);
++ if (err)
+ goto out;
-+ }
-+
-+ cfg_entry->data = tmp;
+ }
+
-+ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
-+ OFFSET(cfg_entry));
-+ list_add_tail(&cfg_entry->list, &dev_data->config_fields);
-+
-+out:
-+ if (err)
-+ kfree(cfg_entry);
++ /* Let pci core handle the power management change */
++ dev_dbg(&dev->dev, "set power state to %x\n", new_state);
++ err = pci_set_power_state(dev, new_state);
++ if (err) {
++ err = PCIBIOS_SET_FAILED;
++ goto out;
++ }
+
++ out:
+ return err;
+}
+
-+/* This sets up the device's virtual configuration space to keep track of
-+ * certain registers (like the base address registers (BARs) so that we can
-+ * keep the client from manipulating them directly.
-+ */
-+int pciback_config_init_dev(struct pci_dev *dev)
++/* Ensure PMEs are disabled */
++static void *pm_ctrl_init(struct pci_dev *dev, int offset)
+{
-+ int err = 0;
-+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
-+
-+ dev_dbg(&dev->dev, "initializing virtual configuration space\n");
-+
-+ INIT_LIST_HEAD(&dev_data->config_fields);
-+
-+ err = pciback_config_header_add_fields(dev);
-+ if (err)
-+ goto out;
++ int err;
++ u16 value;
+
-+ err = pciback_config_capability_add_fields(dev);
++ err = pci_read_config_word(dev, offset, &value);
+ if (err)
+ goto out;
+
-+ err = pciback_config_quirks_init(dev);
++ if (value & PCI_PM_CTRL_PME_ENABLE) {
++ value &= ~PCI_PM_CTRL_PME_ENABLE;
++ err = pci_write_config_word(dev, offset, value);
++ }
+
+out:
-+ return err;
++ return ERR_PTR(err);
+}
+
-+int pciback_config_init(void)
-+{
-+ return pciback_config_capability_init();
-+}
-diff --git a/drivers/xen/pciback/conf_space.h b/drivers/xen/pciback/conf_space.h
++static const struct config_field caplist_pm[] = {
++ {
++ .offset = PCI_PM_PMC,
++ .size = 2,
++ .u.w.read = pm_caps_read,
++ },
++ {
++ .offset = PCI_PM_CTRL,
++ .size = 2,
++ .init = pm_ctrl_init,
++ .u.w.read = pciback_read_config_word,
++ .u.w.write = pm_ctrl_write,
++ },
++ {
++ .offset = PCI_PM_PPB_EXTENSIONS,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {
++ .offset = PCI_PM_DATA_REGISTER,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {}
++};
++
++struct pciback_config_capability pciback_config_capability_pm = {
++ .capability = PCI_CAP_ID_PM,
++ .fields = caplist_pm,
++};
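[Editor's note: a worked example of the pm_ctrl_write() filter above, assuming
the pci_regs.h values PCI_PM_CTRL_PME_STATUS = 0x8000 and
PCI_PM_CTRL_DATA_SEL_MASK = 0x1e00, so PM_OK_BITS = 0x9e00. If the guest
writes 0xffff over an old PCI_PM_CTRL value of 0x0000, only
0xffff & 0x9e00 = 0x9e00 is merged into the register; the requested power
state (0xffff & PCI_PM_CTRL_STATE_MASK = 3, i.e. D3) is applied separately
through pci_set_power_state(), and PCI_PM_CTRL_PME_ENABLE (0x0100) can never
be turned on by the guest.]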
+diff --git a/drivers/xen/pciback/conf_space_capability_vpd.c b/drivers/xen/pciback/conf_space_capability_vpd.c
new file mode 100644
-index 0000000..50ebef2
+index 0000000..e7b4d66
--- /dev/null
-+++ b/drivers/xen/pciback/conf_space.h
-@@ -0,0 +1,126 @@
++++ b/drivers/xen/pciback/conf_space_capability_vpd.c
+@@ -0,0 +1,40 @@
+/*
-+ * PCI Backend - Common data structures for overriding the configuration space
++ * PCI Backend - Configuration space overlay for Vital Product Data
+ *
+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
+ */
-+
-+#ifndef __XEN_PCIBACK_CONF_SPACE_H__
-+#define __XEN_PCIBACK_CONF_SPACE_H__
-+
-+#include <linux/list.h>
-+#include <linux/err.h>
-+
-+/* conf_field_init can return an errno in a ptr with ERR_PTR() */
-+typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
-+typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
-+typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
-+
-+typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
-+ void *data);
-+typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
-+ void *data);
-+typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
-+ void *data);
-+typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
-+ void *data);
-+typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
-+ void *data);
-+typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
-+ void *data);
-+
-+/* These are the fields within the configuration space which we
-+ * are interested in intercepting reads/writes to and changing their
-+ * values.
-+ */
-+struct config_field {
-+ unsigned int offset;
-+ unsigned int size;
-+ unsigned int mask;
-+ conf_field_init init;
-+ conf_field_reset reset;
-+ conf_field_free release;
-+ void (*clean) (struct config_field *field);
-+ union {
-+ struct {
-+ conf_dword_write write;
-+ conf_dword_read read;
-+ } dw;
-+ struct {
-+ conf_word_write write;
-+ conf_word_read read;
-+ } w;
-+ struct {
-+ conf_byte_write write;
-+ conf_byte_read read;
-+ } b;
-+ } u;
-+ struct list_head list;
++
++#include <linux/pci.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
++
++static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
++ void *data)
++{
++ /* Disallow writes to the vital product data */
++ if (value & PCI_VPD_ADDR_F)
++ return PCIBIOS_SET_FAILED;
++ else
++ return pci_write_config_word(dev, offset, value);
++}
++
++static const struct config_field caplist_vpd[] = {
++ {
++ .offset = PCI_VPD_ADDR,
++ .size = 2,
++ .u.w.read = pciback_read_config_word,
++ .u.w.write = vpd_address_write,
++ },
++ {
++ .offset = PCI_VPD_DATA,
++ .size = 4,
++ .u.dw.read = pciback_read_config_dword,
++ .u.dw.write = NULL,
++ },
++ {}
+};
+
-+struct config_field_entry {
-+ struct list_head list;
-+ const struct config_field *field;
-+ unsigned int base_offset;
-+ void *data;
++struct pciback_config_capability pciback_config_capability_vpd = {
++ .capability = PCI_CAP_ID_VPD,
++ .fields = caplist_vpd,
+};
+diff --git a/drivers/xen/pciback/conf_space_header.c b/drivers/xen/pciback/conf_space_header.c
+new file mode 100644
+index 0000000..1f4f86e
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_header.c
+@@ -0,0 +1,379 @@
++/*
++ * PCI Backend - Handles the virtual fields in the configuration space headers.
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
+
-+#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
+
-+/* Add fields to a device - the add_fields macro expects to get a pointer to
-+ * the first entry in an array (of which the ending is marked by size==0)
-+ */
-+int pciback_config_add_field_offset(struct pci_dev *dev,
-+ const struct config_field *field,
-+ unsigned int offset);
++struct pci_bar_info {
++ u32 val;
++ u32 len_val;
++ int which;
++};
+
-+static inline int pciback_config_add_field(struct pci_dev *dev,
-+ const struct config_field *field)
++#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
++#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
++
++static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
+{
-+ return pciback_config_add_field_offset(dev, field, 0);
++ int i;
++ int ret;
++
++ ret = pciback_read_config_word(dev, offset, value, data);
++ if (!atomic_read(&dev->enable_cnt))
++ return ret;
++
++ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
++ if (dev->resource[i].flags & IORESOURCE_IO)
++ *value |= PCI_COMMAND_IO;
++ if (dev->resource[i].flags & IORESOURCE_MEM)
++ *value |= PCI_COMMAND_MEMORY;
++ }
++
++ return ret;
+}
+
-+static inline int pciback_config_add_fields(struct pci_dev *dev,
-+ const struct config_field *field)
++static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
+{
-+ int i, err = 0;
-+ for (i = 0; field[i].size != 0; i++) {
-+ err = pciback_config_add_field(dev, &field[i]);
++ int err;
++
++ if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: enable\n",
++ pci_name(dev));
++ err = pci_enable_device(dev);
+ if (err)
-+ break;
++ return err;
++ } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: disable\n",
++ pci_name(dev));
++ pci_disable_device(dev);
+ }
-+ return err;
++
++ if (!dev->is_busmaster && is_master_cmd(value)) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: set bus master\n",
++ pci_name(dev));
++ pci_set_master(dev);
++ }
++
++ if (value & PCI_COMMAND_INVALIDATE) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG
++ "pciback: %s: enable memory-write-invalidate\n",
++ pci_name(dev));
++ err = pci_set_mwi(dev);
++ if (err) {
++ printk(KERN_WARNING
++ "pciback: %s: cannot enable "
++ "memory-write-invalidate (%d)\n",
++ pci_name(dev), err);
++ value &= ~PCI_COMMAND_INVALIDATE;
++ }
++ }
++
++ return pci_write_config_word(dev, offset, value);
+}
+
-+static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
-+ const struct config_field *field,
-+ unsigned int offset)
++static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
-+ int i, err = 0;
-+ for (i = 0; field[i].size != 0; i++) {
-+ err = pciback_config_add_field_offset(dev, &field[i], offset);
-+ if (err)
-+ break;
++ struct pci_bar_info *bar = data;
++
++ if (unlikely(!bar)) {
++ printk(KERN_WARNING "pciback: driver data not found for %s\n",
++ pci_name(dev));
++ return XEN_PCI_ERR_op_failed;
+ }
-+ return err;
-+}
+
-+/* Read/Write the real configuration space */
-+int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 *value,
-+ void *data);
-+int pciback_read_config_word(struct pci_dev *dev, int offset, u16 *value,
-+ void *data);
-+int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 *value,
-+ void *data);
-+int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
-+ void *data);
-+int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
-+ void *data);
-+int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
-+ void *data);
++ /* A write to obtain the length must happen as a 32-bit write.
++ * This does not (yet) support writing individual bytes
++ */
++ if (value == ~PCI_ROM_ADDRESS_ENABLE)
++ bar->which = 1;
++ else {
++ u32 tmpval;
++ pci_read_config_dword(dev, offset, &tmpval);
++ if (tmpval != bar->val && value == bar->val) {
++ /* Allow restoration of bar value. */
++ pci_write_config_dword(dev, offset, bar->val);
++ }
++ bar->which = 0;
++ }
+
-+int pciback_config_capability_init(void);
++ /* Do we need to support enabling/disabling the rom address here? */
+
-+int pciback_config_header_add_fields(struct pci_dev *dev);
-+int pciback_config_capability_add_fields(struct pci_dev *dev);
++ return 0;
++}
+
-+#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */
-diff --git a/drivers/xen/pciback/conf_space_capability.c b/drivers/xen/pciback/conf_space_capability.c
-new file mode 100644
-index 0000000..0ea84d6
---- /dev/null
-+++ b/drivers/xen/pciback/conf_space_capability.c
-@@ -0,0 +1,66 @@
-+/*
-+ * PCI Backend - Handles the virtual fields found on the capability lists
-+ * in the configuration space.
-+ *
-+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++/* For the BARs, only allow writes of ~0 or of the correct resource
++ * information (needed for when the driver probes the resource usage).
+ */
++static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
++{
++ struct pci_bar_info *bar = data;
+
-+#include <linux/kernel.h>
-+#include <linux/pci.h>
-+#include "pciback.h"
-+#include "conf_space.h"
-+#include "conf_space_capability.h"
-+
-+static LIST_HEAD(capabilities);
++ if (unlikely(!bar)) {
++ printk(KERN_WARNING "pciback: driver data not found for %s\n",
++ pci_name(dev));
++ return XEN_PCI_ERR_op_failed;
++ }
+
-+static const struct config_field caplist_header[] = {
-+ {
-+ .offset = PCI_CAP_LIST_ID,
-+ .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
-+ .u.w.read = pciback_read_config_word,
-+ .u.w.write = NULL,
-+ },
-+ {}
-+};
++ /* A write to obtain the length must happen as a 32-bit write.
++ * This does not (yet) support writing individual bytes
++ */
++ if (value == ~0)
++ bar->which = 1;
++ else {
++ u32 tmpval;
++ pci_read_config_dword(dev, offset, &tmpval);
++ if (tmpval != bar->val && value == bar->val) {
++ /* Allow restoration of bar value. */
++ pci_write_config_dword(dev, offset, bar->val);
++ }
++ bar->which = 0;
++ }
+
-+static inline void register_capability(struct pciback_config_capability *cap)
-+{
-+ list_add_tail(&cap->cap_list, &capabilities);
++ return 0;
+}
+
-+int pciback_config_capability_add_fields(struct pci_dev *dev)
++static int bar_read(struct pci_dev *dev, int offset, u32 *value, void *data)
+{
-+ int err = 0;
-+ struct pciback_config_capability *cap;
-+ int cap_offset;
++ struct pci_bar_info *bar = data;
+
-+ list_for_each_entry(cap, &capabilities, cap_list) {
-+ cap_offset = pci_find_capability(dev, cap->capability);
-+ if (cap_offset) {
-+ dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
-+ cap->capability, cap_offset);
++ if (unlikely(!bar)) {
++ printk(KERN_WARNING "pciback: driver data not found for %s\n",
++ pci_name(dev));
++ return XEN_PCI_ERR_op_failed;
++ }
++
++ *value = bar->which ? bar->len_val : bar->val;
+
-+ err = pciback_config_add_fields_offset(dev,
-+ caplist_header,
-+ cap_offset);
-+ if (err)
-+ goto out;
-+ err = pciback_config_add_fields_offset(dev,
-+ cap->fields,
-+ cap_offset);
-+ if (err)
-+ goto out;
++ return 0;
++}
++
++static inline void read_dev_bar(struct pci_dev *dev,
++ struct pci_bar_info *bar_info, int offset,
++ u32 len_mask)
++{
++ int pos;
++ struct resource *res = dev->resource;
++
++ if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1)
++ pos = PCI_ROM_RESOURCE;
++ else {
++ pos = (offset - PCI_BASE_ADDRESS_0) / 4;
++ if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE |
++ PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
++ (PCI_BASE_ADDRESS_SPACE_MEMORY |
++ PCI_BASE_ADDRESS_MEM_TYPE_64))) {
++ bar_info->val = res[pos - 1].start >> 32;
++ bar_info->len_val = res[pos - 1].end >> 32;
++ return;
+ }
+ }
+
-+out:
-+ return err;
++ bar_info->val = res[pos].start |
++ (res[pos].flags & PCI_REGION_FLAG_MASK);
++ bar_info->len_val = res[pos].end - res[pos].start + 1;
+}
+
-+int pciback_config_capability_init(void)
++static void *bar_init(struct pci_dev *dev, int offset)
+{
-+ register_capability(&pciback_config_capability_vpd);
-+ register_capability(&pciback_config_capability_pm);
++ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
-+ return 0;
++ if (!bar)
++ return ERR_PTR(-ENOMEM);
++
++ read_dev_bar(dev, bar, offset, ~0);
++ bar->which = 0;
++
++ return bar;
+}
-diff --git a/drivers/xen/pciback/conf_space_capability.h b/drivers/xen/pciback/conf_space_capability.h
-new file mode 100644
-index 0000000..8da3ac4
---- /dev/null
-+++ b/drivers/xen/pciback/conf_space_capability.h
-@@ -0,0 +1,26 @@
-+/*
-+ * PCI Backend - Data structures for special overlays for structures on
-+ * the capability list.
-+ *
-+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
-+ */
+
-+#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
-+#define __PCIBACK_CONFIG_CAPABILITY_H__
++static void *rom_init(struct pci_dev *dev, int offset)
++{
++ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
-+#include <linux/pci.h>
-+#include <linux/list.h>
++ if (!bar)
++ return ERR_PTR(-ENOMEM);
+
-+struct pciback_config_capability {
-+ struct list_head cap_list;
++ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
++ bar->which = 0;
+
-+ int capability;
++ return bar;
++}
+
-+ /* If the device has the capability found above, add these fields */
-+ const struct config_field *fields;
-+};
++static void bar_reset(struct pci_dev *dev, int offset, void *data)
++{
++ struct pci_bar_info *bar = data;
+
-+extern struct pciback_config_capability pciback_config_capability_vpd;
-+extern struct pciback_config_capability pciback_config_capability_pm;
++ bar->which = 0;
++}
+
-+#endif
-diff --git a/drivers/xen/pciback/conf_space_capability_msi.c b/drivers/xen/pciback/conf_space_capability_msi.c
-new file mode 100644
-index 0000000..b70ea8b
---- /dev/null
-+++ b/drivers/xen/pciback/conf_space_capability_msi.c
-@@ -0,0 +1,84 @@
-+/*
-+ * PCI Backend -- Configuration overlay for MSI capability
-+ */
-+#include <linux/pci.h>
-+#include <linux/slab.h>
-+#include "conf_space.h"
-+#include "conf_space_capability.h"
-+#include <xen/interface/io/pciif.h>
-+#include <xen/events.h>
-+#include "pciback.h"
++static void bar_release(struct pci_dev *dev, int offset, void *data)
++{
++ kfree(data);
++}
+
-+int pciback_enable_msi(struct pciback_device *pdev,
-+ struct pci_dev *dev, struct xen_pci_op *op)
++static int pciback_read_vendor(struct pci_dev *dev, int offset,
++ u16 *value, void *data)
+{
-+ int otherend = pdev->xdev->otherend_id;
-+ int status;
++ *value = dev->vendor;
+
-+ status = pci_enable_msi(dev);
++ return 0;
++}
+
-+ if (status) {
-+ printk(KERN_ERR "error enable msi for guest %x status %x\n",
-+ otherend, status);
-+ op->value = 0;
-+ return XEN_PCI_ERR_op_failed;
-+ }
++static int pciback_read_device(struct pci_dev *dev, int offset,
++ u16 *value, void *data)
++{
++ *value = dev->device;
+
-+ /* The value the guest needs is actually the IDT vector, not the
-+ * the local domain's IRQ number. */
-+ op->value = xen_gsi_from_irq(dev->irq);
+ return 0;
+}
+
-+int pciback_disable_msi(struct pciback_device *pdev,
-+ struct pci_dev *dev, struct xen_pci_op *op)
++static int interrupt_read(struct pci_dev *dev, int offset, u8 *value,
++ void *data)
+{
-+ pci_disable_msi(dev);
++ *value = (u8) dev->irq;
+
-+ op->value = xen_gsi_from_irq(dev->irq);
+ return 0;
+}
+
-+int pciback_enable_msix(struct pciback_device *pdev,
-+ struct pci_dev *dev, struct xen_pci_op *op)
++static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
+{
-+ int i, result;
-+ struct msix_entry *entries;
++ u8 cur_value;
++ int err;
+
-+ if (op->value > SH_INFO_MAX_VEC)
-+ return -EINVAL;
++ err = pci_read_config_byte(dev, offset, &cur_value);
++ if (err)
++ goto out;
+
-+ entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
-+ if (entries == NULL)
-+ return -ENOMEM;
++ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
++ || value == PCI_BIST_START)
++ err = pci_write_config_byte(dev, offset, value);
+
-+ for (i = 0; i < op->value; i++) {
-+ entries[i].entry = op->msix_entries[i].entry;
-+ entries[i].vector = op->msix_entries[i].vector;
-+ }
++out:
++ return err;
++}
+
-+ result = pci_enable_msix(dev, entries, op->value);
++static const struct config_field header_common[] = {
++ {
++ .offset = PCI_VENDOR_ID,
++ .size = 2,
++ .u.w.read = pciback_read_vendor,
++ },
++ {
++ .offset = PCI_DEVICE_ID,
++ .size = 2,
++ .u.w.read = pciback_read_device,
++ },
++ {
++ .offset = PCI_COMMAND,
++ .size = 2,
++ .u.w.read = command_read,
++ .u.w.write = command_write,
++ },
++ {
++ .offset = PCI_INTERRUPT_LINE,
++ .size = 1,
++ .u.b.read = interrupt_read,
++ },
++ {
++ .offset = PCI_INTERRUPT_PIN,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {
++ /* Any side effects of letting driver domain control cache line? */
++ .offset = PCI_CACHE_LINE_SIZE,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ .u.b.write = pciback_write_config_byte,
++ },
++ {
++ .offset = PCI_LATENCY_TIMER,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {
++ .offset = PCI_BIST,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ .u.b.write = bist_write,
++ },
++ {}
++};
+
-+ for (i = 0; i < op->value; i++) {
-+ op->msix_entries[i].entry = entries[i].entry;
-+ op->msix_entries[i].vector =
-+ xen_gsi_from_irq(entries[i].vector);
-+ }
++#define CFG_FIELD_BAR(reg_offset) \
++ { \
++ .offset = reg_offset, \
++ .size = 4, \
++ .init = bar_init, \
++ .reset = bar_reset, \
++ .release = bar_release, \
++ .u.dw.read = bar_read, \
++ .u.dw.write = bar_write, \
++ }
+
-+ kfree(entries);
++#define CFG_FIELD_ROM(reg_offset) \
++ { \
++ .offset = reg_offset, \
++ .size = 4, \
++ .init = rom_init, \
++ .reset = bar_reset, \
++ .release = bar_release, \
++ .u.dw.read = bar_read, \
++ .u.dw.write = rom_write, \
++ }
++
++static const struct config_field header_0[] = {
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
++ CFG_FIELD_ROM(PCI_ROM_ADDRESS),
++ {}
++};
++
++static const struct config_field header_1[] = {
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
++ CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
++ {}
++};
++
++int pciback_config_header_add_fields(struct pci_dev *dev)
++{
++ int err;
+
-+ op->value = result;
++ err = pciback_config_add_fields(dev, header_common);
++ if (err)
++ goto out;
+
-+ return result;
-+}
++ switch (dev->hdr_type) {
++ case PCI_HEADER_TYPE_NORMAL:
++ err = pciback_config_add_fields(dev, header_0);
++ break;
+
-+int pciback_disable_msix(struct pciback_device *pdev,
-+ struct pci_dev *dev, struct xen_pci_op *op)
-+{
++ case PCI_HEADER_TYPE_BRIDGE:
++ err = pciback_config_add_fields(dev, header_1);
++ break;
+
-+ pci_disable_msix(dev);
++ default:
++ err = -EINVAL;
++ printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
++ pci_name(dev), dev->hdr_type);
++ break;
++ }
+
-+ op->value = xen_gsi_from_irq(dev->irq);
-+ return 0;
++out:
++ return err;
+}
-+
-diff --git a/drivers/xen/pciback/conf_space_capability_pm.c b/drivers/xen/pciback/conf_space_capability_pm.c
+diff --git a/drivers/xen/pciback/conf_space_quirks.c b/drivers/xen/pciback/conf_space_quirks.c
new file mode 100644
-index 0000000..0442616
+index 0000000..45c31fb
--- /dev/null
-+++ b/drivers/xen/pciback/conf_space_capability_pm.c
-@@ -0,0 +1,113 @@
++++ b/drivers/xen/pciback/conf_space_quirks.c
+@@ -0,0 +1,140 @@
+/*
-+ * PCI Backend - Configuration space overlay for power management
++ * PCI Backend - Handle special overlays for broken devices.
+ *
+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ * Author: Chris Bookholt <hap10 at epoch.ncsc.mil>
+ */
+
++#include <linux/kernel.h>
+#include <linux/pci.h>
++#include "pciback.h"
+#include "conf_space.h"
-+#include "conf_space_capability.h"
++#include "conf_space_quirks.h"
+
-+static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
-+ void *data)
-+{
-+ int err;
-+ u16 real_value;
++LIST_HEAD(pciback_quirks);
+
-+ err = pci_read_config_word(dev, offset, &real_value);
-+ if (err)
-+ goto out;
++static inline const struct pci_device_id *
++match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
++{
++ if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
++ (id->device == PCI_ANY_ID || id->device == dev->device) &&
++ (id->subvendor == PCI_ANY_ID ||
++ id->subvendor == dev->subsystem_vendor) &&
++ (id->subdevice == PCI_ANY_ID ||
++ id->subdevice == dev->subsystem_device) &&
++ !((id->class ^ dev->class) & id->class_mask))
++ return id;
++ return NULL;
++}
+
-+ *value = real_value & ~PCI_PM_CAP_PME_MASK;
++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
++{
++ struct pciback_config_quirk *tmp_quirk;
+
++ list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
++ if (match_one_device(&tmp_quirk->devid, dev) != NULL)
++ goto out;
++ tmp_quirk = NULL;
++ printk(KERN_DEBUG
++ "quirk didn't match any device pciback knows about\n");
+out:
-+ return err;
++ return tmp_quirk;
+}
+
-+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
-+ * Can't allow driver domain to enable PMEs - they're shared */
-+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
-+
-+static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
-+ void *data)
++static inline void register_quirk(struct pciback_config_quirk *quirk)
+{
-+ int err;
-+ u16 old_value;
-+ pci_power_t new_state, old_state;
-+
-+ err = pci_read_config_word(dev, offset, &old_value);
-+ if (err)
-+ goto out;
-+
-+ old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
-+ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
++ list_add_tail(&quirk->quirks_list, &pciback_quirks);
++}
+
-+ new_value &= PM_OK_BITS;
-+ if ((old_value & PM_OK_BITS) != new_value) {
-+ new_value = (old_value & ~PM_OK_BITS) | new_value;
-+ err = pci_write_config_word(dev, offset, new_value);
-+ if (err)
-+ goto out;
-+ }
++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg)
++{
++ int ret = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry;
+
-+ /* Let pci core handle the power management change */
-+ dev_dbg(&dev->dev, "set power state to %x\n", new_state);
-+ err = pci_set_power_state(dev, new_state);
-+ if (err) {
-+ err = PCIBIOS_SET_FAILED;
-+ goto out;
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ if (OFFSET(cfg_entry) == reg) {
++ ret = 1;
++ break;
++ }
+ }
-+
-+ out:
-+ return err;
++ return ret;
+}
+
-+/* Ensure PMEs are disabled */
-+static void *pm_ctrl_init(struct pci_dev *dev, int offset)
++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
++ *field)
+{
-+ int err;
-+ u16 value;
++ int err = 0;
+
-+ err = pci_read_config_word(dev, offset, &value);
-+ if (err)
++ switch (field->size) {
++ case 1:
++ field->u.b.read = pciback_read_config_byte;
++ field->u.b.write = pciback_write_config_byte;
++ break;
++ case 2:
++ field->u.w.read = pciback_read_config_word;
++ field->u.w.write = pciback_write_config_word;
++ break;
++ case 4:
++ field->u.dw.read = pciback_read_config_dword;
++ field->u.dw.write = pciback_write_config_dword;
++ break;
++ default:
++ err = -EINVAL;
+ goto out;
-+
-+ if (value & PCI_PM_CTRL_PME_ENABLE) {
-+ value &= ~PCI_PM_CTRL_PME_ENABLE;
-+ err = pci_write_config_word(dev, offset, value);
+ }
+
++ pciback_config_add_field(dev, field);
++
+out:
-+ return ERR_PTR(err);
++ return err;
+}
+
-+static const struct config_field caplist_pm[] = {
-+ {
-+ .offset = PCI_PM_PMC,
-+ .size = 2,
-+ .u.w.read = pm_caps_read,
-+ },
-+ {
-+ .offset = PCI_PM_CTRL,
-+ .size = 2,
-+ .init = pm_ctrl_init,
-+ .u.w.read = pciback_read_config_word,
-+ .u.w.write = pm_ctrl_write,
-+ },
-+ {
-+ .offset = PCI_PM_PPB_EXTENSIONS,
-+ .size = 1,
-+ .u.b.read = pciback_read_config_byte,
-+ },
-+ {
-+ .offset = PCI_PM_DATA_REGISTER,
-+ .size = 1,
-+ .u.b.read = pciback_read_config_byte,
-+ },
-+ {}
-+};
-+
-+struct pciback_config_capability pciback_config_capability_pm = {
-+ .capability = PCI_CAP_ID_PM,
-+ .fields = caplist_pm,
-+};
-diff --git a/drivers/xen/pciback/conf_space_capability_vpd.c b/drivers/xen/pciback/conf_space_capability_vpd.c
-new file mode 100644
-index 0000000..e7b4d66
---- /dev/null
-+++ b/drivers/xen/pciback/conf_space_capability_vpd.c
-@@ -0,0 +1,40 @@
-+/*
-+ * PCI Backend - Configuration space overlay for Vital Product Data
-+ *
-+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
-+ */
-+
-+#include <linux/pci.h>
-+#include "conf_space.h"
-+#include "conf_space_capability.h"
-+
-+static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
-+ void *data)
++int pciback_config_quirks_init(struct pci_dev *dev)
+{
-+ /* Disallow writes to the vital product data */
-+ if (value & PCI_VPD_ADDR_F)
-+ return PCIBIOS_SET_FAILED;
-+ else
-+ return pci_write_config_word(dev, offset, value);
-+}
-+
-+static const struct config_field caplist_vpd[] = {
-+ {
-+ .offset = PCI_VPD_ADDR,
-+ .size = 2,
-+ .u.w.read = pciback_read_config_word,
-+ .u.w.write = vpd_address_write,
-+ },
-+ {
-+ .offset = PCI_VPD_DATA,
-+ .size = 4,
-+ .u.dw.read = pciback_read_config_dword,
-+ .u.dw.write = NULL,
-+ },
-+ {}
-+};
++ struct pciback_config_quirk *quirk;
++ int ret = 0;
+
-+struct pciback_config_capability pciback_config_capability_vpd = {
-+ .capability = PCI_CAP_ID_VPD,
-+ .fields = caplist_vpd,
-+};
-diff --git a/drivers/xen/pciback/conf_space_header.c b/drivers/xen/pciback/conf_space_header.c
-new file mode 100644
-index 0000000..1f4f86e
---- /dev/null
-+++ b/drivers/xen/pciback/conf_space_header.c
-@@ -0,0 +1,379 @@
-+/*
-+ * PCI Backend - Handles the virtual fields in the configuration space headers.
-+ *
-+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
-+ */
++ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
++ if (!quirk) {
++ ret = -ENOMEM;
++ goto out;
++ }
+
-+#include <linux/kernel.h>
-+#include <linux/pci.h>
-+#include "pciback.h"
-+#include "conf_space.h"
++ quirk->devid.vendor = dev->vendor;
++ quirk->devid.device = dev->device;
++ quirk->devid.subvendor = dev->subsystem_vendor;
++ quirk->devid.subdevice = dev->subsystem_device;
++ quirk->devid.class = 0;
++ quirk->devid.class_mask = 0;
++ quirk->devid.driver_data = 0UL;
+
-+struct pci_bar_info {
-+ u32 val;
-+ u32 len_val;
-+ int which;
-+};
++ quirk->pdev = dev;
+
-+#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
-+#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
++ register_quirk(quirk);
++out:
++ return ret;
++}
+
-+static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
++void pciback_config_field_free(struct config_field *field)
+{
-+ int i;
-+ int ret;
++ kfree(field);
++}
+
-+ ret = pciback_read_config_word(dev, offset, value, data);
-+ if (!atomic_read(&dev->enable_cnt))
-+ return ret;
++int pciback_config_quirk_release(struct pci_dev *dev)
++{
++ struct pciback_config_quirk *quirk;
++ int ret = 0;
+
-+ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
-+ if (dev->resource[i].flags & IORESOURCE_IO)
-+ *value |= PCI_COMMAND_IO;
-+ if (dev->resource[i].flags & IORESOURCE_MEM)
-+ *value |= PCI_COMMAND_MEMORY;
++ quirk = pciback_find_quirk(dev);
++ if (!quirk) {
++ ret = -ENXIO;
++ goto out;
+ }
+
++ list_del(&quirk->quirks_list);
++ kfree(quirk);
++
++out:
+ return ret;
+}
+diff --git a/drivers/xen/pciback/conf_space_quirks.h b/drivers/xen/pciback/conf_space_quirks.h
+new file mode 100644
+index 0000000..acd0e1a
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_quirks.h
+@@ -0,0 +1,35 @@
++/*
++ * PCI Backend - Data structures for special overlays for broken devices.
++ *
++ * Ryan Wilson <hap9 at epoch.ncsc.mil>
++ * Chris Bookholt <hap10 at epoch.ncsc.mil>
++ */
+
-+static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
-+{
-+ int err;
++#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
++#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+
-+ if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
-+ if (unlikely(verbose_request))
-+ printk(KERN_DEBUG "pciback: %s: enable\n",
-+ pci_name(dev));
-+ err = pci_enable_device(dev);
-+ if (err)
-+ return err;
-+ } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
-+ if (unlikely(verbose_request))
-+ printk(KERN_DEBUG "pciback: %s: disable\n",
-+ pci_name(dev));
-+ pci_disable_device(dev);
-+ }
++#include <linux/pci.h>
++#include <linux/list.h>
+
-+ if (!dev->is_busmaster && is_master_cmd(value)) {
-+ if (unlikely(verbose_request))
-+ printk(KERN_DEBUG "pciback: %s: set bus master\n",
-+ pci_name(dev));
-+ pci_set_master(dev);
-+ }
++struct pciback_config_quirk {
++ struct list_head quirks_list;
++ struct pci_device_id devid;
++ struct pci_dev *pdev;
++};
+
-+ if (value & PCI_COMMAND_INVALIDATE) {
-+ if (unlikely(verbose_request))
-+ printk(KERN_DEBUG
-+ "pciback: %s: enable memory-write-invalidate\n",
-+ pci_name(dev));
-+ err = pci_set_mwi(dev);
-+ if (err) {
-+ printk(KERN_WARNING
-+ "pciback: %s: cannot enable "
-+ "memory-write-invalidate (%d)\n",
-+ pci_name(dev), err);
-+ value &= ~PCI_COMMAND_INVALIDATE;
-+ }
-+ }
++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
+
-+ return pci_write_config_word(dev, offset, value);
-+}
++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
++ *field);
+
-+static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
-+{
-+ struct pci_bar_info *bar = data;
++int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
+
-+ if (unlikely(!bar)) {
-+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
-+ pci_name(dev));
-+ return XEN_PCI_ERR_op_failed;
-+ }
++int pciback_config_quirks_init(struct pci_dev *dev);
+
-+ /* A write to obtain the length must happen as a 32-bit write.
-+ * This does not (yet) support writing individual bytes
-+ */
-+ if (value == ~PCI_ROM_ADDRESS_ENABLE)
-+ bar->which = 1;
-+ else {
-+ u32 tmpval;
-+ pci_read_config_dword(dev, offset, &tmpval);
-+ if (tmpval != bar->val && value == bar->val) {
-+ /* Allow restoration of bar value. */
-+ pci_write_config_dword(dev, offset, bar->val);
-+ }
-+ bar->which = 0;
-+ }
++void pciback_config_field_free(struct config_field *field);
+
-+ /* Do we need to support enabling/disabling the rom address here? */
++int pciback_config_quirk_release(struct pci_dev *dev);
+
-+ return 0;
-+}
++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg);
+
-+/* For the BARs, only allow writes which write ~0 or
-+ * the correct resource information
-+ * (Needed for when the driver probes the resource usage)
++#endif
+diff --git a/drivers/xen/pciback/controller.c b/drivers/xen/pciback/controller.c
+new file mode 100644
+index 0000000..7f04f11
+--- /dev/null
++++ b/drivers/xen/pciback/controller.c
+@@ -0,0 +1,442 @@
++/*
++ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
++ * Alex Williamson <alex.williamson at hp.com>
++ *
++ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI
++ * controllers. Devices under the same PCI controller are exposed on the
++ * same virtual domain:bus. Within a bus, device slots are virtualized
++ * to compact the bus.
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
-+static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
-+{
-+ struct pci_bar_info *bar = data;
-+
-+ if (unlikely(!bar)) {
-+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
-+ pci_name(dev));
-+ return XEN_PCI_ERR_op_failed;
-+ }
+
-+ /* A write to obtain the length must happen as a 32-bit write.
-+ * This does not (yet) support writing individual bytes
-+ */
-+ if (value == ~0)
-+ bar->which = 1;
-+ else {
-+ u32 tmpval;
-+ pci_read_config_dword(dev, offset, &tmpval);
-+ if (tmpval != bar->val && value == bar->val) {
-+ /* Allow restoration of bar value. */
-+ pci_write_config_dword(dev, offset, bar->val);
-+ }
-+ bar->which = 0;
-+ }
++#include <linux/acpi.h>
++#include <linux/list.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
+
-+ return 0;
-+}
++#define PCI_MAX_BUSSES 255
++#define PCI_MAX_SLOTS 32
+
-+static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
-+{
-+ struct pci_bar_info *bar = data;
++struct controller_dev_entry {
++ struct list_head list;
++ struct pci_dev *dev;
++ unsigned int devfn;
++};
+
-+ if (unlikely(!bar)) {
-+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
-+ pci_name(dev));
-+ return XEN_PCI_ERR_op_failed;
-+ }
++struct controller_list_entry {
++ struct list_head list;
++ struct pci_controller *controller;
++ unsigned int domain;
++ unsigned int bus;
++ unsigned int next_devfn;
++ struct list_head dev_list;
++};
+
-+ *value = bar->which ? bar->len_val : bar->val;
++struct controller_dev_data {
++ struct list_head list;
++ unsigned int next_domain;
++ unsigned int next_bus;
++ spinlock_t lock;
++};
+
-+ return 0;
-+}
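
The bar_write/bar_read pair above emulates the standard PCI BAR sizing handshake: a probe write of all-ones flips the BAR into sizing mode, and the next read answers from the snapshotted len_val instead of the live register. A minimal userspace sketch of the underlying protocol, assuming a hypothetical 64 KiB 32-bit memory BAR (bar_reg and BAR_SIZE are stand-ins, not pciback names):

    #include <stdio.h>
    #include <stdint.h>

    /* Simulated 32-bit memory BAR; a real device hard-wires the low
     * address bits to zero, so a write of all-ones leaves ~(size - 1)
     * in the address field. */
    static uint32_t bar_reg;
    static const uint32_t BAR_SIZE = 0x10000;    /* assumed 64 KiB */

    static void bar_write32(uint32_t v)
    {
        bar_reg = v & ~(BAR_SIZE - 1);
    }

    static uint32_t bar_read32(void)
    {
        return bar_reg;
    }

    int main(void)
    {
        bar_write32(~0u);                        /* sizing probe */
        uint32_t mask = bar_read32() & ~0xfu;    /* strip flag bits */
        printf("decoded size: %u bytes\n", ~mask + 1u);  /* 65536 */
        return 0;
    }
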
++struct walk_info {
++ struct pciback_device *pdev;
++ int resource_count;
++ int root_num;
++};
+
-+static inline void read_dev_bar(struct pci_dev *dev,
-+ struct pci_bar_info *bar_info, int offset,
-+ u32 len_mask)
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
+{
-+ int pos;
-+ struct resource *res = dev->resource;
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_dev_entry *dev_entry;
++ struct controller_list_entry *cntrl_entry;
++ struct pci_dev *dev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&dev_data->lock, flags);
++
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ if (cntrl_entry->domain != domain ||
++ cntrl_entry->bus != bus)
++ continue;
+
-+ if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1)
-+ pos = PCI_ROM_RESOURCE;
-+ else {
-+ pos = (offset - PCI_BASE_ADDRESS_0) / 4;
-+ if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE |
-+ PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
-+ (PCI_BASE_ADDRESS_SPACE_MEMORY |
-+ PCI_BASE_ADDRESS_MEM_TYPE_64))) {
-+ bar_info->val = res[pos - 1].start >> 32;
-+ bar_info->len_val = res[pos - 1].end >> 32;
-+ return;
++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
++ if (devfn == dev_entry->devfn) {
++ dev = dev_entry->dev;
++ goto found;
++ }
+ }
+ }
++found:
++ spin_unlock_irqrestore(&dev_data->lock, flags);
+
-+ bar_info->val = res[pos].start |
-+ (res[pos].flags & PCI_REGION_FLAG_MASK);
-+ bar_info->len_val = res[pos].end - res[pos].start + 1;
++ return dev;
+}
+
-+static void *bar_init(struct pci_dev *dev, int offset)
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
+{
-+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
-+
-+ if (!bar)
-+ return ERR_PTR(-ENOMEM);
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_dev_entry *dev_entry;
++ struct controller_list_entry *cntrl_entry;
++ struct pci_controller *dev_controller = PCI_CONTROLLER(dev);
++ unsigned long flags;
++ int ret = 0, found = 0;
+
-+ read_dev_bar(dev, bar, offset, ~0);
-+ bar->which = 0;
++ spin_lock_irqsave(&dev_data->lock, flags);
+
-+ return bar;
-+}
++ /* Look to see if we already have a domain:bus for this controller */
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ if (cntrl_entry->controller == dev_controller) {
++ found = 1;
++ break;
++ }
++ }
+
-+static void *rom_init(struct pci_dev *dev, int offset)
-+{
-+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
++ if (!found) {
++ cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC);
++ if (!cntrl_entry) {
++ ret = -ENOMEM;
++ goto out;
++ }
+
-+ if (!bar)
-+ return ERR_PTR(-ENOMEM);
++ cntrl_entry->controller = dev_controller;
++ cntrl_entry->next_devfn = PCI_DEVFN(0, 0);
+
-+ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
-+ bar->which = 0;
++ cntrl_entry->domain = dev_data->next_domain;
++ cntrl_entry->bus = dev_data->next_bus++;
++ if (dev_data->next_bus > PCI_MAX_BUSSES) {
++ dev_data->next_domain++;
++ dev_data->next_bus = 0;
++ }
+
-+ return bar;
-+}
++ INIT_LIST_HEAD(&cntrl_entry->dev_list);
+
-+static void bar_reset(struct pci_dev *dev, int offset, void *data)
-+{
-+ struct pci_bar_info *bar = data;
++ list_add_tail(&cntrl_entry->list, &dev_data->list);
++ }
+
-+ bar->which = 0;
-+}
++ if (PCI_SLOT(cntrl_entry->next_devfn) > PCI_MAX_SLOTS) {
++ /*
++ * While it seems unlikely, this can actually happen if
++ * a controller has P2P bridges under it.
++ */
++ xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x "
++ "is full, no room to export %04x:%02x:%02x.%x",
++ cntrl_entry->domain, cntrl_entry->bus,
++ pci_domain_nr(dev->bus), dev->bus->number,
++ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
++ ret = -ENOSPC;
++ goto out;
++ }
+
-+static void bar_release(struct pci_dev *dev, int offset, void *data)
-+{
-+ kfree(data);
-+}
++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC);
++ if (!dev_entry) {
++ if (list_empty(&cntrl_entry->dev_list)) {
++ list_del(&cntrl_entry->list);
++ kfree(cntrl_entry);
++ }
++ ret = -ENOMEM;
++ goto out;
++ }
+
-+static int pciback_read_vendor(struct pci_dev *dev, int offset,
-+ u16 *value, void *data)
-+{
-+ *value = dev->vendor;
++ dev_entry->dev = dev;
++ dev_entry->devfn = cntrl_entry->next_devfn;
+
-+ return 0;
-+}
++ list_add_tail(&dev_entry->list, &cntrl_entry->dev_list);
+
-+static int pciback_read_device(struct pci_dev *dev, int offset,
-+ u16 *value, void *data)
-+{
-+ *value = dev->device;
++ cntrl_entry->next_devfn += PCI_DEVFN(1, 0);
+
-+ return 0;
-+}
++out:
++ spin_unlock_irqrestore(&dev_data->lock, flags);
+
-+static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
-+ void *data)
-+{
-+ *value = (u8) dev->irq;
++ /* TODO: Publish virtual domain:bus:slot.func here. */
+
-+ return 0;
++ return ret;
+}
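
pciback_add_pci_dev above compacts the virtual bus by handing each exported device the next whole slot, bumping next_devfn in PCI_DEVFN(1, 0) steps. A standalone sketch of the devfn packing; the macros mirror the kernel's PCI_DEVFN/PCI_SLOT/PCI_FUNC definitions:

    #include <stdio.h>

    /* devfn packs a 5-bit slot and a 3-bit function number. */
    #define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07))
    #define PCI_SLOT(devfn)       (((devfn) >> 3) & 0x1f)
    #define PCI_FUNC(devfn)       ((devfn) & 0x07)

    int main(void)
    {
        unsigned int next_devfn = PCI_DEVFN(0, 0);
        int i;

        /* Each exported device takes function 0 of the next slot. */
        for (i = 0; i < 4; i++) {
            printf("device %d -> slot %u func %u\n", i,
                   PCI_SLOT(next_devfn), PCI_FUNC(next_devfn));
            next_devfn += PCI_DEVFN(1, 0);
        }
        return 0;
    }
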
+
-+static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
-+ u8 cur_value;
-+ int err;
-+
-+ err = pci_read_config_byte(dev, offset, &cur_value);
-+ if (err)
-+ goto out;
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_list_entry *cntrl_entry;
++ struct controller_dev_entry *dev_entry = NULL;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
+
-+ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
-+ || value == PCI_BIST_START)
-+ err = pci_write_config_byte(dev, offset, value);
++ spin_lock_irqsave(&dev_data->lock, flags);
+
-+out:
-+ return err;
-+}
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ if (cntrl_entry->controller != PCI_CONTROLLER(dev))
++ continue;
+
-+static const struct config_field header_common[] = {
-+ {
-+ .offset = PCI_VENDOR_ID,
-+ .size = 2,
-+ .u.w.read = pciback_read_vendor,
-+ },
-+ {
-+ .offset = PCI_DEVICE_ID,
-+ .size = 2,
-+ .u.w.read = pciback_read_device,
-+ },
-+ {
-+ .offset = PCI_COMMAND,
-+ .size = 2,
-+ .u.w.read = command_read,
-+ .u.w.write = command_write,
-+ },
-+ {
-+ .offset = PCI_INTERRUPT_LINE,
-+ .size = 1,
-+ .u.b.read = interrupt_read,
-+ },
-+ {
-+ .offset = PCI_INTERRUPT_PIN,
-+ .size = 1,
-+ .u.b.read = pciback_read_config_byte,
-+ },
-+ {
-+ /* Any side effects of letting driver domain control cache line? */
-+ .offset = PCI_CACHE_LINE_SIZE,
-+ .size = 1,
-+ .u.b.read = pciback_read_config_byte,
-+ .u.b.write = pciback_write_config_byte,
-+ },
-+ {
-+ .offset = PCI_LATENCY_TIMER,
-+ .size = 1,
-+ .u.b.read = pciback_read_config_byte,
-+ },
-+ {
-+ .offset = PCI_BIST,
-+ .size = 1,
-+ .u.b.read = pciback_read_config_byte,
-+ .u.b.write = bist_write,
-+ },
-+ {}
-+};
++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
++ if (dev_entry->dev == dev) {
++ found_dev = dev_entry->dev;
++ break;
++ }
++ }
++ }
+
-+#define CFG_FIELD_BAR(reg_offset) \
-+ { \
-+ .offset = reg_offset, \
-+ .size = 4, \
-+ .init = bar_init, \
-+ .reset = bar_reset, \
-+ .release = bar_release, \
-+ .u.dw.read = bar_read, \
-+ .u.dw.write = bar_write, \
-+ }
++ if (!found_dev) {
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++ return;
++ }
+
-+#define CFG_FIELD_ROM(reg_offset) \
-+ { \
-+ .offset = reg_offset, \
-+ .size = 4, \
-+ .init = rom_init, \
-+ .reset = bar_reset, \
-+ .release = bar_release, \
-+ .u.dw.read = bar_read, \
-+ .u.dw.write = rom_write, \
-+ }
++ list_del(&dev_entry->list);
++ kfree(dev_entry);
+
-+static const struct config_field header_0[] = {
-+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
-+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
-+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
-+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
-+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
-+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
-+ CFG_FIELD_ROM(PCI_ROM_ADDRESS),
-+ {}
-+};
++ if (list_empty(&cntrl_entry->dev_list)) {
++ list_del(&cntrl_entry->list);
++ kfree(cntrl_entry);
++ }
+
-+static const struct config_field header_1[] = {
-+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
-+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
-+ CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
-+ {}
-+};
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++ pcistub_put_pci_dev(found_dev);
++}
+
-+int pciback_config_header_add_fields(struct pci_dev *dev)
++int pciback_init_devices(struct pciback_device *pdev)
+{
-+ int err;
++ struct controller_dev_data *dev_data;
+
-+ err = pciback_config_add_fields(dev, header_common);
-+ if (err)
-+ goto out;
++ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
++ if (!dev_data)
++ return -ENOMEM;
+
-+ switch (dev->hdr_type) {
-+ case PCI_HEADER_TYPE_NORMAL:
-+ err = pciback_config_add_fields(dev, header_0);
-+ break;
++ spin_lock_init(&dev_data->lock);
+
-+ case PCI_HEADER_TYPE_BRIDGE:
-+ err = pciback_config_add_fields(dev, header_1);
-+ break;
++ INIT_LIST_HEAD(&dev_data->list);
+
-+ default:
-+ err = -EINVAL;
-+ printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
-+ pci_name(dev), dev->hdr_type);
-+ break;
-+ }
++ /* Starting domain:bus numbers */
++ dev_data->next_domain = 0;
++ dev_data->next_bus = 0;
+
-+out:
-+ return err;
++ pdev->pci_dev_data = dev_data;
++
++ return 0;
+}
-diff --git a/drivers/xen/pciback/conf_space_quirks.c b/drivers/xen/pciback/conf_space_quirks.c
-new file mode 100644
-index 0000000..45c31fb
---- /dev/null
-+++ b/drivers/xen/pciback/conf_space_quirks.c
-@@ -0,0 +1,140 @@
-+/*
-+ * PCI Backend - Handle special overlays for broken devices.
-+ *
-+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
-+ * Author: Chris Bookholt <hap10 at epoch.ncsc.mil>
-+ */
+
-+#include <linux/kernel.h>
-+#include <linux/pci.h>
-+#include "pciback.h"
-+#include "conf_space.h"
-+#include "conf_space_quirks.h"
++static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
++{
++ struct walk_info *info = data;
++ struct acpi_resource_address64 addr;
++ acpi_status status;
++ int i, len, err;
++ char str[32], tmp[3];
++ unsigned char *ptr, *buf;
+
-+LIST_HEAD(pciback_quirks);
++ status = acpi_resource_to_address64(res, &addr);
+
-+static inline const struct pci_device_id *
-+match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
-+{
-+ if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
-+ (id->device == PCI_ANY_ID || id->device == dev->device) &&
-+ (id->subvendor == PCI_ANY_ID ||
-+ id->subvendor == dev->subsystem_vendor) &&
-+ (id->subdevice == PCI_ANY_ID ||
-+ id->subdevice == dev->subsystem_device) &&
-+ !((id->class ^ dev->class) & id->class_mask))
-+ return id;
-+ return NULL;
-+}
++ /* Do we care about this range? Let's check. */
++ if (!ACPI_SUCCESS(status) ||
++ !(addr.resource_type == ACPI_MEMORY_RANGE ||
++ addr.resource_type == ACPI_IO_RANGE) ||
++ !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
++ return AE_OK;
+
-+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
-+{
-+ struct pciback_config_quirk *tmp_quirk;
++ /*
++ * Furthermore, we really only care to tell the guest about
++ * address ranges that require address translation of some sort.
++ */
++ if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
++ addr.info.mem.translation) &&
++ !(addr.resource_type == ACPI_IO_RANGE &&
++ addr.info.io.translation))
++ return AE_OK;
+
-+ list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
-+ if (match_one_device(&tmp_quirk->devid, dev) != NULL)
-+ goto out;
-+ tmp_quirk = NULL;
-+ printk(KERN_DEBUG
-+ "quirk didn't match any device pciback knows about\n");
-+out:
-+ return tmp_quirk;
-+}
++ /* Store the resource in xenbus for the guest */
++ len = snprintf(str, sizeof(str), "root-%d-resource-%d",
++ info->root_num, info->resource_count);
++ if (unlikely(len >= (sizeof(str) - 1)))
++ return AE_OK;
+
-+static inline void register_quirk(struct pciback_config_quirk *quirk)
-+{
-+ list_add_tail(&quirk->quirks_list, &pciback_quirks);
-+}
++ buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
++ if (!buf)
++ return AE_OK;
+
-+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg)
-+{
-+ int ret = 0;
-+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
-+ struct config_field_entry *cfg_entry;
++ /* Clean out resource_source */
++ res->data.address64.resource_source.index = 0xFF;
++ res->data.address64.resource_source.string_length = 0;
++ res->data.address64.resource_source.string_ptr = NULL;
+
-+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
-+ if (OFFSET(cfg_entry) == reg) {
-+ ret = 1;
-+ break;
-+ }
++ ptr = (unsigned char *)res;
++
++ /* Turn the acpi_resource into an ASCII byte stream */
++ for (i = 0; i < sizeof(*res); i++) {
++ snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
++ strncat(buf, tmp, 2);
+ }
-+ return ret;
-+}
+
-+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
-+ *field)
-+{
-+ int err = 0;
++ err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
++ str, "%s", buf);
+
-+ switch (field->size) {
-+ case 1:
-+ field->u.b.read = pciback_read_config_byte;
-+ field->u.b.write = pciback_write_config_byte;
-+ break;
-+ case 2:
-+ field->u.w.read = pciback_read_config_word;
-+ field->u.w.write = pciback_write_config_word;
-+ break;
-+ case 4:
-+ field->u.dw.read = pciback_read_config_dword;
-+ field->u.dw.write = pciback_write_config_dword;
-+ break;
-+ default:
-+ err = -EINVAL;
-+ goto out;
-+ }
++ if (!err)
++ info->resource_count++;
+
-+ pciback_config_add_field(dev, field);
++ kfree(buf);
+
-+out:
-+ return err;
++ return AE_OK;
+}
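
write_xenbus_resource above has to push a binary struct acpi_resource through xenbus, which only carries strings, so it serializes the raw bytes as two hex characters each. A self-contained sketch of that encoding, using a small dummy struct in place of acpi_resource:

    #include <stdio.h>
    #include <string.h>

    struct dummy_res {              /* stands in for struct acpi_resource */
        unsigned int type;
        unsigned int length;
    };

    int main(void)
    {
        struct dummy_res res = { 1, 0x1000 };
        const unsigned char *ptr = (const unsigned char *)&res;
        char buf[sizeof(res) * 2 + 1] = "";
        char tmp[3];
        size_t i;

        /* Two hex digits per byte, appended to a NUL-terminated buffer,
         * exactly as the loop above does with snprintf/strncat. */
        for (i = 0; i < sizeof(res); i++) {
            snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
            strncat(buf, tmp, 2);
        }
        printf("%s\n", buf);
        return 0;
    }
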
+
-+int pciback_config_quirks_init(struct pci_dev *dev)
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_root_cb)
+{
-+ struct pciback_config_quirk *quirk;
-+ int ret = 0;
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_list_entry *cntrl_entry;
++ int i, root_num, len, err = 0;
++ unsigned int domain, bus;
++ char str[64];
++ struct walk_info info;
+
-+ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
-+ if (!quirk) {
-+ ret = -ENOMEM;
-+ goto out;
-+ }
++ spin_lock(&dev_data->lock);
+
-+ quirk->devid.vendor = dev->vendor;
-+ quirk->devid.device = dev->device;
-+ quirk->devid.subvendor = dev->subsystem_vendor;
-+ quirk->devid.subdevice = dev->subsystem_device;
-+ quirk->devid.class = 0;
-+ quirk->devid.class_mask = 0;
-+ quirk->devid.driver_data = 0UL;
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ /* First publish all the domain:bus info */
++ err = publish_root_cb(pdev, cntrl_entry->domain,
++ cntrl_entry->bus);
++ if (err)
++ goto out;
+
-+ quirk->pdev = dev;
++ /*
++ * Now figure out which root-%d this belongs to
++ * so we can associate resources with it.
++ */
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ "root_num", "%d", &root_num);
+
-+ register_quirk(quirk);
-+out:
-+ return ret;
-+}
++ if (err != 1)
++ goto out;
+
-+void pciback_config_field_free(struct config_field *field)
-+{
-+ kfree(field);
-+}
++ for (i = 0; i < root_num; i++) {
++ len = snprintf(str, sizeof(str), "root-%d", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
+
-+int pciback_config_quirk_release(struct pci_dev *dev)
-+{
-+ struct pciback_config_quirk *quirk;
-+ int ret = 0;
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ str, "%x:%x", &domain, &bus);
++ if (err != 2)
++ goto out;
++
++ /* Is this the one we just published? */
++ if (domain == cntrl_entry->domain &&
++ bus == cntrl_entry->bus)
++ break;
++ }
++
++ if (i == root_num)
++ goto out;
++
++ info.pdev = pdev;
++ info.resource_count = 0;
++ info.root_num = i;
++
++ /* Let ACPI do the heavy lifting on decoding resources */
++ acpi_walk_resources(cntrl_entry->controller->acpi_handle,
++ METHOD_NAME__CRS, write_xenbus_resource,
++ &info);
++
++ /* No resources. OK. On to the next one */
++ if (!info.resource_count)
++ continue;
++
++ /* Store the number of resources we wrote for this root-%d */
++ len = snprintf(str, sizeof(str), "root-%d-resources", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%d", info.resource_count);
++ if (err)
++ goto out;
++ }
+
-+ quirk = pciback_find_quirk(dev);
-+ if (!quirk) {
-+ ret = -ENXIO;
++ /* Finally, write some magic to synchronize with the guest. */
++ len = snprintf(str, sizeof(str), "root-resource-magic");
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
+ goto out;
+ }
+
-+ list_del(&quirk->quirks_list);
-+ kfree(quirk);
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%lx", (sizeof(struct acpi_resource) * 2) + 1);
+
+out:
-+ return ret;
-+}
-diff --git a/drivers/xen/pciback/conf_space_quirks.h b/drivers/xen/pciback/conf_space_quirks.h
-new file mode 100644
-index 0000000..acd0e1a
---- /dev/null
-+++ b/drivers/xen/pciback/conf_space_quirks.h
-@@ -0,0 +1,35 @@
-+/*
-+ * PCI Backend - Data structures for special overlays for broken devices.
-+ *
-+ * Ryan Wilson <hap9 at epoch.ncsc.mil>
-+ * Chris Bookholt <hap10 at epoch.ncsc.mil>
-+ */
-+
-+#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
-+#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
-+
-+#include <linux/pci.h>
-+#include <linux/list.h>
-+
-+struct pciback_config_quirk {
-+ struct list_head quirks_list;
-+ struct pci_device_id devid;
-+ struct pci_dev *pdev;
-+};
++ spin_unlock(&dev_data->lock);
+
-+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
++ return err;
++}
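
The xenbus key names above (root-%d, root-%d-resource-%d, ...) are all built with snprintf and rejected if the result would not fit; snprintf returns the length the full string would have needed, so comparing against the buffer size detects truncation. A standalone sketch of that defensive pattern, with the buffer deliberately undersized to trigger the check:

    #include <stdio.h>

    int main(void)
    {
        char str[16];                    /* deliberately tight buffer */
        int root = 3, resource = 12;
        int len;

        len = snprintf(str, sizeof(str), "root-%d-resource-%d",
                       root, resource);
        if (len >= (int)sizeof(str) - 1) {
            /* Would have been truncated: refuse to use the key. */
            fprintf(stderr, "key truncated\n");
            return 1;
        }
        printf("key: %s\n", str);
        return 0;
    }
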
+
-+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
-+ *field);
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_list_entry *cntrl_entry, *c;
++ struct controller_dev_entry *dev_entry, *d;
+
-+int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
++ list_for_each_entry_safe(cntrl_entry, c, &dev_data->list, list) {
++ list_for_each_entry_safe(dev_entry, d,
++ &cntrl_entry->dev_list, list) {
++ list_del(&dev_entry->list);
++ pcistub_put_pci_dev(dev_entry->dev);
++ kfree(dev_entry);
++ }
++ list_del(&cntrl_entry->list);
++ kfree(cntrl_entry);
++ }
+
-+int pciback_config_quirks_init(struct pci_dev *dev);
++ kfree(dev_data);
++ pdev->pci_dev_data = NULL;
++}
+
-+void pciback_config_field_free(struct config_field *field);
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus, unsigned int *devfn)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_dev_entry *dev_entry;
++ struct controller_list_entry *cntrl_entry;
++ unsigned long flags;
++ int found = 0;
++ spin_lock_irqsave(&dev_data->lock, flags);
+
-+int pciback_config_quirk_release(struct pci_dev *dev);
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
++ if ((dev_entry->dev->bus->number ==
++ pcidev->bus->number) &&
++ (dev_entry->dev->devfn ==
++ pcidev->devfn) &&
++ (pci_domain_nr(dev_entry->dev->bus) ==
++ pci_domain_nr(pcidev->bus))) {
++ found = 1;
++ *domain = cntrl_entry->domain;
++ *bus = cntrl_entry->bus;
++ *devfn = dev_entry->devfn;
++ goto out;
++ }
++ }
++ }
++out:
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++ return found;
+
-+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg);
++}
+
-+#endif
-diff --git a/drivers/xen/pciback/controller.c b/drivers/xen/pciback/controller.c
+diff --git a/drivers/xen/pciback/passthrough.c b/drivers/xen/pciback/passthrough.c
new file mode 100644
-index 0000000..7f04f11
+index 0000000..5386beb
--- /dev/null
-+++ b/drivers/xen/pciback/controller.c
-@@ -0,0 +1,442 @@
++++ b/drivers/xen/pciback/passthrough.c
+@@ -0,0 +1,178 @@
+/*
-+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
-+ * Alex Williamson <alex.williamson at hp.com>
-+ *
-+ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI
-+ * controllers. Devices under the same PCI controller are exposed on the
-+ * same virtual domain:bus. Within a bus, device slots are virtualized
-+ * to compact the bus.
-+ *
-+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2 of the License, or
-+ * (at your option) any later version.
-+ *
-+ * This program is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ * GNU General Public License for more details.
++ * PCI Backend - Provides restricted access to the real PCI bus topology
++ * to the frontend
+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
+ */
+
-+#include <linux/acpi.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
-+#define PCI_MAX_BUSSES 255
-+#define PCI_MAX_SLOTS 32
-+
-+struct controller_dev_entry {
-+ struct list_head list;
-+ struct pci_dev *dev;
-+ unsigned int devfn;
-+};
-+
-+struct controller_list_entry {
-+ struct list_head list;
-+ struct pci_controller *controller;
-+ unsigned int domain;
-+ unsigned int bus;
-+ unsigned int next_devfn;
++struct passthrough_dev_data {
++ /* Access to dev_list must be protected by lock */
+ struct list_head dev_list;
-+};
-+
-+struct controller_dev_data {
-+ struct list_head list;
-+ unsigned int next_domain;
-+ unsigned int next_bus;
+ spinlock_t lock;
+};
+
-+struct walk_info {
-+ struct pciback_device *pdev;
-+ int resource_count;
-+ int root_num;
-+};
-+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
-+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
-+ struct controller_dev_entry *dev_entry;
-+ struct controller_list_entry *cntrl_entry;
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry;
+ struct pci_dev *dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
-+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
-+ if (cntrl_entry->domain != domain ||
-+ cntrl_entry->bus != bus)
-+ continue;
-+
-+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
-+ if (devfn == dev_entry->devfn) {
-+ dev = dev_entry->dev;
-+ goto found;
-+ }
-+ }
-+ }
-+found:
-+ spin_unlock_irqrestore(&dev_data->lock, flags);
-+
-+ return dev;
-+}
-+
-+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
-+ int devid, publish_pci_dev_cb publish_cb)
-+{
-+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
-+ struct controller_dev_entry *dev_entry;
-+ struct controller_list_entry *cntrl_entry;
-+ struct pci_controller *dev_controller = PCI_CONTROLLER(dev);
-+ unsigned long flags;
-+ int ret = 0, found = 0;
-+
-+ spin_lock_irqsave(&dev_data->lock, flags);
-+
-+ /* Look to see if we already have a domain:bus for this controller */
-+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
-+ if (cntrl_entry->controller == dev_controller) {
-+ found = 1;
++ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
++ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
++ && bus == (unsigned int)dev_entry->dev->bus->number
++ && devfn == dev_entry->dev->devfn) {
++ dev = dev_entry->dev;
+ break;
+ }
+ }
+
-+ if (!found) {
-+ cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC);
-+ if (!cntrl_entry) {
-+ ret = -ENOMEM;
-+ goto out;
-+ }
-+
-+ cntrl_entry->controller = dev_controller;
-+ cntrl_entry->next_devfn = PCI_DEVFN(0, 0);
-+
-+ cntrl_entry->domain = dev_data->next_domain;
-+ cntrl_entry->bus = dev_data->next_bus++;
-+ if (dev_data->next_bus > PCI_MAX_BUSSES) {
-+ dev_data->next_domain++;
-+ dev_data->next_bus = 0;
-+ }
-+
-+ INIT_LIST_HEAD(&cntrl_entry->dev_list);
-+
-+ list_add_tail(&cntrl_entry->list, &dev_data->list);
-+ }
-+
-+ if (PCI_SLOT(cntrl_entry->next_devfn) > PCI_MAX_SLOTS) {
-+ /*
-+ * While it seems unlikely, this can actually happen if
-+ * a controller has P2P bridges under it.
-+ */
-+ xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x "
-+ "is full, no room to export %04x:%02x:%02x.%x",
-+ cntrl_entry->domain, cntrl_entry->bus,
-+ pci_domain_nr(dev->bus), dev->bus->number,
-+ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
-+ ret = -ENOSPC;
-+ goto out;
-+ }
-+
-+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC);
-+ if (!dev_entry) {
-+ if (list_empty(&cntrl_entry->dev_list)) {
-+ list_del(&cntrl_entry->list);
-+ kfree(cntrl_entry);
-+ }
-+ ret = -ENOMEM;
-+ goto out;
-+ }
++ spin_unlock_irqrestore(&dev_data->lock, flags);
+
-+ dev_entry->dev = dev;
-+ dev_entry->devfn = cntrl_entry->next_devfn;
++ return dev;
++}
+
-+ list_add_tail(&dev_entry->list, &cntrl_entry->dev_list);
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
++{
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry;
++ unsigned long flags;
++ unsigned int domain, bus, devfn;
++ int err;
+
-+ cntrl_entry->next_devfn += PCI_DEVFN(1, 0);
++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
++ if (!dev_entry)
++ return -ENOMEM;
++ dev_entry->dev = dev;
+
-+out:
++ spin_lock_irqsave(&dev_data->lock, flags);
++ list_add_tail(&dev_entry->list, &dev_data->dev_list);
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
-+ /* TODO: Publish virtual domain:bus:slot.func here. */
++ /* Publish this device. */
++ domain = (unsigned int)pci_domain_nr(dev->bus);
++ bus = (unsigned int)dev->bus->number;
++ devfn = dev->devfn;
++ err = publish_cb(pdev, domain, bus, devfn, devid);
+
-+ return ret;
++ return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
-+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
-+ struct controller_list_entry *cntrl_entry;
-+ struct controller_dev_entry *dev_entry = NULL;
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry, *t;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
-+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
-+ if (cntrl_entry->controller != PCI_CONTROLLER(dev))
-+ continue;
-+
-+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
-+ if (dev_entry->dev == dev) {
-+ found_dev = dev_entry->dev;
-+ break;
-+ }
++ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
++ if (dev_entry->dev == dev) {
++ list_del(&dev_entry->list);
++ found_dev = dev_entry->dev;
++ kfree(dev_entry);
+ }
+ }
+
-+ if (!found_dev) {
-+ spin_unlock_irqrestore(&dev_data->lock, flags);
-+ return;
-+ }
-+
-+ list_del(&dev_entry->list);
-+ kfree(dev_entry);
-+
-+ if (list_empty(&cntrl_entry->dev_list)) {
-+ list_del(&cntrl_entry->list);
-+ kfree(cntrl_entry);
-+ }
-+
+ spin_unlock_irqrestore(&dev_data->lock, flags);
-+ pcistub_put_pci_dev(found_dev);
++
++ if (found_dev)
++ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
-+ struct controller_dev_data *dev_data;
++ struct passthrough_dev_data *dev_data;
+
+ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
@@ -19268,3173 +21158,3486 @@
+
+ spin_lock_init(&dev_data->lock);
+
-+ INIT_LIST_HEAD(&dev_data->list);
-+
-+ /* Starting domain:bus numbers */
-+ dev_data->next_domain = 0;
-+ dev_data->next_bus = 0;
++ INIT_LIST_HEAD(&dev_data->dev_list);
+
+ pdev->pci_dev_data = dev_data;
+
+ return 0;
+}
+
-+static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_root_cb)
+{
-+ struct walk_info *info = data;
-+ struct acpi_resource_address64 addr;
-+ acpi_status status;
-+ int i, len, err;
-+ char str[32], tmp[3];
-+ unsigned char *ptr, *buf;
++ int err = 0;
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry, *e;
++ struct pci_dev *dev;
++ int found;
++ unsigned int domain, bus;
+
-+ status = acpi_resource_to_address64(res, &addr);
++ spin_lock(&dev_data->lock);
+
-+ /* Do we care about this range? Let's check. */
-+ if (!ACPI_SUCCESS(status) ||
-+ !(addr.resource_type == ACPI_MEMORY_RANGE ||
-+ addr.resource_type == ACPI_IO_RANGE) ||
-+ !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
-+ return AE_OK;
++ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
++ /* Only publish this device as a root if none of its
++ * parent bridges are exported
++ */
++ found = 0;
++ dev = dev_entry->dev->bus->self;
++ for (; !found && dev != NULL; dev = dev->bus->self) {
++ list_for_each_entry(e, &dev_data->dev_list, list) {
++ if (dev == e->dev) {
++ found = 1;
++ break;
++ }
++ }
++ }
+
-+ /*
-+ * Furthermore, we really only care to tell the guest about
-+ * address ranges that require address translation of some sort.
-+ */
-+ if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
-+ addr.info.mem.translation) &&
-+ !(addr.resource_type == ACPI_IO_RANGE &&
-+ addr.info.io.translation))
-+ return AE_OK;
++ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
++ bus = (unsigned int)dev_entry->dev->bus->number;
+
-+ /* Store the resource in xenbus for the guest */
-+ len = snprintf(str, sizeof(str), "root-%d-resource-%d",
-+ info->root_num, info->resource_count);
-+ if (unlikely(len >= (sizeof(str) - 1)))
-+ return AE_OK;
++ if (!found) {
++ err = publish_root_cb(pdev, domain, bus);
++ if (err)
++ break;
++ }
++ }
+
-+ buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
-+ if (!buf)
-+ return AE_OK;
++ spin_unlock(&dev_data->lock);
+
-+ /* Clean out resource_source */
-+ res->data.address64.resource_source.index = 0xFF;
-+ res->data.address64.resource_source.string_length = 0;
-+ res->data.address64.resource_source.string_ptr = NULL;
++ return err;
++}
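
The passthrough variant of pciback_publish_pci_roots above publishes a device as a root only when none of the bridges above it are themselves exported, by walking the bus->self chain. A minimal sketch of that walk, with a hypothetical parent_bridge pointer standing in for the kernel's bus->self:

    #include <stdio.h>
    #include <stddef.h>

    struct dev {
        struct dev *parent_bridge;   /* stands in for dev->bus->self */
        int exported;
        const char *name;
    };

    static int has_exported_ancestor(const struct dev *d)
    {
        const struct dev *p;

        for (p = d->parent_bridge; p != NULL; p = p->parent_bridge)
            if (p->exported)
                return 1;
        return 0;
    }

    int main(void)
    {
        struct dev bridge = { NULL, 1, "bridge" };
        struct dev nic_a  = { &bridge, 1, "nic-a" };  /* behind bridge */
        struct dev nic_b  = { NULL, 1, "nic-b" };     /* standalone */

        printf("%s is a root: %s\n", nic_a.name,
               has_exported_ancestor(&nic_a) ? "no" : "yes");
        printf("%s is a root: %s\n", nic_b.name,
               has_exported_ancestor(&nic_b) ? "no" : "yes");
        return 0;
    }
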
+
-+ ptr = (unsigned char *)res;
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry, *t;
+
-+ /* Turn the acpi_resource into an ASCII byte stream */
-+ for (i = 0; i < sizeof(*res); i++) {
-+ snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
-+ strncat(buf, tmp, 2);
++ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
++ list_del(&dev_entry->list);
++ pcistub_put_pci_dev(dev_entry->dev);
++ kfree(dev_entry);
+ }
+
-+ err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
-+ str, "%s", buf);
-+
-+ if (!err)
-+ info->resource_count++;
++ kfree(dev_data);
++ pdev->pci_dev_data = NULL;
++}
+
-+ kfree(buf);
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn)
+
-+ return AE_OK;
++{
++ *domain = pci_domain_nr(pcidev->bus);
++ *bus = pcidev->bus->number;
++ *devfn = pcidev->devfn;
++ return 1;
+}
+diff --git a/drivers/xen/pciback/pci_stub.c b/drivers/xen/pciback/pci_stub.c
+new file mode 100644
+index 0000000..02178e2
+--- /dev/null
++++ b/drivers/xen/pciback/pci_stub.c
+@@ -0,0 +1,1287 @@
++/*
++ * PCI Stub Driver - Grabs devices in backend to be exported later
++ *
++ * Ryan Wilson <hap9 at epoch.ncsc.mil>
++ * Chris Bookholt <hap10 at epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/rwsem.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/kref.h>
++#include <linux/pci.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
++#include <asm/atomic.h>
++#include <xen/events.h>
++#include <asm/xen/pci.h>
++#include <asm/xen/hypervisor.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
+
-+int pciback_publish_pci_roots(struct pciback_device *pdev,
-+ publish_pci_root_cb publish_root_cb)
-+{
-+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
-+ struct controller_list_entry *cntrl_entry;
-+ int i, root_num, len, err = 0;
-+ unsigned int domain, bus;
-+ char str[64];
-+ struct walk_info info;
++static char *pci_devs_to_hide;
++wait_queue_head_t aer_wait_queue;
++/* Add a semaphore to synchronize AER handling with pciback
++ * remove/reconfigure ops; we want to avoid pciback devices being
++ * removed in the middle of an AER operation.
++ */
++static DECLARE_RWSEM(pcistub_sem);
++module_param_named(hide, pci_devs_to_hide, charp, 0444);
+
-+ spin_lock(&dev_data->lock);
++struct pcistub_device_id {
++ struct list_head slot_list;
++ int domain;
++ unsigned char bus;
++ unsigned int devfn;
++};
++static LIST_HEAD(pcistub_device_ids);
++static DEFINE_SPINLOCK(device_ids_lock);
+
-+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
-+ /* First publish all the domain:bus info */
-+ err = publish_root_cb(pdev, cntrl_entry->domain,
-+ cntrl_entry->bus);
-+ if (err)
-+ goto out;
++struct pcistub_device {
++ struct kref kref;
++ struct list_head dev_list;
++ spinlock_t lock;
+
-+ /*
-+ * Now figure out which root-%d this belongs to
-+ * so we can associate resources with it.
-+ */
-+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
-+ "root_num", "%d", &root_num);
++ struct pci_dev *dev;
++ struct pciback_device *pdev;/* non-NULL if struct pci_dev is in use */
++};
+
-+ if (err != 1)
-+ goto out;
++/* Access to pcistub_devices & seized_devices lists and the initialize_devices
++ * flag must be locked with pcistub_devices_lock
++ */
++static DEFINE_SPINLOCK(pcistub_devices_lock);
++static LIST_HEAD(pcistub_devices);
++
++/* wait for device_initcall before initializing our devices
++ * (see pcistub_init_devices_late)
++ */
++static int initialize_devices;
++static LIST_HEAD(seized_devices);
+
-+ for (i = 0; i < root_num; i++) {
-+ len = snprintf(str, sizeof(str), "root-%d", i);
-+ if (unlikely(len >= (sizeof(str) - 1))) {
-+ err = -ENOMEM;
-+ goto out;
-+ }
++static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev;
+
-+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
-+ str, "%x:%x", &domain, &bus);
-+ if (err != 2)
-+ goto out;
++ dev_dbg(&dev->dev, "pcistub_device_alloc\n");
+
-+ /* Is this the one we just published? */
-+ if (domain == cntrl_entry->domain &&
-+ bus == cntrl_entry->bus)
-+ break;
-+ }
++ psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
++ if (!psdev)
++ return NULL;
+
-+ if (i == root_num)
-+ goto out;
++ psdev->dev = pci_dev_get(dev);
++ if (!psdev->dev) {
++ kfree(psdev);
++ return NULL;
++ }
+
-+ info.pdev = pdev;
-+ info.resource_count = 0;
-+ info.root_num = i;
++ kref_init(&psdev->kref);
++ spin_lock_init(&psdev->lock);
+
-+ /* Let ACPI do the heavy lifting on decoding resources */
-+ acpi_walk_resources(cntrl_entry->controller->acpi_handle,
-+ METHOD_NAME__CRS, write_xenbus_resource,
-+ &info);
++ return psdev;
++}
+
-+ /* No resouces. OK. On to the next one */
-+ if (!info.resource_count)
-+ continue;
++/* Don't call this directly as it's called by pcistub_device_put */
++static void pcistub_device_release(struct kref *kref)
++{
++ struct pcistub_device *psdev;
+
-+ /* Store the number of resources we wrote for this root-%d */
-+ len = snprintf(str, sizeof(str), "root-%d-resources", i);
-+ if (unlikely(len >= (sizeof(str) - 1))) {
-+ err = -ENOMEM;
-+ goto out;
-+ }
++ psdev = container_of(kref, struct pcistub_device, kref);
+
-+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
-+ "%d", info.resource_count);
-+ if (err)
-+ goto out;
-+ }
++ dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
+
-+ /* Finally, write some magic to synchronize with the guest. */
-+ len = snprintf(str, sizeof(str), "root-resource-magic");
-+ if (unlikely(len >= (sizeof(str) - 1))) {
-+ err = -ENOMEM;
-+ goto out;
-+ }
++ xen_unregister_device_domain_owner(psdev->dev);
+
-+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
-+ "%lx", (sizeof(struct acpi_resource) * 2) + 1);
++ /* Clean-up the device */
++ pciback_reset_device(psdev->dev);
++ pciback_config_free_dyn_fields(psdev->dev);
++ pciback_config_free_dev(psdev->dev);
++ kfree(pci_get_drvdata(psdev->dev));
++ pci_set_drvdata(psdev->dev, NULL);
+
-+out:
-+ spin_unlock(&dev_data->lock);
++ pci_dev_put(psdev->dev);
+
-+ return err;
++ kfree(psdev);
+}
+
-+void pciback_release_devices(struct pciback_device *pdev)
++static inline void pcistub_device_get(struct pcistub_device *psdev)
+{
-+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
-+ struct controller_list_entry *cntrl_entry, *c;
-+ struct controller_dev_entry *dev_entry, *d;
-+
-+ list_for_each_entry_safe(cntrl_entry, c, &dev_data->list, list) {
-+ list_for_each_entry_safe(dev_entry, d,
-+ &cntrl_entry->dev_list, list) {
-+ list_del(&dev_entry->list);
-+ pcistub_put_pci_dev(dev_entry->dev);
-+ kfree(dev_entry);
-+ }
-+ list_del(&cntrl_entry->list);
-+ kfree(cntrl_entry);
-+ }
++ kref_get(&psdev->kref);
++}
+
-+ kfree(dev_data);
-+ pdev->pci_dev_data = NULL;
++static inline void pcistub_device_put(struct pcistub_device *psdev)
++{
++ kref_put(&psdev->kref, pcistub_device_release);
+}
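
pcistub_device_release is only ever reached through the final kref_put, so teardown runs exactly once no matter which code path drops the last reference. A userspace sketch of the get/put pattern; the real kref uses atomic counts, which this simplification elides:

    #include <stdio.h>
    #include <stdlib.h>

    struct obj {
        int refcount;                 /* kref uses an atomic here */
        void (*release)(struct obj *);
    };

    static void obj_get(struct obj *o)
    {
        o->refcount++;
    }

    static void obj_put(struct obj *o)
    {
        if (--o->refcount == 0)
            o->release(o);            /* last put runs the release */
    }

    static void obj_release(struct obj *o)
    {
        printf("releasing\n");
        free(o);
    }

    int main(void)
    {
        struct obj *o = malloc(sizeof(*o));

        if (!o)
            return 1;
        o->refcount = 1;              /* like kref_init() */
        o->release = obj_release;

        obj_get(o);                   /* a lookup takes a reference */
        obj_put(o);                   /* ...and drops it when done */
        obj_put(o);                   /* final put frees the object */
        return 0;
    }
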
+
-+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
-+ struct pciback_device *pdev,
-+ unsigned int *domain, unsigned int *bus, unsigned int *devfn)
++static struct pcistub_device *pcistub_device_find(int domain, int bus,
++ int slot, int func)
+{
-+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
-+ struct controller_dev_entry *dev_entry;
-+ struct controller_list_entry *cntrl_entry;
++ struct pcistub_device *psdev = NULL;
+ unsigned long flags;
-+ int found = 0;
-+ spin_lock_irqsave(&dev_data->lock, flags);
+
-+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
-+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
-+ if ((dev_entry->dev->bus->number ==
-+ pcidev->bus->number) &&
-+ (dev_entry->dev->devfn ==
-+ pcidev->devfn) &&
-+ (pci_domain_nr(dev_entry->dev->bus) ==
-+ pci_domain_nr(pcidev->bus))) {
-+ found = 1;
-+ *domain = cntrl_entry->domain;
-+ *bus = cntrl_entry->bus;
-+ *devfn = dev_entry->devfn;
-+ goto out;
-+ }
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev != NULL
++ && domain == pci_domain_nr(psdev->dev->bus)
++ && bus == psdev->dev->bus->number
++ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
++ pcistub_device_get(psdev);
++ goto out;
+ }
+ }
-+out:
-+ spin_unlock_irqrestore(&dev_data->lock, flags);
-+ return found;
+
++ /* didn't find it */
++ psdev = NULL;
++
++out:
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return psdev;
+}
+
-diff --git a/drivers/xen/pciback/passthrough.c b/drivers/xen/pciback/passthrough.c
-new file mode 100644
-index 0000000..5386beb
---- /dev/null
-+++ b/drivers/xen/pciback/passthrough.c
-@@ -0,0 +1,178 @@
-+/*
-+ * PCI Backend - Provides restricted access to the real PCI bus topology
-+ * to the frontend
-+ *
-+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
-+ */
++static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
++ struct pcistub_device *psdev)
++{
++ struct pci_dev *pci_dev = NULL;
++ unsigned long flags;
+
-+#include <linux/list.h>
-+#include <linux/pci.h>
-+#include <linux/spinlock.h>
-+#include "pciback.h"
++ pcistub_device_get(psdev);
+
-+struct passthrough_dev_data {
-+ /* Access to dev_list must be protected by lock */
-+ struct list_head dev_list;
-+ spinlock_t lock;
-+};
++ spin_lock_irqsave(&psdev->lock, flags);
++ if (!psdev->pdev) {
++ psdev->pdev = pdev;
++ pci_dev = psdev->dev;
++ }
++ spin_unlock_irqrestore(&psdev->lock, flags);
+
-+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
-+ unsigned int domain, unsigned int bus,
-+ unsigned int devfn)
++ if (!pci_dev)
++ pcistub_device_put(psdev);
++
++ return pci_dev;
++}
++
++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
++ int domain, int bus,
++ int slot, int func)
+{
-+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
-+ struct pci_dev_entry *dev_entry;
-+ struct pci_dev *dev = NULL;
++ struct pcistub_device *psdev;
++ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
-+ spin_lock_irqsave(&dev_data->lock, flags);
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
-+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
-+ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
-+ && bus == (unsigned int)dev_entry->dev->bus->number
-+ && devfn == dev_entry->dev->devfn) {
-+ dev = dev_entry->dev;
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev != NULL
++ && domain == pci_domain_nr(psdev->dev->bus)
++ && bus == psdev->dev->bus->number
++ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
++ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+ break;
+ }
+ }
+
-+ spin_unlock_irqrestore(&dev_data->lock, flags);
-+
-+ return dev;
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return found_dev;
+}
+
-+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
-+ int devid, publish_pci_dev_cb publish_cb)
++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
++ struct pci_dev *dev)
+{
-+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
-+ struct pci_dev_entry *dev_entry;
++ struct pcistub_device *psdev;
++ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
-+ unsigned int domain, bus, devfn;
-+ int err;
-+
-+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
-+ if (!dev_entry)
-+ return -ENOMEM;
-+ dev_entry->dev = dev;
+
-+ spin_lock_irqsave(&dev_data->lock, flags);
-+ list_add_tail(&dev_entry->list, &dev_data->dev_list);
-+ spin_unlock_irqrestore(&dev_data->lock, flags);
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
-+ /* Publish this device. */
-+ domain = (unsigned int)pci_domain_nr(dev->bus);
-+ bus = (unsigned int)dev->bus->number;
-+ devfn = dev->devfn;
-+ err = publish_cb(pdev, domain, bus, devfn, devid);
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev == dev) {
++ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
++ break;
++ }
++ }
+
-+ return err;
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return found_dev;
+}
+
-+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++void pcistub_put_pci_dev(struct pci_dev *dev)
+{
-+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
-+ struct pci_dev_entry *dev_entry, *t;
-+ struct pci_dev *found_dev = NULL;
++ struct pcistub_device *psdev, *found_psdev = NULL;
+ unsigned long flags;
+
-+ spin_lock_irqsave(&dev_data->lock, flags);
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
-+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
-+ if (dev_entry->dev == dev) {
-+ list_del(&dev_entry->list);
-+ found_dev = dev_entry->dev;
-+ kfree(dev_entry);
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev == dev) {
++ found_psdev = psdev;
++ break;
+ }
+ }
+
-+ spin_unlock_irqrestore(&dev_data->lock, flags);
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
-+ if (found_dev)
-+ pcistub_put_pci_dev(found_dev);
++ /* Hold this lock to avoid breaking the link between pcistub
++ * and pciback while AER handling is in progress.
++ */
++ down_write(&pcistub_sem);
++ /* Cleanup our device
++ * (so it's ready for the next domain)
++ */
++ pciback_reset_device(found_psdev->dev);
++ pciback_config_free_dyn_fields(found_psdev->dev);
++ pciback_config_reset_dev(found_psdev->dev);
++
++ spin_lock_irqsave(&found_psdev->lock, flags);
++ found_psdev->pdev = NULL;
++ spin_unlock_irqrestore(&found_psdev->lock, flags);
++
++ pcistub_device_put(found_psdev);
++ up_write(&pcistub_sem);
+}
+
-+int pciback_init_devices(struct pciback_device *pdev)
++static int __devinit pcistub_match_one(struct pci_dev *dev,
++ struct pcistub_device_id *pdev_id)
+{
-+ struct passthrough_dev_data *dev_data;
++ /* Match the specified device by domain, bus, slot, func and also if
++ * any of the device's parent bridges match.
++ */
++ for (; dev != NULL; dev = dev->bus->self) {
++ if (pci_domain_nr(dev->bus) == pdev_id->domain
++ && dev->bus->number == pdev_id->bus
++ && dev->devfn == pdev_id->devfn)
++ return 1;
+
-+ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
-+ if (!dev_data)
-+ return -ENOMEM;
++ /* Sometimes the topmost bridge links to itself. */
++ if (dev == dev->bus->self)
++ break;
++ }
+
-+ spin_lock_init(&dev_data->lock);
++ return 0;
++}
+
-+ INIT_LIST_HEAD(&dev_data->dev_list);
++static int __devinit pcistub_match(struct pci_dev *dev)
++{
++ struct pcistub_device_id *pdev_id;
++ unsigned long flags;
++ int found = 0;
+
-+ pdev->pci_dev_data = dev_data;
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
++ if (pcistub_match_one(dev, pdev_id)) {
++ found = 1;
++ break;
++ }
++ }
++ spin_unlock_irqrestore(&device_ids_lock, flags);
+
-+ return 0;
++ return found;
+}
+
-+int pciback_publish_pci_roots(struct pciback_device *pdev,
-+ publish_pci_root_cb publish_root_cb)
++static int __devinit pcistub_init_device(struct pci_dev *dev)
+{
++ struct pciback_dev_data *dev_data;
+ int err = 0;
-+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
-+ struct pci_dev_entry *dev_entry, *e;
-+ struct pci_dev *dev;
-+ int found;
-+ unsigned int domain, bus;
+
-+ spin_lock(&dev_data->lock);
++ dev_dbg(&dev->dev, "initializing...\n");
+
-+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
-+ /* Only publish this device as a root if none of its
-+ * parent bridges are exported
-+ */
-+ found = 0;
-+ dev = dev_entry->dev->bus->self;
-+ for (; !found && dev != NULL; dev = dev->bus->self) {
-+ list_for_each_entry(e, &dev_data->dev_list, list) {
-+ if (dev == e->dev) {
-+ found = 1;
-+ break;
-+ }
-+ }
-+ }
++ /* The PCI backend is not intended to be a module (or to work with
++ * removable PCI devices) yet. If it were, pciback_config_free()
++ * would need to be called somewhere to free the memory allocated
++ * here, followed by a kfree(pci_get_drvdata(psdev->dev)).
++ */
++ dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC);
++ if (!dev_data) {
++ err = -ENOMEM;
++ goto out;
++ }
++ pci_set_drvdata(dev, dev_data);
+
-+ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
-+ bus = (unsigned int)dev_entry->dev->bus->number;
++ dev_dbg(&dev->dev, "initializing config\n");
+
-+ if (!found) {
-+ err = publish_root_cb(pdev, domain, bus);
-+ if (err)
-+ break;
-+ }
-+ }
++ init_waitqueue_head(&aer_wait_queue);
++ err = pciback_config_init_dev(dev);
++ if (err)
++ goto out;
+
-+ spin_unlock(&dev_data->lock);
++ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
++ * must do this here because pcibios_enable_device may specify
++ * the pci device's true irq (and possibly its other resources)
++ * if they differ from what's in the configuration space.
++ * This makes the assumption that the device's resources won't
++ * change after this point (otherwise this code may break!)
++ */
++ dev_dbg(&dev->dev, "enabling device\n");
++ err = pci_enable_device(dev);
++ if (err)
++ goto config_release;
+
-+ return err;
-+}
++ /* Now disable the device (this also ensures some private device
++ * data is set up before we export)
++ */
++ dev_dbg(&dev->dev, "reset device\n");
++ pciback_reset_device(dev);
+
-+void pciback_release_devices(struct pciback_device *pdev)
-+{
-+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
-+ struct pci_dev_entry *dev_entry, *t;
++ return 0;
+
-+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
-+ list_del(&dev_entry->list);
-+ pcistub_put_pci_dev(dev_entry->dev);
-+ kfree(dev_entry);
-+ }
++config_release:
++ pciback_config_free_dev(dev);
+
++out:
++ pci_set_drvdata(dev, NULL);
+ kfree(dev_data);
-+ pdev->pci_dev_data = NULL;
++ return err;
+}
+
-+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
-+ struct pciback_device *pdev,
-+ unsigned int *domain, unsigned int *bus,
-+ unsigned int *devfn)
-+
-+{
-+ *domain = pci_domain_nr(pcidev->bus);
-+ *bus = pcidev->bus->number;
-+ *devfn = pcidev->devfn;
-+ return 1;
-+}
-diff --git a/drivers/xen/pciback/pci_stub.c b/drivers/xen/pciback/pci_stub.c
-new file mode 100644
-index 0000000..02178e2
---- /dev/null
-+++ b/drivers/xen/pciback/pci_stub.c
-@@ -0,0 +1,1287 @@
+/*
-+ * PCI Stub Driver - Grabs devices in backend to be exported later
-+ *
-+ * Ryan Wilson <hap9 at epoch.ncsc.mil>
-+ * Chris Bookholt <hap10 at epoch.ncsc.mil>
++ * Because some initialization still happens on
++ * devices during fs_initcall, we need to defer
++ * full initialization of our devices until
++ * device_initcall.
+ */
-+#include <linux/module.h>
-+#include <linux/init.h>
-+#include <linux/rwsem.h>
-+#include <linux/list.h>
-+#include <linux/spinlock.h>
-+#include <linux/kref.h>
-+#include <linux/pci.h>
-+#include <linux/wait.h>
-+#include <linux/sched.h>
-+#include <asm/atomic.h>
-+#include <xen/events.h>
-+#include <asm/xen/pci.h>
-+#include <asm/xen/hypervisor.h>
-+#include "pciback.h"
-+#include "conf_space.h"
-+#include "conf_space_quirks.h"
-+
-+static char *pci_devs_to_hide;
-+wait_queue_head_t aer_wait_queue;
-+/*Add sem for sync AER handling and pciback remove/reconfigue ops,
-+* We want to avoid in middle of AER ops, pciback devices is being removed
-+*/
-+static DECLARE_RWSEM(pcistub_sem);
-+module_param_named(hide, pci_devs_to_hide, charp, 0444);
-+
-+struct pcistub_device_id {
-+ struct list_head slot_list;
-+ int domain;
-+ unsigned char bus;
-+ unsigned int devfn;
-+};
-+static LIST_HEAD(pcistub_device_ids);
-+static DEFINE_SPINLOCK(device_ids_lock);
-+
-+struct pcistub_device {
-+ struct kref kref;
-+ struct list_head dev_list;
-+ spinlock_t lock;
++static int __init pcistub_init_devices_late(void)
++{
++ struct pcistub_device *psdev;
++ unsigned long flags;
++ int err = 0;
+
-+ struct pci_dev *dev;
-+ struct pciback_device *pdev;/* non-NULL if struct pci_dev is in use */
-+};
++ pr_debug("pciback: pcistub_init_devices_late\n");
+
-+/* Access to pcistub_devices & seized_devices lists and the initialize_devices
-+ * flag must be locked with pcistub_devices_lock
-+ */
-+static DEFINE_SPINLOCK(pcistub_devices_lock);
-+static LIST_HEAD(pcistub_devices);
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
-+/* wait for device_initcall before initializing our devices
-+ * (see pcistub_init_devices_late)
-+ */
-+static int initialize_devices;
-+static LIST_HEAD(seized_devices);
++ while (!list_empty(&seized_devices)) {
++ psdev = container_of(seized_devices.next,
++ struct pcistub_device, dev_list);
++ list_del(&psdev->dev_list);
+
-+static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
-+{
-+ struct pcistub_device *psdev;
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
-+ dev_dbg(&dev->dev, "pcistub_device_alloc\n");
++ err = pcistub_init_device(psdev->dev);
++ if (err) {
++ dev_err(&psdev->dev->dev,
++ "error %d initializing device\n", err);
++ kfree(psdev);
++ psdev = NULL;
++ }
+
-+ psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
-+ if (!psdev)
-+ return NULL;
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
-+ psdev->dev = pci_dev_get(dev);
-+ if (!psdev->dev) {
-+ kfree(psdev);
-+ return NULL;
++ if (psdev)
++ list_add_tail(&psdev->dev_list, &pcistub_devices);
+ }
+
-+ kref_init(&psdev->kref);
-+ spin_lock_init(&psdev->lock);
++ initialize_devices = 1;
+
-+ return psdev;
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ return 0;
+}
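
pcistub_init_devices_late above drains the seized_devices list once device_initcall time arrives and then flips initialize_devices, so later seizures initialize immediately. A compact sketch of that two-phase pattern (pending, init_late, and initialized are illustrative names, not pciback's):

    #include <stdio.h>

    #define MAX_PENDING 8

    static int pending[MAX_PENDING];
    static int npending;
    static int initialized;

    static void init_one(int id)
    {
        printf("init device %d\n", id);
    }

    static void seize(int id)
    {
        if (initialized)
            init_one(id);             /* subsystem ready: init now */
        else if (npending < MAX_PENDING)
            pending[npending++] = id; /* defer until init_late() */
    }

    static void init_late(void)
    {
        int i;

        for (i = 0; i < npending; i++)
            init_one(pending[i]);
        npending = 0;
        initialized = 1;              /* future seizures init directly */
    }

    int main(void)
    {
        seize(1);        /* deferred */
        seize(2);        /* deferred */
        init_late();     /* drains the pending list */
        seize(3);        /* initialized immediately */
        return 0;
    }
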
+
-+/* Don't call this directly as it's called by pcistub_device_put */
-+static void pcistub_device_release(struct kref *kref)
++static int __devinit pcistub_seize(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
++ unsigned long flags;
++ int err = 0;
+
-+ psdev = container_of(kref, struct pcistub_device, kref);
++ psdev = pcistub_device_alloc(dev);
++ if (!psdev)
++ return -ENOMEM;
+
-+ dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
-+ xen_unregister_device_domain_owner(psdev->dev);
++ if (initialize_devices) {
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
-+ /* Clean-up the device */
-+ pciback_reset_device(psdev->dev);
-+ pciback_config_free_dyn_fields(psdev->dev);
-+ pciback_config_free_dev(psdev->dev);
-+ kfree(pci_get_drvdata(psdev->dev));
-+ pci_set_drvdata(psdev->dev, NULL);
++ /* don't want irqs disabled when calling pcistub_init_device */
++ err = pcistub_init_device(psdev->dev);
+
-+ pci_dev_put(psdev->dev);
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
-+ kfree(psdev);
-+}
++ if (!err)
++ list_add(&psdev->dev_list, &pcistub_devices);
++ } else {
++ dev_dbg(&dev->dev, "deferring initialization\n");
++ list_add(&psdev->dev_list, &seized_devices);
++ }
+
-+static inline void pcistub_device_get(struct pcistub_device *psdev)
-+{
-+ kref_get(&psdev->kref);
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ if (err)
++ pcistub_device_put(psdev);
++
++ return err;
+}
+
-+static inline void pcistub_device_put(struct pcistub_device *psdev)
++static int __devinit pcistub_probe(struct pci_dev *dev,
++ const struct pci_device_id *id)
+{
-+ kref_put(&psdev->kref, pcistub_device_release);
++ int err = 0;
++
++ dev_dbg(&dev->dev, "probing...\n");
++
++ if (pcistub_match(dev)) {
++
++ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
++ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
++ dev_err(&dev->dev, "can't export pci devices that "
++ "don't have a normal (0) or bridge (1) "
++ "header type!\n");
++ err = -ENODEV;
++ goto out;
++ }
++
++ dev_info(&dev->dev, "seizing device\n");
++ err = pcistub_seize(dev);
++ } else
++ /* Didn't find the device */
++ err = -ENODEV;
++
++out:
++ return err;
+}
+
-+static struct pcistub_device *pcistub_device_find(int domain, int bus,
-+ int slot, int func)
++static void pcistub_remove(struct pci_dev *dev)
+{
-+ struct pcistub_device *psdev = NULL;
++ struct pcistub_device *psdev, *found_psdev = NULL;
+ unsigned long flags;
+
++ dev_dbg(&dev->dev, "removing\n");
++
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
++ pciback_config_quirk_release(dev);
++
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
-+ if (psdev->dev != NULL
-+ && domain == pci_domain_nr(psdev->dev->bus)
-+ && bus == psdev->dev->bus->number
-+ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
-+ pcistub_device_get(psdev);
-+ goto out;
++ if (psdev->dev == dev) {
++ found_psdev = psdev;
++ break;
+ }
+ }
+
-+ /* didn't find it */
-+ psdev = NULL;
-+
-+out:
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
-+ return psdev;
-+}
+
-+static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
-+ struct pcistub_device *psdev)
-+{
-+ struct pci_dev *pci_dev = NULL;
-+ unsigned long flags;
++ if (found_psdev) {
++ dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
++ found_psdev->pdev);
+
-+ pcistub_device_get(psdev);
++ if (found_psdev->pdev) {
++ printk(KERN_WARNING "pciback: ****** removing device "
++ "%s while still in-use! ******\n",
++ pci_name(found_psdev->dev));
++ printk(KERN_WARNING "pciback: ****** driver domain may "
++ "still access this device's i/o resources!\n");
++ printk(KERN_WARNING "pciback: ****** shutdown driver "
++ "domain before binding device\n");
++ printk(KERN_WARNING "pciback: ****** to other drivers "
++ "or domains\n");
+
-+ spin_lock_irqsave(&psdev->lock, flags);
-+ if (!psdev->pdev) {
-+ psdev->pdev = pdev;
-+ pci_dev = psdev->dev;
++ pciback_release_pci_dev(found_psdev->pdev,
++ found_psdev->dev);
++ }
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ list_del(&found_psdev->dev_list);
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ /* the final put for releasing from the list */
++ pcistub_device_put(found_psdev);
+ }
-+ spin_unlock_irqrestore(&psdev->lock, flags);
++}
+
-+ if (!pci_dev)
-+ pcistub_device_put(psdev);
++static const struct pci_device_id pcistub_ids[] = {
++ {
++ .vendor = PCI_ANY_ID,
++ .device = PCI_ANY_ID,
++ .subvendor = PCI_ANY_ID,
++ .subdevice = PCI_ANY_ID,
++ },
++ {0,},
++};
+
-+ return pci_dev;
++#define PCI_NODENAME_MAX 40
++static void kill_domain_by_device(struct pcistub_device *psdev)
++{
++ struct xenbus_transaction xbt;
++ int err;
++ char nodename[PCI_NODENAME_MAX];
++
++ if (!psdev) {
++ printk(KERN_ERR "pciback: psdev is NULL when doing "
++ "AER recovery/kill_domain\n");
++ return;
++ }
++ snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
++ psdev->pdev->xdev->otherend_id);
++ nodename[strlen(nodename)] = '\0';
++
++again:
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ dev_err(&psdev->dev->dev,
++ "error %d when start xenbus transaction\n", err);
++ return;
++ }
++ /* PV AER handlers will set this flag */
++ xenbus_printf(xbt, nodename, "aerState", "aerfail");
++ err = xenbus_transaction_end(xbt, 0);
++ if (err) {
++ if (err == -EAGAIN)
++ goto again;
++ dev_err(&psdev->dev->dev,
++ "error %d when end xenbus transaction\n", err);
++ return;
++ }
+}
+
-+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
-+ int domain, int bus,
-+ int slot, int func)
++/* For each AER recovery step (error_detected, mmio_enabled, etc.) the
++ * frontend and backend need to cooperate. In pciback, each step does a
++ * similar job: send a service request and wait for the frontend's response.
++ */
++static pci_ers_result_t common_process(struct pcistub_device *psdev,
++ pci_channel_state_t state, int aer_cmd, pci_ers_result_t result)
+{
-+ struct pcistub_device *psdev;
-+ struct pci_dev *found_dev = NULL;
-+ unsigned long flags;
++ pci_ers_result_t res = result;
++ struct xen_pcie_aer_op *aer_op;
++ int ret;
+
-+ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ /* with PV AER drivers */
++ aer_op = &(psdev->pdev->sh_info->aer_op);
++ aer_op->cmd = aer_cmd;
++ /* useful for the error_detected callback */
++ aer_op->err = state;
++ /* pcifront BDF */
++ ret = pciback_get_pcifront_dev(psdev->dev, psdev->pdev,
++ &aer_op->domain, &aer_op->bus, &aer_op->devfn);
++ if (!ret) {
++ dev_err(&psdev->dev->dev,
++ "pciback: failed to get pcifront device\n");
++ return PCI_ERS_RESULT_NONE;
++ }
++ wmb();
+
-+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
-+ if (psdev->dev != NULL
-+ && domain == pci_domain_nr(psdev->dev->bus)
-+ && bus == psdev->dev->bus->number
-+ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
-+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
-+ break;
++ dev_dbg(&psdev->dev->dev,
++ "pciback: aer_op %x dom %x bus %x devfn %x\n",
++ aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
++ /* Local flag to mark that an AER request is pending; the pciback
++ * callback uses it to decide whether to check for the AER service
++ * ack signal from pcifront.
++ */
++ set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
++
++ /* It is possible that a pcifront conf_read_write request invokes
++ * the callback, which causes a spurious wake_up. That is harmless,
++ * and cheaper than taking a spinlock here.
++ */
++ set_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags);
++ wmb();
++ notify_remote_via_irq(psdev->pdev->evtchn_irq);
++
++ ret = wait_event_timeout(aer_wait_queue, !(test_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags)), 300*HZ);
++
++ if (!ret) {
++ if (test_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&psdev->dev->dev,
++ "pcifront aer process not responding!\n");
++ clear_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags);
++ aer_op->err = PCI_ERS_RESULT_NONE;
++ return res;
+ }
+ }
++ clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
++
++ if (test_bit(_XEN_PCIF_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_dbg(&psdev->dev->dev,
++ "schedule pci_conf service in pciback \n");
++ test_and_schedule_op(psdev->pdev);
++ }
+
-+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
-+ return found_dev;
++ res = (pci_ers_result_t)aer_op->err;
++ return res;
+}
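+
++/* To summarize the handshake implemented above: pciback fills in
++ * sh_info->aer_op, sets _PCIB_op_pending locally and _XEN_PCIB_active in
++ * the shared flags, kicks the event channel, and then waits up to 300
++ * seconds (300*HZ jiffies) for pcifront to clear _XEN_PCIB_active before
++ * reading the result back from aer_op->err. A timeout with the bit still
++ * set means pcifront never acked the request.
++ */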
+
-+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
-+ struct pci_dev *dev)
++/*
++ * pciback_slot_reset: send the slot_reset request to pcifront, in case the
++ * device driver provides this service, and then wait for the pcifront ack.
++ * @dev: pointer to the PCI device
++ * The return value is used by the aer_core do_recovery policy.
++ */
++static pci_ers_result_t pciback_slot_reset(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
-+ struct pci_dev *found_dev = NULL;
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ pci_ers_result_t result;
+
-+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
-+ if (psdev->dev == dev) {
-+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
-+ break;
-+ }
-+ }
++ result = PCI_ERS_RESULT_RECOVERED;
++ dev_dbg(&dev->dev, "pciback_slot_reset(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
+
-+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
-+ return found_dev;
-+}
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
+
-+void pcistub_put_pci_dev(struct pci_dev *dev)
-+{
-+ struct pcistub_device *psdev, *found_psdev = NULL;
-+ unsigned long flags;
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
++ }
+
-+ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
+
-+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
-+ if (psdev->dev == dev) {
-+ found_psdev = psdev;
-+ break;
-+ }
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&dev->dev,
++ "guest with no AER driver should have been killed\n");
++ goto release;
+ }
++ result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
+
-+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ if (result == PCI_ERS_RESULT_NONE ||
++ result == PCI_ERS_RESULT_DISCONNECT) {
++ dev_dbg(&dev->dev,
++ "No AER slot_reset service or disconnected!\n");
++ kill_domain_by_device(psdev);
++ }
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return result;
+
-+ /*hold this lock for avoiding breaking link between
-+ * pcistub and pciback when AER is in processing
-+ */
-+ down_write(&pcistub_sem);
-+ /* Cleanup our device
-+ * (so it's ready for the next domain)
-+ */
-+ pciback_reset_device(found_psdev->dev);
-+ pciback_config_free_dyn_fields(found_psdev->dev);
-+ pciback_config_reset_dev(found_psdev->dev);
++}
+
-+ spin_lock_irqsave(&found_psdev->lock, flags);
-+ found_psdev->pdev = NULL;
-+ spin_unlock_irqrestore(&found_psdev->lock, flags);
+
-+ pcistub_device_put(found_psdev);
-+ up_write(&pcistub_sem);
-+}
++/* pciback_mmio_enabled: send the mmio_enabled request to pcifront, in case
++ * the device driver provides this service, and then wait for the pcifront
++ * ack.
++ * @dev: pointer to the PCI device
++ * The return value is used by the aer_core do_recovery policy.
++ */
+
-+static int __devinit pcistub_match_one(struct pci_dev *dev,
-+ struct pcistub_device_id *pdev_id)
++static pci_ers_result_t pciback_mmio_enabled(struct pci_dev *dev)
+{
-+ /* Match the specified device by domain, bus, slot, func and also if
-+ * any of the device's parent bridges match.
-+ */
-+ for (; dev != NULL; dev = dev->bus->self) {
-+ if (pci_domain_nr(dev->bus) == pdev_id->domain
-+ && dev->bus->number == pdev_id->bus
-+ && dev->devfn == pdev_id->devfn)
-+ return 1;
++ struct pcistub_device *psdev;
++ pci_ers_result_t result;
+
-+ /* Sometimes topmost bridge links to itself. */
-+ if (dev == dev->bus->self)
-+ break;
-+ }
++ result = PCI_ERS_RESULT_RECOVERED;
++ dev_dbg(&dev->dev, "pciback_mmio_enabled(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
+
-+ return 0;
-+}
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
+
-+static int __devinit pcistub_match(struct pci_dev *dev)
-+{
-+ struct pcistub_device_id *pdev_id;
-+ unsigned long flags;
-+ int found = 0;
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
++ }
+
-+ spin_lock_irqsave(&device_ids_lock, flags);
-+ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
-+ if (pcistub_match_one(dev, pdev_id)) {
-+ found = 1;
-+ break;
-+ }
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
+ }
-+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
-+ return found;
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&dev->dev,
++ "guest with no AER driver should have been killed\n");
++ goto release;
++ }
++ result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
++
++ if (result == PCI_ERS_RESULT_NONE ||
++ result == PCI_ERS_RESULT_DISCONNECT) {
++ dev_dbg(&dev->dev,
++ "No AER mmio_enabled service or disconnected!\n");
++ kill_domain_by_device(psdev);
++ }
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return result;
+}
+
-+static int __devinit pcistub_init_device(struct pci_dev *dev)
++/* pciback_error_detected: send the error_detected request to pcifront, in
++ * case the device driver provides this service, and then wait for the
++ * pcifront ack.
++ * @dev: pointer to the PCI device
++ * @error: the current PCI connection state
++ * The return value is used by the aer_core do_recovery policy.
++ */
++
++static pci_ers_result_t pciback_error_detected(struct pci_dev *dev,
++ pci_channel_state_t error)
+{
-+ struct pciback_dev_data *dev_data;
-+ int err = 0;
++ struct pcistub_device *psdev;
++ pci_ers_result_t result;
+
-+ dev_dbg(&dev->dev, "initializing...\n");
++ result = PCI_ERS_RESULT_CAN_RECOVER;
++ dev_dbg(&dev->dev, "pciback_error_detected(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
+
-+ /* The PCI backend is not intended to be a module (or to work with
-+ * removable PCI devices (yet). If it were, pciback_config_free()
-+ * would need to be called somewhere to free the memory allocated
-+ * here and then to call kfree(pci_get_drvdata(psdev->dev)).
-+ */
-+ dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC);
-+ if (!dev_data) {
-+ err = -ENOMEM;
-+ goto out;
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
++
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
+ }
-+ pci_set_drvdata(dev, dev_data);
+
-+ dev_dbg(&dev->dev, "initializing config\n");
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
+
-+ init_waitqueue_head(&aer_wait_queue);
-+ err = pciback_config_init_dev(dev);
-+ if (err)
-+ goto out;
++ /* Guest owns the device, yet no AER handler is registered; kill the guest */
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
++ result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
+
-+ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
-+ * must do this here because pcibios_enable_device may specify
-+ * the pci device's true irq (and possibly its other resources)
-+ * if they differ from what's in the configuration space.
-+ * This makes the assumption that the device's resources won't
-+ * change after this point (otherwise this code may break!)
-+ */
-+ dev_dbg(&dev->dev, "enabling device\n");
-+ err = pci_enable_device(dev);
-+ if (err)
-+ goto config_release;
++ if (result == PCI_ERS_RESULT_NONE ||
++ result == PCI_ERS_RESULT_DISCONNECT) {
++ dev_dbg(&dev->dev,
++ "No AER error_detected service or disconnected!\n");
++ kill_domain_by_device(psdev);
++ }
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return result;
++}
++
++/* pciback_error_resume: send the error_resume request to pcifront, in case
++ * the device driver provides this service, and then wait for the pcifront
++ * ack.
++ * @dev: pointer to the PCI device
++ */
+
-+ /* Now disable the device (this also ensures some private device
-+ * data is setup before we export)
-+ */
-+ dev_dbg(&dev->dev, "reset device\n");
-+ pciback_reset_device(dev);
++static void pciback_error_resume(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev;
+
-+ return 0;
++ dev_dbg(&dev->dev, "pciback_error_resume(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
+
-+config_release:
-+ pciback_config_free_dev(dev);
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
+
-+out:
-+ pci_set_drvdata(dev, NULL);
-+ kfree(dev_data);
-+ return err;
-+}
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
++ }
+
-+/*
-+ * Because some initialization still happens on
-+ * devices during fs_initcall, we need to defer
-+ * full initialization of our devices until
-+ * device_initcall.
-+ */
-+static int __init pcistub_init_devices_late(void)
-+{
-+ struct pcistub_device *psdev;
-+ unsigned long flags;
-+ int err = 0;
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
+
-+ pr_debug("pciback: pcistub_init_devices_late\n");
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&dev->dev,
++ "guest with no AER driver should have been killed\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
++ common_process(psdev, 1, XEN_PCI_OP_aer_resume,
++ PCI_ERS_RESULT_RECOVERED);
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return;
++}
+
-+ spin_lock_irqsave(&pcistub_devices_lock, flags);
++/* pciback AER handling */
++static struct pci_error_handlers pciback_error_handler = {
++ .error_detected = pciback_error_detected,
++ .mmio_enabled = pciback_mmio_enabled,
++ .slot_reset = pciback_slot_reset,
++ .resume = pciback_error_resume,
++};
+
-+ while (!list_empty(&seized_devices)) {
-+ psdev = container_of(seized_devices.next,
-+ struct pcistub_device, dev_list);
-+ list_del(&psdev->dev_list);
++/*
++ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
++ * for a normal device. I don't want it to be loaded automatically.
++ */
+
-+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++static struct pci_driver pciback_pci_driver = {
++ .name = "pciback",
++ .id_table = pcistub_ids,
++ .probe = pcistub_probe,
++ .remove = pcistub_remove,
++ .err_handler = &pciback_error_handler,
++};
+
-+ err = pcistub_init_device(psdev->dev);
-+ if (err) {
-+ dev_err(&psdev->dev->dev,
-+ "error %d initializing device\n", err);
-+ kfree(psdev);
-+ psdev = NULL;
-+ }
++static inline int str_to_slot(const char *buf, int *domain, int *bus,
++ int *slot, int *func)
++{
++ int err;
+
-+ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
++ if (err == 4)
++ return 0;
++ else if (err < 0)
++ return -EINVAL;
+
-+ if (psdev)
-+ list_add_tail(&psdev->dev_list, &pcistub_devices);
-+ }
++ /* try again without domain */
++ *domain = 0;
++ err = sscanf(buf, " %x:%x.%x", bus, slot, func);
++ if (err == 3)
++ return 0;
+
-+ initialize_devices = 1;
++ return -EINVAL;
++}
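+
++/* For example (hypothetical values): "0000:00:1d.0" parses with an
++ * explicit domain, while "00:1d.0" parses with the domain defaulting to
++ * 0; all fields are hexadecimal. These are the strings written to the
++ * new_slot/remove_slot attributes below.
++ */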
+
-+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
++ *slot, int *func, int *reg, int *size, int *mask)
++{
++ int err;
+
-+ return 0;
++ err =
++ sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
++ func, reg, size, mask);
++ if (err == 7)
++ return 0;
++ return -EINVAL;
+}
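+
++/* For example (hypothetical values): "0000:00:1d.0-00000090:1:000000ff"
++ * selects device 0000:00:1d.0 and describes a 1-byte field at config
++ * offset 0x90 with mask 0x000000ff.
++ */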
+
-+static int __devinit pcistub_seize(struct pci_dev *dev)
++static int pcistub_device_id_add(int domain, int bus, int slot, int func)
+{
-+ struct pcistub_device *psdev;
++ struct pcistub_device_id *pci_dev_id;
+ unsigned long flags;
-+ int err = 0;
+
-+ psdev = pcistub_device_alloc(dev);
-+ if (!psdev)
++ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
++ if (!pci_dev_id)
+ return -ENOMEM;
+
-+ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ pci_dev_id->domain = domain;
++ pci_dev_id->bus = bus;
++ pci_dev_id->devfn = PCI_DEVFN(slot, func);
+
-+ if (initialize_devices) {
-+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
++ domain, bus, slot, func);
+
-+ /* don't want irqs disabled when calling pcistub_init_device */
-+ err = pcistub_init_device(psdev->dev);
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
++ spin_unlock_irqrestore(&device_ids_lock, flags);
+
-+ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ return 0;
++}
+
-+ if (!err)
-+ list_add(&psdev->dev_list, &pcistub_devices);
-+ } else {
-+ dev_dbg(&dev->dev, "deferring initialization\n");
-+ list_add(&psdev->dev_list, &seized_devices);
-+ }
++static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
++{
++ struct pcistub_device_id *pci_dev_id, *t;
++ int devfn = PCI_DEVFN(slot, func);
++ int err = -ENOENT;
++ unsigned long flags;
+
-+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids,
++ slot_list) {
++ if (pci_dev_id->domain == domain
++ && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
++ /* Don't break here; the same slot may appear in the
++ * list more than once.
++ */
++ list_del(&pci_dev_id->slot_list);
++ kfree(pci_dev_id);
+
-+ if (err)
-+ pcistub_device_put(psdev);
++ err = 0;
++
++ pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
++ "seize list\n", domain, bus, slot, func);
++ }
++ }
++ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return err;
+}
+
-+static int __devinit pcistub_probe(struct pci_dev *dev,
-+ const struct pci_device_id *id)
++static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
++ int size, int mask)
+{
+ int err = 0;
++ struct pcistub_device *psdev;
++ struct pci_dev *dev;
++ struct config_field *field;
+
-+ dev_dbg(&dev->dev, "probing...\n");
-+
-+ if (pcistub_match(dev)) {
++ psdev = pcistub_device_find(domain, bus, slot, func);
++ if (!psdev || !psdev->dev) {
++ err = -ENODEV;
++ goto out;
++ }
++ dev = psdev->dev;
+
-+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
-+ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-+ dev_err(&dev->dev, "can't export pci devices that "
-+ "don't have a normal (0) or bridge (1) "
-+ "header type!\n");
-+ err = -ENODEV;
-+ goto out;
-+ }
++ field = kzalloc(sizeof(*field), GFP_ATOMIC);
++ if (!field) {
++ err = -ENOMEM;
++ goto out;
++ }
+
-+ dev_info(&dev->dev, "seizing device\n");
-+ err = pcistub_seize(dev);
-+ } else
-+ /* Didn't find the device */
-+ err = -ENODEV;
++ field->offset = reg;
++ field->size = size;
++ field->mask = mask;
++ field->init = NULL;
++ field->reset = NULL;
++ field->release = NULL;
++ field->clean = pciback_config_field_free;
+
++ err = pciback_config_quirks_add_field(dev, field);
++ if (err)
++ kfree(field);
+out:
+ return err;
+}
+
-+static void pcistub_remove(struct pci_dev *dev)
++static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
++ size_t count)
+{
-+ struct pcistub_device *psdev, *found_psdev = NULL;
-+ unsigned long flags;
++ int domain, bus, slot, func;
++ int err;
+
-+ dev_dbg(&dev->dev, "removing\n");
++ err = str_to_slot(buf, &domain, &bus, &slot, &func);
++ if (err)
++ goto out;
+
-+ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ err = pcistub_device_id_add(domain, bus, slot, func);
+
-+ pciback_config_quirk_release(dev);
++out:
++ if (!err)
++ err = count;
++ return err;
++}
+
-+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
-+ if (psdev->dev == dev) {
-+ found_psdev = psdev;
-+ break;
-+ }
-+ }
++DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
++
++static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
++ size_t count)
++{
++ int domain, bus, slot, func;
++ int err;
+
-+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ err = str_to_slot(buf, &domain, &bus, &slot, &func);
++ if (err)
++ goto out;
+
-+ if (found_psdev) {
-+ dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
-+ found_psdev->pdev);
++ err = pcistub_device_id_remove(domain, bus, slot, func);
+
-+ if (found_psdev->pdev) {
-+ printk(KERN_WARNING "pciback: ****** removing device "
-+ "%s while still in-use! ******\n",
-+ pci_name(found_psdev->dev));
-+ printk(KERN_WARNING "pciback: ****** driver domain may "
-+ "still access this device's i/o resources!\n");
-+ printk(KERN_WARNING "pciback: ****** shutdown driver "
-+ "domain before binding device\n");
-+ printk(KERN_WARNING "pciback: ****** to other drivers "
-+ "or domains\n");
++out:
++ if (!err)
++ err = count;
++ return err;
++}
+
-+ pciback_release_pci_dev(found_psdev->pdev,
-+ found_psdev->dev);
-+ }
++DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
+
-+ spin_lock_irqsave(&pcistub_devices_lock, flags);
-+ list_del(&found_psdev->dev_list);
-+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
++{
++ struct pcistub_device_id *pci_dev_id;
++ size_t count = 0;
++ unsigned long flags;
+
-+ /* the final put for releasing from the list */
-+ pcistub_device_put(found_psdev);
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
++ if (count >= PAGE_SIZE)
++ break;
++
++ count += scnprintf(buf + count, PAGE_SIZE - count,
++ "%04x:%02x:%02x.%01x\n",
++ pci_dev_id->domain, pci_dev_id->bus,
++ PCI_SLOT(pci_dev_id->devfn),
++ PCI_FUNC(pci_dev_id->devfn));
+ }
++ spin_unlock_irqrestore(&device_ids_lock, flags);
++
++ return count;
+}
+
-+static const struct pci_device_id pcistub_ids[] = {
-+ {
-+ .vendor = PCI_ANY_ID,
-+ .device = PCI_ANY_ID,
-+ .subvendor = PCI_ANY_ID,
-+ .subdevice = PCI_ANY_ID,
-+ },
-+ {0,},
-+};
++DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
+
-+#define PCI_NODENAME_MAX 40
-+static void kill_domain_by_device(struct pcistub_device *psdev)
++static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
++ size_t count)
+{
-+ struct xenbus_transaction xbt;
++ int domain, bus, slot, func, reg, size, mask;
+ int err;
-+ char nodename[PCI_NODENAME_MAX];
+
-+ if (!psdev)
-+ dev_err(&psdev->dev->dev,
-+ "device is NULL when do AER recovery/kill_domain\n");
-+ snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
-+ psdev->pdev->xdev->otherend_id);
-+ nodename[strlen(nodename)] = '\0';
++ err = str_to_quirk(buf, &domain, &bus, &slot, &func, ®, &size,
++ &mask);
++ if (err)
++ goto out;
+
-+again:
-+ err = xenbus_transaction_start(&xbt);
-+ if (err) {
-+ dev_err(&psdev->dev->dev,
-+ "error %d when start xenbus transaction\n", err);
-+ return;
-+ }
-+ /*PV AER handlers will set this flag*/
-+ xenbus_printf(xbt, nodename, "aerState" , "aerfail");
-+ err = xenbus_transaction_end(xbt, 0);
-+ if (err) {
-+ if (err == -EAGAIN)
-+ goto again;
-+ dev_err(&psdev->dev->dev,
-+ "error %d when end xenbus transaction\n", err);
-+ return;
-+ }
++ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
++
++out:
++ if (!err)
++ err = count;
++ return err;
+}
+
-+/* For each aer recovery step error_detected, mmio_enabled, etc, front_end and
-+ * backend need to have cooperation. In pciback, those steps will do similar
-+ * jobs: send service request and waiting for front_end response.
-+*/
-+static pci_ers_result_t common_process(struct pcistub_device *psdev,
-+ pci_channel_state_t state, int aer_cmd, pci_ers_result_t result)
++static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
+{
-+ pci_ers_result_t res = result;
-+ struct xen_pcie_aer_op *aer_op;
-+ int ret;
++ int count = 0;
++ unsigned long flags;
++ struct pciback_config_quirk *quirk;
++ struct pciback_dev_data *dev_data;
++ const struct config_field *field;
++ const struct config_field_entry *cfg_entry;
+
-+ /*with PV AER drivers*/
-+ aer_op = &(psdev->pdev->sh_info->aer_op);
-+ aer_op->cmd = aer_cmd ;
-+ /*useful for error_detected callback*/
-+ aer_op->err = state;
-+ /*pcifront_end BDF*/
-+ ret = pciback_get_pcifront_dev(psdev->dev, psdev->pdev,
-+ &aer_op->domain, &aer_op->bus, &aer_op->devfn);
-+ if (!ret) {
-+ dev_err(&psdev->dev->dev,
-+ "pciback: failed to get pcifront device\n");
-+ return PCI_ERS_RESULT_NONE;
-+ }
-+ wmb();
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
++ if (count >= PAGE_SIZE)
++ goto out;
+
-+ dev_dbg(&psdev->dev->dev,
-+ "pciback: aer_op %x dom %x bus %x devfn %x\n",
-+ aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
-+ /*local flag to mark there's aer request, pciback callback will use this
-+ * flag to judge whether we need to check pci-front give aer service
-+ * ack signal
-+ */
-+ set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
++ count += scnprintf(buf + count, PAGE_SIZE - count,
++ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
++ quirk->pdev->bus->number,
++ PCI_SLOT(quirk->pdev->devfn),
++ PCI_FUNC(quirk->pdev->devfn),
++ quirk->devid.vendor, quirk->devid.device,
++ quirk->devid.subvendor,
++ quirk->devid.subdevice);
+
-+ /*It is possible that a pcifront conf_read_write ops request invokes
-+ * the callback which cause the spurious execution of wake_up.
-+ * Yet it is harmless and better than a spinlock here
-+ */
-+ set_bit(_XEN_PCIB_active,
-+ (unsigned long *)&psdev->pdev->sh_info->flags);
-+ wmb();
-+ notify_remote_via_irq(psdev->pdev->evtchn_irq);
++ dev_data = pci_get_drvdata(quirk->pdev);
+
-+ ret = wait_event_timeout(aer_wait_queue, !(test_bit(_XEN_PCIB_active,
-+ (unsigned long *)&psdev->pdev->sh_info->flags)), 300*HZ);
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++ if (count >= PAGE_SIZE)
++ goto out;
+
-+ if (!ret) {
-+ if (test_bit(_XEN_PCIB_active,
-+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
-+ dev_err(&psdev->dev->dev,
-+ "pcifront aer process not responding!\n");
-+ clear_bit(_XEN_PCIB_active,
-+ (unsigned long *)&psdev->pdev->sh_info->flags);
-+ aer_op->err = PCI_ERS_RESULT_NONE;
-+ return res;
++ count += scnprintf(buf + count, PAGE_SIZE - count,
++ "\t\t%08x:%01x:%08x\n",
++ cfg_entry->base_offset +
++ field->offset, field->size,
++ field->mask);
+ }
+ }
-+ clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
-+ if (test_bit(_XEN_PCIF_active,
-+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
-+ dev_dbg(&psdev->dev->dev,
-+ "schedule pci_conf service in pciback \n");
-+ test_and_schedule_op(psdev->pdev);
-+ }
++out:
++ spin_unlock_irqrestore(&device_ids_lock, flags);
+
-+ res = (pci_ers_result_t)aer_op->err;
-+ return res;
++ return count;
+}
+
-+/*
-+* pciback_slot_reset: it will send the slot_reset request to pcifront in case
-+* of the device driver could provide this service, and then wait for pcifront
-+* ack.
-+* @dev: pointer to PCI devices
-+* return value is used by aer_core do_recovery policy
-+*/
-+static pci_ers_result_t pciback_slot_reset(struct pci_dev *dev)
++DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
++
++static ssize_t permissive_add(struct device_driver *drv, const char *buf,
++ size_t count)
+{
++ int domain, bus, slot, func;
++ int err;
+ struct pcistub_device *psdev;
-+ pci_ers_result_t result;
-+
-+ result = PCI_ERS_RESULT_RECOVERED;
-+ dev_dbg(&dev->dev, "pciback_slot_reset(bus:%x,devfn:%x)\n",
-+ dev->bus->number, dev->devfn);
-+
-+ down_write(&pcistub_sem);
-+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
-+ dev->bus->number,
-+ PCI_SLOT(dev->devfn),
-+ PCI_FUNC(dev->devfn));
-+
-+ if (!psdev || !psdev->pdev) {
-+ dev_err(&dev->dev,
-+ "pciback device is not found/assigned\n");
-+ goto end;
-+ }
-+
-+ if (!psdev->pdev->sh_info) {
-+ dev_err(&dev->dev, "pciback device is not connected or owned"
-+ " by HVM, kill it\n");
-+ kill_domain_by_device(psdev);
-+ goto release;
++ struct pciback_dev_data *dev_data;
++ err = str_to_slot(buf, &domain, &bus, &slot, &func);
++ if (err)
++ goto out;
++ psdev = pcistub_device_find(domain, bus, slot, func);
++ if (!psdev) {
++ err = -ENODEV;
++ goto out;
+ }
-+
-+ if (!test_bit(_XEN_PCIB_AERHANDLER,
-+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
-+ dev_err(&dev->dev,
-+ "guest with no AER driver should have been killed\n");
++ if (!psdev->dev) {
++ err = -ENODEV;
+ goto release;
+ }
-+ result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
-+
-+ if (result == PCI_ERS_RESULT_NONE ||
-+ result == PCI_ERS_RESULT_DISCONNECT) {
-+ dev_dbg(&dev->dev,
-+ "No AER slot_reset service or disconnected!\n");
-+ kill_domain_by_device(psdev);
++ dev_data = pci_get_drvdata(psdev->dev);
++ /* the driver data for a device should never be null at this point */
++ if (!dev_data) {
++ err = -ENXIO;
++ goto release;
++ }
++ if (!dev_data->permissive) {
++ dev_data->permissive = 1;
++ /* Let user know that what they're doing could be unsafe */
++ dev_warn(&psdev->dev->dev, "enabling permissive mode "
++ "configuration space accesses!\n");
++ dev_warn(&psdev->dev->dev,
++ "permissive mode is potentially unsafe!\n");
+ }
+release:
+ pcistub_device_put(psdev);
-+end:
-+ up_write(&pcistub_sem);
-+ return result;
++out:
++ if (!err)
++ err = count;
++ return err;
++}
+
++static ssize_t permissive_show(struct device_driver *drv, char *buf)
++{
++ struct pcistub_device *psdev;
++ struct pciback_dev_data *dev_data;
++ size_t count = 0;
++ unsigned long flags;
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (count >= PAGE_SIZE)
++ break;
++ if (!psdev->dev)
++ continue;
++ dev_data = pci_get_drvdata(psdev->dev);
++ if (!dev_data || !dev_data->permissive)
++ continue;
++ count +=
++ scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
++ pci_name(psdev->dev));
++ }
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return count;
+}
+
++DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
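+
++/* Illustrative usage from dom0 (the attribute lives in the pciback
++ * driver's sysfs directory):
++ * echo "0000:00:1d.0" > /sys/bus/pci/drivers/pciback/permissive
++ * cat /sys/bus/pci/drivers/pciback/permissive
++ */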
+
-+/*pciback_mmio_enabled: it will send the mmio_enabled request to pcifront
-+* in case of the device driver could provide this service, and then wait
-+* for pcifront ack
-+* @dev: pointer to PCI devices
-+* return value is used by aer_core do_recovery policy
-+*/
++static void pcistub_exit(void)
++{
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
++ driver_remove_file(&pciback_pci_driver.driver,
++ &driver_attr_remove_slot);
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
+
-+static pci_ers_result_t pciback_mmio_enabled(struct pci_dev *dev)
++ pci_unregister_driver(&pciback_pci_driver);
++}
++
++static int __init pcistub_init(void)
+{
-+ struct pcistub_device *psdev;
-+ pci_ers_result_t result;
++ int pos = 0;
++ int err = 0;
++ int domain, bus, slot, func;
++ int parsed;
+
-+ result = PCI_ERS_RESULT_RECOVERED;
-+ dev_dbg(&dev->dev, "pciback_mmio_enabled(bus:%x,devfn:%x)\n",
-+ dev->bus->number, dev->devfn);
++ if (pci_devs_to_hide && *pci_devs_to_hide) {
++ do {
++ parsed = 0;
+
-+ down_write(&pcistub_sem);
-+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
-+ dev->bus->number,
-+ PCI_SLOT(dev->devfn),
-+ PCI_FUNC(dev->devfn));
++ err = sscanf(pci_devs_to_hide + pos,
++ " (%x:%x:%x.%x) %n",
++ &domain, &bus, &slot, &func, &parsed);
++ if (err != 4) {
++ domain = 0;
++ err = sscanf(pci_devs_to_hide + pos,
++ " (%x:%x.%x) %n",
++ &bus, &slot, &func, &parsed);
++ if (err != 3)
++ goto parse_error;
++ }
+
-+ if (!psdev || !psdev->pdev) {
-+ dev_err(&dev->dev,
-+ "pciback device is not found/assigned\n");
-+ goto end;
-+ }
++ err = pcistub_device_id_add(domain, bus, slot, func);
++ if (err)
++ goto out;
+
-+ if (!psdev->pdev->sh_info) {
-+ dev_err(&dev->dev, "pciback device is not connected or owned"
-+ " by HVM, kill it\n");
-+ kill_domain_by_device(psdev);
-+ goto release;
++ /* if parsed <= 0, we've reached the end of the string */
++ pos += parsed;
++ } while (parsed > 0 && pci_devs_to_hide[pos]);
+ }
+
-+ if (!test_bit(_XEN_PCIB_AERHANDLER,
-+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
-+ dev_err(&dev->dev,
-+ "guest with no AER driver should have been killed\n");
-+ goto release;
-+ }
-+ result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
++ /* If we're the first PCI Device Driver to register, we're the
++ * first one to get offered PCI devices as they become
++ * available (and thus we can be the first to grab them)
++ */
++ err = pci_register_driver(&pciback_pci_driver);
++ if (err < 0)
++ goto out;
+
-+ if (result == PCI_ERS_RESULT_NONE ||
-+ result == PCI_ERS_RESULT_DISCONNECT) {
-+ dev_dbg(&dev->dev,
-+ "No AER mmio_enabled service or disconnected!\n");
-+ kill_domain_by_device(psdev);
-+ }
-+release:
-+ pcistub_device_put(psdev);
-+end:
-+ up_write(&pcistub_sem);
-+ return result;
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_new_slot);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_remove_slot);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_slots);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_quirks);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_permissive);
++
++ if (err)
++ pcistub_exit();
++
++out:
++ return err;
++
++parse_error:
++ printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
++ pci_devs_to_hide + pos);
++ return -EINVAL;
+}
+
-+/*pciback_error_detected: it will send the error_detected request to pcifront
-+* in case of the device driver could provide this service, and then wait
-+* for pcifront ack.
-+* @dev: pointer to PCI devices
-+* @error: the current PCI connection state
-+* return value is used by aer_core do_recovery policy
-+*/
++#ifndef MODULE
++/*
++ * fs_initcall happens before device_initcall, so pciback *should* get
++ * called first (because we want to grab any device before other drivers
++ * get a chance, by being the first PCI device driver to register).
++ */
++fs_initcall(pcistub_init);
++#endif
+
-+static pci_ers_result_t pciback_error_detected(struct pci_dev *dev,
-+ pci_channel_state_t error)
++static int __init pciback_init(void)
+{
-+ struct pcistub_device *psdev;
-+ pci_ers_result_t result;
++ int err;
+
-+ result = PCI_ERS_RESULT_CAN_RECOVER;
-+ dev_dbg(&dev->dev, "pciback_error_detected(bus:%x,devfn:%x)\n",
-+ dev->bus->number, dev->devfn);
++ if (!xen_initial_domain())
++ return -ENODEV;
+
-+ down_write(&pcistub_sem);
-+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
-+ dev->bus->number,
-+ PCI_SLOT(dev->devfn),
-+ PCI_FUNC(dev->devfn));
++ err = pciback_config_init();
++ if (err)
++ return err;
+
-+ if (!psdev || !psdev->pdev) {
-+ dev_err(&dev->dev,
-+ "pciback device is not found/assigned\n");
-+ goto end;
-+ }
++#ifdef MODULE
++ err = pcistub_init();
++ if (err < 0)
++ return err;
++#endif
+
-+ if (!psdev->pdev->sh_info) {
-+ dev_err(&dev->dev, "pciback device is not connected or owned"
-+ " by HVM, kill it\n");
-+ kill_domain_by_device(psdev);
-+ goto release;
-+ }
++ pcistub_init_devices_late();
++ err = pciback_xenbus_register();
++ if (err)
++ pcistub_exit();
+
-+ /*Guest owns the device yet no aer handler regiested, kill guest*/
-+ if (!test_bit(_XEN_PCIB_AERHANDLER,
-+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
-+ dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
-+ kill_domain_by_device(psdev);
-+ goto release;
-+ }
-+ result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
++ return err;
++}
+
-+ if (result == PCI_ERS_RESULT_NONE ||
-+ result == PCI_ERS_RESULT_DISCONNECT) {
-+ dev_dbg(&dev->dev,
-+ "No AER error_detected service or disconnected!\n");
-+ kill_domain_by_device(psdev);
-+ }
-+release:
-+ pcistub_device_put(psdev);
-+end:
-+ up_write(&pcistub_sem);
-+ return result;
++static void __exit pciback_cleanup(void)
++{
++ pciback_xenbus_unregister();
++ pcistub_exit();
+}
+
-+/*pciback_error_resume: it will send the error_resume request to pcifront
-+* in case of the device driver could provide this service, and then wait
-+* for pcifront ack.
-+* @dev: pointer to PCI devices
-+*/
++module_init(pciback_init);
++module_exit(pciback_cleanup);
++
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/pciback/pciback.h b/drivers/xen/pciback/pciback.h
+new file mode 100644
+index 0000000..98e2912
+--- /dev/null
++++ b/drivers/xen/pciback/pciback.h
+@@ -0,0 +1,136 @@
++/*
++ * PCI Backend Common Data Structures & Function Declarations
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
++#ifndef __XEN_PCIBACK_H__
++#define __XEN_PCIBACK_H__
++
++#include <linux/pci.h>
++#include <linux/interrupt.h>
++#include <xen/xenbus.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/workqueue.h>
++#include <asm/atomic.h>
++#include <xen/interface/io/pciif.h>
++
++struct pci_dev_entry {
++ struct list_head list;
++ struct pci_dev *dev;
++};
+
-+static void pciback_error_resume(struct pci_dev *dev)
-+{
-+ struct pcistub_device *psdev;
++#define _PDEVF_op_active (0)
++#define PDEVF_op_active (1<<(_PDEVF_op_active))
++#define _PCIB_op_pending (1)
++#define PCIB_op_pending (1<<(_PCIB_op_pending))
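++/* Note: the _PDEVF/_PCIB bits above index into pciback_device.flags and
++ * are local to the backend; the _XEN_PCIF/_XEN_PCIB bits tested alongside
++ * them live in the shared info page and are visible to the frontend. */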
+
-+ dev_dbg(&dev->dev, "pciback_error_resume(bus:%x,devfn:%x)\n",
-+ dev->bus->number, dev->devfn);
++struct pciback_device {
++ void *pci_dev_data;
++ spinlock_t dev_lock;
+
-+ down_write(&pcistub_sem);
-+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
-+ dev->bus->number,
-+ PCI_SLOT(dev->devfn),
-+ PCI_FUNC(dev->devfn));
++ struct xenbus_device *xdev;
+
-+ if (!psdev || !psdev->pdev) {
-+ dev_err(&dev->dev,
-+ "pciback device is not found/assigned\n");
-+ goto end;
-+ }
++ struct xenbus_watch be_watch;
++ u8 be_watching;
+
-+ if (!psdev->pdev->sh_info) {
-+ dev_err(&dev->dev, "pciback device is not connected or owned"
-+ " by HVM, kill it\n");
-+ kill_domain_by_device(psdev);
-+ goto release;
-+ }
++ int evtchn_irq;
+
-+ if (!test_bit(_XEN_PCIB_AERHANDLER,
-+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
-+ dev_err(&dev->dev,
-+ "guest with no AER driver should have been killed\n");
-+ kill_domain_by_device(psdev);
-+ goto release;
-+ }
-+ common_process(psdev, 1, XEN_PCI_OP_aer_resume,
-+ PCI_ERS_RESULT_RECOVERED);
-+release:
-+ pcistub_device_put(psdev);
-+end:
-+ up_write(&pcistub_sem);
-+ return;
-+}
++ struct xen_pci_sharedinfo *sh_info;
+
-+/*add pciback AER handling*/
-+static struct pci_error_handlers pciback_error_handler = {
-+ .error_detected = pciback_error_detected,
-+ .mmio_enabled = pciback_mmio_enabled,
-+ .slot_reset = pciback_slot_reset,
-+ .resume = pciback_error_resume,
-+};
++ unsigned long flags;
+
-+/*
-+ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
-+ * for a normal device. I don't want it to be loaded automatically.
-+ */
++ struct work_struct op_work;
++};
+
-+static struct pci_driver pciback_pci_driver = {
-+ .name = "pciback",
-+ .id_table = pcistub_ids,
-+ .probe = pcistub_probe,
-+ .remove = pcistub_remove,
-+ .err_handler = &pciback_error_handler,
++struct pciback_dev_data {
++ struct list_head config_fields;
++ int permissive;
++ int warned_on_write;
+};
+
-+static inline int str_to_slot(const char *buf, int *domain, int *bus,
-+ int *slot, int *func)
-+{
-+ int err;
++/* Used by XenBus and pciback_ops.c */
++extern wait_queue_head_t aer_wait_queue;
++extern struct workqueue_struct *pciback_wq;
++/* Used by pcistub.c and conf_space_quirks.c */
++extern struct list_head pciback_quirks;
+
-+ err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
-+ if (err == 4)
-+ return 0;
-+ else if (err < 0)
-+ return -EINVAL;
++/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
++ int domain, int bus,
++ int slot, int func);
++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
++ struct pci_dev *dev);
++void pcistub_put_pci_dev(struct pci_dev *dev);
+
-+ /* try again without domain */
-+ *domain = 0;
-+ err = sscanf(buf, " %x:%x.%x", bus, slot, func);
-+ if (err == 3)
-+ return 0;
++/* Ensure a device is turned off or reset */
++void pciback_reset_device(struct pci_dev *pdev);
+
-+ return -EINVAL;
-+}
++/* Access a virtual configuration space for a PCI device */
++int pciback_config_init(void);
++int pciback_config_init_dev(struct pci_dev *dev);
++void pciback_config_free_dyn_fields(struct pci_dev *dev);
++void pciback_config_reset_dev(struct pci_dev *dev);
++void pciback_config_free_dev(struct pci_dev *dev);
++int pciback_config_read(struct pci_dev *dev, int offset, int size,
++ u32 *ret_val);
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
+
-+static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
-+ *slot, int *func, int *reg, int *size, int *mask)
-+{
-+ int err;
++/* Handle requests for specific devices from the frontend */
++typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn, unsigned int devid);
++typedef int (*publish_pci_root_cb) (struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus);
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb);
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn);
+
-+ err =
-+ sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
-+ func, reg, size, mask);
-+ if (err == 7)
-+ return 0;
-+ return -EINVAL;
-+}
++/*
++ * Added for dom0 PCIe AER handling. Get the guest domain/bus/devfn in
++ * pciback before sending the AER request to pcifront, so that the guest
++ * can identify the device and cooperate with pciback to finish the AER
++ * recovery job, if the device driver has the capability.
++ */
+
-+static int pcistub_device_id_add(int domain, int bus, int slot, int func)
-+{
-+ struct pcistub_device_id *pci_dev_id;
-+ unsigned long flags;
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn);
++int pciback_init_devices(struct pciback_device *pdev);
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb cb);
++void pciback_release_devices(struct pciback_device *pdev);
+
-+ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
-+ if (!pci_dev_id)
-+ return -ENOMEM;
++/* Handles events from front-end */
++irqreturn_t pciback_handle_event(int irq, void *dev_id);
++void pciback_do_op(struct work_struct *data);
+
-+ pci_dev_id->domain = domain;
-+ pci_dev_id->bus = bus;
-+ pci_dev_id->devfn = PCI_DEVFN(slot, func);
++int pciback_xenbus_register(void);
++void pciback_xenbus_unregister(void);
+
-+ pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
-+ domain, bus, slot, func);
++#ifdef CONFIG_PCI_MSI
++int pciback_enable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
+
-+ spin_lock_irqsave(&device_ids_lock, flags);
-+ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
-+ spin_unlock_irqrestore(&device_ids_lock, flags);
++int pciback_disable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
+
-+ return 0;
-+}
+
-+static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
++int pciback_enable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
++
++int pciback_disable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
++#endif
++extern int verbose_request;
++
++void test_and_schedule_op(struct pciback_device *pdev);
++#endif
++
+diff --git a/drivers/xen/pciback/pciback_ops.c b/drivers/xen/pciback/pciback_ops.c
+new file mode 100644
+index 0000000..011db67
+--- /dev/null
++++ b/drivers/xen/pciback/pciback_ops.c
+@@ -0,0 +1,139 @@
++/*
++ * PCI Backend Operations - respond to PCI requests from Frontend
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/wait.h>
++#include <linux/bitops.h>
++#include <xen/events.h>
++#include <linux/sched.h>
++#include "pciback.h"
++
++int verbose_request;
++module_param(verbose_request, int, 0644);
++
++/* Ensure a device is "turned off" and ready to be exported.
++ * (Also see pciback_config_reset to ensure virtual configuration space is
++ * ready to be re-exported)
++ */
++void pciback_reset_device(struct pci_dev *dev)
+{
-+ struct pcistub_device_id *pci_dev_id, *t;
-+ int devfn = PCI_DEVFN(slot, func);
-+ int err = -ENOENT;
-+ unsigned long flags;
++ u16 cmd;
+
-+ spin_lock_irqsave(&device_ids_lock, flags);
-+ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids,
-+ slot_list) {
-+ if (pci_dev_id->domain == domain
-+ && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
-+ /* Don't break; here because it's possible the same
-+ * slot could be in the list more than once
-+ */
-+ list_del(&pci_dev_id->slot_list);
-+ kfree(pci_dev_id);
++ /* Disable devices (but not bridges) */
++ if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
++#ifdef CONFIG_PCI_MSI
++ /* The guest could have been abruptly killed without
++ * disabling MSI/MSI-X interrupts. */
++ if (dev->msix_enabled)
++ pci_disable_msix(dev);
++ if (dev->msi_enabled)
++ pci_disable_msi(dev);
++#endif
++ pci_disable_device(dev);
+
-+ err = 0;
++ pci_write_config_word(dev, PCI_COMMAND, 0);
++
++ dev->is_busmaster = 0;
++ } else {
++ pci_read_config_word(dev, PCI_COMMAND, &cmd);
++ if (cmd & (PCI_COMMAND_INVALIDATE)) {
++ cmd &= ~(PCI_COMMAND_INVALIDATE);
++ pci_write_config_word(dev, PCI_COMMAND, cmd);
+
-+ pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
-+ "seize list\n", domain, bus, slot, func);
++ dev->is_busmaster = 0;
+ }
+ }
-+ spin_unlock_irqrestore(&device_ids_lock, flags);
-+
-+ return err;
++}
++/*
++ * The same evtchn is now used both for pcifront conf_read_write requests
++ * and for the PCIe AER frontend ack. We use a dedicated workqueue to
++ * schedule the pciback conf_read_write service, avoiding conflicts with
++ * the aer_core do_recovery job, which also uses the system default workqueue.
++ */
++void test_and_schedule_op(struct pciback_device *pdev)
++{
++ /* Check that frontend is requesting an operation and that we are not
++ * already processing a request */
++ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
++ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
++ queue_work(pciback_wq, &pdev->op_work);
++ }
++ /* _XEN_PCIB_active should have been cleared by pcifront. Also make
++ * sure pciback is waiting for an ack by checking _PCIB_op_pending. */
++ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
++ && test_bit(_PCIB_op_pending, &pdev->flags)) {
++ wake_up(&aer_wait_queue);
++ }
+}
+
-+static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
-+ int size, int mask)
++/* Configuration space reads/writes must not be done in atomic context,
++ * because some of the pci_* functions can sleep (mostly due to ACPI's use
++ * of semaphores). This function is intended to be called from a workqueue
++ * in process context, taking a struct pciback_device as its parameter. */
++
++void pciback_do_op(struct work_struct *data)
+{
-+ int err = 0;
-+ struct pcistub_device *psdev;
++ struct pciback_device *pdev =
++ container_of(data, struct pciback_device, op_work);
+ struct pci_dev *dev;
-+ struct config_field *field;
++ struct xen_pci_op *op = &pdev->sh_info->op;
+
-+ psdev = pcistub_device_find(domain, bus, slot, func);
-+ if (!psdev || !psdev->dev) {
-+ err = -ENODEV;
-+ goto out;
-+ }
-+ dev = psdev->dev;
++ dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
+
-+ field = kzalloc(sizeof(*field), GFP_ATOMIC);
-+ if (!field) {
-+ err = -ENOMEM;
-+ goto out;
++ if (dev == NULL)
++ op->err = XEN_PCI_ERR_dev_not_found;
++ else {
++ switch (op->cmd) {
++ case XEN_PCI_OP_conf_read:
++ op->err = pciback_config_read(dev,
++ op->offset, op->size, &op->value);
++ break;
++ case XEN_PCI_OP_conf_write:
++ op->err = pciback_config_write(dev,
++ op->offset, op->size, op->value);
++ break;
++#ifdef CONFIG_PCI_MSI
++ case XEN_PCI_OP_enable_msi:
++ op->err = pciback_enable_msi(pdev, dev, op);
++ break;
++ case XEN_PCI_OP_disable_msi:
++ op->err = pciback_disable_msi(pdev, dev, op);
++ break;
++ case XEN_PCI_OP_enable_msix:
++ op->err = pciback_enable_msix(pdev, dev, op);
++ break;
++ case XEN_PCI_OP_disable_msix:
++ op->err = pciback_disable_msix(pdev, dev, op);
++ break;
++#endif
++ default:
++ op->err = XEN_PCI_ERR_not_implemented;
++ break;
++ }
+ }
++ /* Tell the driver domain that we're done. */
++ wmb();
++ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
++ notify_remote_via_irq(pdev->evtchn_irq);
+
-+ field->offset = reg;
-+ field->size = size;
-+ field->mask = mask;
-+ field->init = NULL;
-+ field->reset = NULL;
-+ field->release = NULL;
-+ field->clean = pciback_config_field_free;
++ /* Mark that we're done. */
++ smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
++ clear_bit(_PDEVF_op_active, &pdev->flags);
++ smp_mb__after_clear_bit(); /* /before/ final check for work */
+
-+ err = pciback_config_quirks_add_field(dev, field);
-+ if (err)
-+ kfree(field);
-+out:
-+ return err;
++ /* Check to see if the driver domain tried to start another request in
++ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
++ */
++ test_and_schedule_op(pdev);
+}
+
-+static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
-+ size_t count)
++irqreturn_t pciback_handle_event(int irq, void *dev_id)
+{
-+ int domain, bus, slot, func;
-+ int err;
-+
-+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
-+ if (err)
-+ goto out;
++ struct pciback_device *pdev = dev_id;
+
-+ err = pcistub_device_id_add(domain, bus, slot, func);
++ test_and_schedule_op(pdev);
+
-+out:
-+ if (!err)
-+ err = count;
-+ return err;
++ return IRQ_HANDLED;
+}
+diff --git a/drivers/xen/pciback/slot.c b/drivers/xen/pciback/slot.c
+new file mode 100644
+index 0000000..efb922d
+--- /dev/null
++++ b/drivers/xen/pciback/slot.c
+@@ -0,0 +1,194 @@
++/*
++ * PCI Backend - Provides a Virtual PCI bus (with real devices)
++ * to the frontend
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil> (vpci.c)
++ * Author: Tristan Gingold <tristan.gingold at bull.net>, from vpci.c
++ */
+
-+DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
-+
-+static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
-+ size_t count)
-+{
-+ int domain, bus, slot, func;
-+ int err;
-+
-+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
-+ if (err)
-+ goto out;
++#include <linux/list.h>
++#include <linux/slab.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
+
-+ err = pcistub_device_id_remove(domain, bus, slot, func);
++/* There are at most 32 slots in a pci bus. */
++#define PCI_SLOT_MAX 32
+
-+out:
-+ if (!err)
-+ err = count;
-+ return err;
-+}
++#define PCI_BUS_NBR 2
+
-+DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
++struct slot_dev_data {
++ /* Access to the slots array must be protected by lock */
++ struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
++ spinlock_t lock;
++};
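++/* This gives a virtual topology of PCI_BUS_NBR x PCI_SLOT_MAX (2 x 32)
++ * single-function slots: pciback_get_pci_dev() below accepts only
++ * function 0, and pciback_add_pci_dev() exports PCI_DEVFN(slot, 0). */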
+
-+static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
+{
-+ struct pcistub_device_id *pci_dev_id;
-+ size_t count = 0;
++ struct pci_dev *dev = NULL;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
-+ spin_lock_irqsave(&device_ids_lock, flags);
-+ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
-+ if (count >= PAGE_SIZE)
-+ break;
-+
-+ count += scnprintf(buf + count, PAGE_SIZE - count,
-+ "%04x:%02x:%02x.%01x\n",
-+ pci_dev_id->domain, pci_dev_id->bus,
-+ PCI_SLOT(pci_dev_id->devfn),
-+ PCI_FUNC(pci_dev_id->devfn));
-+ }
-+ spin_unlock_irqrestore(&device_ids_lock, flags);
-+
-+ return count;
-+}
-+
-+DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
-+
-+static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
-+ size_t count)
-+{
-+ int domain, bus, slot, func, reg, size, mask;
-+ int err;
++ if (domain != 0 || PCI_FUNC(devfn) != 0)
++ return NULL;
+
-+ err = str_to_quirk(buf, &domain, &bus, &slot, &func, ®, &size,
-+ &mask);
-+ if (err)
-+ goto out;
++ if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
++ return NULL;
+
-+ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
++ spin_lock_irqsave(&slot_dev->lock, flags);
++ dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
-+out:
-+ if (!err)
-+ err = count;
-+ return err;
++ return dev;
+}
+
-+static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
+{
-+ int count = 0;
++ int err = 0, slot, bus;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ unsigned long flags;
-+ struct pciback_config_quirk *quirk;
-+ struct pciback_dev_data *dev_data;
-+ const struct config_field *field;
-+ const struct config_field_entry *cfg_entry;
-+
-+ spin_lock_irqsave(&device_ids_lock, flags);
-+ list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
-+ if (count >= PAGE_SIZE)
-+ goto out;
-+
-+ count += scnprintf(buf + count, PAGE_SIZE - count,
-+ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
-+ quirk->pdev->bus->number,
-+ PCI_SLOT(quirk->pdev->devfn),
-+ PCI_FUNC(quirk->pdev->devfn),
-+ quirk->devid.vendor, quirk->devid.device,
-+ quirk->devid.subvendor,
-+ quirk->devid.subdevice);
+
-+ dev_data = pci_get_drvdata(quirk->pdev);
++ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
++ err = -EFAULT;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Can't export bridges on the virtual PCI bus");
++ goto out;
++ }
+
-+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
-+ field = cfg_entry->field;
-+ if (count >= PAGE_SIZE)
-+ goto out;
++ spin_lock_irqsave(&slot_dev->lock, flags);
+
-+ count += scnprintf(buf + count, PAGE_SIZE - count,
-+ "\t\t%08x:%01x:%08x\n",
-+ cfg_entry->base_offset +
-+ field->offset, field->size,
-+ field->mask);
++ /* Assign to a new slot on the virtual PCI bus */
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (slot_dev->slots[bus][slot] == NULL) {
++ printk(KERN_INFO
++ "pciback: slot: %s: assign to virtual "
++ "slot %d, bus %d\n",
++ pci_name(dev), slot, bus);
++ slot_dev->slots[bus][slot] = dev;
++ goto unlock;
++ }
+ }
-+ }
+
-+out:
-+ spin_unlock_irqrestore(&device_ids_lock, flags);
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "No more space on root virtual PCI bus");
+
-+ return count;
-+}
++unlock:
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
-+DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
++ /* Publish this device. */
++ if (!err)
++ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, 0), devid);
+
-+static ssize_t permissive_add(struct device_driver *drv, const char *buf,
-+ size_t count)
-+{
-+ int domain, bus, slot, func;
-+ int err;
-+ struct pcistub_device *psdev;
-+ struct pciback_dev_data *dev_data;
-+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
-+ if (err)
-+ goto out;
-+ psdev = pcistub_device_find(domain, bus, slot, func);
-+ if (!psdev) {
-+ err = -ENODEV;
-+ goto out;
-+ }
-+ if (!psdev->dev) {
-+ err = -ENODEV;
-+ goto release;
-+ }
-+ dev_data = pci_get_drvdata(psdev->dev);
-+ /* the driver data for a device should never be null at this point */
-+ if (!dev_data) {
-+ err = -ENXIO;
-+ goto release;
-+ }
-+ if (!dev_data->permissive) {
-+ dev_data->permissive = 1;
-+ /* Let user know that what they're doing could be unsafe */
-+ dev_warn(&psdev->dev->dev, "enabling permissive mode "
-+ "configuration space accesses!\n");
-+ dev_warn(&psdev->dev->dev,
-+ "permissive mode is potentially unsafe!\n");
-+ }
-+release:
-+ pcistub_device_put(psdev);
+out:
-+ if (!err)
-+ err = count;
+ return err;
+}
+
-+static ssize_t permissive_show(struct device_driver *drv, char *buf)
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
-+ struct pcistub_device *psdev;
-+ struct pciback_dev_data *dev_data;
-+ size_t count = 0;
++ int slot, bus;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
-+ spin_lock_irqsave(&pcistub_devices_lock, flags);
-+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
-+ if (count >= PAGE_SIZE)
-+ break;
-+ if (!psdev->dev)
-+ continue;
-+ dev_data = pci_get_drvdata(psdev->dev);
-+ if (!dev_data || !dev_data->permissive)
-+ continue;
-+ count +=
-+ scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
-+ pci_name(psdev->dev));
-+ }
-+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
-+ return count;
-+}
+
-+DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
++ spin_lock_irqsave(&slot_dev->lock, flags);
+
-+static void pcistub_exit(void)
-+{
-+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
-+ driver_remove_file(&pciback_pci_driver.driver,
-+ &driver_attr_remove_slot);
-+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
-+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
-+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (slot_dev->slots[bus][slot] == dev) {
++ slot_dev->slots[bus][slot] = NULL;
++ found_dev = dev;
++ goto out;
++ }
++ }
+
-+ pci_unregister_driver(&pciback_pci_driver);
++out:
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
++
++ if (found_dev)
++ pcistub_put_pci_dev(found_dev);
+}
+
-+static int __init pcistub_init(void)
++int pciback_init_devices(struct pciback_device *pdev)
+{
-+ int pos = 0;
-+ int err = 0;
-+ int domain, bus, slot, func;
-+ int parsed;
++ int slot, bus;
++ struct slot_dev_data *slot_dev;
+
-+ if (pci_devs_to_hide && *pci_devs_to_hide) {
-+ do {
-+ parsed = 0;
++ slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
++ if (!slot_dev)
++ return -ENOMEM;
+
-+ err = sscanf(pci_devs_to_hide + pos,
-+ " (%x:%x:%x.%x) %n",
-+ &domain, &bus, &slot, &func, &parsed);
-+ if (err != 4) {
-+ domain = 0;
-+ err = sscanf(pci_devs_to_hide + pos,
-+ " (%x:%x.%x) %n",
-+ &bus, &slot, &func, &parsed);
-+ if (err != 3)
-+ goto parse_error;
-+ }
++ spin_lock_init(&slot_dev->lock);
+
-+ err = pcistub_device_id_add(domain, bus, slot, func);
-+ if (err)
-+ goto out;
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
++ slot_dev->slots[bus][slot] = NULL;
+
-+ /* if parsed<=0, we've reached the end of the string */
-+ pos += parsed;
-+ } while (parsed > 0 && pci_devs_to_hide[pos]);
-+ }
++ pdev->pci_dev_data = slot_dev;
+
-+ /* If we're the first PCI Device Driver to register, we're the
-+ * first one to get offered PCI devices as they become
-+ * available (and thus we can be the first to grab them)
-+ */
-+ err = pci_register_driver(&pciback_pci_driver);
-+ if (err < 0)
-+ goto out;
++ return 0;
++}
+
-+ err = driver_create_file(&pciback_pci_driver.driver,
-+ &driver_attr_new_slot);
-+ if (!err)
-+ err = driver_create_file(&pciback_pci_driver.driver,
-+ &driver_attr_remove_slot);
-+ if (!err)
-+ err = driver_create_file(&pciback_pci_driver.driver,
-+ &driver_attr_slots);
-+ if (!err)
-+ err = driver_create_file(&pciback_pci_driver.driver,
-+ &driver_attr_quirks);
-+ if (!err)
-+ err = driver_create_file(&pciback_pci_driver.driver,
-+ &driver_attr_permissive);
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_cb)
++{
++ /* The Virtual PCI bus has only one root */
++ return publish_cb(pdev, 0, 0);
++}
+
-+ if (err)
-+ pcistub_exit();
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ int slot, bus;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ struct pci_dev *dev;
+
-+out:
-+ return err;
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ dev = slot_dev->slots[bus][slot];
++ if (dev != NULL)
++ pcistub_put_pci_dev(dev);
++ }
+
-+parse_error:
-+ printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
-+ pci_devs_to_hide + pos);
-+ return -EINVAL;
++ kfree(slot_dev);
++ pdev->pci_dev_data = NULL;
+}
+
-+#ifndef MODULE
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn)
++{
++ int slot, busnr;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ struct pci_dev *dev;
++ int found = 0;
++ unsigned long flags;
++
++ spin_lock_irqsave(&slot_dev->lock, flags);
++
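++ /* Linear scan of every virtual (bus, slot) for the matching device. */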
++ for (busnr = 0; busnr < PCI_BUS_NBR; busnr++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ dev = slot_dev->slots[busnr][slot];
++ if (dev && dev->bus->number == pcidev->bus->number
++ && dev->devfn == pcidev->devfn
++ && pci_domain_nr(dev->bus) ==
++ pci_domain_nr(pcidev->bus)) {
++ found = 1;
++ *domain = 0;
++ *bus = busnr;
++ *devfn = PCI_DEVFN(slot, 0);
++ goto out;
++ }
++ }
++out:
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
++ return found;
++}
+diff --git a/drivers/xen/pciback/vpci.c b/drivers/xen/pciback/vpci.c
+new file mode 100644
+index 0000000..2857ab8
+--- /dev/null
++++ b/drivers/xen/pciback/vpci.c
+@@ -0,0 +1,244 @@
+/*
-+ * fs_initcall happens before device_initcall
-+ * so pciback *should* get called first (b/c we
-+ * want to suck up any device before other drivers
-+ * get a chance by being the first pci device
-+ * driver to register)
++ * PCI Backend - Provides a Virtual PCI bus (with real devices)
++ * to the frontend
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
+ */
-+fs_initcall(pcistub_init);
-+#endif
+
-+static int __init pciback_init(void)
++#include <linux/list.h>
++#include <linux/slab.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
++
++#define PCI_SLOT_MAX 32
++
++struct vpci_dev_data {
++ /* Access to dev_list must be protected by lock */
++ struct list_head dev_list[PCI_SLOT_MAX];
++ spinlock_t lock;
++};
++
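++/* Return the first entry of a list; callers check list_empty() beforehand. */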
++static inline struct list_head *list_first(struct list_head *head)
+{
-+ int err;
++ return head->next;
++}
+
-+ if (!xen_initial_domain())
-+ return -ENODEV;
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
++{
++ struct pci_dev_entry *entry;
++ struct pci_dev *dev = NULL;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ unsigned long flags;
+
-+ err = pciback_config_init();
-+ if (err)
-+ return err;
++ if (domain != 0 || bus != 0)
++ return NULL;
+
-+#ifdef MODULE
-+ err = pcistub_init();
-+ if (err < 0)
-+ return err;
-+#endif
++ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
++ spin_lock_irqsave(&vpci_dev->lock, flags);
+
-+ pcistub_init_devices_late();
-+ err = pciback_xenbus_register();
-+ if (err)
-+ pcistub_exit();
++ list_for_each_entry(entry,
++ &vpci_dev->dev_list[PCI_SLOT(devfn)],
++ list) {
++ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
++ dev = entry->dev;
++ break;
++ }
++ }
+
-+ return err;
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++ }
++ return dev;
+}
+
-+static void __exit pciback_cleanup(void)
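++/* Two devices match a slot when they are functions of the same physical
++ * device: same domain, same bus, same slot number.
++ */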
++static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
+{
-+ pciback_xenbus_unregister();
-+ pcistub_exit();
++ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
++ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
++ return 1;
++
++ return 0;
+}
+
-+module_init(pciback_init);
-+module_exit(pciback_cleanup);
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
++{
++ int err = 0, slot, func = -1;
++ struct pci_dev_entry *t, *dev_entry;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ unsigned long flags;
+
-+MODULE_LICENSE("Dual BSD/GPL");
-diff --git a/drivers/xen/pciback/pciback.h b/drivers/xen/pciback/pciback.h
-new file mode 100644
-index 0000000..98e2912
---- /dev/null
-+++ b/drivers/xen/pciback/pciback.h
-@@ -0,0 +1,133 @@
-+/*
-+ * PCI Backend Common Data Structures & Function Declarations
-+ *
-+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
-+ */
-+#ifndef __XEN_PCIBACK_H__
-+#define __XEN_PCIBACK_H__
++ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
++ err = -EFAULT;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Can't export bridges on the virtual PCI bus");
++ goto out;
++ }
+
-+#include <linux/pci.h>
-+#include <linux/interrupt.h>
-+#include <xen/xenbus.h>
-+#include <linux/list.h>
-+#include <linux/spinlock.h>
-+#include <linux/workqueue.h>
-+#include <asm/atomic.h>
-+#include <xen/interface/io/pciif.h>
++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
++ if (!dev_entry) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error adding entry to virtual PCI bus");
++ goto out;
++ }
+
-+struct pci_dev_entry {
-+ struct list_head list;
-+ struct pci_dev *dev;
-+};
++ dev_entry->dev = dev;
+
-+#define _PDEVF_op_active (0)
-+#define PDEVF_op_active (1<<(_PDEVF_op_active))
-+#define _PCIB_op_pending (1)
-+#define PCIB_op_pending (1<<(_PCIB_op_pending))
++ spin_lock_irqsave(&vpci_dev->lock, flags);
+
-+struct pciback_device {
-+ void *pci_dev_data;
-+ spinlock_t dev_lock;
++ /* Keep multi-function devices together on the virtual PCI bus */
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (!list_empty(&vpci_dev->dev_list[slot])) {
++ t = list_entry(list_first(&vpci_dev->dev_list[slot]),
++ struct pci_dev_entry, list);
+
-+ struct xenbus_device *xdev;
++ if (match_slot(dev, t->dev)) {
++ pr_info("pciback: vpci: %s: "
++ "assign to virtual slot %d func %d\n",
++ pci_name(dev), slot,
++ PCI_FUNC(dev->devfn));
++ list_add_tail(&dev_entry->list,
++ &vpci_dev->dev_list[slot]);
++ func = PCI_FUNC(dev->devfn);
++ goto unlock;
++ }
++ }
++ }
+
-+ struct xenbus_watch be_watch;
-+ u8 be_watching;
++ /* Assign to a new slot on the virtual PCI bus */
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (list_empty(&vpci_dev->dev_list[slot])) {
++ printk(KERN_INFO
++ "pciback: vpci: %s: assign to virtual slot %d\n",
++ pci_name(dev), slot);
++ list_add_tail(&dev_entry->list,
++ &vpci_dev->dev_list[slot]);
++ func = PCI_FUNC(dev->devfn);
++ goto unlock;
++ }
++ }
+
-+ int evtchn_irq;
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "No more space on root virtual PCI bus");
+
-+ struct xen_pci_sharedinfo *sh_info;
++unlock:
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
-+ unsigned long flags;
++ /* Publish this device. */
++ if (!err)
++ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
+
-+ struct work_struct op_work;
-+};
++out:
++ return err;
++}
+
-+struct pciback_dev_data {
-+ struct list_head config_fields;
-+ int permissive;
-+ int warned_on_write;
-+};
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++ int slot;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
+
-+/* Used by XenBus and pciback_ops.c */
-+extern wait_queue_head_t aer_wait_queue;
-+extern struct workqueue_struct *pciback_wq;
-+/* Used by pcistub.c and conf_space_quirks.c */
-+extern struct list_head pciback_quirks;
++ spin_lock_irqsave(&vpci_dev->lock, flags);
+
-+/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
-+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
-+ int domain, int bus,
-+ int slot, int func);
-+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
-+ struct pci_dev *dev);
-+void pcistub_put_pci_dev(struct pci_dev *dev);
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ struct pci_dev_entry *e, *tmp;
++ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
++ list) {
++ if (e->dev == dev) {
++ list_del(&e->list);
++ found_dev = e->dev;
++ kfree(e);
++ goto out;
++ }
++ }
++ }
+
-+/* Ensure a device is turned off or reset */
-+void pciback_reset_device(struct pci_dev *pdev);
++out:
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
-+/* Access a virtual configuration space for a PCI device */
-+int pciback_config_init(void);
-+int pciback_config_init_dev(struct pci_dev *dev);
-+void pciback_config_free_dyn_fields(struct pci_dev *dev);
-+void pciback_config_reset_dev(struct pci_dev *dev);
-+void pciback_config_free_dev(struct pci_dev *dev);
-+int pciback_config_read(struct pci_dev *dev, int offset, int size,
-+ u32 *ret_val);
-+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
++ if (found_dev)
++ pcistub_put_pci_dev(found_dev);
++}
+
-+/* Handle requests for specific devices from the frontend */
-+typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev,
-+ unsigned int domain, unsigned int bus,
-+ unsigned int devfn, unsigned int devid);
-+typedef int (*publish_pci_root_cb) (struct pciback_device *pdev,
-+ unsigned int domain, unsigned int bus);
-+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
-+ int devid, publish_pci_dev_cb publish_cb);
-+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
-+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
-+ unsigned int domain, unsigned int bus,
-+ unsigned int devfn);
++int pciback_init_devices(struct pciback_device *pdev)
++{
++ int slot;
++ struct vpci_dev_data *vpci_dev;
+
-+/**
-+* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in pciback
-+* before sending aer request to pcifront, so that guest could identify
-+* device, coopearte with pciback to finish aer recovery job if device driver
-+* has the capability
-+*/
++ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
++ if (!vpci_dev)
++ return -ENOMEM;
+
-+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
-+ struct pciback_device *pdev,
-+ unsigned int *domain, unsigned int *bus,
-+ unsigned int *devfn);
-+int pciback_init_devices(struct pciback_device *pdev);
-+int pciback_publish_pci_roots(struct pciback_device *pdev,
-+ publish_pci_root_cb cb);
-+void pciback_release_devices(struct pciback_device *pdev);
++ spin_lock_init(&vpci_dev->lock);
+
-+/* Handles events from front-end */
-+irqreturn_t pciback_handle_event(int irq, void *dev_id);
-+void pciback_do_op(struct work_struct *data);
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
++ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
+
-+int pciback_xenbus_register(void);
-+void pciback_xenbus_unregister(void);
++ pdev->pci_dev_data = vpci_dev;
+
-+#ifdef CONFIG_PCI_MSI
-+int pciback_enable_msi(struct pciback_device *pdev,
-+ struct pci_dev *dev, struct xen_pci_op *op);
++ return 0;
++}
+
-+int pciback_disable_msi(struct pciback_device *pdev,
-+ struct pci_dev *dev, struct xen_pci_op *op);
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_cb)
++{
++ /* The Virtual PCI bus has only one root */
++ return publish_cb(pdev, 0, 0);
++}
+
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ int slot;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
-+int pciback_enable_msix(struct pciback_device *pdev,
-+ struct pci_dev *dev, struct xen_pci_op *op);
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ struct pci_dev_entry *e, *tmp;
++ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
++ list) {
++ list_del(&e->list);
++ pcistub_put_pci_dev(e->dev);
++ kfree(e);
++ }
++ }
+
-+int pciback_disable_msix(struct pciback_device *pdev,
-+ struct pci_dev *dev, struct xen_pci_op *op);
-+#endif
-+extern int verbose_request;
++ kfree(vpci_dev);
++ pdev->pci_dev_data = NULL;
++}
+
-+void test_and_schedule_op(struct pciback_device *pdev);
-+#endif
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn)
++{
++ struct pci_dev_entry *entry;
++ struct pci_dev *dev = NULL;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ unsigned long flags;
++ int found = 0, slot;
+
-diff --git a/drivers/xen/pciback/pciback_ops.c b/drivers/xen/pciback/pciback_ops.c
++ spin_lock_irqsave(&vpci_dev->lock, flags);
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ list_for_each_entry(entry,
++ &vpci_dev->dev_list[slot],
++ list) {
++ dev = entry->dev;
++ if (dev && dev->bus->number == pcidev->bus->number
++ && pci_domain_nr(dev->bus) ==
++ pci_domain_nr(pcidev->bus)
++ && dev->devfn == pcidev->devfn) {
++ found = 1;
++ *domain = 0;
++ *bus = 0;
++ *devfn = PCI_DEVFN(slot,
++ PCI_FUNC(pcidev->devfn));
++ }
++ }
++ }
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++ return found;
++}
+diff --git a/drivers/xen/pciback/xenbus.c b/drivers/xen/pciback/xenbus.c
new file mode 100644
-index 0000000..011db67
+index 0000000..d448bf5
--- /dev/null
-+++ b/drivers/xen/pciback/pciback_ops.c
-@@ -0,0 +1,139 @@
++++ b/drivers/xen/pciback/xenbus.c
+@@ -0,0 +1,722 @@
+/*
-+ * PCI Backend Operations - respond to PCI requests from Frontend
++ * PCI Backend Xenbus Setup - handles setup with frontend and xend
+ *
+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
+ */
+#include <linux/module.h>
-+#include <linux/wait.h>
-+#include <linux/bitops.h>
++#include <linux/init.h>
++#include <linux/list.h>
++#include <linux/vmalloc.h>
++#include <linux/workqueue.h>
++#include <xen/xenbus.h>
+#include <xen/events.h>
-+#include <linux/sched.h>
++#include <asm/xen/pci.h>
+#include "pciback.h"
+
-+int verbose_request;
-+module_param(verbose_request, int, 0644);
++#define INVALID_EVTCHN_IRQ (-1)
++struct workqueue_struct *pciback_wq;
+
-+/* Ensure a device is "turned off" and ready to be exported.
-+ * (Also see pciback_config_reset to ensure virtual configuration space is
-+ * ready to be re-exported)
-+ */
-+void pciback_reset_device(struct pci_dev *dev)
++static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
+{
-+ u16 cmd;
-+
-+ /* Disable devices (but not bridges) */
-+ if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
-+#ifdef CONFIG_PCI_MSI
-+ /* The guest could have been abruptly killed without
-+ * disabling MSI/MSI-X interrupts.*/
-+ if (dev->msix_enabled)
-+ pci_disable_msix(dev);
-+ if (dev->msi_enabled)
-+ pci_disable_msi(dev);
-+#endif
-+ pci_disable_device(dev);
-+
-+ pci_write_config_word(dev, PCI_COMMAND, 0);
++ struct pciback_device *pdev;
+
-+ dev->is_busmaster = 0;
-+ } else {
-+ pci_read_config_word(dev, PCI_COMMAND, &cmd);
-+ if (cmd & (PCI_COMMAND_INVALIDATE)) {
-+ cmd &= ~(PCI_COMMAND_INVALIDATE);
-+ pci_write_config_word(dev, PCI_COMMAND, cmd);
++ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
++ if (pdev == NULL)
++ goto out;
++ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
-+ dev->is_busmaster = 0;
-+ }
-+ }
-+}
-+/*
-+* Now the same evtchn is used for both pcifront conf_read_write request
-+* as well as pcie aer front end ack. We use a new work_queue to schedule
-+* pciback conf_read_write service for avoiding confict with aer_core
-+* do_recovery job which also use the system default work_queue
-+*/
-+void test_and_schedule_op(struct pciback_device *pdev)
-+{
-+ /* Check that frontend is requesting an operation and that we are not
-+ * already processing a request */
-+ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
-+ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
-+ queue_work(pciback_wq, &pdev->op_work);
-+ }
-+ /*_XEN_PCIB_active should have been cleared by pcifront. And also make
-+ sure pciback is waiting for ack by checking _PCIB_op_pending*/
-+ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
-+ && test_bit(_PCIB_op_pending, &pdev->flags)) {
-+ wake_up(&aer_wait_queue);
-+ }
-+}
++ pdev->xdev = xdev;
++ dev_set_drvdata(&xdev->dev, pdev);
+
-+/* Performing the configuration space reads/writes must not be done in atomic
-+ * context because some of the pci_* functions can sleep (mostly due to ACPI
-+ * use of semaphores). This function is intended to be called from a work
-+ * queue in process context taking a struct pciback_device as a parameter */
++ spin_lock_init(&pdev->dev_lock);
+
-+void pciback_do_op(struct work_struct *data)
-+{
-+ struct pciback_device *pdev =
-+ container_of(data, struct pciback_device, op_work);
-+ struct pci_dev *dev;
-+ struct xen_pci_op *op = &pdev->sh_info->op;
++ pdev->sh_info = NULL;
++ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
++ pdev->be_watching = 0;
+
-+ dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
++ INIT_WORK(&pdev->op_work, pciback_do_op);
+
-+ if (dev == NULL)
-+ op->err = XEN_PCI_ERR_dev_not_found;
-+ else {
-+ switch (op->cmd) {
-+ case XEN_PCI_OP_conf_read:
-+ op->err = pciback_config_read(dev,
-+ op->offset, op->size, &op->value);
-+ break;
-+ case XEN_PCI_OP_conf_write:
-+ op->err = pciback_config_write(dev,
-+ op->offset, op->size, op->value);
-+ break;
-+#ifdef CONFIG_PCI_MSI
-+ case XEN_PCI_OP_enable_msi:
-+ op->err = pciback_enable_msi(pdev, dev, op);
-+ break;
-+ case XEN_PCI_OP_disable_msi:
-+ op->err = pciback_disable_msi(pdev, dev, op);
-+ break;
-+ case XEN_PCI_OP_enable_msix:
-+ op->err = pciback_enable_msix(pdev, dev, op);
-+ break;
-+ case XEN_PCI_OP_disable_msix:
-+ op->err = pciback_disable_msix(pdev, dev, op);
-+ break;
-+#endif
-+ default:
-+ op->err = XEN_PCI_ERR_not_implemented;
-+ break;
-+ }
++ if (pciback_init_devices(pdev)) {
++ kfree(pdev);
++ pdev = NULL;
+ }
-+ /* Tell the driver domain that we're done. */
-+ wmb();
-+ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
-+ notify_remote_via_irq(pdev->evtchn_irq);
-+
-+ /* Mark that we're done. */
-+ smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
-+ clear_bit(_PDEVF_op_active, &pdev->flags);
-+ smp_mb__after_clear_bit(); /* /before/ final check for work */
-+
-+ /* Check to see if the driver domain tried to start another request in
-+ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
-+ */
-+ test_and_schedule_op(pdev);
++out:
++ return pdev;
+}
+
-+irqreturn_t pciback_handle_event(int irq, void *dev_id)
++static void pciback_disconnect(struct pciback_device *pdev)
+{
-+ struct pciback_device *pdev = dev_id;
-+
-+ test_and_schedule_op(pdev);
-+
-+ return IRQ_HANDLED;
-+}
-diff --git a/drivers/xen/pciback/slot.c b/drivers/xen/pciback/slot.c
-new file mode 100644
-index 0000000..efb922d
---- /dev/null
-+++ b/drivers/xen/pciback/slot.c
-@@ -0,0 +1,191 @@
-+/*
-+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
-+ * to the frontend
-+ *
-+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil> (vpci.c)
-+ * Author: Tristan Gingold <tristan.gingold at bull.net>, from vpci.c
-+ */
++ spin_lock(&pdev->dev_lock);
+
-+#include <linux/list.h>
-+#include <linux/slab.h>
-+#include <linux/pci.h>
-+#include <linux/spinlock.h>
-+#include "pciback.h"
++ /* Ensure the guest can't trigger our handler before removing devices */
++ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
++ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
++ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
++ }
+
-+/* There are at most 32 slots in a pci bus. */
-+#define PCI_SLOT_MAX 32
++ /* If the driver domain started an op, make sure we complete it
++ * before releasing the shared memory */
++ flush_workqueue(pciback_wq);
+
-+#define PCI_BUS_NBR 2
++ if (pdev->sh_info != NULL) {
++ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
++ pdev->sh_info = NULL;
++ }
+
-+struct slot_dev_data {
-+ /* Access to dev_list must be protected by lock */
-+ struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
-+ spinlock_t lock;
-+};
++ spin_unlock(&pdev->dev_lock);
++}
+
-+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
-+ unsigned int domain, unsigned int bus,
-+ unsigned int devfn)
++static void free_pdev(struct pciback_device *pdev)
+{
-+ struct pci_dev *dev = NULL;
-+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
-+ unsigned long flags;
++ if (pdev->be_watching)
++ unregister_xenbus_watch(&pdev->be_watch);
+
-+ if (domain != 0 || PCI_FUNC(devfn) != 0)
-+ return NULL;
++ pciback_disconnect(pdev);
+
-+ if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
-+ return NULL;
++ pciback_release_devices(pdev);
+
-+ spin_lock_irqsave(&slot_dev->lock, flags);
-+ dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
-+ spin_unlock_irqrestore(&slot_dev->lock, flags);
++ dev_set_drvdata(&pdev->xdev->dev, NULL);
++ pdev->xdev = NULL;
+
-+ return dev;
++ kfree(pdev);
+}
+
-+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
-+ int devid, publish_pci_dev_cb publish_cb)
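++/* Map the shared-info page granted by the frontend and bind its event
++ * channel so that requests arrive at pciback_handle_event().
++ */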
++static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
++ int remote_evtchn)
+{
-+ int err = 0, slot, bus;
-+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
-+ unsigned long flags;
++ int err = 0;
++ void *vaddr;
+
-+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
-+ err = -EFAULT;
++ dev_dbg(&pdev->xdev->dev,
++ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
++ gnt_ref, remote_evtchn);
++
++ err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
++ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Can't export bridges on the virtual PCI bus");
++ "Error mapping other domain page in ours.");
+ goto out;
+ }
++ pdev->sh_info = vaddr;
+
-+ spin_lock_irqsave(&slot_dev->lock, flags);
-+
-+ /* Assign to a new slot on the virtual PCI bus */
-+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
-+ if (slot_dev->slots[bus][slot] == NULL) {
-+ printk(KERN_INFO
-+ "pciback: slot: %s: assign to virtual "
-+ "slot %d, bus %d\n",
-+ pci_name(dev), slot, bus);
-+ slot_dev->slots[bus][slot] = dev;
-+ goto unlock;
-+ }
-+ }
-+
-+ err = -ENOMEM;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "No more space on root virtual PCI bus");
-+
-+unlock:
-+ spin_unlock_irqrestore(&slot_dev->lock, flags);
-+
-+ /* Publish this device. */
-+ if (!err)
-+ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, 0), devid);
++ err = bind_interdomain_evtchn_to_irqhandler(
++ pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
++ 0, "pciback", pdev);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error binding event channel to IRQ");
++ goto out;
++ }
++ pdev->evtchn_irq = err;
++ err = 0;
+
++ dev_dbg(&pdev->xdev->dev, "Attached!\n");
+out:
+ return err;
+}
+
-+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++static int pciback_attach(struct pciback_device *pdev)
+{
-+ int slot, bus;
-+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
-+ struct pci_dev *found_dev = NULL;
-+ unsigned long flags;
++ int err = 0;
++ int gnt_ref, remote_evtchn;
++ char *magic = NULL;
+
-+ spin_lock_irqsave(&slot_dev->lock, flags);
++ spin_lock(&pdev->dev_lock);
+
-+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
-+ if (slot_dev->slots[bus][slot] == dev) {
-+ slot_dev->slots[bus][slot] = NULL;
-+ found_dev = dev;
-+ goto out;
-+ }
-+ }
++ /* Make sure we only do this setup once */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateInitialised)
++ goto out;
+
-+out:
-+ spin_unlock_irqrestore(&slot_dev->lock, flags);
++ /* Wait for frontend to state that it has published the configuration */
++ if (xenbus_read_driver_state(pdev->xdev->otherend) !=
++ XenbusStateInitialised)
++ goto out;
+
-+ if (found_dev)
-+ pcistub_put_pci_dev(found_dev);
-+}
++ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
+
-+int pciback_init_devices(struct pciback_device *pdev)
-+{
-+ int slot, bus;
-+ struct slot_dev_data *slot_dev;
++ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
++ "pci-op-ref", "%u", &gnt_ref,
++ "event-channel", "%u", &remote_evtchn,
++ "magic", NULL, &magic, NULL);
++ if (err) {
++ /* If configuration didn't get read correctly, wait longer */
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading configuration from frontend");
++ goto out;
++ }
+
-+ slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
-+ if (!slot_dev)
-+ return -ENOMEM;
++ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
++ xenbus_dev_fatal(pdev->xdev, -EFAULT,
++ "version mismatch (%s/%s) with pcifront - "
++ "halting pciback",
++ magic, XEN_PCI_MAGIC);
++ goto out;
++ }
+
-+ spin_lock_init(&slot_dev->lock);
++ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
++ if (err)
++ goto out;
+
-+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
-+ slot_dev->slots[bus][slot] = NULL;
++ dev_dbg(&pdev->xdev->dev, "Connecting...\n");
+
-+ pdev->pci_dev_data = slot_dev;
++ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
++ if (err)
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching to connected state!");
+
-+ return 0;
-+}
++ dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
++out:
++ spin_unlock(&pdev->dev_lock);
+
-+int pciback_publish_pci_roots(struct pciback_device *pdev,
-+ publish_pci_root_cb publish_cb)
-+{
-+ /* The Virtual PCI bus has only one root */
-+ return publish_cb(pdev, 0, 0);
++ kfree(magic);
++
++ return err;
+}
+
-+void pciback_release_devices(struct pciback_device *pdev)
++static int pciback_publish_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn, unsigned int devid)
+{
-+ int slot, bus;
-+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
-+ struct pci_dev *dev;
++ int err;
++ int len;
++ char str[64];
+
-+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
-+ dev = slot_dev->slots[bus][slot];
-+ if (dev != NULL)
-+ pcistub_put_pci_dev(dev);
-+ }
++ len = snprintf(str, sizeof(str), "vdev-%d", devid);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
+
-+ kfree(slot_dev);
-+ pdev->pci_dev_data = NULL;
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%04x:%02x:%02x.%02x", domain, bus,
++ PCI_SLOT(devfn), PCI_FUNC(devfn));
++
++out:
++ return err;
+}
+
-+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
-+ struct pciback_device *pdev,
-+ unsigned int *domain, unsigned int *bus,
-+ unsigned int *devfn)
++static int pciback_export_device(struct pciback_device *pdev,
++ int domain, int bus, int slot, int func,
++ int devid)
+{
-+ int slot, busnr;
-+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ struct pci_dev *dev;
-+ int found = 0;
-+ unsigned long flags;
++ int err = 0;
+
-+ spin_lock_irqsave(&slot_dev->lock, flags);
++ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
++ domain, bus, slot, func);
+
-+ for (busnr = 0; busnr < PCI_BUS_NBR; bus++)
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
-+ dev = slot_dev->slots[busnr][slot];
-+ if (dev && dev->bus->number == pcidev->bus->number
-+ && dev->devfn == pcidev->devfn
-+ && pci_domain_nr(dev->bus) ==
-+ pci_domain_nr(pcidev->bus)) {
-+ found = 1;
-+ *domain = 0;
-+ *bus = busnr;
-+ *devfn = PCI_DEVFN(slot, 0);
-+ goto out;
-+ }
-+ }
-+out:
-+ spin_unlock_irqrestore(&slot_dev->lock, flags);
-+ return found;
++ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
++ if (!dev) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Couldn't locate PCI device "
++ "(%04x:%02x:%02x.%01x)! "
++ "perhaps already in-use?",
++ domain, bus, slot, func);
++ goto out;
++ }
++
++ err = pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev);
++ if (err)
++ goto out;
++
++ dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
++ if (xen_register_device_domain_owner(dev,
++ pdev->xdev->otherend_id) != 0) {
++ dev_err(&dev->dev, "device has been assigned to another " \
++ "domain! Over-writting the ownership, but beware.\n");
++ xen_unregister_device_domain_owner(dev);
++ xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
++ }
+
++ /* TODO: It'd be nice to export a bridge and have all of its children
++ * get exported with it. This may be best done in xend (which will
++ * have to calculate resource usage anyway) but we probably want to
++ * put something in here to ensure that if a bridge gets given to a
++ * driver domain, that all devices under that bridge are not given
++ * to other driver domains (as he who controls the bridge can disable
++ * it and stop the other devices from working).
++ */
++out:
++ return err;
+}
-diff --git a/drivers/xen/pciback/vpci.c b/drivers/xen/pciback/vpci.c
-new file mode 100644
-index 0000000..2857ab8
---- /dev/null
-+++ b/drivers/xen/pciback/vpci.c
-@@ -0,0 +1,244 @@
-+/*
-+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
-+ * to the frontend
-+ *
-+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
-+ */
+
-+#include <linux/list.h>
-+#include <linux/slab.h>
-+#include <linux/pci.h>
-+#include <linux/spinlock.h>
-+#include "pciback.h"
++static int pciback_remove_device(struct pciback_device *pdev,
++ int domain, int bus, int slot, int func)
++{
++ int err = 0;
++ struct pci_dev *dev;
+
-+#define PCI_SLOT_MAX 32
++ dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
++ domain, bus, slot, func);
+
-+struct vpci_dev_data {
-+ /* Access to dev_list must be protected by lock */
-+ struct list_head dev_list[PCI_SLOT_MAX];
-+ spinlock_t lock;
-+};
++ dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
++ if (!dev) {
++ err = -EINVAL;
++ dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
++ "(%04x:%02x:%02x.%01x)! not owned by this domain\n",
++ domain, bus, slot, func);
++ goto out;
++ }
+
-+static inline struct list_head *list_first(struct list_head *head)
-+{
-+ return head->next;
++ dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);
++ xen_unregister_device_domain_owner(dev);
++
++ pciback_release_pci_dev(pdev, dev);
++
++out:
++ return err;
+}
+
-+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
-+ unsigned int domain, unsigned int bus,
-+ unsigned int devfn)
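++/* Record a (domain, bus) pair under "root-N" in this backend's xenstore
++ * directory, bumping "root_num", unless it was already published.
++ */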
++static int pciback_publish_pci_root(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus)
+{
-+ struct pci_dev_entry *entry;
-+ struct pci_dev *dev = NULL;
-+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
-+ unsigned long flags;
++ unsigned int d, b;
++ int i, root_num, len, err;
++ char str[64];
+
-+ if (domain != 0 || bus != 0)
-+ return NULL;
++ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
+
-+ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
-+ spin_lock_irqsave(&vpci_dev->lock, flags);
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ "root_num", "%d", &root_num);
++ if (err == 0 || err == -ENOENT)
++ root_num = 0;
++ else if (err < 0)
++ goto out;
+
-+ list_for_each_entry(entry,
-+ &vpci_dev->dev_list[PCI_SLOT(devfn)],
-+ list) {
-+ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
-+ dev = entry->dev;
-+ break;
-+ }
++ /* Verify that we haven't already published this pci root */
++ for (i = 0; i < root_num; i++) {
++ len = snprintf(str, sizeof(str), "root-%d", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
+ }
+
-+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ str, "%x:%x", &d, &b);
++ if (err < 0)
++ goto out;
++ if (err != 2) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ if (d == domain && b == bus) {
++ err = 0;
++ goto out;
++ }
+ }
-+ return dev;
-+}
+
-+static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
-+{
-+ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
-+ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
-+ return 1;
++ len = snprintf(str, sizeof(str), "root-%d", root_num);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
+
-+ return 0;
++ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
++ root_num, domain, bus);
++
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%04x:%02x", domain, bus);
++ if (err)
++ goto out;
++
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
++ "root_num", "%d", (root_num + 1));
++
++out:
++ return err;
+}
+
-+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
-+ int devid, publish_pci_dev_cb publish_cb)
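++/* Walk the per-device "state-N"/"dev-N" xenstore entries and export or
++ * detach devices according to each device's substate.
++ */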
++static int pciback_reconfigure(struct pciback_device *pdev)
+{
-+ int err = 0, slot, func = -1;
-+ struct pci_dev_entry *t, *dev_entry;
-+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
-+ unsigned long flags;
++ int err = 0;
++ int num_devs;
++ int domain, bus, slot, func;
++ int substate;
++ int i, len;
++ char state_str[64];
++ char dev_str[64];
+
-+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
-+ err = -EFAULT;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Can't export bridges on the virtual PCI bus");
++ spin_lock(&pdev->dev_lock);
++
++ dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
++
++ /* Make sure we only reconfigure once */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateReconfiguring)
+ goto out;
-+ }
+
-+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
-+ if (!dev_entry) {
-+ err = -ENOMEM;
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
++ &num_devs);
++ if (err != 1) {
++ if (err >= 0)
++ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error adding entry to virtual PCI bus");
++ "Error reading number of devices");
+ goto out;
+ }
+
-+ dev_entry->dev = dev;
-+
-+ spin_lock_irqsave(&vpci_dev->lock, flags);
++ for (i = 0; i < num_devs; i++) {
++ len = snprintf(state_str, sizeof(state_str), "state-%d", i);
++ if (unlikely(len >= (sizeof(state_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while reading "
++ "configuration");
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
++ "%d", &substate);
++ if (err != 1)
++ substate = XenbusStateUnknown;
+
-+ /* Keep multi-function devices together on the virtual PCI bus */
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
-+ if (!list_empty(&vpci_dev->dev_list[slot])) {
-+ t = list_entry(list_first(&vpci_dev->dev_list[slot]),
-+ struct pci_dev_entry, list);
++ switch (substate) {
++ case XenbusStateInitialising:
++ dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
+
-+ if (match_slot(dev, t->dev)) {
-+ pr_info("pciback: vpci: %s: "
-+ "assign to virtual slot %d func %d\n",
-+ pci_name(dev), slot,
-+ PCI_FUNC(dev->devfn));
-+ list_add_tail(&dev_entry->list,
-+ &vpci_dev->dev_list[slot]);
-+ func = PCI_FUNC(dev->devfn);
-+ goto unlock;
++ len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
++ if (unlikely(len >= (sizeof(dev_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while "
++ "reading configuration");
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ dev_str, "%x:%x:%x.%x",
++ &domain, &bus, &slot, &func);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading device "
++ "configuration");
++ goto out;
++ }
++ if (err != 4) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error parsing pci device "
++ "configuration");
++ goto out;
+ }
-+ }
-+ }
+
-+ /* Assign to a new slot on the virtual PCI bus */
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
-+ if (list_empty(&vpci_dev->dev_list[slot])) {
-+ printk(KERN_INFO
-+ "pciback: vpci: %s: assign to virtual slot %d\n",
-+ pci_name(dev), slot);
-+ list_add_tail(&dev_entry->list,
-+ &vpci_dev->dev_list[slot]);
-+ func = PCI_FUNC(dev->devfn);
-+ goto unlock;
-+ }
-+ }
++ err = pciback_export_device(pdev, domain, bus, slot,
++ func, i);
++ if (err)
++ goto out;
++
++ /* Publish pci roots. */
++ err = pciback_publish_pci_roots(pdev,
++ pciback_publish_pci_root);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error while publish PCI root"
++ "buses for frontend");
++ goto out;
++ }
+
-+ err = -ENOMEM;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "No more space on root virtual PCI bus");
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
++ state_str, "%d",
++ XenbusStateInitialised);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching substate of "
++ "dev-%d\n", i);
++ goto out;
++ }
++ break;
+
-+unlock:
-+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++ case XenbusStateClosing:
++ dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
+
-+ /* Publish this device. */
-+ if (!err)
-+ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
++ len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
++ if (unlikely(len >= (sizeof(dev_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while "
++ "reading configuration");
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ dev_str, "%x:%x:%x.%x",
++ &domain, &bus, &slot, &func);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading device "
++ "configuration");
++ goto out;
++ }
++ if (err != 4) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error parsing pci device "
++ "configuration");
++ goto out;
++ }
+
-+out:
-+ return err;
-+}
++ err = pciback_remove_device(pdev, domain, bus, slot,
++ func);
++ if (err)
++ goto out;
+
-+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
-+{
-+ int slot;
-+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
-+ struct pci_dev *found_dev = NULL;
-+ unsigned long flags;
++ /* TODO: If at some point we implement support for pci
++ * root hot-remove on pcifront side, we'll need to
++ * remove unnecessary xenstore nodes of pci roots here.
++ */
+
-+ spin_lock_irqsave(&vpci_dev->lock, flags);
++ break;
+
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
-+ struct pci_dev_entry *e, *tmp;
-+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
-+ list) {
-+ if (e->dev == dev) {
-+ list_del(&e->list);
-+ found_dev = e->dev;
-+ kfree(e);
-+ goto out;
-+ }
++ default:
++ break;
+ }
+ }
+
++ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching to reconfigured state!");
++ goto out;
++ }
++
+out:
-+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++ spin_unlock(&pdev->dev_lock);
+
-+ if (found_dev)
-+ pcistub_put_pci_dev(found_dev);
++ return 0;
+}
+
-+int pciback_init_devices(struct pciback_device *pdev)
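++/* React to pcifront state changes: attach on Initialised, reconfigure on
++ * Reconfiguring, and tear down on Closing/Closed.
++ */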
++static void pciback_frontend_changed(struct xenbus_device *xdev,
++ enum xenbus_state fe_state)
+{
-+ int slot;
-+ struct vpci_dev_data *vpci_dev;
-+
-+ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
-+ if (!vpci_dev)
-+ return -ENOMEM;
++ struct pciback_device *pdev = dev_get_drvdata(&xdev->dev);
+
-+ spin_lock_init(&vpci_dev->lock);
++ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
+
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
-+ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
++ switch (fe_state) {
++ case XenbusStateInitialised:
++ pciback_attach(pdev);
++ break;
+
-+ pdev->pci_dev_data = vpci_dev;
++ case XenbusStateReconfiguring:
++ pciback_reconfigure(pdev);
++ break;
+
-+ return 0;
-+}
++ case XenbusStateConnected:
++ /* pcifront switched its state from Reconfiguring to Connected;
++ * mirror the transition here.
++ */
++ xenbus_switch_state(xdev, XenbusStateConnected);
++ break;
+
-+int pciback_publish_pci_roots(struct pciback_device *pdev,
-+ publish_pci_root_cb publish_cb)
-+{
-+ /* The Virtual PCI bus has only one root */
-+ return publish_cb(pdev, 0, 0);
-+}
++ case XenbusStateClosing:
++ pciback_disconnect(pdev);
++ xenbus_switch_state(xdev, XenbusStateClosing);
++ break;
+
-+void pciback_release_devices(struct pciback_device *pdev)
-+{
-+ int slot;
-+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ case XenbusStateClosed:
++ pciback_disconnect(pdev);
++ xenbus_switch_state(xdev, XenbusStateClosed);
++ if (xenbus_dev_is_online(xdev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
++ device_unregister(&xdev->dev);
++ break;
+
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
-+ struct pci_dev_entry *e, *tmp;
-+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
-+ list) {
-+ list_del(&e->list);
-+ pcistub_put_pci_dev(e->dev);
-+ kfree(e);
-+ }
++ default:
++ break;
+ }
-+
-+ kfree(vpci_dev);
-+ pdev->pci_dev_data = NULL;
+}
+
-+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
-+ struct pciback_device *pdev,
-+ unsigned int *domain, unsigned int *bus,
-+ unsigned int *devfn)
++static int pciback_setup_backend(struct pciback_device *pdev)
+{
-+ struct pci_dev_entry *entry;
-+ struct pci_dev *dev = NULL;
-+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
-+ unsigned long flags;
-+ int found = 0, slot;
++ /* Get configuration from xend (if available now) */
++ int domain, bus, slot, func;
++ int err = 0;
++ int i, num_devs;
++ char dev_str[64];
++ char state_str[64];
+
-+ spin_lock_irqsave(&vpci_dev->lock, flags);
-+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
-+ list_for_each_entry(entry,
-+ &vpci_dev->dev_list[slot],
-+ list) {
-+ dev = entry->dev;
-+ if (dev && dev->bus->number == pcidev->bus->number
-+ && pci_domain_nr(dev->bus) ==
-+ pci_domain_nr(pcidev->bus)
-+ && dev->devfn == pcidev->devfn) {
-+ found = 1;
-+ *domain = 0;
-+ *bus = 0;
-+ *devfn = PCI_DEVFN(slot,
-+ PCI_FUNC(pcidev->devfn));
-+ }
-+ }
-+ }
-+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
-+ return found;
-+}
-diff --git a/drivers/xen/pciback/xenbus.c b/drivers/xen/pciback/xenbus.c
-new file mode 100644
-index 0000000..d448bf5
---- /dev/null
-+++ b/drivers/xen/pciback/xenbus.c
-@@ -0,0 +1,722 @@
-+/*
-+ * PCI Backend Xenbus Setup - handles setup with frontend and xend
-+ *
-+ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
-+ */
-+#include <linux/module.h>
-+#include <linux/init.h>
-+#include <linux/list.h>
-+#include <linux/vmalloc.h>
-+#include <linux/workqueue.h>
-+#include <xen/xenbus.h>
-+#include <xen/events.h>
-+#include <asm/xen/pci.h>
-+#include <linux/workqueue.h>
-+#include "pciback.h"
++ spin_lock(&pdev->dev_lock);
+
-+#define INVALID_EVTCHN_IRQ (-1)
-+struct workqueue_struct *pciback_wq;
++ /* It's possible we could get the call to setup twice, so make sure
++ * we're not already connected.
++ */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateInitWait)
++ goto out;
+
-+static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
-+{
-+ struct pciback_device *pdev;
++ dev_dbg(&pdev->xdev->dev, "getting be setup\n");
+
-+ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
-+ if (pdev == NULL)
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
++ &num_devs);
++ if (err != 1) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of devices");
+ goto out;
-+ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
-+
-+ pdev->xdev = xdev;
-+ dev_set_drvdata(&xdev->dev, pdev);
++ }
+
-+ spin_lock_init(&pdev->dev_lock);
++ for (i = 0; i < num_devs; i++) {
++ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
++ if (unlikely(l >= (sizeof(dev_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while reading "
++ "configuration");
++ goto out;
++ }
+
-+ pdev->sh_info = NULL;
-+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
-+ pdev->be_watching = 0;
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
++ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading device configuration");
++ goto out;
++ }
++ if (err != 4) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error parsing pci device "
++ "configuration");
++ goto out;
++ }
+
-+ INIT_WORK(&pdev->op_work, pciback_do_op);
++ err = pciback_export_device(pdev, domain, bus, slot, func, i);
++ if (err)
++ goto out;
+
-+ if (pciback_init_devices(pdev)) {
-+ kfree(pdev);
-+ pdev = NULL;
++ /* Switch substate of this device. */
++ l = snprintf(state_str, sizeof(state_str), "state-%d", i);
++ if (unlikely(l >= (sizeof(state_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while reading "
++ "configuration");
++ goto out;
++ }
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
++ "%d", XenbusStateInitialised);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err, "Error switching "
++ "substate of dev-%d\n", i);
++ goto out;
++ }
+ }
-+out:
-+ return pdev;
-+}
-+
-+static void pciback_disconnect(struct pciback_device *pdev)
-+{
-+ spin_lock(&pdev->dev_lock);
+
-+ /* Ensure the guest can't trigger our handler before removing devices */
-+ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
-+ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
-+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
++ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error while publish PCI root buses "
++ "for frontend");
++ goto out;
+ }
+
-+ /* If the driver domain started an op, make sure we complete it
-+ * before releasing the shared memory */
-+ flush_workqueue(pciback_wq);
-+
-+ if (pdev->sh_info != NULL) {
-+ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
-+ pdev->sh_info = NULL;
-+ }
++ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
++ if (err)
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching to initialised state!");
+
++out:
+ spin_unlock(&pdev->dev_lock);
-+}
+
-+static void free_pdev(struct pciback_device *pdev)
-+{
-+ if (pdev->be_watching)
-+ unregister_xenbus_watch(&pdev->be_watch);
++ if (!err)
++ /* see if pcifront is already configured (if not, we'll wait) */
++ pciback_attach(pdev);
+
-+ pciback_disconnect(pdev);
++ return err;
++}
+
-+ pciback_release_devices(pdev);
++static void pciback_be_watch(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ struct pciback_device *pdev =
++ container_of(watch, struct pciback_device, be_watch);
+
-+ dev_set_drvdata(&pdev->xdev->dev, NULL);
-+ pdev->xdev = NULL;
++ switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
++ case XenbusStateInitWait:
++ pciback_setup_backend(pdev);
++ break;
+
-+ kfree(pdev);
++ default:
++ break;
++ }
+}
+
-+static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
-+ int remote_evtchn)
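++/* Called when xenbus announces a new "pci" backend device for us to manage. */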
++static int pciback_xenbus_probe(struct xenbus_device *dev,
++ const struct xenbus_device_id *id)
+{
+ int err = 0;
-+ void *vaddr;
-+
-+ dev_dbg(&pdev->xdev->dev,
-+ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
-+ gnt_ref, remote_evtchn);
++ struct pciback_device *pdev = alloc_pdev(dev);
+
-+ err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
-+ if (err < 0) {
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error mapping other domain page in ours.");
++ if (pdev == NULL) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(dev, err,
++ "Error allocating pciback_device struct");
+ goto out;
+ }
-+ pdev->sh_info = vaddr;
+
-+ err = bind_interdomain_evtchn_to_irqhandler(
-+ pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
-+ 0, "pciback", pdev);
-+ if (err < 0) {
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error binding event channel to IRQ");
++ /* wait for xend to configure us */
++ err = xenbus_switch_state(dev, XenbusStateInitWait);
++ if (err)
+ goto out;
-+ }
-+ pdev->evtchn_irq = err;
-+ err = 0;
+
-+ dev_dbg(&pdev->xdev->dev, "Attached!\n");
++ /* watch the backend node for backend configuration information */
++ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
++ pciback_be_watch);
++ if (err)
++ goto out;
++ pdev->be_watching = 1;
++
++ /* We need to force a call to our callback here in case
++ * xend already configured us!
++ */
++ pciback_be_watch(&pdev->be_watch, NULL, 0);
++
+out:
+ return err;
+}
+
-+static int pciback_attach(struct pciback_device *pdev)
++static int pciback_xenbus_remove(struct xenbus_device *dev)
+{
-+ int err = 0;
-+ int gnt_ref, remote_evtchn;
-+ char *magic = NULL;
++ struct pciback_device *pdev = dev_get_drvdata(&dev->dev);
+
-+ spin_lock(&pdev->dev_lock);
++ if (pdev != NULL)
++ free_pdev(pdev);
+
-+ /* Make sure we only do this setup once */
-+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
-+ XenbusStateInitialised)
-+ goto out;
++ return 0;
++}
+
-+ /* Wait for frontend to state that it has published the configuration */
-+ if (xenbus_read_driver_state(pdev->xdev->otherend) !=
-+ XenbusStateInitialised)
-+ goto out;
++static const struct xenbus_device_id xenpci_ids[] = {
++ {"pci"},
++ {""},
++};
+
-+ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
++static struct xenbus_driver xenbus_pciback_driver = {
++ .name = "pciback",
++ .owner = THIS_MODULE,
++ .ids = xenpci_ids,
++ .probe = pciback_xenbus_probe,
++ .remove = pciback_xenbus_remove,
++ .otherend_changed = pciback_frontend_changed,
++};
+
-+ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
-+ "pci-op-ref", "%u", &gnt_ref,
-+ "event-channel", "%u", &remote_evtchn,
-+ "magic", NULL, &magic, NULL);
-+ if (err) {
-+ /* If configuration didn't get read correctly, wait longer */
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error reading configuration from frontend");
-+ goto out;
++int __init pciback_xenbus_register(void)
++{
++ pciback_wq = create_workqueue("pciback_workqueue");
++ if (!pciback_wq) {
++ printk(KERN_ERR "pciback_xenbus_register: create"
++ "pciback_workqueue failed\n");
++ return -EFAULT;
+ }
++ return xenbus_register_backend(&xenbus_pciback_driver);
++}
+
-+ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
-+ xenbus_dev_fatal(pdev->xdev, -EFAULT,
-+ "version mismatch (%s/%s) with pcifront - "
-+ "halting pciback",
-+ magic, XEN_PCI_MAGIC);
-+ goto out;
-+ }
++void __exit pciback_xenbus_unregister(void)
++{
++ destroy_workqueue(pciback_wq);
++ xenbus_unregister_driver(&xenbus_pciback_driver);
++}
+diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c
+new file mode 100644
+index 0000000..6450c17
+--- /dev/null
++++ b/drivers/xen/pcpu.c
+@@ -0,0 +1,420 @@
++/*
++ * pcpu.c - management of physical CPUs in the dom0 environment
++ */
++#include <linux/interrupt.h>
++#include <linux/spinlock.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include <linux/cpu.h>
++#include <xen/xenbus.h>
++#include <xen/pcpu.h>
++#include <xen/events.h>
++#include <xen/acpi.h>
+
-+ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
-+ if (err)
-+ goto out;
++static struct sysdev_class xen_pcpu_sysdev_class = {
++ .name = "xen_pcpu",
++};
+
-+ dev_dbg(&pdev->xdev->dev, "Connecting...\n");
++static DEFINE_MUTEX(xen_pcpu_lock);
++static RAW_NOTIFIER_HEAD(xen_pcpu_chain);
+
-+ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
-+ if (err)
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error switching to connected state!");
++/* No need to disable irqs since hotplug notification runs in workqueue context */
++#define get_pcpu_lock() mutex_lock(&xen_pcpu_lock);
++#define put_pcpu_lock() mutex_unlock(&xen_pcpu_lock);
+
-+ dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
-+out:
-+ spin_unlock(&pdev->dev_lock);
++struct xen_pcpus {
++ struct list_head list;
++ int present;
++};
++static struct xen_pcpus xen_pcpus;
+
-+ kfree(magic);
++int register_xen_pcpu_notifier(struct notifier_block *nb)
++{
++ int ret;
+
-+ return err;
++ /* All access to the notifier chain is protected by the pcpu_lock */
++ get_pcpu_lock();
++ ret = raw_notifier_chain_register(&xen_pcpu_chain, nb);
++ put_pcpu_lock();
++ return ret;
+}
++EXPORT_SYMBOL_GPL(register_xen_pcpu_notifier);
+
-+static int pciback_publish_pci_dev(struct pciback_device *pdev,
-+ unsigned int domain, unsigned int bus,
-+ unsigned int devfn, unsigned int devid)
++void unregister_xen_pcpu_notifier(struct notifier_block *nb)
+{
-+ int err;
-+ int len;
-+ char str[64];
++ get_pcpu_lock();
++ raw_notifier_chain_unregister(&xen_pcpu_chain, nb);
++ put_pcpu_lock();
++}
++EXPORT_SYMBOL_GPL(unregister_xen_pcpu_notifier);
+
-+ len = snprintf(str, sizeof(str), "vdev-%d", devid);
-+ if (unlikely(len >= (sizeof(str) - 1))) {
-+ err = -ENOMEM;
-+ goto out;
-+ }
++static int xen_pcpu_down(uint32_t xen_id)
++{
++ int ret;
++ xen_platform_op_t op = {
++ .cmd = XENPF_cpu_offline,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ .u.cpu_ol.cpuid = xen_id,
++ };
+
-+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
-+ "%04x:%02x:%02x.%02x", domain, bus,
-+ PCI_SLOT(devfn), PCI_FUNC(devfn));
++ ret = HYPERVISOR_dom0_op(&op);
++ return ret;
++}
+
-+out:
-+ return err;
++static int xen_pcpu_up(uint32_t xen_id)
++{
++ int ret;
++ xen_platform_op_t op = {
++ .cmd = XENPF_cpu_online,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ .u.cpu_ol.cpuid = xen_id,
++ };
++
++ ret = HYPERVISOR_dom0_op(&op);
++ return ret;
++}
++
++static ssize_t show_online(struct sys_device *dev,
++ struct sysdev_attribute *attr,
++ char *buf)
++{
++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
++
++ return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE));
+}
+
-+static int pciback_export_device(struct pciback_device *pdev,
-+ int domain, int bus, int slot, int func,
-+ int devid)
++static ssize_t __ref store_online(struct sys_device *dev,
++ struct sysdev_attribute *attr,
++ const char *buf, size_t count)
+{
-+ struct pci_dev *dev;
-+ int err = 0;
-+
-+ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
-+ domain, bus, slot, func);
++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
++ ssize_t ret;
+
-+ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
-+ if (!dev) {
-+ err = -EINVAL;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Couldn't locate PCI device "
-+ "(%04x:%02x:%02x.%01x)! "
-+ "perhaps already in-use?",
-+ domain, bus, slot, func);
-+ goto out;
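++ /* '0' requests a xen cpu offline, '1' a cpu online */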
++ switch (buf[0]) {
++ case '0':
++ ret = xen_pcpu_down(cpu->xen_id);
++ break;
++ case '1':
++ ret = xen_pcpu_up(cpu->xen_id);
++ break;
++ default:
++ ret = -EINVAL;
+ }
+
-+ err = pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev);
-+ if (err)
-+ goto out;
++ if (ret >= 0)
++ ret = count;
++ return ret;
++}
+
-+ dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
-+ if (xen_register_device_domain_owner(dev,
-+ pdev->xdev->otherend_id) != 0) {
-+ dev_err(&dev->dev, "device has been assigned to another " \
-+ "domain! Over-writting the ownership, but beware.\n");
-+ xen_unregister_device_domain_owner(dev);
-+ xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
-+ }
++static SYSDEV_ATTR(online, 0644, show_online, store_online);
+
-+ /* TODO: It'd be nice to export a bridge and have all of its children
-+ * get exported with it. This may be best done in xend (which will
-+ * have to calculate resource usage anyway) but we probably want to
-+ * put something in here to ensure that if a bridge gets given to a
-+ * driver domain, that all devices under that bridge are not given
-+ * to other driver domains (as he who controls the bridge can disable
-+ * it and stop the other devices from working).
-+ */
-+out:
-+ return err;
++static ssize_t show_apicid(struct sys_device *dev,
++ struct sysdev_attribute *attr,
++ char *buf)
++{
++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
++
++ return sprintf(buf, "%u\n", cpu->apic_id);
+}
+
-+static int pciback_remove_device(struct pciback_device *pdev,
-+ int domain, int bus, int slot, int func)
++static ssize_t show_acpiid(struct sys_device *dev,
++ struct sysdev_attribute *attr,
++ char *buf)
+{
-+ int err = 0;
-+ struct pci_dev *dev;
-+
-+ dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
-+ domain, bus, slot, func);
++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
+
-+ dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
-+ if (!dev) {
-+ err = -EINVAL;
-+ dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
-+ "(%04x:%02x:%02x.%01x)! not owned by this domain\n",
-+ domain, bus, slot, func);
-+ goto out;
-+ }
++ return sprintf(buf, "%u\n", cpu->acpi_id);
++}
++static SYSDEV_ATTR(apic_id, 0444, show_apicid, NULL);
++static SYSDEV_ATTR(acpi_id, 0444, show_acpiid, NULL);
+
-+ dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);
-+ xen_unregister_device_domain_owner(dev);
++static int xen_pcpu_free(struct pcpu *pcpu)
++{
++ if (!pcpu)
++ return 0;
+
-+ pciback_release_pci_dev(pdev, dev);
++ sysdev_remove_file(&pcpu->sysdev, &attr_online);
++ sysdev_unregister(&pcpu->sysdev);
++ list_del(&pcpu->pcpu_list);
++ kfree(pcpu);
+
-+out:
-+ return err;
++ return 0;
+}
+
-+static int pciback_publish_pci_root(struct pciback_device *pdev,
-+ unsigned int domain, unsigned int bus)
++static inline int same_pcpu(struct xenpf_pcpuinfo *info,
++ struct pcpu *pcpu)
+{
-+ unsigned int d, b;
-+ int i, root_num, len, err;
-+ char str[64];
-+
-+ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
-+
-+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
-+ "root_num", "%d", &root_num);
-+ if (err == 0 || err == -ENOENT)
-+ root_num = 0;
-+ else if (err < 0)
-+ goto out;
++ return (pcpu->apic_id == info->apic_id) &&
++ (pcpu->xen_id == info->xen_cpuid);
++}
+
-+ /* Verify that we haven't already published this pci root */
-+ for (i = 0; i < root_num; i++) {
-+ len = snprintf(str, sizeof(str), "root-%d", i);
-+ if (unlikely(len >= (sizeof(str) - 1))) {
-+ err = -ENOMEM;
-+ goto out;
-+ }
++/*
++ * Return 1 if online status changed
++ */
++static int xen_pcpu_online_check(struct xenpf_pcpuinfo *info,
++ struct pcpu *pcpu)
++{
++ int result = 0;
+
-+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
-+ str, "%x:%x", &d, &b);
-+ if (err < 0)
-+ goto out;
-+ if (err != 2) {
-+ err = -EINVAL;
-+ goto out;
-+ }
++ if (info->xen_cpuid != pcpu->xen_id)
++ return 0;
+
-+ if (d == domain && b == bus) {
-+ err = 0;
-+ goto out;
-+ }
++ if (xen_pcpu_online(info->flags) && !xen_pcpu_online(pcpu->flags)) {
++ /* the pcpu is onlined */
++ pcpu->flags |= XEN_PCPU_FLAGS_ONLINE;
++ kobject_uevent(&pcpu->sysdev.kobj, KOBJ_ONLINE);
++ raw_notifier_call_chain(&xen_pcpu_chain,
++ XEN_PCPU_ONLINE, (void *)(long)pcpu->xen_id);
++ result = 1;
++ } else if (!xen_pcpu_online(info->flags) &&
++ xen_pcpu_online(pcpu->flags)) {
++ /* The pcpu is offlined now */
++ pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE;
++ kobject_uevent(&pcpu->sysdev.kobj, KOBJ_OFFLINE);
++ raw_notifier_call_chain(&xen_pcpu_chain,
++ XEN_PCPU_OFFLINE, (void *)(long)pcpu->xen_id);
++ result = 1;
+ }
+
-+ len = snprintf(str, sizeof(str), "root-%d", root_num);
-+ if (unlikely(len >= (sizeof(str) - 1))) {
-+ err = -ENOMEM;
-+ goto out;
-+ }
++ return result;
++}
+
-+ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
-+ root_num, domain, bus);
++static int pcpu_sysdev_init(struct pcpu *cpu)
++{
++ int error;
+
-+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
-+ "%04x:%02x", domain, bus);
-+ if (err)
-+ goto out;
++ error = sysdev_register(&cpu->sysdev);
++ if (error) {
++ printk(KERN_WARNING
++ "pcpu_sysdev_init: Failed to register pcpu\n");
++ /* the caller owns the pcpu and frees it on failure */
++ return -1;
++ }
++ sysdev_create_file(&cpu->sysdev, &attr_online);
++ sysdev_create_file(&cpu->sysdev, &attr_apic_id);
++ sysdev_create_file(&cpu->sysdev, &attr_acpi_id);
++ return 0;
++}
+
-+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
-+ "root_num", "%d", (root_num + 1));
++static struct pcpu *get_pcpu(int xen_id)
++{
++ struct pcpu *pcpu = NULL;
+
-+out:
-+ return err;
++ list_for_each_entry(pcpu, &xen_pcpus.list, pcpu_list) {
++ if (pcpu->xen_id == xen_id)
++ return pcpu;
++ }
++ return NULL;
+}
+
-+static int pciback_reconfigure(struct pciback_device *pdev)
++static struct pcpu *init_pcpu(struct xenpf_pcpuinfo *info)
+{
-+ int err = 0;
-+ int num_devs;
-+ int domain, bus, slot, func;
-+ int substate;
-+ int i, len;
-+ char state_str[64];
-+ char dev_str[64];
-+
-+ spin_lock(&pdev->dev_lock);
++ struct pcpu *pcpu;
+
-+ dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
++ if (info->flags & XEN_PCPU_FLAGS_INVALID)
++ return NULL;
+
-+ /* Make sure we only reconfigure once */
-+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
-+ XenbusStateReconfiguring)
-+ goto out;
++ /* The PCPU is just added */
++ pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL);
++ if (!pcpu)
++ return NULL;
+
-+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
-+ &num_devs);
-+ if (err != 1) {
-+ if (err >= 0)
-+ err = -EINVAL;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error reading number of devices");
-+ goto out;
-+ }
++ INIT_LIST_HEAD(&pcpu->pcpu_list);
++ pcpu->xen_id = info->xen_cpuid;
++ pcpu->apic_id = info->apic_id;
++ pcpu->acpi_id = info->acpi_id;
++ pcpu->flags = info->flags;
+
-+ for (i = 0; i < num_devs; i++) {
-+ len = snprintf(state_str, sizeof(state_str), "state-%d", i);
-+ if (unlikely(len >= (sizeof(state_str) - 1))) {
-+ err = -ENOMEM;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "String overflow while reading "
-+ "configuration");
-+ goto out;
-+ }
-+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
-+ "%d", &substate);
-+ if (err != 1)
-+ substate = XenbusStateUnknown;
++ pcpu->sysdev.cls = &xen_pcpu_sysdev_class;
++ pcpu->sysdev.id = info->xen_cpuid;
+
-+ switch (substate) {
-+ case XenbusStateInitialising:
-+ dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
++ if (pcpu_sysdev_init(pcpu)) {
++ kfree(pcpu);
++ return NULL;
++ }
+
-+ len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
-+ if (unlikely(len >= (sizeof(dev_str) - 1))) {
-+ err = -ENOMEM;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "String overflow while "
-+ "reading configuration");
-+ goto out;
-+ }
-+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
-+ dev_str, "%x:%x:%x.%x",
-+ &domain, &bus, &slot, &func);
-+ if (err < 0) {
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error reading device "
-+ "configuration");
-+ goto out;
-+ }
-+ if (err != 4) {
-+ err = -EINVAL;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error parsing pci device "
-+ "configuration");
-+ goto out;
-+ }
++ list_add_tail(&pcpu->pcpu_list, &xen_pcpus.list);
++ raw_notifier_call_chain(&xen_pcpu_chain,
++ XEN_PCPU_ADD,
++ (void *)(long)pcpu->xen_id);
++ return pcpu;
++}
++
++#define PCPU_NO_CHANGE 0
++#define PCPU_ADDED 1
++#define PCPU_ONLINE_OFFLINE 2
++#define PCPU_REMOVED 3
++/*
++ * The caller must hold the pcpu lock; *result is set to:
++ * < 0: something went wrong
++ * 0: no changes
++ * > 0: state changed
++ */
++static struct pcpu *_sync_pcpu(int cpu_num, int *max_id, int *result)
++{
++ struct pcpu *pcpu = NULL;
++ struct xenpf_pcpuinfo *info;
++ xen_platform_op_t op = {
++ .cmd = XENPF_get_cpuinfo,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ };
++ int ret;
+
-+ err = pciback_export_device(pdev, domain, bus, slot,
-+ func, i);
-+ if (err)
-+ goto out;
++ *result = -1;
+
-+ /* Publish pci roots. */
-+ err = pciback_publish_pci_roots(pdev,
-+ pciback_publish_pci_root);
-+ if (err) {
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error while publish PCI root"
-+ "buses for frontend");
-+ goto out;
-+ }
++ info = &op.u.pcpu_info;
++ info->xen_cpuid = cpu_num;
+
-+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
-+ state_str, "%d",
-+ XenbusStateInitialised);
-+ if (err) {
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error switching substate of "
-+ "dev-%d\n", i);
-+ goto out;
-+ }
-+ break;
++ ret = HYPERVISOR_dom0_op(&op);
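++ /* XENPF_get_cpuinfo fills in the flags, apic_id and acpi_id of this cpu */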
++ if (ret)
++ return NULL;
+
-+ case XenbusStateClosing:
-+ dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
++ if (max_id)
++ *max_id = op.u.pcpu_info.max_present;
+
-+ len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
-+ if (unlikely(len >= (sizeof(dev_str) - 1))) {
-+ err = -ENOMEM;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "String overflow while "
-+ "reading configuration");
-+ goto out;
-+ }
-+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
-+ dev_str, "%x:%x:%x.%x",
-+ &domain, &bus, &slot, &func);
-+ if (err < 0) {
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error reading device "
-+ "configuration");
-+ goto out;
-+ }
-+ if (err != 4) {
-+ err = -EINVAL;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error parsing pci device "
-+ "configuration");
-+ goto out;
-+ }
++ pcpu = get_pcpu(cpu_num);
+
-+ err = pciback_remove_device(pdev, domain, bus, slot,
-+ func);
-+ if (err)
-+ goto out;
++ if (info->flags & XEN_PCPU_FLAGS_INVALID) {
++ /* The pcpu is gone: drop our local copy if we have one */
++ *result = PCPU_NO_CHANGE;
++ if (pcpu) {
++ raw_notifier_call_chain(&xen_pcpu_chain,
++ XEN_PCPU_REMOVE,
++ (void *)(long)pcpu->xen_id);
++ xen_pcpu_free(pcpu);
++ *result = PCPU_REMOVED;
++ }
++ return NULL;
++ }
+
-+ /* TODO: If at some point we implement support for pci
-+ * root hot-remove on pcifront side, we'll need to
-+ * remove unnecessary xenstore nodes of pci roots here.
-+ */
+
-+ break;
++ if (!pcpu) {
++ *result = PCPU_ADDED;
++ pcpu = init_pcpu(info);
++ if (pcpu == NULL) {
++ printk(KERN_WARNING "Failed to init pcpu %x\n",
++ info->xen_cpuid);
++ *result = -1;
++ }
++ } else {
++ *result = PCPU_NO_CHANGE;
++ /*
++ * An old pcpu replaced with a new one means that
++ * several virqs were missed; can that happen?
++ */
++ if (!same_pcpu(info, pcpu)) {
++ printk(KERN_WARNING "Pcpu %x changed!\n",
++ pcpu->xen_id);
++ pcpu->apic_id = info->apic_id;
++ pcpu->acpi_id = info->acpi_id;
++ }
++ if (xen_pcpu_online_check(info, pcpu))
++ *result = PCPU_ONLINE_OFFLINE;
++ }
++ return pcpu;
++}
+
++/*
++ * Sync dom0's pcpu information with xen hypervisor's
++ */
++static int xen_sync_pcpus(void)
++{
++ /*
++ * The boot cpu always has cpu_id 0 in xen
++ */
++ int cpu_num = 0, max_id = 0, result = 0, present = 0;
++ struct list_head *elem, *tmp;
++ struct pcpu *pcpu;
++
++ get_pcpu_lock();
++
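++ /* max_id is refreshed by each _sync_pcpu() call, so the loop bound can grow */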
++ while ((result >= 0) && (cpu_num <= max_id)) {
++ pcpu = _sync_pcpu(cpu_num, &max_id, &result);
++
++ printk(KERN_DEBUG "sync cpu %x: result %x, max_id %x\n",
++ cpu_num, result, max_id);
++
++ switch (result) {
++ case PCPU_NO_CHANGE:
++ if (pcpu)
++ present++;
++ break;
++ case PCPU_ADDED:
++ case PCPU_ONLINE_OFFLINE:
++ present++;
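++ /* fall through */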
++ case PCPU_REMOVED:
++ break;
+ default:
++ printk(KERN_WARNING "Failed to sync pcpu %x\n",
++ cpu_num);
+ break;
++
+ }
++ cpu_num++;
+ }
+
-+ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
-+ if (err) {
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error switching to reconfigured state!");
-+ goto out;
++ if (result < 0) {
++ list_for_each_safe(elem, tmp, &xen_pcpus.list) {
++ pcpu = list_entry(elem, struct pcpu, pcpu_list);
++ xen_pcpu_free(pcpu);
++ }
++ present = 0;
+ }
+
-+out:
-+ spin_unlock(&pdev->dev_lock);
++ xen_pcpus.present = present;
++
++ put_pcpu_lock();
+
+ return 0;
+}
+
-+static void pciback_frontend_changed(struct xenbus_device *xdev,
-+ enum xenbus_state fe_state)
++static void xen_pcpu_dpc(struct work_struct *work)
+{
-+ struct pciback_device *pdev = dev_get_drvdata(&xdev->dev);
-+
-+ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
++ if (xen_sync_pcpus() < 0)
++ printk(KERN_WARNING
++ "xen_pcpu_dpc: Failed to sync pcpu information\n");
++}
++static DECLARE_WORK(xen_pcpu_work, xen_pcpu_dpc);
+
-+ switch (fe_state) {
-+ case XenbusStateInitialised:
-+ pciback_attach(pdev);
-+ break;
++int xen_pcpu_hotplug(int type, uint32_t apic_id)
++{
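++ /* the current pcpu state is re-read from the hypervisor in the work handler */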
++ schedule_work(&xen_pcpu_work);
+
-+ case XenbusStateReconfiguring:
-+ pciback_reconfigure(pdev);
-+ break;
++ return 0;
++}
++EXPORT_SYMBOL(xen_pcpu_hotplug);
+
-+ case XenbusStateConnected:
-+ /* pcifront switched its state from reconfiguring to connected.
-+ * Then switch to connected state.
-+ */
-+ xenbus_switch_state(xdev, XenbusStateConnected);
-+ break;
++static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id)
++{
++ schedule_work(&xen_pcpu_work);
++ return IRQ_HANDLED;
++}
+
-+ case XenbusStateClosing:
-+ pciback_disconnect(pdev);
-+ xenbus_switch_state(xdev, XenbusStateClosing);
-+ break;
++static int __init xen_pcpu_init(void)
++{
++ int err;
+
-+ case XenbusStateClosed:
-+ pciback_disconnect(pdev);
-+ xenbus_switch_state(xdev, XenbusStateClosed);
-+ if (xenbus_dev_is_online(xdev))
-+ break;
-+ /* fall through if not online */
-+ case XenbusStateUnknown:
-+ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
-+ device_unregister(&xdev->dev);
-+ break;
++ if (!xen_initial_domain())
++ return 0;
+
-+ default:
-+ break;
++ err = sysdev_class_register(&xen_pcpu_sysdev_class);
++ if (err) {
++ printk(KERN_WARNING
++ "xen_pcpu_init: register xen_pcpu sysdev Failed!\n");
++ return err;
+ }
++
++ INIT_LIST_HEAD(&xen_pcpus.list);
++ xen_pcpus.present = 0;
++
++ xen_sync_pcpus();
++ if (xen_pcpus.present > 0)
++ err = bind_virq_to_irqhandler(VIRQ_PCPU_STATE,
++ 0, xen_pcpu_interrupt, 0, "pcpu", NULL);
++ if (err < 0)
++ printk(KERN_WARNING "xen_pcpu_init: "
++ "Failed to bind pcpu_state virq\n"
++ "You will lost latest information! \n");
++ return err;
+}
+
-+static int pciback_setup_backend(struct pciback_device *pdev)
-+{
-+ /* Get configuration from xend (if available now) */
-+ int domain, bus, slot, func;
-+ int err = 0;
-+ int i, num_devs;
-+ char dev_str[64];
-+ char state_str[64];
++subsys_initcall(xen_pcpu_init);
+diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
+index 88a60e0..ae5cb05 100644
+--- a/drivers/xen/sys-hypervisor.c
++++ b/drivers/xen/sys-hypervisor.c
+@@ -14,6 +14,7 @@
+ #include <asm/xen/hypervisor.h>
+ #include <asm/xen/hypercall.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/version.h>
+diff --git a/drivers/xen/xen_acpi_memhotplug.c b/drivers/xen/xen_acpi_memhotplug.c
+new file mode 100644
+index 0000000..0c4af99
+--- /dev/null
++++ b/drivers/xen/xen_acpi_memhotplug.c
+@@ -0,0 +1,209 @@
++/*
++ * xen_acpi_memhotplug.c - interface to notify Xen on memory device hotadd
++ *
++ * Copyright (C) 2008, Intel corporation
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or (at
++ * your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with this program; if not, write to the Free Software Foundation, Inc.,
++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
++ *
++ */
+
-+ spin_lock(&pdev->dev_lock);
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/memory_hotplug.h>
++#include <acpi/acpi_drivers.h>
++#include <xen/interface/platform.h>
++#include <linux/interrupt.h>
++#include <linux/spinlock.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include <xen/acpi.h>
+
-+ /* It's possible we could get the call to setup twice, so make sure
-+ * we're not already connected.
-+ */
-+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
-+ XenbusStateInitWait)
-+ goto out;
++struct xen_hotmem_entry {
++ struct list_head hotmem_list;
++ uint64_t start;
++ uint64_t end;
++ uint32_t flags;
++ uint32_t pxm;
++};
+
-+ dev_dbg(&pdev->xdev->dev, "getting be setup\n");
++struct xen_hotmem_list {
++ struct list_head list;
++ int entry_nr;
++} xen_hotmem;
+
-+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
-+ &num_devs);
-+ if (err != 1) {
-+ if (err >= 0)
-+ err = -EINVAL;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error reading number of devices");
-+ goto out;
-+ }
++DEFINE_SPINLOCK(xen_hotmem_lock);
+
-+ for (i = 0; i < num_devs; i++) {
-+ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
-+ if (unlikely(l >= (sizeof(dev_str) - 1))) {
-+ err = -ENOMEM;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "String overflow while reading "
-+ "configuration");
-+ goto out;
-+ }
++static int xen_hyper_addmem(struct xen_hotmem_entry *entry)
++{
++ int ret;
+
-+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
-+ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
-+ if (err < 0) {
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error reading device configuration");
-+ goto out;
-+ }
-+ if (err != 4) {
-+ err = -EINVAL;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error parsing pci device "
-+ "configuration");
-+ goto out;
-+ }
++ xen_platform_op_t op = {
++ .cmd = XENPF_mem_hotadd,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ };
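++ /* convert the physical address range to start/end page frame numbers */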
++ op.u.mem_add.spfn = entry->start >> PAGE_SHIFT;
++ op.u.mem_add.epfn = entry->end >> PAGE_SHIFT;
++ op.u.mem_add.flags = entry->flags;
++ op.u.mem_add.pxm = entry->pxm;
+
-+ err = pciback_export_device(pdev, domain, bus, slot, func, i);
-+ if (err)
-+ goto out;
++ ret = HYPERVISOR_dom0_op(&op);
++ return ret;
++}
+
-+ /* Switch substate of this device. */
-+ l = snprintf(state_str, sizeof(state_str), "state-%d", i);
-+ if (unlikely(l >= (sizeof(state_str) - 1))) {
-+ err = -ENOMEM;
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "String overflow while reading "
-+ "configuration");
-+ goto out;
-+ }
-+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
-+ "%d", XenbusStateInitialised);
-+ if (err) {
-+ xenbus_dev_fatal(pdev->xdev, err, "Error switching "
-+ "substate of dev-%d\n", i);
-+ goto out;
-+ }
-+ }
++static int add_hotmem_entry(int pxm, uint64_t start,
++ uint64_t length, uint32_t flags)
++{
++ struct xen_hotmem_entry *entry;
+
-+ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
-+ if (err) {
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error while publish PCI root buses "
-+ "for frontend");
-+ goto out;
-+ }
++ if (pxm < 0 || !length)
++ return -EINVAL;
+
-+ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
-+ if (err)
-+ xenbus_dev_fatal(pdev->xdev, err,
-+ "Error switching to initialised state!");
++ entry = kzalloc(sizeof(struct xen_hotmem_entry), GFP_ATOMIC);
++ if (!entry)
++ return -ENOMEM;
+
-+out:
-+ spin_unlock(&pdev->dev_lock);
++ INIT_LIST_HEAD(&entry->hotmem_list);
++ entry->start = start;
++ entry->end = start + length;
++ entry->flags = flags;
++ entry->pxm = pxm;
+
-+ if (!err)
-+ /* see if pcifront is already configured (if not, we'll wait) */
-+ pciback_attach(pdev);
++ spin_lock(&xen_hotmem_lock);
+
-+ return err;
++ list_add_tail(&entry->hotmem_list, &xen_hotmem.list);
++ xen_hotmem.entry_nr++;
++
++ spin_unlock(&xen_hotmem_lock);
++
++ return 0;
+}
+
-+static void pciback_be_watch(struct xenbus_watch *watch,
-+ const char **vec, unsigned int len)
++static int free_hotmem_entry(struct xen_hotmem_entry *entry)
+{
-+ struct pciback_device *pdev =
-+ container_of(watch, struct pciback_device, be_watch);
-+
-+ switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
-+ case XenbusStateInitWait:
-+ pciback_setup_backend(pdev);
-+ break;
++ list_del(&entry->hotmem_list);
++ kfree(entry);
+
-+ default:
-+ break;
-+ }
++ return 0;
+}
+
-+static int pciback_xenbus_probe(struct xenbus_device *dev,
-+ const struct xenbus_device_id *id)
++static void xen_hotadd_mem_dpc(struct work_struct *work)
+{
-+ int err = 0;
-+ struct pciback_device *pdev = alloc_pdev(dev);
++ struct list_head *elem, *tmp;
++ struct xen_hotmem_entry *entry;
++ unsigned long flags;
++ int ret;
+
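++ /* drain all queued hot-add entries and hand each range to the hypervisor */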
-+ if (pdev == NULL) {
-+ err = -ENOMEM;
-+ xenbus_dev_fatal(dev, err,
-+ "Error allocating pciback_device struct");
-+ goto out;
++ spin_lock_irqsave(&xen_hotmem_lock, flags);
++ list_for_each_safe(elem, tmp, &xen_hotmem.list) {
++ entry = list_entry(elem, struct xen_hotmem_entry, hotmem_list);
++ ret = xen_hyper_addmem(entry);
++ if (ret)
++ printk(KERN_WARNING "xen addmem failed with %x\n", ret);
++ free_hotmem_entry(entry);
++ xen_hotmem.entry_nr--;
+ }
++ spin_unlock_irqrestore(&xen_hotmem_lock, flags);
++}
+
-+ /* wait for xend to configure us */
-+ err = xenbus_switch_state(dev, XenbusStateInitWait);
-+ if (err)
-+ goto out;
++static DECLARE_WORK(xen_hotadd_mem_work, xen_hotadd_mem_dpc);
+
-+ /* watch the backend node for backend configuration information */
-+ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
-+ pciback_be_watch);
-+ if (err)
-+ goto out;
-+ pdev->be_watching = 1;
++static int xen_acpi_get_pxm(acpi_handle h)
++{
++ unsigned long long pxm;
++ acpi_status status;
++ acpi_handle handle;
++ acpi_handle phandle = h;
+
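++ /* walk up the acpi namespace until a _PXM method is found */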
-+ /* We need to force a call to our callback here in case
-+ * xend already configured us!
-+ */
-+ pciback_be_watch(&pdev->be_watch, NULL, 0);
++ do {
++ handle = phandle;
++ status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm);
++ if (ACPI_SUCCESS(status))
++ return pxm;
++ status = acpi_get_parent(handle, &phandle);
++ } while (ACPI_SUCCESS(status));
+
-+out:
-+ return err;
++ return -1;
+}
+
-+static int pciback_xenbus_remove(struct xenbus_device *dev)
++int xen_hotadd_memory(struct acpi_memory_device *mem_device)
+{
-+ struct pciback_device *pdev = dev_get_drvdata(&dev->dev);
++ int pxm, result;
++ int num_enabled = 0;
++ struct acpi_memory_info *info;
+
-+ if (pdev != NULL)
-+ free_pdev(pdev);
++ if (!mem_device)
++ return -EINVAL;
++
++ pxm = xen_acpi_get_pxm(mem_device->device->handle);
++
++ if (pxm < 0)
++ return -EINVAL;
++
++ /*
++ * Always return success to the ACPI driver, and notify the
++ * hypervisor later, since the hypervisor consumes the memory
++ * in the memory hotadd hypercall
++ */
++ list_for_each_entry(info, &mem_device->res_list, list) {
++ if (info->enabled) { /* just a sanity check... */
++ num_enabled++;
++ continue;
++ }
++ /*
++ * If the memory block size is zero, ignore it;
++ * don't attempt the memory hotplug flow below.
++ */
++ if (!info->length)
++ continue;
++
++ result = add_hotmem_entry(pxm, info->start_addr,
++ info->length, 0);
++ if (result)
++ continue;
++ info->enabled = 1;
++ num_enabled++;
++ }
++
++ if (!num_enabled)
++ return -EINVAL;
++
++ schedule_work(&xen_hotadd_mem_work);
+
+ return 0;
+}
++EXPORT_SYMBOL(xen_hotadd_memory);
+
-+static const struct xenbus_device_id xenpci_ids[] = {
-+ {"pci"},
-+ {""},
-+};
++static int xen_hotadd_mem_init(void)
++{
++ if (!xen_initial_domain())
++ return -ENODEV;
+
-+static struct xenbus_driver xenbus_pciback_driver = {
-+ .name = "pciback",
-+ .owner = THIS_MODULE,
-+ .ids = xenpci_ids,
-+ .probe = pciback_xenbus_probe,
-+ .remove = pciback_xenbus_remove,
-+ .otherend_changed = pciback_frontend_changed,
-+};
++ INIT_LIST_HEAD(&xen_hotmem.list);
++ xen_hotmem.entry_nr = 0;
+
-+int __init pciback_xenbus_register(void)
-+{
-+ pciback_wq = create_workqueue("pciback_workqueue");
-+ if (!pciback_wq) {
-+ printk(KERN_ERR "pciback_xenbus_register: create"
-+ "pciback_workqueue failed\n");
-+ return -EFAULT;
-+ }
-+ return xenbus_register_backend(&xenbus_pciback_driver);
++ return 0;
+}
+
-+void __exit pciback_xenbus_unregister(void)
++static void xen_hotadd_mem_exit(void)
+{
-+ destroy_workqueue(pciback_wq);
-+ xenbus_unregister_driver(&xenbus_pciback_driver);
++ flush_scheduled_work();
+}
-diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
-index 88a60e0..ae5cb05 100644
---- a/drivers/xen/sys-hypervisor.c
-+++ b/drivers/xen/sys-hypervisor.c
-@@ -14,6 +14,7 @@
- #include <asm/xen/hypervisor.h>
- #include <asm/xen/hypercall.h>
-
-+#include <xen/xen.h>
- #include <xen/xenbus.h>
- #include <xen/interface/xen.h>
- #include <xen/interface/version.h>
++
++module_init(xen_hotadd_mem_init);
++module_exit(xen_hotadd_mem_exit);
++MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
index 5571f5b..8dca685 100644
--- a/drivers/xen/xenbus/Makefile
@@ -22461,20 +24664,8 @@
};
return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
}
-diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
-index 090c61e..700dc77 100644
---- a/drivers/xen/xenbus/xenbus_comms.c
-+++ b/drivers/xen/xenbus/xenbus_comms.c
-@@ -49,6 +49,7 @@ static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
- static irqreturn_t wake_waiting(int irq, void *unused)
- {
- if (unlikely(xenstored_ready == 0)) {
-+ printk(KERN_CRIT "xenbus_probe wake_waiting\n");
- xenstored_ready = 1;
- schedule_work(&probe_work);
- }
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
-index 649fcdf..a90e0bf 100644
+index 649fcdf..ab04a1b 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -49,6 +49,8 @@
@@ -22745,56 +24936,24 @@
static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
{
-@@ -569,15 +476,23 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
- unsigned int dir_n = 0;
- int i;
-
-+ printk(KERN_CRIT "%s type %s\n", __func__, type);
-+
- dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
-- if (IS_ERR(dir))
-+ if (IS_ERR(dir)) {
-+ printk(KERN_CRIT "%s failed xenbus_directory\n", __func__);
+@@ -574,10 +481,11 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
return PTR_ERR(dir);
-+ }
for (i = 0; i < dir_n; i++) {
- err = bus->probe(type, dir[i]);
-- if (err)
-+ printk(KERN_CRIT "%s %d/%d %s\n", __func__, i+1,dir_n, dir[i]);
+ err = bus->probe(bus, type, dir[i]);
-+ if (err) {
-+ printk(KERN_CRIT "%s failed\n", __func__);
+ if (err)
break;
-+ }
}
-+ printk("%s done\n", __func__);
++
kfree(dir);
return err;
}
-@@ -588,18 +503,27 @@ int xenbus_probe_devices(struct xen_bus_type *bus)
- char **dir;
- unsigned int i, dir_n;
-
-+ printk(KERN_CRIT "%s %s\n", __func__, bus->root);
-+
- dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
-- if (IS_ERR(dir))
-+ if (IS_ERR(dir)) {
-+ printk(KERN_CRIT "%s failed xenbus_directory\n", __func__);
- return PTR_ERR(dir);
-+ }
-
- for (i = 0; i < dir_n; i++) {
-+ printk(KERN_CRIT "%s %d/%d %s\n", __func__, i+1,dir_n, dir[i]);
- err = xenbus_probe_device_type(bus, dir[i]);
-- if (err)
-+ if (err) {
-+ printk(KERN_CRIT "%s failed\n", __func__);
+@@ -597,9 +505,11 @@ int xenbus_probe_devices(struct xen_bus_type *bus)
+ if (err)
break;
-+ }
}
-+ printk("%s done\n", __func__);
++
kfree(dir);
return err;
}
@@ -22802,7 +24961,7 @@
static unsigned int char_count(const char *str, char c)
{
-@@ -662,32 +586,17 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
+@@ -662,32 +572,17 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
}
EXPORT_SYMBOL_GPL(xenbus_dev_changed);
@@ -22838,7 +24997,7 @@
if (drv->suspend)
err = drv->suspend(xdev, state);
if (err)
-@@ -695,21 +604,19 @@ static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
+@@ -695,21 +590,19 @@ static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
"xenbus: suspend %s failed: %i\n", dev_name(dev), err);
return 0;
}
@@ -22864,7 +25023,7 @@
err = talk_to_otherend(xdev);
if (err) {
printk(KERN_WARNING
-@@ -740,6 +647,7 @@ static int xenbus_dev_resume(struct device *dev)
+@@ -740,6 +633,7 @@ static int xenbus_dev_resume(struct device *dev)
return 0;
}
@@ -22872,7 +25031,7 @@
/* A flag to determine if xenstored is 'ready' (i.e. has started) */
int xenstored_ready = 0;
-@@ -768,10 +676,7 @@ void xenbus_probe(struct work_struct *unused)
+@@ -768,11 +662,6 @@ void xenbus_probe(struct work_struct *unused)
{
BUG_ON((xenstored_ready <= 0));
@@ -22880,11 +25039,11 @@
- xenbus_probe_devices(&xenbus_frontend);
- register_xenbus_watch(&fe_watch);
- xenbus_backend_probe_and_watch();
-+ printk(KERN_CRIT "xenbus_probe wake_waiting\n");
-
+-
/* Notify others that xenstore is up */
blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
-@@ -780,27 +685,43 @@ void xenbus_probe(struct work_struct *unused)
+ }
+@@ -780,27 +669,43 @@ void xenbus_probe(struct work_struct *unused)
static int __init xenbus_probe_init(void)
{
int err = 0;
@@ -22939,7 +25098,7 @@
} else {
xenstored_ready = 1;
xen_store_evtchn = xen_start_info->store_evtchn;
-@@ -813,7 +734,7 @@ static int __init xenbus_probe_init(void)
+@@ -813,7 +718,7 @@ static int __init xenbus_probe_init(void)
if (err) {
printk(KERN_WARNING
"XENBUS: Error initializing xenstore comms: %i\n", err);
@@ -22948,11 +25107,8 @@
}
if (!xen_initial_domain())
-@@ -827,130 +748,17 @@ static int __init xenbus_probe_init(void)
- proc_mkdir("xen", NULL);
- #endif
+@@ -829,128 +734,13 @@ static int __init xenbus_probe_init(void)
-+ printk(KERN_CRIT "%s ok\n", __func__);
return 0;
- out_unreg_back:
@@ -22965,7 +25121,6 @@
+ if (page != 0)
+ free_page(page);
+
-+ printk(KERN_CRIT "err %d in %s\n", err, __func__);
return err;
}
@@ -23136,10 +25291,10 @@
#endif
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
new file mode 100644
-index 0000000..a3cc535
+index 0000000..9b9dd36
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
-@@ -0,0 +1,298 @@
+@@ -0,0 +1,293 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have (backend half).
+ *
@@ -23409,10 +25564,8 @@
+{
+ /* Enumerate devices in xenstore and watch for changes. */
+ xenbus_probe_devices(&xenbus_backend);
-+ printk(KERN_CRIT "%s devices probed ok\n", __func__);
+ register_xenbus_watch(&be_watch);
-+ printk(KERN_CRIT "%s watch add ok ok\n", __func__);
-+ printk(KERN_CRIT "%s all done\n", __func__);
++
+ return NOTIFY_DONE;
+}
+
@@ -23427,11 +25580,8 @@
+
+ /* Register ourselves with the kernel bus subsystem */
+ err = bus_register(&xenbus_backend.bus);
-+ if (err) {
-+ printk(KERN_CRIT "%s didn't register bus!\n", __func__);
++ if (err)
+ return err;
-+ }
-+ printk(KERN_CRIT "%s bus registered ok\n", __func__);
+
+ register_xenstore_notifier(&xenstore_notifier);
+
@@ -23440,10 +25590,10 @@
+subsys_initcall(xenbus_probe_backend_init);
diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
new file mode 100644
-index 0000000..47be902
+index 0000000..77af5c3
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
-@@ -0,0 +1,292 @@
+@@ -0,0 +1,287 @@
+#define DPRINTK(fmt, args...) \
+ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
+ __func__, __LINE__, ##args)
@@ -23693,10 +25843,8 @@
+{
+ /* Enumerate devices in xenstore and watch for changes. */
+ xenbus_probe_devices(&xenbus_frontend);
-+ printk(KERN_CRIT "%s devices probed ok\n", __func__);
+ register_xenbus_watch(&fe_watch);
-+ printk(KERN_CRIT "%s watch add ok ok\n", __func__);
-+ printk(KERN_CRIT "%s all done\n", __func__);
++
+ return NOTIFY_DONE;
+}
+
@@ -23712,11 +25860,8 @@
+
+ /* Register ourselves with the kernel bus subsystem */
+ err = bus_register(&xenbus_frontend.bus);
-+ if (err) {
-+ printk(KERN_CRIT "%s didn't register bus!\n", __func__);
++ if (err)
+ return err;
-+ }
-+ printk(KERN_CRIT "%s bus registered ok\n", __func__);
+
+ register_xenstore_notifier(&xenstore_notifier);
+
@@ -24518,6 +26663,71 @@
+ .read = xsd_read,
+ .release = xsd_release,
+};
+diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
+index f4906f6..e7233e8 100644
+--- a/include/acpi/acpi_drivers.h
++++ b/include/acpi/acpi_drivers.h
+@@ -154,4 +154,25 @@ static inline void unregister_hotplug_dock_device(acpi_handle handle)
+ }
+ #endif
+
++/*--------------------------------------------------------------------------
++ Memory
++ -------------------------------------------------------------------------- */
++#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
++ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
++struct acpi_memory_info {
++ struct list_head list;
++ u64 start_addr; /* Memory Range start physical addr */
++ u64 length; /* Memory Range length */
++ unsigned short caching; /* memory cache attribute */
++ unsigned short write_protect; /* memory read/write attribute */
++ unsigned int enabled:1;
++};
++
++struct acpi_memory_device {
++ struct acpi_device *device;
++ unsigned int state; /* State of the memory device */
++ struct list_head res_list;
++};
++#endif
++
+ #endif /*__ACPI_DRIVERS_H__*/
+diff --git a/include/acpi/processor.h b/include/acpi/processor.h
+index 740ac3a..3d1205f 100644
+--- a/include/acpi/processor.h
++++ b/include/acpi/processor.h
+@@ -238,6 +238,13 @@ struct acpi_processor_errata {
+ } piix4;
+ };
+
++extern int acpi_processor_errata(struct acpi_processor *pr);
++extern int acpi_processor_add_fs(struct acpi_device *device);
++extern int acpi_processor_remove_fs(struct acpi_device *device);
++extern int acpi_processor_set_pdc(struct acpi_processor *pr);
++extern int acpi_processor_remove(struct acpi_device *device, int type);
++extern void acpi_processor_notify(struct acpi_device *device, u32 event);
++
+ extern int acpi_processor_preregister_performance(struct
+ acpi_processor_performance
+ *performance);
+@@ -295,6 +302,8 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx
+ void acpi_processor_ppc_init(void);
+ void acpi_processor_ppc_exit(void);
+ int acpi_processor_ppc_has_changed(struct acpi_processor *pr);
++int acpi_processor_get_performance_info(struct acpi_processor *pr);
++int acpi_processor_get_psd(struct acpi_processor *pr);
+ #else
+ static inline void acpi_processor_ppc_init(void)
+ {
+@@ -331,6 +340,7 @@ int acpi_processor_power_init(struct acpi_processor *pr,
+ int acpi_processor_cst_has_changed(struct acpi_processor *pr);
+ int acpi_processor_power_exit(struct acpi_processor *pr,
+ struct acpi_device *device);
++int acpi_processor_get_power_info(struct acpi_processor *pr);
+ int acpi_processor_suspend(struct acpi_device * device, pm_message_t state);
+ int acpi_processor_resume(struct acpi_device * device);
+ extern struct cpuidle_driver acpi_idle_driver;
diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h
index 26373cf..9fb4270 100644
--- a/include/asm-generic/pci.h
@@ -24593,38 +26803,18 @@
#define FBINFO_PARTIAL_PAN_OK 0x0040 /* otw use pan only for double-buffering */
#define FBINFO_READS_FAST 0x0080 /* soft-copy faster than rendering */
-diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
-index 9bace4b..040b679 100644
---- a/include/linux/hrtimer.h
-+++ b/include/linux/hrtimer.h
-@@ -162,10 +162,11 @@ struct hrtimer_clock_base {
- * @expires_next: absolute time of the next event which was scheduled
- * via clock_set_next_event()
- * @hres_active: State of high resolution mode
-- * @check_clocks: Indictator, when set evaluate time source and clock
-- * event devices whether high resolution mode can be
-- * activated.
-- * @nr_events: Total number of timer interrupt events
-+ * @hang_detected: The last hrtimer interrupt detected a hang
-+ * @nr_events: Total number of hrtimer interrupt events
-+ * @nr_retries: Total number of hrtimer interrupt retries
-+ * @nr_hangs: Total number of hrtimer interrupt hangs
-+ * @max_hang_time: Maximum time spent in hrtimer_interrupt
- */
- struct hrtimer_cpu_base {
- spinlock_t lock;
-@@ -173,7 +174,11 @@ struct hrtimer_cpu_base {
- #ifdef CONFIG_HIGH_RES_TIMERS
- ktime_t expires_next;
- int hres_active;
-+ int hang_detected;
- unsigned long nr_events;
-+ unsigned long nr_retries;
-+ unsigned long nr_hangs;
-+ ktime_t max_hang_time;
- #endif
- };
+diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
+index 7ca72b7..1c30adf 100644
+--- a/include/linux/interrupt.h
++++ b/include/linux/interrupt.h
+@@ -62,6 +62,7 @@
+ #define IRQF_NOBALANCING 0x00000800
+ #define IRQF_IRQPOLL 0x00001000
+ #define IRQF_ONESHOT 0x00002000
++#define IRQF_NO_SUSPEND 0x00004000
+ /*
+ * Bits used by threaded handlers:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 24c3956..3d74515 100644
--- a/include/linux/mm.h
@@ -24834,6 +27024,19 @@
+
+
#endif /* __LINUX_SWIOTLB_H */
+diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
+index 3c123c3..1a2ba21 100644
+--- a/include/linux/vmalloc.h
++++ b/include/linux/vmalloc.h
+@@ -7,6 +7,8 @@
+
+ struct vm_area_struct; /* vma defining user mapping in mm_types.h */
+
++extern bool vmap_lazy_unmap;
++
+ /* bits in flags of vmalloc's vm_struct below */
+ #define VM_IOREMAP 0x00000001 /* ioremap() and friends */
+ #define VM_ALLOC 0x00000002 /* vmalloc() */
diff --git a/include/xen/Kbuild b/include/xen/Kbuild
index 4e65c16..84ad8f0 100644
--- a/include/xen/Kbuild
@@ -24841,6 +27044,118 @@
@@ -1 +1,2 @@
header-y += evtchn.h
+header-y += privcmd.h
+diff --git a/include/xen/acpi.h b/include/xen/acpi.h
+new file mode 100644
+index 0000000..279142d
+--- /dev/null
++++ b/include/xen/acpi.h
+@@ -0,0 +1,106 @@
++#ifndef _XEN_ACPI_H
++#define _XEN_ACPI_H
++
++#include <linux/types.h>
++#include <acpi/acpi_drivers.h>
++#include <acpi/processor.h>
++#include <xen/xen.h>
++
++#ifdef CONFIG_XEN_S3
++#include <asm/xen/hypervisor.h>
++
++static inline bool xen_pv_acpi(void)
++{
++ return xen_pv_domain();
++}
++#else
++static inline bool xen_pv_acpi(void)
++{
++ return false;
++}
++#endif
++
++int acpi_notify_hypervisor_state(u8 sleep_state,
++ u32 pm1a_cnt, u32 pm1b_cnd);
++
++/*
++ * Following are interfaces for xen acpi processor control
++ */
++
++/* Events notified to xen */
++#define PROCESSOR_PM_INIT 1
++#define PROCESSOR_PM_CHANGE 2
++#define PROCESSOR_HOTPLUG 3
++
++/* Objects for the PM events */
++#define PM_TYPE_IDLE 0
++#define PM_TYPE_PERF 1
++#define PM_TYPE_THR 2
++#define PM_TYPE_MAX 3
++
++#define XEN_MAX_ACPI_ID 255
++
++/* Processor hotplug events */
++#define HOTPLUG_TYPE_ADD 0
++#define HOTPLUG_TYPE_REMOVE 1
++
++int xen_acpi_processor_init(void);
++void xen_acpi_processor_exit(void);
++
++int xen_acpi_processor_power_init(struct acpi_processor *pr,
++ struct acpi_device *device);
++int xen_acpi_processor_cst_has_changed(struct acpi_processor *pr);
++
++void xen_arch_acpi_processor_init_pdc(struct acpi_processor *pr);
++
++#ifdef CONFIG_CPU_FREQ
++int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr);
++int xen_acpi_processor_get_performance(struct acpi_processor *pr);
++#else
++static inline int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr)
++{
++ return acpi_processor_ppc_has_changed(pr);
++}
++static inline int xen_acpi_processor_get_performance(struct acpi_processor *pr)
++{
++ printk(KERN_WARNING
++ "Warning: xen_acpi_processor_get_performance not supported\n"
++ "Consider compiling CPUfreq support into your kernel.\n");
++ return 0;
++}
++#endif
++
++#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
++ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
++int xen_hotadd_memory(struct acpi_memory_device *mem_device);
++#endif
++
++#if defined(CONFIG_ACPI_PROCESSOR_XEN) || \
++ defined(CONFIG_ACPI_PROCESSOR_XEN_MODULE)
++
++struct processor_cntl_xen_ops {
++ /* Transfer processor PM events to xen */
++ int (*pm_ops[PM_TYPE_MAX])(struct acpi_processor *pr, int event);
++ /* Notify physical processor status to xen */
++ int (*hotplug)(struct acpi_processor *pr, int type);
++};
++
++extern int processor_cntl_xen_notify(struct acpi_processor *pr,
++ int event, int type);
++extern int processor_cntl_xen_power_cache(int cpu, int cx,
++ struct acpi_power_register *reg);
++#else
++
++static inline int processor_cntl_xen_notify(struct acpi_processor *pr,
++ int event, int type)
++{
++ return 0;
++}
++static inline int processor_cntl_xen_power_cache(int cpu, int cx,
++ struct acpi_power_register *reg)
++{
++ return 0;
++}
++#endif /* CONFIG_ACPI_PROCESSOR_XEN */
++
++#endif /* _XEN_ACPI_H */
diff --git a/include/xen/balloon.h b/include/xen/balloon.h
new file mode 100644
index 0000000..e751514
@@ -25654,10 +27969,10 @@
* ** This command is obsolete since interface version 0x00030202 and is **
diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h
new file mode 100644
-index 0000000..83e4714
+index 0000000..17ae622
--- /dev/null
+++ b/include/xen/interface/platform.h
-@@ -0,0 +1,222 @@
+@@ -0,0 +1,381 @@
+/******************************************************************************
+ * platform.h
+ *
@@ -25859,6 +28174,160 @@
+typedef struct xenpf_getidletime xenpf_getidletime_t;
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_getidletime_t);
+
++#define XENPF_set_processor_pminfo 54
++
++/* ability bits */
++#define XEN_PROCESSOR_PM_CX 1
++#define XEN_PROCESSOR_PM_PX 2
++#define XEN_PROCESSOR_PM_TX 4
++
++/* cmd type */
++#define XEN_PM_CX 0
++#define XEN_PM_PX 1
++#define XEN_PM_TX 2
++
++/* Px sub info type */
++#define XEN_PX_PCT 1
++#define XEN_PX_PSS 2
++#define XEN_PX_PPC 4
++#define XEN_PX_PSD 8
++
++struct xen_power_register {
++ uint32_t space_id;
++ uint32_t bit_width;
++ uint32_t bit_offset;
++ uint32_t access_size;
++ uint64_t address;
++};
++
++struct xen_processor_csd {
++ uint32_t domain; /* domain number of one dependent group */
++ uint32_t coord_type; /* coordination type */
++ uint32_t num; /* number of processors in same domain */
++};
++typedef struct xen_processor_csd xen_processor_csd_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_csd);
++
++struct xen_processor_cx {
++ struct xen_power_register reg; /* GAS for Cx trigger register */
++ uint8_t type; /* cstate value, c0: 0, c1: 1, ... */
++ uint32_t latency; /* worst latency (ms) to enter/exit this cstate */
++ uint32_t power; /* average power consumption(mW) */
++ uint32_t dpcnt; /* number of dependency entries */
++ GUEST_HANDLE(xen_processor_csd) dp; /* NULL if no dependency */
++};
++typedef struct xen_processor_cx xen_processor_cx_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_cx);
++
++struct xen_processor_flags {
++ uint32_t bm_control:1;
++ uint32_t bm_check:1;
++ uint32_t has_cst:1;
++ uint32_t power_setup_done:1;
++ uint32_t bm_rld_set:1;
++};
++
++struct xen_processor_power {
++ uint32_t count; /* number of C state entries in array below */
++ struct xen_processor_flags flags; /* global flags of this processor */
++ GUEST_HANDLE(xen_processor_cx) states; /* supported c states */
++};
++
++struct xen_pct_register {
++ uint8_t descriptor;
++ uint16_t length;
++ uint8_t space_id;
++ uint8_t bit_width;
++ uint8_t bit_offset;
++ uint8_t reserved;
++ uint64_t address;
++};
++
++struct xen_processor_px {
++ uint64_t core_frequency; /* megahertz */
++ uint64_t power; /* milliWatts */
++ uint64_t transition_latency; /* microseconds */
++ uint64_t bus_master_latency; /* microseconds */
++ uint64_t control; /* control value */
++ uint64_t status; /* success indicator */
++};
++typedef struct xen_processor_px xen_processor_px_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_px);
++
++struct xen_psd_package {
++ uint64_t num_entries;
++ uint64_t revision;
++ uint64_t domain;
++ uint64_t coord_type;
++ uint64_t num_processors;
++};
++
++struct xen_processor_performance {
++ uint32_t flags; /* flag for Px sub info type */
++ uint32_t platform_limit; /* Platform limitation on freq usage */
++ struct xen_pct_register control_register;
++ struct xen_pct_register status_register;
++ uint32_t state_count; /* total available performance states */
++ GUEST_HANDLE(xen_processor_px) states;
++ struct xen_psd_package domain_info;
++ uint32_t shared_type; /* coordination type of this processor */
++};
++typedef struct xen_processor_performance xen_processor_performance_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_performance);
++
++struct xenpf_set_processor_pminfo {
++ /* IN variables */
++ uint32_t id; /* ACPI CPU ID */
++ uint32_t type; /* {XEN_PM_CX, XEN_PM_PX} */
++ union {
++ struct xen_processor_power power;/* Cx: _CST/_CSD */
++ struct xen_processor_performance perf; /* Px: _PPC/_PCT/_PSS/_PSD */
++ };
++};
++typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_set_processor_pminfo);
++
++#define XENPF_get_cpuinfo 55
++struct xenpf_pcpuinfo {
++ /* IN */
++ uint32_t xen_cpuid;
++ /* OUT */
++ /* The maximum cpu_id that is present */
++ uint32_t max_present;
++#define XEN_PCPU_FLAGS_ONLINE 1
++ /* Corresponding xen_cpuid is not present */
++#define XEN_PCPU_FLAGS_INVALID 2
++ uint32_t flags;
++ uint32_t apic_id;
++ uint32_t acpi_id;
++};
++typedef struct xenpf_pcpuinfo xenpf_pcpuinfo_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_pcpuinfo_t);
++
++#define XENPF_cpu_online 56
++#define XENPF_cpu_offline 57
++struct xenpf_cpu_ol {
++ uint32_t cpuid;
++};
++typedef struct xenpf_cpu_ol xenpf_cpu_ol_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_cpu_ol_t);
++
++#define XENPF_cpu_hotadd 58
++struct xenpf_cpu_hotadd {
++ uint32_t apic_id;
++ uint32_t acpi_id;
++ uint32_t pxm;
++};
++
++
++#define XENPF_mem_hotadd 59
++struct xenpf_mem_hotadd {
++ uint64_t spfn;
++ uint64_t epfn;
++ uint32_t pxm;
++ uint32_t flags;
++};
++
+struct xen_platform_op {
+ uint32_t cmd;
+ uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -25873,6 +28342,11 @@
+ struct xenpf_enter_acpi_sleep enter_acpi_sleep;
+ struct xenpf_change_freq change_freq;
+ struct xenpf_getidletime getidletime;
++ struct xenpf_set_processor_pminfo set_pminfo;
++ struct xenpf_pcpuinfo pcpu_info;
++ struct xenpf_cpu_ol cpu_ol;
++ struct xenpf_cpu_hotadd cpu_add;
++ struct xenpf_mem_hotadd mem_add;
+ uint8_t pad[128];
+ } u;
+};
@@ -25881,10 +28355,18 @@
+
+#endif /* __XEN_PUBLIC_PLATFORM_H__ */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
-index 2befa3e..327db61 100644
+index 2befa3e..9ffaee0 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
-@@ -184,6 +184,8 @@
+@@ -79,6 +79,7 @@
+ #define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */
+ #define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
+ #define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
++#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */
+
+ /* Architecture-specific VIRQ definitions. */
+ #define VIRQ_ARCH_0 16
+@@ -184,6 +185,8 @@
#define MMUEXT_NEW_USER_BASEPTR 15
#ifndef __ASSEMBLY__
@@ -25893,7 +28375,7 @@
struct mmuext_op {
unsigned int cmd;
union {
-@@ -449,6 +451,45 @@ struct start_info {
+@@ -449,9 +452,49 @@ struct start_info {
int8_t cmd_line[MAX_GUEST_CMDLINE];
};
@@ -25939,7 +28421,11 @@
/* These flags are passed in the 'flags' field of start_info_t. */
#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
-@@ -461,6 +502,8 @@ typedef uint8_t xen_domain_handle_t[16];
++#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
+
+ typedef uint64_t cpumap_t;
+
+@@ -461,6 +504,8 @@ typedef uint8_t xen_domain_handle_t[16];
#define __mk_unsigned_long(x) x ## UL
#define mk_unsigned_long(x) __mk_unsigned_long(x)
@@ -25948,6 +28434,42 @@
#else /* __ASSEMBLY__ */
/* In assembly code we cannot use C numeric constant suffixes. */
+diff --git a/include/xen/pcpu.h b/include/xen/pcpu.h
+new file mode 100644
+index 0000000..fb2bf6b
+--- /dev/null
++++ b/include/xen/pcpu.h
+@@ -0,0 +1,30 @@
++#ifndef _XEN_PCPU_H
++#define _XEN_PCPU_H
++
++#include <xen/interface/platform.h>
++#include <linux/sysdev.h>
++
++extern int xen_pcpu_hotplug(int type, uint32_t apic_id);
++#define XEN_PCPU_ONLINE 0x01
++#define XEN_PCPU_OFFLINE 0x02
++#define XEN_PCPU_ADD 0x04
++#define XEN_PCPU_REMOVE 0x08
++
++struct pcpu {
++ struct list_head pcpu_list;
++ struct sys_device sysdev;
++ uint32_t xen_id;
++ uint32_t apic_id;
++ uint32_t acpi_id;
++ uint32_t flags;
++};
++
++static inline int xen_pcpu_online(uint32_t flags)
++{
++ return !!(flags & XEN_PCPU_FLAGS_ONLINE);
++}
++
++extern int register_xen_pcpu_notifier(struct notifier_block *nb);
++
++extern void unregister_xen_pcpu_notifier(struct notifier_block *nb);
++#endif
diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h
new file mode 100644
index 0000000..b42cdfd
@@ -26105,179 +28627,22 @@
struct device_driver driver;
int (*read_otherend_details)(struct xenbus_device *dev);
int (*is_ready)(struct xenbus_device *dev);
-diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
-index 3e1c36e..931a4d9 100644
---- a/kernel/hrtimer.c
-+++ b/kernel/hrtimer.c
-@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
- static int hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
- {
-- ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
-+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- int res;
-
-@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
- if (expires.tv64 < 0)
- return -ETIME;
-
-- if (expires.tv64 >= expires_next->tv64)
-+ if (expires.tv64 >= cpu_base->expires_next.tv64)
-+ return 0;
-+
-+ /*
-+ * If a hang was detected in the last timer interrupt then we
-+ * do not schedule a timer which is earlier than the expiry
-+ * which we enforced in the hang detection. We want the system
-+ * to make progress.
-+ */
-+ if (cpu_base->hang_detected)
- return 0;
-
- /*
-@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
- */
- res = tick_program_event(expires, 0);
- if (!IS_ERR_VALUE(res))
-- *expires_next = expires;
-+ cpu_base->expires_next = expires;
- return res;
- }
-
-@@ -1217,29 +1226,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
-
- #ifdef CONFIG_HIGH_RES_TIMERS
-
--static int force_clock_reprogram;
--
--/*
-- * After 5 iteration's attempts, we consider that hrtimer_interrupt()
-- * is hanging, which could happen with something that slows the interrupt
-- * such as the tracing. Then we force the clock reprogramming for each future
-- * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
-- * threshold that we will overwrite.
-- * The next tick event will be scheduled to 3 times we currently spend on
-- * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
-- * 1/4 of their time to process the hrtimer interrupts. This is enough to
-- * let it running without serious starvation.
-- */
--
--static inline void
--hrtimer_interrupt_hanging(struct clock_event_device *dev,
-- ktime_t try_time)
--{
-- force_clock_reprogram = 1;
-- dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
-- printk(KERN_WARNING "hrtimer: interrupt too slow, "
-- "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
--}
- /*
- * High resolution timer interrupt
- * Called with interrupts disabled
-@@ -1248,21 +1234,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
- {
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
- struct hrtimer_clock_base *base;
-- ktime_t expires_next, now;
-- int nr_retries = 0;
-- int i;
-+ ktime_t expires_next, now, entry_time, delta;
-+ int i, retries = 0;
-
- BUG_ON(!cpu_base->hres_active);
- cpu_base->nr_events++;
- dev->next_event.tv64 = KTIME_MAX;
-
-- retry:
-- /* 5 retries is enough to notice a hang */
-- if (!(++nr_retries % 5))
-- hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
--
-- now = ktime_get();
--
-+ entry_time = now = ktime_get();
-+retry:
- expires_next.tv64 = KTIME_MAX;
-
- spin_lock(&cpu_base->lock);
-@@ -1324,10 +1304,48 @@ void hrtimer_interrupt(struct clock_event_device *dev)
- spin_unlock(&cpu_base->lock);
-
- /* Reprogramming necessary ? */
-- if (expires_next.tv64 != KTIME_MAX) {
-- if (tick_program_event(expires_next, force_clock_reprogram))
-- goto retry;
-+ if (expires_next.tv64 == KTIME_MAX ||
-+ !tick_program_event(expires_next, 0)) {
-+ cpu_base->hang_detected = 0;
-+ return;
+diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
+index bde4c66..3687da2 100644
+--- a/kernel/irq/manage.c
++++ b/kernel/irq/manage.c
+@@ -200,7 +200,8 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
+ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
+ {
+ if (suspend) {
+- if (!desc->action || (desc->action->flags & IRQF_TIMER))
++ if (!desc->action ||
++ (desc->action->flags & (IRQF_TIMER | IRQF_NO_SUSPEND)))
+ return;
+ desc->status |= IRQ_SUSPENDED;
}
-+
-+ /*
-+ * The next timer was already expired due to:
-+ * - tracing
-+ * - long lasting callbacks
-+ * - being scheduled away when running in a VM
-+ *
-+ * We need to prevent that we loop forever in the hrtimer
-+ * interrupt routine. We give it 3 attempts to avoid
-+ * overreacting on some spurious event.
-+ */
-+ now = ktime_get();
-+ cpu_base->nr_retries++;
-+ if (++retries < 3)
-+ goto retry;
-+ /*
-+ * Give the system a chance to do something else than looping
-+ * here. We stored the entry time, so we know exactly how long
-+ * we spent here. We schedule the next event this amount of
-+ * time away.
-+ */
-+ cpu_base->nr_hangs++;
-+ cpu_base->hang_detected = 1;
-+ delta = ktime_sub(now, entry_time);
-+ if (delta.tv64 > cpu_base->max_hang_time.tv64)
-+ cpu_base->max_hang_time = delta;
-+ /*
-+ * Limit it to a sensible value as we enforce a longer
-+ * delay. Give the CPU at least 100ms to catch up.
-+ */
-+ if (delta.tv64 > 100 * NSEC_PER_MSEC)
-+ expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
-+ else
-+ expires_next = ktime_add(now, delta);
-+ tick_program_event(expires_next, 1);
-+ printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
-+ ktime_to_ns(delta));
- }
-
- /*
-diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
-index 1b5b7aa..54c0dda 100644
---- a/kernel/time/timer_list.c
-+++ b/kernel/time/timer_list.c
-@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
- P_ns(expires_next);
- P(hres_active);
- P(nr_events);
-+ P(nr_retries);
-+ P(nr_hangs);
-+ P_ns(max_hang_time);
- #endif
- #undef P
- #undef P_ns
-@@ -252,7 +255,7 @@ static int timer_list_show(struct seq_file *m, void *v)
- u64 now = ktime_to_ns(ktime_get());
- int cpu;
-
-- SEQ_printf(m, "Timer List Version: v0.4\n");
-+ SEQ_printf(m, "Timer List Version: v0.5\n");
- SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
- SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
-
diff --git a/lib/Makefile b/lib/Makefile
-index 2e78277..7c31e3d 100644
+index 452f188..001e918 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -77,7 +77,8 @@ obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
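
[Editor's note: the hunk above drops the hrtimer hang-detection rework and the matching timer_list v0.5 hunks from pvops.patch, presumably because the equivalent fix is now carried in the base 2.6.32 tree, and adds a kernel/irq/manage.c change so that interrupts flagged IRQF_NO_SUSPEND are, like IRQF_TIMER ones, left enabled when the system suspends. Below is a hedged sketch of a consumer of the new flag, not part of the patch: the device name, handler and binding function are hypothetical.]

/*
 * Hedged sketch: with IRQF_NO_SUSPEND set, the __disable_irq()
 * change above leaves this line enabled across suspend, exactly as
 * it already did for IRQF_TIMER actions.
 */
#include <linux/interrupt.h>

static irqreturn_t my_evtchn_handler(int irq, void *dev_id)
{
	/* Acknowledge and handle the event here. */
	return IRQ_HANDLED;
}

static int my_bind_irq(unsigned int irq, void *dev_id)
{
	return request_irq(irq, my_evtchn_handler, IRQF_NO_SUSPEND,
			   "my-evtchn", dev_id);
}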
@@ -28240,3 +30605,36 @@
if (PageAnon(page))
page->mapping = NULL;
if (free_pages_check(page))
+diff --git a/mm/vmalloc.c b/mm/vmalloc.c
+index c228731..cb459fb 100644
+--- a/mm/vmalloc.c
++++ b/mm/vmalloc.c
+@@ -31,6 +31,7 @@
+ #include <asm/tlbflush.h>
+ #include <asm/shmparam.h>
+
++bool vmap_lazy_unmap __read_mostly = true;
+
+ /*** Page table manipulation functions ***/
+
+@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void)
+ {
+ unsigned int log;
+
++ if (!vmap_lazy_unmap)
++ return 0;
++
+ log = fls(num_online_cpus());
+
+ return log * (32UL * 1024 * 1024 / PAGE_SIZE);
+@@ -561,8 +565,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
+ }
+ rcu_read_unlock();
+
+- if (nr)
++ if (nr) {
+ atomic_sub(nr, &vmap_lazy_nr);
++ }
+
+ if (nr || force_flush)
+ flush_tlb_kernel_range(*start, *end);
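
[Editor's note: the mm/vmalloc.c hunk above introduces a global vmap_lazy_unmap switch. When it is cleared, lazy_max_pages() returns 0, so lazily freed vmap areas no longer accumulate and each unmap is purged and TLB-flushed promptly; that matters to a paravirtualized guest, which must not keep stale mappings of pages it has handed back to the hypervisor. The sketch below is an assumption, not part of the patch: the exact call site where the Xen setup code would clear the switch is outside this excerpt.]

/*
 * Hedged sketch, assuming the guest setup code clears the switch
 * early during boot; the function name and call site are
 * hypothetical.
 */
#include <linux/init.h>

extern bool vmap_lazy_unmap;	/* defined in mm/vmalloc.c above */

static void __init disable_lazy_vunmap(void)
{
	/* lazy_max_pages() now returns 0, so vunmap()ed areas are
	 * flushed immediately instead of being batched. */
	vmap_lazy_unmap = false;
}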