[kernel] r15396 - dists/sid/linux-2.6/debian/patches/features/all/xen
Bastian Blank
waldi@alioth.debian.org
Tue Mar 16 07:55:51 UTC 2010
Author: waldi
Date: Tue Mar 16 07:55:46 2010
New Revision: 15396
Log:
debian/patches/features/all/xen/pvops.patch: Update to the latest version.
Modified:
dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch
Modified: dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch
==============================================================================
--- dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch Tue Mar 16 05:40:15 2010 (r15395)
+++ dists/sid/linux-2.6/debian/patches/features/all/xen/pvops.patch Tue Mar 16 07:55:46 2010 (r15396)
@@ -1,3 +1,6 @@
+Patch based on commit 84b76672405787415df3df568206845292c030c0 of
+git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen.git.
+
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 29a6ff8..81f9b94 100644
--- a/Documentation/x86/x86_64/boot-options.txt
@@ -288,10 +291,10 @@
#endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
-index 7c7c16c..2fc09d3 100644
+index 5f61f6e..b852da9 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
-@@ -171,6 +171,7 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
+@@ -172,6 +172,7 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern void probe_nr_irqs_gsi(void);
@@ -299,7 +302,7 @@
extern int setup_ioapic_entry(int apic, int irq,
struct IO_APIC_route_entry *entry,
-@@ -200,4 +201,6 @@ static inline void probe_nr_irqs_gsi(void) { }
+@@ -201,4 +202,6 @@ static inline void probe_nr_irqs_gsi(void) { }
#endif
@@ -388,7 +391,7 @@
void (*wbinvd)(void);
void (*io_delay)(void);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
-index ada8c20..2a34c12 100644
+index ada8c20..faa0af1 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -21,6 +21,7 @@ struct pci_sysdata {
@@ -399,6 +402,34 @@
/* scan a bus after allocating a pci_sysdata for it */
extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
+@@ -49,6 +50,11 @@ extern unsigned int pcibios_assign_all_busses(void);
+ #define pcibios_assign_all_busses() 0
+ #endif
+
++static inline int pcibios_scan_all_fns(struct pci_bus *bus, int devfn)
++{
++ return pci_scan_all_fns;
++}
++
+ extern unsigned long pci_mem_start;
+ #define PCIBIOS_MIN_IO 0x1000
+ #define PCIBIOS_MIN_MEM (pci_mem_start)
+@@ -87,6 +93,7 @@ extern void pci_iommu_alloc(void);
+
+ /* MSI arch hook */
+ #define arch_setup_msi_irqs arch_setup_msi_irqs
++#define arch_teardown_msi_irqs arch_teardown_msi_irqs
+
+ #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
+
+@@ -128,6 +135,7 @@ extern void pci_iommu_alloc(void);
+ #include <asm-generic/pci-dma-compat.h>
+
+ /* generic pci stuff */
++#define HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS
+ #include <asm-generic/pci.h>
+ #define PCIBIOS_MAX_MEM_32 0xffffffff
+
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b399988..30cbf49 100644
--- a/arch/x86/include/asm/pci_x86.h
@@ -849,35 +880,100 @@
unsigned long arbitrary_virt_to_mfn(void *vaddr);
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
new file mode 100644
-index 0000000..cb84abe
+index 0000000..d68637f
--- /dev/null
+++ b/arch/x86/include/asm/xen/pci.h
-@@ -0,0 +1,37 @@
+@@ -0,0 +1,102 @@
+#ifndef _ASM_X86_XEN_PCI_H
+#define _ASM_X86_XEN_PCI_H
+
-+#ifdef CONFIG_XEN_DOM0_PCI
-+int xen_register_gsi(u32 gsi, int triggering, int polarity);
++#if defined(CONFIG_PCI_MSI)
++#if defined(CONFIG_PCI_XEN)
+int xen_create_msi_irq(struct pci_dev *dev,
+ struct msi_desc *msidesc,
+ int type);
-+int xen_destroy_irq(int irq);
-+#else
-+static inline int xen_register_gsi(u32 gsi, int triggering, int polarity)
++void xen_pci_teardown_msi_dev(struct pci_dev *dev);
++void xen_pci_teardown_msi_irq(int irq);
++int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
++
++/* The drivers/pci/xen-pcifront.c sets this structure to
++ * its own functions.
++ */
++struct xen_pci_frontend_ops {
++ int (*enable_msi)(struct pci_dev *dev, int **vectors);
++ void (*disable_msi)(struct pci_dev *dev);
++ int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
++ void (*disable_msix)(struct pci_dev *dev);
++};
++
++extern struct xen_pci_frontend_ops *xen_pci_frontend;
++
++static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
++ int **vectors)
+{
-+ return -1;
++ if (xen_pci_frontend && xen_pci_frontend->enable_msi)
++ return xen_pci_frontend->enable_msi(dev, vectors);
++ return -ENODEV;
+}
-+
++static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
++{
++ if (xen_pci_frontend && xen_pci_frontend->disable_msi)
++ xen_pci_frontend->disable_msi(dev);
++}
++static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
++ int **vectors, int nvec)
++{
++ if (xen_pci_frontend && xen_pci_frontend->enable_msix)
++ return xen_pci_frontend->enable_msix(dev, vectors, nvec);
++ return -ENODEV;
++}
++static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
++{
++ if (xen_pci_frontend && xen_pci_frontend->disable_msix)
++ xen_pci_frontend->disable_msix(dev);
++}
++#else
+static inline int xen_create_msi_irq(struct pci_dev *dev,
+ struct msi_desc *msidesc,
+ int type)
+{
+ return -1;
+}
-+static inline int xen_destroy_irq(int irq)
++static inline void xen_pci_teardown_msi_dev(struct pci_dev *dev) { }
++static inline void xen_pci_teardown_msi_irq(int irq) { }
++static inline int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++ return -ENODEV;
++}
++#endif /* CONFIG_PCI_XEN */
++
++#endif /* CONFIG_PCI_MSI */
++
++#ifdef CONFIG_XEN_DOM0_PCI
++int xen_register_gsi(u32 gsi, int triggering, int polarity);
++int xen_find_device_domain_owner(struct pci_dev *dev);
++int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
++int xen_unregister_device_domain_owner(struct pci_dev *dev);
++
++#else
++static inline int xen_register_gsi(u32 gsi, int triggering, int polarity)
++{
++ return -1;
++}
++
++static inline int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
+}
++static inline int xen_register_device_domain_owner(struct pci_dev *dev,
++ uint16_t domain)
++{
++ return -1;
++}
++static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
++{
++ return -1;
++}
+#endif
+
+#if defined(CONFIG_PCI_MSI) && defined(CONFIG_XEN_DOM0_PCI)
@@ -923,7 +1019,7 @@
obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
-index 67e929b..21fc029 100644
+index 195e4b7..6458fe8 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -42,6 +42,10 @@
@@ -948,7 +1044,7 @@
if (!enabled) {
++disabled_cpus;
return;
-@@ -455,9 +463,13 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+@@ -461,9 +469,13 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
*/
int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
{
@@ -963,7 +1059,7 @@
#ifdef CONFIG_PCI
/*
* Make sure all (legacy) PCI IRQs are set as level-triggered.
-@@ -733,6 +745,10 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
+@@ -740,6 +752,10 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
static void __init acpi_register_lapic_address(unsigned long address)
{
@@ -974,7 +1070,7 @@
mp_lapic_addr = address;
set_fixmap_nocache(FIX_APIC_BASE, address);
-@@ -853,6 +869,9 @@ int __init acpi_probe_gsi(void)
+@@ -860,6 +876,9 @@ int __init acpi_probe_gsi(void)
max_gsi = gsi;
}
@@ -1187,22 +1283,23 @@
} else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
force_iommu ||
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
-index c107e83..db1af79 100644
+index dc4f486..d766a61 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
-@@ -63,8 +63,11 @@
+@@ -63,7 +63,12 @@
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_irq.h>
+#include <asm/xen/hypervisor.h>
#include <asm/apic.h>
-
++#include <asm/xen/hypervisor.h>
+#include <asm/xen/pci.h>
+
++#include <asm/xen/pci.h>
+
#define __apicdebuginit(type) static type __init
#define for_each_irq_pin(entry, head) \
- for (entry = head; entry; entry = entry->next)
-@@ -390,14 +393,18 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+@@ -390,14 +395,18 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
{
@@ -1223,7 +1320,7 @@
writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
-@@ -410,7 +417,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
+@@ -410,7 +419,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
*/
static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
@@ -1234,29 +1331,48 @@
if (sis_apic_bug)
writel(reg, &io_apic->index);
-@@ -3447,6 +3456,9 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+@@ -3489,6 +3500,9 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
+ if (xen_domain())
-+ return xen_setup_msi_irqs(dev, nvec, type);
++ return xen_pci_setup_msi_irqs(dev, nvec, type);
+
node = dev_to_node(&dev->dev);
irq_want = nr_irqs_gsi;
sub_handle = 0;
-@@ -3496,7 +3508,10 @@ error:
+@@ -3538,7 +3552,29 @@ error:
void arch_teardown_msi_irq(unsigned int irq)
{
- destroy_irq(irq);
+ if (xen_domain())
-+ xen_destroy_irq(irq);
++ xen_pci_teardown_msi_irq(irq);
+ else
+ destroy_irq(irq);
++}
++
++void arch_teardown_msi_irqs(struct pci_dev *dev)
++{
++ struct msi_desc *entry;
++
++ /* If we are non-privileged PV domain, we have to
++ * to call xen_teardown_msi_dev first. */
++ if (xen_domain())
++ xen_pci_teardown_msi_dev(dev);
++
++ list_for_each_entry(entry, &dev->msi_list, list) {
++ int i, nvec;
++ if (entry->irq == 0)
++ continue;
++ nvec = 1 << entry->msi_attrib.multiple;
++ for (i = 0; i < nvec; i++)
++ arch_teardown_msi_irq(entry->irq + i);
++ }
}
#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
-@@ -3812,6 +3827,11 @@ void __init probe_nr_irqs_gsi(void)
+@@ -3854,6 +3890,11 @@ void __init probe_nr_irqs_gsi(void)
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
@@ -2465,7 +2581,7 @@
int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
-index bff34d6..704bddc 100644
+index 269c2a3..8e1aac8 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -23,7 +23,7 @@
@@ -2477,7 +2593,7 @@
#endif
/*
-@@ -639,7 +639,7 @@ void native_machine_shutdown(void)
+@@ -647,7 +647,7 @@ void native_machine_shutdown(void)
#endif
#ifdef CONFIG_X86_64
@@ -2589,7 +2705,7 @@
#ifdef CONFIG_STRICT_DEVMEM
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
-index ed34f5e..103e324 100644
+index c9ba9de..103e324 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,9 @@
@@ -2602,7 +2718,7 @@
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
#ifdef CONFIG_HIGHPTE
-@@ -17,6 +20,16 @@
+@@ -14,6 +17,16 @@
gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
@@ -2619,7 +2735,7 @@
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
return (pte_t *)__get_free_page(PGALLOC_GFP);
-@@ -267,6 +301,12 @@ out:
+@@ -288,6 +301,12 @@ out:
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
@@ -2807,10 +2923,10 @@
#endif
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
new file mode 100644
-index 0000000..1b922aa
+index 0000000..2bac970
--- /dev/null
+++ b/arch/x86/pci/xen.c
-@@ -0,0 +1,51 @@
+@@ -0,0 +1,154 @@
+/*
+ * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
+ * x86 PCI core to support the Xen PCI Frontend
@@ -2827,8 +2943,111 @@
+
+#include <asm/xen/hypervisor.h>
+
++#include <xen/events.h>
++#include <asm/xen/pci.h>
++
++#if defined(CONFIG_PCI_MSI)
++#include <linux/msi.h>
++
++struct xen_pci_frontend_ops *xen_pci_frontend;
++EXPORT_SYMBOL_GPL(xen_pci_frontend);
++
++/*
++ * For MSI interrupts we have to use drivers/xen/event.s functions to
++ * allocate an irq_desc and setup the right */
++
++
++int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++ int irq, ret, i;
++ struct msi_desc *msidesc;
++ int *v;
++
++
++ /* Dom0 has another mechanism for this. The exit path
++ * (xen_pci_teardown_msi_irq) is shared with Dom0.
++ */
++ if (xen_initial_domain())
++ return xen_setup_msi_irqs(dev, nvec, type);
++
++ v = kzalloc(sizeof(int) * min(1, nvec), GFP_KERNEL);
++ if (!v)
++ return -ENOMEM;
++
++ if (!xen_initial_domain()) {
++ if (type == PCI_CAP_ID_MSIX)
++ ret = xen_pci_frontend_enable_msix(dev, &v, nvec);
++ else
++ ret = xen_pci_frontend_enable_msi(dev, &v);
++ if (ret)
++ goto error;
++ }
++ i = 0;
++ list_for_each_entry(msidesc, &dev->msi_list, list) {
++ irq = xen_allocate_pirq(v[i], 0, /* not sharable */
++ (type == PCI_CAP_ID_MSIX) ?
++ "pcifront-msi-x":"pcifront-msi");
++ if (irq < 0)
++ return -1;
++
++ ret = set_irq_msi(irq, msidesc);
++ if (ret)
++ goto error_while;
++ }
++ kfree(v);
++ return 0;
++
++error_while:
++ unbind_from_irqhandler(irq, NULL);
++error:
++ if (ret == -ENODEV)
++ dev_err(&dev->dev,"Xen PCI frontend has not registered" \
++ " MSI/MSI-X support!\n");
++
++ kfree(v);
++ return ret;
++}
++
++void xen_pci_teardown_msi_dev(struct pci_dev *dev)
++{
++ /* Only do this when were are in non-privileged mode.*/
++ if (!xen_initial_domain()) {
++ struct msi_desc *msidesc;
++
++ msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
++ if (msidesc->msi_attrib.is_msix)
++ xen_pci_frontend_disable_msix(dev);
++ else
++ xen_pci_frontend_disable_msi(dev);
++ }
++
++}
++
++void xen_pci_teardown_msi_irq(int irq)
++{
++ xen_destroy_irq(irq);
++}
++#endif
++
+static int xen_pcifront_enable_irq(struct pci_dev *dev)
+{
++ int rc;
++ int share = 1;
++
++ dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq);
++
++ if (dev->irq < 0)
++ return -EINVAL;
++
++ if (dev->irq < NR_IRQS_LEGACY)
++ share = 0;
++
++ rc = xen_allocate_pirq(dev->irq, share, "pcifront");
++ if (rc < 0) {
++ dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n",
++ dev->irq, rc);
++ return rc;
++ }
+ return 0;
+}
+
@@ -2863,10 +3082,10 @@
+}
+
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
-index b83e119..3da23c7 100644
+index b83e119..7675f9b 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
-@@ -36,3 +36,39 @@ config XEN_DEBUG_FS
+@@ -36,3 +36,40 @@ config XEN_DEBUG_FS
help
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.
@@ -2900,13 +3119,13 @@
+ select PCI_XEN
+
+config XEN_PCI_PASSTHROUGH
-+ bool #"Enable support for Xen PCI passthrough devices"
++ bool "Enable support for Xen PCI passthrough devices"
+ depends on XEN && PCI
+ select PCI_XEN
++ select SWIOTLB_XEN
+ help
+ Enable support for passing PCI devices through to
+ unprivileged domains. (COMPLETELY UNTESTED)
-\ No newline at end of file
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3bb4fc2..08ac224 100644
--- a/arch/x86/xen/Makefile
@@ -2961,7 +3180,7 @@
+#endif
+}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
-index 79f9738..765f714 100644
+index 3578688..bc03e10 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -28,6 +28,7 @@
@@ -2972,7 +3191,7 @@
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
-@@ -65,6 +67,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+@@ -66,6 +67,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);
@@ -2984,7 +3203,7 @@
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);
-@@ -166,13 +173,16 @@ static void __init xen_banner(void)
+@@ -167,13 +173,16 @@ static void __init xen_banner(void)
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
pv_info.name);
@@ -3003,7 +3222,7 @@
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
-@@ -186,7 +196,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
+@@ -187,7 +196,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
* unsupported kernel subsystems as possible.
*/
switch (*ax) {
@@ -3012,7 +3231,7 @@
maskecx = cpuid_leaf1_ecx_mask;
maskedx = cpuid_leaf1_edx_mask;
break;
-@@ -195,6 +205,10 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
+@@ -196,6 +205,10 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
/* Suppress extended topology stuff */
maskebx = 0;
break;
@@ -3023,7 +3242,7 @@
}
asm(XEN_EMULATE_PREFIX "cpuid"
-@@ -216,8 +230,11 @@ static __init void xen_init_cpuid_mask(void)
+@@ -217,8 +230,11 @@ static __init void xen_init_cpuid_mask(void)
cpuid_leaf1_edx_mask =
~((1 << X86_FEATURE_MCE) | /* disable MCE */
(1 << X86_FEATURE_MCA) | /* disable MCA */
@@ -3035,7 +3254,7 @@
if (!xen_initial_domain())
cpuid_leaf1_edx_mask &=
~((1 << X86_FEATURE_APIC) | /* disable local APIC */
-@@ -405,7 +422,7 @@ static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+@@ -406,7 +422,7 @@ static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
pte = pfn_pte(pfn, PAGE_KERNEL_RO);
@@ -3044,7 +3263,7 @@
BUG();
frames[f] = mfn;
-@@ -518,11 +535,10 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
+@@ -519,11 +535,10 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
} else if (addr == (unsigned long)machine_check) {
return 0;
#endif
@@ -3060,7 +3279,7 @@
#endif /* CONFIG_X86_64 */
info->address = addr;
-@@ -678,6 +694,18 @@ static void xen_set_iopl_mask(unsigned mask)
+@@ -679,6 +694,18 @@ static void xen_set_iopl_mask(unsigned mask)
HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}
@@ -3079,7 +3298,7 @@
static void xen_io_delay(void)
{
}
-@@ -715,7 +743,7 @@ static u32 xen_safe_apic_wait_icr_idle(void)
+@@ -716,7 +743,7 @@ static u32 xen_safe_apic_wait_icr_idle(void)
return 0;
}
@@ -3088,7 +3307,7 @@
{
apic->read = xen_apic_read;
apic->write = xen_apic_write;
-@@ -977,6 +1005,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+@@ -978,6 +1005,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.load_sp0 = xen_load_sp0,
.set_iopl_mask = xen_set_iopl_mask,
@@ -3096,7 +3315,7 @@
.io_delay = xen_io_delay,
/* Xen takes care of %gs when switching to usermode for us */
-@@ -1019,6 +1048,14 @@ static void xen_machine_halt(void)
+@@ -1020,6 +1048,14 @@ static void xen_machine_halt(void)
xen_reboot(SHUTDOWN_poweroff);
}
@@ -3111,7 +3330,7 @@
static void xen_crash_shutdown(struct pt_regs *regs)
{
xen_reboot(SHUTDOWN_crash);
-@@ -1027,7 +1064,7 @@ static void xen_crash_shutdown(struct pt_regs *regs)
+@@ -1028,7 +1064,7 @@ static void xen_crash_shutdown(struct pt_regs *regs)
static const struct machine_ops __initdata xen_machine_ops = {
.restart = xen_restart,
.halt = xen_machine_halt,
@@ -3120,7 +3339,7 @@
.shutdown = xen_machine_halt,
.crash_shutdown = xen_crash_shutdown,
.emergency_restart = xen_emergency_restart,
-@@ -1060,6 +1097,8 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1061,6 +1097,8 @@ asmlinkage void __init xen_start_kernel(void)
xen_domain_type = XEN_PV_DOMAIN;
@@ -3129,7 +3348,20 @@
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
-@@ -1137,6 +1182,8 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1086,6 +1124,12 @@ asmlinkage void __init xen_start_kernel(void)
+
+ xen_init_mmu_ops();
+
++ /*
++ * Prevent page tables from being allocated in highmem, even
++ * if CONFIG_HIGHPTE is enabled.
++ */
++ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
++
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ if (!xen_initial_domain())
+@@ -1144,6 +1188,8 @@ asmlinkage void __init xen_start_kernel(void)
pgd = (pgd_t *)xen_start_info->pt_base;
@@ -3138,7 +3370,7 @@
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-@@ -1146,6 +1193,7 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1153,6 +1199,7 @@ asmlinkage void __init xen_start_kernel(void)
xen_raw_console_write("mapping kernel into physical memory\n");
pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
@@ -3146,7 +3378,7 @@
init_mm.pgd = pgd;
-@@ -1155,6 +1203,14 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1162,6 +1209,14 @@ asmlinkage void __init xen_start_kernel(void)
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
@@ -3161,7 +3393,7 @@
/* set the limit of our address space */
xen_reserve_top();
-@@ -1177,6 +1233,16 @@ asmlinkage void __init xen_start_kernel(void)
+@@ -1184,6 +1239,16 @@ asmlinkage void __init xen_start_kernel(void)
add_preferred_console("xenboot", 0, NULL);
add_preferred_console("tty", 0, NULL);
add_preferred_console("hvc", 0, NULL);
@@ -3179,7 +3411,7 @@
xen_raw_console_write("about to get started...\n");
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
-index bf4cd6b..3e6b558 100644
+index 350a3de..3e6b558 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -50,7 +50,9 @@
@@ -3414,7 +3646,7 @@
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
-@@ -1447,10 +1568,17 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+@@ -1448,10 +1568,17 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
@@ -3434,7 +3666,7 @@
return pte;
}
-@@ -1619,6 +1747,7 @@ static void *m2v(phys_addr_t maddr)
+@@ -1620,6 +1747,7 @@ static void *m2v(phys_addr_t maddr)
return __ka(m2p(maddr));
}
@@ -3442,7 +3674,7 @@
static void set_page_prot(void *addr, pgprot_t prot)
{
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
-@@ -1674,6 +1803,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+@@ -1675,6 +1803,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
set_page_prot(pmd, PAGE_KERNEL_RO);
}
@@ -3463,7 +3695,7 @@
#ifdef CONFIG_X86_64
static void convert_pfn_mfn(void *v)
{
-@@ -1765,6 +1908,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+@@ -1766,6 +1908,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
@@ -3471,7 +3703,7 @@
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
xen_start_info->nr_pt_frames * PAGE_SIZE +
-@@ -1776,6 +1920,20 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+@@ -1777,6 +1920,20 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
xen_map_identity_early(level2_kernel_pgt, max_pfn);
memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
@@ -3492,7 +3724,7 @@
set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
-@@ -1798,6 +1956,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+@@ -1799,6 +1956,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
}
#endif /* CONFIG_X86_64 */
@@ -3501,7 +3733,7 @@
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
pte_t pte;
-@@ -1827,9 +1987,26 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+@@ -1828,9 +1987,26 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
pte = pfn_pte(phys, prot);
break;
@@ -3529,7 +3761,7 @@
}
__native_set_fixmap(idx, pte);
-@@ -1844,6 +2021,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+@@ -1845,6 +2021,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
@@ -3559,7 +3791,7 @@
static __init void xen_post_allocator_init(void)
{
pv_mmu_ops.set_pte = xen_set_pte;
-@@ -1961,6 +2161,271 @@ void __init xen_init_mmu_ops(void)
+@@ -1962,6 +2161,271 @@ void __init xen_init_mmu_ops(void)
pv_mmu_ops = xen_mmu_ops;
}
@@ -3891,14 +4123,15 @@
+}
diff --git a/arch/x86/xen/pci.c b/arch/x86/xen/pci.c
new file mode 100644
-index 0000000..f999ad8
+index 0000000..61e1ade
--- /dev/null
+++ b/arch/x86/xen/pci.c
-@@ -0,0 +1,117 @@
+@@ -0,0 +1,284 @@
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
++#include <linux/slab.h>
+
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
@@ -3967,6 +4200,97 @@
+ return irq;
+}
+
++#ifdef CONFIG_ACPI
++#define BAD_MADT_ENTRY(entry, end) ( \
++ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
++ ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
++
++
++static int __init
++xen_acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
++ const unsigned long end)
++{
++ struct acpi_madt_interrupt_override *intsrc = NULL;
++
++ intsrc = (struct acpi_madt_interrupt_override *)header;
++
++ if (BAD_MADT_ENTRY(intsrc, end))
++ return -EINVAL;
++
++ acpi_table_print_madt_entry(header);
++
++ if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
++ int gsi;
++ int trigger, polarity;
++
++ trigger = intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK;
++ polarity = intsrc->inti_flags & ACPI_MADT_POLARITY_MASK;
++
++ /* Command-line over-ride via acpi_sci= */
++ if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)
++ trigger = acpi_sci_flags & ACPI_MADT_TRIGGER_MASK;
++
++ if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
++ polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
++
++ printk("xen: sci override: source_irq=%d global_irq=%d trigger=%x polarity=%x\n",
++ intsrc->source_irq, intsrc->global_irq,
++ trigger, polarity);
++
++ switch (polarity) {
++ case ACPI_MADT_POLARITY_CONFORMS:
++ case ACPI_MADT_POLARITY_ACTIVE_LOW:
++ polarity = ACPI_ACTIVE_LOW;
++ break;
++
++ case ACPI_MADT_POLARITY_ACTIVE_HIGH:
++ polarity = ACPI_ACTIVE_HIGH;
++ break;
++
++ default:
++ return 0;
++ }
++
++ switch (trigger) {
++ case ACPI_MADT_TRIGGER_CONFORMS:
++ case ACPI_MADT_TRIGGER_LEVEL:
++ trigger = ACPI_LEVEL_SENSITIVE;
++ break;
++
++ case ACPI_MADT_TRIGGER_EDGE:
++ trigger = ACPI_EDGE_SENSITIVE;
++ break;
++
++ default:
++ return 0;
++ }
++
++ gsi = xen_register_gsi(intsrc->global_irq,
++ trigger, polarity);
++ /*
++ * stash over-ride to indicate we've been here
++ * and for later update of acpi_gbl_FADT
++ */
++ acpi_sci_override_gsi = gsi;
++
++ printk("xen: acpi sci %d\n", gsi);
++ }
++
++ return 0;
++}
++
++static __init void xen_setup_acpi_sci(void)
++{
++ acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
++ xen_acpi_parse_int_src_ovr,
++ nr_irqs);
++}
++#else
++static __init void xen_setup_acpi_sci(void)
++{
++}
++#endif
++
+void __init xen_setup_pirqs(void)
+{
+ int irq;
@@ -3988,6 +4312,8 @@
+ trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE,
+ polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH);
+ }
++
++ xen_setup_acpi_sci();
+}
+
+#ifdef CONFIG_PCI_MSI
@@ -4012,6 +4338,79 @@
+ return ret;
+}
+#endif
++
++struct xen_device_domain_owner {
++ domid_t domain;
++ struct pci_dev *dev;
++ struct list_head list;
++};
++
++static DEFINE_SPINLOCK(dev_domain_list_spinlock);
++static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
++
++static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
++{
++ struct xen_device_domain_owner *owner;
++
++ list_for_each_entry(owner, &dev_domain_list, list) {
++ if (owner->dev == dev)
++ return owner;
++ }
++ return NULL;
++}
++
++int xen_find_device_domain_owner(struct pci_dev *dev)
++{
++ struct xen_device_domain_owner *owner;
++ int domain = -ENODEV;
++
++ spin_lock(&dev_domain_list_spinlock);
++ owner = find_device(dev);
++ if (owner)
++ domain = owner->domain;
++ spin_unlock(&dev_domain_list_spinlock);
++ return domain;
++}
++EXPORT_SYMBOL(xen_find_device_domain_owner);
++
++int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
++{
++ struct xen_device_domain_owner *owner;
++
++ owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
++ if (!owner)
++ return -ENODEV;
++
++ spin_lock(&dev_domain_list_spinlock);
++ if (find_device(dev)) {
++ spin_unlock(&dev_domain_list_spinlock);
++ kfree(owner);
++ return -EEXIST;
++ }
++ owner->domain = domain;
++ owner->dev = dev;
++ list_add_tail(&owner->list, &dev_domain_list);
++ spin_unlock(&dev_domain_list_spinlock);
++ return 0;
++}
++EXPORT_SYMBOL(xen_register_device_domain_owner);
++
++int xen_unregister_device_domain_owner(struct pci_dev *dev)
++{
++ struct xen_device_domain_owner *owner;
++
++ spin_lock(&dev_domain_list_spinlock);
++ owner = find_device(dev);
++ if (!owner) {
++ spin_unlock(&dev_domain_list_spinlock);
++ return -ENODEV;
++ }
++ list_del(&owner->list);
++ spin_unlock(&dev_domain_list_spinlock);
++ kfree(owner);
++ return 0;
++}
++EXPORT_SYMBOL(xen_unregister_device_domain_owner);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ad0047f..266c86a 100644
--- a/arch/x86/xen/setup.c
@@ -4290,7 +4689,7 @@
This driver implements the front-end of the Xen virtual
block device driver. It communicates with a back-end driver
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
-index b8578bb..45ff762 100644
+index b8578bb..a6d8046 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -42,6 +42,7 @@
@@ -4444,7 +4843,7 @@
if (info->rq == NULL)
goto out;
-@@ -945,22 +1005,26 @@ static void blkfront_closing(struct xenbus_device *dev)
+@@ -945,27 +1005,33 @@ static void blkfront_closing(struct xenbus_device *dev)
blk_cleanup_queue(info->rq);
info->rq = NULL;
@@ -4474,7 +4873,14 @@
switch (backend_state) {
case XenbusStateInitialising:
-@@ -988,7 +1052,7 @@ static void backend_changed(struct xenbus_device *dev,
+ case XenbusStateInitWait:
+ case XenbusStateInitialised:
++ case XenbusStateReconfiguring:
++ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
+@@ -988,7 +1054,7 @@ static void backend_changed(struct xenbus_device *dev,
xenbus_dev_error(dev, -EBUSY,
"Device in use; refusing to close");
else
@@ -4483,7 +4889,7 @@
mutex_unlock(&bd->bd_mutex);
bdput(bd);
break;
-@@ -1003,7 +1067,10 @@ static int blkfront_remove(struct xenbus_device *dev)
+@@ -1003,7 +1069,10 @@ static int blkfront_remove(struct xenbus_device *dev)
blkif_free(info, 0);
@@ -4495,7 +4901,7 @@
return 0;
}
-@@ -1012,12 +1079,15 @@ static int blkfront_is_ready(struct xenbus_device *dev)
+@@ -1012,12 +1081,15 @@ static int blkfront_is_ready(struct xenbus_device *dev)
{
struct blkfront_info *info = dev_get_drvdata(&dev->dev);
@@ -4512,7 +4918,7 @@
info->users++;
return 0;
}
-@@ -1031,10 +1101,13 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
+@@ -1031,10 +1103,13 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
have ignored this request initially, as the device was
still mounted. */
struct xenbus_device *dev = info->xbdev;
@@ -4529,7 +4935,7 @@
}
return 0;
}
-@@ -1061,7 +1134,7 @@ static struct xenbus_driver blkfront = {
+@@ -1061,7 +1136,7 @@ static struct xenbus_driver blkfront = {
.probe = blkfront_probe,
.remove = blkfront_remove,
.resume = blkfront_resume,
@@ -4952,7 +5358,7 @@
}
EXPORT_SYMBOL(ttm_fbdev_mmap);
diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c
-index b115726..c721c0a 100644
+index b115726..0859bb0 100644
--- a/drivers/input/xen-kbdfront.c
+++ b/drivers/input/xen-kbdfront.c
@@ -21,7 +21,10 @@
@@ -4966,6 +5372,15 @@
#include <xen/events.h>
#include <xen/page.h>
#include <xen/interface/io/fbif.h>
+@@ -272,6 +275,8 @@ static void xenkbd_backend_changed(struct xenbus_device *dev,
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
++ case XenbusStateReconfiguring:
++ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index b2f71f7..b7feb84 100644
--- a/drivers/net/Kconfig
@@ -4979,7 +5394,7 @@
help
The network device frontend driver allows the kernel to
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
-index baa051d..87d7121 100644
+index baa051d..ee7465a 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -42,6 +42,7 @@
@@ -5017,7 +5432,16 @@
enum xenbus_state backend_state)
{
struct netfront_info *np = dev_get_drvdata(&dev->dev);
-@@ -1798,7 +1799,7 @@ static struct xenbus_driver netfront_driver = {
+@@ -1608,6 +1609,8 @@ static void backend_changed(struct xenbus_device *dev,
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
++ case XenbusStateReconfiguring:
++ case XenbusStateReconfigured:
+ case XenbusStateConnected:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+@@ -1798,7 +1801,7 @@ static struct xenbus_driver netfront_driver = {
.probe = netfront_probe,
.remove = __devexit_p(xennet_remove),
.resume = netfront_resume,
@@ -5026,8 +5450,29 @@
};
static int __init netif_init(void)
+diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
+index fdc864f..7802fcd 100644
+--- a/drivers/pci/Kconfig
++++ b/drivers/pci/Kconfig
+@@ -51,6 +51,16 @@ config PCI_STUB
+
+ When in doubt, say N.
+
++config XEN_PCIDEV_FRONTEND
++ tristate "Xen PCI Frontend"
++ depends on XEN && PCI && X86
++ select HOTPLUG
++ select XEN_XENBUS_FRONTEND
++ default y
++ help
++ The PCI device frontend driver allows the kernel to import arbitrary
++ PCI devices from a PCI backend to support PCI driver domains.
++
+ config HT_IRQ
+ bool "Interrupts on hypertransport devices"
+ default y
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
-index 4a7f11d..ae3e98f 100644
+index 4a7f11d..b70aa4d 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -31,6 +31,8 @@ obj-$(CONFIG_HT_IRQ) += htirq.o
@@ -5039,6 +5484,27 @@
obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
obj-$(CONFIG_PCI_IOV) += iov.o
+@@ -60,6 +62,8 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o
+
+ obj-$(CONFIG_PCI_STUB) += pci-stub.o
+
++obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
++
+ ifeq ($(CONFIG_PCI_DEBUG),y)
+ EXTRA_CFLAGS += -DDEBUG
+ endif
+diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
+index cef28a7..1940183 100644
+--- a/drivers/pci/bus.c
++++ b/drivers/pci/bus.c
+@@ -249,6 +249,7 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
+ up_read(&pci_bus_sem);
+ }
+
++EXPORT_SYMBOL_GPL(pci_walk_bus);
+ EXPORT_SYMBOL(pci_bus_alloc_resource);
+ EXPORT_SYMBOL_GPL(pci_bus_add_device);
+ EXPORT_SYMBOL(pci_bus_add_devices);
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 5753036..8e6e6d1 100644
--- a/drivers/pci/dmar.c
@@ -5383,9870 +5849,16316 @@
+ }
+}
+
-diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
-index 188e1ba..efac9e3 100644
---- a/drivers/video/Kconfig
-+++ b/drivers/video/Kconfig
-@@ -2063,6 +2063,7 @@ config XEN_FBDEV_FRONTEND
- select FB_SYS_IMAGEBLIT
- select FB_SYS_FOPS
- select FB_DEFERRED_IO
-+ select XEN_XENBUS_FRONTEND
- default y
- help
- This driver implements the front-end of the Xen virtual
-diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c
-index 54cd916..966b226 100644
---- a/drivers/video/xen-fbfront.c
-+++ b/drivers/video/xen-fbfront.c
-@@ -25,7 +25,10 @@
- #include <linux/module.h>
- #include <linux/vmalloc.h>
- #include <linux/mm.h>
+diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
+new file mode 100644
+index 0000000..2c36004
+--- /dev/null
++++ b/drivers/pci/xen-pcifront.c
+@@ -0,0 +1,1156 @@
++/*
++ * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/mm.h>
++#include <xen/xenbus.h>
++#include <xen/events.h>
++#include <xen/grant_table.h>
++#include <xen/page.h>
++#include <linux/spinlock.h>
++#include <linux/pci.h>
++#include <linux/msi.h>
++#include <xen/xenbus.h>
++#include <xen/interface/io/pciif.h>
++#include <asm/xen/pci.h>
++#include <linux/interrupt.h>
++#include <asm/atomic.h>
++#include <linux/workqueue.h>
++#include <linux/bitops.h>
++#include <linux/time.h>
+
- #include <asm/xen/hypervisor.h>
+
-+#include <xen/xen.h>
- #include <xen/events.h>
- #include <xen/page.h>
- #include <xen/interface/io/fbif.h>
-diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
-index cab100a..edeb9b2 100644
---- a/drivers/xen/Kconfig
-+++ b/drivers/xen/Kconfig
-@@ -28,6 +28,46 @@ config XEN_DEV_EVTCHN
- firing.
- If in doubt, say yes.
-
-+config XEN_BACKEND
-+ bool "Backend driver support"
-+ depends on XEN_DOM0
-+ default y
-+ help
-+ Support for backend device drivers that provide I/O services
-+ to other virtual machines.
++#ifndef __init_refok
++#define __init_refok
++#endif
+
-+config XEN_NETDEV_BACKEND
-+ tristate "Xen backend network device"
-+ depends on XEN_BACKEND && NET
-+ help
-+ Implement the network backend driver, which passes packets
-+ from the guest domain's frontend drivers to the network.
++#define INVALID_GRANT_REF (0)
++#define INVALID_EVTCHN (-1)
+
-+config XEN_BLKDEV_BACKEND
-+ tristate "Block-device backend driver"
-+ depends on XEN_BACKEND && BLOCK
-+ help
-+ The block-device backend driver allows the kernel to export its
-+ block devices to other guests via a high-performance shared-memory
-+ interface.
+
-+
-+config XEN_BLKDEV_TAP
-+ tristate "Block-device tap backend driver"
-+ depends on XEN_BACKEND && BLOCK
-+ help
-+ The block tap driver is an alternative to the block back driver
-+ and allows VM block requests to be redirected to userspace through
-+ a device interface. The tap allows user-space development of
-+ high-performance block backends, where disk images may be implemented
-+ as files, in memory, or on other hosts across the network. This
-+ driver can safely coexist with the existing blockback driver.
++struct pci_bus_entry {
++ struct list_head list;
++ struct pci_bus *bus;
++};
+
-+config XEN_BLKBACK_PAGEMAP
-+ tristate
-+ depends on XEN_BLKDEV_BACKEND != n && XEN_BLKDEV_TAP != n
-+ default XEN_BLKDEV_BACKEND || XEN_BLKDEV_TAP
++#define _PDEVB_op_active (0)
++#define PDEVB_op_active (1 << (_PDEVB_op_active))
+
- config XENFS
- tristate "Xen filesystem"
- depends on XEN
-@@ -60,4 +100,14 @@ config XEN_SYS_HYPERVISOR
- Create entries under /sys/hypervisor describing the Xen
- hypervisor environment. When running native or in another
- virtual environment, /sys/hypervisor will still be present,
-- but will have no xen contents.
-\ No newline at end of file
-+ but will have no xen contents.
++struct pcifront_device {
++ struct xenbus_device *xdev;
++ struct list_head root_buses;
++ spinlock_t dev_lock;
+
-+config XEN_XENBUS_FRONTEND
-+ tristate
++ int evtchn;
++ int gnt_ref;
+
-+config XEN_GNTDEV
-+ tristate "userspace grant access device driver"
-+ depends on XEN
-+ select MMU_NOTIFIER
-+ help
-+ Allows userspace processes use grants.
-diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
-index 7c28434..ab2e672 100644
---- a/drivers/xen/Makefile
-+++ b/drivers/xen/Makefile
-@@ -1,12 +1,20 @@
--obj-y += grant-table.o features.o events.o manage.o
-+obj-y += grant-table.o features.o events.o manage.o biomerge.o
- obj-y += xenbus/
-
- nostackp := $(call cc-option, -fno-stack-protector)
- CFLAGS_features.o := $(nostackp)
-
--obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
--obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
--obj-$(CONFIG_XEN_BALLOON) += balloon.o
--obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
--obj-$(CONFIG_XENFS) += xenfs/
--obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
-\ No newline at end of file
-+obj-$(CONFIG_PCI) += pci.o
-+obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
-+obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
-+obj-$(CONFIG_XEN_BALLOON) += balloon.o
-+obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
-+obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
-+obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
-+obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
-+obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
-+obj-$(CONFIG_XENFS) += xenfs/
-+obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
++ int irq;
+
-+xen-evtchn-y := evtchn.o
-+xen-gntdev-y := gntdev.o
-diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
-index 4204336..d7c0eae 100644
---- a/drivers/xen/balloon.c
-+++ b/drivers/xen/balloon.c
-@@ -43,6 +43,7 @@
- #include <linux/mutex.h>
- #include <linux/list.h>
- #include <linux/sysdev.h>
-+#include <linux/swap.h>
-
- #include <asm/page.h>
- #include <asm/pgalloc.h>
-@@ -52,13 +53,15 @@
-
- #include <asm/xen/hypervisor.h>
- #include <asm/xen/hypercall.h>
++ /* Lock this when doing any operations in sh_info */
++ spinlock_t sh_info_lock;
++ struct xen_pci_sharedinfo *sh_info;
++ struct work_struct op_work;
++ unsigned long flags;
+
-+#include <xen/xen.h>
- #include <xen/interface/xen.h>
- #include <xen/interface/memory.h>
- #include <xen/xenbus.h>
- #include <xen/features.h>
- #include <xen/page.h>
-
--#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
-+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT+balloon_order-10))
-
- #define BALLOON_CLASS_NAME "xen_memory"
-
-@@ -82,14 +85,15 @@ static struct sys_device balloon_sysdev;
-
- static int register_balloon(struct sys_device *sysdev);
-
-+static struct balloon_stats balloon_stats;
++};
+
- /*
-- * Protects atomic reservation decrease/increase against concurrent increases.
-- * Also protects non-atomic updates of current_pages and driver_pages, and
-- * balloon lists.
-+ * Work in pages of this order. Can be either 0 for normal pages
-+ * or 9 for hugepages.
- */
--static DEFINE_SPINLOCK(balloon_lock);
--
--static struct balloon_stats balloon_stats;
-+static int balloon_order;
-+static unsigned long balloon_npages;
-+static unsigned long discontig_frame_list[PAGE_SIZE / sizeof(unsigned long)];
-
- /* We increase/decrease in batches which fit in a page */
- static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
-@@ -118,10 +122,41 @@ static struct timer_list balloon_timer;
- static void scrub_page(struct page *page)
- {
- #ifdef CONFIG_XEN_SCRUB_PAGES
-- clear_highpage(page);
-+ int i;
++struct pcifront_sd {
++ int domain;
++ struct pcifront_device *pdev;
++};
+
-+ for (i = 0; i < balloon_npages; i++)
-+ clear_highpage(page++);
- #endif
- }
-
-+static void free_discontig_frame(void)
++static inline struct pcifront_device *
++pcifront_get_pdev(struct pcifront_sd *sd)
+{
-+ int rc;
-+ struct xen_memory_reservation reservation = {
-+ .address_bits = 0,
-+ .domid = DOMID_SELF,
-+ .nr_extents = balloon_npages,
-+ .extent_order = 0
-+ };
++ return sd->pdev;
++}
+
-+ set_xen_guest_handle(reservation.extent_start, discontig_frame_list);
-+ rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
-+ BUG_ON(rc != balloon_npages);
++static inline void pcifront_init_sd(struct pcifront_sd *sd,
++ unsigned int domain, unsigned int bus,
++ struct pcifront_device *pdev)
++{
++ sd->domain = domain;
++ sd->pdev = pdev;
+}
+
-+static unsigned long shrink_frame(unsigned long nr_pages)
++static inline void pcifront_setup_root_resources(struct pci_bus *bus,
++ struct pcifront_sd *sd)
+{
-+ unsigned long i, j;
++}
+
-+ for (i = 0, j = 0; i < nr_pages; i++, j++) {
-+ if (frame_list[i] == 0)
-+ j++;
-+ if (i != j)
-+ frame_list[i] = frame_list[j];
++
++DEFINE_SPINLOCK(pcifront_dev_lock);
++static struct pcifront_device *pcifront_dev;
++
++static int verbose_request;
++module_param(verbose_request, int, 0644);
++
++static int errno_to_pcibios_err(int errno)
++{
++ switch (errno) {
++ case XEN_PCI_ERR_success:
++ return PCIBIOS_SUCCESSFUL;
++
++ case XEN_PCI_ERR_dev_not_found:
++ return PCIBIOS_DEVICE_NOT_FOUND;
++
++ case XEN_PCI_ERR_invalid_offset:
++ case XEN_PCI_ERR_op_failed:
++ return PCIBIOS_BAD_REGISTER_NUMBER;
++
++ case XEN_PCI_ERR_not_implemented:
++ return PCIBIOS_FUNC_NOT_SUPPORTED;
++
++ case XEN_PCI_ERR_access_denied:
++ return PCIBIOS_SET_FAILED;
+ }
-+ return i;
++ return errno;
+}
+
- /* balloon_append: add the given page to the balloon. */
- static void balloon_append(struct page *page)
- {
-@@ -195,19 +230,18 @@ static unsigned long current_target(void)
-
- static int increase_reservation(unsigned long nr_pages)
- {
-- unsigned long pfn, i, flags;
-+ unsigned long pfn, mfn, i, j, flags;
- struct page *page;
- long rc;
- struct xen_memory_reservation reservation = {
- .address_bits = 0,
-- .extent_order = 0,
- .domid = DOMID_SELF
- };
-
- if (nr_pages > ARRAY_SIZE(frame_list))
- nr_pages = ARRAY_SIZE(frame_list);
-
-- spin_lock_irqsave(&balloon_lock, flags);
-+ spin_lock_irqsave(&xen_reservation_lock, flags);
-
- page = balloon_first_page();
- for (i = 0; i < nr_pages; i++) {
-@@ -218,6 +252,8 @@ static int increase_reservation(unsigned long nr_pages)
-
- set_xen_guest_handle(reservation.extent_start, frame_list);
- reservation.nr_extents = nr_pages;
-+ reservation.extent_order = balloon_order;
++static inline void schedule_pcifront_aer_op(struct pcifront_device *pdev)
++{
++ if (test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
++ && !test_and_set_bit(_PDEVB_op_active, &pdev->flags)) {
++ dev_dbg(&pdev->xdev->dev, "schedule aer frontend job\n");
++ schedule_work(&pdev->op_work);
++ }
++}
+
- rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
- if (rc < 0)
- goto out;
-@@ -227,19 +263,22 @@ static int increase_reservation(unsigned long nr_pages)
- BUG_ON(page == NULL);
-
- pfn = page_to_pfn(page);
-+ mfn = frame_list[i];
- BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
- phys_to_machine_mapping_valid(pfn));
-
-- set_phys_to_machine(pfn, frame_list[i]);
--
-- /* Link back into the page tables if not highmem. */
-- if (pfn < max_low_pfn) {
-- int ret;
-- ret = HYPERVISOR_update_va_mapping(
-- (unsigned long)__va(pfn << PAGE_SHIFT),
-- mfn_pte(frame_list[i], PAGE_KERNEL),
-- 0);
-- BUG_ON(ret);
-+ for (j = 0; j < balloon_npages; j++, pfn++, mfn++) {
-+ set_phys_to_machine(pfn, mfn);
++static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
++{
++ int err = 0;
++ struct xen_pci_op *active_op = &pdev->sh_info->op;
++ unsigned long irq_flags;
++ evtchn_port_t port = pdev->evtchn;
++ unsigned irq = pdev->irq;
++ s64 ns, ns_timeout;
++ struct timeval tv;
++
++ spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
++
++ memcpy(active_op, op, sizeof(struct xen_pci_op));
++
++ /* Go */
++ wmb();
++ set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
++ notify_remote_via_evtchn(port);
+
-+ /* Link back into the page tables if not highmem. */
-+ if (pfn < max_low_pfn) {
-+ int ret;
-+ ret = HYPERVISOR_update_va_mapping(
-+ (unsigned long)__va(pfn << PAGE_SHIFT),
-+ mfn_pte(mfn, PAGE_KERNEL),
-+ 0);
-+ BUG_ON(ret);
-+ }
- }
-
- /* Relinquish the page back to the allocator. */
-@@ -251,20 +290,20 @@ static int increase_reservation(unsigned long nr_pages)
- balloon_stats.current_pages += rc;
-
- out:
-- spin_unlock_irqrestore(&balloon_lock, flags);
-+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
-
- return rc < 0 ? rc : rc != nr_pages;
- }
-
- static int decrease_reservation(unsigned long nr_pages)
- {
-- unsigned long pfn, i, flags;
-+ unsigned long pfn, lpfn, mfn, i, j, flags;
- struct page *page;
- int need_sleep = 0;
-- int ret;
-+ int discontig, discontig_free;
-+ int ret;
- struct xen_memory_reservation reservation = {
- .address_bits = 0,
-- .extent_order = 0,
- .domid = DOMID_SELF
- };
-
-@@ -272,7 +311,7 @@ static int decrease_reservation(unsigned long nr_pages)
- nr_pages = ARRAY_SIZE(frame_list);
-
- for (i = 0; i < nr_pages; i++) {
-- if ((page = alloc_page(GFP_BALLOON)) == NULL) {
-+ if ((page = alloc_pages(GFP_BALLOON, balloon_order)) == NULL) {
- nr_pages = i;
- need_sleep = 1;
- break;
-@@ -282,37 +321,50 @@ static int decrease_reservation(unsigned long nr_pages)
- frame_list[i] = pfn_to_mfn(pfn);
-
- scrub_page(page);
--
-- if (!PageHighMem(page)) {
-- ret = HYPERVISOR_update_va_mapping(
-- (unsigned long)__va(pfn << PAGE_SHIFT),
-- __pte_ma(0), 0);
-- BUG_ON(ret);
-- }
--
- }
-
- /* Ensure that ballooned highmem pages don't have kmaps. */
- kmap_flush_unused();
- flush_tlb_all();
-
-- spin_lock_irqsave(&balloon_lock, flags);
-+ spin_lock_irqsave(&xen_reservation_lock, flags);
-
- /* No more mappings: invalidate P2M and add to balloon. */
- for (i = 0; i < nr_pages; i++) {
-- pfn = mfn_to_pfn(frame_list[i]);
-- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
-+ mfn = frame_list[i];
-+ lpfn = pfn = mfn_to_pfn(mfn);
- balloon_append(pfn_to_page(pfn));
-+ discontig_free = 0;
-+ for (j = 0; j < balloon_npages; j++, lpfn++, mfn++) {
-+ if ((discontig_frame_list[j] = pfn_to_mfn(lpfn)) != mfn)
-+ discontig_free = 1;
++ /*
++ * We set a poll timeout of 3 seconds but give up on return after
++ * 2 seconds. It is better to time out too late rather than too early
++ * (in the latter case we end up continually re-executing poll() with a
++ * timeout in the past). 1s difference gives plenty of slack for error.
++ */
++ do_gettimeofday(&tv);
++ ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC;
+
-+ set_phys_to_machine(lpfn, INVALID_P2M_ENTRY);
-+ if (!PageHighMem(page)) {
-+ ret = HYPERVISOR_update_va_mapping(
-+ (unsigned long)__va(lpfn << PAGE_SHIFT),
-+ __pte_ma(0), 0);
-+ BUG_ON(ret);
-+ }
++ xen_clear_irq_pending(irq);
++
++ while (test_bit(_XEN_PCIF_active,
++ (unsigned long *)&pdev->sh_info->flags)) {
++ xen_poll_irq_timeout(irq, jiffies + 3*HZ);
++ xen_clear_irq_pending(irq);
++ do_gettimeofday(&tv);
++ ns = timeval_to_ns(&tv);
++ if (ns > ns_timeout) {
++ dev_err(&pdev->xdev->dev,
++ "pciback not responding!!!\n");
++ clear_bit(_XEN_PCIF_active,
++ (unsigned long *)&pdev->sh_info->flags);
++ err = XEN_PCI_ERR_dev_not_found;
++ goto out;
+ }
-+ if (discontig_free) {
-+ free_discontig_frame();
-+ frame_list[i] = 0;
-+ discontig = 1;
++ }
++
++ /*
++ * We might lose backend service request since we
++ * reuse same evtchn with pci_conf backend response. So re-schedule
++ * aer pcifront service.
++ */
++ if (test_bit(_XEN_PCIB_active,
++ (unsigned long *)&pdev->sh_info->flags)) {
++ dev_err(&pdev->xdev->dev,
++ "schedule aer pcifront service\n");
++ schedule_pcifront_aer_op(pdev);
++ }
++
++ memcpy(op, active_op, sizeof(struct xen_pci_op));
++
++ err = op->err;
++out:
++ spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
++ return err;
++}
++
++/* Access to this function is spinlocked in drivers/pci/access.c */
++static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
++ int where, int size, u32 *val)
++{
++ int err = 0;
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_conf_read,
++ .domain = pci_domain_nr(bus),
++ .bus = bus->number,
++ .devfn = devfn,
++ .offset = where,
++ .size = size,
++ };
++ struct pcifront_sd *sd = bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++
++ if (verbose_request)
++ dev_info(&pdev->xdev->dev,
++ "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
++ pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
++ PCI_FUNC(devfn), where, size);
++
++ err = do_pci_op(pdev, &op);
++
++ if (likely(!err)) {
++ if (verbose_request)
++ dev_info(&pdev->xdev->dev, "read got back value %x\n",
++ op.value);
++
++ *val = op.value;
++ } else if (err == -ENODEV) {
++ /* No device here, pretend that it just returned 0 */
++ err = 0;
++ *val = 0;
++ }
++
++ return errno_to_pcibios_err(err);
++}
++
++/* Access to this function is spinlocked in drivers/pci/access.c */
++static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
++ int where, int size, u32 val)
++{
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_conf_write,
++ .domain = pci_domain_nr(bus),
++ .bus = bus->number,
++ .devfn = devfn,
++ .offset = where,
++ .size = size,
++ .value = val,
++ };
++ struct pcifront_sd *sd = bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++
++ if (verbose_request)
++ dev_info(&pdev->xdev->dev,
++ "write dev=%04x:%02x:%02x.%01x - "
++ "offset %x size %d val %x\n",
++ pci_domain_nr(bus), bus->number,
++ PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
++
++ return errno_to_pcibios_err(do_pci_op(pdev, &op));
++}
++
++struct pci_ops pcifront_bus_ops = {
++ .read = pcifront_bus_read,
++ .write = pcifront_bus_write,
++};
++
++#ifdef CONFIG_PCI_MSI
++static int pci_frontend_enable_msix(struct pci_dev *dev,
++ int **vector, int nvec)
++{
++ int err;
++ int i;
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_enable_msix,
++ .domain = pci_domain_nr(dev->bus),
++ .bus = dev->bus->number,
++ .devfn = dev->devfn,
++ .value = nvec,
++ };
++ struct pcifront_sd *sd = dev->bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++ struct msi_desc *entry;
++
++ if (nvec > SH_INFO_MAX_VEC) {
++ dev_err(&dev->dev, "too much vector for pci frontend: %x."
++ " Increase SH_INFO_MAX_VEC.\n", nvec);
++ return -EINVAL;
++ }
++
++ i = 0;
++ list_for_each_entry(entry, &dev->msi_list, list) {
++ op.msix_entries[i].entry = entry->msi_attrib.entry_nr;
++ /* Vector is useless at this point. */
++ op.msix_entries[i].vector = -1;
++ i++;
++ }
++
++ err = do_pci_op(pdev, &op);
++
++ if (likely(!err)) {
++ if (likely(!op.value)) {
++ /* we get the result */
++ for (i = 0; i < nvec; i++)
++ *vector[i] = op.msix_entries[i].vector;
++ return 0;
++ } else {
++ printk(KERN_DEBUG "enable msix get value %x\n",
++ op.value);
++ return op.value;
+ }
- }
-+ balloon_stats.current_pages -= nr_pages;
++ } else {
++ dev_err(&dev->dev, "enable msix get err %x\n", err);
++ return err;
++ }
++}
+
-+ if (discontig)
-+ nr_pages = shrink_frame(nr_pages);
-
- set_xen_guest_handle(reservation.extent_start, frame_list);
- reservation.nr_extents = nr_pages;
-+ reservation.extent_order = balloon_order;
- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
- BUG_ON(ret != nr_pages);
-
-- balloon_stats.current_pages -= nr_pages;
--
-- spin_unlock_irqrestore(&balloon_lock, flags);
-+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
-
- return need_sleep;
- }
-@@ -379,7 +431,7 @@ static void watch_target(struct xenbus_watch *watch,
- /* The given memory/target value is in KiB, so it needs converting to
- * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
- */
-- balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
-+ balloon_set_new_target(new_target >> ((PAGE_SHIFT - 10) + balloon_order));
- }
-
- static int balloon_init_watcher(struct notifier_block *notifier,
-@@ -405,9 +457,12 @@ static int __init balloon_init(void)
- if (!xen_pv_domain())
- return -ENODEV;
-
-- pr_info("xen_balloon: Initialising balloon driver.\n");
-+ pr_info("xen_balloon: Initialising balloon driver with page order %d.\n",
-+ balloon_order);
++static void pci_frontend_disable_msix(struct pci_dev *dev)
++{
++ int err;
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_disable_msix,
++ .domain = pci_domain_nr(dev->bus),
++ .bus = dev->bus->number,
++ .devfn = dev->devfn,
++ };
++ struct pcifront_sd *sd = dev->bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
-+ balloon_npages = 1 << balloon_order;
-
-- balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
-+ balloon_stats.current_pages = (min(xen_start_info->nr_pages, max_pfn)) >> balloon_order;
- balloon_stats.target_pages = balloon_stats.current_pages;
- balloon_stats.balloon_low = 0;
- balloon_stats.balloon_high = 0;
-@@ -420,7 +475,7 @@ static int __init balloon_init(void)
- register_balloon(&balloon_sysdev);
-
- /* Initialise the balloon with excess memory space. */
-- for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
-+ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn += balloon_npages) {
- page = pfn_to_page(pfn);
- if (!PageReserved(page))
- balloon_append(page);
-@@ -444,6 +499,121 @@ static void balloon_exit(void)
-
- module_exit(balloon_exit);
-
-+static int __init balloon_parse_huge(char *s)
++ err = do_pci_op(pdev, &op);
++
++ /* What should do for error ? */
++ if (err)
++ dev_err(&dev->dev, "pci_disable_msix get err %x\n", err);
++}
++
++static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
+{
-+ balloon_order = 9;
-+ return 1;
++ int err;
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_enable_msi,
++ .domain = pci_domain_nr(dev->bus),
++ .bus = dev->bus->number,
++ .devfn = dev->devfn,
++ };
++ struct pcifront_sd *sd = dev->bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++
++ err = do_pci_op(pdev, &op);
++ if (likely(!err)) {
++ *vector[0] = op.value;
++ } else {
++ dev_err(&dev->dev, "pci frontend enable msi failed for dev "
++ "%x:%x \n", op.bus, op.devfn);
++ err = -EINVAL;
++ }
++ return err;
++}
++
++static void pci_frontend_disable_msi(struct pci_dev *dev)
++{
++ int err;
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_disable_msi,
++ .domain = pci_domain_nr(dev->bus),
++ .bus = dev->bus->number,
++ .devfn = dev->devfn,
++ };
++ struct pcifront_sd *sd = dev->bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++
++ err = do_pci_op(pdev, &op);
++ if (err == XEN_PCI_ERR_dev_not_found) {
++ /* XXX No response from backend, what shall we do? */
++ printk(KERN_DEBUG "get no response from backend for disable MSI\n");
++ return;
++ }
++ if (err)
++ /* How can pciback notify us of a failure? */
++ printk(KERN_DEBUG "get fake response from backend\n");
++}
++
++static struct xen_pci_frontend_ops pci_frontend_ops = {
++ .enable_msi = pci_frontend_enable_msi,
++ .disable_msi = pci_frontend_disable_msi,
++ .enable_msix = pci_frontend_enable_msix,
++ .disable_msix = pci_frontend_disable_msix,
++};
++
++static void pci_frontend_registrar(int enable)
++{
++ if (enable)
++ xen_pci_frontend = &pci_frontend_ops;
++ else
++ xen_pci_frontend = NULL;
++};
++#else
++static inline void pci_frontend_registrar(int enable) { };
++#endif /* CONFIG_PCI_MSI */
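The ops table and registrar above are the only coupling between this file and the architecture MSI hooks: the frontend publishes its callbacks through the xen_pci_frontend pointer and unregisters by clearing it, so callers are expected to check the pointer before use. A minimal sketch of such a caller follows; the function name, the vectors buffer and the asm/xen/pci.h declaration of xen_pci_frontend are assumptions for illustration, only the struct xen_pci_frontend_ops members come from the code above.

#include <linux/errno.h>
#include <linux/pci.h>
#include <asm/xen/pci.h>	/* assumed to declare xen_pci_frontend */

/* Hypothetical caller: not part of this patch. */
static int example_setup_msi(struct pci_dev *dev, int *vectors)
{
	if (!xen_pci_frontend || !xen_pci_frontend->enable_msi)
		return -ENODEV;	/* pcifront not loaded/registered */

	/* pci_frontend_enable_msi() stores the chosen vector in vectors[0] */
	return xen_pci_frontend->enable_msi(dev, &vectors);
}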
++
++/* Claim resources for the PCI frontend as-is, backend won't allow changes */
++static int pcifront_claim_resource(struct pci_dev *dev, void *data)
++{
++ struct pcifront_device *pdev = data;
++ int i;
++ struct resource *r;
++
++ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
++ r = &dev->resource[i];
++
++ if (!r->parent && r->start && r->flags) {
++ dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
++ pci_name(dev), i);
++ if (pci_claim_resource(dev, i)) {
++ dev_err(&pdev->xdev->dev, "Could not claim "
++ "resource %s/%d! Device offline. Try "
++ "giving less than 4GB to domain.\n",
++ pci_name(dev), i);
++ }
++ }
++ }
++
++ return 0;
+}
+
-+__setup("balloon_hugepages", balloon_parse_huge);
++int __devinit pcifront_scan_root(struct pcifront_device *pdev,
++ unsigned int domain, unsigned int bus)
++{
++ struct pci_bus *b;
++ struct pcifront_sd *sd = NULL;
++ struct pci_bus_entry *bus_entry = NULL;
++ int err = 0;
++
++#ifndef CONFIG_PCI_DOMAINS
++ if (domain != 0) {
++ dev_err(&pdev->xdev->dev,
++ "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
++ dev_err(&pdev->xdev->dev,
++ "Please compile with CONFIG_PCI_DOMAINS\n");
++ err = -EINVAL;
++ goto err_out;
++ }
++#endif
++
++ dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
++ domain, bus);
++
++ bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
++ sd = kmalloc(sizeof(*sd), GFP_KERNEL);
++ if (!bus_entry || !sd) {
++ err = -ENOMEM;
++ goto err_out;
++ }
++ pcifront_init_sd(sd, domain, bus, pdev);
++
++ b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
++ &pcifront_bus_ops, sd);
++ if (!b) {
++ dev_err(&pdev->xdev->dev,
++ "Error creating PCI Frontend Bus!\n");
++ err = -ENOMEM;
++ goto err_out;
++ }
++
++ pcifront_setup_root_resources(b, sd);
++ bus_entry->bus = b;
++
++ list_add(&bus_entry->list, &pdev->root_buses);
++
++ /* Claim resources before going "live" with our devices */
++ pci_walk_bus(b, pcifront_claim_resource, pdev);
++
++ pci_bus_add_devices(b);
++
++ return 0;
++
++err_out:
++ kfree(bus_entry);
++ kfree(sd);
++
++ return err;
++}
++
++int __devinit pcifront_rescan_root(struct pcifront_device *pdev,
++ unsigned int domain, unsigned int bus)
++{
++ struct pci_bus *b;
++ struct pci_dev *d;
++ unsigned int devfn;
++ int err;
++
++#ifndef CONFIG_PCI_DOMAINS
++ if (domain != 0) {
++ dev_err(&pdev->xdev->dev,
++ "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
++ dev_err(&pdev->xdev->dev,
++ "Please compile with CONFIG_PCI_DOMAINS\n");
++ return -EINVAL;
++ }
++#endif
++
++ dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
++ domain, bus);
++
++ b = pci_find_bus(domain, bus);
++ if (!b)
++ /* If the bus is unknown, create it. */
++ return pcifront_scan_root(pdev, domain, bus);
++
++ /* Rescan the bus for newly attached functions and add.
++ * We omit handling of PCI bridge attachment because pciback prevents
++ * bridges from being exported.
++ */
++ for (devfn = 0; devfn < 0x100; devfn++) {
++ d = pci_get_slot(b, devfn);
++ if (d) {
++ /* Device is already known. */
++ pci_dev_put(d);
++ continue;
++ }
++
++ d = pci_scan_single_device(b, devfn);
++ if (d) {
++ dev_info(&pdev->xdev->dev, "New device on "
++ "%04x:%02x:%02x.%02x found.\n", domain, bus,
++ PCI_SLOT(devfn), PCI_FUNC(devfn));
++ err = pci_bus_add_device(d);
++ if (err) {
++ dev_err(&pdev->xdev->dev, "Failed to add "
++ "device to bus.\n");
++ return err;
++ }
++ }
++ }
++
++ return 0;
++}
++
++static void free_root_bus_devs(struct pci_bus *bus)
++{
++ struct pci_dev *dev;
++
++ while (!list_empty(&bus->devices)) {
++ dev = container_of(bus->devices.next, struct pci_dev,
++ bus_list);
++ dev_dbg(&dev->dev, "removing device\n");
++ pci_remove_bus_device(dev);
++ }
++}
++
++void pcifront_free_roots(struct pcifront_device *pdev)
++{
++ struct pci_bus_entry *bus_entry, *t;
++
++ dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
++
++ list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
++ list_del(&bus_entry->list);
++
++ free_root_bus_devs(bus_entry->bus);
++
++ kfree(bus_entry->bus->sysdata);
++
++ device_unregister(bus_entry->bus->bridge);
++ pci_remove_bus(bus_entry->bus);
++
++ kfree(bus_entry);
++ }
++}
++
++static pci_ers_result_t pcifront_common_process(int cmd,
++ struct pcifront_device *pdev,
++ pci_channel_state_t state)
++{
++ pci_ers_result_t result;
++ struct pci_driver *pdrv;
++ int bus = pdev->sh_info->aer_op.bus;
++ int devfn = pdev->sh_info->aer_op.devfn;
++ struct pci_dev *pcidev;
++ int flag = 0;
++
++ dev_dbg(&pdev->xdev->dev,
++ "pcifront AER process: cmd %x (bus:%x, devfn%x)",
++ cmd, bus, devfn);
++ result = PCI_ERS_RESULT_NONE;
++
++ pcidev = pci_get_bus_and_slot(bus, devfn);
++ if (!pcidev || !pcidev->driver) {
++ dev_err(&pdev->xdev->dev,
++ "device or driver is NULL\n");
++ return result;
++ }
++ pdrv = pcidev->driver;
++
++ if (get_driver(&pdrv->driver)) {
++ if (pdrv->err_handler && pdrv->err_handler->error_detected) {
++ dev_dbg(&pcidev->dev,
++ "trying to call AER service\n");
++ if (pcidev) {
++ flag = 1;
++ switch (cmd) {
++ case XEN_PCI_OP_aer_detected:
++ result = pdrv->err_handler->
++ error_detected(pcidev, state);
++ break;
++ case XEN_PCI_OP_aer_mmio:
++ result = pdrv->err_handler->
++ mmio_enabled(pcidev);
++ break;
++ case XEN_PCI_OP_aer_slotreset:
++ result = pdrv->err_handler->
++ slot_reset(pcidev);
++ break;
++ case XEN_PCI_OP_aer_resume:
++ pdrv->err_handler->resume(pcidev);
++ break;
++ default:
++ dev_err(&pdev->xdev->dev,
++ "bad request in aer recovery "
++ "operation!\n");
++
++ }
++ }
++ }
++ put_driver(&pdrv->driver);
++ }
++ if (!flag)
++ result = PCI_ERS_RESULT_NONE;
++
++ return result;
++}
++
++
++void pcifront_do_aer(struct work_struct *data)
++{
++ struct pcifront_device *pdev =
++ container_of(data, struct pcifront_device, op_work);
++ int cmd = pdev->sh_info->aer_op.cmd;
++ pci_channel_state_t state =
++ (pci_channel_state_t)pdev->sh_info->aer_op.err;
++
++ /* If a pci_conf op is in progress, we have to wait
++ * until it is done before servicing the aer op. */
++ dev_dbg(&pdev->xdev->dev,
++ "pcifront service aer bus %x devfn %x\n",
++ pdev->sh_info->aer_op.bus, pdev->sh_info->aer_op.devfn);
++
++ pdev->sh_info->aer_op.err = pcifront_common_process(cmd, pdev, state);
++
++ wmb();
++ clear_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags);
++ notify_remote_via_evtchn(pdev->evtchn);
++
++ /* In case we lost an aer request in the small time window above */
++ smp_mb__before_clear_bit();
++ clear_bit(_PDEVB_op_active, &pdev->flags);
++ smp_mb__after_clear_bit();
++
++ schedule_pcifront_aer_op(pdev);
++
++}
++
++irqreturn_t pcifront_handler_aer(int irq, void *dev)
++{
++ struct pcifront_device *pdev = dev;
++ schedule_pcifront_aer_op(pdev);
++ return IRQ_HANDLED;
++}
++int pcifront_connect(struct pcifront_device *pdev)
++{
++ int err = 0;
++
++ spin_lock(&pcifront_dev_lock);
++
++ if (!pcifront_dev) {
++ dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
++ pcifront_dev = pdev;
++ } else {
++ dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
++ err = -EEXIST;
++ }
++
++ spin_unlock(&pcifront_dev_lock);
++
++ return err;
++}
++
++void pcifront_disconnect(struct pcifront_device *pdev)
++{
++ spin_lock(&pcifront_dev_lock);
++
++ if (pdev == pcifront_dev) {
++ dev_info(&pdev->xdev->dev,
++ "Disconnecting PCI Frontend Buses\n");
++ pcifront_dev = NULL;
++ }
++
++ spin_unlock(&pcifront_dev_lock);
++}
++static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
++{
++ struct pcifront_device *pdev;
++
++ pdev = kzalloc(sizeof(struct pcifront_device), GFP_KERNEL);
++ if (pdev == NULL)
++ goto out;
++
++ pdev->sh_info =
++ (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
++ if (pdev->sh_info == NULL) {
++ kfree(pdev);
++ pdev = NULL;
++ goto out;
++ }
++ pdev->sh_info->flags = 0;
++
++ /* Flag for registering PV AER handler */
++ set_bit(_XEN_PCIB_AERHANDLER, (void *)&pdev->sh_info->flags);
++
++ dev_set_drvdata(&xdev->dev, pdev);
++ pdev->xdev = xdev;
++
++ INIT_LIST_HEAD(&pdev->root_buses);
++
++ spin_lock_init(&pdev->dev_lock);
++ spin_lock_init(&pdev->sh_info_lock);
++
++ pdev->evtchn = INVALID_EVTCHN;
++ pdev->gnt_ref = INVALID_GRANT_REF;
++ pdev->irq = -1;
++
++ INIT_WORK(&pdev->op_work, pcifront_do_aer);
++
++ dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
++ pdev, pdev->sh_info);
++out:
++ return pdev;
++}
++
++static void free_pdev(struct pcifront_device *pdev)
++{
++ dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
++
++ pcifront_free_roots(pdev);
++
++ /* For PCIe AER error handling */
++ flush_scheduled_work();
++ unbind_from_irqhandler(pdev->irq, pdev);
++
++ if (pdev->evtchn != INVALID_EVTCHN)
++ xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
++
++ if (pdev->gnt_ref != INVALID_GRANT_REF)
++ gnttab_end_foreign_access(pdev->gnt_ref, 0 /* r/w page */,
++ (unsigned long)pdev->sh_info);
++
++ dev_set_drvdata(&pdev->xdev->dev, NULL);
++ kfree(pdev);
++}
++
++static int pcifront_publish_info(struct pcifront_device *pdev)
++{
++ int err = 0;
++ struct xenbus_transaction trans;
++
++ err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
++ if (err < 0)
++ goto out;
++
++ pdev->gnt_ref = err;
++
++ err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
++ if (err)
++ goto out;
++
++ err = bind_evtchn_to_irqhandler(pdev->evtchn, pcifront_handler_aer,
++ 0, "pcifront", pdev);
++ if (err < 0) {
++ xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
++ xenbus_dev_fatal(pdev->xdev, err, "Failed to bind evtchn to "
++ "irqhandler.\n");
++ return err;
++ }
++ pdev->irq = err;
++
++do_publish:
++ err = xenbus_transaction_start(&trans);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error writing configuration for backend "
++ "(start transaction)");
++ goto out;
++ }
++
++ err = xenbus_printf(trans, pdev->xdev->nodename,
++ "pci-op-ref", "%u", pdev->gnt_ref);
++ if (!err)
++ err = xenbus_printf(trans, pdev->xdev->nodename,
++ "event-channel", "%u", pdev->evtchn);
++ if (!err)
++ err = xenbus_printf(trans, pdev->xdev->nodename,
++ "magic", XEN_PCI_MAGIC);
++
++ if (err) {
++ xenbus_transaction_end(trans, 1);
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error writing configuration for backend");
++ goto out;
++ } else {
++ err = xenbus_transaction_end(trans, 0);
++ if (err == -EAGAIN)
++ goto do_publish;
++ else if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error completing transaction "
++ "for backend");
++ goto out;
++ }
++ }
++
++ xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
++
++ dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
++
++out:
++ return err;
++}
++
++static int __devinit pcifront_try_connect(struct pcifront_device *pdev)
++{
++ int err = -EFAULT;
++ int i, num_roots, len;
++ char str[64];
++ unsigned int domain, bus;
++
++ spin_lock(&pdev->dev_lock);
++
++ /* Only connect once */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateInitialised)
++ goto out;
++
++ err = pcifront_connect(pdev);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error connecting PCI Frontend");
++ goto out;
++ }
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
++ "root_num", "%d", &num_roots);
++ if (err == -ENOENT) {
++ xenbus_dev_error(pdev->xdev, err,
++ "No PCI Roots found, trying 0000:00");
++ err = pcifront_scan_root(pdev, 0, 0);
++ num_roots = 0;
++ } else if (err != 1) {
++ if (err == 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of PCI roots");
++ goto out;
++ }
++
++ for (i = 0; i < num_roots; i++) {
++ len = snprintf(str, sizeof(str), "root-%d", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
++ "%x:%x", &domain, &bus);
++ if (err != 2) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading PCI root %d", i);
++ goto out;
++ }
++
++ err = pcifront_scan_root(pdev, domain, bus);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error scanning PCI root %04x:%02x",
++ domain, bus);
++ goto out;
++ }
++ }
++
++ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
++ if (err)
++ goto out;
++
++out:
++ spin_unlock(&pdev->dev_lock);
++ return err;
++}
++
++static int pcifront_try_disconnect(struct pcifront_device *pdev)
++{
++ int err = 0;
++ enum xenbus_state prev_state;
++
++ spin_lock(&pdev->dev_lock);
++
++ prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
++
++ if (prev_state >= XenbusStateClosing)
++ goto out;
++
++ if (prev_state == XenbusStateConnected) {
++ pcifront_free_roots(pdev);
++ pcifront_disconnect(pdev);
++ }
++
++ err = xenbus_switch_state(pdev->xdev, XenbusStateClosed);
++
++out:
++ spin_unlock(&pdev->dev_lock);
++
++ return err;
++}
++
++static int __devinit pcifront_attach_devices(struct pcifront_device *pdev)
++{
++ int err = -EFAULT;
++ int i, num_roots, len;
++ unsigned int domain, bus;
++ char str[64];
++
++ spin_lock(&pdev->dev_lock);
++
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateReconfiguring)
++ goto out;
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
++ "root_num", "%d", &num_roots);
++ if (err == -ENOENT) {
++ xenbus_dev_error(pdev->xdev, err,
++ "No PCI Roots found, trying 0000:00");
++ err = pcifront_rescan_root(pdev, 0, 0);
++ num_roots = 0;
++ } else if (err != 1) {
++ if (err == 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of PCI roots");
++ goto out;
++ }
++
++ for (i = 0; i < num_roots; i++) {
++ len = snprintf(str, sizeof(str), "root-%d", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
++ "%x:%x", &domain, &bus);
++ if (err != 2) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading PCI root %d", i);
++ goto out;
++ }
++
++ err = pcifront_rescan_root(pdev, domain, bus);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error scanning PCI root %04x:%02x",
++ domain, bus);
++ goto out;
++ }
++ }
++
++ xenbus_switch_state(pdev->xdev, XenbusStateConnected);
++
++out:
++ spin_unlock(&pdev->dev_lock);
++ return err;
++}
++
++static int pcifront_detach_devices(struct pcifront_device *pdev)
++{
++ int err = 0;
++ int i, num_devs;
++ unsigned int domain, bus, slot, func;
++ struct pci_bus *pci_bus;
++ struct pci_dev *pci_dev;
++ char str[64];
++
++ spin_lock(&pdev->dev_lock);
++
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateConnected)
++ goto out;
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d",
++ &num_devs);
++ if (err != 1) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of PCI devices");
++ goto out;
++ }
++
++ /* Find devices being detached and remove them. */
++ for (i = 0; i < num_devs; i++) {
++ int l, state;
++ l = snprintf(str, sizeof(str), "state-%d", i);
++ if (unlikely(l >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, "%d",
++ &state);
++ if (err != 1)
++ state = XenbusStateUnknown;
++
++ if (state != XenbusStateClosing)
++ continue;
++
++ /* Remove device. */
++ l = snprintf(str, sizeof(str), "vdev-%d", i);
++ if (unlikely(l >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
++ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
++ if (err != 4) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading PCI device %d", i);
++ goto out;
++ }
++
++ pci_bus = pci_find_bus(domain, bus);
++ if (!pci_bus) {
++ dev_dbg(&pdev->xdev->dev, "Cannot get bus %04x:%02x\n",
++ domain, bus);
++ continue;
++ }
++ pci_dev = pci_get_slot(pci_bus, PCI_DEVFN(slot, func));
++ if (!pci_dev) {
++ dev_dbg(&pdev->xdev->dev,
++ "Cannot get PCI device %04x:%02x:%02x.%02x\n",
++ domain, bus, slot, func);
++ continue;
++ }
++ pci_remove_bus_device(pci_dev);
++ pci_dev_put(pci_dev);
++
++ dev_dbg(&pdev->xdev->dev,
++ "PCI device %04x:%02x:%02x.%02x removed.\n",
++ domain, bus, slot, func);
++ }
++
++ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring);
++
++out:
++ spin_unlock(&pdev->dev_lock);
++ return err;
++}
++
++static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev,
++ enum xenbus_state be_state)
++{
++ struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
++
++ switch (be_state) {
++ case XenbusStateUnknown:
++ case XenbusStateInitialising:
++ case XenbusStateInitWait:
++ case XenbusStateInitialised:
++ case XenbusStateClosed:
++ break;
++
++ case XenbusStateConnected:
++ pcifront_try_connect(pdev);
++ break;
++
++ case XenbusStateClosing:
++ dev_warn(&xdev->dev, "backend going away!\n");
++ pcifront_try_disconnect(pdev);
++ break;
++
++ case XenbusStateReconfiguring:
++ pcifront_detach_devices(pdev);
++ break;
++
++ case XenbusStateReconfigured:
++ pcifront_attach_devices(pdev);
++ break;
++ }
++}
++
++static int pcifront_xenbus_probe(struct xenbus_device *xdev,
++ const struct xenbus_device_id *id)
++{
++ int err = 0;
++ struct pcifront_device *pdev = alloc_pdev(xdev);
++
++ if (pdev == NULL) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(xdev, err,
++ "Error allocating pcifront_device struct");
++ goto out;
++ }
++
++ err = pcifront_publish_info(pdev);
++
++out:
++ return err;
++}
++
++static int pcifront_xenbus_remove(struct xenbus_device *xdev)
++{
++ struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
++
++ if (pdev)
++ free_pdev(pdev);
++
++ return 0;
++}
++
++static const struct xenbus_device_id xenpci_ids[] = {
++ {"pci"},
++ {""},
++};
++
++static struct xenbus_driver xenbus_pcifront_driver = {
++ .name = "pcifront",
++ .owner = THIS_MODULE,
++ .ids = xenpci_ids,
++ .probe = pcifront_xenbus_probe,
++ .remove = pcifront_xenbus_remove,
++ .otherend_changed = pcifront_backend_changed,
++};
++
++static int __init pcifront_init(void)
++{
++ if (!xen_domain())
++ return -ENODEV;
++
++ pci_frontend_registrar(1 /* enable */);
++
++ return xenbus_register_frontend(&xenbus_pcifront_driver);
++}
++
++static void __exit pcifront_cleanup(void)
++{
++ xenbus_unregister_driver(&xenbus_pcifront_driver);
++ pci_frontend_registrar(0 /* disable */);
++}
++module_init(pcifront_init);
++module_exit(pcifront_cleanup);
++
++MODULE_DESCRIPTION("Xen PCI passthrough frontend.");
++MODULE_LICENSE("GPL");
++MODULE_ALIAS("xen:pci");
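For reference, pcifront_try_connect() and pcifront_attach_devices() above read the backend's root list from xenstore: a root_num count followed by root-0, root-1, ... entries holding "domain:bus" in hex, parsed with a "%x:%x" format. The stand-alone snippet below walks through that parse on a sample value; the sample string is an assumption, everything else mirrors the format used above.

#include <stdio.h>

int main(void)
{
	const char *root = "0000:00";	/* sample root-0 value: domain 0, bus 0 */
	unsigned int domain, bus;

	/* mirrors the err != 2 check in pcifront_try_connect() */
	if (sscanf(root, "%x:%x", &domain, &bus) != 2)
		return 1;

	printf("PCI Frontend Bus %04x:%02x\n", domain, bus);
	return 0;
}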
+diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
+index 188e1ba..efac9e3 100644
+--- a/drivers/video/Kconfig
++++ b/drivers/video/Kconfig
+@@ -2063,6 +2063,7 @@ config XEN_FBDEV_FRONTEND
+ select FB_SYS_IMAGEBLIT
+ select FB_SYS_FOPS
+ select FB_DEFERRED_IO
++ select XEN_XENBUS_FRONTEND
+ default y
+ help
+ This driver implements the front-end of the Xen virtual
+diff --git a/drivers/video/broadsheetfb.c b/drivers/video/broadsheetfb.c
+index 509cb92..df9ccb9 100644
+--- a/drivers/video/broadsheetfb.c
++++ b/drivers/video/broadsheetfb.c
+@@ -470,7 +470,7 @@ static int __devinit broadsheetfb_probe(struct platform_device *dev)
+ par->read_reg = broadsheet_read_reg;
+ init_waitqueue_head(&par->waitq);
+
+- info->flags = FBINFO_FLAG_DEFAULT;
++ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
+
+ info->fbdefio = &broadsheetfb_defio;
+ fb_deferred_io_init(info);
+diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
+index c27ab1e..94414fc 100644
+--- a/drivers/video/fb_defio.c
++++ b/drivers/video/fb_defio.c
+@@ -144,7 +144,9 @@ static const struct address_space_operations fb_deferred_io_aops = {
+ static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
+ {
+ vma->vm_ops = &fb_deferred_io_vm_ops;
+- vma->vm_flags |= ( VM_IO | VM_RESERVED | VM_DONTEXPAND );
++ vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND );
++ if (!(info->flags & FBINFO_VIRTFB))
++ vma->vm_flags |= VM_IO;
+ vma->vm_private_data = info;
+ return 0;
+ }
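The hunk above makes VM_IO conditional: only framebuffers that are not plain system memory keep it. A driver whose buffer is CPU-addressable opts out by setting FBINFO_VIRTFB before fb_deferred_io_init(), exactly as the broadsheetfb, hecubafb, metronomefb and xen-fbfront hunks below do. A minimal sketch, with the driver name and delay value invented for illustration:

#include <linux/fb.h>
#include <linux/list.h>

static void examplefb_deferred_io(struct fb_info *info,
				  struct list_head *pagelist)
{
	/* push the dirty pages in pagelist out to the (hypothetical) device */
}

static struct fb_deferred_io examplefb_defio = {
	.delay		= HZ / 10,		/* assumed refresh interval */
	.deferred_io	= examplefb_deferred_io,
};

static void examplefb_init_defio(struct fb_info *info)
{
	/* FBINFO_VIRTFB: fb_deferred_io_mmap() will not set VM_IO on mmaps */
	info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
	info->fbdefio = &examplefb_defio;
	fb_deferred_io_init(info);
}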
+diff --git a/drivers/video/hecubafb.c b/drivers/video/hecubafb.c
+index 0b4bffb..f9d77ad 100644
+--- a/drivers/video/hecubafb.c
++++ b/drivers/video/hecubafb.c
+@@ -253,7 +253,7 @@ static int __devinit hecubafb_probe(struct platform_device *dev)
+ par->send_command = apollo_send_command;
+ par->send_data = apollo_send_data;
+
+- info->flags = FBINFO_FLAG_DEFAULT;
++ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
+
+ info->fbdefio = &hecubafb_defio;
+ fb_deferred_io_init(info);
+diff --git a/drivers/video/metronomefb.c b/drivers/video/metronomefb.c
+index df1f757..661bfd2 100644
+--- a/drivers/video/metronomefb.c
++++ b/drivers/video/metronomefb.c
+@@ -700,7 +700,7 @@ static int __devinit metronomefb_probe(struct platform_device *dev)
+ if (retval < 0)
+ goto err_free_irq;
+
+- info->flags = FBINFO_FLAG_DEFAULT;
++ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
+
+ info->fbdefio = &metronomefb_defio;
+ fb_deferred_io_init(info);
+diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c
+index 54cd916..7ec2c90 100644
+--- a/drivers/video/xen-fbfront.c
++++ b/drivers/video/xen-fbfront.c
+@@ -25,7 +25,10 @@
+ #include <linux/module.h>
+ #include <linux/vmalloc.h>
+ #include <linux/mm.h>
++
+ #include <asm/xen/hypervisor.h>
++
++#include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+ #include <xen/interface/io/fbif.h>
+@@ -440,7 +443,7 @@ static int __devinit xenfb_probe(struct xenbus_device *dev,
+ fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
+ fb_info->fix.accel = FB_ACCEL_NONE;
+
+- fb_info->flags = FBINFO_FLAG_DEFAULT;
++ fb_info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
+
+ ret = fb_alloc_cmap(&fb_info->cmap, 256, 0);
+ if (ret < 0) {
+@@ -627,6 +630,8 @@ static void xenfb_backend_changed(struct xenbus_device *dev,
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
++ case XenbusStateReconfiguring:
++ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
+diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
+index cab100a..c4f36b7 100644
+--- a/drivers/xen/Kconfig
++++ b/drivers/xen/Kconfig
+@@ -28,6 +28,110 @@ config XEN_DEV_EVTCHN
+ firing.
+ If in doubt, say yes.
+
++config XEN_BACKEND
++ bool "Backend driver support"
++ depends on XEN_DOM0
++ default y
++ help
++ Support for backend device drivers that provide I/O services
++ to other virtual machines.
++
++config XEN_NETDEV_BACKEND
++ tristate "Xen backend network device"
++ depends on XEN_BACKEND && NET
++ help
++ Implement the network backend driver, which passes packets
++ from the guest domain's frontend drivers to the network.
++
++config XEN_BLKDEV_BACKEND
++ tristate "Block-device backend driver"
++ depends on XEN_BACKEND && BLOCK
++ help
++ The block-device backend driver allows the kernel to export its
++ block devices to other guests via a high-performance shared-memory
++ interface.
++
++
++config XEN_BLKDEV_TAP
++ tristate "Block-device tap backend driver"
++ depends on XEN_BACKEND && BLOCK
++ help
++ The block tap driver is an alternative to the block back driver
++ and allows VM block requests to be redirected to userspace through
++ a device interface. The tap allows user-space development of
++ high-performance block backends, where disk images may be implemented
++ as files, in memory, or on other hosts across the network. This
++ driver can safely coexist with the existing blockback driver.
++
++config XEN_BLKBACK_PAGEMAP
++ tristate
++ depends on XEN_BLKDEV_BACKEND != n && XEN_BLKDEV_TAP != n
++ default XEN_BLKDEV_BACKEND || XEN_BLKDEV_TAP
++
++config XEN_PCIDEV_BACKEND
++ tristate "PCI-device backend driver"
++ depends on PCI && XEN_BACKEND
++ default XEN_BACKEND
++ help
++ The PCI device backend driver allows the kernel to export arbitrary
++ PCI devices to other guests. If you select this to be a module, you
++ will need to make sure no other driver has bound to the device(s)
++ you want to make visible to other guests.
++
++choice
++ prompt "PCI Backend Mode"
++ depends on XEN_PCIDEV_BACKEND
++ default XEN_PCIDEV_BACKEND_VPCI if !IA64
++ default XEN_PCIDEV_BACKEND_CONTROLLER if IA64
++
++config XEN_PCIDEV_BACKEND_VPCI
++ bool "Virtual PCI"
++ ---help---
++ This PCI Backend hides the true PCI topology and makes the frontend
++ think there is a single PCI bus with only the exported devices on it.
++ For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
++ second device at 02:1a.1 will be re-assigned to 00:01.1.
++
++config XEN_PCIDEV_BACKEND_PASS
++ bool "Passthrough"
++ ---help---
++ This PCI Backend provides a real view of the PCI topology to the
++ frontend (for example, a device at 06:01.b will still appear at
++ 06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
++ PCI devices to its driver domains. This may be required for drivers
++ which depend on finding their hardware in certain bus/slot
++ locations.
++
++config XEN_PCIDEV_BACKEND_SLOT
++ bool "Slot"
++ ---help---
++ This PCI Backend hides the true PCI topology and makes the frontend
++ think there is a single PCI bus with only the exported devices on it.
++ Unlike the virtual PCI backend, each function becomes a new slot.
++ For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
++ second device at 02:1a.1 will be re-assigned to 00:01.0.
++
++config XEN_PCIDEV_BACKEND_CONTROLLER
++ bool "Controller"
++ depends on IA64
++ ---help---
++ This PCI backend virtualizes the PCI bus topology by providing a
++ virtual bus per PCI root device. Devices which are physically under
++ the same root bus will appear on the same virtual bus. For systems
++ with complex I/O addressing, this is the only backend which supports
++ extended I/O port spaces and MMIO translation offsets. This backend
++ also supports slot virtualization. For example, a device at
++ 0000:01:02.1 will be re-assigned to 0000:00:00.0. A second device
++ at 0000:02:05.0 (behind a P2P bridge on bus 0000:01) will be
++ re-assigned to 0000:00:01.0. A third device at 0000:16:05.0 (under
++ a different PCI root bus) will be re-assigned to 0000:01:00.0.
++
++endchoice
++
++config XEN_PCIDEV_BE_DEBUG
++ bool "PCI Backend Debugging"
++ depends on XEN_PCIDEV_BACKEND
++
+ config XENFS
+ tristate "Xen filesystem"
+ depends on XEN
+@@ -60,4 +164,14 @@ config XEN_SYS_HYPERVISOR
+ Create entries under /sys/hypervisor describing the Xen
+ hypervisor environment. When running native or in another
+ virtual environment, /sys/hypervisor will still be present,
+- but will have no xen contents.
+\ No newline at end of file
++ but will have no xen contents.
++
++config XEN_XENBUS_FRONTEND
++ tristate
++
++config XEN_GNTDEV
++ tristate "userspace grant access device driver"
++ depends on XEN
++ select MMU_NOTIFIER
++ help
++ Allows userspace processes to use grants.
+diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
+index 7c28434..c5f71db 100644
+--- a/drivers/xen/Makefile
++++ b/drivers/xen/Makefile
+@@ -1,12 +1,21 @@
+-obj-y += grant-table.o features.o events.o manage.o
++obj-y += grant-table.o features.o events.o manage.o biomerge.o
+ obj-y += xenbus/
+
+ nostackp := $(call cc-option, -fno-stack-protector)
+ CFLAGS_features.o := $(nostackp)
+
+-obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
+-obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
+-obj-$(CONFIG_XEN_BALLOON) += balloon.o
+-obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
+-obj-$(CONFIG_XENFS) += xenfs/
+-obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
+\ No newline at end of file
++obj-$(CONFIG_PCI) += pci.o
++obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
++obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
++obj-$(CONFIG_XEN_BALLOON) += balloon.o
++obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
++obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/
++obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
++obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
++obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
++obj-$(CONFIG_XENFS) += xenfs/
++obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
++
++xen-evtchn-y := evtchn.o
++xen-gntdev-y := gntdev.o
+diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
+index 4204336..d7c0eae 100644
+--- a/drivers/xen/balloon.c
++++ b/drivers/xen/balloon.c
+@@ -43,6 +43,7 @@
+ #include <linux/mutex.h>
+ #include <linux/list.h>
+ #include <linux/sysdev.h>
++#include <linux/swap.h>
+
+ #include <asm/page.h>
+ #include <asm/pgalloc.h>
+@@ -52,13 +53,15 @@
+
+ #include <asm/xen/hypervisor.h>
+ #include <asm/xen/hypercall.h>
++
++#include <xen/xen.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/memory.h>
+ #include <xen/xenbus.h>
+ #include <xen/features.h>
+ #include <xen/page.h>
+
+-#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
++#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT+balloon_order-10))
+
+ #define BALLOON_CLASS_NAME "xen_memory"
+
+@@ -82,14 +85,15 @@ static struct sys_device balloon_sysdev;
+
+ static int register_balloon(struct sys_device *sysdev);
+
++static struct balloon_stats balloon_stats;
++
+ /*
+- * Protects atomic reservation decrease/increase against concurrent increases.
+- * Also protects non-atomic updates of current_pages and driver_pages, and
+- * balloon lists.
++ * Work in pages of this order. Can be either 0 for normal pages
++ * or 9 for hugepages.
+ */
+-static DEFINE_SPINLOCK(balloon_lock);
+-
+-static struct balloon_stats balloon_stats;
++static int balloon_order;
++static unsigned long balloon_npages;
++static unsigned long discontig_frame_list[PAGE_SIZE / sizeof(unsigned long)];
+
+ /* We increase/decrease in batches which fit in a page */
+ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
+@@ -118,10 +122,41 @@ static struct timer_list balloon_timer;
+ static void scrub_page(struct page *page)
+ {
+ #ifdef CONFIG_XEN_SCRUB_PAGES
+- clear_highpage(page);
++ int i;
++
++ for (i = 0; i < balloon_npages; i++)
++ clear_highpage(page++);
+ #endif
+ }
+
++static void free_discontig_frame(void)
++{
++ int rc;
++ struct xen_memory_reservation reservation = {
++ .address_bits = 0,
++ .domid = DOMID_SELF,
++ .nr_extents = balloon_npages,
++ .extent_order = 0
++ };
++
++ set_xen_guest_handle(reservation.extent_start, discontig_frame_list);
++ rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
++ BUG_ON(rc != balloon_npages);
++}
++
++static unsigned long shrink_frame(unsigned long nr_pages)
++{
++ unsigned long i, j;
++
++ for (i = 0, j = 0; i < nr_pages; i++, j++) {
++ if (frame_list[i] == 0)
++ j++;
++ if (i != j)
++ frame_list[i] = frame_list[j];
++ }
++ return i;
++}
++
+ /* balloon_append: add the given page to the balloon. */
+ static void balloon_append(struct page *page)
+ {
+@@ -195,19 +230,18 @@ static unsigned long current_target(void)
+
+ static int increase_reservation(unsigned long nr_pages)
+ {
+- unsigned long pfn, i, flags;
++ unsigned long pfn, mfn, i, j, flags;
+ struct page *page;
+ long rc;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+- .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
+
+- spin_lock_irqsave(&balloon_lock, flags);
++ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ page = balloon_first_page();
+ for (i = 0; i < nr_pages; i++) {
+@@ -218,6 +252,8 @@ static int increase_reservation(unsigned long nr_pages)
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
++ reservation.extent_order = balloon_order;
++
+ rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+ if (rc < 0)
+ goto out;
+@@ -227,19 +263,22 @@ static int increase_reservation(unsigned long nr_pages)
+ BUG_ON(page == NULL);
+
+ pfn = page_to_pfn(page);
++ mfn = frame_list[i];
+ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
+ phys_to_machine_mapping_valid(pfn));
+
+- set_phys_to_machine(pfn, frame_list[i]);
+-
+- /* Link back into the page tables if not highmem. */
+- if (pfn < max_low_pfn) {
+- int ret;
+- ret = HYPERVISOR_update_va_mapping(
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- mfn_pte(frame_list[i], PAGE_KERNEL),
+- 0);
+- BUG_ON(ret);
++ for (j = 0; j < balloon_npages; j++, pfn++, mfn++) {
++ set_phys_to_machine(pfn, mfn);
++
++ /* Link back into the page tables if not highmem. */
++ if (pfn < max_low_pfn) {
++ int ret;
++ ret = HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(pfn << PAGE_SHIFT),
++ mfn_pte(mfn, PAGE_KERNEL),
++ 0);
++ BUG_ON(ret);
++ }
+ }
+
+ /* Relinquish the page back to the allocator. */
+@@ -251,20 +290,20 @@ static int increase_reservation(unsigned long nr_pages)
+ balloon_stats.current_pages += rc;
+
+ out:
+- spin_unlock_irqrestore(&balloon_lock, flags);
++ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ return rc < 0 ? rc : rc != nr_pages;
+ }
+
+ static int decrease_reservation(unsigned long nr_pages)
+ {
+- unsigned long pfn, i, flags;
++ unsigned long pfn, lpfn, mfn, i, j, flags;
+ struct page *page;
+ int need_sleep = 0;
+- int ret;
++ int discontig = 0, discontig_free;
++ int ret;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+- .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+@@ -272,7 +311,7 @@ static int decrease_reservation(unsigned long nr_pages)
+ nr_pages = ARRAY_SIZE(frame_list);
+
+ for (i = 0; i < nr_pages; i++) {
+- if ((page = alloc_page(GFP_BALLOON)) == NULL) {
++ if ((page = alloc_pages(GFP_BALLOON, balloon_order)) == NULL) {
+ nr_pages = i;
+ need_sleep = 1;
+ break;
+@@ -282,37 +321,50 @@ static int decrease_reservation(unsigned long nr_pages)
+ frame_list[i] = pfn_to_mfn(pfn);
+
+ scrub_page(page);
+-
+- if (!PageHighMem(page)) {
+- ret = HYPERVISOR_update_va_mapping(
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- __pte_ma(0), 0);
+- BUG_ON(ret);
+- }
+-
+ }
+
+ /* Ensure that ballooned highmem pages don't have kmaps. */
+ kmap_flush_unused();
+ flush_tlb_all();
+
+- spin_lock_irqsave(&balloon_lock, flags);
++ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* No more mappings: invalidate P2M and add to balloon. */
+ for (i = 0; i < nr_pages; i++) {
+- pfn = mfn_to_pfn(frame_list[i]);
+- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++ mfn = frame_list[i];
++ lpfn = pfn = mfn_to_pfn(mfn);
+ balloon_append(pfn_to_page(pfn));
++ discontig_free = 0;
++ for (j = 0; j < balloon_npages; j++, lpfn++, mfn++) {
++ if ((discontig_frame_list[j] = pfn_to_mfn(lpfn)) != mfn)
++ discontig_free = 1;
++
++ set_phys_to_machine(lpfn, INVALID_P2M_ENTRY);
++ if (!PageHighMem(page)) {
++ ret = HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(lpfn << PAGE_SHIFT),
++ __pte_ma(0), 0);
++ BUG_ON(ret);
++ }
++ }
++ if (discontig_free) {
++ free_discontig_frame();
++ frame_list[i] = 0;
++ discontig = 1;
++ }
+ }
++ balloon_stats.current_pages -= nr_pages;
++
++ if (discontig)
++ nr_pages = shrink_frame(nr_pages);
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
++ reservation.extent_order = balloon_order;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+ BUG_ON(ret != nr_pages);
+
+- balloon_stats.current_pages -= nr_pages;
+-
+- spin_unlock_irqrestore(&balloon_lock, flags);
++ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ return need_sleep;
+ }
+@@ -379,7 +431,7 @@ static void watch_target(struct xenbus_watch *watch,
+ /* The given memory/target value is in KiB, so it needs converting to
+ * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
+ */
+- balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
++ balloon_set_new_target(new_target >> ((PAGE_SHIFT - 10) + balloon_order));
+ }
+
+ static int balloon_init_watcher(struct notifier_block *notifier,
+@@ -405,9 +457,12 @@ static int __init balloon_init(void)
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+- pr_info("xen_balloon: Initialising balloon driver.\n");
++ pr_info("xen_balloon: Initialising balloon driver with page order %d.\n",
++ balloon_order);
++
++ balloon_npages = 1 << balloon_order;
+
+- balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
++ balloon_stats.current_pages = (min(xen_start_info->nr_pages, max_pfn)) >> balloon_order;
+ balloon_stats.target_pages = balloon_stats.current_pages;
+ balloon_stats.balloon_low = 0;
+ balloon_stats.balloon_high = 0;
+@@ -420,7 +475,7 @@ static int __init balloon_init(void)
+ register_balloon(&balloon_sysdev);
+
+ /* Initialise the balloon with excess memory space. */
+- for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
++ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn += balloon_npages) {
+ page = pfn_to_page(pfn);
+ if (!PageReserved(page))
+ balloon_append(page);
+@@ -444,6 +499,121 @@ static void balloon_exit(void)
+
+ module_exit(balloon_exit);
+
++static int __init balloon_parse_huge(char *s)
++{
++ balloon_order = 9;
++ return 1;
++}
++
++__setup("balloon_hugepages", balloon_parse_huge);
++
++static int dealloc_pte_fn(pte_t *pte, struct page *pmd_page,
++ unsigned long addr, void *data)
++{
++ unsigned long mfn = pte_mfn(*pte);
++ int ret;
++ struct xen_memory_reservation reservation = {
++ .nr_extents = 1,
++ .extent_order = 0,
++ .domid = DOMID_SELF
++ };
++
++ set_xen_guest_handle(reservation.extent_start, &mfn);
++ set_pte_at(&init_mm, addr, pte, __pte_ma(0));
++ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
++
++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
++ BUG_ON(ret != 1);
++
++ return 0;
++}
++
++struct page **alloc_empty_pages_and_pagevec(int nr_pages)
++{
++ struct page *page, **pagevec;
++ int npages;
++ int i, j, ret;
++
++ /* Round up to next number of balloon_order pages */
++ npages = (nr_pages + (balloon_npages-1)) >> balloon_order;
++
++ pagevec = kmalloc(sizeof(page) * nr_pages << balloon_order, GFP_KERNEL);
++ if (pagevec == NULL)
++ return NULL;
++
++ for (i = 0; i < nr_pages; i++) {
++ void *v;
++
++ page = alloc_pages(GFP_KERNEL|__GFP_COLD, balloon_order);
++ if (page == NULL)
++ goto err;
++
++ scrub_page(page);
++
++ mutex_lock(&balloon_mutex);
++
++ v = page_address(page);
++
++ ret = apply_to_page_range(&init_mm, (unsigned long)v,
++ PAGE_SIZE << balloon_order,
++ dealloc_pte_fn, NULL);
++
++ if (ret != 0) {
++ mutex_unlock(&balloon_mutex);
++ //balloon_free_page(page); /* tries to use free_cold_page */
++ __free_page(page);
++ goto err;
++ }
++ for (j = 0; j < balloon_npages; j++)
++ pagevec[(i<<balloon_order)+j] = page++;
++
++ totalram_pages = balloon_stats.current_pages -= balloon_npages;
++
++ mutex_unlock(&balloon_mutex);
++ }
++
++ out:
++ schedule_work(&balloon_worker);
++ flush_tlb_all();
++ return pagevec;
++
++ err:
++ mutex_lock(&balloon_mutex);
++ while (--i >= 0)
++ balloon_append(pagevec[i << balloon_order]);
++ mutex_unlock(&balloon_mutex);
++ kfree(pagevec);
++ pagevec = NULL;
++ goto out;
++}
++EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
++
++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
++{
++ struct page *page;
++ int i;
++ int npages;
++
++ if (pagevec == NULL)
++ return;
++
++ /* Round up to next number of balloon_order pages */
++ npages = (nr_pages + (balloon_npages-1)) >> balloon_order;
++
++ mutex_lock(&balloon_mutex);
++ for (i = 0; i < nr_pages; i++) {
++ page = pagevec[i << balloon_order];
++ BUG_ON(page_count(page) != 1);
++ balloon_append(page);
++ }
++ mutex_unlock(&balloon_mutex);
++
++ kfree(pagevec);
++
++ schedule_work(&balloon_worker);
++}
++EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
++
+ #define BALLOON_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct sys_device *dev, \
+ struct sysdev_attribute *attr, \
+@@ -477,7 +647,7 @@ static ssize_t store_target_kb(struct sys_device *dev,
+
+ target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;
+
+- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
++ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
+
+ return count;
+ }
+@@ -491,7 +661,7 @@ static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr
+ {
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)balloon_stats.target_pages
+- << PAGE_SHIFT);
++ << (PAGE_SHIFT + balloon_order));
+ }
+
+ static ssize_t store_target(struct sys_device *dev,
+@@ -507,7 +677,7 @@ static ssize_t store_target(struct sys_device *dev,
+
+ target_bytes = memparse(buf, &endchar);
+
+- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
++ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
+
+ return count;
+ }
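With the balloon now working in pages of order balloon_order, the external interfaces keep their old units (KiB in xenstore, bytes in sysfs) and only the internal page count changes; the shifts above fold the order into the existing PAGE_SHIFT arithmetic. A small stand-alone check of that arithmetic, assuming x86 with 4 KiB base pages and the order-9 value set by balloon_hugepages:

#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed: x86, 4 KiB base pages */

int main(void)
{
	unsigned long balloon_order = 9;	/* balloon_hugepages */
	unsigned long new_target_kib = 1048576;	/* memory/target in KiB (1 GiB) */

	/* Same formula as watch_target(): KiB -> pages of order balloon_order */
	unsigned long target_pages =
		new_target_kib >> ((PAGE_SHIFT - 10) + balloon_order);

	/* PAGES2KB() in reverse for reporting through sysfs */
	unsigned long back_to_kib =
		target_pages << (PAGE_SHIFT + balloon_order - 10);

	/* Prints: target_pages = 512, reported KiB = 1048576 */
	printf("target_pages = %lu, reported KiB = %lu\n",
	       target_pages, back_to_kib);
	return 0;
}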
+diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
+new file mode 100644
+index 0000000..d40f534
+--- /dev/null
++++ b/drivers/xen/biomerge.c
+@@ -0,0 +1,14 @@
++#include <linux/bio.h>
++#include <asm/io.h>
++#include <xen/page.h>
++
++bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
++ const struct bio_vec *vec2)
++{
++ unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
++ unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
++
++ return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
++ ((mfn1 == mfn2) || ((mfn1+1) == mfn2));
++}
++
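xen_biovec_phys_mergeable() above tightens the generic merge test: under Xen, pseudo-physically contiguous pages need not be machine-contiguous, so two bio_vecs may only merge when their machine frames are identical or consecutive. A stand-alone illustration of just that MFN check, with made-up frame numbers:

#include <stdbool.h>
#include <stdio.h>

static bool mfns_mergeable(unsigned long mfn1, unsigned long mfn2)
{
	return (mfn1 == mfn2) || ((mfn1 + 1) == mfn2);
}

int main(void)
{
	/* Adjacent machine frames: may merge (prints 1). */
	printf("%d\n", mfns_mergeable(0x1000, 0x1001));
	/* Non-adjacent: pseudo-physical contiguity does not imply machine
	 * contiguity, so no merge (prints 0). */
	printf("%d\n", mfns_mergeable(0x1000, 0x1003));
	return 0;
}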
+diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile
+new file mode 100644
+index 0000000..dee55ba
+--- /dev/null
++++ b/drivers/xen/blkback/Makefile
+@@ -0,0 +1,4 @@
++obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
++obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o
++
++xen-blkback-y := blkback.o xenbus.o interface.o vbd.o
+diff --git a/drivers/xen/blkback/blkback-pagemap.c b/drivers/xen/blkback/blkback-pagemap.c
+new file mode 100644
+index 0000000..45f6eb2
+--- /dev/null
++++ b/drivers/xen/blkback/blkback-pagemap.c
+@@ -0,0 +1,109 @@
++#include <linux/module.h>
++#include "blkback-pagemap.h"
++
++static int blkback_pagemap_size;
++static struct blkback_pagemap *blkback_pagemap;
++
++static inline int
++blkback_pagemap_entry_clear(struct blkback_pagemap *map)
++{
++ static struct blkback_pagemap zero;
++ return !memcmp(map, &zero, sizeof(zero));
++}
++
++int
++blkback_pagemap_init(int pages)
++{
++ blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap),
++ GFP_KERNEL);
++ if (!blkback_pagemap)
++ return -ENOMEM;
++
++ blkback_pagemap_size = pages;
++ return 0;
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_init);
++
++void
++blkback_pagemap_set(int idx, struct page *page,
++ domid_t domid, busid_t busid, grant_ref_t gref)
++{
++ struct blkback_pagemap *entry;
++
++ BUG_ON(!blkback_pagemap);
++ BUG_ON(idx >= blkback_pagemap_size);
++
++ set_page_private(page, idx);
++
++ entry = blkback_pagemap + idx;
++ if (!blkback_pagemap_entry_clear(entry)) {
++ printk("overwriting pagemap %d: d %u b %u g %u\n",
++ idx, entry->domid, entry->busid, entry->gref);
++ BUG();
++ }
++
++ entry->page = page;
++ entry->domid = domid;
++ entry->busid = busid;
++ entry->gref = gref;
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_set);
++
++void
++blkback_pagemap_clear(struct page *page)
++{
++ int idx;
++ struct blkback_pagemap *entry;
++
++ idx = (int)page_private(page);
++
++ BUG_ON(!blkback_pagemap);
++ BUG_ON(idx >= blkback_pagemap_size);
++
++ entry = blkback_pagemap + idx;
++ if (blkback_pagemap_entry_clear(entry)) {
++ printk("clearing empty pagemap %d\n", idx);
++ BUG();
++ }
++
++ memset(entry, 0, sizeof(*entry));
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_clear);
++
++struct blkback_pagemap
++blkback_pagemap_read(struct page *page)
++{
++ int idx;
++ struct blkback_pagemap *entry;
++
++ idx = (int)page_private(page);
++
++ BUG_ON(!blkback_pagemap);
++ BUG_ON(idx >= blkback_pagemap_size);
++
++ entry = blkback_pagemap + idx;
++ if (blkback_pagemap_entry_clear(entry)) {
++ printk("reading empty pagemap %d\n", idx);
++ BUG();
++ }
++
++ return *entry;
++}
++EXPORT_SYMBOL(blkback_pagemap_read);
++
++MODULE_LICENSE("Dual BSD/GPL");
++
++int
++blkback_pagemap_contains_page(struct page *page)
++{
++ struct blkback_pagemap *entry;
++ int idx = (int)page_private(page);
++
++ if (idx < 0 || idx >= blkback_pagemap_size)
++ return 0;
++
++ entry = blkback_pagemap + idx;
++
++ return (entry->page == page);
++}
++EXPORT_SYMBOL(blkback_pagemap_contains_page);
+diff --git a/drivers/xen/blkback/blkback-pagemap.h b/drivers/xen/blkback/blkback-pagemap.h
+new file mode 100644
+index 0000000..7f97d15
+--- /dev/null
++++ b/drivers/xen/blkback/blkback-pagemap.h
+@@ -0,0 +1,36 @@
++#ifndef _BLKBACK_PAGEMAP_H_
++#define _BLKBACK_PAGEMAP_H_
++
++#include <linux/mm.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/grant_table.h>
++
++typedef unsigned int busid_t;
++
++struct blkback_pagemap {
++ struct page *page;
++ domid_t domid;
++ busid_t busid;
++ grant_ref_t gref;
++};
++
++#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE)
++
++int blkback_pagemap_init(int);
++void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t);
++void blkback_pagemap_clear(struct page *);
++struct blkback_pagemap blkback_pagemap_read(struct page *);
++int blkback_pagemap_contains_page(struct page *page);
++
++#else /* CONFIG_XEN_BLKBACK_PAGEMAP */
++
++static inline int blkback_pagemap_init(int pages) { return 0; }
++static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom,
++ busid_t bus, grant_ref_t gnt) {}
++static inline void blkback_pagemap_clear(struct page *page) {}
++#define blkback_pagemap_read(_page) ({ BUG(); (struct blkback_pagemap){0}; })
++static inline int blkback_pagemap_contains_page(struct page *page) { return 0; }
++
++#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */
++
++#endif
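The header above is the whole pagemap contract: blkback_pagemap_init() sizes the table once, blkback_pagemap_set() records the (domid, busid, gref) behind a granted page when it is mapped, blkback_pagemap_read() looks it up later, and blkback_pagemap_clear() drops the entry on unmap, which is how blkback.c below uses it in dispatch_rw_block_io() and fast_flush_area(). A minimal sketch of that life cycle; the wrapper function and the single-entry table size are invented for the example.

#include <linux/mm.h>
#include "blkback-pagemap.h"

static int example_track_one_page(struct page *page, domid_t domid,
				  busid_t busid, grant_ref_t gref)
{
	int ret = blkback_pagemap_init(1);	/* normally done once at init */
	if (ret)
		return ret;

	/* record who granted this page as soon as it is mapped */
	blkback_pagemap_set(0, page, domid, busid, gref);

	/* ... I/O happens; lookups go through blkback_pagemap_read(page) ... */

	/* drop the entry when the grant is unmapped */
	blkback_pagemap_clear(page);
	return 0;
}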
+diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c
+new file mode 100644
+index 0000000..e644dd5
+--- /dev/null
++++ b/drivers/xen/blkback/blkback.c
+@@ -0,0 +1,672 @@
++/******************************************************************************
++ * arch/xen/drivers/blkif/backend/main.c
++ *
++ * Back-end of the driver for virtual block devices. This portion of the
++ * driver exports a 'unified' block-device interface that can be accessed
++ * by any operating system that implements a compatible front end. A
++ * reference front-end implementation can be found in:
++ * arch/xen/drivers/blkif/frontend
++ *
++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
++ * Copyright (c) 2005, Christopher Clark
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/kthread.h>
++#include <linux/list.h>
++#include <linux/delay.h>
++#include <linux/freezer.h>
++
++#include <xen/balloon.h>
++#include <xen/events.h>
++#include <xen/page.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include "common.h"
++
++/*
++ * These are rather arbitrary. They are fairly large because adjacent requests
++ * pulled from a communication ring are quite likely to end up being part of
++ * the same scatter/gather request at the disc.
++ *
++ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
++ *
++ * This will increase the chances of being able to write whole tracks.
++ * 64 should be enough to keep us competitive with Linux.
++ */
++static int blkif_reqs = 64;
++module_param_named(reqs, blkif_reqs, int, 0);
++MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
++
++/* Run-time switchable: /sys/module/blkback/parameters/ */
++static unsigned int log_stats = 0;
++static unsigned int debug_lvl = 0;
++module_param(log_stats, int, 0644);
++module_param(debug_lvl, int, 0644);
++
++/*
++ * Each outstanding request that we've passed to the lower device layers has a
++ * 'pending_req' allocated to it. Each buffer_head that completes decrements
++ * the pendcnt towards zero. When it hits zero, the specified domain has a
++ * response queued for it, with the saved 'id' passed back.
++ */
++typedef struct {
++ blkif_t *blkif;
++ u64 id;
++ int nr_pages;
++ atomic_t pendcnt;
++ unsigned short operation;
++ int status;
++ struct list_head free_list;
++} pending_req_t;
++
++static pending_req_t *pending_reqs;
++static struct list_head pending_free;
++static DEFINE_SPINLOCK(pending_free_lock);
++static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
++
++#define BLKBACK_INVALID_HANDLE (~0)
++
++static struct page **pending_pages;
++static grant_handle_t *pending_grant_handles;
++
++static inline int vaddr_pagenr(pending_req_t *req, int seg)
++{
++ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
++}
++
++#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
++
++static inline unsigned long vaddr(pending_req_t *req, int seg)
++{
++ unsigned long pfn = page_to_pfn(pending_page(req, seg));
++ return (unsigned long)pfn_to_kaddr(pfn);
++}
++
++#define pending_handle(_req, _seg) \
++ (pending_grant_handles[vaddr_pagenr(_req, _seg)])
++
++
++static int do_block_io_op(blkif_t *blkif);
++static void dispatch_rw_block_io(blkif_t *blkif,
++ struct blkif_request *req,
++ pending_req_t *pending_req);
++static void make_response(blkif_t *blkif, u64 id,
++ unsigned short op, int st);
++
++/******************************************************************
++ * misc small helpers
++ */
++static pending_req_t* alloc_req(void)
++{
++ pending_req_t *req = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pending_free_lock, flags);
++ if (!list_empty(&pending_free)) {
++ req = list_entry(pending_free.next, pending_req_t, free_list);
++ list_del(&req->free_list);
++ }
++ spin_unlock_irqrestore(&pending_free_lock, flags);
++ return req;
++}
++
++static void free_req(pending_req_t *req)
++{
++ unsigned long flags;
++ int was_empty;
++
++ spin_lock_irqsave(&pending_free_lock, flags);
++ was_empty = list_empty(&pending_free);
++ list_add(&req->free_list, &pending_free);
++ spin_unlock_irqrestore(&pending_free_lock, flags);
++ if (was_empty)
++ wake_up(&pending_free_wq);
++}
++
++static void unplug_queue(blkif_t *blkif)
++{
++ if (blkif->plug == NULL)
++ return;
++ if (blkif->plug->unplug_fn)
++ blkif->plug->unplug_fn(blkif->plug);
++ blk_put_queue(blkif->plug);
++ blkif->plug = NULL;
++}
++
++static void plug_queue(blkif_t *blkif, struct block_device *bdev)
++{
++ struct request_queue *q = bdev_get_queue(bdev);
++
++ if (q == blkif->plug)
++ return;
++ unplug_queue(blkif);
++ blk_get_queue(q);
++ blkif->plug = q;
++}
++
++static void fast_flush_area(pending_req_t *req)
++{
++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ unsigned int i, invcount = 0;
++ grant_handle_t handle;
++ int ret;
++
++ for (i = 0; i < req->nr_pages; i++) {
++ handle = pending_handle(req, i);
++ if (handle == BLKBACK_INVALID_HANDLE)
++ continue;
++ blkback_pagemap_clear(pending_page(req, i));
++ gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
++ GNTMAP_host_map, handle);
++ pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
++ invcount++;
++ }
++
++ ret = HYPERVISOR_grant_table_op(
++ GNTTABOP_unmap_grant_ref, unmap, invcount);
++ BUG_ON(ret);
++}
++
++/******************************************************************
++ * SCHEDULER FUNCTIONS
++ */
++
++static void print_stats(blkif_t *blkif)
++{
++ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n",
++ current->comm, blkif->st_oo_req,
++ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
++ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
++ blkif->st_rd_req = 0;
++ blkif->st_wr_req = 0;
++ blkif->st_oo_req = 0;
++}
++
++int blkif_schedule(void *arg)
++{
++ blkif_t *blkif = arg;
++
++ blkif_get(blkif);
++
++ if (debug_lvl)
++ printk(KERN_DEBUG "%s: started\n", current->comm);
++
++ while (!kthread_should_stop()) {
++ if (try_to_freeze())
++ continue;
++
++ wait_event_interruptible(
++ blkif->wq,
++ blkif->waiting_reqs || kthread_should_stop());
++ wait_event_interruptible(
++ pending_free_wq,
++ !list_empty(&pending_free) || kthread_should_stop());
++
++ blkif->waiting_reqs = 0;
++ smp_mb(); /* clear flag *before* checking for work */
++
++ if (do_block_io_op(blkif))
++ blkif->waiting_reqs = 1;
++ unplug_queue(blkif);
++
++ if (log_stats && time_after(jiffies, blkif->st_print))
++ print_stats(blkif);
++ }
++
++ if (log_stats)
++ print_stats(blkif);
++ if (debug_lvl)
++ printk(KERN_DEBUG "%s: exiting\n", current->comm);
++
++ blkif->xenblkd = NULL;
++ blkif_put(blkif);
++
++ return 0;
++}
++
++/******************************************************************
++ * COMPLETION CALLBACK -- Called as bh->b_end_io()
++ */
++
++static void __end_block_io_op(pending_req_t *pending_req, int error)
++{
++ /* An error fails the entire request. */
++ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
++ (error == -EOPNOTSUPP)) {
++ DPRINTK("blkback: write barrier op failed, not supported\n");
++ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
++ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
++ } else if (error) {
++ DPRINTK("Buffer not up-to-date at end of operation, "
++ "error=%d\n", error);
++ pending_req->status = BLKIF_RSP_ERROR;
++ }
++
++ if (atomic_dec_and_test(&pending_req->pendcnt)) {
++ fast_flush_area(pending_req);
++ make_response(pending_req->blkif, pending_req->id,
++ pending_req->operation, pending_req->status);
++ blkif_put(pending_req->blkif);
++ free_req(pending_req);
++ }
++}
++
++static void end_block_io_op(struct bio *bio, int error)
++{
++ __end_block_io_op(bio->bi_private, error);
++ bio_put(bio);
++}
++
++
++/******************************************************************************
++ * NOTIFICATION FROM GUEST OS.
++ */
++
++static void blkif_notify_work(blkif_t *blkif)
++{
++ blkif->waiting_reqs = 1;
++ wake_up(&blkif->wq);
++}
++
++irqreturn_t blkif_be_int(int irq, void *dev_id)
++{
++ blkif_notify_work(dev_id);
++ return IRQ_HANDLED;
++}
++
++
++
++/******************************************************************
++ * DOWNWARD CALLS -- These interface with the block-device layer proper.
++ */
++
++static int do_block_io_op(blkif_t *blkif)
++{
++ union blkif_back_rings *blk_rings = &blkif->blk_rings;
++ struct blkif_request req;
++ pending_req_t *pending_req;
++ RING_IDX rc, rp;
++ int more_to_do = 0;
++
++ rc = blk_rings->common.req_cons;
++ rp = blk_rings->common.sring->req_prod;
++ rmb(); /* Ensure we see queued requests up to 'rp'. */
++
++ while (rc != rp) {
++
++ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
++ break;
++
++ if (kthread_should_stop()) {
++ more_to_do = 1;
++ break;
++ }
++
++ pending_req = alloc_req();
++ if (NULL == pending_req) {
++ blkif->st_oo_req++;
++ more_to_do = 1;
++ break;
++ }
++
++ switch (blkif->blk_protocol) {
++ case BLKIF_PROTOCOL_NATIVE:
++ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
++ break;
++ case BLKIF_PROTOCOL_X86_32:
++ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
++ break;
++ case BLKIF_PROTOCOL_X86_64:
++ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
++ break;
++ default:
++ BUG();
++ }
++ blk_rings->common.req_cons = ++rc; /* before make_response() */
++
++ /* Apply all sanity checks to /private copy/ of request. */
++ barrier();
++
++ switch (req.operation) {
++ case BLKIF_OP_READ:
++ blkif->st_rd_req++;
++ dispatch_rw_block_io(blkif, &req, pending_req);
++ break;
++ case BLKIF_OP_WRITE_BARRIER:
++ blkif->st_br_req++;
++ /* fall through */
++ case BLKIF_OP_WRITE:
++ blkif->st_wr_req++;
++ dispatch_rw_block_io(blkif, &req, pending_req);
++ break;
++ default:
++ /* A good sign something is wrong: sleep for a while to
++ * avoid excessive CPU consumption by a bad guest. */
++ msleep(1);
++ DPRINTK("error: unknown block io operation [%d]\n",
++ req.operation);
++ make_response(blkif, req.id, req.operation,
++ BLKIF_RSP_ERROR);
++ free_req(pending_req);
++ break;
++ }
++
++ /* Yield point for this unbounded loop. */
++ cond_resched();
++ }
++
++ return more_to_do;
++}
++
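++/*
++ * Map the request's grant references into the pending_req's pages,
++ * translate the offset through the VBD, and submit one or more bios
++ * covering the segments; completion is reported via end_block_io_op().
++ */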
++static void dispatch_rw_block_io(blkif_t *blkif,
++ struct blkif_request *req,
++ pending_req_t *pending_req)
++{
++ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ struct phys_req preq;
++ struct {
++ unsigned long buf; unsigned int nsec;
++ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ unsigned int nseg;
++ struct bio *bio = NULL;
++ int ret, i;
++ int operation;
++
++ switch (req->operation) {
++ case BLKIF_OP_READ:
++ operation = READ;
++ break;
++ case BLKIF_OP_WRITE:
++ operation = WRITE;
++ break;
++ case BLKIF_OP_WRITE_BARRIER:
++ operation = WRITE_BARRIER;
++ break;
++ default:
++ operation = 0; /* make gcc happy */
++ BUG();
++ }
++
++ /* Check that number of segments is sane. */
++ nseg = req->nr_segments;
++ if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
++ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
++ DPRINTK("Bad number of segments in request (%d)\n", nseg);
++ goto fail_response;
++ }
++
++ preq.dev = req->handle;
++ preq.sector_number = req->sector_number;
++ preq.nr_sects = 0;
++
++ pending_req->blkif = blkif;
++ pending_req->id = req->id;
++ pending_req->operation = req->operation;
++ pending_req->status = BLKIF_RSP_OKAY;
++ pending_req->nr_pages = nseg;
++
++ for (i = 0; i < nseg; i++) {
++ uint32_t flags;
++
++ seg[i].nsec = req->seg[i].last_sect -
++ req->seg[i].first_sect + 1;
++
++ if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
++ (req->seg[i].last_sect < req->seg[i].first_sect))
++ goto fail_response;
++ preq.nr_sects += seg[i].nsec;
++
++ flags = GNTMAP_host_map;
++ if (operation != READ)
++ flags |= GNTMAP_readonly;
++ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
++ req->seg[i].gref, blkif->domid);
++ }
++
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
++ BUG_ON(ret);
++
++ for (i = 0; i < nseg; i++) {
++ if (unlikely(map[i].status != 0)) {
++ DPRINTK("invalid buffer -- could not remap it\n");
++ map[i].handle = BLKBACK_INVALID_HANDLE;
++ ret |= 1;
++ continue;
++ }
++
++ set_phys_to_machine(
++ page_to_pfn(pending_page(pending_req, i)),
++ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
++ seg[i].buf = map[i].dev_bus_addr |
++ (req->seg[i].first_sect << 9);
++ blkback_pagemap_set(vaddr_pagenr(pending_req, i),
++ pending_page(pending_req, i),
++ blkif->domid, req->handle,
++ req->seg[i].gref);
++ pending_handle(pending_req, i) = map[i].handle;
++ }
++
++ if (ret)
++ goto fail_flush;
++
++ if (vbd_translate(&preq, blkif, operation) != 0) {
++ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
++ operation == READ ? "read" : "write",
++ preq.sector_number,
++ preq.sector_number + preq.nr_sects, preq.dev);
++ goto fail_flush;
++ }
++
++ plug_queue(blkif, preq.bdev);
++ atomic_set(&pending_req->pendcnt, 1);
++ blkif_get(blkif);
++
++ for (i = 0; i < nseg; i++) {
++ if (((int)preq.sector_number|(int)seg[i].nsec) &
++ ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
++ DPRINTK("Misaligned I/O request from domain %d",
++ blkif->domid);
++ goto fail_put_bio;
++ }
++
++ while ((bio == NULL) ||
++ (bio_add_page(bio,
++ pending_page(pending_req, i),
++ seg[i].nsec << 9,
++ seg[i].buf & ~PAGE_MASK) == 0)) {
++ if (bio) {
++ atomic_inc(&pending_req->pendcnt);
++ submit_bio(operation, bio);
++ }
++
++ bio = bio_alloc(GFP_KERNEL, nseg-i);
++ if (unlikely(bio == NULL))
++ goto fail_put_bio;
++
++ bio->bi_bdev = preq.bdev;
++ bio->bi_private = pending_req;
++ bio->bi_end_io = end_block_io_op;
++ bio->bi_sector = preq.sector_number;
++ }
++
++ preq.sector_number += seg[i].nsec;
++ }
++
++ if (!bio) {
++ BUG_ON(operation != WRITE_BARRIER);
++ bio = bio_alloc(GFP_KERNEL, 0);
++ if (unlikely(bio == NULL))
++ goto fail_put_bio;
++
++ bio->bi_bdev = preq.bdev;
++ bio->bi_private = pending_req;
++ bio->bi_end_io = end_block_io_op;
++ bio->bi_sector = -1;
++ }
++
++ submit_bio(operation, bio);
++
++ if (operation == READ)
++ blkif->st_rd_sect += preq.nr_sects;
++ else if (operation == WRITE || operation == WRITE_BARRIER)
++ blkif->st_wr_sect += preq.nr_sects;
++
++ return;
++
++ fail_flush:
++ fast_flush_area(pending_req);
++ fail_response:
++ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
++ free_req(pending_req);
++ msleep(1); /* back off a bit */
++ return;
++
++ fail_put_bio:
++ __end_block_io_op(pending_req, -EINVAL);
++ if (bio)
++ bio_put(bio);
++ unplug_queue(blkif);
++ msleep(1); /* back off a bit */
++ return;
++}
++
++
++
++/******************************************************************
++ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
++ */
++
++
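++/*
++ * Queue a response on the shared ring in the negotiated protocol layout
++ * and notify the frontend over the event channel when required.
++ */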
++static void make_response(blkif_t *blkif, u64 id,
++ unsigned short op, int st)
++{
++ struct blkif_response resp;
++ unsigned long flags;
++ union blkif_back_rings *blk_rings = &blkif->blk_rings;
++ int more_to_do = 0;
++ int notify;
++
++ resp.id = id;
++ resp.operation = op;
++ resp.status = st;
++
++ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
++ /* Place on the response ring for the relevant domain. */
++ switch (blkif->blk_protocol) {
++ case BLKIF_PROTOCOL_NATIVE:
++ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
++ &resp, sizeof(resp));
++ break;
++ case BLKIF_PROTOCOL_X86_32:
++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
++ &resp, sizeof(resp));
++ break;
++ case BLKIF_PROTOCOL_X86_64:
++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
++ &resp, sizeof(resp));
++ break;
++ default:
++ BUG();
++ }
++ blk_rings->common.rsp_prod_pvt++;
++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
++ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
++ /*
++ * Tail check for pending requests. Allows frontend to avoid
++ * notifications if requests are already in flight (lower
++ * overheads and promotes batching).
++ */
++ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
++
++ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
++ more_to_do = 1;
++ }
++
++ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
++
++ if (more_to_do)
++ blkif_notify_work(blkif);
++ if (notify)
++ notify_remote_via_irq(blkif->irq);
++}
++
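++/*
++ * Module initialisation: allocate the global pending-request pool and
++ * grant-handle table, then set up the interface cache and register the
++ * xenbus backend driver.
++ */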
++static int __init blkif_init(void)
++{
++ int i, mmap_pages;
++ int rc = 0;
++
++ if (!xen_pv_domain())
++ return -ENODEV;
++
++ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
++
++ pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
++ blkif_reqs, GFP_KERNEL);
++ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
++ mmap_pages, GFP_KERNEL);
++ pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
++
++ if (blkback_pagemap_init(mmap_pages))
++ goto out_of_memory;
++
++ if (!pending_reqs || !pending_grant_handles || !pending_pages) {
++ rc = -ENOMEM;
++ goto out_of_memory;
++ }
++
++ for (i = 0; i < mmap_pages; i++)
++ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
++
++ rc = blkif_interface_init();
++ if (rc)
++ goto failed_init;
++
++ memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
++ INIT_LIST_HEAD(&pending_free);
++
++ for (i = 0; i < blkif_reqs; i++)
++ list_add_tail(&pending_reqs[i].free_list, &pending_free);
++
++ rc = blkif_xenbus_init();
++ if (rc)
++ goto failed_init;
++
++ return 0;
++
++ out_of_memory:
++ printk(KERN_ERR "%s: out of memory\n", __func__);
++ failed_init:
++ kfree(pending_reqs);
++ kfree(pending_grant_handles);
++ free_empty_pages_and_pagevec(pending_pages, mmap_pages);
++ return rc;
++}
++
++module_init(blkif_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h
+new file mode 100644
+index 0000000..af43d63
+--- /dev/null
++++ b/drivers/xen/blkback/common.h
+@@ -0,0 +1,139 @@
++/*
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __BLKIF__BACKEND__COMMON_H__
++#define __BLKIF__BACKEND__COMMON_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/blkdev.h>
++#include <linux/vmalloc.h>
++#include <linux/wait.h>
++#include <asm/io.h>
++#include <asm/setup.h>
++#include <asm/pgalloc.h>
++#include <asm/hypervisor.h>
++#include <xen/blkif.h>
++#include <xen/grant_table.h>
++#include <xen/xenbus.h>
++#include "blkback-pagemap.h"
++
++
++#define DPRINTK(_f, _a...) \
++ pr_debug("(file=%s, line=%d) " _f, \
++ __FILE__ , __LINE__ , ## _a )
++
++struct vbd {
++ blkif_vdev_t handle; /* what the domain refers to this vbd as */
++ unsigned char readonly; /* Non-zero -> read-only */
++ unsigned char type; /* VDISK_xxx */
++ u32 pdevice; /* phys device that this vbd maps to */
++ struct block_device *bdev;
++};
++
++struct backend_info;
++
++typedef struct blkif_st {
++ /* Unique identifier for this interface. */
++ domid_t domid;
++ unsigned int handle;
++ /* Physical parameters of the comms window. */
++ unsigned int irq;
++ /* Comms information. */
++ enum blkif_protocol blk_protocol;
++ union blkif_back_rings blk_rings;
++ struct vm_struct *blk_ring_area;
++ /* The VBD attached to this interface. */
++ struct vbd vbd;
++ /* Back pointer to the backend_info. */
++ struct backend_info *be;
++ /* Private fields. */
++ spinlock_t blk_ring_lock;
++ atomic_t refcnt;
++
++ wait_queue_head_t wq;
++ struct task_struct *xenblkd;
++ unsigned int waiting_reqs;
++ struct request_queue *plug;
++
++ /* statistics */
++ unsigned long st_print;
++ int st_rd_req;
++ int st_wr_req;
++ int st_oo_req;
++ int st_br_req;
++ int st_rd_sect;
++ int st_wr_sect;
++
++ wait_queue_head_t waiting_to_free;
++
++ grant_handle_t shmem_handle;
++ grant_ref_t shmem_ref;
++} blkif_t;
++
++blkif_t *blkif_alloc(domid_t domid);
++void blkif_disconnect(blkif_t *blkif);
++void blkif_free(blkif_t *blkif);
++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
++
++#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
++#define blkif_put(_b) \
++ do { \
++ if (atomic_dec_and_test(&(_b)->refcnt)) \
++ wake_up(&(_b)->waiting_to_free);\
++ } while (0)
++
++/* Create a vbd. */
++int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
++ unsigned minor, int readonly, int cdrom);
++void vbd_free(struct vbd *vbd);
++
++unsigned long long vbd_size(struct vbd *vbd);
++unsigned int vbd_info(struct vbd *vbd);
++unsigned long vbd_secsize(struct vbd *vbd);
++
++struct phys_req {
++ unsigned short dev;
++ unsigned short nr_sects;
++ struct block_device *bdev;
++ blkif_sector_t sector_number;
++};
++
++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
++
++int blkif_interface_init(void);
++
++int blkif_xenbus_init(void);
++
++irqreturn_t blkif_be_int(int irq, void *dev_id);
++int blkif_schedule(void *arg);
++
++int blkback_barrier(struct xenbus_transaction xbt,
++ struct backend_info *be, int state);
++
++#endif /* __BLKIF__BACKEND__COMMON_H__ */
+diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c
+new file mode 100644
+index 0000000..e397a41
+--- /dev/null
++++ b/drivers/xen/blkback/interface.c
+@@ -0,0 +1,186 @@
++/******************************************************************************
++ * arch/xen/drivers/blkif/backend/interface.c
++ *
++ * Block-device interface management.
++ *
++ * Copyright (c) 2004, Keir Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++#include <xen/events.h>
++#include <xen/grant_table.h>
++#include <linux/kthread.h>
++
++static struct kmem_cache *blkif_cachep;
++
++blkif_t *blkif_alloc(domid_t domid)
++{
++ blkif_t *blkif;
++
++ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
++ if (!blkif)
++ return ERR_PTR(-ENOMEM);
++
++ memset(blkif, 0, sizeof(*blkif));
++ blkif->domid = domid;
++ spin_lock_init(&blkif->blk_ring_lock);
++ atomic_set(&blkif->refcnt, 1);
++ init_waitqueue_head(&blkif->wq);
++ blkif->st_print = jiffies;
++ init_waitqueue_head(&blkif->waiting_to_free);
++
++ return blkif;
++}
++
++static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
++{
++ struct gnttab_map_grant_ref op;
++
++ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
++ GNTMAP_host_map, shared_page, blkif->domid);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
++ BUG();
++
++ if (op.status) {
++ DPRINTK(" Grant table operation failure !\n");
++ return op.status;
++ }
++
++ blkif->shmem_ref = shared_page;
++ blkif->shmem_handle = op.handle;
++
++ return 0;
++}
++
++static void unmap_frontend_page(blkif_t *blkif)
++{
++ struct gnttab_unmap_grant_ref op;
++
++ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
++ GNTMAP_host_map, blkif->shmem_handle);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
++ BUG();
++}
++
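++/*
++ * Map the frontend's shared ring page through the grant table, initialise
++ * the back ring in the negotiated ABI and bind the interdomain event
++ * channel to blkif_be_int().
++ */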
++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
++{
++ int err;
++
++ /* Already connected through? */
++ if (blkif->irq)
++ return 0;
++
++ if ((blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL)
++ return -ENOMEM;
++
++ err = map_frontend_page(blkif, shared_page);
++ if (err) {
++ free_vm_area(blkif->blk_ring_area);
++ return err;
++ }
++
++ switch (blkif->blk_protocol) {
++ case BLKIF_PROTOCOL_NATIVE:
++ {
++ struct blkif_sring *sring;
++ sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
++ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
++ break;
++ }
++ case BLKIF_PROTOCOL_X86_32:
++ {
++ struct blkif_x86_32_sring *sring_x86_32;
++ sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
++ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
++ break;
++ }
++ case BLKIF_PROTOCOL_X86_64:
++ {
++ struct blkif_x86_64_sring *sring_x86_64;
++ sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
++ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
++ break;
++ }
++ default:
++ BUG();
++ }
++
++ err = bind_interdomain_evtchn_to_irqhandler(
++ blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
++ if (err < 0)
++ {
++ unmap_frontend_page(blkif);
++ free_vm_area(blkif->blk_ring_area);
++ blkif->blk_rings.common.sring = NULL;
++ return err;
++ }
++ blkif->irq = err;
++
++ return 0;
++}
++
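++/*
++ * Stop the xenblkd thread, wait for all outstanding references to be
++ * dropped, then unbind the irq and unmap the shared ring.
++ */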
++void blkif_disconnect(blkif_t *blkif)
++{
++ if (blkif->xenblkd) {
++ kthread_stop(blkif->xenblkd);
++ blkif->xenblkd = NULL;
++ }
++
++ atomic_dec(&blkif->refcnt);
++ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
++ atomic_inc(&blkif->refcnt);
++
++ if (blkif->irq) {
++ unbind_from_irqhandler(blkif->irq, blkif);
++ blkif->irq = 0;
++ }
++
++ if (blkif->blk_rings.common.sring) {
++ unmap_frontend_page(blkif);
++ free_vm_area(blkif->blk_ring_area);
++ blkif->blk_rings.common.sring = NULL;
++ }
++}
++
++void blkif_free(blkif_t *blkif)
++{
++ if (!atomic_dec_and_test(&blkif->refcnt))
++ BUG();
++ kmem_cache_free(blkif_cachep, blkif);
++}
++
++int __init blkif_interface_init(void)
++{
++ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
++ 0, 0, NULL);
++ if (!blkif_cachep)
++ return -ENOMEM;
++
++ return 0;
++}
+diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c
+new file mode 100644
+index 0000000..410c2ea
+--- /dev/null
++++ b/drivers/xen/blkback/vbd.c
+@@ -0,0 +1,118 @@
++/******************************************************************************
++ * blkback/vbd.c
++ *
++ * Routines for managing virtual block devices (VBDs).
++ *
++ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++
++#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
++ (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
++
++unsigned long long vbd_size(struct vbd *vbd)
++{
++ return vbd_sz(vbd);
++}
++
++unsigned int vbd_info(struct vbd *vbd)
++{
++ return vbd->type | (vbd->readonly?VDISK_READONLY:0);
++}
++
++unsigned long vbd_secsize(struct vbd *vbd)
++{
++ return bdev_logical_block_size(vbd->bdev);
++}
++
++int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
++ unsigned minor, int readonly, int cdrom)
++{
++ struct vbd *vbd;
++ struct block_device *bdev;
++
++ vbd = &blkif->vbd;
++ vbd->handle = handle;
++ vbd->readonly = readonly;
++ vbd->type = 0;
++
++ vbd->pdevice = MKDEV(major, minor);
++
++ bdev = open_by_devnum(vbd->pdevice,
++ vbd->readonly ? FMODE_READ : FMODE_WRITE);
++
++ if (IS_ERR(bdev)) {
++ DPRINTK("vbd_create: device %08x could not be opened.\n",
++ vbd->pdevice);
++ return -ENOENT;
++ }
++
++ vbd->bdev = bdev;
++
++ if (vbd->bdev->bd_disk == NULL) {
++ DPRINTK("vbd_create: device %08x doesn't exist.\n",
++ vbd->pdevice);
++ vbd_free(vbd);
++ return -ENOENT;
++ }
++
++ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
++ vbd->type |= VDISK_CDROM;
++ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
++ vbd->type |= VDISK_REMOVABLE;
++
++ DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
++ handle, blkif->domid);
++ return 0;
++}
++
++void vbd_free(struct vbd *vbd)
++{
++ if (vbd->bdev)
++ blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
++ vbd->bdev = NULL;
++}
++
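++/*
++ * Check a request against the VBD's access mode and size, and fill in
++ * the physical device and block_device to use for the transfer.
++ */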
++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
++{
++ struct vbd *vbd = &blkif->vbd;
++ int rc = -EACCES;
++
++ if ((operation != READ) && vbd->readonly)
++ goto out;
++
++ if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
++ goto out;
++
++ req->dev = vbd->pdevice;
++ req->bdev = vbd->bdev;
++ rc = 0;
++
++ out:
++ return rc;
++}
+diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c
+new file mode 100644
+index 0000000..34f8e40
+--- /dev/null
++++ b/drivers/xen/blkback/xenbus.c
+@@ -0,0 +1,541 @@
++/* Xenbus code for blkif backend
++ Copyright (C) 2005 Rusty Russell <rusty at rustcorp.com.au>
++ Copyright (C) 2005 XenSource Ltd
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2 of the License, or
++ (at your option) any later version.
++
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++*/
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include "common.h"
++
++#undef DPRINTK
++#define DPRINTK(fmt, args...) \
++ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \
++ __FUNCTION__, __LINE__, ##args)
++
++struct backend_info
++{
++ struct xenbus_device *dev;
++ blkif_t *blkif;
++ struct xenbus_watch backend_watch;
++ unsigned major;
++ unsigned minor;
++ char *mode;
++};
++
++static void connect(struct backend_info *);
++static int connect_ring(struct backend_info *);
++static void backend_changed(struct xenbus_watch *, const char **,
++ unsigned int);
++
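++/*
++ * Build the xenblkd thread name, "blkback.<domid>.<dev>", from the
++ * backend's xenstore "dev" node.
++ */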
++static int blkback_name(blkif_t *blkif, char *buf)
++{
++ char *devpath, *devname;
++ struct xenbus_device *dev = blkif->be->dev;
++
++ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
++ if (IS_ERR(devpath))
++ return PTR_ERR(devpath);
++
++ if ((devname = strstr(devpath, "/dev/")) != NULL)
++ devname += strlen("/dev/");
++ else
++ devname = devpath;
++
++ snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
++ kfree(devpath);
++
++ return 0;
++}
++
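++/*
++ * Once both the ring and the backing device are in place, switch the
++ * device to Connected and start its xenblkd thread.
++ */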
++static void update_blkif_status(blkif_t *blkif)
++{
++ int err;
++ char name[TASK_COMM_LEN];
++
++ /* Not ready to connect? */
++ if (!blkif->irq || !blkif->vbd.bdev)
++ return;
++
++ /* Already connected? */
++ if (blkif->be->dev->state == XenbusStateConnected)
++ return;
++
++ /* Attempt to connect: exit if we fail to. */
++ connect(blkif->be);
++ if (blkif->be->dev->state != XenbusStateConnected)
++ return;
++
++ err = blkback_name(blkif, name);
++ if (err) {
++ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
++ return;
++ }
++
++ blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
++ if (IS_ERR(blkif->xenblkd)) {
++ err = PTR_ERR(blkif->xenblkd);
++ blkif->xenblkd = NULL;
++ xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
++ }
++}
++
++
++/****************************************************************
++ * sysfs interface for VBD I/O requests
++ */
++
++#define VBD_SHOW(name, format, args...) \
++ static ssize_t show_##name(struct device *_dev, \
++ struct device_attribute *attr, \
++ char *buf) \
++ { \
++ struct xenbus_device *dev = to_xenbus_device(_dev); \
++ struct backend_info *be = dev_get_drvdata(&dev->dev); \
++ \
++ return sprintf(buf, format, ##args); \
++ } \
++ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
++
++VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
++VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
++VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
++VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req);
++VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
++VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
++
++static struct attribute *vbdstat_attrs[] = {
++ &dev_attr_oo_req.attr,
++ &dev_attr_rd_req.attr,
++ &dev_attr_wr_req.attr,
++ &dev_attr_br_req.attr,
++ &dev_attr_rd_sect.attr,
++ &dev_attr_wr_sect.attr,
++ NULL
++};
++
++static struct attribute_group vbdstat_group = {
++ .name = "statistics",
++ .attrs = vbdstat_attrs,
++};
++
++VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
++VBD_SHOW(mode, "%s\n", be->mode);
++
++int xenvbd_sysfs_addif(struct xenbus_device *dev)
++{
++ int error;
++
++ error = device_create_file(&dev->dev, &dev_attr_physical_device);
++ if (error)
++ goto fail1;
++
++ error = device_create_file(&dev->dev, &dev_attr_mode);
++ if (error)
++ goto fail2;
++
++ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
++ if (error)
++ goto fail3;
++
++ return 0;
++
++fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
++fail2: device_remove_file(&dev->dev, &dev_attr_mode);
++fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
++ return error;
++}
++
++void xenvbd_sysfs_delif(struct xenbus_device *dev)
++{
++ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
++ device_remove_file(&dev->dev, &dev_attr_mode);
++ device_remove_file(&dev->dev, &dev_attr_physical_device);
++}
++
++static int blkback_remove(struct xenbus_device *dev)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++ DPRINTK("");
++
++ if (be->major || be->minor)
++ xenvbd_sysfs_delif(dev);
++
++ if (be->backend_watch.node) {
++ unregister_xenbus_watch(&be->backend_watch);
++ kfree(be->backend_watch.node);
++ be->backend_watch.node = NULL;
++ }
++
++ if (be->blkif) {
++ blkif_disconnect(be->blkif);
++ vbd_free(&be->blkif->vbd);
++ blkif_free(be->blkif);
++ be->blkif = NULL;
++ }
++
++ kfree(be);
++ dev_set_drvdata(&dev->dev, NULL);
++ return 0;
++}
++
++int blkback_barrier(struct xenbus_transaction xbt,
++ struct backend_info *be, int state)
++{
++ struct xenbus_device *dev = be->dev;
++ int err;
++
++ err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
++ "%d", state);
++ if (err)
++ xenbus_dev_fatal(dev, err, "writing feature-barrier");
++
++ return err;
++}
++
++/**
++ * Entry point to this code when a new device is created. Allocate the basic
++ * structures, and watch the store waiting for the hotplug scripts to tell us
++ * the device's physical major and minor numbers. Switch to InitWait.
++ */
++static int blkback_probe(struct xenbus_device *dev,
++ const struct xenbus_device_id *id)
++{
++ int err;
++ struct backend_info *be = kzalloc(sizeof(struct backend_info),
++ GFP_KERNEL);
++ if (!be) {
++ xenbus_dev_fatal(dev, -ENOMEM,
++ "allocating backend structure");
++ return -ENOMEM;
++ }
++ be->dev = dev;
++ dev_set_drvdata(&dev->dev, be);
++
++ be->blkif = blkif_alloc(dev->otherend_id);
++ if (IS_ERR(be->blkif)) {
++ err = PTR_ERR(be->blkif);
++ be->blkif = NULL;
++ xenbus_dev_fatal(dev, err, "creating block interface");
++ goto fail;
++ }
++
++ /* setup back pointer */
++ be->blkif->be = be;
++
++ err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
++ "%s/%s", dev->nodename, "physical-device");
++ if (err)
++ goto fail;
++
++ err = xenbus_switch_state(dev, XenbusStateInitWait);
++ if (err)
++ goto fail;
++
++ return 0;
++
++fail:
++ DPRINTK("failed");
++ blkback_remove(dev);
++ return err;
++}
++
++
++/**
++ * Callback received when the hotplug scripts have placed the physical-device
++ * node. Read it and the mode node, and create a vbd. If the frontend is
++ * ready, connect.
++ */
++static void backend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ int err;
++ unsigned major;
++ unsigned minor;
++ struct backend_info *be
++ = container_of(watch, struct backend_info, backend_watch);
++ struct xenbus_device *dev = be->dev;
++ int cdrom = 0;
++ char *device_type;
++
++ DPRINTK("");
++
++ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
++ &major, &minor);
++ if (XENBUS_EXIST_ERR(err)) {
++ /* Since this watch will fire once immediately after it is
++ registered, we expect this. Ignore it, and wait for the
++ hotplug scripts. */
++ return;
++ }
++ if (err != 2) {
++ xenbus_dev_fatal(dev, err, "reading physical-device");
++ return;
++ }
++
++ if ((be->major || be->minor) &&
++ ((be->major != major) || (be->minor != minor))) {
++ printk(KERN_WARNING
++ "blkback: changing physical device (from %x:%x to "
++ "%x:%x) not supported.\n", be->major, be->minor,
++ major, minor);
++ return;
++ }
++
++ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
++ if (IS_ERR(be->mode)) {
++ err = PTR_ERR(be->mode);
++ be->mode = NULL;
++ xenbus_dev_fatal(dev, err, "reading mode");
++ return;
++ }
++
++ device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
++ if (!IS_ERR(device_type)) {
++ cdrom = strcmp(device_type, "cdrom") == 0;
++ kfree(device_type);
++ }
++
++ if (be->major == 0 && be->minor == 0) {
++ /* Front end dir is a number, which is used as the handle. */
++
++ char *p = strrchr(dev->otherend, '/') + 1;
++ long handle = simple_strtoul(p, NULL, 0);
++
++ be->major = major;
++ be->minor = minor;
++
++ err = vbd_create(be->blkif, handle, major, minor,
++ (NULL == strchr(be->mode, 'w')), cdrom);
++ if (err) {
++ be->major = be->minor = 0;
++ xenbus_dev_fatal(dev, err, "creating vbd structure");
++ return;
++ }
++
++ err = xenvbd_sysfs_addif(dev);
++ if (err) {
++ vbd_free(&be->blkif->vbd);
++ be->major = be->minor = 0;
++ xenbus_dev_fatal(dev, err, "creating sysfs entries");
++ return;
++ }
++
++ /* We're potentially connected now */
++ update_blkif_status(be->blkif);
++ }
++}
++
++
++/**
++ * Callback received when the frontend's state changes.
++ */
++static void frontend_changed(struct xenbus_device *dev,
++ enum xenbus_state frontend_state)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++ int err;
++
++ DPRINTK("%s", xenbus_strstate(frontend_state));
++
++ switch (frontend_state) {
++ case XenbusStateInitialising:
++ if (dev->state == XenbusStateClosed) {
++ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++ __FUNCTION__, dev->nodename);
++ xenbus_switch_state(dev, XenbusStateInitWait);
++ }
++ break;
++
++ case XenbusStateInitialised:
++ case XenbusStateConnected:
++ /* Ensure we connect even when two watches fire in
++ close succession and we miss the intermediate value
++ of frontend_state. */
++ if (dev->state == XenbusStateConnected)
++ break;
++
++ err = connect_ring(be);
++ if (err)
++ break;
++ update_blkif_status(be->blkif);
++ break;
++
++ case XenbusStateClosing:
++ blkif_disconnect(be->blkif);
++ xenbus_switch_state(dev, XenbusStateClosing);
++ break;
++
++ case XenbusStateClosed:
++ xenbus_switch_state(dev, XenbusStateClosed);
++ if (xenbus_dev_is_online(dev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ device_unregister(&dev->dev);
++ break;
++
++ default:
++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++ frontend_state);
++ break;
++ }
++}
++
++
++/* ** Connection ** */
++
++
++/**
++ * Write the physical details regarding the block device to the store, and
++ * switch to Connected state.
++ */
++static void connect(struct backend_info *be)
++{
++ struct xenbus_transaction xbt;
++ int err;
++ struct xenbus_device *dev = be->dev;
++
++ DPRINTK("%s", dev->otherend);
++
++ /* Supply the information about the device the frontend needs */
++again:
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "starting transaction");
++ return;
++ }
++
++ err = blkback_barrier(xbt, be, 1);
++ if (err)
++ goto abort;
++
++ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
++ vbd_size(&be->blkif->vbd));
++ if (err) {
++ xenbus_dev_fatal(dev, err, "writing %s/sectors",
++ dev->nodename);
++ goto abort;
++ }
++
++ /* FIXME: use a typename instead */
++ err = xenbus_printf(xbt, dev->nodename, "info", "%u",
++ vbd_info(&be->blkif->vbd));
++ if (err) {
++ xenbus_dev_fatal(dev, err, "writing %s/info",
++ dev->nodename);
++ goto abort;
++ }
++ err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
++ vbd_secsize(&be->blkif->vbd));
++ if (err) {
++ xenbus_dev_fatal(dev, err, "writing %s/sector-size",
++ dev->nodename);
++ goto abort;
++ }
++
++ err = xenbus_transaction_end(xbt, 0);
++ if (err == -EAGAIN)
++ goto again;
++ if (err)
++ xenbus_dev_fatal(dev, err, "ending transaction");
++
++ err = xenbus_switch_state(dev, XenbusStateConnected);
++ if (err)
++ xenbus_dev_fatal(dev, err, "switching to Connected state",
++ dev->nodename);
++
++ return;
++ abort:
++ xenbus_transaction_end(xbt, 1);
++}
++
++
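++/*
++ * Read ring-ref, event-channel and the optional protocol node from the
++ * frontend's xenstore directory, then map the shared ring.
++ */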
++static int connect_ring(struct backend_info *be)
++{
++ struct xenbus_device *dev = be->dev;
++ unsigned long ring_ref;
++ unsigned int evtchn;
++ char protocol[64] = "";
++ int err;
++
++ DPRINTK("%s", dev->otherend);
++
++ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
++ "event-channel", "%u", &evtchn, NULL);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "reading %s/ring-ref and event-channel",
++ dev->otherend);
++ return err;
++ }
++
++ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++ err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
++ "%63s", protocol, NULL);
++ if (err)
++ strcpy(protocol, "unspecified, assuming native");
++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
++ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
++ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
++ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
++ else {
++ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
++ return -1;
++ }
++ printk(KERN_INFO
++ "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
++ ring_ref, evtchn, be->blkif->blk_protocol, protocol);
++
++ /* Map the shared frame, irq etc. */
++ err = blkif_map(be->blkif, ring_ref, evtchn);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
++ ring_ref, evtchn);
++ return err;
++ }
++
++ return 0;
++}
++
++
++/* ** Driver Registration ** */
++
++
++static const struct xenbus_device_id blkback_ids[] = {
++ { "vbd" },
++ { "" }
++};
++
++
++static struct xenbus_driver blkback = {
++ .name = "vbd",
++ .owner = THIS_MODULE,
++ .ids = blkback_ids,
++ .probe = blkback_probe,
++ .remove = blkback_remove,
++ .otherend_changed = frontend_changed
++};
++
++
++int blkif_xenbus_init(void)
++{
++ return xenbus_register_backend(&blkback);
++}
+diff --git a/drivers/xen/blktap/Makefile b/drivers/xen/blktap/Makefile
+new file mode 100644
+index 0000000..99ff53c
+--- /dev/null
++++ b/drivers/xen/blktap/Makefile
+@@ -0,0 +1,3 @@
++obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
++
++blktap-objs := control.o ring.o wait_queue.o device.o request.o sysfs.o
+diff --git a/drivers/xen/blktap/blktap.h b/drivers/xen/blktap/blktap.h
+new file mode 100644
+index 0000000..db4cf02
+--- /dev/null
++++ b/drivers/xen/blktap/blktap.h
+@@ -0,0 +1,253 @@
++#ifndef _BLKTAP_H_
++#define _BLKTAP_H_
++
++#include <linux/mm.h>
++#include <linux/fs.h>
++#include <linux/cdev.h>
++#include <linux/init.h>
++#include <linux/scatterlist.h>
++#include <xen/blkif.h>
++#include <xen/grant_table.h>
++
++//#define ENABLE_PASSTHROUGH
++
++extern int blktap_debug_level;
++
++#define BTPRINTK(level, tag, force, _f, _a...) \
++ do { \
++ if (blktap_debug_level > level && \
++ (force || printk_ratelimit())) \
++ printk(tag "%s: " _f, __func__, ##_a); \
++ } while (0)
++
++#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
++#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
++#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
++#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
++
++#define MAX_BLKTAP_DEVICE 256
++
++#define BLKTAP_CONTROL 1
++#define BLKTAP_RING_FD 2
++#define BLKTAP_RING_VMA 3
++#define BLKTAP_DEVICE 4
++#define BLKTAP_PAUSE_REQUESTED 6
++#define BLKTAP_PAUSED 7
++#define BLKTAP_SHUTDOWN_REQUESTED 8
++#define BLKTAP_PASSTHROUGH 9
++#define BLKTAP_DEFERRED 10
++
++/* blktap IOCTLs: */
++#define BLKTAP2_IOCTL_KICK_FE 1
++#define BLKTAP2_IOCTL_ALLOC_TAP 200
++#define BLKTAP2_IOCTL_FREE_TAP 201
++#define BLKTAP2_IOCTL_CREATE_DEVICE 202
++#define BLKTAP2_IOCTL_SET_PARAMS 203
++#define BLKTAP2_IOCTL_PAUSE 204
++#define BLKTAP2_IOCTL_REOPEN 205
++#define BLKTAP2_IOCTL_RESUME 206
++
++#define BLKTAP2_MAX_MESSAGE_LEN 256
++
++#define BLKTAP2_RING_MESSAGE_PAUSE 1
++#define BLKTAP2_RING_MESSAGE_RESUME 2
++#define BLKTAP2_RING_MESSAGE_CLOSE 3
++
++#define BLKTAP_REQUEST_FREE 0
++#define BLKTAP_REQUEST_PENDING 1
++
++/*
++ * The maximum number of requests that can be outstanding at any time
++ * is determined by
++ *
++ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
++ *
++ * where mmap_alloc < MAX_DYNAMIC_MEM.
++ *
++ * TODO:
++ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
++ * sysfs.
++ */
++#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
++#define MAX_DYNAMIC_MEM BLK_RING_SIZE
++#define MAX_PENDING_REQS BLK_RING_SIZE
++#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
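++/*
++ * Illustrative sizing, not a definition: with 4 KiB pages __RING_SIZE()
++ * typically yields 32 slots for a blkif_sring, so MAX_PENDING_REQS is 32
++ * and MMAP_PAGES is 32 * BLKIF_MAX_SEGMENTS_PER_REQUEST pages (352 with
++ * the usual 11-segment limit) per mmap_alloc unit.
++ */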
++#define MMAP_VADDR(_start, _req, _seg) \
++ (_start + \
++ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
++ ((_seg) * PAGE_SIZE))
++
++#define blktap_get(_b) (atomic_inc(&(_b)->refcnt))
++#define blktap_put(_b) \
++ do { \
++ if (atomic_dec_and_test(&(_b)->refcnt)) \
++ wake_up(&(_b)->wq); \
++ } while (0)
++
++struct blktap;
++
++struct grant_handle_pair {
++ grant_handle_t kernel;
++ grant_handle_t user;
++};
++#define INVALID_GRANT_HANDLE 0xFFFF
++
++struct blktap_handle {
++ unsigned int ring;
++ unsigned int device;
++ unsigned int minor;
++};
++
++struct blktap_params {
++ char name[BLKTAP2_MAX_MESSAGE_LEN];
++ unsigned long long capacity;
++ unsigned long sector_size;
++};
++
++struct blktap_device {
++ int users;
++ spinlock_t lock;
++ struct gendisk *gd;
++
++#ifdef ENABLE_PASSTHROUGH
++ struct block_device *bdev;
++#endif
++};
++
++struct blktap_ring {
++ struct vm_area_struct *vma;
++ struct blkif_front_ring ring;
++ struct vm_foreign_map foreign_map;
++ unsigned long ring_vstart;
++ unsigned long user_vstart;
++
++ int response;
++
++ wait_queue_head_t poll_wait;
++
++ dev_t devno;
++ struct device *dev;
++ atomic_t sysfs_refcnt;
++ struct mutex sysfs_mutex;
++};
++
++struct blktap_statistics {
++ unsigned long st_print;
++ int st_rd_req;
++ int st_wr_req;
++ int st_oo_req;
++ int st_rd_sect;
++ int st_wr_sect;
++ s64 st_rd_cnt;
++ s64 st_rd_sum_usecs;
++ s64 st_rd_max_usecs;
++ s64 st_wr_cnt;
++ s64 st_wr_sum_usecs;
++ s64 st_wr_max_usecs;
++};
++
++struct blktap_request {
++ uint64_t id;
++ uint16_t usr_idx;
++
++ uint8_t status;
++ atomic_t pendcnt;
++ uint8_t nr_pages;
++ unsigned short operation;
++
++ struct timeval time;
++ struct grant_handle_pair handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ struct list_head free_list;
++};
++
++struct blktap {
++ int minor;
++ pid_t pid;
++ atomic_t refcnt;
++ unsigned long dev_inuse;
++
++ struct blktap_params params;
++
++ struct rw_semaphore tap_sem;
++
++ struct blktap_ring ring;
++ struct blktap_device device;
++
++ int pending_cnt;
++ struct blktap_request *pending_requests[MAX_PENDING_REQS];
++ struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++
++ wait_queue_head_t wq;
++ struct list_head deferred_queue;
++
++ struct blktap_statistics stats;
++};
++
++extern struct blktap *blktaps[MAX_BLKTAP_DEVICE];
++
++static inline int
++blktap_active(struct blktap *tap)
++{
++ return test_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
++}
++
++static inline int
++blktap_validate_params(struct blktap *tap, struct blktap_params *params)
++{
++ /* TODO: sanity check */
++ params->name[sizeof(params->name) - 1] = '\0';
++ BTINFO("%s: capacity: %llu, sector-size: %lu\n",
++ params->name, params->capacity, params->sector_size);
++ return 0;
++}
++
++int blktap_control_destroy_device(struct blktap *);
++
++int blktap_ring_init(int *);
++int blktap_ring_free(void);
++int blktap_ring_create(struct blktap *);
++int blktap_ring_destroy(struct blktap *);
++int blktap_ring_pause(struct blktap *);
++int blktap_ring_resume(struct blktap *);
++void blktap_ring_kick_user(struct blktap *);
++
++int blktap_sysfs_init(void);
++void blktap_sysfs_free(void);
++int blktap_sysfs_create(struct blktap *);
++int blktap_sysfs_destroy(struct blktap *);
++
++int blktap_device_init(int *);
++void blktap_device_free(void);
++int blktap_device_create(struct blktap *);
++int blktap_device_destroy(struct blktap *);
++int blktap_device_pause(struct blktap *);
++int blktap_device_resume(struct blktap *);
++void blktap_device_restart(struct blktap *);
++void blktap_device_finish_request(struct blktap *,
++ struct blkif_response *,
++ struct blktap_request *);
++void blktap_device_fail_pending_requests(struct blktap *);
++#ifdef ENABLE_PASSTHROUGH
++int blktap_device_enable_passthrough(struct blktap *,
++ unsigned, unsigned);
++#endif
++
++void blktap_defer(struct blktap *);
++void blktap_run_deferred(void);
++
++int blktap_request_pool_init(void);
++void blktap_request_pool_free(void);
++int blktap_request_pool_grow(void);
++int blktap_request_pool_shrink(void);
++struct blktap_request *blktap_request_allocate(struct blktap *);
++void blktap_request_free(struct blktap *, struct blktap_request *);
++struct page *request_to_page(struct blktap_request *, int);
++
++static inline unsigned long
++request_to_kaddr(struct blktap_request *req, int seg)
++{
++ unsigned long pfn = page_to_pfn(request_to_page(req, seg));
++ return (unsigned long)pfn_to_kaddr(pfn);
++}
++
++#endif
+diff --git a/drivers/xen/blktap/control.c b/drivers/xen/blktap/control.c
+new file mode 100644
+index 0000000..a4852f7
+--- /dev/null
++++ b/drivers/xen/blktap/control.c
+@@ -0,0 +1,284 @@
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/miscdevice.h>
++
++#include <asm/uaccess.h>
++
++#include "blktap.h"
++
++static DEFINE_SPINLOCK(blktap_control_lock);
++struct blktap *blktaps[MAX_BLKTAP_DEVICE];
++
++static int ring_major;
++static int device_major;
++static int blktap_control_registered;
++
++static void
++blktap_control_initialize_tap(struct blktap *tap)
++{
++ int minor = tap->minor;
++
++ memset(tap, 0, sizeof(*tap));
++ set_bit(BLKTAP_CONTROL, &tap->dev_inuse);
++ init_rwsem(&tap->tap_sem);
++ init_waitqueue_head(&tap->wq);
++ atomic_set(&tap->refcnt, 0);
++ sg_init_table(tap->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++ tap->minor = minor;
++}
++
++static struct blktap *
++blktap_control_create_tap(void)
++{
++ int minor;
++ struct blktap *tap;
++
++ tap = kmalloc(sizeof(*tap), GFP_KERNEL);
++ if (unlikely(!tap))
++ return NULL;
++
++ blktap_control_initialize_tap(tap);
++
++ spin_lock_irq(&blktap_control_lock);
++ for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++)
++ if (!blktaps[minor])
++ break;
++
++ if (minor == MAX_BLKTAP_DEVICE) {
++ kfree(tap);
++ tap = NULL;
++ goto out;
++ }
++
++ tap->minor = minor;
++ blktaps[minor] = tap;
++
++out:
++ spin_unlock_irq(&blktap_control_lock);
++ return tap;
++}
++
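++/*
++ * Find a usable tap: reuse an idle minor if one exists, otherwise
++ * allocate a fresh one, then create its ring device.
++ */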
++static struct blktap *
++blktap_control_allocate_tap(void)
++{
++ int err, minor;
++ struct blktap *tap;
++
++ /*
++ * This is called only from the ioctl, which
++ * means we should always have interrupts enabled.
++ */
++ BUG_ON(irqs_disabled());
++
++ spin_lock_irq(&blktap_control_lock);
++
++ for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) {
++ tap = blktaps[minor];
++ if (!tap)
++ goto found;
++
++ if (!tap->dev_inuse) {
++ blktap_control_initialize_tap(tap);
++ goto found;
++ }
++ }
++
++ tap = NULL;
++
++found:
++ spin_unlock_irq(&blktap_control_lock);
++
++ if (!tap) {
++ tap = blktap_control_create_tap();
++ if (!tap)
++ return NULL;
++ }
++
++ err = blktap_ring_create(tap);
++ if (err) {
++ BTERR("ring creation failed: %d\n", err);
++ clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
++ return NULL;
++ }
++
++ BTINFO("allocated tap %p\n", tap);
++ return tap;
++}
++
++static int
++blktap_control_ioctl(struct inode *inode, struct file *filp,
++ unsigned int cmd, unsigned long arg)
++{
++ unsigned long dev;
++ struct blktap *tap;
++
++ switch (cmd) {
++ case BLKTAP2_IOCTL_ALLOC_TAP: {
++ struct blktap_handle h;
++
++ tap = blktap_control_allocate_tap();
++ if (!tap) {
++ BTERR("error allocating device\n");
++ return -ENOMEM;
++ }
++
++ h.ring = ring_major;
++ h.device = device_major;
++ h.minor = tap->minor;
++
++ if (copy_to_user((struct blktap_handle __user *)arg,
++ &h, sizeof(h))) {
++ blktap_control_destroy_device(tap);
++ return -EFAULT;
++ }
++
++ return 0;
++ }
++
++ case BLKTAP2_IOCTL_FREE_TAP:
++ dev = arg;
++
++ if (dev >= MAX_BLKTAP_DEVICE || !blktaps[dev])
++ return -EINVAL;
++
++ blktap_control_destroy_device(blktaps[dev]);
++ return 0;
++ }
++
++ return -ENOIOCTLCMD;
++}
++
++static struct file_operations blktap_control_file_operations = {
++ .owner = THIS_MODULE,
++ .ioctl = blktap_control_ioctl,
++};
++
++static struct miscdevice blktap_misc = {
++ .minor = MISC_DYNAMIC_MINOR,
++ .name = "blktap-control",
++ .fops = &blktap_control_file_operations,
++};
++
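++/*
++ * Tear down the device, ring and sysfs pieces in turn, sleeping until
++ * remaining users release each component; the CONTROL bit is cleared
++ * once nothing else holds the tap.
++ */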
++int
++blktap_control_destroy_device(struct blktap *tap)
++{
++ int err;
++ unsigned long inuse;
++
++ if (!tap)
++ return 0;
++
++ set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
++
++ for (;;) {
++ inuse = tap->dev_inuse;
++ err = blktap_device_destroy(tap);
++ if (err)
++ goto wait;
++
++ inuse = tap->dev_inuse;
++ err = blktap_ring_destroy(tap);
++ if (err)
++ goto wait;
++
++ inuse = tap->dev_inuse;
++ err = blktap_sysfs_destroy(tap);
++ if (err)
++ goto wait;
++
++ break;
++
++ wait:
++ BTDBG("inuse: 0x%lx, dev_inuse: 0x%lx\n",
++ inuse, tap->dev_inuse);
++ if (wait_event_interruptible(tap->wq, tap->dev_inuse != inuse))
++ break;
++ }
++
++ clear_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
++
++ if (tap->dev_inuse == (1UL << BLKTAP_CONTROL)) {
++ err = 0;
++ clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
++ }
++
++ return err;
++}
++
++static int __init
++blktap_control_init(void)
++{
++ int err;
++
++ err = misc_register(&blktap_misc);
++ if (err) {
++ BTERR("misc_register failed for control device");
++ return err;
++ }
++
++ blktap_control_registered = 1;
++ return 0;
++}
++
++static void
++blktap_control_free(void)
++{
++ int i;
++
++ for (i = 0; i < MAX_BLKTAP_DEVICE; i++)
++ blktap_control_destroy_device(blktaps[i]);
++
++ if (blktap_control_registered)
++ if (misc_deregister(&blktap_misc) < 0)
++ BTERR("misc_deregister failed for control device");
++}
++
++static void
++blktap_exit(void)
++{
++ blktap_control_free();
++ blktap_ring_free();
++ blktap_sysfs_free();
++ blktap_device_free();
++ blktap_request_pool_free();
++}
++
++static int __init
++blktap_init(void)
++{
++ int err;
++
++ if (!xen_domain())
++ return -ENODEV;
++
++ err = blktap_request_pool_init();
++ if (err)
++ return err;
++
++ err = blktap_device_init(&device_major);
++ if (err)
++ goto fail;
++
++ err = blktap_ring_init(&ring_major);
++ if (err)
++ goto fail;
++
++ err = blktap_sysfs_init();
++ if (err)
++ goto fail;
++
++ err = blktap_control_init();
++ if (err)
++ goto fail;
++
++ return 0;
++
++fail:
++ blktap_exit();
++ return err;
++}
++
++module_init(blktap_init);
++module_exit(blktap_exit);
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/blktap/device.c b/drivers/xen/blktap/device.c
+new file mode 100644
+index 0000000..a50b622
+--- /dev/null
++++ b/drivers/xen/blktap/device.c
+@@ -0,0 +1,1138 @@
++#include <linux/version.h> /* XXX Remove uses of VERSION instead. */
++#include <linux/fs.h>
++#include <linux/blkdev.h>
++#include <linux/cdrom.h>
++#include <linux/hdreg.h>
++#include <linux/module.h>
++#include <asm/tlbflush.h>
++
++#include <scsi/scsi.h>
++#include <scsi/scsi_ioctl.h>
++
++#include <xen/xenbus.h>
++#include <xen/interface/io/blkif.h>
++
++#include <asm/xen/page.h>
++#include <asm/xen/hypercall.h>
++
++#include "blktap.h"
++
++#include "../blkback/blkback-pagemap.h"
++
++#if 0
++#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
++#else
++#define DPRINTK_IOCTL(_f, _a...) ((void)0)
++#endif
++
++struct blktap_grant_table {
++ int cnt;
++ struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
++};
++
++static int blktap_device_major;
++
++static inline struct blktap *
++dev_to_blktap(struct blktap_device *dev)
++{
++ return container_of(dev, struct blktap, device);
++}
++
++static int
++blktap_device_open(struct block_device * bd, fmode_t mode)
++{
++ struct blktap *tap;
++ struct blktap_device *dev = bd->bd_disk->private_data;
++
++ if (!dev)
++ return -ENOENT;
++
++ tap = dev_to_blktap(dev);
++ if (!blktap_active(tap) ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++ return -ENOENT;
++
++ dev->users++;
++
++ return 0;
++}
++
++static int
++blktap_device_release(struct gendisk *gd, fmode_t mode)
++{
++ struct blktap_device *dev = gd->private_data;
++ struct blktap *tap = dev_to_blktap(dev);
++
++ dev->users--;
++ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++ blktap_device_destroy(tap);
++
++ return 0;
++}
++
++static int
++blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
++{
++ /* We don't have real geometry info, but let's at least return
++ values consistent with the size of the device */
++ sector_t nsect = get_capacity(bd->bd_disk);
++ sector_t cylinders = nsect;
++
++ hg->heads = 0xff;
++ hg->sectors = 0x3f;
++ sector_div(cylinders, hg->heads * hg->sectors);
++ hg->cylinders = cylinders;
++ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
++ hg->cylinders = 0xffff;
++ return 0;
++}
++
++static int
++blktap_device_ioctl(struct block_device *bd, fmode_t mode,
++ unsigned command, unsigned long argument)
++{
++ int i;
++
++ DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
++ command, (long)argument, bd->bd_dev);
++
++ switch (command) {
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
++ case HDIO_GETGEO: {
++ struct hd_geometry geo;
++ int ret;
++
++ if (!argument)
++ return -EINVAL;
++
++ geo.start = get_start_sect(bd);
++ ret = blktap_device_getgeo(bd, &geo);
++ if (ret)
++ return ret;
++
++ if (copy_to_user((struct hd_geometry __user *)argument, &geo,
++ sizeof(geo)))
++ return -EFAULT;
++
++ return 0;
++ }
++#endif
++ case CDROMMULTISESSION:
++ BTDBG("FIXME: support multisession CDs later\n");
++ for (i = 0; i < sizeof(struct cdrom_multisession); i++)
++ if (put_user(0, (char __user *)(argument + i)))
++ return -EFAULT;
++ return 0;
++
++ case SCSI_IOCTL_GET_IDLUN:
++ if (!access_ok(VERIFY_WRITE, argument,
++ sizeof(struct scsi_idlun)))
++ return -EFAULT;
++
++ /* return 0 for now. */
++ __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
++ __put_user(0,
++ &((struct scsi_idlun __user *)argument)->host_unique_id);
++ return 0;
++
++ default:
++ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
++ command);*/
++ return -EINVAL; /* same return as native Linux */
++ }
++
++ return 0;
++}
++
++static struct block_device_operations blktap_device_file_operations = {
++ .owner = THIS_MODULE,
++ .open = blktap_device_open,
++ .release = blktap_device_release,
++ .ioctl = blktap_device_ioctl,
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
++ .getgeo = blktap_device_getgeo
++#endif
++};
++
++static int
++blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
++ unsigned long addr, void *data)
++{
++ pte_t *pte = (pte_t *)data;
++
++ BTDBG("ptep %p -> %012llx\n", ptep, (unsigned long long)pte_val(*pte));
++ set_pte(ptep, *pte);
++ return 0;
++}
++
++static int
++blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte)
++{
++ return apply_to_page_range(mm, address,
++ PAGE_SIZE, blktap_map_uaddr_fn, &pte);
++}
++
++static int
++blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
++ unsigned long addr, void *data)
++{
++ struct mm_struct *mm = (struct mm_struct *)data;
++
++ BTDBG("ptep %p\n", ptep);
++ pte_clear(mm, addr, ptep);
++ return 0;
++}
++
++static int
++blktap_umap_uaddr(struct mm_struct *mm, unsigned long address)
++{
++ return apply_to_page_range(mm, address,
++ PAGE_SIZE, blktap_umap_uaddr_fn, mm);
++}
++
++static inline void
++flush_tlb_kernel_page(unsigned long kvaddr)
++{
++ flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE);
++}
++
++static void
++blktap_device_end_dequeued_request(struct blktap_device *dev,
++ struct request *req, int error)
++{
++ unsigned long flags;
++ int ret;
++
++ //spin_lock_irq(&dev->lock);
++ spin_lock_irqsave(dev->gd->queue->queue_lock, flags);
++ ret = __blk_end_request(req, error, blk_rq_bytes(req));
++ spin_unlock_irqrestore(dev->gd->queue->queue_lock, flags);
++ //spin_unlock_irq(&dev->lock);
++
++ BUG_ON(ret);
++}
++
++/*
++ * tap->tap_sem held on entry
++ */
++static void
++blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
++{
++ uint64_t ptep;
++ int ret, usr_idx;
++ unsigned int i, cnt;
++ struct page **map, *page;
++ struct blktap_ring *ring;
++ struct grant_handle_pair *khandle;
++ unsigned long kvaddr, uvaddr, offset;
++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
++
++ cnt = 0;
++ ring = &tap->ring;
++ usr_idx = request->usr_idx;
++ map = ring->foreign_map.map;
++
++ if (!ring->vma)
++ return;
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ zap_page_range(ring->vma,
++ MMAP_VADDR(ring->user_vstart, usr_idx, 0),
++ request->nr_pages << PAGE_SHIFT, NULL);
++
++ for (i = 0; i < request->nr_pages; i++) {
++ kvaddr = request_to_kaddr(request, i);
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++
++ khandle = request->handles + i;
++
++ if (khandle->kernel != INVALID_GRANT_HANDLE) {
++ gnttab_set_unmap_op(&unmap[cnt], kvaddr,
++ GNTMAP_host_map, khandle->kernel);
++ cnt++;
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++ INVALID_P2M_ENTRY);
++ }
++
++ if (khandle->user != INVALID_GRANT_HANDLE) {
++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++ if (create_lookup_pte_addr(ring->vma->vm_mm,
++ uvaddr, &ptep) != 0) {
++ BTERR("Couldn't get a pte addr!\n");
++ return;
++ }
++
++ gnttab_set_unmap_op(&unmap[cnt], ptep,
++ GNTMAP_host_map
++ | GNTMAP_application_map
++ | GNTMAP_contains_pte,
++ khandle->user);
++ cnt++;
++ }
++
++ offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
++
++ BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
++ "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
++ "0x%08lx, handle: %u\n", offset, map[offset], request,
++ usr_idx, i, kvaddr, khandle->kernel, uvaddr,
++ khandle->user);
++
++ page = map[offset];
++ if (page) {
++ ClearPageReserved(map[offset]);
++ if (blkback_pagemap_contains_page(page))
++ set_page_private(page, 0);
++ }
++ map[offset] = NULL;
++
++ khandle->kernel = INVALID_GRANT_HANDLE;
++ khandle->user = INVALID_GRANT_HANDLE;
++ }
++
++ if (cnt) {
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++ unmap, cnt);
++ BUG_ON(ret);
++ }
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ zap_page_range(ring->vma,
++ MMAP_VADDR(ring->user_vstart, usr_idx, 0),
++ request->nr_pages << PAGE_SHIFT, NULL);
++}
++
++/*
++ * tap->tap_sem held on entry
++ */
++static void
++blktap_unmap(struct blktap *tap, struct blktap_request *request)
++{
++ int i, usr_idx;
++ unsigned long kvaddr;
++
++ usr_idx = request->usr_idx;
++ down_write(&tap->ring.vma->vm_mm->mmap_sem);
++
++ for (i = 0; i < request->nr_pages; i++) {
++ kvaddr = request_to_kaddr(request, i);
++ BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
++ "uvaddr: 0x%08lx, uhandle: %u\n", request, i,
++ kvaddr, request->handles[i].kernel,
++ MMAP_VADDR(tap->ring.user_vstart, usr_idx, i),
++ request->handles[i].user);
++
++ if (request->handles[i].kernel == INVALID_GRANT_HANDLE) {
++ blktap_umap_uaddr(tap->ring.vma->vm_mm, kvaddr);
++ flush_tlb_kernel_page(kvaddr);
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++ INVALID_P2M_ENTRY);
++ }
++ }
++
++ blktap_device_fast_flush(tap, request);
++ up_write(&tap->ring.vma->vm_mm->mmap_sem);
++}
++
++/*
++ * Called if the tapdisk process dies unexpectedly.
++ * Fail and release any pending requests and disable the queue.
++ */
++void
++blktap_device_fail_pending_requests(struct blktap *tap)
++{
++ int usr_idx;
++ struct request *req;
++ struct blktap_device *dev;
++ struct blktap_request *request;
++
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ return;
++
++ down_write(&tap->tap_sem);
++
++ dev = &tap->device;
++ for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
++ request = tap->pending_requests[usr_idx];
++ if (!request || request->status != BLKTAP_REQUEST_PENDING)
++ continue;
++
++ BTERR("%u:%u: failing pending %s of %d pages\n",
++ blktap_device_major, tap->minor,
++ (request->operation == BLKIF_OP_READ ?
++ "read" : "write"), request->nr_pages);
++
++ blktap_unmap(tap, request);
++ req = (struct request *)(unsigned long)request->id;
++ blktap_device_end_dequeued_request(dev, req, -EIO);
++ blktap_request_free(tap, request);
++ }
++
++ up_write(&tap->tap_sem);
++
++ spin_lock_irq(&dev->lock);
++
++ /* fail any future requests */
++ dev->gd->queue->queuedata = NULL;
++ blk_start_queue(dev->gd->queue);
++
++ spin_unlock_irq(&dev->lock);
++}
++
++/*
++ * tap->tap_sem held on entry
++ */
++void
++blktap_device_finish_request(struct blktap *tap,
++ struct blkif_response *res,
++ struct blktap_request *request)
++{
++ int ret;
++ struct request *req;
++ struct blktap_device *dev;
++
++ dev = &tap->device;
++
++ blktap_unmap(tap, request);
++
++ req = (struct request *)(unsigned long)request->id;
++ ret = res->status == BLKIF_RSP_OKAY ? 0 : -EIO;
++
++ BTDBG("req %p res status %d operation %d/%d id %lld\n", req,
++ res->status, res->operation, request->operation,
++ (unsigned long long)res->id);
++
++ switch (request->operation) {
++ case BLKIF_OP_READ:
++ case BLKIF_OP_WRITE:
++ if (unlikely(res->status != BLKIF_RSP_OKAY))
++ BTERR("Bad return from device data "
++ "request: %x\n", res->status);
++ blktap_device_end_dequeued_request(dev, req, ret);
++ break;
++ default:
++ BUG();
++ }
++
++ blktap_request_free(tap, request);
++}
++
++static int
++blktap_prep_foreign(struct blktap *tap,
++ struct blktap_request *request,
++ struct blkif_request *blkif_req,
++ unsigned int seg, struct page *page,
++ struct blktap_grant_table *table)
++{
++ uint64_t ptep;
++ uint32_t flags;
++#ifdef BLKTAP_CHAINED_BLKTAP
++ struct page *tap_page;
++#endif
++ struct blktap_ring *ring;
++ struct blkback_pagemap map;
++ unsigned long uvaddr, kvaddr;
++
++ ring = &tap->ring;
++ map = blkback_pagemap_read(page);
++ blkif_req->seg[seg].gref = map.gref;
++
++ uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
++ kvaddr = request_to_kaddr(request, seg);
++ flags = GNTMAP_host_map |
++ (request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);
++
++ gnttab_set_map_op(&table->grants[table->cnt],
++ kvaddr, flags, map.gref, map.domid);
++ table->cnt++;
++
++
++#ifdef BLKTAP_CHAINED_BLKTAP
++ /* enable chained tap devices */
++ tap_page = request_to_page(request, seg);
++ set_page_private(tap_page, page_private(page));
++ SetPageBlkback(tap_page);
++#endif
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return 0;
++
++ if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
++ BTERR("couldn't get a pte addr!\n");
++ return -1;
++ }
++
++ flags |= GNTMAP_application_map | GNTMAP_contains_pte;
++ gnttab_set_map_op(&table->grants[table->cnt],
++ ptep, flags, map.gref, map.domid);
++ table->cnt++;
++
++ return 0;
++}
++
++static int
++blktap_map_foreign(struct blktap *tap,
++ struct blktap_request *request,
++ struct blkif_request *blkif_req,
++ struct blktap_grant_table *table)
++{
++ struct page *page;
++ int i, grant, err, usr_idx;
++ struct blktap_ring *ring;
++ unsigned long uvaddr, foreign_mfn;
++
++ if (!table->cnt)
++ return 0;
++
++ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ table->grants, table->cnt);
++ BUG_ON(err);
++
++ grant = 0;
++ usr_idx = request->usr_idx;
++ ring = &tap->ring;
++
++ for (i = 0; i < request->nr_pages; i++) {
++ if (!blkif_req->seg[i].gref)
++ continue;
++
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++
++ if (unlikely(table->grants[grant].status)) {
++ BTERR("invalid kernel buffer: could not remap it\n");
++ err |= 1;
++ table->grants[grant].handle = INVALID_GRANT_HANDLE;
++ }
++
++ request->handles[i].kernel = table->grants[grant].handle;
++ foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
++ grant++;
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ goto done;
++
++ if (unlikely(table->grants[grant].status)) {
++ BTERR("invalid user buffer: could not remap it\n");
++ err |= 1;
++ table->grants[grant].handle = INVALID_GRANT_HANDLE;
++ }
++
++ request->handles[i].user = table->grants[grant].handle;
++ grant++;
++
++ done:
++ if (err)
++ continue;
++
++ page = request_to_page(request, i);
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ set_phys_to_machine(page_to_pfn(page),
++ FOREIGN_FRAME(foreign_mfn));
++ else if (vm_insert_page(ring->vma, uvaddr, page))
++ err |= 1;
++
++ BTDBG("pending_req: %p, seg: %d, page: %p, "
++ "kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, "
++ "uhandle: %u\n", request, i, page,
++ pfn_to_kaddr(page_to_pfn(page)),
++ request->handles[i].kernel,
++ uvaddr, request->handles[i].user);
++ }
++
++ return err;
++}
++
++static void
++blktap_map(struct blktap *tap,
++ struct blktap_request *request,
++ unsigned int seg, struct page *page)
++{
++ pte_t pte;
++ int usr_idx;
++ struct blktap_ring *ring;
++ unsigned long uvaddr, kvaddr;
++
++ ring = &tap->ring;
++ usr_idx = request->usr_idx;
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
++ kvaddr = request_to_kaddr(request, seg);
++
++ pte = mk_pte(page, ring->vma->vm_page_prot);
++ blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte));
++ flush_tlb_page(ring->vma, uvaddr);
++ blktap_map_uaddr(ring->vma->vm_mm, kvaddr, mk_pte(page, PAGE_KERNEL));
++ flush_tlb_kernel_page(kvaddr);
++
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
++ request->handles[seg].kernel = INVALID_GRANT_HANDLE;
++ request->handles[seg].user = INVALID_GRANT_HANDLE;
++
++ BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
++ "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
++ uvaddr);
++}
++
++static int
++blktap_device_process_request(struct blktap *tap,
++ struct blktap_request *request,
++ struct request *req)
++{
++ struct page *page;
++ int i, usr_idx, err;
++ struct blktap_ring *ring;
++ struct scatterlist *sg;
++ struct blktap_grant_table table;
++ unsigned int fsect, lsect, nr_sects;
++ unsigned long offset, uvaddr;
++ struct blkif_request blkif_req, *target;
++
++ err = -1;
++ memset(&table, 0, sizeof(table));
++
++ if (!blktap_active(tap))
++ goto out;
++
++ ring = &tap->ring;
++ usr_idx = request->usr_idx;
++ blkif_req.id = usr_idx;
++ blkif_req.sector_number = (blkif_sector_t)blk_rq_pos(req);
++ blkif_req.handle = 0;
++ blkif_req.operation = rq_data_dir(req) ?
++ BLKIF_OP_WRITE : BLKIF_OP_READ;
++
++ request->id = (unsigned long)req;
++ request->operation = blkif_req.operation;
++ request->status = BLKTAP_REQUEST_PENDING;
++ do_gettimeofday(&request->time);
++
++ nr_sects = 0;
++ request->nr_pages = 0;
++ blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg);
++ BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
++ for (i = 0; i < blkif_req.nr_segments; ++i) {
++ sg = tap->sg + i;
++ fsect = sg->offset >> 9;
++ lsect = fsect + (sg->length >> 9) - 1;
++ nr_sects += sg->length >> 9;
++
++ blkif_req.seg[i] =
++ (struct blkif_request_segment) {
++ .gref = 0,
++ .first_sect = fsect,
++ .last_sect = lsect };
++
++ if (blkback_pagemap_contains_page(sg_page(sg))) {
++ /* foreign page -- use xen */
++ if (blktap_prep_foreign(tap,
++ request,
++ &blkif_req,
++ i,
++ sg_page(sg),
++ &table))
++ goto out;
++ } else {
++			/* do it the old-fashioned way */
++ blktap_map(tap,
++ request,
++ i,
++ sg_page(sg));
++ }
++
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++ offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
++ page = request_to_page(request, i);
++ ring->foreign_map.map[offset] = page;
++ SetPageReserved(page);
++
++ BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
++ uvaddr, page, page_to_pfn(page));
++ BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
++ "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n",
++ offset, request, i,
++ page, pfn_to_kaddr(page_to_pfn(page)), uvaddr);
++
++ request->nr_pages++;
++ }
++
++ if (blktap_map_foreign(tap, request, &blkif_req, &table))
++ goto out;
++
++ /* Finally, write the request message to the user ring. */
++ target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
++ memcpy(target, &blkif_req, sizeof(blkif_req));
++ target->id = request->usr_idx;
++ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
++ ring->ring.req_prod_pvt++;
++
++ if (rq_data_dir(req)) {
++ tap->stats.st_wr_sect += nr_sects;
++ tap->stats.st_wr_req++;
++ } else {
++ tap->stats.st_rd_sect += nr_sects;
++ tap->stats.st_rd_req++;
++ }
++
++ err = 0;
++
++out:
++ if (err)
++ blktap_device_fast_flush(tap, request);
++ return err;
++}
++
++#ifdef ENABLE_PASSTHROUGH
++#define rq_for_each_bio_safe(_bio, _tmp, _req) \
++ if ((_req)->bio) \
++ for (_bio = (_req)->bio; \
++ _bio && ((_tmp = _bio->bi_next) || 1); \
++ _bio = _tmp)
++
++static void
++blktap_device_forward_request(struct blktap *tap, struct request *req)
++{
++ struct bio *bio, *tmp;
++ struct blktap_device *dev;
++
++ dev = &tap->device;
++
++ rq_for_each_bio_safe(bio, tmp, req) {
++ bio->bi_bdev = dev->bdev;
++ submit_bio(bio->bi_rw, bio);
++ }
++}
++
++static void
++blktap_device_close_bdev(struct blktap *tap)
++{
++ struct blktap_device *dev;
++
++ dev = &tap->device;
++
++ if (dev->bdev)
++ blkdev_put(dev->bdev);
++
++ dev->bdev = NULL;
++ clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++}
++
++static int
++blktap_device_open_bdev(struct blktap *tap, u32 pdev)
++{
++ struct block_device *bdev;
++ struct blktap_device *dev;
++
++ dev = &tap->device;
++
++ bdev = open_by_devnum(pdev, FMODE_WRITE);
++ if (IS_ERR(bdev)) {
++ BTERR("opening device %x:%x failed: %ld\n",
++ MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev));
++ return PTR_ERR(bdev);
++ }
++
++ if (!bdev->bd_disk) {
++ BTERR("device %x:%x doesn't exist\n",
++ MAJOR(pdev), MINOR(pdev));
++ blkdev_put(dev->bdev);
++ return -ENOENT;
++ }
++
++ dev->bdev = bdev;
++ set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++
++ /* TODO: readjust queue parameters */
++
++ BTINFO("set device %d to passthrough on %x:%x\n",
++ tap->minor, MAJOR(pdev), MINOR(pdev));
++
++ return 0;
++}
++
++int
++blktap_device_enable_passthrough(struct blktap *tap,
++ unsigned major, unsigned minor)
++{
++ u32 pdev;
++ struct blktap_device *dev;
++
++ dev = &tap->device;
++ pdev = MKDEV(major, minor);
++
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
++
++ if (dev->bdev) {
++ if (pdev)
++ return -EINVAL;
++ blktap_device_close_bdev(tap);
++ return 0;
++ }
++
++ return blktap_device_open_bdev(tap, pdev);
++}
++#endif
++
++/*
++ * dev->lock held on entry
++ */
++static void
++blktap_device_run_queue(struct blktap *tap)
++{
++ int queued, err;
++ struct request_queue *rq;
++ struct request *req;
++ struct blktap_ring *ring;
++ struct blktap_device *dev;
++ struct blktap_request *request;
++
++ queued = 0;
++ ring = &tap->ring;
++ dev = &tap->device;
++ rq = dev->gd->queue;
++
++ BTDBG("running queue for %d\n", tap->minor);
++
++ while ((req = blk_peek_request(rq)) != NULL) {
++ if (!blk_fs_request(req)) {
++ __blk_end_request_cur(req, 0);
++ continue;
++ }
++
++ if (blk_barrier_rq(req)) {
++ __blk_end_request_cur(req, 0);
++ continue;
++ }
++
++#ifdef ENABLE_PASSTHROUGH
++ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
++ blkdev_dequeue_request(req);
++ blktap_device_forward_request(tap, req);
++ continue;
++ }
++#endif
++
++ if (RING_FULL(&ring->ring)) {
++ wait:
++ /* Avoid pointless unplugs. */
++ blk_stop_queue(rq);
++ blktap_defer(tap);
++ break;
++ }
++
++ request = blktap_request_allocate(tap);
++ if (!request) {
++ tap->stats.st_oo_req++;
++ goto wait;
++ }
+
-+static int dealloc_pte_fn(pte_t *pte, struct page *pmd_page,
-+ unsigned long addr, void *data)
-+{
-+ unsigned long mfn = pte_mfn(*pte);
-+ int ret;
-+ struct xen_memory_reservation reservation = {
-+ .nr_extents = 1,
-+ .extent_order = 0,
-+ .domid = DOMID_SELF
-+ };
++ BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) "
++ "buffer:%p [%s], pending: %p\n", req, tap->minor,
++ req->cmd, (unsigned long long)blk_rq_pos(req),
++ blk_rq_cur_sectors(req),
++ blk_rq_sectors(req), req->buffer,
++ rq_data_dir(req) ? "write" : "read", request);
+
-+ set_xen_guest_handle(reservation.extent_start, &mfn);
-+ set_pte_at(&init_mm, addr, pte, __pte_ma(0));
-+ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
++ blk_start_request(req);
+
-+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
-+ BUG_ON(ret != 1);
++ spin_unlock_irq(&dev->lock);
++ down_read(&tap->tap_sem);
+
-+ return 0;
++ err = blktap_device_process_request(tap, request, req);
++ if (!err)
++ queued++;
++ else {
++ blktap_device_end_dequeued_request(dev, req, -EIO);
++ blktap_request_free(tap, request);
++ }
++
++ up_read(&tap->tap_sem);
++ spin_lock_irq(&dev->lock);
++ }
++
++ if (queued)
++ blktap_ring_kick_user(tap);
+}
+
-+struct page **alloc_empty_pages_and_pagevec(int nr_pages)
++/*
++ * dev->lock held on entry
++ */
++static void
++blktap_device_do_request(struct request_queue *rq)
+{
-+ struct page *page, **pagevec;
-+ int npages;
-+ int i, j, ret;
++ struct request *req;
++ struct blktap *tap;
++ struct blktap_device *dev;
+
-+ /* Round up to next number of balloon_order pages */
-+ npages = (nr_pages + (balloon_npages-1)) >> balloon_order;
++ dev = rq->queuedata;
++ if (!dev)
++ goto fail;
+
-+ pagevec = kmalloc(sizeof(page) * nr_pages << balloon_order, GFP_KERNEL);
-+ if (pagevec == NULL)
-+ return NULL;
++ tap = dev_to_blktap(dev);
++ if (!blktap_active(tap))
++ goto fail;
+
-+ for (i = 0; i < nr_pages; i++) {
-+ void *v;
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
++ test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++ blktap_defer(tap);
++ return;
++ }
+
-+ page = alloc_pages(GFP_KERNEL|__GFP_COLD, balloon_order);
-+ if (page == NULL)
-+ goto err;
++ blktap_device_run_queue(tap);
++ return;
+
-+ scrub_page(page);
++fail:
++ while ((req = blk_peek_request(rq))) {
++ BTERR("device closed: failing secs %llu - %llu\n",
++ (unsigned long long)blk_rq_pos(req),
++ (unsigned long long)blk_rq_pos(req) + blk_rq_sectors(req));
++ __blk_end_request_cur(req, 0);
++ }
++}
+
-+ mutex_lock(&balloon_mutex);
++void
++blktap_device_restart(struct blktap *tap)
++{
++ struct blktap_device *dev;
+
-+ v = page_address(page);
++ dev = &tap->device;
+
-+ ret = apply_to_page_range(&init_mm, (unsigned long)v,
-+ PAGE_SIZE << balloon_order,
-+ dealloc_pte_fn, NULL);
++ if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) {
++ blktap_defer(tap);
++ return;
++ }
+
-+ if (ret != 0) {
-+ mutex_unlock(&balloon_mutex);
-+ //balloon_free_page(page); /* tries to use free_cold_page */
-+ __free_page(page);
-+ goto err;
-+ }
-+ for (j = 0; j < balloon_npages; j++)
-+ pagevec[(i<<balloon_order)+j] = page++;
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
++ test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++ blktap_defer(tap);
++ return;
++ }
+
-+ totalram_pages = balloon_stats.current_pages -= balloon_npages;
++ spin_lock_irq(&dev->lock);
+
-+ mutex_unlock(&balloon_mutex);
-+ }
++ /* Re-enable calldowns. */
++ if (dev->gd) {
++ struct request_queue *rq = dev->gd->queue;
+
-+ out:
-+ schedule_work(&balloon_worker);
-+ flush_tlb_all();
-+ return pagevec;
++ if (blk_queue_stopped(rq))
++ blk_start_queue(rq);
+
-+ err:
-+ mutex_lock(&balloon_mutex);
-+ while (--i >= 0)
-+ balloon_append(pagevec[i << balloon_order]);
-+ mutex_unlock(&balloon_mutex);
-+ kfree(pagevec);
-+ pagevec = NULL;
-+ goto out;
++ /* Kick things off immediately. */
++ blktap_device_do_request(rq);
++ }
++
++ spin_unlock_irq(&dev->lock);
+}
-+EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
+
-+void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
++static void
++blktap_device_configure(struct blktap *tap)
+{
-+ struct page *page;
-+ int i;
-+ int npages;
++ struct request_queue *rq;
++ struct blktap_device *dev = &tap->device;
+
-+ if (pagevec == NULL)
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd)
+ return;
+
-+ /* Round up to next number of balloon_order pages */
-+ npages = (nr_pages + (balloon_npages-1)) >> balloon_order;
-+
-+ mutex_lock(&balloon_mutex);
-+ for (i = 0; i < nr_pages; i++) {
-+ page = pagevec[i << balloon_order];
-+ BUG_ON(page_count(page) != 1);
-+ balloon_append(page);
-+ }
-+ mutex_unlock(&balloon_mutex);
-+
-+ kfree(pagevec);
-+
-+ schedule_work(&balloon_worker);
-+}
-+EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
++ dev = &tap->device;
++ rq = dev->gd->queue;
+
- #define BALLOON_SHOW(name, format, args...) \
- static ssize_t show_##name(struct sys_device *dev, \
- struct sysdev_attribute *attr, \
-@@ -477,7 +647,7 @@ static ssize_t store_target_kb(struct sys_device *dev,
-
- target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;
-
-- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
-+ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
-
- return count;
- }
-@@ -491,7 +661,7 @@ static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr
- {
- return sprintf(buf, "%llu\n",
- (unsigned long long)balloon_stats.target_pages
-- << PAGE_SHIFT);
-+ << (PAGE_SHIFT + balloon_order));
- }
-
- static ssize_t store_target(struct sys_device *dev,
-@@ -507,7 +677,7 @@ static ssize_t store_target(struct sys_device *dev,
-
- target_bytes = memparse(buf, &endchar);
-
-- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
-+ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
-
- return count;
- }
-diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
-new file mode 100644
-index 0000000..d40f534
---- /dev/null
-+++ b/drivers/xen/biomerge.c
-@@ -0,0 +1,14 @@
-+#include <linux/bio.h>
-+#include <asm/io.h>
-+#include <xen/page.h>
++ spin_lock_irq(&dev->lock);
+
-+bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
-+ const struct bio_vec *vec2)
-+{
-+ unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
-+ unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
++ set_capacity(dev->gd, tap->params.capacity);
+
-+ return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
-+ ((mfn1 == mfn2) || ((mfn1+1) == mfn2));
-+}
++ /* Hard sector size and max sectors impersonate the equiv. hardware. */
++ blk_queue_logical_block_size(rq, tap->params.sector_size);
++ blk_queue_max_sectors(rq, 512);
+
-diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile
-new file mode 100644
-index 0000000..dee55ba
---- /dev/null
-+++ b/drivers/xen/blkback/Makefile
-@@ -0,0 +1,4 @@
-+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
-+obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o
++ /* Each segment in a request is up to an aligned page in size. */
++ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
++ blk_queue_max_segment_size(rq, PAGE_SIZE);
+
-+xen-blkback-y := blkback.o xenbus.o interface.o vbd.o
-diff --git a/drivers/xen/blkback/blkback-pagemap.c b/drivers/xen/blkback/blkback-pagemap.c
-new file mode 100644
-index 0000000..45f6eb2
---- /dev/null
-+++ b/drivers/xen/blkback/blkback-pagemap.c
-@@ -0,0 +1,109 @@
-+#include <linux/module.h>
-+#include "blkback-pagemap.h"
++ /* Ensure a merged request will fit in a single I/O ring slot. */
++ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
-+static int blkback_pagemap_size;
-+static struct blkback_pagemap *blkback_pagemap;
++ /* Make sure buffer addresses are sector-aligned. */
++ blk_queue_dma_alignment(rq, 511);
+
-+static inline int
-+blkback_pagemap_entry_clear(struct blkback_pagemap *map)
-+{
-+ static struct blkback_pagemap zero;
-+ return !memcmp(map, &zero, sizeof(zero));
++ spin_unlock_irq(&dev->lock);
+}
+
+int
-+blkback_pagemap_init(int pages)
++blktap_device_resume(struct blktap *tap)
+{
-+ blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap),
-+ GFP_KERNEL);
-+ if (!blkback_pagemap)
-+ return -ENOMEM;
++ int err;
+
-+ blkback_pagemap_size = pages;
-+ return 0;
-+}
-+EXPORT_SYMBOL_GPL(blkback_pagemap_init);
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
++ return -ENODEV;
+
-+void
-+blkback_pagemap_set(int idx, struct page *page,
-+ domid_t domid, busid_t busid, grant_ref_t gref)
-+{
-+ struct blkback_pagemap *entry;
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return 0;
+
-+ BUG_ON(!blkback_pagemap);
-+ BUG_ON(idx >= blkback_pagemap_size);
++ err = blktap_ring_resume(tap);
++ if (err)
++ return err;
+
-+ set_page_private(page, idx);
++ /* device size may have changed */
++ blktap_device_configure(tap);
+
-+ entry = blkback_pagemap + idx;
-+ if (!blkback_pagemap_entry_clear(entry)) {
-+ printk("overwriting pagemap %d: d %u b %u g %u\n",
-+ idx, entry->domid, entry->busid, entry->gref);
-+ BUG();
-+ }
++ BTDBG("restarting device\n");
++ blktap_device_restart(tap);
+
-+ entry->page = page;
-+ entry->domid = domid;
-+ entry->busid = busid;
-+ entry->gref = gref;
++ return 0;
+}
-+EXPORT_SYMBOL_GPL(blkback_pagemap_set);
+
-+void
-+blkback_pagemap_clear(struct page *page)
++int
++blktap_device_pause(struct blktap *tap)
+{
-+ int idx;
-+ struct blkback_pagemap *entry;
-+
-+ idx = (int)page_private(page);
-+
-+ BUG_ON(!blkback_pagemap);
-+ BUG_ON(idx >= blkback_pagemap_size);
-+
-+ entry = blkback_pagemap + idx;
-+ if (blkback_pagemap_entry_clear(entry)) {
-+ printk("clearing empty pagemap %d\n", idx);
-+ BUG();
-+ }
++ unsigned long flags;
++ struct blktap_device *dev = &tap->device;
+
-+ memset(entry, 0, sizeof(*entry));
-+}
-+EXPORT_SYMBOL_GPL(blkback_pagemap_clear);
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
++ return -ENODEV;
+
-+struct blkback_pagemap
-+blkback_pagemap_read(struct page *page)
-+{
-+ int idx;
-+ struct blkback_pagemap *entry;
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return 0;
+
-+ idx = (int)page_private(page);
++ spin_lock_irqsave(&dev->lock, flags);
+
-+ BUG_ON(!blkback_pagemap);
-+ BUG_ON(idx >= blkback_pagemap_size);
++ blk_stop_queue(dev->gd->queue);
++ set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
+
-+ entry = blkback_pagemap + idx;
-+ if (blkback_pagemap_entry_clear(entry)) {
-+ printk("reading empty pagemap %d\n", idx);
-+ BUG();
-+ }
++ spin_unlock_irqrestore(&dev->lock, flags);
+
-+ return *entry;
++ return blktap_ring_pause(tap);
+}
-+EXPORT_SYMBOL(blkback_pagemap_read);
-+
-+MODULE_LICENSE("Dual BSD/GPL");
+
+int
-+blkback_pagemap_contains_page(struct page *page)
++blktap_device_destroy(struct blktap *tap)
+{
-+ struct blkback_pagemap *entry;
-+ int idx = (int)page_private(page);
++ struct blktap_device *dev = &tap->device;
++ struct gendisk *gd = dev->gd;
+
-+ if (idx < 0 || idx >= blkback_pagemap_size)
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+ return 0;
+
-+ entry = blkback_pagemap + idx;
++ BTINFO("destroy device %d users %d\n", tap->minor, dev->users);
+
-+ return (entry->page == page);
-+}
-+EXPORT_SYMBOL(blkback_pagemap_contains_page);
-diff --git a/drivers/xen/blkback/blkback-pagemap.h b/drivers/xen/blkback/blkback-pagemap.h
-new file mode 100644
-index 0000000..7f97d15
---- /dev/null
-+++ b/drivers/xen/blkback/blkback-pagemap.h
-@@ -0,0 +1,36 @@
-+#ifndef _BLKBACK_PAGEMAP_H_
-+#define _BLKBACK_PAGEMAP_H_
++ if (dev->users)
++ return -EBUSY;
+
-+#include <linux/mm.h>
-+#include <xen/interface/xen.h>
-+#include <xen/interface/grant_table.h>
++ spin_lock_irq(&dev->lock);
++ /* No more blktap_device_do_request(). */
++ blk_stop_queue(gd->queue);
++ clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++ dev->gd = NULL;
++ spin_unlock_irq(&dev->lock);
+
-+typedef unsigned int busid_t;
++#ifdef ENABLE_PASSTHROUGH
++ if (dev->bdev)
++ blktap_device_close_bdev(tap);
++#endif
+
-+struct blkback_pagemap {
-+ struct page *page;
-+ domid_t domid;
-+ busid_t busid;
-+ grant_ref_t gref;
-+};
++ del_gendisk(gd);
++ blk_cleanup_queue(gd->queue);
++ put_disk(gd);
+
-+#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE)
++ wake_up(&tap->wq);
+
-+int blkback_pagemap_init(int);
-+void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t);
-+void blkback_pagemap_clear(struct page *);
-+struct blkback_pagemap blkback_pagemap_read(struct page *);
-+int blkback_pagemap_contains_page(struct page *page);
++ return 0;
++}
+
-+#else /* CONFIG_XEN_BLKBACK_PAGEMAP */
++int
++blktap_device_create(struct blktap *tap)
++{
++ int minor, err;
++ struct gendisk *gd;
++ struct request_queue *rq;
++ struct blktap_device *dev;
+
-+static inline int blkback_pagemap_init(int pages) { return 0; }
-+static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom,
-+ busid_t bus, grant_ref_t gnt) {}
-+static inline void blkback_pagemap_clear(struct page *page) {}
-+#define blkback_pagemap_read(_page) ({ BUG(); (struct blkback_pagemap){0}; })
-+static inline int blkback_pagemap_contains_page(struct page *page) { return 0; }
++ gd = NULL;
++ rq = NULL;
++ dev = &tap->device;
++ minor = tap->minor;
+
-+#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */
++ if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ return -EEXIST;
+
-+#endif
-diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c
-new file mode 100644
-index 0000000..e644dd5
---- /dev/null
-+++ b/drivers/xen/blkback/blkback.c
-@@ -0,0 +1,672 @@
-+/******************************************************************************
-+ * arch/xen/drivers/blkif/backend/main.c
-+ *
-+ * Back-end of the driver for virtual block devices. This portion of the
-+ * driver exports a 'unified' block-device interface that can be accessed
-+ * by any operating system that implements a compatible front end. A
-+ * reference front-end implementation can be found in:
-+ * arch/xen/drivers/blkif/frontend
-+ *
-+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
-+ * Copyright (c) 2005, Christopher Clark
-+ *
-+ * This program is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU General Public License version 2
-+ * as published by the Free Software Foundation; or, when distributed
-+ * separately from the Linux kernel or incorporated into other
-+ * software packages, subject to the following license:
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-+ * of this source file (the "Software"), to deal in the Software without
-+ * restriction, including without limitation the rights to use, copy, modify,
-+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-+ * and to permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+ * IN THE SOFTWARE.
-+ */
++ if (blktap_validate_params(tap, &tap->params))
++ return -EINVAL;
+
-+#include <linux/spinlock.h>
-+#include <linux/kthread.h>
-+#include <linux/list.h>
-+#include <linux/delay.h>
-+#include <linux/freezer.h>
++ BTINFO("minor %d sectors %Lu sector-size %lu\n",
++ minor, tap->params.capacity, tap->params.sector_size);
+
-+#include <xen/balloon.h>
-+#include <xen/events.h>
-+#include <xen/page.h>
-+#include <asm/xen/hypervisor.h>
-+#include <asm/xen/hypercall.h>
-+#include "common.h"
++ err = -ENODEV;
+
-+/*
-+ * These are rather arbitrary. They are fairly large because adjacent requests
-+ * pulled from a communication ring are quite likely to end up being part of
-+ * the same scatter/gather request at the disc.
-+ *
-+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
-+ *
-+ * This will increase the chances of being able to write whole tracks.
-+ * 64 should be enough to keep us competitive with Linux.
-+ */
-+static int blkif_reqs = 64;
-+module_param_named(reqs, blkif_reqs, int, 0);
-+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
++ gd = alloc_disk(1);
++ if (!gd)
++ goto error;
++
++ if (minor < 26)
++ sprintf(gd->disk_name, "tapdev%c", 'a' + minor);
++ else
++ sprintf(gd->disk_name, "tapdev%c%c",
++ 'a' + ((minor / 26) - 1), 'a' + (minor % 26));
++
++ gd->major = blktap_device_major;
++ gd->first_minor = minor;
++ gd->fops = &blktap_device_file_operations;
++ gd->private_data = dev;
++
++ spin_lock_init(&dev->lock);
++ rq = blk_init_queue(blktap_device_do_request, &dev->lock);
++ if (!rq)
++ goto error;
+
-+/* Run-time switchable: /sys/module/blkback/parameters/ */
-+static unsigned int log_stats = 0;
-+static unsigned int debug_lvl = 0;
-+module_param(log_stats, int, 0644);
-+module_param(debug_lvl, int, 0644);
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
++ elevator_init(rq, "noop");
++#else
++ elevator_init(rq, &elevator_noop);
++#endif
+
-+/*
-+ * Each outstanding request that we've passed to the lower device layers has a
-+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
-+ * the pendcnt towards zero. When it hits zero, the specified domain has a
-+ * response queued for it, with the saved 'id' passed back.
-+ */
-+typedef struct {
-+ blkif_t *blkif;
-+ u64 id;
-+ int nr_pages;
-+ atomic_t pendcnt;
-+ unsigned short operation;
-+ int status;
-+ struct list_head free_list;
-+} pending_req_t;
++ gd->queue = rq;
++ rq->queuedata = dev;
++ dev->gd = gd;
+
-+static pending_req_t *pending_reqs;
-+static struct list_head pending_free;
-+static DEFINE_SPINLOCK(pending_free_lock);
-+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
++ set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++ blktap_device_configure(tap);
+
-+#define BLKBACK_INVALID_HANDLE (~0)
++ add_disk(gd);
+
-+static struct page **pending_pages;
-+static grant_handle_t *pending_grant_handles;
++ err = 0;
++ goto out;
+
-+static inline int vaddr_pagenr(pending_req_t *req, int seg)
-+{
-+ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
++ error:
++ if (gd)
++ del_gendisk(gd);
++ if (rq)
++ blk_cleanup_queue(rq);
++
++ out:
++ BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
++ return err;
+}
+
-+#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
++int __init
++blktap_device_init(int *maj)
++{
++ int major;
+
-+static inline unsigned long vaddr(pending_req_t *req, int seg)
++ /* Dynamically allocate a major for this device */
++ major = register_blkdev(0, "tapdev");
++ if (major < 0) {
++ BTERR("Couldn't register blktap device\n");
++ return -ENOMEM;
++ }
++
++ blktap_device_major = *maj = major;
++ BTINFO("blktap device major %d\n", major);
++
++ return 0;
++}
++
++void
++blktap_device_free(void)
+{
-+ unsigned long pfn = page_to_pfn(pending_page(req, seg));
-+ return (unsigned long)pfn_to_kaddr(pfn);
++ if (blktap_device_major)
++ unregister_blkdev(blktap_device_major, "tapdev");
+}
+diff --git a/drivers/xen/blktap/request.c b/drivers/xen/blktap/request.c
+new file mode 100644
+index 0000000..770736a
+--- /dev/null
++++ b/drivers/xen/blktap/request.c
+@@ -0,0 +1,297 @@
++#include <linux/spinlock.h>
++#include <xen/balloon.h>
++#include <linux/sched.h>
+
-+#define pending_handle(_req, _seg) \
-+ (pending_grant_handles[vaddr_pagenr(_req, _seg)])
++#include "blktap.h"
+
++#define MAX_BUCKETS 8
++#define BUCKET_SIZE MAX_PENDING_REQS
+
-+static int do_block_io_op(blkif_t *blkif);
-+static void dispatch_rw_block_io(blkif_t *blkif,
-+ struct blkif_request *req,
-+ pending_req_t *pending_req);
-+static void make_response(blkif_t *blkif, u64 id,
-+ unsigned short op, int st);
++#define BLKTAP_POOL_CLOSING 1
+
-+/******************************************************************
-+ * misc small helpers
-+ */
-+static pending_req_t* alloc_req(void)
++struct blktap_request_bucket;
++
++struct blktap_request_handle {
++ int slot;
++ uint8_t inuse;
++ struct blktap_request request;
++ struct blktap_request_bucket *bucket;
++};
++
++struct blktap_request_bucket {
++ atomic_t reqs_in_use;
++ struct blktap_request_handle handles[BUCKET_SIZE];
++ struct page **foreign_pages;
++};
++
++struct blktap_request_pool {
++ spinlock_t lock;
++ uint8_t status;
++ struct list_head free_list;
++ atomic_t reqs_in_use;
++ wait_queue_head_t wait_queue;
++ struct blktap_request_bucket *buckets[MAX_BUCKETS];
++};
++
++static struct blktap_request_pool pool;
++
++static inline struct blktap_request_handle *
++blktap_request_to_handle(struct blktap_request *req)
+{
-+ pending_req_t *req = NULL;
-+ unsigned long flags;
++ return container_of(req, struct blktap_request_handle, request);
++}
+
-+ spin_lock_irqsave(&pending_free_lock, flags);
-+ if (!list_empty(&pending_free)) {
-+ req = list_entry(pending_free.next, pending_req_t, free_list);
-+ list_del(&req->free_list);
++static void
++blktap_request_pool_init_request(struct blktap_request *request)
++{
++ int i;
++
++ request->usr_idx = -1;
++ request->nr_pages = 0;
++ request->status = BLKTAP_REQUEST_FREE;
++ INIT_LIST_HEAD(&request->free_list);
++ for (i = 0; i < ARRAY_SIZE(request->handles); i++) {
++ request->handles[i].user = INVALID_GRANT_HANDLE;
++ request->handles[i].kernel = INVALID_GRANT_HANDLE;
+ }
-+ spin_unlock_irqrestore(&pending_free_lock, flags);
-+ return req;
+}
+
-+static void free_req(pending_req_t *req)
++static int
++blktap_request_pool_allocate_bucket(void)
+{
++ int i, idx;
+ unsigned long flags;
-+ int was_empty;
++ struct blktap_request *request;
++ struct blktap_request_handle *handle;
++ struct blktap_request_bucket *bucket;
+
-+ spin_lock_irqsave(&pending_free_lock, flags);
-+ was_empty = list_empty(&pending_free);
-+ list_add(&req->free_list, &pending_free);
-+ spin_unlock_irqrestore(&pending_free_lock, flags);
-+ if (was_empty)
-+ wake_up(&pending_free_wq);
++ bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL);
++ if (!bucket)
++ goto fail;
++
++ bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES);
++ if (!bucket->foreign_pages)
++ goto fail;
++
++ spin_lock_irqsave(&pool.lock, flags);
++
++ idx = -1;
++ for (i = 0; i < MAX_BUCKETS; i++) {
++ if (!pool.buckets[i]) {
++ idx = i;
++ pool.buckets[idx] = bucket;
++ break;
++ }
++ }
++
++ if (idx == -1) {
++ spin_unlock_irqrestore(&pool.lock, flags);
++ goto fail;
++ }
++
++ for (i = 0; i < BUCKET_SIZE; i++) {
++ handle = bucket->handles + i;
++ request = &handle->request;
++
++ handle->slot = i;
++ handle->inuse = 0;
++ handle->bucket = bucket;
++
++ blktap_request_pool_init_request(request);
++ list_add_tail(&request->free_list, &pool.free_list);
++ }
++
++ spin_unlock_irqrestore(&pool.lock, flags);
++
++ return 0;
++
++fail:
++ if (bucket && bucket->foreign_pages)
++ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
++ kfree(bucket);
++ return -ENOMEM;
+}
+
-+static void unplug_queue(blkif_t *blkif)
++static void
++blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket)
+{
-+ if (blkif->plug == NULL)
++ if (!bucket)
+ return;
-+ if (blkif->plug->unplug_fn)
-+ blkif->plug->unplug_fn(blkif->plug);
-+ blk_put_queue(blkif->plug);
-+ blkif->plug = NULL;
++
++ BTDBG("freeing bucket %p\n", bucket);
++
++ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
++ kfree(bucket);
+}
+
-+static void plug_queue(blkif_t *blkif, struct block_device *bdev)
++struct page *
++request_to_page(struct blktap_request *req, int seg)
+{
-+ struct request_queue *q = bdev_get_queue(bdev);
-+
-+ if (q == blkif->plug)
-+ return;
-+ unplug_queue(blkif);
-+ blk_get_queue(q);
-+ blkif->plug = q;
++ struct blktap_request_handle *handle = blktap_request_to_handle(req);
++ int idx = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
++ return handle->bucket->foreign_pages[idx];
+}
+
-+static void fast_flush_area(pending_req_t *req)
++int
++blktap_request_pool_shrink(void)
+{
-+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-+ unsigned int i, invcount = 0;
-+ grant_handle_t handle;
-+ int ret;
++ int i, err;
++ unsigned long flags;
++ struct blktap_request_bucket *bucket;
++
++ err = -EAGAIN;
++
++ spin_lock_irqsave(&pool.lock, flags);
++
++ /* always keep at least one bucket */
++ for (i = 1; i < MAX_BUCKETS; i++) {
++ bucket = pool.buckets[i];
++ if (!bucket)
++ continue;
+
-+ for (i = 0; i < req->nr_pages; i++) {
-+ handle = pending_handle(req, i);
-+ if (handle == BLKBACK_INVALID_HANDLE)
++ if (atomic_read(&bucket->reqs_in_use))
+ continue;
-+ blkback_pagemap_clear(pending_page(req, i));
-+ gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
-+ GNTMAP_host_map, handle);
-+ pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
-+ invcount++;
++
++ blktap_request_pool_free_bucket(bucket);
++ pool.buckets[i] = NULL;
++ err = 0;
++ break;
+ }
+
-+ ret = HYPERVISOR_grant_table_op(
-+ GNTTABOP_unmap_grant_ref, unmap, invcount);
-+ BUG_ON(ret);
-+}
++ spin_unlock_irqrestore(&pool.lock, flags);
+
-+/******************************************************************
-+ * SCHEDULER FUNCTIONS
-+ */
++ return err;
++}
+
-+static void print_stats(blkif_t *blkif)
++int
++blktap_request_pool_grow(void)
+{
-+ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n",
-+ current->comm, blkif->st_oo_req,
-+ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
-+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
-+ blkif->st_rd_req = 0;
-+ blkif->st_wr_req = 0;
-+ blkif->st_oo_req = 0;
++ return blktap_request_pool_allocate_bucket();
+}
+
-+int blkif_schedule(void *arg)
++struct blktap_request *
++blktap_request_allocate(struct blktap *tap)
+{
-+ blkif_t *blkif = arg;
-+
-+ blkif_get(blkif);
++ int i;
++ uint16_t usr_idx;
++ unsigned long flags;
++ struct blktap_request *request;
+
-+ if (debug_lvl)
-+ printk(KERN_DEBUG "%s: started\n", current->comm);
++ usr_idx = -1;
++ request = NULL;
+
-+ while (!kthread_should_stop()) {
-+ if (try_to_freeze())
-+ continue;
++ spin_lock_irqsave(&pool.lock, flags);
+
-+ wait_event_interruptible(
-+ blkif->wq,
-+ blkif->waiting_reqs || kthread_should_stop());
-+ wait_event_interruptible(
-+ pending_free_wq,
-+ !list_empty(&pending_free) || kthread_should_stop());
++ if (pool.status == BLKTAP_POOL_CLOSING)
++ goto out;
+
-+ blkif->waiting_reqs = 0;
-+ smp_mb(); /* clear flag *before* checking for work */
++ for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++)
++ if (!tap->pending_requests[i]) {
++ usr_idx = i;
++ break;
++ }
+
-+ if (do_block_io_op(blkif))
-+ blkif->waiting_reqs = 1;
-+ unplug_queue(blkif);
++ if (usr_idx == (uint16_t)-1)
++ goto out;
+
-+ if (log_stats && time_after(jiffies, blkif->st_print))
-+ print_stats(blkif);
++ if (!list_empty(&pool.free_list)) {
++ request = list_entry(pool.free_list.next,
++ struct blktap_request, free_list);
++ list_del(&request->free_list);
+ }
+
-+ if (log_stats)
-+ print_stats(blkif);
-+ if (debug_lvl)
-+ printk(KERN_DEBUG "%s: exiting\n", current->comm);
++ if (request) {
++ struct blktap_request_handle *handle;
+
-+ blkif->xenblkd = NULL;
-+ blkif_put(blkif);
++ atomic_inc(&pool.reqs_in_use);
+
-+ return 0;
-+}
++ handle = blktap_request_to_handle(request);
++ atomic_inc(&handle->bucket->reqs_in_use);
++ handle->inuse = 1;
+
-+/******************************************************************
-+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
-+ */
++ request->usr_idx = usr_idx;
+
-+static void __end_block_io_op(pending_req_t *pending_req, int error)
-+{
-+ /* An error fails the entire request. */
-+ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
-+ (error == -EOPNOTSUPP)) {
-+ DPRINTK("blkback: write barrier op failed, not supported\n");
-+ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
-+ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
-+ } else if (error) {
-+ DPRINTK("Buffer not up-to-date at end of operation, "
-+ "error=%d\n", error);
-+ pending_req->status = BLKIF_RSP_ERROR;
++ tap->pending_requests[usr_idx] = request;
++ tap->pending_cnt++;
+ }
+
-+ if (atomic_dec_and_test(&pending_req->pendcnt)) {
-+ fast_flush_area(pending_req);
-+ make_response(pending_req->blkif, pending_req->id,
-+ pending_req->operation, pending_req->status);
-+ blkif_put(pending_req->blkif);
-+ free_req(pending_req);
-+ }
++out:
++ spin_unlock_irqrestore(&pool.lock, flags);
++ return request;
+}
+
-+static void end_block_io_op(struct bio *bio, int error)
++void
++blktap_request_free(struct blktap *tap, struct blktap_request *request)
+{
-+ __end_block_io_op(bio->bi_private, error);
-+ bio_put(bio);
-+}
-+
++ int free;
++ unsigned long flags;
++ struct blktap_request_handle *handle;
+
-+/******************************************************************************
-+ * NOTIFICATION FROM GUEST OS.
-+ */
++ BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests));
++ handle = blktap_request_to_handle(request);
+
-+static void blkif_notify_work(blkif_t *blkif)
-+{
-+ blkif->waiting_reqs = 1;
-+ wake_up(&blkif->wq);
-+}
++ spin_lock_irqsave(&pool.lock, flags);
+
-+irqreturn_t blkif_be_int(int irq, void *dev_id)
-+{
-+ blkif_notify_work(dev_id);
-+ return IRQ_HANDLED;
-+}
++ handle->inuse = 0;
++ tap->pending_requests[request->usr_idx] = NULL;
++ blktap_request_pool_init_request(request);
++ list_add(&request->free_list, &pool.free_list);
++ atomic_dec(&handle->bucket->reqs_in_use);
++ free = atomic_dec_and_test(&pool.reqs_in_use);
+
++ spin_unlock_irqrestore(&pool.lock, flags);
+
++ if (--tap->pending_cnt == 0)
++ wake_up_interruptible(&tap->wq);
+
-+/******************************************************************
-+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
-+ */
++ if (free)
++ wake_up(&pool.wait_queue);
++}
+
-+static int do_block_io_op(blkif_t *blkif)
++void
++blktap_request_pool_free(void)
+{
-+ union blkif_back_rings *blk_rings = &blkif->blk_rings;
-+ struct blkif_request req;
-+ pending_req_t *pending_req;
-+ RING_IDX rc, rp;
-+ int more_to_do = 0;
-+
-+ rc = blk_rings->common.req_cons;
-+ rp = blk_rings->common.sring->req_prod;
-+ rmb(); /* Ensure we see queued requests up to 'rp'. */
++ int i;
++ unsigned long flags;
+
-+ while (rc != rp) {
++ spin_lock_irqsave(&pool.lock, flags);
+
-+ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
-+ break;
++ pool.status = BLKTAP_POOL_CLOSING;
++ while (atomic_read(&pool.reqs_in_use)) {
++ spin_unlock_irqrestore(&pool.lock, flags);
++ wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use));
++ spin_lock_irqsave(&pool.lock, flags);
++ }
+
-+ if (kthread_should_stop()) {
-+ more_to_do = 1;
-+ break;
-+ }
++ for (i = 0; i < MAX_BUCKETS; i++) {
++ blktap_request_pool_free_bucket(pool.buckets[i]);
++ pool.buckets[i] = NULL;
++ }
+
-+ pending_req = alloc_req();
-+ if (NULL == pending_req) {
-+ blkif->st_oo_req++;
-+ more_to_do = 1;
-+ break;
-+ }
++ spin_unlock_irqrestore(&pool.lock, flags);
++}
+
-+ switch (blkif->blk_protocol) {
-+ case BLKIF_PROTOCOL_NATIVE:
-+ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
-+ break;
-+ case BLKIF_PROTOCOL_X86_32:
-+ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
-+ break;
-+ case BLKIF_PROTOCOL_X86_64:
-+ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
-+ break;
-+ default:
-+ BUG();
-+ }
-+ blk_rings->common.req_cons = ++rc; /* before make_response() */
++int __init
++blktap_request_pool_init(void)
++{
++ int i, err;
+
-+ /* Apply all sanity checks to /private copy/ of request. */
-+ barrier();
++ memset(&pool, 0, sizeof(pool));
+
-+ switch (req.operation) {
-+ case BLKIF_OP_READ:
-+ blkif->st_rd_req++;
-+ dispatch_rw_block_io(blkif, &req, pending_req);
-+ break;
-+ case BLKIF_OP_WRITE_BARRIER:
-+ blkif->st_br_req++;
-+ /* fall through */
-+ case BLKIF_OP_WRITE:
-+ blkif->st_wr_req++;
-+ dispatch_rw_block_io(blkif, &req, pending_req);
-+ break;
-+ default:
-+ /* A good sign something is wrong: sleep for a while to
-+ * avoid excessive CPU consumption by a bad guest. */
-+ msleep(1);
-+ DPRINTK("error: unknown block io operation [%d]\n",
-+ req.operation);
-+ make_response(blkif, req.id, req.operation,
-+ BLKIF_RSP_ERROR);
-+ free_req(pending_req);
-+ break;
-+ }
++ spin_lock_init(&pool.lock);
++ INIT_LIST_HEAD(&pool.free_list);
++ atomic_set(&pool.reqs_in_use, 0);
++ init_waitqueue_head(&pool.wait_queue);
+
-+ /* Yield point for this unbounded loop. */
-+ cond_resched();
++ for (i = 0; i < 2; i++) {
++ err = blktap_request_pool_allocate_bucket();
++ if (err)
++ goto fail;
+ }
+
-+ return more_to_do;
++ return 0;
++
++fail:
++ blktap_request_pool_free();
++ return err;
+}
+diff --git a/drivers/xen/blktap/ring.c b/drivers/xen/blktap/ring.c
+new file mode 100644
+index 0000000..74a7aa7
+--- /dev/null
++++ b/drivers/xen/blktap/ring.c
+@@ -0,0 +1,615 @@
++#include <linux/module.h>
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/poll.h>
+
-+static void dispatch_rw_block_io(blkif_t *blkif,
-+ struct blkif_request *req,
-+ pending_req_t *pending_req)
-+{
-+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-+ struct phys_req preq;
-+ struct {
-+ unsigned long buf; unsigned int nsec;
-+ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-+ unsigned int nseg;
-+ struct bio *bio = NULL;
-+ int ret, i;
-+ int operation;
++#include <asm/xen/page.h>
++#include <asm/xen/hypercall.h>
+
-+ switch (req->operation) {
-+ case BLKIF_OP_READ:
-+ operation = READ;
-+ break;
-+ case BLKIF_OP_WRITE:
-+ operation = WRITE;
-+ break;
-+ case BLKIF_OP_WRITE_BARRIER:
-+ operation = WRITE_BARRIER;
-+ break;
-+ default:
-+ operation = 0; /* make gcc happy */
-+ BUG();
-+ }
++#include "blktap.h"
+
-+ /* Check that number of segments is sane. */
-+ nseg = req->nr_segments;
-+ if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
-+ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
-+ DPRINTK("Bad number of segments in request (%d)\n", nseg);
-+ goto fail_response;
-+ }
++#ifdef CONFIG_XEN_BLKDEV_BACKEND
++#include "../blkback/blkback-pagemap.h"
++#else
++#define blkback_pagemap_contains_page(page) 0
++#endif
+
-+ preq.dev = req->handle;
-+ preq.sector_number = req->sector_number;
-+ preq.nr_sects = 0;
++static int blktap_ring_major;
+
-+ pending_req->blkif = blkif;
-+ pending_req->id = req->id;
-+ pending_req->operation = req->operation;
-+ pending_req->status = BLKIF_RSP_OKAY;
-+ pending_req->nr_pages = nseg;
++static inline struct blktap *
++vma_to_blktap(struct vm_area_struct *vma)
++{
++ struct vm_foreign_map *m = vma->vm_private_data;
++ struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map);
++ return container_of(r, struct blktap, ring);
++}
+
-+ for (i = 0; i < nseg; i++) {
-+ uint32_t flags;
++ /*
++ * BLKTAP - immediately before the mmap area,
++ * we have a bunch of pages reserved for shared memory rings.
++ */
++#define RING_PAGES 1
+
-+ seg[i].nsec = req->seg[i].last_sect -
-+ req->seg[i].first_sect + 1;
++static int
++blktap_read_ring(struct blktap *tap)
++{
++ /* This is called to read responses from the ring. */
++ int usr_idx;
++ RING_IDX rc, rp;
++ struct blkif_response res;
++ struct blktap_ring *ring;
++ struct blktap_request *request;
+
-+ if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
-+ (req->seg[i].last_sect < req->seg[i].first_sect))
-+ goto fail_response;
-+ preq.nr_sects += seg[i].nsec;
++ down_read(&tap->tap_sem);
+
-+ flags = GNTMAP_host_map;
-+ if (operation != READ)
-+ flags |= GNTMAP_readonly;
-+ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
-+ req->seg[i].gref, blkif->domid);
++ ring = &tap->ring;
++ if (!ring->vma) {
++ up_read(&tap->tap_sem);
++ return 0;
+ }
+
-+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
-+ BUG_ON(ret);
++ /* for each outstanding message on the ring */
++ rp = ring->ring.sring->rsp_prod;
++ rmb();
+
-+ for (i = 0; i < nseg; i++) {
-+ if (unlikely(map[i].status != 0)) {
-+ DPRINTK("invalid buffer -- could not remap it\n");
-+ map[i].handle = BLKBACK_INVALID_HANDLE;
-+ ret |= 1;
++ for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
++ memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
++ mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
++ ++ring->ring.rsp_cons;
++
++ usr_idx = (int)res.id;
++ if (usr_idx >= MAX_PENDING_REQS ||
++ !tap->pending_requests[usr_idx]) {
++ BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n",
++ rc, rp, usr_idx, tap->pid, ring->vma);
+ continue;
+ }
+
-+ set_phys_to_machine(
-+ page_to_pfn(pending_page(pending_req, i)),
-+ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
-+ seg[i].buf = map[i].dev_bus_addr |
-+ (req->seg[i].first_sect << 9);
-+ blkback_pagemap_set(vaddr_pagenr(pending_req, i),
-+ pending_page(pending_req, i),
-+ blkif->domid, req->handle,
-+ req->seg[i].gref);
-+ pending_handle(pending_req, i) = map[i].handle;
++ request = tap->pending_requests[usr_idx];
++ BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
++ blktap_device_finish_request(tap, &res, request);
+ }
+
-+ if (ret)
-+ goto fail_flush;
++ up_read(&tap->tap_sem);
+
-+ if (vbd_translate(&preq, blkif, operation) != 0) {
-+ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
-+ operation == READ ? "read" : "write",
-+ preq.sector_number,
-+ preq.sector_number + preq.nr_sects, preq.dev);
-+ goto fail_flush;
-+ }
++ blktap_run_deferred();
+
-+ plug_queue(blkif, preq.bdev);
-+ atomic_set(&pending_req->pendcnt, 1);
-+ blkif_get(blkif);
++ return 0;
++}
+
-+ for (i = 0; i < nseg; i++) {
-+ if (((int)preq.sector_number|(int)seg[i].nsec) &
-+ ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
-+ DPRINTK("Misaligned I/O request from domain %d",
-+ blkif->domid);
-+ goto fail_put_bio;
-+ }
++static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++ return VM_FAULT_SIGBUS;
++}
+
-+ while ((bio == NULL) ||
-+ (bio_add_page(bio,
-+ pending_page(pending_req, i),
-+ seg[i].nsec << 9,
-+ seg[i].buf & ~PAGE_MASK) == 0)) {
-+ if (bio) {
-+ atomic_inc(&pending_req->pendcnt);
-+ submit_bio(operation, bio);
-+ }
++static pte_t
++blktap_ring_clear_pte(struct vm_area_struct *vma,
++ unsigned long uvaddr,
++ pte_t *ptep, int is_fullmm)
++{
++ pte_t copy;
++ struct blktap *tap;
++ unsigned long kvaddr;
++ struct page **map, *page;
++ struct blktap_ring *ring;
++ struct blktap_request *request;
++ struct grant_handle_pair *khandle;
++ struct gnttab_unmap_grant_ref unmap[2];
++ int offset, seg, usr_idx, count = 0;
+
-+ bio = bio_alloc(GFP_KERNEL, nseg-i);
-+ if (unlikely(bio == NULL))
-+ goto fail_put_bio;
++ tap = vma_to_blktap(vma);
++ ring = &tap->ring;
++ map = ring->foreign_map.map;
++ BUG_ON(!map); /* TODO Should this be changed to if statement? */
+
-+ bio->bi_bdev = preq.bdev;
-+ bio->bi_private = pending_req;
-+ bio->bi_end_io = end_block_io_op;
-+ bio->bi_sector = preq.sector_number;
-+ }
++ /*
++ * Zap entry if the address is before the start of the grant
++ * mapped region.
++ */
++ if (uvaddr < ring->user_vstart)
++ return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
++ ptep, is_fullmm);
+
-+ preq.sector_number += seg[i].nsec;
++ offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
++ usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
++ seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
++
++ offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
++ page = map[offset];
++ if (page) {
++ ClearPageReserved(page);
++ if (blkback_pagemap_contains_page(page))
++ set_page_private(page, 0);
+ }
++ map[offset] = NULL;
+
-+ if (!bio) {
-+ BUG_ON(operation != WRITE_BARRIER);
-+ bio = bio_alloc(GFP_KERNEL, 0);
-+ if (unlikely(bio == NULL))
-+ goto fail_put_bio;
++ request = tap->pending_requests[usr_idx];
++ kvaddr = request_to_kaddr(request, seg);
++ khandle = request->handles + seg;
+
-+ bio->bi_bdev = preq.bdev;
-+ bio->bi_private = pending_req;
-+ bio->bi_end_io = end_block_io_op;
-+ bio->bi_sector = -1;
++ if (khandle->kernel != INVALID_GRANT_HANDLE) {
++ gnttab_set_unmap_op(&unmap[count], kvaddr,
++ GNTMAP_host_map, khandle->kernel);
++ count++;
++
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++ INVALID_P2M_ENTRY);
+ }
+
-+ submit_bio(operation, bio);
+
-+ if (operation == READ)
-+ blkif->st_rd_sect += preq.nr_sects;
-+ else if (operation == WRITE || operation == WRITE_BARRIER)
-+ blkif->st_wr_sect += preq.nr_sects;
++ if (khandle->user != INVALID_GRANT_HANDLE) {
++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++
++ copy = *ptep;
++ gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep).maddr,
++ GNTMAP_host_map
++ | GNTMAP_application_map
++ | GNTMAP_contains_pte,
++ khandle->user);
++ count++;
++ } else
++ copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
++ is_fullmm);
++
++ if (count)
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++ unmap, count))
++ BUG();
++
++ khandle->kernel = INVALID_GRANT_HANDLE;
++ khandle->user = INVALID_GRANT_HANDLE;
+
-+ return;
++ return copy;
++}
+
-+ fail_flush:
-+ fast_flush_area(pending_req);
-+ fail_response:
-+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
-+ free_req(pending_req);
-+ msleep(1); /* back off a bit */
-+ return;
++static void
++blktap_ring_vm_unmap(struct vm_area_struct *vma)
++{
++ struct blktap *tap = vma_to_blktap(vma);
+
-+ fail_put_bio:
-+ __end_block_io_op(pending_req, -EINVAL);
-+ if (bio)
-+ bio_put(bio);
-+ unplug_queue(blkif);
-+ msleep(1); /* back off a bit */
-+ return;
++ down_write(&tap->tap_sem);
++ clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
++ clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++ clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++ up_write(&tap->tap_sem);
+}
+
++static void
++blktap_ring_vm_close(struct vm_area_struct *vma)
++{
++ struct blktap *tap = vma_to_blktap(vma);
++ struct blktap_ring *ring = &tap->ring;
+
++ blktap_ring_vm_unmap(vma); /* fail future requests */
++ blktap_device_fail_pending_requests(tap); /* fail pending requests */
++ blktap_device_restart(tap); /* fail deferred requests */
+
-+/******************************************************************
-+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
-+ */
-+
++ down_write(&tap->tap_sem);
+
-+static void make_response(blkif_t *blkif, u64 id,
-+ unsigned short op, int st)
-+{
-+ struct blkif_response resp;
-+ unsigned long flags;
-+ union blkif_back_rings *blk_rings = &blkif->blk_rings;
-+ int more_to_do = 0;
-+ int notify;
++ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
+
-+ resp.id = id;
-+ resp.operation = op;
-+ resp.status = st;
++ kfree(ring->foreign_map.map);
++ ring->foreign_map.map = NULL;
+
-+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
-+ /* Place on the response ring for the relevant domain. */
-+ switch (blkif->blk_protocol) {
-+ case BLKIF_PROTOCOL_NATIVE:
-+ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
-+ &resp, sizeof(resp));
-+ break;
-+ case BLKIF_PROTOCOL_X86_32:
-+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
-+ &resp, sizeof(resp));
-+ break;
-+ case BLKIF_PROTOCOL_X86_64:
-+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
-+ &resp, sizeof(resp));
-+ break;
-+ default:
-+ BUG();
-+ }
-+ blk_rings->common.rsp_prod_pvt++;
-+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
-+ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
-+ /*
-+ * Tail check for pending requests. Allows frontend to avoid
-+ * notifications if requests are already in flight (lower
-+ * overheads and promotes batching).
-+ */
-+ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
++ /* Free the ring page. */
++ ClearPageReserved(virt_to_page(ring->ring.sring));
++ free_page((unsigned long)ring->ring.sring);
+
-+ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
-+ more_to_do = 1;
-+ }
++ BTINFO("unmapping ring %d\n", tap->minor);
++ ring->ring.sring = NULL;
++ ring->vma = NULL;
+
-+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
++ up_write(&tap->tap_sem);
+
-+ if (more_to_do)
-+ blkif_notify_work(blkif);
-+ if (notify)
-+ notify_remote_via_irq(blkif->irq);
++ wake_up(&tap->wq);
+}
+
-+static int __init blkif_init(void)
++static struct vm_operations_struct blktap_ring_vm_operations = {
++ .close = blktap_ring_vm_close,
++ .unmap = blktap_ring_vm_unmap,
++ .fault = blktap_ring_fault,
++ .zap_pte = blktap_ring_clear_pte,
++};
++
++static int
++blktap_ring_open(struct inode *inode, struct file *filp)
+{
-+ int i, mmap_pages;
-+ int rc = 0;
++ int idx;
++ struct blktap *tap;
+
-+ if (!xen_pv_domain())
++ idx = iminor(inode);
++ if (idx < 0 || idx > MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) {
++ BTERR("unable to open device blktap%d\n", idx);
+ return -ENODEV;
-+
-+ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
-+
-+ pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
-+ blkif_reqs, GFP_KERNEL);
-+ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
-+ mmap_pages, GFP_KERNEL);
-+ pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
-+
-+ if (blkback_pagemap_init(mmap_pages))
-+ goto out_of_memory;
-+
-+ if (!pending_reqs || !pending_grant_handles || !pending_pages) {
-+ rc = -ENOMEM;
-+ goto out_of_memory;
+ }
+
-+ for (i = 0; i < mmap_pages; i++)
-+ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
++ tap = blktaps[idx];
+
-+ rc = blkif_interface_init();
-+ if (rc)
-+ goto failed_init;
++ BTINFO("opening device blktap%d\n", idx);
+
-+ memset(pending_reqs, 0, sizeof(pending_reqs));
-+ INIT_LIST_HEAD(&pending_free);
++ if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
++ return -ENODEV;
+
-+ for (i = 0; i < blkif_reqs; i++)
-+ list_add_tail(&pending_reqs[i].free_list, &pending_free);
++ /* Only one process can access ring at a time */
++ if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
++ return -EBUSY;
+
-+ rc = blkif_xenbus_init();
-+ if (rc)
-+ goto failed_init;
++ filp->private_data = tap;
++ BTINFO("opened device %d\n", tap->minor);
+
+ return 0;
-+
-+ out_of_memory:
-+ printk(KERN_ERR "%s: out of memory\n", __func__);
-+ failed_init:
-+ kfree(pending_reqs);
-+ kfree(pending_grant_handles);
-+ free_empty_pages_and_pagevec(pending_pages, mmap_pages);
-+ return rc;
+}
+
-+module_init(blkif_init);
++static int
++blktap_ring_release(struct inode *inode, struct file *filp)
++{
++ struct blktap *tap = filp->private_data;
+
-+MODULE_LICENSE("Dual BSD/GPL");
-diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h
-new file mode 100644
-index 0000000..af43d63
---- /dev/null
-+++ b/drivers/xen/blkback/common.h
-@@ -0,0 +1,139 @@
-+/*
-+ * This program is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU General Public License version 2
-+ * as published by the Free Software Foundation; or, when distributed
-+ * separately from the Linux kernel or incorporated into other
-+ * software packages, subject to the following license:
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-+ * of this source file (the "Software"), to deal in the Software without
-+ * restriction, including without limitation the rights to use, copy, modify,
-+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-+ * and to permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
++ BTINFO("freeing device %d\n", tap->minor);
++ clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
++ filp->private_data = NULL;
++ wake_up(&tap->wq);
++ return 0;
++}
++
++/* Note on mmap:
++ * We need to map pages to user space in a way that will allow the block
++ * subsystem set up direct IO to them. This couldn't be done before, because
++ * there isn't really a sane way to translate a user virtual address down to a
++ * physical address when the page belongs to another domain.
+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
++ * My first approach was to map the page in to kernel memory, add an entry
++ * for it in the physical frame list (using alloc_lomem_region as in blkback)
++ * and then attempt to map that page up to user space. This is disallowed
++ * by xen though, which realizes that we don't really own the machine frame
++ * underlying the physical page.
+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+ * IN THE SOFTWARE.
++ * The new approach is to provide explicit support for this in xen linux.
++ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
++ * mapped from other vms. vma->vm_private_data is set up as a mapping
++ * from pages to actual page structs. There is a new clause in get_user_pages
++ * that does the right thing for this sort of mapping.
+ */
++static int
++blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
++{
++ int size, err;
++ struct page **map;
++ struct blktap *tap;
++ struct blkif_sring *sring;
++ struct blktap_ring *ring;
+
-+#ifndef __BLKIF__BACKEND__COMMON_H__
-+#define __BLKIF__BACKEND__COMMON_H__
-+
-+#include <linux/version.h>
-+#include <linux/module.h>
-+#include <linux/interrupt.h>
-+#include <linux/slab.h>
-+#include <linux/blkdev.h>
-+#include <linux/vmalloc.h>
-+#include <linux/wait.h>
-+#include <asm/io.h>
-+#include <asm/setup.h>
-+#include <asm/pgalloc.h>
-+#include <asm/hypervisor.h>
-+#include <xen/blkif.h>
-+#include <xen/grant_table.h>
-+#include <xen/xenbus.h>
-+#include "blkback-pagemap.h"
++ tap = filp->private_data;
++ ring = &tap->ring;
++ map = NULL;
++ sring = NULL;
+
++ if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
++ return -ENOMEM;
+
-+#define DPRINTK(_f, _a...) \
-+ pr_debug("(file=%s, line=%d) " _f, \
-+ __FILE__ , __LINE__ , ## _a )
++ size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++ if (size != (MMAP_PAGES + RING_PAGES)) {
++ BTERR("you _must_ map exactly %lu pages!\n",
++ MMAP_PAGES + RING_PAGES);
++ return -EAGAIN;
++ }
+
-+struct vbd {
-+ blkif_vdev_t handle; /* what the domain refers to this vbd as */
-+ unsigned char readonly; /* Non-zero -> read-only */
-+ unsigned char type; /* VDISK_xxx */
-+ u32 pdevice; /* phys device that this vbd maps to */
-+ struct block_device *bdev;
-+};
++ /* Allocate the fe ring. */
++ sring = (struct blkif_sring *)get_zeroed_page(GFP_KERNEL);
++ if (!sring) {
++ BTERR("Couldn't alloc sring.\n");
++ goto fail_mem;
++ }
+
-+struct backend_info;
++ map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
++ if (!map) {
++ BTERR("Couldn't alloc VM_FOREIGN map.\n");
++ goto fail_mem;
++ }
+
-+typedef struct blkif_st {
-+ /* Unique identifier for this interface. */
-+ domid_t domid;
-+ unsigned int handle;
-+ /* Physical parameters of the comms window. */
-+ unsigned int irq;
-+ /* Comms information. */
-+ enum blkif_protocol blk_protocol;
-+ union blkif_back_rings blk_rings;
-+ struct vm_struct *blk_ring_area;
-+ /* The VBD attached to this interface. */
-+ struct vbd vbd;
-+ /* Back pointer to the backend_info. */
-+ struct backend_info *be;
-+ /* Private fields. */
-+ spinlock_t blk_ring_lock;
-+ atomic_t refcnt;
++ SetPageReserved(virt_to_page(sring));
++
++ SHARED_RING_INIT(sring);
++ FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
+
-+ wait_queue_head_t wq;
-+ struct task_struct *xenblkd;
-+ unsigned int waiting_reqs;
-+ struct request_queue *plug;
++ ring->ring_vstart = vma->vm_start;
++ ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);
+
-+ /* statistics */
-+ unsigned long st_print;
-+ int st_rd_req;
-+ int st_wr_req;
-+ int st_oo_req;
-+ int st_br_req;
-+ int st_rd_sect;
-+ int st_wr_sect;
++ /* Map the ring pages to the start of the region and reserve it. */
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ err = vm_insert_page(vma, vma->vm_start,
++ virt_to_page(ring->ring.sring));
++ else
++ err = remap_pfn_range(vma, vma->vm_start,
++ __pa(ring->ring.sring) >> PAGE_SHIFT,
++ PAGE_SIZE, vma->vm_page_prot);
++ if (err) {
++ BTERR("Mapping user ring failed: %d\n", err);
++ goto fail;
++ }
+
-+ wait_queue_head_t waiting_to_free;
++ /* Mark this VM as containing foreign pages, and set up mappings. */
++ ring->foreign_map.map = map;
++ vma->vm_private_data = &ring->foreign_map;
++ vma->vm_flags |= VM_FOREIGN;
++ vma->vm_flags |= VM_DONTCOPY;
++ vma->vm_flags |= VM_RESERVED;
++ vma->vm_ops = &blktap_ring_vm_operations;
+
-+ grant_handle_t shmem_handle;
-+ grant_ref_t shmem_ref;
-+} blkif_t;
++#ifdef CONFIG_X86
++ vma->vm_mm->context.has_foreign_mappings = 1;
++#endif
+
-+blkif_t *blkif_alloc(domid_t domid);
-+void blkif_disconnect(blkif_t *blkif);
-+void blkif_free(blkif_t *blkif);
-+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
++ tap->pid = current->pid;
++ BTINFO("blktap: mapping pid is %d\n", tap->pid);
+
-+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
-+#define blkif_put(_b) \
-+ do { \
-+ if (atomic_dec_and_test(&(_b)->refcnt)) \
-+ wake_up(&(_b)->waiting_to_free);\
-+ } while (0)
++ ring->vma = vma;
++ return 0;
+
-+/* Create a vbd. */
-+int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
-+ unsigned minor, int readonly, int cdrom);
-+void vbd_free(struct vbd *vbd);
++ fail:
++ /* Clear any active mappings. */
++ zap_page_range(vma, vma->vm_start,
++ vma->vm_end - vma->vm_start, NULL);
++ ClearPageReserved(virt_to_page(sring));
++ fail_mem:
++ free_page((unsigned long)sring);
++ kfree(map);
+
-+unsigned long long vbd_size(struct vbd *vbd);
-+unsigned int vbd_info(struct vbd *vbd);
-+unsigned long vbd_secsize(struct vbd *vbd);
++ return -ENOMEM;
++}
+
-+struct phys_req {
-+ unsigned short dev;
-+ unsigned short nr_sects;
-+ struct block_device *bdev;
-+ blkif_sector_t sector_number;
-+};
++static inline void
++blktap_ring_set_message(struct blktap *tap, int msg)
++{
++ struct blktap_ring *ring = &tap->ring;
+
-+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
++ down_read(&tap->tap_sem);
++ if (ring->ring.sring)
++ ring->ring.sring->pad[0] = msg;
++ up_read(&tap->tap_sem);
++}
+
-+int blkif_interface_init(void);
++static int
++blktap_ring_ioctl(struct inode *inode, struct file *filp,
++ unsigned int cmd, unsigned long arg)
++{
++ struct blktap_params params;
++ struct blktap *tap = filp->private_data;
+
-+int blkif_xenbus_init(void);
++ BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
+
-+irqreturn_t blkif_be_int(int irq, void *dev_id);
-+int blkif_schedule(void *arg);
++ switch(cmd) {
++ case BLKTAP2_IOCTL_KICK_FE:
++ /* There are fe messages to process. */
++ return blktap_read_ring(tap);
+
-+int blkback_barrier(struct xenbus_transaction xbt,
-+ struct backend_info *be, int state);
++ case BLKTAP2_IOCTL_CREATE_DEVICE:
++ if (!arg)
++ return -EINVAL;
+
-+#endif /* __BLKIF__BACKEND__COMMON_H__ */
-diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c
-new file mode 100644
-index 0000000..e397a41
---- /dev/null
-+++ b/drivers/xen/blkback/interface.c
-@@ -0,0 +1,186 @@
-+/******************************************************************************
-+ * arch/xen/drivers/blkif/backend/interface.c
-+ *
-+ * Block-device interface management.
-+ *
-+ * Copyright (c) 2004, Keir Fraser
-+ *
-+ * This program is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU General Public License version 2
-+ * as published by the Free Software Foundation; or, when distributed
-+ * separately from the Linux kernel or incorporated into other
-+ * software packages, subject to the following license:
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-+ * of this source file (the "Software"), to deal in the Software without
-+ * restriction, including without limitation the rights to use, copy, modify,
-+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-+ * and to permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+ * IN THE SOFTWARE.
-+ */
++ if (copy_from_user(&params, (struct blktap_params __user *)arg,
++ sizeof(params))) {
++ BTERR("failed to get params\n");
++ return -EFAULT;
++ }
+
-+#include "common.h"
-+#include <xen/events.h>
-+#include <xen/grant_table.h>
-+#include <linux/kthread.h>
++ if (blktap_validate_params(tap, &params)) {
++ BTERR("invalid params\n");
++ return -EINVAL;
++ }
+
-+static struct kmem_cache *blkif_cachep;
++ tap->params = params;
++ return blktap_device_create(tap);
+
-+blkif_t *blkif_alloc(domid_t domid)
-+{
-+ blkif_t *blkif;
++ case BLKTAP2_IOCTL_SET_PARAMS:
++ if (!arg)
++ return -EINVAL;
+
-+ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
-+ if (!blkif)
-+ return ERR_PTR(-ENOMEM);
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
+
-+ memset(blkif, 0, sizeof(*blkif));
-+ blkif->domid = domid;
-+ spin_lock_init(&blkif->blk_ring_lock);
-+ atomic_set(&blkif->refcnt, 1);
-+ init_waitqueue_head(&blkif->wq);
-+ blkif->st_print = jiffies;
-+ init_waitqueue_head(&blkif->waiting_to_free);
++ if (copy_from_user(&params, (struct blktap_params __user *)arg,
++ sizeof(params))) {
++ BTERR("failed to get params\n");
++ return -EFAULT;
++ }
+
-+ return blkif;
-+}
++ if (blktap_validate_params(tap, &params)) {
++ BTERR("invalid params\n");
++ return -EINVAL;
++ }
+
-+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
-+{
-+ struct gnttab_map_grant_ref op;
++ tap->params = params;
++ return 0;
+
-+ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
-+ GNTMAP_host_map, shared_page, blkif->domid);
++ case BLKTAP2_IOCTL_PAUSE:
++ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
++ return -EINVAL;
+
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
-+ BUG();
++ set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++ clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
+
-+ if (op.status) {
-+ DPRINTK(" Grant table operation failure !\n");
-+ return op.status;
-+ }
++ blktap_ring_set_message(tap, 0);
++ wake_up_interruptible(&tap->wq);
+
-+ blkif->shmem_ref = shared_page;
-+ blkif->shmem_handle = op.handle;
++ return 0;
+
-+ return 0;
-+}
+
-+static void unmap_frontend_page(blkif_t *blkif)
-+{
-+ struct gnttab_unmap_grant_ref op;
++ case BLKTAP2_IOCTL_REOPEN:
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
+
-+ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
-+ GNTMAP_host_map, blkif->shmem_handle);
++ if (!arg)
++ return -EINVAL;
+
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
-+ BUG();
-+}
++ if (copy_to_user((char __user *)arg,
++ tap->params.name,
++ strlen(tap->params.name) + 1))
++ return -EFAULT;
+
-+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
-+{
-+ int err;
++ blktap_ring_set_message(tap, 0);
++ wake_up_interruptible(&tap->wq);
+
-+ /* Already connected through? */
-+ if (blkif->irq)
+ return 0;
+
-+ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
-+ return -ENOMEM;
++ case BLKTAP2_IOCTL_RESUME:
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
+
-+ err = map_frontend_page(blkif, shared_page);
-+ if (err) {
-+ free_vm_area(blkif->blk_ring_area);
-+ return err;
-+ }
++ tap->ring.response = (int)arg;
++ if (!tap->ring.response)
++ clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
+
-+ switch (blkif->blk_protocol) {
-+ case BLKIF_PROTOCOL_NATIVE:
-+ {
-+ struct blkif_sring *sring;
-+ sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
-+ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
-+ break;
-+ }
-+ case BLKIF_PROTOCOL_X86_32:
-+ {
-+ struct blkif_x86_32_sring *sring_x86_32;
-+ sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
-+ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
-+ break;
-+ }
-+ case BLKIF_PROTOCOL_X86_64:
-+ {
-+ struct blkif_x86_64_sring *sring_x86_64;
-+ sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
-+ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
-+ break;
-+ }
-+ default:
-+ BUG();
-+ }
++ blktap_ring_set_message(tap, 0);
++ wake_up_interruptible(&tap->wq);
+
-+ err = bind_interdomain_evtchn_to_irqhandler(
-+ blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
-+ if (err < 0)
-+ {
-+ unmap_frontend_page(blkif);
-+ free_vm_area(blkif->blk_ring_area);
-+ blkif->blk_rings.common.sring = NULL;
-+ return err;
++ return 0;
+ }
-+ blkif->irq = err;
+
-+ return 0;
++ return -ENOIOCTLCMD;
+}
+
-+void blkif_disconnect(blkif_t *blkif)
++static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
+{
-+ if (blkif->xenblkd) {
-+ kthread_stop(blkif->xenblkd);
-+ blkif->xenblkd = NULL;
-+ }
-+
-+ atomic_dec(&blkif->refcnt);
-+ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
-+ atomic_inc(&blkif->refcnt);
++ struct blktap *tap = filp->private_data;
++ struct blktap_ring *ring = &tap->ring;
+
-+ if (blkif->irq) {
-+ unbind_from_irqhandler(blkif->irq, blkif);
-+ blkif->irq = 0;
++ poll_wait(filp, &ring->poll_wait, wait);
++ if (ring->ring.sring->pad[0] != 0 ||
++ ring->ring.req_prod_pvt != ring->ring.sring->req_prod) {
++ RING_PUSH_REQUESTS(&ring->ring);
++ return POLLIN | POLLRDNORM;
+ }
+
-+ if (blkif->blk_rings.common.sring) {
-+ unmap_frontend_page(blkif);
-+ free_vm_area(blkif->blk_ring_area);
-+ blkif->blk_rings.common.sring = NULL;
-+ }
++ return 0;
+}
+
-+void blkif_free(blkif_t *blkif)
++static struct file_operations blktap_ring_file_operations = {
++ .owner = THIS_MODULE,
++ .open = blktap_ring_open,
++ .release = blktap_ring_release,
++ .ioctl = blktap_ring_ioctl,
++ .mmap = blktap_ring_mmap,
++ .poll = blktap_ring_poll,
++};
++
++void
++blktap_ring_kick_user(struct blktap *tap)
+{
-+ if (!atomic_dec_and_test(&blkif->refcnt))
-+ BUG();
-+ kmem_cache_free(blkif_cachep, blkif);
++ wake_up_interruptible(&tap->ring.poll_wait);
+}
+
-+int __init blkif_interface_init(void)
++int
++blktap_ring_resume(struct blktap *tap)
+{
-+ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
-+ 0, 0, NULL);
-+ if (!blkif_cachep)
-+ return -ENOMEM;
++ int err;
++ struct blktap_ring *ring = &tap->ring;
+
-+ return 0;
-+}
-diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c
-new file mode 100644
-index 0000000..410c2ea
---- /dev/null
-+++ b/drivers/xen/blkback/vbd.c
-@@ -0,0 +1,118 @@
-+/******************************************************************************
-+ * blkback/vbd.c
-+ *
-+ * Routines for managing virtual block devices (VBDs).
-+ *
-+ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
-+ *
-+ * This program is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU General Public License version 2
-+ * as published by the Free Software Foundation; or, when distributed
-+ * separately from the Linux kernel or incorporated into other
-+ * software packages, subject to the following license:
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-+ * of this source file (the "Software"), to deal in the Software without
-+ * restriction, including without limitation the rights to use, copy, modify,
-+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-+ * and to permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+ * IN THE SOFTWARE.
-+ */
++ if (!blktap_active(tap))
++ return -ENODEV;
+
-+#include "common.h"
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EINVAL;
+
-+#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
-+ (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
++ /* set shared flag for resume */
++ ring->response = 0;
+
-+unsigned long long vbd_size(struct vbd *vbd)
-+{
-+ return vbd_sz(vbd);
-+}
++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
++ blktap_ring_kick_user(tap);
+
-+unsigned int vbd_info(struct vbd *vbd)
-+{
-+ return vbd->type | (vbd->readonly?VDISK_READONLY:0);
-+}
++ wait_event_interruptible(tap->wq, ring->response ||
++ !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
+
-+unsigned long vbd_secsize(struct vbd *vbd)
-+{
-+ return bdev_logical_block_size(vbd->bdev);
++ err = ring->response;
++ ring->response = 0;
++
++ BTDBG("err: %d\n", err);
++
++ if (err)
++ return err;
++
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EAGAIN;
++
++ return 0;
+}
+
-+int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
-+ unsigned minor, int readonly, int cdrom)
++int
++blktap_ring_pause(struct blktap *tap)
+{
-+ struct vbd *vbd;
-+ struct block_device *bdev;
++ if (!blktap_active(tap))
++ return -ENODEV;
+
-+ vbd = &blkif->vbd;
-+ vbd->handle = handle;
-+ vbd->readonly = readonly;
-+ vbd->type = 0;
++ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
++ return -EINVAL;
+
-+ vbd->pdevice = MKDEV(major, minor);
++ BTDBG("draining queue\n");
++ wait_event_interruptible(tap->wq, !tap->pending_cnt);
++ if (tap->pending_cnt)
++ return -EAGAIN;
+
-+ bdev = open_by_devnum(vbd->pdevice,
-+ vbd->readonly ? FMODE_READ : FMODE_WRITE);
++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
++ blktap_ring_kick_user(tap);
+
-+ if (IS_ERR(bdev)) {
-+ DPRINTK("vbd_creat: device %08x could not be opened.\n",
-+ vbd->pdevice);
-+ return -ENOENT;
-+ }
++ BTDBG("waiting for tapdisk response\n");
++ wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++ return -EAGAIN;
+
-+ vbd->bdev = bdev;
++ return 0;
++}
+
-+ if (vbd->bdev->bd_disk == NULL) {
-+ DPRINTK("vbd_creat: device %08x doesn't exist.\n",
-+ vbd->pdevice);
-+ vbd_free(vbd);
-+ return -ENOENT;
-+ }
++int
++blktap_ring_destroy(struct blktap *tap)
++{
++ if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) &&
++ !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
++ return 0;
+
-+ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
-+ vbd->type |= VDISK_CDROM;
-+ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
-+ vbd->type |= VDISK_REMOVABLE;
++ BTDBG("sending tapdisk close message\n");
++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
++ blktap_ring_kick_user(tap);
+
-+ DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
-+ handle, blkif->domid);
-+ return 0;
++ return -EAGAIN;
+}
+
-+void vbd_free(struct vbd *vbd)
++static void
++blktap_ring_initialize(struct blktap_ring *ring, int minor)
+{
-+ if (vbd->bdev)
-+ blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
-+ vbd->bdev = NULL;
++ memset(ring, 0, sizeof(*ring));
++ init_waitqueue_head(&ring->poll_wait);
++ ring->devno = MKDEV(blktap_ring_major, minor);
+}
+
-+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
++int
++blktap_ring_create(struct blktap *tap)
+{
-+ struct vbd *vbd = &blkif->vbd;
-+ int rc = -EACCES;
++ struct blktap_ring *ring = &tap->ring;
++ blktap_ring_initialize(ring, tap->minor);
++ return blktap_sysfs_create(tap);
++}
+
-+ if ((operation != READ) && vbd->readonly)
-+ goto out;
++int __init
++blktap_ring_init(int *major)
++{
++ int err;
+
-+ if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
-+ goto out;
++ err = register_chrdev(0, "blktap2", &blktap_ring_file_operations);
++ if (err < 0) {
++ BTERR("error registering blktap ring device: %d\n", err);
++ return err;
++ }
+
-+ req->dev = vbd->pdevice;
-+ req->bdev = vbd->bdev;
-+ rc = 0;
++ blktap_ring_major = *major = err;
++ BTINFO("blktap ring major: %d\n", blktap_ring_major);
++ return 0;
++}
+
-+ out:
-+ return rc;
++int
++blktap_ring_free(void)
++{
++ if (blktap_ring_major)
++ unregister_chrdev(blktap_ring_major, "blktap2");
++
++ return 0;
+}
-diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c
+diff --git a/drivers/xen/blktap/sysfs.c b/drivers/xen/blktap/sysfs.c
new file mode 100644
-index 0000000..34f8e40
+index 0000000..23a3a51
--- /dev/null
-+++ b/drivers/xen/blkback/xenbus.c
-@@ -0,0 +1,541 @@
-+/* Xenbus code for blkif backend
-+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
-+ Copyright (C) 2005 XenSource Ltd
-+
-+ This program is free software; you can redistribute it and/or modify
-+ it under the terms of the GNU General Public License as published by
-+ the Free Software Foundation; either version 2 of the License, or
-+ (at your option) any later version.
-+
-+ This program is distributed in the hope that it will be useful,
-+ but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ GNU General Public License for more details.
++++ b/drivers/xen/blktap/sysfs.c
+@@ -0,0 +1,451 @@
++#include <linux/types.h>
++#include <linux/device.h>
++#include <linux/module.h>
++#include <linux/sched.h>
+
-+ You should have received a copy of the GNU General Public License
-+ along with this program; if not, write to the Free Software
-+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-+*/
++#include "blktap.h"
+
-+#include <stdarg.h>
-+#include <linux/module.h>
-+#include <linux/kthread.h>
-+#include "common.h"
++int blktap_debug_level = 1;
+
-+#undef DPRINTK
-+#define DPRINTK(fmt, args...) \
-+ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \
-+ __FUNCTION__, __LINE__, ##args)
++static struct class *class;
++static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq);
+
-+struct backend_info
++static inline void
++blktap_sysfs_get(struct blktap *tap)
+{
-+ struct xenbus_device *dev;
-+ blkif_t *blkif;
-+ struct xenbus_watch backend_watch;
-+ unsigned major;
-+ unsigned minor;
-+ char *mode;
-+};
-+
-+static void connect(struct backend_info *);
-+static int connect_ring(struct backend_info *);
-+static void backend_changed(struct xenbus_watch *, const char **,
-+ unsigned int);
++ atomic_inc(&tap->ring.sysfs_refcnt);
++}
+
-+static int blkback_name(blkif_t *blkif, char *buf)
++static inline void
++blktap_sysfs_put(struct blktap *tap)
+{
-+ char *devpath, *devname;
-+ struct xenbus_device *dev = blkif->be->dev;
++ if (atomic_dec_and_test(&tap->ring.sysfs_refcnt))
++ wake_up(&sysfs_wq);
++}
+
-+ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
-+ if (IS_ERR(devpath))
-+ return PTR_ERR(devpath);
++static inline void
++blktap_sysfs_enter(struct blktap *tap)
++{
++ blktap_sysfs_get(tap); /* pin sysfs device */
++ mutex_lock(&tap->ring.sysfs_mutex); /* serialize sysfs operations */
++}
+
-+ if ((devname = strstr(devpath, "/dev/")) != NULL)
-+ devname += strlen("/dev/");
-+ else
-+ devname = devpath;
++static inline void
++blktap_sysfs_exit(struct blktap *tap)
++{
++ mutex_unlock(&tap->ring.sysfs_mutex);
++ blktap_sysfs_put(tap);
++}
+
-+ snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
-+ kfree(devpath);
++#define CLASS_DEVICE_ATTR(a,b,c,d) DEVICE_ATTR(a,b,c,d)
+
-+ return 0;
-+}
++static ssize_t blktap_sysfs_pause_device(struct device *, struct device_attribute *, const char *, size_t);
++CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
++static ssize_t blktap_sysfs_resume_device(struct device *, struct device_attribute *, const char *, size_t);
++CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
+
-+static void update_blkif_status(blkif_t *blkif)
++static ssize_t
++blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size)
+{
+ int err;
-+ char name[TASK_COMM_LEN];
-+
-+ /* Not ready to connect? */
-+ if (!blkif->irq || !blkif->vbd.bdev)
-+ return;
-+
-+ /* Already connected? */
-+ if (blkif->be->dev->state == XenbusStateConnected)
-+ return;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ /* Attempt to connect: exit if we fail to. */
-+ connect(blkif->be);
-+ if (blkif->be->dev->state != XenbusStateConnected)
-+ return;
++ blktap_sysfs_enter(tap);
+
-+ err = blkback_name(blkif, name);
-+ if (err) {
-+ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
-+ return;
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
+ }
+
-+ blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
-+ if (IS_ERR(blkif->xenblkd)) {
-+ err = PTR_ERR(blkif->xenblkd);
-+ blkif->xenblkd = NULL;
-+ xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = -EPERM;
++ goto out;
+ }
-+}
+
++ if (size > BLKTAP2_MAX_MESSAGE_LEN) {
++ err = -ENAMETOOLONG;
++ goto out;
++ }
+
-+/****************************************************************
-+ * sysfs interface for VBD I/O requests
-+ */
++ if (strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >= BLKTAP2_MAX_MESSAGE_LEN) {
++ err = -EINVAL;
++ goto out;
++ }
+
-+#define VBD_SHOW(name, format, args...) \
-+ static ssize_t show_##name(struct device *_dev, \
-+ struct device_attribute *attr, \
-+ char *buf) \
-+ { \
-+ struct xenbus_device *dev = to_xenbus_device(_dev); \
-+ struct backend_info *be = dev_get_drvdata(&dev->dev); \
-+ \
-+ return sprintf(buf, format, ##args); \
-+ } \
-+ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
++ snprintf(tap->params.name, sizeof(tap->params.name) - 1, "%s", buf);
++ err = size;
+
-+VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
-+VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
-+VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
-+VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req);
-+VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
-+VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
++out:
++ blktap_sysfs_exit(tap);
++ return err;
++}
+
-+static struct attribute *vbdstat_attrs[] = {
-+ &dev_attr_oo_req.attr,
-+ &dev_attr_rd_req.attr,
-+ &dev_attr_wr_req.attr,
-+ &dev_attr_br_req.attr,
-+ &dev_attr_rd_sect.attr,
-+ &dev_attr_wr_sect.attr,
-+ NULL
-+};
++static ssize_t
++blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf)
++{
++ ssize_t size;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+static struct attribute_group vbdstat_group = {
-+ .name = "statistics",
-+ .attrs = vbdstat_attrs,
-+};
++ blktap_sysfs_enter(tap);
+
-+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
-+VBD_SHOW(mode, "%s\n", be->mode);
++ if (!tap->ring.dev)
++ size = -ENODEV;
++ else if (tap->params.name[0])
++ size = sprintf(buf, "%s\n", tap->params.name);
++ else
++ size = sprintf(buf, "%d\n", tap->minor);
+
-+int xenvbd_sysfs_addif(struct xenbus_device *dev)
-+{
-+ int error;
++ blktap_sysfs_exit(tap);
+
-+ error = device_create_file(&dev->dev, &dev_attr_physical_device);
-+ if (error)
-+ goto fail1;
++ return size;
++}
++CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
++ blktap_sysfs_get_name, blktap_sysfs_set_name);
+
-+ error = device_create_file(&dev->dev, &dev_attr_mode);
-+ if (error)
-+ goto fail2;
++static ssize_t
++blktap_sysfs_remove_device(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
++{
++ int err;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
-+ if (error)
-+ goto fail3;
++ if (!tap->ring.dev)
++ return size;
+
-+ return 0;
++ if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++ return -EBUSY;
+
-+fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
-+fail2: device_remove_file(&dev->dev, &dev_attr_mode);
-+fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
-+ return error;
-+}
++ err = blktap_control_destroy_device(tap);
+
-+void xenvbd_sysfs_delif(struct xenbus_device *dev)
-+{
-+ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
-+ device_remove_file(&dev->dev, &dev_attr_mode);
-+ device_remove_file(&dev->dev, &dev_attr_physical_device);
++ return (err ? : size);
+}
++CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+
-+static int blkback_remove(struct xenbus_device *dev)
++static ssize_t
++blktap_sysfs_pause_device(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
+{
-+ struct backend_info *be = dev_get_drvdata(&dev->dev);
++ int err;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ DPRINTK("");
++ blktap_sysfs_enter(tap);
+
-+ if (be->major || be->minor)
-+ xenvbd_sysfs_delif(dev);
++ BTDBG("pausing %u:%u: dev_inuse: %lu\n",
++ MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse);
+
-+ if (be->backend_watch.node) {
-+ unregister_xenbus_watch(&be->backend_watch);
-+ kfree(be->backend_watch.node);
-+ be->backend_watch.node = NULL;
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
+ }
+
-+ if (be->blkif) {
-+ blkif_disconnect(be->blkif);
-+ vbd_free(&be->blkif->vbd);
-+ blkif_free(be->blkif);
-+ be->blkif = NULL;
++ if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++ err = -EBUSY;
++ goto out;
+ }
+
-+ kfree(be);
-+ dev_set_drvdata(&dev->dev, NULL);
-+ return 0;
-+}
++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = 0;
++ goto out;
++ }
+
-+int blkback_barrier(struct xenbus_transaction xbt,
-+ struct backend_info *be, int state)
-+{
-+ struct xenbus_device *dev = be->dev;
-+ int err;
++ err = blktap_device_pause(tap);
++ if (!err) {
++ device_remove_file(dev, &dev_attr_pause);
++ err = device_create_file(dev, &dev_attr_resume);
++ }
+
-+ err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
-+ "%d", state);
-+ if (err)
-+ xenbus_dev_fatal(dev, err, "writing feature-barrier");
++out:
++ blktap_sysfs_exit(tap);
+
-+ return err;
++ return (err ? err : size);
+}
+
-+/**
-+ * Entry point to this code when a new device is created. Allocate the basic
-+ * structures, and watch the store waiting for the hotplug scripts to tell us
-+ * the device's physical major and minor numbers. Switch to InitWait.
-+ */
-+static int blkback_probe(struct xenbus_device *dev,
-+ const struct xenbus_device_id *id)
++static ssize_t
++blktap_sysfs_resume_device(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
+{
+ int err;
-+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
-+ GFP_KERNEL);
-+ if (!be) {
-+ xenbus_dev_fatal(dev, -ENOMEM,
-+ "allocating backend structure");
-+ return -ENOMEM;
-+ }
-+ be->dev = dev;
-+ dev_set_drvdata(&dev->dev, be);
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ be->blkif = blkif_alloc(dev->otherend_id);
-+ if (IS_ERR(be->blkif)) {
-+ err = PTR_ERR(be->blkif);
-+ be->blkif = NULL;
-+ xenbus_dev_fatal(dev, err, "creating block interface");
-+ goto fail;
-+ }
++ blktap_sysfs_enter(tap);
+
-+ /* setup back pointer */
-+ be->blkif->be = be;
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
++ }
+
-+ err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
-+ "%s/%s", dev->nodename, "physical-device");
-+ if (err)
-+ goto fail;
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = -EINVAL;
++ goto out;
++ }
+
-+ err = xenbus_switch_state(dev, XenbusStateInitWait);
-+ if (err)
-+ goto fail;
++ err = blktap_device_resume(tap);
++ if (!err) {
++ device_remove_file(dev, &dev_attr_resume);
++ err = device_create_file(dev, &dev_attr_pause);
++ }
+
-+ return 0;
++out:
++ blktap_sysfs_exit(tap);
+
-+fail:
-+ DPRINTK("failed");
-+ blkback_remove(dev);
-+ return err;
++ BTDBG("returning %zd\n", (err ? err : size));
++ return (err ? err : size);
+}
+
-+
-+/**
-+ * Callback received when the hotplug scripts have placed the physical-device
-+ * node. Read it and the mode node, and create a vbd. If the frontend is
-+ * ready, connect.
-+ */
-+static void backend_changed(struct xenbus_watch *watch,
-+ const char **vec, unsigned int len)
++#ifdef ENABLE_PASSTHROUGH
++static ssize_t
++blktap_sysfs_enable_passthrough(struct device *dev,
++ const char *buf, size_t size)
+{
+ int err;
-+ unsigned major;
-+ unsigned minor;
-+ struct backend_info *be
-+ = container_of(watch, struct backend_info, backend_watch);
-+ struct xenbus_device *dev = be->dev;
-+ int cdrom = 0;
-+ char *device_type;
++ unsigned major, minor;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ DPRINTK("");
++ BTINFO("passthrough request enabled\n");
+
-+ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
-+ &major, &minor);
-+ if (XENBUS_EXIST_ERR(err)) {
-+ /* Since this watch will fire once immediately after it is
-+ registered, we expect this. Ignore it, and wait for the
-+ hotplug scripts. */
-+ return;
-+ }
-+ if (err != 2) {
-+ xenbus_dev_fatal(dev, err, "reading physical-device");
-+ return;
-+ }
++ blktap_sysfs_enter(tap);
+
-+ if ((be->major || be->minor) &&
-+ ((be->major != major) || (be->minor != minor))) {
-+ printk(KERN_WARNING
-+ "blkback: changing physical device (from %x:%x to "
-+ "%x:%x) not supported.\n", be->major, be->minor,
-+ major, minor);
-+ return;
++ if (!tap->ring.dev ||
++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++ err = -ENODEV;
++ goto out;
+ }
+
-+ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
-+ if (IS_ERR(be->mode)) {
-+ err = PTR_ERR(be->mode);
-+ be->mode = NULL;
-+ xenbus_dev_fatal(dev, err, "reading mode");
-+ return;
++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++ err = -EINVAL;
++ goto out;
+ }
+
-+ device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
-+ if (!IS_ERR(device_type)) {
-+ cdrom = strcmp(device_type, "cdrom") == 0;
-+ kfree(device_type);
++ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
++ err = -EINVAL;
++ goto out;
+ }
+
-+ if (be->major == 0 && be->minor == 0) {
-+ /* Front end dir is a number, which is used as the handle. */
-+
-+ char *p = strrchr(dev->otherend, '/') + 1;
-+ long handle = simple_strtoul(p, NULL, 0);
-+
-+ be->major = major;
-+ be->minor = minor;
-+
-+ err = vbd_create(be->blkif, handle, major, minor,
-+ (NULL == strchr(be->mode, 'w')), cdrom);
-+ if (err) {
-+ be->major = be->minor = 0;
-+ xenbus_dev_fatal(dev, err, "creating vbd structure");
-+ return;
-+ }
++ err = sscanf(buf, "%x:%x", &major, &minor);
++ if (err != 2) {
++ err = -EINVAL;
++ goto out;
++ }
+
-+ err = xenvbd_sysfs_addif(dev);
-+ if (err) {
-+ vbd_free(&be->blkif->vbd);
-+ be->major = be->minor = 0;
-+ xenbus_dev_fatal(dev, err, "creating sysfs entries");
-+ return;
-+ }
++ err = blktap_device_enable_passthrough(tap, major, minor);
+
-+ /* We're potentially connected now */
-+ update_blkif_status(be->blkif);
-+ }
++out:
++ blktap_sysfs_exit(tap);
++ BTDBG("returning %d\n", (err ? err : size));
++ return (err ? err : size);
+}
++#endif
+
-+
-+/**
-+ * Callback received when the frontend's state changes.
-+ */
-+static void frontend_changed(struct xenbus_device *dev,
-+ enum xenbus_state frontend_state)
++static ssize_t
++blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf)
+{
-+ struct backend_info *be = dev_get_drvdata(&dev->dev);
-+ int err;
-+
-+ DPRINTK("%s", xenbus_strstate(frontend_state));
++ char *tmp;
++ int i, ret;
++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
+
-+ switch (frontend_state) {
-+ case XenbusStateInitialising:
-+ if (dev->state == XenbusStateClosed) {
-+ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
-+ __FUNCTION__, dev->nodename);
-+ xenbus_switch_state(dev, XenbusStateInitWait);
-+ }
-+ break;
++ tmp = buf;
++ blktap_sysfs_get(tap);
+
-+ case XenbusStateInitialised:
-+ case XenbusStateConnected:
-+ /* Ensure we connect even when two watches fire in
-+ close successsion and we miss the intermediate value
-+ of frontend_state. */
-+ if (dev->state == XenbusStateConnected)
-+ break;
++ if (!tap->ring.dev) {
++ ret = sprintf(tmp, "no device\n");
++ goto out;
++ }
+
-+ err = connect_ring(be);
-+ if (err)
-+ break;
-+ update_blkif_status(be->blkif);
-+ break;
++ tmp += sprintf(tmp, "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n",
++ tap->params.name, MAJOR(tap->ring.devno),
++ MINOR(tap->ring.devno), atomic_read(&tap->refcnt),
++ tap->dev_inuse);
++ tmp += sprintf(tmp, "capacity: 0x%llx, sector size: 0x%lx, "
++ "device users: %d\n", tap->params.capacity,
++ tap->params.sector_size, tap->device.users);
+
-+ case XenbusStateClosing:
-+ blkif_disconnect(be->blkif);
-+ xenbus_switch_state(dev, XenbusStateClosing);
-+ break;
++ down_read(&tap->tap_sem);
+
-+ case XenbusStateClosed:
-+ xenbus_switch_state(dev, XenbusStateClosed);
-+ if (xenbus_dev_is_online(dev))
-+ break;
-+ /* fall through if not online */
-+ case XenbusStateUnknown:
-+ device_unregister(&dev->dev);
-+ break;
++ tmp += sprintf(tmp, "pending requests: %d\n", tap->pending_cnt);
++ for (i = 0; i < MAX_PENDING_REQS; i++) {
++ struct blktap_request *req = tap->pending_requests[i];
++ if (!req)
++ continue;
+
-+ default:
-+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
-+ frontend_state);
-+ break;
++ tmp += sprintf(tmp, "req %d: id: %llu, usr_idx: %d, "
++ "status: 0x%02x, pendcnt: %d, "
++ "nr_pages: %u, op: %d, time: %lu:%lu\n",
++ i, (unsigned long long)req->id, req->usr_idx,
++ req->status, atomic_read(&req->pendcnt),
++ req->nr_pages, req->operation, req->time.tv_sec,
++ req->time.tv_usec);
+ }
-+}
+
++ up_read(&tap->tap_sem);
++ ret = (tmp - buf) + 1;
+
-+/* ** Connection ** */
++out:
++ blktap_sysfs_put(tap);
++ BTDBG("%s\n", buf);
+
++ return ret;
++}
++CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
+
-+/**
-+ * Write the physical details regarding the block device to the store, and
-+ * switch to Connected state.
-+ */
-+static void connect(struct backend_info *be)
++int
++blktap_sysfs_create(struct blktap *tap)
+{
-+ struct xenbus_transaction xbt;
++ struct blktap_ring *ring;
++ struct device *dev;
+ int err;
-+ struct xenbus_device *dev = be->dev;
+
-+ DPRINTK("%s", dev->otherend);
++ if (!class)
++ return -ENODEV;
+
-+ /* Supply the information about the device the frontend needs */
-+again:
-+ err = xenbus_transaction_start(&xbt);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "starting transaction");
-+ return;
-+ }
++ ring = &tap->ring;
+
-+ err = blkback_barrier(xbt, be, 1);
-+ if (err)
-+ goto abort;
++ dev = device_create(class, NULL, ring->devno,
++ tap, "blktap%d", tap->minor);
++ if (IS_ERR(dev))
++ return PTR_ERR(dev);
+
-+ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
-+ vbd_size(&be->blkif->vbd));
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "writing %s/sectors",
-+ dev->nodename);
-+ goto abort;
-+ }
++ ring->dev = dev;
+
-+ /* FIXME: use a typename instead */
-+ err = xenbus_printf(xbt, dev->nodename, "info", "%u",
-+ vbd_info(&be->blkif->vbd));
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "writing %s/info",
-+ dev->nodename);
-+ goto abort;
-+ }
-+ err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
-+ vbd_secsize(&be->blkif->vbd));
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "writing %s/sector-size",
-+ dev->nodename);
-+ goto abort;
-+ }
++ mutex_init(&ring->sysfs_mutex);
++ atomic_set(&ring->sysfs_refcnt, 0);
+
-+ err = xenbus_transaction_end(xbt, 0);
-+ if (err == -EAGAIN)
-+ goto again;
-+ if (err)
-+ xenbus_dev_fatal(dev, err, "ending transaction");
+
-+ err = xenbus_switch_state(dev, XenbusStateConnected);
++ printk(KERN_CRIT "%s: adding attributes for dev %p\n", __func__, dev);
++ err = device_create_file(dev, &dev_attr_name);
+ if (err)
-+ xenbus_dev_fatal(dev, err, "switching to Connected state",
-+ dev->nodename);
++ goto out;
++ err = device_create_file(dev, &dev_attr_remove);
++ if (err)
++ goto out_unregister_name;
++ err = device_create_file(dev, &dev_attr_pause);
++ if (err)
++ goto out_unregister_remove;
++ err = device_create_file(dev, &dev_attr_debug);
++ if (err)
++ goto out_unregister_pause;
+
-+ return;
-+ abort:
-+ xenbus_transaction_end(xbt, 1);
-+}
++ return 0;
+
++out_unregister_pause:
++ device_remove_file(dev, &dev_attr_pause);
++out_unregister_remove:
++ device_remove_file(dev, &dev_attr_remove);
++out_unregister_name:
++ device_remove_file(dev, &dev_attr_name);
++out:
++ return err;
++}
+
-+static int connect_ring(struct backend_info *be)
++int
++blktap_sysfs_destroy(struct blktap *tap)
+{
-+ struct xenbus_device *dev = be->dev;
-+ unsigned long ring_ref;
-+ unsigned int evtchn;
-+ char protocol[64] = "";
-+ int err;
++ struct blktap_ring *ring;
++ struct device *dev;
+
-+ DPRINTK("%s", dev->otherend);
++ printk(KERN_CRIT "%s\n", __func__);
+
-+ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
-+ "event-channel", "%u", &evtchn, NULL);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err,
-+ "reading %s/ring-ref and event-channel",
-+ dev->otherend);
-+ return err;
-+ }
++ ring = &tap->ring;
++ dev = ring->dev;
++ if (!class || !dev)
++ return 0;
+
-+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
-+ err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
-+ "%63s", protocol, NULL);
-+ if (err)
-+ strcpy(protocol, "unspecified, assuming native");
-+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
-+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
-+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
-+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
-+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
-+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
-+ else {
-+ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
-+ return -1;
-+ }
-+ printk(KERN_INFO
-+ "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
-+ ring_ref, evtchn, be->blkif->blk_protocol, protocol);
++ ring->dev = NULL;
++ if (wait_event_interruptible(sysfs_wq,
++ !atomic_read(&tap->ring.sysfs_refcnt)))
++ return -EAGAIN;
+
-+ /* Map the shared frame, irq etc. */
-+ err = blkif_map(be->blkif, ring_ref, evtchn);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
-+ ring_ref, evtchn);
-+ return err;
-+ }
++ device_schedule_callback(dev, device_unregister);
+
+ return 0;
+}
+
++static ssize_t
++blktap_sysfs_show_verbosity(struct class *class, char *buf)
++{
++ return sprintf(buf, "%d\n", blktap_debug_level);
++}
+
-+/* ** Driver Registration ** */
-+
-+
-+static const struct xenbus_device_id blkback_ids[] = {
-+ { "vbd" },
-+ { "" }
-+};
-+
++static ssize_t
++blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size)
++{
++ int level;
+
-+static struct xenbus_driver blkback = {
-+ .name = "vbd",
-+ .owner = THIS_MODULE,
-+ .ids = blkback_ids,
-+ .probe = blkback_probe,
-+ .remove = blkback_remove,
-+ .otherend_changed = frontend_changed
-+};
++ if (sscanf(buf, "%d", &level) == 1) {
++ blktap_debug_level = level;
++ return size;
++ }
+
++ return -EINVAL;
++}
++CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR,
++ blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
+
-+int blkif_xenbus_init(void)
++static ssize_t
++blktap_sysfs_show_devices(struct class *class, char *buf)
+{
-+ return xenbus_register_backend(&blkback);
-+}
-diff --git a/drivers/xen/blktap/Makefile b/drivers/xen/blktap/Makefile
-new file mode 100644
-index 0000000..99ff53c
---- /dev/null
-+++ b/drivers/xen/blktap/Makefile
-@@ -0,0 +1,3 @@
-+obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
++ int i, ret;
++ struct blktap *tap;
+
-+blktap-objs := control.o ring.o wait_queue.o device.o request.o sysfs.o
-diff --git a/drivers/xen/blktap/blktap.h b/drivers/xen/blktap/blktap.h
-new file mode 100644
-index 0000000..db4cf02
---- /dev/null
-+++ b/drivers/xen/blktap/blktap.h
-@@ -0,0 +1,253 @@
-+#ifndef _BLKTAP_H_
-+#define _BLKTAP_H_
++ ret = 0;
++ for (i = 0; i < MAX_BLKTAP_DEVICE; i++) {
++ tap = blktaps[i];
++ if (!tap)
++ continue;
+
-+#include <linux/mm.h>
-+#include <linux/fs.h>
-+#include <linux/cdev.h>
-+#include <linux/init.h>
-+#include <linux/scatterlist.h>
-+#include <xen/blkif.h>
-+#include <xen/grant_table.h>
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ continue;
+
-+//#define ENABLE_PASSTHROUGH
++ ret += sprintf(buf + ret, "%d ", tap->minor);
++ ret += snprintf(buf + ret, sizeof(tap->params.name) - 1,
++ tap->params.name);
++ ret += sprintf(buf + ret, "\n");
++ }
+
-+extern int blktap_debug_level;
++ return ret;
++}
++CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL);
+
-+#define BTPRINTK(level, tag, force, _f, _a...) \
-+ do { \
-+ if (blktap_debug_level > level && \
-+ (force || printk_ratelimit())) \
-+ printk(tag "%s: " _f, __func__, ##_a); \
-+ } while (0)
++void
++blktap_sysfs_free(void)
++{
++ if (!class)
++ return;
+
-+#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
-+#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
-+#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
-+#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
++ class_remove_file(class, &class_attr_verbosity);
++ class_remove_file(class, &class_attr_devices);
+
-+#define MAX_BLKTAP_DEVICE 256
++ class_destroy(class);
++}
+
-+#define BLKTAP_CONTROL 1
-+#define BLKTAP_RING_FD 2
-+#define BLKTAP_RING_VMA 3
-+#define BLKTAP_DEVICE 4
-+#define BLKTAP_PAUSE_REQUESTED 6
-+#define BLKTAP_PAUSED 7
-+#define BLKTAP_SHUTDOWN_REQUESTED 8
-+#define BLKTAP_PASSTHROUGH 9
-+#define BLKTAP_DEFERRED 10
++int __init
++blktap_sysfs_init(void)
++{
++ struct class *cls;
++ int err;
+
-+/* blktap IOCTLs: */
-+#define BLKTAP2_IOCTL_KICK_FE 1
-+#define BLKTAP2_IOCTL_ALLOC_TAP 200
-+#define BLKTAP2_IOCTL_FREE_TAP 201
-+#define BLKTAP2_IOCTL_CREATE_DEVICE 202
-+#define BLKTAP2_IOCTL_SET_PARAMS 203
-+#define BLKTAP2_IOCTL_PAUSE 204
-+#define BLKTAP2_IOCTL_REOPEN 205
-+#define BLKTAP2_IOCTL_RESUME 206
++ if (class)
++ return -EEXIST;
+
-+#define BLKTAP2_MAX_MESSAGE_LEN 256
++ cls = class_create(THIS_MODULE, "blktap2");
++ if (IS_ERR(cls))
++ return PTR_ERR(cls);
+
-+#define BLKTAP2_RING_MESSAGE_PAUSE 1
-+#define BLKTAP2_RING_MESSAGE_RESUME 2
-+#define BLKTAP2_RING_MESSAGE_CLOSE 3
++ err = class_create_file(cls, &class_attr_verbosity);
++ if (err)
++ goto out_unregister;
++ err = class_create_file(cls, &class_attr_devices);
++ if (err)
++ goto out_unregister;
+
-+#define BLKTAP_REQUEST_FREE 0
-+#define BLKTAP_REQUEST_PENDING 1
++ class = cls;
++ return 0;
++out_unregister:
++ class_destroy(cls);
++ return err;
++}
+diff --git a/drivers/xen/blktap/wait_queue.c b/drivers/xen/blktap/wait_queue.c
+new file mode 100644
+index 0000000..f8995aa
+--- /dev/null
++++ b/drivers/xen/blktap/wait_queue.c
+@@ -0,0 +1,40 @@
++#include <linux/list.h>
++#include <linux/spinlock.h>
+
-+/*
-+ * The maximum number of requests that can be outstanding at any time
-+ * is determined by
-+ *
-+ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
-+ *
-+ * where mmap_alloc < MAX_DYNAMIC_MEM.
-+ *
-+ * TODO:
-+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
-+ * sysfs.
-+ */
-+#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
-+#define MAX_DYNAMIC_MEM BLK_RING_SIZE
-+#define MAX_PENDING_REQS BLK_RING_SIZE
-+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-+#define MMAP_VADDR(_start, _req, _seg) \
-+ (_start + \
-+ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
-+ ((_seg) * PAGE_SIZE))
++#include "blktap.h"
+
-+#define blktap_get(_b) (atomic_inc(&(_b)->refcnt))
-+#define blktap_put(_b) \
-+ do { \
-+ if (atomic_dec_and_test(&(_b)->refcnt)) \
-+ wake_up(&(_b)->wq); \
-+ } while (0)
++static LIST_HEAD(deferred_work_queue);
++static DEFINE_SPINLOCK(deferred_work_lock);
+
-+struct blktap;
++void
++blktap_run_deferred(void)
++{
++ LIST_HEAD(queue);
++ struct blktap *tap;
++ unsigned long flags;
+
-+struct grant_handle_pair {
-+ grant_handle_t kernel;
-+ grant_handle_t user;
-+};
-+#define INVALID_GRANT_HANDLE 0xFFFF
++ spin_lock_irqsave(&deferred_work_lock, flags);
++ list_splice_init(&deferred_work_queue, &queue);
++ list_for_each_entry(tap, &queue, deferred_queue)
++ clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
++ spin_unlock_irqrestore(&deferred_work_lock, flags);
+
-+struct blktap_handle {
-+ unsigned int ring;
-+ unsigned int device;
-+ unsigned int minor;
-+};
++ while (!list_empty(&queue)) {
++ tap = list_entry(queue.next, struct blktap, deferred_queue);
++ list_del_init(&tap->deferred_queue);
++ blktap_device_restart(tap);
++ }
++}
+
-+struct blktap_params {
-+ char name[BLKTAP2_MAX_MESSAGE_LEN];
-+ unsigned long long capacity;
-+ unsigned long sector_size;
-+};
++void
++blktap_defer(struct blktap *tap)
++{
++ unsigned long flags;
+
-+struct blktap_device {
-+ int users;
-+ spinlock_t lock;
-+ struct gendisk *gd;
++ spin_lock_irqsave(&deferred_work_lock, flags);
++ if (!test_bit(BLKTAP_DEFERRED, &tap->dev_inuse)) {
++ set_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
++ list_add_tail(&tap->deferred_queue, &deferred_work_queue);
++ }
++ spin_unlock_irqrestore(&deferred_work_lock, flags);
++}
+diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
+index bdfd584..6625ffe 100644
+--- a/drivers/xen/cpu_hotplug.c
++++ b/drivers/xen/cpu_hotplug.c
+@@ -1,5 +1,6 @@
+ #include <linux/notifier.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+
+ #include <asm/xen/hypervisor.h>
+diff --git a/drivers/xen/events.c b/drivers/xen/events.c
+index ce602dd..60b31e6 100644
+--- a/drivers/xen/events.c
++++ b/drivers/xen/events.c
+@@ -16,7 +16,7 @@
+ * (typically dom0).
+ * 2. VIRQs, typically used for timers. These are per-cpu events.
+ * 3. IPIs.
+- * 4. Hardware interrupts. Not supported at present.
++ * 4. PIRQs - Hardware interrupts.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+@@ -27,19 +27,27 @@
+ #include <linux/module.h>
+ #include <linux/string.h>
+ #include <linux/bootmem.h>
++#include <linux/irqnr.h>
++#include <linux/pci_regs.h>
++#include <linux/pci.h>
++#include <linux/msi.h>
+
+ #include <asm/ptrace.h>
+ #include <asm/irq.h>
+ #include <asm/idle.h>
++#include <asm/io_apic.h>
+ #include <asm/sync_bitops.h>
+ #include <asm/xen/hypercall.h>
+ #include <asm/xen/hypervisor.h>
++#include <asm/xen/pci.h>
+
+ #include <xen/xen-ops.h>
+ #include <xen/events.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/event_channel.h>
+
++#include "../pci/msi.h"
++
+ /*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+@@ -67,7 +75,7 @@ enum xen_irq_type {
+ * event channel - irq->event channel mapping
+ * cpu - cpu this event channel is bound to
+ * index - type-specific information:
+- * PIRQ - vector, with MSB being "needs EIO"
++ * PIRQ - with MSB being "needs EIO"
+ * VIRQ - virq number
+ * IPI - IPI vector
+ * EVTCHN -
+@@ -83,20 +91,27 @@ struct irq_info
+ enum ipi_vector ipi;
+ struct {
+ unsigned short gsi;
+- unsigned short vector;
++ unsigned char vector;
++ unsigned char flags;
++ uint16_t domid;
+ } pirq;
+ } u;
+ };
++#define PIRQ_NEEDS_EOI (1 << 0)
++#define PIRQ_SHAREABLE (1 << 1)
+
+-static struct irq_info irq_info[NR_IRQS];
++static struct irq_info *irq_info;
+
+-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+- [0 ... NR_EVENT_CHANNELS-1] = -1
+-};
++static int *evtchn_to_irq;
+ struct cpu_evtchn_s {
+ unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
+ };
+-static struct cpu_evtchn_s *cpu_evtchn_mask_p;
+
-+#ifdef ENABLE_PASSTHROUGH
-+ struct block_device *bdev;
-+#endif
++static __initdata struct cpu_evtchn_s init_evtchn_mask = {
++ .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
+};
++static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+
-+struct blktap_ring {
-+ struct vm_area_struct *vma;
-+ struct blkif_front_ring ring;
-+ struct vm_foreign_map foreign_map;
-+ unsigned long ring_vstart;
-+ unsigned long user_vstart;
-+
-+ int response;
+ static inline unsigned long *cpu_evtchn_mask(int cpu)
+ {
+ return cpu_evtchn_mask_p[cpu].bits;
+@@ -106,6 +121,7 @@ static inline unsigned long *cpu_evtchn_mask(int cpu)
+ #define VALID_EVTCHN(chn) ((chn) != 0)
+
+ static struct irq_chip xen_dynamic_chip;
++static struct irq_chip xen_pirq_chip;
+
+ /* Constructor for packed IRQ information. */
+ static struct irq_info mk_unbound_info(void)
+@@ -135,7 +151,8 @@ static struct irq_info mk_pirq_info(unsigned short evtchn,
+ unsigned short gsi, unsigned short vector)
+ {
+ return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
+- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
++ .cpu = 0, .u.pirq =
++ { .gsi = gsi, .vector = vector, .domid = DOMID_SELF } };
+ }
+
+ /*
+@@ -218,6 +235,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn)
+ return ret;
+ }
+
++static bool pirq_needs_eoi(unsigned irq)
++{
++ struct irq_info *info = info_for_irq(irq);
+
-+ wait_queue_head_t poll_wait;
++ BUG_ON(info->type != IRQT_PIRQ);
+
-+ dev_t devno;
-+ struct device *dev;
-+ atomic_t sysfs_refcnt;
-+ struct mutex sysfs_mutex;
-+};
++ return info->u.pirq.flags & PIRQ_NEEDS_EOI;
++}
+
-+struct blktap_statistics {
-+ unsigned long st_print;
-+ int st_rd_req;
-+ int st_wr_req;
-+ int st_oo_req;
-+ int st_rd_sect;
-+ int st_wr_sect;
-+ s64 st_rd_cnt;
-+ s64 st_rd_sum_usecs;
-+ s64 st_rd_max_usecs;
-+ s64 st_wr_cnt;
-+ s64 st_wr_sum_usecs;
-+ s64 st_wr_max_usecs;
-+};
+ static inline unsigned long active_evtchns(unsigned int cpu,
+ struct shared_info *sh,
+ unsigned int idx)
+@@ -329,17 +355,33 @@ static void unmask_evtchn(int port)
+ put_cpu();
+ }
+
++static int get_nr_hw_irqs(void)
++{
++ int ret = 1;
+
-+struct blktap_request {
-+ uint64_t id;
-+ uint16_t usr_idx;
++#ifdef CONFIG_X86_IO_APIC
++ ret = get_nr_irqs_gsi();
++#endif
+
-+ uint8_t status;
-+ atomic_t pendcnt;
-+ uint8_t nr_pages;
-+ unsigned short operation;
++ return ret;
++}
+
-+ struct timeval time;
-+ struct grant_handle_pair handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-+ struct list_head free_list;
-+};
+ static int find_unbound_irq(void)
+ {
+ int irq;
+ struct irq_desc *desc;
++ int start = get_nr_hw_irqs();
+
+- for (irq = 0; irq < nr_irqs; irq++)
++ if (start == nr_irqs)
++ goto no_irqs;
+
-+struct blktap {
-+ int minor;
-+ pid_t pid;
-+ atomic_t refcnt;
-+ unsigned long dev_inuse;
++ /* nr_irqs is a magic value. Must not use it.*/
++ for (irq = nr_irqs-1; irq > start; irq--)
+ if (irq_info[irq].type == IRQT_UNBOUND)
+ break;
+
+- if (irq == nr_irqs)
+- panic("No available IRQ to bind to: increase nr_irqs!\n");
++ if (irq == start)
++ goto no_irqs;
+
+ desc = irq_to_desc_alloc_node(irq, 0);
+ if (WARN_ON(desc == NULL))
+@@ -348,8 +390,324 @@ static int find_unbound_irq(void)
+ dynamic_irq_init(irq);
+
+ return irq;
+
-+ struct blktap_params params;
++no_irqs:
++ panic("No available IRQ to bind to: increase nr_irqs!\n");
++}
+
-+ struct rw_semaphore tap_sem;
++static bool identity_mapped_irq(unsigned irq)
++{
++ /* identity map all the hardware irqs */
++ return irq < get_nr_hw_irqs();
++}
+
-+ struct blktap_ring ring;
-+ struct blktap_device device;
++static void pirq_unmask_notify(int irq)
++{
++ struct irq_info *info = info_for_irq(irq);
++ struct physdev_eoi eoi = { .irq = info->u.pirq.gsi };
+
-+ int pending_cnt;
-+ struct blktap_request *pending_requests[MAX_PENDING_REQS];
-+ struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ if (unlikely(pirq_needs_eoi(irq))) {
++ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
++ WARN_ON(rc);
++ }
++}
+
-+ wait_queue_head_t wq;
-+ struct list_head deferred_queue;
++static void pirq_query_unmask(int irq)
++{
++ struct physdev_irq_status_query irq_status;
++ struct irq_info *info = info_for_irq(irq);
+
-+ struct blktap_statistics stats;
-+};
++ BUG_ON(info->type != IRQT_PIRQ);
+
-+extern struct blktap *blktaps[MAX_BLKTAP_DEVICE];
++ irq_status.irq = info->u.pirq.gsi;
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
++ irq_status.flags = 0;
+
-+static inline int
-+blktap_active(struct blktap *tap)
++ info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
++ if (irq_status.flags & XENIRQSTAT_needs_eoi)
++ info->u.pirq.flags |= PIRQ_NEEDS_EOI;
+ }
+
++static bool probing_irq(int irq)
+{
-+ return test_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
-+}
++ struct irq_desc *desc = irq_to_desc(irq);
+
-+static inline int
-+blktap_validate_params(struct blktap *tap, struct blktap_params *params)
-+{
-+ /* TODO: sanity check */
-+ params->name[sizeof(params->name) - 1] = '\0';
-+ BTINFO("%s: capacity: %llu, sector-size: %lu\n",
-+ params->name, params->capacity, params->sector_size);
-+ return 0;
++ return desc && desc->action == NULL;
+}
+
-+int blktap_control_destroy_device(struct blktap *);
-+
-+int blktap_ring_init(int *);
-+int blktap_ring_free(void);
-+int blktap_ring_create(struct blktap *);
-+int blktap_ring_destroy(struct blktap *);
-+int blktap_ring_pause(struct blktap *);
-+int blktap_ring_resume(struct blktap *);
-+void blktap_ring_kick_user(struct blktap *);
++static unsigned int startup_pirq(unsigned int irq)
++{
++ struct evtchn_bind_pirq bind_pirq;
++ struct irq_info *info = info_for_irq(irq);
++ int evtchn = evtchn_from_irq(irq);
++ int rc;
+
-+int blktap_sysfs_init(void);
-+void blktap_sysfs_free(void);
-+int blktap_sysfs_create(struct blktap *);
-+int blktap_sysfs_destroy(struct blktap *);
++ BUG_ON(info->type != IRQT_PIRQ);
+
-+int blktap_device_init(int *);
-+void blktap_device_free(void);
-+int blktap_device_create(struct blktap *);
-+int blktap_device_destroy(struct blktap *);
-+int blktap_device_pause(struct blktap *);
-+int blktap_device_resume(struct blktap *);
-+void blktap_device_restart(struct blktap *);
-+void blktap_device_finish_request(struct blktap *,
-+ struct blkif_response *,
-+ struct blktap_request *);
-+void blktap_device_fail_pending_requests(struct blktap *);
-+#ifdef ENABLE_PASSTHROUGH
-+int blktap_device_enable_passthrough(struct blktap *,
-+ unsigned, unsigned);
-+#endif
++ if (VALID_EVTCHN(evtchn))
++ goto out;
+
-+void blktap_defer(struct blktap *);
-+void blktap_run_deferred(void);
++ bind_pirq.pirq = info->u.pirq.gsi;
++ /* NB. We are happy to share unless we are probing. */
++ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
++ BIND_PIRQ__WILL_SHARE : 0;
++ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
++ if (rc != 0) {
++ if (!probing_irq(irq))
++ printk(KERN_INFO "Failed to obtain physical IRQ %d" \
++ " (GSI:%d)\n", irq, info->u.pirq.gsi);
++ return 0;
++ }
++ evtchn = bind_pirq.port;
+
-+int blktap_request_pool_init(void);
-+void blktap_request_pool_free(void);
-+int blktap_request_pool_grow(void);
-+int blktap_request_pool_shrink(void);
-+struct blktap_request *blktap_request_allocate(struct blktap *);
-+void blktap_request_free(struct blktap *, struct blktap_request *);
-+struct page *request_to_page(struct blktap_request *, int);
++ pirq_query_unmask(irq);
+
-+static inline unsigned long
-+request_to_kaddr(struct blktap_request *req, int seg)
-+{
-+ unsigned long pfn = page_to_pfn(request_to_page(req, seg));
-+ return (unsigned long)pfn_to_kaddr(pfn);
-+}
++ evtchn_to_irq[evtchn] = irq;
++ bind_evtchn_to_cpu(evtchn, 0);
++ info->evtchn = evtchn;
+
-+#endif
-diff --git a/drivers/xen/blktap/control.c b/drivers/xen/blktap/control.c
-new file mode 100644
-index 0000000..a4852f7
---- /dev/null
-+++ b/drivers/xen/blktap/control.c
-@@ -0,0 +1,284 @@
-+#include <linux/module.h>
-+#include <linux/sched.h>
-+#include <linux/miscdevice.h>
++ out:
++ unmask_evtchn(evtchn);
++ pirq_unmask_notify(irq);
+
-+#include <asm/uaccess.h>
++ return 0;
++}
+
-+#include "blktap.h"
++static void shutdown_pirq(unsigned int irq)
++{
++ struct evtchn_close close;
++ struct irq_info *info = info_for_irq(irq);
++ int evtchn = evtchn_from_irq(irq);
+
-+static DEFINE_SPINLOCK(blktap_control_lock);
-+struct blktap *blktaps[MAX_BLKTAP_DEVICE];
++ BUG_ON(info->type != IRQT_PIRQ);
+
-+static int ring_major;
-+static int device_major;
-+static int blktap_control_registered;
++ if (!VALID_EVTCHN(evtchn))
++ return;
+
-+static void
-+blktap_control_initialize_tap(struct blktap *tap)
-+{
-+ int minor = tap->minor;
++ mask_evtchn(evtchn);
+
-+ memset(tap, 0, sizeof(*tap));
-+ set_bit(BLKTAP_CONTROL, &tap->dev_inuse);
-+ init_rwsem(&tap->tap_sem);
-+ init_waitqueue_head(&tap->wq);
-+ atomic_set(&tap->refcnt, 0);
-+ sg_init_table(tap->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++ close.port = evtchn;
++ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
++ BUG();
+
-+ tap->minor = minor;
++ bind_evtchn_to_cpu(evtchn, 0);
++ evtchn_to_irq[evtchn] = -1;
++ info->evtchn = 0;
+}
+
-+static struct blktap *
-+blktap_control_create_tap(void)
++static void enable_pirq(unsigned int irq)
+{
-+ int minor;
-+ struct blktap *tap;
++ startup_pirq(irq);
++}
+
-+ tap = kmalloc(sizeof(*tap), GFP_KERNEL);
-+ if (unlikely(!tap))
-+ return NULL;
++static void disable_pirq(unsigned int irq)
++{
++}
+
-+ blktap_control_initialize_tap(tap);
++static void ack_pirq(unsigned int irq)
++{
++ int evtchn = evtchn_from_irq(irq);
+
-+ spin_lock_irq(&blktap_control_lock);
-+ for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++)
-+ if (!blktaps[minor])
-+ break;
++ move_native_irq(irq);
+
-+ if (minor == MAX_BLKTAP_DEVICE) {
-+ kfree(tap);
-+ tap = NULL;
-+ goto out;
++ if (VALID_EVTCHN(evtchn)) {
++ mask_evtchn(evtchn);
++ clear_evtchn(evtchn);
+ }
-+
-+ tap->minor = minor;
-+ blktaps[minor] = tap;
-+
-+out:
-+ spin_unlock_irq(&blktap_control_lock);
-+ return tap;
+}
+
-+static struct blktap *
-+blktap_control_allocate_tap(void)
++static void end_pirq(unsigned int irq)
+{
-+ int err, minor;
-+ struct blktap *tap;
-+
-+ /*
-+ * This is called only from the ioctl, which
-+ * means we should always have interrupts enabled.
-+ */
-+ BUG_ON(irqs_disabled());
-+
-+ spin_lock_irq(&blktap_control_lock);
++ int evtchn = evtchn_from_irq(irq);
++ struct irq_desc *desc = irq_to_desc(irq);
+
-+ for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) {
-+ tap = blktaps[minor];
-+ if (!tap)
-+ goto found;
++ if (WARN_ON(!desc))
++ return;
+
-+ if (!tap->dev_inuse) {
-+ blktap_control_initialize_tap(tap);
-+ goto found;
-+ }
++ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
++ (IRQ_DISABLED|IRQ_PENDING)) {
++ shutdown_pirq(irq);
++ } else if (VALID_EVTCHN(evtchn)) {
++ unmask_evtchn(evtchn);
++ pirq_unmask_notify(irq);
+ }
++}
+
-+ tap = NULL;
++static int find_irq_by_gsi(unsigned gsi)
++{
++ int irq;
+
-+found:
-+ spin_unlock_irq(&blktap_control_lock);
++ for (irq = 0; irq < nr_irqs; irq++) {
++ struct irq_info *info = info_for_irq(irq);
+
-+ if (!tap) {
-+ tap = blktap_control_create_tap();
-+ if (!tap)
-+ return NULL;
-+ }
++ if (info == NULL || info->type != IRQT_PIRQ)
++ continue;
+
-+ err = blktap_ring_create(tap);
-+ if (err) {
-+ BTERR("ring creation failed: %d\n", err);
-+ clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
-+ return NULL;
++ if (gsi_from_irq(irq) == gsi)
++ return irq;
+ }
+
-+ BTINFO("allocated tap %p\n", tap);
-+ return tap;
++ return -1;
+}
+
-+static int
-+blktap_control_ioctl(struct inode *inode, struct file *filp,
-+ unsigned int cmd, unsigned long arg)
++/*
++ * Allocate a physical irq, along with a vector. We don't assign an
++ * event channel until the irq actually started up. Return an
++ * existing irq if we've already got one for the gsi.
++ */
++int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
+{
-+ unsigned long dev;
-+ struct blktap *tap;
++ int irq;
++ struct physdev_irq irq_op;
+
-+ switch (cmd) {
-+ case BLKTAP2_IOCTL_ALLOC_TAP: {
-+ struct blktap_handle h;
++ spin_lock(&irq_mapping_update_lock);
+
-+ tap = blktap_control_allocate_tap();
-+ if (!tap) {
-+ BTERR("error allocating device\n");
-+ return -ENOMEM;
-+ }
++ irq = find_irq_by_gsi(gsi);
++ if (irq != -1) {
++ printk(KERN_INFO "xen_allocate_pirq: returning irq %d for gsi %u\n",
++ irq, gsi);
++ goto out; /* XXX need refcount? */
++ }
+
-+ h.ring = ring_major;
-+ h.device = device_major;
-+ h.minor = tap->minor;
++ /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
++ * we are using the !xen_initial_domain() to drop in the function.*/
++ if (identity_mapped_irq(gsi) || !xen_initial_domain()) {
++ irq = gsi;
++ irq_to_desc_alloc_node(irq, 0);
++ dynamic_irq_init(irq);
++ } else
++ irq = find_unbound_irq();
+
-+ if (copy_to_user((struct blktap_handle __user *)arg,
-+ &h, sizeof(h))) {
-+ blktap_control_destroy_device(tap);
-+ return -EFAULT;
-+ }
++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
++ handle_level_irq, name);
+
-+ return 0;
-+ }
++ irq_op.irq = gsi;
++ irq_op.vector = 0;
+
-+ case BLKTAP2_IOCTL_FREE_TAP:
-+ dev = arg;
++ /* Only the privileged domain can do this. For non-priv, the pcifront
++ * driver provides a PCI bus that does the call to do exactly
++ * this in the priv domain. */
++ if (xen_initial_domain() &&
++ HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
++ dynamic_irq_cleanup(irq);
++ irq = -ENOSPC;
++ goto out;
++ }
+
-+ if (dev > MAX_BLKTAP_DEVICE || !blktaps[dev])
-+ return -EINVAL;
++ irq_info[irq] = mk_pirq_info(0, gsi, irq_op.vector);
++ irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
+
-+ blktap_control_destroy_device(blktaps[dev]);
-+ return 0;
-+ }
++out:
++ spin_unlock(&irq_mapping_update_lock);
+
-+ return -ENOIOCTLCMD;
++ return irq;
+}
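xen_allocate_pirq() above only reserves the Xen side of a hardware interrupt (the pirq/vector and the xen_pirq_chip wiring); the caller still attaches an ordinary Linux handler to the irq number it returns. A hedged sketch of that call pattern, where the handler and the "demo" names are invented and only xen_allocate_pirq() and request_irq() are real:

#include <linux/interrupt.h>

/* Prototype as introduced in the hunk above. */
extern int xen_allocate_pirq(unsigned gsi, int shareable, char *name);

static irqreturn_t demo_gsi_handler(int irq, void *dev_id)
{
	/* device-specific work would go here */
	return IRQ_HANDLED;
}

static int demo_attach_gsi(unsigned gsi)
{
	int irq = xen_allocate_pirq(gsi, 1 /* shareable */, "demo-gsi");

	if (irq < 0)
		return irq;

	return request_irq(irq, demo_gsi_handler, 0, "demo-gsi", NULL);
}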
+
-+static struct file_operations blktap_control_file_operations = {
-+ .owner = THIS_MODULE,
-+ .ioctl = blktap_control_ioctl,
-+};
++#ifdef CONFIG_PCI_MSI
++int xen_destroy_irq(int irq)
++{
++ struct irq_desc *desc;
++ struct physdev_unmap_pirq unmap_irq;
++ struct irq_info *info = info_for_irq(irq);
++ int rc = -ENOENT;
+
-+static struct miscdevice blktap_misc = {
-+ .minor = MISC_DYNAMIC_MINOR,
-+ .name = "blktap-control",
-+ .fops = &blktap_control_file_operations,
-+};
++ spin_lock(&irq_mapping_update_lock);
+
-+int
-+blktap_control_destroy_device(struct blktap *tap)
-+{
-+ int err;
-+ unsigned long inuse;
++ desc = irq_to_desc(irq);
++ if (!desc)
++ goto out;
+
-+ if (!tap)
-+ return 0;
++ if (xen_initial_domain()) {
++ unmap_irq.pirq = info->u.pirq.gsi;
++ unmap_irq.domid = info->u.pirq.domid;
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
++ if (rc) {
++ printk(KERN_WARNING "unmap irq failed %d\n", rc);
++ goto out;
++ }
++ }
++ irq_info[irq] = mk_unbound_info();
+
-+ set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
++ dynamic_irq_cleanup(irq);
+
-+ for (;;) {
-+ inuse = tap->dev_inuse;
-+ err = blktap_device_destroy(tap);
-+ if (err)
-+ goto wait;
++out:
++ spin_unlock(&irq_mapping_update_lock);
++ return rc;
++}
+
-+ inuse = tap->dev_inuse;
-+ err = blktap_ring_destroy(tap);
-+ if (err)
-+ goto wait;
++int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
++{
++ int irq = 0;
++ struct physdev_map_pirq map_irq;
++ int rc;
++ domid_t domid;
++ int pos;
++ u32 table_offset, bir;
+
-+ inuse = tap->dev_inuse;
-+ err = blktap_sysfs_destroy(tap);
-+ if (err)
-+ goto wait;
++ domid = rc = xen_find_device_domain_owner(dev);
++ if (rc < 0)
++ domid = DOMID_SELF;
++
++ memset(&map_irq, 0, sizeof(map_irq));
++ map_irq.domid = domid;
++ map_irq.type = MAP_PIRQ_TYPE_MSI;
++ map_irq.index = -1;
++ map_irq.pirq = -1;
++ map_irq.bus = dev->bus->number;
++ map_irq.devfn = dev->devfn;
++
++ if (type == PCI_CAP_ID_MSIX) {
++ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+
-+ break;
++ pci_read_config_dword(dev, msix_table_offset_reg(pos),
++ &table_offset);
++ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+
-+ wait:
-+ BTDBG("inuse: 0x%lx, dev_inuse: 0x%lx\n",
-+ inuse, tap->dev_inuse);
-+ if (wait_event_interruptible(tap->wq, tap->dev_inuse != inuse))
-+ break;
++ map_irq.table_base = pci_resource_start(dev, bir);
++ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+ }
+
-+ clear_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
++ spin_lock(&irq_mapping_update_lock);
+
-+ if (tap->dev_inuse == (1UL << BLKTAP_CONTROL)) {
-+ err = 0;
-+ clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
-+ }
++ irq = find_unbound_irq();
+
-+ return err;
-+}
++ if (irq == -1)
++ goto out;
+
-+static int __init
-+blktap_control_init(void)
-+{
-+ int err;
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
++ if (rc) {
++ printk(KERN_WARNING "xen map irq failed %d\n", rc);
+
-+ err = misc_register(&blktap_misc);
-+ if (err) {
-+ BTERR("misc_register failed for control device");
-+ return err;
++ dynamic_irq_cleanup(irq);
++
++ irq = -1;
++ goto out;
+ }
++ irq_info[irq] = mk_pirq_info(0, map_irq.pirq, map_irq.index);
++ if (domid)
++ irq_info[irq].u.pirq.domid = domid;
+
-+ blktap_control_registered = 1;
-+ return 0;
++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
++ handle_level_irq,
++ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
++
++out:
++ spin_unlock(&irq_mapping_update_lock);
++ return irq;
+}
++#endif
+
-+static void
-+blktap_control_free(void)
++int xen_vector_from_irq(unsigned irq)
+{
-+ int i;
-+
-+ for (i = 0; i < MAX_BLKTAP_DEVICE; i++)
-+ blktap_control_destroy_device(blktaps[i]);
-+
-+ if (blktap_control_registered)
-+ if (misc_deregister(&blktap_misc) < 0)
-+ BTERR("misc_deregister failed for control device");
++ return vector_from_irq(irq);
+}
+
-+static void
-+blktap_exit(void)
++int xen_gsi_from_irq(unsigned irq)
+{
-+ blktap_control_free();
-+ blktap_ring_free();
-+ blktap_sysfs_free();
-+ blktap_device_free();
-+ blktap_request_pool_free();
++ return gsi_from_irq(irq);
+}
++EXPORT_SYMBOL_GPL(xen_gsi_from_irq);
+
-+static int __init
-+blktap_init(void)
+ int bind_evtchn_to_irq(unsigned int evtchn)
+ {
+ int irq;
+@@ -409,8 +767,23 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+ return irq;
+ }
+
++static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
++ unsigned int remote_port)
+{
-+ int err;
-+
-+ if (!xen_domain())
-+ return -ENODEV;
++ struct evtchn_bind_interdomain bind_interdomain;
++ int err;
+
-+ err = blktap_request_pool_init();
-+ if (err)
-+ return err;
++ bind_interdomain.remote_dom = remote_domain;
++ bind_interdomain.remote_port = remote_port;
+
+-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
++ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
++ &bind_interdomain);
+
-+ err = blktap_device_init(&device_major);
-+ if (err)
-+ goto fail;
++ return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
++}
+
-+ err = blktap_ring_init(&ring_major);
-+ if (err)
-+ goto fail;
+
-+ err = blktap_sysfs_init();
-+ if (err)
-+ goto fail;
++int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+ {
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq;
+@@ -504,6 +877,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ }
+ EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
++int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
++ unsigned int remote_port,
++ irq_handler_t handler,
++ unsigned long irqflags,
++ const char *devname,
++ void *dev_id)
++{
++ int irq, retval;
+
-+ err = blktap_control_init();
-+ if (err)
-+ goto fail;
++ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
++ if (irq < 0)
++ return irq;
+
-+ return 0;
++ retval = request_irq(irq, handler, irqflags, devname, dev_id);
++ if (retval != 0) {
++ unbind_from_irq(irq);
++ return retval;
++ }
+
-+fail:
-+ blktap_exit();
-+ return err;
++ return irq;
+}
++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
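bind_interdomain_evtchn_to_irqhandler() gives backend drivers a one-call way to turn a (remote domain, remote event-channel port) pair into a bound local irq with a handler installed, mirroring bind_evtchn_to_irqhandler() above. A usage sketch with made-up names; in practice the remote domid and port are read from xenstore during the backend/frontend handshake:

#include <linux/interrupt.h>

/* Prototypes as added/used in this file. */
extern int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
						 unsigned int remote_port,
						 irq_handler_t handler,
						 unsigned long irqflags,
						 const char *devname,
						 void *dev_id);
extern void unbind_from_irqhandler(unsigned int irq, void *dev_id);

static irqreturn_t demo_backend_interrupt(int irq, void *dev_id)
{
	/* kick request processing for this frontend */
	return IRQ_HANDLED;
}

/* Returns the bound irq (>= 0) or a negative errno. */
static int demo_backend_connect(unsigned int otherend_id,
				unsigned int remote_port, void *dev_id)
{
	int irq = bind_interdomain_evtchn_to_irqhandler(otherend_id,
							remote_port,
							demo_backend_interrupt,
							0, "demo-backend",
							dev_id);
	if (irq < 0)
		return irq;

	/* tear down later with unbind_from_irqhandler(irq, dev_id) */
	return irq;
}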
+
-+module_init(blktap_init);
-+module_exit(blktap_exit);
-+MODULE_LICENSE("Dual BSD/GPL");
-diff --git a/drivers/xen/blktap/device.c b/drivers/xen/blktap/device.c
-new file mode 100644
-index 0000000..a50b622
---- /dev/null
-+++ b/drivers/xen/blktap/device.c
-@@ -0,0 +1,1138 @@
-+#include <linux/version.h> /* XXX Remove uses of VERSION instead. */
-+#include <linux/fs.h>
-+#include <linux/blkdev.h>
-+#include <linux/cdrom.h>
-+#include <linux/hdreg.h>
-+#include <linux/module.h>
-+#include <asm/tlbflush.h>
-+
-+#include <scsi/scsi.h>
-+#include <scsi/scsi_ioctl.h>
+ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname, void *dev_id)
+@@ -649,9 +1045,13 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + bit_idx;
+ int irq = evtchn_to_irq[port];
++ struct irq_desc *desc;
+
+- if (irq != -1)
+- handle_irq(irq, regs);
++ if (irq != -1) {
++ desc = irq_to_desc(irq);
++ if (desc)
++ generic_handle_irq_desc(irq, desc);
++ }
+ }
+ }
+
+@@ -855,7 +1255,7 @@ void xen_clear_irq_pending(int irq)
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+ }
+-
++EXPORT_SYMBOL(xen_clear_irq_pending);
+ void xen_set_irq_pending(int irq)
+ {
+ int evtchn = evtchn_from_irq(irq);
+@@ -875,9 +1275,9 @@ bool xen_test_irq_pending(int irq)
+ return ret;
+ }
+
+-/* Poll waiting for an irq to become pending. In the usual case, the
++/* Poll waiting for an irq to become pending with timeout. In the usual case, the
+ irq will be disabled so it won't deliver an interrupt. */
+-void xen_poll_irq(int irq)
++void xen_poll_irq_timeout(int irq, u64 timeout)
+ {
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
+
+@@ -885,13 +1285,20 @@ void xen_poll_irq(int irq)
+ struct sched_poll poll;
+
+ poll.nr_ports = 1;
+- poll.timeout = 0;
++ poll.timeout = timeout;
+ set_xen_guest_handle(poll.ports, &evtchn);
+
+ if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
+ BUG();
+ }
+ }
++EXPORT_SYMBOL(xen_poll_irq_timeout);
++/* Poll waiting for an irq to become pending. In the usual case, the
++ irq will be disabled so it won't deliver an interrupt. */
++void xen_poll_irq(int irq)
++{
++ xen_poll_irq_timeout(irq, 0 /* no timeout */);
++}
+
+ void xen_irq_resume(void)
+ {
+@@ -928,13 +1335,38 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
+ .retrigger = retrigger_dynirq,
+ };
+
++static struct irq_chip xen_pirq_chip __read_mostly = {
++ .name = "xen-pirq",
+
-+#include <xen/xenbus.h>
-+#include <xen/interface/io/blkif.h>
++ .startup = startup_pirq,
++ .shutdown = shutdown_pirq,
+
-+#include <asm/xen/page.h>
-+#include <asm/xen/hypercall.h>
++ .enable = enable_pirq,
++ .unmask = enable_pirq,
+
-+#include "blktap.h"
++ .disable = disable_pirq,
++ .mask = disable_pirq,
+
-+#include "../blkback/blkback-pagemap.h"
++ .ack = ack_pirq,
++ .end = end_pirq,
+
-+#if 0
-+#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
-+#else
-+#define DPRINTK_IOCTL(_f, _a...) ((void)0)
-+#endif
++ .set_affinity = set_affinity_irq,
+
-+struct blktap_grant_table {
-+ int cnt;
-+ struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
++ .retrigger = retrigger_dynirq,
+};
+
-+static int blktap_device_major;
-+
-+static inline struct blktap *
-+dev_to_blktap(struct blktap_device *dev)
-+{
-+ return container_of(dev, struct blktap, device);
-+}
-+
-+static int
-+blktap_device_open(struct block_device * bd, fmode_t mode)
-+{
-+ struct blktap *tap;
-+ struct blktap_device *dev = bd->bd_disk->private_data;
-+
-+ if (!dev)
-+ return -ENOENT;
+ void __init xen_init_IRQ(void)
+ {
+ int i;
+
+ cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
+ GFP_KERNEL);
+- BUG_ON(cpu_evtchn_mask_p == NULL);
++ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
+
-+ tap = dev_to_blktap(dev);
-+ if (!blktap_active(tap) ||
-+ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
-+ return -ENOENT;
++ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
++ GFP_KERNEL);
++ for(i = 0; i < NR_EVENT_CHANNELS; i++)
++ evtchn_to_irq[i] = -1;
+
+ init_evtchn_cpu_bindings();
+
+@@ -943,4 +1375,6 @@ void __init xen_init_IRQ(void)
+ mask_evtchn(i);
+
+ irq_ctx_init(smp_processor_id());
+
-+ dev->users++;
++ xen_setup_pirqs();
+ }
+diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
+index 79bedba..6a1c4a5 100644
+--- a/drivers/xen/evtchn.c
++++ b/drivers/xen/evtchn.c
+@@ -48,6 +48,8 @@
+ #include <linux/gfp.h>
+ #include <linux/mutex.h>
+ #include <linux/cpu.h>
+
-+ return 0;
++#include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/evtchn.h>
+ #include <asm/xen/hypervisor.h>
+@@ -68,10 +70,36 @@ struct per_user_data {
+ const char *name;
+ };
+
+-/* Who's bound to each port? */
+-static struct per_user_data *port_user[NR_EVENT_CHANNELS];
++/*
++ * Who's bound to each port? This is logically an array of struct
++ * per_user_data *, but we encode the current enabled-state in bit 0.
++ */
++static unsigned long *port_user;
+ static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
+
++static inline struct per_user_data *get_port_user(unsigned port)
++{
++ return (struct per_user_data *)(port_user[port] & ~1);
+}
+
-+static int
-+blktap_device_release(struct gendisk *gd, fmode_t mode)
++static inline void set_port_user(unsigned port, struct per_user_data *u)
+{
-+ struct blktap_device *dev = gd->private_data;
-+ struct blktap *tap = dev_to_blktap(dev);
-+
-+ dev->users--;
-+ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
-+ blktap_device_destroy(tap);
-+
-+ return 0;
++ port_user[port] = (unsigned long)u;
+}
+
-+static int
-+blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
++static inline bool get_port_enabled(unsigned port)
+{
-+ /* We don't have real geometry info, but let's at least return
-+ values consistent with the size of the device */
-+ sector_t nsect = get_capacity(bd->bd_disk);
-+ sector_t cylinders = nsect;
-+
-+ hg->heads = 0xff;
-+ hg->sectors = 0x3f;
-+ sector_div(cylinders, hg->heads * hg->sectors);
-+ hg->cylinders = cylinders;
-+ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
-+ hg->cylinders = 0xffff;
-+ return 0;
++ return port_user[port] & 1;
+}
+
-+static int
-+blktap_device_ioctl(struct block_device *bd, fmode_t mode,
-+ unsigned command, unsigned long argument)
++static inline void set_port_enabled(unsigned port, bool enabled)
+{
-+ int i;
-+
-+ DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
-+ command, (long)argument, inode->i_rdev);
-+
-+ switch (command) {
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
-+ case HDIO_GETGEO: {
-+ struct hd_geometry geo;
-+ int ret;
++ if (enabled)
++ port_user[port] |= 1;
++ else
++ port_user[port] &= ~1;
++}
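The accessors above work because a kmalloc()'d struct per_user_data is at least word aligned, so bit 0 of its address is always zero and can carry the per-port enabled flag without widening the array. A stand-alone illustration of that encoding in plain C (hypothetical, not part of the patch):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct user;				/* stand-in for per_user_data */

static inline struct user *slot_ptr(uintptr_t slot)
{
	return (struct user *)(slot & ~(uintptr_t)1);
}

static inline bool slot_enabled(uintptr_t slot)
{
	return slot & 1;
}

static inline uintptr_t make_slot(struct user *u, bool enabled)
{
	assert(((uintptr_t)u & 1) == 0);	/* allocator guarantees alignment */
	return (uintptr_t)u | (enabled ? 1 : 0);
}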
+
-+ if (!argument)
-+ return -EINVAL;
+ irqreturn_t evtchn_interrupt(int irq, void *data)
+ {
+ unsigned int port = (unsigned long)data;
+@@ -79,9 +107,14 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
+
+ spin_lock(&port_user_lock);
+
+- u = port_user[port];
++ u = get_port_user(port);
+
-+ geo.start = get_start_sect(bd);
-+ ret = blktap_device_getgeo(bd, &geo);
-+ if (ret)
-+ return ret;
++ WARN(!get_port_enabled(port),
++ "Interrupt for port %d, but apparently not enabled; per-user %p\n",
++ port, u);
+
+ disable_irq_nosync(irq);
++ set_port_enabled(port, false);
+
+ if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
+ u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
+@@ -91,9 +124,8 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
+ kill_fasync(&u->evtchn_async_queue,
+ SIGIO, POLL_IN);
+ }
+- } else {
++ } else
+ u->ring_overflow = 1;
+- }
+
+ spin_unlock(&port_user_lock);
+
+@@ -197,9 +229,18 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
+ goto out;
+
+ spin_lock_irq(&port_user_lock);
+- for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
+- if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
+- enable_irq(irq_from_evtchn(kbuf[i]));
+
-+ if (copy_to_user((struct hd_geometry __user *)argument, &geo,
-+ sizeof(geo)))
-+ return -EFAULT;
++ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
++ unsigned port = kbuf[i];
+
-+ return 0;
++ if (port < NR_EVENT_CHANNELS &&
++ get_port_user(port) == u &&
++ !get_port_enabled(port)) {
++ set_port_enabled(port, true);
++ enable_irq(irq_from_evtchn(port));
++ }
+ }
-+#endif
-+ case CDROMMULTISESSION:
-+ BTDBG("FIXME: support multisession CDs later\n");
-+ for (i = 0; i < sizeof(struct cdrom_multisession); i++)
-+ if (put_user(0, (char __user *)(argument + i)))
-+ return -EFAULT;
-+ return 0;
-+
-+ case SCSI_IOCTL_GET_IDLUN:
-+ if (!access_ok(VERIFY_WRITE, argument,
-+ sizeof(struct scsi_idlun)))
-+ return -EFAULT;
-+
-+ /* return 0 for now. */
-+ __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
-+ __put_user(0,
-+ &((struct scsi_idlun __user *)argument)->host_unique_id);
-+ return 0;
+
-+ default:
-+ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
-+ command);*/
-+ return -EINVAL; /* same return as native Linux */
-+ }
+ spin_unlock_irq(&port_user_lock);
+
+ rc = count;
+@@ -221,8 +262,9 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
+ * interrupt handler yet, and our caller has already
+ * serialized bind operations.)
+ */
+- BUG_ON(port_user[port] != NULL);
+- port_user[port] = u;
++ BUG_ON(get_port_user(port) != NULL);
++ set_port_user(port, u);
++ set_port_enabled(port, true); /* start enabled */
+
+ rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+ u->name, (void *)(unsigned long)port);
+@@ -238,10 +280,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
+
+ unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+
+- /* make sure we unbind the irq handler before clearing the port */
+- barrier();
+-
+- port_user[port] = NULL;
++ set_port_user(port, NULL);
+ }
+
+ static long evtchn_ioctl(struct file *file,
+@@ -332,7 +371,7 @@ static long evtchn_ioctl(struct file *file,
+ spin_lock_irq(&port_user_lock);
+
+ rc = -ENOTCONN;
+- if (port_user[unbind.port] != u) {
++ if (get_port_user(unbind.port) != u) {
+ spin_unlock_irq(&port_user_lock);
+ break;
+ }
+@@ -354,7 +393,7 @@ static long evtchn_ioctl(struct file *file,
+
+ if (notify.port >= NR_EVENT_CHANNELS) {
+ rc = -EINVAL;
+- } else if (port_user[notify.port] != u) {
++ } else if (get_port_user(notify.port) != u) {
+ rc = -ENOTCONN;
+ } else {
+ notify_remote_via_evtchn(notify.port);
+@@ -443,10 +482,10 @@ static int evtchn_release(struct inode *inode, struct file *filp)
+ free_page((unsigned long)u->ring);
+
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+- if (port_user[i] != u)
++ if (get_port_user(i) != u)
+ continue;
+
+- evtchn_unbind_from_user(port_user[i], i);
++ evtchn_unbind_from_user(get_port_user(i), i);
+ }
+
+ spin_unlock_irq(&port_user_lock);
+@@ -480,8 +519,11 @@ static int __init evtchn_init(void)
+ if (!xen_domain())
+ return -ENODEV;
+
++ port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
++ if (port_user == NULL)
++ return -ENOMEM;
+
-+ return 0;
-+}
+ spin_lock_init(&port_user_lock);
+- memset(port_user, 0, sizeof(port_user));
+
+ /* Create '/dev/misc/evtchn'. */
+ err = misc_register(&evtchn_miscdev);
+@@ -497,6 +539,9 @@ static int __init evtchn_init(void)
+
+ static void __exit evtchn_cleanup(void)
+ {
++ kfree(port_user);
++ port_user = NULL;
+
-+static struct block_device_operations blktap_device_file_operations = {
-+ .owner = THIS_MODULE,
-+ .open = blktap_device_open,
-+ .release = blktap_device_release,
-+ .ioctl = blktap_device_ioctl,
-+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
-+ .getgeo = blktap_device_getgeo
-+#endif
-+};
+ misc_deregister(&evtchn_miscdev);
+ }
+
+diff --git a/drivers/xen/features.c b/drivers/xen/features.c
+index 99eda16..9e2b64f 100644
+--- a/drivers/xen/features.c
++++ b/drivers/xen/features.c
+@@ -18,7 +18,7 @@
+ u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+ EXPORT_SYMBOL_GPL(xen_features);
+
+-void xen_setup_features(void)
++void __init xen_setup_features(void)
+ {
+ struct xen_feature_info fi;
+ int i, j;
+diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
+new file mode 100644
+index 0000000..ddc59cc
+--- /dev/null
++++ b/drivers/xen/gntdev.c
+@@ -0,0 +1,626 @@
++/******************************************************************************
++ * gntdev.c
++ *
++ * Device for accessing (in user-space) pages that have been granted by other
++ * domains.
++ *
++ * Copyright (c) 2006-2007, D G Murray.
++ * (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
+
-+static int
-+blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
-+ unsigned long addr, void *data)
-+{
-+ pte_t *pte = (pte_t *)data;
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/miscdevice.h>
++#include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/mmu_notifier.h>
++#include <linux/types.h>
++#include <linux/uaccess.h>
++#include <linux/sched.h>
++#include <linux/rwsem.h>
+
-+ BTDBG("ptep %p -> %012llx\n", ptep, (unsigned long long)pte_val(*pte));
-+ set_pte(ptep, *pte);
-+ return 0;
-+}
++#include <xen/xen.h>
++#include <xen/grant_table.h>
++#include <xen/gntdev.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include <asm/xen/page.h>
+
-+static int
-+blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte)
-+{
-+ return apply_to_page_range(mm, address,
-+ PAGE_SIZE, blktap_map_uaddr_fn, &pte);
-+}
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("Derek G. Murray <Derek.Murray at cl.cam.ac.uk>, "
++ "Gerd Hoffmann <kraxel at redhat.com>");
++MODULE_DESCRIPTION("User-space granted page access driver");
+
-+static int
-+blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
-+ unsigned long addr, void *data)
-+{
-+ struct mm_struct *mm = (struct mm_struct *)data;
++static int debug = 0;
++module_param(debug, int, 0644);
++static int limit = 1024;
++module_param(limit, int, 0644);
+
-+ BTDBG("ptep %p\n", ptep);
-+ pte_clear(mm, addr, ptep);
-+ return 0;
-+}
++struct gntdev_priv {
++ struct list_head maps;
++ uint32_t used;
++ uint32_t limit;
++ struct rw_semaphore sem;
++ struct mm_struct *mm;
++ struct mmu_notifier mn;
++};
+
-+static int
-+blktap_umap_uaddr(struct mm_struct *mm, unsigned long address)
-+{
-+ return apply_to_page_range(mm, address,
-+ PAGE_SIZE, blktap_umap_uaddr_fn, mm);
-+}
++struct grant_map {
++ struct list_head next;
++ struct gntdev_priv *priv;
++ struct vm_area_struct *vma;
++ int index;
++ int count;
++ int flags;
++ int is_mapped;
++ struct ioctl_gntdev_grant_ref *grants;
++ struct gnttab_map_grant_ref *map_ops;
++ struct gnttab_unmap_grant_ref *unmap_ops;
++};
+
-+static inline void
-+flush_tlb_kernel_page(unsigned long kvaddr)
-+{
-+ flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE);
-+}
++/* ------------------------------------------------------------------ */
+
-+static void
-+blktap_device_end_dequeued_request(struct blktap_device *dev,
-+ struct request *req, int error)
++static void gntdev_print_maps(struct gntdev_priv *priv,
++ char *text, int text_index)
+{
-+ unsigned long flags;
-+ int ret;
-+
-+ //spin_lock_irq(&dev->lock);
-+ spin_lock_irqsave(dev->gd->queue->queue_lock, flags);
-+ ret = __blk_end_request(req, error, blk_rq_bytes(req));
-+ spin_unlock_irqrestore(dev->gd->queue->queue_lock, flags);
-+ //spin_unlock_irq(&dev->lock);
++ struct grant_map *map;
+
-+ BUG_ON(ret);
++ printk("%s: maps list (priv %p, usage %d/%d)\n",
++ __FUNCTION__, priv, priv->used, priv->limit);
++ list_for_each_entry(map, &priv->maps, next)
++ printk(" index %2d, count %2d %s\n",
++ map->index, map->count,
++ map->index == text_index && text ? text : "");
+}
+
-+/*
-+ * tap->tap_sem held on entry
-+ */
-+static void
-+blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
++static struct grant_map *gntdev_add_map(struct gntdev_priv *priv, int count)
+{
-+ uint64_t ptep;
-+ int ret, usr_idx;
-+ unsigned int i, cnt;
-+ struct page **map, *page;
-+ struct blktap_ring *ring;
-+ struct grant_handle_pair *khandle;
-+ unsigned long kvaddr, uvaddr, offset;
-+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
-+
-+ cnt = 0;
-+ ring = &tap->ring;
-+ usr_idx = request->usr_idx;
-+ map = ring->foreign_map.map;
-+
-+ if (!ring->vma)
-+ return;
-+
-+ if (xen_feature(XENFEAT_auto_translated_physmap))
-+ zap_page_range(ring->vma,
-+ MMAP_VADDR(ring->user_vstart, usr_idx, 0),
-+ request->nr_pages << PAGE_SHIFT, NULL);
++ struct grant_map *map, *add;
+
-+ for (i = 0; i < request->nr_pages; i++) {
-+ kvaddr = request_to_kaddr(request, i);
-+ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
++ if (NULL == add)
++ return NULL;
+
-+ khandle = request->handles + i;
++ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
++ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
++ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
++ if (NULL == add->grants ||
++ NULL == add->map_ops ||
++ NULL == add->unmap_ops)
++ goto err;
+
-+ if (khandle->kernel != INVALID_GRANT_HANDLE) {
-+ gnttab_set_unmap_op(&unmap[cnt], kvaddr,
-+ GNTMAP_host_map, khandle->kernel);
-+ cnt++;
-+ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
-+ INVALID_P2M_ENTRY);
-+ }
++ add->index = 0;
++ add->count = count;
++ add->priv = priv;
+
-+ if (khandle->user != INVALID_GRANT_HANDLE) {
-+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-+ if (create_lookup_pte_addr(ring->vma->vm_mm,
-+ uvaddr, &ptep) != 0) {
-+ BTERR("Couldn't get a pte addr!\n");
-+ return;
-+ }
++ if (add->count + priv->used > priv->limit)
++ goto err;
+
-+ gnttab_set_unmap_op(&unmap[cnt], ptep,
-+ GNTMAP_host_map
-+ | GNTMAP_application_map
-+ | GNTMAP_contains_pte,
-+ khandle->user);
-+ cnt++;
++ list_for_each_entry(map, &priv->maps, next) {
++ if (add->index + add->count < map->index) {
++ list_add_tail(&add->next, &map->next);
++ goto done;
+ }
++ add->index = map->index + map->count;
++ }
++ list_add_tail(&add->next, &priv->maps);
+
-+ offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
++done:
++ priv->used += add->count;
++ if (debug)
++ gntdev_print_maps(priv, "[new]", add->index);
++ return add;
+
-+ BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
-+ "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
-+ "0x%08lx, handle: %u\n", offset, map[offset], request,
-+ usr_idx, i, kvaddr, khandle->kernel, uvaddr,
-+ khandle->user);
++err:
++ kfree(add->grants);
++ kfree(add->map_ops);
++ kfree(add->unmap_ops);
++ kfree(add);
++ return NULL;
++}
+
-+ page = map[offset];
-+ if (page) {
-+ ClearPageReserved(map[offset]);
-+ if (blkback_pagemap_contains_page(page))
-+ set_page_private(page, 0);
-+ }
-+ map[offset] = NULL;
++static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, int index,
++ int count)
++{
++ struct grant_map *map;
+
-+ khandle->kernel = INVALID_GRANT_HANDLE;
-+ khandle->user = INVALID_GRANT_HANDLE;
++ list_for_each_entry(map, &priv->maps, next) {
++ if (map->index != index)
++ continue;
++ if (map->count != count)
++ continue;
++ return map;
+ }
++ return NULL;
++}
+
-+ if (cnt) {
-+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
-+ unmap, cnt);
-+ BUG_ON(ret);
-+ }
++static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
++ unsigned long vaddr)
++{
++ struct grant_map *map;
+
-+ if (!xen_feature(XENFEAT_auto_translated_physmap))
-+ zap_page_range(ring->vma,
-+ MMAP_VADDR(ring->user_vstart, usr_idx, 0),
-+ request->nr_pages << PAGE_SHIFT, NULL);
++ list_for_each_entry(map, &priv->maps, next) {
++ if (!map->vma)
++ continue;
++ if (vaddr < map->vma->vm_start)
++ continue;
++ if (vaddr >= map->vma->vm_end)
++ continue;
++ return map;
++ }
++ return NULL;
+}
+
-+/*
-+ * tap->tap_sem held on entry
-+ */
-+static void
-+blktap_unmap(struct blktap *tap, struct blktap_request *request)
++static int gntdev_del_map(struct grant_map *map)
+{
-+ int i, usr_idx;
-+ unsigned long kvaddr;
++ int i;
+
-+ usr_idx = request->usr_idx;
-+ down_write(&tap->ring.vma->vm_mm->mmap_sem);
++ if (map->vma)
++ return -EBUSY;
++ for (i = 0; i < map->count; i++)
++ if (map->unmap_ops[i].handle)
++ return -EBUSY;
+
-+ for (i = 0; i < request->nr_pages; i++) {
-+ kvaddr = request_to_kaddr(request, i);
-+ BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
-+ "uvaddr: 0x%08lx, uhandle: %u\n", request, i,
-+ kvaddr, request->handles[i].kernel,
-+ MMAP_VADDR(tap->ring.user_vstart, usr_idx, i),
-+ request->handles[i].user);
++ map->priv->used -= map->count;
++ list_del(&map->next);
++ kfree(map->grants);
++ kfree(map->map_ops);
++ kfree(map->unmap_ops);
++ kfree(map);
++ return 0;
++}
+
-+ if (request->handles[i].kernel == INVALID_GRANT_HANDLE) {
-+ blktap_umap_uaddr(tap->ring.vma->vm_mm, kvaddr);
-+ flush_tlb_kernel_page(kvaddr);
-+ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
-+ INVALID_P2M_ENTRY);
-+ }
-+ }
++/* ------------------------------------------------------------------ */
+
-+ blktap_device_fast_flush(tap, request);
-+ up_write(&tap->ring.vma->vm_mm->mmap_sem);
++static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void *data)
++{
++ struct grant_map *map = data;
++ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
++ u64 pte_maddr;
++
++ BUG_ON(pgnr >= map->count);
++ pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT;
++ pte_maddr += (unsigned long)pte & ~PAGE_MASK;
++ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags,
++ map->grants[pgnr].ref,
++ map->grants[pgnr].domid);
++ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags,
++ 0 /* handle */);
++ return 0;
+}
+
-+/*
-+ * called if the tapdisk process dies unexpectedly.
-+ * fail and release any pending requests and disable queue.
-+ */
-+void
-+blktap_device_fail_pending_requests(struct blktap *tap)
++static int map_grant_pages(struct grant_map *map)
+{
-+ int usr_idx;
-+ struct request *req;
-+ struct blktap_device *dev;
-+ struct blktap_request *request;
++ int i, err = 0;
+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
-+ return;
++ if (debug)
++ printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count);
++ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ map->map_ops, map->count);
++ if (WARN_ON(err))
++ return err;
+
-+ down_write(&tap->tap_sem);
++ for (i = 0; i < map->count; i++) {
++ if (map->map_ops[i].status)
++ err = -EINVAL;
++ map->unmap_ops[i].handle = map->map_ops[i].handle;
++ }
++ return err;
++}
+
-+ dev = &tap->device;
-+ for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
-+ request = tap->pending_requests[usr_idx];
-+ if (!request || request->status != BLKTAP_REQUEST_PENDING)
-+ continue;
++static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
++{
++ int i, err = 0;
+
-+ BTERR("%u:%u: failing pending %s of %d pages\n",
-+ blktap_device_major, tap->minor,
-+ (request->operation == BLKIF_OP_READ ?
-+ "read" : "write"), request->nr_pages);
++ if (debug)
++ printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__,
++ map->index, map->count, offset, pages);
++ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++ map->unmap_ops + offset, pages);
++ if (WARN_ON(err))
++ return err;
+
-+ blktap_unmap(tap, request);
-+ req = (struct request *)(unsigned long)request->id;
-+ blktap_device_end_dequeued_request(dev, req, -EIO);
-+ blktap_request_free(tap, request);
++ for (i = 0; i < pages; i++) {
++ if (map->unmap_ops[offset+i].status)
++ err = -EINVAL;
++ map->unmap_ops[offset+i].handle = 0;
+ }
++ return err;
++}
+
-+ up_write(&tap->tap_sem);
-+
-+ spin_lock_irq(&dev->lock);
++/* ------------------------------------------------------------------ */
+
-+ /* fail any future requests */
-+ dev->gd->queue->queuedata = NULL;
-+ blk_start_queue(dev->gd->queue);
++static void gntdev_vma_close(struct vm_area_struct *vma)
++{
++ struct grant_map *map = vma->vm_private_data;
+
-+ spin_unlock_irq(&dev->lock);
++ if (debug)
++ printk("%s\n", __FUNCTION__);
++ map->is_mapped = 0;
++ map->vma = NULL;
++ vma->vm_private_data = NULL;
+}
+
-+/*
-+ * tap->tap_sem held on entry
-+ */
-+void
-+blktap_device_finish_request(struct blktap *tap,
-+ struct blkif_response *res,
-+ struct blktap_request *request)
++static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
-+ int ret;
-+ struct request *req;
-+ struct blktap_device *dev;
-+
-+ dev = &tap->device;
++ if (debug)
++ printk("%s: vaddr %p, pgoff %ld (shouldn't happen)\n",
++ __FUNCTION__, vmf->virtual_address, vmf->pgoff);
++ vmf->flags = VM_FAULT_ERROR;
++ return 0;
++}
+
-+ blktap_unmap(tap, request);
++static struct vm_operations_struct gntdev_vmops = {
++ .close = gntdev_vma_close,
++ .fault = gntdev_vma_fault,
++};
+
-+ req = (struct request *)(unsigned long)request->id;
-+ ret = res->status == BLKIF_RSP_OKAY ? 0 : -EIO;
++/* ------------------------------------------------------------------ */
+
-+ BTDBG("req %p res status %d operation %d/%d id %lld\n", req,
-+ res->status, res->operation, request->operation,
-+ (unsigned long long)res->id);
++static void mn_invl_range_start(struct mmu_notifier *mn,
++ struct mm_struct *mm,
++ unsigned long start, unsigned long end)
++{
++ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
++ struct grant_map *map;
++ unsigned long mstart, mend;
++ int err;
+
-+ switch (request->operation) {
-+ case BLKIF_OP_READ:
-+ case BLKIF_OP_WRITE:
-+ if (unlikely(res->status != BLKIF_RSP_OKAY))
-+ BTERR("Bad return from device data "
-+ "request: %x\n", res->status);
-+ blktap_device_end_dequeued_request(dev, req, ret);
-+ break;
-+ default:
-+ BUG();
++ down_read(&priv->sem);
++ list_for_each_entry(map, &priv->maps, next) {
++ if (!map->vma)
++ continue;
++ if (!map->is_mapped)
++ continue;
++ if (map->vma->vm_start >= end)
++ continue;
++ if (map->vma->vm_end <= start)
++ continue;
++ mstart = max(start, map->vma->vm_start);
++ mend = min(end, map->vma->vm_end);
++ if (debug)
++ printk("%s: map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
++ __FUNCTION__, map->index, map->count,
++ map->vma->vm_start, map->vma->vm_end,
++ start, end, mstart, mend);
++ err = unmap_grant_pages(map,
++ (mstart - map->vma->vm_start) >> PAGE_SHIFT,
++ (mend - mstart) >> PAGE_SHIFT);
++ WARN_ON(err);
+ }
++ up_read(&priv->sem);
++}
+
-+ blktap_request_free(tap, request);
++static void mn_invl_page(struct mmu_notifier *mn,
++ struct mm_struct *mm,
++ unsigned long address)
++{
++ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
+}
+
-+static int
-+blktap_prep_foreign(struct blktap *tap,
-+ struct blktap_request *request,
-+ struct blkif_request *blkif_req,
-+ unsigned int seg, struct page *page,
-+ struct blktap_grant_table *table)
++static void mn_release(struct mmu_notifier *mn,
++ struct mm_struct *mm)
+{
-+ uint64_t ptep;
-+ uint32_t flags;
-+#ifdef BLKTAP_CHAINED_BLKTAP
-+ struct page *tap_page;
-+#endif
-+ struct blktap_ring *ring;
-+ struct blkback_pagemap map;
-+ unsigned long uvaddr, kvaddr;
++ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
++ struct grant_map *map;
++ int err;
+
-+ ring = &tap->ring;
-+ map = blkback_pagemap_read(page);
-+ blkif_req->seg[seg].gref = map.gref;
++ down_read(&priv->sem);
++ list_for_each_entry(map, &priv->maps, next) {
++ if (!map->vma)
++ continue;
++ if (debug)
++ printk("%s: map %d+%d (%lx %lx)\n",
++ __FUNCTION__, map->index, map->count,
++ map->vma->vm_start, map->vma->vm_end);
++ err = unmap_grant_pages(map, 0, map->count);
++ WARN_ON(err);
++ }
++ up_read(&priv->sem);
++}
+
-+ uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
-+ kvaddr = request_to_kaddr(request, seg);
-+ flags = GNTMAP_host_map |
-+ (request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);
++struct mmu_notifier_ops gntdev_mmu_ops = {
++ .release = mn_release,
++ .invalidate_page = mn_invl_page,
++ .invalidate_range_start = mn_invl_range_start,
++};
+
-+ gnttab_set_map_op(&table->grants[table->cnt],
-+ kvaddr, flags, map.gref, map.domid);
-+ table->cnt++;
++/* ------------------------------------------------------------------ */
+
++static int gntdev_open(struct inode *inode, struct file *flip)
++{
++ struct gntdev_priv *priv;
+
-+#ifdef BLKTAP_CHAINED_BLKTAP
-+ /* enable chained tap devices */
-+ tap_page = request_to_page(request, seg);
-+ set_page_private(tap_page, page_private(page));
-+ SetPageBlkback(tap_page);
-+#endif
++ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
++ if (!priv)
++ return -ENOMEM;
+
-+ if (xen_feature(XENFEAT_auto_translated_physmap))
-+ return 0;
++ INIT_LIST_HEAD(&priv->maps);
++ init_rwsem(&priv->sem);
++ priv->limit = limit;
+
-+ if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
-+ BTERR("couldn't get a pte addr!\n");
-+ return -1;
++ priv->mm = get_task_mm(current);
++ if (!priv->mm) {
++ kfree(priv);
++ return -ENOMEM;
+ }
++ priv->mn.ops = &gntdev_mmu_ops;
++ mmu_notifier_register(&priv->mn, priv->mm);
++ mmput(priv->mm);
+
-+ flags |= GNTMAP_application_map | GNTMAP_contains_pte;
-+ gnttab_set_map_op(&table->grants[table->cnt],
-+ ptep, flags, map.gref, map.domid);
-+ table->cnt++;
++ flip->private_data = priv;
++ if (debug)
++ printk("%s: priv %p\n", __FUNCTION__, priv);
+
+ return 0;
+}
+
-+static int
-+blktap_map_foreign(struct blktap *tap,
-+ struct blktap_request *request,
-+ struct blkif_request *blkif_req,
-+ struct blktap_grant_table *table)
++static int gntdev_release(struct inode *inode, struct file *flip)
+{
-+ struct page *page;
-+ int i, grant, err, usr_idx;
-+ struct blktap_ring *ring;
-+ unsigned long uvaddr, foreign_mfn;
-+
-+ if (!table->cnt)
-+ return 0;
-+
-+ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
-+ table->grants, table->cnt);
-+ BUG_ON(err);
-+
-+ grant = 0;
-+ usr_idx = request->usr_idx;
-+ ring = &tap->ring;
-+
-+ for (i = 0; i < request->nr_pages; i++) {
-+ if (!blkif_req->seg[i].gref)
-+ continue;
-+
-+ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
-+
-+ if (unlikely(table->grants[grant].status)) {
-+ BTERR("invalid kernel buffer: could not remap it\n");
-+ err |= 1;
-+ table->grants[grant].handle = INVALID_GRANT_HANDLE;
-+ }
-+
-+ request->handles[i].kernel = table->grants[grant].handle;
-+ foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
-+ grant++;
-+
-+ if (xen_feature(XENFEAT_auto_translated_physmap))
-+ goto done;
-+
-+ if (unlikely(table->grants[grant].status)) {
-+ BTERR("invalid user buffer: could not remap it\n");
-+ err |= 1;
-+ table->grants[grant].handle = INVALID_GRANT_HANDLE;
-+ }
++ struct gntdev_priv *priv = flip->private_data;
++ struct grant_map *map;
++ int err;
+
-+ request->handles[i].user = table->grants[grant].handle;
-+ grant++;
++ if (debug)
++ printk("%s: priv %p\n", __FUNCTION__, priv);
+
-+ done:
-+ if (err)
-+ continue;
++ down_write(&priv->sem);
++ while (!list_empty(&priv->maps)) {
++ map = list_entry(priv->maps.next, struct grant_map, next);
++ err = gntdev_del_map(map);
++ WARN_ON(err);
++ }
++ up_write(&priv->sem);
++ mmu_notifier_unregister(&priv->mn, priv->mm);
++ kfree(priv);
++ return 0;
++}
+
-+ page = request_to_page(request, i);
++static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
++ struct ioctl_gntdev_map_grant_ref __user *u)
++{
++ struct ioctl_gntdev_map_grant_ref op;
++ struct grant_map *map;
++ int err;
+
-+ if (!xen_feature(XENFEAT_auto_translated_physmap))
-+ set_phys_to_machine(page_to_pfn(page),
-+ FOREIGN_FRAME(foreign_mfn));
-+ else if (vm_insert_page(ring->vma, uvaddr, page))
-+ err |= 1;
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, add %d\n", __FUNCTION__, priv,
++ op.count);
++ if (unlikely(op.count <= 0))
++ return -EINVAL;
++ if (unlikely(op.count > priv->limit))
++ return -EINVAL;
+
-+ BTDBG("pending_req: %p, seg: %d, page: %p, "
-+ "kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, "
-+ "uhandle: %u\n", request, i, page,
-+ pfn_to_kaddr(page_to_pfn(page)),
-+ request->handles[i].kernel,
-+ uvaddr, request->handles[i].user);
-+ }
++ down_write(&priv->sem);
++ err = -ENOMEM;
++ map = gntdev_add_map(priv, op.count);
++ if (!map)
++ goto err_unlock;
++
++ err = -ENOMEM;
++ if (copy_from_user(map->grants, &u->refs,
++ sizeof(map->grants[0]) * op.count) != 0)
++ goto err_free;
++ op.index = map->index << PAGE_SHIFT;
++ if (copy_to_user(u, &op, sizeof(op)) != 0)
++ goto err_free;
++ up_write(&priv->sem);
++ return 0;
+
++err_free:
++ gntdev_del_map(map);
++err_unlock:
++ up_write(&priv->sem);
+ return err;
+}
+
-+static void
-+blktap_map(struct blktap *tap,
-+ struct blktap_request *request,
-+ unsigned int seg, struct page *page)
++static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
++ struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
-+ pte_t pte;
-+ int usr_idx;
-+ struct blktap_ring *ring;
-+ unsigned long uvaddr, kvaddr;
-+
-+ ring = &tap->ring;
-+ usr_idx = request->usr_idx;
-+ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
-+ kvaddr = request_to_kaddr(request, seg);
-+
-+ pte = mk_pte(page, ring->vma->vm_page_prot);
-+ blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte));
-+ flush_tlb_page(ring->vma, uvaddr);
-+ blktap_map_uaddr(ring->vma->vm_mm, kvaddr, mk_pte(page, PAGE_KERNEL));
-+ flush_tlb_kernel_page(kvaddr);
++ struct ioctl_gntdev_unmap_grant_ref op;
++ struct grant_map *map;
++ int err = -EINVAL;
+
-+ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
-+ request->handles[seg].kernel = INVALID_GRANT_HANDLE;
-+ request->handles[seg].user = INVALID_GRANT_HANDLE;
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, del %d+%d\n", __FUNCTION__, priv,
++ (int)op.index, (int)op.count);
+
-+ BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
-+ "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
-+ uvaddr);
++ down_write(&priv->sem);
++ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
++ if (map)
++ err = gntdev_del_map(map);
++ up_write(&priv->sem);
++ return err;
+}
+
-+static int
-+blktap_device_process_request(struct blktap *tap,
-+ struct blktap_request *request,
-+ struct request *req)
++static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
++ struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
-+ struct page *page;
-+ int i, usr_idx, err;
-+ struct blktap_ring *ring;
-+ struct scatterlist *sg;
-+ struct blktap_grant_table table;
-+ unsigned int fsect, lsect, nr_sects;
-+ unsigned long offset, uvaddr;
-+ struct blkif_request blkif_req, *target;
-+
-+ err = -1;
-+ memset(&table, 0, sizeof(table));
++ struct ioctl_gntdev_get_offset_for_vaddr op;
++ struct grant_map *map;
+
-+ if (!blktap_active(tap))
-+ goto out;
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, offset for vaddr %lx\n", __FUNCTION__, priv,
++ (unsigned long)op.vaddr);
+
-+ ring = &tap->ring;
-+ usr_idx = request->usr_idx;
-+ blkif_req.id = usr_idx;
-+ blkif_req.sector_number = (blkif_sector_t)blk_rq_pos(req);
-+ blkif_req.handle = 0;
-+ blkif_req.operation = rq_data_dir(req) ?
-+ BLKIF_OP_WRITE : BLKIF_OP_READ;
++ down_read(&priv->sem);
++ map = gntdev_find_map_vaddr(priv, op.vaddr);
++ if (map == NULL ||
++ map->vma->vm_start != op.vaddr) {
++ up_read(&priv->sem);
++ return -EINVAL;
++ }
++ op.offset = map->index << PAGE_SHIFT;
++ op.count = map->count;
++ up_read(&priv->sem);
+
-+ request->id = (unsigned long)req;
-+ request->operation = blkif_req.operation;
-+ request->status = BLKTAP_REQUEST_PENDING;
-+ do_gettimeofday(&request->time);
++ if (copy_to_user(u, &op, sizeof(op)) != 0)
++ return -EFAULT;
++ return 0;
++}
+
-+ nr_sects = 0;
-+ request->nr_pages = 0;
-+ blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg);
-+ BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-+ for (i = 0; i < blkif_req.nr_segments; ++i) {
-+ sg = tap->sg + i;
-+ fsect = sg->offset >> 9;
-+ lsect = fsect + (sg->length >> 9) - 1;
-+ nr_sects += sg->length >> 9;
++static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
++ struct ioctl_gntdev_set_max_grants __user *u)
++{
++ struct ioctl_gntdev_set_max_grants op;
+
-+ blkif_req.seg[i] =
-+ (struct blkif_request_segment) {
-+ .gref = 0,
-+ .first_sect = fsect,
-+ .last_sect = lsect };
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, limit %d\n", __FUNCTION__, priv, op.count);
++ if (op.count > limit)
++ return -EINVAL;
+
-+ if (blkback_pagemap_contains_page(sg_page(sg))) {
-+ /* foreign page -- use xen */
-+ if (blktap_prep_foreign(tap,
-+ request,
-+ &blkif_req,
-+ i,
-+ sg_page(sg),
-+ &table))
-+ goto out;
-+ } else {
-+ /* do it the old fashioned way */
-+ blktap_map(tap,
-+ request,
-+ i,
-+ sg_page(sg));
-+ }
++ down_write(&priv->sem);
++ priv->limit = op.count;
++ up_write(&priv->sem);
++ return 0;
++}
+
-+ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
-+ offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
-+ page = request_to_page(request, i);
-+ ring->foreign_map.map[offset] = page;
-+ SetPageReserved(page);
++static long gntdev_ioctl(struct file *flip,
++ unsigned int cmd, unsigned long arg)
++{
++ struct gntdev_priv *priv = flip->private_data;
++ void __user *ptr = (void __user *)arg;
+
-+ BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
-+ uvaddr, page, page_to_pfn(page));
-+ BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
-+ "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n",
-+ offset, request, i,
-+ page, pfn_to_kaddr(page_to_pfn(page)), uvaddr);
++ switch (cmd) {
++ case IOCTL_GNTDEV_MAP_GRANT_REF:
++ return gntdev_ioctl_map_grant_ref(priv, ptr);
+
-+ request->nr_pages++;
-+ }
++ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
++ return gntdev_ioctl_unmap_grant_ref(priv, ptr);
+
-+ if (blktap_map_foreign(tap, request, &blkif_req, &table))
-+ goto out;
++ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
++ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
+
-+ /* Finally, write the request message to the user ring. */
-+ target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
-+ memcpy(target, &blkif_req, sizeof(blkif_req));
-+ target->id = request->usr_idx;
-+ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
-+ ring->ring.req_prod_pvt++;
++ case IOCTL_GNTDEV_SET_MAX_GRANTS:
++ return gntdev_ioctl_set_max_grants(priv, ptr);
+
-+ if (rq_data_dir(req)) {
-+ tap->stats.st_wr_sect += nr_sects;
-+ tap->stats.st_wr_req++;
-+ } else {
-+ tap->stats.st_rd_sect += nr_sects;
-+ tap->stats.st_rd_req++;
++ default:
++ if (debug)
++ printk("%s: priv %p, unknown cmd %x\n",
++ __FUNCTION__, priv, cmd);
++ return -ENOIOCTLCMD;
+ }
+
-+ err = 0;
-+
-+out:
-+ if (err)
-+ blktap_device_fast_flush(tap, request);
-+ return err;
++ return 0;
+}
+
-+#ifdef ENABLE_PASSTHROUGH
-+#define rq_for_each_bio_safe(_bio, _tmp, _req) \
-+ if ((_req)->bio) \
-+ for (_bio = (_req)->bio; \
-+ _bio && ((_tmp = _bio->bi_next) || 1); \
-+ _bio = _tmp)
-+
-+static void
-+blktap_device_forward_request(struct blktap *tap, struct request *req)
++static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
+{
-+ struct bio *bio, *tmp;
-+ struct blktap_device *dev;
-+
-+ dev = &tap->device;
++ struct gntdev_priv *priv = flip->private_data;
++ int index = vma->vm_pgoff;
++ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++ struct grant_map *map;
++ int err = -EINVAL;
+
-+ rq_for_each_bio_safe(bio, tmp, req) {
-+ bio->bi_bdev = dev->bdev;
-+ submit_bio(bio->bi_rw, bio);
-+ }
-+}
++ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
++ return -EINVAL;
+
-+static void
-+blktap_device_close_bdev(struct blktap *tap)
-+{
-+ struct blktap_device *dev;
++ if (debug)
++ printk("%s: map %d+%d at %lx (pgoff %lx)\n", __FUNCTION__,
++ index, count, vma->vm_start, vma->vm_pgoff);
+
-+ dev = &tap->device;
++ down_read(&priv->sem);
++ map = gntdev_find_map_index(priv, index, count);
++ if (!map)
++ goto unlock_out;
++ if (map->vma)
++ goto unlock_out;
++ if (priv->mm != vma->vm_mm) {
++ printk("%s: Huh? Other mm?\n", __FUNCTION__);
++ goto unlock_out;
++ }
+
-+ if (dev->bdev)
-+ blkdev_put(dev->bdev);
++ vma->vm_ops = &gntdev_vmops;
+
-+ dev->bdev = NULL;
-+ clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
-+}
++ vma->vm_flags |= VM_RESERVED;
++ vma->vm_flags |= VM_DONTCOPY;
++ vma->vm_flags |= VM_DONTEXPAND;
+
-+static int
-+blktap_device_open_bdev(struct blktap *tap, u32 pdev)
-+{
-+ struct block_device *bdev;
-+ struct blktap_device *dev;
++ vma->vm_private_data = map;
++ map->vma = vma;
+
-+ dev = &tap->device;
++ map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte;
++ if (!(vma->vm_flags & VM_WRITE))
++ map->flags |= GNTMAP_readonly;
+
-+ bdev = open_by_devnum(pdev, FMODE_WRITE);
-+ if (IS_ERR(bdev)) {
-+ BTERR("opening device %x:%x failed: %ld\n",
-+ MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev));
-+ return PTR_ERR(bdev);
++ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
++ vma->vm_end - vma->vm_start,
++ find_grant_ptes, map);
++ if (err) {
++		if (debug)
++			printk("%s: find_grant_ptes() failure.\n", __FUNCTION__);
++		goto unlock_out;
+ }
+
-+ if (!bdev->bd_disk) {
-+ BTERR("device %x:%x doesn't exist\n",
-+ MAJOR(pdev), MINOR(pdev));
-+ blkdev_put(dev->bdev);
-+ return -ENOENT;
++ err = map_grant_pages(map);
++ if (err) {
++		if (debug)
++			printk("%s: map_grant_pages() failure.\n", __FUNCTION__);
++		goto unlock_out;
+ }
++ map->is_mapped = 1;
+
-+ dev->bdev = bdev;
-+ set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++unlock_out:
++ up_read(&priv->sem);
++ return err;
++}
+
-+ /* TODO: readjust queue parameters */
++static const struct file_operations gntdev_fops = {
++ .owner = THIS_MODULE,
++ .open = gntdev_open,
++ .release = gntdev_release,
++ .mmap = gntdev_mmap,
++ .unlocked_ioctl = gntdev_ioctl
++};
+
-+ BTINFO("set device %d to passthrough on %x:%x\n",
-+ tap->minor, MAJOR(pdev), MINOR(pdev));
++static struct miscdevice gntdev_miscdev = {
++ .minor = MISC_DYNAMIC_MINOR,
++ .name = "gntdev",
++ .fops = &gntdev_fops,
++};
+
-+ return 0;
-+}
++/* ------------------------------------------------------------------ */
+
-+int
-+blktap_device_enable_passthrough(struct blktap *tap,
-+ unsigned major, unsigned minor)
++static int __init gntdev_init(void)
+{
-+ u32 pdev;
-+ struct blktap_device *dev;
-+
-+ dev = &tap->device;
-+ pdev = MKDEV(major, minor);
++ int err;
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EINVAL;
++ if (!xen_domain())
++ return -ENODEV;
+
-+ if (dev->bdev) {
-+ if (pdev)
-+ return -EINVAL;
-+ blktap_device_close_bdev(tap);
-+ return 0;
++ err = misc_register(&gntdev_miscdev);
++ if (err != 0) {
++ printk(KERN_ERR "Could not register gntdev device\n");
++ return err;
+ }
++ return 0;
++}
+
-+ return blktap_device_open_bdev(tap, pdev);
++static void __exit gntdev_exit(void)
++{
++ misc_deregister(&gntdev_miscdev);
++}
++
++module_init(gntdev_init);
++module_exit(gntdev_exit);
++
++/* ------------------------------------------------------------------ */
+diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
+index 7d8f531..76fe621 100644
+--- a/drivers/xen/grant-table.c
++++ b/drivers/xen/grant-table.c
+@@ -37,6 +37,7 @@
+ #include <linux/vmalloc.h>
+ #include <linux/uaccess.h>
+
++#include <xen/xen.h>
+ #include <xen/interface/xen.h>
+ #include <xen/page.h>
+ #include <xen/grant_table.h>
+@@ -472,6 +473,111 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+ return 0;
+ }
+
++static void gnttab_page_free(struct page *page, unsigned int order)
++{
++ BUG_ON(order);
++ ClearPageForeign(page);
++ gnttab_reset_grant_page(page);
++ put_page(page);
+}
-+#endif
+
+/*
-+ * dev->lock held on entry
++ * Must not be called with IRQs off. This should only be used on the
++ * slow path.
++ *
++ * Copy a foreign granted page to local memory.
+ */
-+static void
-+blktap_device_run_queue(struct blktap *tap)
++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
+{
-+ int queued, err;
-+ struct request_queue *rq;
-+ struct request *req;
-+ struct blktap_ring *ring;
-+ struct blktap_device *dev;
-+ struct blktap_request *request;
++ struct gnttab_unmap_and_replace unmap;
++ struct mmu_update mmu;
++ struct page *page;
++ struct page *new_page;
++ void *new_addr;
++ void *addr;
++ unsigned long pfn;
++ unsigned long mfn;
++ unsigned long new_mfn;
++ int err;
+
-+ queued = 0;
-+ ring = &tap->ring;
-+ dev = &tap->device;
-+ rq = dev->gd->queue;
++ page = *pagep;
++ if (!get_page_unless_zero(page))
++ return -ENOENT;
+
-+ BTDBG("running queue for %d\n", tap->minor);
++ err = -ENOMEM;
++ new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
++ if (!new_page)
++ goto out;
+
-+ while ((req = blk_peek_request(rq)) != NULL) {
-+ if (!blk_fs_request(req)) {
-+ __blk_end_request_cur(req, 0);
-+ continue;
-+ }
++ new_addr = page_address(new_page);
++ addr = page_address(page);
++ memcpy(new_addr, addr, PAGE_SIZE);
+
-+ if (blk_barrier_rq(req)) {
-+ __blk_end_request_cur(req, 0);
-+ continue;
-+ }
++ pfn = page_to_pfn(page);
++ mfn = pfn_to_mfn(pfn);
++ new_mfn = virt_to_mfn(new_addr);
+
-+#ifdef ENABLE_PASSTHROUGH
-+ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
-+ blkdev_dequeue_request(req);
-+ blktap_device_forward_request(tap, req);
-+ continue;
-+ }
-+#endif
++// write_seqlock(&gnttab_dma_lock); /* protects __gnttab_dma_map_page on 2.6.18 */
+
-+ if (RING_FULL(&ring->ring)) {
-+ wait:
-+ /* Avoid pointless unplugs. */
-+ blk_stop_queue(rq);
-+ blktap_defer(tap);
-+ break;
-+ }
++ /* Make seq visible before checking page_mapped. */
++ smp_mb();
+
-+ request = blktap_request_allocate(tap);
-+ if (!request) {
-+ tap->stats.st_oo_req++;
-+ goto wait;
-+ }
++ /* Has the page been DMA-mapped? */
++ if (unlikely(page_mapped(page))) {
++ //write_sequnlock(&gnttab_dma_lock);
++ put_page(new_page);
++ err = -EBUSY;
++ goto out;
++ }
+
-+ BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) "
-+ "buffer:%p [%s], pending: %p\n", req, tap->minor,
-+ req->cmd, (unsigned long long)blk_rq_pos(req),
-+ blk_rq_cur_sectors(req),
-+ blk_rq_sectors(req), req->buffer,
-+ rq_data_dir(req) ? "write" : "read", request);
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ set_phys_to_machine(pfn, new_mfn);
+
-+ blk_start_request(req);
++ //gnttab_set_replace_op(&unmap, (unsigned long)addr,
++ // (unsigned long)new_addr, ref);
++ unmap.host_addr = (unsigned long)addr;
++ unmap.new_addr = (unsigned long)new_addr;
++ unmap.handle = ref;
++
++ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
++ &unmap, 1);
++ BUG_ON(err);
++ BUG_ON(unmap.status);
++
++// write_sequnlock(&gnttab_dma_lock);
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++ set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
++
++ mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
++ mmu.val = pfn;
++ err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
++ BUG_ON(err);
++ }
+
-+ spin_unlock_irq(&dev->lock);
-+ down_read(&tap->tap_sem);
++ new_page->mapping = page->mapping;
++ SetPageForeign(new_page, _PageForeignDestructor(page));
++ if (PageReserved(page))
++ SetPageReserved(new_page);
++ *pagep = new_page;
+
-+ err = blktap_device_process_request(tap, request, req);
-+ if (!err)
-+ queued++;
-+ else {
-+ blktap_device_end_dequeued_request(dev, req, -EIO);
-+ blktap_request_free(tap, request);
-+ }
++ SetPageForeign(page, gnttab_page_free);
++ ClearPageReserved(page);
++ page->mapping = NULL;
+
-+ up_read(&tap->tap_sem);
-+ spin_lock_irq(&dev->lock);
-+ }
++out:
++ put_page(page);
++ return err;
++}
++EXPORT_SYMBOL_GPL(gnttab_copy_grant_page);
+
-+ if (queued)
-+ blktap_ring_kick_user(tap);
++void gnttab_reset_grant_page(struct page *page)
++{
++ init_page_count(page);
++ reset_page_mapcount(page);
+}
++EXPORT_SYMBOL_GPL(gnttab_reset_grant_page);
+
-+/*
-+ * dev->lock held on entry
+ int gnttab_resume(void)
+ {
+ if (max_nr_grant_frames() < nr_grant_frames)
+diff --git a/drivers/xen/netback/Makefile b/drivers/xen/netback/Makefile
+new file mode 100644
+index 0000000..e346e81
+--- /dev/null
++++ b/drivers/xen/netback/Makefile
+@@ -0,0 +1,3 @@
++obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
++
++xen-netback-y := netback.o xenbus.o interface.o
+diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
+new file mode 100644
+index 0000000..51f97c0
+--- /dev/null
++++ b/drivers/xen/netback/common.h
+@@ -0,0 +1,227 @@
++/******************************************************************************
++ * arch/xen/drivers/netif/backend/common.h
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
+ */
-+static void
-+blktap_device_do_request(struct request_queue *rq)
-+{
-+ struct request *req;
-+ struct blktap *tap;
-+ struct blktap_device *dev;
+
-+ dev = rq->queuedata;
-+ if (!dev)
-+ goto fail;
++#ifndef __NETIF__BACKEND__COMMON_H__
++#define __NETIF__BACKEND__COMMON_H__
+
-+ tap = dev_to_blktap(dev);
-+ if (!blktap_active(tap))
-+ goto fail;
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/ip.h>
++#include <linux/in.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
+
-+ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
-+ test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
-+ blktap_defer(tap);
-+ return;
-+ }
++#include <xen/interface/io/netif.h>
++#include <asm/io.h>
++#include <asm/pgalloc.h>
++#include <xen/interface/grant_table.h>
++#include <xen/grant_table.h>
++#include <xen/xenbus.h>
+
-+ blktap_device_run_queue(tap);
-+ return;
++#define DPRINTK(_f, _a...) \
++ pr_debug("(file=%s, line=%d) " _f, \
++ __FILE__ , __LINE__ , ## _a )
++#define IPRINTK(fmt, args...) \
++ printk(KERN_INFO "xen_net: " fmt, ##args)
++#define WPRINTK(fmt, args...) \
++ printk(KERN_WARNING "xen_net: " fmt, ##args)
+
-+fail:
-+ while ((req = blk_peek_request(rq))) {
-+ BTERR("device closed: failing secs %llu - %llu\n",
-+ (unsigned long long)blk_rq_pos(req),
-+ (unsigned long long)blk_rq_pos(req) + blk_rq_sectors(req));
-+ __blk_end_request_cur(req, 0);
-+ }
-+}
++struct xen_netif {
++ /* Unique identifier for this interface. */
++ domid_t domid;
++ unsigned int handle;
+
-+void
-+blktap_device_restart(struct blktap *tap)
-+{
-+ struct blktap_device *dev;
++ u8 fe_dev_addr[6];
+
-+ dev = &tap->device;
++ /* Physical parameters of the comms window. */
++ grant_handle_t tx_shmem_handle;
++ grant_ref_t tx_shmem_ref;
++ grant_handle_t rx_shmem_handle;
++ grant_ref_t rx_shmem_ref;
++ unsigned int irq;
+
-+ if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) {
-+ blktap_defer(tap);
-+ return;
-+ }
++ /* The shared rings and indexes. */
++ struct xen_netif_tx_back_ring tx;
++ struct xen_netif_rx_back_ring rx;
++ struct vm_struct *tx_comms_area;
++ struct vm_struct *rx_comms_area;
+
-+ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
-+ test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
-+ blktap_defer(tap);
-+ return;
-+ }
++ /* Set of features that can be turned on in dev->features. */
++ int features;
+
-+ spin_lock_irq(&dev->lock);
++ int smart_poll;
+
-+ /* Re-enable calldowns. */
-+ if (dev->gd) {
-+ struct request_queue *rq = dev->gd->queue;
++ /* Internal feature information. */
++ u8 can_queue:1; /* can queue packets for receiver? */
+
-+ if (blk_queue_stopped(rq))
-+ blk_start_queue(rq);
++ /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
++ RING_IDX rx_req_cons_peek;
+
-+ /* Kick things off immediately. */
-+ blktap_device_do_request(rq);
-+ }
++ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
++ unsigned long credit_bytes;
++ unsigned long credit_usec;
++ unsigned long remaining_credit;
++ struct timer_list credit_timeout;
+
-+ spin_unlock_irq(&dev->lock);
-+}
++ /* Enforce draining of the transmit queue. */
++ struct timer_list tx_queue_timeout;
+
-+static void
-+blktap_device_configure(struct blktap *tap)
-+{
-+ struct request_queue *rq;
-+ struct blktap_device *dev = &tap->device;
++ /* Statistics */
++ int nr_copied_skbs;
+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd)
-+ return;
++ /* Miscellaneous private stuff. */
++ struct list_head list; /* scheduling list */
++ atomic_t refcnt;
++ struct net_device *dev;
++ struct net_device_stats stats;
+
-+ dev = &tap->device;
-+ rq = dev->gd->queue;
++ unsigned int carrier;
+
-+ spin_lock_irq(&dev->lock);
++ wait_queue_head_t waiting_to_free;
++};
+
-+ set_capacity(dev->gd, tap->params.capacity);
++/*
++ * Implement our own carrier flag: the network stack's version causes delays
++ * when the carrier is re-enabled (in particular, dev_activate() may not
++ * immediately be called, which can cause packet loss; also the etherbridge
++ * can be rather lazy in activating its port).
++ */
++#define netback_carrier_on(netif) ((netif)->carrier = 1)
++#define netback_carrier_off(netif) ((netif)->carrier = 0)
++#define netback_carrier_ok(netif) ((netif)->carrier)
+
-+ /* Hard sector size and max sectors impersonate the equiv. hardware. */
-+ blk_queue_logical_block_size(rq, tap->params.sector_size);
-+ blk_queue_max_sectors(rq, 512);
++enum {
++ NETBK_DONT_COPY_SKB,
++ NETBK_DELAYED_COPY_SKB,
++ NETBK_ALWAYS_COPY_SKB,
++};
+
-+ /* Each segment in a request is up to an aligned page in size. */
-+ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
-+ blk_queue_max_segment_size(rq, PAGE_SIZE);
++extern int netbk_copy_skb_mode;
+
-+ /* Ensure a merged request will fit in a single I/O ring slot. */
-+ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-+ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++/* Function pointers into netback accelerator plugin modules */
++struct netback_accel_hooks {
++ struct module *owner;
++ int (*probe)(struct xenbus_device *dev);
++ int (*remove)(struct xenbus_device *dev);
++};
++
++/* Structure to track the state of a netback accelerator plugin */
++struct netback_accelerator {
++ struct list_head link;
++ int id;
++ char *eth_name;
++ atomic_t use_count;
++ struct netback_accel_hooks *hooks;
++};
++
++struct backend_info {
++ struct xenbus_device *dev;
++ struct xen_netif *netif;
++ enum xenbus_state frontend_state;
++ struct xenbus_watch hotplug_status_watch;
++ int have_hotplug_status_watch:1;
+
-+ /* Make sure buffer addresses are sector-aligned. */
-+ blk_queue_dma_alignment(rq, 511);
++ /* State relating to the netback accelerator */
++ void *netback_accel_priv;
++ /* The accelerator that this backend is currently using */
++ struct netback_accelerator *accelerator;
++};
+
-+ spin_unlock_irq(&dev->lock);
-+}
++#define NETBACK_ACCEL_VERSION 0x00010001
+
-+int
-+blktap_device_resume(struct blktap *tap)
-+{
-+ int err;
++/*
++ * Connect an accelerator plugin module to netback. Returns zero on
++ * success, < 0 on error, > 0 (with highest version number supported)
++ * if version mismatch.
++ */
++extern int netback_connect_accelerator(unsigned version,
++ int id, const char *eth_name,
++ struct netback_accel_hooks *hooks);
++/* Disconnect a previously connected accelerator plugin module */
++extern void netback_disconnect_accelerator(int id, const char *eth_name);
+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
-+ return -ENODEV;
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return 0;
++extern
++void netback_probe_accelerators(struct backend_info *be,
++ struct xenbus_device *dev);
++extern
++void netback_remove_accelerators(struct backend_info *be,
++ struct xenbus_device *dev);
++extern
++void netif_accel_init(void);
+
-+ err = blktap_ring_resume(tap);
-+ if (err)
-+ return err;
+
-+ /* device size may have changed */
-+ blktap_device_configure(tap);
++#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
++#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
+
-+ BTDBG("restarting device\n");
-+ blktap_device_restart(tap);
++void netif_disconnect(struct xen_netif *netif);
+
-+ return 0;
-+}
++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle);
++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
++ unsigned long rx_ring_ref, unsigned int evtchn);
+
-+int
-+blktap_device_pause(struct blktap *tap)
++static inline void netif_get(struct xen_netif *netif)
+{
-+ unsigned long flags;
-+ struct blktap_device *dev = &tap->device;
++ atomic_inc(&netif->refcnt);
++}
+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
-+ return -ENODEV;
++static inline void netif_put(struct xen_netif *netif)
++{
++ if (atomic_dec_and_test(&netif->refcnt))
++ wake_up(&netif->waiting_to_free);
++}
+
-+ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return 0;
++int netif_xenbus_init(void);
+
-+ spin_lock_irqsave(&dev->lock, flags);
++#define netif_schedulable(netif) \
++ (netif_running((netif)->dev) && netback_carrier_ok(netif))
+
-+ blk_stop_queue(dev->gd->queue);
-+ set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++void netif_schedule_work(struct xen_netif *netif);
++void netif_deschedule_work(struct xen_netif *netif);
+
-+ spin_unlock_irqrestore(&dev->lock, flags);
++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
++struct net_device_stats *netif_be_get_stats(struct net_device *dev);
++irqreturn_t netif_be_int(int irq, void *dev_id);
+
-+ return blktap_ring_pause(tap);
++static inline int netbk_can_queue(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ return netif->can_queue;
+}
+
-+int
-+blktap_device_destroy(struct blktap *tap)
++static inline int netbk_can_sg(struct net_device *dev)
+{
-+ struct blktap_device *dev = &tap->device;
-+ struct gendisk *gd = dev->gd;
-+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
-+ return 0;
++ struct xen_netif *netif = netdev_priv(dev);
++ return netif->features & NETIF_F_SG;
++}
+
-+ BTINFO("destroy device %d users %d\n", tap->minor, dev->users);
++#endif /* __NETIF__BACKEND__COMMON_H__ */
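netif_get()/netif_put() above encode the lifetime rule the rest of the backend relies on: every piece of deferred work pins the interface, and netif_disconnect() (in interface.c below) waits for the count to reach zero before tearing the device down. A minimal sketch of that pattern, with hypothetical helper names, is:

/* Hypothetical helpers (not in the patch) showing the refcount contract:
 * take a reference before queueing work against a netif, drop it when the
 * work completes; the final netif_put() wakes netif_disconnect(). */
#include "common.h"

static void queue_rx_work(struct xen_netif *netif,
			  struct sk_buff_head *queue, struct sk_buff *skb)
{
	netif_get(netif);		/* pin the interface while the skb is in flight */
	skb_queue_tail(queue, skb);
}

static void rx_work_done(struct xen_netif *netif)
{
	netif_put(netif);		/* may wake a pending netif_disconnect() */
}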
+diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
+new file mode 100644
+index 0000000..b23b14d
+--- /dev/null
++++ b/drivers/xen/netback/interface.c
+@@ -0,0 +1,405 @@
++/******************************************************************************
++ * arch/xen/drivers/netif/backend/interface.c
++ *
++ * Network-device interface management.
++ *
++ * Copyright (c) 2004-2005, Keir Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
+
-+ if (dev->users)
-+ return -EBUSY;
++#include "common.h"
++#include <linux/ethtool.h>
++#include <linux/rtnetlink.h>
+
-+ spin_lock_irq(&dev->lock);
-+ /* No more blktap_device_do_request(). */
-+ blk_stop_queue(gd->queue);
-+ clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
-+ dev->gd = NULL;
-+ spin_unlock_irq(&dev->lock);
++#include <xen/events.h>
++#include <asm/xen/hypercall.h>
+
-+#ifdef ENABLE_PASSTHROUGH
-+ if (dev->bdev)
-+ blktap_device_close_bdev(tap);
-+#endif
++/*
++ * Module parameter 'queue_length':
++ *
++ * Enables queuing in the network stack when a client has run out of receive
++ * descriptors. Although this feature can improve receive bandwidth by avoiding
++ * packet loss, it can also result in packets sitting in the 'tx_queue' for
++ * unbounded time. This is bad if those packets hold onto foreign resources.
++ * For example, consider a packet that holds onto resources belonging to the
++ * guest for which it is queued (e.g., packet received on vif1.0, destined for
++ * vif1.1 which is not activated in the guest): in this situation the guest
++ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
++ * run a timer (tx_queue_timeout) to drain the queue when the interface is
++ * blocked.
++ */
++static unsigned long netbk_queue_length = 32;
++module_param_named(queue_length, netbk_queue_length, ulong, 0644);
+
-+ del_gendisk(gd);
-+ blk_cleanup_queue(gd->queue);
-+ put_disk(gd);
++static void __netif_up(struct xen_netif *netif)
++{
++ enable_irq(netif->irq);
++ netif_schedule_work(netif);
++}
+
-+ wake_up(&tap->wq);
++static void __netif_down(struct xen_netif *netif)
++{
++ disable_irq(netif->irq);
++ netif_deschedule_work(netif);
++}
+
++static int net_open(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ if (netback_carrier_ok(netif)) {
++ __netif_up(netif);
++ netif_start_queue(dev);
++ }
+ return 0;
+}
+
-+int
-+blktap_device_create(struct blktap *tap)
++static int net_close(struct net_device *dev)
+{
-+ int minor, err;
-+ struct gendisk *gd;
-+ struct request_queue *rq;
-+ struct blktap_device *dev;
-+
-+ gd = NULL;
-+ rq = NULL;
-+ dev = &tap->device;
-+ minor = tap->minor;
++ struct xen_netif *netif = netdev_priv(dev);
++ if (netback_carrier_ok(netif))
++ __netif_down(netif);
++ netif_stop_queue(dev);
++ return 0;
++}
+
-+ if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
-+ return -EEXIST;
++static int netbk_change_mtu(struct net_device *dev, int mtu)
++{
++ int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
-+ if (blktap_validate_params(tap, &tap->params))
++ if (mtu > max)
+ return -EINVAL;
++ dev->mtu = mtu;
++ return 0;
++}
+
-+ BTINFO("minor %d sectors %Lu sector-size %lu\n",
-+ minor, tap->params.capacity, tap->params.sector_size);
++static int netbk_set_sg(struct net_device *dev, u32 data)
++{
++ if (data) {
++ struct xen_netif *netif = netdev_priv(dev);
+
-+ err = -ENODEV;
++ if (!(netif->features & NETIF_F_SG))
++ return -ENOSYS;
++ }
+
-+ gd = alloc_disk(1);
-+ if (!gd)
-+ goto error;
++ if (dev->mtu > ETH_DATA_LEN)
++ dev->mtu = ETH_DATA_LEN;
+
-+ if (minor < 26)
-+ sprintf(gd->disk_name, "tapdev%c", 'a' + minor);
-+ else
-+ sprintf(gd->disk_name, "tapdev%c%c",
-+ 'a' + ((minor / 26) - 1), 'a' + (minor % 26));
++ return ethtool_op_set_sg(dev, data);
++}
+
-+ gd->major = blktap_device_major;
-+ gd->first_minor = minor;
-+ gd->fops = &blktap_device_file_operations;
-+ gd->private_data = dev;
++static int netbk_set_tso(struct net_device *dev, u32 data)
++{
++ if (data) {
++ struct xen_netif *netif = netdev_priv(dev);
+
-+ spin_lock_init(&dev->lock);
-+ rq = blk_init_queue(blktap_device_do_request, &dev->lock);
-+ if (!rq)
-+ goto error;
++ if (!(netif->features & NETIF_F_TSO))
++ return -ENOSYS;
++ }
++
++ return ethtool_op_set_tso(dev, data);
++}
+
-+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
-+ elevator_init(rq, "noop");
-+#else
-+ elevator_init(rq, &elevator_noop);
-+#endif
++static void netbk_get_drvinfo(struct net_device *dev,
++ struct ethtool_drvinfo *info)
++{
++ strcpy(info->driver, "netbk");
++ strcpy(info->bus_info, dev_name(dev->dev.parent));
++}
+
-+ gd->queue = rq;
-+ rq->queuedata = dev;
-+ dev->gd = gd;
++static const struct netif_stat {
++ char name[ETH_GSTRING_LEN];
++ u16 offset;
++} netbk_stats[] = {
++ { "copied_skbs", offsetof(struct xen_netif, nr_copied_skbs) },
++};
+
-+ set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
-+ blktap_device_configure(tap);
++static int netbk_get_stats_count(struct net_device *dev)
++{
++ return ARRAY_SIZE(netbk_stats);
++}
+
-+ add_disk(gd);
++static void netbk_get_ethtool_stats(struct net_device *dev,
++ struct ethtool_stats *stats, u64 * data)
++{
++ void *netif = netdev_priv(dev);
++ int i;
+
-+ err = 0;
-+ goto out;
++ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
++ data[i] = *(int *)(netif + netbk_stats[i].offset);
++}
+
-+ error:
-+ if (gd)
-+ del_gendisk(gd);
-+ if (rq)
-+ blk_cleanup_queue(rq);
++static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data)
++{
++ int i;
+
-+ out:
-+ BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
-+ return err;
++ switch (stringset) {
++ case ETH_SS_STATS:
++ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
++ memcpy(data + i * ETH_GSTRING_LEN,
++ netbk_stats[i].name, ETH_GSTRING_LEN);
++ break;
++ }
+}
+
-+int __init
-+blktap_device_init(int *maj)
++static struct ethtool_ops network_ethtool_ops =
+{
-+ int major;
++ .get_drvinfo = netbk_get_drvinfo,
+
-+ /* Dynamically allocate a major for this device */
-+ major = register_blkdev(0, "tapdev");
-+ if (major < 0) {
-+ BTERR("Couldn't register blktap device\n");
-+ return -ENOMEM;
-+ }
++ .get_tx_csum = ethtool_op_get_tx_csum,
++ .set_tx_csum = ethtool_op_set_tx_csum,
++ .get_sg = ethtool_op_get_sg,
++ .set_sg = netbk_set_sg,
++ .get_tso = ethtool_op_get_tso,
++ .set_tso = netbk_set_tso,
++ .get_link = ethtool_op_get_link,
+
-+ blktap_device_major = *maj = major;
-+ BTINFO("blktap device major %d\n", major);
++ .get_stats_count = netbk_get_stats_count,
++ .get_ethtool_stats = netbk_get_ethtool_stats,
++ .get_strings = netbk_get_strings,
++};
+
-+ return 0;
-+}
++static struct net_device_ops netback_ops =
++{
++ .ndo_start_xmit = netif_be_start_xmit,
++ .ndo_get_stats = netif_be_get_stats,
++ .ndo_open = net_open,
++ .ndo_stop = net_close,
++ .ndo_change_mtu = netbk_change_mtu,
++};
+
-+void
-+blktap_device_free(void)
++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
+{
-+ if (blktap_device_major)
-+ unregister_blkdev(blktap_device_major, "tapdev");
-+}
-diff --git a/drivers/xen/blktap/request.c b/drivers/xen/blktap/request.c
-new file mode 100644
-index 0000000..770736a
---- /dev/null
-+++ b/drivers/xen/blktap/request.c
-@@ -0,0 +1,297 @@
-+#include <linux/spinlock.h>
-+#include <xen/balloon.h>
-+#include <linux/sched.h>
++ int err = 0;
++ struct net_device *dev;
++ struct xen_netif *netif;
++ char name[IFNAMSIZ] = {};
+
-+#include "blktap.h"
++ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
++ dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
++ if (dev == NULL) {
++ DPRINTK("Could not create netif: out of memory\n");
++ return ERR_PTR(-ENOMEM);
++ }
+
-+#define MAX_BUCKETS 8
-+#define BUCKET_SIZE MAX_PENDING_REQS
++ SET_NETDEV_DEV(dev, parent);
+
-+#define BLKTAP_POOL_CLOSING 1
++ netif = netdev_priv(dev);
++ memset(netif, 0, sizeof(*netif));
++ netif->domid = domid;
++ netif->handle = handle;
++ netif->features = NETIF_F_SG;
++ atomic_set(&netif->refcnt, 1);
++ init_waitqueue_head(&netif->waiting_to_free);
++ netif->dev = dev;
++ INIT_LIST_HEAD(&netif->list);
+
-+struct blktap_request_bucket;
++ netback_carrier_off(netif);
+
-+struct blktap_request_handle {
-+ int slot;
-+ uint8_t inuse;
-+ struct blktap_request request;
-+ struct blktap_request_bucket *bucket;
-+};
++ netif->credit_bytes = netif->remaining_credit = ~0UL;
++ netif->credit_usec = 0UL;
++ init_timer(&netif->credit_timeout);
++ /* Initialize 'expires' now: it's used to track the credit window. */
++ netif->credit_timeout.expires = jiffies;
+
-+struct blktap_request_bucket {
-+ atomic_t reqs_in_use;
-+ struct blktap_request_handle handles[BUCKET_SIZE];
-+ struct page **foreign_pages;
-+};
++ init_timer(&netif->tx_queue_timeout);
+
-+struct blktap_request_pool {
-+ spinlock_t lock;
-+ uint8_t status;
-+ struct list_head free_list;
-+ atomic_t reqs_in_use;
-+ wait_queue_head_t wait_queue;
-+ struct blktap_request_bucket *buckets[MAX_BUCKETS];
-+};
++ dev->netdev_ops = &netback_ops;
++ dev->features = NETIF_F_IP_CSUM|NETIF_F_SG;
+
-+static struct blktap_request_pool pool;
++ SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
+
-+static inline struct blktap_request_handle *
-+blktap_request_to_handle(struct blktap_request *req)
-+{
-+ return container_of(req, struct blktap_request_handle, request);
-+}
++ dev->tx_queue_len = netbk_queue_length;
+
-+static void
-+blktap_request_pool_init_request(struct blktap_request *request)
-+{
-+ int i;
++ /*
++ * Initialise a dummy MAC address. We choose the numerically
++ * largest non-broadcast address to prevent the address getting
++ * stolen by an Ethernet bridge for STP purposes.
++ * (FE:FF:FF:FF:FF:FF)
++ */
++ memset(dev->dev_addr, 0xFF, ETH_ALEN);
++ dev->dev_addr[0] &= ~0x01;
+
-+ request->usr_idx = -1;
-+ request->nr_pages = 0;
-+ request->status = BLKTAP_REQUEST_FREE;
-+ INIT_LIST_HEAD(&request->free_list);
-+ for (i = 0; i < ARRAY_SIZE(request->handles); i++) {
-+ request->handles[i].user = INVALID_GRANT_HANDLE;
-+ request->handles[i].kernel = INVALID_GRANT_HANDLE;
++ rtnl_lock();
++ err = register_netdevice(dev);
++ rtnl_unlock();
++ if (err) {
++ DPRINTK("Could not register new net device %s: err=%d\n",
++ dev->name, err);
++ free_netdev(dev);
++ return ERR_PTR(err);
+ }
++
++ DPRINTK("Successfully created netif\n");
++ return netif;
+}
+
-+static int
-+blktap_request_pool_allocate_bucket(void)
++static int map_frontend_pages(
++ struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
+{
-+ int i, idx;
-+ unsigned long flags;
-+ struct blktap_request *request;
-+ struct blktap_request_handle *handle;
-+ struct blktap_request_bucket *bucket;
-+
-+ bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL);
-+ if (!bucket)
-+ goto fail;
++ struct gnttab_map_grant_ref op;
+
-+ bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES);
-+ if (!bucket->foreign_pages)
-+ goto fail;
++ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
++ GNTMAP_host_map, tx_ring_ref, netif->domid);
+
-+ spin_lock_irqsave(&pool.lock, flags);
++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
++ BUG();
+
-+ idx = -1;
-+ for (i = 0; i < MAX_BUCKETS; i++) {
-+ if (!pool.buckets[i]) {
-+ idx = i;
-+ pool.buckets[idx] = bucket;
-+ break;
-+ }
++ if (op.status) {
++ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
++ return op.status;
+ }
+
-+ if (idx == -1) {
-+ spin_unlock_irqrestore(&pool.lock, flags);
-+ goto fail;
-+ }
++ netif->tx_shmem_ref = tx_ring_ref;
++ netif->tx_shmem_handle = op.handle;
+
-+ for (i = 0; i < BUCKET_SIZE; i++) {
-+ handle = bucket->handles + i;
-+ request = &handle->request;
++ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
++ GNTMAP_host_map, rx_ring_ref, netif->domid);
+
-+ handle->slot = i;
-+ handle->inuse = 0;
-+ handle->bucket = bucket;
++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
++ BUG();
+
-+ blktap_request_pool_init_request(request);
-+ list_add_tail(&request->free_list, &pool.free_list);
++ if (op.status) {
++ struct gnttab_unmap_grant_ref unop;
++
++ gnttab_set_unmap_op(&unop,
++ (unsigned long)netif->tx_comms_area->addr,
++ GNTMAP_host_map, netif->tx_shmem_handle);
++ HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1);
++ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
++ return op.status;
+ }
+
-+ spin_unlock_irqrestore(&pool.lock, flags);
++ netif->rx_shmem_ref = rx_ring_ref;
++ netif->rx_shmem_handle = op.handle;
+
+ return 0;
-+
-+fail:
-+ if (bucket && bucket->foreign_pages)
-+ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
-+ kfree(bucket);
-+ return -ENOMEM;
+}
+
-+static void
-+blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket)
++static void unmap_frontend_pages(struct xen_netif *netif)
+{
-+ if (!bucket)
-+ return;
++ struct gnttab_unmap_grant_ref op;
+
-+ BTDBG("freeing bucket %p\n", bucket);
++ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
++ GNTMAP_host_map, netif->tx_shmem_handle);
+
-+ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
-+ kfree(bucket);
-+}
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
++ BUG();
+
-+struct page *
-+request_to_page(struct blktap_request *req, int seg)
-+{
-+ struct blktap_request_handle *handle = blktap_request_to_handle(req);
-+ int idx = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
-+ return handle->bucket->foreign_pages[idx];
++ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
++ GNTMAP_host_map, netif->rx_shmem_handle);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
++ BUG();
+}
+
-+int
-+blktap_request_pool_shrink(void)
++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
++ unsigned long rx_ring_ref, unsigned int evtchn)
+{
-+ int i, err;
-+ unsigned long flags;
-+ struct blktap_request_bucket *bucket;
++ int err = -ENOMEM;
++ struct xen_netif_tx_sring *txs;
++ struct xen_netif_rx_sring *rxs;
+
-+ err = -EAGAIN;
++ /* Already connected through? */
++ if (netif->irq)
++ return 0;
+
-+ spin_lock_irqsave(&pool.lock, flags);
++ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
++ if (netif->tx_comms_area == NULL)
++ return -ENOMEM;
++ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
++ if (netif->rx_comms_area == NULL)
++ goto err_rx;
+
-+ /* always keep at least one bucket */
-+ for (i = 1; i < MAX_BUCKETS; i++) {
-+ bucket = pool.buckets[i];
-+ if (!bucket)
-+ continue;
++ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
++ if (err)
++ goto err_map;
+
-+ if (atomic_read(&bucket->reqs_in_use))
-+ continue;
++ err = bind_interdomain_evtchn_to_irqhandler(
++ netif->domid, evtchn, netif_be_int, 0,
++ netif->dev->name, netif);
++ if (err < 0)
++ goto err_hypervisor;
++ netif->irq = err;
++ disable_irq(netif->irq);
+
-+ blktap_request_pool_free_bucket(bucket);
-+ pool.buckets[i] = NULL;
-+ err = 0;
-+ break;
-+ }
++ txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
++ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
+
-+ spin_unlock_irqrestore(&pool.lock, flags);
++ rxs = (struct xen_netif_rx_sring *)
++ ((char *)netif->rx_comms_area->addr);
++ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
+
-+ return err;
-+}
++ netif->rx_req_cons_peek = 0;
+
-+int
-+blktap_request_pool_grow(void)
-+{
-+ return blktap_request_pool_allocate_bucket();
++ netif_get(netif);
++
++ rtnl_lock();
++ netback_carrier_on(netif);
++ if (netif_running(netif->dev))
++ __netif_up(netif);
++ rtnl_unlock();
++
++ return 0;
++err_hypervisor:
++ unmap_frontend_pages(netif);
++err_map:
++ free_vm_area(netif->rx_comms_area);
++err_rx:
++ free_vm_area(netif->tx_comms_area);
++ return err;
+}
+
-+struct blktap_request *
-+blktap_request_allocate(struct blktap *tap)
++void netif_disconnect(struct xen_netif *netif)
+{
-+ int i;
-+ uint16_t usr_idx;
-+ unsigned long flags;
-+ struct blktap_request *request;
-+
-+ usr_idx = -1;
-+ request = NULL;
++ if (netback_carrier_ok(netif)) {
++ rtnl_lock();
++ netback_carrier_off(netif);
++ netif_carrier_off(netif->dev); /* discard queued packets */
++ if (netif_running(netif->dev))
++ __netif_down(netif);
++ rtnl_unlock();
++ netif_put(netif);
++ }
+
-+ spin_lock_irqsave(&pool.lock, flags);
++ atomic_dec(&netif->refcnt);
++ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
+
-+ if (pool.status == BLKTAP_POOL_CLOSING)
-+ goto out;
++ del_timer_sync(&netif->credit_timeout);
++ del_timer_sync(&netif->tx_queue_timeout);
+
-+ for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++)
-+ if (!tap->pending_requests[i]) {
-+ usr_idx = i;
-+ break;
-+ }
++ if (netif->irq)
++ unbind_from_irqhandler(netif->irq, netif);
+
-+ if (usr_idx == (uint16_t)-1)
-+ goto out;
++ unregister_netdev(netif->dev);
+
-+ if (!list_empty(&pool.free_list)) {
-+ request = list_entry(pool.free_list.next,
-+ struct blktap_request, free_list);
-+ list_del(&request->free_list);
++ if (netif->tx.sring) {
++ unmap_frontend_pages(netif);
++ free_vm_area(netif->tx_comms_area);
++ free_vm_area(netif->rx_comms_area);
+ }
+
-+ if (request) {
-+ struct blktap_request_handle *handle;
-+
-+ atomic_inc(&pool.reqs_in_use);
-+
-+ handle = blktap_request_to_handle(request);
-+ atomic_inc(&handle->bucket->reqs_in_use);
-+ handle->inuse = 1;
++ free_netdev(netif->dev);
++}
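netif_map() above is driven from the xenbus side of the backend (xenbus.c, added by this patch but not shown in this hunk), which reads the ring references and event channel that the frontend publishes. A rough sketch of such a call site, assuming the conventional netfront/netback xenstore key names, might look like:

/* Sketch only: connect the backend using the frontend's published ring
 * details.  'be' is the backend_info from common.h; key names follow the
 * standard netfront/netback xenstore protocol. */
#include "common.h"

static int connect_rings(struct backend_info *be)
{
	struct xenbus_device *dev = be->dev;
	unsigned long tx_ring_ref, rx_ring_ref;
	unsigned int evtchn;
	int err;

	err = xenbus_gather(XBT_NIL, dev->otherend,
			    "tx-ring-ref", "%lu", &tx_ring_ref,
			    "rx-ring-ref", "%lu", &rx_ring_ref,
			    "event-channel", "%u", &evtchn, NULL);
	if (err) {
		xenbus_dev_fatal(dev, err, "reading %s ring details",
				 dev->otherend);
		return err;
	}

	return netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
}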
+diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
+new file mode 100644
+index 0000000..0bc6398
+--- /dev/null
++++ b/drivers/xen/netback/netback.c
+@@ -0,0 +1,1613 @@
++/******************************************************************************
++ * drivers/xen/netback/netback.c
++ *
++ * Back-end of the driver for virtual network devices. This portion of the
++ * driver exports a 'unified' network-device interface that can be accessed
++ * by any operating system that implements a compatible front end. A
++ * reference front-end implementation can be found in:
++ * drivers/xen/netfront/netfront.c
++ *
++ * Copyright (c) 2002-2005, K A Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
+
-+ request->usr_idx = usr_idx;
++#include "common.h"
+
-+ tap->pending_requests[usr_idx] = request;
-+ tap->pending_cnt++;
-+ }
++#include <linux/tcp.h>
++#include <linux/udp.h>
+
-+out:
-+ spin_unlock_irqrestore(&pool.lock, flags);
-+ return request;
-+}
++#include <xen/balloon.h>
++#include <xen/events.h>
++#include <xen/interface/memory.h>
+
-+void
-+blktap_request_free(struct blktap *tap, struct blktap_request *request)
-+{
-+ int free;
-+ unsigned long flags;
-+ struct blktap_request_handle *handle;
++#include <asm/xen/hypercall.h>
++#include <asm/xen/page.h>
+
-+ BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests));
-+ handle = blktap_request_to_handle(request);
++/*define NETBE_DEBUG_INTERRUPT*/
+
-+ spin_lock_irqsave(&pool.lock, flags);
++struct netbk_rx_meta {
++ skb_frag_t frag;
++ int id;
++};
+
-+ handle->inuse = 0;
-+ tap->pending_requests[request->usr_idx] = NULL;
-+ blktap_request_pool_init_request(request);
-+ list_add(&request->free_list, &pool.free_list);
-+ atomic_dec(&handle->bucket->reqs_in_use);
-+ free = atomic_dec_and_test(&pool.reqs_in_use);
++struct netbk_tx_pending_inuse {
++ struct list_head list;
++ unsigned long alloc_time;
++};
+
-+ spin_unlock_irqrestore(&pool.lock, flags);
+
-+ if (--tap->pending_cnt == 0)
-+ wake_up_interruptible(&tap->wq);
++static void netif_idx_release(u16 pending_idx);
++static void make_tx_response(struct xen_netif *netif,
++ struct xen_netif_tx_request *txp,
++ s8 st);
++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
++ u16 id,
++ s8 st,
++ u16 offset,
++ u16 size,
++ u16 flags);
+
-+ if (free)
-+ wake_up(&pool.wait_queue);
-+}
++static void net_tx_action(unsigned long unused);
++static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
+
-+void
-+blktap_request_pool_free(void)
-+{
-+ int i;
-+ unsigned long flags;
++static void net_rx_action(unsigned long unused);
++static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
+
-+ spin_lock_irqsave(&pool.lock, flags);
++static struct timer_list net_timer;
++static struct timer_list netbk_tx_pending_timer;
+
-+ pool.status = BLKTAP_POOL_CLOSING;
-+ while (atomic_read(&pool.reqs_in_use)) {
-+ spin_unlock_irqrestore(&pool.lock, flags);
-+ wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use));
-+ spin_lock_irqsave(&pool.lock, flags);
-+ }
++#define MAX_PENDING_REQS 256
+
-+ for (i = 0; i < MAX_BUCKETS; i++) {
-+ blktap_request_pool_free_bucket(pool.buckets[i]);
-+ pool.buckets[i] = NULL;
-+ }
++static struct sk_buff_head rx_queue;
+
-+ spin_unlock_irqrestore(&pool.lock, flags);
++static struct page **mmap_pages;
++static inline unsigned long idx_to_pfn(unsigned int idx)
++{
++ return page_to_pfn(mmap_pages[idx]);
+}
+
-+int __init
-+blktap_request_pool_init(void)
++static inline unsigned long idx_to_kaddr(unsigned int idx)
+{
-+ int i, err;
++ return (unsigned long)pfn_to_kaddr(idx_to_pfn(idx));
++}
+
-+ memset(&pool, 0, sizeof(pool));
++/* extra field used in struct page */
++static inline void netif_set_page_index(struct page *pg, unsigned int index)
++{
++ *(unsigned long *)&pg->mapping = index + 1;
++}
+
-+ spin_lock_init(&pool.lock);
-+ INIT_LIST_HEAD(&pool.free_list);
-+ atomic_set(&pool.reqs_in_use, 0);
-+ init_waitqueue_head(&pool.wait_queue);
++static inline int netif_page_index(struct page *pg)
++{
++ unsigned long idx = (unsigned long)pg->mapping - 1;
+
-+ for (i = 0; i < 2; i++) {
-+ err = blktap_request_pool_allocate_bucket();
-+ if (err)
-+ goto fail;
-+ }
++ if (!PageForeign(pg))
++ return -1;
+
-+ return 0;
++ if ((idx >= MAX_PENDING_REQS) || (mmap_pages[idx] != pg))
++ return -1;
+
-+fail:
-+ blktap_request_pool_free();
-+ return err;
++ return idx;
+}
-diff --git a/drivers/xen/blktap/ring.c b/drivers/xen/blktap/ring.c
-new file mode 100644
-index 0000000..74a7aa7
---- /dev/null
-+++ b/drivers/xen/blktap/ring.c
-@@ -0,0 +1,615 @@
-+#include <linux/module.h>
-+#include <linux/signal.h>
-+#include <linux/sched.h>
-+#include <linux/poll.h>
+
-+#include <asm/xen/page.h>
-+#include <asm/xen/hypercall.h>
-+
-+#include "blktap.h"
-+
-+#ifdef CONFIG_XEN_BLKDEV_BACKEND
-+#include "../blkback/blkback-pagemap.h"
-+#else
-+#define blkback_pagemap_contains_page(page) 0
-+#endif
++/*
++ * This is the amount of packet we copy rather than map, so that the
++ * guest can't fiddle with the contents of the headers while we do
++ * packet processing on them (netfilter, routing, etc). 72 is enough
++ * to cover TCP+IP headers including options.
++ */
++#define PKT_PROT_LEN 72
+
-+static int blktap_ring_major;
++static struct pending_tx_info {
++ struct xen_netif_tx_request req;
++ struct xen_netif *netif;
++} pending_tx_info[MAX_PENDING_REQS];
++static u16 pending_ring[MAX_PENDING_REQS];
++typedef unsigned int pending_ring_idx_t;
+
-+static inline struct blktap *
-+vma_to_blktap(struct vm_area_struct *vma)
++static inline pending_ring_idx_t pending_index(unsigned i)
+{
-+ struct vm_foreign_map *m = vma->vm_private_data;
-+ struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map);
-+ return container_of(r, struct blktap, ring);
++ return i & (MAX_PENDING_REQS-1);
+}
+
-+ /*
-+ * BLKTAP - immediately before the mmap area,
-+ * we have a bunch of pages reserved for shared memory rings.
-+ */
-+#define RING_PAGES 1
++static pending_ring_idx_t pending_prod, pending_cons;
+
-+static int
-+blktap_read_ring(struct blktap *tap)
++static inline pending_ring_idx_t nr_pending_reqs(void)
+{
-+ /* This is called to read responses from the ring. */
-+ int usr_idx;
-+ RING_IDX rc, rp;
-+ struct blkif_response res;
-+ struct blktap_ring *ring;
-+ struct blktap_request *request;
++ return MAX_PENDING_REQS - pending_prod + pending_cons;
++}
+
-+ down_read(&tap->tap_sem);
++/* Freed TX SKBs get batched on this ring before return to pending_ring. */
++static u16 dealloc_ring[MAX_PENDING_REQS];
++static pending_ring_idx_t dealloc_prod, dealloc_cons;
+
-+ ring = &tap->ring;
-+ if (!ring->vma) {
-+ up_read(&tap->tap_sem);
-+ return 0;
-+ }
++/* Doubly-linked list of in-use pending entries. */
++static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
++static LIST_HEAD(pending_inuse_head);
+
-+ /* for each outstanding message on the ring */
-+ rp = ring->ring.sring->rsp_prod;
-+ rmb();
++static struct sk_buff_head tx_queue;
+
-+ for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
-+ memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
-+ mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
-+ ++ring->ring.rsp_cons;
++static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
++static struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
++static struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+
-+ usr_idx = (int)res.id;
-+ if (usr_idx >= MAX_PENDING_REQS ||
-+ !tap->pending_requests[usr_idx]) {
-+ BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n",
-+ rc, rp, usr_idx, tap->pid, ring->vma);
-+ continue;
-+ }
++static LIST_HEAD(net_schedule_list);
++static DEFINE_SPINLOCK(net_schedule_list_lock);
+
-+ request = tap->pending_requests[usr_idx];
-+ BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
-+ blktap_device_finish_request(tap, &res, request);
-+ }
++#define MAX_MFN_ALLOC 64
++static unsigned long mfn_list[MAX_MFN_ALLOC];
++static unsigned int alloc_index = 0;
+
-+ up_read(&tap->tap_sem);
++/* Setting this allows the safe use of this driver without netloop. */
++static int MODPARM_copy_skb = 1;
++module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
++MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
+
-+ blktap_run_deferred();
++int netbk_copy_skb_mode;
+
-+ return 0;
++static inline unsigned long alloc_mfn(void)
++{
++ BUG_ON(alloc_index == 0);
++ return mfn_list[--alloc_index];
+}
+
-+static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++static inline void maybe_schedule_tx_action(void)
+{
-+ return VM_FAULT_SIGBUS;
++ smp_mb();
++ if ((nr_pending_reqs() < (MAX_PENDING_REQS/2)) &&
++ !list_empty(&net_schedule_list))
++ tasklet_schedule(&net_tx_tasklet);
+}
+
-+static pte_t
-+blktap_ring_clear_pte(struct vm_area_struct *vma,
-+ unsigned long uvaddr,
-+ pte_t *ptep, int is_fullmm)
++static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
+{
-+ pte_t copy;
-+ struct blktap *tap;
-+ unsigned long kvaddr;
-+ struct page **map, *page;
-+ struct blktap_ring *ring;
-+ struct blktap_request *request;
-+ struct grant_handle_pair *khandle;
-+ struct gnttab_unmap_grant_ref unmap[2];
-+ int offset, seg, usr_idx, count = 0;
++ struct skb_shared_info *ninfo;
++ struct sk_buff *nskb;
++ unsigned long offset;
++ int ret;
++ int len;
++ int headlen;
+
-+ tap = vma_to_blktap(vma);
-+ ring = &tap->ring;
-+ map = ring->foreign_map.map;
-+ BUG_ON(!map); /* TODO Should this be changed to if statement? */
++ BUG_ON(skb_shinfo(skb)->frag_list != NULL);
+
-+ /*
-+ * Zap entry if the address is before the start of the grant
-+ * mapped region.
-+ */
-+ if (uvaddr < ring->user_vstart)
-+ return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
-+ ptep, is_fullmm);
++ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
++ if (unlikely(!nskb))
++ goto err;
+
-+ offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
-+ usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
-+ seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
++ skb_reserve(nskb, NET_SKB_PAD + NET_IP_ALIGN);
++ headlen = skb_end_pointer(nskb) - nskb->data;
++ if (headlen > skb_headlen(skb))
++ headlen = skb_headlen(skb);
++ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
++ BUG_ON(ret);
+
-+ offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
-+ page = map[offset];
-+ if (page) {
-+ ClearPageReserved(page);
-+ if (blkback_pagemap_contains_page(page))
-+ set_page_private(page, 0);
-+ }
-+ map[offset] = NULL;
++ ninfo = skb_shinfo(nskb);
++ ninfo->gso_size = skb_shinfo(skb)->gso_size;
++ ninfo->gso_type = skb_shinfo(skb)->gso_type;
++
++ offset = headlen;
++ len = skb->len - headlen;
++
++ nskb->len = skb->len;
++ nskb->data_len = len;
++ nskb->truesize += len;
++
++ while (len) {
++ struct page *page;
++ int copy;
++ int zero;
++
++ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
++ dump_stack();
++ goto err_free;
++ }
++
++ copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
++ zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
++
++ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
++ if (unlikely(!page))
++ goto err_free;
+
-+ request = tap->pending_requests[usr_idx];
-+ kvaddr = request_to_kaddr(request, seg);
-+ khandle = request->handles + seg;
++ ret = skb_copy_bits(skb, offset, page_address(page), copy);
++ BUG_ON(ret);
+
-+ if (khandle->kernel != INVALID_GRANT_HANDLE) {
-+ gnttab_set_unmap_op(&unmap[count], kvaddr,
-+ GNTMAP_host_map, khandle->kernel);
-+ count++;
++ ninfo->frags[ninfo->nr_frags].page = page;
++ ninfo->frags[ninfo->nr_frags].page_offset = 0;
++ ninfo->frags[ninfo->nr_frags].size = copy;
++ ninfo->nr_frags++;
+
-+ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
-+ INVALID_P2M_ENTRY);
++ offset += copy;
++ len -= copy;
+ }
+
++ offset = nskb->data - skb->data;
+
-+ if (khandle->user != INVALID_GRANT_HANDLE) {
-+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-+
-+ copy = *ptep;
-+ gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep).maddr,
-+ GNTMAP_host_map
-+ | GNTMAP_application_map
-+ | GNTMAP_contains_pte,
-+ khandle->user);
-+ count++;
-+ } else
-+ copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
-+ is_fullmm);
++ nskb->transport_header = skb->transport_header + offset;
++ nskb->network_header = skb->network_header + offset;
++ nskb->mac_header = skb->mac_header + offset;
+
-+ if (count)
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
-+ unmap, count))
-+ BUG();
++ return nskb;
+
-+ khandle->kernel = INVALID_GRANT_HANDLE;
-+ khandle->user = INVALID_GRANT_HANDLE;
++ err_free:
++ kfree_skb(nskb);
++ err:
++ return NULL;
++}
+
-+ return copy;
++static inline int netbk_max_required_rx_slots(struct xen_netif *netif)
++{
++ if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
++ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
++ return 1; /* all in one */
+}
+
-+static void
-+blktap_ring_vm_unmap(struct vm_area_struct *vma)
++static inline int netbk_queue_full(struct xen_netif *netif)
+{
-+ struct blktap *tap = vma_to_blktap(vma);
++ RING_IDX peek = netif->rx_req_cons_peek;
++ RING_IDX needed = netbk_max_required_rx_slots(netif);
+
-+ down_write(&tap->tap_sem);
-+ clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
-+ clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
-+ clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
-+ up_write(&tap->tap_sem);
++ return ((netif->rx.sring->req_prod - peek) < needed) ||
++ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
+}
+
-+static void
-+blktap_ring_vm_close(struct vm_area_struct *vma)
++static void tx_queue_callback(unsigned long data)
+{
-+ struct blktap *tap = vma_to_blktap(vma);
-+ struct blktap_ring *ring = &tap->ring;
++ struct xen_netif *netif = (struct xen_netif *)data;
++ if (netif_schedulable(netif))
++ netif_wake_queue(netif->dev);
++}
+
-+ blktap_ring_vm_unmap(vma); /* fail future requests */
-+ blktap_device_fail_pending_requests(tap); /* fail pending requests */
-+ blktap_device_restart(tap); /* fail deferred requests */
++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
+
-+ down_write(&tap->tap_sem);
++ BUG_ON(skb->dev != dev);
+
-+ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
++ /* Drop the packet if the target domain has no receive buffers. */
++ if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
++ goto drop;
+
-+ kfree(ring->foreign_map.map);
-+ ring->foreign_map.map = NULL;
++ /*
++ * XXX For now we also copy skbuffs whose head crosses a page
++ * boundary, because netbk_gop_skb can't handle them.
++ */
++ if ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE) {
++ struct sk_buff *nskb = netbk_copy_skb(skb);
++		if (unlikely(nskb == NULL))
++ goto drop;
++ /* Copy only the header fields we use in this driver. */
++ nskb->dev = skb->dev;
++ nskb->ip_summed = skb->ip_summed;
++ dev_kfree_skb(skb);
++ skb = nskb;
++ }
+
-+ /* Free the ring page. */
-+ ClearPageReserved(virt_to_page(ring->ring.sring));
-+ free_page((unsigned long)ring->ring.sring);
++ netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
++ !!skb_shinfo(skb)->gso_size;
++ netif_get(netif);
+
-+ BTINFO("unmapping ring %d\n", tap->minor);
-+ ring->ring.sring = NULL;
-+ ring->vma = NULL;
++ if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
++ netif->rx.sring->req_event = netif->rx_req_cons_peek +
++ netbk_max_required_rx_slots(netif);
++ mb(); /* request notification /then/ check & stop the queue */
++ if (netbk_queue_full(netif)) {
++ netif_stop_queue(dev);
++ /*
++ * Schedule 500ms timeout to restart the queue, thus
++ * ensuring that an inactive queue will be drained.
++			 * Packets will immediately be dropped until more
++ * receive buffers become available (see
++ * netbk_queue_full() check above).
++ */
++ netif->tx_queue_timeout.data = (unsigned long)netif;
++ netif->tx_queue_timeout.function = tx_queue_callback;
++ mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
++ }
++ }
+
-+ up_write(&tap->tap_sem);
++ skb_queue_tail(&rx_queue, skb);
++ tasklet_schedule(&net_rx_tasklet);
+
-+ wake_up(&tap->wq);
++ return 0;
++
++ drop:
++ netif->stats.tx_dropped++;
++ dev_kfree_skb(skb);
++ return 0;
+}
+
-+static struct vm_operations_struct blktap_ring_vm_operations = {
-+ .close = blktap_ring_vm_close,
-+ .unmap = blktap_ring_vm_unmap,
-+ .fault = blktap_ring_fault,
-+ .zap_pte = blktap_ring_clear_pte,
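++/*
++ * Bookkeeping for one batch of guest-rx work: producer/consumer indices
++ * into the arrays of mmu updates, grant transfer/copy operations,
++ * multicall entries and per-fragment metadata assembled below.
++ */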
++struct netrx_pending_operations {
++ unsigned trans_prod, trans_cons;
++ unsigned mmu_prod, mmu_mcl;
++ unsigned mcl_prod, mcl_cons;
++ unsigned copy_prod, copy_cons;
++ unsigned meta_prod, meta_cons;
++ struct mmu_update *mmu;
++ struct gnttab_transfer *trans;
++ struct gnttab_copy *copy;
++ struct multicall_entry *mcl;
++ struct netbk_rx_meta *meta;
+};
+
-+static int
-+blktap_ring_open(struct inode *inode, struct file *filp)
++/*
++ * Set up the grant operations for this fragment. If it's a flipping
++ * interface, we also set up the unmap request from here.
++ */
++static u16 netbk_gop_frag(struct xen_netif *netif, struct netbk_rx_meta *meta,
++ int i, struct netrx_pending_operations *npo,
++ struct page *page, unsigned long size,
++ unsigned long offset)
+{
-+ int idx;
-+ struct blktap *tap;
++ struct gnttab_copy *copy_gop;
++ struct xen_netif_rx_request *req;
++ unsigned long old_mfn;
++ int idx = netif_page_index(page);
+
-+ idx = iminor(inode);
-+ if (idx < 0 || idx > MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) {
-+ BTERR("unable to open device blktap%d\n", idx);
-+ return -ENODEV;
++ old_mfn = virt_to_mfn(page_address(page));
++
++ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
++
++ copy_gop = npo->copy + npo->copy_prod++;
++ copy_gop->flags = GNTCOPY_dest_gref;
++ if (idx > -1) {
++ struct pending_tx_info *src_pend = &pending_tx_info[idx];
++ copy_gop->source.domid = src_pend->netif->domid;
++ copy_gop->source.u.ref = src_pend->req.gref;
++ copy_gop->flags |= GNTCOPY_source_gref;
++ } else {
++ copy_gop->source.domid = DOMID_SELF;
++ copy_gop->source.u.gmfn = old_mfn;
++ }
++ copy_gop->source.offset = offset;
++ copy_gop->dest.domid = netif->domid;
++ copy_gop->dest.offset = 0;
++ copy_gop->dest.u.ref = req->gref;
++ copy_gop->len = size;
++
++ return req->id;
++}
++
++static void netbk_gop_skb(struct sk_buff *skb,
++ struct netrx_pending_operations *npo)
++{
++ struct xen_netif *netif = netdev_priv(skb->dev);
++ int nr_frags = skb_shinfo(skb)->nr_frags;
++ int i;
++ int extra;
++ struct netbk_rx_meta *head_meta, *meta;
++
++ head_meta = npo->meta + npo->meta_prod++;
++ head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
++ head_meta->frag.size = skb_shinfo(skb)->gso_size;
++ extra = !!head_meta->frag.size + 1;
++
++ for (i = 0; i < nr_frags; i++) {
++ meta = npo->meta + npo->meta_prod++;
++ meta->frag = skb_shinfo(skb)->frags[i];
++ meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
++ meta->frag.page,
++ meta->frag.size,
++ meta->frag.page_offset);
+ }
+
-+ tap = blktaps[idx];
++ /*
++ * This must occur at the end to ensure that we don't trash skb_shinfo
++ * until we're done. We know that the head doesn't cross a page
++ * boundary because such packets get copied in netif_be_start_xmit.
++ */
++ head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
++ virt_to_page(skb->data),
++ skb_headlen(skb),
++ offset_in_page(skb->data));
+
-+ BTINFO("opening device blktap%d\n", idx);
++ netif->rx.req_cons += nr_frags + extra;
++}
+
-+ if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
-+ return -ENODEV;
++static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
++{
++ int i;
+
-+ /* Only one process can access ring at a time */
-+ if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
-+ return -EBUSY;
++ for (i = 0; i < nr_frags; i++)
++ put_page(meta[i].frag.page);
++}
+
-+ filp->private_data = tap;
-+ BTINFO("opened device %d\n", tap->minor);
++/*
++ * This is a twin to netbk_gop_skb.  Assume that netbk_gop_skb was used
++ * to set up the operations at the top of netrx_pending_operations,
++ * which have since been executed.  Check that they didn't give any
++ * errors and advance past them.
++ */
++static int netbk_check_gop(int nr_frags, domid_t domid,
++ struct netrx_pending_operations *npo)
++{
++ struct gnttab_copy *copy_op;
++ int status = NETIF_RSP_OKAY;
++ int i;
++
++ for (i = 0; i <= nr_frags; i++) {
++ copy_op = npo->copy + npo->copy_cons++;
++ if (copy_op->status != GNTST_okay) {
++ DPRINTK("Bad status %d from copy to DOM%d.\n",
++ copy_op->status, domid);
++ status = NETIF_RSP_ERROR;
++ }
++ }
+
-+ return 0;
++ return status;
+}
+
-+static int
-+blktap_ring_release(struct inode *inode, struct file *filp)
++static void netbk_add_frag_responses(struct xen_netif *netif, int status,
++ struct netbk_rx_meta *meta, int nr_frags)
+{
-+ struct blktap *tap = filp->private_data;
++ int i;
++ unsigned long offset;
+
-+ BTINFO("freeing device %d\n", tap->minor);
-+ clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
-+ filp->private_data = NULL;
-+ wake_up(&tap->wq);
-+ return 0;
++ for (i = 0; i < nr_frags; i++) {
++ int id = meta[i].id;
++ int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
++
++ offset = 0;
++ make_rx_response(netif, id, status, offset,
++ meta[i].frag.size, flags);
++ }
+}
+
-+/* Note on mmap:
-+ * We need to map pages to user space in a way that will allow the block
-+ * subsystem set up direct IO to them. This couldn't be done before, because
-+ * there isn't really a sane way to translate a user virtual address down to a
-+ * physical address when the page belongs to another domain.
-+ *
-+ * My first approach was to map the page in to kernel memory, add an entry
-+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
-+ * and then attempt to map that page up to user space. This is disallowed
-+ * by xen though, which realizes that we don't really own the machine frame
-+ * underlying the physical page.
-+ *
-+ * The new approach is to provide explicit support for this in xen linux.
-+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
-+ * mapped from other vms. vma->vm_private_data is set up as a mapping
-+ * from pages to actual page structs. There is a new clause in get_user_pages
-+ * that does the right thing for this sort of mapping.
-+ */
-+static int
-+blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
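++/*
++ * Tasklet handler for guest-bound traffic: drain rx_queue, batch the
++ * grant copies (plus any mmu updates) into a single multicall, then
++ * build the rx responses and notify the frontends that need an
++ * interrupt.
++ */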
++static void net_rx_action(unsigned long unused)
+{
-+ int size, err;
-+ struct page **map;
-+ struct blktap *tap;
-+ struct blkif_sring *sring;
-+ struct blktap_ring *ring;
++ struct xen_netif *netif = NULL;
++ s8 status;
++ u16 id, irq, flags;
++ struct xen_netif_rx_response *resp;
++ struct multicall_entry *mcl;
++ struct sk_buff_head rxq;
++ struct sk_buff *skb;
++ int notify_nr = 0;
++ int ret;
++ int nr_frags;
++ int count;
++ unsigned long offset;
+
-+ tap = filp->private_data;
-+ ring = &tap->ring;
-+ map = NULL;
-+ sring = NULL;
++ /*
++ * Putting hundreds of bytes on the stack is considered rude.
++ * Static works because a tasklet can only be on one CPU at any time.
++ */
++ static struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
++ static struct mmu_update rx_mmu[NET_RX_RING_SIZE];
++ static struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
++ static struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
++ static unsigned char rx_notify[NR_IRQS];
++ static u16 notify_list[NET_RX_RING_SIZE];
++ static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
+
-+ if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
-+ return -ENOMEM;
++	struct netrx_pending_operations npo = {
++		.mmu   = rx_mmu,
++		.trans = grant_trans_op,
++		.copy  = grant_copy_op,
++		.mcl   = rx_mcl,
++		.meta  = meta,
++	};
+
-+ size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-+ if (size != (MMAP_PAGES + RING_PAGES)) {
-+ BTERR("you _must_ map exactly %lu pages!\n",
-+ MMAP_PAGES + RING_PAGES);
-+ return -EAGAIN;
-+ }
++ skb_queue_head_init(&rxq);
+
-+ /* Allocate the fe ring. */
-+ sring = (struct blkif_sring *)get_zeroed_page(GFP_KERNEL);
-+ if (!sring) {
-+ BTERR("Couldn't alloc sring.\n");
-+ goto fail_mem;
-+ }
++ count = 0;
+
-+ map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
-+ if (!map) {
-+ BTERR("Couldn't alloc VM_FOREIGN map.\n");
-+ goto fail_mem;
-+ }
++ while ((skb = skb_dequeue(&rx_queue)) != NULL) {
++ nr_frags = skb_shinfo(skb)->nr_frags;
++ *(int *)skb->cb = nr_frags;
+
-+ SetPageReserved(virt_to_page(sring));
-+
-+ SHARED_RING_INIT(sring);
-+ FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
++ netbk_gop_skb(skb, &npo);
+
-+ ring->ring_vstart = vma->vm_start;
-+ ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);
++ count += nr_frags + 1;
+
-+ /* Map the ring pages to the start of the region and reserve it. */
-+ if (xen_feature(XENFEAT_auto_translated_physmap))
-+ err = vm_insert_page(vma, vma->vm_start,
-+ virt_to_page(ring->ring.sring));
-+ else
-+ err = remap_pfn_range(vma, vma->vm_start,
-+ __pa(ring->ring.sring) >> PAGE_SHIFT,
-+ PAGE_SIZE, vma->vm_page_prot);
-+ if (err) {
-+ BTERR("Mapping user ring failed: %d\n", err);
-+ goto fail;
++ __skb_queue_tail(&rxq, skb);
++
++ /* Filled the batch queue? */
++ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
++ break;
+ }
+
-+ /* Mark this VM as containing foreign pages, and set up mappings. */
-+ ring->foreign_map.map = map;
-+ vma->vm_private_data = &ring->foreign_map;
-+ vma->vm_flags |= VM_FOREIGN;
-+ vma->vm_flags |= VM_DONTCOPY;
-+ vma->vm_flags |= VM_RESERVED;
-+ vma->vm_ops = &blktap_ring_vm_operations;
++ BUG_ON(npo.meta_prod > ARRAY_SIZE(meta));
+
-+#ifdef CONFIG_X86
-+ vma->vm_mm->context.has_foreign_mappings = 1;
-+#endif
++ npo.mmu_mcl = npo.mcl_prod;
++ if (npo.mcl_prod) {
++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++ BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu));
++ mcl = npo.mcl + npo.mcl_prod++;
+
-+ tap->pid = current->pid;
-+ BTINFO("blktap: mapping pid is %d\n", tap->pid);
++ BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
++ mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
+
-+ ring->vma = vma;
-+ return 0;
++ mcl->op = __HYPERVISOR_mmu_update;
++ mcl->args[0] = (unsigned long)rx_mmu;
++ mcl->args[1] = npo.mmu_prod;
++ mcl->args[2] = 0;
++ mcl->args[3] = DOMID_SELF;
++ }
+
-+ fail:
-+ /* Clear any active mappings. */
-+ zap_page_range(vma, vma->vm_start,
-+ vma->vm_end - vma->vm_start, NULL);
-+ ClearPageReserved(virt_to_page(sring));
-+ fail_mem:
-+ free_page((unsigned long)sring);
-+ kfree(map);
++ if (npo.trans_prod) {
++ BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op));
++ mcl = npo.mcl + npo.mcl_prod++;
++ mcl->op = __HYPERVISOR_grant_table_op;
++ mcl->args[0] = GNTTABOP_transfer;
++ mcl->args[1] = (unsigned long)grant_trans_op;
++ mcl->args[2] = npo.trans_prod;
++ }
+
-+ return -ENOMEM;
-+}
++ if (npo.copy_prod) {
++ BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op));
++ mcl = npo.mcl + npo.mcl_prod++;
++ mcl->op = __HYPERVISOR_grant_table_op;
++ mcl->args[0] = GNTTABOP_copy;
++ mcl->args[1] = (unsigned long)grant_copy_op;
++ mcl->args[2] = npo.copy_prod;
++ }
+
-+static inline void
-+blktap_ring_set_message(struct blktap *tap, int msg)
-+{
-+ struct blktap_ring *ring = &tap->ring;
++ /* Nothing to do? */
++ if (!npo.mcl_prod)
++ return;
+
-+ down_read(&tap->tap_sem);
-+ if (ring->ring.sring)
-+ ring->ring.sring->pad[0] = msg;
-+ up_read(&tap->tap_sem);
-+}
++ BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl));
+
-+static int
-+blktap_ring_ioctl(struct inode *inode, struct file *filp,
-+ unsigned int cmd, unsigned long arg)
-+{
-+ struct blktap_params params;
-+ struct blktap *tap = filp->private_data;
++ ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
++ BUG_ON(ret != 0);
++	/* The mmu_update multicall must not fail. */
++ BUG_ON(npo.mmu_mcl && npo.mcl[npo.mmu_mcl].result != 0);
+
-+ BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
++ while ((skb = __skb_dequeue(&rxq)) != NULL) {
++ nr_frags = *(int *)skb->cb;
+
-+ switch(cmd) {
-+ case BLKTAP2_IOCTL_KICK_FE:
-+ /* There are fe messages to process. */
-+ return blktap_read_ring(tap);
++ netif = netdev_priv(skb->dev);
+
-+ case BLKTAP2_IOCTL_CREATE_DEVICE:
-+ if (!arg)
-+ return -EINVAL;
++ netif->stats.tx_bytes += skb->len;
++ netif->stats.tx_packets++;
+
-+		if (copy_from_user(&params, (struct blktap_params __user *)arg,
-+ sizeof(params))) {
-+ BTERR("failed to get params\n");
-+ return -EFAULT;
-+ }
++ status = netbk_check_gop(nr_frags, netif->domid, &npo);
+
-+		if (blktap_validate_params(tap, &params)) {
-+ BTERR("invalid params\n");
-+ return -EINVAL;
-+ }
++ id = meta[npo.meta_cons].id;
++ flags = nr_frags ? NETRXF_more_data : 0;
+
-+ tap->params = params;
-+ return blktap_device_create(tap);
++ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
++ flags |= NETRXF_csum_blank | NETRXF_data_validated;
++ else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
++ /* remote but checksummed. */
++ flags |= NETRXF_data_validated;
++
++ offset = 0;
++ resp = make_rx_response(netif, id, status, offset,
++ skb_headlen(skb), flags);
++
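++		/*
++		 * A non-zero gso_size recorded in the head meta slot
++		 * means this was a GSO skb; attach an extra_info
++		 * response so the frontend can resegment it.
++		 */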
++ if (meta[npo.meta_cons].frag.size) {
++ struct xen_netif_extra_info *gso =
++ (struct xen_netif_extra_info *)
++ RING_GET_RESPONSE(&netif->rx,
++ netif->rx.rsp_prod_pvt++);
+
-+ case BLKTAP2_IOCTL_SET_PARAMS:
-+ if (!arg)
-+ return -EINVAL;
++ resp->flags |= NETRXF_extra_info;
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EINVAL;
++ gso->u.gso.size = meta[npo.meta_cons].frag.size;
++ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
++ gso->u.gso.pad = 0;
++ gso->u.gso.features = 0;
+
-+		if (copy_from_user(&params, (struct blktap_params __user *)arg,
-+ sizeof(params))) {
-+ BTERR("failed to get params\n");
-+ return -EFAULT;
++ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
++ gso->flags = 0;
+ }
+
-+		if (blktap_validate_params(tap, &params)) {
-+ BTERR("invalid params\n");
-+ return -EINVAL;
++ netbk_add_frag_responses(netif, status,
++ meta + npo.meta_cons + 1,
++ nr_frags);
++
++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
++ irq = netif->irq;
++ if (ret && !rx_notify[irq] &&
++ (netif->smart_poll != 1)) {
++ rx_notify[irq] = 1;
++ notify_list[notify_nr++] = irq;
+ }
+
-+ tap->params = params;
-+ return 0;
++ if (netif_queue_stopped(netif->dev) &&
++ netif_schedulable(netif) &&
++ !netbk_queue_full(netif))
++ netif_wake_queue(netif->dev);
+
-+ case BLKTAP2_IOCTL_PAUSE:
-+ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
-+ return -EINVAL;
++		/*
++		 * netfront_smartpoll_active indicates whether the
++		 * netfront timer is active.
++		 */
++		if (netif->smart_poll == 1) {
++ if (!(netif->rx.sring->netfront_smartpoll_active)) {
++ notify_remote_via_irq(irq);
++ netif->rx.sring->netfront_smartpoll_active = 1;
++ }
++ }
+
-+ set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
-+ clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++ netif_put(netif);
++ dev_kfree_skb(skb);
++ npo.meta_cons += nr_frags + 1;
++ }
+
-+ blktap_ring_set_message(tap, 0);
-+ wake_up_interruptible(&tap->wq);
++ while (notify_nr != 0) {
++ irq = notify_list[--notify_nr];
++ rx_notify[irq] = 0;
++ notify_remote_via_irq(irq);
++ }
+
-+ return 0;
++ /* More work to do? */
++ if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
++ tasklet_schedule(&net_rx_tasklet);
++}
+
++static void net_alarm(unsigned long unused)
++{
++ tasklet_schedule(&net_rx_tasklet);
++}
+
-+ case BLKTAP2_IOCTL_REOPEN:
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EINVAL;
++static void netbk_tx_pending_timeout(unsigned long unused)
++{
++ tasklet_schedule(&net_tx_tasklet);
++}
+
-+ if (!arg)
-+ return -EINVAL;
++struct net_device_stats *netif_be_get_stats(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ return &netif->stats;
++}
+
-+ if (copy_to_user((char __user *)arg,
-+ tap->params.name,
-+ strlen(tap->params.name) + 1))
-+ return -EFAULT;
++static int __on_net_schedule_list(struct xen_netif *netif)
++{
++ return !list_empty(&netif->list);
++}
+
-+ blktap_ring_set_message(tap, 0);
-+ wake_up_interruptible(&tap->wq);
++static void remove_from_net_schedule_list(struct xen_netif *netif)
++{
++ spin_lock_irq(&net_schedule_list_lock);
++ if (likely(__on_net_schedule_list(netif))) {
++ list_del_init(&netif->list);
++ netif_put(netif);
++ }
++ spin_unlock_irq(&net_schedule_list_lock);
++}
+
-+ return 0;
++static void add_to_net_schedule_list_tail(struct xen_netif *netif)
++{
++ if (__on_net_schedule_list(netif))
++ return;
+
-+ case BLKTAP2_IOCTL_RESUME:
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EINVAL;
++ spin_lock_irq(&net_schedule_list_lock);
++ if (!__on_net_schedule_list(netif) &&
++ likely(netif_schedulable(netif))) {
++ list_add_tail(&netif->list, &net_schedule_list);
++ netif_get(netif);
++ }
++ spin_unlock_irq(&net_schedule_list_lock);
++}
+
-+ tap->ring.response = (int)arg;
-+ if (!tap->ring.response)
-+ clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++void netif_schedule_work(struct xen_netif *netif)
++{
++ int more_to_do;
+
-+ blktap_ring_set_message(tap, 0);
-+ wake_up_interruptible(&tap->wq);
++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
+
-+ return 0;
++ if (more_to_do) {
++ add_to_net_schedule_list_tail(netif);
++ maybe_schedule_tx_action();
+ }
++}
+
-+ return -ENOIOCTLCMD;
++void netif_deschedule_work(struct xen_netif *netif)
++{
++ remove_from_net_schedule_list(netif);
+}
+
-+static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
++
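++/*
++ * Top up a guest's transmit credit.  Normally credit_bytes is added,
++ * but a single oversized request may borrow up to its own size (capped
++ * at 128kB) so that jumbo packets can still make progress.
++ */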
++static void tx_add_credit(struct xen_netif *netif)
+{
-+ struct blktap *tap = filp->private_data;
-+ struct blktap_ring *ring = &tap->ring;
++ unsigned long max_burst, max_credit;
+
-+ poll_wait(filp, &ring->poll_wait, wait);
-+ if (ring->ring.sring->pad[0] != 0 ||
-+ ring->ring.req_prod_pvt != ring->ring.sring->req_prod) {
-+ RING_PUSH_REQUESTS(&ring->ring);
-+ return POLLIN | POLLRDNORM;
-+ }
++ /*
++ * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
++ * Otherwise the interface can seize up due to insufficient credit.
++ */
++ max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
++ max_burst = min(max_burst, 131072UL);
++ max_burst = max(max_burst, netif->credit_bytes);
+
-+ return 0;
++ /* Take care that adding a new chunk of credit doesn't wrap to zero. */
++ max_credit = netif->remaining_credit + netif->credit_bytes;
++ if (max_credit < netif->remaining_credit)
++ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
++
++ netif->remaining_credit = min(max_credit, max_burst);
+}
+
-+static struct file_operations blktap_ring_file_operations = {
-+ .owner = THIS_MODULE,
-+ .open = blktap_ring_open,
-+ .release = blktap_ring_release,
-+ .ioctl = blktap_ring_ioctl,
-+ .mmap = blktap_ring_mmap,
-+ .poll = blktap_ring_poll,
-+};
++static void tx_credit_callback(unsigned long data)
++{
++ struct xen_netif *netif = (struct xen_netif *)data;
++ tx_add_credit(netif);
++ netif_schedule_work(netif);
++}
+
-+void
-+blktap_ring_kick_user(struct blktap *tap)
++static inline int copy_pending_req(pending_ring_idx_t pending_idx)
+{
-+ wake_up_interruptible(&tap->ring.poll_wait);
++ return gnttab_copy_grant_page(grant_tx_handle[pending_idx],
++ &mmap_pages[pending_idx]);
+}
+
-+int
-+blktap_ring_resume(struct blktap *tap)
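++/*
++ * Unmap grants for transmit requests we have finished with.  Entries
++ * that have been pending for too long are copied instead, so a slow
++ * frontend cannot keep backend pages pinned indefinitely.
++ */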
++static inline void net_tx_action_dealloc(void)
+{
-+ int err;
-+ struct blktap_ring *ring = &tap->ring;
++ struct netbk_tx_pending_inuse *inuse, *n;
++ struct gnttab_unmap_grant_ref *gop;
++ u16 pending_idx;
++ pending_ring_idx_t dc, dp;
++ struct xen_netif *netif;
++ int ret;
++ LIST_HEAD(list);
+
-+ if (!blktap_active(tap))
-+ return -ENODEV;
++ dc = dealloc_cons;
++ gop = tx_unmap_ops;
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EINVAL;
++ /*
++ * Free up any grants we have finished using
++ */
++ do {
++ dp = dealloc_prod;
+
-+ /* set shared flag for resume */
-+ ring->response = 0;
++ /* Ensure we see all indices enqueued by netif_idx_release(). */
++ smp_rmb();
+
-+ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
-+ blktap_ring_kick_user(tap);
++ while (dc != dp) {
++ unsigned long pfn;
+
-+ wait_event_interruptible(tap->wq, ring->response ||
-+ !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
++ pending_idx = dealloc_ring[pending_index(dc++)];
++ list_move_tail(&pending_inuse[pending_idx].list, &list);
+
-+ err = ring->response;
-+ ring->response = 0;
++ pfn = idx_to_pfn(pending_idx);
++ /* Already unmapped? */
++ if (!phys_to_machine_mapping_valid(pfn))
++ continue;
+
-+ BTDBG("err: %d\n", err);
++ gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
++ GNTMAP_host_map,
++ grant_tx_handle[pending_idx]);
++ gop++;
++ }
++
++ if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB ||
++ list_empty(&pending_inuse_head))
++ break;
++
++ /* Copy any entries that have been pending for too long. */
++ list_for_each_entry_safe(inuse, n, &pending_inuse_head, list) {
++ if (time_after(inuse->alloc_time + HZ / 2, jiffies))
++ break;
++
++ pending_idx = inuse - pending_inuse;
++
++ pending_tx_info[pending_idx].netif->nr_copied_skbs++;
++
++ switch (copy_pending_req(pending_idx)) {
++ case 0:
++ list_move_tail(&inuse->list, &list);
++ continue;
++ case -EBUSY:
++ list_del_init(&inuse->list);
++ continue;
++ case -ENOENT:
++ continue;
++ }
++
++ break;
++ }
++ } while (dp != dealloc_prod);
+
-+ if (err)
-+ return err;
++ dealloc_cons = dc;
+
-+ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EAGAIN;
++ ret = HYPERVISOR_grant_table_op(
++ GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
++ BUG_ON(ret);
+
-+ return 0;
-+}
++ list_for_each_entry_safe(inuse, n, &list, list) {
++ pending_idx = inuse - pending_inuse;
+
-+int
-+blktap_ring_pause(struct blktap *tap)
-+{
-+ if (!blktap_active(tap))
-+ return -ENODEV;
++ netif = pending_tx_info[pending_idx].netif;
+
-+ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
-+ return -EINVAL;
++ make_tx_response(netif, &pending_tx_info[pending_idx].req,
++ NETIF_RSP_OKAY);
+
-+ BTDBG("draining queue\n");
-+ wait_event_interruptible(tap->wq, !tap->pending_cnt);
-+ if (tap->pending_cnt)
-+ return -EAGAIN;
++ /* Ready for next use. */
++ gnttab_reset_grant_page(mmap_pages[pending_idx]);
+
-+ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
-+ blktap_ring_kick_user(tap);
++ pending_ring[pending_index(pending_prod++)] = pending_idx;
+
-+ BTDBG("waiting for tapdisk response\n");
-+ wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
-+ return -EAGAIN;
++ netif_put(netif);
+
-+ return 0;
++ list_del_init(&inuse->list);
++ }
+}
+
-+int
-+blktap_ring_destroy(struct blktap *tap)
++static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *txp, RING_IDX end)
+{
-+ if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) &&
-+ !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
-+ return 0;
-+
-+ BTDBG("sending tapdisk close message\n");
-+ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
-+ blktap_ring_kick_user(tap);
++ RING_IDX cons = netif->tx.req_cons;
+
-+ return -EAGAIN;
++ do {
++ make_tx_response(netif, txp, NETIF_RSP_ERROR);
++ if (cons >= end)
++ break;
++ txp = RING_GET_REQUEST(&netif->tx, cons++);
++ } while (1);
++ netif->tx.req_cons = cons;
++ netif_schedule_work(netif);
++ netif_put(netif);
+}
+
-+static void
-+blktap_ring_initialize(struct blktap_ring *ring, int minor)
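++/*
++ * Walk the NETTXF_more_data chain hanging off the first request and
++ * copy the fragment requests into txp[].  Returns the fragment count,
++ * negated if the chain is malformed so the caller can error out the
++ * requests consumed so far.
++ */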
++static int netbk_count_requests(struct xen_netif *netif,
++ struct xen_netif_tx_request *first,
++ struct xen_netif_tx_request *txp, int work_to_do)
+{
-+ memset(ring, 0, sizeof(*ring));
-+ init_waitqueue_head(&ring->poll_wait);
-+ ring->devno = MKDEV(blktap_ring_major, minor);
-+}
++ RING_IDX cons = netif->tx.req_cons;
++ int frags = 0;
+
-+int
-+blktap_ring_create(struct blktap *tap)
-+{
-+ struct blktap_ring *ring = &tap->ring;
-+ blktap_ring_initialize(ring, tap->minor);
-+ return blktap_sysfs_create(tap);
-+}
++ if (!(first->flags & NETTXF_more_data))
++ return 0;
+
-+int __init
-+blktap_ring_init(int *major)
-+{
-+ int err;
++ do {
++ if (frags >= work_to_do) {
++ DPRINTK("Need more frags\n");
++ return -frags;
++ }
+
-+ err = register_chrdev(0, "blktap2", &blktap_ring_file_operations);
-+ if (err < 0) {
-+ BTERR("error registering blktap ring device: %d\n", err);
-+ return err;
-+ }
++ if (unlikely(frags >= MAX_SKB_FRAGS)) {
++ DPRINTK("Too many frags\n");
++ return -frags;
++ }
+
-+ blktap_ring_major = *major = err;
-+ BTINFO("blktap ring major: %d\n", blktap_ring_major);
-+ return 0;
-+}
++ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
++ sizeof(*txp));
++ if (txp->size > first->size) {
++			DPRINTK("Frag is bigger than frame\n");
++ return -frags;
++ }
+
-+int
-+blktap_ring_free(void)
-+{
-+ if (blktap_ring_major)
-+ unregister_chrdev(blktap_ring_major, "blktap2");
++ first->size -= txp->size;
++ frags++;
+
-+ return 0;
++ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
++ DPRINTK("txp->offset: %x, size: %u\n",
++ txp->offset, txp->size);
++ return -frags;
++ }
++ } while ((txp++)->flags & NETTXF_more_data);
++
++ return frags;
+}
-diff --git a/drivers/xen/blktap/sysfs.c b/drivers/xen/blktap/sysfs.c
-new file mode 100644
-index 0000000..23a3a51
---- /dev/null
-+++ b/drivers/xen/blktap/sysfs.c
-@@ -0,0 +1,451 @@
-+#include <linux/types.h>
-+#include <linux/device.h>
-+#include <linux/module.h>
-+#include <linux/sched.h>
+
-+#include "blktap.h"
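++/*
++ * Queue one grant-map operation per fragment and note which pending
++ * slot each fragment uses; frag->page temporarily carries the pending
++ * index until netbk_fill_frags() installs the real page.
++ */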
++static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
++ struct sk_buff *skb,
++ struct xen_netif_tx_request *txp,
++ struct gnttab_map_grant_ref *mop)
++{
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ skb_frag_t *frags = shinfo->frags;
++ unsigned long pending_idx = *((u16 *)skb->data);
++ int i, start;
+
-+int blktap_debug_level = 1;
++ /* Skip first skb fragment if it is on same page as header fragment. */
++ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
-+static struct class *class;
-+static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq);
++ for (i = start; i < shinfo->nr_frags; i++, txp++) {
++ pending_idx = pending_ring[pending_index(pending_cons++)];
+
-+static inline void
-+blktap_sysfs_get(struct blktap *tap)
-+{
-+ atomic_inc(&tap->ring.sysfs_refcnt);
-+}
++ gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
++ GNTMAP_host_map | GNTMAP_readonly,
++ txp->gref, netif->domid);
+
-+static inline void
-+blktap_sysfs_put(struct blktap *tap)
-+{
-+ if (atomic_dec_and_test(&tap->ring.sysfs_refcnt))
-+ wake_up(&sysfs_wq);
-+}
++ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
++ netif_get(netif);
++ pending_tx_info[pending_idx].netif = netif;
++ frags[i].page = (void *)pending_idx;
++ }
+
-+static inline void
-+blktap_sysfs_enter(struct blktap *tap)
-+{
-+ blktap_sysfs_get(tap); /* pin sysfs device */
-+ mutex_lock(&tap->ring.sysfs_mutex); /* serialize sysfs operations */
++ return mop;
+}
+
-+static inline void
-+blktap_sysfs_exit(struct blktap *tap)
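++/*
++ * Consume the status of every grant-map operation issued for this skb.
++ * Successful maps have their grant handle and p2m entry recorded; a
++ * failure generates an error response, releases the already-mapped
++ * parts of the packet and is reported back to the caller.
++ */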
++static int netbk_tx_check_mop(struct sk_buff *skb,
++ struct gnttab_map_grant_ref **mopp)
+{
-+ mutex_unlock(&tap->ring.sysfs_mutex);
-+ blktap_sysfs_put(tap);
-+}
++ struct gnttab_map_grant_ref *mop = *mopp;
++ int pending_idx = *((u16 *)skb->data);
++ struct xen_netif *netif = pending_tx_info[pending_idx].netif;
++ struct xen_netif_tx_request *txp;
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ int nr_frags = shinfo->nr_frags;
++ int i, err, start;
+
-+#define CLASS_DEVICE_ATTR(a,b,c,d) DEVICE_ATTR(a,b,c,d)
++ /* Check status of header. */
++ err = mop->status;
++ if (unlikely(err)) {
++ txp = &pending_tx_info[pending_idx].req;
++ make_tx_response(netif, txp, NETIF_RSP_ERROR);
++ pending_ring[pending_index(pending_prod++)] = pending_idx;
++ netif_put(netif);
++ } else {
++ set_phys_to_machine(
++ __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
++ FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
++ grant_tx_handle[pending_idx] = mop->handle;
++ }
+
-+static ssize_t blktap_sysfs_pause_device(struct device *, struct device_attribute *, const char *, size_t);
-+CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
-+static ssize_t blktap_sysfs_resume_device(struct device *, struct device_attribute *, const char *, size_t);
-+CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
++ /* Skip first skb fragment if it is on same page as header fragment. */
++ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
-+static ssize_t
-+blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size)
-+{
-+ int err;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++ for (i = start; i < nr_frags; i++) {
++ int j, newerr;
+
-+ blktap_sysfs_enter(tap);
++ pending_idx = (unsigned long)shinfo->frags[i].page;
+
-+ if (!tap->ring.dev ||
-+ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
-+ err = -ENODEV;
-+ goto out;
-+ }
++ /* Check error status: if okay then remember grant handle. */
++ newerr = (++mop)->status;
++ if (likely(!newerr)) {
++ set_phys_to_machine(
++ __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
++ FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
++ grant_tx_handle[pending_idx] = mop->handle;
++ /* Had a previous error? Invalidate this fragment. */
++ if (unlikely(err))
++ netif_idx_release(pending_idx);
++ continue;
++ }
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
-+ err = -EPERM;
-+ goto out;
-+ }
++ /* Error on this fragment: respond to client with an error. */
++ txp = &pending_tx_info[pending_idx].req;
++ make_tx_response(netif, txp, NETIF_RSP_ERROR);
++ pending_ring[pending_index(pending_prod++)] = pending_idx;
++ netif_put(netif);
+
-+ if (size > BLKTAP2_MAX_MESSAGE_LEN) {
-+ err = -ENAMETOOLONG;
-+ goto out;
-+ }
++ /* Not the first error? Preceding frags already invalidated. */
++ if (err)
++ continue;
++
++ /* First error: invalidate header and preceding fragments. */
++ pending_idx = *((u16 *)skb->data);
++ netif_idx_release(pending_idx);
++ for (j = start; j < i; j++) {
++			pending_idx = (unsigned long)shinfo->frags[j].page;
++ netif_idx_release(pending_idx);
++ }
+
-+ if (strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >= BLKTAP2_MAX_MESSAGE_LEN) {
-+ err = -EINVAL;
-+ goto out;
++ /* Remember the error: invalidate all subsequent fragments. */
++ err = newerr;
+ }
+
-+ snprintf(tap->params.name, sizeof(tap->params.name) - 1, "%s", buf);
-+ err = size;
-+
-+out:
-+ blktap_sysfs_exit(tap);
++ *mopp = mop + 1;
+ return err;
+}
+
-+static ssize_t
-+blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf)
++static void netbk_fill_frags(struct sk_buff *skb)
+{
-+ ssize_t size;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ int nr_frags = shinfo->nr_frags;
++ int i;
+
-+ blktap_sysfs_enter(tap);
++ for (i = 0; i < nr_frags; i++) {
++ skb_frag_t *frag = shinfo->frags + i;
++ struct xen_netif_tx_request *txp;
++ unsigned long pending_idx;
+
-+ if (!tap->ring.dev)
-+ size = -ENODEV;
-+ else if (tap->params.name[0])
-+ size = sprintf(buf, "%s\n", tap->params.name);
-+ else
-+ size = sprintf(buf, "%d\n", tap->minor);
++ pending_idx = (unsigned long)frag->page;
+
-+ blktap_sysfs_exit(tap);
++ pending_inuse[pending_idx].alloc_time = jiffies;
++ list_add_tail(&pending_inuse[pending_idx].list,
++ &pending_inuse_head);
+
-+ return size;
++ txp = &pending_tx_info[pending_idx].req;
++ frag->page = virt_to_page(idx_to_kaddr(pending_idx));
++ frag->size = txp->size;
++ frag->page_offset = txp->offset;
++
++ skb->len += txp->size;
++ skb->data_len += txp->size;
++ skb->truesize += txp->size;
++ }
+}
-+CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
-+ blktap_sysfs_get_name, blktap_sysfs_set_name);
+
-+static ssize_t
-+blktap_sysfs_remove_device(struct device *dev,
-+ struct device_attribute *attr,
-+ const char *buf, size_t size)
++int netbk_get_extras(struct xen_netif *netif, struct xen_netif_extra_info *extras,
++ int work_to_do)
+{
-+ int err;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++ struct xen_netif_extra_info extra;
++ RING_IDX cons = netif->tx.req_cons;
+
-+ if (!tap->ring.dev)
-+ return size;
++ do {
++ if (unlikely(work_to_do-- <= 0)) {
++ DPRINTK("Missing extra info\n");
++ return -EBADR;
++ }
+
-+ if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
-+ return -EBUSY;
++ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
++ sizeof(extra));
++ if (unlikely(!extra.type ||
++ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
++ netif->tx.req_cons = ++cons;
++ DPRINTK("Invalid extra type: %d\n", extra.type);
++ return -EINVAL;
++ }
+
-+ err = blktap_control_destroy_device(tap);
++ memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
++ netif->tx.req_cons = ++cons;
++ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
-+ return (err ? : size);
++ return work_to_do;
+}
-+CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+
-+static ssize_t
-+blktap_sysfs_pause_device(struct device *dev,
-+ struct device_attribute *attr,
-+ const char *buf, size_t size)
++static int netbk_set_skb_gso(struct sk_buff *skb, struct xen_netif_extra_info *gso)
+{
-+ int err;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
-+
-+ blktap_sysfs_enter(tap);
-+
-+ BTDBG("pausing %u:%u: dev_inuse: %lu\n",
-+ MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse);
-+
-+ if (!tap->ring.dev ||
-+ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
-+ err = -ENODEV;
-+ goto out;
-+ }
-+
-+ if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
-+ err = -EBUSY;
-+ goto out;
++ if (!gso->u.gso.size) {
++ DPRINTK("GSO size must not be zero.\n");
++ return -EINVAL;
+ }
+
-+ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
-+ err = 0;
-+ goto out;
++ /* Currently only TCPv4 S.O. is supported. */
++ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
++ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
++ return -EINVAL;
+ }
+
-+ err = blktap_device_pause(tap);
-+ if (!err) {
-+ device_remove_file(dev, &dev_attr_pause);
-+ err = device_create_file(dev, &dev_attr_resume);
-+ }
++ skb_shinfo(skb)->gso_size = gso->u.gso.size;
++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
-+out:
-+ blktap_sysfs_exit(tap);
++ /* Header must be checked, and gso_segs computed. */
++ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
++ skb_shinfo(skb)->gso_segs = 0;
+
-+ return (err ? err : size);
++ return 0;
+}
+
-+static ssize_t
-+blktap_sysfs_resume_device(struct device *dev,
-+ struct device_attribute *attr,
-+ const char *buf, size_t size)
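++/*
++ * Fill in csum_start/csum_offset for a CHECKSUM_PARTIAL packet from
++ * the frontend so the checksum can be completed later.  Only IPv4 TCP
++ * and UDP are handled; anything else is rejected.
++ */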
++static int skb_checksum_setup(struct sk_buff *skb)
+{
-+ int err;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++ struct iphdr *iph;
++ unsigned char *th;
++ int err = -EPROTO;
+
-+ blktap_sysfs_enter(tap);
++ if (skb->protocol != htons(ETH_P_IP))
++ goto out;
+
-+ if (!tap->ring.dev ||
-+ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
-+ err = -ENODEV;
++ iph = (void *)skb->data;
++ th = skb->data + 4 * iph->ihl;
++ if (th >= skb_tail_pointer(skb))
+ goto out;
-+ }
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
-+ err = -EINVAL;
++ skb->csum_start = th - skb->head;
++ switch (iph->protocol) {
++ case IPPROTO_TCP:
++ skb->csum_offset = offsetof(struct tcphdr, check);
++ break;
++ case IPPROTO_UDP:
++ skb->csum_offset = offsetof(struct udphdr, check);
++ break;
++ default:
++			printk(KERN_ERR "Attempting to checksum a non-"
++			       "TCP/UDP packet, dropping a protocol"
++			       " %d packet\n", iph->protocol);
++ " %d packet", iph->protocol);
+ goto out;
+ }
+
-+ err = blktap_device_resume(tap);
-+ if (!err) {
-+ device_remove_file(dev, &dev_attr_resume);
-+ err = device_create_file(dev, &dev_attr_pause);
-+ }
++ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
++ goto out;
+
-+out:
-+ blktap_sysfs_exit(tap);
++ err = 0;
+
-+ BTDBG("returning %zd\n", (err ? err : size));
-+ return (err ? err : size);
++out:
++ return err;
+}
+
-+#ifdef ENABLE_PASSTHROUGH
-+static ssize_t
-+blktap_sysfs_enable_passthrough(struct device *dev,
-+ const char *buf, size_t size)
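++/*
++ * Rate limiting: return true if this request must wait for the credit
++ * timer.  Credit is replenished first if the credit period has already
++ * elapsed.
++ */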
++static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size)
+{
-+ int err;
-+ unsigned major, minor;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
-+
-+ BTINFO("passthrough request enabled\n");
++ unsigned long now = jiffies;
++ unsigned long next_credit =
++ netif->credit_timeout.expires +
++ msecs_to_jiffies(netif->credit_usec / 1000);
+
-+ blktap_sysfs_enter(tap);
++ /* Timer could already be pending in rare cases. */
++ if (timer_pending(&netif->credit_timeout))
++ return true;
+
-+ if (!tap->ring.dev ||
-+ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
-+ err = -ENODEV;
-+ goto out;
++ /* Passed the point where we can replenish credit? */
++ if (time_after_eq(now, next_credit)) {
++ netif->credit_timeout.expires = now;
++ tx_add_credit(netif);
+ }
+
-+ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
-+ err = -EINVAL;
-+ goto out;
-+ }
++ /* Still too big to send right now? Set a callback. */
++ if (size > netif->remaining_credit) {
++		netif->credit_timeout.data = (unsigned long)netif;
++		netif->credit_timeout.function = tx_credit_callback;
++		mod_timer(&netif->credit_timeout, next_credit);
+
-+ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
-+ err = -EINVAL;
-+ goto out;
++ return true;
+ }
+
-+ err = sscanf(buf, "%x:%x", &major, &minor);
-+ if (err != 2) {
-+ err = -EINVAL;
-+ goto out;
-+ }
++ return false;
++}
++
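++/*
++ * First half of transmit processing: pull requests off the schedule
++ * list, validate them, allocate an skb for the linear part and queue
++ * grant-map operations for the granted data.  Returns the number of
++ * map operations built.
++ */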
++static unsigned net_tx_build_mops(void)
++{
++ struct gnttab_map_grant_ref *mop;
++ struct sk_buff *skb;
++ int ret;
++
++ mop = tx_map_ops;
++ while (((nr_pending_reqs() + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
++ !list_empty(&net_schedule_list)) {
++ struct xen_netif *netif;
++ struct xen_netif_tx_request txreq;
++ struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
++ struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
++ u16 pending_idx;
++ RING_IDX idx;
++ int work_to_do;
++ unsigned int data_len;
++
++ /* Get a netif from the list with work to do. */
++ netif = list_first_entry(&net_schedule_list, struct xen_netif, list);
++ netif_get(netif);
++ remove_from_net_schedule_list(netif);
++
++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
++ if (!work_to_do) {
++ netif_put(netif);
++ continue;
++ }
+
-+ err = blktap_device_enable_passthrough(tap, major, minor);
++ idx = netif->tx.req_cons;
++ rmb(); /* Ensure that we see the request before we copy it. */
++ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, idx), sizeof(txreq));
+
-+out:
-+ blktap_sysfs_exit(tap);
-+ BTDBG("returning %d\n", (err ? err : size));
-+ return (err ? err : size);
-+}
-+#endif
++ /* Credit-based scheduling. */
++ if (txreq.size > netif->remaining_credit &&
++ tx_credit_exceeded(netif, txreq.size)) {
++ netif_put(netif);
++ continue;
++ }
+
-+static ssize_t
-+blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf)
-+{
-+ char *tmp;
-+ int i, ret;
-+ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev);
++ netif->remaining_credit -= txreq.size;
+
-+ tmp = buf;
-+ blktap_sysfs_get(tap);
++ work_to_do--;
++ netif->tx.req_cons = ++idx;
+
-+ if (!tap->ring.dev) {
-+ ret = sprintf(tmp, "no device\n");
-+ goto out;
-+ }
++ memset(extras, 0, sizeof(extras));
++ if (txreq.flags & NETTXF_extra_info) {
++ work_to_do = netbk_get_extras(netif, extras,
++ work_to_do);
++ idx = netif->tx.req_cons;
++ if (unlikely(work_to_do < 0)) {
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
++ }
+
-+ tmp += sprintf(tmp, "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n",
-+ tap->params.name, MAJOR(tap->ring.devno),
-+ MINOR(tap->ring.devno), atomic_read(&tap->refcnt),
-+ tap->dev_inuse);
-+ tmp += sprintf(tmp, "capacity: 0x%llx, sector size: 0x%lx, "
-+ "device users: %d\n", tap->params.capacity,
-+ tap->params.sector_size, tap->device.users);
++ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
++ if (unlikely(ret < 0)) {
++ netbk_tx_err(netif, &txreq, idx - ret);
++ continue;
++ }
++ idx += ret;
+
-+ down_read(&tap->tap_sem);
++ if (unlikely(txreq.size < ETH_HLEN)) {
++ DPRINTK("Bad packet size: %d\n", txreq.size);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
+
-+ tmp += sprintf(tmp, "pending requests: %d\n", tap->pending_cnt);
-+ for (i = 0; i < MAX_PENDING_REQS; i++) {
-+ struct blktap_request *req = tap->pending_requests[i];
-+ if (!req)
++		/* The payload must not cross a page boundary: it cannot be fragmented. */
++ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
++ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
++ txreq.offset, txreq.size,
++				(txreq.offset & ~PAGE_MASK) + txreq.size);
++ netbk_tx_err(netif, &txreq, idx);
+ continue;
++ }
+
-+ tmp += sprintf(tmp, "req %d: id: %llu, usr_idx: %d, "
-+ "status: 0x%02x, pendcnt: %d, "
-+ "nr_pages: %u, op: %d, time: %lu:%lu\n",
-+ i, (unsigned long long)req->id, req->usr_idx,
-+ req->status, atomic_read(&req->pendcnt),
-+ req->nr_pages, req->operation, req->time.tv_sec,
-+ req->time.tv_usec);
-+ }
++ pending_idx = pending_ring[pending_index(pending_cons)];
+
-+ up_read(&tap->tap_sem);
-+ ret = (tmp - buf) + 1;
++ data_len = (txreq.size > PKT_PROT_LEN &&
++ ret < MAX_SKB_FRAGS) ?
++ PKT_PROT_LEN : txreq.size;
+
-+out:
-+ blktap_sysfs_put(tap);
-+ BTDBG("%s\n", buf);
++ skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
++ GFP_ATOMIC | __GFP_NOWARN);
++ if (unlikely(skb == NULL)) {
++ DPRINTK("Can't allocate a skb in start_xmit.\n");
++ netbk_tx_err(netif, &txreq, idx);
++ break;
++ }
+
-+ return ret;
-+}
-+CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
++ /* Packets passed to netif_rx() must have some headroom. */
++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
-+int
-+blktap_sysfs_create(struct blktap *tap)
-+{
-+ struct blktap_ring *ring;
-+ struct device *dev;
-+ int err;
++ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
++ struct xen_netif_extra_info *gso;
++ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
-+ if (!class)
-+ return -ENODEV;
++ if (netbk_set_skb_gso(skb, gso)) {
++ kfree_skb(skb);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
++ }
+
-+ ring = &tap->ring;
++ gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
++ GNTMAP_host_map | GNTMAP_readonly,
++ txreq.gref, netif->domid);
++ mop++;
+
-+ dev = device_create(class, NULL, ring->devno,
-+ tap, "blktap%d", tap->minor);
-+ if (IS_ERR(dev))
-+ return PTR_ERR(dev);
++ memcpy(&pending_tx_info[pending_idx].req,
++ &txreq, sizeof(txreq));
++ pending_tx_info[pending_idx].netif = netif;
++ *((u16 *)skb->data) = pending_idx;
+
-+ ring->dev = dev;
++ __skb_put(skb, data_len);
+
-+ mutex_init(&ring->sysfs_mutex);
-+ atomic_set(&ring->sysfs_refcnt, 0);
++ skb_shinfo(skb)->nr_frags = ret;
++ if (data_len < txreq.size) {
++ skb_shinfo(skb)->nr_frags++;
++ skb_shinfo(skb)->frags[0].page =
++ (void *)(unsigned long)pending_idx;
++ } else {
++ /* Discriminate from any valid pending_idx value. */
++ skb_shinfo(skb)->frags[0].page = (void *)~0UL;
++ }
+
++ __skb_queue_tail(&tx_queue, skb);
+
-+ printk(KERN_CRIT "%s: adding attributes for dev %p\n", __func__, dev);
-+ err = device_create_file(dev, &dev_attr_name);
-+ if (err)
-+ goto out;
-+ err = device_create_file(dev, &dev_attr_remove);
-+ if (err)
-+ goto out_unregister_name;
-+ err = device_create_file(dev, &dev_attr_pause);
-+ if (err)
-+ goto out_unregister_remove;
-+ err = device_create_file(dev, &dev_attr_debug);
-+ if (err)
-+ goto out_unregister_pause;
++ pending_cons++;
+
-+ return 0;
++ mop = netbk_get_requests(netif, skb, txfrags, mop);
+
-+out_unregister_pause:
-+ device_remove_file(dev, &dev_attr_pause);
-+out_unregister_remove:
-+ device_remove_file(dev, &dev_attr_remove);
-+out_unregister_name:
-+ device_remove_file(dev, &dev_attr_name);
-+out:
-+ return err;
++ netif->tx.req_cons = idx;
++ netif_schedule_work(netif);
++
++ if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
++ break;
++ }
++
++ return mop - tx_map_ops;
+}
+
-+int
-+blktap_sysfs_destroy(struct blktap *tap)
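++/*
++ * Second half of transmit processing: check the grant maps, copy the
++ * packet header out of the granted page, fill in the fragments and
++ * hand the completed skb to the stack with netif_rx().
++ */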
++static void net_tx_submit(void)
+{
-+ struct blktap_ring *ring;
-+ struct device *dev;
-+
-+ printk(KERN_CRIT "%s\n", __func__);
++ struct gnttab_map_grant_ref *mop;
++ struct sk_buff *skb;
+
-+ ring = &tap->ring;
-+ dev = ring->dev;
-+ if (!class || !dev)
-+ return 0;
++ mop = tx_map_ops;
++ while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
++ struct xen_netif_tx_request *txp;
++ struct xen_netif *netif;
++ u16 pending_idx;
++ unsigned data_len;
+
-+ ring->dev = NULL;
-+ if (wait_event_interruptible(sysfs_wq,
-+ !atomic_read(&tap->ring.sysfs_refcnt)))
-+ return -EAGAIN;
++ pending_idx = *((u16 *)skb->data);
++ netif = pending_tx_info[pending_idx].netif;
++ txp = &pending_tx_info[pending_idx].req;
+
-+ device_schedule_callback(dev, device_unregister);
++ /* Check the remap error code. */
++ if (unlikely(netbk_tx_check_mop(skb, &mop))) {
++ DPRINTK("netback grant failed.\n");
++ skb_shinfo(skb)->nr_frags = 0;
++ kfree_skb(skb);
++ continue;
++ }
+
-+ return 0;
-+}
++ data_len = skb->len;
++ memcpy(skb->data,
++ (void *)(idx_to_kaddr(pending_idx)|txp->offset),
++ data_len);
++ if (data_len < txp->size) {
++ /* Append the packet payload as a fragment. */
++ txp->offset += data_len;
++ txp->size -= data_len;
++ } else {
++ /* Schedule a response immediately. */
++ netif_idx_release(pending_idx);
++ }
+
-+static ssize_t
-+blktap_sysfs_show_verbosity(struct class *class, char *buf)
-+{
-+ return sprintf(buf, "%d\n", blktap_debug_level);
-+}
++ /*
++		 * Old frontends do not assert data_validated, but we
++		 * can infer it from csum_blank, so test both flags.
++ */
++ if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank))
++ skb->ip_summed = CHECKSUM_PARTIAL;
++ else
++ skb->ip_summed = CHECKSUM_NONE;
+
-+static ssize_t
-+blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size)
-+{
-+ int level;
++ netbk_fill_frags(skb);
+
-+ if (sscanf(buf, "%d", &level) == 1) {
-+ blktap_debug_level = level;
-+ return size;
-+ }
++ /*
++ * If the initial fragment was < PKT_PROT_LEN then
++ * pull through some bytes from the other fragments to
++ * increase the linear region to PKT_PROT_LEN bytes.
++ */
++ if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
++ int target = min_t(int, skb->len, PKT_PROT_LEN);
++ __pskb_pull_tail(skb, target - skb_headlen(skb));
++ }
+
-+ return -EINVAL;
-+}
-+CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR,
-+ blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
++ skb->dev = netif->dev;
++ skb->protocol = eth_type_trans(skb, skb->dev);
+
-+static ssize_t
-+blktap_sysfs_show_devices(struct class *class, char *buf)
-+{
-+ int i, ret;
-+ struct blktap *tap;
++ netif->stats.rx_bytes += skb->len;
++ netif->stats.rx_packets++;
+
-+ ret = 0;
-+ for (i = 0; i < MAX_BLKTAP_DEVICE; i++) {
-+ tap = blktaps[i];
-+ if (!tap)
-+ continue;
++ if (skb->ip_summed == CHECKSUM_PARTIAL) {
++ if (skb_checksum_setup(skb)) {
++				DPRINTK("Can't set up checksum in net_tx_action\n");
++ kfree_skb(skb);
++ continue;
++ }
++ }
+
-+ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
++ unlikely(skb_linearize(skb))) {
++ DPRINTK("Can't linearize skb in net_tx_action.\n");
++ kfree_skb(skb);
+ continue;
++ }
+
-+ ret += sprintf(buf + ret, "%d ", tap->minor);
-+ ret += snprintf(buf + ret, sizeof(tap->params.name) - 1,
-+ tap->params.name);
-+ ret += sprintf(buf + ret, "\n");
++ netif_rx(skb);
++ netif->dev->last_rx = jiffies;
+ }
+
-+ return ret;
-+}
-+CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL);
-+
-+void
-+blktap_sysfs_free(void)
-+{
-+ if (!class)
-+ return;
-+
-+ class_remove_file(class, &class_attr_verbosity);
-+ class_remove_file(class, &class_attr_devices);
++ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
++ !list_empty(&pending_inuse_head)) {
++ struct netbk_tx_pending_inuse *oldest;
+
-+ class_destroy(class);
++ oldest = list_entry(pending_inuse_head.next,
++ struct netbk_tx_pending_inuse, list);
++ mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ);
++ }
+}
+
-+int __init
-+blktap_sysfs_init(void)
++/* Called after netfront has transmitted */
++static void net_tx_action(unsigned long unused)
+{
-+ struct class *cls;
-+ int err;
-+
-+ if (class)
-+ return -EEXIST;
-+
-+ cls = class_create(THIS_MODULE, "blktap2");
-+ if (IS_ERR(cls))
-+ return PTR_ERR(cls);
-+
-+ err = class_create_file(cls, &class_attr_verbosity);
-+ if (err)
-+ goto out_unregister;
-+ err = class_create_file(cls, &class_attr_devices);
-+ if (err)
-+ goto out_unregister;
-+
-+ class = cls;
-+ return 0;
-+out_unregister:
-+ class_destroy(cls);
-+ return err;
-+}
-diff --git a/drivers/xen/blktap/wait_queue.c b/drivers/xen/blktap/wait_queue.c
-new file mode 100644
-index 0000000..f8995aa
---- /dev/null
-+++ b/drivers/xen/blktap/wait_queue.c
-@@ -0,0 +1,40 @@
-+#include <linux/list.h>
-+#include <linux/spinlock.h>
++ unsigned nr_mops;
++ int ret;
+
-+#include "blktap.h"
++ if (dealloc_cons != dealloc_prod)
++ net_tx_action_dealloc();
+
-+static LIST_HEAD(deferred_work_queue);
-+static DEFINE_SPINLOCK(deferred_work_lock);
++ nr_mops = net_tx_build_mops();
+
-+void
-+blktap_run_deferred(void)
-+{
-+ LIST_HEAD(queue);
-+ struct blktap *tap;
-+ unsigned long flags;
++ if (nr_mops == 0)
++ return;
+
-+ spin_lock_irqsave(&deferred_work_lock, flags);
-+ list_splice_init(&deferred_work_queue, &queue);
-+ list_for_each_entry(tap, &queue, deferred_queue)
-+ clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
-+ spin_unlock_irqrestore(&deferred_work_lock, flags);
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ tx_map_ops, nr_mops);
++ BUG_ON(ret);
+
-+ while (!list_empty(&queue)) {
-+ tap = list_entry(queue.next, struct blktap, deferred_queue);
-+ list_del_init(&tap->deferred_queue);
-+ blktap_device_restart(tap);
-+ }
++ net_tx_submit();
+}
+
-+void
-+blktap_defer(struct blktap *tap)
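++/*
++ * A pending transmit slot is no longer referenced: push its index onto
++ * the dealloc ring (index first, then producer, to pair with the rmb()
++ * in net_tx_action_dealloc) and kick the tx tasklet to do the unmap.
++ */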
++static void netif_idx_release(u16 pending_idx)
+{
++ static DEFINE_SPINLOCK(_lock);
+ unsigned long flags;
+
-+ spin_lock_irqsave(&deferred_work_lock, flags);
-+ if (!test_bit(BLKTAP_DEFERRED, &tap->dev_inuse)) {
-+ set_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
-+ list_add_tail(&tap->deferred_queue, &deferred_work_queue);
-+ }
-+ spin_unlock_irqrestore(&deferred_work_lock, flags);
-+}
-diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
-index bdfd584..6625ffe 100644
---- a/drivers/xen/cpu_hotplug.c
-+++ b/drivers/xen/cpu_hotplug.c
-@@ -1,5 +1,6 @@
- #include <linux/notifier.h>
-
-+#include <xen/xen.h>
- #include <xen/xenbus.h>
-
- #include <asm/xen/hypervisor.h>
-diff --git a/drivers/xen/events.c b/drivers/xen/events.c
-index ce602dd..925e7a1 100644
---- a/drivers/xen/events.c
-+++ b/drivers/xen/events.c
-@@ -16,7 +16,7 @@
- * (typically dom0).
- * 2. VIRQs, typically used for timers. These are per-cpu events.
- * 3. IPIs.
-- * 4. Hardware interrupts. Not supported at present.
-+ * 4. PIRQs - Hardware interrupts.
- *
- * Jeremy Fitzhardinge <jeremy at xensource.com>, XenSource Inc, 2007
- */
-@@ -27,10 +27,15 @@
- #include <linux/module.h>
- #include <linux/string.h>
- #include <linux/bootmem.h>
-+#include <linux/irqnr.h>
-+#include <linux/pci_regs.h>
-+#include <linux/pci.h>
-+#include <linux/msi.h>
-
- #include <asm/ptrace.h>
- #include <asm/irq.h>
- #include <asm/idle.h>
-+#include <asm/io_apic.h>
- #include <asm/sync_bitops.h>
- #include <asm/xen/hypercall.h>
- #include <asm/xen/hypervisor.h>
-@@ -40,6 +45,8 @@
- #include <xen/interface/xen.h>
- #include <xen/interface/event_channel.h>
-
-+#include "../pci/msi.h"
-+
- /*
- * This lock protects updates to the following mapping and reference-count
- * arrays. The lock does not need to be acquired to read the mapping tables.
-@@ -67,7 +74,7 @@ enum xen_irq_type {
- * event channel - irq->event channel mapping
- * cpu - cpu this event channel is bound to
- * index - type-specific information:
-- * PIRQ - vector, with MSB being "needs EIO"
-+ * PIRQ - with MSB being "needs EIO"
- * VIRQ - virq number
- * IPI - IPI vector
- * EVTCHN -
-@@ -82,21 +89,26 @@ struct irq_info
- unsigned short virq;
- enum ipi_vector ipi;
- struct {
-- unsigned short gsi;
-- unsigned short vector;
-+ unsigned short nr;
-+ unsigned char flags;
- } pirq;
- } u;
- };
-+#define PIRQ_NEEDS_EOI (1 << 0)
-+#define PIRQ_SHAREABLE (1 << 1)
-
--static struct irq_info irq_info[NR_IRQS];
-+static struct irq_info *irq_info;
-
--static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
-- [0 ... NR_EVENT_CHANNELS-1] = -1
--};
-+static int *evtchn_to_irq;
- struct cpu_evtchn_s {
- unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
- };
--static struct cpu_evtchn_s *cpu_evtchn_mask_p;
-+
-+static __initdata struct cpu_evtchn_s init_evtchn_mask = {
-+ .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
-+};
-+static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
-+
- static inline unsigned long *cpu_evtchn_mask(int cpu)
- {
- return cpu_evtchn_mask_p[cpu].bits;
-@@ -106,6 +118,7 @@ static inline unsigned long *cpu_evtchn_mask(int cpu)
- #define VALID_EVTCHN(chn) ((chn) != 0)
-
- static struct irq_chip xen_dynamic_chip;
-+static struct irq_chip xen_pirq_chip;
-
- /* Constructor for packed IRQ information. */
- static struct irq_info mk_unbound_info(void)
-@@ -132,10 +145,10 @@ static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq)
- }
-
- static struct irq_info mk_pirq_info(unsigned short evtchn,
-- unsigned short gsi, unsigned short vector)
-+ unsigned short pirq)
- {
- return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
-- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
-+ .cpu = 0, .u.pirq = { .nr = pirq } };
- }
-
- /*
-@@ -184,17 +197,7 @@ static unsigned gsi_from_irq(unsigned irq)
- BUG_ON(info == NULL);
- BUG_ON(info->type != IRQT_PIRQ);
-
-- return info->u.pirq.gsi;
--}
--
--static unsigned vector_from_irq(unsigned irq)
--{
-- struct irq_info *info = info_for_irq(irq);
--
-- BUG_ON(info == NULL);
-- BUG_ON(info->type != IRQT_PIRQ);
--
-- return info->u.pirq.vector;
-+ return info->u.pirq.nr;
- }
-
- static enum xen_irq_type type_from_irq(unsigned irq)
-@@ -218,6 +221,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn)
- return ret;
- }
-
-+static bool pirq_needs_eoi(unsigned irq)
-+{
-+ struct irq_info *info = info_for_irq(irq);
++ spin_lock_irqsave(&_lock, flags);
++ dealloc_ring[pending_index(dealloc_prod)] = pending_idx;
++ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
++ smp_wmb();
++ dealloc_prod++;
++ spin_unlock_irqrestore(&_lock, flags);
+
-+ BUG_ON(info->type != IRQT_PIRQ);
++ tasklet_schedule(&net_tx_tasklet);
++}
+
-+ return info->u.pirq.flags & PIRQ_NEEDS_EOI;
++static void netif_page_release(struct page *page, unsigned int order)
++{
++ int idx = netif_page_index(page);
++ BUG_ON(order);
++ BUG_ON(idx < 0);
++ netif_idx_release(idx);
+}
+
- static inline unsigned long active_evtchns(unsigned int cpu,
- struct shared_info *sh,
- unsigned int idx)
-@@ -329,12 +341,24 @@ static void unmask_evtchn(int port)
- put_cpu();
- }
-
-+static int get_nr_hw_irqs(void)
++irqreturn_t netif_be_int(int irq, void *dev_id)
+{
-+ int ret = 1;
++ struct xen_netif *netif = dev_id;
+
-+#ifdef CONFIG_X86_IO_APIC
-+ ret = get_nr_irqs_gsi();
-+#endif
++ add_to_net_schedule_list_tail(netif);
++ maybe_schedule_tx_action();
+
-+ return ret;
-+}
++ if (netif_schedulable(netif) && !netbk_queue_full(netif))
++ netif_wake_queue(netif->dev);
+
- static int find_unbound_irq(void)
- {
- int irq;
- struct irq_desc *desc;
-+ int start = get_nr_hw_irqs();
-
-- for (irq = 0; irq < nr_irqs; irq++)
-+ for (irq = start; irq < nr_irqs; irq++)
- if (irq_info[irq].type == IRQT_UNBOUND)
- break;
-
-@@ -350,6 +374,290 @@ static int find_unbound_irq(void)
- return irq;
- }
-
-+static bool identity_mapped_irq(unsigned irq)
-+{
-+ /* identity map all the hardware irqs */
-+ return irq < get_nr_hw_irqs();
++ return IRQ_HANDLED;
+}
+
-+static void pirq_unmask_notify(int irq)
++static void make_tx_response(struct xen_netif *netif,
++ struct xen_netif_tx_request *txp,
++ s8 st)
+{
-+ struct irq_info *info = info_for_irq(irq);
-+ struct physdev_eoi eoi = { .irq = info->u.pirq.nr };
++ RING_IDX i = netif->tx.rsp_prod_pvt;
++ struct xen_netif_tx_response *resp;
++ int notify;
+
-+ if (unlikely(pirq_needs_eoi(irq))) {
-+ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
-+ WARN_ON(rc);
-+ }
++ resp = RING_GET_RESPONSE(&netif->tx, i);
++ resp->id = txp->id;
++ resp->status = st;
++
++ if (txp->flags & NETTXF_extra_info)
++ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
++
++ netif->tx.rsp_prod_pvt = ++i;
++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
++
++ /*
++ * netfront_smartpoll_active indicates whether netfront timer
++ * is active.
++ */
++ if ((netif->smart_poll == 1)) {
++ if (!(netif->rx.sring->netfront_smartpoll_active)) {
++ notify_remote_via_irq(netif->irq);
++ netif->rx.sring->netfront_smartpoll_active = 1;
++ }
++ } else if (notify)
++ notify_remote_via_irq(netif->irq);
+}
+
-+static void pirq_query_unmask(int irq)
++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
++ u16 id,
++ s8 st,
++ u16 offset,
++ u16 size,
++ u16 flags)
+{
-+ struct physdev_irq_status_query irq_status;
-+ struct irq_info *info = info_for_irq(irq);
++ RING_IDX i = netif->rx.rsp_prod_pvt;
++ struct xen_netif_rx_response *resp;
+
-+ BUG_ON(info->type != IRQT_PIRQ);
++ resp = RING_GET_RESPONSE(&netif->rx, i);
++ resp->offset = offset;
++ resp->flags = flags;
++ resp->id = id;
++ resp->status = (s16)size;
++ if (st < 0)
++ resp->status = (s16)st;
+
-+ irq_status.irq = info->u.pirq.nr;
-+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
-+ irq_status.flags = 0;
++ netif->rx.rsp_prod_pvt = ++i;
+
-+ info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
-+ if (irq_status.flags & XENIRQSTAT_needs_eoi)
-+ info->u.pirq.flags |= PIRQ_NEEDS_EOI;
++ return resp;
+}
+
-+static bool probing_irq(int irq)
++#ifdef NETBE_DEBUG_INTERRUPT
++static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
+{
-+ struct irq_desc *desc = irq_to_desc(irq);
++ struct list_head *ent;
++ struct xen_netif *netif;
++ int i = 0;
+
-+ return desc && desc->action == NULL;
++ printk(KERN_ALERT "netif_schedule_list:\n");
++ spin_lock_irq(&net_schedule_list_lock);
++
++ list_for_each (ent, &net_schedule_list) {
++ netif = list_entry(ent, struct xen_netif, list);
++ printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
++ "rx_resp_prod=%08x\n",
++ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
++ printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
++ netif->tx.req_cons, netif->tx.rsp_prod_pvt);
++ printk(KERN_ALERT " shared(rx_req_prod=%08x "
++ "rx_resp_prod=%08x\n",
++ netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
++ printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
++ netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
++ printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
++ netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
++ i++;
++ }
++
++ spin_unlock_irq(&net_schedule_list_lock);
++ printk(KERN_ALERT " ** End of netif_schedule_list **\n");
++
++ return IRQ_HANDLED;
+}
++#endif
+
-+static unsigned int startup_pirq(unsigned int irq)
++static int __init netback_init(void)
+{
-+ struct evtchn_bind_pirq bind_pirq;
-+ struct irq_info *info = info_for_irq(irq);
-+ int evtchn = evtchn_from_irq(irq);
-+ int rc;
++ int i;
++ struct page *page;
++ int rc = 0;
+
-+ BUG_ON(info->type != IRQT_PIRQ);
++ if (!xen_domain())
++ return -ENODEV;
+
-+ if (VALID_EVTCHN(evtchn))
-+ goto out;
++ /* We can increase reservation by this much in net_rx_action(). */
++// balloon_update_driver_allowance(NET_RX_RING_SIZE);
+
-+ bind_pirq.pirq = info->u.pirq.nr;
-+ /* NB. We are happy to share unless we are probing. */
-+ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
-+ BIND_PIRQ__WILL_SHARE : 0;
-+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
-+ if (rc != 0) {
-+ if (!probing_irq(irq))
-+ printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
-+ irq);
-+ return 0;
-+ }
-+ evtchn = bind_pirq.port;
++ skb_queue_head_init(&rx_queue);
++ skb_queue_head_init(&tx_queue);
+
-+ pirq_query_unmask(irq);
++ init_timer(&net_timer);
++ net_timer.data = 0;
++ net_timer.function = net_alarm;
+
-+ evtchn_to_irq[evtchn] = irq;
-+ bind_evtchn_to_cpu(evtchn, 0);
-+ info->evtchn = evtchn;
++ init_timer(&netbk_tx_pending_timer);
++ netbk_tx_pending_timer.data = 0;
++ netbk_tx_pending_timer.function = netbk_tx_pending_timeout;
+
-+ out:
-+ unmask_evtchn(evtchn);
-+ pirq_unmask_notify(irq);
++ mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
++ if (mmap_pages == NULL) {
++ printk("%s: out of memory\n", __FUNCTION__);
++ return -ENOMEM;
++ }
+
-+ return 0;
-+}
++ for (i = 0; i < MAX_PENDING_REQS; i++) {
++ page = mmap_pages[i];
++ SetPageForeign(page, netif_page_release);
++ netif_set_page_index(page, i);
++ INIT_LIST_HEAD(&pending_inuse[i].list);
++ }
+
-+static void shutdown_pirq(unsigned int irq)
-+{
-+ struct evtchn_close close;
-+ struct irq_info *info = info_for_irq(irq);
-+ int evtchn = evtchn_from_irq(irq);
++ pending_cons = 0;
++ pending_prod = MAX_PENDING_REQS;
++ for (i = 0; i < MAX_PENDING_REQS; i++)
++ pending_ring[i] = i;
+
-+ BUG_ON(info->type != IRQT_PIRQ);
++ netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
++ if (MODPARM_copy_skb) {
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
++ NULL, 0))
++ netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB;
++ else
++ netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB;
++ }
+
-+ if (!VALID_EVTCHN(evtchn))
-+ return;
++ //netif_accel_init();
+
-+ mask_evtchn(evtchn);
++ rc = netif_xenbus_init();
++ if (rc)
++ goto failed_init;
+
-+ close.port = evtchn;
-+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
-+ BUG();
++#ifdef NETBE_DEBUG_INTERRUPT
++ (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
++ 0,
++ netif_be_dbg,
++ SA_SHIRQ,
++ "net-be-dbg",
++ &netif_be_dbg);
++#endif
+
-+ bind_evtchn_to_cpu(evtchn, 0);
-+ evtchn_to_irq[evtchn] = -1;
-+ info->evtchn = 0;
-+}
++ return 0;
+
-+static void enable_pirq(unsigned int irq)
-+{
-+ startup_pirq(irq);
-+}
++failed_init:
++ free_empty_pages_and_pagevec(mmap_pages, MAX_PENDING_REQS);
++ del_timer(&netbk_tx_pending_timer);
++ del_timer(&net_timer);
++ return rc;
+
-+static void disable_pirq(unsigned int irq)
-+{
+}
+
-+static void ack_pirq(unsigned int irq)
-+{
-+ int evtchn = evtchn_from_irq(irq);
++module_init(netback_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/netback/xenbus.c b/drivers/xen/netback/xenbus.c
+new file mode 100644
+index 0000000..70636d0
+--- /dev/null
++++ b/drivers/xen/netback/xenbus.c
+@@ -0,0 +1,523 @@
++/* Xenbus code for netif backend
++ Copyright (C) 2005 Rusty Russell <rusty at rustcorp.com.au>
++ Copyright (C) 2005 XenSource Ltd
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2 of the License, or
++ (at your option) any later version.
+
-+ move_native_irq(irq);
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
+
-+ if (VALID_EVTCHN(evtchn)) {
-+ mask_evtchn(evtchn);
-+ clear_evtchn(evtchn);
-+ }
-+}
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++*/
+
-+static void end_pirq(unsigned int irq)
-+{
-+ int evtchn = evtchn_from_irq(irq);
-+ struct irq_desc *desc = irq_to_desc(irq);
++#include <stdarg.h>
++#include <linux/module.h>
++#include <xen/xenbus.h>
++#include "common.h"
+
-+ if (WARN_ON(!desc))
-+ return;
++#if 0
++#undef DPRINTK
++#define DPRINTK(fmt, args...) \
++ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
++#endif
+
-+ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
-+ (IRQ_DISABLED|IRQ_PENDING)) {
-+ shutdown_pirq(irq);
-+ } else if (VALID_EVTCHN(evtchn)) {
-+ unmask_evtchn(evtchn);
-+ pirq_unmask_notify(irq);
-+ }
-+}
+
-+static int find_irq_by_gsi(unsigned gsi)
-+{
-+ int irq;
++static int connect_rings(struct backend_info *);
++static void connect(struct backend_info *);
++static void backend_create_netif(struct backend_info *be);
++static void unregister_hotplug_status_watch(struct backend_info *be);
+
-+ for (irq = 0; irq < nr_irqs; irq++) {
-+ struct irq_info *info = info_for_irq(irq);
++static int netback_remove(struct xenbus_device *dev)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
-+ if (info == NULL || info->type != IRQT_PIRQ)
-+ continue;
++ //netback_remove_accelerators(be, dev);
+
-+ if (gsi_from_irq(irq) == gsi)
-+ return irq;
++ unregister_hotplug_status_watch(be);
++ if (be->netif) {
++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
++ netif_disconnect(be->netif);
++ be->netif = NULL;
+ }
-+
-+ return -1;
++ kfree(be);
++ dev_set_drvdata(&dev->dev, NULL);
++ return 0;
+}
+
-+/*
-+ * Allocate a physical irq. We don't assign an event channel
-+ * until the irq actually started up. Return an
-+ * existing irq if we've already got one for the gsi.
++
++/**
++ * Entry point to this code when a new device is created. Allocate the basic
++ * structures and switch to InitWait.
+ */
-+int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
++static int netback_probe(struct xenbus_device *dev,
++ const struct xenbus_device_id *id)
+{
-+ int irq;
-+
-+ spin_lock(&irq_mapping_update_lock);
-+
-+ irq = find_irq_by_gsi(gsi);
-+ if (irq != -1) {
-+ printk(KERN_INFO "xen_allocate_pirq: returning irq %d for gsi %u\n",
-+ irq, gsi);
-+ goto out; /* XXX need refcount? */
++ const char *message;
++ struct xenbus_transaction xbt;
++ int err;
++ int sg;
++ struct backend_info *be = kzalloc(sizeof(struct backend_info),
++ GFP_KERNEL);
++ if (!be) {
++ xenbus_dev_fatal(dev, -ENOMEM,
++ "allocating backend structure");
++ return -ENOMEM;
+ }
+
-+ if (identity_mapped_irq(gsi)) {
-+ irq = gsi;
-+ irq_to_desc_alloc_node(irq, 0);
-+ dynamic_irq_init(irq);
-+ } else
-+ irq = find_unbound_irq();
-+
-+ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
-+ handle_level_irq, name);
-+
-+ irq_info[irq] = mk_pirq_info(0, gsi);
-+ irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
-+out:
-+ spin_unlock(&irq_mapping_update_lock);
-+ return irq;
-+}
-+
-+#ifdef CONFIG_PCI_MSI
-+int xen_destroy_irq(int irq)
-+{
-+ struct irq_desc *desc;
-+ struct physdev_unmap_pirq unmap_irq;
-+ struct irq_info *info = info_for_irq(irq);
-+ int rc = -ENOENT;
++ be->dev = dev;
++ dev_set_drvdata(&dev->dev, be);
+
-+ spin_lock(&irq_mapping_update_lock);
++ sg = 1;
++ if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
++ sg = 0;
+
-+ desc = irq_to_desc(irq);
-+ if (!desc)
-+ goto out;
++ do {
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "starting transaction");
++ goto fail;
++ }
+
-+ unmap_irq.pirq = info->u.pirq.nr;
-+ unmap_irq.domid = DOMID_SELF;
-+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
-+ if (rc) {
-+ printk(KERN_WARNING "unmap irq failed %d\n", rc);
-+ goto out;
-+ }
++ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
++ if (err) {
++ message = "writing feature-sg";
++ goto abort_transaction;
++ }
+
-+ irq_info[irq] = mk_unbound_info();
++ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
++ "%d", sg);
++ if (err) {
++ message = "writing feature-gso-tcpv4";
++ goto abort_transaction;
++ }
+
-+ dynamic_irq_cleanup(irq);
++ /* We support rx-copy path. */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-rx-copy", "%d", 1);
++ if (err) {
++ message = "writing feature-rx-copy";
++ goto abort_transaction;
++ }
+
-+out:
-+ spin_unlock(&irq_mapping_update_lock);
-+ return rc;
-+}
++ /*
++ * We don't support rx-flip path (except old guests who don't
++ * grok this feature flag).
++ */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-rx-flip", "%d", 0);
++ if (err) {
++ message = "writing feature-rx-flip";
++ goto abort_transaction;
++ }
+
-+int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
-+{
-+ int irq = 0;
-+ struct physdev_map_pirq map_irq;
-+ int rc;
-+ domid_t domid = DOMID_SELF;
-+ int pos;
-+ u32 table_offset, bir;
++ /* We support data smart poll mechanism */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-smart-poll", "%d", 1);
++ if (err) {
++ message = "writing feature-smart-poll";
++ goto abort_transaction;
++ }
+
-+ memset(&map_irq, 0, sizeof(map_irq));
-+ map_irq.domid = domid;
-+ map_irq.type = MAP_PIRQ_TYPE_MSI;
-+ map_irq.index = -1;
-+ map_irq.pirq = -1;
-+ map_irq.bus = dev->bus->number;
-+ map_irq.devfn = dev->devfn;
++ err = xenbus_transaction_end(xbt, 0);
++ } while (err == -EAGAIN);
+
-+ if (type == PCI_CAP_ID_MSIX) {
-+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "completing transaction");
++ goto fail;
++ }
+
-+ pci_read_config_dword(dev, msix_table_offset_reg(pos),
-+ &table_offset);
-+ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
++ //netback_probe_accelerators(be, dev);
+
-+ map_irq.table_base = pci_resource_start(dev, bir);
-+ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
-+ }
++ err = xenbus_switch_state(dev, XenbusStateInitWait);
++ if (err)
++ goto fail;
+
-+ spin_lock(&irq_mapping_update_lock);
++ /* This kicks hotplug scripts, so do it immediately. */
++ backend_create_netif(be);
+
-+ irq = find_unbound_irq();
++ return 0;
+
-+ if (irq == -1)
-+ goto out;
++abort_transaction:
++ xenbus_transaction_end(xbt, 1);
++ xenbus_dev_fatal(dev, err, "%s", message);
++fail:
++ DPRINTK("failed");
++ netback_remove(dev);
++ return err;
++}
+
-+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
-+ if (rc) {
+
-+ printk(KERN_WARNING "xen map irq failed %d\n", rc);
++/**
++ * Handle the creation of the hotplug script environment. We add the script
++ * and vif variables to the environment, for the benefit of the vif-* hotplug
++ * scripts.
++ */
++static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env)
++{
++ struct backend_info *be = dev_get_drvdata(&xdev->dev);
++ struct xen_netif *netif = be->netif;
++ char *val;
+
-+ dynamic_irq_cleanup(irq);
++ DPRINTK("netback_uevent");
+
-+ irq = -1;
-+ goto out;
++ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
++ if (IS_ERR(val)) {
++ int err = PTR_ERR(val);
++ xenbus_dev_fatal(xdev, err, "reading script");
++ return err;
++ }
++ else {
++ if (add_uevent_var(env, "script=%s", val)) {
++ kfree(val);
++ return -ENOMEM;
++ }
++ kfree(val);
+ }
-+ irq_info[irq] = mk_pirq_info(0, map_irq.pirq);
+
-+ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
-+ handle_level_irq,
-+ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
++ if (add_uevent_var(env, "vif=%s", netif->dev->name))
++ return -ENOMEM;
+
-+out:
-+ spin_unlock(&irq_mapping_update_lock);
-+ return irq;
++ return 0;
+}
-+#endif
+
-+int xen_gsi_from_irq(unsigned irq)
-+{
-+ return gsi_from_irq(irq);
-+}
-+EXPORT_SYMBOL_GPL(xen_gsi_from_irq);
+
- int bind_evtchn_to_irq(unsigned int evtchn)
- {
- int irq;
-@@ -409,8 +717,23 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
- return irq;
- }
-
-+static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
-+ unsigned int remote_port)
++static void backend_create_netif(struct backend_info *be)
+{
-+ struct evtchn_bind_interdomain bind_interdomain;
-+ int err;
-+
-+ bind_interdomain.remote_dom = remote_domain;
-+ bind_interdomain.remote_port = remote_port;
-+
-+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
-+ &bind_interdomain);
-+
-+ return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
-+}
++ int err;
++ long handle;
++ struct xenbus_device *dev = be->dev;
+
-
--static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
-+int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
- {
- struct evtchn_bind_virq bind_virq;
- int evtchn, irq;
-@@ -504,6 +827,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
- }
- EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
-
-+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
-+ unsigned int remote_port,
-+ irq_handler_t handler,
-+ unsigned long irqflags,
-+ const char *devname,
-+ void *dev_id)
-+{
-+ int irq, retval;
++ if (be->netif != NULL)
++ return;
+
-+ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
-+ if (irq < 0)
-+ return irq;
++ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
++ if (err != 1) {
++ xenbus_dev_fatal(dev, err, "reading handle");
++ return;
++ }
+
-+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
-+ if (retval != 0) {
-+ unbind_from_irq(irq);
-+ return retval;
-+ }
++ be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
++ if (IS_ERR(be->netif)) {
++ err = PTR_ERR(be->netif);
++ be->netif = NULL;
++ xenbus_dev_fatal(dev, err, "creating interface");
++ return;
++ }
+
-+ return irq;
++ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
+}
-+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
-+
- int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
- irq_handler_t handler,
- unsigned long irqflags, const char *devname, void *dev_id)
-@@ -649,9 +995,13 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
- int bit_idx = __ffs(pending_bits);
- int port = (word_idx * BITS_PER_LONG) + bit_idx;
- int irq = evtchn_to_irq[port];
-+ struct irq_desc *desc;
-
-- if (irq != -1)
-- handle_irq(irq, regs);
-+ if (irq != -1) {
-+ desc = irq_to_desc(irq);
-+ if (desc)
-+ generic_handle_irq_desc(irq, desc);
-+ }
- }
- }
-
-@@ -928,13 +1278,37 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
- .retrigger = retrigger_dynirq,
- };
-
-+static struct irq_chip xen_pirq_chip __read_mostly = {
-+ .name = "xen-pirq",
-+
-+ .startup = startup_pirq,
-+ .shutdown = shutdown_pirq,
+
-+ .enable = enable_pirq,
-+ .unmask = enable_pirq,
+
-+ .disable = disable_pirq,
-+ .mask = disable_pirq,
++static void disconnect_backend(struct xenbus_device *dev)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
-+ .ack = ack_pirq,
-+ .end = end_pirq,
++ if (be->netif) {
++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
++ netif_disconnect(be->netif);
++ be->netif = NULL;
++ }
++}
+
-+ .set_affinity = set_affinity_irq,
++/**
++ * Callback received when the frontend's state changes.
++ */
++static void frontend_changed(struct xenbus_device *dev,
++ enum xenbus_state frontend_state)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
-+ .retrigger = retrigger_dynirq,
-+};
++ DPRINTK("%s", xenbus_strstate(frontend_state));
+
- void __init xen_init_IRQ(void)
- {
- int i;
-
- cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
- GFP_KERNEL);
-- BUG_ON(cpu_evtchn_mask_p == NULL);
-+ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
++ be->frontend_state = frontend_state;
+
-+ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), GFP_KERNEL);
-+ for(i = 0; i < NR_EVENT_CHANNELS; i++)
-+ evtchn_to_irq[i] = -1;
-
- init_evtchn_cpu_bindings();
-
-@@ -943,4 +1317,6 @@ void __init xen_init_IRQ(void)
- mask_evtchn(i);
-
- irq_ctx_init(smp_processor_id());
++ switch (frontend_state) {
++ case XenbusStateInitialising:
++ if (dev->state == XenbusStateClosed) {
++ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++ __FUNCTION__, dev->nodename);
++ xenbus_switch_state(dev, XenbusStateInitWait);
++ }
++ break;
+
-+ xen_setup_pirqs();
- }
-diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
-index 79bedba..f70a4f4 100644
---- a/drivers/xen/evtchn.c
-+++ b/drivers/xen/evtchn.c
-@@ -48,6 +48,8 @@
- #include <linux/gfp.h>
- #include <linux/mutex.h>
- #include <linux/cpu.h>
++ case XenbusStateInitialised:
++ break;
+
-+#include <xen/xen.h>
- #include <xen/events.h>
- #include <xen/evtchn.h>
- #include <asm/xen/hypervisor.h>
-diff --git a/drivers/xen/features.c b/drivers/xen/features.c
-index 99eda16..9e2b64f 100644
---- a/drivers/xen/features.c
-+++ b/drivers/xen/features.c
-@@ -18,7 +18,7 @@
- u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
- EXPORT_SYMBOL_GPL(xen_features);
-
--void xen_setup_features(void)
-+void __init xen_setup_features(void)
- {
- struct xen_feature_info fi;
- int i, j;
-diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
-new file mode 100644
-index 0000000..ddc59cc
---- /dev/null
-+++ b/drivers/xen/gntdev.c
-@@ -0,0 +1,626 @@
-+/******************************************************************************
-+ * gntdev.c
-+ *
-+ * Device for accessing (in user-space) pages that have been granted by other
-+ * domains.
-+ *
-+ * Copyright (c) 2006-2007, D G Murray.
-+ * (c) 2009 Gerd Hoffmann <kraxel at redhat.com>
-+ *
-+ * This program is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-+ */
++ case XenbusStateConnected:
++ if (dev->state == XenbusStateConnected)
++ break;
++ backend_create_netif(be);
++ if (be->netif)
++ connect(be);
++ break;
+
-+#include <linux/module.h>
-+#include <linux/kernel.h>
-+#include <linux/init.h>
-+#include <linux/miscdevice.h>
-+#include <linux/fs.h>
-+#include <linux/mm.h>
-+#include <linux/mman.h>
-+#include <linux/mmu_notifier.h>
-+#include <linux/types.h>
-+#include <linux/uaccess.h>
-+#include <linux/sched.h>
-+#include <linux/rwsem.h>
++ case XenbusStateClosing:
++ if (be->netif)
++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
++ disconnect_backend(dev);
++ xenbus_switch_state(dev, XenbusStateClosing);
++ break;
+
-+#include <xen/xen.h>
-+#include <xen/grant_table.h>
-+#include <xen/gntdev.h>
-+#include <asm/xen/hypervisor.h>
-+#include <asm/xen/hypercall.h>
-+#include <asm/xen/page.h>
++ case XenbusStateClosed:
++ xenbus_switch_state(dev, XenbusStateClosed);
++ if (xenbus_dev_is_online(dev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ device_unregister(&dev->dev);
++ break;
+
-+MODULE_LICENSE("GPL");
-+MODULE_AUTHOR("Derek G. Murray <Derek.Murray at cl.cam.ac.uk>, "
-+ "Gerd Hoffmann <kraxel at redhat.com>");
-+MODULE_DESCRIPTION("User-space granted page access driver");
++ default:
++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++ frontend_state);
++ break;
++ }
++}
+
-+static int debug = 0;
-+module_param(debug, int, 0644);
-+static int limit = 1024;
-+module_param(limit, int, 0644);
+
-+struct gntdev_priv {
-+ struct list_head maps;
-+ uint32_t used;
-+ uint32_t limit;
-+ struct rw_semaphore sem;
-+ struct mm_struct *mm;
-+ struct mmu_notifier mn;
-+};
++static void xen_net_read_rate(struct xenbus_device *dev,
++ unsigned long *bytes, unsigned long *usec)
++{
++ char *s, *e;
++ unsigned long b, u;
++ char *ratestr;
+
-+struct grant_map {
-+ struct list_head next;
-+ struct gntdev_priv *priv;
-+ struct vm_area_struct *vma;
-+ int index;
-+ int count;
-+ int flags;
-+ int is_mapped;
-+ struct ioctl_gntdev_grant_ref *grants;
-+ struct gnttab_map_grant_ref *map_ops;
-+ struct gnttab_unmap_grant_ref *unmap_ops;
-+};
++ /* Default to unlimited bandwidth. */
++ *bytes = ~0UL;
++ *usec = 0;
+
-+/* ------------------------------------------------------------------ */
++ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
++ if (IS_ERR(ratestr))
++ return;
+
-+static void gntdev_print_maps(struct gntdev_priv *priv,
-+ char *text, int text_index)
-+{
-+ struct grant_map *map;
++ s = ratestr;
++ b = simple_strtoul(s, &e, 10);
++ if ((s == e) || (*e != ','))
++ goto fail;
+
-+ printk("%s: maps list (priv %p, usage %d/%d)\n",
-+ __FUNCTION__, priv, priv->used, priv->limit);
-+ list_for_each_entry(map, &priv->maps, next)
-+ printk(" index %2d, count %2d %s\n",
-+ map->index, map->count,
-+ map->index == text_index && text ? text : "");
-+}
++ s = e + 1;
++ u = simple_strtoul(s, &e, 10);
++ if ((s == e) || (*e != '\0'))
++ goto fail;
+
-+static struct grant_map *gntdev_add_map(struct gntdev_priv *priv, int count)
-+{
-+ struct grant_map *map, *add;
++ *bytes = b;
++ *usec = u;
+
-+ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
-+ if (NULL == add)
-+ return NULL;
++ kfree(ratestr);
++ return;
+
-+ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
-+ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
-+ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
-+ if (NULL == add->grants ||
-+ NULL == add->map_ops ||
-+ NULL == add->unmap_ops)
-+ goto err;
++ fail:
++ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
++ kfree(ratestr);
++}
+
-+ add->index = 0;
-+ add->count = count;
-+ add->priv = priv;
++static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
++{
++ char *s, *e, *macstr;
++ int i;
+
-+ if (add->count + priv->used > priv->limit)
-+ goto err;
++ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
++ if (IS_ERR(macstr))
++ return PTR_ERR(macstr);
+
-+ list_for_each_entry(map, &priv->maps, next) {
-+ if (add->index + add->count < map->index) {
-+ list_add_tail(&add->next, &map->next);
-+ goto done;
++ for (i = 0; i < ETH_ALEN; i++) {
++ mac[i] = simple_strtoul(s, &e, 16);
++ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
++ kfree(macstr);
++ return -ENOENT;
+ }
-+ add->index = map->index + map->count;
++ s = e+1;
+ }
-+ list_add_tail(&add->next, &priv->maps);
+
-+done:
-+ priv->used += add->count;
-+ if (debug)
-+ gntdev_print_maps(priv, "[new]", add->index);
-+ return add;
-+
-+err:
-+ kfree(add->grants);
-+ kfree(add->map_ops);
-+ kfree(add->unmap_ops);
-+ kfree(add);
-+ return NULL;
++ kfree(macstr);
++ return 0;
+}
+
-+static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, int index,
-+ int count)
++static void unregister_hotplug_status_watch(struct backend_info *be)
+{
-+ struct grant_map *map;
-+
-+ list_for_each_entry(map, &priv->maps, next) {
-+ if (map->index != index)
-+ continue;
-+ if (map->count != count)
-+ continue;
-+ return map;
++ if (be->have_hotplug_status_watch) {
++ unregister_xenbus_watch(&be->hotplug_status_watch);
++ kfree(be->hotplug_status_watch.node);
+ }
-+ return NULL;
++ be->have_hotplug_status_watch = 0;
+}
+
-+static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
-+ unsigned long vaddr)
++static void hotplug_status_changed(struct xenbus_watch *watch,
++ const char **vec,
++ unsigned int vec_size)
+{
-+ struct grant_map *map;
++ struct backend_info *be = container_of(watch,
++ struct backend_info,
++ hotplug_status_watch);
++ char *str;
++ unsigned int len;
+
-+ list_for_each_entry(map, &priv->maps, next) {
-+ if (!map->vma)
-+ continue;
-+ if (vaddr < map->vma->vm_start)
-+ continue;
-+ if (vaddr >= map->vma->vm_end)
-+ continue;
-+ return map;
++ str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
++ if (IS_ERR(str))
++ return;
++ if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
++ xenbus_switch_state(be->dev, XenbusStateConnected);
++ /* Not interested in this watch anymore. */
++ unregister_hotplug_status_watch(be);
+ }
-+ return NULL;
++ kfree(str);
+}
+
-+static int gntdev_del_map(struct grant_map *map)
++static void connect(struct backend_info *be)
+{
-+ int i;
++ int err;
++ struct xenbus_device *dev = be->dev;
+
-+ if (map->vma)
-+ return -EBUSY;
-+ for (i = 0; i < map->count; i++)
-+ if (map->unmap_ops[i].handle)
-+ return -EBUSY;
++ err = connect_rings(be);
++ if (err)
++ return;
+
-+ map->priv->used -= map->count;
-+ list_del(&map->next);
-+ kfree(map->grants);
-+ kfree(map->map_ops);
-+ kfree(map->unmap_ops);
-+ kfree(map);
-+ return 0;
++ err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
++ return;
++ }
++
++ xen_net_read_rate(dev, &be->netif->credit_bytes,
++ &be->netif->credit_usec);
++ be->netif->remaining_credit = be->netif->credit_bytes;
++
++ unregister_hotplug_status_watch(be);
++ err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
++ hotplug_status_changed,
++ "%s/%s", dev->nodename, "hotplug-status");
++ if (err) {
++ /* Switch now, since we can't do a watch. */
++ xenbus_switch_state(dev, XenbusStateConnected);
++ } else {
++ be->have_hotplug_status_watch = 1;
++ }
++
++ netif_wake_queue(be->netif->dev);
+}
+
-+/* ------------------------------------------------------------------ */
+
-+static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void *data)
++static int connect_rings(struct backend_info *be)
+{
-+ struct grant_map *map = data;
-+ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
-+ u64 pte_maddr;
++ struct xenbus_device *dev = be->dev;
++ unsigned long tx_ring_ref, rx_ring_ref;
++ unsigned int evtchn, rx_copy;
++ int err;
++ int val;
+
-+ BUG_ON(pgnr >= map->count);
-+ pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT;
-+ pte_maddr += (unsigned long)pte & ~PAGE_MASK;
-+ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags,
-+ map->grants[pgnr].ref,
-+ map->grants[pgnr].domid);
-+ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags,
-+ 0 /* handle */);
-+ return 0;
-+}
++ DPRINTK("");
++
++ err = xenbus_gather(XBT_NIL, dev->otherend,
++ "tx-ring-ref", "%lu", &tx_ring_ref,
++ "rx-ring-ref", "%lu", &rx_ring_ref,
++ "event-channel", "%u", &evtchn, NULL);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "reading %s/ring-ref and event-channel",
++ dev->otherend);
++ return err;
++ }
++
++ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
++ &rx_copy);
++ if (err == -ENOENT) {
++ err = 0;
++ rx_copy = 0;
++ }
++ if (err < 0) {
++ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
++ dev->otherend);
++ return err;
++ }
++ if (!rx_copy)
++ return -EOPNOTSUPP;
++
++ if (be->netif->dev->tx_queue_len != 0) {
++ if (xenbus_scanf(XBT_NIL, dev->otherend,
++ "feature-rx-notify", "%d", &val) < 0)
++ val = 0;
++ if (val)
++ be->netif->can_queue = 1;
++ else
++ /* Must be non-zero for pfifo_fast to work. */
++ be->netif->dev->tx_queue_len = 1;
++ }
+
-+static int map_grant_pages(struct grant_map *map)
-+{
-+ int i, err = 0;
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
++ val = 0;
++ if (!val) {
++ be->netif->features &= ~NETIF_F_SG;
++ be->netif->dev->features &= ~NETIF_F_SG;
++ if (be->netif->dev->mtu > ETH_DATA_LEN)
++ be->netif->dev->mtu = ETH_DATA_LEN;
++ }
+
-+ if (debug)
-+ printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count);
-+ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
-+ map->map_ops, map->count);
-+ if (WARN_ON(err))
-+ return err;
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
++ &val) < 0)
++ val = 0;
++ if (val) {
++ be->netif->features |= NETIF_F_TSO;
++ be->netif->dev->features |= NETIF_F_TSO;
++ }
+
-+ for (i = 0; i < map->count; i++) {
-+ if (map->map_ops[i].status)
-+ err = -EINVAL;
-+ map->unmap_ops[i].handle = map->map_ops[i].handle;
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
++ "%d", &val) < 0)
++ val = 0;
++ if (val) {
++ be->netif->features &= ~NETIF_F_IP_CSUM;
++ be->netif->dev->features &= ~NETIF_F_IP_CSUM;
+ }
-+ return err;
-+}
+
-+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
-+{
-+ int i, err = 0;
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
++ "%d", &val) < 0)
++ val = 0;
++ if (val)
++ be->netif->smart_poll = 1;
++ else
++ be->netif->smart_poll = 0;
+
-+ if (debug)
-+ printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__,
-+ map->index, map->count, offset, pages);
-+ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
-+ map->unmap_ops + offset, pages);
-+ if (WARN_ON(err))
++ /* Map the shared frame, irq etc. */
++ err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "mapping shared-frames %lu/%lu port %u",
++ tx_ring_ref, rx_ring_ref, evtchn);
+ return err;
-+
-+ for (i = 0; i < pages; i++) {
-+ if (map->unmap_ops[offset+i].status)
-+ err = -EINVAL;
-+ map->unmap_ops[offset+i].handle = 0;
+ }
-+ return err;
++ return 0;
+}
+
-+/* ------------------------------------------------------------------ */
-+
-+static void gntdev_vma_close(struct vm_area_struct *vma)
-+{
-+ struct grant_map *map = vma->vm_private_data;
+
-+ if (debug)
-+ printk("%s\n", __FUNCTION__);
-+ map->is_mapped = 0;
-+ map->vma = NULL;
-+ vma->vm_private_data = NULL;
-+}
++/* ** Driver Registration ** */
+
-+static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-+{
-+ if (debug)
-+ printk("%s: vaddr %p, pgoff %ld (shouldn't happen)\n",
-+ __FUNCTION__, vmf->virtual_address, vmf->pgoff);
-+ vmf->flags = VM_FAULT_ERROR;
-+ return 0;
-+}
+
-+static struct vm_operations_struct gntdev_vmops = {
-+ .close = gntdev_vma_close,
-+ .fault = gntdev_vma_fault,
++static const struct xenbus_device_id netback_ids[] = {
++ { "vif" },
++ { "" }
+};
+
-+/* ------------------------------------------------------------------ */
+
-+static void mn_invl_range_start(struct mmu_notifier *mn,
-+ struct mm_struct *mm,
-+ unsigned long start, unsigned long end)
-+{
-+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
-+ struct grant_map *map;
-+ unsigned long mstart, mend;
-+ int err;
++static struct xenbus_driver netback = {
++ .name = "vif",
++ .owner = THIS_MODULE,
++ .ids = netback_ids,
++ .probe = netback_probe,
++ .remove = netback_remove,
++ .uevent = netback_uevent,
++ .otherend_changed = frontend_changed,
++};
+
-+ down_read(&priv->sem);
-+ list_for_each_entry(map, &priv->maps, next) {
-+ if (!map->vma)
-+ continue;
-+ if (!map->is_mapped)
-+ continue;
-+ if (map->vma->vm_start >= end)
-+ continue;
-+ if (map->vma->vm_end <= start)
-+ continue;
-+ mstart = max(start, map->vma->vm_start);
-+ mend = min(end, map->vma->vm_end);
-+ if (debug)
-+ printk("%s: map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
-+ __FUNCTION__, map->index, map->count,
-+ map->vma->vm_start, map->vma->vm_end,
-+ start, end, mstart, mend);
-+ err = unmap_grant_pages(map,
-+ (mstart - map->vma->vm_start) >> PAGE_SHIFT,
-+ (mend - mstart) >> PAGE_SHIFT);
-+ WARN_ON(err);
-+ }
-+ up_read(&priv->sem);
-+}
+
-+static void mn_invl_page(struct mmu_notifier *mn,
-+ struct mm_struct *mm,
-+ unsigned long address)
++int netif_xenbus_init(void)
+{
-+ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
++ printk(KERN_CRIT "registering netback\n");
++ return xenbus_register_backend(&netback);
+}
+diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
+new file mode 100644
+index 0000000..ae693e7
+--- /dev/null
++++ b/drivers/xen/pci.c
+@@ -0,0 +1,124 @@
++/*
++ * Copyright (c) 2009, Intel Corporation.
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms and conditions of the GNU General Public License,
++ * version 2, as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope it will be useful, but WITHOUT
++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ * more details.
++ *
++ * You should have received a copy of the GNU General Public License along with
++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++ * Place - Suite 330, Boston, MA 02111-1307 USA.
++ *
++ * Author: Weidong Han <weidong.han at intel.com>
++ */
+
-+static void mn_release(struct mmu_notifier *mn,
-+ struct mm_struct *mm)
-+{
-+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
-+ struct grant_map *map;
-+ int err;
++#include <linux/pci.h>
+
-+ down_read(&priv->sem);
-+ list_for_each_entry(map, &priv->maps, next) {
-+ if (!map->vma)
-+ continue;
-+ if (debug)
-+ printk("%s: map %d+%d (%lx %lx)\n",
-+ __FUNCTION__, map->index, map->count,
-+ map->vma->vm_start, map->vma->vm_end);
-+ err = unmap_grant_pages(map, 0, map->count);
-+ WARN_ON(err);
-+ }
-+ up_read(&priv->sem);
-+}
++#include <xen/interface/xen.h>
++#include <xen/interface/physdev.h>
+
-+struct mmu_notifier_ops gntdev_mmu_ops = {
-+ .release = mn_release,
-+ .invalidate_page = mn_invl_page,
-+ .invalidate_range_start = mn_invl_range_start,
-+};
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
+
-+/* ------------------------------------------------------------------ */
++#include "../pci/pci.h"
+
-+static int gntdev_open(struct inode *inode, struct file *flip)
-+{
-+ struct gntdev_priv *priv;
+
-+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-+ if (!priv)
-+ return -ENOMEM;
++#ifdef CONFIG_PCI_IOV
++#define HANDLE_PCI_IOV 1
++#else
++#define HANDLE_PCI_IOV 0
++#endif
+
-+ INIT_LIST_HEAD(&priv->maps);
-+ init_rwsem(&priv->sem);
-+ priv->limit = limit;
++static int xen_add_device(struct device *dev)
++{
++ int r;
++ struct pci_dev *pci_dev = to_pci_dev(dev);
+
-+ priv->mm = get_task_mm(current);
-+ if (!priv->mm) {
-+ kfree(priv);
-+ return -ENOMEM;
-+ }
-+ priv->mn.ops = &gntdev_mmu_ops;
-+ mmu_notifier_register(&priv->mn, priv->mm);
-+ mmput(priv->mm);
++ if (HANDLE_PCI_IOV && pci_dev->is_virtfn) {
++ struct physdev_manage_pci_ext manage_pci_ext = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ .is_virtfn = 1,
++#ifdef CONFIG_PCI_IOV
++ .physfn.bus = pci_dev->physfn->bus->number,
++ .physfn.devfn = pci_dev->physfn->devfn,
++#endif
++ };
+
-+ flip->private_data = priv;
-+ if (debug)
-+ printk("%s: priv %p\n", __FUNCTION__, priv);
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
++ &manage_pci_ext);
++ } else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
++ struct physdev_manage_pci_ext manage_pci_ext = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ .is_extfn = 1,
++ };
++
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
++ &manage_pci_ext);
++ } else {
++ struct physdev_manage_pci manage_pci = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ };
+
-+ return 0;
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
++ &manage_pci);
++ }
++
++ return r;
+}
+
-+static int gntdev_release(struct inode *inode, struct file *flip)
++static int xen_remove_device(struct device *dev)
+{
-+ struct gntdev_priv *priv = flip->private_data;
-+ struct grant_map *map;
-+ int err;
++ int r;
++ struct pci_dev *pci_dev = to_pci_dev(dev);
++ struct physdev_manage_pci manage_pci;
+
-+ if (debug)
-+ printk("%s: priv %p\n", __FUNCTION__, priv);
++ manage_pci.bus = pci_dev->bus->number;
++ manage_pci.devfn = pci_dev->devfn;
+
-+ down_write(&priv->sem);
-+ while (!list_empty(&priv->maps)) {
-+ map = list_entry(priv->maps.next, struct grant_map, next);
-+ err = gntdev_del_map(map);
-+ WARN_ON(err);
-+ }
-+ up_write(&priv->sem);
-+ mmu_notifier_unregister(&priv->mn, priv->mm);
-+ kfree(priv);
-+ return 0;
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
++ &manage_pci);
++
++ return r;
+}
+
-+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
-+ struct ioctl_gntdev_map_grant_ref __user *u)
++static int xen_pci_notifier(struct notifier_block *nb,
++ unsigned long action, void *data)
+{
-+ struct ioctl_gntdev_map_grant_ref op;
-+ struct grant_map *map;
-+ int err;
++ struct device *dev = data;
++ int r = 0;
+
-+ if (copy_from_user(&op, u, sizeof(op)) != 0)
-+ return -EFAULT;
-+ if (debug)
-+ printk("%s: priv %p, add %d\n", __FUNCTION__, priv,
-+ op.count);
-+ if (unlikely(op.count <= 0))
-+ return -EINVAL;
-+ if (unlikely(op.count > priv->limit))
-+ return -EINVAL;
++ switch (action) {
++ case BUS_NOTIFY_ADD_DEVICE:
++ r = xen_add_device(dev);
++ break;
++ case BUS_NOTIFY_DEL_DEVICE:
++ r = xen_remove_device(dev);
++ break;
++ default:
++ break;
++ }
+
-+ down_write(&priv->sem);
-+ err = -ENOMEM;
-+ map = gntdev_add_map(priv, op.count);
-+ if (!map)
-+ goto err_unlock;
++ return r;
++}
+
-+ err = -ENOMEM;
-+ if (copy_from_user(map->grants, &u->refs,
-+ sizeof(map->grants[0]) * op.count) != 0)
-+ goto err_free;
-+ op.index = map->index << PAGE_SHIFT;
-+ if (copy_to_user(u, &op, sizeof(op)) != 0)
-+ goto err_free;
-+ up_write(&priv->sem);
-+ return 0;
++struct notifier_block device_nb = {
++ .notifier_call = xen_pci_notifier,
++};
+
-+err_free:
-+ gntdev_del_map(map);
-+err_unlock:
-+ up_write(&priv->sem);
-+ return err;
++static int __init register_xen_pci_notifier(void)
++{
++ if (!xen_pv_domain())
++ return 0;
++
++ return bus_register_notifier(&pci_bus_type, &device_nb);
+}
+
-+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
-+ struct ioctl_gntdev_unmap_grant_ref __user *u)
++arch_initcall(register_xen_pci_notifier);
+diff --git a/drivers/xen/pciback/Makefile b/drivers/xen/pciback/Makefile
+new file mode 100644
+index 0000000..38bc123
+--- /dev/null
++++ b/drivers/xen/pciback/Makefile
+@@ -0,0 +1,17 @@
++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
++
++xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
++xen-pciback-y += conf_space.o conf_space_header.o \
++ conf_space_capability.o \
++ conf_space_capability_vpd.o \
++ conf_space_capability_pm.o \
++ conf_space_quirks.o
++xen-pciback-$(CONFIG_PCI_MSI) += conf_space_capability_msi.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o
++
++ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
++EXTRA_CFLAGS += -DDEBUG
++endif
+diff --git a/drivers/xen/pciback/conf_space.c b/drivers/xen/pciback/conf_space.c
+new file mode 100644
+index 0000000..370c18e
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space.c
+@@ -0,0 +1,435 @@
++/*
++ * PCI Backend - Functions for creating a virtual configuration space for
++ * exported PCI Devices.
++ * It's dangerous to allow PCI Driver Domains to change their
++ * device's resources (memory, i/o ports, interrupts). We need to
++ * restrict changes to certain PCI Configuration registers:
++ * BARs, INTERRUPT_PIN, most registers in the header...
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
++
++static int permissive;
++module_param(permissive, bool, 0644);
++
++#define DEFINE_PCI_CONFIG(op, size, type) \
++int pciback_##op##_config_##size \
++(struct pci_dev *dev, int offset, type value, void *data) \
++{ \
++ return pci_##op##_config_##size(dev, offset, value); \
++}
++
++DEFINE_PCI_CONFIG(read, byte, u8 *)
++DEFINE_PCI_CONFIG(read, word, u16 *)
++DEFINE_PCI_CONFIG(read, dword, u32 *)
++
++DEFINE_PCI_CONFIG(write, byte, u8)
++DEFINE_PCI_CONFIG(write, word, u16)
++DEFINE_PCI_CONFIG(write, dword, u32)
++
++static int conf_space_read(struct pci_dev *dev,
++ const struct config_field_entry *entry,
++ int offset, u32 *value)
+{
-+ struct ioctl_gntdev_unmap_grant_ref op;
-+ struct grant_map *map;
-+ int err = -EINVAL;
++ int ret = 0;
++ const struct config_field *field = entry->field;
+
-+ if (copy_from_user(&op, u, sizeof(op)) != 0)
-+ return -EFAULT;
-+ if (debug)
-+ printk("%s: priv %p, del %d+%d\n", __FUNCTION__, priv,
-+ (int)op.index, (int)op.count);
++ *value = 0;
+
-+ down_write(&priv->sem);
-+ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
-+ if (map)
-+ err = gntdev_del_map(map);
-+ up_write(&priv->sem);
-+ return err;
++ switch (field->size) {
++ case 1:
++ if (field->u.b.read)
++ ret = field->u.b.read(dev, offset, (u8 *) value,
++ entry->data);
++ break;
++ case 2:
++ if (field->u.w.read)
++ ret = field->u.w.read(dev, offset, (u16 *) value,
++ entry->data);
++ break;
++ case 4:
++ if (field->u.dw.read)
++ ret = field->u.dw.read(dev, offset, value, entry->data);
++ break;
++ }
++ return ret;
+}
+
-+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
-+ struct ioctl_gntdev_get_offset_for_vaddr __user *u)
++static int conf_space_write(struct pci_dev *dev,
++ const struct config_field_entry *entry,
++ int offset, u32 value)
+{
-+ struct ioctl_gntdev_get_offset_for_vaddr op;
-+ struct grant_map *map;
-+
-+ if (copy_from_user(&op, u, sizeof(op)) != 0)
-+ return -EFAULT;
-+ if (debug)
-+ printk("%s: priv %p, offset for vaddr %lx\n", __FUNCTION__, priv,
-+ (unsigned long)op.vaddr);
++ int ret = 0;
++ const struct config_field *field = entry->field;
+
-+ down_read(&priv->sem);
-+ map = gntdev_find_map_vaddr(priv, op.vaddr);
-+ if (map == NULL ||
-+ map->vma->vm_start != op.vaddr) {
-+ up_read(&priv->sem);
-+ return -EINVAL;
++ switch (field->size) {
++ case 1:
++ if (field->u.b.write)
++ ret = field->u.b.write(dev, offset, (u8) value,
++ entry->data);
++ break;
++ case 2:
++ if (field->u.w.write)
++ ret = field->u.w.write(dev, offset, (u16) value,
++ entry->data);
++ break;
++ case 4:
++ if (field->u.dw.write)
++ ret = field->u.dw.write(dev, offset, value,
++ entry->data);
++ break;
+ }
-+ op.offset = map->index << PAGE_SHIFT;
-+ op.count = map->count;
-+ up_read(&priv->sem);
-+
-+ if (copy_to_user(u, &op, sizeof(op)) != 0)
-+ return -EFAULT;
-+ return 0;
++ return ret;
+}
+
-+static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
-+ struct ioctl_gntdev_set_max_grants __user *u)
++static inline u32 get_mask(int size)
+{
-+ struct ioctl_gntdev_set_max_grants op;
-+
-+ if (copy_from_user(&op, u, sizeof(op)) != 0)
-+ return -EFAULT;
-+ if (debug)
-+ printk("%s: priv %p, limit %d\n", __FUNCTION__, priv, op.count);
-+ if (op.count > limit)
-+ return -EINVAL;
++ if (size == 1)
++ return 0xff;
++ else if (size == 2)
++ return 0xffff;
++ else
++ return 0xffffffff;
++}
+
-+ down_write(&priv->sem);
-+ priv->limit = op.count;
-+ up_write(&priv->sem);
++static inline int valid_request(int offset, int size)
++{
++ /* Validate request (no un-aligned requests) */
++ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
++ return 1;
+ return 0;
+}
+
-+static long gntdev_ioctl(struct file *flip,
-+ unsigned int cmd, unsigned long arg)
++static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
++ int offset)
+{
-+ struct gntdev_priv *priv = flip->private_data;
-+ void __user *ptr = (void __user *)arg;
-+
-+ switch (cmd) {
-+ case IOCTL_GNTDEV_MAP_GRANT_REF:
-+ return gntdev_ioctl_map_grant_ref(priv, ptr);
-+
-+ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
-+ return gntdev_ioctl_unmap_grant_ref(priv, ptr);
-+
-+ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
-+ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
-+
-+ case IOCTL_GNTDEV_SET_MAX_GRANTS:
-+ return gntdev_ioctl_set_max_grants(priv, ptr);
-+
-+ default:
-+ if (debug)
-+ printk("%s: priv %p, unknown cmd %x\n",
-+ __FUNCTION__, priv, cmd);
-+ return -ENOIOCTLCMD;
++ if (offset >= 0) {
++ new_val_mask <<= (offset * 8);
++ new_val <<= (offset * 8);
++ } else {
++ new_val_mask >>= (offset * -8);
++ new_val >>= (offset * -8);
+ }
++ val = (val & ~new_val_mask) | (new_val & new_val_mask);
+
-+ return 0;
++ return val;
+}
+
-+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
++static int pcibios_err_to_errno(int err)
+{
-+ struct gntdev_priv *priv = flip->private_data;
-+ int index = vma->vm_pgoff;
-+ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-+ struct grant_map *map;
-+ int err = -EINVAL;
++ switch (err) {
++ case PCIBIOS_SUCCESSFUL:
++ return XEN_PCI_ERR_success;
++ case PCIBIOS_DEVICE_NOT_FOUND:
++ return XEN_PCI_ERR_dev_not_found;
++ case PCIBIOS_BAD_REGISTER_NUMBER:
++ return XEN_PCI_ERR_invalid_offset;
++ case PCIBIOS_FUNC_NOT_SUPPORTED:
++ return XEN_PCI_ERR_not_implemented;
++ case PCIBIOS_SET_FAILED:
++ return XEN_PCI_ERR_access_denied;
++ }
++ return err;
++}
+
-+ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
-+ return -EINVAL;
++int pciback_config_read(struct pci_dev *dev, int offset, int size,
++ u32 *ret_val)
++{
++ int err = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ const struct config_field_entry *cfg_entry;
++ const struct config_field *field;
++ int req_start, req_end, field_start, field_end;
++ /* if read fails for any reason, return 0
++ * (as if device didn't respond) */
++ u32 value = 0, tmp_val;
++
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
++ pci_name(dev), size, offset);
+
-+ if (debug)
-+ printk("%s: map %d+%d at %lx (pgoff %lx)\n", __FUNCTION__,
-+ index, count, vma->vm_start, vma->vm_pgoff);
++ if (!valid_request(offset, size)) {
++ err = XEN_PCI_ERR_invalid_offset;
++ goto out;
++ }
+
-+ down_read(&priv->sem);
-+ map = gntdev_find_map_index(priv, index, count);
-+ if (!map)
-+ goto unlock_out;
-+ if (map->vma)
-+ goto unlock_out;
-+ if (priv->mm != vma->vm_mm) {
-+ printk("%s: Huh? Other mm?\n", __FUNCTION__);
-+ goto unlock_out;
++ /* Get the real value first, then modify as appropriate */
++ switch (size) {
++ case 1:
++ err = pci_read_config_byte(dev, offset, (u8 *) &value);
++ break;
++ case 2:
++ err = pci_read_config_word(dev, offset, (u16 *) &value);
++ break;
++ case 4:
++ err = pci_read_config_dword(dev, offset, &value);
++ break;
+ }
+
-+ vma->vm_ops = &gntdev_vmops;
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
+
-+ vma->vm_flags |= VM_RESERVED;
-+ vma->vm_flags |= VM_DONTCOPY;
-+ vma->vm_flags |= VM_DONTEXPAND;
++ req_start = offset;
++ req_end = offset + size;
++ field_start = OFFSET(cfg_entry);
++ field_end = OFFSET(cfg_entry) + field->size;
++
++ if ((req_start >= field_start && req_start < field_end)
++ || (req_end > field_start && req_end <= field_end)) {
++ err = conf_space_read(dev, cfg_entry, field_start,
++ &tmp_val);
++ if (err)
++ goto out;
++
++ value = merge_value(value, tmp_val,
++ get_mask(field->size),
++ field_start - req_start);
++ }
++ }
+
-+ vma->vm_private_data = map;
-+ map->vma = vma;
++out:
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
++ pci_name(dev), size, offset, value);
++
++ *ret_val = value;
++ return pcibios_err_to_errno(err);
++}
++
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
++{
++ int err = 0, handled = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ const struct config_field_entry *cfg_entry;
++ const struct config_field *field;
++ u32 tmp_val;
++ int req_start, req_end, field_start, field_end;
++
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG
++ "pciback: %s: write request %d bytes at 0x%x = %x\n",
++ pci_name(dev), size, offset, value);
++
++ if (!valid_request(offset, size))
++ return XEN_PCI_ERR_invalid_offset;
++
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ req_start = offset;
++ req_end = offset + size;
++ field_start = OFFSET(cfg_entry);
++ field_end = OFFSET(cfg_entry) + field->size;
++
++ if ((req_start >= field_start && req_start < field_end)
++ || (req_end > field_start && req_end <= field_end)) {
++ tmp_val = 0;
++
++ err = pciback_config_read(dev, field_start,
++ field->size, &tmp_val);
++ if (err)
++ break;
+
-+ map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte;
-+ if (!(vma->vm_flags & VM_WRITE))
-+ map->flags |= GNTMAP_readonly;
++ tmp_val = merge_value(tmp_val, value, get_mask(size),
++ req_start - field_start);
+
-+ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
-+ vma->vm_end - vma->vm_start,
-+ find_grant_ptes, map);
-+ if (err) {
-+ goto unlock_out;
-+ if (debug)
-+ printk("%s: find_grant_ptes() failure.\n", __FUNCTION__);
++ err = conf_space_write(dev, cfg_entry, field_start,
++ tmp_val);
++
++ /* handled is set true here, but not every byte
++ * may have been written! Properly detecting if
++ * every byte is handled is unnecessary as the
++ * flag is used to detect devices that need
++ * special helpers to work correctly.
++ */
++ handled = 1;
++ }
+ }
+
-+ err = map_grant_pages(map);
-+ if (err) {
-+ goto unlock_out;
-+ if (debug)
-+ printk("%s: map_grant_pages() failure.\n", __FUNCTION__);
++ if (!handled && !err) {
++ /* By default, anything not specificially handled above is
++ * read-only. The permissive flag changes this behavior so
++ * that anything not specifically handled above is writable.
++ * This means that some fields may still be read-only because
++ * they have entries in the config_field list that intercept
++ * the write and do nothing. */
++ if (dev_data->permissive || permissive) {
++ switch (size) {
++ case 1:
++ err = pci_write_config_byte(dev, offset,
++ (u8) value);
++ break;
++ case 2:
++ err = pci_write_config_word(dev, offset,
++ (u16) value);
++ break;
++ case 4:
++ err = pci_write_config_dword(dev, offset,
++ (u32) value);
++ break;
++ }
++ } else if (!dev_data->warned_on_write) {
++ dev_data->warned_on_write = 1;
++ dev_warn(&dev->dev, "Driver tried to write to a "
++ "read-only configuration space field at offset"
++ " 0x%x, size %d. This may be harmless, but if "
++ "you have problems with your device:\n"
++ "1) see permissive attribute in sysfs\n"
++ "2) report problems to the xen-devel "
++ "mailing list along with details of your "
++ "device obtained from lspci.\n", offset, size);
++ }
+ }
-+ map->is_mapped = 1;
+
-+unlock_out:
-+ up_read(&priv->sem);
-+ return err;
++ return pcibios_err_to_errno(err);
+}
+
-+static const struct file_operations gntdev_fops = {
-+ .owner = THIS_MODULE,
-+ .open = gntdev_open,
-+ .release = gntdev_release,
-+ .mmap = gntdev_mmap,
-+ .unlocked_ioctl = gntdev_ioctl
-+};
++void pciback_config_free_dyn_fields(struct pci_dev *dev)
++{
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry, *t;
++ const struct config_field *field;
+
-+static struct miscdevice gntdev_miscdev = {
-+ .minor = MISC_DYNAMIC_MINOR,
-+ .name = "gntdev",
-+ .fops = &gntdev_fops,
-+};
++ dev_dbg(&dev->dev, "free-ing dynamically allocated virtual "
++ "configuration space fields\n");
++ if (!dev_data)
++ return;
+
-+/* ------------------------------------------------------------------ */
++ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
+
-+static int __init gntdev_init(void)
-+{
-+ int err;
++ if (field->clean) {
++ field->clean((struct config_field *)field);
+
-+ if (!xen_domain())
-+ return -ENODEV;
++ kfree(cfg_entry->data);
++
++ list_del(&cfg_entry->list);
++ kfree(cfg_entry);
++ }
+
-+ err = misc_register(&gntdev_miscdev);
-+ if (err != 0) {
-+ printk(KERN_ERR "Could not register gntdev device\n");
-+ return err;
+ }
-+ return 0;
+}
+
-+static void __exit gntdev_exit(void)
++void pciback_config_reset_dev(struct pci_dev *dev)
+{
-+ misc_deregister(&gntdev_miscdev);
-+}
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ const struct config_field_entry *cfg_entry;
++ const struct config_field *field;
+
-+module_init(gntdev_init);
-+module_exit(gntdev_exit);
++ dev_dbg(&dev->dev, "resetting virtual configuration space\n");
++ if (!dev_data)
++ return;
+
-+/* ------------------------------------------------------------------ */
-diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
-index 7d8f531..76fe621 100644
---- a/drivers/xen/grant-table.c
-+++ b/drivers/xen/grant-table.c
-@@ -37,6 +37,7 @@
- #include <linux/vmalloc.h>
- #include <linux/uaccess.h>
-
-+#include <xen/xen.h>
- #include <xen/interface/xen.h>
- #include <xen/page.h>
- #include <xen/grant_table.h>
-@@ -472,6 +473,111 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
- return 0;
- }
-
-+static void gnttab_page_free(struct page *page, unsigned int order)
-+{
-+ BUG_ON(order);
-+ ClearPageForeign(page);
-+ gnttab_reset_grant_page(page);
-+ put_page(page);
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ if (field->reset)
++ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
++ }
+}
+
-+/*
-+ * Must not be called with IRQs off. This should only be used on the
-+ * slow path.
-+ *
-+ * Copy a foreign granted page to local memory.
-+ */
-+int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
++void pciback_config_free_dev(struct pci_dev *dev)
+{
-+ struct gnttab_unmap_and_replace unmap;
-+ struct mmu_update mmu;
-+ struct page *page;
-+ struct page *new_page;
-+ void *new_addr;
-+ void *addr;
-+ unsigned long pfn;
-+ unsigned long mfn;
-+ unsigned long new_mfn;
-+ int err;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry, *t;
++ const struct config_field *field;
+
-+ page = *pagep;
-+ if (!get_page_unless_zero(page))
-+ return -ENOENT;
++ dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
++ if (!dev_data)
++ return;
+
-+ err = -ENOMEM;
-+ new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
-+ if (!new_page)
-+ goto out;
++ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
++ list_del(&cfg_entry->list);
+
-+ new_addr = page_address(new_page);
-+ addr = page_address(page);
-+ memcpy(new_addr, addr, PAGE_SIZE);
++ field = cfg_entry->field;
+
-+ pfn = page_to_pfn(page);
-+ mfn = pfn_to_mfn(pfn);
-+ new_mfn = virt_to_mfn(new_addr);
++ if (field->release)
++ field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
+
-+// write_seqlock(&gnttab_dma_lock); /* protects __gnttab_dma_map_page on 2.6.18 */
++ kfree(cfg_entry);
++ }
++}
+
-+ /* Make seq visible before checking page_mapped. */
-+ smp_mb();
++int pciback_config_add_field_offset(struct pci_dev *dev,
++ const struct config_field *field,
++ unsigned int base_offset)
++{
++ int err = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry;
++ void *tmp;
++
++ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
++ if (!cfg_entry) {
++ err = -ENOMEM;
++ goto out;
++ }
+
-+ /* Has the page been DMA-mapped? */
-+ if (unlikely(page_mapped(page))) {
-+ //write_sequnlock(&gnttab_dma_lock);
-+ put_page(new_page);
-+ err = -EBUSY;
++ cfg_entry->data = NULL;
++ cfg_entry->field = field;
++ cfg_entry->base_offset = base_offset;
++
++ /* silently ignore duplicate fields */
++ err = pciback_field_is_dup(dev, OFFSET(cfg_entry));
++ if (err)
+ goto out;
++
++ if (field->init) {
++ tmp = field->init(dev, OFFSET(cfg_entry));
++
++ if (IS_ERR(tmp)) {
++ err = PTR_ERR(tmp);
++ goto out;
++ }
++
++ cfg_entry->data = tmp;
+ }
+
-+ if (!xen_feature(XENFEAT_auto_translated_physmap))
-+ set_phys_to_machine(pfn, new_mfn);
++ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
++ OFFSET(cfg_entry));
++ list_add_tail(&cfg_entry->list, &dev_data->config_fields);
+
-+ //gnttab_set_replace_op(&unmap, (unsigned long)addr,
-+ // (unsigned long)new_addr, ref);
-+ unmap.host_addr = (unsigned long)addr;
-+ unmap.new_addr = (unsigned long)new_addr;
-+ unmap.handle = ref;
++out:
++ if (err)
++ kfree(cfg_entry);
+
-+ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
-+ &unmap, 1);
-+ BUG_ON(err);
-+ BUG_ON(unmap.status);
++ return err;
++}
+
-+// write_sequnlock(&gnttab_dma_lock);
++/* This sets up the device's virtual configuration space to keep track of
++ * certain registers (like the base address registers (BARs) so that we can
++ * keep the client from manipulating them directly.
++ */
++int pciback_config_init_dev(struct pci_dev *dev)
++{
++ int err = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+
-+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-+ set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
++ dev_dbg(&dev->dev, "initializing virtual configuration space\n");
+
-+ mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
-+ mmu.val = pfn;
-+ err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
-+ BUG_ON(err);
-+ }
++ INIT_LIST_HEAD(&dev_data->config_fields);
+
-+ new_page->mapping = page->mapping;
-+ SetPageForeign(new_page, _PageForeignDestructor(page));
-+ if (PageReserved(page))
-+ SetPageReserved(new_page);
-+ *pagep = new_page;
++ err = pciback_config_header_add_fields(dev);
++ if (err)
++ goto out;
+
-+ SetPageForeign(page, gnttab_page_free);
-+ ClearPageReserved(page);
-+ page->mapping = NULL;
++ err = pciback_config_capability_add_fields(dev);
++ if (err)
++ goto out;
++
++ err = pciback_config_quirks_init(dev);
+
+out:
-+ put_page(page);
+ return err;
+}
-+EXPORT_SYMBOL_GPL(gnttab_copy_grant_page);
+
-+void gnttab_reset_grant_page(struct page *page)
++int pciback_config_init(void)
+{
-+ init_page_count(page);
-+ reset_page_mapcount(page);
++ return pciback_config_capability_init();
+}
-+EXPORT_SYMBOL_GPL(gnttab_reset_grant_page);
-+
- int gnttab_resume(void)
- {
- if (max_nr_grant_frames() < nr_grant_frames)
-diff --git a/drivers/xen/netback/Makefile b/drivers/xen/netback/Makefile
-new file mode 100644
-index 0000000..e346e81
---- /dev/null
-+++ b/drivers/xen/netback/Makefile
-@@ -0,0 +1,3 @@
-+obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
-+
-+xen-netback-y := netback.o xenbus.o interface.o
-diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
+diff --git a/drivers/xen/pciback/conf_space.h b/drivers/xen/pciback/conf_space.h
new file mode 100644
-index 0000000..51f97c0
+index 0000000..50ebef2
--- /dev/null
-+++ b/drivers/xen/netback/common.h
-@@ -0,0 +1,227 @@
-+/******************************************************************************
-+ * arch/xen/drivers/netif/backend/common.h
-+ *
-+ * This program is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU General Public License version 2
-+ * as published by the Free Software Foundation; or, when distributed
-+ * separately from the Linux kernel or incorporated into other
-+ * software packages, subject to the following license:
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-+ * of this source file (the "Software"), to deal in the Software without
-+ * restriction, including without limitation the rights to use, copy, modify,
-+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-+ * and to permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
++++ b/drivers/xen/pciback/conf_space.h
+@@ -0,0 +1,126 @@
++/*
++ * PCI Backend - Common data structures for overriding the configuration space
+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+ * IN THE SOFTWARE.
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
-+#ifndef __NETIF__BACKEND__COMMON_H__
-+#define __NETIF__BACKEND__COMMON_H__
++#ifndef __XEN_PCIBACK_CONF_SPACE_H__
++#define __XEN_PCIBACK_CONF_SPACE_H__
+
-+#include <linux/version.h>
-+#include <linux/module.h>
-+#include <linux/interrupt.h>
-+#include <linux/slab.h>
-+#include <linux/ip.h>
-+#include <linux/in.h>
-+#include <linux/netdevice.h>
-+#include <linux/etherdevice.h>
-+#include <linux/wait.h>
-+#include <linux/sched.h>
++#include <linux/list.h>
++#include <linux/err.h>
+
-+#include <xen/interface/io/netif.h>
-+#include <asm/io.h>
-+#include <asm/pgalloc.h>
-+#include <xen/interface/grant_table.h>
-+#include <xen/grant_table.h>
-+#include <xen/xenbus.h>
++/* conf_field_init can return an errno in a ptr with ERR_PTR() */
++typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
++typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
++typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
++
++typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
++ void *data);
++typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
++ void *data);
++typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
++ void *data);
++typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
++ void *data);
++typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
++ void *data);
++typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
++ void *data);
++
++/* These are the fields within the configuration space which we
++ * are interested in intercepting reads/writes to and changing their
++ * values.
++ */
++struct config_field {
++ unsigned int offset;
++ unsigned int size;
++ unsigned int mask;
++ conf_field_init init;
++ conf_field_reset reset;
++ conf_field_free release;
++ void (*clean) (struct config_field *field);
++ union {
++ struct {
++ conf_dword_write write;
++ conf_dword_read read;
++ } dw;
++ struct {
++ conf_word_write write;
++ conf_word_read read;
++ } w;
++ struct {
++ conf_byte_write write;
++ conf_byte_read read;
++ } b;
++ } u;
++ struct list_head list;
++};
+
-+#define DPRINTK(_f, _a...) \
-+ pr_debug("(file=%s, line=%d) " _f, \
-+ __FILE__ , __LINE__ , ## _a )
-+#define IPRINTK(fmt, args...) \
-+ printk(KERN_INFO "xen_net: " fmt, ##args)
-+#define WPRINTK(fmt, args...) \
-+ printk(KERN_WARNING "xen_net: " fmt, ##args)
++struct config_field_entry {
++ struct list_head list;
++ const struct config_field *field;
++ unsigned int base_offset;
++ void *data;
++};
+
-+struct xen_netif {
-+ /* Unique identifier for this interface. */
-+ domid_t domid;
-+ unsigned int handle;
++#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
+
-+ u8 fe_dev_addr[6];
++/* Add fields to a device - the add_fields macro expects to get a pointer to
++ * the first entry in an array (of which the ending is marked by size==0)
++ */
++int pciback_config_add_field_offset(struct pci_dev *dev,
++ const struct config_field *field,
++ unsigned int offset);
++
++static inline int pciback_config_add_field(struct pci_dev *dev,
++ const struct config_field *field)
++{
++ return pciback_config_add_field_offset(dev, field, 0);
++}
++
++static inline int pciback_config_add_fields(struct pci_dev *dev,
++ const struct config_field *field)
++{
++ int i, err = 0;
++ for (i = 0; field[i].size != 0; i++) {
++ err = pciback_config_add_field(dev, &field[i]);
++ if (err)
++ break;
++ }
++ return err;
++}
++
++static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
++ const struct config_field *field,
++ unsigned int offset)
++{
++ int i, err = 0;
++ for (i = 0; field[i].size != 0; i++) {
++ err = pciback_config_add_field_offset(dev, &field[i], offset);
++ if (err)
++ break;
++ }
++ return err;
++}
+
-+ /* Physical parameters of the comms window. */
-+ grant_handle_t tx_shmem_handle;
-+ grant_ref_t tx_shmem_ref;
-+ grant_handle_t rx_shmem_handle;
-+ grant_ref_t rx_shmem_ref;
-+ unsigned int irq;
++/* Read/Write the real configuration space */
++int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 *value,
++ void *data);
++int pciback_read_config_word(struct pci_dev *dev, int offset, u16 *value,
++ void *data);
++int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 *value,
++ void *data);
++int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
++ void *data);
++int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
++ void *data);
++int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
++ void *data);
+
-+ /* The shared rings and indexes. */
-+ struct xen_netif_tx_back_ring tx;
-+ struct xen_netif_rx_back_ring rx;
-+ struct vm_struct *tx_comms_area;
-+ struct vm_struct *rx_comms_area;
++int pciback_config_capability_init(void);
+
-+ /* Set of features that can be turned on in dev->features. */
-+ int features;
++int pciback_config_header_add_fields(struct pci_dev *dev);
++int pciback_config_capability_add_fields(struct pci_dev *dev);
+
-+ int smart_poll;
++#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */
+diff --git a/drivers/xen/pciback/conf_space_capability.c b/drivers/xen/pciback/conf_space_capability.c
+new file mode 100644
+index 0000000..0ea84d6
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability.c
+@@ -0,0 +1,66 @@
++/*
++ * PCI Backend - Handles the virtual fields found on the capability lists
++ * in the configuration space.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
+
-+ /* Internal feature information. */
-+ u8 can_queue:1; /* can queue packets for receiver? */
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_capability.h"
+
-+ /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
-+ RING_IDX rx_req_cons_peek;
++static LIST_HEAD(capabilities);
+
-+ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
-+ unsigned long credit_bytes;
-+ unsigned long credit_usec;
-+ unsigned long remaining_credit;
-+ struct timer_list credit_timeout;
++static const struct config_field caplist_header[] = {
++ {
++ .offset = PCI_CAP_LIST_ID,
++ .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
++ .u.w.read = pciback_read_config_word,
++ .u.w.write = NULL,
++ },
++ {}
++};
+
-+ /* Enforce draining of the transmit queue. */
-+ struct timer_list tx_queue_timeout;
++static inline void register_capability(struct pciback_config_capability *cap)
++{
++ list_add_tail(&cap->cap_list, &capabilities);
++}
+
-+ /* Statistics */
-+ int nr_copied_skbs;
++int pciback_config_capability_add_fields(struct pci_dev *dev)
++{
++ int err = 0;
++ struct pciback_config_capability *cap;
++ int cap_offset;
+
-+ /* Miscellaneous private stuff. */
-+ struct list_head list; /* scheduling list */
-+ atomic_t refcnt;
-+ struct net_device *dev;
-+ struct net_device_stats stats;
++ list_for_each_entry(cap, &capabilities, cap_list) {
++ cap_offset = pci_find_capability(dev, cap->capability);
++ if (cap_offset) {
++ dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
++ cap->capability, cap_offset);
++
++ err = pciback_config_add_fields_offset(dev,
++ caplist_header,
++ cap_offset);
++ if (err)
++ goto out;
++ err = pciback_config_add_fields_offset(dev,
++ cap->fields,
++ cap_offset);
++ if (err)
++ goto out;
++ }
++ }
+
-+ unsigned int carrier;
++out:
++ return err;
++}
+
-+ wait_queue_head_t waiting_to_free;
-+};
++int pciback_config_capability_init(void)
++{
++ register_capability(&pciback_config_capability_vpd);
++ register_capability(&pciback_config_capability_pm);
+
++ return 0;
++}
+diff --git a/drivers/xen/pciback/conf_space_capability.h b/drivers/xen/pciback/conf_space_capability.h
+new file mode 100644
+index 0000000..8da3ac4
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability.h
+@@ -0,0 +1,26 @@
+/*
-+ * Implement our own carrier flag: the network stack's version causes delays
-+ * when the carrier is re-enabled (in particular, dev_activate() may not
-+ * immediately be called, which can cause packet loss; also the etherbridge
-+ * can be rather lazy in activating its port).
++ * PCI Backend - Data structures for special overlays for structures on
++ * the capability list.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
-+#define netback_carrier_on(netif) ((netif)->carrier = 1)
-+#define netback_carrier_off(netif) ((netif)->carrier = 0)
-+#define netback_carrier_ok(netif) ((netif)->carrier)
-+
-+enum {
-+ NETBK_DONT_COPY_SKB,
-+ NETBK_DELAYED_COPY_SKB,
-+ NETBK_ALWAYS_COPY_SKB,
-+};
+
-+extern int netbk_copy_skb_mode;
++#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
++#define __PCIBACK_CONFIG_CAPABILITY_H__
+
-+/* Function pointers into netback accelerator plugin modules */
-+struct netback_accel_hooks {
-+ struct module *owner;
-+ int (*probe)(struct xenbus_device *dev);
-+ int (*remove)(struct xenbus_device *dev);
-+};
++#include <linux/pci.h>
++#include <linux/list.h>
+
-+/* Structure to track the state of a netback accelerator plugin */
-+struct netback_accelerator {
-+ struct list_head link;
-+ int id;
-+ char *eth_name;
-+ atomic_t use_count;
-+ struct netback_accel_hooks *hooks;
-+};
++struct pciback_config_capability {
++ struct list_head cap_list;
+
-+struct backend_info {
-+ struct xenbus_device *dev;
-+ struct xen_netif *netif;
-+ enum xenbus_state frontend_state;
-+ struct xenbus_watch hotplug_status_watch;
-+ int have_hotplug_status_watch:1;
++ int capability;
+
-+ /* State relating to the netback accelerator */
-+ void *netback_accel_priv;
-+ /* The accelerator that this backend is currently using */
-+ struct netback_accelerator *accelerator;
++ /* If the device has the capability found above, add these fields */
++ const struct config_field *fields;
+};
+
-+#define NETBACK_ACCEL_VERSION 0x00010001
++extern struct pciback_config_capability pciback_config_capability_vpd;
++extern struct pciback_config_capability pciback_config_capability_pm;
+
++#endif
+diff --git a/drivers/xen/pciback/conf_space_capability_msi.c b/drivers/xen/pciback/conf_space_capability_msi.c
+new file mode 100644
+index 0000000..b70ea8b
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability_msi.c
+@@ -0,0 +1,84 @@
+/*
-+ * Connect an accelerator plugin module to netback. Returns zero on
-+ * success, < 0 on error, > 0 (with highest version number supported)
-+ * if version mismatch.
++ * PCI Backend -- Configuration overlay for MSI capability
+ */
-+extern int netback_connect_accelerator(unsigned version,
-+ int id, const char *eth_name,
-+ struct netback_accel_hooks *hooks);
-+/* Disconnect a previously connected accelerator plugin module */
-+extern void netback_disconnect_accelerator(int id, const char *eth_name);
++#include <linux/pci.h>
++#include <linux/slab.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
++#include <xen/interface/io/pciif.h>
++#include <xen/events.h>
++#include "pciback.h"
+
++int pciback_enable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
++{
++ int otherend = pdev->xdev->otherend_id;
++ int status;
+
-+extern
-+void netback_probe_accelerators(struct backend_info *be,
-+ struct xenbus_device *dev);
-+extern
-+void netback_remove_accelerators(struct backend_info *be,
-+ struct xenbus_device *dev);
-+extern
-+void netif_accel_init(void);
++ status = pci_enable_msi(dev);
+
++ if (status) {
++ printk(KERN_ERR "error enable msi for guest %x status %x\n",
++ otherend, status);
++ op->value = 0;
++ return XEN_PCI_ERR_op_failed;
++ }
+
-+#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
-+#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
++ /* The value the guest needs is actually the IDT vector, not the
++ * the local domain's IRQ number. */
++ op->value = xen_gsi_from_irq(dev->irq);
++ return 0;
++}
+
-+void netif_disconnect(struct xen_netif *netif);
++int pciback_disable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
++{
++ pci_disable_msi(dev);
+
-+struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle);
-+int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
-+ unsigned long rx_ring_ref, unsigned int evtchn);
++ op->value = xen_gsi_from_irq(dev->irq);
++ return 0;
++}
+
-+static inline void netif_get(struct xen_netif *netif)
++int pciback_enable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
+{
-+ atomic_inc(&netif->refcnt);
++ int i, result;
++ struct msix_entry *entries;
++
++ if (op->value > SH_INFO_MAX_VEC)
++ return -EINVAL;
++
++ entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
++ if (entries == NULL)
++ return -ENOMEM;
++
++ for (i = 0; i < op->value; i++) {
++ entries[i].entry = op->msix_entries[i].entry;
++ entries[i].vector = op->msix_entries[i].vector;
++ }
++
++ result = pci_enable_msix(dev, entries, op->value);
++
++ for (i = 0; i < op->value; i++) {
++ op->msix_entries[i].entry = entries[i].entry;
++ op->msix_entries[i].vector =
++ xen_gsi_from_irq(entries[i].vector);
++ }
++
++ kfree(entries);
++
++ op->value = result;
++
++ return result;
+}
+
-+static inline void netif_put(struct xen_netif *netif)
++int pciback_disable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
+{
-+ if (atomic_dec_and_test(&netif->refcnt))
-+ wake_up(&netif->waiting_to_free);
++
++ pci_disable_msix(dev);
++
++ op->value = xen_gsi_from_irq(dev->irq);
++ return 0;
+}
+
-+int netif_xenbus_init(void);
+diff --git a/drivers/xen/pciback/conf_space_capability_pm.c b/drivers/xen/pciback/conf_space_capability_pm.c
+new file mode 100644
+index 0000000..0442616
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability_pm.c
+@@ -0,0 +1,113 @@
++/*
++ * PCI Backend - Configuration space overlay for power management
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
+
-+#define netif_schedulable(netif) \
-+ (netif_running((netif)->dev) && netback_carrier_ok(netif))
++#include <linux/pci.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
+
-+void netif_schedule_work(struct xen_netif *netif);
-+void netif_deschedule_work(struct xen_netif *netif);
++static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
++ void *data)
++{
++ int err;
++ u16 real_value;
+
-+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
-+struct net_device_stats *netif_be_get_stats(struct net_device *dev);
-+irqreturn_t netif_be_int(int irq, void *dev_id);
++ err = pci_read_config_word(dev, offset, &real_value);
++ if (err)
++ goto out;
+
-+static inline int netbk_can_queue(struct net_device *dev)
++ *value = real_value & ~PCI_PM_CAP_PME_MASK;
++
++out:
++ return err;
++}
++
++/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
++ * Can't allow driver domain to enable PMEs - they're shared */
++#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
++
++static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
++ void *data)
+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+ return netif->can_queue;
++ int err;
++ u16 old_value;
++ pci_power_t new_state, old_state;
++
++ err = pci_read_config_word(dev, offset, &old_value);
++ if (err)
++ goto out;
++
++ old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
++ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
++
++ new_value &= PM_OK_BITS;
++ if ((old_value & PM_OK_BITS) != new_value) {
++ new_value = (old_value & ~PM_OK_BITS) | new_value;
++ err = pci_write_config_word(dev, offset, new_value);
++ if (err)
++ goto out;
++ }
++
++ /* Let pci core handle the power management change */
++ dev_dbg(&dev->dev, "set power state to %x\n", new_state);
++ err = pci_set_power_state(dev, new_state);
++ if (err) {
++ err = PCIBIOS_SET_FAILED;
++ goto out;
++ }
++
++ out:
++ return err;
+}
+
-+static inline int netbk_can_sg(struct net_device *dev)
++/* Ensure PMEs are disabled */
++static void *pm_ctrl_init(struct pci_dev *dev, int offset)
+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+ return netif->features & NETIF_F_SG;
++ int err;
++ u16 value;
++
++ err = pci_read_config_word(dev, offset, &value);
++ if (err)
++ goto out;
++
++ if (value & PCI_PM_CTRL_PME_ENABLE) {
++ value &= ~PCI_PM_CTRL_PME_ENABLE;
++ err = pci_write_config_word(dev, offset, value);
++ }
++
++out:
++ return ERR_PTR(err);
+}
+
-+#endif /* __NETIF__BACKEND__COMMON_H__ */
-diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
++static const struct config_field caplist_pm[] = {
++ {
++ .offset = PCI_PM_PMC,
++ .size = 2,
++ .u.w.read = pm_caps_read,
++ },
++ {
++ .offset = PCI_PM_CTRL,
++ .size = 2,
++ .init = pm_ctrl_init,
++ .u.w.read = pciback_read_config_word,
++ .u.w.write = pm_ctrl_write,
++ },
++ {
++ .offset = PCI_PM_PPB_EXTENSIONS,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {
++ .offset = PCI_PM_DATA_REGISTER,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {}
++};
++
++struct pciback_config_capability pciback_config_capability_pm = {
++ .capability = PCI_CAP_ID_PM,
++ .fields = caplist_pm,
++};
+diff --git a/drivers/xen/pciback/conf_space_capability_vpd.c b/drivers/xen/pciback/conf_space_capability_vpd.c
new file mode 100644
-index 0000000..b23b14d
+index 0000000..e7b4d66
--- /dev/null
-+++ b/drivers/xen/netback/interface.c
-@@ -0,0 +1,405 @@
-+/******************************************************************************
-+ * arch/xen/drivers/netif/backend/interface.c
-+ *
-+ * Network-device interface management.
-+ *
-+ * Copyright (c) 2004-2005, Keir Fraser
-+ *
-+ * This program is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU General Public License version 2
-+ * as published by the Free Software Foundation; or, when distributed
-+ * separately from the Linux kernel or incorporated into other
-+ * software packages, subject to the following license:
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-+ * of this source file (the "Software"), to deal in the Software without
-+ * restriction, including without limitation the rights to use, copy, modify,
-+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-+ * and to permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
++++ b/drivers/xen/pciback/conf_space_capability_vpd.c
+@@ -0,0 +1,40 @@
++/*
++ * PCI Backend - Configuration space overlay for Vital Product Data
+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+ * IN THE SOFTWARE.
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
-+#include "common.h"
-+#include <linux/ethtool.h>
-+#include <linux/rtnetlink.h>
++#include <linux/pci.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
+
-+#include <xen/events.h>
-+#include <asm/xen/hypercall.h>
++static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
++ void *data)
++{
++ /* Disallow writes to the vital product data */
++ if (value & PCI_VPD_ADDR_F)
++ return PCIBIOS_SET_FAILED;
++ else
++ return pci_write_config_word(dev, offset, value);
++}
++
++static const struct config_field caplist_vpd[] = {
++ {
++ .offset = PCI_VPD_ADDR,
++ .size = 2,
++ .u.w.read = pciback_read_config_word,
++ .u.w.write = vpd_address_write,
++ },
++ {
++ .offset = PCI_VPD_DATA,
++ .size = 4,
++ .u.dw.read = pciback_read_config_dword,
++ .u.dw.write = NULL,
++ },
++ {}
++};
+
++struct pciback_config_capability pciback_config_capability_vpd = {
++ .capability = PCI_CAP_ID_VPD,
++ .fields = caplist_vpd,
++};
+diff --git a/drivers/xen/pciback/conf_space_header.c b/drivers/xen/pciback/conf_space_header.c
+new file mode 100644
+index 0000000..1f4f86e
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_header.c
+@@ -0,0 +1,379 @@
+/*
-+ * Module parameter 'queue_length':
++ * PCI Backend - Handles the virtual fields in the configuration space headers.
+ *
-+ * Enables queuing in the network stack when a client has run out of receive
-+ * descriptors. Although this feature can improve receive bandwidth by avoiding
-+ * packet loss, it can also result in packets sitting in the 'tx_queue' for
-+ * unbounded time. This is bad if those packets hold onto foreign resources.
-+ * For example, consider a packet that holds onto resources belonging to the
-+ * guest for which it is queued (e.g., packet received on vif1.0, destined for
-+ * vif1.1 which is not activated in the guest): in this situation the guest
-+ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
-+ * run a timer (tx_queue_timeout) to drain the queue when the interface is
-+ * blocked.
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
+ */
-+static unsigned long netbk_queue_length = 32;
-+module_param_named(queue_length, netbk_queue_length, ulong, 0644);
+
-+static void __netif_up(struct xen_netif *netif)
-+{
-+ enable_irq(netif->irq);
-+ netif_schedule_work(netif);
-+}
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
+
-+static void __netif_down(struct xen_netif *netif)
++struct pci_bar_info {
++ u32 val;
++ u32 len_val;
++ int which;
++};
++
++#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
++#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
++
++static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
+{
-+ disable_irq(netif->irq);
-+ netif_deschedule_work(netif);
++ int i;
++ int ret;
++
++ ret = pciback_read_config_word(dev, offset, value, data);
++ if (!atomic_read(&dev->enable_cnt))
++ return ret;
++
++ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
++ if (dev->resource[i].flags & IORESOURCE_IO)
++ *value |= PCI_COMMAND_IO;
++ if (dev->resource[i].flags & IORESOURCE_MEM)
++ *value |= PCI_COMMAND_MEMORY;
++ }
++
++ return ret;
+}
+
-+static int net_open(struct net_device *dev)
++static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+ if (netback_carrier_ok(netif)) {
-+ __netif_up(netif);
-+ netif_start_queue(dev);
++ int err;
++
++ if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: enable\n",
++ pci_name(dev));
++ err = pci_enable_device(dev);
++ if (err)
++ return err;
++ } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: disable\n",
++ pci_name(dev));
++ pci_disable_device(dev);
++ }
++
++ if (!dev->is_busmaster && is_master_cmd(value)) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: set bus master\n",
++ pci_name(dev));
++ pci_set_master(dev);
++ }
++
++ if (value & PCI_COMMAND_INVALIDATE) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG
++ "pciback: %s: enable memory-write-invalidate\n",
++ pci_name(dev));
++ err = pci_set_mwi(dev);
++ if (err) {
++ printk(KERN_WARNING
++ "pciback: %s: cannot enable "
++ "memory-write-invalidate (%d)\n",
++ pci_name(dev), err);
++ value &= ~PCI_COMMAND_INVALIDATE;
++ }
+ }
-+ return 0;
++
++ return pci_write_config_word(dev, offset, value);
+}
+
-+static int net_close(struct net_device *dev)
++static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+ if (netback_carrier_ok(netif))
-+ __netif_down(netif);
-+ netif_stop_queue(dev);
++ struct pci_bar_info *bar = data;
++
++ if (unlikely(!bar)) {
++ printk(KERN_WARNING "pciback: driver data not found for %s\n",
++ pci_name(dev));
++ return XEN_PCI_ERR_op_failed;
++ }
++
++ /* A write to obtain the length must happen as a 32-bit write.
++ * This does not (yet) support writing individual bytes
++ */
++ if (value == ~PCI_ROM_ADDRESS_ENABLE)
++ bar->which = 1;
++ else {
++ u32 tmpval;
++ pci_read_config_dword(dev, offset, &tmpval);
++ if (tmpval != bar->val && value == bar->val) {
++ /* Allow restoration of bar value. */
++ pci_write_config_dword(dev, offset, bar->val);
++ }
++ bar->which = 0;
++ }
++
++ /* Do we need to support enabling/disabling the rom address here? */
++
+ return 0;
+}
+
-+static int netbk_change_mtu(struct net_device *dev, int mtu)
++/* For the BARs, only allow writes which write ~0 or
++ * the correct resource information
++ * (Needed for when the driver probes the resource usage)
++ */
++static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
-+ int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
++ struct pci_bar_info *bar = data;
++
++ if (unlikely(!bar)) {
++ printk(KERN_WARNING "pciback: driver data not found for %s\n",
++ pci_name(dev));
++ return XEN_PCI_ERR_op_failed;
++ }
++
++ /* A write to obtain the length must happen as a 32-bit write.
++ * This does not (yet) support writing individual bytes
++ */
++ if (value == ~0)
++ bar->which = 1;
++ else {
++ u32 tmpval;
++ pci_read_config_dword(dev, offset, &tmpval);
++ if (tmpval != bar->val && value == bar->val) {
++ /* Allow restoration of bar value. */
++ pci_write_config_dword(dev, offset, bar->val);
++ }
++ bar->which = 0;
++ }
+
-+ if (mtu > max)
-+ return -EINVAL;
-+ dev->mtu = mtu;
+ return 0;
+}
+
-+static int netbk_set_sg(struct net_device *dev, u32 data)
++static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
+{
-+ if (data) {
-+ struct xen_netif *netif = netdev_priv(dev);
++ struct pci_bar_info *bar = data;
+
-+ if (!(netif->features & NETIF_F_SG))
-+ return -ENOSYS;
++ if (unlikely(!bar)) {
++ printk(KERN_WARNING "pciback: driver data not found for %s\n",
++ pci_name(dev));
++ return XEN_PCI_ERR_op_failed;
+ }
+
-+ if (dev->mtu > ETH_DATA_LEN)
-+ dev->mtu = ETH_DATA_LEN;
++ *value = bar->which ? bar->len_val : bar->val;
+
-+ return ethtool_op_set_sg(dev, data);
++ return 0;
+}
+
-+static int netbk_set_tso(struct net_device *dev, u32 data)
++static inline void read_dev_bar(struct pci_dev *dev,
++ struct pci_bar_info *bar_info, int offset,
++ u32 len_mask)
+{
-+ if (data) {
-+ struct xen_netif *netif = netdev_priv(dev);
++ int pos;
++ struct resource *res = dev->resource;
+
-+ if (!(netif->features & NETIF_F_TSO))
-+ return -ENOSYS;
++ if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1)
++ pos = PCI_ROM_RESOURCE;
++ else {
++ pos = (offset - PCI_BASE_ADDRESS_0) / 4;
++ if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE |
++ PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
++ (PCI_BASE_ADDRESS_SPACE_MEMORY |
++ PCI_BASE_ADDRESS_MEM_TYPE_64))) {
++ bar_info->val = res[pos - 1].start >> 32;
++ bar_info->len_val = res[pos - 1].end >> 32;
++ return;
++ }
+ }
+
-+ return ethtool_op_set_tso(dev, data);
++ bar_info->val = res[pos].start |
++ (res[pos].flags & PCI_REGION_FLAG_MASK);
++ bar_info->len_val = res[pos].end - res[pos].start + 1;
+}
+
-+static void netbk_get_drvinfo(struct net_device *dev,
-+ struct ethtool_drvinfo *info)
++static void *bar_init(struct pci_dev *dev, int offset)
+{
-+ strcpy(info->driver, "netbk");
-+ strcpy(info->bus_info, dev_name(dev->dev.parent));
-+}
++ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
-+static const struct netif_stat {
-+ char name[ETH_GSTRING_LEN];
-+ u16 offset;
-+} netbk_stats[] = {
-+ { "copied_skbs", offsetof(struct xen_netif, nr_copied_skbs) },
-+};
++ if (!bar)
++ return ERR_PTR(-ENOMEM);
+
-+static int netbk_get_stats_count(struct net_device *dev)
-+{
-+ return ARRAY_SIZE(netbk_stats);
++ read_dev_bar(dev, bar, offset, ~0);
++ bar->which = 0;
++
++ return bar;
+}
+
-+static void netbk_get_ethtool_stats(struct net_device *dev,
-+ struct ethtool_stats *stats, u64 * data)
++static void *rom_init(struct pci_dev *dev, int offset)
+{
-+ void *netif = netdev_priv(dev);
-+ int i;
++ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
-+ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
-+ data[i] = *(int *)(netif + netbk_stats[i].offset);
++ if (!bar)
++ return ERR_PTR(-ENOMEM);
++
++ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
++ bar->which = 0;
++
++ return bar;
+}
+
-+static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data)
++static void bar_reset(struct pci_dev *dev, int offset, void *data)
+{
-+ int i;
++ struct pci_bar_info *bar = data;
+
-+ switch (stringset) {
-+ case ETH_SS_STATS:
-+ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
-+ memcpy(data + i * ETH_GSTRING_LEN,
-+ netbk_stats[i].name, ETH_GSTRING_LEN);
-+ break;
-+ }
++ bar->which = 0;
+}
+
-+static struct ethtool_ops network_ethtool_ops =
++static void bar_release(struct pci_dev *dev, int offset, void *data)
+{
-+ .get_drvinfo = netbk_get_drvinfo,
++ kfree(data);
++}
+
-+ .get_tx_csum = ethtool_op_get_tx_csum,
-+ .set_tx_csum = ethtool_op_set_tx_csum,
-+ .get_sg = ethtool_op_get_sg,
-+ .set_sg = netbk_set_sg,
-+ .get_tso = ethtool_op_get_tso,
-+ .set_tso = netbk_set_tso,
-+ .get_link = ethtool_op_get_link,
++static int pciback_read_vendor(struct pci_dev *dev, int offset,
++ u16 *value, void *data)
++{
++ *value = dev->vendor;
+
-+ .get_stats_count = netbk_get_stats_count,
-+ .get_ethtool_stats = netbk_get_ethtool_stats,
-+ .get_strings = netbk_get_strings,
-+};
++ return 0;
++}
+
-+static struct net_device_ops netback_ops =
++static int pciback_read_device(struct pci_dev *dev, int offset,
++ u16 *value, void *data)
+{
-+ .ndo_start_xmit = netif_be_start_xmit,
-+ .ndo_get_stats = netif_be_get_stats,
-+ .ndo_open = net_open,
-+ .ndo_stop = net_close,
-+ .ndo_change_mtu = netbk_change_mtu,
-+};
++ *value = dev->device;
+
-+struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
-+{
-+ int err = 0;
-+ struct net_device *dev;
-+ struct xen_netif *netif;
-+ char name[IFNAMSIZ] = {};
++ return 0;
++}
+
-+ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
-+ dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
-+ if (dev == NULL) {
-+ DPRINTK("Could not create netif: out of memory\n");
-+ return ERR_PTR(-ENOMEM);
-+ }
++static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
++ void *data)
++{
++ *value = (u8) dev->irq;
+
-+ SET_NETDEV_DEV(dev, parent);
++ return 0;
++}
+
-+ netif = netdev_priv(dev);
-+ memset(netif, 0, sizeof(*netif));
-+ netif->domid = domid;
-+ netif->handle = handle;
-+ netif->features = NETIF_F_SG;
-+ atomic_set(&netif->refcnt, 1);
-+ init_waitqueue_head(&netif->waiting_to_free);
-+ netif->dev = dev;
-+ INIT_LIST_HEAD(&netif->list);
++static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
++{
++ u8 cur_value;
++ int err;
+
-+ netback_carrier_off(netif);
++ err = pci_read_config_byte(dev, offset, &cur_value);
++ if (err)
++ goto out;
+
-+ netif->credit_bytes = netif->remaining_credit = ~0UL;
-+ netif->credit_usec = 0UL;
-+ init_timer(&netif->credit_timeout);
-+ /* Initialize 'expires' now: it's used to track the credit window. */
-+ netif->credit_timeout.expires = jiffies;
++ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
++ || value == PCI_BIST_START)
++ err = pci_write_config_byte(dev, offset, value);
+
-+ init_timer(&netif->tx_queue_timeout);
++out:
++ return err;
++}
+
-+ dev->netdev_ops = &netback_ops;
-+ dev->features = NETIF_F_IP_CSUM|NETIF_F_SG;
++static const struct config_field header_common[] = {
++ {
++ .offset = PCI_VENDOR_ID,
++ .size = 2,
++ .u.w.read = pciback_read_vendor,
++ },
++ {
++ .offset = PCI_DEVICE_ID,
++ .size = 2,
++ .u.w.read = pciback_read_device,
++ },
++ {
++ .offset = PCI_COMMAND,
++ .size = 2,
++ .u.w.read = command_read,
++ .u.w.write = command_write,
++ },
++ {
++ .offset = PCI_INTERRUPT_LINE,
++ .size = 1,
++ .u.b.read = interrupt_read,
++ },
++ {
++ .offset = PCI_INTERRUPT_PIN,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {
++ /* Any side effects of letting driver domain control cache line? */
++ .offset = PCI_CACHE_LINE_SIZE,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ .u.b.write = pciback_write_config_byte,
++ },
++ {
++ .offset = PCI_LATENCY_TIMER,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {
++ .offset = PCI_BIST,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ .u.b.write = bist_write,
++ },
++ {}
++};
+
-+ SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
++#define CFG_FIELD_BAR(reg_offset) \
++ { \
++ .offset = reg_offset, \
++ .size = 4, \
++ .init = bar_init, \
++ .reset = bar_reset, \
++ .release = bar_release, \
++ .u.dw.read = bar_read, \
++ .u.dw.write = bar_write, \
++ }
++
++#define CFG_FIELD_ROM(reg_offset) \
++ { \
++ .offset = reg_offset, \
++ .size = 4, \
++ .init = rom_init, \
++ .reset = bar_reset, \
++ .release = bar_release, \
++ .u.dw.read = bar_read, \
++ .u.dw.write = rom_write, \
++ }
++
++static const struct config_field header_0[] = {
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
++ CFG_FIELD_ROM(PCI_ROM_ADDRESS),
++ {}
++};
++
++static const struct config_field header_1[] = {
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
++ CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
++ {}
++};
+
-+ dev->tx_queue_len = netbk_queue_length;
++int pciback_config_header_add_fields(struct pci_dev *dev)
++{
++ int err;
+
-+ /*
-+ * Initialise a dummy MAC address. We choose the numerically
-+ * largest non-broadcast address to prevent the address getting
-+ * stolen by an Ethernet bridge for STP purposes.
-+ * (FE:FF:FF:FF:FF:FF)
-+ */
-+ memset(dev->dev_addr, 0xFF, ETH_ALEN);
-+ dev->dev_addr[0] &= ~0x01;
++ err = pciback_config_add_fields(dev, header_common);
++ if (err)
++ goto out;
+
-+ rtnl_lock();
-+ err = register_netdevice(dev);
-+ rtnl_unlock();
-+ if (err) {
-+ DPRINTK("Could not register new net device %s: err=%d\n",
-+ dev->name, err);
-+ free_netdev(dev);
-+ return ERR_PTR(err);
++ switch (dev->hdr_type) {
++ case PCI_HEADER_TYPE_NORMAL:
++ err = pciback_config_add_fields(dev, header_0);
++ break;
++
++ case PCI_HEADER_TYPE_BRIDGE:
++ err = pciback_config_add_fields(dev, header_1);
++ break;
++
++ default:
++ err = -EINVAL;
++ printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
++ pci_name(dev), dev->hdr_type);
++ break;
+ }
+
-+ DPRINTK("Successfully created netif\n");
-+ return netif;
++out:
++ return err;
+}
+diff --git a/drivers/xen/pciback/conf_space_quirks.c b/drivers/xen/pciback/conf_space_quirks.c
+new file mode 100644
+index 0000000..45c31fb
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_quirks.c
+@@ -0,0 +1,140 @@
++/*
++ * PCI Backend - Handle special overlays for broken devices.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
++ */
+
-+static int map_frontend_pages(
-+ struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
-+{
-+ struct gnttab_map_grant_ref op;
-+
-+ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
-+ GNTMAP_host_map, tx_ring_ref, netif->domid);
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
++
++LIST_HEAD(pciback_quirks);
++
++static inline const struct pci_device_id *
++match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
++{
++ if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
++ (id->device == PCI_ANY_ID || id->device == dev->device) &&
++ (id->subvendor == PCI_ANY_ID ||
++ id->subvendor == dev->subsystem_vendor) &&
++ (id->subdevice == PCI_ANY_ID ||
++ id->subdevice == dev->subsystem_device) &&
++ !((id->class ^ dev->class) & id->class_mask))
++ return id;
++ return NULL;
++}
+
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
-+ BUG();
++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
++{
++ struct pciback_config_quirk *tmp_quirk;
+
-+ if (op.status) {
-+ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
-+ return op.status;
-+ }
++ list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
++ if (match_one_device(&tmp_quirk->devid, dev) != NULL)
++ goto out;
++ tmp_quirk = NULL;
++ printk(KERN_DEBUG
++ "quirk didn't match any device pciback knows about\n");
++out:
++ return tmp_quirk;
++}
+
-+ netif->tx_shmem_ref = tx_ring_ref;
-+ netif->tx_shmem_handle = op.handle;
++static inline void register_quirk(struct pciback_config_quirk *quirk)
++{
++ list_add_tail(&quirk->quirks_list, &pciback_quirks);
++}
+
-+ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
-+ GNTMAP_host_map, rx_ring_ref, netif->domid);
++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg)
++{
++ int ret = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry;
+
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
-+ BUG();
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ if (OFFSET(cfg_entry) == reg) {
++ ret = 1;
++ break;
++ }
++ }
++ return ret;
++}
+
-+ if (op.status) {
-+ struct gnttab_unmap_grant_ref unop;
++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
++ *field)
++{
++ int err = 0;
+
-+ gnttab_set_unmap_op(&unop,
-+ (unsigned long)netif->tx_comms_area->addr,
-+ GNTMAP_host_map, netif->tx_shmem_handle);
-+ HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1);
-+ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
-+ return op.status;
++ switch (field->size) {
++ case 1:
++ field->u.b.read = pciback_read_config_byte;
++ field->u.b.write = pciback_write_config_byte;
++ break;
++ case 2:
++ field->u.w.read = pciback_read_config_word;
++ field->u.w.write = pciback_write_config_word;
++ break;
++ case 4:
++ field->u.dw.read = pciback_read_config_dword;
++ field->u.dw.write = pciback_write_config_dword;
++ break;
++ default:
++ err = -EINVAL;
++ goto out;
+ }
+
-+ netif->rx_shmem_ref = rx_ring_ref;
-+ netif->rx_shmem_handle = op.handle;
++ pciback_config_add_field(dev, field);
+
-+ return 0;
++out:
++ return err;
+}
+
-+static void unmap_frontend_pages(struct xen_netif *netif)
++int pciback_config_quirks_init(struct pci_dev *dev)
+{
-+ struct gnttab_unmap_grant_ref op;
++ struct pciback_config_quirk *quirk;
++ int ret = 0;
+
-+ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
-+ GNTMAP_host_map, netif->tx_shmem_handle);
++ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
++ if (!quirk) {
++ ret = -ENOMEM;
++ goto out;
++ }
+
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
-+ BUG();
++ quirk->devid.vendor = dev->vendor;
++ quirk->devid.device = dev->device;
++ quirk->devid.subvendor = dev->subsystem_vendor;
++ quirk->devid.subdevice = dev->subsystem_device;
++ quirk->devid.class = 0;
++ quirk->devid.class_mask = 0;
++ quirk->devid.driver_data = 0UL;
+
-+ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
-+ GNTMAP_host_map, netif->rx_shmem_handle);
++ quirk->pdev = dev;
+
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
-+ BUG();
++ register_quirk(quirk);
++out:
++ return ret;
+}
+
-+int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
-+ unsigned long rx_ring_ref, unsigned int evtchn)
++void pciback_config_field_free(struct config_field *field)
+{
-+ int err = -ENOMEM;
-+ struct xen_netif_tx_sring *txs;
-+ struct xen_netif_rx_sring *rxs;
-+
-+ /* Already connected through? */
-+ if (netif->irq)
-+ return 0;
-+
-+ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
-+ if (netif->tx_comms_area == NULL)
-+ return -ENOMEM;
-+ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
-+ if (netif->rx_comms_area == NULL)
-+ goto err_rx;
++ kfree(field);
++}
+
-+ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
-+ if (err)
-+ goto err_map;
++int pciback_config_quirk_release(struct pci_dev *dev)
++{
++ struct pciback_config_quirk *quirk;
++ int ret = 0;
+
-+ err = bind_interdomain_evtchn_to_irqhandler(
-+ netif->domid, evtchn, netif_be_int, 0,
-+ netif->dev->name, netif);
-+ if (err < 0)
-+ goto err_hypervisor;
-+ netif->irq = err;
-+ disable_irq(netif->irq);
++ quirk = pciback_find_quirk(dev);
++ if (!quirk) {
++ ret = -ENXIO;
++ goto out;
++ }
+
-+ txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
-+ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
++ list_del(&quirk->quirks_list);
++ kfree(quirk);
+
-+ rxs = (struct xen_netif_rx_sring *)
-+ ((char *)netif->rx_comms_area->addr);
-+ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
++out:
++ return ret;
++}
+diff --git a/drivers/xen/pciback/conf_space_quirks.h b/drivers/xen/pciback/conf_space_quirks.h
+new file mode 100644
+index 0000000..acd0e1a
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_quirks.h
+@@ -0,0 +1,35 @@
++/*
++ * PCI Backend - Data structures for special overlays for broken devices.
++ *
++ * Ryan Wilson <hap9@epoch.ncsc.mil>
++ * Chris Bookholt <hap10@epoch.ncsc.mil>
++ */
+
-+ netif->rx_req_cons_peek = 0;
++#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
++#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+
-+ netif_get(netif);
++#include <linux/pci.h>
++#include <linux/list.h>
+
-+ rtnl_lock();
-+ netback_carrier_on(netif);
-+ if (netif_running(netif->dev))
-+ __netif_up(netif);
-+ rtnl_unlock();
++struct pciback_config_quirk {
++ struct list_head quirks_list;
++ struct pci_device_id devid;
++ struct pci_dev *pdev;
++};
+
-+ return 0;
-+err_hypervisor:
-+ unmap_frontend_pages(netif);
-+err_map:
-+ free_vm_area(netif->rx_comms_area);
-+err_rx:
-+ free_vm_area(netif->tx_comms_area);
-+ return err;
-+}
++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
+
-+void netif_disconnect(struct xen_netif *netif)
-+{
-+ if (netback_carrier_ok(netif)) {
-+ rtnl_lock();
-+ netback_carrier_off(netif);
-+ netif_carrier_off(netif->dev); /* discard queued packets */
-+ if (netif_running(netif->dev))
-+ __netif_down(netif);
-+ rtnl_unlock();
-+ netif_put(netif);
-+ }
++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
++ *field);
+
-+ atomic_dec(&netif->refcnt);
-+ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
++int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
+
-+ del_timer_sync(&netif->credit_timeout);
-+ del_timer_sync(&netif->tx_queue_timeout);
++int pciback_config_quirks_init(struct pci_dev *dev);
+
-+ if (netif->irq)
-+ unbind_from_irqhandler(netif->irq, netif);
++void pciback_config_field_free(struct config_field *field);
+
-+ unregister_netdev(netif->dev);
++int pciback_config_quirk_release(struct pci_dev *dev);
+
-+ if (netif->tx.sring) {
-+ unmap_frontend_pages(netif);
-+ free_vm_area(netif->tx_comms_area);
-+ free_vm_area(netif->rx_comms_area);
-+ }
++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg);
+
-+ free_netdev(netif->dev);
-+}
-diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
++#endif
+diff --git a/drivers/xen/pciback/controller.c b/drivers/xen/pciback/controller.c
new file mode 100644
-index 0000000..0bc6398
+index 0000000..7f04f11
--- /dev/null
-+++ b/drivers/xen/netback/netback.c
-@@ -0,0 +1,1613 @@
-+/******************************************************************************
-+ * drivers/xen/netback/netback.c
-+ *
-+ * Back-end of the driver for virtual network devices. This portion of the
-+ * driver exports a 'unified' network-device interface that can be accessed
-+ * by any operating system that implements a compatible front end. A
-+ * reference front-end implementation can be found in:
-+ * drivers/xen/netfront/netfront.c
-+ *
-+ * Copyright (c) 2002-2005, K A Fraser
-+ *
-+ * This program is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU General Public License version 2
-+ * as published by the Free Software Foundation; or, when distributed
-+ * separately from the Linux kernel or incorporated into other
-+ * software packages, subject to the following license:
++++ b/drivers/xen/pciback/controller.c
+@@ -0,0 +1,442 @@
++/*
++ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
++ * Alex Williamson <alex.williamson@hp.com>
+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-+ * of this source file (the "Software"), to deal in the Software without
-+ * restriction, including without limitation the rights to use, copy, modify,
-+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-+ * and to permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
++ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI
++ * controllers. Devices under the same PCI controller are exposed on the
++ * same virtual domain:bus. Within a bus, device slots are virtualized
++ * to compact the bus.
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+ * IN THE SOFTWARE.
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
-+#include "common.h"
-+
-+#include <linux/tcp.h>
-+#include <linux/udp.h>
-+
-+#include <xen/balloon.h>
-+#include <xen/events.h>
-+#include <xen/interface/memory.h>
-+
-+#include <asm/xen/hypercall.h>
-+#include <asm/xen/page.h>
++#include <linux/acpi.h>
++#include <linux/list.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
+
-+/*define NETBE_DEBUG_INTERRUPT*/
++#define PCI_MAX_BUSSES 255
++#define PCI_MAX_SLOTS 32
+
-+struct netbk_rx_meta {
-+ skb_frag_t frag;
-+ int id;
++struct controller_dev_entry {
++ struct list_head list;
++ struct pci_dev *dev;
++ unsigned int devfn;
+};
+
-+struct netbk_tx_pending_inuse {
++struct controller_list_entry {
+ struct list_head list;
-+ unsigned long alloc_time;
++ struct pci_controller *controller;
++ unsigned int domain;
++ unsigned int bus;
++ unsigned int next_devfn;
++ struct list_head dev_list;
+};
+
++struct controller_dev_data {
++ struct list_head list;
++ unsigned int next_domain;
++ unsigned int next_bus;
++ spinlock_t lock;
++};
+
-+static void netif_idx_release(u16 pending_idx);
-+static void make_tx_response(struct xen_netif *netif,
-+ struct xen_netif_tx_request *txp,
-+ s8 st);
-+static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
-+ u16 id,
-+ s8 st,
-+ u16 offset,
-+ u16 size,
-+ u16 flags);
-+
-+static void net_tx_action(unsigned long unused);
-+static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
++struct walk_info {
++ struct pciback_device *pdev;
++ int resource_count;
++ int root_num;
++};
+
-+static void net_rx_action(unsigned long unused);
-+static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_dev_entry *dev_entry;
++ struct controller_list_entry *cntrl_entry;
++ struct pci_dev *dev = NULL;
++ unsigned long flags;
+
-+static struct timer_list net_timer;
-+static struct timer_list netbk_tx_pending_timer;
++ spin_lock_irqsave(&dev_data->lock, flags);
+
-+#define MAX_PENDING_REQS 256
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ if (cntrl_entry->domain != domain ||
++ cntrl_entry->bus != bus)
++ continue;
+
-+static struct sk_buff_head rx_queue;
++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
++ if (devfn == dev_entry->devfn) {
++ dev = dev_entry->dev;
++ goto found;
++ }
++ }
++ }
++found:
++ spin_unlock_irqrestore(&dev_data->lock, flags);
+
-+static struct page **mmap_pages;
-+static inline unsigned long idx_to_pfn(unsigned int idx)
-+{
-+ return page_to_pfn(mmap_pages[idx]);
++ return dev;
+}
+
-+static inline unsigned long idx_to_kaddr(unsigned int idx)
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
+{
-+ return (unsigned long)pfn_to_kaddr(idx_to_pfn(idx));
-+}
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_dev_entry *dev_entry;
++ struct controller_list_entry *cntrl_entry;
++ struct pci_controller *dev_controller = PCI_CONTROLLER(dev);
++ unsigned long flags;
++ int ret = 0, found = 0;
+
-+/* extra field used in struct page */
-+static inline void netif_set_page_index(struct page *pg, unsigned int index)
-+{
-+ *(unsigned long *)&pg->mapping = index + 1;
-+}
++ spin_lock_irqsave(&dev_data->lock, flags);
+
-+static inline int netif_page_index(struct page *pg)
-+{
-+ unsigned long idx = (unsigned long)pg->mapping - 1;
++ /* Look to see if we already have a domain:bus for this controller */
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ if (cntrl_entry->controller == dev_controller) {
++ found = 1;
++ break;
++ }
++ }
+
-+ if (!PageForeign(pg))
-+ return -1;
++ if (!found) {
++ cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC);
++ if (!cntrl_entry) {
++ ret = -ENOMEM;
++ goto out;
++ }
+
-+ if ((idx >= MAX_PENDING_REQS) || (mmap_pages[idx] != pg))
-+ return -1;
++ cntrl_entry->controller = dev_controller;
++ cntrl_entry->next_devfn = PCI_DEVFN(0, 0);
+
-+ return idx;
-+}
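++		/* Assign the next free virtual bus; once the bus numbers for
++		 * this virtual domain are exhausted, move on to a fresh one. */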
++ cntrl_entry->domain = dev_data->next_domain;
++ cntrl_entry->bus = dev_data->next_bus++;
++ if (dev_data->next_bus > PCI_MAX_BUSSES) {
++ dev_data->next_domain++;
++ dev_data->next_bus = 0;
++ }
+
-+/*
-+ * This is the amount of packet we copy rather than map, so that the
-+ * guest can't fiddle with the contents of the headers while we do
-+ * packet processing on them (netfilter, routing, etc). 72 is enough
-+ * to cover TCP+IP headers including options.
-+ */
-+#define PKT_PROT_LEN 72
++ INIT_LIST_HEAD(&cntrl_entry->dev_list);
+
-+static struct pending_tx_info {
-+ struct xen_netif_tx_request req;
-+ struct xen_netif *netif;
-+} pending_tx_info[MAX_PENDING_REQS];
-+static u16 pending_ring[MAX_PENDING_REQS];
-+typedef unsigned int pending_ring_idx_t;
++ list_add_tail(&cntrl_entry->list, &dev_data->list);
++ }
+
-+static inline pending_ring_idx_t pending_index(unsigned i)
-+{
-+ return i & (MAX_PENDING_REQS-1);
-+}
++ if (PCI_SLOT(cntrl_entry->next_devfn) > PCI_MAX_SLOTS) {
++ /*
++ * While it seems unlikely, this can actually happen if
++ * a controller has P2P bridges under it.
++ */
++ xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x "
++ "is full, no room to export %04x:%02x:%02x.%x",
++ cntrl_entry->domain, cntrl_entry->bus,
++ pci_domain_nr(dev->bus), dev->bus->number,
++ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
++ ret = -ENOSPC;
++ goto out;
++ }
+
-+static pending_ring_idx_t pending_prod, pending_cons;
++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC);
++ if (!dev_entry) {
++ if (list_empty(&cntrl_entry->dev_list)) {
++ list_del(&cntrl_entry->list);
++ kfree(cntrl_entry);
++ }
++ ret = -ENOMEM;
++ goto out;
++ }
+
-+static inline pending_ring_idx_t nr_pending_reqs(void)
-+{
-+ return MAX_PENDING_REQS - pending_prod + pending_cons;
-+}
++ dev_entry->dev = dev;
++ dev_entry->devfn = cntrl_entry->next_devfn;
+
-+/* Freed TX SKBs get batched on this ring before return to pending_ring. */
-+static u16 dealloc_ring[MAX_PENDING_REQS];
-+static pending_ring_idx_t dealloc_prod, dealloc_cons;
++ list_add_tail(&dev_entry->list, &cntrl_entry->dev_list);
+
-+/* Doubly-linked list of in-use pending entries. */
-+static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
-+static LIST_HEAD(pending_inuse_head);
++ cntrl_entry->next_devfn += PCI_DEVFN(1, 0);
+
-+static struct sk_buff_head tx_queue;
++out:
++ spin_unlock_irqrestore(&dev_data->lock, flags);
+
-+static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
-+static struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
-+static struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
++ /* TODO: Publish virtual domain:bus:slot.func here. */
+
-+static LIST_HEAD(net_schedule_list);
-+static DEFINE_SPINLOCK(net_schedule_list_lock);
++ return ret;
++}
+
-+#define MAX_MFN_ALLOC 64
-+static unsigned long mfn_list[MAX_MFN_ALLOC];
-+static unsigned int alloc_index = 0;
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_list_entry *cntrl_entry;
++ struct controller_dev_entry *dev_entry = NULL;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
+
-+/* Setting this allows the safe use of this driver without netloop. */
-+static int MODPARM_copy_skb = 1;
-+module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
-+MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
++ spin_lock_irqsave(&dev_data->lock, flags);
+
-+int netbk_copy_skb_mode;
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ if (cntrl_entry->controller != PCI_CONTROLLER(dev))
++ continue;
+
-+static inline unsigned long alloc_mfn(void)
-+{
-+ BUG_ON(alloc_index == 0);
-+ return mfn_list[--alloc_index];
-+}
++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
++ if (dev_entry->dev == dev) {
++ found_dev = dev_entry->dev;
++ break;
++ }
++ }
++ }
+
-+static inline void maybe_schedule_tx_action(void)
-+{
-+ smp_mb();
-+ if ((nr_pending_reqs() < (MAX_PENDING_REQS/2)) &&
-+ !list_empty(&net_schedule_list))
-+ tasklet_schedule(&net_tx_tasklet);
++ if (!found_dev) {
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++ return;
++ }
++
++ list_del(&dev_entry->list);
++ kfree(dev_entry);
++
++ if (list_empty(&cntrl_entry->dev_list)) {
++ list_del(&cntrl_entry->list);
++ kfree(cntrl_entry);
++ }
++
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++ pcistub_put_pci_dev(found_dev);
+}
+
-+static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
++int pciback_init_devices(struct pciback_device *pdev)
+{
-+ struct skb_shared_info *ninfo;
-+ struct sk_buff *nskb;
-+ unsigned long offset;
-+ int ret;
-+ int len;
-+ int headlen;
++ struct controller_dev_data *dev_data;
+
-+ BUG_ON(skb_shinfo(skb)->frag_list != NULL);
++ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
++ if (!dev_data)
++ return -ENOMEM;
+
-+ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
-+ if (unlikely(!nskb))
-+ goto err;
++ spin_lock_init(&dev_data->lock);
+
-+ skb_reserve(nskb, NET_SKB_PAD + NET_IP_ALIGN);
-+ headlen = skb_end_pointer(nskb) - nskb->data;
-+ if (headlen > skb_headlen(skb))
-+ headlen = skb_headlen(skb);
-+ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
-+ BUG_ON(ret);
++ INIT_LIST_HEAD(&dev_data->list);
+
-+ ninfo = skb_shinfo(nskb);
-+ ninfo->gso_size = skb_shinfo(skb)->gso_size;
-+ ninfo->gso_type = skb_shinfo(skb)->gso_type;
++ /* Starting domain:bus numbers */
++ dev_data->next_domain = 0;
++ dev_data->next_bus = 0;
+
-+ offset = headlen;
-+ len = skb->len - headlen;
++ pdev->pci_dev_data = dev_data;
+
-+ nskb->len = skb->len;
-+ nskb->data_len = len;
-+ nskb->truesize += len;
++ return 0;
++}
+
-+ while (len) {
-+ struct page *page;
-+ int copy;
-+ int zero;
++static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
++{
++ struct walk_info *info = data;
++ struct acpi_resource_address64 addr;
++ acpi_status status;
++ int i, len, err;
++ char str[32], tmp[3];
++ unsigned char *ptr, *buf;
+
-+ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
-+ dump_stack();
-+ goto err_free;
-+ }
++ status = acpi_resource_to_address64(res, &addr);
+
-+ copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
-+ zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
++ /* Do we care about this range? Let's check. */
++ if (!ACPI_SUCCESS(status) ||
++ !(addr.resource_type == ACPI_MEMORY_RANGE ||
++ addr.resource_type == ACPI_IO_RANGE) ||
++ !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
++ return AE_OK;
+
-+ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
-+ if (unlikely(!page))
-+ goto err_free;
++ /*
++ * Furthermore, we really only care to tell the guest about
++ * address ranges that require address translation of some sort.
++ */
++ if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
++ addr.info.mem.translation) &&
++ !(addr.resource_type == ACPI_IO_RANGE &&
++ addr.info.io.translation))
++ return AE_OK;
+
-+ ret = skb_copy_bits(skb, offset, page_address(page), copy);
-+ BUG_ON(ret);
++ /* Store the resource in xenbus for the guest */
++ len = snprintf(str, sizeof(str), "root-%d-resource-%d",
++ info->root_num, info->resource_count);
++ if (unlikely(len >= (sizeof(str) - 1)))
++ return AE_OK;
+
-+ ninfo->frags[ninfo->nr_frags].page = page;
-+ ninfo->frags[ninfo->nr_frags].page_offset = 0;
-+ ninfo->frags[ninfo->nr_frags].size = copy;
-+ ninfo->nr_frags++;
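++	/* Two hex characters per byte of the acpi_resource, plus a NUL. */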
++ buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
++ if (!buf)
++ return AE_OK;
+
-+ offset += copy;
-+ len -= copy;
++ /* Clean out resource_source */
++ res->data.address64.resource_source.index = 0xFF;
++ res->data.address64.resource_source.string_length = 0;
++ res->data.address64.resource_source.string_ptr = NULL;
++
++ ptr = (unsigned char *)res;
++
++ /* Turn the acpi_resource into an ASCII byte stream */
++ for (i = 0; i < sizeof(*res); i++) {
++ snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
++ strncat(buf, tmp, 2);
+ }
+
-+ offset = nskb->data - skb->data;
++ err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
++ str, "%s", buf);
+
-+ nskb->transport_header = skb->transport_header + offset;
-+ nskb->network_header = skb->network_header + offset;
-+ nskb->mac_header = skb->mac_header + offset;
++ if (!err)
++ info->resource_count++;
+
-+ return nskb;
++ kfree(buf);
+
-+ err_free:
-+ kfree_skb(nskb);
-+ err:
-+ return NULL;
++ return AE_OK;
+}
+
-+static inline int netbk_max_required_rx_slots(struct xen_netif *netif)
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_root_cb)
+{
-+ if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
-+ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
-+ return 1; /* all in one */
-+}
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_list_entry *cntrl_entry;
++ int i, root_num, len, err = 0;
++ unsigned int domain, bus;
++ char str[64];
++ struct walk_info info;
+
-+static inline int netbk_queue_full(struct xen_netif *netif)
-+{
-+ RING_IDX peek = netif->rx_req_cons_peek;
-+ RING_IDX needed = netbk_max_required_rx_slots(netif);
++ spin_lock(&dev_data->lock);
+
-+ return ((netif->rx.sring->req_prod - peek) < needed) ||
-+ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
-+}
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ /* First publish all the domain:bus info */
++ err = publish_root_cb(pdev, cntrl_entry->domain,
++ cntrl_entry->bus);
++ if (err)
++ goto out;
+
-+static void tx_queue_callback(unsigned long data)
-+{
-+ struct xen_netif *netif = (struct xen_netif *)data;
-+ if (netif_schedulable(netif))
-+ netif_wake_queue(netif->dev);
-+}
++ /*
++ * Now figure out which root-%d this belongs to
++ * so we can associate resources with it.
++ */
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ "root_num", "%d", &root_num);
+
-+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
-+{
-+ struct xen_netif *netif = netdev_priv(dev);
++ if (err != 1)
++ goto out;
+
-+ BUG_ON(skb->dev != dev);
++ for (i = 0; i < root_num; i++) {
++ len = snprintf(str, sizeof(str), "root-%d", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
+
-+ /* Drop the packet if the target domain has no receive buffers. */
-+ if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
-+ goto drop;
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ str, "%x:%x", &domain, &bus);
++ if (err != 2)
++ goto out;
++
++ /* Is this the one we just published? */
++ if (domain == cntrl_entry->domain &&
++ bus == cntrl_entry->bus)
++ break;
++ }
+
-+ /*
-+ * XXX For now we also copy skbuffs whose head crosses a page
-+ * boundary, because netbk_gop_skb can't handle them.
-+ */
-+ if ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE) {
-+ struct sk_buff *nskb = netbk_copy_skb(skb);
-+ if ( unlikely(nskb == NULL) )
-+ goto drop;
-+ /* Copy only the header fields we use in this driver. */
-+ nskb->dev = skb->dev;
-+ nskb->ip_summed = skb->ip_summed;
-+ dev_kfree_skb(skb);
-+ skb = nskb;
-+ }
++ if (i == root_num)
++ goto out;
+
-+ netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
-+ !!skb_shinfo(skb)->gso_size;
-+ netif_get(netif);
++ info.pdev = pdev;
++ info.resource_count = 0;
++ info.root_num = i;
++
++ /* Let ACPI do the heavy lifting on decoding resources */
++ acpi_walk_resources(cntrl_entry->controller->acpi_handle,
++ METHOD_NAME__CRS, write_xenbus_resource,
++ &info);
+
-+ if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
-+ netif->rx.sring->req_event = netif->rx_req_cons_peek +
-+ netbk_max_required_rx_slots(netif);
-+ mb(); /* request notification /then/ check & stop the queue */
-+ if (netbk_queue_full(netif)) {
-+ netif_stop_queue(dev);
-+ /*
-+ * Schedule 500ms timeout to restart the queue, thus
-+ * ensuring that an inactive queue will be drained.
-+ * Packets will be immediately be dropped until more
-+ * receive buffers become available (see
-+ * netbk_queue_full() check above).
-+ */
-+ netif->tx_queue_timeout.data = (unsigned long)netif;
-+ netif->tx_queue_timeout.function = tx_queue_callback;
-+ mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
++		/* No resources. OK. On to the next one. */
++ if (!info.resource_count)
++ continue;
++
++ /* Store the number of resources we wrote for this root-%d */
++ len = snprintf(str, sizeof(str), "root-%d-resources", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
+ }
++
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%d", info.resource_count);
++ if (err)
++ goto out;
++ }
++
++ /* Finally, write some magic to synchronize with the guest. */
++ len = snprintf(str, sizeof(str), "root-resource-magic");
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
+ }
+
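++	/* The "magic" is simply the length of a hex-encoded acpi_resource
++	 * (two characters per byte plus a terminating NUL). */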
-+ skb_queue_tail(&rx_queue, skb);
-+ tasklet_schedule(&net_rx_tasklet);
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%lx", (sizeof(struct acpi_resource) * 2) + 1);
+
-+ return 0;
++out:
++ spin_unlock(&dev_data->lock);
+
-+ drop:
-+ netif->stats.tx_dropped++;
-+ dev_kfree_skb(skb);
-+ return 0;
++ return err;
+}
+
-+struct netrx_pending_operations {
-+ unsigned trans_prod, trans_cons;
-+ unsigned mmu_prod, mmu_mcl;
-+ unsigned mcl_prod, mcl_cons;
-+ unsigned copy_prod, copy_cons;
-+ unsigned meta_prod, meta_cons;
-+ struct mmu_update *mmu;
-+ struct gnttab_transfer *trans;
-+ struct gnttab_copy *copy;
-+ struct multicall_entry *mcl;
-+ struct netbk_rx_meta *meta;
-+};
-+
-+/* Set up the grant operations for this fragment. If it's a flipping
-+ interface, we also set up the unmap request from here. */
-+static u16 netbk_gop_frag(struct xen_netif *netif, struct netbk_rx_meta *meta,
-+ int i, struct netrx_pending_operations *npo,
-+ struct page *page, unsigned long size,
-+ unsigned long offset)
++void pciback_release_devices(struct pciback_device *pdev)
+{
-+ struct gnttab_copy *copy_gop;
-+ struct xen_netif_rx_request *req;
-+ unsigned long old_mfn;
-+ int idx = netif_page_index(page);
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_list_entry *cntrl_entry, *c;
++ struct controller_dev_entry *dev_entry, *d;
+
-+ old_mfn = virt_to_mfn(page_address(page));
++ list_for_each_entry_safe(cntrl_entry, c, &dev_data->list, list) {
++ list_for_each_entry_safe(dev_entry, d,
++ &cntrl_entry->dev_list, list) {
++ list_del(&dev_entry->list);
++ pcistub_put_pci_dev(dev_entry->dev);
++ kfree(dev_entry);
++ }
++ list_del(&cntrl_entry->list);
++ kfree(cntrl_entry);
++ }
+
-+ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
++ kfree(dev_data);
++ pdev->pci_dev_data = NULL;
++}
+
-+ copy_gop = npo->copy + npo->copy_prod++;
-+ copy_gop->flags = GNTCOPY_dest_gref;
-+ if (idx > -1) {
-+ struct pending_tx_info *src_pend = &pending_tx_info[idx];
-+ copy_gop->source.domid = src_pend->netif->domid;
-+ copy_gop->source.u.ref = src_pend->req.gref;
-+ copy_gop->flags |= GNTCOPY_source_gref;
-+ } else {
-+ copy_gop->source.domid = DOMID_SELF;
-+ copy_gop->source.u.gmfn = old_mfn;
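++/* Reverse lookup: translate a physical pci_dev back into the virtual
++ * domain:bus:devfn that was exported to the frontend. */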
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus, unsigned int *devfn)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_dev_entry *dev_entry;
++ struct controller_list_entry *cntrl_entry;
++ unsigned long flags;
++ int found = 0;
++ spin_lock_irqsave(&dev_data->lock, flags);
++
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
++ if ((dev_entry->dev->bus->number ==
++ pcidev->bus->number) &&
++ (dev_entry->dev->devfn ==
++ pcidev->devfn) &&
++ (pci_domain_nr(dev_entry->dev->bus) ==
++ pci_domain_nr(pcidev->bus))) {
++ found = 1;
++ *domain = cntrl_entry->domain;
++ *bus = cntrl_entry->bus;
++ *devfn = dev_entry->devfn;
++ goto out;
++ }
++ }
+ }
-+ copy_gop->source.offset = offset;
-+ copy_gop->dest.domid = netif->domid;
-+ copy_gop->dest.offset = 0;
-+ copy_gop->dest.u.ref = req->gref;
-+ copy_gop->len = size;
++out:
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++ return found;
+
-+ return req->id;
+}
+
-+static void netbk_gop_skb(struct sk_buff *skb,
-+ struct netrx_pending_operations *npo)
+diff --git a/drivers/xen/pciback/passthrough.c b/drivers/xen/pciback/passthrough.c
+new file mode 100644
+index 0000000..5386beb
+--- /dev/null
++++ b/drivers/xen/pciback/passthrough.c
+@@ -0,0 +1,178 @@
++/*
++ * PCI Backend - Provides restricted access to the real PCI bus topology
++ * to the frontend
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
++
++#include <linux/list.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
++
++struct passthrough_dev_data {
++ /* Access to dev_list must be protected by lock */
++ struct list_head dev_list;
++ spinlock_t lock;
++};
++
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
+{
-+ struct xen_netif *netif = netdev_priv(skb->dev);
-+ int nr_frags = skb_shinfo(skb)->nr_frags;
-+ int i;
-+ int extra;
-+ struct netbk_rx_meta *head_meta, *meta;
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry;
++ struct pci_dev *dev = NULL;
++ unsigned long flags;
+
-+ head_meta = npo->meta + npo->meta_prod++;
-+ head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
-+ head_meta->frag.size = skb_shinfo(skb)->gso_size;
-+ extra = !!head_meta->frag.size + 1;
++ spin_lock_irqsave(&dev_data->lock, flags);
+
-+ for (i = 0; i < nr_frags; i++) {
-+ meta = npo->meta + npo->meta_prod++;
-+ meta->frag = skb_shinfo(skb)->frags[i];
-+ meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
-+ meta->frag.page,
-+ meta->frag.size,
-+ meta->frag.page_offset);
++ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
++ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
++ && bus == (unsigned int)dev_entry->dev->bus->number
++ && devfn == dev_entry->dev->devfn) {
++ dev = dev_entry->dev;
++ break;
++ }
+ }
+
-+ /*
-+ * This must occur at the end to ensure that we don't trash skb_shinfo
-+ * until we're done. We know that the head doesn't cross a page
-+ * boundary because such packets get copied in netif_be_start_xmit.
-+ */
-+ head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
-+ virt_to_page(skb->data),
-+ skb_headlen(skb),
-+ offset_in_page(skb->data));
++ spin_unlock_irqrestore(&dev_data->lock, flags);
+
-+ netif->rx.req_cons += nr_frags + extra;
++ return dev;
+}
+
-+static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
+{
-+ int i;
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry;
++ unsigned long flags;
++ unsigned int domain, bus, devfn;
++ int err;
+
-+ for (i = 0; i < nr_frags; i++)
-+ put_page(meta[i].frag.page);
++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
++ if (!dev_entry)
++ return -ENOMEM;
++ dev_entry->dev = dev;
++
++ spin_lock_irqsave(&dev_data->lock, flags);
++ list_add_tail(&dev_entry->list, &dev_data->dev_list);
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++
++ /* Publish this device. */
++ domain = (unsigned int)pci_domain_nr(dev->bus);
++ bus = (unsigned int)dev->bus->number;
++ devfn = dev->devfn;
++ err = publish_cb(pdev, domain, bus, devfn, devid);
++
++ return err;
+}
+
-+/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
-+ used to set up the operations on the top of
-+ netrx_pending_operations, which have since been done. Check that
-+ they didn't give any errors and advance over them. */
-+static int netbk_check_gop(int nr_frags, domid_t domid,
-+ struct netrx_pending_operations *npo)
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
-+ struct gnttab_copy *copy_op;
-+ int status = NETIF_RSP_OKAY;
-+ int i;
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry, *t;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
+
-+ for (i = 0; i <= nr_frags; i++) {
-+ copy_op = npo->copy + npo->copy_cons++;
-+ if (copy_op->status != GNTST_okay) {
-+ DPRINTK("Bad status %d from copy to DOM%d.\n",
-+ copy_op->status, domid);
-+ status = NETIF_RSP_ERROR;
-+ }
++ spin_lock_irqsave(&dev_data->lock, flags);
++
++ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
++ if (dev_entry->dev == dev) {
++ list_del(&dev_entry->list);
++ found_dev = dev_entry->dev;
++ kfree(dev_entry);
++ }
+ }
+
-+ return status;
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++
++ if (found_dev)
++ pcistub_put_pci_dev(found_dev);
+}
+
-+static void netbk_add_frag_responses(struct xen_netif *netif, int status,
-+ struct netbk_rx_meta *meta, int nr_frags)
++int pciback_init_devices(struct pciback_device *pdev)
+{
-+ int i;
-+ unsigned long offset;
++ struct passthrough_dev_data *dev_data;
+
-+ for (i = 0; i < nr_frags; i++) {
-+ int id = meta[i].id;
-+ int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
-+
-+ offset = 0;
-+ make_rx_response(netif, id, status, offset,
-+ meta[i].frag.size, flags);
-+ }
++ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
++ if (!dev_data)
++ return -ENOMEM;
++
++ spin_lock_init(&dev_data->lock);
++
++ INIT_LIST_HEAD(&dev_data->dev_list);
++
++ pdev->pci_dev_data = dev_data;
++
++ return 0;
+}
+
-+static void net_rx_action(unsigned long unused)
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_root_cb)
+{
-+ struct xen_netif *netif = NULL;
-+ s8 status;
-+ u16 id, irq, flags;
-+ struct xen_netif_rx_response *resp;
-+ struct multicall_entry *mcl;
-+ struct sk_buff_head rxq;
-+ struct sk_buff *skb;
-+ int notify_nr = 0;
-+ int ret;
-+ int nr_frags;
-+ int count;
-+ unsigned long offset;
++ int err = 0;
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry, *e;
++ struct pci_dev *dev;
++ int found;
++ unsigned int domain, bus;
++
++ spin_lock(&dev_data->lock);
++
++ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
++ /* Only publish this device as a root if none of its
++ * parent bridges are exported
++ */
++ found = 0;
++ dev = dev_entry->dev->bus->self;
++ for (; !found && dev != NULL; dev = dev->bus->self) {
++ list_for_each_entry(e, &dev_data->dev_list, list) {
++ if (dev == e->dev) {
++ found = 1;
++ break;
++ }
++ }
++ }
+
-+ /*
-+ * Putting hundreds of bytes on the stack is considered rude.
-+ * Static works because a tasklet can only be on one CPU at any time.
-+ */
-+ static struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
-+ static struct mmu_update rx_mmu[NET_RX_RING_SIZE];
-+ static struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
-+ static struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
-+ static unsigned char rx_notify[NR_IRQS];
-+ static u16 notify_list[NET_RX_RING_SIZE];
-+ static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
++ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
++ bus = (unsigned int)dev_entry->dev->bus->number;
+
-+ struct netrx_pending_operations npo = {
-+ mmu: rx_mmu,
-+ trans: grant_trans_op,
-+ copy: grant_copy_op,
-+ mcl: rx_mcl,
-+ meta: meta};
++ if (!found) {
++ err = publish_root_cb(pdev, domain, bus);
++ if (err)
++ break;
++ }
++ }
+
-+ skb_queue_head_init(&rxq);
++ spin_unlock(&dev_data->lock);
+
-+ count = 0;
++ return err;
++}
+
-+ while ((skb = skb_dequeue(&rx_queue)) != NULL) {
-+ nr_frags = skb_shinfo(skb)->nr_frags;
-+ *(int *)skb->cb = nr_frags;
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry, *t;
+
-+ netbk_gop_skb(skb, &npo);
++ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
++ list_del(&dev_entry->list);
++ pcistub_put_pci_dev(dev_entry->dev);
++ kfree(dev_entry);
++ }
+
-+ count += nr_frags + 1;
++ kfree(dev_data);
++ pdev->pci_dev_data = NULL;
++}
+
-+ __skb_queue_tail(&rxq, skb);
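++/* In passthrough mode the frontend sees the device at its real
++ * domain:bus:devfn; no translation is performed. */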
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn)
+
-+ /* Filled the batch queue? */
-+ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
-+ break;
-+ }
++{
++ *domain = pci_domain_nr(pcidev->bus);
++ *bus = pcidev->bus->number;
++ *devfn = pcidev->devfn;
++ return 1;
++}
+diff --git a/drivers/xen/pciback/pci_stub.c b/drivers/xen/pciback/pci_stub.c
+new file mode 100644
+index 0000000..02178e2
+--- /dev/null
++++ b/drivers/xen/pciback/pci_stub.c
+@@ -0,0 +1,1287 @@
++/*
++ * PCI Stub Driver - Grabs devices in backend to be exported later
++ *
++ * Ryan Wilson <hap9 at epoch.ncsc.mil>
++ * Chris Bookholt <hap10 at epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/rwsem.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/kref.h>
++#include <linux/pci.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
++#include <asm/atomic.h>
++#include <xen/events.h>
++#include <asm/xen/pci.h>
++#include <asm/xen/hypervisor.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
++
++static char *pci_devs_to_hide;
++wait_queue_head_t aer_wait_queue;
++/* Semaphore to synchronize AER handling with pciback remove/reconfigure
++ * operations; we must avoid removing a pciback device in the middle of
++ * an AER operation.
++ */
++static DECLARE_RWSEM(pcistub_sem);
++module_param_named(hide, pci_devs_to_hide, charp, 0444);
+
-+ BUG_ON(npo.meta_prod > ARRAY_SIZE(meta));
++struct pcistub_device_id {
++ struct list_head slot_list;
++ int domain;
++ unsigned char bus;
++ unsigned int devfn;
++};
++static LIST_HEAD(pcistub_device_ids);
++static DEFINE_SPINLOCK(device_ids_lock);
+
-+ npo.mmu_mcl = npo.mcl_prod;
-+ if (npo.mcl_prod) {
-+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-+ BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu));
-+ mcl = npo.mcl + npo.mcl_prod++;
++struct pcistub_device {
++ struct kref kref;
++ struct list_head dev_list;
++ spinlock_t lock;
+
-+ BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
-+ mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
++ struct pci_dev *dev;
++ struct pciback_device *pdev;/* non-NULL if struct pci_dev is in use */
++};
+
-+ mcl->op = __HYPERVISOR_mmu_update;
-+ mcl->args[0] = (unsigned long)rx_mmu;
-+ mcl->args[1] = npo.mmu_prod;
-+ mcl->args[2] = 0;
-+ mcl->args[3] = DOMID_SELF;
-+ }
++/* Access to pcistub_devices & seized_devices lists and the initialize_devices
++ * flag must be locked with pcistub_devices_lock
++ */
++static DEFINE_SPINLOCK(pcistub_devices_lock);
++static LIST_HEAD(pcistub_devices);
+
-+ if (npo.trans_prod) {
-+ BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op));
-+ mcl = npo.mcl + npo.mcl_prod++;
-+ mcl->op = __HYPERVISOR_grant_table_op;
-+ mcl->args[0] = GNTTABOP_transfer;
-+ mcl->args[1] = (unsigned long)grant_trans_op;
-+ mcl->args[2] = npo.trans_prod;
-+ }
++/* wait for device_initcall before initializing our devices
++ * (see pcistub_init_devices_late)
++ */
++static int initialize_devices;
++static LIST_HEAD(seized_devices);
++
++static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev;
+
-+ if (npo.copy_prod) {
-+ BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op));
-+ mcl = npo.mcl + npo.mcl_prod++;
-+ mcl->op = __HYPERVISOR_grant_table_op;
-+ mcl->args[0] = GNTTABOP_copy;
-+ mcl->args[1] = (unsigned long)grant_copy_op;
-+ mcl->args[2] = npo.copy_prod;
++ dev_dbg(&dev->dev, "pcistub_device_alloc\n");
++
++ psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
++ if (!psdev)
++ return NULL;
++
++ psdev->dev = pci_dev_get(dev);
++ if (!psdev->dev) {
++ kfree(psdev);
++ return NULL;
+ }
+
-+ /* Nothing to do? */
-+ if (!npo.mcl_prod)
-+ return;
++ kref_init(&psdev->kref);
++ spin_lock_init(&psdev->lock);
+
-+ BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl));
++ return psdev;
++}
+
-+ ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
-+ BUG_ON(ret != 0);
-+ /* The mmu_machphys_update() must not fail. */
-+ BUG_ON(npo.mmu_mcl && npo.mcl[npo.mmu_mcl].result != 0);
++/* Don't call this directly as it's called by pcistub_device_put */
++static void pcistub_device_release(struct kref *kref)
++{
++ struct pcistub_device *psdev;
+
-+ while ((skb = __skb_dequeue(&rxq)) != NULL) {
-+ nr_frags = *(int *)skb->cb;
++ psdev = container_of(kref, struct pcistub_device, kref);
+
-+ netif = netdev_priv(skb->dev);
++ dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
+
-+ netif->stats.tx_bytes += skb->len;
-+ netif->stats.tx_packets++;
++ xen_unregister_device_domain_owner(psdev->dev);
+
-+ status = netbk_check_gop(nr_frags, netif->domid, &npo);
++ /* Clean-up the device */
++ pciback_reset_device(psdev->dev);
++ pciback_config_free_dyn_fields(psdev->dev);
++ pciback_config_free_dev(psdev->dev);
++ kfree(pci_get_drvdata(psdev->dev));
++ pci_set_drvdata(psdev->dev, NULL);
+
-+ id = meta[npo.meta_cons].id;
-+ flags = nr_frags ? NETRXF_more_data : 0;
++ pci_dev_put(psdev->dev);
+
-+ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
-+ flags |= NETRXF_csum_blank | NETRXF_data_validated;
-+ else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
-+ /* remote but checksummed. */
-+ flags |= NETRXF_data_validated;
++ kfree(psdev);
++}
+
-+ offset = 0;
-+ resp = make_rx_response(netif, id, status, offset,
-+ skb_headlen(skb), flags);
++static inline void pcistub_device_get(struct pcistub_device *psdev)
++{
++ kref_get(&psdev->kref);
++}
+
-+ if (meta[npo.meta_cons].frag.size) {
-+ struct xen_netif_extra_info *gso =
-+ (struct xen_netif_extra_info *)
-+ RING_GET_RESPONSE(&netif->rx,
-+ netif->rx.rsp_prod_pvt++);
++static inline void pcistub_device_put(struct pcistub_device *psdev)
++{
++ kref_put(&psdev->kref, pcistub_device_release);
++}
+
-+ resp->flags |= NETRXF_extra_info;
++static struct pcistub_device *pcistub_device_find(int domain, int bus,
++ int slot, int func)
++{
++ struct pcistub_device *psdev = NULL;
++ unsigned long flags;
+
-+ gso->u.gso.size = meta[npo.meta_cons].frag.size;
-+ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
-+ gso->u.gso.pad = 0;
-+ gso->u.gso.features = 0;
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
-+ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
-+ gso->flags = 0;
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev != NULL
++ && domain == pci_domain_nr(psdev->dev->bus)
++ && bus == psdev->dev->bus->number
++ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
++ pcistub_device_get(psdev);
++ goto out;
+ }
++ }
+
-+ netbk_add_frag_responses(netif, status,
-+ meta + npo.meta_cons + 1,
-+ nr_frags);
++ /* didn't find it */
++ psdev = NULL;
+
-+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
-+ irq = netif->irq;
-+ if (ret && !rx_notify[irq] &&
-+ (netif->smart_poll != 1)) {
-+ rx_notify[irq] = 1;
-+ notify_list[notify_nr++] = irq;
-+ }
++out:
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return psdev;
++}
+
-+ if (netif_queue_stopped(netif->dev) &&
-+ netif_schedulable(netif) &&
-+ !netbk_queue_full(netif))
-+ netif_wake_queue(netif->dev);
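++/* Hand the stub's pci_dev over to a pciback_device, but only if no other
++ * pciback_device already owns it; psdev->pdev doubles as the in-use marker. */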
++static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
++ struct pcistub_device *psdev)
++{
++ struct pci_dev *pci_dev = NULL;
++ unsigned long flags;
+
-+ /*
-+ * netfront_smartpoll_active indicates whether
-+ * netfront timer is active.
-+ */
-+ if ((netif->smart_poll == 1)) {
-+ if (!(netif->rx.sring->netfront_smartpoll_active)) {
-+ notify_remote_via_irq(irq);
-+ netif->rx.sring->netfront_smartpoll_active = 1;
-+ }
-+ }
++ pcistub_device_get(psdev);
+
-+ netif_put(netif);
-+ dev_kfree_skb(skb);
-+ npo.meta_cons += nr_frags + 1;
++ spin_lock_irqsave(&psdev->lock, flags);
++ if (!psdev->pdev) {
++ psdev->pdev = pdev;
++ pci_dev = psdev->dev;
+ }
++ spin_unlock_irqrestore(&psdev->lock, flags);
+
-+ while (notify_nr != 0) {
-+ irq = notify_list[--notify_nr];
-+ rx_notify[irq] = 0;
-+ notify_remote_via_irq(irq);
-+ }
++ if (!pci_dev)
++ pcistub_device_put(psdev);
+
-+ /* More work to do? */
-+ if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
-+ tasklet_schedule(&net_rx_tasklet);
++ return pci_dev;
+}
+
-+static void net_alarm(unsigned long unused)
++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
++ int domain, int bus,
++ int slot, int func)
+{
-+ tasklet_schedule(&net_rx_tasklet);
-+}
++ struct pcistub_device *psdev;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
+
-+static void netbk_tx_pending_timeout(unsigned long unused)
-+{
-+ tasklet_schedule(&net_tx_tasklet);
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev != NULL
++ && domain == pci_domain_nr(psdev->dev->bus)
++ && bus == psdev->dev->bus->number
++ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
++ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
++ break;
++ }
++ }
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return found_dev;
+}
+
-+struct net_device_stats *netif_be_get_stats(struct net_device *dev)
++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
++ struct pci_dev *dev)
+{
-+ struct xen_netif *netif = netdev_priv(dev);
-+ return &netif->stats;
++ struct pcistub_device *psdev;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev == dev) {
++ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
++ break;
++ }
++ }
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return found_dev;
+}
+
-+static int __on_net_schedule_list(struct xen_netif *netif)
++void pcistub_put_pci_dev(struct pci_dev *dev)
+{
-+ return !list_empty(&netif->list);
++ struct pcistub_device *psdev, *found_psdev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev == dev) {
++ found_psdev = psdev;
++ break;
++ }
++ }
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++	/* Hold this lock to avoid breaking the link between pcistub and
++	 * pciback while AER handling is in progress.
++	 */
++ down_write(&pcistub_sem);
++ /* Cleanup our device
++ * (so it's ready for the next domain)
++ */
++ pciback_reset_device(found_psdev->dev);
++ pciback_config_free_dyn_fields(found_psdev->dev);
++ pciback_config_reset_dev(found_psdev->dev);
++
++ spin_lock_irqsave(&found_psdev->lock, flags);
++ found_psdev->pdev = NULL;
++ spin_unlock_irqrestore(&found_psdev->lock, flags);
++
++ pcistub_device_put(found_psdev);
++ up_write(&pcistub_sem);
+}
+
-+static void remove_from_net_schedule_list(struct xen_netif *netif)
++static int __devinit pcistub_match_one(struct pci_dev *dev,
++ struct pcistub_device_id *pdev_id)
+{
-+ spin_lock_irq(&net_schedule_list_lock);
-+ if (likely(__on_net_schedule_list(netif))) {
-+ list_del_init(&netif->list);
-+ netif_put(netif);
++ /* Match the specified device by domain, bus, slot, func and also if
++ * any of the device's parent bridges match.
++ */
++ for (; dev != NULL; dev = dev->bus->self) {
++ if (pci_domain_nr(dev->bus) == pdev_id->domain
++ && dev->bus->number == pdev_id->bus
++ && dev->devfn == pdev_id->devfn)
++ return 1;
++
++ /* Sometimes topmost bridge links to itself. */
++ if (dev == dev->bus->self)
++ break;
+ }
-+ spin_unlock_irq(&net_schedule_list_lock);
++
++ return 0;
+}
+
-+static void add_to_net_schedule_list_tail(struct xen_netif *netif)
++static int __devinit pcistub_match(struct pci_dev *dev)
+{
-+ if (__on_net_schedule_list(netif))
-+ return;
++ struct pcistub_device_id *pdev_id;
++ unsigned long flags;
++ int found = 0;
+
-+ spin_lock_irq(&net_schedule_list_lock);
-+ if (!__on_net_schedule_list(netif) &&
-+ likely(netif_schedulable(netif))) {
-+ list_add_tail(&netif->list, &net_schedule_list);
-+ netif_get(netif);
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
++ if (pcistub_match_one(dev, pdev_id)) {
++ found = 1;
++ break;
++ }
+ }
-+ spin_unlock_irq(&net_schedule_list_lock);
++ spin_unlock_irqrestore(&device_ids_lock, flags);
++
++ return found;
+}
+
-+void netif_schedule_work(struct xen_netif *netif)
++static int __devinit pcistub_init_device(struct pci_dev *dev)
+{
-+ int more_to_do;
++ struct pciback_dev_data *dev_data;
++ int err = 0;
+
-+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
++ dev_dbg(&dev->dev, "initializing...\n");
+
-+ if (more_to_do) {
-+ add_to_net_schedule_list_tail(netif);
-+ maybe_schedule_tx_action();
++	/* The PCI backend is not intended to be a module (or to work with
++	 * removable PCI devices) yet. If it were, pciback_config_free()
++	 * would need to be called somewhere to free the memory allocated
++	 * here, followed by kfree(pci_get_drvdata(psdev->dev)).
++	 */
++ dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC);
++ if (!dev_data) {
++ err = -ENOMEM;
++ goto out;
+ }
-+}
++ pci_set_drvdata(dev, dev_data);
+
-+void netif_deschedule_work(struct xen_netif *netif)
-+{
-+ remove_from_net_schedule_list(netif);
-+}
++ dev_dbg(&dev->dev, "initializing config\n");
+
++ init_waitqueue_head(&aer_wait_queue);
++ err = pciback_config_init_dev(dev);
++ if (err)
++ goto out;
+
-+static void tx_add_credit(struct xen_netif *netif)
-+{
-+ unsigned long max_burst, max_credit;
++ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
++ * must do this here because pcibios_enable_device may specify
++ * the pci device's true irq (and possibly its other resources)
++ * if they differ from what's in the configuration space.
++ * This makes the assumption that the device's resources won't
++ * change after this point (otherwise this code may break!)
++ */
++ dev_dbg(&dev->dev, "enabling device\n");
++ err = pci_enable_device(dev);
++ if (err)
++ goto config_release;
+
-+ /*
-+ * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
-+ * Otherwise the interface can seize up due to insufficient credit.
++	/* Now disable the device (this also ensures some private device
++	 * data is set up before we export).
++	 */
-+ max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
-+ max_burst = min(max_burst, 131072UL);
-+ max_burst = max(max_burst, netif->credit_bytes);
++ dev_dbg(&dev->dev, "reset device\n");
++ pciback_reset_device(dev);
+
-+ /* Take care that adding a new chunk of credit doesn't wrap to zero. */
-+ max_credit = netif->remaining_credit + netif->credit_bytes;
-+ if (max_credit < netif->remaining_credit)
-+ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
++ return 0;
+
-+ netif->remaining_credit = min(max_credit, max_burst);
-+}
++config_release:
++ pciback_config_free_dev(dev);
+
-+static void tx_credit_callback(unsigned long data)
-+{
-+ struct xen_netif *netif = (struct xen_netif *)data;
-+ tx_add_credit(netif);
-+ netif_schedule_work(netif);
++out:
++ pci_set_drvdata(dev, NULL);
++ kfree(dev_data);
++ return err;
+}
+
-+static inline int copy_pending_req(pending_ring_idx_t pending_idx)
++/*
++ * Because some initialization still happens on
++ * devices during fs_initcall, we need to defer
++ * full initialization of our devices until
++ * device_initcall.
++ */
++static int __init pcistub_init_devices_late(void)
+{
-+ return gnttab_copy_grant_page(grant_tx_handle[pending_idx],
-+ &mmap_pages[pending_idx]);
++ struct pcistub_device *psdev;
++ unsigned long flags;
++ int err = 0;
++
++ pr_debug("pciback: pcistub_init_devices_late\n");
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ while (!list_empty(&seized_devices)) {
++ psdev = container_of(seized_devices.next,
++ struct pcistub_device, dev_list);
++ list_del(&psdev->dev_list);
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ err = pcistub_init_device(psdev->dev);
++ if (err) {
++ dev_err(&psdev->dev->dev,
++ "error %d initializing device\n", err);
++ kfree(psdev);
++ psdev = NULL;
++ }
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ if (psdev)
++ list_add_tail(&psdev->dev_list, &pcistub_devices);
++ }
++
++ initialize_devices = 1;
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ return 0;
+}
+
-+inline static void net_tx_action_dealloc(void)
++static int __devinit pcistub_seize(struct pci_dev *dev)
+{
-+ struct netbk_tx_pending_inuse *inuse, *n;
-+ struct gnttab_unmap_grant_ref *gop;
-+ u16 pending_idx;
-+ pending_ring_idx_t dc, dp;
-+ struct xen_netif *netif;
-+ int ret;
-+ LIST_HEAD(list);
++ struct pcistub_device *psdev;
++ unsigned long flags;
++ int err = 0;
+
-+ dc = dealloc_cons;
-+ gop = tx_unmap_ops;
++ psdev = pcistub_device_alloc(dev);
++ if (!psdev)
++ return -ENOMEM;
+
-+ /*
-+ * Free up any grants we have finished using
-+ */
-+ do {
-+ dp = dealloc_prod;
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
-+ /* Ensure we see all indices enqueued by netif_idx_release(). */
-+ smp_rmb();
++ if (initialize_devices) {
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
-+ while (dc != dp) {
-+ unsigned long pfn;
++ /* don't want irqs disabled when calling pcistub_init_device */
++ err = pcistub_init_device(psdev->dev);
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ if (!err)
++ list_add(&psdev->dev_list, &pcistub_devices);
++ } else {
++ dev_dbg(&dev->dev, "deferring initialization\n");
++ list_add(&psdev->dev_list, &seized_devices);
++ }
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ if (err)
++ pcistub_device_put(psdev);
++
++ return err;
++}
++
++static int __devinit pcistub_probe(struct pci_dev *dev,
++ const struct pci_device_id *id)
++{
++ int err = 0;
+
-+ pending_idx = dealloc_ring[pending_index(dc++)];
-+ list_move_tail(&pending_inuse[pending_idx].list, &list);
++ dev_dbg(&dev->dev, "probing...\n");
+
-+ pfn = idx_to_pfn(pending_idx);
-+ /* Already unmapped? */
-+ if (!phys_to_machine_mapping_valid(pfn))
-+ continue;
++ if (pcistub_match(dev)) {
+
-+ gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
-+ GNTMAP_host_map,
-+ grant_tx_handle[pending_idx]);
-+ gop++;
++ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
++ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
++ dev_err(&dev->dev, "can't export pci devices that "
++ "don't have a normal (0) or bridge (1) "
++ "header type!\n");
++ err = -ENODEV;
++ goto out;
+ }
+
-+ if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB ||
-+ list_empty(&pending_inuse_head))
-+ break;
++ dev_info(&dev->dev, "seizing device\n");
++ err = pcistub_seize(dev);
++ } else
++ /* Didn't find the device */
++ err = -ENODEV;
+
-+ /* Copy any entries that have been pending for too long. */
-+ list_for_each_entry_safe(inuse, n, &pending_inuse_head, list) {
-+ if (time_after(inuse->alloc_time + HZ / 2, jiffies))
-+ break;
++out:
++ return err;
++}
+
-+ pending_idx = inuse - pending_inuse;
++static void pcistub_remove(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev, *found_psdev = NULL;
++ unsigned long flags;
+
-+ pending_tx_info[pending_idx].netif->nr_copied_skbs++;
++ dev_dbg(&dev->dev, "removing\n");
+
-+ switch (copy_pending_req(pending_idx)) {
-+ case 0:
-+ list_move_tail(&inuse->list, &list);
-+ continue;
-+ case -EBUSY:
-+ list_del_init(&inuse->list);
-+ continue;
-+ case -ENOENT:
-+ continue;
-+ }
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ pciback_config_quirk_release(dev);
+
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev == dev) {
++ found_psdev = psdev;
+ break;
+ }
-+ } while (dp != dealloc_prod);
++ }
+
-+ dealloc_cons = dc;
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
-+ ret = HYPERVISOR_grant_table_op(
-+ GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
-+ BUG_ON(ret);
++ if (found_psdev) {
++ dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
++ found_psdev->pdev);
+
-+ list_for_each_entry_safe(inuse, n, &list, list) {
-+ pending_idx = inuse - pending_inuse;
++ if (found_psdev->pdev) {
++ printk(KERN_WARNING "pciback: ****** removing device "
++ "%s while still in-use! ******\n",
++ pci_name(found_psdev->dev));
++ printk(KERN_WARNING "pciback: ****** driver domain may "
++ "still access this device's i/o resources!\n");
++ printk(KERN_WARNING "pciback: ****** shutdown driver "
++ "domain before binding device\n");
++ printk(KERN_WARNING "pciback: ****** to other drivers "
++ "or domains\n");
+
-+ netif = pending_tx_info[pending_idx].netif;
++ pciback_release_pci_dev(found_psdev->pdev,
++ found_psdev->dev);
++ }
+
-+ make_tx_response(netif, &pending_tx_info[pending_idx].req,
-+ NETIF_RSP_OKAY);
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ list_del(&found_psdev->dev_list);
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
-+ /* Ready for next use. */
-+ gnttab_reset_grant_page(mmap_pages[pending_idx]);
++ /* the final put for releasing from the list */
++ pcistub_device_put(found_psdev);
++ }
++}
+
-+ pending_ring[pending_index(pending_prod++)] = pending_idx;
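++/* Match every PCI device; pcistub_probe() then uses pcistub_match() to
++ * decide whether a given device should actually be seized. */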
++static const struct pci_device_id pcistub_ids[] = {
++ {
++ .vendor = PCI_ANY_ID,
++ .device = PCI_ANY_ID,
++ .subvendor = PCI_ANY_ID,
++ .subdevice = PCI_ANY_ID,
++ },
++ {0,},
++};
+
-+ netif_put(netif);
++#define PCI_NODENAME_MAX 40
++static void kill_domain_by_device(struct pcistub_device *psdev)
++{
++ struct xenbus_transaction xbt;
++ int err;
++ char nodename[PCI_NODENAME_MAX];
+
-+ list_del_init(&inuse->list);
++	if (!psdev) {
++		/* Nothing to report against; bail out instead of dereferencing NULL. */
++		pr_err("pciback: device is NULL when doing AER recovery/kill_domain\n");
++		return;
++	}
++ snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
++ psdev->pdev->xdev->otherend_id);
++ nodename[strlen(nodename)] = '\0';
++
++again:
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ dev_err(&psdev->dev->dev,
++ "error %d when start xenbus transaction\n", err);
++ return;
++ }
++ /*PV AER handlers will set this flag*/
++ xenbus_printf(xbt, nodename, "aerState" , "aerfail");
++ err = xenbus_transaction_end(xbt, 0);
++ if (err) {
++ if (err == -EAGAIN)
++ goto again;
++ dev_err(&psdev->dev->dev,
++ "error %d when end xenbus transaction\n", err);
++ return;
+ }
+}
+
-+static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *txp, RING_IDX end)
++/* For each AER recovery step (error_detected, mmio_enabled, etc.), the
++ * frontend and backend need to cooperate. In pciback those steps all do a
++ * similar job: send a service request and wait for the frontend's response.
++ */
++static pci_ers_result_t common_process(struct pcistub_device *psdev,
++ pci_channel_state_t state, int aer_cmd, pci_ers_result_t result)
+{
-+ RING_IDX cons = netif->tx.req_cons;
++ pci_ers_result_t res = result;
++ struct xen_pcie_aer_op *aer_op;
++ int ret;
+
-+ do {
-+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
-+ if (cons >= end)
-+ break;
-+ txp = RING_GET_REQUEST(&netif->tx, cons++);
-+ } while (1);
-+ netif->tx.req_cons = cons;
-+ netif_schedule_work(netif);
-+ netif_put(netif);
++	/* Fill in the shared AER request for the guest's PV AER driver. */
++	aer_op = &(psdev->pdev->sh_info->aer_op);
++	aer_op->cmd = aer_cmd;
++	/* The channel state is useful for the error_detected callback. */
++	aer_op->err = state;
++	/* The frontend's BDF for this device. */
++ ret = pciback_get_pcifront_dev(psdev->dev, psdev->pdev,
++ &aer_op->domain, &aer_op->bus, &aer_op->devfn);
++ if (!ret) {
++ dev_err(&psdev->dev->dev,
++ "pciback: failed to get pcifront device\n");
++ return PCI_ERS_RESULT_NONE;
++ }
++ wmb();
++
++ dev_dbg(&psdev->dev->dev,
++ "pciback: aer_op %x dom %x bus %x devfn %x\n",
++ aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
++	/* Local flag marking that an AER request is outstanding; the pciback
++	 * callback uses it to decide whether to check for the AER service
++	 * ack signal from pcifront.
++	 */
++ set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
++
++	/* It is possible that a pcifront conf_read_write op request invokes
++	 * the callback, causing a spurious wake_up. That is harmless and
++	 * better than taking a spinlock here.
++	 */
++ set_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags);
++ wmb();
++ notify_remote_via_irq(psdev->pdev->evtchn_irq);
++
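++	/* Wait up to 300 seconds for pcifront to ack by clearing
++	 * _XEN_PCIB_active; wait_event_timeout() returns 0 on timeout. */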
++ ret = wait_event_timeout(aer_wait_queue, !(test_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags)), 300*HZ);
++
++ if (!ret) {
++ if (test_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&psdev->dev->dev,
++ "pcifront aer process not responding!\n");
++ clear_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags);
++ aer_op->err = PCI_ERS_RESULT_NONE;
++ return res;
++ }
++ }
++ clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
++
++ if (test_bit(_XEN_PCIF_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_dbg(&psdev->dev->dev,
++ "schedule pci_conf service in pciback \n");
++ test_and_schedule_op(psdev->pdev);
++ }
++
++ res = (pci_ers_result_t)aer_op->err;
++ return res;
+}
+
-+static int netbk_count_requests(struct xen_netif *netif,
-+ struct xen_netif_tx_request *first,
-+ struct xen_netif_tx_request *txp, int work_to_do)
++/*
++* pciback_slot_reset: it will send the slot_reset request to pcifront in case
++* of the device driver could provide this service, and then wait for pcifront
++* ack.
++* @dev: pointer to PCI devices
++* return value is used by aer_core do_recovery policy
++*/
++static pci_ers_result_t pciback_slot_reset(struct pci_dev *dev)
+{
-+ RING_IDX cons = netif->tx.req_cons;
-+ int frags = 0;
++ struct pcistub_device *psdev;
++ pci_ers_result_t result;
+
-+ if (!(first->flags & NETTXF_more_data))
-+ return 0;
-+
-+ do {
-+ if (frags >= work_to_do) {
-+ DPRINTK("Need more frags\n");
-+ return -frags;
-+ }
++ result = PCI_ERS_RESULT_RECOVERED;
++ dev_dbg(&dev->dev, "pciback_slot_reset(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
++
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
++
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
++ }
++
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
+
-+ if (unlikely(frags >= MAX_SKB_FRAGS)) {
-+ DPRINTK("Too many frags\n");
-+ return -frags;
-+ }
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&dev->dev,
++ "guest with no AER driver should have been killed\n");
++ goto release;
++ }
++ result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
+
-+ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
-+ sizeof(*txp));
-+ if (txp->size > first->size) {
-+ DPRINTK("Frags galore\n");
-+ return -frags;
-+ }
++ if (result == PCI_ERS_RESULT_NONE ||
++ result == PCI_ERS_RESULT_DISCONNECT) {
++ dev_dbg(&dev->dev,
++ "No AER slot_reset service or disconnected!\n");
++ kill_domain_by_device(psdev);
++ }
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return result;
+
-+ first->size -= txp->size;
-+ frags++;
++}
+
-+ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
-+ DPRINTK("txp->offset: %x, size: %u\n",
-+ txp->offset, txp->size);
-+ return -frags;
-+ }
-+ } while ((txp++)->flags & NETTXF_more_data);
+
-+ return frags;
-+}
++/*
++ * pciback_mmio_enabled: send the mmio_enabled request to pcifront, provided
++ * the device driver can offer this service, and then wait for pcifront's ack.
++ * @dev: pointer to the PCI device
++ * The return value is used by the AER core's do_recovery policy.
++ */
+
-+static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
-+ struct sk_buff *skb,
-+ struct xen_netif_tx_request *txp,
-+ struct gnttab_map_grant_ref *mop)
++static pci_ers_result_t pciback_mmio_enabled(struct pci_dev *dev)
+{
-+ struct skb_shared_info *shinfo = skb_shinfo(skb);
-+ skb_frag_t *frags = shinfo->frags;
-+ unsigned long pending_idx = *((u16 *)skb->data);
-+ int i, start;
++ struct pcistub_device *psdev;
++ pci_ers_result_t result;
+
-+ /* Skip first skb fragment if it is on same page as header fragment. */
-+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
++ result = PCI_ERS_RESULT_RECOVERED;
++ dev_dbg(&dev->dev, "pciback_mmio_enabled(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
++
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
++
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
++ }
++
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
+
-+ for (i = start; i < shinfo->nr_frags; i++, txp++) {
-+ pending_idx = pending_ring[pending_index(pending_cons++)];
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&dev->dev,
++ "guest with no AER driver should have been killed\n");
++ goto release;
++ }
++ result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
+
-+ gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
-+ GNTMAP_host_map | GNTMAP_readonly,
-+ txp->gref, netif->domid);
++ if (result == PCI_ERS_RESULT_NONE ||
++ result == PCI_ERS_RESULT_DISCONNECT) {
++ dev_dbg(&dev->dev,
++ "No AER mmio_enabled service or disconnected!\n");
++ kill_domain_by_device(psdev);
++ }
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return result;
++}
++
++/*
++ * pciback_error_detected: send the error_detected request to pcifront,
++ * provided the device driver can offer this service, and then wait for
++ * pcifront's ack.
++ * @dev: pointer to the PCI device
++ * @error: the current PCI connection state
++ * The return value is used by the AER core's do_recovery policy.
++ */
+
-+ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
-+ netif_get(netif);
-+ pending_tx_info[pending_idx].netif = netif;
-+ frags[i].page = (void *)pending_idx;
++static pci_ers_result_t pciback_error_detected(struct pci_dev *dev,
++ pci_channel_state_t error)
++{
++ struct pcistub_device *psdev;
++ pci_ers_result_t result;
++
++ result = PCI_ERS_RESULT_CAN_RECOVER;
++ dev_dbg(&dev->dev, "pciback_error_detected(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
++
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
++
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
++ }
++
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
+ }
+
-+ return mop;
-+}
++ /* Guest owns the device yet no AER handler registered, kill guest */
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
++ result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
+
-+static int netbk_tx_check_mop(struct sk_buff *skb,
-+ struct gnttab_map_grant_ref **mopp)
++ if (result == PCI_ERS_RESULT_NONE ||
++ result == PCI_ERS_RESULT_DISCONNECT) {
++ dev_dbg(&dev->dev,
++ "No AER error_detected service or disconnected!\n");
++ kill_domain_by_device(psdev);
++ }
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return result;
++}
++
++/* pciback_error_resume: send the error_resume request to pcifront in case
++* the device driver provides this service, and then wait for the pcifront
++* ack.
++* @dev: pointer to the PCI device
++*/
++
++static void pciback_error_resume(struct pci_dev *dev)
+{
-+ struct gnttab_map_grant_ref *mop = *mopp;
-+ int pending_idx = *((u16 *)skb->data);
-+ struct xen_netif *netif = pending_tx_info[pending_idx].netif;
-+ struct xen_netif_tx_request *txp;
-+ struct skb_shared_info *shinfo = skb_shinfo(skb);
-+ int nr_frags = shinfo->nr_frags;
-+ int i, err, start;
++ struct pcistub_device *psdev;
+
-+ /* Check status of header. */
-+ err = mop->status;
-+ if (unlikely(err)) {
-+ txp = &pending_tx_info[pending_idx].req;
-+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
-+ pending_ring[pending_index(pending_prod++)] = pending_idx;
-+ netif_put(netif);
-+ } else {
-+ set_phys_to_machine(
-+ __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
-+ FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
-+ grant_tx_handle[pending_idx] = mop->handle;
++ dev_dbg(&dev->dev, "pciback_error_resume(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
++
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
++
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
+ }
+
-+ /* Skip first skb fragment if it is on same page as header fragment. */
-+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
+
-+ for (i = start; i < nr_frags; i++) {
-+ int j, newerr;
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&dev->dev,
++ "guest with no AER driver should have been killed\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
++ common_process(psdev, 1, XEN_PCI_OP_aer_resume,
++ PCI_ERS_RESULT_RECOVERED);
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return;
++}
+
-+ pending_idx = (unsigned long)shinfo->frags[i].page;
++/* Add pciback AER handling */
++static struct pci_error_handlers pciback_error_handler = {
++ .error_detected = pciback_error_detected,
++ .mmio_enabled = pciback_mmio_enabled,
++ .slot_reset = pciback_slot_reset,
++ .resume = pciback_error_resume,
++};
+
-+ /* Check error status: if okay then remember grant handle. */
-+ newerr = (++mop)->status;
-+ if (likely(!newerr)) {
-+ set_phys_to_machine(
-+ __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
-+ FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
-+ grant_tx_handle[pending_idx] = mop->handle;
-+ /* Had a previous error? Invalidate this fragment. */
-+ if (unlikely(err))
-+ netif_idx_release(pending_idx);
-+ continue;
-+ }
++/*
++ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
++ * for a normal device. I don't want it to be loaded automatically.
++ */
+
-+ /* Error on this fragment: respond to client with an error. */
-+ txp = &pending_tx_info[pending_idx].req;
-+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
-+ pending_ring[pending_index(pending_prod++)] = pending_idx;
-+ netif_put(netif);
++static struct pci_driver pciback_pci_driver = {
++ .name = "pciback",
++ .id_table = pcistub_ids,
++ .probe = pcistub_probe,
++ .remove = pcistub_remove,
++ .err_handler = &pciback_error_handler,
++};
+
-+ /* Not the first error? Preceding frags already invalidated. */
-+ if (err)
-+ continue;
++static inline int str_to_slot(const char *buf, int *domain, int *bus,
++ int *slot, int *func)
++{
++ int err;
+
-+ /* First error: invalidate header and preceding fragments. */
-+ pending_idx = *((u16 *)skb->data);
-+ netif_idx_release(pending_idx);
-+ for (j = start; j < i; j++) {
-+ pending_idx = (unsigned long)shinfo->frags[i].page;
-+ netif_idx_release(pending_idx);
-+ }
++ err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
++ if (err == 4)
++ return 0;
++ else if (err < 0)
++ return -EINVAL;
+
-+ /* Remember the error: invalidate all subsequent fragments. */
-+ err = newerr;
-+ }
++ /* try again without domain */
++ *domain = 0;
++ err = sscanf(buf, " %x:%x.%x", bus, slot, func);
++ if (err == 3)
++ return 0;
+
-+ *mopp = mop + 1;
-+ return err;
++ return -EINVAL;
+}
+
-+static void netbk_fill_frags(struct sk_buff *skb)
++static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
++ *slot, int *func, int *reg, int *size, int *mask)
+{
-+ struct skb_shared_info *shinfo = skb_shinfo(skb);
-+ int nr_frags = shinfo->nr_frags;
-+ int i;
++ int err;
+
-+ for (i = 0; i < nr_frags; i++) {
-+ skb_frag_t *frag = shinfo->frags + i;
-+ struct xen_netif_tx_request *txp;
-+ unsigned long pending_idx;
++ err =
++ sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
++ func, reg, size, mask);
++ if (err == 7)
++ return 0;
++ return -EINVAL;
++}
++
++static int pcistub_device_id_add(int domain, int bus, int slot, int func)
++{
++ struct pcistub_device_id *pci_dev_id;
++ unsigned long flags;
++
++ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
++ if (!pci_dev_id)
++ return -ENOMEM;
+
-+ pending_idx = (unsigned long)frag->page;
++ pci_dev_id->domain = domain;
++ pci_dev_id->bus = bus;
++ pci_dev_id->devfn = PCI_DEVFN(slot, func);
+
-+ pending_inuse[pending_idx].alloc_time = jiffies;
-+ list_add_tail(&pending_inuse[pending_idx].list,
-+ &pending_inuse_head);
++ pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
++ domain, bus, slot, func);
+
-+ txp = &pending_tx_info[pending_idx].req;
-+ frag->page = virt_to_page(idx_to_kaddr(pending_idx));
-+ frag->size = txp->size;
-+ frag->page_offset = txp->offset;
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
++ spin_unlock_irqrestore(&device_ids_lock, flags);
+
-+ skb->len += txp->size;
-+ skb->data_len += txp->size;
-+ skb->truesize += txp->size;
-+ }
++ return 0;
+}
+
-+int netbk_get_extras(struct xen_netif *netif, struct xen_netif_extra_info *extras,
-+ int work_to_do)
++static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
+{
-+ struct xen_netif_extra_info extra;
-+ RING_IDX cons = netif->tx.req_cons;
++ struct pcistub_device_id *pci_dev_id, *t;
++ int devfn = PCI_DEVFN(slot, func);
++ int err = -ENOENT;
++ unsigned long flags;
+
-+ do {
-+ if (unlikely(work_to_do-- <= 0)) {
-+ DPRINTK("Missing extra info\n");
-+ return -EBADR;
-+ }
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids,
++ slot_list) {
++ if (pci_dev_id->domain == domain
++ && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
++ /* Don't break; here because it's possible the same
++ * slot could be in the list more than once
++ */
++ list_del(&pci_dev_id->slot_list);
++ kfree(pci_dev_id);
+
-+ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
-+ sizeof(extra));
-+ if (unlikely(!extra.type ||
-+ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
-+ netif->tx.req_cons = ++cons;
-+ DPRINTK("Invalid extra type: %d\n", extra.type);
-+ return -EINVAL;
-+ }
++ err = 0;
+
-+ memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
-+ netif->tx.req_cons = ++cons;
-+ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
++ pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
++ "seize list\n", domain, bus, slot, func);
++ }
++ }
++ spin_unlock_irqrestore(&device_ids_lock, flags);
+
-+ return work_to_do;
++ return err;
+}
+
-+static int netbk_set_skb_gso(struct sk_buff *skb, struct xen_netif_extra_info *gso)
++static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
++ int size, int mask)
+{
-+ if (!gso->u.gso.size) {
-+ DPRINTK("GSO size must not be zero.\n");
-+ return -EINVAL;
-+ }
++ int err = 0;
++ struct pcistub_device *psdev;
++ struct pci_dev *dev;
++ struct config_field *field;
+
-+ /* Currently only TCPv4 S.O. is supported. */
-+ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
-+ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
-+ return -EINVAL;
++ psdev = pcistub_device_find(domain, bus, slot, func);
++ if (!psdev || !psdev->dev) {
++ err = -ENODEV;
++ goto out;
+ }
++ dev = psdev->dev;
+
-+ skb_shinfo(skb)->gso_size = gso->u.gso.size;
-+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
++ field = kzalloc(sizeof(*field), GFP_ATOMIC);
++ if (!field) {
++ err = -ENOMEM;
++ goto out;
++ }
+
-+ /* Header must be checked, and gso_segs computed. */
-+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-+ skb_shinfo(skb)->gso_segs = 0;
++ field->offset = reg;
++ field->size = size;
++ field->mask = mask;
++ field->init = NULL;
++ field->reset = NULL;
++ field->release = NULL;
++ field->clean = pciback_config_field_free;
+
-+ return 0;
++ err = pciback_config_quirks_add_field(dev, field);
++ if (err)
++ kfree(field);
++out:
++ return err;
+}
+
-+static int skb_checksum_setup(struct sk_buff *skb)
++static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
++ size_t count)
+{
-+ struct iphdr *iph;
-+ unsigned char *th;
-+ int err = -EPROTO;
++ int domain, bus, slot, func;
++ int err;
+
-+ if (skb->protocol != htons(ETH_P_IP))
++ err = str_to_slot(buf, &domain, &bus, &slot, &func);
++ if (err)
+ goto out;
+
-+ iph = (void *)skb->data;
-+ th = skb->data + 4 * iph->ihl;
-+ if (th >= skb_tail_pointer(skb))
-+ goto out;
++ err = pcistub_device_id_add(domain, bus, slot, func);
+
-+ skb->csum_start = th - skb->head;
-+ switch (iph->protocol) {
-+ case IPPROTO_TCP:
-+ skb->csum_offset = offsetof(struct tcphdr, check);
-+ break;
-+ case IPPROTO_UDP:
-+ skb->csum_offset = offsetof(struct udphdr, check);
-+ break;
-+ default:
-+ if (net_ratelimit())
-+ printk(KERN_ERR "Attempting to checksum a non-"
-+ "TCP/UDP packet, dropping a protocol"
-+ " %d packet", iph->protocol);
-+ goto out;
-+ }
++out:
++ if (!err)
++ err = count;
++ return err;
++}
+
-+ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
++DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
++
++static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
++ size_t count)
++{
++ int domain, bus, slot, func;
++ int err;
++
++ err = str_to_slot(buf, &domain, &bus, &slot, &func);
++ if (err)
+ goto out;
+
-+ err = 0;
++ err = pcistub_device_id_remove(domain, bus, slot, func);
+
+out:
++ if (!err)
++ err = count;
+ return err;
+}
+
-+static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size)
++DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
++
++static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
+{
-+ unsigned long now = jiffies;
-+ unsigned long next_credit =
-+ netif->credit_timeout.expires +
-+ msecs_to_jiffies(netif->credit_usec / 1000);
++ struct pcistub_device_id *pci_dev_id;
++ size_t count = 0;
++ unsigned long flags;
+
-+ /* Timer could already be pending in rare cases. */
-+ if (timer_pending(&netif->credit_timeout))
-+ return true;
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
++ if (count >= PAGE_SIZE)
++ break;
+
-+ /* Passed the point where we can replenish credit? */
-+ if (time_after_eq(now, next_credit)) {
-+ netif->credit_timeout.expires = now;
-+ tx_add_credit(netif);
++ count += scnprintf(buf + count, PAGE_SIZE - count,
++ "%04x:%02x:%02x.%01x\n",
++ pci_dev_id->domain, pci_dev_id->bus,
++ PCI_SLOT(pci_dev_id->devfn),
++ PCI_FUNC(pci_dev_id->devfn));
+ }
++ spin_unlock_irqrestore(&device_ids_lock, flags);
+
-+ /* Still too big to send right now? Set a callback. */
-+ if (size > netif->remaining_credit) {
-+ netif->credit_timeout.data =
-+ (unsigned long)netif;
-+ netif->credit_timeout.function =
-+ tx_credit_callback;
-+ mod_timer(&netif->credit_timeout,
-+ next_credit);
++ return count;
++}
+
-+ return true;
-+ }
++DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
+
-+ return false;
++static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
++ size_t count)
++{
++ int domain, bus, slot, func, reg, size, mask;
++ int err;
++
++ err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
++ &mask);
++ if (err)
++ goto out;
++
++ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
++
++out:
++ if (!err)
++ err = count;
++ return err;
+}
+
-+static unsigned net_tx_build_mops(void)
++static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
+{
-+ struct gnttab_map_grant_ref *mop;
-+ struct sk_buff *skb;
-+ int ret;
-+
-+ mop = tx_map_ops;
-+ while (((nr_pending_reqs() + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
-+ !list_empty(&net_schedule_list)) {
-+ struct xen_netif *netif;
-+ struct xen_netif_tx_request txreq;
-+ struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
-+ struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
-+ u16 pending_idx;
-+ RING_IDX idx;
-+ int work_to_do;
-+ unsigned int data_len;
-+
-+ /* Get a netif from the list with work to do. */
-+ netif = list_first_entry(&net_schedule_list, struct xen_netif, list);
-+ netif_get(netif);
-+ remove_from_net_schedule_list(netif);
++ int count = 0;
++ unsigned long flags;
++ struct pciback_config_quirk *quirk;
++ struct pciback_dev_data *dev_data;
++ const struct config_field *field;
++ const struct config_field_entry *cfg_entry;
++
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
++ if (count >= PAGE_SIZE)
++ goto out;
+
-+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
-+ if (!work_to_do) {
-+ netif_put(netif);
-+ continue;
++ count += scnprintf(buf + count, PAGE_SIZE - count,
++ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
++ quirk->pdev->bus->number,
++ PCI_SLOT(quirk->pdev->devfn),
++ PCI_FUNC(quirk->pdev->devfn),
++ quirk->devid.vendor, quirk->devid.device,
++ quirk->devid.subvendor,
++ quirk->devid.subdevice);
++
++ dev_data = pci_get_drvdata(quirk->pdev);
++
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++ if (count >= PAGE_SIZE)
++ goto out;
++
++ count += scnprintf(buf + count, PAGE_SIZE - count,
++ "\t\t%08x:%01x:%08x\n",
++ cfg_entry->base_offset +
++ field->offset, field->size,
++ field->mask);
+ }
++ }
+
-+ idx = netif->tx.req_cons;
-+ rmb(); /* Ensure that we see the request before we copy it. */
-+ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, idx), sizeof(txreq));
++out:
++ spin_unlock_irqrestore(&device_ids_lock, flags);
+
-+ /* Credit-based scheduling. */
-+ if (txreq.size > netif->remaining_credit &&
-+ tx_credit_exceeded(netif, txreq.size)) {
-+ netif_put(netif);
++ return count;
++}
++
++DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
++
++static ssize_t permissive_add(struct device_driver *drv, const char *buf,
++ size_t count)
++{
++ int domain, bus, slot, func;
++ int err;
++ struct pcistub_device *psdev;
++ struct pciback_dev_data *dev_data;
++ err = str_to_slot(buf, &domain, &bus, &slot, &func);
++ if (err)
++ goto out;
++ psdev = pcistub_device_find(domain, bus, slot, func);
++ if (!psdev) {
++ err = -ENODEV;
++ goto out;
++ }
++ if (!psdev->dev) {
++ err = -ENODEV;
++ goto release;
++ }
++ dev_data = pci_get_drvdata(psdev->dev);
++ /* the driver data for a device should never be null at this point */
++ if (!dev_data) {
++ err = -ENXIO;
++ goto release;
++ }
++ if (!dev_data->permissive) {
++ dev_data->permissive = 1;
++ /* Let user know that what they're doing could be unsafe */
++ dev_warn(&psdev->dev->dev, "enabling permissive mode "
++ "configuration space accesses!\n");
++ dev_warn(&psdev->dev->dev,
++ "permissive mode is potentially unsafe!\n");
++ }
++release:
++ pcistub_device_put(psdev);
++out:
++ if (!err)
++ err = count;
++ return err;
++}
++
++static ssize_t permissive_show(struct device_driver *drv, char *buf)
++{
++ struct pcistub_device *psdev;
++ struct pciback_dev_data *dev_data;
++ size_t count = 0;
++ unsigned long flags;
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (count >= PAGE_SIZE)
++ break;
++ if (!psdev->dev)
+ continue;
-+ }
++ dev_data = pci_get_drvdata(psdev->dev);
++ if (!dev_data || !dev_data->permissive)
++ continue;
++ count +=
++ scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
++ pci_name(psdev->dev));
++ }
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return count;
++}
+
-+ netif->remaining_credit -= txreq.size;
++DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
+
-+ work_to_do--;
-+ netif->tx.req_cons = ++idx;
++static void pcistub_exit(void)
++{
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
++ driver_remove_file(&pciback_pci_driver.driver,
++ &driver_attr_remove_slot);
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
+
-+ memset(extras, 0, sizeof(extras));
-+ if (txreq.flags & NETTXF_extra_info) {
-+ work_to_do = netbk_get_extras(netif, extras,
-+ work_to_do);
-+ idx = netif->tx.req_cons;
-+ if (unlikely(work_to_do < 0)) {
-+ netbk_tx_err(netif, &txreq, idx);
-+ continue;
++ pci_unregister_driver(&pciback_pci_driver);
++}
++
++static int __init pcistub_init(void)
++{
++ int pos = 0;
++ int err = 0;
++ int domain, bus, slot, func;
++ int parsed;
++
++ if (pci_devs_to_hide && *pci_devs_to_hide) {
++ do {
++ parsed = 0;
++
++ err = sscanf(pci_devs_to_hide + pos,
++ " (%x:%x:%x.%x) %n",
++ &domain, &bus, &slot, &func, &parsed);
++ if (err != 4) {
++ domain = 0;
++ err = sscanf(pci_devs_to_hide + pos,
++ " (%x:%x.%x) %n",
++ &bus, &slot, &func, &parsed);
++ if (err != 3)
++ goto parse_error;
+ }
-+ }
+
-+ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
-+ if (unlikely(ret < 0)) {
-+ netbk_tx_err(netif, &txreq, idx - ret);
-+ continue;
-+ }
-+ idx += ret;
++ err = pcistub_device_id_add(domain, bus, slot, func);
++ if (err)
++ goto out;
++
++ /* if parsed<=0, we've reached the end of the string */
++ pos += parsed;
++ } while (parsed > 0 && pci_devs_to_hide[pos]);
++ }
+
-+ if (unlikely(txreq.size < ETH_HLEN)) {
-+ DPRINTK("Bad packet size: %d\n", txreq.size);
-+ netbk_tx_err(netif, &txreq, idx);
-+ continue;
-+ }
++ /* If we're the first PCI Device Driver to register, we're the
++ * first one to get offered PCI devices as they become
++ * available (and thus we can be the first to grab them)
++ */
++ err = pci_register_driver(&pciback_pci_driver);
++ if (err < 0)
++ goto out;
++
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_new_slot);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_remove_slot);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_slots);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_quirks);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_permissive);
++
++ if (err)
++ pcistub_exit();
++
++out:
++ return err;
++
++parse_error:
++ printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
++ pci_devs_to_hide + pos);
++ return -EINVAL;
++}
++
++#ifndef MODULE
++/*
++ * fs_initcall happens before device_initcall
++ * so pciback *should* get called first (b/c we
++ * want to suck up any device before other drivers
++ * get a chance by being the first pci device
++ * driver to register)
++ */
++fs_initcall(pcistub_init);
++#endif
++
++static int __init pciback_init(void)
++{
++ int err;
++
++ if (!xen_initial_domain())
++ return -ENODEV;
++
++ err = pciback_config_init();
++ if (err)
++ return err;
+
-+ /* No crossing a page as the payload mustn't fragment. */
-+ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
-+ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
-+ txreq.offset, txreq.size,
-+ (txreq.offset &~PAGE_MASK) + txreq.size);
-+ netbk_tx_err(netif, &txreq, idx);
-+ continue;
-+ }
++#ifdef MODULE
++ err = pcistub_init();
++ if (err < 0)
++ return err;
++#endif
+
-+ pending_idx = pending_ring[pending_index(pending_cons)];
++ pcistub_init_devices_late();
++ err = pciback_xenbus_register();
++ if (err)
++ pcistub_exit();
+
-+ data_len = (txreq.size > PKT_PROT_LEN &&
-+ ret < MAX_SKB_FRAGS) ?
-+ PKT_PROT_LEN : txreq.size;
++ return err;
++}
+
-+ skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
-+ GFP_ATOMIC | __GFP_NOWARN);
-+ if (unlikely(skb == NULL)) {
-+ DPRINTK("Can't allocate a skb in start_xmit.\n");
-+ netbk_tx_err(netif, &txreq, idx);
-+ break;
-+ }
++static void __exit pciback_cleanup(void)
++{
++ pciback_xenbus_unregister();
++ pcistub_exit();
++}
+
-+ /* Packets passed to netif_rx() must have some headroom. */
-+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
++module_init(pciback_init);
++module_exit(pciback_cleanup);
+
-+ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
-+ struct xen_netif_extra_info *gso;
-+ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
++MODULE_LICENSE("Dual BSD/GPL");
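
[Editor's note, not part of the patch: for orientation, the new_slot/remove_slot
attributes registered above accept a PCI address in the " %x:%x:%x.%x" form
parsed by str_to_slot(). A minimal dom0 userspace sketch, assuming the usual
/sys/bus/pci/drivers/pciback/ sysfs location and a hypothetical device
0000:03:00.0, might look like this:]

/*
 * Hypothetical helper (illustration only): hand a PCI device to pciback
 * through the new_slot driver attribute. The sysfs path and the device
 * address are assumptions, not defined by this patch.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *attr = "/sys/bus/pci/drivers/pciback/new_slot";
	const char *bdf = "0000:03:00.0";	/* matches str_to_slot()'s format */
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror("open new_slot");
		return 1;
	}
	if (write(fd, bdf, strlen(bdf)) != (ssize_t)strlen(bdf))
		perror("write new_slot");
	close(fd);
	return 0;
}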
+diff --git a/drivers/xen/pciback/pciback.h b/drivers/xen/pciback/pciback.h
+new file mode 100644
+index 0000000..98e2912
+--- /dev/null
++++ b/drivers/xen/pciback/pciback.h
+@@ -0,0 +1,133 @@
++/*
++ * PCI Backend Common Data Structures & Function Declarations
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
++#ifndef __XEN_PCIBACK_H__
++#define __XEN_PCIBACK_H__
+
-+ if (netbk_set_skb_gso(skb, gso)) {
-+ kfree_skb(skb);
-+ netbk_tx_err(netif, &txreq, idx);
-+ continue;
-+ }
-+ }
++#include <linux/pci.h>
++#include <linux/interrupt.h>
++#include <xen/xenbus.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/workqueue.h>
++#include <asm/atomic.h>
++#include <xen/interface/io/pciif.h>
+
-+ gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
-+ GNTMAP_host_map | GNTMAP_readonly,
-+ txreq.gref, netif->domid);
-+ mop++;
++struct pci_dev_entry {
++ struct list_head list;
++ struct pci_dev *dev;
++};
+
-+ memcpy(&pending_tx_info[pending_idx].req,
-+ &txreq, sizeof(txreq));
-+ pending_tx_info[pending_idx].netif = netif;
-+ *((u16 *)skb->data) = pending_idx;
++#define _PDEVF_op_active (0)
++#define PDEVF_op_active (1<<(_PDEVF_op_active))
++#define _PCIB_op_pending (1)
++#define PCIB_op_pending (1<<(_PCIB_op_pending))
+
-+ __skb_put(skb, data_len);
++struct pciback_device {
++ void *pci_dev_data;
++ spinlock_t dev_lock;
+
-+ skb_shinfo(skb)->nr_frags = ret;
-+ if (data_len < txreq.size) {
-+ skb_shinfo(skb)->nr_frags++;
-+ skb_shinfo(skb)->frags[0].page =
-+ (void *)(unsigned long)pending_idx;
-+ } else {
-+ /* Discriminate from any valid pending_idx value. */
-+ skb_shinfo(skb)->frags[0].page = (void *)~0UL;
-+ }
++ struct xenbus_device *xdev;
+
-+ __skb_queue_tail(&tx_queue, skb);
++ struct xenbus_watch be_watch;
++ u8 be_watching;
+
-+ pending_cons++;
++ int evtchn_irq;
+
-+ mop = netbk_get_requests(netif, skb, txfrags, mop);
++ struct xen_pci_sharedinfo *sh_info;
+
-+ netif->tx.req_cons = idx;
-+ netif_schedule_work(netif);
++ unsigned long flags;
+
-+ if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
-+ break;
-+ }
++ struct work_struct op_work;
++};
+
-+ return mop - tx_map_ops;
-+}
++struct pciback_dev_data {
++ struct list_head config_fields;
++ int permissive;
++ int warned_on_write;
++};
++
++/* Used by XenBus and pciback_ops.c */
++extern wait_queue_head_t aer_wait_queue;
++extern struct workqueue_struct *pciback_wq;
++/* Used by pcistub.c and conf_space_quirks.c */
++extern struct list_head pciback_quirks;
++
++/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
++ int domain, int bus,
++ int slot, int func);
++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
++ struct pci_dev *dev);
++void pcistub_put_pci_dev(struct pci_dev *dev);
++
++/* Ensure a device is turned off or reset */
++void pciback_reset_device(struct pci_dev *pdev);
++
++/* Access a virtual configuration space for a PCI device */
++int pciback_config_init(void);
++int pciback_config_init_dev(struct pci_dev *dev);
++void pciback_config_free_dyn_fields(struct pci_dev *dev);
++void pciback_config_reset_dev(struct pci_dev *dev);
++void pciback_config_free_dev(struct pci_dev *dev);
++int pciback_config_read(struct pci_dev *dev, int offset, int size,
++ u32 *ret_val);
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
++
++/* Handle requests for specific devices from the frontend */
++typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn, unsigned int devid);
++typedef int (*publish_pci_root_cb) (struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus);
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb);
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn);
+
-+static void net_tx_submit(void)
-+{
-+ struct gnttab_map_grant_ref *mop;
-+ struct sk_buff *skb;
++/**
++* Added for domain0 PCIe AER handling. Get the guest domain/bus/devfn in
++* pciback before sending the AER request to pcifront, so that the guest can
++* identify the device and cooperate with pciback to finish the AER recovery
++* job if the device driver has the capability.
++*/
+
-+ mop = tx_map_ops;
-+ while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
-+ struct xen_netif_tx_request *txp;
-+ struct xen_netif *netif;
-+ u16 pending_idx;
-+ unsigned data_len;
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn);
++int pciback_init_devices(struct pciback_device *pdev);
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb cb);
++void pciback_release_devices(struct pciback_device *pdev);
++
++/* Handles events from front-end */
++irqreturn_t pciback_handle_event(int irq, void *dev_id);
++void pciback_do_op(struct work_struct *data);
+
-+ pending_idx = *((u16 *)skb->data);
-+ netif = pending_tx_info[pending_idx].netif;
-+ txp = &pending_tx_info[pending_idx].req;
++int pciback_xenbus_register(void);
++void pciback_xenbus_unregister(void);
+
-+ /* Check the remap error code. */
-+ if (unlikely(netbk_tx_check_mop(skb, &mop))) {
-+ DPRINTK("netback grant failed.\n");
-+ skb_shinfo(skb)->nr_frags = 0;
-+ kfree_skb(skb);
-+ continue;
-+ }
++#ifdef CONFIG_PCI_MSI
++int pciback_enable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
+
-+ data_len = skb->len;
-+ memcpy(skb->data,
-+ (void *)(idx_to_kaddr(pending_idx)|txp->offset),
-+ data_len);
-+ if (data_len < txp->size) {
-+ /* Append the packet payload as a fragment. */
-+ txp->offset += data_len;
-+ txp->size -= data_len;
-+ } else {
-+ /* Schedule a response immediately. */
-+ netif_idx_release(pending_idx);
-+ }
++int pciback_disable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
+
-+ /*
-+ * Old frontends do not assert data_validated but we
-+ * can infer it from csum_blank so test both flags.
-+ */
-+ if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank))
-+ skb->ip_summed = CHECKSUM_PARTIAL;
-+ else
-+ skb->ip_summed = CHECKSUM_NONE;
+
-+ netbk_fill_frags(skb);
++int pciback_enable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
+
-+ /*
-+ * If the initial fragment was < PKT_PROT_LEN then
-+ * pull through some bytes from the other fragments to
-+ * increase the linear region to PKT_PROT_LEN bytes.
-+ */
-+ if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
-+ int target = min_t(int, skb->len, PKT_PROT_LEN);
-+ __pskb_pull_tail(skb, target - skb_headlen(skb));
-+ }
++int pciback_disable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
++#endif
++extern int verbose_request;
+
-+ skb->dev = netif->dev;
-+ skb->protocol = eth_type_trans(skb, skb->dev);
++void test_and_schedule_op(struct pciback_device *pdev);
++#endif
+
-+ netif->stats.rx_bytes += skb->len;
-+ netif->stats.rx_packets++;
+diff --git a/drivers/xen/pciback/pciback_ops.c b/drivers/xen/pciback/pciback_ops.c
+new file mode 100644
+index 0000000..011db67
+--- /dev/null
++++ b/drivers/xen/pciback/pciback_ops.c
+@@ -0,0 +1,139 @@
++/*
++ * PCI Backend Operations - respond to PCI requests from Frontend
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/wait.h>
++#include <linux/bitops.h>
++#include <xen/events.h>
++#include <linux/sched.h>
++#include "pciback.h"
+
-+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
-+ if (skb_checksum_setup(skb)) {
-+ DPRINTK("Can't setup checksum in net_tx_action\n");
-+ kfree_skb(skb);
-+ continue;
-+ }
-+ }
++int verbose_request;
++module_param(verbose_request, int, 0644);
+
-+ if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
-+ unlikely(skb_linearize(skb))) {
-+ DPRINTK("Can't linearize skb in net_tx_action.\n");
-+ kfree_skb(skb);
-+ continue;
-+ }
++/* Ensure a device is "turned off" and ready to be exported.
++ * (Also see pciback_config_reset to ensure virtual configuration space is
++ * ready to be re-exported)
++ */
++void pciback_reset_device(struct pci_dev *dev)
++{
++ u16 cmd;
+
-+ netif_rx(skb);
-+ netif->dev->last_rx = jiffies;
-+ }
++ /* Disable devices (but not bridges) */
++ if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
++#ifdef CONFIG_PCI_MSI
++ /* The guest could have been abruptly killed without
++ * disabling MSI/MSI-X interrupts. */
++ if (dev->msix_enabled)
++ pci_disable_msix(dev);
++ if (dev->msi_enabled)
++ pci_disable_msi(dev);
++#endif
++ pci_disable_device(dev);
+
-+ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
-+ !list_empty(&pending_inuse_head)) {
-+ struct netbk_tx_pending_inuse *oldest;
++ pci_write_config_word(dev, PCI_COMMAND, 0);
+
-+ oldest = list_entry(pending_inuse_head.next,
-+ struct netbk_tx_pending_inuse, list);
-+ mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ);
++ dev->is_busmaster = 0;
++ } else {
++ pci_read_config_word(dev, PCI_COMMAND, &cmd);
++ if (cmd & (PCI_COMMAND_INVALIDATE)) {
++ cmd &= ~(PCI_COMMAND_INVALIDATE);
++ pci_write_config_word(dev, PCI_COMMAND, cmd);
++
++ dev->is_busmaster = 0;
++ }
+ }
+}
-+
-+/* Called after netfront has transmitted */
-+static void net_tx_action(unsigned long unused)
++/*
++* Now the same evtchn is used for both the pcifront conf_read_write request
++* and the PCIe AER frontend ack. We use a new workqueue to schedule the
++* pciback conf_read_write service to avoid a conflict with the aer_core
++* do_recovery job, which also uses the system default workqueue.
++*/
++void test_and_schedule_op(struct pciback_device *pdev)
+{
-+ unsigned nr_mops;
-+ int ret;
++ /* Check that frontend is requesting an operation and that we are not
++ * already processing a request */
++ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
++ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
++ queue_work(pciback_wq, &pdev->op_work);
++ }
++ /* _XEN_PCIB_active should have been cleared by pcifront. Also make
++ sure pciback is waiting for the ack by checking _PCIB_op_pending. */
++ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
++ && test_bit(_PCIB_op_pending, &pdev->flags)) {
++ wake_up(&aer_wait_queue);
++ }
++}
+
-+ if (dealloc_cons != dealloc_prod)
-+ net_tx_action_dealloc();
++/* Performing the configuration space reads/writes must not be done in atomic
++ * context because some of the pci_* functions can sleep (mostly due to ACPI
++ * use of semaphores). This function is intended to be called from a work
++ * queue in process context taking a struct pciback_device as a parameter */
+
-+ nr_mops = net_tx_build_mops();
++void pciback_do_op(struct work_struct *data)
++{
++ struct pciback_device *pdev =
++ container_of(data, struct pciback_device, op_work);
++ struct pci_dev *dev;
++ struct xen_pci_op *op = &pdev->sh_info->op;
+
-+ if (nr_mops == 0)
-+ return;
++ dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
+
-+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
-+ tx_map_ops, nr_mops);
-+ BUG_ON(ret);
++ if (dev == NULL)
++ op->err = XEN_PCI_ERR_dev_not_found;
++ else {
++ switch (op->cmd) {
++ case XEN_PCI_OP_conf_read:
++ op->err = pciback_config_read(dev,
++ op->offset, op->size, &op->value);
++ break;
++ case XEN_PCI_OP_conf_write:
++ op->err = pciback_config_write(dev,
++ op->offset, op->size, op->value);
++ break;
++#ifdef CONFIG_PCI_MSI
++ case XEN_PCI_OP_enable_msi:
++ op->err = pciback_enable_msi(pdev, dev, op);
++ break;
++ case XEN_PCI_OP_disable_msi:
++ op->err = pciback_disable_msi(pdev, dev, op);
++ break;
++ case XEN_PCI_OP_enable_msix:
++ op->err = pciback_enable_msix(pdev, dev, op);
++ break;
++ case XEN_PCI_OP_disable_msix:
++ op->err = pciback_disable_msix(pdev, dev, op);
++ break;
++#endif
++ default:
++ op->err = XEN_PCI_ERR_not_implemented;
++ break;
++ }
++ }
++ /* Tell the driver domain that we're done. */
++ wmb();
++ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
++ notify_remote_via_irq(pdev->evtchn_irq);
++
++ /* Mark that we're done. */
++ smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
++ clear_bit(_PDEVF_op_active, &pdev->flags);
++ smp_mb__after_clear_bit(); /* /before/ final check for work */
+
-+ net_tx_submit();
++ /* Check to see if the driver domain tried to start another request in
++ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
++ */
++ test_and_schedule_op(pdev);
+}
+
-+static void netif_idx_release(u16 pending_idx)
++irqreturn_t pciback_handle_event(int irq, void *dev_id)
+{
-+ static DEFINE_SPINLOCK(_lock);
-+ unsigned long flags;
++ struct pciback_device *pdev = dev_id;
+
-+ spin_lock_irqsave(&_lock, flags);
-+ dealloc_ring[pending_index(dealloc_prod)] = pending_idx;
-+ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
-+ smp_wmb();
-+ dealloc_prod++;
-+ spin_unlock_irqrestore(&_lock, flags);
++ test_and_schedule_op(pdev);
+
-+ tasklet_schedule(&net_tx_tasklet);
++ return IRQ_HANDLED;
+}
+diff --git a/drivers/xen/pciback/slot.c b/drivers/xen/pciback/slot.c
+new file mode 100644
+index 0000000..efb922d
+--- /dev/null
++++ b/drivers/xen/pciback/slot.c
+@@ -0,0 +1,191 @@
++/*
++ * PCI Backend - Provides a Virtual PCI bus (with real devices)
++ * to the frontend
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil> (vpci.c)
++ * Author: Tristan Gingold <tristan.gingold at bull.net>, from vpci.c
++ */
+
-+static void netif_page_release(struct page *page, unsigned int order)
-+{
-+ int idx = netif_page_index(page);
-+ BUG_ON(order);
-+ BUG_ON(idx < 0);
-+ netif_idx_release(idx);
-+}
++#include <linux/list.h>
++#include <linux/slab.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
+
-+irqreturn_t netif_be_int(int irq, void *dev_id)
++/* There are at most 32 slots in a pci bus. */
++#define PCI_SLOT_MAX 32
++
++#define PCI_BUS_NBR 2
++
++struct slot_dev_data {
++ /* Access to dev_list must be protected by lock */
++ struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
++ spinlock_t lock;
++};
++
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
+{
-+ struct xen_netif *netif = dev_id;
++ struct pci_dev *dev = NULL;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ unsigned long flags;
+
-+ add_to_net_schedule_list_tail(netif);
-+ maybe_schedule_tx_action();
++ if (domain != 0 || PCI_FUNC(devfn) != 0)
++ return NULL;
++
++ if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
++ return NULL;
+
-+ if (netif_schedulable(netif) && !netbk_queue_full(netif))
-+ netif_wake_queue(netif->dev);
++ spin_lock_irqsave(&slot_dev->lock, flags);
++ dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
-+ return IRQ_HANDLED;
++ return dev;
+}
+
-+static void make_tx_response(struct xen_netif *netif,
-+ struct xen_netif_tx_request *txp,
-+ s8 st)
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
+{
-+ RING_IDX i = netif->tx.rsp_prod_pvt;
-+ struct xen_netif_tx_response *resp;
-+ int notify;
-+
-+ resp = RING_GET_RESPONSE(&netif->tx, i);
-+ resp->id = txp->id;
-+ resp->status = st;
++ int err = 0, slot, bus;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ unsigned long flags;
+
-+ if (txp->flags & NETTXF_extra_info)
-+ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
++ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
++ err = -EFAULT;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Can't export bridges on the virtual PCI bus");
++ goto out;
++ }
+
-+ netif->tx.rsp_prod_pvt = ++i;
-+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
++ spin_lock_irqsave(&slot_dev->lock, flags);
+
-+ /*
-+ * netfront_smartpoll_active indicates whether netfront timer
-+ * is active.
-+ */
-+ if ((netif->smart_poll == 1)) {
-+ if (!(netif->rx.sring->netfront_smartpoll_active)) {
-+ notify_remote_via_irq(netif->irq);
-+ netif->rx.sring->netfront_smartpoll_active = 1;
++ /* Assign to a new slot on the virtual PCI bus */
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (slot_dev->slots[bus][slot] == NULL) {
++ printk(KERN_INFO
++ "pciback: slot: %s: assign to virtual "
++ "slot %d, bus %d\n",
++ pci_name(dev), slot, bus);
++ slot_dev->slots[bus][slot] = dev;
++ goto unlock;
++ }
+ }
-+ } else if (notify)
-+ notify_remote_via_irq(netif->irq);
-+}
+
-+static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
-+ u16 id,
-+ s8 st,
-+ u16 offset,
-+ u16 size,
-+ u16 flags)
-+{
-+ RING_IDX i = netif->rx.rsp_prod_pvt;
-+ struct xen_netif_rx_response *resp;
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "No more space on root virtual PCI bus");
+
-+ resp = RING_GET_RESPONSE(&netif->rx, i);
-+ resp->offset = offset;
-+ resp->flags = flags;
-+ resp->id = id;
-+ resp->status = (s16)size;
-+ if (st < 0)
-+ resp->status = (s16)st;
++unlock:
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
-+ netif->rx.rsp_prod_pvt = ++i;
++ /* Publish this device. */
++ if (!err)
++ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, 0), devid);
+
-+ return resp;
++out:
++ return err;
+}
+
-+#ifdef NETBE_DEBUG_INTERRUPT
-+static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
-+ struct list_head *ent;
-+ struct xen_netif *netif;
-+ int i = 0;
++ int slot, bus;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
+
-+ printk(KERN_ALERT "netif_schedule_list:\n");
-+ spin_lock_irq(&net_schedule_list_lock);
++ spin_lock_irqsave(&slot_dev->lock, flags);
+
-+ list_for_each (ent, &net_schedule_list) {
-+ netif = list_entry(ent, struct xen_netif, list);
-+ printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
-+ "rx_resp_prod=%08x\n",
-+ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
-+ printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
-+ netif->tx.req_cons, netif->tx.rsp_prod_pvt);
-+ printk(KERN_ALERT " shared(rx_req_prod=%08x "
-+ "rx_resp_prod=%08x\n",
-+ netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
-+ printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
-+ netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
-+ printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
-+ netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
-+ i++;
-+ }
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (slot_dev->slots[bus][slot] == dev) {
++ slot_dev->slots[bus][slot] = NULL;
++ found_dev = dev;
++ goto out;
++ }
++ }
+
-+ spin_unlock_irq(&net_schedule_list_lock);
-+ printk(KERN_ALERT " ** End of netif_schedule_list **\n");
++out:
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
-+ return IRQ_HANDLED;
++ if (found_dev)
++ pcistub_put_pci_dev(found_dev);
+}
-+#endif
+
-+static int __init netback_init(void)
++int pciback_init_devices(struct pciback_device *pdev)
+{
-+ int i;
-+ struct page *page;
-+ int rc = 0;
++ int slot, bus;
++ struct slot_dev_data *slot_dev;
+
-+ if (!xen_domain())
-+ return -ENODEV;
++ slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
++ if (!slot_dev)
++ return -ENOMEM;
+
-+ /* We can increase reservation by this much in net_rx_action(). */
-+// balloon_update_driver_allowance(NET_RX_RING_SIZE);
++ spin_lock_init(&slot_dev->lock);
+
-+ skb_queue_head_init(&rx_queue);
-+ skb_queue_head_init(&tx_queue);
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
++ slot_dev->slots[bus][slot] = NULL;
+
-+ init_timer(&net_timer);
-+ net_timer.data = 0;
-+ net_timer.function = net_alarm;
++ pdev->pci_dev_data = slot_dev;
+
-+ init_timer(&netbk_tx_pending_timer);
-+ netbk_tx_pending_timer.data = 0;
-+ netbk_tx_pending_timer.function = netbk_tx_pending_timeout;
++ return 0;
++}
+
-+ mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
-+ if (mmap_pages == NULL) {
-+ printk("%s: out of memory\n", __FUNCTION__);
-+ return -ENOMEM;
-+ }
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_cb)
++{
++ /* The Virtual PCI bus has only one root */
++ return publish_cb(pdev, 0, 0);
++}
+
-+ for (i = 0; i < MAX_PENDING_REQS; i++) {
-+ page = mmap_pages[i];
-+ SetPageForeign(page, netif_page_release);
-+ netif_set_page_index(page, i);
-+ INIT_LIST_HEAD(&pending_inuse[i].list);
-+ }
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ int slot, bus;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ struct pci_dev *dev;
+
-+ pending_cons = 0;
-+ pending_prod = MAX_PENDING_REQS;
-+ for (i = 0; i < MAX_PENDING_REQS; i++)
-+ pending_ring[i] = i;
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ dev = slot_dev->slots[bus][slot];
++ if (dev != NULL)
++ pcistub_put_pci_dev(dev);
++ }
+
-+ netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
-+ if (MODPARM_copy_skb) {
-+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
-+ NULL, 0))
-+ netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB;
-+ else
-+ netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB;
-+ }
++ kfree(slot_dev);
++ pdev->pci_dev_data = NULL;
++}
+
-+ //netif_accel_init();
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn)
++{
++ int slot, busnr;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ struct pci_dev *dev;
++ int found = 0;
++ unsigned long flags;
+
-+ rc = netif_xenbus_init();
-+ if (rc)
-+ goto failed_init;
++ spin_lock_irqsave(&slot_dev->lock, flags);
+
-+#ifdef NETBE_DEBUG_INTERRUPT
-+ (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
-+ 0,
-+ netif_be_dbg,
-+ SA_SHIRQ,
-+ "net-be-dbg",
-+ &netif_be_dbg);
-+#endif
++ for (busnr = 0; busnr < PCI_BUS_NBR; busnr++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ dev = slot_dev->slots[busnr][slot];
++ if (dev && dev->bus->number == pcidev->bus->number
++ && dev->devfn == pcidev->devfn
++ && pci_domain_nr(dev->bus) ==
++ pci_domain_nr(pcidev->bus)) {
++ found = 1;
++ *domain = 0;
++ *bus = busnr;
++ *devfn = PCI_DEVFN(slot, 0);
++ goto out;
++ }
++ }
++out:
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
++ return found;
+
-+ return 0;
++}
+diff --git a/drivers/xen/pciback/vpci.c b/drivers/xen/pciback/vpci.c
+new file mode 100644
+index 0000000..2857ab8
+--- /dev/null
++++ b/drivers/xen/pciback/vpci.c
+@@ -0,0 +1,244 @@
++/*
++ * PCI Backend - Provides a Virtual PCI bus (with real devices)
++ * to the frontend
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
+
-+failed_init:
-+ free_empty_pages_and_pagevec(mmap_pages, MAX_PENDING_REQS);
-+ del_timer(&netbk_tx_pending_timer);
-+ del_timer(&net_timer);
-+ return rc;
++#include <linux/list.h>
++#include <linux/slab.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
+
-+}
++#define PCI_SLOT_MAX 32
+
-+module_init(netback_init);
++struct vpci_dev_data {
++ /* Access to dev_list must be protected by lock */
++ struct list_head dev_list[PCI_SLOT_MAX];
++ spinlock_t lock;
++};
+
-+MODULE_LICENSE("Dual BSD/GPL");
-diff --git a/drivers/xen/netback/xenbus.c b/drivers/xen/netback/xenbus.c
-new file mode 100644
-index 0000000..70636d0
---- /dev/null
-+++ b/drivers/xen/netback/xenbus.c
-@@ -0,0 +1,523 @@
-+/* Xenbus code for netif backend
-+ Copyright (C) 2005 Rusty Russell <rusty at rustcorp.com.au>
-+ Copyright (C) 2005 XenSource Ltd
++static inline struct list_head *list_first(struct list_head *head)
++{
++ return head->next;
++}
+
-+ This program is free software; you can redistribute it and/or modify
-+ it under the terms of the GNU General Public License as published by
-+ the Free Software Foundation; either version 2 of the License, or
-+ (at your option) any later version.
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
++{
++ struct pci_dev_entry *entry;
++ struct pci_dev *dev = NULL;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ unsigned long flags;
+
-+ This program is distributed in the hope that it will be useful,
-+ but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ GNU General Public License for more details.
++ if (domain != 0 || bus != 0)
++ return NULL;
+
-+ You should have received a copy of the GNU General Public License
-+ along with this program; if not, write to the Free Software
-+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-+*/
++ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
++ spin_lock_irqsave(&vpci_dev->lock, flags);
+
-+#include <stdarg.h>
-+#include <linux/module.h>
-+#include <xen/xenbus.h>
-+#include "common.h"
++ list_for_each_entry(entry,
++ &vpci_dev->dev_list[PCI_SLOT(devfn)],
++ list) {
++ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
++ dev = entry->dev;
++ break;
++ }
++ }
+
-+#if 0
-+#undef DPRINTK
-+#define DPRINTK(fmt, args...) \
-+ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
-+#endif
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++ }
++ return dev;
++}
+
++static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
++{
++ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
++ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
++ return 1;
+
-+static int connect_rings(struct backend_info *);
-+static void connect(struct backend_info *);
-+static void backend_create_netif(struct backend_info *be);
-+static void unregister_hotplug_status_watch(struct backend_info *be);
++ return 0;
++}
+
-+static int netback_remove(struct xenbus_device *dev)
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
+{
-+ struct backend_info *be = dev_get_drvdata(&dev->dev);
++ int err = 0, slot, func = -1;
++ struct pci_dev_entry *t, *dev_entry;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ unsigned long flags;
+
-+ //netback_remove_accelerators(be, dev);
++ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
++ err = -EFAULT;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Can't export bridges on the virtual PCI bus");
++ goto out;
++ }
+
-+ unregister_hotplug_status_watch(be);
-+ if (be->netif) {
-+ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
-+ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
-+ netif_disconnect(be->netif);
-+ be->netif = NULL;
++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
++ if (!dev_entry) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error adding entry to virtual PCI bus");
++ goto out;
+ }
-+ kfree(be);
-+ dev_set_drvdata(&dev->dev, NULL);
-+ return 0;
-+}
+
++ dev_entry->dev = dev;
+
-+/**
-+ * Entry point to this code when a new device is created. Allocate the basic
-+ * structures and switch to InitWait.
-+ */
-+static int netback_probe(struct xenbus_device *dev,
-+ const struct xenbus_device_id *id)
-+{
-+ const char *message;
-+ struct xenbus_transaction xbt;
-+ int err;
-+ int sg;
-+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
-+ GFP_KERNEL);
-+ if (!be) {
-+ xenbus_dev_fatal(dev, -ENOMEM,
-+ "allocating backend structure");
-+ return -ENOMEM;
++ spin_lock_irqsave(&vpci_dev->lock, flags);
++
++ /* Keep multi-function devices together on the virtual PCI bus */
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (!list_empty(&vpci_dev->dev_list[slot])) {
++ t = list_entry(list_first(&vpci_dev->dev_list[slot]),
++ struct pci_dev_entry, list);
++
++ if (match_slot(dev, t->dev)) {
++ pr_info("pciback: vpci: %s: "
++ "assign to virtual slot %d func %d\n",
++ pci_name(dev), slot,
++ PCI_FUNC(dev->devfn));
++ list_add_tail(&dev_entry->list,
++ &vpci_dev->dev_list[slot]);
++ func = PCI_FUNC(dev->devfn);
++ goto unlock;
++ }
++ }
++ }
++
++ /* Assign to a new slot on the virtual PCI bus */
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (list_empty(&vpci_dev->dev_list[slot])) {
++ printk(KERN_INFO
++ "pciback: vpci: %s: assign to virtual slot %d\n",
++ pci_name(dev), slot);
++ list_add_tail(&dev_entry->list,
++ &vpci_dev->dev_list[slot]);
++ func = PCI_FUNC(dev->devfn);
++ goto unlock;
++ }
+ }
+
-+ be->dev = dev;
-+ dev_set_drvdata(&dev->dev, be);
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "No more space on root virtual PCI bus");
+
-+ sg = 1;
-+ if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
-+ sg = 0;
++unlock:
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
-+ do {
-+ err = xenbus_transaction_start(&xbt);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "starting transaction");
-+ goto fail;
-+ }
++ /* Publish this device. */
++ if (!err)
++ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
+
-+ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
-+ if (err) {
-+ message = "writing feature-sg";
-+ goto abort_transaction;
-+ }
++out:
++ return err;
++}
+
-+ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
-+ "%d", sg);
-+ if (err) {
-+ message = "writing feature-gso-tcpv4";
-+ goto abort_transaction;
-+ }
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++ int slot;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
+
-+ /* We support rx-copy path. */
-+ err = xenbus_printf(xbt, dev->nodename,
-+ "feature-rx-copy", "%d", 1);
-+ if (err) {
-+ message = "writing feature-rx-copy";
-+ goto abort_transaction;
-+ }
++ spin_lock_irqsave(&vpci_dev->lock, flags);
+
-+ /*
-+ * We don't support rx-flip path (except old guests who don't
-+ * grok this feature flag).
-+ */
-+ err = xenbus_printf(xbt, dev->nodename,
-+ "feature-rx-flip", "%d", 0);
-+ if (err) {
-+ message = "writing feature-rx-flip";
-+ goto abort_transaction;
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ struct pci_dev_entry *e, *tmp;
++ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
++ list) {
++ if (e->dev == dev) {
++ list_del(&e->list);
++ found_dev = e->dev;
++ kfree(e);
++ goto out;
++ }
+ }
++ }
+
-+ /* We support data smart poll mechanism */
-+ err = xenbus_printf(xbt, dev->nodename,
-+ "feature-smart-poll", "%d", 1);
-+ if (err) {
-+ message = "writing feature-smart-poll";
-+ goto abort_transaction;
-+ }
++out:
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
-+ err = xenbus_transaction_end(xbt, 0);
-+ } while (err == -EAGAIN);
++ if (found_dev)
++ pcistub_put_pci_dev(found_dev);
++}
+
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "completing transaction");
-+ goto fail;
-+ }
++int pciback_init_devices(struct pciback_device *pdev)
++{
++ int slot;
++ struct vpci_dev_data *vpci_dev;
+
-+ //netback_probe_accelerators(be, dev);
++ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
++ if (!vpci_dev)
++ return -ENOMEM;
+
-+ err = xenbus_switch_state(dev, XenbusStateInitWait);
-+ if (err)
-+ goto fail;
++ spin_lock_init(&vpci_dev->lock);
+
-+ /* This kicks hotplug scripts, so do it immediately. */
-+ backend_create_netif(be);
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
++ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
+
-+ return 0;
++ pdev->pci_dev_data = vpci_dev;
+
-+abort_transaction:
-+ xenbus_transaction_end(xbt, 1);
-+ xenbus_dev_fatal(dev, err, "%s", message);
-+fail:
-+ DPRINTK("failed");
-+ netback_remove(dev);
-+ return err;
++ return 0;
+}
+
-+
-+/**
-+ * Handle the creation of the hotplug script environment. We add the script
-+ * and vif variables to the environment, for the benefit of the vif-* hotplug
-+ * scripts.
-+ */
-+static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env)
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_cb)
+{
-+ struct backend_info *be = dev_get_drvdata(&xdev->dev);
-+ struct xen_netif *netif = be->netif;
-+ char *val;
++ /* The Virtual PCI bus has only one root */
++ return publish_cb(pdev, 0, 0);
++}
+
-+ DPRINTK("netback_uevent");
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ int slot;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
-+ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
-+ if (IS_ERR(val)) {
-+ int err = PTR_ERR(val);
-+ xenbus_dev_fatal(xdev, err, "reading script");
-+ return err;
-+ }
-+ else {
-+ if (add_uevent_var(env, "script=%s", val)) {
-+ kfree(val);
-+ return -ENOMEM;
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ struct pci_dev_entry *e, *tmp;
++ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
++ list) {
++ list_del(&e->list);
++ pcistub_put_pci_dev(e->dev);
++ kfree(e);
+ }
-+ kfree(val);
+ }
+
-+ if (add_uevent_var(env, "vif=%s", netif->dev->name))
-+ return -ENOMEM;
++ kfree(vpci_dev);
++ pdev->pci_dev_data = NULL;
++}
+
-+ return 0;
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn)
++{
++ struct pci_dev_entry *entry;
++ struct pci_dev *dev = NULL;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ unsigned long flags;
++ int found = 0, slot;
++
++ spin_lock_irqsave(&vpci_dev->lock, flags);
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ list_for_each_entry(entry,
++ &vpci_dev->dev_list[slot],
++ list) {
++ dev = entry->dev;
++ if (dev && dev->bus->number == pcidev->bus->number
++ && pci_domain_nr(dev->bus) ==
++ pci_domain_nr(pcidev->bus)
++ && dev->devfn == pcidev->devfn) {
++ found = 1;
++ *domain = 0;
++ *bus = 0;
++ *devfn = PCI_DEVFN(slot,
++ PCI_FUNC(pcidev->devfn));
++ }
++ }
++ }
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++ return found;
+}
+diff --git a/drivers/xen/pciback/xenbus.c b/drivers/xen/pciback/xenbus.c
+new file mode 100644
+index 0000000..d448bf5
+--- /dev/null
++++ b/drivers/xen/pciback/xenbus.c
+@@ -0,0 +1,722 @@
++/*
++ * PCI Backend Xenbus Setup - handles setup with frontend and xend
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/list.h>
++#include <linux/vmalloc.h>
++#include <linux/workqueue.h>
++#include <xen/xenbus.h>
++#include <xen/events.h>
++#include <asm/xen/pci.h>
++#include <linux/workqueue.h>
++#include "pciback.h"
+
++#define INVALID_EVTCHN_IRQ (-1)
++struct workqueue_struct *pciback_wq;
+
-+static void backend_create_netif(struct backend_info *be)
++static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
+{
-+ int err;
-+ long handle;
-+ struct xenbus_device *dev = be->dev;
++ struct pciback_device *pdev;
+
-+ if (be->netif != NULL)
-+ return;
++ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
++ if (pdev == NULL)
++ goto out;
++ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
-+ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
-+ if (err != 1) {
-+ xenbus_dev_fatal(dev, err, "reading handle");
-+ return;
-+ }
++ pdev->xdev = xdev;
++ dev_set_drvdata(&xdev->dev, pdev);
+
-+ be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
-+ if (IS_ERR(be->netif)) {
-+ err = PTR_ERR(be->netif);
-+ be->netif = NULL;
-+ xenbus_dev_fatal(dev, err, "creating interface");
-+ return;
-+ }
++ spin_lock_init(&pdev->dev_lock);
+
-+ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
-+}
++ pdev->sh_info = NULL;
++ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
++ pdev->be_watching = 0;
+
++ INIT_WORK(&pdev->op_work, pciback_do_op);
+
-+static void disconnect_backend(struct xenbus_device *dev)
++ if (pciback_init_devices(pdev)) {
++ kfree(pdev);
++ pdev = NULL;
++ }
++out:
++ return pdev;
++}
++
++static void pciback_disconnect(struct pciback_device *pdev)
+{
-+ struct backend_info *be = dev_get_drvdata(&dev->dev);
++ spin_lock(&pdev->dev_lock);
+
-+ if (be->netif) {
-+ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
-+ netif_disconnect(be->netif);
-+ be->netif = NULL;
++ /* Ensure the guest can't trigger our handler before removing devices */
++ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
++ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
++ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
++ }
++
++ /* If the driver domain started an op, make sure we complete it
++ * before releasing the shared memory */
++ flush_workqueue(pciback_wq);
++
++ if (pdev->sh_info != NULL) {
++ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
++ pdev->sh_info = NULL;
+ }
++
++ spin_unlock(&pdev->dev_lock);
+}
+
-+/**
-+ * Callback received when the frontend's state changes.
-+ */
-+static void frontend_changed(struct xenbus_device *dev,
-+ enum xenbus_state frontend_state)
++static void free_pdev(struct pciback_device *pdev)
+{
-+ struct backend_info *be = dev_get_drvdata(&dev->dev);
++ if (pdev->be_watching)
++ unregister_xenbus_watch(&pdev->be_watch);
+
-+ DPRINTK("%s", xenbus_strstate(frontend_state));
++ pciback_disconnect(pdev);
+
-+ be->frontend_state = frontend_state;
++ pciback_release_devices(pdev);
+
-+ switch (frontend_state) {
-+ case XenbusStateInitialising:
-+ if (dev->state == XenbusStateClosed) {
-+ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
-+ __FUNCTION__, dev->nodename);
-+ xenbus_switch_state(dev, XenbusStateInitWait);
-+ }
-+ break;
++ dev_set_drvdata(&pdev->xdev->dev, NULL);
++ pdev->xdev = NULL;
+
-+ case XenbusStateInitialised:
-+ break;
++ kfree(pdev);
++}
+
-+ case XenbusStateConnected:
-+ if (dev->state == XenbusStateConnected)
-+ break;
-+ backend_create_netif(be);
-+ if (be->netif)
-+ connect(be);
-+ break;
++static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
++ int remote_evtchn)
++{
++ int err = 0;
++ void *vaddr;
+
-+ case XenbusStateClosing:
-+ if (be->netif)
-+ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
-+ disconnect_backend(dev);
-+ xenbus_switch_state(dev, XenbusStateClosing);
-+ break;
++ dev_dbg(&pdev->xdev->dev,
++ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
++ gnt_ref, remote_evtchn);
+
-+ case XenbusStateClosed:
-+ xenbus_switch_state(dev, XenbusStateClosed);
-+ if (xenbus_dev_is_online(dev))
-+ break;
-+ /* fall through if not online */
-+ case XenbusStateUnknown:
-+ device_unregister(&dev->dev);
-+ break;
++ err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error mapping the other domain's page into ours.");
++ goto out;
++ }
++ pdev->sh_info = vaddr;
+
-+ default:
-+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
-+ frontend_state);
-+ break;
++ err = bind_interdomain_evtchn_to_irqhandler(
++ pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
++ 0, "pciback", pdev);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error binding event channel to IRQ");
++ goto out;
+ }
-+}
++ pdev->evtchn_irq = err;
++ err = 0;
+
++ dev_dbg(&pdev->xdev->dev, "Attached!\n");
++out:
++ return err;
++}
+
-+static void xen_net_read_rate(struct xenbus_device *dev,
-+ unsigned long *bytes, unsigned long *usec)
++static int pciback_attach(struct pciback_device *pdev)
+{
-+ char *s, *e;
-+ unsigned long b, u;
-+ char *ratestr;
++ int err = 0;
++ int gnt_ref, remote_evtchn;
++ char *magic = NULL;
+
-+ /* Default to unlimited bandwidth. */
-+ *bytes = ~0UL;
-+ *usec = 0;
++ spin_lock(&pdev->dev_lock);
+
-+ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
-+ if (IS_ERR(ratestr))
-+ return;
++ /* Make sure we only do this setup once */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateInitialised)
++ goto out;
+
-+ s = ratestr;
-+ b = simple_strtoul(s, &e, 10);
-+ if ((s == e) || (*e != ','))
-+ goto fail;
++ /* Wait for frontend to state that it has published the configuration */
++ if (xenbus_read_driver_state(pdev->xdev->otherend) !=
++ XenbusStateInitialised)
++ goto out;
+
-+ s = e + 1;
-+ u = simple_strtoul(s, &e, 10);
-+ if ((s == e) || (*e != '\0'))
-+ goto fail;
++ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
+
-+ *bytes = b;
-+ *usec = u;
++ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
++ "pci-op-ref", "%u", &gnt_ref,
++ "event-channel", "%u", &remote_evtchn,
++ "magic", NULL, &magic, NULL);
++ if (err) {
++ /* If configuration didn't get read correctly, wait longer */
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading configuration from frontend");
++ goto out;
++ }
+
-+ kfree(ratestr);
-+ return;
++ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
++ xenbus_dev_fatal(pdev->xdev, -EFAULT,
++ "version mismatch (%s/%s) with pcifront - "
++ "halting pciback",
++ magic, XEN_PCI_MAGIC);
++ goto out;
++ }
+
-+ fail:
-+ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
-+ kfree(ratestr);
-+}
++ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
++ if (err)
++ goto out;
+
-+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
-+{
-+ char *s, *e, *macstr;
-+ int i;
++ dev_dbg(&pdev->xdev->dev, "Connecting...\n");
+
-+ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
-+ if (IS_ERR(macstr))
-+ return PTR_ERR(macstr);
++ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
++ if (err)
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching to connected state!");
+
-+ for (i = 0; i < ETH_ALEN; i++) {
-+ mac[i] = simple_strtoul(s, &e, 16);
-+ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
-+ kfree(macstr);
-+ return -ENOENT;
-+ }
-+ s = e+1;
-+ }
++ dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
++out:
++ spin_unlock(&pdev->dev_lock);
+
-+ kfree(macstr);
-+ return 0;
++ kfree(magic);
++
++ return err;
+}
+
-+static void unregister_hotplug_status_watch(struct backend_info *be)
++static int pciback_publish_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn, unsigned int devid)
+{
-+ if (be->have_hotplug_status_watch) {
-+ unregister_xenbus_watch(&be->hotplug_status_watch);
-+ kfree(be->hotplug_status_watch.node);
++ int err;
++ int len;
++ char str[64];
++
++ len = snprintf(str, sizeof(str), "vdev-%d", devid);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
+ }
-+ be->have_hotplug_status_watch = 0;
++
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%04x:%02x:%02x.%02x", domain, bus,
++ PCI_SLOT(devfn), PCI_FUNC(devfn));
++
++out:
++ return err;
+}
+
-+static void hotplug_status_changed(struct xenbus_watch *watch,
-+ const char **vec,
-+ unsigned int vec_size)
++static int pciback_export_device(struct pciback_device *pdev,
++ int domain, int bus, int slot, int func,
++ int devid)
+{
-+ struct backend_info *be = container_of(watch,
-+ struct backend_info,
-+ hotplug_status_watch);
-+ char *str;
-+ unsigned int len;
++ struct pci_dev *dev;
++ int err = 0;
+
-+ str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
-+ if (IS_ERR(str))
-+ return;
-+ if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
-+ xenbus_switch_state(be->dev, XenbusStateConnected);
-+ /* Not interested in this watch anymore. */
-+ unregister_hotplug_status_watch(be);
++ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
++ domain, bus, slot, func);
++
++ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
++ if (!dev) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Couldn't locate PCI device "
++ "(%04x:%02x:%02x.%01x)! "
++ "perhaps already in-use?",
++ domain, bus, slot, func);
++ goto out;
+ }
-+ kfree(str);
++
++ err = pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev);
++ if (err)
++ goto out;
++
++ dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
++ if (xen_register_device_domain_owner(dev,
++ pdev->xdev->otherend_id) != 0) {
++ dev_err(&dev->dev, "device has been assigned to another " \
++ "domain! Overwriting the ownership, but beware.\n");
++ xen_unregister_device_domain_owner(dev);
++ xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
++ }
++
++ /* TODO: It'd be nice to export a bridge and have all of its children
++ * get exported with it. This may be best done in xend (which will
++ * have to calculate resource usage anyway) but we probably want to
++ * put something in here to ensure that if a bridge gets given to a
++ * driver domain, that all devices under that bridge are not given
++ * to other driver domains (as he who controls the bridge can disable
++ * it and stop the other devices from working).
++ */
++out:
++ return err;
+}
+
-+static void connect(struct backend_info *be)
++static int pciback_remove_device(struct pciback_device *pdev,
++ int domain, int bus, int slot, int func)
+{
-+ int err;
-+ struct xenbus_device *dev = be->dev;
++ int err = 0;
++ struct pci_dev *dev;
+
-+ err = connect_rings(be);
-+ if (err)
-+ return;
++ dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
++ domain, bus, slot, func);
+
-+ err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
-+ return;
++ dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
++ if (!dev) {
++ err = -EINVAL;
++ dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
++ "(%04x:%02x:%02x.%01x)! not owned by this domain\n",
++ domain, bus, slot, func);
++ goto out;
+ }
+
-+ xen_net_read_rate(dev, &be->netif->credit_bytes,
-+ &be->netif->credit_usec);
-+ be->netif->remaining_credit = be->netif->credit_bytes;
++ dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);
++ xen_unregister_device_domain_owner(dev);
+
-+ unregister_hotplug_status_watch(be);
-+ err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
-+ hotplug_status_changed,
-+ "%s/%s", dev->nodename, "hotplug-status");
-+ if (err) {
-+ /* Switch now, since we can't do a watch. */
-+ xenbus_switch_state(dev, XenbusStateConnected);
-+ } else {
-+ be->have_hotplug_status_watch = 1;
-+ }
++ pciback_release_pci_dev(pdev, dev);
+
-+ netif_wake_queue(be->netif->dev);
++out:
++ return err;
+}
+
-+
-+static int connect_rings(struct backend_info *be)
++static int pciback_publish_pci_root(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus)
+{
-+ struct xenbus_device *dev = be->dev;
-+ unsigned long tx_ring_ref, rx_ring_ref;
-+ unsigned int evtchn, rx_copy;
-+ int err;
-+ int val;
++ unsigned int d, b;
++ int i, root_num, len, err;
++ char str[64];
++
++ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ "root_num", "%d", &root_num);
++ if (err == 0 || err == -ENOENT)
++ root_num = 0;
++ else if (err < 0)
++ goto out;
+
-+ DPRINTK("");
++ /* Verify that we haven't already published this pci root */
++ for (i = 0; i < root_num; i++) {
++ len = snprintf(str, sizeof(str), "root-%d", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
+
-+ err = xenbus_gather(XBT_NIL, dev->otherend,
-+ "tx-ring-ref", "%lu", &tx_ring_ref,
-+ "rx-ring-ref", "%lu", &rx_ring_ref,
-+ "event-channel", "%u", &evtchn, NULL);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err,
-+ "reading %s/ring-ref and event-channel",
-+ dev->otherend);
-+ return err;
-+ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ str, "%x:%x", &d, &b);
++ if (err < 0)
++ goto out;
++ if (err != 2) {
++ err = -EINVAL;
++ goto out;
++ }
+
-+ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
-+ &rx_copy);
-+ if (err == -ENOENT) {
-+ err = 0;
-+ rx_copy = 0;
-+ }
-+ if (err < 0) {
-+ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
-+ dev->otherend);
-+ return err;
++ if (d == domain && b == bus) {
++ err = 0;
++ goto out;
++ }
+ }
-+ if (!rx_copy)
-+ return -EOPNOTSUPP;
+
-+ if (be->netif->dev->tx_queue_len != 0) {
-+ if (xenbus_scanf(XBT_NIL, dev->otherend,
-+ "feature-rx-notify", "%d", &val) < 0)
-+ val = 0;
-+ if (val)
-+ be->netif->can_queue = 1;
-+ else
-+ /* Must be non-zero for pfifo_fast to work. */
-+ be->netif->dev->tx_queue_len = 1;
++ len = snprintf(str, sizeof(str), "root-%d", root_num);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
+ }
+
-+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
-+ val = 0;
-+ if (!val) {
-+ be->netif->features &= ~NETIF_F_SG;
-+ be->netif->dev->features &= ~NETIF_F_SG;
-+ if (be->netif->dev->mtu > ETH_DATA_LEN)
-+ be->netif->dev->mtu = ETH_DATA_LEN;
-+ }
++ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
++ root_num, domain, bus);
+
-+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
-+ &val) < 0)
-+ val = 0;
-+ if (val) {
-+ be->netif->features |= NETIF_F_TSO;
-+ be->netif->dev->features |= NETIF_F_TSO;
-+ }
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%04x:%02x", domain, bus);
++ if (err)
++ goto out;
+
-+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
-+ "%d", &val) < 0)
-+ val = 0;
-+ if (val) {
-+ be->netif->features &= ~NETIF_F_IP_CSUM;
-+ be->netif->dev->features &= ~NETIF_F_IP_CSUM;
-+ }
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
++ "root_num", "%d", (root_num + 1));
+
-+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
-+ "%d", &val) < 0)
-+ val = 0;
-+ if (val)
-+ be->netif->smart_poll = 1;
-+ else
-+ be->netif->smart_poll = 0;
++out:
++ return err;
++}
+
-+ /* Map the shared frame, irq etc. */
-+ err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
-+ if (err) {
-+ xenbus_dev_fatal(dev, err,
-+ "mapping shared-frames %lu/%lu port %u",
-+ tx_ring_ref, rx_ring_ref, evtchn);
-+ return err;
++static int pciback_reconfigure(struct pciback_device *pdev)
++{
++ int err = 0;
++ int num_devs;
++ int domain, bus, slot, func;
++ int substate;
++ int i, len;
++ char state_str[64];
++ char dev_str[64];
++
++ spin_lock(&pdev->dev_lock);
++
++ dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
++
++ /* Make sure we only reconfigure once */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateReconfiguring)
++ goto out;
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
++ &num_devs);
++ if (err != 1) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of devices");
++ goto out;
+ }
-+ return 0;
-+}
+
++ for (i = 0; i < num_devs; i++) {
++ len = snprintf(state_str, sizeof(state_str), "state-%d", i);
++ if (unlikely(len >= (sizeof(state_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while reading "
++ "configuration");
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
++ "%d", &substate);
++ if (err != 1)
++ substate = XenbusStateUnknown;
++
++ switch (substate) {
++ case XenbusStateInitialising:
++ dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
++
++ len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
++ if (unlikely(len >= (sizeof(dev_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while "
++ "reading configuration");
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ dev_str, "%x:%x:%x.%x",
++ &domain, &bus, &slot, &func);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading device "
++ "configuration");
++ goto out;
++ }
++ if (err != 4) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error parsing pci device "
++ "configuration");
++ goto out;
++ }
++
++ err = pciback_export_device(pdev, domain, bus, slot,
++ func, i);
++ if (err)
++ goto out;
++
++ /* Publish pci roots. */
++ err = pciback_publish_pci_roots(pdev,
++ pciback_publish_pci_root);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error while publishing PCI root "
++ "buses for frontend");
++ goto out;
++ }
+
-+/* ** Driver Registration ** */
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
++ state_str, "%d",
++ XenbusStateInitialised);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching substate of "
++ "dev-%d\n", i);
++ goto out;
++ }
++ break;
+
++ case XenbusStateClosing:
++ dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
+
-+static const struct xenbus_device_id netback_ids[] = {
-+ { "vif" },
-+ { "" }
-+};
++ len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
++ if (unlikely(len >= (sizeof(dev_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while "
++ "reading configuration");
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ dev_str, "%x:%x:%x.%x",
++ &domain, &bus, &slot, &func);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading device "
++ "configuration");
++ goto out;
++ }
++ if (err != 4) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error parsing pci device "
++ "configuration");
++ goto out;
++ }
+
++ err = pciback_remove_device(pdev, domain, bus, slot,
++ func);
++ if (err)
++ goto out;
++
++ /* TODO: If at some point we implement support for pci
++ * root hot-remove on pcifront side, we'll need to
++ * remove unnecessary xenstore nodes of pci roots here.
++ */
+
-+static struct xenbus_driver netback = {
-+ .name = "vif",
-+ .owner = THIS_MODULE,
-+ .ids = netback_ids,
-+ .probe = netback_probe,
-+ .remove = netback_remove,
-+ .uevent = netback_uevent,
-+ .otherend_changed = frontend_changed,
-+};
++ break;
+
++ default:
++ break;
++ }
++ }
+
-+int netif_xenbus_init(void)
-+{
-+ printk(KERN_CRIT "registering netback\n");
-+ return xenbus_register_backend(&netback);
++ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching to reconfigured state!");
++ goto out;
++ }
++
++out:
++ spin_unlock(&pdev->dev_lock);
++
++ return 0;
+}
-diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
-new file mode 100644
-index 0000000..ae693e7
---- /dev/null
-+++ b/drivers/xen/pci.c
-@@ -0,0 +1,124 @@
-+/*
-+ * Copyright (c) 2009, Intel Corporation.
-+ *
-+ * This program is free software; you can redistribute it and/or modify it
-+ * under the terms and conditions of the GNU General Public License,
-+ * version 2, as published by the Free Software Foundation.
-+ *
-+ * This program is distributed in the hope it will be useful, but WITHOUT
-+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
-+ * more details.
-+ *
-+ * You should have received a copy of the GNU General Public License along with
-+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-+ * Place - Suite 330, Boston, MA 02111-1307 USA.
-+ *
-+ * Author: Weidong Han <weidong.han at intel.com>
-+ */
+
-+#include <linux/pci.h>
++static void pciback_frontend_changed(struct xenbus_device *xdev,
++ enum xenbus_state fe_state)
++{
++ struct pciback_device *pdev = dev_get_drvdata(&xdev->dev);
+
-+#include <xen/interface/xen.h>
-+#include <xen/interface/physdev.h>
++ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
+
-+#include <asm/xen/hypervisor.h>
-+#include <asm/xen/hypercall.h>
++ switch (fe_state) {
++ case XenbusStateInitialised:
++ pciback_attach(pdev);
++ break;
+
-+#include "../pci/pci.h"
++ case XenbusStateReconfiguring:
++ pciback_reconfigure(pdev);
++ break;
+
++ case XenbusStateConnected:
++ /* pcifront switched its state from reconfiguring to connected.
++ * Mirror that by switching the backend to connected as well.
++ */
++ xenbus_switch_state(xdev, XenbusStateConnected);
++ break;
+
-+#ifdef CONFIG_PCI_IOV
-+#define HANDLE_PCI_IOV 1
-+#else
-+#define HANDLE_PCI_IOV 0
-+#endif
++ case XenbusStateClosing:
++ pciback_disconnect(pdev);
++ xenbus_switch_state(xdev, XenbusStateClosing);
++ break;
+
-+static int xen_add_device(struct device *dev)
++ case XenbusStateClosed:
++ pciback_disconnect(pdev);
++ xenbus_switch_state(xdev, XenbusStateClosed);
++ if (xenbus_dev_is_online(xdev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
++ device_unregister(&xdev->dev);
++ break;
++
++ default:
++ break;
++ }
++}
++
++static int pciback_setup_backend(struct pciback_device *pdev)
+{
-+ int r;
-+ struct pci_dev *pci_dev = to_pci_dev(dev);
++ /* Get configuration from xend (if available now) */
++ int domain, bus, slot, func;
++ int err = 0;
++ int i, num_devs;
++ char dev_str[64];
++ char state_str[64];
+
-+ if (HANDLE_PCI_IOV && pci_dev->is_virtfn) {
-+ struct physdev_manage_pci_ext manage_pci_ext = {
-+ .bus = pci_dev->bus->number,
-+ .devfn = pci_dev->devfn,
-+ .is_virtfn = 1,
-+#ifdef CONFIG_PCI_IOV
-+ .physfn.bus = pci_dev->physfn->bus->number,
-+ .physfn.devfn = pci_dev->physfn->devfn,
-+#endif
-+ };
++ spin_lock(&pdev->dev_lock);
+
-+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
-+ &manage_pci_ext);
-+ } else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
-+ struct physdev_manage_pci_ext manage_pci_ext = {
-+ .bus = pci_dev->bus->number,
-+ .devfn = pci_dev->devfn,
-+ .is_extfn = 1,
-+ };
++ /* It's possible we could get the call to setup twice, so make sure
++ * we're not already connected.
++ */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateInitWait)
++ goto out;
+
-+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
-+ &manage_pci_ext);
-+ } else {
-+ struct physdev_manage_pci manage_pci = {
-+ .bus = pci_dev->bus->number,
-+ .devfn = pci_dev->devfn,
-+ };
++ dev_dbg(&pdev->xdev->dev, "getting be setup\n");
+
-+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
-+ &manage_pci);
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
++ &num_devs);
++ if (err != 1) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of devices");
++ goto out;
+ }
+
-+ return r;
-+}
++ for (i = 0; i < num_devs; i++) {
++ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
++ if (unlikely(l >= (sizeof(dev_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while reading "
++ "configuration");
++ goto out;
++ }
+
-+static int xen_remove_device(struct device *dev)
-+{
-+ int r;
-+ struct pci_dev *pci_dev = to_pci_dev(dev);
-+ struct physdev_manage_pci manage_pci;
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
++ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading device configuration");
++ goto out;
++ }
++ if (err != 4) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error parsing pci device "
++ "configuration");
++ goto out;
++ }
+
-+ manage_pci.bus = pci_dev->bus->number;
-+ manage_pci.devfn = pci_dev->devfn;
++ err = pciback_export_device(pdev, domain, bus, slot, func, i);
++ if (err)
++ goto out;
+
-+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
-+ &manage_pci);
++ /* Switch substate of this device. */
++ l = snprintf(state_str, sizeof(state_str), "state-%d", i);
++ if (unlikely(l >= (sizeof(state_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while reading "
++ "configuration");
++ goto out;
++ }
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
++ "%d", XenbusStateInitialised);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err, "Error switching "
++ "substate of dev-%d\n", i);
++ goto out;
++ }
++ }
+
-+ return r;
++ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error while publishing PCI root buses "
++ "for frontend");
++ goto out;
++ }
++
++ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
++ if (err)
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching to initialised state!");
++
++out:
++ spin_unlock(&pdev->dev_lock);
++
++ if (!err)
++ /* see if pcifront is already configured (if not, we'll wait) */
++ pciback_attach(pdev);
++
++ return err;
+}
+
-+static int xen_pci_notifier(struct notifier_block *nb,
-+ unsigned long action, void *data)
++static void pciback_be_watch(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
+{
-+ struct device *dev = data;
-+ int r = 0;
++ struct pciback_device *pdev =
++ container_of(watch, struct pciback_device, be_watch);
+
-+ switch (action) {
-+ case BUS_NOTIFY_ADD_DEVICE:
-+ r = xen_add_device(dev);
-+ break;
-+ case BUS_NOTIFY_DEL_DEVICE:
-+ r = xen_remove_device(dev);
++ switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
++ case XenbusStateInitWait:
++ pciback_setup_backend(pdev);
+ break;
++
+ default:
+ break;
+ }
++}
+
-+ return r;
++static int pciback_xenbus_probe(struct xenbus_device *dev,
++ const struct xenbus_device_id *id)
++{
++ int err = 0;
++ struct pciback_device *pdev = alloc_pdev(dev);
++
++ if (pdev == NULL) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(dev, err,
++ "Error allocating pciback_device struct");
++ goto out;
++ }
++
++ /* wait for xend to configure us */
++ err = xenbus_switch_state(dev, XenbusStateInitWait);
++ if (err)
++ goto out;
++
++ /* watch the backend node for backend configuration information */
++ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
++ pciback_be_watch);
++ if (err)
++ goto out;
++ pdev->be_watching = 1;
++
++ /* We need to force a call to our callback here in case
++ * xend already configured us!
++ */
++ pciback_be_watch(&pdev->be_watch, NULL, 0);
++
++out:
++ return err;
+}
+
-+struct notifier_block device_nb = {
-+ .notifier_call = xen_pci_notifier,
++static int pciback_xenbus_remove(struct xenbus_device *dev)
++{
++ struct pciback_device *pdev = dev_get_drvdata(&dev->dev);
++
++ if (pdev != NULL)
++ free_pdev(pdev);
++
++ return 0;
++}
++
++static const struct xenbus_device_id xenpci_ids[] = {
++ {"pci"},
++ {""},
+};
+
-+static int __init register_xen_pci_notifier(void)
-+{
-+ if (!xen_pv_domain())
-+ return 0;
++static struct xenbus_driver xenbus_pciback_driver = {
++ .name = "pciback",
++ .owner = THIS_MODULE,
++ .ids = xenpci_ids,
++ .probe = pciback_xenbus_probe,
++ .remove = pciback_xenbus_remove,
++ .otherend_changed = pciback_frontend_changed,
++};
+
-+ return bus_register_notifier(&pci_bus_type, &device_nb);
++int __init pciback_xenbus_register(void)
++{
++ pciback_wq = create_workqueue("pciback_workqueue");
++ if (!pciback_wq) {
++ printk(KERN_ERR "pciback_xenbus_register: creating "
++ "pciback_workqueue failed\n");
++ return -EFAULT;
++ }
++ return xenbus_register_backend(&xenbus_pciback_driver);
+}
+
-+arch_initcall(register_xen_pci_notifier);
++void __exit pciback_xenbus_unregister(void)
++{
++ destroy_workqueue(pciback_wq);
++ xenbus_unregister_driver(&xenbus_pciback_driver);
++}
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 88a60e0..ae5cb05 100644
--- a/drivers/xen/sys-hypervisor.c
@@ -15272,6 +22184,19 @@
+xenbus-objs += $(xenbus-be-objs-y)
+
+obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
+diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
+index 92a1ef8..da3ca9e 100644
+--- a/drivers/xen/xenbus/xenbus_client.c
++++ b/drivers/xen/xenbus/xenbus_client.c
+@@ -49,6 +49,8 @@ const char *xenbus_strstate(enum xenbus_state state)
+ [ XenbusStateConnected ] = "Connected",
+ [ XenbusStateClosing ] = "Closing",
+ [ XenbusStateClosed ] = "Closed",
++ [ XenbusStateReconfiguring ] = "Reconfiguring",
++ [ XenbusStateReconfigured ] = "Reconfigured",
+ };
+ return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
+ }
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
index 090c61e..700dc77 100644
--- a/drivers/xen/xenbus/xenbus_comms.c
@@ -17392,6 +24317,50 @@
+#endif /* CONFIG_DMAR */
+
#endif /* __DMAR_H__ */
+diff --git a/include/linux/fb.h b/include/linux/fb.h
+index de9c722..369767b 100644
+--- a/include/linux/fb.h
++++ b/include/linux/fb.h
+@@ -763,6 +763,7 @@ struct fb_tile_ops {
+ * takes over; acceleration engine should be in a quiescent state */
+
+ /* hints */
++#define FBINFO_VIRTFB 0x0004 /* FB is System RAM, not device. */
+ #define FBINFO_PARTIAL_PAN_OK 0x0040 /* otw use pan only for double-buffering */
+ #define FBINFO_READS_FAST 0x0080 /* soft-copy faster than rendering */
+
+diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
+index 9bace4b..040b679 100644
+--- a/include/linux/hrtimer.h
++++ b/include/linux/hrtimer.h
+@@ -162,10 +162,11 @@ struct hrtimer_clock_base {
+ * @expires_next: absolute time of the next event which was scheduled
+ * via clock_set_next_event()
+ * @hres_active: State of high resolution mode
+- * @check_clocks: Indictator, when set evaluate time source and clock
+- * event devices whether high resolution mode can be
+- * activated.
+- * @nr_events: Total number of timer interrupt events
++ * @hang_detected: The last hrtimer interrupt detected a hang
++ * @nr_events: Total number of hrtimer interrupt events
++ * @nr_retries: Total number of hrtimer interrupt retries
++ * @nr_hangs: Total number of hrtimer interrupt hangs
++ * @max_hang_time: Maximum time spent in hrtimer_interrupt
+ */
+ struct hrtimer_cpu_base {
+ spinlock_t lock;
+@@ -173,7 +174,11 @@ struct hrtimer_cpu_base {
+ #ifdef CONFIG_HIGH_RES_TIMERS
+ ktime_t expires_next;
+ int hres_active;
++ int hang_detected;
+ unsigned long nr_events;
++ unsigned long nr_retries;
++ unsigned long nr_hangs;
++ ktime_t max_hang_time;
+ #endif
+ };
+
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 24c3956..3d74515 100644
--- a/include/linux/mm.h
@@ -17752,7 +24721,7 @@
+
+#endif /* __XEN_BLKIF_H__ */
diff --git a/include/xen/events.h b/include/xen/events.h
-index e68d59a..4a934a7 100644
+index e68d59a..c9034af 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -12,6 +12,8 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
@@ -17777,7 +24746,14 @@
/*
* Common unbind function for all event sources. Takes IRQ to unbind from.
-@@ -56,4 +64,23 @@ void xen_poll_irq(int irq);
+@@ -53,7 +61,33 @@ bool xen_test_irq_pending(int irq);
+ irq will be disabled so it won't deliver an interrupt. */
+ void xen_poll_irq(int irq);
+
++/* Poll waiting for an irq to become pending with a timeout. In the usual case, the
++ irq will be disabled so it won't deliver an interrupt. */
++void xen_poll_irq_timeout(int irq, u64 timeout);
++
/* Determine the IRQ which is bound to an event channel */
unsigned irq_from_evtchn(unsigned int evtchn);
@@ -17786,6 +24762,9 @@
+ usual. */
+int xen_allocate_pirq(unsigned gsi, int shareable, char *name);
+
++/* De-allocates the above mentioned physical interrupt. */
++int xen_destroy_irq(int irq);
++
+/* Return vector allocated to pirq */
+int xen_vector_from_irq(unsigned pirq);
+
@@ -18037,6 +25016,136 @@
* Bitfield values for update_pin_status.flags.
*/
/* Map the grant entry for access by I/O devices. */
+diff --git a/include/xen/interface/io/pciif.h b/include/xen/interface/io/pciif.h
+new file mode 100644
+index 0000000..c4177f3
+--- /dev/null
++++ b/include/xen/interface/io/pciif.h
+@@ -0,0 +1,124 @@
++/*
++ * PCI Backend/Frontend Common Data Structures & Macros
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Author: Ryan Wilson <hap9 at epoch.ncsc.mil>
++ */
++#ifndef __XEN_PCI_COMMON_H__
++#define __XEN_PCI_COMMON_H__
++
++/* Be sure to bump this number if you change this file */
++#define XEN_PCI_MAGIC "7"
++
++/* xen_pci_sharedinfo flags */
++#define _XEN_PCIF_active (0)
++#define XEN_PCIF_active (1<<_XEN_PCIF_active)
++#define _XEN_PCIB_AERHANDLER (1)
++#define XEN_PCIB_AERHANDLER (1<<_XEN_PCIB_AERHANDLER)
++#define _XEN_PCIB_active (2)
++#define XEN_PCIB_active (1<<_XEN_PCIB_active)
++
++/* xen_pci_op commands */
++#define XEN_PCI_OP_conf_read (0)
++#define XEN_PCI_OP_conf_write (1)
++#define XEN_PCI_OP_enable_msi (2)
++#define XEN_PCI_OP_disable_msi (3)
++#define XEN_PCI_OP_enable_msix (4)
++#define XEN_PCI_OP_disable_msix (5)
++#define XEN_PCI_OP_aer_detected (6)
++#define XEN_PCI_OP_aer_resume (7)
++#define XEN_PCI_OP_aer_mmio (8)
++#define XEN_PCI_OP_aer_slotreset (9)
++
++/* xen_pci_op error numbers */
++#define XEN_PCI_ERR_success (0)
++#define XEN_PCI_ERR_dev_not_found (-1)
++#define XEN_PCI_ERR_invalid_offset (-2)
++#define XEN_PCI_ERR_access_denied (-3)
++#define XEN_PCI_ERR_not_implemented (-4)
++/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */
++#define XEN_PCI_ERR_op_failed (-5)
++
++/*
++ * it should be (PAGE_SIZE - sizeof(struct xen_pci_op)) / sizeof(struct msix_entry)
++ * Should not exceed 128
++ */
++#define SH_INFO_MAX_VEC 128
++
++struct xen_msix_entry {
++ uint16_t vector;
++ uint16_t entry;
++};
++struct xen_pci_op {
++ /* IN: what action to perform: XEN_PCI_OP_* */
++ uint32_t cmd;
++
++ /* OUT: will contain an error number (if any) from errno.h */
++ int32_t err;
++
++ /* IN: which device to touch */
++ uint32_t domain; /* PCI Domain/Segment */
++ uint32_t bus;
++ uint32_t devfn;
++
++ /* IN: which configuration registers to touch */
++ int32_t offset;
++ int32_t size;
++
++ /* IN/OUT: Contains the result after a READ or the value to WRITE */
++ uint32_t value;
++ /* IN: Contains extra info for this operation */
++ uint32_t info;
++ /* IN: param for MSI-X */
++ struct xen_msix_entry msix_entries[SH_INFO_MAX_VEC];
++};
++
++/* used for PCIe AER handling */
++struct xen_pcie_aer_op
++{
++
++ /* IN: what action to perform: XEN_PCI_OP_* */
++ uint32_t cmd;
++ /*IN/OUT: return aer_op result or carry error_detected state as input*/
++ int32_t err;
++
++ /* IN: which device to touch */
++ uint32_t domain; /* PCI Domain/Segment*/
++ uint32_t bus;
++ uint32_t devfn;
++};
++struct xen_pci_sharedinfo {
++ /* flags - XEN_PCIF_* */
++ uint32_t flags;
++ struct xen_pci_op op;
++ struct xen_pcie_aer_op aer_op;
++};
++
++#endif /* __XEN_PCI_COMMON_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
index e8cbf43..865dcf0 100644
--- a/include/xen/interface/io/ring.h
@@ -18051,6 +25160,26 @@
union __name##_sring_entry ring[1]; /* variable-length */ \
}; \
\
+diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h
+index 46508c7..9fda532 100644
+--- a/include/xen/interface/io/xenbus.h
++++ b/include/xen/interface/io/xenbus.h
+@@ -27,8 +27,14 @@ enum xenbus_state
+ XenbusStateClosing = 5, /* The device is being closed
+ due to an error or an unplug
+ event. */
+- XenbusStateClosed = 6
++ XenbusStateClosed = 6,
+
++ /*
++ * Reconfiguring: The device is being reconfigured.
++ */
++ XenbusStateReconfiguring = 7,
++
++ XenbusStateReconfigured = 8
+ };
+
+ #endif /* _XEN_PUBLIC_IO_XENBUS_H */
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index af36ead..eac3ce1 100644
--- a/include/xen/interface/memory.h
@@ -18705,6 +25834,177 @@
struct device_driver driver;
int (*read_otherend_details)(struct xenbus_device *dev);
int (*is_ready)(struct xenbus_device *dev);
+diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
+index 3e1c36e..931a4d9 100644
+--- a/kernel/hrtimer.c
++++ b/kernel/hrtimer.c
+@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+ static int hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+ {
+- ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
++ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ int res;
+
+@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
+ if (expires.tv64 < 0)
+ return -ETIME;
+
+- if (expires.tv64 >= expires_next->tv64)
++ if (expires.tv64 >= cpu_base->expires_next.tv64)
++ return 0;
++
++ /*
++ * If a hang was detected in the last timer interrupt then we
++ * do not schedule a timer which is earlier than the expiry
++ * which we enforced in the hang detection. We want the system
++ * to make progress.
++ */
++ if (cpu_base->hang_detected)
+ return 0;
+
+ /*
+@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
+ */
+ res = tick_program_event(expires, 0);
+ if (!IS_ERR_VALUE(res))
+- *expires_next = expires;
++ cpu_base->expires_next = expires;
+ return res;
+ }
+
+@@ -1217,29 +1226,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+
+ #ifdef CONFIG_HIGH_RES_TIMERS
+
+-static int force_clock_reprogram;
+-
+-/*
+- * After 5 iteration's attempts, we consider that hrtimer_interrupt()
+- * is hanging, which could happen with something that slows the interrupt
+- * such as the tracing. Then we force the clock reprogramming for each future
+- * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
+- * threshold that we will overwrite.
+- * The next tick event will be scheduled to 3 times we currently spend on
+- * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
+- * 1/4 of their time to process the hrtimer interrupts. This is enough to
+- * let it running without serious starvation.
+- */
+-
+-static inline void
+-hrtimer_interrupt_hanging(struct clock_event_device *dev,
+- ktime_t try_time)
+-{
+- force_clock_reprogram = 1;
+- dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
+- printk(KERN_WARNING "hrtimer: interrupt too slow, "
+- "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
+-}
+ /*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+@@ -1248,21 +1234,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
+ {
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_clock_base *base;
+- ktime_t expires_next, now;
+- int nr_retries = 0;
+- int i;
++ ktime_t expires_next, now, entry_time, delta;
++ int i, retries = 0;
+
+ BUG_ON(!cpu_base->hres_active);
+ cpu_base->nr_events++;
+ dev->next_event.tv64 = KTIME_MAX;
+
+- retry:
+- /* 5 retries is enough to notice a hang */
+- if (!(++nr_retries % 5))
+- hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
+-
+- now = ktime_get();
+-
++ entry_time = now = ktime_get();
++retry:
+ expires_next.tv64 = KTIME_MAX;
+
+ spin_lock(&cpu_base->lock);
+@@ -1324,10 +1304,48 @@ void hrtimer_interrupt(struct clock_event_device *dev)
+ spin_unlock(&cpu_base->lock);
+
+ /* Reprogramming necessary ? */
+- if (expires_next.tv64 != KTIME_MAX) {
+- if (tick_program_event(expires_next, force_clock_reprogram))
+- goto retry;
++ if (expires_next.tv64 == KTIME_MAX ||
++ !tick_program_event(expires_next, 0)) {
++ cpu_base->hang_detected = 0;
++ return;
+ }
++
++ /*
++ * The next timer was already expired due to:
++ * - tracing
++ * - long lasting callbacks
++ * - being scheduled away when running in a VM
++ *
++ * We need to prevent that we loop forever in the hrtimer
++ * interrupt routine. We give it 3 attempts to avoid
++ * overreacting on some spurious event.
++ */
++ now = ktime_get();
++ cpu_base->nr_retries++;
++ if (++retries < 3)
++ goto retry;
++ /*
++ * Give the system a chance to do something else than looping
++ * here. We stored the entry time, so we know exactly how long
++ * we spent here. We schedule the next event this amount of
++ * time away.
++ */
++ cpu_base->nr_hangs++;
++ cpu_base->hang_detected = 1;
++ delta = ktime_sub(now, entry_time);
++ if (delta.tv64 > cpu_base->max_hang_time.tv64)
++ cpu_base->max_hang_time = delta;
++ /*
++ * Limit it to a sensible value as we enforce a longer
++ * delay. Give the CPU at least 100ms to catch up.
++ */
++ if (delta.tv64 > 100 * NSEC_PER_MSEC)
++ expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
++ else
++ expires_next = ktime_add(now, delta);
++ tick_program_event(expires_next, 1);
++ printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
++ ktime_to_ns(delta));
+ }
+
+ /*
+diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
+index 1b5b7aa..54c0dda 100644
+--- a/kernel/time/timer_list.c
++++ b/kernel/time/timer_list.c
+@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
+ P_ns(expires_next);
+ P(hres_active);
+ P(nr_events);
++ P(nr_retries);
++ P(nr_hangs);
++ P_ns(max_hang_time);
+ #endif
+ #undef P
+ #undef P_ns
+@@ -252,7 +255,7 @@ static int timer_list_show(struct seq_file *m, void *v)
+ u64 now = ktime_to_ns(ktime_get());
+ int cpu;
+
+- SEQ_printf(m, "Timer List Version: v0.4\n");
++ SEQ_printf(m, "Timer List Version: v0.5\n");
+ SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
+ SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+
diff --git a/lib/Makefile b/lib/Makefile
index 2e78277..7c31e3d 100644
--- a/lib/Makefile