[kernel] r11329 - in dists/trunk/linux-2.6/debian/patches: bugfix/all series
Maximilian Attems
maks@alioth.debian.org
Thu May 8 08:43:42 UTC 2008
Author: maks
Date: Thu May 8 08:43:39 2008
New Revision: 11329
Log:
update to 2.6.26-rc1-git6
no conflicts yet
Added:
dists/trunk/linux-2.6/debian/patches/bugfix/all/patch-2.6.26-rc1-git6
Removed:
dists/trunk/linux-2.6/debian/patches/bugfix/all/patch-2.6.26-rc1-git4
Modified:
dists/trunk/linux-2.6/debian/patches/series/1~experimental.1
Added: dists/trunk/linux-2.6/debian/patches/bugfix/all/patch-2.6.26-rc1-git6
==============================================================================
--- (empty file)
+++ dists/trunk/linux-2.6/debian/patches/bugfix/all/patch-2.6.26-rc1-git6 Thu May 8 08:43:39 2008
@@ -0,0 +1,10057 @@
+diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl
+index 97618be..028a844 100644
+--- a/Documentation/DocBook/kgdb.tmpl
++++ b/Documentation/DocBook/kgdb.tmpl
+@@ -72,7 +72,7 @@
+ kgdb is a source level debugger for linux kernel. It is used along
+ with gdb to debug a linux kernel. The expectation is that gdb can
+ be used to "break in" to the kernel to inspect memory, variables
+- and look through a cal stack information similar to what an
++ and look through call stack information similar to what an
+ application developer would use gdb for. It is possible to place
+ breakpoints in kernel code and perform some limited execution
+ stepping.
+@@ -93,8 +93,10 @@
+ <chapter id="CompilingAKernel">
+ <title>Compiling a kernel</title>
+ <para>
+- To enable <symbol>CONFIG_KGDB</symbol>, look under the "Kernel debugging"
+- and then select "KGDB: kernel debugging with remote gdb".
++ To enable <symbol>CONFIG_KGDB</symbol> you should first turn on
++ "Prompt for development and/or incomplete code/drivers"
++ (CONFIG_EXPERIMENTAL) in "General setup", then under the
++ "Kernel debugging" select "KGDB: kernel debugging with remote gdb".
+ </para>
+ <para>
+ Next you should choose one of more I/O drivers to interconnect debugging
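For reference, a KGDB-over-serial configuration built per the updated
instructions would end up with .config fragments along these lines
(CONFIG_KGDB_SERIAL_CONSOLE is one plausible I/O driver choice, not
taken from this patch; exact symbols depend on the tree):

  CONFIG_EXPERIMENTAL=y
  CONFIG_DEBUG_KERNEL=y
  CONFIG_KGDB=y
  CONFIG_KGDB_SERIAL_CONSOLE=y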
+diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
+index c2992bc..8b22d7d 100644
+--- a/Documentation/filesystems/Locking
++++ b/Documentation/filesystems/Locking
+@@ -92,7 +92,6 @@ prototypes:
+ void (*destroy_inode)(struct inode *);
+ void (*dirty_inode) (struct inode *);
+ int (*write_inode) (struct inode *, int);
+- void (*put_inode) (struct inode *);
+ void (*drop_inode) (struct inode *);
+ void (*delete_inode) (struct inode *);
+ void (*put_super) (struct super_block *);
+@@ -115,7 +114,6 @@ alloc_inode: no no no
+ destroy_inode: no
+ dirty_inode: no (must not sleep)
+ write_inode: no
+-put_inode: no
+ drop_inode: no !!!inode_lock!!!
+ delete_inode: no
+ put_super: yes yes no
+diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
+index 81e5be6..b7522c6 100644
+--- a/Documentation/filesystems/vfs.txt
++++ b/Documentation/filesystems/vfs.txt
+@@ -205,7 +205,6 @@ struct super_operations {
+
+ void (*dirty_inode) (struct inode *);
+ int (*write_inode) (struct inode *, int);
+- void (*put_inode) (struct inode *);
+ void (*drop_inode) (struct inode *);
+ void (*delete_inode) (struct inode *);
+ void (*put_super) (struct super_block *);
+@@ -246,9 +245,6 @@ or bottom half).
+ inode to disc. The second parameter indicates whether the write
+ should be synchronous or not, not all filesystems check this flag.
+
+- put_inode: called when the VFS inode is removed from the inode
+- cache.
+-
+ drop_inode: called when the last access to the inode is dropped,
+ with the inode_lock spinlock held.
+
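With put_inode gone, a filesystem's super_operations simply omits the
hook and relies on drop_inode/delete_inode for teardown. A hypothetical
sketch (the foo_* names are placeholders, not from this patch):

  static const struct super_operations foo_super_ops = {
  	.alloc_inode	= foo_alloc_inode,
  	.destroy_inode	= foo_destroy_inode,
  	.write_inode	= foo_write_inode,
  	/* no .put_inode: nothing runs on every iput() any more */
  	.drop_inode	= generic_delete_inode,
  	.delete_inode	= foo_delete_inode,
  	.put_super	= foo_put_super,
  };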
+diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
+index 00b950d..c412c24 100644
+--- a/Documentation/kbuild/kconfig-language.txt
++++ b/Documentation/kbuild/kconfig-language.txt
+@@ -377,27 +377,3 @@ config FOO
+
+ limits FOO to module (=m) or disabled (=n).
+
+-
+-Build limited by a third config symbol which may be =y or =m
+-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-A common idiom that we see (and sometimes have problems with) is this:
+-
+-When option C in B (module or subsystem) uses interfaces from A (module
+-or subsystem), and both A and B are tristate (could be =y or =m if they
+-were independent of each other, but they aren't), then we need to limit
+-C such that it cannot be built statically if A is built as a loadable
+-module. (C already depends on B, so there is no dependency issue to
+-take care of here.)
+-
+-If A is linked statically into the kernel image, C can be built
+-statically or as loadable module(s). However, if A is built as loadable
+-module(s), then C must be restricted to loadable module(s) also. This
+-can be expressed in kconfig language as:
+-
+-config C
+- depends on A = y || A = B
+-
+-or for real examples, use this command in a kernel tree:
+-
+-$ find . -name Kconfig\* | xargs grep -ns "depends on.*=.*||.*=" | grep -v orig
+-
+diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
+index a3c3544..cdd5b93 100644
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -1094,9 +1094,6 @@ and is between 256 and 4096 characters. It is defined in the file
+ mac5380= [HW,SCSI] Format:
+ <can_queue>,<cmd_per_lun>,<sg_tablesize>,<hostid>,<use_tags>
+
+- mac53c9x= [HW,SCSI] Format:
+- <num_esps>,<disconnect>,<nosync>,<can_queue>,<cmd_per_lun>,<sg_tablesize>,<hostid>,<use_tags>
+-
+ machvec= [IA64] Force the use of a particular machine-vector
+ (machvec) in a generic kernel.
+ Example: machvec=hpzx1_swiotlb
+@@ -1525,6 +1522,8 @@ and is between 256 and 4096 characters. It is defined in the file
+ This is normally done in pci_enable_device(),
+ so this option is a temporary workaround
+ for broken drivers that don't call it.
++ skip_isa_align [X86] do not align io start addr, so can
++ handle more pci cards
+ firmware [ARM] Do not re-enumerate the bus but instead
+ just use the configuration from the
+ bootloader. This is currently used on
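Given its placement among the pci= sub-options above, the new flag is
presumably passed on the kernel command line as:

  pci=skip_isa_align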
+diff --git a/Documentation/scheduler/sched-design.txt b/Documentation/scheduler/sched-design.txt
+deleted file mode 100644
+index 1605bf0..0000000
+--- a/Documentation/scheduler/sched-design.txt
++++ /dev/null
+@@ -1,165 +0,0 @@
+- Goals, Design and Implementation of the
+- new ultra-scalable O(1) scheduler
+-
+-
+- This is an edited version of an email Ingo Molnar sent to
+- lkml on 4 Jan 2002. It describes the goals, design, and
+- implementation of Ingo's new ultra-scalable O(1) scheduler.
+- Last Updated: 18 April 2002.
+-
+-
+-Goal
+-====
+-
+-The main goal of the new scheduler is to keep all the good things we know
+-and love about the current Linux scheduler:
+-
+- - good interactive performance even during high load: if the user
+- types or clicks then the system must react instantly and must execute
+- the user tasks smoothly, even during considerable background load.
+-
+- - good scheduling/wakeup performance with 1-2 runnable processes.
+-
+- - fairness: no process should stay without any timeslice for any
+- unreasonable amount of time. No process should get an unjustly high
+- amount of CPU time.
+-
+- - priorities: less important tasks can be started with lower priority,
+- more important tasks with higher priority.
+-
+- - SMP efficiency: no CPU should stay idle if there is work to do.
+-
+- - SMP affinity: processes which run on one CPU should stay affine to
+- that CPU. Processes should not bounce between CPUs too frequently.
+-
+- - plus additional scheduler features: RT scheduling, CPU binding.
+-
+-and the goal is also to add a few new things:
+-
+- - fully O(1) scheduling. Are you tired of the recalculation loop
+- blowing the L1 cache away every now and then? Do you think the goodness
+- loop is taking a bit too long to finish if there are lots of runnable
+- processes? This new scheduler takes no prisoners: wakeup(), schedule(),
+- the timer interrupt are all O(1) algorithms. There is no recalculation
+- loop. There is no goodness loop either.
+-
+- - 'perfect' SMP scalability. With the new scheduler there is no 'big'
+- runqueue_lock anymore - it's all per-CPU runqueues and locks - two
+- tasks on two separate CPUs can wake up, schedule and context-switch
+- completely in parallel, without any interlocking. All
+- scheduling-relevant data is structured for maximum scalability.
+-
+- - better SMP affinity. The old scheduler has a particular weakness that
+- causes the random bouncing of tasks between CPUs if/when higher
+- priority/interactive tasks, this was observed and reported by many
+- people. The reason is that the timeslice recalculation loop first needs
+- every currently running task to consume its timeslice. But when this
+- happens on eg. an 8-way system, then this property starves an
+- increasing number of CPUs from executing any process. Once the last
+- task that has a timeslice left has finished using up that timeslice,
+- the recalculation loop is triggered and other CPUs can start executing
+- tasks again - after having idled around for a number of timer ticks.
+- The more CPUs, the worse this effect.
+-
+- Furthermore, this same effect causes the bouncing effect as well:
+- whenever there is such a 'timeslice squeeze' of the global runqueue,
+- idle processors start executing tasks which are not affine to that CPU.
+- (because the affine tasks have finished off their timeslices already.)
+-
+- The new scheduler solves this problem by distributing timeslices on a
+- per-CPU basis, without having any global synchronization or
+- recalculation.
+-
+- - batch scheduling. A significant proportion of computing-intensive tasks
+- benefit from batch-scheduling, where timeslices are long and processes
+- are roundrobin scheduled. The new scheduler does such batch-scheduling
+- of the lowest priority tasks - so nice +19 jobs will get
+- 'batch-scheduled' automatically. With this scheduler, nice +19 jobs are
+- in essence SCHED_IDLE, from an interactiveness point of view.
+-
+- - handle extreme loads more smoothly, without breakdown and scheduling
+- storms.
+-
+- - O(1) RT scheduling. For those RT folks who are paranoid about the
+- O(nr_running) property of the goodness loop and the recalculation loop.
+-
+- - run fork()ed children before the parent. Andrea has pointed out the
+- advantages of this a few months ago, but patches for this feature
+- do not work with the old scheduler as well as they should,
+- because idle processes often steal the new child before the fork()ing
+- CPU gets to execute it.
+-
+-
+-Design
+-======
+-
+-The core of the new scheduler contains the following mechanisms:
+-
+- - *two* priority-ordered 'priority arrays' per CPU. There is an 'active'
+- array and an 'expired' array. The active array contains all tasks that
+- are affine to this CPU and have timeslices left. The expired array
+- contains all tasks which have used up their timeslices - but this array
+- is kept sorted as well. The active and expired array is not accessed
+- directly, it's accessed through two pointers in the per-CPU runqueue
+- structure. If all active tasks are used up then we 'switch' the two
+- pointers and from now on the ready-to-go (former-) expired array is the
+- active array - and the empty active array serves as the new collector
+- for expired tasks.
+-
+- - there is a 64-bit bitmap cache for array indices. Finding the highest
+- priority task is thus a matter of two x86 BSFL bit-search instructions.
+-
+-the split-array solution enables us to have an arbitrary number of active
+-and expired tasks, and the recalculation of timeslices can be done
+-immediately when the timeslice expires. Because the arrays are always
+-access through the pointers in the runqueue, switching the two arrays can
+-be done very quickly.
+-
+-this is a hybride priority-list approach coupled with roundrobin
+-scheduling and the array-switch method of distributing timeslices.
+-
+- - there is a per-task 'load estimator'.
+-
+-one of the toughest things to get right is good interactive feel during
+-heavy system load. While playing with various scheduler variants i found
+-that the best interactive feel is achieved not by 'boosting' interactive
+-tasks, but by 'punishing' tasks that want to use more CPU time than there
+-is available. This method is also much easier to do in an O(1) fashion.
+-
+-to establish the actual 'load' the task contributes to the system, a
+-complex-looking but pretty accurate method is used: there is a 4-entry
+-'history' ringbuffer of the task's activities during the last 4 seconds.
+-This ringbuffer is operated without much overhead. The entries tell the
+-scheduler a pretty accurate load-history of the task: has it used up more
+-CPU time or less during the past N seconds. [the size '4' and the interval
+-of 4x 1 seconds was found by lots of experimentation - this part is
+-flexible and can be changed in both directions.]
+-
+-the penalty a task gets for generating more load than the CPU can handle
+-is a priority decrease - there is a maximum amount to this penalty
+-relative to their static priority, so even fully CPU-bound tasks will
+-observe each other's priorities, and will share the CPU accordingly.
+-
+-the SMP load-balancer can be extended/switched with additional parallel
+-computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs
+-can be supported easily by changing the load-balancer. Right now it's
+-tuned for my SMP systems.
+-
+-i skipped the prev->mm == next->mm advantage - no workload i know of shows
+-any sensitivity to this. It can be added back by sacrificing O(1)
+-schedule() [the current and one-lower priority list can be searched for a
+-that->mm == current->mm condition], but costs a fair number of cycles
+-during a number of important workloads, so i wanted to avoid this as much
+-as possible.
+-
+-- the SMP idle-task startup code was still racy and the new scheduler
+-triggered this. So i streamlined the idle-setup code a bit. We do not call
+-into schedule() before all processors have started up fully and all idle
+-threads are in place.
+-
+-- the patch also cleans up a number of aspects of sched.c - moves code
+-into other areas of the kernel where it's appropriate, and simplifies
+-certain code paths and data constructs. As a result, the new scheduler's
+-code is smaller than the old one.
+-
+- Ingo
+diff --git a/MAINTAINERS b/MAINTAINERS
+index abe2787..f5583dc 100644
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -2112,12 +2112,10 @@ L: netdev@vger.kernel.org
+ S: Maintained
+
+ INTEL ETHERNET DRIVERS (e100/e1000/e1000e/igb/ixgb/ixgbe)
+-P: Auke Kok
+-M: auke-jan.h.kok@intel.com
+-P: Jesse Brandeburg
+-M: jesse.brandeburg@intel.com
+ P: Jeff Kirsher
+ M: jeffrey.t.kirsher@intel.com
++P: Jesse Brandeburg
++M: jesse.brandeburg@intel.com
+ P: Bruce Allan
+ M: bruce.w.allan@intel.com
+ P: John Ronciak
+diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c
+index 9bd1870..0128687 100644
+--- a/arch/arm/kernel/sys_arm.c
++++ b/arch/arm/kernel/sys_arm.c
+@@ -34,23 +34,6 @@ extern unsigned long do_mremap(unsigned long addr, unsigned long old_len,
+ unsigned long new_len, unsigned long flags,
+ unsigned long new_addr);
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way unix traditionally does this, though.
+- */
+-asmlinkage int sys_pipe(unsigned long __user *fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ /* common code for old and new mmaps */
+ inline long do_mmap2(
+ unsigned long addr, unsigned long len,
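The function deleted here (and in the other architectures below) is the
same logic everywhere, differing only in how the copy size is spelled,
which is what makes the consolidation possible. A minimal sketch of the
generic replacement, assuming it lands in common code such as fs/pipe.c
as a weak symbol that odd-ABI architectures can still override:

  #include <linux/syscalls.h>
  #include <asm/uaccess.h>

  /*
   * Create the pipe in kernel space, then copy both descriptors
   * out to userspace in a single copy_to_user().
   */
  asmlinkage long __weak sys_pipe(int __user *fildes)
  {
  	int fd[2];
  	int error;

  	error = do_pipe(fd);
  	if (!error) {
  		if (copy_to_user(fildes, fd, sizeof(fd)))
  			error = -EFAULT;
  	}
  	return error;
  }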
+diff --git a/arch/avr32/kernel/sys_avr32.c b/arch/avr32/kernel/sys_avr32.c
+index 8deb600..8e8911e 100644
+--- a/arch/avr32/kernel/sys_avr32.c
++++ b/arch/avr32/kernel/sys_avr32.c
+@@ -14,19 +14,6 @@
+ #include <asm/mman.h>
+ #include <asm/uaccess.h>
+
+-asmlinkage int sys_pipe(unsigned long __user *filedes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(filedes, fd, sizeof(fd)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, off_t offset)
+diff --git a/arch/blackfin/kernel/sys_bfin.c b/arch/blackfin/kernel/sys_bfin.c
+index efb7b25..fce49d7 100644
+--- a/arch/blackfin/kernel/sys_bfin.c
++++ b/arch/blackfin/kernel/sys_bfin.c
+@@ -45,23 +45,6 @@
+ #include <asm/cacheflush.h>
+ #include <asm/dma.h>
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way unix traditionally does this, though.
+- */
+-asmlinkage int sys_pipe(unsigned long __user *fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2 * sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ /* common code for old and new mmaps */
+ static inline long
+ do_mmap2(unsigned long addr, unsigned long len,
+diff --git a/arch/frv/kernel/sys_frv.c b/arch/frv/kernel/sys_frv.c
+index 04c6b16..49b2cf2 100644
+--- a/arch/frv/kernel/sys_frv.c
++++ b/arch/frv/kernel/sys_frv.c
+@@ -28,23 +28,6 @@
+ #include <asm/setup.h>
+ #include <asm/uaccess.h>
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way unix traditionally does this, though.
+- */
+-asmlinkage long sys_pipe(unsigned long __user * fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long pgoff)
+diff --git a/arch/h8300/kernel/sys_h8300.c b/arch/h8300/kernel/sys_h8300.c
+index 00608be..2745656 100644
+--- a/arch/h8300/kernel/sys_h8300.c
++++ b/arch/h8300/kernel/sys_h8300.c
+@@ -27,23 +27,6 @@
+ #include <asm/traps.h>
+ #include <asm/unistd.h>
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way unix traditionally does this, though.
+- */
+-asmlinkage int sys_pipe(unsigned long * fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ /* common code for old and new mmaps */
+ static inline long do_mmap2(
+ unsigned long addr, unsigned long len,
+diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c
+index e892f17..7f54efa 100644
+--- a/arch/m68k/kernel/sys_m68k.c
++++ b/arch/m68k/kernel/sys_m68k.c
+@@ -30,23 +30,6 @@
+ #include <asm/page.h>
+ #include <asm/unistd.h>
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way unix traditionally does this, though.
+- */
+-asmlinkage int sys_pipe(unsigned long __user * fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ /* common code for old and new mmaps */
+ static inline long do_mmap2(
+ unsigned long addr, unsigned long len,
+diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
+index fd4858e..75b8340 100644
+--- a/arch/m68k/kernel/traps.c
++++ b/arch/m68k/kernel/traps.c
+@@ -468,15 +468,26 @@ static inline void access_error040(struct frame *fp)
+ * (if do_page_fault didn't fix the mapping,
+ * the writeback won't do good)
+ */
++disable_wb:
+ #ifdef DEBUG
+ printk(".. disabling wb2\n");
+ #endif
+ if (fp->un.fmt7.wb2a == fp->un.fmt7.faddr)
+ fp->un.fmt7.wb2s &= ~WBV_040;
++ if (fp->un.fmt7.wb3a == fp->un.fmt7.faddr)
++ fp->un.fmt7.wb3s &= ~WBV_040;
+ }
+- } else if (send_fault_sig(&fp->ptregs) > 0) {
+- printk("68040 access error, ssw=%x\n", ssw);
+- trap_c(fp);
++ } else {
++ /* In case of a bus error we either kill the process or expect
++ * the kernel to catch the fault, which then is also responsible
++ * for cleaning up the mess.
++ */
++ current->thread.signo = SIGBUS;
++ current->thread.faddr = fp->un.fmt7.faddr;
++ if (send_fault_sig(&fp->ptregs) >= 0)
++ printk("68040 bus error (ssw=%x, faddr=%lx)\n", ssw,
++ fp->un.fmt7.faddr);
++ goto disable_wb;
+ }
+
+ do_040writebacks(fp);
+diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c
+index 735a49b..ad3e3ba 100644
+--- a/arch/m68k/mac/config.c
++++ b/arch/m68k/mac/config.c
+@@ -48,9 +48,6 @@
+ struct mac_booter_data mac_bi_data;
+ int mac_bisize = sizeof mac_bi_data;
+
+-struct mac_hw_present mac_hw_present;
+-EXPORT_SYMBOL(mac_hw_present);
+-
+ /* New m68k bootinfo stuff and videobase */
+
+ extern int m68k_num_memory;
+@@ -817,27 +814,6 @@ void __init mac_identify(void)
+ m68k_ramdisk.addr, m68k_ramdisk.size);
+ #endif
+
+- /*
+- * TODO: set the various fields in macintosh_config->hw_present here!
+- */
+- switch (macintosh_config->scsi_type) {
+- case MAC_SCSI_OLD:
+- MACHW_SET(MAC_SCSI_80);
+- break;
+- case MAC_SCSI_QUADRA:
+- case MAC_SCSI_QUADRA2:
+- case MAC_SCSI_QUADRA3:
+- MACHW_SET(MAC_SCSI_96);
+- if ((macintosh_config->ident == MAC_MODEL_Q900) ||
+- (macintosh_config->ident == MAC_MODEL_Q950))
+- MACHW_SET(MAC_SCSI_96_2);
+- break;
+- default:
+- printk(KERN_WARNING "config.c: wtf: unknown scsi, using 53c80\n");
+- MACHW_SET(MAC_SCSI_80);
+- break;
+- }
+-
+ iop_init();
+ via_init();
+ oss_init();
+diff --git a/arch/m68knommu/kernel/sys_m68k.c b/arch/m68knommu/kernel/sys_m68k.c
+index 65f7a95..7002816 100644
+--- a/arch/m68knommu/kernel/sys_m68k.c
++++ b/arch/m68knommu/kernel/sys_m68k.c
+@@ -28,23 +28,6 @@
+ #include <asm/cacheflush.h>
+ #include <asm/unistd.h>
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way unix traditionally does this, though.
+- */
+-asmlinkage int sys_pipe(unsigned long * fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ /* common code for old and new mmaps */
+ static inline long do_mmap2(
+ unsigned long addr, unsigned long len,
+diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c
+index 5f17a1e..bca5a84 100644
+--- a/arch/mn10300/kernel/sys_mn10300.c
++++ b/arch/mn10300/kernel/sys_mn10300.c
+@@ -29,23 +29,6 @@
+ #define MIN_MAP_ADDR PAGE_SIZE /* minimum fixed mmap address */
+
+ /*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way Unix traditionally does this, though.
+- */
+-asmlinkage long sys_pipe(unsigned long __user *fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2 * sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+-/*
+ * memory mapping syscall
+ */
+ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
+diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c
+index 4f58921..71b3195 100644
+--- a/arch/parisc/kernel/sys_parisc.c
++++ b/arch/parisc/kernel/sys_parisc.c
+@@ -33,19 +33,6 @@
+ #include <linux/utsname.h>
+ #include <linux/personality.h>
+
+-int sys_pipe(int __user *fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ static unsigned long get_unshared_area(unsigned long addr, unsigned long len)
+ {
+ struct vm_area_struct *vma;
+diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
+index e722a4e..4fe69ca 100644
+--- a/arch/powerpc/kernel/syscalls.c
++++ b/arch/powerpc/kernel/syscalls.c
+@@ -136,23 +136,6 @@ int sys_ipc(uint call, int first, unsigned long second, long third,
+ return ret;
+ }
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way unix traditionally does this, though.
+- */
+-int sys_pipe(int __user *fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ static inline unsigned long do_mmap2(unsigned long addr, size_t len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long off, int shift)
+diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c
+index 6d9884a..712d89a 100644
+--- a/arch/powerpc/kvm/booke_guest.c
++++ b/arch/powerpc/kvm/booke_guest.c
+@@ -49,6 +49,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { "inst_emu", VCPU_STAT(emulated_inst_exits) },
+ { "dec", VCPU_STAT(dec_exits) },
+ { "ext_intr", VCPU_STAT(ext_intr_exits) },
++ { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+ { NULL }
+ };
+
+@@ -338,6 +339,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ }
+ break;
+
++ case BOOKE_INTERRUPT_FP_UNAVAIL:
++ kvmppc_queue_exception(vcpu, exit_nr);
++ r = RESUME_GUEST;
++ break;
++
+ case BOOKE_INTERRUPT_DATA_STORAGE:
+ vcpu->arch.dear = vcpu->arch.fault_dear;
+ vcpu->arch.esr = vcpu->arch.fault_esr;
+diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
+index bad40bd..777e0f3 100644
+--- a/arch/powerpc/kvm/powerpc.c
++++ b/arch/powerpc/kvm/powerpc.c
+@@ -36,13 +36,12 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+
+ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
+ {
+- /* XXX implement me */
+- return 0;
++ return !!(v->arch.pending_exceptions);
+ }
+
+ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
+ {
+- return 1;
++ return !(v->arch.msr & MSR_WE);
+ }
+
+
+@@ -214,6 +213,11 @@ static void kvmppc_decrementer_func(unsigned long data)
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+
+ kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER);
++
++ if (waitqueue_active(&vcpu->wq)) {
++ wake_up_interruptible(&vcpu->wq);
++ vcpu->stat.halt_wakeup++;
++ }
+ }
+
+ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+@@ -339,6 +343,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int r;
+ sigset_t sigsaved;
+
++ vcpu_load(vcpu);
++
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+@@ -363,12 +369,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
++ vcpu_put(vcpu);
++
+ return r;
+ }
+
+ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
+ {
+ kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL);
++
++ if (waitqueue_active(&vcpu->wq)) {
++ wake_up_interruptible(&vcpu->wq);
++ vcpu->stat.halt_wakeup++;
++ }
++
+ return 0;
+ }
+
+diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
+index 4bb023f..f1d2cdc 100644
+--- a/arch/powerpc/lib/Makefile
++++ b/arch/powerpc/lib/Makefile
+@@ -23,3 +23,4 @@ obj-$(CONFIG_SMP) += locks.o
+ endif
+
+ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
++obj-$(CONFIG_HAS_IOMEM) += devres.o
+diff --git a/arch/powerpc/lib/devres.c b/arch/powerpc/lib/devres.c
+new file mode 100644
+index 0000000..292115d
+--- /dev/null
++++ b/arch/powerpc/lib/devres.c
+@@ -0,0 +1,42 @@
++/*
++ * Copyright (C) 2008 Freescale Semiconductor, Inc.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/device.h> /* devres_*(), devm_ioremap_release() */
++#include <linux/io.h> /* ioremap_flags() */
++#include <linux/module.h> /* EXPORT_SYMBOL() */
++
++/**
++ * devm_ioremap_prot - Managed ioremap_flags()
++ * @dev: Generic device to remap IO address for
++ * @offset: BUS offset to map
++ * @size: Size of map
++ * @flags: Page flags
++ *
++ * Managed ioremap_prot(). Map is automatically unmapped on driver
++ * detach.
++ */
++void __iomem *devm_ioremap_prot(struct device *dev, resource_size_t offset,
++ size_t size, unsigned long flags)
++{
++ void __iomem **ptr, *addr;
++
++ ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL);
++ if (!ptr)
++ return NULL;
++
++ addr = ioremap_flags(offset, size, flags);
++ if (addr) {
++ *ptr = addr;
++ devres_add(dev, ptr);
++ } else
++ devres_free(ptr);
++
++ return addr;
++}
++EXPORT_SYMBOL(devm_ioremap_prot);
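A hypothetical consumer of the new helper (foo_probe() and its resource
argument are illustrative only, not part of this patch) never calls
iounmap() itself; devres performs the unmap when the driver detaches:

  static int foo_probe(struct device *dev, struct resource *res)
  {
  	void __iomem *regs;

  	regs = devm_ioremap_prot(dev, res->start,
  				 res->end - res->start + 1,
  				 _PAGE_NO_CACHE | _PAGE_GUARDED);
  	if (!regs)
  		return -ENOMEM;

  	/* ... use readl()/writel() on regs; no explicit cleanup ... */
  	return 0;
  }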
+diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c
+index bec3803..417eca7 100644
+--- a/arch/powerpc/platforms/pseries/scanlog.c
++++ b/arch/powerpc/platforms/pseries/scanlog.c
+@@ -55,11 +55,6 @@ static ssize_t scanlog_read(struct file *file, char __user *buf,
+ dp = PDE(inode);
+ data = (unsigned int *)dp->data;
+
+- if (!data) {
+- printk(KERN_ERR "scanlog: read failed no data\n");
+- return -EIO;
+- }
+-
+ if (count > RTAS_DATA_BUF_SIZE)
+ count = RTAS_DATA_BUF_SIZE;
+
+@@ -146,11 +141,6 @@ static int scanlog_open(struct inode * inode, struct file * file)
+ struct proc_dir_entry *dp = PDE(inode);
+ unsigned int *data = (unsigned int *)dp->data;
+
+- if (!data) {
+- printk(KERN_ERR "scanlog: open failed no data\n");
+- return -EIO;
+- }
+-
+ if (data[0] != 0) {
+ /* This imperfect test stops a second copy of the
+ * data (or a reset while data is being copied)
+@@ -168,10 +158,6 @@ static int scanlog_release(struct inode * inode, struct file * file)
+ struct proc_dir_entry *dp = PDE(inode);
+ unsigned int *data = (unsigned int *)dp->data;
+
+- if (!data) {
+- printk(KERN_ERR "scanlog: release failed no data\n");
+- return -EIO;
+- }
+ data[0] = 0;
+
+ return 0;
+@@ -200,12 +186,11 @@ static int __init scanlog_init(void)
+ if (!data)
+ goto err;
+
+- ent = proc_create("ppc64/rtas/scan-log-dump", S_IRUSR, NULL,
+- &scanlog_fops);
++ ent = proc_create_data("ppc64/rtas/scan-log-dump", S_IRUSR, NULL,
++ &scanlog_fops, data);
+ if (!ent)
+ goto err;
+
+- ent->data = data;
+ proc_ppc64_scan_log_dump = ent;
+
+ return 0;
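The !data checks can go because proc_create_data() publishes ->data
before the proc entry becomes visible, closing the window in which a
reader could observe a live entry with a NULL pointer. The pattern in
isolation (foo names are placeholders):

  /* racy: the entry is live before ->data is assigned */
  ent = proc_create("foo", S_IRUSR, NULL, &foo_fops);
  ent->data = data;

  /* safe: ->data is set atomically with creation */
  ent = proc_create_data("foo", S_IRUSR, NULL, &foo_fops, data);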
+diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c
+index 988d0d6..5fdb799 100644
+--- a/arch/s390/kernel/sys_s390.c
++++ b/arch/s390/kernel/sys_s390.c
+@@ -32,23 +32,6 @@
+ #include <asm/uaccess.h>
+ #include "entry.h"
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way Unix traditionally does this, though.
+- */
+-asmlinkage long sys_pipe(unsigned long __user *fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ /* common code for old and new mmaps */
+ static inline long do_mmap2(
+ unsigned long addr, unsigned long len,
+diff --git a/arch/sh/kernel/sys_sh64.c b/arch/sh/kernel/sys_sh64.c
+index 578004d..91fb844 100644
+--- a/arch/sh/kernel/sys_sh64.c
++++ b/arch/sh/kernel/sys_sh64.c
+@@ -31,23 +31,6 @@
+ #include <asm/unistd.h>
+
+ /*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way Unix traditionally does this, though.
+- */
+-asmlinkage int sys_pipe(unsigned long * fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+-/*
+ * Do a system call from kernel instead of calling sys_execve so we
+ * end up with proper pt_regs.
+ */
+diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c
+index e7f3519..36431f3 100644
+--- a/arch/sparc/kernel/process.c
++++ b/arch/sparc/kernel/process.c
+@@ -419,14 +419,26 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags,
+ unsigned long stack_size)
+ {
+ unsigned long parent_tid_ptr, child_tid_ptr;
++ unsigned long orig_i1 = regs->u_regs[UREG_I1];
++ long ret;
+
+ parent_tid_ptr = regs->u_regs[UREG_I2];
+ child_tid_ptr = regs->u_regs[UREG_I4];
+
+- return do_fork(clone_flags, stack_start,
+- regs, stack_size,
+- (int __user *) parent_tid_ptr,
+- (int __user *) child_tid_ptr);
++ ret = do_fork(clone_flags, stack_start,
++ regs, stack_size,
++ (int __user *) parent_tid_ptr,
++ (int __user *) child_tid_ptr);
++
++ /* If we get an error and potentially restart the system
++ * call, we're screwed because copy_thread() clobbered
++ * the parent's %o1. So detect that case and restore it
++ * here.
++ */
++ if ((unsigned long)ret >= -ERESTART_RESTARTBLOCK)
++ regs->u_regs[UREG_I1] = orig_i1;
++
++ return ret;
+ }
+
+ /* Copy a Sparc thread. The fork() return value conventions
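The range check in the hunk above works because errno and restart codes
occupy the very top of the unsigned value space: -ERESTART_RESTARTBLOCK
is the most negative code in use, so the cast-and-compare catches
exactly the interval [-ERESTART_RESTARTBLOCK, -1]. In isolation:

  long ret = -ERESTARTSYS;	/* example: interrupted, may restart */

  if ((unsigned long)ret >= (unsigned long)-ERESTART_RESTARTBLOCK) {
  	/* ret is an error or restart code: undo any clobbered state */
  }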
+diff --git a/arch/sparc/kernel/sys_sparc.c b/arch/sparc/kernel/sys_sparc.c
+index f188b5d..e995491 100644
+--- a/arch/sparc/kernel/sys_sparc.c
++++ b/arch/sparc/kernel/sys_sparc.c
+@@ -223,8 +223,7 @@ int sparc_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
+ {
+ if (ARCH_SUN4C_SUN4 &&
+ (len > 0x20000000 ||
+- ((flags & MAP_FIXED) &&
+- addr < 0xe0000000 && addr + len > 0x20000000)))
++ (addr < 0xe0000000 && addr + len > 0x20000000)))
+ return -EINVAL;
+
+ /* See asm-sparc/uaccess.h */
+diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
+index 500ac6d..4129c04 100644
+--- a/arch/sparc64/kernel/process.c
++++ b/arch/sparc64/kernel/process.c
+@@ -503,6 +503,8 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags,
+ unsigned long stack_size)
+ {
+ int __user *parent_tid_ptr, *child_tid_ptr;
++ unsigned long orig_i1 = regs->u_regs[UREG_I1];
++ long ret;
+
+ #ifdef CONFIG_COMPAT
+ if (test_thread_flag(TIF_32BIT)) {
+@@ -515,9 +517,19 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags,
+ child_tid_ptr = (int __user *) regs->u_regs[UREG_I4];
+ }
+
+- return do_fork(clone_flags, stack_start,
+- regs, stack_size,
+- parent_tid_ptr, child_tid_ptr);
++ ret = do_fork(clone_flags, stack_start,
++ regs, stack_size,
++ parent_tid_ptr, child_tid_ptr);
++
++ /* If we get an error and potentially restart the system
++ * call, we're screwed because copy_thread() clobbered
++ * the parent's %o1. So detect that case and restore it
++ * here.
++ */
++ if ((unsigned long)ret >= -ERESTART_RESTARTBLOCK)
++ regs->u_regs[UREG_I1] = orig_i1;
++
++ return ret;
+ }
+
+ /* Copy a Sparc thread. The fork() return value conventions
+diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
+index 3aba476..0d6403a 100644
+--- a/arch/sparc64/kernel/smp.c
++++ b/arch/sparc64/kernel/smp.c
+@@ -865,21 +865,14 @@ void smp_call_function_client(int irq, struct pt_regs *regs)
+ void *info = call_data->info;
+
+ clear_softint(1 << irq);
+-
+- irq_enter();
+-
+- if (!call_data->wait) {
+- /* let initiator proceed after getting data */
+- atomic_inc(&call_data->finished);
+- }
+-
+- func(info);
+-
+- irq_exit();
+-
+ if (call_data->wait) {
+ /* let initiator proceed only after completion */
++ func(info);
+ atomic_inc(&call_data->finished);
++ } else {
++ /* let initiator proceed after getting data */
++ atomic_inc(&call_data->finished);
++ func(info);
+ }
+ }
+
+@@ -1041,9 +1034,7 @@ void smp_receive_signal(int cpu)
+
+ void smp_receive_signal_client(int irq, struct pt_regs *regs)
+ {
+- irq_enter();
+ clear_softint(1 << irq);
+- irq_exit();
+ }
+
+ void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
+@@ -1051,8 +1042,6 @@ void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
+ struct mm_struct *mm;
+ unsigned long flags;
+
+- irq_enter();
+-
+ clear_softint(1 << irq);
+
+ /* See if we need to allocate a new TLB context because
+@@ -1072,8 +1061,6 @@ void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
+ load_secondary_context(mm);
+ __flush_tlb_mm(CTX_HWBITS(mm->context),
+ SECONDARY_CONTEXT);
+-
+- irq_exit();
+ }
+
+ void smp_new_mmu_context_version(void)
+@@ -1239,8 +1226,6 @@ void smp_penguin_jailcell(int irq, struct pt_regs *regs)
+ {
+ clear_softint(1 << irq);
+
+- irq_enter();
+-
+ preempt_disable();
+
+ __asm__ __volatile__("flushw");
+@@ -1253,8 +1238,6 @@ void smp_penguin_jailcell(int irq, struct pt_regs *regs)
+ prom_world(0);
+
+ preempt_enable();
+-
+- irq_exit();
+ }
+
+ /* /proc/profile writes can call this, don't __init it please. */
+diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c
+index 8d4761f..0dbc941 100644
+--- a/arch/sparc64/kernel/sys_sparc.c
++++ b/arch/sparc64/kernel/sys_sparc.c
+@@ -549,13 +549,13 @@ int sparc64_mmap_check(unsigned long addr, unsigned long len,
+ if (len >= STACK_TOP32)
+ return -EINVAL;
+
+- if ((flags & MAP_FIXED) && addr > STACK_TOP32 - len)
++ if (addr > STACK_TOP32 - len)
+ return -EINVAL;
+ } else {
+ if (len >= VA_EXCLUDE_START)
+ return -EINVAL;
+
+- if ((flags & MAP_FIXED) && invalid_64bit_range(addr, len))
++ if (invalid_64bit_range(addr, len))
+ return -EINVAL;
+ }
+
+diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
+index 161ce47..1aa4288 100644
+--- a/arch/sparc64/kernel/sys_sparc32.c
++++ b/arch/sparc64/kernel/sys_sparc32.c
+@@ -236,13 +236,6 @@ asmlinkage long sys32_getegid16(void)
+
+ /* 32-bit timeval and related flotsam. */
+
+-static long get_tv32(struct timeval *o, struct compat_timeval __user *i)
+-{
+- return (!access_ok(VERIFY_READ, i, sizeof(*i)) ||
+- (__get_user(o->tv_sec, &i->tv_sec) |
+- __get_user(o->tv_usec, &i->tv_usec)));
+-}
+-
+ static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
+ {
+ return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
+@@ -757,30 +750,6 @@ asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv,
+ return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+ }
+
+-asmlinkage long sys32_utimes(char __user *filename,
+- struct compat_timeval __user *tvs)
+-{
+- struct timespec tv[2];
+-
+- if (tvs) {
+- struct timeval ktvs[2];
+- if (get_tv32(&ktvs[0], tvs) ||
+- get_tv32(&ktvs[1], 1+tvs))
+- return -EFAULT;
+-
+- if (ktvs[0].tv_usec < 0 || ktvs[0].tv_usec >= 1000000 ||
+- ktvs[1].tv_usec < 0 || ktvs[1].tv_usec >= 1000000)
+- return -EINVAL;
+-
+- tv[0].tv_sec = ktvs[0].tv_sec;
+- tv[0].tv_nsec = 1000 * ktvs[0].tv_usec;
+- tv[1].tv_sec = ktvs[1].tv_sec;
+- tv[1].tv_nsec = 1000 * ktvs[1].tv_usec;
+- }
+-
+- return do_utimes(AT_FDCWD, filename, tvs ? tv : NULL, 0);
+-}
+-
+ /* These are here just in case some old sparc32 binary calls it. */
+ asmlinkage long sys32_pause(void)
+ {
+diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S
+index a4fef2b..8b5282d 100644
+--- a/arch/sparc64/kernel/systbls.S
++++ b/arch/sparc64/kernel/systbls.S
+@@ -45,7 +45,7 @@ sys_call_table32:
+ /*120*/ .word compat_sys_readv, compat_sys_writev, sys32_settimeofday, sys32_fchown16, sys_fchmod
+ .word sys_nis_syscall, sys32_setreuid16, sys32_setregid16, sys_rename, sys_truncate
+ /*130*/ .word sys_ftruncate, sys_flock, compat_sys_lstat64, sys_nis_syscall, sys_nis_syscall
+- .word sys_nis_syscall, sys32_mkdir, sys_rmdir, sys32_utimes, compat_sys_stat64
++ .word sys_nis_syscall, sys32_mkdir, sys_rmdir, compat_sys_utimes, compat_sys_stat64
+ /*140*/ .word sys32_sendfile64, sys_nis_syscall, sys32_futex, sys_gettid, compat_sys_getrlimit
+ .word compat_sys_setrlimit, sys_pivot_root, sys32_prctl, sys_pciconfig_read, sys_pciconfig_write
+ /*150*/ .word sys_nis_syscall, sys_inotify_init, sys_inotify_add_watch, sys_poll, sys_getdents64
+diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
+index 4cad0b3..ec3e2c7 100644
+--- a/arch/sparc64/mm/init.c
++++ b/arch/sparc64/mm/init.c
+@@ -771,6 +771,9 @@ static void __init find_ramdisk(unsigned long phys_base)
+ initrd_end = ramdisk_image + sparc_ramdisk_size;
+
+ lmb_reserve(initrd_start, initrd_end);
++
++ initrd_start += PAGE_OFFSET;
++ initrd_end += PAGE_OFFSET;
+ }
+ #endif
+ }
+@@ -2362,16 +2365,3 @@ void __flush_tlb_all(void)
+ __asm__ __volatile__("wrpr %0, 0, %%pstate"
+ : : "r" (pstate));
+ }
+-
+-#ifdef CONFIG_MEMORY_HOTPLUG
+-
+-void online_page(struct page *page)
+-{
+- ClearPageReserved(page);
+- init_page_count(page);
+- __free_page(page);
+- totalram_pages++;
+- num_physpages++;
+-}
+-
+-#endif /* CONFIG_MEMORY_HOTPLUG */
+diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c
+index 9cffc62..128ee85 100644
+--- a/arch/um/kernel/syscall.c
++++ b/arch/um/kernel/syscall.c
+@@ -73,23 +73,6 @@ long old_mmap(unsigned long addr, unsigned long len,
+ out:
+ return err;
+ }
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way unix traditionally does this, though.
+- */
+-long sys_pipe(unsigned long __user * fildes)
+-{
+- int fd[2];
+- long error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, sizeof(fd)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+
+ long sys_uname(struct old_utsname __user * name)
+ {
+diff --git a/arch/v850/kernel/syscalls.c b/arch/v850/kernel/syscalls.c
+index 003db9c..1a83daf 100644
+--- a/arch/v850/kernel/syscalls.c
++++ b/arch/v850/kernel/syscalls.c
+@@ -132,23 +132,6 @@ sys_ipc (uint call, int first, int second, int third, void *ptr, long fifth)
+ return ret;
+ }
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way unix traditionally does this, though.
+- */
+-int sys_pipe (int *fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe (fd);
+- if (!error) {
+- if (copy_to_user (fildes, fd, 2*sizeof (int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ static inline unsigned long
+ do_mmap2 (unsigned long addr, size_t len,
+ unsigned long prot, unsigned long flags,
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
+index c3f8809..bbcafaa 100644
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -18,6 +18,7 @@ config X86_64
+ ### Arch settings
+ config X86
+ def_bool y
++ select HAVE_UNSTABLE_SCHED_CLOCK
+ select HAVE_IDE
+ select HAVE_OPROFILE
+ select HAVE_KPROBES
+@@ -1661,6 +1662,7 @@ config GEODE_MFGPT_TIMER
+
+ config OLPC
+ bool "One Laptop Per Child support"
++ depends on MGEODE_LX
+ default n
+ help
+ Add support for detecting the unique features of the OLPC
+diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
+index d01ea42..edaadea 100644
+--- a/arch/x86/boot/compressed/relocs.c
++++ b/arch/x86/boot/compressed/relocs.c
+@@ -191,7 +191,7 @@ static void read_ehdr(FILE *fp)
+ die("Cannot read ELF header: %s\n",
+ strerror(errno));
+ }
+- if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) {
++ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
+ die("No ELF magic\n");
+ }
+ if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
+diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
+index 7335959..fd5ca97 100644
+--- a/arch/x86/kernel/acpi/Makefile
++++ b/arch/x86/kernel/acpi/Makefile
+@@ -10,5 +10,5 @@ endif
+ $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
+
+ $(obj)/realmode/wakeup.bin: FORCE
+- $(Q)$(MAKE) $(build)=$(obj)/realmode $@
++ $(Q)$(MAKE) $(build)=$(obj)/realmode
+
+diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
+index 0929008..1c31cc0 100644
+--- a/arch/x86/kernel/acpi/realmode/Makefile
++++ b/arch/x86/kernel/acpi/realmode/Makefile
+@@ -6,7 +6,8 @@
+ # for more details.
+ #
+
+-targets := wakeup.bin wakeup.elf
++always := wakeup.bin
++targets := wakeup.elf wakeup.lds
+
+ wakeup-y += wakeup.o wakemain.o video-mode.o copy.o
+
+@@ -48,7 +49,7 @@ LDFLAGS_wakeup.elf := -T
+
+ CPPFLAGS_wakeup.lds += -P -C
+
+-$(obj)/wakeup.elf: $(src)/wakeup.lds $(WAKEUP_OBJS) FORCE
++$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
+ $(call if_changed,ld)
+
+ OBJCOPYFLAGS_wakeup.bin := -O binary
+diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
+index ddee040..4bc1be5 100644
+--- a/arch/x86/kernel/kvmclock.c
++++ b/arch/x86/kernel/kvmclock.c
+@@ -133,6 +133,7 @@ static int kvm_register_clock(void)
+ return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+ }
+
++#ifdef CONFIG_X86_LOCAL_APIC
+ static void kvm_setup_secondary_clock(void)
+ {
+ /*
+@@ -143,6 +144,7 @@ static void kvm_setup_secondary_clock(void)
+ /* ok, done with our trickery, call native */
+ setup_secondary_APIC_clock();
+ }
++#endif
+
+ /*
+ * After the clock is registered, the host will keep writing to the
+@@ -177,7 +179,9 @@ void __init kvmclock_init(void)
+ pv_time_ops.get_wallclock = kvm_get_wallclock;
+ pv_time_ops.set_wallclock = kvm_set_wallclock;
+ pv_time_ops.sched_clock = kvm_clock_read;
++#ifdef CONFIG_X86_LOCAL_APIC
+ pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
++#endif
+ machine_ops.shutdown = kvm_shutdown;
+ #ifdef CONFIG_KEXEC
+ machine_ops.crash_shutdown = kvm_crash_shutdown;
+diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
+index 3e2c54d..404683b 100644
+--- a/arch/x86/kernel/mpparse.c
++++ b/arch/x86/kernel/mpparse.c
+@@ -794,6 +794,11 @@ void __init find_smp_config(void)
+ ACPI-based MP Configuration
+ -------------------------------------------------------------------------- */
+
++/*
++ * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
++ */
++int es7000_plat;
++
+ #ifdef CONFIG_ACPI
+
+ #ifdef CONFIG_X86_IO_APIC
+@@ -909,8 +914,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+ MP_intsrc_info(&intsrc);
+ }
+
+-int es7000_plat;
+-
+ void __init mp_config_acpi_legacy_irqs(void)
+ {
+ struct mpc_config_intsrc intsrc;
+diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
+index 07c6d42..f6be7d5 100644
+--- a/arch/x86/kernel/reboot.c
++++ b/arch/x86/kernel/reboot.c
+@@ -149,7 +149,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+- DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index c0c68c1..cc6f5eb 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -95,7 +95,7 @@ void __init setup_per_cpu_areas(void)
+
+ /* Copy section for each CPU (we discard the original) */
+ size = PERCPU_ENOUGH_ROOM;
+- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
++ printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
+ size);
+
+ for_each_possible_cpu(i) {
+diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
+index 84241a2..6b087ab 100644
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -299,7 +299,7 @@ static void __cpuinit smp_callin(void)
+ /*
+ * Activate a secondary processor.
+ */
+-void __cpuinit start_secondary(void *unused)
++static void __cpuinit start_secondary(void *unused)
+ {
+ /*
+ * Don't put *anything* before cpu_init(), SMP booting is too
+@@ -1306,7 +1306,7 @@ static void remove_siblinginfo(int cpu)
+ cpu_clear(cpu, cpu_sibling_setup_map);
+ }
+
+-int additional_cpus __initdata = -1;
++static int additional_cpus __initdata = -1;
+
+ static __init int setup_additional_cpus(char *s)
+ {
+diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
+index a86d26f..d2ab52c 100644
+--- a/arch/x86/kernel/sys_i386_32.c
++++ b/arch/x86/kernel/sys_i386_32.c
+@@ -22,23 +22,6 @@
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way Unix traditionally does this, though.
+- */
+-asmlinkage int sys_pipe(unsigned long __user * fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long pgoff)
+diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
+index bd802a5..3b360ef 100644
+--- a/arch/x86/kernel/sys_x86_64.c
++++ b/arch/x86/kernel/sys_x86_64.c
+@@ -17,23 +17,6 @@
+ #include <asm/uaccess.h>
+ #include <asm/ia32.h>
+
+-/*
+- * sys_pipe() is the normal C calling standard for creating
+- * a pipe. It's not the way Unix traditionally does this, though.
+- */
+-asmlinkage long sys_pipe(int __user *fildes)
+-{
+- int fd[2];
+- int error;
+-
+- error = do_pipe(fd);
+- if (!error) {
+- if (copy_to_user(fildes, fd, 2*sizeof(int)))
+- error = -EFAULT;
+- }
+- return error;
+-}
+-
+ asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long off)
+ {
+diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
+index 4c943ea..3324d90 100644
+--- a/arch/x86/kvm/i8254.c
++++ b/arch/x86/kvm/i8254.c
+@@ -288,6 +288,8 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+ * mode 1 is one shot, mode 2 is period, otherwise del timer */
+ switch (ps->channels[0].mode) {
+ case 1:
++ /* FIXME: enhance mode 4 precision */
++ case 4:
+ create_pit_timer(&ps->pit_timer, val, 0);
+ break;
+ case 2:
+diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
+index 2ad6f54..36c5406 100644
+--- a/arch/x86/kvm/mmu.c
++++ b/arch/x86/kvm/mmu.c
+@@ -79,36 +79,6 @@ static int dbg = 1;
+ }
+ #endif
+
+-#define PT64_PT_BITS 9
+-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+-#define PT32_PT_BITS 10
+-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+-
+-#define PT_WRITABLE_SHIFT 1
+-
+-#define PT_PRESENT_MASK (1ULL << 0)
+-#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+-#define PT_USER_MASK (1ULL << 2)
+-#define PT_PWT_MASK (1ULL << 3)
+-#define PT_PCD_MASK (1ULL << 4)
+-#define PT_ACCESSED_MASK (1ULL << 5)
+-#define PT_DIRTY_MASK (1ULL << 6)
+-#define PT_PAGE_SIZE_MASK (1ULL << 7)
+-#define PT_PAT_MASK (1ULL << 7)
+-#define PT_GLOBAL_MASK (1ULL << 8)
+-#define PT64_NX_SHIFT 63
+-#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+-
+-#define PT_PAT_SHIFT 7
+-#define PT_DIR_PAT_SHIFT 12
+-#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+-
+-#define PT32_DIR_PSE36_SIZE 4
+-#define PT32_DIR_PSE36_SHIFT 13
+-#define PT32_DIR_PSE36_MASK \
+- (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+-
+-
+ #define PT_FIRST_AVAIL_BITS_SHIFT 9
+ #define PT64_SECOND_AVAIL_BITS_SHIFT 52
+
+@@ -154,10 +124,6 @@ static int dbg = 1;
+ #define PFERR_USER_MASK (1U << 2)
+ #define PFERR_FETCH_MASK (1U << 4)
+
+-#define PT64_ROOT_LEVEL 4
+-#define PT32_ROOT_LEVEL 2
+-#define PT32E_ROOT_LEVEL 3
+-
+ #define PT_DIRECTORY_LEVEL 2
+ #define PT_PAGE_TABLE_LEVEL 1
+
+@@ -186,6 +152,12 @@ static struct kmem_cache *mmu_page_header_cache;
+
+ static u64 __read_mostly shadow_trap_nonpresent_pte;
+ static u64 __read_mostly shadow_notrap_nonpresent_pte;
++static u64 __read_mostly shadow_base_present_pte;
++static u64 __read_mostly shadow_nx_mask;
++static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
++static u64 __read_mostly shadow_user_mask;
++static u64 __read_mostly shadow_accessed_mask;
++static u64 __read_mostly shadow_dirty_mask;
+
+ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+ {
+@@ -194,6 +166,23 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+ }
+ EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+
++void kvm_mmu_set_base_ptes(u64 base_pte)
++{
++ shadow_base_present_pte = base_pte;
++}
++EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
++
++void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
++ u64 dirty_mask, u64 nx_mask, u64 x_mask)
++{
++ shadow_user_mask = user_mask;
++ shadow_accessed_mask = accessed_mask;
++ shadow_dirty_mask = dirty_mask;
++ shadow_nx_mask = nx_mask;
++ shadow_x_mask = x_mask;
++}
++EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
++
+ static int is_write_protection(struct kvm_vcpu *vcpu)
+ {
+ return vcpu->arch.cr0 & X86_CR0_WP;
+@@ -232,7 +221,7 @@ static int is_writeble_pte(unsigned long pte)
+
+ static int is_dirty_pte(unsigned long pte)
+ {
+- return pte & PT_DIRTY_MASK;
++ return pte & shadow_dirty_mask;
+ }
+
+ static int is_rmap_pte(u64 pte)
+@@ -387,7 +376,6 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+
+ write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ *write_count += 1;
+- WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
+ }
+
+ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+@@ -547,7 +535,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
+ return;
+ sp = page_header(__pa(spte));
+ pfn = spte_to_pfn(*spte);
+- if (*spte & PT_ACCESSED_MASK)
++ if (*spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (is_writeble_pte(*spte))
+ kvm_release_pfn_dirty(pfn);
+@@ -1073,17 +1061,17 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+ * whether the guest actually used the pte (in order to detect
+ * demand paging).
+ */
+- spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
++ spte = shadow_base_present_pte | shadow_dirty_mask;
+ if (!speculative)
+ pte_access |= PT_ACCESSED_MASK;
+ if (!dirty)
+ pte_access &= ~ACC_WRITE_MASK;
+- if (!(pte_access & ACC_EXEC_MASK))
+- spte |= PT64_NX_MASK;
+-
+- spte |= PT_PRESENT_MASK;
++ if (pte_access & ACC_EXEC_MASK)
++ spte |= shadow_x_mask;
++ else
++ spte |= shadow_nx_mask;
+ if (pte_access & ACC_USER_MASK)
+- spte |= PT_USER_MASK;
++ spte |= shadow_user_mask;
+ if (largepage)
+ spte |= PT_PAGE_SIZE_MASK;
+
+@@ -1188,8 +1176,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+ return -ENOMEM;
+ }
+
+- table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
+- | PT_WRITABLE_MASK | PT_USER_MASK;
++ table[index] = __pa(new_table->spt)
++ | PT_PRESENT_MASK | PT_WRITABLE_MASK
++ | shadow_user_mask | shadow_x_mask;
+ }
+ table_addr = table[index] & PT64_BASE_ADDR_MASK;
+ }
+@@ -1244,7 +1233,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ spin_lock(&vcpu->kvm->mmu_lock);
+-#ifdef CONFIG_X86_64
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+@@ -1256,7 +1244,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
+-#endif
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+@@ -1282,7 +1269,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+
+ root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+
+-#ifdef CONFIG_X86_64
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+@@ -1297,7 +1283,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+ vcpu->arch.mmu.root_hpa = root;
+ return;
+ }
+-#endif
+ metaphysical = !is_paging(vcpu);
+ if (tdp_enabled)
+ metaphysical = 1;
+@@ -1377,7 +1362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
+- largepage, gfn, pfn, TDP_ROOT_LEVEL);
++ largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ return r;
+@@ -1484,7 +1469,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+ context->page_fault = tdp_page_fault;
+ context->free = nonpaging_free;
+ context->prefetch_page = nonpaging_prefetch_page;
+- context->shadow_root_level = TDP_ROOT_LEVEL;
++ context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+ context->root_hpa = INVALID_PAGE;
+
+ if (!is_paging(vcpu)) {
+@@ -1633,7 +1618,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+ {
+ u64 *spte = vcpu->arch.last_pte_updated;
+
+- return !!(spte && (*spte & PT_ACCESSED_MASK));
++ return !!(spte && (*spte & shadow_accessed_mask));
+ }
+
+ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
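The new setters let each paging backend describe its own PTE bit layout
instead of mmu.c hard-coding the x86 bits. For classic shadow paging
the registration presumably just mirrors the old constants; a sketch,
not the literal call site of this patch:

  /* shadow/NPT paging: reuse the architectural x86 PTE bits */
  kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
  kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
  			PT_DIRTY_MASK, PT64_NX_MASK, 0ull);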
+diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
+index e64e9f5..1730757 100644
+--- a/arch/x86/kvm/mmu.h
++++ b/arch/x86/kvm/mmu.h
+@@ -3,11 +3,38 @@
+
+ #include <linux/kvm_host.h>
+
+-#ifdef CONFIG_X86_64
+-#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL
+-#else
+-#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL
+-#endif
++#define PT64_PT_BITS 9
++#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
++#define PT32_PT_BITS 10
++#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
++
++#define PT_WRITABLE_SHIFT 1
++
++#define PT_PRESENT_MASK (1ULL << 0)
++#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
++#define PT_USER_MASK (1ULL << 2)
++#define PT_PWT_MASK (1ULL << 3)
++#define PT_PCD_MASK (1ULL << 4)
++#define PT_ACCESSED_MASK (1ULL << 5)
++#define PT_DIRTY_MASK (1ULL << 6)
++#define PT_PAGE_SIZE_MASK (1ULL << 7)
++#define PT_PAT_MASK (1ULL << 7)
++#define PT_GLOBAL_MASK (1ULL << 8)
++#define PT64_NX_SHIFT 63
++#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
++
++#define PT_PAT_SHIFT 7
++#define PT_DIR_PAT_SHIFT 12
++#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
++
++#define PT32_DIR_PSE36_SIZE 4
++#define PT32_DIR_PSE36_SHIFT 13
++#define PT32_DIR_PSE36_MASK \
++ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
++
++#define PT64_ROOT_LEVEL 4
++#define PT32_ROOT_LEVEL 2
++#define PT32E_ROOT_LEVEL 3
+
+ static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+ {
+diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
+index 89e0be2..ab22615 100644
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -1863,6 +1863,15 @@ static bool svm_cpu_has_accelerated_tpr(void)
+ return false;
+ }
+
++static int get_npt_level(void)
++{
++#ifdef CONFIG_X86_64
++ return PT64_ROOT_LEVEL;
++#else
++ return PT32E_ROOT_LEVEL;
++#endif
++}
++
+ static struct kvm_x86_ops svm_x86_ops = {
+ .cpu_has_kvm_support = has_svm,
+ .disabled_by_bios = is_disabled,
+@@ -1920,6 +1929,7 @@ static struct kvm_x86_ops svm_x86_ops = {
+ .inject_pending_vectors = do_interrupt_requests,
+
+ .set_tss_addr = svm_set_tss_addr,
++ .get_tdp_level = get_npt_level,
+ };
+
+ static int __init svm_init(void)
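Together with the mmu.h hunk, this turns the old compile-time TDP_ROOT_LEVEL macro into a runtime query: each vendor module reports its paging depth through the new get_tdp_level callback in kvm_x86_ops. A hedged sketch of the indirection in plain C (struct and function names here are illustrative, not the kernel API):

#include <stdio.h>

#define PT64_ROOT_LEVEL		4
#define PT32E_ROOT_LEVEL	3

/* Illustrative stand-in for the get_tdp_level member added to
 * kvm_x86_ops: the generic MMU asks the vendor module at runtime. */
struct vendor_ops {
	int (*get_tdp_level)(void);
};

static int svm_get_npt_level(void) { return PT64_ROOT_LEVEL; }	/* 64-bit NPT */
static int vmx_get_ept_level(void) { return 3 + 1; }	/* VMX_EPT_DEFAULT_GAW + 1 */

int main(void)
{
	struct vendor_ops svm = { .get_tdp_level = svm_get_npt_level };
	struct vendor_ops vmx = { .get_tdp_level = vmx_get_ept_level };

	/* init_kvm_tdp_mmu() now sets context->shadow_root_level from
	 * kvm_x86_ops->get_tdp_level() instead of TDP_ROOT_LEVEL. */
	printf("svm tdp level: %d\n", svm.get_tdp_level());
	printf("vmx tdp level: %d\n", vmx.get_tdp_level());
	return 0;
}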
+diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
+index 8e5d664..bfe4db1 100644
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -42,6 +42,9 @@ module_param(enable_vpid, bool, 0);
+ static int flexpriority_enabled = 1;
+ module_param(flexpriority_enabled, bool, 0);
+
++static int enable_ept = 1;
++module_param(enable_ept, bool, 0);
++
+ struct vmcs {
+ u32 revision_id;
+ u32 abort;
+@@ -84,7 +87,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+ return container_of(vcpu, struct vcpu_vmx, vcpu);
+ }
+
+-static int init_rmode_tss(struct kvm *kvm);
++static int init_rmode(struct kvm *kvm);
+
+ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+@@ -107,6 +110,11 @@ static struct vmcs_config {
+ u32 vmentry_ctrl;
+ } vmcs_config;
+
++struct vmx_capability {
++ u32 ept;
++ u32 vpid;
++} vmx_capability;
++
+ #define VMX_SEGMENT_FIELD(seg) \
+ [VCPU_SREG_##seg] = { \
+ .selector = GUEST_##seg##_SELECTOR, \
+@@ -214,6 +222,32 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ }
+
++static inline int cpu_has_vmx_invept_individual_addr(void)
++{
++ return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT));
++}
++
++static inline int cpu_has_vmx_invept_context(void)
++{
++ return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT));
++}
++
++static inline int cpu_has_vmx_invept_global(void)
++{
++ return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT));
++}
++
++static inline int cpu_has_vmx_ept(void)
++{
++ return (vmcs_config.cpu_based_2nd_exec_ctrl &
++ SECONDARY_EXEC_ENABLE_EPT);
++}
++
++static inline int vm_need_ept(void)
++{
++ return (cpu_has_vmx_ept() && enable_ept);
++}
++
+ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+ {
+ return ((cpu_has_vmx_virtualize_apic_accesses()) &&
+@@ -250,6 +284,18 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+ : : "a"(&operand), "c"(ext) : "cc", "memory");
+ }
+
++static inline void __invept(int ext, u64 eptp, gpa_t gpa)
++{
++ struct {
++ u64 eptp, gpa;
++ } operand = {eptp, gpa};
++
++ asm volatile (ASM_VMX_INVEPT
++ /* CF==1 or ZF==1 --> rc = -1 */
++ "; ja 1f ; ud2 ; 1:\n"
++ : : "a" (&operand), "c" (ext) : "cc", "memory");
++}
++
+ static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+ {
+ int i;
+@@ -301,6 +347,33 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+ }
+
++static inline void ept_sync_global(void)
++{
++ if (cpu_has_vmx_invept_global())
++ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
++}
++
++static inline void ept_sync_context(u64 eptp)
++{
++ if (vm_need_ept()) {
++ if (cpu_has_vmx_invept_context())
++ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
++ else
++ ept_sync_global();
++ }
++}
++
++static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
++{
++ if (vm_need_ept()) {
++ if (cpu_has_vmx_invept_individual_addr())
++ __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
++ eptp, gpa);
++ else
++ ept_sync_context(eptp);
++ }
++}
++
+ static unsigned long vmcs_readl(unsigned long field)
+ {
+ unsigned long value;
+@@ -388,6 +461,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+ eb |= 1u << 1;
+ if (vcpu->arch.rmode.active)
+ eb = ~0;
++ if (vm_need_ept())
++ eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+ vmcs_write32(EXCEPTION_BITMAP, eb);
+ }
+
+@@ -985,7 +1060,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
+ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
+ {
+ u32 vmx_msr_low, vmx_msr_high;
+- u32 min, opt;
++ u32 min, opt, min2, opt2;
+ u32 _pin_based_exec_control = 0;
+ u32 _cpu_based_exec_control = 0;
+ u32 _cpu_based_2nd_exec_control = 0;
+@@ -1003,6 +1078,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
+ CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING |
+ #endif
++ CPU_BASED_CR3_LOAD_EXITING |
++ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_USE_IO_BITMAPS |
+ CPU_BASED_MOV_DR_EXITING |
+ CPU_BASED_USE_TSC_OFFSETING;
+@@ -1018,11 +1095,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
+ ~CPU_BASED_CR8_STORE_EXITING;
+ #endif
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+- min = 0;
+- opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
++ min2 = 0;
++ opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_WBINVD_EXITING |
+- SECONDARY_EXEC_ENABLE_VPID;
+- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
++ SECONDARY_EXEC_ENABLE_VPID |
++ SECONDARY_EXEC_ENABLE_EPT;
++ if (adjust_vmx_controls(min2, opt2,
++ MSR_IA32_VMX_PROCBASED_CTLS2,
+ &_cpu_based_2nd_exec_control) < 0)
+ return -EIO;
+ }
+@@ -1031,6 +1110,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+ #endif
++ if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
++ /* CR3 accesses don't need to cause VM Exits when EPT is enabled */
++ min &= ~(CPU_BASED_CR3_LOAD_EXITING |
++ CPU_BASED_CR3_STORE_EXITING);
++ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
++ &_cpu_based_exec_control) < 0)
++ return -EIO;
++ rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
++ vmx_capability.ept, vmx_capability.vpid);
++ }
+
+ min = 0;
+ #ifdef CONFIG_X86_64
+@@ -1256,7 +1345,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
+ fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+
+ kvm_mmu_reset_context(vcpu);
+- init_rmode_tss(vcpu->kvm);
++ init_rmode(vcpu->kvm);
+ }
+
+ #ifdef CONFIG_X86_64
+@@ -1304,8 +1393,64 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
+ }
+
++static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
++{
++ if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
++ if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
++ printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
++ return;
++ }
++ vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
++ vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
++ vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
++ vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
++ }
++}
++
++static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
++
++static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
++ unsigned long cr0,
++ struct kvm_vcpu *vcpu)
++{
++ if (!(cr0 & X86_CR0_PG)) {
++ /* From paging/starting to nonpaging */
++ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
++ vmcs_config.cpu_based_exec_ctrl |
++ (CPU_BASED_CR3_LOAD_EXITING |
++ CPU_BASED_CR3_STORE_EXITING));
++ vcpu->arch.cr0 = cr0;
++ vmx_set_cr4(vcpu, vcpu->arch.cr4);
++ *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
++ *hw_cr0 &= ~X86_CR0_WP;
++ } else if (!is_paging(vcpu)) {
++ /* From nonpaging to paging */
++ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
++ vmcs_config.cpu_based_exec_ctrl &
++ ~(CPU_BASED_CR3_LOAD_EXITING |
++ CPU_BASED_CR3_STORE_EXITING));
++ vcpu->arch.cr0 = cr0;
++ vmx_set_cr4(vcpu, vcpu->arch.cr4);
++ if (!(vcpu->arch.cr0 & X86_CR0_WP))
++ *hw_cr0 &= ~X86_CR0_WP;
++ }
++}
++
++static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
++ struct kvm_vcpu *vcpu)
++{
++ if (!is_paging(vcpu)) {
++ *hw_cr4 &= ~X86_CR4_PAE;
++ *hw_cr4 |= X86_CR4_PSE;
++ } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
++ *hw_cr4 &= ~X86_CR4_PAE;
++}
++
+ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+ {
++ unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
++ KVM_VM_CR0_ALWAYS_ON;
++
+ vmx_fpu_deactivate(vcpu);
+
+ if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
+@@ -1323,29 +1468,61 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+ }
+ #endif
+
++ if (vm_need_ept())
++ ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
++
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+- vmcs_writel(GUEST_CR0,
+- (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
++ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+
+ if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
+ vmx_fpu_activate(vcpu);
+ }
+
++static u64 construct_eptp(unsigned long root_hpa)
++{
++ u64 eptp;
++
++ /* TODO: write the value read from the MSR */
++ eptp = VMX_EPT_DEFAULT_MT |
++ VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
++ eptp |= (root_hpa & PAGE_MASK);
++
++ return eptp;
++}
++
+ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+ {
++ unsigned long guest_cr3;
++ u64 eptp;
++
++ guest_cr3 = cr3;
++ if (vm_need_ept()) {
++ eptp = construct_eptp(cr3);
++ vmcs_write64(EPT_POINTER, eptp);
++ ept_sync_context(eptp);
++ ept_load_pdptrs(vcpu);
++ guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
++ VMX_EPT_IDENTITY_PAGETABLE_ADDR;
++ }
++
+ vmx_flush_tlb(vcpu);
+- vmcs_writel(GUEST_CR3, cr3);
++ vmcs_writel(GUEST_CR3, guest_cr3);
+ if (vcpu->arch.cr0 & X86_CR0_PE)
+ vmx_fpu_deactivate(vcpu);
+ }
+
+ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+ {
+- vmcs_writel(CR4_READ_SHADOW, cr4);
+- vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
+- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
++ unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
++ KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
++
+ vcpu->arch.cr4 = cr4;
++ if (vm_need_ept())
++ ept_update_paging_mode_cr4(&hw_cr4, vcpu);
++
++ vmcs_writel(CR4_READ_SHADOW, cr4);
++ vmcs_writel(GUEST_CR4, hw_cr4);
+ }
+
+ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+@@ -1530,6 +1707,41 @@ out:
+ return ret;
+ }
+
++static int init_rmode_identity_map(struct kvm *kvm)
++{
++ int i, r, ret;
++ pfn_t identity_map_pfn;
++ u32 tmp;
++
++ if (!vm_need_ept())
++ return 1;
++ if (unlikely(!kvm->arch.ept_identity_pagetable)) {
++ printk(KERN_ERR "EPT: identity-mapping pagetable "
++ "haven't been allocated!\n");
++ return 0;
++ }
++ if (likely(kvm->arch.ept_identity_pagetable_done))
++ return 1;
++ ret = 0;
++ identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
++ r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
++ if (r < 0)
++ goto out;
++ /* Set up identity-mapping pagetable for EPT in real mode */
++ for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
++ tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
++ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
++ r = kvm_write_guest_page(kvm, identity_map_pfn,
++ &tmp, i * sizeof(tmp), sizeof(tmp));
++ if (r < 0)
++ goto out;
++ }
++ kvm->arch.ept_identity_pagetable_done = true;
++ ret = 1;
++out:
++ return ret;
++}
++
+ static void seg_setup(int seg)
+ {
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+@@ -1564,6 +1776,31 @@ out:
+ return r;
+ }
+
++static int alloc_identity_pagetable(struct kvm *kvm)
++{
++ struct kvm_userspace_memory_region kvm_userspace_mem;
++ int r = 0;
++
++ down_write(&kvm->slots_lock);
++ if (kvm->arch.ept_identity_pagetable)
++ goto out;
++ kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
++ kvm_userspace_mem.flags = 0;
++ kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
++ kvm_userspace_mem.memory_size = PAGE_SIZE;
++ r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
++ if (r)
++ goto out;
++
++ down_read(&current->mm->mmap_sem);
++ kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
++ VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
++ up_read(&current->mm->mmap_sem);
++out:
++ up_write(&kvm->slots_lock);
++ return r;
++}
++
+ static void allocate_vpid(struct vcpu_vmx *vmx)
+ {
+ int vpid;
+@@ -1638,6 +1875,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+ CPU_BASED_CR8_LOAD_EXITING;
+ #endif
+ }
++ if (!vm_need_ept())
++ exec_control |= CPU_BASED_CR3_STORE_EXITING |
++ CPU_BASED_CR3_LOAD_EXITING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+ if (cpu_has_secondary_exec_ctrls()) {
+@@ -1647,6 +1887,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (vmx->vpid == 0)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
++ if (!vm_need_ept())
++ exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ }
+
+@@ -1722,6 +1964,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+ return 0;
+ }
+
++static int init_rmode(struct kvm *kvm)
++{
++ if (!init_rmode_tss(kvm))
++ return 0;
++ if (!init_rmode_identity_map(kvm))
++ return 0;
++ return 1;
++}
++
+ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+@@ -1729,7 +1980,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+ int ret;
+
+ down_read(&vcpu->kvm->slots_lock);
+- if (!init_rmode_tss(vmx->vcpu.kvm)) {
++ if (!init_rmode(vmx->vcpu.kvm)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+@@ -1994,6 +2245,9 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+ if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ if (is_page_fault(intr_info)) {
++ /* EPT won't cause page fault directly */
++ if (vm_need_ept())
++ BUG();
+ cr2 = vmcs_readl(EXIT_QUALIFICATION);
+ KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
+ (u32)((u64)cr2 >> 32), handler);
+@@ -2323,6 +2577,64 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+ return kvm_task_switch(vcpu, tss_selector, reason);
+ }
+
++static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
++{
++ u64 exit_qualification;
++ enum emulation_result er;
++ gpa_t gpa;
++ unsigned long hva;
++ int gla_validity;
++ int r;
++
++ exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
++
++ if (exit_qualification & (1 << 6)) {
++ printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
++ return -ENOTSUPP;
++ }
++
++ gla_validity = (exit_qualification >> 7) & 0x3;
++ if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
++ printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
++ printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
++ (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
++ (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
++ printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
++ (long unsigned int)exit_qualification);
++ kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
++ kvm_run->hw.hardware_exit_reason = 0;
++ return -ENOTSUPP;
++ }
++
++ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
++ hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
++ if (!kvm_is_error_hva(hva)) {
++ r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
++ if (r < 0) {
++ printk(KERN_ERR "EPT: Not enough memory!\n");
++ return -ENOMEM;
++ }
++ return 1;
++ } else {
++ /* must be MMIO */
++ er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
++
++ if (er == EMULATE_FAIL) {
++ printk(KERN_ERR
++ "EPT: Fail to handle EPT violation vmexit!er is %d\n",
++ er);
++ printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
++ (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
++ (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
++ printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
++ (long unsigned int)exit_qualification);
++ return -ENOTSUPP;
++ } else if (er == EMULATE_DO_MMIO)
++ return 0;
++ }
++ return 1;
++}
++
+ /*
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume. Otherwise they set the kvm_run parameter to indicate what needs
+@@ -2346,6 +2658,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
+ [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_WBINVD] = handle_wbinvd,
+ [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
++ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ };
+
+ static const int kvm_vmx_max_exit_handlers =
+@@ -2364,6 +2677,13 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+ KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
+ (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
+
++ /* CR3 accesses don't cause a VMExit in paging mode, so we need
++ * to sync with the guest's real CR3. */
++ if (vm_need_ept() && is_paging(vcpu)) {
++ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
++ ept_load_pdptrs(vcpu);
++ }
++
+ if (unlikely(vmx->fail)) {
+ kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ kvm_run->fail_entry.hardware_entry_failure_reason
+@@ -2372,7 +2692,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+ }
+
+ if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+- exit_reason != EXIT_REASON_EXCEPTION_NMI)
++ (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
++ exit_reason != EXIT_REASON_EPT_VIOLATION))
+ printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
+ "exit reason is 0x%x\n", __func__, exit_reason);
+ if (exit_reason < kvm_vmx_max_exit_handlers
+@@ -2674,6 +2995,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+ return ERR_PTR(-ENOMEM);
+
+ allocate_vpid(vmx);
++ if (id == 0 && vm_need_ept()) {
++ kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
++ VMX_EPT_WRITABLE_MASK |
++ VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
++ kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
++ VMX_EPT_FAKE_DIRTY_MASK, 0ull,
++ VMX_EPT_EXECUTABLE_MASK);
++ kvm_enable_tdp();
++ }
+
+ err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
+ if (err)
+@@ -2706,6 +3036,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+ if (alloc_apic_access_page(kvm) != 0)
+ goto free_vmcs;
+
++ if (vm_need_ept())
++ if (alloc_identity_pagetable(kvm) != 0)
++ goto free_vmcs;
++
+ return &vmx->vcpu;
+
+ free_vmcs:
+@@ -2735,6 +3069,11 @@ static void __init vmx_check_processor_compat(void *rtn)
+ }
+ }
+
++static int get_ept_level(void)
++{
++ return VMX_EPT_DEFAULT_GAW + 1;
++}
++
+ static struct kvm_x86_ops vmx_x86_ops = {
+ .cpu_has_kvm_support = cpu_has_kvm_support,
+ .disabled_by_bios = vmx_disabled_by_bios,
+@@ -2791,6 +3130,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
+ .inject_pending_vectors = do_interrupt_requests,
+
+ .set_tss_addr = vmx_set_tss_addr,
++ .get_tdp_level = get_ept_level,
+ };
+
+ static int __init vmx_init(void)
+@@ -2843,9 +3183,14 @@ static int __init vmx_init(void)
+ vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
+ vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
+
++ if (cpu_has_vmx_ept())
++ bypass_guest_pf = 0;
++
+ if (bypass_guest_pf)
+ kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
+
++ ept_sync_global();
++
+ return 0;
+
+ out2:
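The init_rmode_identity_map() hunk above fills a single guest page with 1024 32-bit PSE directory entries so that a real-mode guest running under EPT sees an identity mapping of the low 4 GB. A standalone sketch that computes the same entries as the loop in that hunk:

#include <stdio.h>
#include <stdint.h>

/* Flag bits as used in the hunk (the kernel's _PAGE_* macros). */
#define P_PRESENT	(1u << 0)
#define P_RW		(1u << 1)
#define P_USER		(1u << 2)
#define P_ACCESSED	(1u << 5)
#define P_DIRTY		(1u << 6)
#define P_PSE		(1u << 7)	/* 4 MB page */

int main(void)
{
	uint32_t pde[1024];	/* one page of 32-bit directory entries */
	int i;

	/* Entry i maps virtual i << 22 to physical i << 22. */
	for (i = 0; i < 1024; i++)
		pde[i] = ((uint32_t)i << 22) | (P_PRESENT | P_RW | P_USER |
						P_ACCESSED | P_DIRTY | P_PSE);

	printf("pde[0]    = %#x\n", pde[0]);	/* 0xe7: first 4 MB */
	printf("pde[1]    = %#x\n", pde[1]);	/* 0x4000e7: 4..8 MB */
	printf("pde[1023] = %#x\n", pde[1023]);	/* top 4 MB below 4 GB */
	return 0;
}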
+diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
+index 5dff460..79d94c6 100644
+--- a/arch/x86/kvm/vmx.h
++++ b/arch/x86/kvm/vmx.h
+@@ -35,6 +35,8 @@
+ #define CPU_BASED_MWAIT_EXITING 0x00000400
+ #define CPU_BASED_RDPMC_EXITING 0x00000800
+ #define CPU_BASED_RDTSC_EXITING 0x00001000
++#define CPU_BASED_CR3_LOAD_EXITING 0x00008000
++#define CPU_BASED_CR3_STORE_EXITING 0x00010000
+ #define CPU_BASED_CR8_LOAD_EXITING 0x00080000
+ #define CPU_BASED_CR8_STORE_EXITING 0x00100000
+ #define CPU_BASED_TPR_SHADOW 0x00200000
+@@ -49,6 +51,7 @@
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
++#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
+ #define SECONDARY_EXEC_ENABLE_VPID 0x00000020
+ #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
+
+@@ -100,10 +103,22 @@ enum vmcs_field {
+ VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
+ APIC_ACCESS_ADDR = 0x00002014,
+ APIC_ACCESS_ADDR_HIGH = 0x00002015,
++ EPT_POINTER = 0x0000201a,
++ EPT_POINTER_HIGH = 0x0000201b,
++ GUEST_PHYSICAL_ADDRESS = 0x00002400,
++ GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
+ VMCS_LINK_POINTER = 0x00002800,
+ VMCS_LINK_POINTER_HIGH = 0x00002801,
+ GUEST_IA32_DEBUGCTL = 0x00002802,
+ GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
++ GUEST_PDPTR0 = 0x0000280a,
++ GUEST_PDPTR0_HIGH = 0x0000280b,
++ GUEST_PDPTR1 = 0x0000280c,
++ GUEST_PDPTR1_HIGH = 0x0000280d,
++ GUEST_PDPTR2 = 0x0000280e,
++ GUEST_PDPTR2_HIGH = 0x0000280f,
++ GUEST_PDPTR3 = 0x00002810,
++ GUEST_PDPTR3_HIGH = 0x00002811,
+ PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
+ CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
+ EXCEPTION_BITMAP = 0x00004004,
+@@ -226,6 +241,8 @@ enum vmcs_field {
+ #define EXIT_REASON_MWAIT_INSTRUCTION 36
+ #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+ #define EXIT_REASON_APIC_ACCESS 44
++#define EXIT_REASON_EPT_VIOLATION 48
++#define EXIT_REASON_EPT_MISCONFIG 49
+ #define EXIT_REASON_WBINVD 54
+
+ /*
+@@ -316,15 +333,36 @@ enum vmcs_field {
+ #define MSR_IA32_VMX_CR4_FIXED1 0x489
+ #define MSR_IA32_VMX_VMCS_ENUM 0x48a
+ #define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
++#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
+
+ #define MSR_IA32_FEATURE_CONTROL 0x3a
+ #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
+ #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
+
+ #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
++#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
+
+ #define VMX_NR_VPIDS (1 << 16)
+ #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
+ #define VMX_VPID_EXTENT_ALL_CONTEXT 2
+
++#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
++#define VMX_EPT_EXTENT_CONTEXT 1
++#define VMX_EPT_EXTENT_GLOBAL 2
++#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
++#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
++#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
++#define VMX_EPT_DEFAULT_GAW 3
++#define VMX_EPT_MAX_GAW 0x4
++#define VMX_EPT_MT_EPTE_SHIFT 3
++#define VMX_EPT_GAW_EPTP_SHIFT 3
++#define VMX_EPT_DEFAULT_MT 0x6ull
++#define VMX_EPT_READABLE_MASK 0x1ull
++#define VMX_EPT_WRITABLE_MASK 0x2ull
++#define VMX_EPT_EXECUTABLE_MASK 0x4ull
++#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62)
++#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63)
++
++#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
++
+ #endif
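construct_eptp() in the vmx.c hunk packs three things into the 64-bit EPT pointer using the constants defined just above: the memory type in bits 2:0, the guest-address-width field in bits 5:3, and the page-aligned root table address in the upper bits. A worked example (PAGE_MASK simplified here to plain 4 KB pages):

#include <stdio.h>
#include <stdint.h>

#define VMX_EPT_DEFAULT_MT	0x6ull	/* write-back memory type */
#define VMX_EPT_DEFAULT_GAW	3	/* 4-level walk (GAW + 1 levels) */
#define VMX_EPT_GAW_EPTP_SHIFT	3
#define PAGE_MASK		(~0xfffull)	/* simplified: 4 KB pages */

static uint64_t construct_eptp(uint64_t root_hpa)
{
	uint64_t eptp = VMX_EPT_DEFAULT_MT |
		((uint64_t)VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT);
	return eptp | (root_hpa & PAGE_MASK);
}

int main(void)
{
	/* Bits 2:0 = memory type, bits 5:3 = GAW, upper bits = table PA:
	 * 0x12345000 | 0x18 | 0x6 = 0x1234501e. */
	printf("eptp = %#llx\n",
	       (unsigned long long)construct_eptp(0x12345000ull));
	return 0;
}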
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 0ce5563..21338bd 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -2417,6 +2417,9 @@ int kvm_arch_init(void *opaque)
+
+ kvm_x86_ops = ops;
+ kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
++ kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
++ kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
++ PT_DIRTY_MASK, PT64_NX_MASK, 0);
+ return 0;
+
+ out:
+@@ -3019,6 +3022,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+
+ kvm_x86_ops->decache_regs(vcpu);
+
++ vcpu->arch.exception.pending = false;
++
+ vcpu_put(vcpu);
+
+ return 0;
+@@ -3481,7 +3486,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+- cseg_desc.type &= ~(1 << 8); //clear the B flag
++ cseg_desc.type &= ~(1 << 1); //clear the B flag
+ save_guest_segment_descriptor(vcpu, tr_seg.selector,
+ &cseg_desc);
+ }
+@@ -3507,7 +3512,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+ }
+
+ if (reason != TASK_SWITCH_IRET) {
+- nseg_desc.type |= (1 << 8);
++ nseg_desc.type |= (1 << 1);
+ save_guest_segment_descriptor(vcpu, tss_selector,
+ &nseg_desc);
+ }
+@@ -3698,10 +3703,19 @@ void fx_init(struct kvm_vcpu *vcpu)
+ {
+ unsigned after_mxcsr_mask;
+
++ /*
++ * Touch the FPU for the first time in a non-atomic context:
++ * if this is the first FPU instruction, the exception handler
++ * will fire before the instruction returns and will have to
++ * allocate RAM with GFP_KERNEL.
++ */
++ if (!used_math())
++ fx_save(&vcpu->arch.host_fx_image);
++
+ /* Initialize guest FPU by resetting ours and saving into guest's */
+ preempt_disable();
+ fx_save(&vcpu->arch.host_fx_image);
+- fpu_init();
++ fx_finit();
+ fx_save(&vcpu->arch.guest_fx_image);
+ fx_restore(&vcpu->arch.host_fx_image);
+ preempt_enable();
+@@ -3906,6 +3920,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
+ kvm_free_physmem(kvm);
+ if (kvm->arch.apic_access_page)
+ put_page(kvm->arch.apic_access_page);
++ if (kvm->arch.ept_identity_pagetable)
++ put_page(kvm->arch.ept_identity_pagetable);
+ kfree(kvm);
+ }
+
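The kvm_task_switch() fix above corrects the position of the TSS descriptor busy ("B") bit: within the descriptor's 4-bit type field, busy is bit 1 (type 9 = available 32-bit TSS, type 11 = busy), so the old (1 << 8) toggled nothing meaningful. A small demonstration:

#include <stdio.h>

int main(void)
{
	unsigned int type = 0x9;	/* available 32-bit TSS */

	type |= (1 << 1);		/* set busy: 0x9 -> 0xb (busy TSS) */
	printf("busy:      %#x\n", type);

	type &= ~(1 << 1);		/* IRET/JMP clears busy: 0xb -> 0x9 */
	printf("available: %#x\n", type);

	/* The old code toggled (1 << 8), which lies outside the 4-bit
	 * type field and therefore never marked the TSS busy at all. */
	return 0;
}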
+diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
+index 2ca0838..f2a696d 100644
+--- a/arch/x86/kvm/x86_emulate.c
++++ b/arch/x86/kvm/x86_emulate.c
+@@ -1761,6 +1761,7 @@ twobyte_insn:
+ case 6: /* lmsw */
+ realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
+ &ctxt->eflags);
++ c->dst.type = OP_NONE;
+ break;
+ case 7: /* invlpg*/
+ emulate_invlpg(ctxt->vcpu, memop);
+diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
+index 1837885..914ccf9 100644
+--- a/arch/x86/mm/discontig_32.c
++++ b/arch/x86/mm/discontig_32.c
+@@ -476,29 +476,3 @@ int memory_add_physaddr_to_nid(u64 addr)
+
+ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+ #endif
+-
+-#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT
+-/*
+- * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA
+- *
+- * These stub functions are needed to compile 32-bit NUMA when SRAT is
+- * not set. There are functions in srat_64.c for parsing this table
+- * and it may be possible to make them common functions.
+- */
+-void acpi_numa_slit_init (struct acpi_table_slit *slit)
+-{
+- printk(KERN_INFO "ACPI: No support for parsing SLIT table\n");
+-}
+-
+-void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa)
+-{
+-}
+-
+-void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma)
+-{
+-}
+-
+-void acpi_numa_arch_fixup(void)
+-{
+-}
+-#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */
+diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
+index 9ee007b..369cf06 100644
+--- a/arch/x86/mm/pgtable_32.c
++++ b/arch/x86/mm/pgtable_32.c
+@@ -172,10 +172,3 @@ void reserve_top_address(unsigned long reserve)
+ __FIXADDR_TOP = -reserve - PAGE_SIZE;
+ __VMALLOC_RESERVE += reserve;
+ }
+-
+-int pmd_bad(pmd_t pmd)
+-{
+- WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd));
+-
+- return pmd_bad_v1(pmd);
+-}
+diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32
+index 7fa5198..89ec35d 100644
+--- a/arch/x86/pci/Makefile_32
++++ b/arch/x86/pci/Makefile_32
+@@ -6,11 +6,19 @@ obj-$(CONFIG_PCI_DIRECT) += direct.o
+ obj-$(CONFIG_PCI_OLPC) += olpc.o
+
+ pci-y := fixup.o
++
++# Do not change the ordering here. There is a nasty init function
++# ordering dependency which breaks when you move acpi.o below
++# legacy/irq.o
+ pci-$(CONFIG_ACPI) += acpi.o
+ pci-y += legacy.o irq.o
+
+-pci-$(CONFIG_X86_VISWS) += visws.o fixup.o
+-pci-$(CONFIG_X86_NUMAQ) += numa.o irq.o
++# Careful: VISWS and NUMAQ overrule the pci-y above. The colons are
++# therefore correct. This needs a proper fix by disentangling the code.
++pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
++pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o
++
++# Necessary for NUMAQ as well
+ pci-$(CONFIG_NUMA) += mp_bus_to_node.o
+
+ obj-y += $(pci-y) common.o early.o
+diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
+index 1a9c0c6..d95de2f 100644
+--- a/arch/x86/pci/acpi.c
++++ b/arch/x86/pci/acpi.c
+@@ -6,45 +6,6 @@
+ #include <asm/numa.h>
+ #include "pci.h"
+
+-static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
+-{
+- pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
+- printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident);
+- return 0;
+-}
+-
+-static struct dmi_system_id acpi_pciprobe_dmi_table[] __devinitdata = {
+-/*
+- * Systems where PCI IO resource ISA alignment can be skipped
+- * when the ISA enable bit in the bridge control is not set
+- */
+- {
+- .callback = can_skip_ioresource_align,
+- .ident = "IBM System x3800",
+- .matches = {
+- DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+- DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
+- },
+- },
+- {
+- .callback = can_skip_ioresource_align,
+- .ident = "IBM System x3850",
+- .matches = {
+- DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+- DMI_MATCH(DMI_PRODUCT_NAME, "x3850"),
+- },
+- },
+- {
+- .callback = can_skip_ioresource_align,
+- .ident = "IBM System x3950",
+- .matches = {
+- DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+- DMI_MATCH(DMI_PRODUCT_NAME, "x3950"),
+- },
+- },
+- {}
+-};
+-
+ struct pci_root_info {
+ char *name;
+ unsigned int res_num;
+@@ -196,8 +157,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
+ int pxm;
+ #endif
+
+- dmi_check_system(acpi_pciprobe_dmi_table);
+-
+ if (domain && !pci_domains_supported) {
+ printk(KERN_WARNING "PCI: Multiple domains not supported "
+ "(dom %d, bus %d)\n", domain, busnum);
+diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
+index 2a4d751..bfa72a9 100644
+--- a/arch/x86/pci/common.c
++++ b/arch/x86/pci/common.c
+@@ -90,6 +90,50 @@ static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
+ rom_r->start = rom_r->end = rom_r->flags = 0;
+ }
+
++static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
++{
++ pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
++ printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident);
++ return 0;
++}
++
++static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = {
++/*
++ * Systems where PCI IO resource ISA alignment can be skipped
++ * when the ISA enable bit in the bridge control is not set
++ */
++ {
++ .callback = can_skip_ioresource_align,
++ .ident = "IBM System x3800",
++ .matches = {
++ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
++ DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
++ },
++ },
++ {
++ .callback = can_skip_ioresource_align,
++ .ident = "IBM System x3850",
++ .matches = {
++ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
++ DMI_MATCH(DMI_PRODUCT_NAME, "x3850"),
++ },
++ },
++ {
++ .callback = can_skip_ioresource_align,
++ .ident = "IBM System x3950",
++ .matches = {
++ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
++ DMI_MATCH(DMI_PRODUCT_NAME, "x3950"),
++ },
++ },
++ {}
++};
++
++void __init dmi_check_skip_isa_align(void)
++{
++ dmi_check_system(can_skip_pciprobe_dmi_table);
++}
++
+ /*
+ * Called after each bus is probed, but before its children
+ * are examined.
+@@ -318,13 +362,16 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
+ {}
+ };
+
++void __init dmi_check_pciprobe(void)
++{
++ dmi_check_system(pciprobe_dmi_table);
++}
++
+ struct pci_bus * __devinit pcibios_scan_root(int busnum)
+ {
+ struct pci_bus *bus = NULL;
+ struct pci_sysdata *sd;
+
+- dmi_check_system(pciprobe_dmi_table);
+-
+ while ((bus = pci_find_next_bus(bus)) != NULL) {
+ if (bus->number == busnum) {
+ /* Already scanned */
+@@ -462,6 +509,9 @@ char * __devinit pcibios_setup(char *str)
+ } else if (!strcmp(str, "routeirq")) {
+ pci_routeirq = 1;
+ return NULL;
++ } else if (!strcmp(str, "skip_isa_align")) {
++ pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
++ return NULL;
+ }
+ return str;
+ }
+@@ -489,7 +539,7 @@ void pcibios_disable_device (struct pci_dev *dev)
+ pcibios_disable_irq(dev);
+ }
+
+-struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
++struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
+ {
+ struct pci_bus *bus = NULL;
+ struct pci_sysdata *sd;
+@@ -512,7 +562,7 @@ struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
+ return bus;
+ }
+
+-struct pci_bus *pci_scan_bus_with_sysdata(int busno)
++struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno)
+ {
+ return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
+ }
+diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
+index b60b2ab..ff3a6a3 100644
+--- a/arch/x86/pci/fixup.c
++++ b/arch/x86/pci/fixup.c
+@@ -502,7 +502,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015,
+ */
+ static void fam10h_pci_cfg_space_size(struct pci_dev *dev)
+ {
+- dev->cfg_size = pci_cfg_space_size_ext(dev, 0);
++ dev->cfg_size = pci_cfg_space_size_ext(dev);
+ }
+
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, fam10h_pci_cfg_space_size);
+diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
+index dd30c60..e70b9c5 100644
+--- a/arch/x86/pci/init.c
++++ b/arch/x86/pci/init.c
+@@ -33,6 +33,10 @@ static __init int pci_access_init(void)
+ printk(KERN_ERR
+ "PCI: Fatal: No config space access function found\n");
+
++ dmi_check_pciprobe();
++
++ dmi_check_skip_isa_align();
++
+ return 0;
+ }
+ arch_initcall(pci_access_init);
+diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
+index c58805a..f3972b1 100644
+--- a/arch/x86/pci/pci.h
++++ b/arch/x86/pci/pci.h
+@@ -38,6 +38,9 @@ enum pci_bf_sort_state {
+ pci_dmi_bf,
+ };
+
++extern void __init dmi_check_pciprobe(void);
++extern void __init dmi_check_skip_isa_align(void);
++
+ /* pci-i386.c */
+
+ extern unsigned int pcibios_max_latency;
+diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
+index 4dceeb1..cf058fe 100644
+--- a/arch/x86/vdso/vdso32-setup.c
++++ b/arch/x86/vdso/vdso32-setup.c
+@@ -162,7 +162,7 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+ Elf32_Shdr *shdr;
+ int i;
+
+- BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
++ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
+ !elf_check_arch_ia32(ehdr) ||
+ ehdr->e_type != ET_DYN);
+
+diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
+index 4db42bf..6952768 100644
+--- a/arch/x86/video/fbdev.c
++++ b/arch/x86/video/fbdev.c
+@@ -1,5 +1,4 @@
+ /*
+- *
+ * Copyright (C) 2007 Antonino Daplas <adaplas at gmail.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+@@ -29,3 +28,4 @@ int fb_is_primary_device(struct fb_info *info)
+ return retval;
+ }
+ EXPORT_SYMBOL(fb_is_primary_device);
++MODULE_LICENSE("GPL");
+diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig
+index 1c11df9..9bf2986 100644
+--- a/drivers/ata/Kconfig
++++ b/drivers/ata/Kconfig
+@@ -205,8 +205,8 @@ config SATA_VITESSE
+ If unsure, say N.
+
+ config SATA_INIC162X
+- tristate "Initio 162x SATA support (HIGHLY EXPERIMENTAL)"
+- depends on PCI && EXPERIMENTAL
++ tristate "Initio 162x SATA support"
++ depends on PCI
+ help
+ This option enables support for Initio 162x Serial ATA.
+
+@@ -697,6 +697,15 @@ config PATA_SCC
+
+ If unsure, say N.
+
++config PATA_SCH
++ tristate "Intel SCH PATA support"
++ depends on PCI
++ help
++ This option enables support for Intel SCH PATA on the Intel
++ SCH (US15W, US15L, UL11L) series host controllers.
++
++ If unsure, say N.
++
+ config PATA_BF54X
+ tristate "Blackfin 54x ATAPI support"
+ depends on BF542 || BF548 || BF549
+diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile
+index b693d82..674965f 100644
+--- a/drivers/ata/Makefile
++++ b/drivers/ata/Makefile
+@@ -67,6 +67,7 @@ obj-$(CONFIG_PATA_SIS) += pata_sis.o
+ obj-$(CONFIG_PATA_TRIFLEX) += pata_triflex.o
+ obj-$(CONFIG_PATA_IXP4XX_CF) += pata_ixp4xx_cf.o
+ obj-$(CONFIG_PATA_SCC) += pata_scc.o
++obj-$(CONFIG_PATA_SCH) += pata_sch.o
+ obj-$(CONFIG_PATA_BF54X) += pata_bf54x.o
+ obj-$(CONFIG_PATA_PLATFORM) += pata_platform.o
+ obj-$(CONFIG_PATA_OF_PLATFORM) += pata_of_platform.o
+diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
+index 8cace9a..97f83fb 100644
+--- a/drivers/ata/ahci.c
++++ b/drivers/ata/ahci.c
+@@ -1267,9 +1267,7 @@ static int ahci_check_ready(struct ata_link *link)
+ void __iomem *port_mmio = ahci_port_base(link->ap);
+ u8 status = readl(port_mmio + PORT_TFDATA) & 0xFF;
+
+- if (!(status & ATA_BUSY))
+- return 1;
+- return 0;
++ return ata_check_ready(status);
+ }
+
+ static int ahci_softreset(struct ata_link *link, unsigned int *class,
+diff --git a/drivers/ata/ata_generic.c b/drivers/ata/ata_generic.c
+index 47aeccd..75a406f 100644
+--- a/drivers/ata/ata_generic.c
++++ b/drivers/ata/ata_generic.c
+@@ -152,6 +152,12 @@ static int ata_generic_init_one(struct pci_dev *dev, const struct pci_device_id
+ if (dev->vendor == PCI_VENDOR_ID_AL)
+ ata_pci_bmdma_clear_simplex(dev);
+
++ if (dev->vendor == PCI_VENDOR_ID_ATI) {
++ int rc = pcim_enable_device(dev);
++ if (rc < 0)
++ return rc;
++ pcim_pin_device(dev);
++ }
+ return ata_pci_sff_init_one(dev, ppi, &generic_sht, NULL);
+ }
+
+diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
+index ea2c764..a9027b8 100644
+--- a/drivers/ata/ata_piix.c
++++ b/drivers/ata/ata_piix.c
+@@ -1348,6 +1348,8 @@ static void __devinit piix_init_sidpr(struct ata_host *host)
+ {
+ struct pci_dev *pdev = to_pci_dev(host->dev);
+ struct piix_host_priv *hpriv = host->private_data;
++ struct ata_device *dev0 = &host->ports[0]->link.device[0];
++ u32 scontrol;
+ int i;
+
+ /* check for availability */
+@@ -1366,6 +1368,29 @@ static void __devinit piix_init_sidpr(struct ata_host *host)
+ return;
+
+ hpriv->sidpr = pcim_iomap_table(pdev)[PIIX_SIDPR_BAR];
++
++ /* SCR access via SIDPR doesn't work on some configurations.
++ * Give it a test drive by inhibiting power save modes, which
++ * we'll do anyway.
++ */
++ scontrol = piix_sidpr_read(dev0, SCR_CONTROL);
++
++ /* if IPM is already 3, SCR access is probably working. Don't
++ * un-inhibit power save modes as BIOS might have inhibited
++ * them for a reason.
++ */
++ if ((scontrol & 0xf00) != 0x300) {
++ scontrol |= 0x300;
++ piix_sidpr_write(dev0, SCR_CONTROL, scontrol);
++ scontrol = piix_sidpr_read(dev0, SCR_CONTROL);
++
++ if ((scontrol & 0xf00) != 0x300) {
++ dev_printk(KERN_INFO, host->dev, "SCR access via "
++ "SIDPR is available but doesn't work\n");
++ return;
++ }
++ }
++
+ host->ports[0]->ops = &piix_sidpr_sata_ops;
+ host->ports[1]->ops = &piix_sidpr_sata_ops;
+ }
+diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
+index 3bc4885..927b692 100644
+--- a/drivers/ata/libata-core.c
++++ b/drivers/ata/libata-core.c
+@@ -6292,6 +6292,7 @@ EXPORT_SYMBOL_GPL(ata_eh_freeze_port);
+ EXPORT_SYMBOL_GPL(ata_eh_thaw_port);
+ EXPORT_SYMBOL_GPL(ata_eh_qc_complete);
+ EXPORT_SYMBOL_GPL(ata_eh_qc_retry);
++EXPORT_SYMBOL_GPL(ata_eh_analyze_ncq_error);
+ EXPORT_SYMBOL_GPL(ata_do_eh);
+ EXPORT_SYMBOL_GPL(ata_std_error_handler);
+
+diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
+index 61dcd00..62e0331 100644
+--- a/drivers/ata/libata-eh.c
++++ b/drivers/ata/libata-eh.c
+@@ -1357,7 +1357,7 @@ static void ata_eh_analyze_serror(struct ata_link *link)
+ * LOCKING:
+ * Kernel thread context (may sleep).
+ */
+-static void ata_eh_analyze_ncq_error(struct ata_link *link)
++void ata_eh_analyze_ncq_error(struct ata_link *link)
+ {
+ struct ata_port *ap = link->ap;
+ struct ata_eh_context *ehc = &link->eh_context;
+diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
+index 2ec65a8..3c2d228 100644
+--- a/drivers/ata/libata-sff.c
++++ b/drivers/ata/libata-sff.c
+@@ -314,11 +314,7 @@ static int ata_sff_check_ready(struct ata_link *link)
+ {
+ u8 status = link->ap->ops->sff_check_status(link->ap);
+
+- if (!(status & ATA_BUSY))
+- return 1;
+- if (status == 0xff)
+- return -ENODEV;
+- return 0;
++ return ata_check_ready(status);
+ }
+
+ /**
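The ahci.c and libata-sff.c hunks replace two open-coded BSY/0xff checks with a shared ata_check_ready() helper. The helper's body isn't shown in this patch; below is a plausible reconstruction, assuming it keeps the semantics of the sff version it replaces:

#include <errno.h>
#include <stdio.h>

#define ATA_BUSY	(1 << 7)	/* BSY status bit */

/* Plausible reconstruction of the shared helper:
 * 1 = ready, 0 = still busy, -ENODEV = no device (floating bus). */
static int ata_check_ready(unsigned char status)
{
	if (!(status & ATA_BUSY))
		return 1;
	if (status == 0xff)
		return -ENODEV;
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       ata_check_ready(0x50),	/* DRDY|DSC: ready */
	       ata_check_ready(0x80),	/* BSY: not ready yet */
	       ata_check_ready(0xff));	/* floating bus: -ENODEV */
	return 0;
}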
+diff --git a/drivers/ata/pata_acpi.c b/drivers/ata/pata_acpi.c
+index c5f91e6..fbe6057 100644
+--- a/drivers/ata/pata_acpi.c
++++ b/drivers/ata/pata_acpi.c
+@@ -259,6 +259,12 @@ static int pacpi_init_one (struct pci_dev *pdev, const struct pci_device_id *id)
+ .port_ops = &pacpi_ops,
+ };
+ const struct ata_port_info *ppi[] = { &info, NULL };
++ if (pdev->vendor == PCI_VENDOR_ID_ATI) {
++ int rc = pcim_enable_device(pdev);
++ if (rc < 0)
++ return rc;
++ pcim_pin_device(pdev);
++ }
+ return ata_pci_sff_init_one(pdev, ppi, &pacpi_sht, NULL);
+ }
+
+diff --git a/drivers/ata/pata_sch.c b/drivers/ata/pata_sch.c
+new file mode 100644
+index 0000000..c8cc027
+--- /dev/null
++++ b/drivers/ata/pata_sch.c
+@@ -0,0 +1,206 @@
++/*
++ * pata_sch.c - Intel SCH PATA controllers
++ *
++ * Copyright (c) 2008 Alek Du <alek.du at intel.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License 2 as published
++ * by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; see the file COPYING. If not, write to
++ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
++ *
++ */
++
++/*
++ * Supports:
++ * Intel SCH (AF82US15W, AF82US15L, AF82UL11L) chipsets -- see spec at:
++ * http://download.intel.com/design/chipsets/embedded/datashts/319537.pdf
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/pci.h>
++#include <linux/init.h>
++#include <linux/blkdev.h>
++#include <linux/delay.h>
++#include <linux/device.h>
++#include <scsi/scsi_host.h>
++#include <linux/libata.h>
++#include <linux/dmi.h>
++
++#define DRV_NAME "pata_sch"
++#define DRV_VERSION "0.2"
++
++/* see SCH datasheet page 351 */
++enum {
++ D0TIM = 0x80, /* Device 0 Timing Register */
++ D1TIM = 0x84, /* Device 1 Timing Register */
++ PM = 0x07, /* PIO Mode Bit Mask */
++ MDM = (0x03 << 8), /* Multi-word DMA Mode Bit Mask */
++ UDM = (0x07 << 16), /* Ultra DMA Mode Bit Mask */
++ PPE = (1 << 30), /* Prefetch/Post Enable */
++ USD = (1 << 31), /* Use Synchronous DMA */
++};
++
++static int sch_init_one(struct pci_dev *pdev,
++ const struct pci_device_id *ent);
++static void sch_set_piomode(struct ata_port *ap, struct ata_device *adev);
++static void sch_set_dmamode(struct ata_port *ap, struct ata_device *adev);
++
++static const struct pci_device_id sch_pci_tbl[] = {
++ /* Intel SCH PATA Controller */
++ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_SCH_IDE), 0 },
++ { } /* terminate list */
++};
++
++static struct pci_driver sch_pci_driver = {
++ .name = DRV_NAME,
++ .id_table = sch_pci_tbl,
++ .probe = sch_init_one,
++ .remove = ata_pci_remove_one,
++#ifdef CONFIG_PM
++ .suspend = ata_pci_device_suspend,
++ .resume = ata_pci_device_resume,
++#endif
++};
++
++static struct scsi_host_template sch_sht = {
++ ATA_BMDMA_SHT(DRV_NAME),
++};
++
++static struct ata_port_operations sch_pata_ops = {
++ .inherits = &ata_bmdma_port_ops,
++ .cable_detect = ata_cable_unknown,
++ .set_piomode = sch_set_piomode,
++ .set_dmamode = sch_set_dmamode,
++};
++
++static struct ata_port_info sch_port_info = {
++ .flags = 0,
++ .pio_mask = ATA_PIO4, /* pio0-4 */
++ .mwdma_mask = ATA_MWDMA2, /* mwdma0-2 */
++ .udma_mask = ATA_UDMA5, /* udma0-5 */
++ .port_ops = &sch_pata_ops,
++};
++
++MODULE_AUTHOR("Alek Du <alek.du at intel.com>");
++MODULE_DESCRIPTION("SCSI low-level driver for Intel SCH PATA controllers");
++MODULE_LICENSE("GPL");
++MODULE_DEVICE_TABLE(pci, sch_pci_tbl);
++MODULE_VERSION(DRV_VERSION);
++
++/**
++ * sch_set_piomode - Initialize host controller PATA PIO timings
++ * @ap: Port whose timings we are configuring
++ * @adev: ATA device
++ *
++ * Set PIO mode for device, in host controller PCI config space.
++ *
++ * LOCKING:
++ * None (inherited from caller).
++ */
++
++static void sch_set_piomode(struct ata_port *ap, struct ata_device *adev)
++{
++ unsigned int pio = adev->pio_mode - XFER_PIO_0;
++ struct pci_dev *dev = to_pci_dev(ap->host->dev);
++ unsigned int port = adev->devno ? D1TIM : D0TIM;
++ unsigned int data;
++
++ pci_read_config_dword(dev, port, &data);
++ /* see SCH datasheet page 351 */
++ /* set PIO mode */
++ data &= ~(PM | PPE);
++ data |= pio;
++ /* enable PPE for block device */
++ if (adev->class == ATA_DEV_ATA)
++ data |= PPE;
++ pci_write_config_dword(dev, port, data);
++}
++
++/**
++ * sch_set_dmamode - Initialize host controller PATA DMA timings
++ * @ap: Port whose timings we are configuring
++ * @adev: ATA device
++ *
++ * Set MW/UDMA mode for device, in host controller PCI config space.
++ *
++ * LOCKING:
++ * None (inherited from caller).
++ */
++
++static void sch_set_dmamode(struct ata_port *ap, struct ata_device *adev)
++{
++ unsigned int dma_mode = adev->dma_mode;
++ struct pci_dev *dev = to_pci_dev(ap->host->dev);
++ unsigned int port = adev->devno ? D1TIM : D0TIM;
++ unsigned int data;
++
++ pci_read_config_dword(dev, port, &data);
++ /* see SCH datasheet page 351 */
++ if (dma_mode >= XFER_UDMA_0) {
++ /* enable Synchronous DMA mode */
++ data |= USD;
++ data &= ~UDM;
++ data |= (dma_mode - XFER_UDMA_0) << 16;
++ } else { /* must be MWDMA mode, since we masked SWDMA already */
++ data &= ~(USD | MDM);
++ data |= (dma_mode - XFER_MW_DMA_0) << 8;
++ }
++ pci_write_config_dword(dev, port, data);
++}
++
++/**
++ * sch_init_one - Register SCH ATA PCI device with kernel services
++ * @pdev: PCI device to register
++ * @ent: Entry in sch_pci_tbl matching with @pdev
++ *
++ * LOCKING:
++ * Inherited from PCI layer (may sleep).
++ *
++ * RETURNS:
++ * Zero on success, or -ERRNO value.
++ */
++
++static int __devinit sch_init_one(struct pci_dev *pdev,
++ const struct pci_device_id *ent)
++{
++ static int printed_version;
++ const struct ata_port_info *ppi[] = { &sch_port_info, NULL };
++ struct ata_host *host;
++ int rc;
++
++ if (!printed_version++)
++ dev_printk(KERN_DEBUG, &pdev->dev,
++ "version " DRV_VERSION "\n");
++
++ /* enable device and prepare host */
++ rc = pcim_enable_device(pdev);
++ if (rc)
++ return rc;
++ rc = ata_pci_sff_prepare_host(pdev, ppi, &host);
++ if (rc)
++ return rc;
++ pci_set_master(pdev);
++ return ata_pci_sff_activate_host(host, ata_sff_interrupt, &sch_sht);
++}
++
++static int __init sch_init(void)
++{
++ return pci_register_driver(&sch_pci_driver);
++}
++
++static void __exit sch_exit(void)
++{
++ pci_unregister_driver(&sch_pci_driver);
++}
++
++module_init(sch_init);
++module_exit(sch_exit);
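sch_set_dmamode() above programs the D0TIM/D1TIM timing dword by field: the UDMA mode goes in bits 18:16 with USD set, the MWDMA mode in bits 9:8 with USD cleared. A sketch of the same bit packing (set_dma_timing is an invented wrapper, not a driver function):

#include <stdio.h>
#include <stdint.h>

#define MDM	(0x03u << 8)	/* multi-word DMA mode field */
#define UDM	(0x07u << 16)	/* ultra DMA mode field */
#define USD	(1u << 31)	/* use synchronous DMA */

/* udma_mode < 0 means "program MWDMA instead". */
static uint32_t set_dma_timing(uint32_t reg, int udma_mode, int mwdma_mode)
{
	if (udma_mode >= 0) {
		reg |= USD;
		reg = (reg & ~UDM) | ((uint32_t)udma_mode << 16);
	} else {
		reg &= ~(USD | MDM);
		reg |= (uint32_t)mwdma_mode << 8;
	}
	return reg;
}

int main(void)
{
	printf("udma5:  %#x\n", set_dma_timing(0, 5, -1));	/* 0x80050000 */
	printf("mwdma2: %#x\n", set_dma_timing(0, -1, 2));	/* 0x00000200 */
	return 0;
}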
+diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c
+index d27bb9a..3ead02f 100644
+--- a/drivers/ata/sata_inic162x.c
++++ b/drivers/ata/sata_inic162x.c
+@@ -10,13 +10,33 @@
+ * right. Documentation is available at initio's website but it only
+ * documents registers (not programming model).
+ *
+- * - ATA disks work.
+- * - Hotplug works.
+- * - ATAPI read works but burning doesn't. This thing is really
+- * peculiar about ATAPI and I couldn't figure out how ATAPI PIO and
+- * ATAPI DMA WRITE should be programmed. If you've got a clue, be
+- * my guest.
+- * - Both STR and STD work.
++ * This driver has an interesting history. The first version was
++ * written from the documentation and a 2.4 IDE driver posted by a
++ * Taiwanese company; it didn't use any IDMA features and couldn't
++ * handle LBA48. The resulting driver couldn't handle LBA48 devices
++ * either, making it pretty useless.
++ *
++ * After a while, initio picked the driver up, renamed it to
++ * sata_initio162x, updated it to use IDMA for ATA DMA commands and
++ * posted it on their website. It only used ATA_PROT_DMA for IDMA and
++ * attaching both devices and issuing IDMA and !IDMA commands
++ * simultaneously broke it due to PIRQ masking interaction but it did
++ * show how to use the IDMA (ADMA + some initio specific twists)
++ * engine.
++ *
++ * Then, I picked up their changes again and here's the usable driver
++ * which uses IDMA for everything. Everything works now including
++ * LBA48, CD/DVD burning, suspend/resume and hotplug. There are some
++ * issues, though: the result TF is not reported properly, NCQ isn't
++ * supported yet, and CD/DVD writing works with a DMA-assisted PIO
++ * protocol (which, for native SATA devices, shouldn't cause any
++ * noticeable difference).
++ *
++ * Anyway, here's finally a working driver for the inic162x. Enjoy!
++ *
++ * initio: If you guys wanna improve the driver regarding result TF
++ * access and other stuff, please feel free to contact me. I'll be
++ * happy to assist.
+ */
+
+ #include <linux/kernel.h>
+@@ -28,13 +48,19 @@
+ #include <scsi/scsi_device.h>
+
+ #define DRV_NAME "sata_inic162x"
+-#define DRV_VERSION "0.3"
++#define DRV_VERSION "0.4"
+
+ enum {
+- MMIO_BAR = 5,
++ MMIO_BAR_PCI = 5,
++ MMIO_BAR_CARDBUS = 1,
+
+ NR_PORTS = 2,
+
++ IDMA_CPB_TBL_SIZE = 4 * 32,
++
++ INIC_DMA_BOUNDARY = 0xffffff,
++
++ HOST_ACTRL = 0x08,
+ HOST_CTL = 0x7c,
+ HOST_STAT = 0x7e,
+ HOST_IRQ_STAT = 0xbc,
+@@ -43,22 +69,37 @@ enum {
+ PORT_SIZE = 0x40,
+
+ /* registers for ATA TF operation */
+- PORT_TF = 0x00,
+- PORT_ALT_STAT = 0x08,
++ PORT_TF_DATA = 0x00,
++ PORT_TF_FEATURE = 0x01,
++ PORT_TF_NSECT = 0x02,
++ PORT_TF_LBAL = 0x03,
++ PORT_TF_LBAM = 0x04,
++ PORT_TF_LBAH = 0x05,
++ PORT_TF_DEVICE = 0x06,
++ PORT_TF_COMMAND = 0x07,
++ PORT_TF_ALT_STAT = 0x08,
+ PORT_IRQ_STAT = 0x09,
+ PORT_IRQ_MASK = 0x0a,
+ PORT_PRD_CTL = 0x0b,
+ PORT_PRD_ADDR = 0x0c,
+ PORT_PRD_XFERLEN = 0x10,
++ PORT_CPB_CPBLAR = 0x18,
++ PORT_CPB_PTQFIFO = 0x1c,
+
+ /* IDMA register */
+ PORT_IDMA_CTL = 0x14,
++ PORT_IDMA_STAT = 0x16,
++
++ PORT_RPQ_FIFO = 0x1e,
++ PORT_RPQ_CNT = 0x1f,
+
+ PORT_SCR = 0x20,
+
+ /* HOST_CTL bits */
+ HCTL_IRQOFF = (1 << 8), /* global IRQ off */
+- HCTL_PWRDWN = (1 << 13), /* power down PHYs */
++ HCTL_FTHD0 = (1 << 10), /* fifo threshold 0 */
++ HCTL_FTHD1 = (1 << 11), /* fifo threshold 1 */
++ HCTL_PWRDWN = (1 << 12), /* power down PHYs */
+ HCTL_SOFTRST = (1 << 13), /* global reset (no phy reset) */
+ HCTL_RPGSEL = (1 << 15), /* register page select */
+
+@@ -81,9 +122,7 @@ enum {
+ PIRQ_PENDING = (1 << 7), /* port IRQ pending (STAT only) */
+
+ PIRQ_ERR = PIRQ_OFFLINE | PIRQ_ONLINE | PIRQ_FATAL,
+-
+- PIRQ_MASK_DMA_READ = PIRQ_REPLY | PIRQ_ATA,
+- PIRQ_MASK_OTHER = PIRQ_REPLY | PIRQ_COMPLETE,
++ PIRQ_MASK_DEFAULT = PIRQ_REPLY | PIRQ_ATA,
+ PIRQ_MASK_FREEZE = 0xff,
+
+ /* PORT_PRD_CTL bits */
+@@ -96,20 +135,104 @@ enum {
+ IDMA_CTL_RST_IDMA = (1 << 5), /* reset IDMA machinery */
+ IDMA_CTL_GO = (1 << 7), /* IDMA mode go */
+ IDMA_CTL_ATA_NIEN = (1 << 8), /* ATA IRQ disable */
++
++ /* PORT_IDMA_STAT bits */
++ IDMA_STAT_PERR = (1 << 0), /* PCI ERROR MODE */
++ IDMA_STAT_CPBERR = (1 << 1), /* ADMA CPB error */
++ IDMA_STAT_LGCY = (1 << 3), /* ADMA legacy */
++ IDMA_STAT_UIRQ = (1 << 4), /* ADMA unsolicited irq */
++ IDMA_STAT_STPD = (1 << 5), /* ADMA stopped */
++ IDMA_STAT_PSD = (1 << 6), /* ADMA pause */
++ IDMA_STAT_DONE = (1 << 7), /* ADMA done */
++
++ IDMA_STAT_ERR = IDMA_STAT_PERR | IDMA_STAT_CPBERR,
++
++ /* CPB Control Flags*/
++ CPB_CTL_VALID = (1 << 0), /* CPB valid */
++ CPB_CTL_QUEUED = (1 << 1), /* queued command */
++ CPB_CTL_DATA = (1 << 2), /* data, rsvd in datasheet */
++ CPB_CTL_IEN = (1 << 3), /* PCI interrupt enable */
++ CPB_CTL_DEVDIR = (1 << 4), /* device direction control */
++
++ /* CPB Response Flags */
++ CPB_RESP_DONE = (1 << 0), /* ATA command complete */
++ CPB_RESP_REL = (1 << 1), /* ATA release */
++ CPB_RESP_IGNORED = (1 << 2), /* CPB ignored */
++ CPB_RESP_ATA_ERR = (1 << 3), /* ATA command error */
++ CPB_RESP_SPURIOUS = (1 << 4), /* ATA spurious interrupt error */
++ CPB_RESP_UNDERFLOW = (1 << 5), /* APRD deficiency length error */
++ CPB_RESP_OVERFLOW = (1 << 6), /* APRD excess length error */
++ CPB_RESP_CPB_ERR = (1 << 7), /* CPB error flag */
++
++ /* PRD Control Flags */
++ PRD_DRAIN = (1 << 1), /* ignore data excess */
++ PRD_CDB = (1 << 2), /* atapi packet command pointer */
++ PRD_DIRECT_INTR = (1 << 3), /* direct interrupt */
++ PRD_DMA = (1 << 4), /* data transfer method */
++ PRD_WRITE = (1 << 5), /* data dir, rsvd in datasheet */
++ PRD_IOM = (1 << 6), /* io/memory transfer */
++ PRD_END = (1 << 7), /* APRD chain end */
+ };
+
++/* Command Parameter Block */
++struct inic_cpb {
++ u8 resp_flags; /* Response Flags */
++ u8 error; /* ATA Error */
++ u8 status; /* ATA Status */
++ u8 ctl_flags; /* Control Flags */
++ __le32 len; /* Total Transfer Length */
++ __le32 prd; /* First PRD pointer */
++ u8 rsvd[4];
++ /* 16 bytes */
++ u8 feature; /* ATA Feature */
++ u8 hob_feature; /* ATA Ex. Feature */
++ u8 device; /* ATA Device/Head */
++ u8 mirctl; /* Mirror Control */
++ u8 nsect; /* ATA Sector Count */
++ u8 hob_nsect; /* ATA Ex. Sector Count */
++ u8 lbal; /* ATA Sector Number */
++ u8 hob_lbal; /* ATA Ex. Sector Number */
++ u8 lbam; /* ATA Cylinder Low */
++ u8 hob_lbam; /* ATA Ex. Cylinder Low */
++ u8 lbah; /* ATA Cylinder High */
++ u8 hob_lbah; /* ATA Ex. Cylinder High */
++ u8 command; /* ATA Command */
++ u8 ctl; /* ATA Control */
++ u8 slave_error; /* Slave ATA Error */
++ u8 slave_status; /* Slave ATA Status */
++ /* 32 bytes */
++} __packed;
++
++/* Physical Region Descriptor */
++struct inic_prd {
++ __le32 mad; /* Physical Memory Address */
++ __le16 len; /* Transfer Length */
++ u8 rsvd;
++ u8 flags; /* Control Flags */
++} __packed;
++
++struct inic_pkt {
++ struct inic_cpb cpb;
++ struct inic_prd prd[LIBATA_MAX_PRD + 1]; /* + 1 for cdb */
++ u8 cdb[ATAPI_CDB_LEN];
++} __packed;
++
+ struct inic_host_priv {
+- u16 cached_hctl;
++ void __iomem *mmio_base;
++ u16 cached_hctl;
+ };
+
+ struct inic_port_priv {
+- u8 dfl_prdctl;
+- u8 cached_prdctl;
+- u8 cached_pirq_mask;
++ struct inic_pkt *pkt;
++ dma_addr_t pkt_dma;
++ u32 *cpb_tbl;
++ dma_addr_t cpb_tbl_dma;
+ };
+
+ static struct scsi_host_template inic_sht = {
+- ATA_BMDMA_SHT(DRV_NAME),
++ ATA_BASE_SHT(DRV_NAME),
++ .sg_tablesize = LIBATA_MAX_PRD, /* maybe it can be larger? */
++ .dma_boundary = INIC_DMA_BOUNDARY,
+ };
+
+ static const int scr_map[] = {
+@@ -120,54 +243,34 @@ static const int scr_map[] = {
+
+ static void __iomem *inic_port_base(struct ata_port *ap)
+ {
+- return ap->host->iomap[MMIO_BAR] + ap->port_no * PORT_SIZE;
+-}
+-
+-static void __inic_set_pirq_mask(struct ata_port *ap, u8 mask)
+-{
+- void __iomem *port_base = inic_port_base(ap);
+- struct inic_port_priv *pp = ap->private_data;
++ struct inic_host_priv *hpriv = ap->host->private_data;
+
+- writeb(mask, port_base + PORT_IRQ_MASK);
+- pp->cached_pirq_mask = mask;
+-}
+-
+-static void inic_set_pirq_mask(struct ata_port *ap, u8 mask)
+-{
+- struct inic_port_priv *pp = ap->private_data;
+-
+- if (pp->cached_pirq_mask != mask)
+- __inic_set_pirq_mask(ap, mask);
++ return hpriv->mmio_base + ap->port_no * PORT_SIZE;
+ }
+
+ static void inic_reset_port(void __iomem *port_base)
+ {
+ void __iomem *idma_ctl = port_base + PORT_IDMA_CTL;
+- u16 ctl;
+
+- ctl = readw(idma_ctl);
+- ctl &= ~(IDMA_CTL_RST_IDMA | IDMA_CTL_ATA_NIEN | IDMA_CTL_GO);
++ /* stop IDMA engine */
++ readw(idma_ctl); /* flush */
++ msleep(1);
+
+ /* mask IRQ and assert reset */
+- writew(ctl | IDMA_CTL_RST_IDMA | IDMA_CTL_ATA_NIEN, idma_ctl);
++ writew(IDMA_CTL_RST_IDMA, idma_ctl);
+ readw(idma_ctl); /* flush */
+-
+- /* give it some time */
+ msleep(1);
+
+ /* release reset */
+- writew(ctl | IDMA_CTL_ATA_NIEN, idma_ctl);
++ writew(0, idma_ctl);
+
+ /* clear irq */
+ writeb(0xff, port_base + PORT_IRQ_STAT);
+-
+- /* reenable ATA IRQ, turn off IDMA mode */
+- writew(ctl, idma_ctl);
+ }
+
+ static int inic_scr_read(struct ata_port *ap, unsigned sc_reg, u32 *val)
+ {
+- void __iomem *scr_addr = ap->ioaddr.scr_addr;
++ void __iomem *scr_addr = inic_port_base(ap) + PORT_SCR;
+ void __iomem *addr;
+
+ if (unlikely(sc_reg >= ARRAY_SIZE(scr_map)))
+@@ -184,120 +287,126 @@ static int inic_scr_read(struct ata_port *ap, unsigned sc_reg, u32 *val)
+
+ static int inic_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val)
+ {
+- void __iomem *scr_addr = ap->ioaddr.scr_addr;
+- void __iomem *addr;
++ void __iomem *scr_addr = inic_port_base(ap) + PORT_SCR;
+
+ if (unlikely(sc_reg >= ARRAY_SIZE(scr_map)))
+ return -EINVAL;
+
+- addr = scr_addr + scr_map[sc_reg] * 4;
+ writel(val, scr_addr + scr_map[sc_reg] * 4);
+ return 0;
+ }
+
+-/*
+- * In TF mode, inic162x is very similar to SFF device. TF registers
+- * function the same. DMA engine behaves similary using the same PRD
+- * format as BMDMA but different command register, interrupt and event
+- * notification methods are used. The following inic_bmdma_*()
+- * functions do the impedance matching.
+- */
+-static void inic_bmdma_setup(struct ata_queued_cmd *qc)
++static void inic_stop_idma(struct ata_port *ap)
+ {
+- struct ata_port *ap = qc->ap;
+- struct inic_port_priv *pp = ap->private_data;
+ void __iomem *port_base = inic_port_base(ap);
+- int rw = qc->tf.flags & ATA_TFLAG_WRITE;
+-
+- /* make sure device sees PRD table writes */
+- wmb();
+-
+- /* load transfer length */
+- writel(qc->nbytes, port_base + PORT_PRD_XFERLEN);
+-
+- /* turn on DMA and specify data direction */
+- pp->cached_prdctl = pp->dfl_prdctl | PRD_CTL_DMAEN;
+- if (!rw)
+- pp->cached_prdctl |= PRD_CTL_WR;
+- writeb(pp->cached_prdctl, port_base + PORT_PRD_CTL);
+
+- /* issue r/w command */
+- ap->ops->sff_exec_command(ap, &qc->tf);
++ readb(port_base + PORT_RPQ_FIFO);
++ readb(port_base + PORT_RPQ_CNT);
++ writew(0, port_base + PORT_IDMA_CTL);
+ }
+
+-static void inic_bmdma_start(struct ata_queued_cmd *qc)
++static void inic_host_err_intr(struct ata_port *ap, u8 irq_stat, u16 idma_stat)
+ {
+- struct ata_port *ap = qc->ap;
++ struct ata_eh_info *ehi = &ap->link.eh_info;
+ struct inic_port_priv *pp = ap->private_data;
+- void __iomem *port_base = inic_port_base(ap);
++ struct inic_cpb *cpb = &pp->pkt->cpb;
++ bool freeze = false;
+
+- /* start host DMA transaction */
+- pp->cached_prdctl |= PRD_CTL_START;
+- writeb(pp->cached_prdctl, port_base + PORT_PRD_CTL);
+-}
++ ata_ehi_clear_desc(ehi);
++ ata_ehi_push_desc(ehi, "irq_stat=0x%x idma_stat=0x%x",
++ irq_stat, idma_stat);
+
+-static void inic_bmdma_stop(struct ata_queued_cmd *qc)
+-{
+- struct ata_port *ap = qc->ap;
+- struct inic_port_priv *pp = ap->private_data;
+- void __iomem *port_base = inic_port_base(ap);
++ inic_stop_idma(ap);
+
+- /* stop DMA engine */
+- writeb(pp->dfl_prdctl, port_base + PORT_PRD_CTL);
+-}
++ if (irq_stat & (PIRQ_OFFLINE | PIRQ_ONLINE)) {
++ ata_ehi_push_desc(ehi, "hotplug");
++ ata_ehi_hotplugged(ehi);
++ freeze = true;
++ }
+
+-static u8 inic_bmdma_status(struct ata_port *ap)
+-{
+- /* event is already verified by the interrupt handler */
+- return ATA_DMA_INTR;
++ if (idma_stat & IDMA_STAT_PERR) {
++ ata_ehi_push_desc(ehi, "PCI error");
++ freeze = true;
++ }
++
++ if (idma_stat & IDMA_STAT_CPBERR) {
++ ata_ehi_push_desc(ehi, "CPB error");
++
++ if (cpb->resp_flags & CPB_RESP_IGNORED) {
++ __ata_ehi_push_desc(ehi, " ignored");
++ ehi->err_mask |= AC_ERR_INVALID;
++ freeze = true;
++ }
++
++ if (cpb->resp_flags & CPB_RESP_ATA_ERR)
++ ehi->err_mask |= AC_ERR_DEV;
++
++ if (cpb->resp_flags & CPB_RESP_SPURIOUS) {
++ __ata_ehi_push_desc(ehi, " spurious-intr");
++ ehi->err_mask |= AC_ERR_HSM;
++ freeze = true;
++ }
++
++ if (cpb->resp_flags &
++ (CPB_RESP_UNDERFLOW | CPB_RESP_OVERFLOW)) {
++ __ata_ehi_push_desc(ehi, " data-over/underflow");
++ ehi->err_mask |= AC_ERR_HSM;
++ freeze = true;
++ }
++ }
++
++ if (freeze)
++ ata_port_freeze(ap);
++ else
++ ata_port_abort(ap);
+ }
+
+ static void inic_host_intr(struct ata_port *ap)
+ {
+ void __iomem *port_base = inic_port_base(ap);
+- struct ata_eh_info *ehi = &ap->link.eh_info;
++ struct ata_queued_cmd *qc = ata_qc_from_tag(ap, ap->link.active_tag);
+ u8 irq_stat;
++ u16 idma_stat;
+
+- /* fetch and clear irq */
++ /* read and clear IRQ status */
+ irq_stat = readb(port_base + PORT_IRQ_STAT);
+ writeb(irq_stat, port_base + PORT_IRQ_STAT);
++ idma_stat = readw(port_base + PORT_IDMA_STAT);
+
+- if (likely(!(irq_stat & PIRQ_ERR))) {
+- struct ata_queued_cmd *qc =
+- ata_qc_from_tag(ap, ap->link.active_tag);
++ if (unlikely((irq_stat & PIRQ_ERR) || (idma_stat & IDMA_STAT_ERR)))
++ inic_host_err_intr(ap, irq_stat, idma_stat);
+
+- if (unlikely(!qc || (qc->tf.flags & ATA_TFLAG_POLLING))) {
+- ap->ops->sff_check_status(ap); /* clear ATA interrupt */
+- return;
+- }
++ if (unlikely(!qc))
++ goto spurious;
+
+- if (likely(ata_sff_host_intr(ap, qc)))
+- return;
++ if (likely(idma_stat & IDMA_STAT_DONE)) {
++ inic_stop_idma(ap);
+
+- ap->ops->sff_check_status(ap); /* clear ATA interrupt */
+- ata_port_printk(ap, KERN_WARNING, "unhandled "
+- "interrupt, irq_stat=%x\n", irq_stat);
++ /* Depending on circumstances, device error
++ * isn't reported by IDMA; check it explicitly.
++ */
++ if (unlikely(readb(port_base + PORT_TF_COMMAND) &
++ (ATA_DF | ATA_ERR)))
++ qc->err_mask |= AC_ERR_DEV;
++
++ ata_qc_complete(qc);
+ return;
+ }
+
+- /* error */
+- ata_ehi_push_desc(ehi, "irq_stat=0x%x", irq_stat);
+-
+- if (irq_stat & (PIRQ_OFFLINE | PIRQ_ONLINE)) {
+- ata_ehi_hotplugged(ehi);
+- ata_port_freeze(ap);
+- } else
+- ata_port_abort(ap);
++ spurious:
++ ata_port_printk(ap, KERN_WARNING, "unhandled interrupt: "
++ "cmd=0x%x irq_stat=0x%x idma_stat=0x%x\n",
++ qc ? qc->tf.command : 0xff, irq_stat, idma_stat);
+ }
+
+ static irqreturn_t inic_interrupt(int irq, void *dev_instance)
+ {
+ struct ata_host *host = dev_instance;
+- void __iomem *mmio_base = host->iomap[MMIO_BAR];
++ struct inic_host_priv *hpriv = host->private_data;
+ u16 host_irq_stat;
+ int i, handled = 0;
+
+- host_irq_stat = readw(mmio_base + HOST_IRQ_STAT);
++ host_irq_stat = readw(hpriv->mmio_base + HOST_IRQ_STAT);
+
+ if (unlikely(!(host_irq_stat & HIRQ_GLOBAL)))
+ goto out;
+@@ -327,60 +436,173 @@ static irqreturn_t inic_interrupt(int irq, void *dev_instance)
+ return IRQ_RETVAL(handled);
+ }
+
++static int inic_check_atapi_dma(struct ata_queued_cmd *qc)
++{
++ /* For some reason ATAPI_PROT_DMA doesn't work for some
++ * commands including writes and other misc ops. Use PIO
++ * protocol instead, which BTW is driven by the DMA engine
++ * anyway, so it shouldn't make much difference for native
++ * SATA devices.
++ */
++ if (atapi_cmd_type(qc->cdb[0]) == READ)
++ return 0;
++ return 1;
++}
++
++static void inic_fill_sg(struct inic_prd *prd, struct ata_queued_cmd *qc)
++{
++ struct scatterlist *sg;
++ unsigned int si;
++ u8 flags = 0;
++
++ if (qc->tf.flags & ATA_TFLAG_WRITE)
++ flags |= PRD_WRITE;
++
++ if (ata_is_dma(qc->tf.protocol))
++ flags |= PRD_DMA;
++
++ for_each_sg(qc->sg, sg, qc->n_elem, si) {
++ prd->mad = cpu_to_le32(sg_dma_address(sg));
++ prd->len = cpu_to_le16(sg_dma_len(sg));
++ prd->flags = flags;
++ prd++;
++ }
++
++ WARN_ON(!si);
++ prd[-1].flags |= PRD_END;
++}
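
For reference, the PRD fill above follows the common descriptor-table
pattern: one address/length/flags entry per scatterlist element, with a
terminator flag OR'd into the last entry after the loop. A stand-alone
sketch of the same shape; field and flag values are made up, only the
structure mirrors struct inic_prd.

	#include <stdio.h>
	#include <stdint.h>

	#define PRD_WRITE (1 << 0)	/* invented flag values */
	#define PRD_END   (1 << 7)

	struct fake_prd { uint32_t mad; uint16_t len; uint8_t flags; };
	struct fake_sg  { uint32_t dma_addr; uint16_t dma_len; };

	int main(void)
	{
		struct fake_sg sg[3] = { {0x1000, 512}, {0x2000, 512}, {0x3000, 256} };
		struct fake_prd prd[3];
		int i;

		for (i = 0; i < 3; i++) {
			prd[i].mad   = sg[i].dma_addr;	/* bus address of chunk */
			prd[i].len   = sg[i].dma_len;
			prd[i].flags = PRD_WRITE;
		}
		prd[i - 1].flags |= PRD_END;	/* terminate, as prd[-1] does above */

		for (i = 0; i < 3; i++)
			printf("prd[%d]: mad=0x%x len=%u flags=0x%x\n", i,
			       (unsigned)prd[i].mad, (unsigned)prd[i].len,
			       (unsigned)prd[i].flags);
		return 0;
	}
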
++
++static void inic_qc_prep(struct ata_queued_cmd *qc)
++{
++ struct inic_port_priv *pp = qc->ap->private_data;
++ struct inic_pkt *pkt = pp->pkt;
++ struct inic_cpb *cpb = &pkt->cpb;
++ struct inic_prd *prd = pkt->prd;
++ bool is_atapi = ata_is_atapi(qc->tf.protocol);
++ bool is_data = ata_is_data(qc->tf.protocol);
++ unsigned int cdb_len = 0;
++
++ VPRINTK("ENTER\n");
++
++ if (is_atapi)
++ cdb_len = qc->dev->cdb_len;
++
++ /* prepare packet, based on initio driver */
++ memset(pkt, 0, sizeof(struct inic_pkt));
++
++ cpb->ctl_flags = CPB_CTL_VALID | CPB_CTL_IEN;
++ if (is_atapi || is_data)
++ cpb->ctl_flags |= CPB_CTL_DATA;
++
++ cpb->len = cpu_to_le32(qc->nbytes + cdb_len);
++ cpb->prd = cpu_to_le32(pp->pkt_dma + offsetof(struct inic_pkt, prd));
++
++ cpb->device = qc->tf.device;
++ cpb->feature = qc->tf.feature;
++ cpb->nsect = qc->tf.nsect;
++ cpb->lbal = qc->tf.lbal;
++ cpb->lbam = qc->tf.lbam;
++ cpb->lbah = qc->tf.lbah;
++
++ if (qc->tf.flags & ATA_TFLAG_LBA48) {
++ cpb->hob_feature = qc->tf.hob_feature;
++ cpb->hob_nsect = qc->tf.hob_nsect;
++ cpb->hob_lbal = qc->tf.hob_lbal;
++ cpb->hob_lbam = qc->tf.hob_lbam;
++ cpb->hob_lbah = qc->tf.hob_lbah;
++ }
++
++ cpb->command = qc->tf.command;
++ /* don't load ctl - dunno why. it's like that in the initio driver */
++
++ /* setup PRD for CDB */
++ if (is_atapi) {
++ memcpy(pkt->cdb, qc->cdb, ATAPI_CDB_LEN);
++ prd->mad = cpu_to_le32(pp->pkt_dma +
++ offsetof(struct inic_pkt, cdb));
++ prd->len = cpu_to_le16(cdb_len);
++ prd->flags = PRD_CDB | PRD_WRITE;
++ if (!is_data)
++ prd->flags |= PRD_END;
++ prd++;
++ }
++
++ /* setup sg table */
++ if (is_data)
++ inic_fill_sg(prd, qc);
++
++ pp->cpb_tbl[0] = pp->pkt_dma;
++}
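
The addressing trick in inic_qc_prep() is worth spelling out: CPB, CDB and
PRD table live in one coherent DMA allocation, so the bus address of each
embedded table can be derived from the single pkt_dma handle with
offsetof(). A stand-alone illustration; the struct layout below is a
placeholder, not the real struct inic_pkt.

	#include <stdio.h>
	#include <stddef.h>
	#include <stdint.h>

	struct fake_pkt {		/* placeholder layout */
		uint8_t cpb[64];
		uint8_t cdb[16];
		uint8_t prd[96];
	};

	int main(void)
	{
		uint32_t pkt_dma = 0x10000;	/* pretend bus address of the packet */

		printf("cdb prd entry -> 0x%lx\n",
		       (unsigned long)pkt_dma + offsetof(struct fake_pkt, cdb));
		printf("sg prd table  -> 0x%lx\n",
		       (unsigned long)pkt_dma + offsetof(struct fake_pkt, prd));
		return 0;
	}
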
++
+ static unsigned int inic_qc_issue(struct ata_queued_cmd *qc)
+ {
+ struct ata_port *ap = qc->ap;
++ void __iomem *port_base = inic_port_base(ap);
+
+- /* ATA IRQ doesn't wait for DMA transfer completion and vice
+- * versa. Mask IRQ selectively to detect command completion.
+- * Without it, ATA DMA read command can cause data corruption.
+- *
+- * Something similar might be needed for ATAPI writes. I
+- * tried a lot of combinations but couldn't find the solution.
+- */
+- if (qc->tf.protocol == ATA_PROT_DMA &&
+- !(qc->tf.flags & ATA_TFLAG_WRITE))
+- inic_set_pirq_mask(ap, PIRQ_MASK_DMA_READ);
+- else
+- inic_set_pirq_mask(ap, PIRQ_MASK_OTHER);
++ /* fire up the ADMA engine */
++ writew(HCTL_FTHD0, port_base + HOST_CTL);
++ writew(IDMA_CTL_GO, port_base + PORT_IDMA_CTL);
++ writeb(0, port_base + PORT_CPB_PTQFIFO);
++
++ return 0;
++}
++
++static void inic_tf_read(struct ata_port *ap, struct ata_taskfile *tf)
++{
++ void __iomem *port_base = inic_port_base(ap);
++
++ tf->feature = readb(port_base + PORT_TF_FEATURE);
++ tf->nsect = readb(port_base + PORT_TF_NSECT);
++ tf->lbal = readb(port_base + PORT_TF_LBAL);
++ tf->lbam = readb(port_base + PORT_TF_LBAM);
++ tf->lbah = readb(port_base + PORT_TF_LBAH);
++ tf->device = readb(port_base + PORT_TF_DEVICE);
++ tf->command = readb(port_base + PORT_TF_COMMAND);
++}
+
+- /* Issuing a command to yet uninitialized port locks up the
+- * controller. Most of the time, this happens for the first
+- * command after reset which are ATA and ATAPI IDENTIFYs.
+- * Fast fail if stat is 0x7f or 0xff for those commands.
++static bool inic_qc_fill_rtf(struct ata_queued_cmd *qc)
++{
++ struct ata_taskfile *rtf = &qc->result_tf;
++ struct ata_taskfile tf;
++
++ /* FIXME: Except for status and error, result TF access
++ * doesn't work. I tried reading from BAR0/2, CPB and BAR5.
++ * None works regardless of which command interface is used.
++ * For now return true iff status indicates device error.
++ * This means that we're reporting a bogus sector for RW
++ * failures. Eeekk....
+ */
+- if (unlikely(qc->tf.command == ATA_CMD_ID_ATA ||
+- qc->tf.command == ATA_CMD_ID_ATAPI)) {
+- u8 stat = ap->ops->sff_check_status(ap);
+- if (stat == 0x7f || stat == 0xff)
+- return AC_ERR_HSM;
+- }
++ inic_tf_read(qc->ap, &tf);
+
+- return ata_sff_qc_issue(qc);
++ if (!(tf.command & ATA_ERR))
++ return false;
++
++ rtf->command = tf.command;
++ rtf->feature = tf.feature;
++ return true;
+ }
+
+ static void inic_freeze(struct ata_port *ap)
+ {
+ void __iomem *port_base = inic_port_base(ap);
+
+- __inic_set_pirq_mask(ap, PIRQ_MASK_FREEZE);
+-
+- ap->ops->sff_check_status(ap);
++ writeb(PIRQ_MASK_FREEZE, port_base + PORT_IRQ_MASK);
+ writeb(0xff, port_base + PORT_IRQ_STAT);
+-
+- readb(port_base + PORT_IRQ_STAT); /* flush */
+ }
+
+ static void inic_thaw(struct ata_port *ap)
+ {
+ void __iomem *port_base = inic_port_base(ap);
+
+- ap->ops->sff_check_status(ap);
+ writeb(0xff, port_base + PORT_IRQ_STAT);
++ writeb(PIRQ_MASK_DEFAULT, port_base + PORT_IRQ_MASK);
++}
+
+- __inic_set_pirq_mask(ap, PIRQ_MASK_OTHER);
++static int inic_check_ready(struct ata_link *link)
++{
++ void __iomem *port_base = inic_port_base(link->ap);
+
+- readb(port_base + PORT_IRQ_STAT); /* flush */
++ return ata_check_ready(readb(port_base + PORT_TF_COMMAND));
+ }
+
+ /*
+@@ -394,17 +616,15 @@ static int inic_hardreset(struct ata_link *link, unsigned int *class,
+ void __iomem *port_base = inic_port_base(ap);
+ void __iomem *idma_ctl = port_base + PORT_IDMA_CTL;
+ const unsigned long *timing = sata_ehc_deb_timing(&link->eh_context);
+- u16 val;
+ int rc;
+
+ /* hammer it into sane state */
+ inic_reset_port(port_base);
+
+- val = readw(idma_ctl);
+- writew(val | IDMA_CTL_RST_ATA, idma_ctl);
++ writew(IDMA_CTL_RST_ATA, idma_ctl);
+ readw(idma_ctl); /* flush */
+ msleep(1);
+- writew(val & ~IDMA_CTL_RST_ATA, idma_ctl);
++ writew(0, idma_ctl);
+
+ rc = sata_link_resume(link, timing, deadline);
+ if (rc) {
+@@ -418,7 +638,7 @@ static int inic_hardreset(struct ata_link *link, unsigned int *class,
+ struct ata_taskfile tf;
+
+ /* wait for link to become ready */
+- rc = ata_sff_wait_after_reset(link, 1, deadline);
++ rc = ata_wait_after_reset(link, deadline, inic_check_ready);
+ /* link occupied, -ENODEV too is an error */
+ if (rc) {
+ ata_link_printk(link, KERN_WARNING, "device not ready "
+@@ -426,7 +646,7 @@ static int inic_hardreset(struct ata_link *link, unsigned int *class,
+ return rc;
+ }
+
+- ata_sff_tf_read(ap, &tf);
++ inic_tf_read(ap, &tf);
+ *class = ata_dev_classify(&tf);
+ }
+
+@@ -436,18 +656,8 @@ static int inic_hardreset(struct ata_link *link, unsigned int *class,
+ static void inic_error_handler(struct ata_port *ap)
+ {
+ void __iomem *port_base = inic_port_base(ap);
+- struct inic_port_priv *pp = ap->private_data;
+- unsigned long flags;
+
+- /* reset PIO HSM and stop DMA engine */
+ inic_reset_port(port_base);
+-
+- spin_lock_irqsave(ap->lock, flags);
+- ap->hsm_task_state = HSM_ST_IDLE;
+- writeb(pp->dfl_prdctl, port_base + PORT_PRD_CTL);
+- spin_unlock_irqrestore(ap->lock, flags);
+-
+- /* PIO and DMA engines have been stopped, perform recovery */
+ ata_std_error_handler(ap);
+ }
+
+@@ -458,26 +668,18 @@ static void inic_post_internal_cmd(struct ata_queued_cmd *qc)
+ inic_reset_port(inic_port_base(qc->ap));
+ }
+
+-static void inic_dev_config(struct ata_device *dev)
+-{
+- /* inic can only handle upto LBA28 max sectors */
+- if (dev->max_sectors > ATA_MAX_SECTORS)
+- dev->max_sectors = ATA_MAX_SECTORS;
+-
+- if (dev->n_sectors >= 1 << 28) {
+- ata_dev_printk(dev, KERN_ERR,
+- "ERROR: This driver doesn't support LBA48 yet and may cause\n"
+- " data corruption on such devices. Disabling.\n");
+- ata_dev_disable(dev);
+- }
+-}
+-
+ static void init_port(struct ata_port *ap)
+ {
+ void __iomem *port_base = inic_port_base(ap);
++ struct inic_port_priv *pp = ap->private_data;
+
+- /* Setup PRD address */
++ /* clear packet and CPB table */
++ memset(pp->pkt, 0, sizeof(struct inic_pkt));
++ memset(pp->cpb_tbl, 0, IDMA_CPB_TBL_SIZE);
++
++ /* setup PRD and CPB lookup table addresses */
+ writel(ap->prd_dma, port_base + PORT_PRD_ADDR);
++ writel(pp->cpb_tbl_dma, port_base + PORT_CPB_CPBLAR);
+ }
+
+ static int inic_port_resume(struct ata_port *ap)
+@@ -488,28 +690,30 @@ static int inic_port_resume(struct ata_port *ap)
+
+ static int inic_port_start(struct ata_port *ap)
+ {
+- void __iomem *port_base = inic_port_base(ap);
++ struct device *dev = ap->host->dev;
+ struct inic_port_priv *pp;
+- u8 tmp;
+ int rc;
+
+ /* alloc and initialize private data */
+- pp = devm_kzalloc(ap->host->dev, sizeof(*pp), GFP_KERNEL);
++ pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
+ if (!pp)
+ return -ENOMEM;
+ ap->private_data = pp;
+
+- /* default PRD_CTL value, DMAEN, WR and START off */
+- tmp = readb(port_base + PORT_PRD_CTL);
+- tmp &= ~(PRD_CTL_DMAEN | PRD_CTL_WR | PRD_CTL_START);
+- pp->dfl_prdctl = tmp;
+-
+ /* Alloc resources */
+ rc = ata_port_start(ap);
+- if (rc) {
+- kfree(pp);
++ if (rc)
+ return rc;
+- }
++
++ pp->pkt = dmam_alloc_coherent(dev, sizeof(struct inic_pkt),
++ &pp->pkt_dma, GFP_KERNEL);
++ if (!pp->pkt)
++ return -ENOMEM;
++
++ pp->cpb_tbl = dmam_alloc_coherent(dev, IDMA_CPB_TBL_SIZE,
++ &pp->cpb_tbl_dma, GFP_KERNEL);
++ if (!pp->cpb_tbl)
++ return -ENOMEM;
+
+ init_port(ap);
+
+@@ -517,21 +721,18 @@ static int inic_port_start(struct ata_port *ap)
+ }
+
+ static struct ata_port_operations inic_port_ops = {
+- .inherits = &ata_sff_port_ops,
++ .inherits = &sata_port_ops,
+
+- .bmdma_setup = inic_bmdma_setup,
+- .bmdma_start = inic_bmdma_start,
+- .bmdma_stop = inic_bmdma_stop,
+- .bmdma_status = inic_bmdma_status,
++ .check_atapi_dma = inic_check_atapi_dma,
++ .qc_prep = inic_qc_prep,
+ .qc_issue = inic_qc_issue,
++ .qc_fill_rtf = inic_qc_fill_rtf,
+
+ .freeze = inic_freeze,
+ .thaw = inic_thaw,
+- .softreset = ATA_OP_NULL, /* softreset is broken */
+ .hardreset = inic_hardreset,
+ .error_handler = inic_error_handler,
+ .post_internal_cmd = inic_post_internal_cmd,
+- .dev_config = inic_dev_config,
+
+ .scr_read = inic_scr_read,
+ .scr_write = inic_scr_write,
+@@ -541,12 +742,6 @@ static struct ata_port_operations inic_port_ops = {
+ };
+
+ static struct ata_port_info inic_port_info = {
+- /* For some reason, ATAPI_PROT_PIO is broken on this
+- * controller, and no, PIO_POLLING does't fix it. It somehow
+- * manages to report the wrong ireason and ignoring ireason
+- * results in machine lock up. Tell libata to always prefer
+- * DMA.
+- */
+ .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA,
+ .pio_mask = 0x1f, /* pio0-4 */
+ .mwdma_mask = 0x07, /* mwdma0-2 */
+@@ -599,7 +794,6 @@ static int inic_pci_device_resume(struct pci_dev *pdev)
+ {
+ struct ata_host *host = dev_get_drvdata(&pdev->dev);
+ struct inic_host_priv *hpriv = host->private_data;
+- void __iomem *mmio_base = host->iomap[MMIO_BAR];
+ int rc;
+
+ rc = ata_pci_device_do_resume(pdev);
+@@ -607,7 +801,7 @@ static int inic_pci_device_resume(struct pci_dev *pdev)
+ return rc;
+
+ if (pdev->dev.power.power_state.event == PM_EVENT_SUSPEND) {
+- rc = init_controller(mmio_base, hpriv->cached_hctl);
++ rc = init_controller(hpriv->mmio_base, hpriv->cached_hctl);
+ if (rc)
+ return rc;
+ }
+@@ -625,6 +819,7 @@ static int inic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+ struct ata_host *host;
+ struct inic_host_priv *hpriv;
+ void __iomem * const *iomap;
++ int mmio_bar;
+ int i, rc;
+
+ if (!printed_version++)
+@@ -638,38 +833,31 @@ static int inic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+
+ host->private_data = hpriv;
+
+- /* acquire resources and fill host */
++ /* Acquire resources and fill host. Note that PCI and cardbus
++ * use different BARs.
++ */
+ rc = pcim_enable_device(pdev);
+ if (rc)
+ return rc;
+
+- rc = pcim_iomap_regions(pdev, 0x3f, DRV_NAME);
++ if (pci_resource_flags(pdev, MMIO_BAR_PCI) & IORESOURCE_MEM)
++ mmio_bar = MMIO_BAR_PCI;
++ else
++ mmio_bar = MMIO_BAR_CARDBUS;
++
++ rc = pcim_iomap_regions(pdev, 1 << mmio_bar, DRV_NAME);
+ if (rc)
+ return rc;
+ host->iomap = iomap = pcim_iomap_table(pdev);
++ hpriv->mmio_base = iomap[mmio_bar];
++ hpriv->cached_hctl = readw(hpriv->mmio_base + HOST_CTL);
+
+ for (i = 0; i < NR_PORTS; i++) {
+ struct ata_port *ap = host->ports[i];
+- struct ata_ioports *port = &ap->ioaddr;
+- unsigned int offset = i * PORT_SIZE;
+-
+- port->cmd_addr = iomap[2 * i];
+- port->altstatus_addr =
+- port->ctl_addr = (void __iomem *)
+- ((unsigned long)iomap[2 * i + 1] | ATA_PCI_CTL_OFS);
+- port->scr_addr = iomap[MMIO_BAR] + offset + PORT_SCR;
+-
+- ata_sff_std_ports(port);
+-
+- ata_port_pbar_desc(ap, MMIO_BAR, -1, "mmio");
+- ata_port_pbar_desc(ap, MMIO_BAR, offset, "port");
+- ata_port_desc(ap, "cmd 0x%llx ctl 0x%llx",
+- (unsigned long long)pci_resource_start(pdev, 2 * i),
+- (unsigned long long)pci_resource_start(pdev, (2 * i + 1)) |
+- ATA_PCI_CTL_OFS);
+- }
+
+- hpriv->cached_hctl = readw(iomap[MMIO_BAR] + HOST_CTL);
++ ata_port_pbar_desc(ap, mmio_bar, -1, "mmio");
++ ata_port_pbar_desc(ap, mmio_bar, i * PORT_SIZE, "port");
++ }
+
+ /* Set dma_mask. This device doesn't support 64-bit addressing. */
+ rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+@@ -698,7 +886,7 @@ static int inic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+ return rc;
+ }
+
+- rc = init_controller(iomap[MMIO_BAR], hpriv->cached_hctl);
++ rc = init_controller(hpriv->mmio_base, hpriv->cached_hctl);
+ if (rc) {
+ dev_printk(KERN_ERR, &pdev->dev,
+ "failed to initialize controller\n");
+diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
+index 842b1a1..bb73b22 100644
+--- a/drivers/ata/sata_mv.c
++++ b/drivers/ata/sata_mv.c
+@@ -65,6 +65,7 @@
+ #include <linux/platform_device.h>
+ #include <linux/ata_platform.h>
+ #include <linux/mbus.h>
++#include <linux/bitops.h>
+ #include <scsi/scsi_host.h>
+ #include <scsi/scsi_cmnd.h>
+ #include <scsi/scsi_device.h>
+@@ -91,9 +92,9 @@ enum {
+ MV_IRQ_COAL_TIME_THRESHOLD = (MV_IRQ_COAL_REG_BASE + 0xd0),
+
+ MV_SATAHC0_REG_BASE = 0x20000,
+- MV_FLASH_CTL = 0x1046c,
+- MV_GPIO_PORT_CTL = 0x104f0,
+- MV_RESET_CFG = 0x180d8,
++ MV_FLASH_CTL_OFS = 0x1046c,
++ MV_GPIO_PORT_CTL_OFS = 0x104f0,
++ MV_RESET_CFG_OFS = 0x180d8,
+
+ MV_PCI_REG_SZ = MV_MAJOR_REG_AREA_SZ,
+ MV_SATAHC_REG_SZ = MV_MAJOR_REG_AREA_SZ,
+@@ -147,18 +148,21 @@ enum {
+ /* PCI interface registers */
+
+ PCI_COMMAND_OFS = 0xc00,
++ PCI_COMMAND_MRDTRIG = (1 << 7), /* PCI Master Read Trigger */
+
+ PCI_MAIN_CMD_STS_OFS = 0xd30,
+ STOP_PCI_MASTER = (1 << 2),
+ PCI_MASTER_EMPTY = (1 << 3),
+ GLOB_SFT_RST = (1 << 4),
+
+- MV_PCI_MODE = 0xd00,
++ MV_PCI_MODE_OFS = 0xd00,
++ MV_PCI_MODE_MASK = 0x30,
++
+ MV_PCI_EXP_ROM_BAR_CTL = 0xd2c,
+ MV_PCI_DISC_TIMER = 0xd04,
+ MV_PCI_MSI_TRIGGER = 0xc38,
+ MV_PCI_SERR_MASK = 0xc28,
+- MV_PCI_XBAR_TMOUT = 0x1d04,
++ MV_PCI_XBAR_TMOUT_OFS = 0x1d04,
+ MV_PCI_ERR_LOW_ADDRESS = 0x1d40,
+ MV_PCI_ERR_HIGH_ADDRESS = 0x1d44,
+ MV_PCI_ERR_ATTRIBUTE = 0x1d48,
+@@ -225,16 +229,18 @@ enum {
+ PHY_MODE4 = 0x314,
+ PHY_MODE2 = 0x330,
+ SATA_IFCTL_OFS = 0x344,
++ SATA_TESTCTL_OFS = 0x348,
+ SATA_IFSTAT_OFS = 0x34c,
+ VENDOR_UNIQUE_FIS_OFS = 0x35c,
+
+- FIS_CFG_OFS = 0x360,
+- FIS_CFG_SINGLE_SYNC = (1 << 16), /* SYNC on DMA activation */
++ FISCFG_OFS = 0x360,
++ FISCFG_WAIT_DEV_ERR = (1 << 8), /* wait for host on DevErr */
++ FISCFG_SINGLE_SYNC = (1 << 16), /* SYNC on DMA activation */
+
+ MV5_PHY_MODE = 0x74,
+- MV5_LT_MODE = 0x30,
+- MV5_PHY_CTL = 0x0C,
+- SATA_INTERFACE_CFG = 0x050,
++ MV5_LTMODE_OFS = 0x30,
++ MV5_PHY_CTL_OFS = 0x0C,
++ SATA_INTERFACE_CFG_OFS = 0x050,
+
+ MV_M2_PREAMP_MASK = 0x7e0,
+
+@@ -332,10 +338,16 @@ enum {
+ EDMA_CMD_OFS = 0x28, /* EDMA command register */
+ EDMA_EN = (1 << 0), /* enable EDMA */
+ EDMA_DS = (1 << 1), /* disable EDMA; self-negated */
+- ATA_RST = (1 << 2), /* reset trans/link/phy */
++ EDMA_RESET = (1 << 2), /* reset eng/trans/link/phy */
++
++ EDMA_STATUS_OFS = 0x30, /* EDMA engine status */
++ EDMA_STATUS_CACHE_EMPTY = (1 << 6), /* GenIIe command cache empty */
++ EDMA_STATUS_IDLE = (1 << 7), /* GenIIe EDMA enabled/idle */
+
+- EDMA_IORDY_TMOUT = 0x34,
+- EDMA_ARB_CFG = 0x38,
++ EDMA_IORDY_TMOUT_OFS = 0x34,
++ EDMA_ARB_CFG_OFS = 0x38,
++
++ EDMA_HALTCOND_OFS = 0x60, /* GenIIe halt conditions */
+
+ GEN_II_NCQ_MAX_SECTORS = 256, /* max sects/io on Gen2 w/NCQ */
+
+@@ -350,15 +362,19 @@ enum {
+ MV_HP_GEN_II = (1 << 7), /* Generation II: 60xx */
+ MV_HP_GEN_IIE = (1 << 8), /* Generation IIE: 6042/7042 */
+ MV_HP_PCIE = (1 << 9), /* PCIe bus/regs: 7042 */
++ MV_HP_CUT_THROUGH = (1 << 10), /* can use EDMA cut-through */
+
+ /* Port private flags (pp_flags) */
+ MV_PP_FLAG_EDMA_EN = (1 << 0), /* is EDMA engine enabled? */
+ MV_PP_FLAG_NCQ_EN = (1 << 1), /* is EDMA set up for NCQ? */
++ MV_PP_FLAG_FBS_EN = (1 << 2), /* is EDMA set up for FBS? */
++ MV_PP_FLAG_DELAYED_EH = (1 << 3), /* delayed dev err handling */
+ };
+
+ #define IS_GEN_I(hpriv) ((hpriv)->hp_flags & MV_HP_GEN_I)
+ #define IS_GEN_II(hpriv) ((hpriv)->hp_flags & MV_HP_GEN_II)
+ #define IS_GEN_IIE(hpriv) ((hpriv)->hp_flags & MV_HP_GEN_IIE)
++#define IS_PCIE(hpriv) ((hpriv)->hp_flags & MV_HP_PCIE)
+ #define HAS_PCI(host) (!((host)->ports[0]->flags & MV_FLAG_SOC))
+
+ #define WINDOW_CTRL(i) (0x20030 + ((i) << 4))
+@@ -433,6 +449,7 @@ struct mv_port_priv {
+ unsigned int resp_idx;
+
+ u32 pp_flags;
++ unsigned int delayed_eh_pmp_map;
+ };
+
+ struct mv_port_signal {
+@@ -479,6 +496,7 @@ static int mv5_scr_read(struct ata_port *ap, unsigned int sc_reg_in, u32 *val);
+ static int mv5_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val);
+ static int mv_port_start(struct ata_port *ap);
+ static void mv_port_stop(struct ata_port *ap);
++static int mv_qc_defer(struct ata_queued_cmd *qc);
+ static void mv_qc_prep(struct ata_queued_cmd *qc);
+ static void mv_qc_prep_iie(struct ata_queued_cmd *qc);
+ static unsigned int mv_qc_issue(struct ata_queued_cmd *qc);
+@@ -527,6 +545,9 @@ static int mv_pmp_hardreset(struct ata_link *link, unsigned int *class,
+ unsigned long deadline);
+ static int mv_softreset(struct ata_link *link, unsigned int *class,
+ unsigned long deadline);
++static void mv_pmp_error_handler(struct ata_port *ap);
++static void mv_process_crpb_entries(struct ata_port *ap,
++ struct mv_port_priv *pp);
+
+ /* .sg_tablesize is (MV_MAX_SG_CT / 2) in the structures below
+ * because we have to allow room for worst case splitting of
+@@ -548,6 +569,7 @@ static struct scsi_host_template mv6_sht = {
+ static struct ata_port_operations mv5_ops = {
+ .inherits = &ata_sff_port_ops,
+
++ .qc_defer = mv_qc_defer,
+ .qc_prep = mv_qc_prep,
+ .qc_issue = mv_qc_issue,
+
+@@ -566,7 +588,6 @@ static struct ata_port_operations mv5_ops = {
+
+ static struct ata_port_operations mv6_ops = {
+ .inherits = &mv5_ops,
+- .qc_defer = sata_pmp_qc_defer_cmd_switch,
+ .dev_config = mv6_dev_config,
+ .scr_read = mv_scr_read,
+ .scr_write = mv_scr_write,
+@@ -574,12 +595,11 @@ static struct ata_port_operations mv6_ops = {
+ .pmp_hardreset = mv_pmp_hardreset,
+ .pmp_softreset = mv_softreset,
+ .softreset = mv_softreset,
+- .error_handler = sata_pmp_error_handler,
++ .error_handler = mv_pmp_error_handler,
+ };
+
+ static struct ata_port_operations mv_iie_ops = {
+ .inherits = &mv6_ops,
+- .qc_defer = ata_std_qc_defer, /* FIS-based switching */
+ .dev_config = ATA_OP_NULL,
+ .qc_prep = mv_qc_prep_iie,
+ };
+@@ -875,6 +895,29 @@ static void mv_start_dma(struct ata_port *ap, void __iomem *port_mmio,
+ }
+ }
+
++static void mv_wait_for_edma_empty_idle(struct ata_port *ap)
++{
++ void __iomem *port_mmio = mv_ap_base(ap);
++ const u32 empty_idle = (EDMA_STATUS_CACHE_EMPTY | EDMA_STATUS_IDLE);
++ const int per_loop = 5, timeout = (15 * 1000 / per_loop);
++ int i;
++
++ /*
++ * Wait for the EDMA engine to finish transactions in progress.
++ * No idea what a good "timeout" value might be, but measurements
++ * indicate that it often requires hundreds of microseconds
++ * with two drives in-use. So we use the 15msec value above
++ * as a rough guess at what even more drives might require.
++ */
++ for (i = 0; i < timeout; ++i) {
++ u32 edma_stat = readl(port_mmio + EDMA_STATUS_OFS);
++ if ((edma_stat & empty_idle) == empty_idle)
++ break;
++ udelay(per_loop);
++ }
++ /* ata_port_printk(ap, KERN_INFO, "%s: %u+ usecs\n", __func__, i); */
++}
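
The loop above is a bounded poll: at 5 us per probe and a 15 ms budget,
timeout works out to 15 * 1000 / 5 = 3000 iterations, and the early break
means a healthy port costs only the few hundred microseconds the comment
mentions. The same shape, stripped of MMIO; the status source is a stub
standing in for the EDMA_STATUS_OFS read.

	#include <stdio.h>
	#include <stdint.h>

	static uint32_t read_status(int probe)	/* stub for the status read */
	{
		return probe > 40 ? 0xc0 : 0;	/* bits 6+7 show up after ~200 us */
	}

	int main(void)
	{
		const uint32_t empty_idle = (1 << 6) | (1 << 7);
		const int per_loop = 5, timeout = 15 * 1000 / per_loop;
		int i;

		for (i = 0; i < timeout; ++i) {
			if ((read_status(i) & empty_idle) == empty_idle)
				break;
			/* udelay(per_loop) here in the driver */
		}
		printf("polled %d times (~%d us of a %d us budget)\n",
		       i, i * per_loop, timeout * per_loop);
		return 0;
	}
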
++
+ /**
+ * mv_stop_edma_engine - Disable eDMA engine
+ * @port_mmio: io base address
+@@ -907,6 +950,7 @@ static int mv_stop_edma(struct ata_port *ap)
+ if (!(pp->pp_flags & MV_PP_FLAG_EDMA_EN))
+ return 0;
+ pp->pp_flags &= ~MV_PP_FLAG_EDMA_EN;
++ mv_wait_for_edma_empty_idle(ap);
+ if (mv_stop_edma_engine(port_mmio)) {
+ ata_port_printk(ap, KERN_ERR, "Unable to stop eDMA\n");
+ return -EIO;
+@@ -1057,26 +1101,95 @@ static void mv6_dev_config(struct ata_device *adev)
+ }
+ }
+
+-static void mv_config_fbs(void __iomem *port_mmio, int enable_fbs)
++static int mv_qc_defer(struct ata_queued_cmd *qc)
+ {
+- u32 old_fcfg, new_fcfg, old_ltmode, new_ltmode;
++ struct ata_link *link = qc->dev->link;
++ struct ata_port *ap = link->ap;
++ struct mv_port_priv *pp = ap->private_data;
++
++ /*
++ * Don't allow new commands if we're in a delayed EH state
++ * for NCQ and/or FIS-based switching.
++ */
++ if (pp->pp_flags & MV_PP_FLAG_DELAYED_EH)
++ return ATA_DEFER_PORT;
+ /*
+- * Various bit settings required for operation
+- * in FIS-based switching (fbs) mode on GenIIe:
++ * If the port is completely idle, then allow the new qc.
+ */
+- old_fcfg = readl(port_mmio + FIS_CFG_OFS);
+- old_ltmode = readl(port_mmio + LTMODE_OFS);
+- if (enable_fbs) {
+- new_fcfg = old_fcfg | FIS_CFG_SINGLE_SYNC;
+- new_ltmode = old_ltmode | LTMODE_BIT8;
+- } else { /* disable fbs */
+- new_fcfg = old_fcfg & ~FIS_CFG_SINGLE_SYNC;
+- new_ltmode = old_ltmode & ~LTMODE_BIT8;
+- }
+- if (new_fcfg != old_fcfg)
+- writelfl(new_fcfg, port_mmio + FIS_CFG_OFS);
++ if (ap->nr_active_links == 0)
++ return 0;
++
++ if (pp->pp_flags & MV_PP_FLAG_EDMA_EN) {
++ /*
++ * The port is operating in host queuing mode (EDMA).
++ * It can accomodate a new qc if the qc protocol
++ * is compatible with the current host queue mode.
++ */
++ if (pp->pp_flags & MV_PP_FLAG_NCQ_EN) {
++ /*
++ * The host queue (EDMA) is in NCQ mode.
++ * If the new qc is also an NCQ command,
++ * then allow the new qc.
++ */
++ if (qc->tf.protocol == ATA_PROT_NCQ)
++ return 0;
++ } else {
++ /*
++ * The host queue (EDMA) is in non-NCQ, DMA mode.
++ * If the new qc is also a non-NCQ, DMA command,
++ * then allow the new qc.
++ */
++ if (qc->tf.protocol == ATA_PROT_DMA)
++ return 0;
++ }
++ }
++ return ATA_DEFER_PORT;
++}
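
mv_qc_defer() boils down to a small decision table: always defer while
delayed EH is pending, always allow on an idle port, and with EDMA running
allow only commands whose protocol matches the queue mode. The table,
encoded stand-alone; names below are local stand-ins for the driver's flags
and protocol constants.

	#include <stdio.h>
	#include <stdbool.h>

	enum prot { PROT_NCQ, PROT_DMA, PROT_OTHER };	/* local stand-ins */

	static bool defer(bool delayed_eh, int nr_active_links,
			  bool edma_en, bool ncq_mode, enum prot p)
	{
		if (delayed_eh)
			return true;	/* hold everything during delayed EH */
		if (nr_active_links == 0)
			return false;	/* idle port: anything goes */
		if (edma_en && ncq_mode && p == PROT_NCQ)
			return false;	/* NCQ queue accepts NCQ */
		if (edma_en && !ncq_mode && p == PROT_DMA)
			return false;	/* plain DMA queue accepts plain DMA */
		return true;		/* all other mixes defer */
	}

	int main(void)
	{
		printf("NCQ into NCQ queue:   %s\n",
		       defer(false, 2, true, true, PROT_NCQ) ? "defer" : "allow");
		printf("other into NCQ queue: %s\n",
		       defer(false, 2, true, true, PROT_OTHER) ? "defer" : "allow");
		return 0;
	}
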
++
++static void mv_config_fbs(void __iomem *port_mmio, int want_ncq, int want_fbs)
++{
++ u32 new_fiscfg, old_fiscfg;
++ u32 new_ltmode, old_ltmode;
++ u32 new_haltcond, old_haltcond;
++
++ old_fiscfg = readl(port_mmio + FISCFG_OFS);
++ old_ltmode = readl(port_mmio + LTMODE_OFS);
++ old_haltcond = readl(port_mmio + EDMA_HALTCOND_OFS);
++
++ new_fiscfg = old_fiscfg & ~(FISCFG_SINGLE_SYNC | FISCFG_WAIT_DEV_ERR);
++ new_ltmode = old_ltmode & ~LTMODE_BIT8;
++ new_haltcond = old_haltcond | EDMA_ERR_DEV;
++
++ if (want_fbs) {
++ new_fiscfg = old_fiscfg | FISCFG_SINGLE_SYNC;
++ new_ltmode = old_ltmode | LTMODE_BIT8;
++ if (want_ncq)
++ new_haltcond &= ~EDMA_ERR_DEV;
++ else
++ new_fiscfg |= FISCFG_WAIT_DEV_ERR;
++ }
++
++ if (new_fiscfg != old_fiscfg)
++ writelfl(new_fiscfg, port_mmio + FISCFG_OFS);
+ if (new_ltmode != old_ltmode)
+ writelfl(new_ltmode, port_mmio + LTMODE_OFS);
++ if (new_haltcond != old_haltcond)
++ writelfl(new_haltcond, port_mmio + EDMA_HALTCOND_OFS);
++}
++
++static void mv_60x1_errata_sata25(struct ata_port *ap, int want_ncq)
++{
++ struct mv_host_priv *hpriv = ap->host->private_data;
++ u32 old, new;
++
++ /* workaround for 88SX60x1 FEr SATA#25 (part 1) */
++ old = readl(hpriv->base + MV_GPIO_PORT_CTL_OFS);
++ if (want_ncq)
++ new = old | (1 << 22);
++ else
++ new = old & ~(1 << 22);
++ if (new != old)
++ writel(new, hpriv->base + MV_GPIO_PORT_CTL_OFS);
+ }
+
+ static void mv_edma_cfg(struct ata_port *ap, int want_ncq)
+@@ -1088,25 +1201,40 @@ static void mv_edma_cfg(struct ata_port *ap, int want_ncq)
+
+ /* set up non-NCQ EDMA configuration */
+ cfg = EDMA_CFG_Q_DEPTH; /* always 0x1f for *all* chips */
++ pp->pp_flags &= ~MV_PP_FLAG_FBS_EN;
+
+ if (IS_GEN_I(hpriv))
+ cfg |= (1 << 8); /* enab config burst size mask */
+
+- else if (IS_GEN_II(hpriv))
++ else if (IS_GEN_II(hpriv)) {
+ cfg |= EDMA_CFG_RD_BRST_EXT | EDMA_CFG_WR_BUFF_LEN;
++ mv_60x1_errata_sata25(ap, want_ncq);
+
+- else if (IS_GEN_IIE(hpriv)) {
+- cfg |= (1 << 23); /* do not mask PM field in rx'd FIS */
+- cfg |= (1 << 22); /* enab 4-entry host queue cache */
+- cfg |= (1 << 18); /* enab early completion */
+- cfg |= (1 << 17); /* enab cut-through (dis stor&forwrd) */
++ } else if (IS_GEN_IIE(hpriv)) {
++ int want_fbs = sata_pmp_attached(ap);
++ /*
++ * Possible future enhancement:
++ *
++ * The chip can use FBS with non-NCQ, if we allow it,
++ * But first we need to have the error handling in place
++ * for this mode (datasheet section 7.3.15.4.2.3).
++ * So disallow non-NCQ FBS for now.
++ */
++ want_fbs &= want_ncq;
++
++ mv_config_fbs(port_mmio, want_ncq, want_fbs);
+
+- if (want_ncq && sata_pmp_attached(ap)) {
++ if (want_fbs) {
++ pp->pp_flags |= MV_PP_FLAG_FBS_EN;
+ cfg |= EDMA_CFG_EDMA_FBS; /* FIS-based switching */
+- mv_config_fbs(port_mmio, 1);
+- } else {
+- mv_config_fbs(port_mmio, 0);
+ }
++
++ cfg |= (1 << 23); /* do not mask PM field in rx'd FIS */
++ cfg |= (1 << 22); /* enab 4-entry host queue cache */
++ if (HAS_PCI(ap->host))
++ cfg |= (1 << 18); /* enab early completion */
++ if (hpriv->hp_flags & MV_HP_CUT_THROUGH)
++ cfg |= (1 << 17); /* enab cut-thru (dis stor&forwrd) */
+ }
+
+ if (want_ncq) {
+@@ -1483,25 +1611,186 @@ static struct ata_queued_cmd *mv_get_active_qc(struct ata_port *ap)
+ return qc;
+ }
+
+-static void mv_unexpected_intr(struct ata_port *ap)
++static void mv_pmp_error_handler(struct ata_port *ap)
+ {
++ unsigned int pmp, pmp_map;
+ struct mv_port_priv *pp = ap->private_data;
+- struct ata_eh_info *ehi = &ap->link.eh_info;
+- char *when = "";
+
++ if (pp->pp_flags & MV_PP_FLAG_DELAYED_EH) {
++ /*
++ * Perform NCQ error analysis on failed PMPs
++ * before we freeze the port entirely.
++ *
++ * The failed PMPs are marked earlier by mv_pmp_eh_prep().
++ */
++ pmp_map = pp->delayed_eh_pmp_map;
++ pp->pp_flags &= ~MV_PP_FLAG_DELAYED_EH;
++ for (pmp = 0; pmp_map != 0; pmp++) {
++ unsigned int this_pmp = (1 << pmp);
++ if (pmp_map & this_pmp) {
++ struct ata_link *link = &ap->pmp_link[pmp];
++ pmp_map &= ~this_pmp;
++ ata_eh_analyze_ncq_error(link);
++ }
++ }
++ ata_port_freeze(ap);
++ }
++ sata_pmp_error_handler(ap);
++}
++
++static unsigned int mv_get_err_pmp_map(struct ata_port *ap)
++{
++ void __iomem *port_mmio = mv_ap_base(ap);
++
++ return readl(port_mmio + SATA_TESTCTL_OFS) >> 16;
++}
++
++static void mv_pmp_eh_prep(struct ata_port *ap, unsigned int pmp_map)
++{
++ struct ata_eh_info *ehi;
++ unsigned int pmp;
++
++ /*
++ * Initialize EH info for PMPs which saw device errors
++ */
++ ehi = &ap->link.eh_info;
++ for (pmp = 0; pmp_map != 0; pmp++) {
++ unsigned int this_pmp = (1 << pmp);
++ if (pmp_map & this_pmp) {
++ struct ata_link *link = &ap->pmp_link[pmp];
++
++ pmp_map &= ~this_pmp;
++ ehi = &link->eh_info;
++ ata_ehi_clear_desc(ehi);
++ ata_ehi_push_desc(ehi, "dev err");
++ ehi->err_mask |= AC_ERR_DEV;
++ ehi->action |= ATA_EH_RESET;
++ ata_link_abort(link);
++ }
++ }
++}
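
Both mv_pmp_eh_prep() above and mv_pmp_error_handler() earlier walk pmp_map
the same way: test a bit, clear it, continue until the mask empties, with
hweight16() counting the failed links. The walk in isolation; hweight is
approximated here with a popcount builtin.

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint16_t pmp_map = 0x0015;	/* PMP links 0, 2 and 4 saw errors */
		unsigned int pmp;

		printf("failed_links = %d\n", __builtin_popcount(pmp_map));

		for (pmp = 0; pmp_map != 0; pmp++) {
			uint16_t this_pmp = 1 << pmp;
			if (pmp_map & this_pmp) {
				pmp_map &= ~this_pmp;
				printf("prep EH for pmp link %u\n", pmp);
			}
		}
		return 0;
	}
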
++
++static int mv_handle_fbs_ncq_dev_err(struct ata_port *ap)
++{
++ struct mv_port_priv *pp = ap->private_data;
++ int failed_links;
++ unsigned int old_map, new_map;
++
++ /*
++ * Device error during FBS+NCQ operation:
++ *
++ * Set a port flag to prevent further I/O being enqueued.
++ * Leave the EDMA running to drain outstanding commands from this port.
++ * Perform the post-mortem/EH only when all responses are complete.
++ * Follow recovery sequence from 6042/7042 datasheet (7.3.15.4.2.2).
++ */
++ if (!(pp->pp_flags & MV_PP_FLAG_DELAYED_EH)) {
++ pp->pp_flags |= MV_PP_FLAG_DELAYED_EH;
++ pp->delayed_eh_pmp_map = 0;
++ }
++ old_map = pp->delayed_eh_pmp_map;
++ new_map = old_map | mv_get_err_pmp_map(ap);
++
++ if (old_map != new_map) {
++ pp->delayed_eh_pmp_map = new_map;
++ mv_pmp_eh_prep(ap, new_map & ~old_map);
++ }
++ failed_links = hweight16(new_map);
++
++ ata_port_printk(ap, KERN_INFO, "%s: pmp_map=%04x qc_map=%04x "
++ "failed_links=%d nr_active_links=%d\n",
++ __func__, pp->delayed_eh_pmp_map,
++ ap->qc_active, failed_links,
++ ap->nr_active_links);
++
++ if (ap->nr_active_links <= failed_links) {
++ mv_process_crpb_entries(ap, pp);
++ mv_stop_edma(ap);
++ mv_eh_freeze(ap);
++ ata_port_printk(ap, KERN_INFO, "%s: done\n", __func__);
++ return 1; /* handled */
++ }
++ ata_port_printk(ap, KERN_INFO, "%s: waiting\n", __func__);
++ return 1; /* handled */
++}
++
++static int mv_handle_fbs_non_ncq_dev_err(struct ata_port *ap)
++{
+ /*
+- * We got a device interrupt from something that
+- * was supposed to be using EDMA or polling.
++ * Possible future enhancement:
++ *
++ * FBS+non-NCQ operation is not yet implemented.
++ * See related notes in mv_edma_cfg().
++ *
++ * Device error during FBS+non-NCQ operation:
++ *
++ * We need to snapshot the shadow registers for each failed command.
++ * Follow recovery sequence from 6042/7042 datasheet (7.3.15.4.2.3).
+ */
++ return 0; /* not handled */
++}
++
++static int mv_handle_dev_err(struct ata_port *ap, u32 edma_err_cause)
++{
++ struct mv_port_priv *pp = ap->private_data;
++
++ if (!(pp->pp_flags & MV_PP_FLAG_EDMA_EN))
++ return 0; /* EDMA was not active: not handled */
++ if (!(pp->pp_flags & MV_PP_FLAG_FBS_EN))
++ return 0; /* FBS was not active: not handled */
++
++ if (!(edma_err_cause & EDMA_ERR_DEV))
++ return 0; /* non DEV error: not handled */
++ edma_err_cause &= ~EDMA_ERR_IRQ_TRANSIENT;
++ if (edma_err_cause & ~(EDMA_ERR_DEV | EDMA_ERR_SELF_DIS))
++ return 0; /* other problems: not handled */
++
++ if (pp->pp_flags & MV_PP_FLAG_NCQ_EN) {
++ /*
++ * EDMA should NOT have self-disabled for this case.
++ * If it did, then something is wrong elsewhere,
++ * and we cannot handle it here.
++ */
++ if (edma_err_cause & EDMA_ERR_SELF_DIS) {
++ ata_port_printk(ap, KERN_WARNING,
++ "%s: err_cause=0x%x pp_flags=0x%x\n",
++ __func__, edma_err_cause, pp->pp_flags);
++ return 0; /* not handled */
++ }
++ return mv_handle_fbs_ncq_dev_err(ap);
++ } else {
++ /*
++ * EDMA should have self-disabled for this case.
++ * If it did not, then something is wrong elsewhere,
++ * and we cannot handle it here.
++ */
++ if (!(edma_err_cause & EDMA_ERR_SELF_DIS)) {
++ ata_port_printk(ap, KERN_WARNING,
++ "%s: err_cause=0x%x pp_flags=0x%x\n",
++ __func__, edma_err_cause, pp->pp_flags);
++ return 0; /* not handled */
++ }
++ return mv_handle_fbs_non_ncq_dev_err(ap);
++ }
++ return 0; /* not handled */
++}
++
++static void mv_unexpected_intr(struct ata_port *ap, int edma_was_enabled)
++{
++ struct ata_eh_info *ehi = &ap->link.eh_info;
++ char *when = "idle";
++
+ ata_ehi_clear_desc(ehi);
+- if (pp->pp_flags & MV_PP_FLAG_EDMA_EN) {
+- when = " while EDMA enabled";
++ if (!ap || (ap->flags & ATA_FLAG_DISABLED)) {
++ when = "disabled";
++ } else if (edma_was_enabled) {
++ when = "EDMA enabled";
+ } else {
+ struct ata_queued_cmd *qc = ata_qc_from_tag(ap, ap->link.active_tag);
+ if (qc && (qc->tf.flags & ATA_TFLAG_POLLING))
+- when = " while polling";
++ when = "polling";
+ }
+- ata_ehi_push_desc(ehi, "unexpected device interrupt%s", when);
++ ata_ehi_push_desc(ehi, "unexpected device interrupt while %s", when);
+ ehi->err_mask |= AC_ERR_OTHER;
+ ehi->action |= ATA_EH_RESET;
+ ata_port_freeze(ap);
+@@ -1519,7 +1808,7 @@ static void mv_unexpected_intr(struct ata_port *ap)
+ * LOCKING:
+ * Inherited from caller.
+ */
+-static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
++static void mv_err_intr(struct ata_port *ap)
+ {
+ void __iomem *port_mmio = mv_ap_base(ap);
+ u32 edma_err_cause, eh_freeze_mask, serr = 0;
+@@ -1527,24 +1816,42 @@ static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
+ struct mv_host_priv *hpriv = ap->host->private_data;
+ unsigned int action = 0, err_mask = 0;
+ struct ata_eh_info *ehi = &ap->link.eh_info;
+-
+- ata_ehi_clear_desc(ehi);
++ struct ata_queued_cmd *qc;
++ int abort = 0;
+
+ /*
+- * Read and clear the err_cause bits. This won't actually
+- * clear for some errors (eg. SError), but we will be doing
+- * a hard reset in those cases regardless, which *will* clear it.
++ * Read and clear the SError and err_cause bits.
+ */
++ sata_scr_read(&ap->link, SCR_ERROR, &serr);
++ sata_scr_write_flush(&ap->link, SCR_ERROR, serr);
++
+ edma_err_cause = readl(port_mmio + EDMA_ERR_IRQ_CAUSE_OFS);
+ writelfl(~edma_err_cause, port_mmio + EDMA_ERR_IRQ_CAUSE_OFS);
+
+- ata_ehi_push_desc(ehi, "edma_err_cause=%08x", edma_err_cause);
++ ata_port_printk(ap, KERN_INFO, "%s: err_cause=%08x pp_flags=0x%x\n",
++ __func__, edma_err_cause, pp->pp_flags);
++
++ if (edma_err_cause & EDMA_ERR_DEV) {
++ /*
++ * Device errors during FIS-based switching operation
++ * require special handling.
++ */
++ if (mv_handle_dev_err(ap, edma_err_cause))
++ return;
++ }
+
++ qc = mv_get_active_qc(ap);
++ ata_ehi_clear_desc(ehi);
++ ata_ehi_push_desc(ehi, "edma_err_cause=%08x pp_flags=%08x",
++ edma_err_cause, pp->pp_flags);
+ /*
+ * All generations share these EDMA error cause bits:
+ */
+- if (edma_err_cause & EDMA_ERR_DEV)
++ if (edma_err_cause & EDMA_ERR_DEV) {
+ err_mask |= AC_ERR_DEV;
++ action |= ATA_EH_RESET;
++ ata_ehi_push_desc(ehi, "dev error");
++ }
+ if (edma_err_cause & (EDMA_ERR_D_PAR | EDMA_ERR_PRD_PAR |
+ EDMA_ERR_CRQB_PAR | EDMA_ERR_CRPB_PAR |
+ EDMA_ERR_INTRL_PAR)) {
+@@ -1576,13 +1883,6 @@ static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
+ ata_ehi_push_desc(ehi, "EDMA self-disable");
+ }
+ if (edma_err_cause & EDMA_ERR_SERR) {
+- /*
+- * Ensure that we read our own SCR, not a pmp link SCR:
+- */
+- ap->ops->scr_read(ap, SCR_ERROR, &serr);
+- /*
+- * Don't clear SError here; leave it for libata-eh:
+- */
+ ata_ehi_push_desc(ehi, "SError=%08x", serr);
+ err_mask |= AC_ERR_ATA_BUS;
+ action |= ATA_EH_RESET;
+@@ -1602,10 +1902,29 @@ static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
+ else
+ ehi->err_mask |= err_mask;
+
+- if (edma_err_cause & eh_freeze_mask)
++ if (err_mask == AC_ERR_DEV) {
++ /*
++ * Cannot do ata_port_freeze() here,
++ * because it would kill PIO access,
++ * which is needed for further diagnosis.
++ */
++ mv_eh_freeze(ap);
++ abort = 1;
++ } else if (edma_err_cause & eh_freeze_mask) {
++ /*
++ * Note to self: ata_port_freeze() calls ata_port_abort()
++ */
+ ata_port_freeze(ap);
+- else
+- ata_port_abort(ap);
++ } else {
++ abort = 1;
++ }
++
++ if (abort) {
++ if (qc)
++ ata_link_abort(qc->dev->link);
++ else
++ ata_port_abort(ap);
++ }
+ }
+
+ static void mv_process_crpb_response(struct ata_port *ap,
+@@ -1632,8 +1951,9 @@ static void mv_process_crpb_response(struct ata_port *ap,
+ }
+ }
+ ata_status = edma_status >> CRPB_FLAG_STATUS_SHIFT;
+- qc->err_mask |= ac_err_mask(ata_status);
+- ata_qc_complete(qc);
++ if (!ac_err_mask(ata_status))
++ ata_qc_complete(qc);
++ /* else: leave it for mv_err_intr() */
+ } else {
+ ata_port_printk(ap, KERN_ERR, "%s: no qc for tag=%d\n",
+ __func__, tag);
+@@ -1677,6 +1997,44 @@ static void mv_process_crpb_entries(struct ata_port *ap, struct mv_port_priv *pp
+ port_mmio + EDMA_RSP_Q_OUT_PTR_OFS);
+ }
+
++static void mv_port_intr(struct ata_port *ap, u32 port_cause)
++{
++ struct mv_port_priv *pp;
++ int edma_was_enabled;
++
++ if (!ap || (ap->flags & ATA_FLAG_DISABLED)) {
++ mv_unexpected_intr(ap, 0);
++ return;
++ }
++ /*
++ * Grab a snapshot of the EDMA_EN flag setting,
++ * so that we have a consistent view for this port,
++ * even if one of the routines we call changes it.
++ */
++ pp = ap->private_data;
++ edma_was_enabled = (pp->pp_flags & MV_PP_FLAG_EDMA_EN);
++ /*
++ * Process completed CRPB response(s) before other events.
++ */
++ if (edma_was_enabled && (port_cause & DONE_IRQ)) {
++ mv_process_crpb_entries(ap, pp);
++ if (pp->pp_flags & MV_PP_FLAG_DELAYED_EH)
++ mv_handle_fbs_ncq_dev_err(ap);
++ }
++ /*
++ * Handle chip-reported errors, or continue on to handle PIO.
++ */
++ if (unlikely(port_cause & ERR_IRQ)) {
++ mv_err_intr(ap);
++ } else if (!edma_was_enabled) {
++ struct ata_queued_cmd *qc = mv_get_active_qc(ap);
++ if (qc)
++ ata_sff_host_intr(ap, qc);
++ else
++ mv_unexpected_intr(ap, edma_was_enabled);
++ }
++}
++
+ /**
+ * mv_host_intr - Handle all interrupts on the given host controller
+ * @host: host specific structure
+@@ -1688,66 +2046,58 @@ static void mv_process_crpb_entries(struct ata_port *ap, struct mv_port_priv *pp
+ static int mv_host_intr(struct ata_host *host, u32 main_irq_cause)
+ {
+ struct mv_host_priv *hpriv = host->private_data;
+- void __iomem *mmio = hpriv->base, *hc_mmio = NULL;
+- u32 hc_irq_cause = 0;
++ void __iomem *mmio = hpriv->base, *hc_mmio;
+ unsigned int handled = 0, port;
+
+ for (port = 0; port < hpriv->n_ports; port++) {
+ struct ata_port *ap = host->ports[port];
+- struct mv_port_priv *pp;
+- unsigned int shift, hardport, port_cause;
+- /*
+- * When we move to the second hc, flag our cached
+- * copies of hc_mmio (and hc_irq_cause) as invalid again.
+- */
+- if (port == MV_PORTS_PER_HC)
+- hc_mmio = NULL;
+- /*
+- * Do nothing if port is not interrupting or is disabled:
+- */
++ unsigned int p, shift, hardport, port_cause;
++
+ MV_PORT_TO_SHIFT_AND_HARDPORT(port, shift, hardport);
+- port_cause = (main_irq_cause >> shift) & (DONE_IRQ | ERR_IRQ);
+- if (!port_cause || !ap || (ap->flags & ATA_FLAG_DISABLED))
+- continue;
+ /*
+- * Each hc within the host has its own hc_irq_cause register.
+- * We defer reading it until we know we need it, right now:
+- *
+- * FIXME later: we don't really need to read this register
+- * (some logic changes required below if we go that way),
+- * because it doesn't tell us anything new. But we do need
+- * to write to it, outside the top of this loop,
+- * to reset the interrupt triggers for next time.
++ * Each hc within the host has its own hc_irq_cause register,
++ * where the interrupting ports bits get ack'd.
+ */
+- if (!hc_mmio) {
++ if (hardport == 0) { /* first port on this hc ? */
++ u32 hc_cause = (main_irq_cause >> shift) & HC0_IRQ_PEND;
++ u32 port_mask, ack_irqs;
++ /*
++ * Skip this entire hc if nothing pending for any ports
++ */
++ if (!hc_cause) {
++ port += MV_PORTS_PER_HC - 1;
++ continue;
++ }
++ /*
++ * We don't need/want to read the hc_irq_cause register,
++ * because doing so hurts performance, and
++ * main_irq_cause already gives us everything we need.
++ *
++ * But we do have to *write* to the hc_irq_cause to ack
++ * the ports that we are handling this time through.
++ *
++ * This requires that we create a bitmap for those
++ * ports which interrupted us, and use that bitmap
++ * to ack (only) those ports via hc_irq_cause.
++ */
++ ack_irqs = 0;
++ for (p = 0; p < MV_PORTS_PER_HC; ++p) {
++ if ((port + p) >= hpriv->n_ports)
++ break;
++ port_mask = (DONE_IRQ | ERR_IRQ) << (p * 2);
++ if (hc_cause & port_mask)
++ ack_irqs |= (DMA_IRQ | DEV_IRQ) << p;
++ }
+ hc_mmio = mv_hc_base_from_port(mmio, port);
+- hc_irq_cause = readl(hc_mmio + HC_IRQ_CAUSE_OFS);
+- writelfl(~hc_irq_cause, hc_mmio + HC_IRQ_CAUSE_OFS);
++ writelfl(~ack_irqs, hc_mmio + HC_IRQ_CAUSE_OFS);
+ handled = 1;
+ }
+ /*
+- * Process completed CRPB response(s) before other events.
+- */
+- pp = ap->private_data;
+- if (hc_irq_cause & (DMA_IRQ << hardport)) {
+- if (pp->pp_flags & MV_PP_FLAG_EDMA_EN)
+- mv_process_crpb_entries(ap, pp);
+- }
+- /*
+- * Handle chip-reported errors, or continue on to handle PIO.
++ * Handle interrupts signalled for this port:
+ */
+- if (unlikely(port_cause & ERR_IRQ)) {
+- mv_err_intr(ap, mv_get_active_qc(ap));
+- } else if (hc_irq_cause & (DEV_IRQ << hardport)) {
+- if (!(pp->pp_flags & MV_PP_FLAG_EDMA_EN)) {
+- struct ata_queued_cmd *qc = mv_get_active_qc(ap);
+- if (qc) {
+- ata_sff_host_intr(ap, qc);
+- continue;
+- }
+- }
+- mv_unexpected_intr(ap);
+- }
++ port_cause = (main_irq_cause >> shift) & (DONE_IRQ | ERR_IRQ);
++ if (port_cause)
++ mv_port_intr(ap, port_cause);
+ }
+ return handled;
+ }
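
The rewritten loop acks a whole hc with one write by translating
main_irq_cause bits straight into an hc_irq_cause mask: each port
contributes a DONE/ERR pair at shift p*2 on the input side and a DMA/DEV
pair indexed by p on the ack side. Worked through for a sample cause word;
the bit positions below are illustrative, not the chip's.

	#include <stdio.h>
	#include <stdint.h>

	#define PORTS_PER_HC 4
	#define DONE_IRQ (1 << 0)	/* per-port pair in the main cause word */
	#define ERR_IRQ  (1 << 1)
	#define DMA_IRQ  (1 << 0)	/* per-port pair in hc_irq_cause */
	#define DEV_IRQ  (1 << 8)

	int main(void)
	{
		uint32_t hc_cause = (DONE_IRQ << 0) | (ERR_IRQ << (2 * 2));
						/* port 0 completed, port 2 errored */
		uint32_t ack_irqs = 0;
		int p;

		for (p = 0; p < PORTS_PER_HC; ++p) {
			uint32_t port_mask = (DONE_IRQ | ERR_IRQ) << (p * 2);
			if (hc_cause & port_mask)
				ack_irqs |= (DMA_IRQ | DEV_IRQ) << p;
		}
		printf("ack_irqs = 0x%08x\n", (unsigned)ack_irqs);	/* 0x00000505 */
		return 0;
	}
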
+@@ -1894,7 +2244,7 @@ static void mv5_reset_bus(struct ata_host *host, void __iomem *mmio)
+
+ static void mv5_reset_flash(struct mv_host_priv *hpriv, void __iomem *mmio)
+ {
+- writel(0x0fcfffff, mmio + MV_FLASH_CTL);
++ writel(0x0fcfffff, mmio + MV_FLASH_CTL_OFS);
+ }
+
+ static void mv5_read_preamp(struct mv_host_priv *hpriv, int idx,
+@@ -1913,7 +2263,7 @@ static void mv5_enable_leds(struct mv_host_priv *hpriv, void __iomem *mmio)
+ {
+ u32 tmp;
+
+- writel(0, mmio + MV_GPIO_PORT_CTL);
++ writel(0, mmio + MV_GPIO_PORT_CTL_OFS);
+
+ /* FIXME: handle MV_HP_ERRATA_50XXB2 errata */
+
+@@ -1931,14 +2281,14 @@ static void mv5_phy_errata(struct mv_host_priv *hpriv, void __iomem *mmio,
+ int fix_apm_sq = (hpriv->hp_flags & MV_HP_ERRATA_50XXB0);
+
+ if (fix_apm_sq) {
+- tmp = readl(phy_mmio + MV5_LT_MODE);
++ tmp = readl(phy_mmio + MV5_LTMODE_OFS);
+ tmp |= (1 << 19);
+- writel(tmp, phy_mmio + MV5_LT_MODE);
++ writel(tmp, phy_mmio + MV5_LTMODE_OFS);
+
+- tmp = readl(phy_mmio + MV5_PHY_CTL);
++ tmp = readl(phy_mmio + MV5_PHY_CTL_OFS);
+ tmp &= ~0x3;
+ tmp |= 0x1;
+- writel(tmp, phy_mmio + MV5_PHY_CTL);
++ writel(tmp, phy_mmio + MV5_PHY_CTL_OFS);
+ }
+
+ tmp = readl(phy_mmio + MV5_PHY_MODE);
+@@ -1956,11 +2306,6 @@ static void mv5_reset_hc_port(struct mv_host_priv *hpriv, void __iomem *mmio,
+ {
+ void __iomem *port_mmio = mv_port_base(mmio, port);
+
+- /*
+- * The datasheet warns against setting ATA_RST when EDMA is active
+- * (but doesn't say what the problem might be). So we first try
+- * to disable the EDMA engine before doing the ATA_RST operation.
+- */
+ mv_reset_channel(hpriv, mmio, port);
+
+ ZERO(0x028); /* command */
+@@ -1975,7 +2320,7 @@ static void mv5_reset_hc_port(struct mv_host_priv *hpriv, void __iomem *mmio,
+ ZERO(0x024); /* respq outp */
+ ZERO(0x020); /* respq inp */
+ ZERO(0x02c); /* test control */
+- writel(0xbc, port_mmio + EDMA_IORDY_TMOUT);
++ writel(0xbc, port_mmio + EDMA_IORDY_TMOUT_OFS);
+ }
+ #undef ZERO
+
+@@ -2021,13 +2366,13 @@ static void mv_reset_pci_bus(struct ata_host *host, void __iomem *mmio)
+ struct mv_host_priv *hpriv = host->private_data;
+ u32 tmp;
+
+- tmp = readl(mmio + MV_PCI_MODE);
++ tmp = readl(mmio + MV_PCI_MODE_OFS);
+ tmp &= 0xff00ffff;
+- writel(tmp, mmio + MV_PCI_MODE);
++ writel(tmp, mmio + MV_PCI_MODE_OFS);
+
+ ZERO(MV_PCI_DISC_TIMER);
+ ZERO(MV_PCI_MSI_TRIGGER);
+- writel(0x000100ff, mmio + MV_PCI_XBAR_TMOUT);
++ writel(0x000100ff, mmio + MV_PCI_XBAR_TMOUT_OFS);
+ ZERO(PCI_HC_MAIN_IRQ_MASK_OFS);
+ ZERO(MV_PCI_SERR_MASK);
+ ZERO(hpriv->irq_cause_ofs);
+@@ -2045,10 +2390,10 @@ static void mv6_reset_flash(struct mv_host_priv *hpriv, void __iomem *mmio)
+
+ mv5_reset_flash(hpriv, mmio);
+
+- tmp = readl(mmio + MV_GPIO_PORT_CTL);
++ tmp = readl(mmio + MV_GPIO_PORT_CTL_OFS);
+ tmp &= 0x3;
+ tmp |= (1 << 5) | (1 << 6);
+- writel(tmp, mmio + MV_GPIO_PORT_CTL);
++ writel(tmp, mmio + MV_GPIO_PORT_CTL_OFS);
+ }
+
+ /**
+@@ -2121,7 +2466,7 @@ static void mv6_read_preamp(struct mv_host_priv *hpriv, int idx,
+ void __iomem *port_mmio;
+ u32 tmp;
+
+- tmp = readl(mmio + MV_RESET_CFG);
++ tmp = readl(mmio + MV_RESET_CFG_OFS);
+ if ((tmp & (1 << 0)) == 0) {
+ hpriv->signal[idx].amps = 0x7 << 8;
+ hpriv->signal[idx].pre = 0x1 << 5;
+@@ -2137,7 +2482,7 @@ static void mv6_read_preamp(struct mv_host_priv *hpriv, int idx,
+
+ static void mv6_enable_leds(struct mv_host_priv *hpriv, void __iomem *mmio)
+ {
+- writel(0x00000060, mmio + MV_GPIO_PORT_CTL);
++ writel(0x00000060, mmio + MV_GPIO_PORT_CTL_OFS);
+ }
+
+ static void mv6_phy_errata(struct mv_host_priv *hpriv, void __iomem *mmio,
+@@ -2235,11 +2580,6 @@ static void mv_soc_reset_hc_port(struct mv_host_priv *hpriv,
+ {
+ void __iomem *port_mmio = mv_port_base(mmio, port);
+
+- /*
+- * The datasheet warns against setting ATA_RST when EDMA is active
+- * (but doesn't say what the problem might be). So we first try
+- * to disable the EDMA engine before doing the ATA_RST operation.
+- */
+ mv_reset_channel(hpriv, mmio, port);
+
+ ZERO(0x028); /* command */
+@@ -2254,7 +2594,7 @@ static void mv_soc_reset_hc_port(struct mv_host_priv *hpriv,
+ ZERO(0x024); /* respq outp */
+ ZERO(0x020); /* respq inp */
+ ZERO(0x02c); /* test control */
+- writel(0xbc, port_mmio + EDMA_IORDY_TMOUT);
++ writel(0xbc, port_mmio + EDMA_IORDY_TMOUT_OFS);
+ }
+
+ #undef ZERO
+@@ -2297,38 +2637,39 @@ static void mv_soc_reset_bus(struct ata_host *host, void __iomem *mmio)
+ return;
+ }
+
+-static void mv_setup_ifctl(void __iomem *port_mmio, int want_gen2i)
++static void mv_setup_ifcfg(void __iomem *port_mmio, int want_gen2i)
+ {
+- u32 ifctl = readl(port_mmio + SATA_INTERFACE_CFG);
++ u32 ifcfg = readl(port_mmio + SATA_INTERFACE_CFG_OFS);
+
+- ifctl = (ifctl & 0xf7f) | 0x9b1000; /* from chip spec */
++ ifcfg = (ifcfg & 0xf7f) | 0x9b1000; /* from chip spec */
+ if (want_gen2i)
+- ifctl |= (1 << 7); /* enable gen2i speed */
+- writelfl(ifctl, port_mmio + SATA_INTERFACE_CFG);
++ ifcfg |= (1 << 7); /* enable gen2i speed */
++ writelfl(ifcfg, port_mmio + SATA_INTERFACE_CFG_OFS);
+ }
+
+-/*
+- * Caller must ensure that EDMA is not active,
+- * by first doing mv_stop_edma() where needed.
+- */
+ static void mv_reset_channel(struct mv_host_priv *hpriv, void __iomem *mmio,
+ unsigned int port_no)
+ {
+ void __iomem *port_mmio = mv_port_base(mmio, port_no);
+
++ /*
++ * The datasheet warns against setting EDMA_RESET when EDMA is active
++ * (but doesn't say what the problem might be). So we first try
++ * to disable the EDMA engine before doing the EDMA_RESET operation.
++ */
+ mv_stop_edma_engine(port_mmio);
+- writelfl(ATA_RST, port_mmio + EDMA_CMD_OFS);
++ writelfl(EDMA_RESET, port_mmio + EDMA_CMD_OFS);
+
+ if (!IS_GEN_I(hpriv)) {
+- /* Enable 3.0gb/s link speed */
+- mv_setup_ifctl(port_mmio, 1);
++ /* Enable 3.0gb/s link speed: this survives EDMA_RESET */
++ mv_setup_ifcfg(port_mmio, 1);
+ }
+ /*
+- * Strobing ATA_RST here causes a hard reset of the SATA transport,
++ * Strobing EDMA_RESET here causes a hard reset of the SATA transport,
+ * link, and physical layers. It resets all SATA interface registers
+ * (except for SATA_INTERFACE_CFG), and issues a COMRESET to the dev.
+ */
+- writelfl(ATA_RST, port_mmio + EDMA_CMD_OFS);
++ writelfl(EDMA_RESET, port_mmio + EDMA_CMD_OFS);
+ udelay(25); /* allow reset propagation */
+ writelfl(0, port_mmio + EDMA_CMD_OFS);
+
+@@ -2392,7 +2733,7 @@ static int mv_hardreset(struct ata_link *link, unsigned int *class,
+ sata_scr_read(link, SCR_STATUS, &sstatus);
+ if (!IS_GEN_I(hpriv) && ++attempts >= 5 && sstatus == 0x121) {
+ /* Force 1.5gb/s link speed and try again */
+- mv_setup_ifctl(mv_ap_base(ap), 0);
++ mv_setup_ifcfg(mv_ap_base(ap), 0);
+ if (time_after(jiffies + HZ, deadline))
+ extra = HZ; /* only extend it once, max */
+ }
+@@ -2493,6 +2834,34 @@ static void mv_port_init(struct ata_ioports *port, void __iomem *port_mmio)
+ readl(port_mmio + EDMA_ERR_IRQ_MASK_OFS));
+ }
+
++static unsigned int mv_in_pcix_mode(struct ata_host *host)
++{
++ struct mv_host_priv *hpriv = host->private_data;
++ void __iomem *mmio = hpriv->base;
++ u32 reg;
++
++ if (!HAS_PCI(host) || !IS_PCIE(hpriv))
++ return 0; /* not PCI-X capable */
++ reg = readl(mmio + MV_PCI_MODE_OFS);
++ if ((reg & MV_PCI_MODE_MASK) == 0)
++ return 0; /* conventional PCI mode */
++ return 1; /* chip is in PCI-X mode */
++}
++
++static int mv_pci_cut_through_okay(struct ata_host *host)
++{
++ struct mv_host_priv *hpriv = host->private_data;
++ void __iomem *mmio = hpriv->base;
++ u32 reg;
++
++ if (!mv_in_pcix_mode(host)) {
++ reg = readl(mmio + PCI_COMMAND_OFS);
++ if (reg & PCI_COMMAND_MRDTRIG)
++ return 0; /* not okay */
++ }
++ return 1; /* okay */
++}
++
+ static int mv_chip_id(struct ata_host *host, unsigned int board_idx)
+ {
+ struct pci_dev *pdev = to_pci_dev(host->dev);
+@@ -2560,7 +2929,7 @@ static int mv_chip_id(struct ata_host *host, unsigned int board_idx)
+ break;
+
+ case chip_7042:
+- hp_flags |= MV_HP_PCIE;
++ hp_flags |= MV_HP_PCIE | MV_HP_CUT_THROUGH;
+ if (pdev->vendor == PCI_VENDOR_ID_TTI &&
+ (pdev->device == 0x2300 || pdev->device == 0x2310))
+ {
+@@ -2590,9 +2959,12 @@ static int mv_chip_id(struct ata_host *host, unsigned int board_idx)
+ " and avoid the final two gigabytes on"
+ " all RocketRAID BIOS initialized drives.\n");
+ }
++ /* drop through */
+ case chip_6042:
+ hpriv->ops = &mv6xxx_ops;
+ hp_flags |= MV_HP_GEN_IIE;
++ if (board_idx == chip_6042 && mv_pci_cut_through_okay(host))
++ hp_flags |= MV_HP_CUT_THROUGH;
+
+ switch (pdev->revision) {
+ case 0x0:
+diff --git a/drivers/base/sys.c b/drivers/base/sys.c
+index 4fbb56b..358bb0b 100644
+--- a/drivers/base/sys.c
++++ b/drivers/base/sys.c
+@@ -175,8 +175,7 @@ int sysdev_driver_register(struct sysdev_class *cls, struct sysdev_driver *drv)
+ }
+
+ /* Check whether this driver has already been added to a class. */
+- if ((drv->entry.next != drv->entry.prev) ||
+- (drv->entry.next != NULL)) {
++ if (drv->entry.next && !list_empty(&drv->entry)) {
+ printk(KERN_WARNING "sysdev: class %s: driver (%p) has already"
+ " been registered to a class, something is wrong, but "
+ "will forge on!\n", cls->name, drv);
+diff --git a/drivers/char/serial167.c b/drivers/char/serial167.c
+index fd2db07..3b23270 100644
+--- a/drivers/char/serial167.c
++++ b/drivers/char/serial167.c
+@@ -1073,7 +1073,7 @@ static int cy_put_char(struct tty_struct *tty, unsigned char ch)
+ return 0;
+
+ if (!info->xmit_buf)
+- return;
++ return 0;
+
+ local_irq_save(flags);
+ if (info->xmit_cnt >= PAGE_SIZE - 1) {
+diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
+index a9aa845..b27b13c 100644
+--- a/drivers/edac/edac_core.h
++++ b/drivers/edac/edac_core.h
+@@ -97,7 +97,7 @@ extern int edac_debug_level;
+ #define PCI_VEND_DEV(vend, dev) PCI_VENDOR_ID_ ## vend, \
+ PCI_DEVICE_ID_ ## vend ## _ ## dev
+
+-#define dev_name(dev) (dev)->dev_name
++#define edac_dev_name(dev) (dev)->dev_name
+
+ /* memory devices */
+ enum dev_type {
+diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
+index 63372fa..5fcd3d8 100644
+--- a/drivers/edac/edac_device.c
++++ b/drivers/edac/edac_device.c
+@@ -333,7 +333,7 @@ static int add_edac_dev_to_global_list(struct edac_device_ctl_info *edac_dev)
+ fail0:
+ edac_printk(KERN_WARNING, EDAC_MC,
+ "%s (%s) %s %s already assigned %d\n",
+- rover->dev->bus_id, dev_name(rover),
++ rover->dev->bus_id, edac_dev_name(rover),
+ rover->mod_name, rover->ctl_name, rover->dev_idx);
+ return 1;
+
+@@ -538,7 +538,7 @@ int edac_device_add_device(struct edac_device_ctl_info *edac_dev)
+ "'%s': DEV '%s' (%s)\n",
+ edac_dev->mod_name,
+ edac_dev->ctl_name,
+- dev_name(edac_dev),
++ edac_dev_name(edac_dev),
+ edac_op_state_to_string(edac_dev->op_state));
+
+ mutex_unlock(&device_ctls_mutex);
+@@ -599,7 +599,7 @@ struct edac_device_ctl_info *edac_device_del_device(struct device *dev)
+ edac_printk(KERN_INFO, EDAC_MC,
+ "Removed device %d for %s %s: DEV %s\n",
+ edac_dev->dev_idx,
+- edac_dev->mod_name, edac_dev->ctl_name, dev_name(edac_dev));
++ edac_dev->mod_name, edac_dev->ctl_name, edac_dev_name(edac_dev));
+
+ return edac_dev;
+ }
+diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
+index a4cf164..d110392 100644
+--- a/drivers/edac/edac_mc.c
++++ b/drivers/edac/edac_mc.c
+@@ -402,7 +402,7 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci)
+ fail0:
+ edac_printk(KERN_WARNING, EDAC_MC,
+ "%s (%s) %s %s already assigned %d\n", p->dev->bus_id,
+- dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx);
++ edac_dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx);
+ return 1;
+
+ fail1:
+@@ -517,7 +517,7 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
+
+ /* Report action taken */
+ edac_mc_printk(mci, KERN_INFO, "Giving out device to '%s' '%s':"
+- " DEV %s\n", mci->mod_name, mci->ctl_name, dev_name(mci));
++ " DEV %s\n", mci->mod_name, mci->ctl_name, edac_dev_name(mci));
+
+ mutex_unlock(&mem_ctls_mutex);
+ return 0;
+@@ -565,7 +565,7 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
+
+ edac_printk(KERN_INFO, EDAC_MC,
+ "Removed device %d for %s %s: DEV %s\n", mci->mc_idx,
+- mci->mod_name, mci->ctl_name, dev_name(mci));
++ mci->mod_name, mci->ctl_name, edac_dev_name(mci));
+
+ return mci;
+ }
+diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c
+index 9b24340..22ec9d5 100644
+--- a/drivers/edac/edac_pci.c
++++ b/drivers/edac/edac_pci.c
+@@ -150,7 +150,7 @@ static int add_edac_pci_to_global_list(struct edac_pci_ctl_info *pci)
+ fail0:
+ edac_printk(KERN_WARNING, EDAC_PCI,
+ "%s (%s) %s %s already assigned %d\n",
+- rover->dev->bus_id, dev_name(rover),
++ rover->dev->bus_id, edac_dev_name(rover),
+ rover->mod_name, rover->ctl_name, rover->pci_idx);
+ return 1;
+
+@@ -360,7 +360,7 @@ int edac_pci_add_device(struct edac_pci_ctl_info *pci, int edac_idx)
+ " DEV '%s' (%s)\n",
+ pci->mod_name,
+ pci->ctl_name,
+- dev_name(pci), edac_op_state_to_string(pci->op_state));
++ edac_dev_name(pci), edac_op_state_to_string(pci->op_state));
+
+ mutex_unlock(&edac_pci_ctls_mutex);
+ return 0;
+@@ -415,7 +415,7 @@ struct edac_pci_ctl_info *edac_pci_del_device(struct device *dev)
+
+ edac_printk(KERN_INFO, EDAC_PCI,
+ "Removed device %d for %s %s: DEV %s\n",
+- pci->pci_idx, pci->mod_name, pci->ctl_name, dev_name(pci));
++ pci->pci_idx, pci->mod_name, pci->ctl_name, edac_dev_name(pci));
+
+ return pci;
+ }
+diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
+index 591deda..34b0d4f 100644
+--- a/drivers/ide/ide-probe.c
++++ b/drivers/ide/ide-probe.c
+@@ -1355,12 +1355,6 @@ static void ide_init_port(ide_hwif_t *hwif, unsigned int port,
+ if (hwif->chipset != ide_dtc2278 || hwif->channel == 0)
+ hwif->port_ops = d->port_ops;
+
+- if ((d->host_flags & IDE_HFLAG_SERIALIZE) ||
+- ((d->host_flags & IDE_HFLAG_SERIALIZE_DMA) && hwif->dma_base)) {
+- if (hwif->mate)
+- hwif->mate->serialized = hwif->serialized = 1;
+- }
+-
+ hwif->swdma_mask = d->swdma_mask;
+ hwif->mwdma_mask = d->mwdma_mask;
+ hwif->ultra_mask = d->udma_mask;
+@@ -1382,6 +1376,12 @@ static void ide_init_port(ide_hwif_t *hwif, unsigned int port,
+ hwif->dma_ops = d->dma_ops;
+ }
+
++ if ((d->host_flags & IDE_HFLAG_SERIALIZE) ||
++ ((d->host_flags & IDE_HFLAG_SERIALIZE_DMA) && hwif->dma_base)) {
++ if (hwif->mate)
++ hwif->mate->serialized = hwif->serialized = 1;
++ }
++
+ if (d->host_flags & IDE_HFLAG_RQSIZE_256)
+ hwif->rqsize = 256;
+
+diff --git a/drivers/ide/legacy/falconide.c b/drivers/ide/legacy/falconide.c
+index 83555ca..9e449a0 100644
+--- a/drivers/ide/legacy/falconide.c
++++ b/drivers/ide/legacy/falconide.c
+@@ -61,7 +61,7 @@ static void falconide_output_data(ide_drive_t *drive, struct request *rq,
+ unsigned long data_addr = drive->hwif->io_ports.data_addr;
+
+ if (drive->media == ide_disk && rq && rq->cmd_type == REQ_TYPE_FS)
+- return outsw(data_adr, buf, (len + 1) / 2);
++ return outsw(data_addr, buf, (len + 1) / 2);
+
+ outsw_swapw(data_addr, buf, (len + 1) / 2);
+ }
+diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
+index ed2ee4b..5fd8506 100644
+--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
++++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
+@@ -359,9 +359,10 @@ static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq)
+ cq->sw_wptr++;
+ }
+
+-void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count)
++int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count)
+ {
+ u32 ptr;
++ int flushed = 0;
+
+ PDBG("%s wq %p cq %p\n", __func__, wq, cq);
+
+@@ -369,8 +370,11 @@ void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count)
+ PDBG("%s rq_rptr %u rq_wptr %u skip count %u\n", __func__,
+ wq->rq_rptr, wq->rq_wptr, count);
+ ptr = wq->rq_rptr + count;
+- while (ptr++ != wq->rq_wptr)
++ while (ptr++ != wq->rq_wptr) {
+ insert_recv_cqe(wq, cq);
++ flushed++;
++ }
++ return flushed;
+ }
+
+ static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq,
+@@ -394,9 +398,10 @@ static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq,
+ cq->sw_wptr++;
+ }
+
+-void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count)
++int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count)
+ {
+ __u32 ptr;
++ int flushed = 0;
+ struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2);
+
+ ptr = wq->sq_rptr + count;
+@@ -405,7 +410,9 @@ void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count)
+ insert_sq_cqe(wq, cq, sqp);
+ sqp++;
+ ptr++;
++ flushed++;
+ }
++ return flushed;
+ }
+
+ /*
+diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
+index 2bcff7f..69ab08e 100644
+--- a/drivers/infiniband/hw/cxgb3/cxio_hal.h
++++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
+@@ -173,8 +173,8 @@ u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp);
+ void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid);
+ int __init cxio_hal_init(void);
+ void __exit cxio_hal_exit(void);
+-void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count);
+-void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count);
++int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count);
++int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count);
+ void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
+ void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
+ void cxio_flush_hw_cq(struct t3_cq *cq);
+diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
+index d44a6df..c325c44 100644
+--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
++++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
+@@ -67,10 +67,10 @@ int peer2peer = 0;
+ module_param(peer2peer, int, 0644);
+ MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)");
+
+-static int ep_timeout_secs = 10;
++static int ep_timeout_secs = 60;
+ module_param(ep_timeout_secs, int, 0644);
+ MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
+- "in seconds (default=10)");
++ "in seconds (default=60)");
+
+ static int mpa_rev = 1;
+ module_param(mpa_rev, int, 0644);
+@@ -1650,8 +1650,8 @@ static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+ release = 1;
+ break;
+ case ABORTING:
+- break;
+ case DEAD:
++ break;
+ default:
+ BUG_ON(1);
+ break;
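
In the close_con_rpl() hunk just above, moving the break groups ABORTING
with DEAD instead of letting ABORTING fall through into DEAD's arm.
Grouped case labels share one statement list, so a misplaced break between
them silently changes which states are treated alike; sketched:

	switch (state) {
	case ABORTING:
	case DEAD:		/* both states ignore this reply */
		break;
	default:
		BUG_ON(1);
		break;
	}
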
+diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
+index 9b4be88..79dbe5b 100644
+--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
++++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
+@@ -655,6 +655,7 @@ static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag)
+ {
+ struct iwch_cq *rchp, *schp;
+ int count;
++ int flushed;
+
+ rchp = get_chp(qhp->rhp, qhp->attr.rcq);
+ schp = get_chp(qhp->rhp, qhp->attr.scq);
+@@ -669,20 +670,22 @@ static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag)
+ spin_lock(&qhp->lock);
+ cxio_flush_hw_cq(&rchp->cq);
+ cxio_count_rcqes(&rchp->cq, &qhp->wq, &count);
+- cxio_flush_rq(&qhp->wq, &rchp->cq, count);
++ flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count);
+ spin_unlock(&qhp->lock);
+ spin_unlock_irqrestore(&rchp->lock, *flag);
+- (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
++ if (flushed)
++ (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
+
+ /* locking hierarchy: cq lock first, then qp lock. */
+ spin_lock_irqsave(&schp->lock, *flag);
+ spin_lock(&qhp->lock);
+ cxio_flush_hw_cq(&schp->cq);
+ cxio_count_scqes(&schp->cq, &qhp->wq, &count);
+- cxio_flush_sq(&qhp->wq, &schp->cq, count);
++ flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count);
+ spin_unlock(&qhp->lock);
+ spin_unlock_irqrestore(&schp->lock, *flag);
+- (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context);
++ if (flushed)
++ (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context);
+
+ /* deref */
+ if (atomic_dec_and_test(&qhp->refcnt))
+@@ -880,7 +883,6 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp,
+ ep = qhp->ep;
+ get_ep(&ep->com);
+ }
+- flush_qp(qhp, &flag);
+ break;
+ case IWCH_QP_STATE_TERMINATE:
+ qhp->attr.state = IWCH_QP_STATE_TERMINATE;
+@@ -911,6 +913,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp,
+ }
+ switch (attrs->next_state) {
+ case IWCH_QP_STATE_IDLE:
++ flush_qp(qhp, &flag);
+ qhp->attr.state = IWCH_QP_STATE_IDLE;
+ qhp->attr.llp_stream_handle = NULL;
+ put_ep(&qhp->ep->com);
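
Taken together, the cxio/iwch hunks make the flush helpers report how many
flush CQEs they actually inserted, so __flush_qp() can skip the completion
upcall when nothing was queued. The shape of the pattern, sketched with
hypothetical helpers:

	static int flush_queue(struct wq *wq, struct cq *cq, u32 from, u32 to)
	{
		int flushed = 0;

		while (from++ != to) {
			insert_flush_cqe(wq, cq);	/* hypothetical */
			flushed++;
		}
		return flushed;
	}

	/* caller: only wake consumers if entries were really added */
	if (flush_queue(wq, cq, rptr + count, wptr))
		(*cq->comp_handler)(cq, cq->context);
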
+diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c
+index 2515cbd..bc3b37d 100644
+--- a/drivers/infiniband/hw/ehca/ehca_hca.c
++++ b/drivers/infiniband/hw/ehca/ehca_hca.c
+@@ -101,7 +101,6 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
+ props->max_ee = limit_uint(rblock->max_rd_ee_context);
+ props->max_rdd = limit_uint(rblock->max_rd_domain);
+ props->max_fmr = limit_uint(rblock->max_mr);
+- props->local_ca_ack_delay = limit_uint(rblock->local_ca_ack_delay);
+ props->max_qp_rd_atom = limit_uint(rblock->max_rr_qp);
+ props->max_ee_rd_atom = limit_uint(rblock->max_rr_ee_context);
+ props->max_res_rd_atom = limit_uint(rblock->max_rr_hca);
+@@ -115,7 +114,7 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
+ }
+
+ props->max_pkeys = 16;
+- props->local_ca_ack_delay = limit_uint(rblock->local_ca_ack_delay);
++ props->local_ca_ack_delay = min_t(u8, rblock->local_ca_ack_delay, 255);
+ props->max_raw_ipv6_qp = limit_uint(rblock->max_raw_ipv6_qp);
+ props->max_raw_ethy_qp = limit_uint(rblock->max_raw_ethy_qp);
+ props->max_mcast_grp = limit_uint(rblock->max_mcast_grp);
+@@ -136,7 +135,7 @@ query_device1:
+ return ret;
+ }
+
+-static int map_mtu(struct ehca_shca *shca, u32 fw_mtu)
++static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu)
+ {
+ switch (fw_mtu) {
+ case 0x1:
+@@ -156,7 +155,7 @@ static int map_mtu(struct ehca_shca *shca, u32 fw_mtu)
+ }
+ }
+
+-static int map_number_of_vls(struct ehca_shca *shca, u32 vl_cap)
++static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap)
+ {
+ switch (vl_cap) {
+ case 0x1:
+diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
+index 2f199c5..4521319 100644
+--- a/drivers/infiniband/hw/mlx4/cq.c
++++ b/drivers/infiniband/hw/mlx4/cq.c
+@@ -246,7 +246,7 @@ err_mtt:
+ if (context)
+ ib_umem_release(cq->umem);
+ else
+- mlx4_ib_free_cq_buf(dev, &cq->buf, entries);
++ mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+
+ err_db:
+ if (!context)
+@@ -434,7 +434,7 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq)
+ mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
+ ib_umem_release(mcq->umem);
+ } else {
+- mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe + 1);
++ mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe);
+ mlx4_db_free(dev->dev, &mcq->db);
+ }
+
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
+index 9044f88..ca126fc 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib.h
++++ b/drivers/infiniband/ulp/ipoib/ipoib.h
+@@ -334,6 +334,7 @@ struct ipoib_dev_priv {
+ #endif
+ int hca_caps;
+ struct ipoib_ethtool_st ethtool;
++ struct timer_list poll_timer;
+ };
+
+ struct ipoib_ah {
+@@ -404,6 +405,7 @@ extern struct workqueue_struct *ipoib_workqueue;
+
+ int ipoib_poll(struct napi_struct *napi, int budget);
+ void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
++void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr);
+
+ struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+ struct ib_pd *pd, struct ib_ah_attr *attr);
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+index 97b815c..f429bce 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
++++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+@@ -461,6 +461,26 @@ void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+ netif_rx_schedule(dev, &priv->napi);
+ }
+
++static void drain_tx_cq(struct net_device *dev)
++{
++ struct ipoib_dev_priv *priv = netdev_priv(dev);
++ unsigned long flags;
++
++ spin_lock_irqsave(&priv->tx_lock, flags);
++ while (poll_tx(priv))
++ ; /* nothing */
++
++ if (netif_queue_stopped(dev))
++ mod_timer(&priv->poll_timer, jiffies + 1);
++
++ spin_unlock_irqrestore(&priv->tx_lock, flags);
++}
++
++void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
++{
++ drain_tx_cq((struct net_device *)dev_ptr);
++}
++
+ static inline int post_send(struct ipoib_dev_priv *priv,
+ unsigned int wr_id,
+ struct ib_ah *address, u32 qpn,
+@@ -555,12 +575,22 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+ else
+ priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+
++ if (++priv->tx_outstanding == ipoib_sendq_size) {
++ ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
++ if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
++ ipoib_warn(priv, "request notify on send CQ failed\n");
++ netif_stop_queue(dev);
++ }
++
+ if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
+ address->ah, qpn, tx_req, phead, hlen))) {
+ ipoib_warn(priv, "post_send failed\n");
+ ++dev->stats.tx_errors;
++ --priv->tx_outstanding;
+ ipoib_dma_unmap_tx(priv->ca, tx_req);
+ dev_kfree_skb_any(skb);
++ if (netif_queue_stopped(dev))
++ netif_wake_queue(dev);
+ } else {
+ dev->trans_start = jiffies;
+
+@@ -568,14 +598,11 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+ ++priv->tx_head;
+ skb_orphan(skb);
+
+- if (++priv->tx_outstanding == ipoib_sendq_size) {
+- ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+- netif_stop_queue(dev);
+- }
+ }
+
+ if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+- poll_tx(priv);
++ while (poll_tx(priv))
++ ; /* nothing */
+ }
+
+ static void __ipoib_reap_ah(struct net_device *dev)
+@@ -609,6 +636,11 @@ void ipoib_reap_ah(struct work_struct *work)
+ round_jiffies_relative(HZ));
+ }
+
++static void ipoib_ib_tx_timer_func(unsigned long ctx)
++{
++ drain_tx_cq((struct net_device *)ctx);
++}
++
+ int ipoib_ib_dev_open(struct net_device *dev)
+ {
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+@@ -645,6 +677,10 @@ int ipoib_ib_dev_open(struct net_device *dev)
+ queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
+ round_jiffies_relative(HZ));
+
++ init_timer(&priv->poll_timer);
++ priv->poll_timer.function = ipoib_ib_tx_timer_func;
++ priv->poll_timer.data = (unsigned long)dev;
++
+ set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
+
+ return 0;
+@@ -810,6 +846,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
+ ipoib_dbg(priv, "All sends and receives done.\n");
+
+ timeout:
++ del_timer_sync(&priv->poll_timer);
+ qp_attr.qp_state = IB_QPS_RESET;
+ if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+ ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+index c1e7ece..8766d29 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
++++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+@@ -187,7 +187,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
+ goto out_free_mr;
+ }
+
+- priv->send_cq = ib_create_cq(priv->ca, NULL, NULL, dev, ipoib_sendq_size, 0);
++ priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL,
++ dev, ipoib_sendq_size, 0);
+ if (IS_ERR(priv->send_cq)) {
+ printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
+ goto out_free_recv_cq;
+diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c
+index 02b3ad8..edfedd9 100644
+--- a/drivers/input/serio/hp_sdc.c
++++ b/drivers/input/serio/hp_sdc.c
+@@ -69,6 +69,7 @@
+ #include <linux/time.h>
+ #include <linux/slab.h>
+ #include <linux/hil.h>
++#include <linux/semaphore.h>
+ #include <asm/io.h>
+ #include <asm/system.h>
+
+diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
+index 2097820..b8b9e44 100644
+--- a/drivers/macintosh/adb.c
++++ b/drivers/macintosh/adb.c
+@@ -37,7 +37,7 @@
+ #include <linux/device.h>
+ #include <linux/kthread.h>
+ #include <linux/platform_device.h>
+-#include <linux/semaphore.h>
++#include <linux/mutex.h>
+
+ #include <asm/uaccess.h>
+ #ifdef CONFIG_PPC
+@@ -102,7 +102,7 @@ static struct adb_handler {
+ } adb_handler[16];
+
+ /*
+- * The adb_handler_sem mutex protects all accesses to the original_address
++ * The adb_handler_mutex mutex protects all accesses to the original_address
+ * and handler_id fields of adb_handler[i] for all i, and changes to the
+ * handler field.
+ * Accesses to the handler field are protected by the adb_handler_lock
+@@ -110,7 +110,7 @@ static struct adb_handler {
+ * time adb_unregister returns, we know that the old handler isn't being
+ * called.
+ */
+-static DECLARE_MUTEX(adb_handler_sem);
++static DEFINE_MUTEX(adb_handler_mutex);
+ static DEFINE_RWLOCK(adb_handler_lock);
+
+ #if 0
+@@ -355,7 +355,7 @@ do_adb_reset_bus(void)
+ msleep(500);
+ }
+
+- down(&adb_handler_sem);
++ mutex_lock(&adb_handler_mutex);
+ write_lock_irq(&adb_handler_lock);
+ memset(adb_handler, 0, sizeof(adb_handler));
+ write_unlock_irq(&adb_handler_lock);
+@@ -376,7 +376,7 @@ do_adb_reset_bus(void)
+ if (adb_controller->autopoll)
+ adb_controller->autopoll(autopoll_devs);
+ }
+- up(&adb_handler_sem);
++ mutex_unlock(&adb_handler_mutex);
+
+ blocking_notifier_call_chain(&adb_client_list,
+ ADB_MSG_POST_RESET, NULL);
+@@ -454,7 +454,7 @@ adb_register(int default_id, int handler_id, struct adb_ids *ids,
+ {
+ int i;
+
+- down(&adb_handler_sem);
++ mutex_lock(&adb_handler_mutex);
+ ids->nids = 0;
+ for (i = 1; i < 16; i++) {
+ if ((adb_handler[i].original_address == default_id) &&
+@@ -472,7 +472,7 @@ adb_register(int default_id, int handler_id, struct adb_ids *ids,
+ ids->id[ids->nids++] = i;
+ }
+ }
+- up(&adb_handler_sem);
++ mutex_unlock(&adb_handler_mutex);
+ return ids->nids;
+ }
+
+@@ -481,7 +481,7 @@ adb_unregister(int index)
+ {
+ int ret = -ENODEV;
+
+- down(&adb_handler_sem);
++ mutex_lock(&adb_handler_mutex);
+ write_lock_irq(&adb_handler_lock);
+ if (adb_handler[index].handler) {
+ while(adb_handler[index].busy) {
+@@ -493,7 +493,7 @@ adb_unregister(int index)
+ adb_handler[index].handler = NULL;
+ }
+ write_unlock_irq(&adb_handler_lock);
+- up(&adb_handler_sem);
++ mutex_unlock(&adb_handler_mutex);
+ return ret;
+ }
+
+@@ -557,19 +557,19 @@ adb_try_handler_change(int address, int new_id)
+ {
+ int ret;
+
+- down(&adb_handler_sem);
++ mutex_lock(&adb_handler_mutex);
+ ret = try_handler_change(address, new_id);
+- up(&adb_handler_sem);
++ mutex_unlock(&adb_handler_mutex);
+ return ret;
+ }
+
+ int
+ adb_get_infos(int address, int *original_address, int *handler_id)
+ {
+- down(&adb_handler_sem);
++ mutex_lock(&adb_handler_mutex);
+ *original_address = adb_handler[address].original_address;
+ *handler_id = adb_handler[address].handler_id;
+- up(&adb_handler_sem);
++ mutex_unlock(&adb_handler_mutex);
+
+ return (*original_address != 0);
+ }
+@@ -628,10 +628,10 @@ do_adb_query(struct adb_request *req)
+ case ADB_QUERY_GETDEVINFO:
+ if (req->nbytes < 3)
+ break;
+- down(&adb_handler_sem);
++ mutex_lock(&adb_handler_mutex);
+ req->reply[0] = adb_handler[req->data[2]].original_address;
+ req->reply[1] = adb_handler[req->data[2]].handler_id;
+- up(&adb_handler_sem);
++ mutex_unlock(&adb_handler_mutex);
+ req->complete = 1;
+ req->reply_len = 2;
+ adb_write_done(req);
+diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c
+index 1e0a69a..ddfb426 100644
+--- a/drivers/macintosh/therm_pm72.c
++++ b/drivers/macintosh/therm_pm72.c
+@@ -122,6 +122,7 @@
+ #include <linux/kmod.h>
+ #include <linux/i2c.h>
+ #include <linux/kthread.h>
++#include <linux/mutex.h>
+ #include <asm/prom.h>
+ #include <asm/machdep.h>
+ #include <asm/io.h>
+@@ -169,7 +170,7 @@ static int rackmac;
+ static s32 dimm_output_clamp;
+ static int fcu_rpm_shift;
+ static int fcu_tickle_ticks;
+-static DECLARE_MUTEX(driver_lock);
++static DEFINE_MUTEX(driver_lock);
+
+ /*
+ * We have 3 types of CPU PID control. One is "split" old style control
+@@ -729,9 +730,9 @@ static void fetch_cpu_pumps_minmax(void)
+ static ssize_t show_##name(struct device *dev, struct device_attribute *attr, char *buf) \
+ { \
+ ssize_t r; \
+- down(&driver_lock); \
++ mutex_lock(&driver_lock); \
+ r = sprintf(buf, "%d.%03d", FIX32TOPRINT(data)); \
+- up(&driver_lock); \
++ mutex_unlock(&driver_lock); \
+ return r; \
+ }
+ #define BUILD_SHOW_FUNC_INT(name, data) \
+@@ -1803,11 +1804,11 @@ static int main_control_loop(void *x)
+ {
+ DBG("main_control_loop started\n");
+
+- down(&driver_lock);
++ mutex_lock(&driver_lock);
+
+ if (start_fcu() < 0) {
+ printk(KERN_ERR "kfand: failed to start FCU\n");
+- up(&driver_lock);
++ mutex_unlock(&driver_lock);
+ goto out;
+ }
+
+@@ -1822,14 +1823,14 @@ static int main_control_loop(void *x)
+
+ fcu_tickle_ticks = FCU_TICKLE_TICKS;
+
+- up(&driver_lock);
++ mutex_unlock(&driver_lock);
+
+ while (state == state_attached) {
+ unsigned long elapsed, start;
+
+ start = jiffies;
+
+- down(&driver_lock);
++ mutex_lock(&driver_lock);
+
+ /* Tickle the FCU just in case */
+ if (--fcu_tickle_ticks < 0) {
+@@ -1861,7 +1862,7 @@ static int main_control_loop(void *x)
+ do_monitor_slots(&slots_state);
+ else
+ do_monitor_drives(&drives_state);
+- up(&driver_lock);
++ mutex_unlock(&driver_lock);
+
+ if (critical_state == 1) {
+ printk(KERN_WARNING "Temperature control detected a critical condition\n");
+@@ -2019,13 +2020,13 @@ static void detach_fcu(void)
+ */
+ static int therm_pm72_attach(struct i2c_adapter *adapter)
+ {
+- down(&driver_lock);
++ mutex_lock(&driver_lock);
+
+ /* Check state */
+ if (state == state_detached)
+ state = state_attaching;
+ if (state != state_attaching) {
+- up(&driver_lock);
++ mutex_unlock(&driver_lock);
+ return 0;
+ }
+
+@@ -2054,7 +2055,7 @@ static int therm_pm72_attach(struct i2c_adapter *adapter)
+ state = state_attached;
+ start_control_loops();
+ }
+- up(&driver_lock);
++ mutex_unlock(&driver_lock);
+
+ return 0;
+ }
+@@ -2065,16 +2066,16 @@ static int therm_pm72_attach(struct i2c_adapter *adapter)
+ */
+ static int therm_pm72_detach(struct i2c_adapter *adapter)
+ {
+- down(&driver_lock);
++ mutex_lock(&driver_lock);
+
+ if (state != state_detached)
+ state = state_detaching;
+
+ /* Stop control loops if any */
+ DBG("stopping control loops\n");
+- up(&driver_lock);
++ mutex_unlock(&driver_lock);
+ stop_control_loops();
+- down(&driver_lock);
++ mutex_lock(&driver_lock);
+
+ if (u3_0 != NULL && !strcmp(adapter->name, "u3 0")) {
+ DBG("lost U3-0, disposing control loops\n");
+@@ -2090,7 +2091,7 @@ static int therm_pm72_detach(struct i2c_adapter *adapter)
+ if (u3_0 == NULL && u3_1 == NULL)
+ state = state_detached;
+
+- up(&driver_lock);
++ mutex_unlock(&driver_lock);
+
+ return 0;
+ }
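
The adb.c and therm_pm72.c hunks above are mechanical semaphore-to-mutex
conversions: DECLARE_MUTEX()/down()/up() -- which, despite the name,
declares a semaphore -- becomes DEFINE_MUTEX()/mutex_lock()/mutex_unlock().
The whole conversion follows one template (example_* names hypothetical):

	#include <linux/mutex.h>

	static DEFINE_MUTEX(example_mutex);	/* was: DECLARE_MUTEX(example_sem) */

	static void example(void)
	{
		mutex_lock(&example_mutex);	/* was: down(&example_sem) */
		/* ... critical section ... */
		mutex_unlock(&example_mutex);	/* was: up(&example_sem) */
	}
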
+diff --git a/drivers/macintosh/windfarm_smu_sat.c b/drivers/macintosh/windfarm_smu_sat.c
+index 797918d..7f2be4b 100644
+--- a/drivers/macintosh/windfarm_smu_sat.c
++++ b/drivers/macintosh/windfarm_smu_sat.c
+@@ -13,7 +13,7 @@
+ #include <linux/init.h>
+ #include <linux/wait.h>
+ #include <linux/i2c.h>
+-#include <linux/semaphore.h>
++#include <linux/mutex.h>
+ #include <asm/prom.h>
+ #include <asm/smu.h>
+ #include <asm/pmac_low_i2c.h>
+@@ -36,7 +36,7 @@
+ struct wf_sat {
+ int nr;
+ atomic_t refcnt;
+- struct semaphore mutex;
++ struct mutex mutex;
+ unsigned long last_read; /* jiffies when cache last updated */
+ u8 cache[16];
+ struct i2c_client i2c;
+@@ -163,7 +163,7 @@ static int wf_sat_get(struct wf_sensor *sr, s32 *value)
+ if (sat->i2c.adapter == NULL)
+ return -ENODEV;
+
+- down(&sat->mutex);
++ mutex_lock(&sat->mutex);
+ if (time_after(jiffies, (sat->last_read + MAX_AGE))) {
+ err = wf_sat_read_cache(sat);
+ if (err)
+@@ -182,7 +182,7 @@ static int wf_sat_get(struct wf_sensor *sr, s32 *value)
+ err = 0;
+
+ fail:
+- up(&sat->mutex);
++ mutex_unlock(&sat->mutex);
+ return err;
+ }
+
+@@ -233,7 +233,7 @@ static void wf_sat_create(struct i2c_adapter *adapter, struct device_node *dev)
+ sat->nr = -1;
+ sat->node = of_node_get(dev);
+ atomic_set(&sat->refcnt, 0);
+- init_MUTEX(&sat->mutex);
++ mutex_init(&sat->mutex);
+ sat->i2c.addr = (addr >> 1) & 0x7f;
+ sat->i2c.adapter = adapter;
+ sat->i2c.driver = &wf_sat_driver;
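
windfarm_smu_sat embeds its mutex in a dynamically allocated object, so
the static DEFINE_MUTEX() form does not apply; such mutexes must be
initialized with mutex_init() before first use, replacing the old
init_MUTEX(). Sketch, with a hypothetical struct name:

	struct wf_example {
		struct mutex lock;	/* was: struct semaphore mutex */
	};

	sat = kzalloc(sizeof(*sat), GFP_KERNEL);
	if (sat)
		mutex_init(&sat->lock);	/* was: init_MUTEX(&sat->mutex) */
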
+diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
+index 30a1af8..fa39410 100644
+--- a/drivers/misc/kgdbts.c
++++ b/drivers/misc/kgdbts.c
+@@ -47,6 +47,7 @@
+ * to test the HW NMI watchdog
+ * F## = Break at do_fork for ## iterations
+ * S## = Break at sys_open for ## iterations
++ * I## = Run the single step test ## iterations
+ *
+ * NOTE: the do_fork and sys_open tests are mutually exclusive.
+ *
+@@ -375,7 +376,7 @@ static void emul_sstep_get(char *arg)
+ break;
+ case 1:
+ /* set breakpoint */
+- break_helper("Z0", 0, sstep_addr);
++ break_helper("Z0", NULL, sstep_addr);
+ break;
+ case 2:
+ /* Continue */
+@@ -383,7 +384,7 @@ static void emul_sstep_get(char *arg)
+ break;
+ case 3:
+ /* Clear breakpoint */
+- break_helper("z0", 0, sstep_addr);
++ break_helper("z0", NULL, sstep_addr);
+ break;
+ default:
+ eprintk("kgdbts: ERROR failed sstep get emulation\n");
+@@ -465,11 +466,11 @@ static struct test_struct sw_breakpoint_test[] = {
+ { "?", "S0*" }, /* Clear break points */
+ { "kgdbts_break_test", "OK", sw_break, }, /* set sw breakpoint */
+ { "c", "T0*", }, /* Continue */
+- { "g", "kgdbts_break_test", 0, check_and_rewind_pc },
++ { "g", "kgdbts_break_test", NULL, check_and_rewind_pc },
+ { "write", "OK", write_regs },
+ { "kgdbts_break_test", "OK", sw_rem_break }, /*remove breakpoint */
+ { "D", "OK" }, /* Detach */
+- { "D", "OK", 0, got_break }, /* If the test worked we made it here */
++ { "D", "OK", NULL, got_break }, /* On success we made it here */
+ { "", "" },
+ };
+
+@@ -499,14 +500,14 @@ static struct test_struct singlestep_break_test[] = {
+ { "?", "S0*" }, /* Clear break points */
+ { "kgdbts_break_test", "OK", sw_break, }, /* set sw breakpoint */
+ { "c", "T0*", }, /* Continue */
+- { "g", "kgdbts_break_test", 0, check_and_rewind_pc },
++ { "g", "kgdbts_break_test", NULL, check_and_rewind_pc },
+ { "write", "OK", write_regs }, /* Write registers */
+ { "kgdbts_break_test", "OK", sw_rem_break }, /*remove breakpoint */
+ { "s", "T0*", emul_sstep_get, emul_sstep_put }, /* Single step */
+- { "g", "kgdbts_break_test", 0, check_single_step },
++ { "g", "kgdbts_break_test", NULL, check_single_step },
+ { "kgdbts_break_test", "OK", sw_break, }, /* set sw breakpoint */
+ { "c", "T0*", }, /* Continue */
+- { "g", "kgdbts_break_test", 0, check_and_rewind_pc },
++ { "g", "kgdbts_break_test", NULL, check_and_rewind_pc },
+ { "write", "OK", write_regs }, /* Write registers */
+ { "D", "OK" }, /* Remove all breakpoints and continues */
+ { "", "" },
+@@ -520,14 +521,14 @@ static struct test_struct do_fork_test[] = {
+ { "?", "S0*" }, /* Clear break points */
+ { "do_fork", "OK", sw_break, }, /* set sw breakpoint */
+ { "c", "T0*", }, /* Continue */
+- { "g", "do_fork", 0, check_and_rewind_pc }, /* check location */
++ { "g", "do_fork", NULL, check_and_rewind_pc }, /* check location */
+ { "write", "OK", write_regs }, /* Write registers */
+ { "do_fork", "OK", sw_rem_break }, /*remove breakpoint */
+ { "s", "T0*", emul_sstep_get, emul_sstep_put }, /* Single step */
+- { "g", "do_fork", 0, check_single_step },
++ { "g", "do_fork", NULL, check_single_step },
+ { "do_fork", "OK", sw_break, }, /* set sw breakpoint */
+ { "7", "T0*", skip_back_repeat_test }, /* Loop based on repeat_test */
+- { "D", "OK", 0, final_ack_set }, /* detach and unregister I/O */
++ { "D", "OK", NULL, final_ack_set }, /* detach and unregister I/O */
+ { "", "" },
+ };
+
+@@ -538,14 +539,14 @@ static struct test_struct sys_open_test[] = {
+ { "?", "S0*" }, /* Clear break points */
+ { "sys_open", "OK", sw_break, }, /* set sw breakpoint */
+ { "c", "T0*", }, /* Continue */
+- { "g", "sys_open", 0, check_and_rewind_pc }, /* check location */
++ { "g", "sys_open", NULL, check_and_rewind_pc }, /* check location */
+ { "write", "OK", write_regs }, /* Write registers */
+ { "sys_open", "OK", sw_rem_break }, /*remove breakpoint */
+ { "s", "T0*", emul_sstep_get, emul_sstep_put }, /* Single step */
+- { "g", "sys_open", 0, check_single_step },
++ { "g", "sys_open", NULL, check_single_step },
+ { "sys_open", "OK", sw_break, }, /* set sw breakpoint */
+ { "7", "T0*", skip_back_repeat_test }, /* Loop based on repeat_test */
+- { "D", "OK", 0, final_ack_set }, /* detach and unregister I/O */
++ { "D", "OK", NULL, final_ack_set }, /* detach and unregister I/O */
+ { "", "" },
+ };
+
+@@ -556,11 +557,11 @@ static struct test_struct hw_breakpoint_test[] = {
+ { "?", "S0*" }, /* Clear break points */
+ { "kgdbts_break_test", "OK", hw_break, }, /* set hw breakpoint */
+ { "c", "T0*", }, /* Continue */
+- { "g", "kgdbts_break_test", 0, check_and_rewind_pc },
++ { "g", "kgdbts_break_test", NULL, check_and_rewind_pc },
+ { "write", "OK", write_regs },
+ { "kgdbts_break_test", "OK", hw_rem_break }, /*remove breakpoint */
+ { "D", "OK" }, /* Detach */
+- { "D", "OK", 0, got_break }, /* If the test worked we made it here */
++ { "D", "OK", NULL, got_break }, /* On success we made it here */
+ { "", "" },
+ };
+
+@@ -570,12 +571,12 @@ static struct test_struct hw_breakpoint_test[] = {
+ static struct test_struct hw_write_break_test[] = {
+ { "?", "S0*" }, /* Clear break points */
+ { "hw_break_val", "OK", hw_write_break, }, /* set hw breakpoint */
+- { "c", "T0*", 0, got_break }, /* Continue */
+- { "g", "silent", 0, check_and_rewind_pc },
++ { "c", "T0*", NULL, got_break }, /* Continue */
++ { "g", "silent", NULL, check_and_rewind_pc },
+ { "write", "OK", write_regs },
+ { "hw_break_val", "OK", hw_rem_write_break }, /*remove breakpoint */
+ { "D", "OK" }, /* Detach */
+- { "D", "OK", 0, got_break }, /* If the test worked we made it here */
++ { "D", "OK", NULL, got_break }, /* On success we made it here */
+ { "", "" },
+ };
+
+@@ -585,12 +586,12 @@ static struct test_struct hw_write_break_test[] = {
+ static struct test_struct hw_access_break_test[] = {
+ { "?", "S0*" }, /* Clear break points */
+ { "hw_break_val", "OK", hw_access_break, }, /* set hw breakpoint */
+- { "c", "T0*", 0, got_break }, /* Continue */
+- { "g", "silent", 0, check_and_rewind_pc },
++ { "c", "T0*", NULL, got_break }, /* Continue */
++ { "g", "silent", NULL, check_and_rewind_pc },
+ { "write", "OK", write_regs },
+ { "hw_break_val", "OK", hw_rem_access_break }, /*remove breakpoint */
+ { "D", "OK" }, /* Detach */
+- { "D", "OK", 0, got_break }, /* If the test worked we made it here */
++ { "D", "OK", NULL, got_break }, /* On success we made it here */
+ { "", "" },
+ };
+
+@@ -599,9 +600,9 @@ static struct test_struct hw_access_break_test[] = {
+ */
+ static struct test_struct nmi_sleep_test[] = {
+ { "?", "S0*" }, /* Clear break points */
+- { "c", "T0*", 0, got_break }, /* Continue */
++ { "c", "T0*", NULL, got_break }, /* Continue */
+ { "D", "OK" }, /* Detach */
+- { "D", "OK", 0, got_break }, /* If the test worked we made it here */
++ { "D", "OK", NULL, got_break }, /* On success we made it here */
+ { "", "" },
+ };
+
+@@ -874,18 +875,23 @@ static void kgdbts_run_tests(void)
+ {
+ char *ptr;
+ int fork_test = 0;
+- int sys_open_test = 0;
++ int do_sys_open_test = 0;
++ int sstep_test = 1000;
+ int nmi_sleep = 0;
++ int i;
+
+ ptr = strstr(config, "F");
+ if (ptr)
+- fork_test = simple_strtol(ptr+1, NULL, 10);
++ fork_test = simple_strtol(ptr + 1, NULL, 10);
+ ptr = strstr(config, "S");
+ if (ptr)
+- sys_open_test = simple_strtol(ptr+1, NULL, 10);
++ do_sys_open_test = simple_strtol(ptr + 1, NULL, 10);
+ ptr = strstr(config, "N");
+ if (ptr)
+ nmi_sleep = simple_strtol(ptr+1, NULL, 10);
++ ptr = strstr(config, "I");
++ if (ptr)
++ sstep_test = simple_strtol(ptr+1, NULL, 10);
+
+ /* required internal KGDB tests */
+ v1printk("kgdbts:RUN plant and detach test\n");
+@@ -894,8 +900,13 @@ static void kgdbts_run_tests(void)
+ run_breakpoint_test(0);
+ v1printk("kgdbts:RUN bad memory access test\n");
+ run_bad_read_test();
+- v1printk("kgdbts:RUN singlestep breakpoint test\n");
+- run_singlestep_break_test();
++ v1printk("kgdbts:RUN singlestep test %i iterations\n", sstep_test);
++ for (i = 0; i < sstep_test; i++) {
++ run_singlestep_break_test();
++ if (i % 100 == 0)
++ v1printk("kgdbts:RUN singlestep [%i/%i]\n",
++ i, sstep_test);
++ }
+
+ /* ===Optional tests=== */
+
+@@ -922,7 +933,7 @@ static void kgdbts_run_tests(void)
+ repeat_test = fork_test;
+ printk(KERN_INFO "kgdbts:RUN do_fork for %i breakpoints\n",
+ repeat_test);
+- kthread_run(kgdbts_unreg_thread, 0, "kgdbts_unreg");
++ kthread_run(kgdbts_unreg_thread, NULL, "kgdbts_unreg");
+ run_do_fork_test();
+ return;
+ }
+@@ -931,11 +942,11 @@ static void kgdbts_run_tests(void)
+ * executed because a kernel thread will be spawned at the very
+ * end to unregister the debug hooks.
+ */
+- if (sys_open_test) {
+- repeat_test = sys_open_test;
++ if (do_sys_open_test) {
++ repeat_test = do_sys_open_test;
+ printk(KERN_INFO "kgdbts:RUN sys_open for %i breakpoints\n",
+ repeat_test);
+- kthread_run(kgdbts_unreg_thread, 0, "kgdbts_unreg");
++ kthread_run(kgdbts_unreg_thread, NULL, "kgdbts_unreg");
+ run_sys_open_test();
+ return;
+ }
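
Besides spelling NULL for pointer arguments (0 works but hides intent),
the kgdbts hunks add an "I##" option parsed exactly like the existing
F/S/N flags: strstr() locates the letter and simple_strtol() reads the
count that follows. That style, factored into a hypothetical helper:

	static int parse_count(const char *config, const char *flag, int def)
	{
		char *ptr = strstr(config, flag);

		return ptr ? simple_strtol(ptr + 1, NULL, 10) : def;
	}

	/* sstep_test = parse_count(config, "I", 1000); */
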
+diff --git a/drivers/net/irda/nsc-ircc.c b/drivers/net/irda/nsc-ircc.c
+index a873d2b..a7714da 100644
+--- a/drivers/net/irda/nsc-ircc.c
++++ b/drivers/net/irda/nsc-ircc.c
+@@ -100,7 +100,9 @@ static int nsc_ircc_probe_39x(nsc_chip_t *chip, chipio_t *info);
+ static int nsc_ircc_init_108(nsc_chip_t *chip, chipio_t *info);
+ static int nsc_ircc_init_338(nsc_chip_t *chip, chipio_t *info);
+ static int nsc_ircc_init_39x(nsc_chip_t *chip, chipio_t *info);
++#ifdef CONFIG_PNP
+ static int nsc_ircc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *id);
++#endif
+
+ /* These are the known NSC chips */
+ static nsc_chip_t chips[] = {
+@@ -156,9 +158,11 @@ static const struct pnp_device_id nsc_ircc_pnp_table[] = {
+ MODULE_DEVICE_TABLE(pnp, nsc_ircc_pnp_table);
+
+ static struct pnp_driver nsc_ircc_pnp_driver = {
++#ifdef CONFIG_PNP
+ .name = "nsc-ircc",
+ .id_table = nsc_ircc_pnp_table,
+ .probe = nsc_ircc_pnp_probe,
++#endif
+ };
+
+ /* Some prototypes */
+@@ -916,6 +920,7 @@ static int nsc_ircc_probe_39x(nsc_chip_t *chip, chipio_t *info)
+ return 0;
+ }
+
++#ifdef CONFIG_PNP
+ /* PNP probing */
+ static int nsc_ircc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *id)
+ {
+@@ -952,6 +957,7 @@ static int nsc_ircc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *i
+
+ return 0;
+ }
++#endif
+
+ /*
+ * Function nsc_ircc_setup (info)
+diff --git a/drivers/net/irda/smsc-ircc2.c b/drivers/net/irda/smsc-ircc2.c
+index 1f26da7..cfe0194 100644
+--- a/drivers/net/irda/smsc-ircc2.c
++++ b/drivers/net/irda/smsc-ircc2.c
+@@ -376,6 +376,7 @@ MODULE_DEVICE_TABLE(pnp, smsc_ircc_pnp_table);
+
+ static int pnp_driver_registered;
+
++#ifdef CONFIG_PNP
+ static int __init smsc_ircc_pnp_probe(struct pnp_dev *dev,
+ const struct pnp_device_id *dev_id)
+ {
+@@ -402,7 +403,9 @@ static struct pnp_driver smsc_ircc_pnp_driver = {
+ .id_table = smsc_ircc_pnp_table,
+ .probe = smsc_ircc_pnp_probe,
+ };
+-
++#else /* CONFIG_PNP */
++static struct pnp_driver smsc_ircc_pnp_driver;
++#endif
+
+ /*******************************************************************************
+ *
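
Both IrDA hunks guard the PNP probe paths with #ifdef CONFIG_PNP;
smsc-ircc2 keeps an empty struct pnp_driver in the #else branch so any
remaining pnp_register_driver()/pnp_unregister_driver() call sites still
compile. The guard pattern, sketched with hypothetical names:

	#ifdef CONFIG_PNP
	static int example_pnp_probe(struct pnp_dev *dev,
				     const struct pnp_device_id *id)
	{
		/* pull io/irq out of the PNP resources */
		return 0;
	}

	static struct pnp_driver example_pnp_driver = {
		.name		= "example",
		.id_table	= example_pnp_table,
		.probe		= example_pnp_probe,
	};
	#else
	static struct pnp_driver example_pnp_driver;	/* inert stub */
	#endif
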
+diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c
+index cb46446..03a9abc 100644
+--- a/drivers/net/mlx4/mr.c
++++ b/drivers/net/mlx4/mr.c
+@@ -551,7 +551,7 @@ int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
+ u64 mtt_seg;
+ int err = -ENOMEM;
+
+- if (page_shift < 12 || page_shift >= 32)
++ if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32)
+ return -EINVAL;
+
+ /* All MTTs must fit in the same page */
+diff --git a/drivers/net/niu.c b/drivers/net/niu.c
+index 4009c4c..57cfd72 100644
+--- a/drivers/net/niu.c
++++ b/drivers/net/niu.c
+@@ -1,6 +1,6 @@
+ /* niu.c: Neptune ethernet driver.
+ *
+- * Copyright (C) 2007 David S. Miller (davem at davemloft.net)
++ * Copyright (C) 2007, 2008 David S. Miller (davem at davemloft.net)
+ */
+
+ #include <linux/module.h>
+@@ -33,8 +33,8 @@
+
+ #define DRV_MODULE_NAME "niu"
+ #define PFX DRV_MODULE_NAME ": "
+-#define DRV_MODULE_VERSION "0.8"
+-#define DRV_MODULE_RELDATE "April 24, 2008"
++#define DRV_MODULE_VERSION "0.9"
++#define DRV_MODULE_RELDATE "May 4, 2008"
+
+ static char version[] __devinitdata =
+ DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+@@ -7264,8 +7264,11 @@ static int __devinit niu_get_and_validate_port(struct niu *np)
+ parent->num_ports = nr64(ESPC_NUM_PORTS_MACS) &
+ ESPC_NUM_PORTS_MACS_VAL;
+
++ /* All of the current probing methods fail on
++ * Maramba on-board parts.
++ */
+ if (!parent->num_ports)
+- return -ENODEV;
++ parent->num_ports = 4;
+ }
+ }
+ }
+diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c
+index b5860b9..24fd613 100644
+--- a/drivers/net/wan/lapbether.c
++++ b/drivers/net/wan/lapbether.c
+@@ -459,6 +459,7 @@ static void __exit lapbeth_cleanup_driver(void)
+ list_for_each_safe(entry, tmp, &lapbeth_devices) {
+ lapbeth = list_entry(entry, struct lapbethdev, node);
+
++ dev_put(lapbeth->ethdev);
+ unregister_netdevice(lapbeth->axdev);
+ }
+ rtnl_unlock();
+diff --git a/drivers/net/wireless/iwlwifi/Kconfig b/drivers/net/wireless/iwlwifi/Kconfig
+index d5b7a76..62fb89d 100644
+--- a/drivers/net/wireless/iwlwifi/Kconfig
++++ b/drivers/net/wireless/iwlwifi/Kconfig
+@@ -1,6 +1,5 @@
+ config IWLWIFI
+- bool
+- default n
++ tristate
+
+ config IWLCORE
+ tristate "Intel Wireless Wifi Core"
+diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
+index 4a55bf3..3706ce7 100644
+--- a/drivers/pci/probe.c
++++ b/drivers/pci/probe.c
+@@ -842,13 +842,25 @@ static void set_pcie_port_type(struct pci_dev *pdev)
+ * reading the dword at 0x100 which must either be 0 or a valid extended
+ * capability header.
+ */
+-int pci_cfg_space_size_ext(struct pci_dev *dev, unsigned check_exp_pcix)
++int pci_cfg_space_size_ext(struct pci_dev *dev)
+ {
+- int pos;
+ u32 status;
+
+- if (!check_exp_pcix)
+- goto skip;
++ if (pci_read_config_dword(dev, 256, &status) != PCIBIOS_SUCCESSFUL)
++ goto fail;
++ if (status == 0xffffffff)
++ goto fail;
++
++ return PCI_CFG_SPACE_EXP_SIZE;
++
++ fail:
++ return PCI_CFG_SPACE_SIZE;
++}
++
++int pci_cfg_space_size(struct pci_dev *dev)
++{
++ int pos;
++ u32 status;
+
+ pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+ if (!pos) {
+@@ -861,23 +873,12 @@ int pci_cfg_space_size_ext(struct pci_dev *dev, unsigned check_exp_pcix)
+ goto fail;
+ }
+
+- skip:
+- if (pci_read_config_dword(dev, 256, &status) != PCIBIOS_SUCCESSFUL)
+- goto fail;
+- if (status == 0xffffffff)
+- goto fail;
+-
+- return PCI_CFG_SPACE_EXP_SIZE;
++ return pci_cfg_space_size_ext(dev);
+
+ fail:
+ return PCI_CFG_SPACE_SIZE;
+ }
+
+-int pci_cfg_space_size(struct pci_dev *dev)
+-{
+- return pci_cfg_space_size_ext(dev, 1);
+-}
+-
+ static void pci_release_bus_bridge_dev(struct device *dev)
+ {
+ kfree(dev);
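
The probe.c rework above splits the old flag-controlled helper in two:
pci_cfg_space_size_ext() only performs the extended-space probe (read the
dword at offset 256 and see whether it responds), while
pci_cfg_space_size() first checks for an Express or PCI-X capability and
then chains to the _ext variant. Callers that already know the device is
Express can call the cheaper probe directly. The extended-space test,
condensed from the hunk:

	static int cfg_space_size_ext(struct pci_dev *dev)
	{
		u32 status;

		if (pci_read_config_dword(dev, 256, &status) != PCIBIOS_SUCCESSFUL
		    || status == 0xffffffff)
			return PCI_CFG_SPACE_SIZE;	/* 256 bytes */

		return PCI_CFG_SPACE_EXP_SIZE;		/* 4096 bytes */
	}
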
+diff --git a/drivers/sbus/char/bpp.c b/drivers/sbus/char/bpp.c
+index 4fab0c2..b87037e 100644
+--- a/drivers/sbus/char/bpp.c
++++ b/drivers/sbus/char/bpp.c
+@@ -41,7 +41,7 @@
+ #define BPP_DELAY 100
+
+ static const unsigned BPP_MAJOR = LP_MAJOR;
+-static const char* dev_name = "bpp";
++static const char *bpp_dev_name = "bpp";
+
+ /* When switching from compatibility to a mode where I can read, try
+ the following mode first. */
+diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
+index 46d7e40..81ccbd7 100644
+--- a/drivers/scsi/Kconfig
++++ b/drivers/scsi/Kconfig
+@@ -1679,6 +1679,7 @@ config MAC_SCSI
+ config SCSI_MAC_ESP
+ tristate "Macintosh NCR53c9[46] SCSI"
+ depends on MAC && SCSI
++ select SCSI_SPI_ATTRS
+ help
+ This is the NCR 53c9x SCSI controller found on most of the 68040
+ based Macintoshes.
+diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig
+index 33b467a..1ef6df3 100644
+--- a/drivers/usb/host/Kconfig
++++ b/drivers/usb/host/Kconfig
+@@ -129,7 +129,7 @@ config USB_ISP1760_PCI
+
+ config USB_ISP1760_OF
+ bool "Support for the OF platform bus"
+- depends on USB_ISP1760_HCD && OF
++ depends on USB_ISP1760_HCD && PPC_OF
+ ---help---
+ Enables support for the device present on the PowerPC
+ OpenFirmware platform bus.
+diff --git a/drivers/video/bw2.c b/drivers/video/bw2.c
+index 275d9da..79f85dc 100644
+--- a/drivers/video/bw2.c
++++ b/drivers/video/bw2.c
+@@ -329,7 +329,7 @@ static int __devinit bw2_probe(struct of_device *op, const struct of_device_id *
+ if (!info->screen_base)
+ goto out_unmap_regs;
+
+- bw2_blank(0, info);
++ bw2_blank(FB_BLANK_UNBLANK, info);
+
+ bw2_init_fix(info, linebytes);
+
+diff --git a/drivers/video/cg3.c b/drivers/video/cg3.c
+index 010ea53..e31e26a 100644
+--- a/drivers/video/cg3.c
++++ b/drivers/video/cg3.c
+@@ -398,7 +398,7 @@ static int __devinit cg3_probe(struct of_device *op,
+ if (!info->screen_base)
+ goto out_unmap_regs;
+
+- cg3_blank(0, info);
++ cg3_blank(FB_BLANK_UNBLANK, info);
+
+ if (!of_find_property(dp, "width", NULL)) {
+ err = cg3_do_default_mode(par);
+diff --git a/drivers/video/cg6.c b/drivers/video/cg6.c
+index fc90db6..8000bcc 100644
+--- a/drivers/video/cg6.c
++++ b/drivers/video/cg6.c
+@@ -767,7 +767,7 @@ static int __devinit cg6_probe(struct of_device *op,
+
+ cg6_bt_init(par);
+ cg6_chip_init(info);
+- cg6_blank(0, info);
++ cg6_blank(FB_BLANK_UNBLANK, info);
+
+ if (fb_alloc_cmap(&info->cmap, 256, 0))
+ goto out_unmap_regs;
+diff --git a/drivers/video/ffb.c b/drivers/video/ffb.c
+index 93dca3e..0f42a69 100644
+--- a/drivers/video/ffb.c
++++ b/drivers/video/ffb.c
+@@ -987,7 +987,7 @@ static int __devinit ffb_probe(struct of_device *op,
+ * chosen console, it will have video outputs off in
+ * the DAC.
+ */
+- ffb_blank(0, info);
++ ffb_blank(FB_BLANK_UNBLANK, info);
+
+ if (fb_alloc_cmap(&info->cmap, 256, 0))
+ goto out_unmap_dac;
+diff --git a/drivers/video/leo.c b/drivers/video/leo.c
+index f3160fc..fb12992 100644
+--- a/drivers/video/leo.c
++++ b/drivers/video/leo.c
+@@ -601,7 +601,7 @@ static int __devinit leo_probe(struct of_device *op, const struct of_device_id *
+ leo_init_wids(info);
+ leo_init_hw(info);
+
+- leo_blank(0, info);
++ leo_blank(FB_BLANK_UNBLANK, info);
+
+ if (fb_alloc_cmap(&info->cmap, 256, 0))
+ goto out_unmap_regs;
+diff --git a/drivers/video/p9100.c b/drivers/video/p9100.c
+index c95874f..676ffb0 100644
+--- a/drivers/video/p9100.c
++++ b/drivers/video/p9100.c
+@@ -295,7 +295,7 @@ static int __devinit p9100_probe(struct of_device *op, const struct of_device_id
+ if (!info->screen_base)
+ goto out_unmap_regs;
+
+- p9100_blank(0, info);
++ p9100_blank(FB_BLANK_UNBLANK, info);
+
+ if (fb_alloc_cmap(&info->cmap, 256, 0))
+ goto out_unmap_screen;
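
The bw2/cg3/cg6/ffb/leo/p9100 hunks all make the same substitution: the
bare 0 passed to the blank routine becomes FB_BLANK_UNBLANK. The value is
unchanged, but the named constant states the intent and matches how a
blank handler is written (example_blank is hypothetical):

	#include <linux/fb.h>

	static int example_blank(int blank, struct fb_info *info)
	{
		switch (blank) {
		case FB_BLANK_UNBLANK:		/* video on */
			/* enable timing generator / DAC output */
			break;
		case FB_BLANK_POWERDOWN:	/* deepest off state */
			/* disable outputs */
			break;
		}
		return 0;
	}
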
+diff --git a/drivers/video/tcx.c b/drivers/video/tcx.c
+index a717743..44e8c27 100644
+--- a/drivers/video/tcx.c
++++ b/drivers/video/tcx.c
+@@ -84,7 +84,7 @@ struct tcx_tec {
+
+ struct tcx_thc {
+ u32 thc_rev;
+- u32 thc_pad0[511];
++ u32 thc_pad0[511];
+ u32 thc_hs; /* hsync timing */
+ u32 thc_hsdvs;
+ u32 thc_hd;
+@@ -126,10 +126,10 @@ struct tcx_par {
+ };
+
+ /* Reset control plane so that WID is 8-bit plane. */
+-static void __tcx_set_control_plane (struct tcx_par *par)
++static void __tcx_set_control_plane(struct tcx_par *par)
+ {
+ u32 __iomem *p, *pend;
+-
++
+ if (par->lowdepth)
+ return;
+
+@@ -143,8 +143,8 @@ static void __tcx_set_control_plane (struct tcx_par *par)
+ sbus_writel(tmp, p);
+ }
+ }
+-
+-static void tcx_reset (struct fb_info *info)
++
++static void tcx_reset(struct fb_info *info)
+ {
+ struct tcx_par *par = (struct tcx_par *) info->par;
+ unsigned long flags;
+@@ -365,7 +365,8 @@ static void tcx_unmap_regs(struct of_device *op, struct fb_info *info,
+ info->screen_base, par->fbsize);
+ }
+
+-static int __devinit tcx_init_one(struct of_device *op)
++static int __devinit tcx_probe(struct of_device *op,
++ const struct of_device_id *match)
+ {
+ struct device_node *dp = op->node;
+ struct fb_info *info;
+@@ -488,13 +489,6 @@ out_err:
+ return err;
+ }
+
+-static int __devinit tcx_probe(struct of_device *dev, const struct of_device_id *match)
+-{
+- struct of_device *op = to_of_device(&dev->dev);
+-
+- return tcx_init_one(op);
+-}
+-
+ static int __devexit tcx_remove(struct of_device *op)
+ {
+ struct fb_info *info = dev_get_drvdata(&op->dev);
+diff --git a/fs/affs/affs.h b/fs/affs/affs.h
+index d5bd497..223b191 100644
+--- a/fs/affs/affs.h
++++ b/fs/affs/affs.h
+@@ -48,7 +48,7 @@ struct affs_ext_key {
+ * affs fs inode data in memory
+ */
+ struct affs_inode_info {
+- u32 i_opencnt;
++ atomic_t i_opencnt;
+ struct semaphore i_link_lock; /* Protects internal inode access. */
+ struct semaphore i_ext_lock; /* Protects internal inode access. */
+ #define i_hash_lock i_ext_lock
+@@ -170,8 +170,6 @@ extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ extern unsigned long affs_parent_ino(struct inode *dir);
+ extern struct inode *affs_new_inode(struct inode *dir);
+ extern int affs_notify_change(struct dentry *dentry, struct iattr *attr);
+-extern void affs_put_inode(struct inode *inode);
+-extern void affs_drop_inode(struct inode *inode);
+ extern void affs_delete_inode(struct inode *inode);
+ extern void affs_clear_inode(struct inode *inode);
+ extern struct inode *affs_iget(struct super_block *sb,
+diff --git a/fs/affs/file.c b/fs/affs/file.c
+index 1a4f092..6eac7bd 100644
+--- a/fs/affs/file.c
++++ b/fs/affs/file.c
+@@ -48,8 +48,9 @@ affs_file_open(struct inode *inode, struct file *filp)
+ {
+ if (atomic_read(&filp->f_count) != 1)
+ return 0;
+- pr_debug("AFFS: open(%d)\n", AFFS_I(inode)->i_opencnt);
+- AFFS_I(inode)->i_opencnt++;
++ pr_debug("AFFS: open(%lu,%d)\n",
++ inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
++ atomic_inc(&AFFS_I(inode)->i_opencnt);
+ return 0;
+ }
+
+@@ -58,10 +59,16 @@ affs_file_release(struct inode *inode, struct file *filp)
+ {
+ if (atomic_read(&filp->f_count) != 0)
+ return 0;
+- pr_debug("AFFS: release(%d)\n", AFFS_I(inode)->i_opencnt);
+- AFFS_I(inode)->i_opencnt--;
+- if (!AFFS_I(inode)->i_opencnt)
++ pr_debug("AFFS: release(%lu, %d)\n",
++ inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
++
++ if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
++ mutex_lock(&inode->i_mutex);
++ if (inode->i_size != AFFS_I(inode)->mmu_private)
++ affs_truncate(inode);
+ affs_free_prealloc(inode);
++ mutex_unlock(&inode->i_mutex);
++ }
+
+ return 0;
+ }
+@@ -180,7 +187,7 @@ affs_get_extblock(struct inode *inode, u32 ext)
+ /* inline the simplest case: same extended block as last time */
+ struct buffer_head *bh = AFFS_I(inode)->i_ext_bh;
+ if (ext == AFFS_I(inode)->i_ext_last)
+- atomic_inc(&bh->b_count);
++ get_bh(bh);
+ else
+ /* we have to do more (not inlined) */
+ bh = affs_get_extblock_slow(inode, ext);
+@@ -306,7 +313,7 @@ store_ext:
+ affs_brelse(AFFS_I(inode)->i_ext_bh);
+ AFFS_I(inode)->i_ext_last = ext;
+ AFFS_I(inode)->i_ext_bh = bh;
+- atomic_inc(&bh->b_count);
++ get_bh(bh);
+
+ return bh;
+
+@@ -324,7 +331,6 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
+
+ pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block);
+
+-
+ BUG_ON(block > (sector_t)0x7fffffffUL);
+
+ if (block >= AFFS_I(inode)->i_blkcnt) {
+@@ -827,6 +833,8 @@ affs_truncate(struct inode *inode)
+ res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
+ if (!res)
+ res = mapping->a_ops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
++ else
++ inode->i_size = AFFS_I(inode)->mmu_private;
+ mark_inode_dirty(inode);
+ return;
+ } else if (inode->i_size == AFFS_I(inode)->mmu_private)
+@@ -862,6 +870,7 @@ affs_truncate(struct inode *inode)
+ blk++;
+ } else
+ AFFS_HEAD(ext_bh)->first_data = 0;
++ AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(i);
+ size = AFFS_SB(sb)->s_hashsize;
+ if (size > blkcnt - blk + i)
+ size = blkcnt - blk + i;
+diff --git a/fs/affs/inode.c b/fs/affs/inode.c
+index 27fe6cb..a13b334 100644
+--- a/fs/affs/inode.c
++++ b/fs/affs/inode.c
+@@ -58,7 +58,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
+ AFFS_I(inode)->i_extcnt = 1;
+ AFFS_I(inode)->i_ext_last = ~1;
+ AFFS_I(inode)->i_protect = prot;
+- AFFS_I(inode)->i_opencnt = 0;
++ atomic_set(&AFFS_I(inode)->i_opencnt, 0);
+ AFFS_I(inode)->i_blkcnt = 0;
+ AFFS_I(inode)->i_lc = NULL;
+ AFFS_I(inode)->i_lc_size = 0;
+@@ -108,8 +108,6 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
+ inode->i_mode |= S_IFDIR;
+ } else
+ inode->i_mode = S_IRUGO | S_IXUGO | S_IWUSR | S_IFDIR;
+- if (tail->link_chain)
+- inode->i_nlink = 2;
+ /* Maybe it should be controlled by mount parameter? */
+ //inode->i_mode |= S_ISVTX;
+ inode->i_op = &affs_dir_inode_operations;
+@@ -245,31 +243,12 @@ out:
+ }
+
+ void
+-affs_put_inode(struct inode *inode)
+-{
+- pr_debug("AFFS: put_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
+- affs_free_prealloc(inode);
+-}
+-
+-void
+-affs_drop_inode(struct inode *inode)
+-{
+- mutex_lock(&inode->i_mutex);
+- if (inode->i_size != AFFS_I(inode)->mmu_private)
+- affs_truncate(inode);
+- mutex_unlock(&inode->i_mutex);
+-
+- generic_drop_inode(inode);
+-}
+-
+-void
+ affs_delete_inode(struct inode *inode)
+ {
+ pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
+ truncate_inode_pages(&inode->i_data, 0);
+ inode->i_size = 0;
+- if (S_ISREG(inode->i_mode))
+- affs_truncate(inode);
++ affs_truncate(inode);
+ clear_inode(inode);
+ affs_free_block(inode->i_sb, inode->i_ino);
+ }
+@@ -277,9 +256,12 @@ affs_delete_inode(struct inode *inode)
+ void
+ affs_clear_inode(struct inode *inode)
+ {
+- unsigned long cache_page = (unsigned long) AFFS_I(inode)->i_lc;
++ unsigned long cache_page;
+
+ pr_debug("AFFS: clear_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
++
++ affs_free_prealloc(inode);
++ cache_page = (unsigned long)AFFS_I(inode)->i_lc;
+ if (cache_page) {
+ pr_debug("AFFS: freeing ext cache\n");
+ AFFS_I(inode)->i_lc = NULL;
+@@ -316,7 +298,7 @@ affs_new_inode(struct inode *dir)
+ inode->i_ino = block;
+ inode->i_nlink = 1;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+- AFFS_I(inode)->i_opencnt = 0;
++ atomic_set(&AFFS_I(inode)->i_opencnt, 0);
+ AFFS_I(inode)->i_blkcnt = 0;
+ AFFS_I(inode)->i_lc = NULL;
+ AFFS_I(inode)->i_lc_size = 0;
+@@ -369,12 +351,12 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
+ switch (type) {
+ case ST_LINKFILE:
+ case ST_LINKDIR:
+- inode_bh = bh;
+ retval = -ENOSPC;
+ block = affs_alloc_block(dir, dir->i_ino);
+ if (!block)
+ goto err;
+ retval = -EIO;
++ inode_bh = bh;
+ bh = affs_getzeroblk(sb, block);
+ if (!bh)
+ goto err;
+diff --git a/fs/affs/namei.c b/fs/affs/namei.c
+index 2218f1e..cfcf1b6 100644
+--- a/fs/affs/namei.c
++++ b/fs/affs/namei.c
+@@ -234,7 +234,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+ int
+ affs_unlink(struct inode *dir, struct dentry *dentry)
+ {
+- pr_debug("AFFS: unlink(dir=%d, \"%.*s\")\n", (u32)dir->i_ino,
++ pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino,
++ dentry->d_inode->i_ino,
+ (int)dentry->d_name.len, dentry->d_name.name);
+
+ return affs_remove_header(dentry);
+@@ -302,7 +303,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+ int
+ affs_rmdir(struct inode *dir, struct dentry *dentry)
+ {
+- pr_debug("AFFS: rmdir(dir=%u, \"%.*s\")\n", (u32)dir->i_ino,
++ pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino,
++ dentry->d_inode->i_ino,
+ (int)dentry->d_name.len, dentry->d_name.name);
+
+ return affs_remove_header(dentry);
+diff --git a/fs/affs/super.c b/fs/affs/super.c
+index 01d25d5..d214837 100644
+--- a/fs/affs/super.c
++++ b/fs/affs/super.c
+@@ -71,12 +71,18 @@ static struct kmem_cache * affs_inode_cachep;
+
+ static struct inode *affs_alloc_inode(struct super_block *sb)
+ {
+- struct affs_inode_info *ei;
+- ei = (struct affs_inode_info *)kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL);
+- if (!ei)
++ struct affs_inode_info *i;
++
++ i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL);
++ if (!i)
+ return NULL;
+- ei->vfs_inode.i_version = 1;
+- return &ei->vfs_inode;
++
++ i->vfs_inode.i_version = 1;
++ i->i_lc = NULL;
++ i->i_ext_bh = NULL;
++ i->i_pa_cnt = 0;
++
++ return &i->vfs_inode;
+ }
+
+ static void affs_destroy_inode(struct inode *inode)
+@@ -114,8 +120,6 @@ static const struct super_operations affs_sops = {
+ .alloc_inode = affs_alloc_inode,
+ .destroy_inode = affs_destroy_inode,
+ .write_inode = affs_write_inode,
+- .put_inode = affs_put_inode,
+- .drop_inode = affs_drop_inode,
+ .delete_inode = affs_delete_inode,
+ .clear_inode = affs_clear_inode,
+ .put_super = affs_put_super,
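
With put_inode and drop_inode removed from affs_sops, the last-close
cleanup moves into affs_file_release(), keyed off an atomic_t open count:
atomic_dec_and_test() guarantees exactly one caller observes the count
reaching zero, so the truncate and preallocation release run once, under
i_mutex. The idiom, reduced to its core:

	atomic_inc(&AFFS_I(inode)->i_opencnt);		/* on open */

	if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
		/* we are the last closer: safe to clean up exactly once */
		mutex_lock(&inode->i_mutex);
		/* truncate stray preallocations, free blocks */
		mutex_unlock(&inode->i_mutex);
	}
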
+diff --git a/fs/inode.c b/fs/inode.c
+index bf64781..c36d948 100644
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -1149,13 +1149,8 @@ static inline void iput_final(struct inode *inode)
+ void iput(struct inode *inode)
+ {
+ if (inode) {
+- const struct super_operations *op = inode->i_sb->s_op;
+-
+ BUG_ON(inode->i_state == I_CLEAR);
+
+- if (op && op->put_inode)
+- op->put_inode(inode);
+-
+ if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ iput_final(inode);
+ }
+diff --git a/fs/locks.c b/fs/locks.c
+index 663c069..0ac6b92 100644
+--- a/fs/locks.c
++++ b/fs/locks.c
+@@ -1753,6 +1753,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct file_lock *file_lock = locks_alloc_lock();
+ struct flock flock;
+ struct inode *inode;
++ struct file *f;
+ int error;
+
+ if (file_lock == NULL)
+@@ -1825,7 +1826,15 @@ again:
+ * Attempt to detect a close/fcntl race and recover by
+ * releasing the lock that was just acquired.
+ */
+- if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) {
++ /*
++ * we need that spin_lock here - it prevents reordering between
++ * update of inode->i_flock and check for it done in close().
++ * rcu_read_lock() wouldn't do.
++ */
+ spin_lock(&current->files->file_lock);
++ f = fcheck(fd);
+ spin_unlock(&current->files->file_lock);
++ if (!error && f != filp && flock.l_type != F_UNLCK) {
+ flock.l_type = F_UNLCK;
+ goto again;
+ }
+@@ -1881,6 +1890,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct file_lock *file_lock = locks_alloc_lock();
+ struct flock64 flock;
+ struct inode *inode;
++ struct file *f;
+ int error;
+
+ if (file_lock == NULL)
+@@ -1953,7 +1963,10 @@ again:
+ * Attempt to detect a close/fcntl race and recover by
+ * releasing the lock that was just acquired.
+ */
+- if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) {
+ spin_lock(&current->files->file_lock);
++ f = fcheck(fd);
+ spin_unlock(&current->files->file_lock);
++ if (!error && f != filp && flock.l_type != F_UNLCK) {
+ flock.l_type = F_UNLCK;
+ goto again;
+ }
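
Both fcntl_setlk() and fcntl_setlk64() now look the descriptor up under
current->files->file_lock: the spinlock orders the fcheck() against the
inode->i_flock update that close() tests, which plain rcu_read_lock()
would not. The locked-lookup shape:

	struct file *f;

	spin_lock(&current->files->file_lock);
	f = fcheck(fd);			/* NULL or another struct file */
	spin_unlock(&current->files->file_lock);

	if (!error && f != filp && flock.l_type != F_UNLCK) {
		/* lost a close/fcntl race: undo the lock we just set */
	}
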
+diff --git a/fs/pipe.c b/fs/pipe.c
+index f73492b..3499f9f 100644
+--- a/fs/pipe.c
++++ b/fs/pipe.c
+@@ -1076,6 +1076,23 @@ int do_pipe(int *fd)
+ }
+
+ /*
++ * sys_pipe() is the normal C calling standard for creating
++ * a pipe. It's not the way Unix traditionally does this, though.
++ */
++asmlinkage long __weak sys_pipe(int __user *fildes)
++{
++ int fd[2];
++ int error;
++
++ error = do_pipe(fd);
++ if (!error) {
++ if (copy_to_user(fildes, fd, sizeof(fd)))
++ error = -EFAULT;
++ }
++ return error;
++}
++
++/*
+ * pipefs should _never_ be mounted by userland - too much of security hassle,
+ * no real gain from having the whole whorehouse mounted. So we don't need
+ * any operations on the root directory. However, we need a non-trivial
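
sys_pipe() is added here as a __weak symbol: an architecture whose ABI
returns both descriptors in registers can supply its own strong
definition, and the linker will prefer it; everyone else inherits this
generic copy_to_user() version. The mechanism in miniature, names
hypothetical:

	/* generic default -- overridable */
	long __weak arch_hook(int arg)
	{
		return 0;
	}

	/* elsewhere, an arch may define the strong version:
	 *   long arch_hook(int arg) { ... }
	 * and that one wins at link time. */
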
+diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
+index 4b733f1..4b4f9cc 100644
+--- a/fs/proc/task_nommu.c
++++ b/fs/proc/task_nommu.c
+@@ -1,6 +1,7 @@
+
+ #include <linux/mm.h>
+ #include <linux/file.h>
++#include <linux/fdtable.h>
+ #include <linux/mount.h>
+ #include <linux/ptrace.h>
+ #include <linux/seq_file.h>
+diff --git a/include/asm-alpha/types.h b/include/asm-alpha/types.h
+index a9e34ca..c154135 100644
+--- a/include/asm-alpha/types.h
++++ b/include/asm-alpha/types.h
+@@ -23,5 +23,11 @@ typedef unsigned int umode_t;
+
+ #define BITS_PER_LONG 64
+
++#ifndef __ASSEMBLY__
++
++typedef u64 dma_addr_t;
++typedef u64 dma64_addr_t;
++
++#endif /* __ASSEMBLY__ */
+ #endif /* __KERNEL__ */
+ #endif /* _ALPHA_TYPES_H */
+diff --git a/include/asm-m68k/machw.h b/include/asm-m68k/machw.h
+index d2e0e25..3562499 100644
+--- a/include/asm-m68k/machw.h
++++ b/include/asm-m68k/machw.h
+@@ -66,36 +66,6 @@ struct MAC_SCC
+ # define mac_scc ((*(volatile struct SCC*)MAC_SCC_BAS))
+ #endif
+
+-/* hardware stuff */
+-
+-#define MACHW_DECLARE(name) unsigned name : 1
+-#define MACHW_SET(name) (mac_hw_present.name = 1)
+-#define MACHW_PRESENT(name) (mac_hw_present.name)
+-
+-struct mac_hw_present {
+- /* video hardware */
+- /* sound hardware */
+- /* disk storage interfaces */
+- MACHW_DECLARE(MAC_SCSI_80); /* Directly mapped NCR5380 */
+- MACHW_DECLARE(MAC_SCSI_96); /* 53c9[46] */
+- MACHW_DECLARE(MAC_SCSI_96_2); /* 2nd 53c9[46] Q900 and Q950 */
+- MACHW_DECLARE(IDE); /* IDE Interface */
+- /* other I/O hardware */
+- MACHW_DECLARE(SCC); /* Serial Communications Contr. */
+- /* DMA */
+- MACHW_DECLARE(SCSI_DMA); /* DMA for the NCR5380 */
+- /* real time clocks */
+- MACHW_DECLARE(RTC_CLK); /* clock chip */
+- /* supporting hardware */
+- MACHW_DECLARE(VIA1); /* Versatile Interface Ad. 1 */
+- MACHW_DECLARE(VIA2); /* Versatile Interface Ad. 2 */
+- MACHW_DECLARE(RBV); /* Versatile Interface Ad. 2+ */
+- /* NUBUS */
+- MACHW_DECLARE(NUBUS); /* NUBUS */
+-};
+-
+-extern struct mac_hw_present mac_hw_present;
+-
+ #endif /* __ASSEMBLY__ */
+
+ #endif /* linux/machw.h */
+diff --git a/include/asm-mips/types.h b/include/asm-mips/types.h
+index 7a2ee4f..bcbb8d6 100644
+--- a/include/asm-mips/types.h
++++ b/include/asm-mips/types.h
+@@ -19,8 +19,6 @@
+
+ typedef unsigned short umode_t;
+
+-#endif
+-
+ #endif /* __ASSEMBLY__ */
+
+ /*
+diff --git a/include/asm-powerpc/io.h b/include/asm-powerpc/io.h
+index afae069..e0062d7 100644
+--- a/include/asm-powerpc/io.h
++++ b/include/asm-powerpc/io.h
+@@ -2,7 +2,7 @@
+ #define _ASM_POWERPC_IO_H
+ #ifdef __KERNEL__
+
+-/*
++/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+@@ -18,6 +18,9 @@ extern int check_legacy_ioport(unsigned long base_port);
+ #define _PNPWRP 0xa79
+ #define PNPBIOS_BASE 0xf000
+
++#include <linux/device.h>
++#include <linux/io.h>
++
+ #include <linux/compiler.h>
+ #include <asm/page.h>
+ #include <asm/byteorder.h>
+@@ -744,6 +747,9 @@ static inline void * bus_to_virt(unsigned long address)
+
+ #define clrsetbits_8(addr, clear, set) clrsetbits(8, addr, clear, set)
+
++void __iomem *devm_ioremap_prot(struct device *dev, resource_size_t offset,
++ size_t size, unsigned long flags);
++
+ #endif /* __KERNEL__ */
+
+ #endif /* _ASM_POWERPC_IO_H */
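
(devm_ioremap_prot(), declared above, follows the usual devres pattern: the mapping is tied to the device's lifetime, so error and remove paths need no explicit iounmap. A sketch of a hypothetical caller — the driver name, resource handling and the 0 flags value are placeholders, not taken from this patch:

#include <linux/platform_device.h>
#include <linux/io.h>

static int foo_probe(struct platform_device *pdev)
{
	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	void __iomem *regs;

	if (!res)
		return -ENODEV;
	/* mapping is released automatically when the device goes away */
	regs = devm_ioremap_prot(&pdev->dev, res->start,
				 res->end - res->start + 1, 0 /* prot flags */);
	if (!regs)
		return -ENOMEM;
	return 0;
}
)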
+diff --git a/include/asm-powerpc/kvm_host.h b/include/asm-powerpc/kvm_host.h
+index 04ffbb8..81a69d7 100644
+--- a/include/asm-powerpc/kvm_host.h
++++ b/include/asm-powerpc/kvm_host.h
+@@ -59,6 +59,7 @@ struct kvm_vcpu_stat {
+ u32 emulated_inst_exits;
+ u32 dec_exits;
+ u32 ext_intr_exits;
++ u32 halt_wakeup;
+ };
+
+ struct tlbe {
+diff --git a/include/asm-powerpc/kvm_ppc.h b/include/asm-powerpc/kvm_ppc.h
+index 7ac8203..b35a7e3 100644
+--- a/include/asm-powerpc/kvm_ppc.h
++++ b/include/asm-powerpc/kvm_ppc.h
+@@ -77,12 +77,17 @@ static inline void kvmppc_clear_exception(struct kvm_vcpu *vcpu, int exception)
+ clear_bit(priority, &vcpu->arch.pending_exceptions);
+ }
+
++/* Helper function for "full" MSR writes. No need to call this if only EE is
++ * changing. */
+ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
+ {
+ if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR))
+ kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
+
+ vcpu->arch.msr = new_msr;
++
++ if (vcpu->arch.msr & MSR_WE)
++ kvm_vcpu_block(vcpu);
+ }
+
+ #endif /* __POWERPC_KVM_PPC_H__ */
+diff --git a/include/asm-powerpc/syscalls.h b/include/asm-powerpc/syscalls.h
+index b3ca41f..2b8a458 100644
+--- a/include/asm-powerpc/syscalls.h
++++ b/include/asm-powerpc/syscalls.h
+@@ -30,7 +30,7 @@ asmlinkage int sys_fork(unsigned long p1, unsigned long p2,
+ asmlinkage int sys_vfork(unsigned long p1, unsigned long p2,
+ unsigned long p3, unsigned long p4, unsigned long p5,
+ unsigned long p6, struct pt_regs *regs);
+-asmlinkage int sys_pipe(int __user *fildes);
++asmlinkage long sys_pipe(int __user *fildes);
+ asmlinkage long sys_rt_sigaction(int sig,
+ const struct sigaction __user *act,
+ struct sigaction __user *oact, size_t sigsetsize);
+diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h
+index e865990..f62f473 100644
+--- a/include/asm-x86/bootparam.h
++++ b/include/asm-x86/bootparam.h
+@@ -14,10 +14,10 @@
+
+ /* extensible setup data list node */
+ struct setup_data {
+- u64 next;
+- u32 type;
+- u32 len;
+- u8 data[0];
++ __u64 next;
++ __u32 type;
++ __u32 len;
++ __u8 data[0];
+ };
+
+ struct setup_header {
+diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
+index 9d963cd..1d8cd01 100644
+--- a/include/asm-x86/kvm_host.h
++++ b/include/asm-x86/kvm_host.h
+@@ -314,6 +314,9 @@ struct kvm_arch{
+ struct page *apic_access_page;
+
+ gpa_t wall_clock;
++
++ struct page *ept_identity_pagetable;
++ bool ept_identity_pagetable_done;
+ };
+
+ struct kvm_vm_stat {
+@@ -422,6 +425,7 @@ struct kvm_x86_ops {
+ struct kvm_run *run);
+
+ int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
++ int (*get_tdp_level)(void);
+ };
+
+ extern struct kvm_x86_ops *kvm_x86_ops;
+@@ -433,6 +437,9 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+ int kvm_mmu_create(struct kvm_vcpu *vcpu);
+ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
++void kvm_mmu_set_base_ptes(u64 base_pte);
++void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
++ u64 dirty_mask, u64 nx_mask, u64 x_mask);
+
+ int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+@@ -620,7 +627,7 @@ static inline void fx_restore(struct i387_fxsave_struct *image)
+ asm("fxrstor (%0)":: "r" (image));
+ }
+
+-static inline void fpu_init(void)
++static inline void fx_finit(void)
+ {
+ asm("finit");
+ }
+@@ -644,6 +651,7 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+ #define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
+ #define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
+ #define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
++#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+ #define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
+
+ #define MSR_IA32_TIME_STAMP_COUNTER 0x010
+diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h
+index 577ab79..d7f0403 100644
+--- a/include/asm-x86/pgtable_32.h
++++ b/include/asm-x86/pgtable_32.h
+@@ -88,14 +88,7 @@ extern unsigned long pg0[];
+ /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
+ #define pmd_none(x) (!(unsigned long)pmd_val((x)))
+ #define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
+-
+-extern int pmd_bad(pmd_t pmd);
+-
+-#define pmd_bad_v1(x) \
+- (_KERNPG_TABLE != (pmd_val((x)) & ~(PAGE_MASK | _PAGE_USER)))
+-#define pmd_bad_v2(x) \
+- (_KERNPG_TABLE != (pmd_val((x)) & ~(PAGE_MASK | _PAGE_USER | \
+- _PAGE_PSE | _PAGE_NX)))
++#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+
+ #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
+
+diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
+index a3bbf87..efe83dc 100644
+--- a/include/asm-x86/pgtable_64.h
++++ b/include/asm-x86/pgtable_64.h
+@@ -158,14 +158,12 @@ static inline unsigned long pgd_bad(pgd_t pgd)
+
+ static inline unsigned long pud_bad(pud_t pud)
+ {
+- return pud_val(pud) &
+- ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER | _PAGE_PSE | _PAGE_NX);
++ return pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+ }
+
+ static inline unsigned long pmd_bad(pmd_t pmd)
+ {
+- return pmd_val(pmd) &
+- ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER | _PAGE_PSE | _PAGE_NX);
++ return pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+ }
+
+ #define pte_none(x) (!pte_val((x)))
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index a1ba005..7e0fa9e 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -1289,17 +1289,12 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
+ extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
+ unsigned long, loff_t *);
+
+-/*
+- * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called
+- * without the big kernel lock held in all filesystems.
+- */
+ struct super_operations {
+ struct inode *(*alloc_inode)(struct super_block *sb);
+ void (*destroy_inode)(struct inode *);
+
+ void (*dirty_inode) (struct inode *);
+ int (*write_inode) (struct inode *, int);
+- void (*put_inode) (struct inode *);
+ void (*drop_inode) (struct inode *);
+ void (*delete_inode) (struct inode *);
+ void (*put_super) (struct super_block *);
+diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
+index 31a4d65..6d93dce 100644
+--- a/include/linux/hrtimer.h
++++ b/include/linux/hrtimer.h
+@@ -316,6 +316,15 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
+ (HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING);
+ }
+
++/*
++ * Helper function to check, whether the timer is running the callback
++ * function
++ */
++static inline int hrtimer_callback_running(struct hrtimer *timer)
++{
++ return timer->state & HRTIMER_STATE_CALLBACK;
++}
++
+ /* Forward a hrtimer so it expires after now: */
+ extern u64
+ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
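
(hrtimer_callback_running() moves from kernel/hrtimer.c into this header — see the hrtimer.c hunk further down — so other core code, here the scheduler's hrtick, can test for callback recursion. A hypothetical use; mytimer and delay_ns are placeholders:

/* don't re-arm from outside while the callback itself is running;
 * the callback will re-arm on return */
if (!hrtimer_callback_running(&mytimer))
	hrtimer_start(&mytimer, ns_to_ktime(delay_ns), HRTIMER_MODE_REL);
)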
+diff --git a/include/linux/io.h b/include/linux/io.h
+index 3a03a36..6c7f0ba 100644
+--- a/include/linux/io.h
++++ b/include/linux/io.h
+@@ -65,5 +65,6 @@ void __iomem *devm_ioremap_nocache(struct device *dev, resource_size_t offset,
+ void devm_iounmap(struct device *dev, void __iomem *addr);
+ int check_signature(const volatile void __iomem *io_addr,
+ const unsigned char *signature, int length);
++void devm_ioremap_release(struct device *dev, void *res);
+
+ #endif /* _LINUX_IO_H */
+diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
+index 9757b1a..6adcc29 100644
+--- a/include/linux/kgdb.h
++++ b/include/linux/kgdb.h
+@@ -261,10 +261,12 @@ struct kgdb_io {
+
+ extern struct kgdb_arch arch_kgdb_ops;
+
++extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs);
++
+ extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops);
+ extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops);
+
+-extern int kgdb_hex2long(char **ptr, long *long_val);
++extern int kgdb_hex2long(char **ptr, unsigned long *long_val);
+ extern int kgdb_mem2hex(char *mem, char *buf, int count);
+ extern int kgdb_hex2mem(char *buf, char *mem, int count);
+
+diff --git a/include/linux/libata.h b/include/linux/libata.h
+index d1dfe87..7e206da 100644
+--- a/include/linux/libata.h
++++ b/include/linux/libata.h
+@@ -1039,6 +1039,7 @@ extern void ata_eh_thaw_port(struct ata_port *ap);
+
+ extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
+ extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
++extern void ata_eh_analyze_ncq_error(struct ata_link *link);
+
+ extern void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
+ ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
+@@ -1381,6 +1382,21 @@ static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host)
+ return *(struct ata_port **)&host->hostdata[0];
+ }
+
++static inline int ata_check_ready(u8 status)
++{
++ /* Some controllers report 0x77 or 0x7f during intermediate
++ * not-ready stages.
++ */
++ if (status == 0x77 || status == 0x7f)
++ return 0;
++
++ /* 0xff indicates either no device or device not ready */
++ if (status == 0xff)
++ return -ENODEV;
++
++ return !(status & ATA_BUSY);
++}
++
+
+ /**************************************************************************
+ * PMP - drivers/ata/libata-pmp.c
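
(ata_check_ready() above folds three cases into one return value: 0 for "keep polling", -ENODEV for "give up", positive for "ready". Since the helper is pure, it can be exercised stand-alone; this sketch re-declares it in userspace purely for demonstration — ATA_BUSY is bit 7, 0x80, per the ATA spec:

#include <stdio.h>
#include <errno.h>

#define ATA_BUSY 0x80			/* BSY bit of the status register */

static int ata_check_ready(unsigned char status)
{
	if (status == 0x77 || status == 0x7f)	/* intermediate not-ready */
		return 0;
	if (status == 0xff)			/* no device / not ready */
		return -ENODEV;
	return !(status & ATA_BUSY);
}

int main(void)
{
	printf("%d\n", ata_check_ready(0x50));	/* 1: DRDY set, BSY clear */
	printf("%d\n", ata_check_ready(0x80));	/* 0: still busy */
	printf("%d\n", ata_check_ready(0xff));	/* -19: -ENODEV */
	return 0;
}
)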
+diff --git a/include/linux/pci.h b/include/linux/pci.h
+index 96acd0d..509159b 100644
+--- a/include/linux/pci.h
++++ b/include/linux/pci.h
+@@ -44,6 +44,7 @@
+ #include <linux/mod_devicetable.h>
+
+ #include <linux/types.h>
++#include <linux/init.h>
+ #include <linux/ioport.h>
+ #include <linux/list.h>
+ #include <linux/compiler.h>
+@@ -474,7 +475,7 @@ extern struct pci_bus *pci_find_bus(int domain, int busnr);
+ void pci_bus_add_devices(struct pci_bus *bus);
+ struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus,
+ struct pci_ops *ops, void *sysdata);
+-static inline struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops,
++static inline struct pci_bus * __devinit pci_scan_bus(int bus, struct pci_ops *ops,
+ void *sysdata)
+ {
+ struct pci_bus *root_bus;
+@@ -666,7 +667,7 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
+
+ void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *),
+ void *userdata);
+-int pci_cfg_space_size_ext(struct pci_dev *dev, unsigned check_exp_pcix);
++int pci_cfg_space_size_ext(struct pci_dev *dev);
+ int pci_cfg_space_size(struct pci_dev *dev);
+ unsigned char pci_bus_max_busnr(struct pci_bus *bus);
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 03c2380..0c35b03 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -158,6 +158,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
+ }
+ #endif
+
++extern unsigned long long time_sync_thresh;
++
+ /*
+ * Task state bitmask. NOTE! These bits are also
+ * encoded in fs/proc/array.c: get_task_state().
+@@ -1551,6 +1553,35 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
+
+ extern unsigned long long sched_clock(void);
+
++#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
++static inline void sched_clock_init(void)
++{
++}
++
++static inline u64 sched_clock_cpu(int cpu)
++{
++ return sched_clock();
++}
++
++static inline void sched_clock_tick(void)
++{
++}
++
++static inline void sched_clock_idle_sleep_event(void)
++{
++}
++
++static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
++{
++}
++#else
++extern void sched_clock_init(void);
++extern u64 sched_clock_cpu(int cpu);
++extern void sched_clock_tick(void);
++extern void sched_clock_idle_sleep_event(void);
++extern void sched_clock_idle_wakeup_event(u64 delta_ns);
++#endif
++
+ /*
+ * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+ * clock constructed from sched_clock():
+@@ -1977,6 +2008,11 @@ static inline void clear_tsk_need_resched(struct task_struct *tsk)
+ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+ }
+
++static inline int test_tsk_need_resched(struct task_struct *tsk)
++{
++ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
++}
++
+ static inline int signal_pending(struct task_struct *p)
+ {
+ return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
+@@ -1991,7 +2027,7 @@ static inline int fatal_signal_pending(struct task_struct *p)
+
+ static inline int need_resched(void)
+ {
+- return unlikely(test_thread_flag(TIF_NEED_RESCHED));
++ return unlikely(test_tsk_need_resched(current));
+ }
+
+ /*
+diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
+index 27bad59..7858eac 100644
+--- a/include/linux/sysfs.h
++++ b/include/linux/sysfs.h
+@@ -196,12 +196,6 @@ static inline int sysfs_update_group(struct kobject *kobj,
+ return 0;
+ }
+
+-static inline int sysfs_update_group(struct kobject *kobj,
+- const struct attribute_group *grp)
+-{
+- return 0;
+-}
+-
+ static inline void sysfs_remove_group(struct kobject *kobj,
+ const struct attribute_group *grp)
+ {
+diff --git a/include/net/ip.h b/include/net/ip.h
+index 6d7bcd5..3b40bc2 100644
+--- a/include/net/ip.h
++++ b/include/net/ip.h
+@@ -210,7 +210,7 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
+ {
+ return (inet_sk(sk)->pmtudisc == IP_PMTUDISC_DO ||
+ (inet_sk(sk)->pmtudisc == IP_PMTUDISC_WANT &&
+- !(dst_metric(dst, RTAX_LOCK)&(1<<RTAX_MTU))));
++ !(dst_metric_locked(dst, RTAX_MTU))));
+ }
+
+ extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more);
+diff --git a/include/net/xfrm.h b/include/net/xfrm.h
+index d1350bc..2933d74 100644
+--- a/include/net/xfrm.h
++++ b/include/net/xfrm.h
+@@ -648,14 +648,46 @@ extern void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family,
+ extern void xfrm_audit_state_icvfail(struct xfrm_state *x,
+ struct sk_buff *skb, u8 proto);
+ #else
+-#define xfrm_audit_policy_add(x, r, a, se, s) do { ; } while (0)
+-#define xfrm_audit_policy_delete(x, r, a, se, s) do { ; } while (0)
+-#define xfrm_audit_state_add(x, r, a, se, s) do { ; } while (0)
+-#define xfrm_audit_state_delete(x, r, a, se, s) do { ; } while (0)
+-#define xfrm_audit_state_replay_overflow(x, s) do { ; } while (0)
+-#define xfrm_audit_state_notfound_simple(s, f) do { ; } while (0)
+-#define xfrm_audit_state_notfound(s, f, sp, sq) do { ; } while (0)
+-#define xfrm_audit_state_icvfail(x, s, p) do { ; } while (0)
++
++static inline void xfrm_audit_policy_add(struct xfrm_policy *xp, int result,
++ u32 auid, u32 ses, u32 secid)
++{
++}
++
++static inline void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
++ u32 auid, u32 ses, u32 secid)
++{
++}
++
++static inline void xfrm_audit_state_add(struct xfrm_state *x, int result,
++ u32 auid, u32 ses, u32 secid)
++{
++}
++
++static inline void xfrm_audit_state_delete(struct xfrm_state *x, int result,
++ u32 auid, u32 ses, u32 secid)
++{
++}
++
++static inline void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
++ struct sk_buff *skb)
++{
++}
++
++static inline void xfrm_audit_state_notfound_simple(struct sk_buff *skb,
++ u16 family)
++{
++}
++
++static inline void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family,
++ __be32 net_spi, __be32 net_seq)
++{
++}
++
++static inline void xfrm_audit_state_icvfail(struct xfrm_state *x,
++ struct sk_buff *skb, u8 proto)
++{
++}
+ #endif /* CONFIG_AUDITSYSCALL */
+
+ static inline void xfrm_pol_hold(struct xfrm_policy *policy)
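
(Replacing the do { ; } while (0) stubs with empty static inlines is the usual kernel idiom: the compiler still type-checks every argument in !CONFIG_AUDITSYSCALL builds, and the calls still compile to nothing. A minimal illustration of the difference — names are made up for the example:

#include <stdio.h>

/* macro stub: arguments vanish before the compiler ever sees them */
#define audit_stub_macro(msg, result)	do { ; } while (0)

/* inline stub: arguments are type-checked, code still compiles away */
static inline void audit_stub_inline(const char *msg, int result)
{
}

int main(void)
{
	audit_stub_macro(42, "swapped");  /* compiles silently: nothing checked */
	audit_stub_inline("login", 1);	  /* swapping these would draw a warning */
	puts("both stubs generate no code");
	return 0;
}
)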
+diff --git a/init/Kconfig b/init/Kconfig
+index 6a44def..4c33316 100644
+--- a/init/Kconfig
++++ b/init/Kconfig
+@@ -316,9 +316,16 @@ config CPUSETS
+
+ Say N if unsure.
+
++#
++# Architectures with an unreliable sched_clock() should select this:
++#
++config HAVE_UNSTABLE_SCHED_CLOCK
++ bool
++
+ config GROUP_SCHED
+ bool "Group CPU scheduler"
+- default y
++ depends on EXPERIMENTAL
++ default n
+ help
+ This feature lets CPU scheduler recognize task groups and control CPU
+ bandwidth allocation to such task groups.
+@@ -326,7 +333,7 @@ config GROUP_SCHED
+ config FAIR_GROUP_SCHED
+ bool "Group scheduling for SCHED_OTHER"
+ depends on GROUP_SCHED
+- default y
++ default GROUP_SCHED
+
+ config RT_GROUP_SCHED
+ bool "Group scheduling for SCHED_RR/FIFO"
+@@ -825,6 +832,15 @@ menuconfig MODULES
+
+ If unsure, say Y.
+
++config MODULE_FORCE_LOAD
++ bool "Forced module loading"
++ depends on MODULES
++ default n
++ help
++ This option allows loading of modules even if that would set the
+	  'F' (forced) taint, due to lack of version info, which is
+	  usually a really bad idea.
++
+ config MODULE_UNLOAD
+ bool "Module unloading"
+ depends on MODULES
+diff --git a/init/main.c b/init/main.c
+index a87d4ca..ddada7a 100644
+--- a/init/main.c
++++ b/init/main.c
+@@ -602,6 +602,7 @@ asmlinkage void __init start_kernel(void)
+ softirq_init();
+ timekeeping_init();
+ time_init();
++ sched_clock_init();
+ profile_init();
+ if (!irqs_disabled())
+ printk("start_kernel(): bug: interrupts were enabled early\n");
+diff --git a/ipc/mqueue.c b/ipc/mqueue.c
+index 94fd3b0..b3b69fd 100644
+--- a/ipc/mqueue.c
++++ b/ipc/mqueue.c
+@@ -673,7 +673,7 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
+ if (IS_ERR(name = getname(u_name)))
+ return PTR_ERR(name);
+
+- fd = get_unused_fd();
++ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0)
+ goto out_putname;
+
+@@ -709,7 +709,6 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
+ goto out_putfd;
+ }
+
+- set_close_on_exec(fd, 1);
+ fd_install(fd, filp);
+ goto out_upsem;
+
+diff --git a/kernel/Makefile b/kernel/Makefile
+index 188c432..1c9938a 100644
+--- a/kernel/Makefile
++++ b/kernel/Makefile
+@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
+ rcupdate.o extable.o params.o posix-timers.o \
+ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
+- notifier.o ksysfs.o pm_qos_params.o
++ notifier.o ksysfs.o pm_qos_params.o sched_clock.o
+
+ obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
+ obj-$(CONFIG_STACKTRACE) += stacktrace.o
+diff --git a/kernel/futex.c b/kernel/futex.c
+index 98092c9..449def8 100644
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -104,10 +104,6 @@ struct futex_q {
+ /* Key which the futex is hashed on: */
+ union futex_key key;
+
+- /* For fd, sigio sent using these: */
+- int fd;
+- struct file *filp;
+-
+ /* Optional priority inheritance state: */
+ struct futex_pi_state *pi_state;
+ struct task_struct *task;
+@@ -126,9 +122,6 @@ struct futex_hash_bucket {
+
+ static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+
+-/* Futex-fs vfsmount entry: */
+-static struct vfsmount *futex_mnt;
+-
+ /*
+ * Take mm->mmap_sem, when futex is shared
+ */
+@@ -610,8 +603,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ static void wake_futex(struct futex_q *q)
+ {
+ plist_del(&q->list, &q->list.plist);
+- if (q->filp)
+- send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
+ /*
+ * The lock in wake_up_all() is a crucial memory barrier after the
+ * plist_del() and also before assigning to q->lock_ptr.
+@@ -988,14 +979,10 @@ out:
+ }
+
+ /* The key must be already stored in q->key. */
+-static inline struct futex_hash_bucket *
+-queue_lock(struct futex_q *q, int fd, struct file *filp)
++static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+ {
+ struct futex_hash_bucket *hb;
+
+- q->fd = fd;
+- q->filp = filp;
+-
+ init_waitqueue_head(&q->waiters);
+
+ get_futex_key_refs(&q->key);
+@@ -1006,7 +993,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
+ return hb;
+ }
+
+-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
++static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ {
+ int prio;
+
+@@ -1041,15 +1028,6 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+ * exactly once. They are called with the hashed spinlock held.
+ */
+
+-/* The key must be already stored in q->key. */
+-static void queue_me(struct futex_q *q, int fd, struct file *filp)
+-{
+- struct futex_hash_bucket *hb;
+-
+- hb = queue_lock(q, fd, filp);
+- __queue_me(q, hb);
+-}
+-
+ /* Return 1 if we were still queued (ie. 0 means we were woken) */
+ static int unqueue_me(struct futex_q *q)
+ {
+@@ -1194,7 +1172,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+ if (unlikely(ret != 0))
+ goto out_release_sem;
+
+- hb = queue_lock(&q, -1, NULL);
++ hb = queue_lock(&q);
+
+ /*
+ * Access the page AFTER the futex is queued.
+@@ -1238,7 +1216,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+ goto out_unlock_release_sem;
+
+ /* Only actually queue if *uaddr contained val. */
+- __queue_me(&q, hb);
++ queue_me(&q, hb);
+
+ /*
+ * Now the futex is queued and we have checked the data, we
+@@ -1386,7 +1364,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+ goto out_release_sem;
+
+ retry_unlocked:
+- hb = queue_lock(&q, -1, NULL);
++ hb = queue_lock(&q);
+
+ retry_locked:
+ ret = lock_taken = 0;
+@@ -1499,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+- __queue_me(&q, hb);
++ queue_me(&q, hb);
+
+ /*
+ * Now the futex is queued and we have checked the data, we
+@@ -1746,121 +1724,6 @@ pi_faulted:
+ return ret;
+ }
+
+-static int futex_close(struct inode *inode, struct file *filp)
+-{
+- struct futex_q *q = filp->private_data;
+-
+- unqueue_me(q);
+- kfree(q);
+-
+- return 0;
+-}
+-
+-/* This is one-shot: once it's gone off you need a new fd */
+-static unsigned int futex_poll(struct file *filp,
+- struct poll_table_struct *wait)
+-{
+- struct futex_q *q = filp->private_data;
+- int ret = 0;
+-
+- poll_wait(filp, &q->waiters, wait);
+-
+- /*
+- * plist_node_empty() is safe here without any lock.
+- * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
+- */
+- if (plist_node_empty(&q->list))
+- ret = POLLIN | POLLRDNORM;
+-
+- return ret;
+-}
+-
+-static const struct file_operations futex_fops = {
+- .release = futex_close,
+- .poll = futex_poll,
+-};
+-
+-/*
+- * Signal allows caller to avoid the race which would occur if they
+- * set the sigio stuff up afterwards.
+- */
+-static int futex_fd(u32 __user *uaddr, int signal)
+-{
+- struct futex_q *q;
+- struct file *filp;
+- int ret, err;
+- struct rw_semaphore *fshared;
+- static unsigned long printk_interval;
+-
+- if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
+- printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
+- "will be removed from the kernel in June 2007\n",
+- current->comm);
+- }
+-
+- ret = -EINVAL;
+- if (!valid_signal(signal))
+- goto out;
+-
+- ret = get_unused_fd();
+- if (ret < 0)
+- goto out;
+- filp = get_empty_filp();
+- if (!filp) {
+- put_unused_fd(ret);
+- ret = -ENFILE;
+- goto out;
+- }
+- filp->f_op = &futex_fops;
+- filp->f_path.mnt = mntget(futex_mnt);
+- filp->f_path.dentry = dget(futex_mnt->mnt_root);
+- filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
+-
+- if (signal) {
+- err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
+- if (err < 0) {
+- goto error;
+- }
+- filp->f_owner.signum = signal;
+- }
+-
+- q = kmalloc(sizeof(*q), GFP_KERNEL);
+- if (!q) {
+- err = -ENOMEM;
+- goto error;
+- }
+- q->pi_state = NULL;
+-
+-	fshared = &current->mm->mmap_sem;
+- down_read(fshared);
+- err = get_futex_key(uaddr, fshared, &q->key);
+-
+- if (unlikely(err != 0)) {
+- up_read(fshared);
+- kfree(q);
+- goto error;
+- }
+-
+- /*
+- * queue_me() must be called before releasing mmap_sem, because
+- * key->shared.inode needs to be referenced while holding it.
+- */
+- filp->private_data = q;
+-
+- queue_me(q, ret, filp);
+- up_read(fshared);
+-
+- /* Now we map fd to filp, so userspace can access it */
+- fd_install(ret, filp);
+-out:
+- return ret;
+-error:
+- put_unused_fd(ret);
+- put_filp(filp);
+- ret = err;
+- goto out;
+-}
+-
+ /*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+@@ -2092,10 +1955,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ case FUTEX_WAKE_BITSET:
+ ret = futex_wake(uaddr, fshared, val, val3);
+ break;
+- case FUTEX_FD:
+- /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
+- ret = futex_fd(uaddr, val);
+- break;
+ case FUTEX_REQUEUE:
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
+ break;
+@@ -2156,19 +2015,6 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+ }
+
+-static int futexfs_get_sb(struct file_system_type *fs_type,
+- int flags, const char *dev_name, void *data,
+- struct vfsmount *mnt)
+-{
+- return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt);
+-}
+-
+-static struct file_system_type futex_fs_type = {
+- .name = "futexfs",
+- .get_sb = futexfs_get_sb,
+- .kill_sb = kill_anon_super,
+-};
+-
+ static int __init futex_init(void)
+ {
+ u32 curval;
+@@ -2193,16 +2039,6 @@ static int __init futex_init(void)
+ spin_lock_init(&futex_queues[i].lock);
+ }
+
+- i = register_filesystem(&futex_fs_type);
+- if (i)
+- return i;
+-
+- futex_mnt = kern_mount(&futex_fs_type);
+- if (IS_ERR(futex_mnt)) {
+- unregister_filesystem(&futex_fs_type);
+- return PTR_ERR(futex_mnt);
+- }
+-
+ return 0;
+ }
+ __initcall(futex_init);
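
(With FUTEX_FD gone — the printk removed above had warned since 2007 that it would be dropped — futexes are reached only through sys_futex. A minimal wait that returns immediately because the futex word no longer holds the expected value, which is the core FUTEX_WAIT contract:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static int futex_word = 1;

int main(void)
{
	/* expected value 0 != actual 1, so the kernel returns EAGAIN
	 * instead of sleeping */
	long rc = syscall(SYS_futex, &futex_word, FUTEX_WAIT, 0, NULL, NULL, 0);
	if (rc == -1)
		perror("FUTEX_WAIT");	/* Resource temporarily unavailable */
	return 0;
}
)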
+diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
+index 9af1d6a..421be5f 100644
+--- a/kernel/hrtimer.c
++++ b/kernel/hrtimer.c
+@@ -154,15 +154,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
+ }
+
+ /*
+- * Helper function to check, whether the timer is running the callback
+- * function
+- */
+-static inline int hrtimer_callback_running(struct hrtimer *timer)
+-{
+- return timer->state & HRTIMER_STATE_CALLBACK;
+-}
+-
+-/*
+ * Functions and macros which are different for UP/SMP systems are kept in a
+ * single place
+ */
+diff --git a/kernel/kgdb.c b/kernel/kgdb.c
+index 1bd0ec1..39e31a0 100644
+--- a/kernel/kgdb.c
++++ b/kernel/kgdb.c
+@@ -61,7 +61,7 @@ struct kgdb_state {
+ int err_code;
+ int cpu;
+ int pass_exception;
+- long threadid;
++ unsigned long threadid;
+ long kgdb_usethreadid;
+ struct pt_regs *linux_regs;
+ };
+@@ -146,7 +146,7 @@ atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
+ * the other CPUs might interfere with your debugging context, so
+ * use this with care:
+ */
+-int kgdb_do_roundup = 1;
++static int kgdb_do_roundup = 1;
+
+ static int __init opt_nokgdbroundup(char *str)
+ {
+@@ -438,7 +438,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+-int kgdb_hex2long(char **ptr, long *long_val)
++int kgdb_hex2long(char **ptr, unsigned long *long_val)
+ {
+ int hex_val;
+ int num = 0;
+@@ -709,7 +709,7 @@ int kgdb_isremovedbreak(unsigned long addr)
+ return 0;
+ }
+
+-int remove_all_break(void)
++static int remove_all_break(void)
+ {
+ unsigned long addr;
+ int error;
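
(The long -> unsigned long switch in kgdb_hex2long() and struct kgdb_state.threadid matters because gdb remote-protocol thread IDs and addresses use the full register width; parsing them into a signed long either overflows or sign-extends. The same effect, shown with the libc parsers in userspace:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *hex = "ffffffffffffffff";	/* a full-width id */

	long s = strtol(hex, NULL, 16);		  /* overflows: clamps to LONG_MAX */
	unsigned long u = strtoul(hex, NULL, 16); /* keeps the full value */

	printf("signed:   %lx\n", s);	/* 7fffffffffffffff on 64-bit */
	printf("unsigned: %lx\n", u);	/* ffffffffffffffff */
	return 0;
}
)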
+diff --git a/kernel/module.c b/kernel/module.c
+index 8674a39..8e4528c 100644
+--- a/kernel/module.c
++++ b/kernel/module.c
+@@ -890,6 +890,19 @@ static struct module_attribute *modinfo_attrs[] = {
+
+ static const char vermagic[] = VERMAGIC_STRING;
+
++static int try_to_force_load(struct module *mod, const char *symname)
++{
++#ifdef CONFIG_MODULE_FORCE_LOAD
++ if (!(tainted & TAINT_FORCED_MODULE))
++ printk("%s: no version for \"%s\" found: kernel tainted.\n",
++ mod->name, symname);
++ add_taint_module(mod, TAINT_FORCED_MODULE);
++ return 0;
++#else
++ return -ENOEXEC;
++#endif
++}
++
+ #ifdef CONFIG_MODVERSIONS
+ static int check_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+@@ -914,18 +927,18 @@ static int check_version(Elf_Shdr *sechdrs,
+
+ if (versions[i].crc == *crc)
+ return 1;
+- printk("%s: disagrees about version of symbol %s\n",
+- mod->name, symname);
+ DEBUGP("Found checksum %lX vs module %lX\n",
+ *crc, versions[i].crc);
+- return 0;
++ goto bad_version;
+ }
+- /* Not in module's version table. OK, but that taints the kernel. */
+- if (!(tainted & TAINT_FORCED_MODULE))
+- printk("%s: no version for \"%s\" found: kernel tainted.\n",
+- mod->name, symname);
+- add_taint_module(mod, TAINT_FORCED_MODULE);
+- return 1;
++
++ if (!try_to_force_load(mod, symname))
++ return 1;
++
++bad_version:
++ printk("%s: disagrees about version of symbol %s\n",
++ mod->name, symname);
++ return 0;
+ }
+
+ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
+@@ -1853,9 +1866,9 @@ static struct module *load_module(void __user *umod,
+ modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
+ /* This is allowed: modprobe --force will invalidate it. */
+ if (!modmagic) {
+- add_taint_module(mod, TAINT_FORCED_MODULE);
+- printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
+- mod->name);
++ err = try_to_force_load(mod, "magic");
++ if (err)
++ goto free_hdr;
+ } else if (!same_magic(modmagic, vermagic)) {
+ printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
+ mod->name, modmagic, vermagic);
+@@ -2006,9 +2019,10 @@ static struct module *load_module(void __user *umod,
+ (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
+ (mod->num_unused_syms && !unusedcrcindex) ||
+ (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
+- printk(KERN_WARNING "%s: No versions for exported symbols."
+- " Tainting kernel.\n", mod->name);
+- add_taint_module(mod, TAINT_FORCED_MODULE);
++ printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
++ err = try_to_force_load(mod, "nocrc");
++ if (err)
++ goto cleanup;
+ }
+ #endif
+ markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 34bcc5b..58fb8af 100644
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -75,16 +75,6 @@
+ #include <asm/irq_regs.h>
+
+ /*
+- * Scheduler clock - returns current time in nanosec units.
+- * This is default implementation.
+- * Architectures and sub-architectures can override this.
+- */
+-unsigned long long __attribute__((weak)) sched_clock(void)
+-{
+- return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+-}
+-
+-/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+@@ -242,6 +232,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
+ }
+ #endif
+
++/*
++ * sched_domains_mutex serializes calls to arch_init_sched_domains,
++ * detach_destroy_domains and partition_sched_domains.
++ */
++static DEFINE_MUTEX(sched_domains_mutex);
++
+ #ifdef CONFIG_GROUP_SCHED
+
+ #include <linux/cgroup.h>
+@@ -308,9 +304,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
+ */
+ static DEFINE_SPINLOCK(task_group_lock);
+
+-/* doms_cur_mutex serializes access to doms_cur[] array */
+-static DEFINE_MUTEX(doms_cur_mutex);
+-
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ #ifdef CONFIG_USER_SCHED
+ # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
+@@ -318,7 +311,13 @@ static DEFINE_MUTEX(doms_cur_mutex);
+ # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
+ #endif
+
++/*
++ * A weight of 0, 1 or ULONG_MAX can cause arithmetic problems.
++ * (The default weight is 1024 - so there's no practical
++ * limitation from this.)
++ */
+ #define MIN_SHARES 2
++#define MAX_SHARES (ULONG_MAX - 1)
+
+ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+ #endif
+@@ -358,21 +357,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+ #endif
+ }
+
+-static inline void lock_doms_cur(void)
+-{
+- mutex_lock(&doms_cur_mutex);
+-}
+-
+-static inline void unlock_doms_cur(void)
+-{
+- mutex_unlock(&doms_cur_mutex);
+-}
+-
+ #else
+
+ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+-static inline void lock_doms_cur(void) { }
+-static inline void unlock_doms_cur(void) { }
+
+ #endif /* CONFIG_GROUP_SCHED */
+
+@@ -560,13 +547,7 @@ struct rq {
+ unsigned long next_balance;
+ struct mm_struct *prev_mm;
+
+- u64 clock, prev_clock_raw;
+- s64 clock_max_delta;
+-
+- unsigned int clock_warps, clock_overflows, clock_underflows;
+- u64 idle_clock;
+- unsigned int clock_deep_idle_events;
+- u64 tick_timestamp;
++ u64 clock;
+
+ atomic_t nr_iowait;
+
+@@ -631,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
+ #endif
+ }
+
+-#ifdef CONFIG_NO_HZ
+-static inline bool nohz_on(int cpu)
+-{
+- return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
+-}
+-
+-static inline u64 max_skipped_ticks(struct rq *rq)
+-{
+- return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
+-}
+-
+-static inline void update_last_tick_seen(struct rq *rq)
+-{
+- rq->last_tick_seen = jiffies;
+-}
+-#else
+-static inline u64 max_skipped_ticks(struct rq *rq)
+-{
+- return 1;
+-}
+-
+-static inline void update_last_tick_seen(struct rq *rq)
+-{
+-}
+-#endif
+-
+-/*
+- * Update the per-runqueue clock, as finegrained as the platform can give
+- * us, but without assuming monotonicity, etc.:
+- */
+-static void __update_rq_clock(struct rq *rq)
+-{
+- u64 prev_raw = rq->prev_clock_raw;
+- u64 now = sched_clock();
+- s64 delta = now - prev_raw;
+- u64 clock = rq->clock;
+-
+-#ifdef CONFIG_SCHED_DEBUG
+- WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+-#endif
+- /*
+- * Protect against sched_clock() occasionally going backwards:
+- */
+- if (unlikely(delta < 0)) {
+- clock++;
+- rq->clock_warps++;
+- } else {
+- /*
+- * Catch too large forward jumps too:
+- */
+- u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
+- u64 max_time = rq->tick_timestamp + max_jump;
+-
+- if (unlikely(clock + delta > max_time)) {
+- if (clock < max_time)
+- clock = max_time;
+- else
+- clock++;
+- rq->clock_overflows++;
+- } else {
+- if (unlikely(delta > rq->clock_max_delta))
+- rq->clock_max_delta = delta;
+- clock += delta;
+- }
+- }
+-
+- rq->prev_clock_raw = now;
+- rq->clock = clock;
+-}
+-
+-static void update_rq_clock(struct rq *rq)
+-{
+- if (likely(smp_processor_id() == cpu_of(rq)))
+- __update_rq_clock(rq);
+-}
+-
+ /*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+@@ -722,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
+ #define task_rq(p) cpu_rq(task_cpu(p))
+ #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+
++static inline void update_rq_clock(struct rq *rq)
++{
++ rq->clock = sched_clock_cpu(cpu_of(rq));
++}
++
+ /*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+@@ -757,14 +667,14 @@ const_debug unsigned int sysctl_sched_features =
+ #define SCHED_FEAT(name, enabled) \
+ #name ,
+
+-__read_mostly char *sched_feat_names[] = {
++static __read_mostly char *sched_feat_names[] = {
+ #include "sched_features.h"
+ NULL
+ };
+
+ #undef SCHED_FEAT
+
+-int sched_feat_open(struct inode *inode, struct file *filp)
++static int sched_feat_open(struct inode *inode, struct file *filp)
+ {
+ filp->private_data = inode->i_private;
+ return 0;
+@@ -899,7 +809,7 @@ static inline u64 global_rt_runtime(void)
+ return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
+ }
+
+-static const unsigned long long time_sync_thresh = 100000;
++unsigned long long time_sync_thresh = 100000;
+
+ static DEFINE_PER_CPU(unsigned long long, time_offset);
+ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
+@@ -913,11 +823,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
+ static DEFINE_SPINLOCK(time_sync_lock);
+ static unsigned long long prev_global_time;
+
+-static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
++static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
+ {
+- unsigned long flags;
+-
+- spin_lock_irqsave(&time_sync_lock, flags);
++ /*
++ * We want this inlined, to not get tracer function calls
++ * in this critical section:
++ */
++ spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
++ __raw_spin_lock(&time_sync_lock.raw_lock);
+
+ if (time < prev_global_time) {
+ per_cpu(time_offset, cpu) += prev_global_time - time;
+@@ -926,7 +839,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
+ prev_global_time = time;
+ }
+
+- spin_unlock_irqrestore(&time_sync_lock, flags);
++ __raw_spin_unlock(&time_sync_lock.raw_lock);
++ spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
+
+ return time;
+ }
+@@ -934,8 +848,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
+ static unsigned long long __cpu_clock(int cpu)
+ {
+ unsigned long long now;
+- unsigned long flags;
+- struct rq *rq;
+
+ /*
+ * Only call sched_clock() if the scheduler has already been
+@@ -944,11 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
+ if (unlikely(!scheduler_running))
+ return 0;
+
+- local_irq_save(flags);
+- rq = cpu_rq(cpu);
+- update_rq_clock(rq);
+- now = rq->clock;
+- local_irq_restore(flags);
++ now = sched_clock_cpu(cpu);
+
+ return now;
+ }
+@@ -960,13 +868,18 @@ static unsigned long long __cpu_clock(int cpu)
+ unsigned long long cpu_clock(int cpu)
+ {
+ unsigned long long prev_cpu_time, time, delta_time;
++ unsigned long flags;
+
++ local_irq_save(flags);
+ prev_cpu_time = per_cpu(prev_cpu_time, cpu);
+ time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
+ delta_time = time-prev_cpu_time;
+
+- if (unlikely(delta_time > time_sync_thresh))
++ if (unlikely(delta_time > time_sync_thresh)) {
+ time = __sync_cpu_clock(time, cpu);
++ per_cpu(prev_cpu_time, cpu) = time;
++ }
++ local_irq_restore(flags);
+
+ return time;
+ }
+@@ -1117,43 +1030,6 @@ static struct rq *this_rq_lock(void)
+ return rq;
+ }
+
+-/*
+- * We are going deep-idle (irqs are disabled):
+- */
+-void sched_clock_idle_sleep_event(void)
+-{
+- struct rq *rq = cpu_rq(smp_processor_id());
+-
+- spin_lock(&rq->lock);
+- __update_rq_clock(rq);
+- spin_unlock(&rq->lock);
+- rq->clock_deep_idle_events++;
+-}
+-EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+-
+-/*
+- * We just idled delta nanoseconds (called with irqs disabled):
+- */
+-void sched_clock_idle_wakeup_event(u64 delta_ns)
+-{
+- struct rq *rq = cpu_rq(smp_processor_id());
+- u64 now = sched_clock();
+-
+- rq->idle_clock += delta_ns;
+- /*
+- * Override the previous timestamp and ignore all
+- * sched_clock() deltas that occured while we idled,
+- * and use the PM-provided delta_ns to advance the
+- * rq clock:
+- */
+- spin_lock(&rq->lock);
+- rq->prev_clock_raw = now;
+- rq->clock += delta_ns;
+- spin_unlock(&rq->lock);
+- touch_softlockup_watchdog();
+-}
+-EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+-
+ static void __resched_task(struct task_struct *p, int tif_bit);
+
+ static inline void resched_task(struct task_struct *p)
+@@ -1189,6 +1065,7 @@ static inline void resched_rq(struct rq *rq)
+ enum {
+ HRTICK_SET, /* re-programm hrtick_timer */
+ HRTICK_RESET, /* not a new slice */
++ HRTICK_BLOCK, /* stop hrtick operations */
+ };
+
+ /*
+@@ -1200,6 +1077,8 @@ static inline int hrtick_enabled(struct rq *rq)
+ {
+ if (!sched_feat(HRTICK))
+ return 0;
++ if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
++ return 0;
+ return hrtimer_is_hres_active(&rq->hrtick_timer);
+ }
+
+@@ -1275,14 +1154,70 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+ spin_lock(&rq->lock);
+- __update_rq_clock(rq);
++ update_rq_clock(rq);
+ rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+ spin_unlock(&rq->lock);
+
+ return HRTIMER_NORESTART;
+ }
+
+-static inline void init_rq_hrtick(struct rq *rq)
++static void hotplug_hrtick_disable(int cpu)
++{
++ struct rq *rq = cpu_rq(cpu);
++ unsigned long flags;
++
++ spin_lock_irqsave(&rq->lock, flags);
++ rq->hrtick_flags = 0;
++ __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
++ spin_unlock_irqrestore(&rq->lock, flags);
++
++ hrtick_clear(rq);
++}
++
++static void hotplug_hrtick_enable(int cpu)
++{
++ struct rq *rq = cpu_rq(cpu);
++ unsigned long flags;
++
++ spin_lock_irqsave(&rq->lock, flags);
++ __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
++ spin_unlock_irqrestore(&rq->lock, flags);
++}
++
++static int
++hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
++{
++ int cpu = (int)(long)hcpu;
++
++ switch (action) {
++ case CPU_UP_CANCELED:
++ case CPU_UP_CANCELED_FROZEN:
++ case CPU_DOWN_PREPARE:
++ case CPU_DOWN_PREPARE_FROZEN:
++ case CPU_DEAD:
++ case CPU_DEAD_FROZEN:
++ hotplug_hrtick_disable(cpu);
++ return NOTIFY_OK;
++
++ case CPU_UP_PREPARE:
++ case CPU_UP_PREPARE_FROZEN:
++ case CPU_DOWN_FAILED:
++ case CPU_DOWN_FAILED_FROZEN:
++ case CPU_ONLINE:
++ case CPU_ONLINE_FROZEN:
++ hotplug_hrtick_enable(cpu);
++ return NOTIFY_OK;
++ }
++
++ return NOTIFY_DONE;
++}
++
++static void init_hrtick(void)
++{
++ hotcpu_notifier(hotplug_hrtick, 0);
++}
++
++static void init_rq_hrtick(struct rq *rq)
+ {
+ rq->hrtick_flags = 0;
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+@@ -1319,6 +1254,10 @@ static inline void init_rq_hrtick(struct rq *rq)
+ void hrtick_resched(void)
+ {
+ }
++
++static inline void init_hrtick(void)
++{
++}
+ #endif
+
+ /*
+@@ -1438,8 +1377,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
+ {
+ u64 tmp;
+
+- if (unlikely(!lw->inv_weight))
+- lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
++ if (!lw->inv_weight)
++ lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
+
+ tmp = (u64)delta_exec * weight;
+ /*
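
(The inv_weight change above precomputes a fixed-point reciprocal: delta * weight is later scaled by inv_weight and shifted right by 32, so inv_weight should approximate 2^32 / weight; the added 1 keeps the reciprocal non-zero even for the largest weights (cf. the new MAX_SHARES clamp later in this file). A worked check, treating the scale constant as 2^32 - 1 — what WMULT_CONST evaluates to on 32-bit builds, an assumption made only for this demo:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t scale = 0xffffffffULL;		/* assumed WMULT_CONST */
	uint64_t w = 1024;			/* NICE_0_LOAD-style weight */
	uint64_t inv = 1 + (scale - w / 2) / (w + 1);

	uint64_t delta = 1000000;		/* 1 ms in ns */
	uint64_t scaled = (delta * w * inv) >> 32;

	printf("inv_weight = %llu (ideal %llu)\n",
	       (unsigned long long)inv, (1ULL << 32) / w);
	printf("round trip: %llu -> %llu\n",	/* ~0.1%% low, by design cheap */
	       (unsigned long long)delta, (unsigned long long)scaled);
	return 0;
}
)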
+@@ -1748,6 +1687,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
++ else if (shares > MAX_SHARES)
++ shares = MAX_SHARES;
+
+ __set_se_shares(tg->se[tcpu], shares);
+ }
+@@ -4339,8 +4280,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
+ struct rq *rq = this_rq();
+ cputime64_t tmp;
+
+- if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
+- return account_guest_time(p, cputime);
++ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
++ account_guest_time(p, cputime);
++ return;
++ }
+
+ p->stime = cputime_add(p->stime, cputime);
+
+@@ -4404,19 +4347,11 @@ void scheduler_tick(void)
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *curr = rq->curr;
+- u64 next_tick = rq->tick_timestamp + TICK_NSEC;
++
++ sched_clock_tick();
+
+ spin_lock(&rq->lock);
+- __update_rq_clock(rq);
+- /*
+- * Let rq->clock advance by at least TICK_NSEC:
+- */
+- if (unlikely(rq->clock < next_tick)) {
+- rq->clock = next_tick;
+- rq->clock_underflows++;
+- }
+- rq->tick_timestamp = rq->clock;
+- update_last_tick_seen(rq);
++ update_rq_clock(rq);
+ update_cpu_load(rq);
+ curr->sched_class->task_tick(rq, curr, 0);
+ spin_unlock(&rq->lock);
+@@ -4570,7 +4505,7 @@ need_resched_nonpreemptible:
+ * Do the rq-clock update outside the rq lock:
+ */
+ local_irq_disable();
+- __update_rq_clock(rq);
++ update_rq_clock(rq);
+ spin_lock(&rq->lock);
+ clear_tsk_need_resched(prev);
+
+@@ -4595,9 +4530,9 @@ need_resched_nonpreemptible:
+ prev->sched_class->put_prev_task(rq, prev);
+ next = pick_next_task(rq, prev);
+
+- sched_info_switch(prev, next);
+-
+ if (likely(prev != next)) {
++ sched_info_switch(prev, next);
++
+ rq->nr_switches++;
+ rq->curr = next;
+ ++*switch_count;
+@@ -7755,7 +7690,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ {
+ int i, j;
+
+- lock_doms_cur();
++ mutex_lock(&sched_domains_mutex);
+
+ /* always unregister in case we don't destroy any domains */
+ unregister_sched_domain_sysctl();
+@@ -7804,7 +7739,7 @@ match2:
+
+ register_sched_domain_sysctl();
+
+- unlock_doms_cur();
++ mutex_unlock(&sched_domains_mutex);
+ }
+
+ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+@@ -7813,8 +7748,10 @@ int arch_reinit_sched_domains(void)
+ int err;
+
+ get_online_cpus();
++ mutex_lock(&sched_domains_mutex);
+ detach_destroy_domains(&cpu_online_map);
+ err = arch_init_sched_domains(&cpu_online_map);
++ mutex_unlock(&sched_domains_mutex);
+ put_online_cpus();
+
+ return err;
+@@ -7932,13 +7869,16 @@ void __init sched_init_smp(void)
+ BUG_ON(sched_group_nodes_bycpu == NULL);
+ #endif
+ get_online_cpus();
++ mutex_lock(&sched_domains_mutex);
+ arch_init_sched_domains(&cpu_online_map);
+ cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
+ if (cpus_empty(non_isolated_cpus))
+ cpu_set(smp_processor_id(), non_isolated_cpus);
++ mutex_unlock(&sched_domains_mutex);
+ put_online_cpus();
+ /* XXX: Theoretical race here - CPU may be hotplugged now */
+ hotcpu_notifier(update_sched_domains, 0);
++ init_hrtick();
+
+ /* Move init over to a non-isolated CPU */
+ if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+@@ -8025,7 +7965,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+
+ se->my_q = cfs_rq;
+ se->load.weight = tg->shares;
+- se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight);
++ se->load.inv_weight = 0;
+ se->parent = parent;
+ }
+ #endif
+@@ -8149,8 +8089,6 @@ void __init sched_init(void)
+ spin_lock_init(&rq->lock);
+ lockdep_set_class(&rq->lock, &rq->rq_lock_key);
+ rq->nr_running = 0;
+- rq->clock = 1;
+- update_last_tick_seen(rq);
+ init_cfs_rq(&rq->cfs, rq);
+ init_rt_rq(&rq->rt, rq);
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+@@ -8294,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
+ static void normalize_task(struct rq *rq, struct task_struct *p)
+ {
+ int on_rq;
++
+ update_rq_clock(rq);
+ on_rq = p->se.on_rq;
+ if (on_rq)
+@@ -8325,7 +8264,6 @@ void normalize_rt_tasks(void)
+ p->se.sleep_start = 0;
+ p->se.block_start = 0;
+ #endif
+- task_rq(p)->clock = 0;
+
+ if (!rt_task(p)) {
+ /*
+@@ -8692,7 +8630,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+ dequeue_entity(cfs_rq, se, 0);
+
+ se->load.weight = shares;
+- se->load.inv_weight = div64_u64((1ULL<<32), shares);
++ se->load.inv_weight = 0;
+
+ if (on_rq)
+ enqueue_entity(cfs_rq, se, 0);
+@@ -8722,13 +8660,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+ if (!tg->se[0])
+ return -EINVAL;
+
+- /*
+- * A weight of 0 or 1 can cause arithmetics problems.
+- * (The default weight is 1024 - so there's no practical
+- * limitation from this.)
+- */
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
++ else if (shares > MAX_SHARES)
++ shares = MAX_SHARES;
+
+ mutex_lock(&shares_mutex);
+ if (tg->shares == shares)
+@@ -8753,7 +8688,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+ * force a rebalance
+ */
+ cfs_rq_set_shares(tg->cfs_rq[i], 0);
+- set_se_shares(tg->se[i], shares/nr_cpu_ids);
++ set_se_shares(tg->se[i], shares);
+ }
+
+ /*
+diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
+new file mode 100644
+index 0000000..9c597e3
+--- /dev/null
++++ b/kernel/sched_clock.c
+@@ -0,0 +1,236 @@
++/*
++ * sched_clock for unstable cpu clocks
++ *
++ * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr at redhat.com>
++ *
++ * Based on code by:
++ * Ingo Molnar <mingo at redhat.com>
++ * Guillaume Chazarain <guichaz at gmail.com>
++ *
++ * Create a semi stable clock from a mixture of other events, including:
++ * - gtod
++ * - jiffies
++ * - sched_clock()
++ * - explicit idle events
++ *
++ * We use gtod as base and the unstable clock deltas. The deltas are filtered,
++ * making it monotonic and keeping it within an expected window. This window
++ * is set up using jiffies.
++ *
++ * Furthermore, explicit sleep and wakeup hooks allow us to account for time
++ * that is otherwise invisible (TSC gets stopped).
++ *
++ * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
++ * consistent between cpus (never more than 1 jiffies difference).
++ */
++#include <linux/sched.h>
++#include <linux/percpu.h>
++#include <linux/spinlock.h>
++#include <linux/ktime.h>
++#include <linux/module.h>
++
++
++#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
++
++struct sched_clock_data {
++ /*
++ * Raw spinlock - this is a special case: this might be called
++ * from within instrumentation code so we don't want to do any
++ * instrumentation ourselves.
++ */
++ raw_spinlock_t lock;
++
++ unsigned long prev_jiffies;
++ u64 prev_raw;
++ u64 tick_raw;
++ u64 tick_gtod;
++ u64 clock;
++};
++
++static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
++
++static inline struct sched_clock_data *this_scd(void)
++{
++ return &__get_cpu_var(sched_clock_data);
++}
++
++static inline struct sched_clock_data *cpu_sdc(int cpu)
++{
++ return &per_cpu(sched_clock_data, cpu);
++}
++
++void sched_clock_init(void)
++{
++ u64 ktime_now = ktime_to_ns(ktime_get());
++ u64 now = 0;
++ int cpu;
++
++ for_each_possible_cpu(cpu) {
++ struct sched_clock_data *scd = cpu_sdc(cpu);
++
++ scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
++ scd->prev_jiffies = jiffies;
++ scd->prev_raw = now;
++ scd->tick_raw = now;
++ scd->tick_gtod = ktime_now;
++ scd->clock = ktime_now;
++ }
++}
++
++/*
++ * update the percpu scd from the raw @now value
++ *
++ * - filter out backward motion
++ * - use jiffies to generate a min,max window to clip the raw values
++ */
++static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
++{
++ unsigned long now_jiffies = jiffies;
++ long delta_jiffies = now_jiffies - scd->prev_jiffies;
++ u64 clock = scd->clock;
++ u64 min_clock, max_clock;
++ s64 delta = now - scd->prev_raw;
++
++ WARN_ON_ONCE(!irqs_disabled());
++ min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
++
++ if (unlikely(delta < 0)) {
++ clock++;
++ goto out;
++ }
++
++ max_clock = min_clock + TICK_NSEC;
++
++ if (unlikely(clock + delta > max_clock)) {
++ if (clock < max_clock)
++ clock = max_clock;
++ else
++ clock++;
++ } else {
++ clock += delta;
++ }
++
++ out:
++ if (unlikely(clock < min_clock))
++ clock = min_clock;
++
++ scd->prev_raw = now;
++ scd->prev_jiffies = now_jiffies;
++ scd->clock = clock;
++}
++
++static void lock_double_clock(struct sched_clock_data *data1,
++ struct sched_clock_data *data2)
++{
++ if (data1 < data2) {
++ __raw_spin_lock(&data1->lock);
++ __raw_spin_lock(&data2->lock);
++ } else {
++ __raw_spin_lock(&data2->lock);
++ __raw_spin_lock(&data1->lock);
++ }
++}
++
++u64 sched_clock_cpu(int cpu)
++{
++ struct sched_clock_data *scd = cpu_sdc(cpu);
++ u64 now, clock;
++
++ WARN_ON_ONCE(!irqs_disabled());
++ now = sched_clock();
++
++ if (cpu != raw_smp_processor_id()) {
++ /*
++ * in order to update a remote cpu's clock based on our
++ * unstable raw time, rebase it against:
++ * tick_raw (offset between raw counters)
++ * tick_gtod (tick offset between cpus)
++ */
++ struct sched_clock_data *my_scd = this_scd();
++
++ lock_double_clock(scd, my_scd);
++
++ now -= my_scd->tick_raw;
++ now += scd->tick_raw;
++
++ now -= my_scd->tick_gtod;
++ now += scd->tick_gtod;
++
++ __raw_spin_unlock(&my_scd->lock);
++ } else {
++ __raw_spin_lock(&scd->lock);
++ }
++
++ __update_sched_clock(scd, now);
++ clock = scd->clock;
++
++ __raw_spin_unlock(&scd->lock);
++
++ return clock;
++}
++
++void sched_clock_tick(void)
++{
++ struct sched_clock_data *scd = this_scd();
++ u64 now, now_gtod;
++
++ WARN_ON_ONCE(!irqs_disabled());
++
++ now = sched_clock();
++ now_gtod = ktime_to_ns(ktime_get());
++
++ __raw_spin_lock(&scd->lock);
++ __update_sched_clock(scd, now);
++ /*
++ * update tick_gtod after __update_sched_clock() because that will
++ * already observe 1 new jiffy; adding a new tick_gtod to that would
++ * increase the clock 2 jiffies.
++ */
++ scd->tick_raw = now;
++ scd->tick_gtod = now_gtod;
++ __raw_spin_unlock(&scd->lock);
++}
++
++/*
++ * We are going deep-idle (irqs are disabled):
++ */
++void sched_clock_idle_sleep_event(void)
++{
++ sched_clock_cpu(smp_processor_id());
++}
++EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
++
++/*
++ * We just idled delta nanoseconds (called with irqs disabled):
++ */
++void sched_clock_idle_wakeup_event(u64 delta_ns)
++{
++ struct sched_clock_data *scd = this_scd();
++ u64 now = sched_clock();
++
++ /*
++ * Override the previous timestamp and ignore all
++ * sched_clock() deltas that occurred while we idled,
++ * and use the PM-provided delta_ns to advance the
++ * rq clock:
++ */
++ __raw_spin_lock(&scd->lock);
++ scd->prev_raw = now;
++ scd->clock += delta_ns;
++ __raw_spin_unlock(&scd->lock);
++
++ touch_softlockup_watchdog();
++}
++EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
++
++#endif
++
++/*
++ * Scheduler clock - returns current time in nanosec units.
++ * This is the default implementation.
++ * Architectures and sub-architectures can override this.
++ */
++unsigned long long __attribute__((weak)) sched_clock(void)
++{
++ return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
++}
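
(The heart of the new file is __update_sched_clock(): each tick, jiffies and gtod define a [min_clock, max_clock] window, and the raw sched_clock() delta is clipped into it, which both filters backward motion and caps forward jumps. The same filter, lifted into a stand-alone harness with made-up numbers and the delta_jiffies term simplified to a one-jiffy window:

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL		/* pretend HZ=1000 */

static uint64_t clock_val, prev_raw, tick_gtod;

/* mirror of __update_sched_clock()'s clipping logic */
static void update(uint64_t now)
{
	int64_t delta = (int64_t)(now - prev_raw);
	uint64_t min_clock = tick_gtod;		/* delta_jiffies == 0 here */
	uint64_t max_clock = min_clock + TICK_NSEC;

	if (delta < 0)
		clock_val++;			/* backward: creep forward */
	else if (clock_val + delta > max_clock)
		clock_val = clock_val < max_clock ? max_clock : clock_val + 1;
	else
		clock_val += delta;

	if (clock_val < min_clock)
		clock_val = min_clock;
	prev_raw = now;
}

int main(void)
{
	tick_gtod = clock_val = 5 * TICK_NSEC;
	prev_raw = 100;

	update(90);			/* raw clock went backwards: only +1 */
	printf("after backward jump: %llu\n", (unsigned long long)clock_val);

	update(90 + 10 * TICK_NSEC);	/* huge forward jump: clipped to window */
	printf("after forward jump:  %llu\n", (unsigned long long)clock_val);
	return 0;
}
)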
+diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
+index 6b4a125..5f06118 100644
+--- a/kernel/sched_debug.c
++++ b/kernel/sched_debug.c
+@@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu)
+ PN(next_balance);
+ P(curr->pid);
+ PN(clock);
+- PN(idle_clock);
+- PN(prev_clock_raw);
+- P(clock_warps);
+- P(clock_overflows);
+- P(clock_underflows);
+- P(clock_deep_idle_events);
+- PN(clock_max_delta);
+ P(cpu_load[0]);
+ P(cpu_load[1]);
+ P(cpu_load[2]);
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 89fa32b..c863663 100644
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
++ account_entity_enqueue(cfs_rq, se);
+
+ if (wakeup) {
+ place_entity(cfs_rq, se, 0);
+@@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
+ check_spread(cfs_rq, se);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+- account_entity_enqueue(cfs_rq, se);
+ }
+
+ static void update_avg(u64 *avg, u64 sample)
+@@ -841,8 +841,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
+ * queued ticks are scheduled to match the slice, so don't bother
+ * validating it and just reschedule.
+ */
+- if (queued)
+- return resched_task(rq_of(cfs_rq)->curr);
++ if (queued) {
++ resched_task(rq_of(cfs_rq)->curr);
++ return;
++ }
+ /*
+ * don't let the period tick interfere with the hrtick preemption
+ */
+@@ -957,7 +959,7 @@ static void yield_task_fair(struct rq *rq)
+ return;
+
+ if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
+- __update_rq_clock(rq);
++ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+@@ -1007,7 +1009,7 @@ static int wake_idle(int cpu, struct task_struct *p)
+ * sibling runqueue info. This will avoid the checks and cache miss
+ * penalities associated with that.
+ */
+- if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
++ if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
+ return cpu;
+
+ for_each_domain(cpu, sd) {
+@@ -1611,30 +1613,6 @@ static const struct sched_class fair_sched_class = {
+ };
+
+ #ifdef CONFIG_SCHED_DEBUG
+-static void
+-print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
+-{
+- struct sched_entity *se;
+-
+- if (!cfs_rq)
+- return;
+-
+- list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
+- int i;
+-
+- for (i = depth; i; i--)
+- seq_puts(m, " ");
+-
+- seq_printf(m, "%lu %s %lu\n",
+- se->load.weight,
+- entity_is_task(se) ? "T" : "G",
+- calc_delta_weight(SCHED_LOAD_SCALE, se)
+- );
+- if (!entity_is_task(se))
+- print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
+- }
+-}
+-
+ static void print_cfs_stats(struct seq_file *m, int cpu)
+ {
+ struct cfs_rq *cfs_rq;
+@@ -1642,9 +1620,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
+ rcu_read_lock();
+ for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ print_cfs_rq(m, cpu, cfs_rq);
+-
+- seq_printf(m, "\nWeight tree:\n");
+- print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
+ rcu_read_unlock();
+ }
+ #endif
+diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
+index 2bcafa3..3a4f92d 100644
+--- a/kernel/sched_idletask.c
++++ b/kernel/sched_idletask.c
+@@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
+ /*
+ * Simple, special scheduling class for the per-CPU idle tasks:
+ */
+-const struct sched_class idle_sched_class = {
++static const struct sched_class idle_sched_class = {
+ /* .next is NULL */
+ /* no enqueue/yield_task for idle tasks */
+
+diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
+index c2730a5..060e87b 100644
+--- a/kernel/sched_rt.c
++++ b/kernel/sched_rt.c
+@@ -1098,11 +1098,14 @@ static void post_schedule_rt(struct rq *rq)
+ }
+ }
+
+-
++/*
++ * If we are not running and we are not going to reschedule soon, we should
++ * try to push tasks away now
++ */
+ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
+ {
+ if (!task_running(rq, p) &&
+- (p->prio >= rq->rt.highest_prio) &&
++ !test_tsk_need_resched(rq->curr) &&
+ rq->rt.overloaded)
+ push_rt_tasks(rq);
+ }
+@@ -1309,7 +1312,7 @@ static void set_curr_task_rt(struct rq *rq)
+ p->se.exec_start = rq->clock;
+ }
+
+-const struct sched_class rt_sched_class = {
++static const struct sched_class rt_sched_class = {
+ .next = &fair_sched_class,
+ .enqueue_task = enqueue_task_rt,
+ .dequeue_task = dequeue_task_rt,
+diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
+index 73961f3..dadde53 100644
+--- a/kernel/time/clocksource.c
++++ b/kernel/time/clocksource.c
+@@ -471,10 +471,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
+ /*
+ * Sysfs setup bits:
+ */
+-static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
++static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
+ sysfs_override_clocksource);
+
+-static SYSDEV_ATTR(available_clocksource, 0600,
++static SYSDEV_ATTR(available_clocksource, 0444,
+ sysfs_show_available_clocksources, NULL);
+
+ static struct sysdev_class clocksource_sysclass = {
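The mode change above (0600 to 0644/0444) makes both clocksource attributes world-readable while keeping current_clocksource root-writable. A hedged userspace sketch of what the relaxed permissions allow; the sysfs path is the conventional one for the first clocksource device:

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/devices/system/clocksource/"
			"clocksource0/current_clocksource", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("current clocksource: %s", buf);
	if (f)
		fclose(f);
	return 0;
}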
+diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb
+index f2e01ac..a5d4b1d 100644
+--- a/lib/Kconfig.kgdb
++++ b/lib/Kconfig.kgdb
+@@ -1,4 +1,10 @@
+
++config HAVE_ARCH_KGDB_SHADOW_INFO
++ bool
++
++config HAVE_ARCH_KGDB
++ bool
++
+ menuconfig KGDB
+ bool "KGDB: kernel debugging with remote gdb"
+ select FRAME_POINTER
+@@ -10,15 +16,10 @@ menuconfig KGDB
+ at http://kgdb.sourceforge.net as well as in DocBook form
+ in Documentation/DocBook/. If unsure, say N.
+
+-config HAVE_ARCH_KGDB_SHADOW_INFO
+- bool
+-
+-config HAVE_ARCH_KGDB
+- bool
++if KGDB
+
+ config KGDB_SERIAL_CONSOLE
+ tristate "KGDB: use kgdb over the serial console"
+- depends on KGDB
+ select CONSOLE_POLL
+ select MAGIC_SYSRQ
+ default y
+@@ -28,7 +29,6 @@ config KGDB_SERIAL_CONSOLE
+
+ config KGDB_TESTS
+ bool "KGDB: internal test suite"
+- depends on KGDB
+ default n
+ help
+ This is a kgdb I/O module specifically designed to test
+@@ -56,3 +56,5 @@ config KGDB_TESTS_BOOT_STRING
+ boot. See the drivers/misc/kgdbts.c for detailed
+ information about other strings you could use beyond the
+ default of V1F100.
++
++endif # KGDB
+diff --git a/lib/devres.c b/lib/devres.c
+index 26c87c4..72c8909 100644
+--- a/lib/devres.c
++++ b/lib/devres.c
+@@ -2,7 +2,7 @@
+ #include <linux/io.h>
+ #include <linux/module.h>
+
+-static void devm_ioremap_release(struct device *dev, void *res)
++void devm_ioremap_release(struct device *dev, void *res)
+ {
+ iounmap(*(void __iomem **)res);
+ }
+diff --git a/mm/memory.c b/mm/memory.c
+index bbab1e3..48c122d 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -969,7 +969,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ goto no_page_table;
+
+ pmd = pmd_offset(pud, address);
+- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
++ if (pmd_none(*pmd))
+ goto no_page_table;
+
+ if (pmd_huge(*pmd)) {
+@@ -978,6 +978,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ goto out;
+ }
+
++ if (unlikely(pmd_bad(*pmd)))
++ goto no_page_table;
++
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!ptep)
+ goto out;
+diff --git a/net/atm/br2684.c b/net/atm/br2684.c
+index 1b22806..9d52ebf 100644
+--- a/net/atm/br2684.c
++++ b/net/atm/br2684.c
+@@ -346,9 +346,9 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
+ /* skb==NULL means VCC is being destroyed */
+ br2684_close_vcc(brvcc);
+ if (list_empty(&brdev->brvccs)) {
+- read_lock(&devs_lock);
++ write_lock_irq(&devs_lock);
+ list_del(&brdev->br2684_devs);
+- read_unlock(&devs_lock);
++ write_unlock_irq(&devs_lock);
+ unregister_netdev(net_dev);
+ free_netdev(net_dev);
+ }
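The br2684 hunk is a lock-type fix: list_del() modifies the device list, so taking only the read side of devs_lock was wrong. A minimal sketch of the rule being enforced — demo_lock and demo_del() are illustrative names, not kernel APIs:

#include <linux/list.h>
#include <linux/spinlock.h>

static DEFINE_RWLOCK(demo_lock);

/* Readers may hold read_lock() concurrently, but any list mutation
 * needs the exclusive write side, as the fix above switches to. */
static void demo_del(struct list_head *entry)
{
	write_lock_irq(&demo_lock);
	list_del(entry);
	write_unlock_irq(&demo_lock);
}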
+diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
+index 77a981a..c2397f5 100644
+--- a/net/bridge/br_if.c
++++ b/net/bridge/br_if.c
+@@ -273,15 +273,13 @@ int br_add_bridge(const char *name)
+ rtnl_lock();
+ if (strchr(dev->name, '%')) {
+ ret = dev_alloc_name(dev, dev->name);
+- if (ret < 0) {
+- free_netdev(dev);
+- goto out;
+- }
++ if (ret < 0)
++ goto out_free;
+ }
+
+ ret = register_netdevice(dev);
+ if (ret)
+- goto out;
++ goto out_free;
+
+ ret = br_sysfs_addbr(dev);
+ if (ret)
+@@ -289,6 +287,10 @@ int br_add_bridge(const char *name)
+ out:
+ rtnl_unlock();
+ return ret;
++
++out_free:
++ free_netdev(dev);
++ goto out;
+ }
+
+ int br_del_bridge(const char *name)
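br_add_bridge() previously freed the netdev in one error branch and leaked it in others; the new out_free label gives every post-allocation failure a single cleanup path. A hedged sketch of the resulting shape, with demo_register() standing in for the real function:

/* Illustrative shape only: all failures after allocation funnel
 * through out_free, so free_netdev() appears exactly once. */
static int demo_register(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	if (ret)
		goto out_free;
out:
	rtnl_unlock();
	return ret;

out_free:
	free_netdev(dev);
	goto out;
}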
+diff --git a/net/can/bcm.c b/net/can/bcm.c
+index 74fd2d3..d9a3a9d 100644
+--- a/net/can/bcm.c
++++ b/net/can/bcm.c
+@@ -412,12 +412,6 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data)
+ bcm_send_to_user(op, &head, data, 1);
+ }
+
+-/* TODO: move to linux/hrtimer.h */
+-static inline int hrtimer_callback_running(struct hrtimer *timer)
+-{
+- return timer->state & HRTIMER_STATE_CALLBACK;
+-}
+-
+ /*
+ * bcm_rx_update_and_send - process a detected relevant receive content change
+ * 1. update the last received data
+diff --git a/net/core/skbuff.c b/net/core/skbuff.c
+index 4fe605f..5c459f2 100644
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -200,7 +200,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ goto nodata;
+
+ /*
+- * See comment in sk_buff definition, just before the 'tail' member
++ * Only clear those fields we need to clear, not those that we will
++ * actually initialise below. Hence, don't put any more fields after
++ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->truesize = size + sizeof(struct sk_buff);
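The rewritten comment in __alloc_skb() spells out the invariant: fields laid out before 'tail' are zeroed in one memset, and nothing may be added after 'tail' unless it is initialised by hand. A standalone illustration of the offsetof() idiom — struct demo is invented for the example:

#include <stddef.h>
#include <string.h>

struct demo {
	int a;			/* zeroed by the bulk memset */
	int b;			/* zeroed by the bulk memset */
	char *tail;		/* boundary: this and later fields are not */
	unsigned int truesize;	/* must be set up explicitly */
};

static void demo_init(struct demo *d)
{
	memset(d, 0, offsetof(struct demo, tail));
	d->tail = NULL;
	d->truesize = sizeof(*d);
}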
+diff --git a/net/dccp/feat.c b/net/dccp/feat.c
+index 4a4f6ce..933a0ec 100644
+--- a/net/dccp/feat.c
++++ b/net/dccp/feat.c
+@@ -32,7 +32,7 @@ int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
+
+ if (len > 3) {
+ DCCP_WARN("invalid length %d\n", len);
+- return 1;
++ return -EINVAL;
+ }
+ /* XXX add further sanity checks */
+
+diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
+index 2f665a5..f50e88b 100644
+--- a/net/decnet/dn_route.c
++++ b/net/decnet/dn_route.c
+@@ -235,14 +235,14 @@ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
+ else
+ min_mtu -= 21;
+
+- if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= min_mtu) {
++ if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= min_mtu) {
+ if (!(dst_metric_locked(dst, RTAX_MTU))) {
+ dst->metrics[RTAX_MTU-1] = mtu;
+ dst_set_expires(dst, dn_rt_mtu_expires);
+ }
+ if (!(dst_metric_locked(dst, RTAX_ADVMSS))) {
+ u32 mss = mtu - DN_MAX_NSP_DATA_HEADER;
+- if (dst->metrics[RTAX_ADVMSS-1] > mss)
++ if (dst_metric(dst, RTAX_ADVMSS) > mss)
+ dst->metrics[RTAX_ADVMSS-1] = mss;
+ }
+ }
+@@ -805,12 +805,12 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
+ rt->u.dst.neighbour = n;
+ }
+
+- if (rt->u.dst.metrics[RTAX_MTU-1] == 0 ||
+- rt->u.dst.metrics[RTAX_MTU-1] > rt->u.dst.dev->mtu)
++ if (dst_metric(&rt->u.dst, RTAX_MTU) == 0 ||
++ dst_metric(&rt->u.dst, RTAX_MTU) > rt->u.dst.dev->mtu)
+ rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
+ mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->u.dst));
+- if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0 ||
+- rt->u.dst.metrics[RTAX_ADVMSS-1] > mss)
++ if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0 ||
++ dst_metric(&rt->u.dst, RTAX_ADVMSS) > mss)
+ rt->u.dst.metrics[RTAX_ADVMSS-1] = mss;
+ return 0;
+ }
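This hunk (and the ipv4, tcp and ipv6 ones that follow) converts open-coded dst->metrics[RTAX_FOO-1] reads to the dst_metric() accessor, which hides the off-by-one between the 1-based RTAX_* constants and the 0-based array. The helper reads roughly as below — a simplified sketch, the real inline lives in include/net/dst.h:

static inline u32 dst_metric(const struct dst_entry *dst, int metric)
{
	/* RTAX_* enumerators start at 1, metrics[] at index 0 */
	return dst->metrics[metric - 1];
}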
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 5e3685c..92f90ae 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -1468,14 +1468,14 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
+
+ /* BSD 4.2 compatibility hack :-( */
+ if (mtu == 0 &&
+- old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
++ old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
+ old_mtu >= 68 + (iph->ihl << 2))
+ old_mtu -= iph->ihl << 2;
+
+ mtu = guess_mtu(old_mtu);
+ }
+- if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
+- if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
++ if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
++ if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
+ dst_confirm(&rth->u.dst);
+ if (mtu < ip_rt_min_pmtu) {
+ mtu = ip_rt_min_pmtu;
+@@ -1497,7 +1497,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
+
+ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+ {
+- if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
++ if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
+ !(dst_metric_locked(dst, RTAX_MTU))) {
+ if (mtu < ip_rt_min_pmtu) {
+ mtu = ip_rt_min_pmtu;
+@@ -1613,7 +1613,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
+ sizeof(rt->u.dst.metrics));
+ if (fi->fib_mtu == 0) {
+ rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
+- if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
++ if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
+ rt->rt_gateway != rt->rt_dst &&
+ rt->u.dst.dev->mtu > 576)
+ rt->u.dst.metrics[RTAX_MTU-1] = 576;
+@@ -1624,14 +1624,14 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
+ } else
+ rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
+
+- if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
++ if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
+ rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
+- if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
++ if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
+ rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
+- if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
++ if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
+ rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
+ ip_rt_min_advmss);
+- if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
++ if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
+ rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+
+ #ifdef CONFIG_NET_CLS_ROUTE
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index eda4f4a..8ac15a6 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -66,6 +66,7 @@
+ #include <linux/mm.h>
+ #include <linux/module.h>
+ #include <linux/sysctl.h>
++#include <net/dst.h>
+ #include <net/tcp.h>
+ #include <net/inet_common.h>
+ #include <linux/ipsec.h>
+@@ -605,7 +606,7 @@ static u32 tcp_rto_min(struct sock *sk)
+ u32 rto_min = TCP_RTO_MIN;
+
+ if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
+- rto_min = dst->metrics[RTAX_RTO_MIN - 1];
++ rto_min = dst_metric(dst, RTAX_RTO_MIN);
+ return rto_min;
+ }
+
+@@ -769,7 +770,7 @@ void tcp_update_metrics(struct sock *sk)
+ dst->metrics[RTAX_RTTVAR - 1] = m;
+ else
+ dst->metrics[RTAX_RTTVAR-1] -=
+- (dst->metrics[RTAX_RTTVAR-1] - m)>>2;
++ (dst_metric(dst, RTAX_RTTVAR) - m)>>2;
+ }
+
+ if (tp->snd_ssthresh >= 0xFFFF) {
+@@ -788,21 +789,21 @@ void tcp_update_metrics(struct sock *sk)
+ dst->metrics[RTAX_SSTHRESH-1] =
+ max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
+ if (!dst_metric_locked(dst, RTAX_CWND))
+- dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;
++ dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1;
+ } else {
+ /* Else slow start did not finish, cwnd is non-sense,
+ ssthresh may be also invalid.
+ */
+ if (!dst_metric_locked(dst, RTAX_CWND))
+- dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;
+- if (dst->metrics[RTAX_SSTHRESH-1] &&
++ dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1;
++ if (dst_metric(dst, RTAX_SSTHRESH) &&
+ !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+- tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])
++ tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
+ dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
+ }
+
+ if (!dst_metric_locked(dst, RTAX_REORDERING)) {
+- if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&
++ if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
+ tp->reordering != sysctl_tcp_reordering)
+ dst->metrics[RTAX_REORDERING-1] = tp->reordering;
+ }
+diff --git a/net/ipv6/route.c b/net/ipv6/route.c
+index a493ad9..12bba08 100644
+--- a/net/ipv6/route.c
++++ b/net/ipv6/route.c
+@@ -1243,11 +1243,11 @@ install_route:
+ }
+ }
+
+- if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
++ if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
+ rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
+- if (!rt->u.dst.metrics[RTAX_MTU-1])
++ if (!dst_metric(&rt->u.dst, RTAX_MTU))
+ rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
+- if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
++ if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
+ rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
+ rt->u.dst.dev = dev;
+ rt->rt6i_idev = idev;
+diff --git a/net/mac80211/main.c b/net/mac80211/main.c
+index 9ad4e36..915afad 100644
+--- a/net/mac80211/main.c
++++ b/net/mac80211/main.c
+@@ -1766,6 +1766,7 @@ fail_wep:
+ fail_rate:
+ ieee80211_debugfs_remove_netdev(IEEE80211_DEV_TO_SUB_IF(local->mdev));
+ unregister_netdevice(local->mdev);
++ local->mdev = NULL;
+ fail_dev:
+ rtnl_unlock();
+ sta_info_stop(local);
+@@ -1773,8 +1774,10 @@ fail_sta_info:
+ debugfs_hw_del(local);
+ destroy_workqueue(local->hw.workqueue);
+ fail_workqueue:
+- ieee80211_if_free(local->mdev);
+- local->mdev = NULL;
++ if (local->mdev != NULL) {
++ ieee80211_if_free(local->mdev);
++ local->mdev = NULL;
++ }
+ fail_mdev_alloc:
+ wiphy_unregister(local->hw.wiphy);
+ return result;
+diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
+index 64b2d13..1d421d0 100644
+--- a/net/sched/act_simple.c
++++ b/net/sched/act_simple.c
+@@ -6,7 +6,7 @@
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+- * Authors: Jamal Hadi Salim (2005)
++ * Authors: Jamal Hadi Salim (2005-8)
+ *
+ */
+
+@@ -34,6 +34,7 @@ static struct tcf_hashinfo simp_hash_info = {
+ .lock = &simp_lock,
+ };
+
++#define SIMP_MAX_DATA 32
+ static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
+ {
+ struct tcf_defact *d = a->priv;
+@@ -69,23 +70,28 @@ static int tcf_simp_release(struct tcf_defact *d, int bind)
+ return ret;
+ }
+
+-static int alloc_defdata(struct tcf_defact *d, u32 datalen, void *defdata)
++static int alloc_defdata(struct tcf_defact *d, char *defdata)
+ {
+- d->tcfd_defdata = kmemdup(defdata, datalen, GFP_KERNEL);
++ d->tcfd_defdata = kstrndup(defdata, SIMP_MAX_DATA, GFP_KERNEL);
+ if (unlikely(!d->tcfd_defdata))
+ return -ENOMEM;
+- d->tcfd_datalen = datalen;
++
+ return 0;
+ }
+
+-static int realloc_defdata(struct tcf_defact *d, u32 datalen, void *defdata)
++static void reset_policy(struct tcf_defact *d, char *defdata,
++ struct tc_defact *p)
+ {
+- kfree(d->tcfd_defdata);
+- return alloc_defdata(d, datalen, defdata);
++ spin_lock_bh(&d->tcf_lock);
++ d->tcf_action = p->action;
++ memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
++ strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
++ spin_unlock_bh(&d->tcf_lock);
+ }
+
+ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
+ [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) },
++ [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA },
+ };
+
+ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
+@@ -95,28 +101,24 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
+ struct tc_defact *parm;
+ struct tcf_defact *d;
+ struct tcf_common *pc;
+- void *defdata;
+- u32 datalen = 0;
++ char *defdata;
+ int ret = 0, err;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+- err = nla_parse_nested(tb, TCA_DEF_MAX, nla, NULL);
++ err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_DEF_PARMS] == NULL)
+ return -EINVAL;
+
+- parm = nla_data(tb[TCA_DEF_PARMS]);
+- defdata = nla_data(tb[TCA_DEF_DATA]);
+- if (defdata == NULL)
++ if (tb[TCA_DEF_DATA] == NULL)
+ return -EINVAL;
+
+- datalen = nla_len(tb[TCA_DEF_DATA]);
+- if (datalen == 0)
+- return -EINVAL;
++ parm = nla_data(tb[TCA_DEF_PARMS]);
++ defdata = nla_data(tb[TCA_DEF_DATA]);
+
+ pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info);
+ if (!pc) {
+@@ -126,11 +128,12 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
+ return -ENOMEM;
+
+ d = to_defact(pc);
+- ret = alloc_defdata(d, datalen, defdata);
++ ret = alloc_defdata(d, defdata);
+ if (ret < 0) {
+ kfree(pc);
+ return ret;
+ }
++ d->tcf_action = parm->action;
+ ret = ACT_P_CREATED;
+ } else {
+ d = to_defact(pc);
+@@ -138,13 +141,9 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
+ tcf_simp_release(d, bind);
+ return -EEXIST;
+ }
+- realloc_defdata(d, datalen, defdata);
++ reset_policy(d, defdata, parm);
+ }
+
+- spin_lock_bh(&d->tcf_lock);
+- d->tcf_action = parm->action;
+- spin_unlock_bh(&d->tcf_lock);
+-
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(pc, &simp_hash_info);
+ return ret;
+@@ -172,7 +171,7 @@ static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
+ opt.bindcnt = d->tcf_bindcnt - bind;
+ opt.action = d->tcf_action;
+ NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt);
+- NLA_PUT(skb, TCA_DEF_DATA, d->tcfd_datalen, d->tcfd_defdata);
++ NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata);
+ t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
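The act_simple rewrite stops trusting the netlink-supplied length: TCA_DEF_DATA is declared NLA_STRING with a SIMP_MAX_DATA cap in the policy, and the copy itself is bounded. A hedged sketch of the bounded-duplication idiom — demo_set_defdata() is an illustrative name:

#include <linux/slab.h>
#include <linux/string.h>

#define DEMO_MAX_DATA 32

/* kstrndup() copies at most DEMO_MAX_DATA bytes and always
 * NUL-terminates, so an oversized string cannot overrun the buffer. */
static int demo_set_defdata(char **slot, const char *src)
{
	char *copy = kstrndup(src, DEMO_MAX_DATA, GFP_KERNEL);

	if (!copy)
		return -ENOMEM;
	*slot = copy;
	return 0;
}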
+diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
+index 66148cc..5bc1ed4 100644
+--- a/net/sched/sch_htb.c
++++ b/net/sched/sch_htb.c
+@@ -1197,12 +1197,16 @@ static inline int htb_parent_last_child(struct htb_class *cl)
+ return 1;
+ }
+
+-static void htb_parent_to_leaf(struct htb_class *cl, struct Qdisc *new_q)
++static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
++ struct Qdisc *new_q)
+ {
+ struct htb_class *parent = cl->parent;
+
+ BUG_TRAP(!cl->level && cl->un.leaf.q && !cl->prio_activity);
+
++ if (parent->cmode != HTB_CAN_SEND)
++ htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
++
+ parent->level = 0;
+ memset(&parent->un.inner, 0, sizeof(parent->un.inner));
+ INIT_LIST_HEAD(&parent->un.leaf.drop_list);
+@@ -1300,7 +1304,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
+ htb_deactivate(q, cl);
+
+ if (last_child)
+- htb_parent_to_leaf(cl, new_q);
++ htb_parent_to_leaf(q, cl, new_q);
+
+ if (--cl->refcnt == 0)
+ htb_destroy_class(sch, cl);
+diff --git a/scripts/kconfig/lkc.h b/scripts/kconfig/lkc.h
+index 4bc68f2..96521cb 100644
+--- a/scripts/kconfig/lkc.h
++++ b/scripts/kconfig/lkc.h
+@@ -11,9 +11,9 @@
+ #ifndef KBUILD_NO_NLS
+ # include <libintl.h>
+ #else
+-# define gettext(Msgid) ((const char *) (Msgid))
+-# define textdomain(Domainname) ((const char *) (Domainname))
+-# define bindtextdomain(Domainname, Dirname) ((const char *) (Dirname))
++static inline const char *gettext(const char *txt) { return txt; }
++static inline void textdomain(const char *domainname) {}
++static inline void bindtextdomain(const char *name, const char *dir) {}
+ #endif
+
+ #ifdef __cplusplus
+diff --git a/scripts/kconfig/mconf.c b/scripts/kconfig/mconf.c
+index 734cf4f..6841e95 100644
+--- a/scripts/kconfig/mconf.c
++++ b/scripts/kconfig/mconf.c
+@@ -773,7 +773,7 @@ static void conf_string(struct menu *menu)
+
+ while (1) {
+ int res;
+- char *heading;
++ const char *heading;
+
+ switch (sym_get_type(menu->sym)) {
+ case S_INT:
+@@ -925,3 +925,4 @@ int main(int ac, char **av)
+
+ return 0;
+ }
++
+diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
+index e04c421..cea4a79 100644
+--- a/scripts/mod/file2alias.c
++++ b/scripts/mod/file2alias.c
+@@ -51,6 +51,15 @@ do { \
+ sprintf(str + strlen(str), "*"); \
+ } while(0)
+
++/* Always end in a wildcard, for future extension */
++static inline void add_wildcard(char *str)
++{
++ int len = strlen(str);
++
++ if (str[len - 1] != '*')
++ strcat(str + len, "*");
++}
++
+ unsigned int cross_build = 0;
+ /**
+ * Check that sizeof(device_id type) are consistent with size of section
+@@ -133,9 +142,7 @@ static void do_usb_entry(struct usb_device_id *id,
+ id->match_flags&USB_DEVICE_ID_MATCH_INT_PROTOCOL,
+ id->bInterfaceProtocol);
+
+- /* Always end in a wildcard, for future extension */
+- if (alias[strlen(alias)-1] != '*')
+- strcat(alias, "*");
++ add_wildcard(alias);
+ buf_printf(&mod->dev_table_buf,
+ "MODULE_ALIAS(\"%s\");\n", alias);
+ }
+@@ -219,6 +226,7 @@ static int do_ieee1394_entry(const char *filename,
+ ADD(alias, "ver", id->match_flags & IEEE1394_MATCH_VERSION,
+ id->version);
+
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -261,6 +269,7 @@ static int do_pci_entry(const char *filename,
+ ADD(alias, "bc", baseclass_mask == 0xFF, baseclass);
+ ADD(alias, "sc", subclass_mask == 0xFF, subclass);
+ ADD(alias, "i", interface_mask == 0xFF, interface);
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -283,6 +292,7 @@ static int do_ccw_entry(const char *filename,
+ id->dev_type);
+ ADD(alias, "dm", id->match_flags&CCW_DEVICE_ID_MATCH_DEVICE_MODEL,
+ id->dev_model);
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -290,7 +300,7 @@ static int do_ccw_entry(const char *filename,
+ static int do_ap_entry(const char *filename,
+ struct ap_device_id *id, char *alias)
+ {
+- sprintf(alias, "ap:t%02X", id->dev_type);
++ sprintf(alias, "ap:t%02X*", id->dev_type);
+ return 1;
+ }
+
+@@ -309,6 +319,7 @@ static int do_serio_entry(const char *filename,
+ ADD(alias, "id", id->id != SERIO_ANY, id->id);
+ ADD(alias, "ex", id->extra != SERIO_ANY, id->extra);
+
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -316,7 +327,7 @@ static int do_serio_entry(const char *filename,
+ static int do_acpi_entry(const char *filename,
+ struct acpi_device_id *id, char *alias)
+ {
+- sprintf(alias, "acpi*:%s:", id->id);
++ sprintf(alias, "acpi*:%s:*", id->id);
+ return 1;
+ }
+
+@@ -324,7 +335,7 @@ static int do_acpi_entry(const char *filename,
+ static int do_pnp_entry(const char *filename,
+ struct pnp_device_id *id, char *alias)
+ {
+- sprintf(alias, "pnp:d%s", id->id);
++ sprintf(alias, "pnp:d%s*", id->id);
+ return 1;
+ }
+
+@@ -409,6 +420,7 @@ static int do_pcmcia_entry(const char *filename,
+ ADD(alias, "pc", id->match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3, id->prod_id_hash[2]);
+ ADD(alias, "pd", id->match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4, id->prod_id_hash[3]);
+
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -432,6 +444,7 @@ static int do_of_entry (const char *filename, struct of_device_id *of, char *ali
+ if (isspace (*tmp))
+ *tmp = '_';
+
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -448,6 +461,7 @@ static int do_vio_entry(const char *filename, struct vio_device_id *vio,
+ if (isspace (*tmp))
+ *tmp = '_';
+
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -511,6 +525,8 @@ static int do_eisa_entry(const char *filename, struct eisa_device_id *eisa,
+ {
+ if (eisa->sig[0])
+ sprintf(alias, EISA_DEVICE_MODALIAS_FMT "*", eisa->sig);
++ else
++ strcat(alias, "*");
+ return 1;
+ }
+
+@@ -529,6 +545,7 @@ static int do_parisc_entry(const char *filename, struct parisc_device_id *id,
+ ADD(alias, "rev", id->hversion_rev != PA_HVERSION_REV_ANY_ID, id->hversion_rev);
+ ADD(alias, "sv", id->sversion != PA_SVERSION_ANY_ID, id->sversion);
+
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -544,6 +561,7 @@ static int do_sdio_entry(const char *filename,
+ ADD(alias, "c", id->class != (__u8)SDIO_ANY_ID, id->class);
+ ADD(alias, "v", id->vendor != (__u16)SDIO_ANY_ID, id->vendor);
+ ADD(alias, "d", id->device != (__u16)SDIO_ANY_ID, id->device);
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -559,6 +577,7 @@ static int do_ssb_entry(const char *filename,
+ ADD(alias, "v", id->vendor != SSB_ANY_VENDOR, id->vendor);
+ ADD(alias, "id", id->coreid != SSB_ANY_ID, id->coreid);
+ ADD(alias, "rev", id->revision != SSB_ANY_REV, id->revision);
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -573,6 +592,7 @@ static int do_virtio_entry(const char *filename, struct virtio_device_id *id,
+ ADD(alias, "d", 1, id->device);
+ ADD(alias, "v", id->vendor != VIRTIO_DEV_ANY_ID, id->vendor);
+
++ add_wildcard(alias);
+ return 1;
+ }
+
+@@ -612,9 +632,6 @@ static void do_table(void *symval, unsigned long size,
+
+ for (i = 0; i < size; i += id_size) {
+ if (do_entry(mod->name, symval+i, alias)) {
+- /* Always end in a wildcard, for future extension */
+- if (alias[strlen(alias)-1] != '*')
+- strcat(alias, "*");
+ buf_printf(&mod->dev_table_buf,
+ "MODULE_ALIAS(\"%s\");\n", alias);
+ }
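The file2alias.c series has one theme: every generated modalias now ends in '*', emitted once by the new add_wildcard() helper instead of ad hoc per bus, so aliases stay forward-compatible when later kernels append fields to the uevent string. Userspace alias matching is glob-style; a hedged illustration, where the ':extra' suffix is a made-up future field:

#include <fnmatch.h>
#include <stdio.h>

int main(void)
{
	const char *alias  = "pnp:dPNP0501*";	   /* trailing wildcard */
	const char *uevent = "pnp:dPNP0501:extra"; /* hypothetical */

	printf("matches: %s\n",
	       fnmatch(alias, uevent, 0) == 0 ? "yes" : "no");
	return 0;
}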
+diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c
+index 5920351..54a1f90 100644
+--- a/sound/drivers/pcsp/pcsp.c
++++ b/sound/drivers/pcsp/pcsp.c
+@@ -194,6 +194,7 @@ static void pcsp_stop_beep(struct snd_pcsp *chip)
+ spin_unlock_irq(&chip->substream_lock);
+ }
+
++#ifdef CONFIG_PM
+ static int pcsp_suspend(struct platform_device *dev, pm_message_t state)
+ {
+ struct snd_pcsp *chip = platform_get_drvdata(dev);
+@@ -201,6 +202,9 @@ static int pcsp_suspend(struct platform_device *dev, pm_message_t state)
+ snd_pcm_suspend_all(chip->pcm);
+ return 0;
+ }
++#else
++#define pcsp_suspend NULL
++#endif /* CONFIG_PM */
+
+ static void pcsp_shutdown(struct platform_device *dev)
+ {
+diff --git a/sound/pci/Kconfig b/sound/pci/Kconfig
+index 581debf..7e47421 100644
+--- a/sound/pci/Kconfig
++++ b/sound/pci/Kconfig
+@@ -515,19 +515,16 @@ config SND_FM801
+ config SND_FM801_TEA575X_BOOL
+ bool "ForteMedia FM801 + TEA5757 tuner"
+ depends on SND_FM801
++ depends on VIDEO_V4L1=y || VIDEO_V4L1=SND_FM801
+ help
+ Say Y here to include support for soundcards based on the ForteMedia
+ FM801 chip with a TEA5757 tuner connected to GPIO1-3 pins (Media
+ Forte SF256-PCS-02) into the snd-fm801 driver.
+
+- This will enable support for the old V4L1 API.
+-
+ config SND_FM801_TEA575X
+ tristate
+ depends on SND_FM801_TEA575X_BOOL
+ default SND_FM801
+- select VIDEO_V4L1
+- select VIDEO_DEV
+
+ config SND_HDA_INTEL
+ tristate "Intel HD Audio"
+diff --git a/sound/pci/ac97/ac97_patch.c b/sound/pci/ac97/ac97_patch.c
+index 39198e5..2da8981 100644
+--- a/sound/pci/ac97/ac97_patch.c
++++ b/sound/pci/ac97/ac97_patch.c
+@@ -3446,6 +3446,7 @@ static const struct snd_kcontrol_new snd_ac97_controls_vt1617a[] = {
+ int patch_vt1617a(struct snd_ac97 * ac97)
+ {
+ int err = 0;
++ int val;
+
+ /* we choose to not fail out at this point, but we tell the
+ caller when we return */
+@@ -3456,7 +3457,13 @@ int patch_vt1617a(struct snd_ac97 * ac97)
+ /* bring analog power consumption to normal by turning off the
+ * headphone amplifier, like WinXP driver for EPIA SP
+ */
+- snd_ac97_write_cache(ac97, 0x5c, 0x20);
++ /* We need to check the bit before writing it.
++ * On some (many?) hardware, setting the bit actually clears it!
++ */
++ val = snd_ac97_read(ac97, 0x5c);
++ if (!(val & 0x20))
++ snd_ac97_write_cache(ac97, 0x5c, 0x20);
++
+ ac97->ext_id |= AC97_EI_SPDIF; /* force the detection of spdif */
+ ac97->rates[AC97_RATES_SPDIF] = SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_48000;
+ ac97->build_ops = &patch_vt1616_ops;
+diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
+index d9783a4..6d4df45 100644
+--- a/sound/pci/hda/patch_realtek.c
++++ b/sound/pci/hda/patch_realtek.c
+@@ -11902,7 +11902,10 @@ static void alc861_auto_set_output_and_unmute(struct hda_codec *codec,
+ hda_nid_t nid,
+ int pin_type, int dac_idx)
+ {
+- alc_set_pin_output(codec, nid, pin_type);
++ snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_PIN_WIDGET_CONTROL,
++ pin_type);
++ snd_hda_codec_write(codec, dac_idx, 0, AC_VERB_SET_AMP_GAIN_MUTE,
++ AMP_OUT_UNMUTE);
+ }
+
+ static void alc861_auto_init_multi_out(struct hda_codec *codec)
+diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c
+index b3a15d6..393f7fd 100644
+--- a/sound/pci/hda/patch_sigmatel.c
++++ b/sound/pci/hda/patch_sigmatel.c
+@@ -4289,6 +4289,8 @@ struct hda_codec_preset snd_hda_preset_sigmatel[] = {
+ { .id = 0x83847635, .name = "STAC9250D", .patch = patch_stac925x },
+ { .id = 0x83847636, .name = "STAC9251", .patch = patch_stac925x },
+ { .id = 0x83847637, .name = "STAC9250D", .patch = patch_stac925x },
++ { .id = 0x83847645, .name = "92HD206X", .patch = patch_stac927x },
++ { .id = 0x83847646, .name = "92HD206D", .patch = patch_stac927x },
+ /* The following does not take into account .id=0x83847661 when subsys =
+ * 104D0C00 which is STAC9225s. Because of this, some SZ Notebooks are
+ * currently not fully supported.
+diff --git a/sound/soc/s3c24xx/s3c24xx-i2s.c b/sound/soc/s3c24xx/s3c24xx-i2s.c
+index 4ebcd6a..1ed6afd 100644
+--- a/sound/soc/s3c24xx/s3c24xx-i2s.c
++++ b/sound/soc/s3c24xx/s3c24xx-i2s.c
+@@ -224,6 +224,7 @@ static int s3c24xx_i2s_set_fmt(struct snd_soc_cpu_dai *cpu_dai,
+ iismod |= S3C2410_IISMOD_SLAVE;
+ break;
+ case SND_SOC_DAIFMT_CBS_CFS:
++ iismod &= ~S3C2410_IISMOD_SLAVE;
+ break;
+ default:
+ return -EINVAL;
+@@ -234,6 +235,7 @@ static int s3c24xx_i2s_set_fmt(struct snd_soc_cpu_dai *cpu_dai,
+ iismod |= S3C2410_IISMOD_MSB;
+ break;
+ case SND_SOC_DAIFMT_I2S:
++ iismod &= ~S3C2410_IISMOD_MSB;
+ break;
+ default:
+ return -EINVAL;
+diff --git a/sound/soc/s3c24xx/s3c24xx-pcm.c b/sound/soc/s3c24xx/s3c24xx-pcm.c
+index 6c70a81..7806ae6 100644
+--- a/sound/soc/s3c24xx/s3c24xx-pcm.c
++++ b/sound/soc/s3c24xx/s3c24xx-pcm.c
+@@ -171,7 +171,7 @@ static int s3c24xx_pcm_hw_params(struct snd_pcm_substream *substream,
+ ret = s3c2410_dma_request(prtd->params->channel,
+ prtd->params->client, NULL);
+
+- if (ret) {
++ if (ret < 0) {
+ DBG(KERN_ERR "failed to get dma channel\n");
+ return ret;
+ }
+diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
+index e89338e..f7ba099 100644
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -522,6 +522,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+ return bad_hva();
+ return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
+ }
++EXPORT_SYMBOL_GPL(gfn_to_hva);
+
+ /*
+ * Requires current->mm->mmap_sem to be held
Modified: dists/trunk/linux-2.6/debian/patches/series/1~experimental.1
==============================================================================
--- dists/trunk/linux-2.6/debian/patches/series/1~experimental.1 (original)
+++ dists/trunk/linux-2.6/debian/patches/series/1~experimental.1 Thu May 8 08:43:39 2008
@@ -1,4 +1,4 @@
-+ bugfix/all/patch-2.6.26-rc1-git4
++ bugfix/all/patch-2.6.26-rc1-git6
+ debian/version.patch
+ debian/kernelvariables.patch
+ debian/doc-build-parallel.patch